File size: 3,433 Bytes
e127123
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
import streamlit as st

import re 
from kiwipiepy import Kiwi
# Module-level Kiwi analyzer instance, created once at import time and
# used by kiwi_tokenize() for every call.
kiwi = Kiwi()

from collections import Counter
from wordcloud import WordCloud
import matplotlib.pyplot as plt

def kiwi_tokenize(txt, nouns=True, remove1=False, stopwords=None):
    """Extract morpheme tokens from ``txt`` using the module-level Kiwi analyzer.

    The text is cleaned first: every character that is not whitespace, a
    Hangul syllable, an ASCII letter or a digit is replaced by a space. The
    cleaned text is tokenized and the result is filtered by part-of-speech tag.

    Args:
        txt: Input string to analyze.
        nouns: If true, keep only tokens tagged NNG, NNP, NP or NR. Otherwise
            also keep VV, VX, VCP, VCN, VA, VA-I, MM and MAG tokens, and append
            the ending "\ub2e4" to forms whose tag starts with "V" so they
            read as dictionary forms.
        remove1: If true, drop tokens that are a single character long.
        stopwords: Optional iterable of token strings to exclude.

    Returns:
        A list of token strings. The list is empty when ``txt`` is not a
        string, when nothing but whitespace remains after cleaning, or when
        the tokenizer raises (best-effort behaviour; the error is logged).
    """
    if not isinstance(txt, str):
        return []

    # Cleaning: strip noise characters. The Hangul syllable range is written
    # with \u escapes (U+AC00-U+D7A3) so the pattern survives any re-encoding
    # of this source file.
    cleaned = re.sub(r"[^\s\uac00-\ud7a3a-zA-Z0-9]", " ", txt)
    if not cleaned.strip():
        return []

    # Tokenization: only the call into the third-party analyzer is guarded, so
    # mistakes in the filtering code below are not silently hidden.
    try:
        morphs = kiwi.tokenize(cleaned)
    except Exception:
        import logging

        logging.getLogger(__name__).exception("Kiwi tokenization failed")
        return []

    # Tags kept in noun-only mode, and the extra tags kept otherwise
    # (see the Kiwi part-of-speech tag table).
    noun_tags = ('NNG', 'NNP', 'NP', 'NR')
    extra_tags = ('VV', 'VX', 'VCP', 'VCN', 'VA', 'VA-I', 'MM', 'MAG')

    if nouns:
        tokens = [m[0] for m in morphs if m[1] in noun_tags]
    else:
        selected = noun_tags + extra_tags
        # Restore the dictionary form of verbs/adjectives by appending the
        # Korean ending U+B2E4 to the stem.
        tokens = [
            m[0] + '\ub2e4' if m[1].startswith('V') else m[0]
            for m in morphs
            if m[1] in selected
        ]

    if remove1:
        tokens = [t for t in tokens if len(t) > 1]

    if stopwords:
        blocked = frozenset(stopwords)
        tokens = [t for t in tokens if t not in blocked]

    return tokens

def generate_wordcloud(text):
    """Render a word cloud of the most frequent noun tokens in ``text``.

    The text is tokenized with kiwi_tokenize() (nouns only, single-character
    tokens removed), the 100 most common tokens are counted, and the resulting
    word cloud is drawn on a matplotlib figure shown through Streamlit.

    Args:
        text: Raw input text.

    Returns:
        None. When no tokens can be extracted a Streamlit warning is shown
        instead of a figure, because WordCloud cannot be built from an empty
        frequency table.
    """
    token_list = kiwi_tokenize(text, nouns=True, remove1=True, stopwords=[])
    frequencies = dict(Counter(token_list).most_common(100))

    if not frequencies:
        st.warning("No keywords could be extracted from the text.")
        return

    cloud = WordCloud(
        font_path='NanumGothic-Regular.ttf',  # path of the font file used for rendering
        background_color='white',
        colormap="Accent_r",  # colour scheme; see https://matplotlib.org/stable/tutorials/colors/colormaps.html
        width=1500, height=1000,  # image size in pixels
    ).generate_from_frequencies(frequencies)

    # Draw on the figure's own axes rather than pyplot's implicit "current"
    # axes, so the output does not depend on global matplotlib state.
    fig, ax = plt.subplots(nrows=1, ncols=1, figsize=(12, 8))
    ax.imshow(cloud, interpolation='bilinear')
    ax.axis('off')
    st.pyplot(fig)
    # Release the figure so repeated runs do not accumulate open figures.
    plt.close(fig)

def main():
    """Build the Streamlit page: title, text input, and a button that renders the word cloud.

    When the button is pressed, the entered text is passed to
    generate_wordcloud(); if the text is empty or only whitespace a warning is
    shown instead.
    """
    # UI labels are Korean; they were restored from a mis-decoded (mojibake)
    # form to the intended Hangul text.
    st.title("워드클라우드(Word Cloud) 만들기")
    st.write("가공할 텍스트를 입력하세요:")
    text_input = st.text_area("텍스트", "")

    if st.button("워드클라우드 시작"):
        # Whitespace-only input has nothing to tokenize, so treat it as empty.
        if text_input.strip():
            generate_wordcloud(text_input)
        else:
            st.warning("Please enter some text.")


# Call main() only when this file is executed as a script, not when it is imported.
if __name__ == "__main__":
    main()