jonghhhh committed on
Commit
e127123
·
verified ·
1 Parent(s): d525cd9

Create app.py

Browse files
Files changed (1) hide show
  1. app.py +68 -0
app.py ADDED
@@ -0,0 +1,68 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
import logging
import re
from collections import Counter

import matplotlib.pyplot as plt
import streamlit as st
from kiwipiepy import Kiwi
from wordcloud import WordCloud

kiwi = Kiwi()
10
+
11
def kiwi_tokenize(txt, nouns=True, remove1=False, stopwords=None):
    """Extract morpheme tokens from *txt* using the module-level ``kiwi`` analyzer.

    Args:
        txt: Text to analyze. Anything that is not a ``str`` yields ``[]``.
        nouns: If truthy, keep only morphemes tagged NNG, NNP, NP or NR.
            Otherwise also keep the verb/adjective, determiner and general
            adverb tags listed below; morphemes whose tag starts with ``V``
            get '다' appended to restore a dictionary-like form.
        remove1: If truthy, drop tokens that are a single character long.
        stopwords: Optional collection of tokens to exclude. ``None`` (the
            default) means no stopwords.

    Returns:
        A list of token strings in the order the analyzer produced them.
        On any analysis failure the error is logged and ``[]`` is returned,
        so callers can treat this function as best-effort.
    """
    # The first group is what `nouns=True` selects; both groups together are
    # used when `nouns` is false.
    # NOTE(review): only 'VA-I' is listed among the irregular-conjugation
    # tags; confirm against the Kiwi tag table whether others are wanted.
    noun_tags = ('NNG', 'NNP', 'NP', 'NR')
    other_tags = ('VV', 'VX', 'VCP', 'VCN', 'VA', 'VA-I', 'MM', 'MAG')
    excluded = () if stopwords is None else stopwords

    if not isinstance(txt, str):
        # Nothing to tokenize (e.g. None or a float); stay best-effort.
        return []

    try:
        # Cleaning: replace everything except whitespace, Hangul syllables,
        # ASCII letters and the digits 0-9 with a space.
        cleaned = re.sub(r"[^\s가-힣a-zA-Z0-9]", " ", txt)

        # Tokenization: each morpheme is indexed as (form, tag, ...).
        morphs = kiwi.tokenize(cleaned)

        if nouns:
            tokens = [m[0] for m in morphs if m[1] in noun_tags]
        else:
            selected = noun_tags + other_tags
            # Stemming: append '다' to predicate stems (tags starting with V).
            tokens = [
                m[0] + '다' if m[1].startswith('V') else m[0]
                for m in morphs
                if m[1] in selected
            ]

        if remove1:
            tokens = [t for t in tokens if len(t) > 1]

        return [t for t in tokens if t not in excluded]
    except Exception:
        # Keep the original "never raise" contract, but leave a trace
        # instead of swallowing the error silently.
        logging.getLogger(__name__).exception(
            "kiwi_tokenize failed; returning no tokens"
        )
        return []
39
+
40
def generate_wordcloud(text):
    """Render a word cloud of the most frequent nouns in *text* on the page.

    The text is tokenized with ``kiwi_tokenize`` (nouns only, one-character
    tokens removed), the 100 most common tokens are kept, and the resulting
    cloud is drawn on a matplotlib figure that is handed to ``st.pyplot``.
    If no tokens are found, a warning is shown instead of a figure.

    Args:
        text: Raw input text to analyze.

    Returns:
        None. Output is written to the Streamlit page.
    """
    token_list = kiwi_tokenize(text, nouns=True, remove1=True, stopwords=[])
    keywords_all = Counter(token_list).most_common(100)

    if not keywords_all:
        # WordCloud refuses to build a cloud from zero words; tell the user
        # instead of surfacing a traceback.
        st.warning("No keywords could be extracted from the text.")
        return

    mywordcloud = WordCloud(
        font_path='NanumGothic-Regular.ttf',  # path to the font file
        background_color='white',
        # Colormap name; see
        # https://matplotlib.org/stable/tutorials/colors/colormaps.html
        colormap="Accent_r",
        width=1500, height=1000,  # image size in pixels
    ).generate_from_frequencies(dict(keywords_all))

    fig, ax = plt.subplots(nrows=1, ncols=1, figsize=(12, 8))
    # Draw on the axes we created rather than on pyplot's global
    # "current" figure.
    ax.imshow(mywordcloud, interpolation='bilinear')
    ax.axis('off')
    st.pyplot(fig)
    # Release the figure; otherwise one figure per button click stays open.
    plt.close(fig)
54
+
55
def main():
    """Build the Streamlit page: title, text box and a button that draws the cloud.

    When the button is pressed, non-blank input is passed to
    ``generate_wordcloud``; blank or whitespace-only input produces a warning.
    """
    st.title("워드클라우드(Word Cloud) 만들기")
    st.write("가공할 텍스트를 입력하세요:")
    text_input = st.text_area("텍스트", "")

    if st.button("워드클라우드 시작"):
        # Treat whitespace-only input as empty so it never reaches the
        # tokenizer / word-cloud generator.
        if text_input.strip():
            generate_wordcloud(text_input)
        else:
            st.warning("Please enter some text.")


if __name__ == "__main__":
    main()