Spaces:
Sleeping
Sleeping
Create app.py
Browse files
app.py
ADDED
@@ -0,0 +1,68 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
import streamlit as st
|
2 |
+
|
3 |
+
import re
|
4 |
+
from kiwipiepy import Kiwi
|
5 |
+
kiwi = Kiwi()
|
6 |
+
|
7 |
+
from collections import Counter
|
8 |
+
from wordcloud import WordCloud
|
9 |
+
import matplotlib.pyplot as plt
|
10 |
+
|
11 |
+
def kiwi_tokenize(txt, nouns=True, remove1=False, stopwords=None):
    """Extract morphemes from ``txt`` with the module-level ``kiwi`` analyzer.

    Args:
        txt: Input string to analyze.
        nouns: If true, keep only morphemes tagged NNG, NNP, NP or NR.
            Otherwise keep every tag in the selected set below and append
            '다' to forms whose tag starts with 'V'.
        remove1: If true, drop tokens that are a single character long.
        stopwords: Optional collection of tokens to exclude from the result.

    Returns:
        A list of token strings. If any step raises, the error is logged and
        an empty list is returned (best-effort behavior kept from the
        original).
    """
    # Tags kept when nouns is false: general nouns, proper nouns, predicates
    # (verbs, adjectives, etc.), determiners and general adverbs. The first
    # four entries are the noun-like tags used when nouns is true.
    selected_tags = ['NNG', 'NNP', 'NP', 'NR', 'VV', 'VX', 'VCP', 'VCN',
                     'VA', 'VA-I', 'MM', 'MAG']
    noun_tags = selected_tags[:4]
    excluded = () if stopwords is None else stopwords

    try:
        # Cleaning: replace every character that is not whitespace, a Hangul
        # syllable, an ASCII letter or a digit with a space.
        # (To drop one specific word instead, str.replace is enough.)
        cleaned = re.sub(r"[^\s가-힣a-zA-Z0-9]", " ", txt)

        # Tokenization: each morpheme is indexed as m[0] (form) and m[1] (tag).
        morphs = kiwi.tokenize(cleaned)

        if nouns:
            token_lst = [m[0] for m in morphs if m[1] in noun_tags]
        else:
            # Stemming: restore the dictionary form of predicates by
            # appending '다'. This must run while the tag is still available,
            # so it lives in this branch only; on plain strings m[1] would be
            # the second character and fail on 1-character tokens.
            token_lst = [
                m[0] + '다' if m[1].startswith('V') else m[0]
                for m in morphs
                if m[1] in selected_tags
            ]

        if remove1:
            token_lst = [t for t in token_lst if len(t) > 1]

        # Stopwords: tokens that must be excluded from the result.
        token_lst = [t for t in token_lst if t not in excluded]
    except Exception:
        # Keep the original "return nothing on failure" contract, but leave a
        # trace instead of swallowing the error silently.
        import logging
        logging.getLogger(__name__).exception(
            "kiwi_tokenize failed; returning no tokens"
        )
        token_lst = []
    return token_lst
|
39 |
+
|
40 |
+
def generate_wordcloud(text):
    """Draw a word cloud of the most frequent nouns in ``text`` on the page.

    The text is tokenized with ``kiwi_tokenize`` (nouns only, 1-character
    tokens removed), the 100 most common tokens are turned into a word cloud,
    and the resulting matplotlib figure is rendered with ``st.pyplot``.

    Args:
        text: Raw input text.

    Returns:
        None. If no keyword survives tokenization, a Streamlit warning is
        shown instead of a figure.
    """
    token_list = kiwi_tokenize(text, nouns=True, remove1=True, stopwords=[])
    keywords_all = Counter(token_list).most_common(100)

    # WordCloud.generate_from_frequencies raises ValueError on an empty
    # mapping, so stop early when there is nothing to draw.
    if not keywords_all:
        st.warning("No keywords could be extracted from the text.")
        return

    mywordcloud = WordCloud(
        font_path='NanumGothic-Regular.ttf',  # path of the font file used for rendering
        background_color='white',
        colormap="Accent_r",  # colors; see https://matplotlib.org/stable/tutorials/colors/colormaps.html
        width=1500, height=1000,  # image size in pixels
    ).generate_from_frequencies(dict(keywords_all))

    # Draw through the Axes object rather than pyplot's implicit "current
    # figure", so the output does not depend on global pyplot state.
    fig, ax = plt.subplots(nrows=1, ncols=1, figsize=(12, 8))
    ax.imshow(mywordcloud, interpolation='bilinear')
    ax.axis('off')
    try:
        st.pyplot(fig)
    finally:
        # Figures created via pyplot stay registered until closed; release
        # this one so repeated runs do not accumulate figures.
        plt.close(fig)
|
54 |
+
|
55 |
+
def main():
    """Build the Streamlit page: title, text box and word-cloud button.

    When the button is pressed, the entered text is passed to
    ``generate_wordcloud``; if the text box is empty or contains only
    whitespace, a warning is shown instead.
    """
    # NOTE(review): the Korean UI strings below were restored from
    # mis-encoded text in which some bytes were lost; confirm the exact
    # wording (especially the prompt and the button label) against the
    # deployed app.
    st.title("워드클라우드(Word Cloud) 만들기")
    st.write("가공할 텍스트를 입력하세요:")
    text_input = st.text_area("텍스트", "")

    if st.button("워드클라우드 시작"):
        # Whitespace-only input has nothing to tokenize; treat it as empty.
        if text_input.strip():
            generate_wordcloud(text_input)
        else:
            st.warning("Please enter some text.")


# Run the app only when this file is executed as the main script.
if __name__ == "__main__":
    main()
|