Spaces:
Sleeping
Sleeping
File size: 3,433 Bytes
e127123 |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 |
import streamlit as st
import re
from kiwipiepy import Kiwi
# Module-level Kiwi analyzer instance, created once at import time and
# used by kiwi_tokenize() below.
kiwi = Kiwi()
from collections import Counter
from wordcloud import WordCloud
import matplotlib.pyplot as plt
def kiwi_tokenize(txt, nouns=True, remove1=False, stopwords=None):
    """Extract morpheme tokens from a string using the module-level ``kiwi``.

    Args:
        txt: Text to analyze.
        nouns: If truthy, keep only morphemes tagged NNG, NNP, NP or NR.
            Otherwise also keep the predicate, determiner and general-adverb
            tags listed below, and append the ending "da" (U+B2E4) to every
            morpheme whose tag starts with "V".
        remove1: If truthy, drop tokens that are a single character long.
        stopwords: Optional collection of tokens to exclude from the result.

    Returns:
        A list of token strings. Any error raised while cleaning or
        tokenizing (for example a non-string ``txt``) yields an empty list.
    """
    noun_tags = ("NNG", "NNP", "NP", "NR")
    # Predicates (verbs, adjectives, copulas), determiners, general adverbs.
    extra_tags = ("VV", "VX", "VCP", "VCN", "VA", "VA-I", "MM", "MAG")
    if stopwords is None:
        stopwords = []

    try:
        # Cleaning: replace everything except whitespace, Hangul syllables,
        # ASCII letters and digits with a space. U+AC00-U+D7A3 is the Hangul
        # Syllables range; it is written with escapes so the pattern does not
        # depend on how this source file is encoded.
        cleaned = re.sub(r"[^\s\uac00-\ud7a3a-zA-Z0-9]", " ", txt)

        # Tokenization: each morpheme exposes its form at [0] and tag at [1].
        morphs = kiwi.tokenize(cleaned)

        if nouns:
            token_lst = [m[0] for m in morphs if m[1] in noun_tags]
        else:
            selected_tags = noun_tags + extra_tags
            # Stemming: predicates come back as bare stems, so append the
            # ending U+B2E4 to restore a dictionary-style form. This must run
            # on the morpheme objects (form + tag), not on plain strings.
            token_lst = [
                m[0] + "\ub2e4" if m[1].startswith("V") else m[0]
                for m in morphs
                if m[1] in selected_tags
            ]

        if remove1:
            token_lst = [t for t in token_lst if len(t) > 1]

        # Stopwords: tokens that must be excluded from the result.
        token_lst = [t for t in token_lst if t not in stopwords]
    except Exception:
        # Deliberate best-effort: unusable input produces no tokens instead
        # of an error, while KeyboardInterrupt/SystemExit still propagate.
        token_lst = []
    return token_lst
def generate_wordcloud(text):
    """Draw a word cloud of the most frequent noun tokens in ``text``.

    The text is tokenized with ``kiwi_tokenize`` (nouns only, one-character
    tokens removed), the 100 most common tokens are rendered with WordCloud,
    and the resulting matplotlib figure is shown through Streamlit.

    Args:
        text: Raw text to analyze.

    Returns:
        None. If no tokens are extracted, a Streamlit warning is shown
        instead of a figure.
    """
    token_list = kiwi_tokenize(text, nouns=True, remove1=True, stopwords=[])
    keywords_all = Counter(token_list).most_common(100)

    # WordCloud cannot build an image from an empty frequency table, so stop
    # early with a message rather than letting the app crash.
    if not keywords_all:
        st.warning("No keywords could be extracted from the text.")
        return

    mywordcloud = WordCloud(
        font_path="NanumGothic-Regular.ttf",  # font file used for rendering
        background_color="white",
        colormap="Accent_r",  # matplotlib colormap name
        width=1500,
        height=1000,  # image size in pixels
    ).generate_from_frequencies(dict(keywords_all))

    fig, ax = plt.subplots(nrows=1, ncols=1, figsize=(12, 8))
    # Draw on the axes created above instead of pyplot's implicit
    # "current axes" global state.
    ax.imshow(mywordcloud, interpolation="bilinear")
    ax.axis("off")
    st.pyplot(fig)
    # Figures made via pyplot stay registered until closed; release this one
    # so repeated runs do not accumulate open figures.
    plt.close(fig)
def main():
    """Build the Streamlit page: title, text input, and a button that
    renders a word cloud for the entered text.

    NOTE(review): the Korean UI strings below were reconstructed from
    mis-decoded text that had been split across lines inside the string
    literals; confirm the exact wording against the intended UI copy.
    """
    st.title("워드클라우드(Word Cloud) 만들기")
    st.write("가공할 텍스트를 입력하세요:")
    text_input = st.text_area("텍스트", "")

    if st.button("워드클라우드 시작"):
        # Treat whitespace-only input the same as no input.
        if text_input and text_input.strip():
            generate_wordcloud(text_input)
        else:
            st.warning("Please enter some text.")
# Run the app only when this file is executed as the entry script.
if __name__ == "__main__":
    main()
|