Spaces:
Sleeping
Sleeping
import streamlit as st | |
import re | |
from kiwipiepy import Kiwi | |
kiwi = Kiwi() | |
from collections import Counter | |
from wordcloud import WordCloud | |
import matplotlib.pyplot as plt | |
def kiwi_tokenize(txt, nouns=True, remove1=False, stopwords=None):
    """Tokenize Korean text with Kiwi and return the selected morphemes.

    The text is cleaned first: every character that is not whitespace, a
    Hangul syllable, an ASCII letter or a digit is replaced by a space.
    The cleaned text is then analysed with the module-level ``kiwi``
    instance.

    Args:
        txt: Input string to analyse.
        nouns: If true, keep only noun-like morphemes (tags NNG, NNP, NP,
            NR). Otherwise also keep verbs, auxiliaries, copulas,
            adjectives, determiners and general adverbs; forms whose tag
            starts with ``V`` get ``'다'`` appended as a simple
            dictionary-form restoration.
        remove1: If true, drop tokens that are one character long.
        stopwords: Optional collection of token strings to exclude.
            ``None`` (the default) means no stopwords.

    Returns:
        A list of token strings. Best effort: if cleaning or analysis
        raises (for example because ``txt`` is not a string), an empty
        list is returned instead of propagating the error.
    """
    # Tag names follow the POS tag set used by Kiwi.
    noun_tags = ("NNG", "NNP", "NP", "NR")
    other_tags = ("VV", "VX", "VCP", "VCN", "VA", "VA-I", "MM", "MAG")
    # Build the lookup set once; also avoids a shared mutable default.
    excluded = set(stopwords) if stopwords else set()

    try:
        # Cleaning: the Hangul range must be the real "가-힣" syllable block,
        # otherwise every Korean character would be stripped here.
        cleaned = re.sub(r"[^\s가-힣a-zA-Z0-9]", " ", txt)

        # Tokenization: each result is indexable as (form, tag, ...).
        morphs = kiwi.tokenize(cleaned)

        if nouns:
            tokens = [m[0] for m in morphs if m[1] in noun_tags]
        else:
            selected_tags = noun_tags + other_tags
            # Stemming applies only here, where (form, tag) pairs are
            # still available; verbs/adjectives get the "다" ending.
            tokens = [
                m[0] + "다" if m[1].startswith("V") else m[0]
                for m in morphs
                if m[1] in selected_tags
            ]

        if remove1:
            tokens = [t for t in tokens if len(t) > 1]

        tokens = [t for t in tokens if t not in excluded]
    except Exception:
        # Deliberate best effort, but no longer swallows KeyboardInterrupt
        # or SystemExit the way a bare ``except:`` did.
        tokens = []
    return tokens
def generate_wordcloud(text):
    """Render a word cloud of the most frequent nouns in *text*.

    The text is tokenized with ``kiwi_tokenize`` (nouns only, one-character
    tokens removed), the 100 most common tokens are counted, and the
    resulting cloud is drawn on a matplotlib figure that is sent to the
    Streamlit page.

    Args:
        text: Raw input text.

    Returns:
        None. The figure is rendered through ``st.pyplot``; if no tokens
        are extracted a Streamlit warning is shown instead.
    """
    token_list = kiwi_tokenize(text, nouns=True, remove1=True, stopwords=[])
    keywords_all = Counter(token_list).most_common(100)

    if not keywords_all:
        # WordCloud raises ValueError when asked to plot zero words, so
        # report the situation instead of crashing the app.
        st.warning("No keywords could be extracted from the text.")
        return

    mywordcloud = WordCloud(
        font_path="NanumGothic-Regular.ttf",  # font file, relative to the working directory
        background_color="white",
        colormap="Accent_r",  # see https://matplotlib.org/stable/tutorials/colors/colormaps.html
        width=1500,
        height=1000,  # image size in pixels
    ).generate_from_frequencies(dict(keywords_all))

    # Draw on the explicit Axes rather than pyplot's implicit "current"
    # figure, so the output does not depend on global matplotlib state.
    fig, ax = plt.subplots(nrows=1, ncols=1, figsize=(12, 8))
    try:
        ax.imshow(mywordcloud, interpolation="bilinear")
        ax.axis("off")
        st.pyplot(fig)
    finally:
        # Release the figure; otherwise each rerun leaves one more open.
        plt.close(fig)
def main():
    """Build the Streamlit page: title, text area and generate button.

    When the button is pressed, the entered text is passed to
    ``generate_wordcloud``; if the text is empty or contains only
    whitespace, a warning is shown instead.
    """
    # NOTE(review): the non-ASCII literals below look mis-encoded (probably
    # Korean UTF-8 decoded with a Thai codepage). They are kept exactly as
    # found; restore them from the original file rather than by guessing.
    st.title("์๋ํด๋ผ์ฐ๋(Word Cloud) ๋ง๋ค๊ธฐ")
    st.write("๊ฐ๊ณตํ ํ ์คํธ๋ฅผ ์ ๋ ฅํ์ธ์:")
    text_input = st.text_area("ํ ์คํธ", "")

    if not st.button("์๋ํด๋ผ์ฐ๋ ์์"):
        return

    # Whitespace-only input yields no tokens, so treat it as empty.
    if text_input.strip():
        generate_wordcloud(text_input)
    else:
        st.warning("Please enter some text.")
# Entry point: build the page only when this file is run as the main module.
if __name__ == "__main__":
    main()