word_cloud / app.py
jonghhhh's picture
Create app.py
e127123 verified
import streamlit as st
import re
from kiwipiepy import Kiwi
# Module-level Kiwi analyzer instance, created once at import time and
# reused by kiwi_tokenize() for every call.
kiwi = Kiwi()
from collections import Counter
from wordcloud import WordCloud
import matplotlib.pyplot as plt
def kiwi_tokenize(txt, nouns=True, remove1=False, stopwords=[]):
    """Extract morpheme tokens from ``txt`` using the module-level ``kiwi`` analyzer.

    Args:
        txt: Input string. Every character that is not whitespace, a Hangul
            syllable, an ASCII letter, or a digit is replaced by a space
            before tokenization.
        nouns: If true, keep only noun-like tags (NNG, NNP, NP, NR).
            Otherwise keep nouns, predicates (V*), determiners (MM) and
            general adverbs (MAG); predicate stems get the dictionary-form
            ending appended.
        remove1: If true, drop tokens that are a single character long.
        stopwords: Collection of tokens to exclude. It is only read, never
            mutated, so the shared default list is safe.

    Returns:
        A list of token strings. The function is best-effort: non-string or
        empty input, or any error raised during tokenization, yields ``[]``.
    """
    # Part-of-speech tags to keep; the noun tags are a prefix of the full set.
    noun_tags = ('NNG', 'NNP', 'NP', 'NR')
    content_tags = noun_tags + (
        'VV', 'VX', 'VCP', 'VCN', 'VA', 'VA-I', 'MM', 'MAG',
    )

    if not isinstance(txt, str) or not txt:
        return []

    try:
        # Cleaning: blank out noise characters. The Hangul syllable block
        # (U+AC00..U+D7A3) is written with escapes so the pattern survives
        # any re-encoding of this source file.
        cleaned = re.sub(r"[^\s\uac00-\ud7a3a-zA-Z0-9]", " ", txt)

        # Tokenization: each morph is indexable as (form, tag, ...).
        morphs = kiwi.tokenize(cleaned)

        if nouns:
            tokens = [m[0] for m in morphs if m[1] in noun_tags]
        else:
            # Stemming: restore the dictionary form of predicates by
            # appending the ending U+B2E4 to stems whose tag starts with 'V'.
            tokens = [
                m[0] + "\ub2e4" if m[1].startswith('V') else m[0]
                for m in morphs
                if m[1] in content_tags
            ]

        if remove1:
            tokens = [t for t in tokens if len(t) > 1]

        # Stopword filtering.
        return [t for t in tokens if t not in stopwords]
    except Exception:
        # Deliberate best-effort: a tokenizer failure produces no tokens
        # instead of crashing the app. KeyboardInterrupt/SystemExit still
        # propagate.
        return []
def generate_wordcloud(text):
    """Render a word cloud of the most frequent nouns in ``text`` on the Streamlit page.

    The text is tokenized with ``kiwi_tokenize`` (nouns only, single-character
    tokens removed), the 100 most common tokens are counted, and the resulting
    frequencies are drawn with ``WordCloud`` and shown via ``st.pyplot``.

    Args:
        text: Raw input text.

    Returns:
        None. If no token survives tokenization, a Streamlit warning is shown
        and nothing is drawn.
    """
    token_list = kiwi_tokenize(text, nouns=True, remove1=True, stopwords=[])
    keywords_all = Counter(token_list).most_common(100)

    # An empty frequency table cannot be drawn; tell the user instead of
    # letting the word cloud generation fail.
    if not keywords_all:
        st.warning("No keywords could be extracted from the text.")
        return

    mywordcloud = WordCloud(
        font_path='NanumGothic-Regular.ttf',  # path to the font file
        background_color='white',
        colormap="Accent_r",  # https://matplotlib.org/stable/tutorials/colors/colormaps.html
        width=1500, height=1000,  # image size in pixels
    ).generate_from_frequencies(dict(keywords_all))

    fig, ax = plt.subplots(nrows=1, ncols=1, figsize=(12, 8))
    # Draw on the axes that was just created rather than on pyplot's
    # implicit "current" axes.
    ax.imshow(mywordcloud, interpolation='bilinear')
    ax.axis('off')
    st.pyplot(fig)
    # Release the figure so figures do not accumulate across reruns.
    plt.close(fig)
def main():
    """Build the Streamlit page: a text area plus a button that draws the word cloud.

    When the button is pressed, the entered text is passed to
    ``generate_wordcloud``. If the text is empty or whitespace-only, a
    warning is shown instead.
    """
    st.title("워드클라우드(Word Cloud) 만들기")
    st.write("가공할 텍스트를 입력하세요:")
    text_input = st.text_area("텍스트", "")

    if st.button("워드클라우드 시작"):
        # Whitespace-only input contains nothing to tokenize, so treat it
        # the same as empty input.
        if text_input and text_input.strip():
            generate_wordcloud(text_input)
        else:
            st.warning("Please enter some text.")


# Run the app only when this file is executed as a script, not when imported.
if __name__ == "__main__":
    main()