import streamlit as st
from streamlit.components.v1 import html
import pandas as pd
import matplotlib.pyplot as plt
import plotly.express as px
from wordcloud import WordCloud
from configs.db_configs import add_one_item
from configs.html_features import set_image, HTML_WRAPPER

from transformers import AutoTokenizer, AutoModelForSequenceClassification

import torch
from torch.nn.functional import softmax

import spacy
from spacy import displacy

# Load the English pipeline once at import time; it is reused by every analysis below.
nlp = spacy.load('en_core_web_sm')

from collections import Counter
import neattext as nt
import neattext.functions as nfx
from textblob import TextBlob
import nltk

# TextBlob relies on these NLTK corpora for tokenization, tagging, and noun-phrase extraction.
nltk.download('brown', quiet=True)
nltk.download('punkt', quiet=True)
nltk.download('wordnet', quiet=True)
nltk.download('averaged_perceptron_tagger', quiet=True)
nltk.download('conll2000', quiet=True)
nltk.download('movie_reviews', quiet=True)

def get_tokens_analysis(text):
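    """Tokenize the text with spaCy and return one row of linguistic attributes per token."""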
    doc_obj = nlp(text)
    tokens_stats = [(token.text, token.shape_, token.pos_, token.tag_, token.lemma_, token.is_alpha, token.is_stop) for token in doc_obj]
    tokens_stats_df = pd.DataFrame(tokens_stats, columns=['Token', 'Shape', 'Part-of-Speech', 'Part-of-Speech Tag', 'Root', 'IsAlpha', 'IsStop'])
    return tokens_stats_df


def get_entities_tokens(text):
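    """Render named entities with displaCy and wrap the markup for embedding in Streamlit."""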
    doc_obj = nlp(text)

    # Avoid shadowing the streamlit `html` component imported above.
    rendered = displacy.render(doc_obj, style='ent')
    rendered = rendered.replace('\n\n', '\n')
    entities_tokens_html = HTML_WRAPPER.format(rendered)
    return entities_tokens_html


def get_word_stats(text):
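    """Return neattext word statistics and a pie chart of word-length frequencies."""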
    text_frame_obj = nt.TextFrame(text)
    word_stats = text_frame_obj.word_stats()
    word_length_freq = text_frame_obj.word_length_freq()
    word_length_df = pd.DataFrame(word_length_freq.items(), columns=['word length', 'frequency'])
    word_length_df['word length'] = 'length ' + word_length_df['word length'].astype(str)
    custom_color = px.colors.sequential.Blues_r
    figure = px.pie(word_length_df, names='word length', values='frequency', title='Word Frequency Percentage by Length', width=400, height=400, color_discrete_sequence=custom_color)
    return word_stats, figure


def plot_top_keywords_frequencies(text, n_top_keywords):
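    """Plot a bar chart of the n_top_keywords most frequent words after stopword removal."""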
    preprocessed_text = nfx.remove_stopwords(text)
    blob = TextBlob(preprocessed_text)
    words = blob.words
    top_keywords = Counter(words).most_common(n_top_keywords)
    top_keywords_df = pd.DataFrame(top_keywords, columns=['words', 'frequency'])
    figure = px.bar(top_keywords_df, x='words', y='frequency', color='frequency', title=f'The Frequency of {n_top_keywords} Top Keywords', width=400, height=400, color_continuous_scale='Blues')
    return figure


def get_sentence_stats(text):
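    """Extract sentences and noun phrases with TextBlob and summarize their counts."""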
    blob = TextBlob(text)
    sentences = [str(sentence) for sentence in blob.sentences]
    noun_phrases = list(blob.noun_phrases)
    sentence_stats = {
        'Number of Sentences' : len(sentences),
        'Number of Noun Phrases' : len(noun_phrases)
    }
    sentence_stats_df = pd.DataFrame(sentence_stats, index=[0])
    return sentences, noun_phrases, sentence_stats_df


def plot_tokens_pos(tokens_stats_df):
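    """Plot how often each part-of-speech occurs in the token statistics table."""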
    pos_df = tokens_stats_df['Part-of-Speech'].value_counts().to_frame().reset_index()
    pos_df.columns = ['Part-of-Speech', 'Frequency']
    figure = px.bar(pos_df, x='Part-of-Speech', y='Frequency', color='Frequency', title='The Frequency of Tokens Part-of-Speech', width=400, height=400, color_continuous_scale='Blues')
    return figure


def get_sentiment_analysis_res(text):
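    """Classify the text as Positive or Negative with a Hugging Face model.

    Note: the tokenizer and model are reloaded on every call; in a long-running
    app, caching them (e.g. with st.cache_resource) would likely be faster.
    """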
    tokenizer = AutoTokenizer.from_pretrained('stevhliu/my_awesome_model')
    model = AutoModelForSequenceClassification.from_pretrained('stevhliu/my_awesome_model')
    # Truncate long inputs to the model's maximum sequence length.
    inputs = tokenizer(text, return_tensors='pt', truncation=True)
    with torch.no_grad():
        logits = model(**inputs).logits

    predicted_class_id = logits.argmax().item()
    id2label = {0: 'Negative', 1: 'Positive'}
    label = id2label[predicted_class_id]
    score = float(softmax(logits, dim=1)[0][predicted_class_id])
    sentiment_df = pd.DataFrame([[label, score]], columns=['Text Polarity', 'Belonging Probability'])
    return sentiment_df


def plot_word_frequency(text):
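    """Generate a word cloud where word size reflects frequency in the text."""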
    wc = WordCloud(width=600, height=500).generate(text)
    fig = plt.figure()
    plt.imshow(wc, interpolation='bilinear')
    plt.axis('off')
    return fig

def main():
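    """Lay out the Streamlit page: banner image, text input, and one expander per analysis."""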
    st.title('Text Analyzer')
    # Empty side columns center the banner image.
    _, im2, _ = st.columns([1, 5.3, 1])
    with im2:
        url = "https://i.postimg.cc/jdF1hPng/combined.png"
        html(set_image(url), height=500, width=500)

    text = st.text_area('Text Analyzer', placeholder='Enter your input text here ...', height=200, label_visibility='hidden')
    n_top_keywords = st.sidebar.slider('n Top keywords', 5, 15, 5, 1)
    if st.button('Analyze it'):
        if text.strip():
            with st.expander('Original Text'):
                st.write(text)
                add_one_item(text, 'Text Analyzer')
            
            with st.expander('Text Analysis'):
                tokens_stats_df = get_tokens_analysis(text)
                st.dataframe(tokens_stats_df)

            with st.expander('Text Entities'):
                entities_tokens_html = get_entities_tokens(text)
                html(entities_tokens_html, height=300, scrolling=True)

            col11, col12 = st.columns(2)
            with col11:
                with st.expander('Word Statistics'):
                    word_stats_json, figure = get_word_stats(text)
                    st.json(word_stats_json)
                    st.plotly_chart(figure)
            
            with col12:
                with st.expander(f'The Frequency of {n_top_keywords} Top Keywords'):
                    figure = plot_top_keywords_frequencies(text, n_top_keywords)
                    st.plotly_chart(figure)
            
            col21, col22 = st.columns(2)
            with col21:
                with st.expander('Sentence Statistics'):
                    sentences, noun_phrases, sentence_stats_df = get_sentence_stats(text)
                    st.dataframe(sentence_stats_df)
                    st.write('Sentences:\n', sentences)
                    st.write('Noun Phrases:\n', noun_phrases)

            with col22:
                with st.expander('The Frequency of Tokens Part-of-Speech'):
                    figure = plot_tokens_pos(tokens_stats_df)
                    st.plotly_chart(figure)

            col31, col32 = st.columns(2)
            with col31:
                with st.expander('Sentiment Analysis'):
                    sentiment_df = get_sentiment_analysis_res(text)
                    st.dataframe(sentiment_df)

            with col32:
                with st.expander('Word Frequency'):
                    fig = plot_word_frequency(text)
                    st.pyplot(fig)

        else:
            st.error('Please enter some text to analyze.')
        

if __name__ == '__main__':
    main()