TuanScientist committed on
Commit
6f6118e
·
1 Parent(s): 8e5b4f1

Upload app1.py

Browse files
Files changed (1) hide show
  1. app1.py +181 -0
app1.py ADDED
@@ -0,0 +1,181 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from transformers import RobertaForSequenceClassification, AutoTokenizer
2
+ import torch
3
+ import docx2txt
4
+ import pandas as pd
5
+ import matplotlib.pyplot as plt
6
+ import openpyxl
7
+ from openpyxl.styles import Font, Color, PatternFill
8
+ from openpyxl.styles.colors import WHITE
9
+ import gradio as gr
10
+ import underthesea
11
+
12
# Hub checkpoint id shared by the classifier weights and its tokenizer.
_SENTI_CHECKPOINT = "wonrax/phobert-base-vietnamese-sentiment"

# Both objects are created once at import time and reused by every call to
# `analyze`.
senti_model = RobertaForSequenceClassification.from_pretrained(_SENTI_CHECKPOINT)
senti_tokenizer = AutoTokenizer.from_pretrained(
    _SENTI_CHECKPOINT,
    use_fast=False,  # explicitly request the non-fast tokenizer implementation
)
15
+
16
+
17
def segmentation(sentences):
    """Word-segment every sentence with ``underthesea.word_tokenize``.

    Args:
        sentences: Iterable of raw sentence strings.

    Returns:
        A list with one string per input sentence, in input order, where the
        tokens returned by the tokenizer are joined with single spaces.
    """
    return [' '.join(underthesea.word_tokenize(raw)) for raw in sentences]
23
+
24
+
25
def analyze(sentence):
    """Return the sentiment class probabilities for one sentence.

    Args:
        sentence: Text to classify. Callers in this file pass text that has
            already gone through `segmentation`.

    Returns:
        A list of floats: the softmax of the model's logits for this single
        input, one value per output class.
    """
    # Long paragraphs must be truncated, otherwise the position-embedding
    # lookup inside the model fails with an index error.
    # NOTE(review): RoBERTa-style models start position ids at
    # padding_idx + 1, hence the "- 2" on the table size; confirm against the
    # checkpoint's config if the model is ever swapped.
    max_tokens = senti_model.config.max_position_embeddings - 2
    token_ids = senti_tokenizer.encode(
        sentence,
        truncation=True,
        max_length=max_tokens,
    )

    # The model expects a batch dimension, so wrap the ids in a list.
    input_ids = torch.tensor([token_ids])

    # Inference only: skip gradient bookkeeping.
    with torch.no_grad():
        out = senti_model(input_ids)

    results = out.logits.softmax(dim=-1).tolist()
    return results[0]
31
+
32
+
33
def read_file(docx):
    """Extract the non-blank text lines of a document via ``docx2txt``.

    Args:
        docx: Value handed straight to ``docx2txt.process`` (the callers in
            this file pass a file path).

    Returns:
        A list of whitespace-stripped, non-empty lines in document order, or
        ``None`` if anything raised while reading (the error is printed).
    """
    try:
        raw_text = docx2txt.process(docx)
        stripped = (piece.strip() for piece in raw_text.split('\n'))
        return [piece for piece in stripped if piece]
    except Exception as e:
        # Best-effort reader: report the problem and signal failure with None.
        print(f"Error reading file: {e}")
        return None
42
+
43
+
44
def process_file(docx):
    """Run sentiment analysis over every non-blank line of a .docx file.

    Args:
        docx: Path of the document, forwarded to `read_file`.

    Returns:
        A tuple ``(excel_file_path, pie_chart_name)`` with the paths returned
        by `generate_excel_file` and `generate_pie_chart`.

    Raises:
        ValueError: If `read_file` could not read the document or it contains
            no text lines.
    """
    # Parse the document once and reuse the lines for both the model input
    # and the "Text" column.
    sentences = read_file(docx)
    if not sentences:
        # read_file returns None on failure; an empty list means no text.
        # Either way there is nothing to score, so fail with a clear message
        # instead of an opaque TypeError or NaN chart further down.
        raise ValueError(f"No readable text found in file: {docx}")

    # Segment, then score each sentence.
    results = [analyze(sentence) for sentence in segmentation(sentences)]

    # One row per sentence: three scores plus the original text.
    df = pd.DataFrame(results, columns=['Negative', 'Neutral', 'Positive'])
    df['Text'] = sentences

    # Generate the pie chart and excel file.
    pie_chart_name = generate_pie_chart(df)
    excel_file_path = generate_excel_file(df)

    return excel_file_path, pie_chart_name
63
+
64
+
65
def analyze_text(text, docx_file):
    """Analyze typed text if given, otherwise an uploaded document.

    Args:
        text: Free text from the textbox; ``None``, empty and whitespace-only
            values count as "not provided".
        docx_file: Uploaded file. Either an object exposing the path as
            ``.name`` or the path itself; ``None`` when nothing was uploaded.

    Returns:
        A tuple ``(excel_file_path, pie_chart_name)``. When neither input is
        provided both elements are ``None``, so the result always has one
        value per declared output.
    """
    if text and text.strip():
        # The whole textbox content is scored as a single sentence.
        segmented_text = segmentation([text])
        results = [analyze(sentence) for sentence in segmented_text]

        df = pd.DataFrame(results, columns=['Negative', 'Neutral', 'Positive'])
        df['Text'] = [text]
        pie_chart_name = generate_pie_chart(df)
        excel_file_path = generate_excel_file(df)
        return excel_file_path, pie_chart_name

    if docx_file:
        # Accept both a file wrapper object and a plain path string.
        return process_file(getattr(docx_file, "name", docx_file))

    # No input provided: one empty value per output.
    return None, None
85
+
86
+
87
def generate_pie_chart(df):
    """Draw a pie chart of the mean score per sentiment and save it as PNG.

    Args:
        df: DataFrame with numeric 'Negative', 'Neutral' and 'Positive'
            columns.

    Returns:
        The file name the chart was saved under ('pie_chart.png', a relative
        path, so it lands in the current working directory).
    """
    sentiments = ['Negative', 'Neutral', 'Positive']

    # Column-wise averages, kept in a small frame alongside their labels.
    avg_df = pd.DataFrame({
        'Sentiment': sentiments,
        'Score': [df[name].mean() for name in sentiments],
    })

    # Slice colours, in the same order as `sentiments`.
    palette = ['#BDBDBD', '#87CEFA', '#9ACD32']

    plt.pie(
        avg_df['Score'],
        labels=avg_df['Sentiment'],
        colors=palette,
        autopct='%1.1f%%',
    )
    plt.title('Average Scores by Sentiment')

    # Write the image, then close the pyplot figure.
    pie_chart_name = 'pie_chart.png'
    plt.savefig(pie_chart_name)
    plt.close()

    return pie_chart_name
110
+
111
+
112
def generate_excel_file(df):
    """Write the per-sentence scores to 'result.xlsx' with colour coding.

    Each data row gets its highest score cell(s) and its text cell filled
    with the colour of the winning sentiment. Text on a 'Negative' row uses a
    white font, all other text a black font.

    Args:
        df: DataFrame with numeric 'Negative', 'Neutral', 'Positive' columns
            and a 'Text' column.

    Returns:
        The path of the saved workbook ('result.xlsx', relative to the
        current working directory).
    """
    sentiment_cols = ['Negative', 'Neutral', 'Positive']
    headers = sentiment_cols + ['Text']

    # Create a new workbook and worksheet.
    wb = openpyxl.Workbook()
    ws = wb.active

    # Bold header row.
    for col_num, header in enumerate(headers, 1):
        cell = ws.cell(row=1, column=col_num)
        cell.value = header
        cell.font = Font(bold=True)

    # Background colour per sentiment.
    fill_dict = {
        'Negative': PatternFill(start_color='BDBDBD', end_color='BDBDBD', fill_type='solid'),
        'Neutral': PatternFill(start_color='87CEFA', end_color='87CEFA', fill_type='solid'),
        'Positive': PatternFill(start_color='9ACD32', end_color='9ACD32', fill_type='solid'),
    }

    # Data starts on worksheet row 2. Rows are counted positionally so the
    # layout does not depend on the DataFrame's index labels.
    for row_num, (_, row_data) in enumerate(df.iterrows(), start=2):
        # Highest score and the sentiment it belongs to (first one on ties).
        scores = [row_data[col] for col in sentiment_cols]
        max_score = max(scores)
        sentiment = sentiment_cols[scores.index(max_score)]
        fill = fill_dict[sentiment]

        # Cells are looked up by column name, so the sheet always matches
        # `headers` whatever the column order of `df` is.
        for col_num, header in enumerate(headers, 1):
            value = row_data[header]
            cell = ws.cell(row=row_num, column=col_num)
            cell.value = value

            if header == 'Text':
                # Decide on the sentiment label: openpyxl stores colours as
                # 8-digit aRGB, so comparing `fill.start_color.rgb` with the
                # 6-digit 'BDBDBD' would never match.
                font_color = WHITE if sentiment == 'Negative' else Color('000000')
                cell.fill = fill
                cell.font = Font(color=font_color)
            elif value == max_score:
                cell.fill = fill

    # Save the workbook.
    excel_file_path = 'result.xlsx'
    wb.save(excel_file_path)

    return excel_file_path
160
+
161
+
162
# Input widgets, in the order `analyze_text` receives its arguments.
# NOTE(review): the `gr.inputs` / `gr.outputs` namespaces are tied to older
# Gradio releases; confirm the pinned version before upgrading.
inputs = [
    gr.inputs.Textbox(label="Nhập Văn Bản"),
    gr.inputs.File(label="Chọn Tệp Bạn Muốn Phân Tích"),
]

# Output widgets, in the order of the values returned by `analyze_text`.
outputs = [
    gr.outputs.File(label="Kết Quả Phân Tích Excel"),
    gr.outputs.Image(type="filepath", label="Thông Số Phân Tích"),
]

interface = gr.Interface(
    title="Phân Tích Cảm xúc thông qua Hội Thoại",
    fn=analyze_text,
    inputs=inputs,
    outputs=outputs,
    allow_flagging="never",  # disable the flag button
)

# Only start the web server when run as a script, not on import.
if __name__ == "__main__":
    interface.launch(share=True)