Spaces:
Runtime error
Runtime error
Duy-Anh Dang
commited on
Commit
·
0641616
1
Parent(s):
fe86c4e
load_data()
Browse files- FunctionsModelSA_V1.py +132 -56
FunctionsModelSA_V1.py
CHANGED
@@ -1,10 +1,11 @@
|
|
|
|
1 |
import pandas as pd
|
2 |
import numpy as np
|
3 |
from numpy import arange
|
4 |
-
|
5 |
import plotly.graph_objects as go
|
6 |
from nltk import tokenize
|
7 |
-
|
8 |
from PIL import ImageColor
|
9 |
from vaderSentiment.vaderSentiment import SentimentIntensityAnalyzer
|
10 |
import nltk
|
@@ -17,13 +18,18 @@ from scipy import spatial
|
|
17 |
import re
|
18 |
import pytorch_lightning as pl
|
19 |
from bs4 import BeautifulSoup
|
|
|
|
|
|
|
20 |
from transformers import BertTokenizerFast as BertTokenizer, BertModel, BertConfig
|
21 |
import torch.nn as nn
|
22 |
import torch
|
|
|
23 |
import boto3
|
|
|
24 |
from scipy import spatial
|
25 |
-
|
26 |
-
import
|
27 |
|
28 |
|
29 |
PARAMS={
|
@@ -33,11 +39,11 @@ PARAMS={
|
|
33 |
'N_EPOCHS': 10,
|
34 |
'n_classes':8,
|
35 |
'LABEL_COLUMNS': ['label_analytical', 'label_casual', 'label_confident', 'label_friendly',
|
36 |
-
'label_joyful', '
|
37 |
'label_urgent'],
|
38 |
'TEXTCOL': 'text',
|
39 |
'rf_labels':['label_analytical', 'label_casual', 'label_confident',
|
40 |
-
'label_friendly', 'label_joyful', '
|
41 |
'label_respectful', 'label_urgent',
|
42 |
'industry_Academic and Education', 'industry_Energy',
|
43 |
'industry_Entertainment', 'industry_Finance and Banking',
|
@@ -50,7 +56,7 @@ PARAMS={
|
|
50 |
'campaign_type_Usage_and_Consumption', 'campaign_type_Webinar']
|
51 |
}
|
52 |
|
53 |
-
CI_rates
|
54 |
|
55 |
### create file uploading widget
|
56 |
def email_upload():
|
@@ -174,6 +180,7 @@ def convert_text_to_tone(text,model=model,params=PARAMS):
|
|
174 |
|
175 |
data.append([plain_text,sentiment_dict,predictions])
|
176 |
final=pd.DataFrame(data,columns=['text','sentiment','sentencetone'])
|
|
|
177 |
agg_tones=final['sentencetone'].apply(np.mean,axis=0)
|
178 |
tones=pd.DataFrame(agg_tones.tolist(),columns=params['LABEL_COLUMNS'])
|
179 |
return final,tones
|
@@ -204,15 +211,15 @@ model_dict={'Open_Rate':ORM,
|
|
204 |
'Revenue_Per_Email':RVM}
|
205 |
|
206 |
|
207 |
-
|
208 |
-
def plot_CI(pred,lower,upper,scale_factor=0.5
|
209 |
"""This function plots the confidence intervals of your prediction
|
210 |
pred- The prediction varaible given from the Random Forest for the target variable
|
211 |
lower- The lower half of the prediction confidence interval
|
212 |
upper- The upper half of the confidence interval
|
213 |
scale_factor- This will modify the size of the graph """
|
214 |
|
215 |
-
|
216 |
title=f'The Predicted Value is {pred}'
|
217 |
fig = go.Figure()
|
218 |
fig.update_xaxes(showgrid=False)
|
@@ -229,59 +236,88 @@ def plot_CI(pred,lower,upper,scale_factor=0.5,streamlit=False):
|
|
229 |
fig.add_vline(x=upper,annotation_text=f"{upper}",annotation_position="top")
|
230 |
fig.add_vrect(lower,upper,fillcolor='red',opacity=0.25,annotation_text='95% CI',annotation_position="outside top")
|
231 |
fig.update_layout(title_text=title, title_x=0.5)
|
232 |
-
|
233 |
-
if streamlit:
|
234 |
-
st.plotly_chart(fig)
|
235 |
-
else:
|
236 |
-
fig.show()
|
237 |
|
238 |
def find_max_cat(df,target,industry,campaign):
|
|
|
239 |
d=df[(df[campaign]==1) & (df[industry]==1)]
|
240 |
if(len(d)>0):
|
241 |
-
rec=df.loc[d[target].idxmax()][3:11]
|
242 |
-
return round(d[target].max(),3),rec
|
243 |
else:
|
244 |
-
return 0,0
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
245 |
|
246 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
247 |
''' This function creates the recomended changes plots it takes it the tones, the changes and '''
|
|
|
248 |
fig = go.Figure()
|
249 |
fig.add_trace(go.Bar(
|
250 |
-
|
251 |
-
|
252 |
-
name='Current Tones',
|
253 |
-
orientation='h',
|
254 |
-
# text=np.round(tones.values[0],3),
|
255 |
-
width=.5,
|
256 |
-
marker=dict(
|
257 |
-
color='#00e6b1',
|
258 |
-
line=dict(color='rgba(58, 71, 80, 1.0)', width=3)
|
259 |
-
)
|
260 |
-
|
261 |
-
))
|
262 |
-
fig.add_trace(go.Bar(
|
263 |
-
y=tones.columns,
|
264 |
x=recommend_changes,
|
265 |
name='Recommend changes',
|
266 |
orientation='h',
|
267 |
text=np.round(recommend_changes,3),
|
268 |
-
width
|
269 |
marker=dict(
|
270 |
color='#e60f00',
|
271 |
-
line=dict(color='rgba(58, 71, 80, 1.0)', width=
|
272 |
)
|
273 |
))
|
274 |
fig.update_traces(textfont_size=18, textposition="outside", cliponaxis=False)
|
275 |
-
|
276 |
-
|
277 |
-
|
278 |
-
|
279 |
-
|
280 |
-
st.plotly_chart(fig)
|
281 |
else:
|
282 |
-
|
|
|
|
|
283 |
|
284 |
-
|
285 |
|
286 |
def prediction(tones,campaign_val,industry_val,target):
|
287 |
model_val=pd.DataFrame(tones,columns=PARAMS['rf_labels']).fillna(0)
|
@@ -289,15 +325,58 @@ def prediction(tones,campaign_val,industry_val,target):
|
|
289 |
model_val.loc[0,industry_val]=1
|
290 |
model=model_dict[target]
|
291 |
pred=model.predict(model_val)[0]
|
|
|
|
|
|
|
|
|
292 |
CI=CI_rates[CI_rates['model']==target]
|
293 |
lower=pred+CI['2_5'].values[0]
|
294 |
higher=pred+CI['97_5'].values[0]
|
295 |
-
return
|
296 |
|
297 |
-
|
298 |
-
|
299 |
-
|
300 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
301 |
df_unique = df.drop_duplicates()
|
302 |
df_unique = pd.get_dummies(df_unique, columns=['industry','campaign_type'])
|
303 |
df_data=df_unique.drop(columns=['Unnamed: 0','body'])
|
@@ -305,7 +384,7 @@ def load_data(buckets,key):
|
|
305 |
return df_data
|
306 |
|
307 |
|
308 |
-
def plot_table(sorted_setence_tuple
|
309 |
""" Plots the bottom most table, takes in a list of tuples where the tuple is the sentence the sentiment distance
|
310 |
from the best values """
|
311 |
sentences=list(zip(*sorted_setence_tuple))[0]
|
@@ -327,13 +406,10 @@ def plot_table(sorted_setence_tuple,streamlit=True):
|
|
327 |
align=['left','center'],
|
328 |
font=dict(family="Arial",size=12)))
|
329 |
])
|
|
|
|
|
330 |
|
331 |
-
|
332 |
-
st.plotly_chart(fig)
|
333 |
-
else:
|
334 |
-
fig.show()
|
335 |
-
|
336 |
-
def corrections(best,df,streamlit=False):
|
337 |
"""This function finds the the difference between the tone of each sentence and the best tone for the desired metric
|
338 |
best- tone values of the best email for the current categories
|
339 |
df- dataframe of the sentences of the uploaded email and the """
|
@@ -350,7 +426,7 @@ def corrections(best,df,streamlit=False):
|
|
350 |
rbg=ImageColor.getcolor(f'{col}', "RGB")
|
351 |
sentence_order.append((text,new_value,rbg))
|
352 |
sorted_sentences=sorted(sentence_order,key=lambda x: x[1],reverse=True)
|
353 |
-
plot_table(sorted_sentences
|
354 |
|
355 |
def read_file(fc):
|
356 |
with open(fc.selected) as file: # Use file to refer to the file object
|
|
|
1 |
+
import s3fs
|
2 |
import pandas as pd
|
3 |
import numpy as np
|
4 |
from numpy import arange
|
5 |
+
from colour import Color
|
6 |
import plotly.graph_objects as go
|
7 |
from nltk import tokenize
|
8 |
+
from IPython.display import Markdown
|
9 |
from PIL import ImageColor
|
10 |
from vaderSentiment.vaderSentiment import SentimentIntensityAnalyzer
|
11 |
import nltk
|
|
|
18 |
import re
|
19 |
import pytorch_lightning as pl
|
20 |
from bs4 import BeautifulSoup
|
21 |
+
import ipywidgets as widgets
|
22 |
+
from ipywidgets import FileUpload
|
23 |
+
from urlextract import URLExtract
|
24 |
from transformers import BertTokenizerFast as BertTokenizer, BertModel, BertConfig
|
25 |
import torch.nn as nn
|
26 |
import torch
|
27 |
+
from ipywidgets import interact, Dropdown
|
28 |
import boto3
|
29 |
+
from sagemaker import get_execution_role
|
30 |
from scipy import spatial
|
31 |
+
from ipyfilechooser import FileChooser
|
32 |
+
import random
|
33 |
|
34 |
|
35 |
PARAMS={
|
|
|
39 |
'N_EPOCHS': 10,
|
40 |
'n_classes':8,
|
41 |
'LABEL_COLUMNS': ['label_analytical', 'label_casual', 'label_confident', 'label_friendly',
|
42 |
+
'label_joyful', 'label_optimistic', 'label_respectful',
|
43 |
'label_urgent'],
|
44 |
'TEXTCOL': 'text',
|
45 |
'rf_labels':['label_analytical', 'label_casual', 'label_confident',
|
46 |
+
'label_friendly', 'label_joyful', 'label_optimistic',
|
47 |
'label_respectful', 'label_urgent',
|
48 |
'industry_Academic and Education', 'industry_Energy',
|
49 |
'industry_Entertainment', 'industry_Finance and Banking',
|
|
|
56 |
'campaign_type_Usage_and_Consumption', 'campaign_type_Webinar']
|
57 |
}
|
58 |
|
59 |
+
# Per-model confidence-interval table, read from CI_RATES.csv when the module is imported.
# prediction() filters it on the 'model' column and adds the '2_5' / '97_5' values to the point prediction.
CI_rates=pd.read_csv('CI_RATES.csv')
|
60 |
|
61 |
### create file uploading widget
|
62 |
def email_upload():
|
|
|
180 |
|
181 |
data.append([plain_text,sentiment_dict,predictions])
|
182 |
final=pd.DataFrame(data,columns=['text','sentiment','sentencetone'])
|
183 |
+
# print(final)
|
184 |
agg_tones=final['sentencetone'].apply(np.mean,axis=0)
|
185 |
tones=pd.DataFrame(agg_tones.tolist(),columns=params['LABEL_COLUMNS'])
|
186 |
return final,tones
|
|
|
211 |
'Revenue_Per_Email':RVM}
|
212 |
|
213 |
|
214 |
+
## Plot confidence interval
|
215 |
+
def plot_CI(pred,lower,upper,scale_factor=0.5):
|
216 |
"""This function plots the confidence intervals of your prediction
|
217 |
pred- The prediction varaible given from the Random Forest for the target variable
|
218 |
lower- The lower half of the prediction confidence interval
|
219 |
upper- The upper half of the confidence interval
|
220 |
scale_factor- This will modify the size of the graph """
|
221 |
|
222 |
+
|
223 |
title=f'The Predicted Value is {pred}'
|
224 |
fig = go.Figure()
|
225 |
fig.update_xaxes(showgrid=False)
|
|
|
236 |
fig.add_vline(x=upper,annotation_text=f"{upper}",annotation_position="top")
|
237 |
fig.add_vrect(lower,upper,fillcolor='red',opacity=0.25,annotation_text='95% CI',annotation_position="outside top")
|
238 |
fig.update_layout(title_text=title, title_x=0.5)
|
239 |
+
fig.show()
|
|
|
|
|
|
|
|
|
240 |
|
241 |
def find_max_cat(df,target,industry,campaign):
    """Find the best historical value of `target` for one industry/campaign pair.

    df- dataframe holding the target column and the one-hot industry/campaign columns
    target- name of the metric column to maximise
    industry- name of the one-hot industry column to filter on (1 == match)
    campaign- name of the one-hot campaign column to filter on (1 == match)

    Returns a tuple (lowest, highest, rec): the minimum and maximum of `target`
    over the matching rows, both rounded to 3 decimals, and `rec`, the values at
    column positions 3..10 of the row with the highest target.
    Returns (0, 0, 0) when no row matches both filters.
    """
    matches = df[(df[campaign] == 1) & (df[industry] == 1)]
    if matches.empty:
        return 0, 0, 0

    scores = matches[target]
    # NOTE(review): positions 3..10 are presumably the eight tone columns - confirm
    # against the column order produced by load_data(). `.iloc` makes the slice
    # explicitly positional instead of relying on how `[]` treats integer slices.
    rec = df.loc[scores.idxmax()].iloc[3:11]
    return round(scores.min(), 3), round(scores.max(), 3), rec
|
249 |
+
|
250 |
+
|
251 |
+
def scale_values(val, tn):
    """Scale a tone value by 100.

    val- slider value; accepted but not used, the result depends on tn only
    tn- current tone value

    Returns tn multiplied by 100.
    """
    # NOTE(review): `val` is never read - confirm that ignoring the slider value is intended.
    return tn * 100
|
254 |
+
|
255 |
+
# Display names for the tone classes, used as the bar labels (y axis) in recommend() and intensity_changes().
# Listed in the same order as PARAMS['LABEL_COLUMNS'].
tone_labels = ['Analytical', 'Casual', 'Confident', 'Friendly', 'Joyful', 'Optimistic', 'Respectful', 'Urgent']
|
256 |
+
|
257 |
+
# ## Plot recommendations - ORIGINAL FROM V1.0
|
258 |
+
# def recommend(tones,recommend_changes,change,target):
|
259 |
+
# ''' This function creates the recomended changes plots it takes it the tones, the changes and '''
|
260 |
+
# fig = go.Figure()
|
261 |
+
# fig.add_trace(go.Bar(
|
262 |
+
# y=tones.columns,
|
263 |
+
# x=tones.values[0],
|
264 |
+
# name='Current Tones',
|
265 |
+
# orientation='h',
|
266 |
+
# # text=np.round(tones.values[0],3),
|
267 |
+
# width=.9,
|
268 |
+
# marker=dict(
|
269 |
+
# color='#00e6b1',
|
270 |
+
# line=dict(color='rgba(58, 71, 80, 1.0)', width=3)
|
271 |
+
# )
|
272 |
|
273 |
+
# ))
|
274 |
+
# fig.add_trace(go.Bar(
|
275 |
+
# y=tones.columns,
|
276 |
+
# x=recommend_changes,
|
277 |
+
# name='Recommend changes',
|
278 |
+
# orientation='h',
|
279 |
+
# text=np.round(recommend_changes,3),
|
280 |
+
# width=.5,
|
281 |
+
# marker=dict(
|
282 |
+
# color='#e60f00',
|
283 |
+
# line=dict(color='rgba(58, 71, 80, 1.0)', width=3)
|
284 |
+
# )
|
285 |
+
# ))
|
286 |
+
# fig.update_traces(textfont_size=18, textposition="outside", cliponaxis=False)
|
287 |
+
# fig.update_layout(height=1000, plot_bgcolor='white')
|
288 |
+
# fig.update_layout(barmode='stack', yaxis={'categoryorder':'array','categoryarray': recommend_changes.sort_values(key=abs,ascending=True).index})
|
289 |
+
# fig.update_layout(title_text=f'The following Changes will yield a {round(change,3)} increase in {target}')
|
290 |
+
# fig.show()
|
291 |
+
|
292 |
+
## Plot recommendations - MODIFIED
|
293 |
+
def recommend(tones,recommend_changes,change,target):
    """Plot the recommended tone changes as a horizontal bar chart and show it.

    tones- current tone values; kept for interface compatibility, not used by the plot
    recommend_changes- one change value per entry of `tone_labels`, drawn as the bar lengths
        (assumes it is aligned with `tone_labels` - TODO confirm with the caller)
    change- expected change in the target; shown as dollars for 'Revenue_Per_Email',
        otherwise treated as a fraction and shown as a whole-number percentage
    target- name of the target metric, used in the chart title
    """
    fig = go.Figure()
    fig.add_trace(go.Bar(
        y=tone_labels,
        x=recommend_changes,
        name='Recommend changes',
        orientation='h',
        text=np.round(recommend_changes,3),
        width=.5,
        marker=dict(
            color='#e60f00',
            line=dict(color='rgba(58, 71, 80, 1.0)', width=1)
        )
    ))
    fig.update_traces(textfont_size=18, textposition="outside", cliponaxis=False)

    if target == 'Revenue_Per_Email':
        out = f"${round(change,2)}"
    else:
        # Scale first, then format: round(change,2)*100 leaks binary floating-point
        # error into the title (0.07 -> '7.000000000000001%').
        out = f"{change * 100:.0f}%"
    fig.update_layout(title_text=f'The following Changes will yield a {out} increase in {target}')
    fig.show()
|
320 |
|
|
|
321 |
|
322 |
def prediction(tones,campaign_val,industry_val,target):
|
323 |
model_val=pd.DataFrame(tones,columns=PARAMS['rf_labels']).fillna(0)
|
|
|
325 |
model_val.loc[0,industry_val]=1
|
326 |
model=model_dict[target]
|
327 |
pred=model.predict(model_val)[0]
|
328 |
+
|
329 |
+
# y_pred = regr.predict(X_test)
|
330 |
+
# r2_test = r2_score(y_test, y_pred)
|
331 |
+
|
332 |
CI=CI_rates[CI_rates['model']==target]
|
333 |
lower=pred+CI['2_5'].values[0]
|
334 |
higher=pred+CI['97_5'].values[0]
|
335 |
+
return pred,round(lower,3),round(higher,3),model
|
336 |
|
337 |
+
|
338 |
+
## Plot recommendations for intensity changes
|
339 |
+
def intensity_changes(tones,recommend_changes,change,target):
    """Plot the changes made to the tone intensities and show the resulting change in the target.

    tones- current tone values; kept for interface compatibility, not used by the plot
    recommend_changes- one change value per entry of `tone_labels`, drawn as the bar lengths
        (assumes it is aligned with `tone_labels` - TODO confirm with the caller)
    change- resulting change in the target; a negative value is reported as a decrease,
        anything else as an increase
    target- name of the target metric, used in the chart title
    """
    fig = go.Figure()
    fig.add_trace(go.Bar(
        y=tone_labels,
        x=recommend_changes,
        name='Recommend changes',
        orientation='h',
        text=np.round(recommend_changes,3),
        width=.5,
        marker=dict(
            color='#00e6b1',
            line=dict(color='rgba(58, 71, 80, 1.0)', width=1)
        )
    ))
    fig.update_traces(textfont_size=18, textposition="outside", cliponaxis=False)

    # The title always reports a positive magnitude; the sign picks the verb.
    direction = 'decrease' if change < 0 else 'increase'
    magnitude = abs(change)
    if target == 'Revenue_Per_Email':
        out = f"${round(magnitude,2)}"
    else:
        # Both directions are scaled to a percentage the same way. The decrease branch
        # used to print the raw fraction (0.05 -> '0.05%') and the increase branch
        # leaked float error via round(change,2)*100 (0.07 -> '7.000000000000001%').
        out = f"{magnitude * 100:.0f}%"
    fig.update_layout(title_text=f'The following Changes will {direction} the {target} by {out}')
    fig.show()
|
375 |
+
|
376 |
+
|
377 |
+
def load_data():
|
378 |
+
data_location='Tone_and_target.csv'
|
379 |
+
df=pd.read_csv(data_location)
|
380 |
df_unique = df.drop_duplicates()
|
381 |
df_unique = pd.get_dummies(df_unique, columns=['industry','campaign_type'])
|
382 |
df_data=df_unique.drop(columns=['Unnamed: 0','body'])
|
|
|
384 |
return df_data
|
385 |
|
386 |
|
387 |
+
def plot_table(sorted_setence_tuple):
|
388 |
""" Plots the bottom most table, takes in a list of tuples where the tuple is the sentence the sentiment distance
|
389 |
from the best values """
|
390 |
sentences=list(zip(*sorted_setence_tuple))[0]
|
|
|
406 |
align=['left','center'],
|
407 |
font=dict(family="Arial",size=12)))
|
408 |
])
|
409 |
+
|
410 |
+
#fig.show()
|
411 |
|
412 |
+
def corrections(best,df):
|
|
|
|
|
|
|
|
|
|
|
413 |
"""This function finds the the difference between the tone of each sentence and the best tone for the desired metric
|
414 |
best- tone values of the best email for the current categories
|
415 |
df- dataframe of the sentences of the uploaded email and the """
|
|
|
426 |
rbg=ImageColor.getcolor(f'{col}', "RGB")
|
427 |
sentence_order.append((text,new_value,rbg))
|
428 |
sorted_sentences=sorted(sentence_order,key=lambda x: x[1],reverse=True)
|
429 |
+
plot_table(sorted_sentences)
|
430 |
|
431 |
def read_file(fc):
|
432 |
with open(fc.selected) as file: # Use file to refer to the file object
|