Duy-Anh Dang committed on
Commit
0641616
·
1 Parent(s): fe86c4e

load_data()

Browse files
Files changed (1) hide show
  1. FunctionsModelSA_V1.py +132 -56
FunctionsModelSA_V1.py CHANGED
@@ -1,10 +1,11 @@
 
1
  import pandas as pd
2
  import numpy as np
3
  from numpy import arange
4
- # from colour import Color
5
  import plotly.graph_objects as go
6
  from nltk import tokenize
7
- # from IPython.display import Markdown
8
  from PIL import ImageColor
9
  from vaderSentiment.vaderSentiment import SentimentIntensityAnalyzer
10
  import nltk
@@ -17,13 +18,18 @@ from scipy import spatial
17
  import re
18
  import pytorch_lightning as pl
19
  from bs4 import BeautifulSoup
 
 
 
20
  from transformers import BertTokenizerFast as BertTokenizer, BertModel, BertConfig
21
  import torch.nn as nn
22
  import torch
 
23
  import boto3
 
24
  from scipy import spatial
25
- import streamlit as st
26
- import utils
27
 
28
 
29
  PARAMS={
@@ -33,11 +39,11 @@ PARAMS={
33
  'N_EPOCHS': 10,
34
  'n_classes':8,
35
  'LABEL_COLUMNS': ['label_analytical', 'label_casual', 'label_confident', 'label_friendly',
36
- 'label_joyful', 'label_opstimistic', 'label_respectful',
37
  'label_urgent'],
38
  'TEXTCOL': 'text',
39
  'rf_labels':['label_analytical', 'label_casual', 'label_confident',
40
- 'label_friendly', 'label_joyful', 'label_opstimistic',
41
  'label_respectful', 'label_urgent',
42
  'industry_Academic and Education', 'industry_Energy',
43
  'industry_Entertainment', 'industry_Finance and Banking',
@@ -50,7 +56,7 @@ PARAMS={
50
  'campaign_type_Usage_and_Consumption', 'campaign_type_Webinar']
51
  }
52
 
53
- CI_rates = utils.get_files_from_aws('emailcampaignmodeldata','ModelSADataSets/CI_RATES.csv')
54
 
55
  ### create file uploading widget
56
  def email_upload():
@@ -174,6 +180,7 @@ def convert_text_to_tone(text,model=model,params=PARAMS):
174
 
175
  data.append([plain_text,sentiment_dict,predictions])
176
  final=pd.DataFrame(data,columns=['text','sentiment','sentencetone'])
 
177
  agg_tones=final['sentencetone'].apply(np.mean,axis=0)
178
  tones=pd.DataFrame(agg_tones.tolist(),columns=params['LABEL_COLUMNS'])
179
  return final,tones
@@ -204,15 +211,15 @@ model_dict={'Open_Rate':ORM,
204
  'Revenue_Per_Email':RVM}
205
 
206
 
207
-
208
- def plot_CI(pred,lower,upper,scale_factor=0.5,streamlit=False):
209
  """This function plots the confidence intervals of your prediction
210
  pred- The prediction varaible given from the Random Forest for the target variable
211
  lower- The lower half of the prediction confidence interval
212
  upper- The upper half of the confidence interval
213
  scale_factor- This will modify the size of the graph """
214
 
215
-
216
  title=f'The Predicted Value is {pred}'
217
  fig = go.Figure()
218
  fig.update_xaxes(showgrid=False)
@@ -229,59 +236,88 @@ def plot_CI(pred,lower,upper,scale_factor=0.5,streamlit=False):
229
  fig.add_vline(x=upper,annotation_text=f"{upper}",annotation_position="top")
230
  fig.add_vrect(lower,upper,fillcolor='red',opacity=0.25,annotation_text='95% CI',annotation_position="outside top")
231
  fig.update_layout(title_text=title, title_x=0.5)
232
-
233
- if streamlit:
234
- st.plotly_chart(fig)
235
- else:
236
- fig.show()
237
 
238
  def find_max_cat(df,target,industry,campaign):
 
239
  d=df[(df[campaign]==1) & (df[industry]==1)]
240
  if(len(d)>0):
241
- rec=df.loc[d[target].idxmax()][3:11]
242
- return round(d[target].max(),3),rec
243
  else:
244
- return 0,0
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
245
 
246
- def recommend(tones,recommend_changes,change,target,streamlit=False):
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
247
  ''' This function creates the recomended changes plots it takes it the tones, the changes and '''
 
248
  fig = go.Figure()
249
  fig.add_trace(go.Bar(
250
- y=tones.columns,
251
- x=tones.values[0],
252
- name='Current Tones',
253
- orientation='h',
254
- # text=np.round(tones.values[0],3),
255
- width=.5,
256
- marker=dict(
257
- color='#00e6b1',
258
- line=dict(color='rgba(58, 71, 80, 1.0)', width=3)
259
- )
260
-
261
- ))
262
- fig.add_trace(go.Bar(
263
- y=tones.columns,
264
  x=recommend_changes,
265
  name='Recommend changes',
266
  orientation='h',
267
  text=np.round(recommend_changes,3),
268
- width=0.3,
269
  marker=dict(
270
  color='#e60f00',
271
- line=dict(color='rgba(58, 71, 80, 1.0)', width=3)
272
  )
273
  ))
274
  fig.update_traces(textfont_size=18, textposition="outside", cliponaxis=False)
275
- fig.update_layout(height=700, plot_bgcolor='white')
276
- fig.update_layout(barmode='stack', yaxis={'categoryorder':'array','categoryarray': recommend_changes.sort_values(key=abs,ascending=True).index})
277
- fig.update_layout(title_text=f'The following Changes will yield a {round(change,3)} increase in {target}')
278
-
279
- if streamlit:
280
- st.plotly_chart(fig)
281
  else:
282
- fig.show()
 
 
283
 
284
-
285
 
286
  def prediction(tones,campaign_val,industry_val,target):
287
  model_val=pd.DataFrame(tones,columns=PARAMS['rf_labels']).fillna(0)
@@ -289,15 +325,58 @@ def prediction(tones,campaign_val,industry_val,target):
289
  model_val.loc[0,industry_val]=1
290
  model=model_dict[target]
291
  pred=model.predict(model_val)[0]
 
 
 
 
292
  CI=CI_rates[CI_rates['model']==target]
293
  lower=pred+CI['2_5'].values[0]
294
  higher=pred+CI['97_5'].values[0]
295
- return round(pred,3),round(lower,3),round(higher,3),model
296
 
297
- def load_data(buckets,key):
298
- # data_location='Tone_and_target.csv'
299
- # data=pd.read_csv(data_location)
300
- df=utils.get_files_from_aws(buckets,key)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
301
  df_unique = df.drop_duplicates()
302
  df_unique = pd.get_dummies(df_unique, columns=['industry','campaign_type'])
303
  df_data=df_unique.drop(columns=['Unnamed: 0','body'])
@@ -305,7 +384,7 @@ def load_data(buckets,key):
305
  return df_data
306
 
307
 
308
- def plot_table(sorted_setence_tuple,streamlit=True):
309
  """ Plots the bottom most table, takes in a list of tuples where the tuple is the sentence the sentiment distance
310
  from the best values """
311
  sentences=list(zip(*sorted_setence_tuple))[0]
@@ -327,13 +406,10 @@ def plot_table(sorted_setence_tuple,streamlit=True):
327
  align=['left','center'],
328
  font=dict(family="Arial",size=12)))
329
  ])
 
 
330
 
331
- if streamlit:
332
- st.plotly_chart(fig)
333
- else:
334
- fig.show()
335
-
336
- def corrections(best,df,streamlit=False):
337
  """This function finds the the difference between the tone of each sentence and the best tone for the desired metric
338
  best- tone values of the best email for the current categories
339
  df- dataframe of the sentences of the uploaded email and the """
@@ -350,7 +426,7 @@ def corrections(best,df,streamlit=False):
350
  rbg=ImageColor.getcolor(f'{col}', "RGB")
351
  sentence_order.append((text,new_value,rbg))
352
  sorted_sentences=sorted(sentence_order,key=lambda x: x[1],reverse=True)
353
- plot_table(sorted_sentences,streamlit)
354
 
355
  def read_file(fc):
356
  with open(fc.selected) as file: # Use file to refer to the file object
 
1
+ import s3fs
2
  import pandas as pd
3
  import numpy as np
4
  from numpy import arange
5
+ from colour import Color
6
  import plotly.graph_objects as go
7
  from nltk import tokenize
8
+ from IPython.display import Markdown
9
  from PIL import ImageColor
10
  from vaderSentiment.vaderSentiment import SentimentIntensityAnalyzer
11
  import nltk
 
18
  import re
19
  import pytorch_lightning as pl
20
  from bs4 import BeautifulSoup
21
+ import ipywidgets as widgets
22
+ from ipywidgets import FileUpload
23
+ from urlextract import URLExtract
24
  from transformers import BertTokenizerFast as BertTokenizer, BertModel, BertConfig
25
  import torch.nn as nn
26
  import torch
27
+ from ipywidgets import interact, Dropdown
28
  import boto3
29
+ from sagemaker import get_execution_role
30
  from scipy import spatial
31
+ from ipyfilechooser import FileChooser
32
+ import random
33
 
34
 
35
  PARAMS={
 
39
  'N_EPOCHS': 10,
40
  'n_classes':8,
41
  'LABEL_COLUMNS': ['label_analytical', 'label_casual', 'label_confident', 'label_friendly',
42
+ 'label_joyful', 'label_optimistic', 'label_respectful',
43
  'label_urgent'],
44
  'TEXTCOL': 'text',
45
  'rf_labels':['label_analytical', 'label_casual', 'label_confident',
46
+ 'label_friendly', 'label_joyful', 'label_optimistic',
47
  'label_respectful', 'label_urgent',
48
  'industry_Academic and Education', 'industry_Energy',
49
  'industry_Entertainment', 'industry_Finance and Banking',
 
56
  'campaign_type_Usage_and_Consumption', 'campaign_type_Webinar']
57
  }
58
 
59
+ CI_rates=pd.read_csv('CI_RATES.csv')
60
 
61
  ### create file uploading widget
62
  def email_upload():
 
180
 
181
  data.append([plain_text,sentiment_dict,predictions])
182
  final=pd.DataFrame(data,columns=['text','sentiment','sentencetone'])
183
+ # print(final)
184
  agg_tones=final['sentencetone'].apply(np.mean,axis=0)
185
  tones=pd.DataFrame(agg_tones.tolist(),columns=params['LABEL_COLUMNS'])
186
  return final,tones
 
211
  'Revenue_Per_Email':RVM}
212
 
213
 
214
+ ## Plot confidence interval
215
+ def plot_CI(pred,lower,upper,scale_factor=0.5):
216
  """This function plots the confidence intervals of your prediction
217
  pred- The prediction varaible given from the Random Forest for the target variable
218
  lower- The lower half of the prediction confidence interval
219
  upper- The upper half of the confidence interval
220
  scale_factor- This will modify the size of the graph """
221
 
222
+
223
  title=f'The Predicted Value is {pred}'
224
  fig = go.Figure()
225
  fig.update_xaxes(showgrid=False)
 
236
  fig.add_vline(x=upper,annotation_text=f"{upper}",annotation_position="top")
237
  fig.add_vrect(lower,upper,fillcolor='red',opacity=0.25,annotation_text='95% CI',annotation_position="outside top")
238
  fig.update_layout(title_text=title, title_x=0.5)
239
+ fig.show()
 
 
 
 
240
 
def find_max_cat(df,target,industry,campaign):
    """Summarise `target` over the rows flagged for one industry and campaign.

    df       -- DataFrame holding the `target`, `industry` and `campaign` columns
    target   -- name of the column whose minimum and maximum are reported
    industry -- name of an indicator column; rows with value 1 are kept
    campaign -- name of an indicator column; rows with value 1 are kept

    Returns (min, max, rec): min and max of `target` over the kept rows, both
    rounded to 3 decimals, and `rec`, positions 3..10 of the row with the
    largest `target` (presumably the eight tone columns -- confirm against the
    column order of `df`). Returns (0, 0, 0) when no row matches.
    """
    # Keep only rows where both indicator columns are switched on (1 == True).
    matches = df[(df[campaign] == 1) & (df[industry] == 1)]
    if len(matches) == 0:
        return 0, 0, 0

    scores = matches[target]
    # Look the winning row up by label in the full frame, then slice by position.
    best_row = df.loc[scores.idxmax()]
    lowest = round(scores.min(), 3)
    highest = round(scores.max(), 3)
    return lowest, highest, best_row[3:11]
249
+
250
+
251
def scale_values(val, tn):
    """Return the tone value `tn` multiplied by 100.

    val -- slider value; accepted but not used in the result
    tn  -- current tone value
    """
    # NOTE(review): `val` never influences the result -- confirm this is intended.
    return tn * 100
254
+
255
# Display names used as the y-axis categories of the tone bar charts.
tone_labels = [
    'Analytical',
    'Casual',
    'Confident',
    'Friendly',
    'Joyful',
    'Optimistic',
    'Respectful',
    'Urgent',
]
256
+
257
+ # ## Plot recommendations - ORIGINAL FROM V1.0
258
+ # def recommend(tones,recommend_changes,change,target):
259
+ # ''' This function creates the recomended changes plots it takes it the tones, the changes and '''
260
+ # fig = go.Figure()
261
+ # fig.add_trace(go.Bar(
262
+ # y=tones.columns,
263
+ # x=tones.values[0],
264
+ # name='Current Tones',
265
+ # orientation='h',
266
+ # # text=np.round(tones.values[0],3),
267
+ # width=.9,
268
+ # marker=dict(
269
+ # color='#00e6b1',
270
+ # line=dict(color='rgba(58, 71, 80, 1.0)', width=3)
271
+ # )
272
 
273
+ # ))
274
+ # fig.add_trace(go.Bar(
275
+ # y=tones.columns,
276
+ # x=recommend_changes,
277
+ # name='Recommend changes',
278
+ # orientation='h',
279
+ # text=np.round(recommend_changes,3),
280
+ # width=.5,
281
+ # marker=dict(
282
+ # color='#e60f00',
283
+ # line=dict(color='rgba(58, 71, 80, 1.0)', width=3)
284
+ # )
285
+ # ))
286
+ # fig.update_traces(textfont_size=18, textposition="outside", cliponaxis=False)
287
+ # fig.update_layout(height=1000, plot_bgcolor='white')
288
+ # fig.update_layout(barmode='stack', yaxis={'categoryorder':'array','categoryarray': recommend_changes.sort_values(key=abs,ascending=True).index})
289
+ # fig.update_layout(title_text=f'The following Changes will yield a {round(change,3)} increase in {target}')
290
+ # fig.show()
291
+
292
+ ## Plot recommendations - MODIFIED
293
def recommend(tones,recommend_changes,change,target):
    """Show a horizontal bar chart of the recommended tone changes.

    tones             -- current tone values; kept for interface compatibility,
                         not read by this version of the plot
    recommend_changes -- values plotted along x, one bar per entry of
                         `tone_labels` (presumably eight values -- confirm
                         against the caller)
    change            -- expected improvement in `target`, reported in the title
    target            -- name of the predicted metric; 'Revenue_Per_Email' is
                         reported in dollars, anything else as a percentage

    Returns None; the figure is displayed with `fig.show()`.
    """
    if target == 'Revenue_Per_Email':
        out = f"${round(change,2)}"
    else:
        # Let the format spec scale to percent. `round(change, 2) * 100`
        # leaked float artefacts such as "7.000000000000001%" into the title.
        out = f"{change:.0%}"

    bar_style = dict(
        color='#e60f00',
        line=dict(color='rgba(58, 71, 80, 1.0)', width=1),
    )
    fig = go.Figure()
    fig.add_trace(go.Bar(
        y=tone_labels,
        x=recommend_changes,
        name='Recommend changes',
        orientation='h',
        text=np.round(recommend_changes,3),
        width=.5,
        marker=bar_style,
    ))
    fig.update_traces(textfont_size=18, textposition="outside", cliponaxis=False)
    fig.update_layout(title_text=f'The following Changes will yield a {out} increase in {target}')
    fig.show()
320
 
 
321
 
322
  def prediction(tones,campaign_val,industry_val,target):
323
  model_val=pd.DataFrame(tones,columns=PARAMS['rf_labels']).fillna(0)
 
325
  model_val.loc[0,industry_val]=1
326
  model=model_dict[target]
327
  pred=model.predict(model_val)[0]
328
+
329
+ # y_pred = regr.predict(X_test)
330
+ # r2_test = r2_score(y_test, y_pred)
331
+
332
  CI=CI_rates[CI_rates['model']==target]
333
  lower=pred+CI['2_5'].values[0]
334
  higher=pred+CI['97_5'].values[0]
335
+ return pred,round(lower,3),round(higher,3),model
336
 
337
+
338
+ ## Plot recommendations for intensity changes
339
def intensity_changes(tones,recommend_changes,change,target):
    """Show a bar chart of the tone-intensity changes and their effect on `target`.

    tones             -- current tone values; kept for interface compatibility,
                         not read by this version of the plot
    recommend_changes -- values plotted along x, one bar per entry of
                         `tone_labels` (presumably eight values -- confirm
                         against the caller)
    change            -- resulting change in `target`; a negative value is
                         reported as a decrease, anything else as an increase
    target            -- name of the predicted metric; 'Revenue_Per_Email' is
                         reported in dollars, anything else as a percentage

    Returns None; the figure is displayed with `fig.show()`.
    """
    # The title always reports a positive magnitude plus a direction word.
    if change < 0:
        direction = 'decrease'
        magnitude = -change
    else:
        direction = 'increase'
        magnitude = change

    if target == 'Revenue_Per_Email':
        out = f"${round(magnitude,2)}"
    else:
        # Both directions are scaled to percent here. The decrease branch used
        # to print the raw fraction with a "%" sign, and the increase branch
        # leaked float artefacts from `round(change, 2) * 100`.
        out = f"{magnitude:.0%}"

    bar_style = dict(
        color='#00e6b1',
        line=dict(color='rgba(58, 71, 80, 1.0)', width=1),
    )
    fig = go.Figure()
    fig.add_trace(go.Bar(
        y=tone_labels,
        x=recommend_changes,
        name='Recommend changes',
        orientation='h',
        text=np.round(recommend_changes,3),
        width=.5,
        marker=bar_style,
    ))
    fig.update_traces(textfont_size=18, textposition="outside", cliponaxis=False)
    fig.update_layout(title_text=f'The following Changes will {direction} the {target} by {out}')
    fig.show()
375
+
376
+
377
+ def load_data():
378
+ data_location='Tone_and_target.csv'
379
+ df=pd.read_csv(data_location)
380
  df_unique = df.drop_duplicates()
381
  df_unique = pd.get_dummies(df_unique, columns=['industry','campaign_type'])
382
  df_data=df_unique.drop(columns=['Unnamed: 0','body'])
 
384
  return df_data
385
 
386
 
387
+ def plot_table(sorted_setence_tuple):
388
  """ Plots the bottom most table, takes in a list of tuples where the tuple is the sentence the sentiment distance
389
  from the best values """
390
  sentences=list(zip(*sorted_setence_tuple))[0]
 
406
  align=['left','center'],
407
  font=dict(family="Arial",size=12)))
408
  ])
409
+
410
+ #fig.show()
411
 
412
+ def corrections(best,df):
 
 
 
 
 
413
  """This function finds the the difference between the tone of each sentence and the best tone for the desired metric
414
  best- tone values of the best email for the current categories
415
  df- dataframe of the sentences of the uploaded email and the """
 
426
  rbg=ImageColor.getcolor(f'{col}', "RGB")
427
  sentence_order.append((text,new_value,rbg))
428
  sorted_sentences=sorted(sentence_order,key=lambda x: x[1],reverse=True)
429
+ plot_table(sorted_sentences)
430
 
431
  def read_file(fc):
432
  with open(fc.selected) as file: # Use file to refer to the file object