Joshua1808 committed on
Commit
0e521b1
·
1 Parent(s): bc04379

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +116 -58
app.py CHANGED
@@ -58,6 +58,7 @@ def preprocess(text):
58
  text=" ".join(text.split())
59
  return text
60
 
 
61
  def highlight_survived(s):
62
  return ['background-color: red']*len(s) if (s.Sexista == 1) else ['background-color: green']*len(s)
63
 
@@ -85,6 +86,118 @@ with colT2:
85
  font-size:16px ; font-family: 'Times New Roman'; color: #3358ff;}
86
  </style> """, unsafe_allow_html=True)
87
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
88
 
89
  def run():
90
  with st.form("my_form"):
@@ -107,65 +220,10 @@ def run():
107
 
108
  if (error == False):
109
  if (termino):
110
- #new_search = search_words + " -filter:retweets"
111
- #tweets =tw.Cursor(api.search_tweets,q=new_search,lang="es").items(number_of_tweets)
112
- # Tokenizar la frase
113
- tokens = tokenizer.tokenize(search_words)
114
- # Convertir los tokens a un formato compatible con el modelo
115
- input_ids = tokenizer.convert_tokens_to_ids(tokens)
116
- attention_masks = [1] * len(input_ids)
117
- # Pasar los tokens al modelo
118
- outputs = model(torch.tensor([input_ids]), token_type_ids=None, attention_mask=torch.tensor([attention_masks]))
119
-
120
- # Obtener la probabilidad de que la frase sea "sexista"
121
- probabilidad_sexista = outputs[0][0][1].item()
122
- print(probabilidad_sexista)
123
- # Crear un Dataframe
124
- text= pd.DataFrame({'palabra': [search_words],'probabilidad':[probabilidad_sexista]})
125
- #print(text)
126
- st.table(text)
127
 
128
  elif (usuario):
129
- tweets = api.user_timeline(screen_name = search_words,count=number_of_tweets)
130
- tweet_list = [i.text for i in tweets]
131
- text= pd.DataFrame(tweet_list)
132
- text[0] = text[0].apply(preprocess_tweet)
133
- text1=text[0].values
134
- indices1=tokenizer.batch_encode_plus(text1.tolist(),max_length=128,add_special_tokens=True, return_attention_mask=True,pad_to_max_length=True,truncation=True)
135
- input_ids1=indices1["input_ids"]
136
- attention_masks1=indices1["attention_mask"]
137
- prediction_inputs1= torch.tensor(input_ids1)
138
- prediction_masks1 = torch.tensor(attention_masks1)
139
- # Set the batch size.
140
- batch_size = 25
141
- # Create the DataLoader.
142
- prediction_data1 = TensorDataset(prediction_inputs1, prediction_masks1)
143
- prediction_sampler1 = SequentialSampler(prediction_data1)
144
- prediction_dataloader1 = DataLoader(prediction_data1, sampler=prediction_sampler1, batch_size=batch_size)
145
- print('Predicting labels for {:,} test sentences...'.format(len(prediction_inputs1)))
146
- # Pone el modelo en modo evaluación
147
- model.eval()
148
- # Variables de Seguimiento
149
- predictions = []
150
- # Predict
151
- for batch in prediction_dataloader1:
152
- batch = tuple(t.to(device) for t in batch)
153
- # Descomprimir las entradas de nuestro cargador de datos
154
- b_input_ids1, b_input_mask1 = batch
155
- # Decirle al modelo que no calcule ni almacene gradientes, ahorrando memoria y # acelerando la predicción.
156
- with torch.no_grad():
157
- # Forward pass, calculate logit predictions
158
- outputs1 = model(b_input_ids1, token_type_ids=None,attention_mask=b_input_mask1)
159
- logits1 = outputs1[0]
160
- # Move logits and labels to CPU
161
- logits1 = logits1.detach().cpu().numpy()
162
- # Store predictions and true labels
163
- predictions.append(logits1)
164
- flat_predictions = [item for sublist in predictions for item in sublist]
165
- flat_predictions = np.argmax(flat_predictions, axis=1).flatten()#p = [i for i in classifier(tweet_list)]
166
- df = pd.DataFrame(list(zip(tweet_list, flat_predictions)),columns =['Últimos '+ str(number_of_tweets)+' Tweets'+' de '+search_words, 'Sexista'])
167
- df['Sexista']= np.where(df['Sexista']== 0, 'No Sexistas', 'Sexistas')
168
-
169
- st.table(df.reset_index(drop=True).head(20).style.applymap(color_survived, subset=['Sexista']))
170
 
171
  run()
 
58
  text=" ".join(text.split())
59
  return text
60
 
61
+
62
  def highlight_survived(s):
63
  return ['background-color: red']*len(s) if (s.Sexista == 1) else ['background-color: green']*len(s)
64
 
 
86
  font-size:16px ; font-family: 'Times New Roman'; color: #3358ff;}
87
  </style> """, unsafe_allow_html=True)
88
 
89
+
90
+
91
+
92
+
93
def analizar_tweets(search_words, number_of_tweets):
    """Classify a user's recent tweets and render the result as a table.

    Fetches up to ``number_of_tweets`` tweets from the timeline of the
    account named ``search_words`` (via the module-level ``api``), cleans
    each one with ``preprocess_tweet``, runs them through the module-level
    ``tokenizer`` / ``model`` in batches and shows the first 20 rows with
    ``st.table``.

    Args:
        search_words: Screen name of the account whose timeline is read.
        number_of_tweets: Maximum number of tweets to request.

    Returns:
        A DataFrame with the columns ``Tweets`` (cleaned text), ``Sexista``
        (``'Sexista'`` / ``'No Sexista'``) and ``Probabilidad`` (softmax
        probability of the predicted class). Empty when the timeline
        returned no tweets.
    """
    columns = ['Tweets', 'Sexista', 'Probabilidad']

    tweets = api.user_timeline(screen_name=search_words, count=number_of_tweets)
    texts = [preprocess_tweet(tweet.text) for tweet in tweets]

    # Nothing to classify: avoid feeding empty tensors to the model.
    if not texts:
        df = pd.DataFrame(columns=columns)
        st.table(df)
        return df

    # `padding='max_length'` replaces the deprecated `pad_to_max_length=True`.
    encoded = tokenizer.batch_encode_plus(
        texts,
        max_length=128,
        add_special_tokens=True,
        return_attention_mask=True,
        padding='max_length',
        truncation=True,
    )
    dataset = TensorDataset(
        torch.tensor(encoded["input_ids"]),
        torch.tensor(encoded["attention_mask"]),
    )
    # DataLoader iterates sequentially by default, so row order is preserved.
    loader = DataLoader(dataset, batch_size=25)

    # Put model in evaluation mode
    model.eval()
    batch_probs = []
    for batch in loader:
        b_input_ids, b_input_mask = (t.to(device) for t in batch)
        # No gradients are needed for inference; saves memory and time.
        with torch.no_grad():
            logits = model(b_input_ids, token_type_ids=None,
                           attention_mask=b_input_mask)[0]
        # Normalise logits so the reported value is an actual probability.
        batch_probs.append(torch.softmax(logits, dim=1).cpu().numpy())

    # Use the scores of *every* batch; taking only the last batch's scores
    # would truncate and misalign the table when there are more than 25 tweets.
    probs = np.concatenate(batch_probs, axis=0)
    labels = probs.argmax(axis=1)
    probability = probs.max(axis=1)

    df = pd.DataFrame(
        {'Tweets': texts, 'Sexista': labels, 'Probabilidad': probability},
        columns=columns,
    )
    df['Sexista'] = np.where(df['Sexista'] == 0, 'No Sexista', 'Sexista')
    # 'RT|@' is a regular expression; be explicit because pandas >= 2.0
    # treats the pattern as a literal string by default.
    df['Tweets'] = df['Tweets'].str.replace('RT|@', '', regex=True)

    # NOTE(review): `color_survived` is defined elsewhere in the file;
    # presumably it maps a cell value to a CSS string - confirm.
    st.table(df.reset_index(drop=True).head(20).style.applymap(color_survived, subset=['Sexista']))

    return df
145
+
146
def analizar_frase(frase):
    """Classify a single phrase and render the result as a one-row table.

    Encodes ``frase`` with the module-level ``tokenizer``, runs one forward
    pass of the module-level ``model`` and shows the outcome with
    ``st.table``.

    Args:
        frase: The text to classify.

    Returns:
        A one-row DataFrame with the columns ``Frase`` (the input text),
        ``Prediccion`` (``'Sexista'`` / ``'No Sexista'``) and
        ``Probabilidad`` (softmax probability of the predicted class).
    """
    # `padding='max_length'` replaces the deprecated `pad_to_max_length=True`.
    encoded = tokenizer.batch_encode_plus(
        [frase],
        max_length=128,
        add_special_tokens=True,
        return_attention_mask=True,
        padding='max_length',
        truncation=True,
    )
    input_ids = torch.tensor(encoded["input_ids"]).to(device)
    attention_mask = torch.tensor(encoded["attention_mask"]).to(device)

    model.eval()
    # A single forward pass is enough: label and score both come from the
    # same logits, computed without gradients and on the model's device.
    with torch.no_grad():
        logits = model(input_ids, token_type_ids=None,
                       attention_mask=attention_mask)[0]

    # Normalise logits so the reported value is an actual probability.
    probs = torch.softmax(logits, dim=1)[0]
    label = int(probs.argmax().item())
    probability = float(probs[label].item())

    text = pd.DataFrame({'Frase': [frase],
                         'Prediccion': [label],
                         'Probabilidad': [probability]})
    # Column name must match the one created above ('Prediccion').
    text['Prediccion'] = np.where(text['Prediccion'] == 0, 'No Sexista', 'Sexista')

    # NOTE(review): `color_survived` is defined elsewhere in the file;
    # presumably it maps a cell value to a CSS string - confirm.
    st.table(text.style.applymap(color_survived, subset=['Prediccion']))

    return text
200
+
201
 
202
  def run():
203
  with st.form("my_form"):
 
220
 
221
  if (error == False):
222
  if (termino):
223
+ analizar_frase(search_words)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
224
 
225
  elif (usuario):
226
+ analizar_tweets(search_words,number_of_tweets)
227
+
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
228
 
229
  run()