Space status: Runtime error
Commit 0e521b1 (1 parent: bc04379): Update app.py

app.py CHANGED
@@ -58,6 +58,7 @@ def preprocess(text):
     text = " ".join(text.split())
     return text
 
+
 def highlight_survived(s):
     return ['background-color: red']*len(s) if (s.Sexista == 1) else ['background-color: green']*len(s)
 
@@ -85,6 +86,118 @@ with colT2:
     font-size:16px ; font-family: 'Times New Roman'; color: #3358ff;}
     </style> """, unsafe_allow_html=True)
 
+
+
+
+
+def analizar_tweets(search_words, number_of_tweets):
+    tweets = api.user_timeline(screen_name=search_words, count=number_of_tweets)
+    tweet_list = [i.text for i in tweets]
+    text = pd.DataFrame(tweet_list)
+    text[0] = text[0].apply(preprocess_tweet)
+    text1 = text[0].values
+    indices1 = tokenizer.batch_encode_plus(text1.tolist(), max_length=128, add_special_tokens=True, return_attention_mask=True, pad_to_max_length=True, truncation=True)
+    input_ids1 = indices1["input_ids"]
+    attention_masks1 = indices1["attention_mask"]
+    prediction_inputs1 = torch.tensor(input_ids1)
+    prediction_masks1 = torch.tensor(attention_masks1)
+    batch_size = 25
+    # Create the DataLoader.
+    prediction_data1 = TensorDataset(prediction_inputs1, prediction_masks1)
+    prediction_sampler1 = SequentialSampler(prediction_data1)
+    prediction_dataloader1 = DataLoader(prediction_data1, sampler=prediction_sampler1, batch_size=batch_size)
+    #print('Predicting labels for {:,} test sentences...'.format(len(prediction_inputs1)))
+    # Put model in evaluation mode
+    model.eval()
+    # Tracking variables
+    predictions = []
+    for batch in prediction_dataloader1:
+        batch = tuple(t.to(device) for t in batch)
+        # Unpack the inputs from our dataloader
+        b_input_ids1, b_input_mask1 = batch
+
+        # Telling the model not to compute or store gradients, saving memory and speeding up prediction
+        with torch.no_grad():
+            # Forward pass, calculate logit predictions
+            outputs1 = model(b_input_ids1, token_type_ids=None, attention_mask=b_input_mask1)
+        logits1 = outputs1[0]
+        # Move logits and labels to CPU
+        logits1 = logits1.detach().cpu().numpy()
+        # Store predictions and true labels
+        predictions.append(logits1)
+
+    #flat_predictions = [item for sublist in predictions for item in sublist]
+    flat_predictions = [item for sublist in predictions for item in sublist]
+
+    flat_predictions = np.argmax(flat_predictions, axis=1).flatten()
+
+    probability = np.amax(logits1, axis=1).flatten()
+    Tweets = ['Últimos ' + str(number_of_tweets) + ' Tweets' + ' de ' + search_words]
+    df = pd.DataFrame(list(zip(text1, flat_predictions, probability)), columns=['Tweets', 'Sexista', 'Probabilidad'])
+
+    df['Sexista'] = np.where(df['Sexista'] == 0, 'No Sexista', 'Sexista')
+    df['Tweets'] = df['Tweets'].str.replace('RT|@', '')
+    #df['Tweets'] = df['Tweets'].apply(lambda x: re.sub(r'[:;][-o^]?[)\]DpP3]|[(/\\]|[\U0001f600-\U0001f64f]|[\U0001f300-\U0001f5ff]|[\U0001f680-\U0001f6ff]|[\U0001f1e0-\U0001f1ff]', '', x))
+
+    st.table(df.reset_index(drop=True).head(20).style.applymap(color_survived, subset=['Sexista']))
+
+    return df
+
+def analizar_frase(frase):
+    #palabra = frase.split()
+    palabra = [frase]
+
+    indices1 = tokenizer.batch_encode_plus(palabra, max_length=128, add_special_tokens=True,
+                                           return_attention_mask=True,
+                                           pad_to_max_length=True,
+                                           truncation=True)
+    input_ids1 = indices1["input_ids"]
+    attention_masks1 = indices1["attention_mask"]
+    prediction_inputs1 = torch.tensor(input_ids1)
+    prediction_masks1 = torch.tensor(attention_masks1)
+    batch_size = 25
+    prediction_data1 = TensorDataset(prediction_inputs1, prediction_masks1)
+    prediction_sampler1 = SequentialSampler(prediction_data1)
+    prediction_dataloader1 = DataLoader(prediction_data1, sampler=prediction_sampler1, batch_size=batch_size)
+    model.eval()
+    predictions = []
+    # Predict
+    for batch in prediction_dataloader1:
+        batch = tuple(t.to(device) for t in batch)
+        # Unpack the inputs from our dataloader
+        b_input_ids1, b_input_mask1 = batch
+        # Telling the model not to compute or store gradients, saving memory and speeding up prediction
+        with torch.no_grad():
+            # Forward pass, calculate logit predictions
+            outputs1 = model(b_input_ids1, token_type_ids=None, attention_mask=b_input_mask1)
+        logits1 = outputs1[0]
+        # Move logits and labels to CPU
+        logits1 = logits1.detach().cpu().numpy()
+        # Store predictions and true labels
+        predictions.append(logits1)
+    flat_predictions = [item for sublist in predictions for item in sublist]
+    flat_predictions = np.argmax(flat_predictions, axis=1).flatten()
+    tokens = tokenizer.tokenize(frase)
+    # Convert the tokens to a format the model accepts
+    input_ids = tokenizer.convert_tokens_to_ids(tokens)
+    attention_masks = [1] * len(input_ids)
+
+    # Feed the tokens to the model
+    outputs = model(torch.tensor([input_ids]), token_type_ids=None, attention_mask=torch.tensor([attention_masks]))
+    scores = outputs[0]
+    #prediccion = scores.argmax(dim=1).item()
+    # Get the probability that the phrase is "sexist"
+    probabilidad_sexista = scores.amax(dim=1).item()
+    #print(probabilidad_sexista)
+
+    # Build a DataFrame
+    text = pd.DataFrame({'Frase': [frase], 'Prediccion': [flat_predictions], 'Probabilidad': [probabilidad_sexista]})
+    text['prediccion'] = np.where(text['prediccion'] == 0, 'No Sexista', 'Sexista')
+
+    st.table(df.reset_index(drop=True).head(20).style.applymap(color_survived, subset=['Sexista']))
+
+    return text
+
 
 def run():
     with st.form("my_form"):
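Review note on the new analizar_tweets: probability = np.amax(logits1, axis=1) is computed after the loop, so it only sees the final batch's logits (logits1 is overwritten on every iteration), and the max of raw logits is not a probability in any case. Below is a minimal sketch of one way to recover aligned per-tweet probabilities, assuming the per-batch logits are concatenated first; the helper name probabilidades is hypothetical and not part of this commit:

    import numpy as np

    def probabilidades(all_logits):
        # all_logits: shape [n_tweets, 2], e.g. np.concatenate(predictions, axis=0)
        e = np.exp(all_logits - all_logits.max(axis=1, keepdims=True))  # numerically stable softmax
        p = e / e.sum(axis=1, keepdims=True)
        return p.max(axis=1)  # probability of each row's predicted class

Zipping that result against flat_predictions would keep the Probabilidad column aligned with every tweet instead of only the last batch.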
@@ -107,65 +220,10 @@ def run():
 
     if (error == False):
         if (termino):
-
-            #tweets =tw.Cursor(api.search_tweets,q=new_search,lang="es").items(number_of_tweets)
-            # Tokenize the phrase
-            tokens = tokenizer.tokenize(search_words)
-            # Convert the tokens to a format the model accepts
-            input_ids = tokenizer.convert_tokens_to_ids(tokens)
-            attention_masks = [1] * len(input_ids)
-            # Feed the tokens to the model
-            outputs = model(torch.tensor([input_ids]), token_type_ids=None, attention_mask=torch.tensor([attention_masks]))
-
-            # Get the probability that the phrase is "sexist"
-            probabilidad_sexista = outputs[0][0][1].item()
-            print(probabilidad_sexista)
-            # Build a DataFrame
-            text = pd.DataFrame({'palabra': [search_words], 'probabilidad': [probabilidad_sexista]})
-            #print(text)
-            st.table(text)
+            analizar_frase(search_words)
 
         elif (usuario):
-
-
-            text = pd.DataFrame(tweet_list)
-            text[0] = text[0].apply(preprocess_tweet)
-            text1 = text[0].values
-            indices1 = tokenizer.batch_encode_plus(text1.tolist(), max_length=128, add_special_tokens=True, return_attention_mask=True, pad_to_max_length=True, truncation=True)
-            input_ids1 = indices1["input_ids"]
-            attention_masks1 = indices1["attention_mask"]
-            prediction_inputs1 = torch.tensor(input_ids1)
-            prediction_masks1 = torch.tensor(attention_masks1)
-            # Set the batch size.
-            batch_size = 25
-            # Create the DataLoader.
-            prediction_data1 = TensorDataset(prediction_inputs1, prediction_masks1)
-            prediction_sampler1 = SequentialSampler(prediction_data1)
-            prediction_dataloader1 = DataLoader(prediction_data1, sampler=prediction_sampler1, batch_size=batch_size)
-            print('Predicting labels for {:,} test sentences...'.format(len(prediction_inputs1)))
-            # Put the model in evaluation mode
-            model.eval()
-            # Tracking variables
-            predictions = []
-            # Predict
-            for batch in prediction_dataloader1:
-                batch = tuple(t.to(device) for t in batch)
-                # Unpack the inputs from our data loader
-                b_input_ids1, b_input_mask1 = batch
-                # Tell the model not to compute or store gradients, saving memory and speeding up prediction
-                with torch.no_grad():
-                    # Forward pass, calculate logit predictions
-                    outputs1 = model(b_input_ids1, token_type_ids=None, attention_mask=b_input_mask1)
-                logits1 = outputs1[0]
-                # Move logits and labels to CPU
-                logits1 = logits1.detach().cpu().numpy()
-                # Store predictions and true labels
-                predictions.append(logits1)
-            flat_predictions = [item for sublist in predictions for item in sublist]
-            flat_predictions = np.argmax(flat_predictions, axis=1).flatten()  #p = [i for i in classifier(tweet_list)]
-            df = pd.DataFrame(list(zip(tweet_list, flat_predictions)), columns=['Últimos ' + str(number_of_tweets) + ' Tweets' + ' de ' + search_words, 'Sexista'])
-            df['Sexista'] = np.where(df['Sexista'] == 0, 'No Sexistas', 'Sexistas')
-
-            st.table(df.reset_index(drop=True).head(20).style.applymap(color_survived, subset=['Sexista']))
+            analizar_tweets(search_words, number_of_tweets)
+
 
 run()
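Review note on the analizar_frase call wired in above: inside analizar_frase the DataFrame is created with a 'Prediccion' column but then indexed as text['prediccion'] (a KeyError), and its st.table call references df, which is undefined in that scope; either failure is consistent with the Space's Runtime error status. Below is a minimal sketch of a corrected single-phrase path, assuming the app's module-level model, tokenizer, device, color_survived and st are in scope; the function name clasificar_frase is hypothetical:

    import pandas as pd
    import torch

    def clasificar_frase(frase):
        # Encode one phrase the same way the batch path does
        enc = tokenizer(frase, max_length=128, truncation=True,
                        padding='max_length', return_tensors='pt')
        model.eval()
        with torch.no_grad():
            logits = model(enc['input_ids'].to(device),
                           attention_mask=enc['attention_mask'].to(device))[0]  # shape [1, 2]
        probs = torch.nn.functional.softmax(logits, dim=1)
        pred = int(probs.argmax(dim=1))
        out = pd.DataFrame({'Frase': [frase],
                            'Sexista': ['No Sexista' if pred == 0 else 'Sexista'],
                            'Probabilidad': [probs[0, pred].item()]})
        # Style the frame that was actually built, not an undefined df
        st.table(out.style.applymap(color_survived, subset=['Sexista']))
        return out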
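More broadly, analizar_tweets and analizar_frase repeat the same encode/TensorDataset/DataLoader/no-grad loop. A hedged refactor sketch of a shared batch-prediction helper both could call; the name predecir_lote is hypothetical, model/tokenizer/device come from the app, and padding='max_length' stands in for the deprecated pad_to_max_length=True used in the commit:

    import numpy as np
    import torch
    from torch.utils.data import DataLoader, SequentialSampler, TensorDataset

    def predecir_lote(textos, batch_size=25):
        # Encode a list of strings once, mirroring the committed settings
        enc = tokenizer.batch_encode_plus(textos, max_length=128, add_special_tokens=True,
                                          return_attention_mask=True, padding='max_length',
                                          truncation=True)
        data = TensorDataset(torch.tensor(enc["input_ids"]),
                             torch.tensor(enc["attention_mask"]))
        loader = DataLoader(data, sampler=SequentialSampler(data), batch_size=batch_size)
        model.eval()
        logits = []
        for input_ids, masks in loader:
            with torch.no_grad():
                out = model(input_ids.to(device), token_type_ids=None,
                            attention_mask=masks.to(device))
            logits.append(out[0].detach().cpu().numpy())
        logits = np.concatenate(logits, axis=0)
        return np.argmax(logits, axis=1), logits  # predicted labels (0/1) and raw logits

analizar_tweets would then reduce to fetching the timeline, calling predecir_lote(text1.tolist()), and building the display frame, while analizar_frase could call predecir_lote([frase]).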