Spaces:

ikozlov90
/

ysda-hw12-StyleTranser

Runtime error

App Files Files Community

ikozlov90 committed on Dec 16, 2021

Commit

133312a

1 Parent(s): e13f1f0

Update add.py

Browse files

Files changed (1) hide show

add.py +82 -3

add.py CHANGED Viewed

@@ -1,12 +1,91 @@
 import streamlit as st
 import torch
 st.markdown("Hello!")
-model, tokenizer = torch.load("address", map_location='cpu')
 user_input = st.text_input("Please enter your thoughts:")
 if len(user_input.split()) > 0:
-    print(output)
-    st.markdown(f"{repr(model)}")

 import streamlit as st
+import numpy as np
+import pandas as pd
 import torch
 st.markdown("Hello!")
+bert_mlm_positive  = torch.load("bert_mlm_positive.pth", map_location='cpu')
+bert_mlm_negative  = torch.load("bert_mlm_negative.pth", map_location='cpu')
+bert_classifier  = torch.load("bert_classifier.pth", map_location='cpu')
+tokenizer  = torch.load("tokenizer.pth", map_location='cpu')
+bert_mlm_positive.eval();
+bert_mlm_negative.eval();
+bert_classifier.eval();
 user_input = st.text_input("Please enter your thoughts:")
+def get_replacements(sentence: str, num_tokens, k_best, epsilon=1e-3):
+  """
+  - split the sentence into tokens using the INGSOC-approved BERT tokenizer
+  - find :num_tokens: tokens with the highest ratio (see above)
+  - replace them with :k_best: words according to bert_mlm_positive
+  :return: a list of all possible strings (up to k_best * num_tokens)
+  """
+  sentence_ix = tokenizer(sentence, return_tensors="pt")
+  length = len(sentence_ix['input_ids'][0])
+  # we can't replace more tokens than we have
+  num_tokens = min(num_tokens, length-2)
+  probs_positive = bert_mlm_positive(**sentence_ix).logits.softmax(dim=-1)[0]
+  probs_negative = bert_mlm_negative(**sentence_ix).logits.softmax(dim=-1)[0]
+  # ^-- shape is [seq_length, vocab_size]
+  # Находим вероятности токенов для моделей
+  p_tokens_positive = probs_positive[torch.arange(length), sentence_ix['input_ids'][0]]
+  p_tokens_negative = probs_negative[torch.arange(length), sentence_ix['input_ids'][0]]
+  ratio = (p_tokens_positive + epsilon) / (p_tokens_negative + epsilon)
+  ratio = ratio[1:-1].detach().numpy()  # do not change  [CLS] and [SEP]
+  # ratio len is length - 2
+  replacements = []
+  # take indices of num_tokens of tokens with highest ratio
+  ind = np.argpartition(-ratio, -num_tokens)[-num_tokens:]
+  # for each token find k_best replacements
+  for i in ind:
+    # take probabilities of tokens for replacement
+    # note that we need ind + 1, since [CLS] is 0th token
+    tokens_probs = probs_positive[ind + 1, :][0].detach().numpy()
+    prob_ind_top_k = np.argpartition(tokens_probs, -k_best)[-k_best:]
+    for new_token in prob_ind_top_k:
+      new_tokens = tokenizer.encode(sentence)
+      new_tokens[i+1] = new_token
+      replacements.append(tokenizer.convert_tokens_to_string(tokenizer.convert_ids_to_tokens(new_tokens)[1:-1]))
+  return replacements
+def get_sent_score(sentence):
+    sentence_ix = tokenizer(sentence, return_tensors="pt")
+    # negative is class 1
+    return bert_classifier(**sentence_ix).logits[0][1].detach().numpy()
 if len(user_input.split()) > 0:
+    st.markdown(f"Original sentence negativity: {get_sent_score(user_input)}")
+    num_iter = 5
+    M = 3
+    num_tokens = 3
+    k_best = 3
+    fix_list =[user_input]
+    for j in range(num_iter):
+        replacements = []
+    for cur_sent in fix_list:
+        replacements.extend(get_replacements(cur_sent, num_tokens=num_tokens, k_best=k_best))
+    replacements = pd.DataFrame(replacements, columns = ['new_sentence'])
+    replacements['new_scores'] = replacements['new_sentence'].apply(get_sent_score)
+    replacements = replacements.nsmallest(M, 'new_scores')
+    fix_list = replacements.new_sentence.to_list()
+    for new_sentence in fix_list:
+        st.markdown(f"New sentence:")
+        st.markdown(f"{new_sentence}")
+        st.markdown(f"New sentence negativity: {get_sent_score(new_sentence)}")