"""Streamlit demo: iterative sentence "detoxification".

Tokens that a negativity-tuned BERT MLM likes much more than a
positivity-tuned one are swapped for the positive model's top candidates;
a BERT classifier scores each candidate sentence and a small beam of the
least-negative rewrites is kept across several iterations.
"""

import numpy as np
import pandas as pd
import streamlit as st
import torch

st.markdown("We demand your opinion!")

# NOTE(review): torch.load unpickles arbitrary code — only ever load
# trusted checkpoint files from a trusted location.
bert_mlm_positive = torch.load("bert_mlm_positive.pth", map_location='cpu')
bert_mlm_negative = torch.load("bert_mlm_negative.pth", map_location='cpu')
bert_classifier = torch.load("bert_classifier.pth", map_location='cpu')
tokenizer = torch.load("tokenizer.pth", map_location='cpu')

bert_mlm_positive.eval()
bert_mlm_negative.eval()
bert_classifier.eval()

user_input = st.text_input("Please enter your thoughts:")


def get_replacements(sentence: str, num_tokens: int, k_best: int,
                     epsilon: float = 1e-3):
    """Generate candidate rewrites of *sentence*.

    - tokenize the sentence with the INGSOC-approved BERT tokenizer;
    - score every token with ratio = (p_positive + eps) / (p_negative + eps)
      and pick the :num_tokens: tokens with the LOWEST ratio — i.e. the
      tokens the negative LM likes far more than the positive LM;
    - replace each selected token with its :k_best: most probable
      substitutes according to ``bert_mlm_positive``.

    :return: a list of candidate sentences (up to k_best * num_tokens).
    """
    sentence_ix = tokenizer(sentence, return_tensors="pt")
    length = len(sentence_ix['input_ids'][0])
    # We can't replace more tokens than the sentence has ([CLS]/[SEP] excluded).
    num_tokens = min(num_tokens, length - 2)
    if num_tokens <= 0:
        # Nothing replaceable (empty/degenerate input) — previously this
        # crashed inside np.argpartition.
        return []

    # Pure inference: no autograd bookkeeping needed.
    with torch.no_grad():
        probs_positive = bert_mlm_positive(**sentence_ix).logits.softmax(dim=-1)[0]
        probs_negative = bert_mlm_negative(**sentence_ix).logits.softmax(dim=-1)[0]
    # ^-- shape is [seq_length, vocab_size]

    # Probability each model assigns to the token actually present.
    token_ids = sentence_ix['input_ids'][0]
    p_tokens_positive = probs_positive[torch.arange(length), token_ids]
    p_tokens_negative = probs_negative[torch.arange(length), token_ids]

    ratio = (p_tokens_positive + epsilon) / (p_tokens_negative + epsilon)
    # Drop [CLS] and [SEP]: ratio index i corresponds to token position i + 1.
    ratio = ratio[1:-1].detach().numpy()

    # Indices (in ratio space) of the num_tokens tokens with the LOWEST
    # positive/negative ratio — the most "negative-flavoured" tokens.
    ind = np.argpartition(-ratio, -num_tokens)[-num_tokens:]

    replacements = []
    for i in ind:
        # BUG FIX: the original read probs_positive[ind + 1, :][0], which is
        # always the distribution at the FIRST selected position; we need the
        # distribution at the CURRENT position, i.e. token index i + 1.
        tokens_probs = probs_positive[i + 1].detach().numpy()
        # k_best most probable substitute token ids for this position.
        prob_ind_top_k = np.argpartition(tokens_probs, -k_best)[-k_best:]
        for new_token in prob_ind_top_k:
            new_tokens = tokenizer.encode(sentence)
            new_tokens[i + 1] = new_token
            replacements.append(
                tokenizer.convert_tokens_to_string(
                    tokenizer.convert_ids_to_tokens(new_tokens)[1:-1]))
    return replacements


def get_sent_score(sentence):
    """Return the classifier's negativity score (logit of class 1)."""
    sentence_ix = tokenizer(sentence, return_tensors="pt")
    # negative is class 1
    return bert_classifier(**sentence_ix).logits[0][1].detach().numpy()


if len(user_input.split()) > 0:
    st.markdown(f"Original sentence negativity: {get_sent_score(user_input)}")

    num_iter = 5     # rewrite rounds
    M = 3            # beam width: least-negative candidates kept per round
    num_tokens = 3   # tokens replaced per sentence per round
    k_best = 3       # substitute candidates per replaced token

    fix_list = [user_input]
    for j in range(num_iter):
        replacements = []
        for cur_sent in fix_list:
            replacements.extend(
                get_replacements(cur_sent, num_tokens=num_tokens, k_best=k_best))
        replacements = pd.DataFrame(replacements, columns=['new_sentence'])
        replacements.drop_duplicates(inplace=True)
        replacements['new_scores'] = replacements['new_sentence'].apply(get_sent_score)
        # Keep the M least-negative candidates as the next beam.
        replacements = replacements.nsmallest(M, 'new_scores')
        fix_list = replacements.new_sentence.to_list()

    for new_sentence in fix_list:
        st.caption("New sentence:")
        st.markdown(f"{new_sentence}")
        st.markdown(f"New sentence negativity: {get_sent_score(new_sentence)}")