File size: 2,592 Bytes
ea90e06 e683309 ea90e06 a09216c 407249a a09216c 5d2b0fe 12fc412 ea4abbd b768565 12fc412 538d7ca 5d2b0fe ea90e06 d8f9f62 ea90e06 4bb560b ccc474b 4bb560b 407249a ea90e06 30ba48b ea90e06 5d2b0fe e7bd68e a09216c dee9089 c871b03 12fc412 be34b4b 12fc412 be34b4b 12fc412 be34b4b 12fc412 be34b4b 12fc412 be34b4b 7078b67 be34b4b cd7dcf3 |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 |
import streamlit as st
import transformers
from transformers import pipeline
from transformers import AutoTokenizer, AutoModelForMaskedLM
import pandas as pd
import string
# Page header and input widgets.
st.title("المساعدة اللغوية في التنبؤ بالمتلازمات والمتصاحبات وتصحيحها")
default_value = "بيعت الأسلحة في السوق"
# sent is the variable holding the user's input
# NOTE(review): some Streamlit releases may reject a text_area height this
# small -- confirm against the deployed Streamlit version.
sent = st.text_area("مدخل", default_value, height=20)
# The checkbox is rendered, but its return value is not captured, so toggling
# it has no effect on the code visible in this script.
st.checkbox('استعمال الرسم البياني', value=False)
# Maps a candidate term to its accumulated score; filled by the scoring loop
# later in the script.
tmt = {}
VocMap = './voc.csv'
ibra_gr = './BM25.csv'
# Both data files are tab-separated.
df3 = pd.read_csv(VocMap, delimiter='\t')
# Parse the score table once and derive both indexed views from it instead of
# reading the same file from disk twice.
bm25_rows = pd.read_csv(ibra_gr, delimiter='\t')
# Pair view: rows addressed by (ID1, ID2).
df_g = bm25_rows.set_index(['ID1', 'ID2'])
# Per-ID1 view: used to list every ID2 associated with a given ID1.
df_in = bm25_rows.set_index(['ID1'])
def Query2id(voc, query):
    """Map each whitespace-separated word of ``query`` to its id in ``voc``.

    Args:
        voc: DataFrame with a ``word`` column; its index labels serve as ids.
        query: Text to look up; it is split on whitespace.

    Returns:
        A list holding, for each word found in ``voc``, the index label of
        the first matching row, in query order. Words that do not occur in
        ``voc`` are skipped rather than raising ``IndexError``.
    """
    ids = []
    for word in query.split():
        matches = voc.index[voc['word'] == word].values
        if len(matches) == 0:
            # Out-of-vocabulary word: the query is free-form text, so an
            # unknown word must not abort the whole lookup.
            continue
        ids.append(matches[0])
    return ids
# Translate the words of the user's text into their ids in the vocabulary frame.
id_list = Query2id(df3, sent)
def setQueriesVoc(df, id_list):
    """Collect the distinct ``ID2`` values associated with the given ids.

    Args:
        df: DataFrame whose index holds the ids and which has an ``ID2``
            column.
        id_list: Index labels to look up in ``df``.

    Returns:
        A list of the unique ``ID2`` values found for the ids (order is
        unspecified). Ids that are absent from the index contribute nothing.
    """
    found = set()
    for e in id_list:
        try:
            # Selecting with a list of labels always yields a Series, even
            # when the id matches a single row; a scalar label would return
            # a bare value in that case, which has no ``.values``.
            rows = df.loc[[e], 'ID2']
        except KeyError:
            # Id has no entry in the table: nothing to add.
            continue
        found.update(rows.values)
    return list(found)
# Candidate ids: every ID2 that df_in associates with one of the query word ids.
L = setQueriesVoc(df_in, id_list)
# NOTE(review): max_length / padding / pad_to_max_length / truncation are given
# at load time here; they look like encode-time options -- confirm they have
# the intended effect when passed to from_pretrained.
tokenizer = AutoTokenizer.from_pretrained("moussaKam/AraBART", max_length=128, padding=True, pad_to_max_length = True, truncation=True)
# NOTE(review): this is a module-level statement, so the model is loaded each
# time the script body executes -- presumably worth caching; confirm.
model = AutoModelForMaskedLM.from_pretrained("Hamda/test-1-finetuned-AraBART")
#@st.cache
def next_word(text, pipe):
    """Run ``pipe`` on ``text`` and keep the punctuation-free predictions.

    Args:
        text: Input handed to ``pipe`` unchanged.
        pipe: Callable returning an iterable of mappings, each with a
            ``token_str`` and a ``score`` entry.

    Returns:
        A dict with two parallel lists: ``'Word'`` (the kept ``token_str``
        values) and ``'Score'`` (their ``score`` values), in the order
        produced by ``pipe``. A prediction is dropped when any character of
        its ``token_str`` is in ``string.punctuation``.
    """
    punctuation = set(string.punctuation)
    words = []
    scores = []
    for candidate in pipe(text):
        token = candidate['token_str']
        if not punctuation.isdisjoint(token):
            # At least one punctuation character: not a clean word.
            continue
        words.append(token)
        scores.append(candidate['score'])
    return {'Word': words, 'Score': scores}
# Fill-mask pipeline over the loaded model/tokenizer, returning 10 candidates.
pipe = pipeline("fill-mask", model=model, tokenizer=tokenizer, top_k=10)
# A mask token is appended so the model predicts the word following the input.
text_st = sent + ' <mask>'
dict_next_words = next_word(text_st, pipe)
# Tabulate the predictions (columns 'Word' and 'Score') with a fresh 0-based index.
df = pd.DataFrame.from_dict(dict_next_words).reset_index(drop=True)
# Score every candidate id by summing its pair scores with each query word id.
for nc in L:
    score = 0.0
    for ni in id_list:
        try:
            # Only the lookup can raise; keep the try body to that one line.
            pair_score = df_g.loc[(ni, nc), 'score']
        except KeyError:
            # No (query word, candidate) row in the table: contributes nothing.
            continue
        score = score + pair_score
    # First column of the vocabulary row for this id -- presumably the word
    # itself; verify against the layout of voc.csv.
    key = df3.loc[nc].values[0]
    tmt[key] = score
# Rank the candidates by accumulated score, highest first.
t_li = tmt.values()
tmexp = sorted(tmt.items(), key=lambda x: x[1], reverse=True)
dict_res = {'word': [], 'score': []}
if tmexp:
    # The extremes are the same for every row, so compute them once.
    lowest = min(t_li)
    spread = max(t_li) - lowest
    # Only the 10 best candidates are shown.
    for key, value in tmexp[:10]:
        if spread != 0:
            normalised = (value - lowest) / spread
        else:
            # All candidates share one score (e.g. a single candidate):
            # there is no range to scale by, so rank them all at the top
            # instead of dividing by zero.
            normalised = 1.0
        # Small constant offset kept from the original formula -- presumably
        # to keep the best score strictly below 1; confirm the intent.
        new_score = normalised - 0.0001
        # Display value: the first six characters of the number's string form.
        dict_res['score'].append(str(new_score)[:6])
        dict_res['word'].append(key)
res_df = pd.DataFrame.from_dict(dict_res)
# Show ranks starting at 1 rather than 0.
res_df.index += 1
# Model predictions first, then the score-table ranking.
st.dataframe(df)
st.dataframe(res_df)
# Alternative static rendering of the predictions: st.table(df)