"""Streamlit app suggesting Arabic collocations for a user-supplied phrase.

Two independent search modes are offered, each behind its own button:

* a masked-language-model search that asks a fill-mask pipeline for the most
  likely word following the input;
* a graph search that sums pre-computed pair scores read from ``BM25.csv``
  and shows the best-scoring vocabulary terms.
"""
import streamlit as st
import transformers
from transformers import pipeline
from transformers import AutoTokenizer, AutoModelForMaskedLM
import pandas as pd
import string
from time import time

# Column headers of both result tables ("suggested word" / "score").
WORD_COL = 'الكلمة المقترحة'
SCORE_COL = 'العلامة'
# Number of suggestions requested from the model and shown for the graph.
TOP_K = 10
# Tab-separated data files used by the graph search.
VocMap = './voc.csv'
ScoreMap = './BM25.csv'
# Built once so the per-character membership test is O(1).
_PUNCTUATION = frozenset(string.punctuation)


def next_word(text, pipe):
    """Collect the fill-mask predictions for *text* as table columns.

    Args:
        text: Input sentence; it must contain the tokenizer's mask token.
        pipe: A callable returning an iterable of dicts with ``token_str``
            and ``score`` entries (a fill-mask pipeline in this script).

    Returns:
        A dict with the two column lists (word, score). Predictions whose
        token contains any ASCII punctuation character are dropped.
    """
    res_dict = {WORD_COL: [], SCORE_COL: []}
    for prediction in pipe(text):
        token = prediction['token_str']
        if any(char in _PUNCTUATION for char in token):
            continue
        res_dict[WORD_COL].append(token)
        res_dict[SCORE_COL].append(prediction['score'])
    return res_dict


def reading_df(path1, path2):
    """Load the vocabulary and the pair-score table.

    Args:
        path1: Path of the tab-separated vocabulary file.
        path2: Path of the tab-separated score file; it must provide the
            ``ID1`` and ``ID2`` columns.

    Returns:
        A tuple ``(df_voc, df_graph, df_gr)``: the vocabulary frame, the
        score table indexed by ``(ID1, ID2)`` and the same table indexed by
        ``ID1`` only.
    """
    df_voc = pd.read_csv(path1, delimiter='\t')
    # The score file is parsed once and indexed in two different ways.
    pairs = pd.read_csv(path2, delimiter='\t')
    df_graph = pairs.set_index(['ID1', 'ID2'])
    df_gr = pairs.set_index(['ID1'])
    return df_voc, df_graph, df_gr


def Query2id(voc, query):
    """Map each whitespace-separated word of *query* to its vocabulary id.

    Args:
        voc: Vocabulary frame with a ``word`` column; the row label is the id.
        query: Raw user input.

    Returns:
        The ids of the words found in *voc*, in input order. Words missing
        from the vocabulary are skipped instead of raising ``IndexError``.
    """
    ids = []
    for word in query.split():
        matches = voc.index[voc['word'] == word].values
        if len(matches):
            ids.append(matches[0])
    return ids


def setQueriesVoc(df, id_list):
    """Return the distinct ``ID2`` values paired with any id of *id_list*.

    Args:
        df: Score table indexed by ``ID1`` with an ``ID2`` column.
        id_list: Ids of the query words.

    Returns:
        A list of unique candidate ids (order is not significant).
    """
    candidates = set()
    for term_id in id_list:
        if term_id not in df.index:
            continue
        # A list label always yields a Series, even for a single matching
        # row, so ``.values`` is safe here.
        candidates.update(df.loc[[term_id], 'ID2'].values)
    return list(candidates)


# NOTE(review): ``st.cache`` is deprecated in recent Streamlit releases; it
# is kept so caching behaves as before on the version this app targets.
@st.cache
def compute_score(L_terms, id_l, df_g, df3):
    """Sum, for every candidate term, its pair scores with the query ids.

    Args:
        L_terms: Candidate term ids.
        id_l: Ids of the query words.
        df_g: Score table indexed by ``(ID1, ID2)`` with a ``score`` column.
        df3: Vocabulary frame; the first value of row ``nc`` is used as the
            display key (presumably the word text - confirm against
            ``voc.csv``).

    Returns:
        A dict mapping each display key to its accumulated score. Pairs
        missing from *df_g* contribute nothing.
    """
    tmt = {}
    for nc in L_terms:
        score = 0.0
        for ni in id_l:
            try:
                score = score + df_g.loc[(ni, nc), 'score']
            except KeyError:
                continue
        key = df3.loc[nc].values[0]
        tmt[key] = score
    return tmt


def _top_suggestions(scores, limit=TOP_K):
    """Rank *scores* and return the best *limit* entries as table columns.

    Scores are min-max normalised and shifted down by 0.0001, then truncated
    to six characters for display. When every score is identical the spread
    is zero, so all entries are treated as tied at the top (1.0) rather than
    dividing by zero.
    """
    table = {WORD_COL: [], SCORE_COL: []}
    if not scores:
        return table
    lowest = min(scores.values())
    spread = max(scores.values()) - lowest
    ranked = sorted(scores.items(), key=lambda item: item[1], reverse=True)
    for term, value in ranked[:limit]:
        normalised = (value - lowest) / spread if spread else 1.0
        new_score = normalised - 0.0001
        table[SCORE_COL].append(str(new_score)[:6])
        table[WORD_COL].append(term)
    return table


st.title("المساعدة اللغوية في التنبؤ بالمتلازمات والمتصاحبات وتصحيحها")
default_value = "بيعت الأسلحة في السوق"

# sent is the variable holding the user's input
sent = st.text_area('مدخل', default_value)

tokenizer = AutoTokenizer.from_pretrained("moussaKam/AraBART", max_length=128, padding=True, pad_to_max_length = True, truncation=True)
model = AutoModelForMaskedLM.from_pretrained("Hamda/test-1-finetuned-AraBART")

# Masked-language-model search.
if st.button('بحث', disabled=False):
    # The fill-mask pipeline needs the mask token in its input; appending it
    # asks the model for the word that follows the user's text.
    text_st = f'{sent} {tokenizer.mask_token}'
    pipe = pipeline("fill-mask", tokenizer=tokenizer, model=model, top_k=TOP_K)
    dict_next_words = next_word(text_st, pipe)
    df = pd.DataFrame.from_dict(dict_next_words)
    df.reset_index(drop=True, inplace=True)
    st.dataframe(df)

# Graph search. Kept as a sibling of the first button: a button nested in
# another button's branch can never fire, because each click reruns the
# script with the other button reset.
if st.button('استعمال الرسم البياني', disabled=False):
    df3, df_g, df_in = reading_df(VocMap, ScoreMap)

    # Timing starts after the files are loaded, as in the original flow.
    a = time()
    n_words = len(sent.split())
    id_list = Query2id(df3, sent)
    if len(id_list) < n_words:
        # "Some input words are not in the vocabulary."
        st.warning('بعض كلمات المدخل غير موجودة في المعجم')

    L = setQueriesVoc(df_in, id_list)
    tmt = compute_score(L, id_list, df_g, df3)
    dict_res = _top_suggestions(tmt)

    res_df = pd.DataFrame.from_dict(dict_res)
    res_df.index += 1  # show ranks starting at 1
    st.dataframe(res_df)

    b = time()
    exec_time = (b-a)
    st.write(f'{n_words} :عدد الكلمات في المدخل')
    st.write(f'{exec_time} :الوقت المستغرق باستعمال الرسم البياني')