import streamlit as st
from transformers import pipeline
from transformers import AutoTokenizer, AutoModelForMaskedLM
import pandas as pd
import string


# App title: "Linguistic assistant for predicting and correcting collocations and co-occurrences"
st.title("المساعدة اللغوية في التنبؤ بالمتلازمات والمتصاحبات وتصحيحها")
# Default input sentence: "The weapons were sold in the market"
default_value = "بيعت الأسلحة في السوق"
# sent holds the user's input sentence; the text_area label means "Input"
sent = st.text_area("مدخل", default_value, height=20)

# Checkbox label: "Use the chart"; the value is captured but not yet used below
use_graph = st.checkbox('استعمال الرسم البياني', value=False)

tmt = {}  # candidate word -> accumulated co-occurrence score
VocMap = './voc.csv'    # vocabulary file: word/ID mapping
ibra_gr = './BM25.csv'  # co-occurrence graph with BM25 scores (ID1, ID2, score)

df3 = pd.read_csv(VocMap, delimiter='\t')
df_g = pd.read_csv(ibra_gr, delimiter='\t')
df_g.set_index(['ID1', 'ID2'], inplace=True)

# Same graph, indexed by ID1 only, for neighbour lookups
df_in = pd.read_csv(ibra_gr, delimiter='\t')
df_in.set_index(['ID1'], inplace=True)

def Query2id(voc, query):
    # Map each word of the query to its vocabulary ID.
    # Note: assumes every word exists in the vocabulary; an out-of-vocabulary
    # word would make .values[0] raise an IndexError.
    return [voc.index[voc['word'] == word].values[0] for word in query.split()]
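
# Optional hardening sketch (not in the original app; Query2idSafe is a
# hypothetical helper): skip out-of-vocabulary words instead of letting
# .values[0] raise an IndexError. Not wired into the flow below.
def Query2idSafe(voc, query):
    ids = []
    for word in query.split():
        matches = voc.index[voc['word'] == word].values
        if len(matches) > 0:
            ids.append(matches[0])
    return ids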

id_list = Query2id(df3, sent)

def setQueriesVoc(df, id_list):
    # Collect the distinct IDs of all words that co-occur with the query words.
    res = []
    for e in id_list:
        res.extend(list(df.loc[e]['ID2'].values))
    return list(set(res))

L = setQueriesVoc(df_in, id_list)  # candidate co-occurring word IDs

# Padding/truncation keywords belong to tokenizer calls, not from_pretrained;
# the fill-mask pipeline below handles tokenization itself, so they are dropped.
tokenizer = AutoTokenizer.from_pretrained("moussaKam/AraBART")
model = AutoModelForMaskedLM.from_pretrained("Hamda/test-1-finetuned-AraBART")
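
# Caching sketch (an assumption, not part of the original app): with Streamlit's
# legacy st.cache API, wrapping the loads in a hypothetical load_model() would
# avoid reloading the weights on every rerun of the script.
# @st.cache(allow_output_mutation=True)
# def load_model():
#     tok = AutoTokenizer.from_pretrained("moussaKam/AraBART")
#     mdl = AutoModelForMaskedLM.from_pretrained("Hamda/test-1-finetuned-AraBART")
#     return tok, mdl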

#@st.cache
def next_word(text, pipe):
    # Run the fill-mask pipeline and keep predictions whose token contains
    # no punctuation characters.
    res_dict = {
        'Word': [],
        'Score': [],
    }
    for e in pipe(text):
        if all(c not in string.punctuation for c in e['token_str']):
            res_dict['Word'].append(e['token_str'])
            res_dict['Score'].append(e['score'])
    return res_dict

text_st = sent + ' <mask>'  # append the model's mask token to predict the next word

pipe = pipeline("fill-mask", tokenizer=tokenizer, model=model, top_k=10)
dict_next_words = next_word(text_st, pipe)
df = pd.DataFrame.from_dict(dict_next_words)
df.reset_index(drop=True, inplace=True)

# Score each candidate word nc by summing its graph scores against
# every word of the query; missing pairs simply contribute nothing.
for nc in L:
    score = 0.0
    for ni in id_list:
        try:
            score += df_g.loc[(ni, nc), 'score']
        except KeyError:
            continue
    key = df3.loc[nc].values[0]  # map the candidate ID back to its word
    tmt[key] = score
    
    
# Min-max normalise the accumulated scores and keep the top 10 candidates.
t_li = list(tmt.values())
mn, mx = min(t_li), max(t_li)
rng = (mx - mn) or 1.0  # guard against all scores being equal
tmexp = sorted(tmt.items(), key=lambda x: x[1], reverse=True)
dict_res = {'word': [], 'score': []}
for key, value in tmexp[:10]:
    new_score = ((value - mn) / rng) - 0.0001
    dict_res['score'].append(f"{new_score:.4f}")
    dict_res['word'].append(key)
res_df = pd.DataFrame.from_dict(dict_res)
res_df.index += 1

st.dataframe(df)      # raw fill-mask predictions from the model
st.dataframe(res_df)  # graph-based candidates with normalised scores
#st.table(df)
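
# Sketch of wiring the so-far-unused checkbox to a chart (an assumption about
# the intended feature; st.bar_chart is standard Streamlit, the wiring is not
# in the original code):
# if use_graph:
#     st.bar_chart(res_df.set_index('word')['score'].astype(float))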