Update app.py
Browse files
app.py
CHANGED
@@ -3,10 +3,40 @@ import transformers
|
|
3 |
from transformers import pipeline
|
4 |
from transformers import AutoTokenizer, AutoModelForMaskedLM
|
5 |
import pandas as pd
|
6 |
-
import numpy as np
|
7 |
import string
|
8 |
|
9 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
10 |
tokenizer = AutoTokenizer.from_pretrained("moussaKam/AraBART", max_length=128, padding=True, pad_to_max_length = True, truncation=True)
|
11 |
model = AutoModelForMaskedLM.from_pretrained("Hamda/test-1-finetuned-AraBART")
|
12 |
|
@@ -22,13 +52,6 @@ def next_word(text, pipe):
|
|
22 |
res_dict['Score'].append(e['score'])
|
23 |
return res_dict
|
24 |
|
25 |
-
st.title("المساعدة اللغوية في التنبؤ بالمتلازمات والمتصاحبات والتعبيرات الاصطلاحية وتصحيحها")
|
26 |
-
default_value = "بيعت الأسلحة في السوق"
|
27 |
-
# sent is the variable holding the user's input
|
28 |
-
sent = st.text_area("مدخل", default_value, height=20)
|
29 |
-
|
30 |
-
st.checkbox('استعمال الرسم البياني', value=False)
|
31 |
-
|
32 |
text_st = sent+ ' <mask>'
|
33 |
|
34 |
pipe = pipeline("fill-mask", tokenizer=tokenizer, model=model, top_k=10)
|
@@ -36,7 +59,25 @@ dict_next_words = next_word(text_st, pipe)
|
|
36 |
df = pd.DataFrame.from_dict(dict_next_words)
|
37 |
df.reset_index(drop=True, inplace=True)
|
38 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
39 |
st.dataframe(df)
|
40 |
-
|
41 |
|
42 |
#st.table(df)
|
|
|
3 |
from transformers import pipeline
|
4 |
from transformers import AutoTokenizer, AutoModelForMaskedLM
|
5 |
import pandas as pd
|
|
|
6 |
import string
|
7 |
|
8 |
|
9 |
+
st.title("المساعدة اللغوية في التنبؤ بالمتلازمات والمتصاحبات وتصحيحها")
|
10 |
+
default_value = "بيعت الأسلحة في السوق"
|
11 |
+
# sent is the variable holding the user's input
|
12 |
+
sent = st.text_area("مدخل", default_value, height=20)
|
13 |
+
|
14 |
+
st.checkbox('استعمال الرسم البياني', value=False)
|
15 |
+
|
16 |
+
# Accumulator mapping a candidate term to its summed score; filled further
# down once the query ids are known.
tmt = {}

# Data files live next to the app. Forward slashes resolve on both Windows
# and POSIX hosts; a backslash is an ordinary file-name character on POSIX,
# so the former r'.\voc.csv' form only worked on Windows.
VocMap = './voc.csv'
ibra_gr = './BM25.csv'

# Vocabulary table (tab-separated).
df3 = pd.read_csv(VocMap, delimiter='\t')

# The pair-score table is parsed once and indexed two ways:
#   df_g  -> (ID1, ID2) MultiIndex, used to look up the score of one pair
#   df_in -> ID1 index, used to list every ID2 paired with a given ID1
_bm25_pairs = pd.read_csv(ibra_gr, delimiter='\t')
df_g = _bm25_pairs.set_index(['ID1', 'ID2'])
df_in = _bm25_pairs.set_index(['ID1'])
|
26 |
+
|
27 |
+
def Query2id(voc, query):
    """Map each whitespace-separated word of *query* to its label in *voc*.

    Args:
        voc: DataFrame with a ``'word'`` column; the row's index label is
            used as the word's id.
        query: Free text; it is split on whitespace.

    Returns:
        A list with the index label of the first matching row for every
        word found in ``voc``, in query order. Words that do not occur in
        ``voc`` are skipped instead of raising ``IndexError``, so arbitrary
        user input cannot crash the lookup.
    """
    ids = []
    for word in query.split():
        matches = voc.index[voc['word'] == word].values
        if len(matches) == 0:
            # Out-of-vocabulary word: nothing to map it to.
            continue
        ids.append(matches[0])
    return ids
|
29 |
+
|
30 |
+
id_list = Query2id(df3, sent)
|
31 |
+
|
32 |
+
def setQueriesVoc(df, id_list):
    """Collect the distinct ``ID2`` values paired with the given ids.

    Args:
        df: DataFrame indexed by the first id of each pair and holding an
            ``'ID2'`` column.
        id_list: Iterable of index labels to look up in ``df``.

    Returns:
        A list of the unique ``ID2`` values found for those labels, in no
        particular order. Labels absent from ``df`` contribute nothing.
    """
    candidates = set()
    for term_id in id_list:
        if term_id not in df.index:
            continue
        # Selecting with a list of labels always yields a Series, even when
        # the label matches a single row; plain df.loc[label]['ID2'] would
        # collapse to a scalar in that case and break the .values access.
        candidates.update(df.loc[[term_id], 'ID2'].values)
    return list(candidates)
|
37 |
+
|
38 |
+
L = setQueriesVoc(df_in, id_list)
|
39 |
+
|
40 |
tokenizer = AutoTokenizer.from_pretrained("moussaKam/AraBART", max_length=128, padding=True, pad_to_max_length = True, truncation=True)
|
41 |
model = AutoModelForMaskedLM.from_pretrained("Hamda/test-1-finetuned-AraBART")
|
42 |
|
|
|
52 |
res_dict['Score'].append(e['score'])
|
53 |
return res_dict
|
54 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
55 |
text_st = sent+ ' <mask>'
|
56 |
|
57 |
pipe = pipeline("fill-mask", tokenizer=tokenizer, model=model, top_k=10)
|
|
|
59 |
df = pd.DataFrame.from_dict(dict_next_words)
|
60 |
df.reset_index(drop=True, inplace=True)
|
61 |
|
62 |
+
# For every candidate id, add up its pair score against each query id.
for nc in L:
    score = 0.0
    for ni in id_list:
        try:
            score = score + df_g.loc[(ni, nc), 'score']
        except KeyError:
            # No (query id, candidate id) entry in the pair table.
            continue
    # The first cell of the candidate's vocabulary row is the display key.
    key = df3.loc[nc].values[0]
    tmt[key] = score

# Highest-scoring candidates first; keep at most ten, formatted "term | score".
tmexp = sorted(tmt.items(), key=lambda x: x[1], reverse=True)
exp_terms = [str(key) + ' | ' + str(value) for key, value in tmexp[:10]]
|
80 |
st.dataframe(df)
|
81 |
+
st.write(exp_terms)
|
82 |
|
83 |
#st.table(df)
|