Hamda committed on
Commit
a8e534e
·
1 Parent(s): be34b4b

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +69 -70
app.py CHANGED
@@ -11,81 +11,80 @@ default_value = "بيعت الأسلحة في السوق"
11
  # sent is the variable holding the user's input
12
  sent = st.text_area("مدخل", default_value, height=20)
13
 
14
- st.checkbox('استعمال الرسم البياني', value=False)
15
-
16
- tmt = {}
17
- VocMap = './voc.csv'
18
- ibra_gr = './BM25.csv'
19
-
20
- df3 = pd.read_csv(VocMap, delimiter='\t')
21
- df_g = pd.read_csv(ibra_gr, delimiter='\t')
22
- df_g.set_index(['ID1','ID2'], inplace=True)
23
-
24
- df_in = pd.read_csv(ibra_gr, delimiter='\t')
25
- df_in.set_index(['ID1'], inplace=True)
26
-
27
- def Query2id(voc, query):
28
- return [voc.index[voc['word'] == word].values[0] for word in query.split()]
29
-
30
- id_list = Query2id(df3, sent)
31
-
32
- def setQueriesVoc(df, id_list):
33
- res = []
34
- for e in id_list:
35
- res.extend(list(df.loc[e]['ID2'].values))
36
- return list(set(res))
37
-
38
- L = setQueriesVoc(df_in, id_list)
39
-
40
  tokenizer = AutoTokenizer.from_pretrained("moussaKam/AraBART", max_length=128, padding=True, pad_to_max_length = True, truncation=True)
41
  model = AutoModelForMaskedLM.from_pretrained("Hamda/test-1-finetuned-AraBART")
42
 
43
  #@st.cache
44
- def next_word(text, pipe):
45
- res_dict= {
46
- 'Word':[],
47
- 'Score':[],
48
- }
49
- for e in pipe(text):
50
- if all(c not in list(string.punctuation) for c in e['token_str']):
51
- res_dict['Word'].append(e['token_str'])
52
- res_dict['Score'].append(e['score'])
53
- return res_dict
54
-
55
- text_st = sent+ ' <mask>'
56
-
57
- pipe = pipeline("fill-mask", tokenizer=tokenizer, model=model, top_k=10)
58
- dict_next_words = next_word(text_st, pipe)
59
- df = pd.DataFrame.from_dict(dict_next_words)
60
- df.reset_index(drop=True, inplace=True)
61
-
62
- for nc in L:
63
- score = 0.0
64
- temp = []
65
- for ni in id_list:
66
- try:
67
- score = score + df_g.loc[(ni, nc),'score']
68
- except KeyError:
69
- continue
70
- key = df3.loc[nc].values[0]
71
- tmt[key] = score
 
 
 
 
 
 
 
 
 
 
 
 
 
72
 
 
73
 
74
- exp_terms = []
75
- t_li = tmt.values()
76
- tmexp = sorted(tmt.items(), key=lambda x: x[1], reverse=True)
77
- i = 0
78
- dict_res = {'word':[], 'score':[]}
79
- for key, value in tmexp:
80
- new_score=((value-min(t_li))/(max(t_li)-min(t_li)))-0.0001
81
- dict_res['score'].append(str(new_score)[:6])
82
- dict_res['word'].append(key)
83
- i+=1
84
- if (i==10):
85
- break
86
- res_df = pd.DataFrame.from_dict(dict_res)
87
- res_df.index += 1
 
 
 
 
 
 
 
 
 
 
 
 
 
88
 
89
- st.dataframe(df)
90
- st.dataframe(res_df)
91
  #st.table(df)
 
11
  # sent is the variable holding the user's input
12
  sent = st.text_area("مدخل", default_value, height=20)
13
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
14
  tokenizer = AutoTokenizer.from_pretrained("moussaKam/AraBART", max_length=128, padding=True, pad_to_max_length = True, truncation=True)
15
  model = AutoModelForMaskedLM.from_pretrained("Hamda/test-1-finetuned-AraBART")
16
 
17
  #@st.cache
18
+ if (st.button('بحث', disabled=False)):
19
+ def next_word(text, pipe):
20
+ res_dict= {
21
+ 'Word':[],
22
+ 'Score':[],
23
+ }
24
+ for e in pipe(text):
25
+ if all(c not in list(string.punctuation) for c in e['token_str']):
26
+ res_dict['Word'].append(e['token_str'])
27
+ res_dict['Score'].append(e['score'])
28
+ return res_dict
29
+
30
+ text_st = sent+ ' <mask>'
31
+ pipe = pipeline("fill-mask", tokenizer=tokenizer, model=model, top_k=10)
32
+ dict_next_words = next_word(text_st, pipe)
33
+ df = pd.DataFrame.from_dict(dict_next_words)
34
+ df.reset_index(drop=True, inplace=True)
35
+ st.dataframe(df)
36
+
37
+ if (st.button('استعمال الرسم البياني', disabled=False)):
38
+ tmt = {}
39
+ VocMap = './voc.csv'
40
+ ScoreMap = './BM25.csv'
41
+
42
+ df3 = pd.read_csv(VocMap, delimiter='\t')
43
+ df_g = pd.read_csv(ScoreMap, delimiter='\t')
44
+ df_g.set_index(['ID1','ID2'], inplace=True)
45
+
46
+ df_in = pd.read_csv(ScoreMap, delimiter='\t')
47
+ df_in.set_index(['ID1'], inplace=True)
48
+
49
+ def Query2id(voc, query):
50
+ return [voc.index[voc['word'] == word].values[0] for word in query.split()]
51
+
52
+ id_list = Query2id(df3, sent)
53
+
54
+ def setQueriesVoc(df, id_list):
55
+ res = []
56
+ for e in id_list:
57
+ res.extend(list(df.loc[e]['ID2'].values))
58
+ return list(set(res))
59
 
60
+ L = setQueriesVoc(df_in, id_list)
61
 
62
+ for nc in L:
63
+ score = 0.0
64
+ temp = []
65
+ for ni in id_list:
66
+ try:
67
+ score = score + df_g.loc[(ni, nc),'score']
68
+ except KeyError:
69
+ continue
70
+ key = df3.loc[nc].values[0]
71
+ tmt[key] = score
72
+
73
+
74
+ exp_terms = []
75
+ t_li = tmt.values()
76
+ tmexp = sorted(tmt.items(), key=lambda x: x[1], reverse=True)
77
+ i = 0
78
+ dict_res = {'word':[], 'score':[]}
79
+ for key, value in tmexp:
80
+ new_score=((value-min(t_li))/(max(t_li)-min(t_li)))-0.0001
81
+ dict_res['score'].append(str(new_score)[:6])
82
+ dict_res['word'].append(key)
83
+ i+=1
84
+ if (i==10):
85
+ break
86
+ res_df = pd.DataFrame.from_dict(dict_res)
87
+ res_df.index += 1
88
+ st.dataframe(res_df)
89
 
 
 
90
  #st.table(df)