mohbay committed on
Commit
3dafe6c
·
verified ·
1 Parent(s): 52d7dfb

Update app.py

Files changed (1)
  1. app.py +400 -239
app.py CHANGED
@@ -1,286 +1,447 @@
- import torch
- import pandas as pd
- from sentence_transformers import SentenceTransformer, util
- import gradio as gr
- import re
- from rank_bm25 import BM25Okapi
- import numpy as np

- # Load models
- model = SentenceTransformer("distilbert-base-multilingual-cased")
- modela = SentenceTransformer("paraphrase-multilingual-MiniLM-L12-v2")

- # Load data
- df = pd.read_csv("cleaned1.csv")
- df2 = pd.read_csv("cleaned2.csv")
- df3 = pd.read_csv("cleaned3.csv")

- # Load pre-computed embeddings
- embeddings = torch.load("embeddings1_1.pt")
- embeddings2 = torch.load("embeddings2_1.pt")
- embeddings3 = torch.load("embeddings3_1.pt")

- embeddingsa = torch.load("embeddings1.pt")
- embeddingsa2 = torch.load("embeddings2.pt")
- embeddingsa3 = torch.load("embeddings3.pt")

- # Extract questions and links
- df_questions = df["question"].values
- df_links = df["link"].values
- df2_questions = df2["question"].values
- df2_links = df2["link"].values
- df3_questions = df3["question"].values
- df3_links = df3["url"].values

- ARABIC_STOPWORDS = {
-     'في', 'من', 'إلى', 'عن', 'مع', 'هذا', 'هذه', 'ذلك', 'تلك',
-     'التي', 'الذي', 'ما', 'لا', 'أن', 'أو', 'لكن', 'قد', 'حكم', 'قال',
-     'كان', 'كانت', 'يكون', 'تكون', 'له', 'لها', 'لهم', 'و', 'أم', 'إن'
- }

- def arabic_word_tokenize(text):
-     if not isinstance(text, str):
-         return []
-     # Remove diacritics
-     text = re.sub(r'[\u064B-\u065F\u0670]', '', text)
-     # Extract only Arabic words (length ≥ 2)
-     tokens = re.findall(r'[\u0600-\u06FF]{2,}', text)
-     return [t for t in tokens if t not in ARABIC_STOPWORDS]
-
- def prepare_bm25_corpus(questions):
-     """Prepare tokenized corpus for BM25"""
-     tokenized_corpus = []
-     for question in questions:
-         tokens = arabic_word_tokenize(question)
-         tokenized_corpus.append(tokens)
-     return tokenized_corpus
-
- # Initialize BM25 models for each dataset
- print("Initializing BM25 models...")
- bm25_corpus1 = prepare_bm25_corpus(df_questions)
- bm25_corpus2 = prepare_bm25_corpus(df2_questions)
- bm25_corpus3 = prepare_bm25_corpus(df3_questions)
-
- bm25_model1 = BM25Okapi(bm25_corpus1)
- bm25_model2 = BM25Okapi(bm25_corpus2)
- bm25_model3 = BM25Okapi(bm25_corpus3)
- print("BM25 models initialized!")
-
- def compute_bm25_scores(query, bm25_model):
-     """Compute BM25 scores for a query"""
-     query_tokens = arabic_word_tokenize(query)
-     if not query_tokens:
-         return np.zeros(len(bm25_model.corpus))
-
-     scores = bm25_model.get_scores(query_tokens)
-     return scores
-
- def compute_word_overlap(query, questions):
-     """Enhanced word overlap computation"""
-     query_words = set(arabic_word_tokenize(query))
-     if len(query_words) == 0:
-         return [0.0] * len(questions)
-
-     overlaps = []
-     for q in questions:
-         q_words = set(arabic_word_tokenize(q))
-         if len(q_words) == 0:
-             overlaps.append(0.0)
-             continue
-
-         # Use Jaccard similarity (intersection over union)
-         intersection = len(query_words & q_words)
-         union = len(query_words | q_words)
-         jaccard = intersection / union if union > 0 else 0.0
-
-         # Also compute coverage (how much of query is matched)
-         coverage = intersection / len(query_words)
-
-         # Combine both: prioritize coverage but consider similarity
-         overlap_score = 0.7 * coverage + 0.3 * jaccard
-         overlaps.append(overlap_score)
-
-     return overlaps
-
- def normalize_scores(scores):
-     """Normalize scores to 0-1 range"""
-     scores = np.array(scores)
-     if np.max(scores) == np.min(scores):
-         return np.zeros_like(scores)
-     return (scores - np.min(scores)) / (np.max(scores) - np.min(scores))
-
- def predict(text):
-     print(f"Received query: {text}")
-     if not text or text.strip() == "":
-         return "No query provided"
-
-     # Semantic similarity scores
-     query_embedding = model.encode(text, convert_to_tensor=True)
-     query_embeddinga = modela.encode(text, convert_to_tensor=True)
-
-     # Cosine similarities (averaged from two models)
-     sim_scores1 = (util.pytorch_cos_sim(query_embedding, embeddings)[0] +
-                    util.pytorch_cos_sim(query_embeddinga, embeddingsa)[0]) / 2
-     sim_scores2 = (util.pytorch_cos_sim(query_embedding, embeddings2)[0] +
-                    util.pytorch_cos_sim(query_embeddinga, embeddingsa2)[0]) / 2
-     sim_scores3 = (util.pytorch_cos_sim(query_embedding, embeddings3)[0] +
-                    util.pytorch_cos_sim(query_embeddinga, embeddingsa3)[0]) / 2
-
-     # BM25 scores
-     bm25_scores1 = compute_bm25_scores(text, bm25_model1)
-     bm25_scores2 = compute_bm25_scores(text, bm25_model2)
-     bm25_scores3 = compute_bm25_scores(text, bm25_model3)
-
-     # Word overlap scores
-     word_overlap1 = compute_word_overlap(text, df_questions)
-     word_overlap2 = compute_word_overlap(text, df2_questions)
-     word_overlap3 = compute_word_overlap(text, df3_questions)
-
-     # Normalize all scores for fair combination
-     norm_sim1 = normalize_scores(sim_scores1.cpu().numpy())
-     norm_sim2 = normalize_scores(sim_scores2.cpu().numpy())
-     norm_sim3 = normalize_scores(sim_scores3.cpu().numpy())
-
-     norm_bm25_1 = normalize_scores(bm25_scores1)
-     norm_bm25_2 = normalize_scores(bm25_scores2)
-     norm_bm25_3 = normalize_scores(bm25_scores3)
-
-     norm_word1 = normalize_scores(word_overlap1)
-     norm_word2 = normalize_scores(word_overlap2)
-     norm_word3 = normalize_scores(word_overlap3)
-
-     # Adaptive weighting based on query characteristics
-     query_words = arabic_word_tokenize(text)
-     query_length = len(query_words)
-
-     if query_length <= 2:
-         # Short queries: prioritize exact matches (BM25 + word overlap)
-         semantic_weight = 0.3
-         bm25_weight = 0.4
-         word_weight = 0.3
-     elif query_length <= 5:
-         # Medium queries: balanced approach
-         semantic_weight = 0.4
-         bm25_weight = 0.35
-         word_weight = 0.25
-     else:
-         # Long queries: prioritize semantic understanding
-         semantic_weight = 0.5
-         bm25_weight = 0.3
-         word_weight = 0.2
-
-     def create_combined_results(questions, links, norm_semantic, norm_bm25, norm_word):
-         combined_results = []
-
-         for i in range(len(questions)):
-             semantic_score = float(norm_semantic[i])
-             bm25_score = float(norm_bm25[i])
-             word_score = float(norm_word[i])
-
-             # Enhanced scoring with BM25
-             combined_score = (semantic_weight * semantic_score +
-                               bm25_weight * bm25_score +
-                               word_weight * word_score)
-
-             # Boost results that perform well across multiple metrics
-             high_performance_count = sum([
-                 semantic_score > 0.7,
-                 bm25_score > 0.7,
-                 word_score > 0.5
-             ])
-
-             if high_performance_count >= 2:
-                 boost = 0.1
-             elif high_performance_count >= 1:
-                 boost = 0.05
-             else:
-                 boost = 0.0
-
-             final_score = combined_score + boost
-
-             combined_results.append({
-                 "question": questions[i],
-                 "link": links[i],
-                 "semantic_score": semantic_score,
-                 "bm25_score": bm25_score,
-                 "word_overlap_score": word_score,
-                 "combined_score": final_score
-             })
-
-         return combined_results
-
-     # Create combined results for all datasets
-     combined1 = create_combined_results(df_questions, df_links, norm_sim1, norm_bm25_1, norm_word1)
-     combined2 = create_combined_results(df2_questions, df2_links, norm_sim2, norm_bm25_2, norm_word2)
-     combined3 = create_combined_results(df3_questions, df3_links, norm_sim3, norm_bm25_3, norm_word3)
-
-     def get_diverse_top_results(combined_results, top_k=5):
-         """Get diverse top results using multiple ranking strategies"""
-         # Sort by combined score and get top candidates
-         by_combined = sorted(combined_results, key=lambda x: x["combined_score"], reverse=True)
-         top_combined = by_combined[:3]
-
-         # Get questions from top combined to avoid duplicates
-         used_questions = {item["question"] for item in top_combined}
-
-         # Add best BM25 result not already included
-         by_bm25 = sorted(combined_results, key=lambda x: x["bm25_score"], reverse=True)
-         bm25_pick = None
-         for item in by_bm25:
-             if item["question"] not in used_questions:
-                 bm25_pick = item
-                 break
-
-         # Add best semantic result not already included
-         by_semantic = sorted(combined_results, key=lambda x: x["semantic_score"], reverse=True)
-         semantic_pick = None
-         if bm25_pick:
-             used_questions.add(bm25_pick["question"])
-
-         for item in by_semantic:
-             if item["question"] not in used_questions:
-                 semantic_pick = item
-                 break
-
-         # Combine results
-         final_results = top_combined.copy()
-         if bm25_pick:
-             final_results.append(bm25_pick)
-         if semantic_pick:
-             final_results.append(semantic_pick)
-
-         return final_results[:top_k]
-
-     # Get top results for each dataset
-     top1 = get_diverse_top_results(combined1)
-     top2 = get_diverse_top_results(combined2)
-     top3 = get_diverse_top_results(combined3)
-
-     results = {
-         "top2": top2,
-         "top3": top3,
-         "top1": top1,
          "query_info": {
-             "query_length": query_length,
-             "weights": {
-                 "semantic": semantic_weight,
-                 "bm25": bm25_weight,
-                 "word_overlap": word_weight
-             }
          }
      }

-     return results
-
- title = "Enhanced Search with BM25"
  iface = gr.Interface(
      fn=predict,
      inputs=[gr.Textbox(label="Search Query", lines=3)],
-     outputs='json',
      title=title,
-     description="Arabic text search using combined semantic similarity, BM25, and word overlap scoring"
  )

  if __name__ == "__main__":
-     iface.launch()
 
+ # import torch
+ # ... (the rest of the previous implementation, commented out verbatim; it duplicates the removed lines shown above) ...
+ # iface.launch()
+
+ import torch
+ import pandas as pd
+ from sentence_transformers import SentenceTransformer, util
+ import gradio as gr
+ import re
+ import numpy as np
+ import math
+ from collections import Counter
+
+ # Load both models
+ model1 = SentenceTransformer("distilbert-base-multilingual-cased")
+ model2 = SentenceTransformer("paraphrase-multilingual-MiniLM-L12-v2")
+
+ # Load data
+ print("Loading data and embeddings...")
+ df = pd.read_csv("cleaned1.csv")
+ df2 = pd.read_csv("cleaned2.csv")
+ df3 = pd.read_csv("cleaned3.csv")
+
+ embeddings1 = torch.load("embeddings1_1.pt")
+ embeddings2 = torch.load("embeddings2_1.pt")
+ embeddings3 = torch.load("embeddings3_1.pt")
+
+ embeddings1a = torch.load("embeddings1.pt")
+ embeddings2a = torch.load("embeddings2.pt")
+ embeddings3a = torch.load("embeddings3.pt")
+
+ # Arabic stopwords
+ ARABIC_STOPWORDS = {
+     'في', 'من', 'إلى', 'عن', 'مع', 'هذا', 'هذه', 'ذلك', 'تلك',
+     'التي', 'الذي', 'ما', 'لا', 'أن', 'أو', 'لكن', 'قد', 'حكم', 'قال',
+     'كان', 'كانت', 'يكون', 'تكون', 'له', 'لها', 'لهم', 'و', 'أم', 'إن',
+     'رضي', 'عليها', 'عنهم', 'عنه', 'عليهم', 'صلى', 'وسلم',
+     'سلام', 'عليه', 'الرسول', 'النبي', 'السلام', 'حديث', 'احاديث'
+ }
+
+ def arabic_word_tokenize(text):
+     if not isinstance(text, str):
+         return []
+     text = re.sub(r'[\u064B-\u065F\u0670]', '', text)  # strip diacritics
+     return [t for t in re.findall(r'[\u0600-\u06FF]{2,}', text) if t not in ARABIC_STOPWORDS]
+
+ # Pre-tokenize questions and compute document frequencies
+ def setup_tokenization_and_freqs(questions):
+     tokenized = [arabic_word_tokenize(q) for q in questions]
+     doc_freqs = Counter(word for doc in tokenized for word in set(doc))
+     return tokenized, doc_freqs
+
+ tokenized1, doc_freqs1 = setup_tokenization_and_freqs(df["question"].values)
+ tokenized2, doc_freqs2 = setup_tokenization_and_freqs(df2["question"].values)
+ tokenized3, doc_freqs3 = setup_tokenization_and_freqs(df3["question"].values)
+
+ def compute_word_overlap(query, questions):
+     q_words = set(arabic_word_tokenize(query))
+     scores = []
+     for doc in questions:
+         d_words = set(arabic_word_tokenize(doc))
+         if not d_words or not q_words:
+             scores.append(0.0)
+             continue
+         inter = len(q_words & d_words)
+         union = len(q_words | d_words)
+         jaccard = inter / union if union else 0.0
+         coverage = inter / len(q_words)
+         scores.append(0.7 * coverage + 0.3 * jaccard)
+     return scores
+
+ def lightweight_bm25_score(query_tokens, doc_tokens, doc_freqs, total_docs, k1=1.2, b=0.75):
+     score = 0.0
+     doc_len = len(doc_tokens)
+     avg_doc_len = 10
+     for term in query_tokens:
+         if term in doc_tokens:
+             tf = doc_tokens.count(term)
+             df = doc_freqs.get(term, 0)
+             if df > 0:
+                 idf = math.log((total_docs - df + 0.5) / (df + 0.5))
+                 score += idf * (tf * (k1 + 1)) / (tf + k1 * (1 - b + b * (doc_len / avg_doc_len)))
+     return score
+
+ def normalize_scores(scores):
+     arr = np.array(scores)
+     if arr.max() == arr.min():
+         return np.zeros_like(arr)
+     return (arr - arr.min()) / (arr.max() - arr.min())
+
+ def combine_scores(query, questions, tokenized, doc_freqs, emb1, emb2):
+     total_docs = len(questions)
+     q_emb1 = model1.encode(query, convert_to_tensor=True)
+     q_emb2 = model2.encode(query, convert_to_tensor=True)
+
+     sim1 = util.pytorch_cos_sim(q_emb1, emb1)[0]
+     sim2 = util.pytorch_cos_sim(q_emb2, emb2)[0]
+     sim_scores = ((sim1 + sim2) / 2).cpu().numpy()
+
+     query_tokens = arabic_word_tokenize(query)  # tokenize once, not per document
+     bm25_scores = [lightweight_bm25_score(query_tokens, doc_tokens, doc_freqs, total_docs)
+                    for doc_tokens in tokenized]
+     word_scores = compute_word_overlap(query, questions)
+
+     norm_bm25 = normalize_scores(bm25_scores)
+     norm_word = normalize_scores(word_scores)
+     norm_sim = normalize_scores(sim_scores)
+
+     query_len = len(query_tokens)
+     if query_len <= 2:
+         w_sem, w_bm, w_word = 0.3, 0.4, 0.3
+     elif query_len <= 5:
+         w_sem, w_bm, w_word = 0.4, 0.35, 0.25
+     else:
+         w_sem, w_bm, w_word = 0.5, 0.3, 0.2
+
+     results = []
+     for i, q in enumerate(questions):
+         sem, bm, word = float(norm_sim[i]), float(norm_bm25[i]), float(norm_word[i])
+         combined = w_sem * sem + w_bm * bm + w_word * word
+         strong = sum([sem > 0.7, bm > 0.7, word > 0.5])
+         boost = 0.1 if strong >= 2 else (0.05 if strong == 1 else 0.0)
+         results.append({
+             "question": q,
+             "semantic_score": sem,
+             "bm25_score": bm,
+             "word_overlap_score": word,
+             "combined_score": combined + boost
+         })
+     return results
+
+ def get_top_diverse(results, links, top_k=5):
+     results = [dict(r, link=links[i]) for i, r in enumerate(results)]
+     top_combined = sorted(results, key=lambda x: x['combined_score'], reverse=True)[:3]
+     used_q = {r['question'] for r in top_combined}
+     top_bm = next((r for r in sorted(results, key=lambda x: x['bm25_score'], reverse=True)
+                    if r['question'] not in used_q), None)
+     if top_bm:
+         used_q.add(top_bm['question'])
+     top_sem = next((r for r in sorted(results, key=lambda x: x['semantic_score'], reverse=True)
+                     if r['question'] not in used_q), None)
+     final = top_combined + ([top_bm] if top_bm else []) + ([top_sem] if top_sem else [])
+     return final[:top_k]
+
+ def predict(query):
+     print(f"Query: {query}")
+     results1 = combine_scores(query, df["question"].values, tokenized1, doc_freqs1, embeddings1, embeddings1a)
+     results2 = combine_scores(query, df2["question"].values, tokenized2, doc_freqs2, embeddings2, embeddings2a)
+     results3 = combine_scores(query, df3["question"].values, tokenized3, doc_freqs3, embeddings3, embeddings3a)
+
+     return {
+         "top2": get_top_diverse(results2, df2["link"].values),
+         "top3": get_top_diverse(results3, df3["url"].values),
+         "top1": get_top_diverse(results1, df["link"].values),
          "query_info": {
+             "query_length": len(arabic_word_tokenize(query))
          }
      }

+ title = "Arabic Search: Dual-Model + BM25 + Overlap"
  iface = gr.Interface(
      fn=predict,
      inputs=[gr.Textbox(label="Search Query", lines=3)],
+     outputs="json",
      title=title,
+     description="Accurate Arabic search using two semantic models, fast BM25, and word overlap."
  )

  if __name__ == "__main__":
+     iface.launch()
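
A quick check of what the shared tokenizer keeps. This is a minimal standalone sketch: the stopword set below is abbreviated (the committed ARABIC_STOPWORDS is much larger and would also drop 'حكم'), and the sample sentence is invented.

import re

ARABIC_STOPWORDS = {'في', 'من', 'ما'}  # abbreviated for the example

def arabic_word_tokenize(text):
    if not isinstance(text, str):
        return []
    text = re.sub(r'[\u064B-\u065F\u0670]', '', text)  # strip diacritics
    # keep runs of two or more Arabic-block characters that are not stopwords
    return [t for t in re.findall(r'[\u0600-\u06FF]{2,}', text) if t not in ARABIC_STOPWORDS]

print(arabic_word_tokenize("ما حكم الصلاة في السفر"))  # ['حكم', 'الصلاة', 'السفر']

One quirk worth knowing: Arabic punctuation such as '؟' (U+061F) also falls inside the U+0600-U+06FF range, so a trailing question mark gets glued onto the final token rather than stripped.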
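The main functional change is dropping rank_bm25's BM25Okapi in favor of the hand-rolled lightweight_bm25_score. Here is a standalone sketch of that scorer on a tiny invented corpus (English tokens to keep it compact; avg_doc_len is hardcoded to 10 exactly as in the commit, where classic BM25 would use the corpus mean).

import math
from collections import Counter

def lightweight_bm25_score(query_tokens, doc_tokens, doc_freqs, total_docs, k1=1.2, b=0.75):
    score = 0.0
    doc_len = len(doc_tokens)
    avg_doc_len = 10  # hardcoded, as in the commit
    for term in query_tokens:
        if term in doc_tokens:
            tf = doc_tokens.count(term)
            df = doc_freqs.get(term, 0)
            if df > 0:
                idf = math.log((total_docs - df + 0.5) / (df + 0.5))
                score += idf * (tf * (k1 + 1)) / (tf + k1 * (1 - b + b * (doc_len / avg_doc_len)))
    return score

corpus = [["apple", "pie"], ["banana", "bread"], ["car", "engine", "oil"], ["grape", "jam"]]
doc_freqs = Counter(w for doc in corpus for w in set(doc))
print([round(lightweight_bm25_score(["apple"], d, doc_freqs, len(corpus)), 3) for d in corpus])
# [1.259, 0.0, 0.0, 0.0]: only the document containing "apple" scores

Note one behavioral difference: this IDF goes negative for terms that appear in more than half the documents, whereas BM25Okapi floors negative IDFs via its epsilon parameter.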
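Finally, a worked example of the fusion step: each signal is min-max normalized per query, combined with the length-dependent weights, and nudged by the agreement boost. The three candidate rows and their raw scores are invented.

import numpy as np

def normalize_scores(scores):
    arr = np.array(scores, dtype=float)
    if arr.max() == arr.min():
        return np.zeros_like(arr)
    return (arr - arr.min()) / (arr.max() - arr.min())

sem  = normalize_scores([0.82, 0.40, 0.75])  # averaged cosine similarities (invented)
bm   = normalize_scores([5.1, 0.0, 4.8])     # lightweight BM25 scores (invented)
word = normalize_scores([0.90, 0.10, 0.20])  # coverage/Jaccard blend (invented)

w_sem, w_bm, w_word = 0.4, 0.35, 0.25        # the commit's weights for 3-5 token queries
for i in range(3):
    combined = w_sem * sem[i] + w_bm * bm[i] + w_word * word[i]
    strong = sum([sem[i] > 0.7, bm[i] > 0.7, word[i] > 0.5])
    boost = 0.1 if strong >= 2 else (0.05 if strong == 1 else 0.0)
    print(f"doc{i}: combined={combined:.3f}, boost={boost}")
# doc0: combined=1.000, boost=0.1  (strong on all three signals)
# doc1: combined=0.000, boost=0.0
# doc2: combined=0.694, boost=0.1  (strong on semantic and BM25)

Since normalization is per query and per dataset, at least one candidate always hits 1.0 on each signal (unless all scores tie), so the boost fires readily near the top of each ranking.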