mohbay committed on
Commit
892da5a
·
verified ·
1 Parent(s): 6c11a17

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +217 -41
app.py CHANGED
@@ -3,13 +3,19 @@ import pandas as pd
3
  from sentence_transformers import SentenceTransformer, util
4
  import gradio as gr
5
  import re
 
 
6
 
 
7
  model = SentenceTransformer("distilbert-base-multilingual-cased")
8
  modela = SentenceTransformer("paraphrase-multilingual-MiniLM-L12-v2")
 
 
9
  df = pd.read_csv("cleaned1.csv")
10
  df2 = pd.read_csv("cleaned2.csv")
11
  df3 = pd.read_csv("cleaned3.csv")
12
 
 
13
  embeddings = torch.load("embeddings1_1.pt")
14
  embeddings2 = torch.load("embeddings2_1.pt")
15
  embeddings3 = torch.load("embeddings3_1.pt")
@@ -18,6 +24,7 @@ embeddingsa = torch.load("embeddings1.pt")
18
  embeddingsa2 = torch.load("embeddings2.pt")
19
  embeddingsa3 = torch.load("embeddings3.pt")
20
 
 
21
  df_questions = df["question"].values
22
  df_links = df["link"].values
23
  df2_questions = df2["question"].values
@@ -26,30 +33,105 @@ df3_questions = df3["question"].values
26
  df3_links = df3["url"].values
27
 
28
  def arabic_word_tokenize(text):
 
29
  if not isinstance(text, str):
30
  return []
31
- return re.findall(r'\w+', text)
 
 
 
 
32
 
33
- def compute_word_overlap(query, questions):
 
34
  query_words = set(arabic_word_tokenize(query))
 
 
 
35
  overlaps = []
36
  for q in questions:
37
  q_words = set(arabic_word_tokenize(q))
38
- if len(query_words) > 0:
39
- overlap_score = len(query_words & q_words) / len(query_words)
40
- else:
41
- overlap_score = 0.0
42
- overlaps.append(overlap_score)
 
 
 
 
 
 
 
 
 
 
 
43
  return overlaps
44
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
45
  def predict(text):
46
  if not text or text.strip() == "":
47
  return "No query provided"
48
 
 
49
  query_embedding = model.encode(text, convert_to_tensor=True)
50
  query_embeddinga = modela.encode(text, convert_to_tensor=True)
51
 
52
- # Cosine similarities
53
  sim_scores1 = (util.pytorch_cos_sim(query_embedding, embeddings)[0] +
54
  util.pytorch_cos_sim(query_embeddinga, embeddingsa)[0]) / 2
55
  sim_scores2 = (util.pytorch_cos_sim(query_embedding, embeddings2)[0] +
@@ -57,65 +139,159 @@ def predict(text):
57
  sim_scores3 = (util.pytorch_cos_sim(query_embedding, embeddings3)[0] +
58
  util.pytorch_cos_sim(query_embeddinga, embeddingsa3)[0]) / 2
59
 
60
- # Word overlaps
61
- word_overlap1 = compute_word_overlap(text, df_questions)
62
- word_overlap2 = compute_word_overlap(text, df2_questions)
63
- word_overlap3 = compute_word_overlap(text, df3_questions)
 
 
 
 
 
 
 
 
 
 
64
 
65
- weight = 0.4
 
 
 
 
 
 
 
 
 
 
 
66
 
67
- # Collect top1
68
- combined1 = [
69
- {
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
70
  "question": df_questions[i],
71
  "link": df_links[i],
72
  "cosine_score": float(sim_scores1[i].cpu().item()),
73
  "word_overlap_score": float(word_overlap1[i]),
74
- "combined_score": float(sim_scores1[i].cpu().item()) + weight * word_overlap1[i]
75
- }
76
- for i in range(len(df_questions))
77
- ]
78
- top1 = sorted(combined1, key=lambda x: x["combined_score"], reverse=True)[:3]
79
 
80
- # Collect top2
81
- combined2 = [
82
- {
 
 
 
 
 
 
 
83
  "question": df2_questions[i],
84
  "link": df2_links[i],
85
  "cosine_score": float(sim_scores2[i].cpu().item()),
86
  "word_overlap_score": float(word_overlap2[i]),
87
- "combined_score": float(sim_scores2[i].cpu().item()) + weight * word_overlap2[i]
88
- }
89
- for i in range(len(df2_questions))
90
- ]
91
- top2 = sorted(combined2, key=lambda x: x["combined_score"], reverse=True)[:3]
92
 
93
- # Collect top3
94
- combined3 = [
95
- {
 
 
 
 
 
 
 
96
  "question": df3_questions[i],
97
  "link": df3_links[i],
98
  "cosine_score": float(sim_scores3[i].cpu().item()),
99
  "word_overlap_score": float(word_overlap3[i]),
100
- "combined_score": float(sim_scores3[i].cpu().item()) + weight * word_overlap3[i]
101
- }
102
- for i in range(len(df3_questions))
103
- ]
104
- top3 = sorted(combined3, key=lambda x: x["combined_score"], reverse=True)[:3]
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
105
 
106
  results = {
 
107
  "top2": top2,
108
  "top3": top3,
109
- "top1": top1
 
 
 
 
 
110
  }
111
 
112
  return results
113
 
114
- title = "Search CSV"
115
  iface = gr.Interface(
116
  fn=predict,
117
- inputs=[gr.Textbox(label="text", lines=3)],
118
  outputs='json',
119
  title=title,
 
120
  )
121
- iface.launch()
 
 
 
3
  from sentence_transformers import SentenceTransformer, util
4
  import gradio as gr
5
  import re
6
+ import numpy as np
7
+ from collections import Counter
8
 
9
+ # Load models
10
  model = SentenceTransformer("distilbert-base-multilingual-cased")
11
  modela = SentenceTransformer("paraphrase-multilingual-MiniLM-L12-v2")
12
+
13
+ # Load data
14
  df = pd.read_csv("cleaned1.csv")
15
  df2 = pd.read_csv("cleaned2.csv")
16
  df3 = pd.read_csv("cleaned3.csv")
17
 
18
+ # Load embeddings
19
  embeddings = torch.load("embeddings1_1.pt")
20
  embeddings2 = torch.load("embeddings2_1.pt")
21
  embeddings3 = torch.load("embeddings3_1.pt")
 
24
  embeddingsa2 = torch.load("embeddings2.pt")
25
  embeddingsa3 = torch.load("embeddings3.pt")
26
 
27
+ # Extract data arrays
28
  df_questions = df["question"].values
29
  df_links = df["link"].values
30
  df2_questions = df2["question"].values
 
33
  df3_links = df3["url"].values
34
 
# Combining marks that must not influence matching: the Arabic signs
# U+0610-U+061A, harakat/tanween U+064B-U+065F, superscript alef U+0670 and
# the annotation marks U+06D6-U+06ED.
_ARABIC_DIACRITICS_RE = re.compile(r'[\u0610-\u061A\u064B-\u065F\u0670\u06D6-\u06ED]')

# For str patterns ``\w`` is Unicode-aware, so it already covers Arabic
# letters and digits.  Matching the whole U+0600-U+06FF block instead would
# also swallow Arabic punctuation (e.g. U+061F question mark, U+060C comma)
# and glue it onto the neighbouring word, which breaks exact word matching.
_WORD_RE = re.compile(r'\w+')


def arabic_word_tokenize(text):
    """Split *text* into lowercase word tokens with Arabic diacritics removed.

    Args:
        text: The string to tokenize. Any non-string value (``None``, a
            float NaN coming from a data frame, ...) is treated as empty.

    Returns:
        A list of tokens in order of appearance. Each token is a run of
        Unicode word characters (letters, digits, underscore); punctuation
        of any script acts as a separator.
    """
    if not isinstance(text, str):
        return []
    # Strip the marks first so that a vocalised and an unvocalised spelling
    # of the same word produce the same token.
    without_marks = _ARABIC_DIACRITICS_RE.sub('', text)
    return _WORD_RE.findall(without_marks.lower())
44
 
45
def compute_enhanced_word_overlap(query, questions):
    """Score how strongly each question shares words with the query.

    The score for one question is ``0.6 * jaccard + 0.4 * coverage`` where
    *jaccard* is the size of the shared word set over the size of the
    combined word set, and *coverage* is the fraction of distinct query
    words that occur in the question.

    Args:
        query: The search text.
        questions: A sized iterable of candidate texts.

    Returns:
        A list of floats, one per entry of *questions* and in the same
        order. Entries are 0.0 when the query or the question yields no
        tokens.
    """
    query_words = set(arabic_word_tokenize(query))
    if not query_words:
        return [0.0] * len(questions)

    overlaps = []
    for question in questions:
        question_words = set(arabic_word_tokenize(question))
        if not question_words:
            overlaps.append(0.0)
            continue

        shared = len(query_words & question_words)
        # Both sets are non-empty here, so the union cannot be empty and no
        # zero-division guard is needed.
        jaccard = shared / len(query_words | question_words)
        coverage = shared / len(query_words)
        overlaps.append(0.6 * jaccard + 0.4 * coverage)

    return overlaps
71
 
72
def compute_fuzzy_matches(query, questions, min_word_len=3):
    """Score each question by partial (substring) word matches with the query.

    A query word counts as matched when it and some word of the question
    are both at least *min_word_len* characters long and one of them is a
    substring of the other. The score is the number of matched query words
    divided by the total number of query tokens (short and repeated tokens
    stay in the denominator).

    Args:
        query: The search text.
        questions: A sized iterable of candidate texts.
        min_word_len: Minimum length a word needs to take part in substring
            matching. Defaults to 3.

    Returns:
        A list of floats, one per entry of *questions* and in the same
        order. All entries are 0.0 when the query yields no tokens.
    """
    query_words = arabic_word_tokenize(query)
    if not query_words:
        return [0.0] * len(questions)

    # The length filter on the query side does not depend on the question,
    # so apply it once instead of inside the innermost loop.
    eligible_query_words = [w for w in query_words if len(w) >= min_word_len]
    total_query_words = len(query_words)

    fuzzy_scores = []
    for question in questions:
        # A set is enough: only the existence of a matching word matters.
        doc_words = {
            w for w in arabic_word_tokenize(question) if len(w) >= min_word_len
        }
        matches = sum(
            1
            for query_word in eligible_query_words
            if any(query_word in dw or dw in query_word for dw in doc_words)
        )
        fuzzy_scores.append(matches / total_query_words)

    return fuzzy_scores
98
+
99
def compute_length_penalty(query, questions):
    """Return a length-similarity multiplier for every question.

    The multiplier is the ratio of the shorter to the longer token count of
    query and question, halved when the question has fewer than three
    tokens, and 0.0 when the question yields no tokens at all.

    Args:
        query: The search text.
        questions: An iterable of candidate texts.

    Returns:
        A list of floats, one per entry of *questions* and in the same order.
    """
    query_token_count = len(arabic_word_tokenize(query))

    def _multiplier(candidate):
        token_count = len(arabic_word_tokenize(candidate))
        if token_count == 0:
            return 0.0
        shorter, longer = sorted((query_token_count, token_count))
        factor = shorter / longer
        # Very short candidates are down-weighted further.
        return factor * 0.5 if token_count < 3 else factor

    return [_multiplier(candidate) for candidate in questions]
118
+
119
def normalize_scores(scores):
    """Min-max scale *scores* into the 0-1 range.

    Args:
        scores: A sequence (or array) of numbers.

    Returns:
        A new float ``numpy.ndarray`` with the same length as *scores*.
        When all values are equal there is no range to scale by, so the
        values are returned unscaled. Empty input yields an empty array
        instead of raising.
    """
    values = np.array(scores, dtype=float)
    # min()/max() raise ValueError on a zero-size array, so bail out early.
    if values.size == 0:
        return values

    lowest = values.min()
    spread = values.max() - lowest
    if spread == 0:
        return values
    return (values - lowest) / spread
125
+
# Word-overlap score above which a candidate is considered a near-duplicate
# of an already selected result.
_MAX_RESULT_OVERLAP = 0.8


def _select_weights(word_count):
    """Return ``(semantic, word, fuzzy)`` weights for a query of *word_count* tokens."""
    if word_count <= 2:
        # Short queries: prioritize exact matches
        return 0.3, 0.5, 0.2
    if word_count <= 5:
        # Medium queries: balanced approach
        return 0.4, 0.4, 0.2
    # Long queries: prioritize semantic similarity
    return 0.5, 0.3, 0.2


def _diverse_top_results(candidates, top_k):
    """Pick up to *top_k* entries from best-first *candidates*, skipping near-duplicates."""
    selected = []
    for candidate in candidates:
        if len(selected) >= top_k:
            break
        too_similar = any(
            compute_enhanced_word_overlap(
                candidate["question"], [kept["question"]]
            )[0] > _MAX_RESULT_OVERLAP
            for kept in selected
        )
        if not too_similar:
            selected.append(candidate)
    return selected


def _rank_dataset(text, questions, links, sim_scores, weights, top_k=3):
    """Score every question of one dataset against *text* and return the best *top_k*."""
    if len(questions) == 0:
        return []

    semantic_weight, word_weight, fuzzy_weight = weights

    # One bulk transfer instead of a .cpu().item() round trip per row.
    cosine = sim_scores.cpu().tolist()
    word_overlap = compute_enhanced_word_overlap(text, questions)
    fuzzy = compute_fuzzy_matches(text, questions)
    penalties = compute_length_penalty(text, questions)

    combined = (
        semantic_weight * normalize_scores(cosine)
        + word_weight * normalize_scores(word_overlap)
        + fuzzy_weight * normalize_scores(fuzzy)
    ) * np.asarray(penalties)

    # sorted() is stable, so rows with equal scores keep their dataset order.
    ranked = sorted(range(len(questions)), key=lambda i: combined[i], reverse=True)

    # Result dicts are built lazily: only rows actually inspected by the
    # diversity filter are materialised.
    candidates = (
        {
            "question": questions[i],
            "link": links[i],
            "cosine_score": float(cosine[i]),
            "word_overlap_score": float(word_overlap[i]),
            "fuzzy_score": float(fuzzy[i]),
            "length_penalty": float(penalties[i]),
            "combined_score": float(combined[i]),
        }
        for i in ranked
    )
    return _diverse_top_results(candidates, top_k)


def predict(text):
    """Search the three question datasets for entries matching *text*.

    Each dataset is scored with the average cosine similarity of the two
    sentence-embedding models, the word-overlap score and the fuzzy
    substring score (each min-max normalised, then weighted according to
    the query length) and finally multiplied by the length multiplier.

    Args:
        text: The user query.

    Returns:
        The string ``"No query provided"`` for an empty or whitespace-only
        query. Otherwise a dict with the keys ``"top1"``, ``"top2"`` and
        ``"top3"`` (up to three result dicts per dataset, best first) and
        ``"query_analysis"`` (token count and the weights that were used).
    """
    if not text or not text.strip():
        return "No query provided"

    # Encode query with both models
    query_embedding = model.encode(text, convert_to_tensor=True)
    query_embeddinga = modela.encode(text, convert_to_tensor=True)

    def blended_similarity(doc_embeddings, doc_embeddingsa):
        """Average the cosine similarities obtained from the two models."""
        return (util.pytorch_cos_sim(query_embedding, doc_embeddings)[0] +
                util.pytorch_cos_sim(query_embeddinga, doc_embeddingsa)[0]) / 2

    # Adaptive weights based on query characteristics
    query_words = arabic_word_tokenize(text)
    weights = _select_weights(len(query_words))
    semantic_weight, word_weight, fuzzy_weight = weights

    # NOTE(review): the second half of the dataset-2 similarity expression
    # was not visible in the reviewed diff; pairing embeddings2 with
    # embeddingsa2 mirrors datasets 1 and 3 -- confirm against app.py.
    datasets = (
        ("top1", df_questions, df_links, embeddings, embeddingsa),
        ("top2", df2_questions, df2_links, embeddings2, embeddingsa2),
        ("top3", df3_questions, df3_links, embeddings3, embeddingsa3),
    )

    results = {
        key: _rank_dataset(
            text, questions, links, blended_similarity(emb, emb_a), weights
        )
        for key, questions, links, emb, emb_a in datasets
    }
    results["query_analysis"] = {
        "word_count": len(query_words),
        "semantic_weight": semantic_weight,
        "word_weight": word_weight,
        "fuzzy_weight": fuzzy_weight,
    }

    return results
286
 
287
# Gradio front end: a single multi-line text box feeds predict() and the
# returned structure is rendered as JSON.
title = "Enhanced Search CSV"

_query_box = gr.Textbox(
    label="Search Query",
    lines=3,
    placeholder="Enter your search query here...",
)

iface = gr.Interface(
    fn=predict,
    inputs=[_query_box],
    outputs="json",
    title=title,
    description="Enhanced semantic search with improved matching algorithms",
)

# Start the web server only when executed as a script, not on import.
if __name__ == "__main__":
    iface.launch()