Spaces:
Running
Running
Update app.py
Browse files
app.py
CHANGED
@@ -3,13 +3,19 @@ import pandas as pd
|
|
3 |
from sentence_transformers import SentenceTransformer, util
|
4 |
import gradio as gr
|
5 |
import re
|
|
|
|
|
6 |
|
|
|
7 |
model = SentenceTransformer("distilbert-base-multilingual-cased")
|
8 |
modela = SentenceTransformer("paraphrase-multilingual-MiniLM-L12-v2")
|
|
|
|
|
9 |
df = pd.read_csv("cleaned1.csv")
|
10 |
df2 = pd.read_csv("cleaned2.csv")
|
11 |
df3 = pd.read_csv("cleaned3.csv")
|
12 |
|
|
|
13 |
embeddings = torch.load("embeddings1_1.pt")
|
14 |
embeddings2 = torch.load("embeddings2_1.pt")
|
15 |
embeddings3 = torch.load("embeddings3_1.pt")
|
@@ -18,6 +24,7 @@ embeddingsa = torch.load("embeddings1.pt")
|
|
18 |
embeddingsa2 = torch.load("embeddings2.pt")
|
19 |
embeddingsa3 = torch.load("embeddings3.pt")
|
20 |
|
|
|
21 |
df_questions = df["question"].values
|
22 |
df_links = df["link"].values
|
23 |
df2_questions = df2["question"].values
|
@@ -26,30 +33,105 @@ df3_questions = df3["question"].values
|
|
26 |
df3_links = df3["url"].values
|
27 |
|
28 |
def arabic_word_tokenize(text):
|
|
|
29 |
if not isinstance(text, str):
|
30 |
return []
|
31 |
-
|
|
|
|
|
|
|
|
|
32 |
|
33 |
-
def
|
|
|
34 |
query_words = set(arabic_word_tokenize(query))
|
|
|
|
|
|
|
35 |
overlaps = []
|
36 |
for q in questions:
|
37 |
q_words = set(arabic_word_tokenize(q))
|
38 |
-
if len(
|
39 |
-
|
40 |
-
|
41 |
-
|
42 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
43 |
return overlaps
|
44 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
45 |
def predict(text):
|
46 |
if not text or text.strip() == "":
|
47 |
return "No query provided"
|
48 |
|
|
|
49 |
query_embedding = model.encode(text, convert_to_tensor=True)
|
50 |
query_embeddinga = modela.encode(text, convert_to_tensor=True)
|
51 |
|
52 |
-
#
|
53 |
sim_scores1 = (util.pytorch_cos_sim(query_embedding, embeddings)[0] +
|
54 |
util.pytorch_cos_sim(query_embeddinga, embeddingsa)[0]) / 2
|
55 |
sim_scores2 = (util.pytorch_cos_sim(query_embedding, embeddings2)[0] +
|
@@ -57,65 +139,159 @@ def predict(text):
|
|
57 |
sim_scores3 = (util.pytorch_cos_sim(query_embedding, embeddings3)[0] +
|
58 |
util.pytorch_cos_sim(query_embeddinga, embeddingsa3)[0]) / 2
|
59 |
|
60 |
-
#
|
61 |
-
word_overlap1 =
|
62 |
-
word_overlap2 =
|
63 |
-
word_overlap3 =
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
64 |
|
65 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
66 |
|
67 |
-
#
|
68 |
-
|
69 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
70 |
"question": df_questions[i],
|
71 |
"link": df_links[i],
|
72 |
"cosine_score": float(sim_scores1[i].cpu().item()),
|
73 |
"word_overlap_score": float(word_overlap1[i]),
|
74 |
-
"
|
75 |
-
|
76 |
-
|
77 |
-
|
78 |
-
top1 = sorted(combined1, key=lambda x: x["combined_score"], reverse=True)[:3]
|
79 |
|
80 |
-
# Collect
|
81 |
-
combined2 = [
|
82 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
83 |
"question": df2_questions[i],
|
84 |
"link": df2_links[i],
|
85 |
"cosine_score": float(sim_scores2[i].cpu().item()),
|
86 |
"word_overlap_score": float(word_overlap2[i]),
|
87 |
-
"
|
88 |
-
|
89 |
-
|
90 |
-
|
91 |
-
top2 = sorted(combined2, key=lambda x: x["combined_score"], reverse=True)[:3]
|
92 |
|
93 |
-
# Collect
|
94 |
-
combined3 = [
|
95 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
96 |
"question": df3_questions[i],
|
97 |
"link": df3_links[i],
|
98 |
"cosine_score": float(sim_scores3[i].cpu().item()),
|
99 |
"word_overlap_score": float(word_overlap3[i]),
|
100 |
-
"
|
101 |
-
|
102 |
-
|
103 |
-
|
104 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
105 |
|
106 |
results = {
|
|
|
107 |
"top2": top2,
|
108 |
"top3": top3,
|
109 |
-
"
|
|
|
|
|
|
|
|
|
|
|
110 |
}
|
111 |
|
112 |
return results
|
113 |
|
114 |
-
title = "Search CSV"
|
115 |
iface = gr.Interface(
|
116 |
fn=predict,
|
117 |
-
inputs=[gr.Textbox(label="
|
118 |
outputs='json',
|
119 |
title=title,
|
|
|
120 |
)
|
121 |
-
|
|
|
|
|
|
3 |
from sentence_transformers import SentenceTransformer, util
|
4 |
import gradio as gr
|
5 |
import re
|
6 |
+
import numpy as np
|
7 |
+
from collections import Counter
|
8 |
|
9 |
+
# Load models
|
10 |
model = SentenceTransformer("distilbert-base-multilingual-cased")
|
11 |
modela = SentenceTransformer("paraphrase-multilingual-MiniLM-L12-v2")
|
12 |
+
|
13 |
+
# Load data
|
14 |
df = pd.read_csv("cleaned1.csv")
|
15 |
df2 = pd.read_csv("cleaned2.csv")
|
16 |
df3 = pd.read_csv("cleaned3.csv")
|
17 |
|
18 |
+
# Load embeddings
|
19 |
embeddings = torch.load("embeddings1_1.pt")
|
20 |
embeddings2 = torch.load("embeddings2_1.pt")
|
21 |
embeddings3 = torch.load("embeddings3_1.pt")
|
|
|
24 |
embeddingsa2 = torch.load("embeddings2.pt")
|
25 |
embeddingsa3 = torch.load("embeddings3.pt")
|
26 |
|
27 |
+
# Extract data arrays
|
28 |
df_questions = df["question"].values
|
29 |
df_links = df["link"].values
|
30 |
df2_questions = df2["question"].values
|
|
|
33 |
df3_links = df3["url"].values
|
34 |
|
35 |
# Characters that carry no lexical meaning and must not affect matching:
# honorific/Quranic combining marks (U+0610-061A, U+06D6-06ED), the tatweel
# elongation character (U+0640), harakat (U+064B-065F) and the superscript
# alef (U+0670).
_ARABIC_NOISE_RE = re.compile(
    r'[\u0610-\u061A\u0640\u064B-\u065F\u0670\u06D6-\u06ED]'
)

# For str patterns, \w is Unicode-aware: it matches Arabic, Latin and other
# letters, all decimal digits and the underscore, but NOT punctuation. An
# explicit U+0600-06FF range must not be used here because it also contains
# the Arabic comma, semicolon and question mark, which would then stick to
# the neighbouring word and break exact word matching.
_WORD_RE = re.compile(r'\w+')


def arabic_word_tokenize(text):
    """Split *text* into lowercase word tokens suitable for overlap matching.

    Diacritics, tatweel and annotation marks are removed first so that
    vocalised and unvocalised spellings of a word produce the same token.
    Punctuation (Arabic or otherwise) and whitespace act as separators and
    never appear in the result.

    Args:
        text: The string to tokenize. Any non-string value (``None``, NaN
            read from a CSV, numbers, ...) is treated as empty.

    Returns:
        A list of tokens in order of appearance; duplicates are kept. The
        list is empty for non-string or word-free input.
    """
    if not isinstance(text, str):
        return []
    cleaned = _ARABIC_NOISE_RE.sub('', text)
    # lower() only affects cased scripts; Arabic text passes through as is.
    return _WORD_RE.findall(cleaned.lower())
|
44 |
|
45 |
+
def compute_enhanced_word_overlap(query, questions):
    """Score every candidate question by its lexical overlap with the query.

    Both sides are tokenized with ``arabic_word_tokenize`` and reduced to
    sets of distinct words. Each candidate's score blends two ratios:

    * Jaccard similarity: shared words / words in either text (weight 0.6)
    * Query coverage: shared words / distinct query words (weight 0.4)

    Args:
        query: The user's search text.
        questions: A sized iterable of candidate question texts.

    Returns:
        A list of floats, one per candidate, in the same order. All scores
        are 0.0 when the query has no tokens; a candidate without tokens
        scores 0.0.
    """
    query_vocab = set(arabic_word_tokenize(query))
    if not query_vocab:
        return [0.0] * len(questions)

    def _score(candidate):
        candidate_vocab = set(arabic_word_tokenize(candidate))
        if not candidate_vocab:
            return 0.0

        shared = len(query_vocab & candidate_vocab)
        total = len(query_vocab | candidate_vocab)

        jaccard = shared / total if total > 0 else 0.0
        coverage = shared / len(query_vocab)
        return 0.6 * jaccard + 0.4 * coverage

    return [_score(candidate) for candidate in questions]
|
71 |
|
72 |
+
def compute_fuzzy_matches(query, questions):
    """Score candidates by the share of query words that partially match.

    A query word counts as matched when some word of the candidate contains
    it, or is contained in it, as a substring. Only pairs in which both
    words have at least three characters are compared, so very short words
    never produce a match.

    Args:
        query: The user's search text.
        questions: A sized iterable of candidate question texts.

    Returns:
        A list of floats, one per candidate, in the same order: matched
        query words divided by the total number of query words (repeated
        query words are counted each time). All scores are 0.0 when the
        query has no tokens; a candidate without tokens scores 0.0.
    """
    query_tokens = arabic_word_tokenize(query)
    if not query_tokens:
        return [0.0] * len(questions)

    def _is_partial_match(left, right):
        if len(left) < 3 or len(right) < 3:
            return False
        return left in right or right in left

    scores = []
    for candidate in questions:
        candidate_tokens = arabic_word_tokenize(candidate)
        if not candidate_tokens:
            scores.append(0.0)
            continue

        # Each query word contributes at most one hit per candidate.
        hits = sum(
            1
            for token in query_tokens
            if any(_is_partial_match(token, other) for other in candidate_tokens)
        )
        scores.append(hits / len(query_tokens))

    return scores
|
98 |
+
|
99 |
+
def compute_length_penalty(query, questions):
    """Compute a multiplicative length factor for every candidate.

    The factor is the ratio between the shorter and the longer of the two
    token counts (query vs. candidate), so candidates whose length is close
    to the query's get a factor near 1. Candidates with fewer than three
    tokens have their factor halved.

    Args:
        query: The user's search text.
        questions: An iterable of candidate question texts.

    Returns:
        A list of floats, one per candidate, in the same order. A candidate
        without tokens gets 0.0.
    """
    query_size = len(arabic_word_tokenize(query))

    def _factor(candidate):
        size = len(arabic_word_tokenize(candidate))
        if size == 0:
            return 0.0
        # size > 0 here, so the denominator is never zero even when the
        # query itself has no tokens.
        factor = min(query_size, size) / max(query_size, size)
        return factor * 0.5 if size < 3 else factor

    return [_factor(candidate) for candidate in questions]
|
118 |
+
|
119 |
+
def normalize_scores(scores):
    """Min-max scale *scores* into the 0-1 range.

    Args:
        scores: A sequence (list, tuple or array) of numeric scores.

    Returns:
        A new float ``numpy.ndarray`` of the same length. The smallest
        input maps to 0.0 and the largest to 1.0. Two inputs cannot be
        scaled and are returned as float copies of the input instead:

        * empty input, which returns an empty array instead of raising;
        * input whose values are all equal, which is returned unchanged
          (there is no spread to scale by), so those values may lie
          outside 0-1.
    """
    values = np.array(scores, dtype=float)
    if values.size == 0:
        # min()/max() raise ValueError on zero-size arrays.
        return values

    low = values.min()
    span = values.max() - low
    if span == 0:
        return values
    return (values - low) / span
|
125 |
+
|
126 |
def predict(text):
|
127 |
if not text or text.strip() == "":
|
128 |
return "No query provided"
|
129 |
|
130 |
+
# Encode query with both models
|
131 |
query_embedding = model.encode(text, convert_to_tensor=True)
|
132 |
query_embeddinga = modela.encode(text, convert_to_tensor=True)
|
133 |
|
134 |
+
# Compute semantic similarities
|
135 |
sim_scores1 = (util.pytorch_cos_sim(query_embedding, embeddings)[0] +
|
136 |
util.pytorch_cos_sim(query_embeddinga, embeddingsa)[0]) / 2
|
137 |
sim_scores2 = (util.pytorch_cos_sim(query_embedding, embeddings2)[0] +
|
|
|
139 |
sim_scores3 = (util.pytorch_cos_sim(query_embedding, embeddings3)[0] +
|
140 |
util.pytorch_cos_sim(query_embeddinga, embeddingsa3)[0]) / 2
|
141 |
|
142 |
+
# Compute enhanced word overlaps
|
143 |
+
word_overlap1 = compute_enhanced_word_overlap(text, df_questions)
|
144 |
+
word_overlap2 = compute_enhanced_word_overlap(text, df2_questions)
|
145 |
+
word_overlap3 = compute_enhanced_word_overlap(text, df3_questions)
|
146 |
+
|
147 |
+
# Compute fuzzy matches
|
148 |
+
fuzzy_scores1 = compute_fuzzy_matches(text, df_questions)
|
149 |
+
fuzzy_scores2 = compute_fuzzy_matches(text, df2_questions)
|
150 |
+
fuzzy_scores3 = compute_fuzzy_matches(text, df3_questions)
|
151 |
+
|
152 |
+
# Compute length penalties
|
153 |
+
length_penalties1 = compute_length_penalty(text, df_questions)
|
154 |
+
length_penalties2 = compute_length_penalty(text, df2_questions)
|
155 |
+
length_penalties3 = compute_length_penalty(text, df3_questions)
|
156 |
|
157 |
+
# Normalize all scores
|
158 |
+
sem_scores1 = normalize_scores([float(x.cpu().item()) for x in sim_scores1])
|
159 |
+
sem_scores2 = normalize_scores([float(x.cpu().item()) for x in sim_scores2])
|
160 |
+
sem_scores3 = normalize_scores([float(x.cpu().item()) for x in sim_scores3])
|
161 |
+
|
162 |
+
word_scores1 = normalize_scores(word_overlap1)
|
163 |
+
word_scores2 = normalize_scores(word_overlap2)
|
164 |
+
word_scores3 = normalize_scores(word_overlap3)
|
165 |
+
|
166 |
+
fuzzy_scores1_norm = normalize_scores(fuzzy_scores1)
|
167 |
+
fuzzy_scores2_norm = normalize_scores(fuzzy_scores2)
|
168 |
+
fuzzy_scores3_norm = normalize_scores(fuzzy_scores3)
|
169 |
|
170 |
+
# Adaptive weights based on query characteristics
|
171 |
+
query_words = arabic_word_tokenize(text)
|
172 |
+
if len(query_words) <= 2:
|
173 |
+
# Short queries: prioritize exact matches
|
174 |
+
semantic_weight = 0.3
|
175 |
+
word_weight = 0.5
|
176 |
+
fuzzy_weight = 0.2
|
177 |
+
elif len(query_words) <= 5:
|
178 |
+
# Medium queries: balanced approach
|
179 |
+
semantic_weight = 0.4
|
180 |
+
word_weight = 0.4
|
181 |
+
fuzzy_weight = 0.2
|
182 |
+
else:
|
183 |
+
# Long queries: prioritize semantic similarity
|
184 |
+
semantic_weight = 0.5
|
185 |
+
word_weight = 0.3
|
186 |
+
fuzzy_weight = 0.2
|
187 |
+
|
188 |
+
# Collect results for dataset 1
|
189 |
+
combined1 = []
|
190 |
+
for i in range(len(df_questions)):
|
191 |
+
combined_score = (
|
192 |
+
semantic_weight * sem_scores1[i] +
|
193 |
+
word_weight * word_scores1[i] +
|
194 |
+
fuzzy_weight * fuzzy_scores1_norm[i]
|
195 |
+
) * length_penalties1[i]
|
196 |
+
|
197 |
+
combined1.append({
|
198 |
"question": df_questions[i],
|
199 |
"link": df_links[i],
|
200 |
"cosine_score": float(sim_scores1[i].cpu().item()),
|
201 |
"word_overlap_score": float(word_overlap1[i]),
|
202 |
+
"fuzzy_score": float(fuzzy_scores1[i]),
|
203 |
+
"length_penalty": float(length_penalties1[i]),
|
204 |
+
"combined_score": float(combined_score)
|
205 |
+
})
|
|
|
206 |
|
207 |
+
# Collect results for dataset 2
|
208 |
+
combined2 = []
|
209 |
+
for i in range(len(df2_questions)):
|
210 |
+
combined_score = (
|
211 |
+
semantic_weight * sem_scores2[i] +
|
212 |
+
word_weight * word_scores2[i] +
|
213 |
+
fuzzy_weight * fuzzy_scores2_norm[i]
|
214 |
+
) * length_penalties2[i]
|
215 |
+
|
216 |
+
combined2.append({
|
217 |
"question": df2_questions[i],
|
218 |
"link": df2_links[i],
|
219 |
"cosine_score": float(sim_scores2[i].cpu().item()),
|
220 |
"word_overlap_score": float(word_overlap2[i]),
|
221 |
+
"fuzzy_score": float(fuzzy_scores2[i]),
|
222 |
+
"length_penalty": float(length_penalties2[i]),
|
223 |
+
"combined_score": float(combined_score)
|
224 |
+
})
|
|
|
225 |
|
226 |
+
# Collect results for dataset 3
|
227 |
+
combined3 = []
|
228 |
+
for i in range(len(df3_questions)):
|
229 |
+
combined_score = (
|
230 |
+
semantic_weight * sem_scores3[i] +
|
231 |
+
word_weight * word_scores3[i] +
|
232 |
+
fuzzy_weight * fuzzy_scores3_norm[i]
|
233 |
+
) * length_penalties3[i]
|
234 |
+
|
235 |
+
combined3.append({
|
236 |
"question": df3_questions[i],
|
237 |
"link": df3_links[i],
|
238 |
"cosine_score": float(sim_scores3[i].cpu().item()),
|
239 |
"word_overlap_score": float(word_overlap3[i]),
|
240 |
+
"fuzzy_score": float(fuzzy_scores3[i]),
|
241 |
+
"length_penalty": float(length_penalties3[i]),
|
242 |
+
"combined_score": float(combined_score)
|
243 |
+
})
|
244 |
+
|
245 |
+
# Get top results with diversity filtering
|
246 |
+
def get_diverse_top_results(results, top_k=5):
|
247 |
+
"""Get top results while avoiding too similar ones"""
|
248 |
+
sorted_results = sorted(results, key=lambda x: x["combined_score"], reverse=True)
|
249 |
+
|
250 |
+
diverse_results = []
|
251 |
+
for result in sorted_results:
|
252 |
+
if len(diverse_results) >= top_k:
|
253 |
+
break
|
254 |
+
|
255 |
+
# Check if this result is too similar to already selected ones
|
256 |
+
is_diverse = True
|
257 |
+
for selected in diverse_results:
|
258 |
+
# Simple diversity check based on word overlap
|
259 |
+
overlap = compute_enhanced_word_overlap(result["question"], [selected["question"]])[0]
|
260 |
+
if overlap > 0.8: # Too similar
|
261 |
+
is_diverse = False
|
262 |
+
break
|
263 |
+
|
264 |
+
if is_diverse:
|
265 |
+
diverse_results.append(result)
|
266 |
+
|
267 |
+
return diverse_results
|
268 |
+
|
269 |
+
top1 = get_diverse_top_results(combined1, 3)
|
270 |
+
top2 = get_diverse_top_results(combined2, 3)
|
271 |
+
top3 = get_diverse_top_results(combined3, 3)
|
272 |
|
273 |
results = {
|
274 |
+
"top1": top1,
|
275 |
"top2": top2,
|
276 |
"top3": top3,
|
277 |
+
"query_analysis": {
|
278 |
+
"word_count": len(query_words),
|
279 |
+
"semantic_weight": semantic_weight,
|
280 |
+
"word_weight": word_weight,
|
281 |
+
"fuzzy_weight": fuzzy_weight
|
282 |
+
}
|
283 |
}
|
284 |
|
285 |
return results
|
286 |
|
287 |
+
# Gradio UI wiring: a single multi-line textbox feeds `predict`, and whatever
# `predict` returns is rendered through Gradio's JSON output component.
title = "Enhanced Search CSV"
iface = gr.Interface(
    fn=predict,
    inputs=[gr.Textbox(label="Search Query", lines=3, placeholder="Enter your search query here...")],
    outputs='json',
    title=title,
    description="Enhanced semantic search with improved matching algorithms"
)

# Start the web server only when this file is executed as a script, so that
# importing the module does not launch the UI.
if __name__ == "__main__":
    iface.launch()
|