Spaces:
Running
Running
Update app.py
Browse files
app.py
CHANGED
@@ -2,6 +2,7 @@ import torch
|
|
2 |
import pandas as pd
|
3 |
from sentence_transformers import SentenceTransformer, util
|
4 |
import gradio as gr
|
|
|
5 |
|
6 |
model = SentenceTransformer("distilbert-base-multilingual-cased")
|
7 |
modela = SentenceTransformer("paraphrase-multilingual-MiniLM-L12-v2")
|
@@ -24,6 +25,22 @@ df2_questions = df2["question"].values
|
|
24 |
df2_links = df2["link"].values
|
25 |
df3_questions = df3["question"].values
|
26 |
df3_links = df3["url"].values
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
27 |
def predict(text):
|
28 |
if not text or text.strip() == "":
|
29 |
return "No query provided"
|
@@ -53,25 +70,76 @@ def predict(text):
|
|
53 |
sim_scores1 = torch.stack(all_sim_scores1).mean(dim=0)
|
54 |
sim_scores2 = torch.stack(all_sim_scores2).mean(dim=0)
|
55 |
sim_scores3 = torch.stack(all_sim_scores3).mean(dim=0)
|
56 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
57 |
top3_scores1, top3_idx1 = sim_scores1.topk(3)
|
58 |
top3_scores2, top3_idx2 = sim_scores2.topk(3)
|
59 |
top3_scores3, top3_idx3 = sim_scores3.topk(3)
|
60 |
-
|
61 |
top3_idx1_cpu = top3_idx1.cpu().numpy()
|
62 |
top3_idx2_cpu = top3_idx2.cpu().numpy()
|
63 |
top3_idx3_cpu = top3_idx3.cpu().numpy()
|
64 |
-
|
65 |
top3_scores1_cpu = top3_scores1.cpu().numpy()
|
66 |
top3_scores2_cpu = top3_scores2.cpu().numpy()
|
67 |
top3_scores3_cpu = top3_scores3.cpu().numpy()
|
68 |
-
|
69 |
results = {
|
70 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
71 |
"top2": [
|
72 |
{
|
73 |
"question": df2_questions[idx],
|
74 |
-
"link": df2_links[idx],
|
75 |
"score": float(score)
|
76 |
}
|
77 |
for idx, score in zip(top3_idx2_cpu, top3_scores2_cpu)
|
@@ -79,29 +147,21 @@ def predict(text):
|
|
79 |
"top3": [
|
80 |
{
|
81 |
"question": df3_questions[idx],
|
82 |
-
"link": df3_links[idx],
|
83 |
"score": float(score)
|
84 |
}
|
85 |
for idx, score in zip(top3_idx3_cpu, top3_scores3_cpu)
|
86 |
],
|
87 |
-
"
|
88 |
-
{
|
89 |
-
"question": df_questions[idx],
|
90 |
-
"link": df_links[idx],
|
91 |
-
"score": float(score)
|
92 |
-
}
|
93 |
-
for idx, score in zip(top3_idx1_cpu, top3_scores1_cpu)
|
94 |
-
],
|
95 |
}
|
96 |
-
|
97 |
return results
|
98 |
|
99 |
-
# Match the EXACT structure of your working translation app
|
100 |
title = "Search CSV"
|
101 |
iface = gr.Interface(
|
102 |
-
fn=predict,
|
103 |
inputs=[gr.Textbox(label="text", lines=3)],
|
104 |
outputs='json',
|
105 |
title=title,
|
106 |
)
|
107 |
-
iface.launch()
|
|
|
2 |
import pandas as pd
|
3 |
from sentence_transformers import SentenceTransformer, util
|
4 |
import gradio as gr
|
5 |
+
import re
|
6 |
|
7 |
model = SentenceTransformer("distilbert-base-multilingual-cased")
|
8 |
modela = SentenceTransformer("paraphrase-multilingual-MiniLM-L12-v2")
|
|
|
25 |
df2_links = df2["link"].values
|
26 |
df3_questions = df3["question"].values
|
27 |
df3_links = df3["url"].values
|
28 |
+
|
29 |
+
# Arabic short-vowel / tashkeel marks (fathatan..sukun) and superscript alef.
# They are combining marks, which Python's ``\w`` does NOT match, so leaving
# them in place would split a vocalised word at every mark.
_ARABIC_DIACRITICS = re.compile(r'[\u064B-\u0652\u0670]')


def arabic_word_tokenize(text):
    """Split ``text`` into word tokens, ignoring Arabic diacritics.

    Arabic harakat are removed before tokenising so that a vocalised word is
    returned as one token and compares equal to its unvocalised spelling.
    Text that contains none of these marks is tokenised exactly as
    ``re.findall(r'\\w+', text)`` would.

    Args:
        text: The string to tokenise.

    Returns:
        A list of tokens (maximal runs of Unicode word characters) in the
        order they appear; an empty list if there are none.
    """
    stripped = _ARABIC_DIACRITICS.sub('', text)
    return re.findall(r'\w+', stripped)
|
31 |
+
|
32 |
+
def compute_word_overlap(query, questions):
    """Score each question by the share of the query's distinct words it contains.

    Args:
        query: The search text entered by the user.
        questions: Iterable of candidate question strings.

    Returns:
        A list of floats in ``[0.0, 1.0]``, one per entry of ``questions`` and
        in the same order, computed as
        ``len(query_words & question_words) / len(query_words)``.
        Every score is ``0.0`` when the query contains no word tokens.
    """
    query_words = set(arabic_word_tokenize(query))
    # The query is fixed for the whole call, so decide the empty case once
    # instead of re-checking it for every question.
    if not query_words:
        return [0.0 for _ in questions]

    query_size = len(query_words)
    overlaps = []
    for q in questions:
        # NOTE(review): presumably the questions come from a CSV column, where
        # an empty cell would arrive as a float NaN rather than a str - confirm.
        # Such entries cannot be tokenised, so they share no words.
        if not isinstance(q, str):
            overlaps.append(0.0)
            continue
        q_words = set(arabic_word_tokenize(q))
        overlaps.append(len(query_words & q_words) / query_size)
    return overlaps
|
43 |
+
|
44 |
def predict(text):
|
45 |
if not text or text.strip() == "":
|
46 |
return "No query provided"
|
|
|
70 |
sim_scores1 = torch.stack(all_sim_scores1).mean(dim=0)
|
71 |
sim_scores2 = torch.stack(all_sim_scores2).mean(dim=0)
|
72 |
sim_scores3 = torch.stack(all_sim_scores3).mean(dim=0)
|
73 |
+
|
74 |
+
# Compute word overlap scores
|
75 |
+
word_overlap1 = compute_word_overlap(text, df_questions)
|
76 |
+
|
77 |
+
word_overlap2 = compute_word_overlap(text, df2_questions)
|
78 |
+
word_overlap3 = compute_word_overlap(text, df3_questions)
|
79 |
+
|
80 |
+
# Prepare combined results list
|
81 |
+
weight = 0.5 # word overlap weight
|
82 |
+
combined_results = []
|
83 |
+
|
84 |
+
for i, score in enumerate(sim_scores1.cpu().numpy()):
|
85 |
+
combined_score = float(score) + weight * word_overlap1[i]
|
86 |
+
combined_results.append({
|
87 |
+
"question": df_questions[i],
|
88 |
+
"link": df_links[i],
|
89 |
+
"cosine_score": float(score),
|
90 |
+
"word_overlap_score": float(word_overlap1[i]),
|
91 |
+
"combined_score": combined_score
|
92 |
+
})
|
93 |
+
|
94 |
+
for i, score in enumerate(sim_scores2.cpu().numpy()):
|
95 |
+
combined_score = float(score) + weight * word_overlap2[i]
|
96 |
+
combined_results.append({
|
97 |
+
"question": df2_questions[i],
|
98 |
+
"link": df2_links[i],
|
99 |
+
"cosine_score": float(score),
|
100 |
+
"word_overlap_score": float(word_overlap2[i]),
|
101 |
+
"combined_score": combined_score
|
102 |
+
})
|
103 |
+
|
104 |
+
for i, score in enumerate(sim_scores3.cpu().numpy()):
|
105 |
+
combined_score = float(score) + weight * word_overlap3[i]
|
106 |
+
combined_results.append({
|
107 |
+
"question": df3_questions[i],
|
108 |
+
"link": df3_links[i],
|
109 |
+
"cosine_score": float(score),
|
110 |
+
"word_overlap_score": float(word_overlap3[i]),
|
111 |
+
"combined_score": combined_score
|
112 |
+
})
|
113 |
+
|
114 |
+
# Get top 3 combined
|
115 |
+
top3_combined = sorted(combined_results, key=lambda x: x["combined_score"], reverse=True)[:3]
|
116 |
+
|
117 |
+
# Also keep your original top1/top2/top3 as is
|
118 |
top3_scores1, top3_idx1 = sim_scores1.topk(3)
|
119 |
top3_scores2, top3_idx2 = sim_scores2.topk(3)
|
120 |
top3_scores3, top3_idx3 = sim_scores3.topk(3)
|
121 |
+
|
122 |
top3_idx1_cpu = top3_idx1.cpu().numpy()
|
123 |
top3_idx2_cpu = top3_idx2.cpu().numpy()
|
124 |
top3_idx3_cpu = top3_idx3.cpu().numpy()
|
125 |
+
|
126 |
top3_scores1_cpu = top3_scores1.cpu().numpy()
|
127 |
top3_scores2_cpu = top3_scores2.cpu().numpy()
|
128 |
top3_scores3_cpu = top3_scores3.cpu().numpy()
|
129 |
+
|
130 |
results = {
|
131 |
+
"top1": [
|
132 |
+
{
|
133 |
+
"question": df_questions[idx],
|
134 |
+
"link": df_links[idx],
|
135 |
+
"score": float(score)
|
136 |
+
}
|
137 |
+
for idx, score in zip(top3_idx1_cpu, top3_scores1_cpu)
|
138 |
+
],
|
139 |
"top2": [
|
140 |
{
|
141 |
"question": df2_questions[idx],
|
142 |
+
"link": df2_links[idx],
|
143 |
"score": float(score)
|
144 |
}
|
145 |
for idx, score in zip(top3_idx2_cpu, top3_scores2_cpu)
|
|
|
147 |
"top3": [
|
148 |
{
|
149 |
"question": df3_questions[idx],
|
150 |
+
"link": df3_links[idx],
|
151 |
"score": float(score)
|
152 |
}
|
153 |
for idx, score in zip(top3_idx3_cpu, top3_scores3_cpu)
|
154 |
],
|
155 |
+
"top3_combined": top3_combined
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
156 |
}
|
157 |
+
|
158 |
return results
|
159 |
|
|
|
160 |
title = "Search CSV"
|
161 |
iface = gr.Interface(
|
162 |
+
fn=predict,
|
163 |
inputs=[gr.Textbox(label="text", lines=3)],
|
164 |
outputs='json',
|
165 |
title=title,
|
166 |
)
|
167 |
+
iface.launch()
|