Update app.py

app.py CHANGED
@@ -2,117 +2,57 @@ import torch
 import pandas as pd
 from sentence_transformers import SentenceTransformer, util
 import gradio as gr
-import numpy as np
-from transformers import MarianMTModel, MarianTokenizer
-import re
-
-translator_model_name = "Helsinki-NLP/opus-mt-en-ar"
-translator_tokenizer = MarianTokenizer.from_pretrained(translator_model_name)
-translator_model = MarianMTModel.from_pretrained(translator_model_name)
-
-models = [
-    SentenceTransformer("paraphrase-multilingual-MiniLM-L12-v2"),
-    SentenceTransformer("distilbert-base-multilingual-cased"),
-    SentenceTransformer("sentence-transformers/paraphrase-multilingual-mpnet-base-v2")
-]
-
 
+model = SentenceTransformer("distilbert-base-multilingual-cased")
+modela = SentenceTransformer("paraphrase-multilingual-MiniLM-L12-v2")
 df = pd.read_csv("cleaned1.csv")
 df2 = pd.read_csv("cleaned2.csv")
 df3 = pd.read_csv("cleaned3.csv")
 
…
-    torch.load("embeddings1_1.pt"),  # Model 2 embeddings (paraphrase-multilingual-MiniLM-L12-v2)
-    torch.load("embeddings1_2.pt")   # Model 3 embeddings (paraphrase-multilingual-mpnet-base-v2)
-]
+embeddings = torch.load("embeddings1_1.pt")
+embeddings2 = torch.load("embeddings2_1.pt")
+embeddings3 = torch.load("embeddings3_1.pt")
 
…
-    torch.load("embeddings2_2.pt")   # Model 3 embeddings
-]
+embeddingsa = torch.load("embeddings1.pt")
+embeddingsa2 = torch.load("embeddings2.pt")
+embeddingsa3 = torch.load("embeddings3.pt")
 
…
-    torch.load("embeddings3.pt"),    # Model 1 embeddings
-    torch.load("embeddings3_1.pt"),  # Model 2 embeddings
-    torch.load("embeddings3_2.pt")   # Model 3 embeddings
-]
-
-# Pre-extract DataFrame columns
+# Pre-extract DataFrame columns to avoid repeated iloc calls
 df_questions = df["question"].values
 df_links = df["link"].values
 df2_questions = df2["question"].values
 df2_links = df2["link"].values
 df3_questions = df3["question"].values
 df3_links = df3["url"].values
-
-def is_arabic(text):
-    """Check if text contains Arabic characters"""
-    arabic_pattern = re.compile(r'[\u0600-\u06FF\u0750-\u077F\u08A0-\u08FF]')
-    return bool(arabic_pattern.search(text))
-
-def translate_to_arabic(text):
-    """Translate English text to Arabic"""
-    if is_arabic(text):
-        return text  # Already Arabic, no translation needed
-
-    try:
-        # Tokenize and translate
-        inputs = translator_tokenizer(text, return_tensors="pt", padding=True, truncation=True, max_length=512)
-        translated = translator_model.generate(**inputs, max_length=512, num_beams=4, early_stopping=True)
-        arabic_text = translator_tokenizer.decode(translated[0], skip_special_tokens=True)
-        return arabic_text
-    except Exception as e:
-        print(f"Translation error: {e}")
-        return text  # Return original text if translation fails
-
-def fast_ensemble_similarity(query_text, embeddings_list, models, weights=None):
-    """
-    Fast ensemble similarity calculation - matches each model with its correct embeddings
-    """
-    if weights is None:
-        weights = [1.0] * len(models)
-
-    all_scores = []
-
-    for i, (model, embeddings, weight) in enumerate(zip(models, embeddings_list, weights)):
-        # Each model uses its corresponding embeddings
-        query_embedding = model.encode(query_text, convert_to_tensor=True)
-        sim_scores = util.pytorch_cos_sim(query_embedding, embeddings)[0]
-        weighted_scores = sim_scores * weight
-        all_scores.append(weighted_scores)
-
-    # Combine scores efficiently
-    ensemble_scores = torch.stack(all_scores).mean(dim=0)
-    return ensemble_scores
-
-def predict_with_translation(text):
-    """Fast prediction with translation support - correctly matches models with embeddings"""
+def predict(text):
     if not text or text.strip() == "":
         return "No query provided"
 
…
+    query_embedding = model.encode(text, convert_to_tensor=True)
+    query_embeddinga = modela.encode(text, convert_to_tensor=True)
+    all_sim_scores1 = []
+    all_sim_scores2 = []
+    all_sim_scores3 = []
+    # Compute similarity scores
+    sim_scores1 = util.pytorch_cos_sim(query_embedding, embeddings)[0]
+    sim_scores2 = util.pytorch_cos_sim(query_embedding, embeddings2)[0]
+    sim_scores3 = util.pytorch_cos_sim(query_embedding, embeddings3)[0]
+    all_sim_scores1.append(sim_scores1)
+    all_sim_scores2.append(sim_scores2)
+    all_sim_scores3.append(sim_scores3)
+    sim_scores1 = util.pytorch_cos_sim(query_embeddinga, embeddingsa)[0]
+    sim_scores2 = util.pytorch_cos_sim(query_embeddinga, embeddingsa2)[0]
+    sim_scores3 = util.pytorch_cos_sim(query_embeddinga, embeddingsa3)[0]
+    all_sim_scores1.append(sim_scores1)
+    all_sim_scores2.append(sim_scores2)
+    all_sim_scores3.append(sim_scores3)
+    sim_scores1 = torch.stack(all_sim_scores1).mean(dim=0)
+    sim_scores2 = torch.stack(all_sim_scores2).mean(dim=0)
+    sim_scores3 = torch.stack(all_sim_scores3).mean(dim=0)
+    # Get top 3 values and indices in one call
     top3_scores1, top3_idx1 = sim_scores1.topk(3)
     top3_scores2, top3_idx2 = sim_scores2.topk(3)
     top3_scores3, top3_idx3 = sim_scores3.topk(3)
-
     # Convert to CPU once
     top3_idx1_cpu = top3_idx1.cpu().numpy()
     top3_idx2_cpu = top3_idx2.cpu().numpy()
@@ -121,92 +61,7 @@ def predict_with_translation(text):
     top3_scores1_cpu = top3_scores1.cpu().numpy()
     top3_scores2_cpu = top3_scores2.cpu().numpy()
     top3_scores3_cpu = top3_scores3.cpu().numpy()
-
-    # Format results
-    results = {
-        "top2": [
-            {
-                "question": df2_questions[idx],
-                "link": df2_links[idx],
-                "score": float(score)
-            }
-            for idx, score in zip(top3_idx2_cpu, top3_scores2_cpu)
-        ],
-        "top3": [
-            {
-                "question": df3_questions[idx],
-                "link": df3_links[idx],
-                "score": float(score)
-            }
-            for idx, score in zip(top3_idx3_cpu, top3_scores3_cpu)
-        ],
-        "top1": [
-            {
-                "question": df_questions[idx],
-                "link": df_links[idx],
-                "score": float(score)
-            }
-            for idx, score in zip(top3_idx1_cpu, top3_scores1_cpu)
-        ],
-        "arabic_text": arabic_text,
-    }
-
-    return results
-
-
-# Alternative version with dynamic model loading (saves memory)
-def predict_dynamic(text):
-    """
-    Alternative approach: encode with multiple models on-the-fly
-    Uses more computation but less memory
-    """
-    if not text or text.strip() == "":
-        return "No query provided"
-
-    # Load your original embeddings (generated with first model)
-    embeddings1 = torch.load("embeddings1_1.pt")
-    embeddings2 = torch.load("embeddings2_1.pt")
-    embeddings3 = torch.load("embeddings3_1.pt")
-
-    model_weights = [0.4, 0.35, 0.25]
-
-    # Calculate ensemble scores for each dataset
-    all_sim_scores1 = []
-    all_sim_scores2 = []
-    all_sim_scores3 = []
-
-    for i, model in enumerate(models):
-        query_embedding = model.encode(text, convert_to_tensor=True)
-
-        # For this example, using same embeddings for all models
-        # In practice, you'd want different embeddings for each model
-        sim1 = util.pytorch_cos_sim(query_embedding, embeddings1)[0] * model_weights[i]
-        sim2 = util.pytorch_cos_sim(query_embedding, embeddings2)[0] * model_weights[i]
-        sim3 = util.pytorch_cos_sim(query_embedding, embeddings3)[0] * model_weights[i]
-
-        all_sim_scores1.append(sim1)
-        all_sim_scores2.append(sim2)
-        all_sim_scores3.append(sim3)
-
-    # Combine scores
-    final_scores1 = torch.stack(all_sim_scores1).mean(dim=0)
-    final_scores2 = torch.stack(all_sim_scores2).mean(dim=0)
-    final_scores3 = torch.stack(all_sim_scores3).mean(dim=0)
-
-    # Get top results
-    top3_scores1, top3_idx1 = final_scores1.topk(3)
-    top3_scores2, top3_idx2 = final_scores2.topk(3)
-    top3_scores3, top3_idx3 = final_scores3.topk(3)
-
-    # Convert and format results (same as before)
-    top3_idx1_cpu = top3_idx1.cpu().numpy()
-    top3_idx2_cpu = top3_idx2.cpu().numpy()
-    top3_idx3_cpu = top3_idx3.cpu().numpy()
-
-    top3_scores1_cpu = top3_scores1.cpu().numpy()
-    top3_scores2_cpu = top3_scores2.cpu().numpy()
-    top3_scores3_cpu = top3_scores3.cpu().numpy()
-
+    # Prepare results using pre-extracted arrays
     results = {
 
         "top2": [
@@ -233,20 +88,16 @@ def predict_dynamic(text):
             }
             for idx, score in zip(top3_idx1_cpu, top3_scores1_cpu)
         ],
-        "arabic_text": text,
     }
 
     return results
 
-# …
-title = "…
+# Match the EXACT structure of your working translation app
+title = "Search CSV"
 iface = gr.Interface(
-    fn=…
-    inputs=[gr.Textbox(label="…
+    fn=predict,  # Changed from search_fatwa to predict
+    inputs=[gr.Textbox(label="text", lines=3)],
     outputs='json',
     title=title,
-    description="Ask questions in English or Arabic. English queries will be translated to Arabic for better matching."
 )
-
-if __name__ == "__main__":
-    iface.launch()
+iface.launch()
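Context for the change: the old version translated English queries to Arabic with MarianMT and averaged weighted cosine scores from three encoders; the new version drops translation and keeps a two-encoder ensemble over pre-computed embeddings. The commit assumes the `.pt` files already exist in the Space. Below is a minimal sketch of how they could be regenerated, assuming each file stores the encoded `question` column of its CSV and that the file-to-model pairing follows the variable names in the new code (distilbert with `embeddings*_1.pt`, MiniLM with `embeddings*.pt`); neither assumption is confirmed by the commit itself.

# Sketch only - not part of app.py. Filenames match the ones loaded above;
# the (file, model) pairing mirrors the new code and is an inference.
import pandas as pd
import torch
from sentence_transformers import SentenceTransformer

model = SentenceTransformer("distilbert-base-multilingual-cased")
modela = SentenceTransformer("paraphrase-multilingual-MiniLM-L12-v2")

for csv_path, distilbert_out, minilm_out in [
    ("cleaned1.csv", "embeddings1_1.pt", "embeddings1.pt"),
    ("cleaned2.csv", "embeddings2_1.pt", "embeddings2.pt"),
    ("cleaned3.csv", "embeddings3_1.pt", "embeddings3.pt"),
]:
    questions = pd.read_csv(csv_path)["question"].tolist()
    # encode() on a list returns a single tensor when convert_to_tensor=True
    torch.save(model.encode(questions, convert_to_tensor=True), distilbert_out)
    torch.save(modela.encode(questions, convert_to_tensor=True), minilm_out)

Caching the encodings this way keeps app startup to three `torch.load` calls per model instead of re-encoding every CSV on boot.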
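The scoring path in the new `predict` is a plain unweighted ensemble: each encoder scores the query against its own embedding matrix for a given corpus, and the per-corpus score vectors are stacked and averaged before `topk`. A self-contained toy version of that pattern (the corpus and query strings here are invented for illustration):

import torch
from sentence_transformers import SentenceTransformer, util

m1 = SentenceTransformer("distilbert-base-multilingual-cased")
m2 = SentenceTransformer("paraphrase-multilingual-MiniLM-L12-v2")

corpus = ["how do I learn programming", "what is artificial intelligence"]
emb1 = m1.encode(corpus, convert_to_tensor=True)  # each model gets its own matrix
emb2 = m2.encode(corpus, convert_to_tensor=True)

query = "getting started with coding"
q1 = m1.encode(query, convert_to_tensor=True)
q2 = m2.encode(query, convert_to_tensor=True)

# One scores list per corpus; every model appends to the SAME corpus's list.
scores = [
    util.pytorch_cos_sim(q1, emb1)[0],  # shape: (len(corpus),)
    util.pytorch_cos_sim(q2, emb2)[0],
]
ensemble = torch.stack(scores).mean(dim=0)
top_score, top_idx = ensemble.topk(1)
print(corpus[top_idx[0]], float(top_score[0]))

Note that all score vectors for one corpus must land in that corpus's list; stacking mismatched lists (or an empty one) would fail at `torch.stack`.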