mohbay committed on
Commit
6acd5d2
·
verified ·
1 Parent(s): 5eaeaa1

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +38 -187
app.py CHANGED
@@ -2,117 +2,57 @@ import torch
2
  import pandas as pd
3
  from sentence_transformers import SentenceTransformer, util
4
  import gradio as gr
5
- import numpy as np
6
- from transformers import MarianMTModel, MarianTokenizer
7
- import re
8
-
9
- translator_model_name = "Helsinki-NLP/opus-mt-en-ar"
10
- translator_tokenizer = MarianTokenizer.from_pretrained(translator_model_name)
11
- translator_model = MarianMTModel.from_pretrained(translator_model_name)
12
-
13
- models = [
14
- SentenceTransformer("paraphrase-multilingual-MiniLM-L12-v2"),
15
- SentenceTransformer("distilbert-base-multilingual-cased"),
16
- SentenceTransformer("sentence-transformers/paraphrase-multilingual-mpnet-base-v2")
17
- ]
18
-
19
 
 
 
20
  df = pd.read_csv("cleaned1.csv")
21
  df2 = pd.read_csv("cleaned2.csv")
22
  df3 = pd.read_csv("cleaned3.csv")
23
 
24
- # Load embeddings for each model - match embeddings to their corresponding models
25
- embeddings_list = [
26
- torch.load("embeddings1.pt"), # Model 1 embeddings (distilbert-base-multilingual-cased)
27
- torch.load("embeddings1_1.pt"), # Model 2 embeddings (paraphrase-multilingual-MiniLM-L12-v2)
28
- torch.load("embeddings1_2.pt") # Model 3 embeddings (paraphrase-multilingual-mpnet-base-v2)
29
- ]
30
 
31
- embeddings2_list = [
32
- torch.load("embeddings2.pt"), # Model 1 embeddings
33
- torch.load("embeddings2_1.pt"), # Model 2 embeddings
34
- torch.load("embeddings2_2.pt") # Model 3 embeddings
35
- ]
36
 
37
- embeddings3_list = [
38
- torch.load("embeddings3.pt"), # Model 1 embeddings
39
- torch.load("embeddings3_1.pt"), # Model 2 embeddings
40
- torch.load("embeddings3_2.pt") # Model 3 embeddings
41
- ]
42
-
43
- # Pre-extract DataFrame columns
44
  df_questions = df["question"].values
45
  df_links = df["link"].values
46
  df2_questions = df2["question"].values
47
  df2_links = df2["link"].values
48
  df3_questions = df3["question"].values
49
  df3_links = df3["url"].values
50
-
51
- def is_arabic(text):
52
- """Check if text contains Arabic characters"""
53
- arabic_pattern = re.compile(r'[\u0600-\u06FF\u0750-\u077F\u08A0-\u08FF]')
54
- return bool(arabic_pattern.search(text))
55
-
56
- def translate_to_arabic(text):
57
- """Translate English text to Arabic"""
58
- if is_arabic(text):
59
- return text # Already Arabic, no translation needed
60
-
61
- try:
62
- # Tokenize and translate
63
- inputs = translator_tokenizer(text, return_tensors="pt", padding=True, truncation=True, max_length=512)
64
- translated = translator_model.generate(**inputs, max_length=512, num_beams=4, early_stopping=True)
65
- arabic_text = translator_tokenizer.decode(translated[0], skip_special_tokens=True)
66
- return arabic_text
67
- except Exception as e:
68
- print(f"Translation error: {e}")
69
- return text # Return original text if translation fails
70
-
71
- def fast_ensemble_similarity(query_text, embeddings_list, models, weights=None):
72
- """
73
- Fast ensemble similarity calculation - matches each model with its correct embeddings
74
- """
75
- if weights is None:
76
- weights = [1.0] * len(models)
77
-
78
- all_scores = []
79
-
80
- for i, (model, embeddings, weight) in enumerate(zip(models, embeddings_list, weights)):
81
- # Each model uses its corresponding embeddings
82
- query_embedding = model.encode(query_text, convert_to_tensor=True)
83
- sim_scores = util.pytorch_cos_sim(query_embedding, embeddings)[0]
84
- weighted_scores = sim_scores * weight
85
- all_scores.append(weighted_scores)
86
-
87
- # Combine scores efficiently
88
- ensemble_scores = torch.stack(all_scores).mean(dim=0)
89
- return ensemble_scores
90
-
91
- def predict_with_translation(text):
92
- """Fast prediction with translation support - correctly matches models with embeddings"""
93
  if not text or text.strip() == "":
94
  return "No query provided"
95
 
96
- # Translate to Arabic if needed
97
- arabic_text = translate_to_arabic(text)
98
-
99
- # Model weights - adjust based on your testing
100
- model_weights = [0.35, 0.4, 0.25]
101
-
102
- # Choose which text to use for search
103
- search_text = arabic_text if arabic_text != text else text
104
-
105
- # Fast ensemble similarity calculation for each dataset
106
- # Each model now uses its correct corresponding embeddings
107
- sim_scores1 = fast_ensemble_similarity(search_text, embeddings_list, models, model_weights)
108
- sim_scores2 = fast_ensemble_similarity(search_text, embeddings2_list, models, model_weights)
109
- sim_scores3 = fast_ensemble_similarity(search_text, embeddings3_list, models, model_weights)
110
-
111
- # Get top 3 results efficiently
 
 
 
 
 
 
112
  top3_scores1, top3_idx1 = sim_scores1.topk(3)
113
  top3_scores2, top3_idx2 = sim_scores2.topk(3)
114
  top3_scores3, top3_idx3 = sim_scores3.topk(3)
115
-
116
  # Convert to CPU once
117
  top3_idx1_cpu = top3_idx1.cpu().numpy()
118
  top3_idx2_cpu = top3_idx2.cpu().numpy()
@@ -121,92 +61,7 @@ def predict_with_translation(text):
121
  top3_scores1_cpu = top3_scores1.cpu().numpy()
122
  top3_scores2_cpu = top3_scores2.cpu().numpy()
123
  top3_scores3_cpu = top3_scores3.cpu().numpy()
124
-
125
- # Format results
126
- results = {
127
- "top2": [
128
- {
129
- "question": df2_questions[idx],
130
- "link": df2_links[idx],
131
- "score": float(score)
132
- }
133
- for idx, score in zip(top3_idx2_cpu, top3_scores2_cpu)
134
- ],
135
- "top3": [
136
- {
137
- "question": df3_questions[idx],
138
- "link": df3_links[idx],
139
- "score": float(score)
140
- }
141
- for idx, score in zip(top3_idx3_cpu, top3_scores3_cpu)
142
- ],
143
- "top1": [
144
- {
145
- "question": df_questions[idx],
146
- "link": df_links[idx],
147
- "score": float(score)
148
- }
149
- for idx, score in zip(top3_idx1_cpu, top3_scores1_cpu)
150
- ],
151
- "arabic_text": arabic_text ,
152
- }
153
-
154
- return results
155
-
156
-
157
- # Alternative version with dynamic model loading (saves memory)
158
- def predict_dynamic(text):
159
- """
160
- Alternative approach: encode with multiple models on-the-fly
161
- Uses more computation but less memory
162
- """
163
- if not text or text.strip() == "":
164
- return "No query provided"
165
-
166
- # Load your original embeddings (generated with first model)
167
- embeddings1 = torch.load("embeddings1_1.pt")
168
- embeddings2 = torch.load("embeddings2_1.pt")
169
- embeddings3 = torch.load("embeddings3_1.pt")
170
-
171
- model_weights = [0.4, 0.35, 0.25]
172
-
173
- # Calculate ensemble scores for each dataset
174
- all_sim_scores1 = []
175
- all_sim_scores2 = []
176
- all_sim_scores3 = []
177
-
178
- for i, model in enumerate(models):
179
- query_embedding = model.encode(text, convert_to_tensor=True)
180
-
181
- # For this example, using same embeddings for all models
182
- # In practice, you'd want different embeddings for each model
183
- sim1 = util.pytorch_cos_sim(query_embedding, embeddings1)[0] * model_weights[i]
184
- sim2 = util.pytorch_cos_sim(query_embedding, embeddings2)[0] * model_weights[i]
185
- sim3 = util.pytorch_cos_sim(query_embedding, embeddings3)[0] * model_weights[i]
186
-
187
- all_sim_scores1.append(sim1)
188
- all_sim_scores2.append(sim2)
189
- all_sim_scores3.append(sim3)
190
-
191
- # Combine scores
192
- final_scores1 = torch.stack(all_sim_scores1).mean(dim=0)
193
- final_scores2 = torch.stack(all_sim_scores2).mean(dim=0)
194
- final_scores3 = torch.stack(all_sim_scores3).mean(dim=0)
195
-
196
- # Get top results
197
- top3_scores1, top3_idx1 = final_scores1.topk(3)
198
- top3_scores2, top3_idx2 = final_scores2.topk(3)
199
- top3_scores3, top3_idx3 = final_scores3.topk(3)
200
-
201
- # Convert and format results (same as before)
202
- top3_idx1_cpu = top3_idx1.cpu().numpy()
203
- top3_idx2_cpu = top3_idx2.cpu().numpy()
204
- top3_idx3_cpu = top3_idx3.cpu().numpy()
205
-
206
- top3_scores1_cpu = top3_scores1.cpu().numpy()
207
- top3_scores2_cpu = top3_scores2.cpu().numpy()
208
- top3_scores3_cpu = top3_scores3.cpu().numpy()
209
-
210
  results = {
211
 
212
  "top2": [
@@ -233,20 +88,16 @@ def predict_dynamic(text):
233
  }
234
  for idx, score in zip(top3_idx1_cpu, top3_scores1_cpu)
235
  ],
236
- "arabic_text": text ,
237
  }
238
 
239
  return results
240
 
241
- # Create Gradio interface
242
- title = "Enhanced Multi-Model Search with Translation"
243
  iface = gr.Interface(
244
- fn=predict_with_translation, # Use the new function with translation
245
- inputs=[gr.Textbox(label="Enter your question (English or Arabic)", lines=3)],
246
  outputs='json',
247
  title=title,
248
- description="Ask questions in English or Arabic. English queries will be translated to Arabic for better matching."
249
  )
250
-
251
- if __name__ == "__main__":
252
- iface.launch()
 
2
  import pandas as pd
3
  from sentence_transformers import SentenceTransformer, util
4
  import gradio as gr
 
 
 
 
 
 
 
 
 
 
 
 
 
 
5
 
6
+ model = SentenceTransformer("distilbert-base-multilingual-cased")
7
+ modela = SentenceTransformer("paraphrase-multilingual-MiniLM-L12-v2")
8
  df = pd.read_csv("cleaned1.csv")
9
  df2 = pd.read_csv("cleaned2.csv")
10
  df3 = pd.read_csv("cleaned3.csv")
11
 
12
+ embeddings = torch.load("embeddings1_1.pt")
13
+ embeddings2 = torch.load("embeddings2_1.pt")
14
+ embeddings3 = torch.load("embeddings3_1.pt")
 
 
 
15
 
16
+ embeddingsa = torch.load("embeddings1.pt")
17
+ embeddingsa2 = torch.load("embeddings2.pt")
18
+ embeddingsa3 = torch.load("embeddings3.pt")
 
 
19
 
20
+ # Pre-extract DataFrame columns to avoid repeated iloc calls
 
 
 
 
 
 
21
  df_questions = df["question"].values
22
  df_links = df["link"].values
23
  df2_questions = df2["question"].values
24
  df2_links = df2["link"].values
25
  df3_questions = df3["question"].values
26
  df3_links = df3["url"].values
27
+ def predict(text):
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
28
  if not text or text.strip() == "":
29
  return "No query provided"
30
 
# Encode the query once with each model; tensor output lets the result be
# passed straight to the cosine-similarity helper.
query_embedding = model.encode(text, convert_to_tensor=True)
query_embeddinga = modela.encode(text, convert_to_tensor=True)

# For every dataset, collect one similarity vector per model: `model` is
# scored against embeddings / embeddings2 / embeddings3 and `modela` against
# embeddingsa / embeddingsa2 / embeddingsa3. Index [0] selects the row that
# belongs to the single query.
#
# Fix: the dataset-3 vectors were previously appended to all_sim_scores1.
# That left all_sim_scores3 empty (torch.stack cannot stack an empty list)
# and mixed dataset-3 scores into the dataset-1 list.
all_sim_scores1 = [
    util.pytorch_cos_sim(query_embedding, embeddings)[0],
    util.pytorch_cos_sim(query_embeddinga, embeddingsa)[0],
]
all_sim_scores2 = [
    util.pytorch_cos_sim(query_embedding, embeddings2)[0],
    util.pytorch_cos_sim(query_embeddinga, embeddingsa2)[0],
]
all_sim_scores3 = [
    util.pytorch_cos_sim(query_embedding, embeddings3)[0],
    util.pytorch_cos_sim(query_embeddinga, embeddingsa3)[0],
]

# Ensemble: average the two models' scores element-wise per dataset.
# NOTE(review): stacking assumes both embedding files of a dataset hold the
# same number of rows, in the same order as the CSV — confirm.
sim_scores1 = torch.stack(all_sim_scores1).mean(dim=0)
sim_scores2 = torch.stack(all_sim_scores2).mean(dim=0)
sim_scores3 = torch.stack(all_sim_scores3).mean(dim=0)
52
+ # Get top 3 values and indices in one call
53
  top3_scores1, top3_idx1 = sim_scores1.topk(3)
54
  top3_scores2, top3_idx2 = sim_scores2.topk(3)
55
  top3_scores3, top3_idx3 = sim_scores3.topk(3)
 
56
  # Convert to CPU once
57
  top3_idx1_cpu = top3_idx1.cpu().numpy()
58
  top3_idx2_cpu = top3_idx2.cpu().numpy()
 
61
  top3_scores1_cpu = top3_scores1.cpu().numpy()
62
  top3_scores2_cpu = top3_scores2.cpu().numpy()
63
  top3_scores3_cpu = top3_scores3.cpu().numpy()
64
+ # Prepare results using pre-extracted arrays
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
65
  results = {
66
 
67
  "top2": [
 
88
  }
89
  for idx, score in zip(top3_idx1_cpu, top3_scores1_cpu)
90
  ],
 
91
  }
92
 
93
  return results
94
 
95
+ # Match the EXACT structure of your working translation app
96
+ title = "Search CSV"
97
  iface = gr.Interface(
98
+ fn=predict, # Changed from search_fatwa to predict
99
+ inputs=[gr.Textbox(label="text", lines=3)],
100
  outputs='json',
101
  title=title,
 
102
  )
103
+ iface.launch()