mohbay committed on
Commit
762dded
·
verified ·
1 Parent(s): 7d96acd

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +79 -19
app.py CHANGED
@@ -2,6 +2,7 @@ import torch
2
  import pandas as pd
3
  from sentence_transformers import SentenceTransformer, util
4
  import gradio as gr
 
5
 
6
  model = SentenceTransformer("distilbert-base-multilingual-cased")
7
  modela = SentenceTransformer("paraphrase-multilingual-MiniLM-L12-v2")
@@ -24,6 +25,22 @@ df2_questions = df2["question"].values
24
  df2_links = df2["link"].values
25
  df3_questions = df3["question"].values
26
  df3_links = df3["url"].values
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
27
  def predict(text):
28
  if not text or text.strip() == "":
29
  return "No query provided"
@@ -53,25 +70,76 @@ def predict(text):
53
  sim_scores1 = torch.stack(all_sim_scores1).mean(dim=0)
54
  sim_scores2 = torch.stack(all_sim_scores2).mean(dim=0)
55
  sim_scores3 = torch.stack(all_sim_scores3).mean(dim=0)
56
- # Get top 3 values and indices in one call
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
57
  top3_scores1, top3_idx1 = sim_scores1.topk(3)
58
  top3_scores2, top3_idx2 = sim_scores2.topk(3)
59
  top3_scores3, top3_idx3 = sim_scores3.topk(3)
60
- # Convert to CPU once
61
  top3_idx1_cpu = top3_idx1.cpu().numpy()
62
  top3_idx2_cpu = top3_idx2.cpu().numpy()
63
  top3_idx3_cpu = top3_idx3.cpu().numpy()
64
-
65
  top3_scores1_cpu = top3_scores1.cpu().numpy()
66
  top3_scores2_cpu = top3_scores2.cpu().numpy()
67
  top3_scores3_cpu = top3_scores3.cpu().numpy()
68
- # Prepare results using pre-extracted arrays
69
  results = {
70
-
 
 
 
 
 
 
 
71
  "top2": [
72
  {
73
  "question": df2_questions[idx],
74
- "link": df2_links[idx],
75
  "score": float(score)
76
  }
77
  for idx, score in zip(top3_idx2_cpu, top3_scores2_cpu)
@@ -79,29 +147,21 @@ def predict(text):
79
  "top3": [
80
  {
81
  "question": df3_questions[idx],
82
- "link": df3_links[idx],
83
  "score": float(score)
84
  }
85
  for idx, score in zip(top3_idx3_cpu, top3_scores3_cpu)
86
  ],
87
- "top1": [
88
- {
89
- "question": df_questions[idx],
90
- "link": df_links[idx],
91
- "score": float(score)
92
- }
93
- for idx, score in zip(top3_idx1_cpu, top3_scores1_cpu)
94
- ],
95
  }
96
-
97
  return results
98
 
99
- # Match the EXACT structure of your working translation app
100
  title = "Search CSV"
101
  iface = gr.Interface(
102
- fn=predict, # Changed from search_fatwa to predict
103
  inputs=[gr.Textbox(label="text", lines=3)],
104
  outputs='json',
105
  title=title,
106
  )
107
- iface.launch()
 
2
  import pandas as pd
3
  from sentence_transformers import SentenceTransformer, util
4
  import gradio as gr
5
+ import re
6
 
7
  model = SentenceTransformer("distilbert-base-multilingual-cased")
8
  modela = SentenceTransformer("paraphrase-multilingual-MiniLM-L12-v2")
 
25
  df2_links = df2["link"].values
26
  df3_questions = df3["question"].values
27
  df3_links = df3["url"].values
28
+
29
# Arabic tashkeel marks (U+064B-U+065F), superscript alef (U+0670) and the
# tatweel elongation character (U+0640). The combining marks are not matched
# by ``\w`` in Python's ``re`` module, so leaving them in place would split a
# vocalised word into fragments and prevent it from matching its plain form.
_ARABIC_MARKS_RE = re.compile(r'[\u0640\u064B-\u065F\u0670]')
_WORD_RE = re.compile(r'\w+')


def arabic_word_tokenize(text):
    """Split *text* into word tokens, ignoring Arabic diacritics.

    Args:
        text: The string to tokenize. Any non-string value (for example a
            NaN read from an empty spreadsheet cell) is treated as having
            no words.

    Returns:
        A list of maximal ``\\w+`` runs found in *text* after Arabic
        diacritics and tatweel have been removed.
    """
    if not isinstance(text, str):
        return []
    return _WORD_RE.findall(_ARABIC_MARKS_RE.sub('', text))


def compute_word_overlap(query, questions):
    """Score each question by the share of query words it contains.

    Args:
        query: The user query string.
        questions: An iterable of candidate question strings.

    Returns:
        A list with one float per question, in order: the number of
        distinct query words that also occur in the question, divided by
        the number of distinct query words. Every score is ``0.0`` when
        the query contains no words.
    """
    query_words = set(arabic_word_tokenize(query))
    # The query is the same for every question, so decide the empty case
    # once instead of re-checking it inside the loop.
    if not query_words:
        return [0.0 for _ in questions]

    query_size = len(query_words)
    return [
        len(query_words & set(arabic_word_tokenize(question))) / query_size
        for question in questions
    ]
43
+
44
  def predict(text):
45
  if not text or text.strip() == "":
46
  return "No query provided"
 
70
  sim_scores1 = torch.stack(all_sim_scores1).mean(dim=0)
71
  sim_scores2 = torch.stack(all_sim_scores2).mean(dim=0)
72
  sim_scores3 = torch.stack(all_sim_scores3).mean(dim=0)
73
+
74
+ # Compute word overlap scores
75
+ word_overlap1 = compute_word_overlap(text, df_questions)
76
+
77
+ word_overlap2 = compute_word_overlap(text, df2_questions)
78
+ word_overlap3 = compute_word_overlap(text, df3_questions)
79
+
80
+ # Prepare combined results list
81
+ weight = 0.5 # word overlap weight
82
+ combined_results = []
83
+
84
+ for i, score in enumerate(sim_scores1.cpu().numpy()):
85
+ combined_score = float(score) + weight * word_overlap1[i]
86
+ combined_results.append({
87
+ "question": df_questions[i],
88
+ "link": df_links[i],
89
+ "cosine_score": float(score),
90
+ "word_overlap_score": float(word_overlap1[i]),
91
+ "combined_score": combined_score
92
+ })
93
+
94
+ for i, score in enumerate(sim_scores2.cpu().numpy()):
95
+ combined_score = float(score) + weight * word_overlap2[i]
96
+ combined_results.append({
97
+ "question": df2_questions[i],
98
+ "link": df2_links[i],
99
+ "cosine_score": float(score),
100
+ "word_overlap_score": float(word_overlap2[i]),
101
+ "combined_score": combined_score
102
+ })
103
+
104
+ for i, score in enumerate(sim_scores3.cpu().numpy()):
105
+ combined_score = float(score) + weight * word_overlap3[i]
106
+ combined_results.append({
107
+ "question": df3_questions[i],
108
+ "link": df3_links[i],
109
+ "cosine_score": float(score),
110
+ "word_overlap_score": float(word_overlap3[i]),
111
+ "combined_score": combined_score
112
+ })
113
+
114
+ # Get top 3 combined
115
+ top3_combined = sorted(combined_results, key=lambda x: x["combined_score"], reverse=True)[:3]
116
+
117
+ # Also keep your original top1/top2/top3 as is
118
  top3_scores1, top3_idx1 = sim_scores1.topk(3)
119
  top3_scores2, top3_idx2 = sim_scores2.topk(3)
120
  top3_scores3, top3_idx3 = sim_scores3.topk(3)
121
+
122
  top3_idx1_cpu = top3_idx1.cpu().numpy()
123
  top3_idx2_cpu = top3_idx2.cpu().numpy()
124
  top3_idx3_cpu = top3_idx3.cpu().numpy()
125
+
126
  top3_scores1_cpu = top3_scores1.cpu().numpy()
127
  top3_scores2_cpu = top3_scores2.cpu().numpy()
128
  top3_scores3_cpu = top3_scores3.cpu().numpy()
129
+
130
  results = {
131
+ "top1": [
132
+ {
133
+ "question": df_questions[idx],
134
+ "link": df_links[idx],
135
+ "score": float(score)
136
+ }
137
+ for idx, score in zip(top3_idx1_cpu, top3_scores1_cpu)
138
+ ],
139
  "top2": [
140
  {
141
  "question": df2_questions[idx],
142
+ "link": df2_links[idx],
143
  "score": float(score)
144
  }
145
  for idx, score in zip(top3_idx2_cpu, top3_scores2_cpu)
 
147
  "top3": [
148
  {
149
  "question": df3_questions[idx],
150
+ "link": df3_links[idx],
151
  "score": float(score)
152
  }
153
  for idx, score in zip(top3_idx3_cpu, top3_scores3_cpu)
154
  ],
155
+ "top3_combined": top3_combined
 
 
 
 
 
 
 
156
  }
157
+
158
  return results
159
 
 
160
  title = "Search CSV"
161
  iface = gr.Interface(
162
+ fn=predict,
163
  inputs=[gr.Textbox(label="text", lines=3)],
164
  outputs='json',
165
  title=title,
166
  )
167
+ iface.launch()