mohbay committed on
Commit
23cd5e3
·
verified ·
1 Parent(s): 8a11400

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +44 -93
app.py CHANGED
@@ -18,7 +18,6 @@ embeddingsa = torch.load("embeddings1.pt")
18
  embeddingsa2 = torch.load("embeddings2.pt")
19
  embeddingsa3 = torch.load("embeddings3.pt")
20
 
21
- # Pre-extract DataFrame columns to avoid repeated iloc calls
22
  df_questions = df["question"].values
23
  df_links = df["link"].values
24
  df2_questions = df2["question"].values
@@ -31,7 +30,6 @@ def arabic_word_tokenize(text):
31
  return []
32
  return re.findall(r'\w+', text)
33
 
34
-
35
  def compute_word_overlap(query, questions):
36
  query_words = set(arabic_word_tokenize(query))
37
  overlaps = []
@@ -47,115 +45,68 @@ def compute_word_overlap(query, questions):
47
  def predict(text):
48
  if not text or text.strip() == "":
49
  return "No query provided"
50
-
51
  query_embedding = model.encode(text, convert_to_tensor=True)
52
  query_embeddinga = modela.encode(text, convert_to_tensor=True)
53
- all_sim_scores1 = []
54
- all_sim_scores2 = []
55
- all_sim_scores3 = []
56
- # Compute similarity scores
57
- sim_scores1 = util.pytorch_cos_sim(query_embedding, embeddings)[0]
58
- sim_scores2 = util.pytorch_cos_sim(query_embedding, embeddings2)[0]
59
- sim_scores3 = util.pytorch_cos_sim(query_embedding, embeddings3)[0]
60
-
61
- all_sim_scores1.append(sim_scores1)
62
- all_sim_scores2.append(sim_scores2)
63
- all_sim_scores3.append(sim_scores3)
64
-
65
- sim_scores1a = util.pytorch_cos_sim(query_embeddinga, embeddingsa)[0]
66
- sim_scores2a = util.pytorch_cos_sim(query_embeddinga, embeddingsa2)[0]
67
- sim_scores3a = util.pytorch_cos_sim(query_embeddinga, embeddingsa3)[0]
68
-
69
- all_sim_scores1.append(sim_scores1a)
70
- all_sim_scores2.append(sim_scores2a)
71
- all_sim_scores3.append(sim_scores3a)
72
-
73
- sim_scores1 = torch.stack(all_sim_scores1).mean(dim=0)
74
- sim_scores2 = torch.stack(all_sim_scores2).mean(dim=0)
75
- sim_scores3 = torch.stack(all_sim_scores3).mean(dim=0)
76
-
77
- # Compute word overlap scores
78
  word_overlap1 = compute_word_overlap(text, df_questions)
79
-
80
  word_overlap2 = compute_word_overlap(text, df2_questions)
81
  word_overlap3 = compute_word_overlap(text, df3_questions)
82
 
83
- # Prepare combined results list
84
- weight = 0.5 # word overlap weight
85
- combined_results = []
86
 
87
- for i, score in enumerate(sim_scores1.cpu().numpy()):
88
- combined_score = float(score) + weight * word_overlap1[i]
89
- combined_results.append({
90
  "question": df_questions[i],
91
  "link": df_links[i],
92
- "cosine_score": float(score),
93
  "word_overlap_score": float(word_overlap1[i]),
94
- "combined_score": combined_score
95
- })
96
-
97
- for i, score in enumerate(sim_scores2.cpu().numpy()):
98
- combined_score = float(score) + weight * word_overlap2[i]
99
- combined_results.append({
 
 
 
100
  "question": df2_questions[i],
101
  "link": df2_links[i],
102
- "cosine_score": float(score),
103
  "word_overlap_score": float(word_overlap2[i]),
104
- "combined_score": combined_score
105
- })
106
-
107
- for i, score in enumerate(sim_scores3.cpu().numpy()):
108
- combined_score = float(score) + weight * word_overlap3[i]
109
- combined_results.append({
 
 
 
110
  "question": df3_questions[i],
111
  "link": df3_links[i],
112
- "cosine_score": float(score),
113
  "word_overlap_score": float(word_overlap3[i]),
114
- "combined_score": combined_score
115
- })
116
-
117
- # Get top 3 combined
118
- top3_combined = sorted(combined_results, key=lambda x: x["combined_score"], reverse=True)[:3]
119
-
120
- # Also keep your original top1/top2/top3 as is
121
- top3_scores1, top3_idx1 = sim_scores1.topk(3)
122
- top3_scores2, top3_idx2 = sim_scores2.topk(3)
123
- top3_scores3, top3_idx3 = sim_scores3.topk(3)
124
-
125
- top3_idx1_cpu = top3_idx1.cpu().numpy()
126
- top3_idx2_cpu = top3_idx2.cpu().numpy()
127
- top3_idx3_cpu = top3_idx3.cpu().numpy()
128
-
129
- top3_scores1_cpu = top3_scores1.cpu().numpy()
130
- top3_scores2_cpu = top3_scores2.cpu().numpy()
131
- top3_scores3_cpu = top3_scores3.cpu().numpy()
132
 
133
  results = {
134
- "top1": [
135
- {
136
- "question": df_questions[idx],
137
- "link": df_links[idx],
138
- "score": float(score)
139
- }
140
- for idx, score in zip(top3_idx1_cpu, top3_scores1_cpu)
141
- ],
142
- "top2": [
143
- {
144
- "question": df2_questions[idx],
145
- "link": df2_links[idx],
146
- "score": float(score)
147
- }
148
- for idx, score in zip(top3_idx2_cpu, top3_scores2_cpu)
149
- ],
150
- "top3": [
151
- {
152
- "question": df3_questions[idx],
153
- "link": df3_links[idx],
154
- "score": float(score)
155
- }
156
- for idx, score in zip(top3_idx3_cpu, top3_scores3_cpu)
157
- ],
158
- "top3_combined": top3_combined
159
  }
160
 
161
  return results
 
18
  embeddingsa2 = torch.load("embeddings2.pt")
19
  embeddingsa3 = torch.load("embeddings3.pt")
20
 
 
21
  df_questions = df["question"].values
22
  df_links = df["link"].values
23
  df2_questions = df2["question"].values
 
30
  return []
31
  return re.findall(r'\w+', text)
32
 
 
33
  def compute_word_overlap(query, questions):
34
  query_words = set(arabic_word_tokenize(query))
35
  overlaps = []
 
45
  def predict(text):
46
  if not text or text.strip() == "":
47
  return "No query provided"
48
+
49
  query_embedding = model.encode(text, convert_to_tensor=True)
50
  query_embeddinga = modela.encode(text, convert_to_tensor=True)
51
+
52
+ # Cosine similarities
53
+ sim_scores1 = (util.pytorch_cos_sim(query_embedding, embeddings)[0] +
54
+ util.pytorch_cos_sim(query_embeddinga, embeddingsa)[0]) / 2
55
+ sim_scores2 = (util.pytorch_cos_sim(query_embedding, embeddings2)[0] +
56
+ util.pytorch_cos_sim(query_embeddinga, embeddingsa2)[0]) / 2
57
+ sim_scores3 = (util.pytorch_cos_sim(query_embedding, embeddings3)[0] +
58
+ util.pytorch_cos_sim(query_embeddinga, embeddingsa3)[0]) / 2
59
+
60
+ # Word overlaps
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
61
  word_overlap1 = compute_word_overlap(text, df_questions)
 
62
  word_overlap2 = compute_word_overlap(text, df2_questions)
63
  word_overlap3 = compute_word_overlap(text, df3_questions)
64
 
65
+ weight = 0.4
 
 
66
 
67
+ # Collect top1
68
+ combined1 = [
69
+ {
70
  "question": df_questions[i],
71
  "link": df_links[i],
72
+ "cosine_score": float(sim_scores1[i].cpu().item()),
73
  "word_overlap_score": float(word_overlap1[i]),
74
+ "combined_score": float(sim_scores1[i].cpu().item()) + weight * word_overlap1[i]
75
+ }
76
+ for i in range(len(df_questions))
77
+ ]
78
+ top1 = sorted(combined1, key=lambda x: x["combined_score"], reverse=True)[:3]
79
+
80
+ # Collect top2
81
+ combined2 = [
82
+ {
83
  "question": df2_questions[i],
84
  "link": df2_links[i],
85
+ "cosine_score": float(sim_scores2[i].cpu().item()),
86
  "word_overlap_score": float(word_overlap2[i]),
87
+ "combined_score": float(sim_scores2[i].cpu().item()) + weight * word_overlap2[i]
88
+ }
89
+ for i in range(len(df2_questions))
90
+ ]
91
+ top2 = sorted(combined2, key=lambda x: x["combined_score"], reverse=True)[:3]
92
+
93
+ # Collect top3
94
+ combined3 = [
95
+ {
96
  "question": df3_questions[i],
97
  "link": df3_links[i],
98
+ "cosine_score": float(sim_scores3[i].cpu().item()),
99
  "word_overlap_score": float(word_overlap3[i]),
100
+ "combined_score": float(sim_scores3[i].cpu().item()) + weight * word_overlap3[i]
101
+ }
102
+ for i in range(len(df3_questions))
103
+ ]
104
+ top3 = sorted(combined3, key=lambda x: x["combined_score"], reverse=True)[:3]
 
 
 
 
 
 
 
 
 
 
 
 
 
105
 
106
  results = {
107
+ "top1": top1,
108
+ "top2": top2,
109
+ "top3": top3
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
110
  }
111
 
112
  return results