Spaces:

mohbay
/

searchcsv2

Running

App Files Files Community

mohbay committed on Jul 1

Commit

762dded

verified ·

1 Parent(s): 7d96acd

Update app.py

Browse files

Files changed (1) hide show

app.py +79 -19

app.py CHANGED Viewed

@@ -2,6 +2,7 @@ import torch
 import pandas as pd
 from sentence_transformers import SentenceTransformer, util
 import gradio as gr
 model = SentenceTransformer("distilbert-base-multilingual-cased")
 modela = SentenceTransformer("paraphrase-multilingual-MiniLM-L12-v2")
@@ -24,6 +25,22 @@ df2_questions = df2["question"].values
 df2_links = df2["link"].values
 df3_questions = df3["question"].values
 df3_links = df3["url"].values
 def predict(text):
     if not text or text.strip() == "":
         return "No query provided"
@@ -53,25 +70,76 @@ def predict(text):
     sim_scores1 = torch.stack(all_sim_scores1).mean(dim=0)
     sim_scores2 = torch.stack(all_sim_scores2).mean(dim=0)
     sim_scores3 = torch.stack(all_sim_scores3).mean(dim=0)
-    # Get top 3 values and indices in one call
     top3_scores1, top3_idx1 = sim_scores1.topk(3)
     top3_scores2, top3_idx2 = sim_scores2.topk(3)
     top3_scores3, top3_idx3 = sim_scores3.topk(3)
-    # Convert to CPU once
     top3_idx1_cpu = top3_idx1.cpu().numpy()
     top3_idx2_cpu = top3_idx2.cpu().numpy()
     top3_idx3_cpu = top3_idx3.cpu().numpy()
     top3_scores1_cpu = top3_scores1.cpu().numpy()
     top3_scores2_cpu = top3_scores2.cpu().numpy()
     top3_scores3_cpu = top3_scores3.cpu().numpy()
-    # Prepare results using pre-extracted arrays
     results = {
         "top2": [
             {
                 "question": df2_questions[idx],
-                "link": df2_links[idx],
                 "score": float(score)
             }
             for idx, score in zip(top3_idx2_cpu, top3_scores2_cpu)
@@ -79,29 +147,21 @@ def predict(text):
         "top3": [
             {
                 "question": df3_questions[idx],
-                "link": df3_links[idx],
                 "score": float(score)
             }
             for idx, score in zip(top3_idx3_cpu, top3_scores3_cpu)
         ],
-        "top1": [
-            {
-                "question": df_questions[idx],
-                "link": df_links[idx],
-                "score": float(score)
-            }
-            for idx, score in zip(top3_idx1_cpu, top3_scores1_cpu)
-        ],
     }
     return results
-# Match the EXACT structure of your working translation app
 title = "Search CSV"
 iface = gr.Interface(
-    fn=predict,  # Changed from search_fatwa to predict
     inputs=[gr.Textbox(label="text", lines=3)],
     outputs='json',
     title=title,
 )
-iface.launch()

 import pandas as pd
 from sentence_transformers import SentenceTransformer, util
 import gradio as gr
+import re
 model = SentenceTransformer("distilbert-base-multilingual-cased")
 modela = SentenceTransformer("paraphrase-multilingual-MiniLM-L12-v2")
 df2_links = df2["link"].values
 df3_questions = df3["question"].values
 df3_links = df3["url"].values
# Arabic combining marks (tanween, fatha/damma/kasra, shadda, sukun,
# superscript alef and the Quranic annotation signs).  They are nonspacing
# marks, which `\w` does not match, so they must be removed before tokenising.
_ARABIC_DIACRITICS_RE = re.compile(
    "[\u0610-\u061A\u064B-\u065F\u0670\u06D6-\u06DC\u06DF-\u06E4"
    "\u06E7\u06E8\u06EA-\u06ED]"
)
_WORD_RE = re.compile(r'\w+')


def arabic_word_tokenize(text):
    """Split *text* into word tokens, ignoring Arabic diacritics.

    Tokens are maximal runs of Unicode word characters (``\\w+``).  Arabic
    diacritics are stripped first: left in place they would act as
    separators and break a vocalised word into single letters, so it could
    never match its unvocalised spelling.

    Args:
        text: The string to tokenise.

    Returns:
        A list of tokens in order of appearance; empty if *text* contains
        no word characters.

    Raises:
        TypeError: If *text* is not a string.
    """
    stripped = _ARABIC_DIACRITICS_RE.sub("", text)
    return _WORD_RE.findall(stripped)
def compute_word_overlap(query, questions):
    """Score each question by the share of query words it contains.

    The score for a question is ``|query_words & question_words| /
    |query_words|``, where both sides are the sets of tokens produced by
    ``arabic_word_tokenize``.

    Args:
        query: The search text.
        questions: Iterable of candidate question strings.  Entries that
            are not strings (e.g. the NaN pandas uses for a missing cell)
            cannot be tokenised and score 0.0.

    Returns:
        A list of floats in ``[0.0, 1.0]``, one per entry of *questions*
        and in the same order.  All zeros when *query* has no word tokens.
    """
    query_words = set(arabic_word_tokenize(query))
    if not query_words:
        # Nothing to match against; also avoids dividing by zero below.
        return [0.0 for _ in questions]

    query_size = len(query_words)
    overlaps = []
    for question in questions:
        if not isinstance(question, str):
            overlaps.append(0.0)
            continue
        shared = query_words.intersection(arabic_word_tokenize(question))
        overlaps.append(len(shared) / query_size)
    return overlaps
 def predict(text):
     if not text or text.strip() == "":
         return "No query provided"
     sim_scores1 = torch.stack(all_sim_scores1).mean(dim=0)
     sim_scores2 = torch.stack(all_sim_scores2).mean(dim=0)
     sim_scores3 = torch.stack(all_sim_scores3).mean(dim=0)
+    # Compute word overlap scores
+    word_overlap1 = compute_word_overlap(text, df_questions)
+    word_overlap2 = compute_word_overlap(text, df2_questions)
+    word_overlap3 = compute_word_overlap(text, df3_questions)
+    # Prepare combined results list
+    weight = 0.5  # word overlap weight
+    combined_results = []
+    for i, score in enumerate(sim_scores1.cpu().numpy()):
+        combined_score = float(score) + weight * word_overlap1[i]
+        combined_results.append({
+            "question": df_questions[i],
+            "link": df_links[i],
+            "cosine_score": float(score),
+            "word_overlap_score": float(word_overlap1[i]),
+            "combined_score": combined_score
+        })
+    for i, score in enumerate(sim_scores2.cpu().numpy()):
+        combined_score = float(score) + weight * word_overlap2[i]
+        combined_results.append({
+            "question": df2_questions[i],
+            "link": df2_links[i],
+            "cosine_score": float(score),
+            "word_overlap_score": float(word_overlap2[i]),
+            "combined_score": combined_score
+        })
+    for i, score in enumerate(sim_scores3.cpu().numpy()):
+        combined_score = float(score) + weight * word_overlap3[i]
+        combined_results.append({
+            "question": df3_questions[i],
+            "link": df3_links[i],
+            "cosine_score": float(score),
+            "word_overlap_score": float(word_overlap3[i]),
+            "combined_score": combined_score
+        })
+    # Get top 3 combined
+    top3_combined = sorted(combined_results, key=lambda x: x["combined_score"], reverse=True)[:3]
+    # Also keep your original top1/top2/top3 as is
     top3_scores1, top3_idx1 = sim_scores1.topk(3)
     top3_scores2, top3_idx2 = sim_scores2.topk(3)
     top3_scores3, top3_idx3 = sim_scores3.topk(3)
     top3_idx1_cpu = top3_idx1.cpu().numpy()
     top3_idx2_cpu = top3_idx2.cpu().numpy()
     top3_idx3_cpu = top3_idx3.cpu().numpy()
     top3_scores1_cpu = top3_scores1.cpu().numpy()
     top3_scores2_cpu = top3_scores2.cpu().numpy()
     top3_scores3_cpu = top3_scores3.cpu().numpy()
     results = {
+        "top1": [
+            {
+                "question": df_questions[idx],
+                "link": df_links[idx],
+                "score": float(score)
+            }
+            for idx, score in zip(top3_idx1_cpu, top3_scores1_cpu)
+        ],
         "top2": [
             {
                 "question": df2_questions[idx],
+                "link": df2_links[idx],
                 "score": float(score)
             }
             for idx, score in zip(top3_idx2_cpu, top3_scores2_cpu)
         "top3": [
             {
                 "question": df3_questions[idx],
+                "link": df3_links[idx],
                 "score": float(score)
             }
             for idx, score in zip(top3_idx3_cpu, top3_scores3_cpu)
         ],
+        "top3_combined": top3_combined
     }
     return results
 title = "Search CSV"
 iface = gr.Interface(
+    fn=predict,
     inputs=[gr.Textbox(label="text", lines=3)],
     outputs='json',
     title=title,
 )
+iface.launch()