Spaces:

mohbay
/

searchcsv2

Running

App Files Files Community

mohbay committed on Jul 5

Commit

892da5a

verified ·

1 Parent(s): 6c11a17

Update app.py

Browse files

Files changed (1) hide show

app.py +217 -41

app.py CHANGED Viewed

@@ -3,13 +3,19 @@ import pandas as pd
 from sentence_transformers import SentenceTransformer, util
 import gradio as gr
 import re
 model = SentenceTransformer("distilbert-base-multilingual-cased")
 modela = SentenceTransformer("paraphrase-multilingual-MiniLM-L12-v2")
 df = pd.read_csv("cleaned1.csv")
 df2 = pd.read_csv("cleaned2.csv")
 df3 = pd.read_csv("cleaned3.csv")
 embeddings = torch.load("embeddings1_1.pt")
 embeddings2 = torch.load("embeddings2_1.pt")
 embeddings3 = torch.load("embeddings3_1.pt")
@@ -18,6 +24,7 @@ embeddingsa = torch.load("embeddings1.pt")
 embeddingsa2 = torch.load("embeddings2.pt")
 embeddingsa3 = torch.load("embeddings3.pt")
 df_questions = df["question"].values
 df_links = df["link"].values
 df2_questions = df2["question"].values
@@ -26,30 +33,105 @@ df3_questions = df3["question"].values
 df3_links = df3["url"].values
 def arabic_word_tokenize(text):
     if not isinstance(text, str):
         return []
-    return re.findall(r'\w+', text)
-def compute_word_overlap(query, questions):
     query_words = set(arabic_word_tokenize(query))
     overlaps = []
     for q in questions:
         q_words = set(arabic_word_tokenize(q))
-        if len(query_words) > 0:
-            overlap_score = len(query_words & q_words) / len(query_words)
-        else:
-            overlap_score = 0.0
-        overlaps.append(overlap_score)
     return overlaps
 def predict(text):
     if not text or text.strip() == "":
         return "No query provided"
     query_embedding = model.encode(text, convert_to_tensor=True)
     query_embeddinga = modela.encode(text, convert_to_tensor=True)
-    # Cosine similarities
     sim_scores1 = (util.pytorch_cos_sim(query_embedding, embeddings)[0] +
                    util.pytorch_cos_sim(query_embeddinga, embeddingsa)[0]) / 2
     sim_scores2 = (util.pytorch_cos_sim(query_embedding, embeddings2)[0] +
@@ -57,65 +139,159 @@ def predict(text):
     sim_scores3 = (util.pytorch_cos_sim(query_embedding, embeddings3)[0] +
                    util.pytorch_cos_sim(query_embeddinga, embeddingsa3)[0]) / 2
-    # Word overlaps
-    word_overlap1 = compute_word_overlap(text, df_questions)
-    word_overlap2 = compute_word_overlap(text, df2_questions)
-    word_overlap3 = compute_word_overlap(text, df3_questions)
-    weight = 0.4
-    # Collect top1
-    combined1 = [
-        {
             "question": df_questions[i],
             "link": df_links[i],
             "cosine_score": float(sim_scores1[i].cpu().item()),
             "word_overlap_score": float(word_overlap1[i]),
-            "combined_score": float(sim_scores1[i].cpu().item()) + weight * word_overlap1[i]
-        }
-        for i in range(len(df_questions))
-    ]
-    top1 = sorted(combined1, key=lambda x: x["combined_score"], reverse=True)[:3]
-    # Collect top2
-    combined2 = [
-        {
             "question": df2_questions[i],
             "link": df2_links[i],
             "cosine_score": float(sim_scores2[i].cpu().item()),
             "word_overlap_score": float(word_overlap2[i]),
-            "combined_score": float(sim_scores2[i].cpu().item()) + weight * word_overlap2[i]
-        }
-        for i in range(len(df2_questions))
-    ]
-    top2 = sorted(combined2, key=lambda x: x["combined_score"], reverse=True)[:3]
-    # Collect top3
-    combined3 = [
-        {
             "question": df3_questions[i],
             "link": df3_links[i],
             "cosine_score": float(sim_scores3[i].cpu().item()),
             "word_overlap_score": float(word_overlap3[i]),
-            "combined_score": float(sim_scores3[i].cpu().item()) + weight * word_overlap3[i]
-        }
-        for i in range(len(df3_questions))
-    ]
-    top3 = sorted(combined3, key=lambda x: x["combined_score"], reverse=True)[:3]
     results = {
         "top2": top2,
         "top3": top3,
-        "top1": top1
     }
     return results
-title = "Search CSV"
 iface = gr.Interface(
     fn=predict,
-    inputs=[gr.Textbox(label="text", lines=3)],
     outputs='json',
     title=title,
 )
-iface.launch()

 from sentence_transformers import SentenceTransformer, util
 import gradio as gr
 import re
+import numpy as np
+from collections import Counter
+# Load models
 model = SentenceTransformer("distilbert-base-multilingual-cased")
 modela = SentenceTransformer("paraphrase-multilingual-MiniLM-L12-v2")
+# Load data
 df = pd.read_csv("cleaned1.csv")
 df2 = pd.read_csv("cleaned2.csv")
 df3 = pd.read_csv("cleaned3.csv")
+# Load embeddings
 embeddings = torch.load("embeddings1_1.pt")
 embeddings2 = torch.load("embeddings2_1.pt")
 embeddings3 = torch.load("embeddings3_1.pt")
 embeddingsa2 = torch.load("embeddings2.pt")
 embeddingsa3 = torch.load("embeddings3.pt")
+# Extract data arrays
 df_questions = df["question"].values
 df_links = df["link"].values
 df2_questions = df2["question"].values
 df3_links = df3["url"].values
 def arabic_word_tokenize(text):
+    """Improved tokenization with better handling of Arabic text"""
     if not isinstance(text, str):
         return []
+    # Remove diacritics and normalize
+    text = re.sub(r'[\u064B-\u065F\u0670\u06D6-\u06ED]', '', text)
+    # Extract words (Arabic, English, and numbers)
+    words = re.findall(r'[\u0600-\u06FF\u0750-\u077F\w]+', text.lower())
+    return words
def compute_enhanced_word_overlap(query, questions):
    """Score lexical overlap between *query* and each entry of *questions*.

    Each score blends two set-based measures over the unique tokens produced
    by ``arabic_word_tokenize``: Jaccard similarity (weight 0.6) and the
    fraction of query tokens found in the question (weight 0.4).

    Args:
        query: The search text.
        questions: A sized iterable of candidate question texts.

    Returns:
        A list with one float per question. Every score is 0.0 when the query
        has no tokens; a question with no tokens scores 0.0.
    """
    query_vocab = set(arabic_word_tokenize(query))
    if not query_vocab:
        return [0.0] * len(questions)

    query_size = len(query_vocab)
    scores = []
    for question in questions:
        question_vocab = set(arabic_word_tokenize(question))
        if not question_vocab:
            scores.append(0.0)
            continue
        shared = len(query_vocab & question_vocab)
        # Both sets are non-empty here, so the union is never empty.
        jaccard = shared / len(query_vocab | question_vocab)
        coverage = shared / query_size
        scores.append(0.6 * jaccard + 0.4 * coverage)
    return scores
def compute_fuzzy_matches(query, questions):
    """Score substring-level matches between *query* and each question.

    A query token counts as matched when some question token contains it or
    is contained in it, provided both tokens are at least 3 characters long.
    Tokens come from ``arabic_word_tokenize`` and duplicates in the query are
    counted individually.

    Args:
        query: The search text.
        questions: A sized iterable of candidate question texts.

    Returns:
        A list with one float per question: matched query tokens divided by
        the total number of query tokens. Every score is 0.0 when the query
        has no tokens; a question with no tokens scores 0.0.
    """
    query_tokens = arabic_word_tokenize(query)
    if not query_tokens:
        return [0.0] * len(questions)

    total = len(query_tokens)
    scores = []
    for question in questions:
        doc_tokens = arabic_word_tokenize(question)
        if not doc_tokens:
            scores.append(0.0)
            continue
        # any() stops at the first partner token, so each query token
        # contributes at most one hit.
        hits = sum(
            1
            for token in query_tokens
            if any(
                len(token) >= 3
                and len(candidate) >= 3
                and (token in candidate or candidate in token)
                for candidate in doc_tokens
            )
        )
        scores.append(hits / total)
    return scores
def compute_length_penalty(query, questions):
    """Compute a length-similarity factor for each question versus *query*.

    Lengths are token counts from ``arabic_word_tokenize``. The factor is the
    shorter length divided by the longer one, halved for questions with fewer
    than 3 tokens.

    Args:
        query: The search text.
        questions: An iterable of candidate question texts.

    Returns:
        A list with one factor per question; a question with no tokens
        gets 0.0.
    """
    query_size = len(arabic_word_tokenize(query))

    def _factor(question):
        size = len(arabic_word_tokenize(question))
        if size == 0:
            return 0.0
        # Closer token counts give a ratio nearer to 1.
        ratio = min(query_size, size) / max(query_size, size)
        # Very short questions are additionally down-weighted.
        return ratio * 0.5 if size < 3 else ratio

    return [_factor(question) for question in questions]
def normalize_scores(scores):
    """Min-max scale *scores* into the 0-1 range.

    Args:
        scores: A sequence (or array) of numeric scores.

    Returns:
        A float NumPy array of the same length. When the input is empty it is
        returned as an empty array, and when all values are equal they are
        returned unscaled, since no range exists to scale by.
    """
    values = np.array(scores, dtype=float)
    # max()/min() raise ValueError on a zero-size array, so bail out early.
    if values.size == 0:
        return values
    lowest = values.min()
    span = values.max() - lowest
    if span == 0:
        # Constant input: avoid dividing by zero and keep the values as given.
        return values
    return (values - lowest) / span
 def predict(text):
     if not text or text.strip() == "":
         return "No query provided"
+    # Encode query with both models
     query_embedding = model.encode(text, convert_to_tensor=True)
     query_embeddinga = modela.encode(text, convert_to_tensor=True)
+    # Compute semantic similarities
     sim_scores1 = (util.pytorch_cos_sim(query_embedding, embeddings)[0] +
                    util.pytorch_cos_sim(query_embeddinga, embeddingsa)[0]) / 2
     sim_scores2 = (util.pytorch_cos_sim(query_embedding, embeddings2)[0] +
     sim_scores3 = (util.pytorch_cos_sim(query_embedding, embeddings3)[0] +
                    util.pytorch_cos_sim(query_embeddinga, embeddingsa3)[0]) / 2
+    # Compute enhanced word overlaps
+    word_overlap1 = compute_enhanced_word_overlap(text, df_questions)
+    word_overlap2 = compute_enhanced_word_overlap(text, df2_questions)
+    word_overlap3 = compute_enhanced_word_overlap(text, df3_questions)
+    # Compute fuzzy matches
+    fuzzy_scores1 = compute_fuzzy_matches(text, df_questions)
+    fuzzy_scores2 = compute_fuzzy_matches(text, df2_questions)
+    fuzzy_scores3 = compute_fuzzy_matches(text, df3_questions)
+    # Compute length penalties
+    length_penalties1 = compute_length_penalty(text, df_questions)
+    length_penalties2 = compute_length_penalty(text, df2_questions)
+    length_penalties3 = compute_length_penalty(text, df3_questions)
+    # Normalize all scores
+    sem_scores1 = normalize_scores([float(x.cpu().item()) for x in sim_scores1])
+    sem_scores2 = normalize_scores([float(x.cpu().item()) for x in sim_scores2])
+    sem_scores3 = normalize_scores([float(x.cpu().item()) for x in sim_scores3])
+    word_scores1 = normalize_scores(word_overlap1)
+    word_scores2 = normalize_scores(word_overlap2)
+    word_scores3 = normalize_scores(word_overlap3)
+    fuzzy_scores1_norm = normalize_scores(fuzzy_scores1)
+    fuzzy_scores2_norm = normalize_scores(fuzzy_scores2)
+    fuzzy_scores3_norm = normalize_scores(fuzzy_scores3)
+    # Adaptive weights based on query characteristics
+    query_words = arabic_word_tokenize(text)
+    if len(query_words) <= 2:
+        # Short queries: prioritize exact matches
+        semantic_weight = 0.3
+        word_weight = 0.5
+        fuzzy_weight = 0.2
+    elif len(query_words) <= 5:
+        # Medium queries: balanced approach
+        semantic_weight = 0.4
+        word_weight = 0.4
+        fuzzy_weight = 0.2
+    else:
+        # Long queries: prioritize semantic similarity
+        semantic_weight = 0.5
+        word_weight = 0.3
+        fuzzy_weight = 0.2
+    # Collect results for dataset 1
+    combined1 = []
+    for i in range(len(df_questions)):
+        combined_score = (
+            semantic_weight * sem_scores1[i] +
+            word_weight * word_scores1[i] +
+            fuzzy_weight * fuzzy_scores1_norm[i]
+        ) * length_penalties1[i]
+        combined1.append({
             "question": df_questions[i],
             "link": df_links[i],
             "cosine_score": float(sim_scores1[i].cpu().item()),
             "word_overlap_score": float(word_overlap1[i]),
+            "fuzzy_score": float(fuzzy_scores1[i]),
+            "length_penalty": float(length_penalties1[i]),
+            "combined_score": float(combined_score)
+        })
+    # Collect results for dataset 2
+    combined2 = []
+    for i in range(len(df2_questions)):
+        combined_score = (
+            semantic_weight * sem_scores2[i] +
+            word_weight * word_scores2[i] +
+            fuzzy_weight * fuzzy_scores2_norm[i]
+        ) * length_penalties2[i]
+        combined2.append({
             "question": df2_questions[i],
             "link": df2_links[i],
             "cosine_score": float(sim_scores2[i].cpu().item()),
             "word_overlap_score": float(word_overlap2[i]),
+            "fuzzy_score": float(fuzzy_scores2[i]),
+            "length_penalty": float(length_penalties2[i]),
+            "combined_score": float(combined_score)
+        })
+    # Collect results for dataset 3
+    combined3 = []
+    for i in range(len(df3_questions)):
+        combined_score = (
+            semantic_weight * sem_scores3[i] +
+            word_weight * word_scores3[i] +
+            fuzzy_weight * fuzzy_scores3_norm[i]
+        ) * length_penalties3[i]
+        combined3.append({
             "question": df3_questions[i],
             "link": df3_links[i],
             "cosine_score": float(sim_scores3[i].cpu().item()),
             "word_overlap_score": float(word_overlap3[i]),
+            "fuzzy_score": float(fuzzy_scores3[i]),
+            "length_penalty": float(length_penalties3[i]),
+            "combined_score": float(combined_score)
+        })
+    # Get top results with diversity filtering
+    def get_diverse_top_results(results, top_k=5):
+        """Get top results while avoiding too similar ones"""
+        sorted_results = sorted(results, key=lambda x: x["combined_score"], reverse=True)
+        diverse_results = []
+        for result in sorted_results:
+            if len(diverse_results) >= top_k:
+                break
+            # Check if this result is too similar to already selected ones
+            is_diverse = True
+            for selected in diverse_results:
+                # Simple diversity check based on word overlap
+                overlap = compute_enhanced_word_overlap(result["question"], [selected["question"]])[0]
+                if overlap > 0.8:  # Too similar
+                    is_diverse = False
+                    break
+            if is_diverse:
+                diverse_results.append(result)
+        return diverse_results
+    top1 = get_diverse_top_results(combined1, 3)
+    top2 = get_diverse_top_results(combined2, 3)
+    top3 = get_diverse_top_results(combined3, 3)
     results = {
+        "top1": top1,
         "top2": top2,
         "top3": top3,
+        "query_analysis": {
+            "word_count": len(query_words),
+            "semantic_weight": semantic_weight,
+            "word_weight": word_weight,
+            "fuzzy_weight": fuzzy_weight
+        }
     }
     return results
+title = "Enhanced Search CSV"
 iface = gr.Interface(
     fn=predict,
+    inputs=[gr.Textbox(label="Search Query", lines=3, placeholder="Enter your search query here...")],
     outputs='json',
     title=title,
+    description="Enhanced semantic search with improved matching algorithms"
 )
+if __name__ == "__main__":
+    iface.launch()