mohbay committed on
Commit
3dafe6c
·
verified ·
1 Parent(s): 52d7dfb

Update app.py

Files changed (1)
  1. app.py +400 -239
app.py CHANGED
@@ -1,286 +1,447 @@
- import torch
- import pandas as pd
- from sentence_transformers import SentenceTransformer, util
- import gradio as gr
- import re
- from rank_bm25 import BM25Okapi
- import numpy as np

- # Load models
- model = SentenceTransformer("distilbert-base-multilingual-cased")
- modela = SentenceTransformer("paraphrase-multilingual-MiniLM-L12-v2")

- # Load data
- df = pd.read_csv("cleaned1.csv")
- df2 = pd.read_csv("cleaned2.csv")
- df3 = pd.read_csv("cleaned3.csv")

- # Load pre-computed embeddings
- embeddings = torch.load("embeddings1_1.pt")
- embeddings2 = torch.load("embeddings2_1.pt")
- embeddings3 = torch.load("embeddings3_1.pt")

- embeddingsa = torch.load("embeddings1.pt")
- embeddingsa2 = torch.load("embeddings2.pt")
- embeddingsa3 = torch.load("embeddings3.pt")

- # Extract questions and links
- df_questions = df["question"].values
- df_links = df["link"].values
- df2_questions = df2["question"].values
- df2_links = df2["link"].values
- df3_questions = df3["question"].values
- df3_links = df3["url"].values

- ARABIC_STOPWORDS = {
-     'في', 'من', 'إلى', 'عن', 'مع', 'هذا', 'هذه', 'ذلك', 'تلك',
-     'التي', 'الذي', 'ما', 'لا', 'أن', 'أو', 'لكن', 'قد', 'حكم', 'قال',
-     'كان', 'كانت', 'يكون', 'تكون', 'له', 'لها', 'لهم', 'و', 'أم', 'إن'
- }

- def arabic_word_tokenize(text):
-     if not isinstance(text, str):
-         return []
-     # Remove diacritics
-     text = re.sub(r'[\u064B-\u065F\u0670]', '', text)
-     # Extract only Arabic words (length ≥ 2)
-     tokens = re.findall(r'[\u0600-\u06FF]{2,}', text)
-     return [t for t in tokens if t not in ARABIC_STOPWORDS]
-
- def prepare_bm25_corpus(questions):
-     """Prepare tokenized corpus for BM25"""
-     tokenized_corpus = []
-     for question in questions:
-         tokens = arabic_word_tokenize(question)
-         tokenized_corpus.append(tokens)
-     return tokenized_corpus
-
- # Initialize BM25 models for each dataset
- print("Initializing BM25 models...")
- bm25_corpus1 = prepare_bm25_corpus(df_questions)
- bm25_corpus2 = prepare_bm25_corpus(df2_questions)
- bm25_corpus3 = prepare_bm25_corpus(df3_questions)
-
- bm25_model1 = BM25Okapi(bm25_corpus1)
- bm25_model2 = BM25Okapi(bm25_corpus2)
- bm25_model3 = BM25Okapi(bm25_corpus3)
- print("BM25 models initialized!")
-
- def compute_bm25_scores(query, bm25_model):
-     """Compute BM25 scores for a query"""
-     query_tokens = arabic_word_tokenize(query)
-     if not query_tokens:
-         return np.zeros(len(bm25_model.corpus))
-
-     scores = bm25_model.get_scores(query_tokens)
-     return scores
-
- def compute_word_overlap(query, questions):
-     """Enhanced word overlap computation"""
-     query_words = set(arabic_word_tokenize(query))
-     if len(query_words) == 0:
-         return [0.0] * len(questions)
-
-     overlaps = []
-     for q in questions:
-         q_words = set(arabic_word_tokenize(q))
-         if len(q_words) == 0:
-             overlaps.append(0.0)
-             continue
-
-         # Use Jaccard similarity (intersection over union)
-         intersection = len(query_words & q_words)
-         union = len(query_words | q_words)
-         jaccard = intersection / union if union > 0 else 0.0
-
-         # Also compute coverage (how much of query is matched)
-         coverage = intersection / len(query_words)
-
-         # Combine both: prioritize coverage but consider similarity
-         overlap_score = 0.7 * coverage + 0.3 * jaccard
-         overlaps.append(overlap_score)
-
-     return overlaps
-
- def normalize_scores(scores):
-     """Normalize scores to 0-1 range"""
-     scores = np.array(scores)
-     if np.max(scores) == np.min(scores):
-         return np.zeros_like(scores)
-     return (scores - np.min(scores)) / (np.max(scores) - np.min(scores))
-
- def predict(text):
-     print(f"Received query: {text}")
-     if not text or text.strip() == "":
-         return "No query provided"
-
-     # Semantic similarity scores
-     query_embedding = model.encode(text, convert_to_tensor=True)
-     query_embeddinga = modela.encode(text, convert_to_tensor=True)
-
-     # Cosine similarities (averaged from two models)
-     sim_scores1 = (util.pytorch_cos_sim(query_embedding, embeddings)[0] +
-                    util.pytorch_cos_sim(query_embeddinga, embeddingsa)[0]) / 2
-     sim_scores2 = (util.pytorch_cos_sim(query_embedding, embeddings2)[0] +
-                    util.pytorch_cos_sim(query_embeddinga, embeddingsa2)[0]) / 2
-     sim_scores3 = (util.pytorch_cos_sim(query_embedding, embeddings3)[0] +
-                    util.pytorch_cos_sim(query_embeddinga, embeddingsa3)[0]) / 2
-
-     # BM25 scores
-     bm25_scores1 = compute_bm25_scores(text, bm25_model1)
-     bm25_scores2 = compute_bm25_scores(text, bm25_model2)
-     bm25_scores3 = compute_bm25_scores(text, bm25_model3)
-
-     # Word overlap scores
-     word_overlap1 = compute_word_overlap(text, df_questions)
-     word_overlap2 = compute_word_overlap(text, df2_questions)
-     word_overlap3 = compute_word_overlap(text, df3_questions)
-
-     # Normalize all scores for fair combination
-     norm_sim1 = normalize_scores(sim_scores1.cpu().numpy())
-     norm_sim2 = normalize_scores(sim_scores2.cpu().numpy())
-     norm_sim3 = normalize_scores(sim_scores3.cpu().numpy())
-
-     norm_bm25_1 = normalize_scores(bm25_scores1)
-     norm_bm25_2 = normalize_scores(bm25_scores2)
-     norm_bm25_3 = normalize_scores(bm25_scores3)
-
-     norm_word1 = normalize_scores(word_overlap1)
-     norm_word2 = normalize_scores(word_overlap2)
-     norm_word3 = normalize_scores(word_overlap3)
-
-     # Adaptive weighting based on query characteristics
-     query_words = arabic_word_tokenize(text)
-     query_length = len(query_words)
-
-     if query_length <= 2:
-         # Short queries: prioritize exact matches (BM25 + word overlap)
-         semantic_weight = 0.3
-         bm25_weight = 0.4
-         word_weight = 0.3
-     elif query_length <= 5:
-         # Medium queries: balanced approach
-         semantic_weight = 0.4
-         bm25_weight = 0.35
-         word_weight = 0.25
-     else:
-         # Long queries: prioritize semantic understanding
-         semantic_weight = 0.5
-         bm25_weight = 0.3
-         word_weight = 0.2
-
-     def create_combined_results(questions, links, norm_semantic, norm_bm25, norm_word):
-         combined_results = []
-
-         for i in range(len(questions)):
-             semantic_score = float(norm_semantic[i])
-             bm25_score = float(norm_bm25[i])
-             word_score = float(norm_word[i])
-
-             # Enhanced scoring with BM25
-             combined_score = (semantic_weight * semantic_score +
-                               bm25_weight * bm25_score +
-                               word_weight * word_score)
-
-             # Boost results that perform well across multiple metrics
-             high_performance_count = sum([
-                 semantic_score > 0.7,
-                 bm25_score > 0.7,
-                 word_score > 0.5
-             ])
-
-             if high_performance_count >= 2:
-                 boost = 0.1
-             elif high_performance_count >= 1:
-                 boost = 0.05
-             else:
-                 boost = 0.0
-
-             final_score = combined_score + boost
-
-             combined_results.append({
-                 "question": questions[i],
-                 "link": links[i],
-                 "semantic_score": semantic_score,
-                 "bm25_score": bm25_score,
-                 "word_overlap_score": word_score,
-                 "combined_score": final_score
-             })
-
-         return combined_results
-
-     # Create combined results for all datasets
-     combined1 = create_combined_results(df_questions, df_links, norm_sim1, norm_bm25_1, norm_word1)
-     combined2 = create_combined_results(df2_questions, df2_links, norm_sim2, norm_bm25_2, norm_word2)
-     combined3 = create_combined_results(df3_questions, df3_links, norm_sim3, norm_bm25_3, norm_word3)
-
-     def get_diverse_top_results(combined_results, top_k=5):
-         """Get diverse top results using multiple ranking strategies"""
-         # Sort by combined score and get top candidates
-         by_combined = sorted(combined_results, key=lambda x: x["combined_score"], reverse=True)
-         top_combined = by_combined[:3]
-
-         # Get questions from top combined to avoid duplicates
-         used_questions = {item["question"] for item in top_combined}
-
-         # Add best BM25 result not already included
-         by_bm25 = sorted(combined_results, key=lambda x: x["bm25_score"], reverse=True)
-         bm25_pick = None
-         for item in by_bm25:
-             if item["question"] not in used_questions:
-                 bm25_pick = item
-                 break
-
-         # Add best semantic result not already included
-         by_semantic = sorted(combined_results, key=lambda x: x["semantic_score"], reverse=True)
-         semantic_pick = None
-         if bm25_pick:
-             used_questions.add(bm25_pick["question"])
-
-         for item in by_semantic:
-             if item["question"] not in used_questions:
-                 semantic_pick = item
-                 break
-
-         # Combine results
-         final_results = top_combined.copy()
-         if bm25_pick:
-             final_results.append(bm25_pick)
-         if semantic_pick:
-             final_results.append(semantic_pick)
-
-         return final_results[:top_k]
-
-     # Get top results for each dataset
-     top1 = get_diverse_top_results(combined1)
-     top2 = get_diverse_top_results(combined2)
-     top3 = get_diverse_top_results(combined3)
-
-     results = {
-         "top2": top2,
-         "top3": top3,
-         "top1": top1,
          "query_info": {
-             "query_length": query_length,
-             "weights": {
-                 "semantic": semantic_weight,
-                 "bm25": bm25_weight,
-                 "word_overlap": word_weight
-             }
          }
      }

-     return results
-
- title = "Enhanced Search with BM25"
  iface = gr.Interface(
      fn=predict,
      inputs=[gr.Textbox(label="Search Query", lines=3)],
-     outputs='json',
      title=title,
-     description="Arabic text search using combined semantic similarity, BM25, and word overlap scoring"
  )

  if __name__ == "__main__":
-     iface.launch()
 
+ # import torch
+ # ... (the rest of the previous implementation, commented out verbatim; it duplicates the removed lines shown above) ...
+ # iface.launch()
+
+ import torch
+ import pandas as pd
+ from sentence_transformers import SentenceTransformer, util
+ import gradio as gr
+ import re
+ import numpy as np
+ import math
+ from collections import Counter
+
+ # Load both models
+ model1 = SentenceTransformer("distilbert-base-multilingual-cased")
+ model2 = SentenceTransformer("paraphrase-multilingual-MiniLM-L12-v2")
+
+ # Load data
+ print("Loading data and embeddings...")
+ df = pd.read_csv("cleaned1.csv")
+ df2 = pd.read_csv("cleaned2.csv")
+ df3 = pd.read_csv("cleaned3.csv")
+
+ embeddings1 = torch.load("embeddings1_1.pt")
+ embeddings2 = torch.load("embeddings2_1.pt")
+ embeddings3 = torch.load("embeddings3_1.pt")
+
+ embeddings1a = torch.load("embeddings1.pt")
+ embeddings2a = torch.load("embeddings2.pt")
+ embeddings3a = torch.load("embeddings3.pt")
+
+ # Arabic stopwords
+ ARABIC_STOPWORDS = {
+     'في', 'من', 'إلى', 'عن', 'مع', 'هذا', 'هذه', 'ذلك', 'تلك',
+     'التي', 'الذي', 'ما', 'لا', 'أن', 'أو', 'لكن', 'قد', 'حكم', 'قال',
+     'كان', 'كانت', 'يكون', 'تكون', 'له', 'لها', 'لهم', 'و', 'أم', 'إن',
+     'رضي', 'عليها', 'عنهم', 'عنه', 'عليهم', 'صلى', 'وسلم',
+     'سلام', 'عليه', 'الرسول', 'النبي', 'السلام', 'حديث', 'احاديث'
+ }
+
+ def arabic_word_tokenize(text):
+     if not isinstance(text, str):
+         return []
+     text = re.sub(r'[\u064B-\u065F\u0670]', '', text)  # strip diacritics
+     return [t for t in re.findall(r'[\u0600-\u06FF]{2,}', text) if t not in ARABIC_STOPWORDS]
+
+ # Pre-tokenize questions and compute document frequencies
+ def setup_tokenization_and_freqs(questions):
+     tokenized = [arabic_word_tokenize(q) for q in questions]
+     doc_freqs = Counter(word for doc in tokenized for word in set(doc))
+     return tokenized, doc_freqs
+
+ tokenized1, doc_freqs1 = setup_tokenization_and_freqs(df["question"].values)
+ tokenized2, doc_freqs2 = setup_tokenization_and_freqs(df2["question"].values)
+ tokenized3, doc_freqs3 = setup_tokenization_and_freqs(df3["question"].values)
+
+ def compute_word_overlap(query, questions):
+     q_words = set(arabic_word_tokenize(query))
+     scores = []
+     for doc in questions:
+         d_words = set(arabic_word_tokenize(doc))
+         if not d_words or not q_words:
+             scores.append(0.0)
+             continue
+         inter = len(q_words & d_words)
+         union = len(q_words | d_words)
+         jaccard = inter / union if union else 0.0
+         coverage = inter / len(q_words)
+         scores.append(0.7 * coverage + 0.3 * jaccard)
+     return scores
+
+ def lightweight_bm25_score(query_tokens, doc_tokens, doc_freqs, total_docs, k1=1.2, b=0.75):
+     score = 0.0
+     doc_len = len(doc_tokens)
+     avg_doc_len = 10
+     for term in query_tokens:
+         if term in doc_tokens:
+             tf = doc_tokens.count(term)
+             df = doc_freqs.get(term, 0)
+             if df > 0:
+                 idf = math.log((total_docs - df + 0.5) / (df + 0.5))
+                 score += idf * (tf * (k1 + 1)) / (tf + k1 * (1 - b + b * (doc_len / avg_doc_len)))
+     return score
+
+ def normalize_scores(scores):
+     arr = np.array(scores)
+     if arr.max() == arr.min():
+         return np.zeros_like(arr)
+     return (arr - arr.min()) / (arr.max() - arr.min())
+
+ def combine_scores(query, questions, tokenized, doc_freqs, emb1, emb2):
+     total_docs = len(questions)
+     q_emb1 = model1.encode(query, convert_to_tensor=True)
+     q_emb2 = model2.encode(query, convert_to_tensor=True)
+
+     sim1 = util.pytorch_cos_sim(q_emb1, emb1)[0]
+     sim2 = util.pytorch_cos_sim(q_emb2, emb2)[0]
+     sim_scores = ((sim1 + sim2) / 2).cpu().numpy()
+
+     query_tokens = arabic_word_tokenize(query)  # tokenize once, not per document
+     bm25_scores = [lightweight_bm25_score(query_tokens, doc_tokens, doc_freqs, total_docs)
+                    for doc_tokens in tokenized]
+     word_scores = compute_word_overlap(query, questions)
+
+     norm_bm25 = normalize_scores(bm25_scores)
+     norm_word = normalize_scores(word_scores)
+     norm_sim = normalize_scores(sim_scores)
+
+     query_len = len(query_tokens)
+     if query_len <= 2:
+         w_sem, w_bm, w_word = 0.3, 0.4, 0.3
+     elif query_len <= 5:
+         w_sem, w_bm, w_word = 0.4, 0.35, 0.25
+     else:
+         w_sem, w_bm, w_word = 0.5, 0.3, 0.2
+
+     results = []
+     for i, q in enumerate(questions):
+         sem, bm, word = float(norm_sim[i]), float(norm_bm25[i]), float(norm_word[i])
+         combined = w_sem * sem + w_bm * bm + w_word * word
+         strong = sum([sem > 0.7, bm > 0.7, word > 0.5])
+         boost = 0.1 if strong >= 2 else (0.05 if strong == 1 else 0.0)
+         results.append({
+             "question": q,
+             "semantic_score": sem,
+             "bm25_score": bm,
+             "word_overlap_score": word,
+             "combined_score": combined + boost
+         })
+     return results
+
+ def get_top_diverse(results, links, top_k=5):
+     results = [dict(r, link=links[i]) for i, r in enumerate(results)]
+     top_combined = sorted(results, key=lambda x: x['combined_score'], reverse=True)[:3]
+     used_q = {r['question'] for r in top_combined}
+     top_bm = next((r for r in sorted(results, key=lambda x: x['bm25_score'], reverse=True)
+                    if r['question'] not in used_q), None)
+     if top_bm:
+         used_q.add(top_bm['question'])
+     top_sem = next((r for r in sorted(results, key=lambda x: x['semantic_score'], reverse=True)
+                     if r['question'] not in used_q), None)
+     final = top_combined + ([top_bm] if top_bm else []) + ([top_sem] if top_sem else [])
+     return final[:top_k]
+
+ def predict(query):
+     print(f"Query: {query}")
+     results1 = combine_scores(query, df["question"].values, tokenized1, doc_freqs1, embeddings1, embeddings1a)
+     results2 = combine_scores(query, df2["question"].values, tokenized2, doc_freqs2, embeddings2, embeddings2a)
+     results3 = combine_scores(query, df3["question"].values, tokenized3, doc_freqs3, embeddings3, embeddings3a)
+
+     return {
+         "top2": get_top_diverse(results2, df2["link"].values),
+         "top3": get_top_diverse(results3, df3["url"].values),
+         "top1": get_top_diverse(results1, df["link"].values),
          "query_info": {
+             "query_length": len(arabic_word_tokenize(query))
          }
      }

+ title = "Arabic Search: Dual-Model + BM25 + Overlap"
  iface = gr.Interface(
      fn=predict,
      inputs=[gr.Textbox(label="Search Query", lines=3)],
+     outputs="json",
      title=title,
+     description="Accurate Arabic search using two semantic models, fast BM25, and word overlap."
  )

  if __name__ == "__main__":
+     iface.launch()
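
A quick check of what the shared tokenizer keeps. This is a minimal standalone sketch: the stopword set below is abbreviated (the committed ARABIC_STOPWORDS is much larger and would also drop 'حكم'), and the sample sentence is invented.

import re

ARABIC_STOPWORDS = {'في', 'من', 'ما'}  # abbreviated for the example

def arabic_word_tokenize(text):
    if not isinstance(text, str):
        return []
    text = re.sub(r'[\u064B-\u065F\u0670]', '', text)  # strip diacritics
    # keep runs of two or more Arabic-block characters that are not stopwords
    return [t for t in re.findall(r'[\u0600-\u06FF]{2,}', text) if t not in ARABIC_STOPWORDS]

print(arabic_word_tokenize("ما حكم الصلاة في السفر"))  # ['حكم', 'الصلاة', 'السفر']

One quirk worth knowing: Arabic punctuation such as '؟' (U+061F) also falls inside the U+0600-U+06FF range, so a trailing question mark gets glued onto the final token rather than stripped.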
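The main functional change is dropping rank_bm25's BM25Okapi in favor of the hand-rolled lightweight_bm25_score. Here is a standalone sketch of that scorer on a tiny invented corpus (English tokens to keep it compact; avg_doc_len is hardcoded to 10 exactly as in the commit, where classic BM25 would use the corpus mean).

import math
from collections import Counter

def lightweight_bm25_score(query_tokens, doc_tokens, doc_freqs, total_docs, k1=1.2, b=0.75):
    score = 0.0
    doc_len = len(doc_tokens)
    avg_doc_len = 10  # hardcoded, as in the commit
    for term in query_tokens:
        if term in doc_tokens:
            tf = doc_tokens.count(term)
            df = doc_freqs.get(term, 0)
            if df > 0:
                idf = math.log((total_docs - df + 0.5) / (df + 0.5))
                score += idf * (tf * (k1 + 1)) / (tf + k1 * (1 - b + b * (doc_len / avg_doc_len)))
    return score

corpus = [["apple", "pie"], ["banana", "bread"], ["car", "engine", "oil"], ["grape", "jam"]]
doc_freqs = Counter(w for doc in corpus for w in set(doc))
print([round(lightweight_bm25_score(["apple"], d, doc_freqs, len(corpus)), 3) for d in corpus])
# [1.259, 0.0, 0.0, 0.0]: only the document containing "apple" scores

Note one behavioral difference: this IDF goes negative for terms that appear in more than half the documents, whereas BM25Okapi floors negative IDFs via its epsilon parameter.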
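Finally, a worked example of the fusion step: each signal is min-max normalized per query, combined with the length-dependent weights, and nudged by the agreement boost. The three candidate rows and their raw scores are invented.

import numpy as np

def normalize_scores(scores):
    arr = np.array(scores, dtype=float)
    if arr.max() == arr.min():
        return np.zeros_like(arr)
    return (arr - arr.min()) / (arr.max() - arr.min())

sem  = normalize_scores([0.82, 0.40, 0.75])  # averaged cosine similarities (invented)
bm   = normalize_scores([5.1, 0.0, 4.8])     # lightweight BM25 scores (invented)
word = normalize_scores([0.90, 0.10, 0.20])  # coverage/Jaccard blend (invented)

w_sem, w_bm, w_word = 0.4, 0.35, 0.25        # the commit's weights for 3-5 token queries
for i in range(3):
    combined = w_sem * sem[i] + w_bm * bm[i] + w_word * word[i]
    strong = sum([sem[i] > 0.7, bm[i] > 0.7, word[i] > 0.5])
    boost = 0.1 if strong >= 2 else (0.05 if strong == 1 else 0.0)
    print(f"doc{i}: combined={combined:.3f}, boost={boost}")
# doc0: combined=1.000, boost=0.1  (strong on all three signals)
# doc1: combined=0.000, boost=0.0
# doc2: combined=0.694, boost=0.1  (strong on semantic and BM25)

Since normalization is per query and per dataset, at least one candidate always hits 1.0 on each signal (unless all scores tie), so the boost fires readily near the top of each ranking.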