Spaces:

mohbay
/

searchcsv2

Running

App Files Files Community

searchcsv2 / app.py

mohbay

Update app.py

2f4967b verified 3 months ago

raw

history blame

10.2 kB

	import torch
	import pandas as pd
	from sentence_transformers import SentenceTransformer, util
	import gradio as gr
	import re
	from rank_bm25 import BM25Okapi
	import numpy as np

	# Two multilingual sentence encoders. predict() averages the cosine
	# similarities they produce, so each needs its own set of corpus embeddings.
	model = SentenceTransformer("distilbert-base-multilingual-cased")
	modela = SentenceTransformer("paraphrase-multilingual-MiniLM-L12-v2")

	# The three question tables, read in order.
	df, df2, df3 = (
	    pd.read_csv(csv_path)
	    for csv_path in ("cleaned1.csv", "cleaned2.csv", "cleaned3.csv")
	)

	# Pre-computed corpus embeddings. The "*_1.pt" files are compared against
	# queries encoded with `model`, the plain "*.pt" files against `modela`.
	# NOTE(review): torch.load unpickles its input; these files are presumably
	# shipped with the app and trusted - confirm.
	embeddings, embeddings2, embeddings3 = (
	    torch.load(pt_path)
	    for pt_path in ("embeddings1_1.pt", "embeddings2_1.pt", "embeddings3_1.pt")
	)
	embeddingsa, embeddingsa2, embeddingsa3 = (
	    torch.load(pt_path)
	    for pt_path in ("embeddings1.pt", "embeddings2.pt", "embeddings3.pt")
	)

	# Question text and target link per row. The third table stores its link
	# in a "url" column rather than "link".
	df_questions, df_links = df["question"].values, df["link"].values
	df2_questions, df2_links = df2["question"].values, df2["link"].values
	df3_questions, df3_links = df3["question"].values, df3["url"].values

	# Words dropped from queries and corpus questions before lexical matching.
	# Mostly function words; 'حكم' and 'قال' are presumably listed because they
	# are too frequent in this corpus to discriminate - confirm.
	ARABIC_STOPWORDS = {
	    'في', 'من', 'إلى', 'عن', 'مع', 'هذا', 'هذه', 'ذلك', 'تلك',
	    'التي', 'الذي', 'ما', 'لا', 'أن', 'أو', 'لكن', 'قد', 'حكم', 'قال',
	    'كان', 'كانت', 'يكون', 'تكون', 'له', 'لها', 'لهم', 'و', 'أم', 'إن'
	}

	# Characters deleted before tokenizing: combining marks (harakat, honorific
	# and Quranic annotation signs, superscript alef) and the tatweel elongation
	# character, none of which change which word is meant.
	_ARABIC_MARKS_RE = re.compile(
	    r'[\u0610-\u061A\u0640\u064B-\u065F\u0670\u06D6-\u06DC\u06DF-\u06E8\u06EA-\u06ED]'
	)

	# A token is a run of two or more Arabic letters or digits. Punctuation that
	# lives in the same Unicode block (comma U+060C, semicolon U+061B, question
	# mark U+061F, full stop U+06D4, ...) is deliberately excluded so that it
	# separates words instead of being glued onto them.
	_ARABIC_TOKEN_RE = re.compile(
	    r'[\u0620-\u063F\u0641-\u064A\u0660-\u0669\u066E\u066F'
	    r'\u0671-\u06D3\u06D5\u06EE-\u06FC\u06FF]{2,}'
	)


	def arabic_word_tokenize(text):
	    """Split Arabic text into normalized word tokens.

	    Args:
	        text: The string to tokenize. Any non-string value (for example a
	            NaN read from a CSV) yields an empty list.

	    Returns:
	        A list of tokens in order of appearance. Each token has at least two
	        characters, carries no diacritics or tatweel, has no punctuation
	        attached, and is not in ``ARABIC_STOPWORDS``.
	    """
	    if not isinstance(text, str):
	        return []
	    stripped = _ARABIC_MARKS_RE.sub('', text)
	    return [
	        token
	        for token in _ARABIC_TOKEN_RE.findall(stripped)
	        if token not in ARABIC_STOPWORDS
	    ]

	def prepare_bm25_corpus(questions):
	    """Tokenize each question, returning one token list per input entry.

	    The result keeps the order and length of ``questions``; entries that
	    produce no tokens contribute an empty list.
	    """
	    return [arabic_word_tokenize(entry) for entry in questions]

	# Build one tokenized corpus and one BM25 index per dataset at import time,
	# so predict() only has to score queries.
	print("Initializing BM25 models...")
	bm25_corpus1, bm25_corpus2, bm25_corpus3 = (
	    prepare_bm25_corpus(question_column)
	    for question_column in (df_questions, df2_questions, df3_questions)
	)
	bm25_model1, bm25_model2, bm25_model3 = (
	    BM25Okapi(tokenized)
	    for tokenized in (bm25_corpus1, bm25_corpus2, bm25_corpus3)
	)
	print("BM25 models initialized!")

	def compute_bm25_scores(query, bm25_model):
	    """Score every indexed document against ``query`` with BM25.

	    Args:
	        query: Raw query text; it is tokenized with ``arabic_word_tokenize``.
	        bm25_model: A fitted ``rank_bm25`` model such as ``BM25Okapi``.

	    Returns:
	        A NumPy array with one score per indexed document. When the query
	        yields no tokens (no Arabic words, or only stopwords) every score
	        is zero.
	    """
	    query_tokens = arabic_word_tokenize(query)
	    if not query_tokens:
	        # rank_bm25 models do not keep the corpus itself; the number of
	        # indexed documents is exposed as ``corpus_size``.
	        return np.zeros(bm25_model.corpus_size)
	    return bm25_model.get_scores(query_tokens)

	def compute_word_overlap(query, questions):
	    """Score lexical overlap between ``query`` and each entry of ``questions``.

	    Each score is ``0.7 * coverage + 0.3 * jaccard``, where coverage is the
	    fraction of distinct query tokens found in the question and jaccard is
	    intersection over union of the two token sets.

	    Returns:
	        A list of floats, one per question. Every score is 0.0 when the
	        query has no tokens; a question with no tokens scores 0.0.
	    """
	    wanted = set(arabic_word_tokenize(query))
	    if not wanted:
	        return [0.0] * len(questions)

	    def score(candidate):
	        found = set(arabic_word_tokenize(candidate))
	        if not found:
	            return 0.0
	        shared = len(wanted & found)
	        combined = len(wanted | found)
	        jaccard = shared / combined if combined > 0 else 0.0
	        # Coverage is weighted higher: matching most of the query matters
	        # more than the question being short.
	        coverage = shared / len(wanted)
	        return 0.7 * coverage + 0.3 * jaccard

	    return [score(candidate) for candidate in questions]

	def normalize_scores(scores):
	    """Min-max scale ``scores`` into the range [0, 1].

	    Args:
	        scores: Any array-like of numbers.

	    Returns:
	        A floating-point NumPy array of the same shape. Empty input gives an
	        empty array, and input whose values are all equal gives all zeros,
	        because there is no spread to scale by. Floating-point input keeps
	        its dtype; other dtypes are converted to float.
	    """
	    values = np.asarray(scores)
	    if not np.issubdtype(values.dtype, np.floating):
	        # Keep the result type the same on every path, including the
	        # all-equal case that never reaches the division.
	        values = values.astype(float)
	    if values.size == 0:
	        # min()/max() raise on zero-size arrays.
	        return np.zeros_like(values)
	    lowest = values.min()
	    highest = values.max()
	    if highest == lowest:
	        return np.zeros_like(values)
	    return (values - lowest) / (highest - lowest)

	def predict(text):
	    """Search the three question datasets for entries matching ``text``.

	    Each dataset row gets three min-max normalized signals: the cosine
	    similarity averaged over both encoders, a BM25 score and a word-overlap
	    score. They are mixed with weights chosen from the number of query
	    tokens, and rows that do well on several signals get a small bonus.

	    Args:
	        text: The raw search query.

	    Returns:
	        The string "No query provided" for an empty or whitespace-only
	        query. Otherwise a dict with keys "top2", "top3", "top1" (lists of
	        up to five result dicts per dataset) and "query_info" (token count
	        and the weights used).
	    """
	    print(f"Received query: {text}")
	    if not text or text.strip() == "":
	        return "No query provided"

	    # Encode the query once with each model.
	    query_embedding = model.encode(text, convert_to_tensor=True)
	    query_embeddinga = modela.encode(text, convert_to_tensor=True)

	    # Fewer query tokens -> lean on lexical matching; more -> lean on
	    # semantic similarity.
	    query_length = len(arabic_word_tokenize(text))
	    if query_length > 5:
	        semantic_weight, bm25_weight, word_weight = 0.5, 0.3, 0.2
	    elif query_length > 2:
	        semantic_weight, bm25_weight, word_weight = 0.4, 0.35, 0.25
	    else:
	        semantic_weight, bm25_weight, word_weight = 0.3, 0.4, 0.3

	    def blended_similarity(corpus_emb, corpus_emb_a):
	        """Average both encoders' cosine similarities, then normalize."""
	        first = util.pytorch_cos_sim(query_embedding, corpus_emb)[0]
	        second = util.pytorch_cos_sim(query_embeddinga, corpus_emb_a)[0]
	        return normalize_scores(((first + second) / 2).cpu().numpy())

	    def score_rows(questions, links, norm_semantic, norm_bm25, norm_word):
	        """Build one result dict per row with its weighted, boosted score."""
	        rows = []
	        for idx, question in enumerate(questions):
	            semantic_score = float(norm_semantic[idx])
	            bm25_score = float(norm_bm25[idx])
	            word_score = float(norm_word[idx])

	            weighted = (semantic_weight * semantic_score +
	                        bm25_weight * bm25_score +
	                        word_weight * word_score)

	            # Count how many signals clear their "strong" threshold.
	            strong_signals = (int(semantic_score > 0.7)
	                              + int(bm25_score > 0.7)
	                              + int(word_score > 0.5))
	            if strong_signals >= 2:
	                bonus = 0.1
	            elif strong_signals >= 1:
	                bonus = 0.05
	            else:
	                bonus = 0.0

	            rows.append({
	                "question": question,
	                "link": links[idx],
	                "semantic_score": semantic_score,
	                "bm25_score": bm25_score,
	                "word_overlap_score": word_score,
	                "combined_score": weighted + bonus,
	            })
	        return rows

	    def pick_diverse(rows, top_k=5):
	        """Top three by combined score, plus the best unseen BM25 and
	        semantic rows, de-duplicated by question text."""
	        ranked = sorted(rows, key=lambda row: row["combined_score"], reverse=True)
	        chosen = ranked[:3]
	        seen = {row["question"] for row in chosen}

	        # BM25 pick first, then semantic; each must be a question not yet
	        # selected.
	        for metric in ("bm25_score", "semantic_score"):
	            ordered = sorted(rows, key=lambda row: row[metric], reverse=True)
	            extra = next(
	                (row for row in ordered if row["question"] not in seen),
	                None,
	            )
	            if extra is not None:
	                seen.add(extra["question"])
	                chosen.append(extra)

	        return chosen[:top_k]

	    datasets = {
	        "top1": (df_questions, df_links, embeddings, embeddingsa, bm25_model1),
	        "top2": (df2_questions, df2_links, embeddings2, embeddingsa2, bm25_model2),
	        "top3": (df3_questions, df3_links, embeddings3, embeddingsa3, bm25_model3),
	    }

	    tops = {}
	    for key, (questions, links, emb, emb_a, bm25_model) in datasets.items():
	        norm_sim = blended_similarity(emb, emb_a)
	        norm_bm25 = normalize_scores(compute_bm25_scores(text, bm25_model))
	        norm_word = normalize_scores(compute_word_overlap(text, questions))
	        tops[key] = pick_diverse(
	            score_rows(questions, links, norm_sim, norm_bm25, norm_word)
	        )

	    # Key order is part of the JSON output: top2, top3, then top1.
	    return {
	        "top2": tops["top2"],
	        "top3": tops["top3"],
	        "top1": tops["top1"],
	        "query_info": {
	            "query_length": query_length,
	            "weights": {
	                "semantic": semantic_weight,
	                "bm25": bm25_weight,
	                "word_overlap": word_weight,
	            },
	        },
	    }

	# Gradio front end: one multi-line textbox in, predict()'s result out as JSON.
	title = "Enhanced Search with BM25"
	iface = gr.Interface(
	    title=title,
	    description="Arabic text search using combined semantic similarity, BM25, and word overlap scoring",
	    fn=predict,
	    inputs=[gr.Textbox(label="Search Query", lines=3)],
	    outputs='json',
	)

	# Launch the interface only when this file is run as a script.
	if __name__ == "__main__":
	    iface.launch()