Spaces:
Running
Running
File size: 6,447 Bytes
1641ca7 fcadab7 1641ca7 e40de26 1641ca7 e40de26 1641ca7 8e3e50c 1641ca7 e40de26 1641ca7 e40de26 9e4b885 1641ca7 b6b04c7 1641ca7 2e553d1 30cf47b 2e553d1 72b5d84 9e4540a 9e4b885 9e4540a 9e4b885 c96f08a 2e553d1 c96f08a 30cf47b 2e553d1 30cf47b 2e553d1 30cf47b 2e553d1 30cf47b c96f08a 30cf47b 7725e6d 2e553d1 7725e6d c96f08a 0430419 9e4540a 0430419 9e4540a 9e4b885 eacf3db 9e4540a 0430419 9e4540a |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 |
# import gradio as gr
# import pandas as pd
# from sentence_transformers import SentenceTransformer, util
# # Load files
# df = pd.read_excel("IslamWeb_output.xlsx")
# df2 = pd.read_excel("JordanFatwas_all.xlsx")
# # Validate
# for d, name in [(df, "IslamWeb"), (df2, "JordanFatwas")]:
# if not {"question", "link"}.issubset(d.columns):
# raise ValueError(f"❌ Missing required columns in {name}")
# # Load model + encode
# model = SentenceTransformer("paraphrase-multilingual-MiniLM-L12-v2")
# embeddings = model.encode(df["question"].fillna('').tolist(), convert_to_tensor=True)
# embeddings2 = model.encode(df2["question"].fillna('').tolist(), convert_to_tensor=True)
# # Define function
# def search_fatwa(query):
# query_embedding = model.encode(query, convert_to_tensor=True)
# scores = util.pytorch_cos_sim(query_embedding, embeddings)[0]
# top_idx = int(scores.argmax())
# scores2 = util.pytorch_cos_sim(query_embedding, embeddings2)[0]
# top_idx2 = int(scores2.argmax())
# return {
# "question1": df.iloc[top_idx]["question"],
# "link1": df.iloc[top_idx]["link"],
# "question2": df2.iloc[top_idx2]["question"],
# "link2": df2.iloc[top_idx2]["link"],
# }
# # Interface
# iface = gr.Interface(
# fn=search_fatwa,
# inputs="text",
# outputs="json",
# allow_flagging="never",
# title="Fatwa Search (Dual Source)",
# description="Get the most relevant fatwas from both datasets"
# )
# iface.launch()
# import torch
# import pandas as pd
# from sentence_transformers import SentenceTransformer, util
# import gradio as gr
# model = SentenceTransformer("paraphrase-multilingual-MiniLM-L12-v2")
# df = pd.read_csv("cleaned1.csv")
# df2 = pd.read_csv("cleaned2.csv")
# embeddings = torch.load("embeddings1.pt")
# embeddings2 = torch.load("embeddings2.pt")
# # def search_fatwa(data):
# # query = data[0] if data else ""
# # query_embedding = model.encode(query, convert_to_tensor=True)
# # top_idx = int(util.pytorch_cos_sim(query_embedding, embeddings)[0].argmax())
# # top_idx2 = int(util.pytorch_cos_sim(query_embedding, embeddings2)[0].argmax())
# # return {
# # "question1": df.iloc[top_idx]["question"],
# # "link1": df.iloc[top_idx]["link"],
# # "question2": df2.iloc[top_idx2]["question"],
# # "link2": df2.iloc[top_idx2]["link"]
# # }
# def search_fatwa(data):
# query = data[0] if isinstance(data, list) else data
# if not query:
# return {"question1": "", "link1": "", "question2": "", "link2": ""}
# query_embedding = model.encode(query, convert_to_tensor=True)
# top_idx = int(util.pytorch_cos_sim(query_embedding, embeddings)[0].argmax())
# top_idx2 = int(util.pytorch_cos_sim(query_embedding, embeddings2)[0].argmax())
# # return {
# # "question1": df.iloc[top_idx]["question"],
# # "link1": df.iloc[top_idx]["link"],
# # "question2": df2.iloc[top_idx2]["question"],
# # "link2": df2.iloc[top_idx2]["link"]
# # }
# result = f"""Question 1: {df.iloc[top_idx]["question"]}
# Link 1: {df.iloc[top_idx]["link"]}
# Question 2: {df2.iloc[top_idx2]["question"]}
# Link 2: {df2.iloc[top_idx2]["link"]}"""
# return result
# iface = gr.Interface(
# fn=search_fatwa,
# inputs=[gr.Textbox(label="text", lines=3)],
# outputs="text" # Changed from "json" to "text"
# )
# # iface = gr.Interface(fn=search_fatwa, inputs=[gr.Textbox(label="text", lines=3)], outputs="json")
# # iface = gr.Interface(
# # fn=predict,
# # inputs=[gr.Textbox(label="text", lines=3)],
# # outputs='text',
# # title=title,
# # )
# iface.launch()
import torch
import pandas as pd
from sentence_transformers import SentenceTransformer, util
import gradio as gr
# Multilingual MiniLM encoder; must be the same model that produced the
# precomputed .pt embeddings loaded below — TODO confirm against the
# embedding-generation script.
model = SentenceTransformer("paraphrase-multilingual-MiniLM-L12-v2")
# Three fatwa datasets; each row pairs a question with its source link.
df = pd.read_csv("cleaned1.csv")
df2 = pd.read_csv("cleaned2.csv")
df3 = pd.read_csv("cleaned3.csv")
# Precomputed corpus embeddings, presumably row-aligned with the CSVs above
# (row i of embeddingsN corresponds to row i of the matching dataframe) —
# verify against whatever produced the .pt files.
embeddings = torch.load("embeddings1.pt")
embeddings2 = torch.load("embeddings2.pt")
embeddings3 = torch.load("embeddings3.pt")
# Pre-extract DataFrame columns to avoid repeated iloc calls
df_questions = df["question"].values
df_links = df["link"].values
df2_questions = df2["question"].values
df2_links = df2["link"].values
df3_questions = df3["question"].values
# NOTE(review): the third dataset stores its link under "url", not "link".
df3_links = df3["url"].values
def predict(text):
    """Return the top-3 most similar fatwas from each of the three datasets.

    Args:
        text: The user's query string from the Gradio textbox.

    Returns:
        A dict with keys "top1", "top2", "top3" — one list per dataset,
        each entry being {"question": str, "link": str, "score": float},
        ordered by descending cosine similarity. Returns the plain string
        "No query provided" when the query is empty or whitespace-only.
    """
    if not text or text.strip() == "":
        return "No query provided"
    query_embedding = model.encode(text, convert_to_tensor=True)
    # (result key, question column, link column, corpus embeddings) per
    # dataset — driving a single loop instead of three copy-pasted stanzas.
    sources = [
        ("top1", df_questions, df_links, embeddings),
        ("top2", df2_questions, df2_links, embeddings2),
        ("top3", df3_questions, df3_links, embeddings3),
    ]
    results = {}
    for key, questions, links, corpus in sources:
        sim_scores = util.pytorch_cos_sim(query_embedding, corpus)[0]
        # Guard against corpora with fewer than 3 rows; topk(k) with
        # k > len raises a RuntimeError.
        k = min(3, sim_scores.shape[0])
        top_scores, top_idx = sim_scores.topk(k)
        # Move to CPU once, then index the pre-extracted numpy arrays.
        results[key] = [
            {
                "question": questions[idx],
                "link": links[idx],
                "score": float(score),
            }
            for idx, score in zip(top_idx.cpu().numpy(), top_scores.cpu().numpy())
        ]
    return results
# Match the EXACT structure of your working translation app
title = "Search CSV"
# Gradio UI: one multiline textbox in, JSON viewer out (predict returns a
# dict of per-dataset top-3 matches, or a plain string for empty queries).
iface = gr.Interface(
    fn=predict, # Changed from search_fatwa to predict
    inputs=[gr.Textbox(label="text", lines=3)],
    outputs='json',
    title=title,
)
# Start the Gradio server (blocking call).
iface.launch()