File size: 6,447 Bytes
1641ca7
 
 
fcadab7
1641ca7
 
 
e40de26
1641ca7
 
 
 
e40de26
1641ca7
 
 
 
8e3e50c
1641ca7
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
e40de26
1641ca7
e40de26
 
9e4b885
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1641ca7
 
 
 
 
 
 
 
b6b04c7
 
1641ca7
 
2e553d1
30cf47b
 
 
 
 
2e553d1
72b5d84
9e4540a
 
9e4b885
 
9e4540a
9e4b885
c96f08a
 
 
2e553d1
c96f08a
30cf47b
 
 
2e553d1
30cf47b
 
 
2e553d1
 
30cf47b
 
2e553d1
30cf47b
c96f08a
30cf47b
 
 
 
 
 
 
 
 
 
 
 
 
 
 
7725e6d
2e553d1
 
 
 
 
 
 
7725e6d
c96f08a
 
 
0430419
9e4540a
 
0430419
9e4540a
9e4b885
eacf3db
9e4540a
0430419
9e4540a
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
# import gradio as gr
# import pandas as pd
# from sentence_transformers import SentenceTransformer, util

# # Load files
# df = pd.read_excel("IslamWeb_output.xlsx")
# df2 = pd.read_excel("JordanFatwas_all.xlsx")

# # Validate
# for d, name in [(df, "IslamWeb"), (df2, "JordanFatwas")]:
#     if not {"question", "link"}.issubset(d.columns):
#         raise ValueError(f"❌ Missing required columns in {name}")

# # Load model + encode
# model = SentenceTransformer("paraphrase-multilingual-MiniLM-L12-v2")
# embeddings = model.encode(df["question"].fillna('').tolist(), convert_to_tensor=True)
# embeddings2 = model.encode(df2["question"].fillna('').tolist(), convert_to_tensor=True)

# # Define function
# def search_fatwa(query):
#     query_embedding = model.encode(query, convert_to_tensor=True)

#     scores = util.pytorch_cos_sim(query_embedding, embeddings)[0]
#     top_idx = int(scores.argmax())

#     scores2 = util.pytorch_cos_sim(query_embedding, embeddings2)[0]
#     top_idx2 = int(scores2.argmax())

#     return {
#         "question1": df.iloc[top_idx]["question"],
#         "link1": df.iloc[top_idx]["link"],
#         "question2": df2.iloc[top_idx2]["question"],
#         "link2": df2.iloc[top_idx2]["link"],
#     }

# # Interface
# iface = gr.Interface(
#     fn=search_fatwa,
#     inputs="text",
#     outputs="json",
#     allow_flagging="never",
#     title="Fatwa Search (Dual Source)",
#     description="Get the most relevant fatwas from both datasets"
# )

# iface.launch()


# import torch
# import pandas as pd
# from sentence_transformers import SentenceTransformer, util
# import gradio as gr

# model = SentenceTransformer("paraphrase-multilingual-MiniLM-L12-v2")
# df = pd.read_csv("cleaned1.csv")
# df2 = pd.read_csv("cleaned2.csv")
# embeddings = torch.load("embeddings1.pt")
# embeddings2 = torch.load("embeddings2.pt")

# # def search_fatwa(data):
# #     query = data[0] if data else ""
# #     query_embedding = model.encode(query, convert_to_tensor=True)
# #     top_idx = int(util.pytorch_cos_sim(query_embedding, embeddings)[0].argmax())
# #     top_idx2 = int(util.pytorch_cos_sim(query_embedding, embeddings2)[0].argmax())
# #     return {
# #         "question1": df.iloc[top_idx]["question"],
# #         "link1": df.iloc[top_idx]["link"],
# #         "question2": df2.iloc[top_idx2]["question"],
# #         "link2": df2.iloc[top_idx2]["link"]
# #     }

# def search_fatwa(data):
#     query = data[0] if isinstance(data, list) else data
#     if not query:
#         return {"question1": "", "link1": "", "question2": "", "link2": ""}
#     query_embedding = model.encode(query, convert_to_tensor=True)
#     top_idx = int(util.pytorch_cos_sim(query_embedding, embeddings)[0].argmax())
#     top_idx2 = int(util.pytorch_cos_sim(query_embedding, embeddings2)[0].argmax())
#     # return {
#     #     "question1": df.iloc[top_idx]["question"],
#     #     "link1": df.iloc[top_idx]["link"],
#     #     "question2": df2.iloc[top_idx2]["question"],
#     #     "link2": df2.iloc[top_idx2]["link"]
#     # }
#     result = f"""Question 1: {df.iloc[top_idx]["question"]}
#         Link 1: {df.iloc[top_idx]["link"]}
        
#         Question 2: {df2.iloc[top_idx2]["question"]}
#         Link 2: {df2.iloc[top_idx2]["link"]}"""
#     return result

# iface = gr.Interface(
#     fn=search_fatwa, 
#     inputs=[gr.Textbox(label="text", lines=3)], 
#     outputs="text"  # Changed from "json" to "text"
# )

# # iface = gr.Interface(fn=search_fatwa, inputs=[gr.Textbox(label="text", lines=3)], outputs="json")




# # iface = gr.Interface(
# #   fn=predict, 
# #   inputs=[gr.Textbox(label="text", lines=3)],
# #   outputs='text',
# #   title=title,
# # )

# iface.launch()


import torch
import pandas as pd
from sentence_transformers import SentenceTransformer, util
import gradio as gr

# Multilingual sentence-embedding model; chosen so non-English (e.g. Arabic)
# queries embed into the same space as the stored questions.
model = SentenceTransformer("paraphrase-multilingual-MiniLM-L12-v2")
# Three pre-cleaned fatwa datasets. Each must provide a "question" column;
# the link column is "link" for the first two and "url" for the third
# (see the column extraction below).
df = pd.read_csv("cleaned1.csv")
df2 = pd.read_csv("cleaned2.csv")
df3 = pd.read_csv("cleaned3.csv")

# Precomputed question embeddings, one tensor per dataset, row-aligned with
# the corresponding DataFrame.
# NOTE(review): torch.load unpickles arbitrary objects — only load embedding
# files produced by this project, never untrusted downloads.
embeddings = torch.load("embeddings1.pt")
embeddings2 = torch.load("embeddings2.pt")
embeddings3 = torch.load("embeddings3.pt")
# Pre-extract DataFrame columns to avoid repeated iloc calls
df_questions = df["question"].values
df_links = df["link"].values
df2_questions = df2["question"].values
df2_links = df2["link"].values
df3_questions = df3["question"].values
df3_links = df3["url"].values
def predict(text):
    """Return the top-3 most similar stored questions from each dataset.

    Args:
        text: The user's free-text search query.

    Returns:
        A dict with keys "top1", "top2", "top3" — one per source dataset —
        each mapping to a list of three {"question", "link", "score"}
        entries ordered by descending cosine similarity; or the string
        "No query provided" when the input is empty/blank.
    """
    # Guard clause: reject empty or whitespace-only queries up front.
    if not text or not text.strip():
        return "No query provided"

    query_vec = model.encode(text, convert_to_tensor=True)

    # Describe each dataset as (output key, corpus embeddings,
    # question array, link array) so the three lookups share one loop.
    sources = (
        ("top1", embeddings, df_questions, df_links),
        ("top2", embeddings2, df2_questions, df2_links),
        ("top3", embeddings3, df3_questions, df3_links),
    )

    results = {}
    for key, corpus, questions, links in sources:
        scores = util.pytorch_cos_sim(query_vec, corpus)[0]
        # topk yields values and indices together in one call.
        best_scores, best_idx = scores.topk(3)
        # Transfer to CPU once before indexing the numpy column arrays.
        idx_cpu = best_idx.cpu().numpy()
        score_cpu = best_scores.cpu().numpy()
        results[key] = [
            {
                "question": questions[i],
                "link": links[i],
                "score": float(s),
            }
            for i, s in zip(idx_cpu, score_cpu)
        ]

    return results

# Match the EXACT structure of your working translation app
title = "Search CSV"
# Gradio UI: a 3-line textbox feeds predict(); results render as JSON.
iface = gr.Interface(
    fn=predict,  # Changed from search_fatwa to predict 
    inputs=[gr.Textbox(label="text", lines=3)],
    outputs='json',
    title=title,
)
# Starts the web server and blocks until interrupted.
iface.launch()