Semantic_Search_2

Runtime error

App Files Files Community

Fawaz committed on Aug 6, 2024

Commit

3240876

1 Parent(s): 193c1e4

Add application file

Browse files

Files changed (1) hide show

app.py +141 -0

app.py ADDED Viewed

	@@ -0,0 +1,141 @@

+# -*- coding: utf-8 -*-
+"""Task22.ipynb
+Automatically generated by Colab.
+Original file is located at
+    https://colab.research.google.com/drive/1yBvg6i_GsMk--P2nuSG-mfqCDbuIcEpx
+# Task 2
+- Raghad Al-Rasheed
+- Fawwaz Alsheikhi
+using the E5 model as the embedding model and translated dataset from huggingface
+"""
+!pip install sentence_transformers
+"""## Downloading the Embedding model"""
+from sentence_transformers import SentenceTransformer
+import nltk
+nltk.download('punkt')
+from nltk.tokenize import word_tokenize
+import math
+from scipy import spatial
+model = SentenceTransformer("intfloat/multilingual-e5-large").to('cuda')
+"""## Downloading Translated data from english to arabic"""
+!pip3 install datasets
+from datasets import load_dataset
+ds = load_dataset("Helsinki-NLP/news_commentary", "ar-en",split="train")
+import pandas as pd
+df = pd.DataFrame(ds['translation'])
+df['ar']
+df['ar'][0]
+"""### Extracting the first 10000 rows out of the data"""
+df=df.head(10000)
+df['ar'].shape
+documents =[doc for doc in df['ar']]
+documents[9999]
+"""## Embedding the sentences by rows"""
+embeddings = model.encode(documents)
+from sentence_transformers import SentenceTransformer
+import nltk
+nltk.download('punkt')
+from nltk.tokenize import word_tokenize
+import math
+from scipy import spatial
+import scipy
+def semantic_search(query, embeddings, documents):
+    query_embedding = model.encode(query)
+    document_embeddings = embeddings
+    scores = [scipy.spatial.distance.cosine(query_embedding, doc) for doc in document_embeddings]
+    ls1 = list()
+    for i, score in enumerate(scores):
+        ls1.append([documents[i],score])
+    print(scores.index(min(scores)))
+    most_similar_doc = documents[scores.index(min(scores))]
+    print("Most similar document", most_similar_doc)
+    return ls1
+output = semantic_search("ـ لم يكن من السهل قط أن ينخرط المرء في محادثة عقلانية حول قيمة الذهب.",embeddings, documents)
+documents[999]
+"""### Extracting top three related sentences"""
+ranked = sorted(output, key=lambda x: x[1])
+ranked[:3]
+df
+"""## using english with arabic to see the semantic search of multilangual model"""
+df['ar']
+df['en']
+df_ar = df['ar'].tolist()[:5000]
+df_en = df['en'].tolist()[:5000]
+combined_list = df_ar + df_en
+print(len(combined_list))
+embeddings1 = model.encode(combined_list)
+from sentence_transformers import SentenceTransformer
+import nltk
+nltk.download('punkt')
+from nltk.tokenize import word_tokenize
+import math
+from scipy import spatial
+import scipy
+def semantic_search(query, embeddings1, combined_list):
+    query_embedding = model.encode(query)
+    document_embeddings = embeddings1
+    scores = [scipy.spatial.distance.cosine(query_embedding, doc) for doc in document_embeddings]
+    ls1 = list()
+    for i, score in enumerate(scores):
+        ls1.append([combined_list[i],score])
+    print(scores.index(min(scores)))
+    most_similar_doc = combined_list[scores.index(min(scores))]
+    print("Most similar document", most_similar_doc)
+    return ls1
+output = semantic_search("لذهب بعشرة آلاف دولار؟",embeddings1, combined_list)
+ranked = sorted(output, key=lambda x: x[1])
+ranked[:3]
+import gradio as gr
+demo = gr.Interface(fn=semantic_search,inputs = ["text"], outputs=["text", "text", "text"])
+if __name__ == "__main__":
+    demo.launch()