Spaces:

aksj
/

Dreamland-GenAI-Music

Runtime error

App Files Files Community

aksj committed on May 29, 2023

Commit

064ae46

1 Parent(s): cf4d87a

Replaced tfidf

Browse files

Files changed (1) hide show

app.py +37 -12

app.py CHANGED Viewed

@@ -1,36 +1,61 @@
 import os
 from sklearn.feature_extraction.text import TfidfVectorizer
 from sklearn.metrics.pairwise import cosine_similarity
 import gradio as gr
 def find_closest(query):
     # Pre-commit (TF-IDF) version: returns the name of the ".txt" file in the
     # current working directory whose content scores the highest cosine
     # similarity against `query`.
-    files_contents = []
     files_names = []
     # os.listdir() with no argument scans the current working directory.
     for file in os.listdir():
         if file.endswith(".txt"):
             with open(file, 'r') as f:
                 content = f.read()
-                files_contents.append(content)
                 files_names.append(file)
     # files_contents and files_names are appended in lockstep, so position i
     # in one list refers to the same file as position i in the other.
-    # Append query to the end
-    files_contents.append(query)
-    # Initialize the TfidfVectorizer
-    tfidf_vectorizer = TfidfVectorizer()
-    # Fit and transform the texts
-    tfidf_matrix = tfidf_vectorizer.fit_transform(files_contents)
     # Compute the cosine similarity between the query and all files
     # (the query was appended last, so row -1 is the query and rows :-1 are
     # the files, in files_names order).
-    similarity_scores = cosine_similarity(tfidf_matrix[-1:], tfidf_matrix[:-1])
     # Get the index of the file with the highest similarity score
     max_similarity_idx = similarity_scores.argmax()
     # Return the name of the file with the highest similarity score
     return files_names[max_similarity_idx]
 def find_closest_mp3(query):
     closest_txt_file = find_closest(query)
     file_name_without_extension, _ = os.path.splitext(closest_txt_file)

 import os
 from sklearn.feature_extraction.text import TfidfVectorizer
 from sklearn.metrics.pairwise import cosine_similarity
+import spacy
 import gradio as gr
+# def find_closest(query):
+#     files_contents = []
+#     files_names = []
+#     for file in os.listdir():
+#         if file.endswith(".txt"):
+#             with open(file, 'r') as f:
+#                 content = f.read()
+#                 files_contents.append(content)
+#                 files_names.append(file)
+#     # Append query to the end
+#     files_contents.append(query)
+#     # Initialize the TfidfVectorizer
+#     tfidf_vectorizer = TfidfVectorizer()
+#     # Fit and transform the texts
+#     tfidf_matrix = tfidf_vectorizer.fit_transform(files_contents)
+#     # Compute the cosine similarity between the query and all files
+#     similarity_scores = cosine_similarity(tfidf_matrix[-1:], tfidf_matrix[:-1])
+#     # Get the index of the file with the highest similarity score
+#     max_similarity_idx = similarity_scores.argmax()
+#     # Return the name of the file with the highest similarity score
+#     return files_names[max_similarity_idx]
 def find_closest(query):
     """Return the name of the ``.txt`` file most similar to *query*.

     Every ``.txt`` file in the current working directory is embedded with
     the spaCy ``en_core_web_md`` pipeline (``Doc.vector``), the query is
     embedded the same way, and the file whose vector has the highest
     cosine similarity to the query vector wins.

     Args:
         query: Free-text description to match against the files.

     Returns:
         The file name (including the ``.txt`` extension) of the best match.

     Raises:
         ValueError: If the working directory contains no ``.txt`` file.
     """
     # Collect candidates first so an empty directory fails fast, before the
     # (expensive) model load, and with a message that names the real problem
     # instead of handing an empty list to cosine_similarity.
     files_names = [name for name in os.listdir() if name.endswith(".txt")]
     if not files_names:
         raise ValueError("no .txt files found in the current working directory")
     # Loading the model on every query is slow; keep the loaded pipeline on
     # the function object so it is loaded at most once per process.
     # NOTE(review): spacy.load presumably fails if 'en_core_web_md' is not
     # installed in the environment - confirm the deployment installs it.
     nlp = getattr(find_closest, "_nlp", None)
     if nlp is None:
         nlp = find_closest._nlp = spacy.load('en_core_web_md')
     files_vectors = []
     for file in files_names:
         # Explicit encoding: do not depend on the host's locale default.
         with open(file, 'r', encoding='utf-8') as f:
             content = f.read()
         # Get the vector representation of the content
         files_vectors.append(nlp(content).vector)
     # Get the vector representation of the query
     query_vector = nlp(query).vector
     # One query row against N file rows gives a 1 x N score matrix, so the
     # flat argmax is directly an index into files_names.
     similarity_scores = cosine_similarity([query_vector], files_vectors)
     return files_names[similarity_scores.argmax()]
 def find_closest_mp3(query):
     closest_txt_file = find_closest(query)
     file_name_without_extension, _ = os.path.splitext(closest_txt_file)