Spaces:
Runtime error
Runtime error
Replaced tfidf
Browse files
app.py
CHANGED
@@ -1,36 +1,61 @@
|
|
1 |
import os
|
2 |
from sklearn.feature_extraction.text import TfidfVectorizer
|
3 |
from sklearn.metrics.pairwise import cosine_similarity
|
|
|
4 |
import gradio as gr
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
5 |
def find_closest(query):
|
6 |
-
|
7 |
files_names = []
|
|
|
8 |
|
9 |
for file in os.listdir():
|
10 |
if file.endswith(".txt"):
|
11 |
with open(file, 'r') as f:
|
12 |
content = f.read()
|
13 |
-
files_contents.append(content)
|
14 |
files_names.append(file)
|
|
|
|
|
15 |
|
16 |
-
#
|
17 |
-
|
18 |
-
|
19 |
-
# Initialize the TfidfVectorizer
|
20 |
-
tfidf_vectorizer = TfidfVectorizer()
|
21 |
-
|
22 |
-
# Fit and transform the texts
|
23 |
-
tfidf_matrix = tfidf_vectorizer.fit_transform(files_contents)
|
24 |
|
25 |
# Compute the cosine similarity between the query and all files
|
26 |
-
similarity_scores = cosine_similarity(
|
27 |
|
28 |
# Get the index of the file with the highest similarity score
|
29 |
max_similarity_idx = similarity_scores.argmax()
|
30 |
|
31 |
# Return the name of the file with the highest similarity score
|
32 |
return files_names[max_similarity_idx]
|
33 |
-
|
34 |
def find_closest_mp3(query):
|
35 |
closest_txt_file = find_closest(query)
|
36 |
file_name_without_extension, _ = os.path.splitext(closest_txt_file)
|
|
|
1 |
import os
|
2 |
from sklearn.feature_extraction.text import TfidfVectorizer
|
3 |
from sklearn.metrics.pairwise import cosine_similarity
|
4 |
+
import spacy
|
5 |
import gradio as gr
|
6 |
+
# def find_closest(query):
|
7 |
+
# files_contents = []
|
8 |
+
# files_names = []
|
9 |
+
|
10 |
+
# for file in os.listdir():
|
11 |
+
# if file.endswith(".txt"):
|
12 |
+
# with open(file, 'r') as f:
|
13 |
+
# content = f.read()
|
14 |
+
# files_contents.append(content)
|
15 |
+
# files_names.append(file)
|
16 |
+
|
17 |
+
# # Append query to the end
|
18 |
+
# files_contents.append(query)
|
19 |
+
|
20 |
+
# # Initialize the TfidfVectorizer
|
21 |
+
# tfidf_vectorizer = TfidfVectorizer()
|
22 |
+
|
23 |
+
# # Fit and transform the texts
|
24 |
+
# tfidf_matrix = tfidf_vectorizer.fit_transform(files_contents)
|
25 |
+
|
26 |
+
# # Compute the cosine similarity between the query and all files
|
27 |
+
# similarity_scores = cosine_similarity(tfidf_matrix[-1:], tfidf_matrix[:-1])
|
28 |
+
|
29 |
+
# # Get the index of the file with the highest similarity score
|
30 |
+
# max_similarity_idx = similarity_scores.argmax()
|
31 |
+
|
32 |
+
# # Return the name of the file with the highest similarity score
|
33 |
+
# return files_names[max_similarity_idx]
|
34 |
+
|
35 |
_NLP_MODEL = None  # lazily-loaded spaCy pipeline, shared by every find_closest() call


def _get_nlp():
    """Return the 'en_core_web_md' spaCy pipeline, loading it on first use only."""
    global _NLP_MODEL
    if _NLP_MODEL is None:
        # Loading the model is expensive; do it once instead of once per query.
        _NLP_MODEL = spacy.load('en_core_web_md')
    return _NLP_MODEL


def find_closest(query):
    """Return the name of the .txt file whose content is most similar to *query*.

    Every file in the current working directory whose name ends with ".txt"
    is read, converted to a spaCy document vector (``nlp(text).vector``), and
    compared with the query's vector using cosine similarity.

    Args:
        query: The text to match against the files' contents.

    Returns:
        The file name (as returned by ``os.listdir()``) with the highest
        cosine similarity to the query.

    Raises:
        ValueError: If the current directory contains no ".txt" files.
    """
    files_names = []
    files_contents = []

    for file in os.listdir():
        if file.endswith(".txt"):
            with open(file, 'r') as f:
                files_contents.append(f.read())
            files_names.append(file)

    # Fail early with a clear message: without this, an empty candidate list
    # only surfaces as an opaque array-validation error inside
    # cosine_similarity (and only after the model has been loaded).
    if not files_names:
        raise ValueError(
            "No .txt files found in the current directory to compare against"
        )

    nlp = _get_nlp()

    # Get the vector representation of each file's content
    files_vectors = [nlp(content).vector for content in files_contents]

    # Get the vector representation of the query
    query_vector = nlp(query).vector

    # Compute the cosine similarity between the query and all files;
    # the result has a single row (the query) and one column per file.
    similarity_scores = cosine_similarity([query_vector], files_vectors)

    # Get the index of the file with the highest similarity score
    max_similarity_idx = similarity_scores.argmax()

    # Return the name of the file with the highest similarity score
    return files_names[max_similarity_idx]
|
|
|
59 |
def find_closest_mp3(query):
|
60 |
closest_txt_file = find_closest(query)
|
61 |
file_name_without_extension, _ = os.path.splitext(closest_txt_file)
|