aksj committed on
Commit
064ae46
·
1 Parent(s): cf4d87a

Replaced tfidf

Browse files
Files changed (1) hide show
  1. app.py +37 -12
app.py CHANGED
@@ -1,36 +1,61 @@
1
  import os
2
  from sklearn.feature_extraction.text import TfidfVectorizer
3
  from sklearn.metrics.pairwise import cosine_similarity
 
4
  import gradio as gr
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
5
  def find_closest(query):
6
- files_contents = []
7
  files_names = []
 
8
 
9
  for file in os.listdir():
10
  if file.endswith(".txt"):
11
  with open(file, 'r') as f:
12
  content = f.read()
13
- files_contents.append(content)
14
  files_names.append(file)
 
 
15
 
16
- # Append query to the end
17
- files_contents.append(query)
18
-
19
- # Initialize the TfidfVectorizer
20
- tfidf_vectorizer = TfidfVectorizer()
21
-
22
- # Fit and transform the texts
23
- tfidf_matrix = tfidf_vectorizer.fit_transform(files_contents)
24
 
25
  # Compute the cosine similarity between the query and all files
26
- similarity_scores = cosine_similarity(tfidf_matrix[-1:], tfidf_matrix[:-1])
27
 
28
  # Get the index of the file with the highest similarity score
29
  max_similarity_idx = similarity_scores.argmax()
30
 
31
  # Return the name of the file with the highest similarity score
32
  return files_names[max_similarity_idx]
33
-
34
  def find_closest_mp3(query):
35
  closest_txt_file = find_closest(query)
36
  file_name_without_extension, _ = os.path.splitext(closest_txt_file)
 
1
  import os
2
  from sklearn.feature_extraction.text import TfidfVectorizer
3
  from sklearn.metrics.pairwise import cosine_similarity
4
+ import spacy
5
  import gradio as gr
6
+ # def find_closest(query):
7
+ # files_contents = []
8
+ # files_names = []
9
+
10
+ # for file in os.listdir():
11
+ # if file.endswith(".txt"):
12
+ # with open(file, 'r') as f:
13
+ # content = f.read()
14
+ # files_contents.append(content)
15
+ # files_names.append(file)
16
+
17
+ # # Append query to the end
18
+ # files_contents.append(query)
19
+
20
+ # # Initialize the TfidfVectorizer
21
+ # tfidf_vectorizer = TfidfVectorizer()
22
+
23
+ # # Fit and transform the texts
24
+ # tfidf_matrix = tfidf_vectorizer.fit_transform(files_contents)
25
+
26
+ # # Compute the cosine similarity between the query and all files
27
+ # similarity_scores = cosine_similarity(tfidf_matrix[-1:], tfidf_matrix[:-1])
28
+
29
+ # # Get the index of the file with the highest similarity score
30
+ # max_similarity_idx = similarity_scores.argmax()
31
+
32
+ # # Return the name of the file with the highest similarity score
33
+ # return files_names[max_similarity_idx]
34
+
35
  def find_closest(query):
36
+ nlp = spacy.load('en_core_web_md')
37
  files_names = []
38
+ files_vectors = []
39
 
40
  for file in os.listdir():
41
  if file.endswith(".txt"):
42
  with open(file, 'r') as f:
43
  content = f.read()
 
44
  files_names.append(file)
45
+ # Get the vector representation of the content
46
+ files_vectors.append(nlp(content).vector)
47
 
48
+ # Get the vector representation of the query
49
+ query_vector = nlp(query).vector
 
 
 
 
 
 
50
 
51
  # Compute the cosine similarity between the query and all files
52
+ similarity_scores = cosine_similarity([query_vector], files_vectors)
53
 
54
  # Get the index of the file with the highest similarity score
55
  max_similarity_idx = similarity_scores.argmax()
56
 
57
  # Return the name of the file with the highest similarity score
58
  return files_names[max_similarity_idx]
 
59
  def find_closest_mp3(query):
60
  closest_txt_file = find_closest(query)
61
  file_name_without_extension, _ = os.path.splitext(closest_txt_file)