aksj committed on
Commit
b487388
·
1 Parent(s): a613163

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +48 -20
app.py CHANGED
@@ -1,20 +1,21 @@
1
  import os
2
  from sklearn.feature_extraction.text import TfidfVectorizer
3
  from sklearn.metrics.pairwise import cosine_similarity
 
4
  import spacy
5
  import gradio as gr
6
  import subprocess
7
 
8
- def download_spacy_model(model_name):
9
- command = f"python -m spacy download {model_name}"
10
- process = subprocess.Popen(command, shell=True, stdout=subprocess.PIPE, stderr=subprocess.PIPE)
11
- stdout, stderr = process.communicate()
12
 
13
- # Check if the command executed successfully
14
- if process.returncode != 0:
15
- print(f"An error occurred while downloading the model: {stderr.decode('utf-8')}")
16
- else:
17
- print(f"Successfully downloaded the model: {stdout.decode('utf-8')}")
18
 
19
  # Call the function to download the model
20
 
@@ -47,28 +48,55 @@ def download_spacy_model(model_name):
47
  # # Return the name of the file with the highest similarity score
48
  # return files_names[max_similarity_idx]
49
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
50
  def find_closest(query):
51
- try:
52
- nlp = spacy.load('en_core_web_md')
53
- except:
54
- download_spacy_model('en_core_web_md')
55
- nlp = spacy.load('en_core_web_md')
56
  files_names = []
57
- files_vectors = []
58
 
59
  for file in os.listdir():
60
  if file.endswith(".txt"):
61
  with open(file, 'r') as f:
62
  content = f.read()
 
63
  files_names.append(file)
64
- # Get the vector representation of the content
65
- files_vectors.append(nlp(content).vector)
66
 
67
- # Get the vector representation of the query
68
- query_vector = nlp(query).vector
 
 
 
69
 
70
  # Compute the cosine similarity between the query and all files
71
- similarity_scores = cosine_similarity([query_vector], files_vectors)
72
 
73
  # Get the index of the file with the highest similarity score
74
  max_similarity_idx = similarity_scores.argmax()
 
1
  import os
2
  from sklearn.feature_extraction.text import TfidfVectorizer
3
  from sklearn.metrics.pairwise import cosine_similarity
4
+ from sentence_transformers import SentenceTransformer
5
  import spacy
6
  import gradio as gr
7
  import subprocess
8
 
9
+ # def download_spacy_model(model_name):
10
+ # command = f"python -m spacy download {model_name}"
11
+ # process = subprocess.Popen(command, shell=True, stdout=subprocess.PIPE, stderr=subprocess.PIPE)
12
+ # stdout, stderr = process.communicate()
13
 
14
+ # # Check if the command executed successfully
15
+ # if process.returncode != 0:
16
+ # print(f"An error occurred while downloading the model: {stderr.decode('utf-8')}")
17
+ # else:
18
+ # print(f"Successfully downloaded the model: {stdout.decode('utf-8')}")
19
 
20
  # Call the function to download the model
21
 
 
48
  # # Return the name of the file with the highest similarity score
49
  # return files_names[max_similarity_idx]
50
 
51
+ # def find_closest(query):
52
+ # try:
53
+ # nlp = spacy.load('en_core_web_md')
54
+ # except:
55
+ # download_spacy_model('en_core_web_md')
56
+ # nlp = spacy.load('en_core_web_md')
57
+ # files_names = []
58
+ # files_vectors = []
59
+
60
+ # for file in os.listdir():
61
+ # if file.endswith(".txt"):
62
+ # with open(file, 'r') as f:
63
+ # content = f.read()
64
+ # files_names.append(file)
65
+ # # Get the vector representation of the content
66
+ # files_vectors.append(nlp(content).vector)
67
+
68
+ # # Get the vector representation of the query
69
+ # query_vector = nlp(query).vector
70
+
71
+ # # Compute the cosine similarity between the query and all files
72
+ # similarity_scores = cosine_similarity([query_vector], files_vectors)
73
+
74
+ # # Get the index of the file with the highest similarity score
75
+ # max_similarity_idx = similarity_scores.argmax()
76
+
77
+ # # Return the name of the file with the highest similarity score
78
+ # return files_names[max_similarity_idx]
79
  def find_closest(query):
80
+ model = SentenceTransformer('all-MiniLM-L6-v2') # You can choose other models
81
+
82
+ files_contents = []
 
 
83
  files_names = []
 
84
 
85
  for file in os.listdir():
86
  if file.endswith(".txt"):
87
  with open(file, 'r') as f:
88
  content = f.read()
89
+ files_contents.append(content)
90
  files_names.append(file)
 
 
91
 
92
+ # Append query to the end
93
+ files_contents.append(query)
94
+
95
+ # Create sentence embeddings for each text
96
+ embeddings = model.encode(files_contents)
97
 
98
  # Compute the cosine similarity between the query and all files
99
+ similarity_scores = cosine_similarity([embeddings[-1]], embeddings[:-1])
100
 
101
  # Get the index of the file with the highest similarity score
102
  max_similarity_idx = similarity_scores.argmax()