ayush-thakur02 committed
Commit aa183b9 · verified · 1 Parent(s): 194076d

Update app.py

Files changed (1)
  1. app.py +55 -15
app.py CHANGED
@@ -6,13 +6,23 @@ from nltk.stem import WordNetLemmatizer
 from nltk.corpus import stopwords
 import nltk
 import json
+from typing import List, Dict, Any
 
 # Download NLTK resources
 nltk.download('punkt')
 nltk.download('wordnet')
 nltk.download('stopwords')
 
-def preprocess(sentence):
+def preprocess(sentence: str) -> str:
+    """
+    Preprocesses a given sentence by converting to lowercase, tokenizing, lemmatizing, and removing stopwords.
+
+    Parameters:
+    sentence (str): The input sentence to be preprocessed.
+
+    Returns:
+    str: The preprocessed sentence.
+    """
     lemmatizer = WordNetLemmatizer()
     stop_words = set(stopwords.words('english'))
     tokens = word_tokenize(sentence.lower())
@@ -20,45 +30,75 @@ def preprocess(sentence):
     tokens = [word for word in tokens if word not in stop_words]
     return ' '.join(tokens)
 
-def find_most_similar(sentence, candidates, threshold=0.15):
+def calculate_fx(sentence: str, candidates: List[str], threshold: float = 0.15) -> List[Dict[str, Any]]:
+    """
+    Calculates the similarity scores between the input sentence and a list of candidate sentences.
+
+    Parameters:
+    sentence (str): The input sentence.
+    candidates (List[str]): List of candidate sentences.
+    threshold (float, optional): Threshold value for considering a sentence similar. Defaults to 0.15.
+
+    Returns:
+    List[Dict[str, Any]]: List of dictionaries containing similar sentences and their similarity scores.
+    """
     input_bits = preprocess(sentence)
     chunks = [preprocess(candidate) for candidate in candidates]
 
     vectorizer = TfidfVectorizer()
     vectors = vectorizer.fit_transform([input_bits] + chunks)
 
-    similarity_scores = cosine_similarity(vectors[0:1], vectors[1:]).flatten()
+    f_scores = cosine_similarity(vectors[0:1], vectors[1:]).flatten()
 
-    similar_sentences = []
-    for i, score in enumerate(similarity_scores):
+    similar_chunks = []
+    for i, score in enumerate(f_scores):
         if score >= threshold:
-            similar_sentences.append({"sentence": candidates[i], "f(score)": round(score, 4)})
+            similar_chunks.append({"sentence": candidates[i], "f(score)": round(score, 4)})
 
-    return similar_sentences
+    return similar_chunks
 
-def read_sentences_from_file(file_location):
+def read_sentences_from_file(file_location: str) -> List[str]:
+    """
+    Reads sentences from a text file located at the given location.
+
+    Parameters:
+    file_location (str): Location of the text file.
+
+    Returns:
+    List[str]: List of sentences read from the file.
+    """
     with open(file_location, 'r') as file:
         text = file.read().replace('\n', ' ')
     sentences = [sentence.strip() for sentence in text.split('.') if sentence.strip()]
     return sentences
 
-def fetch_vectors(file, sentence):
+def fetch_vectors(file: Any, sentence: str) -> str:
+    """
+    Fetches similar sentences from a text file for a given input sentence.
+
+    Parameters:
+    file (Any): File uploaded by the user.
+    sentence (str): Input sentence.
+
+    Returns:
+    str: JSON string containing similar sentences and their similarity scores.
+    """
    file_location = file.name
     chunks = read_sentences_from_file(file_location)
-    similar_sentences = find_most_similar(sentence, chunks, threshold=0.15)
-    return json.dumps(similar_sentences, indent=4)
+    similar_chunks = calculate_fx(sentence, chunks, threshold=0.15)
+    return json.dumps(similar_chunks, indent=4)
 
 # Interface
 file_uploader = gr.File(label="Upload a .txt file")
-text_input = gr.Textbox(label="Enter a sentence")
-output_text = gr.Textbox(label="RAG -QA")
+text_input = gr.Textbox(label="Enter question")
+output_text = gr.Textbox(label="Output")
 
 iface = gr.Interface(
     fn=fetch_vectors,
     inputs=[file_uploader, text_input],
     outputs=output_text,
     title="Minimal RAG - For QA (Super Fast/Modeless)",
-    description="Upload a text file and enter the question. The threshold is set to 0.15."
+    description="Fastest Minimal Rag for Question Answer, calculating cosine similarities and vectorizing using scikit-learn's TfidfVectorizer."
 )
 
-iface.launch(debug=True)
+iface.launch(debug=True)
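
As a quick sanity check outside the Gradio UI, the TF-IDF retrieval core that calculate_fx implements can be exercised on its own. The following is a minimal sketch with an invented query and corpus; it skips the NLTK preprocessing step, so the exact scores will differ slightly from what app.py returns:

# Standalone sketch of the retrieval core used by calculate_fx above.
# The query and chunks are invented for illustration; NLTK preprocessing
# is omitted here, so scores will not match app.py exactly.
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

query = "how do plants make food"
chunks = [
    "Photosynthesis is the process by which plants make their own food",
    "The stock market closed higher today",
    "Plants use sunlight, water, and carbon dioxide to produce glucose",
]

# Fit one vocabulary over the query plus all chunks; row 0 is the query.
vectors = TfidfVectorizer().fit_transform([query] + chunks)

# Cosine similarity of the query row against every chunk row.
scores = cosine_similarity(vectors[0:1], vectors[1:]).flatten()

# Keep chunks that clear the same 0.15 threshold the app uses.
hits = [{"sentence": chunks[i], "f(score)": round(float(s), 4)}
        for i, s in enumerate(scores) if s >= 0.15]
print(hits)

Lowering the threshold admits looser matches; fetch_vectors pins it at 0.15, the same value the calculate_fx signature declares as its default.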