pendar02 committed on
Commit
adb0a32
·
verified ·
1 Parent(s): dee9a31

Update text_processing.py

Browse files
Files changed (1) hide show
  1. text_processing.py +23 -40
text_processing.py CHANGED
@@ -6,7 +6,6 @@ from nltk.tokenize import word_tokenize
6
  import nltk
7
  import streamlit as st
8
 
9
- # Download required NLTK data
10
  try:
11
  nltk.download('wordnet', quiet=True)
12
  nltk.download('punkt', quiet=True)
@@ -16,22 +15,17 @@ except:
16
 
17
  class TextProcessor:
18
  def __init__(self):
19
- """Initialize the text processor with TF-IDF vectorizer"""
20
  self.vectorizer = TfidfVectorizer(
21
  stop_words='english',
22
  ngram_range=(1, 2),
23
  max_features=10000
24
  )
 
25
 
26
  def preprocess_text(self, text):
27
- """Basic text preprocessing"""
28
- # Convert to lower case
29
  text = text.lower()
30
- # Tokenize
31
  tokens = word_tokenize(text)
32
- # Get POS tags
33
  pos_tags = nltk.pos_tag(tokens)
34
- # Extract nouns and adjectives (medical terms are often these)
35
  medical_terms = [word for word, tag in pos_tags if tag.startswith(('NN', 'JJ'))]
36
  return {
37
  'processed_text': ' '.join(tokens),
@@ -39,7 +33,6 @@ class TextProcessor:
39
  }
40
 
41
  def get_synonyms(self, term):
42
- """Get synonyms for a term using WordNet"""
43
  synonyms = []
44
  for syn in wordnet.synsets(term):
45
  for lemma in syn.lemmas():
@@ -47,27 +40,19 @@ class TextProcessor:
47
  return list(set(synonyms))
48
 
49
  def calculate_relevance_scores(self, question, abstracts):
50
- """Calculate relevance scores using multiple methods"""
51
- # Preprocess question
52
  proc_question = self.preprocess_text(question)
53
 
54
- # 1. TF-IDF Similarity
55
  tfidf_matrix = self.vectorizer.fit_transform([proc_question['processed_text']] + abstracts)
56
  tfidf_scores = cosine_similarity(tfidf_matrix[0:1], tfidf_matrix[1:])[0]
57
 
58
- # 2. Medical Term Matching
59
  term_scores = []
60
  question_terms = set(proc_question['medical_terms'])
61
  for abstract in abstracts:
62
  abstract_terms = set(self.preprocess_text(abstract)['medical_terms'])
63
- # Calculate Jaccard similarity between terms
64
- if len(question_terms.union(abstract_terms)) > 0:
65
- score = len(question_terms.intersection(abstract_terms)) / len(question_terms.union(abstract_terms))
66
- else:
67
- score = 0
68
  term_scores.append(score)
69
 
70
- # 3. Synonym Matching
71
  synonym_scores = []
72
  question_synonyms = set()
73
  for term in proc_question['medical_terms']:
@@ -79,40 +64,38 @@ class TextProcessor:
79
  for term in abstract_terms:
80
  abstract_synonyms.update(self.get_synonyms(term))
81
 
82
- # Calculate synonym overlap
83
- if len(question_synonyms.union(abstract_synonyms)) > 0:
84
- score = len(question_synonyms.intersection(abstract_synonyms)) / len(question_synonyms.union(abstract_synonyms))
85
- else:
86
- score = 0
87
  synonym_scores.append(score)
88
 
89
- # Combine scores with weights
90
- weights = {
91
- 'tfidf': 0.5,
92
- 'term_matching': 0.3,
93
- 'synonym_matching': 0.2
94
- }
95
 
96
  combined_scores = []
97
  for i in range(len(abstracts)):
98
- score = (
99
- weights['tfidf'] * tfidf_scores[i] +
100
- weights['term_matching'] * term_scores[i] +
101
- weights['synonym_matching'] * synonym_scores[i]
102
- )
103
  combined_scores.append(score)
104
 
105
  return np.array(combined_scores)
106
 
107
  def find_most_relevant_abstracts(self, question, abstracts, top_k=5):
108
- """Find the most relevant abstracts for a given question"""
109
- # Calculate relevance scores
110
  scores = self.calculate_relevance_scores(question, abstracts)
111
 
112
- # Get indices of top_k highest scoring abstracts
113
- top_indices = np.argsort(scores)[-top_k:][::-1]
 
 
 
 
 
 
 
 
 
 
 
114
 
115
- # Process question for medical terms
116
  proc_question = self.preprocess_text(question)
117
 
118
  return {
@@ -120,7 +103,7 @@ class TextProcessor:
120
  'scores': scores[top_indices].tolist(),
121
  'processed_question': {
122
  'original': question,
123
- 'corrected': question, # No spell checking in this version
124
  'medical_entities': proc_question['medical_terms']
125
  }
126
  }
 
6
  import nltk
7
  import streamlit as st
8
 
 
9
  try:
10
  nltk.download('wordnet', quiet=True)
11
  nltk.download('punkt', quiet=True)
 
15
 
16
class TextProcessor:
    """Score and rank medical abstracts by relevance to a question.

    Combines three signals into one weighted relevance score per abstract:
    TF-IDF cosine similarity, Jaccard overlap of candidate medical terms
    (nouns/adjectives), and Jaccard overlap of their WordNet synonyms.
    """

    def __init__(self):
        """Initialize the TF-IDF vectorizer and the relevance threshold."""
        self.vectorizer = TfidfVectorizer(
            stop_words='english',
            ngram_range=(1, 2),
            max_features=10000
        )
        # Minimum combined score for an abstract to count as relevant.
        self.relevance_threshold = 0.1

    @staticmethod
    def _jaccard(a, b):
        """Jaccard similarity of two sets; 0 when both are empty."""
        union = a | b
        return len(a & b) / len(union) if union else 0

    def preprocess_text(self, text):
        """Lowercase and tokenize *text*, extracting candidate medical terms.

        Returns a dict with:
            'processed_text': space-joined lowercase tokens
            'medical_terms': nouns and adjectives (POS tags NN*/JJ*),
                which medical terms typically are
        """
        text = text.lower()
        tokens = word_tokenize(text)
        pos_tags = nltk.pos_tag(tokens)
        medical_terms = [word for word, tag in pos_tags
                         if tag.startswith(('NN', 'JJ'))]
        return {
            'processed_text': ' '.join(tokens),
            'medical_terms': medical_terms
        }

    def get_synonyms(self, term):
        """Return a deduplicated list of WordNet lemma names for *term*."""
        synonyms = []
        for syn in wordnet.synsets(term):
            for lemma in syn.lemmas():
                synonyms.append(lemma.name())
        return list(set(synonyms))

    def calculate_relevance_scores(self, question, abstracts):
        """Score every abstract against *question*.

        Weights: TF-IDF cosine similarity 0.5, medical-term Jaccard 0.3,
        WordNet synonym Jaccard 0.2.

        Returns a numpy array of len(abstracts) combined scores.
        """
        proc_question = self.preprocess_text(question)

        # 1. TF-IDF similarity: question occupies row 0, abstracts follow.
        tfidf_matrix = self.vectorizer.fit_transform(
            [proc_question['processed_text']] + abstracts)
        tfidf_scores = cosine_similarity(tfidf_matrix[0:1], tfidf_matrix[1:])[0]

        # 2. Medical-term matching. Cache each abstract's term set so the
        # synonym pass below does not re-tokenize/POS-tag every abstract.
        question_terms = set(proc_question['medical_terms'])
        term_scores = []
        abstract_term_sets = []
        for abstract in abstracts:
            abstract_terms = set(self.preprocess_text(abstract)['medical_terms'])
            abstract_term_sets.append(abstract_terms)
            term_scores.append(self._jaccard(question_terms, abstract_terms))

        # 3. Synonym matching via WordNet expansion of the term sets.
        question_synonyms = set()
        for term in proc_question['medical_terms']:
            question_synonyms.update(self.get_synonyms(term))
        synonym_scores = []
        for abstract_terms in abstract_term_sets:
            abstract_synonyms = set()
            for term in abstract_terms:
                abstract_synonyms.update(self.get_synonyms(term))
            synonym_scores.append(
                self._jaccard(question_synonyms, abstract_synonyms))

        weights = {'tfidf': 0.5, 'term_matching': 0.3, 'synonym_matching': 0.2}
        combined_scores = [
            weights['tfidf'] * tfidf_scores[i]
            + weights['term_matching'] * term_scores[i]
            + weights['synonym_matching'] * synonym_scores[i]
            for i in range(len(abstracts))
        ]
        return np.array(combined_scores)

    def find_most_relevant_abstracts(self, question, abstracts, top_k=5):
        """Return up to *top_k* abstracts scoring above the threshold.

        Returns a dict with 'top_indices', 'scores' and
        'processed_question'. When no abstract clears the relevance
        threshold, the lists are empty and 'processed_question' is None.
        """
        scores = self.calculate_relevance_scores(question, abstracts)

        # Filter by relevance threshold.
        relevant_indices = np.where(scores > self.relevance_threshold)[0]

        if len(relevant_indices) == 0:
            return {
                'top_indices': [],
                'scores': [],
                'processed_question': None
            }

        # Rank only the relevant abstracts, best first.
        top_k = min(top_k, len(relevant_indices))
        top_indices = relevant_indices[
            np.argsort(scores[relevant_indices])[-top_k:][::-1]]

        proc_question = self.preprocess_text(question)

        return {
            'top_indices': top_indices.tolist(),
            'scores': scores[top_indices].tolist(),
            'processed_question': {
                'original': question,
                # No spell checking in this version; echo the input.
                'corrected': question,
                'medical_entities': proc_question['medical_terms']
            }
        }