pendar02 committed (verified)
Commit 605c3e6 · Parent: 86cd6d4

Create text_processing.py

Files changed (1):
  1. text_processing.py  +126  -0
text_processing.py ADDED
@@ -0,0 +1,126 @@
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
import numpy as np
from nltk.corpus import wordnet
from nltk.tokenize import word_tokenize
import nltk
import streamlit as st

# Download required NLTK data
try:
    nltk.download('wordnet', quiet=True)
    nltk.download('punkt', quiet=True)
    nltk.download('averaged_perceptron_tagger', quiet=True)
except Exception:
    pass

class TextProcessor:
    def __init__(self):
        """Initialize the text processor with TF-IDF vectorizer"""
        self.vectorizer = TfidfVectorizer(
            stop_words='english',
            ngram_range=(1, 2),
            max_features=10000
        )

    def preprocess_text(self, text):
        """Basic text preprocessing"""
        # Convert to lower case
        text = text.lower()
        # Tokenize
        tokens = word_tokenize(text)
        # Get POS tags
        pos_tags = nltk.pos_tag(tokens)
        # Extract nouns and adjectives (medical terms are often these)
        medical_terms = [word for word, tag in pos_tags if tag.startswith(('NN', 'JJ'))]
        return {
            'processed_text': ' '.join(tokens),
            'medical_terms': medical_terms
        }

    def get_synonyms(self, term):
        """Get synonyms for a term using WordNet"""
        synonyms = []
        for syn in wordnet.synsets(term):
            for lemma in syn.lemmas():
                synonyms.append(lemma.name())
        return list(set(synonyms))

    def calculate_relevance_scores(self, question, abstracts):
        """Calculate relevance scores using multiple methods"""
        # Preprocess question
        proc_question = self.preprocess_text(question)

        # 1. TF-IDF Similarity
        tfidf_matrix = self.vectorizer.fit_transform([proc_question['processed_text']] + abstracts)
        tfidf_scores = cosine_similarity(tfidf_matrix[0:1], tfidf_matrix[1:])[0]

        # 2. Medical Term Matching
        term_scores = []
        question_terms = set(proc_question['medical_terms'])
        for abstract in abstracts:
            abstract_terms = set(self.preprocess_text(abstract)['medical_terms'])
            # Calculate Jaccard similarity between terms
            if len(question_terms.union(abstract_terms)) > 0:
                score = len(question_terms.intersection(abstract_terms)) / len(question_terms.union(abstract_terms))
            else:
                score = 0
            term_scores.append(score)

        # 3. Synonym Matching
        synonym_scores = []
        question_synonyms = set()
        for term in proc_question['medical_terms']:
            question_synonyms.update(self.get_synonyms(term))

        for abstract in abstracts:
            abstract_terms = set(self.preprocess_text(abstract)['medical_terms'])
            abstract_synonyms = set()
            for term in abstract_terms:
                abstract_synonyms.update(self.get_synonyms(term))

            # Calculate synonym overlap
            if len(question_synonyms.union(abstract_synonyms)) > 0:
                score = len(question_synonyms.intersection(abstract_synonyms)) / len(question_synonyms.union(abstract_synonyms))
            else:
                score = 0
            synonym_scores.append(score)

        # Combine scores with weights
        weights = {
            'tfidf': 0.5,
            'term_matching': 0.3,
            'synonym_matching': 0.2
        }

        combined_scores = []
        for i in range(len(abstracts)):
            score = (
                weights['tfidf'] * tfidf_scores[i] +
                weights['term_matching'] * term_scores[i] +
                weights['synonym_matching'] * synonym_scores[i]
            )
            combined_scores.append(score)

        return np.array(combined_scores)

    def find_most_relevant_abstracts(self, question, abstracts, top_k=5):
        """Find the most relevant abstracts for a given question"""
        # Calculate relevance scores
        scores = self.calculate_relevance_scores(question, abstracts)

        # Get indices of top_k highest scoring abstracts
        top_indices = np.argsort(scores)[-top_k:][::-1]

        # Process question for medical terms
        proc_question = self.preprocess_text(question)

        return {
            'top_indices': top_indices.tolist(),
            'scores': scores[top_indices].tolist(),
            'processed_question': {
                'original': question,
                'corrected': question,  # No spell checking in this version
                'medical_entities': proc_question['medical_terms']
            }
        }
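
For reference, a minimal usage sketch (not part of the commit): the question and abstracts below are invented for illustration, and the snippet assumes scikit-learn and NLTK are installed with the resources downloaded above (newer NLTK releases may additionally require the 'punkt_tab' resource for word_tokenize).

# Illustrative usage only; sample question and abstracts are made up.
if __name__ == "__main__":
    processor = TextProcessor()
    question = "What treatments reduce blood pressure in elderly patients?"
    abstracts = [
        "This trial evaluated antihypertensive therapy in adults over 65.",
        "We study protein folding dynamics using molecular simulations.",
        "Lifestyle interventions lowered systolic blood pressure in older adults.",
    ]
    results = processor.find_most_relevant_abstracts(question, abstracts, top_k=2)
    # Print the top-ranked abstracts with their combined relevance scores
    for idx, score in zip(results['top_indices'], results['scores']):
        print(f"{score:.3f}  {abstracts[idx]}")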