import logging

import nltk
import numpy as np
import streamlit as st  # noqa: F401  (unused here; kept from the original import list)
from nltk.corpus import wordnet
from nltk.tokenize import word_tokenize
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

_logger = logging.getLogger(__name__)

# NLTK resources used by word_tokenize, nltk.pos_tag and wordnet below.
# NOTE(review): newer NLTK releases may look these up under different ids
# (e.g. 'punkt_tab', 'averaged_perceptron_tagger_eng') -- confirm against the
# installed NLTK version.
_NLTK_RESOURCES = ('wordnet', 'punkt', 'averaged_perceptron_tagger')

# Download required NLTK data. This stays best-effort (a failure must not
# prevent the module from importing), but each resource is attempted on its
# own and failures are logged instead of being silently swallowed.
for _resource in _NLTK_RESOURCES:
    try:
        nltk.download(_resource, quiet=True)
    except Exception as exc:
        _logger.warning(
            "Could not download NLTK resource %r: %s", _resource, exc
        )


class TextProcessor:
    """Score and rank abstracts against a question.

    Three signals are combined: TF-IDF cosine similarity, Jaccard overlap of
    the nouns/adjectives found in each text, and Jaccard overlap of the
    WordNet synonyms of those nouns/adjectives.
    """

    # Relative weight of each signal in the combined relevance score.
    SCORE_WEIGHTS = {
        'tfidf': 0.5,
        'term_matching': 0.3,
        'synonym_matching': 0.2,
    }

    def __init__(self):
        """Initialize the text processor with a TF-IDF vectorizer."""
        self.vectorizer = TfidfVectorizer(
            stop_words='english',
            ngram_range=(1, 2),
            max_features=10000,
        )

    def preprocess_text(self, text):
        """Lower-case, tokenize and POS-tag ``text``.

        Args:
            text: The string to process.

        Returns:
            A dict with two keys:
            ``'processed_text'`` -- the lower-cased tokens joined by spaces;
            ``'medical_terms'`` -- the tokens whose POS tag starts with
            ``NN`` or ``JJ`` (nouns and adjectives), in document order.
        """
        tokens = word_tokenize(text.lower())
        pos_tags = nltk.pos_tag(tokens)

        # Nouns and adjectives are used as a proxy for medical terms.
        medical_terms = [
            word for word, tag in pos_tags if tag.startswith(('NN', 'JJ'))
        ]

        return {
            'processed_text': ' '.join(tokens),
            'medical_terms': medical_terms,
        }

    def get_synonyms(self, term):
        """Return the unique WordNet lemma names for ``term``.

        Args:
            term: The word to look up.

        Returns:
            A sorted list of distinct lemma names taken from every synset
            WordNet returns for ``term`` (empty if there are none).
        """
        lemma_names = {
            lemma.name()
            for synset in wordnet.synsets(term)
            for lemma in synset.lemmas()
        }
        # Sorted so the result does not depend on set iteration order.
        return sorted(lemma_names)

    @staticmethod
    def _jaccard(left, right):
        """Return the Jaccard similarity of two sets (0 if both are empty)."""
        union = left | right
        if not union:
            return 0
        return len(left & right) / len(union)

    def _expand_synonyms(self, terms, cache):
        """Return the union of synonyms of ``terms``, memoised in ``cache``."""
        expanded = set()
        for term in terms:
            if term not in cache:
                cache[term] = self.get_synonyms(term)
            expanded.update(cache[term])
        return expanded

    def calculate_relevance_scores(self, question, abstracts):
        """Calculate a combined relevance score for every abstract.

        Args:
            question: The question string.
            abstracts: An iterable of abstract strings.

        Returns:
            A NumPy array with one score per abstract, in input order. Each
            score is the ``SCORE_WEIGHTS``-weighted sum of the TF-IDF cosine
            similarity, the term Jaccard score and the synonym Jaccard
            score. An empty array is returned when there are no abstracts.
        """
        abstracts = list(abstracts)
        if not abstracts:
            # cosine_similarity rejects a matrix with zero rows.
            return np.array([])

        proc_question = self.preprocess_text(question)

        # 1. TF-IDF similarity between the question and each abstract.
        try:
            tfidf_matrix = self.vectorizer.fit_transform(
                [proc_question['processed_text'], *abstracts]
            )
            tfidf_scores = cosine_similarity(
                tfidf_matrix[0:1], tfidf_matrix[1:]
            )[0]
        except ValueError:
            # Raised by the vectorizer when the vocabulary is empty (e.g.
            # every document consists only of stop words). Fall back to
            # zeros so the other two signals can still contribute.
            tfidf_scores = np.zeros(len(abstracts))

        # Tokenise and POS-tag each abstract once; both the term matching
        # and the synonym matching below reuse the result.
        question_terms = set(proc_question['medical_terms'])
        abstract_term_sets = [
            set(self.preprocess_text(abstract)['medical_terms'])
            for abstract in abstracts
        ]

        # 2. Term matching: Jaccard similarity of nouns/adjectives.
        term_scores = np.array(
            [self._jaccard(question_terms, terms)
             for terms in abstract_term_sets],
            dtype=float,
        )

        # 3. Synonym matching: Jaccard similarity of the WordNet expansions.
        # The cache avoids repeated WordNet lookups for recurring terms.
        synonym_cache = {}
        question_synonyms = self._expand_synonyms(
            question_terms, synonym_cache
        )
        synonym_scores = np.array(
            [
                self._jaccard(
                    question_synonyms,
                    self._expand_synonyms(terms, synonym_cache),
                )
                for terms in abstract_term_sets
            ],
            dtype=float,
        )

        # Combine the three signals with their weights.
        weights = self.SCORE_WEIGHTS
        return (
            weights['tfidf'] * tfidf_scores
            + weights['term_matching'] * term_scores
            + weights['synonym_matching'] * synonym_scores
        )

    def find_most_relevant_abstracts(self, question, abstracts, top_k=5):
        """Find the most relevant abstracts for a given question.

        Args:
            question: The question string.
            abstracts: An iterable of abstract strings.
            top_k: Maximum number of abstracts to return. Values of zero or
                less yield an empty result; values larger than the number of
                abstracts are clamped.

        Returns:
            A dict with:
            ``'top_indices'`` -- indices into ``abstracts``, best first;
            ``'scores'`` -- the combined score for each returned index;
            ``'processed_question'`` -- a dict holding the original question,
            a ``'corrected'`` copy (identical, no spell checking is done) and
            the question's extracted ``'medical_entities'``.
        """
        abstracts = list(abstracts)
        scores = self.calculate_relevance_scores(question, abstracts)

        # Clamp top_k. The previous ``[-top_k:]`` slice returned *all*
        # abstracts for top_k == 0 and an arbitrary slice for negatives.
        limit = max(0, min(int(top_k), len(scores)))
        top_indices = np.argsort(scores)[::-1][:limit]

        proc_question = self.preprocess_text(question)

        return {
            'top_indices': top_indices.tolist(),
            'scores': scores[top_indices].tolist(),
            'processed_question': {
                'original': question,
                'corrected': question,  # No spell checking in this version
                'medical_entities': proc_question['medical_terms'],
            },
        }