from sklearn.feature_extraction.text import TfidfVectorizer from sklearn.metrics.pairwise import cosine_similarity import numpy as np from nltk.corpus import wordnet from nltk.tokenize import word_tokenize import nltk import streamlit as st try: nltk.download('wordnet', quiet=True) nltk.download('punkt', quiet=True) nltk.download('averaged_perceptron_tagger', quiet=True) except: pass class TextProcessor: def __init__(self): self.vectorizer = TfidfVectorizer( stop_words='english', ngram_range=(1, 2), max_features=10000 ) self.relevance_threshold = 0.1 def preprocess_text(self, text): text = text.lower() tokens = word_tokenize(text) pos_tags = nltk.pos_tag(tokens) medical_terms = [word for word, tag in pos_tags if tag.startswith(('NN', 'JJ'))] return { 'processed_text': ' '.join(tokens), 'medical_terms': medical_terms } def get_synonyms(self, term): synonyms = [] for syn in wordnet.synsets(term): for lemma in syn.lemmas(): synonyms.append(lemma.name()) return list(set(synonyms)) def calculate_relevance_scores(self, question, abstracts): proc_question = self.preprocess_text(question) tfidf_matrix = self.vectorizer.fit_transform([proc_question['processed_text']] + abstracts) tfidf_scores = cosine_similarity(tfidf_matrix[0:1], tfidf_matrix[1:])[0] term_scores = [] question_terms = set(proc_question['medical_terms']) for abstract in abstracts: abstract_terms = set(self.preprocess_text(abstract)['medical_terms']) score = (len(question_terms.intersection(abstract_terms)) / len(question_terms.union(abstract_terms))) if question_terms.union(abstract_terms) else 0 term_scores.append(score) synonym_scores = [] question_synonyms = set() for term in proc_question['medical_terms']: question_synonyms.update(self.get_synonyms(term)) for abstract in abstracts: abstract_terms = set(self.preprocess_text(abstract)['medical_terms']) abstract_synonyms = set() for term in abstract_terms: abstract_synonyms.update(self.get_synonyms(term)) score = (len(question_synonyms.intersection(abstract_synonyms)) / len(question_synonyms.union(abstract_synonyms))) if question_synonyms.union(abstract_synonyms) else 0 synonym_scores.append(score) weights = {'tfidf': 0.5, 'term_matching': 0.3, 'synonym_matching': 0.2} combined_scores = [] for i in range(len(abstracts)): score = (weights['tfidf'] * tfidf_scores[i] + weights['term_matching'] * term_scores[i] + weights['synonym_matching'] * synonym_scores[i]) combined_scores.append(score) return np.array(combined_scores) def find_most_relevant_abstracts(self, question, abstracts, top_k=5): scores = self.calculate_relevance_scores(question, abstracts) # Filter by relevance threshold relevant_indices = np.where(scores > self.relevance_threshold)[0] if len(relevant_indices) == 0: return { 'top_indices': [], 'scores': [], 'processed_question': None } # Get top_k from relevant papers only top_k = min(top_k, len(relevant_indices)) top_indices = relevant_indices[np.argsort(scores[relevant_indices])[-top_k:][::-1]] proc_question = self.preprocess_text(question) return { 'top_indices': top_indices.tolist(), 'scores': scores[top_indices].tolist(), 'processed_question': { 'original': question, 'corrected': question, 'medical_entities': proc_question['medical_terms'] } }