# biomedical / text_processing.py
# pendar02's picture
# Create text_processing.py
# 605c3e6 verified
# raw
# history blame
# 4.73 kB
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
import numpy as np
from nltk.corpus import wordnet
from nltk.tokenize import word_tokenize
import nltk
import streamlit as st
# Download required NLTK data.
# 'punkt_tab' and 'averaged_perceptron_tagger_eng' are the resource names
# that newer NLTK releases look up for word_tokenize / pos_tag; the older
# names are kept so existing installs keep working.
for _resource in (
    'wordnet',
    'punkt',
    'punkt_tab',
    'averaged_perceptron_tagger',
    'averaged_perceptron_tagger_eng',
):
    try:
        nltk.download(_resource, quiet=True)
    except Exception:
        # Best effort: one failed download (e.g. no network) must not stop
        # the remaining resources or the import of this module. Unlike a
        # bare ``except:``, this still lets KeyboardInterrupt/SystemExit
        # propagate.
        pass
class TextProcessor:
    """Score and rank abstracts against a question.

    Three signals are combined: TF-IDF cosine similarity, Jaccard overlap
    of noun/adjective terms, and Jaccard overlap of the WordNet synonyms of
    those terms.
    """

    # Weight of each signal in the combined relevance score.
    SCORE_WEIGHTS = {
        'tfidf': 0.5,
        'term_matching': 0.3,
        'synonym_matching': 0.2
    }

    def __init__(self):
        """Initialize the text processor with TF-IDF vectorizer"""
        self.vectorizer = TfidfVectorizer(
            stop_words='english',
            ngram_range=(1, 2),
            max_features=10000
        )

    def preprocess_text(self, text: str) -> dict:
        """Lower-case, tokenize and POS-tag ``text``.

        Returns:
            A dict with two keys:
            'processed_text' -- the tokens joined with single spaces;
            'medical_terms'  -- tokens whose POS tag starts with 'NN' or
            'JJ' (nouns and adjectives), in order of appearance.
        """
        tokens = word_tokenize(text.lower())
        pos_tags = nltk.pos_tag(tokens)
        # Nouns and adjectives (medical terms are often these).
        medical_terms = [
            word for word, tag in pos_tags if tag.startswith(('NN', 'JJ'))
        ]
        return {
            'processed_text': ' '.join(tokens),
            'medical_terms': medical_terms
        }

    def get_synonyms(self, term: str) -> list:
        """Return the de-duplicated WordNet lemma names for ``term``.

        The order of the returned list is unspecified (it is built from a
        set).
        """
        return list({
            lemma.name()
            for syn in wordnet.synsets(term)
            for lemma in syn.lemmas()
        })

    @staticmethod
    def _jaccard(first: set, second: set) -> float:
        """Return |first & second| / |first | second|, or 0.0 if both are empty."""
        union = first | second
        if not union:
            return 0.0
        return len(first & second) / len(union)

    @staticmethod
    def _top_indices(scores, top_k: int) -> np.ndarray:
        """Return indices of the ``top_k`` highest scores, best first.

        A ``top_k`` of zero or less yields an empty result. A plain
        ``[-top_k:]`` slice would instead return everything for 0 and drop
        the lowest entries for negative values.
        """
        if top_k <= 0:
            return np.array([], dtype=int)
        return np.argsort(scores)[-top_k:][::-1]

    def _expand_synonyms(self, terms, cache: dict) -> set:
        """Return the union of synonyms of ``terms``, memoizing lookups in ``cache``."""
        expanded = set()
        for term in terms:
            if term not in cache:
                cache[term] = self.get_synonyms(term)
            expanded.update(cache[term])
        return expanded

    def calculate_relevance_scores(self, question, abstracts) -> np.ndarray:
        """Calculate relevance scores using multiple methods.

        Args:
            question: The question text.
            abstracts: Iterable of abstract strings.

        Returns:
            A 1-D numpy array with one combined score per abstract, in the
            same order as ``abstracts``. Empty when ``abstracts`` is empty.
        """
        abstracts = list(abstracts)
        if not abstracts:
            # Nothing to score; cosine_similarity rejects a 0-row matrix.
            return np.array([], dtype=float)

        proc_question = self.preprocess_text(question)

        # 1. TF-IDF similarity between the question (row 0) and each abstract.
        tfidf_matrix = self.vectorizer.fit_transform(
            [proc_question['processed_text']] + abstracts
        )
        tfidf_scores = cosine_similarity(tfidf_matrix[0:1], tfidf_matrix[1:])[0]

        # 2. + 3. Term and synonym overlap. Each abstract is preprocessed
        # once and shared by both measures; synonym lookups are memoized
        # because terms repeat across abstracts.
        synonym_cache = {}
        question_terms = set(proc_question['medical_terms'])
        question_synonyms = self._expand_synonyms(question_terms, synonym_cache)

        term_scores = []
        synonym_scores = []
        for abstract in abstracts:
            abstract_terms = set(self.preprocess_text(abstract)['medical_terms'])
            term_scores.append(self._jaccard(question_terms, abstract_terms))
            abstract_synonyms = self._expand_synonyms(abstract_terms, synonym_cache)
            synonym_scores.append(
                self._jaccard(question_synonyms, abstract_synonyms)
            )

        # Combine scores with weights
        weights = self.SCORE_WEIGHTS
        return (
            weights['tfidf'] * np.asarray(tfidf_scores, dtype=float) +
            weights['term_matching'] * np.asarray(term_scores, dtype=float) +
            weights['synonym_matching'] * np.asarray(synonym_scores, dtype=float)
        )

    def find_most_relevant_abstracts(self, question, abstracts, top_k=5):
        """Find the most relevant abstracts for a given question.

        Args:
            question: The question text.
            abstracts: Iterable of abstract strings.
            top_k: Maximum number of abstracts to return; values <= 0
                return no abstracts.

        Returns:
            A dict with:
            'top_indices' -- indices into ``abstracts``, highest score first;
            'scores' -- the combined scores for those indices;
            'processed_question' -- dict with 'original', 'corrected'
            (identical to the original; no spell checking) and
            'medical_entities' (noun/adjective terms of the question).
        """
        scores = self.calculate_relevance_scores(question, abstracts)
        top_indices = self._top_indices(scores, top_k)

        # Process question for medical terms
        proc_question = self.preprocess_text(question)

        return {
            'top_indices': top_indices.tolist(),
            'scores': scores[top_indices].tolist(),
            'processed_question': {
                'original': question,
                'corrected': question,  # No spell checking in this version
                'medical_entities': proc_question['medical_terms']
            }
        }