Spaces:

pendar02
/

biomedical

Sleeping

App Files Files Community

biomedical / text_processing.py

pendar02

Create text_processing.py

605c3e6 verified 10 months ago

raw

history blame

4.73 kB

	from sklearn.feature_extraction.text import TfidfVectorizer
	from sklearn.metrics.pairwise import cosine_similarity
	import numpy as np
	from nltk.corpus import wordnet
	from nltk.tokenize import word_tokenize
	import nltk
	import streamlit as st

	# Download the NLTK data used by TextProcessor (tokenizer, POS tagger, WordNet).
	# Both the legacy resource names and the names used by NLTK 3.9+ ('punkt_tab',
	# 'averaged_perceptron_tagger_eng') are requested so tokenizing and tagging
	# work regardless of the installed NLTK release.
	for _resource in (
	    'wordnet',
	    'punkt',
	    'punkt_tab',
	    'averaged_perceptron_tagger',
	    'averaged_perceptron_tagger_eng',
	):
	    try:
	        nltk.download(_resource, quiet=True)
	    except Exception:
	        # Best effort: the data may already be installed locally or the host
	        # may be offline. A failure for one resource must not prevent the
	        # remaining ones from being attempted, and must not break the import.
	        continue

	class TextProcessor:
	    """Rank abstracts against a question using lexical and WordNet signals.

	    The relevance score of an abstract is a weighted sum of three parts:
	    TF-IDF cosine similarity, Jaccard overlap of noun/adjective terms, and
	    Jaccard overlap of the WordNet synonyms of those terms.
	    """

	    # Relative contribution of each signal to the combined relevance score.
	    SCORE_WEIGHTS = {
	        'tfidf': 0.5,
	        'term_matching': 0.3,
	        'synonym_matching': 0.2,
	    }

	    def __init__(self):
	        """Initialize the text processor with a TF-IDF vectorizer."""
	        self.vectorizer = TfidfVectorizer(
	            stop_words='english',
	            ngram_range=(1, 2),
	            max_features=10000,
	        )

	    def preprocess_text(self, text):
	        """Lower-case, tokenize and POS-tag ``text``.

	        Args:
	            text: The raw string to process.

	        Returns:
	            A dict with ``'processed_text'`` (the lower-cased tokens joined
	            by single spaces) and ``'medical_terms'`` (the tokens whose POS
	            tag starts with ``NN`` or ``JJ``, in order of appearance).
	        """
	        tokens = word_tokenize(text.lower())
	        # Nouns and adjectives are used as a cheap proxy for medical terms.
	        medical_terms = [
	            word
	            for word, tag in nltk.pos_tag(tokens)
	            if tag.startswith(('NN', 'JJ'))
	        ]
	        return {
	            'processed_text': ' '.join(tokens),
	            'medical_terms': medical_terms,
	        }

	    def get_synonyms(self, term):
	        """Return the distinct WordNet lemma names for ``term``.

	        Args:
	            term: The word to look up.

	        Returns:
	            A list, in no particular order, of the unique lemma names over
	            all synsets of ``term``; empty when there are no synsets.
	        """
	        return list({
	            lemma.name()
	            for synset in wordnet.synsets(term)
	            for lemma in synset.lemmas()
	        })

	    @staticmethod
	    def _jaccard(first, second):
	        """Return the Jaccard similarity of two sets (0.0 if both are empty)."""
	        union = first | second
	        if not union:
	            return 0.0
	        return len(first & second) / len(union)

	    def _expand_synonyms(self, terms, cache):
	        """Return the union of synonyms of ``terms``, memoizing in ``cache``."""
	        expanded = set()
	        for term in terms:
	            if term not in cache:
	                cache[term] = self.get_synonyms(term)
	            expanded.update(cache[term])
	        return expanded

	    def calculate_relevance_scores(self, question, abstracts):
	        """Score every abstract against ``question``.

	        The vectorizer stored on the instance is refitted on each call, using
	        the preprocessed question plus the raw abstracts as the corpus.

	        Args:
	            question: The query string.
	            abstracts: The abstract strings to score.

	        Returns:
	            A 1-D ``numpy`` array holding one combined score per abstract, in
	            input order. The array is empty when ``abstracts`` is empty.
	        """
	        abstracts = list(abstracts)
	        if not abstracts:
	            # Nothing to compare against; cosine_similarity would otherwise
	            # be handed a matrix with zero rows.
	            return np.array([])

	        proc_question = self.preprocess_text(question)
	        question_terms = set(proc_question['medical_terms'])

	        # 1. TF-IDF similarity: row 0 is the question, the rest the abstracts.
	        tfidf_matrix = self.vectorizer.fit_transform(
	            [proc_question['processed_text']] + abstracts
	        )
	        tfidf_scores = cosine_similarity(tfidf_matrix[0:1], tfidf_matrix[1:])[0]

	        # Tokenize and POS-tag each abstract once; both term-based signals
	        # below reuse the result.
	        abstract_term_sets = [
	            set(self.preprocess_text(abstract)['medical_terms'])
	            for abstract in abstracts
	        ]

	        # 2. Medical term matching (Jaccard similarity of the term sets).
	        term_scores = np.array(
	            [self._jaccard(question_terms, terms) for terms in abstract_term_sets],
	            dtype=float,
	        )

	        # 3. Synonym matching (Jaccard similarity of the synonym sets). The
	        # cache avoids repeating a WordNet lookup for a term that occurs in
	        # several abstracts.
	        synonym_cache = {}
	        question_synonyms = self._expand_synonyms(
	            proc_question['medical_terms'], synonym_cache
	        )
	        synonym_scores = np.array(
	            [
	                self._jaccard(
	                    question_synonyms,
	                    self._expand_synonyms(terms, synonym_cache),
	                )
	                for terms in abstract_term_sets
	            ],
	            dtype=float,
	        )

	        # Combine scores with weights.
	        weights = self.SCORE_WEIGHTS
	        return (
	            weights['tfidf'] * np.asarray(tfidf_scores)
	            + weights['term_matching'] * term_scores
	            + weights['synonym_matching'] * synonym_scores
	        )

	    def find_most_relevant_abstracts(self, question, abstracts, top_k=5):
	        """Find the most relevant abstracts for a given question.

	        Args:
	            question: The query string.
	            abstracts: The abstract strings to rank.
	            top_k: Maximum number of abstracts to return. Zero or a negative
	                value selects nothing.

	        Returns:
	            A dict with ``'top_indices'`` (positions in ``abstracts``, best
	            first), ``'scores'`` (the matching combined scores) and
	            ``'processed_question'`` (the original question, an unchanged
	            ``'corrected'`` copy, and its extracted terms).
	        """
	        scores = self.calculate_relevance_scores(question, abstracts)

	        if top_k > 0:
	            top_indices = np.argsort(scores)[-top_k:][::-1]
	        else:
	            # ``[-0:]`` slices the whole array, so an explicit empty
	            # selection is needed here.
	            top_indices = np.array([], dtype=np.intp)

	        proc_question = self.preprocess_text(question)

	        return {
	            'top_indices': top_indices.tolist(),
	            'scores': scores[top_indices].tolist(),
	            'processed_question': {
	                'original': question,
	                'corrected': question,  # No spell checking in this version
	                'medical_entities': proc_question['medical_terms'],
	            },
	        }