Spaces:

pendar02
/

biomedical

Sleeping

App Files Files Community

pendar02 committed on Jan 10

Commit

605c3e6

verified ·

1 Parent(s): 86cd6d4

Create text_processing.py

Browse files

Files changed (1) hide show

text_processing.py +126 -0

text_processing.py ADDED Viewed

	@@ -0,0 +1,126 @@

+from sklearn.feature_extraction.text import TfidfVectorizer
+from sklearn.metrics.pairwise import cosine_similarity
+import numpy as np
+from nltk.corpus import wordnet
+from nltk.tokenize import word_tokenize
+import nltk
+import streamlit as st
+# Download required NLTK data (WordNet, the Punkt tokenizer and the POS tagger).
+# Recent NLTK releases look up 'punkt_tab' and 'averaged_perceptron_tagger_eng'
+# instead of the older 'punkt' / 'averaged_perceptron_tagger' packages, so both
+# generations are requested; an unneeded package is simply left unused.
+import warnings
+
+for _resource in (
+    'wordnet',
+    'punkt',
+    'punkt_tab',
+    'averaged_perceptron_tagger',
+    'averaged_perceptron_tagger_eng',
+):
+    # Each resource gets its own try so one failure does not skip the rest.
+    try:
+        nltk.download(_resource, quiet=True)
+    except Exception as exc:
+        # Best effort only: the data may already be installed locally, so a
+        # failed download (e.g. no network) must not abort the import.
+        # `Exception` rather than a bare `except` keeps Ctrl-C / SystemExit
+        # working.
+        warnings.warn(f"Could not download NLTK resource {_resource!r}: {exc}")
+class TextProcessor:
+    """Score and rank abstracts against a free-text question.
+
+    Three signals are combined into one relevance score per abstract:
+    TF-IDF cosine similarity, Jaccard overlap of noun/adjective tokens, and
+    Jaccard overlap of the WordNet synonyms of those tokens.
+    """
+
+    # Default contribution of each signal to the combined relevance score.
+    DEFAULT_WEIGHTS = {
+        'tfidf': 0.5,
+        'term_matching': 0.3,
+        'synonym_matching': 0.2,
+    }
+
+    def __init__(self):
+        """Initialize the text processor with TF-IDF vectorizer"""
+        self.vectorizer = TfidfVectorizer(
+            stop_words='english',
+            ngram_range=(1, 2),
+            max_features=10000,
+        )
+
+    def preprocess_text(self, text):
+        """Lower-case and tokenize *text* and pick out noun/adjective tokens.
+
+        Args:
+            text: Raw input string. ``None`` or ``""`` is treated as empty
+                input instead of raising.
+
+        Returns:
+            A dict with two keys:
+            ``'processed_text'`` - the lower-cased tokens joined by spaces;
+            ``'medical_terms'`` - the tokens whose POS tag starts with ``NN``
+            or ``JJ``, in order of appearance (duplicates kept).
+        """
+        if not text:
+            # Nothing to tokenize; skip the NLTK calls entirely.
+            return {'processed_text': '', 'medical_terms': []}
+        tokens = word_tokenize(text.lower())
+        pos_tags = nltk.pos_tag(tokens)
+        # Nouns and adjectives are used as a cheap proxy for medical terms.
+        medical_terms = [word for word, tag in pos_tags if tag.startswith(('NN', 'JJ'))]
+        return {
+            'processed_text': ' '.join(tokens),
+            'medical_terms': medical_terms,
+        }
+
+    def get_synonyms(self, term):
+        """Return the unique WordNet lemma names for *term*, sorted.
+
+        Args:
+            term: Word to look up. An empty or ``None`` term yields ``[]``.
+
+        Returns:
+            A sorted list of distinct lemma names taken from every synset
+            WordNet returns for *term*.
+        """
+        if not term:
+            return []
+        lemma_names = {
+            lemma.name()
+            for synset in wordnet.synsets(term)
+            for lemma in synset.lemmas()
+        }
+        # Sorted so the result is deterministic rather than set-ordered.
+        return sorted(lemma_names)
+
+    @staticmethod
+    def _jaccard(left, right):
+        """Return the Jaccard similarity of two sets (0.0 when both are empty)."""
+        union = left | right
+        if not union:
+            return 0.0
+        return len(left & right) / len(union)
+
+    def _expand_synonyms(self, terms, cache):
+        """Return the union of synonyms of *terms*, memoizing lookups in *cache*."""
+        expanded = set()
+        for term in terms:
+            if term not in cache:
+                cache[term] = self.get_synonyms(term)
+            expanded.update(cache[term])
+        return expanded
+
+    def calculate_relevance_scores(self, question, abstracts, weights=None):
+        """Calculate one combined relevance score per abstract.
+
+        Args:
+            question: The question text.
+            abstracts: Iterable of abstract strings.
+            weights: Optional dict overriding any of the ``DEFAULT_WEIGHTS``
+                keys (``'tfidf'``, ``'term_matching'``,
+                ``'synonym_matching'``).
+
+        Returns:
+            A 1-D ``numpy`` array with one score per abstract, in input
+            order. An empty array is returned when there are no abstracts.
+        """
+        abstracts = list(abstracts)
+        if not abstracts:
+            # cosine_similarity rejects a matrix with zero rows, so
+            # short-circuit instead of letting it raise.
+            return np.array([], dtype=float)
+        weights = {**self.DEFAULT_WEIGHTS, **(weights or {})}
+
+        proc_question = self.preprocess_text(question)
+        question_terms = set(proc_question['medical_terms'])
+
+        # 1. TF-IDF similarity: row 0 is the question, the rest are abstracts.
+        tfidf_matrix = self.vectorizer.fit_transform(
+            [proc_question['processed_text'], *abstracts]
+        )
+        tfidf_scores = cosine_similarity(tfidf_matrix[0:1], tfidf_matrix[1:])[0]
+
+        # Tokenize/POS-tag every abstract once; both signals below reuse it.
+        abstract_term_sets = [
+            set(self.preprocess_text(abstract)['medical_terms'])
+            for abstract in abstracts
+        ]
+
+        # 2. Medical term matching (Jaccard similarity between term sets).
+        term_scores = [
+            self._jaccard(question_terms, terms) for terms in abstract_term_sets
+        ]
+
+        # 3. Synonym matching; the cache avoids repeated WordNet lookups for
+        # terms that occur in several abstracts.
+        synonym_cache = {}
+        question_synonyms = self._expand_synonyms(question_terms, synonym_cache)
+        synonym_scores = [
+            self._jaccard(question_synonyms, self._expand_synonyms(terms, synonym_cache))
+            for terms in abstract_term_sets
+        ]
+
+        # Combine scores with weights
+        return (
+            weights['tfidf'] * np.asarray(tfidf_scores, dtype=float)
+            + weights['term_matching'] * np.asarray(term_scores, dtype=float)
+            + weights['synonym_matching'] * np.asarray(synonym_scores, dtype=float)
+        )
+
+    def find_most_relevant_abstracts(self, question, abstracts, top_k=5):
+        """Find the most relevant abstracts for a given question.
+
+        Args:
+            question: The question text.
+            abstracts: Iterable of abstract strings.
+            top_k: Maximum number of abstracts to return. Values <= 0 yield
+                an empty result; values above the number of abstracts are
+                clamped.
+
+        Returns:
+            A dict with ``'top_indices'`` (indices into *abstracts*, best
+            first), ``'scores'`` (the matching scores) and
+            ``'processed_question'`` (the original question, an unchanged
+            ``'corrected'`` copy, and its extracted ``'medical_entities'``).
+        """
+        scores = self.calculate_relevance_scores(question, abstracts)
+        # Clamp explicitly: a plain `[-top_k:]` slice returns *everything*
+        # for top_k == 0 and a meaningless subset for negative values.
+        limit = max(0, min(top_k, len(scores)))
+        top_indices = np.argsort(scores)[::-1][:limit]
+        proc_question = self.preprocess_text(question)
+        return {
+            'top_indices': top_indices.tolist(),
+            'scores': scores[top_indices].tolist(),
+            'processed_question': {
+                'original': question,
+                'corrected': question,  # No spell checking in this version
+                'medical_entities': proc_question['medical_terms'],
+            },
+        }