import nltk
import numpy as np
from nltk.corpus import wordnet
from nltk.tokenize import word_tokenize
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

# Download required NLTK data (no-ops if already present; newer NLTK
# releases look up the *_tab / *_eng variants, so fetch those as well)
try:
    nltk.download('wordnet', quiet=True)
    nltk.download('punkt', quiet=True)
    nltk.download('punkt_tab', quiet=True)
    nltk.download('averaged_perceptron_tagger', quiet=True)
    nltk.download('averaged_perceptron_tagger_eng', quiet=True)
except Exception:
    # Offline environments still work if the corpora were installed beforehand
    pass

class TextProcessor:
    def __init__(self):
        """Initialize the text processor with a TF-IDF vectorizer"""
        self.vectorizer = TfidfVectorizer(
            stop_words='english',   # drop common English stop words
            ngram_range=(1, 2),     # use unigrams and bigrams
            max_features=10000      # cap the vocabulary size
        )

    def preprocess_text(self, text):
        """Lowercase and tokenize text, extracting nouns and adjectives as candidate medical terms"""
        # Convert to lower case
        text = text.lower()
        # Tokenize
        tokens = word_tokenize(text)
        # Get POS tags
        pos_tags = nltk.pos_tag(tokens)
        # Keep nouns and adjectives, since medical terms are usually tagged NN*/JJ*
        medical_terms = [word for word, tag in pos_tags if tag.startswith(('NN', 'JJ'))]
        return {
            'processed_text': ' '.join(tokens),
            'medical_terms': medical_terms
        }

    def get_synonyms(self, term):
        """Get synonyms for a term using WordNet"""
        synonyms = set()
        for syn in wordnet.synsets(term):
            for lemma in syn.lemmas():
                # WordNet joins multiword lemmas with underscores (e.g. 'heart_attack')
                synonyms.add(lemma.name())
        return list(synonyms)

    def calculate_relevance_scores(self, question, abstracts):
        """Calculate relevance scores by combining TF-IDF, term, and synonym similarity"""
        # Preprocess the question and each abstract once up front
        proc_question = self.preprocess_text(question)
        proc_abstracts = [self.preprocess_text(abstract) for abstract in abstracts]

        # 1. TF-IDF Similarity: cosine similarity between the question and every abstract
        tfidf_matrix = self.vectorizer.fit_transform(
            [proc_question['processed_text']] + abstracts
        )
        tfidf_scores = cosine_similarity(tfidf_matrix[0:1], tfidf_matrix[1:])[0]

        # 2. Medical Term Matching: Jaccard similarity between extracted term sets
        question_terms = set(proc_question['medical_terms'])
        term_scores = []
        for proc in proc_abstracts:
            abstract_terms = set(proc['medical_terms'])
            union = question_terms | abstract_terms
            score = len(question_terms & abstract_terms) / len(union) if union else 0.0
            term_scores.append(score)

        # 3. Synonym Matching: Jaccard similarity between WordNet synonym sets
        question_synonyms = set()
        for term in question_terms:
            question_synonyms.update(self.get_synonyms(term))

        synonym_scores = []
        for proc in proc_abstracts:
            abstract_synonyms = set()
            for term in set(proc['medical_terms']):
                abstract_synonyms.update(self.get_synonyms(term))
            union = question_synonyms | abstract_synonyms
            score = len(question_synonyms & abstract_synonyms) / len(union) if union else 0.0
            synonym_scores.append(score)

        # Combine the three signals with fixed weights
        weights = {
            'tfidf': 0.5,
            'term_matching': 0.3,
            'synonym_matching': 0.2
        }
        return (
            weights['tfidf'] * tfidf_scores
            + weights['term_matching'] * np.array(term_scores)
            + weights['synonym_matching'] * np.array(synonym_scores)
        )

    def find_most_relevant_abstracts(self, question, abstracts, top_k=5):
        """Find the most relevant abstracts for a given question"""
        # Calculate relevance scores
        scores = self.calculate_relevance_scores(question, abstracts)
        
        # Get indices of top_k highest scoring abstracts
        top_indices = np.argsort(scores)[-top_k:][::-1]
        
        # Process question for medical terms
        proc_question = self.preprocess_text(question)
        
        return {
            'top_indices': top_indices.tolist(),
            'scores': scores[top_indices].tolist(),
            'processed_question': {
                'original': question,
                'corrected': question,  # No spell checking in this version
                'medical_entities': proc_question['medical_terms']
            }
        }
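

# Minimal usage sketch: a hypothetical driver showing how TextProcessor ranks
# abstracts against a question. The sample question and abstracts below are
# invented for illustration and are not part of the module's data.
if __name__ == "__main__":
    processor = TextProcessor()
    sample_abstracts = [
        "Aspirin therapy reduces the risk of myocardial infarction in adults.",
        "Vitamin D supplementation and bone density in postmenopausal women.",
        "Beta blockers after acute myocardial infarction improve survival.",
    ]
    results = processor.find_most_relevant_abstracts(
        "Does aspirin prevent heart attacks?", sample_abstracts, top_k=2
    )
    # Print the top-ranked abstracts with their combined relevance scores
    for idx, score in zip(results['top_indices'], results['scores']):
        print(f"{score:.3f}  {sample_abstracts[idx]}")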