pendar02 committed
Commit 7bd75d7 · verified · 1 Parent(s): 06d0182

Update app.py

Files changed (1): app.py +141 -167
app.py CHANGED
@@ -27,61 +27,17 @@ if 'processing_started' not in st.session_state:
 if 'focused_summary_generated' not in st.session_state:
     st.session_state.focused_summary_generated = False
 
-def preprocess_text(text):
-    """Preprocess text for summarization"""
-    if not isinstance(text, str) or not text.strip():
-        return text
-
-    # Clean up whitespace
-    text = re.sub(r'\s+', ' ', text)
-    text = text.strip()
-
-    # Fix common formatting issues
-    text = re.sub(r'(\d+)\s*%', r'\1%', text)  # Fix percentage format
-    text = re.sub(r'\(\s*([Nn])\s*=\s*(\d+)\s*\)', r'(n=\2)', text)  # Fix sample size format
-    text = re.sub(r'([Pp])\s*([<>])\s*(\d)', r'\1\2\3', text)  # Fix p-value format
-
-    return text
-
-def verify_facts(summary, original_text):
-    """Verify key facts between summary and original text"""
-    # Extract numbers and percentages
-    def extract_numbers(text):
-        return set(re.findall(r'(\d+\.?\d*)%?', text))
-
-    # Extract relationships
-    def extract_relationships(text):
-        patterns = [
-            r'associated with', r'predicted', r'correlated',
-            r'increased', r'decreased', r'significant'
-        ]
-        found = []
-        for pattern in patterns:
-            if re.search(pattern, text.lower()):
-                found.append(pattern)
-        return set(found)
-
-    # Get facts from both texts
-    original_numbers = extract_numbers(original_text)
-    summary_numbers = extract_numbers(summary)
-    original_relations = extract_relationships(original_text)
-    summary_relations = extract_relationships(summary)
-
-    return {
-        'is_valid': summary_numbers.issubset(original_numbers) and
-                    summary_relations.issubset(original_relations),
-        'missing_numbers': original_numbers - summary_numbers,
-        'missing_relations': original_relations - summary_relations
-    }
-
 def load_model(model_type):
     """Load appropriate model based on type with proper memory management"""
     try:
+        # Clear any existing cached data
         gc.collect()
         torch.cuda.empty_cache()
-        device = "cpu"
+
+        device = "cpu"  # Force CPU usage
 
         if model_type == "summarize":
+            # Load the new fine-tuned model directly
             model = AutoModelForSeq2SeqLM.from_pretrained(
                 "pendar02/bart-large-pubmedd",
                 cache_dir="./models",
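Note on the hunk above: `torch.cuda.empty_cache()` is harmless when CUDA has never been touched, but since this commit forces `device = "cpu"` anyway, guarding the call makes the intent explicit and avoids surprises on CPU-only builds. A minimal sketch of that cleanup pattern (the `free_memory` name is illustrative, not part of this commit):

```python
import gc

import torch

def free_memory():
    """Collect Python garbage, then release cached CUDA blocks if a GPU exists."""
    gc.collect()
    if torch.cuda.is_available():
        torch.cuda.empty_cache()
```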
@@ -92,7 +48,7 @@ def load_model(model_type):
                 "pendar02/bart-large-pubmedd",
                 cache_dir="./models"
             )
-        else:
+        else:  # question_focused
             base_model = AutoModelForSeq2SeqLM.from_pretrained(
                 "GanjinZero/biobart-base",
                 cache_dir="./models",
@@ -117,6 +73,7 @@
         raise
 
 def cleanup_model(model, tokenizer):
+    """Properly cleanup model resources"""
     try:
         del model
         del tokenizer
@@ -125,12 +82,15 @@ def cleanup_model(model, tokenizer):
     except Exception:
         pass
 
+@st.cache_data
 def process_excel(uploaded_file):
+    """Process uploaded Excel file"""
     try:
         df = pd.read_excel(uploaded_file)
         required_columns = ['Abstract', 'Article Title', 'Authors',
                             'Source Title', 'Publication Year', 'DOI', 'Times Cited, All Databases']
 
+        # Check required columns
         missing_columns = [col for col in required_columns if col not in df.columns]
         if missing_columns:
             st.error(f"Missing required columns: {', '.join(missing_columns)}")
@@ -141,111 +101,119 @@ def process_excel(uploaded_file):
         st.error(f"Error processing file: {str(e)}")
         return None
 
+def preprocess_text(text):
+    """Preprocess text to add appropriate formatting before summarization"""
+    if not isinstance(text, str) or not text.strip():
+        return text
+
+    # Split text into sentences (basic implementation)
+    sentences = [s.strip() for s in text.replace('. ', '.\n').split('\n')]
+
+    # Remove empty sentences
+    sentences = [s for s in sentences if s]
+
+    # Join with proper line breaks
+    formatted_text = '\n'.join(sentences)
+
+    return formatted_text
+
+def post_process_summary(summary):
+    """Clean up and improve summary coherence"""
+    if not summary:
+        return summary
+
+    # Split into sentences
+    sentences = [s.strip() for s in summary.split('.')]
+    sentences = [s for s in sentences if s]  # Remove empty sentences
+
+    # Fix common issues
+    processed_sentences = []
+    for i, sentence in enumerate(sentences):
+        # Remove redundant words/phrases
+        sentence = sentence.replace(" and and ", " and ")
+        sentence = sentence.replace("appointment and appointment", "appointment")
+
+        # Fix common grammatical issues
+        sentence = sentence.replace("Cancers distress", "Cancer distress")
+        sentence = sentence.replace("  ", " ")  # Remove double spaces
+
+        # Capitalize first letter of each sentence
+        sentence = sentence.capitalize()
+
+        # Add to processed sentences if not empty
+        if sentence.strip():
+            processed_sentences.append(sentence)
+
+    # Join sentences with proper spacing and punctuation
+    cleaned_summary = '. '.join(processed_sentences)
+    if cleaned_summary and not cleaned_summary.endswith('.'):
+        cleaned_summary += '.'
+
+    return cleaned_summary
+
 def improve_summary_generation(text, model, tokenizer):
     """Generate improved summary with better prompt and validation"""
     if not isinstance(text, str) or not text.strip():
         return "No abstract available to summarize."
 
-    try:
-        # Simplified prompt
-        formatted_text = (
-            "Summarize this biomedical abstract into four sections:\n"
-            "1. Background/Objectives: State the main purpose and population\n"
-            "2. Methods: Describe what was done\n"
-            "3. Key findings: Include ALL numerical results and statistical relationships\n"
-            "4. Conclusions: State main implications\n\n"
-            "Important: Preserve all numbers, measurements, and statistical findings.\n\n"
-            "Text: " + preprocess_text(text)
-        )
-
-        inputs = tokenizer(formatted_text, return_tensors="pt", max_length=1024, truncation=True)
-        inputs = {k: v.to(model.device) for k, v in inputs.items()}
-
-        # Single generation attempt with optimized parameters
+    # Add a more specific prompt
+    formatted_text = (
+        "Summarize this medical research paper following this structure exactly:\n"
+        "1. Background and objectives\n"
+        "2. Methods\n"
+        "3. Key findings with specific numbers/percentages\n"
+        "4. Main conclusions\n"
+        "Original text: " + preprocess_text(text)
+    )
+
+    # Adjust generation parameters
+    inputs = tokenizer(formatted_text, return_tensors="pt", max_length=1024, truncation=True)
+    inputs = {k: v.to(model.device) for k, v in inputs.items()}
+
+    with torch.no_grad():
+        summary_ids = model.generate(
+            **{
+                "input_ids": inputs["input_ids"],
+                "attention_mask": inputs["attention_mask"],
+                "max_length": 200,
+                "min_length": 50,
+                "num_beams": 5,
+                "length_penalty": 1.5,
+                "no_repeat_ngram_size": 3,
+                "temperature": 0.7,
+                "repetition_penalty": 1.5
+            }
+        )
+
+    summary = tokenizer.decode(summary_ids[0], skip_special_tokens=True)
+
+    # Post-process the summary
+    processed_summary = post_process_summary(summary)
+
+    # Validate the summary
+    if not validate_summary(processed_summary, text):
+        # If validation fails, try one more time with different parameters
         with torch.no_grad():
             summary_ids = model.generate(
                 **{
                     "input_ids": inputs["input_ids"],
                     "attention_mask": inputs["attention_mask"],
-                    "max_length": 300,
-                    "min_length": 100,
-                    "num_beams": 5,
+                    "max_length": 200,
+                    "min_length": 50,
+                    "num_beams": 4,
                     "length_penalty": 2.0,
-                    "no_repeat_ngram_size": 3,
-                    "temperature": 0.3,
-                    "repetition_penalty": 2.5
+                    "no_repeat_ngram_size": 4,
+                    "temperature": 0.8,
+                    "repetition_penalty": 2.0
                 }
             )
-
         summary = tokenizer.decode(summary_ids[0], skip_special_tokens=True)
-        if not summary:
-            return "Error: Could not generate summary."
+        processed_summary = post_process_summary(summary)
 
-        return post_process_summary(summary)
-
-    except Exception as e:
-        print(f"Error in summary generation: {str(e)}")
-        return "Error generating summary."
-
-def post_process_summary(summary):
-    """Enhanced post-processing focused on maintaining structure and removing artifacts"""
-    if not summary:
-        return summary
-
-    # Clean up section headers
-    header_mappings = {
-        r'(?i)background.*objectives?:?': 'Background and objectives:',
-        r'(?i)(materials?\s*and\s*)?methods?:?': 'Methods:',
-        r'(?i)(key\s*)?findings?:?|results?:?': 'Key findings:',
-        r'(?i)conclusions?:?': 'Conclusions:',
-        r'(?i)(study\s*)?aims?:?|goals?:?|purpose:?': '',
-        r'(?i)objectives?:?': '',
-        r'(?i)outcomes?:?': '',
-        r'(?i)discussion:?': ''
-    }
-
-    for pattern, replacement in header_mappings.items():
-        summary = re.sub(pattern, replacement, summary)
-
-    # Split into sections and clean
-    sections = re.split(r'(?i)(Background and objectives:|Methods:|Key findings:|Conclusions:)', summary)
-    sections = [s.strip() for s in sections if s.strip()]
-
-    # Reorganize sections
-    organized_sections = {
-        'Background and objectives': '',
-        'Methods': '',
-        'Key findings': '',
-        'Conclusions': ''
-    }
-
-    current_section = None
-    for item in sections:
-        if item in organized_sections:
-            current_section = item
-        elif current_section:
-            # Clean up content
-            content = re.sub(r'\s+', ' ', item)  # Fix spacing
-            content = re.sub(r'\.+', '.', content)  # Fix multiple periods
-            content = content.strip('.: ')  # Remove trailing periods and spaces
-            organized_sections[current_section] = content
-
-    # Build final summary
-    final_sections = []
-    for section, content in organized_sections.items():
-        if content:
-            final_sections.append(f"{section} {content}.")
-
-    return '\n\n'.join(final_sections)
+    return processed_summary
 
 def validate_summary(summary, original_text):
     """Validate summary content against original text"""
-    # Perform fact verification
-    verification = verify_facts(summary, original_text)
-
-    if not verification.get('is_valid', False):
-        return False
-
     # Check for age inconsistencies
     age_mentions = re.findall(r'(\d+\.?\d*)\s*years?', summary.lower())
     if len(age_mentions) > 1:  # Multiple age mentions
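One caveat on the generation parameters above: in Hugging Face `transformers`, `temperature` only applies when sampling is enabled, so with beam search and the default `do_sample=False` the `temperature=0.7` / `0.8` values have no effect (newer library versions warn about exactly this). If sampled output were actually intended, the call would need something like the following sketch, which reuses `model` and `inputs` from the hunk above (`top_p` is an illustrative addition, not from this commit):

```python
with torch.no_grad():
    summary_ids = model.generate(
        input_ids=inputs["input_ids"],
        attention_mask=inputs["attention_mask"],
        max_length=200,
        min_length=50,
        do_sample=True,       # required for temperature to take effect
        temperature=0.7,
        top_p=0.9,            # illustrative nucleus-sampling cutoff
        no_repeat_ngram_size=3,
        repetition_penalty=1.5,
    )
```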
@@ -267,40 +235,34 @@
 
 def generate_focused_summary(question, abstracts, model, tokenizer):
     """Generate focused summary based on question"""
-    try:
-        # Preprocess each abstract
-        formatted_abstracts = [preprocess_text(abstract) for abstract in abstracts]
-        combined_input = f"Question: {question}\nSummarize these abstracts to answer the question:\n" + \
-                         "\n---\n".join(formatted_abstracts)
-
-        inputs = tokenizer(combined_input, return_tensors="pt", max_length=1024, truncation=True)
-        inputs = {k: v.to(model.device) for k, v in inputs.items()}
-
-        with torch.no_grad():
-            summary_ids = model.generate(
-                **{
-                    "input_ids": inputs["input_ids"],
-                    "attention_mask": inputs["attention_mask"],
-                    "max_length": 300,
-                    "min_length": 100,
-                    "num_beams": 5,
-                    "length_penalty": 2.0,
-                    "temperature": 0.3,
-                    "repetition_penalty": 2.5
-                }
-            )
-
-        return tokenizer.decode(summary_ids[0], skip_special_tokens=True)
-
-    except Exception as e:
-        print(f"Error in focused summary generation: {str(e)}")
-        return "Error generating focused summary."
+    # Preprocess each abstract
+    formatted_abstracts = [preprocess_text(abstract) for abstract in abstracts]
+    combined_input = f"Question: {question} Abstracts: " + " [SEP] ".join(formatted_abstracts)
+
+    inputs = tokenizer(combined_input, return_tensors="pt", max_length=1024, truncation=True)
+    inputs = {k: v.to(model.device) for k, v in inputs.items()}
+
+    with torch.no_grad():
+        summary_ids = model.generate(
+            **{
+                "input_ids": inputs["input_ids"],
+                "attention_mask": inputs["attention_mask"],
+                "max_length": 200,
+                "min_length": 50,
+                "num_beams": 4,
+                "length_penalty": 2.0,
+                "early_stopping": True
+            }
+        )
+
+    return tokenizer.decode(summary_ids[0], skip_special_tokens=True)
 
 def create_filter_controls(df, sort_column):
     """Create appropriate filter controls based on the selected column"""
     filtered_df = df.copy()
 
     if sort_column == 'Publication Year':
+        # Year range slider
         year_min = int(df['Publication Year'].min())
         year_max = int(df['Publication Year'].max())
         col1, col2 = st.columns(2)
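In `generate_focused_summary` above, `" [SEP] ".join(...)` inserts the literal text " [SEP] " between abstracts; BART-family tokenizers such as BioBART use `</s>` as their separator, so `[SEP]` is tokenized as ordinary words rather than a special token. A sketch of how the tokenizer's own separator could be used instead (reusing `question`, `formatted_abstracts`, and `tokenizer` from the hunk):

```python
# Fall back to the literal marker if the tokenizer defines no separator.
sep = tokenizer.sep_token or "[SEP]"  # "</s>" for BART-family tokenizers
combined_input = f"Question: {question} Abstracts: " + f" {sep} ".join(formatted_abstracts)
```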
@@ -320,6 +282,7 @@
         ]
 
     elif sort_column == 'Authors':
+        # Multi-select for authors
         unique_authors = sorted(set(
             author.strip()
             for authors in df['Authors'].dropna()
@@ -337,6 +300,7 @@
         ]
 
     elif sort_column == 'Source Title':
+        # Multi-select for source titles
         unique_sources = sorted(df['Source Title'].unique())
         selected_sources = st.multiselect(
             'Select Sources',
@@ -345,7 +309,13 @@
         if selected_sources:
             filtered_df = filtered_df[filtered_df['Source Title'].isin(selected_sources)]
 
+    elif sort_column == 'Article Title':
+        # Only alphabetical sorting, no filtering
+        pass
+
+
     elif sort_column == 'Times Cited':
+        # Cited count range slider
         cited_min = int(df['Times Cited'].min())
         cited_max = int(df['Times Cited'].max())
         col1, col2 = st.columns(2)
@@ -369,16 +339,19 @@
 def main():
     st.title("🔬 Biomedical Papers Analysis")
 
+    # File upload section
     uploaded_file = st.file_uploader(
         "Upload Excel file containing papers",
         type=['xlsx', 'xls'],
         help="File must contain: Abstract, Article Title, Authors, Source Title, Publication Year, DOI"
     )
 
+    # Question input - moved up but hidden initially
     question_container = st.empty()
     question = ""
 
     if uploaded_file is not None:
+        # Process Excel file
        if st.session_state.processed_data is None:
            with st.spinner("Processing file..."):
                df = process_excel(uploaded_file)
@@ -389,16 +362,17 @@ def main():
         df = st.session_state.processed_data
         st.write(f"📊 Loaded {len(df)} papers with abstracts")
 
+        # Get question before processing
         with question_container:
             question = st.text_input(
                 "Enter your research question (optional):",
-                help="If provided, a focused summary will be generated after individual summaries"
+                help="If provided, a question-focused summary will be generated after individual summaries"
             )
 
         # Single button for both processes
-        if not st.session_state.get('processing_started', False):
-            if st.button("Start Analysis"):
-                st.session_state.processing_started = True
+        if not st.session_state.get('processing_started', False):
+            if st.button("Start Analysis"):
+                st.session_state.processing_started = True
 
         # Show processing status and results
         if st.session_state.get('processing_started', False):
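The button-plus-flag change at the end of the diff is the usual Streamlit idiom for a one-shot action: the click sets `processing_started` in `st.session_state`, and every subsequent rerun branches on the flag instead of the button. A minimal self-contained sketch of that gate (`st.rerun()` is an optional extra, not part of this commit):

```python
import streamlit as st

if "processing_started" not in st.session_state:
    st.session_state.processing_started = False

if not st.session_state.processing_started:
    if st.button("Start Analysis"):
        st.session_state.processing_started = True
        st.rerun()  # rerun immediately so the gated branch below executes

if st.session_state.processing_started:
    st.write("Processing...")
```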
 