Spaces:

pendar02
/

biomedical

Sleeping

App Files Community

pendar02 committed on Jan 26

Commit

dee9a31

verified ·

1 Parent(s): 3ffe379

Update app.py

Browse files

Files changed (1) hide show

app.py +220 -152

app.py CHANGED Viewed

@@ -23,6 +23,10 @@ st.set_page_config(
 )
 # Initialize session state
 if 'processed_data' not in st.session_state:
     st.session_state.processed_data = None
 if 'summaries' not in st.session_state:
@@ -39,6 +43,8 @@ if 'current_tokenizer' not in st.session_state:
     st.session_state.current_tokenizer = None
 if 'model_type' not in st.session_state:
     st.session_state.model_type = None
 # TextProcessor class definition
@@ -193,142 +199,156 @@ def validate_excel_structure(df):
     return len(validation_messages) == 0, validation_messages
 def preprocess_text(text):
-    """Enhanced text preprocessing with improved header and list handling"""
     if not isinstance(text, str) or not text.strip():
         return text
-    # Initial cleanup
-    text = re.sub(r'\s+', ' ', text.strip())
-    # Standardize case for specific terms (e.g., PRIME -> Prime)
-    text = re.sub(r'\b([A-Z]{2,})\b', lambda m: m.group(1).title(), text)
-    # Fix spacing around punctuation and parentheses
-    text = re.sub(r'\s*:\s*', ': ', text)
-    text = re.sub(r'\s*,\s*', ', ', text)
-    text = re.sub(r'\(\s*([ivx\d]+)\s*\)', r'(\1)', text)
-    # Convert numbered lists to consistent format
-    text = re.sub(r'(?m)^\s*(\d+)\.\s*', r'(\1) ', text)
-    # Normalize section headers (using comprehensive patterns)
     section_patterns = {
-        r'\b(?:Introduction|Background|Objectives|Purpose|Context)\s*:': 'Background and Objectives: ',
-        r'\b(?:Methods|Materials and Methods|Approach|Study Design|Experimental Design)\s*:': 'Methods: ',
-        r'\b(?:Results|Findings|Observations|Key Findings)\s*:': 'Results: ',
-        r'\b(?:Discussion|Analysis|Implications|Interpretation)\s*:': 'Discussion: ',
-        r'\b(?:Conclusion|Conclusions|Summary|Final Remarks)\s*:': 'Conclusions: '
     }
-    # Remove nested headers
-    nested_header_pattern = r'\d+\.\s*(?:Background|Objectives|Methods|Results|Discussion|Conclusions)\s*:'
-    text = re.sub(nested_header_pattern, '', text)
-    # Standardize section headers
     for pattern, replacement in section_patterns.items():
         text = re.sub(pattern, replacement, text, flags=re.IGNORECASE)
-    # Split merged section headers
-    text = re.sub(r'(?i)Results\s+and\s+Conclusions:', 'Results: ', text)
-    # Handle special characters and normalize spacing
-    text = re.sub(r'[“”]', '"', text)  # Correctly handle double quotes
-    text = re.sub(r"[‘’]", "'", text)  # Correctly handle single quotes
-    text = re.sub(r'\s*-\s*', '-', text)
-    # Tokenize and capitalize sentences
-    sentences = re.split(r'(?<=\w[.!?])\s+|\n(?=\d+\.|\(\w+\)|-)', text)
-    formatted_sentences = [s.strip().capitalize() for s in sentences if s.strip()]
-    return ' '.join(formatted_sentences)
-def post_process_summary(summary):
-    """Enhanced summary post-processing with improved formatting."""
-    if not summary:
-        return summary
-    # Step 1: Remove empty or redundant headers
-    summary = re.sub(r'\b(?:Background|Objectives|Methods|Results|Conclusions)\s*:\s*\.?\s*', '', summary)
-    # Step 2: Fix spacing issues in lists and parentheses
-    summary = re.sub(r'\(\s*([ivx\d]+)\s*\)', r'(\1)', summary)  # Fix space inside parentheses
-    summary = re.sub(r'\s*,\s*(\([ivx\d]+\))', r', \1', summary)  # Fix spacing before list items
-    # Step 3: Ensure proper punctuation and spacing
-    summary = re.sub(r'(?<=[.!?])\s*([A-Z])', r' \1', summary)  # Add space after punctuation
-    summary = re.sub(r'\s*:\s*', ': ', summary)  # Fix spacing around colons
-    # Step 4: Remove sections with too little content
-    sections = [s.strip() for s in summary.split('\n') if len(s.split()) > 3]
-    summary = ' '.join(sections)
-    # Step 5: Remove multiple periods
-    summary = re.sub(r'\.\.+', '.', summary)
-    # Step 6: Ensure summary ends with a single period
-    summary = summary.strip()
-    if not summary.endswith('.'):
-        summary += '.'
-    return summary
 def generate_focused_summary(question, abstracts, model, tokenizer):
-    """Generate a structured summary based on the given question and abstracts."""
-    # Preprocess and clean abstracts
     formatted_abstracts = [preprocess_text(abstract) for abstract in abstracts if abstract.strip()]
-    if not formatted_abstracts:
-        raise ValueError("Abstracts list is empty or improperly formatted.")
-    # Join abstracts with separator
     abstracts_content = " [SEP] ".join(formatted_abstracts)
-    # Create the prompt
     prompt = f"""
-    Generate a structured summary based on the given abstracts and the question. Follow these rules STRICTLY:
-    **QUESTION:** {question}
-    **SECTION FORMATTING RULES:**
-    1. Each section MUST start with the section name followed by ": " (e.g., "Background: ").
-    2. Each section MUST end with a period.
-    3. Write complete, grammatically correct sentences.
-    4. Do not use bullet points, lists, or combined section headers.
-    5. Maintain the exact order of sections: Background, Objectives, Methods, Results, Conclusions.
-    6. Avoid redundancies, incomplete thoughts, and cutting sentences mid-way.
-    7. Use transition words (e.g., "Additionally," "Furthermore," "Moreover") to connect ideas naturally.
-    **REQUIRED SECTIONS AND CONTENT:**
-    1. **Background**:
-       - Provide the context and motivation for the study.
-       - Do not mention objectives, methods, or results in this section.
-    2. **Objectives**:
-       - Clearly state the aim(s) of the study.
-       - Avoid referencing any methods or findings.
-    3. **Methods**:
-       - Describe the approach, tools, and procedures used.
-       - Do not include any findings or results in this section.
-    4. **Results**:
-       - Summarize the key findings, including relevant statistics and outcomes.
-       - Mention implications only if explicitly stated in the abstracts.
-    5. **Conclusions**:
-       - Highlight the overall interpretation of findings.
-       - Emphasize the significance and implications of the study.
-    **CRITICAL FORMAT RULES:**
-    1. Each section title must be followed by a colon and a space.
-    2. All sentences must be grammatically complete and coherent.
-    3. Avoid bullet points, lists, and repeated sections.
-    4. End each section with a period.
-    **INPUT ABSTRACTS:** {abstracts_content}
-    """
-    # Tokenize input (use the correct variable `prompt` here)
-    inputs = tokenizer(prompt,
-                       return_tensors="pt",
-                       max_length=1024,
-                       truncation=True)
     inputs = {k: v.to(model.device) for k, v in inputs.items()}
     with torch.no_grad():
@@ -336,20 +356,80 @@ def generate_focused_summary(question, abstracts, model, tokenizer):
             **{
                 "input_ids": inputs["input_ids"],
                 "attention_mask": inputs["attention_mask"],
-                "max_length": 280,
-                "min_length": 100,
                 "num_beams": 4,
                 "length_penalty": 2.0,
-                "no_repeat_ngram_size": 2,
                 "temperature": 0.7,
                 "do_sample": False
             }
         )
-    summary = tokenizer.decode(summary_ids[0], skip_special_tokens=True)
     return post_process_summary(summary)
 def process_papers_in_batches(df, model, tokenizer, batch_size=2):
     """Process papers in batches for better efficiency"""
@@ -619,54 +699,42 @@ def main():
                         if not st.session_state.get('focused_summary_generated', False):
                             try:
                                 with st.spinner("Analyzing relevant papers..."):
-                                    # Initialize text processor if needed
                                     if st.session_state.text_processor is None:
                                         st.session_state.text_processor = TextProcessor()
-                                    # Validate question
-                                    if not question.strip():
-                                        st.warning("Please enter a question first")
-                                        return
-                                    # Find relevant abstracts
                                     results = st.session_state.text_processor.find_most_relevant_abstracts(
                                         question,
                                         df['Abstract'].tolist(),
                                         top_k=5
                                     )
                                     if not results['top_indices']:
-                                        st.warning("No relevant papers found for your question")
-                                        return
-                                    # Load question-focused model
-                                    model, tokenizer = get_model("question_focused")
-                                    if model is None or tokenizer is None:
                                         return
-                                    # Generate focused summary
-                                    try:
-                                        relevant_abstracts = df['Abstract'].iloc[results['top_indices']].tolist()
-                                        focused_summary = generate_focused_summary(
-                                            question,
-                                            relevant_abstracts,
-                                            model,
-                                            tokenizer
-                                        )
-                                        # Store results
-                                        st.session_state.focused_summary = focused_summary
-                                        st.session_state.relevant_papers = df.iloc[results['top_indices']]
-                                        st.session_state.relevance_scores = results['scores']
-                                        st.session_state.focused_summary_generated = True
-                                    finally:
-                                        # Cleanup second model
-                                        cleanup_model(model, tokenizer)
                             except Exception as e:
                                 st.error(f"Error generating focused summary: {str(e)}")
                                 reset_processing_state()
                     # Display focused summary results
                     if st.session_state.get('focused_summary_generated', False):

 )
 # Initialize session state
+if 'relevant_papers' not in st.session_state:
+    st.session_state.relevant_papers = None
+if 'relevance_scores' not in st.session_state:
+    st.session_state.relevance_scores = None
 if 'processed_data' not in st.session_state:
     st.session_state.processed_data = None
 if 'summaries' not in st.session_state:
     st.session_state.current_tokenizer = None
 if 'model_type' not in st.session_state:
     st.session_state.model_type = None
+if 'focused_summary' not in st.session_state:
+    st.session_state.focused_summary = None
 # TextProcessor class definition
     return len(validation_messages) == 0, validation_messages
 def preprocess_text(text):
+    """Clean biomedical text by handling common formatting issues and standardizing structure."""
     if not isinstance(text, str) or not text.strip():
         return text
+    # Remove extra whitespace
+    text = ' '.join(text.split())
+    # Roman numeral conversion
+    roman_map = {'i': '1', 'ii': '2', 'iii': '3', 'iv': '4', 'v': '5',
+                 'vi': '6', 'vii': '7', 'viii': '8', 'ix': '9', 'x': '10'}
+    def replace_roman(match):
+        roman = match.group(1).lower()
+        return f"({roman_map.get(roman, roman)})"
+    text = re.sub(r'\(([ivx]+)\)', replace_roman, text)
+    # Clean enumerated lists
+    for roman in roman_map:
+        text = re.sub(f"\\b{roman}\\)", f"{roman_map[roman]})", text, flags=re.IGNORECASE)
+    # Standardize section headers
     section_patterns = {
+        r'\b(?:introduction|purpose|background|objectives?|context)\s*:?\s*': 'Background: ',
+        r'\b(?:materials?\s+and\s+methods?|methods?|approach|study\s+design)\s*:?\s*': 'Methods: ',
+        r'\b(?:results?|findings?|observations?)\s*:?\s*': 'Results: ',
+        r'\b(?:conclusions?|summary|final\s+remarks?)\s*:?\s*': 'Conclusions: ',
+        r'\b(?:results?\s+and\s+conclusions?)\s*:?\s*(?=.*?:)': '',  # Remove if followed by another section
+        r'\b(?:results?\s*:\s*and\s*conclusions?\s*:)': 'Results: '  # Fix malformed combination
     }
     for pattern, replacement in section_patterns.items():
         text = re.sub(pattern, replacement, text, flags=re.IGNORECASE)
+    # Ensure complete sentences in sections
+    text = re.sub(r'(?<=:)\s*([^.!?\n]*?)(?=\s*(?:[A-Z][^:]*:|$))',
+                  lambda m: f" {m.group(1)}." if m.group(1) and not m.group(1).strip().endswith('.') else m.group(0),
+                  text)
+    # Fix truncated sentences
+    text = re.sub(r'(?<=:)\s*([^.!?\n]*?)\s*(?=[A-Z][^:]*:)',
+                  lambda m: f" {m.group(1)}." if m.group(1) else "",
+                  text)
+    # Clean formatting
+    text = re.sub(r'[\r\n]+', ' ', text)
+    text = re.sub(r'\s*:\s*', ': ', text)
+    text = re.sub(r'\s+', ' ', text)
+    text = re.sub(r'(?<=[.!?])\s*(?=[A-Z])', ' ', text)
+    text = re.sub(r'•|\*|■|□|→|✓', '', text)
+    text = re.sub(r'\\n|\\r', ' ', text)
+    text = re.sub(r'\s*\(\s*', ' (', text)
+    text = re.sub(r'\s*\)\s*', ') ', text)
+    # Fix statistical notations
+    text = re.sub(r'p\s*[<=>]\s*0\.\d+', lambda m: m.group().replace(' ', ''), text)
+    text = re.sub(r'(?<=\d)\s*%', '%', text)
+    # Fix abbreviations spacing
+    text = re.sub(r'(?<=\w)vs\.(?=\w)', 'vs. ', text)
+    text = re.sub(r'(?<=\w)et\s+al\.(?=\w)', 'et al. ', text)
+    # Remove repeated punctuation
+    text = re.sub(r'([.!?])\1+', r'\1', text)
+    # Final cleanup
+    text = re.sub(r'(?<=[.!?])\s*(?=[A-Z])', ' ', text)
+    text = text.strip()
+    if not text.endswith('.'):
+        text += '.'
+    return text
+#     """Enhanced text preprocessing with better section handling and prompt removal."""
+#     if not isinstance(text, str) or not text.strip():
+#         return text
+#     # Remove prompt leakage
+#     prompt_patterns = [
+#         r'Generate a structured summary addressing this question:.*?(?=\w+:)',
+#         r'Focus on key findings and methods\.',
+#         r'is a structured summary addressing this question:'
+#     ]
+#     for pattern in prompt_patterns:
+#         text = re.sub(pattern, '', text, flags=re.IGNORECASE)
+#     # Clean section headers more aggressively
+#     section_patterns = {
+#         r'\b(?:introduction|purpose|background|objectives?|context)\s*:?\s*': 'Background: ',
+#         r'\b(?:materials?\s+and\s+methods?|methods?|approach|study\s+design)\s*:?\s*': 'Methods: ',
+#         r'\b(?:results?|findings?|observations?)\s*:?\s*': 'Results: ',
+#         r'\b(?:conclusions?|summary|final\s+remarks?)\s*:?\s*': 'Conclusions: '
+#     }
+#     # Apply section normalization
+#     for pattern, replacement in section_patterns.items():
+#         text = re.sub(pattern, replacement, text, flags=re.IGNORECASE)
+#     # Remove combined section headers
+#     combined_headers = [
+#         r'\bmethods?\s+and\s+conclusions?\b',
+#         r'\bresults?\s+and\s+conclusions?\b',
+#         r'\bmaterials?\s+and\s+methods?\b'
+#     ]
+#     for pattern in combined_headers:
+#         text = re.sub(pattern, 'Methods:', text, flags=re.IGNORECASE)
+#     # Clean up sentences
+#     sentences = text.split('.')
+#     cleaned_sentences = []
+#     for sentence in sentences:
+#         # Remove redundant section references
+#         sentence = re.sub(r'\b(?:first|second|third|fourth|fifth)\s+sections?\b', '', sentence, flags=re.IGNORECASE)
+#         # Remove comparative phrases about section details
+#         sentence = re.sub(r'\b(?:more|less)\s+detailed\s+than.*', '', sentence, flags=re.IGNORECASE)
+#         if sentence.strip():
+#             cleaned_sentences.append(sentence.strip())
+#     # Rejoin and format
+#     text = '. '.join(cleaned_sentences)
+#     text = re.sub(r'\s+', ' ', text)  # Remove extra spaces
+#     text = re.sub(r'\s*:\s*', ': ', text)  # Fix spacing around colons
+#     return text.strip()
 def generate_focused_summary(question, abstracts, model, tokenizer):
     formatted_abstracts = [preprocess_text(abstract) for abstract in abstracts if abstract.strip()]
     abstracts_content = " [SEP] ".join(formatted_abstracts)
     prompt = f"""
+    Provide a factual summary structured as:
+    - Background: Context and origin only if present
+    - Methods: Key procedures and approaches
+    - Results: Specific findings with numbers
+    - Conclusions: Main implications
+    Requirements:
+    - Present sections sequentially
+    - Merge related points within sections
+    - Complete all sentences
+    - Avoid repeating section headers
+    - Use original terminology
+    Content: {abstracts_content}
+    """
+    inputs = tokenizer(prompt, return_tensors="pt", max_length=1024, truncation=True)
     inputs = {k: v.to(model.device) for k, v in inputs.items()}
     with torch.no_grad():
             **{
                 "input_ids": inputs["input_ids"],
                 "attention_mask": inputs["attention_mask"],
+                "max_length": 512,
+                "min_length": 200,
                 "num_beams": 4,
                 "length_penalty": 2.0,
+                "no_repeat_ngram_size": 3,
                 "temperature": 0.7,
                 "do_sample": False
             }
         )
+    summary = tokenizer.decode(summary_ids[0], skip_special_tokens=True)
     return post_process_summary(summary)
+def post_process_summary(summary):
+    """Post-process summary with improved section handling and formatting."""
+    if not summary:
+        return summary
+    valid_sections = ['Background', 'Methods', 'Results', 'Conclusions']
+    sections = {}
+    current_section = None
+    current_content = []
+    # Pre-clean section headers
+    summary = re.sub(r'\b(?:results?\s*:\s*and\s*conclusions?\s*:)', 'Results:', summary, flags=re.IGNORECASE)
+    summary = re.sub(r'\bresults?\s*and\s*conclusions?\s*:', 'Results:', summary, flags=re.IGNORECASE)
+    # Process line by line
+    lines = [line.strip() for line in summary.split('.') if line.strip()]
+    for i, line in enumerate(lines):
+        section_match = None
+        for section in valid_sections:
+            if re.match(fr'\b{section}:', line, re.IGNORECASE):
+                section_match = section
+                break
+        if section_match:
+            if current_section:
+                content = ' '.join(current_content)
+                if content:
+                    sections[current_section] = content
+            current_section = section_match
+            content = re.sub(fr'\b{section_match}:\s*', '', line, flags=re.IGNORECASE)
+            current_content = [content] if content else []
+        elif current_section:
+            # Prevent section header splitting
+            if not any(sect.lower() in line.lower() for sect in valid_sections):
+                current_content.append(line)
+    if current_section and current_content:
+        sections[current_section] = ' '.join(current_content)
+    # Format sections
+    formatted_sections = []
+    for section in valid_sections:
+        if section in sections:
+            content = sections[section].strip()
+            if content:
+                # Complete truncated sentences
+                if not re.search(r'[.!?]$', content):
+                    if len(content.split()) >= 3:  # Only complete if substantial
+                        content += '.'
+                # Ensure capitalization
+                content = content[0].upper() + content[1:]
+                # Fix double periods
+                content = re.sub(r'\.+', '.', content)
+                formatted_sections.append(f"{section}: {content}")
+    return ' '.join(formatted_sections)
 def process_papers_in_batches(df, model, tokenizer, batch_size=2):
     """Process papers in batches for better efficiency"""
                         if not st.session_state.get('focused_summary_generated', False):
                             try:
                                 with st.spinner("Analyzing relevant papers..."):
                                     if st.session_state.text_processor is None:
                                         st.session_state.text_processor = TextProcessor()
+                                    model, tokenizer = get_model("question_focused")
+                                    if model is None or tokenizer is None:
+                                        raise Exception("Failed to load question-focused model")
                                     results = st.session_state.text_processor.find_most_relevant_abstracts(
                                         question,
                                         df['Abstract'].tolist(),
                                         top_k=5
                                     )
                                     if not results['top_indices']:
+                                        st.warning("No papers found relevant to your question")
                                         return
+                                    # Store relevant papers and scores
+                                    st.session_state.relevant_papers = df.iloc[results['top_indices']]
+                                    st.session_state.relevance_scores = results['scores']
+                                    relevant_abstracts = df['Abstract'].iloc[results['top_indices']].tolist()
+                                    st.session_state.focused_summary = generate_focused_summary(
+                                        question,
+                                        relevant_abstracts,
+                                        model,
+                                        tokenizer
+                                    )
+                                    st.session_state.focused_summary_generated = True
                             except Exception as e:
                                 st.error(f"Error generating focused summary: {str(e)}")
                                 reset_processing_state()
+                            finally:
+                                cleanup_model(model, tokenizer)
                     # Display focused summary results
                     if st.session_state.get('focused_summary_generated', False):