pendar02 committed
Commit d0820e9 · verified · 1 Parent(s): 805eb33

Update app.py

Files changed (1)
  1. app.py +60 -196
app.py CHANGED
@@ -36,6 +36,7 @@ def load_model(model_type):
     device = "cpu" # Force CPU usage
 
     if model_type == "summarize":
+        # Load the new fine-tuned model directly
         model = AutoModelForSeq2SeqLM.from_pretrained(
            "pendar02/bart-large-pubmedd",
            cache_dir="./models",
@@ -150,219 +151,65 @@ def post_process_summary(summary):
     return cleaned_summary
 
 def improve_summary_generation(text, model, tokenizer):
-    """Enhanced version of summary generation optimized for biomedical papers"""
-    if not isinstance(text, str) or not text.strip():
-        return "No abstract available to summarize."
-
-    # Don't summarize if text is too short
-    word_count = len(text.split())
-    if word_count < 100: # Increased minimum length for medical texts
-        return text
-
-    # Preprocess text
-    formatted_text = preprocess_text(text)
-
-    # Prepare inputs
-    inputs = tokenizer(
-        formatted_text,
-        return_tensors="pt",
-        max_length=1024,
-        truncation=True,
-        padding=True
+    # Add a more specific prompt
+    formatted_text = (
+        "Summarize the following medical research paper, focusing on: "
+        "1) Study objectives 2) Methods 3) Key findings 4) Main conclusions. "
+        "Text: " + preprocess_text(text)
     )
+
+    # Adjust generation parameters
+    inputs = tokenizer(formatted_text, return_tensors="pt", max_length=1024, truncation=True)
     inputs = {k: v.to(model.device) for k, v in inputs.items()}
 
-    # Generate summary with parameters tuned for biomedical text
     with torch.no_grad():
         summary_ids = model.generate(
             **{
                 "input_ids": inputs["input_ids"],
                 "attention_mask": inputs["attention_mask"],
-                "max_length": 300, # Increased for medical summaries
-                "min_length": 100, # Increased to ensure comprehensive coverage
-                "num_beams": 4,
-                "length_penalty": 2.0, # Encourage slightly longer summaries
+                "max_length": 200,
+                "min_length": 50,
+                "num_beams": 5,
+                "length_penalty": 1.5,
                 "no_repeat_ngram_size": 3,
-                "early_stopping": True,
-                "do_sample": True, # Enable sampling
-                "top_p": 0.95, # Nucleus sampling
-                "temperature": 0.85, # Slightly higher temperature for medical terms
-                "repetition_penalty": 1.5 # Increased to avoid repeated stats/numbers
+                "temperature": 0.7,
+                "repetition_penalty": 1.5 # Increased to reduce repetition
             }
         )
 
     summary = tokenizer.decode(summary_ids[0], skip_special_tokens=True)
 
-    # Enhanced post-processing for medical text
-    summary = post_process_medical_summary(summary)
-
-    return summary
-
-def post_process_medical_summary(summary):
-    """Special post-processing for medical/scientific summaries"""
-    if not summary:
-        return summary
-
-    # Fix common medical text issues
-    summary = (summary
-        .replace(" p =", " p=") # Fix p-value spacing
-        .replace(" n =", " n=") # Fix sample size spacing
-        .replace("( ", "(") # Fix parentheses spacing
-        .replace(" )", ")")
-        .replace("vs.", "versus") # Expand abbreviations
-        .replace("..", ".") # Fix double periods
-    )
-
-    # Ensure statistical significance symbols are correct
-    summary = (summary
-        .replace("p < ", "p<")
-        .replace("p > ", "p>")
-        .replace("P < ", "p<")
-        .replace("P > ", "p>")
-    )
-
-    # Fix number formatting
-    summary = (summary
-        .replace(" +/- ", "±")
-        .replace(" ± ", "±")
-    )
-
-    # Split into sentences and process each
-    sentences = [s.strip() for s in summary.split('.')]
-    processed_sentences = []
-
-    for sentence in sentences:
-        if sentence:
-            # Capitalize first letter
-            sentence = sentence[0].upper() + sentence[1:] if sentence else sentence
-
-            # Fix common medical abbreviations spacing
-            sentence = (sentence
-                .replace(" et al ", " et al. ")
-                .replace("et al.", "et al.") # Fix double period
-            )
-
-            processed_sentences.append(sentence)
-
-    # Join sentences
-    summary = '. '.join(processed_sentences)
-
-    # Ensure proper ending
-    if summary and not summary.endswith('.'):
-        summary += '.'
-
-    return summary
-
-def post_process_medical_summary(summary):
-    """Special post-processing for medical/scientific summaries"""
-    if not summary:
-        return summary
-
-    # Fix common medical text issues
-    summary = (summary
-        .replace(" p =", " p=") # Fix p-value spacing
-        .replace(" n =", " n=") # Fix sample size spacing
-        .replace("( ", "(") # Fix parentheses spacing
-        .replace(" )", ")")
-        .replace("vs.", "versus") # Expand abbreviations
-        .replace("..", ".") # Fix double periods
-    )
-
-    # Ensure statistical significance symbols are correct
-    summary = (summary
-        .replace("p < ", "p<")
-        .replace("p > ", "p>")
-        .replace("P < ", "p<")
-        .replace("P > ", "p>")
-    )
-
-    # Fix number formatting
-    summary = (summary
-        .replace(" +/- ", "±")
-        .replace(" ± ", "±")
-    )
-
-    # Split into sentences and process each
-    sentences = [s.strip() for s in summary.split('.')]
-    processed_sentences = []
-
-    for sentence in sentences:
-        if sentence:
-            # Capitalize first letter
-            sentence = sentence[0].upper() + sentence[1:] if sentence else sentence
-
-            # Fix common medical abbreviations spacing
-            sentence = (sentence
-                .replace(" et al ", " et al. ")
-                .replace("et al.", "et al.") # Fix double period
-            )
-
-            processed_sentences.append(sentence)
-
-    # Join sentences
-    summary = '. '.join(processed_sentences)
-
-    # Ensure proper ending
-    if summary and not summary.endswith('.'):
-        summary += '.'
-
-    return summary
-
-
-def post_process_medical_summary(summary):
-    """Special post-processing for medical/scientific summaries"""
+def post_process_summary(summary):
+    """Enhanced post-processing to catch common errors"""
     if not summary:
         return summary
+
+    # Remove contradictory age statements
+    age_statements = []
+    lines = summary.split('.')
+    cleaned_lines = []
+    for line in lines:
+        if "age" not in line.lower():
+            cleaned_lines.append(line)
+        elif not age_statements: # Only keep first age statement
+            age_statements.append(line)
+            cleaned_lines.append(line)
+
+    # Remove redundant statements
+    seen_content = set()
+    unique_lines = []
+    for line in cleaned_lines:
+        line_core = ' '.join(sorted(line.lower().split())) # Normalize for comparison
+        if line_core not in seen_content:
+            seen_content.add(line_core)
+            unique_lines.append(line)
 
-    # Fix common medical text issues
-    summary = (summary
-        .replace(" p =", " p=") # Fix p-value spacing
-        .replace(" n =", " n=") # Fix sample size spacing
-        .replace("( ", "(") # Fix parentheses spacing
-        .replace(" )", ")")
-        .replace("vs.", "versus") # Expand abbreviations
-        .replace("..", ".") # Fix double periods
-    )
-
-    # Ensure statistical significance symbols are correct
-    summary = (summary
-        .replace("p < ", "p<")
-        .replace("p > ", "p>")
-        .replace("P < ", "p<")
-        .replace("P > ", "p>")
-    )
-
-    # Fix number formatting
-    summary = (summary
-        .replace(" +/- ", "±")
-        .replace(" ± ", "±")
-    )
-
-    # Split into sentences and process each
-    sentences = [s.strip() for s in summary.split('.')]
-    processed_sentences = []
-
-    for sentence in sentences:
-        if sentence:
-            # Capitalize first letter
-            sentence = sentence[0].upper() + sentence[1:] if sentence else sentence
-
-            # Fix common medical abbreviations spacing
-            sentence = (sentence
-                .replace(" et al ", " et al. ")
-                .replace("et al.", "et al.") # Fix double period
-            )
-
-            processed_sentences.append(sentence)
-
-    # Join sentences
-    summary = '. '.join(processed_sentences)
-
-    # Ensure proper ending
-    if summary and not summary.endswith('.'):
-        summary += '.'
-
-    return summary
+    # Join sentences with proper spacing and punctuation
+    cleaned_summary = '. '.join(s.strip() for s in unique_lines if s.strip())
+    if cleaned_summary and not cleaned_summary.endswith('.'):
+        cleaned_summary += '.'
+
+    return cleaned_summary
 
 def generate_focused_summary(question, abstracts, model, tokenizer):
     """Generate focused summary based on question"""
@@ -388,6 +235,23 @@ def generate_focused_summary(question, abstracts, model, tokenizer):
 
     return tokenizer.decode(summary_ids[0], skip_special_tokens=True)
 
+
+def validate_summary(summary, original_text):
+    """Validate summary content against original text"""
+    # Check for age inconsistencies
+    age_mentions = re.findall(r'(\d+\.?\d*)\s*years?', summary.lower())
+    if len(age_mentions) > 1: # Multiple age mentions
+        return False
+
+    # Check for repetitive sentences
+    sentences = summary.split('.')
+    unique_sentences = set(s.strip().lower() for s in sentences if s.strip())
+    if len(sentences) - len(unique_sentences) > 1: # More than one duplicate
+        return False
+
+    return True
+
+
 def main():
     st.title("🔬 Biomedical Papers Analysis")
 
257