Commit 0073f9b
Parent(s): bb194c0
chunking
__pycache__/test_chunking.cpython-311.pyc
ADDED
Binary file (4.61 kB).

ai_med_extract/agents/__pycache__/summarizer.cpython-311.pyc
CHANGED
Binary files a/ai_med_extract/agents/__pycache__/summarizer.cpython-311.pyc and b/ai_med_extract/agents/__pycache__/summarizer.cpython-311.pyc differ

ai_med_extract/agents/summarizer.py
CHANGED
@@ -43,6 +43,24 @@ class SummarizerAgent:
 
         return text
 
+    def _chunk_text(self, text, max_length):
+        """Chunk the text into smaller segments based on max_length, preserving context."""
+        words = text.split()
+        chunks = []
+        current_chunk = []
+
+        for word in words:
+            current_chunk.append(word)
+            if len(current_chunk) >= max_length:
+                # Carry the last few words over so the next chunk keeps context
+                chunks.append(' '.join(current_chunk))
+                current_chunk = current_chunk[-10:]  # Keep the last 10 words for context
+
+        if current_chunk:
+            chunks.append(' '.join(current_chunk))
+
+        return chunks
+
     def generate_summary(self, text):
         """Generate summary with improved state management and parameter optimization"""
         try:
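
With max_length=100 and the hard-coded 10-word carry-over, every full chunk after the first covers only 90 previously unseen words, and a short tail chunk may be emitted at the end. A minimal standalone sketch of that arithmetic (a hypothetical copy of _chunk_text with the overlap made a parameter, not the module itself):

    def chunk_text(text, max_length, overlap=10):
        words = text.split()
        chunks, current = [], []
        for word in words:
            current.append(word)
            if len(current) >= max_length:
                chunks.append(' '.join(current))
                current = current[-overlap:]  # seed the next chunk with trailing context
        if current:
            chunks.append(' '.join(current))
        return chunks

    text = ' '.join(f'w{i}' for i in range(250))  # 250 dummy words
    print([len(c.split()) for c in chunk_text(text, 100)])
    # -> [100, 100, 70]: adjacent chunks share 10 words, so the second full
    #    chunk adds 90 new words and the 70-word tail holds the remainder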

@@ -57,33 +75,40 @@
             # Load model (this ensures fresh model state for each request)
             model = self.summarization_model_loader.load()
 
-            # … (previous single-pass generation block; its removed lines are truncated in the rendered diff)
+            # Chunk the text if it exceeds the max_length
+            chunks = self._chunk_text(clean_text, max_length)
+            full_summary = ""
+
+            for chunk in chunks:
+                # Generate summary for each chunk
+                summary_result = model(
+                    chunk,
+                    max_length=max_length,
+                    min_length=min_length,
+                    do_sample=False,
+                    num_beams=4,  # Use beam search for more consistent results
+                    early_stopping=True
+                )
+
+                # Extract and clean summary
+                if isinstance(summary_result, list) and summary_result:
+                    summary = summary_result[0].get('summary_text', '').strip()
+                else:
+                    summary = str(summary_result).strip()
+
+                # Remove any prompt artifacts that might be included
+                summary = re.sub(r'^.*?(?=##|Clinical|Assessment|Summary)', '', summary, flags=re.IGNORECASE)
+                summary = summary.strip()
+
+                full_summary += summary + "\n\n"  # Concatenate summaries with spacing
 
             # Track summary length for future optimization
-            self.last_summary_length = len(…
+            self.last_summary_length = len(full_summary.split())
             self.request_count += 1
 
             logging.info(f"Generated summary: {self.last_summary_length} words, request count: {self.request_count}")
 
-            return …
+            return full_summary.strip()
 
         except Exception as e:
             logging.error(f"Summary generation failed: {e}", exc_info=True)
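
Note that the cleanup expression above anchors at the start of the string and, without re.DOTALL or re.MULTILINE, '.' stops at the first newline, so only a prefix on the first line of the model output is ever removed; with re.IGNORECASE it also fires on a lowercase "summary". A small standalone illustration of that behavior (not part of the commit):

    import re

    def strip_prompt_prefix(summary):
        # Same expression as in the diff above
        return re.sub(r'^.*?(?=##|Clinical|Assessment|Summary)', '', summary,
                      flags=re.IGNORECASE)

    print(strip_prompt_prefix("Summarize this note. Clinical findings: stable."))
    # -> "Clinical findings: stable."  (prompt echo before "Clinical" removed)
    print(strip_prompt_prefix("Test summary content for verification."))
    # -> "summary content for verification."  (IGNORECASE also clips "Test ")
    print(strip_prompt_prefix("no keyword on this line\n## Clinical Assessment"))
    # -> unchanged: the first-line prefix never matches across the newline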

@@ -96,17 +121,14 @@
         if word_count < 20:
             return "Insufficient text for detailed summary."
 
-        # …
+        # Template-based fallback with the 4 required fields
         sections = [
-            "## Clinical Assessment\…
-            "## Key …
-            "## …
+            "## Clinical Assessment\nOverall patient condition and status based on available data.",
+            "## Key Trends & Changes\nAnalysis of trends over time based on date-wise chart data.",
+            "## Plan & Suggested Actions\nRecommended next steps and interventions for patient care.",
+            "## Direct Guidance for Physician\nSpecific recommendations for the treating physician."
         ]
 
-        # Adjust length based on input
-        if word_count > 100:
-            sections.append("## Additional Notes\nFurther analysis recommended by healthcare provider.")
-
         return "\n\n".join(sections)
 
     def reset_state(self):
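
Since the long-input branch was removed, the fallback now always returns the same four-section skeleton regardless of input length. A quick standalone check of the headings it produces, using the strings from the diff above:

    sections = [
        "## Clinical Assessment\nOverall patient condition and status based on available data.",
        "## Key Trends & Changes\nAnalysis of trends over time based on date-wise chart data.",
        "## Plan & Suggested Actions\nRecommended next steps and interventions for patient care.",
        "## Direct Guidance for Physician\nSpecific recommendations for the treating physician.",
    ]
    fallback = "\n\n".join(sections)
    print([line for line in fallback.splitlines() if line.startswith("## ")])
    # -> ['## Clinical Assessment', '## Key Trends & Changes',
    #     '## Plan & Suggested Actions', '## Direct Guidance for Physician']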

simple_test.py
ADDED
@@ -0,0 +1,35 @@
+#!/usr/bin/env python3
+print("Testing chunking implementation...")
+
+import sys
+sys.path.insert(0, '.')
+
+from ai_med_extract.agents.summarizer import SummarizerAgent
+
+class MockModelLoader:
+    def load(self):
+        return self
+    def __call__(self, *args, **kwargs):
+        return [{'summary_text': 'Test summary content for verification.'}]
+
+# Test the chunking functionality
+summarizer = SummarizerAgent(MockModelLoader())
+
+# Test with a long text that requires chunking
+long_text = " ".join([f"Medical record entry {i}: Patient presents with various symptoms including headache, fatigue, and elevated blood pressure. The condition has been persistent for several days and requires medical attention." for i in range(60)])
+
+print(f"Original text length: {len(long_text.split())} words")
+
+# Test chunking
+chunks = summarizer._chunk_text(long_text, 100)
+print(f"Number of chunks created: {len(chunks)}")
+
+for i, chunk in enumerate(chunks):
+    print(f"Chunk {i+1}: {len(chunk.split())} words")
+
+# Test summary generation
+summary = summarizer.generate_summary(long_text)
+print(f"Generated summary length: {len(summary)} characters")
+print("Summary preview:", summary[:200] + "..." if len(summary) > 200 else summary)
+
+print("Test completed successfully!")
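
One caveat when reading this script's output: the mock returns "Test summary content for verification." for every chunk, and the cleanup regex in generate_summary matches case-insensitively, so the leading "Test " is presumably stripped ahead of the lowercase word "summary" before the per-chunk summaries are concatenated.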

test_chunking.py
ADDED
@@ -0,0 +1,56 @@
+#!/usr/bin/env python3
+"""
+Test script to verify the chunking implementation for GGUF model summarization
+"""
+
+import sys
+import os
+sys.path.insert(0, os.path.dirname(os.path.abspath(__file__)))
+
+from ai_med_extract.agents.summarizer import SummarizerAgent
+
+class MockModelLoader:
+    """Mock model loader for testing"""
+    def load(self):
+        return self
+
+    def __call__(self, text, max_length=512, min_length=30, do_sample=False, num_beams=4, early_stopping=True):
+        # Mock model that returns a simple summary
+        return [{'summary_text': f"Summary of: {text[:50]}..."}]
+
+def test_chunking():
+    """Test the chunking functionality with context preservation"""
+    print("Testing chunking implementation with context preservation...")
+
+    # Create summarizer agent with mock model loader
+    summarizer = SummarizerAgent(MockModelLoader())
+
+    # Test with short text (should not be chunked)
+    short_text = "Patient has fever and cough. Blood pressure is normal."
+    summary_short = summarizer.generate_summary(short_text)
+    print(f"Short text summary: {summary_short}")
+
+    # Test with long text (should be chunked with context preservation)
+    long_text = " ".join([f"Symptom {i}: Patient reports ongoing issues with various medical conditions including persistent headaches, fatigue, and joint pain. The symptoms have been present for several weeks and are affecting daily activities." for i in range(50)])
+    summary_long = summarizer.generate_summary(long_text)
+    print(f"Long text summary length: {len(summary_long)} characters")
+    print(f"Long text summary preview: {summary_long[:200]}...")
+
+    # Test chunking method directly to verify context preservation
+    chunks = summarizer._chunk_text(long_text, 100)  # Small chunk size for testing
+    print(f"Number of chunks created: {len(chunks)}")
+    print(f"Chunk sizes: {[len(chunk.split()) for chunk in chunks]}")
+
+    # Verify that chunks have overlapping context
+    if len(chunks) > 1:
+        for i in range(len(chunks) - 1):
+            current_chunk = chunks[i]
+            next_chunk = chunks[i + 1]
+            # Check if there's some overlap in content
+            overlap_found = any(word in next_chunk for word in current_chunk.split()[-10:])
+            print(f"Chunk {i+1} to {i+2} context preservation: {'✓' if overlap_found else '✗'}")
+
+    print("Chunking test with context preservation completed successfully!")
+
+if __name__ == "__main__":
+    test_chunking()
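
Because _chunk_text seeds each new chunk with the previous chunk's last 10 words, the overlap check above should print ✓ for every adjacent pair whenever more than one chunk is produced.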