sachinchandrankallar committed on
Commit
0073f9b
·
1 Parent(s): bb194c0
__pycache__/test_chunking.cpython-311.pyc ADDED
Binary file (4.61 kB). View file
 
ai_med_extract/agents/__pycache__/summarizer.cpython-311.pyc CHANGED
Binary files a/ai_med_extract/agents/__pycache__/summarizer.cpython-311.pyc and b/ai_med_extract/agents/__pycache__/summarizer.cpython-311.pyc differ
 
ai_med_extract/agents/summarizer.py CHANGED
@@ -43,6 +43,24 @@ class SummarizerAgent:
43
 
44
  return text
45
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
46
  def generate_summary(self, text):
47
  """Generate summary with improved state management and parameter optimization"""
48
  try:
@@ -57,33 +75,40 @@ class SummarizerAgent:
57
  # Load model (this ensures fresh model state for each request)
58
  model = self.summarization_model_loader.load()
59
 
60
- # Generate summary with optimized parameters
61
- summary_result = model(
62
- clean_text,
63
- max_length=max_length,
64
- min_length=min_length,
65
- do_sample=False,
66
- num_beams=4, # Use beam search for more consistent results
67
- early_stopping=True
68
- )
69
-
70
- # Extract and clean summary
71
- if isinstance(summary_result, list) and summary_result:
72
- summary = summary_result[0].get('summary_text', '').strip()
73
- else:
74
- summary = str(summary_result).strip()
75
-
76
- # Remove any prompt artifacts that might be included
77
- summary = re.sub(r'^.*?(?=##|Clinical|Assessment|Summary)', '', summary, flags=re.IGNORECASE)
78
- summary = summary.strip()
 
 
 
 
 
 
 
79
 
80
  # Track summary length for future optimization
81
- self.last_summary_length = len(summary.split())
82
  self.request_count += 1
83
 
84
  logging.info(f"Generated summary: {self.last_summary_length} words, request count: {self.request_count}")
85
 
86
- return summary
87
 
88
  except Exception as e:
89
  logging.error(f"Summary generation failed: {e}", exc_info=True)
@@ -96,17 +121,14 @@ class SummarizerAgent:
96
  if word_count < 20:
97
  return "Insufficient text for detailed summary."
98
 
99
- # Simple template-based fallback
100
  sections = [
101
- "## Clinical Assessment\nBased on the provided medical information.",
102
- "## Key Findings\nReview of the clinical data indicates relevant medical content.",
103
- "## Summary\nMedical documentation requires professional review for comprehensive assessment."
 
104
  ]
105
 
106
- # Adjust length based on input
107
- if word_count > 100:
108
- sections.append("## Additional Notes\nFurther analysis recommended by healthcare provider.")
109
-
110
  return "\n\n".join(sections)
111
 
112
  def reset_state(self):
 
43
 
44
  return text
45
 
46
+ def _chunk_text(self, text, max_length):
47
+ """Chunk the text into smaller segments based on max_length, preserving context."""
48
+ words = text.split()
49
+ chunks = []
50
+ current_chunk = []
51
+
52
+ for word in words:
53
+ current_chunk.append(word)
54
+ if len(current_chunk) >= max_length:
55
+ # Include a few words from the next chunk to maintain context
56
+ chunks.append(' '.join(current_chunk))
57
+ current_chunk = current_chunk[-10:] # Keep the last 10 words for context
58
+
59
+ if current_chunk:
60
+ chunks.append(' '.join(current_chunk))
61
+
62
+ return chunks
63
+
64
  def generate_summary(self, text):
65
  """Generate summary with improved state management and parameter optimization"""
66
  try:
 
75
  # Load model (this ensures fresh model state for each request)
76
  model = self.summarization_model_loader.load()
77
 
78
+ # Chunk the text if it exceeds the max_length
79
+ chunks = self._chunk_text(clean_text, max_length)
80
+ full_summary = ""
81
+
82
+ for chunk in chunks:
83
+ # Generate summary for each chunk
84
+ summary_result = model(
85
+ chunk,
86
+ max_length=max_length,
87
+ min_length=min_length,
88
+ do_sample=False,
89
+ num_beams=4, # Use beam search for more consistent results
90
+ early_stopping=True
91
+ )
92
+
93
+ # Extract and clean summary
94
+ if isinstance(summary_result, list) and summary_result:
95
+ summary = summary_result[0].get('summary_text', '').strip()
96
+ else:
97
+ summary = str(summary_result).strip()
98
+
99
+ # Remove any prompt artifacts that might be included
100
+ summary = re.sub(r'^.*?(?=##|Clinical|Assessment|Summary)', '', summary, flags=re.IGNORECASE)
101
+ summary = summary.strip()
102
+
103
+ full_summary += summary + "\n\n" # Concatenate summaries with spacing
104
 
105
  # Track summary length for future optimization
106
+ self.last_summary_length = len(full_summary.split())
107
  self.request_count += 1
108
 
109
  logging.info(f"Generated summary: {self.last_summary_length} words, request count: {self.request_count}")
110
 
111
+ return full_summary.strip()
112
 
113
  except Exception as e:
114
  logging.error(f"Summary generation failed: {e}", exc_info=True)
 
121
  if word_count < 20:
122
  return "Insufficient text for detailed summary."
123
 
124
+ # Template-based fallback with the 4 required fields
125
  sections = [
126
+ "## Clinical Assessment\nOverall patient condition and status based on available data.",
127
+ "## Key Trends & Changes\nAnalysis of trends over time based on date-wise chart data.",
128
+ "## Plan & Suggested Actions\nRecommended next steps and interventions for patient care.",
129
+ "## Direct Guidance for Physician\nSpecific recommendations for the treating physician."
130
  ]
131
 
 
 
 
 
132
  return "\n\n".join(sections)
133
 
134
  def reset_state(self):
simple_test.py ADDED
@@ -0,0 +1,35 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ #!/usr/bin/env python3
2
+ print("Testing chunking implementation...")
3
+
4
+ import sys
5
+ sys.path.insert(0, '.')
6
+
7
+ from ai_med_extract.agents.summarizer import SummarizerAgent
8
+
9
+ class MockModelLoader:
10
+ def load(self):
11
+ return self
12
+ def __call__(self, *args, **kwargs):
13
+ return [{'summary_text': 'Test summary content for verification.'}]
14
+
15
+ # Test the chunking functionality
16
+ summarizer = SummarizerAgent(MockModelLoader())
17
+
18
+ # Test with a long text that requires chunking
19
+ long_text = " ".join([f"Medical record entry {i}: Patient presents with various symptoms including headache, fatigue, and elevated blood pressure. The condition has been persistent for several days and requires medical attention." for i in range(60)])
20
+
21
+ print(f"Original text length: {len(long_text.split())} words")
22
+
23
+ # Test chunking
24
+ chunks = summarizer._chunk_text(long_text, 100)
25
+ print(f"Number of chunks created: {len(chunks)}")
26
+
27
+ for i, chunk in enumerate(chunks):
28
+ print(f"Chunk {i+1}: {len(chunk.split())} words")
29
+
30
+ # Test summary generation
31
+ summary = summarizer.generate_summary(long_text)
32
+ print(f"Generated summary length: {len(summary)} characters")
33
+ print("Summary preview:", summary[:200] + "..." if len(summary) > 200 else summary)
34
+
35
+ print("Test completed successfully!")
test_chunking.py ADDED
@@ -0,0 +1,56 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ #!/usr/bin/env python3
2
+ """
3
+ Test script to verify the chunking implementation for GGUF model summarization
4
+ """
5
+
6
+ import sys
7
+ import os
8
+ sys.path.insert(0, os.path.dirname(os.path.abspath(__file__)))
9
+
10
+ from ai_med_extract.agents.summarizer import SummarizerAgent
11
+
12
+ class MockModelLoader:
13
+ """Mock model loader for testing"""
14
+ def load(self):
15
+ return self
16
+
17
+ def __call__(self, text, max_length=512, min_length=30, do_sample=False, num_beams=4, early_stopping=True):
18
+ # Mock model that returns a simple summary
19
+ return [{'summary_text': f"Summary of: {text[:50]}..."}]
20
+
21
+ def test_chunking():
22
+ """Test the chunking functionality with context preservation"""
23
+ print("Testing chunking implementation with context preservation...")
24
+
25
+ # Create summarizer agent with mock model loader
26
+ summarizer = SummarizerAgent(MockModelLoader())
27
+
28
+ # Test with short text (should not be chunked)
29
+ short_text = "Patient has fever and cough. Blood pressure is normal."
30
+ summary_short = summarizer.generate_summary(short_text)
31
+ print(f"Short text summary: {summary_short}")
32
+
33
+ # Test with long text (should be chunked with context preservation)
34
+ long_text = " ".join([f"Symptom {i}: Patient reports ongoing issues with various medical conditions including persistent headaches, fatigue, and joint pain. The symptoms have been present for several weeks and are affecting daily activities." for i in range(50)])
35
+ summary_long = summarizer.generate_summary(long_text)
36
+ print(f"Long text summary length: {len(summary_long)} characters")
37
+ print(f"Long text summary preview: {summary_long[:200]}...")
38
+
39
+ # Test chunking method directly to verify context preservation
40
+ chunks = summarizer._chunk_text(long_text, 100) # Small chunk size for testing
41
+ print(f"Number of chunks created: {len(chunks)}")
42
+ print(f"Chunk sizes: {[len(chunk.split()) for chunk in chunks]}")
43
+
44
+ # Verify that chunks have overlapping context
45
+ if len(chunks) > 1:
46
+ for i in range(len(chunks) - 1):
47
+ current_chunk = chunks[i]
48
+ next_chunk = chunks[i + 1]
49
+ # Check if there's some overlap in content
50
+ overlap_found = any(word in next_chunk for word in current_chunk.split()[-10:])
51
+ print(f"Chunk {i+1} to {i+2} context preservation: {'✓' if overlap_found else '✗'}")
52
+
53
+ print("Chunking test with context preservation completed successfully!")
54
+
55
+ if __name__ == "__main__":
56
+ test_chunking()