Commit 0073f9b
Parent(s): bb194c0
chunking
__pycache__/test_chunking.cpython-311.pyc
ADDED
Binary file (4.61 kB).

ai_med_extract/agents/__pycache__/summarizer.cpython-311.pyc
CHANGED
Binary files a/ai_med_extract/agents/__pycache__/summarizer.cpython-311.pyc and b/ai_med_extract/agents/__pycache__/summarizer.cpython-311.pyc differ

ai_med_extract/agents/summarizer.py
CHANGED
@@ -43,6 +43,24 @@ class SummarizerAgent:
 
         return text
 
+    def _chunk_text(self, text, max_length):
+        """Chunk the text into smaller segments based on max_length, preserving context."""
+        words = text.split()
+        chunks = []
+        current_chunk = []
+
+        for word in words:
+            current_chunk.append(word)
+            if len(current_chunk) >= max_length:
+                # Carry the last few words over so the next chunk keeps context
+                chunks.append(' '.join(current_chunk))
+                current_chunk = current_chunk[-10:]  # Keep the last 10 words for context
+
+        if current_chunk:
+            chunks.append(' '.join(current_chunk))
+
+        return chunks
+
     def generate_summary(self, text):
         """Generate summary with improved state management and parameter optimization"""
         try:
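
With max_length=100 and the hard-coded 10-word carry-over, every full chunk after the first covers only 90 previously unseen words, and a short tail chunk may be emitted at the end. A minimal standalone sketch of that arithmetic (a hypothetical copy of _chunk_text with the overlap made a parameter, not the module itself):

    def chunk_text(text, max_length, overlap=10):
        words = text.split()
        chunks, current = [], []
        for word in words:
            current.append(word)
            if len(current) >= max_length:
                chunks.append(' '.join(current))
                current = current[-overlap:]  # seed the next chunk with trailing context
        if current:
            chunks.append(' '.join(current))
        return chunks

    text = ' '.join(f'w{i}' for i in range(250))  # 250 dummy words
    print([len(c.split()) for c in chunk_text(text, 100)])
    # -> [100, 100, 70]: adjacent chunks share 10 words, so the second full
    #    chunk adds 90 new words and the 70-word tail holds the remainder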

@@ -57,33 +75,40 @@
             # Load model (this ensures fresh model state for each request)
             model = self.summarization_model_loader.load()
 
-            # … (previous single-pass generation block; its removed lines are truncated in the rendered diff)
+            # Chunk the text if it exceeds the max_length
+            chunks = self._chunk_text(clean_text, max_length)
+            full_summary = ""
+
+            for chunk in chunks:
+                # Generate summary for each chunk
+                summary_result = model(
+                    chunk,
+                    max_length=max_length,
+                    min_length=min_length,
+                    do_sample=False,
+                    num_beams=4,  # Use beam search for more consistent results
+                    early_stopping=True
+                )
+
+                # Extract and clean summary
+                if isinstance(summary_result, list) and summary_result:
+                    summary = summary_result[0].get('summary_text', '').strip()
+                else:
+                    summary = str(summary_result).strip()
+
+                # Remove any prompt artifacts that might be included
+                summary = re.sub(r'^.*?(?=##|Clinical|Assessment|Summary)', '', summary, flags=re.IGNORECASE)
+                summary = summary.strip()
+
+                full_summary += summary + "\n\n"  # Concatenate summaries with spacing
 
             # Track summary length for future optimization
-            self.last_summary_length = len(…
+            self.last_summary_length = len(full_summary.split())
             self.request_count += 1
 
             logging.info(f"Generated summary: {self.last_summary_length} words, request count: {self.request_count}")
 
-            return …
+            return full_summary.strip()
 
         except Exception as e:
             logging.error(f"Summary generation failed: {e}", exc_info=True)
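
Note that the cleanup expression above anchors at the start of the string and, without re.DOTALL or re.MULTILINE, '.' stops at the first newline, so only a prefix on the first line of the model output is ever removed; with re.IGNORECASE it also fires on a lowercase "summary". A small standalone illustration of that behavior (not part of the commit):

    import re

    def strip_prompt_prefix(summary):
        # Same expression as in the diff above
        return re.sub(r'^.*?(?=##|Clinical|Assessment|Summary)', '', summary,
                      flags=re.IGNORECASE)

    print(strip_prompt_prefix("Summarize this note. Clinical findings: stable."))
    # -> "Clinical findings: stable."  (prompt echo before "Clinical" removed)
    print(strip_prompt_prefix("Test summary content for verification."))
    # -> "summary content for verification."  (IGNORECASE also clips "Test ")
    print(strip_prompt_prefix("no keyword on this line\n## Clinical Assessment"))
    # -> unchanged: the first-line prefix never matches across the newline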

@@ -96,17 +121,14 @@
         if word_count < 20:
             return "Insufficient text for detailed summary."
 
-        # …
+        # Template-based fallback with the 4 required fields
         sections = [
-            "## Clinical Assessment\…
-            "## Key …
-            "## …
+            "## Clinical Assessment\nOverall patient condition and status based on available data.",
+            "## Key Trends & Changes\nAnalysis of trends over time based on date-wise chart data.",
+            "## Plan & Suggested Actions\nRecommended next steps and interventions for patient care.",
+            "## Direct Guidance for Physician\nSpecific recommendations for the treating physician."
         ]
 
-        # Adjust length based on input
-        if word_count > 100:
-            sections.append("## Additional Notes\nFurther analysis recommended by healthcare provider.")
-
         return "\n\n".join(sections)
 
     def reset_state(self):
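
Since the long-input branch was removed, the fallback now always returns the same four-section skeleton regardless of input length. A quick standalone check of the headings it produces, using the strings from the diff above:

    sections = [
        "## Clinical Assessment\nOverall patient condition and status based on available data.",
        "## Key Trends & Changes\nAnalysis of trends over time based on date-wise chart data.",
        "## Plan & Suggested Actions\nRecommended next steps and interventions for patient care.",
        "## Direct Guidance for Physician\nSpecific recommendations for the treating physician.",
    ]
    fallback = "\n\n".join(sections)
    print([line for line in fallback.splitlines() if line.startswith("## ")])
    # -> ['## Clinical Assessment', '## Key Trends & Changes',
    #     '## Plan & Suggested Actions', '## Direct Guidance for Physician']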

simple_test.py
ADDED
@@ -0,0 +1,35 @@
+#!/usr/bin/env python3
+print("Testing chunking implementation...")
+
+import sys
+sys.path.insert(0, '.')
+
+from ai_med_extract.agents.summarizer import SummarizerAgent
+
+class MockModelLoader:
+    def load(self):
+        return self
+    def __call__(self, *args, **kwargs):
+        return [{'summary_text': 'Test summary content for verification.'}]
+
+# Test the chunking functionality
+summarizer = SummarizerAgent(MockModelLoader())
+
+# Test with a long text that requires chunking
+long_text = " ".join([f"Medical record entry {i}: Patient presents with various symptoms including headache, fatigue, and elevated blood pressure. The condition has been persistent for several days and requires medical attention." for i in range(60)])
+
+print(f"Original text length: {len(long_text.split())} words")
+
+# Test chunking
+chunks = summarizer._chunk_text(long_text, 100)
+print(f"Number of chunks created: {len(chunks)}")
+
+for i, chunk in enumerate(chunks):
+    print(f"Chunk {i+1}: {len(chunk.split())} words")
+
+# Test summary generation
+summary = summarizer.generate_summary(long_text)
+print(f"Generated summary length: {len(summary)} characters")
+print("Summary preview:", summary[:200] + "..." if len(summary) > 200 else summary)
+
+print("Test completed successfully!")
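
One caveat when reading this script's output: the mock returns "Test summary content for verification." for every chunk, and the cleanup regex in generate_summary matches case-insensitively, so the leading "Test " is presumably stripped ahead of the lowercase word "summary" before the per-chunk summaries are concatenated.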

test_chunking.py
ADDED
@@ -0,0 +1,56 @@
+#!/usr/bin/env python3
+"""
+Test script to verify the chunking implementation for GGUF model summarization
+"""
+
+import sys
+import os
+sys.path.insert(0, os.path.dirname(os.path.abspath(__file__)))
+
+from ai_med_extract.agents.summarizer import SummarizerAgent
+
+class MockModelLoader:
+    """Mock model loader for testing"""
+    def load(self):
+        return self
+
+    def __call__(self, text, max_length=512, min_length=30, do_sample=False, num_beams=4, early_stopping=True):
+        # Mock model that returns a simple summary
+        return [{'summary_text': f"Summary of: {text[:50]}..."}]
+
+def test_chunking():
+    """Test the chunking functionality with context preservation"""
+    print("Testing chunking implementation with context preservation...")
+
+    # Create summarizer agent with mock model loader
+    summarizer = SummarizerAgent(MockModelLoader())
+
+    # Test with short text (should not be chunked)
+    short_text = "Patient has fever and cough. Blood pressure is normal."
+    summary_short = summarizer.generate_summary(short_text)
+    print(f"Short text summary: {summary_short}")
+
+    # Test with long text (should be chunked with context preservation)
+    long_text = " ".join([f"Symptom {i}: Patient reports ongoing issues with various medical conditions including persistent headaches, fatigue, and joint pain. The symptoms have been present for several weeks and are affecting daily activities." for i in range(50)])
+    summary_long = summarizer.generate_summary(long_text)
+    print(f"Long text summary length: {len(summary_long)} characters")
+    print(f"Long text summary preview: {summary_long[:200]}...")
+
+    # Test chunking method directly to verify context preservation
+    chunks = summarizer._chunk_text(long_text, 100)  # Small chunk size for testing
+    print(f"Number of chunks created: {len(chunks)}")
+    print(f"Chunk sizes: {[len(chunk.split()) for chunk in chunks]}")
+
+    # Verify that chunks have overlapping context
+    if len(chunks) > 1:
+        for i in range(len(chunks) - 1):
+            current_chunk = chunks[i]
+            next_chunk = chunks[i + 1]
+            # Check if there's some overlap in content
+            overlap_found = any(word in next_chunk for word in current_chunk.split()[-10:])
+            print(f"Chunk {i+1} to {i+2} context preservation: {'✓' if overlap_found else '✗'}")
+
+    print("Chunking test with context preservation completed successfully!")
+
+if __name__ == "__main__":
+    test_chunking()
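
Because _chunk_text seeds each new chunk with the previous chunk's last 10 words, the overlap check above should print ✓ for every adjacent pair whenever more than one chunk is produced.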