vedaMD

Sleeping

App Files Files Community

vedaMD / src /test_pdf_processor.py

sniro23

Initial commit without binary files

19aaa42 5 months ago

raw

history blame

6.61 kB

	#!/usr/bin/env python3
	"""
	Test script for Enhanced PDF Processing Pipeline
	Tests with sample maternal health documents
	"""

	import sys
	from pathlib import Path
	import json

	# Add src to path for imports
	sys.path.append(str(Path(__file__).parent))

	from enhanced_pdf_processor import EnhancedMedicalPDFProcessor

	def test_single_document():
	    """Process sample PDFs in order until one succeeds, then report on it.

	    Candidate paths that do not exist are skipped without a message. For
	    the first existing file that is processed and reported without raising,
	    the page/table/word counts, the file size and the positive
	    content-classification scores of the first page are printed.

	    Returns:
	        True as soon as one document has been processed and reported,
	        False when no candidate got that far.
	    """
	    print("🧪 Testing Enhanced PDF Processor with sample document...")

	    # Results of this run are directed to the dedicated test folder.
	    pdf_processor = EnhancedMedicalPDFProcessor(output_dir="test_output")

	    # Sample documents, tried in this order.
	    candidates = (
	        "../Obs/RhESUS.pdf",
	        "../Obs/puerperal-sepsis.pdf",
	        "../Obs/Management-of-thrombocytopaenia-in-pregnancy-Sept-5.pdf",
	    )

	    # (label, summary key) pairs printed for a successful run.
	    summary_lines = (
	        (" 📊 Pages", 'total_pages'),
	        (" 📋 Tables", 'total_tables'),
	        (" 📝 Words", 'total_words'),
	    )

	    for candidate in candidates:
	        pdf_path = Path(candidate)
	        if not pdf_path.exists():
	            continue

	        print(f"\n📄 Testing with: {pdf_path.name}")

	        # The reporting stays inside the try block on purpose: any error
	        # while reading the result counts as a failure for this file and
	        # the next candidate is tried.
	        try:
	            outcome = pdf_processor.process_single_pdf(pdf_path)

	            print("✅ Processing successful!")
	            for label, key in summary_lines:
	                print(f"{label}: {outcome.summary[key]}")
	            print(f" 💾 Size: {outcome.file_info['size_mb']:.2f} MB")

	            # Only the first page's classification is shown, and only
	            # categories with a positive score.
	            pages = outcome.content
	            if pages:
	                scores = pages[0].metadata['content_classification']
	                print(" 🏷️ Content Classification:")
	                positive = (
	                    (name, value)
	                    for name, value in scores.items()
	                    if value > 0
	                )
	                for name, value in positive:
	                    print(f" - {name}: {value:.4f}")

	            # One working document is enough for this test.
	            return True

	        except Exception as error:
	            print(f"❌ Processing failed: {error}")

	    print("❌ No test files could be processed successfully")
	    return False

	def test_table_extraction():
	    """Look for a sample PDF from which at least one table is extracted.

	    Candidate paths that do not exist are skipped. For the first document
	    whose summary reports a positive table count, the tables of every page
	    are listed (row/column counts, and column names for non-empty tables).
	    Tables are expected to expose ``shape``, ``empty`` and ``columns``.

	    Returns:
	        True when a document with at least one table was reported,
	        False otherwise.
	    """
	    print("\n🔍 Testing table extraction capabilities...")

	    pdf_processor = EnhancedMedicalPDFProcessor()

	    # Documents that are likely to contain tables.
	    candidates = (
	        "../Obs/Management-of-Normal-Labourchart.pdf",
	        "../Obs/Management-of-thrombocytopaenia-in-pregnancy-Sept-5.pdf",
	    )

	    for candidate in candidates:
	        pdf_path = Path(candidate)
	        if not pdf_path.exists():
	            continue

	        print(f"\n📊 Testing table extraction with: {pdf_path.name}")

	        try:
	            outcome = pdf_processor.process_single_pdf(pdf_path)
	            table_count = outcome.summary['total_tables']

	            if not table_count > 0:
	                print(f"⚠️ No tables found in {pdf_path.name}")
	                continue

	            print(f"✅ Found {table_count} tables!")

	            # Per-page breakdown of the extracted tables.
	            for page in outcome.content:
	                if not page.tables:
	                    continue
	                print(f" Page {page.page_number}: {len(page.tables)} table(s)")
	                for number, table in enumerate(page.tables, start=1):
	                    shape = table.shape
	                    print(f" Table {number}: {shape[0]} rows × {shape[1]} columns")
	                    if not table.empty:
	                        print(f" Columns: {list(table.columns)}")
	            return True

	        except Exception as error:
	            # A failure on one document does not stop the remaining ones.
	            print(f"❌ Table extraction test failed: {error}")

	    return False

	def validate_output_structure():
	    """Check that every document folder in ``test_output`` is complete.

	    Each sub-directory of ``test_output`` (resolved against the current
	    working directory) must contain ``summary.json`` and
	    ``extracted_text.txt``, and ``summary.json`` must be a JSON object with
	    the keys ``summary`` and ``file_info``. Findings are printed as they
	    are made; regular files directly inside ``test_output`` are ignored.

	    Returns:
	        True only when at least one document folder exists and all of them
	        pass every check, False otherwise.
	    """
	    print("\n📁 Validating output structure...")

	    test_output_dir = Path("test_output")
	    # is_dir() also rejects a regular file named "test_output", which would
	    # otherwise make iterdir() raise.
	    if not test_output_dir.is_dir():
	        print("❌ Test output directory not found")
	        return False

	    # Sorted so the report order does not depend on the file system.
	    document_dirs = sorted(
	        entry for entry in test_output_dir.iterdir() if entry.is_dir()
	    )
	    # With nothing to check, the validation must not pass vacuously.
	    if not document_dirs:
	        print("❌ No processed document folders found in test output")
	        return False

	    all_valid = True
	    for document_dir in document_dirs:
	        for file_name in ("summary.json", "extracted_text.txt"):
	            expected_file = document_dir / file_name
	            if not expected_file.exists():
	                print(f"❌ Missing: {expected_file}")
	                all_valid = False
	                continue

	            print(f"✅ Found: {expected_file}")
	            if file_name == "summary.json" and not _summary_json_is_valid(expected_file):
	                all_valid = False

	    return all_valid


	def _summary_json_is_valid(summary_path):
	    """Return True when *summary_path* holds a JSON object with the required keys."""
	    try:
	        # Parsing from bytes lets the json module detect the UTF encoding
	        # itself instead of relying on the platform's default text encoding.
	        data = json.loads(summary_path.read_bytes())
	    except OSError as error:
	        print(f" ❌ Could not read file: {error}")
	        return False
	    except ValueError:
	        # Covers json.JSONDecodeError and UnicodeDecodeError.
	        print(" ❌ Invalid JSON format")
	        return False

	    # Require an object: a key lookup on a number or null would raise, and
	    # a list holding the two names is not a valid summary either.
	    if isinstance(data, dict) and 'summary' in data and 'file_info' in data:
	        print(" 📋 Valid JSON structure")
	        return True

	    print(" ⚠️ Missing keys in JSON")
	    return False

	def main():
	    """Run the three checks in order and print a pass/fail summary.

	    Returns:
	        True when every check passed, False otherwise.
	    """
	    print("🚀 Starting Enhanced PDF Processor Tests")
	    print("=" * 60)

	    # The checks run eagerly, top to bottom, while this table is built:
	    # single document processing, table extraction, output validation.
	    outcomes = [
	        ("Single Document Processing", test_single_document()),
	        ("Table Extraction", test_table_extraction()),
	        ("Output Structure", validate_output_structure()),
	    ]

	    print("\n" + "=" * 60)
	    print("📊 TEST SUMMARY")
	    for label, passed in outcomes:
	        verdict = '✅ PASS' if passed else '❌ FAIL'
	        print(f"{label}: {verdict}")

	    overall_success = all(passed for _, passed in outcomes)
	    if overall_success:
	        headline = '✅ ALL TESTS PASSED'
	        advice = "\n🚀 Ready to process all maternal health documents!"
	    else:
	        headline = '❌ SOME TESTS FAILED'
	        advice = "\n⚠️ Please fix issues before processing all documents"

	    print(f"\n🎯 OVERALL: {headline}")
	    print(advice)

	    return overall_success

	if __name__ == "__main__":
	    # Exit status 0 when all checks passed, 1 otherwise.
	    success = main()
	    sys.exit(int(not success))