Spaces:
Sleeping
Sleeping
| #!/usr/bin/env python3 | |
| """ | |
| Test script for Enhanced PDF Processing Pipeline | |
| Tests with sample maternal health documents | |
| """ | |
| import sys | |
| from pathlib import Path | |
| import json | |
| # Add src to path for imports | |
| sys.path.append(str(Path(__file__).parent)) | |
| from enhanced_pdf_processor import EnhancedMedicalPDFProcessor | |
def test_single_document():
    """Process the first available sample PDF and print a short report.

    The candidate paths are tried in order. A path that does not exist is
    skipped; a path whose processing (or reporting) raises is reported and
    the next candidate is tried.

    Returns:
        bool: True as soon as one candidate has been processed and reported
        without an exception, False if none succeeded.
    """
    print("π§ͺ Testing Enhanced PDF Processor with sample document...")

    # Results are written under "test_output".
    pdf_processor = EnhancedMedicalPDFProcessor(output_dir="test_output")

    # Paths are relative to the current working directory.
    candidates = [
        "../Obs/RhESUS.pdf",
        "../Obs/puerperal-sepsis.pdf",
        "../Obs/Management-of-thrombocytopaenia-in-pregnancy-Sept-5.pdf"
    ]

    for pdf_path in map(Path, candidates):
        if not pdf_path.exists():
            continue

        print(f"\nπ Testing with: {pdf_path.name}")
        try:
            outcome = pdf_processor.process_single_pdf(pdf_path)

            print("β Processing successful!")
            summary = outcome.summary
            print(f" π Pages: {summary['total_pages']}")
            print(f" π Tables: {summary['total_tables']}")
            print(f" π Words: {summary['total_words']}")
            print(f" πΎ Size: {outcome.file_info['size_mb']:.2f} MB")

            # Only the first page's classification is shown, and only the
            # categories with a strictly positive score.
            if outcome.content:
                page_one = outcome.content[0]
                scores = page_one.metadata['content_classification']
                print(" π·οΈ Content Classification:")
                for category, score in scores.items():
                    if not score > 0:
                        continue
                    print(f" - {category}: {score:.4f}")

            # One working document is enough; stop here.
            return True
        except Exception as e:
            print(f"β Processing failed: {e}")

    print("β No test files could be processed successfully")
    return False
def test_table_extraction():
    """Look for at least one table in the table-heavy sample PDFs.

    Each existing candidate is processed in turn. The first document whose
    summary reports more than zero tables has its per-page table details
    printed and ends the test successfully.

    Returns:
        bool: True when some document yields at least one table, False when
        no candidate exists, none contains tables, or every attempt raised.
    """
    print("\nπ Testing table extraction capabilities...")

    pdf_processor = EnhancedMedicalPDFProcessor()

    # Documents expected to contain tables (paths relative to the CWD).
    candidates = [
        "../Obs/Management-of-Normal-Labourchart.pdf",
        "../Obs/Management-of-thrombocytopaenia-in-pregnancy-Sept-5.pdf"
    ]

    for pdf_path in map(Path, candidates):
        if not pdf_path.exists():
            continue

        print(f"\nπ Testing table extraction with: {pdf_path.name}")
        try:
            outcome = pdf_processor.process_single_pdf(pdf_path)
            table_count = outcome.summary['total_tables']

            if not table_count > 0:
                print(f"β οΈ No tables found in {pdf_path.name}")
                continue

            print(f"β Found {table_count} tables!")

            # Per-page breakdown; pages without tables are skipped.
            for page in outcome.content:
                if not page.tables:
                    continue
                print(f" Page {page.page_number}: {len(page.tables)} table(s)")
                for number, table in enumerate(page.tables, start=1):
                    print(f" Table {number}: {table.shape[0]} rows Γ {table.shape[1]} columns")
                    if not table.empty:
                        print(f" Columns: {list(table.columns)}")
            return True
        except Exception as e:
            print(f"β Table extraction test failed: {e}")

    return False
def validate_output_structure():
    """Validate the per-document output written under ``test_output``.

    Every sub-directory of ``test_output`` (resolved against the current
    working directory) is treated as one processed document and must hold:

    * ``summary.json`` - a JSON object with ``summary`` and ``file_info`` keys
    * ``extracted_text.txt``

    Returns:
        bool: True only when at least one document directory exists and all
        of them pass the checks above; False otherwise. An output directory
        without any document sub-directory counts as a failure, since there
        is nothing to confirm the pipeline actually produced output.
    """
    print("\nπ Validating output structure...")
    test_output_dir = Path("test_output")

    # is_dir() also rejects a regular file that happens to be named
    # "test_output", which would otherwise make iterdir() raise.
    if not test_output_dir.is_dir():
        print("β Test output directory not found")
        return False

    # Sorted so the report order is stable across platforms.
    document_dirs = sorted(
        item for item in test_output_dir.iterdir() if item.is_dir()
    )
    if not document_dirs:
        # Nothing to validate must not be reported as a pass.
        print("No document output directories found in test_output")
        return False

    all_valid = True
    for document_dir in document_dirs:
        expected_files = (
            document_dir / "summary.json",
            document_dir / "extracted_text.txt",
        )
        for expected_file in expected_files:
            if not expected_file.is_file():
                print(f"β Missing: {expected_file}")
                all_valid = False
                continue

            print(f"β Found: {expected_file}")
            if expected_file.name != "summary.json":
                continue

            # Only summary.json gets a content check.
            try:
                with open(expected_file, encoding="utf-8") as f:
                    data = json.load(f)
            except (json.JSONDecodeError, UnicodeDecodeError):
                print(" β Invalid JSON format")
                all_valid = False
                continue

            # The payload must be an object; a bare number or string would
            # make the membership test below meaningless or raise.
            if isinstance(data, dict) and 'summary' in data and 'file_info' in data:
                print(" π Valid JSON structure")
            else:
                print(" β οΈ Missing keys in JSON")
                all_valid = False

    return all_valid
def main():
    """Run the three checks in sequence and print a pass/fail summary.

    The checks run in a fixed order: single-document processing, table
    extraction, then output-structure validation.

    Returns:
        bool: True when all three checks passed, False otherwise.
    """
    banner = "=" * 60

    def verdict(passed):
        # Label shown next to each check in the summary.
        return 'β PASS' if passed else 'β FAIL'

    print("π Starting Enhanced PDF Processor Tests")
    print(banner)

    single_ok = test_single_document()
    tables_ok = test_table_extraction()
    structure_ok = validate_output_structure()

    print("\n" + banner)
    print("π TEST SUMMARY")
    print(f"Single Document Processing: {verdict(single_ok)}")
    print(f"Table Extraction: {verdict(tables_ok)}")
    print(f"Output Structure: {verdict(structure_ok)}")

    overall_success = all((single_ok, tables_ok, structure_ok))
    if overall_success:
        headline = 'β ALL TESTS PASSED'
        advice = "\nπ Ready to process all maternal health documents!"
    else:
        headline = 'β SOME TESTS FAILED'
        advice = "\nβ οΈ Please fix issues before processing all documents"

    print(f"\nπ― OVERALL: {headline}")
    print(advice)
    return overall_success
if __name__ == "__main__":
    # Map the overall result onto the process exit status:
    # 0 when every check passed, 1 otherwise.
    exit_code = 0 if main() else 1
    sys.exit(exit_code)