Spaces:
Sleeping
Sleeping
Update processing_utility.py
Browse files- processing_utility.py +47 -0
processing_utility.py
CHANGED
|
@@ -45,6 +45,10 @@ from llama_index.readers.file import PyMuPDFReader
|
|
| 45 |
|
| 46 |
import PyPDF2
|
| 47 |
|
|
|
|
|
|
|
|
|
|
|
|
|
| 48 |
class Insurance(BaseModel):
|
| 49 |
"""
|
| 50 |
A Pydantic model to define the data schema for extraction.
|
|
@@ -56,6 +60,49 @@ class Insurance(BaseModel):
|
|
| 56 |
class Insurance(BaseModel):
|
| 57 |
headings: str = Field(description="An array of headings")
|
| 58 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 59 |
def process_pdf_chunk(chunk_path: str) -> str:
|
| 60 |
"""
|
| 61 |
Worker function for the ProcessPoolExecutor.
|
|
|
|
| 45 |
|
| 46 |
import PyPDF2
|
| 47 |
|
| 48 |
+
# Module-level cache for the LlamaExtract agent; stays None until initialize_llama_extract_agent() succeeds
|
| 49 |
+
llama_extract_agent = None
|
| 50 |
+
|
| 51 |
+
|
| 52 |
class Insurance(BaseModel):
|
| 53 |
"""
|
| 54 |
A Pydantic model to define the data schema for extraction.
|
|
|
|
| 60 |
class Insurance(BaseModel):
|
| 61 |
headings: str = Field(description="An array of headings")
|
| 62 |
|
| 63 |
+
def initialize_llama_extract_agent():
    """Populate the module-level ``llama_extract_agent`` if it is still unset.

    When the global already holds an agent the function returns immediately.
    Otherwise it constructs a ``LlamaExtract`` client and asks it for the
    agent named ``"insurance-parser"``. Progress and failures are reported
    with ``print``; no exception is propagated to the caller.

    Returns:
        None. The result is communicated through the ``llama_extract_agent``
        global, which is left as ``None`` if initialization fails.
    """
    global llama_extract_agent

    # Guard clause: an agent is already cached, so there is nothing to do.
    if llama_extract_agent is not None:
        return

    print("Initializing LlamaExtract client and getting agent...")
    try:
        client = LlamaExtract()
        llama_extract_agent = client.get_agent(name="insurance-parser")
        print("LlamaExtract agent initialized.")
    except Exception as e:
        print(f"Error initializing LlamaExtract agent: {e}")
        # Reset explicitly so callers that test for None see the failure.
        llama_extract_agent = None
|
| 74 |
+
|
| 75 |
+
|
| 76 |
+
def extract_schema_from_file(file_path: str) -> Optional[Insurance]:
    """Run the cached LlamaExtract agent on ``file_path`` and return its data.

    The function first checks that ``file_path`` exists, then makes sure the
    module-level ``llama_extract_agent`` is available (initializing it on
    demand), and finally calls the agent's ``extract`` method with the path.
    Every failure mode is reported with ``print`` and mapped to ``None``;
    exceptions raised during extraction are caught, not propagated.

    Args:
        file_path: Path of the file to hand to the extraction agent.

    Returns:
        The ``data`` attribute of the extraction result when it is truthy;
        ``None`` if the path does not exist, the agent cannot be
        initialized, the result carries no data, or the call raises.
        NOTE(review): the annotation says ``Insurance``; whether ``data`` is
        actually an ``Insurance`` instance is not visible here -- confirm.
    """
    if not os.path.exists(file_path):
        print(f"❌ Error: The file '{file_path}' was not found.")
        return None

    # Lazily create the agent the first time it is needed.
    if llama_extract_agent is None:
        print("LlamaExtract agent not initialized. Attempting to initialize now.")
        initialize_llama_extract_agent()

    # Still unset after the attempt above means initialization failed.
    if llama_extract_agent is None:
        print("LlamaExtract agent failed to initialize. Cannot proceed with extraction.")
        return None

    print(f"🚀 Sending '{file_path}' to LlamaCloud for schema extraction...")

    try:
        run = llama_extract_agent.extract(file_path)

        # Handle the empty outcome first so the success path reads straight.
        if not (run and run.data):
            print("⚠️ Extraction did not return any data.")
            return None

        print("✅ Extraction successful!")
        return run.data

    except Exception as e:
        print(f"\n❌ An error occurred during the API call: {e}")
        print("Please check your API key, network connection, and file format.")
        return None
|
| 104 |
+
|
| 105 |
+
|
| 106 |
def process_pdf_chunk(chunk_path: str) -> str:
|
| 107 |
"""
|
| 108 |
Worker function for the ProcessPoolExecutor.
|