Spaces:
Sleeping
Sleeping
Update app.py
Browse files
app.py
CHANGED
@@ -435,11 +435,11 @@ class EnhancedGazaKnowledgeBase:
|
|
435 |
logger.info(f"Cached {len(documents)} documents")
|
436 |
|
437 |
return documents
|
438 |
-
|
439 |
def _extract_pdf_text(self, pdf_path: Path) -> str:
|
440 |
-
"""Use unstructured to extract and chunk PDF text by title"""
|
441 |
try:
|
442 |
-
elements = partition_pdf(filename=str(pdf_path), strategy="
|
443 |
if not elements:
|
444 |
logger.warning(f"No elements extracted from {pdf_path}")
|
445 |
return ""
|
@@ -464,11 +464,18 @@ class EnhancedGazaKnowledgeBase:
|
|
464 |
logger.warning(f"Extracted text too short from {pdf_path}")
|
465 |
return ""
|
466 |
|
|
|
|
|
|
|
|
|
|
|
|
|
467 |
return full_text
|
468 |
except Exception as e:
|
469 |
logger.error(f"Unstructured PDF parse failed for {pdf_path}: {e}")
|
470 |
return ""
|
471 |
|
|
|
472 |
|
473 |
def search(self, query: str, k: int = 5) -> List[Dict[str, Any]]:
|
474 |
"""Enhanced search with better error handling and result processing"""
|
|
|
435 |
logger.info(f"Cached {len(documents)} documents")
|
436 |
|
437 |
return documents
|
438 |
+
|
439 |
def _extract_pdf_text(self, pdf_path: Path) -> str:
|
440 |
+
"""Use unstructured to extract and chunk PDF text by title, and save as .txt"""
|
441 |
try:
|
442 |
+
elements = partition_pdf(filename=str(pdf_path), strategy="auto")
|
443 |
if not elements:
|
444 |
logger.warning(f"No elements extracted from {pdf_path}")
|
445 |
return ""
|
|
|
464 |
logger.warning(f"Extracted text too short from {pdf_path}")
|
465 |
return ""
|
466 |
|
467 |
+
# Save extracted output to .txt next to original PDF
|
468 |
+
txt_output = pdf_path.with_suffix(".extracted.txt")
|
469 |
+
with open(txt_output, "w", encoding="utf-8") as f:
|
470 |
+
f.write(full_text)
|
471 |
+
logger.info(f"Saved extracted text to {txt_output.name}")
|
472 |
+
|
473 |
return full_text
|
474 |
except Exception as e:
|
475 |
logger.error(f"Unstructured PDF parse failed for {pdf_path}: {e}")
|
476 |
return ""
|
477 |
|
478 |
+
|
479 |
|
480 |
def search(self, query: str, k: int = 5) -> List[Dict[str, Any]]:
|
481 |
"""Enhanced search with better error handling and result processing"""
|