rivapereira123 committed on
Commit
84c93e1
·
verified ·
1 Parent(s): 370caed

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +10 -3
app.py CHANGED
@@ -435,11 +435,11 @@ class EnhancedGazaKnowledgeBase:
435
  logger.info(f"Cached {len(documents)} documents")
436
 
437
  return documents
438
-
439
  def _extract_pdf_text(self, pdf_path: Path) -> str:
440
- """Use unstructured to extract and chunk PDF text by title"""
441
  try:
442
- elements = partition_pdf(filename=str(pdf_path), strategy="hi_res")
443
  if not elements:
444
  logger.warning(f"No elements extracted from {pdf_path}")
445
  return ""
@@ -464,11 +464,18 @@ class EnhancedGazaKnowledgeBase:
464
  logger.warning(f"Extracted text too short from {pdf_path}")
465
  return ""
466
 
 
 
 
 
 
 
467
  return full_text
468
  except Exception as e:
469
  logger.error(f"Unstructured PDF parse failed for {pdf_path}: {e}")
470
  return ""
471
 
 
472
 
473
  def search(self, query: str, k: int = 5) -> List[Dict[str, Any]]:
474
  """Enhanced search with better error handling and result processing"""
 
435
  logger.info(f"Cached {len(documents)} documents")
436
 
437
  return documents
438
+
439
  def _extract_pdf_text(self, pdf_path: Path) -> str:
440
+ """Use unstructured to extract and chunk PDF text by title, and save as .txt"""
441
  try:
442
+ elements = partition_pdf(filename=str(pdf_path), strategy="auto")
443
  if not elements:
444
  logger.warning(f"No elements extracted from {pdf_path}")
445
  return ""
 
464
  logger.warning(f"Extracted text too short from {pdf_path}")
465
  return ""
466
 
467
+ # Save extracted output to .txt next to original PDF
468
+ txt_output = pdf_path.with_suffix(".extracted.txt")
469
+ with open(txt_output, "w", encoding="utf-8") as f:
470
+ f.write(full_text)
471
+ logger.info(f"Saved extracted text to {txt_output.name}")
472
+
473
  return full_text
474
  except Exception as e:
475
  logger.error(f"Unstructured PDF parse failed for {pdf_path}: {e}")
476
  return ""
477
 
478
+
479
 
480
  def search(self, query: str, k: int = 5) -> List[Dict[str, Any]]:
481
  """Enhanced search with better error handling and result processing"""