talhashoaib committed on
Commit
6c75a17
·
verified ·
1 Parent(s): 69ea95d

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +3 -6
app.py CHANGED
@@ -3,7 +3,6 @@ import pdfplumber
3
  from pdf2image import convert_from_path
4
  import pytesseract
5
  import shutil
6
- import os
7
 
8
  def extract_text_debug(file_path):
9
  logs = []
@@ -13,7 +12,7 @@ def extract_text_debug(file_path):
13
  logs.append(f"pdftoppm path: {shutil.which('pdftoppm')}")
14
  logs.append(f"pdftocairo path: {shutil.which('pdftocairo')}")
15
 
16
- # 1️⃣ Try pdfplumber
17
  try:
18
  with pdfplumber.open(file_path) as pdf:
19
  for page in pdf.pages:
@@ -27,12 +26,10 @@ def extract_text_debug(file_path):
27
  except Exception as e:
28
  logs.append(f"❌ pdfplumber failed: {e}")
29
 
30
- # 2️⃣ OCR fallback
31
  try:
32
  images = convert_from_path(file_path, dpi=200, poppler_path="/usr/bin")
33
- ocr_text = []
34
- for img in images[:2]:
35
- ocr_text.append(pytesseract.image_to_string(img))
36
  text = "\n".join(ocr_text)
37
  if text.strip():
38
  logs.append("✅ OCR worked via pdf2image + Tesseract")
 
3
  from pdf2image import convert_from_path
4
  import pytesseract
5
  import shutil
 
6
 
7
  def extract_text_debug(file_path):
8
  logs = []
 
12
  logs.append(f"pdftoppm path: {shutil.which('pdftoppm')}")
13
  logs.append(f"pdftocairo path: {shutil.which('pdftocairo')}")
14
 
15
+ # Try pdfplumber
16
  try:
17
  with pdfplumber.open(file_path) as pdf:
18
  for page in pdf.pages:
 
26
  except Exception as e:
27
  logs.append(f"❌ pdfplumber failed: {e}")
28
 
29
+ # OCR fallback
30
  try:
31
  images = convert_from_path(file_path, dpi=200, poppler_path="/usr/bin")
32
+ ocr_text = [pytesseract.image_to_string(img) for img in images[:2]]
 
 
33
  text = "\n".join(ocr_text)
34
  if text.strip():
35
  logs.append("✅ OCR worked via pdf2image + Tesseract")