Spaces:
Sleeping
Sleeping
Update app.py
Browse files
app.py
CHANGED
@@ -3,7 +3,6 @@ import pdfplumber
|
|
3 |
from pdf2image import convert_from_path
|
4 |
import pytesseract
|
5 |
import shutil
|
6 |
-
import os
|
7 |
|
8 |
def extract_text_debug(file_path):
|
9 |
logs = []
|
@@ -13,7 +12,7 @@ def extract_text_debug(file_path):
|
|
13 |
logs.append(f"pdftoppm path: {shutil.which('pdftoppm')}")
|
14 |
logs.append(f"pdftocairo path: {shutil.which('pdftocairo')}")
|
15 |
|
16 |
-
#
|
17 |
try:
|
18 |
with pdfplumber.open(file_path) as pdf:
|
19 |
for page in pdf.pages:
|
@@ -27,12 +26,10 @@ def extract_text_debug(file_path):
|
|
27 |
except Exception as e:
|
28 |
logs.append(f"❌ pdfplumber failed: {e}")
|
29 |
|
30 |
-
#
|
31 |
try:
|
32 |
images = convert_from_path(file_path, dpi=200, poppler_path="/usr/bin")
|
33 |
-
ocr_text = []
|
34 |
-
for img in images[:2]:
|
35 |
-
ocr_text.append(pytesseract.image_to_string(img))
|
36 |
text = "\n".join(ocr_text)
|
37 |
if text.strip():
|
38 |
logs.append("✅ OCR worked via pdf2image + Tesseract")
|
|
|
3 |
from pdf2image import convert_from_path
|
4 |
import pytesseract
|
5 |
import shutil
|
|
|
6 |
|
7 |
def extract_text_debug(file_path):
|
8 |
logs = []
|
|
|
12 |
logs.append(f"pdftoppm path: {shutil.which('pdftoppm')}")
|
13 |
logs.append(f"pdftocairo path: {shutil.which('pdftocairo')}")
|
14 |
|
15 |
+
# Try pdfplumber
|
16 |
try:
|
17 |
with pdfplumber.open(file_path) as pdf:
|
18 |
for page in pdf.pages:
|
|
|
26 |
except Exception as e:
|
27 |
logs.append(f"❌ pdfplumber failed: {e}")
|
28 |
|
29 |
+
# OCR fallback
|
30 |
try:
|
31 |
images = convert_from_path(file_path, dpi=200, poppler_path="/usr/bin")
|
32 |
+
ocr_text = [pytesseract.image_to_string(img) for img in images[:2]]
|
|
|
|
|
33 |
text = "\n".join(ocr_text)
|
34 |
if text.strip():
|
35 |
logs.append("✅ OCR worked via pdf2image + Tesseract")
|