Spaces:
Sleeping
Sleeping
File size: 1,751 Bytes
499048f a8a9a5f 7fe28ff 72f8b31 0ed62c0 152c89b 7fe28ff 0ed62c0 fed2de5 0ed62c0 7fe28ff cd403d2 7fe28ff 0ed62c0 7fe28ff 0ed62c0 c1a7429 0ed62c0 c1a7429 7fe28ff 4a9700b 0ed62c0 7fe28ff 0ed62c0 7fe28ff 2fd6e92 0ed62c0 2fd6e92 79cef95 0ed62c0 79cef95 499048f |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 |
import gradio as gr
import pdfplumber
from pdf2image import convert_from_path
import pytesseract
import shutil
import os
def extract_text_debug(file_path):
    """Extract a short text preview from a PDF and append a debug log.

    The embedded text layer is read with pdfplumber first. If that yields
    nothing (e.g. a scanned document) or raises, the first pages are
    rasterised with pdf2image and passed through Tesseract OCR instead.

    Args:
        file_path: Filesystem path of the PDF. ``None`` or an empty string
            (what the UI passes when the upload is cleared) is reported as
            "no file" without attempting any extraction.

    Returns:
        A single string: up to 800 characters of extracted text (or
        "❌ No text extracted"), a ``---`` divider, then one log line per
        step describing which tools were found and which strategy
        succeeded or failed. Library errors are logged, never raised.
    """
    preview_chars = 800
    ocr_page_limit = 2
    logs = []

    def render(body):
        # Every exit path shares the same "preview / divider / logs" layout.
        preview = body[:preview_chars] if body.strip() else "❌ No text extracted"
        return preview + "\n\n---\n" + "\n".join(logs)

    # Debug paths: record which Poppler binaries are reachable on PATH.
    pdftoppm_path = shutil.which("pdftoppm")
    logs.append(f"pdftoppm path: {pdftoppm_path}")
    logs.append(f"pdftocairo path: {shutil.which('pdftocairo')}")

    # A cleared upload arrives as None; skip extraction instead of logging
    # two confusing library errors about an invalid path.
    if not file_path:
        logs.append("⚠️ No file provided")
        return render("")

    # 1️⃣ Try pdfplumber
    page_texts = []
    try:
        with pdfplumber.open(file_path) as pdf:
            # Append page by page so text gathered before a mid-document
            # failure is still available as a last-resort result.
            for page in pdf.pages:
                page_text = page.extract_text()
                if page_text:
                    page_texts.append(page_text)
    except Exception as exc:
        logs.append(f"❌ pdfplumber failed: {exc}")
        plumber_ok = False
    else:
        plumber_ok = True

    # Join with a newline so the last word of one page is not glued to the
    # first word of the next (matches how the OCR pages are joined below).
    text = "\n".join(page_texts)
    if plumber_ok:
        if text.strip():
            logs.append("✅ Extracted text using pdfplumber")
            return render(text)
        logs.append("⚠️ pdfplumber gave empty text, trying OCR…")

    # 2️⃣ OCR fallback
    # Prefer the directory where pdftoppm was actually found; keep the
    # original "/usr/bin" as the fallback when it is not on PATH.
    poppler_dir = os.path.dirname(pdftoppm_path) if pdftoppm_path else "/usr/bin"
    try:
        # Only the first pages are OCR'd, so only those are rasterised;
        # rendering the whole document at 200 dpi is slow and memory-hungry.
        images = convert_from_path(
            file_path,
            dpi=200,
            first_page=1,
            last_page=ocr_page_limit,
            poppler_path=poppler_dir,
        )
        text = "\n".join(pytesseract.image_to_string(img) for img in images)
        if text.strip():
            logs.append("✅ OCR worked via pdf2image + Tesseract")
        else:
            logs.append("⚠️ OCR returned empty text")
    except Exception as exc:
        logs.append(f"❌ OCR fallback failed: {exc}")

    return render(text)
# UI layout: a PDF upload whose path is passed to extract_text_debug, with
# the returned text and debug log shown in a text box.
with gr.Blocks() as demo:
    gr.Markdown("# 📄 PDF Extractor Debug")
    inp = gr.File(
        type="filepath",
        file_types=[".pdf"],
    )
    out = gr.Textbox(
        label="Text + Debug Logs",
        lines=20,
    )
    # Run the extraction whenever the file component's value changes.
    inp.change(fn=extract_text_debug, inputs=inp, outputs=out)

# Launch the app only when this file is run as a script.
if __name__ == "__main__":
    demo.launch()
|