Spaces:
Sleeping
Sleeping
import gradio as gr | |
import pdfplumber | |
from pdf2image import convert_from_path | |
import pytesseract | |
import shutil | |
import os | |
def _format_debug_output(text, logs, max_chars=800):
    """Return a preview of *text* followed by the debug log.

    The preview is the first *max_chars* characters of *text*, or a
    "no text" marker when *text* is empty/whitespace. The log lines are
    appended below a ``---`` separator, one per line.
    """
    preview = text[:max_chars] if text.strip() else "❌ No text extracted"
    return preview + "\n\n---\n" + "\n".join(logs)


def extract_text_debug(file_path):
    """Extract a text preview from a PDF and report how it was obtained.

    The embedded text layer is read with pdfplumber first. If that fails
    or yields nothing, the first pages are rasterised with pdf2image and
    run through Tesseract OCR.

    Args:
        file_path: Path to the PDF file. A falsy value (``None`` or ``""``,
            e.g. when the upload widget is cleared) short-circuits without
            invoking either extractor.

    Returns:
        A single string: up to 800 characters of extracted text (or
        "❌ No text extracted"), a ``---`` separator, then the debug log
        lines describing tool locations and which step succeeded or failed.
    """
    ocr_page_limit = 2  # OCR is slow; only the leading pages are sampled.

    pdftoppm = shutil.which("pdftoppm")
    logs = [
        f"pdftoppm path: {pdftoppm}",
        f"pdftocairo path: {shutil.which('pdftocairo')}",
    ]

    # Nothing to open: report it once instead of logging two confusing
    # extractor failures caused by the missing path.
    if not file_path:
        logs.append("⚠️ No file provided")
        return _format_debug_output("", logs)

    # 1️⃣ Try pdfplumber
    page_texts = []
    try:
        with pdfplumber.open(file_path) as pdf:
            for page in pdf.pages:
                page_texts.append(page.extract_text() or "")
    except Exception as e:  # debug boundary: surface any failure in the log
        logs.append(f"❌ pdfplumber failed: {e}")
        plumber_ok = False
    else:
        plumber_ok = True

    # Join pages with a newline so words at page boundaries do not fuse.
    text = "\n".join(page_texts)
    if plumber_ok:
        if text.strip():
            logs.append("✅ Extracted text using pdfplumber")
            return _format_debug_output(text, logs)
        logs.append("⚠️ pdfplumber gave empty text, trying OCR…")

    # 2️⃣ OCR fallback
    # Use the directory where pdftoppm was actually found; fall back to the
    # previous hard-coded location when it is not on PATH.
    poppler_dir = os.path.dirname(pdftoppm) if pdftoppm else "/usr/bin"
    try:
        # Restrict rasterisation to the pages that will be OCR'd rather than
        # converting the whole document and discarding most of it.
        images = convert_from_path(
            file_path,
            dpi=200,
            first_page=1,
            last_page=ocr_page_limit,
            poppler_path=poppler_dir,
        )
        ocr_text = "\n".join(pytesseract.image_to_string(img) for img in images)
    except Exception as e:  # debug boundary: surface any failure in the log
        logs.append(f"❌ OCR fallback failed: {e}")
    else:
        if ocr_text.strip():
            text = ocr_text
            logs.append("✅ OCR worked via pdf2image + Tesseract")
        else:
            # Keep any partial pdfplumber text rather than discarding it.
            logs.append("⚠️ OCR returned empty text")

    return _format_debug_output(text, logs)
# Gradio UI: a PDF upload widget wired to a read-only text box that shows
# the extracted preview together with the debug log.
with gr.Blocks() as demo:
    gr.Markdown("# 📄 PDF Extractor Debug")

    # Only PDFs are accepted; the component is configured with
    # type="filepath" so the handler is given a path rather than file bytes.
    inp = gr.File(
        type="filepath",
        file_types=[".pdf"],
    )
    out = gr.Textbox(
        label="Text + Debug Logs",
        lines=20,
    )

    # Re-run the extraction whenever the file selection changes.
    inp.change(fn=extract_text_debug, inputs=inp, outputs=out)


# Start the app only when this file is executed as a script.
if __name__ == "__main__":
    demo.launch()