import gradio as gr
import pdfplumber
from pdf2image import convert_from_path
import pytesseract
import shutil
import os

# Maximum number of extracted characters shown in the UI.
PREVIEW_CHARS = 800
# Rasterisation resolution handed to pdf2image for the OCR fallback.
OCR_DPI = 200
# Only the first few pages are OCR'd to keep the debug run fast.
OCR_MAX_PAGES = 2
# Directory used for poppler when `pdftoppm` cannot be found on PATH.
DEFAULT_POPPLER_DIR = "/usr/bin"


def _format_output(text, logs):
    """Return the text preview (or a failure marker) followed by the logs."""
    preview = text[:PREVIEW_CHARS] if text.strip() else "❌ No text extracted"
    return preview + "\n\n---\n" + "\n".join(logs)


def extract_text_debug(file_path):
    """Extract text from a PDF and append a debug log of what was tried.

    The embedded text layer is read with pdfplumber first. If that yields
    nothing (or raises), the first ``OCR_MAX_PAGES`` pages are rasterised
    with pdf2image and OCR'd with Tesseract.

    Args:
        file_path: Path to the PDF on disk, or ``None`` when the upload
            widget has been cleared.

    Returns:
        A string holding at most ``PREVIEW_CHARS`` characters of extracted
        text (or a "no text" marker), a ``---`` separator, and one log line
        per step attempted.
    """
    logs = []
    text = ""

    # The change event also fires when the file is removed; there is
    # nothing to parse in that case, so skip both extractors.
    if not file_path:
        logs.append("⚠️ No file provided")
        return _format_output(text, logs)

    # Debug paths
    pdftoppm_path = shutil.which("pdftoppm")
    logs.append(f"pdftoppm path: {pdftoppm_path}")
    logs.append(f"pdftocairo path: {shutil.which('pdftocairo')}")

    # 1️⃣ Try pdfplumber
    page_texts = []
    try:
        with pdfplumber.open(file_path) as pdf:
            for page in pdf.pages:
                page_texts.append(page.extract_text() or "")
    except Exception as e:
        logs.append(f"❌ pdfplumber failed: {e}")
        # Keep whatever was read before the failure as a last resort in
        # case the OCR fallback below fails as well.
        text = "\n".join(page_texts)
    else:
        # Join pages with a newline so words at page boundaries don't fuse.
        text = "\n".join(page_texts)
        if text.strip():
            logs.append("✅ Extracted text using pdfplumber")
            return _format_output(text, logs)
        logs.append("⚠️ pdfplumber gave empty text, trying OCR…")

    # 2️⃣ OCR fallback
    # Use the directory where pdftoppm actually lives when it is on PATH,
    # falling back to the conventional location otherwise.
    poppler_dir = (
        os.path.dirname(pdftoppm_path) if pdftoppm_path else DEFAULT_POPPLER_DIR
    )
    try:
        # Limit conversion to the pages that will be OCR'd instead of
        # rasterising the whole document and discarding most of it.
        images = convert_from_path(
            file_path,
            dpi=OCR_DPI,
            last_page=OCR_MAX_PAGES,
            poppler_path=poppler_dir,
        )
        text = "\n".join(
            pytesseract.image_to_string(img) for img in images[:OCR_MAX_PAGES]
        )
        if text.strip():
            logs.append("✅ OCR worked via pdf2image + Tesseract")
        else:
            logs.append("⚠️ OCR returned empty text")
    except Exception as e:
        logs.append(f"❌ OCR fallback failed: {e}")

    return _format_output(text, logs)


# Minimal UI: upload a PDF and show the extracted text plus debug logs.
with gr.Blocks() as demo:
    gr.Markdown("# 📄 PDF Extractor Debug")
    inp = gr.File(file_types=[".pdf"], type="filepath")
    out = gr.Textbox(lines=20, label="Text + Debug Logs")
    inp.change(extract_text_debug, inputs=inp, outputs=out)

if __name__ == "__main__":
    demo.launch()