|
import shutil
from pathlib import Path

import gradio as gr
import pdfplumber
import pytesseract
from pdf2image import convert_from_path
|
|
|
# Longest text preview (in characters) included in the returned report.
_PREVIEW_CHARS = 800
# OCR is slow, so only the first pages are rasterized and recognized.
_OCR_MAX_PAGES = 2
_OCR_DPI = 200
# Poppler directory used when `pdftoppm` cannot be located on PATH.
_DEFAULT_POPPLER_DIR = "/usr/bin"
_LOG_SEPARATOR = "\n\n---\n"


def extract_text_debug(file_path):
    """Extract a text preview from a PDF and append a diagnostic log.

    The embedded text layer is read with pdfplumber first. If that yields no
    text or raises, the first ``_OCR_MAX_PAGES`` pages are rasterized with
    pdf2image and recognized with Tesseract.

    Args:
        file_path: Path of the PDF to inspect. A falsy value (no file) is
            reported in the log without attempting any extraction.

    Returns:
        A single string: up to ``_PREVIEW_CHARS`` characters of extracted
        text (or "❌ No text extracted"), a ``---`` separator, and one log
        line per diagnostic step. Extraction errors are never raised; they
        are recorded in the log instead.
    """
    logs = []

    def _report(text):
        # Single place that formats "<preview or failure marker> --- <logs>".
        preview = text[:_PREVIEW_CHARS] if text.strip() else "❌ No text extracted"
        return preview + _LOG_SEPARATOR + "\n".join(logs)

    pdftoppm = shutil.which("pdftoppm")
    logs.append(f"pdftoppm path: {pdftoppm}")
    logs.append(f"pdftocairo path: {shutil.which('pdftocairo')}")

    if not file_path:
        logs.append("⚠️ No file provided")
        return _report("")

    # Stage 1: embedded text layer.
    page_texts = []
    try:
        with pdfplumber.open(file_path) as pdf:
            # Explicit loop (not a comprehension) so that pages read before a
            # mid-document failure are kept as a last-resort result.
            for page in pdf.pages:
                page_texts.append(page.extract_text() or "")
    except Exception as e:  # debug boundary: report the failure, keep going
        logs.append(f"❌ pdfplumber failed: {e}")
    else:
        if any(chunk.strip() for chunk in page_texts):
            logs.append("✅ Extracted text using pdfplumber")
            return _report("\n".join(page_texts))
        logs.append("⚠️ pdfplumber gave empty text, trying OCR…")

    # Join pages with a newline so words at page boundaries do not fuse.
    # This partial text is only shown if the OCR stage below fails as well.
    text = "\n".join(page_texts)

    # Stage 2: OCR fallback.
    try:
        # Use the poppler install that was just detected on PATH; fall back to
        # the previously hard-coded location when detection finds nothing.
        poppler_dir = str(Path(pdftoppm).parent) if pdftoppm else _DEFAULT_POPPLER_DIR
        # Rasterize only the pages that will be OCR'd instead of the whole
        # document; pdf2image clamps last_page to the real page count.
        images = convert_from_path(
            file_path,
            dpi=_OCR_DPI,
            first_page=1,
            last_page=_OCR_MAX_PAGES,
            poppler_path=poppler_dir,
        )
        text = "\n".join(pytesseract.image_to_string(img) for img in images)
    except Exception as e:  # debug boundary: report the failure in the log
        logs.append(f"❌ OCR fallback failed: {e}")
    else:
        if text.strip():
            logs.append("✅ OCR worked via pdf2image + Tesseract")
        else:
            logs.append("⚠️ OCR returned empty text")

    return _report(text)
|
|
|
# Minimal debug UI: a PDF upload widget wired to a textbox that shows the
# extracted text preview followed by the diagnostic log.
with gr.Blocks() as demo:
    gr.Markdown("# 📄 PDF Extractor Debug")
    # The upload widget is restricted to .pdf files and configured with
    # type="filepath"; its value is what extract_text_debug receives.
    inp = gr.File(file_types=[".pdf"], type="filepath")
    out = gr.Textbox(lines=20, label="Text + Debug Logs")
    # Re-run the extraction whenever the file widget's value changes.
    inp.change(extract_text_debug, inputs=inp, outputs=out)
|
|
|
# Launch the Gradio app only when this file is executed as a script, so that
# importing the module does not start a server.
if __name__ == "__main__":
    demo.launch()
|
|