File size: 1,751 Bytes
499048f
 
a8a9a5f
7fe28ff
72f8b31
0ed62c0
152c89b
7fe28ff
 
0ed62c0
fed2de5
0ed62c0
7fe28ff
 
cd403d2
7fe28ff
 
 
 
0ed62c0
 
7fe28ff
 
0ed62c0
c1a7429
0ed62c0
c1a7429
7fe28ff
4a9700b
0ed62c0
7fe28ff
 
0ed62c0
 
 
 
7fe28ff
 
 
 
 
 
2fd6e92
0ed62c0
2fd6e92
79cef95
0ed62c0
 
 
 
79cef95
499048f
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
import gradio as gr
import pdfplumber
from pdf2image import convert_from_path
import pytesseract
import shutil
import os

def extract_text_debug(file_path):
    """Extract a short text preview from a PDF and append a debug log.

    The function first reads the embedded text layer with pdfplumber. If
    that yields nothing (or raises), it falls back to rasterising the first
    two pages with pdf2image and running Tesseract OCR on them.

    Args:
        file_path: Path to the PDF on disk. A falsy value (``None`` or an
            empty string) is treated as "no file" and short-circuits.

    Returns:
        A string made of up to 800 characters of extracted text (or
        "❌ No text extracted"), a ``---`` separator line, and the
        newline-joined debug log describing which steps ran.
    """
    if not file_path:
        # Nothing to open; avoid reporting two bogus extractor failures.
        return "❌ No file provided"

    logs = []

    # Debug paths
    pdftoppm = shutil.which("pdftoppm")
    logs.append(f"pdftoppm path: {pdftoppm}")
    logs.append(f"pdftocairo path: {shutil.which('pdftocairo')}")

    def render(body):
        # Every result is "<body>, separator, debug log".
        return body + "\n\n---\n" + "\n".join(logs)

    # 1️⃣ Try pdfplumber
    # Collected page by page (not in one expression) so text from pages read
    # before a mid-document failure is still available afterwards.
    page_texts = []
    try:
        with pdfplumber.open(file_path) as pdf:
            for page in pdf.pages:
                page_texts.append(page.extract_text() or "")
    except Exception as e:
        logs.append(f"❌ pdfplumber failed: {e}")
    else:
        if any(chunk.strip() for chunk in page_texts):
            logs.append("✅ Extracted text using pdfplumber")
            # Join with a newline so words at page boundaries do not merge.
            return render("\n".join(page_texts)[:800])
        logs.append("⚠️ pdfplumber gave empty text, trying OCR…")

    text = "\n".join(page_texts)

    # 2️⃣ OCR fallback
    # Use the directory where pdftoppm was actually found; keep the previous
    # hard-coded location as the fallback when it is not on PATH.
    poppler_dir = os.path.dirname(pdftoppm) if pdftoppm else "/usr/bin"
    try:
        # Only the first two pages are OCR'd, so only those are rasterised.
        images = convert_from_path(
            file_path,
            dpi=200,
            first_page=1,
            last_page=2,
            poppler_path=poppler_dir,
        )
        text = "\n".join(pytesseract.image_to_string(img) for img in images)
        if text.strip():
            logs.append("✅ OCR worked via pdf2image + Tesseract")
        else:
            logs.append("⚠️ OCR returned empty text")
    except Exception as e:
        logs.append(f"❌ OCR fallback failed: {e}")

    return render(text[:800] if text.strip() else "❌ No text extracted")

# Build the UI: a PDF upload widget wired to a read-only text area that shows
# the extracted preview followed by the debug log.
with gr.Blocks() as demo:
    gr.Markdown("# 📄 PDF Extractor Debug")
    pdf_input = gr.File(type="filepath", file_types=[".pdf"])
    result_box = gr.Textbox(label="Text + Debug Logs", lines=20)
    # Re-run the extraction every time the uploaded file changes.
    pdf_input.change(
        fn=extract_text_debug,
        inputs=pdf_input,
        outputs=result_box,
    )

# Start the app only when this file is executed as a script.
if __name__ == "__main__":
    demo.launch()