# File size: 1,704 Bytes

import gradio as gr
import pytesseract
from pdf2image import convert_from_path
from PIL import Image

# Languages that should be supported simultaneously
# (the corresponding language packs must be installed for Tesseract).
AUTO_LANGS = "eng+fas+ara+rus+spa+fra"

def ocr_auto(input_file):
    """Extract text from a PDF file path or a PIL image with Tesseract.

    Args:
        input_file: Either a string path to a PDF file (the ``.pdf``
            extension is matched case-insensitively) or a
            ``PIL.Image.Image`` instance.

    Returns:
        The recognized text with leading/trailing whitespace stripped.
        For a PDF, each page's text is preceded by a ``--- Page N ---``
        header line. Any other kind of input yields an empty string.
    """
    # Compare the extension case-insensitively so "scan.PDF" is not
    # silently ignored.
    if isinstance(input_file, str) and input_file.lower().endswith('.pdf'):
        pages = convert_from_path(input_file)
        # Build the result in one join instead of repeated string "+=".
        extracted_text = "".join(
            f"\n--- Page {page_number} ---\n"
            f"{pytesseract.image_to_string(page, lang=AUTO_LANGS)}"
            for page_number, page in enumerate(pages, start=1)
        )
    elif isinstance(input_file, Image.Image):
        extracted_text = pytesseract.image_to_string(input_file, lang=AUTO_LANGS)
    else:
        # Unsupported input: keep the historical "empty result" contract.
        extracted_text = ""

    return extracted_text.strip()

def gradio_interface():
    """Build the OCR web UI and launch the Gradio server.

    The UI consists of a radio button selecting the input type
    ("PDF" or "Image"), a file upload restricted to PDF/PNG/JPEG
    extensions, and a read-only textbox showing the extracted text.
    """
    input_type = gr.Radio(["PDF", "Image"], label="Choose Input Type", value="PDF")
    file_input = gr.File(label="Upload PDF/Image", file_types=[".pdf", ".png", ".jpg", ".jpeg"])
    output_text = gr.Textbox(label="Extracted Text", interactive=False)

    def process(input_type, file):
        """Run OCR on the uploaded file according to the selected type.

        Args:
            input_type: The radio selection, "PDF" or "Image".
            file: The uploaded file as delivered by ``gr.File``, or a
                falsy value when nothing was uploaded.

        Returns:
            The extracted text, or a warning message if no file was given.
        """
        if not file:
            return "⚠️ Please upload a file first."

        # Depending on the Gradio version/configuration, the upload arrives
        # either as a plain path string or as a file-like object exposing
        # the path via ``.name``; accept both.
        path = file if isinstance(file, str) else file.name

        if input_type == "PDF":
            return ocr_auto(path)

        # Use a context manager so the image's file handle is released
        # once OCR has finished.
        with Image.open(path) as image:
            return ocr_auto(image)

    gr.Interface(
        fn=process,
        inputs=[input_type, file_input],
        outputs=[output_text],
        title="Auto OCR (PDF/Image)",
        description="Upload a PDF or Image. OCR will automatically detect and extract text in multiple languages."
    ).launch()

# Run: build the interface and launch it whenever this module is executed.
gradio_interface()