Spaces:
Running
Running
File size: 1,704 Bytes
717b6b1 ebe3e23 717b6b1 1f78813 521a314 717b6b1 521a314 1f78813 521a314 1f78813 521a314 1f78813 717b6b1 75183d4 521a314 717b6b1 521a314 ebe3e23 521a314 75183d4 521a314 717b6b1 75183d4 521a314 75183d4 521a314 1f78813 75183d4 521a314 717b6b1 |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 |
import gradio as gr
import pytesseract
from pdf2image import convert_from_path
from PIL import Image
# Languages that should be recognized simultaneously, written as a single
# "+"-joined string that is passed as the `lang` argument to pytesseract.
# (The matching language packs must be installed for Tesseract.)
AUTO_LANGS = "eng+fas+ara+rus+spa+fra"
def ocr_auto(input_file):
    """Extract text from a PDF file path or a PIL image via Tesseract.

    Args:
        input_file: Either a string path whose name ends in ``.pdf``
            (matched case-insensitively) or a ``PIL.Image.Image`` instance.

    Returns:
        The recognized text with leading/trailing whitespace stripped.
        For a PDF, every page's text is preceded by a ``--- Page N ---``
        marker line. Any other kind of input yields an empty string.
    """
    # Compare the suffix case-insensitively so that e.g. "SCAN.PDF" is not
    # silently treated as unsupported input.
    if isinstance(input_file, str) and input_file.lower().endswith(".pdf"):
        pages = convert_from_path(input_file)
        # Collect the per-page chunks and join once instead of growing a
        # string with += inside the loop.
        page_chunks = [
            f"\n--- Page {page_number} ---\n"
            f"{pytesseract.image_to_string(page, lang=AUTO_LANGS)}"
            for page_number, page in enumerate(pages, start=1)
        ]
        return "".join(page_chunks).strip()

    if isinstance(input_file, Image.Image):
        return pytesseract.image_to_string(input_file, lang=AUTO_LANGS).strip()

    # Unsupported input: keep the best-effort contract of returning no text
    # rather than raising.
    return ""
def gradio_interface():
    """Build the Gradio OCR interface and launch it.

    The interface has two inputs (a PDF/Image radio selector and a file
    upload restricted to .pdf/.png/.jpg/.jpeg) and one read-only text
    output holding the extracted text. This function calls ``launch()`` on
    the interface it builds.
    """
    input_type = gr.Radio(["PDF", "Image"], label="Choose Input Type", value="PDF")
    file_input = gr.File(
        label="Upload PDF/Image",
        file_types=[".pdf", ".png", ".jpg", ".jpeg"],
    )
    output_text = gr.Textbox(label="Extracted Text", interactive=False)

    def process(input_type, file):
        """Run OCR on the uploaded file and return the extracted text.

        Args:
            input_type: The radio selection, "PDF" or "Image". Used only as
                a fallback when the file extension is not recognized.
            file: The uploaded file as delivered by the ``gr.File`` input,
                or a falsy value when nothing was uploaded.

        Returns:
            The OCR text, or a warning message when no file was uploaded.
        """
        if not file:
            return "⚠️ Please upload a file first."

        # NOTE(review): depending on the Gradio version the upload may be a
        # file-like object exposing `.name` or a plain path string; accept
        # both -- confirm against the deployed Gradio version.
        path = getattr(file, "name", file)
        lowered = str(path).lower()

        # Trust the real file extension over the radio button: the radio
        # defaults to "PDF", so an uploaded image would otherwise be sent
        # down the PDF path and produce no text (and a PDF sent down the
        # image path would fail to open).
        if lowered.endswith(".pdf"):
            treat_as_pdf = True
        elif lowered.endswith((".png", ".jpg", ".jpeg")):
            treat_as_pdf = False
        else:
            treat_as_pdf = input_type == "PDF"

        if treat_as_pdf:
            return ocr_auto(path)

        # Use a context manager so the image's file handle is released once
        # OCR has finished.
        with Image.open(path) as image:
            return ocr_auto(image)

    gr.Interface(
        fn=process,
        inputs=[input_type, file_input],
        outputs=[output_text],
        title="Auto OCR (PDF/Image)",
        description="Upload a PDF or Image. OCR will automatically detect and extract text in multiple languages.",
    ).launch()
# Run: build the Gradio interface and launch it as soon as this file executes.
gradio_interface()
|