Spaces:

suprimedev
/

pdf2text5

Runtime error

App Files Files Community

suprimedev committed 23 days ago

Commit

4193c04

verified ·

1 Parent(s): 975a37a

Update app.py

Browse files

Files changed (1) hide show

app.py +47 -8

app.py CHANGED Viewed

@@ -6,11 +6,24 @@ import pytesseract
 from PIL import Image
 import io
 import numpy as np
 def extract_text_from_pdf(pdf_file):
     try:
-        # باز کردن فایل PDF
-        doc = fitz.open(stream=pdf_file.read(), filetype="pdf")
         full_text = ""
         has_ocr_processed = False
@@ -26,11 +39,18 @@ def extract_text_from_pdf(pdf_file):
                 # تبدیل صفحه به تصویر با وضوح بالا
                 mat = fitz.Matrix(300/72, 300/72)  # وضوح 300 DPI
                 pix = page.get_pixmap(matrix=mat)
-                img_data = pix.tobytes("ppm")
                 # استفاده از OCR برای استخراج متن از تصویر
                 image = Image.open(io.BytesIO(img_data))
-                ocr_text = pytesseract.image_to_string(image, lang='fas+ara+eng')
                 text = ocr_text
                 has_ocr_processed = True
@@ -39,6 +59,9 @@ def extract_text_from_pdf(pdf_file):
         doc.close()
         # پردازش متن برای زبان‌های راست‌به‌چپ
         try:
             reshaped_text = arabic_reshaper.reshape(full_text)
@@ -48,10 +71,17 @@ def extract_text_from_pdf(pdf_file):
                 bidi_text = "[⚠️ برخی صفحات با OCR پردازش شدند]\n\n" + bidi_text
             return bidi_text
-        except:
             return full_text
     except Exception as e:
         return f"خطا در پردازش فایل: {str(e)}"
 # ایجاد رابط Gradio
@@ -60,12 +90,21 @@ with gr.Blocks(title="PDF Text Extractor with OCR") as demo:
     gr.Markdown("با قابلیت پردازش OCR برای PDFهای تصویری")
     with gr.Row():
-        pdf_input = gr.File(label="فایل PDF را انتخاب کنید", file_types=[".pdf"])
     extract_btn = gr.Button("🔄 استخراج متن")
     with gr.Row():
-        text_output = gr.Textbox(label="متن استخراج شده", lines=20, interactive=False)
     gr.Markdown("""
     **⚠️ توجه:**
@@ -82,4 +121,4 @@ with gr.Blocks(title="PDF Text Extractor with OCR") as demo:
     )
 if __name__ == "__main__":
-    demo.launch()

 from PIL import Image
 import io
 import numpy as np
+import tempfile
+import os
 def extract_text_from_pdf(pdf_file):
     try:
+        # ایجاد فایل موقت برای PDF
+        with tempfile.NamedTemporaryFile(delete=False, suffix='.pdf') as tmp_file:
+            if hasattr(pdf_file, 'read'):
+                # اگر فایل قابل خواندن است
+                tmp_file.write(pdf_file.read())
+            else:
+                # اگر از Gradio File component است
+                tmp_file.write(pdf_file)
+            tmp_path = tmp_file.name
+        # باز کردن فایل PDF از مسیر
+        doc = fitz.open(tmp_path)
         full_text = ""
         has_ocr_processed = False
                 # تبدیل صفحه به تصویر با وضوح بالا
                 mat = fitz.Matrix(300/72, 300/72)  # وضوح 300 DPI
                 pix = page.get_pixmap(matrix=mat)
+                img_data = pix.tobytes("png")
                 # استفاده از OCR برای استخراج متن از تصویر
                 image = Image.open(io.BytesIO(img_data))
+                # تنظیمات OCR برای زبان‌های مختلف
+                custom_config = r'--oem 3 --psm 6'
+                ocr_text = pytesseract.image_to_string(
+                    image,
+                    lang='fas+ara+eng',
+                    config=custom_config
+                )
                 text = ocr_text
                 has_ocr_processed = True
         doc.close()
+        # حذف فایل موقت
+        os.unlink(tmp_path)
         # پردازش متن برای زبان‌های راست‌به‌چپ
         try:
             reshaped_text = arabic_reshaper.reshape(full_text)
                 bidi_text = "[⚠️ برخی صفحات با OCR پردازش شدند]\n\n" + bidi_text
             return bidi_text
+        except Exception as proc_error:
+            print(f"خطا در پردازش متن: {proc_error}")
             return full_text
     except Exception as e:
+        # حذف فایل موقت در صورت خطا
+        try:
+            if 'tmp_path' in locals():
+                os.unlink(tmp_path)
+        except:
+            pass
         return f"خطا در پردازش فایل: {str(e)}"
 # ایجاد رابط Gradio
     gr.Markdown("با قابلیت پردازش OCR برای PDFهای تصویری")
     with gr.Row():
+        pdf_input = gr.File(
+            label="فایل PDF را انتخاب کنید",
+            file_types=[".pdf"],
+            type="bytes"  # استفاده از bytes به جای file path
+        )
     extract_btn = gr.Button("🔄 استخراج متن")
     with gr.Row():
+        text_output = gr.Textbox(
+            label="متن استخراج شده",
+            lines=20,
+            interactive=False,
+            show_copy_button=True
+        )
     gr.Markdown("""
     **⚠️ توجه:**
     )
 if __name__ == "__main__":
+    demo.launch(server_name="0.0.0.0", server_port=7860)