PDF-Editor

Sleeping

App Files Community

Tassawar committed on Feb 11

Commit

880593d

verified ·

1 Parent(s): b695c33

Update app.py

Browse files

Files changed (1) hide show

app.py +74 -43

app.py CHANGED Viewed

@@ -4,51 +4,74 @@ from docx import Document
 from io import BytesIO
 from pdf2image import convert_from_bytes
 import pytesseract
 def pdf_to_word(pdf_file, password=None):
     """Convert a PDF file to a Word file with optional decryption and OCR."""
-    # Ensure the file is a valid file-like object
-    reader = PdfReader(pdf_file)
-    # Decrypt the PDF if it's encrypted
-    if reader.is_encrypted:
-        if password:
-            try:
-                reader.decrypt(password)
-            except Exception as e:
-                raise ValueError("Failed to decrypt the PDF. Check the password.") from e
-        else:
-            raise ValueError("The PDF is encrypted. Please provide a password.")
-    document = Document()
-    # Extract text from each page
-    pdf_bytes = pdf_file.read()
-    for page in reader.pages:
-        if page.extract_text():  # Use PyPDF2 for text extraction
             text = page.extract_text()
-            document.add_paragraph(text)
-        else:
-            # Use OCR for non-extractable pages
-            images = convert_from_bytes(pdf_bytes)
-            for image in images:
-                ocr_text = pytesseract.image_to_string(image)
-                if ocr_text.strip():
-                    document.add_paragraph(ocr_text)
-                else:
-                    document.add_paragraph("[This page contains non-extractable content or images]")
-    word_file = BytesIO()
-    document.save(word_file)
-    word_file.seek(0)
-    return word_file
 # Streamlit app configuration
 st.set_page_config(page_title="PDF to Word Converter", page_icon="🖋", layout="centered")
 # App header
-st.title("PDF to Word Converter")
-st.write("Upload a PDF file")
 # Upload PDF file widget
 uploaded_file = st.file_uploader("Choose a PDF file", type="pdf")
@@ -57,11 +80,19 @@ if uploaded_file is not None:
     # Optionally ask for a password if the PDF is encrypted
     password = st.text_input("Enter PDF password (if encrypted)", type="password")
-    try:
-        # Convert the PDF to Word
-        word_file = pdf_to_word(uploaded_file, password)
-        # Provide a download link for the Word file
-        st.download_button("Download Word file", word_file, file_name="converted.docx")
-    except Exception as e:
-        st.error(f"Error: {e}")

 from io import BytesIO
 from pdf2image import convert_from_bytes
 import pytesseract
+import time
+# Configure Tesseract path (if needed)
+# pytesseract.pytesseract.tesseract_cmd = r'/path/to/tesseract'
 def pdf_to_word(pdf_file, password=None):
     """Convert a PDF file to a Word file with optional decryption and OCR."""
+    try:
+        # Ensure the file is a valid PDF
+        if pdf_file.type != "application/pdf":
+            raise ValueError("Invalid file type. Please upload a PDF file.")
+        # Initialize PDF reader
+        reader = PdfReader(pdf_file)
+        # Decrypt the PDF if it's encrypted
+        if reader.is_encrypted:
+            if password:
+                try:
+                    reader.decrypt(password)
+                except Exception as e:
+                    raise ValueError("Failed to decrypt the PDF. Check the password.") from e
+            else:
+                raise ValueError("The PDF is encrypted. Please provide a password.")
+        # Create a Word document
+        document = Document()
+        # Extract text from each page
+        pdf_bytes = pdf_file.read()
+        total_pages = len(reader.pages)
+        progress_bar = st.progress(0)
+        status_text = st.empty()
+        for i, page in enumerate(reader.pages):
+            status_text.text(f"Processing page {i + 1} of {total_pages}...")
+            progress_bar.progress((i + 1) / total_pages)
+            # Try extracting text directly
             text = page.extract_text()
+            if text:
+                document.add_paragraph(text)
+            else:
+                # Use OCR for non-extractable pages
+                images = convert_from_bytes(pdf_bytes, first_page=i + 1, last_page=i + 1)
+                for image in images:
+                    ocr_text = pytesseract.image_to_string(image)
+                    if ocr_text.strip():
+                        document.add_paragraph(ocr_text)
+                    else:
+                        document.add_paragraph("[This page contains non-extractable content or images]")
+        # Save the Word document to a BytesIO object
+        word_file = BytesIO()
+        document.save(word_file)
+        word_file.seek(0)
+        return word_file
+    except Exception as e:
+        raise ValueError(f"An error occurred: {e}")
 # Streamlit app configuration
 st.set_page_config(page_title="PDF to Word Converter", page_icon="🖋", layout="centered")
 # App header
+st.title("📄 PDF to Word Converter")
+st.write("Upload a PDF file to convert it into an editable Word document.")
 # Upload PDF file widget
 uploaded_file = st.file_uploader("Choose a PDF file", type="pdf")
     # Optionally ask for a password if the PDF is encrypted
     password = st.text_input("Enter PDF password (if encrypted)", type="password")
+    if st.button("Convert to Word"):
+        try:
+            # Convert the PDF to Word
+            with st.spinner("Converting PDF to Word..."):
+                word_file = pdf_to_word(uploaded_file, password)
+            # Provide a download link for the Word file
+            st.success("Conversion successful!")
+            st.download_button(
+                label="Download Word file",
+                data=word_file,
+                file_name="converted.docx",
+                mime="application/vnd.openxmlformats-officedocument.wordprocessingml.document"
+            )
+        except Exception as e:
+            st.error(f"Error: {e}")