# PDF-Editor
#
# Sleeping
#
# File size: 3,532 Bytes

import streamlit as st
from PyPDF2 import PdfReader
from docx import Document
from io import BytesIO
from pdf2image import convert_from_bytes
import pytesseract
import time

# Configure Tesseract path (if needed)
# pytesseract.pytesseract.tesseract_cmd = r'/path/to/tesseract'

def pdf_to_word(pdf_file, password=None):
    """Convert an uploaded PDF into an in-memory Word document.

    Each page's text is extracted with PyPDF2. Pages that yield no usable
    text are rendered with pdf2image and passed through Tesseract OCR.
    Progress is reported through a Streamlit progress bar and status line.

    Args:
        pdf_file: Uploaded file object exposing a ``type`` attribute and
            ``seek``/``read`` methods (the caller passes the result of
            ``st.file_uploader``).
        password: Optional password used to decrypt an encrypted PDF. An
            empty string is treated the same as ``None``.

    Returns:
        A ``BytesIO`` positioned at offset 0 that holds the ``.docx`` data.

    Raises:
        ValueError: If the upload is not a PDF, if the PDF is encrypted and
            the password is missing or wrong, or if any other error occurs
            during conversion (the original exception is chained as the
            cause).
    """
    try:
        # Ensure the file is a valid PDF
        if pdf_file.type != "application/pdf":
            raise ValueError("Invalid file type. Please upload a PDF file.")

        # Read the upload exactly once, from the beginning. Parsing the
        # stream first and calling read() afterwards would hand the OCR
        # fallback only whatever is left after the parser's current position.
        pdf_file.seek(0)
        pdf_bytes = pdf_file.read()
        reader = PdfReader(BytesIO(pdf_bytes))

        # Decrypt the PDF if it's encrypted
        encrypted = reader.is_encrypted
        if encrypted:
            if not password:
                raise ValueError("The PDF is encrypted. Please provide a password.")
            try:
                unlocked = reader.decrypt(password)
            except Exception as e:
                raise ValueError("Failed to decrypt the PDF. Check the password.") from e
            # decrypt() reports a wrong password through a falsy return value
            # rather than by raising, so the result must be checked.
            if not unlocked:
                raise ValueError("Failed to decrypt the PDF. Check the password.")

        # The OCR fallback re-opens the raw bytes, so it needs the password
        # as well.
        # NOTE(review): this forwards it as the *user* password; a PDF opened
        # with its owner password may still fail to render - confirm.
        ocr_kwargs = {"userpw": password} if encrypted else {}

        # Create a Word document
        document = Document()

        total_pages = len(reader.pages)
        progress_bar = st.progress(0)
        status_text = st.empty()

        for i, page in enumerate(reader.pages):
            page_number = i + 1
            status_text.text(f"Processing page {page_number} of {total_pages}...")

            # Try extracting text directly; whitespace-only output counts as
            # "nothing extracted" so such pages still get the OCR fallback.
            text = page.extract_text()
            if text and text.strip():
                document.add_paragraph(text)
            else:
                # Use OCR for non-extractable pages
                images = convert_from_bytes(
                    pdf_bytes,
                    first_page=page_number,
                    last_page=page_number,
                    **ocr_kwargs,
                )
                for image in images:
                    ocr_text = pytesseract.image_to_string(image)
                    if ocr_text.strip():
                        document.add_paragraph(ocr_text)
                    else:
                        document.add_paragraph("[This page contains non-extractable content or images]")

            # Advance the bar only after the page has actually been handled.
            progress_bar.progress(page_number / total_pages)

        # Save the Word document to a BytesIO object
        word_file = BytesIO()
        document.save(word_file)
        word_file.seek(0)

        return word_file

    except ValueError:
        # Already a user-facing message; re-raise without wrapping it a
        # second time in "An error occurred: ...".
        raise
    except Exception as e:
        raise ValueError(f"An error occurred: {e}") from e

# ---- Streamlit page setup ----
st.set_page_config(
    page_title="PDF to Word Converter",
    page_icon="🖋",
    layout="centered",
)

# MIME type advertised for the generated .docx download
DOCX_MIME = "application/vnd.openxmlformats-officedocument.wordprocessingml.document"

# ---- Header: title plus a one-line description ----
st.title("📄 PDF to Word Converter")
st.write("Upload a PDF file to convert it into an editable Word document.")

# ---- Input: file picker restricted to PDFs ----
pdf_upload = st.file_uploader("Choose a PDF file", type="pdf")

if pdf_upload is not None:
    # The password field only matters for encrypted PDFs; it may be left blank.
    pdf_password = st.text_input("Enter PDF password (if encrypted)", type="password")
    convert_clicked = st.button("Convert to Word")

    if convert_clicked:
        try:
            # Run the conversion behind a spinner so the user sees activity.
            with st.spinner("Converting PDF to Word..."):
                docx_buffer = pdf_to_word(pdf_upload, pdf_password)

            # On success, announce it and offer the result for download.
            st.success("Conversion successful!")
            st.download_button(
                label="Download Word file",
                data=docx_buffer,
                file_name="converted.docx",
                mime=DOCX_MIME,
            )
        except Exception as exc:
            # Any failure is shown in the page instead of crashing the app.
            st.error(f"Error: {exc}")