PDF-Editor

Sleeping

App Files Files Community

PDF-Editor / app.py

Tassawar

Update app.py

880593d verified about 1 month ago

raw

history blame contribute delete

3.53 kB

	import streamlit as st
	from PyPDF2 import PdfReader
	from docx import Document
	from io import BytesIO
	from pdf2image import convert_from_bytes
	import pytesseract
	import time

	# Configure Tesseract path (if needed)
	# pytesseract.pytesseract.tesseract_cmd = r'/path/to/tesseract'

	def _ocr_page_paragraphs(pdf_bytes, page_number, password=None):
	    """Rasterize one PDF page and return the paragraphs recovered by OCR.

	    Args:
	        pdf_bytes: The complete PDF document as bytes.
	        page_number: 1-based number of the page to rasterize.
	        password: Optional password forwarded to the rasterizer as the
	            user password so encrypted documents can be rendered.

	    Returns:
	        A list with one string per rendered image: the OCR text, or a
	        placeholder when OCR found nothing on that image.
	    """
	    images = convert_from_bytes(
	        pdf_bytes,
	        first_page=page_number,
	        last_page=page_number,
	        userpw=password or None,
	    )
	    paragraphs = []
	    for image in images:
	        ocr_text = pytesseract.image_to_string(image)
	        if ocr_text.strip():
	            paragraphs.append(ocr_text)
	        else:
	            paragraphs.append("[This page contains non-extractable content or images]")
	    return paragraphs


	def pdf_to_word(pdf_file, password=None):
	    """Convert a PDF file to a Word file with optional decryption and OCR.

	    Each page's text is extracted directly when possible; pages without
	    extractable text are rasterized and run through Tesseract OCR. Progress
	    is reported through a Streamlit progress bar and status placeholder.

	    Args:
	        pdf_file: Seekable binary file-like object with a ``type`` attribute
	            holding its MIME type (e.g. the object returned by
	            ``st.file_uploader``).
	        password: Optional password for encrypted PDFs. When omitted or
	            empty, an empty password is tried before giving up.

	    Returns:
	        A ``BytesIO`` positioned at offset 0 containing the ``.docx`` data.

	    Raises:
	        ValueError: If the file is not a PDF, cannot be decrypted, or any
	            other error occurs during conversion (the original exception is
	            chained as ``__cause__``).
	    """
	    try:
	        # Ensure the file is a valid PDF
	        if pdf_file.type != "application/pdf":
	            raise ValueError("Invalid file type. Please upload a PDF file.")

	        # Read the bytes once, from the start, BEFORE handing anything to the
	        # parser: reading the stream after PdfReader has moved its position
	        # would give the OCR fallback truncated or empty data.
	        pdf_file.seek(0)
	        pdf_bytes = pdf_file.read()
	        reader = PdfReader(BytesIO(pdf_bytes))

	        # Decrypt the PDF if it's encrypted
	        if reader.is_encrypted:
	            try:
	                # Many PDFs are encrypted with an empty user password, so
	                # try "" when the caller supplied nothing.
	                unlocked = reader.decrypt(password or "")
	            except Exception as e:
	                if password:
	                    raise ValueError("Failed to decrypt the PDF. Check the password.") from e
	                raise ValueError("The PDF is encrypted. Please provide a password.") from e
	            # decrypt() reports a wrong password with a falsy result instead
	            # of raising, so the return value has to be checked.
	            if not unlocked:
	                if password:
	                    raise ValueError("Failed to decrypt the PDF. Check the password.")
	                raise ValueError("The PDF is encrypted. Please provide a password.")

	        # Create a Word document
	        document = Document()

	        total_pages = len(reader.pages)
	        progress_bar = st.progress(0)
	        status_text = st.empty()

	        for i, page in enumerate(reader.pages):
	            page_number = i + 1
	            status_text.text(f"Processing page {page_number} of {total_pages}...")

	            # Try extracting text directly
	            text = page.extract_text()
	            if text:
	                document.add_paragraph(text)
	            else:
	                # Use OCR for non-extractable pages
	                for paragraph in _ocr_page_paragraphs(pdf_bytes, page_number, password):
	                    document.add_paragraph(paragraph)

	            # Advance the bar only after the page has actually been handled.
	            progress_bar.progress(page_number / total_pages)

	        # Save the Word document to a BytesIO object
	        word_file = BytesIO()
	        document.save(word_file)
	        word_file.seek(0)

	        return word_file

	    except ValueError:
	        # Already a user-facing message; do not wrap it a second time.
	        raise
	    except Exception as e:
	        raise ValueError(f"An error occurred: {e}") from e

	# ---- Page setup ---------------------------------------------------------
	st.set_page_config(
	    page_title="PDF to Word Converter",
	    page_icon="🖋",
	    layout="centered",
	)

	# ---- Header -------------------------------------------------------------
	st.title("📄 PDF to Word Converter")
	st.write("Upload a PDF file to convert it into an editable Word document.")

	# ---- Input: the PDF to convert ------------------------------------------
	uploaded_file = st.file_uploader(label="Choose a PDF file", type="pdf")

	if uploaded_file is not None:
	    # The password field is shown for every upload; it may be left blank.
	    pdf_password = st.text_input(
	        label="Enter PDF password (if encrypted)",
	        type="password",
	    )
	    convert_clicked = st.button(label="Convert to Word")

	    if convert_clicked:
	        try:
	            # Run the conversion while a spinner is displayed.
	            with st.spinner("Converting PDF to Word..."):
	                docx_buffer = pdf_to_word(uploaded_file, pdf_password)

	            # Conversion finished: announce it and offer the result.
	            st.success("Conversion successful!")
	            st.download_button(
	                label="Download Word file",
	                data=docx_buffer,
	                file_name="converted.docx",
	                mime="application/vnd.openxmlformats-officedocument.wordprocessingml.document",
	            )
	        except Exception as exc:
	            # Any failure is reported in the page instead of crashing the app.
	            st.error(f"Error: {exc}")