Tassawar committed on
Commit
880593d
·
verified ·
1 Parent(s): b695c33

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +74 -43
app.py CHANGED
@@ -4,51 +4,74 @@ from docx import Document
4
  from io import BytesIO
5
  from pdf2image import convert_from_bytes
6
  import pytesseract
 
 
 
 
7
 
8
def pdf_to_word(pdf_file, password=None):
    """Convert a PDF file to a Word file with optional decryption and OCR.

    Pages that expose a text layer are copied directly; pages that do not are
    rendered to an image and passed through Tesseract.

    Args:
        pdf_file: Binary file-like object containing the PDF.
        password: Password for an encrypted PDF; ignored when the PDF is not
            encrypted.

    Returns:
        A BytesIO, rewound to offset 0, holding the generated .docx file.

    Raises:
        ValueError: If the PDF is encrypted and the password is missing or
            does not unlock it.
    """
    # Read the bytes before building the reader: PdfReader moves the stream
    # position while parsing, so a later read() would return a partial file
    # and the OCR fallback would receive corrupt data.
    pdf_bytes = pdf_file.read()
    reader = PdfReader(BytesIO(pdf_bytes))

    # Captured up front because the flag describes the file, not whether it
    # has been unlocked yet.
    encrypted = reader.is_encrypted
    if encrypted:
        if not password:
            raise ValueError("The PDF is encrypted. Please provide a password.")
        try:
            unlocked = reader.decrypt(password)
        except Exception as e:
            raise ValueError("Failed to decrypt the PDF. Check the password.") from e
        # A wrong password is reported through a falsy return value rather
        # than an exception, so it has to be checked explicitly.
        if not unlocked:
            raise ValueError("Failed to decrypt the PDF. Check the password.")

    document = Document()

    for page_number, page in enumerate(reader.pages, start=1):
        text = page.extract_text()  # extract once; it is the expensive call
        if text:
            document.add_paragraph(text)
            continue

        # Rasterise only the page that needs OCR. Converting the whole file
        # here would append every page's OCR text once per image-only page.
        images = convert_from_bytes(
            pdf_bytes,
            first_page=page_number,
            last_page=page_number,
            userpw=password if encrypted else None,
        )
        for image in images:
            # Tesseract ends its output with a form feed, which python-docx
            # refuses to store because it is not a valid XML character.
            ocr_text = pytesseract.image_to_string(image).replace("\x0c", "")
            if ocr_text.strip():
                document.add_paragraph(ocr_text)
            else:
                document.add_paragraph("[This page contains non-extractable content or images]")

    word_file = BytesIO()
    document.save(word_file)
    word_file.seek(0)
    return word_file
 
 
 
 
 
 
45
 
46
  # Streamlit app configuration
47
  st.set_page_config(page_title="PDF to Word Converter", page_icon="🖋", layout="centered")
48
 
49
  # App header
50
- st.title("PDF to Word Converter")
51
- st.write("Upload a PDF file")
52
 
53
  # Upload PDF file widget
54
  uploaded_file = st.file_uploader("Choose a PDF file", type="pdf")
@@ -57,11 +80,19 @@ if uploaded_file is not None:
57
  # Optionally ask for a password if the PDF is encrypted
58
  password = st.text_input("Enter PDF password (if encrypted)", type="password")
59
 
60
- try:
61
- # Convert the PDF to Word
62
- word_file = pdf_to_word(uploaded_file, password)
 
 
63
 
64
- # Provide a download link for the Word file
65
- st.download_button("Download Word file", word_file, file_name="converted.docx")
66
- except Exception as e:
67
- st.error(f"Error: {e}")
 
 
 
 
 
 
 
4
  from io import BytesIO
5
  from pdf2image import convert_from_bytes
6
  import pytesseract
7
+ import time
8
+
9
+ # Configure Tesseract path (if needed)
10
+ # pytesseract.pytesseract.tesseract_cmd = r'/path/to/tesseract'
11
 
12
def pdf_to_word(pdf_file, password=None):
    """Convert a PDF file to a Word file with optional decryption and OCR.

    Pages that expose a text layer are copied directly; pages that do not are
    rendered to an image and passed through Tesseract. Progress is reported
    through Streamlit widgets while the pages are processed.

    Args:
        pdf_file: Binary file-like object containing the PDF. When it exposes
            a ``type`` attribute, that value must be ``"application/pdf"``.
        password: Password for an encrypted PDF; ignored when the PDF is not
            encrypted.

    Returns:
        A BytesIO, rewound to offset 0, holding the generated .docx file.

    Raises:
        ValueError: For a non-PDF upload, a missing or wrong password, or any
            other failure during conversion (the original exception is
            chained as the cause).
    """
    # Plain file objects carry no ``type``; reject only when a MIME type is
    # actually declared and it is not PDF.
    declared_type = getattr(pdf_file, "type", None)
    if declared_type is not None and declared_type != "application/pdf":
        raise ValueError("Invalid file type. Please upload a PDF file.")

    try:
        # Start from the beginning in case the stream was consumed earlier.
        rewind = getattr(pdf_file, "seek", None)
        if callable(rewind):
            rewind(0)

        # Read the bytes before building the reader: PdfReader moves the
        # stream position while parsing, so a later read() would hand the OCR
        # fallback a partial file.
        pdf_bytes = pdf_file.read()
        reader = PdfReader(BytesIO(pdf_bytes))

        # Captured before unlocking: the flag describes the file itself.
        encrypted = reader.is_encrypted
        _unlock_reader(reader, password)
        ocr_password = password if encrypted else None

        document = Document()

        total_pages = len(reader.pages)
        progress_bar = st.progress(0)
        status_text = st.empty()

        for page_number, page in enumerate(reader.pages, start=1):
            status_text.text(f"Processing page {page_number} of {total_pages}...")
            progress_bar.progress(page_number / total_pages)

            text = page.extract_text()
            # A whitespace-only result carries no content, so treat it like a
            # page without a text layer and fall back to OCR.
            if text and text.strip():
                document.add_paragraph(_strip_control_chars(text))
            else:
                _append_ocr_text(document, pdf_bytes, page_number, ocr_password)

        word_file = BytesIO()
        document.save(word_file)
        word_file.seek(0)
        return word_file

    except ValueError:
        # Already a user-facing message; wrapping it again would only stack
        # prefixes in front of it.
        raise
    except Exception as e:
        raise ValueError(f"An error occurred: {e}") from e


def _unlock_reader(reader, password):
    """Decrypt *reader* in place, raising ValueError if it cannot be unlocked."""
    if not reader.is_encrypted:
        return
    if not password:
        raise ValueError("The PDF is encrypted. Please provide a password.")
    try:
        unlocked = reader.decrypt(password)
    except Exception as e:
        raise ValueError("Failed to decrypt the PDF. Check the password.") from e
    # A wrong password is reported through a falsy return value rather than
    # an exception, so it has to be checked explicitly.
    if not unlocked:
        raise ValueError("Failed to decrypt the PDF. Check the password.")


def _strip_control_chars(text):
    """Remove C0 control characters other than tab, newline and carriage return."""
    # python-docx stores text as XML and rejects these characters; Tesseract
    # output in particular ends with a form feed.
    return "".join(ch for ch in text if ch >= " " or ch in "\t\n\r")


def _append_ocr_text(document, pdf_bytes, page_number, userpw):
    """OCR one 1-based page of the PDF and append the result to *document*."""
    images = convert_from_bytes(
        pdf_bytes,
        first_page=page_number,
        last_page=page_number,
        userpw=userpw,  # the rasteriser needs the password for encrypted files
    )
    for image in images:
        ocr_text = _strip_control_chars(pytesseract.image_to_string(image))
        if ocr_text.strip():
            document.add_paragraph(ocr_text)
        else:
            document.add_paragraph("[This page contains non-extractable content or images]")
68
 
69
  # Streamlit app configuration
70
  st.set_page_config(page_title="PDF to Word Converter", page_icon="🖋", layout="centered")
71
 
72
  # App header
73
+ st.title("📄 PDF to Word Converter")
74
+ st.write("Upload a PDF file to convert it into an editable Word document.")
75
 
76
  # Upload PDF file widget
77
  uploaded_file = st.file_uploader("Choose a PDF file", type="pdf")
 
80
  # Optionally ask for a password if the PDF is encrypted
81
  password = st.text_input("Enter PDF password (if encrypted)", type="password")
82
 
83
+ if st.button("Convert to Word"):
84
+ try:
85
+ # Convert the PDF to Word
86
+ with st.spinner("Converting PDF to Word..."):
87
+ word_file = pdf_to_word(uploaded_file, password)
88
 
89
+ # Provide a download link for the Word file
90
+ st.success("Conversion successful!")
91
+ st.download_button(
92
+ label="Download Word file",
93
+ data=word_file,
94
+ file_name="converted.docx",
95
+ mime="application/vnd.openxmlformats-officedocument.wordprocessingml.document"
96
+ )
97
+ except Exception as e:
98
+ st.error(f"Error: {e}")