dminhk's picture
Create app.py
ac446e5
raw
history blame
2.79 kB
import json
import base64
import pathlib
import pdfplumber
import streamlit as st
import fillpdf
from fillpdf import fillpdfs
##########################################################
# Display PDF function
def displayPDF(file):
    """Show the PDF stored at ``file`` inline on the Streamlit page.

    The whole file is read, base64-encoded, and placed in a ``data:`` URI
    inside an 800x800 ``<embed>`` element.

    Args:
        file: Path of the PDF file to display.
    """
    # Read the raw bytes first so the handle is closed before encoding.
    with open(file, "rb") as handle:
        raw_bytes = handle.read()
    encoded = base64.b64encode(raw_bytes).decode('utf-8')

    # The tag is raw HTML, so Streamlit must be told to render it unescaped.
    embed_tag = (
        '<embed src="data:application/pdf;base64,'
        + encoded
        + '" width="800" height="800" type="application/pdf">'
    )
    st.markdown(embed_tag, unsafe_allow_html=True)
##########################################################
# Page chrome: the same title is used for the browser tab and the heading.
APP_TITLE = "AcroForms Data Extractor"
APP_INTRO = """
This app allows you to extract AcroForms data from PDF files. Simply upload a PDF file and the app will generate a downloadable text file containing the extracted data.
"""

st.set_page_config(page_title=APP_TITLE)
st.title(APP_TITLE)
st.markdown(APP_INTRO)

# Upload widget restricted to PDFs; the code below only runs once this is
# no longer None.
pdf_file = st.file_uploader("Upload PDF File", type=["pdf"])
if pdf_file is not None:
    # The file name is supplied by the client, so keep only its final path
    # component; this stops a name such as "../../x.pdf" from being written
    # outside the uploads directory.
    safe_name = pathlib.Path(pdf_file.name).name

    # Save the upload to disk because displayPDF and fillpdfs are given a
    # file path here, not the uploaded file object.
    uploads_dir = pathlib.Path("uploads")
    # exist_ok avoids the check-then-create race between concurrent sessions.
    uploads_dir.mkdir(parents=True, exist_ok=True)
    saved_file = uploads_dir / safe_name
    saved_file.write_bytes(pdf_file.getbuffer())
    pdf_path = str(saved_file)

    # Display PDF
    st.divider()
    st.markdown("**PDF Display:**")
    displayPDF(pdf_path)

    # Print form data
    st.divider()
    form_data = fillpdfs.get_form_fields(pdf_path)
    st.markdown("\n\n**PDF AcroForm:**")
    st.write(form_data)

    # Serialize the field dictionary so it can be offered as a download.
    form_txt = json.dumps(form_data)
    st.download_button(
        label='Download AcroForm JSON',
        data=form_txt,
        file_name='form.json',
        mime='application/json',
    )

    # Print number of pages, metadata, and extracted text
    st.divider()
    st.markdown("**PDF to Text:**")
    with pdfplumber.open(pdf_file) as pdf:
        pages = pdf.pages

        # Number of pages
        st.markdown("**Number of Pages**")
        st.write(f"Number of Pages: {len(pages)}")

        # Metadata
        st.markdown("**Metadata**")
        metadata = pdf.metadata
        st.code(metadata)

        # Extract text while the PDF is still open. A page that yields no
        # text is treated as an empty string so one blank page cannot abort
        # the whole extraction; every page is followed by a blank line.
        text = "".join(
            (page.extract_text(layout=True) or "") + "\n\n" for page in pages
        )

    st.markdown("**Text**")
    st.text(text)

    # Allow the text to be downloaded. Path.stem drops only the final
    # extension, whatever its letter case, instead of deleting every ".pdf"
    # substring in the name.
    st.download_button(
        label="Download PDF Text",
        data=text,
        file_name=f"{pathlib.Path(safe_name).stem}_text.txt",
        mime="text/plain",
    )