Spaces:
Sleeping
Sleeping
import json | |
import base64 | |
import pathlib | |
import pdfplumber | |
import streamlit as st | |
import fillpdf | |
from fillpdf import fillpdfs | |
########################################################## | |
# Display PDF function | |
def displayPDF(file):
    """Render the PDF stored at ``file`` inline on the Streamlit page.

    The file is read in binary mode, base64-encoded, and embedded as a
    ``data:`` URI inside an ``<embed>`` element that is emitted through
    ``st.markdown`` with ``unsafe_allow_html=True``.

    Args:
        file: Path of the PDF to show; passed straight to ``open``.
    """
    # Read the raw bytes of the document.
    with open(file, "rb") as pdf_handle:
        raw_bytes = pdf_handle.read()

    # Base64 text is what the data URI carries.
    encoded = base64.b64encode(raw_bytes).decode('utf-8')

    # Fixed 800x800 viewer element holding the whole document inline.
    embed_tag = f'<embed src="data:application/pdf;base64,{encoded}" width="800" height="800" type="application/pdf">'

    # Raw HTML must be explicitly allowed for the <embed> tag to be rendered.
    st.markdown(embed_tag, unsafe_allow_html=True)
########################################################## | |
# Page header and introduction.
st.set_page_config(page_title="AcroForms Data Extractor")
st.title("AcroForms Data Extractor")
st.markdown("""
This app allows you to extract AcroForms data from PDF files. Simply upload a PDF file and the app will generate a downloadable text file containing the extracted data.
""")

# Upload PDF
pdf_file = st.file_uploader("Upload PDF File", type=["pdf"])

if pdf_file is not None:
    # The file name comes from the client. Keep only its final path
    # component so directory parts in the name cannot redirect the write
    # to somewhere other than ``uploads/``.
    safe_name = pathlib.Path(pdf_file.name).name

    # Save the upload to disk; displayPDF and fillpdfs both take a file path.
    # ``exist_ok=True`` replaces the separate exists() check, which could
    # race with another session creating the directory at the same time.
    uploads_dir = pathlib.Path("uploads")
    uploads_dir.mkdir(parents=True, exist_ok=True)
    pdf_path = str(uploads_dir / safe_name)
    with open(pdf_path, "wb") as f:
        f.write(pdf_file.getbuffer())

    # Display PDF
    st.divider()
    st.markdown("**PDF Display:**")
    displayPDF(pdf_path)

    # Print form data
    st.divider()
    form_data = fillpdfs.get_form_fields(pdf_path)
    st.markdown("\n\n**PDF AcroForm:**")
    st.write(form_data)

    # Serialize the form fields so they can be offered as a JSON download.
    form_txt = json.dumps(form_data)
    st.download_button(
        label='Download AcroForm JSON',
        data=form_txt,
        file_name='form.json',
        mime='application/json',
    )

    # Print number of pages, metadata and extracted text
    st.divider()
    st.markdown("**PDF to Text:**")
    with pdfplumber.open(pdf_file) as pdf:
        pages = pdf.pages

        # Number of pages
        st.markdown("**Number of Pages**")
        st.write(f"Number of Pages: {len(pages)}")

        # Metadata
        st.markdown("**Metadata**")
        metadata = pdf.metadata
        st.code(metadata)

        # Extract text while the document is still open. Each page's text is
        # followed by a blank line. The ``or ""`` keeps the join safe should
        # extract_text yield None for a page.
        # NOTE(review): confirm whether the installed pdfplumber version can
        # return None here.
        text = "".join(
            (page.extract_text(layout=True) or "") + "\n\n"
            for page in pages
        )

    st.markdown("**Text**")
    st.text(text)

    # Allow text to be downloaded. ``stem`` drops only the final extension,
    # whatever its letter case, instead of deleting every ".pdf" substring
    # found anywhere in the name.
    st.download_button(
        label="Download PDF Text",
        data=text,
        file_name=f"{pathlib.Path(safe_name).stem}_text.txt",
        mime="text/plain",
    )