Spaces:

dminhk
/

AcroForms-Data-Extractor

Sleeping

File size: 2,789 Bytes

ac446e5

import json
import base64
import pathlib
import pdfplumber
import streamlit as st
import fillpdf
from fillpdf import fillpdfs

##########################################################
# Display PDF function
def displayPDF(file):
    """Render the PDF stored at path ``file`` inline in the Streamlit page.

    The file's bytes are base64-encoded and placed in a ``data:`` URI on an
    800x800 ``<embed>`` element, which is emitted through ``st.markdown``
    with raw HTML enabled.
    """
    # Read the whole document as raw bytes.
    with open(file, "rb") as pdf_handle:
        raw_bytes = pdf_handle.read()

    # Base64 output is plain ASCII, so it can sit inside the src attribute.
    encoded = base64.b64encode(raw_bytes).decode('utf-8')

    # Adjacent literals are concatenated into a single HTML tag.
    embed_html = (
        f'<embed src="data:application/pdf;base64,{encoded}" '
        'width="800" height="800" type="application/pdf">'
    )

    # unsafe_allow_html is needed for Streamlit to emit the raw <embed> tag.
    st.markdown(embed_html, unsafe_allow_html=True)
##########################################################

# The browser-tab title and the on-page heading share one label.
_APP_TITLE = "AcroForms Data Extractor"

# Introductory blurb shown under the heading (leading/trailing newlines kept).
_APP_INTRO = (
    "\n"
    "This app allows you to extract AcroForms data from PDF files. "
    "Simply upload a PDF file and the app will generate a downloadable "
    "text file containing the extracted data."
    "\n"
)

st.set_page_config(page_title=_APP_TITLE)
st.title(_APP_TITLE)
st.markdown(_APP_INTRO)

# File picker restricted to PDFs; the rest of the script runs only once this
# is something other than None.
pdf_file = st.file_uploader("Upload PDF File", type=["pdf"])

if pdf_file is not None:
    # The upload name comes from the client. Keep only its final path
    # component so a crafted name containing directory separators cannot
    # place the file outside the uploads directory.
    safe_name = pathlib.Path(pdf_file.name).name

    # Save the upload to disk: displayPDF and fillpdfs below both take a path.
    # exist_ok avoids a race when two sessions create the directory at once.
    uploads_dir = pathlib.Path("uploads")
    uploads_dir.mkdir(parents=True, exist_ok=True)
    pdf_path = str(uploads_dir / safe_name)
    with open(pdf_path, "wb") as f:
        f.write(pdf_file.getbuffer())

    # Display PDF
    st.divider()
    st.markdown("**PDF Display:**")
    displayPDF(pdf_path)

    # Print form data and offer it as a JSON download
    st.divider()
    form_data = fillpdfs.get_form_fields(pdf_path)
    st.markdown("\n\n**PDF AcroForm:**")
    st.write(form_data)
    # Serialize the field dictionary for the download button
    form_txt = json.dumps(form_data)
    st.download_button(
        label='Download AcroForm JSON',
        data=form_txt,
        file_name='form.json',
        mime='application/json',
    )

    # Print number of pages, metadata, and extracted text
    st.divider()
    st.markdown("**PDF to Text:**")
    with pdfplumber.open(pdf_file) as pdf:
        pages = pdf.pages

        # Number of Pages
        st.markdown("**Number of Pages**")
        st.write(f"Number of Pages: {len(pages)}")

        # Metadata
        st.markdown("**Metadata**")
        st.code(pdf.metadata)

        # Extract text page by page, separating pages with a blank line.
        # `or ""` guards against a page that yields no text object at all
        # (NOTE(review): some pdfplumber versions may return None for empty
        # pages - confirm against the pinned version), which would otherwise
        # raise TypeError on concatenation.
        text = "".join(
            (page.extract_text(layout=True) or "") + "\n\n" for page in pages
        )

        st.markdown("**Text**")
        st.text(text)

        # Allow text to be downloaded. Path.stem drops only the final
        # extension, regardless of case, instead of every ".pdf" substring.
        st.download_button(
            label="Download PDF Text",
            data=text,
            file_name=f"{pathlib.Path(safe_name).stem}_text.txt",
            mime="text/plain",
        )