Spaces:

dminhk
/

AcroForms-Data-Extractor

Sleeping

App Files Files Community

dminhk committed on Dec 15, 2023

Commit

ac446e5

1 Parent(s): bed9972

Create app.py

Browse files

Files changed (1) hide show

app.py +93 -0

app.py ADDED Viewed

	@@ -0,0 +1,93 @@

+import json
+import base64
+import pathlib
+import pdfplumber
+import streamlit as st
+import fillpdf
+from fillpdf import fillpdfs
+##########################################################
+# Display PDF function
+def displayPDF(file):
+    # Opening file from file path
+    with open(file, "rb") as f:
+        base64_pdf = base64.b64encode(f.read()).decode('utf-8')
+    # Embedding PDF in HTML
+    pdf_display = F'<embed src="data:application/pdf;base64,{base64_pdf}" width="800" height="800" type="application/pdf">'
+    # Displaying File
+    st.markdown(pdf_display, unsafe_allow_html=True)
+##########################################################
+st.set_page_config(page_title="AcroForms Data Extractor")
+st.title("AcroForms Data Extractor")
+st.markdown("""
+This app allows you to extract AcroForms data from PDF files. Simply upload a PDF file and the app will generate a downloadable text file containing the extracted data.
+""")
+# Upload PDF
+pdf_file = st.file_uploader("Upload PDF File", type=["pdf"])
+if pdf_file is not None:
+    # Save file to a directory
+    uploads_dir = pathlib.Path("uploads")
+    if not uploads_dir.exists():
+        uploads_dir.mkdir()
+    with open(f"{uploads_dir}/{pdf_file.name}", "wb") as f:
+        f.write(pdf_file.getbuffer())
+    # Get file path
+    pdf_path = f"{uploads_dir}/{pdf_file.name}"
+    # # Print path
+    # st.markdown("**PDF Path:**")
+    # st.write(pdf_path)
+    # Display PDF
+    st.divider()
+    st.markdown("**PDF Display:**")
+    displayPDF(pdf_path)
+    # Print Form Data
+    st.divider()
+    form_data = fillpdfs.get_form_fields(pdf_path)
+    st.markdown("\n\n**PDF AcroForm:**")
+    st.write(form_data)
+    # convert dictionary into string
+    form_txt = json.dumps(form_data)
+    # download button
+    st.download_button(
+        label='Download AcroForm JSON',
+        data=form_txt,
+        file_name='form.json',
+        mime='application/json',
+    )
+    # Print Number of Pages and Extract Texxt
+    st.divider()
+    st.markdown("**PDF to Text:**")
+    with pdfplumber.open(pdf_file) as pdf:
+        pages = pdf.pages
+        # Number of Pages
+        st.markdown("**Number of Pages**")
+        st.write(f"Number of Pages: {len(pages)}")
+        # Extract Metadata
+        st.markdown("**Metadata**")
+        metadata = pdf.metadata
+        st.code(metadata)
+        # Extract Text
+        text = ""
+        for page in pages:
+            text += page.extract_text(layout=True) + "\n\n"
+        st.markdown("**Text**")
+        st.text(text)
+        # Allow text to be downloaded
+        btn = st.download_button(
+            label="Download PDF Text",
+            data=text,
+            file_name=f"{pdf_file.name.replace('.pdf', '')}_text.txt",
+            mime="text/plain"
+        )