Spaces:

asony999
/

documents

Sleeping

App Files Files Community

asony999 committed on Feb 21

Commit

cbdb7d7

verified ·

1 Parent(s): fdc0581

Update app.py

Browse files

Files changed (1) hide show

app.py +13 -20

app.py CHANGED Viewed

@@ -1,6 +1,5 @@
 import gradio as gr
 import os
-import io
 import fitz  # PyMuPDF for handling PDFs
 import pytesseract
 from pdf2image import convert_from_path
@@ -19,15 +18,13 @@ analyzer = AnalyzerEngine()
 anonymizer = AnonymizerEngine()
 def extract_text_from_pdf(pdf_path):
-    """
-    Extracts text from PDF files using Google Cloud Document AI.
-    """
     with open(pdf_path, "rb") as f:
         pdf_bytes = f.read()
     # Set up the request for Document AI
     document = documentai.RawDocument(content=pdf_bytes, mime_type="application/pdf")
-    name = f"projects/presidio-450223/locations/us/processors/5cbc64853974c755"
     request = documentai.ProcessRequest(name=name, raw_document=document)
@@ -37,25 +34,21 @@ def extract_text_from_pdf(pdf_path):
     return result.document.text if result.document.text else "No text detected."
 def extract_text_from_image(image_path):
-    """
-    Extracts text from images using Tesseract OCR.
-    """
     return pytesseract.image_to_string(image_path)
 def redact_document(uploaded_file):
-    """
-    1. Extracts text from PDFs, Word, or Image files.
-    2. Uses Presidio to redact sensitive PII.
-    """
-    file_ext = uploaded_file.name.split(".")[-1].lower()
     if file_ext == "pdf":
-        extracted_text = extract_text_from_pdf(uploaded_file.name)
     elif file_ext in ["png", "jpg", "jpeg"]:
-        extracted_text = extract_text_from_image(uploaded_file.name)
     else:
-        extracted_text = uploaded_file.read().decode("utf-8")  # Plain text or Word files
     # Analyze and redact PII
     results = analyzer.analyze(
         text=extracted_text,
@@ -66,14 +59,14 @@ def redact_document(uploaded_file):
     return extracted_text, anonymized_text.text
-# ✅ Gradio UI for Document Upload & Redaction
 iface = gr.Interface(
     fn=redact_document,
-    inputs=gr.File(type="filepath"),
     outputs=["text", "text"],
     title="Legal & Business Document Redaction",
     description="Upload a contract, business report, or legal document (PDF, Word, Image) and Presidio will anonymize sensitive data.",
-    examples=["sample_contract.pdf", "business_report.docx"]
 )
 # ✅ Launch Gradio App

 import gradio as gr
 import os
 import fitz  # PyMuPDF for handling PDFs
 import pytesseract
 from pdf2image import convert_from_path
 anonymizer = AnonymizerEngine()
 def extract_text_from_pdf(pdf_path):
+    """ Extracts text from PDFs using Google Cloud Document AI. """
     with open(pdf_path, "rb") as f:
         pdf_bytes = f.read()
     # Set up the request for Document AI
     document = documentai.RawDocument(content=pdf_bytes, mime_type="application/pdf")
+    name = f"projects/YOUR_PROJECT_ID/locations/us/processors/YOUR_PROCESSOR_ID"
     request = documentai.ProcessRequest(name=name, raw_document=document)
     return result.document.text if result.document.text else "No text detected."
 def extract_text_from_image(image_path):
+    """ Extracts text from images using Tesseract OCR. """
     return pytesseract.image_to_string(image_path)
 def redact_document(uploaded_file):
+    """ 1. Extracts text from PDFs, Word, or Image files. 2. Uses Presidio to redact sensitive PII. """
+    file_ext = uploaded_file.split(".")[-1].lower()
     if file_ext == "pdf":
+        extracted_text = extract_text_from_pdf(uploaded_file)
     elif file_ext in ["png", "jpg", "jpeg"]:
+        extracted_text = extract_text_from_image(uploaded_file)
     else:
+        with open(uploaded_file, "r", encoding="utf-8") as f:
+            extracted_text = f.read()
     # Analyze and redact PII
     results = analyzer.analyze(
         text=extracted_text,
     return extracted_text, anonymized_text.text
+# ✅ Fix: Remove `examples` to avoid missing file errors
 iface = gr.Interface(
     fn=redact_document,
+    inputs=gr.File(type="filepath"),  # ✅ FIXED INPUT TYPE
     outputs=["text", "text"],
     title="Legal & Business Document Redaction",
     description="Upload a contract, business report, or legal document (PDF, Word, Image) and Presidio will anonymize sensitive data.",
+    examples=[]  # ✅ FIXED EXAMPLES
 )
 # ✅ Launch Gradio App