Spaces:

asony999
/

documents

Sleeping

App Files Files Community

asony999 committed on Feb 21

Commit

3d1b068

verified ·

1 Parent(s): e145845

Create app.py

Browse files

Files changed (1) hide show

app.py +78 -0

app.py ADDED Viewed

	@@ -0,0 +1,78 @@

+import gradio as gr
+import os
+import io
+import json
+import fitz  # PyMuPDF for handling PDFs
+import pytesseract
+from pdf2image import convert_from_path
+from google.cloud import documentai_v1 as documentai
+from presidio_analyzer import AnalyzerEngine
+from presidio_anonymizer import AnonymizerEngine
+# ✅ Set Google Cloud Credentials (File Uploaded in Hugging Face)
+os.environ["GOOGLE_APPLICATION_CREDENTIALS"] = "service-key.json"
+# ✅ Initialize Google Cloud Document AI Client
+docai_client = documentai.DocumentUnderstandingServiceClient()
+# ✅ Initialize Presidio Analyzer & Anonymizer
+analyzer = AnalyzerEngine()
+anonymizer = AnonymizerEngine()
+def extract_text_from_pdf(pdf_path):
+    """
+    Extracts text from PDF files using Google Document AI.
+    """
+    with open(pdf_path, "rb") as f:
+        pdf_bytes = f.read()
+    # Configure Document AI request
+    document = {"content": pdf_bytes, "mime_type": "application/pdf"}
+    request = {"name": f"projects/YOUR_PROJECT_ID/locations/us/processors/YOUR_PROCESSOR_ID", "raw_document": document}
+    result = docai_client.process_document(request=request)
+    return result.document.text if result.document.text else "No text detected."
+def extract_text_from_image(image_path):
+    """
+    Extracts text from images using Tesseract OCR.
+    """
+    return pytesseract.image_to_string(image_path)
+def redact_document(uploaded_file):
+    """
+    1. Extracts text from PDF, Word, or Image files.
+    2. Uses Presidio to redact sensitive information.
+    """
+    file_ext = uploaded_file.name.split(".")[-1].lower()
+    if file_ext == "pdf":
+        extracted_text = extract_text_from_pdf(uploaded_file.name)
+    elif file_ext in ["png", "jpg", "jpeg"]:
+        extracted_text = extract_text_from_image(uploaded_file.name)
+    else:
+        extracted_text = uploaded_file.read().decode("utf-8")  # Plain text or Word files
+    # Analyze and redact PII
+    results = analyzer.analyze(
+        text=extracted_text,
+        entities=["PERSON", "PHONE_NUMBER", "EMAIL_ADDRESS", "ID_NUMBER", "LOCATION"],
+        language="en"
+    )
+    anonymized_text = anonymizer.anonymize(text=extracted_text, analyzer_results=results)
+    return extracted_text, anonymized_text.text
+# ✅ Gradio UI for Document Upload & Redaction
+iface = gr.Interface(
+    fn=redact_document,
+    inputs=gr.File(type="file"),
+    outputs=["text", "text"],
+    title="Legal & Business Document Redaction",
+    description="Upload a contract, business report, or legal document (PDF, Word, Image) and Presidio will anonymize sensitive data.",
+    examples=["sample_contract.pdf", "business_report.docx"]
+)
+# ✅ Launch Gradio App
+if __name__ == "__main__":
+    iface.launch()