File size: 2,742 Bytes
3d1b068 3bd2289 7e63791 3d1b068 3bd2289 3f8f017 3d1b068 3bd2289 3d1b068 cbdb7d7 3d1b068 93ae8be ed0df26 93ae8be 3bd2289 93ae8be 3d1b068 cbdb7d7 3d1b068 cbdb7d7 3d1b068 cbdb7d7 3d1b068 cbdb7d7 3d1b068 cbdb7d7 3d1b068 cbdb7d7 3d1b068 cbdb7d7 3d1b068 cbdb7d7 3d1b068 |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 |
import gradio as gr
import os
import fitz # PyMuPDF for handling PDFs
import pytesseract
from pdf2image import convert_from_path
from google.cloud import documentai_v1 as documentai
from presidio_analyzer import AnalyzerEngine
from presidio_anonymizer import AnonymizerEngine
# ✅ Step 1: Set Google Cloud Credentials
# The key file path is relative, so it is resolved against the working
# directory of the process. This assignment unconditionally replaces any
# value already present in the environment.
os.environ["GOOGLE_APPLICATION_CREDENTIALS"] = "document-ai-anonymizer.json"

# ✅ Step 2: Initialize Google Cloud Document AI Client
# Created once at import time and shared by every call to extract_text_from_pdf.
client = documentai.DocumentProcessorServiceClient()

# ✅ Step 3: Initialize Presidio Analyzer & Anonymizer
# Both engines are module-level singletons reused by redact_document.
analyzer = AnalyzerEngine()
anonymizer = AnonymizerEngine()
def extract_text_from_pdf(
    pdf_path,
    processor_name="projects/presidio-450223/locations/us/processors/5cbc64853974c755",
):
    """Extract text from a PDF using Google Cloud Document AI.

    The whole file is read into memory and sent to the processor in a single
    synchronous ``process_document`` call made through the module-level
    ``client``.

    Args:
        pdf_path: Path of the PDF file to process.
        processor_name: Full resource name of the Document AI processor
            (``projects/<project>/locations/<location>/processors/<id>``).
            Defaults to the processor this app was originally wired to.

    Returns:
        The text Document AI extracted, or the literal string
        ``"No text detected."`` when the returned document has no text.
    """
    with open(pdf_path, "rb") as pdf_file:
        pdf_bytes = pdf_file.read()

    # Set up the request for Document AI
    raw_document = documentai.RawDocument(
        content=pdf_bytes,
        mime_type="application/pdf",
    )
    request = documentai.ProcessRequest(
        name=processor_name,
        raw_document=raw_document,
    )

    # Call the Document AI API
    result = client.process_document(request=request)

    # Empty text is replaced by a placeholder so the UI never shows a blank box.
    return result.document.text or "No text detected."
def extract_text_from_image(image_path):
    """Run Tesseract OCR on the image at ``image_path`` and return its output."""
    ocr_text = pytesseract.image_to_string(image_path)
    return ocr_text
def redact_document(uploaded_file):
    """Extract the text of an uploaded file and redact PII from it with Presidio.

    The extraction route is chosen from the file extension (case-insensitive):
    ``.pdf`` goes through ``extract_text_from_pdf``, ``.png``/``.jpg``/``.jpeg``
    go through ``extract_text_from_image``, and anything else is read as
    UTF-8 text.

    Args:
        uploaded_file: Path of the uploaded file as a string, or ``None`` /
            an empty string when nothing was uploaded.

    Returns:
        A ``(extracted_text, anonymized_text)`` tuple of strings. When no file
        was supplied the tuple is ``("No file uploaded.", "")``.

    Raises:
        UnicodeDecodeError: If a file sent down the plain-text route is not
            valid UTF-8. Binary formats such as .docx are not parsed here.
        OSError: If the file cannot be opened.
    """
    # Submitting the form without a file passes None; return a message
    # instead of failing on a string method call.
    if not uploaded_file:
        return "No file uploaded.", ""

    # splitext looks only at the final path component, so dots in directory
    # names and extension-less file names cannot be mistaken for an extension.
    file_ext = os.path.splitext(uploaded_file)[1].lower()

    if file_ext == ".pdf":
        extracted_text = extract_text_from_pdf(uploaded_file)
    elif file_ext in (".png", ".jpg", ".jpeg"):
        extracted_text = extract_text_from_image(uploaded_file)
    else:
        with open(uploaded_file, "r", encoding="utf-8") as f:
            extracted_text = f.read()

    # Analyze and redact PII
    # NOTE(review): "ID_NUMBER" does not look like one of Presidio's predefined
    # entity names; confirm a recognizer is registered for it, otherwise it
    # presumably has no effect.
    results = analyzer.analyze(
        text=extracted_text,
        entities=["PERSON", "PHONE_NUMBER", "EMAIL_ADDRESS", "ID_NUMBER", "LOCATION"],
        language="en",
    )
    anonymized_text = anonymizer.anonymize(
        text=extracted_text,
        analyzer_results=results,
    )
    return extracted_text, anonymized_text.text
# ✅ Fix: Remove `examples` to avoid missing file errors
iface = gr.Interface(
    fn=redact_document,
    inputs=gr.File(type="filepath"),  # ✅ FIXED INPUT TYPE: fn receives the upload as a path string
    outputs=["text", "text"],  # (extracted text, anonymized text), matching fn's return order
    title="Legal & Business Document Redaction",
    description="Upload a contract, business report, or legal document (PDF, Word, Image) and Presidio will anonymize sensitive data.",
    examples=[],  # ✅ FIXED EXAMPLES: intentionally empty, no example files are bundled
)
# ✅ Launch Gradio App
# Only start the web server when this file is run as a script, not on import.
if __name__ == "__main__":
    iface.launch()
|