|
import gradio as gr |
|
import os |
|
import fitz |
|
import pytesseract |
|
from pdf2image import convert_from_path |
|
from google.cloud import documentai_v1 as documentai |
|
from presidio_analyzer import AnalyzerEngine |
|
from presidio_anonymizer import AnonymizerEngine |
|
|
|
|
|
# Tell the Google client libraries which service-account key file to use.
# This is set before the Document AI client is created below, so the
# statement order here is deliberate.
os.environ["GOOGLE_APPLICATION_CREDENTIALS"] = "document-ai-anonymizer.json"

# Module-level Document AI client, reused by extract_text_from_pdf().
client = documentai.DocumentProcessorServiceClient()

# Presidio engines: the analyzer finds PII spans, the anonymizer rewrites them.
analyzer = AnalyzerEngine()
anonymizer = AnonymizerEngine()
|
|
|
def extract_text_from_pdf(pdf_path):
    """Send a PDF to Google Cloud Document AI and return the text it reports.

    Args:
        pdf_path: Path of the PDF file to read from disk.

    Returns:
        The document text from the Document AI response, or the string
        "No text detected." when that text is empty.
    """
    # Fully qualified resource name of the Document AI processor to call.
    processor_name = "projects/presidio-450223/locations/us/processors/5cbc64853974c755"

    # The file content is sent inline as raw bytes.
    with open(pdf_path, "rb") as pdf_file:
        raw_document = documentai.RawDocument(
            content=pdf_file.read(),
            mime_type="application/pdf",
        )

    process_request = documentai.ProcessRequest(
        name=processor_name,
        raw_document=raw_document,
    )
    response = client.process_document(request=process_request)

    detected_text = response.document.text
    if detected_text:
        return detected_text
    return "No text detected."
|
|
|
def extract_text_from_image(image_path):
    """Run Tesseract OCR on an image file and return the recognised text.

    Args:
        image_path: Path of the image, handed to pytesseract unchanged.

    Returns:
        Whatever string ``pytesseract.image_to_string`` produces for the image.
    """
    ocr_text = pytesseract.image_to_string(image_path)
    return ocr_text
|
|
|
def redact_document(uploaded_file):
    """Extract text from an uploaded file and redact PII from it with Presidio.

    The extraction route is chosen from the file extension:

    * ``pdf`` goes through ``extract_text_from_pdf``.
    * ``png`` / ``jpg`` / ``jpeg`` go through ``extract_text_from_image``.
    * anything else is read from disk as UTF-8 text.

    Args:
        uploaded_file: Filesystem path of the uploaded file. May be ``None``
            or empty when the function is invoked without a file.

    Returns:
        A ``(extracted_text, anonymized_text)`` tuple of strings. When no file
        was supplied, ``("No file uploaded.", "")`` is returned instead of
        raising.

    Raises:
        UnicodeDecodeError: If a file routed to the plain-text branch is not
            valid UTF-8.
    """
    # Without this guard a missing upload fails with AttributeError on None.
    if not uploaded_file:
        return "No file uploaded.", ""

    # splitext only inspects the final path component, so dots in directory
    # names cannot be mistaken for an extension.
    file_ext = os.path.splitext(uploaded_file)[1].lstrip(".").lower()

    if file_ext == "pdf":
        extracted_text = extract_text_from_pdf(uploaded_file)
    elif file_ext in ("png", "jpg", "jpeg"):
        extracted_text = extract_text_from_image(uploaded_file)
    else:
        # NOTE(review): Word files (.doc/.docx) also land here and are read
        # as UTF-8 text; binary documents will presumably fail to decode.
        # Confirm whether dedicated Word handling is needed.
        with open(uploaded_file, "r", encoding="utf-8") as f:
            extracted_text = f.read()

    # NOTE(review): "ID_NUMBER" does not look like one of Presidio's
    # predefined entity names; confirm a recognizer for it is registered.
    results = analyzer.analyze(
        text=extracted_text,
        entities=["PERSON", "PHONE_NUMBER", "EMAIL_ADDRESS", "ID_NUMBER", "LOCATION"],
        language="en",
    )
    anonymized_text = anonymizer.anonymize(
        text=extracted_text,
        analyzer_results=results,
    )

    return extracted_text, anonymized_text.text
|
|
|
|
|
# Gradio front end: one file upload in, two text boxes out (the extracted
# text and the anonymized text returned by redact_document).
iface = gr.Interface(
    title="Legal & Business Document Redaction",
    description="Upload a contract, business report, or legal document (PDF, Word, Image) and Presidio will anonymize sensitive data.",
    fn=redact_document,
    inputs=gr.File(type="filepath"),
    outputs=["text", "text"],
    examples=[],
)


# Start the web UI only when this file is run as a script.
if __name__ == "__main__":
    iface.launch()
|
|