documents / app.py
asony999's picture
Update app.py
ed0df26 verified
import gradio as gr
import os
import fitz # PyMuPDF for handling PDFs
import pytesseract
from pdf2image import convert_from_path
from google.cloud import documentai_v1 as documentai
from presidio_analyzer import AnalyzerEngine
from presidio_anonymizer import AnonymizerEngine
# Step 1: Point the Google client libraries at the service-account key file.
# The path is relative, so it is presumably resolved against the process's
# working directory -- confirm the JSON file is deployed next to this script.
os.environ["GOOGLE_APPLICATION_CREDENTIALS"] = "document-ai-anonymizer.json"
# Step 2: Create the shared Document AI client. This runs after the
# credentials variable above has been set and is reused by
# extract_text_from_pdf.
client = documentai.DocumentProcessorServiceClient()
# Step 3: Create the shared Presidio engines: the analyzer detects PII spans,
# the anonymizer rewrites the text using those spans.
analyzer = AnalyzerEngine()
anonymizer = AnonymizerEngine()
def extract_text_from_pdf(
    pdf_path,
    processor_name="projects/presidio-450223/locations/us/processors/5cbc64853974c755",
):
    """Extract the text of a PDF using Google Cloud Document AI.

    Args:
        pdf_path: Path of the PDF file to read.
        processor_name: Full resource name of the Document AI processor to
            call. Defaults to the processor this app was written against, so
            existing single-argument callers behave as before.

    Returns:
        The text reported by Document AI, or the placeholder string
        "No text detected." when the processed document has no text.
    """
    with open(pdf_path, "rb") as pdf_file:
        pdf_bytes = pdf_file.read()

    # Send the raw PDF bytes inline with the request.
    raw_document = documentai.RawDocument(
        content=pdf_bytes, mime_type="application/pdf"
    )
    request = documentai.ProcessRequest(
        name=processor_name, raw_document=raw_document
    )

    # Uses the module-level Document AI client.
    result = client.process_document(request=request)
    return result.document.text or "No text detected."
def extract_text_from_image(image_path):
    """Run Tesseract OCR on the given image and return the recognised text."""
    ocr_text = pytesseract.image_to_string(image_path)
    return ocr_text
def redact_document(uploaded_file):
    """Extract the text of an uploaded file and redact PII with Presidio.

    The extraction route is chosen from the file extension
    (case-insensitive):

    * ``.pdf`` -- Google Cloud Document AI (``extract_text_from_pdf``).
    * ``.png`` / ``.jpg`` / ``.jpeg`` -- Tesseract OCR
      (``extract_text_from_image``).
    * anything else -- the file is read as UTF-8 text. Binary office formats
      such as ``.docx`` are NOT parsed here and will fail to decode.

    Args:
        uploaded_file: Path of the uploaded file as a string, or a falsy
            value (``None`` / ``""``) when nothing was uploaded.

    Returns:
        A ``(extracted_text, anonymized_text)`` tuple of strings. When no
        file was supplied, ``("No file uploaded.", "")`` is returned instead
        of raising.
    """
    # The handler can be invoked without a file; answer with a message
    # instead of failing with AttributeError on None.
    if not uploaded_file:
        return "No file uploaded.", ""

    # splitext only looks at the final path component, so a dotted directory
    # name or an extension-less file is not mistaken for a file type.
    file_ext = os.path.splitext(uploaded_file)[1].lower()

    if file_ext == ".pdf":
        extracted_text = extract_text_from_pdf(uploaded_file)
    elif file_ext in (".png", ".jpg", ".jpeg"):
        extracted_text = extract_text_from_image(uploaded_file)
    else:
        with open(uploaded_file, "r", encoding="utf-8") as text_file:
            extracted_text = text_file.read()

    # Analyze and redact PII.
    # NOTE(review): "ID_NUMBER" does not appear to be one of Presidio's
    # built-in entity names -- confirm a recognizer is registered for it,
    # otherwise it presumably contributes no matches.
    results = analyzer.analyze(
        text=extracted_text,
        entities=["PERSON", "PHONE_NUMBER", "EMAIL_ADDRESS", "ID_NUMBER", "LOCATION"],
        language="en",
    )
    anonymized = anonymizer.anonymize(text=extracted_text, analyzer_results=results)
    return extracted_text, anonymized.text
# Build the Gradio UI around redact_document. `examples` is left empty so the
# app does not reference example files that may be missing.
iface = gr.Interface(
    fn=redact_document,
    inputs=gr.File(type="filepath"),  # redact_document treats its argument as a path string
    outputs=["text", "text"],  # (extracted text, anonymized text), in the order redact_document returns them
    title="Legal & Business Document Redaction",
    description="Upload a contract, business report, or legal document (PDF, Word, Image) and Presidio will anonymize sensitive data.",
    examples=[]  # intentionally empty: no bundled example files
)
# Launch the Gradio app only when this file is executed as a script, not when
# it is imported as a module.
if __name__ == "__main__":
    iface.launch()