"""Gradio app that extracts text from an uploaded file and redacts PII with Presidio."""

import os
from typing import Optional, Tuple

import fitz  # PyMuPDF for handling PDFs
import gradio as gr
import pytesseract
from google.cloud import documentai_v1 as documentai
from pdf2image import convert_from_path
from presidio_analyzer import AnalyzerEngine
from presidio_anonymizer import AnonymizerEngine

# Step 1: tell the Google client libraries which credentials file to use.
os.environ["GOOGLE_APPLICATION_CREDENTIALS"] = "document-ai-anonymizer.json"

# Step 2: create the Document AI client (must happen after the credentials are set).
client = documentai.DocumentProcessorServiceClient()

# Step 3: create the Presidio analyzer and anonymizer once, at import time.
analyzer = AnalyzerEngine()
anonymizer = AnonymizerEngine()

# Full resource name of the Document AI processor used for PDF extraction.
PROCESSOR_NAME = "projects/presidio-450223/locations/us/processors/5cbc64853974c755"

# File extensions (lower-case, without the dot) that are routed to Tesseract OCR.
IMAGE_EXTENSIONS = ("png", "jpg", "jpeg")

# Entity types passed to the Presidio analyzer.
# NOTE(review): "ID_NUMBER" does not look like one of Presidio's built-in
# entity names -- confirm a custom recognizer provides it, otherwise it
# presumably matches nothing.
REDACTED_ENTITIES = ["PERSON", "PHONE_NUMBER", "EMAIL_ADDRESS", "ID_NUMBER", "LOCATION"]


def extract_text_from_pdf(pdf_path: str) -> str:
    """Extract text from a PDF using Google Cloud Document AI.

    Args:
        pdf_path: Path to the PDF file on disk.

    Returns:
        The text reported by Document AI, or ``"No text detected."`` when the
        processed document has no text.
    """
    with open(pdf_path, "rb") as f:
        pdf_bytes = f.read()

    # Send the raw bytes to the configured processor.
    document = documentai.RawDocument(content=pdf_bytes, mime_type="application/pdf")
    request = documentai.ProcessRequest(name=PROCESSOR_NAME, raw_document=document)
    result = client.process_document(request=request)

    return result.document.text or "No text detected."


def extract_text_from_image(image_path: str) -> str:
    """Extract text from an image file using Tesseract OCR.

    Args:
        image_path: Path to the image file on disk.

    Returns:
        The text recognised by ``pytesseract.image_to_string``.
    """
    return pytesseract.image_to_string(image_path)


def redact_document(uploaded_file: Optional[str]) -> Tuple[str, str]:
    """Extract text from an uploaded file and redact PII with Presidio.

    PDFs go through Document AI, PNG/JPEG images through Tesseract, and any
    other file is read as UTF-8 text.

    Args:
        uploaded_file: Path of the uploaded file, or ``None`` when the form
            was submitted without a file.

    Returns:
        A ``(extracted_text, anonymized_text)`` tuple.

    Raises:
        gr.Error: If no file was uploaded, or the file is neither a PDF nor a
            supported image and cannot be decoded as UTF-8 text.
    """
    if not uploaded_file:
        # Gradio passes None when the user submits without choosing a file.
        raise gr.Error("Please upload a file before submitting.")

    # splitext only inspects the final path component, so dots in directory
    # names are never mistaken for an extension.
    file_ext = os.path.splitext(uploaded_file)[1].lower().lstrip(".")

    if file_ext == "pdf":
        extracted_text = extract_text_from_pdf(uploaded_file)
    elif file_ext in IMAGE_EXTENSIONS:
        extracted_text = extract_text_from_image(uploaded_file)
    else:
        # NOTE: binary formats such as .docx are not parsed here; they fail
        # the UTF-8 decode and are reported as unsupported.
        try:
            with open(uploaded_file, "r", encoding="utf-8") as f:
                extracted_text = f.read()
        except UnicodeDecodeError as exc:
            raise gr.Error(
                "Unsupported file type: the upload is not a PDF, a PNG/JPEG "
                "image, or UTF-8 text."
            ) from exc

    # Analyze and redact PII.
    results = analyzer.analyze(
        text=extracted_text,
        entities=REDACTED_ENTITIES,
        language="en",
    )
    anonymized = anonymizer.anonymize(text=extracted_text, analyzer_results=results)

    return extracted_text, anonymized.text


# The interface ships no example files, so `examples` is an empty list.
iface = gr.Interface(
    fn=redact_document,
    inputs=gr.File(type="filepath"),  # handler receives the upload as a path string
    outputs=["text", "text"],
    title="Legal & Business Document Redaction",
    description="Upload a contract, business report, or legal document (PDF, Word, Image) and Presidio will anonymize sensitive data.",
    examples=[],
)

# Launch the Gradio app only when run as a script.
if __name__ == "__main__":
    iface.launch()