Spaces:

asony999
/

documents

Running

File size: 2,742 Bytes

3d1b068
 
 
 
 
 
 
 
 
3bd2289
7e63791
3d1b068
3bd2289
3f8f017
3d1b068
3bd2289
3d1b068
 
 
 
cbdb7d7
3d1b068
 
 
93ae8be
 
ed0df26
93ae8be
 
 
 
3bd2289
93ae8be
3d1b068
 
 
cbdb7d7
3d1b068
 
 
cbdb7d7
 
3d1b068
 
cbdb7d7
3d1b068
cbdb7d7
3d1b068
cbdb7d7
 
 
3d1b068
 
 
 
 
 
 
 
 
 
cbdb7d7
3d1b068
 
cbdb7d7
3d1b068
 
 
cbdb7d7
3d1b068

import gradio as gr
import os
import fitz  # PyMuPDF for handling PDFs
import pytesseract
from pdf2image import convert_from_path
from google.cloud import documentai_v1 as documentai
from presidio_analyzer import AnalyzerEngine
from presidio_anonymizer import AnonymizerEngine

# Point Google's default credential lookup at the service-account key file.
# Done before the Document AI client below is created.
os.environ["GOOGLE_APPLICATION_CREDENTIALS"] = "document-ai-anonymizer.json"

# Module-level Document AI client, shared by all PDF extraction calls.
client = documentai.DocumentProcessorServiceClient()

# Presidio engines: `analyzer` finds PII spans, `anonymizer` rewrites them.
analyzer, anonymizer = AnalyzerEngine(), AnonymizerEngine()

# Fully-qualified resource name of the Document AI processor used by default.
_DEFAULT_PROCESSOR_NAME = (
    "projects/presidio-450223/locations/us/processors/5cbc64853974c755"
)


def extract_text_from_pdf(pdf_path, processor_name=_DEFAULT_PROCESSOR_NAME):
    """Extract the text of a PDF using Google Cloud Document AI.

    Args:
        pdf_path: Path of the PDF file to read from disk.
        processor_name: Resource name of the Document AI processor to call,
            in the form ``projects/<project>/locations/<location>/processors/<id>``.
            Defaults to the processor this app was originally hard-wired to.

    Returns:
        The text returned by Document AI, or the placeholder string
        ``"No text detected."`` when the returned text is empty.
    """
    with open(pdf_path, "rb") as pdf_file:
        pdf_bytes = pdf_file.read()

    # The file content is sent inline as a raw document.
    raw_document = documentai.RawDocument(
        content=pdf_bytes, mime_type="application/pdf"
    )
    request = documentai.ProcessRequest(
        name=processor_name, raw_document=raw_document
    )

    # Uses the module-level Document AI client.
    result = client.process_document(request=request)

    # Fall back to a placeholder so callers always receive a non-empty string.
    return result.document.text or "No text detected."

def extract_text_from_image(image_path):
    """Run Tesseract OCR on the image at *image_path* and return its text."""
    recognised_text = pytesseract.image_to_string(image_path)
    return recognised_text

def redact_document(uploaded_file):
    """Extract the text of an uploaded file and redact PII with Presidio.

    The extraction method is chosen from the file extension:

    * ``pdf`` -- Google Cloud Document AI via ``extract_text_from_pdf``.
    * ``png`` / ``jpg`` / ``jpeg`` -- Tesseract OCR via
      ``extract_text_from_image``.
    * anything else -- the file is read as UTF-8 plain text. Binary formats
      such as Word ``.docx`` are not parsed here and will normally fail to
      decode.

    Args:
        uploaded_file: Filesystem path of the uploaded file.

    Returns:
        A ``(extracted_text, anonymized_text)`` tuple of strings.

    Raises:
        UnicodeDecodeError: If a file handled by the plain-text fallback is
            not valid UTF-8.
    """
    # splitext inspects only the final path component, so an extension-less
    # file that happens to be named "pdf" or "png" is not mistaken for one.
    file_ext = os.path.splitext(uploaded_file)[1].lstrip(".").lower()

    if file_ext == "pdf":
        extracted_text = extract_text_from_pdf(uploaded_file)
    elif file_ext in ("png", "jpg", "jpeg"):
        extracted_text = extract_text_from_image(uploaded_file)
    else:
        with open(uploaded_file, encoding="utf-8") as text_file:
            extracted_text = text_file.read()

    # Analyze and redact PII.
    # NOTE(review): the previous "ID_NUMBER" entry is not one of Presidio's
    # predefined entity types, so it selected no recognizer. It is replaced
    # by the predefined US identity-number entities -- confirm this matches
    # the documents the app is expected to handle.
    results = analyzer.analyze(
        text=extracted_text,
        entities=[
            "PERSON",
            "PHONE_NUMBER",
            "EMAIL_ADDRESS",
            "LOCATION",
            "US_SSN",
            "US_ITIN",
            "US_PASSPORT",
            "US_DRIVER_LICENSE",
        ],
        language="en",
    )
    anonymized = anonymizer.anonymize(
        text=extracted_text, analyzer_results=results
    )

    return extracted_text, anonymized.text

# Gradio UI: one file upload in, two text outputs out, in the order returned
# by `redact_document` (extracted text first, anonymized text second).
iface = gr.Interface(
    title="Legal & Business Document Redaction",
    description="Upload a contract, business report, or legal document (PDF, Word, Image) and Presidio will anonymize sensitive data.",
    fn=redact_document,
    # type="filepath" passes the callback a path string for the upload.
    inputs=gr.File(type="filepath"),
    outputs=["text", "text"],
    # Left empty on purpose: no example files are bundled with the app.
    examples=[],
)

# Launch the app only when this file is run as a script.
if __name__ == "__main__":
    iface.launch()