import gradio as gr
import os
import fitz  # PyMuPDF for handling PDFs
import pytesseract
from pdf2image import convert_from_path
from google.cloud import documentai_v1 as documentai
from presidio_analyzer import AnalyzerEngine
from presidio_anonymizer import AnonymizerEngine

# Export the service-account key file through GOOGLE_APPLICATION_CREDENTIALS.
# This happens before the Document AI client below is constructed.
_CREDENTIALS_FILE = "document-ai-anonymizer.json"
os.environ["GOOGLE_APPLICATION_CREDENTIALS"] = _CREDENTIALS_FILE

# Module-level Document AI client used by extract_text_from_pdf.
client = documentai.DocumentProcessorServiceClient()

# Presidio engines: the analyzer finds PII spans, the anonymizer rewrites them.
analyzer, anonymizer = AnalyzerEngine(), AnonymizerEngine()

def extract_text_from_pdf(
    pdf_path,
    processor_name="projects/presidio-450223/locations/us/processors/5cbc64853974c755",
):
    """Extract text from a PDF using Google Cloud Document AI.

    Args:
        pdf_path: Path of the PDF file to read.
        processor_name: Full resource name of the Document AI processor to
            call. Defaults to the processor this function previously had
            hard-coded, so existing callers are unaffected.

    Returns:
        The text reported by Document AI, or the literal message
        ``"No text detected."`` when the response carries no text.
    """
    with open(pdf_path, "rb") as pdf_file:
        pdf_bytes = pdf_file.read()

    # The whole file is sent inline as a raw document.
    raw_document = documentai.RawDocument(
        content=pdf_bytes, mime_type="application/pdf"
    )
    request = documentai.ProcessRequest(
        name=processor_name, raw_document=raw_document
    )

    result = client.process_document(request=request)

    # An empty result is reported in-band instead of returning "".
    return result.document.text or "No text detected."

def extract_text_from_image(image_path):
    """Run Tesseract OCR on an image and return the recognized text.

    Args:
        image_path: Value handed straight to ``pytesseract.image_to_string``;
            the caller in this file passes the uploaded file's path.

    Returns:
        Whatever ``pytesseract.image_to_string`` returns for the image.
    """
    recognized_text = pytesseract.image_to_string(image_path)
    return recognized_text

def redact_document(uploaded_file):
    """Extract text from an uploaded file and redact PII with Presidio.

    The extraction route is chosen from the file extension
    (case-insensitive):

    * ``pdf`` -- handled by ``extract_text_from_pdf``.
    * ``png`` / ``jpg`` / ``jpeg`` -- handled by ``extract_text_from_image``.
    * anything else -- read directly as UTF-8 text. There is no
      Word-specific branch, so binary formats such as ``.docx`` are not
      parsed.

    Args:
        uploaded_file: Path of the uploaded file, or ``None`` / ``""`` when
            nothing was uploaded.

    Returns:
        An ``(extracted_text, anonymized_text)`` tuple of strings. When no
        file is provided, or the file cannot be decoded as UTF-8 text, the
        first item is a short explanatory message and the second is empty.
    """
    # Submitting the form without a file yields no path; report it in-band
    # instead of failing on the string operations below.
    if not uploaded_file:
        return "No file uploaded.", ""

    file_ext = os.path.splitext(uploaded_file)[1].lstrip(".").lower()

    if file_ext == "pdf":
        extracted_text = extract_text_from_pdf(uploaded_file)
    elif file_ext in ("png", "jpg", "jpeg"):
        extracted_text = extract_text_from_image(uploaded_file)
    else:
        try:
            with open(uploaded_file, "r", encoding="utf-8") as text_file:
                extracted_text = text_file.read()
        except UnicodeDecodeError:
            # Binary uploads (e.g. .docx) are not valid UTF-8; return a
            # readable message rather than surfacing a decoding traceback.
            return (
                "Unsupported file type: the file could not be read as UTF-8 text.",
                "",
            )

    # NOTE(review): "ID_NUMBER" does not appear among Presidio's documented
    # predefined entity types (e.g. US_SSN, US_PASSPORT). Unless a custom
    # recognizer for it is registered elsewhere, this entry likely matches
    # nothing -- confirm.
    results = analyzer.analyze(
        text=extracted_text,
        entities=["PERSON", "PHONE_NUMBER", "EMAIL_ADDRESS", "ID_NUMBER", "LOCATION"],
        language="en",
    )
    anonymized = anonymizer.anonymize(text=extracted_text, analyzer_results=results)

    return extracted_text, anonymized.text

# The upload widget is configured with type="filepath"; redact_document
# treats the value it receives as a path string.
_upload_input = gr.File(type="filepath")

# `examples` is left empty so the UI does not reference example files that
# may be missing.
# NOTE(review): the description advertises Word support, but redact_document
# has no Word-specific branch -- confirm .docx uploads behave as intended.
iface = gr.Interface(
    title="Legal & Business Document Redaction",
    description="Upload a contract, business report, or legal document (PDF, Word, Image) and Presidio will anonymize sensitive data.",
    fn=redact_document,
    inputs=_upload_input,
    outputs=["text", "text"],
    examples=[],
)

# Start the Gradio app only when this file is run as a script.
if __name__ == "__main__":
    iface.launch()