Spaces:

asony999
/

documents

Running

App Files Files Community

documents / app.py

asony999

Update app.py

ed0df26 verified 21 days ago

raw

history blame contribute delete

2.74 kB

	import gradio as gr
	import os
	import fitz # PyMuPDF for handling PDFs
	import pytesseract
	from pdf2image import convert_from_path
	from google.cloud import documentai_v1 as documentai
	from presidio_analyzer import AnalyzerEngine
	from presidio_anonymizer import AnonymizerEngine

	# Step 1: tell the Google Cloud client libraries which credentials file to use.
	# The path is relative, so it is resolved against the process's working directory.
	# It is set before the client below is created so the client can see it.
	os.environ["GOOGLE_APPLICATION_CREDENTIALS"] = "document-ai-anonymizer.json"

	# Step 2: create one shared Document AI client at import time;
	# extract_text_from_pdf() reuses it for every request.
	client = documentai.DocumentProcessorServiceClient()

	# Step 3: create the Presidio engines used by redact_document():
	# the analyzer locates PII in text, the anonymizer rewrites the text using those results.
	analyzer = AnalyzerEngine()
	anonymizer = AnonymizerEngine()

	def extract_text_from_pdf(pdf_path):
	    """Extract text from a PDF using Google Cloud Document AI.

	    The whole file is read into memory and sent to the configured
	    Document AI processor in a single ``process_document`` call made
	    through the module-level ``client``.

	    Args:
	        pdf_path: Path to the PDF file on disk.

	    Returns:
	        The text reported by Document AI, or the placeholder string
	        ``"No text detected."`` when the processor returns no text.
	    """
	    with open(pdf_path, "rb") as pdf_file:
	        pdf_bytes = pdf_file.read()

	    # Resource name of the Document AI processor to call. It is a plain
	    # literal because there is nothing to interpolate.
	    name = "projects/presidio-450223/locations/us/processors/5cbc64853974c755"

	    # Wrap the raw bytes and build the request for Document AI.
	    document = documentai.RawDocument(content=pdf_bytes, mime_type="application/pdf")
	    request = documentai.ProcessRequest(name=name, raw_document=document)

	    # Call the Document AI API.
	    result = client.process_document(request=request)

	    # Fall back to the placeholder so callers always receive a non-empty string.
	    return result.document.text or "No text detected."

	def extract_text_from_image(image_path):
	    """Run Tesseract OCR on the given image and return the recognized text."""
	    ocr_text = pytesseract.image_to_string(image_path)
	    return ocr_text

	def redact_document(uploaded_file):
	    """Extract text from an uploaded document and redact PII with Presidio.

	    Dispatch is by file extension: ``pdf`` goes to Document AI, ``png`` /
	    ``jpg`` / ``jpeg`` go to Tesseract OCR, ``docx`` is unpacked with the
	    standard library, and anything else is read as UTF-8 text.

	    Args:
	        uploaded_file: Path of the uploaded file, or ``None`` / empty when
	            nothing was uploaded.

	    Returns:
	        An ``(extracted_text, anonymized_text)`` tuple of strings. When no
	        file was supplied, both items are ``"No file uploaded."``.

	    Raises:
	        UnicodeDecodeError: If a file handled by the plain-text fallback
	            is not valid UTF-8.
	    """
	    # Submitting the form without a file passes None; answer with a
	    # message instead of failing on None.split().
	    if not uploaded_file:
	        message = "No file uploaded."
	        return message, message

	    # splitext() only inspects the last path component, so a dot in a
	    # directory name is never mistaken for the extension.
	    file_ext = os.path.splitext(uploaded_file)[1].lstrip(".").lower()

	    if file_ext == "pdf":
	        extracted_text = extract_text_from_pdf(uploaded_file)
	    elif file_ext in ("png", "jpg", "jpeg"):
	        extracted_text = extract_text_from_image(uploaded_file)
	    elif file_ext == "docx":
	        # .docx is a zip archive; reading it as UTF-8 text would fail.
	        extracted_text = _extract_text_from_docx(uploaded_file)
	    else:
	        with open(uploaded_file, "r", encoding="utf-8") as text_file:
	            extracted_text = text_file.read()

	    # Analyze and redact PII. "ID_NUMBER" is not an entity that Presidio's
	    # default recognizers provide, so concrete identifier entities are
	    # requested instead.
	    results = analyzer.analyze(
	        text=extracted_text,
	        entities=[
	            "PERSON",
	            "PHONE_NUMBER",
	            "EMAIL_ADDRESS",
	            "LOCATION",
	            "US_SSN",
	            "US_PASSPORT",
	            "US_DRIVER_LICENSE",
	        ],
	        language="en",
	    )
	    anonymized = anonymizer.anonymize(text=extracted_text, analyzer_results=results)

	    return extracted_text, anonymized.text


	def _extract_text_from_docx(docx_path):
	    """Return the paragraph text of a .docx file, one paragraph per line."""
	    # Function-scope imports keep this helper self-contained (both are stdlib).
	    import zipfile
	    from xml.etree import ElementTree

	    # NOTE(review): the XML comes from an uploaded file; ElementTree relies
	    # on the underlying Expat build for entity-expansion protection.
	    word_ns = "{http://schemas.openxmlformats.org/wordprocessingml/2006/main}"
	    with zipfile.ZipFile(docx_path) as archive:
	        body = ElementTree.fromstring(archive.read("word/document.xml"))

	    # Each <w:p> is a paragraph; its visible text lives in <w:t> elements.
	    paragraphs = (
	        "".join(run.text or "" for run in paragraph.iter(word_ns + "t"))
	        for paragraph in body.iter(word_ns + "p")
	    )
	    return "\n".join(paragraphs)

	# Gradio UI: one file upload in, two text outputs back
	# (the extracted text and the redacted text returned by redact_document).
	iface = gr.Interface(
	    title="Legal & Business Document Redaction",
	    description="Upload a contract, business report, or legal document (PDF, Word, Image) and Presidio will anonymize sensitive data.",
	    fn=redact_document,
	    # "filepath" so the callback receives a path it can open and inspect.
	    inputs=gr.File(type="filepath"),
	    outputs=["text", "text"],
	    # No bundled examples, which avoids errors from missing example files.
	    examples=[],
	)

	# Launch the Gradio app only when this file is run as a script.
	if __name__ == "__main__":
	    iface.launch()