Create app.py
Browse files
app.py
ADDED
@@ -0,0 +1,78 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
import gradio as gr
|
2 |
+
import os
|
3 |
+
import io
|
4 |
+
import json
|
5 |
+
import fitz # PyMuPDF for handling PDFs
|
6 |
+
import pytesseract
|
7 |
+
from pdf2image import convert_from_path
|
8 |
+
from google.cloud import documentai_v1 as documentai
|
9 |
+
from presidio_analyzer import AnalyzerEngine
|
10 |
+
from presidio_anonymizer import AnonymizerEngine
|
11 |
+
|
12 |
+
# Set Google Cloud credentials (key file uploaded to the Hugging Face Space).
# This must happen before the Document AI client below is constructed, since
# the client resolves its credentials at construction time.
os.environ["GOOGLE_APPLICATION_CREDENTIALS"] = "service-key.json"

# Initialize the Google Cloud Document AI client.
# documentai_v1 exposes DocumentProcessorServiceClient; its process_document()
# takes the {"name": ..., "raw_document": ...} request that
# extract_text_from_pdf builds. (DocumentUnderstandingServiceClient only
# existed in the older v1beta2 API surface.)
docai_client = documentai.DocumentProcessorServiceClient()

# Initialize Presidio Analyzer & Anonymizer (shared by every request).
analyzer = AnalyzerEngine()
anonymizer = AnonymizerEngine()
|
21 |
+
|
22 |
+
def extract_text_from_pdf(pdf_path, processor_name=None):
    """Extract text from a PDF file using Google Document AI.

    Args:
        pdf_path: Path of the PDF file to send to Document AI.
        processor_name: Full processor resource name, in the form
            ``projects/<id>/locations/<loc>/processors/<id>``. When omitted,
            the ``DOCAI_PROCESSOR_NAME`` environment variable is used, and
            failing that the original placeholder value.

    Returns:
        The text of the processed document, or ``"No text detected."`` when
        Document AI returns an empty text field.
    """
    if processor_name is None:
        # Let deployments configure the processor without editing the code;
        # the literal fallback keeps the previous behaviour unchanged.
        processor_name = os.environ.get(
            "DOCAI_PROCESSOR_NAME",
            "projects/YOUR_PROJECT_ID/locations/us/processors/YOUR_PROCESSOR_ID",
        )

    with open(pdf_path, "rb") as pdf_file:
        pdf_bytes = pdf_file.read()

    # Configure the Document AI request: raw bytes plus their MIME type.
    request = {
        "name": processor_name,
        "raw_document": {"content": pdf_bytes, "mime_type": "application/pdf"},
    }

    result = docai_client.process_document(request=request)
    return result.document.text or "No text detected."
|
35 |
+
|
36 |
+
def extract_text_from_image(image_path):
    """Run Tesseract OCR on an image and return the recognised text.

    ``image_path`` is handed unchanged to ``pytesseract.image_to_string``.
    """
    ocr_text = pytesseract.image_to_string(image_path)
    return ocr_text
|
41 |
+
|
42 |
+
def _extract_text_from_docx(docx_path):
    """Return the paragraph text of a .docx file, one paragraph per line."""
    # Local imports keep this helper self-contained (stdlib only).
    import zipfile
    import xml.etree.ElementTree as ET

    # A .docx is a ZIP archive; the body lives in word/document.xml, where
    # paragraphs are <w:p> elements and text runs are <w:t> elements.
    # NOTE(review): ElementTree is not hardened against maliciously crafted
    # XML; acceptable here only if uploads are reasonably trusted.
    w_ns = "{http://schemas.openxmlformats.org/wordprocessingml/2006/main}"
    with zipfile.ZipFile(docx_path) as archive:
        root = ET.fromstring(archive.read("word/document.xml"))

    paragraphs = [
        "".join(node.text or "" for node in para.iter(w_ns + "t"))
        for para in root.iter(w_ns + "p")
    ]
    return "\n".join(paragraphs)


def redact_document(uploaded_file):
    """Extract text from an uploaded document and redact PII with Presidio.

    1. Extracts text from PDF (Document AI), image (Tesseract), Word
       (.docx) or plain-text files, chosen by file extension.
    2. Uses Presidio to find and anonymize sensitive information.

    Args:
        uploaded_file: Either a file-like object exposing the file's path as
            ``.name`` or a plain path string. ``None`` means nothing was
            uploaded.

    Returns:
        A ``(extracted_text, anonymized_text)`` tuple of strings.
    """
    if uploaded_file is None:
        # The UI can submit without a file; report it instead of crashing.
        return "No file uploaded.", ""

    # Accept both a path string and an object carrying the path in ``.name``.
    file_path = uploaded_file if isinstance(uploaded_file, str) else uploaded_file.name
    file_ext = os.path.splitext(file_path)[1].lower().lstrip(".")

    if file_ext == "pdf":
        extracted_text = extract_text_from_pdf(file_path)
    elif file_ext in ("png", "jpg", "jpeg"):
        extracted_text = extract_text_from_image(file_path)
    elif file_ext == "docx":
        # .docx is a ZIP container, so decoding its bytes as UTF-8 fails.
        extracted_text = _extract_text_from_docx(file_path)
    else:
        # Treat anything else as plain text. Undecodable bytes are replaced
        # rather than aborting the whole redaction.
        with open(file_path, "rb") as text_file:
            extracted_text = text_file.read().decode("utf-8", errors="replace")

    # Analyze and redact PII.
    # NOTE(review): "ID_NUMBER" does not appear to be one of Presidio's
    # predefined entity names - confirm a recognizer for it is registered.
    results = analyzer.analyze(
        text=extracted_text,
        entities=["PERSON", "PHONE_NUMBER", "EMAIL_ADDRESS", "ID_NUMBER", "LOCATION"],
        language="en",
    )
    anonymized = anonymizer.anonymize(text=extracted_text, analyzer_results=results)

    return extracted_text, anonymized.text
|
65 |
+
|
66 |
+
# Gradio UI for Document Upload & Redaction
|
67 |
+
# Single-function Gradio interface: one uploaded file in, two text panes out
# (the extracted text and its anonymized version).
# NOTE(review): type="file" is only accepted by some Gradio releases - confirm
# against the pinned Gradio version.
iface = gr.Interface(
    fn=redact_document,
    inputs=gr.File(type="file"),
    outputs=["text", "text"],
    title="Legal & Business Document Redaction",
    description=(
        "Upload a contract, business report, or legal document "
        "(PDF, Word, Image) and Presidio will anonymize sensitive data."
    ),
    examples=["sample_contract.pdf", "business_report.docx"],
)
|
75 |
+
|
76 |
+
# Launch Gradio App
|
77 |
+
if __name__ == "__main__":
    # Start the Gradio server only when run as a script, not on import.
    iface.launch()
|