asony999 committed on
Commit
3d1b068
·
verified ·
1 Parent(s): e145845

Create app.py

Browse files
Files changed (1) hide show
  1. app.py +78 -0
app.py ADDED
@@ -0,0 +1,78 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import gradio as gr
2
+ import os
3
+ import io
4
+ import json
5
+ import fitz # PyMuPDF for handling PDFs
6
+ import pytesseract
7
+ from pdf2image import convert_from_path
8
+ from google.cloud import documentai_v1 as documentai
9
+ from presidio_analyzer import AnalyzerEngine
10
+ from presidio_anonymizer import AnonymizerEngine
11
+
12
+ # ✅ Set Google Cloud Credentials (File Uploaded in Hugging Face)
13
# Point the Google client libraries at the service-account key file. The path
# is relative, so it is resolved against the process working directory.
os.environ["GOOGLE_APPLICATION_CREDENTIALS"] = "service-key.json"

# Document AI client. The `documentai_v1` package exposes the processor API
# (process_document with `name` + `raw_document`) through
# DocumentProcessorServiceClient; DocumentUnderstandingServiceClient belongs to
# the older v1beta2 API and is not an attribute of `documentai_v1`.
docai_client = documentai.DocumentProcessorServiceClient()

# Presidio engines: the analyzer locates PII spans in text, the anonymizer
# rewrites the text using those spans.
analyzer = AnalyzerEngine()
anonymizer = AnonymizerEngine()
21
+
22
def extract_text_from_pdf(pdf_path, project_id=None, location=None, processor_id=None):
    """Extract text from a PDF by sending it to a Google Document AI processor.

    Args:
        pdf_path: Path of the PDF file to read.
        project_id: Google Cloud project ID. When omitted, the
            ``DOCAI_PROJECT_ID`` environment variable is used, and failing
            that the ``YOUR_PROJECT_ID`` placeholder.
        location: Processor location. When omitted, ``DOCAI_LOCATION`` is
            used, and failing that ``"us"``.
        processor_id: Document AI processor ID. When omitted,
            ``DOCAI_PROCESSOR_ID`` is used, and failing that the
            ``YOUR_PROCESSOR_ID`` placeholder.

    Returns:
        The text reported for the processed document, or the string
        ``"No text detected."`` when that text is empty.
    """
    # Explicit arguments win, then the environment, then the historical
    # hard-coded values, so existing single-argument callers are unaffected.
    project_id = project_id or os.environ.get("DOCAI_PROJECT_ID", "YOUR_PROJECT_ID")
    location = location or os.environ.get("DOCAI_LOCATION", "us")
    processor_id = processor_id or os.environ.get("DOCAI_PROCESSOR_ID", "YOUR_PROCESSOR_ID")
    processor_name = f"projects/{project_id}/locations/{location}/processors/{processor_id}"

    with open(pdf_path, "rb") as pdf_file:
        pdf_bytes = pdf_file.read()

    # The raw bytes are sent inline together with their MIME type.
    request = {
        "name": processor_name,
        "raw_document": {"content": pdf_bytes, "mime_type": "application/pdf"},
    }

    result = docai_client.process_document(request=request)
    return result.document.text or "No text detected."
35
+
36
def extract_text_from_image(image_path):
    """Run Tesseract OCR on an image and return the recognized text.

    ``image_path`` is handed to ``pytesseract.image_to_string`` unchanged.
    """
    recognized_text = pytesseract.image_to_string(image_path)
    return recognized_text
41
+
42
def _extract_text_from_docx(docx_path):
    """Return the paragraph text of a .docx file, one paragraph per line."""
    # Imported locally so this helper does not depend on the file's import block.
    import zipfile
    import xml.etree.ElementTree as ET

    # NOTE(review): uploads are untrusted and ElementTree is not hardened
    # against every malicious XML construct — consider a hardened parser.
    word_ns = "{http://schemas.openxmlformats.org/wordprocessingml/2006/main}"
    with zipfile.ZipFile(docx_path) as archive:
        root = ET.fromstring(archive.read("word/document.xml"))

    # Each w:p element is a paragraph; its visible text lives in w:t elements.
    return "\n".join(
        "".join(run.text or "" for run in paragraph.iter(word_ns + "t"))
        for paragraph in root.iter(word_ns + "p")
    )


def redact_document(uploaded_file):
    """Extract the text of an uploaded document and redact PII with Presidio.

    The extraction route is chosen from the file extension:
    ``pdf`` goes to ``extract_text_from_pdf``, ``png``/``jpg``/``jpeg`` go to
    ``extract_text_from_image``, ``docx`` is unpacked as a Word archive, and
    anything else is read from the upload handle and treated as UTF-8 text.

    Args:
        uploaded_file: Upload object exposing ``.name`` (a filesystem path)
            and ``.read()``.

    Returns:
        A ``(extracted_text, anonymized_text)`` tuple of strings.

    Raises:
        UnicodeDecodeError: If a file on the plain-text route is not valid
            UTF-8 (for example a legacy binary ``.doc``).
    """
    file_ext = os.path.splitext(uploaded_file.name)[1].lstrip(".").lower()

    if file_ext == "pdf":
        extracted_text = extract_text_from_pdf(uploaded_file.name)
    elif file_ext in ("png", "jpg", "jpeg"):
        extracted_text = extract_text_from_image(uploaded_file.name)
    elif file_ext == "docx":
        # A .docx is a ZIP archive, so decoding its bytes as UTF-8 would fail.
        extracted_text = _extract_text_from_docx(uploaded_file.name)
    else:
        raw = uploaded_file.read()
        # Tolerate a handle opened in text mode, which already yields str.
        extracted_text = raw.decode("utf-8") if isinstance(raw, bytes) else raw

    # Analyze and redact PII.
    # NOTE(review): "ID_NUMBER" does not appear to be one of Presidio's
    # predefined entities — confirm a custom recognizer is registered for it.
    results = analyzer.analyze(
        text=extracted_text,
        entities=["PERSON", "PHONE_NUMBER", "EMAIL_ADDRESS", "ID_NUMBER", "LOCATION"],
        language="en",
    )
    anonymized_text = anonymizer.anonymize(text=extracted_text, analyzer_results=results)

    return extracted_text, anonymized_text.text
65
+
66
+ # ✅ Gradio UI for Document Upload & Redaction
67
# Web front end: one uploaded file goes in, and the two strings returned by
# redact_document come back as two text outputs.
iface = gr.Interface(
    redact_document,
    gr.File(type="file"),
    ["text", "text"],
    examples=["sample_contract.pdf", "business_report.docx"],
    description="Upload a contract, business report, or legal document (PDF, Word, Image) and Presidio will anonymize sensitive data.",
    title="Legal & Business Document Redaction",
)

# Start the app only when this file is executed directly.
if __name__ == "__main__":
    iface.launch()