Update app.py
Browse files
app.py
CHANGED
@@ -1,7 +1,6 @@
|
|
1 |
import gradio as gr
|
2 |
import os
|
3 |
import io
|
4 |
-
import json
|
5 |
import fitz # PyMuPDF for handling PDFs
|
6 |
import pytesseract
|
7 |
from pdf2image import convert_from_path
|
@@ -9,13 +8,13 @@ from google.cloud import documentai_v1 as documentai
|
|
9 |
from presidio_analyzer import AnalyzerEngine
|
10 |
from presidio_anonymizer import AnonymizerEngine
|
11 |
|
12 |
-
# ✅ Set Google Cloud Credentials
|
13 |
os.environ["GOOGLE_APPLICATION_CREDENTIALS"] = "document-ai-anonymizer.json"
|
14 |
|
15 |
-
# ✅ Initialize Google Cloud Document AI Client
|
16 |
-
|
17 |
|
18 |
-
# ✅ Initialize Presidio Analyzer & Anonymizer
|
19 |
analyzer = AnalyzerEngine()
|
20 |
anonymizer = AnonymizerEngine()
|
21 |
|
@@ -28,9 +27,8 @@ def extract_text_from_pdf(pdf_path):
|
|
28 |
|
29 |
# Configure Document AI request
|
30 |
document = {"content": pdf_bytes, "mime_type": "application/pdf"}
|
31 |
-
request = {"name": f"projects/
|
32 |
-
|
33 |
-
result = docai_client.process_document(request=request)
|
34 |
return result.document.text if result.document.text else "No text detected."
|
35 |
|
36 |
def extract_text_from_image(image_path):
|
@@ -41,8 +39,8 @@ def extract_text_from_image(image_path):
|
|
41 |
|
42 |
def redact_document(uploaded_file):
|
43 |
"""
|
44 |
-
1. Extracts text from
|
45 |
-
2. Uses Presidio to redact sensitive
|
46 |
"""
|
47 |
file_ext = uploaded_file.name.split(".")[-1].lower()
|
48 |
|
|
|
1 |
import gradio as gr
|
2 |
import os
|
3 |
import io
|
|
|
4 |
import fitz # PyMuPDF for handling PDFs
|
5 |
import pytesseract
|
6 |
from pdf2image import convert_from_path
|
|
|
8 |
from presidio_analyzer import AnalyzerEngine
|
9 |
from presidio_anonymizer import AnonymizerEngine
|
10 |
|
11 |
+
# ✅ Step 1: Set Google Cloud Credentials
|
12 |
os.environ["GOOGLE_APPLICATION_CREDENTIALS"] = "document-ai-anonymizer.json"
|
13 |
|
14 |
+
# ✅ Step 2: Initialize Google Cloud Document AI Client
|
15 |
+
# documentai_v1 exposes DocumentProcessorServiceClient; its process_document()
# accepts the {"name": <processor path>, "raw_document": ...} request built below.
client = documentai.DocumentProcessorServiceClient()
|
16 |
|
17 |
+
# ✅ Step 3: Initialize Presidio Analyzer & Anonymizer
|
18 |
analyzer = AnalyzerEngine()
|
19 |
anonymizer = AnonymizerEngine()
|
20 |
|
|
|
27 |
|
28 |
# Configure Document AI request
|
29 |
document = {"content": pdf_bytes, "mime_type": "application/pdf"}
|
30 |
+
request = {"name": f"projects/presidio-450223/locations/us/processors/5cbc64853974c755", "raw_document": document}
|
31 |
+
result = client.process_document(request=request)
|
|
|
32 |
return result.document.text if result.document.text else "No text detected."
|
33 |
|
34 |
def extract_text_from_image(image_path):
|
|
|
39 |
|
40 |
def redact_document(uploaded_file):
|
41 |
"""
|
42 |
+
1. Extracts text from PDFs or images.
|
43 |
+
2. Uses Presidio to redact sensitive PII.
|
44 |
"""
|
45 |
file_ext = uploaded_file.name.split(".")[-1].lower()
|
46 |
|