Update app.py
Browse files
app.py
CHANGED
@@ -7,8 +7,6 @@ from pdf2image import convert_from_path
|
|
7 |
from google.cloud import documentai_v1 as documentai
|
8 |
from presidio_analyzer import AnalyzerEngine
|
9 |
from presidio_anonymizer import AnonymizerEngine
|
10 |
-
from google.cloud import documentai_v1 as documentai
|
11 |
-
|
12 |
|
13 |
# ✅ Step 1: Set Google Cloud Credentials
|
14 |
os.environ["GOOGLE_APPLICATION_CREDENTIALS"] = "document-ai-anonymizer.json"
|
@@ -22,15 +20,20 @@ anonymizer = AnonymizerEngine()
|
|
22 |
|
23 |
def extract_text_from_pdf(pdf_path):
|
24 |
"""
|
25 |
-
Extracts text from PDF files using Google Document AI.
|
26 |
"""
|
27 |
with open(pdf_path, "rb") as f:
|
28 |
pdf_bytes = f.read()
|
29 |
|
30 |
-
#
|
31 |
-
document =
|
32 |
-
|
|
|
|
|
|
|
|
|
33 |
result = client.process_document(request=request)
|
|
|
34 |
return result.document.text if result.document.text else "No text detected."
|
35 |
|
36 |
def extract_text_from_image(image_path):
|
@@ -41,7 +44,7 @@ def extract_text_from_image(image_path):
|
|
41 |
|
42 |
def redact_document(uploaded_file):
|
43 |
"""
|
44 |
-
1. Extracts text from PDFs or
|
45 |
2. Uses Presidio to redact sensitive PII.
|
46 |
"""
|
47 |
file_ext = uploaded_file.name.split(".")[-1].lower()
|
|
|
7 |
from google.cloud import documentai_v1 as documentai
|
8 |
from presidio_analyzer import AnalyzerEngine
|
9 |
from presidio_anonymizer import AnonymizerEngine
|
|
|
|
|
10 |
|
11 |
# ✅ Step 1: Set Google Cloud Credentials
|
12 |
os.environ["GOOGLE_APPLICATION_CREDENTIALS"] = "document-ai-anonymizer.json"
|
|
|
20 |
|
21 |
def extract_text_from_pdf(
    pdf_path,
    processor_name="projects/presidio-450223/locations/us/processors/5cbc64853974c755",
):
    """Extract text from a PDF file using Google Cloud Document AI.

    Args:
        pdf_path: Path of the PDF file to read.
        processor_name: Full resource name of the Document AI processor, in
            the form ``projects/<project>/locations/<location>/processors/<id>``.
            Defaults to the processor this function was originally
            hard-wired to, so existing callers are unaffected.

    Returns:
        The text of the processed document, or the string
        ``"No text detected."`` when that text is empty.
    """
    with open(pdf_path, "rb") as f:
        pdf_bytes = f.read()

    # The PDF bytes are sent inline as part of the request.
    raw_document = documentai.RawDocument(
        content=pdf_bytes, mime_type="application/pdf"
    )
    request = documentai.ProcessRequest(
        name=processor_name, raw_document=raw_document
    )

    # `client` is a module-level name defined outside this block.
    result = client.process_document(request=request)

    return result.document.text or "No text detected."
|
38 |
|
39 |
def extract_text_from_image(image_path):
|
|
|
44 |
|
45 |
def redact_document(uploaded_file):
|
46 |
"""
|
47 |
+
1. Extracts text from PDFs, Word, or Image files.
|
48 |
2. Uses Presidio to redact sensitive PII.
|
49 |
"""
|
50 |
file_ext = uploaded_file.name.split(".")[-1].lower()
|