Update app.py
Browse files
app.py
CHANGED
@@ -1,6 +1,5 @@
|
|
1 |
import gradio as gr
|
2 |
import os
|
3 |
-
import io
|
4 |
import fitz # PyMuPDF for handling PDFs
|
5 |
import pytesseract
|
6 |
from pdf2image import convert_from_path
|
@@ -19,15 +18,13 @@ analyzer = AnalyzerEngine()
|
|
19 |
anonymizer = AnonymizerEngine()
|
20 |
|
21 |
def extract_text_from_pdf(pdf_path):
|
22 |
-
"""
|
23 |
-
Extracts text from PDF files using Google Cloud Document AI.
|
24 |
-
"""
|
25 |
with open(pdf_path, "rb") as f:
|
26 |
pdf_bytes = f.read()
|
27 |
|
28 |
# Set up the request for Document AI
|
29 |
document = documentai.RawDocument(content=pdf_bytes, mime_type="application/pdf")
|
30 |
-
name = f"projects/
|
31 |
|
32 |
request = documentai.ProcessRequest(name=name, raw_document=document)
|
33 |
|
@@ -37,25 +34,21 @@ def extract_text_from_pdf(pdf_path):
|
|
37 |
return result.document.text if result.document.text else "No text detected."
|
38 |
|
39 |
def extract_text_from_image(image_path):
|
40 |
-
"""
|
41 |
-
Extracts text from images using Tesseract OCR.
|
42 |
-
"""
|
43 |
return pytesseract.image_to_string(image_path)
|
44 |
|
45 |
def redact_document(uploaded_file):
|
46 |
-
"""
|
47 |
-
|
48 |
-
2. Uses Presidio to redact sensitive PII.
|
49 |
-
"""
|
50 |
-
file_ext = uploaded_file.name.split(".")[-1].lower()
|
51 |
|
52 |
if file_ext == "pdf":
|
53 |
-
extracted_text = extract_text_from_pdf(uploaded_file
|
54 |
elif file_ext in ["png", "jpg", "jpeg"]:
|
55 |
-
extracted_text = extract_text_from_image(uploaded_file
|
56 |
else:
|
57 |
-
|
58 |
-
|
|
|
59 |
# Analyze and redact PII
|
60 |
results = analyzer.analyze(
|
61 |
text=extracted_text,
|
@@ -66,14 +59,14 @@ def redact_document(uploaded_file):
|
|
66 |
|
67 |
return extracted_text, anonymized_text.text
|
68 |
|
69 |
-
# ✅
|
70 |
iface = gr.Interface(
|
71 |
fn=redact_document,
|
72 |
-
inputs=gr.File(type="filepath"),
|
73 |
outputs=["text", "text"],
|
74 |
title="Legal & Business Document Redaction",
|
75 |
description="Upload a contract, business report, or legal document (PDF, Word, Image) and Presidio will anonymize sensitive data.",
|
76 |
-
examples=[
|
77 |
)
|
78 |
|
79 |
# ✅ Launch Gradio App
|
|
|
1 |
import gradio as gr
|
2 |
import os
|
|
|
3 |
import fitz # PyMuPDF for handling PDFs
|
4 |
import pytesseract
|
5 |
from pdf2image import convert_from_path
|
|
|
18 |
anonymizer = AnonymizerEngine()
|
19 |
|
20 |
def extract_text_from_pdf(pdf_path):
|
21 |
+
""" Extracts text from PDFs using Google Cloud Document AI. """
|
|
|
|
|
22 |
with open(pdf_path, "rb") as f:
|
23 |
pdf_bytes = f.read()
|
24 |
|
25 |
# Set up the request for Document AI
|
26 |
document = documentai.RawDocument(content=pdf_bytes, mime_type="application/pdf")
|
27 |
+
name = f"projects/YOUR_PROJECT_ID/locations/us/processors/YOUR_PROCESSOR_ID"
|
28 |
|
29 |
request = documentai.ProcessRequest(name=name, raw_document=document)
|
30 |
|
|
|
34 |
return result.document.text if result.document.text else "No text detected."
|
35 |
|
36 |
def extract_text_from_image(image_path):
|
37 |
+
""" Extracts text from images using Tesseract OCR. """
|
|
|
|
|
38 |
return pytesseract.image_to_string(image_path)
|
39 |
|
40 |
def redact_document(uploaded_file):
|
41 |
+
""" 1. Extracts text from PDFs, Word, or Image files. 2. Uses Presidio to redact sensitive PII. """
|
42 |
+
file_ext = uploaded_file.split(".")[-1].lower()
|
|
|
|
|
|
|
43 |
|
44 |
if file_ext == "pdf":
|
45 |
+
extracted_text = extract_text_from_pdf(uploaded_file)
|
46 |
elif file_ext in ["png", "jpg", "jpeg"]:
|
47 |
+
extracted_text = extract_text_from_image(uploaded_file)
|
48 |
else:
|
49 |
+
with open(uploaded_file, "r", encoding="utf-8") as f:
|
50 |
+
extracted_text = f.read()
|
51 |
+
|
52 |
# Analyze and redact PII
|
53 |
results = analyzer.analyze(
|
54 |
text=extracted_text,
|
|
|
59 |
|
60 |
return extracted_text, anonymized_text.text
|
61 |
|
62 |
+
# ✅ Fix: Remove `examples` to avoid missing file errors
|
63 |
iface = gr.Interface(
|
64 |
fn=redact_document,
|
65 |
+
inputs=gr.File(type="filepath"), # ✅ FIXED INPUT TYPE
|
66 |
outputs=["text", "text"],
|
67 |
title="Legal & Business Document Redaction",
|
68 |
description="Upload a contract, business report, or legal document (PDF, Word, Image) and Presidio will anonymize sensitive data.",
|
69 |
+
examples=[] # ✅ FIXED EXAMPLES
|
70 |
)
|
71 |
|
72 |
# ✅ Launch Gradio App
|