Spaces:

asony999
/

documents

Sleeping

asony999 committed on Feb 21

Commit

3bd2289

verified ·

1 Parent(s): 7e63791

Update app.py

Files changed (1) hide show

app.py CHANGED Viewed

@@ -1,7 +1,6 @@
 import gradio as gr
 import os
 import io
-import json
 import fitz  # PyMuPDF for handling PDFs
 import pytesseract
 from pdf2image import convert_from_path
@@ -9,13 +8,13 @@ from google.cloud import documentai_v1 as documentai
 from presidio_analyzer import AnalyzerEngine
 from presidio_anonymizer import AnonymizerEngine
-# ✅ Set Google Cloud Credentials (File Uploaded in Hugging Face)
 os.environ["GOOGLE_APPLICATION_CREDENTIALS"] = "document-ai-anonymizer.json"
-# ✅ Initialize Google Cloud Document AI Client
-docai_client = documentai.DocumentUnderstandingServiceClient()
-# ✅ Initialize Presidio Analyzer & Anonymizer
 analyzer = AnalyzerEngine()
 anonymizer = AnonymizerEngine()
@@ -28,9 +27,8 @@ def extract_text_from_pdf(pdf_path):
     # Configure Document AI request
     document = {"content": pdf_bytes, "mime_type": "application/pdf"}
-    request = {"name": f"projects/YOUR_PROJECT_ID/locations/us/processors/YOUR_PROCESSOR_ID", "raw_document": document}
-    result = docai_client.process_document(request=request)
     return result.document.text if result.document.text else "No text detected."
 def extract_text_from_image(image_path):
@@ -41,8 +39,8 @@ def extract_text_from_image(image_path):
 def redact_document(uploaded_file):
     """
-    1. Extracts text from PDF, Word, or Image files.
-    2. Uses Presidio to redact sensitive information.
     """
     file_ext = uploaded_file.name.split(".")[-1].lower()

 import gradio as gr
 import os
 import io
 import fitz  # PyMuPDF for handling PDFs
 import pytesseract
 from pdf2image import convert_from_path
 from presidio_analyzer import AnalyzerEngine
 from presidio_anonymizer import AnonymizerEngine
+# ✅ Step 1: Set Google Cloud Credentials
 os.environ["GOOGLE_APPLICATION_CREDENTIALS"] = "document-ai-anonymizer.json"
+# ✅ Step 2: Initialize Google Cloud Document AI Client
+client = documentai.DocumentUnderstandingServiceClient()
+# ✅ Step 3: Initialize Presidio Analyzer & Anonymizer
 analyzer = AnalyzerEngine()
 anonymizer = AnonymizerEngine()
     # Configure Document AI request
     document = {"content": pdf_bytes, "mime_type": "application/pdf"}
+    request = {"name": f"projects/presidio-450223/locations/us/processors/5cbc64853974c755", "raw_document": document}
+    result = client.process_document(request=request)
     return result.document.text if result.document.text else "No text detected."
 def extract_text_from_image(image_path):
 def redact_document(uploaded_file):
     """
+    1. Extracts text from PDFs or images.
+    2. Uses Presidio to redact sensitive PII.
     """
     file_ext = uploaded_file.name.split(".")[-1].lower()