asony999 committed on
Commit
3bd2289
·
verified ·
1 Parent(s): 7e63791

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +8 -10
app.py CHANGED
@@ -1,7 +1,6 @@
1
  import gradio as gr
2
  import os
3
  import io
4
- import json
5
  import fitz # PyMuPDF for handling PDFs
6
  import pytesseract
7
  from pdf2image import convert_from_path
@@ -9,13 +8,13 @@ from google.cloud import documentai_v1 as documentai
9
  from presidio_analyzer import AnalyzerEngine
10
  from presidio_anonymizer import AnonymizerEngine
11
 
12
- # ✅ Set Google Cloud Credentials (File Uploaded in Hugging Face)
13
  os.environ["GOOGLE_APPLICATION_CREDENTIALS"] = "document-ai-anonymizer.json"
14
 
15
- # ✅ Initialize Google Cloud Document AI Client
16
- docai_client = documentai.DocumentUnderstandingServiceClient()
17
 
18
- # ✅ Initialize Presidio Analyzer & Anonymizer
19
  analyzer = AnalyzerEngine()
20
  anonymizer = AnonymizerEngine()
21
 
@@ -28,9 +27,8 @@ def extract_text_from_pdf(pdf_path):
28
 
29
  # Configure Document AI request
30
  document = {"content": pdf_bytes, "mime_type": "application/pdf"}
31
- request = {"name": f"projects/YOUR_PROJECT_ID/locations/us/processors/YOUR_PROCESSOR_ID", "raw_document": document}
32
-
33
- result = docai_client.process_document(request=request)
34
  return result.document.text if result.document.text else "No text detected."
35
 
36
  def extract_text_from_image(image_path):
@@ -41,8 +39,8 @@ def extract_text_from_image(image_path):
41
 
42
  def redact_document(uploaded_file):
43
  """
44
- 1. Extracts text from PDF, Word, or Image files.
45
- 2. Uses Presidio to redact sensitive information.
46
  """
47
  file_ext = uploaded_file.name.split(".")[-1].lower()
48
 
 
1
  import gradio as gr
2
  import os
3
  import io
 
4
  import fitz # PyMuPDF for handling PDFs
5
  import pytesseract
6
  from pdf2image import convert_from_path
 
8
  from presidio_analyzer import AnalyzerEngine
9
  from presidio_anonymizer import AnonymizerEngine
10
 
11
+ # ✅ Step 1: Set Google Cloud Credentials
12
  os.environ["GOOGLE_APPLICATION_CREDENTIALS"] = "document-ai-anonymizer.json"
13
 
14
+ # ✅ Step 2: Initialize Google Cloud Document AI Client
15
+ client = documentai.DocumentUnderstandingServiceClient()
16
 
17
+ # ✅ Step 3: Initialize Presidio Analyzer & Anonymizer
18
  analyzer = AnalyzerEngine()
19
  anonymizer = AnonymizerEngine()
20
 
 
27
 
28
  # Configure Document AI request
29
  document = {"content": pdf_bytes, "mime_type": "application/pdf"}
30
+ request = {"name": f"projects/presidio-450223/locations/us/processors/5cbc64853974c755", "raw_document": document}
31
+ result = client.process_document(request=request)
 
32
  return result.document.text if result.document.text else "No text detected."
33
 
34
  def extract_text_from_image(image_path):
 
39
 
40
  def redact_document(uploaded_file):
41
  """
42
+ 1. Extracts text from PDFs or images.
43
+ 2. Uses Presidio to redact sensitive PII.
44
  """
45
  file_ext = uploaded_file.name.split(".")[-1].lower()
46