asony999 committed on
Commit
93ae8be
·
verified ·
1 Parent(s): a850594

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +10 -7
app.py CHANGED
@@ -7,8 +7,6 @@ from pdf2image import convert_from_path
7
  from google.cloud import documentai_v1 as documentai
8
  from presidio_analyzer import AnalyzerEngine
9
  from presidio_anonymizer import AnonymizerEngine
10
- from google.cloud import documentai_v1 as documentai
11
-
12
 
13
  # ✅ Step 1: Set Google Cloud Credentials
14
  os.environ["GOOGLE_APPLICATION_CREDENTIALS"] = "document-ai-anonymizer.json"
@@ -22,15 +20,20 @@ anonymizer = AnonymizerEngine()
22
 
23
  def extract_text_from_pdf(pdf_path):
24
  """
25
- Extracts text from PDF files using Google Document AI.
26
  """
27
  with open(pdf_path, "rb") as f:
28
  pdf_bytes = f.read()
29
 
30
- # Configure Document AI request
31
- document = {"content": pdf_bytes, "mime_type": "application/pdf"}
32
- request = {"name": f"projects/presidio-450223/locations/us/processors/5cbc64853974c755", "raw_document": document}
 
 
 
 
33
  result = client.process_document(request=request)
 
34
  return result.document.text if result.document.text else "No text detected."
35
 
36
  def extract_text_from_image(image_path):
@@ -41,7 +44,7 @@ def extract_text_from_image(image_path):
41
 
42
  def redact_document(uploaded_file):
43
  """
44
- 1. Extracts text from PDFs or images.
45
  2. Uses Presidio to redact sensitive PII.
46
  """
47
  file_ext = uploaded_file.name.split(".")[-1].lower()
 
7
  from google.cloud import documentai_v1 as documentai
8
  from presidio_analyzer import AnalyzerEngine
9
  from presidio_anonymizer import AnonymizerEngine
 
 
10
 
11
  # ✅ Step 1: Set Google Cloud Credentials
12
  os.environ["GOOGLE_APPLICATION_CREDENTIALS"] = "document-ai-anonymizer.json"
 
20
 
21
  def extract_text_from_pdf(pdf_path):
22
  """
23
+ Extracts text from PDF files using Google Cloud Document AI.
24
  """
25
  with open(pdf_path, "rb") as f:
26
  pdf_bytes = f.read()
27
 
28
+ # Set up the request for Document AI
29
+ document = documentai.RawDocument(content=pdf_bytes, mime_type="application/pdf")
30
+ name = f"projects/presidio-450223/locations/us/processors/5cbc64853974c755"
31
+
32
+ request = documentai.ProcessRequest(name=name, raw_document=document)
33
+
34
+ # Call the Document AI API
35
  result = client.process_document(request=request)
36
+
37
  return result.document.text if result.document.text else "No text detected."
38
 
39
  def extract_text_from_image(image_path):
 
44
 
45
  def redact_document(uploaded_file):
46
  """
47
+ 1. Extracts text from PDFs, Word, or Image files.
48
  2. Uses Presidio to redact sensitive PII.
49
  """
50
  file_ext = uploaded_file.name.split(".")[-1].lower()