asony999 committed on
Commit
cbdb7d7
·
verified ·
1 Parent(s): fdc0581

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +13 -20
app.py CHANGED
@@ -1,6 +1,5 @@
1
  import gradio as gr
2
  import os
3
- import io
4
  import fitz # PyMuPDF for handling PDFs
5
  import pytesseract
6
  from pdf2image import convert_from_path
@@ -19,15 +18,13 @@ analyzer = AnalyzerEngine()
19
  anonymizer = AnonymizerEngine()
20
 
21
  def extract_text_from_pdf(pdf_path):
22
- """
23
- Extracts text from PDF files using Google Cloud Document AI.
24
- """
25
  with open(pdf_path, "rb") as f:
26
  pdf_bytes = f.read()
27
 
28
  # Set up the request for Document AI
29
  document = documentai.RawDocument(content=pdf_bytes, mime_type="application/pdf")
30
- name = f"projects/presidio-450223/locations/us/processors/5cbc64853974c755"
31
 
32
  request = documentai.ProcessRequest(name=name, raw_document=document)
33
 
@@ -37,25 +34,21 @@ def extract_text_from_pdf(pdf_path):
37
  return result.document.text if result.document.text else "No text detected."
38
 
39
  def extract_text_from_image(image_path):
40
- """
41
- Extracts text from images using Tesseract OCR.
42
- """
43
  return pytesseract.image_to_string(image_path)
44
 
45
  def redact_document(uploaded_file):
46
- """
47
- 1. Extracts text from PDFs, Word, or Image files.
48
- 2. Uses Presidio to redact sensitive PII.
49
- """
50
- file_ext = uploaded_file.name.split(".")[-1].lower()
51
 
52
  if file_ext == "pdf":
53
- extracted_text = extract_text_from_pdf(uploaded_file.name)
54
  elif file_ext in ["png", "jpg", "jpeg"]:
55
- extracted_text = extract_text_from_image(uploaded_file.name)
56
  else:
57
- extracted_text = uploaded_file.read().decode("utf-8") # Plain text or Word files
58
-
 
59
  # Analyze and redact PII
60
  results = analyzer.analyze(
61
  text=extracted_text,
@@ -66,14 +59,14 @@ def redact_document(uploaded_file):
66
 
67
  return extracted_text, anonymized_text.text
68
 
69
- # ✅ Gradio UI for Document Upload & Redaction
70
  iface = gr.Interface(
71
  fn=redact_document,
72
- inputs=gr.File(type="filepath"),
73
  outputs=["text", "text"],
74
  title="Legal & Business Document Redaction",
75
  description="Upload a contract, business report, or legal document (PDF, Word, Image) and Presidio will anonymize sensitive data.",
76
- examples=["sample_contract.pdf", "business_report.docx"]
77
  )
78
 
79
  # ✅ Launch Gradio App
 
1
  import gradio as gr
2
  import os
 
3
  import fitz # PyMuPDF for handling PDFs
4
  import pytesseract
5
  from pdf2image import convert_from_path
 
18
  anonymizer = AnonymizerEngine()
19
 
20
  def extract_text_from_pdf(pdf_path):
21
+ """ Extracts text from PDFs using Google Cloud Document AI. """
 
 
22
  with open(pdf_path, "rb") as f:
23
  pdf_bytes = f.read()
24
 
25
  # Set up the request for Document AI
26
  document = documentai.RawDocument(content=pdf_bytes, mime_type="application/pdf")
27
+ name = f"projects/YOUR_PROJECT_ID/locations/us/processors/YOUR_PROCESSOR_ID"
28
 
29
  request = documentai.ProcessRequest(name=name, raw_document=document)
30
 
 
34
  return result.document.text if result.document.text else "No text detected."
35
 
36
  def extract_text_from_image(image_path):
37
+ """ Extracts text from images using Tesseract OCR. """
 
 
38
  return pytesseract.image_to_string(image_path)
39
 
40
  def redact_document(uploaded_file):
41
+ """ 1. Extracts text from PDFs, Word, or Image files. 2. Uses Presidio to redact sensitive PII. """
42
+ file_ext = uploaded_file.split(".")[-1].lower()
 
 
 
43
 
44
  if file_ext == "pdf":
45
+ extracted_text = extract_text_from_pdf(uploaded_file)
46
  elif file_ext in ["png", "jpg", "jpeg"]:
47
+ extracted_text = extract_text_from_image(uploaded_file)
48
  else:
49
+ with open(uploaded_file, "r", encoding="utf-8") as f:
50
+ extracted_text = f.read()
51
+
52
  # Analyze and redact PII
53
  results = analyzer.analyze(
54
  text=extracted_text,
 
59
 
60
  return extracted_text, anonymized_text.text
61
 
62
+ # ✅ Fix: Remove `examples` to avoid missing file errors
63
  iface = gr.Interface(
64
  fn=redact_document,
65
+ inputs=gr.File(type="filepath"), # ✅ FIXED INPUT TYPE
66
  outputs=["text", "text"],
67
  title="Legal & Business Document Redaction",
68
  description="Upload a contract, business report, or legal document (PDF, Word, Image) and Presidio will anonymize sensitive data.",
69
+ examples=[] # ✅ FIXED EXAMPLES
70
  )
71
 
72
  # ✅ Launch Gradio App