Create app.py
Browse files
app.py
ADDED
@@ -0,0 +1,102 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
import gradio as gr
|
2 |
+
import os
|
3 |
+
import io
|
4 |
+
import json
|
5 |
+
from pydub import AudioSegment
|
6 |
+
from presidio_analyzer import AnalyzerEngine, PatternRecognizer, Pattern
|
7 |
+
from presidio_anonymizer import AnonymizerEngine
|
8 |
+
from google.cloud import speech
|
9 |
+
|
10 |
+
# ✅ Step 1: Point the Google client libraries at the service-account key file
# (the key is uploaded alongside this file in the Hugging Face Space).
os.environ.update({"GOOGLE_APPLICATION_CREDENTIALS": "service-key.json"})

# ✅ Step 2: Create the Google Speech-to-Text client (after the credentials
# variable above has been set).
client = speech.SpeechClient()

# ✅ Step 3: Create the Presidio engines: the analyzer finds PII spans, the
# anonymizer rewrites the text using those spans.
analyzer, anonymizer = AnalyzerEngine(), AnonymizerEngine()
|
19 |
+
|
20 |
+
# ✅ Step 4: Define Custom Recognizers for Financial Data
# Each recognizer wraps a single regex pattern and reports matches under its
# own entity label with a fixed confidence score.

# Credit card numbers: 13-16 digits, each optionally followed by spaces/hyphens.
credit_card_pattern = Pattern(
    name="credit_card_pattern",
    regex=r"\b(?:\d[ -]*?){13,16}\b",
    score=0.90,
)
credit_card_recognizer = PatternRecognizer(
    supported_entity="CREDIT_CARD",
    patterns=[credit_card_pattern],
)

# Bank account numbers: any standalone run of 6-12 digits.
account_number_pattern = Pattern(
    name="account_number_pattern",
    regex=r"\b\d{6,12}\b",
    score=0.85,
)
account_number_recognizer = PatternRecognizer(
    supported_entity="ACCOUNT_NUMBER",
    patterns=[account_number_pattern],
)

# US Social Security Numbers in the hyphenated ddd-dd-dddd form.
ssn_pattern = Pattern(
    name="ssn_pattern",
    regex=r"\b\d{3}-\d{2}-\d{4}\b",
    score=0.90,
)
ssn_recognizer = PatternRecognizer(
    supported_entity="SSN",
    patterns=[ssn_pattern],
)

# Bank names: a fixed list of common banks matched as whole words.
bank_pattern = Pattern(
    name="bank_pattern",
    regex=r"\b(Barclays|HSBC|Lloyds|NatWest|Santander|Metro Bank|Revolut)\b",
    score=0.85,
)
bank_recognizer = PatternRecognizer(
    supported_entity="BANK",
    patterns=[bank_pattern],
)
|
37 |
+
|
38 |
+
# Amounts with an optional leading currency symbol (e.g. £5000, $100, €2,000.00).
# The word boundary is placed between the optional symbol and the first digit.
# A leading \b in front of [$£€] only matches when a word character directly
# precedes the symbol, so in text like "pay £5000" the symbol was never part of
# the match and stayed visible after redaction. Every match of the previous
# pattern is still matched; the only difference is that an adjacent currency
# symbol is now included in the span.
amount_pattern = Pattern(
    name="amount_pattern",
    regex=r"[$£€]?\b\d{1,3}(?:,?\d{3})*(?:\.\d{2})?\b",
    score=0.85,
)
amount_recognizer = PatternRecognizer(
    supported_entity="AMOUNT",
    patterns=[amount_pattern],
)
|
41 |
+
|
42 |
+
# Register every custom recognizer with the Presidio analyzer, in the same
# order in which they are defined.
for _custom_recognizer in (
    credit_card_recognizer,
    account_number_recognizer,
    ssn_recognizer,
    bank_recognizer,
    amount_recognizer,
):
    analyzer.registry.add_recognizer(_custom_recognizer)
|
48 |
+
|
49 |
+
def transcribe_audio(audio_file):
    """
    Convert an uploaded audio file to text using Google Cloud Speech-to-Text.

    Args:
        audio_file: Path (or other input accepted by ``AudioSegment.from_file``)
            of the recording. ``None`` or an empty path is treated as
            "no audio" instead of raising.

    Returns:
        The top transcript of every recognition result joined with single
        spaces, or the string "No speech detected." when there is no input
        or nothing was recognised.
    """
    # Nothing to transcribe (e.g. the form was submitted without a recording):
    # return the same sentinel used for silent audio rather than crashing
    # inside the audio decoder.
    if not audio_file:
        return "No speech detected."

    # Downmix to mono at 16 kHz, then encode as FLAC entirely in memory.
    audio = AudioSegment.from_file(audio_file)
    audio = audio.set_channels(1).set_frame_rate(16000)
    buffer = io.BytesIO()
    audio.export(buffer, format="flac")

    # getvalue() returns the whole buffer regardless of the stream position,
    # so no seek/read pair is needed.
    audio_recognition = speech.RecognitionAudio(content=buffer.getvalue())
    config = speech.RecognitionConfig(language_code="en-GB")

    # NOTE(review): this is the synchronous API; Google documents a limit on
    # the audio length it accepts (about one minute) - confirm that call
    # recordings fit, otherwise long-running recognition is required.
    response = client.recognize(config=config, audio=audio_recognition)

    # Keep the first (top) alternative of each result; skip any result that
    # carries no alternatives instead of raising IndexError.
    transcribed_text = " ".join(
        result.alternatives[0].transcript
        for result in response.results
        if result.alternatives
    )

    return transcribed_text if transcribed_text else "No speech detected."
|
72 |
+
|
73 |
+
def redact_pii(audio_file):
    """
    Transcribe a recording and anonymize financial PII in the transcript.

    The audio is turned into text by ``transcribe_audio``; that text is scanned
    by the Presidio analyzer for the entity types listed below, and the
    findings are handed to the Presidio anonymizer.

    Returns:
        A ``(transcript, anonymized_transcript)`` tuple of strings.
    """
    transcript = transcribe_audio(audio_file)

    # Built-in Presidio entities plus the custom financial ones.
    entity_types = [
        "PERSON",
        "PHONE_NUMBER",
        "DATE_TIME",
        "LOCATION",
        "CREDIT_CARD",
        "SSN",
        "ACCOUNT_NUMBER",
        "BANK",
        "AMOUNT",
    ]
    findings = analyzer.analyze(text=transcript, entities=entity_types, language="en")
    redacted = anonymizer.anonymize(text=transcript, analyzer_results=findings)

    return transcript, redacted.text
|
89 |
+
|
90 |
+
# ✅ Step 5: Gradio UI for Audio Upload & Redaction
# The audio component hands the handler a file path; the two text outputs
# receive the two values returned by redact_pii.
_audio_input = gr.Audio(type="filepath")
_text_outputs = ["text", "text"]

iface = gr.Interface(
    fn=redact_pii,
    inputs=_audio_input,
    outputs=_text_outputs,
    title="Financial Call Redaction Demo",
    description="Upload a financial call recording, and Presidio will transcribe and anonymize sensitive financial data.",
    examples=["sample_financial_call.wav"],
)

# ✅ Step 6: Launch the Gradio Web App (only when run as a script)
if __name__ == "__main__":
    iface.launch()
|