File size: 4,505 Bytes
466d846
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
d0a718e
 
 
 
466d846
 
 
 
 
d0a718e
 
466d846
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
import gradio as gr
import os
import io
import json
from pydub import AudioSegment
from presidio_analyzer import AnalyzerEngine, PatternRecognizer, Pattern
from presidio_anonymizer import AnonymizerEngine
from google.cloud import speech

# ✅ Step 1: Point the Google client libraries at the service-account key
# (the JSON key file is uploaded next to this script on Hugging Face).
os.environ.update({"GOOGLE_APPLICATION_CREDENTIALS": "service-key.json"})

# ✅ Step 2: Create the Google Speech-to-Text API client.
# Must run after the credentials variable above has been set.
client = speech.SpeechClient()

# ✅ Step 3: Build the Presidio engines: the analyzer finds PII spans,
# the anonymizer rewrites them.
analyzer, anonymizer = AnalyzerEngine(), AnonymizerEngine()

# ✅ Step 4: Define Custom Recognizers for Financial Data
# Each recognizer pairs one regex Pattern with the entity label that
# redact_pii() asks the analyzer for.

# Credit Card Number Pattern (Visa, MasterCard, Amex, Discover)
# 13-16 digits, each optionally followed by spaces or hyphens.
credit_card_pattern = Pattern(name="credit_card_pattern", regex=r"\b(?:\d[ -]*?){13,16}\b", score=0.90)
credit_card_recognizer = PatternRecognizer(supported_entity="CREDIT_CARD", patterns=[credit_card_pattern])

# Bank Account Number Pattern (Generic 6-12 digit accounts)
account_number_pattern = Pattern(name="account_number_pattern", regex=r"\b\d{6,12}\b", score=0.85)
account_number_recognizer = PatternRecognizer(supported_entity="ACCOUNT_NUMBER", patterns=[account_number_pattern])

# Social Security Number (SSN - US Format, e.g. 123-45-6789)
ssn_pattern = Pattern(name="ssn_pattern", regex=r"\b\d{3}-\d{2}-\d{4}\b", score=0.90)
ssn_recognizer = PatternRecognizer(supported_entity="SSN", patterns=[ssn_pattern])

# Bank Names (Simple regex match for common banks)
bank_pattern = Pattern(name="bank_pattern", regex=r"\b(Barclays|HSBC|Lloyds|NatWest|Santander|Metro Bank|Revolut)\b", score=0.85)
bank_recognizer = PatternRecognizer(supported_entity="BANK", patterns=[bank_pattern])

# Amounts (Currency format like £5000, $100, €2000)
# The match starts either at a currency symbol or at a word boundary.
# A bare leading `\b` can never sit in front of a symbol that follows
# whitespace (both neighbours are non-word characters), so the symbol was
# left outside the match and "$5,000" was redacted as "$<AMOUNT>".
amount_pattern = Pattern(name="amount_pattern", regex=r"(?:[$£€]|\b)\d{1,3}(?:,?\d{3})*(?:\.\d{2})?\b", score=0.85)
amount_recognizer = PatternRecognizer(supported_entity="AMOUNT", patterns=[amount_pattern])

# Add all custom recognizers to Presidio
for _custom_recognizer in (
    credit_card_recognizer,
    account_number_recognizer,
    ssn_recognizer,
    bank_recognizer,
    amount_recognizer,
):
    analyzer.registry.add_recognizer(_custom_recognizer)

def transcribe_audio(audio_file, chunk_ms=55_000):
    """
    Converts uploaded audio to text using Google Cloud Speech-to-Text.

    Args:
        audio_file: Path (or file-like object) that pydub can open. ``None``
            or an empty value means nothing was uploaded.
        chunk_ms: Maximum length, in milliseconds, of the audio sent in a
            single request. Must be positive. Google's synchronous
            ``recognize`` call only accepts roughly one minute of audio, so
            longer recordings are sent as consecutive chunks.

    Returns:
        The transcript of all chunks joined with spaces, or
        "No speech detected." when there is no audio or no transcript.
    """
    # Gradio passes None when the user submits without uploading anything.
    if not audio_file:
        return "No speech detected."

    # Convert audio to mono / 16 kHz for Google API compatibility
    audio = AudioSegment.from_file(audio_file)
    audio = audio.set_channels(1).set_frame_rate(16000)

    config = speech.RecognitionConfig(language_code="en-GB")

    transcripts = []
    # len(audio) is the duration in milliseconds; slicing is also in ms.
    # A recording shorter than chunk_ms yields exactly one request.
    # NOTE: a fixed-size split can cut a word at a chunk boundary.
    for start_ms in range(0, len(audio), chunk_ms):
        # Save this chunk as FLAC in memory
        buffer = io.BytesIO()
        audio[start_ms:start_ms + chunk_ms].export(buffer, format="flac")

        # Send audio for transcription
        response = client.recognize(
            config=config,
            audio=speech.RecognitionAudio(content=buffer.getvalue()),
        )

        # Keep the top alternative of every result that has one.
        transcripts.extend(
            result.alternatives[0].transcript
            for result in response.results
            if result.alternatives
        )

    transcribed_text = " ".join(transcripts)
    return transcribed_text if transcribed_text else "No speech detected."

def redact_pii(audio_file):
    """
    Transcribe a recording and mask the financial PII found in it.

    The audio is turned into text by transcribe_audio(); Presidio's analyzer
    then scans that text for the entity types listed below, and the
    anonymizer (called without custom operators) rewrites every detected span.

    Returns:
        A tuple of (original transcript, anonymized transcript).
    """
    # Built-in Presidio entities plus the custom financial ones.
    target_entities = [
        "PERSON", "PHONE_NUMBER", "DATE_TIME", "LOCATION",
        "CREDIT_CARD", "SSN", "ACCOUNT_NUMBER", "BANK", "AMOUNT",
    ]

    transcript = transcribe_audio(audio_file)

    findings = analyzer.analyze(text=transcript, entities=target_entities, language="en")
    redacted = anonymizer.anonymize(text=transcript, analyzer_results=findings)

    return transcript, redacted.text

# ✅ Step 5: Gradio UI for Audio Upload & Redaction
# Script shown in the page description so a tester knows what to record.
sample_transcript = """Example script for the demo:
'Hello, my name is John Doe. My credit card number is 4111-1111-1111-1111, and I bank with HSBC. 
I’d like to transfer $5,000 to my account ending in 123456.'"""

iface = gr.Interface(
    fn=redact_pii,
    inputs=gr.Audio(type="filepath"),
    # Two text boxes, matching redact_pii's (transcript, anonymized) return.
    outputs=["text", "text"],
    title="Financial Call Redaction Demo",
    description=(
        "Upload a financial call recording, and Presidio will transcribe and anonymize sensitive financial data.\n\n"
        f"🔊 **Sample Script for Testing:**\n{sample_transcript}"
    ),
    examples=["sample_financial_call.wav"],
)

# ✅ Step 6: Launch the Gradio Web App
if __name__ == "__main__":
    iface.launch()