asony999 committed on
Commit
466d846
·
verified ·
1 Parent(s): 0eec31c

Create app.py

Browse files
Files changed (1) hide show
  1. app.py +102 -0
app.py ADDED
@@ -0,0 +1,102 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import gradio as gr
2
+ import os
3
+ import io
4
+ import json
5
+ from pydub import AudioSegment
6
+ from presidio_analyzer import AnalyzerEngine, PatternRecognizer, Pattern
7
+ from presidio_anonymizer import AnonymizerEngine
8
+ from google.cloud import speech
9
+
10
# ✅ Step 1: Set Google Cloud Credentials (File Uploaded in Hugging Face)
# The Google client libraries read the key-file path from this variable.
os.environ["GOOGLE_APPLICATION_CREDENTIALS"] = "service-key.json"

# ✅ Step 2: Initialize Google Speech-to-Text API Client
client = speech.SpeechClient()

# ✅ Step 3: Initialize Presidio Analyzer & Anonymizer
analyzer = AnalyzerEngine()
anonymizer = AnonymizerEngine()

# ✅ Step 4: Define Custom Recognizers for Financial Data

# Credit card numbers: 13-16 digits, optionally separated by spaces or dashes.
credit_card_pattern = Pattern(
    name="credit_card_pattern",
    regex=r"\b(?:\d[ -]*?){13,16}\b",
    score=0.90,
)
credit_card_recognizer = PatternRecognizer(
    supported_entity="CREDIT_CARD",
    patterns=[credit_card_pattern],
)

# Bank account numbers: any standalone run of 6-12 digits.
account_number_pattern = Pattern(
    name="account_number_pattern",
    regex=r"\b\d{6,12}\b",
    score=0.85,
)
account_number_recognizer = PatternRecognizer(
    supported_entity="ACCOUNT_NUMBER",
    patterns=[account_number_pattern],
)

# Social Security Numbers in the US ``NNN-NN-NNNN`` layout.
ssn_pattern = Pattern(
    name="ssn_pattern",
    regex=r"\b\d{3}-\d{2}-\d{4}\b",
    score=0.90,
)
ssn_recognizer = PatternRecognizer(
    supported_entity="SSN",
    patterns=[ssn_pattern],
)

# Bank names: exact match against a fixed list of banks.
bank_pattern = Pattern(
    name="bank_pattern",
    regex=r"\b(Barclays|HSBC|Lloyds|NatWest|Santander|Metro Bank|Revolut)\b",
    score=0.85,
)
bank_recognizer = PatternRecognizer(
    supported_entity="BANK",
    patterns=[bank_pattern],
)

# Amounts such as £5000, $100, €2,000.00 or a bare number.
# The match starts either at a currency symbol or at a word boundary. The
# previous form (`\b[$£€]?`) required a word boundary *before* the symbol,
# which never holds after whitespace or at the start of the text, so the
# symbol was left out of the match and survived redaction.
amount_pattern = Pattern(
    name="amount_pattern",
    regex=r"(?:[$£€]|\b)\d{1,3}(?:,?\d{3})*(?:\.\d{2})?\b",
    score=0.85,
)
amount_recognizer = PatternRecognizer(
    supported_entity="AMOUNT",
    patterns=[amount_pattern],
)

# Add all custom recognizers to Presidio
for _recognizer in (
    credit_card_recognizer,
    account_number_recognizer,
    ssn_recognizer,
    bank_recognizer,
    amount_recognizer,
):
    analyzer.registry.add_recognizer(_recognizer)
48
+
49
def transcribe_audio(audio_file):
    """Transcribe an audio file with Google Cloud Speech-to-Text.

    The audio is loaded with pydub, converted to mono at 16 kHz, encoded as
    FLAC in memory and sent to the module-level ``client`` for recognition
    with the ``en-GB`` language code.

    Args:
        audio_file: The audio source handed to ``AudioSegment.from_file``.

    Returns:
        The top transcript of every result joined with single spaces, or
        the string ``"No speech detected."`` when that text is empty.
    """
    # Normalise the recording: single channel, 16 kHz sample rate.
    segment = AudioSegment.from_file(audio_file)
    segment = segment.set_channels(1)
    segment = segment.set_frame_rate(16000)

    # Encode to FLAC without touching the filesystem.
    flac_stream = io.BytesIO()
    segment.export(flac_stream, format="flac")
    flac_bytes = flac_stream.getvalue()

    # NOTE(review): this is the synchronous recognize call; Google documents
    # a duration limit for it - confirm that call recordings fit within it.
    request_audio = speech.RecognitionAudio(content=flac_bytes)
    request_config = speech.RecognitionConfig(language_code="en-GB")
    response = client.recognize(config=request_config, audio=request_audio)

    # Keep only the first (highest-ranked) alternative of each result.
    pieces = []
    for result in response.results:
        pieces.append(result.alternatives[0].transcript)
    transcript = " ".join(pieces)

    if transcript:
        return transcript
    return "No speech detected."
72
+
73
def redact_pii(audio_file):
    """Transcribe an audio recording and redact PII from the transcript.

    Steps:
        1. Transcribe the audio via ``transcribe_audio``.
        2. Run the Presidio analyzer over the transcript for the listed
           entity types and anonymize every finding.

    Args:
        audio_file: Path of the recording to process. ``None`` or an empty
            string is treated as "nothing supplied".

    Returns:
        A ``(transcript, redacted_transcript)`` tuple of strings. When no
        audio was supplied, both elements carry the same notice message.
    """
    # The UI callback may be invoked without an uploaded file; bail out
    # early with a readable message instead of failing while loading audio.
    if not audio_file:
        notice = "No audio provided. Please upload a recording."
        return notice, notice

    transcribed_text = transcribe_audio(audio_file)

    # Analyze and redact PII
    results = analyzer.analyze(
        text=transcribed_text,
        entities=[
            "PERSON",
            "PHONE_NUMBER",
            "DATE_TIME",
            "LOCATION",
            "CREDIT_CARD",
            "SSN",
            "ACCOUNT_NUMBER",
            "BANK",
            "AMOUNT",
        ],
        language="en",
    )
    anonymized = anonymizer.anonymize(
        text=transcribed_text,
        analyzer_results=results,
    )

    return transcribed_text, anonymized.text
89
+
90
# ✅ Step 5: Gradio UI for Audio Upload & Redaction
# The audio component hands the callback a file path; the two text outputs
# receive the raw transcript and the anonymized transcript, in that order.
_audio_input = gr.Audio(type="filepath")
_text_outputs = ["text", "text"]

iface = gr.Interface(
    fn=redact_pii,
    inputs=_audio_input,
    outputs=_text_outputs,
    title="Financial Call Redaction Demo",
    description="Upload a financial call recording, and Presidio will transcribe and anonymize sensitive financial data.",
    examples=["sample_financial_call.wav"],
)

# ✅ Step 6: Launch the Gradio Web App
# Only start the server when run as a script, not when imported.
if __name__ == "__main__":
    iface.launch()