Spaces:

asony999
/

financial

Sleeping

App Files Files Community

asony999 committed on Feb 21

Commit

466d846

verified ·

1 Parent(s): 0eec31c

Create app.py

Browse files

Files changed (1) hide show

app.py +102 -0

app.py ADDED Viewed

	@@ -0,0 +1,102 @@

+import gradio as gr
+import os
+import io
+import json
+from pydub import AudioSegment
+from presidio_analyzer import AnalyzerEngine, PatternRecognizer, Pattern
+from presidio_anonymizer import AnonymizerEngine
+from google.cloud import speech
+# ✅ Step 1: Set Google Cloud Credentials (File Uploaded in Hugging Face)
+os.environ["GOOGLE_APPLICATION_CREDENTIALS"] = "service-key.json"
+# ✅ Step 2: Initialize Google Speech-to-Text API Client
+client = speech.SpeechClient()
+# ✅ Step 3: Initialize Presidio Analyzer & Anonymizer
+analyzer = AnalyzerEngine()
+anonymizer = AnonymizerEngine()
+# ✅ Step 4: Define Custom Recognizers for Financial Data
+# Credit Card Number Pattern (Visa, MasterCard, Amex, Discover)
+credit_card_pattern = Pattern(name="credit_card_pattern", regex=r"\b(?:\d[ -]*?){13,16}\b", score=0.90)
+credit_card_recognizer = PatternRecognizer(supported_entity="CREDIT_CARD", patterns=[credit_card_pattern])
+# Bank Account Number Pattern (Generic 6-12 digit accounts)
+account_number_pattern = Pattern(name="account_number_pattern", regex=r"\b\d{6,12}\b", score=0.85)
+account_number_recognizer = PatternRecognizer(supported_entity="ACCOUNT_NUMBER", patterns=[account_number_pattern])
+# Social Security Number (SSN - US Format)
+ssn_pattern = Pattern(name="ssn_pattern", regex=r"\b\d{3}-\d{2}-\d{4}\b", score=0.90)
+ssn_recognizer = PatternRecognizer(supported_entity="SSN", patterns=[ssn_pattern])
+# Bank Names (Simple regex match for common banks)
+bank_pattern = Pattern(name="bank_pattern", regex=r"\b(Barclays|HSBC|Lloyds|NatWest|Santander|Metro Bank|Revolut)\b", score=0.85)
+bank_recognizer = PatternRecognizer(supported_entity="BANK", patterns=[bank_pattern])
+# Amounts (Currency format like £5000, $100, €2000)
+amount_pattern = Pattern(name="amount_pattern", regex=r"\b[$£€]?\d{1,3}(?:,?\d{3})*(?:\.\d{2})?\b", score=0.85)
+amount_recognizer = PatternRecognizer(supported_entity="AMOUNT", patterns=[amount_pattern])
+# Add all custom recognizers to Presidio
+analyzer.registry.add_recognizer(credit_card_recognizer)
+analyzer.registry.add_recognizer(account_number_recognizer)
+analyzer.registry.add_recognizer(ssn_recognizer)
+analyzer.registry.add_recognizer(bank_recognizer)
+analyzer.registry.add_recognizer(amount_recognizer)
+def transcribe_audio(audio_file):
+    """
+    Converts uploaded audio to text using Google Cloud Speech-to-Text.
+    """
+    # Convert audio to FLAC format for Google API compatibility
+    audio = AudioSegment.from_file(audio_file)
+    audio = audio.set_channels(1).set_frame_rate(16000)  # Ensure proper format
+    # Save as FLAC in memory
+    buffer = io.BytesIO()
+    audio.export(buffer, format="flac")
+    buffer.seek(0)
+    # Send audio for transcription
+    audio_recognition = speech.RecognitionAudio(content=buffer.read())
+    config = speech.RecognitionConfig(language_code="en-GB")
+    response = client.recognize(config=config, audio=audio_recognition)
+    # Get transcribed text
+    transcribed_text = " ".join([result.alternatives[0].transcript for result in response.results])
+    return transcribed_text if transcribed_text else "No speech detected."
+def redact_pii(audio_file):
+    """
+    1. Transcribes the audio using Google Speech-to-Text.
+    2. Uses Presidio to redact financial PII from the transcript.
+    """
+    transcribed_text = transcribe_audio(audio_file)
+    # Analyze and redact PII
+    results = analyzer.analyze(
+        text=transcribed_text,
+        entities=["PERSON", "PHONE_NUMBER", "DATE_TIME", "LOCATION", "CREDIT_CARD", "SSN", "ACCOUNT_NUMBER", "BANK", "AMOUNT"],
+        language="en"
+    )
+    anonymized_text = anonymizer.anonymize(text=transcribed_text, analyzer_results=results)
+    return transcribed_text, anonymized_text.text
+# ✅ Step 5: Gradio UI for Audio Upload & Redaction
+iface = gr.Interface(
+    fn=redact_pii,
+    inputs=gr.Audio(type="filepath"),
+    outputs=["text", "text"],
+    title="Financial Call Redaction Demo",
+    description="Upload a financial call recording, and Presidio will transcribe and anonymize sensitive financial data.",
+    examples=["sample_financial_call.wav"]
+)
+# ✅ Step 6: Launch the Gradio Web App
+if __name__ == "__main__":
+    iface.launch()