Spaces:

asony999
/

financial

Sleeping

App Files Files Community

financial / app.py

asony999

Update app.py

d0a718e verified 21 days ago

raw

history blame contribute delete

4.51 kB

	import gradio as gr
	import os
	import io
	import json
	from pydub import AudioSegment
	from presidio_analyzer import AnalyzerEngine, PatternRecognizer, Pattern
	from presidio_anonymizer import AnonymizerEngine
	from google.cloud import speech

	# Step 1: Point the Google client libraries at the service-account key file.
	# The path is relative, so the file is expected next to the app
	# (per the original note: uploaded to the Hugging Face Space).
	os.environ["GOOGLE_APPLICATION_CREDENTIALS"] = "service-key.json"

	# Step 2: Create the Google Cloud Speech-to-Text client.
	client = speech.SpeechClient()

	# Step 3: Create the Presidio engines: the analyzer locates PII spans and
	# the anonymizer rewrites the text using those spans.
	analyzer = AnalyzerEngine()
	anonymizer = AnonymizerEngine()

	# Step 4: Custom regex recognizers for financial data.

	# Card numbers: 13-16 digits, each optionally followed by spaces or hyphens.
	credit_card_pattern = Pattern(name="credit_card_pattern", regex=r"\b(?:\d[ -]*?){13,16}\b", score=0.90)
	credit_card_recognizer = PatternRecognizer(supported_entity="CREDIT_CARD", patterns=[credit_card_pattern])

	# Bank account numbers: any standalone run of 6-12 digits.
	account_number_pattern = Pattern(name="account_number_pattern", regex=r"\b\d{6,12}\b", score=0.85)
	account_number_recognizer = PatternRecognizer(supported_entity="ACCOUNT_NUMBER", patterns=[account_number_pattern])

	# US Social Security Numbers in the hyphenated NNN-NN-NNNN form.
	ssn_pattern = Pattern(name="ssn_pattern", regex=r"\b\d{3}-\d{2}-\d{4}\b", score=0.90)
	ssn_recognizer = PatternRecognizer(supported_entity="SSN", patterns=[ssn_pattern])

	# Bank names. The alternatives are separated by an unescaped "|": an escaped
	# "\|" matches a literal pipe character, which would stop every individual
	# bank name from matching.
	bank_pattern = Pattern(name="bank_pattern", regex=r"\b(Barclays|HSBC|Lloyds|NatWest|Santander|Metro Bank|Revolut)\b", score=0.85)
	bank_recognizer = PatternRecognizer(supported_entity="BANK", patterns=[bank_pattern])

	# Amounts such as £5000, $100 or €2,000.00. The pattern opens with (?<!\w)
	# rather than \b: a \b placed before a currency symbol only matches when the
	# symbol directly follows a word character, so in ordinary text (" $5,000")
	# the symbol would be left outside the match and survive redaction.
	amount_pattern = Pattern(name="amount_pattern", regex=r"(?<!\w)[$£€]?\d{1,3}(?:,?\d{3})*(?:\.\d{2})?\b", score=0.85)
	amount_recognizer = PatternRecognizer(supported_entity="AMOUNT", patterns=[amount_pattern])

	# Register every custom recognizer with the analyzer's registry.
	for _recognizer in (
	    credit_card_recognizer,
	    account_number_recognizer,
	    ssn_recognizer,
	    bank_recognizer,
	    amount_recognizer,
	):
	    analyzer.registry.add_recognizer(_recognizer)

	def transcribe_audio(audio_file):
	    """Transcribe an audio file to text with Google Cloud Speech-to-Text.

	    The audio is loaded with pydub, converted to mono at 16 kHz, encoded as
	    FLAC in memory and sent to the client's ``recognize`` call with the
	    ``en-GB`` language code.

	    Args:
	        audio_file: Audio source handed to ``AudioSegment.from_file``
	            (the UI supplies a file path). ``None`` or an empty value is
	            treated as "nothing to transcribe".

	    Returns:
	        The first-alternative transcripts of all results joined by single
	        spaces, or ``"No speech detected."`` when no input was given or no
	        transcript came back.
	    """
	    # The UI can invoke this without an uploaded file; bail out early instead
	    # of letting the audio loader fail on a missing path.
	    if not audio_file:
	        return "No speech detected."

	    # Normalise to mono / 16 kHz before encoding.
	    segment = AudioSegment.from_file(audio_file).set_channels(1).set_frame_rate(16000)

	    # Encode as FLAC entirely in memory; getvalue() returns the whole buffer
	    # regardless of the current stream position.
	    flac_buffer = io.BytesIO()
	    segment.export(flac_buffer, format="flac")

	    recognition_audio = speech.RecognitionAudio(content=flac_buffer.getvalue())
	    recognition_config = speech.RecognitionConfig(language_code="en-GB")

	    # NOTE(review): this is the synchronous recognize call, which Google
	    # documents as intended for short audio; long recordings probably need
	    # long_running_recognize -- confirm against expected call lengths.
	    response = client.recognize(config=recognition_config, audio=recognition_audio)

	    # Take the first alternative of each result, skipping any result that
	    # carries no alternatives so indexing cannot raise.
	    transcript = " ".join(
	        result.alternatives[0].transcript
	        for result in response.results
	        if result.alternatives
	    )
	    return transcript or "No speech detected."

	def redact_pii(audio_file):
	    """Transcribe a recording and redact PII from the resulting transcript.

	    Args:
	        audio_file: Audio input, forwarded unchanged to ``transcribe_audio``.

	    Returns:
	        A ``(transcript, redacted_transcript)`` tuple: the raw transcription
	        and the text produced by the Presidio anonymizer for it.
	    """
	    transcript = transcribe_audio(audio_file)

	    # Entity types the analyzer is asked to look for in the transcript.
	    target_entities = [
	        "PERSON",
	        "PHONE_NUMBER",
	        "DATE_TIME",
	        "LOCATION",
	        "CREDIT_CARD",
	        "SSN",
	        "ACCOUNT_NUMBER",
	        "BANK",
	        "AMOUNT",
	    ]
	    findings = analyzer.analyze(text=transcript, entities=target_entities, language="en")

	    # Rewrite the transcript using the spans the analyzer reported.
	    redacted = anonymizer.anonymize(text=transcript, analyzer_results=findings)
	    return transcript, redacted.text

	# Step 5: Build the Gradio interface for audio upload and redaction.

	# Sample script displayed in the UI so users know what to read aloud.
	sample_transcript = """Example script for the demo:
	'Hello, my name is John Doe. My credit card number is 4111-1111-1111-1111, and I bank with HSBC.
	I’d like to transfer $5,000 to my account ending in 123456.'"""

	# Description shown in the interface: usage blurb followed by the sample script.
	_description = (
	    "Upload a financial call recording, and Presidio will transcribe and anonymize sensitive financial data.\n\n"
	    f"🔊 Sample Script for Testing:\n{sample_transcript}"
	)

	# One audio input (delivered to the callback as a file path) and two text
	# outputs, matching the two values returned by redact_pii.
	iface = gr.Interface(
	    fn=redact_pii,
	    inputs=gr.Audio(type="filepath"),
	    outputs=["text", "text"],
	    title="Financial Call Redaction Demo",
	    description=_description,
	    examples=["sample_financial_call.wav"],
	)

	# Step 6: Start the web app only when this file is run as a script.
	if __name__ == "__main__":
	    iface.launch()