# Spaces status: Sleeping
import os
import time

import azure.cognitiveservices.speech as speechsdk
import gradio as gr
# Azure credentials.
# Read from the environment so a real key never has to be committed to the
# repository; the placeholder defaults keep the previous values when the
# variables are not set.
SPEECH_KEY = os.environ.get("SPEECH_KEY", "your_speech_key")
SERVICE_REGION = os.environ.get("SPEECH_REGION", "your_service_region")
# Define the language and dialect mapping.
# Outer keys are the language names shown in the language dropdown; inner keys
# are the dialect (country/region) names shown in the dialect dropdown, mapped
# to the locale code handed to the speech recognizer. Insertion order is the
# order in which the choices are listed in the UI.
language_dialects = {
    "Arabic": {
        "Egypt": "ar-EG",
        "Saudi Arabia": "ar-SA",
        "United Arab Emirates": "ar-AE",
        "Bahrain": "ar-BH",
        "Algeria": "ar-DZ",
        "Iraq": "ar-IQ",
        "Jordan": "ar-JO",
        "Kuwait": "ar-KW",
        "Lebanon": "ar-LB",
        "Libya": "ar-LY",
        "Morocco": "ar-MA",
        "Oman": "ar-OM",
        "Palestinian Authority": "ar-PS",
        "Qatar": "ar-QA",
        "Syria": "ar-SY",
        "Tunisia": "ar-TN",
        "Yemen": "ar-YE",
    },
    "English": {
        "United States": "en-US",
        "United Kingdom": "en-GB",
        "Australia": "en-AU",
        "Canada": "en-CA",
        "India": "en-IN",
        "Ireland": "en-IE",
        "New Zealand": "en-NZ",
        "South Africa": "en-ZA",
        "Singapore": "en-SG",
        "Philippines": "en-PH",
    },
    "French": {
        "France": "fr-FR",
        "Canada": "fr-CA",
        "Switzerland": "fr-CH",
    },
    "Spanish": {
        "Spain": "es-ES",
        "Mexico": "es-MX",
        "Argentina": "es-AR",
        "Colombia": "es-CO",
        "Chile": "es-CL",
        "Peru": "es-PE",
        "Venezuela": "es-VE",
    },
    "German": {
        "Germany": "de-DE",
        "Austria": "de-AT",
        "Switzerland": "de-CH",
    },
    "Portuguese": {
        "Portugal": "pt-PT",
        "Brazil": "pt-BR",
    },
    "Chinese": {
        "Mainland China": "zh-CN",
        "Hong Kong": "zh-HK",
        "Taiwan": "zh-TW",
    },
    "Italian": {
        "Italy": "it-IT",
    },
    "Japanese": {
        "Japan": "ja-JP",
    },
    "Korean": {
        "Korea": "ko-KR",
    },
    # Add more languages and dialects as needed
}
# Function to get dialects based on selected language
def get_dialects(language):
    """Return a Gradio update that repopulates the dialect dropdown.

    Args:
        language: Key of ``language_dialects`` selected in the language
            dropdown. A value that is not a key yields an empty choice list.

    Returns:
        The result of ``gr.update`` with the dialect names for *language* as
        ``choices`` and the first of them as ``value`` (``None`` when there
        are no dialects).
    """
    # Iterating a dict yields its keys in insertion order.
    dialects = list(language_dialects.get(language, {}))
    return gr.update(choices=dialects, value=dialects[0] if dialects else None)
# Function to transcribe audio
def transcribe_audio(audio_file, duration, language, dialect):
    """Transcribe an audio file with Azure Speech in the selected locale.

    Args:
        audio_file: Path of the audio file to recognize. When it is empty or
            ``None`` a message is returned and no recognition is attempted.
        duration: Number of seconds to wait before recognition starts. The
            wait is skipped when the value is missing or not positive.
        language: Key of ``language_dialects`` (language name).
        dialect: Key of ``language_dialects[language]`` (dialect name). If the
            language/dialect pair is not found, ``"en-US"`` is used.

    Returns:
        The recognized text, or a human-readable message describing why no
        text was produced.
    """
    # Fail fast: without a file the recognizer cannot be configured, and the
    # user should not sit through the wait below just to hit an error.
    if not audio_file:
        return "No audio file provided. Please upload an audio file first."

    # Simulate recording duration
    # NOTE(review): the audio is already a file at this point, so this wait
    # only delays the response - confirm whether it is still wanted.
    if duration and duration > 0:
        print(f"Recording for {duration} seconds...")
        time.sleep(duration)

    # Get the locale code
    locale_code = language_dialects.get(language, {}).get(dialect, "en-US")
    print(f"Selected Locale Code: {locale_code}")

    # Set up speech recognition
    speech_config = speechsdk.SpeechConfig(subscription=SPEECH_KEY, region=SERVICE_REGION)
    speech_config.speech_recognition_language = locale_code
    audio_config = speechsdk.audio.AudioConfig(filename=audio_file)
    speech_recognizer = speechsdk.SpeechRecognizer(
        speech_config=speech_config, audio_config=audio_config
    )

    result = speech_recognizer.recognize_once()

    if result.reason == speechsdk.ResultReason.RecognizedSpeech:
        return result.text
    if result.reason == speechsdk.ResultReason.NoMatch:
        return "No speech could be recognized"
    if result.reason == speechsdk.ResultReason.Canceled:
        cancellation_details = result.cancellation_details
        # error_details can be empty; fall back to the cancellation reason so
        # the message never ends in a bare colon.
        details = cancellation_details.error_details or cancellation_details.reason
        return f"Speech recognition canceled: {details}"
    return "Unknown error occurred during speech recognition"
# Create the Gradio interface
with gr.Blocks() as demo:
    gr.Markdown("## Azure Speech to Text with Language and Dialect Selection")

    # First row: the audio to transcribe and the wait applied before recognition.
    with gr.Row():
        audio_input = gr.Audio(type="filepath", label="Upload Audio")
        duration_input = gr.Dropdown(choices=[5, 10], label="Recording Duration", value=5)

    # Second row: language, and the dialects available for that language.
    with gr.Row():
        language_input = gr.Dropdown(choices=list(language_dialects.keys()), label="Select Language")
        # Starts empty; filled by get_dialects once a language is chosen.
        dialect_input = gr.Dropdown(choices=[], label="Select Dialect")

    transcribe_button = gr.Button("Transcribe")
    output_text = gr.Textbox(label="Transcription Result")

    # Update dialect options based on selected language
    language_input.change(fn=get_dialects, inputs=language_input, outputs=dialect_input)

    # Transcribe audio on button click
    transcribe_button.click(
        fn=transcribe_audio,
        inputs=[audio_input, duration_input, language_input, dialect_input],
        outputs=output_text,
    )

# Launch the app
demo.launch()