# Spaces: Sleeping | File size: 4,374 Bytes | 6cbe6e3
import os
import time

import azure.cognitiveservices.speech as speechsdk
import gradio as gr
# Azure credentials.
# Read from the environment so real keys never need to be committed to the
# repository. The original placeholder strings remain as fallbacks, so the
# module behaves exactly as before when the variables are not set.
SPEECH_KEY = os.environ.get("SPEECH_KEY", "your_speech_key")
SERVICE_REGION = os.environ.get("SPEECH_REGION", "your_service_region")
# Language -> {dialect display name -> speech recognition locale code}.
# Insertion order is preserved because it is the order in which any listing
# built from the keys will present the entries.
language_dialects = {
    "Arabic": {
        "Egypt": "ar-EG",
        "Saudi Arabia": "ar-SA",
        "United Arab Emirates": "ar-AE",
        "Bahrain": "ar-BH",
        "Algeria": "ar-DZ",
        "Iraq": "ar-IQ",
        "Jordan": "ar-JO",
        "Kuwait": "ar-KW",
        "Lebanon": "ar-LB",
        "Libya": "ar-LY",
        "Morocco": "ar-MA",
        "Oman": "ar-OM",
        "Palestinian Authority": "ar-PS",
        "Qatar": "ar-QA",
        "Syria": "ar-SY",
        "Tunisia": "ar-TN",
        "Yemen": "ar-YE",
    },
    "English": {
        "United States": "en-US",
        "United Kingdom": "en-GB",
        "Australia": "en-AU",
        "Canada": "en-CA",
        "India": "en-IN",
        "Ireland": "en-IE",
        "New Zealand": "en-NZ",
        "South Africa": "en-ZA",
        "Singapore": "en-SG",
        "Philippines": "en-PH",
    },
    "French": {
        "France": "fr-FR",
        "Canada": "fr-CA",
        "Switzerland": "fr-CH",
    },
    "Spanish": {
        "Spain": "es-ES",
        "Mexico": "es-MX",
        "Argentina": "es-AR",
        "Colombia": "es-CO",
        "Chile": "es-CL",
        "Peru": "es-PE",
        "Venezuela": "es-VE",
    },
    "German": {
        "Germany": "de-DE",
        "Austria": "de-AT",
        "Switzerland": "de-CH",
    },
    "Portuguese": {
        "Portugal": "pt-PT",
        "Brazil": "pt-BR",
    },
    "Chinese": {
        "Mainland China": "zh-CN",
        "Hong Kong": "zh-HK",
        "Taiwan": "zh-TW",
    },
    # Languages with a single supported dialect.
    "Italian": {"Italy": "it-IT"},
    "Japanese": {"Japan": "ja-JP"},
    "Korean": {"Korea": "ko-KR"},
    # Add more languages and dialects as needed
}
# Function to get dialects based on selected language
def get_dialects(language):
    """Return a Gradio update listing the dialects of ``language``.

    The dialect names are looked up in ``language_dialects``. The first one
    is preselected; when the language is not in the mapping (or has no
    dialects) the choices are emptied and the selected value is ``None``.
    """
    dialect_names = [*language_dialects.get(language, {})]
    preselected = dialect_names[0] if dialect_names else None
    return gr.update(choices=dialect_names, value=preselected)
# Function to transcribe audio
def transcribe_audio(audio_file, duration, language, dialect):
    """Transcribe an audio file with the Azure Speech SDK.

    Args:
        audio_file: Path of the audio file to recognize. ``None`` or an empty
            string (nothing uploaded) returns a user-facing message
            immediately instead of failing inside the SDK.
        duration: Seconds to wait before recognition, simulating a recording
            period. Falsy or non-positive values skip the wait.
        language: Key into ``language_dialects``.
        dialect: Dialect name under ``language``. An unknown
            language/dialect pair falls back to ``"en-US"``.

    Returns:
        The recognized text, or a human-readable message describing why no
        text was produced (no match, cancellation, or unknown reason).
    """
    # Validate first so a missing upload never pays for the simulated wait.
    if not audio_file:
        return "No audio file provided. Please upload an audio file."

    # Simulate recording duration (skipped when no usable duration is given,
    # e.g. the dropdown was cleared and the value arrives as None).
    if duration and duration > 0:
        print(f"Recording for {duration} seconds...")
        time.sleep(duration)

    # Get the locale code
    locale_code = language_dialects.get(language, {}).get(dialect, "en-US")
    print(f"Selected Locale Code: {locale_code}")

    # Set up speech recognition
    speech_config = speechsdk.SpeechConfig(subscription=SPEECH_KEY, region=SERVICE_REGION)
    speech_config.speech_recognition_language = locale_code
    audio_config = speechsdk.audio.AudioConfig(filename=audio_file)
    speech_recognizer = speechsdk.SpeechRecognizer(
        speech_config=speech_config,
        audio_config=audio_config,
    )

    result = speech_recognizer.recognize_once()

    if result.reason == speechsdk.ResultReason.RecognizedSpeech:
        return result.text
    if result.reason == speechsdk.ResultReason.NoMatch:
        return "No speech could be recognized"
    if result.reason == speechsdk.ResultReason.Canceled:
        cancellation_details = result.cancellation_details
        return f"Speech recognition canceled: {cancellation_details.error_details}"
    return "Unknown error occurred during speech recognition"
# Create the Gradio interface
with gr.Blocks() as demo:
    gr.Markdown("## Azure Speech to Text with Language and Dialect Selection")

    # First row: the audio to transcribe and the simulated recording length.
    with gr.Row():
        audio_input = gr.Audio(type="filepath", label="Upload Audio")
        duration_input = gr.Dropdown(choices=[5, 10], label="Recording Duration", value=5)

    # Second row: language, plus the dialect list that depends on it. The
    # dialect dropdown starts empty and is filled by the change handler below.
    with gr.Row():
        language_input = gr.Dropdown(choices=list(language_dialects.keys()), label="Select Language")
        dialect_input = gr.Dropdown(choices=[], label="Select Dialect")

    transcribe_button = gr.Button("Transcribe")
    output_text = gr.Textbox(label="Transcription Result")

    # Update dialect options based on selected language
    language_input.change(fn=get_dialects, inputs=language_input, outputs=dialect_input)

    # Transcribe audio on button click
    transcribe_button.click(
        fn=transcribe_audio,
        inputs=[audio_input, duration_input, language_input, dialect_input],
        outputs=output_text,
    )

# Launch the app only when run as a script, so importing this module (for
# tests or tooling) does not start a web server as a side effect.
if __name__ == "__main__":
    demo.launch()
|