File size: 4,374 Bytes
6cbe6e3
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
import os
import time

import azure.cognitiveservices.speech as speechsdk
import gradio as gr

# Azure credentials.
# Read from the environment so real keys never have to be committed to source
# control. The original placeholder strings are kept as fallbacks, so nothing
# changes when the variables are not set.
SPEECH_KEY = os.environ.get("SPEECH_KEY", "your_speech_key")
SERVICE_REGION = os.environ.get("SPEECH_REGION", "your_service_region")

# Language -> {dialect/region label -> locale code}.
# The outer keys populate the language dropdown, the inner keys populate the
# dialect dropdown, and the inner values are assigned to the recognizer's
# speech_recognition_language. Insertion order is the display order.
language_dialects = {
    "Arabic": {
        "Egypt": "ar-EG",
        "Saudi Arabia": "ar-SA",
        "United Arab Emirates": "ar-AE",
        "Bahrain": "ar-BH",
        "Algeria": "ar-DZ",
        "Iraq": "ar-IQ",
        "Jordan": "ar-JO",
        "Kuwait": "ar-KW",
        "Lebanon": "ar-LB",
        "Libya": "ar-LY",
        "Morocco": "ar-MA",
        "Oman": "ar-OM",
        "Palestinian Authority": "ar-PS",
        "Qatar": "ar-QA",
        "Syria": "ar-SY",
        "Tunisia": "ar-TN",
        "Yemen": "ar-YE",
    },
    "English": {
        "United States": "en-US",
        "United Kingdom": "en-GB",
        "Australia": "en-AU",
        "Canada": "en-CA",
        "India": "en-IN",
        "Ireland": "en-IE",
        "New Zealand": "en-NZ",
        "South Africa": "en-ZA",
        "Singapore": "en-SG",
        "Philippines": "en-PH",
    },
    "French": {"France": "fr-FR", "Canada": "fr-CA", "Switzerland": "fr-CH"},
    "Spanish": {
        "Spain": "es-ES",
        "Mexico": "es-MX",
        "Argentina": "es-AR",
        "Colombia": "es-CO",
        "Chile": "es-CL",
        "Peru": "es-PE",
        "Venezuela": "es-VE",
    },
    "German": {"Germany": "de-DE", "Austria": "de-AT", "Switzerland": "de-CH"},
    "Portuguese": {"Portugal": "pt-PT", "Brazil": "pt-BR"},
    "Chinese": {"Mainland China": "zh-CN", "Hong Kong": "zh-HK", "Taiwan": "zh-TW"},
    "Italian": {"Italy": "it-IT"},
    "Japanese": {"Japan": "ja-JP"},
    "Korean": {"Korea": "ko-KR"},
    # Add more languages and dialects as needed
}


# Dropdown callback: refresh the dialect choices for the chosen language
def get_dialects(language):
    """Build the dropdown update listing the dialects of ``language``.

    ``language`` is looked up in ``language_dialects``. A language that is not
    in the table yields an empty choice list with no selected value; otherwise
    the first dialect of that language is pre-selected.
    """
    options = [*language_dialects.get(language, {})]
    if options:
        selected = options[0]
    else:
        selected = None
    return gr.update(choices=options, value=selected)

# Function to transcribe audio
def transcribe_audio(audio_file, duration, language, dialect):
    """Transcribe an audio file with Azure Speech in the selected locale.

    Args:
        audio_file: Path of the audio file to recognize. ``None`` or an empty
            string means no file was provided.
        duration: Seconds to wait before recognition, simulating a recording
            period. Falsy values (``None``, ``0``) skip the wait.
        language: Key into ``language_dialects``.
        dialect: Key into the selected language's dialect mapping.

    Returns:
        The recognized text, or a human-readable message explaining why no
        text was produced (missing file, no match, cancellation, other).
    """
    # Fail fast: without a file there is nothing to recognize, so return a
    # clear message before the simulated wait and before touching the SDK.
    if not audio_file:
        return "No audio file provided. Please upload an audio file."

    # Simulate recording duration (skipped when no duration is selected, which
    # would otherwise make time.sleep raise on None).
    if duration:
        print(f"Recording for {duration} seconds...")
        time.sleep(duration)

    # Unknown language/dialect combinations fall back to US English.
    locale_code = language_dialects.get(language, {}).get(dialect, "en-US")
    print(f"Selected Locale Code: {locale_code}")

    # Set up speech recognition
    speech_config = speechsdk.SpeechConfig(subscription=SPEECH_KEY, region=SERVICE_REGION)
    speech_config.speech_recognition_language = locale_code
    audio_config = speechsdk.audio.AudioConfig(filename=audio_file)
    speech_recognizer = speechsdk.SpeechRecognizer(
        speech_config=speech_config, audio_config=audio_config
    )

    # NOTE(review): recognize_once performs a single recognition pass; confirm
    # against the SDK docs whether long files need continuous recognition.
    result = speech_recognizer.recognize_once()

    if result.reason == speechsdk.ResultReason.RecognizedSpeech:
        return result.text
    if result.reason == speechsdk.ResultReason.NoMatch:
        return "No speech could be recognized"
    if result.reason == speechsdk.ResultReason.Canceled:
        cancellation_details = result.cancellation_details
        # error_details can be empty when the cancellation was not caused by
        # an error; fall back to the cancellation reason so the message is
        # never left without an explanation.
        detail = cancellation_details.error_details or cancellation_details.reason
        return f"Speech recognition canceled: {detail}"
    return "Unknown error occurred during speech recognition"

# Create the Gradio interface
with gr.Blocks() as demo:
    gr.Markdown("## Azure Speech to Text with Language and Dialect Selection")

    # Row 1: the audio file to transcribe and the simulated recording duration.
    with gr.Row():
        audio_input = gr.Audio(type="filepath", label="Upload Audio")
        duration_input = gr.Dropdown(choices=[5, 10], label="Recording Duration", value=5)

    # Row 2: language selection; the dialect list starts empty and is filled
    # in by the change handler below once a language is chosen.
    with gr.Row():
        language_input = gr.Dropdown(choices=list(language_dialects.keys()), label="Select Language")
        dialect_input = gr.Dropdown(choices=[], label="Select Dialect")

    transcribe_button = gr.Button("Transcribe")
    output_text = gr.Textbox(label="Transcription Result")

    # Update dialect options based on selected language
    language_input.change(fn=get_dialects, inputs=language_input, outputs=dialect_input)

    # Transcribe audio on button click
    transcribe_button.click(
        fn=transcribe_audio,
        inputs=[audio_input, duration_input, language_input, dialect_input],
        outputs=output_text,
    )

# Launch the app only when run as a script, so importing this module (for
# example to reuse ``demo``) does not start a web server as a side effect.
if __name__ == "__main__":
    demo.launch()