Loren committed
Commit 8559f42 · verified · Parent: c4e20d8

Upload app.py

Files changed (1): app.py (+241 −100)
app.py CHANGED
@@ -2,7 +2,7 @@ import gradio as gr
 import torch
 from transformers import AutoProcessor, VoxtralForConditionalGeneration
 from pydub import AudioSegment
- from pydub.silence import split_on_silence, detect_silence
+ from pydub.silence import detect_silence
 import yt_dlp
 import requests
 import validators
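
Note: split_on_silence is dropped above because the new code only needs silence positions, not pre-cut segments. A quick illustration of what detect_silence returns (assuming a hypothetical local sample.wav; the parameter values mirror the new chunks_creation below):

```python
# Illustration only (hypothetical "sample.wav"): detect_silence returns
# [start_ms, end_ms] pairs, one per silent stretch; chunks_creation()
# below inverts these into speech intervals.
from pydub import AudioSegment
from pydub.silence import detect_silence

audio = AudioSegment.from_file("sample.wav")
silences = detect_silence(audio,
                          min_silence_len=300,             # at least 300 ms of quiet
                          silence_thresh=audio.dBFS - 14,  # relative to mean loudness
                          seek_step=100)                   # scan in 100 ms steps
print(silences)  # e.g. [[1200, 1650], [4030, 4500]]
```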
@@ -13,8 +13,103 @@ import re
 import glob
 import spaces

+ ### Initializations
+
+ MAX_TOKENS = 32000
+
+ device = "cuda" if torch.cuda.is_available() else "cpu"
+ print(f"*** Device: {device}")
+ model_name = 'mistralai/Voxtral-Mini-3B-2507'
+
+ processor = AutoProcessor.from_pretrained(model_name)
+ model = VoxtralForConditionalGeneration.from_pretrained(model_name,
+                                                         torch_dtype=torch.bfloat16,
+                                                         device_map=device)
+ # Supported languages
+ dict_languages = {"English": "en",
+                   "French": "fr",
+                   "German": "de",
+                   "Spanish": "es",
+                   "Italian": "it",
+                   "Portuguese": "pt",
+                   "Dutch": "nl",
+                   "Hindi": "hi"}
+
+ # Whitelist of allowed MIME types for audio and video
+ ALLOWED_MIME_TYPES = {
+     # Audio
+     'audio/mpeg', 'audio/wav', 'audio/wave', 'audio/x-wav', 'audio/x-pn-wav',
+     'audio/ogg', 'audio/vorbis', 'audio/aac', 'audio/mp4', 'audio/flac',
+     'audio/x-flac', 'audio/opus', 'audio/webm',
+     # Video
+     'video/mp4', 'video/mpeg', 'video/ogg', 'video/webm', 'video/quicktime',
+     'video/x-msvideo', 'video/x-matroska'
+ }
+
+ # Maximum allowed file size (in bytes), e.g. 1 GB
+ MAX_FILE_SIZE = 1 * 1024 * 1024 * 1024  # 1 GB
+
+ # Directory where downloaded files are saved
+ DOWNLOAD_DIR = "downloaded_files"
+ if not os.path.exists(DOWNLOAD_DIR):
+     os.makedirs(DOWNLOAD_DIR)
+
+ MAX_LEN = 1800000  # 30 min, in ms
+ ONE_SECOND = 1000  # 1 s of margin between chunks, in ms
+
 #### Functions

+ @spaces.GPU
+ def chunks_creation(audio_path):
+     """Split audio longer than MAX_LEN into chunks at detected silences."""
+     list_audio_path = [audio_path]
+     audio = AudioSegment.from_file(audio_path)
+     status = gr.Markdown("👍 Audio duration less than max")
+     # Input too large?
+     if len(audio) > MAX_LEN:
+         list_audio_path = []
+         try:
+             # Detect silences (quieter than a -14 dBFS threshold)
+             list_silent = detect_silence(audio, min_silence_len=300,
+                                          silence_thresh=audio.dBFS - 14,
+                                          seek_step=100)
+
+             # Invert silence intervals into speech intervals
+             list_speech = []
+             current_start = 0
+             for start, stop in list_silent:
+                 if current_start < start:
+                     list_speech.append((current_start, start))
+                 current_start = stop
+             # Add the last interval if needed
+             if current_start < len(audio):
+                 list_speech.append((current_start, len(audio)))
+
+             # Pack speech intervals into chunks that fit within MAX_LEN
+             list_chunks = []
+             deb_chunk, fin_chunk = 0, list_speech[0][1]  # current chunk start/end, in ms
+             for start, end in list_speech[1:]:
+                 if end - deb_chunk + ONE_SECOND <= MAX_LEN:
+                     fin_chunk = end
+                 else:
+                     list_chunks.append([deb_chunk, fin_chunk + ONE_SECOND])
+                     deb_chunk, fin_chunk = start, end
+             list_chunks.append([deb_chunk, fin_chunk + ONE_SECOND])
+
+             # Save the chunks
+             for i, (start, stop) in enumerate(list_chunks):
+                 segment = audio[start:stop]
+                 segment.export(f"chunk_{i}.wav", format="wav")
+                 list_audio_path.append(f"chunk_{i}.wav")
+
+             status = f"✅ **Success!** {len(list_audio_path)} chunks saved."
+         except Exception as e:
+             status = gr.Markdown(f"❌ **Unexpected error during chunks creation:** {e}")
+
+     return list_audio_path, status
+ ###
+
 @spaces.GPU
 def process_transcript(language: str, audio_path: str) -> str:
     """Process the audio file to return its transcription.
@@ -25,66 +120,105 @@ def process_transcript(language: str, audio_path: str) -> str:

     Returns:
         The transcribed text of the audio.
+         The status of the transcription: with or without chunking.
     """
+     result = ""
+     status = gr.Markdown()

     if audio_path is None:
-         return "Please provide some input audio: either upload an audio file or use the microphone."
+         status = gr.Markdown("Please provide some input audio: either upload an audio file or use the microphone.")
     else:
         id_language = dict_languages[language]
-         inputs = processor.apply_transcrition_request(language=id_language, audio=audio_path, model_id=model_name)
-         inputs = inputs.to(device, dtype=torch.bfloat16)
-         outputs = model.generate(**inputs, max_new_tokens=MAX_TOKENS)
-         decoded_outputs = processor.batch_decode(outputs[:, inputs.input_ids.shape[1]:], skip_special_tokens=True)
-
-         return decoded_outputs[0]
+         # Check the duration, splitting into chunks if needed
+         list_audio_path, status = chunks_creation(audio_path)
+
+         # Transcribe each chunk and concatenate the results
+         try:
+             for path in list_audio_path:
+                 inputs = processor.apply_transcrition_request(language=id_language,
+                                                               audio=path, model_id=model_name)
+                 inputs = inputs.to(device, dtype=torch.bfloat16)
+                 outputs = model.generate(**inputs, max_new_tokens=MAX_TOKENS)
+                 decoded_outputs = processor.batch_decode(outputs[:, inputs.input_ids.shape[1]:],
+                                                          skip_special_tokens=True)
+                 result += decoded_outputs[0]
+             status = "✅ **Success!** Transcription done."
+         except Exception as e:
+             status = gr.Markdown(f"❌ **Unexpected error during transcription:** {e}")
+
+     return result, status
 ###

 @spaces.GPU
 def process_translate(language: str, audio_path: str) -> str:
-     conversation = [
-         {
-             "role": "user",
-             "content": [
-                 {
-                     "type": "audio",
-                     "path": audio_path,
-                 },
-                 {"type": "text", "text": "Translate this in "+language},
-             ],
-         }
-     ]
-
-     inputs = processor.apply_chat_template(conversation)
-     inputs = inputs.to(device, dtype=torch.bfloat16)
-
-     outputs = model.generate(**inputs, max_new_tokens=MAX_TOKENS)
-     decoded_outputs = processor.batch_decode(outputs[:, inputs.input_ids.shape[1]:], skip_special_tokens=True)
-
-     return decoded_outputs[0]
+     result = ""
+     status = gr.Markdown()
+
+     if audio_path is None:
+         status = gr.Markdown("Please provide some input audio: either upload an audio file or use the microphone.")
+     else:
+         try:
+             conversation = [
+                 {
+                     "role": "user",
+                     "content": [
+                         {
+                             "type": "audio",
+                             "path": audio_path,
+                         },
+                         {"type": "text", "text": "Translate this into " + language},
+                     ],
+                 }
+             ]
+
+             inputs = processor.apply_chat_template(conversation)
+             inputs = inputs.to(device, dtype=torch.bfloat16)
+
+             outputs = model.generate(**inputs, max_new_tokens=MAX_TOKENS)
+             decoded_outputs = processor.batch_decode(outputs[:, inputs.input_ids.shape[1]:],
+                                                      skip_special_tokens=True)
+             result = decoded_outputs[0]
+             status = "✅ **Success!** Translation done."
+         except Exception as e:
+             status = gr.Markdown(f"❌ **Unexpected error during translation:** {e}")
+
+     return result, status
 ###

 @spaces.GPU
 def process_chat(question: str, audio_path: str) -> str:
-     conversation = [
-         {
-             "role": "user",
-             "content": [
-                 {
-                     "type": "audio",
-                     "path": audio_path,
-                 },
-                 {"type": "text", "text": question},
-             ],
-         }
-     ]
-
-     inputs = processor.apply_chat_template(conversation)
-     inputs = inputs.to(device, dtype=torch.bfloat16)
-
-     outputs = model.generate(**inputs, max_new_tokens=500)
-     decoded_outputs = processor.batch_decode(outputs[:, inputs.input_ids.shape[1]:], skip_special_tokens=True)
-
-     return decoded_outputs[0]
+     result = ""
+     status = gr.Markdown()
+
+     if audio_path is None:
+         status = gr.Markdown("Please provide some input audio: either upload an audio file or use the microphone.")
+     else:
+         try:
+             conversation = [
+                 {
+                     "role": "user",
+                     "content": [
+                         {
+                             "type": "audio",
+                             "path": audio_path,
+                         },
+                         {"type": "text", "text": question},
+                     ],
+                 }
+             ]
+
+             inputs = processor.apply_chat_template(conversation)
+             inputs = inputs.to(device, dtype=torch.bfloat16)
+
+             outputs = model.generate(**inputs, max_new_tokens=500)
+             decoded_outputs = processor.batch_decode(outputs[:, inputs.input_ids.shape[1]:],
+                                                      skip_special_tokens=True)
+             result = decoded_outputs[0]
+             status = "✅ **Success!** Answer generated."
+         except Exception as e:
+             status = gr.Markdown(f"❌ **Unexpected error during chat:** {e}")
+
+     return result, status
 ###

 def disable_buttons():
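
Stripped of the chunk loop and the UI plumbing, one transcription request in this commit boils down to the sketch below (all names come from the initializations above; the spelling apply_transcrition_request is kept exactly as the commit uses it):

```python
# Standalone sketch of a single transcription request, no chunking or UI.
inputs = processor.apply_transcrition_request(language="en",
                                              audio="chunk_0.wav",
                                              model_id=model_name)
inputs = inputs.to(device, dtype=torch.bfloat16)
outputs = model.generate(**inputs, max_new_tokens=MAX_TOKENS)
# Slice off the prompt tokens so only the newly generated text is decoded:
text = processor.batch_decode(outputs[:, inputs.input_ids.shape[1]:],
                              skip_special_tokens=True)[0]
print(text)
```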
@@ -94,6 +228,30 @@ def enable_buttons():
     return gr.update(interactive=True), gr.update(interactive=True), gr.update(interactive=True)
 ###

+ def clear_audio():
+     return None, None, None, None
+ ###
+
+ @spaces.GPU
+ def voice_extract_demucs():
+     """
+     Returns the path of the voice-extracted file.
+     """
+     try:
+         cmd = [
+             "demucs",
+             "--two-stems=vocals",
+             "--out", "demucs",
+             "audio_file.wav"
+         ]
+         subprocess.run(cmd, check=True)
+         # Demucs writes <out>/<model>/<track>/<stem>.wav
+         voice_path = os.path.join("demucs", "htdemucs", "audio_file", "vocals.wav")
+         success_message = "✅ **Success!** Voice extracted."
+         return voice_path, voice_path, gr.Markdown(success_message)
+     except Exception as e:
+         return None, None, gr.Markdown(f"❌ **Error:** An unexpected error occurred: {e}")
+ ###
+
 def secure_download_from_url(url: str):
     """
     Validates a URL and downloads the file if it is an authorized media file.
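
The body of secure_download_from_url lies outside this diff; given its docstring and the ALLOWED_MIME_TYPES / MAX_FILE_SIZE constants above, the validation step presumably resembles the following sketch (looks_like_allowed_media is a hypothetical name, not the app's actual code):

```python
# Hedged sketch of a whitelist check consistent with ALLOWED_MIME_TYPES
# and MAX_FILE_SIZE; not the actual implementation in app.py.
import requests
import validators

def looks_like_allowed_media(url: str) -> bool:
    if not validators.url(url):
        return False
    head = requests.head(url, allow_redirects=True, timeout=10)
    mime = head.headers.get("Content-Type", "").split(";")[0].strip()
    size = int(head.headers.get("Content-Length", "0"))
    return mime in ALLOWED_MIME_TYPES and size <= MAX_FILE_SIZE
```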
@@ -269,57 +427,15 @@ def clear_audio():
     return None, None, None, None
 ###

- ### Initializations
-
- MAX_TOKENS = 32000
-
- device = "cuda" if torch.cuda.is_available() else "cpu"
- print(f"*** Device: {device}")
- model_name = 'mistralai/Voxtral-Mini-3B-2507'
-
- processor = AutoProcessor.from_pretrained(model_name)
- model = VoxtralForConditionalGeneration.from_pretrained(model_name,
-                                                         torch_dtype=torch.bfloat16,
-                                                         device_map=device)
- # Supported languages
- dict_languages = {"English": "en",
-                   "French": "fr",
-                   "German": "de",
-                   "Spanish": "es",
-                   "Italian": "it",
-                   "Portuguese": "pt",
-                   "Dutch": "nl",
-                   "Hindi": "hi"}
-
- # Whitelist of allowed MIME types for audio and video
- ALLOWED_MIME_TYPES = {
-     # Audio
-     'audio/mpeg', 'audio/wav', 'audio/wave', 'audio/x-wav', 'audio/x-pn-wav',
-     'audio/ogg', 'audio/vorbis', 'audio/aac', 'audio/mp4', 'audio/flac',
-     'audio/x-flac', 'audio/opus', 'audio/webm',
-     # Vidéo
-     'video/mp4', 'video/mpeg', 'video/ogg', 'video/webm', 'video/quicktime',
-     'video/x-msvideo', 'video/x-matroska'
- }
-
- # Maximum allowed file size (in bytes). Ex: 1 GB
- MAX_FILE_SIZE = 1 * 1024 * 1024 * 1024 # 1 GB
-
- # Directory where the files will be saved
- DOWNLOAD_DIR = "downloaded_files"
- if not os.path.exists(DOWNLOAD_DIR):
-     os.makedirs(DOWNLOAD_DIR)

 #### Gradio interface
 with gr.Blocks(title="Voxtral") as voxtral:
-     gr.Markdown("# **Voxtral Mini Evaluation**")
-     gr.Markdown("""#### Voxtral Mini is an enhancement of **Ministral 3B**, incorporating state-of-the-art audio input \
-     capabilities while retaining best-in-class text performance.
-     #### It excels at speech transcription, translation and audio understanding.""")
+     with gr.Row():
+         gr.Markdown("# **Voxtral Mini Evaluation**")

-     with gr.Accordion("🔎 More on Voxtral", open=False):
-         gr.Markdown("""## **Key Features:**
+     with gr.Accordion("🔎 More on Voxtral", open=False):
+         gr.Markdown("""## **Key Features:**

     #### Voxtral builds upon Ministral-3B with powerful audio understanding capabilities.
     ##### - **Dedicated transcription mode**: Voxtral can operate in a pure speech transcription mode to maximize performance. By default, Voxtral automatically predicts the source audio language and transcribes the text accordingly
@@ -329,6 +445,9 @@ with gr.Blocks(title="Voxtral") as voxtral:
     ##### - **Function-calling straight from voice**: Enables direct triggering of backend functions, workflows, or API calls based on spoken user intents
     ##### - **Highly capable at text**: Retains the text understanding capabilities of its language model backbone, Ministral-3B""")

+     gr.Markdown("""#### Voxtral Mini is an enhancement of **Ministral 3B**, incorporating state-of-the-art audio input \
+     capabilities while retaining best-in-class text performance. It excels at speech transcription, translation and \
+     audio understanding. Available languages: English, Spanish, French, Portuguese, Hindi, German, Dutch, Italian.""")

     gr.Markdown("### **1. Choose the audio:**")
     sel_audio = gr.State()
@@ -336,12 +455,12 @@ with gr.Blocks(title="Voxtral") as voxtral:
     with gr.Tabs():
         with gr.Tab("From record or file upload"):
             gr.Markdown("### **Upload an audio file, record via microphone, or select a demo file:**")
-             gr.Markdown("### *(Voxtral handles audios up to 30 minutes for transcription)*")
+             gr.Markdown("### *(Voxtral handles audio up to 30 minutes for transcription; longer files are cut into chunks)*")
             sel_audio1 = gr.Audio(sources=["upload", "microphone"], type="filepath",
                                   label="Set an audio file to process it:")
-             example = [["mapo_tofu.mp3"]]
+             example1 = [["mapo_tofu.mp3"]]
             gr.Examples(
-                 examples=example,
+                 examples=example1,
                 inputs=sel_audio1,
                 outputs=None,
                 fn=None,
@@ -363,6 +482,15 @@ with gr.Blocks(title="Voxtral") as voxtral:
             gr.Markdown("### **Enter the URL of the file (mp3, wav, mp4, ...):**")
             url_input2 = gr.Textbox(label="URL (MP3 or MP4 file)",
                                     placeholder="https://huggingface.co/datasets/merve/vlm_test_images/resolve/main/mapo_tofu.mp4")
+             example2 = [["https://huggingface.co/datasets/merve/vlm_test_images/resolve/main/mapo_tofu.mp4"]]
+             gr.Examples(
+                 examples=example2,
+                 inputs=url_input2,
+                 outputs=None,
+                 fn=None,
+                 cache_examples=False,
+                 run_on_click=False
+             )
             download_button2 = gr.Button("Check and upload", variant="primary")
             input_audio2 = gr.Audio()
             status_output2 = gr.Markdown()
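
A note on the gr.Examples blocks this commit adds in each tab: with fn=None, cache_examples=False, and run_on_click=False, clicking an example only fills the associated input component; nothing runs until the user presses the corresponding button.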
@@ -416,23 +544,36 @@ with gr.Blocks(title="Voxtral") as voxtral:
                 )
                 submit_transcript = gr.Button("Extract transcription", variant="primary")
                 text_transcript = gr.Textbox(label="💬 Generated transcription", lines=10)
+                 status_transcript = gr.Markdown()

         with gr.Column():
             with gr.Accordion("🔁 Translation", open=True):
+                 list_language = list(dict_languages.keys())
+                 list_language.pop(list_language.index(sel_language.value))  # Fix: access the value of the dropdown
                 sel_translate_language = gr.Dropdown(
                     choices=list(dict_languages.keys()),
                     value="English",
                     label="Select the language for translation:"
                 )
-
                 submit_translate = gr.Button("Translate audio file", variant="primary")
                 text_translate = gr.Textbox(label="💬 Generated translation", lines=10)
+                 status_translate = gr.Markdown()

         with gr.Column():
             with gr.Accordion("🤖 Ask audio file", open=True):
                 question_chat = gr.Textbox(label="Enter your question about the audio file:", placeholder="Enter your question about the audio file")
-                 submit_chat = gr.Button("Ask audio file:", variant="primary")
+                 submit_chat = gr.Button("Ask audio file", variant="primary")
+                 example_chat = [["What is the subject of this audio file?"], ["Quels sont les ingrédients ?"]]
+                 gr.Examples(
+                     examples=example_chat,
+                     inputs=question_chat,
+                     outputs=None,
+                     fn=None,
+                     cache_examples=False,
+                     run_on_click=False
+                 )
                 text_chat = gr.Textbox(label="💬 Model answer", lines=10)
+                 status_chat = gr.Markdown()

     ### Processing
@@ -444,7 +585,7 @@ with gr.Blocks(title="Voxtral") as voxtral:
     ).then(
         fn=process_transcript,
         inputs=[sel_language, sel_audio],
-         outputs=text_transcript
+         outputs=[text_transcript, status_transcript]
     ).then(
         enable_buttons,
         outputs=[submit_transcript, submit_translate, submit_chat],
@@ -458,7 +599,7 @@ with gr.Blocks(title="Voxtral") as voxtral:
     ).then(
         fn=process_translate,
         inputs=[sel_translate_language, sel_audio],
-         outputs=text_translate
+         outputs=[text_translate, status_translate]
     ).then(
         enable_buttons,
         outputs=[submit_transcript, submit_translate, submit_chat],
@@ -472,7 +613,7 @@ with gr.Blocks(title="Voxtral") as voxtral:
     ).then(
         fn=process_chat,
         inputs=[question_chat, sel_audio],
-         outputs=text_chat
+         outputs=[text_chat, status_chat]
     ).then(
         enable_buttons,
         outputs=[submit_transcript, submit_translate, submit_chat],
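
All three buttons are wired with the same disable/process/re-enable chain. Assembled in one place (the .click(...) opening sits outside these hunks but is implied by the disable_buttons and enable_buttons definitions), the pattern reads:

```python
# The wiring pattern implied by the hunks above (transcription shown;
# translation and chat are identical up to the function and components).
submit_transcript.click(
    fn=disable_buttons,
    outputs=[submit_transcript, submit_translate, submit_chat],
).then(
    fn=process_transcript,
    inputs=[sel_language, sel_audio],
    outputs=[text_transcript, status_transcript],
).then(
    fn=enable_buttons,
    outputs=[submit_transcript, submit_translate, submit_chat],
)
```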
@@ -481,4 +622,4 @@
 ### Launch the app

 if __name__ == "__main__":
-     voxtral.queue().launch()
+     voxtral.queue().launch(debug=True)
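
The launch change is a small debugging aid: queue() routes events through a queue (one at a time by default), and debug=True makes Gradio print worker errors to the console, which complements the new status Markdown outputs added throughout this commit.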
 