MohamedRashad committed
Commit ba6a1e9 · 1 Parent(s): 19b28ef

Add language support and update audio processing function in Voxtral app

Files changed (2):
  1. app.py +20 -3
  2. requirements.txt +2 -1
app.py CHANGED
@@ -13,8 +13,22 @@ voxtral_mini_model = VoxtralForConditionalGeneration.from_pretrained("MohamedRas
 voxtral_small_processor = AutoProcessor.from_pretrained("MohamedRashad/Voxtral-Small-24B-2507-transformers")
 voxtral_small_model = VoxtralForConditionalGeneration.from_pretrained("MohamedRashad/Voxtral-Small-24B-2507-transformers", torch_dtype=torch.bfloat16, device_map=device)
 
+LANGUAGES = {
+    "English": "en",
+    "French": "fr",
+    "German": "de",
+    "Spanish": "es",
+    "Italian": "it",
+    "Portuguese": "pt",
+    "Dutch": "nl",
+    "Russian": "ru",
+    "Chinese": "zh",
+    "Japanese": "ja",
+    "Arabic": "ar",
+}
+
 @spaces.GPU()
-def process_audio(audio_path, model_name, language="en", max_tokens=500):
+def process_audio(audio_path, model_name, lang_name, max_tokens=500):
     """Process audio with selected Voxtral model and return the generated response"""
     if not audio_path:
         return "Please upload an audio file."
@@ -30,6 +44,7 @@ def process_audio(audio_path, model_name, language="en", max_tokens=500):
     else:
         return "Invalid model selected."
 
+    language = LANGUAGES[lang_name]
     inputs = processor.apply_transcription_request(language=language, audio=audio_path, model_id=repo_id)
     inputs = inputs.to(device, dtype=torch.bfloat16)
 
@@ -38,6 +53,8 @@ def process_audio(audio_path, model_name, language="en", max_tokens=500):
 
     return decoded_outputs[0]
 
+
+
 # Define Gradio interface
 with gr.Blocks(title="Voxtral Demo") as demo:
     gr.Markdown("# Voxtral Audio Processing Demo")
@@ -54,13 +71,13 @@ with gr.Blocks(title="Voxtral Demo") as demo:
             )
 
             language = gr.Dropdown(
-                choices=["en", "fr", "de", "es", "it", "pt", "nl", "ru", "zh", "ja", "ar"],
-                value="en",
+                choices=list(LANGUAGES.keys()),
+                value="English",
                 label="Language"
             )
 
             max_tokens = gr.Slider(minimum=50, maximum=1000, value=500, step=50, label="Max Output Tokens")
-            submit_btn = gr.Button("Process Audio")
+            submit_btn = gr.Button("Extract Transcription", variant="primary")
 
         with gr.Column():
             output_text = gr.Textbox(label="Generated Response", lines=10)
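The hunks above do not show how the submit button is bound to process_audio. For context, a minimal sketch of the wiring the new signature implies, placed inside the gr.Blocks context; the .click() call and the component names audio_input and model_name are assumptions, not part of this commit:

# Hypothetical wiring (assumed, not shown in this diff): the dropdown now
# returns a language *name*, which process_audio maps to a code via LANGUAGES.
submit_btn.click(
    fn=process_audio,
    inputs=[audio_input, model_name, language, max_tokens],
    outputs=output_text,
)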
requirements.txt CHANGED
@@ -2,4 +2,5 @@ mistral-common
 git+https://github.com/huggingface/transformers
 gradio
 torch
-spaces
+spaces
+accelerate
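Not stated in the commit, but the likely reason for adding accelerate: app.py loads both models with device_map=device, and Transformers delegates device_map handling to the Accelerate library, which must be installed for that argument to work. A quick environment check after installing these requirements (an assumed snippet, not part of the repo):

# Verify the stack these requirements pull in; device_map-based loading in
# app.py relies on accelerate being importable alongside transformers.
import accelerate
import gradio
import torch
import transformers

print("transformers:", transformers.__version__)
print("accelerate:", accelerate.__version__)
print("gradio:", gradio.__version__)
print("torch:", torch.__version__, "| CUDA available:", torch.cuda.is_available())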