MohamedRashad committed
Commit ba6a1e9 · 1 Parent(s): 19b28ef

Add language support and update audio processing function in Voxtral app

Files changed (2):
  1. app.py +20 -3
  2. requirements.txt +2 -1
app.py CHANGED
@@ -13,8 +13,22 @@ voxtral_mini_model = VoxtralForConditionalGeneration.from_pretrained("MohamedRas
 voxtral_small_processor = AutoProcessor.from_pretrained("MohamedRashad/Voxtral-Small-24B-2507-transformers")
 voxtral_small_model = VoxtralForConditionalGeneration.from_pretrained("MohamedRashad/Voxtral-Small-24B-2507-transformers", torch_dtype=torch.bfloat16, device_map=device)
 
+LANGUAGES = {
+    "English": "en",
+    "French": "fr",
+    "German": "de",
+    "Spanish": "es",
+    "Italian": "it",
+    "Portuguese": "pt",
+    "Dutch": "nl",
+    "Russian": "ru",
+    "Chinese": "zh",
+    "Japanese": "ja",
+    "Arabic": "ar",
+}
+
 @spaces.GPU()
-def process_audio(audio_path, model_name, language="en", max_tokens=500):
+def process_audio(audio_path, model_name, lang_name, max_tokens=500):
     """Process audio with selected Voxtral model and return the generated response"""
     if not audio_path:
         return "Please upload an audio file."
@@ -30,6 +44,7 @@ def process_audio(audio_path, model_name, language="en", max_tokens=500):
     else:
         return "Invalid model selected."
 
+    language = LANGUAGES[lang_name]
     inputs = processor.apply_transcription_request(language=language, audio=audio_path, model_id=repo_id)
     inputs = inputs.to(device, dtype=torch.bfloat16)
 
@@ -38,6 +53,8 @@ def process_audio(audio_path, model_name, language="en", max_tokens=500):
 
     return decoded_outputs[0]
 
+
+
 # Define Gradio interface
 with gr.Blocks(title="Voxtral Demo") as demo:
     gr.Markdown("# Voxtral Audio Processing Demo")
@@ -54,13 +71,13 @@ with gr.Blocks(title="Voxtral Demo") as demo:
             )
 
             language = gr.Dropdown(
-                choices=["en", "fr", "de", "es", "it", "pt", "nl", "ru", "zh", "ja", "ar"],
-                value="en",
+                choices=list(LANGUAGES.keys()),
+                value="English",
                 label="Language"
             )
 
             max_tokens = gr.Slider(minimum=50, maximum=1000, value=500, step=50, label="Max Output Tokens")
-            submit_btn = gr.Button("Process Audio")
+            submit_btn = gr.Button("Extract Transcription", variant="primary")
 
         with gr.Column():
             output_text = gr.Textbox(label="Generated Response", lines=10)
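The hunks above do not show how the submit button is bound to process_audio. For context, a minimal sketch of the wiring the new signature implies, placed inside the gr.Blocks context; the .click() call and the component names audio_input and model_name are assumptions, not part of this commit:

# Hypothetical wiring (assumed, not shown in this diff): the dropdown now
# returns a language *name*, which process_audio maps to a code via LANGUAGES.
submit_btn.click(
    fn=process_audio,
    inputs=[audio_input, model_name, language, max_tokens],
    outputs=output_text,
)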
requirements.txt CHANGED
@@ -2,4 +2,5 @@ mistral-common
 git+https://github.com/huggingface/transformers
 gradio
 torch
-spaces
+spaces
+accelerate
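Not stated in the commit, but the likely reason for adding accelerate: app.py loads both models with device_map=device, and Transformers delegates device_map handling to the Accelerate library, which must be installed for that argument to work. A quick environment check after installing these requirements (an assumed snippet, not part of the repo):

# Verify the stack these requirements pull in; device_map-based loading in
# app.py relies on accelerate being importable alongside transformers.
import accelerate
import gradio
import torch
import transformers

print("transformers:", transformers.__version__)
print("accelerate:", accelerate.__version__)
print("gradio:", gradio.__version__)
print("torch:", torch.__version__, "| CUDA available:", torch.cuda.is_available())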