Spaces:
Running
on
Zero
Running
on
Zero
Commit
·
ba6a1e9
1
Parent(s):
19b28ef
Add language support and update audio processing function in Voxtral app
Browse files
- app.py +20 -3
- requirements.txt +2 -1
app.py
CHANGED
@@ -13,8 +13,22 @@ voxtral_mini_model = VoxtralForConditionalGeneration.from_pretrained("MohamedRas
|
|
13 |
voxtral_small_processor = AutoProcessor.from_pretrained("MohamedRashad/Voxtral-Small-24B-2507-transformers")
|
14 |
voxtral_small_model = VoxtralForConditionalGeneration.from_pretrained("MohamedRashad/Voxtral-Small-24B-2507-transformers", torch_dtype=torch.bfloat16, device_map=device)
|
15 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
16 |
@spaces.GPU()
|
17 |
-
def process_audio(audio_path, model_name, language="en", max_tokens=500):
|
18 |
"""Process audio with selected Voxtral model and return the generated response"""
|
19 |
if not audio_path:
|
20 |
return "Please upload an audio file."
|
@@ -30,6 +44,7 @@ def process_audio(audio_path, model_name, language="en", max_tokens=500):
|
|
30 |
else:
|
31 |
return "Invalid model selected."
|
32 |
|
|
|
33 |
inputs = processor.apply_transcription_request(language=language, audio=audio_path, model_id=repo_id)
|
34 |
inputs = inputs.to(device, dtype=torch.bfloat16)
|
35 |
|
@@ -38,6 +53,8 @@ def process_audio(audio_path, model_name, language="en", max_tokens=500):
|
|
38 |
|
39 |
return decoded_outputs[0]
|
40 |
|
|
|
|
|
41 |
# Define Gradio interface
|
42 |
with gr.Blocks(title="Voxtral Demo") as demo:
|
43 |
gr.Markdown("# Voxtral Audio Processing Demo")
|
@@ -54,13 +71,13 @@ with gr.Blocks(title="Voxtral Demo") as demo:
|
|
54 |
)
|
55 |
|
56 |
language = gr.Dropdown(
|
57 |
-
choices=
|
58 |
value="en",
|
59 |
label="Language"
|
60 |
)
|
61 |
|
62 |
max_tokens = gr.Slider(minimum=50, maximum=1000, value=500, step=50, label="Max Output Tokens")
|
63 |
-
submit_btn = gr.Button("
|
64 |
|
65 |
with gr.Column():
|
66 |
output_text = gr.Textbox(label="Generated Response", lines=10)
|
|
|
13 |
voxtral_small_processor = AutoProcessor.from_pretrained("MohamedRashad/Voxtral-Small-24B-2507-transformers")
|
14 |
voxtral_small_model = VoxtralForConditionalGeneration.from_pretrained("MohamedRashad/Voxtral-Small-24B-2507-transformers", torch_dtype=torch.bfloat16, device_map=device)
|
15 |
|
16 |
+
LANGUAGES = {
|
17 |
+
"English": "en",
|
18 |
+
"French": "fr",
|
19 |
+
"German": "de",
|
20 |
+
"Spanish": "es",
|
21 |
+
"Italian": "it",
|
22 |
+
"Portuguese": "pt",
|
23 |
+
"Dutch": "nl",
|
24 |
+
"Russian": "ru",
|
25 |
+
"Chinese": "zh",
|
26 |
+
"Japanese": "ja",
|
27 |
+
"Arabic": "ar",
|
28 |
+
}
|
29 |
+
|
30 |
@spaces.GPU()
|
31 |
+
def process_audio(audio_path, model_name, lang_name, max_tokens=500):
|
32 |
"""Process audio with selected Voxtral model and return the generated response"""
|
33 |
if not audio_path:
|
34 |
return "Please upload an audio file."
|
|
|
44 |
else:
|
45 |
return "Invalid model selected."
|
46 |
|
47 |
+
language = LANGUAGES[lang_name]
|
48 |
inputs = processor.apply_transcription_request(language=language, audio=audio_path, model_id=repo_id)
|
49 |
inputs = inputs.to(device, dtype=torch.bfloat16)
|
50 |
|
|
|
53 |
|
54 |
return decoded_outputs[0]
|
55 |
|
56 |
+
|
57 |
+
|
58 |
# Define Gradio interface
|
59 |
with gr.Blocks(title="Voxtral Demo") as demo:
|
60 |
gr.Markdown("# Voxtral Audio Processing Demo")
|
|
|
71 |
)
|
72 |
|
73 |
language = gr.Dropdown(
|
74 |
+
choices=list(LANGUAGES.keys()),
|
75 |
value="en",
|
76 |
label="Language"
|
77 |
)
|
78 |
|
79 |
max_tokens = gr.Slider(minimum=50, maximum=1000, value=500, step=50, label="Max Output Tokens")
|
80 |
+
submit_btn = gr.Button("Extract Transcription", variant="primary")
|
81 |
|
82 |
with gr.Column():
|
83 |
output_text = gr.Textbox(label="Generated Response", lines=10)
|
requirements.txt
CHANGED
@@ -2,4 +2,5 @@ mistral-common
|
|
2 |
git+https://github.com/huggingface/transformers
|
3 |
gradio
|
4 |
torch
|
5 |
-
spaces
|
|
|
|
2 |
git+https://github.com/huggingface/transformers
|
3 |
gradio
|
4 |
torch
|
5 |
+
spaces
|
6 |
+
accelerate
|