Loren committed on
Commit
5327d9d
·
verified ·
1 Parent(s): 740245f

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +107 -25
app.py CHANGED
@@ -3,11 +3,68 @@ import torch
3
  from transformers import AutoProcessor, VoxtralForConditionalGeneration
4
  import spaces
5
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
6
  MAX_TOKENS = 32000
7
 
8
  device = "cuda" if torch.cuda.is_available() else "cpu"
9
  print(f"*** Device: {device}")
10
-
11
  model_name = 'mistralai/Voxtral-Mini-3B-2507'
12
 
13
  processor = AutoProcessor.from_pretrained(model_name)
@@ -24,28 +81,13 @@ dict_languages = {"English": "en",
24
  "Dutch": "nl",
25
  "Hindi": "hi"}
26
 
27
- @spaces.GPU
28
- def process_transcript(language, audio_path):
29
- """Process audio with selected Voxtral model and return the generated response"""
30
-
31
- if audio_path is None:
32
- return "Please provide some input audio: either upload an audio file or use the microphone."
33
- else:
34
- id_language = dict_languages[language]
35
- inputs = processor.apply_transcrition_request(language=id_language, audio=audio_path, model_id=model_name)
36
- inputs = inputs.to(device, dtype=torch.bfloat16)
37
- outputs = model.generate(**inputs, max_new_tokens=MAX_TOKENS)
38
- decoded_outputs = processor.batch_decode(outputs[:, inputs.input_ids.shape[1]:], skip_special_tokens=True)
39
-
40
- return decoded_outputs[0]
41
-
42
 
 
43
 
44
- # Define Gradio interface
45
  with gr.Blocks(title="Transcription") as transcript:
46
- gr.Markdown("# Audio Transcription")
47
- gr.Markdown("#### Choose the language of the audio and the model, then set an audio file to get its transcription.")
48
- gr.Markdown("#### **(Voxtral handles audios up to 30 minutes for transcription)**")
49
 
50
  with gr.Row():
51
  with gr.Column():
@@ -56,7 +98,8 @@ with gr.Blocks(title="Transcription") as transcript:
56
  )
57
 
58
  with gr.Column():
59
- sel_audio = gr.Audio(sources=["microphone", "upload"], type="filepath", label="Upload an audio file or record via microphone:")
 
60
 
61
  example = [["mapo_tofu.mp3"]]
62
  gr.Examples(
@@ -70,16 +113,55 @@ with gr.Blocks(title="Transcription") as transcript:
70
 
71
  with gr.Row():
72
  with gr.Column():
73
- submit_transcript = gr.Button("Extract Transcription", variant="primary")
74
- text_transcript = gr.Textbox(label="Generated Response", lines=10)
75
 
 
 
 
 
 
 
 
 
 
 
 
 
 
76
 
 
 
 
77
  submit_transcript.click(
 
 
 
 
78
  fn=process_transcript,
79
  inputs=[sel_language, sel_audio],
80
  outputs=text_transcript
 
 
 
81
  )
82
 
83
- # Launch the app
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
84
  if __name__ == "__main__":
85
- transcript.launch(share=True)
 
3
  from transformers import AutoProcessor, VoxtralForConditionalGeneration
4
  import spaces
5
 
6
#### Functions

@spaces.GPU
def process_transcript(language: str, audio_path: str) -> str:
    """Transcribe an audio file with Voxtral and return the text.

    Args:
        language: Display name of the audio's language (a key of ``dict_languages``).
        audio_path: Filesystem path of the audio to transcribe, or ``None``.

    Returns:
        The transcription, or an instruction message when no audio was given.
    """
    # Gradio passes None when neither an upload nor a recording was provided.
    if audio_path is None:
        return "Please provide some input audio: either upload an audio file or use the microphone."

    id_language = dict_languages[language]
    # NOTE: "transcrition" [sic] — presumably the method name as shipped by the
    # transformers Voxtral processor; confirm against the installed version.
    model_inputs = processor.apply_transcrition_request(
        language=id_language, audio=audio_path, model_id=model_name
    )
    model_inputs = model_inputs.to(device, dtype=torch.bfloat16)

    generated = model.generate(**model_inputs, max_new_tokens=MAX_TOKENS)
    # Slice off the prompt tokens so only the newly generated text is decoded.
    texts = processor.batch_decode(
        generated[:, model_inputs.input_ids.shape[1]:], skip_special_tokens=True
    )
    return texts[0]
###
31
+
32
@spaces.GPU
def process_translate(language: str, audio_path: str) -> str:
    """Translate the speech in an audio file into the selected language.

    Args:
        language: Target language display name (a key of ``dict_languages``).
        audio_path: Path to the audio file, or ``None`` when nothing was set.

    Returns:
        The translated text, or an instruction message when no audio was given.
    """
    # Same guard as process_transcript: Gradio passes None when no audio is set.
    if audio_path is None:
        return "Please provide some input audio: either upload an audio file or use the microphone."

    conversation = [
        {
            "role": "user",
            "content": [
                {
                    "type": "audio",
                    # BUG FIX: was '"+audio_path,' (stray opening quote, a
                    # syntax error) — pass the path value itself.
                    "path": audio_path,
                },
                {"type": "text", "text": "Translate this in " + language},
            ],
        }
    ]

    inputs = processor.apply_chat_template(conversation)
    inputs = inputs.to(device, dtype=torch.bfloat16)

    outputs = model.generate(**inputs, max_new_tokens=MAX_TOKENS)
    decoded_outputs = processor.batch_decode(
        outputs[:, inputs.input_ids.shape[1]:], skip_special_tokens=True
    )
    # Return a single string (not the list) so it renders cleanly in a Textbox,
    # matching process_transcript's contract.
    return decoded_outputs[0]
53
+
54
+
55
def disable_buttons():
    """Disable the three action buttons while a request is running.

    BUG FIX: the click handlers list three output components
    (submit_transcript, submit_translate, submit_chat), so three updates must
    be returned — returning only two makes Gradio raise a return-count error.
    """
    return (
        gr.update(interactive=False),
        gr.update(interactive=False),
        gr.update(interactive=False),
    )
57
+
58
def enable_buttons():
    """Re-enable the three action buttons once a request has finished.

    BUG FIX: mirrors disable_buttons — the handlers wire three output
    components, so three updates are required (the original returned two).
    """
    return (
        gr.update(interactive=True),
        gr.update(interactive=True),
        gr.update(interactive=True),
    )
60
+ ###
61
+
62
+ ### Initializations
63
+
64
  MAX_TOKENS = 32000
65
 
66
  device = "cuda" if torch.cuda.is_available() else "cpu"
67
  print(f"*** Device: {device}")
 
68
  model_name = 'mistralai/Voxtral-Mini-3B-2507'
69
 
70
  processor = AutoProcessor.from_pretrained(model_name)
 
81
  "Dutch": "nl",
82
  "Hindi": "hi"}
83
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
84
 
85
+ #### Gradio interface
86
 
 
87
  with gr.Blocks(title="Transcription") as transcript:
88
+ gr.Markdown("# Voxtral Mini Evaluation")
89
+ gr.Markdown("#### Choose the language of the audio and set an audio file to process it.")
90
+ gr.Markdown("##### *(Voxtral handles audios up to 30 minutes for transcription)*")
91
 
92
  with gr.Row():
93
  with gr.Column():
 
98
  )
99
 
100
  with gr.Column():
101
+ sel_audio = gr.Audio(sources=["upload", "microphone"], type="filepath",
102
+ label="Upload an audio file, record via microphone, or select a demo file:")
103
 
104
  example = [["mapo_tofu.mp3"]]
105
  gr.Examples(
 
113
 
114
  with gr.Row():
115
  with gr.Column():
116
+ submit_transcript = gr.Button("Extract transcription", variant="primary")
117
+ text_transcript = gr.Textbox(label="Generated transcription", lines=10)
118
 
119
+ with gr.Column():
120
+ sel_translate_language = gr.Dropdown(
121
+ choices=list(dict_languages.keys()),
122
+ value="English",
123
+ label="Select the language for translation:"
124
+ )
125
+
126
+ submit_translate = gr.Button("Translate audio file", variant="primary")
127
+ text_translate = gr.Textbox(label="Generated translation", lines=10)
128
+
129
+ with gr.Column():
130
+ submit_chat = gr.Button("Ask audio file", variant="primary")
131
+ text_chat = gr.Textbox(label="Model answer", lines=10)
132
 
133
+ ### Processing
134
+
135
+ # Transcription
136
  submit_transcript.click(
137
+ disable_buttons,
138
+ outputs=[submit_transcript, submit_translate, submit_chat],
139
+ trigger_mode="once",
140
+ ).then(
141
  fn=process_transcript,
142
  inputs=[sel_language, sel_audio],
143
  outputs=text_transcript
144
+ ).then(
145
+ enable_buttons,
146
+ outputs=[submit_transcript, submit_translate, submit_chat],
147
  )
148
 
149
+ # Translation
150
+ submit_transcript.click(
151
+ disable_buttons,
152
+ outputs=[submit_transcript, submit_translate, submit_chat],
153
+ trigger_mode="once",
154
+ ).then(
155
+ fn=process_transcript,
156
+ inputs=[sel_language, sel_audio],
157
+ outputs=text_transcript
158
+ ).then(
159
+ enable_buttons,
160
+ outputs=[submit_transcript, submit_translate, submit_chat],
161
+ )
162
+
163
+
164
### Launch the app

if __name__ == "__main__":
    # BUG FIX: `audio` is undefined — the Blocks app is bound to `transcript`
    # (see `with gr.Blocks(...) as transcript:`).
    transcript.launch()