Loren committed on
Commit
740245f
·
verified ·
1 Parent(s): a71ad7c

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +4 -11
app.py CHANGED
@@ -25,22 +25,15 @@ dict_languages = {"English": "en",
25
  "Hindi": "hi"}
26
 
27
  @spaces.GPU
28
- def process_transcript(model, processor, language, audio_path):
29
  """Process audio with selected Voxtral model and return the generated response"""
30
 
31
  if audio_path is None:
32
  return "Please provide some input audio: either upload an audio file or use the microphone."
33
  else:
34
  id_language = dict_languages[language]
35
- raw_inputs = processor.apply_transcrition_request(language=id_language, audio=audio_path, model_id=model_name)
36
- # inputs = inputs.to(device, dtype=torch.bfloat16)
37
- # inputs = {k: v.to(device, dtype=torch.bfloat16) for k, v in inputs.items()}
38
- inputs = {}
39
- for k, v in raw_inputs.items():
40
- if k == "input_ids":
41
- inputs[k] = v.to(device=device, dtype=torch.long)
42
- else:
43
- inputs[k] = v.to(device=device, dtype=torch.bfloat16)
44
  outputs = model.generate(**inputs, max_new_tokens=MAX_TOKENS)
45
  decoded_outputs = processor.batch_decode(outputs[:, inputs.input_ids.shape[1]:], skip_special_tokens=True)
46
 
@@ -82,7 +75,7 @@ with gr.Blocks(title="Transcription") as transcript:
82
 
83
 
84
  submit_transcript.click(
85
- fn=lambda v1, v2: process_transcript(model, processor, v1, v2),
86
  inputs=[sel_language, sel_audio],
87
  outputs=text_transcript
88
  )
 
25
  "Hindi": "hi"}
26
 
27
  @spaces.GPU
28
+ def process_transcript(language, audio_path):
29
  """Process audio with selected Voxtral model and return the generated response"""
30
 
31
  if audio_path is None:
32
  return "Please provide some input audio: either upload an audio file or use the microphone."
33
  else:
34
  id_language = dict_languages[language]
35
+ inputs = processor.apply_transcrition_request(language=id_language, audio=audio_path, model_id=model_name)
36
+ inputs = inputs.to(device, dtype=torch.bfloat16)
 
 
 
 
 
 
 
37
  outputs = model.generate(**inputs, max_new_tokens=MAX_TOKENS)
38
  decoded_outputs = processor.batch_decode(outputs[:, inputs.input_ids.shape[1]:], skip_special_tokens=True)
39
 
 
75
 
76
 
77
  submit_transcript.click(
78
+ fn=process_transcript,
79
  inputs=[sel_language, sel_audio],
80
  outputs=text_transcript
81
  )