helvekami committed
Commit a0b460e · 1 Parent(s): 1895fc7

Updated Gradio App

Files changed (1)
  1. app.py +11 -18
app.py CHANGED
@@ -6,7 +6,7 @@ import spaces
 import numpy as np
 
 @spaces.GPU(duration=60)
-def transcribe_audio(audio_file):
+def transcribe_and_respond(audio_file):
     try:
         pipe = transformers.pipeline(
             model='sarvamai/shuka_v1',
@@ -15,30 +15,23 @@ def transcribe_audio(audio_file):
             torch_dtype=torch.bfloat16
         )
 
-        # Load the audio file at 16kHz
+        # Load the audio file
         audio, sr = librosa.load(audio_file, sr=16000)
-
-        # Ensure audio is a floating-point numpy array
-        audio = np.array(audio, dtype=np.float32)
-        # Convert multi-channel audio to mono if needed
-        if audio.ndim > 1:
-            audio = np.mean(audio, axis=-1)
-
-        # Debug: Print audio properties for troubleshooting
+
+        # Print audio properties for debugging
         print(f"Audio dtype: {audio.dtype}, Audio shape: {audio.shape}, Sample rate: {sr}")
 
-        # Change the conversation turns to instruct transcription
         turns = [
-            {'role': 'system', 'content': 'Please transcribe the following audio exactly.'},
+            {'role': 'system', 'content': 'Print text.'},
             {'role': 'user', 'content': '<|audio|>'}
         ]
-
+
         # Debug: Print the initial turns
         print(f"Initial turns: {turns}")
 
-        # Call the model with the audio and transcription prompt
+        # Call the model with the audio and prompt
         output = pipe({'audio': audio, 'turns': turns, 'sampling_rate': sr}, max_new_tokens=512)
-
+
         # Debug: Print the final output from the model
         print(f"Model output: {output}")
 
@@ -48,11 +41,11 @@ def transcribe_audio(audio_file):
         return f"Error: {str(e)}"
 
 iface = gr.Interface(
-    fn=transcribe_audio,
+    fn=transcribe_and_respond,
     inputs=gr.Audio(sources="microphone", type="filepath"),
     outputs="text",
-    title="Shuka ASR Demo",
-    description="Speak into your microphone, and the model will transcribe your speech.",
+    title="Live Transcription",
+    description="Speak into your microphone, and the model will Transcribe.",
     live=True
 )
 
51