helvekami committed
Commit 1895fc7 · Parent: 8c679c2

Updated Gradio App

Files changed (1)
  1. app.py +10 -9
app.py CHANGED
@@ -6,7 +6,7 @@ import spaces
 import numpy as np
 
 @spaces.GPU(duration=60)
-def transcribe_and_respond(audio_file):
+def transcribe_audio(audio_file):
     try:
         pipe = transformers.pipeline(
             model='sarvamai/shuka_v1',
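
The hunk cuts off inside the transformers.pipeline(...) call, so the arguments after model='sarvamai/shuka_v1' are not visible in this diff. A minimal sketch of how this pipeline is typically constructed, with the remaining arguments assumed from the sarvamai/shuka_v1 model card rather than taken from this commit:

import transformers

# Sketch only: everything after model=... is an assumption based on the
# sarvamai/shuka_v1 model card, not code visible in this diff.
pipe = transformers.pipeline(
    model='sarvamai/shuka_v1',
    trust_remote_code=True,   # the model ships custom pipeline code on the Hub
    device=0,                 # use the GPU granted by @spaces.GPU
    torch_dtype='bfloat16',   # assumed half-precision setting
)

Building the pipeline inside the @spaces.GPU(duration=60) function means it is reconstructed on every request; that costs startup time but fits ZeroGPU Spaces, where a GPU is only attached while the decorated function runs.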
@@ -20,22 +20,23 @@ def transcribe_and_respond(audio_file):
 
         # Ensure audio is a floating-point numpy array
         audio = np.array(audio, dtype=np.float32)
-        # If audio has more than one channel, convert to mono by averaging
+        # Convert multi-channel audio to mono if needed
         if audio.ndim > 1:
             audio = np.mean(audio, axis=-1)
 
-        # Debug: Print audio properties
+        # Debug: Print audio properties for troubleshooting
         print(f"Audio dtype: {audio.dtype}, Audio shape: {audio.shape}, Sample rate: {sr}")
 
+        # Change the conversation turns to instruct transcription
         turns = [
-            {'role': 'system', 'content': 'Respond naturally and informatively.'},
+            {'role': 'system', 'content': 'Please transcribe the following audio exactly.'},
            {'role': 'user', 'content': '<|audio|>'}
        ]
 
-        # Debug: Print initial turns
+        # Debug: Print the initial turns
        print(f"Initial turns: {turns}")
 
-        # Call the model with the audio and prompt
+        # Call the model with the audio and transcription prompt
        output = pipe({'audio': audio, 'turns': turns, 'sampling_rate': sr}, max_new_tokens=512)
 
        # Debug: Print the final output from the model
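
The preprocessing in this hunk is plain NumPy and can be sanity-checked in isolation. A small sketch with a synthetic stereo clip; the (samples, channels) layout is an assumption, since the hunks never show how audio and sr are produced:

import numpy as np

sr = 16000
# Hypothetical one-second stereo clip, shaped (samples, channels).
stereo = np.random.uniform(-1.0, 1.0, size=(sr, 2))

audio = np.array(stereo, dtype=np.float32)
if audio.ndim > 1:
    audio = np.mean(audio, axis=-1)  # down-mix by averaging channels

print(audio.dtype, audio.shape)  # float32 (16000,)

Note that only the system turn changes in this commit: the user turn keeps the <|audio|> placeholder, which is where the pipeline splices in the audio features.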
@@ -47,11 +48,11 @@ def transcribe_and_respond(audio_file):
         return f"Error: {str(e)}"
 
 iface = gr.Interface(
-    fn=transcribe_and_respond,
+    fn=transcribe_audio,
     inputs=gr.Audio(sources="microphone", type="filepath"),
     outputs="text",
-    title="Live Transcription and Response",
-    description="Speak into your microphone, and the model will respond naturally and informatively.",
+    title="Shuka ASR Demo",
+    description="Speak into your microphone, and the model will transcribe your speech.",
     live=True
 )
 
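Because the interface declares gr.Audio(sources="microphone", type="filepath"), transcribe_audio receives a path to a temporary file, and the code that turns that path into audio and sr sits in lines these hunks do not show. A hypothetical loader consistent with the rest of the function; librosa and the 16 kHz target are assumptions, not part of the commit:

import librosa  # assumption: the commit's actual loading code is not shown

def load_audio(audio_file: str):
    # librosa.load returns a mono float32 array plus the sampling rate;
    # sr=16000 resamples to a rate common for speech models (an assumption).
    audio, sr = librosa.load(audio_file, sr=16000)
    return audio, sr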
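
Putting the pieces together outside Gradio, a compact end-to-end sketch of the updated flow. The turns list and the pipe(...) call are taken directly from the diff; load_audio and the pipeline construction are the hypothetical sketches above:

audio, sr = load_audio("sample.wav")  # hypothetical helper from above

turns = [
    {'role': 'system', 'content': 'Please transcribe the following audio exactly.'},
    {'role': 'user', 'content': '<|audio|>'}
]

output = pipe({'audio': audio, 'turns': turns, 'sampling_rate': sr},
              max_new_tokens=512)
print(output)

With live=True, Gradio re-runs the function whenever the recorded input changes, so each new microphone take produces a fresh transcription.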