Spaces:

helvekami
/

ShukaNote

Running on Zero

App Files Community

helvekami committed 15 days ago

Commit

1895fc7

1 Parent(s): 8c679c2

Updated Gradio App

Browse files

Files changed (1) hide show

app.py +10 -9

app.py CHANGED Viewed

@@ -6,7 +6,7 @@ import spaces
 import numpy as np
 @spaces.GPU(duration=60)
-def transcribe_and_respond(audio_file):
     try:
         pipe = transformers.pipeline(
             model='sarvamai/shuka_v1',
@@ -20,22 +20,23 @@ def transcribe_and_respond(audio_file):
         # Ensure audio is a floating-point numpy array
         audio = np.array(audio, dtype=np.float32)
-        # If audio has more than one channel, convert to mono by averaging
         if audio.ndim > 1:
             audio = np.mean(audio, axis=-1)
-        # Debug: Print audio properties
         print(f"Audio dtype: {audio.dtype}, Audio shape: {audio.shape}, Sample rate: {sr}")
         turns = [
-            {'role': 'system', 'content': 'Respond naturally and informatively.'},
             {'role': 'user', 'content': '<|audio|>'}
         ]
-        # Debug: Print initial turns
         print(f"Initial turns: {turns}")
-        # Call the model with the audio and prompt
         output = pipe({'audio': audio, 'turns': turns, 'sampling_rate': sr}, max_new_tokens=512)
         # Debug: Print the final output from the model
@@ -47,11 +48,11 @@ def transcribe_and_respond(audio_file):
         return f"Error: {str(e)}"
 iface = gr.Interface(
-    fn=transcribe_and_respond,
     inputs=gr.Audio(sources="microphone", type="filepath"),
     outputs="text",
-    title="Live Transcription and Response",
-    description="Speak into your microphone, and the model will respond naturally and informatively.",
     live=True
 )

 import numpy as np
 @spaces.GPU(duration=60)
+def transcribe_audio(audio_file):
     try:
         pipe = transformers.pipeline(
             model='sarvamai/shuka_v1',
         # Ensure audio is a floating-point numpy array
         audio = np.array(audio, dtype=np.float32)
+        # Convert multi-channel audio to mono if needed
         if audio.ndim > 1:
             audio = np.mean(audio, axis=-1)
+        # Debug: Print audio properties for troubleshooting
         print(f"Audio dtype: {audio.dtype}, Audio shape: {audio.shape}, Sample rate: {sr}")
+        # Change the conversation turns to instruct transcription
         turns = [
+            {'role': 'system', 'content': 'Please transcribe the following audio exactly.'},
             {'role': 'user', 'content': '<|audio|>'}
         ]
+        # Debug: Print the initial turns
         print(f"Initial turns: {turns}")
+        # Call the model with the audio and transcription prompt
         output = pipe({'audio': audio, 'turns': turns, 'sampling_rate': sr}, max_new_tokens=512)
         # Debug: Print the final output from the model
         return f"Error: {str(e)}"
 iface = gr.Interface(
+    fn=transcribe_audio,
     inputs=gr.Audio(sources="microphone", type="filepath"),
     outputs="text",
+    title="Shuka ASR Demo",
+    description="Speak into your microphone, and the model will transcribe your speech.",
     live=True
 )