helvekami committed
Commit 8c679c2 · 1 Parent(s): 5f13772

Updated Gradio App

Files changed (1)
app.py +46 -63
app.py CHANGED
@@ -1,76 +1,59 @@
- import gradio as gr
  import transformers
+ import gradio as gr
  import librosa
  import torch
+ import spaces
  import numpy as np

- # Load the Shuka model pipeline.
- pipe = transformers.pipeline(
-     model="sarvamai/shuka_v1",
-     trust_remote_code=True,
-     device=0 if torch.cuda.is_available() else -1,
-     torch_dtype=torch.bfloat16 if torch.cuda.is_available() else None
- )
-
- def process_audio(audio):
-     """
-     Processes the input audio and returns a text response generated by the Shuka model.
-     """
-     if audio is None:
-         return "No audio provided. Please upload or record an audio file."
-
-     try:
-         # Gradio returns a tuple: (sample_rate, audio_data)
-         sample_rate, audio_data = audio
-     except Exception as e:
-         return f"Error processing audio input: {e}"
-
-     if audio_data is None or len(audio_data) == 0:
-         return "Audio data is empty. Please try again with a valid audio file."
-
-     # Force conversion of audio data to a floating-point numpy array.
-     audio_data = np.array(audio_data, dtype=np.float32)
-
-     # If the audio data is multi-dimensional, squeeze it to 1D.
-     if audio_data.ndim > 1:
-         audio_data = np.squeeze(audio_data)
-
-     # Resample to 16000 Hz if necessary.
-     if sample_rate != 16000:
-         try:
-             audio_data = librosa.resample(audio_data, orig_sr=sample_rate, target_sr=16000)
-             sample_rate = 16000
-         except Exception as e:
-             return f"Error during resampling: {e}"
-
-     # Define conversation turns for the model.
-     turns = [
-         {'role': 'system', 'content': 'Respond naturally and informatively.'},
-         {'role': 'user', 'content': '<|audio|>'}
-     ]
-
+ @spaces.GPU(duration=60)
+ def transcribe_and_respond(audio_file):
      try:
-         result = pipe({'audio': audio_data, 'turns': turns, 'sampling_rate': sample_rate}, max_new_tokens=512)
+         pipe = transformers.pipeline(
+             model='sarvamai/shuka_v1',
+             trust_remote_code=True,
+             device=0,
+             torch_dtype=torch.bfloat16
+         )
+
+         # Load the audio file at 16kHz
+         audio, sr = librosa.load(audio_file, sr=16000)
+
+         # Ensure audio is a floating-point numpy array
+         audio = np.array(audio, dtype=np.float32)
+         # If audio has more than one channel, convert to mono by averaging
+         if audio.ndim > 1:
+             audio = np.mean(audio, axis=-1)
+
+         # Debug: Print audio properties
+         print(f"Audio dtype: {audio.dtype}, Audio shape: {audio.shape}, Sample rate: {sr}")
+
+         turns = [
+             {'role': 'system', 'content': 'Respond naturally and informatively.'},
+             {'role': 'user', 'content': '<|audio|>'}
+         ]
+
+         # Debug: Print initial turns
+         print(f"Initial turns: {turns}")
+
+         # Call the model with the audio and prompt
+         output = pipe({'audio': audio, 'turns': turns, 'sampling_rate': sr}, max_new_tokens=512)
+
+         # Debug: Print the final output from the model
+         print(f"Model output: {output}")
+
+         return output
+
      except Exception as e:
-         return f"Error during model processing: {e}"
-
-     # Extract the generated text response.
-     if isinstance(result, list) and len(result) > 0:
-         response = result[0].get('generated_text', '')
-     else:
-         response = str(result)
-
-     return response
+         return f"Error: {str(e)}"

- # Create the Gradio interface.
  iface = gr.Interface(
-     fn=process_audio,
-     inputs=gr.Audio(type="numpy"),  # Use file upload for audio input.
+     fn=transcribe_and_respond,
+     inputs=gr.Audio(sources="microphone", type="filepath"),
      outputs="text",
-     title="Sarvam AI Shuka Voice Demo",
-     description="Upload an audio file and get a response using Sarvam AI's Shuka model."
+     title="Live Transcription and Response",
+     description="Speak into your microphone, and the model will respond naturally and informatively.",
+     live=True
  )

  if __name__ == "__main__":
-     # Launch the app with share=True to create a public link.
-     iface.launch(share=True)
+     iface.launch()
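
For a quick local sanity check of the new handler outside the Space, a minimal sketch like the following should work. It assumes app.py is importable, a short sample.wav exists in the working directory, a CUDA device is available for device=0, and the spaces package is installed (its @spaces.GPU decorator acts as a passthrough when not running on ZeroGPU hardware):

# Hypothetical local smoke test; not part of this commit.
# Assumes a sample.wav in the working directory.
from app import transcribe_and_respond

response = transcribe_and_respond("sample.wav")  # path input, matching type="filepath"
print(response)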
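Note that the new version builds the transformers pipeline inside transcribe_and_respond, so the Shuka weights are reloaded on every request; this keeps all CUDA work inside the @spaces.GPU-decorated function, which ZeroGPU requires. If reload latency becomes a problem, one possible variant caches the pipeline at module level (a sketch, not part of this commit; whether a CUDA-resident pipeline can safely persist across ZeroGPU allocations should be verified):

_pipe = None  # hypothetical module-level cache; not part of this commit

def get_pipe():
    # Build the Shuka pipeline once and reuse it on later calls.
    global _pipe
    if _pipe is None:
        _pipe = transformers.pipeline(
            model='sarvamai/shuka_v1',
            trust_remote_code=True,
            device=0,
            torch_dtype=torch.bfloat16
        )
    return _pipe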