helvekami committed on
Commit
fbc6758
·
1 Parent(s): e2f65f6

Updated Gradio App

Browse files
Files changed (1) hide show
  1. app.py +30 -16
app.py CHANGED
@@ -16,40 +16,54 @@ def process_audio(audio):
16
  Processes the input audio and returns a text response generated by the Shuka model.
17
  """
18
  if audio is None:
19
- return "No audio provided."
20
-
21
- # Gradio returns a tuple (sample_rate, numpy_array)
22
- sample_rate, audio_data = audio
23
 
 
 
 
 
 
 
 
 
 
24
  # Resample to 16000 Hz if necessary
25
  if sample_rate != 16000:
26
- audio_data = librosa.resample(audio_data, orig_sr=sample_rate, target_sr=16000)
27
- sample_rate = 16000
28
-
29
- # Define conversation turns with a system prompt and a user prompt that signals audio input
 
 
 
30
  turns = [
31
  {'role': 'system', 'content': 'Respond naturally and informatively.'},
32
  {'role': 'user', 'content': '<|audio|>'}
33
  ]
34
-
35
- # Run the pipeline with the audio input and conversation context
36
- result = pipe({'audio': audio_data, 'turns': turns, 'sampling_rate': sample_rate}, max_new_tokens=512)
37
 
38
- # Extract the generated text response
 
 
 
 
 
39
  if isinstance(result, list) and len(result) > 0:
40
  response = result[0].get('generated_text', '')
41
  else:
42
  response = str(result)
 
43
  return response
44
 
45
- # Create the Gradio interface without the 'source' parameter.
 
46
  iface = gr.Interface(
47
  fn=process_audio,
48
- inputs=gr.Audio(type="numpy"),
49
  outputs="text",
50
  title="Sarvam AI Shuka Voice Demo",
51
- description="Upload a voice note and get a response using Sarvam AI's Shuka model."
52
  )
53
 
54
  if __name__ == "__main__":
55
- iface.launch()
 
 
16
  Processes the input audio and returns a text response generated by the Shuka model.
17
  """
18
  if audio is None:
19
+ return "No audio provided. Please upload or record an audio file."
 
 
 
20
 
21
+ try:
22
+ # Gradio returns a tuple: (sample_rate, numpy_array)
23
+ sample_rate, audio_data = audio
24
+ except Exception as e:
25
+ return f"Error processing audio input: {e}"
26
+
27
+ if audio_data is None or len(audio_data) == 0:
28
+ return "Audio data is empty. Please try again with a valid audio file."
29
+
30
  # Resample to 16000 Hz if necessary
31
  if sample_rate != 16000:
32
+ try:
33
+ audio_data = librosa.resample(audio_data, orig_sr=sample_rate, target_sr=16000)
34
+ sample_rate = 16000
35
+ except Exception as e:
36
+ return f"Error during resampling: {e}"
37
+
38
+ # Define conversation turns for the model
39
  turns = [
40
  {'role': 'system', 'content': 'Respond naturally and informatively.'},
41
  {'role': 'user', 'content': '<|audio|>'}
42
  ]
 
 
 
43
 
44
+ try:
45
+ result = pipe({'audio': audio_data, 'turns': turns, 'sampling_rate': sample_rate}, max_new_tokens=512)
46
+ except Exception as e:
47
+ return f"Error during model processing: {e}"
48
+
49
+ # Extract generated text
50
  if isinstance(result, list) and len(result) > 0:
51
  response = result[0].get('generated_text', '')
52
  else:
53
  response = str(result)
54
+
55
  return response
56
 
57
+ # Create the Gradio interface.
58
+ # To record audio directly, enable the microphone on the Audio component: use source="microphone" on Gradio 3.x, or sources=["microphone", "upload"] on Gradio 4 and later (where "source" was replaced by "sources").
59
  iface = gr.Interface(
60
  fn=process_audio,
61
+ inputs=gr.Audio(type="numpy"), # using file upload input for audio
62
  outputs="text",
63
  title="Sarvam AI Shuka Voice Demo",
64
+ description="Upload an audio file and get a response using Sarvam AI's Shuka model."
65
  )
66
 
67
  if __name__ == "__main__":
68
+ # If port 7860 is in use, you can specify another port (here we use 7861)
69
+ iface.launch(server_port=7861)