helvekami committed
Commit 0a3e7f6 · 1 Parent(s): a0b460e

Updated Gradio App

Files changed (1)
  app.py  +18 -11
app.py CHANGED
@@ -15,26 +15,33 @@ def transcribe_and_respond(audio_file):
             torch_dtype=torch.bfloat16
         )
 
-        # Load the audio file
+        # Load the audio file at 16kHz
         audio, sr = librosa.load(audio_file, sr=16000)
-
-        # Print audio properties for debugging
+
+        # Ensure audio is a contiguous floating-point numpy array
+        audio = np.ascontiguousarray(audio, dtype=np.float32)
+
+        # If audio has more than one channel, convert to mono by averaging the channels
+        if audio.ndim > 1:
+            audio = np.mean(audio, axis=-1)
+
+        # Debug: Print audio properties
         print(f"Audio dtype: {audio.dtype}, Audio shape: {audio.shape}, Sample rate: {sr}")
-
+
         turns = [
-            {'role': 'system', 'content': 'Print text.'},
+            {'role': 'system', 'content': 'Please transcribe the following audio exactly.'},
             {'role': 'user', 'content': '<|audio|>'}
         ]
-
+
         # Debug: Print the initial turns
         print(f"Initial turns: {turns}")
-
+
         # Call the model with the audio and prompt
         output = pipe({'audio': audio, 'turns': turns, 'sampling_rate': sr}, max_new_tokens=512)
-
+
         # Debug: Print the final output from the model
         print(f"Model output: {output}")
-
+
         return output
 
     except Exception as e:
@@ -44,8 +51,8 @@ iface = gr.Interface(
     fn=transcribe_and_respond,
     inputs=gr.Audio(sources="microphone", type="filepath"),
     outputs="text",
-    title="Live Transcription",
-    description="Speak into your microphone, and the model will Transcribe.",
+    title="Live Transcription and Response",
+    description="Speak into your microphone, and the model will transcribe your speech.",
     live=True
 )
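The preprocessing this commit adds can be exercised on its own. Below is a minimal sketch, assuming a placeholder input file sample.wav (not part of the commit); note that librosa.load defaults to mono=True and returns float32, so the dtype and channel guards are defensive rather than strictly required:

import librosa
import numpy as np

# Placeholder input; any audio file librosa can decode works here.
audio, sr = librosa.load("sample.wav", sr=16000)

# Force a contiguous float32 buffer, mirroring the commit's normalization.
audio = np.ascontiguousarray(audio, dtype=np.float32)

# Collapse multi-channel audio to mono by averaging the channels.
# librosa.load(mono=True) already does this, so the branch rarely fires.
if audio.ndim > 1:
    audio = np.mean(audio, axis=-1)

print(f"Audio dtype: {audio.dtype}, Audio shape: {audio.shape}, Sample rate: {sr}")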
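For context, both hunks assume a pipe and an iface defined elsewhere in app.py. The sketch below reconstructs that scaffolding hypothetically: the model id fixie-ai/ultravox-v0_4 is an assumption inferred from the '<|audio|>' placeholder and turns-style input (an Ultravox-like custom transformers pipeline), not something this diff shows, and iface.launch() is the conventional entry point for a Gradio Space:

import gradio as gr
import torch
import transformers

# Assumption: an Ultravox-style speech model exposed through a custom
# transformers pipeline; the real model id is not visible in this diff.
pipe = transformers.pipeline(
    model="fixie-ai/ultravox-v0_4",
    trust_remote_code=True,
    torch_dtype=torch.bfloat16,
)

# transcribe_and_respond is the function patched in the hunks above.
iface = gr.Interface(
    fn=transcribe_and_respond,
    inputs=gr.Audio(sources="microphone", type="filepath"),
    outputs="text",
    title="Live Transcription and Response",
    description="Speak into your microphone, and the model will transcribe your speech.",
    live=True,
)

# Conventional entry point; not part of the shown hunks.
iface.launch()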