NightPrince committed on
Commit
0bbfec6
·
verified ·
1 Parent(s): 2726627

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +10 -7
app.py CHANGED
@@ -11,15 +11,19 @@ processor = Wav2Vec2Processor.from_pretrained(model_name)
11
 
12
  # Function to transcribe audio using the model
13
  def transcribe(audio):
 
 
 
14
  # Resample the audio to 16kHz if necessary
15
- if audio.ndim > 1: # If audio is stereo
16
- audio = audio.mean(axis=1) # Convert to mono
17
 
18
- # Resample audio to 16kHz if it's not already
19
- audio = librosa.resample(audio, orig_sr=audio.shape[0] / len(audio), target_sr=16000)
 
20
 
21
  # Process the audio to match the model's input format
22
- inputs = processor(audio, return_tensors="pt", sampling_rate=16000)
23
 
24
  # Get the model's predictions
25
  with torch.no_grad():
@@ -35,8 +39,7 @@ def transcribe(audio):
35
  interface = gr.Interface(
36
  fn=transcribe,
37
  inputs=gr.Audio(type="numpy"), # Take the audio input as numpy array
38
- outputs="text", # Output transcribed text
39
- live=True # Optional: live transcribing as you speak
40
  )
41
 
42
  # Launch the interface
 
11
 
12
  # Function to transcribe audio using the model
13
  def transcribe(audio):
14
+ # Extract audio data from the tuple (audio, sample_rate)
15
+ audio_data, sample_rate = audio
16
+
17
  # Resample the audio to 16kHz if necessary
18
+ if audio_data.ndim > 1: # If audio is stereo
19
+ audio_data = audio_data.mean(axis=1) # Convert to mono
20
 
21
+ # Ensure the audio is resampled to 16kHz if it's not already
22
+ if sample_rate != 16000:
23
+ audio_data = librosa.resample(audio_data, orig_sr=sample_rate, target_sr=16000)
24
 
25
  # Process the audio to match the model's input format
26
+ inputs = processor(audio_data, return_tensors="pt", sampling_rate=16000)
27
 
28
  # Get the model's predictions
29
  with torch.no_grad():
 
39
  interface = gr.Interface(
40
  fn=transcribe,
41
  inputs=gr.Audio(type="numpy"), # Take the audio input as numpy array
42
+ outputs="text" # Output transcribed text
 
43
  )
44
 
45
  # Launch the interface