chinmaydan committed
Commit 666f810 · 1 Parent(s): a5f2574

tried using app.py similar to https://huggingface.co/spaces/openai/whisper/blob/main/app.py

Files changed (1): app.py +18 -40
app.py CHANGED
@@ -1,48 +1,26 @@
+import os
+os.system("pip install git+https://github.com/openai/whisper.git")
 import gradio as gr
-import librosa
-import torch
+import whisper
 
 from transformers import WhisperProcessor, WhisperForConditionalGeneration
 
-processor = WhisperProcessor.from_pretrained("openai/whisper-medium")
-model = WhisperForConditionalGeneration.from_pretrained("openai/whisper-medium")
+model = whisper.load_model("small")
 
-model.config.forced_decoder_ids = None
 
-def process_audio(sampling_rate, waveform):
-    # convert from int16 to floating point
-    waveform = waveform / 32678.0
-
-    # convert to mono if stereo
-    if len(waveform.shape) > 1:
-        waveform = librosa.to_mono(waveform.T)
-
-    # resample to 16 kHz if necessary
-    if sampling_rate != 16000:
-        waveform = librosa.resample(waveform, orig_sr=sampling_rate, target_sr=16000)
-
-    # limit to 30 seconds
-    waveform = waveform[:16000*30]
-
-    # make PyTorch tensor
-    waveform = torch.tensor(waveform)
-    return waveform
-
-def predict(audio, mic_audio=None):
-    # audio = tuple (sample_rate, frames) or (sample_rate, (frames, channels))
-    if mic_audio is not None:
-        sampling_rate, waveform = mic_audio
-    elif audio is not None:
-        sampling_rate, waveform = audio
-    else:
-        return "(please provide audio)"
-
-    waveform = process_audio(sampling_rate, waveform)
-    input_features = processor(waveform, sampling_rate=16000, return_tensors="pt").input_features
-    predicted_ids = model.generate(input_features, max_length=400)
-    transcription = processor.batch_decode(predicted_ids, skip_special_tokens=True)
-    print(transcription)
-    return transcription[0]
+def inference(audio):
+    audio = whisper.load_audio(audio)
+    audio = whisper.pad_or_trim(audio)
+
+    mel = whisper.log_mel_spectrogram(audio).to(model.device)
+
+    _, probs = model.detect_language(mel)
+
+    options = whisper.DecodingOptions(fp16 = False)
+    result = whisper.decode(model, mel, options)
+
+    print(result.text)
+    return result.text, gr.update(visible=True), gr.update(visible=True), gr.update(visible=True)
 
 
 title = "Demo for Whisper -> Something -> XLS-R"
@@ -53,7 +31,7 @@ being passed into the model. The output is the text transcription of the audio.
 """
 
 gr.Interface(
-    fn=predict,
+    fn=inference,
     inputs=[
         gr.Audio(label="Upload Speech", source="upload", type="numpy"),
         gr.Audio(label="Record Speech", source="microphone", type="numpy"),