Vaibhav Srivastav committed on
Commit
0b28f04
·
1 Parent(s): e134a02
Files changed (1) hide show
  1. app.py +4 -8
app.py CHANGED
@@ -35,7 +35,7 @@ speaker_embeddings = sorted([key for key in processor.speaker_embeddings.keys()
35
 
36
  SAMPLE_RATE = 24_000
37
 
38
- vocos = Vocos.from_pretrained("hubertsiuzdak/vocos-encodec-24khz-v2").to(device)
39
 
40
  # import model
41
  if device == "cpu":
@@ -45,7 +45,7 @@ else:
45
  bark = bark.to_bettertransformer()
46
 
47
 
48
- # streaming inference
49
  def generate_audio(text, voice_preset = None, lag = 0):
50
  if voice_preset not in speaker_embeddings:
51
  voice_preset = None
@@ -63,13 +63,10 @@ def generate_audio(text, voice_preset = None, lag = 0):
63
  print("Fine tokens generated")
64
 
65
  with torch.no_grad():
66
-
67
- encodec_waveform = bark.codec_decode(fine_output)
68
-
69
  features = vocos.codes_to_features(fine_output.transpose(0,1))
70
  vocos_waveform = vocos.decode(features, bandwidth_id=torch.tensor([2], device=device))
71
 
72
- return (SAMPLE_RATE, encodec_waveform.cpu().squeeze().numpy()), (SAMPLE_RATE, vocos_waveform.cpu().squeeze().numpy())
73
 
74
 
75
  # Gradio blocks demo
@@ -90,9 +87,8 @@ with gr.Blocks() as demo_blocks:
90
  btn = gr.Button("Bark with Vocos TTS")
91
 
92
  with gr.Row():
93
- out_audio_encodec = gr.Audio(type="numpy", autoplay=False, label="original output", show_label=True)
94
  out_audio_vocos = gr.Audio(type="numpy", autoplay=False, label="vocos enhanced output", show_label=True)
95
 
96
- btn.click(generate_audio, [inp_text, dd], [out_audio_encodec, out_audio_vocos])
97
 
98
  demo_blocks.queue().launch(debug=True)
 
35
 
36
  SAMPLE_RATE = 24_000
37
 
38
+ vocos = Vocos.from_pretrained("charactr/vocos-encodec-24khz").to(device)
39
 
40
  # import model
41
  if device == "cpu":
 
45
  bark = bark.to_bettertransformer()
46
 
47
 
48
+ # Inference
49
  def generate_audio(text, voice_preset = None, lag = 0):
50
  if voice_preset not in speaker_embeddings:
51
  voice_preset = None
 
63
  print("Fine tokens generated")
64
 
65
  with torch.no_grad():
 
 
 
66
  features = vocos.codes_to_features(fine_output.transpose(0,1))
67
  vocos_waveform = vocos.decode(features, bandwidth_id=torch.tensor([2], device=device))
68
 
69
+ return (SAMPLE_RATE, vocos_waveform.cpu().squeeze().numpy())
70
 
71
 
72
  # Gradio blocks demo
 
87
  btn = gr.Button("Bark with Vocos TTS")
88
 
89
  with gr.Row():
 
90
  out_audio_vocos = gr.Audio(type="numpy", autoplay=False, label="vocos enhanced output", show_label=True)
91
 
92
+ btn.click(generate_audio, [inp_text, dd], [out_audio_vocos])
93
 
94
  demo_blocks.queue().launch(debug=True)