Spaces:
Runtime error
Runtime error
Upload 4 files
Browse files- app (1).py +138 -0
- dark-banner.png +0 -0
- light-banner.png +0 -0
- requirements (1).txt +3 -0
app (1).py
ADDED
|
@@ -0,0 +1,138 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import spaces
|
| 2 |
+
import gradio as gr
|
| 3 |
+
import io
|
| 4 |
+
import os
|
| 5 |
+
import re
|
| 6 |
+
import torch
|
| 7 |
+
import torchaudio
|
| 8 |
+
from pathlib import Path
|
| 9 |
+
from whisperspeech.pipeline import Pipeline
|
| 10 |
+
|
| 11 |
+
# Development-mode switch, read once from the DEVEL environment variable.
# Environment values are always strings, so a plain truthiness test would
# treat DEVEL=0 or DEVEL=false as "on"; parse the common "off" spellings
# explicitly so the flag is a real bool.
DEVEL = os.environ.get('DEVEL', '').strip().lower() not in ('', '0', 'false', 'no', 'off')
|
| 12 |
+
|
| 13 |
+
title = """
|
| 14 |
+
<picture>
|
| 15 |
+
<source srcset="https://huggingface.co/spaces/collabora/whisperspeech/resolve/main/dark-banner.png" media="(prefers-color-scheme: dark)" />
|
| 16 |
+
<img alt="WhisperSpeech banner with Collabora and LAION logos" src="https://huggingface.co/spaces/collabora/whisperspeech/resolve/main/light-banner.png" style="width: 60%; margin: 0 auto;" />
|
| 17 |
+
</picture>
|
| 18 |
+
|
| 19 |
+
# Welcome to Collabora's WhisperSpeech
|
| 20 |
+
|
| 21 |
+
WhisperSpeech is an Open Source text-to-speech system built by Collabora and LAION by inverting Whisper.
|
| 22 |
+
The model is fully open and you can run it on your local hardware. It's like **Stable Diffusion but for speech**
|
| 23 |
+
– both powerful and easily customizable.
|
| 24 |
+
|
| 25 |
+
[You can contribute to WhisperSpeech on Github.](https://github.com/collabora/WhisperSpeech)
|
| 26 |
+
You can also join the discussion on Discord [](https://discord.gg/FANw4rHD5E)
|
| 27 |
+
|
| 28 |
+
Huge thanks to [Tonic](https://huggingface.co/Tonic) who helped build this Space for WhisperSpeech.
|
| 29 |
+
|
| 30 |
+
### How to Use It
|
| 31 |
+
|
| 32 |
+
Write you text in the box, you can use language tags (`<en>` or `<pl>`) to create multilingual speech.
|
| 33 |
+
Optionally you can upload a speech sample or give it a file URL to clone an existing voice. Check out the
|
| 34 |
+
examples at the bottom of the page for inspiration.
|
| 35 |
+
"""
|
| 36 |
+
|
| 37 |
+
footer = """
|
| 38 |
+
|
| 39 |
+
### How to use it locally
|
| 40 |
+
|
| 41 |
+
```
|
| 42 |
+
pip install -U WhisperSpeech
|
| 43 |
+
```
|
| 44 |
+
|
| 45 |
+
Afterwards:
|
| 46 |
+
|
| 47 |
+
```
|
| 48 |
+
from whisperspeech.pipeline import Pipeline
|
| 49 |
+
|
| 50 |
+
pipe = Pipeline(torch_compile=True)
|
| 51 |
+
pipe.generate_to_file("output.wav", "Hello from WhisperSpeech.")
|
| 52 |
+
```
|
| 53 |
+
"""
|
| 54 |
+
|
| 55 |
+
|
| 56 |
+
# Example rows for gr.Examples. Each row is [text, speaker_url_or_None],
# matching inputs=[text_input, url_input] in the layout; the first row's
# text is also used as the default value of the text box.
# NOTE(review): spellings such as "Lion", "Juwels" and "Jewels" are runtime
# text fed to the model — presumably phonetic on purpose; do not "correct".
text_examples = [
    ["This is the first demo of Whisper Speech, a fully open source text-to-speech model trained by Collabora and Lion on the Juwels supercomputer.", None],
    ["World War II or the Second World War was a global conflict that lasted from 1939 to 1945. The vast majority of the world's countries, including all the great powers, fought as part of two opposing military alliances: the Allies and the Axis.", "https://upload.wikimedia.org/wikipedia/commons/7/75/Winston_Churchill_-_Be_Ye_Men_of_Valour.ogg"],
    ["<pl>To jest pierwszy test wielojęzycznego <en>Whisper Speech <pl>, modelu zamieniającego tekst na mowę, który Collabora i Laion nauczyli na superkomputerze <en>Jewels.", None],
    ["<en> WhisperSpeech is an Open Source library that helps you convert text to speech. <pl>Teraz także po Polsku! <en>I think I just tried saying \"now also in Polish\", don't judge me...", None],
    # ["<de> WhisperSpeech is multi-lingual <es> y puede cambiar de idioma <hi> मध्य वाक्य में"],
    ["<pl>To jest pierwszy test naszego modelu. Pozdrawiamy serdecznie.", None],
    # ["<en> The big difference between Europe <fr> et les Etats Unis <pl> jest to, że mamy tak wiele języków <uk> тут, в Європі"]
]
|
| 65 |
+
|
| 66 |
+
def parse_multilingual_text(input_text):
    """Split text annotated with ``<lang>`` tags into per-language segments.

    A tag such as ``<pl>`` switches the current language for all text that
    follows it, until the next tag. Text before the first tag is assigned
    to ``'en'``. A ``<`` that does not start a ``<word>`` tag matches
    neither alternative of the pattern and is silently dropped.

    Args:
        input_text: The raw text, optionally containing ``<lang>`` tags.

    Returns:
        A list of ``(lang, text)`` tuples in input order, where each text
        is padded with one space on both sides. If the input contains no
        text at all (empty, or tags only), ``[("en", "")]`` is returned so
        callers always get at least one segment.
    """
    pattern = r"(?:<(\w+)>)|([^<]+)"
    cur_lang = 'en'
    segments = []
    # Each match fills exactly one of the two groups: a tag name or plain text.
    for lang, txt in re.findall(pattern, input_text):
        if lang:
            cur_lang = lang
        else:
            # add spaces to give it some time to switch languages
            segments.append((cur_lang, f" {txt} "))
    if not segments:
        return [("en", "")]
    return segments
|
| 75 |
+
|
| 76 |
+
@spaces.GPU(enable_queue=True)
def generate_audio(pipe, segments, speaker, speaker_url, cps=14):
    """Synthesize audio for language-tagged text segments.

    Args:
        pipe: Pipeline object providing ``extract_spk_emb``,
            ``default_speaker``, ``t2s``, ``s2a`` and ``vocoder``.
        segments: Non-empty sequence of ``(lang, text)`` pairs.
        speaker: A ``str``/``Path`` handed to ``pipe.extract_spk_emb``;
            any other value (e.g. ``None``) falls through to
            ``speaker_url``.
        speaker_url: Used for ``pipe.extract_spk_emb`` when ``speaker`` is
            not a string/path and this value is truthy.
        cps: Forwarded to ``pipe.t2s.generate`` as ``cps``.

    Returns:
        The result of ``pipe.vocoder.decode(...)`` moved to the CPU.
    """
    # Speaker precedence: explicit file/path, then URL, then the pipeline default.
    if isinstance(speaker, (str, Path)):
        speaker_emb = pipe.extract_spk_emb(speaker)
    elif speaker_url:
        speaker_emb = pipe.extract_spk_emb(speaker_url)
    else:
        speaker_emb = pipe.default_speaker

    # Transpose [(lang, text), ...] into two parallel lists.
    langs, texts = map(list, zip(*segments))
    print(texts, langs)

    t2s_out = pipe.t2s.generate(texts, cps=cps, lang=langs)[0]
    s2a_out = pipe.s2a.generate(t2s_out, speaker_emb.unsqueeze(0))
    decoded = pipe.vocoder.decode(s2a_out)
    return decoded.cpu()
|
| 87 |
+
|
| 88 |
+
def whisper_speech_demo(multilingual_text, speaker_audio=None, speaker_url="", cps=14):
    """Gradio callback: turn (optionally language-tagged) text into audio.

    Args:
        multilingual_text: Text to speak; may contain ``<lang>`` tags.
        speaker_audio: Optional speaker sample, forwarded to
            ``generate_audio`` as ``speaker``.
        speaker_url: Optional speaker sample URL, forwarded to
            ``generate_audio`` as ``speaker_url``.
        cps: Tempo value forwarded to ``generate_audio``.

    Returns:
        A ``(24000, samples)`` tuple, where ``samples`` is the transposed
        generated audio converted to a NumPy array (24000 is presumably the
        sample rate in Hz — the commented-out MP3 export uses the same
        value as its sample rate).

    Raises:
        gr.Error: If no text was entered (``None``, empty or
            whitespace-only).
    """
    # Reject None and whitespace-only input too, not just the empty string:
    # len(None) would raise TypeError, and blank text has nothing to speak.
    if not multilingual_text or not multilingual_text.strip():
        raise gr.Error("Please enter some text for me to speak!")

    segments = parse_multilingual_text(multilingual_text)

    audio = generate_audio(pipe, segments, speaker_audio, speaker_url, cps)

    return (24000, audio.T.numpy())
|
| 97 |
+
|
| 98 |
+
# Did not work for me in Safari:
|
| 99 |
+
# mp3 = io.BytesIO()
|
| 100 |
+
# torchaudio.save(mp3, audio, 24000, format='mp3')
|
| 101 |
+
# return mp3.getvalue()
|
| 102 |
+
|
| 103 |
+
# Build the shared WhisperSpeech pipeline once at import time; the
# torch_compile option is switched off when DEVEL is set.
pipe = Pipeline(torch_compile=not DEVEL)
# warmup will come from regenerating the examples
|
| 105 |
+
|
| 106 |
+
# Gradio UI definition.
# NOTE(review): indentation was lost in the extracted source; the nesting
# below is reconstructed from the statement order — confirm against the
# original file.
with gr.Blocks() as demo:
    gr.Markdown(title)
    with gr.Row(equal_height=True):
        # Left column: text, tempo and speaker inputs plus the submit button.
        with gr.Column(scale=2):
            text_input = gr.Textbox(label="Enter multilingual text💬📝",
                                    value=text_examples[0][0],
                                    info="You can use `<en>` for English and `<pl>` for Polish, see examples below.")
            cps = gr.Slider(value=14, minimum=10, maximum=15, step=.25,
                            label="Tempo (in characters per second)")
            with gr.Row(equal_height=True):
                speaker_input = gr.Audio(label="Upload or Record Speaker Audio (optional)🌬️💬",
                                         sources=["upload", "microphone"],
                                         type='filepath')
                url_input = gr.Textbox(label="alternatively, you can paste in an audio file URL:")
            gr.Markdown(" \n ") # fixes the bottom overflow from Audio
            generate_button = gr.Button("Try Collabora's WhisperSpeech🌟")
        # Right column: the generated audio.
        with gr.Column(scale=1):
            output_audio = gr.Audio(label="WhisperSpeech says…")

    with gr.Column():
        gr.Markdown("### Try these examples to get started !🌟🌬️")
        # NOTE(review): only two inputs are listed here, so if the callback
        # is invoked positionally the example URL arrives as
        # whisper_speech_demo's `speaker_audio` argument rather than
        # `speaker_url` — confirm this is intended.
        gr.Examples(
            examples=text_examples,
            inputs=[text_input, url_input],
            outputs=[output_audio],
            fn=whisper_speech_demo,
            cache_examples=not DEVEL,
        )

    # Input order matches whisper_speech_demo(multilingual_text, speaker_audio, speaker_url, cps).
    generate_button.click(whisper_speech_demo, inputs=[text_input, speaker_input, url_input, cps], outputs=output_audio)
    gr.Markdown(footer)
|
| 137 |
+
|
| 138 |
+
# Start the Gradio app; port 3000 is requested in DEVEL mode, otherwise
# server_port is passed as None.
demo.launch(server_port=3000 if DEVEL else None)
|
dark-banner.png
ADDED
|
light-banner.png
ADDED
|
requirements (1).txt
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
whisperspeech
|
| 2 |
+
gradio
|
| 3 |
+
spaces
|