# NOTE: the lines originally at the top of this file were web file-viewer
# residue (status text "Build error", "File size: 6,837 Bytes", a per-line
# commit-hash gutter and a 1-194 line-number gutter), not part of the program.
from pathlib import Path
import argparse
import soundfile as sf
import torch
import io
import argparse
from matcha.hifigan.config import v1
from matcha.hifigan.denoiser import Denoiser
from matcha.hifigan.env import AttrDict
from matcha.hifigan.models import Generator as HiFiGAN
from matcha.models.matcha_tts import MatchaTTS
from matcha.text import sequence_to_text, text_to_sequence
from matcha.utils.utils import intersperse
import gradio as gr
import requests
from datetime import datetime
def download_file(url, save_path, timeout=60):
    """Download ``url`` and store the response body at ``save_path``.

    Args:
        url: HTTP(S) address of the file to fetch.
        save_path: Destination file path; missing parent directories are
            created.
        timeout: Seconds passed to ``requests.get`` as its ``timeout``.

    Raises:
        requests.HTTPError: If the server answers with an error status, so an
            HTML error page is never saved in place of a checkpoint.
        requests.RequestException: On connection problems or timeouts.
    """
    print(f'---Loading from URL: {url} ---')
    target = Path(save_path)
    # The destination directory (e.g. ./checkpoints) may not exist yet.
    target.parent.mkdir(parents=True, exist_ok=True)
    # Write to a sibling temp file first so an interrupted download never
    # leaves a truncated file under the final name.
    partial = target.with_name(target.name + '.part')
    with requests.get(url, stream=True, timeout=timeout) as response:
        response.raise_for_status()
        with open(partial, 'wb') as file:
            # Stream in 1 MiB chunks instead of holding the whole body in memory.
            for chunk in response.iter_content(chunk_size=1 << 20):
                file.write(chunk)
    partial.replace(target)
# Remote locations of the acoustic-model checkpoint and the vocoder generator.
# Previously used checkpoint release, kept for reference:
# 'https://github.com/simonlobgromov/AkylAI_Matcha_Checkpoint/releases/download/Akyl-AI-TTS-v2/checkpoint_epoch.669.ckpt'
url_checkpoint = 'https://github.com/simonlobgromov/AkylAI_Matcha_Checkpoint/releases/download/LiveSpeech2025-v.1.0/checkpoint_epoch.399.ckpt'
url_generator = 'https://github.com/simonlobgromov/AkylAI_Matcha_HiFiGan/releases/download/Generator/generator_v1'

# Local paths the two files are saved under.
save_checkpoint_path = './checkpoints/checkpoint.ckpt'
save_generator_path = './checkpoints/generator'

# Both files are fetched every time the module is executed.
download_file(url_checkpoint, save_checkpoint_path)
download_file(url_generator, save_generator_path)
def load_matcha(checkpoint_path, device):
    """Load a MatchaTTS model from ``checkpoint_path`` and put it in eval mode.

    Args:
        checkpoint_path: Path of the checkpoint file to restore.
        device: Passed as ``map_location`` when loading the checkpoint.

    Returns:
        The restored ``MatchaTTS`` instance after ``eval()`` has been called.
    """
    matcha = MatchaTTS.load_from_checkpoint(checkpoint_path, map_location=device)
    matcha.eval()
    return matcha
def load_hifigan(checkpoint_path, device):
    """Build a HiFi-GAN generator from the ``v1`` config and load its weights.

    Args:
        checkpoint_path: File readable by ``torch.load`` that contains a
            ``"generator"`` entry holding the state dict.
        device: Device the generator is moved to and the weights are mapped to.

    Returns:
        The generator in eval mode with weight normalisation removed.
    """
    config = AttrDict(v1)
    generator = HiFiGAN(config).to(device)
    # NOTE(review): this file is downloaded from a URL at startup and
    # torch.load may unpickle arbitrary objects depending on the torch
    # version/defaults - consider weights_only=True; confirm compatibility.
    state = torch.load(checkpoint_path, map_location=device)
    generator.load_state_dict(state["generator"])
    generator.eval()
    generator.remove_weight_norm()
    return generator
def load_vocoder(checkpoint_path, device):
    """Load the HiFi-GAN vocoder and create a matching denoiser.

    Args:
        checkpoint_path: Generator checkpoint handed to ``load_hifigan``.
        device: Device handed to ``load_hifigan``.

    Returns:
        A ``(vocoder, denoiser)`` tuple; the denoiser is a ``Denoiser`` built
        around the vocoder with ``mode="zeros"``.
    """
    hifigan = load_hifigan(checkpoint_path, device)
    return hifigan, Denoiser(hifigan, mode="zeros")
def process_text(i: int, text: str, device: torch.device):
    """Convert ``text`` into the tensors the synthesiser consumes.

    Args:
        i: Index used only to tag the log lines.
        text: Raw input text.
        device: Device the created tensors are placed on.

    Returns:
        A dict with keys ``"x_orig"`` (the input text), ``"x"`` (long tensor
        of symbol ids with a leading batch dimension of 1), ``"x_lengths"``
        (one-element long tensor holding the sequence length) and
        ``"x_phones"`` (the symbol string, with one hard-coded stress
        correction applied).
    """
    print(f"[{i}] - Input text: {text}")

    # Symbol ids from the "kyrgyz_cleaners" pipeline, passed through
    # intersperse(..., 0) (presumably inserting a 0 between symbols - confirm).
    symbol_ids = intersperse(text_to_sequence(text, ["kyrgyz_cleaners"]), 0)
    x = torch.tensor(symbol_ids, dtype=torch.long, device=device)[None]
    x_lengths = torch.tensor([x.shape[-1]], dtype=torch.long, device=device)

    x_phones = sequence_to_text(x.squeeze(0).tolist())
    print(f"[{i}] - Phonetised text: {x_phones}")

    # The replacement only changes the returned phone string; the id tensor
    # ``x`` is left as produced above.
    corrected_phones = x_phones.replace('_q_ˌ_o_l_o_n_q_ˈ_ɑ_', '_q_ˌ_o_l_ˈ_o_n_q_ɑ_')
    return {
        "x_orig": text,
        "x": x,
        "x_lengths": x_lengths,
        "x_phones": corrected_phones,
    }
def to_waveform(mel, vocoder, denoiser=None):
    """Turn a mel-spectrogram into a waveform tensor on the CPU.

    Args:
        mel: Input passed directly to ``vocoder``.
        vocoder: Callable producing audio from ``mel``; its output is clamped
            to ``[-1, 1]``.
        denoiser: Optional callable invoked as
            ``denoiser(audio.squeeze(), strength=0.00025)``.

    Returns:
        The (optionally denoised) audio, moved to the CPU with all
        singleton dimensions squeezed out.
    """
    audio = vocoder(mel).clamp(-1, 1)
    if denoiser is not None:
        audio = denoiser(audio.squeeze(), strength=0.00025)
    return audio.cpu().squeeze()
@torch.inference_mode()
def process_text_gradio(text):
    """Run ``process_text`` on the module-level ``device`` for the UI path.

    Returns:
        A ``(phones, x, x_lengths)`` tuple, where ``phones`` is every second
        character of the phone string starting at index 1 (presumably this
        drops the interspersed separator symbols - confirm).
    """
    processed = process_text(1, text, device)
    phones = processed["x_phones"][1::2]
    return phones, processed["x"], processed["x_lengths"]
@torch.inference_mode()
def synthesise_mel(text, text_length, n_timesteps, temperature, length_scale, spk=-1):
    """Synthesise audio for already-tokenised text using the global models.

    Args:
        text: Symbol-id tensor forwarded to ``model.synthesise``.
        text_length: Length tensor forwarded to ``model.synthesise``.
        n_timesteps: Forwarded as ``n_timesteps``.
        temperature: Forwarded as ``temperature``.
        length_scale: Forwarded as ``length_scale``.
        spk: Speaker id; any negative value means "no speaker" (``None``).

    Returns:
        The waveform as a NumPy array. The waveform tensor is also stored in
        the model's output dict under ``"waveform"``.
    """
    if spk >= 0:
        speaker = torch.tensor([spk], device=device, dtype=torch.long)
    else:
        speaker = None

    output = model.synthesise(
        text,
        text_length,
        n_timesteps=n_timesteps,
        temperature=temperature,
        spks=speaker,
        length_scale=length_scale,
    )
    waveform = to_waveform(output["mel"], vocoder, denoiser)
    output["waveform"] = waveform
    return output["waveform"].numpy()
def get_inference(text, n_timesteps=20, mel_temp = 0.667, length_scale=0.8, spk=-1):
    """Convert raw ``text`` to a waveform array.

    Args:
        text: Raw input text.
        n_timesteps: Forwarded to ``synthesise_mel``.
        mel_temp: Forwarded to ``synthesise_mel`` as the temperature.
        length_scale: Forwarded to ``synthesise_mel``.
        spk: Speaker id forwarded to ``synthesise_mel``.

    Returns:
        Whatever ``synthesise_mel`` returns for the processed text.
    """
    _, token_ids, token_lengths = process_text_gradio(text)
    # Synthesise exactly once: the previous version ran the full synthesis a
    # second time just to print the result type, doubling the work per request.
    waveform = synthesise_mel(token_ids, token_lengths, n_timesteps, mel_temp, length_scale, spk)
    print(type(waveform))
    return waveform
# Local files the models are loaded from.
model_path = './checkpoints/checkpoint.ckpt'
vocoder_path = './checkpoints/generator'

# Use the GPU when one is available, otherwise fall back to the CPU.
device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")

# Module-level model objects used by the synthesis functions.
model = load_matcha(model_path, device)
vocoder, denoiser = load_vocoder(vocoder_path, device)
def gen_tts(text, speaking_rate):
    """Gradio callback: synthesise ``text`` and return it for ``gr.Audio``.

    Args:
        text: Text entered by the user.
        speaking_rate: Slider value, passed to ``get_inference`` as
            ``length_scale``.

    Returns:
        A ``(22050, waveform)`` tuple (22050 is presumably the sample rate in
        Hz the vocoder produces - confirm).

    Raises:
        gr.Error: If synthesis fails, so the failure is shown in the UI
            instead of being silently swallowed and returning ``None``.
    """
    try:
        waveform = get_inference(text=text, length_scale=speaking_rate)
    except Exception as exc:
        # Keep a trace in the server log and surface the problem to the user.
        print(f"TTS generation failed: {exc!r}")
        raise gr.Error(f"Audio generation failed: {exc}") from exc
    return 22050, waveform
# Text pre-filled in the input box.
default_text = "Баарыңарга салам, менин атым Акылай."

# Custom stylesheet handed to gr.Blocks. A stray closing brace that used to sit
# just before the `img` rule has been removed: it left the braces unbalanced
# and made the following `img` rule invalid for CSS parsers.
css = """
#share-btn-container {
display: flex;
padding-left: 0.5rem !important;
padding-right: 0.5rem !important;
background-color: #000000;
justify-content: center;
align-items: center;
border-radius: 9999px !important;
width: 13rem;
margin-top: 10px;
margin-left: auto;
flex: unset !important;
}
#share-btn {
all: initial;
color: #ffffff;
font-weight: 600;
cursor: pointer;
font-family: 'IBM Plex Sans', sans-serif;
margin-left: 0.5rem !important;
padding-top: 0.25rem !important;
padding-bottom: 0.25rem !important;
right:0;
}
#share-btn * {
all: unset !important;
}
#share-btn-container div:nth-child(-n+2){
width: auto !important;
min-height: 0px !important;
}
#share-btn-container .wrap {
display: none !important;
}
img {
display: block;
margin: 0 auto;
width: 132px !important;
height: 132px !important;
}
"""
# Build the Gradio UI: a title, a logo image, and a two-column row with the
# inputs on the left and the generated audio on the right.
with gr.Blocks(css=css) as block:
    gr.HTML(
        """
<div style="text-align: center; max-width: 700px; margin: 0 auto;">
<div
style="
display: inline-flex; align-items: center; gap: 0.8rem; font-size: 1.75rem;
"
>
<h1 style="font-weight: 900; margin-bottom: 7px; line-height: normal;">
Akyl-AI TTS
</h1>
</div>
</div>
"""
    )
    with gr.Row():
        image_path = "./photo_2024-04-07_15-59-52.png"
        gr.Image(image_path, label=None, width=132, height=132, show_label=False)
    with gr.Row():
        with gr.Column():
            input_text = gr.Textbox(label="Input Text", lines=2, value=default_text, elem_id="input_text")
            speaking_rate = gr.Slider(label='Speaking rate', minimum=0.5, maximum=1, step=0.05, value=0.8, interactive=True, show_label=True, elem_id="speaking_rate")
            run_button = gr.Button("Generate Audio", variant="primary")
        with gr.Column():
            audio_out = gr.Audio(label="AkylAi-TTS", type="numpy", elem_id="audio_out")
    inputs = [input_text, speaking_rate]
    outputs = [audio_out]
    # The event is registered inside the Blocks context; gen_tts receives the
    # textbox and slider values and its result feeds the audio component.
    run_button.click(fn=gen_tts, inputs=inputs, outputs=outputs, queue=True)

# A stray trailing "|" after launch() (a syntax error) has been removed.
block.queue()
block.launch()