# NOTE: the scraped file-viewer header (Space status "Running", file size
# 7,248 bytes, revisions 9b15f17 / e8c034c) and its line-number gutter were
# removed here; they are page residue, not part of the program.
from onnx_modules.V230_OnnxInference import OnnxInferenceSession
import numpy as np
import torch
from scipy.io.wavfile import write
from text import cleaned_text_to_sequence, get_bert
from text.cleaner import clean_text
import utils
import commons
import uuid
from flask import Flask, request, jsonify, render_template_string
from flask_cors import CORS
import gradio as gr
import os
from threading import Thread
# Device string forwarded to get_text() and to the tensor .to() calls in infer().
device = "cpu"

# Hyper-parameters / speaker table loaded from the JSON config that ships with
# the ONNX export (path is relative to the working directory).
hps = utils.get_hparams_from_file("onnx/BangDreamApi.json")
# Group name -> list of member names.  Not referenced elsewhere in the visible
# part of this file; presumably consumed by a UI or by importers of this module.
BandList = {
    "PoppinParty": ["香澄", "有咲", "たえ", "りみ", "沙綾"],
    "Afterglow": ["蘭", "モカ", "ひまり", "巴", "つぐみ"],
    "HelloHappyWorld": ["こころ", "美咲", "薫", "花音", "はぐみ"],
    "PastelPalettes": ["彩", "日菜", "千聖", "イヴ", "麻弥"],
    "Roselia": ["友希那", "紗夜", "リサ", "燐子", "あこ"],
    "RaiseASuilen": ["レイヤ", "ロック", "ますき", "チュチュ", "パレオ"],
    "Morfonica": ["ましろ", "瑠唯", "つくし", "七深", "透子"],
    "MyGo": ["燈", "愛音", "そよ", "立希", "楽奈"],
    "AveMujica": ["祥子", "睦", "海鈴", "にゃむ", "初華"],
    "圣翔音乐学园": [
        "華戀", "光", "香子", "雙葉", "真晝", "純那", "克洛迪娜", "真矢", "奈奈",
    ],
    "凛明馆女子学校": ["珠緒", "壘", "文", "悠悠子", "一愛"],
    "弗隆提亚艺术学校": ["艾露", "艾露露", "菈樂菲", "司", "靜羽"],
    "西克菲尔特音乐学院": ["晶", "未知留", "八千代", "栞", "美帆"],
}
# One ONNX graph per sub-module of the exported model; all of them are run with
# the CPU execution provider.
Session = OnnxInferenceSession(
    dict(
        enc="onnx/BangDreamApi/BangDreamApi_enc_p.onnx",
        emb_g="onnx/BangDreamApi/BangDreamApi_emb.onnx",
        dp="onnx/BangDreamApi/BangDreamApi_dp.onnx",
        sdp="onnx/BangDreamApi/BangDreamApi_sdp.onnx",
        flow="onnx/BangDreamApi/BangDreamApi_flow.onnx",
        dec="onnx/BangDreamApi/BangDreamApi_dec.onnx",
    ),
    Providers=["CPUExecutionProvider"],
)
def get_text(text, language_str, hps, device, style_text=None, style_weight=0.7):
    """Turn raw text into the BERT features and id sequences used for synthesis.

    Args:
        text: Input text to synthesize.
        language_str: One of ``"ZH"``, ``"JP"`` or ``"EN"``.
        hps: Accepted for signature compatibility; not read by this function.
        device: Device string forwarded to ``get_bert``.
        style_text: Optional style prompt; an empty string is treated as None.
        style_weight: Weight forwarded to ``get_bert`` together with
            ``style_text``.

    Returns:
        ``(bert, ja_bert, en_bert, phone, tone, language)``.  The feature slot
        matching ``language_str`` holds the output of ``get_bert``; the other
        two are ``torch.randn(1024, len(phone))`` placeholders.  ``phone``,
        ``tone`` and ``language`` are ``torch.LongTensor`` objects.

    Raises:
        ValueError: If ``language_str`` is not ZH, JP or EN.
        AssertionError: If the BERT feature length does not match the phone
            sequence length.
    """
    if style_text == "":
        style_text = None

    norm_text, phone, tone, word2ph = clean_text(text, language_str)
    phone, tone, language = cleaned_text_to_sequence(phone, tone, language_str)

    # A blank (id 0) is always interleaved between symbols, so every
    # word-to-phone count doubles and the first word absorbs the extra
    # leading blank.
    phone, tone, language = (
        commons.intersperse(seq, 0) for seq in (phone, tone, language)
    )
    for idx, count in enumerate(word2ph):
        word2ph[idx] = count * 2
    word2ph[0] += 1

    bert_ori = get_bert(
        norm_text, word2ph, language_str, device, style_text, style_weight
    )
    seq_len = len(phone)
    assert bert_ori.shape[-1] == seq_len, phone

    slots = ("ZH", "JP", "EN")
    if language_str not in slots:
        raise ValueError("language_str should be ZH, JP or EN")
    # Evaluated in slot order so the random placeholders are drawn in the same
    # order regardless of which slot receives the real features.
    bert, ja_bert, en_bert = (
        bert_ori if slot == language_str else torch.randn(1024, seq_len)
        for slot in slots
    )

    assert bert.shape[-1] == len(
        phone
    ), f"Bert seq len {bert.shape[-1]} != {len(phone)}"

    return (
        bert,
        ja_bert,
        en_bert,
        torch.LongTensor(phone),
        torch.LongTensor(tone),
        torch.LongTensor(language),
    )
def infer(
    text,
    sid,
    style_text=None,
    style_weight=0.7,
    sdp_ratio=0.5,
    noise_scale=0.6,
    noise_scale_w=0.667,
    length_scale=1,
    unique_filename = 'temp.wav'
):
    """Synthesize ``text`` with speaker ``sid`` and write it to a WAV file.

    The language is chosen automatically: ``'JP'`` when ``is_japanese(text)``
    is true, otherwise ``'ZH'``.

    Args:
        text: Text to synthesize.
        sid: Speaker name; looked up in ``hps.spk2id``.
        style_text: Optional style prompt forwarded to ``get_text``.
        style_weight: Style weight forwarded to ``get_text``.
        sdp_ratio: Passed to the session as ``sdp_ratio``.
        noise_scale: Passed to the session as ``sdp_noise_scale``.
        noise_scale_w: Passed to the session as ``seq_noise_scale``.
        length_scale: Passed to the session as ``length_scale``.
        unique_filename: Path the audio is written to at 44100 Hz.

    Returns:
        ``(44100, samples)`` where ``samples`` is the session output converted
        by ``gr.processing_utils.convert_to_16_bit_wav``.
    """
    detected_language = 'JP' if is_japanese(text) else 'ZH'
    # The tone / language ids computed by get_text are not used below.
    bert, ja_bert, en_bert, phones, _unused_tone, _unused_language = get_text(
        text,
        detected_language,
        hps,
        device,
        style_text=style_text,
        style_weight=style_weight,
    )

    with torch.no_grad():
        x_tst = phones.unsqueeze(0).to(device).numpy()
        # NOTE(review): tone and language ids are replaced with all-zero
        # arrays shaped like x_tst, for every input language — confirm this
        # is what the exported model expects.
        zero_language = np.zeros_like(x_tst)
        zero_tone = np.zeros_like(x_tst)
        # Each feature matrix is transposed (dims 0 and 1 swapped) before
        # being handed to the session.
        bert_np, ja_bert_np, en_bert_np = (
            feat.to(device).transpose(0, 1).numpy()
            for feat in (bert, ja_bert, en_bert)
        )
        speaker_index = np.array([hps.spk2id[sid]])
        audio = Session(
            x_tst,
            zero_tone,
            zero_language,
            bert_np,
            ja_bert_np,
            en_bert_np,
            speaker_index,
            seed=114514,
            seq_noise_scale=noise_scale_w,
            sdp_noise_scale=noise_scale,
            length_scale=length_scale,
            sdp_ratio=sdp_ratio,
        )

    write(unique_filename, 44100, audio)
    return (44100, gr.processing_utils.convert_to_16_bit_wav(audio))
def is_japanese(string):
    """Return True if ``string`` contains at least one kana character.

    A character counts when its code point lies in U+3041..U+30FF, i.e. the
    Hiragana and Katakana blocks.  The upper bound is inclusive so the last
    Katakana code point (U+30FF) is recognised as well.  Text written only in
    kanji contains no kana and therefore yields False.
    """
    return any(0x3040 < ord(ch) <= 0x30FF for ch in string)
Flaskapp = Flask(__name__)
CORS(Flaskapp)

# Text of the most recently synthesized request, used to de-duplicate repeated
# chat requests.  Defined at import time so the route also works when this
# module is imported by a WSGI server instead of being run as a script.
last_text = ""

# Numeric query parameters accepted by the route and their defaults.
_FLOAT_QUERY_DEFAULTS = {
    'sdp_ratio': 0.2,
    'noise_scale': 0.6,
    'noise_scale_w': 0.8,
    'length_scale': 1,
    'style_weight': 0.7,
}

# Page served when the request carries no speaker or no text.
_LANDING_PAGE = """
<!DOCTYPE html>
<html>
<head>
<title>TTS API Documentation</title>
</head>
<body>
<iframe src="https://mahiruoshi-bangdream-bert-vits2.hf.space" style="width:100%; height:100vh; border:none;"></iframe>
</body>
</html>
"""


def _float_query_args(args):
    """Parse the numeric query parameters, raising ValueError on bad input."""
    values = {}
    for name, default in _FLOAT_QUERY_DEFAULTS.items():
        raw = args.get(name, default)
        try:
            values[name] = float(raw)
        except ValueError:
            raise ValueError(
                f"query parameter {name!r} must be a number, got {raw!r}"
            ) from None
    return values


@Flaskapp.route('/')
def tts():
    """Synthesize speech for the ``speaker`` / ``text`` query parameters.

    Query parameters:
        speaker, text: Required; without both, an HTML landing page is served.
        sdp_ratio, noise_scale, noise_scale_w, length_scale, style_weight:
            Optional floats forwarded to ``infer``.
        style_text: Optional style prompt (default ``'happy'``).
        is_chat: ``'true'`` enables de-duplication: when ``text`` equals the
            previously synthesized text, one second of silence is returned
            instead of running the model again.

    Returns:
        ``(wav_bytes, 200, headers)`` on success, where the ``Text`` header
        carries the name of the generated file; a JSON error with status 400
        when a numeric parameter is malformed or the speaker is unknown.
    """
    global last_text
    args = request.args
    speaker = args.get('speaker')
    text = args.get('text')
    style_text = args.get('style_text', 'happy')
    is_chat = args.get('is_chat', 'false').lower() == 'true'

    try:
        numeric = _float_query_args(args)
    except ValueError as err:
        return jsonify(error=str(err)), 400

    if not speaker or not text:
        return render_template_string(_LANDING_PAGE)

    # Reject unknown speakers up front instead of failing inside infer().
    # Both exception types are caught because the speaker table may be a plain
    # mapping or an attribute-backed config object.
    try:
        hps.spk2id[speaker]
    except (KeyError, AttributeError):
        return jsonify(error=f"unknown speaker: {speaker}"), 400

    if is_chat and text == last_text:
        # One second of silence, built in memory so concurrent requests never
        # share (and delete) the same file on disk.
        import io

        unique_filename = 'blank.wav'
        buffer = io.BytesIO()
        write(buffer, 44100, np.zeros(44100, dtype=np.int16))
        wav_bytes = buffer.getvalue()
    else:
        unique_filename = f"temp{uuid.uuid4()}.wav"
        try:
            infer(
                text,
                sid=speaker,
                style_text=style_text,
                unique_filename=unique_filename,
                **numeric,
            )
            with open(unique_filename, 'rb') as wav_file:
                wav_bytes = wav_file.read()
        finally:
            # Always clean up the temporary file, even when reading it fails.
            try:
                os.remove(unique_filename)
            except FileNotFoundError:
                pass
        # Remember the text only after synthesis succeeded, so a failed
        # request is not answered with silence when it is retried.
        last_text = text

    headers = {
        'Content-Type': 'audio/wav',
        # Header values are passed as text; the file name is plain ASCII.
        'Text': unique_filename,
    }
    return wav_bytes, 200, headers
if __name__ == "__main__":
    speaker_ids = hps.spk2id
    speakers = list(speaker_ids.keys())
    last_text = ""
    # The server listens on all interfaces, so the interactive debugger must
    # not be on by default: it allows code execution from the browser.  Set
    # FLASK_DEBUG=1 explicitly to enable it for local development.
    debug_enabled = os.environ.get("FLASK_DEBUG", "").lower() in ("1", "true")
    Flaskapp.run(host="0.0.0.0", port=5000, debug=debug_enabled)