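"""Gradio demo for zero-shot Mandarin voice cloning.

A speaker encoder turns a short reference clip into a voice embedding, a
Tacotron-style synthesizer generates mel spectrograms for the input text, and a
HiFi-GAN vocoder renders the final waveform. The `encoder`, `synthesizer` and
`vocoder` packages plus the pretrained checkpoints are expected to live in the
surrounding repository (paths are configured in `Mandarin.__init__`).
"""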
import gradio as gr

import re
import random
import string
import librosa
import numpy as np

from pathlib import Path
from scipy.io.wavfile import write

from encoder import inference as encoder
from vocoder.hifigan import inference as gan_vocoder
from synthesizer.inference import Synthesizer

class Mandarin:
    # Cache synthesizers by checkpoint path at class level so repeated
    # instantiation does not reload the model.
    synthesizers_cache = {}

    def __init__(self):
        self.encoder_path = "encoder/saved_models/pretrained.pt"
        self.vocoder_path = "vocoder/saved_models/pretrained/g_hifigan.pt"
        self.config_fpath = "vocoder/hifigan/config_16k_.json"
        self.accent = "synthesizer/saved_models/普通话.pt"  # "普通话" = Mandarin checkpoint

        if self.synthesizers_cache.get(self.accent) is None:
            self.current_synt = Synthesizer(Path(self.accent))
            self.synthesizers_cache[self.accent] = self.current_synt
        else:
            self.current_synt = self.synthesizers_cache[self.accent]

        encoder.load_model(Path(self.encoder_path))
        gan_vocoder.load_model(Path(self.vocoder_path), self.config_fpath)

    def setVoice(self, timbre):
        # Embed the reference (timbre) audio with the speaker encoder.
        self.timbre = timbre
        wav, sample_rate = librosa.load(self.timbre)

        encoder_wav = encoder.preprocess_wav(wav, sample_rate)
        self.embed, _, _ = encoder.embed_utterance(encoder_wav, return_partials=True)

    def say(self, text):
        # Split the input into short utterances at newlines and at
        # full-/half-width punctuation, then synthesize each piece.
        texts = filter(None, text.split("\n"))
        punctuation = "!,。、?!,.?::"
        processed_texts = []
        for line in texts:
            for processed_text in re.sub(r'[{}]+'.format(punctuation), '\n', line).split('\n'):
                if processed_text:
                    processed_texts.append(processed_text.strip())
        texts = processed_texts
        embeds = [self.embed] * len(texts)

        # Synthesize a mel spectrogram per utterance, join them along the time
        # axis, and render the waveform with the HiFi-GAN vocoder.
        specs = self.current_synt.synthesize_spectrograms(texts, embeds)
        spec = np.concatenate(specs, axis=1)
        wav, sample_rate = gan_vocoder.infer_waveform(spec)

        return wav, sample_rate
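
# Minimal programmatic usage sketch (file names below are placeholders, not
# files that ship with the repository):
#
#   voice = Mandarin()
#   voice.setVoice("reference.wav")                # short clip of the target speaker
#   wav, sr = voice.say("你好,这是一段合成音频。")  # "Hello, this is a synthesized clip."
#   write("cloned.wav", sr, wav.astype(np.float32))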

def greet(audio, text, voice=None):
    print(f"Log print: audio name=[{audio.name}], text=[{text}]")

    if voice is None:
        # First call in a session: load the models, embed the uploaded reference
        # audio, and run a short warm-up synthesis ("加载成功" = "loaded successfully").
        voice = Mandarin()
        voice.setVoice(audio.name)
        voice.say("加载成功")
    wav, sample_rate = voice.say(text)

    # Write the result to a randomly named wav file and return it together with
    # the Mandarin instance, which Gradio keeps in session state for later calls.
    output_file = "".join(random.sample(string.ascii_lowercase + string.digits, 11)) + ".wav"
    write(output_file, sample_rate, wav.astype(np.float32))

    return output_file, voice

def main():
    # Legacy gr.Interface version of the demo (old gr.inputs / gr.outputs API).
    demo = gr.Interface(
        fn=greet,
        inputs=[gr.inputs.Audio(type="file"), "text", "state"],
        outputs=[gr.outputs.Audio(type="file"), "state"],
        title="Tacotron Zero-shot Voice Clone (Chinese Version)"
    )

    demo.launch()

def new_main():
    # Blocks-based UI: reference audio and text on the left, synthesized audio on the right.
    with gr.Blocks() as demo:
        state = gr.State()
        gr.Markdown("# Tacotron Zero-shot Voice Clone (Chinese Version)")
        with gr.Row():
            with gr.Column(scale=1):
                input_audio = gr.Audio(type="file", label="Source Audio", value="exp/lihao_01.wav")
                # Default prompt, roughly: "Hello everyone, I'm Li Hao, hard at work;
                # this is a synthesized audio clip."
                input_text = gr.Textbox(value="大家好,我是正在搬砖的李昊,这是一段合成音频。")
                with gr.Row():
                    clear = gr.ClearButton(components=[input_audio, input_text])
                    submit = gr.Button(value="Submit", variant='primary')
            with gr.Column(scale=1):
                output_audio = gr.Audio(type="file", label="Output Audio")

        # Event handlers must be registered inside the Blocks context.
        submit.click(greet, inputs=[input_audio, input_text, state], outputs=[output_audio, state])

    demo.launch()

if __name__ == "__main__":
    new_main()