# synthesis / app.py
# HuggingFace Space by adowu — commit 29f6b1d (verified).
# (Scraped page chrome "raw / history / blame / 1.9 kB" converted to this comment header.)
import spaces
import gradio as gr
import torch
from TTS.api import TTS
import os
import json
import scipy.io.wavfile as wavfile
import numpy as np
# Accept Coqui's terms of service non-interactively (required before XTTS v2 loads).
os.environ["COQUI_TOS_AGREED"] = "1"
# Hard-coded CUDA device. NOTE(review): this assumes a GPU host — on HF Spaces
# ZeroGPU the import-time .to("cuda") is handled by the `spaces` runtime, but on
# a CPU-only machine this crashes; confirm the deployment target.
device = "cuda"
# Load the multilingual XTTS v2 voice-cloning model once at startup.
tts = TTS("tts_models/multilingual/multi-dataset/xtts_v2").to(device)
@spaces.GPU(enable_queue=True)
def clone(text, audio):
    """Synthesize `text` in the voice of the reference `audio` clip.

    Args:
        text: Text to synthesize (language is fixed to Polish, "pl").
        audio: Path to a reference audio file carrying the target voice.

    Returns:
        Tuple of (path to the synthesized WAV file,
                  path to a JSON file with per-phoneme start/end/duration).
    """
    # Generate speech; XTTS returns a sequence of float samples.
    wav = tts.tts(text=text, speaker_wav=audio, language="pl")
    # Convert to 16-bit PCM. Clip to [-1, 1] first: samples outside that
    # range would integer-wrap when cast to int16 (audible artifacts) in
    # the original multiply-and-cast.
    wav_np = np.clip(np.asarray(wav, dtype=np.float32), -1.0, 1.0)
    wavfile.write("./output.wav", 24000, (wav_np * 32767).astype(np.int16))
    # Phoneme information. NOTE(review): `synthesizer.get_phonemes` is not a
    # documented public Coqui TTS API in all releases — confirm against the
    # pinned TTS version.
    phonemes_info = tts.synthesizer.get_phonemes(text, language="pl")
    # Accumulate (phoneme, duration) pairs into absolute start/end times.
    phonemes_data = []
    cumulative_duration = 0
    for phoneme, duration in phonemes_info:
        start_time = cumulative_duration
        end_time = start_time + duration
        phonemes_data.append({
            "phoneme": phoneme,
            "start": float(start_time),
            "end": float(end_time),
            "duration": float(duration),
        })
        cumulative_duration = end_time
    # Persist the phoneme timing next to the audio as UTF-8 JSON.
    with open("./phonemes_info.json", "w", encoding="utf-8") as f:
        json.dump(phonemes_data, f, ensure_ascii=False, indent=2)
    return "./output.wav", "./phonemes_info.json"
# Gradio UI: text + reference-voice audio in, synthesized speech + phoneme JSON out.
input_components = [
    gr.Textbox(label='Tekst do syntezy'),
    gr.Audio(type='filepath', label='Plik audio z głosem referencyjnym'),
]
output_components = [
    gr.Audio(type='filepath', label='Zsyntezowana mowa'),
    gr.File(label='Informacje o fonemach (JSON)'),
]
iface = gr.Interface(
    fn=clone,
    inputs=input_components,
    outputs=output_components,
    title='Klonowanie Głosu z Informacjami o Fonemach',
    theme=gr.themes.Base(primary_hue="teal", secondary_hue="teal", neutral_hue="slate"),
)
iface.launch(share=True)