"""Gradio app: XTTS-v2 voice cloning that also exports per-phoneme timing info.

Takes input text plus a reference-voice audio file, synthesizes Polish speech
with the cloned voice, and writes two artifacts: the synthesized WAV and a JSON
file describing phoneme timings.
"""

import json
import os

import gradio as gr
import spaces
import torch
from TTS.api import TTS

# Coqui requires explicit ToS agreement for non-interactive use.
os.environ["COQUI_TOS_AGREED"] = "1"

device = "cuda"
# Load the multilingual XTTS-v2 voice-cloning model onto the GPU once at startup.
tts = TTS("tts_models/multilingual/multi-dataset/xtts_v2").to(device)


@spaces.GPU(enable_queue=True)
def clone(text, audio):
    """Synthesize `text` in the voice of the reference `audio` file.

    Parameters
    ----------
    text : str
        Text to synthesize (language is fixed to Polish, "pl").
    audio : str
        Filesystem path to the reference-speaker audio clip.

    Returns
    -------
    tuple[str, str]
        Paths to the synthesized WAV file and the phoneme-timing JSON file.
    """
    # Generate speech. NOTE(review): `return_type="dict"` and the 4-tuple
    # unpacking assume a specific TTS.api contract — confirm against the
    # installed Coqui TTS version.
    wav, alignment, text_info, _ = tts.tts(
        text=text,
        speaker_wav=audio,
        language="pl",
        return_type="dict",
    )

    # Save the synthesized audio.
    tts.save_wav(wav, file_path="./output.wav")

    # Build per-phoneme timing records.
    # NOTE(review): "start" is filled from `align_durations` (per-phoneme
    # durations) while "end" comes from their cumulative sum — so "start" is a
    # duration, not an onset time. If onsets are intended, start should be
    # `cumsum - duration`. Left as-is pending confirmation of the intent.
    phonemes_info = []
    for phoneme, start, end in zip(
        text_info["phonemes"],
        alignment["align_durations"],
        alignment["align_durations_cumsum"],
    ):
        phonemes_info.append({
            "phoneme": phoneme,
            "start": float(start),
            "end": float(end),
        })

    # Persist the phoneme timing info as UTF-8 JSON.
    with open("./phonemes_info.json", "w", encoding="utf-8") as f:
        json.dump(phonemes_info, f, ensure_ascii=False, indent=2)

    return "./output.wav", "./phonemes_info.json"


# Gradio interface wiring: text + reference audio in, WAV + JSON out.
iface = gr.Interface(
    fn=clone,
    inputs=[
        gr.Textbox(label='Tekst do syntezy'),
        gr.Audio(type='filepath', label='Plik audio z głosem referencyjnym')
    ],
    outputs=[
        gr.Audio(type='filepath', label='Zsyntezowana mowa'),
        gr.File(label='Informacje o fonemach (JSON)')
    ],
    title='Klonowanie Głosu z Informacjami o Fonemach',
    theme=gr.themes.Base(primary_hue="teal", secondary_hue="teal", neutral_hue="slate")
)

iface.launch()