---
license: mit
tags:
- urdu-tts
- text-to-speech
- urdu-text-to-speech
- urdu-voice-cloning
---

# How to Use This Model

# Installation

1) pip install coqui-tts
2) Locate TTS/tts/layers/xtts/tokenizers.py in your site-packages directory.
3) Replace the tokenizers.py file with the tokenizers.py in this repository.
4) And you should be good to go!

Note: The model might not perform well on very long inputs. You can write your own text splitter to split longer inputs into shorter sentences based on your needs.

# Example

# Source Voice

# Generated Voice

# Inference Code

```python
"""Urdu voice-cloning inference with a fine-tuned Coqui XTTS model."""

import torch
from tqdm import tqdm

from TTS.tts.configs.xtts_config import XttsConfig
from TTS.tts.models.xtts import Xtts

device = "cuda:0" if torch.cuda.is_available() else "cpu"

# Fine-tuned checkpoint, config, and vocabulary shipped with this repository.
xtts_checkpoint = "model.pth"
xtts_config = "config.json"
xtts_vocab = "vocab.json"

config = XttsConfig()
config.load_json(xtts_config)
XTTS_MODEL = Xtts.init_from_config(config)
XTTS_MODEL.load_checkpoint(
    config,
    checkpoint_path=xtts_checkpoint,
    vocab_path=xtts_vocab,
    use_deepspeed=False,
)
XTTS_MODEL.to(device)
print("Model loaded successfully!")

# In case you are cloning from WhatsApp voice notes: convert the OGG note
# to WAV so it can be used as the reference speaker audio below.
from pydub import AudioSegment

audio = AudioSegment.from_file("input-4.ogg", format="ogg")
audio.export("output.wav", format="wav")
print("Conversion complete!")

# Inference
tts_text = """یہ ٹی ٹی ایس کیسا ہے؟ اس کے بارے میں کچھ بتائیں"""
speaker_audio_file = "output.wav"
lang = "ur"

# Extract the GPT conditioning latents and speaker embedding from the
# reference audio; these drive the voice cloning.
gpt_cond_latent, speaker_embedding = XTTS_MODEL.get_conditioning_latents(
    audio_path=[speaker_audio_file],
    gpt_cond_len=XTTS_MODEL.config.gpt_cond_len,
    max_ref_length=XTTS_MODEL.config.max_ref_len,
    sound_norm_refs=XTTS_MODEL.config.sound_norm_refs,
)

# Split very long inputs into shorter sentences here if needed and append
# each piece to this list; the loop below synthesizes one chunk per entry.
tts_texts = [tts_text]

wav_chunks = []
for text in tqdm(tts_texts):
    wav_chunk = XTTS_MODEL.inference(
        text=text,
        language=lang,
        gpt_cond_latent=gpt_cond_latent,
        speaker_embedding=speaker_embedding,
        # Low temperature/top_k/top_p keep the output stable; the high
        # repetition penalty suppresses stutter on repeated phonemes.
        temperature=0.1,
        length_penalty=0.1,
        repetition_penalty=10.0,
        top_k=10,
        top_p=0.3,
    )
    wav_chunks.append(torch.tensor(wav_chunk["wav"]))

# Concatenate all chunks into a single (1, samples) waveform on the CPU.
out_wav = torch.cat(wav_chunks, dim=0).unsqueeze(0).cpu()

# Play the result in a notebook; XTTS outputs 24 kHz audio.
from IPython.display import Audio

Audio(out_wav, rate=24000)
```