# Tejas1206 / app.py @ 049c446 (5.51 kB)
# (header captured from the hosting page's "raw" / "history" / "blame" toolbar)
import gradio as gr
import librosa
import numpy as np
import torch
from transformers import SpeechT5Processor, SpeechT5ForTextToSpeech, SpeechT5HifiGan
# Hub repositories the three pipeline stages are loaded from.  The text
# front end and the vocoder come from the stock Microsoft checkpoints; only
# the acoustic model uses the "tejas1206/speecht5_tts_ta" weights
# (presumably a Tamil fine-tune -- confirm against the model card).
_PROCESSOR_REPO = "microsoft/speecht5_tts"
_ACOUSTIC_MODEL_REPO = "tejas1206/speecht5_tts_ta"
_VOCODER_REPO = "microsoft/speecht5_hifigan"

# Loaded once at import time and shared by every call to predict().
processor = SpeechT5Processor.from_pretrained(_PROCESSOR_REPO)
model = SpeechT5ForTextToSpeech.from_pretrained(_ACOUSTIC_MODEL_REPO)
vocoder = SpeechT5HifiGan.from_pretrained(_VOCODER_REPO)
# Maps a three-letter speaker code to the .npy file holding that speaker's
# embedding.  predict() looks a speaker up by the first three characters of
# the selected radio label, so every key here must be exactly that prefix.
speaker_embeddings = dict(
    BDL="speaker/cmu_us_bdl_arctic-wav-arctic_a0009.npy",
    CLB="speaker/cmu_us_clb_arctic-wav-arctic_a0144.npy",
    KSP="speaker/cmu_us_ksp_arctic-wav-arctic_b0087.npy",
    RMS="speaker/cmu_us_rms_arctic-wav-arctic_b0353.npy",
    SLT="speaker/cmu_us_slt_arctic-wav-arctic_a0508.npy",
)
# Ordered (source, replacement) pairs applied by convert_text().
#
# NOTE(review): this table presumably mirrors the text normalisation used
# when the acoustic model was fine-tuned; confirm before changing any
# mapping or the order, since the model would then see unfamiliar input.
_TEXT_REPLACEMENTS = (
    # --- ASCII / Latin punctuation and accented letters ---
    (' ', ' '),  # Space
    ('&', 'and'),  # Ampersand
    ('_', '_'),  # Underscore (identity mapping)
    ('`', '`'),  # Backtick (identity mapping)
    ('·', '.'),  # Middle dot
    ('á', 'a'),  # 'a' with acute accent
    ('ô', 'o'),  # 'o' with circumflex
    ('š', 's'),  # 's' with caron
    # --- Tamil independent vowels and aytham ---
    ('ஃ', 'akh'),  # Aytham
    ('அ', 'a'),  # Tamil letter A
    ('ஆ', 'aa'),  # Tamil letter AA
    ('இ', 'i'),  # Tamil letter I
    ('ஈ', 'ii'),  # Tamil letter II
    ('உ', 'u'),  # Tamil letter U
    ('ஊ', 'uu'),  # Tamil letter UU
    ('எ', 'e'),  # Tamil letter E
    ('ஏ', 'ee'),  # Tamil letter EE
    ('ஐ', 'ai'),  # Tamil letter AI
    ('ஒ', 'o'),  # Tamil letter O
    ('ஓ', 'oo'),  # Tamil letter OO
    ('ஔ', 'au'),  # Tamil letter AU
    # --- Tamil consonants (each carries the inherent vowel 'a') ---
    ('க', 'ka'),  # Tamil letter KA
    ('ங', 'nga'),  # Tamil letter NGA
    ('ச', 'cha'),  # Tamil letter CA
    ('ஜ', 'ja'),  # Tamil letter JA
    ('ஞ', 'nya'),  # Tamil letter NYA
    ('ட', 'ta'),  # Tamil letter TTA
    ('ண', 'na'),  # Tamil letter NNA
    ('த', 'tha'),  # Tamil letter TA
    ('ந', 'na'),  # Tamil letter NA
    ('ன', 'na'),  # Tamil letter NNNA
    ('ப', 'pa'),  # Tamil letter PA
    ('ம', 'ma'),  # Tamil letter MA
    ('ய', 'ya'),  # Tamil letter YA
    ('ர', 'ra'),  # Tamil letter RA
    ('ற', 'rra'),  # Tamil letter RRA
    ('ல', 'la'),  # Tamil letter LA
    ('ள', 'lla'),  # Tamil letter LLA
    ('ழ', 'zha'),  # Tamil letter LLLA
    ('வ', 'va'),  # Tamil letter VA
    ('ஷ', 'sha'),  # Tamil letter SSA
    ('ஸ', 'sa'),  # Tamil letter SA
    ('ஹ', 'ha'),  # Tamil letter HA
    # --- Tamil dependent vowel signs ---
    ('ா', 'aa'),  # Vowel sign AA
    ('ி', 'i'),  # Vowel sign I
    ('ீ', 'ii'),  # Vowel sign II
    ('ு', 'u'),  # Vowel sign U
    ('ூ', 'uu'),  # Vowel sign UU
    ('ெ', 'e'),  # Vowel sign E
    ('ே', 'ee'),  # Vowel sign EE
    ('ை', 'ai'),  # Vowel sign AI
    ('ொ', 'o'),  # Vowel sign O
    ('ோ', 'oo'),  # Vowel sign OO
    ('ௌ', 'au'),  # Vowel sign AU
    ('்', ''),  # Virama: simply dropped (the preceding 'a' is kept)
    ('ௗ', 'au'),  # AU length mark
    # --- Stray characters from other scripts ---
    ('ഥ', 'tha'),  # Malayalam letter THA
    # --- Typographic punctuation and symbols ---
    ('–', '-'),  # En dash
    ('‘', "'"),  # Left single quotation mark
    ('’', "'"),  # Right single quotation mark
    ('‚', ','),  # Single low-9 quotation mark
    ('“', '"'),  # Left double quotation mark
    ('”', '"'),  # Right double quotation mark
    ('•', '.'),  # Bullet
    ('…', '...'),  # Horizontal ellipsis
    ('′', "'"),  # Prime
    ('″', '"'),  # Double prime
    ('●', '.'),  # Black circle
    ('◯', 'o'),  # Large circle
)


def convert_text(sentence: str) -> str:
    """Replace Tamil and typographic characters with ASCII stand-ins.

    Every pair in ``_TEXT_REPLACEMENTS`` is applied in table order with
    ``str.replace``; characters not listed there pass through unchanged.

    The mapping is a plain per-character substitution, not a phonetic
    transliteration: a consonant always contributes its inherent ``a`` and a
    following vowel sign is appended after it (``'கி'`` becomes ``'kai'``),
    while the virama is just removed (``'க்'`` becomes ``'ka'``).

    Args:
        sentence: Input text, possibly containing Tamil script.

    Returns:
        The text with every listed character substituted.
    """
    for src, dst in _TEXT_REPLACEMENTS:
        sentence = sentence.replace(src, dst)
    return sentence
def predict(text, speaker):
    """Synthesise speech for ``text`` in the voice selected by ``speaker``.

    Args:
        text: Text to speak.  ``None`` or whitespace-only input yields empty
            audio instead of invoking the model.
        speaker: Radio label.  ``"Surprise Me!"`` derives a random voice from
            one of the stored embeddings; any other value is looked up in
            ``speaker_embeddings`` by its first three characters.

    Returns:
        A ``(sample_rate, samples)`` tuple where ``sample_rate`` is 16000 and
        ``samples`` is a NumPy ``int16`` array.

    Raises:
        KeyError: If ``speaker`` is not ``"Surprise Me!"`` and its three-letter
            prefix is not a key of ``speaker_embeddings``.
    """
    sample_rate = 16000

    # Treat a missing value the same as blank text so the UI never crashes
    # on an untouched input box.
    if not text or not text.strip():
        return (sample_rate, np.zeros(0, dtype=np.int16))

    text = convert_text(text)
    inputs = processor(text=text, return_tensors="pt")

    # Truncate to the longest token sequence the model is configured for.
    input_ids = inputs["input_ids"][..., :model.config.max_text_positions]

    if speaker == "Surprise Me!":
        # Start from one of the provided speaker embeddings, picked at random.
        names = list(speaker_embeddings)
        key = names[np.random.randint(len(names))]
        speaker_embedding = np.load(speaker_embeddings[key])

        # Scramble it: shuffle the elements in place ...
        np.random.shuffle(speaker_embedding)

        # ... then flip the sign of roughly half of them.  The mask is sized
        # from the embedding itself rather than a hard-coded length.
        signs = np.where(
            np.random.rand(*speaker_embedding.shape) >= 0.5, 1.0, -1.0
        ).astype(speaker_embedding.dtype)
        speaker_embedding *= signs
    else:
        # Labels look like "BDL (male)"; the first three characters are the key.
        speaker_embedding = np.load(speaker_embeddings[speaker[:3]])

    # Add a leading batch dimension before handing it to the model.
    speaker_embedding = torch.tensor(speaker_embedding).unsqueeze(0)

    speech = model.generate_speech(input_ids, speaker_embedding, vocoder=vocoder)

    # Scale to 16-bit PCM.  The waveform is presumably already within
    # [-1, 1]; clipping guards against integer wrap-around if it is not.
    speech = (np.clip(speech.numpy(), -1.0, 1.0) * 32767).astype(np.int16)
    return (sample_rate, speech)
title = "Text-to-Speech App using SpeechT5"

# Labels offered in the speaker selector.  predict() uses the first three
# characters of a label as the speaker code, and handles "Surprise Me!"
# separately.
_speaker_choices = [
    "BDL (male)",
    "CLB (female)",
    "KSP (male)",
    "RMS (male)",
    "SLT (female)",
    "Surprise Me!",
]

# Input widgets, in the order predict() receives their values.
_text_input = gr.Text(label="Input Text")
_speaker_input = gr.Radio(
    label="Speaker",
    choices=_speaker_choices,
    value="BDL (male)",
)

# predict() returns (sample_rate, samples), which is what type="numpy" expects.
_audio_output = gr.Audio(label="Generated Speech", type="numpy")

demo = gr.Interface(
    fn=predict,
    inputs=[_text_input, _speaker_input],
    outputs=[_audio_output],
    title=title,
)
demo.launch()