# Spaces: Running
import gradio as gr | |
import torch | |
import soundfile as sf | |
import spaces | |
import os | |
import numpy as np | |
import re | |
from transformers import SpeechT5Processor, SpeechT5ForTextToSpeech, SpeechT5HifiGan, AutoTokenizer | |
from speechbrain.pretrained import EncoderClassifier | |
from datasets import load_dataset | |
from huggingface_hub import hf_hub_download | |
import uuid | |
import wave | |
from piper import PiperVoice | |
# Run on the GPU when torch reports one, otherwise fall back to the CPU.
device = "cuda" if torch.cuda.is_available() else "cpu"

# Value of the `hf_token` environment variable; `True` when it is unset or empty.
# NOTE(review): `True` presumably asks the hub client to use locally stored
# credentials - confirm against the installed huggingface_hub version.
auth_token = os.environ.get("hf_token") or True

# Download the Piper ONNX model and its JSON config from the same Hub repo.
model_path, config_path = (
    hf_hub_download(
        repo_id="fahadqazi/piper-sindhi",
        filename=hub_filename,
        use_auth_token=auth_token,
    )
    for hub_filename in ("model.onnx", "model.onnx.json")
)

# CUDA inference is requested only when `device` resolved to "cuda".
voice = PiperVoice.load(
    model_path=model_path,
    config_path=config_path,
    use_cuda=(device == "cuda"),
)

# Keyword arguments forwarded to every `voice.synthesize` call.
synthesize_args = dict(speaker_id=0, sentence_silence=0.5)
def load_models_and_data():
    """Load the SpeechT5 model, processor and HiFi-GAN vocoder.

    The processor comes from ``microsoft/speecht5_tts`` but has its tokenizer
    replaced by the one from ``fahadqazi/testts1234``, which is also the repo
    the acoustic model is loaded from. The model and the vocoder are moved to
    the module-level ``device``.

    Returns:
        A ``(model, processor, vocoder)`` tuple.
    """
    # Token from the `hf_token` environment variable; `True` when unset/empty.
    hub_token = os.environ.get("hf_token") or True
    finetuned_repo = "fahadqazi/testts1234"

    processor = SpeechT5Processor.from_pretrained("microsoft/speecht5_tts")
    # Swap in the fine-tuned repo's tokenizer in place of the stock one.
    processor.tokenizer = AutoTokenizer.from_pretrained(
        finetuned_repo, use_auth_token=hub_token
    )

    model = SpeechT5ForTextToSpeech.from_pretrained(
        finetuned_repo, use_auth_token=hub_token
    ).to(device)
    vocoder = SpeechT5HifiGan.from_pretrained("microsoft/speecht5_hifigan").to(device)

    return model, processor, vocoder
# model, processor, vocoder = load_models_and_data() | |
# embeddings_dataset = load_dataset("Matthijs/cmu-arctic-xvectors", split="validation") | |
# speaker_embeddings = embeddings_dataset[7306]["xvector"] | |
# speaker_embeddings = torch.tensor(speaker_embeddings).to(device) | |
# default_embedding = speaker_embeddings | |
# (old, new) substring pairs that normalize_text applies in order via
# str.replace. No pair is enabled at the moment; disabled candidates:
#   ("۾", "مين")
#   ("۽", "ائين")
replacements = []
# Spelled-out words for the numbers the converter looks up directly:
# 0-19, the round tens 20-90, 100 and 1000.
number_words = {
    0: "ٻڙي",
    1: "هڪ",
    2: "ٻہ",
    3: "ٽي",
    4: "چار",
    5: "پنج",
    6: "ڇه",
    7: "ست",
    8: "اٺ",
    9: "نوه",
    10: "ڏهہ",
    11: "يارنهن",
    12: "ٻارنهن",
    13: "تيرنهن",
    14: "چوڏنهن",
    15: "پنڌرنهن",
    16: "سورنهن",
    17: "سترنهن",
    18: "ارڙنهن",
    19: "اوڻينهن",
    20: "ويهہ",
    30: "ٽيهہ",
    40: "چاليهہ",
    50: "پنجها",
    60: "سٺ",
    70: "ستر",
    80: "اسي",
    90: "نوي",
    100: "سوه",
    1000: "هزار",
}


def _append_remainder(head, remainder):
    """Return *head* followed by the spelled-out *remainder* when it is non-zero."""
    if not remainder:
        return head
    return head + " " + number_to_words(remainder)


def number_to_words(number):
    """Spell out an integer using the entries of ``number_words``.

    Composition rules:
      * 0-19 are looked up directly.
      * 20-99 are written as "<unit> <tens>"; round tens are the tens word
        alone.
      * A single hundred / thousand is written without a leading "one";
        larger multiples are prefixed with their spelled-out count.
      * Millions and billions are always prefixed with their count.

    NOTE(review): the 21-99 forms are built mechanically from unit + tens;
    whether that matches natural spoken Sindhi should be confirmed by a
    speaker.

    Args:
        number: The integer to convert.

    Returns:
        The spelled-out number, or ``str(number)`` when the value is negative
        or is 10**12 or larger (outside the supported range).
    """
    # Outside the supported range: fall back to the plain digits instead of
    # failing with a KeyError (negative) or recursing into unknown scales.
    if number < 0 or number >= 1000000000000:
        return str(number)

    if number < 20:
        return number_words[number]

    if number < 100:
        tens, unit = divmod(number, 10)
        tens_word = number_words[tens * 10]
        # Round tens have no unit part. Returning the tens word alone avoids
        # the stray leading space that would otherwise leak into every larger
        # number ending in a round ten (e.g. 120, 20000).
        if not unit:
            return tens_word
        return number_words[unit] + " " + tens_word

    if number < 1000:
        hundreds, remainder = divmod(number, 100)
        head = number_words[100]
        if hundreds > 1:
            head = number_words[hundreds] + " " + head
        return _append_remainder(head, remainder)

    if number < 1000000:
        thousands, remainder = divmod(number, 1000)
        head = number_words[1000]
        if thousands > 1:
            head = number_to_words(thousands) + " " + head
        return _append_remainder(head, remainder)

    if number < 1000000000:
        millions, remainder = divmod(number, 1000000)
        return _append_remainder(number_to_words(millions) + " ملين", remainder)

    billions, remainder = divmod(number, 1000000000)
    return _append_remainder(number_to_words(billions) + " بلين", remainder)
def replace_numbers_with_words(text):
    """Return *text* with each standalone digit run spelled out in words.

    A digit run counts as standalone when it has a word boundary on both
    sides, so digits glued to letters (e.g. ``abc123``) are left untouched.
    Each match is converted with ``int`` and passed to ``number_to_words``.
    """
    standalone_number = re.compile(r'\b\d+\b')
    return standalone_number.sub(
        lambda found: number_to_words(int(found.group())),
        text,
    )
def normalize_text(text):
    """Prepare raw input text for synthesis.

    Steps, in order:
      1. Lowercase the text.
      2. Expand the abbreviation "ع" that directly follows a number into
         "عيسوي".
      3. Spell out standalone numbers via ``replace_numbers_with_words``.
      4. Apply every ``(old, new)`` pair from the module-level
         ``replacements`` table.

    Punctuation is left untouched (an earlier stripping step is disabled).

    Args:
        text: The input string.

    Returns:
        The normalized string.
    """
    text = text.lower()

    # Expand the abbreviation only when "ع" stands alone after the digits.
    # The (?!\w) guard keeps ordinary words that merely start with "ع"
    # (and text that already contains "عيسوي") from being rewritten.
    text = re.sub(r'(\d+)\s*ع(?!\w)', r'\1 عيسوي', text)

    text = replace_numbers_with_words(text)

    for old, new in replacements:
        text = text.replace(old, new)

    return text
def text_to_speech(text, audio_file=None):
    """Synthesize speech for *text* and return the path of the WAV file.

    The text is first passed through ``normalize_text`` (the result is printed
    for debugging), then rendered with the module-level Piper ``voice`` using
    ``synthesize_args``. The audio is written to a new ``<uuid4>.wav`` file in
    the current working directory.

    A SpeechT5-based generation path used to live here and is disabled; the
    function now relies on Piper only.

    Args:
        text: The text to speak.
        audio_file: Unused; kept so existing callers keep working.

    Returns:
        The relative path of the generated WAV file.

    Raises:
        Exception: Whatever normalization or synthesis raises is propagated
            unchanged.
    """
    normalized_text = normalize_text(text)
    print("normalized text: ", normalized_text)

    output_file = f"{uuid.uuid4()}.wav"
    try:
        with wave.open(output_file, "wb") as wav_file:
            voice.synthesize(normalized_text, wav_file, **synthesize_args)
    except Exception:
        # Synthesis failed: do not leave a truncated or invalid WAV file
        # behind in the working directory.
        try:
            os.remove(output_file)
        except OSError:
            # The file was never created or is already gone; the original
            # error is the one worth reporting.
            pass
        raise

    return output_file
# One text box in, one audio player out.
_text_input = gr.Textbox(
    label="Enter Sindhi text to convert to speech",
    value="هيلو ڪهڙا حال آهن",
)
_speech_output = gr.Audio(label="Generated Speech", type="numpy")

# Wire the components to text_to_speech.
iface = gr.Interface(
    fn=text_to_speech,
    inputs=[_text_input],
    outputs=[_speech_output],
    title="Sindhi SpeechT5 Text-to-Speech Demo",
    description="Enter Sindhi text, and listen to the generated speech.",
)

# Start the app; share=True additionally requests a public share link.
iface.launch(share=True)