# Based on example code of https://huggingface.co/facebook/m2m100_1.2B
# and https://github.com/wannaphong/ttsmms
# See also https://github.com/facebookresearch/fairseq/blob/main/examples/mms/README.md
import gradio as gr
import os
import re
import soundfile as sf
import json
import nltk
from underthesea import sent_tokenize as vie_sent_tokenize  # Vietnamese NLP toolkit
from underthesea import text_normalize as vie_text_normalize
from nltk import sent_tokenize as nltk_sent_tokenize
from ttsmms import download
from ttsmms import TTS
from collections import OrderedDict
import uuid
import datetime
import shutil
from num2words import num2words
this_description = """Text To Speech for [1000+ languages](https://dl.fbaipublicfiles.com/mms/tts/all-tts-languages.html) - using [fairseq MMS TTS](https://github.com/facebookresearch/fairseq/blob/main/examples/mms/README.md) and [ttsmms](https://github.com/wannaphong/ttsmms) wrapper. | |
Please note that for some languages, it may not pronounce all words correctly (yet). | |
""" | |
nltk.download("punkt") | |
# Pre-download some languages | |
tts_models = {} | |
eng_path = download("eng", "./data") | |
tts_models["eng"] = eng_path | |
vie_path = download("vie", "./data") | |
tts_models["vie"] = vie_path | |
mya_path = download("mya", "./data") | |
tts_models["mya"] = mya_path | |
lang_codes = OrderedDict()
language_names = list(lang_codes.keys())
with open("lang_code.txt", "r") as file:
    for line in file:
        line = line.strip()
        if line.startswith("----"):
            continue
        iso, lang = line.split("\t", 1)
        lang_codes[lang + " (" + iso + ")"] = iso
language_names = list(lang_codes.keys())
# Load num2words_lang_map
with open("num2words_lang_map.json") as f:
    num2words_lang_map = json.load(f, object_pairs_hook=OrderedDict)
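# Assumed shape of num2words_lang_map (the JSON file is not shown here):
# {"eng": ["en"], "vie": ["vi"], ...}, i.e. an ISO 639-3 code mapped to a list whose
# first entry is the language code that num2words understands.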
def convert_numbers_to_words_num2words(text, lang):
    # Find all numbers in the text using regex
    numbers = re.findall(r"\d+", text)
    # Sort numbers in descending order of length so that longer numbers are
    # replaced before any shorter number that is a substring of them
    sorted_numbers = sorted(numbers, key=len, reverse=True)
    print(sorted_numbers)
    # Replace numbers with their word equivalents
    for number in sorted_numbers:
        number_word = num2words(int(number), lang=num2words_lang_map[lang][0])
        text = text.replace(number, number_word)
    return text
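# For example, assuming num2words_lang_map maps "eng" to "en":
# convert_numbers_to_words_num2words("I have 23 cats", "eng") -> "I have twenty-three cats"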
def convert_mya_numbers_to_words(text):
    from mm_num2word import mm_num2word, extract_num

    numbers = extract_num(text)
    sorted_numbers = sorted(numbers, key=len, reverse=True)
    print(sorted_numbers)
    for n in sorted_numbers:
        text = text.replace(n, mm_num2word(n))
    return text
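# mm_num2word is assumed to be a helper module shipped alongside this script:
# extract_num returns the number strings found in the Burmese text and
# mm_num2word spells each of them out in Burmese words before substitution.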
def prepare_sentences(text, lang="mya"):
    sentences = []
    # Pre-process the text for some languages
    if lang.lower() == "mya":
        text = convert_mya_numbers_to_words(text)
        # Map Myanmar section marks (U+104A, U+104B) to comma and full stop
        text = text.replace("\u104A", ",").replace("\u104B", ".")
    if lang in num2words_lang_map:
        print("num2words supports this lang", lang)
        text = convert_numbers_to_words_num2words(text, lang)
        print("Processed text", text)
    # Not sure why, but lowercasing fixes the unclear pronunciation of the first word for vie
    text = text.lower()
    paragraphs = [paragraph for paragraph in text.split("\n") if paragraph.strip()]
    if lang.lower() == "vie":
        for paragraph in paragraphs:
            sentences_raw = vie_sent_tokenize(paragraph)
            sentences.extend(
                [
                    vie_text_normalize(sentence)
                    for sentence in sentences_raw
                    if sentence.strip()
                ]
            )
    else:
        sentences = [
            sentence
            for paragraph in paragraphs
            for sentence in nltk_sent_tokenize(paragraph)
            if sentence.strip()
        ]
    return sentences
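# Rough example for a non-Vietnamese language (digits omitted so the num2words
# branch does not matter):
# prepare_sentences("Hello there. How are you?\nSee you soon.", "eng")
# should return something like ["hello there.", "how are you?", "see you soon."]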
def list_dir(lang):
    # Get the current directory
    current_dir = os.getcwd()
    print(current_dir)
    # List all files in the current directory
    files = os.listdir(current_dir)
    # Filter the list to include only WAV files
    wav_files = [file for file in files if file.endswith(".wav")]
    print("Total wav files:", len(wav_files))
    # Print the last WAV file
    sorted_list = sorted(wav_files)
    print(lang, sorted_list[-1])
def combine_wav(source_dir, stamp, lang):
    # Get a list of all WAV files in the folder
    wav_files = [file for file in os.listdir(source_dir) if file.endswith(".wav")]
    # Sort the files alphabetically to ensure the correct order of combination
    wav_files.sort()
    # Combine the WAV files
    combined_data = []
    for file in wav_files:
        file_path = os.path.join(source_dir, file)
        data, sr = sf.read(file_path)
        combined_data.extend(data)
    # Save the combined audio to a new WAV file
    combined_file_path = f"{stamp}_{lang}.wav"
    sf.write(combined_file_path, combined_data, sr)
    shutil.rmtree(source_dir)
    list_dir(lang)
    # Return the path so the Hugging Face Space app can display the combined audio
    return combined_file_path
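# Note: this simply concatenates the raw sample arrays; it assumes every per-sentence
# chunk was synthesised by the same model and therefore shares the same sample rate (sr).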
def mms_tts(Input_Text, lang_name="Burmese (mya)"):
    # lang_code = lang_codes[lang_name]
    try:
        lang_code = lang_codes[lang_name]
    except KeyError:
        lang_code = "mya"
    user_model = download(lang_code, "./data")
    tts = TTS(user_model)
    sentences = prepare_sentences(Input_Text, lang_code)
    # output_dir = f"out_{lang_code}"
    current_datetime = datetime.datetime.now()
    timestamp = current_datetime.strftime("%Y%m%d%H%M%S%f")
    user_dir = f"u_{timestamp}"
    if os.path.exists(user_dir):
        session_id = str(uuid.uuid4())  # Generate a random session ID
        user_dir = f"u_{session_id}_{timestamp}"
    os.makedirs(user_dir, exist_ok=True)
    print("New user directory", user_dir)
    for i, sentence in enumerate(sentences):
        tts.synthesis(sentence, wav_path=f"{user_dir}/s_{str(i).zfill(10)}.wav")
    combined_file_path = combine_wav(user_dir, timestamp, lang_code)
    return combined_file_path
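# Example call, assuming "English (eng)" is one of the names built from lang_code.txt:
# mms_tts("Hello world. This is a test.", "English (eng)") downloads the English MMS
# model, writes one WAV per sentence into a temporary folder and returns the path of
# the combined file, e.g. "20240101123000000000_eng.wav".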
# common_languages = ["eng", "mya", "vie"] # List of common language codes | |
iface = gr.Interface( | |
fn=mms_tts, | |
title="Massively Multilingual Speech (MMS) - Text To Speech", | |
description=this_description, | |
inputs=[ | |
gr.Textbox(lines=5, placeholder="Enter text (unlimited sentences)", label="Input text (unlimited sentences)"), | |
gr.Dropdown( | |
choices=language_names, | |
label="Select language 1,000+", | |
value="Burmese (mya)", | |
), | |
], | |
outputs="audio", | |
) | |
# outputs=[ | |
# "audio", | |
# gr.File(label="Download", type="file", download_to="done.wav") | |
# ]) | |
iface.launch() | |