Spaces:

ReneeYe
/

ConST-speech2text-translator

Build error

App Files Files Community

ConST-speech2text-translator / app.py

ReneeYe

update

715d968 almost 3 years ago

raw

history blame

5.49 kB

	# -*- coding: utf-8 -*-

	"""
	@Author : Rong Ye
	@Time : May 2022
	@Contact : yerong@bytedance
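	@Description: Gradio demo app for ConST, an end-to-end speech translator (English speech to text in eight European languages).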
	"""

	import os
	import traceback
	import shutil
	import yaml
	from pydub import AudioSegment
	import gradio as gr
	from huggingface_hub import snapshot_download


	# Language name offered in the UI dropdown -> two-letter target-language code.
	LANGUAGE_CODES = dict(
	    German="de",
	    Spanish="es",
	    French="fr",
	    Italian="it",
	    Netherlands="nl",
	    Portuguese="pt",
	    Romanian="ro",
	    Russian="ru",
	)

	# Per-language generation settings, keyed by target-language code. Every
	# language uses beam 10; only "lenpen" differs. get_vocab_and_yaml() writes
	# the selected entry into data/config.yaml.
	LANG_GEN_SETUPS = {
	    lang_code: {"beam": 10, "lenpen": length_penalty}
	    for lang_code, length_penalty in (
	        ("de", 0.7),
	        ("es", 0.1),
	        ("fr", 1.0),
	        ("it", 0.5),
	        ("nl", 0.4),
	        ("pt", 0.9),
	        ("ro", 1.0),
	        ("ru", 0.3),
	    )
	}

	# Bootstrap at import time: fetch the ConST sources, merge them into the
	# working directory (mv -n leaves files that already exist here untouched),
	# install the package in editable mode and create the scratch directories.
	# The commands run strictly in this order; their exit codes are not checked.
	for setup_command in (
	    "git clone https://github.com/ReneeYe/ConST",
	    "mv ConST ConST_git",
	    'mv -n ConST_git/* ./',
	    "rm -rf ConST_git",
	    # "python3 setup.py install",
	    # "python3 setup.py build_ext --inplace",
	    "pip3 install --editable ./",
	    "mkdir -p data checkpoint",
	):
	    os.system(setup_command)


	# Local directory of the model repository snapshot; the vocabulary files and
	# checkpoints used below are looked up underneath it.
	huggingface_model_dir = snapshot_download(repo_id="ReneeYe/ConST_en2x_models")
	print(huggingface_model_dir)


	def convert_audio_to_16k_wav(audio_input):
	    """Put a mono, 16 kHz WAV version of ``audio_input`` into ``data/``.

	    If the audio has more than one channel or a sample rate other than
	    16000, it is converted and exported as ``data/<name>_16k.wav``.
	    Otherwise the file is copied into ``data/`` under its own name.

	    Args:
	        audio_input: Path of the audio file to load.

	    Returns:
	        A ``(filename, num_frames)`` tuple: the name of the file inside
	        ``data/`` and the frame count of the audio stored there.
	    """
	    sound = AudioSegment.from_file(audio_input)
	    filename = os.path.basename(audio_input)

	    if sound.channels > 1 or sound.frame_rate != 16000:
	        # Convert to mono-channel 16k wav.
	        sound = sound.set_channels(1).set_frame_rate(16000)
	        # Drop only a trailing ".wav" extension. str.replace() would also
	        # remove ".wav" occurring in the middle of the name.
	        if filename.endswith(".wav"):
	            stem = filename[:-len(".wav")]
	        else:
	            stem = filename
	        filename = f"{stem}_16k.wav"
	        sound.export(f"data/{filename}", format="wav")
	    else:
	        shutil.copy(audio_input, f"data/{filename}")

	    # Frame count of whatever ended up in data/ (post-conversion if converted).
	    return filename, int(sound.frame_count())


	def prepare_tsv(file_name, n_frame, language, task="ST"):
	    """Write the single-sample manifest ``data/test_case.tsv``.

	    The file holds a header line and one row with id ``sample``. The target
	    and source text columns are filled with fixed placeholder sentences.

	    Args:
	        file_name: Audio file name written to the ``audio`` column.
	        n_frame: Frame count written to the ``n_frames`` column.
	        language: Key of ``LANGUAGE_CODES`` selecting the target language.
	        task: Accepted for interface compatibility; not used.
	    """
	    lang_code = LANGUAGE_CODES[language]
	    header = "id\taudio\tn_frames\ttgt_text\tspeaker\tsrc_lang\ttgt_lang\tsrc_text\n"
	    sample_row = f"sample\t{file_name}\t{n_frame}\tThis is in {lang_code}.\tspk.1\ten\t{lang_code}\tThis is English.\n"
	    with open("data/test_case.tsv", "w") as manifest:
	        manifest.writelines([header, sample_row])


	def get_vocab_and_yaml(language):
	    """Stage the vocabulary for ``language`` and write ``data/config.yaml``.

	    Copies ``spm_en<xx>.model`` and ``spm_en<xx>.txt`` from the downloaded
	    model snapshot into ``./data`` (always, even if they are already there),
	    then dumps the generation/config settings for that language to
	    ``data/config.yaml``.

	    Args:
	        language: Key of ``LANGUAGE_CODES`` selecting the target language.
	    """
	    tgt_lang = LANGUAGE_CODES[language]
	    vocab_stem = f"spm_en{tgt_lang}"
	    for extension in (".model", ".txt"):
	        shutil.copy(
	            os.path.join(huggingface_model_dir, f"vocabulary/{vocab_stem}{extension}"),
	            "./data",
	        )

	    # Absolute data directory; os.getcwd() avoids spawning a shell for `pwd`.
	    data_dir = os.path.join(os.getcwd(), "data")

	    # Work on a copy so the shared module-level LANG_GEN_SETUPS entry is not
	    # modified by this call.
	    yaml_dict = dict(LANG_GEN_SETUPS[tgt_lang])
	    yaml_dict.update(
	        {
	            "input_channels": 1,
	            "use_audio_input": True,
	            "prepend_tgt_lang_tag": True,
	            "prepend_src_lang_tag": True,
	            "audio_root": data_dir,
	            "vocab_filename": f"{vocab_stem}.txt",
	            "bpe_tokenizer": {
	                "bpe": "sentencepiece",
	                "sentencepiece_model": os.path.join(data_dir, f"{vocab_stem}.model"),
	            },
	        }
	    )
	    with open("data/config.yaml", "w") as f:
	        yaml.dump(yaml_dict, f)


	def get_model(language):
	    """Return the checkpoint path for ``language`` inside the model snapshot.

	    Nothing is downloaded or copied here; the path is built from
	    ``huggingface_model_dir`` and the language code.
	    """
	    checkpoint_name = f"models/const_en{LANGUAGE_CODES[language]}.pt"
	    return os.path.join(huggingface_model_dir, checkpoint_name)


	def generate(model_path):
	    """Run ``fairseq_cli/generate.py`` on ``data/`` and return the output text.

	    The generation command's output is shown and also saved to ``temp.txt``
	    via ``tee``. From that file, lines starting with ``D`` are kept, sorted
	    numerically on their second ``-``-separated field, and their third
	    tab-separated column is returned (presumably the hypothesis text;
	    confirm against the fairseq output format).

	    Args:
	        model_path: Path of the checkpoint passed to ``--path``.

	    Returns:
	        The extracted text, stripped of surrounding whitespace. This is an
	        empty string when no matching line was produced; the exit status of
	        the commands is not checked.
	    """
	    import shlex  # local import: only needed to quote the checkpoint path

	    # Use real shell pipes ("|"). An escaped "\|" would be handed to the
	    # script as a literal argument and temp.txt would never be written.
	    command = (
	        "python3 fairseq_cli/generate.py data/ --gen-subset test_case"
	        " --task speech_to_text --prefix-size 1"
	        " --max-tokens 4000000 --max-source-positions 4000000"
	        f" --config-yaml config.yaml --path {shlex.quote(model_path)}"
	        " | tee temp.txt"
	    )
	    os.system(command)

	    # The context manager closes the pipe once the output has been read.
	    with os.popen("grep ^D temp.txt | sort -n -k 2 -t '-' | cut -f 3") as hypotheses:
	        return hypotheses.read().strip()


	def remove_temp_files(audio_file):
	    """Delete the scratch files of one request.

	    Removes ``temp.txt``, ``data/test_case.tsv`` and ``data/<audio_file>``,
	    in that order. A missing file raises, as ``os.remove`` does.
	    """
	    for stale_path in ("temp.txt", "data/test_case.tsv", f"data/{audio_file}"):
	        os.remove(stale_path)


	def run(audio_file, language):
	    """Translate one recording; this is the callback wired into the UI.

	    Runs the pipeline in order: convert the audio, write the manifest,
	    stage vocabulary and config, locate the checkpoint, generate, and clean
	    up the scratch files.

	    Args:
	        audio_file: Path of the recorded or example audio file.
	        language: Key of ``LANGUAGE_CODES`` selecting the target language.

	    Returns:
	        The generated text, or the message from ``error_output`` if any
	        step raised.
	    """
	    try:
	        converted_audio_file, n_frame = convert_audio_to_16k_wav(audio_file)
	        prepare_tsv(converted_audio_file, n_frame, language)
	        get_vocab_and_yaml(language)
	        model_path = get_model(language)
	        generated_output = generate(model_path)
	        remove_temp_files(converted_audio_file)
	        return generated_output
	    except Exception:
	        # Top-level boundary for the UI: print the traceback and show a
	        # friendly message. Exception (not a bare except) lets
	        # KeyboardInterrupt and SystemExit propagate.
	        traceback.print_exc()
	        return error_output(language)


	def error_output(language):
	    """Build the message shown to the user when translation fails."""
	    message = f"Fail to translate the audio into {language}, you may use the examples I provide."
	    return message


	# Input widgets: a microphone recorder that hands run() a file path, and a
	# dropdown listing the supported target languages.
	microphone_input = gr.inputs.Audio(source="microphone", type="filepath", label="Record something (in English)...")
	language_dropdown = gr.inputs.Dropdown(list(LANGUAGE_CODES), default="German", label="From English to Languages X...")
	inputs = [microphone_input, language_dropdown]

	# Single-textbox interface around run(), with three bundled German examples.
	iface = gr.Interface(
	    fn=run,
	    inputs=inputs,
	    outputs=[gr.outputs.Textbox(label="The translation")],
	    examples=[[wav_name, "German"] for wav_name in ("case1.wav", "case2.wav", "case3.wav")],
	    title="ConST: an end-to-end speech translator",
	    description="End-to-end Speech Translation Live Demo for English to eight European languages.",
	    article=(
	        "ConST is an end-to-end speech translation model (see paper at https://arxiv.org/abs/2205.02444 ). "
	        "Its motivation is to use contrastive learning method to learn similar representations for semantically similar speech and text."
	    ),
	    theme="seafoam",
	    layout='vertical',
	)
	iface.launch()