Spaces:

ernestchu
/

tsm-net

Running

tsm-net / app.py

ernestchu

init

32bac05 over 1 year ago

4.63 kB

	import os
	from tsmnet import Stretcher
	import gradio as gr
	from gradio import processing_utils
	import torch
	import torchaudio

	model_root = './weights'
	available_models = ['general', 'pop-music', 'classical-music', 'speech']
	working_sr = 22050

	def prepare_models():
	return {
	weight: Stretcher(os.path.join(model_root, f'{weight}.pt'))
	for weight in available_models
	}

	def prepare_audio_file(rec, audio_file, yt_url):
	if rec is not None:
	return rec
	if audio_file is not None:
	return audio_file
	if yt_url != '':
	pass
	else:
	raise gr.Error('No audio found!')


	def run(rec, audio_file, yt_url, speed, model, start_time, end_time):
	audio_file = prepare_audio_file(rec, audio_file, yt_url)
	if speed == 1:
	return processing_utils.audio_from_file(audio_file)

	model = models[model]

	x, sr = torchaudio.load(audio_file)
	x = torchaudio.transforms.Resample(orig_freq=sr, new_freq=working_sr)(x)
	sr = working_sr

	x = model(x, speed).cpu()

	torchaudio.save(audio_file, x, sr)

	return processing_utils.audio_from_file(audio_file)


	# @@@@@@@ Start of the program @@@@@@@@

	models = prepare_models()

	with gr.Blocks() as demo:
	gr.Markdown('# TSM-Net')
	gr.Markdown('---')
	with gr.Row():
	with gr.Column():
	with gr.Tab('From microphone'):
	rec_box = gr.Audio(label='Recording', source='microphone', type='filepath')
	with gr.Tab('From file'):
	audio_file_box = gr.Audio(label='Audio sample', type='filepath')
	with gr.Tab('From YouTube'):
	yt_url_box = gr.Textbox(label='YouTube URL', placeholder='Under Construction', interactive=False)

	rec_box.change(lambda: [None] * 2, outputs=[audio_file_box, yt_url_box])
	audio_file_box.change(lambda: [None] * 2, outputs=[rec_box, yt_url_box])
	yt_url_box.input(lambda: [None] * 2, outputs=[rec_box, audio_file_box])

	speed_box = gr.Slider(label='Playback speed', minimum=0, maximum=2, value=1)
	with gr.Accordion('Fine-grained settings', open=False):
	with gr.Row():
	gr.Textbox(label='', value='Trim audio sample', interactive=False)
	start_time_box = gr.Number(label='Start', value=0)
	end_time_box = gr.Number(label='End', value=20)

	model_box = gr.Dropdown(label='Model weight', choices=available_models, value=available_models[0])

	submit_btn = gr.Button('Submit')

	with gr.Column():
	with gr.Accordion('Hint', open=False):
	gr.Markdown('You can find more settings under the Fine-grained settings')
	gr.Markdown('- Feeling slow? Try to adjust the start/end timestamp')
	gr.Markdown('- Low audio quality? Try to switch to a proper model weight')
	outputs=gr.Audio(label='Output')

	submit_btn.click(fn=run, inputs=[
	rec_box,
	audio_file_box,
	yt_url_box,
	speed_box,
	model_box,
	start_time_box,
	end_time_box,
	], outputs=outputs)

	with gr.Accordion('Read more ...', open=False):
	gr.Markdown('---')
	gr.Markdown(
	'We proposed a novel approach in the field of time-scale modification '
	'on audio signals. While traditional methods use the framing technique, '
	'spectral approach uses the short-time Fourier transform to preserve '
	'the frequency during temporal stretching. TSM-Net, our neural-network '
	'model encodes the raw audio into a high-level latent representation. '
	'We call it Neuralgram, in which one vector represents 1024 audio samples. '
	'It is inspired by the framing technique but addresses the clipping '
	'artifacts. The Neuralgram is a two-dimensional matrix with real values, '
	'we can apply some existing image resizing techniques on the Neuralgram '
	'and decode it using our neural decoder to obtain the time-scaled audio. '
	'Both the encoder and decoder are trained with GANs, which shows fair '
	'generalization ability on the scaled Neuralgrams. Our method yields '
	'little artifacts and opens a new possibility in the research of modern '
	'time-scale modification. Please find more detail in our '
	'<a href="https://arxiv.org/abs/2210.17152" target="_blank">paper</a>.'
	)

	demo.queue(4)
	demo.launch(server_name='0.0.0.0')