import os

from tsmnet import Stretcher
import gradio as gr
from gradio import processing_utils
import torch
import torchaudio

model_root = './weights'
available_models = ['general', 'pop-music', 'classical-music', 'speech']
working_sr = 22050


def prepare_models():
    return {
        weight: Stretcher(os.path.join(model_root, f'{weight}.pt'))
        for weight in available_models
    }


def prepare_audio_file(rec, audio_file, yt_url):
    if rec is not None:
        return rec
    if audio_file is not None:
        return audio_file
    if yt_url != '':
        # The YouTube field is disabled in the UI; fail loudly here instead of
        # returning None and crashing later in torchaudio.load().
        raise gr.Error('YouTube support is under construction!')
    raise gr.Error('No audio found!')


def run(rec, audio_file, yt_url, speed, model, start_time, end_time):
    audio_file = prepare_audio_file(rec, audio_file, yt_url)

    if speed == 1:  # nothing to stretch, return the input as-is
        return processing_utils.audio_from_file(audio_file)

    model = models[model]

    x, sr = torchaudio.load(audio_file)
    x = torchaudio.transforms.Resample(orig_freq=sr, new_freq=working_sr)(x)
    sr = working_sr

    # Trim to the requested [start_time, end_time] window (in seconds) from
    # the fine-grained settings.
    x = x[..., int(start_time * sr):int(end_time * sr)]

    x = model(x, speed).cpu()

    torchaudio.save(audio_file, x, sr)
    return processing_utils.audio_from_file(audio_file)
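

# --- Illustrative sketch (not used by the app) --------------------------------
# The "Read more" accordion below describes time scaling as resizing the latent
# "Neuralgram" along its time axis before decoding, much like resizing an
# image. A minimal sketch of that idea, assuming hypothetical `encoder` and
# `decoder` modules; the actual implementation lives in tsmnet.Stretcher.
def stretch_via_neuralgram_sketch(x, rate, encoder, decoder):
    import torch.nn.functional as F
    neu = encoder(x)  # (batch, channels, frames); one frame covers ~1024 samples
    frames = max(1, round(neu.size(-1) / rate))  # rate > 1 -> shorter output
    neu = F.interpolate(neu, size=frames, mode='linear', align_corners=False)
    return decoder(neu)  # decode back to a time-scaled waveform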


# @@@@@@@ Start of the program @@@@@@@@
models = prepare_models()

with gr.Blocks() as demo:
    gr.Markdown('# TSM-Net')
    gr.Markdown('---')
    with gr.Row():
        with gr.Column():
            with gr.Tab('From microphone'):
                rec_box = gr.Audio(label='Recording', source='microphone', type='filepath')
            with gr.Tab('From file'):
                audio_file_box = gr.Audio(label='Audio sample', type='filepath')
            with gr.Tab('From YouTube'):
                yt_url_box = gr.Textbox(label='YouTube URL', placeholder='Under Construction', interactive=False)

            # Choosing one input source clears the other two.
            rec_box.change(lambda: [None] * 2, outputs=[audio_file_box, yt_url_box])
            audio_file_box.change(lambda: [None] * 2, outputs=[rec_box, yt_url_box])
            yt_url_box.input(lambda: [None] * 2, outputs=[rec_box, audio_file_box])

            speed_box = gr.Slider(label='Playback speed', minimum=0, maximum=2, value=1)

            with gr.Accordion('Fine-grained settings', open=False):
                with gr.Row():
                    gr.Textbox(label='', value='Trim audio sample', interactive=False)
                    start_time_box = gr.Number(label='Start', value=0)
                    end_time_box = gr.Number(label='End', value=20)
                model_box = gr.Dropdown(label='Model weight', choices=available_models, value=available_models[0])

            submit_btn = gr.Button('Submit')

        with gr.Column():
            with gr.Accordion('Hint', open=False):
                gr.Markdown('You can find more settings under **Fine-grained settings**.')
                gr.Markdown('- Feeling slow? Try adjusting the start/end timestamps.')
                gr.Markdown('- Low audio quality? Try switching to a more suitable model weight.')

            outputs = gr.Audio(label='Output')

    submit_btn.click(fn=run, inputs=[
        rec_box,
        audio_file_box,
        yt_url_box,
        speed_box,
        model_box,
        start_time_box,
        end_time_box,
    ], outputs=outputs)

    with gr.Accordion('Read more ...', open=False):
        gr.Markdown('---')
        gr.Markdown(
            'We propose a novel approach to time-scale modification of audio '
            'signals. While traditional methods rely on the framing technique, '
            'spectral approaches use the short-time Fourier transform to '
            'preserve frequency during temporal stretching. TSM-Net, our '
            'neural-network model, encodes raw audio into a high-level latent '
            'representation we call the Neuralgram, in which one vector '
            'represents 1024 audio samples. It is inspired by the framing '
            'technique but avoids its clipping artifacts. Since the Neuralgram '
            'is a two-dimensional real-valued matrix, we can apply existing '
            'image-resizing techniques to it and decode the result with our '
            'neural decoder to obtain the time-scaled audio. Both the encoder '
            'and the decoder are trained with GANs, which gives them fair '
            'generalization ability on scaled Neuralgrams. Our method '
            'introduces few artifacts and opens a new direction in the '
            'research of modern time-scale modification. Please find more '
            'details in our paper.'
        )

demo.queue(4)
demo.launch(server_name='0.0.0.0')
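
# To try the demo locally (assuming pretrained weights are present under
# ./weights), run this script directly; Gradio listens on its default port
# 7860, so browse to http://localhost:7860 (or the host's address, since
# server_name='0.0.0.0' binds all interfaces).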