import os |
import time |
from tsmnet import Stretcher |
import gradio as gr |
from gradio import processing_utils |
import torch |
import numpy as np |
import torchaudio |
import yt_dlp |
import noisereduce as nr |
# Directory holding one `<name>.pt` checkpoint per entry in `available_models`.
model_root = './weights'
# Directory where audio fetched from YouTube is written.
yt_dl_dir = 'yt-audio'
# Names of the selectable model weights; the first entry is the UI default.
available_models = ['speech', 'pop-music', 'classical-music']
# Sample rate (Hz) every input is resampled to before processing.
working_sr = 22050
def prepare_models():
    """Build one ``Stretcher`` per entry of ``available_models``.

    Each model is constructed from ``<model_root>/<name>.pt``.

    Returns:
        dict: Maps every name in ``available_models`` to its ``Stretcher``.
    """
    loaded = {}
    for name in available_models:
        checkpoint = os.path.join(model_root, f'{name}.pt')
        loaded[name] = Stretcher(checkpoint)
    return loaded
def _remove_stale_downloads(directory, min_age_days=2):
    """Delete regular files in *directory* last modified >= *min_age_days* ago.

    This replaces a ``find <dir> ... -mtime +1 -delete`` shell call.  find's
    ``-mtime +1`` matches files whose age in whole days exceeds one, i.e.
    files at least two days old, hence the default of 2.

    The cleanup is best-effort: a missing directory or a file that vanishes
    or cannot be removed is skipped so housekeeping never blocks a download.
    Only the top level of *directory* is scanned.
    """
    cutoff = time.time() - min_age_days * 24 * 60 * 60
    try:
        with os.scandir(directory) as entries:
            stale = [
                entry.path
                for entry in entries
                if entry.is_file() and entry.stat().st_mtime <= cutoff
            ]
    except OSError:
        # Directory missing or unreadable: nothing to clean up.
        return
    for path in stale:
        try:
            os.remove(path)
        except OSError:
            # Already removed (e.g. by a concurrent request) or not removable.
            continue


def download_yt_audio(url):
    """Download the audio of *url* as WAV and return a private copy of it.

    The extracted audio is stored as ``<yt_dl_dir>/<video id>.wav`` and is
    reused if that file already exists.  A fresh, timestamp-named copy is
    returned so the caller may overwrite it without touching the stored
    download.

    Args:
        url: Address of the video to fetch, as accepted by yt-dlp.

    Returns:
        str: Path of the newly created copy inside ``yt_dl_dir``.

    Raises:
        gr.Error: If yt-dlp fails to download, or the resulting audio file
            cannot be copied (for instance because extraction produced no
            WAV file).
    """
    # Local import: this is the only place in the file that needs shutil.
    import shutil

    # Housekeeping is done in-process rather than through a shell command.
    _remove_stale_downloads(yt_dl_dir)

    ydl_opts = {
        'format': 'm4a/bestaudio/best',
        'postprocessors': [{
            'key': 'FFmpegExtractAudio',
            'preferredcodec': 'wav',
        }],
        'outtmpl': f"{yt_dl_dir}/%(id)s.%(ext)s"
    }
    with yt_dlp.YoutubeDL(ydl_opts) as ydl:
        try:
            ydl.cache.remove()
            # Resolve the id first so an already-downloaded file can be reused.
            meta = ydl.extract_info(url, download=False)
            audio_file = os.path.join(yt_dl_dir, meta['id'] + '.wav')
            if not os.path.isfile(audio_file):
                ydl.download([url])
        except yt_dlp.DownloadError as error:
            raise gr.Error(f'Failed to download from YouTube: {error}') from error

    new_audio_file = os.path.join(os.path.dirname(audio_file), f'{time.time()}.wav')
    try:
        # No shell involved: the id comes from remote metadata and must not be
        # interpolated into a command line.
        shutil.copyfile(audio_file, new_audio_file)
    except OSError as error:
        raise gr.Error(f'Failed to download from YouTube: {error}') from error
    return new_audio_file
def prepare_audio_file(rec, audio_file, yt_url):
    """Pick the audio source to process, in priority order.

    Args:
        rec: Path of a microphone recording, or None.
        audio_file: Path of an uploaded file, or None.
        yt_url: URL text, or None/blank when no URL was given.

    Returns:
        ``rec`` if set, else ``audio_file`` if set, else the path returned by
        ``download_yt_audio`` for the given URL.

    Raises:
        gr.Error: If none of the three sources is provided.
    """
    if rec is not None:
        return rec
    if audio_file is not None:
        return audio_file

    # The URL box can be reset to None by the UI, and may hold only
    # whitespace; both mean "no URL" and must not reach the downloader.
    url = yt_url.strip() if isinstance(yt_url, str) else ''
    if url:
        return download_yt_audio(url)
    raise gr.Error('No audio found!')
def run(rec, audio_file, yt_url, denoise, speed, model, start_time, end_time):
    """Trim, time-stretch and optionally denoise the selected audio.

    The chosen input file is overwritten in place with the processed audio.

    Args:
        rec: Path of a microphone recording, or None.
        audio_file: Path of an uploaded file, or None.
        yt_url: URL text, or None/blank.
        denoise: Whether to run noise reduction on the stretched audio.
        speed: Playback speed factor; 1 skips the model (and denoising).
        model: Key into the module-level ``models`` mapping.
        start_time: Trim start in seconds; None means the beginning.
        end_time: Trim end in seconds; None means the end of the audio.

    Returns:
        The result of ``processing_utils.audio_from_file`` for the processed
        file.

    Raises:
        gr.Error: If no input is available or the trimmed range is empty.
    """
    audio_file = prepare_audio_file(rec, audio_file, yt_url)

    x, sr = torchaudio.load(audio_file)
    x = torchaudio.transforms.Resample(orig_freq=sr, new_freq=working_sr)(x)
    sr = working_sr

    # NOTE(review): a cleared number field is assumed to arrive as None;
    # fall back to the clip boundaries instead of failing on None * sr.
    first = 0 if start_time is None else int(start_time * sr)
    last = None if end_time is None else int(end_time * sr)
    x = x[:, first:last]
    if x.shape[-1] == 0:
        # Nothing left to process (e.g. end <= start or start past the end).
        raise gr.Error('The selected time range contains no audio!')

    if speed != 1:
        x = models[model](x, speed).cpu()
        # torchaudio.save expects a 2-D (channels, time) tensor, so promote a
        # 1-D result whether or not denoising is enabled.
        if x.dim() == 1:
            x = x[None]
        if denoise:
            # Noise reduction runs per channel on NumPy arrays.
            cleaned = [nr.reduce_noise(y=y, sr=sr) for y in x.numpy()]
            x = torch.from_numpy(np.stack(cleaned))

    torchaudio.save(audio_file, x, sr)
    return processing_utils.audio_from_file(audio_file)
# Build every model once at import time; `run` looks them up by name.
models = prepare_models()
# Make sure the YouTube download directory exists before it is written to.
os.makedirs(yt_dl_dir, exist_ok=True)
# UI definition.
# NOTE(review): the nesting below was reconstructed from a flattened source;
# confirm the placement of `denoise_box`, `model_box` and `outputs` against
# the intended layout.
with gr.Blocks() as demo:
    gr.Markdown('# TSM-Net')
    gr.Markdown('---')
    with gr.Row():
        # Left column: input selection and processing settings.
        with gr.Column():
            # Three alternative input sources, one per tab.
            with gr.Tab('From microphone'):
                rec_box = gr.Audio(label='Recording', source='microphone', type='filepath')
            with gr.Tab('From YouTube'):
                yt_url_box = gr.Textbox(label='YouTube URL', placeholder='https://youtu.be/q6EoRBvdVPQ')
            with gr.Tab('From file'):
                audio_file_box = gr.Audio(label='Audio sample', type='filepath')
            denoise_box = gr.Checkbox(label='Speech enhancement (should be off for music)', value=True)
            # Touching one input source resets the other two to None, so at
            # most one source is populated when the form is submitted.
            rec_box.change(lambda: [None] * 2, outputs=[audio_file_box, yt_url_box])
            audio_file_box.change(lambda: [None] * 2, outputs=[rec_box, yt_url_box])
            yt_url_box.input(lambda: [None] * 2, outputs=[rec_box, audio_file_box])
            speed_box = gr.Slider(label='Playback speed', minimum=0.25, maximum=2, value=1)
            with gr.Accordion('Fine-grained settings', open=False):
                with gr.Tab('Trim audio sample (sec)'):
                    with gr.Row():
                        start_time_box = gr.Number(label='Start', value=0)
                        end_time_box = gr.Number(label='End', value=60)
                model_box = gr.Dropdown(label='Model weight', choices=available_models, value=available_models[0])
            submit_btn = gr.Button('Submit')
        # Right column: usage hints and the processed result.
        with gr.Column():
            with gr.Accordion('Hint', open=False):
                gr.Markdown('You can find more settings under the **Fine-grained settings**')
                gr.Markdown('- Waiting too long? Try to adjust the start/end timestamp')
                gr.Markdown('- Low audio quality? Try to switch to a proper model weight')
            outputs=gr.Audio(label='Output')
    # The input order here must match the parameter order of `run`.
    submit_btn.click(fn=run, inputs=[
        rec_box,
        audio_file_box,
        yt_url_box,
        denoise_box,
        speed_box,
        model_box,
        start_time_box,
        end_time_box,
    ], outputs=outputs)
    # Collapsed background section describing the method.
    with gr.Accordion('Read more ...', open=False):
        gr.Markdown('---')
        gr.Markdown(
            'We proposed a novel approach in the field of time-scale modification '
            'on audio signals. While traditional methods use the framing technique, '
            'spectral approach uses the short-time Fourier transform to preserve '
            'the frequency during temporal stretching. TSM-Net, our neural-network '
            'model encodes the raw audio into a high-level latent representation. '
            'We call it Neuralgram, in which one vector represents 1024 audio samples. '
            'It is inspired by the framing technique but addresses the clipping '
            'artifacts. The Neuralgram is a two-dimensional matrix with real values, '
            'we can apply some existing image resizing techniques on the Neuralgram '
            'and decode it using our neural decoder to obtain the time-scaled audio. '
            'Both the encoder and decoder are trained with GANs, which shows fair '
            'generalization ability on the scaled Neuralgrams. Our method yields '
            'little artifacts and opens a new possibility in the research of modern '
            'time-scale modification. Please find more detail in our '
            '<a href="https://arxiv.org/abs/2210.17152" target="_blank">paper</a>.'
        )
# NOTE(review): the positional 4 is presumably the queue's concurrency
# setting — confirm against the Gradio version this app is pinned to.
demo.queue(4)
# NOTE(review): an empty server_name looks like a placeholder; confirm the
# intended bind address.
demo.launch(server_name='')