Spaces:
Running
Running
File size: 4,172 Bytes
9f252ff 4e64ba6 9f252ff 4e64ba6 9f252ff b4dd7e7 9f252ff 4e64ba6 9f252ff 4e64ba6 9f252ff 4e64ba6 9f252ff 4e64ba6 9f252ff 4e64ba6 9f252ff 4e64ba6 9f252ff 4e64ba6 9f252ff 4e64ba6 9f252ff 4e64ba6 9f252ff 4e64ba6 9f252ff 4e64ba6 9f252ff 4e64ba6 9f252ff 4e64ba6 9f252ff 4e64ba6 9f252ff 4e64ba6 9f252ff 4e64ba6 9f252ff 4e64ba6 9f252ff 4e64ba6 9f252ff 4e64ba6 b4dd7e7 9f252ff 4e64ba6 b4dd7e7 4e64ba6 b4dd7e7 4e64ba6 9f252ff 4e64ba6 |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 |
import gradio as gr
import numpy as np
import os
import torch
from timeit import default_timer as timer
from model.bart import BartCaptionModel
from utils.audio_utils import load_audio, STR_CH_FIRST
# One-time bootstrap: on the first run, fetch the pretrained transfer
# checkpoint and the example audio clips from the Hugging Face Hub.
# The checkpoint file doubles as the "already downloaded" marker.
if not os.path.isfile("transfer.pth"):
    _HUB_ROOT = "https://huggingface.co/seungheondoh/lp-music-caps/resolve/main"
    for _fname in ("transfer.pth", "folk.wav", "electronic.mp3", "orchestra.wav"):
        # download_url_to_file streams to disk, so large files are handled safely.
        torch.hub.download_url_to_file(f"{_HUB_ROOT}/{_fname}", _fname)
# Pick the compute device, register the demo's example clips, and restore
# the captioning model from the downloaded transfer checkpoint.
device = "cuda" if torch.cuda.is_available() else "cpu"
example_list = ["folk.wav", "electronic.mp3", "orchestra.wav"]

model = BartCaptionModel(max_length=128)
# Load on CPU first so the checkpoint restores even on CUDA-less hosts.
checkpoint = torch.load("./transfer.pth", map_location="cpu")
model.load_state_dict(checkpoint["state_dict"])
if torch.cuda.is_available():
    torch.cuda.set_device(device)
    model = model.cuda(device)
# Inference only: disable dropout/batch-norm training behavior.
model.eval()
def get_audio(audio_path, duration=10, target_sr=16000):
    """Load an audio file and slice it into fixed-length chunks.

    Args:
        audio_path: Path of the audio file to load.
        duration: Length of each chunk, in seconds.
        target_sr: Sample rate the audio is resampled to.

    Returns:
        A float32 tensor of shape (num_chunks, duration * target_sr);
        any trailing remainder shorter than one chunk is discarded.
    """
    chunk_len = int(duration * target_sr)
    audio, sr = load_audio(
        path=audio_path,
        ch_format=STR_CH_FIRST,
        sample_rate=target_sr,
        downmix_to_mono=True,
    )
    # Defensive downmix in case a (channels, samples) array slips through
    # despite downmix_to_mono above.
    if audio.ndim == 2:
        audio = audio.mean(0, False)
    # Zero-pad clips shorter than one chunk so at least one chunk exists.
    if audio.shape[-1] < chunk_len:
        padded = np.zeros(chunk_len)
        padded[: audio.shape[-1]] = audio
        audio = padded
    num_chunks = audio.shape[-1] // chunk_len
    stacked = np.stack(np.split(audio[: num_chunks * chunk_len], num_chunks))
    return torch.from_numpy(stacked.astype("float32"))
def captioning(audio_path):
    """Generate one caption per 10-second chunk of the given audio file.

    Args:
        audio_path: Path of the audio file to caption.

    Returns:
        A single string containing a timestamp label followed by the
        generated caption for each chunk.
    """
    audio_tensor = get_audio(audio_path=audio_path)
    if torch.cuda.is_available():
        audio_tensor = audio_tensor.to(device)

    # Beam-search decoding; no gradients are needed at inference time.
    with torch.no_grad():
        captions = model.generate(
            samples=audio_tensor,
            num_beams=5,
        )

    pieces = []
    # zip() truncates to the shorter of (chunk count, caption count).
    for idx, text in zip(range(audio_tensor.shape[0]), captions):
        # NOTE(review): chunks are 10 seconds long, so "[10:00-20:00]"
        # reads like minutes but denotes seconds — confirm intent.
        stamp = f"[{idx * 10}:00-{(idx + 1) * 10}:00]"
        pieces.append(f"{stamp}\n{text} \n \n")
    return "".join(pieces)
def load_css():
    """Read the demo stylesheet and return its contents as a string.

    Returns:
        The text of ``static/css/musicapp.css``.

    Raises:
        FileNotFoundError: If the stylesheet is missing.
    """
    # Explicit encoding avoids the platform-dependent default (e.g. cp1252
    # on Windows) mangling any non-ASCII characters in the stylesheet.
    with open("static/css/musicapp.css", "r", encoding="utf-8") as file:
        return file.read()
# --- UI copy shown by the Gradio interface (title, HTML description, footer).
title = "Capabara - Interactive demo: Music Captioning 🤖🎵"
description = """
<p style='text-align: center'> LP-MusicCaps: LLM-Based Pseudo Music Captioning</p>
<p style='text-align: center'> SeungHeon Doh, Keunwoo Choi, Jongpil Lee, Juhan Nam, ISMIR 2023</p>
<p style='text-align: center'> <a href='https://arxiv.org/abs/2307.16372' target='_blank'>ArXiv</a> | <a href='https://github.com/seungheondoh/lp-music-caps' target='_blank'>Codes</a> | <a href='https://huggingface.co/datasets/seungheondoh/LP-MusicCaps-MC' target='_blank'>Dataset</a> </p>
<p style='text-align: center'> To use it, simply upload your audio and click 'submit', or click one of the examples to load them. Read more at the links below. </p>
<p style='text-align: center'> If you have any error, plz check this code: <a href='https://github.com/seungheondoh/lp-music-caps/blob/main/demo/app.py' target='_blank'>Demo</a>. </p>
"""
article = "<p style='text-align: center'><a href='https://seungheondoh.github.io/' target='_blank'>Author Info</a> | <a href='https://github.com/seungheondoh' target='_blank'>Github</a></p>"

# --- Wire the captioning function into a Gradio Interface and start the app.
demo = gr.Interface(
    fn=captioning,  # one audio file path in, one caption string out
    inputs=gr.Audio(type="filepath"),  # pass the upload to fn as a file path
    outputs=[
        gr.Textbox(label="Caption generated by LP-MusicCaps Transfer Model"),
    ],
    examples=example_list,  # bundled clips downloaded at startup
    title=title,
    theme=gr.themes.Default(font=[gr.themes.GoogleFont("Work Sans"), "sans-serif"]),
    description=description,
    article=article,
    cache_examples=False,  # examples run the model live rather than from cache
    css=load_css(),
)
demo.launch()
|