File size: 4,172 Bytes
9f252ff
 
4e64ba6
 
 
 
 
9f252ff
 
 
 
4e64ba6
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
9f252ff
b4dd7e7
9f252ff
4e64ba6
 
 
 
9f252ff
4e64ba6
9f252ff
 
 
4e64ba6
9f252ff
 
4e64ba6
9f252ff
 
4e64ba6
9f252ff
4e64ba6
 
 
 
9f252ff
4e64ba6
9f252ff
 
4e64ba6
9f252ff
4e64ba6
9f252ff
 
 
 
4e64ba6
9f252ff
4e64ba6
 
 
 
 
9f252ff
 
4e64ba6
9f252ff
4e64ba6
 
9f252ff
 
4e64ba6
9f252ff
 
 
 
 
4e64ba6
9f252ff
4e64ba6
9f252ff
4e64ba6
9f252ff
 
 
4e64ba6
9f252ff
 
4e64ba6
b4dd7e7
 
 
 
 
 
 
9f252ff
 
 
 
 
 
 
 
 
 
4e64ba6
 
 
 
 
 
 
 
b4dd7e7
4e64ba6
 
 
b4dd7e7
4e64ba6
9f252ff
4e64ba6
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
import gradio as gr
import numpy as np
import os
import torch

from timeit import default_timer as timer

from model.bart import BartCaptionModel
from utils.audio_utils import load_audio, STR_CH_FIRST

# Download the pretrained checkpoint and the demo audio examples on first run.
# Each file is checked individually so that a partially completed earlier run
# (e.g. checkpoint downloaded but example clips missing) is repaired instead
# of being skipped entirely.
_ASSET_URLS = {
    "transfer.pth": "https://huggingface.co/seungheondoh/lp-music-caps/resolve/main/transfer.pth",
    "folk.wav": "https://huggingface.co/seungheondoh/lp-music-caps/resolve/main/folk.wav",
    "electronic.mp3": "https://huggingface.co/seungheondoh/lp-music-caps/resolve/main/electronic.mp3",
    "orchestra.wav": "https://huggingface.co/seungheondoh/lp-music-caps/resolve/main/orchestra.wav",
}

for _filename, _url in _ASSET_URLS.items():
    if not os.path.isfile(_filename):
        torch.hub.download_url_to_file(_url, _filename)

device = "cuda" if torch.cuda.is_available() else "cpu"

# Example clips offered in the Gradio UI.
example_list = ["folk.wav", "electronic.mp3", "orchestra.wav"]

# Build the captioning model and load the transfer-learning checkpoint
# (loaded on CPU first so it works on machines without a GPU).
model = BartCaptionModel(max_length=128)
pretrained_object = torch.load("./transfer.pth", map_location="cpu")
state_dict = pretrained_object["state_dict"]
model.load_state_dict(state_dict)

if torch.cuda.is_available():
    torch.cuda.set_device(device)
    model = model.cuda(device)

model.eval()  # inference only: disables dropout / puts norm layers in eval mode


def get_audio(audio_path, duration=10, target_sr=16000):
    """Load an audio file and split it into fixed-length mono chunks.

    Args:
        audio_path: Path to the audio file to load.
        duration: Length of each chunk, in seconds.
        target_sr: Sample rate the audio is resampled to.

    Returns:
        torch.FloatTensor of shape (n_chunks, duration * target_sr).
        Audio shorter than one chunk is zero-padded up to a single chunk;
        any trailing remainder shorter than a chunk is dropped.
    """
    n_samples = int(duration * target_sr)

    audio, sr = load_audio(
        path=audio_path,
        ch_format=STR_CH_FIRST,
        sample_rate=target_sr,
        downmix_to_mono=True,
    )

    # Safety net: collapse a remaining channel axis to mono.
    # BUGFIX: the original called audio.mean(0, False); on a NumPy array the
    # second positional argument of ndarray.mean is `dtype`, not `keepdims`,
    # so dtype=False raised a TypeError whenever this branch was hit.
    if len(audio.shape) == 2:
        audio = audio.mean(axis=0)

    if audio.shape[-1] < n_samples:  # zero-pad short clips to one full chunk
        pad = np.zeros(n_samples)
        pad[: audio.shape[-1]] = audio
        audio = pad

    # Number of whole chunks; the remainder past ceil * n_samples is dropped.
    ceil = int(audio.shape[-1] // n_samples)

    audio = torch.from_numpy(
        np.stack(np.split(audio[: ceil * n_samples], ceil)).astype("float32")
    )

    return audio


def captioning(audio_path):
    """Caption each 10-second chunk of the audio and return the joined text."""
    audio_tensor = get_audio(audio_path=audio_path)

    if torch.cuda.is_available():
        audio_tensor = audio_tensor.to(device)

    # Beam-search decode one caption per chunk.
    with torch.no_grad():
        captions = model.generate(
            samples=audio_tensor,
            num_beams=5,
        )

    # Prefix every caption with its [start:00-end:00] window; zip against the
    # chunk count so at most one caption per chunk is emitted.
    segments = []
    for index, caption in zip(range(audio_tensor.shape[0]), captions):
        window = f"[{index * 10}:00-{(index + 1) * 10}:00]"
        segments.append(f"{window}\n{caption} \n \n")

    return "".join(segments)


def load_css(path="static/css/musicapp.css"):
    """Return the text of the demo stylesheet.

    Args:
        path: Stylesheet location; defaults to the bundled demo CSS, so
            existing callers (``load_css()``) are unaffected.

    Returns:
        The file's content as a string.
    """
    # Encoding is pinned so the CSS decodes identically regardless of the
    # host's locale (the default encoding is platform-dependent).
    with open(path, "r", encoding="utf-8") as file:
        return file.read()


# UI copy rendered by the Gradio interface header.
title = "Capabara - Interactive demo: Music Captioning 🤖🎵"
# HTML blurb shown under the title: paper credit, resource links, usage hint.
description = """
<p style='text-align: center'> LP-MusicCaps: LLM-Based Pseudo Music Captioning</p> 
<p style='text-align: center'> SeungHeon Doh, Keunwoo Choi, Jongpil Lee, Juhan Nam, ISMIR 2023</p> 
<p style='text-align: center'> <a href='https://arxiv.org/abs/2307.16372' target='_blank'>ArXiv</a> | <a href='https://github.com/seungheondoh/lp-music-caps' target='_blank'>Codes</a> | <a href='https://huggingface.co/datasets/seungheondoh/LP-MusicCaps-MC' target='_blank'>Dataset</a> </p>
<p style='text-align: center'> To use it, simply upload your audio and click 'submit', or click one of the examples to load them. Read more at the links below. </p>
<p style='text-align: center'> If you have any error, plz check this code: <a href='https://github.com/seungheondoh/lp-music-caps/blob/main/demo/app.py' target='_blank'>Demo</a>. </p>
"""

# Footer HTML with author / repository links.
article = "<p style='text-align: center'><a href='https://seungheondoh.github.io/' target='_blank'>Author Info</a> | <a href='https://github.com/seungheondoh' target='_blank'>Github</a></p>"

# Wire the captioning function into a Gradio web UI and start the server.
demo = gr.Interface(
    fn=captioning,
    inputs=gr.Audio(type="filepath"),  # upload is handed to captioning as a file path
    outputs=[
        gr.Textbox(label="Caption generated by LP-MusicCaps Transfer Model"),
    ],
    examples=example_list,
    title=title,
    theme=gr.themes.Default(font=[gr.themes.GoogleFont("Work Sans"), "sans-serif"]),
    description=description,
    article=article,
    cache_examples=False,  # caption examples on demand rather than precomputing
    css=load_css(),
)

demo.launch()