Spaces:
Running
Running
import os | |
import torch | |
import shutil | |
import librosa | |
import warnings | |
import numpy as np | |
import gradio as gr | |
import librosa.display | |
import matplotlib.pyplot as plt | |
from utils import get_modelist, find_audio_files, embed_img | |
from model import EvalNet | |
# Output labels. `infer` passes len(CLASSES) to EvalNet and looks the
# predicted index up in this list.
CLASSES = ["Gong", "Shang", "Jue", "Zhi", "Yu"]
# Scratch directory where the audio2* renderers save `output.jpg`.
TEMP_DIR = "./__pycache__/tmp"
# Sampling rate handed to librosa.load (its `sr` argument) for every recording.
SAMPLE_RATE = 44100
def zero_padding(y: np.ndarray, end: int):
    """Force a signal to exactly ``end`` samples.

    Shorter signals are extended with trailing zeros; longer signals keep only
    their LAST ``end`` samples (the head is discarded). A signal that already
    has ``end`` samples is returned unchanged (same object).

    Args:
        y: Input samples. Expected to be one-dimensional, since padding
            concatenates a 1-D zero vector.
        end: Target length in samples. A non-positive value yields an empty
            array for any non-empty input.

    Returns:
        An array of length ``end``. When padding occurs the zeros are created
        with NumPy's default float dtype, so the result may be upcast.
    """
    size = len(y)
    if size < end:
        return np.concatenate((y, np.zeros(end - size)))
    if size > end:
        # Slice from an explicit start index: ``y[-end:]`` would return the
        # whole array when ``end`` is 0, because ``-0 == 0``.
        return y[size - end:]
    return y
def audio2mel(audio_path: str, seg_len=20) -> None:
    """Render a log-mel spectrogram of an audio file to ``{TEMP_DIR}/output.jpg``.

    The recording is loaded at ``SAMPLE_RATE``, forced to ``seg_len`` seconds
    by ``zero_padding`` (zero-padded at the end, or cut down to its final
    ``seg_len`` seconds), converted to a mel spectrogram in dB relative to its
    peak, and drawn without axes or padding.

    Args:
        audio_path: Path of the audio file to load.
        seg_len: Segment length in seconds.

    Note:
        The output directory is not created here, and exceptions raised while
        loading, analysing or saving are not caught.
    """
    y, sr = librosa.load(audio_path, sr=SAMPLE_RATE)
    y = zero_padding(y, seg_len * sr)
    mel_spec = librosa.feature.melspectrogram(y=y, sr=sr)
    log_mel_spec = librosa.power_to_db(mel_spec, ref=np.max)
    try:
        librosa.display.specshow(log_mel_spec)
        plt.axis("off")
        plt.savefig(
            f"{TEMP_DIR}/output.jpg",
            bbox_inches="tight",
            pad_inches=0.0,
        )
    finally:
        # Close the figure even if drawing or saving fails; otherwise the next
        # call would draw on top of the stale current figure.
        plt.close()
def audio2cqt(audio_path: str, seg_len=20) -> None:
    """Render a log-power CQT spectrogram of an audio file to ``{TEMP_DIR}/output.jpg``.

    The recording is loaded at ``SAMPLE_RATE``, forced to ``seg_len`` seconds
    by ``zero_padding`` (zero-padded at the end, or cut down to its final
    ``seg_len`` seconds), transformed with ``librosa.cqt``, converted from
    squared magnitude to dB relative to its peak, and drawn without axes or
    padding.

    Args:
        audio_path: Path of the audio file to load.
        seg_len: Segment length in seconds.

    Note:
        The output directory is not created here, and exceptions raised while
        loading, analysing or saving are not caught.
    """
    y, sr = librosa.load(audio_path, sr=SAMPLE_RATE)
    y = zero_padding(y, seg_len * sr)
    cqt_spec = librosa.cqt(y=y, sr=sr)
    # The CQT is complex-valued; |.|**2 gives the power that power_to_db expects.
    log_cqt_spec = librosa.power_to_db(np.abs(cqt_spec) ** 2, ref=np.max)
    try:
        librosa.display.specshow(log_cqt_spec)
        plt.axis("off")
        plt.savefig(
            f"{TEMP_DIR}/output.jpg",
            bbox_inches="tight",
            pad_inches=0.0,
        )
    finally:
        # Close the figure even if drawing or saving fails; otherwise the next
        # call would draw on top of the stale current figure.
        plt.close()
def audio2chroma(audio_path: str, seg_len=20) -> None:
    """Render a log-scaled chromagram of an audio file to ``{TEMP_DIR}/output.jpg``.

    The recording is loaded at ``SAMPLE_RATE``, forced to ``seg_len`` seconds
    by ``zero_padding`` (zero-padded at the end, or cut down to its final
    ``seg_len`` seconds), turned into an STFT chromagram, squared and
    converted to dB relative to its peak, and drawn without axes or padding.

    Args:
        audio_path: Path of the audio file to load.
        seg_len: Segment length in seconds.

    Note:
        The output directory is not created here, and exceptions raised while
        loading, analysing or saving are not caught.
    """
    y, sr = librosa.load(audio_path, sr=SAMPLE_RATE)
    y = zero_padding(y, seg_len * sr)
    chroma_spec = librosa.feature.chroma_stft(y=y, sr=sr)
    # NOTE(review): squaring the chroma values before the dB conversion is kept
    # exactly as in the original pipeline; presumably the models were trained
    # on images produced this way - confirm before changing.
    log_chroma_spec = librosa.power_to_db(np.abs(chroma_spec) ** 2, ref=np.max)
    try:
        librosa.display.specshow(log_chroma_spec)
        plt.axis("off")
        plt.savefig(
            f"{TEMP_DIR}/output.jpg",
            bbox_inches="tight",
            pad_inches=0.0,
        )
    finally:
        # Close the figure even if drawing or saving fails; otherwise the next
        # call would draw on top of the stale current figure.
        plt.close()
def infer(wav_path: str, log_name: str, folder_path=TEMP_DIR):
    """Predict the pentatonic mode of a recording with the selected model.

    Args:
        wav_path: Path of the recording. A falsy value returns a prompt
            message instead of a prediction.
        log_name: Model log name. Its third-from-last ``_``-separated field
            names the spectrogram type and selects the module-level
            ``audio2<type>`` renderer.
        folder_path: Scratch directory. It is deleted on entry if it exists
            and recreated before rendering.

    Returns:
        ``(filename, label)`` on success, where ``filename`` is the basename
        of ``wav_path`` and ``label`` is an entry of ``CLASSES``; or
        ``(None, message)`` when no audio was given or when parsing
        ``log_name``, loading the model, or rendering the spectrogram fails.
        Errors raised by ``embed_img`` or by the forward pass propagate.
    """
    if os.path.exists(folder_path):
        shutil.rmtree(folder_path)

    if not wav_path:
        return None, "Please input an audio!"

    os.makedirs(folder_path, exist_ok=True)
    try:
        spec = log_name.split("_")[-3]
        # Look the renderer up by name instead of eval()-ing a string built
        # from the model name: same dynamic dispatch, but no arbitrary
        # expression can be executed.
        render = globals().get(f"audio2{spec}")
        if not callable(render):
            raise ValueError(f"Unsupported spectrogram type: {spec}")

        model = EvalNet(log_name, len(CLASSES)).model
        render(wav_path)
    except Exception as e:
        # UI boundary: report the failure in the output textbox.
        return None, f"{e}"

    # NOTE(review): the audio2* renderers save to TEMP_DIR, while the image is
    # read from folder_path; the two only agree when folder_path is left at
    # its default.
    image = embed_img(f"{folder_path}/output.jpg")
    # Inference only, so skip autograd bookkeeping.
    with torch.no_grad():
        output: torch.Tensor = model(image)

    # Index of the highest score along dim 1. Indexing CLASSES with the tensor
    # relies on it holding a single element - assumes embed_img yields a batch
    # of one; TODO confirm.
    pred_id = torch.max(output.data, 1)[1]
    return (
        os.path.basename(wav_path),
        CLASSES[pred_id].capitalize(),
    )
if __name__ == "__main__":
    # Script entry point: build the Gradio demo around `infer` and serve it.
    warnings.filterwarnings("ignore")
    models = get_modelist(assign_model="vit_l_16_cqt")
    # Each audio file returned by find_audio_files() becomes an example row
    # paired with the first listed model.
    examples = [[audio, models[0]] for audio in find_audio_files()]
    citation = """
# Cite
```bibtex
@article{Zhou-2025,
title = {CCMusic: an Open and Diverse Database for Chinese Music Information Retrieval Research},
author = {Monan Zhou, Shenyang Xu, Zhaorui Liu, Zhaowen Wang, Feng Yu, Wei Li and Baoqiang Han},
journal = {Transactions of the International Society for Music Information Retrieval},
year = {2025}
}
```"""

    with gr.Blocks() as demo:
        gr.Interface(
            fn=infer,
            inputs=[
                gr.Audio(label="Upload a recording", type="filepath"),
                gr.Dropdown(choices=models, label="Select a model", value=models[0]),
            ],
            outputs=[
                gr.Textbox(label="Audio filename", show_copy_button=True),
                gr.Textbox(
                    label="Chinese pentatonic mode recognition",
                    show_copy_button=True,
                ),
            ],
            examples=examples,
            cache_examples=False,
            flagging_mode="never",
            title="It is recommended to keep the recording length around 20s.",
        )
        gr.Markdown(citation)

    demo.launch()