# paraclap/eval.py: zero-shot emotion classification with a pretrained ParaCLAP checkpoint
import os
import torch
from transformers import logging
from transformers import AutoTokenizer
from models_xin import CLAP
from utils import compute_similarity
import librosa
if __name__ == '__main__':
    logging.set_verbosity_error()
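
    # Download the pretrained ParaCLAP checkpoint from the Hugging Face Hub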
    ckpt = torch.hub.load_state_dict_from_url(
        url="https://huggingface.co/KeiKinn/paraclap/resolve/main/best.pth.tar?download=true",
        map_location="cpu",
        check_hash=True,
    )

    text_model = 'bert-base-uncased'
    audio_model = 'audeering/wav2vec2-large-robust-12-ft-emotion-msp-dim'
    device = torch.device('cuda:0' if torch.cuda.is_available() else 'cpu')

    candidates = ['happy', 'sad', 'surprise', 'angry']  # feel free to adapt the label set to your needs
    wavpath = '[Waveform path]'  # path to a single-channel waveform
    waveform, sample_rate = librosa.load(wavpath, sr=16000)  # resampled to 16 kHz
    x = torch.Tensor(waveform)
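
    # Tokenize the candidate labels so the BERT text encoder can embed them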
    tokenizer = AutoTokenizer.from_pretrained(text_model)
    candidate_tokens = tokenizer.batch_encode_plus(
        candidates,
        padding=True,
        truncation=True,
        return_tensors='pt'
    )
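
    # Assemble the CLAP model from the speech and text encoders named above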
    model = CLAP(
        speech_name=audio_model,
        text_name=text_model,
        embedding_dim=768,
    )
    model.load_state_dict(ckpt)
    model.to(device)
    print('Checkpoint is loaded')
    model.eval()
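
    # Embed the waveform and the candidate labels in a single forward pass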
    with torch.no_grad():
        z = model(
            x.unsqueeze(0).to(device),  # add a batch dimension and move to the model's device
            candidate_tokens.to(device)  # move the token tensors to the same device as the model
        )
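
    # Rank the candidates by audio-text similarity and pick the best match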
    similarity = compute_similarity(z[2], z[0], z[1])
    prediction = similarity.T.argmax(dim=1)
    result = candidates[prediction.item()]
    print(result)