import os

import torch
import librosa

from transformers import logging
from transformers import AutoTokenizer

from wrapper import EvalWrapper
from models_xin import CLAP
from utils import compute_similarity


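# Zero-shot emotion classification with ParaCLAP on a single waveform.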
if __name__ == '__main__':
    logging.set_verbosity_error()

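    # Fetch the pre-trained ParaCLAP checkpoint from the Hugging Face Hub.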
    ckpt = torch.hub.load_state_dict_from_url(
        url="https://huggingface.co/KeiKinn/paraclap/resolve/main/best.pth.tar?download=true",
        map_location="cpu",
        check_hash=True,
    )

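    # Encoder backbones: BERT for the text branch and a wav2vec 2.0 model
    # fine-tuned for dimensional emotion recognition for the speech branch.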
    text_model = 'bert-base-uncased'
    audio_model = 'audeering/wav2vec2-large-robust-12-ft-emotion-msp-dim'

    device = torch.device('cuda:0' if torch.cuda.is_available() else 'cpu')

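    # Candidate emotion labels for zero-shot classification; replace the
    # wavpath placeholder with the path to your own audio file.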
    candidates = ['happy', 'sad', 'surprise', 'angry']
    wavpath = '[Waveform path]'

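    # Load the waveform at 16 kHz, the sampling rate the speech encoder expects.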
    waveform, sample_rate = librosa.load(wavpath, sr=16000)
    x = torch.Tensor(waveform)

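    # Tokenise the candidate labels for the text encoder.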
    tokenizer = AutoTokenizer.from_pretrained(text_model)
    candidate_tokens = tokenizer.batch_encode_plus(
        candidates,
        padding=True,
        truncation=True,
        return_tensors='pt',
    )

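    # Assemble the ParaCLAP model from the two encoders with a shared
    # 768-dimensional embedding space.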
    model = CLAP(
        speech_name=audio_model,
        text_name=text_model,
        embedding_dim=768,
    )

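    # Load the pre-trained weights and switch to inference mode.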
    model.load_state_dict(ckpt)
    model.to(device)
    print('Checkpoint is loaded')
    model.eval()

    # Move the tokenised labels to the same device as the model
    # (a no-op on CPU, required when running on CUDA).
    candidate_tokens = candidate_tokens.to(device)

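    # Forward pass: embed the waveform and the candidate labels, then score
    # each candidate against the audio.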
    with torch.no_grad():
        z = model(
            x.unsqueeze(0).to(device),
            candidate_tokens,
        )

        similarity = compute_similarity(z[2], z[0], z[1])
        prediction = similarity.T.argmax(dim=1)

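    # Report the candidate with the highest audio-text similarity.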
    result = candidates[prediction]
    print(result)