#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""
Created on Thu Sep 20 16:56:19 2018
@author: harry
"""
import librosa
import numpy as np
import torch
import torch.autograd as grad
import torch.nn.functional as F
from utils.hparam import hparam as hp

def get_centroids_prior(embeddings):
    # Loop-based reference implementation: the centroid of a speaker is the
    # mean of that speaker's utterance embeddings.
    centroids = []
    for speaker in embeddings:
        centroid = 0
        for utterance in speaker:
            centroid = centroid + utterance
        centroid = centroid / len(speaker)
        centroids.append(centroid)
    centroids = torch.stack(centroids)
    return centroids

def get_centroids(embeddings):
    # Vectorized speaker centroids: mean over the utterance dimension of a
    # (speaker_ct, utterance_per_speaker_ct, embedding_size) tensor.
    centroids = embeddings.mean(dim=1)
    return centroids
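
# A minimal sanity check (illustrative sketch): the vectorized get_centroids
# should match the loop-based get_centroids_prior. The shape below is an
# arbitrary example: 4 speakers, 5 utterances each, 8-dim embeddings.
def _check_centroids_match():
    emb = torch.randn(4, 5, 8)
    assert torch.allclose(get_centroids_prior(emb), get_centroids(emb), atol=1e-6)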

def get_centroid(embeddings, speaker_num, utterance_num):
    # Centroid of one speaker computed while excluding a single utterance,
    # as the GE2E loss requires for the same-speaker similarity term.
    centroid = 0
    for utterance_id, utterance in enumerate(embeddings[speaker_num]):
        if utterance_id == utterance_num:
            continue
        centroid = centroid + utterance
    centroid = centroid / (len(embeddings[speaker_num]) - 1)
    return centroid

def get_utterance_centroids(embeddings):
    """
    Returns the centroids for each utterance of a speaker, where
    the utterance centroid is the speaker centroid computed without
    that utterance.
    Shape of embeddings should be:
    (speaker_ct, utterance_per_speaker_ct, embedding_size)
    """
    sum_centroids = embeddings.sum(dim=1)
    # we want to subtract out each utterance prior to calculating the
    # utterance centroid
    sum_centroids = sum_centroids.reshape(
        sum_centroids.shape[0], 1, sum_centroids.shape[-1]
    )
    # we want the mean but not including the utterance itself, so -1
    num_utterances = embeddings.shape[1] - 1
    centroids = (sum_centroids - embeddings) / num_utterances
    return centroids
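
# A minimal sanity check (illustrative sketch): the vectorized utterance
# centroids should agree with get_centroid, which excludes one utterance at a
# time. Shapes are arbitrary example values.
def _check_utterance_centroids_match():
    emb = torch.randn(3, 4, 8)
    utt_centroids = get_utterance_centroids(emb)
    for s in range(emb.shape[0]):
        for u in range(emb.shape[1]):
            assert torch.allclose(utt_centroids[s, u], get_centroid(emb, s, u), atol=1e-5)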

def get_cossim_prior(embeddings, centroids):
    # Calculates cosine similarity matrix. Requires (N, M, feature) input
    cossim = torch.zeros(embeddings.size(0), embeddings.size(1), centroids.size(0))
    for speaker_num, speaker in enumerate(embeddings):
        for utterance_num, utterance in enumerate(speaker):
            for centroid_num, centroid in enumerate(centroids):
                if speaker_num == centroid_num:
                    centroid = get_centroid(embeddings, speaker_num, utterance_num)
                output = F.cosine_similarity(utterance, centroid, dim=0) + 1e-6
                cossim[speaker_num][utterance_num][centroid_num] = output
    return cossim

def get_cossim(embeddings, centroids):
    # number of utterances per speaker
    num_utterances = embeddings.shape[1]
    utterance_centroids = get_utterance_centroids(embeddings)
    # flatten the embeddings and utterance centroids to single utterances,
    # so we can do cosine similarity
    utterance_centroids_flat = utterance_centroids.view(
        utterance_centroids.shape[0] * utterance_centroids.shape[1],
        -1
    )
    embeddings_flat = embeddings.view(
        embeddings.shape[0] * num_utterances,
        -1
    )
    # the cosine similarity between each utterance and its associated centroid:
    # each speaker's utterances against their own centroid, where each
    # comparison centroid has the current utterance removed
    cos_same = F.cosine_similarity(embeddings_flat, utterance_centroids_flat)
    # now we get the cosine similarity between each utterance and the other
    # speakers' centroids
    # to do so requires comparing each utterance to each centroid. To keep the
    # operation fast, we vectorize by using matrices L (embeddings) and
    # R (centroids) where L has each utterance repeated sequentially for all
    # comparisons and R has the entire centroids frame repeated for each utterance
    centroids_expand = centroids.repeat((num_utterances * embeddings.shape[0], 1))
    embeddings_expand = embeddings_flat.unsqueeze(1).repeat(1, embeddings.shape[0], 1)
    embeddings_expand = embeddings_expand.view(
        embeddings_expand.shape[0] * embeddings_expand.shape[1],
        embeddings_expand.shape[-1]
    )
    cos_diff = F.cosine_similarity(embeddings_expand, centroids_expand)
    cos_diff = cos_diff.view(
        embeddings.size(0),
        num_utterances,
        centroids.size(0)
    )
    # assign the same-speaker cosine similarities to the proper indices
    same_idx = list(range(embeddings.size(0)))
    cos_diff[same_idx, :, same_idx] = cos_same.view(embeddings.shape[0], num_utterances)
    cos_diff = cos_diff + 1e-6
    return cos_diff
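
# A minimal sanity check (illustrative sketch): the vectorized similarity
# matrix should agree with the loop-based get_cossim_prior; both return an
# (N, M, N) tensor. Shapes below are arbitrary example values.
def _check_cossim_match():
    emb = torch.randn(3, 4, 8)
    cent = get_centroids(emb)
    assert torch.allclose(get_cossim(emb, cent), get_cossim_prior(emb, cent), atol=1e-5)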

def calc_loss_prior(sim_matrix):
    # Calculates loss from (N, M, K) similarity matrix
    per_embedding_loss = torch.zeros(sim_matrix.size(0), sim_matrix.size(1))
    for j in range(len(sim_matrix)):
        for i in range(sim_matrix.size(1)):
            per_embedding_loss[j][i] = -(sim_matrix[j][i][j] - ((torch.exp(sim_matrix[j][i]).sum() + 1e-6).log_()))
    loss = per_embedding_loss.sum()
    return loss, per_embedding_loss

def calc_loss(sim_matrix):
    # Vectorized GE2E softmax loss: for each embedding the loss is
    # -(similarity to its own centroid - log-sum-exp over all centroids).
    same_idx = list(range(sim_matrix.size(0)))
    pos = sim_matrix[same_idx, :, same_idx]
    neg = (torch.exp(sim_matrix).sum(dim=2) + 1e-6).log_()
    per_embedding_loss = -1 * (pos - neg)
    loss = per_embedding_loss.sum()
    return loss, per_embedding_loss
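
# A minimal sanity check (illustrative sketch): the vectorized calc_loss
# should agree with the loop-based calc_loss_prior on the same similarity
# matrix. The random tensor below stands in for w * cossim + b.
def _check_loss_match():
    sim = torch.randn(3, 4, 3)
    loss_prior, _ = calc_loss_prior(sim)
    loss_vec, _ = calc_loss(sim)
    assert torch.allclose(loss_prior, loss_vec, atol=1e-4)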

def normalize_0_1(values, max_value, min_value):
    # Min-max normalization clipped to the [0, 1] range.
    normalized = np.clip((values - min_value) / (max_value - min_value), 0, 1)
    return normalized

def mfccs_and_spec(wav_file, wav_process=False, calc_mfccs=False, calc_mag_db=False):
    sound_file, _ = librosa.core.load(wav_file, sr=hp.data.sr)
    window_length = int(hp.data.window * hp.data.sr)
    hop_length = int(hp.data.hop * hp.data.sr)
    duration = hp.data.tisv_frame * hp.data.hop + hp.data.window

    # Cut silence and fix length
    if wav_process:
        sound_file, index = librosa.effects.trim(sound_file, frame_length=window_length, hop_length=hop_length)
        length = int(hp.data.sr * duration)
        sound_file = librosa.util.fix_length(sound_file, size=length)

    spec = librosa.stft(sound_file, n_fft=hp.data.nfft, hop_length=hop_length, win_length=window_length)
    mag_spec = np.abs(spec)

    mel_basis = librosa.filters.mel(sr=hp.data.sr, n_fft=hp.data.nfft, n_mels=hp.data.nmels)
    mel_spec = np.dot(mel_basis, mag_spec)

    mag_db = librosa.amplitude_to_db(mag_spec)
    # dB mel spectrogram
    mel_db = librosa.amplitude_to_db(mel_spec).T

    mfccs = None
    if calc_mfccs:
        mfccs = np.dot(librosa.filters.dct(40, mel_db.shape[0]), mel_db).T

    return mfccs, mel_db, mag_db
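
# An illustrative sketch of the same STFT -> mel pipeline on a synthetic
# signal, using hypothetical parameter values (16 kHz audio, 25 ms window,
# 10 ms hop, 512-point FFT, 40 mel bands) in place of the hp.data settings.
def _demo_spectrogram_pipeline():
    sr, n_fft, n_mels = 16000, 512, 40
    win_length, hop_length = int(0.025 * sr), int(0.010 * sr)
    # one second of a 440 Hz sine wave as a stand-in for loaded audio
    signal = np.sin(2 * np.pi * 440.0 * np.arange(sr) / sr).astype(np.float32)
    mag_spec = np.abs(librosa.stft(signal, n_fft=n_fft, hop_length=hop_length, win_length=win_length))
    mel_basis = librosa.filters.mel(sr=sr, n_fft=n_fft, n_mels=n_mels)
    mel_db = librosa.amplitude_to_db(np.dot(mel_basis, mag_spec)).T
    return mel_db  # shape: (num_frames, n_mels)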

if __name__ == "__main__":
    w = grad.Variable(torch.tensor(1.0))
    b = grad.Variable(torch.tensor(0.0))
    embeddings = torch.tensor([[0, 1, 0], [0, 0, 1], [0, 1, 0], [0, 1, 0], [1, 0, 0], [1, 0, 0]]).to(torch.float).reshape(3, 2, 3)
    centroids = get_centroids(embeddings)
    cossim = get_cossim(embeddings, centroids)
    sim_matrix = w * cossim + b
    loss, per_embedding_loss = calc_loss(sim_matrix)
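    # Illustrative: run the sanity checks defined above and print the toy
    # loss for a quick eyeball check when executing this file directly.
    _check_centroids_match()
    _check_utterance_centroids_match()
    _check_cossim_match()
    _check_loss_match()
    print("loss:", loss.item())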