catiR
committed on
Commit
·
1efac6a
1
Parent(s):
c5c9abd
force align tts, add voices
Browse files
- scripts/clusterprosody.py +11 -9
scripts/clusterprosody.py
CHANGED
@@ -224,14 +224,14 @@ def match_tts(clusters, speech_data, tts_data, tts_align, words, seg_aligns, voi
|
|
224 |
bad_data = {f'{words}**{r}': speech_data[f'{words}**{r}'] for r,c in clusters if c==bad_cluster}
|
225 |
|
226 |
#tts_fig_p = plot_pitch_tts(matched_data,tts_data, tts_align, words,seg_aligns,best_cluster,voice)
|
227 |
-
tts_fig_p = plot_one_cluster(words,'pitch',matched_data,seg_aligns,
|
228 |
-
fig_mid_p = plot_one_cluster(words,'pitch',mid_data,seg_aligns,
|
229 |
-
fig_bad_p = plot_one_cluster(words,'pitch',bad_data,seg_aligns,
|
230 |
|
231 |
|
232 |
-
tts_fig_e = plot_one_cluster(words,'rmse',matched_data,seg_aligns,
|
233 |
-
fig_mid_e = plot_one_cluster(words,'rmse',mid_data,seg_aligns,
|
234 |
-
fig_bad_e = plot_one_cluster(words,'rmse',bad_data,seg_aligns,
|
235 |
|
236 |
return best_cluster_score, tts_fig_p, fig_mid_p, fig_bad_p, tts_fig_e, fig_mid_e, fig_bad_e
|
237 |
|
@@ -375,9 +375,11 @@ def plot_one_cluster(words,feature,speech_data,seg_aligns,cluster_id,tts_data=No
|
|
375 |
if feature.lower() in ['pitch','f0']:
|
376 |
fname = 'Pitch'
|
377 |
ffunc = lambda x: [p for p,e in x]
|
|
|
378 |
elif feature.lower() in ['energy', 'rmse']:
|
379 |
fname = 'Energy'
|
380 |
ffunc = lambda x: [e for p,e in x]
|
|
|
381 |
else:
|
382 |
print('problem with the figure')
|
383 |
return fig
|
@@ -407,13 +409,13 @@ def plot_one_cluster(words,feature,speech_data,seg_aligns,cluster_id,tts_data=No
|
|
407 |
bound_line = np.mean([word_times[i][2],word_times[i+1][1]])
|
408 |
plt.axvline(x=bound_line, color=colors[cc], linestyle='--', linewidth=1, label=f"Speaker {spk} -> {word_times[i+1][0]}")
|
409 |
|
410 |
-
|
411 |
cc += 1
|
412 |
if cc >= len(colors):
|
413 |
cc=0
|
414 |
|
415 |
if voice:
|
416 |
-
tfeats =
|
417 |
t_xvals = [x*0.005 for x in range(len(tfeats))]
|
418 |
|
419 |
if len(tts_align)>1:
|
@@ -425,7 +427,7 @@ def plot_one_cluster(words,feature,speech_data,seg_aligns,cluster_id,tts_data=No
|
|
425 |
for i in range(1,len(tts_align)-1):
|
426 |
bound_line = np.mean([tts_align[i][2],tts_align[i+1][1]])
|
427 |
plt.axvline(x=bound_line, color="black", linestyle='--', linewidth=1, label=f"TTS -> {tts_align[i+1][0]}")
|
428 |
-
|
429 |
|
430 |
|
431 |
#plt.legend()
|
|
|
224 |
bad_data = {f'{words}**{r}': speech_data[f'{words}**{r}'] for r,c in clusters if c==bad_cluster}
|
225 |
|
226 |
#tts_fig_p = plot_pitch_tts(matched_data,tts_data, tts_align, words,seg_aligns,best_cluster,voice)
|
227 |
+
tts_fig_p = plot_one_cluster(words,'pitch',matched_data,seg_aligns,best_cluster,tts_data=tts_data,tts_align=tts_align,voice=voice)
|
228 |
+
fig_mid_p = plot_one_cluster(words,'pitch',mid_data,seg_aligns,mid_cluster)
|
229 |
+
fig_bad_p = plot_one_cluster(words,'pitch',bad_data,seg_aligns,bad_cluster)
|
230 |
|
231 |
|
232 |
+
tts_fig_e = plot_one_cluster(words,'rmse',matched_data,seg_aligns,best_cluster,tts_data=tts_data,tts_align=tts_align,voice=voice)
|
233 |
+
fig_mid_e = plot_one_cluster(words,'rmse',mid_data,seg_aligns,mid_cluster)
|
234 |
+
fig_bad_e = plot_one_cluster(words,'rmse',bad_data,seg_aligns,bad_cluster)
|
235 |
|
236 |
return best_cluster_score, tts_fig_p, fig_mid_p, fig_bad_p, tts_fig_e, fig_mid_e, fig_bad_e
|
237 |
|
|
|
375 |
if feature.lower() in ['pitch','f0']:
|
376 |
fname = 'Pitch'
|
377 |
ffunc = lambda x: [p for p,e in x]
|
378 |
+
pfunc = plt.scatter
|
379 |
elif feature.lower() in ['energy', 'rmse']:
|
380 |
fname = 'Energy'
|
381 |
ffunc = lambda x: [e for p,e in x]
|
382 |
+
pfunc = plt.plot
|
383 |
else:
|
384 |
print('problem with the figure')
|
385 |
return fig
|
|
|
409 |
bound_line = np.mean([word_times[i][2],word_times[i+1][1]])
|
410 |
plt.axvline(x=bound_line, color=colors[cc], linestyle='--', linewidth=1, label=f"Speaker {spk} -> {word_times[i+1][0]}")
|
411 |
|
412 |
+
pfunc(feat_xvals, feats, color=colors[cc], label=f"Speaker {spk}")
|
413 |
cc += 1
|
414 |
if cc >= len(colors):
|
415 |
cc=0
|
416 |
|
417 |
if voice:
|
418 |
+
tfeats = ffunc(tts_data)
|
419 |
t_xvals = [x*0.005 for x in range(len(tfeats))]
|
420 |
|
421 |
if len(tts_align)>1:
|
|
|
427 |
for i in range(1,len(tts_align)-1):
|
428 |
bound_line = np.mean([tts_align[i][2],tts_align[i+1][1]])
|
429 |
plt.axvline(x=bound_line, color="black", linestyle='--', linewidth=1, label=f"TTS -> {tts_align[i+1][0]}")
|
430 |
+
pfunc(t_xvals, tfeats, color="black", label=f"TTS {voice}")
|
431 |
|
432 |
|
433 |
#plt.legend()
|