catiR committed on
Commit
1efac6a
·
1 Parent(s): c5c9abd

force align tts, add voices

Browse files
Files changed (1) hide show
  1. scripts/clusterprosody.py +11 -9
scripts/clusterprosody.py CHANGED
@@ -224,14 +224,14 @@ def match_tts(clusters, speech_data, tts_data, tts_align, words, seg_aligns, voi
224
  bad_data = {f'{words}**{r}': speech_data[f'{words}**{r}'] for r,c in clusters if c==bad_cluster}
225
 
226
  #tts_fig_p = plot_pitch_tts(matched_data,tts_data, tts_align, words,seg_aligns,best_cluster,voice)
227
- tts_fig_p = plot_one_cluster(words,'pitch',matched_data,seg_aligns,cluster,tts_data=tts_data,tts_align=tts_align,voice=voice)
228
- fig_mid_p = plot_one_cluster(words,'pitch',mid_data,seg_aligns,cluster)
229
- fig_bad_p = plot_one_cluster(words,'pitch',bad_data,seg_aligns,cluster)
230
 
231
 
232
- tts_fig_e = plot_one_cluster(words,'rmse',matched_data,seg_aligns,cluster,tts_data=tts_data,tts_align=tts_align,voice=voice)
233
- fig_mid_e = plot_one_cluster(words,'rmse',mid_data,seg_aligns,cluster)
234
- fig_bad_e = plot_one_cluster(words,'rmse',bad_data,seg_aligns,cluster)
235
 
236
  return best_cluster_score, tts_fig_p, fig_mid_p, fig_bad_p, tts_fig_e, fig_mid_e, fig_bad_e
237
 
@@ -375,9 +375,11 @@ def plot_one_cluster(words,feature,speech_data,seg_aligns,cluster_id,tts_data=No
375
  if feature.lower() in ['pitch','f0']:
376
  fname = 'Pitch'
377
  ffunc = lambda x: [p for p,e in x]
 
378
  elif feature.lower() in ['energy', 'rmse']:
379
  fname = 'Energy'
380
  ffunc = lambda x: [e for p,e in x]
 
381
  else:
382
  print('problem with the figure')
383
  return fig
@@ -407,13 +409,13 @@ def plot_one_cluster(words,feature,speech_data,seg_aligns,cluster_id,tts_data=No
407
  bound_line = np.mean([word_times[i][2],word_times[i+1][1]])
408
  plt.axvline(x=bound_line, color=colors[cc], linestyle='--', linewidth=1, label=f"Speaker {spk} -> {word_times[i+1][0]}")
409
 
410
- plt.scatter(feat_xvals, feats, color=colors[cc], label=f"Speaker {spk}")
411
  cc += 1
412
  if cc >= len(colors):
413
  cc=0
414
 
415
  if voice:
416
- tfeats = [p for p,e in tts_data]
417
  t_xvals = [x*0.005 for x in range(len(tfeats))]
418
 
419
  if len(tts_align)>1:
@@ -425,7 +427,7 @@ def plot_one_cluster(words,feature,speech_data,seg_aligns,cluster_id,tts_data=No
425
  for i in range(1,len(tts_align)-1):
426
  bound_line = np.mean([tts_align[i][2],tts_align[i+1][1]])
427
  plt.axvline(x=bound_line, color="black", linestyle='--', linewidth=1, label=f"TTS -> {tts_align[i+1][0]}")
428
- plt.scatter(t_xvals, tfeats, color="black", label=f"TTS {voice}")
429
 
430
 
431
  #plt.legend()
 
224
  bad_data = {f'{words}**{r}': speech_data[f'{words}**{r}'] for r,c in clusters if c==bad_cluster}
225
 
226
  #tts_fig_p = plot_pitch_tts(matched_data,tts_data, tts_align, words,seg_aligns,best_cluster,voice)
227
+ tts_fig_p = plot_one_cluster(words,'pitch',matched_data,seg_aligns,best_cluster,tts_data=tts_data,tts_align=tts_align,voice=voice)
228
+ fig_mid_p = plot_one_cluster(words,'pitch',mid_data,seg_aligns,mid_cluster)
229
+ fig_bad_p = plot_one_cluster(words,'pitch',bad_data,seg_aligns,bad_cluster)
230
 
231
 
232
+ tts_fig_e = plot_one_cluster(words,'rmse',matched_data,seg_aligns,best_cluster,tts_data=tts_data,tts_align=tts_align,voice=voice)
233
+ fig_mid_e = plot_one_cluster(words,'rmse',mid_data,seg_aligns,mid_cluster)
234
+ fig_bad_e = plot_one_cluster(words,'rmse',bad_data,seg_aligns,bad_cluster)
235
 
236
  return best_cluster_score, tts_fig_p, fig_mid_p, fig_bad_p, tts_fig_e, fig_mid_e, fig_bad_e
237
 
 
375
  if feature.lower() in ['pitch','f0']:
376
  fname = 'Pitch'
377
  ffunc = lambda x: [p for p,e in x]
378
+ pfunc = plt.scatter
379
  elif feature.lower() in ['energy', 'rmse']:
380
  fname = 'Energy'
381
  ffunc = lambda x: [e for p,e in x]
382
+ pfunc = plt.plot
383
  else:
384
  print('problem with the figure')
385
  return fig
 
409
  bound_line = np.mean([word_times[i][2],word_times[i+1][1]])
410
  plt.axvline(x=bound_line, color=colors[cc], linestyle='--', linewidth=1, label=f"Speaker {spk} -> {word_times[i+1][0]}")
411
 
412
+ pfunc(feat_xvals, feats, color=colors[cc], label=f"Speaker {spk}")
413
  cc += 1
414
  if cc >= len(colors):
415
  cc=0
416
 
417
  if voice:
418
+ tfeats = ffunc(tts_data)
419
  t_xvals = [x*0.005 for x in range(len(tfeats))]
420
 
421
  if len(tts_align)>1:
 
427
  for i in range(1,len(tts_align)-1):
428
  bound_line = np.mean([tts_align[i][2],tts_align[i+1][1]])
429
  plt.axvline(x=bound_line, color="black", linestyle='--', linewidth=1, label=f"TTS -> {tts_align[i+1][0]}")
430
+ pfunc(t_xvals, tfeats, color="black", label=f"TTS {voice}")
431
 
432
 
433
  #plt.legend()