catiR committed
Commit a894787 (1 parent: 8827531)

run clustering

Files changed (3)
  1. app.py +9 -4
  2. scripts/clusterprosody.py +51 -5
  3. scripts/runSQ.py +12 -1
app.py CHANGED
@@ -35,7 +35,7 @@ def f1(voices, sent, indices):
     #tts_audio, tts_score, graph = scripts.runSQ.run(sent, voices, indices)
-    tts_audio, tts_score, graph = scripts.runSQ.run(sent, [voices], indices)
+    tts_audio, tts_score, tts_graph, mid_graph, bad_graph = scripts.runSQ.run(sent, [voices], indices)
     score_report = f'Difference from TTS to real speech: {round(tts_score,2)}'
-    return (tts_audio, score_report, graph)
+    return (tts_audio, score_report, tts_graph, mid_graph, bad_graph)
 
 
 def label_indices(sentence):
@@ -46,11 +46,13 @@ def label_indices(sentence):
 
 
 
+temp_sentences = scripts.runSQ.create_temp_sent_list()
+
 bl = gr.Blocks()
 with bl:
 
 
-    temp_sentences = ['Litlaus græn hugmynd?','Var það ekki nóg?', 'Ef svo er hvað heita þau þá?','Eru maríuhænur á Íslandi?']
+    #temp_sentences = ['Litlaus græn hugmynd?','Var það ekki nóg?', 'Ef svo er hvað heita þau þá?','Eru maríuhænur á Íslandi?']
 
     voices = ['Alfur','Dilja']
     # currently i only get json speech marks for those two.
@@ -75,9 +77,12 @@ with bl:
     tts_output = gr.Audio(interactive=False)
     report_score = gr.Markdown('Difference from TTS to real speech:')
     pl1 = gr.Plot()
+    with gr.Row():
+        pl2 = gr.Plot()
+        pl3 = gr.Plot()
 
     temp_sentmenu.input(label_indices,temp_sentmenu,marked_sentence)
-    temp_button.click(f1,[voiceselect,temp_sentmenu,spanselect],[tts_output,report_score,pl1])
+    temp_button.click(f1,[voiceselect,temp_sentmenu,spanselect],[tts_output,report_score,pl1,pl2,pl3])
 
 
 if __name__ == "__main__":
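
Note: Gradio fills a click handler's outputs list positionally from the function's return tuple, which is why f1 must now return exactly five values for [tts_output, report_score, pl1, pl2, pl3]. A minimal, self-contained sketch of the same wiring pattern (component and function names here are illustrative, not from this repo):

import gradio as gr
import matplotlib.pyplot as plt

def make_outputs():
    # one return value per declared output component, in order
    fig1, fig2, fig3 = plt.figure(), plt.figure(), plt.figure()
    return "Difference from TTS to real speech: 0.42", fig1, fig2, fig3

with gr.Blocks() as demo:
    report = gr.Markdown()
    p1 = gr.Plot()
    with gr.Row():
        p2 = gr.Plot()  # the two extra plots sit side by side, as in the diff above
        p3 = gr.Plot()
    btn = gr.Button("Run")
    btn.click(make_outputs, inputs=[], outputs=[report, p1, p2, p3])

if __name__ == "__main__":
    demo.launch()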
scripts/clusterprosody.py CHANGED
@@ -302,9 +302,16 @@ def match_tts(clusters, speech_data, tts_data, tts_align, words, seg_aligns, voice):
 
     # now do graphs of matched_data with tts_data
     # and report best_cluster_score
-    fig = plot_pitch_tts(speech_data,tts_data, tts_align, words,seg_aligns,best_cluster,voice)
-
-    return best_cluster_score, fig
+    tts_fig = plot_pitch_tts(matched_data,tts_data, tts_align, words,seg_aligns,best_cluster,voice)
+
+    mid_cluster = tts_info[1][0]
+    mid_data = {f'{words}**{r}': speech_data[f'{words}**{r}'] for r,c in clusters if c==mid_cluster}
+    bad_cluster = tts_info[2][0]
+    bad_data = {f'{words}**{r}': speech_data[f'{words}**{r}'] for r,c in clusters if c==bad_cluster}
+    fig_mid = plot_pitch_cluster(mid_data,words,seg_aligns,mid_cluster)
+    fig_bad = plot_pitch_cluster(bad_data,words,seg_aligns,bad_cluster)
+
+    return best_cluster_score, tts_fig, fig_mid, fig_bad
 
 
 
@@ -346,10 +353,10 @@ def cluster(norm_sent,orig_sent,h_spk_ids, h_align_dir, h_f0_dir, h_wav_dir, tts_dir, voices, start_end_word_index):
     tts_data, tts_align = get_tts_data(tdir,v,start_end_word_index)
 
     # match the data with a cluster -----
-    best_cluster_score, fig = match_tts(groups, data, tts_data, tts_align, words, seg_aligns,v)
+    best_cluster_score, tts_fig, fig_mid, fig_bad = match_tts(groups, data, tts_data, tts_align, words, seg_aligns,v)
 
     # only supports one voice at a time currently
-    return best_cluster_score, fig
+    return best_cluster_score, tts_fig, fig_mid, fig_bad
     #return words, kmedoids_cluster_dists, groups
 
 
@@ -477,6 +484,45 @@ def plot_pitch_tts(speech_data,tts_data, tts_align,words,seg_aligns,cluster_id,voice):
 
 
 
+def plot_pitch_cluster(speech_data,words,seg_aligns,cluster_id):
+    colors = ["red", "green", "blue", "orange", "purple", "pink", "brown", "gray", "cyan"]
+    cc = 0
+    fig = plt.figure(figsize=(8, 4))
+    plt.title(f"{words} - Pitch - Cluster {cluster_id}")
+    for k,v in speech_data.items():
+
+        spk = k.split('**')[1]
+
+        word_times = seg_aligns[k]
+
+        pitches = [p for p,e in v]
+        # datapoint interval is 0.005 seconds
+        pitch_xvals = [x*0.005 for x in range(len(pitches))]
+
+        # centre around the first word boundary -
+        # if 3+ words, too bad.
+        if len(word_times)>1:
+            realign = np.mean([word_times[0][2],word_times[1][1]])
+            pitch_xvals = [x - realign for x in pitch_xvals]
+            word_times = [(w,s-realign,e-realign) for w,s,e in word_times]
+            plt.axvline(x=0, color="gray", linestyle='--', linewidth=1, label=f"{word_times[0][0]} -> {word_times[1][0]} boundary")
+
+        if len(word_times)>2:
+            for i in range(1,len(word_times)-1):
+                bound_line = np.mean([word_times[i][2],word_times[i+1][1]])
+                plt.axvline(x=bound_line, color=colors[cc], linestyle='--', linewidth=1, label=f"Speaker {spk} -> {word_times[i+1][0]}")
+
+        plt.scatter(pitch_xvals, pitches, color=colors[cc], label=f"Speaker {spk}")
+        cc += 1
+        if cc >= len(colors):
+            cc=0
+
+    #plt.legend()
+    #plt.show()
+
+
+    return fig
+
 
 
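
Note: the mid/bad selection in match_tts appears to assume tts_info is a list of (cluster_id, distance) pairs ranked best-to-worst and that clusters iterates as (recording_id, cluster_id) pairs; neither structure is shown in this diff. The boundary-centring arithmetic in plot_pitch_cluster is easiest to see on toy values (a sketch with hypothetical timings, not repo data):

import numpy as np

# word_times entries are (word, start_sec, end_sec), as the tuple
# unpacking in the new function implies.
word_times = [("hvad", 0.10, 0.38), ("heita", 0.45, 0.80)]

# The midpoint between word 0's end and word 1's start becomes x = 0.
realign = np.mean([word_times[0][2], word_times[1][1]])  # (0.38 + 0.45) / 2 = 0.415

# Every 5 ms pitch sample and every word interval shifts left by that
# amount, so the first word boundary lines up across speakers whose
# absolute timings differ.
pitch_xvals = [x * 0.005 - realign for x in range(5)]
word_times = [(w, s - realign, e - realign) for w, s, e in word_times]
print(word_times[0])  # ('hvad', -0.315, -0.035)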
 
scripts/runSQ.py CHANGED
@@ -56,6 +56,17 @@ def snorm(s):
     return s
 
 
+def create_temp_sent_list():
+    corpusdb = '/home/user/app/human_data/SQL1adult10s_metadata.tsv'
+    with open(corpusdb,'r') as handle:
+        meta = handle.read().splitlines()
+    meta = [l.split('\t')[3] for l in meta[1:]]
+    meta = sorted(list(set(meta)))
+    return meta
+
+
+
+
 # find all the recordings of a given sentence
 # listed in the corpus metadata.
 # sentence should be provided lowercase without punctuation
@@ -242,7 +253,7 @@ def localtest():
     f0_tts(sentence, voices, tts_dir, reaper_path = reaper_exc)
 
 
-    score, fig = cl.cluster(norm_sentence, sentence, human_rec_ids, speech_aligns, speech_f0, speech_dir, tts_dir, voices, start_end_word_ix)
+    score, tts_fig, mid_fig, bad_fig = cl.cluster(norm_sentence, sentence, human_rec_ids, speech_aligns, speech_f0, speech_dir, tts_dir, voices, start_end_word_ix)
 
 
 
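
Note: create_temp_sent_list hardcodes the metadata path and indexes column 3 of each row by position, so any data row with fewer than four tab-separated fields would raise an IndexError. A slightly more defensive sketch of the same logic (same assumptions about the TSV layout; the keyword argument is mine, not part of the commit):

import csv

def create_temp_sent_list(corpusdb='/home/user/app/human_data/SQL1adult10s_metadata.tsv'):
    # unique, sorted entries of column 3, skipping the header row
    with open(corpusdb, newline='') as handle:
        reader = csv.reader(handle, delimiter='\t')
        next(reader, None)  # drop the header
        sents = {row[3] for row in reader if len(row) > 3}
    return sorted(sents)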