catiR committed
Commit 99c2d01 · Parent(s): b0c291c
Files changed (3):
  1. app.py +7 -5
  2. scripts/clusterprosody.py +41 -4
  3. scripts/runSQ.py +3 -3
app.py CHANGED

@@ -33,9 +33,9 @@ setup()
 
 def f1(voices, sent, indices):
     #tts_audio, tts_score, graph = scripts.runSQ.run(sent, voices, indices)
-    tts_audio, tts_score, tts_fig_p, mid_fig_p, bad_fig_p, tts_fig_e, fig_mid_e, fig_bad_e = scripts.runSQ.run(sent, [voices], indices)
+    tts_audio, tts_score, tts_fig_p, mid_fig_p, bad_fig_p, tts_fig_e, fig_mid_e, fig_bad_e, audio_html = scripts.runSQ.run(sent, [voices], indices)
     score_report = f'Difference from TTS to real speech: {round(tts_score,2)}'
-    return (tts_audio, score_report, tts_fig_p, mid_fig_p, bad_fig_p, tts_fig_e, fig_mid_e, fig_bad_e)
+    return (tts_audio, score_report, tts_fig_p, mid_fig_p, bad_fig_p, tts_fig_e, fig_mid_e, fig_bad_e, audio_html)
 
 
 def label_indices(sentence):
@@ -89,13 +89,16 @@ with bl:
         with gr.Row():
             pl5 = gr.Plot()
             pl6 = gr.Plot()
-
+
+        with gr.TabItem("Audio"):
+
+            play = gr.HTML(label="Audio samples")
 
 
 
     temp_sentmenu.input(label_indices,temp_sentmenu,marked_sentence)
-    temp_button.click(f1,[voiceselect,temp_sentmenu,spanselect],[tts_output,report_score,pl1,pl2,pl3,pl4,pl5,pl6])
+    temp_button.click(f1,[voiceselect,temp_sentmenu,spanselect],[tts_output,report_score,pl1,pl2,pl3,pl4,pl5,pl6,play])
 
 
 if __name__ == "__main__":
@@ -108,4 +111,3 @@ if __name__ == "__main__":
 
 
 
-
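The app.py change follows the standard Gradio pattern for pushing raw HTML into the UI: declare a gr.HTML component inside a tab, then list that component among the click outputs. A minimal self-contained sketch of the pattern, assuming nothing about this app's real layout (the names demo and render_audio_panel and the sample wav path are illustrative only):

import gradio as gr

def render_audio_panel():
    # In the real app this HTML string comes back from scripts.runSQ.run(...)
    return '<audio controls><source src="sample.wav" type="audio/wav"></audio>'

with gr.Blocks() as demo:
    with gr.Tabs():
        with gr.TabItem("Audio"):
            play = gr.HTML(label="Audio samples")
    btn = gr.Button("Run")
    # the component object itself is the output target, as in the diff above
    btn.click(render_audio_panel, inputs=[], outputs=[play])

if __name__ == "__main__":
    demo.launch()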
scripts/clusterprosody.py CHANGED

@@ -137,6 +137,7 @@ def get_data(norm_sent,path_key,start_end_word_index):
 
     data = defaultdict(list)
     align_data = defaultdict(list)
+    playable_audio = {}
 
     for spk, pdict in path_key:
        word_al = word_aligns[spk]
@@ -158,8 +159,9 @@ def get_data(norm_sent,path_key,start_end_word_index):
        #words = "-".join(word_combs)
        data[f"{words}**{spk}"] = d
        align_data[f"{words}**{spk}"] = seg_aligns
+       playable_audio[spk] = (pdict['wav'], start_time, end_time)
 
-    return words, data, align_data
+    return words, data, align_data, playable_audio
 
 
 
@@ -274,7 +276,7 @@ def cluster(norm_sent,orig_sent,h_spk_ids, h_align_dir, h_f0_dir, h_wav_dir, tts
 
     h_all_paths = gen_h_paths(h_wav_dir,h_align_dir,h_f0_dir,h_spk_ids)
 
-    words, h_data, h_seg_aligns = get_data(norm_sent,h_all_paths,start_end_word_index)
+    words, h_data, h_seg_aligns, h_playable = get_data(norm_sent,h_all_paths,start_end_word_index)
 
     dtw_dists = pair_dists(h_data,words,h_spk_ids)
 
@@ -293,7 +295,7 @@ def cluster(norm_sent,orig_sent,h_spk_ids, h_align_dir, h_f0_dir, h_wav_dir, tts
 
 
     tts_all_paths = gen_tts_paths(tts_sent_dir, voices)
-    _, tts_data, tts_seg_aligns = get_data(norm_sent,tts_all_paths,start_end_word_index)
+    _, tts_data, tts_seg_aligns, tts_playable_segment = get_data(norm_sent,tts_all_paths,start_end_word_index)
 
     for v in voices:
        voice_data = tts_data[f"{words}**{v}"]
@@ -304,13 +306,48 @@ def cluster(norm_sent,orig_sent,h_spk_ids, h_align_dir, h_f0_dir, h_wav_dir, tts
        # match the data with a cluster -----
        best_cluster_score, tts_fig_p, fig_mid_p, fig_bad_p, tts_fig_e, fig_mid_e, fig_bad_e = match_tts(groups, h_data, voice_data, voice_align, words, h_seg_aligns,v)
 
+
+       audio_html = clusters_audio(groups,h_playable)
+
        # only supports one voice at a time currently
-       return best_cluster_score, tts_fig_p, fig_mid_p, fig_bad_p, tts_fig_e, fig_mid_e, fig_bad_e
+       return best_cluster_score, tts_fig_p, fig_mid_p, fig_bad_p, tts_fig_e, fig_mid_e, fig_bad_e, audio_html
        #return words, kmedoids_cluster_dists, group
 
 
 
 
+# generate html panel to play audios for each human cluster
+# audios is dict {recording_id : (wav_path, seg_start_time, seg_end_time)}
+def clusters_audio(clusters,audios):
+
+    html = '''<html><body>'''
+
+    for label in set([c for r,c in clusters]):
+        recs = [r for r,c in clusters if c==label]
+
+        html += '<div>'
+        html += f'<h2>Cluster {label}</h2>'
+
+        html += '<div>'
+        html += '<table><tbody>'
+
+        for rec in recs:
+            html += f'<tr><td><audio controls id="{rec}">' #width="20%">
+            html += f'<source src="{audios[rec][0]}#t={audios[rec][1]*60:.2f},{audios[rec][2]*60:.2f}" type="audio/wav">'
+            html += '</audio></td>'
+            html += f'<td>{rec}</td></tr>'
+
+        html += '</tbody></table>'
+        html += '</div>'
+        #html += '<div style="height:2%;background:#e7fefc"></div>'
+
+        html += '</div>'
+    html += '</body></html>'
+
+    return html
+
+
+
 # find offsets to visually align start of each word for speakers in cluster
 def reset_cluster_times(words,cluster_speakers,human_aligns,tts_align):
     words = words.split('_')
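The new clusters_audio helper relies on the W3C media-fragment syntax: appending #t=start,end to the <source> URL tells the browser to play only that slice of the wav. The *60 factor suggests the stored segment times are in minutes, converted to the seconds that #t= expects. A small sketch under that assumption (segment_source is a hypothetical helper, not part of this commit):

def segment_source(wav_path, start_min, end_min):
    # Media fragments (#t=start,end) take seconds, so convert from minutes.
    return f'{wav_path}#t={start_min*60:.2f},{end_min*60:.2f}'

# A segment from 0.05 to 0.10 minutes plays as seconds 3.00 to 6.00:
print(segment_source('rec001.wav', 0.05, 0.10))
# rec001.wav#t=3.00,6.00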
scripts/runSQ.py CHANGED

@@ -38,11 +38,11 @@ def run(sentence, voices, start_end_word_ix):
     temp_tts_sample, tts_sent_dir = get_tts(sentence,voices,tts_dir,align_model_path)
 
     voices = [voices[0]] # TODO. now limit one voice at a time.
-    score, tts_fig_p, mid_fig_p, bad_fig_p, tts_fig_e, fig_mid_e, fig_bad_e = cl.cluster(norm_sentence, sentence, human_rec_ids, speech_aligns, speech_f0, speech_dir, tts_sent_dir, voices, start_end_word_ix)
+    score, tts_fig_p, mid_fig_p, bad_fig_p, tts_fig_e, fig_mid_e, fig_bad_e, html = cl.cluster(norm_sentence, sentence, human_rec_ids, speech_aligns, speech_f0, speech_dir, tts_sent_dir, voices, start_end_word_ix)
 
     # also stop forgetting duration.
 
-    return temp_tts_sample, score, tts_fig_p, mid_fig_p, bad_fig_p, tts_fig_e, fig_mid_e, fig_bad_e
+    return temp_tts_sample, score, tts_fig_p, mid_fig_p, bad_fig_p, tts_fig_e, fig_mid_e, fig_bad_e, html
 
 
 
@@ -281,7 +281,7 @@ def localtest():
 
     voices = [voices[0]] # TODO. now limit one voice at a time.
 
-    score, tts_fig_p, mid_fig_p, bad_fig_p, tts_fig_e, fig_mid_e, fig_bad_e = cl.cluster(norm_sentence, sentence, human_rec_ids, speech_aligns, speech_f0, speech_dir, tts_sent_dir, voices, start_end_word_ix)
+    score, tts_fig_p, mid_fig_p, bad_fig_p, tts_fig_e, fig_mid_e, fig_bad_e, html = cl.cluster(norm_sentence, sentence, human_rec_ids, speech_aligns, speech_f0, speech_dir, tts_sent_dir, voices, start_end_word_ix)
 
 
 
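With this commit, both call sites unpack nine values instead of eight, with the cluster-audio HTML last. A caller-side sketch of the widened interface (the sentence, voice id, and word span are placeholder values, and the import assumes the repository layout above):

import scripts.runSQ

sentence = "An example sentence."   # placeholder input sentence
voice = "voice1"                    # placeholder TTS voice id
span = (0, 1)                       # placeholder start/end word indices

(tts_audio, score,
 tts_fig_p, mid_fig_p, bad_fig_p,
 tts_fig_e, fig_mid_e, fig_bad_e,
 audio_html) = scripts.runSQ.run(sentence, [voice], span)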