catiR committed · Commit 99c2d01 · Parent(s): b0c291c

audio

Files changed:
- app.py (+7 -5)
- scripts/clusterprosody.py (+41 -4)
- scripts/runSQ.py (+3 -3)
app.py CHANGED

@@ -33,9 +33,9 @@ setup()
 
 def f1(voices, sent, indices):
     #tts_audio, tts_score, graph = scripts.runSQ.run(sent, voices, indices)
-    tts_audio, tts_score, tts_fig_p, mid_fig_p, bad_fig_p, tts_fig_e, fig_mid_e, fig_bad_e = scripts.runSQ.run(sent, [voices], indices)
+    tts_audio, tts_score, tts_fig_p, mid_fig_p, bad_fig_p, tts_fig_e, fig_mid_e, fig_bad_e, audio_html = scripts.runSQ.run(sent, [voices], indices)
     score_report = f'Difference from TTS to real speech: {round(tts_score,2)}'
-    return (tts_audio, score_report, tts_fig_p, mid_fig_p, bad_fig_p, tts_fig_e, fig_mid_e, fig_bad_e)
+    return (tts_audio, score_report, tts_fig_p, mid_fig_p, bad_fig_p, tts_fig_e, fig_mid_e, fig_bad_e, audio_html)
 
 
 def label_indices(sentence):
@@ -89,13 +89,16 @@ with bl:
         with gr.Row():
             pl5 = gr.Plot()
             pl6 = gr.Plot()
-
+
+    with gr.TabItem("Audio"):
+
+        play = gr.HTML(label="Audio samples")
 
 
 
 
     temp_sentmenu.input(label_indices,temp_sentmenu,marked_sentence)
-    temp_button.click(f1,[voiceselect,temp_sentmenu,spanselect],[tts_output,report_score,pl1,pl2,pl3,pl4,pl5,pl6])
+    temp_button.click(f1,[voiceselect,temp_sentmenu,spanselect],[tts_output,report_score,pl1,pl2,pl3,pl4,pl5,pl6,play])
 
 
 if __name__ == "__main__":
@@ -108,4 +111,3 @@ if __name__ == "__main__":
 
 
 
-
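Note: the hunks above rely on Gradio mapping a handler's return tuple onto its outputs list by position, so the new ninth return value `audio_html` lands in the new ninth output component `play` (a `gr.HTML`). A minimal self-contained sketch of that pattern (component names and the sample path are illustrative, not from the app):

import gradio as gr

def render():
    # a gr.HTML output accepts any raw HTML string
    return '<audio controls><source src="sample.wav" type="audio/wav"></audio>'

with gr.Blocks() as demo:
    with gr.TabItem("Audio"):
        play = gr.HTML(label="Audio samples")
    btn = gr.Button("Run")
    btn.click(render, [], [play])  # return values map onto outputs by position

if __name__ == "__main__":
    demo.launch()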
scripts/clusterprosody.py CHANGED

@@ -137,6 +137,7 @@ def get_data(norm_sent,path_key,start_end_word_index):
 
     data = defaultdict(list)
     align_data = defaultdict(list)
+    playable_audio = {}
 
     for spk, pdict in path_key:
         word_al = word_aligns[spk]
@@ -158,8 +159,9 @@ def get_data(norm_sent,path_key,start_end_word_index):
         #words = "-".join(word_combs)
         data[f"{words}**{spk}"] = d
         align_data[f"{words}**{spk}"] = seg_aligns
+        playable_audio[spk] = (pdict['wav'], start_time, end_time)
 
-    return words, data, align_data
+    return words, data, align_data, playable_audio
 
 
 
@@ -274,7 +276,7 @@ def cluster(norm_sent,orig_sent,h_spk_ids, h_align_dir, h_f0_dir, h_wav_dir, tts
 
     h_all_paths = gen_h_paths(h_wav_dir,h_align_dir,h_f0_dir,h_spk_ids)
 
-    words, h_data, h_seg_aligns = get_data(norm_sent,h_all_paths,start_end_word_index)
+    words, h_data, h_seg_aligns, h_playable = get_data(norm_sent,h_all_paths,start_end_word_index)
 
     dtw_dists = pair_dists(h_data,words,h_spk_ids)
 
@@ -293,7 +295,7 @@ def cluster(norm_sent,orig_sent,h_spk_ids, h_align_dir, h_f0_dir, h_wav_dir, tts
 
 
     tts_all_paths = gen_tts_paths(tts_sent_dir, voices)
-    _, tts_data, tts_seg_aligns = get_data(norm_sent,tts_all_paths,start_end_word_index)
+    _, tts_data, tts_seg_aligns, tts_playable_segment = get_data(norm_sent,tts_all_paths,start_end_word_index)
 
     for v in voices:
         voice_data = tts_data[f"{words}**{v}"]
@@ -304,13 +306,48 @@ def cluster(norm_sent,orig_sent,h_spk_ids, h_align_dir, h_f0_dir, h_wav_dir, tts
         # match the data with a cluster -----
         best_cluster_score, tts_fig_p, fig_mid_p, fig_bad_p, tts_fig_e, fig_mid_e, fig_bad_e = match_tts(groups, h_data, voice_data, voice_align, words, h_seg_aligns,v)
 
+
+        audio_html = clusters_audio(groups,h_playable)
+
         # only supports one voice at a time currently
-        return best_cluster_score, tts_fig_p, fig_mid_p, fig_bad_p, tts_fig_e, fig_mid_e, fig_bad_e
+        return best_cluster_score, tts_fig_p, fig_mid_p, fig_bad_p, tts_fig_e, fig_mid_e, fig_bad_e, audio_html
         #return words, kmedoids_cluster_dists, group
 
 
 
 
+# generate html panel to play audios for each human cluster
+# audios is dict {recording_id : (wav_path, seg_start_time, seg_end_time)}
+def clusters_audio(clusters,audios):
+
+    html = '''<html><body>'''
+
+    for label in set([c for r,c in clusters]):
+        recs = [r for r,c in clusters if c==label]
+
+        html += '<div>'
+        html += f'<h2>Cluster {label}</h2>'
+
+        html += '<div>'
+        html += '<table><tbody>'
+
+        for rec in recs:
+            html += f'<tr><td><audio controls id="{rec}">' #width="20%">
+            html += f'<source src="{audios[rec][0]}#t={audios[rec][1]*60:.2f},{audios[rec][2]*60:.2f}" type="audio/wav">'
+            html += '</audio></td>'
+            html += f'<td>{rec}</td></tr>'
+
+        html += '</tbody></table>'
+        html += '</div>'
+        #html += '<div style="height:2%;background:#e7fefc"></div>'
+
+        html += '</div>'
+    html += '</body></html>'
+
+    return html
+
+
+
 # find offsets to visually align start of each word for speakers in cluster
 def reset_cluster_times(words,cluster_speakers,human_aligns,tts_align):
     words = words.split('_')
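Note: the `#t=` suffix built in `clusters_audio` is the Media Fragments URI syntax: appending `#t=start,end` (times in seconds) to a media URL tells the browser's <audio> element to play only that span of the file. The `*60` factor implies the tuples store segment times in minutes. A self-contained sketch of the same trick (`audio_span_html` is a hypothetical helper, not part of the repo):

# minimal sketch of the media-fragment trick used in clusters_audio
def audio_span_html(wav_path: str, start_min: float, end_min: float) -> str:
    # convert stored minute offsets to the seconds that #t= expects
    return (f'<audio controls>'
            f'<source src="{wav_path}#t={start_min*60:.2f},{end_min*60:.2f}" '
            f'type="audio/wav"></audio>')

print(audio_span_html("rec001.wav", 0.05, 0.12))
# -> <audio controls><source src="rec001.wav#t=3.00,7.20" type="audio/wav"></audio>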
scripts/runSQ.py CHANGED

@@ -38,11 +38,11 @@ def run(sentence, voices, start_end_word_ix):
     temp_tts_sample, tts_sent_dir = get_tts(sentence,voices,tts_dir,align_model_path)
 
     voices = [voices[0]] # TODO. now limit one voice at a time.
-    score, tts_fig_p, mid_fig_p, bad_fig_p, tts_fig_e, fig_mid_e, fig_bad_e = cl.cluster(norm_sentence, sentence, human_rec_ids, speech_aligns, speech_f0, speech_dir, tts_sent_dir, voices, start_end_word_ix)
+    score, tts_fig_p, mid_fig_p, bad_fig_p, tts_fig_e, fig_mid_e, fig_bad_e, html = cl.cluster(norm_sentence, sentence, human_rec_ids, speech_aligns, speech_f0, speech_dir, tts_sent_dir, voices, start_end_word_ix)
 
     # also stop forgetting duration.
 
-    return temp_tts_sample, score, tts_fig_p, mid_fig_p, bad_fig_p, tts_fig_e, fig_mid_e, fig_bad_e
+    return temp_tts_sample, score, tts_fig_p, mid_fig_p, bad_fig_p, tts_fig_e, fig_mid_e, fig_bad_e, html
 
 
 
@@ -281,7 +281,7 @@ def localtest():
 
     voices = [voices[0]] # TODO. now limit one voice at a time.
 
-    score, tts_fig_p, mid_fig_p, bad_fig_p, tts_fig_e, fig_mid_e, fig_bad_e = cl.cluster(norm_sentence, sentence, human_rec_ids, speech_aligns, speech_f0, speech_dir, tts_sent_dir, voices, start_end_word_ix)
+    score, tts_fig_p, mid_fig_p, bad_fig_p, tts_fig_e, fig_mid_e, fig_bad_e, html = cl.cluster(norm_sentence, sentence, human_rec_ids, speech_aligns, speech_f0, speech_dir, tts_sent_dir, voices, start_end_word_ix)
 
 
 
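Note: both call sites of `cl.cluster` unpack the widened tuple, and `run` forwards it on to `f1` in app.py, so three layers had to change in lockstep to add one field. A hedged sketch of an alternative that keeps callers stable when fields are added (all names illustrative, not from the repo):

from typing import Any, NamedTuple

class ClusterResult(NamedTuple):
    # hypothetical container for what cl.cluster returns as a flat tuple
    score: float
    tts_fig_p: Any
    fig_mid_p: Any
    fig_bad_p: Any
    tts_fig_e: Any
    fig_mid_e: Any
    fig_bad_e: Any
    audio_html: str

# cluster() would `return ClusterResult(score, ..., audio_html)`;
# run() forwards the result whole and f1 reads `res.audio_html`,
# so adding a field no longer breaks every unpacking call site.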