catiR committed
Commit 99c2d01 · Parent(s): b0c291c
Files changed (3):
  1. app.py +7 -5
  2. scripts/clusterprosody.py +41 -4
  3. scripts/runSQ.py +3 -3
app.py CHANGED

@@ -33,9 +33,9 @@ setup()
 
 def f1(voices, sent, indices):
     #tts_audio, tts_score, graph = scripts.runSQ.run(sent, voices, indices)
-    tts_audio, tts_score, tts_fig_p, mid_fig_p, bad_fig_p, tts_fig_e, fig_mid_e, fig_bad_e = scripts.runSQ.run(sent, [voices], indices)
+    tts_audio, tts_score, tts_fig_p, mid_fig_p, bad_fig_p, tts_fig_e, fig_mid_e, fig_bad_e, audio_html = scripts.runSQ.run(sent, [voices], indices)
     score_report = f'Difference from TTS to real speech: {round(tts_score,2)}'
-    return (tts_audio, score_report, tts_fig_p, mid_fig_p, bad_fig_p, tts_fig_e, fig_mid_e, fig_bad_e)
+    return (tts_audio, score_report, tts_fig_p, mid_fig_p, bad_fig_p, tts_fig_e, fig_mid_e, fig_bad_e, audio_html)
 
 
 def label_indices(sentence):
@@ -89,13 +89,16 @@ with bl:
         with gr.Row():
             pl5 = gr.Plot()
             pl6 = gr.Plot()
-
+
+        with gr.TabItem("Audio"):
+
+            play = gr.HTML(label="Audio samples")
 
 
 
     temp_sentmenu.input(label_indices,temp_sentmenu,marked_sentence)
-    temp_button.click(f1,[voiceselect,temp_sentmenu,spanselect],[tts_output,report_score,pl1,pl2,pl3,pl4,pl5,pl6])
+    temp_button.click(f1,[voiceselect,temp_sentmenu,spanselect],[tts_output,report_score,pl1,pl2,pl3,pl4,pl5,pl6,play])
 
 
 if __name__ == "__main__":
@@ -108,4 +111,3 @@ if __name__ == "__main__":
 
 
 
-
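The app.py change follows the standard Gradio pattern for pushing raw HTML into the UI: declare a gr.HTML component inside a tab, then list that component among the click outputs. A minimal self-contained sketch of the pattern, assuming nothing about this app's real layout (the names demo and render_audio_panel and the sample wav path are illustrative only):

import gradio as gr

def render_audio_panel():
    # In the real app this HTML string comes back from scripts.runSQ.run(...)
    return '<audio controls><source src="sample.wav" type="audio/wav"></audio>'

with gr.Blocks() as demo:
    with gr.Tabs():
        with gr.TabItem("Audio"):
            play = gr.HTML(label="Audio samples")
    btn = gr.Button("Run")
    # the component object itself is the output target, as in the diff above
    btn.click(render_audio_panel, inputs=[], outputs=[play])

if __name__ == "__main__":
    demo.launch()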
scripts/clusterprosody.py CHANGED

@@ -137,6 +137,7 @@ def get_data(norm_sent,path_key,start_end_word_index):
 
     data = defaultdict(list)
     align_data = defaultdict(list)
+    playable_audio = {}
 
     for spk, pdict in path_key:
        word_al = word_aligns[spk]
@@ -158,8 +159,9 @@ def get_data(norm_sent,path_key,start_end_word_index):
        #words = "-".join(word_combs)
        data[f"{words}**{spk}"] = d
        align_data[f"{words}**{spk}"] = seg_aligns
+       playable_audio[spk] = (pdict['wav'], start_time, end_time)
 
-    return words, data, align_data
+    return words, data, align_data, playable_audio
 
 
 
@@ -274,7 +276,7 @@ def cluster(norm_sent,orig_sent,h_spk_ids, h_align_dir, h_f0_dir, h_wav_dir, tts
 
     h_all_paths = gen_h_paths(h_wav_dir,h_align_dir,h_f0_dir,h_spk_ids)
 
-    words, h_data, h_seg_aligns = get_data(norm_sent,h_all_paths,start_end_word_index)
+    words, h_data, h_seg_aligns, h_playable = get_data(norm_sent,h_all_paths,start_end_word_index)
 
     dtw_dists = pair_dists(h_data,words,h_spk_ids)
 
@@ -293,7 +295,7 @@ def cluster(norm_sent,orig_sent,h_spk_ids, h_align_dir, h_f0_dir, h_wav_dir, tts
 
 
     tts_all_paths = gen_tts_paths(tts_sent_dir, voices)
-    _, tts_data, tts_seg_aligns = get_data(norm_sent,tts_all_paths,start_end_word_index)
+    _, tts_data, tts_seg_aligns, tts_playable_segment = get_data(norm_sent,tts_all_paths,start_end_word_index)
 
     for v in voices:
        voice_data = tts_data[f"{words}**{v}"]
@@ -304,13 +306,48 @@ def cluster(norm_sent,orig_sent,h_spk_ids, h_align_dir, h_f0_dir, h_wav_dir, tts
        # match the data with a cluster -----
        best_cluster_score, tts_fig_p, fig_mid_p, fig_bad_p, tts_fig_e, fig_mid_e, fig_bad_e = match_tts(groups, h_data, voice_data, voice_align, words, h_seg_aligns,v)
 
+
+       audio_html = clusters_audio(groups,h_playable)
+
        # only supports one voice at a time currently
-       return best_cluster_score, tts_fig_p, fig_mid_p, fig_bad_p, tts_fig_e, fig_mid_e, fig_bad_e
+       return best_cluster_score, tts_fig_p, fig_mid_p, fig_bad_p, tts_fig_e, fig_mid_e, fig_bad_e, audio_html
        #return words, kmedoids_cluster_dists, group
 
 
 
 
+# generate html panel to play audios for each human cluster
+# audios is dict {recording_id : (wav_path, seg_start_time, seg_end_time)}
+def clusters_audio(clusters,audios):
+
+    html = '''<html><body>'''
+
+    for label in set([c for r,c in clusters]):
+        recs = [r for r,c in clusters if c==label]
+
+        html += '<div>'
+        html += f'<h2>Cluster {label}</h2>'
+
+        html += '<div>'
+        html += '<table><tbody>'
+
+        for rec in recs:
+            html += f'<tr><td><audio controls id="{rec}">' #width="20%">
+            html += f'<source src="{audios[rec][0]}#t={audios[rec][1]*60:.2f},{audios[rec][2]*60:.2f}" type="audio/wav">'
+            html += '</audio></td>'
+            html += f'<td>{rec}</td></tr>'
+
+        html += '</tbody></table>'
+        html += '</div>'
+        #html += '<div style="height:2%;background:#e7fefc"></div>'
+
+        html += '</div>'
+    html += '</body></html>'
+
+    return html
+
+
+
 # find offsets to visually align start of each word for speakers in cluster
 def reset_cluster_times(words,cluster_speakers,human_aligns,tts_align):
     words = words.split('_')
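The new clusters_audio helper relies on the W3C media-fragment syntax: appending #t=start,end to the <source> URL tells the browser to play only that slice of the wav. The *60 factor suggests the stored segment times are in minutes, converted to the seconds that #t= expects. A small sketch under that assumption (segment_source is a hypothetical helper, not part of this commit):

def segment_source(wav_path, start_min, end_min):
    # Media fragments (#t=start,end) take seconds, so convert from minutes.
    return f'{wav_path}#t={start_min*60:.2f},{end_min*60:.2f}'

# A segment from 0.05 to 0.10 minutes plays as seconds 3.00 to 6.00:
print(segment_source('rec001.wav', 0.05, 0.10))
# rec001.wav#t=3.00,6.00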
scripts/runSQ.py CHANGED

@@ -38,11 +38,11 @@ def run(sentence, voices, start_end_word_ix):
     temp_tts_sample, tts_sent_dir = get_tts(sentence,voices,tts_dir,align_model_path)
 
     voices = [voices[0]] # TODO. now limit one voice at a time.
-    score, tts_fig_p, mid_fig_p, bad_fig_p, tts_fig_e, fig_mid_e, fig_bad_e = cl.cluster(norm_sentence, sentence, human_rec_ids, speech_aligns, speech_f0, speech_dir, tts_sent_dir, voices, start_end_word_ix)
+    score, tts_fig_p, mid_fig_p, bad_fig_p, tts_fig_e, fig_mid_e, fig_bad_e, html = cl.cluster(norm_sentence, sentence, human_rec_ids, speech_aligns, speech_f0, speech_dir, tts_sent_dir, voices, start_end_word_ix)
 
     # also stop forgetting duration.
 
-    return temp_tts_sample, score, tts_fig_p, mid_fig_p, bad_fig_p, tts_fig_e, fig_mid_e, fig_bad_e
+    return temp_tts_sample, score, tts_fig_p, mid_fig_p, bad_fig_p, tts_fig_e, fig_mid_e, fig_bad_e, html
 
 
 
@@ -281,7 +281,7 @@ def localtest():
 
     voices = [voices[0]] # TODO. now limit one voice at a time.
 
-    score, tts_fig_p, mid_fig_p, bad_fig_p, tts_fig_e, fig_mid_e, fig_bad_e = cl.cluster(norm_sentence, sentence, human_rec_ids, speech_aligns, speech_f0, speech_dir, tts_sent_dir, voices, start_end_word_ix)
+    score, tts_fig_p, mid_fig_p, bad_fig_p, tts_fig_e, fig_mid_e, fig_bad_e, html = cl.cluster(norm_sentence, sentence, human_rec_ids, speech_aligns, speech_f0, speech_dir, tts_sent_dir, voices, start_end_word_ix)
 
 
 
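With this commit, both call sites unpack nine values instead of eight, with the cluster-audio HTML last. A caller-side sketch of the widened interface (the sentence, voice id, and word span are placeholder values, and the import assumes the repository layout above):

import scripts.runSQ

sentence = "An example sentence."   # placeholder input sentence
voice = "voice1"                    # placeholder TTS voice id
span = (0, 1)                       # placeholder start/end word indices

(tts_audio, score,
 tts_fig_p, mid_fig_p, bad_fig_p,
 tts_fig_e, fig_mid_e, fig_bad_e,
 audio_html) = scripts.runSQ.run(sentence, [voice], span)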