Spaces:

clr
/

pce

Sleeping

App Files Files Community

catiR committed on Oct 31, 2023

Commit

8d1fcc3

1 Parent(s): 1efac6a

align word starts

Browse files

Files changed (2) hide show

scripts/clusterprosody.py +35 -90
scripts/runSQ.py +2 -1

scripts/clusterprosody.py CHANGED Viewed

@@ -42,7 +42,6 @@ def get_word_aligns(norm_sent, aln_paths):
-#TODO pass whole path
 def get_pitches(start_time, end_time, fpath):
     """
     Returns an array of pitch values for a given speech.
@@ -75,7 +74,6 @@ def get_pitches(start_time, end_time, fpath):
-# TODO take whole path
 # jcheng used energy from esps get_f0
 # get f0 says (?) :
 #The RMS value of each record is computed based on a 30 msec hanning
@@ -106,7 +104,7 @@ def downsample_rmse2pitch(rmse,pitch_len):
 # parse user input string to usable word indices for the sentence
-# TODO handle cases
 def parse_word_indices(start_end_word_index):
     ixs = start_end_word_index.split('-')
     if len(ixs) == 1:
@@ -300,69 +298,30 @@ def cluster(norm_sent,orig_sent,h_spk_ids, h_align_dir, h_f0_dir, h_wav_dir, tts
-# TODO there IS sth for making tts_data
-# but im probably p much on my own rlly for that.
-# TODO this one is v v helpful.
-# but mind if i adjusted a dictionaries earlier.
-def spks_all_cdist():
-    speaker_to_tts_dtw_dists = defaultdict(list)
-    for key1, value1 in data.items():
-        d = key1.split("-")
-        words1 = d[:-2]
-        id1, id2 = d[-2], d[-1]
-        for key2, value2 in tts_data.items():
-            d = key2.split("-")
-            words2 = d[:-2]
-            id3, id4 = d[-2], d[-1]
-            if all([w1 == w2 for w1, w2 in zip(words1, words2)]):
-                speaker_to_tts_dtw_dists[f"{'-'.join(words1)}"].append((f"{id1}-{id2}_{id3}-{id4}", dtw_distance(value1, value2)))
-    return speaker_to_tts_dtw_dists
-#TODO i think this is also gr8
-# but like figure out how its doing
-# bc dict format and stuff,
-# working keying by word index instead of word text, ***********
-# and for 1 wd or 3+ wd units...
-def tts_cdist():
-    tts_dist_to_cluster = defaultdict(list)
-    for words1, datas1 in kmedoids_cluster_dists.items():
-        for d1 in datas1:
-            cluster, sp_id1, arr = d1
-            for words2, datas2 in speaker_to_tts_dtw_dists.items():
-                for d2 in datas2:
-                    ids, dist = d2
-                    sp_id2, tts_alfur = ids.split("_")
-                    if sp_id1 == sp_id2 and words1 == words2:
-                        tts_dist_to_cluster[f"{words1}-{cluster}"].append(dist)
-    tts_mean_dist_to_cluster = {
-        key: np.mean(value) for key, value in tts_dist_to_cluster.items()
-    }
-    return tts_mean_dist_to_cluster
-# TODO check if anything uses this?
-def get_audio_part(start_time, end_time, id, path):
-    """
-    Returns a dictionary of RMSE values for a given sentence.
-    """
-    f = os.path.join(path, id + ".wav")
-    audio, sr = librosa.load(f, sr=16000)
-    segment = audio[int(np.floor(start_time * sr)):int(np.ceil(end_time * sr))]
-    return segment
@@ -384,32 +343,26 @@ def plot_one_cluster(words,feature,speech_data,seg_aligns,cluster_id,tts_data=No
         print('problem with the figure')
         return fig
     plt.title(f"{words} - {fname} - Cluster {cluster_id}")
     for k,v in speech_data.items():
         spk = k.split('**')[1]
         word_times = seg_aligns[k]
         feats = ffunc(v)
         # datapoint interval is 0.005 seconds
         feat_xvals = [x*0.005 for x in range(len(feats))]
-        # centre around the first word boundary -
-        # if 3+ words, too bad.
-        if len(word_times)>1:
-            realign = np.mean([word_times[0][2],word_times[1][1]])
-            feat_xvals = [x - realign for x in feat_xvals]
-            word_times = [(w,s-realign,e-realign) for w,s,e in word_times]
-            plt.axvline(x= 0, color="gray", linestyle='--', linewidth=1, label=f"{word_times[0][0]} -> {word_times[1][0]} boundary")
-        if len(word_times)>2:
-            for i in range(1,len(word_times)-1):
-                bound_line = np.mean([word_times[i][2],word_times[i+1][1]])
-                plt.axvline(x=bound_line, color=colors[cc], linestyle='--', linewidth=1, label=f"Speaker {spk} -> {word_times[i+1][0]}")
         pfunc(feat_xvals, feats, color=colors[cc], label=f"Speaker {spk}")
         cc += 1
         if cc >= len(colors):
             cc=0
@@ -417,16 +370,8 @@ def plot_one_cluster(words,feature,speech_data,seg_aligns,cluster_id,tts_data=No
     if voice:
         tfeats = ffunc(tts_data)
         t_xvals = [x*0.005 for x in range(len(tfeats))]
-        if len(tts_align)>1:
-            realign = np.mean([tts_align[0][2],tts_align[1][1]])
-            t_xvals = [x - realign for x in t_xvals]
-            tts_align = [(w,s-realign,e-realign) for w,s,e in tts_align]
-        if len(tts_align)>2:
-            for i in range(1,len(tts_align)-1):
-                bound_line = np.mean([tts_align[i][2],tts_align[i+1][1]])
-                plt.axvline(x=bound_line, color="black", linestyle='--', linewidth=1, label=f"TTS -> {tts_align[i+1][0]}")
         pfunc(t_xvals, tfeats, color="black", label=f"TTS {voice}")

 def get_pitches(start_time, end_time, fpath):
     """
     Returns an array of pitch values for a given speech.
 # jcheng used energy from esps get_f0
 # get f0 says (?) :
 #The RMS value of each record is computed based on a 30 msec hanning
 # parse user input string to usable word indices for the sentence
+# TODO handle more user input cases
 def parse_word_indices(start_end_word_index):
     ixs = start_end_word_index.split('-')
     if len(ixs) == 1:
+# Realign at the start of each word: pick one shared start time per word across the cluster's speakers (plus TTS, when given).
+# Destroys pause information, but the overlaid plot is more legible overall.
+def reset_cluster_times(words,cluster_speakers,human_aligns,tts_align):  # returns a list of (word, shared_start) tuples, one per word
+    words = words.split('_')  # `words` arrives as a single '_'-joined string
+    retimes = []
+    for i in range(len(words)):
+        starts = [human_aligns[spk][i][1] for spk in cluster_speakers]  # element [1] of each alignment entry; presumably entries are (word, start, end) - confirm
+        if tts_align:
+            starts.append(tts_align[i][1])  # TTS start is only considered when a non-empty alignment was passed
+        retimes.append((words[i],max(starts)))  # the latest start among all voices becomes the common start for word i
+    return retimes
+def retime_speaker_xvals(retimes, speaker_aligns, speaker_xvals):  # shift one speaker's x values so each word begins at its shared start from `retimes`
+    new_xvals = []
+    def xlim(x,i,retimes,speaker_aligns):  # upper bound for word i: x must precede the next word's start; the last word has no upper bound
+        return (x < speaker_aligns[i+1][1]) if i+1<len(retimes) else True
+    for i in range(len(retimes)):
+        wd,st = retimes[i]  # st: shared start chosen for word i
+        w,s,e = speaker_aligns[i]  # s: this speaker's own start for word i
+        xdiff = st-s  # offset that moves this speaker's word i onto the shared start
+        new_xvals += [x+xdiff for x in speaker_xvals if (x>= s) and xlim(x,i,retimes,speaker_aligns) ]  # NOTE(review): x values before the first word's start are dropped - confirm the result still matches the feature array length
+    return [round(x,2) for x in new_xvals]  # NOTE(review): rounds to 2 decimals; inputs spaced 0.005 apart may collapse to equal values - confirm this is intended
         print('problem with the figure')
         return fig
+    # boundary for start of each word
+    retimes = reset_cluster_times(words,list(speech_data.keys()),seg_aligns,tts_align)
+    if len(retimes)>1:
+        for w,bound_line in retimes:
+            plt.axvline(x=bound_line, color="gray", linestyle='--', linewidth=1, label=f'Start "{w}"')
     plt.title(f"{words} - {fname} - Cluster {cluster_id}")
     for k,v in speech_data.items():
         spk = k.split('**')[1]
         word_times = seg_aligns[k]
         feats = ffunc(v)
         # datapoint interval is 0.005 seconds
         feat_xvals = [x*0.005 for x in range(len(feats))]
+        feat_xvals = retime_speaker_xvals(retimes, word_times, feat_xvals)
         pfunc(feat_xvals, feats, color=colors[cc], label=f"Speaker {spk}")
         cc += 1
         if cc >= len(colors):
             cc=0
     if voice:
         tfeats = ffunc(tts_data)
         t_xvals = [x*0.005 for x in range(len(tfeats))]
+        t_xvals = retime_speaker_xvals(retimes, tts_align, t_xvals)
         pfunc(t_xvals, tfeats, color="black", label=f"TTS {voice}")

scripts/runSQ.py CHANGED Viewed

@@ -195,6 +195,7 @@ def setup_tts_sent(sentence,ttsdir,meta_path = 'tts_meta.tsv'):
     sentence = sentence.replace('\n',' ')
     with open(f'{ttsdir}{meta_path}','a+') as handle:
         tts_meta = handle.read().splitlines()
         tts_meta = [l.split('\t') for l in tts_meta]
@@ -223,7 +224,7 @@ def setup_tts_sent(sentence,ttsdir,meta_path = 'tts_meta.tsv'):
 def localtest():
     sentence = 'Ef svo er, hvað heita þau þá?'#'Var það ekki nóg?'
-    voices = ['Alfur'] #,'Dilja']
     # make for now the interface allows max one voice
     start_end_word_ix = '5-7'

     sentence = sentence.replace('\n',' ')
     with open(f'{ttsdir}{meta_path}','a+') as handle:
+        handle.seek(0)
         tts_meta = handle.read().splitlines()
         tts_meta = [l.split('\t') for l in tts_meta]
 def localtest():
     sentence = 'Ef svo er, hvað heita þau þá?'#'Var það ekki nóg?'
+    voices = ['Alfur_v2'] #,'Dilja']
     # make for now the interface allows max one voice
     start_end_word_ix = '5-7'