catiR committed on
Commit
8d1fcc3
·
1 Parent(s): 1efac6a

align word starts

Browse files
Files changed (2) hide show
  1. scripts/clusterprosody.py +35 -90
  2. scripts/runSQ.py +2 -1
scripts/clusterprosody.py CHANGED
@@ -42,7 +42,6 @@ def get_word_aligns(norm_sent, aln_paths):
42
 
43
 
44
 
45
- #TODO pass whole path
46
  def get_pitches(start_time, end_time, fpath):
47
  """
48
  Returns an array of pitch values for a given speech.
@@ -75,7 +74,6 @@ def get_pitches(start_time, end_time, fpath):
75
 
76
 
77
 
78
- # TODO take whole path
79
  # jcheng used energy from esps get_f0
80
  # get f0 says (?) :
81
  #The RMS value of each record is computed based on a 30 msec hanning
@@ -106,7 +104,7 @@ def downsample_rmse2pitch(rmse,pitch_len):
106
 
107
 
108
  # parse user input string to usable word indices for the sentence
109
- # TODO handle cases
110
  def parse_word_indices(start_end_word_index):
111
  ixs = start_end_word_index.split('-')
112
  if len(ixs) == 1:
@@ -300,69 +298,30 @@ def cluster(norm_sent,orig_sent,h_spk_ids, h_align_dir, h_f0_dir, h_wav_dir, tts
300
 
301
 
302
 
 
 
 
 
 
 
 
 
 
 
 
303
 
304
- # TODO there IS sth for making tts_data
305
- # but im probably p much on my own rlly for that.
306
-
307
-
308
-
309
- # TODO this one is v v helpful.
310
- # but mind if i adjusted a dictionaries earlier.
311
- def spks_all_cdist():
312
- speaker_to_tts_dtw_dists = defaultdict(list)
313
-
314
- for key1, value1 in data.items():
315
- d = key1.split("-")
316
- words1 = d[:-2]
317
- id1, id2 = d[-2], d[-1]
318
- for key2, value2 in tts_data.items():
319
- d = key2.split("-")
320
- words2 = d[:-2]
321
- id3, id4 = d[-2], d[-1]
322
- if all([w1 == w2 for w1, w2 in zip(words1, words2)]):
323
- speaker_to_tts_dtw_dists[f"{'-'.join(words1)}"].append((f"{id1}-{id2}_{id3}-{id4}", dtw_distance(value1, value2)))
324
- return speaker_to_tts_dtw_dists
325
-
326
-
327
-
328
- #TODO i think this is also gr8
329
- # but like figure out how its doing
330
- # bc dict format and stuff,
331
- # working keying by word index instead of word text, ***********
332
- # and for 1 wd or 3+ wd units...
333
- def tts_cdist():
334
- tts_dist_to_cluster = defaultdict(list)
335
-
336
- for words1, datas1 in kmedoids_cluster_dists.items():
337
- for d1 in datas1:
338
- cluster, sp_id1, arr = d1
339
- for words2, datas2 in speaker_to_tts_dtw_dists.items():
340
- for d2 in datas2:
341
- ids, dist = d2
342
- sp_id2, tts_alfur = ids.split("_")
343
- if sp_id1 == sp_id2 and words1 == words2:
344
- tts_dist_to_cluster[f"{words1}-{cluster}"].append(dist)
345
-
346
- tts_mean_dist_to_cluster = {
347
- key: np.mean(value) for key, value in tts_dist_to_cluster.items()
348
- }
349
- return tts_mean_dist_to_cluster
350
-
351
-
352
-
353
-
354
-
355
-
356
- # TODO check if anything uses this?
357
- def get_audio_part(start_time, end_time, id, path):
358
- """
359
- Returns a dictionary of RMSE values for a given sentence.
360
- """
361
-
362
- f = os.path.join(path, id + ".wav")
363
- audio, sr = librosa.load(f, sr=16000)
364
- segment = audio[int(np.floor(start_time * sr)):int(np.ceil(end_time * sr))]
365
- return segment
366
 
367
 
368
 
@@ -384,32 +343,26 @@ def plot_one_cluster(words,feature,speech_data,seg_aligns,cluster_id,tts_data=No
384
  print('problem with the figure')
385
  return fig
386
 
 
 
 
 
 
 
 
387
  plt.title(f"{words} - {fname} - Cluster {cluster_id}")
 
388
  for k,v in speech_data.items():
389
 
390
  spk = k.split('**')[1]
391
-
392
  word_times = seg_aligns[k]
393
 
394
-
395
  feats = ffunc(v)
396
  # datapoint interval is 0.005 seconds
397
  feat_xvals = [x*0.005 for x in range(len(feats))]
398
-
399
- # centre around the first word boundary -
400
- # if 3+ words, too bad.
401
- if len(word_times)>1:
402
- realign = np.mean([word_times[0][2],word_times[1][1]])
403
- feat_xvals = [x - realign for x in feat_xvals]
404
- word_times = [(w,s-realign,e-realign) for w,s,e in word_times]
405
- plt.axvline(x= 0, color="gray", linestyle='--', linewidth=1, label=f"{word_times[0][0]} -> {word_times[1][0]} boundary")
406
-
407
- if len(word_times)>2:
408
- for i in range(1,len(word_times)-1):
409
- bound_line = np.mean([word_times[i][2],word_times[i+1][1]])
410
- plt.axvline(x=bound_line, color=colors[cc], linestyle='--', linewidth=1, label=f"Speaker {spk} -> {word_times[i+1][0]}")
411
-
412
  pfunc(feat_xvals, feats, color=colors[cc], label=f"Speaker {spk}")
 
413
  cc += 1
414
  if cc >= len(colors):
415
  cc=0
@@ -417,16 +370,8 @@ def plot_one_cluster(words,feature,speech_data,seg_aligns,cluster_id,tts_data=No
417
  if voice:
418
  tfeats = ffunc(tts_data)
419
  t_xvals = [x*0.005 for x in range(len(tfeats))]
 
420
 
421
- if len(tts_align)>1:
422
- realign = np.mean([tts_align[0][2],tts_align[1][1]])
423
- t_xvals = [x - realign for x in t_xvals]
424
- tts_align = [(w,s-realign,e-realign) for w,s,e in tts_align]
425
-
426
- if len(tts_align)>2:
427
- for i in range(1,len(tts_align)-1):
428
- bound_line = np.mean([tts_align[i][2],tts_align[i+1][1]])
429
- plt.axvline(x=bound_line, color="black", linestyle='--', linewidth=1, label=f"TTS -> {tts_align[i+1][0]}")
430
  pfunc(t_xvals, tfeats, color="black", label=f"TTS {voice}")
431
 
432
 
 
42
 
43
 
44
 
 
45
  def get_pitches(start_time, end_time, fpath):
46
  """
47
  Returns an array of pitch values for a given speech.
 
74
 
75
 
76
 
 
77
  # jcheng used energy from esps get_f0
78
  # get f0 says (?) :
79
  #The RMS value of each record is computed based on a 30 msec hanning
 
104
 
105
 
106
  # parse user input string to usable word indices for the sentence
107
+ # TODO handle more user input cases
108
  def parse_word_indices(start_end_word_index):
109
  ixs = start_end_word_index.split('-')
110
  if len(ixs) == 1:
 
298
 
299
 
300
 
301
+ # realign at the start of each word
302
+ # destroys pause information but overall more legible
303
+ def reset_cluster_times(words,cluster_speakers,human_aligns,tts_align):
304
+ words = words.split('_')
305
+ retimes = []
306
+ for i in range(len(words)):
307
+ starts = [human_aligns[spk][i][1] for spk in cluster_speakers]
308
+ if tts_align:
309
+ starts.append(tts_align[i][1])
310
+ retimes.append((words[i],max(starts)))
311
+ return retimes
312
 
313
+ def retime_speaker_xvals(retimes, speaker_aligns, speaker_xvals):
314
+ new_xvals = []
315
+ def xlim(x,i,retimes,speaker_aligns):
316
+ return (x < speaker_aligns[i+1][1]) if i+1<len(retimes) else True
317
+
318
+ for i in range(len(retimes)):
319
+ wd,st = retimes[i]
320
+ w,s,e = speaker_aligns[i]
321
+ xdiff = st-s
322
+ new_xvals += [x+xdiff for x in speaker_xvals if (x>= s) and xlim(x,i,retimes,speaker_aligns) ]
323
+
324
+ return [round(x,2) for x in new_xvals]
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
325
 
326
 
327
 
 
343
  print('problem with the figure')
344
  return fig
345
 
346
+
347
+ # boundary for start of each word
348
+ retimes = reset_cluster_times(words,list(speech_data.keys()),seg_aligns,tts_align)
349
+ if len(retimes)>1:
350
+ for w,bound_line in retimes:
351
+ plt.axvline(x=bound_line, color="gray", linestyle='--', linewidth=1, label=f'Start "{w}"')
352
+
353
  plt.title(f"{words} - {fname} - Cluster {cluster_id}")
354
+
355
  for k,v in speech_data.items():
356
 
357
  spk = k.split('**')[1]
 
358
  word_times = seg_aligns[k]
359
 
 
360
  feats = ffunc(v)
361
  # datapoint interval is 0.005 seconds
362
  feat_xvals = [x*0.005 for x in range(len(feats))]
363
+ feat_xvals = retime_speaker_xvals(retimes, word_times, feat_xvals)
 
 
 
 
 
 
 
 
 
 
 
 
 
364
  pfunc(feat_xvals, feats, color=colors[cc], label=f"Speaker {spk}")
365
+
366
  cc += 1
367
  if cc >= len(colors):
368
  cc=0
 
370
  if voice:
371
  tfeats = ffunc(tts_data)
372
  t_xvals = [x*0.005 for x in range(len(tfeats))]
373
+ t_xvals = retime_speaker_xvals(retimes, tts_align, t_xvals)
374
 
 
 
 
 
 
 
 
 
 
375
  pfunc(t_xvals, tfeats, color="black", label=f"TTS {voice}")
376
 
377
 
scripts/runSQ.py CHANGED
@@ -195,6 +195,7 @@ def setup_tts_sent(sentence,ttsdir,meta_path = 'tts_meta.tsv'):
195
  sentence = sentence.replace('\n',' ')
196
 
197
  with open(f'{ttsdir}{meta_path}','a+') as handle:
 
198
  tts_meta = handle.read().splitlines()
199
  tts_meta = [l.split('\t') for l in tts_meta]
200
 
@@ -223,7 +224,7 @@ def setup_tts_sent(sentence,ttsdir,meta_path = 'tts_meta.tsv'):
223
 
224
  def localtest():
225
  sentence = 'Ef svo er, hvað heita þau þá?'#'Var það ekki nóg?'
226
- voices = ['Alfur'] #,'Dilja']
227
  # make for now the interface allows max one voice
228
 
229
  start_end_word_ix = '5-7'
 
195
  sentence = sentence.replace('\n',' ')
196
 
197
  with open(f'{ttsdir}{meta_path}','a+') as handle:
198
+ handle.seek(0)
199
  tts_meta = handle.read().splitlines()
200
  tts_meta = [l.split('\t') for l in tts_meta]
201
 
 
224
 
225
  def localtest():
226
  sentence = 'Ef svo er, hvað heita þau þá?'#'Var það ekki nóg?'
227
+ voices = ['Alfur_v2'] #,'Dilja']
228
  # make for now the interface allows max one voice
229
 
230
  start_end_word_ix = '5-7'