catiR committed on
Commit
53792d8
·
1 Parent(s): 779c244

run clustering

Browse files
Files changed (4) hide show
  1. app.py +25 -12
  2. scripts/clusterprosody.py +332 -227
  3. scripts/reaper2pass.py +18 -13
  4. scripts/runSQ.py +63 -25
app.py CHANGED
@@ -31,11 +31,17 @@ print('about to setup')
31
  setup()
32
 
33
 
34
- def f1(voices, sent):
35
- one_tts = scripts.runSQ.run(sent,voices)
36
- return (one_tts)
 
37
 
38
 
 
 
 
 
 
39
 
40
 
41
 
@@ -51,18 +57,25 @@ with bl:
51
  # i get everyone elses wavs tho
52
 
53
  with gr.Row():
54
- with gr.Column(scale=4):
55
- voiceselect = gr.CheckboxGroup(voices, label="TTS voice",value='Alfur')
56
- temp_sentmenu = gr.Dropdown(temp_sentences, label="Sentence")
57
- with gr.Column(scale=1):
58
- temp_button = gr.Button(value="A button")
59
-
60
-
61
- tts_output = gr.Audio(interactive=False)
62
 
 
 
 
 
 
63
 
64
 
65
- temp_button.click(f1,[voiceselect,temp_sentmenu],[tts_output])
 
 
 
 
 
66
 
67
 
68
  if __name__ == "__main__":
 
31
  setup()
32
 
33
 
34
+ def f1(voices, sent, indices):
35
+ tts_audio, tts_score, graph = scripts.runSQ.run(sent, voices, indices)
36
+ score_report = f'Difference from TTS to real speech: {round(tts_score,2)}'
37
+ return (tts_audio, score_report, graph)
38
 
39
 
40
+ def label_indices(sentence):
41
+ sentence = scripts.runSQ.snorm(sentence)
42
+ sentence = sentence.split(' ')
43
+ labelled = [(word, i) for i, word in enumerate(sentence)]
44
+ return labelled
45
 
46
 
47
 
 
57
  # i get everyone elses wavs tho
58
 
59
  with gr.Row():
60
+ #with gr.Column(scale=4):
61
+ temp_sentmenu = gr.Dropdown(temp_sentences, label="Sentence")
62
+ #voiceselect = gr.CheckboxGroup(voices, label="TTS voice",value='Alfur')
63
+
64
+ marked_sentence = gr.HighlightedText(interactive=False)
 
 
 
65
 
66
+ spanselect = gr.Textbox(value='1-3',info='Enter the index of the word(s) to analyse. It can be a single word: 4 or a span of words separated by a dash: 2-3')
67
+ voiceselect = gr.Radio(voices, label="TTS voice",value='Alfur')
68
+
69
+ #with gr.Column(scale=1):
70
+ temp_button = gr.Button(value="Run with selected options")
71
 
72
 
73
+ tts_output = gr.Audio(interactive=False)
74
+ report_score = gr.Markdown('Difference from TTS to real speech:')
75
+ pl1 = gr.Plot()
76
+
77
+ temp_sentmenu.input(label_indices,temp_sentmenu,marked_sentence)
78
+ temp_button.click(f1,[voiceselect,temp_sentmenu,spanselect],[tts_output,report_score,pl1])
79
 
80
 
81
  if __name__ == "__main__":
scripts/clusterprosody.py CHANGED
@@ -1,4 +1,6 @@
1
  import numpy as np
 
 
2
  import matplotlib.pyplot as plt
3
  import soundfile as sf
4
  from collections import defaultdict
@@ -15,26 +17,13 @@ import os, librosa, json
15
 
16
 
17
 
18
- # will need:
19
- # the whole sentence text (index, word) pairs
20
- # the indices of units the user wants
21
- # human meta db of all human recordings
22
- # tts dir, human wav + align + f0 dirs
23
- # list of tts voices
24
- # an actual wav file for each human rec, probably
25
- # params like: use f0, use rmse, (use dur), [.....]
26
- # .. check what i wrote anywhere abt this.
27
-
28
-
29
-
30
  def z_score(x, mean, std):
31
  return (x - mean) / std
32
 
33
 
34
 
35
- # TODO ADJUST
36
- # new input will be one Meta db
37
- # output should probably be the same, e.g.
38
  # {'013823-0457777': [('hvaða', 0.89, 1.35),
39
  # ('sjúkdómar', 1.35, 2.17),
40
  # ('geta', 2.17, 2.4),
@@ -53,82 +42,72 @@ def z_score(x, mean, std):
53
  # ('fylgt', 1.96, 2.27),
54
  # ('óbeinum', 2.27, 2.73),
55
  # ('reykingum', 2.73, 3.27)] }
56
- def get_word_aligns(sentences, directory):
 
 
57
  """
58
  Returns a dictionary of word alignments for a given sentence.
59
  """
60
  word_aligns = defaultdict(list)
61
 
62
- for sentence in sentences:
63
- print(sentence)
64
- slist = sentence.split(" ")
65
-
66
- for filename in os.listdir(directory):
67
- f = os.path.join(directory, filename)
68
-
69
- with open(f) as f:
70
- lines = f.read().splitlines()[1:]
71
- lines = [line.split(",") for line in lines]
72
- if len(lines) >= len(slist) and lines[0][2] == slist[0] and all([lines[i][2] == slist[i] for i, line in enumerate(slist)]):
73
- id = filename.replace(".csv", "")
74
- word_al = [(lines[j][2], float(lines[j][0]), float(lines[j][1])) for j, line in enumerate(slist)]
75
- # word_aligns[id].append(word_al) # If one speaker has multiple sentences
76
- word_aligns[id] = word_al
77
-
78
- if len(word_aligns) >= 10 * len(sentences): break
79
-
80
  return word_aligns
 
81
 
82
 
83
-
84
-
85
-
86
- # TODO ADJUST
87
- # or tbqh it is possibly fine as is
88
- # well, what file format is it reading.
89
- # either adjust my f0 file format or adjust this, a little.
90
  def get_pitches(start_time, end_time, id, path):
91
  """
92
  Returns an array of pitch values for a given speech.
 
93
  """
94
 
95
  f = os.path.join(path, id + ".f0")
96
  with open(f) as f:
97
- lines = f.read().splitlines()[7:]
98
  lines = [[float(x) for x in line.split()] for line in lines] # split lines into floats
99
  pitches = []
 
100
 
101
  # find the mean of all pitches in the whole sentence
102
- mean = np.mean([line[2] for line in lines if line[2] != -1])
103
  # find the std of all pitches in the whole sentence
104
- std = np.std([line[2] for line in lines if line[2] != -1])
 
105
 
106
- fifth_percentile = np.percentile([line[2] for line in lines if line[2] != -1], 5)
107
- ninetyfifth_percentile = np.percentile([line[2] for line in lines if line[2] != -1], 95)
108
-
109
  for line in lines:
110
- time, is_pitch, pitch = line
111
 
112
  if start_time <= time <= end_time:
113
  if is_pitch:
114
- if fifth_percentile <= pitch <= ninetyfifth_percentile:
115
- pitches.append(z_score(pitch, mean, std))
116
- elif pitch < fifth_percentile:
117
- pitches.append(z_score(fifth_percentile, mean, std))
118
- elif pitch > ninetyfifth_percentile:
119
- pitches.append(z_score(ninetyfifth_percentile, mean, std))
120
  else:
121
- pitches.append(z_score(fifth_percentile, mean, std))
 
122
 
123
  return pitches
124
 
125
 
126
 
127
-
128
- # TODO adjust
129
- # probably mainly for the assumption about filepath lol
130
- # but also then, comprehend it lol
131
- def get_rmse(start_time, end_time, id, path, pitch_len):
 
 
 
 
 
132
  """
133
  Returns an array of RMSE values for a given speech.
134
  """
@@ -136,75 +115,71 @@ def get_rmse(start_time, end_time, id, path, pitch_len):
136
  f = os.path.join(path, id + ".wav")
137
  audio, sr = librosa.load(f, sr=16000)
138
  segment = audio[int(np.floor(start_time * sr)):int(np.ceil(end_time * sr))]
139
- rmse = librosa.feature.rms(segment)
140
  rmse = rmse[0]
 
 
 
 
 
141
  idx = np.round(np.linspace(0, len(rmse) - 1, pitch_len)).astype(int)
142
  return rmse[idx]
 
143
 
144
-
145
-
146
-
147
- tEMP_start_end_word_pairs = [
148
- [("hvaða", "sjúkdómar"), ("geta", "fylgt"), ("óbeinum", "reykingum")],
149
- [("en", "af", "hverju"), ("skyldi", "vera"), ("svona", "mikið", "bull"), ("í", "stjórnmálum")],
150
- ]
151
-
152
-
153
- #TODO !!!!!!!!!!!!!########
154
- # make it take any list of (1stword, lastword) or (word)
155
- # units and do the thing for those units.
156
- # make it work if the sentence has 2 of the same word
157
- # PROBABLY this means i actually need to display the sentence
158
- # to the user with the words numbered,
159
- # and make the user input word indices.
160
- def get_data(word_aligns, start_end_word_pairs):
161
  """
162
  Returns a dictionary of pitch, rmse, and spectral centroids values for a given sentence/word combinations.
163
  """
164
 
165
- data = defaultdict(list)
166
- f0_dir = "aligned-reaper/samromur-queries/f0/"
167
- wav_dir = "aligned-reaper/samromur-queries/wav/"
168
 
 
 
 
 
 
 
169
  for id, word_al in word_aligns.items():
170
- for sent in start_end_word_pairs:
171
- for word_combs in sent:
172
- start, end = word_combs[0], word_combs[-1]
173
-
174
- if any(x[0] == start for x in word_al) and any(x[0] == end for x in word_al):
175
- start_time = [al[1] for al in word_al if al[0] == start][0]
176
- end_time = [al[2] for al in word_al if al[0] == end][0]
177
 
178
- pitches = get_pitches(start_time, end_time, id, f0_dir)
179
- rmses = get_rmse(start_time, end_time, id, wav_dir, len(pitches))
180
- spectral_centroids = get_spectral_centroids(start_time, end_time, id, wav_dir, len(pitches))
181
- pitches_cpy = np.array(deepcopy(pitches))
182
- rmses_cpy = np.array(deepcopy(rmses))
183
- d = [[p, r, s] for p, r, s in zip(pitches_cpy, rmses_cpy, spectral_centroids)]
184
- words = "-".join(word_combs)
185
- data[f"{words}-{id}"] = d
 
 
 
 
186
 
187
- return data
188
- # output -
189
- # {'hvaða-sjúkdómar-013823-0457777': [[-1.9923755532468812, 0.0027455997, -0.4325454395749879],
190
- # [-1.9923755532468812, 0.0027455997, -0.4325454395749879],
191
- # [-1.9923755532468812, 0.0027455997, -0.4325454395749879],
192
- # [-1.9923755532468812, 0.0027455997, -0.4325454395749879],
193
- # [-1.9923755532468812, 0.0033261522, -0.4428492071628255]],
194
- # 'geta-fylgt-013823-0457777': [[x,x,x],[x,x,x]],
195
- # 'hvaða-sjúkdómar-013726-0843679': [[],[]] }
196
- # e.g. it seems to be a flat dict whose keys are unique speaker&unit tokens
197
- # for which each entry is list len timepoints, at each timepoint dim feats (for me up to 2 not 3)
198
 
199
 
200
-
201
- # up to here was forming the data
202
- # -----------------------------------------------------
203
- # from here down is probably clustering it
204
-
205
-
206
-
207
- # TODO i have no idea how necessary this will be at all
208
  def dtw_distance(x, y):
209
  """
210
  Returns the DTW distance between two pitch sequences.
@@ -216,116 +191,224 @@ def dtw_distance(x, y):
216
 
217
 
218
 
219
- # TODO idk but it looks p good
220
- # HOWEVER consider exclude the 0 self-comparisons
221
- # or see if there is something later that takes care of them
222
- dtw_dists = defaultdict(list)
223
-
224
- for key1, value1 in data.items():
225
- d = key1.split("-")
226
- words1 = d[:-2]
227
- id1, id2 = d[-2], d[-1]
228
- for key2, value2 in data.items():
229
- d = key2.split("-")
230
- words2 = d[:-2]
231
- id3, id4 = d[-2], d[-1]
232
- if all([w1 == w2 for w1, w2 in zip(words1, words2)]):
233
- dtw_dists[f"{'-'.join(words1)}"].append((f"{id1}-{id2}_{id3}-{id4}", dtw_distance(value1, value2)))
234
-
235
- # dtw dists ends up as the dict from units to list of tuples
236
- # {'hvaða-sjúkdómar': [('013823-0457777_013823-0457777', 0.0),
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
237
  # ('013823-0457777_013698-0441666', 0.5999433281203399),
238
  # ('013823-0457777_014675-0563760', 0.4695447105594414),
239
  # ('014226-0508808_013823-0457777', 0.44080874425223393),
240
  # ('014226-0508808_014226-0508808', 0.0),
241
  # ('014226-0508808_013726-0843679', 0.5599404672667414),
242
- # ('014226-0508808_013681-0442313', 0.6871330752342419)] }
243
- # note that currently the 0 self-comparisons are present here so
 
 
244
 
245
 
246
 
247
  # TODO
248
- # a) do i need this?
249
- # b) make n_clusters a param with default 3
250
  def kmedoids_clustering(X):
251
  kmedoids = KMedoids(n_clusters=3, random_state=0).fit(X)
252
  y_km = kmedoids.labels_
253
  return y_km, kmedoids
254
 
255
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
256
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
257
 
 
 
 
258
 
259
- # TODO !!!!!!!!!!!! #########
260
- # THIS IS LIKE THE MAIN THINGS probably
261
- # ok ya it can probably use some restructurings
262
- # like i can make something make ids_dist2 format already earlier.
263
- # also triplecheck what kind of distancematrix is supposed to go into X
264
- # and what currently is it
265
- # although ok i think it might be, and self-organising,
266
- # and why it keeps the 0s and has symmetric doubles of everything.
267
- # HOWEVER the 10 should possibly be replaced with nspeakers param ?!?!??
268
 
269
 
270
- # btw since i guess clustering strictly operates on X,
271
- # once i reduce whatever duration thing down to pair-distances,
272
- # it no longer matters that duration and pitch/energy had different dimensionality...
273
- # .... in fact should i actually dtw on 3 feats pitch/ener/dur separately and er cluster on
274
- # 3dim distance mat? or can u not give it distances in multidim space bc distance doesnt do that
275
- # in which case i could still, u kno, average the 3 distances into 1 x, altho..
276
 
277
- kmedoids_cluster_dists = defaultdict(list)
 
 
 
 
 
 
278
 
279
- for words, datas in dtw_dists.items():
280
- ids_dist = {d[0]: d[1] for d in datas}
281
 
282
- ids_dist2 = defaultdict(list)
 
283
 
284
- for d in datas:
285
- id1, id2 = d[0].split("_")
286
- ids_dist2[id1].append(d[1])
287
 
288
- X = [d[1] for d in datas]
289
- X = [X[i:i+10] for i in range(0, len(X), 10)]
 
 
 
 
290
  X = np.array(X)
 
291
  y_km, kmedoids = kmedoids_clustering(X)
292
- plot_clusters(X, y_km, words)
293
-
294
- c1, c2, c3 = [X[np.where(kmedoids.labels_ == i)] for i in range(3)]
295
 
296
  result = zip(X, kmedoids.labels_)
297
- sortedR = sorted(result, key=lambda x: x[1])
298
 
299
- for dp in sortedR:
300
- arr, label = dp
301
- ids = next((k for k, v in ids_dist2.items() if np.array_equal(v, arr)), None)
 
 
 
 
 
302
 
303
- if ids is None:
304
- print("ID is none")
305
- continue
306
 
307
- kmedoids_cluster_dists[words].append((label, ids, arr))
308
 
309
- # TODO probably remember to make it RETURN kmedoids_cluster_dists ..
310
 
311
 
 
 
312
 
313
 
314
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
315
 
316
 
317
 
318
- # ###############
319
- # TTS and misc ------------------
320
- #
 
 
 
 
321
 
 
 
 
 
 
 
 
 
 
322
 
323
- # TODO rename this get_audio_part
324
- # also maybe take that tmp wav-making out of reaper and put it somewhere general.
325
- # so everything gets a wav.
326
- # TODO do NOT specify SR
327
- # and CHECK if everything that depends on this is ok with arbitrary SR
328
- def get_audio(start_time, end_time, id, path):
 
 
 
 
 
 
329
  """
330
  Returns a dictionary of RMSE values for a given sentence.
331
  """
@@ -337,65 +420,77 @@ def get_audio(start_time, end_time, id, path):
337
 
338
 
339
 
340
- # see near end of notebook for v nice way to grab timespans of tts audio
341
- # (or just the start/end timestamps to mark them) from alignment json
342
- # based on word position index -
343
- # so probably really do show user the sentence with each word numbered.
344
 
 
 
 
 
 
 
345
 
 
346
 
347
- # TODO the speech_marks.json is NOT EXACTLY what u get from tiro
348
- # but idr how different, so.
349
- alfur_sents = speech_marks_data["Alfur"]
350
- with open("speech_marks.json") as f:
351
- speech_marks_data = json.load(f)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
352
 
 
 
 
 
 
 
 
 
 
 
 
 
 
353
 
354
 
 
 
 
355
 
 
356
 
357
- # TODO there IS sth for making tts_data
358
- # but im probably p much on my own rlly for that.
359
 
360
 
361
- # TODO this one is v v helpful.
362
- # but mind if i adjusted a dictionaries earlier.
363
- speaker_to_tts_dtw_dists = defaultdict(list)
364
 
365
- for key1, value1 in data.items():
366
- d = key1.split("-")
367
- words1 = d[:-2]
368
- id1, id2 = d[-2], d[-1]
369
- for key2, value2 in tts_data.items():
370
- d = key2.split("-")
371
- words2 = d[:-2]
372
- id3, id4 = d[-2], d[-1]
373
- if all([w1 == w2 for w1, w2 in zip(words1, words2)]):
374
- speaker_to_tts_dtw_dists[f"{'-'.join(words1)}"].append((f"{id1}-{id2}_{id3}-{id4}", dtw_distance(value1, value2)))
375
 
376
 
377
- #TODO i think this is also gr8
378
- # but like figure out how its doing
379
- # bc dict format and stuff,
380
- # working keying by word index instead of word text, ***********
381
- # and for 1 wd or 3+ wd units...
382
- tts_dist_to_cluster = defaultdict(list)
383
 
384
- for words1, datas1 in kmedoids_cluster_dists.items():
385
- for d1 in datas1:
386
- cluster, sp_id1, arr = d1
387
- for words2, datas2 in speaker_to_tts_dtw_dists.items():
388
- for d2 in datas2:
389
- ids, dist = d2
390
- sp_id2, tts_alfur = ids.split("_")
391
- if sp_id1 == sp_id2 and words1 == words2:
392
- tts_dist_to_cluster[f"{words1}-{cluster}"].append(dist)
393
 
394
- tts_mean_dist_to_cluster = {
395
- key: np.mean(value) for key, value in tts_dist_to_cluster.items()
396
- }
397
 
398
 
 
 
 
 
 
399
 
400
 
401
  # THEN there is -
@@ -416,10 +511,20 @@ tts_mean_dist_to_cluster = {
416
 
417
 
418
 
 
 
 
 
 
 
 
 
 
 
 
 
 
419
 
420
- # PLOTTING IS GOING TO BE A WHOLE NIGHTMare
421
- # that is just too bad
422
-
423
  def plot_clusters(X, y, word):
424
  u_labels = np.unique(y)
425
 
 
1
  import numpy as np
2
+ import matplotlib
3
+ matplotlib.use('Agg')
4
  import matplotlib.pyplot as plt
5
  import soundfile as sf
6
  from collections import defaultdict
 
17
 
18
 
19
 
 
 
 
 
 
 
 
 
 
 
 
 
20
  def z_score(x, mean, std):
21
  return (x - mean) / std
22
 
23
 
24
 
25
+
26
+ # output
 
27
  # {'013823-0457777': [('hvaða', 0.89, 1.35),
28
  # ('sjúkdómar', 1.35, 2.17),
29
  # ('geta', 2.17, 2.4),
 
42
  # ('fylgt', 1.96, 2.27),
43
  # ('óbeinum', 2.27, 2.73),
44
  # ('reykingum', 2.73, 3.27)] }
45
+
46
+ # takes a list of human SPEAKER IDS not the whole meta db
47
+ def get_word_aligns(rec_ids, norm_sent, aln_dir):
48
  """
49
  Returns a dictionary of word alignments for a given sentence.
50
  """
51
  word_aligns = defaultdict(list)
52
 
53
+ for rec in rec_ids:
54
+ slist = norm_sent.split(" ")
55
+ aln_path = os.path.join(aln_dir, f'{rec}.tsv')
56
+ with open(aln_path) as f:
57
+ lines = f.read().splitlines()
58
+ lines = [l.split('\t') for l in lines]
59
+ try:
60
+ assert len(lines) == len(slist)
61
+ word_aligns[rec] = [(w,float(s),float(e)) for w,s,e in lines]
62
+ except:
63
+ print(slist, lines, "<---- something didn't match")
 
 
 
 
 
 
 
64
  return word_aligns
65
+
66
 
67
 
 
 
 
 
 
 
 
68
  def get_pitches(start_time, end_time, id, path):
69
  """
70
  Returns an array of pitch values for a given speech.
71
+ Reads from .f0 file of Time, F0, IsVoiced
72
  """
73
 
74
  f = os.path.join(path, id + ".f0")
75
  with open(f) as f:
76
+ lines = f.read().splitlines()
77
  lines = [[float(x) for x in line.split()] for line in lines] # split lines into floats
78
  pitches = []
79
+
80
 
81
  # find the mean of all pitches in the whole sentence
82
+ mean = np.mean([line[1] for line in lines if line[2] != -1])
83
  # find the std of all pitches in the whole sentence
84
+ std = np.std([line[1] for line in lines if line[2] != -1])
85
+
86
 
 
 
 
87
  for line in lines:
88
+ time, pitch, is_pitch = line
89
 
90
  if start_time <= time <= end_time:
91
  if is_pitch:
92
+ pitches.append(z_score(pitch, mean, std))
 
 
 
 
 
93
  else:
94
+ #pitches.append(z_score(fifth_percentile, mean, std))
95
+ pitches.append(-0.99)
96
 
97
  return pitches
98
 
99
 
100
 
101
+ # jcheng used energy from esps get_f0
102
+ # get f0 says (?) :
103
+ #The RMS value of each record is computed based on a 30 msec hanning
104
+ #window with its left edge placed 5 msec before the beginning of the
105
+ #frame.
106
+ # jcheng z-scored the energys, per file.
107
+ # TODO: implement that. ?
108
+ # not sure librosa provides hamming window in rms function directly
109
+ # TODO handle audio that not originally .wav
110
+ def get_rmse(start_time, end_time, id, path):
111
  """
112
  Returns an array of RMSE values for a given speech.
113
  """
 
115
  f = os.path.join(path, id + ".wav")
116
  audio, sr = librosa.load(f, sr=16000)
117
  segment = audio[int(np.floor(start_time * sr)):int(np.ceil(end_time * sr))]
118
+ rmse = librosa.feature.rms(y=segment)
119
  rmse = rmse[0]
120
+ #idx = np.round(np.linspace(0, len(rmse) - 1, pitch_len)).astype(int)
121
+ return rmse#[idx]
122
+
123
+
124
+ def downsample_rmse2pitch(rmse,pitch_len):
125
  idx = np.round(np.linspace(0, len(rmse) - 1, pitch_len)).astype(int)
126
  return rmse[idx]
127
+
128
 
129
+
130
+ # parse user input string to usable word indices for the sentence
131
+ # TODO handle cases
132
+ def parse_word_indices(start_end_word_index):
133
+ ixs = start_end_word_index.split('-')
134
+ if len(ixs) == 1:
135
+ s = int(ixs[0])
136
+ e = int(ixs[0])
137
+ else:
138
+ s = int(ixs[0])
139
+ e = int(ixs[-1])
140
+ return s-1,e-1
141
+
142
+
143
+ # take any (1stword, lastword) or (word)
144
+ # unit and prepare data for that unit
145
+ def get_data(norm_sent,h_spk_ids, h_align_dir, h_f0_dir, h_wav_dir, start_end_word_index):
146
  """
147
  Returns a dictionary of pitch, rmse, and spectral centroids values for a given sentence/word combinations.
148
  """
149
 
150
+ s_ix, e_ix = parse_word_indices(start_end_word_index)
 
 
151
 
152
+ words = '_'.join(norm_sent.split(' ')[s_ix:e_ix+1])
153
+
154
+ word_aligns = get_word_aligns(h_spk_ids,norm_sent,h_align_dir)
155
+ data = defaultdict(list)
156
+ align_data = defaultdict(list)
157
+
158
  for id, word_al in word_aligns.items():
159
+ start_time = word_al[s_ix][1]
160
+ end_time = word_al[e_ix][2]
161
+
162
+ seg_aligns = word_al[s_ix:e_ix+1]
163
+ seg_aligns = [(w,round(s-start_time,2),round(e-start_time,2)) for w,s,e in seg_aligns]
 
 
164
 
165
+ pitches = get_pitches(start_time, end_time, id, h_f0_dir)
166
+
167
+ rmses = get_rmse(start_time, end_time, id, h_wav_dir)
168
+ rmses = downsample_rmse2pitch(rmses,len(pitches))
169
+ #spectral_centroids = get_spectral_centroids(start_time, end_time, id, wav_dir, len(pitches))
170
+
171
+ pitches_cpy = np.array(deepcopy(pitches))
172
+ rmses_cpy = np.array(deepcopy(rmses))
173
+ d = [[p, r] for p, r in zip(pitches_cpy, rmses_cpy)]
174
+ #words = "-".join(word_combs)
175
+ data[f"{words}**{id}"] = d
176
+ align_data[f"{words}**{id}"] = seg_aligns
177
 
178
+ return words, data, align_data
179
+
180
+
 
 
 
 
 
 
 
 
181
 
182
 
 
 
 
 
 
 
 
 
183
  def dtw_distance(x, y):
184
  """
185
  Returns the DTW distance between two pitch sequences.
 
191
 
192
 
193
 
194
+ # recs is a sorted list of rec IDs
195
+ # all recs/data contain the same words
196
+ # rec1 and rec2 can be the same
197
+ def pair_dists(data,words,recs):
198
+
199
+ dtw_dists = []
200
+
201
+ for rec1 in recs:
202
+ key1 = f'{words}**{rec1}'
203
+ val1 = data[key1]
204
+ for rec2 in recs:
205
+ key2 = f'{words}**{rec2}'
206
+ val2 = data[key2]
207
+ dtw_dists.append((f"{rec1}**{rec2}", dtw_distance(val1, val2)))
208
+
209
+ #for key1, value1 in data.items():
210
+ # d1 = key1.split("**")
211
+ # words1 = d1[0]
212
+ # if not words:
213
+ # words = words1
214
+ # spk1 = d1[1]
215
+ # for key2, value2 in data.items():
216
+ # d2 = key2.split("**")
217
+ # words2 = d2[0]
218
+ # spk2 = d2[1]
219
+ # if all([w0 == w2 for w0, w2 in zip(words.split('_'), words2.split('_'))]):
220
+ #dtw_dists[words1].append((f"{spk1}**{spk2}", dtw_distance(value1, value2)))
221
+ # dtw_dists.append((f"{spk1}**{spk2}", dtw_distance(value1, value2)))
222
+ return dtw_dists
223
+ # dtw dists is the dict from units to list of tuples
224
+ # or: now just the list not labelled with the unit.
225
+ # {'hvaða-sjúkdómar':
226
+ # [('013823-0457777_013823-0457777', 0.0),
227
  # ('013823-0457777_013698-0441666', 0.5999433281203399),
228
  # ('013823-0457777_014675-0563760', 0.4695447105594414),
229
  # ('014226-0508808_013823-0457777', 0.44080874425223393),
230
  # ('014226-0508808_014226-0508808', 0.0),
231
  # ('014226-0508808_013726-0843679', 0.5599404672667414),
232
+ # ('014226-0508808_013681-0442313', 0.6871330752342419)]
233
+ # }
234
+ # the 0-distance self-comparisons are present here
235
+ # along with both copies of symmetric Speaker1**Speaker2, Speaker2**Speaker1
236
 
237
 
238
 
239
  # TODO
240
+ # make n_clusters a param with default 3
 
241
  def kmedoids_clustering(X):
242
  kmedoids = KMedoids(n_clusters=3, random_state=0).fit(X)
243
  y_km = kmedoids.labels_
244
  return y_km, kmedoids
245
 
246
 
247
+ def get_tts_data(tdir,voice,start_end_word_index):
248
+ with open(f'{tdir}{voice}.json') as f:
249
+ speechmarks = json.load(f)
250
+ speechmarks = speechmarks['alignments']
251
+
252
+ sr=16000
253
+ tts_audio, _ = librosa.load(f'{tdir}{voice}.wav',sr=sr)
254
+
255
+ # TODO
256
+ # tts operates on punctuated version
257
+ # so clean this up instead of assuming it will work
258
+ s_ix, e_ix = parse_word_indices(start_end_word_index)
259
+
260
+ # TODO
261
+ # default speechmarks return word start time only -
262
+ # this cannot describe pauses #######
263
+ s_tts = speechmarks[s_ix]["time"]/1000
264
+ if e_ix+1 < len(speechmarks): #if user doesn't want final word, which has no end time mark,
265
+ e_tts = speechmarks[e_ix+1]["time"]/1000
266
+ tts_segment = tts_audio[int(np.floor(s_tts * sr)):int(np.ceil(e_tts * sr))]
267
+ else:
268
+ tts_segment = tts_audio[int(np.floor(s_tts * sr)):]
269
+ e_tts = len(tts_audio) / sr
270
+ # TODO not ideal as probably silence padding on end file?
271
+
272
+ tts_align = [(speechmarks[ix]["value"],speechmarks[ix]["time"]) for ix in range(s_ix,e_ix+1)]
273
+ tts_align = [(w,s/1000) for w,s in tts_align]
274
+ tts_align = [(w,round(s-s_tts,3)) for w,s in tts_align]
275
+
276
+ tts_f0 = get_pitches(s_tts, e_tts, voice, tdir)
277
+ tts_rmse = get_rmse(s_tts, e_tts, voice, tdir)
278
+ tts_rmse = downsample_rmse2pitch(tts_rmse,len(tts_f0))
279
+ t_pitches_cpy = np.array(deepcopy(tts_f0))
280
+ t_rmses_cpy = np.array(deepcopy(tts_rmse))
281
+ tts_data = [[p, r] for p, r in zip(t_pitches_cpy, t_rmses_cpy)]
282
+ return tts_data, tts_align
283
+
284
+
285
 
286
+ def match_tts(clusters, speech_data, tts_data, tts_align, words, seg_aligns, voice):
287
+
288
+ tts_info = []
289
+ for label in set([c for r,c in clusters]):
290
+ recs = [r for r,c in clusters if c==label]
291
+ dists = []
292
+ for rec in recs:
293
+ key = f'{words}**{rec}'
294
+ dists.append(dtw_distance(tts_data, speech_data[key]))
295
+ tts_info.append((label,np.nanmean(dists)))
296
+
297
+ tts_info = sorted(tts_info,key = lambda x: x[1])
298
+ best_cluster = tts_info[0][0]
299
+ best_cluster_score = tts_info[0][1]
300
+
301
+ matched_data = {f'{words}**{r}': speech_data[f'{words}**{r}'] for r,c in clusters if c==best_cluster}
302
 
303
+ # now do graphs of matched_data with tts_data
304
+ # and report best_cluster_score
305
+ fig = plot_pitch_tts(speech_data,tts_data, tts_align, words,seg_aligns,best_cluster,voice)
306
 
307
+ return best_cluster_score, fig
308
+
 
 
 
 
 
 
 
309
 
310
 
 
 
 
 
 
 
311
 
312
+ # since clustering strictly operates on X,
313
+ # once reduce a duration metric down to pair-distances,
314
+ # it no longer matters that duration and pitch/energy had different dimensionality
315
+ # TODO option to dtw on 3 feats pitch/ener/dur separately
316
+ # check if possible cluster with 3dim distance mat?
317
+ # or can it not take that input in multidimensional space
318
+ # then the 3 dists can still be averaged to flatten, if appropriately scaled
319
 
320
+ def cluster(norm_sent,orig_sent,h_spk_ids, h_align_dir, h_f0_dir, h_wav_dir, tts_dir, voices, start_end_word_index):
 
321
 
322
+ h_spk_ids = sorted(h_spk_ids)
323
+ nsents = len(h_spk_ids)
324
 
325
+ words, data, seg_aligns = get_data(norm_sent,h_spk_ids, h_align_dir, h_f0_dir, h_wav_dir, start_end_word_index)
 
 
326
 
327
+ dtw_dists = pair_dists(data,words,h_spk_ids)
328
+
329
+ kmedoids_cluster_dists = []
330
+
331
+ X = [d[1] for d in dtw_dists]
332
+ X = [X[i:i+nsents] for i in range(0, len(X), nsents)]
333
  X = np.array(X)
334
+
335
  y_km, kmedoids = kmedoids_clustering(X)
336
+ #plot_clusters(X, y_km, words)
337
+ #c1, c2, c3 = [X[np.where(kmedoids.labels_ == i)] for i in range(3)]
 
338
 
339
  result = zip(X, kmedoids.labels_)
340
+ groups = [[r,c] for r,c in zip(h_spk_ids,kmedoids.labels_)]
341
 
342
+
343
+ # tts: assume the first 64 chars of sentence are enough
344
+ tdir = f'{tts_dir}{orig_sent.replace(" ","_")[:65]}/'
345
+ for v in voices:
346
+ tts_data, tts_align = get_tts_data(tdir,v,start_end_word_index)
347
+
348
+ # match the data with a cluster -----
349
+ best_cluster_score, fig = match_tts(groups, data, tts_data, tts_align, words, seg_aligns,v)
350
 
351
+ # only supports one voice at a time currently
352
+ return best_cluster_score, fig
353
+ #return words, kmedoids_cluster_dists, groups
354
 
 
355
 
 
356
 
357
 
358
+ # TODO there IS sth for making tts_data
359
+ # but im probably p much on my own rlly for that.
360
 
361
 
362
 
363
+ # TODO this one is v v helpful.
364
+ # but mind if i adjusted a dictionaries earlier.
365
+ def spks_all_cdist():
366
+ speaker_to_tts_dtw_dists = defaultdict(list)
367
+
368
+ for key1, value1 in data.items():
369
+ d = key1.split("-")
370
+ words1 = d[:-2]
371
+ id1, id2 = d[-2], d[-1]
372
+ for key2, value2 in tts_data.items():
373
+ d = key2.split("-")
374
+ words2 = d[:-2]
375
+ id3, id4 = d[-2], d[-1]
376
+ if all([w1 == w2 for w1, w2 in zip(words1, words2)]):
377
+ speaker_to_tts_dtw_dists[f"{'-'.join(words1)}"].append((f"{id1}-{id2}_{id3}-{id4}", dtw_distance(value1, value2)))
378
+ return speaker_to_tts_dtw_dists
379
 
380
 
381
 
382
+ #TODO i think this is also gr8
383
+ # but like figure out how its doing
384
+ # bc dict format and stuff,
385
+ # working keying by word index instead of word text, ***********
386
+ # and for 1 wd or 3+ wd units...
387
+ def tts_cdist():
388
+ tts_dist_to_cluster = defaultdict(list)
389
 
390
+ for words1, datas1 in kmedoids_cluster_dists.items():
391
+ for d1 in datas1:
392
+ cluster, sp_id1, arr = d1
393
+ for words2, datas2 in speaker_to_tts_dtw_dists.items():
394
+ for d2 in datas2:
395
+ ids, dist = d2
396
+ sp_id2, tts_alfur = ids.split("_")
397
+ if sp_id1 == sp_id2 and words1 == words2:
398
+ tts_dist_to_cluster[f"{words1}-{cluster}"].append(dist)
399
 
400
+ tts_mean_dist_to_cluster = {
401
+ key: np.mean(value) for key, value in tts_dist_to_cluster.items()
402
+ }
403
+ return tts_mean_dist_to_cluster
404
+
405
+
406
+
407
+
408
+
409
+
410
+ # TODO check if anything uses this?
411
+ def get_audio_part(start_time, end_time, id, path):
412
  """
413
  Returns a dictionary of RMSE values for a given sentence.
414
  """
 
420
 
421
 
422
 
 
 
 
 
423
 
424
+ def plot_pitch_tts(speech_data,tts_data, tts_align,words,seg_aligns,cluster_id, voice):
425
+ colors = ["red", "green", "blue", "orange", "purple", "pink", "brown", "gray", "cyan"]
426
+ i = 0
427
+ fig = plt.figure(figsize=(6, 5))
428
+ plt.title(f"{words} - Pitch - Cluster {cluster_id}")
429
+ for k,v in speech_data.items():
430
 
431
+ spk = k.split('**')[1]
432
 
433
+ word_times = seg_aligns[k]
434
+
435
+ pitches = [p for p,e in v]
436
+ # datapoint interval is 0.005 seconds
437
+ pitch_xvals = [x*0.005 for x in range(len(pitches))]
438
+
439
+ # centre around the first word boundary -
440
+ # if 3+ words, too bad.
441
+ if len(word_times)>1:
442
+ realign = np.mean([word_times[0][2],word_times[1][1]])
443
+ pitch_xvals = [x - realign for x in pitch_xvals]
444
+ word_times = [(w,s-realign,e-realign) for w,s,e in word_times]
445
+ plt.axvline(x= 0, color="gray", linestyle='--', linewidth=1, label=f"{word_times[0][0]} -> {word_times[1][0]} boundary")
446
+
447
+ if len(word_times)>2:
448
+ for i in range(1,len(word_times)-1):
449
+ bound_line = np.mean([word_times[i][2],word_times[i+1][1]])
450
+ plt.axvline(x=bound_line, color=colors[i], linestyle='--', linewidth=1, label=f"Speaker {spk} -> {word_times[i+1][0]}")
451
+
452
+ plt.scatter(pitch_xvals, pitches, color=colors[i], label=f"Speaker {spk}")
453
+ i += 1
454
 
455
+ tpitches = [p for p,e in tts_data]
456
+ t_xvals = [x*0.005 for x in range(len(tpitches))]
457
+
458
+ if len(tts_align)>1:
459
+ realign = tts_align[1][1]
460
+ t_xvals = [x - realign for x in t_xvals]
461
+ tts_align = [(w,s-realign) for w,s in tts_align]
462
+
463
+ if len(tts_align)>2:
464
+ for i in range(2,len(tts_align)):
465
+ bound_line = tts_align[i][1]
466
+ plt.axvline(x=bound_line, color="black", linestyle='--', linewidth=1, label=f"TTS -> {tts_align[i][0]}")
467
+ plt.scatter(t_xvals, tpitches, color="black", label=f"TTS {voice}")
468
 
469
 
470
+ plt.legend()
471
+ #plt.show()
472
+
473
 
474
+ return fig
475
 
 
 
476
 
477
 
 
 
 
478
 
 
 
 
 
 
 
 
 
 
 
479
 
480
 
 
 
 
 
 
 
481
 
482
+ # want to:
483
+ # - find tts best cluster
484
+ # - find avg dist for tts in that cluster
485
+ # - find avg dist for any human to the rest of its cluster
 
 
 
 
 
486
 
 
 
 
487
 
488
 
489
+ # see near end of notebook for v nice way to grab timespans of tts audio
490
+ # (or just the start/end timestamps to mark them) from alignment json
491
+ # based on word position index -
492
+ # so probably really do show user the sentence with each word numbered.
493
+
494
 
495
 
496
  # THEN there is -
 
511
 
512
 
513
 
514
+ # will need:
515
+ # the whole sentence text (index, word) pairs
516
+ # the indices of units the user wants
517
+ # human meta db of all human recordings
518
+ # tts dir, human wav + align + f0 dirs
519
+ # list of tts voices
520
+ # an actual wav file for each human rec, probably
521
+ # params like: use f0, use rmse, (use dur), [.....]
522
+ # .. check.
523
+
524
+
525
+
526
+
527
 
 
 
 
528
  def plot_clusters(X, y, word):
529
  u_labels = np.unique(y)
530
 
scripts/reaper2pass.py CHANGED
@@ -27,8 +27,8 @@ def reaper_soundfile(sound_path, orig_filetype):
27
 
28
 
29
 
30
-
31
- def get_reaper(wav_path, maxf0='700', minf0='50', reaper_path = "REAPER/build/reaper"):
32
 
33
  f0_data = subprocess.run([reaper_path, "-i", wav_path, '-f', '/dev/stdout', '-x', maxf0, '-m', minf0, '-a'],capture_output=True).stdout
34
  #print('PLAIN:',f0_data)
@@ -38,7 +38,7 @@ def get_reaper(wav_path, maxf0='700', minf0='50', reaper_path = "REAPER/build/re
38
  #print(f0_data)
39
  f0_data = [l.split(' ') for l in f0_data]
40
  f0_data = [l for l in f0_data if len(l) == 3] # the last line or 2 lines are other info, different format
41
- f0_data = [ [float(t), float(f)] for t,v,f in f0_data if v=='1']
42
 
43
  return f0_data
44
 
@@ -49,15 +49,15 @@ def get_reaper(wav_path, maxf0='700', minf0='50', reaper_path = "REAPER/build/re
49
  # and write that to a text file.
50
  # alternate would be letting reaper write its own files
51
  # instead of capturing the stdout...
52
- def save_pitch(f0_data, save_path,hed=True):
53
  with open(save_path,'w') as handle:
54
  if hed:
55
- handle.write('TIME\tF0\n')
56
- handle.write(''.join([f'{t}\t{f}\n' for t,f in f0_data]))
57
 
58
 
59
  # 2 pass pitch estimation
60
- def estimate_pitch(sound_path):
61
 
62
  orig_ftype = sound_path.split('.')[-1]
63
  if orig_ftype == '.wav':
@@ -66,10 +66,10 @@ def estimate_pitch(sound_path):
66
  tmp_path = reaper_soundfile(sound_path, orig_ftype)
67
  wav_path = tmp_path
68
 
69
- print('REAPER FILE PATH:', wav_path)
70
 
71
- first_pass = get_reaper(wav_path)
72
- first_pass = [f for t,f in first_pass]
73
 
74
  q1 = np.quantile(first_pass,0.25)
75
  q3 = np.quantile(first_pass,0.75)
@@ -77,10 +77,15 @@ def estimate_pitch(sound_path):
77
  pfloor = 0.75 * q1
78
  pceil = 1.5 * q3
79
 
80
- second_pass = get_reaper(wav_path,maxf0 = str(round(pceil)), minf0 = str(round(pfloor)))
81
 
82
 
83
- if orig_ftype != '.wav':
84
- subprocess.run(["rm", tmp_path])
 
 
 
 
85
 
86
  return second_pass
 
 
27
 
28
 
29
 
30
+ # returns f0 data as list of Time, F0 if exists, voicing indicator
31
+ def get_reaper(wav_path, reaper_path, maxf0='700', minf0='50'):
32
 
33
  f0_data = subprocess.run([reaper_path, "-i", wav_path, '-f', '/dev/stdout', '-x', maxf0, '-m', minf0, '-a'],capture_output=True).stdout
34
  #print('PLAIN:',f0_data)
 
38
  #print(f0_data)
39
  f0_data = [l.split(' ') for l in f0_data]
40
  f0_data = [l for l in f0_data if len(l) == 3] # the last line or 2 lines are other info, different format
41
+ f0_data = [ [float(t), float(f), float(v)] for t,v,f in f0_data]
42
 
43
  return f0_data
44
 
 
49
  # and write that to a text file.
50
  # alternate would be letting reaper write its own files
51
  # instead of capturing the stdout...
52
+ def save_pitch(f0_data, save_path,hed=False):
53
  with open(save_path,'w') as handle:
54
  if hed:
55
+ handle.write('TIME\tF0\tVOICED\n')
56
+ handle.write(''.join([f'{t}\t{f}\t{v}\n' for t,f,v in f0_data]))
57
 
58
 
59
  # 2 pass pitch estimation
60
+ def estimate_pitch(sound_path,reaper_path = "REAPER/build/reaper"):
61
 
62
  orig_ftype = sound_path.split('.')[-1]
63
  if orig_ftype == '.wav':
 
66
  tmp_path = reaper_soundfile(sound_path, orig_ftype)
67
  wav_path = tmp_path
68
 
69
+ #print('REAPER FILE PATH:', wav_path)
70
 
71
+ first_pass = get_reaper(wav_path,reaper_path)
72
+ first_pass = [f for t,f,v in first_pass if float(v) ==1]
73
 
74
  q1 = np.quantile(first_pass,0.25)
75
  q3 = np.quantile(first_pass,0.75)
 
77
  pfloor = 0.75 * q1
78
  pceil = 1.5 * q3
79
 
80
+ second_pass = get_reaper(wav_path,reaper_path, maxf0 = str(round(pceil)), minf0 = str(round(pfloor)))
81
 
82
 
83
+ #if orig_ftype != '.wav':
84
+ # subprocess.run(["rm", tmp_path])
85
+ # don't remove it yet, need it for clustering too
86
+ # therefore, actually change so reaper2pass is called from inside clusterprosody
87
+ # before it wants to read the f0 file.
88
+ # TODO
89
 
90
  return second_pass
91
+
scripts/runSQ.py CHANGED
@@ -2,6 +2,10 @@ import os, unicodedata
2
  from scripts.ctcalign import aligner, wav16m
3
  from scripts.tapi import tiro
4
  from scripts.reaper2pass import estimate_pitch, save_pitch
 
 
 
 
5
 
6
  # given a Sentence string,
7
  # using a metadata file of SQ, // SQL1adult_metadata.tsv
@@ -9,7 +13,7 @@ from scripts.reaper2pass import estimate_pitch, save_pitch
9
  # report how many, or if 0.
10
 
11
 
12
- def run(sentence, voices):
13
  #sentence = 'hvaða sjúkdómar geta fylgt óbeinum reykingum'
14
  #voices = ['Alfur','Dilja','Karl', 'Dora']
15
  # On tts.tiro.is speech marks are only available
@@ -18,7 +22,7 @@ def run(sentence, voices):
18
 
19
  corpus_meta = '/home/user/app/human_data/SQL1adult10s_metadata.tsv'
20
  speech_dir = '/home/user/app/human_data/audio/squeries/'
21
- speech_aligns = '/home/user/app/human_data/aligns/squeries/'
22
  speech_f0 = '/home/user/app/human_data/f0/squeries/'
23
  align_model_path ="carlosdanielhernandezmena/wav2vec2-large-xlsr-53-icelandic-ep10-1000h"
24
 
@@ -31,24 +35,18 @@ def run(sentence, voices):
31
  if meta:
32
  align_human(meta,speech_aligns,speech_dir,align_model_path)
33
  f0_human(meta, speech_f0, speech_dir)
34
- #TODO cluster humans
35
- # input - meta, speech dir, human aligns dir, human f0 dir, any cluster params.
36
- # output maybe an object.
37
  if voices:
38
- temp_a_sample = get_tts(sentence,voices,tts_dir)
 
39
  f0_tts(sentence, voices, tts_dir)
40
-
41
- # by now, all the data to cluster and eval exists in the right place.
42
- # (after the last todo of saving pitch to disk instead of only list)
43
-
44
- # next, make a thing that does clustering.
45
- # its input is Meta + the paths to find wav, aln, f0 datas.
46
-
47
- # its output may as well actually be graphs lol
48
 
49
  # also stop forgetting duration.
50
 
51
- return temp_a_sample
52
 
53
 
54
  def snorm(s):
@@ -61,6 +59,7 @@ def snorm(s):
61
  # find all the recordings of a given sentence
62
  # listed in the corpus metadata.
63
  # sentence should be provided lowercase without punctuation
 
64
  def get_recordings(sentence, corpusdb):
65
  with open(corpusdb,'r') as handle:
66
  meta = handle.read().splitlines()
@@ -116,7 +115,7 @@ def align_human(meta,align_dir,speech_dir,model_path):
116
 
117
  # check if f0s exist for all of those files.
118
  # if not, warn, and make them with TODO reaper
119
- def f0_human(meta, f0_dir, speech_dir):
120
  no_f0 = []
121
 
122
  for rec in meta:
@@ -131,12 +130,12 @@ def f0_human(meta, f0_dir, speech_dir):
131
  for rec in no_f0:
132
  wav_path = f'{speech_dir}{rec[2]}'
133
  fpath = f0_dir + rec[2].replace('.wav','.f0')
134
- f0_data = estimate_pitch(wav_path)
135
  save_pitch(f0_data,fpath)
136
 
137
 
138
- print('2ND PASS PITCHES OF', fpath)
139
- print(f0_data)
140
 
141
 
142
  else:
@@ -163,6 +162,7 @@ def get_tts(sentence,voices,ttsdir):
163
  no_voice.append(v)
164
  if not temp_sample_path:
165
  temp_sample_path = wpath
 
166
 
167
  if no_voice:
168
  print(f'Need to generate TTS for {len(no_voice)} voices')
@@ -174,14 +174,14 @@ def get_tts(sentence,voices,ttsdir):
174
  else:
175
  print('TTS for all voices existed')
176
 
177
- return temp_sample_path
178
 
179
 
180
 
181
  # check if the TTS f0s exist
182
  # if not warn + make
183
  # TODO collapse functions
184
- def f0_tts(sentence, voices, ttsdir):
185
 
186
  # assume the first 64 chars of sentence are enough
187
  dpath = sentence.replace(' ','_')[:65]
@@ -198,11 +198,8 @@ def f0_tts(sentence, voices, ttsdir):
198
  for v in voices:
199
  wav_path = f'{ttsdir}{dpath}/{v}.wav'
200
  fpath = f'{ttsdir}{dpath}/{v}.f0'
201
- f0_data = estimate_pitch(wav_path)
202
  save_pitch(f0_data,fpath)
203
-
204
- print('2ND PASS PITCHES OF', fpath)
205
- print(f0_data)
206
 
207
  else:
208
  print('All TTS pitch trackings existed')
@@ -210,6 +207,47 @@ def f0_tts(sentence, voices, ttsdir):
210
 
211
 
212
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
213
 
214
  # https://colab.research.google.com/drive/1RApnJEocx3-mqdQC2h5SH8vucDkSlQYt?authuser=1#scrollTo=410ecd91fa29bc73
215
 
 
2
  from scripts.ctcalign import aligner, wav16m
3
  from scripts.tapi import tiro
4
  from scripts.reaper2pass import estimate_pitch, save_pitch
5
+ import scripts.clusterprosody as cl
6
+
7
+
8
+
9
 
10
  # given a Sentence string,
11
  # using a metadata file of SQ, // SQL1adult_metadata.tsv
 
13
  # report how many, or if 0.
14
 
15
 
16
+ def run(sentence, voices, start_end_word_ix):
17
  #sentence = 'hvaða sjúkdómar geta fylgt óbeinum reykingum'
18
  #voices = ['Alfur','Dilja','Karl', 'Dora']
19
  # On tts.tiro.is speech marks are only available
 
22
 
23
  corpus_meta = '/home/user/app/human_data/SQL1adult10s_metadata.tsv'
24
  speech_dir = '/home/user/app/human_data/audio/squeries/'
25
+ speech_aligns = '/home/user/app/human_data/align/squeries/'
26
  speech_f0 = '/home/user/app/human_data/f0/squeries/'
27
  align_model_path ="carlosdanielhernandezmena/wav2vec2-large-xlsr-53-icelandic-ep10-1000h"
28
 
 
35
  if meta:
36
  align_human(meta,speech_aligns,speech_dir,align_model_path)
37
  f0_human(meta, speech_f0, speech_dir)
38
+ human_rec_ids = sorted([l[2].split('.wav')[0] for l in meta])
39
+
 
40
  if voices:
41
+ voices = [voices[0]] # TODO. now limit one voice at a time.
42
+ tts_sample, tts_speechmarks = get_tts(sentence,voices,tts_dir)
43
  f0_tts(sentence, voices, tts_dir)
44
+
45
+ score, fig = cl.cluster(norm_sentence, sentence, human_rec_ids, speech_aligns, speech_f0, speech_dir, tts_dir, voices, start_end_word_ix)
 
 
 
 
 
 
46
 
47
  # also stop forgetting duration.
48
 
49
+ return tts_sample, score, fig
50
 
51
 
52
  def snorm(s):
 
59
  # find all the recordings of a given sentence
60
  # listed in the corpus metadata.
61
  # sentence should be provided lowercase without punctuation
62
+ # TODO something not fatal to interface if <10
63
  def get_recordings(sentence, corpusdb):
64
  with open(corpusdb,'r') as handle:
65
  meta = handle.read().splitlines()
 
115
 
116
  # check if f0s exist for all of those files.
117
  # if not, warn, and make them with TODO reaper
118
+ def f0_human(meta, f0_dir, speech_dir, reaper_path = "REAPER/build/reaper"):
119
  no_f0 = []
120
 
121
  for rec in meta:
 
130
  for rec in no_f0:
131
  wav_path = f'{speech_dir}{rec[2]}'
132
  fpath = f0_dir + rec[2].replace('.wav','.f0')
133
+ f0_data = estimate_pitch(wav_path, reaper_path)
134
  save_pitch(f0_data,fpath)
135
 
136
 
137
+ #print('2ND PASS PITCHES OF', fpath)
138
+ #print(f0_data)
139
 
140
 
141
  else:
 
162
  no_voice.append(v)
163
  if not temp_sample_path:
164
  temp_sample_path = wpath
165
+ temp_json_path = jpath
166
 
167
  if no_voice:
168
  print(f'Need to generate TTS for {len(no_voice)} voices')
 
174
  else:
175
  print('TTS for all voices existed')
176
 
177
+ return temp_sample_path, temp_json_path
178
 
179
 
180
 
181
  # check if the TTS f0s exist
182
  # if not warn + make
183
  # TODO collapse functions
184
+ def f0_tts(sentence, voices, ttsdir, reaper_path = "REAPER/build/reaper"):
185
 
186
  # assume the first 64 chars of sentence are enough
187
  dpath = sentence.replace(' ','_')[:65]
 
198
  for v in voices:
199
  wav_path = f'{ttsdir}{dpath}/{v}.wav'
200
  fpath = f'{ttsdir}{dpath}/{v}.f0'
201
+ f0_data = estimate_pitch(wav_path, reaper_path)
202
  save_pitch(f0_data,fpath)
 
 
 
203
 
204
  else:
205
  print('All TTS pitch trackings existed')
 
207
 
208
 
209
 
210
+ def localtest():
211
+ sentence = 'Ef svo er, hvað heita þau þá?'#'Var það ekki nóg?'
212
+ voices = ['Alfur'] #,'Dilja']
213
+ # make for now the interface allows max one voice
214
+
215
+ start_end_word_ix = '5-7'
216
+
217
+ locl = '/home/caitlinr/work/peval/pce/'
218
+ corpus_meta = locl+'human_data/SQL1adult10s_metadata.tsv'
219
+ speech_dir = locl+'human_data/audio/squeries/'
220
+ speech_aligns = locl+'human_data/align/squeries/'
221
+ speech_f0 = locl+'human_data/f0/squeries/'
222
+ align_model_path ="/home/caitlinr/work/models/LVL/wav2vec2-large-xlsr-53-icelandic-ep10-1000h"
223
+
224
+ tts_dir = locl+'tts_data/'
225
+
226
+ reaper_exc = '/home/caitlinr/work/notterra/REAPER/build/reaper'
227
+
228
+ norm_sentence = snorm(sentence)
229
+ meta = get_recordings(norm_sentence, corpus_meta)
230
+ #print(meta)
231
+ if meta:
232
+ align_human(meta,speech_aligns,speech_dir,align_model_path)
233
+ f0_human(meta, speech_f0, speech_dir, reaper_path = reaper_exc )
234
+
235
+ human_rec_ids = sorted([l[2].split('.wav')[0] for l in meta])
236
+
237
+ if voices:
238
+ voices = [voices[0]] # TODO. now limit one voice at a time.
239
+ audio_sample, speechmarks = get_tts(sentence,voices,tts_dir)
240
+ f0_tts(sentence, voices, tts_dir, reaper_path = reaper_exc)
241
+
242
+
243
+ score, fig = cl.cluster(norm_sentence, sentence, human_rec_ids, speech_aligns, speech_f0, speech_dir, tts_dir, voices, start_end_word_ix)
244
+
245
+
246
+
247
+ #localtest()
248
+ # torch matplotlib librosa sklearn_extra pydub
249
+ # env pclustr
250
+
251
 
252
  # https://colab.research.google.com/drive/1RApnJEocx3-mqdQC2h5SH8vucDkSlQYt?authuser=1#scrollTo=410ecd91fa29bc73
253