catiR committed on
Commit
e2901c5
·
1 Parent(s): 0d95ee8

run clustering

Browse files
Files changed (1) hide show
  1. scripts/runSQ.py +8 -36
scripts/runSQ.py CHANGED
@@ -192,12 +192,19 @@ def f0_tts(sentence, voices, ttsdir, reaper_path = "REAPER/build/reaper"):
192
  fpath = f'{ttsdir}{dpath}/{v}.f0'
193
  if not os.path.exists(fpath):
194
  no_f0.append(v)
195
-
 
 
 
196
  if no_f0:
197
  print(f'Need to estimate pitch for {len(no_f0)} voices')
198
  for v in voices:
199
  wav_path = f'{ttsdir}{dpath}/{v}.wav'
200
  fpath = f'{ttsdir}{dpath}/{v}.f0'
 
 
 
 
201
  f0_data = estimate_pitch(wav_path, reaper_path)
202
  save_pitch(f0_data,fpath)
203
 
@@ -251,41 +258,6 @@ def localtest():
251
 
252
  # https://colab.research.google.com/drive/1RApnJEocx3-mqdQC2h5SH8vucDkSlQYt?authuser=1#scrollTo=410ecd91fa29bc73
253
 
254
- # CLUSTER the humans
255
- # - read energy and pitch, to alignments
256
- # - dtw based with selected chunking ? code should exist.
257
-
258
- # ... experimental variants?
259
- # ** 1 dimension at a time vs 2 on top of each other
260
- # ** 25 points resampling (euclidean, kmeans, i guess....) vs all points dtw kmediods
261
- # +/or maybe some intermediate parts of that??? like 25 points dtw medoids particularly **
262
- # --different normings for pitch? different settings for energy (tbqh i hope not too much?)
263
- # TODO '''replacement with a constant low value''' ********
264
- # errrrrrrrm duration?
265
- # duration feature vector will have a different length than the others, BUT,
266
- # besides the single clustering,,
267
- # i SUPPOSE one could TRY assigning the phone's 'speech rate' value to every frame of the phone, so it doesn't change while the other 2 values do change.... like it would still VAGUELY represent that 2 people elongating the same vowel/syllable are doing similar things with duration while someone eliding that vowel is doing a different durational thing right there?
268
- # might want to z-score this dimension across ALL speakers tho not within a speaker
269
- # try doing it both ways at least. bc not sure to what extent i want absolute vs. relative rate info here.
270
- #(note - unless chengs dur metric is of a kind where only rel makes sense in the first place. idr.)
271
-
272
-
273
-
274
-
275
- # GRAPH the humans.
276
- # - probably modify this code a bit to centre on boundary.
277
- # - idk.
278
-
279
-
280
- # TEST each TTS
281
- # - structure its features
282
- # - find its avg dist for each human cluster
283
- # - find the lowest dist cluster
284
- # - report the dist for i guess this and all clusters
285
- # - GRAPH the tts with its best cluster
286
-
287
-
288
-
289
  # EVALUATION
290
  # - of the tts
291
  # - of the method: consistency? coherency / interpretability of 'best' voice across different features; alt. ability to recover good & problematic features from a combined method if that is chosen as the best?
 
192
  fpath = f'{ttsdir}{dpath}/{v}.f0'
193
  if not os.path.exists(fpath):
194
  no_f0.append(v)
195
+
196
+ ttt = subprocess.run(["ls", "-la", "ttsdir"], capture_output=True, text=True)
197
+ print('LS::', ttt.stdout)
198
+
199
  if no_f0:
200
  print(f'Need to estimate pitch for {len(no_f0)} voices')
201
  for v in voices:
202
  wav_path = f'{ttsdir}{dpath}/{v}.wav'
203
  fpath = f'{ttsdir}{dpath}/{v}.f0'
204
+
205
+ print(wav_path)
206
+ print(fpath)
207
+
208
  f0_data = estimate_pitch(wav_path, reaper_path)
209
  save_pitch(f0_data,fpath)
210
 
 
258
 
259
  # https://colab.research.google.com/drive/1RApnJEocx3-mqdQC2h5SH8vucDkSlQYt?authuser=1#scrollTo=410ecd91fa29bc73
260
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
261
  # EVALUATION
262
  # - of the tts
263
  # - of the method: consistency? coherency / interpretability of 'best' voice across different features; alt. ability to recover good & problematic features from a combined method if that is chosen as the best?