catiR
committed on
Commit
·
e2901c5
1
Parent(s):
0d95ee8
run clustering
Browse files- scripts/runSQ.py +8 -36
scripts/runSQ.py
CHANGED
@@ -192,12 +192,19 @@ def f0_tts(sentence, voices, ttsdir, reaper_path = "REAPER/build/reaper"):
|
|
192 |
fpath = f'{ttsdir}{dpath}/{v}.f0'
|
193 |
if not os.path.exists(fpath):
|
194 |
no_f0.append(v)
|
195 |
-
|
|
|
|
|
|
|
196 |
if no_f0:
|
197 |
print(f'Need to estimate pitch for {len(no_f0)} voices')
|
198 |
for v in voices:
|
199 |
wav_path = f'{ttsdir}{dpath}/{v}.wav'
|
200 |
fpath = f'{ttsdir}{dpath}/{v}.f0'
|
|
|
|
|
|
|
|
|
201 |
f0_data = estimate_pitch(wav_path, reaper_path)
|
202 |
save_pitch(f0_data,fpath)
|
203 |
|
@@ -251,41 +258,6 @@ def localtest():
|
|
251 |
|
252 |
# https://colab.research.google.com/drive/1RApnJEocx3-mqdQC2h5SH8vucDkSlQYt?authuser=1#scrollTo=410ecd91fa29bc73
|
253 |
|
254 |
-
# CLUSTER the humans
|
255 |
-
# - read energy and pitch, to alignments
|
256 |
-
# - dtw based with selected chunking ? code should exist.
|
257 |
-
|
258 |
-
# ... experimental variants?
|
259 |
-
# ** 1 dimension at a time vs 2 on top of each other
|
260 |
-
# ** 25 points resampling (euclidean, kmeans, i guess....) vs all points dtw kmedoids
|
261 |
-
# +/or maybe some intermediate parts of that??? like 25 points dtw medoids particularly **
|
262 |
-
# --different normings for pitch? different settings for energy (tbqh i hope not too much?)
|
263 |
-
# TODO '''replacement with a constant low value''' ********
|
264 |
-
# errrrrrrrm duration?
|
265 |
-
# duration feature vector will have a different length than the others, BUT,
|
266 |
-
# besides the single clustering,,
|
267 |
-
# i SUPPOSE one could TRY assigning the phone's 'speech rate' value to every frame of the phone, so it doesn't change while the other 2 values do change.... like it would still VAGUELY represent that 2 people elongating the same vowel/syllable are doing similar things with duration while someone eliding that vowel is doing a different durational thing right there?
|
268 |
-
# might want to z-score this dimension across ALL speakers tho not within a speaker
|
269 |
-
# try doing it both ways at least. bc not sure to what extent i want absolute vs. relative rate info here.
|
270 |
-
#(note - unless chengs dur metric is of a kind where only rel makes sense in the first place. idr.)
|
271 |
-
|
272 |
-
|
273 |
-
|
274 |
-
|
275 |
-
# GRAPH the humans.
|
276 |
-
# - probably modify this code a bit to centre on boundary.
|
277 |
-
# - idk.
|
278 |
-
|
279 |
-
|
280 |
-
# TEST each TTS
|
281 |
-
# - structure its features
|
282 |
-
# - find its avg dist for each human cluster
|
283 |
-
# - find the lowest dist cluster
|
284 |
-
# - report the dist for i guess this and all clusters
|
285 |
-
# - GRAPH the tts with its best cluster
|
286 |
-
|
287 |
-
|
288 |
-
|
289 |
# EVALUATION
|
290 |
# - of the tts
|
291 |
# - of the method: consistency? coherency / interpretability of 'best' voice across different features; alt. ability to recover good & problematic features from a combined method if that is chosen as the best?
|
|
|
192 |
fpath = f'{ttsdir}{dpath}/{v}.f0'
|
193 |
if not os.path.exists(fpath):
|
194 |
no_f0.append(v)
|
195 |
+
|
196 |
+
ttt = subprocess.run(["ls", "-la", "ttsdir"], capture_output=True, text=True)
|
197 |
+
print('LS::', ttt.stdout)
|
198 |
+
|
199 |
if no_f0:
|
200 |
print(f'Need to estimate pitch for {len(no_f0)} voices')
|
201 |
for v in voices:
|
202 |
wav_path = f'{ttsdir}{dpath}/{v}.wav'
|
203 |
fpath = f'{ttsdir}{dpath}/{v}.f0'
|
204 |
+
|
205 |
+
print(wav_path)
|
206 |
+
print(fpath)
|
207 |
+
|
208 |
f0_data = estimate_pitch(wav_path, reaper_path)
|
209 |
save_pitch(f0_data,fpath)
|
210 |
|
|
|
258 |
|
259 |
# https://colab.research.google.com/drive/1RApnJEocx3-mqdQC2h5SH8vucDkSlQYt?authuser=1#scrollTo=410ecd91fa29bc73
|
260 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
261 |
# EVALUATION
|
262 |
# - of the tts
|
263 |
# - of the method: consistency? coherency / interpretability of 'best' voice across different features; alt. ability to recover good & problematic features from a combined method if that is chosen as the best?
|