|
import os, unicodedata |
|
from scripts.ctcalign import aligner, wav16m |
|
from scripts.tapi import tiro |
|
from scripts.reaper2pass import estimate_pitch, save_pitch |
|
import scripts.clusterprosody as cl |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
def run(sentence, voices, start_end_word_ix):
    """Score one TTS voice against human recordings of the same sentence.

    Looks up the corpus recordings whose normalised text equals the
    normalised ``sentence``, makes sure word alignments and pitch tracks
    exist for them, synthesises the sentence with the first requested
    voice, and hands everything to ``cl.cluster``.

    Args:
        sentence: Sentence text as supplied by the caller (unnormalised).
        voices: Sequence of TTS voice names. Only the first entry is used.
        start_end_word_ix: Word-range selector forwarded unchanged to
            ``cl.cluster`` (``localtest`` passes a string such as '5-7').

    Returns:
        ``(tts_sample, score, fig)``: the path of the synthesised wav
        (``None`` when ``voices`` is empty) followed by the two values
        unpacked from ``cl.cluster``.
    """
    # Deployment layout; every directory path ends with '/' because the
    # helpers build file paths by plain string concatenation.
    corpus_meta = '/home/user/app/human_data/SQL1adult10s_metadata.tsv'
    speech_dir = '/home/user/app/human_data/audio/squeries/'
    speech_aligns = '/home/user/app/human_data/align/squeries/'
    speech_f0 = '/home/user/app/human_data/f0/squeries/'
    align_model_path = "carlosdanielhernandezmena/wav2vec2-large-xlsr-53-icelandic-ep10-1000h"

    tts_dir = '/home/user/app/tts_data/'

    norm_sentence = snorm(sentence)

    meta = get_recordings(norm_sentence, corpus_meta)
    if meta:
        align_human(meta, speech_aligns, speech_dir, align_model_path)
        f0_human(meta, speech_f0, speech_dir)

    # get_recordings returns [] for an unknown or under-represented sentence;
    # deriving the ids unconditionally keeps the name bound in that case.
    human_rec_ids = sorted(rec[2].split('.wav')[0] for rec in meta)

    # Stays None when no voice was requested, so the return statement
    # never hits an unbound local.
    tts_sample = None
    if voices:
        # Only one voice is processed per call.
        voices = [voices[0]]
        tts_sample, _tts_speechmarks = get_tts(sentence, voices, tts_dir)
        f0_tts(sentence, voices, tts_dir)

    # NOTE(review): localtest() unpacks four values from cl.cluster while
    # this function unpacks two - confirm the current return arity.
    score, fig = cl.cluster(
        norm_sentence,
        sentence,
        human_rec_ids,
        speech_aligns,
        speech_f0,
        speech_dir,
        tts_dir,
        voices,
        start_end_word_ix,
    )

    return tts_sample, score, fig
|
|
|
|
|
def snorm(s):
    """Normalise a sentence for matching against the corpus metadata.

    Lower-cases every character, drops characters whose Unicode general
    category starts with ``P`` (punctuation), and collapses runs of spaces
    into a single space. Leading and trailing spaces are not stripped.

    Args:
        s: Sentence text.

    Returns:
        The normalised string.
    """
    s = ''.join(
        c.lower() for c in s if not unicodedata.category(c).startswith('P')
    )
    # Built with a multiplication so the two-space needle cannot be
    # silently collapsed to one space (which would make the loop endless).
    double_space = ' ' * 2
    while double_space in s:
        s = s.replace(double_space, ' ')
    return s
|
|
|
|
|
def create_temp_sent_list(corpusdb='/home/user/app/human_data/SQL1adult10s_metadata.tsv'):
    """Return the sorted, de-duplicated values of column 3 of the corpus TSV.

    The first line of the file is treated as a header and skipped; empty
    lines are ignored.

    Args:
        corpusdb: Path of the tab-separated metadata file. Defaults to the
            deployment location.

    Returns:
        Sorted list of the distinct strings found in the fourth
        tab-separated column (presumably the sentence text as displayed -
        verify against the corpus header).
    """
    # Explicit encoding: the corpus holds non-ASCII text and must not
    # depend on the process locale (assumes the file is UTF-8).
    with open(corpusdb, 'r', encoding='utf-8') as handle:
        rows = handle.read().splitlines()
    return sorted({row.split('\t')[3] for row in rows[1:] if row})
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
def get_recordings(sentence, corpusdb, min_recordings=10):
    """Collect the corpus rows whose column 4 equals ``sentence``.

    Args:
        sentence: Text to match exactly against the fifth tab-separated
            column (callers pass the output of ``snorm``).
        corpusdb: Path of the tab-separated metadata file; its first line
            is treated as a header and skipped.
        min_recordings: Minimum number of matching rows required for the
            sentence to be usable.

    Returns:
        The matching rows, each a list of column strings, or ``[]`` when
        fewer than ``min_recordings`` rows match. A status line is printed
        in every case.
    """
    # Explicit encoding: matching non-ASCII text must not depend on the
    # process locale (assumes the file is UTF-8).
    with open(corpusdb, 'r', encoding='utf-8') as handle:
        rows = [line.split('\t') for line in handle.read().splitlines()[1:]]

    # Rows with fewer than five columns (e.g. blank lines) cannot match
    # and are skipped instead of raising IndexError.
    smeta = [row for row in rows if len(row) > 4 and row[4] == sentence]

    if len(smeta) >= min_recordings:
        print(f'{len(smeta)} recordings of sentence <{sentence}>')
        return smeta

    if not smeta:
        print('This sentence does not exist in the corpus')
    else:
        print(f'Under {min_recordings} copies of the sentence: skipping.')
    return []
|
|
|
|
|
|
|
|
|
|
|
def align_human(meta,align_dir,speech_dir,model_path):
    """Create word-alignment TSV files for recordings that lack one.

    Args:
        meta: Corpus rows; ``rec[2]`` is the wav file name and ``rec[4]``
            the text handed to the aligner (passed with ``is_normed=True``).
        align_dir: Directory prefix for the ``.tsv`` outputs; concatenated
            directly with the file name, so it must end with a separator.
        speech_dir: Directory prefix of the wav files (same convention).
        model_path: Model identifier or path given to ``aligner``.

    Returns:
        None. Alignment files are written as a side effect and a status
        line is printed.
    """
    word_separator = '|'
    blank_token = '[PAD]'

    def tsv_path(rec):
        # Alignment file sits next to its siblings, named after the wav.
        return align_dir + rec[2].replace('.wav','.tsv')

    pending = [rec for rec in meta if not os.path.exists(tsv_path(rec))]

    if not pending:
        print('All alignments existed')
        return

    print(f'Need to run alignment for {len(pending)} files')
    if not os.path.exists(align_dir):
        os.makedirs(align_dir)

    # The aligner is only constructed when there is work to do.
    caligner = aligner(model_path,word_separator,blank_token)
    for rec in pending:
        audio = wav16m(f'{speech_dir}{rec[2]}')
        alignment = caligner(audio,rec[4],is_normed=True)
        # One tab-separated line per entry returned by the aligner.
        lines = ['\t'.join([str(x) for x in row])+'\n' for row in alignment]
        with open(tsv_path(rec),'w') as handle:
            handle.write(''.join(lines))
|
|
|
|
|
|
|
|
|
|
|
def f0_human(meta, f0_dir, speech_dir, reaper_path = "REAPER/build/reaper"):
    """Create pitch-track files for recordings that lack one.

    Args:
        meta: Corpus rows; ``rec[2]`` is the wav file name.
        f0_dir: Directory prefix for the ``.f0`` outputs; concatenated
            directly with the file name, so it must end with a separator.
        speech_dir: Directory prefix of the wav files (same convention).
        reaper_path: Value forwarded to ``estimate_pitch`` (by its name and
            default, the REAPER executable).

    Returns:
        None. Pitch files are written through ``save_pitch`` and a status
        line is printed.
    """
    def f0_path(rec):
        # Pitch file is named after the wav it was estimated from.
        return f0_dir + rec[2].replace('.wav','.f0')

    pending = [rec for rec in meta if not os.path.exists(f0_path(rec))]

    if not pending:
        print('All speech pitch trackings existed')
        return

    print(f'Need to estimate pitch for {len(pending)} recordings')
    if not os.path.exists(f0_dir):
        os.makedirs(f0_dir)

    for rec in pending:
        pitch = estimate_pitch(f'{speech_dir}{rec[2]}', reaper_path)
        save_pitch(pitch,f0_path(rec))
|
|
|
|
|
|
|
|
|
|
|
|
|
def get_tts(sentence,voices,ttsdir):
    """Make sure TTS output exists for each voice and return the first one.

    Outputs live in ``{ttsdir}{dpath}/`` where ``dpath`` is the sentence
    with spaces replaced by underscores, truncated to 65 characters. A
    voice counts as cached only when both ``{voice}.wav`` and
    ``{voice}.json`` are present.

    Args:
        sentence: Sentence text to synthesise.
        voices: Sequence of TTS voice names.
        ttsdir: Directory prefix for TTS data; concatenated directly, so it
            must end with a separator.

    Returns:
        ``(wav_path, json_path)`` for the first voice in ``voices``, or
        ``('', '')`` when ``voices`` is empty.
    """
    dpath = sentence.replace(' ','_')[:65]
    voice_dir = f'{ttsdir}{dpath}'

    no_voice = []

    # Both default to '' so an empty voice list returns cleanly instead of
    # raising on an unbound name.
    temp_sample_path = ''
    temp_json_path = ''

    for v in voices:
        wpath = f'{voice_dir}/{v}.wav'
        jpath = f'{voice_dir}/{v}.json'
        if not (os.path.exists(wpath) and os.path.exists(jpath)):
            no_voice.append(v)
        # The sample returned to the caller is always the first voice,
        # whether it was cached or is about to be generated.
        if not temp_sample_path:
            temp_sample_path = wpath
            temp_json_path = jpath

    if no_voice:
        print(f'Need to generate TTS for {len(no_voice)} voices')
        os.makedirs(voice_dir, exist_ok=True)
        # Only the missing voices are synthesised; cached ones are kept.
        # Presumably tiro writes {voice}.wav and {voice}.json into the
        # save directory - verify against scripts.tapi.
        for v in no_voice:
            tiro(sentence,v,save=f'{voice_dir}/')
    else:
        print('TTS for all voices existed')

    return temp_sample_path, temp_json_path
|
|
|
|
|
|
|
|
|
|
|
|
|
def f0_tts(sentence, voices, ttsdir, reaper_path = "REAPER/build/reaper"):
    """Estimate pitch for the synthesised audio of a sentence.

    Files live in ``{ttsdir}{dpath}/`` where ``dpath`` is the sentence
    with spaces replaced by underscores, truncated to 65 characters.

    Args:
        sentence: Sentence text that was synthesised.
        voices: Sequence of TTS voice names.
        ttsdir: Directory prefix for TTS data; concatenated directly, so it
            must end with a separator.
        reaper_path: Value forwarded to ``estimate_pitch`` (by its name and
            default, the REAPER executable).

    Returns:
        None. Pitch files are written through ``save_pitch`` and a status
        line is printed.
    """
    dpath = sentence.replace(' ','_')[:65]
    base = f'{ttsdir}{dpath}/'

    missing = [v for v in voices if not os.path.exists(f'{base}{v}.f0')]

    if not missing:
        print('All TTS pitch trackings existed')
        return

    print(f'Need to estimate pitch for {len(missing)} voices')
    # NOTE(review): as soon as one voice lacks a pitch file, pitch is
    # re-estimated for every voice, not only the missing ones - confirm
    # whether that is intended.
    for v in voices:
        pitch = estimate_pitch(f'{base}{v}.wav', reaper_path)
        save_pitch(pitch,f'{base}{v}.f0')
|
|
|
|
|
|
|
|
|
def localtest():
    """Run the whole pipeline once against a developer's local data tree.

    Mirrors ``run`` with a fixed sentence, voice and word range, local
    directory paths, and an explicit REAPER executable. Nothing is
    returned; the results of ``cl.cluster`` are only bound to locals.
    """
    sentence = 'Ef svo er, hvað heita þau þá?'
    voices = ['Alfur']

    start_end_word_ix = '5-7'

    # Local checkout layout; directory paths end with '/' because the
    # helpers build file paths by plain string concatenation.
    locl = '/home/caitlinr/work/peval/pce/'
    corpus_meta = locl+'human_data/SQL1adult10s_metadata.tsv'
    speech_dir = locl+'human_data/audio/squeries/'
    speech_aligns = locl+'human_data/align/squeries/'
    speech_f0 = locl+'human_data/f0/squeries/'
    align_model_path = "/home/caitlinr/work/models/LVL/wav2vec2-large-xlsr-53-icelandic-ep10-1000h"

    tts_dir = locl+'tts_data/'

    reaper_exc = '/home/caitlinr/work/notterra/REAPER/build/reaper'

    norm_sentence = snorm(sentence)
    meta = get_recordings(norm_sentence, corpus_meta)

    if meta:
        align_human(meta, speech_aligns, speech_dir, align_model_path)
        f0_human(meta, speech_f0, speech_dir, reaper_path=reaper_exc)

    # get_recordings returns [] when the sentence is unusable; deriving the
    # ids unconditionally keeps the name bound in that case.
    human_rec_ids = sorted(rec[2].split('.wav')[0] for rec in meta)

    if voices:
        # Only one voice is processed per call.
        voices = [voices[0]]
        audio_sample, speechmarks = get_tts(sentence, voices, tts_dir)
        f0_tts(sentence, voices, tts_dir, reaper_path=reaper_exc)

    # NOTE(review): run() unpacks two values from cl.cluster while this
    # helper unpacks four - confirm the current return arity.
    score, tts_fig, mid_fig, bad_fig = cl.cluster(
        norm_sentence,
        sentence,
        human_rec_ids,
        speech_aligns,
        speech_f0,
        speech_dir,
        tts_dir,
        voices,
        start_end_word_ix,
    )
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|