import os, unicodedata, string, random |
from scripts.ctcalign import aligner, wav16m |
from scripts.tapi import tiro |
from scripts.reaper2pass import estimate_pitch, save_pitch |
import scripts.clusterprosody as cl |
def run(sentence, voices, start_end_word_ix):
    """Evaluate synthesised speech for one sentence against human recordings.

    Prepares alignment and pitch files for the human recordings of the
    sentence (``get_samromur_queries``) and for the requested TTS voices
    (``get_tts``), then passes everything to ``cl.cluster``.

    Args:
        sentence: Sentence text. It is normalised with ``snorm`` for the
            corpus lookup; tabs are replaced by spaces for the TTS side.
        voices: Non-empty sequence of TTS voice names. Every voice is
            processed by ``get_tts``, but only the first one is passed on
            to ``cl.cluster``.
        start_end_word_ix: Forwarded unchanged to ``cl.cluster``.

    Returns:
        An 8-tuple: the sample wav path returned by ``get_tts`` followed by
        the seven values returned by ``cl.cluster`` (score, three pitch
        figures, three energy figures, in the order ``cl.cluster`` yields
        them).

    Raises:
        ValueError: If ``voices`` is empty.
    """
    # Without a voice there is nothing to synthesise or return. Fail before
    # any alignment / pitch work is started instead of crashing on an
    # unbound local afterwards.
    if not voices:
        raise ValueError('run() requires at least one TTS voice')

    corpus_meta = '/home/user/app/human_data/SQL1adult10s_metadata.tsv'
    speech_dir = '/home/user/app/human_data/audio/squeries/'
    speech_aligns = '/home/user/app/human_data/align/squeries/'
    speech_f0 = '/home/user/app/human_data/f0/squeries/'
    align_model_path = "carlosdanielhernandezmena/wav2vec2-large-xlsr-53-icelandic-ep10-1000h"
    tts_dir = '/home/user/app/tts_data/'

    # Normalise before touching the raw text: the corpus lookup uses the
    # normalised form, the TTS side uses the (tab-free) original.
    norm_sentence = snorm(sentence)
    sentence = sentence.replace('\t', ' ')

    human_rec_ids = get_samromur_queries(
        norm_sentence, corpus_meta, speech_dir, speech_aligns,
        align_model_path, speech_f0)

    temp_tts_sample, tts_sent_dir = get_tts(sentence, voices, tts_dir, align_model_path)
    # Only one voice at a time is handed to the clustering step.
    voices = [voices[0]]

    score, tts_fig_p, mid_fig_p, bad_fig_p, tts_fig_e, fig_mid_e, fig_bad_e = cl.cluster(
        norm_sentence, sentence, human_rec_ids, speech_aligns, speech_f0,
        speech_dir, tts_sent_dir, voices, start_end_word_ix)

    return temp_tts_sample, score, tts_fig_p, mid_fig_p, bad_fig_p, tts_fig_e, fig_mid_e, fig_bad_e
def snorm(s):
    """Normalise a sentence: strip punctuation, lower-case, squeeze spaces.

    Every character whose Unicode general category starts with ``P``
    (punctuation) is removed, the remaining characters are lower-cased and
    runs of consecutive spaces are collapsed to a single space. Leading and
    trailing spaces are collapsed but not stripped.

    Args:
        s: Sentence text.

    Returns:
        The normalised string.
    """
    s = ''.join(c.lower() for c in s if not unicodedata.category(c).startswith('P'))
    # The pattern is built as an expression so it cannot be reduced to a
    # single space by whitespace mangling; replacing a single space with a
    # single space would loop forever on any string containing a space.
    double_space = ' ' * 2
    while double_space in s:
        s = s.replace(double_space, ' ')
    return s
def create_temp_sent_list():
    """Return the sorted, de-duplicated values of metadata column index 3.

    Reads the tab-separated corpus metadata file, skips its first line and
    collects the fourth field of every remaining line.

    Returns:
        A sorted list of the distinct values found.
    """
    metadata_path = '/home/user/app/human_data/SQL1adult10s_metadata.tsv'
    with open(metadata_path, 'r') as fh:
        rows = fh.read().splitlines()

    distinct = set()
    # rows[0] is skipped, as in the rest of this module's metadata readers.
    for row in rows[1:]:
        distinct.add(row.split('\t')[3])
    return sorted(distinct)
def align_file(wav_path, output_path, norm_sentence, word_aligner = None, model_path = "carlosdanielhernandezmena/wav2vec2-large-xlsr-53-icelandic-ep10-1000h"):
    """Align one recording to its transcript and write the result as TSV.

    Args:
        wav_path: Audio file to align; it is loaded through ``wav16m``.
        output_path: File that receives one tab-separated line per
            alignment row.
        norm_sentence: Transcript, passed to the aligner with
            ``is_normed=True``.
        word_aligner: Aligner to reuse. When falsy, a new one is created
            from ``model_path``.
        model_path: Model identifier or path handed to ``aligner``.

    Returns:
        The aligner that was used, so the caller can pass it to later calls
        instead of constructing a new one.
    """
    separator_token = '|'
    blank_token = '[PAD]'

    if not word_aligner:
        print('initiating forced alignment, can take some time...')
        word_aligner = aligner(model_path, separator_token, blank_token)

    alignment = word_aligner(wav16m(wav_path), norm_sentence, is_normed=True)

    # Render every row before the file is opened, so a failure while
    # formatting cannot leave a partially written file behind.
    rendered = ['\t'.join(map(str, row)) + '\n' for row in alignment]
    with open(output_path, 'w') as out:
        out.writelines(rendered)

    return word_aligner
def get_samromur_queries(sentence, corpusdb, speech_dir, align_dir, align_model_path, f0_dir, reaper_path = "REAPER/build/reaper"):
    """Find the recordings of a sentence and make sure their features exist.

    Reads the tab-separated metadata file ``corpusdb`` (first line skipped),
    keeps the rows whose field 4 equals ``sentence`` and, for each of them,
    creates the alignment (``.tsv``) and pitch (``.f0``) files if they are
    not already on disk.

    Args:
        sentence: Sentence to look up; compared verbatim with field 4.
        corpusdb: Path of the metadata TSV. Field 2 holds the wav file name.
        speech_dir: Prefix prepended to the wav file name.
        align_dir: Prefix for alignment files; created if missing.
        align_model_path: Model passed to ``align_file``.
        f0_dir: Prefix for pitch files; created if missing.
        reaper_path: Passed to ``estimate_pitch``.

    Returns:
        Sorted list of recording ids (wav file name up to ``.wav``), or an
        empty list when fewer than 10 recordings of the sentence exist.
    """
    with open(corpusdb, 'r') as handle:
        rows = [line.split('\t') for line in handle.read().splitlines()[1:]]

    # Rows with fewer than five fields (for example a blank trailing line)
    # cannot match a sentence; skip them instead of raising IndexError.
    meta = [rec for rec in rows if len(rec) > 4 and rec[4] == sentence]

    if len(meta) < 10:
        if not meta:
            print('This sentence does not exist in the corpus')
        else:
            print('Under 10 copies of the sentence: skipping.')
        return []
    print(f'{len(meta)} recordings of sentence <{sentence}>')

    # exist_ok avoids the check-then-create race of a separate exists() test.
    os.makedirs(align_dir, exist_ok=True)
    os.makedirs(f0_dir, exist_ok=True)

    # The aligner is created lazily by align_file on first use and then
    # reused for the remaining recordings.
    word_aligner = None
    for rec in meta:
        wav_name = rec[2]
        wpath = f'{speech_dir}{wav_name}'

        apath = align_dir + wav_name.replace('.wav', '.tsv')
        if not os.path.exists(apath):
            word_aligner = align_file(wpath, apath, rec[4], word_aligner = word_aligner, model_path = align_model_path)

        fpath = f0_dir + wav_name.replace('.wav', '.f0')
        if not os.path.exists(fpath):
            f0_data = estimate_pitch(wpath, reaper_path)
            save_pitch(f0_data, fpath)

    return sorted(rec[2].split('.wav')[0] for rec in meta)
def get_tts(sentence,voices,ttsdir,align_model_path,reaper_path = "REAPER/build/reaper"):
    """Synthesise a sentence with each voice and prepare its features.

    The sentence gets a directory from ``setup_tts_sent``. For every voice
    the files ``<voice>.wav``, ``<voice>.tsv`` (alignment) and ``<voice>.f0``
    (pitch) are expected in that directory; each one is produced only when
    it does not exist yet.

    Args:
        sentence: Text to synthesise. It is normalised with ``snorm`` for
            alignment.
        voices: Non-empty sequence of voice names.
        ttsdir: Root directory handed to ``setup_tts_sent``.
        align_model_path: Model passed to ``align_file``.
        reaper_path: Passed to ``estimate_pitch``.

    Returns:
        ``(wav_path, sentence_dir)`` where ``wav_path`` is the wav path of
        the last voice in ``voices``.

    Raises:
        ValueError: If ``voices`` is empty.
    """
    # Checked before setup_tts_sent so an unusable request does not
    # register the sentence on disk and then crash on an unbound local.
    if not voices:
        raise ValueError('get_tts() requires at least one TTS voice')

    dpath = setup_tts_sent(sentence, ttsdir)

    # Created lazily by align_file on first use, then reused across voices.
    word_aligner = None
    for v in voices:
        wpath = f'{dpath}/{v}.wav'
        apath = f'{dpath}/{v}.tsv'
        fpath = f'{dpath}/{v}.f0'

        if not os.path.exists(wpath):
            # NOTE(review): tiro is presumably expected to write
            # {dpath}/{v}.wav -- confirm against scripts.tapi.
            tiro(sentence, v, save=f'{dpath}/')

        if not os.path.exists(apath):
            word_aligner = align_file(wpath, apath, snorm(sentence), word_aligner = word_aligner, model_path = align_model_path)

        if not os.path.exists(fpath):
            f0_data = estimate_pitch(wpath, reaper_path)
            save_pitch(f0_data, fpath)

    # wpath still refers to the last voice processed.
    return wpath, dpath
def setup_tts_sent(sentence,ttsdir,meta_path = 'tts_meta.tsv'):
    """Return the directory for a TTS sentence, creating it if needed.

    Sentences are registered in the metadata file ``{ttsdir}{meta_path}``,
    one ``<id>\\t<sentence>`` line each. A sentence that is already
    registered reuses its stored id; otherwise a new id is built from the
    first 33 characters of the sentence plus a random 6-character suffix
    and appended to the file.

    Args:
        sentence: Sentence text. Newlines, carriage returns and tabs are
            replaced by spaces so that it fits in one TSV field.
        ttsdir: Root directory prefix (joined by plain concatenation, so it
            is expected to end with a path separator).
        meta_path: Metadata file name, relative to ``ttsdir``.

    Returns:
        The path ``{ttsdir}{id}`` of the sentence directory.
    """
    os.makedirs(ttsdir, exist_ok=True)

    # A raw tab or line break inside the sentence would break the
    # one-line / two-field layout of the metadata file.
    for ch in ('\n', '\r', '\t'):
        sentence = sentence.replace(ch, ' ')

    # 'a+' creates the file when missing, lets us read it from the start
    # and still appends on write.
    with open(f'{ttsdir}{meta_path}', 'a+') as handle:
        handle.seek(0)
        tts_meta = {}
        for line in handle.read().splitlines():
            s_id, sep, sent = line.partition('\t')
            if sep:  # ignore blank or malformed lines instead of crashing
                tts_meta[sent] = s_id

        if sentence in tts_meta:
            # Previously stored ids are reused exactly as written.
            sent_id = tts_meta[sentence]
        else:
            # Keep only letters, digits, '_' and '-' so the id is always a
            # single path component; '/' or '..' in the sentence must not
            # be able to point outside ttsdir.
            stem = ''.join(
                c if (c.isalnum() or c in '_-') else '_' for c in sentence
            )[:33]
            alphabet = string.ascii_uppercase + string.digits
            taken = set(tts_meta.values())
            sent_id = f"{stem}_{''.join(random.choices(alphabet, k=6))}"
            while sent_id in taken:
                sent_id = f"{stem}_{''.join(random.choices(alphabet, k=6))}"
            handle.write(f'{sent_id}\t{sentence}\n')

    sent_dir = f'{ttsdir}{sent_id}'
    os.makedirs(sent_dir, exist_ok=True)
    return sent_dir
def precompute(corpusdb, speech_dir, align_dir, align_model_path, f0_dir, reaper_path, fromi=None,toi=None):
    """Create alignment and pitch files for a range of corpus recordings.

    Reads the tab-separated metadata file ``corpusdb`` (first line skipped)
    and, for each selected row, writes the alignment (``.tsv``) and pitch
    (``.f0``) files that do not exist yet.

    Args:
        corpusdb: Path of the metadata TSV. Field 2 is the wav file name,
            field 4 the text passed to ``align_file``.
        speech_dir: Prefix prepended to the wav file name.
        align_dir: Prefix for alignment files; created if missing.
        align_model_path: Model passed to ``align_file``.
        f0_dir: Prefix for pitch files; created if missing.
        reaper_path: Passed to ``estimate_pitch``.
        fromi: Start index into the metadata rows, or ``None`` for the
            beginning.
        toi: End index (exclusive), or ``None`` for the end.

    Returns:
        ``max(toi, number of selected rows)``; when ``toi`` is ``None``,
        the number of selected rows.
    """
    with open(corpusdb, 'r') as handle:
        meta = [line.split('\t') for line in handle.read().splitlines()[1:]]

    os.makedirs(align_dir, exist_ok=True)
    os.makedirs(f0_dir, exist_ok=True)

    # Slicing accepts None bounds, so slice unconditionally. A truthiness
    # test on the bounds would silently ignore the range when fromi is 0.
    meta = meta[fromi:toi]

    # Created lazily by align_file on first use, then reused.
    word_aligner = None
    for rec in meta:
        # Short rows (for example a blank line) carry no usable recording.
        # They are skipped here rather than filtered earlier so that
        # fromi/toi keep referring to raw row positions.
        if len(rec) < 5:
            continue
        wav_name = rec[2]
        wpath = f'{speech_dir}{wav_name}'

        apath = align_dir + wav_name.replace('.wav', '.tsv')
        if not os.path.exists(apath):
            word_aligner = align_file(wpath, apath, rec[4], word_aligner = word_aligner, model_path = align_model_path)

        fpath = f0_dir + wav_name.replace('.wav', '.f0')
        if not os.path.exists(fpath):
            f0_data = estimate_pitch(wpath, reaper_path)
            save_pitch(f0_data, fpath)

    # max() cannot compare None with an int, so the default toi needs its
    # own branch.
    if toi is None:
        return len(meta)
    return max(toi, len(meta))
def localtest():
    """Run the pipeline once with a fixed sentence and local paths.

    Performs the same sequence of calls as ``run`` (corpus lookup, TTS
    preparation, clustering) but with hard-coded developer paths, one voice
    and an explicit REAPER executable. Nothing is returned.
    """
    local_root = '/home/caitlinr/work/peval/pce/'
    human_root = f'{local_root}human_data/'

    # Fixed inputs for the run.
    sentence = 'En er hægt að taka orðalagið bókstaflega?'
    voices = ['Alfur_v2']
    start_end_word_ix = '1-3'

    # Data locations and tools on the local machine.
    corpus_meta = f'{human_root}SQL1adult10s_metadata.tsv'
    speech_dir = f'{human_root}audio/squeries/'
    speech_aligns = f'{human_root}align/squeries/'
    speech_f0 = f'{human_root}f0/squeries/'
    tts_dir = f'{local_root}tts_data/'
    align_model_path = "/home/caitlinr/work/models/LVL/wav2vec2-large-xlsr-53-icelandic-ep10-1000h"
    reaper_exc = '/home/caitlinr/work/notterra/REAPER/build/reaper'

    norm_sentence = snorm(sentence)
    human_rec_ids = get_samromur_queries(
        norm_sentence, corpus_meta, speech_dir, speech_aligns,
        align_model_path, speech_f0, reaper_path = reaper_exc)

    if voices:
        one_audio_sample, tts_sent_dir = get_tts(
            sentence, voices, tts_dir, align_model_path, reaper_path = reaper_exc)
        # Only the first voice goes on to the clustering step.
        voices = voices[:1]

    # The seven results are unpacked (checking the arity) but not used.
    score, tts_fig_p, mid_fig_p, bad_fig_p, tts_fig_e, fig_mid_e, fig_bad_e = cl.cluster(
        norm_sentence, sentence, human_rec_ids, speech_aligns, speech_f0,
        speech_dir, tts_sent_dir, voices, start_end_word_ix)