import gradio as gr
import subprocess, os
import scripts.runSQ
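
# Gradio demo: multi-target prosody evaluation for Icelandic TTS.
# setup() below unpacks and builds REAPER (presumably google/REAPER, the
# pitch tracker, judging by the cmake build and the pitch plots) before the
# UI starts; the clustering and scoring live in scripts.runSQ.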

def setup():
    # Log the working directory and its contents before building.
    r0 = subprocess.run(["pwd"], capture_output=True, text=True)
    print('PWD::', r0.stdout)
    r9x = subprocess.run(["ls", "-la"], capture_output=True, text=True)
    print('LS::', r9x.stdout)

    # Unpack the bundled REAPER source and normalise the directory name.
    subprocess.run(["unzip", "./REAPER-master.zip"])
    subprocess.run(["rm", "./REAPER-master.zip"])
    subprocess.run(["mv", "REAPER-master", "REAPER"])

    # Out-of-source cmake build, then return to the app root.
    os.chdir('./REAPER')
    subprocess.run(["mkdir", "build"])
    os.chdir('./build')
    r2 = subprocess.run(["cmake", ".."], capture_output=True, text=True)
    print(r2.stdout)
    r3 = subprocess.run(["make"], capture_output=True, text=True)
    print(r3.stdout)
    os.chdir('../..')

    r9 = subprocess.run(["ls", "-la"], capture_output=True, text=True)
    print('LS::', r9.stdout)


print('about to setup')
setup()
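
# A minimal sketch (defined but never called) of the same archive handling
# using only the standard library instead of shelling out; zipfile/shutil
# here are an illustration, not what the app runs, and the cmake/make build
# would still need subprocess.
def _setup_stdlib_sketch():
    import zipfile, shutil
    with zipfile.ZipFile('./REAPER-master.zip') as zf:
        zf.extractall('.')                  # unpack next to app.py
    os.remove('./REAPER-master.zip')        # discard the archive
    shutil.move('REAPER-master', 'REAPER')  # the name setup() expects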


def label_indices(sentence):
    # Normalise the sentence the same way the analysis pipeline does, then
    # pair each word with its 1-based position for the HighlightedText key.
    sentence = scripts.runSQ.snorm(sentence)
    sentence = sentence.split(' ')
    labelled = [(f'{word} {i+1} ', str(i+1)) for i, word in enumerate(sentence)]
    return labelled


# Dict/list conversion helpers (currently unused in this file).
def d2l(d):
    return [(k, v) for k, v in d.items()]


def l2d(l):
    return {k: v for k, v in l}


temp_sentences = scripts.runSQ.create_temp_sent_list()
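
# Illustration only (hypothetical values): assuming snorm lower-cases and
# strips punctuation, label_indices('Hvar er bankinn?') would return
#   [('hvar 1 ', '1'), ('er 2 ', '2'), ('bankinn 3 ', '3')]
# i.e. the (token, label) pairs that gr.HighlightedText renders as the
# "Word selection key".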

bl = gr.Blocks()

with bl:

    voices = ['Dilja_v2', 'Alfur_v2', 'Dilja', 'Alfur', 'Bjartur', 'Rosa']

    with gr.Tabs():

        with gr.TabItem("Options"):
            temp_sentmenu = gr.Dropdown(temp_sentences, label="Sentence")
            # Same highlight colour for every index label; 333 comfortably
            # exceeds the longest sentence in the set.
            marked_sentence = gr.HighlightedText(
                interactive=False,
                label="Word selection key",
                color_map={str(i): "#dcfce7" for i in range(333)},
            )

            with gr.Row():
                spanselect = gr.Textbox(
                    value='1-3',
                    label="Select words",
                    info='Enter the index of the word(s) to analyse, according to the key above. It can be a single word (4) or a span of words separated by a dash (2-3).',
                )
                voiceselect = gr.CheckboxGroup(voices, label="TTS voice", value=['Dilja_v2', 'Alfur_v2'])

            temp_button = gr.Button(value="Run with selected options")

        with gr.TabItem("About"):
            docu = gr.Markdown("""
# Multi-target prosody evaluation

### 1. Choose a sentence; they come from Samrómur Queries
### 2. The words are numbered by position; type the number or range you want to evaluate
### 3. Choose a TTS voice; the voices are served by Tiro's API, https://tiro.is/talgerving
### 4. Run

The evaluation automatically clusters human speakers according to prosodic features,
and then measures how different the synthesised speech is from each natural cluster.
Clustering and TTS scoring use only the selected word(s) from Step 2, not the whole sentence.
A close match to one cluster indicates which prosodic act the TTS may have achieved in the selected words,
while TTS whose prosody matches no cluster is likely to sound unnatural.

TTS output includes the generated audio, pitch, energy, and scores for each cluster.
Output is shown only for the selected voice(s).
Below, the human data shows the pitch and energy of each cluster, along with the original audio.

TTS often takes over 30 seconds per sentence and voice.
After a sentence/voice pair has been synthesised once, re-running it with different word spans is much faster.

See "Automatic assessment of prosody in high-stakes English tests" (Jian Cheng, ISCA 2011)
on multi-target prosody scoring. This version extends an implementation by Magnús Freyr Morthens,
supported by the Rannís student innovation fund.
""")

    # One results tab per voice: audio, pitch/energy plots, and a markdown
    # area for the per-cluster scores.
    ttstabs = {v: {} for v in voices}
    with gr.Tabs():
        for v in voices:
            with gr.TabItem(v):
                ttstabs[v]['tts_output'] = gr.Audio(interactive=False)
                with gr.Row():
                    ttstabs[v]['ptts'] = gr.Plot()
                    ttstabs[v]['etts'] = gr.Plot()
                ttstabs[v]['scorearea'] = gr.Markdown(f'TTS results for **{v}** will appear here')

    def f1(voices, sent, indices):
        # Run the full analysis: cluster figures for the natural speech,
        # plus per-voice TTS audio, figures, and scores.
        f0_fig_0, f0_fig_1, f0_fig_2, en_fig_0, en_fig_1, en_fig_2, audio_html, tts_results = scripts.runSQ.run(sent, voices, indices)
        outputs = {pc0: f0_fig_0, pc1: f0_fig_1, pc2: f0_fig_2,
                   ec0: en_fig_0, ec1: en_fig_1, ec2: en_fig_2,
                   play: audio_html}

        # Fill in the tabs of the requested voices...
        for v in voices:
            outputs[ttstabs[v]['tts_output']] = tts_results[v]['audio']
            outputs[ttstabs[v]['ptts']] = tts_results[v]['f0_fig_tts']
            outputs[ttstabs[v]['etts']] = tts_results[v]['en_fig_tts']
            outputs[ttstabs[v]['scorearea']] = tts_results[v]['scoreinfo']

        # ...and reset the tabs of any voice not selected on this run.
        clear = [v for v in ttstabs.keys() if v not in voices]
        for v in clear:
            outputs[ttstabs[v]['tts_output']] = None
            outputs[ttstabs[v]['ptts']] = None
            outputs[ttstabs[v]['etts']] = None
            outputs[ttstabs[v]['scorearea']] = f'TTS results for **{v}** will appear here'

        return outputs
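
    # Note: f1 returns a {component: value} dict rather than a positional
    # tuple. Gradio accepts this as long as every key is included in the
    # event's outputs list, which is what allows unselected voices to be
    # reset to None without tracking argument order.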

    # Natural-speech results: one figure per cluster under the Pitch and
    # Energy tabs, and the original recordings under Audio.
    with gr.Tabs():

        with gr.TabItem("Pitch"):
            pc0 = gr.Plot()
            pc1 = gr.Plot()
            pc2 = gr.Plot()

        with gr.TabItem("Energy"):
            ec0 = gr.Plot()
            ec1 = gr.Plot()
            ec2 = gr.Plot()

        with gr.TabItem("Audio"):
            play = gr.HTML(label="Audio samples")

    # Wire up the events: picking a sentence refreshes the word key, and
    # the run button feeds the selections to f1, whose output dict may
    # update any of the components listed here.
    temp_sentmenu.input(label_indices, temp_sentmenu, marked_sentence)

    outputs_list = [pc0, pc1, pc2, ec0, ec1, ec2, play]
    for v in voices:
        outputs_list += [ttstabs[v]['tts_output'], ttstabs[v]['ptts'], ttstabs[v]['etts'], ttstabs[v]['scorearea']]
    temp_button.click(f1, [voiceselect, temp_sentmenu, spanselect], outputs_list)


if __name__ == "__main__":
    bl.launch()