import gradio as gr |
import subprocess, os |
import scripts.runSQ |
def setup(): |
r0 = subprocess.run(["pwd"], capture_output=True, text=True) |
print('PWD::', r0.stdout) |
r9x = subprocess.run(["ls", "-la"], capture_output=True, text=True) |
print('LS::', r9x.stdout) |
subprocess.run(["unzip", "./REAPER-master.zip"]) |
subprocess.run(["rm", "./REAPER-master.zip"]) |
subprocess.run(["mv", "REAPER-master", "REAPER"]) |
os.chdir('./REAPER') |
subprocess.run(["mkdir", "build"]) |
os.chdir('./build') |
r2 = subprocess.run(["cmake", ".."], capture_output=True, text=True) |
print(r2.stdout) |
r3 = subprocess.run(["make"], capture_output=True, text=True) |
print(r3.stdout) |
os.chdir('../..') |
r9 = subprocess.run(["ls", "-la"], capture_output=True, text=True) |
print('LS::', r9.stdout) |
# Import-time side effect: unpack and build the REAPER sources before the
# rest of this module (the UI definition) is executed.
print("about to setup")
setup()
def label_indices(sentence):
    """Number the words of a sentence for the word-selection key.

    The sentence is first passed through ``scripts.runSQ.snorm`` and then
    split on single spaces. Each word becomes a ``(text, label)`` tuple in
    which ``text`` is the word followed by its 1-based position and ``label``
    is that position as a string.
    """
    words = scripts.runSQ.snorm(sentence).split(' ')
    labelled = []
    for position, word in enumerate(words, start=1):
        labelled.append((f'{word} {position} ', str(position)))
    return labelled
def d2l(d):
    """Return the dict's entries as a list of ``(key, value)`` tuples."""
    return list(d.items())
def l2d(l):
    """Build a dict from an iterable of ``(key, value)`` pairs.

    When a key occurs more than once, the last value wins.
    """
    mapping = {}
    for key, value in l:
        mapping[key] = value
    return mapping
# Choices for the "Sentence" dropdown, supplied by scripts.runSQ.
temp_sentences = scripts.runSQ.create_temp_sent_list()

bl = gr.Blocks()

# NOTE(review): this file's indentation was lost, so the nesting below is
# reconstructed from what the statements do. Two placements are guesses and
# should be confirmed against the deployed layout: `temp_button` inside the
# Row, and `scorearea` below (not inside) the plot Row.
with bl:
    voices = ['Dilja_v2', 'Alfur_v2', 'Dilja', 'Alfur', 'Bjartur', 'Rosa']

    # --- Inputs and help text ---------------------------------------------
    with gr.Tabs():
        with gr.TabItem("Options"):
            temp_sentmenu = gr.Dropdown(temp_sentences, label="Sentence")
            marked_sentence = gr.HighlightedText(
                interactive=False,
                label="Word selection key",
                # Same highlight colour for every label "0".."332".
                color_map=dict.fromkeys((str(i) for i in range(333)), "#dcfce7"),
            )
            with gr.Row():
                spanselect = gr.Textbox(
                    value='1-3',
                    label="Select words",
                    info='Enter the index of the word(s) to analyse, according to the key above. It can be a single word: 4 or a span of words separated by a dash: 2-3',
                )
                voiceselect = gr.CheckboxGroup(
                    voices,
                    label="TTS voice",
                    value=['Dilja_v2', 'Alfur_v2'],
                )
                temp_button = gr.Button(value="Run with selected options")
        with gr.TabItem("About"):
            docu = gr.Markdown("""
# Multi-target prosody evaluation
### 1. Choose a sentence - they are from Samrómur Queries
### 2. The words will be numbered by position - type the number or range you want to evaluate
### 3. Choose a TTS voice - they come from Tiro's API https://tiro.is/talgerving
### 4. Run
The evaluation automatically clusters human speakers according to prosodic features,
and then measures how different the synthesised speech is from each natural cluster.
Clustering and TTS scoring use only the selected word(s) from Step 2, not the whole sentence.
Close match to one cluster shows what prosodic act TTS might have achieved, in the selected words.
TTS whose prosody does not match any cluster might sound unnatural.
TTS output includes generated audio, pitch, energy, and scores for each cluster.
Output is only shown for the selected voice(s).
Below, human data shows pitch and energy of each cluster, along with original audio.
TTS often takes over 30 seconds per sentence/voice.
After you have done it once, re-running different word spans for the same sentence/voice is much faster.
See "Automatic assessment of prosody in high-stakes English tests" (Jian Cheng, ISCA 2011)
regarding multi-target prosody scoring. This version extends the implementation by Magnús Freyr Morthens
supported by Rannís student innovation fund.
""")

    # --- One result tab per TTS voice --------------------------------------
    # ttstabs[voice] maps a role name to the component showing it.
    ttstabs = {v: {} for v in voices}
    with gr.Tabs():
        for v in voices:
            with gr.TabItem(v):
                ttstabs[v]['tts_output'] = gr.Audio(interactive=False)
                with gr.Row():
                    ttstabs[v]['ptts'] = gr.Plot()
                    ttstabs[v]['etts'] = gr.Plot()
                ttstabs[v]['scorearea'] = gr.Markdown(f'TTS results for **{v}** will appear here')

    def f1(voices, sent, indices):
        """Run the evaluation and map its results onto the output components.

        ``voices`` holds the voices ticked in ``voiceselect`` (it shadows the
        module-level list of all voices), ``sent`` is the dropdown value and
        ``indices`` the word-span text. Returns a dict keyed by component:
        the cluster plots and audio HTML, the results for each selected
        voice, and reset values for every voice that was not selected.
        """
        (pitch_fig_0, pitch_fig_1, pitch_fig_2,
         energy_fig_0, energy_fig_1, energy_fig_2,
         audio_html, tts_results) = scripts.runSQ.run(sent, voices, indices)

        # pc0..ec2 and play are created further down in this `with` block;
        # the names are only looked up when the handler is called.
        outputs = {
            pc0: pitch_fig_0,
            pc1: pitch_fig_1,
            pc2: pitch_fig_2,
            ec0: energy_fig_0,
            ec1: energy_fig_1,
            ec2: energy_fig_2,
            play: audio_html,
        }

        for v in voices:
            result = tts_results[v]
            tab = ttstabs[v]
            outputs.update({
                tab['tts_output']: result['audio'],
                tab['ptts']: result['f0_fig_tts'],
                tab['etts']: result['en_fig_tts'],
                tab['scorearea']: result['scoreinfo'],
            })

        # Blank the tabs of voices that were not part of this run.
        for v, tab in ttstabs.items():
            if v in voices:
                continue
            outputs[tab['tts_output']] = None
            outputs[tab['ptts']] = None
            outputs[tab['etts']] = None
            outputs[tab['scorearea']] = f'TTS results for **{v}** will appear here'

        return outputs

    # --- Human cluster results ----------------------------------------------
    with gr.Tabs():
        with gr.TabItem("Pitch"):
            pc0 = gr.Plot()
            pc1 = gr.Plot()
            pc2 = gr.Plot()
        with gr.TabItem("Energy"):
            ec0 = gr.Plot()
            ec1 = gr.Plot()
            ec2 = gr.Plot()
        with gr.TabItem("Audio"):
            play = gr.HTML(label="Audio samples")

    # --- Event wiring ---------------------------------------------------------
    # Picking a sentence refreshes the numbered word key.
    temp_sentmenu.input(label_indices, temp_sentmenu, marked_sentence)

    # Every component f1 may write to: cluster outputs first, then the four
    # components of each voice tab.
    outputs_list = [pc0, pc1, pc2, ec0, ec1, ec2, play]
    for v in voices:
        outputs_list.extend(
            ttstabs[v][role] for role in ('tts_output', 'ptts', 'etts', 'scorearea')
        )
    temp_button.click(f1, [voiceselect, temp_sentmenu, spanselect], outputs_list)


if __name__ == "__main__":
    bl.launch()