import gradio as gr
import subprocess
import os
import scripts.runSQ
#https://huggingface.co/spaces/clr/prosalign/blob/main/app.py
def setup():
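    """Build Google's REAPER pitch tracker at startup: unzip the bundled
    REAPER-master.zip, then configure and compile it with cmake and make
    (the zip is assumed to be in the repo already, per the commented-out
    wget line below)."""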
    r0 = subprocess.run(["pwd"], capture_output=True, text=True)
    print('PWD::', r0.stdout)
    #r1 = subprocess.run(["wget", "https://github.com/google/REAPER/archive/refs/heads/REAPER-master.zip", "-O", "./master.zip"], capture_output=True, text=True)
    #print(r1.stdout)
    r9x = subprocess.run(["ls", "-la"], capture_output=True, text=True)
    print('LS::', r9x.stdout)
    subprocess.run(["unzip", "./REAPER-master.zip"])
    subprocess.run(["rm", "./REAPER-master.zip"])
    subprocess.run(["mv", "REAPER-master", "REAPER"])
    os.chdir('./REAPER')
    subprocess.run(["mkdir", "build"])
    os.chdir('./build')
    r2 = subprocess.run(["cmake", ".."], capture_output=True, text=True)
    print(r2.stdout)
    r3 = subprocess.run(["make"], capture_output=True, text=True)
    print(r3.stdout)
    os.chdir('../..')
    r9 = subprocess.run(["ls", "-la"], capture_output=True, text=True)
    print('LS::', r9.stdout)

print('about to setup')
setup()
def label_indices(sentence):
    sentence = scripts.runSQ.snorm(sentence)
    sentence = sentence.split(' ')
    labelled = [(f'{word} {i+1} ', str(i+1)) for i, word in enumerate(sentence)]
    return labelled
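
# Example output, assuming scripts.runSQ.snorm lowercases and strips
# punctuation (an assumption about that helper):
#   label_indices("Hvað er klukkan?")
#   -> [('hvað 1 ', '1'), ('er 2 ', '2'), ('klukkan 3 ', '3')]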

# Gradio State doesn't like dicts, so round-trip them as lists of pairs.
def d2l(d):
    return [(k, v) for k, v in d.items()]

def l2d(l):
    return {k: v for k, v in l}
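
# Round-trip example: l2d(d2l({'Alfur': 1})) == {'Alfur': 1}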
temp_sentences = scripts.runSQ.create_temp_sent_list()
bl = gr.Blocks()
with bl:
    voices = ['Dilja_v2', 'Alfur_v2', 'Dilja', 'Alfur', 'Bjartur', 'Rosa']

    with gr.Tabs():
        with gr.TabItem("Options"):
            temp_sentmenu = gr.Dropdown(temp_sentences, label="Sentence")
            marked_sentence = gr.HighlightedText(interactive=False, label="Word selection key", color_map={str(i): "#dcfce7" for i in range(333)})
            with gr.Row():
                spanselect = gr.Textbox(value='1-3', label="Select words", info='Enter the index of the word(s) to analyse, according to the key above: either a single word (e.g. 4) or a span of words separated by a dash (e.g. 2-3).')
                #voiceselect = gr.Radio(voices, label="TTS voice", value='Alfur_v2')
                voiceselect = gr.CheckboxGroup(voices, label="TTS voice", value=['Dilja_v2', 'Alfur_v2'])
                #with gr.Column(scale=1):
                temp_button = gr.Button(value="Run with selected options")
        with gr.TabItem("About"):
            docu = gr.Markdown("""
# Multi-target prosody evaluation

### 1. Choose a sentence - they come from Samrómur Queries
### 2. The words are numbered by position - type the number or range you want to evaluate
### 3. Choose a TTS voice - they come from Tiro's API https://tiro.is/talgerving
### 4. Run

The evaluation automatically clusters human speakers according to prosodic features,
and then measures how different the synthesised speech is from each natural cluster.
Clustering and TTS scoring use only the selected word(s) from Step 2, not the whole sentence.

A close match to one cluster shows which prosodic act the TTS may have achieved in the selected words.
TTS whose prosody matches no cluster may sound unnatural.

TTS output includes the generated audio, pitch, energy, and a score against each cluster.
Output is shown only for the selected voice(s).
Below that, the human data shows the pitch and energy of each cluster, along with the original audio.

TTS often takes over 30 seconds per sentence/voice.
Once a sentence/voice has been run, re-running it with different word spans is much faster.

See "Automatic assessment of prosody in high-stakes English tests" (Jian Cheng, ISCA 2011)
regarding multi-target prosody scoring. This version extends an implementation by Magnús Freyr Morthens,
supported by the Rannís student innovation fund.
""")
    ttstabs = {v: {} for v in voices}
    with gr.Tabs():
        for v in voices:
            with gr.TabItem(v):
                ttstabs[v]['tts_output'] = gr.Audio(interactive=False)
                with gr.Row():
                    ttstabs[v]['ptts'] = gr.Plot()
                    ttstabs[v]['etts'] = gr.Plot()
                ttstabs[v]['scorearea'] = gr.Markdown(f'TTS results for **{v}** will appear here')

    #tts_output = gr.Audio(interactive=False)
    #with gr.Row():
    #    ptts = gr.Plot()
    #    etts = gr.Plot()
    #report_score = gr.Markdown('Difference from TTS to real speech:')
    # Can't store ttstabs in Gradio State, so f1 uses it from the enclosing scope.
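    # f1 returns a dict keyed by output components rather than a tuple; Gradio
    # accepts this as long as every keyed component is in the registered outputs list.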
    def f1(voices, sent, indices):
        #tts_audio, tts_score, f0_fig_0, f0_fig_1, f0_fig_2, en_fig_0, en_fig_1, en_fig_2, audio_html, f0_fig_tts, en_fig_tts = scripts.runSQ.run(sent, [voices], indices)
        f0_fig_0, f0_fig_1, f0_fig_2, en_fig_0, en_fig_1, en_fig_2, audio_html, tts_results = scripts.runSQ.run(sent, voices, indices)
        outputs = {pc0: f0_fig_0, pc1: f0_fig_1, pc2: f0_fig_2, ec0: en_fig_0, ec1: en_fig_1, ec2: en_fig_2, play: audio_html}
        # Fill the panels of the voices that were run ...
        for v in voices:
            outputs[ttstabs[v]['tts_output']] = tts_results[v]['audio']
            outputs[ttstabs[v]['ptts']] = tts_results[v]['f0_fig_tts']
            outputs[ttstabs[v]['etts']] = tts_results[v]['en_fig_tts']
            outputs[ttstabs[v]['scorearea']] = tts_results[v]['scoreinfo']
        # ... and blank out the panels of voices that were not selected.
        clear = [v for v in ttstabs.keys() if v not in voices]
        for v in clear:
            outputs[ttstabs[v]['tts_output']] = None
            outputs[ttstabs[v]['ptts']] = None
            outputs[ttstabs[v]['etts']] = None
            outputs[ttstabs[v]['scorearea']] = f'TTS results for **{v}** will appear here'
        return outputs
    with gr.Tabs():
        with gr.TabItem("Pitch"):
            pc0 = gr.Plot()
            pc1 = gr.Plot()
            pc2 = gr.Plot()
        with gr.TabItem("Energy"):
            ec0 = gr.Plot()
            ec1 = gr.Plot()
            ec2 = gr.Plot()
        with gr.TabItem("Audio"):
            play = gr.HTML(label="Audio samples")
    temp_sentmenu.input(label_indices, temp_sentmenu, marked_sentence)

    outputs_list = [pc0, pc1, pc2, ec0, ec1, ec2, play]
    for v in voices:
        outputs_list += [ttstabs[v]['tts_output'], ttstabs[v]['ptts'], ttstabs[v]['etts'], ttstabs[v]['scorearea']]

    temp_button.click(f1, [voiceselect, temp_sentmenu, spanselect], outputs_list)
if __name__ == "__main__":
    bl.launch()
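# bl.launch() serves the app locally (Gradio's default is http://127.0.0.1:7860);
# on Hugging Face Spaces the hosting is handled by the Spaces runtime.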