import gradio as gr
import subprocess
import os
import scripts.runSQ

#https://huggingface.co/spaces/clr/prosalign/blob/main/app.py


def setup():
    """Unpack and build REAPER (Google's pitch tracker) at startup."""
    r0 = subprocess.run(["pwd"], capture_output=True, text=True)
    print('PWD::', r0.stdout)
    # The REAPER source archive is expected to be bundled with the repo as
    # ./REAPER-master.zip; the original download step is kept for reference:
    #r1 = subprocess.run(["wget", "https://github.com/google/REAPER/archive/refs/heads/REAPER-master.zip", "-O", "./master.zip"], capture_output=True, text=True)
    #print(r1.stdout)
    r9x = subprocess.run(["ls", "-la"], capture_output=True, text=True)
    print('LS::', r9x.stdout)

    subprocess.run(["unzip", "./REAPER-master.zip"])
    subprocess.run(["rm", "./REAPER-master.zip"])
    subprocess.run(["mv", "REAPER-master", "REAPER"])

    # Standard out-of-source CMake build.
    os.chdir('./REAPER')
    subprocess.run(["mkdir", "build"])
    os.chdir('./build')
    r2 = subprocess.run(["cmake", ".."], capture_output=True, text=True)
    print(r2.stdout)
    r3 = subprocess.run(["make"], capture_output=True, text=True)
    print(r3.stdout)

    os.chdir('../..')
    r9 = subprocess.run(["ls", "-la"], capture_output=True, text=True)
    print('LS::', r9.stdout)

                        
print('about to setup')
setup()



def label_indices(sentence):
    sentence = scripts.runSQ.snorm(sentence)
    sentence = sentence.split(' ')
    labelled = [(f'{word} {i+1} ', str(i+1)) for i, word in enumerate(sentence)]
    return labelled
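
# For example, assuming scripts.runSQ.snorm only lowercases and strips
# punctuation, label_indices('Hvar er næsta sundlaug') would return
# [('hvar 1 ', '1'), ('er 2 ', '2'), ('næsta 3 ', '3'), ('sundlaug 4 ', '4')],
# i.e. (word + position, label) pairs for the HighlightedText selection key.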

    
# Gradio states don't like dicts, so store them as lists of pairs.
def d2l(d):
    return [(k, v) for k, v in d.items()]

def l2d(l):
    return {k: v for k, v in l}
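
# e.g. d2l({'a': 1, 'b': 2}) == [('a', 1), ('b', 2)], and l2d(d2l(d)) == d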

temp_sentences = scripts.runSQ.create_temp_sent_list()

with gr.Blocks() as bl:

    
    # TTS voices available through Tiro's API.
    voices = ['Dilja_v2', 'Alfur_v2', 'Dilja', 'Alfur', 'Bjartur', 'Rosa']

    with gr.Tabs():

            
        with gr.TabItem("Options"):
            temp_sentmenu = gr.Dropdown(temp_sentences, label="Sentence")
            # One colour for every numeric label; 333 is just a generous upper bound on sentence length.
            marked_sentence = gr.HighlightedText(interactive=False, label="Word selection key", color_map={str(i): "#dcfce7" for i in range(333)})

            with gr.Row():
                spanselect = gr.Textbox(value='1-3', label="Select words", info='Enter the index of the word(s) to analyse, according to the key above: either a single word (e.g. 4) or a span of words separated by a dash (e.g. 2-3).')
                voiceselect = gr.CheckboxGroup(voices, label="TTS voice", value=['Dilja_v2', 'Alfur_v2'])

                temp_button = gr.Button(value="Run with selected options")
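
            # A minimal sketch of how such a span could be parsed (illustration
            # only; the real parsing happens in scripts.runSQ):
            #   lo, _, hi = spantext.partition('-')
            #   word_numbers = range(int(lo), int(hi or lo) + 1)  # 1-based, inclusive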

        with gr.TabItem("About"):
            docu = gr.Markdown("""
    # Multi-target prosody evaluation
    ### 1. Choose a sentence - they are from Samrómur Queries
    ### 2. The words will be numbered by position - type the number or range you want to evaluate
    ### 3. Choose a TTS voice - they come from Tiro's API https://tiro.is/talgerving
    ### 4. Run

    The evaluation automatically clusters human speakers according to prosodic features, 
    and then measures how different the synthesised speech is from each natural cluster. 
    Clustering and TTS scoring use only the selected word(s) from Step 2, not the whole sentence. 
    Close match to one cluster shows what prosodic act TTS might have achieved, in the selected words. 
    TTS whose prosody does not match any cluster might sound unnatural. 
    
    TTS output includes generated audio, pitch, energy, and scores for each cluster. 
    Output is only shown for the selected voice(s). 
    Below, human data shows pitch and energy of each cluster, along with original audio. 
    
    TTS often takes over 30 seconds per sentence/voice. 
    After you have done it once, re-running different word spans for the same sentence/voice is much faster.

    See "Automatic assessment of prosody in high-stakes English tests" (Jian Cheng, ISCA 2011)
    regarding multi-target prosody scoring. This version extends the implementation by Magnús Freyr Morthens 
    supported by Rannís student innovation fund.
    """)

    # One output tab per TTS voice: audio, pitch and energy plots, and a score area.
    ttstabs = {v: {} for v in voices}
    with gr.Tabs():
        for v in voices:
            with gr.TabItem(v):
                ttstabs[v]['tts_output'] = gr.Audio(interactive=False)
                with gr.Row():
                    ttstabs[v]['ptts'] = gr.Plot()
                    ttstabs[v]['etts'] = gr.Plot()
                ttstabs[v]['scorearea'] = gr.Markdown(f'TTS results for **{v}** will appear here')

    # ttstabs holds component handles, which can't be stored in gradio state,
    # so the handler is defined here where it can close over them.
    def f1(voices, sent, indices):
        f0_fig_0, f0_fig_1, f0_fig_2, en_fig_0, en_fig_1, en_fig_2, audio_html, tts_results = scripts.runSQ.run(sent, voices, indices)
        outputs = {pc0: f0_fig_0, pc1: f0_fig_1, pc2: f0_fig_2, ec0: en_fig_0, ec1: en_fig_1, ec2: en_fig_2, play: audio_html}

        # Fill the tabs of the voices that were run ...
        for v in voices:
            outputs[ttstabs[v]['tts_output']] = tts_results[v]['audio']
            outputs[ttstabs[v]['ptts']] = tts_results[v]['f0_fig_tts']
            outputs[ttstabs[v]['etts']] = tts_results[v]['en_fig_tts']
            outputs[ttstabs[v]['scorearea']] = tts_results[v]['scoreinfo']

        # ... and reset the tabs of the voices that were not.
        clear = [v for v in ttstabs.keys() if v not in voices]
        for v in clear:
            outputs[ttstabs[v]['tts_output']] = None
            outputs[ttstabs[v]['ptts']] = None
            outputs[ttstabs[v]['etts']] = None
            outputs[ttstabs[v]['scorearea']] = f'TTS results for **{v}** will appear here'

        return outputs

    
    # Human-speech results: pitch and energy plots for each cluster, plus the original audio.
    with gr.Tabs():
        with gr.TabItem("Pitch"):
            pc0 = gr.Plot()
            pc1 = gr.Plot()
            pc2 = gr.Plot()

        with gr.TabItem("Energy"):
            ec0 = gr.Plot()
            ec1 = gr.Plot()
            ec2 = gr.Plot()

        with gr.TabItem("Audio"):
            play = gr.HTML(label="Audio samples")



    temp_sentmenu.input(label_indices, temp_sentmenu, marked_sentence)

    outputs_list = [pc0, pc1, pc2, ec0, ec1, ec2, play]
    for v in voices:
        outputs_list += [ttstabs[v]['tts_output'], ttstabs[v]['ptts'], ttstabs[v]['etts'], ttstabs[v]['scorearea']]
    temp_button.click(f1, [voiceselect, temp_sentmenu, spanselect], outputs_list)
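    # f1 returns a {component: value} dict; Gradio routes each value to the
    # matching component, provided that component appears in outputs_list.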
    
    
if __name__ == "__main__":
    bl.launch()