import gradio as gr
import subprocess
import os
import scripts.runSQ
#https://huggingface.co/spaces/clr/prosalign/blob/main/app.py
def setup():
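    """Build Google's REAPER pitch tracker at startup: unzip the bundled
    REAPER-master.zip, then configure and compile it with cmake and make
    (the zip is assumed to be in the repo already, per the commented-out
    wget line below)."""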
    r0 = subprocess.run(["pwd"], capture_output=True, text=True)
    print('PWD::', r0.stdout)
    #r1 = subprocess.run(["wget", "https://github.com/google/REAPER/archive/refs/heads/REAPER-master.zip", "-O", "./master.zip"], capture_output=True, text=True)
    #print(r1.stdout)
    r9x = subprocess.run(["ls", "-la"], capture_output=True, text=True)
    print('LS::', r9x.stdout)
    subprocess.run(["unzip", "./REAPER-master.zip"])
    subprocess.run(["rm", "./REAPER-master.zip"])
    subprocess.run(["mv", "REAPER-master", "REAPER"])
    os.chdir('./REAPER')
    subprocess.run(["mkdir", "build"])
    os.chdir('./build')
    r2 = subprocess.run(["cmake", ".."], capture_output=True, text=True)
    print(r2.stdout)
    r3 = subprocess.run(["make"], capture_output=True, text=True)
    print(r3.stdout)
    os.chdir('../..')
    r9 = subprocess.run(["ls", "-la"], capture_output=True, text=True)
    print('LS::', r9.stdout)

print('about to setup')
setup()
def label_indices(sentence):
    sentence = scripts.runSQ.snorm(sentence)
    sentence = sentence.split(' ')
    labelled = [(f'{word} {i+1} ', str(i+1)) for i, word in enumerate(sentence)]
    return labelled
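
# Example output, assuming scripts.runSQ.snorm lowercases and strips
# punctuation (an assumption about that helper):
#   label_indices("Hvað er klukkan?")
#   -> [('hvað 1 ', '1'), ('er 2 ', '2'), ('klukkan 3 ', '3')]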

# Gradio State doesn't like dicts, so round-trip them as lists of pairs.
def d2l(d):
    return [(k, v) for k, v in d.items()]

def l2d(l):
    return {k: v for k, v in l}
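
# Round-trip example: l2d(d2l({'Alfur': 1})) == {'Alfur': 1}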
temp_sentences = scripts.runSQ.create_temp_sent_list()
bl = gr.Blocks()
with bl:
    voices = ['Dilja_v2', 'Alfur_v2', 'Dilja', 'Alfur', 'Bjartur', 'Rosa']

    with gr.Tabs():
        with gr.TabItem("Options"):
            temp_sentmenu = gr.Dropdown(temp_sentences, label="Sentence")
            marked_sentence = gr.HighlightedText(interactive=False, label="Word selection key", color_map={str(i): "#dcfce7" for i in range(333)})
            with gr.Row():
                spanselect = gr.Textbox(value='1-3', label="Select words", info='Enter the index of the word(s) to analyse, according to the key above: either a single word (e.g. 4) or a span of words separated by a dash (e.g. 2-3).')
                #voiceselect = gr.Radio(voices, label="TTS voice", value='Alfur_v2')
                voiceselect = gr.CheckboxGroup(voices, label="TTS voice", value=['Dilja_v2', 'Alfur_v2'])
                #with gr.Column(scale=1):
                temp_button = gr.Button(value="Run with selected options")
        with gr.TabItem("About"):
            docu = gr.Markdown("""
# Multi-target prosody evaluation

### 1. Choose a sentence - they come from Samrómur Queries
### 2. The words are numbered by position - type the number or range you want to evaluate
### 3. Choose a TTS voice - they come from Tiro's API https://tiro.is/talgerving
### 4. Run

The evaluation automatically clusters human speakers according to prosodic features,
and then measures how different the synthesised speech is from each natural cluster.
Clustering and TTS scoring use only the selected word(s) from Step 2, not the whole sentence.

A close match to one cluster shows which prosodic act the TTS may have achieved in the selected words.
TTS whose prosody matches no cluster may sound unnatural.

TTS output includes the generated audio, pitch, energy, and a score against each cluster.
Output is shown only for the selected voice(s).
Below that, the human data shows the pitch and energy of each cluster, along with the original audio.

TTS often takes over 30 seconds per sentence/voice.
Once a sentence/voice has been run, re-running it with different word spans is much faster.

See "Automatic assessment of prosody in high-stakes English tests" (Jian Cheng, ISCA 2011)
regarding multi-target prosody scoring. This version extends an implementation by Magnús Freyr Morthens,
supported by the Rannís student innovation fund.
""")
    ttstabs = {v: {} for v in voices}
    with gr.Tabs():
        for v in voices:
            with gr.TabItem(v):
                ttstabs[v]['tts_output'] = gr.Audio(interactive=False)
                with gr.Row():
                    ttstabs[v]['ptts'] = gr.Plot()
                    ttstabs[v]['etts'] = gr.Plot()
                ttstabs[v]['scorearea'] = gr.Markdown(f'TTS results for **{v}** will appear here')

    #tts_output = gr.Audio(interactive=False)
    #with gr.Row():
    #    ptts = gr.Plot()
    #    etts = gr.Plot()
    #report_score = gr.Markdown('Difference from TTS to real speech:')
    # Can't store ttstabs in Gradio State, so f1 uses it from the enclosing scope.
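    # f1 returns a dict keyed by output components rather than a tuple; Gradio
    # accepts this as long as every keyed component is in the registered outputs list.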
    def f1(voices, sent, indices):
        #tts_audio, tts_score, f0_fig_0, f0_fig_1, f0_fig_2, en_fig_0, en_fig_1, en_fig_2, audio_html, f0_fig_tts, en_fig_tts = scripts.runSQ.run(sent, [voices], indices)
        f0_fig_0, f0_fig_1, f0_fig_2, en_fig_0, en_fig_1, en_fig_2, audio_html, tts_results = scripts.runSQ.run(sent, voices, indices)
        outputs = {pc0: f0_fig_0, pc1: f0_fig_1, pc2: f0_fig_2, ec0: en_fig_0, ec1: en_fig_1, ec2: en_fig_2, play: audio_html}
        # Fill the panels of the voices that were run ...
        for v in voices:
            outputs[ttstabs[v]['tts_output']] = tts_results[v]['audio']
            outputs[ttstabs[v]['ptts']] = tts_results[v]['f0_fig_tts']
            outputs[ttstabs[v]['etts']] = tts_results[v]['en_fig_tts']
            outputs[ttstabs[v]['scorearea']] = tts_results[v]['scoreinfo']
        # ... and blank out the panels of voices that were not selected.
        clear = [v for v in ttstabs.keys() if v not in voices]
        for v in clear:
            outputs[ttstabs[v]['tts_output']] = None
            outputs[ttstabs[v]['ptts']] = None
            outputs[ttstabs[v]['etts']] = None
            outputs[ttstabs[v]['scorearea']] = f'TTS results for **{v}** will appear here'
        return outputs
    with gr.Tabs():
        with gr.TabItem("Pitch"):
            pc0 = gr.Plot()
            pc1 = gr.Plot()
            pc2 = gr.Plot()
        with gr.TabItem("Energy"):
            ec0 = gr.Plot()
            ec1 = gr.Plot()
            ec2 = gr.Plot()
        with gr.TabItem("Audio"):
            play = gr.HTML(label="Audio samples")
    temp_sentmenu.input(label_indices, temp_sentmenu, marked_sentence)

    outputs_list = [pc0, pc1, pc2, ec0, ec1, ec2, play]
    for v in voices:
        outputs_list += [ttstabs[v]['tts_output'], ttstabs[v]['ptts'], ttstabs[v]['etts'], ttstabs[v]['scorearea']]

    temp_button.click(f1, [voiceselect, temp_sentmenu, spanselect], outputs_list)
if __name__ == "__main__":
    bl.launch()
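# bl.launch() serves the app locally (Gradio's default is http://127.0.0.1:7860);
# on Hugging Face Spaces the hosting is handled by the Spaces runtime.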