Upload 5 files

- .gitignore +2 -0
- app.py +11 -9
.gitignore ADDED

@@ -0,0 +1,2 @@
+*.pyc
+__pycache__
app.py CHANGED

@@ -4,7 +4,7 @@ import gradio as gr
 from PIL import Image
 from omegaconf import OmegaConf
 from pathlib import Path
-from vocoder.
+from vocoder.bigvgan.models import VocoderBigVGAN
 from ldm.models.diffusion.ddim import DDIMSampler
 from ldm.util import instantiate_from_config
 from wav_evaluation.models.CLAPWrapper import CLAPWrapper
@@ -29,7 +29,7 @@ def initialize_model(config, ckpt):
     return sampler

 sampler = initialize_model('configs/text_to_audio/txt2audio_args.yaml', 'useful_ckpts/ta40multi_epoch=000085.ckpt')
-vocoder =
+vocoder = VocoderBigVGAN('vocoder/logs/bigv16k53w',device=device)
 clap_model = CLAPWrapper('useful_ckpts/CLAP/CLAP_weights_2022.pth','useful_ckpts/CLAP/config.yml',use_cuda=torch.cuda.is_available())

 def select_best_audio(prompt,wav_list):
@@ -52,7 +52,7 @@ def txt2audio(sampler,vocoder,prompt, seed, scale, ddim_steps, n_samples=1, W=62
     uc = None
     if scale != 1.0:
         uc = sampler.model.get_learned_conditioning(n_samples * [""])
-    c = sampler.model.get_learned_conditioning(n_samples * [prompt])
+    c = sampler.model.get_learned_conditioning(n_samples * [prompt])  # shape [1, 77, 1280]: still per-word embeddings, not yet pooled into a sentence embedding
     shape = [sampler.model.first_stage_model.embed_dim, H//8, W//8]  # (z_dim, 80//2^x, 848//2^x)
     samples_ddim, _ = sampler.sample(S=ddim_steps,
                                         conditioning=c,
@@ -74,7 +74,7 @@ def txt2audio(sampler,vocoder,prompt, seed, scale, ddim_steps, n_samples=1, W=62
     return best_wav


-def predict(prompt, ddim_steps, num_samples, scale, seed):
+def predict(prompt, ddim_steps, num_samples, scale, seed):  # from experiments, input_image must be 256x256 or 512x512 for good results; it should really be resized on the way in and resized back on the way out, but they use padding instead, for unclear reasons
     melbins,mel_len = 80,624
     with torch.no_grad():
         result = txt2audio(
@@ -97,21 +97,23 @@ with gr.Blocks() as demo:

     with gr.Row():
         with gr.Column():
-            prompt = gr.Textbox(label="Prompt: Input your text here
+            prompt = gr.Textbox(label="Prompt: Input your text here.")
             run_button = gr.Button(label="Run")


             with gr.Accordion("Advanced options", open=False):
                 num_samples = gr.Slider(
-                    label="
+                    label="Number of candidate audios. This controls how many audios are generated \
+                        (e.g. generate three audios and show you the best one). A larger value usually leads to \
+                        better quality at the cost of heavier computation", minimum=1, maximum=10, value=3, step=1)
                 # num_samples = 1
                 ddim_steps = gr.Slider(label="Steps", minimum=1,
                                        maximum=150, value=100, step=1)
                 scale = gr.Slider(
-                    label="Guidance Scale", minimum=0.1, maximum=4.0, value=1.5, step=0.1
+                    label="Guidance Scale (larger => more relevant to the text, but quality may drop)", minimum=0.1, maximum=4.0, value=1.5, step=0.1
                 )
                 seed = gr.Slider(
-                    label="Seed",
+                    label="Seed: changing this value (any integer) leads to a different generation result.",
                     minimum=0,
                     maximum=2147483647,
                     step=1,
@@ -138,4 +140,4 @@ with gr.Blocks() as demo:
         with gr.Column():
             pass

-demo.launch()
+demo.launch(share=True)
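
Note on the conditioning change in hunk @@ -52,7 +52,7 @@: the uc/c pair implements classifier-free guidance. The sampler is steered away from the empty-prompt embedding uc and toward the prompt embedding c by the factor scale, which is why the Guidance Scale slider trades text relevance against quality. A minimal sketch of the flow, assuming the sampler and vocoder objects built above; decode_first_stage() and vocoder.vocode() are assumptions about this repo's API, not verified signatures:

import torch

# Hedged sketch of the guided sampling flow in txt2audio; not the repo's exact code.
def guided_sample(sampler, vocoder, prompt, scale=1.5, ddim_steps=100,
                  n_samples=1, H=80, W=624):
    with torch.no_grad():
        uc = None
        if scale != 1.0:
            # Empty-prompt embedding: the model's prediction is pushed away
            # from this and toward `c`, scaled by `scale`.
            uc = sampler.model.get_learned_conditioning(n_samples * [""])
        # Per-token text embeddings (e.g. shape [n_samples, 77, 1280]); as the
        # diff comment notes, these are word-level, not a pooled sentence vector.
        c = sampler.model.get_learned_conditioning(n_samples * [prompt])
        shape = [sampler.model.first_stage_model.embed_dim, H // 8, W // 8]
        samples, _ = sampler.sample(S=ddim_steps,
                                    conditioning=c,
                                    batch_size=n_samples,
                                    shape=shape,
                                    unconditional_guidance_scale=scale,
                                    unconditional_conditioning=uc)
        mels = sampler.model.decode_first_stage(samples)  # latent -> mel spectrogram
        # vocode() is an assumed method name for the mel -> waveform step.
        return [vocoder.vocode(m.squeeze(0).cpu().numpy()) for m in mels]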
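Similarly, the num_samples slider exists because txt2audio generates several candidate waveforms and select_best_audio(prompt, wav_list) keeps only one, ranked by the CLAP model loaded at module level. A plausible sketch of that ranking, where the method names on clap_model are guesses for illustration rather than the documented CLAPWrapper API:

import numpy as np

# Illustrative only: the embedding/similarity helpers are assumed, not confirmed.
def pick_best(clap_model, prompt, wav_list):
    text_emb = clap_model.get_text_embeddings([prompt])           # assumed helper
    audio_embs = clap_model.get_audio_embeddings(wav_list)        # assumed helper
    scores = clap_model.compute_similarity(audio_embs, text_emb)  # prompt/audio similarity
    best = int(np.argmax(np.asarray(scores).reshape(-1)))
    return wav_list[best]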