Spaces:
				
			
			
	
			
			
		Runtime error
		
	
	
	
			
			
	
	
	
	
		
		
		Runtime error
		
	Commit · 258fb54
1 Parent(s): b8368df
								
(wip)debug
Browse files
- app.py +11 -1
- cosyvoice/cli/cosyvoice.py +0 -18
- cosyvoice/cli/frontend.py +0 -5
    	
        app.py
    CHANGED
    
    | @@ -86,6 +86,16 @@ def get_cosyvoice(): | |
| 86 | 
             
                                       load_trt=load_trt)
         | 
| 87 | 
             
                        return cosyvoice_instance
         | 
| 88 |  | 
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
| 89 | 
             
            @spaces.GPU
         | 
| 90 | 
             
            def get_asr():
         | 
| 91 | 
             
                global asr_model
         | 
| @@ -194,7 +204,7 @@ def generate_audio(tts_text, mode_checkbox_group, prompt_text, prompt_wav_upload | |
| 194 | 
             
                    logging.info('get zero_shot inference request')
         | 
| 195 | 
             
                    prompt_speech_16k = postprocess(load_wav(prompt_wav, prompt_sr))
         | 
| 196 | 
             
                    set_all_random_seed(seed)
         | 
| 197 | 
            -
                    for i in  | 
| 198 | 
             
                        yield (target_sr, i['tts_speech'].numpy().flatten())
         | 
| 199 | 
             
                elif mode_checkbox_group == 'Cross-lingual Clone':
         | 
| 200 | 
             
                    logging.info('get cross_lingual inference request')
         | 
|  | |
| 86 | 
             
                                       load_trt=load_trt)
         | 
| 87 | 
             
                        return cosyvoice_instance
         | 
| 88 |  | 
| 89 | 
            +
            @spaces.GPU
         | 
| 90 | 
            +
            def infer_zeroshot(tts_text, prompt_text, prompt_speech_16k, stream, speed):
         | 
| 91 | 
            +
                cosyvoice = get_cosyvoice()
         | 
| 92 | 
            +
                if cosyvoice.frontend.instruct is True:
         | 
| 93 | 
            +
                    logging.warning('CosyVoice2-0.5B does not support zero-shot inference, please use CosyVoice-300M or CosyVoice-300M-Instruct.')
         | 
| 94 | 
            +
                    return
         | 
| 95 | 
            +
                for i in cosyvoice.inference_zero_shot(tts_text, prompt_text, prompt_speech_16k, stream=stream, speed=speed):
         | 
| 96 | 
            +
                    yield i
         | 
| 97 | 
            +
             | 
| 98 | 
            +
             | 
| 99 | 
             
            @spaces.GPU
         | 
| 100 | 
             
            def get_asr():
         | 
| 101 | 
             
                global asr_model
         | 
|  | |
| 204 | 
             
                    logging.info('get zero_shot inference request')
         | 
| 205 | 
             
                    prompt_speech_16k = postprocess(load_wav(prompt_wav, prompt_sr))
         | 
| 206 | 
             
                    set_all_random_seed(seed)
         | 
| 207 | 
            +
                    for i in infer_zeroshot(tts_text, prompt_text, prompt_speech_16k, stream=stream, speed=speed):
         | 
| 208 | 
             
                        yield (target_sr, i['tts_speech'].numpy().flatten())
         | 
| 209 | 
             
                elif mode_checkbox_group == 'Cross-lingual Clone':
         | 
| 210 | 
             
                    logging.info('get cross_lingual inference request')
         | 
    	
        cosyvoice/cli/cosyvoice.py
    CHANGED
    
    | @@ -23,7 +23,6 @@ from cosyvoice.utils.file_utils import logging | |
| 23 | 
             
            import spaces
         | 
| 24 |  | 
| 25 | 
             
            class CosyVoice:
         | 
| 26 | 
            -
                @spaces.GPU
         | 
| 27 | 
             
                def __init__(self, model_dir, load_jit=True, load_onnx=False, fp16=True):
         | 
| 28 | 
             
                    instruct = True if '-Instruct' in model_dir else False
         | 
| 29 | 
             
                    self.instruct = instruct
         | 
| @@ -56,18 +55,11 @@ class CosyVoice: | |
| 56 | 
             
                    if load_onnx:
         | 
| 57 | 
             
                        self.model.load_onnx('{}/flow.decoder.estimator.fp32.onnx'.format(model_dir))
         | 
| 58 |  | 
| 59 | 
            -
                @spaces.GPU
         | 
| 60 | 
             
                def list_avaliable_spks(self):
         | 
| 61 | 
             
                    spks = list(self.frontend.spk2info.keys())
         | 
| 62 | 
             
                    return spks
         | 
| 63 |  | 
| 64 | 
            -
                @spaces.GPU
         | 
| 65 | 
            -
                def reload_frontend(self):
         | 
| 66 | 
            -
                    self.frontend.reload_onnx()
         | 
| 67 | 
            -
             | 
| 68 | 
            -
                @spaces.GPU
         | 
| 69 | 
             
                def inference_sft(self, tts_text, spk_id, stream=False, speed=1.0):
         | 
| 70 | 
            -
                    self.reload_frontend()
         | 
| 71 | 
             
                    for i in tqdm(self.frontend.text_normalize(tts_text, split=True)):
         | 
| 72 | 
             
                        model_input = self.frontend.frontend_sft(i, spk_id)
         | 
| 73 | 
             
                        start_time = time.time()
         | 
| @@ -78,9 +70,7 @@ class CosyVoice: | |
| 78 | 
             
                            yield model_output
         | 
| 79 | 
             
                            start_time = time.time()
         | 
| 80 |  | 
| 81 | 
            -
                @spaces.GPU
         | 
| 82 | 
             
                def inference_zero_shot(self, tts_text, prompt_text, prompt_speech_16k, stream=False, speed=1.0):
         | 
| 83 | 
            -
                    self.reload_frontend()
         | 
| 84 | 
             
                    prompt_text = self.frontend.text_normalize(prompt_text, split=False)
         | 
| 85 | 
             
                    for i in tqdm(self.frontend.text_normalize(tts_text, split=True)):
         | 
| 86 | 
             
                        if len(i) < 0.5 * len(prompt_text):
         | 
| @@ -94,7 +84,6 @@ class CosyVoice: | |
| 94 | 
             
                            yield model_output
         | 
| 95 | 
             
                            start_time = time.time()
         | 
| 96 |  | 
| 97 | 
            -
                @spaces.GPU
         | 
| 98 | 
             
                def inference_cross_lingual(self, tts_text, prompt_speech_16k, stream=False, speed=1.0):
         | 
| 99 | 
             
                    self.reload_frontend()
         | 
| 100 | 
             
                    if self.frontend.instruct is True:
         | 
| @@ -109,9 +98,7 @@ class CosyVoice: | |
| 109 | 
             
                            yield model_output
         | 
| 110 | 
             
                            start_time = time.time()
         | 
| 111 |  | 
| 112 | 
            -
                @spaces.GPU
         | 
| 113 | 
             
                def inference_instruct(self, tts_text, spk_id, instruct_text, stream=False, speed=1.0):
         | 
| 114 | 
            -
                    self.reload_frontend()
         | 
| 115 | 
             
                    assert isinstance(self.model, CosyVoiceModel), 'inference_instruct is only implemented for CosyVoice!'
         | 
| 116 | 
             
                    if self.frontend.instruct is False:
         | 
| 117 | 
             
                        raise ValueError('{} do not support instruct inference'.format(self.model_dir))
         | 
| @@ -126,9 +113,7 @@ class CosyVoice: | |
| 126 | 
             
                            yield model_output
         | 
| 127 | 
             
                            start_time = time.time()
         | 
| 128 |  | 
| 129 | 
            -
                @spaces.GPU
         | 
| 130 | 
             
                def inference_instruct2(self, tts_text, instruct_text, prompt_speech_16k, stream=False, speed=1.0):
         | 
| 131 | 
            -
                    self.reload_frontend()
         | 
| 132 | 
             
                    for i in tqdm(self.frontend.text_normalize(tts_text, split=True)):
         | 
| 133 | 
             
                        model_input = self.frontend.frontend_instruct2(i, instruct_text, prompt_speech_16k, self.sample_rate)
         | 
| 134 | 
             
                        start_time = time.time()
         | 
| @@ -139,9 +124,7 @@ class CosyVoice: | |
| 139 | 
             
                            yield model_output
         | 
| 140 | 
             
                            start_time = time.time()
         | 
| 141 |  | 
| 142 | 
            -
                @spaces.GPU
         | 
| 143 | 
             
                def inference_vc(self, source_speech_16k, prompt_speech_16k, stream=False, speed=1.0):
         | 
| 144 | 
            -
                    self.reload_frontend()
         | 
| 145 | 
             
                    model_input = self.frontend.frontend_vc(source_speech_16k, prompt_speech_16k, self.sample_rate)
         | 
| 146 | 
             
                    start_time = time.time()
         | 
| 147 | 
             
                    for model_output in self.model.vc(**model_input, stream=stream, speed=speed):
         | 
| @@ -151,7 +134,6 @@ class CosyVoice: | |
| 151 | 
             
                        start_time = time.time()
         | 
| 152 |  | 
| 153 | 
             
            class CosyVoice2(CosyVoice):
         | 
| 154 | 
            -
                @spaces.GPU
         | 
| 155 | 
             
                def __init__(self, model_dir, load_jit=False, load_onnx=False, load_trt=False):
         | 
| 156 | 
             
                    instruct = True if '-Instruct' in model_dir else False
         | 
| 157 | 
             
                    self.instruct = instruct
         | 
|  | |
| 23 | 
             
            import spaces
         | 
| 24 |  | 
| 25 | 
             
            class CosyVoice:
         | 
|  | |
| 26 | 
             
                def __init__(self, model_dir, load_jit=True, load_onnx=False, fp16=True):
         | 
| 27 | 
             
                    instruct = True if '-Instruct' in model_dir else False
         | 
| 28 | 
             
                    self.instruct = instruct
         | 
|  | |
| 55 | 
             
                    if load_onnx:
         | 
| 56 | 
             
                        self.model.load_onnx('{}/flow.decoder.estimator.fp32.onnx'.format(model_dir))
         | 
| 57 |  | 
|  | |
| 58 | 
             
                def list_avaliable_spks(self):
         | 
| 59 | 
             
                    spks = list(self.frontend.spk2info.keys())
         | 
| 60 | 
             
                    return spks
         | 
| 61 |  | 
|  | |
|  | |
|  | |
|  | |
|  | |
| 62 | 
             
                def inference_sft(self, tts_text, spk_id, stream=False, speed=1.0):
         | 
|  | |
| 63 | 
             
                    for i in tqdm(self.frontend.text_normalize(tts_text, split=True)):
         | 
| 64 | 
             
                        model_input = self.frontend.frontend_sft(i, spk_id)
         | 
| 65 | 
             
                        start_time = time.time()
         | 
|  | |
| 70 | 
             
                            yield model_output
         | 
| 71 | 
             
                            start_time = time.time()
         | 
| 72 |  | 
|  | |
| 73 | 
             
                def inference_zero_shot(self, tts_text, prompt_text, prompt_speech_16k, stream=False, speed=1.0):
         | 
|  | |
| 74 | 
             
                    prompt_text = self.frontend.text_normalize(prompt_text, split=False)
         | 
| 75 | 
             
                    for i in tqdm(self.frontend.text_normalize(tts_text, split=True)):
         | 
| 76 | 
             
                        if len(i) < 0.5 * len(prompt_text):
         | 
|  | |
| 84 | 
             
                            yield model_output
         | 
| 85 | 
             
                            start_time = time.time()
         | 
| 86 |  | 
|  | |
| 87 | 
             
                def inference_cross_lingual(self, tts_text, prompt_speech_16k, stream=False, speed=1.0):
         | 
| 88 | 
             
                    self.reload_frontend()
         | 
| 89 | 
             
                    if self.frontend.instruct is True:
         | 
|  | |
| 98 | 
             
                            yield model_output
         | 
| 99 | 
             
                            start_time = time.time()
         | 
| 100 |  | 
|  | |
| 101 | 
             
                def inference_instruct(self, tts_text, spk_id, instruct_text, stream=False, speed=1.0):
         | 
|  | |
| 102 | 
             
                    assert isinstance(self.model, CosyVoiceModel), 'inference_instruct is only implemented for CosyVoice!'
         | 
| 103 | 
             
                    if self.frontend.instruct is False:
         | 
| 104 | 
             
                        raise ValueError('{} do not support instruct inference'.format(self.model_dir))
         | 
|  | |
| 113 | 
             
                            yield model_output
         | 
| 114 | 
             
                            start_time = time.time()
         | 
| 115 |  | 
|  | |
| 116 | 
             
                def inference_instruct2(self, tts_text, instruct_text, prompt_speech_16k, stream=False, speed=1.0):
         | 
|  | |
| 117 | 
             
                    for i in tqdm(self.frontend.text_normalize(tts_text, split=True)):
         | 
| 118 | 
             
                        model_input = self.frontend.frontend_instruct2(i, instruct_text, prompt_speech_16k, self.sample_rate)
         | 
| 119 | 
             
                        start_time = time.time()
         | 
|  | |
| 124 | 
             
                            yield model_output
         | 
| 125 | 
             
                            start_time = time.time()
         | 
| 126 |  | 
|  | |
| 127 | 
             
                def inference_vc(self, source_speech_16k, prompt_speech_16k, stream=False, speed=1.0):
         | 
|  | |
| 128 | 
             
                    model_input = self.frontend.frontend_vc(source_speech_16k, prompt_speech_16k, self.sample_rate)
         | 
| 129 | 
             
                    start_time = time.time()
         | 
| 130 | 
             
                    for model_output in self.model.vc(**model_input, stream=stream, speed=speed):
         | 
|  | |
| 134 | 
             
                        start_time = time.time()
         | 
| 135 |  | 
| 136 | 
             
            class CosyVoice2(CosyVoice):
         | 
|  | |
| 137 | 
             
                def __init__(self, model_dir, load_jit=False, load_onnx=False, load_trt=False):
         | 
| 138 | 
             
                    instruct = True if '-Instruct' in model_dir else False
         | 
| 139 | 
             
                    self.instruct = instruct
         | 
    	
        cosyvoice/cli/frontend.py
    CHANGED
    
    | @@ -80,11 +80,6 @@ class CosyVoiceFrontEnd: | |
| 80 | 
             
                        self.zh_tn_model = ZhNormalizer(remove_erhua=False, full_to_half=False)
         | 
| 81 | 
             
                        self.en_tn_model = EnNormalizer()
         | 
| 82 |  | 
| 83 | 
            -
                def reload_onnx(self):
         | 
| 84 | 
            -
                    self.campplus_session = onnxruntime.InferenceSession(self.campplus_model, sess_options=self.option, providers=["CPUExecutionProvider"])
         | 
| 85 | 
            -
                    self.speech_tokenizer_session = onnxruntime.InferenceSession(self.speech_tokenizer_model, sess_options=self.option,
         | 
| 86 | 
            -
                                                                                 providers=["CPUExecutionProvider"])
         | 
| 87 | 
            -
             | 
| 88 | 
             
                def _extract_text_token(self, text):
         | 
| 89 | 
             
                    text_token = self.tokenizer.encode(text, allowed_special=self.allowed_special)
         | 
| 90 | 
             
                    text_token = torch.tensor([text_token], dtype=torch.int32).to(self.device)
         | 
|  | |
| 80 | 
             
                        self.zh_tn_model = ZhNormalizer(remove_erhua=False, full_to_half=False)
         | 
| 81 | 
             
                        self.en_tn_model = EnNormalizer()
         | 
| 82 |  | 
|  | |
|  | |
|  | |
|  | |
|  | |
| 83 | 
             
                def _extract_text_token(self, text):
         | 
| 84 | 
             
                    text_token = self.tokenizer.encode(text, allowed_special=self.allowed_special)
         | 
| 85 | 
             
                    text_token = torch.tensor([text_token], dtype=torch.int32).to(self.device)
         |