Spaces:
				
			
			
	
			
			
					
		Running
		
	
	
	
			
			
	
	
	
	
		
		
					
		Running
		
	Sync from GitHub repo
Browse files
This Space is synced from the GitHub repo: https://github.com/SWivid/F5-TTS. Please submit contributions to the Space there.
- app.py +2 -3
- inference-cli.py +2 -3
- model/utils.py +2 -3
    	
        app.py
    CHANGED
    
    | @@ -158,9 +158,8 @@ def infer_batch(ref_audio, ref_text, gen_text_batches, exp_name, remove_silence, | |
| 158 |  | 
| 159 | 
             
                    # Calculate duration
         | 
| 160 | 
             
                    ref_audio_len = audio.shape[-1] // hop_length
         | 
| 161 | 
            -
                     | 
| 162 | 
            -
                     | 
| 163 | 
            -
                    gen_text_len = len(gen_text.encode('utf-8')) + 3 * len(re.findall(zh_pause_punc, gen_text))
         | 
| 164 | 
             
                    duration = ref_audio_len + int(ref_audio_len / ref_text_len * gen_text_len / speed)
         | 
| 165 |  | 
| 166 | 
             
                    # inference
         | 
|  | |
| 158 |  | 
| 159 | 
             
                    # Calculate duration
         | 
| 160 | 
             
                    ref_audio_len = audio.shape[-1] // hop_length
         | 
| 161 | 
            +
                    ref_text_len = len(ref_text.encode('utf-8'))
         | 
| 162 | 
            +
                    gen_text_len = len(gen_text.encode('utf-8'))
         | 
|  | |
| 163 | 
             
                    duration = ref_audio_len + int(ref_audio_len / ref_text_len * gen_text_len / speed)
         | 
| 164 |  | 
| 165 | 
             
                    # inference
         | 
    	
        inference-cli.py
    CHANGED
    
    | @@ -250,9 +250,8 @@ def infer_batch(ref_audio, ref_text, gen_text_batches, model,ckpt_file,file_voca | |
| 250 |  | 
| 251 | 
             
                    # Calculate duration
         | 
| 252 | 
             
                    ref_audio_len = audio.shape[-1] // hop_length
         | 
| 253 | 
            -
                     | 
| 254 | 
            -
                     | 
| 255 | 
            -
                    gen_text_len = len(gen_text.encode('utf-8')) + 3 * len(re.findall(zh_pause_punc, gen_text))
         | 
| 256 | 
             
                    duration = ref_audio_len + int(ref_audio_len / ref_text_len * gen_text_len / speed)
         | 
| 257 |  | 
| 258 | 
             
                    # inference
         | 
|  | |
| 250 |  | 
| 251 | 
             
                    # Calculate duration
         | 
| 252 | 
             
                    ref_audio_len = audio.shape[-1] // hop_length
         | 
| 253 | 
            +
                    ref_text_len = len(ref_text.encode('utf-8'))
         | 
| 254 | 
            +
                    gen_text_len = len(gen_text.encode('utf-8'))
         | 
|  | |
| 255 | 
             
                    duration = ref_audio_len + int(ref_audio_len / ref_text_len * gen_text_len / speed)
         | 
| 256 |  | 
| 257 | 
             
                    # inference
         | 
    	
        model/utils.py
    CHANGED
    
    | @@ -296,9 +296,8 @@ def get_inference_prompt( | |
| 296 | 
             
                        # # test vocoder resynthesis
         | 
| 297 | 
             
                        # ref_audio = gt_audio
         | 
| 298 | 
             
                    else:
         | 
| 299 | 
            -
                         | 
| 300 | 
            -
                         | 
| 301 | 
            -
                        gen_text_len = len(gt_text.encode('utf-8')) + 3 * len(re.findall(zh_pause_punc, gt_text))
         | 
| 302 | 
             
                        total_mel_len = ref_mel_len + int(ref_mel_len / ref_text_len * gen_text_len / speed)
         | 
| 303 |  | 
| 304 | 
             
                    # to mel spectrogram
         | 
|  | |
| 296 | 
             
                        # # test vocoder resynthesis
         | 
| 297 | 
             
                        # ref_audio = gt_audio
         | 
| 298 | 
             
                    else:
         | 
| 299 | 
            +
                        ref_text_len = len(prompt_text.encode('utf-8'))
         | 
| 300 | 
            +
                        gen_text_len = len(gt_text.encode('utf-8'))
         | 
|  | |
| 301 | 
             
                        total_mel_len = ref_mel_len + int(ref_mel_len / ref_text_len * gen_text_len / speed)
         | 
| 302 |  | 
| 303 | 
             
                    # to mel spectrogram
         | 
