Spaces:

Emova-ollm
/

EMOVA-demo

Running on Zero

KaiChen1998 committed on Nov 11, 2024

Commit

b3ea40b

1 Parent(s): 91deaa2

update code for speech tokenizer

Files changed (1) hide show

app.py CHANGED Viewed

@@ -21,10 +21,10 @@ auth_token = os.environ.get("TOKEN_FROM_SECRET")
 # Audio part
 ##########################################
 from huggingface_hub import snapshot_download
-snapshot_download(repo_id="Emova-ollm/emova_speech_tokenizer", local_dir='./speech', token=auth_token)
-from speech.speech_utils import s2u_extract_unit_demo, get_ckpt_config_path, load_model
-from speech.speech_utils import load_condition_centroid, get_config_checkpoint_file, load_U2S_model, synthesis
 ####################
 # S2U
@@ -35,19 +35,20 @@ unit_type = '40ms_multilingual_8888'
 language = 'English'
 s2u_model_name = 'SPIRAL-FSQ-CTC'
-ckpt_path, config_path = get_ckpt_config_path(unit_type, language)
-s2u_model = load_model(ckpt_path, config_path, s2u_model_name)
 ####################
 # U2S
 ####################
-condition2style_centroid_file = "./speech/condition_style_centroid/condition2style_centroid.txt"
 condition2style_centroid_file_dict, condition2style_centroid_embedding_dict = load_condition_centroid(condition2style_centroid_file)
 unit_type = '40ms_multilingual_8888_xujing_cosyvoice_FT'
 language = 'Chinese'
-model_config_file, model_checkpoint_file = get_config_checkpoint_file(unit_type, language)
 net_g, hps = load_U2S_model(model_config_file, model_checkpoint_file, unit_type)
 ####################
 # task format

 # Audio part
 ##########################################
 from huggingface_hub import snapshot_download
+snapshot_download(repo_id="Emova-ollm/emova_speech_tokenizer", token=auth_token)
+from emova_speech_tokenizer.speech_utils import get_S2U_ckpt_config_path, load_S2U_model, s2u_extract_unit_demo
+from emova_speech_tokenizer.speech_utils import load_condition_centroid, get_U2S_config_checkpoint_file, load_U2S_model, synthesis
 ####################
 # S2U
 language = 'English'
 s2u_model_name = 'SPIRAL-FSQ-CTC'
+ckpt_path, config_path = get_S2U_ckpt_config_path(unit_type, language)
+s2u_model = load_S2U_model(ckpt_path, config_path, s2u_model_name).cuda()
 ####################
 # U2S
 ####################
+condition2style_centroid_file = "./speech_tokenization/condition_style_centroid/condition2style_centroid.txt"
 condition2style_centroid_file_dict, condition2style_centroid_embedding_dict = load_condition_centroid(condition2style_centroid_file)
 unit_type = '40ms_multilingual_8888_xujing_cosyvoice_FT'
 language = 'Chinese'
+model_config_file, model_checkpoint_file = get_U2S_config_checkpoint_file(unit_type, language)
 net_g, hps = load_U2S_model(model_config_file, model_checkpoint_file, unit_type)
+net_g = net_g.cuda()
 ####################
 # task format