diff --git a/.gitattributes b/.gitattributes
index a6344aac8c09253b3b630fb776ae94478aa0275b..507ca700bfd20ffc5c04db681bb2ed442e2f0d09 100644
--- a/.gitattributes
+++ b/.gitattributes
@@ -33,3 +33,8 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
 *.zip filter=lfs diff=lfs merge=lfs -text
 *.zst filter=lfs diff=lfs merge=lfs -text
 *tfevents* filter=lfs diff=lfs merge=lfs -text
+checkpoint-10800/tokenizer.json filter=lfs diff=lfs merge=lfs -text
+checkpoint-14400/tokenizer.json filter=lfs diff=lfs merge=lfs -text
+checkpoint-3600/tokenizer.json filter=lfs diff=lfs merge=lfs -text
+checkpoint-7200/tokenizer.json filter=lfs diff=lfs merge=lfs -text
+tokenizer.json filter=lfs diff=lfs merge=lfs -text
diff --git a/checkpoint-10800/config.json b/checkpoint-10800/config.json
new file mode 100644
index 0000000000000000000000000000000000000000..0ce016ecf9d037d4afa627d419c26b5ddd28a2f8
--- /dev/null
+++ b/checkpoint-10800/config.json
@@ -0,0 +1,43 @@
+{
+  "architectures": [
+    "UltravoxModel"
+  ],
+  "audio_latency_block_size": null,
+  "audio_model_id": "openai/whisper-large-v3-turbo",
+  "audio_model_lora_config": {
+    "lora_alpha": 8,
+    "r": 0,
+    "target_modules": [
+      "k_proj",
+      "q_proj",
+      "linear_k",
+      "linear_q"
+    ]
+  },
+  "auto_map": {
+    "AutoConfig": "ultravox_config.UltravoxConfig",
+    "AutoModel": "ultravox_model.UltravoxModel"
+  },
+  "hidden_size": 4096,
+  "ignore_index": -100,
+  "initializer_range": 0.02,
+  "model_type": "ultravox",
+  "norm_init": 0.4,
+  "pad_token_id": 128009,
+  "projector_act": "swiglu",
+  "stack_factor": 8,
+  "text_model_id": "meta-llama/Llama-3.2-1B-Instruct",
+  "text_model_lora_config": {
+    "lora_alpha": 8,
+    "r": 0,
+    "target_modules": [
+      "k_proj",
+      "q_proj",
+      "linear_k",
+      "linear_q"
+    ]
+  },
+  "torch_dtype": "bfloat16",
+  "transformers_version": "4.47.0",
+  "vocab_size": 128256
+}
diff --git a/checkpoint-10800/generation_config.json b/checkpoint-10800/generation_config.json
new file mode 100644
index 0000000000000000000000000000000000000000..4dac817850f65a6be4d01d824462c9fe54468763
--- /dev/null
+++ b/checkpoint-10800/generation_config.json
@@ -0,0 +1,11 @@
+{
+  "_from_model_config": true,
+  "bos_token_id": 128000,
+  "eos_token_id": [
+    128001,
+    128008,
+    128009
+  ],
+  "pad_token_id": 128009,
+  "transformers_version": "4.47.0"
+}
diff --git a/checkpoint-10800/model.safetensors b/checkpoint-10800/model.safetensors
new file mode 100644
index 0000000000000000000000000000000000000000..9257459e35b5cb2333f78b10422fa30f35030d72
--- /dev/null
+++ b/checkpoint-10800/model.safetensors
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:fb8c9e69ce51c40cf7113f8c13039453372cf800e2546f6e70e2806ea50b01ff
+size 92299736
diff --git a/checkpoint-10800/optimizer.pt b/checkpoint-10800/optimizer.pt
new file mode 100644
index 0000000000000000000000000000000000000000..a9c2a482192618b91a8fdd369ae579b24aa4730a
--- /dev/null
+++ b/checkpoint-10800/optimizer.pt
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:a469d87dbbb0799b502a39ab9d99b8cb02e0bcee34e1d96a01d5d6ce56f06268
+size 184602962
diff --git a/checkpoint-10800/rng_state.pth b/checkpoint-10800/rng_state.pth
new file mode 100644
index 0000000000000000000000000000000000000000..ef2c084c6e7fa581c15339924d21d94e6e36a272
--- /dev/null
+++ b/checkpoint-10800/rng_state.pth
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:282eacecfb6f7e9eb42fb868e2a5ed7c671ea124a6cbc6714f7daf0bf89f1c9e
+size 14244
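The config.json above wires the openai/whisper-large-v3-turbo audio encoder to meta-llama/Llama-3.2-1B-Instruct through a SwiGLU projector ("projector_act": "swiglu") that stacks audio frames by a factor of 8 ("stack_factor": 8), and its auto_map points at the repo's own UltravoxConfig/UltravoxModel classes rather than classes bundled with Transformers. A minimal loading sketch, assuming the checkpoint directory and the referenced ultravox_config.py/ultravox_model.py modules are available locally (the local path is illustrative):

    from transformers import AutoConfig

    # trust_remote_code is required because auto_map resolves to custom
    # classes shipped with the repo, not to built-in Transformers classes.
    config = AutoConfig.from_pretrained("checkpoint-10800", trust_remote_code=True)
    print(config.model_type)      # "ultravox"
    print(config.audio_model_id)  # "openai/whisper-large-v3-turbo"
    print(config.text_model_id)   # "meta-llama/Llama-3.2-1B-Instruct"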
diff --git a/checkpoint-10800/scheduler.pt b/checkpoint-10800/scheduler.pt
new file mode 100644
index 0000000000000000000000000000000000000000..33b05792bca2f573edcf0742d9651201c8e5e3c4
--- /dev/null
+++ b/checkpoint-10800/scheduler.pt
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:01ae122cbb1630c5e9128a1def009fb9fe6db25bf8dbd0de4ec680be8f1568e1
+size 1064
diff --git a/checkpoint-10800/special_tokens_map.json b/checkpoint-10800/special_tokens_map.json
new file mode 100644
index 0000000000000000000000000000000000000000..b43be96621d147110fb8a18b5776ec6e38516127
--- /dev/null
+++ b/checkpoint-10800/special_tokens_map.json
@@ -0,0 +1,17 @@
+{
+  "bos_token": {
+    "content": "<|begin_of_text|>",
+    "lstrip": false,
+    "normalized": false,
+    "rstrip": false,
+    "single_word": false
+  },
+  "eos_token": {
+    "content": "<|eot_id|>",
+    "lstrip": false,
+    "normalized": false,
+    "rstrip": false,
+    "single_word": false
+  },
+  "pad_token": "<|eot_id|>"
+}
diff --git a/checkpoint-10800/tokenizer.json b/checkpoint-10800/tokenizer.json
new file mode 100644
index 0000000000000000000000000000000000000000..1c1d8d5c9024994f1d3b00f9662b8dd89ca13cf2
--- /dev/null
+++ b/checkpoint-10800/tokenizer.json
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:6b9e4e7fb171f92fd137b777cc2714bf87d11576700a1dcd7a399e7bbe39537b
+size 17209920
diff --git a/checkpoint-10800/tokenizer_config.json b/checkpoint-10800/tokenizer_config.json
new file mode 100644
index 0000000000000000000000000000000000000000..fb0e8efdce308a03b5350019cf1f24fa4375396f
--- /dev/null
+++ b/checkpoint-10800/tokenizer_config.json
@@ -0,0 +1,2064 @@
+{
+  "added_tokens_decoder": {
+    "128000": {"content": "<|begin_of_text|>", "lstrip": false, "normalized": false, "rstrip": false, "single_word": false, "special": true},
+    "128001": {"content": "<|end_of_text|>", "lstrip": false, "normalized": false, "rstrip": false, "single_word": false, "special": true},
+    "128002": {"content": "<|reserved_special_token_0|>", "lstrip": false, "normalized": false, "rstrip": false, "single_word": false, "special": true},
+    "128003": {"content": "<|reserved_special_token_1|>", "lstrip": false, "normalized": false, "rstrip": false, "single_word": false, "special": true},
+    "128004": {"content": "<|finetune_right_pad_id|>", "lstrip": false, "normalized": false, "rstrip": false, "single_word": false, "special": true},
+    "128005": {"content": "<|reserved_special_token_2|>", "lstrip": false, "normalized": false, "rstrip": false, "single_word": false, "special": true},
+    "128006": {"content": "<|start_header_id|>", "lstrip": false, "normalized": false, "rstrip": false, "single_word": false, "special": true},
+    "128007": {"content": "<|end_header_id|>", "lstrip": false, "normalized": false, "rstrip": false, "single_word": false, "special": true},
+    "128008": {"content": "<|eom_id|>", "lstrip": false, "normalized": false, "rstrip": false, "single_word": false, "special": true},
+    "128009": {"content": "<|eot_id|>", "lstrip": false, "normalized": false, "rstrip": false, "single_word": false, "special": true},
+    "128010": {"content": "<|python_tag|>", "lstrip": false, "normalized": false, "rstrip": false, "single_word": false, "special": true},
+    "128011": {"content": "<|reserved_special_token_3|>", "lstrip": false, "normalized": false, "rstrip": false, "single_word": false, "special": true},
+    "128012": {"content": "<|reserved_special_token_4|>", "lstrip": false, "normalized": false, "rstrip": false, "single_word": false, "special": true},
+    "128013": {"content": "<|reserved_special_token_5|>", "lstrip": false, "normalized": false, "rstrip": false, "single_word": false, "special": true},
+    "128014": {"content": "<|reserved_special_token_6|>", "lstrip": false, "normalized": false, "rstrip": false, "single_word": false, "special": true},
+    "128015": {"content": "<|reserved_special_token_7|>", "lstrip": false, "normalized": false, "rstrip": false, "single_word": false, "special": true},
+    "128016": {"content": "<|reserved_special_token_8|>", "lstrip": false, "normalized": false, "rstrip": false, "single_word": false, "special": true},
+    "128017": {"content": "<|reserved_special_token_9|>", "lstrip": false, "normalized": false, "rstrip": false, "single_word": false, "special": true},
+    "128018": {"content": "<|reserved_special_token_10|>", "lstrip": false, "normalized": false, "rstrip": false, "single_word": false, "special": true},
+    "128019": {"content": "<|reserved_special_token_11|>", "lstrip": false, "normalized": false, "rstrip": false, "single_word": false, "special": true},
+    "128020": {"content": "<|reserved_special_token_12|>", "lstrip": false, "normalized": false, "rstrip": false, "single_word": false, "special": true},
+    "128021": {"content": "<|reserved_special_token_13|>", "lstrip": false, "normalized": false, "rstrip": false, "single_word": false, "special": true},
+    "128022": {"content": "<|reserved_special_token_14|>", "lstrip": false, "normalized": false, "rstrip": false, "single_word": false, "special": true},
+    "128023": {"content": "<|reserved_special_token_15|>", "lstrip": false, "normalized": false, "rstrip": false, "single_word": false, "special": true},
+    "128024": {"content": "<|reserved_special_token_16|>", "lstrip": false, "normalized": false, "rstrip": false, "single_word": false, "special": true},
+    "128025": {"content": "<|reserved_special_token_17|>", "lstrip": false, "normalized": false, "rstrip": false, "single_word": false, "special": true},
+    "128026": {"content": "<|reserved_special_token_18|>", "lstrip": false, "normalized": false, "rstrip": false, "single_word": false, "special": true},
+    "128027": {"content": "<|reserved_special_token_19|>", "lstrip": false, "normalized": false, "rstrip": false, "single_word": false, "special": true},
+    "128028": {"content": "<|reserved_special_token_20|>", "lstrip": false, "normalized": false, "rstrip": false, "single_word": false, "special": true},
+    "128029": {"content": "<|reserved_special_token_21|>", "lstrip": false, "normalized": false, "rstrip": false, "single_word": false, "special": true},
+    "128030": {"content": "<|reserved_special_token_22|>", "lstrip": false, "normalized": false, "rstrip": false, "single_word": false, "special": true},
+    "128031": {"content": "<|reserved_special_token_23|>", "lstrip": false, "normalized": false, "rstrip": false, "single_word": false, "special": true},
+    "128032": {"content": "<|reserved_special_token_24|>", "lstrip": false, "normalized": false, "rstrip": false, "single_word": false, "special": true},
+    "128033": {"content": "<|reserved_special_token_25|>", "lstrip": false, "normalized": false, "rstrip": false, "single_word": false, "special": true},
+    "128034": {"content": "<|reserved_special_token_26|>", "lstrip": false, "normalized": false, "rstrip": false, "single_word": false, "special": true},
+    "128035": {"content": "<|reserved_special_token_27|>", "lstrip": false, "normalized": false, "rstrip": false, "single_word": false, "special": true},
+    "128036": {"content": "<|reserved_special_token_28|>", "lstrip": false, "normalized": false, "rstrip": false, "single_word": false, "special": true},
+    "128037": {"content": "<|reserved_special_token_29|>", "lstrip": false, "normalized": false, "rstrip": false, "single_word": false, "special": true},
+    "128038": {"content": "<|reserved_special_token_30|>", "lstrip": false, "normalized": false, "rstrip": false, "single_word": false, "special": true},
+    "128039": {"content": "<|reserved_special_token_31|>", "lstrip": false, "normalized": false, "rstrip": false, "single_word": false, "special": true},
+    "128040": {"content": "<|reserved_special_token_32|>", "lstrip": false, "normalized": false, "rstrip": false, "single_word": false, "special": true},
+    "128041": {"content": "<|reserved_special_token_33|>", "lstrip": false, "normalized": false, "rstrip": false, "single_word": false, "special": true},
+    "128042": {"content": "<|reserved_special_token_34|>", "lstrip": false, "normalized": false, "rstrip": false, "single_word": false, "special": true},
+    "128043": {"content": "<|reserved_special_token_35|>", "lstrip": false, "normalized": false, "rstrip": false, "single_word": false, "special": true},
+    "128044": {"content": "<|reserved_special_token_36|>", "lstrip": false, "normalized": false, "rstrip": false, "single_word": false, "special": true},
+    "128045": {"content": "<|reserved_special_token_37|>", "lstrip": false, "normalized": false, "rstrip": false, "single_word": false, "special": true},
+    "128046": {"content": "<|reserved_special_token_38|>", "lstrip": false, "normalized": false, "rstrip": false, "single_word": false, "special": true},
+    "128047": {"content": "<|reserved_special_token_39|>", "lstrip": false, "normalized": false, "rstrip": false, "single_word": false, "special": true},
+    "128048": {"content": "<|reserved_special_token_40|>", "lstrip": false, "normalized": false, "rstrip": false, "single_word": false, "special": true},
+    "128049": {"content": "<|reserved_special_token_41|>", "lstrip": false, "normalized": false, "rstrip": false, "single_word": false, "special": true},
+    "128050": {"content": "<|reserved_special_token_42|>", "lstrip": false, "normalized": false, "rstrip": false, "single_word": false, "special": true},
+    "128051": {"content": "<|reserved_special_token_43|>", "lstrip": false, "normalized": false, "rstrip": false, "single_word": false, "special": true},
+    "128052": {"content": "<|reserved_special_token_44|>", "lstrip": false, "normalized": false, "rstrip": false, "single_word": false, "special": true},
+    "128053": {"content": "<|reserved_special_token_45|>", "lstrip": false, "normalized": false, "rstrip": false, "single_word": false, "special": true},
+    "128054": {"content": "<|reserved_special_token_46|>", "lstrip": false, "normalized": false, "rstrip": false, "single_word": false, "special": true},
+    "128055": {"content": "<|reserved_special_token_47|>", "lstrip": false, "normalized": false, "rstrip": false, "single_word": false, "special": true},
+    "128056": {"content": "<|reserved_special_token_48|>", "lstrip": false, "normalized": false, "rstrip": false, "single_word": false, "special": true},
+    "128057": {"content": "<|reserved_special_token_49|>", "lstrip": false, "normalized": false, "rstrip": false, "single_word": false, "special": true},
+    "128058": {"content": "<|reserved_special_token_50|>", "lstrip": false, "normalized": false, "rstrip": false, "single_word": false, "special": true},
+    "128059": {"content": "<|reserved_special_token_51|>", "lstrip": false, "normalized": false, "rstrip": false, "single_word": false, "special": true},
+    "128060": {"content": "<|reserved_special_token_52|>", "lstrip": false, "normalized": false, "rstrip": false, "single_word": false, "special": true},
+    "128061": {"content": "<|reserved_special_token_53|>", "lstrip": false, "normalized": false, "rstrip": false, "single_word": false, "special": true},
+    "128062": {"content": "<|reserved_special_token_54|>", "lstrip": false, "normalized": false, "rstrip": false, "single_word": false, "special": true},
+    "128063": {"content": "<|reserved_special_token_55|>", "lstrip": false, "normalized": false, "rstrip": false, "single_word": false, "special": true},
+    "128064": {"content": "<|reserved_special_token_56|>", "lstrip": false, "normalized": false, "rstrip": false, "single_word": false, "special": true},
+    "128065": {"content": "<|reserved_special_token_57|>", "lstrip": false, "normalized": false, "rstrip": false, "single_word": false, "special": true},
+    "128066": {"content": "<|reserved_special_token_58|>", "lstrip": false, "normalized": false, "rstrip": false, "single_word": false, "special": true},
+    "128067": {"content": "<|reserved_special_token_59|>", "lstrip": false, "normalized": false, "rstrip": false, "single_word": false, "special": true},
+    "128068": {"content": "<|reserved_special_token_60|>", "lstrip": false, "normalized": false, "rstrip": false, "single_word": false, "special": true},
+    "128069": {"content": "<|reserved_special_token_61|>", "lstrip": false, "normalized": false, "rstrip": false, "single_word": false, "special": true},
+    "128070": {"content": "<|reserved_special_token_62|>", "lstrip": false, "normalized": false, "rstrip": false, "single_word": false, "special": true},
+    "128071": {"content": "<|reserved_special_token_63|>", "lstrip": false, "normalized": false, "rstrip": false, "single_word": false, "special": true},
+    "128072": {"content": "<|reserved_special_token_64|>", "lstrip": false, "normalized": false, "rstrip": false, "single_word": false, "special": true},
+    "128073": {"content": "<|reserved_special_token_65|>", "lstrip": false, "normalized": false, "rstrip": false, "single_word": false, "special": true},
+    "128074": {"content": "<|reserved_special_token_66|>", "lstrip": false, "normalized": false, "rstrip": false, "single_word": false, "special": true},
+    "128075": {"content": "<|reserved_special_token_67|>", "lstrip": false, "normalized": false, "rstrip": false, "single_word": false, "special": true},
+    "128076": {"content": "<|reserved_special_token_68|>", "lstrip": false, "normalized": false, "rstrip": false, "single_word": false, "special": true},
+    "128077": {"content": "<|reserved_special_token_69|>", "lstrip": false, "normalized": false, "rstrip": false, "single_word": false, "special": true},
+    "128078": {"content": "<|reserved_special_token_70|>", "lstrip": false, "normalized": false, "rstrip": false, "single_word": false, "special": true},
+    "128079": {"content": "<|reserved_special_token_71|>", "lstrip": false, "normalized": false, "rstrip": false, "single_word": false, "special": true},
+    "128080": {"content": "<|reserved_special_token_72|>", "lstrip": false, "normalized": false, "rstrip": false, "single_word": false, "special": true},
+    "128081": {"content": "<|reserved_special_token_73|>", "lstrip": false, "normalized": false, "rstrip": false, "single_word": false, "special": true},
+    "128082": {"content": "<|reserved_special_token_74|>", "lstrip": false, "normalized": false, "rstrip": false, "single_word": false, "special": true},
+    "128083": {"content": "<|reserved_special_token_75|>", "lstrip": false, "normalized": false, "rstrip": false, "single_word": false, "special": true},
+    "128084": {"content": "<|reserved_special_token_76|>", "lstrip": false, "normalized": false, "rstrip": false, "single_word": false, "special": true},
+    "128085": {"content": "<|reserved_special_token_77|>", "lstrip": false, "normalized": false, "rstrip": false, "single_word": false, "special": true},
+    "128086": {"content": "<|reserved_special_token_78|>", "lstrip": false, "normalized": false, "rstrip": false, "single_word": false, "special": true},
+    "128087": {"content": "<|reserved_special_token_79|>", "lstrip": false, "normalized": false, "rstrip": false, "single_word": false, "special": true},
+    "128088": {"content": "<|reserved_special_token_80|>", "lstrip": false, "normalized": false, "rstrip": false, "single_word": false, "special": true},
+    "128089": {"content": "<|reserved_special_token_81|>", "lstrip": false, "normalized": false, "rstrip": false, "single_word": false, "special": true},
+    "128090": {"content": "<|reserved_special_token_82|>", "lstrip": false, "normalized": false, "rstrip": false, "single_word": false, "special": true},
+    "128091": {"content": "<|reserved_special_token_83|>", "lstrip": false, "normalized": false, "rstrip": false, "single_word": false, "special": true},
+    "128092": {"content": "<|reserved_special_token_84|>", "lstrip": false, "normalized": false, "rstrip": false, "single_word": false, "special": true},
+    "128093": {"content": "<|reserved_special_token_85|>", "lstrip": false, "normalized": false, "rstrip": false, "single_word": false, "special": true},
+    "128094": {"content": "<|reserved_special_token_86|>", "lstrip": false, "normalized": false, "rstrip": false, "single_word": false, "special": true},
+    "128095": {"content": "<|reserved_special_token_87|>", "lstrip": false, "normalized": false, "rstrip": false, "single_word": false, "special": true},
+    "128096": {"content": "<|reserved_special_token_88|>", "lstrip": false, "normalized": false, "rstrip": false, "single_word": false, "special": true},
+    "128097": {"content": "<|reserved_special_token_89|>", "lstrip": false, "normalized": false, "rstrip": false, "single_word": false, "special": true},
+    "128098": {"content": "<|reserved_special_token_90|>", "lstrip": false, "normalized": false, "rstrip": false, "single_word": false, "special": true},
+    "128099": {"content": "<|reserved_special_token_91|>", "lstrip": false, "normalized": false, "rstrip": false, "single_word": false, "special": true},
+    "128100": {"content": "<|reserved_special_token_92|>", "lstrip": false, "normalized": false, "rstrip": false, "single_word": false, "special": true},
+    "128101": {"content": "<|reserved_special_token_93|>", "lstrip": false, "normalized": false, "rstrip": false, "single_word": false, "special": true},
+    "128102": {"content": "<|reserved_special_token_94|>", "lstrip": false, "normalized": false, "rstrip": false, "single_word": false, "special": true},
+    "128103": {"content": "<|reserved_special_token_95|>", "lstrip": false, "normalized": false, "rstrip": false, "single_word": false, "special": true},
+    "128104": {"content": "<|reserved_special_token_96|>", "lstrip": false, "normalized": false, "rstrip": false, "single_word": false, "special": true},
+    "128105": {"content": "<|reserved_special_token_97|>", "lstrip": false, "normalized": false, "rstrip": false, "single_word": false, "special": true},
+    "128106": {"content": "<|reserved_special_token_98|>", "lstrip": false, "normalized": false, "rstrip": false, "single_word": false, "special": true},
+    "128107": {"content": "<|reserved_special_token_99|>", "lstrip": false, "normalized": false, "rstrip": false, "single_word": false, "special": true},
+    "128108": {"content": "<|reserved_special_token_100|>", "lstrip": false, "normalized": false, "rstrip": false, "single_word": false, "special": true},
+    "128109": {"content": "<|reserved_special_token_101|>", "lstrip": false, "normalized": false, "rstrip": false, "single_word": false, "special": true},
+    "128110": {"content": "<|reserved_special_token_102|>", "lstrip": false, "normalized": false, "rstrip": false, "single_word": false, "special": true},
+    "128111": {"content": "<|reserved_special_token_103|>", "lstrip": false, "normalized": false, "rstrip": false, "single_word": false, "special": true},
+    "128112": {"content": "<|reserved_special_token_104|>", "lstrip": false, "normalized": false, "rstrip": false, "single_word": false, "special": true},
+    "128113": {"content": "<|reserved_special_token_105|>", "lstrip": false, "normalized": false, "rstrip": false, "single_word": false, "special": true},
+    "128114": {"content": "<|reserved_special_token_106|>", "lstrip": false, "normalized": false, "rstrip": false, "single_word": false, "special": true},
+    "128115": {"content": "<|reserved_special_token_107|>", "lstrip": false, "normalized": false, "rstrip": false, "single_word": false, "special": true},
+    "128116": {"content": "<|reserved_special_token_108|>", "lstrip": false, "normalized": false, "rstrip": false, "single_word": false, "special": true},
+    "128117": {"content": "<|reserved_special_token_109|>", "lstrip": false, "normalized": false, "rstrip": false, "single_word": false, "special": true},
+    "128118": {"content": "<|reserved_special_token_110|>", "lstrip": false, "normalized": false, "rstrip": false, "single_word": false, "special": true},
+    "128119": {"content": "<|reserved_special_token_111|>", "lstrip": false, "normalized": false, "rstrip": false, "single_word": false, "special": true},
+    "128120": {"content": "<|reserved_special_token_112|>", "lstrip": false, "normalized": false, "rstrip": false, "single_word": false, "special": true},
+    "128121": {"content": "<|reserved_special_token_113|>", "lstrip": false, "normalized": false, "rstrip": false, "single_word": false, "special": true},
+    "128122": {"content": "<|reserved_special_token_114|>", "lstrip": false, "normalized": false, "rstrip": false, "single_word": false, "special": true},
+    "128123": {"content": "<|reserved_special_token_115|>", "lstrip": false, "normalized": false, "rstrip": false, "single_word": false, "special": true},
+    "128124": {"content": "<|reserved_special_token_116|>", "lstrip": false, "normalized": false, "rstrip": false, "single_word": false, "special": true},
+    "128125": {"content": "<|reserved_special_token_117|>", "lstrip": false, "normalized": false, "rstrip": false, "single_word": false, "special": true},
+    "128126": {"content": "<|reserved_special_token_118|>", "lstrip": false, "normalized": false, "rstrip": false, "single_word": false, "special": true},
+    "128127": {"content": "<|reserved_special_token_119|>", "lstrip": false, "normalized": false, "rstrip": false, "single_word": false, "special": true},
+    "128128": {"content": "<|reserved_special_token_120|>", "lstrip": false, "normalized": false, "rstrip": false, "single_word": false, "special": true},
+    "128129": {"content": "<|reserved_special_token_121|>", "lstrip": false, "normalized": false, "rstrip": false, "single_word": false, "special": true},
+    "128130": {"content": "<|reserved_special_token_122|>", "lstrip": false, "normalized": false, "rstrip": false, "single_word": false, "special": true},
+    "128131": {"content": "<|reserved_special_token_123|>", "lstrip": false, "normalized": false, "rstrip": false, "single_word": false, "special": true},
+    "128132": {"content": "<|reserved_special_token_124|>", "lstrip": false, "normalized": false, "rstrip": false, "single_word": false, "special": true},
+    "128133": {"content": "<|reserved_special_token_125|>", "lstrip": false, "normalized": false, "rstrip": false, "single_word": false, "special": true},
+    "128134": {"content": "<|reserved_special_token_126|>", "lstrip": false, "normalized": false, "rstrip": false, "single_word": false, "special": true},
+    "128135": {"content": "<|reserved_special_token_127|>", "lstrip": false, "normalized": false, "rstrip": false, "single_word": false, "special": true},
+    "128136": {"content": "<|reserved_special_token_128|>", "lstrip": false, "normalized": false, "rstrip": false, "single_word": false, "special": true},
+    "128137": {"content": "<|reserved_special_token_129|>", "lstrip": false, "normalized": false, "rstrip": false, "single_word": false, "special": true},
+    "128138": {"content": "<|reserved_special_token_130|>", "lstrip": false, "normalized": false, "rstrip": false, "single_word": false, "special": true},
+    "128139": {"content": "<|reserved_special_token_131|>", "lstrip": false, "normalized": false, "rstrip": false, "single_word": false, "special": true},
+    "128140": {"content": "<|reserved_special_token_132|>", "lstrip": false, "normalized": false, "rstrip": false, "single_word": false, "special": true},
+    "128141": {"content": "<|reserved_special_token_133|>", "lstrip": false, "normalized": false, "rstrip": false, "single_word": false, "special": true},
+    "128142": {"content": "<|reserved_special_token_134|>", "lstrip": false, "normalized": false, "rstrip": false, "single_word": false, "special": true},
+    "128143": {"content": "<|reserved_special_token_135|>", "lstrip": false, "normalized": false, "rstrip": false, "single_word": false, "special": true},
+    "128144": {"content": "<|reserved_special_token_136|>", "lstrip": false, "normalized": false, "rstrip": false, "single_word": false, "special": true},
+    "128145": {"content": "<|reserved_special_token_137|>", "lstrip": false, "normalized": false, "rstrip": false, "single_word": false, "special": true},
+    "128146": {"content": "<|reserved_special_token_138|>", "lstrip": false, "normalized": false, "rstrip": false, "single_word": false, "special": true},
+    "128147": {"content": "<|reserved_special_token_139|>", "lstrip": false, "normalized": false, "rstrip": false, "single_word": false, "special": true},
+    "128148": {"content": "<|reserved_special_token_140|>", "lstrip": false, "normalized": false, "rstrip": false, "single_word": false, "special": true},
+    "128149": {"content": "<|reserved_special_token_141|>", "lstrip": false, "normalized": false, "rstrip": false, "single_word": false, "special": true},
+    "128150": {"content": "<|reserved_special_token_142|>", "lstrip": false, "normalized": false, "rstrip": false, "single_word": false, "special": true},
+    "128151": {"content": "<|reserved_special_token_143|>", "lstrip": false, "normalized": false, "rstrip": false, "single_word": false, "special": true},
+    "128152": {"content": "<|reserved_special_token_144|>", "lstrip": false, "normalized": false, "rstrip": false, "single_word": false, "special": true},
+    "128153": {"content": "<|reserved_special_token_145|>", "lstrip": false, "normalized": false, "rstrip": false, "single_word": false, "special": true},
+    "128154": {"content": "<|reserved_special_token_146|>", "lstrip": false, "normalized": false, "rstrip": false, "single_word": false, "special": true},
+    "128155": {"content": "<|reserved_special_token_147|>", "lstrip": false, "normalized": false, "rstrip": false, "single_word": false, "special": true},
+    "128156": {"content": "<|reserved_special_token_148|>", "lstrip": false, "normalized": false, "rstrip": false, "single_word": false, "special": true},
+    "128157": {"content": "<|reserved_special_token_149|>", "lstrip": false, "normalized": false, "rstrip": false, "single_word": false, "special": true},
+    "128158": {"content": "<|reserved_special_token_150|>", "lstrip": false, "normalized": false, "rstrip": false, "single_word": false, "special": true},
+    "128159": {"content": "<|reserved_special_token_151|>", "lstrip": false, "normalized": false, "rstrip": false, "single_word": false, "special": true},
+    "128160": {"content": "<|reserved_special_token_152|>", "lstrip": false, "normalized": false, "rstrip": false, "single_word": false, "special": true},
+    "128161": {"content": "<|reserved_special_token_153|>", "lstrip": false, "normalized": false, "rstrip": false, "single_word": false, "special": true},
+    "128162": {"content": "<|reserved_special_token_154|>", "lstrip": false, "normalized": false, "rstrip": false, "single_word": false, "special": true},
+    "128163": {"content": "<|reserved_special_token_155|>", "lstrip": false, "normalized": false, "rstrip": false, "single_word": false, "special": true},
+    "128164": {"content": "<|reserved_special_token_156|>", "lstrip": false, "normalized": false, "rstrip": false, "single_word": false, "special": true},
+    "128165": {"content": "<|reserved_special_token_157|>", "lstrip": false, "normalized": false, "rstrip": false, "single_word": false, "special": true},
+    "128166": {"content": "<|reserved_special_token_158|>", "lstrip": false, "normalized": false, "rstrip": false, "single_word": false, "special": true},
+    "128167": {"content": "<|reserved_special_token_159|>", "lstrip": false, "normalized": false, "rstrip": false, "single_word": false, "special": true},
+    "128168": {"content": "<|reserved_special_token_160|>", "lstrip": false, "normalized": false, "rstrip": false, "single_word": false, "special": true},
+    "128169": {"content": "<|reserved_special_token_161|>", "lstrip": false, "normalized": false, "rstrip": false, "single_word": false, "special": true},
+    "128170": {"content": "<|reserved_special_token_162|>", "lstrip": false, "normalized": false, "rstrip": false, "single_word": false, "special": true},
+    "128171": {"content": "<|reserved_special_token_163|>", "lstrip": false, "normalized": false, "rstrip": false, "single_word": false, "special": true},
+    "128172": {"content": "<|reserved_special_token_164|>", "lstrip": false, "normalized": false, "rstrip": false, "single_word": false, "special": true},
+    "128173": {"content": "<|reserved_special_token_165|>", "lstrip": false, "normalized": false, "rstrip": false, "single_word": false, "special": true},
+    "128174": {"content": "<|reserved_special_token_166|>", "lstrip": false, "normalized": false, "rstrip": false, "single_word": false, "special": true},
+    "128175": {"content": "<|reserved_special_token_167|>", "lstrip": false, "normalized": false, "rstrip": false, "single_word": false, "special": true},
+    "128176": {"content": "<|reserved_special_token_168|>", "lstrip": false, "normalized": false, "rstrip": false, "single_word": false, "special": true},
+    "128177": {"content": "<|reserved_special_token_169|>", "lstrip": false, "normalized": false, "rstrip": false, "single_word": false, "special": true},
+    "128178": {"content": "<|reserved_special_token_170|>", "lstrip": false, "normalized": false, "rstrip": false, "single_word": false, "special": true},
+    "128179": {"content": "<|reserved_special_token_171|>", "lstrip": false, "normalized": false, "rstrip": false, "single_word": false, "special": true},
+    "128180": {"content": "<|reserved_special_token_172|>", "lstrip": false, "normalized": false, "rstrip": false, "single_word": false, "special": true},
+    "128181": {"content": "<|reserved_special_token_173|>", "lstrip": false, "normalized": false, "rstrip": false, "single_word": false, "special": true},
+    "128182": {"content": "<|reserved_special_token_174|>", "lstrip": false, "normalized": false, "rstrip": false, "single_word": false, "special": true},
+    "128183": {"content": "<|reserved_special_token_175|>", "lstrip": false, "normalized": false, "rstrip": false, "single_word": false, "special": true},
+    "128184": {"content": "<|reserved_special_token_176|>", "lstrip": false, "normalized": false, "rstrip": false, "single_word": false, "special": true},
+    "128185": {"content": "<|reserved_special_token_177|>", "lstrip": false, "normalized": false, "rstrip": false, "single_word": false, "special": true},
+    "128186": {"content": "<|reserved_special_token_178|>", "lstrip": false, "normalized": false, "rstrip": false, "single_word": false, "special": true},
+    "128187": {"content": "<|reserved_special_token_179|>", "lstrip": false, "normalized": false, "rstrip": false, "single_word": false, "special": true},
+    "128188": {"content": "<|reserved_special_token_180|>", "lstrip": false, "normalized": false, "rstrip": false, "single_word": false, "special": true},
+    "128189": {"content": "<|reserved_special_token_181|>", "lstrip": false, "normalized": false, "rstrip": false, "single_word": false, "special": true},
+    "128190": {"content": "<|reserved_special_token_182|>", "lstrip": false, "normalized": false, "rstrip": false, "single_word": false, "special": true},
+    "128191": {"content": "<|reserved_special_token_183|>", "lstrip": false, "normalized": false, "rstrip": false, "single_word": false, "special": true},
+    "128192": {"content": "<|reserved_special_token_184|>", "lstrip": false, "normalized": false, "rstrip": false, "single_word": false, "special": true},
+    "128193": {"content": "<|reserved_special_token_185|>", "lstrip": false, "normalized": false, "rstrip": false, "single_word": false, "special": true},
+    "128194": {"content": "<|reserved_special_token_186|>", "lstrip": false, "normalized": false, "rstrip": false, "single_word": false, "special": true},
+    "128195": {"content": "<|reserved_special_token_187|>", "lstrip": false, "normalized": false, "rstrip": false, "single_word": false, "special": true},
+    "128196": {"content": "<|reserved_special_token_188|>", "lstrip": false, "normalized": false, "rstrip": false, "single_word": false, "special": true},
+    "128197": {"content": "<|reserved_special_token_189|>", "lstrip": false, "normalized": false, "rstrip": false, "single_word": false, "special": true},
+    "128198": {"content": "<|reserved_special_token_190|>", "lstrip": false, "normalized": false, "rstrip": false, "single_word": false, "special": true},
+    "128199": {"content": "<|reserved_special_token_191|>", "lstrip": false, "normalized": false, "rstrip": false, "single_word": false, "special": true},
+    "128200": {"content": "<|reserved_special_token_192|>", "lstrip": false, "normalized": false, "rstrip": false, "single_word": false, "special": true},
+    "128201": {"content": "<|reserved_special_token_193|>", "lstrip": false, "normalized": false, "rstrip": false, "single_word": false, "special": true},
+    "128202": {"content": "<|reserved_special_token_194|>", "lstrip": false, "normalized": false, "rstrip": false, "single_word": false, "special": true},
+    "128203": {"content": "<|reserved_special_token_195|>", "lstrip": false, "normalized": false, "rstrip": false, "single_word": false, "special": true},
+    "128204": {"content": "<|reserved_special_token_196|>", "lstrip": false, "normalized": false, "rstrip": false, "single_word": false, "special": true},
+    "128205": {"content": "<|reserved_special_token_197|>", "lstrip": false, "normalized": false, "rstrip": false, "single_word": false, "special": true},
+    "128206": {"content": "<|reserved_special_token_198|>", "lstrip": false, "normalized": false, "rstrip": false, "single_word": false, "special": true},
+    "128207": {"content": "<|reserved_special_token_199|>", "lstrip": false, "normalized": false, "rstrip": false, "single_word": false, "special": true},
+    "128208": {"content": "<|reserved_special_token_200|>", "lstrip": false, "normalized": false, "rstrip": false, "single_word": false, "special": true},
+    "128209": {"content": "<|reserved_special_token_201|>", "lstrip": false, "normalized": false, "rstrip": false, "single_word": false, "special": true},
+    "128210": {"content": "<|reserved_special_token_202|>", "lstrip": false, "normalized": false, "rstrip": false, "single_word": false, "special": true},
+    "128211": {"content": "<|reserved_special_token_203|>", "lstrip": false, "normalized": false, "rstrip": false, "single_word": false, "special": true},
+    "128212": {"content": "<|reserved_special_token_204|>", "lstrip": false, "normalized": false, "rstrip": false, "single_word": false, "special": true},
+    "128213": {"content": "<|reserved_special_token_205|>", "lstrip": false, "normalized": false, "rstrip": false, "single_word": false, "special": true},
+    "128214": {"content": "<|reserved_special_token_206|>", "lstrip": false, "normalized": false, "rstrip": false, "single_word": false, "special": true},
+    "128215": {"content": "<|reserved_special_token_207|>", "lstrip": false, "normalized": false, "rstrip": false, "single_word": false, "special": true},
+    "128216": {"content": "<|reserved_special_token_208|>", "lstrip": false, "normalized": false, "rstrip": false, "single_word": false, "special": true},
+    "128217": {"content": "<|reserved_special_token_209|>", "lstrip": false, "normalized": false, "rstrip": false, "single_word": false, "special": true},
+    "128218": {"content": "<|reserved_special_token_210|>", "lstrip": false, "normalized": false, "rstrip": false, "single_word": false, "special": true},
+    "128219": {"content": "<|reserved_special_token_211|>", "lstrip": false, "normalized": false, "rstrip": false, "single_word": false, "special": true},
+    "128220": {"content": "<|reserved_special_token_212|>", "lstrip": false, "normalized": false, "rstrip": false, "single_word": false, "special": true},
+    "128221": {"content": "<|reserved_special_token_213|>", "lstrip": false, "normalized": false, "rstrip": false, "single_word": false, "special": true},
+    "128222": {"content": "<|reserved_special_token_214|>", "lstrip": false, "normalized": false, "rstrip": false, "single_word": false, "special": true},
+    "128223": {"content": "<|reserved_special_token_215|>", "lstrip": false, "normalized": false, "rstrip": false, "single_word": false, "special": true},
+    "128224": {"content": "<|reserved_special_token_216|>", "lstrip": false, "normalized": false, "rstrip": false, "single_word": false, "special": true},
+    "128225": {"content": "<|reserved_special_token_217|>", "lstrip": false, "normalized": false, "rstrip": false, "single_word": false, "special": true},
+    "128226": {"content": "<|reserved_special_token_218|>", "lstrip": false, "normalized": false, "rstrip": false, "single_word": false, "special": true},
+    "128227": {"content": "<|reserved_special_token_219|>", "lstrip": false, "normalized": false, "rstrip": false, "single_word": false, "special": true},
+    "128228": {"content": "<|reserved_special_token_220|>", "lstrip": false, "normalized": false, "rstrip": false, "single_word": false, "special": true},
+    "128229": {"content": "<|reserved_special_token_221|>", "lstrip": false, "normalized": false, "rstrip": false, "single_word": false, "special": true},
+    "128230": {"content": "<|reserved_special_token_222|>", "lstrip": false, "normalized": false, "rstrip": false, "single_word": false, "special": true},
+    "128231": {"content": "<|reserved_special_token_223|>", "lstrip": false, "normalized": false, "rstrip": false, "single_word": false, "special": true},
+    "128232": {"content": "<|reserved_special_token_224|>", "lstrip": false, "normalized": false, "rstrip": false, "single_word": false, "special": true},
+    "128233": {"content": "<|reserved_special_token_225|>", "lstrip": false, "normalized": false, "rstrip": false, "single_word": false, "special": true},
+    "128234": {"content": "<|reserved_special_token_226|>", "lstrip": false, "normalized": false, "rstrip": false, "single_word": false, "special": true},
+    "128235": {"content": "<|reserved_special_token_227|>", "lstrip": false, "normalized": false, "rstrip": false, "single_word": false, "special": true},
+    "128236": {"content": "<|reserved_special_token_228|>", "lstrip": false, "normalized": false, "rstrip": false, "single_word": false, "special": true},
+    "128237": {"content": "<|reserved_special_token_229|>", "lstrip": false, "normalized": false, "rstrip": false, "single_word": false, "special": true},
+    "128238": {"content": "<|reserved_special_token_230|>", "lstrip": false, "normalized": false, "rstrip": false, "single_word": false, "special": true},
+    "128239": {"content": "<|reserved_special_token_231|>", "lstrip": false, "normalized": false, "rstrip": false, "single_word": false, "special": true},
+    "128240": {"content": "<|reserved_special_token_232|>", "lstrip": false, "normalized": false, "rstrip": false, "single_word": false, "special": true},
+    "128241": {"content": "<|reserved_special_token_233|>", "lstrip": false, "normalized": false, "rstrip": false, "single_word": false, "special": true},
+    "128242": {"content": "<|reserved_special_token_234|>", "lstrip": false, "normalized": false, "rstrip": false, "single_word": false, "special": true},
+    "128243": {"content": "<|reserved_special_token_235|>", "lstrip": false, "normalized": false, "rstrip": false, "single_word": false, "special": true},
+    "128244": {"content": "<|reserved_special_token_236|>", "lstrip": false, "normalized": false, "rstrip": false, "single_word": false, "special": true},
+    "128245": {"content": "<|reserved_special_token_237|>", "lstrip": false, "normalized": false, "rstrip": false, "single_word": false, "special": true},
+    "128246": {"content": "<|reserved_special_token_238|>", "lstrip": false, "normalized": false, "rstrip": false, "single_word": false, "special": true},
+    "128247": {"content": "<|reserved_special_token_239|>", "lstrip": false, "normalized": false, "rstrip": false, "single_word": false, "special": true},
+    "128248": {"content": "<|reserved_special_token_240|>", "lstrip": false, "normalized": false, "rstrip": false, "single_word": false, "special": true},
+    "128249": {"content": "<|reserved_special_token_241|>", "lstrip": false, "normalized": false, "rstrip": false, "single_word": false, "special": true},
+    "128250": {"content": "<|reserved_special_token_242|>", "lstrip": false, "normalized": false, "rstrip": false, "single_word": false, "special": true},
+    "128251": {"content": "<|reserved_special_token_243|>", "lstrip": false, "normalized": false, "rstrip": false, "single_word": false, "special": true},
+    "128252": {"content": "<|reserved_special_token_244|>", "lstrip": false, "normalized": false, "rstrip": false, "single_word": false, "special": true},
+    "128253": {"content": "<|reserved_special_token_245|>", "lstrip": false, "normalized": false, "rstrip": false, "single_word": false, "special": true},
+    "128254": {"content": "<|reserved_special_token_246|>", "lstrip": false, "normalized": false, "rstrip": false, "single_word": false, "special": true},
+    "128255": {"content": "<|reserved_special_token_247|>", "lstrip": false, "normalized": false, "rstrip": false, "single_word": false, "special": true}
+  },
+  "bos_token": "<|begin_of_text|>",
}}\n {{- \"Do not use variables.\\n\\n\" }}\n {%- for t in tools %}\n {{- t | tojson(indent=4) }}\n {{- \"\\n\\n\" }}\n {%- endfor %}\n{%- endif %}\n{{- system_message }}\n{{- \"<|eot_id|>\" }}\n\n{#- Custom tools are passed in a user message with some extra guidance #}\n{%- if tools_in_user_message and not tools is none %}\n {#- Extract the first user message so we can plug it in here #}\n {%- if messages | length != 0 %}\n {%- set first_user_message = messages[0]['content']|trim %}\n {%- set messages = messages[1:] %}\n {%- else %}\n {{- raise_exception(\"Cannot put tools in the first user message when there's no first user message!\") }}\n{%- endif %}\n {{- '<|start_header_id|>user<|end_header_id|>\\n\\n' -}}\n {{- \"Given the following functions, please respond with a JSON for a function call \" }}\n {{- \"with its proper arguments that best answers the given prompt.\\n\\n\" }}\n {{- 'Respond in the format {\"name\": function name, \"parameters\": dictionary of argument name and its value}.' }}\n {{- \"Do not use variables.\\n\\n\" }}\n {%- for t in tools %}\n {{- t | tojson(indent=4) }}\n {{- \"\\n\\n\" }}\n {%- endfor %}\n {{- first_user_message + \"<|eot_id|>\"}}\n{%- endif %}\n\n{%- for message in messages %}\n {%- if not (message.role == 'ipython' or message.role == 'tool' or 'tool_calls' in message) %}\n {{- '<|start_header_id|>' + message['role'] + '<|end_header_id|>\\n\\n'+ message['content'] | trim + '<|eot_id|>' }}\n {%- elif 'tool_calls' in message %}\n {%- if not message.tool_calls|length == 1 %}\n {{- raise_exception(\"This model only supports single tool-calls at once!\") }}\n {%- endif %}\n {%- set tool_call = message.tool_calls[0].function %}\n {{- '<|start_header_id|>assistant<|end_header_id|>\\n\\n' -}}\n {{- '{\"name\": \"' + tool_call.name + '\", ' }}\n {{- '\"parameters\": ' }}\n {{- tool_call.arguments | tojson }}\n {{- \"}\" }}\n {{- \"<|eot_id|>\" }}\n {%- elif message.role == \"tool\" or message.role == \"ipython\" %}\n {{- \"<|start_header_id|>ipython<|end_header_id|>\\n\\n\" }}\n {%- if message.content is mapping or message.content is iterable %}\n {{- message.content | tojson }}\n {%- else %}\n {{- message.content }}\n {%- endif %}\n {{- \"<|eot_id|>\" }}\n {%- endif %}\n{%- endfor %}\n{%- if add_generation_prompt %}\n {{- '<|start_header_id|>assistant<|end_header_id|>\\n\\n' }}\n{%- endif %}\n", + "clean_up_tokenization_spaces": true, + "eos_token": "<|eot_id|>", + "extra_special_tokens": {}, + "model_input_names": [ + "input_ids", + "attention_mask" + ], + "model_max_length": 131072, + "pad_token": "<|eot_id|>", + "tokenizer_class": "PreTrainedTokenizerFast" +} diff --git a/checkpoint-10800/trainer_state.json b/checkpoint-10800/trainer_state.json new file mode 100644 index 0000000000000000000000000000000000000000..b91e5bc29f4edb66ae7b4dd8746136c84284a07e --- /dev/null +++ b/checkpoint-10800/trainer_state.json @@ -0,0 +1,1066 @@ +{ + "best_metric": null, + "best_model_checkpoint": null, + "epoch": 0.1874967448481797, + "eval_steps": 1000, + "global_step": 10800, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "epoch": 1.736080970816479e-05, + "grad_norm": 10.5625, + "learning_rate": 2e-06, + "loss": 1.0, + "step": 1 + }, + { + "epoch": 0.001736080970816479, + "grad_norm": 0.1513671875, + "learning_rate": 0.0002, + "loss": 0.3311, + "step": 100 + }, + { + "epoch": 0.003472161941632958, + "grad_norm": 0.16796875, + "learning_rate": 0.0004, + "loss": 0.2169, + "step": 200 + }, + { + 
"epoch": 0.005208242912449436, + "grad_norm": 0.10400390625, + "learning_rate": 0.0006, + "loss": 0.2032, + "step": 300 + }, + { + "epoch": 0.006944323883265916, + "grad_norm": 0.11279296875, + "learning_rate": 0.0008, + "loss": 0.188, + "step": 400 + }, + { + "epoch": 0.008680404854082394, + "grad_norm": 0.10107421875, + "learning_rate": 0.001, + "loss": 0.1758, + "step": 500 + }, + { + "epoch": 0.010416485824898873, + "grad_norm": 0.09521484375, + "learning_rate": 0.0012, + "loss": 0.1637, + "step": 600 + }, + { + "epoch": 0.012152566795715351, + "grad_norm": 0.08154296875, + "learning_rate": 0.0014, + "loss": 0.1518, + "step": 700 + }, + { + "epoch": 0.013888647766531832, + "grad_norm": 0.08642578125, + "learning_rate": 0.0016, + "loss": 0.1485, + "step": 800 + }, + { + "epoch": 0.01562472873734831, + "grad_norm": 0.1044921875, + "learning_rate": 0.0018000000000000002, + "loss": 0.1433, + "step": 900 + }, + { + "epoch": 0.01736080970816479, + "grad_norm": 0.05419921875, + "learning_rate": 0.002, + "loss": 0.139, + "step": 1000 + }, + { + "epoch": 0.01736080970816479, + "eval_covost2-en-de_loss": 1.896493673324585, + "eval_covost2-en-de_model_preparation_time": 0.0057, + "eval_covost2-en-de_runtime": 9.8697, + "eval_covost2-en-de_samples_per_second": 6.485, + "eval_covost2-en-de_steps_per_second": 0.811, + "step": 1000 + }, + { + "epoch": 0.01736080970816479, + "eval_covost2-zh-en_loss": 3.1452860832214355, + "eval_covost2-zh-en_model_preparation_time": 0.0057, + "eval_covost2-zh-en_runtime": 8.3732, + "eval_covost2-zh-en_samples_per_second": 7.643, + "eval_covost2-zh-en_steps_per_second": 0.955, + "step": 1000 + }, + { + "epoch": 0.01736080970816479, + "eval_peoplespeech-clean-transcription_loss": 3.2206106185913086, + "eval_peoplespeech-clean-transcription_model_preparation_time": 0.0057, + "eval_peoplespeech-clean-transcription_runtime": 9.6941, + "eval_peoplespeech-clean-transcription_samples_per_second": 6.602, + "eval_peoplespeech-clean-transcription_steps_per_second": 0.825, + "step": 1000 + }, + { + "epoch": 0.01909689067898127, + "grad_norm": 0.059814453125, + "learning_rate": 0.001999725185109816, + "loss": 0.1334, + "step": 1100 + }, + { + "epoch": 0.020832971649797746, + "grad_norm": 0.07373046875, + "learning_rate": 0.0019989008914857113, + "loss": 0.1288, + "step": 1200 + }, + { + "epoch": 0.022569052620614226, + "grad_norm": 0.049560546875, + "learning_rate": 0.00199752757218401, + "loss": 0.1262, + "step": 1300 + }, + { + "epoch": 0.024305133591430703, + "grad_norm": 0.0517578125, + "learning_rate": 0.001995605982021898, + "loss": 0.1222, + "step": 1400 + }, + { + "epoch": 0.026041214562247183, + "grad_norm": 0.058349609375, + "learning_rate": 0.0019931371771625545, + "loss": 0.1193, + "step": 1500 + }, + { + "epoch": 0.027777295533063663, + "grad_norm": 0.0498046875, + "learning_rate": 0.001990122514534651, + "loss": 0.1196, + "step": 1600 + }, + { + "epoch": 0.02951337650388014, + "grad_norm": 0.05517578125, + "learning_rate": 0.0019865636510865464, + "loss": 0.115, + "step": 1700 + }, + { + "epoch": 0.03124945747469662, + "grad_norm": 0.044677734375, + "learning_rate": 0.001982462542875576, + "loss": 0.115, + "step": 1800 + }, + { + "epoch": 0.0329855384455131, + "grad_norm": 0.05419921875, + "learning_rate": 0.001977821443992945, + "loss": 0.1125, + "step": 1900 + }, + { + "epoch": 0.03472161941632958, + "grad_norm": 0.047119140625, + "learning_rate": 0.001972642905324813, + "loss": 0.1094, + "step": 2000 + }, + { + "epoch": 0.03472161941632958, + 
"eval_covost2-en-de_loss": 1.6700351238250732, + "eval_covost2-en-de_model_preparation_time": 0.0057, + "eval_covost2-en-de_runtime": 8.1279, + "eval_covost2-en-de_samples_per_second": 7.874, + "eval_covost2-en-de_steps_per_second": 0.984, + "step": 2000 + }, + { + "epoch": 0.03472161941632958, + "eval_covost2-zh-en_loss": 3.093877077102661, + "eval_covost2-zh-en_model_preparation_time": 0.0057, + "eval_covost2-zh-en_runtime": 8.1488, + "eval_covost2-zh-en_samples_per_second": 7.854, + "eval_covost2-zh-en_steps_per_second": 0.982, + "step": 2000 + }, + { + "epoch": 0.03472161941632958, + "eval_peoplespeech-clean-transcription_loss": 2.478968620300293, + "eval_peoplespeech-clean-transcription_model_preparation_time": 0.0057, + "eval_peoplespeech-clean-transcription_runtime": 9.5507, + "eval_peoplespeech-clean-transcription_samples_per_second": 6.701, + "eval_peoplespeech-clean-transcription_steps_per_second": 0.838, + "step": 2000 + }, + { + "epoch": 0.036457700387146054, + "grad_norm": 0.048583984375, + "learning_rate": 0.0019669297731502505, + "loss": 0.1077, + "step": 2100 + }, + { + "epoch": 0.03819378135796254, + "grad_norm": 0.054443359375, + "learning_rate": 0.00196068518757684, + "loss": 0.1069, + "step": 2200 + }, + { + "epoch": 0.039929862328779014, + "grad_norm": 0.047119140625, + "learning_rate": 0.001953912580814779, + "loss": 0.1043, + "step": 2300 + }, + { + "epoch": 0.04166594329959549, + "grad_norm": 0.044921875, + "learning_rate": 0.0019466156752904343, + "loss": 0.1035, + "step": 2400 + }, + { + "epoch": 0.043402024270411975, + "grad_norm": 0.050537109375, + "learning_rate": 0.0019387984816003866, + "loss": 0.1033, + "step": 2500 + }, + { + "epoch": 0.04513810524122845, + "grad_norm": 0.056396484375, + "learning_rate": 0.0019304652963070869, + "loss": 0.102, + "step": 2600 + }, + { + "epoch": 0.04687418621204493, + "grad_norm": 0.046875, + "learning_rate": 0.0019216206995773372, + "loss": 0.0998, + "step": 2700 + }, + { + "epoch": 0.048610267182861405, + "grad_norm": 0.042236328125, + "learning_rate": 0.0019122695526648968, + "loss": 0.1002, + "step": 2800 + }, + { + "epoch": 0.05034634815367789, + "grad_norm": 0.04638671875, + "learning_rate": 0.0019024169952385887, + "loss": 0.0978, + "step": 2900 + }, + { + "epoch": 0.052082429124494366, + "grad_norm": 0.05126953125, + "learning_rate": 0.0018920684425573864, + "loss": 0.097, + "step": 3000 + }, + { + "epoch": 0.052082429124494366, + "eval_covost2-en-de_loss": 1.749150276184082, + "eval_covost2-en-de_model_preparation_time": 0.0057, + "eval_covost2-en-de_runtime": 8.1948, + "eval_covost2-en-de_samples_per_second": 7.81, + "eval_covost2-en-de_steps_per_second": 0.976, + "step": 3000 + }, + { + "epoch": 0.052082429124494366, + "eval_covost2-zh-en_loss": 3.198117971420288, + "eval_covost2-zh-en_model_preparation_time": 0.0057, + "eval_covost2-zh-en_runtime": 8.1979, + "eval_covost2-zh-en_samples_per_second": 7.807, + "eval_covost2-zh-en_steps_per_second": 0.976, + "step": 3000 + }, + { + "epoch": 0.052082429124494366, + "eval_peoplespeech-clean-transcription_loss": 2.345036506652832, + "eval_peoplespeech-clean-transcription_model_preparation_time": 0.0057, + "eval_peoplespeech-clean-transcription_runtime": 11.4402, + "eval_peoplespeech-clean-transcription_samples_per_second": 5.594, + "eval_peoplespeech-clean-transcription_steps_per_second": 0.699, + "step": 3000 + }, + { + "epoch": 0.05381851009531084, + "grad_norm": 0.06494140625, + "learning_rate": 0.0018812295824940284, + "loss": 0.0955, + "step": 3100 + }, + { + 
"epoch": 0.055554591066127326, + "grad_norm": 0.044677734375, + "learning_rate": 0.0018699063724087904, + "loss": 0.0951, + "step": 3200 + }, + { + "epoch": 0.0572906720369438, + "grad_norm": 0.0390625, + "learning_rate": 0.0018581050358751443, + "loss": 0.0947, + "step": 3300 + }, + { + "epoch": 0.05902675300776028, + "grad_norm": 0.056396484375, + "learning_rate": 0.0018458320592590974, + "loss": 0.0939, + "step": 3400 + }, + { + "epoch": 0.060762833978576763, + "grad_norm": 0.047119140625, + "learning_rate": 0.0018330941881540914, + "loss": 0.0941, + "step": 3500 + }, + { + "epoch": 0.06249891494939324, + "grad_norm": 0.046630859375, + "learning_rate": 0.0018198984236734246, + "loss": 0.0927, + "step": 3600 + }, + { + "epoch": 0.06423499592020972, + "grad_norm": 0.055419921875, + "learning_rate": 0.0018062520186022297, + "loss": 0.0948, + "step": 3700 + }, + { + "epoch": 0.0659710768910262, + "grad_norm": 0.046142578125, + "learning_rate": 0.0017921624734111292, + "loss": 0.09, + "step": 3800 + }, + { + "epoch": 0.06770715786184267, + "grad_norm": 0.04736328125, + "learning_rate": 0.001777637532133752, + "loss": 0.0926, + "step": 3900 + }, + { + "epoch": 0.06944323883265915, + "grad_norm": 0.048828125, + "learning_rate": 0.0017626851781103819, + "loss": 0.0906, + "step": 4000 + }, + { + "epoch": 0.06944323883265915, + "eval_covost2-en-de_loss": 1.7936017513275146, + "eval_covost2-en-de_model_preparation_time": 0.0057, + "eval_covost2-en-de_runtime": 8.0356, + "eval_covost2-en-de_samples_per_second": 7.965, + "eval_covost2-en-de_steps_per_second": 0.996, + "step": 4000 + }, + { + "epoch": 0.06944323883265915, + "eval_covost2-zh-en_loss": 3.2699265480041504, + "eval_covost2-zh-en_model_preparation_time": 0.0057, + "eval_covost2-zh-en_runtime": 9.5779, + "eval_covost2-zh-en_samples_per_second": 6.682, + "eval_covost2-zh-en_steps_per_second": 0.835, + "step": 4000 + }, + { + "epoch": 0.06944323883265915, + "eval_peoplespeech-clean-transcription_loss": 2.3380110263824463, + "eval_peoplespeech-clean-transcription_model_preparation_time": 0.0057, + "eval_peoplespeech-clean-transcription_runtime": 9.5943, + "eval_peoplespeech-clean-transcription_samples_per_second": 6.671, + "eval_peoplespeech-clean-transcription_steps_per_second": 0.834, + "step": 4000 + }, + { + "epoch": 0.07117931980347564, + "grad_norm": 0.041259765625, + "learning_rate": 0.001747313629600077, + "loss": 0.0926, + "step": 4100 + }, + { + "epoch": 0.07291540077429211, + "grad_norm": 0.05322265625, + "learning_rate": 0.001731531335263669, + "loss": 0.0907, + "step": 4200 + }, + { + "epoch": 0.07465148174510859, + "grad_norm": 0.05126953125, + "learning_rate": 0.0017153469695201276, + "loss": 0.0898, + "step": 4300 + }, + { + "epoch": 0.07638756271592508, + "grad_norm": 0.061767578125, + "learning_rate": 0.0016987694277788418, + "loss": 0.0876, + "step": 4400 + }, + { + "epoch": 0.07812364368674155, + "grad_norm": 0.042724609375, + "learning_rate": 0.001681807821550438, + "loss": 0.0874, + "step": 4500 + }, + { + "epoch": 0.07985972465755803, + "grad_norm": 0.05126953125, + "learning_rate": 0.0016644714734388218, + "loss": 0.0865, + "step": 4600 + }, + { + "epoch": 0.08159580562837451, + "grad_norm": 0.042724609375, + "learning_rate": 0.0016467699120171987, + "loss": 0.0866, + "step": 4700 + }, + { + "epoch": 0.08333188659919098, + "grad_norm": 0.0419921875, + "learning_rate": 0.001628712866590885, + "loss": 0.0864, + "step": 4800 + }, + { + "epoch": 0.08506796757000747, + "grad_norm": 0.051513671875, + "learning_rate": 
0.0016103102618497923, + "loss": 0.0862, + "step": 4900 + }, + { + "epoch": 0.08680404854082395, + "grad_norm": 0.052734375, + "learning_rate": 0.0015915722124135226, + "loss": 0.0855, + "step": 5000 + }, + { + "epoch": 0.08680404854082395, + "eval_covost2-en-de_loss": 1.7862941026687622, + "eval_covost2-en-de_model_preparation_time": 0.0057, + "eval_covost2-en-de_runtime": 8.2861, + "eval_covost2-en-de_samples_per_second": 7.724, + "eval_covost2-en-de_steps_per_second": 0.965, + "step": 5000 + }, + { + "epoch": 0.08680404854082395, + "eval_covost2-zh-en_loss": 3.33290433883667, + "eval_covost2-zh-en_model_preparation_time": 0.0057, + "eval_covost2-zh-en_runtime": 8.4063, + "eval_covost2-zh-en_samples_per_second": 7.613, + "eval_covost2-zh-en_steps_per_second": 0.952, + "step": 5000 + }, + { + "epoch": 0.08680404854082395, + "eval_peoplespeech-clean-transcription_loss": 2.2601113319396973, + "eval_peoplespeech-clean-transcription_model_preparation_time": 0.0057, + "eval_peoplespeech-clean-transcription_runtime": 9.4946, + "eval_peoplespeech-clean-transcription_samples_per_second": 6.741, + "eval_peoplespeech-clean-transcription_steps_per_second": 0.843, + "step": 5000 + }, + { + "epoch": 0.08854012951164042, + "grad_norm": 0.053466796875, + "learning_rate": 0.001572509017272072, + "loss": 0.0872, + "step": 5100 + }, + { + "epoch": 0.0902762104824569, + "grad_norm": 0.044189453125, + "learning_rate": 0.0015531311541251993, + "loss": 0.0859, + "step": 5200 + }, + { + "epoch": 0.09201229145327339, + "grad_norm": 0.052978515625, + "learning_rate": 0.0015334492736235703, + "loss": 0.085, + "step": 5300 + }, + { + "epoch": 0.09374837242408986, + "grad_norm": 0.04833984375, + "learning_rate": 0.0015134741935148419, + "loss": 0.0844, + "step": 5400 + }, + { + "epoch": 0.09548445339490634, + "grad_norm": 0.047119140625, + "learning_rate": 0.0014932168926979072, + "loss": 0.0844, + "step": 5500 + }, + { + "epoch": 0.09722053436572281, + "grad_norm": 0.05029296875, + "learning_rate": 0.0014726885051885652, + "loss": 0.0856, + "step": 5600 + }, + { + "epoch": 0.0989566153365393, + "grad_norm": 0.049560546875, + "learning_rate": 0.0014519003139999338, + "loss": 0.0841, + "step": 5700 + }, + { + "epoch": 0.10069269630735578, + "grad_norm": 0.056884765625, + "learning_rate": 0.0014308637449409706, + "loss": 0.0841, + "step": 5800 + }, + { + "epoch": 0.10242877727817225, + "grad_norm": 0.041015625, + "learning_rate": 0.0014095903603365066, + "loss": 0.0825, + "step": 5900 + }, + { + "epoch": 0.10416485824898873, + "grad_norm": 0.048583984375, + "learning_rate": 0.0013880918526722496, + "loss": 0.0828, + "step": 6000 + }, + { + "epoch": 0.10416485824898873, + "eval_covost2-en-de_loss": 1.8097732067108154, + "eval_covost2-en-de_model_preparation_time": 0.0057, + "eval_covost2-en-de_runtime": 8.2052, + "eval_covost2-en-de_samples_per_second": 7.8, + "eval_covost2-en-de_steps_per_second": 0.975, + "step": 6000 + }, + { + "epoch": 0.10416485824898873, + "eval_covost2-zh-en_loss": 3.331326961517334, + "eval_covost2-zh-en_model_preparation_time": 0.0057, + "eval_covost2-zh-en_runtime": 8.2653, + "eval_covost2-zh-en_samples_per_second": 7.743, + "eval_covost2-zh-en_steps_per_second": 0.968, + "step": 6000 + }, + { + "epoch": 0.10416485824898873, + "eval_peoplespeech-clean-transcription_loss": 2.250232219696045, + "eval_peoplespeech-clean-transcription_model_preparation_time": 0.0057, + "eval_peoplespeech-clean-transcription_runtime": 9.4708, + "eval_peoplespeech-clean-transcription_samples_per_second": 6.758, + 
"eval_peoplespeech-clean-transcription_steps_per_second": 0.845, + "step": 6000 + }, + { + "epoch": 0.10590093921980522, + "grad_norm": 0.04443359375, + "learning_rate": 0.0013663800381682463, + "loss": 0.0819, + "step": 6100 + }, + { + "epoch": 0.10763702019062169, + "grad_norm": 0.05419921875, + "learning_rate": 0.0013444668502843329, + "loss": 0.08, + "step": 6200 + }, + { + "epoch": 0.10937310116143817, + "grad_norm": 0.0478515625, + "learning_rate": 0.0013223643331611537, + "loss": 0.0805, + "step": 6300 + }, + { + "epoch": 0.11110918213225465, + "grad_norm": 0.051513671875, + "learning_rate": 0.001300084635000341, + "loss": 0.0799, + "step": 6400 + }, + { + "epoch": 0.11284526310307112, + "grad_norm": 0.0498046875, + "learning_rate": 0.0012776400013875004, + "loss": 0.0807, + "step": 6500 + }, + { + "epoch": 0.1145813440738876, + "grad_norm": 0.050537109375, + "learning_rate": 0.0012550427685616766, + "loss": 0.0799, + "step": 6600 + }, + { + "epoch": 0.11631742504470409, + "grad_norm": 0.05029296875, + "learning_rate": 0.0012323053566349834, + "loss": 0.0802, + "step": 6700 + }, + { + "epoch": 0.11805350601552056, + "grad_norm": 0.047119140625, + "learning_rate": 0.0012094402627661448, + "loss": 0.0796, + "step": 6800 + }, + { + "epoch": 0.11978958698633704, + "grad_norm": 0.044677734375, + "learning_rate": 0.0011864600542916813, + "loss": 0.0784, + "step": 6900 + }, + { + "epoch": 0.12152566795715353, + "grad_norm": 0.0478515625, + "learning_rate": 0.0011633773618185302, + "loss": 0.0808, + "step": 7000 + }, + { + "epoch": 0.12152566795715353, + "eval_covost2-en-de_loss": 1.7786378860473633, + "eval_covost2-en-de_model_preparation_time": 0.0057, + "eval_covost2-en-de_runtime": 8.0291, + "eval_covost2-en-de_samples_per_second": 7.971, + "eval_covost2-en-de_steps_per_second": 0.996, + "step": 7000 + }, + { + "epoch": 0.12152566795715353, + "eval_covost2-zh-en_loss": 3.273571252822876, + "eval_covost2-zh-en_model_preparation_time": 0.0057, + "eval_covost2-zh-en_runtime": 8.3234, + "eval_covost2-zh-en_samples_per_second": 7.689, + "eval_covost2-zh-en_steps_per_second": 0.961, + "step": 7000 + }, + { + "epoch": 0.12152566795715353, + "eval_peoplespeech-clean-transcription_loss": 2.2290830612182617, + "eval_peoplespeech-clean-transcription_model_preparation_time": 0.0057, + "eval_peoplespeech-clean-transcription_runtime": 9.7693, + "eval_peoplespeech-clean-transcription_samples_per_second": 6.551, + "eval_peoplespeech-clean-transcription_steps_per_second": 0.819, + "step": 7000 + }, + { + "epoch": 0.12326174892797, + "grad_norm": 0.0478515625, + "learning_rate": 0.0011402048722818862, + "loss": 0.0786, + "step": 7100 + }, + { + "epoch": 0.12499782989878648, + "grad_norm": 0.049560546875, + "learning_rate": 0.0011169553219720827, + "loss": 0.0795, + "step": 7200 + }, + { + "epoch": 0.12673391086960295, + "grad_norm": 0.04736328125, + "learning_rate": 0.001093641489534351, + "loss": 0.0787, + "step": 7300 + }, + { + "epoch": 0.12846999184041943, + "grad_norm": 0.054931640625, + "learning_rate": 0.001070276188945293, + "loss": 0.0784, + "step": 7400 + }, + { + "epoch": 0.13020607281123592, + "grad_norm": 0.0478515625, + "learning_rate": 0.00104687226246994, + "loss": 0.0787, + "step": 7500 + }, + { + "epoch": 0.1319421537820524, + "grad_norm": 0.048828125, + "learning_rate": 0.0010234425736032607, + "loss": 0.0788, + "step": 7600 + }, + { + "epoch": 0.13367823475286889, + "grad_norm": 0.058837890625, + "learning_rate": 0.001, + "loss": 0.0769, + "step": 7700 + }, + { + "epoch": 
0.13541431572368534, + "grad_norm": 0.055908203125, + "learning_rate": 0.0009765574263967396, + "loss": 0.077, + "step": 7800 + }, + { + "epoch": 0.13715039669450182, + "grad_norm": 0.05322265625, + "learning_rate": 0.0009531277375300599, + "loss": 0.0764, + "step": 7900 + }, + { + "epoch": 0.1388864776653183, + "grad_norm": 0.04833984375, + "learning_rate": 0.0009297238110547074, + "loss": 0.0764, + "step": 8000 + }, + { + "epoch": 0.1388864776653183, + "eval_covost2-en-de_loss": 1.7951624393463135, + "eval_covost2-en-de_model_preparation_time": 0.0057, + "eval_covost2-en-de_runtime": 8.1477, + "eval_covost2-en-de_samples_per_second": 7.855, + "eval_covost2-en-de_steps_per_second": 0.982, + "step": 8000 + }, + { + "epoch": 0.1388864776653183, + "eval_covost2-zh-en_loss": 3.301699161529541, + "eval_covost2-zh-en_model_preparation_time": 0.0057, + "eval_covost2-zh-en_runtime": 8.8691, + "eval_covost2-zh-en_samples_per_second": 7.216, + "eval_covost2-zh-en_steps_per_second": 0.902, + "step": 8000 + }, + { + "epoch": 0.1388864776653183, + "eval_peoplespeech-clean-transcription_loss": 2.1518499851226807, + "eval_peoplespeech-clean-transcription_model_preparation_time": 0.0057, + "eval_peoplespeech-clean-transcription_runtime": 9.5239, + "eval_peoplespeech-clean-transcription_samples_per_second": 6.72, + "eval_peoplespeech-clean-transcription_steps_per_second": 0.84, + "step": 8000 + }, + { + "epoch": 0.1406225586361348, + "grad_norm": 0.053955078125, + "learning_rate": 0.0009063585104656494, + "loss": 0.0762, + "step": 8100 + }, + { + "epoch": 0.14235863960695128, + "grad_norm": 0.0654296875, + "learning_rate": 0.0008830446780279176, + "loss": 0.0769, + "step": 8200 + }, + { + "epoch": 0.14409472057776776, + "grad_norm": 0.046875, + "learning_rate": 0.0008597951277181142, + "loss": 0.0751, + "step": 8300 + }, + { + "epoch": 0.14583080154858422, + "grad_norm": 0.054443359375, + "learning_rate": 0.0008366226381814697, + "loss": 0.0765, + "step": 8400 + }, + { + "epoch": 0.1475668825194007, + "grad_norm": 0.052490234375, + "learning_rate": 0.000813539945708319, + "loss": 0.0763, + "step": 8500 + }, + { + "epoch": 0.14930296349021718, + "grad_norm": 0.068359375, + "learning_rate": 0.0007905597372338558, + "loss": 0.0744, + "step": 8600 + }, + { + "epoch": 0.15103904446103367, + "grad_norm": 0.055908203125, + "learning_rate": 0.0007676946433650169, + "loss": 0.0737, + "step": 8700 + }, + { + "epoch": 0.15277512543185015, + "grad_norm": 0.055908203125, + "learning_rate": 0.0007449572314383236, + "loss": 0.0758, + "step": 8800 + }, + { + "epoch": 0.1545112064026666, + "grad_norm": 0.051513671875, + "learning_rate": 0.0007223599986124993, + "loss": 0.0753, + "step": 8900 + }, + { + "epoch": 0.1562472873734831, + "grad_norm": 0.05517578125, + "learning_rate": 0.0006999153649996595, + "loss": 0.0736, + "step": 9000 + }, + { + "epoch": 0.1562472873734831, + "eval_covost2-en-de_loss": 1.7736568450927734, + "eval_covost2-en-de_model_preparation_time": 0.0057, + "eval_covost2-en-de_runtime": 8.2798, + "eval_covost2-en-de_samples_per_second": 7.73, + "eval_covost2-en-de_steps_per_second": 0.966, + "step": 9000 + }, + { + "epoch": 0.1562472873734831, + "eval_covost2-zh-en_loss": 3.2736916542053223, + "eval_covost2-zh-en_model_preparation_time": 0.0057, + "eval_covost2-zh-en_runtime": 8.6328, + "eval_covost2-zh-en_samples_per_second": 7.414, + "eval_covost2-zh-en_steps_per_second": 0.927, + "step": 9000 + }, + { + "epoch": 0.1562472873734831, + "eval_peoplespeech-clean-transcription_loss": 2.169971227645874, + 
"eval_peoplespeech-clean-transcription_model_preparation_time": 0.0057, + "eval_peoplespeech-clean-transcription_runtime": 10.7684, + "eval_peoplespeech-clean-transcription_samples_per_second": 5.943, + "eval_peoplespeech-clean-transcription_steps_per_second": 0.743, + "step": 9000 + }, + { + "epoch": 0.15798336834429957, + "grad_norm": 0.06201171875, + "learning_rate": 0.0006776356668388464, + "loss": 0.073, + "step": 9100 + }, + { + "epoch": 0.15971944931511606, + "grad_norm": 0.0654296875, + "learning_rate": 0.0006555331497156671, + "loss": 0.0753, + "step": 9200 + }, + { + "epoch": 0.16145553028593254, + "grad_norm": 0.056640625, + "learning_rate": 0.0006336199618317538, + "loss": 0.0754, + "step": 9300 + }, + { + "epoch": 0.16319161125674903, + "grad_norm": 0.06005859375, + "learning_rate": 0.0006119081473277501, + "loss": 0.0736, + "step": 9400 + }, + { + "epoch": 0.16492769222756548, + "grad_norm": 0.059326171875, + "learning_rate": 0.0005904096396634935, + "loss": 0.0721, + "step": 9500 + }, + { + "epoch": 0.16666377319838196, + "grad_norm": 0.06201171875, + "learning_rate": 0.0005691362550590297, + "loss": 0.0717, + "step": 9600 + }, + { + "epoch": 0.16839985416919845, + "grad_norm": 0.051025390625, + "learning_rate": 0.0005480996860000663, + "loss": 0.0738, + "step": 9700 + }, + { + "epoch": 0.17013593514001493, + "grad_norm": 0.06494140625, + "learning_rate": 0.0005273114948114346, + "loss": 0.0737, + "step": 9800 + }, + { + "epoch": 0.17187201611083142, + "grad_norm": 0.058837890625, + "learning_rate": 0.0005067831073020928, + "loss": 0.0711, + "step": 9900 + }, + { + "epoch": 0.1736080970816479, + "grad_norm": 0.050537109375, + "learning_rate": 0.00048652580648515787, + "loss": 0.0722, + "step": 10000 + }, + { + "epoch": 0.1736080970816479, + "eval_covost2-en-de_loss": 1.768043875694275, + "eval_covost2-en-de_model_preparation_time": 0.0057, + "eval_covost2-en-de_runtime": 8.2025, + "eval_covost2-en-de_samples_per_second": 7.802, + "eval_covost2-en-de_steps_per_second": 0.975, + "step": 10000 + }, + { + "epoch": 0.1736080970816479, + "eval_covost2-zh-en_loss": 3.288457155227661, + "eval_covost2-zh-en_model_preparation_time": 0.0057, + "eval_covost2-zh-en_runtime": 8.5315, + "eval_covost2-zh-en_samples_per_second": 7.502, + "eval_covost2-zh-en_steps_per_second": 0.938, + "step": 10000 + }, + { + "epoch": 0.1736080970816479, + "eval_peoplespeech-clean-transcription_loss": 2.099651336669922, + "eval_peoplespeech-clean-transcription_model_preparation_time": 0.0057, + "eval_peoplespeech-clean-transcription_runtime": 9.7081, + "eval_peoplespeech-clean-transcription_samples_per_second": 6.592, + "eval_peoplespeech-clean-transcription_steps_per_second": 0.824, + "step": 10000 + }, + { + "epoch": 0.17534417805246436, + "grad_norm": 0.059326171875, + "learning_rate": 0.0004665507263764299, + "loss": 0.0717, + "step": 10100 + }, + { + "epoch": 0.17708025902328084, + "grad_norm": 0.06103515625, + "learning_rate": 0.0004468688458748006, + "loss": 0.0729, + "step": 10200 + }, + { + "epoch": 0.17881633999409732, + "grad_norm": 0.051513671875, + "learning_rate": 0.0004274909827279283, + "loss": 0.0711, + "step": 10300 + }, + { + "epoch": 0.1805524209649138, + "grad_norm": 0.059326171875, + "learning_rate": 0.0004084277875864776, + "loss": 0.0712, + "step": 10400 + }, + { + "epoch": 0.1822885019357303, + "grad_norm": 0.062255859375, + "learning_rate": 0.00038968973815020803, + "loss": 0.0708, + "step": 10500 + }, + { + "epoch": 0.18402458290654677, + "grad_norm": 0.06640625, + "learning_rate": 
0.00037128713340911534, + "loss": 0.0716, + "step": 10600 + }, + { + "epoch": 0.18576066387736323, + "grad_norm": 0.0517578125, + "learning_rate": 0.00035323008798280133, + "loss": 0.0728, + "step": 10700 + }, + { + "epoch": 0.1874967448481797, + "grad_norm": 0.076171875, + "learning_rate": 0.00033552852656117837, + "loss": 0.0711, + "step": 10800 + } + ], + "logging_steps": 100, + "max_steps": 14400, + "num_input_tokens_seen": 0, + "num_train_epochs": 1, + "save_steps": 3600, + "stateful_callbacks": { + "TrainerControl": { + "args": { + "should_epoch_stop": false, + "should_evaluate": false, + "should_log": false, + "should_save": true, + "should_training_stop": false + }, + "attributes": {} + } + }, + "total_flos": 4.964486494302536e+17, + "train_batch_size": 24, + "trial_name": null, + "trial_params": null +} diff --git a/checkpoint-10800/training_args.bin b/checkpoint-10800/training_args.bin new file mode 100644 index 0000000000000000000000000000000000000000..83b5b195c1720ea55bc992756c0ab6f1e2ef4671 --- /dev/null +++ b/checkpoint-10800/training_args.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:8c1fd9b1621955a41f9e58d5a3e2b2d6a70bd41f5404ebfa5cb0ca999c290090 +size 5688 diff --git a/checkpoint-10800/ultravox_config.py b/checkpoint-10800/ultravox_config.py new file mode 100644 index 0000000000000000000000000000000000000000..3671250d6b6a69604473ed1a526484c8c9a77f68 --- /dev/null +++ b/checkpoint-10800/ultravox_config.py @@ -0,0 +1,170 @@ +import dataclasses +from enum import Enum +from typing import Any, Dict, List, Optional + +import transformers + + +@dataclasses.dataclass +class LoraConfigSimplified: + """ + Low-Rank Adaptation (LoRA) configuration. + + Used for language and audio models separately. + """ + + # The rank of the adaptation + r: int = 0 + lora_alpha: float = 8 + target_modules: Optional[List[str]] = dataclasses.field( + default_factory=lambda: ["k_proj", "q_proj", "linear_k", "linear_q"] + ) + + +class LossFunction(str, Enum): + CrossEntropy = "ce" + KL_Divergence = "kl" + + +@dataclasses.dataclass +class LossConfig: + loss_function: LossFunction = LossFunction.KL_Divergence + kl_temperature: float = 2.0 + + @property + def requires_alt_fields(self): + return self.loss_function == LossFunction.KL_Divergence + + +class UltravoxConfig(transformers.PretrainedConfig): + r""" + This is the configuration class to store the configuration of a [`UltravoxForConditionalGeneration`]. It is used to instantiate an + Ultravox model according to the specified arguments, defining the model architecture. + + Configuration objects inherit from [`PretrainedConfig`] and can be used to control the model outputs. Read the + documentation from [`PretrainedConfig`] for more information. + + Args: + audio_config (`Wav2Vec2Config`, *optional*): + Custom audio config or dict + text_config (`Union[AutoConfig, dict]`, *optional*): + The config object of the text backbone. Can be any of `LlamaConfig` or `MistralConfig`. + ignore_index (`int`, *optional*, defaults to -100): + The ignore index for the loss function. + audio_token_index (`int`, *optional*, defaults to 32000): + The audio token index to encode the audio prompt. + stack_factor (`int`, *optional*, defaults to 8): + Audio downsampling factor for the multimodal projector. + norm_init (`float`, *optional*, defaults to 0.4): + The initialization value for the layer normalization. + projector_act (`str`, *optional*, defaults to `"swiglu"`): + The activation function used by the multimodal projector.
+ text_model_lora_config (`LoraConfigSimplified`, *optional*): + The LoRA configuration for finetuning the text model. + audio_model_lora_config (`LoraConfigSimplified`, *optional*): + The LoRA configuration for finetuning the audio model. + audio_latency_block_size (`int`, *optional*, defaults to `None`): + The latency block size for simulating audio streaming. + + + Example: + + ```python + >>> from transformers import UltravoxForConditionalGeneration, Wav2Vec2Config, UltravoxConfig, LlamaConfig + + >>> # Initializing an audio encoder config + >>> audio_config = Wav2Vec2Config() + + >>> # Initializing a Llama config + >>> text_config = LlamaConfig() + + >>> # Initializing a default configuration + >>> configuration = UltravoxConfig(audio_config, text_config) + + >>> # Initializing a completely untrained model from the configuration + >>> model = UltravoxForConditionalGeneration(configuration) + + >>> # Accessing the model configuration + >>> configuration = model.config + + >>> # Initialize a model from pretrained checkpoints and random projector weights + >>> config = UltravoxConfig(audio_model_id="facebook/wav2vec2-base-960h", text_model_id="meta-llama/Llama-2-7b-chat-hf") + ```""" + + model_type = "ultravox" + is_composition = False + + def __init__( + self, + audio_config: Optional[Dict[str, Any]] = None, + text_config: Optional[Dict[str, Any]] = None, + audio_model_id: Optional[str] = None, + text_model_id: Optional[str] = None, + ignore_index: int = -100, + hidden_size: int = 4096, + stack_factor: int = 8, + norm_init: float = 0.4, + projector_act: str = "swiglu", + text_model_lora_config: Optional[LoraConfigSimplified] = None, + audio_model_lora_config: Optional[LoraConfigSimplified] = None, + audio_latency_block_size: Optional[int] = None, + **kwargs, + ): + self.ignore_index = ignore_index + + self.audio_model_id = audio_model_id + self.text_model_id = text_model_id + + self.hidden_size = hidden_size + self.stack_factor = stack_factor + self.norm_init = norm_init + self.projector_act = projector_act + + if text_model_id is not None: + self.text_config: transformers.LlamaConfig = ( + transformers.AutoConfig.from_pretrained(text_model_id) + ) + else: + text_config = text_config or {} + self.text_config = transformers.CONFIG_MAPPING[ + text_config.get("model_type", "llama") + ](**text_config) + + if audio_model_id is not None: + self.audio_config: transformers.PretrainedConfig = ( + transformers.AutoConfig.from_pretrained(audio_model_id) + ) + else: + audio_config = audio_config or {} + self.audio_config = transformers.CONFIG_MAPPING[ + audio_config.get("model_type", "wav2vec2") + ](**audio_config) + + self.text_model_lora_config = ( + text_model_lora_config + if isinstance(text_model_lora_config, dict) + else dataclasses.asdict(text_model_lora_config or LoraConfigSimplified()) + ) + self.audio_model_lora_config = ( + audio_model_lora_config + if isinstance(audio_model_lora_config, dict) + else dataclasses.asdict(audio_model_lora_config or LoraConfigSimplified()) + ) + self.audio_latency_block_size = audio_latency_block_size + + self.vocab_size = self.text_config.vocab_size + + self.initializer_range = self.text_config.initializer_range + + super().__init__(**kwargs) + + def to_diff_dict(self) -> Dict[str, Any]: + diff_dict = super().to_diff_dict() + + # remove text_config and audio_config if text_model_id and audio_model_id are present + if self.text_model_id is not None: + diff_dict.pop("text_config", None) + if self.audio_model_id is not None: + diff_dict.pop("audio_config", 
None) + + return diff_dict diff --git a/checkpoint-10800/ultravox_model.py b/checkpoint-10800/ultravox_model.py new file mode 100644 index 0000000000000000000000000000000000000000..ccea3e0ca174ec3119184d1f5a0f384c1cdbcae9 --- /dev/null +++ b/checkpoint-10800/ultravox_model.py @@ -0,0 +1,723 @@ +import logging +from typing import Any, Dict, Optional, Set, Tuple, Union + +import peft +import torch +import torch.nn as nn +import torch.nn.functional as F +import transformers +import transformers.activations +import transformers.modeling_outputs +import transformers.models +from transformers.models.whisper import modeling_whisper as whisper + +# We must use relative import in this directory to allow uploading to HF Hub +# Even "from . import X" pattern doesn't work (undocumented and unclear why) +from .ultravox_config import LossConfig +from .ultravox_config import LossFunction +from .ultravox_config import UltravoxConfig + + +class UltravoxModel(transformers.LlamaPreTrainedModel): + """ + The Ultravox model which consists of an audio encoder and a language model. + + Audio input is processed by the audio encoder, then every `stack_factor` frames are stacked together and + projected to the language model's embedding space using a few linear layers. + The text is embedded by the language model as usual and then the audio and text embeddings are merged together. + + A special token `<|audio|>` is used to indicate the start of the audio embeddings in the merged embeddings. + + Parameters: + config: Model configuration class with all the parameters of the model. + """ + + config_class = UltravoxConfig + config: UltravoxConfig # for type hinting + # Usually we load encoder and LLM weights from a pretrained model separately, so they are allowed to be missing + _keys_to_ignore_on_load_missing = ["audio_tower.*", "language_model.*"] + + def __init__(self, config: UltravoxConfig): + super().__init__(config) + self._register_load_state_dict_pre_hook(self._pre_load_state_dict_hook) + + self.keep_params: Set[str] = set() + self.vocab_size = config.vocab_size + + self.audio_tower = self._create_audio_tower(config) + self.multi_modal_projector = self._create_multi_modal_projector(config) + self.language_model = self._create_language_model(config) + + # Determine no_split_modules dynamically to use with FSDP auto_wrap policy. + # FSDP throws an error if some of the layer types are not found in the model. 
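# (Combining the two backbones' lists lets a single auto-wrap policy cover the whole merged model.)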
+ # This would be something like ["LlamaDecoderLayer", "WhisperEncoderLayer"] + self._no_split_modules = (self.language_model._no_split_modules or []) + ( + self.audio_tower._no_split_modules or [] + ) + + self.loss_config = LossConfig() + self.post_init() + + def get_input_embeddings(self): + return self.language_model.get_input_embeddings() + + def set_input_embeddings(self, value): + self.language_model.set_input_embeddings(value) + + def get_output_embeddings(self): + return self.language_model.get_output_embeddings() + + def set_output_embeddings(self, new_embeddings): + self.language_model.set_output_embeddings(new_embeddings) + + def set_decoder(self, decoder): + self.language_model.set_decoder(decoder) + + def get_decoder(self): + return self.language_model.get_decoder() + + def tie_weights(self): + return self.language_model.tie_weights() + + def set_loss_config(self, loss_config: LossConfig): + self.loss_config = loss_config + + def _setup_cache( + self, cache_cls, max_batch_size: int, max_cache_len: Optional[int] = None + ): + self.language_model._setup_cache(cache_cls, max_batch_size, max_cache_len) + + def _reorder_cache(self, past_key_values, beam_idx): + return self.language_model._reorder_cache(past_key_values, beam_idx) + + def resize_token_embeddings( + self, + new_num_tokens: Optional[int] = None, + pad_to_multiple_of: Optional[int] = None, + ) -> nn.Embedding: + model_embeds = self.language_model.resize_token_embeddings( + new_num_tokens, pad_to_multiple_of + ) + # update vocab size + self.config.text_config.vocab_size = model_embeds.num_embeddings + self.config.vocab_size = model_embeds.num_embeddings + self.vocab_size = model_embeds.num_embeddings + return model_embeds + + def _compute_kl_loss( + self, + lm_output: transformers.modeling_outputs.CausalLMOutputWithPast, + labels: Optional[torch.Tensor] = None, + past_key_values: Optional[Union[Tuple, transformers.cache_utils.Cache]] = None, + alt_input_ids: Optional[torch.Tensor] = None, + alt_attention_mask: Optional[torch.Tensor] = None, + alt_labels: Optional[torch.Tensor] = None, + **kwargs, + ): + # disable gradient computation for the teacher model + with torch.no_grad(): + # compute the teacher (text-only) model's distribution + alt_inputs_embeds = self.get_input_embeddings().forward(alt_input_ids) + alt_lm_output = self.language_model.forward( + inputs_embeds=alt_inputs_embeds, + labels=alt_labels, + attention_mask=alt_attention_mask, + past_key_values=past_key_values, + **kwargs, + ) + # compute the KL divergence loss between the two models + kl_loss = F.kl_div( + F.log_softmax( + lm_output.logits[labels != -100] / self.loss_config.kl_temperature, + dim=-1, + ), + F.softmax( + alt_lm_output.logits[alt_labels != -100] + / self.loss_config.kl_temperature, + dim=-1, + ), + reduction="batchmean", + ) + return {"loss": kl_loss} + + def forward( + self, + input_ids: torch.Tensor, + audio_values: Optional[torch.FloatTensor] = None, + inputs_embeds: Optional[torch.FloatTensor] = None, + labels: Optional[torch.Tensor] = None, + attention_mask: Optional[torch.Tensor] = None, + audio_token_start_idx: Optional[torch.Tensor] = None, + audio_len: Optional[torch.Tensor] = None, + audio_token_len: Optional[torch.Tensor] = None, + past_key_values: Optional[Union[Tuple, transformers.cache_utils.Cache]] = None, + # the alt_* fields are needed for KL divergence loss + alt_input_ids: Optional[torch.Tensor] = None, + alt_attention_mask: Optional[torch.Tensor] = None, + alt_labels: Optional[torch.Tensor] = None, + **kwargs, + ) -> 
Union[Tuple, transformers.modeling_outputs.CausalLMOutputWithPast]: + """ + Forward pass for the Ultravox model. + + `input_ids` are the tokenized text input. They are embedded by the language model as usual. + `audio_values` are processed by the audio encoder and then every `stack_factor` frames are stacked together and + projected to the language model's embedding space using a few linear layers. + The audio and text embeddings are merged together. A special token `<|audio|>` is used to indicate the start + of the audio embeddings in the merged embeddings. + + Args: + input_ids: The tokenized text input. + audio_values: The processed audio values. + inputs_embeds: The embeddings for the input tokens. + labels: The tokenized text labels. + attention_mask: The attention mask for the input. + position_ids: The position ids for the input. + past_key_values: The past key value cache for the language model attention layers. + **kwargs: Additional keyword arguments. Passed directly to the language model. + """ + if inputs_embeds is None: + # B x T -> B x T x D + inputs_embeds = self.get_input_embeddings().forward(input_ids) + + if audio_values is not None: + assert ( + audio_token_start_idx is not None and audio_token_len is not None + ), "audio_token_start_idx and audio_token_len must be provided if audio_values are provided." + assert ( + len(audio_token_start_idx) == len(audio_token_len) == len(audio_values) + ), "audio_token_start_idx, audio_token_len, and audio_values must have the same batch size." + + # B x A/3200 x D + audio_tower_output = self.audio_tower.forward( + audio_values.to(self.audio_tower.dtype), + audio_len=audio_len, + ).last_hidden_state + audio_tower_output = audio_tower_output.to(inputs_embeds.dtype) + + audio_embeds = self.multi_modal_projector.forward(audio_tower_output) + + # combine audio and text embeddings + for i, (audio, start, length) in enumerate( + zip(audio_embeds, audio_token_start_idx, audio_token_len) + ): + length = min(length, audio.shape[0]) + inputs_embeds[i, start : start + length] = audio[:length] + + lm_output = self.language_model.forward( + inputs_embeds=inputs_embeds, + labels=labels, + attention_mask=attention_mask, + past_key_values=past_key_values, + **kwargs, + ) + if self.training: + if self.loss_config.loss_function == LossFunction.CrossEntropy: + return lm_output + elif self.loss_config.loss_function == LossFunction.KL_Divergence: + return self._compute_kl_loss( + lm_output=lm_output, + labels=labels, + past_key_values=past_key_values, + alt_input_ids=alt_input_ids, + alt_attention_mask=alt_attention_mask, + alt_labels=alt_labels, + **kwargs, + ) + else: + raise ValueError( + f"Unsupported loss function: {self.loss_config.loss_function}" + ) + else: + return lm_output + + def prepare_inputs_for_generation( + self, + input_ids: torch.Tensor, + audio_values: Optional[torch.FloatTensor] = None, + audio_token_start_idx: Optional[torch.Tensor] = None, + audio_token_len: Optional[torch.Tensor] = None, + audio_len: Optional[torch.Tensor] = None, + past_key_values: Optional[Union[Tuple, transformers.cache_utils.Cache]] = None, + attention_mask: Optional[torch.Tensor] = None, + inputs_embeds: Optional[torch.Tensor] = None, + cache_position: Optional[torch.Tensor] = None, + **kwargs, + ) -> Dict[str, Any]: + model_input = self.language_model.prepare_inputs_for_generation( + input_ids=input_ids, + past_key_values=past_key_values, + attention_mask=attention_mask, + inputs_embeds=inputs_embeds, + cache_position=cache_position, + **kwargs, + ) + + # 
include audio information in model_input only when it is needed during prefilling + # audio_token_start_idx should always be relative to the current cache position + prefill_start_idx = 0 if cache_position is None else cache_position[0] + if ( + audio_values is not None + and audio_token_start_idx is not None + and prefill_start_idx <= torch.max(audio_token_start_idx) + ): + model_input["audio_values"] = audio_values + model_input["audio_token_start_idx"] = ( + audio_token_start_idx - prefill_start_idx + ) + model_input["audio_token_len"] = audio_token_len + model_input["audio_len"] = audio_len + + return model_input + + @classmethod + def _create_multi_modal_projector( + cls, config: UltravoxConfig + ) -> "UltravoxProjector": + projector = UltravoxProjector(config) + projector.to(config.torch_dtype) + return projector + + @classmethod + def _create_audio_tower( + cls, config: UltravoxConfig + ) -> Union[transformers.Wav2Vec2Model, "ModifiedWhisperEncoder"]: + if config.audio_model_id is not None: + if "whisper" in config.audio_model_id: + audio_tower = ModifiedWhisperEncoder.from_pretrained( + config.audio_model_id, torch_dtype=config.torch_dtype + ) + audio_tower.init_latency_mask( + config.audio_latency_block_size, dtype=config.torch_dtype + ) + else: + assert config.audio_latency_block_size in ( + None, + 0, + ), "only whisper audio tower supports audio latency masking, got non-zero value for 'audio_latency_block_size'" + audio_tower = transformers.AutoModel.from_pretrained( + config.audio_model_id, torch_dtype=config.torch_dtype + ) + else: + if "whisper" in config.audio_config._name_or_path: + audio_tower = ModifiedWhisperEncoder(config.audio_config) + audio_tower.init_latency_mask( + config.audio_latency_block_size, dtype=config.torch_dtype + ) + else: + assert config.audio_latency_block_size in ( + None, + 0, + ), "only whisper audio tower supports audio latency masking, got non-zero value for 'audio_latency_block_size'" + with transformers.modeling_utils.no_init_weights(): + # we only ever use from_config if the weights are retrained, hence initializing is not + # required. This makes model creation quite a bit faster since init on CPU is quite slow. + audio_tower = transformers.AutoModel.from_config( + config.audio_config + ) + + if isinstance( + audio_tower, + (transformers.Wav2Vec2BertModel, transformers.WhisperModel), + ): + # For these models we only need the encoder part + # Wav2Vec2BertModel -> Wav2Vec2BertEncoder + # WhisperModel -> WhisperEncoder + audio_tower = audio_tower.encoder + + audio_tower = apply_lora(audio_tower, config.audio_model_lora_config) + return audio_tower + + @classmethod + def _create_language_model( + cls, config: UltravoxConfig + ) -> transformers.LlamaForCausalLM: + if config.text_model_id is not None: + language_model = transformers.AutoModelForCausalLM.from_pretrained( + config.text_model_id, + attn_implementation=config._attn_implementation, + torch_dtype=config.torch_dtype, + ) + else: + with transformers.modeling_utils.no_init_weights(): + # we only ever use from_config if the weights are retrained, hence initializing is not + # required. This makes model creation quite a bit faster since init on CPU is quite slow.
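# ----------------------------------------------------------------------
# Standalone usage sketch (annotation, not part of the original file).
# The factory methods above fetch pretrained backbones whenever
# `audio_model_id` / `text_model_id` are set, as they are in this
# checkpoint's config.json; only the projector starts from fresh weights.
# The flat import paths below are an assumption, since inside this repo
# these modules are loaded as package code via relative imports.
from ultravox_config import UltravoxConfig  # assumed import path
from ultravox_model import UltravoxModel    # assumed import path

cfg = UltravoxConfig(
    audio_model_id="openai/whisper-large-v3-turbo",    # per config.json
    text_model_id="meta-llama/Llama-3.2-1B-Instruct",  # per config.json
    stack_factor=8,          # per config.json
    norm_init=0.4,           # per config.json
    projector_act="swiglu",  # per config.json
)
model = UltravoxModel(cfg)  # downloads both backbones (the Llama repo is
                            # gated); the projector is randomly initialized
# ----------------------------------------------------------------------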
+ language_model = transformers.AutoModelForCausalLM.from_config( + config.text_config, + attn_implementation=config._attn_implementation, + torch_dtype=config.torch_dtype, + ) + + language_model = apply_lora(language_model, config.text_model_lora_config) + return language_model + + def merge_and_unload(self): + if isinstance(self.language_model, peft.PeftModel): + self.language_model = self.language_model.merge_and_unload() + # no need to download base language model weights anymore, so we can remove the id + self.config.text_model_id = None + self.keep_params.update( + set( + [ + f"language_model.{name}" + for name, _ in self.language_model.named_parameters() + ] + ) + ) + + if isinstance(self.audio_tower, peft.PeftModel): + self.audio_tower = self.audio_tower.merge_and_unload() + # no need to download base audio model weights anymore, so we can remove the id + self.config.audio_model_id = None + self.keep_params.update( + set( + [ + f"audio_tower.{name}" + for name, _ in self.audio_tower.named_parameters() + ] + ) + ) + + for param in ["text_model_lora_config", "audio_model_lora_config"]: + if hasattr(self.config, param): + delattr(self.config, param) + + def push_to_hub(self, *args, **kwargs): + self.merge_and_unload() + self.to(self.language_model.dtype) + return super().push_to_hub(*args, **kwargs) + + def save_pretrained( + self, *args, state_dict: Optional[Dict[str, Any]] = None, **kwargs + ): + if state_dict is None: + state_dict = super().state_dict() + + named_params = dict(self.named_parameters()) + + state_dict = { + k: v + for k, v in state_dict.items() + if k in self.keep_params + or (k in named_params and named_params[k].requires_grad) + } + + super().save_pretrained(*args, state_dict=state_dict, **kwargs) + + def _pre_load_state_dict_hook(self, state_dict: Dict[str, Any], *args, **kwargs): + self.keep_params.update(set(state_dict.keys())) + + def print_trainable_parameters(self): + """ + Prints the number of trainable parameters in the model (reuses Peft model's method) + """ + count_params = peft.peft_model.PeftModel.get_nb_trainable_parameters + + trainable_params, all_param = count_params(self) + + logging.info( + f"trainable params: {trainable_params:,d} || all params: {all_param:,d}" + f" || trainable%: {100 * trainable_params / all_param:.1f}%" + ) + + lm_trainable_params, lm_all_params = count_params(self.language_model) + audio_trainable_params, audio_all_params = count_params(self.audio_tower) + + projector_trainable_params = ( + trainable_params - lm_trainable_params - audio_trainable_params + ) + projector_all_params = all_param - lm_all_params - audio_all_params + + logging.info( + f"Trainable%: " + f" LLM: {100 * lm_trainable_params / lm_all_params:.1f}%" + f" || Audio Encoder: {100 * audio_trainable_params / audio_all_params:.1f}%" + f" || Projector: {100 * projector_trainable_params / projector_all_params:.1f}%" + ) + + +def is_cache_empty( + past_key_values: Optional[Union[Tuple, transformers.cache_utils.Cache]] +) -> bool: + """ + Check if the cache is empty. + """ + if past_key_values is None: + return True + if isinstance(past_key_values, tuple): + return all(len(c) == 0 for c in past_key_values) + return past_key_values.get_seq_length() == 0 + + +def apply_lora(model: torch.nn.Module, lora_config: dict) -> torch.nn.Module: + """ + Applies LoRA finetuning to the model. If the `r` parameter is set to 0, the model is frozen instead. 
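With `r` set to 0 for both backbones (the setting recorded in this checkpoint's config.json), the audio tower and language model are frozen and only the multimodal projector remains trainable.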
+ """ + lora_config = peft.LoraConfig(**lora_config or {}) + + if lora_config.r == 0: + # freeze the model entirely + for param in model.parameters(): + param.requires_grad = False + else: + model = peft.get_peft_model(model, lora_config) + + return model + + +class StackAudioFrames(nn.Module): + """ + Stack the audio embedding frames to reduce the sequence length by a factor of `stack_factor`. + + The number of output frames will be `ceil(T / stack_factor) + 1` where `T` is the number of input frames. + NOTE: the extra +1 is intentional: in case the number of audio tokens are over-estimated by the processor, + we want to make sure `processor.audio_token_replacement` (i.e. EOS) doesn't get leaked into the middle of embeddings. + In most cases this extra padding will get removed in the model's forward function so it has no effect. + """ + + def __init__(self, stack_factor: int = 8): + super().__init__() + self.stack_factor = stack_factor + + def forward(self, audio_embeds: torch.Tensor) -> torch.Tensor: + B, T, C = audio_embeds.shape + T_pad = (T + self.stack_factor - 1) // self.stack_factor * self.stack_factor + audio_embeds = F.pad(audio_embeds, (0, 0, 0, T_pad - T + self.stack_factor)) + B, T, C = audio_embeds.shape + audio_embeds = audio_embeds.view( + B, T // self.stack_factor, C * self.stack_factor + ) + return audio_embeds + + +class RMSNorm(transformers.models.llama.modeling_llama.LlamaRMSNorm): + def __init__(self, hidden_size: int, init: float = 1, eps: float = 1e-6): + super().__init__(hidden_size=hidden_size, eps=eps) + self.weight.data.fill_(init) + + +class SwiGLU(nn.Module): + def forward(self, x): + x, gate = x.chunk(2, dim=-1) + return F.silu(gate) * x + + +class UltravoxProjector(nn.Sequential): + def __init__(self, config: UltravoxConfig): + super().__init__() + self.hidden_dim = config.hidden_size + self._pad_and_stack = StackAudioFrames(config.stack_factor) + dim = config.audio_config.hidden_size * config.stack_factor + self.ln_pre = RMSNorm(dim, init=config.norm_init) + self.linear_1 = nn.Linear(dim, self.hidden_dim, bias=False) + dim = self.hidden_dim + self.act = transformers.activations.get_activation(config.projector_act) + dim = dim // 2 if config.projector_act == "swiglu" else dim + self.linear_2 = nn.Linear(dim, config.text_config.hidden_size, bias=False) + self.ln_post = RMSNorm(config.text_config.hidden_size, init=config.norm_init) + + def forward(self, audio_features: torch.Tensor) -> torch.Tensor: + audio_features = self._pad_and_stack(audio_features) + audio_features = self.ln_pre(audio_features) + hidden_states = self.linear_1(audio_features) + hidden_states = self.act(hidden_states) + hidden_states = self.linear_2(hidden_states) + hidden_states = self.ln_post(hidden_states) + return hidden_states + + +class ModifiedWhisperEncoder( + whisper.WhisperEncoder, transformers.modeling_utils.ModuleUtilsMixin +): + """ + Encoder portion of OpenAI's Whisper model. + + This implementation is a slightly modified version of HF Transformers' Whisper Encoder, with only a few fixes: + 1. base_model_prefix updated to allow for doing `.from_pretrained` directly on the encoder + 2. 
allow less than 30 seconds of audio padding to be passed in: + - relaxed ValueError check for `input_features` length to be less than or equal to `expected_seq_length` instead of strictly equal + - embed_pos is now sliced to match the length of `inputs_embeds` + + Original: https://github.com/huggingface/transformers/blob/main/src/transformers/models/whisper/modeling_whisper.py + """ + + base_model_prefix = "model.encoder" + _no_split_modules = ["WhisperEncoderLayer"] + + def init_latency_mask(self, audio_latency_block_size: int, dtype: torch.dtype): + if audio_latency_block_size is None: + self.audio_streaming_mask = None + return + + # maximum sequence length + max_seqlen = ( + self.config.max_source_positions + * self.conv1.stride[0] + * self.conv2.stride[0] + ) + assert ( + max_seqlen > 0 + ), f"maximum sequence length must be positive, got {max_seqlen}" + assert ( + max_seqlen % audio_latency_block_size == 0 + ), f"audio_latency_block_size {audio_latency_block_size} must divide {max_seqlen} evenly." + # Given the block size, we calculate the number of blocks. + audio_latency_nblocks = max_seqlen // audio_latency_block_size + audio_streaming_mask = ( + torch.tril( + torch.ones(audio_latency_nblocks, audio_latency_nblocks), + diagonal=0, + ) + .repeat_interleave(audio_latency_block_size, dim=0) + .repeat_interleave(audio_latency_block_size, dim=1) + ) + audio_streaming_mask = (1.0 - audio_streaming_mask) * torch.finfo(dtype).min + audio_streaming_mask = audio_streaming_mask[None, None, :, :] + self.register_buffer( + "audio_streaming_mask", audio_streaming_mask, persistent=False + ) + + def forward( + self, + input_features, + audio_len=None, + head_mask=None, + output_attentions=None, + output_hidden_states=None, + return_dict=None, + ): + expected_seq_length = ( + self.config.max_source_positions + * self.conv1.stride[0] + * self.conv2.stride[0] + ) + if input_features.shape[-1] > expected_seq_length: + raise ValueError( + f"Whisper expects the mel input features to be of length {expected_seq_length} or less, but found {input_features.shape[-1]}. Make sure to pad the input mel features to {expected_seq_length}."
+ ) + + output_attentions = ( + output_attentions + if output_attentions is not None + else self.config.output_attentions + ) + output_hidden_states = ( + output_hidden_states + if output_hidden_states is not None + else self.config.output_hidden_states + ) + return_dict = ( + return_dict if return_dict is not None else self.config.use_return_dict + ) + inputs_embeds = nn.functional.gelu(self.conv1(input_features)) + inputs_embeds = nn.functional.gelu(self.conv2(inputs_embeds)) + + inputs_embeds = inputs_embeds.permute(0, 2, 1) + embed_pos = self.embed_positions.weight[: inputs_embeds.size(-2)] + + hidden_states = inputs_embeds + embed_pos + hidden_states = nn.functional.dropout( + hidden_states, p=self.dropout, training=self.training + ) + + encoder_states = () if output_hidden_states else None + all_attentions = () if output_attentions else None + + # Create attention mask based on audio lengths to mask out padding tokens + # For each sample in batch: + # - Convert raw audio length to feature length after convolutions + # - Create boolean mask that is True for valid positions and False for padding + # - Convert to extended attention mask format expected by transformer layers + # (0.0 for positions to attend to, large negative for positions to ignore) + # This masking ensures consistent behavior between training and inference + # by preventing the model from attending to padding tokens in both cases + attention_mask = None + if audio_len is not None: + audio_feature_len = self._get_feat_extract_output_lengths(audio_len) + max_seq_len = hidden_states.shape[1] + attention_mask = torch.arange(max_seq_len, device=hidden_states.device)[ + None, : + ].lt(audio_feature_len.view(-1, 1)) + attention_mask = self.get_extended_attention_mask( + attention_mask, + None, + device=hidden_states.device, + dtype=hidden_states.dtype, + ) + + if self.audio_streaming_mask is not None: + seqlen = hidden_states.size(-2) + if attention_mask is not None: + attention_mask = torch.minimum( + self.audio_streaming_mask[:, :, :seqlen, :seqlen], attention_mask + ) # merge + else: + attention_mask = self.audio_streaming_mask[:, :, :seqlen, :seqlen] + attention_mask = attention_mask.to(hidden_states.dtype) + + # check if head_mask has a correct number of layers specified if desired + if head_mask is not None: + assert head_mask.size()[0] == ( + len(self.layers) + ), f"The head_mask should be specified for {len(self.layers)} layers, but it is for {head_mask.size()[0]}."
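# ----------------------------------------------------------------------
# Worked sketch (annotation, not part of the original file): the
# block-causal pattern that init_latency_mask builds above, reproduced
# standalone with toy sizes so the shape of the mask is easy to inspect.
# Assumes only torch.
import torch

nblocks, block_size = 2, 3  # e.g. 6 audio frames in 2 latency blocks
keep = (
    torch.tril(torch.ones(nblocks, nblocks))  # block-level causal pattern
    .repeat_interleave(block_size, dim=0)     # rows: block -> frame level
    .repeat_interleave(block_size, dim=1)     # cols: block -> frame level
)
# keep[i, j] == 1 iff frame i may attend to frame j: frames 0-2 (block 0)
# see only frames 0-2, while frames 3-5 (block 1) see all six frames.
additive = (1.0 - keep) * torch.finfo(torch.float32).min  # 0 = keep, min = drop
# ----------------------------------------------------------------------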
+ + for idx, encoder_layer in enumerate(self.layers): + if output_hidden_states: + encoder_states = encoder_states + (hidden_states,) + # add LayerDrop (see https://arxiv.org/abs/1909.11556 for description) + to_drop = False + if self.training: + dropout_probability = torch.rand([]) + if dropout_probability < self.layerdrop: # skip the layer + to_drop = True + + if to_drop: + layer_outputs = (None, None) + else: + if self.gradient_checkpointing and self.training: + layer_outputs = self._gradient_checkpointing_func( + encoder_layer.__call__, + hidden_states, + attention_mask, + (head_mask[idx] if head_mask is not None else None), + output_attentions, + ) + else: + layer_outputs = encoder_layer( + hidden_states, + attention_mask, + layer_head_mask=( + head_mask[idx] if head_mask is not None else None + ), + output_attentions=output_attentions, + ) + + hidden_states = layer_outputs[0] + + if output_attentions: + all_attentions = all_attentions + (layer_outputs[1],) + + hidden_states = self.layer_norm(hidden_states) + if output_hidden_states: + encoder_states = encoder_states + (hidden_states,) + + if not return_dict: + return tuple( + v + for v in [hidden_states, encoder_states, all_attentions] + if v is not None + ) + return transformers.modeling_outputs.BaseModelOutput( + last_hidden_state=hidden_states, + hidden_states=encoder_states, + attentions=all_attentions, + ) + + +UltravoxConfig.register_for_auto_class() +UltravoxModel.register_for_auto_class() + +transformers.AutoConfig.register("ultravox", UltravoxConfig) +transformers.AutoModel.register(UltravoxConfig, UltravoxModel) + +transformers.activations.ACT2FN["swiglu"] = SwiGLU diff --git a/checkpoint-14400/config.json b/checkpoint-14400/config.json new file mode 100644 index 0000000000000000000000000000000000000000..0ce016ecf9d037d4afa627d419c26b5ddd28a2f8 --- /dev/null +++ b/checkpoint-14400/config.json @@ -0,0 +1,43 @@ +{ + "architectures": [ + "UltravoxModel" + ], + "audio_latency_block_size": null, + "audio_model_id": "openai/whisper-large-v3-turbo", + "audio_model_lora_config": { + "lora_alpha": 8, + "r": 0, + "target_modules": [ + "k_proj", + "q_proj", + "linear_k", + "linear_q" + ] + }, + "auto_map": { + "AutoConfig": "ultravox_config.UltravoxConfig", + "AutoModel": "ultravox_model.UltravoxModel" + }, + "hidden_size": 4096, + "ignore_index": -100, + "initializer_range": 0.02, + "model_type": "ultravox", + "norm_init": 0.4, + "pad_token_id": 128009, + "projector_act": "swiglu", + "stack_factor": 8, + "text_model_id": "meta-llama/Llama-3.2-1B-Instruct", + "text_model_lora_config": { + "lora_alpha": 8, + "r": 0, + "target_modules": [ + "k_proj", + "q_proj", + "linear_k", + "linear_q" + ] + }, + "torch_dtype": "bfloat16", + "transformers_version": "4.47.0", + "vocab_size": 128256 +} diff --git a/checkpoint-14400/generation_config.json b/checkpoint-14400/generation_config.json new file mode 100644 index 0000000000000000000000000000000000000000..4dac817850f65a6be4d01d824462c9fe54468763 --- /dev/null +++ b/checkpoint-14400/generation_config.json @@ -0,0 +1,11 @@ +{ + "_from_model_config": true, + "bos_token_id": 128000, + "eos_token_id": [ + 128001, + 128008, + 128009 + ], + "pad_token_id": 128009, + "transformers_version": "4.47.0" +} diff --git a/checkpoint-14400/model.safetensors b/checkpoint-14400/model.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..8a76fff3f8e4d9abf6f493159379ea05b1446d90 --- /dev/null +++ b/checkpoint-14400/model.safetensors @@ -0,0 +1,3 @@ +version 
https://git-lfs.github.com/spec/v1 +oid sha256:46fb828f3d8af8e0688d12814507e753d1b1b539be4d4fc1c4354c929de48237 +size 92299736 diff --git a/checkpoint-14400/optimizer.pt b/checkpoint-14400/optimizer.pt new file mode 100644 index 0000000000000000000000000000000000000000..79edae5b6cd78f0585bf6e6ddace59dcb96bdf6f --- /dev/null +++ b/checkpoint-14400/optimizer.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:83a3a84b785ddcbb571fb4f67d0a415d0ac6f7b538cec8be3e6cd7b18916ad72 +size 184602962 diff --git a/checkpoint-14400/rng_state.pth b/checkpoint-14400/rng_state.pth new file mode 100644 index 0000000000000000000000000000000000000000..db9797d81b3660f96d15f16d85450b815cff73f5 --- /dev/null +++ b/checkpoint-14400/rng_state.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:11516b7ccd6e385a675b71eca16211a235edde3d873d7f7e38c7de807a794cd2 +size 14244 diff --git a/checkpoint-14400/scheduler.pt b/checkpoint-14400/scheduler.pt new file mode 100644 index 0000000000000000000000000000000000000000..e390e7f555fc9c814b348911280e7916c4a719e4 --- /dev/null +++ b/checkpoint-14400/scheduler.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:6c57798d3fa88e9f7f19098498af2fbd85578814b976457ff05f5a72957807e9 +size 1064 diff --git a/checkpoint-14400/special_tokens_map.json b/checkpoint-14400/special_tokens_map.json new file mode 100644 index 0000000000000000000000000000000000000000..b43be96621d147110fb8a18b5776ec6e38516127 --- /dev/null +++ b/checkpoint-14400/special_tokens_map.json @@ -0,0 +1,17 @@ +{ + "bos_token": { + "content": "<|begin_of_text|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false + }, + "eos_token": { + "content": "<|eot_id|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false + }, + "pad_token": "<|eot_id|>" +} diff --git a/checkpoint-14400/tokenizer.json b/checkpoint-14400/tokenizer.json new file mode 100644 index 0000000000000000000000000000000000000000..1c1d8d5c9024994f1d3b00f9662b8dd89ca13cf2 --- /dev/null +++ b/checkpoint-14400/tokenizer.json @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:6b9e4e7fb171f92fd137b777cc2714bf87d11576700a1dcd7a399e7bbe39537b +size 17209920 diff --git a/checkpoint-14400/tokenizer_config.json b/checkpoint-14400/tokenizer_config.json new file mode 100644 index 0000000000000000000000000000000000000000..fb0e8efdce308a03b5350019cf1f24fa4375396f --- /dev/null +++ b/checkpoint-14400/tokenizer_config.json @@ -0,0 +1,2064 @@ +{ + "added_tokens_decoder": { + "128000": { + "content": "<|begin_of_text|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128001": { + "content": "<|end_of_text|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128002": { + "content": "<|reserved_special_token_0|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128003": { + "content": "<|reserved_special_token_1|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128004": { + "content": "<|finetune_right_pad_id|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128005": { + "content": "<|reserved_special_token_2|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + 
}, + "128006": { + "content": "<|start_header_id|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128007": { + "content": "<|end_header_id|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128008": { + "content": "<|eom_id|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128009": { + "content": "<|eot_id|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128010": { + "content": "<|python_tag|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128011": { + "content": "<|reserved_special_token_3|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128012": { + "content": "<|reserved_special_token_4|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128013": { + "content": "<|reserved_special_token_5|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128014": { + "content": "<|reserved_special_token_6|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128015": { + "content": "<|reserved_special_token_7|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128016": { + "content": "<|reserved_special_token_8|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128017": { + "content": "<|reserved_special_token_9|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128018": { + "content": "<|reserved_special_token_10|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128019": { + "content": "<|reserved_special_token_11|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128020": { + "content": "<|reserved_special_token_12|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128021": { + "content": "<|reserved_special_token_13|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128022": { + "content": "<|reserved_special_token_14|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128023": { + "content": "<|reserved_special_token_15|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128024": { + "content": "<|reserved_special_token_16|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128025": { + "content": "<|reserved_special_token_17|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128026": { + "content": "<|reserved_special_token_18|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128027": { + "content": "<|reserved_special_token_19|>", + "lstrip": false, + "normalized": false, + 
"rstrip": false, + "single_word": false, + "special": true + }, + "128028": { + "content": "<|reserved_special_token_20|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128029": { + "content": "<|reserved_special_token_21|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128030": { + "content": "<|reserved_special_token_22|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128031": { + "content": "<|reserved_special_token_23|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128032": { + "content": "<|reserved_special_token_24|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128033": { + "content": "<|reserved_special_token_25|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128034": { + "content": "<|reserved_special_token_26|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128035": { + "content": "<|reserved_special_token_27|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128036": { + "content": "<|reserved_special_token_28|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128037": { + "content": "<|reserved_special_token_29|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128038": { + "content": "<|reserved_special_token_30|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128039": { + "content": "<|reserved_special_token_31|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128040": { + "content": "<|reserved_special_token_32|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128041": { + "content": "<|reserved_special_token_33|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128042": { + "content": "<|reserved_special_token_34|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128043": { + "content": "<|reserved_special_token_35|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128044": { + "content": "<|reserved_special_token_36|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128045": { + "content": "<|reserved_special_token_37|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128046": { + "content": "<|reserved_special_token_38|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128047": { + "content": "<|reserved_special_token_39|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128048": { + "content": "<|reserved_special_token_40|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + 
"single_word": false, + "special": true + }, + "128049": { + "content": "<|reserved_special_token_41|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128050": { + "content": "<|reserved_special_token_42|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128051": { + "content": "<|reserved_special_token_43|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128052": { + "content": "<|reserved_special_token_44|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128053": { + "content": "<|reserved_special_token_45|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128054": { + "content": "<|reserved_special_token_46|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128055": { + "content": "<|reserved_special_token_47|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128056": { + "content": "<|reserved_special_token_48|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128057": { + "content": "<|reserved_special_token_49|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128058": { + "content": "<|reserved_special_token_50|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128059": { + "content": "<|reserved_special_token_51|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128060": { + "content": "<|reserved_special_token_52|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128061": { + "content": "<|reserved_special_token_53|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128062": { + "content": "<|reserved_special_token_54|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128063": { + "content": "<|reserved_special_token_55|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128064": { + "content": "<|reserved_special_token_56|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128065": { + "content": "<|reserved_special_token_57|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128066": { + "content": "<|reserved_special_token_58|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128067": { + "content": "<|reserved_special_token_59|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128068": { + "content": "<|reserved_special_token_60|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128069": { + "content": "<|reserved_special_token_61|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + 
"special": true + }, + "128070": { + "content": "<|reserved_special_token_62|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128071": { + "content": "<|reserved_special_token_63|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128072": { + "content": "<|reserved_special_token_64|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128073": { + "content": "<|reserved_special_token_65|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128074": { + "content": "<|reserved_special_token_66|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128075": { + "content": "<|reserved_special_token_67|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128076": { + "content": "<|reserved_special_token_68|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128077": { + "content": "<|reserved_special_token_69|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128078": { + "content": "<|reserved_special_token_70|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128079": { + "content": "<|reserved_special_token_71|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128080": { + "content": "<|reserved_special_token_72|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128081": { + "content": "<|reserved_special_token_73|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128082": { + "content": "<|reserved_special_token_74|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128083": { + "content": "<|reserved_special_token_75|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128084": { + "content": "<|reserved_special_token_76|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128085": { + "content": "<|reserved_special_token_77|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128086": { + "content": "<|reserved_special_token_78|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128087": { + "content": "<|reserved_special_token_79|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128088": { + "content": "<|reserved_special_token_80|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128089": { + "content": "<|reserved_special_token_81|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128090": { + "content": "<|reserved_special_token_82|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + 
"128091": { + "content": "<|reserved_special_token_83|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128092": { + "content": "<|reserved_special_token_84|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128093": { + "content": "<|reserved_special_token_85|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128094": { + "content": "<|reserved_special_token_86|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128095": { + "content": "<|reserved_special_token_87|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128096": { + "content": "<|reserved_special_token_88|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128097": { + "content": "<|reserved_special_token_89|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128098": { + "content": "<|reserved_special_token_90|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128099": { + "content": "<|reserved_special_token_91|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128100": { + "content": "<|reserved_special_token_92|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128101": { + "content": "<|reserved_special_token_93|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128102": { + "content": "<|reserved_special_token_94|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128103": { + "content": "<|reserved_special_token_95|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128104": { + "content": "<|reserved_special_token_96|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128105": { + "content": "<|reserved_special_token_97|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128106": { + "content": "<|reserved_special_token_98|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128107": { + "content": "<|reserved_special_token_99|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128108": { + "content": "<|reserved_special_token_100|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128109": { + "content": "<|reserved_special_token_101|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128110": { + "content": "<|reserved_special_token_102|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128111": { + "content": "<|reserved_special_token_103|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128112": { + 
"content": "<|reserved_special_token_104|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128113": { + "content": "<|reserved_special_token_105|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128114": { + "content": "<|reserved_special_token_106|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128115": { + "content": "<|reserved_special_token_107|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128116": { + "content": "<|reserved_special_token_108|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128117": { + "content": "<|reserved_special_token_109|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128118": { + "content": "<|reserved_special_token_110|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128119": { + "content": "<|reserved_special_token_111|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128120": { + "content": "<|reserved_special_token_112|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128121": { + "content": "<|reserved_special_token_113|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128122": { + "content": "<|reserved_special_token_114|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128123": { + "content": "<|reserved_special_token_115|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128124": { + "content": "<|reserved_special_token_116|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128125": { + "content": "<|reserved_special_token_117|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128126": { + "content": "<|reserved_special_token_118|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128127": { + "content": "<|reserved_special_token_119|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128128": { + "content": "<|reserved_special_token_120|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128129": { + "content": "<|reserved_special_token_121|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128130": { + "content": "<|reserved_special_token_122|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128131": { + "content": "<|reserved_special_token_123|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128132": { + "content": "<|reserved_special_token_124|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128133": { + 
"content": "<|reserved_special_token_125|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128134": { + "content": "<|reserved_special_token_126|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128135": { + "content": "<|reserved_special_token_127|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128136": { + "content": "<|reserved_special_token_128|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128137": { + "content": "<|reserved_special_token_129|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128138": { + "content": "<|reserved_special_token_130|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128139": { + "content": "<|reserved_special_token_131|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128140": { + "content": "<|reserved_special_token_132|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128141": { + "content": "<|reserved_special_token_133|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128142": { + "content": "<|reserved_special_token_134|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128143": { + "content": "<|reserved_special_token_135|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128144": { + "content": "<|reserved_special_token_136|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128145": { + "content": "<|reserved_special_token_137|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128146": { + "content": "<|reserved_special_token_138|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128147": { + "content": "<|reserved_special_token_139|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128148": { + "content": "<|reserved_special_token_140|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128149": { + "content": "<|reserved_special_token_141|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128150": { + "content": "<|reserved_special_token_142|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128151": { + "content": "<|reserved_special_token_143|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128152": { + "content": "<|reserved_special_token_144|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128153": { + "content": "<|reserved_special_token_145|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128154": { + 
"content": "<|reserved_special_token_146|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128155": { + "content": "<|reserved_special_token_147|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128156": { + "content": "<|reserved_special_token_148|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128157": { + "content": "<|reserved_special_token_149|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128158": { + "content": "<|reserved_special_token_150|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128159": { + "content": "<|reserved_special_token_151|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128160": { + "content": "<|reserved_special_token_152|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128161": { + "content": "<|reserved_special_token_153|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128162": { + "content": "<|reserved_special_token_154|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128163": { + "content": "<|reserved_special_token_155|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128164": { + "content": "<|reserved_special_token_156|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128165": { + "content": "<|reserved_special_token_157|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128166": { + "content": "<|reserved_special_token_158|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128167": { + "content": "<|reserved_special_token_159|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128168": { + "content": "<|reserved_special_token_160|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128169": { + "content": "<|reserved_special_token_161|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128170": { + "content": "<|reserved_special_token_162|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128171": { + "content": "<|reserved_special_token_163|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128172": { + "content": "<|reserved_special_token_164|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128173": { + "content": "<|reserved_special_token_165|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128174": { + "content": "<|reserved_special_token_166|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128175": { + 
"content": "<|reserved_special_token_167|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128176": { + "content": "<|reserved_special_token_168|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128177": { + "content": "<|reserved_special_token_169|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128178": { + "content": "<|reserved_special_token_170|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128179": { + "content": "<|reserved_special_token_171|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128180": { + "content": "<|reserved_special_token_172|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128181": { + "content": "<|reserved_special_token_173|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128182": { + "content": "<|reserved_special_token_174|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128183": { + "content": "<|reserved_special_token_175|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128184": { + "content": "<|reserved_special_token_176|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128185": { + "content": "<|reserved_special_token_177|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128186": { + "content": "<|reserved_special_token_178|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128187": { + "content": "<|reserved_special_token_179|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128188": { + "content": "<|reserved_special_token_180|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128189": { + "content": "<|reserved_special_token_181|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128190": { + "content": "<|reserved_special_token_182|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128191": { + "content": "<|reserved_special_token_183|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128192": { + "content": "<|reserved_special_token_184|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128193": { + "content": "<|reserved_special_token_185|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128194": { + "content": "<|reserved_special_token_186|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128195": { + "content": "<|reserved_special_token_187|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128196": { + 
"content": "<|reserved_special_token_188|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128197": { + "content": "<|reserved_special_token_189|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128198": { + "content": "<|reserved_special_token_190|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128199": { + "content": "<|reserved_special_token_191|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128200": { + "content": "<|reserved_special_token_192|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128201": { + "content": "<|reserved_special_token_193|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128202": { + "content": "<|reserved_special_token_194|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128203": { + "content": "<|reserved_special_token_195|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128204": { + "content": "<|reserved_special_token_196|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128205": { + "content": "<|reserved_special_token_197|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128206": { + "content": "<|reserved_special_token_198|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128207": { + "content": "<|reserved_special_token_199|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128208": { + "content": "<|reserved_special_token_200|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128209": { + "content": "<|reserved_special_token_201|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128210": { + "content": "<|reserved_special_token_202|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128211": { + "content": "<|reserved_special_token_203|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128212": { + "content": "<|reserved_special_token_204|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128213": { + "content": "<|reserved_special_token_205|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128214": { + "content": "<|reserved_special_token_206|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128215": { + "content": "<|reserved_special_token_207|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128216": { + "content": "<|reserved_special_token_208|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128217": { + 
"content": "<|reserved_special_token_209|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128218": { + "content": "<|reserved_special_token_210|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128219": { + "content": "<|reserved_special_token_211|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128220": { + "content": "<|reserved_special_token_212|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128221": { + "content": "<|reserved_special_token_213|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128222": { + "content": "<|reserved_special_token_214|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128223": { + "content": "<|reserved_special_token_215|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128224": { + "content": "<|reserved_special_token_216|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128225": { + "content": "<|reserved_special_token_217|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128226": { + "content": "<|reserved_special_token_218|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128227": { + "content": "<|reserved_special_token_219|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128228": { + "content": "<|reserved_special_token_220|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128229": { + "content": "<|reserved_special_token_221|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128230": { + "content": "<|reserved_special_token_222|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128231": { + "content": "<|reserved_special_token_223|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128232": { + "content": "<|reserved_special_token_224|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128233": { + "content": "<|reserved_special_token_225|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128234": { + "content": "<|reserved_special_token_226|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128235": { + "content": "<|reserved_special_token_227|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128236": { + "content": "<|reserved_special_token_228|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128237": { + "content": "<|reserved_special_token_229|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128238": { + 
"content": "<|reserved_special_token_230|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128239": { + "content": "<|reserved_special_token_231|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128240": { + "content": "<|reserved_special_token_232|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128241": { + "content": "<|reserved_special_token_233|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128242": { + "content": "<|reserved_special_token_234|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128243": { + "content": "<|reserved_special_token_235|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128244": { + "content": "<|reserved_special_token_236|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128245": { + "content": "<|reserved_special_token_237|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128246": { + "content": "<|reserved_special_token_238|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128247": { + "content": "<|reserved_special_token_239|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128248": { + "content": "<|reserved_special_token_240|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128249": { + "content": "<|reserved_special_token_241|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128250": { + "content": "<|reserved_special_token_242|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128251": { + "content": "<|reserved_special_token_243|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128252": { + "content": "<|reserved_special_token_244|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128253": { + "content": "<|reserved_special_token_245|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128254": { + "content": "<|reserved_special_token_246|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128255": { + "content": "<|reserved_special_token_247|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + } + }, + "bos_token": "<|begin_of_text|>", + "chat_template": "{{- bos_token }}\n{%- if custom_tools is defined %}\n {%- set tools = custom_tools %}\n{%- endif %}\n{%- if not tools_in_user_message is defined %}\n {%- set tools_in_user_message = true %}\n{%- endif %}\n{%- if not date_string is defined %}\n {%- if strftime_now is defined %}\n {%- set date_string = strftime_now(\"%d %b %Y\") %}\n {%- else %}\n {%- set date_string = \"26 Jul 2024\" %}\n {%- endif %}\n{%- endif %}\n{%- if not tools is defined %}\n {%- set tools = 
none %}\n{%- endif %}\n\n{#- This block extracts the system message, so we can slot it into the right place. #}\n{%- if messages[0]['role'] == 'system' %}\n {%- set system_message = messages[0]['content']|trim %}\n {%- set messages = messages[1:] %}\n{%- else %}\n {%- set system_message = \"\" %}\n{%- endif %}\n\n{#- System message #}\n{{- \"<|start_header_id|>system<|end_header_id|>\\n\\n\" }}\n{%- if tools is not none %}\n {{- \"Environment: ipython\\n\" }}\n{%- endif %}\n{{- \"Cutting Knowledge Date: December 2023\\n\" }}\n{{- \"Today Date: \" + date_string + \"\\n\\n\" }}\n{%- if tools is not none and not tools_in_user_message %}\n {{- \"You have access to the following functions. To call a function, please respond with JSON for a function call.\" }}\n {{- 'Respond in the format {\"name\": function name, \"parameters\": dictionary of argument name and its value}.' }}\n {{- \"Do not use variables.\\n\\n\" }}\n {%- for t in tools %}\n {{- t | tojson(indent=4) }}\n {{- \"\\n\\n\" }}\n {%- endfor %}\n{%- endif %}\n{{- system_message }}\n{{- \"<|eot_id|>\" }}\n\n{#- Custom tools are passed in a user message with some extra guidance #}\n{%- if tools_in_user_message and not tools is none %}\n {#- Extract the first user message so we can plug it in here #}\n {%- if messages | length != 0 %}\n {%- set first_user_message = messages[0]['content']|trim %}\n {%- set messages = messages[1:] %}\n {%- else %}\n {{- raise_exception(\"Cannot put tools in the first user message when there's no first user message!\") }}\n{%- endif %}\n {{- '<|start_header_id|>user<|end_header_id|>\\n\\n' -}}\n {{- \"Given the following functions, please respond with a JSON for a function call \" }}\n {{- \"with its proper arguments that best answers the given prompt.\\n\\n\" }}\n {{- 'Respond in the format {\"name\": function name, \"parameters\": dictionary of argument name and its value}.' 
}}\n {{- \"Do not use variables.\\n\\n\" }}\n {%- for t in tools %}\n {{- t | tojson(indent=4) }}\n {{- \"\\n\\n\" }}\n {%- endfor %}\n {{- first_user_message + \"<|eot_id|>\"}}\n{%- endif %}\n\n{%- for message in messages %}\n {%- if not (message.role == 'ipython' or message.role == 'tool' or 'tool_calls' in message) %}\n {{- '<|start_header_id|>' + message['role'] + '<|end_header_id|>\\n\\n'+ message['content'] | trim + '<|eot_id|>' }}\n {%- elif 'tool_calls' in message %}\n {%- if not message.tool_calls|length == 1 %}\n {{- raise_exception(\"This model only supports single tool-calls at once!\") }}\n {%- endif %}\n {%- set tool_call = message.tool_calls[0].function %}\n {{- '<|start_header_id|>assistant<|end_header_id|>\\n\\n' -}}\n {{- '{\"name\": \"' + tool_call.name + '\", ' }}\n {{- '\"parameters\": ' }}\n {{- tool_call.arguments | tojson }}\n {{- \"}\" }}\n {{- \"<|eot_id|>\" }}\n {%- elif message.role == \"tool\" or message.role == \"ipython\" %}\n {{- \"<|start_header_id|>ipython<|end_header_id|>\\n\\n\" }}\n {%- if message.content is mapping or message.content is iterable %}\n {{- message.content | tojson }}\n {%- else %}\n {{- message.content }}\n {%- endif %}\n {{- \"<|eot_id|>\" }}\n {%- endif %}\n{%- endfor %}\n{%- if add_generation_prompt %}\n {{- '<|start_header_id|>assistant<|end_header_id|>\\n\\n' }}\n{%- endif %}\n", + "clean_up_tokenization_spaces": true, + "eos_token": "<|eot_id|>", + "extra_special_tokens": {}, + "model_input_names": [ + "input_ids", + "attention_mask" + ], + "model_max_length": 131072, + "pad_token": "<|eot_id|>", + "tokenizer_class": "PreTrainedTokenizerFast" +} diff --git a/checkpoint-14400/trainer_state.json b/checkpoint-14400/trainer_state.json new file mode 100644 index 0000000000000000000000000000000000000000..4d43ad651d655a1a86d2cc93151efde924b99729 --- /dev/null +++ b/checkpoint-14400/trainer_state.json @@ -0,0 +1,1426 @@ +{ + "best_metric": null, + "best_model_checkpoint": null, + "epoch": 0.24999565979757296, + "eval_steps": 1000, + "global_step": 14400, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "epoch": 1.736080970816479e-05, + "grad_norm": 10.5625, + "learning_rate": 2e-06, + "loss": 1.0, + "step": 1 + }, + { + "epoch": 0.001736080970816479, + "grad_norm": 0.1513671875, + "learning_rate": 0.0002, + "loss": 0.3311, + "step": 100 + }, + { + "epoch": 0.003472161941632958, + "grad_norm": 0.16796875, + "learning_rate": 0.0004, + "loss": 0.2169, + "step": 200 + }, + { + "epoch": 0.005208242912449436, + "grad_norm": 0.10400390625, + "learning_rate": 0.0006, + "loss": 0.2032, + "step": 300 + }, + { + "epoch": 0.006944323883265916, + "grad_norm": 0.11279296875, + "learning_rate": 0.0008, + "loss": 0.188, + "step": 400 + }, + { + "epoch": 0.008680404854082394, + "grad_norm": 0.10107421875, + "learning_rate": 0.001, + "loss": 0.1758, + "step": 500 + }, + { + "epoch": 0.010416485824898873, + "grad_norm": 0.09521484375, + "learning_rate": 0.0012, + "loss": 0.1637, + "step": 600 + }, + { + "epoch": 0.012152566795715351, + "grad_norm": 0.08154296875, + "learning_rate": 0.0014, + "loss": 0.1518, + "step": 700 + }, + { + "epoch": 0.013888647766531832, + "grad_norm": 0.08642578125, + "learning_rate": 0.0016, + "loss": 0.1485, + "step": 800 + }, + { + "epoch": 0.01562472873734831, + "grad_norm": 0.1044921875, + "learning_rate": 0.0018000000000000002, + "loss": 0.1433, + "step": 900 + }, + { + "epoch": 0.01736080970816479, + "grad_norm": 0.05419921875, + "learning_rate": 
0.002, + "loss": 0.139, + "step": 1000 + }, + { + "epoch": 0.01736080970816479, + "eval_covost2-en-de_loss": 1.896493673324585, + "eval_covost2-en-de_model_preparation_time": 0.0057, + "eval_covost2-en-de_runtime": 9.8697, + "eval_covost2-en-de_samples_per_second": 6.485, + "eval_covost2-en-de_steps_per_second": 0.811, + "step": 1000 + }, + { + "epoch": 0.01736080970816479, + "eval_covost2-zh-en_loss": 3.1452860832214355, + "eval_covost2-zh-en_model_preparation_time": 0.0057, + "eval_covost2-zh-en_runtime": 8.3732, + "eval_covost2-zh-en_samples_per_second": 7.643, + "eval_covost2-zh-en_steps_per_second": 0.955, + "step": 1000 + }, + { + "epoch": 0.01736080970816479, + "eval_peoplespeech-clean-transcription_loss": 3.2206106185913086, + "eval_peoplespeech-clean-transcription_model_preparation_time": 0.0057, + "eval_peoplespeech-clean-transcription_runtime": 9.6941, + "eval_peoplespeech-clean-transcription_samples_per_second": 6.602, + "eval_peoplespeech-clean-transcription_steps_per_second": 0.825, + "step": 1000 + }, + { + "epoch": 0.01909689067898127, + "grad_norm": 0.059814453125, + "learning_rate": 0.001999725185109816, + "loss": 0.1334, + "step": 1100 + }, + { + "epoch": 0.020832971649797746, + "grad_norm": 0.07373046875, + "learning_rate": 0.0019989008914857113, + "loss": 0.1288, + "step": 1200 + }, + { + "epoch": 0.022569052620614226, + "grad_norm": 0.049560546875, + "learning_rate": 0.00199752757218401, + "loss": 0.1262, + "step": 1300 + }, + { + "epoch": 0.024305133591430703, + "grad_norm": 0.0517578125, + "learning_rate": 0.001995605982021898, + "loss": 0.1222, + "step": 1400 + }, + { + "epoch": 0.026041214562247183, + "grad_norm": 0.058349609375, + "learning_rate": 0.0019931371771625545, + "loss": 0.1193, + "step": 1500 + }, + { + "epoch": 0.027777295533063663, + "grad_norm": 0.0498046875, + "learning_rate": 0.001990122514534651, + "loss": 0.1196, + "step": 1600 + }, + { + "epoch": 0.02951337650388014, + "grad_norm": 0.05517578125, + "learning_rate": 0.0019865636510865464, + "loss": 0.115, + "step": 1700 + }, + { + "epoch": 0.03124945747469662, + "grad_norm": 0.044677734375, + "learning_rate": 0.001982462542875576, + "loss": 0.115, + "step": 1800 + }, + { + "epoch": 0.0329855384455131, + "grad_norm": 0.05419921875, + "learning_rate": 0.001977821443992945, + "loss": 0.1125, + "step": 1900 + }, + { + "epoch": 0.03472161941632958, + "grad_norm": 0.047119140625, + "learning_rate": 0.001972642905324813, + "loss": 0.1094, + "step": 2000 + }, + { + "epoch": 0.03472161941632958, + "eval_covost2-en-de_loss": 1.6700351238250732, + "eval_covost2-en-de_model_preparation_time": 0.0057, + "eval_covost2-en-de_runtime": 8.1279, + "eval_covost2-en-de_samples_per_second": 7.874, + "eval_covost2-en-de_steps_per_second": 0.984, + "step": 2000 + }, + { + "epoch": 0.03472161941632958, + "eval_covost2-zh-en_loss": 3.093877077102661, + "eval_covost2-zh-en_model_preparation_time": 0.0057, + "eval_covost2-zh-en_runtime": 8.1488, + "eval_covost2-zh-en_samples_per_second": 7.854, + "eval_covost2-zh-en_steps_per_second": 0.982, + "step": 2000 + }, + { + "epoch": 0.03472161941632958, + "eval_peoplespeech-clean-transcription_loss": 2.478968620300293, + "eval_peoplespeech-clean-transcription_model_preparation_time": 0.0057, + "eval_peoplespeech-clean-transcription_runtime": 9.5507, + "eval_peoplespeech-clean-transcription_samples_per_second": 6.701, + "eval_peoplespeech-clean-transcription_steps_per_second": 0.838, + "step": 2000 + }, + { + "epoch": 0.036457700387146054, + "grad_norm": 0.048583984375, + 
"learning_rate": 0.0019669297731502505, + "loss": 0.1077, + "step": 2100 + }, + { + "epoch": 0.03819378135796254, + "grad_norm": 0.054443359375, + "learning_rate": 0.00196068518757684, + "loss": 0.1069, + "step": 2200 + }, + { + "epoch": 0.039929862328779014, + "grad_norm": 0.047119140625, + "learning_rate": 0.001953912580814779, + "loss": 0.1043, + "step": 2300 + }, + { + "epoch": 0.04166594329959549, + "grad_norm": 0.044921875, + "learning_rate": 0.0019466156752904343, + "loss": 0.1035, + "step": 2400 + }, + { + "epoch": 0.043402024270411975, + "grad_norm": 0.050537109375, + "learning_rate": 0.0019387984816003866, + "loss": 0.1033, + "step": 2500 + }, + { + "epoch": 0.04513810524122845, + "grad_norm": 0.056396484375, + "learning_rate": 0.0019304652963070869, + "loss": 0.102, + "step": 2600 + }, + { + "epoch": 0.04687418621204493, + "grad_norm": 0.046875, + "learning_rate": 0.0019216206995773372, + "loss": 0.0998, + "step": 2700 + }, + { + "epoch": 0.048610267182861405, + "grad_norm": 0.042236328125, + "learning_rate": 0.0019122695526648968, + "loss": 0.1002, + "step": 2800 + }, + { + "epoch": 0.05034634815367789, + "grad_norm": 0.04638671875, + "learning_rate": 0.0019024169952385887, + "loss": 0.0978, + "step": 2900 + }, + { + "epoch": 0.052082429124494366, + "grad_norm": 0.05126953125, + "learning_rate": 0.0018920684425573864, + "loss": 0.097, + "step": 3000 + }, + { + "epoch": 0.052082429124494366, + "eval_covost2-en-de_loss": 1.749150276184082, + "eval_covost2-en-de_model_preparation_time": 0.0057, + "eval_covost2-en-de_runtime": 8.1948, + "eval_covost2-en-de_samples_per_second": 7.81, + "eval_covost2-en-de_steps_per_second": 0.976, + "step": 3000 + }, + { + "epoch": 0.052082429124494366, + "eval_covost2-zh-en_loss": 3.198117971420288, + "eval_covost2-zh-en_model_preparation_time": 0.0057, + "eval_covost2-zh-en_runtime": 8.1979, + "eval_covost2-zh-en_samples_per_second": 7.807, + "eval_covost2-zh-en_steps_per_second": 0.976, + "step": 3000 + }, + { + "epoch": 0.052082429124494366, + "eval_peoplespeech-clean-transcription_loss": 2.345036506652832, + "eval_peoplespeech-clean-transcription_model_preparation_time": 0.0057, + "eval_peoplespeech-clean-transcription_runtime": 11.4402, + "eval_peoplespeech-clean-transcription_samples_per_second": 5.594, + "eval_peoplespeech-clean-transcription_steps_per_second": 0.699, + "step": 3000 + }, + { + "epoch": 0.05381851009531084, + "grad_norm": 0.06494140625, + "learning_rate": 0.0018812295824940284, + "loss": 0.0955, + "step": 3100 + }, + { + "epoch": 0.055554591066127326, + "grad_norm": 0.044677734375, + "learning_rate": 0.0018699063724087904, + "loss": 0.0951, + "step": 3200 + }, + { + "epoch": 0.0572906720369438, + "grad_norm": 0.0390625, + "learning_rate": 0.0018581050358751443, + "loss": 0.0947, + "step": 3300 + }, + { + "epoch": 0.05902675300776028, + "grad_norm": 0.056396484375, + "learning_rate": 0.0018458320592590974, + "loss": 0.0939, + "step": 3400 + }, + { + "epoch": 0.060762833978576763, + "grad_norm": 0.047119140625, + "learning_rate": 0.0018330941881540914, + "loss": 0.0941, + "step": 3500 + }, + { + "epoch": 0.06249891494939324, + "grad_norm": 0.046630859375, + "learning_rate": 0.0018198984236734246, + "loss": 0.0927, + "step": 3600 + }, + { + "epoch": 0.06423499592020972, + "grad_norm": 0.055419921875, + "learning_rate": 0.0018062520186022297, + "loss": 0.0948, + "step": 3700 + }, + { + "epoch": 0.0659710768910262, + "grad_norm": 0.046142578125, + "learning_rate": 0.0017921624734111292, + "loss": 0.09, + "step": 3800 + }, + { + 
"epoch": 0.06770715786184267, + "grad_norm": 0.04736328125, + "learning_rate": 0.001777637532133752, + "loss": 0.0926, + "step": 3900 + }, + { + "epoch": 0.06944323883265915, + "grad_norm": 0.048828125, + "learning_rate": 0.0017626851781103819, + "loss": 0.0906, + "step": 4000 + }, + { + "epoch": 0.06944323883265915, + "eval_covost2-en-de_loss": 1.7936017513275146, + "eval_covost2-en-de_model_preparation_time": 0.0057, + "eval_covost2-en-de_runtime": 8.0356, + "eval_covost2-en-de_samples_per_second": 7.965, + "eval_covost2-en-de_steps_per_second": 0.996, + "step": 4000 + }, + { + "epoch": 0.06944323883265915, + "eval_covost2-zh-en_loss": 3.2699265480041504, + "eval_covost2-zh-en_model_preparation_time": 0.0057, + "eval_covost2-zh-en_runtime": 9.5779, + "eval_covost2-zh-en_samples_per_second": 6.682, + "eval_covost2-zh-en_steps_per_second": 0.835, + "step": 4000 + }, + { + "epoch": 0.06944323883265915, + "eval_peoplespeech-clean-transcription_loss": 2.3380110263824463, + "eval_peoplespeech-clean-transcription_model_preparation_time": 0.0057, + "eval_peoplespeech-clean-transcription_runtime": 9.5943, + "eval_peoplespeech-clean-transcription_samples_per_second": 6.671, + "eval_peoplespeech-clean-transcription_steps_per_second": 0.834, + "step": 4000 + }, + { + "epoch": 0.07117931980347564, + "grad_norm": 0.041259765625, + "learning_rate": 0.001747313629600077, + "loss": 0.0926, + "step": 4100 + }, + { + "epoch": 0.07291540077429211, + "grad_norm": 0.05322265625, + "learning_rate": 0.001731531335263669, + "loss": 0.0907, + "step": 4200 + }, + { + "epoch": 0.07465148174510859, + "grad_norm": 0.05126953125, + "learning_rate": 0.0017153469695201276, + "loss": 0.0898, + "step": 4300 + }, + { + "epoch": 0.07638756271592508, + "grad_norm": 0.061767578125, + "learning_rate": 0.0016987694277788418, + "loss": 0.0876, + "step": 4400 + }, + { + "epoch": 0.07812364368674155, + "grad_norm": 0.042724609375, + "learning_rate": 0.001681807821550438, + "loss": 0.0874, + "step": 4500 + }, + { + "epoch": 0.07985972465755803, + "grad_norm": 0.05126953125, + "learning_rate": 0.0016644714734388218, + "loss": 0.0865, + "step": 4600 + }, + { + "epoch": 0.08159580562837451, + "grad_norm": 0.042724609375, + "learning_rate": 0.0016467699120171987, + "loss": 0.0866, + "step": 4700 + }, + { + "epoch": 0.08333188659919098, + "grad_norm": 0.0419921875, + "learning_rate": 0.001628712866590885, + "loss": 0.0864, + "step": 4800 + }, + { + "epoch": 0.08506796757000747, + "grad_norm": 0.051513671875, + "learning_rate": 0.0016103102618497923, + "loss": 0.0862, + "step": 4900 + }, + { + "epoch": 0.08680404854082395, + "grad_norm": 0.052734375, + "learning_rate": 0.0015915722124135226, + "loss": 0.0855, + "step": 5000 + }, + { + "epoch": 0.08680404854082395, + "eval_covost2-en-de_loss": 1.7862941026687622, + "eval_covost2-en-de_model_preparation_time": 0.0057, + "eval_covost2-en-de_runtime": 8.2861, + "eval_covost2-en-de_samples_per_second": 7.724, + "eval_covost2-en-de_steps_per_second": 0.965, + "step": 5000 + }, + { + "epoch": 0.08680404854082395, + "eval_covost2-zh-en_loss": 3.33290433883667, + "eval_covost2-zh-en_model_preparation_time": 0.0057, + "eval_covost2-zh-en_runtime": 8.4063, + "eval_covost2-zh-en_samples_per_second": 7.613, + "eval_covost2-zh-en_steps_per_second": 0.952, + "step": 5000 + }, + { + "epoch": 0.08680404854082395, + "eval_peoplespeech-clean-transcription_loss": 2.2601113319396973, + "eval_peoplespeech-clean-transcription_model_preparation_time": 0.0057, + "eval_peoplespeech-clean-transcription_runtime": 
9.4946, + "eval_peoplespeech-clean-transcription_samples_per_second": 6.741, + "eval_peoplespeech-clean-transcription_steps_per_second": 0.843, + "step": 5000 + }, + { + "epoch": 0.08854012951164042, + "grad_norm": 0.053466796875, + "learning_rate": 0.001572509017272072, + "loss": 0.0872, + "step": 5100 + }, + { + "epoch": 0.0902762104824569, + "grad_norm": 0.044189453125, + "learning_rate": 0.0015531311541251993, + "loss": 0.0859, + "step": 5200 + }, + { + "epoch": 0.09201229145327339, + "grad_norm": 0.052978515625, + "learning_rate": 0.0015334492736235703, + "loss": 0.085, + "step": 5300 + }, + { + "epoch": 0.09374837242408986, + "grad_norm": 0.04833984375, + "learning_rate": 0.0015134741935148419, + "loss": 0.0844, + "step": 5400 + }, + { + "epoch": 0.09548445339490634, + "grad_norm": 0.047119140625, + "learning_rate": 0.0014932168926979072, + "loss": 0.0844, + "step": 5500 + }, + { + "epoch": 0.09722053436572281, + "grad_norm": 0.05029296875, + "learning_rate": 0.0014726885051885652, + "loss": 0.0856, + "step": 5600 + }, + { + "epoch": 0.0989566153365393, + "grad_norm": 0.049560546875, + "learning_rate": 0.0014519003139999338, + "loss": 0.0841, + "step": 5700 + }, + { + "epoch": 0.10069269630735578, + "grad_norm": 0.056884765625, + "learning_rate": 0.0014308637449409706, + "loss": 0.0841, + "step": 5800 + }, + { + "epoch": 0.10242877727817225, + "grad_norm": 0.041015625, + "learning_rate": 0.0014095903603365066, + "loss": 0.0825, + "step": 5900 + }, + { + "epoch": 0.10416485824898873, + "grad_norm": 0.048583984375, + "learning_rate": 0.0013880918526722496, + "loss": 0.0828, + "step": 6000 + }, + { + "epoch": 0.10416485824898873, + "eval_covost2-en-de_loss": 1.8097732067108154, + "eval_covost2-en-de_model_preparation_time": 0.0057, + "eval_covost2-en-de_runtime": 8.2052, + "eval_covost2-en-de_samples_per_second": 7.8, + "eval_covost2-en-de_steps_per_second": 0.975, + "step": 6000 + }, + { + "epoch": 0.10416485824898873, + "eval_covost2-zh-en_loss": 3.331326961517334, + "eval_covost2-zh-en_model_preparation_time": 0.0057, + "eval_covost2-zh-en_runtime": 8.2653, + "eval_covost2-zh-en_samples_per_second": 7.743, + "eval_covost2-zh-en_steps_per_second": 0.968, + "step": 6000 + }, + { + "epoch": 0.10416485824898873, + "eval_peoplespeech-clean-transcription_loss": 2.250232219696045, + "eval_peoplespeech-clean-transcription_model_preparation_time": 0.0057, + "eval_peoplespeech-clean-transcription_runtime": 9.4708, + "eval_peoplespeech-clean-transcription_samples_per_second": 6.758, + "eval_peoplespeech-clean-transcription_steps_per_second": 0.845, + "step": 6000 + }, + { + "epoch": 0.10590093921980522, + "grad_norm": 0.04443359375, + "learning_rate": 0.0013663800381682463, + "loss": 0.0819, + "step": 6100 + }, + { + "epoch": 0.10763702019062169, + "grad_norm": 0.05419921875, + "learning_rate": 0.0013444668502843329, + "loss": 0.08, + "step": 6200 + }, + { + "epoch": 0.10937310116143817, + "grad_norm": 0.0478515625, + "learning_rate": 0.0013223643331611537, + "loss": 0.0805, + "step": 6300 + }, + { + "epoch": 0.11110918213225465, + "grad_norm": 0.051513671875, + "learning_rate": 0.001300084635000341, + "loss": 0.0799, + "step": 6400 + }, + { + "epoch": 0.11284526310307112, + "grad_norm": 0.0498046875, + "learning_rate": 0.0012776400013875004, + "loss": 0.0807, + "step": 6500 + }, + { + "epoch": 0.1145813440738876, + "grad_norm": 0.050537109375, + "learning_rate": 0.0012550427685616766, + "loss": 0.0799, + "step": 6600 + }, + { + "epoch": 0.11631742504470409, + "grad_norm": 0.05029296875, + 
"learning_rate": 0.0012323053566349834, + "loss": 0.0802, + "step": 6700 + }, + { + "epoch": 0.11805350601552056, + "grad_norm": 0.047119140625, + "learning_rate": 0.0012094402627661448, + "loss": 0.0796, + "step": 6800 + }, + { + "epoch": 0.11978958698633704, + "grad_norm": 0.044677734375, + "learning_rate": 0.0011864600542916813, + "loss": 0.0784, + "step": 6900 + }, + { + "epoch": 0.12152566795715353, + "grad_norm": 0.0478515625, + "learning_rate": 0.0011633773618185302, + "loss": 0.0808, + "step": 7000 + }, + { + "epoch": 0.12152566795715353, + "eval_covost2-en-de_loss": 1.7786378860473633, + "eval_covost2-en-de_model_preparation_time": 0.0057, + "eval_covost2-en-de_runtime": 8.0291, + "eval_covost2-en-de_samples_per_second": 7.971, + "eval_covost2-en-de_steps_per_second": 0.996, + "step": 7000 + }, + { + "epoch": 0.12152566795715353, + "eval_covost2-zh-en_loss": 3.273571252822876, + "eval_covost2-zh-en_model_preparation_time": 0.0057, + "eval_covost2-zh-en_runtime": 8.3234, + "eval_covost2-zh-en_samples_per_second": 7.689, + "eval_covost2-zh-en_steps_per_second": 0.961, + "step": 7000 + }, + { + "epoch": 0.12152566795715353, + "eval_peoplespeech-clean-transcription_loss": 2.2290830612182617, + "eval_peoplespeech-clean-transcription_model_preparation_time": 0.0057, + "eval_peoplespeech-clean-transcription_runtime": 9.7693, + "eval_peoplespeech-clean-transcription_samples_per_second": 6.551, + "eval_peoplespeech-clean-transcription_steps_per_second": 0.819, + "step": 7000 + }, + { + "epoch": 0.12326174892797, + "grad_norm": 0.0478515625, + "learning_rate": 0.0011402048722818862, + "loss": 0.0786, + "step": 7100 + }, + { + "epoch": 0.12499782989878648, + "grad_norm": 0.049560546875, + "learning_rate": 0.0011169553219720827, + "loss": 0.0795, + "step": 7200 + }, + { + "epoch": 0.12673391086960295, + "grad_norm": 0.04736328125, + "learning_rate": 0.001093641489534351, + "loss": 0.0787, + "step": 7300 + }, + { + "epoch": 0.12846999184041943, + "grad_norm": 0.054931640625, + "learning_rate": 0.001070276188945293, + "loss": 0.0784, + "step": 7400 + }, + { + "epoch": 0.13020607281123592, + "grad_norm": 0.0478515625, + "learning_rate": 0.00104687226246994, + "loss": 0.0787, + "step": 7500 + }, + { + "epoch": 0.1319421537820524, + "grad_norm": 0.048828125, + "learning_rate": 0.0010234425736032607, + "loss": 0.0788, + "step": 7600 + }, + { + "epoch": 0.13367823475286889, + "grad_norm": 0.058837890625, + "learning_rate": 0.001, + "loss": 0.0769, + "step": 7700 + }, + { + "epoch": 0.13541431572368534, + "grad_norm": 0.055908203125, + "learning_rate": 0.0009765574263967396, + "loss": 0.077, + "step": 7800 + }, + { + "epoch": 0.13715039669450182, + "grad_norm": 0.05322265625, + "learning_rate": 0.0009531277375300599, + "loss": 0.0764, + "step": 7900 + }, + { + "epoch": 0.1388864776653183, + "grad_norm": 0.04833984375, + "learning_rate": 0.0009297238110547074, + "loss": 0.0764, + "step": 8000 + }, + { + "epoch": 0.1388864776653183, + "eval_covost2-en-de_loss": 1.7951624393463135, + "eval_covost2-en-de_model_preparation_time": 0.0057, + "eval_covost2-en-de_runtime": 8.1477, + "eval_covost2-en-de_samples_per_second": 7.855, + "eval_covost2-en-de_steps_per_second": 0.982, + "step": 8000 + }, + { + "epoch": 0.1388864776653183, + "eval_covost2-zh-en_loss": 3.301699161529541, + "eval_covost2-zh-en_model_preparation_time": 0.0057, + "eval_covost2-zh-en_runtime": 8.8691, + "eval_covost2-zh-en_samples_per_second": 7.216, + "eval_covost2-zh-en_steps_per_second": 0.902, + "step": 8000 + }, + { + "epoch": 
0.1388864776653183, + "eval_peoplespeech-clean-transcription_loss": 2.1518499851226807, + "eval_peoplespeech-clean-transcription_model_preparation_time": 0.0057, + "eval_peoplespeech-clean-transcription_runtime": 9.5239, + "eval_peoplespeech-clean-transcription_samples_per_second": 6.72, + "eval_peoplespeech-clean-transcription_steps_per_second": 0.84, + "step": 8000 + }, + { + "epoch": 0.1406225586361348, + "grad_norm": 0.053955078125, + "learning_rate": 0.0009063585104656494, + "loss": 0.0762, + "step": 8100 + }, + { + "epoch": 0.14235863960695128, + "grad_norm": 0.0654296875, + "learning_rate": 0.0008830446780279176, + "loss": 0.0769, + "step": 8200 + }, + { + "epoch": 0.14409472057776776, + "grad_norm": 0.046875, + "learning_rate": 0.0008597951277181142, + "loss": 0.0751, + "step": 8300 + }, + { + "epoch": 0.14583080154858422, + "grad_norm": 0.054443359375, + "learning_rate": 0.0008366226381814697, + "loss": 0.0765, + "step": 8400 + }, + { + "epoch": 0.1475668825194007, + "grad_norm": 0.052490234375, + "learning_rate": 0.000813539945708319, + "loss": 0.0763, + "step": 8500 + }, + { + "epoch": 0.14930296349021718, + "grad_norm": 0.068359375, + "learning_rate": 0.0007905597372338558, + "loss": 0.0744, + "step": 8600 + }, + { + "epoch": 0.15103904446103367, + "grad_norm": 0.055908203125, + "learning_rate": 0.0007676946433650169, + "loss": 0.0737, + "step": 8700 + }, + { + "epoch": 0.15277512543185015, + "grad_norm": 0.055908203125, + "learning_rate": 0.0007449572314383236, + "loss": 0.0758, + "step": 8800 + }, + { + "epoch": 0.1545112064026666, + "grad_norm": 0.051513671875, + "learning_rate": 0.0007223599986124993, + "loss": 0.0753, + "step": 8900 + }, + { + "epoch": 0.1562472873734831, + "grad_norm": 0.05517578125, + "learning_rate": 0.0006999153649996595, + "loss": 0.0736, + "step": 9000 + }, + { + "epoch": 0.1562472873734831, + "eval_covost2-en-de_loss": 1.7736568450927734, + "eval_covost2-en-de_model_preparation_time": 0.0057, + "eval_covost2-en-de_runtime": 8.2798, + "eval_covost2-en-de_samples_per_second": 7.73, + "eval_covost2-en-de_steps_per_second": 0.966, + "step": 9000 + }, + { + "epoch": 0.1562472873734831, + "eval_covost2-zh-en_loss": 3.2736916542053223, + "eval_covost2-zh-en_model_preparation_time": 0.0057, + "eval_covost2-zh-en_runtime": 8.6328, + "eval_covost2-zh-en_samples_per_second": 7.414, + "eval_covost2-zh-en_steps_per_second": 0.927, + "step": 9000 + }, + { + "epoch": 0.1562472873734831, + "eval_peoplespeech-clean-transcription_loss": 2.169971227645874, + "eval_peoplespeech-clean-transcription_model_preparation_time": 0.0057, + "eval_peoplespeech-clean-transcription_runtime": 10.7684, + "eval_peoplespeech-clean-transcription_samples_per_second": 5.943, + "eval_peoplespeech-clean-transcription_steps_per_second": 0.743, + "step": 9000 + }, + { + "epoch": 0.15798336834429957, + "grad_norm": 0.06201171875, + "learning_rate": 0.0006776356668388464, + "loss": 0.073, + "step": 9100 + }, + { + "epoch": 0.15971944931511606, + "grad_norm": 0.0654296875, + "learning_rate": 0.0006555331497156671, + "loss": 0.0753, + "step": 9200 + }, + { + "epoch": 0.16145553028593254, + "grad_norm": 0.056640625, + "learning_rate": 0.0006336199618317538, + "loss": 0.0754, + "step": 9300 + }, + { + "epoch": 0.16319161125674903, + "grad_norm": 0.06005859375, + "learning_rate": 0.0006119081473277501, + "loss": 0.0736, + "step": 9400 + }, + { + "epoch": 0.16492769222756548, + "grad_norm": 0.059326171875, + "learning_rate": 0.0005904096396634935, + "loss": 0.0721, + "step": 9500 + }, + { + "epoch": 
0.16666377319838196, + "grad_norm": 0.06201171875, + "learning_rate": 0.0005691362550590297, + "loss": 0.0717, + "step": 9600 + }, + { + "epoch": 0.16839985416919845, + "grad_norm": 0.051025390625, + "learning_rate": 0.0005480996860000663, + "loss": 0.0738, + "step": 9700 + }, + { + "epoch": 0.17013593514001493, + "grad_norm": 0.06494140625, + "learning_rate": 0.0005273114948114346, + "loss": 0.0737, + "step": 9800 + }, + { + "epoch": 0.17187201611083142, + "grad_norm": 0.058837890625, + "learning_rate": 0.0005067831073020928, + "loss": 0.0711, + "step": 9900 + }, + { + "epoch": 0.1736080970816479, + "grad_norm": 0.050537109375, + "learning_rate": 0.00048652580648515787, + "loss": 0.0722, + "step": 10000 + }, + { + "epoch": 0.1736080970816479, + "eval_covost2-en-de_loss": 1.768043875694275, + "eval_covost2-en-de_model_preparation_time": 0.0057, + "eval_covost2-en-de_runtime": 8.2025, + "eval_covost2-en-de_samples_per_second": 7.802, + "eval_covost2-en-de_steps_per_second": 0.975, + "step": 10000 + }, + { + "epoch": 0.1736080970816479, + "eval_covost2-zh-en_loss": 3.288457155227661, + "eval_covost2-zh-en_model_preparation_time": 0.0057, + "eval_covost2-zh-en_runtime": 8.5315, + "eval_covost2-zh-en_samples_per_second": 7.502, + "eval_covost2-zh-en_steps_per_second": 0.938, + "step": 10000 + }, + { + "epoch": 0.1736080970816479, + "eval_peoplespeech-clean-transcription_loss": 2.099651336669922, + "eval_peoplespeech-clean-transcription_model_preparation_time": 0.0057, + "eval_peoplespeech-clean-transcription_runtime": 9.7081, + "eval_peoplespeech-clean-transcription_samples_per_second": 6.592, + "eval_peoplespeech-clean-transcription_steps_per_second": 0.824, + "step": 10000 + }, + { + "epoch": 0.17534417805246436, + "grad_norm": 0.059326171875, + "learning_rate": 0.0004665507263764299, + "loss": 0.0717, + "step": 10100 + }, + { + "epoch": 0.17708025902328084, + "grad_norm": 0.06103515625, + "learning_rate": 0.0004468688458748006, + "loss": 0.0729, + "step": 10200 + }, + { + "epoch": 0.17881633999409732, + "grad_norm": 0.051513671875, + "learning_rate": 0.0004274909827279283, + "loss": 0.0711, + "step": 10300 + }, + { + "epoch": 0.1805524209649138, + "grad_norm": 0.059326171875, + "learning_rate": 0.0004084277875864776, + "loss": 0.0712, + "step": 10400 + }, + { + "epoch": 0.1822885019357303, + "grad_norm": 0.062255859375, + "learning_rate": 0.00038968973815020803, + "loss": 0.0708, + "step": 10500 + }, + { + "epoch": 0.18402458290654677, + "grad_norm": 0.06640625, + "learning_rate": 0.00037128713340911534, + "loss": 0.0716, + "step": 10600 + }, + { + "epoch": 0.18576066387736323, + "grad_norm": 0.0517578125, + "learning_rate": 0.00035323008798280133, + "loss": 0.0728, + "step": 10700 + }, + { + "epoch": 0.1874967448481797, + "grad_norm": 0.076171875, + "learning_rate": 0.00033552852656117837, + "loss": 0.0711, + "step": 10800 + }, + { + "epoch": 0.1892328258189962, + "grad_norm": 0.06494140625, + "learning_rate": 0.00031819217844956217, + "loss": 0.0701, + "step": 10900 + }, + { + "epoch": 0.19096890678981268, + "grad_norm": 0.054443359375, + "learning_rate": 0.00030123057222115836, + "loss": 0.0705, + "step": 11000 + }, + { + "epoch": 0.19096890678981268, + "eval_covost2-en-de_loss": 1.7685788869857788, + "eval_covost2-en-de_model_preparation_time": 0.0057, + "eval_covost2-en-de_runtime": 8.4174, + "eval_covost2-en-de_samples_per_second": 7.603, + "eval_covost2-en-de_steps_per_second": 0.95, + "step": 11000 + }, + { + "epoch": 0.19096890678981268, + "eval_covost2-zh-en_loss": 
3.286205530166626, + "eval_covost2-zh-en_model_preparation_time": 0.0057, + "eval_covost2-zh-en_runtime": 11.2405, + "eval_covost2-zh-en_samples_per_second": 5.694, + "eval_covost2-zh-en_steps_per_second": 0.712, + "step": 11000 + }, + { + "epoch": 0.19096890678981268, + "eval_peoplespeech-clean-transcription_loss": 2.1252198219299316, + "eval_peoplespeech-clean-transcription_model_preparation_time": 0.0057, + "eval_peoplespeech-clean-transcription_runtime": 9.3932, + "eval_peoplespeech-clean-transcription_samples_per_second": 6.813, + "eval_peoplespeech-clean-transcription_steps_per_second": 0.852, + "step": 11000 + }, + { + "epoch": 0.19270498776062917, + "grad_norm": 0.076171875, + "learning_rate": 0.0002846530304798727, + "loss": 0.0709, + "step": 11100 + }, + { + "epoch": 0.19444106873144562, + "grad_norm": 0.07275390625, + "learning_rate": 0.00026846866473633125, + "loss": 0.0717, + "step": 11200 + }, + { + "epoch": 0.1961771497022621, + "grad_norm": 0.0498046875, + "learning_rate": 0.00025268637039992293, + "loss": 0.0699, + "step": 11300 + }, + { + "epoch": 0.1979132306730786, + "grad_norm": 0.046875, + "learning_rate": 0.00023731482188961818, + "loss": 0.0712, + "step": 11400 + }, + { + "epoch": 0.19964931164389507, + "grad_norm": 0.050048828125, + "learning_rate": 0.00022236246786624792, + "loss": 0.0714, + "step": 11500 + }, + { + "epoch": 0.20138539261471156, + "grad_norm": 0.04931640625, + "learning_rate": 0.00020783752658887068, + "loss": 0.071, + "step": 11600 + }, + { + "epoch": 0.20312147358552804, + "grad_norm": 0.06298828125, + "learning_rate": 0.0001937479813977703, + "loss": 0.0706, + "step": 11700 + }, + { + "epoch": 0.2048575545563445, + "grad_norm": 0.05712890625, + "learning_rate": 0.00018010157632657541, + "loss": 0.07, + "step": 11800 + }, + { + "epoch": 0.20659363552716098, + "grad_norm": 0.068359375, + "learning_rate": 0.00016690581184590858, + "loss": 0.0708, + "step": 11900 + }, + { + "epoch": 0.20832971649797746, + "grad_norm": 0.05908203125, + "learning_rate": 0.00015416794074090258, + "loss": 0.069, + "step": 12000 + }, + { + "epoch": 0.20832971649797746, + "eval_covost2-en-de_loss": 1.7618954181671143, + "eval_covost2-en-de_model_preparation_time": 0.0057, + "eval_covost2-en-de_runtime": 8.2954, + "eval_covost2-en-de_samples_per_second": 7.715, + "eval_covost2-en-de_steps_per_second": 0.964, + "step": 12000 + }, + { + "epoch": 0.20832971649797746, + "eval_covost2-zh-en_loss": 3.287311553955078, + "eval_covost2-zh-en_model_preparation_time": 0.0057, + "eval_covost2-zh-en_runtime": 8.0269, + "eval_covost2-zh-en_samples_per_second": 7.973, + "eval_covost2-zh-en_steps_per_second": 0.997, + "step": 12000 + }, + { + "epoch": 0.20832971649797746, + "eval_peoplespeech-clean-transcription_loss": 2.119732141494751, + "eval_peoplespeech-clean-transcription_model_preparation_time": 0.0057, + "eval_peoplespeech-clean-transcription_runtime": 9.4528, + "eval_peoplespeech-clean-transcription_samples_per_second": 6.77, + "eval_peoplespeech-clean-transcription_steps_per_second": 0.846, + "step": 12000 + }, + { + "epoch": 0.21006579746879395, + "grad_norm": 0.059814453125, + "learning_rate": 0.00014189496412485593, + "loss": 0.0703, + "step": 12100 + }, + { + "epoch": 0.21180187843961043, + "grad_norm": 0.053466796875, + "learning_rate": 0.00013009362759120978, + "loss": 0.0713, + "step": 12200 + }, + { + "epoch": 0.21353795941042691, + "grad_norm": 0.0673828125, + "learning_rate": 0.00011877041750597173, + "loss": 0.0694, + "step": 12300 + }, + { + "epoch": 
0.21527404038124337, + "grad_norm": 0.060302734375, + "learning_rate": 0.00010793155744261352, + "loss": 0.07, + "step": 12400 + }, + { + "epoch": 0.21701012135205985, + "grad_norm": 0.0390625, + "learning_rate": 9.758300476141169e-05, + "loss": 0.0675, + "step": 12500 + }, + { + "epoch": 0.21874620232287634, + "grad_norm": 0.058349609375, + "learning_rate": 8.773044733510338e-05, + "loss": 0.0699, + "step": 12600 + }, + { + "epoch": 0.22048228329369282, + "grad_norm": 0.04931640625, + "learning_rate": 7.837930042266262e-05, + "loss": 0.0708, + "step": 12700 + }, + { + "epoch": 0.2222183642645093, + "grad_norm": 0.054931640625, + "learning_rate": 6.953470369291348e-05, + "loss": 0.0701, + "step": 12800 + }, + { + "epoch": 0.2239544452353258, + "grad_norm": 0.0556640625, + "learning_rate": 6.120151839961363e-05, + "loss": 0.0703, + "step": 12900 + }, + { + "epoch": 0.22569052620614224, + "grad_norm": 0.0654296875, + "learning_rate": 5.338432470956589e-05, + "loss": 0.0707, + "step": 13000 + }, + { + "epoch": 0.22569052620614224, + "eval_covost2-en-de_loss": 1.7618814706802368, + "eval_covost2-en-de_model_preparation_time": 0.0057, + "eval_covost2-en-de_runtime": 8.2896, + "eval_covost2-en-de_samples_per_second": 7.721, + "eval_covost2-en-de_steps_per_second": 0.965, + "step": 13000 + }, + { + "epoch": 0.22569052620614224, + "eval_covost2-zh-en_loss": 3.287533760070801, + "eval_covost2-zh-en_model_preparation_time": 0.0057, + "eval_covost2-zh-en_runtime": 8.1323, + "eval_covost2-zh-en_samples_per_second": 7.87, + "eval_covost2-zh-en_steps_per_second": 0.984, + "step": 13000 + }, + { + "epoch": 0.22569052620614224, + "eval_peoplespeech-clean-transcription_loss": 2.1182146072387695, + "eval_peoplespeech-clean-transcription_model_preparation_time": 0.0057, + "eval_peoplespeech-clean-transcription_runtime": 9.6791, + "eval_peoplespeech-clean-transcription_samples_per_second": 6.612, + "eval_peoplespeech-clean-transcription_steps_per_second": 0.827, + "step": 13000 + }, + { + "epoch": 0.22742660717695873, + "grad_norm": 0.056396484375, + "learning_rate": 4.6087419185220966e-05, + "loss": 0.0698, + "step": 13100 + }, + { + "epoch": 0.2291626881477752, + "grad_norm": 0.0634765625, + "learning_rate": 3.931481242315993e-05, + "loss": 0.0709, + "step": 13200 + }, + { + "epoch": 0.2308987691185917, + "grad_norm": 0.0625, + "learning_rate": 3.307022684974936e-05, + "loss": 0.071, + "step": 13300 + }, + { + "epoch": 0.23263485008940818, + "grad_norm": 0.05712890625, + "learning_rate": 2.7357094675186987e-05, + "loss": 0.0704, + "step": 13400 + }, + { + "epoch": 0.23437093106022464, + "grad_norm": 0.068359375, + "learning_rate": 2.2178556007054874e-05, + "loss": 0.0704, + "step": 13500 + }, + { + "epoch": 0.23610701203104112, + "grad_norm": 0.05615234375, + "learning_rate": 1.7537457124423894e-05, + "loss": 0.0712, + "step": 13600 + }, + { + "epoch": 0.2378430930018576, + "grad_norm": 0.07275390625, + "learning_rate": 1.3436348913453578e-05, + "loss": 0.0709, + "step": 13700 + }, + { + "epoch": 0.2395791739726741, + "grad_norm": 0.05078125, + "learning_rate": 9.877485465349056e-06, + "loss": 0.0701, + "step": 13800 + }, + { + "epoch": 0.24131525494349057, + "grad_norm": 0.053955078125, + "learning_rate": 6.862822837445881e-06, + "loss": 0.0708, + "step": 13900 + }, + { + "epoch": 0.24305133591430705, + "grad_norm": 0.07275390625, + "learning_rate": 4.394017978101905e-06, + "loss": 0.0711, + "step": 14000 + }, + { + "epoch": 0.24305133591430705, + "eval_covost2-en-de_loss": 1.762545108795166, + 
"eval_covost2-en-de_model_preparation_time": 0.0057, + "eval_covost2-en-de_runtime": 7.89, + "eval_covost2-en-de_samples_per_second": 8.111, + "eval_covost2-en-de_steps_per_second": 1.014, + "step": 14000 + }, + { + "epoch": 0.24305133591430705, + "eval_covost2-zh-en_loss": 3.2872180938720703, + "eval_covost2-zh-en_model_preparation_time": 0.0057, + "eval_covost2-zh-en_runtime": 8.1851, + "eval_covost2-zh-en_samples_per_second": 7.819, + "eval_covost2-zh-en_steps_per_second": 0.977, + "step": 14000 + }, + { + "epoch": 0.24305133591430705, + "eval_peoplespeech-clean-transcription_loss": 2.116088628768921, + "eval_peoplespeech-clean-transcription_model_preparation_time": 0.0057, + "eval_peoplespeech-clean-transcription_runtime": 9.4366, + "eval_peoplespeech-clean-transcription_samples_per_second": 6.782, + "eval_peoplespeech-clean-transcription_steps_per_second": 0.848, + "step": 14000 + }, + { + "epoch": 0.2447874168851235, + "grad_norm": 0.059326171875, + "learning_rate": 2.472427815989886e-06, + "loss": 0.0705, + "step": 14100 + }, + { + "epoch": 0.24652349785594, + "grad_norm": 0.03955078125, + "learning_rate": 1.099108514288627e-06, + "loss": 0.0703, + "step": 14200 + }, + { + "epoch": 0.24825957882675648, + "grad_norm": 0.07763671875, + "learning_rate": 2.748148901841052e-07, + "loss": 0.0714, + "step": 14300 + }, + { + "epoch": 0.24999565979757296, + "grad_norm": 0.0625, + "learning_rate": 0.0, + "loss": 0.0704, + "step": 14400 + } + ], + "logging_steps": 100, + "max_steps": 14400, + "num_input_tokens_seen": 0, + "num_train_epochs": 1, + "save_steps": 3600, + "stateful_callbacks": { + "TrainerControl": { + "args": { + "should_epoch_stop": false, + "should_evaluate": false, + "should_log": false, + "should_save": true, + "should_training_stop": true + }, + "attributes": {} + } + }, + "total_flos": 6.620180878885847e+17, + "train_batch_size": 24, + "trial_name": null, + "trial_params": null +} diff --git a/checkpoint-14400/training_args.bin b/checkpoint-14400/training_args.bin new file mode 100644 index 0000000000000000000000000000000000000000..83b5b195c1720ea55bc992756c0ab6f1e2ef4671 --- /dev/null +++ b/checkpoint-14400/training_args.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:8c1fd9b1621955a41f9e58d5a3e2b2d6a70bd41f5404ebfa5cb0ca999c290090 +size 5688 diff --git a/checkpoint-14400/ultravox_config.py b/checkpoint-14400/ultravox_config.py new file mode 100644 index 0000000000000000000000000000000000000000..3671250d6b6a69604473ed1a526484c8c9a77f68 --- /dev/null +++ b/checkpoint-14400/ultravox_config.py @@ -0,0 +1,170 @@ +import dataclasses +from enum import Enum +from typing import Any, Dict, List, Optional + +import transformers + + +@dataclasses.dataclass +class LoraConfigSimplified: + """ + Low Rank Approximation (LoRA) configuration. + + Used for language and audio models separately. 
+ """ + + # The rank of the approximation + r: int = 0 + lora_alpha: float = 8 + target_modules: Optional[List[str]] = dataclasses.field( + default_factory=lambda: ["k_proj", "q_proj", "linear_k", "linear_q"] + ) + + +class LossFunction(str, Enum): + CrossEntropy = "ce" + KL_Divergence = "kl" + + +@dataclasses.dataclass +class LossConfig: + loss_function: LossFunction = LossFunction.KL_Divergence + kl_temperature: float = 2.0 + + @property + def requires_alt_fields(self): + return self.loss_function == LossFunction.KL_Divergence + + +class UltravoxConfig(transformers.PretrainedConfig): + r""" + This is the configuration class to store the configuration of a [`UltravoxForConditionalGeneration`]. It is used to instantiate an + Ultravox model according to the specified arguments, defining the model architecture. + + Configuration objects inherit from [`PretrainedConfig`] and can be used to control the model outputs. Read the + documentation from [`PretrainedConfig`] for more information. + + Args: + audio_config (`Wav2Vec2Config`, *optional*): + Custom audio config or dict + text_config (`Union[AutoConfig, dict]`, *optional*): + The config object of the text backbone. Can be any of `LlamaConfig` or `MistralConfig`. + ignore_index (`int`, *optional*, defaults to -100): + The ignore index for the loss function. + audio_token_index (`int`, *optional*, defaults to 32000): + The audio token index to encode the audio prompt. + stack_factor (`int`, *optional*, defaults to 8): + Audio downsampling factor for the multimodal projector. + norm_init (`float`, *optional*, defaults to 0.4): + The initialization value for the layer normalization. + projector_act (`str`, *optional*, defaults to `"swiglu"`): + The activation function used by the multimodal projector. + text_model_lora_config (`LoraConfigSimplified`, *optional*): + The LoRA configuration for finetuning the text model. + audio_model_lora_config (`LoraConfigSimplified`, *optional*): + The LoRA configuration for finetuning the audio model. + audio_latency_block_size (`int`, *optional*, defaults to `None`): + The latency block size for simulating audio streaming. 
+ + + Example: + + ```python + >>> from transformers import UltravoxForConditionalGeneration, Wav2Vec2Config, UltravoxConfig, LlamaConfig + + >>> # Initializing an audio encoder config + >>> audio_config = Wav2Vec2Config() + + >>> # Initializing a Llama config + >>> text_config = LlamaConfig() + + >>> # Initializing a default configuration + >>> configuration = UltravoxConfig(audio_config, text_config) + + >>> # Initializing a completely untrained model from the configuration + >>> model = UltravoxForConditionalGeneration(configuration) + + >>> # Accessing the model configuration + >>> configuration = model.config + + >>> # Initialize a model from pretrained checkpoints and random projector weights + >>> config = UltravoxConfig(audio_model_id="facebook/wav2vec2-base-960h", text_model_id="meta-llama/Llama-2-7b-chat-hf") + ```""" + + model_type = "ultravox" + is_composition = False + + def __init__( + self, + audio_config: Optional[Dict[str, Any]] = None, + text_config: Optional[Dict[str, Any]] = None, + audio_model_id: Optional[str] = None, + text_model_id: Optional[str] = None, + ignore_index: int = -100, + hidden_size: int = 4096, + stack_factor: int = 8, + norm_init: float = 0.4, + projector_act: str = "swiglu", + text_model_lora_config: Optional[LoraConfigSimplified] = None, + audio_model_lora_config: Optional[LoraConfigSimplified] = None, + audio_latency_block_size: Optional[int] = None, + **kwargs, + ): + self.ignore_index = ignore_index + + self.audio_model_id = audio_model_id + self.text_model_id = text_model_id + + self.hidden_size = hidden_size + self.stack_factor = stack_factor + self.norm_init = norm_init + self.projector_act = projector_act + + if text_model_id is not None: + self.text_config: transformers.LlamaConfig = ( + transformers.AutoConfig.from_pretrained(text_model_id) + ) + else: + text_config = text_config or {} + self.text_config = transformers.CONFIG_MAPPING[ + text_config.get("model_type", "llama") + ](**text_config) + + if audio_model_id is not None: + self.audio_config: transformers.PretrainedConfig = ( + transformers.AutoConfig.from_pretrained(audio_model_id) + ) + else: + audio_config = audio_config or {} + self.audio_config = transformers.CONFIG_MAPPING[ + audio_config.get("model_type", "wav2vec2") + ](**audio_config) + + self.text_model_lora_config = ( + text_model_lora_config + if isinstance(text_model_lora_config, dict) + else dataclasses.asdict(text_model_lora_config or LoraConfigSimplified()) + ) + self.audio_model_lora_config = ( + audio_model_lora_config + if isinstance(audio_model_lora_config, dict) + else dataclasses.asdict(audio_model_lora_config or LoraConfigSimplified()) + ) + self.audio_latency_block_size = audio_latency_block_size + + self.vocab_size = self.text_config.vocab_size + + self.initializer_range = self.text_config.initializer_range + + super().__init__(**kwargs) + + def to_diff_dict(self) -> Dict[str, Any]: + diff_dict = super().to_diff_dict() + + # remove text_config and audio_config if text_model_id and audio_model_id are present + if self.text_model_id is not None: + diff_dict.pop("text_config", None) + if self.audio_model_id is not None: + diff_dict.pop("audio_config", None) + + return diff_dict diff --git a/checkpoint-14400/ultravox_model.py b/checkpoint-14400/ultravox_model.py new file mode 100644 index 0000000000000000000000000000000000000000..ccea3e0ca174ec3119184d1f5a0f384c1cdbcae9 --- /dev/null +++ b/checkpoint-14400/ultravox_model.py @@ -0,0 +1,723 @@ +import logging +from typing import Any, Dict, Optional, Set, Tuple, 
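
To make the constructor's resolution order concrete: when `text_model_id`/`audio_model_id` are given, the backbone configs are fetched via `AutoConfig.from_pretrained` (network access, and access to the gated Llama repo, assumed); otherwise the optional dicts fall back to default `llama`/`wav2vec2` configs. A sketch using the same IDs as the checkpoints above, assuming the checkpoint directory is on `sys.path`:

```python
from ultravox_config import UltravoxConfig  # the module saved alongside each checkpoint

# Mirrors the values recorded in config.json.
config = UltravoxConfig(
    text_model_id="meta-llama/Llama-3.2-1B-Instruct",
    audio_model_id="openai/whisper-large-v3-turbo",
    stack_factor=8,
    norm_init=0.4,
    projector_act="swiglu",
)

print(config.vocab_size)                       # 128256, inherited from the Llama text config
print("text_config" in config.to_diff_dict())  # False: dropped because text_model_id is set
```
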
Union + +import peft +import torch +import torch.nn as nn +import torch.nn.functional as F +import transformers +import transformers.activations +import transformers.modeling_outputs +import transformers.models +from transformers.models.whisper import modeling_whisper as whisper + +# We must use relative import in this directory to allow uploading to HF Hub +# Even the "from . import X" pattern doesn't work (undocumented and unclear why) +from .ultravox_config import LossConfig +from .ultravox_config import LossFunction +from .ultravox_config import UltravoxConfig + + +class UltravoxModel(transformers.LlamaPreTrainedModel): + """ + The Ultravox model, which consists of an audio encoder and a language model. + + Audio input is processed by the audio encoder, then every `stack_factor` frames are stacked together and + projected to the language model's embedding space using a few linear layers. + The text is embedded by the language model as usual and then the audio and text embeddings are merged together. + + A special token `<|audio|>` is used to indicate the start of the audio embeddings in the merged embeddings. + + Parameters: + config: Model configuration class with all the parameters of the model. + """ + + config_class = UltravoxConfig + config: UltravoxConfig # for type hinting + # Usually we load encoder and LLM weights from a pretrained model separately, so they are allowed to be missing + _keys_to_ignore_on_load_missing = ["audio_tower.*", "language_model.*"] + + def __init__(self, config: UltravoxConfig): + super().__init__(config) + self._register_load_state_dict_pre_hook(self._pre_load_state_dict_hook) + + self.keep_params: Set[str] = set() + self.vocab_size = config.vocab_size + + self.audio_tower = self._create_audio_tower(config) + self.multi_modal_projector = self._create_multi_modal_projector(config) + self.language_model = self._create_language_model(config) + + # Determine no_split_modules dynamically to use with FSDP auto_wrap policy. + # FSDP throws an error if some of the layer types are not found in the model.
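
A toy version of the merge the class docstring describes, with made-up shapes: the projected audio frames simply overwrite the embeddings of the `<|audio|>` placeholder tokens in the text sequence (the real model does this per batch element in `forward`):

```python
import torch

D = 4                                 # embedding width (illustrative)
text_embeds = torch.zeros(10, D)      # 10 embedded text tokens, three of them <|audio|> placeholders
audio_embeds = torch.ones(3, D)       # 3 projected audio frames
start, length = 2, 3                  # where the placeholders sit, and how many

length = min(length, audio_embeds.shape[0])    # same guard as the model's forward
text_embeds[start : start + length] = audio_embeds[:length]
print(text_embeds[:, 0])              # positions 2..4 now hold audio embeddings
```
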
+ # This would be something like ["LlamaDecoderLayer", "WhisperEncoderLayer"] + self._no_split_modules = (self.language_model._no_split_modules or []) + ( + self.audio_tower._no_split_modules or [] + ) + + self.loss_config = LossConfig() + self.post_init() + + def get_input_embeddings(self): + return self.language_model.get_input_embeddings() + + def set_input_embeddings(self, value): + self.language_model.set_input_embeddings(value) + + def get_output_embeddings(self): + return self.language_model.get_output_embeddings() + + def set_output_embeddings(self, new_embeddings): + self.language_model.set_output_embeddings(new_embeddings) + + def set_decoder(self, decoder): + self.language_model.set_decoder(decoder) + + def get_decoder(self): + return self.language_model.get_decoder() + + def tie_weights(self): + return self.language_model.tie_weights() + + def set_loss_config(self, loss_config: LossConfig): + self.loss_config = loss_config + + def _setup_cache( + self, cache_cls, max_batch_size: int, max_cache_len: Optional[int] = None + ): + self.language_model._setup_cache(cache_cls, max_batch_size, max_cache_len) + + def _reorder_cache(self, past_key_values, beam_idx): + return self.language_model._reorder_cache(past_key_values, beam_idx) + + def resize_token_embeddings( + self, + new_num_tokens: Optional[int] = None, + pad_to_multiple_of: Optional[int] = None, + ) -> nn.Embedding: + model_embeds = self.language_model.resize_token_embeddings( + new_num_tokens, pad_to_multiple_of + ) + # update vocab size + self.config.text_config.vocab_size = model_embeds.num_embeddings + self.config.vocab_size = model_embeds.num_embeddings + self.vocab_size = model_embeds.num_embeddings + return model_embeds + + def _compute_kl_loss( + self, + lm_output: transformers.modeling_outputs.CausalLMOutputWithPast, + labels: Optional[torch.Tensor] = None, + past_key_values: Optional[Union[Tuple, transformers.cache_utils.Cache]] = None, + alt_input_ids: Optional[torch.Tensor] = None, + alt_attention_mask: Optional[torch.Tensor] = None, + alt_labels: Optional[torch.Tensor] = None, + **kwargs, + ): + # disable gradient computation for the teacher model + with torch.no_grad(): + # compute the teacher (text-only) model's distribution + alt_inputs_embeds = self.get_input_embeddings().forward(alt_input_ids) + alt_lm_output = self.language_model.forward( + inputs_embeds=alt_inputs_embeds, + labels=alt_labels, + attention_mask=alt_attention_mask, + past_key_values=past_key_values, + **kwargs, + ) + # compute the KL divergence loss between the two models + kl_loss = F.kl_div( + F.log_softmax( + lm_output.logits[labels != -100] / self.loss_config.kl_temperature, + dim=-1, + ), + F.softmax( + alt_lm_output.logits[alt_labels != -100] + / self.loss_config.kl_temperature, + dim=-1, + ), + reduction="batchmean", + ) + return {"loss": kl_loss} + + def forward( + self, + input_ids: torch.Tensor, + audio_values: Optional[torch.FloatTensor] = None, + inputs_embeds: Optional[torch.FloatTensor] = None, + labels: Optional[torch.Tensor] = None, + attention_mask: Optional[torch.Tensor] = None, + audio_token_start_idx: Optional[torch.Tensor] = None, + audio_len: Optional[torch.Tensor] = None, + audio_token_len: Optional[torch.Tensor] = None, + past_key_values: Optional[Union[Tuple, transformers.cache_utils.Cache]] = None, + # the alt_* fields are needed for KL divergence loss + alt_input_ids: Optional[torch.Tensor] = None, + alt_attention_mask: Optional[torch.Tensor] = None, + alt_labels: Optional[torch.Tensor] = None, + **kwargs, + ) -> 
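
A self-contained sketch of the objective `_compute_kl_loss` computes: the audio-conditioned (student) distribution is pulled toward the text-only (teacher) distribution at the supervised positions, with both sets of logits softened by `kl_temperature`. Random tensors stand in for model outputs here:

```python
import torch
import torch.nn.functional as F

temperature = 2.0                                  # LossConfig.kl_temperature default
student_logits = torch.randn(6, 32)                # audio-conditioned logits (6 tokens, vocab 32)
teacher_logits = torch.randn(6, 32)                # text-only logits, computed under no_grad
labels = torch.tensor([-100, -100, 5, 9, 2, 7])    # -100 masks prompt positions

mask = labels != -100
kl_loss = F.kl_div(
    F.log_softmax(student_logits[mask] / temperature, dim=-1),
    F.softmax(teacher_logits[mask] / temperature, dim=-1),
    reduction="batchmean",
)
print(kl_loss)
```
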
Union[Tuple, transformers.modeling_outputs.CausalLMOutputWithPast]: + """ + Forward pass for the Ultravox model. + + `input_ids` are the tokenized text input. They are embedded by the language model as usual. + `audio_values` are processed by the audio encoder and then every `stack_factor` frames are stacked together and + projected to the language model's embedding space using a few linear layers. + The audio and text embeddings are merged together. A special token `<|audio|>` is used to indicate the start + of the audio embeddings in the merged embeddings. + + Args: + input_ids: The tokenized text input. + audio_values: The processed audio values. + inputs_embeds: The embeddings for the input tokens. + labels: The tokenized text labels. + attention_mask: The attention mask for the input. + position_ids: The position ids for the input. + past_key_values: The past key value cache for the language model attention layers. + **kwargs: Additional keyword arguments. Passed directly to the language model. + """ + if inputs_embeds is None: + # B x T -> B x T x D + inputs_embeds = self.get_input_embeddings().forward(input_ids) + + if audio_values is not None: + assert ( + audio_token_start_idx is not None and audio_token_len is not None + ), "audio_token_start_idx and audio_token_len must be provided if audio_values are provided." + assert ( + len(audio_token_start_idx) == len(audio_token_len) == len(audio_values) + ), "audio_token_start_idx, audio_token_len, and audio_values must have the same batch size." + + # B x A/3200 x D + audio_tower_output = self.audio_tower.forward( + audio_values.to(self.audio_tower.dtype), + audio_len=audio_len, + ).last_hidden_state + audio_tower_output = audio_tower_output.to(inputs_embeds.dtype) + + audio_embeds = self.multi_modal_projector.forward(audio_tower_output) + + # combine audio and text embeddings + for i, (audio, start, length) in enumerate( + zip(audio_embeds, audio_token_start_idx, audio_token_len) + ): + length = min(length, audio.shape[0]) + inputs_embeds[i, start : start + length] = audio[:length] + + lm_output = self.language_model.forward( + inputs_embeds=inputs_embeds, + labels=labels, + attention_mask=attention_mask, + past_key_values=past_key_values, + **kwargs, + ) + if self.training: + if self.loss_config.loss_function == LossFunction.CrossEntropy: + return lm_output + elif self.loss_config.loss_function == LossFunction.KL_Divergence: + return self._compute_kl_loss( + lm_output=lm_output, + labels=labels, + past_key_values=past_key_values, + alt_input_ids=alt_input_ids, + alt_attention_mask=alt_attention_mask, + alt_labels=alt_labels, + **kwargs, + ) + else: + raise ValueError( + f"Unsupported loss function: {self.loss_config.loss_function}" + ) + else: + return lm_output + + def prepare_inputs_for_generation( + self, + input_ids: torch.Tensor, + audio_values: Optional[torch.FloatTensor] = None, + audio_token_start_idx: Optional[torch.Tensor] = None, + audio_token_len: Optional[torch.Tensor] = None, + audio_len: Optional[torch.Tensor] = None, + past_key_values: Optional[Union[Tuple, transformers.cache_utils.Cache]] = None, + attention_mask: Optional[torch.Tensor] = None, + inputs_embeds: Optional[torch.Tensor] = None, + cache_position: Optional[torch.Tensor] = None, + **kwargs, + ) -> Dict[str, Any]: + model_input = self.language_model.prepare_inputs_for_generation( + input_ids=input_ids, + past_key_values=past_key_values, + attention_mask=attention_mask, + inputs_embeds=inputs_embeds, + cache_position=cache_position, + **kwargs, + ) + + # 
include audio information in model_input only when it is needed during prefilling + # audio_token_start_idx should always be relative to the current cache position + prefill_start_idx = 0 if cache_position is None else cache_position[0] + if ( + audio_values is not None + and audio_token_start_idx is not None + and prefill_start_idx <= torch.max(audio_token_start_idx) + ): + model_input["audio_values"] = audio_values + model_input["audio_token_start_idx"] = ( + audio_token_start_idx - prefill_start_idx + ) + model_input["audio_token_len"] = audio_token_len + model_input["audio_len"] = audio_len + + return model_input + + @classmethod + def _create_multi_modal_projector( + cls, config: UltravoxConfig + ) -> "UltravoxProjector": + projector = UltravoxProjector(config) + projector.to(config.torch_dtype) + return projector + + @classmethod + def _create_audio_tower( + cls, config: UltravoxConfig + ) -> Union[transformers.Wav2Vec2Model, "ModifiedWhisperEncoder"]: + if config.audio_model_id is not None: + if "whisper" in config.audio_model_id: + audio_tower = ModifiedWhisperEncoder.from_pretrained( + config.audio_model_id, torch_dtype=config.torch_dtype + ) + audio_tower.init_latency_mask( + config.audio_latency_block_size, dtype=config.torch_dtype + ) + else: + assert config.audio_latency_block_size in ( + None, + 0, + ), "only whisper audio tower supports audio latency masking, got non-zero value for 'audio_latency_block_size'" + audio_tower = transformers.AutoModel.from_pretrained( + config.audio_model_id, torch_dtype=config.torch_dtype + ) + else: + if "whisper" in config.audio_config._name_or_path: + audio_tower = ModifiedWhisperEncoder(config.audio_config) + audio_tower.init_latency_mask( + config.audio_latency_block_size, dtype=config.torch_dtype + ) + else: + assert config.audio_latency_block_size in ( + None, + 0, + ), "only whisper audio tower supports audio latency masking, got non-zero value for 'audio_latency_block_size'" + with transformers.modeling_utils.no_init_weights(): + # we only ever use from_config if the weights are retrained, hence initializing is not + # required. This makes model creation quite a bit faster since init on CPU is quite slow. + audio_tower = transformers.AutoModel.from_config( + config.audio_config + ) + + if isinstance( + audio_tower, + (transformers.Wav2Vec2BertModel, transformers.WhisperModel), + ): + # For these models we only need the encoder part + # Wav2Vec2BertModel -> Wav2Vec2BertEncoder + # WhisperModel -> WhisperEncoder + audio_tower = audio_tower.encoder + + audio_tower = apply_lora(audio_tower, config.audio_model_lora_config) + return audio_tower + + @classmethod + def _create_language_model( + cls, config: UltravoxConfig + ) -> transformers.LlamaForCausalLM: + if config.text_model_id is not None: + language_model = transformers.AutoModelForCausalLM.from_pretrained( + config.text_model_id, + attn_implementation=config._attn_implementation, + torch_dtype=config.torch_dtype, + ) + else: + with transformers.modeling_utils.no_init_weights(): + # we only ever use from_config if the weights are retrained, hence initializing is not + # required. This makes model creation quite a bit faster since init on CPU is quite slow. 
+ language_model = transformers.AutoModelForCausalLM.from_config( + config.text_config, + attn_implementation=config._attn_implementation, + torch_dtype=config.torch_dtype, + ) + + language_model = apply_lora(language_model, config.text_model_lora_config) + return language_model + + def merge_and_unload(self): + if isinstance(self.language_model, peft.PeftModel): + self.language_model = self.language_model.merge_and_unload() + # no need to download base language model weights anymore, so we can remove the id + self.config.text_model_id = None + self.keep_params.update( + set( + [ + f"language_model.{name}" + for name, _ in self.language_model.named_parameters() + ] + ) + ) + + if isinstance(self.audio_tower, peft.PeftModel): + self.audio_tower = self.audio_tower.merge_and_unload() + # no need to download base audio model weights anymore, so we can remove the id + self.config.audio_model_id = None + self.keep_params.update( + set( + [ + f"audio_tower.{name}" + for name, _ in self.audio_tower.named_parameters() + ] + ) + ) + + for param in ["text_model_lora_config", "audio_model_lora_config"]: + if hasattr(self.config, param): + delattr(self.config, param) + + def push_to_hub(self, *args, **kwargs): + self.merge_and_unload() + self.to(self.language_model.dtype) + return super().push_to_hub(*args, **kwargs) + + def save_pretrained( + self, *args, state_dict: Optional[Dict[str, Any]] = None, **kwargs + ): + if state_dict is None: + state_dict = super().state_dict() + + named_params = dict(self.named_parameters()) + + state_dict = { + k: v + for k, v in state_dict.items() + if k in self.keep_params + or (k in named_params and named_params[k].requires_grad) + } + + super().save_pretrained(*args, state_dict=state_dict, **kwargs) + + def _pre_load_state_dict_hook(self, state_dict: Dict[str, Any], *args, **kwargs): + self.keep_params.update(set(state_dict.keys())) + + def print_trainable_parameters(self): + """ + Prints the number of trainable parameters in the model (reuses Peft model's method) + """ + count_params = peft.peft_model.PeftModel.get_nb_trainable_parameters + + trainable_params, all_param = count_params(self) + + logging.info( + f"trainable params: {trainable_params:,d} || all params: {all_param:,d}" + f" || trainable%: {100 * trainable_params / all_param:.1f}%" + ) + + lm_trainable_params, lm_all_params = count_params(self.language_model) + audio_trainable_params, audio_all_params = count_params(self.audio_tower) + + projector_trainable_params = ( + trainable_params - lm_trainable_params - audio_trainable_params + ) + projector_all_params = all_param - lm_all_params - audio_all_params + + logging.info( + f"Trainable%: " + f" LLM: {100 * lm_trainable_params / lm_all_params:.1f}%" + f" || Audio Encoder: {100 * audio_trainable_params / audio_all_params:.1f}%" + f" || Projector: {100 * projector_trainable_params / projector_all_params:.1f}%" + ) + + +def is_cache_empty( + past_key_values: Optional[Union[Tuple, transformers.cache_utils.Cache]] +) -> bool: + """ + Check if the cache is empty. + """ + if past_key_values is None: + return True + if isinstance(past_key_values, tuple): + return all(len(c) == 0 for c in past_key_values) + return past_key_values.get_seq_length() == 0 + + +def apply_lora(model: torch.nn.Module, lora_config: dict) -> torch.nn.Module: + """ + Applies LoRA finetuning to the model. If the `r` parameter is set to 0, the model is frozen instead. 
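
The `r == 0` convention described in the `apply_lora` docstring is what lets this checkpoint's config (both LoRA blocks record `"r": 0`) freeze the text and audio towers outright instead of wrapping them in PEFT adapters. A tiny demonstration with a stand-in module, assuming the checkpoint directory is on `sys.path`:

```python
import torch.nn as nn
from ultravox_model import apply_lora  # the module saved alongside each checkpoint

tower = nn.Linear(8, 8)                # stand-in for a text/audio tower
tower = apply_lora(tower, {"r": 0})    # r=0 branch: freeze, don't wrap with PEFT
print(all(not p.requires_grad for p in tower.parameters()))  # True
```
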
+ """ + lora_config = peft.LoraConfig(**lora_config or {}) + + if lora_config.r == 0: + # freeze the model entirely + for param in model.parameters(): + param.requires_grad = False + else: + model = peft.get_peft_model(model, lora_config) + + return model + + +class StackAudioFrames(nn.Module): + """ + Stack the audio embedding frames to reduce the sequence length by a factor of `stack_factor`. + + The number of output frames will be `ceil(T / stack_factor) + 1` where `T` is the number of input frames. + NOTE: the extra +1 is intentional: in case the number of audio tokens are over-estimated by the processor, + we want to make sure `processor.audio_token_replacement` (i.e. EOS) doesn't get leaked into the middle of embeddings. + In most cases this extra padding will get removed in the model's forward function so it has no effect. + """ + + def __init__(self, stack_factor: int = 8): + super().__init__() + self.stack_factor = stack_factor + + def forward(self, audio_embeds: torch.Tensor) -> torch.Tensor: + B, T, C = audio_embeds.shape + T_pad = (T + self.stack_factor - 1) // self.stack_factor * self.stack_factor + audio_embeds = F.pad(audio_embeds, (0, 0, 0, T_pad - T + self.stack_factor)) + B, T, C = audio_embeds.shape + audio_embeds = audio_embeds.view( + B, T // self.stack_factor, C * self.stack_factor + ) + return audio_embeds + + +class RMSNorm(transformers.models.llama.modeling_llama.LlamaRMSNorm): + def __init__(self, hidden_size: int, init: float = 1, eps: float = 1e-6): + super().__init__(hidden_size=hidden_size, eps=eps) + self.weight.data.fill_(init) + + +class SwiGLU(nn.Module): + def forward(self, x): + x, gate = x.chunk(2, dim=-1) + return F.silu(gate) * x + + +class UltravoxProjector(nn.Sequential): + def __init__(self, config: UltravoxConfig): + super().__init__() + self.hidden_dim = config.hidden_size + self._pad_and_stack = StackAudioFrames(config.stack_factor) + dim = config.audio_config.hidden_size * config.stack_factor + self.ln_pre = RMSNorm(dim, init=config.norm_init) + self.linear_1 = nn.Linear(dim, self.hidden_dim, bias=False) + dim = self.hidden_dim + self.act = transformers.activations.get_activation(config.projector_act) + dim = dim // 2 if config.projector_act == "swiglu" else dim + self.linear_2 = nn.Linear(dim, config.text_config.hidden_size, bias=False) + self.ln_post = RMSNorm(config.text_config.hidden_size, init=config.norm_init) + + def forward(self, audio_features: torch.Tensor) -> torch.Tensor: + audio_features = self._pad_and_stack(audio_features) + audio_features = self.ln_pre(audio_features) + hidden_states = self.linear_1(audio_features) + hidden_states = self.act(hidden_states) + hidden_states = self.linear_2(hidden_states) + hidden_states = self.ln_post(hidden_states) + return hidden_states + + +class ModifiedWhisperEncoder( + whisper.WhisperEncoder, transformers.modeling_utils.ModuleUtilsMixin +): + """ + Encoder portion of OpenAI's Whisper model. + + This implementation is a slightly modified version of HF Transformers' Whisper Encoder, with only a few fixes: + 1. base_model_prefix updated to allow for doing `.from_pretrained` directly on the encoder + 2. 
allow less than 30 seconds of audio padding to be passed in: + - relaxed ValueError check for `input_features` length to be less than or equal to `expected_seq_length` instead of strictly equal + - embed_pos is now sliced to match the length of `inputs_embeds` + + Original: https://github.com/huggingface/transformers/blob/main/src/transformers/models/whisper/modeling_whisper.py + """ + + base_model_prefix = "model.encoder" + _no_split_modules = ["WhisperEncoderLayer"] + + def init_latency_mask(self, audio_latency_block_size: int, dtype: torch.dtype): + if audio_latency_block_size is None: + self.audio_streaming_mask = None + return + + # maximum sequence length + max_seqlen = ( + self.config.max_source_positions + * self.conv1.stride[0] + * self.conv2.stride[0] + ) + assert ( + max_seqlen > 0 + ), f"maximum sequence length must be positive, got {max_seqlen}" + assert ( + max_seqlen % audio_latency_block_size == 0 + ), f"audio_latency_block_size {audio_latency_block_size} must divide {max_seqlen} evenly." + # Given the block size, we calculate the number of blocks. + audio_latency_nblocks = max_seqlen // audio_latency_block_size + audio_streaming_mask = ( + torch.tril( + torch.ones(audio_latency_nblocks, audio_latency_nblocks), + diagonal=0, + ) + .repeat_interleave(audio_latency_block_size, dim=0) + .repeat_interleave(audio_latency_block_size, dim=1) + ) + audio_streaming_mask = (1.0 - audio_streaming_mask) * torch.finfo(dtype).min + audio_streaming_mask = audio_streaming_mask[None, None, :, :] + self.register_buffer( + "audio_streaming_mask", audio_streaming_mask, persistent=False + ) + + def forward( + self, + input_features, + audio_len=None, + head_mask=None, + output_attentions=None, + output_hidden_states=None, + return_dict=None, + ): + expected_seq_length = ( + self.config.max_source_positions + * self.conv1.stride[0] + * self.conv2.stride[0] + ) + if input_features.shape[-1] > expected_seq_length: + raise ValueError( + f"Whisper expects the mel input features to be of length {expected_seq_length} or less, but found {input_features.shape[-1]}. Make sure to pad the input mel features to {expected_seq_length}." 
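
To visualize what `init_latency_mask` builds: the mask is block lower-triangular, so each frame can attend to every frame in its own latency block and in all earlier blocks, simulating streaming inference in block-sized chunks. A standalone reproduction at toy size:

```python
import torch

max_seqlen, block_size = 8, 2
nblocks = max_seqlen // block_size     # init_latency_mask asserts this divides evenly

mask = (
    torch.tril(torch.ones(nblocks, nblocks), diagonal=0)
    .repeat_interleave(block_size, dim=0)
    .repeat_interleave(block_size, dim=1)
)
# Same additive form as the registered buffer: 0 where attention is allowed,
# finfo.min where it is blocked.
additive = (1.0 - mask) * torch.finfo(torch.float32).min
print(mask.int())                      # 8x8 block lower-triangular pattern
```
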
+ ) + + output_attentions = ( + output_attentions + if output_attentions is not None + else self.config.output_attentions + ) + output_hidden_states = ( + output_hidden_states + if output_hidden_states is not None + else self.config.output_hidden_states + ) + return_dict = ( + return_dict if return_dict is not None else self.config.use_return_dict + ) + inputs_embeds = nn.functional.gelu(self.conv1(input_features)) + inputs_embeds = nn.functional.gelu(self.conv2(inputs_embeds)) + + inputs_embeds = inputs_embeds.permute(0, 2, 1) + embed_pos = self.embed_positions.weight[: inputs_embeds.size(-2)] + + hidden_states = inputs_embeds + embed_pos + hidden_states = nn.functional.dropout( + hidden_states, p=self.dropout, training=self.training + ) + + encoder_states = () if output_hidden_states else None + all_attentions = () if output_attentions else None + + # Create attention mask based on audio lengths to mask out padding tokens + # For each sample in batch: + # - Convert raw audio length to feature length after convolutions + # - Create boolean mask that is True for valid positions and False for padding + # - Convert to extended attention mask format expected by transformer layers + # (1.0 for positions to attend to, large negative for positions to ignore) + # This masking ensures consistent behavior between training and inference + # by preventing the model from attending to padding tokens in both cases + attention_mask = None + if audio_len is not None: + audio_feature_len = self._get_feat_extract_output_lengths(audio_len) + max_seq_len = hidden_states.shape[1] + attention_mask = torch.arange(max_seq_len, device=hidden_states.device)[ + None, : + ].lt(audio_feature_len.view(-1, 1)) + attention_mask = self.get_extended_attention_mask( + attention_mask, + None, + device=hidden_states.device, + dtype=hidden_states.dtype, + ) + + if self.audio_streaming_mask is not None: + seqlen = hidden_states.size(-2) + if attention_mask is not None: + attention_mask = torch.minimum( + self.audio_streaming_mask[:, :, :seqlen, :seqlen], attention_mask + ) # merge + else: + attention_mask = self.audio_streaming_mask[:, :, :seqlen, :seqlen] + attention_mask = attention_mask.to(hidden_states.dtype) + + # check if head_mask has a correct number of layers specified if desired + if head_mask is not None: + assert head_mask.size()[0] == ( + len(self.layers) + ), f"The head_mask should be specified for {len(self.layers)} layers, but it is for {head_mask.size()[0]}." 
+ + for idx, encoder_layer in enumerate(self.layers): + if output_hidden_states: + encoder_states = encoder_states + (hidden_states,) + # add LayerDrop (see https://arxiv.org/abs/1909.11556 for description) + to_drop = False + if self.training: + dropout_probability = torch.rand([]) + if dropout_probability < self.layerdrop: # skip the layer + to_drop = True + + if to_drop: + layer_outputs = (None, None) + else: + if self.gradient_checkpointing and self.training: + layer_outputs = self._gradient_checkpointing_func( + encoder_layer.__call__, + hidden_states, + attention_mask, + (head_mask[idx] if head_mask is not None else None), + output_attentions, + ) + else: + layer_outputs = encoder_layer( + hidden_states, + attention_mask, + layer_head_mask=( + head_mask[idx] if head_mask is not None else None + ), + output_attentions=output_attentions, + ) + + hidden_states = layer_outputs[0] + + if output_attentions: + all_attentions = all_attentions + (layer_outputs[1],) + + hidden_states = self.layer_norm(hidden_states) + if output_hidden_states: + encoder_states = encoder_states + (hidden_states,) + + if not return_dict: + return tuple( + v + for v in [hidden_states, encoder_states, all_attentions] + if v is not None + ) + return transformers.modeling_outputs.BaseModelOutput( + last_hidden_state=hidden_states, + hidden_states=encoder_states, + attentions=all_attentions, + ) + + +UltravoxConfig.register_for_auto_class() +UltravoxModel.register_for_auto_class() + +transformers.AutoConfig.register("ultravox", UltravoxConfig) +transformers.AutoModel.register(UltravoxConfig, UltravoxModel) + +transformers.activations.ACT2FN["swiglu"] = SwiGLU diff --git a/checkpoint-3600/config.json b/checkpoint-3600/config.json new file mode 100644 index 0000000000000000000000000000000000000000..0ce016ecf9d037d4afa627d419c26b5ddd28a2f8 --- /dev/null +++ b/checkpoint-3600/config.json @@ -0,0 +1,43 @@ +{ + "architectures": [ + "UltravoxModel" + ], + "audio_latency_block_size": null, + "audio_model_id": "openai/whisper-large-v3-turbo", + "audio_model_lora_config": { + "lora_alpha": 8, + "r": 0, + "target_modules": [ + "k_proj", + "q_proj", + "linear_k", + "linear_q" + ] + }, + "auto_map": { + "AutoConfig": "ultravox_config.UltravoxConfig", + "AutoModel": "ultravox_model.UltravoxModel" + }, + "hidden_size": 4096, + "ignore_index": -100, + "initializer_range": 0.02, + "model_type": "ultravox", + "norm_init": 0.4, + "pad_token_id": 128009, + "projector_act": "swiglu", + "stack_factor": 8, + "text_model_id": "meta-llama/Llama-3.2-1B-Instruct", + "text_model_lora_config": { + "lora_alpha": 8, + "r": 0, + "target_modules": [ + "k_proj", + "q_proj", + "linear_k", + "linear_q" + ] + }, + "torch_dtype": "bfloat16", + "transformers_version": "4.47.0", + "vocab_size": 128256 +} diff --git a/checkpoint-3600/generation_config.json b/checkpoint-3600/generation_config.json new file mode 100644 index 0000000000000000000000000000000000000000..4dac817850f65a6be4d01d824462c9fe54468763 --- /dev/null +++ b/checkpoint-3600/generation_config.json @@ -0,0 +1,11 @@ +{ + "_from_model_config": true, + "bos_token_id": 128000, + "eos_token_id": [ + 128001, + 128008, + 128009 + ], + "pad_token_id": 128009, + "transformers_version": "4.47.0" +} diff --git a/checkpoint-3600/model.safetensors b/checkpoint-3600/model.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..a8bc48bd5554e5b1ce4ff3122f69176d7fa8c009 --- /dev/null +++ b/checkpoint-3600/model.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid 
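
Because the module registers `UltravoxConfig`/`UltravoxModel` for auto-class loading (matching the `auto_map` entries in the checkpoints' config.json), the whole repository can be loaded through the `transformers` auto classes. A hedged sketch with a placeholder path:

```python
import transformers

MODEL_PATH = "path/or/hub-id/of/this/repo"  # hypothetical placeholder

# trust_remote_code lets transformers import the ultravox_config.py and
# ultravox_model.py shipped in the repo, as referenced by auto_map.
model = transformers.AutoModel.from_pretrained(MODEL_PATH, trust_remote_code=True)
tokenizer = transformers.AutoTokenizer.from_pretrained(MODEL_PATH)
print(type(model).__name__)  # UltravoxModel
```
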
sha256:8a836f112ba9058dd435211b09c64746a2be44030acdd82292dd98c7bd9acc71 +size 92299736 diff --git a/checkpoint-3600/optimizer.pt b/checkpoint-3600/optimizer.pt new file mode 100644 index 0000000000000000000000000000000000000000..1d2226ee3d028ead5339dfec380853d3fe23fa35 --- /dev/null +++ b/checkpoint-3600/optimizer.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:d136891ef5201ca20b5fd3ead330b04f58e73151d0b226ff89df32e1ee665388 +size 184602962 diff --git a/checkpoint-3600/rng_state.pth b/checkpoint-3600/rng_state.pth new file mode 100644 index 0000000000000000000000000000000000000000..97136f0e360c21721d9ab387cf52d151172c6070 --- /dev/null +++ b/checkpoint-3600/rng_state.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:1b8f9f335a085578e86e02f0fc1eb48e60e3e89e3098e3c6cce66c291fc8ee61 +size 14244 diff --git a/checkpoint-3600/scheduler.pt b/checkpoint-3600/scheduler.pt new file mode 100644 index 0000000000000000000000000000000000000000..0c2be0ed1ba4c827f1e8aeee8358e2114dd64728 --- /dev/null +++ b/checkpoint-3600/scheduler.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:04d44c72cd3a348b42894fd1f0aa67e10dd8fbbcc70558ee71674069905052af +size 1064 diff --git a/checkpoint-3600/special_tokens_map.json b/checkpoint-3600/special_tokens_map.json new file mode 100644 index 0000000000000000000000000000000000000000..b43be96621d147110fb8a18b5776ec6e38516127 --- /dev/null +++ b/checkpoint-3600/special_tokens_map.json @@ -0,0 +1,17 @@ +{ + "bos_token": { + "content": "<|begin_of_text|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false + }, + "eos_token": { + "content": "<|eot_id|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false + }, + "pad_token": "<|eot_id|>" +} diff --git a/checkpoint-3600/tokenizer.json b/checkpoint-3600/tokenizer.json new file mode 100644 index 0000000000000000000000000000000000000000..1c1d8d5c9024994f1d3b00f9662b8dd89ca13cf2 --- /dev/null +++ b/checkpoint-3600/tokenizer.json @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:6b9e4e7fb171f92fd137b777cc2714bf87d11576700a1dcd7a399e7bbe39537b +size 17209920 diff --git a/checkpoint-3600/tokenizer_config.json b/checkpoint-3600/tokenizer_config.json new file mode 100644 index 0000000000000000000000000000000000000000..fb0e8efdce308a03b5350019cf1f24fa4375396f --- /dev/null +++ b/checkpoint-3600/tokenizer_config.json @@ -0,0 +1,2064 @@ +{ + "added_tokens_decoder": { + "128000": { + "content": "<|begin_of_text|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128001": { + "content": "<|end_of_text|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128002": { + "content": "<|reserved_special_token_0|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128003": { + "content": "<|reserved_special_token_1|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128004": { + "content": "<|finetune_right_pad_id|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128005": { + "content": "<|reserved_special_token_2|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128006": { + "content": "<|start_header_id|>", + 
"lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128007": { + "content": "<|end_header_id|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128008": { + "content": "<|eom_id|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128009": { + "content": "<|eot_id|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128010": { + "content": "<|python_tag|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128011": { + "content": "<|reserved_special_token_3|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128012": { + "content": "<|reserved_special_token_4|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128013": { + "content": "<|reserved_special_token_5|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128014": { + "content": "<|reserved_special_token_6|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128015": { + "content": "<|reserved_special_token_7|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128016": { + "content": "<|reserved_special_token_8|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128017": { + "content": "<|reserved_special_token_9|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128018": { + "content": "<|reserved_special_token_10|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128019": { + "content": "<|reserved_special_token_11|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128020": { + "content": "<|reserved_special_token_12|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128021": { + "content": "<|reserved_special_token_13|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128022": { + "content": "<|reserved_special_token_14|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128023": { + "content": "<|reserved_special_token_15|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128024": { + "content": "<|reserved_special_token_16|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128025": { + "content": "<|reserved_special_token_17|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128026": { + "content": "<|reserved_special_token_18|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128027": { + "content": "<|reserved_special_token_19|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": 
true + }, + "128028": { + "content": "<|reserved_special_token_20|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128029": { + "content": "<|reserved_special_token_21|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128030": { + "content": "<|reserved_special_token_22|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128031": { + "content": "<|reserved_special_token_23|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128032": { + "content": "<|reserved_special_token_24|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128033": { + "content": "<|reserved_special_token_25|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128034": { + "content": "<|reserved_special_token_26|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128035": { + "content": "<|reserved_special_token_27|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128036": { + "content": "<|reserved_special_token_28|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128037": { + "content": "<|reserved_special_token_29|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128038": { + "content": "<|reserved_special_token_30|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128039": { + "content": "<|reserved_special_token_31|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128040": { + "content": "<|reserved_special_token_32|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128041": { + "content": "<|reserved_special_token_33|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128042": { + "content": "<|reserved_special_token_34|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128043": { + "content": "<|reserved_special_token_35|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128044": { + "content": "<|reserved_special_token_36|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128045": { + "content": "<|reserved_special_token_37|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128046": { + "content": "<|reserved_special_token_38|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128047": { + "content": "<|reserved_special_token_39|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128048": { + "content": "<|reserved_special_token_40|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128049": { + 
"content": "<|reserved_special_token_41|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128050": { + "content": "<|reserved_special_token_42|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128051": { + "content": "<|reserved_special_token_43|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128052": { + "content": "<|reserved_special_token_44|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128053": { + "content": "<|reserved_special_token_45|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128054": { + "content": "<|reserved_special_token_46|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128055": { + "content": "<|reserved_special_token_47|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128056": { + "content": "<|reserved_special_token_48|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128057": { + "content": "<|reserved_special_token_49|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128058": { + "content": "<|reserved_special_token_50|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128059": { + "content": "<|reserved_special_token_51|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128060": { + "content": "<|reserved_special_token_52|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128061": { + "content": "<|reserved_special_token_53|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128062": { + "content": "<|reserved_special_token_54|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128063": { + "content": "<|reserved_special_token_55|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128064": { + "content": "<|reserved_special_token_56|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128065": { + "content": "<|reserved_special_token_57|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128066": { + "content": "<|reserved_special_token_58|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128067": { + "content": "<|reserved_special_token_59|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128068": { + "content": "<|reserved_special_token_60|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128069": { + "content": "<|reserved_special_token_61|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128070": { + "content": 
"<|reserved_special_token_62|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128071": { + "content": "<|reserved_special_token_63|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128072": { + "content": "<|reserved_special_token_64|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128073": { + "content": "<|reserved_special_token_65|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128074": { + "content": "<|reserved_special_token_66|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128075": { + "content": "<|reserved_special_token_67|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128076": { + "content": "<|reserved_special_token_68|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128077": { + "content": "<|reserved_special_token_69|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128078": { + "content": "<|reserved_special_token_70|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128079": { + "content": "<|reserved_special_token_71|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128080": { + "content": "<|reserved_special_token_72|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128081": { + "content": "<|reserved_special_token_73|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128082": { + "content": "<|reserved_special_token_74|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128083": { + "content": "<|reserved_special_token_75|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128084": { + "content": "<|reserved_special_token_76|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128085": { + "content": "<|reserved_special_token_77|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128086": { + "content": "<|reserved_special_token_78|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128087": { + "content": "<|reserved_special_token_79|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128088": { + "content": "<|reserved_special_token_80|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128089": { + "content": "<|reserved_special_token_81|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128090": { + "content": "<|reserved_special_token_82|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128091": { + "content": 
"<|reserved_special_token_83|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128092": { + "content": "<|reserved_special_token_84|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128093": { + "content": "<|reserved_special_token_85|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128094": { + "content": "<|reserved_special_token_86|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128095": { + "content": "<|reserved_special_token_87|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128096": { + "content": "<|reserved_special_token_88|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128097": { + "content": "<|reserved_special_token_89|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128098": { + "content": "<|reserved_special_token_90|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128099": { + "content": "<|reserved_special_token_91|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128100": { + "content": "<|reserved_special_token_92|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128101": { + "content": "<|reserved_special_token_93|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128102": { + "content": "<|reserved_special_token_94|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128103": { + "content": "<|reserved_special_token_95|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128104": { + "content": "<|reserved_special_token_96|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128105": { + "content": "<|reserved_special_token_97|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128106": { + "content": "<|reserved_special_token_98|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128107": { + "content": "<|reserved_special_token_99|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128108": { + "content": "<|reserved_special_token_100|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128109": { + "content": "<|reserved_special_token_101|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128110": { + "content": "<|reserved_special_token_102|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128111": { + "content": "<|reserved_special_token_103|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128112": { + "content": 
"<|reserved_special_token_104|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128113": { + "content": "<|reserved_special_token_105|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128114": { + "content": "<|reserved_special_token_106|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128115": { + "content": "<|reserved_special_token_107|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128116": { + "content": "<|reserved_special_token_108|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128117": { + "content": "<|reserved_special_token_109|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128118": { + "content": "<|reserved_special_token_110|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128119": { + "content": "<|reserved_special_token_111|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128120": { + "content": "<|reserved_special_token_112|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128121": { + "content": "<|reserved_special_token_113|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128122": { + "content": "<|reserved_special_token_114|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128123": { + "content": "<|reserved_special_token_115|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128124": { + "content": "<|reserved_special_token_116|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128125": { + "content": "<|reserved_special_token_117|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128126": { + "content": "<|reserved_special_token_118|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128127": { + "content": "<|reserved_special_token_119|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128128": { + "content": "<|reserved_special_token_120|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128129": { + "content": "<|reserved_special_token_121|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128130": { + "content": "<|reserved_special_token_122|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128131": { + "content": "<|reserved_special_token_123|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128132": { + "content": "<|reserved_special_token_124|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128133": { + "content": 
"<|reserved_special_token_125|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128134": { + "content": "<|reserved_special_token_126|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128135": { + "content": "<|reserved_special_token_127|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128136": { + "content": "<|reserved_special_token_128|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128137": { + "content": "<|reserved_special_token_129|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128138": { + "content": "<|reserved_special_token_130|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128139": { + "content": "<|reserved_special_token_131|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128140": { + "content": "<|reserved_special_token_132|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128141": { + "content": "<|reserved_special_token_133|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128142": { + "content": "<|reserved_special_token_134|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128143": { + "content": "<|reserved_special_token_135|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128144": { + "content": "<|reserved_special_token_136|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128145": { + "content": "<|reserved_special_token_137|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128146": { + "content": "<|reserved_special_token_138|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128147": { + "content": "<|reserved_special_token_139|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128148": { + "content": "<|reserved_special_token_140|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128149": { + "content": "<|reserved_special_token_141|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128150": { + "content": "<|reserved_special_token_142|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128151": { + "content": "<|reserved_special_token_143|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128152": { + "content": "<|reserved_special_token_144|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128153": { + "content": "<|reserved_special_token_145|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128154": { + "content": 
"<|reserved_special_token_146|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128155": { + "content": "<|reserved_special_token_147|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128156": { + "content": "<|reserved_special_token_148|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128157": { + "content": "<|reserved_special_token_149|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128158": { + "content": "<|reserved_special_token_150|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128159": { + "content": "<|reserved_special_token_151|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128160": { + "content": "<|reserved_special_token_152|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128161": { + "content": "<|reserved_special_token_153|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128162": { + "content": "<|reserved_special_token_154|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128163": { + "content": "<|reserved_special_token_155|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128164": { + "content": "<|reserved_special_token_156|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128165": { + "content": "<|reserved_special_token_157|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128166": { + "content": "<|reserved_special_token_158|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128167": { + "content": "<|reserved_special_token_159|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128168": { + "content": "<|reserved_special_token_160|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128169": { + "content": "<|reserved_special_token_161|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128170": { + "content": "<|reserved_special_token_162|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128171": { + "content": "<|reserved_special_token_163|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128172": { + "content": "<|reserved_special_token_164|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128173": { + "content": "<|reserved_special_token_165|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128174": { + "content": "<|reserved_special_token_166|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128175": { + "content": 
"<|reserved_special_token_167|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128176": { + "content": "<|reserved_special_token_168|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128177": { + "content": "<|reserved_special_token_169|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128178": { + "content": "<|reserved_special_token_170|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128179": { + "content": "<|reserved_special_token_171|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128180": { + "content": "<|reserved_special_token_172|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128181": { + "content": "<|reserved_special_token_173|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128182": { + "content": "<|reserved_special_token_174|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128183": { + "content": "<|reserved_special_token_175|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128184": { + "content": "<|reserved_special_token_176|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128185": { + "content": "<|reserved_special_token_177|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128186": { + "content": "<|reserved_special_token_178|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128187": { + "content": "<|reserved_special_token_179|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128188": { + "content": "<|reserved_special_token_180|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128189": { + "content": "<|reserved_special_token_181|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128190": { + "content": "<|reserved_special_token_182|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128191": { + "content": "<|reserved_special_token_183|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128192": { + "content": "<|reserved_special_token_184|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128193": { + "content": "<|reserved_special_token_185|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128194": { + "content": "<|reserved_special_token_186|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128195": { + "content": "<|reserved_special_token_187|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128196": { + "content": 
"<|reserved_special_token_188|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128197": { + "content": "<|reserved_special_token_189|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128198": { + "content": "<|reserved_special_token_190|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128199": { + "content": "<|reserved_special_token_191|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128200": { + "content": "<|reserved_special_token_192|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128201": { + "content": "<|reserved_special_token_193|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128202": { + "content": "<|reserved_special_token_194|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128203": { + "content": "<|reserved_special_token_195|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128204": { + "content": "<|reserved_special_token_196|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128205": { + "content": "<|reserved_special_token_197|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128206": { + "content": "<|reserved_special_token_198|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128207": { + "content": "<|reserved_special_token_199|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128208": { + "content": "<|reserved_special_token_200|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128209": { + "content": "<|reserved_special_token_201|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128210": { + "content": "<|reserved_special_token_202|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128211": { + "content": "<|reserved_special_token_203|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128212": { + "content": "<|reserved_special_token_204|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128213": { + "content": "<|reserved_special_token_205|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128214": { + "content": "<|reserved_special_token_206|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128215": { + "content": "<|reserved_special_token_207|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128216": { + "content": "<|reserved_special_token_208|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128217": { + "content": 
"<|reserved_special_token_209|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128218": { + "content": "<|reserved_special_token_210|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128219": { + "content": "<|reserved_special_token_211|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128220": { + "content": "<|reserved_special_token_212|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128221": { + "content": "<|reserved_special_token_213|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128222": { + "content": "<|reserved_special_token_214|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128223": { + "content": "<|reserved_special_token_215|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128224": { + "content": "<|reserved_special_token_216|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128225": { + "content": "<|reserved_special_token_217|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128226": { + "content": "<|reserved_special_token_218|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128227": { + "content": "<|reserved_special_token_219|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128228": { + "content": "<|reserved_special_token_220|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128229": { + "content": "<|reserved_special_token_221|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128230": { + "content": "<|reserved_special_token_222|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128231": { + "content": "<|reserved_special_token_223|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128232": { + "content": "<|reserved_special_token_224|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128233": { + "content": "<|reserved_special_token_225|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128234": { + "content": "<|reserved_special_token_226|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128235": { + "content": "<|reserved_special_token_227|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128236": { + "content": "<|reserved_special_token_228|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128237": { + "content": "<|reserved_special_token_229|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128238": { + "content": 
"<|reserved_special_token_230|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128239": { + "content": "<|reserved_special_token_231|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128240": { + "content": "<|reserved_special_token_232|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128241": { + "content": "<|reserved_special_token_233|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128242": { + "content": "<|reserved_special_token_234|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128243": { + "content": "<|reserved_special_token_235|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128244": { + "content": "<|reserved_special_token_236|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128245": { + "content": "<|reserved_special_token_237|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128246": { + "content": "<|reserved_special_token_238|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128247": { + "content": "<|reserved_special_token_239|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128248": { + "content": "<|reserved_special_token_240|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128249": { + "content": "<|reserved_special_token_241|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128250": { + "content": "<|reserved_special_token_242|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128251": { + "content": "<|reserved_special_token_243|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128252": { + "content": "<|reserved_special_token_244|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128253": { + "content": "<|reserved_special_token_245|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128254": { + "content": "<|reserved_special_token_246|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128255": { + "content": "<|reserved_special_token_247|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + } + }, + "bos_token": "<|begin_of_text|>", + "chat_template": "{{- bos_token }}\n{%- if custom_tools is defined %}\n {%- set tools = custom_tools %}\n{%- endif %}\n{%- if not tools_in_user_message is defined %}\n {%- set tools_in_user_message = true %}\n{%- endif %}\n{%- if not date_string is defined %}\n {%- if strftime_now is defined %}\n {%- set date_string = strftime_now(\"%d %b %Y\") %}\n {%- else %}\n {%- set date_string = \"26 Jul 2024\" %}\n {%- endif %}\n{%- endif %}\n{%- if not tools is defined %}\n {%- set tools = none 
%}\n{%- endif %}\n\n{#- This block extracts the system message, so we can slot it into the right place. #}\n{%- if messages[0]['role'] == 'system' %}\n {%- set system_message = messages[0]['content']|trim %}\n {%- set messages = messages[1:] %}\n{%- else %}\n {%- set system_message = \"\" %}\n{%- endif %}\n\n{#- System message #}\n{{- \"<|start_header_id|>system<|end_header_id|>\\n\\n\" }}\n{%- if tools is not none %}\n {{- \"Environment: ipython\\n\" }}\n{%- endif %}\n{{- \"Cutting Knowledge Date: December 2023\\n\" }}\n{{- \"Today Date: \" + date_string + \"\\n\\n\" }}\n{%- if tools is not none and not tools_in_user_message %}\n {{- \"You have access to the following functions. To call a function, please respond with JSON for a function call.\" }}\n {{- 'Respond in the format {\"name\": function name, \"parameters\": dictionary of argument name and its value}.' }}\n {{- \"Do not use variables.\\n\\n\" }}\n {%- for t in tools %}\n {{- t | tojson(indent=4) }}\n {{- \"\\n\\n\" }}\n {%- endfor %}\n{%- endif %}\n{{- system_message }}\n{{- \"<|eot_id|>\" }}\n\n{#- Custom tools are passed in a user message with some extra guidance #}\n{%- if tools_in_user_message and not tools is none %}\n {#- Extract the first user message so we can plug it in here #}\n {%- if messages | length != 0 %}\n {%- set first_user_message = messages[0]['content']|trim %}\n {%- set messages = messages[1:] %}\n {%- else %}\n {{- raise_exception(\"Cannot put tools in the first user message when there's no first user message!\") }}\n{%- endif %}\n {{- '<|start_header_id|>user<|end_header_id|>\\n\\n' -}}\n {{- \"Given the following functions, please respond with a JSON for a function call \" }}\n {{- \"with its proper arguments that best answers the given prompt.\\n\\n\" }}\n {{- 'Respond in the format {\"name\": function name, \"parameters\": dictionary of argument name and its value}.' 
}}\n {{- \"Do not use variables.\\n\\n\" }}\n {%- for t in tools %}\n {{- t | tojson(indent=4) }}\n {{- \"\\n\\n\" }}\n {%- endfor %}\n {{- first_user_message + \"<|eot_id|>\"}}\n{%- endif %}\n\n{%- for message in messages %}\n {%- if not (message.role == 'ipython' or message.role == 'tool' or 'tool_calls' in message) %}\n {{- '<|start_header_id|>' + message['role'] + '<|end_header_id|>\\n\\n'+ message['content'] | trim + '<|eot_id|>' }}\n {%- elif 'tool_calls' in message %}\n {%- if not message.tool_calls|length == 1 %}\n {{- raise_exception(\"This model only supports single tool-calls at once!\") }}\n {%- endif %}\n {%- set tool_call = message.tool_calls[0].function %}\n {{- '<|start_header_id|>assistant<|end_header_id|>\\n\\n' -}}\n {{- '{\"name\": \"' + tool_call.name + '\", ' }}\n {{- '\"parameters\": ' }}\n {{- tool_call.arguments | tojson }}\n {{- \"}\" }}\n {{- \"<|eot_id|>\" }}\n {%- elif message.role == \"tool\" or message.role == \"ipython\" %}\n {{- \"<|start_header_id|>ipython<|end_header_id|>\\n\\n\" }}\n {%- if message.content is mapping or message.content is iterable %}\n {{- message.content | tojson }}\n {%- else %}\n {{- message.content }}\n {%- endif %}\n {{- \"<|eot_id|>\" }}\n {%- endif %}\n{%- endfor %}\n{%- if add_generation_prompt %}\n {{- '<|start_header_id|>assistant<|end_header_id|>\\n\\n' }}\n{%- endif %}\n", + "clean_up_tokenization_spaces": true, + "eos_token": "<|eot_id|>", + "extra_special_tokens": {}, + "model_input_names": [ + "input_ids", + "attention_mask" + ], + "model_max_length": 131072, + "pad_token": "<|eot_id|>", + "tokenizer_class": "PreTrainedTokenizerFast" +} diff --git a/checkpoint-3600/trainer_state.json b/checkpoint-3600/trainer_state.json new file mode 100644 index 0000000000000000000000000000000000000000..c8567070acf698e9c61a47c6150555ee56434175 --- /dev/null +++ b/checkpoint-3600/trainer_state.json @@ -0,0 +1,373 @@ +{ + "best_metric": null, + "best_model_checkpoint": null, + "epoch": 0.06249891494939324, + "eval_steps": 1000, + "global_step": 3600, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "epoch": 1.736080970816479e-05, + "grad_norm": 10.5625, + "learning_rate": 2e-06, + "loss": 1.0, + "step": 1 + }, + { + "epoch": 0.001736080970816479, + "grad_norm": 0.1513671875, + "learning_rate": 0.0002, + "loss": 0.3311, + "step": 100 + }, + { + "epoch": 0.003472161941632958, + "grad_norm": 0.16796875, + "learning_rate": 0.0004, + "loss": 0.2169, + "step": 200 + }, + { + "epoch": 0.005208242912449436, + "grad_norm": 0.10400390625, + "learning_rate": 0.0006, + "loss": 0.2032, + "step": 300 + }, + { + "epoch": 0.006944323883265916, + "grad_norm": 0.11279296875, + "learning_rate": 0.0008, + "loss": 0.188, + "step": 400 + }, + { + "epoch": 0.008680404854082394, + "grad_norm": 0.10107421875, + "learning_rate": 0.001, + "loss": 0.1758, + "step": 500 + }, + { + "epoch": 0.010416485824898873, + "grad_norm": 0.09521484375, + "learning_rate": 0.0012, + "loss": 0.1637, + "step": 600 + }, + { + "epoch": 0.012152566795715351, + "grad_norm": 0.08154296875, + "learning_rate": 0.0014, + "loss": 0.1518, + "step": 700 + }, + { + "epoch": 0.013888647766531832, + "grad_norm": 0.08642578125, + "learning_rate": 0.0016, + "loss": 0.1485, + "step": 800 + }, + { + "epoch": 0.01562472873734831, + "grad_norm": 0.1044921875, + "learning_rate": 0.0018000000000000002, + "loss": 0.1433, + "step": 900 + }, + { + "epoch": 0.01736080970816479, + "grad_norm": 0.05419921875, + "learning_rate": 0.002, + 
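(Editor's note: the `chat_template` embedded in tokenizer_config.json above is the stock Llama 3 Jinja template: it extracts the system message, optionally injects tool definitions, and enforces a single tool call per assistant turn. A hypothetical sketch of exercising it; the checkpoint path is illustrative and the expected output is abridged.)

```python
# Hypothetical sketch -- not part of this diff.
from transformers import AutoTokenizer

tok = AutoTokenizer.from_pretrained("./checkpoint-3600")  # illustrative path
messages = [
    {"role": "system", "content": "You are a helpful assistant."},
    {"role": "user", "content": "Translate to German: hello"},
]
prompt = tok.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)
# Per the template above, `prompt` begins with
#   <|begin_of_text|><|start_header_id|>system<|end_header_id|>
# and, because add_generation_prompt=True, ends with the generation header
#   <|start_header_id|>assistant<|end_header_id|>
```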
"loss": 0.139, + "step": 1000 + }, + { + "epoch": 0.01736080970816479, + "eval_covost2-en-de_loss": 1.896493673324585, + "eval_covost2-en-de_model_preparation_time": 0.0057, + "eval_covost2-en-de_runtime": 9.8697, + "eval_covost2-en-de_samples_per_second": 6.485, + "eval_covost2-en-de_steps_per_second": 0.811, + "step": 1000 + }, + { + "epoch": 0.01736080970816479, + "eval_covost2-zh-en_loss": 3.1452860832214355, + "eval_covost2-zh-en_model_preparation_time": 0.0057, + "eval_covost2-zh-en_runtime": 8.3732, + "eval_covost2-zh-en_samples_per_second": 7.643, + "eval_covost2-zh-en_steps_per_second": 0.955, + "step": 1000 + }, + { + "epoch": 0.01736080970816479, + "eval_peoplespeech-clean-transcription_loss": 3.2206106185913086, + "eval_peoplespeech-clean-transcription_model_preparation_time": 0.0057, + "eval_peoplespeech-clean-transcription_runtime": 9.6941, + "eval_peoplespeech-clean-transcription_samples_per_second": 6.602, + "eval_peoplespeech-clean-transcription_steps_per_second": 0.825, + "step": 1000 + }, + { + "epoch": 0.01909689067898127, + "grad_norm": 0.059814453125, + "learning_rate": 0.001999725185109816, + "loss": 0.1334, + "step": 1100 + }, + { + "epoch": 0.020832971649797746, + "grad_norm": 0.07373046875, + "learning_rate": 0.0019989008914857113, + "loss": 0.1288, + "step": 1200 + }, + { + "epoch": 0.022569052620614226, + "grad_norm": 0.049560546875, + "learning_rate": 0.00199752757218401, + "loss": 0.1262, + "step": 1300 + }, + { + "epoch": 0.024305133591430703, + "grad_norm": 0.0517578125, + "learning_rate": 0.001995605982021898, + "loss": 0.1222, + "step": 1400 + }, + { + "epoch": 0.026041214562247183, + "grad_norm": 0.058349609375, + "learning_rate": 0.0019931371771625545, + "loss": 0.1193, + "step": 1500 + }, + { + "epoch": 0.027777295533063663, + "grad_norm": 0.0498046875, + "learning_rate": 0.001990122514534651, + "loss": 0.1196, + "step": 1600 + }, + { + "epoch": 0.02951337650388014, + "grad_norm": 0.05517578125, + "learning_rate": 0.0019865636510865464, + "loss": 0.115, + "step": 1700 + }, + { + "epoch": 0.03124945747469662, + "grad_norm": 0.044677734375, + "learning_rate": 0.001982462542875576, + "loss": 0.115, + "step": 1800 + }, + { + "epoch": 0.0329855384455131, + "grad_norm": 0.05419921875, + "learning_rate": 0.001977821443992945, + "loss": 0.1125, + "step": 1900 + }, + { + "epoch": 0.03472161941632958, + "grad_norm": 0.047119140625, + "learning_rate": 0.001972642905324813, + "loss": 0.1094, + "step": 2000 + }, + { + "epoch": 0.03472161941632958, + "eval_covost2-en-de_loss": 1.6700351238250732, + "eval_covost2-en-de_model_preparation_time": 0.0057, + "eval_covost2-en-de_runtime": 8.1279, + "eval_covost2-en-de_samples_per_second": 7.874, + "eval_covost2-en-de_steps_per_second": 0.984, + "step": 2000 + }, + { + "epoch": 0.03472161941632958, + "eval_covost2-zh-en_loss": 3.093877077102661, + "eval_covost2-zh-en_model_preparation_time": 0.0057, + "eval_covost2-zh-en_runtime": 8.1488, + "eval_covost2-zh-en_samples_per_second": 7.854, + "eval_covost2-zh-en_steps_per_second": 0.982, + "step": 2000 + }, + { + "epoch": 0.03472161941632958, + "eval_peoplespeech-clean-transcription_loss": 2.478968620300293, + "eval_peoplespeech-clean-transcription_model_preparation_time": 0.0057, + "eval_peoplespeech-clean-transcription_runtime": 9.5507, + "eval_peoplespeech-clean-transcription_samples_per_second": 6.701, + "eval_peoplespeech-clean-transcription_steps_per_second": 0.838, + "step": 2000 + }, + { + "epoch": 0.036457700387146054, + "grad_norm": 0.048583984375, + "learning_rate": 
0.0019669297731502505, + "loss": 0.1077, + "step": 2100 + }, + { + "epoch": 0.03819378135796254, + "grad_norm": 0.054443359375, + "learning_rate": 0.00196068518757684, + "loss": 0.1069, + "step": 2200 + }, + { + "epoch": 0.039929862328779014, + "grad_norm": 0.047119140625, + "learning_rate": 0.001953912580814779, + "loss": 0.1043, + "step": 2300 + }, + { + "epoch": 0.04166594329959549, + "grad_norm": 0.044921875, + "learning_rate": 0.0019466156752904343, + "loss": 0.1035, + "step": 2400 + }, + { + "epoch": 0.043402024270411975, + "grad_norm": 0.050537109375, + "learning_rate": 0.0019387984816003866, + "loss": 0.1033, + "step": 2500 + }, + { + "epoch": 0.04513810524122845, + "grad_norm": 0.056396484375, + "learning_rate": 0.0019304652963070869, + "loss": 0.102, + "step": 2600 + }, + { + "epoch": 0.04687418621204493, + "grad_norm": 0.046875, + "learning_rate": 0.0019216206995773372, + "loss": 0.0998, + "step": 2700 + }, + { + "epoch": 0.048610267182861405, + "grad_norm": 0.042236328125, + "learning_rate": 0.0019122695526648968, + "loss": 0.1002, + "step": 2800 + }, + { + "epoch": 0.05034634815367789, + "grad_norm": 0.04638671875, + "learning_rate": 0.0019024169952385887, + "loss": 0.0978, + "step": 2900 + }, + { + "epoch": 0.052082429124494366, + "grad_norm": 0.05126953125, + "learning_rate": 0.0018920684425573864, + "loss": 0.097, + "step": 3000 + }, + { + "epoch": 0.052082429124494366, + "eval_covost2-en-de_loss": 1.749150276184082, + "eval_covost2-en-de_model_preparation_time": 0.0057, + "eval_covost2-en-de_runtime": 8.1948, + "eval_covost2-en-de_samples_per_second": 7.81, + "eval_covost2-en-de_steps_per_second": 0.976, + "step": 3000 + }, + { + "epoch": 0.052082429124494366, + "eval_covost2-zh-en_loss": 3.198117971420288, + "eval_covost2-zh-en_model_preparation_time": 0.0057, + "eval_covost2-zh-en_runtime": 8.1979, + "eval_covost2-zh-en_samples_per_second": 7.807, + "eval_covost2-zh-en_steps_per_second": 0.976, + "step": 3000 + }, + { + "epoch": 0.052082429124494366, + "eval_peoplespeech-clean-transcription_loss": 2.345036506652832, + "eval_peoplespeech-clean-transcription_model_preparation_time": 0.0057, + "eval_peoplespeech-clean-transcription_runtime": 11.4402, + "eval_peoplespeech-clean-transcription_samples_per_second": 5.594, + "eval_peoplespeech-clean-transcription_steps_per_second": 0.699, + "step": 3000 + }, + { + "epoch": 0.05381851009531084, + "grad_norm": 0.06494140625, + "learning_rate": 0.0018812295824940284, + "loss": 0.0955, + "step": 3100 + }, + { + "epoch": 0.055554591066127326, + "grad_norm": 0.044677734375, + "learning_rate": 0.0018699063724087904, + "loss": 0.0951, + "step": 3200 + }, + { + "epoch": 0.0572906720369438, + "grad_norm": 0.0390625, + "learning_rate": 0.0018581050358751443, + "loss": 0.0947, + "step": 3300 + }, + { + "epoch": 0.05902675300776028, + "grad_norm": 0.056396484375, + "learning_rate": 0.0018458320592590974, + "loss": 0.0939, + "step": 3400 + }, + { + "epoch": 0.060762833978576763, + "grad_norm": 0.047119140625, + "learning_rate": 0.0018330941881540914, + "loss": 0.0941, + "step": 3500 + }, + { + "epoch": 0.06249891494939324, + "grad_norm": 0.046630859375, + "learning_rate": 0.0018198984236734246, + "loss": 0.0927, + "step": 3600 + } + ], + "logging_steps": 100, + "max_steps": 14400, + "num_input_tokens_seen": 0, + "num_train_epochs": 1, + "save_steps": 3600, + "stateful_callbacks": { + "TrainerControl": { + "args": { + "should_epoch_stop": false, + "should_evaluate": false, + "should_log": false, + "should_save": true, + "should_training_stop": 
false
+      },
+      "attributes": {}
+    }
+  },
+  "total_flos": 1.65445799352533e+17,
+  "train_batch_size": 24,
+  "trial_name": null,
+  "trial_params": null
+}
diff --git a/checkpoint-3600/training_args.bin b/checkpoint-3600/training_args.bin
new file mode 100644
index 0000000000000000000000000000000000000000..83b5b195c1720ea55bc992756c0ab6f1e2ef4671
--- /dev/null
+++ b/checkpoint-3600/training_args.bin
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:8c1fd9b1621955a41f9e58d5a3e2b2d6a70bd41f5404ebfa5cb0ca999c290090
+size 5688
diff --git a/checkpoint-3600/ultravox_config.py b/checkpoint-3600/ultravox_config.py
new file mode 100644
index 0000000000000000000000000000000000000000..3671250d6b6a69604473ed1a526484c8c9a77f68
--- /dev/null
+++ b/checkpoint-3600/ultravox_config.py
@@ -0,0 +1,170 @@
+import dataclasses
+from enum import Enum
+from typing import Any, Dict, List, Optional
+
+import transformers
+
+
+@dataclasses.dataclass
+class LoraConfigSimplified:
+    """
+    Low-Rank Adaptation (LoRA) configuration.
+
+    Used for language and audio models separately.
+    """
+
+    # The rank of the adaptation matrices
+    r: int = 0
+    lora_alpha: float = 8
+    target_modules: Optional[List[str]] = dataclasses.field(
+        default_factory=lambda: ["k_proj", "q_proj", "linear_k", "linear_q"]
+    )
+
+
+class LossFunction(str, Enum):
+    CrossEntropy = "ce"
+    KL_Divergence = "kl"
+
+
+@dataclasses.dataclass
+class LossConfig:
+    loss_function: LossFunction = LossFunction.KL_Divergence
+    kl_temperature: float = 2.0
+
+    @property
+    def requires_alt_fields(self):
+        return self.loss_function == LossFunction.KL_Divergence
+
+
+class UltravoxConfig(transformers.PretrainedConfig):
+    r"""
+    This is the configuration class to store the configuration of a [`UltravoxForConditionalGeneration`]. It is used to instantiate an
+    Ultravox model according to the specified arguments, defining the model architecture.
+
+    Configuration objects inherit from [`PretrainedConfig`] and can be used to control the model outputs. Read the
+    documentation from [`PretrainedConfig`] for more information.
+
+    Args:
+        audio_config (`Wav2Vec2Config`, *optional*):
+            Custom audio config or dict
+        text_config (`Union[AutoConfig, dict]`, *optional*):
+            The config object of the text backbone. Can be any of `LlamaConfig` or `MistralConfig`.
+        ignore_index (`int`, *optional*, defaults to -100):
+            The ignore index for the loss function.
+        audio_token_index (`int`, *optional*, defaults to 32000):
+            The audio token index to encode the audio prompt.
+        stack_factor (`int`, *optional*, defaults to 8):
+            Audio downsampling factor for the multimodal projector.
+        norm_init (`float`, *optional*, defaults to 0.4):
+            The initialization value for the layer normalization.
+        projector_act (`str`, *optional*, defaults to `"swiglu"`):
+            The activation function used by the multimodal projector.
+        text_model_lora_config (`LoraConfigSimplified`, *optional*):
+            The LoRA configuration for finetuning the text model.
+        audio_model_lora_config (`LoraConfigSimplified`, *optional*):
+            The LoRA configuration for finetuning the audio model.
+        audio_latency_block_size (`int`, *optional*, defaults to `None`):
+            The latency block size for simulating audio streaming.
+ + + Example: + + ```python + >>> from transformers import Wav2Vec2Config, LlamaConfig + + >>> from ultravox_config import UltravoxConfig + >>> from ultravox_model import UltravoxModel + + >>> # Initializing an audio encoder config + >>> audio_config = Wav2Vec2Config() + + >>> # Initializing a Llama config + >>> text_config = LlamaConfig() + + >>> # Initializing a default configuration + >>> configuration = UltravoxConfig(audio_config.to_dict(), text_config.to_dict()) + + >>> # Initializing a completely untrained model from the configuration + >>> model = UltravoxModel(configuration) + + >>> # Accessing the model configuration + >>> configuration = model.config + + >>> # Initialize a model from pretrained checkpoints and random projector weights + >>> config = UltravoxConfig(audio_model_id="facebook/wav2vec2-base-960h", text_model_id="meta-llama/Llama-2-7b-chat-hf") + ```""" + + model_type = "ultravox" + is_composition = False + + def __init__( + self, + audio_config: Optional[Dict[str, Any]] = None, + text_config: Optional[Dict[str, Any]] = None, + audio_model_id: Optional[str] = None, + text_model_id: Optional[str] = None, + ignore_index: int = -100, + hidden_size: int = 4096, + stack_factor: int = 8, + norm_init: float = 0.4, + projector_act: str = "swiglu", + text_model_lora_config: Optional[LoraConfigSimplified] = None, + audio_model_lora_config: Optional[LoraConfigSimplified] = None, + audio_latency_block_size: Optional[int] = None, + **kwargs, + ): + self.ignore_index = ignore_index + + self.audio_model_id = audio_model_id + self.text_model_id = text_model_id + + self.hidden_size = hidden_size + self.stack_factor = stack_factor + self.norm_init = norm_init + self.projector_act = projector_act + + if text_model_id is not None: + self.text_config: transformers.LlamaConfig = ( + transformers.AutoConfig.from_pretrained(text_model_id) + ) + else: + text_config = text_config or {} + self.text_config = transformers.CONFIG_MAPPING[ + text_config.get("model_type", "llama") + ](**text_config) + + if audio_model_id is not None: + self.audio_config: transformers.PretrainedConfig = ( + transformers.AutoConfig.from_pretrained(audio_model_id) + ) + else: + audio_config = audio_config or {} + self.audio_config = transformers.CONFIG_MAPPING[ + audio_config.get("model_type", "wav2vec2") + ](**audio_config) + + self.text_model_lora_config = ( + text_model_lora_config + if isinstance(text_model_lora_config, dict) + else dataclasses.asdict(text_model_lora_config or LoraConfigSimplified()) + ) + self.audio_model_lora_config = ( + audio_model_lora_config + if isinstance(audio_model_lora_config, dict) + else dataclasses.asdict(audio_model_lora_config or LoraConfigSimplified()) + ) + self.audio_latency_block_size = audio_latency_block_size + + self.vocab_size = self.text_config.vocab_size + + self.initializer_range = self.text_config.initializer_range + + super().__init__(**kwargs) + + def to_diff_dict(self) -> Dict[str, Any]: + diff_dict = super().to_diff_dict() + + # remove text_config and audio_config if text_model_id and audio_model_id are present + if self.text_model_id is not None: + diff_dict.pop("text_config", None) + if self.audio_model_id is not None: + diff_dict.pop("audio_config", None) + + return diff_dict diff --git a/checkpoint-3600/ultravox_model.py b/checkpoint-3600/ultravox_model.py new file mode 100644 index 0000000000000000000000000000000000000000..ccea3e0ca174ec3119184d1f5a0f384c1cdbcae9 --- /dev/null +++ b/checkpoint-3600/ultravox_model.py @@ -0,0 +1,723 @@ +import logging +from typing import Any, Dict, Optional, Set, Tuple, Union + 
+import peft +import torch +import torch.nn as nn +import torch.nn.functional as F +import transformers +import transformers.activations +import transformers.modeling_outputs +import transformers.models +from transformers.models.whisper import modeling_whisper as whisper + +# We must use relative import in this directory to allow uploading to HF Hub +# Even "from . import X" pattern doesn't work (undocumented and unclear why) +from .ultravox_config import LossConfig +from .ultravox_config import LossFunction +from .ultravox_config import UltravoxConfig + + +class UltravoxModel(transformers.LlamaPreTrainedModel): + """ + The Ultravox model which consists of an audio encoder and a language model. + + Audio input is processed by the audio encoder, then every `stack_factor` frames are stacked together and + projected to the language model's embedding space using a few linear layers. + The text is embedded by the language model as usual and then the audio and text embeddings are merged together. + + A special token `<|audio|>` is used to indicate the start of the audio embeddings in the merged embeddings. + + Parameters: + config: Model configuration class with all the parameters of the model. + """ + + config_class = UltravoxConfig + config: UltravoxConfig # for type hinting + # Usually we load encoder and LLM weights from a pretrained model separately, so they are allowed to be missing + _keys_to_ignore_on_load_missing = ["audio_tower.*", "language_model.*"] + + def __init__(self, config: UltravoxConfig): + super().__init__(config) + self._register_load_state_dict_pre_hook(self._pre_load_state_dict_hook) + + self.keep_params: Set[str] = set() + self.vocab_size = config.vocab_size + + self.audio_tower = self._create_audio_tower(config) + self.multi_modal_projector = self._create_multi_modal_projector(config) + self.language_model = self._create_language_model(config) + + # Determine no_split_modules dynamically to use with FSDP auto_wrap policy. + # FSDP throws an error if some of the layer types are not found in the model. 
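The docstring's stacking-and-projection description is easier to follow with concrete shapes. A minimal sketch of the arithmetic follows; the 1280 and 2048 widths are assumptions taken from the referenced whisper-large-v3-turbo and Llama-3.2-1B configs, while stack_factor=8 and hidden_size=4096 come from the config.json in this diff:

```python
import math

# Assumed dimensions (see lead-in): whisper encoder width and Llama hidden size.
whisper_dim, llama_dim = 1280, 2048
stack_factor, projector_hidden = 8, 4096

T = 100                                   # encoder frames for a short clip
T_out = math.ceil(T / stack_factor) + 1   # 14 output frames; the +1 guards
                                          # against over-estimated token counts
stacked_dim = whisper_dim * stack_factor  # 10240 channels after frame stacking
# Projector: 10240 -> 4096 (linear_1) -> 2048 (SwiGLU halves) -> 2048 (linear_2),
# so each output frame lands directly in the language model's embedding space.
print(T_out, stacked_dim)
```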
+ # This would be something like ["LlamaDecoderLayer", "WhisperEncoderLayer"] + self._no_split_modules = (self.language_model._no_split_modules or []) + ( + self.audio_tower._no_split_modules or [] + ) + + self.loss_config = LossConfig() + self.post_init() + + def get_input_embeddings(self): + return self.language_model.get_input_embeddings() + + def set_input_embeddings(self, value): + self.language_model.set_input_embeddings(value) + + def get_output_embeddings(self): + return self.language_model.get_output_embeddings() + + def set_output_embeddings(self, new_embeddings): + self.language_model.set_output_embeddings(new_embeddings) + + def set_decoder(self, decoder): + self.language_model.set_decoder(decoder) + + def get_decoder(self): + return self.language_model.get_decoder() + + def tie_weights(self): + return self.language_model.tie_weights() + + def set_loss_config(self, loss_config: LossConfig): + self.loss_config = loss_config + + def _setup_cache( + self, cache_cls, max_batch_size: int, max_cache_len: Optional[int] = None + ): + self.language_model._setup_cache(cache_cls, max_batch_size, max_cache_len) + + def _reorder_cache(self, past_key_values, beam_idx): + return self.language_model._reorder_cache(past_key_values, beam_idx) + + def resize_token_embeddings( + self, + new_num_tokens: Optional[int] = None, + pad_to_multiple_of: Optional[int] = None, + ) -> nn.Embedding: + model_embeds = self.language_model.resize_token_embeddings( + new_num_tokens, pad_to_multiple_of + ) + # update vocab size + self.config.text_config.vocab_size = model_embeds.num_embeddings + self.config.vocab_size = model_embeds.num_embeddings + self.vocab_size = model_embeds.num_embeddings + return model_embeds + + def _compute_kl_loss( + self, + lm_output: transformers.modeling_outputs.CausalLMOutputWithPast, + labels: Optional[torch.Tensor] = None, + past_key_values: Optional[Union[Tuple, transformers.cache_utils.Cache]] = None, + alt_input_ids: Optional[torch.Tensor] = None, + alt_attention_mask: Optional[torch.Tensor] = None, + alt_labels: Optional[torch.Tensor] = None, + **kwargs, + ): + # disable gradient computation for the teacher model + with torch.no_grad(): + # compute the teacher (text-only) model's distribution + alt_inputs_embeds = self.get_input_embeddings().forward(alt_input_ids) + alt_lm_output = self.language_model.forward( + inputs_embeds=alt_inputs_embeds, + labels=alt_labels, + attention_mask=alt_attention_mask, + past_key_values=past_key_values, + **kwargs, + ) + # compute the KL divergence loss between the two models + kl_loss = F.kl_div( + F.log_softmax( + lm_output.logits[labels != -100] / self.loss_config.kl_temperature, + dim=-1, + ), + F.softmax( + alt_lm_output.logits[alt_labels != -100] + / self.loss_config.kl_temperature, + dim=-1, + ), + reduction="batchmean", + ) + return {"loss": kl_loss} + + def forward( + self, + input_ids: torch.Tensor, + audio_values: Optional[torch.FloatTensor] = None, + inputs_embeds: Optional[torch.FloatTensor] = None, + labels: Optional[torch.Tensor] = None, + attention_mask: Optional[torch.Tensor] = None, + audio_token_start_idx: Optional[torch.Tensor] = None, + audio_len: Optional[torch.Tensor] = None, + audio_token_len: Optional[torch.Tensor] = None, + past_key_values: Optional[Union[Tuple, transformers.cache_utils.Cache]] = None, + # the alt_* fields are needed for KL divergence loss + alt_input_ids: Optional[torch.Tensor] = None, + alt_attention_mask: Optional[torch.Tensor] = None, + alt_labels: Optional[torch.Tensor] = None, + **kwargs, + ) -> 
Union[Tuple, transformers.modeling_outputs.CausalLMOutputWithPast]: + """ + Forward pass for the Ultravox model. + + `input_ids` are the tokenized text input. They are embedded by the language model as usual. + `audio_values` are processed by the audio encoder and then every `stack_factor` frames are stacked together and + projected to the language model's embedding space using a few linear layers. + The audio and text embeddings are merged together. A special token `<|audio|>` is used to indicate the start + of the audio embeddings in the merged embeddings. + + Args: + input_ids: The tokenized text input. + audio_values: The processed audio values. + inputs_embeds: The embeddings for the input tokens. + labels: The tokenized text labels. + attention_mask: The attention mask for the input. + position_ids: The position ids for the input. + past_key_values: The past key value cache for the language model attention layers. + **kwargs: Additional keyword arguments. Passed directly to the language model. + """ + if inputs_embeds is None: + # B x T -> B x T x D + inputs_embeds = self.get_input_embeddings().forward(input_ids) + + if audio_values is not None: + assert ( + audio_token_start_idx is not None and audio_token_len is not None + ), "audio_token_start_idx and audio_token_len must be provided if audio_values are provided." + assert ( + len(audio_token_start_idx) == len(audio_token_len) == len(audio_values) + ), "audio_token_start_idx, audio_token_len, and audio_values must have the same batch size." + + # B x A/3200 x D + audio_tower_output = self.audio_tower.forward( + audio_values.to(self.audio_tower.dtype), + audio_len=audio_len, + ).last_hidden_state + audio_tower_output = audio_tower_output.to(inputs_embeds.dtype) + + audio_embeds = self.multi_modal_projector.forward(audio_tower_output) + + # combine audio and text embeddings + for i, (audio, start, length) in enumerate( + zip(audio_embeds, audio_token_start_idx, audio_token_len) + ): + length = min(length, audio.shape[0]) + inputs_embeds[i, start : start + length] = audio[:length] + + lm_output = self.language_model.forward( + inputs_embeds=inputs_embeds, + labels=labels, + attention_mask=attention_mask, + past_key_values=past_key_values, + **kwargs, + ) + if self.training: + if self.loss_config.loss_function == LossFunction.CrossEntropy: + return lm_output + elif self.loss_config.loss_function == LossFunction.KL_Divergence: + return self._compute_kl_loss( + lm_output=lm_output, + labels=labels, + past_key_values=past_key_values, + alt_input_ids=alt_input_ids, + alt_attention_mask=alt_attention_mask, + alt_labels=alt_labels, + **kwargs, + ) + else: + raise ValueError( + f"Unsupported loss function: {self.loss_config.loss_function}" + ) + else: + return lm_output + + def prepare_inputs_for_generation( + self, + input_ids: torch.Tensor, + audio_values: Optional[torch.FloatTensor] = None, + audio_token_start_idx: Optional[torch.Tensor] = None, + audio_token_len: Optional[torch.Tensor] = None, + audio_len: Optional[torch.Tensor] = None, + past_key_values: Optional[Union[Tuple, transformers.cache_utils.Cache]] = None, + attention_mask: Optional[torch.Tensor] = None, + inputs_embeds: Optional[torch.Tensor] = None, + cache_position: Optional[torch.Tensor] = None, + **kwargs, + ) -> Dict[str, Any]: + model_input = self.language_model.prepare_inputs_for_generation( + input_ids=input_ids, + past_key_values=past_key_values, + attention_mask=attention_mask, + inputs_embeds=inputs_embeds, + cache_position=cache_position, + **kwargs, + ) + + # 
include audio information in model_input only when it is needed during prefilling + # audio_token_start_idx should always be relative to the current cache position + prefill_start_idx = 0 if cache_position is None else cache_position[0] + if ( + audio_values is not None + and audio_token_start_idx is not None + and prefill_start_idx <= torch.max(audio_token_start_idx) + ): + model_input["audio_values"] = audio_values + model_input["audio_token_start_idx"] = ( + audio_token_start_idx - prefill_start_idx + ) + model_input["audio_token_len"] = audio_token_len + model_input["audio_len"] = audio_len + + return model_input + + @classmethod + def _create_multi_modal_projector( + cls, config: UltravoxConfig + ) -> "UltravoxProjector": + projector = UltravoxProjector(config) + projector.to(config.torch_dtype) + return projector + + @classmethod + def _create_audio_tower( + cls, config: UltravoxConfig + ) -> Union[transformers.Wav2Vec2Model, "ModifiedWhisperEncoder"]: + if config.audio_model_id is not None: + if "whisper" in config.audio_model_id: + audio_tower = ModifiedWhisperEncoder.from_pretrained( + config.audio_model_id, torch_dtype=config.torch_dtype + ) + audio_tower.init_latency_mask( + config.audio_latency_block_size, dtype=config.torch_dtype + ) + else: + assert config.audio_latency_block_size in ( + None, + 0, + ), "only whisper audio tower supports audio latency masking, got non-zero value for 'audio_latency_block_size'" + audio_tower = transformers.AutoModel.from_pretrained( + config.audio_model_id, torch_dtype=config.torch_dtype + ) + else: + if "whisper" in config.audio_config._name_or_path: + audio_tower = ModifiedWhisperEncoder(config.audio_config) + audio_tower.init_latency_mask( + config.audio_latency_block_size, dtype=config.torch_dtype + ) + else: + assert config.audio_latency_block_size in ( + None, + 0, + ), "only whisper audio tower supports audio latency masking, got non-zero value for 'audio_latency_block_size'" + with transformers.modeling_utils.no_init_weights(): + # we only ever use from_config if the weights are retrained, hence initializing is not + # required. This makes model creation much faster since init on CPU is quite slow. + audio_tower = transformers.AutoModel.from_config( + config.audio_config + ) + + if isinstance( + audio_tower, + (transformers.Wav2Vec2BertModel, transformers.WhisperModel), + ): + # For these models we only need the encoder part + # Wav2Vec2BertModel -> Wav2Vec2BertEncoder + # WhisperModel -> WhisperEncoder + audio_tower = audio_tower.encoder + + audio_tower = apply_lora(audio_tower, config.audio_model_lora_config) + return audio_tower + + @classmethod + def _create_language_model( + cls, config: UltravoxConfig + ) -> transformers.LlamaForCausalLM: + if config.text_model_id is not None: + language_model = transformers.AutoModelForCausalLM.from_pretrained( + config.text_model_id, + attn_implementation=config._attn_implementation, + torch_dtype=config.torch_dtype, + ) + else: + with transformers.modeling_utils.no_init_weights(): + # we only ever use from_config if the weights are retrained, hence initializing is not + # required. This makes model creation much faster since init on CPU is quite slow.
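Because config.json maps `AutoConfig`/`AutoModel` onto these modules via `auto_map`, loading the full checkpoint reduces to a single `from_pretrained` call with remote code enabled. A minimal sketch, assuming a hypothetical repo id (the actual Hub repo name is not shown in this diff):

```python
import transformers

# Placeholder repo id; substitute the Hub repo this checkpoint was pushed to.
model = transformers.AutoModel.from_pretrained(
    "someuser/ultravox-llama3.2-1b",  # hypothetical, not a real repo
    trust_remote_code=True,           # executes ultravox_config.py / ultravox_model.py
)
# Per config.json, _create_audio_tower() then fetches openai/whisper-large-v3-turbo
# and _create_language_model() fetches meta-llama/Llama-3.2-1B-Instruct at load time.
```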
+ language_model = transformers.AutoModelForCausalLM.from_config( + config.text_config, + attn_implementation=config._attn_implementation, + torch_dtype=config.torch_dtype, + ) + + language_model = apply_lora(language_model, config.text_model_lora_config) + return language_model + + def merge_and_unload(self): + if isinstance(self.language_model, peft.PeftModel): + self.language_model = self.language_model.merge_and_unload() + # no need to download base language model weights anymore, so we can remove the id + self.config.text_model_id = None + self.keep_params.update( + set( + [ + f"language_model.{name}" + for name, _ in self.language_model.named_parameters() + ] + ) + ) + + if isinstance(self.audio_tower, peft.PeftModel): + self.audio_tower = self.audio_tower.merge_and_unload() + # no need to download base audio model weights anymore, so we can remove the id + self.config.audio_model_id = None + self.keep_params.update( + set( + [ + f"audio_tower.{name}" + for name, _ in self.audio_tower.named_parameters() + ] + ) + ) + + for param in ["text_model_lora_config", "audio_model_lora_config"]: + if hasattr(self.config, param): + delattr(self.config, param) + + def push_to_hub(self, *args, **kwargs): + self.merge_and_unload() + self.to(self.language_model.dtype) + return super().push_to_hub(*args, **kwargs) + + def save_pretrained( + self, *args, state_dict: Optional[Dict[str, Any]] = None, **kwargs + ): + if state_dict is None: + state_dict = super().state_dict() + + named_params = dict(self.named_parameters()) + + state_dict = { + k: v + for k, v in state_dict.items() + if k in self.keep_params + or (k in named_params and named_params[k].requires_grad) + } + + super().save_pretrained(*args, state_dict=state_dict, **kwargs) + + def _pre_load_state_dict_hook(self, state_dict: Dict[str, Any], *args, **kwargs): + self.keep_params.update(set(state_dict.keys())) + + def print_trainable_parameters(self): + """ + Prints the number of trainable parameters in the model (reuses Peft model's method) + """ + count_params = peft.peft_model.PeftModel.get_nb_trainable_parameters + + trainable_params, all_param = count_params(self) + + logging.info( + f"trainable params: {trainable_params:,d} || all params: {all_param:,d}" + f" || trainable%: {100 * trainable_params / all_param:.1f}%" + ) + + lm_trainable_params, lm_all_params = count_params(self.language_model) + audio_trainable_params, audio_all_params = count_params(self.audio_tower) + + projector_trainable_params = ( + trainable_params - lm_trainable_params - audio_trainable_params + ) + projector_all_params = all_param - lm_all_params - audio_all_params + + logging.info( + f"Trainable%: " + f" LLM: {100 * lm_trainable_params / lm_all_params:.1f}%" + f" || Audio Encoder: {100 * audio_trainable_params / audio_all_params:.1f}%" + f" || Projector: {100 * projector_trainable_params / projector_all_params:.1f}%" + ) + + +def is_cache_empty( + past_key_values: Optional[Union[Tuple, transformers.cache_utils.Cache]] +) -> bool: + """ + Check if the cache is empty. + """ + if past_key_values is None: + return True + if isinstance(past_key_values, tuple): + return all(len(c) == 0 for c in past_key_values) + return past_key_values.get_seq_length() == 0 + + +def apply_lora(model: torch.nn.Module, lora_config: dict) -> torch.nn.Module: + """ + Applies LoRA finetuning to the model. If the `r` parameter is set to 0, the model is frozen instead. 
+ """ + lora_config = peft.LoraConfig(**lora_config or {}) + + if lora_config.r == 0: + # freeze the model entirely + for param in model.parameters(): + param.requires_grad = False + else: + model = peft.get_peft_model(model, lora_config) + + return model + + +class StackAudioFrames(nn.Module): + """ + Stack the audio embedding frames to reduce the sequence length by a factor of `stack_factor`. + + The number of output frames will be `ceil(T / stack_factor) + 1` where `T` is the number of input frames. + NOTE: the extra +1 is intentional: in case the number of audio tokens are over-estimated by the processor, + we want to make sure `processor.audio_token_replacement` (i.e. EOS) doesn't get leaked into the middle of embeddings. + In most cases this extra padding will get removed in the model's forward function so it has no effect. + """ + + def __init__(self, stack_factor: int = 8): + super().__init__() + self.stack_factor = stack_factor + + def forward(self, audio_embeds: torch.Tensor) -> torch.Tensor: + B, T, C = audio_embeds.shape + T_pad = (T + self.stack_factor - 1) // self.stack_factor * self.stack_factor + audio_embeds = F.pad(audio_embeds, (0, 0, 0, T_pad - T + self.stack_factor)) + B, T, C = audio_embeds.shape + audio_embeds = audio_embeds.view( + B, T // self.stack_factor, C * self.stack_factor + ) + return audio_embeds + + +class RMSNorm(transformers.models.llama.modeling_llama.LlamaRMSNorm): + def __init__(self, hidden_size: int, init: float = 1, eps: float = 1e-6): + super().__init__(hidden_size=hidden_size, eps=eps) + self.weight.data.fill_(init) + + +class SwiGLU(nn.Module): + def forward(self, x): + x, gate = x.chunk(2, dim=-1) + return F.silu(gate) * x + + +class UltravoxProjector(nn.Sequential): + def __init__(self, config: UltravoxConfig): + super().__init__() + self.hidden_dim = config.hidden_size + self._pad_and_stack = StackAudioFrames(config.stack_factor) + dim = config.audio_config.hidden_size * config.stack_factor + self.ln_pre = RMSNorm(dim, init=config.norm_init) + self.linear_1 = nn.Linear(dim, self.hidden_dim, bias=False) + dim = self.hidden_dim + self.act = transformers.activations.get_activation(config.projector_act) + dim = dim // 2 if config.projector_act == "swiglu" else dim + self.linear_2 = nn.Linear(dim, config.text_config.hidden_size, bias=False) + self.ln_post = RMSNorm(config.text_config.hidden_size, init=config.norm_init) + + def forward(self, audio_features: torch.Tensor) -> torch.Tensor: + audio_features = self._pad_and_stack(audio_features) + audio_features = self.ln_pre(audio_features) + hidden_states = self.linear_1(audio_features) + hidden_states = self.act(hidden_states) + hidden_states = self.linear_2(hidden_states) + hidden_states = self.ln_post(hidden_states) + return hidden_states + + +class ModifiedWhisperEncoder( + whisper.WhisperEncoder, transformers.modeling_utils.ModuleUtilsMixin +): + """ + Encoder portion of OpenAI's Whisper model. + + This implementation is a slightly modified version of HF Transformers' Whisper Encoder, with only a few fixes: + 1. base_model_prefix updated to allow for doing `.from_pretrained` directly on the encoder + 2. 
allow less than 30 seconds of audio padding to be passed in: + - relaxed ValueError check for `input_features` length to be less than or equal to `expected_seq_length` instead of strictly equal + - embed_pos is now sliced to match the length of `inputs_embeds` + + Original: https://github.com/huggingface/transformers/blob/main/src/transformers/models/whisper/modeling_whisper.py + """ + + base_model_prefix = "model.encoder" + _no_split_modules = ["WhisperEncoderLayer"] + + def init_latency_mask(self, audio_latency_block_size: Optional[int], dtype: torch.dtype): + if audio_latency_block_size is None: + self.audio_streaming_mask = None + return + + # maximum sequence length + max_seqlen = ( + self.config.max_source_positions + * self.conv1.stride[0] + * self.conv2.stride[0] + ) + assert ( + max_seqlen > 0 + ), f"maximum sequence length must be positive, got {max_seqlen}" + assert ( + max_seqlen % audio_latency_block_size == 0 + ), f"audio_latency_block_size {audio_latency_block_size} must divide {max_seqlen} evenly." + # Given the block size, we calculate the number of blocks. + audio_latency_nblocks = max_seqlen // audio_latency_block_size + audio_streaming_mask = ( + torch.tril( + torch.ones(audio_latency_nblocks, audio_latency_nblocks), + diagonal=0, + ) + .repeat_interleave(audio_latency_block_size, dim=0) + .repeat_interleave(audio_latency_block_size, dim=1) + ) + audio_streaming_mask = (1.0 - audio_streaming_mask) * torch.finfo(dtype).min + audio_streaming_mask = audio_streaming_mask[None, None, :, :] + self.register_buffer( + "audio_streaming_mask", audio_streaming_mask, persistent=False + ) + + def forward( + self, + input_features, + audio_len=None, + head_mask=None, + output_attentions=None, + output_hidden_states=None, + return_dict=None, + ): + expected_seq_length = ( + self.config.max_source_positions + * self.conv1.stride[0] + * self.conv2.stride[0] + ) + if input_features.shape[-1] > expected_seq_length: + raise ValueError( + f"Whisper expects the mel input features to be of length {expected_seq_length} or less, but found {input_features.shape[-1]}. Make sure to pad the input mel features to {expected_seq_length}."
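For intuition, the block-causal mask that `init_latency_mask` builds can be printed at a toy size. A standalone sketch, shrunk to max_seqlen=6 and audio_latency_block_size=2 (real values are much larger):

```python
import torch

# Toy dimensions: 6 frames in 2-frame latency blocks -> 3x3 block structure.
nblocks = 6 // 2
mask = (
    torch.tril(torch.ones(nblocks, nblocks), diagonal=0)
    .repeat_interleave(2, dim=0)
    .repeat_interleave(2, dim=1)
)
print(mask)
# tensor([[1., 1., 0., 0., 0., 0.],
#         [1., 1., 0., 0., 0., 0.],
#         [1., 1., 1., 1., 0., 0.],
#         [1., 1., 1., 1., 0., 0.],
#         [1., 1., 1., 1., 1., 1.],
#         [1., 1., 1., 1., 1., 1.]])
# Each frame attends within its own 2-frame block and to all earlier blocks;
# (1.0 - mask) * torch.finfo(dtype).min then turns the zeros into additive -inf bias.
```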
+ ) + + output_attentions = ( + output_attentions + if output_attentions is not None + else self.config.output_attentions + ) + output_hidden_states = ( + output_hidden_states + if output_hidden_states is not None + else self.config.output_hidden_states + ) + return_dict = ( + return_dict if return_dict is not None else self.config.use_return_dict + ) + inputs_embeds = nn.functional.gelu(self.conv1(input_features)) + inputs_embeds = nn.functional.gelu(self.conv2(inputs_embeds)) + + inputs_embeds = inputs_embeds.permute(0, 2, 1) + embed_pos = self.embed_positions.weight[: inputs_embeds.size(-2)] + + hidden_states = inputs_embeds + embed_pos + hidden_states = nn.functional.dropout( + hidden_states, p=self.dropout, training=self.training + ) + + encoder_states = () if output_hidden_states else None + all_attentions = () if output_attentions else None + + # Create attention mask based on audio lengths to mask out padding tokens + # For each sample in batch: + # - Convert raw audio length to feature length after convolutions + # - Create boolean mask that is True for valid positions and False for padding + # - Convert to extended attention mask format expected by transformer layers + # (0.0 for positions to attend to, large negative for positions to ignore) + # This masking ensures consistent behavior between training and inference + # by preventing the model from attending to padding tokens in both cases + attention_mask = None + if audio_len is not None: + audio_feature_len = self._get_feat_extract_output_lengths(audio_len) + max_seq_len = hidden_states.shape[1] + attention_mask = torch.arange(max_seq_len, device=hidden_states.device)[ + None, : + ].lt(audio_feature_len.view(-1, 1)) + attention_mask = self.get_extended_attention_mask( + attention_mask, + None, + device=hidden_states.device, + dtype=hidden_states.dtype, + ) + + if self.audio_streaming_mask is not None: + seqlen = hidden_states.size(-2) + if attention_mask is not None: + attention_mask = torch.minimum( + self.audio_streaming_mask[:, :, :seqlen, :seqlen], attention_mask + ) # merge + else: + attention_mask = self.audio_streaming_mask[:, :, :seqlen, :seqlen] + attention_mask = attention_mask.to(hidden_states.dtype) + + # check if head_mask has a correct number of layers specified if desired + if head_mask is not None: + assert head_mask.size()[0] == ( + len(self.layers) + ), f"The head_mask should be specified for {len(self.layers)} layers, but it is for {head_mask.size()[0]}."
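The length-based padding mask described in the comment block above is worth seeing in isolation. A standalone sketch with made-up feature lengths:

```python
import torch

# Two samples padded to 6 encoder frames, with 3 and 5 valid frames respectively.
audio_feature_len = torch.tensor([3, 5])
max_seq_len = 6
mask = torch.arange(max_seq_len)[None, :].lt(audio_feature_len.view(-1, 1))
print(mask)
# tensor([[ True,  True,  True, False, False, False],
#         [ True,  True,  True,  True,  True, False]])
# get_extended_attention_mask broadcasts this to [B, 1, 1, T], with 0.0 at
# valid positions and a large negative value at padding positions.
```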
+ + for idx, encoder_layer in enumerate(self.layers): + if output_hidden_states: + encoder_states = encoder_states + (hidden_states,) + # add LayerDrop (see https://arxiv.org/abs/1909.11556 for description) + to_drop = False + if self.training: + dropout_probability = torch.rand([]) + if dropout_probability < self.layerdrop: # skip the layer + to_drop = True + + if to_drop: + layer_outputs = (None, None) + else: + if self.gradient_checkpointing and self.training: + layer_outputs = self._gradient_checkpointing_func( + encoder_layer.__call__, + hidden_states, + attention_mask, + (head_mask[idx] if head_mask is not None else None), + output_attentions, + ) + else: + layer_outputs = encoder_layer( + hidden_states, + attention_mask, + layer_head_mask=( + head_mask[idx] if head_mask is not None else None + ), + output_attentions=output_attentions, + ) + + hidden_states = layer_outputs[0] + + if output_attentions: + all_attentions = all_attentions + (layer_outputs[1],) + + hidden_states = self.layer_norm(hidden_states) + if output_hidden_states: + encoder_states = encoder_states + (hidden_states,) + + if not return_dict: + return tuple( + v + for v in [hidden_states, encoder_states, all_attentions] + if v is not None + ) + return transformers.modeling_outputs.BaseModelOutput( + last_hidden_state=hidden_states, + hidden_states=encoder_states, + attentions=all_attentions, + ) + + +UltravoxConfig.register_for_auto_class() +UltravoxModel.register_for_auto_class() + +transformers.AutoConfig.register("ultravox", UltravoxConfig) +transformers.AutoModel.register(UltravoxConfig, UltravoxModel) + +transformers.activations.ACT2FN["swiglu"] = SwiGLU diff --git a/checkpoint-7200/config.json b/checkpoint-7200/config.json new file mode 100644 index 0000000000000000000000000000000000000000..0ce016ecf9d037d4afa627d419c26b5ddd28a2f8 --- /dev/null +++ b/checkpoint-7200/config.json @@ -0,0 +1,43 @@ +{ + "architectures": [ + "UltravoxModel" + ], + "audio_latency_block_size": null, + "audio_model_id": "openai/whisper-large-v3-turbo", + "audio_model_lora_config": { + "lora_alpha": 8, + "r": 0, + "target_modules": [ + "k_proj", + "q_proj", + "linear_k", + "linear_q" + ] + }, + "auto_map": { + "AutoConfig": "ultravox_config.UltravoxConfig", + "AutoModel": "ultravox_model.UltravoxModel" + }, + "hidden_size": 4096, + "ignore_index": -100, + "initializer_range": 0.02, + "model_type": "ultravox", + "norm_init": 0.4, + "pad_token_id": 128009, + "projector_act": "swiglu", + "stack_factor": 8, + "text_model_id": "meta-llama/Llama-3.2-1B-Instruct", + "text_model_lora_config": { + "lora_alpha": 8, + "r": 0, + "target_modules": [ + "k_proj", + "q_proj", + "linear_k", + "linear_q" + ] + }, + "torch_dtype": "bfloat16", + "transformers_version": "4.47.0", + "vocab_size": 128256 +} diff --git a/checkpoint-7200/generation_config.json b/checkpoint-7200/generation_config.json new file mode 100644 index 0000000000000000000000000000000000000000..4dac817850f65a6be4d01d824462c9fe54468763 --- /dev/null +++ b/checkpoint-7200/generation_config.json @@ -0,0 +1,11 @@ +{ + "_from_model_config": true, + "bos_token_id": 128000, + "eos_token_id": [ + 128001, + 128008, + 128009 + ], + "pad_token_id": 128009, + "transformers_version": "4.47.0" +} diff --git a/checkpoint-7200/model.safetensors b/checkpoint-7200/model.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..315087c24dab8036ebeddb2ebcc5ef1d852562b3 --- /dev/null +++ b/checkpoint-7200/model.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid 
sha256:9a03a8462cd861fa207927453b82db5d25ae96993e8bc8bcb061061b5f233792 +size 92299736 diff --git a/checkpoint-7200/optimizer.pt b/checkpoint-7200/optimizer.pt new file mode 100644 index 0000000000000000000000000000000000000000..58f3a17b2c4e661c87d1506b62ab174274f24848 --- /dev/null +++ b/checkpoint-7200/optimizer.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:b6ecb44d60282528d473d70738c9b0e1a9e11819f2aefac6bedeeb5b73291e26 +size 184602962 diff --git a/checkpoint-7200/rng_state.pth b/checkpoint-7200/rng_state.pth new file mode 100644 index 0000000000000000000000000000000000000000..d2eb34eaaee375c7ad4080cd61efd96f62f84cad --- /dev/null +++ b/checkpoint-7200/rng_state.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:28133004bd3f54b7faa1910be0d01d2fac0bd8891365545529d0dcbedee0f3e8 +size 14244 diff --git a/checkpoint-7200/scheduler.pt b/checkpoint-7200/scheduler.pt new file mode 100644 index 0000000000000000000000000000000000000000..61e1310f0accf1b7dd3a45d94bda48ee493d042b --- /dev/null +++ b/checkpoint-7200/scheduler.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:14976c1b3dd2b157698f792725a91270a736615d07a5dd5eae32d2ba4a41d313 +size 1064 diff --git a/checkpoint-7200/special_tokens_map.json b/checkpoint-7200/special_tokens_map.json new file mode 100644 index 0000000000000000000000000000000000000000..b43be96621d147110fb8a18b5776ec6e38516127 --- /dev/null +++ b/checkpoint-7200/special_tokens_map.json @@ -0,0 +1,17 @@ +{ + "bos_token": { + "content": "<|begin_of_text|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false + }, + "eos_token": { + "content": "<|eot_id|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false + }, + "pad_token": "<|eot_id|>" +} diff --git a/checkpoint-7200/tokenizer.json b/checkpoint-7200/tokenizer.json new file mode 100644 index 0000000000000000000000000000000000000000..1c1d8d5c9024994f1d3b00f9662b8dd89ca13cf2 --- /dev/null +++ b/checkpoint-7200/tokenizer.json @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:6b9e4e7fb171f92fd137b777cc2714bf87d11576700a1dcd7a399e7bbe39537b +size 17209920 diff --git a/checkpoint-7200/tokenizer_config.json b/checkpoint-7200/tokenizer_config.json new file mode 100644 index 0000000000000000000000000000000000000000..fb0e8efdce308a03b5350019cf1f24fa4375396f --- /dev/null +++ b/checkpoint-7200/tokenizer_config.json @@ -0,0 +1,2064 @@ +{ + "added_tokens_decoder": { + "128000": { + "content": "<|begin_of_text|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128001": { + "content": "<|end_of_text|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128002": { + "content": "<|reserved_special_token_0|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128003": { + "content": "<|reserved_special_token_1|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128004": { + "content": "<|finetune_right_pad_id|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128005": { + "content": "<|reserved_special_token_2|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128006": { + "content": "<|start_header_id|>", + 
"lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128007": { + "content": "<|end_header_id|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128008": { + "content": "<|eom_id|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128009": { + "content": "<|eot_id|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128010": { + "content": "<|python_tag|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128011": { + "content": "<|reserved_special_token_3|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128012": { + "content": "<|reserved_special_token_4|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128013": { + "content": "<|reserved_special_token_5|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128014": { + "content": "<|reserved_special_token_6|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128015": { + "content": "<|reserved_special_token_7|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128016": { + "content": "<|reserved_special_token_8|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128017": { + "content": "<|reserved_special_token_9|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128018": { + "content": "<|reserved_special_token_10|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128019": { + "content": "<|reserved_special_token_11|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128020": { + "content": "<|reserved_special_token_12|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128021": { + "content": "<|reserved_special_token_13|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128022": { + "content": "<|reserved_special_token_14|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128023": { + "content": "<|reserved_special_token_15|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128024": { + "content": "<|reserved_special_token_16|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128025": { + "content": "<|reserved_special_token_17|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128026": { + "content": "<|reserved_special_token_18|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128027": { + "content": "<|reserved_special_token_19|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": 
true + }, + "128028": { + "content": "<|reserved_special_token_20|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128029": { + "content": "<|reserved_special_token_21|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128030": { + "content": "<|reserved_special_token_22|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128031": { + "content": "<|reserved_special_token_23|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128032": { + "content": "<|reserved_special_token_24|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128033": { + "content": "<|reserved_special_token_25|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128034": { + "content": "<|reserved_special_token_26|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128035": { + "content": "<|reserved_special_token_27|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128036": { + "content": "<|reserved_special_token_28|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128037": { + "content": "<|reserved_special_token_29|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128038": { + "content": "<|reserved_special_token_30|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128039": { + "content": "<|reserved_special_token_31|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128040": { + "content": "<|reserved_special_token_32|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128041": { + "content": "<|reserved_special_token_33|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128042": { + "content": "<|reserved_special_token_34|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128043": { + "content": "<|reserved_special_token_35|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128044": { + "content": "<|reserved_special_token_36|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128045": { + "content": "<|reserved_special_token_37|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128046": { + "content": "<|reserved_special_token_38|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128047": { + "content": "<|reserved_special_token_39|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128048": { + "content": "<|reserved_special_token_40|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128049": { + 
"content": "<|reserved_special_token_41|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128050": { + "content": "<|reserved_special_token_42|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128051": { + "content": "<|reserved_special_token_43|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128052": { + "content": "<|reserved_special_token_44|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128053": { + "content": "<|reserved_special_token_45|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128054": { + "content": "<|reserved_special_token_46|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128055": { + "content": "<|reserved_special_token_47|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128056": { + "content": "<|reserved_special_token_48|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128057": { + "content": "<|reserved_special_token_49|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128058": { + "content": "<|reserved_special_token_50|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128059": { + "content": "<|reserved_special_token_51|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128060": { + "content": "<|reserved_special_token_52|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128061": { + "content": "<|reserved_special_token_53|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128062": { + "content": "<|reserved_special_token_54|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128063": { + "content": "<|reserved_special_token_55|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128064": { + "content": "<|reserved_special_token_56|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128065": { + "content": "<|reserved_special_token_57|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128066": { + "content": "<|reserved_special_token_58|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128067": { + "content": "<|reserved_special_token_59|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128068": { + "content": "<|reserved_special_token_60|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128069": { + "content": "<|reserved_special_token_61|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128070": { + "content": 
"<|reserved_special_token_62|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128071": { + "content": "<|reserved_special_token_63|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128072": { + "content": "<|reserved_special_token_64|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128073": { + "content": "<|reserved_special_token_65|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128074": { + "content": "<|reserved_special_token_66|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128075": { + "content": "<|reserved_special_token_67|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128076": { + "content": "<|reserved_special_token_68|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128077": { + "content": "<|reserved_special_token_69|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128078": { + "content": "<|reserved_special_token_70|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128079": { + "content": "<|reserved_special_token_71|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128080": { + "content": "<|reserved_special_token_72|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128081": { + "content": "<|reserved_special_token_73|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128082": { + "content": "<|reserved_special_token_74|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128083": { + "content": "<|reserved_special_token_75|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128084": { + "content": "<|reserved_special_token_76|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128085": { + "content": "<|reserved_special_token_77|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128086": { + "content": "<|reserved_special_token_78|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128087": { + "content": "<|reserved_special_token_79|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128088": { + "content": "<|reserved_special_token_80|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128089": { + "content": "<|reserved_special_token_81|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128090": { + "content": "<|reserved_special_token_82|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128091": { + "content": 
"<|reserved_special_token_83|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128092": { + "content": "<|reserved_special_token_84|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128093": { + "content": "<|reserved_special_token_85|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128094": { + "content": "<|reserved_special_token_86|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128095": { + "content": "<|reserved_special_token_87|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128096": { + "content": "<|reserved_special_token_88|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128097": { + "content": "<|reserved_special_token_89|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128098": { + "content": "<|reserved_special_token_90|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128099": { + "content": "<|reserved_special_token_91|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128100": { + "content": "<|reserved_special_token_92|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128101": { + "content": "<|reserved_special_token_93|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128102": { + "content": "<|reserved_special_token_94|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128103": { + "content": "<|reserved_special_token_95|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128104": { + "content": "<|reserved_special_token_96|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128105": { + "content": "<|reserved_special_token_97|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128106": { + "content": "<|reserved_special_token_98|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128107": { + "content": "<|reserved_special_token_99|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128108": { + "content": "<|reserved_special_token_100|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128109": { + "content": "<|reserved_special_token_101|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128110": { + "content": "<|reserved_special_token_102|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128111": { + "content": "<|reserved_special_token_103|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128112": { + "content": 
"<|reserved_special_token_104|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128113": { + "content": "<|reserved_special_token_105|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128114": { + "content": "<|reserved_special_token_106|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128115": { + "content": "<|reserved_special_token_107|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128116": { + "content": "<|reserved_special_token_108|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128117": { + "content": "<|reserved_special_token_109|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128118": { + "content": "<|reserved_special_token_110|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128119": { + "content": "<|reserved_special_token_111|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128120": { + "content": "<|reserved_special_token_112|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128121": { + "content": "<|reserved_special_token_113|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128122": { + "content": "<|reserved_special_token_114|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128123": { + "content": "<|reserved_special_token_115|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128124": { + "content": "<|reserved_special_token_116|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128125": { + "content": "<|reserved_special_token_117|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128126": { + "content": "<|reserved_special_token_118|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128127": { + "content": "<|reserved_special_token_119|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128128": { + "content": "<|reserved_special_token_120|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128129": { + "content": "<|reserved_special_token_121|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128130": { + "content": "<|reserved_special_token_122|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128131": { + "content": "<|reserved_special_token_123|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128132": { + "content": "<|reserved_special_token_124|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128133": { + "content": 
"<|reserved_special_token_125|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128134": { + "content": "<|reserved_special_token_126|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128135": { + "content": "<|reserved_special_token_127|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128136": { + "content": "<|reserved_special_token_128|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128137": { + "content": "<|reserved_special_token_129|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128138": { + "content": "<|reserved_special_token_130|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128139": { + "content": "<|reserved_special_token_131|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128140": { + "content": "<|reserved_special_token_132|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128141": { + "content": "<|reserved_special_token_133|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128142": { + "content": "<|reserved_special_token_134|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128143": { + "content": "<|reserved_special_token_135|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128144": { + "content": "<|reserved_special_token_136|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128145": { + "content": "<|reserved_special_token_137|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128146": { + "content": "<|reserved_special_token_138|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128147": { + "content": "<|reserved_special_token_139|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128148": { + "content": "<|reserved_special_token_140|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128149": { + "content": "<|reserved_special_token_141|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128150": { + "content": "<|reserved_special_token_142|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128151": { + "content": "<|reserved_special_token_143|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128152": { + "content": "<|reserved_special_token_144|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128153": { + "content": "<|reserved_special_token_145|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128154": { + "content": 
"<|reserved_special_token_146|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128155": { + "content": "<|reserved_special_token_147|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128156": { + "content": "<|reserved_special_token_148|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128157": { + "content": "<|reserved_special_token_149|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128158": { + "content": "<|reserved_special_token_150|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128159": { + "content": "<|reserved_special_token_151|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128160": { + "content": "<|reserved_special_token_152|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128161": { + "content": "<|reserved_special_token_153|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128162": { + "content": "<|reserved_special_token_154|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128163": { + "content": "<|reserved_special_token_155|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128164": { + "content": "<|reserved_special_token_156|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128165": { + "content": "<|reserved_special_token_157|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128166": { + "content": "<|reserved_special_token_158|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128167": { + "content": "<|reserved_special_token_159|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128168": { + "content": "<|reserved_special_token_160|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128169": { + "content": "<|reserved_special_token_161|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128170": { + "content": "<|reserved_special_token_162|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128171": { + "content": "<|reserved_special_token_163|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128172": { + "content": "<|reserved_special_token_164|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128173": { + "content": "<|reserved_special_token_165|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128174": { + "content": "<|reserved_special_token_166|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128175": { + "content": 
"<|reserved_special_token_167|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128176": { + "content": "<|reserved_special_token_168|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128177": { + "content": "<|reserved_special_token_169|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128178": { + "content": "<|reserved_special_token_170|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128179": { + "content": "<|reserved_special_token_171|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128180": { + "content": "<|reserved_special_token_172|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128181": { + "content": "<|reserved_special_token_173|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128182": { + "content": "<|reserved_special_token_174|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128183": { + "content": "<|reserved_special_token_175|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128184": { + "content": "<|reserved_special_token_176|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128185": { + "content": "<|reserved_special_token_177|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128186": { + "content": "<|reserved_special_token_178|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128187": { + "content": "<|reserved_special_token_179|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128188": { + "content": "<|reserved_special_token_180|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128189": { + "content": "<|reserved_special_token_181|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128190": { + "content": "<|reserved_special_token_182|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128191": { + "content": "<|reserved_special_token_183|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128192": { + "content": "<|reserved_special_token_184|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128193": { + "content": "<|reserved_special_token_185|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128194": { + "content": "<|reserved_special_token_186|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128195": { + "content": "<|reserved_special_token_187|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128196": { + "content": 
"<|reserved_special_token_188|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128197": { + "content": "<|reserved_special_token_189|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128198": { + "content": "<|reserved_special_token_190|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128199": { + "content": "<|reserved_special_token_191|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128200": { + "content": "<|reserved_special_token_192|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128201": { + "content": "<|reserved_special_token_193|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128202": { + "content": "<|reserved_special_token_194|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128203": { + "content": "<|reserved_special_token_195|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128204": { + "content": "<|reserved_special_token_196|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128205": { + "content": "<|reserved_special_token_197|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128206": { + "content": "<|reserved_special_token_198|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128207": { + "content": "<|reserved_special_token_199|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128208": { + "content": "<|reserved_special_token_200|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128209": { + "content": "<|reserved_special_token_201|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128210": { + "content": "<|reserved_special_token_202|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128211": { + "content": "<|reserved_special_token_203|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128212": { + "content": "<|reserved_special_token_204|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128213": { + "content": "<|reserved_special_token_205|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128214": { + "content": "<|reserved_special_token_206|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128215": { + "content": "<|reserved_special_token_207|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128216": { + "content": "<|reserved_special_token_208|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128217": { + "content": 
"<|reserved_special_token_209|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128218": { + "content": "<|reserved_special_token_210|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128219": { + "content": "<|reserved_special_token_211|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128220": { + "content": "<|reserved_special_token_212|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128221": { + "content": "<|reserved_special_token_213|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128222": { + "content": "<|reserved_special_token_214|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128223": { + "content": "<|reserved_special_token_215|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128224": { + "content": "<|reserved_special_token_216|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128225": { + "content": "<|reserved_special_token_217|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128226": { + "content": "<|reserved_special_token_218|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128227": { + "content": "<|reserved_special_token_219|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128228": { + "content": "<|reserved_special_token_220|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128229": { + "content": "<|reserved_special_token_221|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128230": { + "content": "<|reserved_special_token_222|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128231": { + "content": "<|reserved_special_token_223|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128232": { + "content": "<|reserved_special_token_224|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128233": { + "content": "<|reserved_special_token_225|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128234": { + "content": "<|reserved_special_token_226|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128235": { + "content": "<|reserved_special_token_227|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128236": { + "content": "<|reserved_special_token_228|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128237": { + "content": "<|reserved_special_token_229|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128238": { + "content": 
"<|reserved_special_token_230|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128239": { + "content": "<|reserved_special_token_231|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128240": { + "content": "<|reserved_special_token_232|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128241": { + "content": "<|reserved_special_token_233|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128242": { + "content": "<|reserved_special_token_234|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128243": { + "content": "<|reserved_special_token_235|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128244": { + "content": "<|reserved_special_token_236|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128245": { + "content": "<|reserved_special_token_237|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128246": { + "content": "<|reserved_special_token_238|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128247": { + "content": "<|reserved_special_token_239|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128248": { + "content": "<|reserved_special_token_240|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128249": { + "content": "<|reserved_special_token_241|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128250": { + "content": "<|reserved_special_token_242|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128251": { + "content": "<|reserved_special_token_243|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128252": { + "content": "<|reserved_special_token_244|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128253": { + "content": "<|reserved_special_token_245|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128254": { + "content": "<|reserved_special_token_246|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128255": { + "content": "<|reserved_special_token_247|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + } + }, + "bos_token": "<|begin_of_text|>", + "chat_template": "{{- bos_token }}\n{%- if custom_tools is defined %}\n {%- set tools = custom_tools %}\n{%- endif %}\n{%- if not tools_in_user_message is defined %}\n {%- set tools_in_user_message = true %}\n{%- endif %}\n{%- if not date_string is defined %}\n {%- if strftime_now is defined %}\n {%- set date_string = strftime_now(\"%d %b %Y\") %}\n {%- else %}\n {%- set date_string = \"26 Jul 2024\" %}\n {%- endif %}\n{%- endif %}\n{%- if not tools is defined %}\n {%- set tools = none 
%}\n{%- endif %}\n\n{#- This block extracts the system message, so we can slot it into the right place. #}\n{%- if messages[0]['role'] == 'system' %}\n {%- set system_message = messages[0]['content']|trim %}\n {%- set messages = messages[1:] %}\n{%- else %}\n {%- set system_message = \"\" %}\n{%- endif %}\n\n{#- System message #}\n{{- \"<|start_header_id|>system<|end_header_id|>\\n\\n\" }}\n{%- if tools is not none %}\n {{- \"Environment: ipython\\n\" }}\n{%- endif %}\n{{- \"Cutting Knowledge Date: December 2023\\n\" }}\n{{- \"Today Date: \" + date_string + \"\\n\\n\" }}\n{%- if tools is not none and not tools_in_user_message %}\n {{- \"You have access to the following functions. To call a function, please respond with JSON for a function call.\" }}\n {{- 'Respond in the format {\"name\": function name, \"parameters\": dictionary of argument name and its value}.' }}\n {{- \"Do not use variables.\\n\\n\" }}\n {%- for t in tools %}\n {{- t | tojson(indent=4) }}\n {{- \"\\n\\n\" }}\n {%- endfor %}\n{%- endif %}\n{{- system_message }}\n{{- \"<|eot_id|>\" }}\n\n{#- Custom tools are passed in a user message with some extra guidance #}\n{%- if tools_in_user_message and not tools is none %}\n {#- Extract the first user message so we can plug it in here #}\n {%- if messages | length != 0 %}\n {%- set first_user_message = messages[0]['content']|trim %}\n {%- set messages = messages[1:] %}\n {%- else %}\n {{- raise_exception(\"Cannot put tools in the first user message when there's no first user message!\") }}\n{%- endif %}\n {{- '<|start_header_id|>user<|end_header_id|>\\n\\n' -}}\n {{- \"Given the following functions, please respond with a JSON for a function call \" }}\n {{- \"with its proper arguments that best answers the given prompt.\\n\\n\" }}\n {{- 'Respond in the format {\"name\": function name, \"parameters\": dictionary of argument name and its value}.' 
}}\n {{- \"Do not use variables.\\n\\n\" }}\n {%- for t in tools %}\n {{- t | tojson(indent=4) }}\n {{- \"\\n\\n\" }}\n {%- endfor %}\n {{- first_user_message + \"<|eot_id|>\"}}\n{%- endif %}\n\n{%- for message in messages %}\n {%- if not (message.role == 'ipython' or message.role == 'tool' or 'tool_calls' in message) %}\n {{- '<|start_header_id|>' + message['role'] + '<|end_header_id|>\\n\\n'+ message['content'] | trim + '<|eot_id|>' }}\n {%- elif 'tool_calls' in message %}\n {%- if not message.tool_calls|length == 1 %}\n {{- raise_exception(\"This model only supports single tool-calls at once!\") }}\n {%- endif %}\n {%- set tool_call = message.tool_calls[0].function %}\n {{- '<|start_header_id|>assistant<|end_header_id|>\\n\\n' -}}\n {{- '{\"name\": \"' + tool_call.name + '\", ' }}\n {{- '\"parameters\": ' }}\n {{- tool_call.arguments | tojson }}\n {{- \"}\" }}\n {{- \"<|eot_id|>\" }}\n {%- elif message.role == \"tool\" or message.role == \"ipython\" %}\n {{- \"<|start_header_id|>ipython<|end_header_id|>\\n\\n\" }}\n {%- if message.content is mapping or message.content is iterable %}\n {{- message.content | tojson }}\n {%- else %}\n {{- message.content }}\n {%- endif %}\n {{- \"<|eot_id|>\" }}\n {%- endif %}\n{%- endfor %}\n{%- if add_generation_prompt %}\n {{- '<|start_header_id|>assistant<|end_header_id|>\\n\\n' }}\n{%- endif %}\n", + "clean_up_tokenization_spaces": true, + "eos_token": "<|eot_id|>", + "extra_special_tokens": {}, + "model_input_names": [ + "input_ids", + "attention_mask" + ], + "model_max_length": 131072, + "pad_token": "<|eot_id|>", + "tokenizer_class": "PreTrainedTokenizerFast" +} diff --git a/checkpoint-7200/trainer_state.json b/checkpoint-7200/trainer_state.json new file mode 100644 index 0000000000000000000000000000000000000000..6b522a8ad3fb127de31109721a37556066a1f2d6 --- /dev/null +++ b/checkpoint-7200/trainer_state.json @@ -0,0 +1,733 @@ +{ + "best_metric": null, + "best_model_checkpoint": null, + "epoch": 0.12499782989878648, + "eval_steps": 1000, + "global_step": 7200, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "epoch": 1.736080970816479e-05, + "grad_norm": 10.5625, + "learning_rate": 2e-06, + "loss": 1.0, + "step": 1 + }, + { + "epoch": 0.001736080970816479, + "grad_norm": 0.1513671875, + "learning_rate": 0.0002, + "loss": 0.3311, + "step": 100 + }, + { + "epoch": 0.003472161941632958, + "grad_norm": 0.16796875, + "learning_rate": 0.0004, + "loss": 0.2169, + "step": 200 + }, + { + "epoch": 0.005208242912449436, + "grad_norm": 0.10400390625, + "learning_rate": 0.0006, + "loss": 0.2032, + "step": 300 + }, + { + "epoch": 0.006944323883265916, + "grad_norm": 0.11279296875, + "learning_rate": 0.0008, + "loss": 0.188, + "step": 400 + }, + { + "epoch": 0.008680404854082394, + "grad_norm": 0.10107421875, + "learning_rate": 0.001, + "loss": 0.1758, + "step": 500 + }, + { + "epoch": 0.010416485824898873, + "grad_norm": 0.09521484375, + "learning_rate": 0.0012, + "loss": 0.1637, + "step": 600 + }, + { + "epoch": 0.012152566795715351, + "grad_norm": 0.08154296875, + "learning_rate": 0.0014, + "loss": 0.1518, + "step": 700 + }, + { + "epoch": 0.013888647766531832, + "grad_norm": 0.08642578125, + "learning_rate": 0.0016, + "loss": 0.1485, + "step": 800 + }, + { + "epoch": 0.01562472873734831, + "grad_norm": 0.1044921875, + "learning_rate": 0.0018000000000000002, + "loss": 0.1433, + "step": 900 + }, + { + "epoch": 0.01736080970816479, + "grad_norm": 0.05419921875, + "learning_rate": 0.002, + 
"loss": 0.139, + "step": 1000 + }, + { + "epoch": 0.01736080970816479, + "eval_covost2-en-de_loss": 1.896493673324585, + "eval_covost2-en-de_model_preparation_time": 0.0057, + "eval_covost2-en-de_runtime": 9.8697, + "eval_covost2-en-de_samples_per_second": 6.485, + "eval_covost2-en-de_steps_per_second": 0.811, + "step": 1000 + }, + { + "epoch": 0.01736080970816479, + "eval_covost2-zh-en_loss": 3.1452860832214355, + "eval_covost2-zh-en_model_preparation_time": 0.0057, + "eval_covost2-zh-en_runtime": 8.3732, + "eval_covost2-zh-en_samples_per_second": 7.643, + "eval_covost2-zh-en_steps_per_second": 0.955, + "step": 1000 + }, + { + "epoch": 0.01736080970816479, + "eval_peoplespeech-clean-transcription_loss": 3.2206106185913086, + "eval_peoplespeech-clean-transcription_model_preparation_time": 0.0057, + "eval_peoplespeech-clean-transcription_runtime": 9.6941, + "eval_peoplespeech-clean-transcription_samples_per_second": 6.602, + "eval_peoplespeech-clean-transcription_steps_per_second": 0.825, + "step": 1000 + }, + { + "epoch": 0.01909689067898127, + "grad_norm": 0.059814453125, + "learning_rate": 0.001999725185109816, + "loss": 0.1334, + "step": 1100 + }, + { + "epoch": 0.020832971649797746, + "grad_norm": 0.07373046875, + "learning_rate": 0.0019989008914857113, + "loss": 0.1288, + "step": 1200 + }, + { + "epoch": 0.022569052620614226, + "grad_norm": 0.049560546875, + "learning_rate": 0.00199752757218401, + "loss": 0.1262, + "step": 1300 + }, + { + "epoch": 0.024305133591430703, + "grad_norm": 0.0517578125, + "learning_rate": 0.001995605982021898, + "loss": 0.1222, + "step": 1400 + }, + { + "epoch": 0.026041214562247183, + "grad_norm": 0.058349609375, + "learning_rate": 0.0019931371771625545, + "loss": 0.1193, + "step": 1500 + }, + { + "epoch": 0.027777295533063663, + "grad_norm": 0.0498046875, + "learning_rate": 0.001990122514534651, + "loss": 0.1196, + "step": 1600 + }, + { + "epoch": 0.02951337650388014, + "grad_norm": 0.05517578125, + "learning_rate": 0.0019865636510865464, + "loss": 0.115, + "step": 1700 + }, + { + "epoch": 0.03124945747469662, + "grad_norm": 0.044677734375, + "learning_rate": 0.001982462542875576, + "loss": 0.115, + "step": 1800 + }, + { + "epoch": 0.0329855384455131, + "grad_norm": 0.05419921875, + "learning_rate": 0.001977821443992945, + "loss": 0.1125, + "step": 1900 + }, + { + "epoch": 0.03472161941632958, + "grad_norm": 0.047119140625, + "learning_rate": 0.001972642905324813, + "loss": 0.1094, + "step": 2000 + }, + { + "epoch": 0.03472161941632958, + "eval_covost2-en-de_loss": 1.6700351238250732, + "eval_covost2-en-de_model_preparation_time": 0.0057, + "eval_covost2-en-de_runtime": 8.1279, + "eval_covost2-en-de_samples_per_second": 7.874, + "eval_covost2-en-de_steps_per_second": 0.984, + "step": 2000 + }, + { + "epoch": 0.03472161941632958, + "eval_covost2-zh-en_loss": 3.093877077102661, + "eval_covost2-zh-en_model_preparation_time": 0.0057, + "eval_covost2-zh-en_runtime": 8.1488, + "eval_covost2-zh-en_samples_per_second": 7.854, + "eval_covost2-zh-en_steps_per_second": 0.982, + "step": 2000 + }, + { + "epoch": 0.03472161941632958, + "eval_peoplespeech-clean-transcription_loss": 2.478968620300293, + "eval_peoplespeech-clean-transcription_model_preparation_time": 0.0057, + "eval_peoplespeech-clean-transcription_runtime": 9.5507, + "eval_peoplespeech-clean-transcription_samples_per_second": 6.701, + "eval_peoplespeech-clean-transcription_steps_per_second": 0.838, + "step": 2000 + }, + { + "epoch": 0.036457700387146054, + "grad_norm": 0.048583984375, + "learning_rate": 
0.0019669297731502505, + "loss": 0.1077, + "step": 2100 + }, + { + "epoch": 0.03819378135796254, + "grad_norm": 0.054443359375, + "learning_rate": 0.00196068518757684, + "loss": 0.1069, + "step": 2200 + }, + { + "epoch": 0.039929862328779014, + "grad_norm": 0.047119140625, + "learning_rate": 0.001953912580814779, + "loss": 0.1043, + "step": 2300 + }, + { + "epoch": 0.04166594329959549, + "grad_norm": 0.044921875, + "learning_rate": 0.0019466156752904343, + "loss": 0.1035, + "step": 2400 + }, + { + "epoch": 0.043402024270411975, + "grad_norm": 0.050537109375, + "learning_rate": 0.0019387984816003866, + "loss": 0.1033, + "step": 2500 + }, + { + "epoch": 0.04513810524122845, + "grad_norm": 0.056396484375, + "learning_rate": 0.0019304652963070869, + "loss": 0.102, + "step": 2600 + }, + { + "epoch": 0.04687418621204493, + "grad_norm": 0.046875, + "learning_rate": 0.0019216206995773372, + "loss": 0.0998, + "step": 2700 + }, + { + "epoch": 0.048610267182861405, + "grad_norm": 0.042236328125, + "learning_rate": 0.0019122695526648968, + "loss": 0.1002, + "step": 2800 + }, + { + "epoch": 0.05034634815367789, + "grad_norm": 0.04638671875, + "learning_rate": 0.0019024169952385887, + "loss": 0.0978, + "step": 2900 + }, + { + "epoch": 0.052082429124494366, + "grad_norm": 0.05126953125, + "learning_rate": 0.0018920684425573864, + "loss": 0.097, + "step": 3000 + }, + { + "epoch": 0.052082429124494366, + "eval_covost2-en-de_loss": 1.749150276184082, + "eval_covost2-en-de_model_preparation_time": 0.0057, + "eval_covost2-en-de_runtime": 8.1948, + "eval_covost2-en-de_samples_per_second": 7.81, + "eval_covost2-en-de_steps_per_second": 0.976, + "step": 3000 + }, + { + "epoch": 0.052082429124494366, + "eval_covost2-zh-en_loss": 3.198117971420288, + "eval_covost2-zh-en_model_preparation_time": 0.0057, + "eval_covost2-zh-en_runtime": 8.1979, + "eval_covost2-zh-en_samples_per_second": 7.807, + "eval_covost2-zh-en_steps_per_second": 0.976, + "step": 3000 + }, + { + "epoch": 0.052082429124494366, + "eval_peoplespeech-clean-transcription_loss": 2.345036506652832, + "eval_peoplespeech-clean-transcription_model_preparation_time": 0.0057, + "eval_peoplespeech-clean-transcription_runtime": 11.4402, + "eval_peoplespeech-clean-transcription_samples_per_second": 5.594, + "eval_peoplespeech-clean-transcription_steps_per_second": 0.699, + "step": 3000 + }, + { + "epoch": 0.05381851009531084, + "grad_norm": 0.06494140625, + "learning_rate": 0.0018812295824940284, + "loss": 0.0955, + "step": 3100 + }, + { + "epoch": 0.055554591066127326, + "grad_norm": 0.044677734375, + "learning_rate": 0.0018699063724087904, + "loss": 0.0951, + "step": 3200 + }, + { + "epoch": 0.0572906720369438, + "grad_norm": 0.0390625, + "learning_rate": 0.0018581050358751443, + "loss": 0.0947, + "step": 3300 + }, + { + "epoch": 0.05902675300776028, + "grad_norm": 0.056396484375, + "learning_rate": 0.0018458320592590974, + "loss": 0.0939, + "step": 3400 + }, + { + "epoch": 0.060762833978576763, + "grad_norm": 0.047119140625, + "learning_rate": 0.0018330941881540914, + "loss": 0.0941, + "step": 3500 + }, + { + "epoch": 0.06249891494939324, + "grad_norm": 0.046630859375, + "learning_rate": 0.0018198984236734246, + "loss": 0.0927, + "step": 3600 + }, + { + "epoch": 0.06423499592020972, + "grad_norm": 0.055419921875, + "learning_rate": 0.0018062520186022297, + "loss": 0.0948, + "step": 3700 + }, + { + "epoch": 0.0659710768910262, + "grad_norm": 0.046142578125, + "learning_rate": 0.0017921624734111292, + "loss": 0.09, + "step": 3800 + }, + { + "epoch": 
0.06770715786184267, + "grad_norm": 0.04736328125, + "learning_rate": 0.001777637532133752, + "loss": 0.0926, + "step": 3900 + }, + { + "epoch": 0.06944323883265915, + "grad_norm": 0.048828125, + "learning_rate": 0.0017626851781103819, + "loss": 0.0906, + "step": 4000 + }, + { + "epoch": 0.06944323883265915, + "eval_covost2-en-de_loss": 1.7936017513275146, + "eval_covost2-en-de_model_preparation_time": 0.0057, + "eval_covost2-en-de_runtime": 8.0356, + "eval_covost2-en-de_samples_per_second": 7.965, + "eval_covost2-en-de_steps_per_second": 0.996, + "step": 4000 + }, + { + "epoch": 0.06944323883265915, + "eval_covost2-zh-en_loss": 3.2699265480041504, + "eval_covost2-zh-en_model_preparation_time": 0.0057, + "eval_covost2-zh-en_runtime": 9.5779, + "eval_covost2-zh-en_samples_per_second": 6.682, + "eval_covost2-zh-en_steps_per_second": 0.835, + "step": 4000 + }, + { + "epoch": 0.06944323883265915, + "eval_peoplespeech-clean-transcription_loss": 2.3380110263824463, + "eval_peoplespeech-clean-transcription_model_preparation_time": 0.0057, + "eval_peoplespeech-clean-transcription_runtime": 9.5943, + "eval_peoplespeech-clean-transcription_samples_per_second": 6.671, + "eval_peoplespeech-clean-transcription_steps_per_second": 0.834, + "step": 4000 + }, + { + "epoch": 0.07117931980347564, + "grad_norm": 0.041259765625, + "learning_rate": 0.001747313629600077, + "loss": 0.0926, + "step": 4100 + }, + { + "epoch": 0.07291540077429211, + "grad_norm": 0.05322265625, + "learning_rate": 0.001731531335263669, + "loss": 0.0907, + "step": 4200 + }, + { + "epoch": 0.07465148174510859, + "grad_norm": 0.05126953125, + "learning_rate": 0.0017153469695201276, + "loss": 0.0898, + "step": 4300 + }, + { + "epoch": 0.07638756271592508, + "grad_norm": 0.061767578125, + "learning_rate": 0.0016987694277788418, + "loss": 0.0876, + "step": 4400 + }, + { + "epoch": 0.07812364368674155, + "grad_norm": 0.042724609375, + "learning_rate": 0.001681807821550438, + "loss": 0.0874, + "step": 4500 + }, + { + "epoch": 0.07985972465755803, + "grad_norm": 0.05126953125, + "learning_rate": 0.0016644714734388218, + "loss": 0.0865, + "step": 4600 + }, + { + "epoch": 0.08159580562837451, + "grad_norm": 0.042724609375, + "learning_rate": 0.0016467699120171987, + "loss": 0.0866, + "step": 4700 + }, + { + "epoch": 0.08333188659919098, + "grad_norm": 0.0419921875, + "learning_rate": 0.001628712866590885, + "loss": 0.0864, + "step": 4800 + }, + { + "epoch": 0.08506796757000747, + "grad_norm": 0.051513671875, + "learning_rate": 0.0016103102618497923, + "loss": 0.0862, + "step": 4900 + }, + { + "epoch": 0.08680404854082395, + "grad_norm": 0.052734375, + "learning_rate": 0.0015915722124135226, + "loss": 0.0855, + "step": 5000 + }, + { + "epoch": 0.08680404854082395, + "eval_covost2-en-de_loss": 1.7862941026687622, + "eval_covost2-en-de_model_preparation_time": 0.0057, + "eval_covost2-en-de_runtime": 8.2861, + "eval_covost2-en-de_samples_per_second": 7.724, + "eval_covost2-en-de_steps_per_second": 0.965, + "step": 5000 + }, + { + "epoch": 0.08680404854082395, + "eval_covost2-zh-en_loss": 3.33290433883667, + "eval_covost2-zh-en_model_preparation_time": 0.0057, + "eval_covost2-zh-en_runtime": 8.4063, + "eval_covost2-zh-en_samples_per_second": 7.613, + "eval_covost2-zh-en_steps_per_second": 0.952, + "step": 5000 + }, + { + "epoch": 0.08680404854082395, + "eval_peoplespeech-clean-transcription_loss": 2.2601113319396973, + "eval_peoplespeech-clean-transcription_model_preparation_time": 0.0057, + "eval_peoplespeech-clean-transcription_runtime": 9.4946, + 
"eval_peoplespeech-clean-transcription_samples_per_second": 6.741, + "eval_peoplespeech-clean-transcription_steps_per_second": 0.843, + "step": 5000 + }, + { + "epoch": 0.08854012951164042, + "grad_norm": 0.053466796875, + "learning_rate": 0.001572509017272072, + "loss": 0.0872, + "step": 5100 + }, + { + "epoch": 0.0902762104824569, + "grad_norm": 0.044189453125, + "learning_rate": 0.0015531311541251993, + "loss": 0.0859, + "step": 5200 + }, + { + "epoch": 0.09201229145327339, + "grad_norm": 0.052978515625, + "learning_rate": 0.0015334492736235703, + "loss": 0.085, + "step": 5300 + }, + { + "epoch": 0.09374837242408986, + "grad_norm": 0.04833984375, + "learning_rate": 0.0015134741935148419, + "loss": 0.0844, + "step": 5400 + }, + { + "epoch": 0.09548445339490634, + "grad_norm": 0.047119140625, + "learning_rate": 0.0014932168926979072, + "loss": 0.0844, + "step": 5500 + }, + { + "epoch": 0.09722053436572281, + "grad_norm": 0.05029296875, + "learning_rate": 0.0014726885051885652, + "loss": 0.0856, + "step": 5600 + }, + { + "epoch": 0.0989566153365393, + "grad_norm": 0.049560546875, + "learning_rate": 0.0014519003139999338, + "loss": 0.0841, + "step": 5700 + }, + { + "epoch": 0.10069269630735578, + "grad_norm": 0.056884765625, + "learning_rate": 0.0014308637449409706, + "loss": 0.0841, + "step": 5800 + }, + { + "epoch": 0.10242877727817225, + "grad_norm": 0.041015625, + "learning_rate": 0.0014095903603365066, + "loss": 0.0825, + "step": 5900 + }, + { + "epoch": 0.10416485824898873, + "grad_norm": 0.048583984375, + "learning_rate": 0.0013880918526722496, + "loss": 0.0828, + "step": 6000 + }, + { + "epoch": 0.10416485824898873, + "eval_covost2-en-de_loss": 1.8097732067108154, + "eval_covost2-en-de_model_preparation_time": 0.0057, + "eval_covost2-en-de_runtime": 8.2052, + "eval_covost2-en-de_samples_per_second": 7.8, + "eval_covost2-en-de_steps_per_second": 0.975, + "step": 6000 + }, + { + "epoch": 0.10416485824898873, + "eval_covost2-zh-en_loss": 3.331326961517334, + "eval_covost2-zh-en_model_preparation_time": 0.0057, + "eval_covost2-zh-en_runtime": 8.2653, + "eval_covost2-zh-en_samples_per_second": 7.743, + "eval_covost2-zh-en_steps_per_second": 0.968, + "step": 6000 + }, + { + "epoch": 0.10416485824898873, + "eval_peoplespeech-clean-transcription_loss": 2.250232219696045, + "eval_peoplespeech-clean-transcription_model_preparation_time": 0.0057, + "eval_peoplespeech-clean-transcription_runtime": 9.4708, + "eval_peoplespeech-clean-transcription_samples_per_second": 6.758, + "eval_peoplespeech-clean-transcription_steps_per_second": 0.845, + "step": 6000 + }, + { + "epoch": 0.10590093921980522, + "grad_norm": 0.04443359375, + "learning_rate": 0.0013663800381682463, + "loss": 0.0819, + "step": 6100 + }, + { + "epoch": 0.10763702019062169, + "grad_norm": 0.05419921875, + "learning_rate": 0.0013444668502843329, + "loss": 0.08, + "step": 6200 + }, + { + "epoch": 0.10937310116143817, + "grad_norm": 0.0478515625, + "learning_rate": 0.0013223643331611537, + "loss": 0.0805, + "step": 6300 + }, + { + "epoch": 0.11110918213225465, + "grad_norm": 0.051513671875, + "learning_rate": 0.001300084635000341, + "loss": 0.0799, + "step": 6400 + }, + { + "epoch": 0.11284526310307112, + "grad_norm": 0.0498046875, + "learning_rate": 0.0012776400013875004, + "loss": 0.0807, + "step": 6500 + }, + { + "epoch": 0.1145813440738876, + "grad_norm": 0.050537109375, + "learning_rate": 0.0012550427685616766, + "loss": 0.0799, + "step": 6600 + }, + { + "epoch": 0.11631742504470409, + "grad_norm": 0.05029296875, + 
"learning_rate": 0.0012323053566349834, + "loss": 0.0802, + "step": 6700 + }, + { + "epoch": 0.11805350601552056, + "grad_norm": 0.047119140625, + "learning_rate": 0.0012094402627661448, + "loss": 0.0796, + "step": 6800 + }, + { + "epoch": 0.11978958698633704, + "grad_norm": 0.044677734375, + "learning_rate": 0.0011864600542916813, + "loss": 0.0784, + "step": 6900 + }, + { + "epoch": 0.12152566795715353, + "grad_norm": 0.0478515625, + "learning_rate": 0.0011633773618185302, + "loss": 0.0808, + "step": 7000 + }, + { + "epoch": 0.12152566795715353, + "eval_covost2-en-de_loss": 1.7786378860473633, + "eval_covost2-en-de_model_preparation_time": 0.0057, + "eval_covost2-en-de_runtime": 8.0291, + "eval_covost2-en-de_samples_per_second": 7.971, + "eval_covost2-en-de_steps_per_second": 0.996, + "step": 7000 + }, + { + "epoch": 0.12152566795715353, + "eval_covost2-zh-en_loss": 3.273571252822876, + "eval_covost2-zh-en_model_preparation_time": 0.0057, + "eval_covost2-zh-en_runtime": 8.3234, + "eval_covost2-zh-en_samples_per_second": 7.689, + "eval_covost2-zh-en_steps_per_second": 0.961, + "step": 7000 + }, + { + "epoch": 0.12152566795715353, + "eval_peoplespeech-clean-transcription_loss": 2.2290830612182617, + "eval_peoplespeech-clean-transcription_model_preparation_time": 0.0057, + "eval_peoplespeech-clean-transcription_runtime": 9.7693, + "eval_peoplespeech-clean-transcription_samples_per_second": 6.551, + "eval_peoplespeech-clean-transcription_steps_per_second": 0.819, + "step": 7000 + }, + { + "epoch": 0.12326174892797, + "grad_norm": 0.0478515625, + "learning_rate": 0.0011402048722818862, + "loss": 0.0786, + "step": 7100 + }, + { + "epoch": 0.12499782989878648, + "grad_norm": 0.049560546875, + "learning_rate": 0.0011169553219720827, + "loss": 0.0795, + "step": 7200 + } + ], + "logging_steps": 100, + "max_steps": 14400, + "num_input_tokens_seen": 0, + "num_train_epochs": 1, + "save_steps": 3600, + "stateful_callbacks": { + "TrainerControl": { + "args": { + "should_epoch_stop": false, + "should_evaluate": false, + "should_log": false, + "should_save": true, + "should_training_stop": false + }, + "attributes": {} + } + }, + "total_flos": 3.3101190265194086e+17, + "train_batch_size": 24, + "trial_name": null, + "trial_params": null +} diff --git a/checkpoint-7200/training_args.bin b/checkpoint-7200/training_args.bin new file mode 100644 index 0000000000000000000000000000000000000000..83b5b195c1720ea55bc992756c0ab6f1e2ef4671 --- /dev/null +++ b/checkpoint-7200/training_args.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:8c1fd9b1621955a41f9e58d5a3e2b2d6a70bd41f5404ebfa5cb0ca999c290090 +size 5688 diff --git a/checkpoint-7200/ultravox_config.py b/checkpoint-7200/ultravox_config.py new file mode 100644 index 0000000000000000000000000000000000000000..3671250d6b6a69604473ed1a526484c8c9a77f68 --- /dev/null +++ b/checkpoint-7200/ultravox_config.py @@ -0,0 +1,170 @@ +import dataclasses +from enum import Enum +from typing import Any, Dict, List, Optional + +import transformers + + +@dataclasses.dataclass +class LoraConfigSimplified: + """ + Low Rank Approximation (LoRA) configuration. + + Used for language and audio models separately. 
+ """ + + # The rank of the approximation + r: int = 0 + lora_alpha: float = 8 + target_modules: Optional[List[str]] = dataclasses.field( + default_factory=lambda: ["k_proj", "q_proj", "linear_k", "linear_q"] + ) + + +class LossFunction(str, Enum): + CrossEntropy = "ce" + KL_Divergence = "kl" + + +@dataclasses.dataclass +class LossConfig: + loss_function: LossFunction = LossFunction.KL_Divergence + kl_temperature: float = 2.0 + + @property + def requires_alt_fields(self): + return self.loss_function == LossFunction.KL_Divergence + + +class UltravoxConfig(transformers.PretrainedConfig): + r""" + This is the configuration class to store the configuration of a [`UltravoxForConditionalGeneration`]. It is used to instantiate an + Ultravox model according to the specified arguments, defining the model architecture. + + Configuration objects inherit from [`PretrainedConfig`] and can be used to control the model outputs. Read the + documentation from [`PretrainedConfig`] for more information. + + Args: + audio_config (`Wav2Vec2Config`, *optional*): + Custom audio config or dict + text_config (`Union[AutoConfig, dict]`, *optional*): + The config object of the text backbone. Can be any of `LlamaConfig` or `MistralConfig`. + ignore_index (`int`, *optional*, defaults to -100): + The ignore index for the loss function. + audio_token_index (`int`, *optional*, defaults to 32000): + The audio token index to encode the audio prompt. + stack_factor (`int`, *optional*, defaults to 8): + Audio downsampling factor for the multimodal projector. + norm_init (`float`, *optional*, defaults to 0.4): + The initialization value for the layer normalization. + projector_act (`str`, *optional*, defaults to `"swiglu"`): + The activation function used by the multimodal projector. + text_model_lora_config (`LoraConfigSimplified`, *optional*): + The LoRA configuration for finetuning the text model. + audio_model_lora_config (`LoraConfigSimplified`, *optional*): + The LoRA configuration for finetuning the audio model. + audio_latency_block_size (`int`, *optional*, defaults to `None`): + The latency block size for simulating audio streaming. 
+ + + Example: + + ```python + >>> from transformers import UltravoxForConditionalGeneration, Wav2Vec2Config, UltravoxConfig, LlamaConfig + + >>> # Initializing an audio encoder config + >>> audio_config = Wav2Vec2Config() + + >>> # Initializing a Llama config + >>> text_config = LlamaConfig() + + >>> # Initializing a default configuration + >>> configuration = UltravoxConfig(audio_config, text_config) + + >>> # Initializing a completely untrained model from the configuration + >>> model = UltravoxForConditionalGeneration(configuration) + + >>> # Accessing the model configuration + >>> configuration = model.config + + >>> # Initialize a model from pretrained checkpoints and random projector weights + >>> config = UltravoxConfig(audio_model_id="facebook/wav2vec2-base-960h", text_model_id="meta-llama/Llama-2-7b-chat-hf") + ```""" + + model_type = "ultravox" + is_composition = False + + def __init__( + self, + audio_config: Optional[Dict[str, Any]] = None, + text_config: Optional[Dict[str, Any]] = None, + audio_model_id: Optional[str] = None, + text_model_id: Optional[str] = None, + ignore_index: int = -100, + hidden_size: int = 4096, + stack_factor: int = 8, + norm_init: float = 0.4, + projector_act: str = "swiglu", + text_model_lora_config: Optional[LoraConfigSimplified] = None, + audio_model_lora_config: Optional[LoraConfigSimplified] = None, + audio_latency_block_size: Optional[int] = None, + **kwargs, + ): + self.ignore_index = ignore_index + + self.audio_model_id = audio_model_id + self.text_model_id = text_model_id + + self.hidden_size = hidden_size + self.stack_factor = stack_factor + self.norm_init = norm_init + self.projector_act = projector_act + + if text_model_id is not None: + self.text_config: transformers.LlamaConfig = ( + transformers.AutoConfig.from_pretrained(text_model_id) + ) + else: + text_config = text_config or {} + self.text_config = transformers.CONFIG_MAPPING[ + text_config.get("model_type", "llama") + ](**text_config) + + if audio_model_id is not None: + self.audio_config: transformers.PretrainedConfig = ( + transformers.AutoConfig.from_pretrained(audio_model_id) + ) + else: + audio_config = audio_config or {} + self.audio_config = transformers.CONFIG_MAPPING[ + audio_config.get("model_type", "wav2vec2") + ](**audio_config) + + self.text_model_lora_config = ( + text_model_lora_config + if isinstance(text_model_lora_config, dict) + else dataclasses.asdict(text_model_lora_config or LoraConfigSimplified()) + ) + self.audio_model_lora_config = ( + audio_model_lora_config + if isinstance(audio_model_lora_config, dict) + else dataclasses.asdict(audio_model_lora_config or LoraConfigSimplified()) + ) + self.audio_latency_block_size = audio_latency_block_size + + self.vocab_size = self.text_config.vocab_size + + self.initializer_range = self.text_config.initializer_range + + super().__init__(**kwargs) + + def to_diff_dict(self) -> Dict[str, Any]: + diff_dict = super().to_diff_dict() + + # remove text_config and audio_config if text_model_id and audio_model_id are present + if self.text_model_id is not None: + diff_dict.pop("text_config", None) + if self.audio_model_id is not None: + diff_dict.pop("audio_config", None) + + return diff_dict diff --git a/checkpoint-7200/ultravox_model.py b/checkpoint-7200/ultravox_model.py new file mode 100644 index 0000000000000000000000000000000000000000..ccea3e0ca174ec3119184d1f5a0f384c1cdbcae9 --- /dev/null +++ b/checkpoint-7200/ultravox_model.py @@ -0,0 +1,723 @@ +import logging +from typing import Any, Dict, Optional, Set, Tuple, Union + 
+import peft +import torch +import torch.nn as nn +import torch.nn.functional as F +import transformers +import transformers.activations +import transformers.modeling_outputs +import transformers.models +from transformers.models.whisper import modeling_whisper as whisper + +# We must use relative import in this directory to allow uploading to HF Hub +# Even "from . import X" pattern doesn't work (undocumented and unclear why) +from .ultravox_config import LossConfig +from .ultravox_config import LossFunction +from .ultravox_config import UltravoxConfig + + +class UltravoxModel(transformers.LlamaPreTrainedModel): + """ + The Ultravox model which consists of an audio encoder and a language model. + + Audio input is processed by the audio encoder, then every `stack_factor` frames are stacked together and + projected to the language model's embedding space using a few linear layers. + The text is embedded by the language model as usual and then the audio and text embeddings are merged together. + + A special token `<|audio|>` is used to indicate the start of the audio embeddings in the merged embeddings. + + Parameters: + config: Model configuration class with all the parameters of the model. + """ + + config_class = UltravoxConfig + config: UltravoxConfig # for type hinting + # Usually we load encoder and LLM weights from a pretrained model separately, so they are allowed to be missing + _keys_to_ignore_on_load_missing = ["audio_tower.*", "language_model.*"] + + def __init__(self, config: UltravoxConfig): + super().__init__(config) + self._register_load_state_dict_pre_hook(self._pre_load_state_dict_hook) + + self.keep_params: Set[str] = set() + self.vocab_size = config.vocab_size + + self.audio_tower = self._create_audio_tower(config) + self.multi_modal_projector = self._create_multi_modal_projector(config) + self.language_model = self._create_language_model(config) + + # Determine no_split_modules dynamically to use with FSDP auto_wrap policy. + # FSDP throws an error if some of the layer types are not found in the model. 
+ # This would be something like ["LlamaDecoderLayer", "WhisperEncoderLayer"] + self._no_split_modules = (self.language_model._no_split_modules or []) + ( + self.audio_tower._no_split_modules or [] + ) + + self.loss_config = LossConfig() + self.post_init() + + def get_input_embeddings(self): + return self.language_model.get_input_embeddings() + + def set_input_embeddings(self, value): + self.language_model.set_input_embeddings(value) + + def get_output_embeddings(self): + return self.language_model.get_output_embeddings() + + def set_output_embeddings(self, new_embeddings): + self.language_model.set_output_embeddings(new_embeddings) + + def set_decoder(self, decoder): + self.language_model.set_decoder(decoder) + + def get_decoder(self): + return self.language_model.get_decoder() + + def tie_weights(self): + return self.language_model.tie_weights() + + def set_loss_config(self, loss_config: LossConfig): + self.loss_config = loss_config + + def _setup_cache( + self, cache_cls, max_batch_size: int, max_cache_len: Optional[int] = None + ): + self.language_model._setup_cache(cache_cls, max_batch_size, max_cache_len) + + def _reorder_cache(self, past_key_values, beam_idx): + return self.language_model._reorder_cache(past_key_values, beam_idx) + + def resize_token_embeddings( + self, + new_num_tokens: Optional[int] = None, + pad_to_multiple_of: Optional[int] = None, + ) -> nn.Embedding: + model_embeds = self.language_model.resize_token_embeddings( + new_num_tokens, pad_to_multiple_of + ) + # update vocab size + self.config.text_config.vocab_size = model_embeds.num_embeddings + self.config.vocab_size = model_embeds.num_embeddings + self.vocab_size = model_embeds.num_embeddings + return model_embeds + + def _compute_kl_loss( + self, + lm_output: transformers.modeling_outputs.CausalLMOutputWithPast, + labels: Optional[torch.Tensor] = None, + past_key_values: Optional[Union[Tuple, transformers.cache_utils.Cache]] = None, + alt_input_ids: Optional[torch.Tensor] = None, + alt_attention_mask: Optional[torch.Tensor] = None, + alt_labels: Optional[torch.Tensor] = None, + **kwargs, + ): + # disable gradient computation for the teacher model + with torch.no_grad(): + # compute the teacher (text-only) model's distribution + alt_inputs_embeds = self.get_input_embeddings().forward(alt_input_ids) + alt_lm_output = self.language_model.forward( + inputs_embeds=alt_inputs_embeds, + labels=alt_labels, + attention_mask=alt_attention_mask, + past_key_values=past_key_values, + **kwargs, + ) + # compute the KL divergence loss between the two models + kl_loss = F.kl_div( + F.log_softmax( + lm_output.logits[labels != -100] / self.loss_config.kl_temperature, + dim=-1, + ), + F.softmax( + alt_lm_output.logits[alt_labels != -100] + / self.loss_config.kl_temperature, + dim=-1, + ), + reduction="batchmean", + ) + return {"loss": kl_loss} + + def forward( + self, + input_ids: torch.Tensor, + audio_values: Optional[torch.FloatTensor] = None, + inputs_embeds: Optional[torch.FloatTensor] = None, + labels: Optional[torch.Tensor] = None, + attention_mask: Optional[torch.Tensor] = None, + audio_token_start_idx: Optional[torch.Tensor] = None, + audio_len: Optional[torch.Tensor] = None, + audio_token_len: Optional[torch.Tensor] = None, + past_key_values: Optional[Union[Tuple, transformers.cache_utils.Cache]] = None, + # the alt_* fields are needed for KL divergence loss + alt_input_ids: Optional[torch.Tensor] = None, + alt_attention_mask: Optional[torch.Tensor] = None, + alt_labels: Optional[torch.Tensor] = None, + **kwargs, + ) -> 
Union[Tuple, transformers.modeling_outputs.CausalLMOutputWithPast]: + """ + Forward pass for the Ultravox model. + + `input_ids` are the tokenized text input. They are embedded by the language model as usual. + `audio_values` are processed by the audio encoder and then every `stack_factor` frames are stacked together and + projected to the language model's embedding space using a few linear layers. + The audio and text embeddings are merged together. A special token `<|audio|>` is used to indicate the start + of the audio embeddings in the merged embeddings. + + Args: + input_ids: The tokenized text input. + audio_values: The processed audio values. + inputs_embeds: The embeddings for the input tokens. + labels: The tokenized text labels. + attention_mask: The attention mask for the input. + position_ids: The position ids for the input. + past_key_values: The past key value cache for the language model attention layers. + **kwargs: Additional keyword arguments. Passed directly to the language model. + """ + if inputs_embeds is None: + # B x T -> B x T x D + inputs_embeds = self.get_input_embeddings().forward(input_ids) + + if audio_values is not None: + assert ( + audio_token_start_idx is not None and audio_token_len is not None + ), "audio_token_start_idx and audio_token_len must be provided if audio_values are provided." + assert ( + len(audio_token_start_idx) == len(audio_token_len) == len(audio_values) + ), "audio_token_start_idx, audio_token_len, and audio_values must have the same batch size." + + # B x A/3200 x D + audio_tower_output = self.audio_tower.forward( + audio_values.to(self.audio_tower.dtype), + audio_len=audio_len, + ).last_hidden_state + audio_tower_output = audio_tower_output.to(inputs_embeds.dtype) + + audio_embeds = self.multi_modal_projector.forward(audio_tower_output) + + # combine audio and text embeddings + for i, (audio, start, length) in enumerate( + zip(audio_embeds, audio_token_start_idx, audio_token_len) + ): + length = min(length, audio.shape[0]) + inputs_embeds[i, start : start + length] = audio[:length] + + lm_output = self.language_model.forward( + inputs_embeds=inputs_embeds, + labels=labels, + attention_mask=attention_mask, + past_key_values=past_key_values, + **kwargs, + ) + if self.training: + if self.loss_config.loss_function == LossFunction.CrossEntropy: + return lm_output + elif self.loss_config.loss_function == LossFunction.KL_Divergence: + return self._compute_kl_loss( + lm_output=lm_output, + labels=labels, + past_key_values=past_key_values, + alt_input_ids=alt_input_ids, + alt_attention_mask=alt_attention_mask, + alt_labels=alt_labels, + **kwargs, + ) + else: + raise ValueError( + f"Unsupported loss function: {self.loss_config.loss_function}" + ) + else: + return lm_output + + def prepare_inputs_for_generation( + self, + input_ids: torch.Tensor, + audio_values: Optional[torch.FloatTensor] = None, + audio_token_start_idx: Optional[torch.Tensor] = None, + audio_token_len: Optional[torch.Tensor] = None, + audio_len: Optional[torch.Tensor] = None, + past_key_values: Optional[Union[Tuple, transformers.cache_utils.Cache]] = None, + attention_mask: Optional[torch.Tensor] = None, + inputs_embeds: Optional[torch.Tensor] = None, + cache_position: Optional[torch.Tensor] = None, + **kwargs, + ) -> Dict[str, Any]: + model_input = self.language_model.prepare_inputs_for_generation( + input_ids=input_ids, + past_key_values=past_key_values, + attention_mask=attention_mask, + inputs_embeds=inputs_embeds, + cache_position=cache_position, + **kwargs, + ) + + # 
include audio information in model_input only when it is needed during prefilling + # audio_token_start_idx should always be relative to the current cache position + prefill_start_idx = 0 if cache_position is None else cache_position[0] + if ( + audio_values is not None + and audio_token_start_idx is not None + and prefill_start_idx <= torch.max(audio_token_start_idx) + ): + model_input["audio_values"] = audio_values + model_input["audio_token_start_idx"] = ( + audio_token_start_idx - prefill_start_idx + ) + model_input["audio_token_len"] = audio_token_len + model_input["audio_len"] = audio_len + + return model_input + + @classmethod + def _create_multi_modal_projector( + cls, config: UltravoxConfig + ) -> "UltravoxProjector": + projector = UltravoxProjector(config) + projector.to(config.torch_dtype) + return projector + + @classmethod + def _create_audio_tower( + cls, config: UltravoxConfig + ) -> Union[transformers.Wav2Vec2Model, "ModifiedWhisperEncoder"]: + if config.audio_model_id is not None: + if "whisper" in config.audio_model_id: + audio_tower = ModifiedWhisperEncoder.from_pretrained( + config.audio_model_id, torch_dtype=config.torch_dtype + ) + audio_tower.init_latency_mask( + config.audio_latency_block_size, dtype=config.torch_dtype + ) + else: + assert config.audio_latency_block_size in ( + None, + 0, + ), "only whisper audio tower supports audio latency masking, got non-zero value for 'audio_latency_block_size'" + audio_tower = transformers.AutoModel.from_pretrained( + config.audio_model_id, torch_dtype=config.torch_dtype + ) + else: + if "whisper" in config.audio_config._name_or_path: + audio_tower = ModifiedWhisperEncoder(config.audio_config) + audio_tower.init_latency_mask( + config.audio_latency_block_size, dtype=config.torch_dtype + ) + else: + assert config.audio_latency_block_size in ( + None, + 0, + ), "only whisper audio tower supports audio latency masking, got non-zero value for 'audio_latency_block_size'" + with transformers.modeling_utils.no_init_weights(): + # we only ever use from_config if the weights are retrained, hence initializing is not + # required. This makes model creation quite a bit faster since init on CPU is quite slow. + audio_tower = transformers.AutoModel.from_config( + config.audio_config + ) + + if isinstance( + audio_tower, + (transformers.Wav2Vec2BertModel, transformers.WhisperModel), + ): + # For these models we only need the encoder part + # Wav2Vec2BertModel -> Wav2Vec2BertEncoder + # WhisperModel -> WhisperEncoder + audio_tower = audio_tower.encoder + + audio_tower = apply_lora(audio_tower, config.audio_model_lora_config) + return audio_tower + + @classmethod + def _create_language_model( + cls, config: UltravoxConfig + ) -> transformers.LlamaForCausalLM: + if config.text_model_id is not None: + language_model = transformers.AutoModelForCausalLM.from_pretrained( + config.text_model_id, + attn_implementation=config._attn_implementation, + torch_dtype=config.torch_dtype, + ) + else: + with transformers.modeling_utils.no_init_weights(): + # we only ever use from_config if the weights are retrained, hence initializing is not + # required. This makes model creation quite a bit faster since init on CPU is quite slow.
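+ # (The real weights are loaded later from the Ultravox checkpoint's state dict, + # hence "language_model.*" in _keys_to_ignore_on_load_missing above.)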
+ language_model = transformers.AutoModelForCausalLM.from_config( + config.text_config, + attn_implementation=config._attn_implementation, + torch_dtype=config.torch_dtype, + ) + + language_model = apply_lora(language_model, config.text_model_lora_config) + return language_model + + def merge_and_unload(self): + if isinstance(self.language_model, peft.PeftModel): + self.language_model = self.language_model.merge_and_unload() + # no need to download base language model weights anymore, so we can remove the id + self.config.text_model_id = None + self.keep_params.update( + set( + [ + f"language_model.{name}" + for name, _ in self.language_model.named_parameters() + ] + ) + ) + + if isinstance(self.audio_tower, peft.PeftModel): + self.audio_tower = self.audio_tower.merge_and_unload() + # no need to download base audio model weights anymore, so we can remove the id + self.config.audio_model_id = None + self.keep_params.update( + set( + [ + f"audio_tower.{name}" + for name, _ in self.audio_tower.named_parameters() + ] + ) + ) + + for param in ["text_model_lora_config", "audio_model_lora_config"]: + if hasattr(self.config, param): + delattr(self.config, param) + + def push_to_hub(self, *args, **kwargs): + self.merge_and_unload() + self.to(self.language_model.dtype) + return super().push_to_hub(*args, **kwargs) + + def save_pretrained( + self, *args, state_dict: Optional[Dict[str, Any]] = None, **kwargs + ): + if state_dict is None: + state_dict = super().state_dict() + + named_params = dict(self.named_parameters()) + + state_dict = { + k: v + for k, v in state_dict.items() + if k in self.keep_params + or (k in named_params and named_params[k].requires_grad) + } + + super().save_pretrained(*args, state_dict=state_dict, **kwargs) + + def _pre_load_state_dict_hook(self, state_dict: Dict[str, Any], *args, **kwargs): + self.keep_params.update(set(state_dict.keys())) + + def print_trainable_parameters(self): + """ + Prints the number of trainable parameters in the model (reuses Peft model's method) + """ + count_params = peft.peft_model.PeftModel.get_nb_trainable_parameters + + trainable_params, all_param = count_params(self) + + logging.info( + f"trainable params: {trainable_params:,d} || all params: {all_param:,d}" + f" || trainable%: {100 * trainable_params / all_param:.1f}%" + ) + + lm_trainable_params, lm_all_params = count_params(self.language_model) + audio_trainable_params, audio_all_params = count_params(self.audio_tower) + + projector_trainable_params = ( + trainable_params - lm_trainable_params - audio_trainable_params + ) + projector_all_params = all_param - lm_all_params - audio_all_params + + logging.info( + f"Trainable%: " + f" LLM: {100 * lm_trainable_params / lm_all_params:.1f}%" + f" || Audio Encoder: {100 * audio_trainable_params / audio_all_params:.1f}%" + f" || Projector: {100 * projector_trainable_params / projector_all_params:.1f}%" + ) + + +def is_cache_empty( + past_key_values: Optional[Union[Tuple, transformers.cache_utils.Cache]] +) -> bool: + """ + Check if the cache is empty. + """ + if past_key_values is None: + return True + if isinstance(past_key_values, tuple): + return all(len(c) == 0 for c in past_key_values) + return past_key_values.get_seq_length() == 0 + + +def apply_lora(model: torch.nn.Module, lora_config: dict) -> torch.nn.Module: + """ + Applies LoRA finetuning to the model. If the `r` parameter is set to 0, the model is frozen instead. 
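+ With `r == 0` (the value this repo's config.json uses for both the audio and text towers),
+ no LoRA adapters are attached and the module's parameters are simply frozen.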
+ """ + lora_config = peft.LoraConfig(**lora_config or {}) + + if lora_config.r == 0: + # freeze the model entirely + for param in model.parameters(): + param.requires_grad = False + else: + model = peft.get_peft_model(model, lora_config) + + return model + + +class StackAudioFrames(nn.Module): + """ + Stack the audio embedding frames to reduce the sequence length by a factor of `stack_factor`. + + The number of output frames will be `ceil(T / stack_factor) + 1` where `T` is the number of input frames. + NOTE: the extra +1 is intentional: in case the number of audio tokens are over-estimated by the processor, + we want to make sure `processor.audio_token_replacement` (i.e. EOS) doesn't get leaked into the middle of embeddings. + In most cases this extra padding will get removed in the model's forward function so it has no effect. + """ + + def __init__(self, stack_factor: int = 8): + super().__init__() + self.stack_factor = stack_factor + + def forward(self, audio_embeds: torch.Tensor) -> torch.Tensor: + B, T, C = audio_embeds.shape + T_pad = (T + self.stack_factor - 1) // self.stack_factor * self.stack_factor + audio_embeds = F.pad(audio_embeds, (0, 0, 0, T_pad - T + self.stack_factor)) + B, T, C = audio_embeds.shape + audio_embeds = audio_embeds.view( + B, T // self.stack_factor, C * self.stack_factor + ) + return audio_embeds + + +class RMSNorm(transformers.models.llama.modeling_llama.LlamaRMSNorm): + def __init__(self, hidden_size: int, init: float = 1, eps: float = 1e-6): + super().__init__(hidden_size=hidden_size, eps=eps) + self.weight.data.fill_(init) + + +class SwiGLU(nn.Module): + def forward(self, x): + x, gate = x.chunk(2, dim=-1) + return F.silu(gate) * x + + +class UltravoxProjector(nn.Sequential): + def __init__(self, config: UltravoxConfig): + super().__init__() + self.hidden_dim = config.hidden_size + self._pad_and_stack = StackAudioFrames(config.stack_factor) + dim = config.audio_config.hidden_size * config.stack_factor + self.ln_pre = RMSNorm(dim, init=config.norm_init) + self.linear_1 = nn.Linear(dim, self.hidden_dim, bias=False) + dim = self.hidden_dim + self.act = transformers.activations.get_activation(config.projector_act) + dim = dim // 2 if config.projector_act == "swiglu" else dim + self.linear_2 = nn.Linear(dim, config.text_config.hidden_size, bias=False) + self.ln_post = RMSNorm(config.text_config.hidden_size, init=config.norm_init) + + def forward(self, audio_features: torch.Tensor) -> torch.Tensor: + audio_features = self._pad_and_stack(audio_features) + audio_features = self.ln_pre(audio_features) + hidden_states = self.linear_1(audio_features) + hidden_states = self.act(hidden_states) + hidden_states = self.linear_2(hidden_states) + hidden_states = self.ln_post(hidden_states) + return hidden_states + + +class ModifiedWhisperEncoder( + whisper.WhisperEncoder, transformers.modeling_utils.ModuleUtilsMixin +): + """ + Encoder portion of OpenAI's Whisper model. + + This implementation is a slightly modified version of HF Transformers' Whisper Encoder, with only a few fixes: + 1. base_model_prefix updated to allow for doing `.from_pretrained` directly on the encoder + 2. 
allow less than 30 seconds of audio padding to be passed in: + - relaxed ValueError check for `input_features` length to be less than or equal to `expected_seq_length` instead of strictly equal + - embed_pos is now sliced to match the length of `inputs_embeds` + + Original: https://github.com/huggingface/transformers/blob/main/src/transformers/models/whisper/modeling_whisper.py + """ + + base_model_prefix = "model.encoder" + _no_split_modules = ["WhisperEncoderLayer"] + + def init_latency_mask(self, audio_latency_block_size: int, dtype: torch.dtype): + if audio_latency_block_size is None: + self.audio_streaming_mask = None + return + + # maximum sequence length + max_seqlen = ( + self.config.max_source_positions + * self.conv1.stride[0] + * self.conv2.stride[0] + ) + assert ( + max_seqlen > 0 + ), f"maximum sequence length must be positive, got {max_seqlen}" + assert ( + max_seqlen % audio_latency_block_size == 0 + ), f"audio_latency_block_size {audio_latency_block_size} must divide {max_seqlen} evenly." + # Given the block size, we calculate the number of blocks. + audio_latency_nblocks = max_seqlen // audio_latency_block_size + audio_streaming_mask = ( + torch.tril( + torch.ones(audio_latency_nblocks, audio_latency_nblocks), + diagonal=0, + ) + .repeat_interleave(audio_latency_block_size, dim=0) + .repeat_interleave(audio_latency_block_size, dim=1) + ) + audio_streaming_mask = (1.0 - audio_streaming_mask) * torch.finfo(dtype).min + audio_streaming_mask = audio_streaming_mask[None, None, :, :] + self.register_buffer( + "audio_streaming_mask", audio_streaming_mask, persistent=False + ) + + def forward( + self, + input_features, + audio_len=None, + head_mask=None, + output_attentions=None, + output_hidden_states=None, + return_dict=None, + ): + expected_seq_length = ( + self.config.max_source_positions + * self.conv1.stride[0] + * self.conv2.stride[0] + ) + if input_features.shape[-1] > expected_seq_length: + raise ValueError( + f"Whisper expects the mel input features to be of length {expected_seq_length} or less, but found {input_features.shape[-1]}. Make sure to pad the input mel features to {expected_seq_length}." 
+ ) + + output_attentions = ( + output_attentions + if output_attentions is not None + else self.config.output_attentions + ) + output_hidden_states = ( + output_hidden_states + if output_hidden_states is not None + else self.config.output_hidden_states + ) + return_dict = ( + return_dict if return_dict is not None else self.config.use_return_dict + ) + inputs_embeds = nn.functional.gelu(self.conv1(input_features)) + inputs_embeds = nn.functional.gelu(self.conv2(inputs_embeds)) + + inputs_embeds = inputs_embeds.permute(0, 2, 1) + embed_pos = self.embed_positions.weight[: inputs_embeds.size(-2)] + + hidden_states = inputs_embeds + embed_pos + hidden_states = nn.functional.dropout( + hidden_states, p=self.dropout, training=self.training + ) + + encoder_states = () if output_hidden_states else None + all_attentions = () if output_attentions else None + + # Create attention mask based on audio lengths to mask out padding tokens + # For each sample in batch: + # - Convert raw audio length to feature length after convolutions + # - Create boolean mask that is True for valid positions and False for padding + # - Convert to extended attention mask format expected by transformer layers + # (0.0 for positions to attend to, large negative for positions to ignore) + # This masking ensures consistent behavior between training and inference + # by preventing the model from attending to padding tokens in both cases + attention_mask = None + if audio_len is not None: + audio_feature_len = self._get_feat_extract_output_lengths(audio_len) + max_seq_len = hidden_states.shape[1] + attention_mask = torch.arange(max_seq_len, device=hidden_states.device)[ + None, : + ].lt(audio_feature_len.view(-1, 1)) + attention_mask = self.get_extended_attention_mask( + attention_mask, + None, + device=hidden_states.device, + dtype=hidden_states.dtype, + ) + + if self.audio_streaming_mask is not None: + seqlen = hidden_states.size(-2) + if attention_mask is not None: + attention_mask = torch.minimum( + self.audio_streaming_mask[:, :, :seqlen, :seqlen], attention_mask + ) # merge + else: + attention_mask = self.audio_streaming_mask[:, :, :seqlen, :seqlen] + attention_mask = attention_mask.to(hidden_states.dtype) + + # check if head_mask has a correct number of layers specified if desired + if head_mask is not None: + assert head_mask.size()[0] == ( + len(self.layers) + ), f"The head_mask should be specified for {len(self.layers)} layers, but it is for {head_mask.size()[0]}." 
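+ # The loop below follows the stock HF WhisperEncoder: each WhisperEncoderLayer receives the
+ # combined padding/streaming attention mask built above, LayerDrop may skip whole layers
+ # during training, and gradient checkpointing is used when enabled.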
+ + for idx, encoder_layer in enumerate(self.layers): + if output_hidden_states: + encoder_states = encoder_states + (hidden_states,) + # add LayerDrop (see https://arxiv.org/abs/1909.11556 for description) + to_drop = False + if self.training: + dropout_probability = torch.rand([]) + if dropout_probability < self.layerdrop: # skip the layer + to_drop = True + + if to_drop: + layer_outputs = (None, None) + else: + if self.gradient_checkpointing and self.training: + layer_outputs = self._gradient_checkpointing_func( + encoder_layer.__call__, + hidden_states, + attention_mask, + (head_mask[idx] if head_mask is not None else None), + output_attentions, + ) + else: + layer_outputs = encoder_layer( + hidden_states, + attention_mask, + layer_head_mask=( + head_mask[idx] if head_mask is not None else None + ), + output_attentions=output_attentions, + ) + + hidden_states = layer_outputs[0] + + if output_attentions: + all_attentions = all_attentions + (layer_outputs[1],) + + hidden_states = self.layer_norm(hidden_states) + if output_hidden_states: + encoder_states = encoder_states + (hidden_states,) + + if not return_dict: + return tuple( + v + for v in [hidden_states, encoder_states, all_attentions] + if v is not None + ) + return transformers.modeling_outputs.BaseModelOutput( + last_hidden_state=hidden_states, + hidden_states=encoder_states, + attentions=all_attentions, + ) + + +UltravoxConfig.register_for_auto_class() +UltravoxModel.register_for_auto_class() + +transformers.AutoConfig.register("ultravox", UltravoxConfig) +transformers.AutoModel.register(UltravoxConfig, UltravoxModel) + +transformers.activations.ACT2FN["swiglu"] = SwiGLU diff --git a/config.json b/config.json new file mode 100644 index 0000000000000000000000000000000000000000..d73356d28c6aaa69d1d906ea6fe6983073c613eb --- /dev/null +++ b/config.json @@ -0,0 +1,53 @@ +{ + "architectures": [ + "UltravoxModel" + ], + "audio_latency_block_size": null, + "audio_model_id": "openai/whisper-large-v3-turbo", + "audio_model_lora_config": { + "lora_alpha": 8, + "r": 0, + "target_modules": [ + "k_proj", + "q_proj", + "linear_k", + "linear_q" + ] + }, + "auto_map": { + "AutoConfig": "ultravox_config.UltravoxConfig", + "AutoModel": "ultravox_model.UltravoxModel" + }, + "custom_pipelines": { + "ultravox-pipeline": { + "impl": "ultravox_pipeline.UltravoxPipeline", + "pt": [ + "AutoModel" + ], + "tf": [], + "type": "multimodal" + } + }, + "hidden_size": 4096, + "ignore_index": -100, + "initializer_range": 0.02, + "model_type": "ultravox", + "norm_init": 0.4, + "pad_token_id": 128009, + "projector_act": "swiglu", + "stack_factor": 8, + "text_model_id": "meta-llama/Llama-3.2-1B-Instruct", + "text_model_lora_config": { + "lora_alpha": 8, + "r": 0, + "target_modules": [ + "k_proj", + "q_proj", + "linear_k", + "linear_q" + ] + }, + "torch_dtype": "bfloat16", + "transformers_version": "4.47.0", + "vocab_size": 128256 +} diff --git a/generation_config.json b/generation_config.json new file mode 100644 index 0000000000000000000000000000000000000000..4dac817850f65a6be4d01d824462c9fe54468763 --- /dev/null +++ b/generation_config.json @@ -0,0 +1,11 @@ +{ + "_from_model_config": true, + "bos_token_id": 128000, + "eos_token_id": [ + 128001, + 128008, + 128009 + ], + "pad_token_id": 128009, + "transformers_version": "4.47.0" +} diff --git a/logs/events.out.tfevents.1738860874.r-reach-vb-benchmarks-3beyuy7k-f14cd-vqxc5.14234.0 b/logs/events.out.tfevents.1738860874.r-reach-vb-benchmarks-3beyuy7k-f14cd-vqxc5.14234.0 new file mode 100644 index 
0000000000000000000000000000000000000000..a42cef7438c79672ea8cd278f90adc734ad77afe --- /dev/null +++ b/logs/events.out.tfevents.1738860874.r-reach-vb-benchmarks-3beyuy7k-f14cd-vqxc5.14234.0 @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:3d135a7174236a0c68309bfed98aa1aa2aaf205187db070abb1937f665de35c7 +size 56000 diff --git a/model.safetensors b/model.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..8a76fff3f8e4d9abf6f493159379ea05b1446d90 --- /dev/null +++ b/model.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:46fb828f3d8af8e0688d12814507e753d1b1b539be4d4fc1c4354c929de48237 +size 92299736 diff --git a/special_tokens_map.json b/special_tokens_map.json new file mode 100644 index 0000000000000000000000000000000000000000..b43be96621d147110fb8a18b5776ec6e38516127 --- /dev/null +++ b/special_tokens_map.json @@ -0,0 +1,17 @@ +{ + "bos_token": { + "content": "<|begin_of_text|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false + }, + "eos_token": { + "content": "<|eot_id|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false + }, + "pad_token": "<|eot_id|>" +} diff --git a/tokenizer.json b/tokenizer.json new file mode 100644 index 0000000000000000000000000000000000000000..1c1d8d5c9024994f1d3b00f9662b8dd89ca13cf2 --- /dev/null +++ b/tokenizer.json @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:6b9e4e7fb171f92fd137b777cc2714bf87d11576700a1dcd7a399e7bbe39537b +size 17209920 diff --git a/tokenizer_config.json b/tokenizer_config.json new file mode 100644 index 0000000000000000000000000000000000000000..fb0e8efdce308a03b5350019cf1f24fa4375396f --- /dev/null +++ b/tokenizer_config.json @@ -0,0 +1,2064 @@ +{ + "added_tokens_decoder": { + "128000": { + "content": "<|begin_of_text|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128001": { + "content": "<|end_of_text|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128002": { + "content": "<|reserved_special_token_0|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128003": { + "content": "<|reserved_special_token_1|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128004": { + "content": "<|finetune_right_pad_id|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128005": { + "content": "<|reserved_special_token_2|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128006": { + "content": "<|start_header_id|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128007": { + "content": "<|end_header_id|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128008": { + "content": "<|eom_id|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128009": { + "content": "<|eot_id|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128010": { + "content": "<|python_tag|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + 
"special": true + }, + "128011": { + "content": "<|reserved_special_token_3|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128012": { + "content": "<|reserved_special_token_4|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128013": { + "content": "<|reserved_special_token_5|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128014": { + "content": "<|reserved_special_token_6|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128015": { + "content": "<|reserved_special_token_7|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128016": { + "content": "<|reserved_special_token_8|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128017": { + "content": "<|reserved_special_token_9|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128018": { + "content": "<|reserved_special_token_10|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128019": { + "content": "<|reserved_special_token_11|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128020": { + "content": "<|reserved_special_token_12|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128021": { + "content": "<|reserved_special_token_13|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128022": { + "content": "<|reserved_special_token_14|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128023": { + "content": "<|reserved_special_token_15|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128024": { + "content": "<|reserved_special_token_16|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128025": { + "content": "<|reserved_special_token_17|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128026": { + "content": "<|reserved_special_token_18|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128027": { + "content": "<|reserved_special_token_19|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128028": { + "content": "<|reserved_special_token_20|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128029": { + "content": "<|reserved_special_token_21|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128030": { + "content": "<|reserved_special_token_22|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128031": { + "content": "<|reserved_special_token_23|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128032": 
{ + "content": "<|reserved_special_token_24|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128033": { + "content": "<|reserved_special_token_25|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128034": { + "content": "<|reserved_special_token_26|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128035": { + "content": "<|reserved_special_token_27|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128036": { + "content": "<|reserved_special_token_28|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128037": { + "content": "<|reserved_special_token_29|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128038": { + "content": "<|reserved_special_token_30|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128039": { + "content": "<|reserved_special_token_31|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128040": { + "content": "<|reserved_special_token_32|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128041": { + "content": "<|reserved_special_token_33|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128042": { + "content": "<|reserved_special_token_34|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128043": { + "content": "<|reserved_special_token_35|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128044": { + "content": "<|reserved_special_token_36|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128045": { + "content": "<|reserved_special_token_37|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128046": { + "content": "<|reserved_special_token_38|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128047": { + "content": "<|reserved_special_token_39|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128048": { + "content": "<|reserved_special_token_40|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128049": { + "content": "<|reserved_special_token_41|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128050": { + "content": "<|reserved_special_token_42|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128051": { + "content": "<|reserved_special_token_43|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128052": { + "content": "<|reserved_special_token_44|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128053": { + "content": 
"<|reserved_special_token_45|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128054": { + "content": "<|reserved_special_token_46|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128055": { + "content": "<|reserved_special_token_47|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128056": { + "content": "<|reserved_special_token_48|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128057": { + "content": "<|reserved_special_token_49|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128058": { + "content": "<|reserved_special_token_50|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128059": { + "content": "<|reserved_special_token_51|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128060": { + "content": "<|reserved_special_token_52|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128061": { + "content": "<|reserved_special_token_53|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128062": { + "content": "<|reserved_special_token_54|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128063": { + "content": "<|reserved_special_token_55|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128064": { + "content": "<|reserved_special_token_56|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128065": { + "content": "<|reserved_special_token_57|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128066": { + "content": "<|reserved_special_token_58|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128067": { + "content": "<|reserved_special_token_59|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128068": { + "content": "<|reserved_special_token_60|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128069": { + "content": "<|reserved_special_token_61|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128070": { + "content": "<|reserved_special_token_62|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128071": { + "content": "<|reserved_special_token_63|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128072": { + "content": "<|reserved_special_token_64|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128073": { + "content": "<|reserved_special_token_65|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128074": { + "content": 
"<|reserved_special_token_66|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128075": { + "content": "<|reserved_special_token_67|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128076": { + "content": "<|reserved_special_token_68|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128077": { + "content": "<|reserved_special_token_69|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128078": { + "content": "<|reserved_special_token_70|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128079": { + "content": "<|reserved_special_token_71|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128080": { + "content": "<|reserved_special_token_72|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128081": { + "content": "<|reserved_special_token_73|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128082": { + "content": "<|reserved_special_token_74|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128083": { + "content": "<|reserved_special_token_75|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128084": { + "content": "<|reserved_special_token_76|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128085": { + "content": "<|reserved_special_token_77|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128086": { + "content": "<|reserved_special_token_78|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128087": { + "content": "<|reserved_special_token_79|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128088": { + "content": "<|reserved_special_token_80|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128089": { + "content": "<|reserved_special_token_81|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128090": { + "content": "<|reserved_special_token_82|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128091": { + "content": "<|reserved_special_token_83|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128092": { + "content": "<|reserved_special_token_84|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128093": { + "content": "<|reserved_special_token_85|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128094": { + "content": "<|reserved_special_token_86|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128095": { + "content": 
"<|reserved_special_token_87|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128096": { + "content": "<|reserved_special_token_88|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128097": { + "content": "<|reserved_special_token_89|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128098": { + "content": "<|reserved_special_token_90|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128099": { + "content": "<|reserved_special_token_91|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128100": { + "content": "<|reserved_special_token_92|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128101": { + "content": "<|reserved_special_token_93|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128102": { + "content": "<|reserved_special_token_94|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128103": { + "content": "<|reserved_special_token_95|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128104": { + "content": "<|reserved_special_token_96|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128105": { + "content": "<|reserved_special_token_97|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128106": { + "content": "<|reserved_special_token_98|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128107": { + "content": "<|reserved_special_token_99|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128108": { + "content": "<|reserved_special_token_100|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128109": { + "content": "<|reserved_special_token_101|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128110": { + "content": "<|reserved_special_token_102|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128111": { + "content": "<|reserved_special_token_103|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128112": { + "content": "<|reserved_special_token_104|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128113": { + "content": "<|reserved_special_token_105|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128114": { + "content": "<|reserved_special_token_106|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128115": { + "content": "<|reserved_special_token_107|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128116": { + "content": 
"<|reserved_special_token_108|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128117": { + "content": "<|reserved_special_token_109|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128118": { + "content": "<|reserved_special_token_110|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128119": { + "content": "<|reserved_special_token_111|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128120": { + "content": "<|reserved_special_token_112|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128121": { + "content": "<|reserved_special_token_113|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128122": { + "content": "<|reserved_special_token_114|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128123": { + "content": "<|reserved_special_token_115|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128124": { + "content": "<|reserved_special_token_116|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128125": { + "content": "<|reserved_special_token_117|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128126": { + "content": "<|reserved_special_token_118|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128127": { + "content": "<|reserved_special_token_119|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128128": { + "content": "<|reserved_special_token_120|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128129": { + "content": "<|reserved_special_token_121|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128130": { + "content": "<|reserved_special_token_122|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128131": { + "content": "<|reserved_special_token_123|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128132": { + "content": "<|reserved_special_token_124|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128133": { + "content": "<|reserved_special_token_125|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128134": { + "content": "<|reserved_special_token_126|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128135": { + "content": "<|reserved_special_token_127|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128136": { + "content": "<|reserved_special_token_128|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128137": { + "content": 
"<|reserved_special_token_129|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128138": { + "content": "<|reserved_special_token_130|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128139": { + "content": "<|reserved_special_token_131|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128140": { + "content": "<|reserved_special_token_132|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128141": { + "content": "<|reserved_special_token_133|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128142": { + "content": "<|reserved_special_token_134|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128143": { + "content": "<|reserved_special_token_135|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128144": { + "content": "<|reserved_special_token_136|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128145": { + "content": "<|reserved_special_token_137|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128146": { + "content": "<|reserved_special_token_138|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128147": { + "content": "<|reserved_special_token_139|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128148": { + "content": "<|reserved_special_token_140|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128149": { + "content": "<|reserved_special_token_141|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128150": { + "content": "<|reserved_special_token_142|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128151": { + "content": "<|reserved_special_token_143|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128152": { + "content": "<|reserved_special_token_144|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128153": { + "content": "<|reserved_special_token_145|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128154": { + "content": "<|reserved_special_token_146|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128155": { + "content": "<|reserved_special_token_147|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128156": { + "content": "<|reserved_special_token_148|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128157": { + "content": "<|reserved_special_token_149|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128158": { + "content": 
"<|reserved_special_token_150|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128159": { + "content": "<|reserved_special_token_151|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128160": { + "content": "<|reserved_special_token_152|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128161": { + "content": "<|reserved_special_token_153|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128162": { + "content": "<|reserved_special_token_154|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128163": { + "content": "<|reserved_special_token_155|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128164": { + "content": "<|reserved_special_token_156|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128165": { + "content": "<|reserved_special_token_157|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128166": { + "content": "<|reserved_special_token_158|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128167": { + "content": "<|reserved_special_token_159|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128168": { + "content": "<|reserved_special_token_160|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128169": { + "content": "<|reserved_special_token_161|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128170": { + "content": "<|reserved_special_token_162|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128171": { + "content": "<|reserved_special_token_163|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128172": { + "content": "<|reserved_special_token_164|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128173": { + "content": "<|reserved_special_token_165|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128174": { + "content": "<|reserved_special_token_166|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128175": { + "content": "<|reserved_special_token_167|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128176": { + "content": "<|reserved_special_token_168|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128177": { + "content": "<|reserved_special_token_169|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128178": { + "content": "<|reserved_special_token_170|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128179": { + "content": 
"<|reserved_special_token_171|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128180": { + "content": "<|reserved_special_token_172|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128181": { + "content": "<|reserved_special_token_173|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128182": { + "content": "<|reserved_special_token_174|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128183": { + "content": "<|reserved_special_token_175|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128184": { + "content": "<|reserved_special_token_176|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128185": { + "content": "<|reserved_special_token_177|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128186": { + "content": "<|reserved_special_token_178|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128187": { + "content": "<|reserved_special_token_179|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128188": { + "content": "<|reserved_special_token_180|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128189": { + "content": "<|reserved_special_token_181|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128190": { + "content": "<|reserved_special_token_182|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128191": { + "content": "<|reserved_special_token_183|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128192": { + "content": "<|reserved_special_token_184|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128193": { + "content": "<|reserved_special_token_185|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128194": { + "content": "<|reserved_special_token_186|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128195": { + "content": "<|reserved_special_token_187|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128196": { + "content": "<|reserved_special_token_188|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128197": { + "content": "<|reserved_special_token_189|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128198": { + "content": "<|reserved_special_token_190|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128199": { + "content": "<|reserved_special_token_191|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128200": { + "content": 
"<|reserved_special_token_192|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128201": { + "content": "<|reserved_special_token_193|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128202": { + "content": "<|reserved_special_token_194|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128203": { + "content": "<|reserved_special_token_195|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128204": { + "content": "<|reserved_special_token_196|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128205": { + "content": "<|reserved_special_token_197|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128206": { + "content": "<|reserved_special_token_198|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128207": { + "content": "<|reserved_special_token_199|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128208": { + "content": "<|reserved_special_token_200|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128209": { + "content": "<|reserved_special_token_201|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128210": { + "content": "<|reserved_special_token_202|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128211": { + "content": "<|reserved_special_token_203|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128212": { + "content": "<|reserved_special_token_204|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128213": { + "content": "<|reserved_special_token_205|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128214": { + "content": "<|reserved_special_token_206|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128215": { + "content": "<|reserved_special_token_207|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128216": { + "content": "<|reserved_special_token_208|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128217": { + "content": "<|reserved_special_token_209|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128218": { + "content": "<|reserved_special_token_210|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128219": { + "content": "<|reserved_special_token_211|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128220": { + "content": "<|reserved_special_token_212|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128221": { + "content": 
"<|reserved_special_token_213|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128222": { + "content": "<|reserved_special_token_214|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128223": { + "content": "<|reserved_special_token_215|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128224": { + "content": "<|reserved_special_token_216|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128225": { + "content": "<|reserved_special_token_217|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128226": { + "content": "<|reserved_special_token_218|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128227": { + "content": "<|reserved_special_token_219|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128228": { + "content": "<|reserved_special_token_220|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128229": { + "content": "<|reserved_special_token_221|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128230": { + "content": "<|reserved_special_token_222|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128231": { + "content": "<|reserved_special_token_223|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128232": { + "content": "<|reserved_special_token_224|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128233": { + "content": "<|reserved_special_token_225|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128234": { + "content": "<|reserved_special_token_226|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128235": { + "content": "<|reserved_special_token_227|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128236": { + "content": "<|reserved_special_token_228|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128237": { + "content": "<|reserved_special_token_229|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128238": { + "content": "<|reserved_special_token_230|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128239": { + "content": "<|reserved_special_token_231|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128240": { + "content": "<|reserved_special_token_232|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128241": { + "content": "<|reserved_special_token_233|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128242": { + "content": 
"<|reserved_special_token_234|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128243": { + "content": "<|reserved_special_token_235|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128244": { + "content": "<|reserved_special_token_236|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128245": { + "content": "<|reserved_special_token_237|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128246": { + "content": "<|reserved_special_token_238|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128247": { + "content": "<|reserved_special_token_239|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128248": { + "content": "<|reserved_special_token_240|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128249": { + "content": "<|reserved_special_token_241|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128250": { + "content": "<|reserved_special_token_242|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128251": { + "content": "<|reserved_special_token_243|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128252": { + "content": "<|reserved_special_token_244|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128253": { + "content": "<|reserved_special_token_245|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128254": { + "content": "<|reserved_special_token_246|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128255": { + "content": "<|reserved_special_token_247|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + } + }, + "bos_token": "<|begin_of_text|>", + "chat_template": "{{- bos_token }}\n{%- if custom_tools is defined %}\n {%- set tools = custom_tools %}\n{%- endif %}\n{%- if not tools_in_user_message is defined %}\n {%- set tools_in_user_message = true %}\n{%- endif %}\n{%- if not date_string is defined %}\n {%- if strftime_now is defined %}\n {%- set date_string = strftime_now(\"%d %b %Y\") %}\n {%- else %}\n {%- set date_string = \"26 Jul 2024\" %}\n {%- endif %}\n{%- endif %}\n{%- if not tools is defined %}\n {%- set tools = none %}\n{%- endif %}\n\n{#- This block extracts the system message, so we can slot it into the right place. 
#}\n{%- if messages[0]['role'] == 'system' %}\n {%- set system_message = messages[0]['content']|trim %}\n {%- set messages = messages[1:] %}\n{%- else %}\n {%- set system_message = \"\" %}\n{%- endif %}\n\n{#- System message #}\n{{- \"<|start_header_id|>system<|end_header_id|>\\n\\n\" }}\n{%- if tools is not none %}\n {{- \"Environment: ipython\\n\" }}\n{%- endif %}\n{{- \"Cutting Knowledge Date: December 2023\\n\" }}\n{{- \"Today Date: \" + date_string + \"\\n\\n\" }}\n{%- if tools is not none and not tools_in_user_message %}\n {{- \"You have access to the following functions. To call a function, please respond with JSON for a function call.\" }}\n {{- 'Respond in the format {\"name\": function name, \"parameters\": dictionary of argument name and its value}.' }}\n {{- \"Do not use variables.\\n\\n\" }}\n {%- for t in tools %}\n {{- t | tojson(indent=4) }}\n {{- \"\\n\\n\" }}\n {%- endfor %}\n{%- endif %}\n{{- system_message }}\n{{- \"<|eot_id|>\" }}\n\n{#- Custom tools are passed in a user message with some extra guidance #}\n{%- if tools_in_user_message and not tools is none %}\n {#- Extract the first user message so we can plug it in here #}\n {%- if messages | length != 0 %}\n {%- set first_user_message = messages[0]['content']|trim %}\n {%- set messages = messages[1:] %}\n {%- else %}\n {{- raise_exception(\"Cannot put tools in the first user message when there's no first user message!\") }}\n{%- endif %}\n {{- '<|start_header_id|>user<|end_header_id|>\\n\\n' -}}\n {{- \"Given the following functions, please respond with a JSON for a function call \" }}\n {{- \"with its proper arguments that best answers the given prompt.\\n\\n\" }}\n {{- 'Respond in the format {\"name\": function name, \"parameters\": dictionary of argument name and its value}.' 
}}\n {{- \"Do not use variables.\\n\\n\" }}\n {%- for t in tools %}\n {{- t | tojson(indent=4) }}\n {{- \"\\n\\n\" }}\n {%- endfor %}\n {{- first_user_message + \"<|eot_id|>\"}}\n{%- endif %}\n\n{%- for message in messages %}\n {%- if not (message.role == 'ipython' or message.role == 'tool' or 'tool_calls' in message) %}\n {{- '<|start_header_id|>' + message['role'] + '<|end_header_id|>\\n\\n'+ message['content'] | trim + '<|eot_id|>' }}\n {%- elif 'tool_calls' in message %}\n {%- if not message.tool_calls|length == 1 %}\n {{- raise_exception(\"This model only supports single tool-calls at once!\") }}\n {%- endif %}\n {%- set tool_call = message.tool_calls[0].function %}\n {{- '<|start_header_id|>assistant<|end_header_id|>\\n\\n' -}}\n {{- '{\"name\": \"' + tool_call.name + '\", ' }}\n {{- '\"parameters\": ' }}\n {{- tool_call.arguments | tojson }}\n {{- \"}\" }}\n {{- \"<|eot_id|>\" }}\n {%- elif message.role == \"tool\" or message.role == \"ipython\" %}\n {{- \"<|start_header_id|>ipython<|end_header_id|>\\n\\n\" }}\n {%- if message.content is mapping or message.content is iterable %}\n {{- message.content | tojson }}\n {%- else %}\n {{- message.content }}\n {%- endif %}\n {{- \"<|eot_id|>\" }}\n {%- endif %}\n{%- endfor %}\n{%- if add_generation_prompt %}\n {{- '<|start_header_id|>assistant<|end_header_id|>\\n\\n' }}\n{%- endif %}\n", + "clean_up_tokenization_spaces": true, + "eos_token": "<|eot_id|>", + "extra_special_tokens": {}, + "model_input_names": [ + "input_ids", + "attention_mask" + ], + "model_max_length": 131072, + "pad_token": "<|eot_id|>", + "tokenizer_class": "PreTrainedTokenizerFast" +} diff --git a/training_args.bin b/training_args.bin new file mode 100644 index 0000000000000000000000000000000000000000..83b5b195c1720ea55bc992756c0ab6f1e2ef4671 --- /dev/null +++ b/training_args.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:8c1fd9b1621955a41f9e58d5a3e2b2d6a70bd41f5404ebfa5cb0ca999c290090 +size 5688 diff --git a/ultravox_config.py b/ultravox_config.py new file mode 100644 index 0000000000000000000000000000000000000000..3671250d6b6a69604473ed1a526484c8c9a77f68 --- /dev/null +++ b/ultravox_config.py @@ -0,0 +1,170 @@ +import dataclasses +from enum import Enum +from typing import Any, Dict, List, Optional + +import transformers + + +@dataclasses.dataclass +class LoraConfigSimplified: + """ + Low Rank Approximation (LoRA) configuration. + + Used for language and audio models separately. + """ + + # The rank of the approximation + r: int = 0 + lora_alpha: float = 8 + target_modules: Optional[List[str]] = dataclasses.field( + default_factory=lambda: ["k_proj", "q_proj", "linear_k", "linear_q"] + ) + + +class LossFunction(str, Enum): + CrossEntropy = "ce" + KL_Divergence = "kl" + + +@dataclasses.dataclass +class LossConfig: + loss_function: LossFunction = LossFunction.KL_Divergence + kl_temperature: float = 2.0 + + @property + def requires_alt_fields(self): + return self.loss_function == LossFunction.KL_Divergence + + +class UltravoxConfig(transformers.PretrainedConfig): + r""" + This is the configuration class to store the configuration of a [`UltravoxForConditionalGeneration`]. It is used to instantiate an + Ultravox model according to the specified arguments, defining the model architecture. + + Configuration objects inherit from [`PretrainedConfig`] and can be used to control the model outputs. Read the + documentation from [`PretrainedConfig`] for more information. 
+ + Args: + audio_config (`Wav2Vec2Config`, *optional*): + Custom audio config or dict + text_config (`Union[AutoConfig, dict]`, *optional*): + The config object of the text backbone. Can be any of `LlamaConfig` or `MistralConfig`. + ignore_index (`int`, *optional*, defaults to -100): + The ignore index for the loss function. + audio_token_index (`int`, *optional*, defaults to 32000): + The audio token index to encode the audio prompt. + stack_factor (`int`, *optional*, defaults to 8): + Audio downsampling factor for the multimodal projector. + norm_init (`float`, *optional*, defaults to 0.4): + The initialization value for the layer normalization. + projector_act (`str`, *optional*, defaults to `"swiglu"`): + The activation function used by the multimodal projector. + text_model_lora_config (`LoraConfigSimplified`, *optional*): + The LoRA configuration for finetuning the text model. + audio_model_lora_config (`LoraConfigSimplified`, *optional*): + The LoRA configuration for finetuning the audio model. + audio_latency_block_size (`int`, *optional*, defaults to `None`): + The latency block size for simulating audio streaming. + + + Example: + + ```python + >>> from transformers import UltravoxForConditionalGeneration, Wav2Vec2Config, UltravoxConfig, LlamaConfig + + >>> # Initializing an audio encoder config + >>> audio_config = Wav2Vec2Config() + + >>> # Initializing a Llama config + >>> text_config = LlamaConfig() + + >>> # Initializing a default configuration + >>> configuration = UltravoxConfig(audio_config, text_config) + + >>> # Initializing a completely untrained model from the configuration + >>> model = UltravoxForConditionalGeneration(configuration) + + >>> # Accessing the model configuration + >>> configuration = model.config + + >>> # Initialize a model from pretrained checkpoints and random projector weights + >>> config = UltravoxConfig(audio_model_id="facebook/wav2vec2-base-960h", text_model_id="meta-llama/Llama-2-7b-chat-hf") + ```""" + + model_type = "ultravox" + is_composition = False + + def __init__( + self, + audio_config: Optional[Dict[str, Any]] = None, + text_config: Optional[Dict[str, Any]] = None, + audio_model_id: Optional[str] = None, + text_model_id: Optional[str] = None, + ignore_index: int = -100, + hidden_size: int = 4096, + stack_factor: int = 8, + norm_init: float = 0.4, + projector_act: str = "swiglu", + text_model_lora_config: Optional[LoraConfigSimplified] = None, + audio_model_lora_config: Optional[LoraConfigSimplified] = None, + audio_latency_block_size: Optional[int] = None, + **kwargs, + ): + self.ignore_index = ignore_index + + self.audio_model_id = audio_model_id + self.text_model_id = text_model_id + + self.hidden_size = hidden_size + self.stack_factor = stack_factor + self.norm_init = norm_init + self.projector_act = projector_act + + if text_model_id is not None: + self.text_config: transformers.LlamaConfig = ( + transformers.AutoConfig.from_pretrained(text_model_id) + ) + else: + text_config = text_config or {} + self.text_config = transformers.CONFIG_MAPPING[ + text_config.get("model_type", "llama") + ](**text_config) + + if audio_model_id is not None: + self.audio_config: transformers.PretrainedConfig = ( + transformers.AutoConfig.from_pretrained(audio_model_id) + ) + else: + audio_config = audio_config or {} + self.audio_config = transformers.CONFIG_MAPPING[ + audio_config.get("model_type", "wav2vec2") + ](**audio_config) + + self.text_model_lora_config = ( + text_model_lora_config + if isinstance(text_model_lora_config, dict) + else 
dataclasses.asdict(text_model_lora_config or LoraConfigSimplified()) + ) + self.audio_model_lora_config = ( + audio_model_lora_config + if isinstance(audio_model_lora_config, dict) + else dataclasses.asdict(audio_model_lora_config or LoraConfigSimplified()) + ) + self.audio_latency_block_size = audio_latency_block_size + + self.vocab_size = self.text_config.vocab_size + + self.initializer_range = self.text_config.initializer_range + + super().__init__(**kwargs) + + def to_diff_dict(self) -> Dict[str, Any]: + diff_dict = super().to_diff_dict() + + # remove text_config and audio_config if text_model_id and audio_model_id are present + if self.text_model_id is not None: + diff_dict.pop("text_config", None) + if self.audio_model_id is not None: + diff_dict.pop("audio_config", None) + + return diff_dict diff --git a/ultravox_model.py b/ultravox_model.py new file mode 100644 index 0000000000000000000000000000000000000000..ccea3e0ca174ec3119184d1f5a0f384c1cdbcae9 --- /dev/null +++ b/ultravox_model.py @@ -0,0 +1,723 @@ +import logging +from typing import Any, Dict, Optional, Set, Tuple, Union + +import peft +import torch +import torch.nn as nn +import torch.nn.functional as F +import transformers +import transformers.activations +import transformers.modeling_outputs +import transformers.models +from transformers.models.whisper import modeling_whisper as whisper + +# We must use relative import in this directory to allow uploading to HF Hub +# Even "from . import X" pattern doesn't work (undocumented and unclear why) +from .ultravox_config import LossConfig +from .ultravox_config import LossFunction +from .ultravox_config import UltravoxConfig + + +class UltravoxModel(transformers.LlamaPreTrainedModel): + """ + The Ultravox model which consists of an audio encoder and a language model. + + Audio input is processed by the audio encoder, then every `stack_factor` frames are stacked together and + projected to the language model's embedding space using a few linear layers. + The text is embedded by the language model as usual and then the audio and text embeddings are merged together. + + A special token `<|audio|>` is used to indicate the start of the audio embeddings in the merged embeddings. + + Parameters: + config: Model configuration class with all the parameters of the model. + """ + + config_class = UltravoxConfig + config: UltravoxConfig # for type hinting + # Usually we load encoder and LLM weights from a pretrained model separately, so they are allowed to be missing + _keys_to_ignore_on_load_missing = ["audio_tower.*", "language_model.*"] + + def __init__(self, config: UltravoxConfig): + super().__init__(config) + self._register_load_state_dict_pre_hook(self._pre_load_state_dict_hook) + + self.keep_params: Set[str] = set() + self.vocab_size = config.vocab_size + + self.audio_tower = self._create_audio_tower(config) + self.multi_modal_projector = self._create_multi_modal_projector(config) + self.language_model = self._create_language_model(config) + + # Determine no_split_modules dynamically to use with FSDP auto_wrap policy. + # FSDP throws an error if some of the layer types are not found in the model. 
+ # This would be something like ["LlamaDecoderLayer", "WhisperEncoderLayer"] + self._no_split_modules = (self.language_model._no_split_modules or []) + ( + self.audio_tower._no_split_modules or [] + ) + + self.loss_config = LossConfig() + self.post_init() + + def get_input_embeddings(self): + return self.language_model.get_input_embeddings() + + def set_input_embeddings(self, value): + self.language_model.set_input_embeddings(value) + + def get_output_embeddings(self): + return self.language_model.get_output_embeddings() + + def set_output_embeddings(self, new_embeddings): + self.language_model.set_output_embeddings(new_embeddings) + + def set_decoder(self, decoder): + self.language_model.set_decoder(decoder) + + def get_decoder(self): + return self.language_model.get_decoder() + + def tie_weights(self): + return self.language_model.tie_weights() + + def set_loss_config(self, loss_config: LossConfig): + self.loss_config = loss_config + + def _setup_cache( + self, cache_cls, max_batch_size: int, max_cache_len: Optional[int] = None + ): + self.language_model._setup_cache(cache_cls, max_batch_size, max_cache_len) + + def _reorder_cache(self, past_key_values, beam_idx): + return self.language_model._reorder_cache(past_key_values, beam_idx) + + def resize_token_embeddings( + self, + new_num_tokens: Optional[int] = None, + pad_to_multiple_of: Optional[int] = None, + ) -> nn.Embedding: + model_embeds = self.language_model.resize_token_embeddings( + new_num_tokens, pad_to_multiple_of + ) + # update vocab size + self.config.text_config.vocab_size = model_embeds.num_embeddings + self.config.vocab_size = model_embeds.num_embeddings + self.vocab_size = model_embeds.num_embeddings + return model_embeds + + def _compute_kl_loss( + self, + lm_output: transformers.modeling_outputs.CausalLMOutputWithPast, + labels: Optional[torch.Tensor] = None, + past_key_values: Optional[Union[Tuple, transformers.cache_utils.Cache]] = None, + alt_input_ids: Optional[torch.Tensor] = None, + alt_attention_mask: Optional[torch.Tensor] = None, + alt_labels: Optional[torch.Tensor] = None, + **kwargs, + ): + # disable gradient computation for the teacher model + with torch.no_grad(): + # compute the teacher (text-only) model's distribution + alt_inputs_embeds = self.get_input_embeddings().forward(alt_input_ids) + alt_lm_output = self.language_model.forward( + inputs_embeds=alt_inputs_embeds, + labels=alt_labels, + attention_mask=alt_attention_mask, + past_key_values=past_key_values, + **kwargs, + ) + # compute the KL divergence loss between the two models + kl_loss = F.kl_div( + F.log_softmax( + lm_output.logits[labels != -100] / self.loss_config.kl_temperature, + dim=-1, + ), + F.softmax( + alt_lm_output.logits[alt_labels != -100] + / self.loss_config.kl_temperature, + dim=-1, + ), + reduction="batchmean", + ) + return {"loss": kl_loss} + + def forward( + self, + input_ids: torch.Tensor, + audio_values: Optional[torch.FloatTensor] = None, + inputs_embeds: Optional[torch.FloatTensor] = None, + labels: Optional[torch.Tensor] = None, + attention_mask: Optional[torch.Tensor] = None, + audio_token_start_idx: Optional[torch.Tensor] = None, + audio_len: Optional[torch.Tensor] = None, + audio_token_len: Optional[torch.Tensor] = None, + past_key_values: Optional[Union[Tuple, transformers.cache_utils.Cache]] = None, + # the alt_* fields are needed for KL divergence loss + alt_input_ids: Optional[torch.Tensor] = None, + alt_attention_mask: Optional[torch.Tensor] = None, + alt_labels: Optional[torch.Tensor] = None, + **kwargs, + ) -> 
Union[Tuple, transformers.modeling_outputs.CausalLMOutputWithPast]: + """ + Forward pass for the Ultravox model. + + `input_ids` are the tokenized text input. They are embedded by the language model as usual. + `audio_values` are processed by the audio encoder and then every `stack_factor` frames are stacked together and + projected to the language model's embedding space using a few linear layers. + The audio and text embeddings are merged together. A special token `<|audio|>` is used to indicate the start + of the audio embeddings in the merged embeddings. + + Args: + input_ids: The tokenized text input. + audio_values: The processed audio values. + inputs_embeds: The embeddings for the input tokens. + labels: The tokenized text labels. + attention_mask: The attention mask for the input. + position_ids: The position ids for the input. + past_key_values: The past key value cache for the language model attention layers. + **kwargs: Additional keyword arguments. Passed directly to the language model. + """ + if inputs_embeds is None: + # B x T -> B x T x D + inputs_embeds = self.get_input_embeddings().forward(input_ids) + + if audio_values is not None: + assert ( + audio_token_start_idx is not None and audio_token_len is not None + ), "audio_token_start_idx and audio_token_len must be provided if audio_values are provided." + assert ( + len(audio_token_start_idx) == len(audio_token_len) == len(audio_values) + ), "audio_token_start_idx, audio_token_len, and audio_values must have the same batch size." + + # B x A/3200 x D + audio_tower_output = self.audio_tower.forward( + audio_values.to(self.audio_tower.dtype), + audio_len=audio_len, + ).last_hidden_state + audio_tower_output = audio_tower_output.to(inputs_embeds.dtype) + + audio_embeds = self.multi_modal_projector.forward(audio_tower_output) + + # combine audio and text embeddings + for i, (audio, start, length) in enumerate( + zip(audio_embeds, audio_token_start_idx, audio_token_len) + ): + length = min(length, audio.shape[0]) + inputs_embeds[i, start : start + length] = audio[:length] + + lm_output = self.language_model.forward( + inputs_embeds=inputs_embeds, + labels=labels, + attention_mask=attention_mask, + past_key_values=past_key_values, + **kwargs, + ) + if self.training: + if self.loss_config.loss_function == LossFunction.CrossEntropy: + return lm_output + elif self.loss_config.loss_function == LossFunction.KL_Divergence: + return self._compute_kl_loss( + lm_output=lm_output, + labels=labels, + past_key_values=past_key_values, + alt_input_ids=alt_input_ids, + alt_attention_mask=alt_attention_mask, + alt_labels=alt_labels, + **kwargs, + ) + else: + raise ValueError( + f"Unsupported loss function: {self.loss_config.loss_function}" + ) + else: + return lm_output + + def prepare_inputs_for_generation( + self, + input_ids: torch.Tensor, + audio_values: Optional[torch.FloatTensor] = None, + audio_token_start_idx: Optional[torch.Tensor] = None, + audio_token_len: Optional[torch.Tensor] = None, + audio_len: Optional[torch.Tensor] = None, + past_key_values: Optional[Union[Tuple, transformers.cache_utils.Cache]] = None, + attention_mask: Optional[torch.Tensor] = None, + inputs_embeds: Optional[torch.Tensor] = None, + cache_position: Optional[torch.Tensor] = None, + **kwargs, + ) -> Dict[str, Any]: + model_input = self.language_model.prepare_inputs_for_generation( + input_ids=input_ids, + past_key_values=past_key_values, + attention_mask=attention_mask, + inputs_embeds=inputs_embeds, + cache_position=cache_position, + **kwargs, + ) + + # 
include audio information in model_input only when it is needed during prefilling + # audio_token_start_idx should always be relative to the current cache position + prefill_start_idx = 0 if cache_position is None else cache_position[0] + if ( + audio_values is not None + and audio_token_start_idx is not None + and prefill_start_idx <= torch.max(audio_token_start_idx) + ): + model_input["audio_values"] = audio_values + model_input["audio_token_start_idx"] = ( + audio_token_start_idx - prefill_start_idx + ) + model_input["audio_token_len"] = audio_token_len + model_input["audio_len"] = audio_len + + return model_input + + @classmethod + def _create_multi_modal_projector( + cls, config: UltravoxConfig + ) -> "UltravoxProjector": + projector = UltravoxProjector(config) + projector.to(config.torch_dtype) + return projector + + @classmethod + def _create_audio_tower( + cls, config: UltravoxConfig + ) -> Union[transformers.Wav2Vec2Model, "ModifiedWhisperEncoder"]: + if config.audio_model_id is not None: + if "whisper" in config.audio_model_id: + audio_tower = ModifiedWhisperEncoder.from_pretrained( + config.audio_model_id, torch_dtype=config.torch_dtype + ) + audio_tower.init_latency_mask( + config.audio_latency_block_size, dtype=config.torch_dtype + ) + else: + assert config.audio_latency_block_size in ( + None, + 0, + ), "only whisper audio tower supports audio latency masking, got non-zero value for 'audio_latency_block_size'" + audio_tower = transformers.AutoModel.from_pretrained( + config.audio_model_id, torch_dtype=config.torch_dtype + ) + else: + if "whisper" in config.audio_config._name_or_path: + audio_tower = ModifiedWhisperEncoder(config.audio_config) + audio_tower.init_latency_mask( + config.audio_latency_block_size, dtype=config.torch_dtype + ) + else: + assert config.audio_latency_block_size in ( + None, + 0, + ), "only whisper audio tower supports audio latency masking, got non-zero value for 'audio_latency_block_size'" + with transformers.modeling_utils.no_init_weights(): + # we only ever use from_config if the weights are retrained, hence initializing is not + # required. This makes model creation much faster since init on CPU is quite slow. + audio_tower = transformers.AutoModel.from_config( + config.audio_config + ) + + if isinstance( + audio_tower, + (transformers.Wav2Vec2BertModel, transformers.WhisperModel), + ): + # For these models we only need the encoder part + # Wav2Vec2BertModel -> Wav2Vec2BertEncoder + # WhisperModel -> WhisperEncoder + audio_tower = audio_tower.encoder + + audio_tower = apply_lora(audio_tower, config.audio_model_lora_config) + return audio_tower + + @classmethod + def _create_language_model( + cls, config: UltravoxConfig + ) -> transformers.LlamaForCausalLM: + if config.text_model_id is not None: + language_model = transformers.AutoModelForCausalLM.from_pretrained( + config.text_model_id, + attn_implementation=config._attn_implementation, + torch_dtype=config.torch_dtype, + ) + else: + with transformers.modeling_utils.no_init_weights(): + # we only ever use from_config if the weights are retrained, hence initializing is not + # required. This makes model creation much faster since init on CPU is quite slow.
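+ # (Parameters created under no_init_weights are deliberately left uninitialized; they + # are expected to be overwritten from the checkpoint's state dict, or trained, and + # should never be used as-is.)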
+ language_model = transformers.AutoModelForCausalLM.from_config( + config.text_config, + attn_implementation=config._attn_implementation, + torch_dtype=config.torch_dtype, + ) + + language_model = apply_lora(language_model, config.text_model_lora_config) + return language_model + + def merge_and_unload(self): + if isinstance(self.language_model, peft.PeftModel): + self.language_model = self.language_model.merge_and_unload() + # no need to download base language model weights anymore, so we can remove the id + self.config.text_model_id = None + self.keep_params.update( + set( + [ + f"language_model.{name}" + for name, _ in self.language_model.named_parameters() + ] + ) + ) + + if isinstance(self.audio_tower, peft.PeftModel): + self.audio_tower = self.audio_tower.merge_and_unload() + # no need to download base audio model weights anymore, so we can remove the id + self.config.audio_model_id = None + self.keep_params.update( + set( + [ + f"audio_tower.{name}" + for name, _ in self.audio_tower.named_parameters() + ] + ) + ) + + for param in ["text_model_lora_config", "audio_model_lora_config"]: + if hasattr(self.config, param): + delattr(self.config, param) + + def push_to_hub(self, *args, **kwargs): + self.merge_and_unload() + self.to(self.language_model.dtype) + return super().push_to_hub(*args, **kwargs) + + def save_pretrained( + self, *args, state_dict: Optional[Dict[str, Any]] = None, **kwargs + ): + if state_dict is None: + state_dict = super().state_dict() + + named_params = dict(self.named_parameters()) + + state_dict = { + k: v + for k, v in state_dict.items() + if k in self.keep_params + or (k in named_params and named_params[k].requires_grad) + } + + super().save_pretrained(*args, state_dict=state_dict, **kwargs) + + def _pre_load_state_dict_hook(self, state_dict: Dict[str, Any], *args, **kwargs): + self.keep_params.update(set(state_dict.keys())) + + def print_trainable_parameters(self): + """ + Prints the number of trainable parameters in the model (reuses Peft model's method) + """ + count_params = peft.peft_model.PeftModel.get_nb_trainable_parameters + + trainable_params, all_param = count_params(self) + + logging.info( + f"trainable params: {trainable_params:,d} || all params: {all_param:,d}" + f" || trainable%: {100 * trainable_params / all_param:.1f}%" + ) + + lm_trainable_params, lm_all_params = count_params(self.language_model) + audio_trainable_params, audio_all_params = count_params(self.audio_tower) + + projector_trainable_params = ( + trainable_params - lm_trainable_params - audio_trainable_params + ) + projector_all_params = all_param - lm_all_params - audio_all_params + + logging.info( + f"Trainable%: " + f" LLM: {100 * lm_trainable_params / lm_all_params:.1f}%" + f" || Audio Encoder: {100 * audio_trainable_params / audio_all_params:.1f}%" + f" || Projector: {100 * projector_trainable_params / projector_all_params:.1f}%" + ) + + +def is_cache_empty( + past_key_values: Optional[Union[Tuple, transformers.cache_utils.Cache]] +) -> bool: + """ + Check if the cache is empty. + """ + if past_key_values is None: + return True + if isinstance(past_key_values, tuple): + return all(len(c) == 0 for c in past_key_values) + return past_key_values.get_seq_length() == 0 + + +def apply_lora(model: torch.nn.Module, lora_config: dict) -> torch.nn.Module: + """ + Applies LoRA finetuning to the model. If the `r` parameter is set to 0, the model is frozen instead. 
+ """ + lora_config = peft.LoraConfig(**lora_config or {}) + + if lora_config.r == 0: + # freeze the model entirely + for param in model.parameters(): + param.requires_grad = False + else: + model = peft.get_peft_model(model, lora_config) + + return model + + +class StackAudioFrames(nn.Module): + """ + Stack the audio embedding frames to reduce the sequence length by a factor of `stack_factor`. + + The number of output frames will be `ceil(T / stack_factor) + 1` where `T` is the number of input frames. + NOTE: the extra +1 is intentional: in case the number of audio tokens are over-estimated by the processor, + we want to make sure `processor.audio_token_replacement` (i.e. EOS) doesn't get leaked into the middle of embeddings. + In most cases this extra padding will get removed in the model's forward function so it has no effect. + """ + + def __init__(self, stack_factor: int = 8): + super().__init__() + self.stack_factor = stack_factor + + def forward(self, audio_embeds: torch.Tensor) -> torch.Tensor: + B, T, C = audio_embeds.shape + T_pad = (T + self.stack_factor - 1) // self.stack_factor * self.stack_factor + audio_embeds = F.pad(audio_embeds, (0, 0, 0, T_pad - T + self.stack_factor)) + B, T, C = audio_embeds.shape + audio_embeds = audio_embeds.view( + B, T // self.stack_factor, C * self.stack_factor + ) + return audio_embeds + + +class RMSNorm(transformers.models.llama.modeling_llama.LlamaRMSNorm): + def __init__(self, hidden_size: int, init: float = 1, eps: float = 1e-6): + super().__init__(hidden_size=hidden_size, eps=eps) + self.weight.data.fill_(init) + + +class SwiGLU(nn.Module): + def forward(self, x): + x, gate = x.chunk(2, dim=-1) + return F.silu(gate) * x + + +class UltravoxProjector(nn.Sequential): + def __init__(self, config: UltravoxConfig): + super().__init__() + self.hidden_dim = config.hidden_size + self._pad_and_stack = StackAudioFrames(config.stack_factor) + dim = config.audio_config.hidden_size * config.stack_factor + self.ln_pre = RMSNorm(dim, init=config.norm_init) + self.linear_1 = nn.Linear(dim, self.hidden_dim, bias=False) + dim = self.hidden_dim + self.act = transformers.activations.get_activation(config.projector_act) + dim = dim // 2 if config.projector_act == "swiglu" else dim + self.linear_2 = nn.Linear(dim, config.text_config.hidden_size, bias=False) + self.ln_post = RMSNorm(config.text_config.hidden_size, init=config.norm_init) + + def forward(self, audio_features: torch.Tensor) -> torch.Tensor: + audio_features = self._pad_and_stack(audio_features) + audio_features = self.ln_pre(audio_features) + hidden_states = self.linear_1(audio_features) + hidden_states = self.act(hidden_states) + hidden_states = self.linear_2(hidden_states) + hidden_states = self.ln_post(hidden_states) + return hidden_states + + +class ModifiedWhisperEncoder( + whisper.WhisperEncoder, transformers.modeling_utils.ModuleUtilsMixin +): + """ + Encoder portion of OpenAI's Whisper model. + + This implementation is a slightly modified version of HF Transformers' Whisper Encoder, with only a few fixes: + 1. base_model_prefix updated to allow for doing `.from_pretrained` directly on the encoder + 2. 
allow less than 30 seconds of audio to be passed in: + - relaxed ValueError check for `input_features` length to be less than or equal to `expected_seq_length` instead of strictly equal + - embed_pos is now sliced to match the length of `inputs_embeds` + + Original: https://github.com/huggingface/transformers/blob/main/src/transformers/models/whisper/modeling_whisper.py + """ + + base_model_prefix = "model.encoder" + _no_split_modules = ["WhisperEncoderLayer"] + + def init_latency_mask(self, audio_latency_block_size: Optional[int], dtype: torch.dtype): + if audio_latency_block_size is None: + self.audio_streaming_mask = None + return + + # maximum sequence length + max_seqlen = ( + self.config.max_source_positions + * self.conv1.stride[0] + * self.conv2.stride[0] + ) + assert ( + max_seqlen > 0 + ), f"maximum sequence length must be positive, got {max_seqlen}" + assert ( + max_seqlen % audio_latency_block_size == 0 + ), f"audio_latency_block_size {audio_latency_block_size} must divide {max_seqlen} evenly." + # Given the block size, calculate the number of blocks. + audio_latency_nblocks = max_seqlen // audio_latency_block_size + audio_streaming_mask = ( + torch.tril( + torch.ones(audio_latency_nblocks, audio_latency_nblocks), + diagonal=0, + ) + .repeat_interleave(audio_latency_block_size, dim=0) + .repeat_interleave(audio_latency_block_size, dim=1) + ) + audio_streaming_mask = (1.0 - audio_streaming_mask) * torch.finfo(dtype).min + audio_streaming_mask = audio_streaming_mask[None, None, :, :] + self.register_buffer( + "audio_streaming_mask", audio_streaming_mask, persistent=False + ) + + def forward( + self, + input_features, + audio_len=None, + head_mask=None, + output_attentions=None, + output_hidden_states=None, + return_dict=None, + ): + expected_seq_length = ( + self.config.max_source_positions + * self.conv1.stride[0] + * self.conv2.stride[0] + ) + if input_features.shape[-1] > expected_seq_length: + raise ValueError( + f"Whisper expects the mel input features to be of length {expected_seq_length} or less, but found {input_features.shape[-1]}. Make sure to pad the input mel features to {expected_seq_length}."
+ ) + + output_attentions = ( + output_attentions + if output_attentions is not None + else self.config.output_attentions + ) + output_hidden_states = ( + output_hidden_states + if output_hidden_states is not None + else self.config.output_hidden_states + ) + return_dict = ( + return_dict if return_dict is not None else self.config.use_return_dict + ) + inputs_embeds = nn.functional.gelu(self.conv1(input_features)) + inputs_embeds = nn.functional.gelu(self.conv2(inputs_embeds)) + + inputs_embeds = inputs_embeds.permute(0, 2, 1) + embed_pos = self.embed_positions.weight[: inputs_embeds.size(-2)] + + hidden_states = inputs_embeds + embed_pos + hidden_states = nn.functional.dropout( + hidden_states, p=self.dropout, training=self.training + ) + + encoder_states = () if output_hidden_states else None + all_attentions = () if output_attentions else None + + # Create attention mask based on audio lengths to mask out padding tokens + # For each sample in batch: + # - Convert raw audio length to feature length after convolutions + # - Create boolean mask that is True for valid positions and False for padding + # - Convert to extended attention mask format expected by transformer layers + # (1.0 for positions to attend to, large negative for positions to ignore) + # This masking ensures consistent behavior between training and inference + # by preventing the model from attending to padding tokens in both cases + attention_mask = None + if audio_len is not None: + audio_feature_len = self._get_feat_extract_output_lengths(audio_len) + max_seq_len = hidden_states.shape[1] + attention_mask = torch.arange(max_seq_len, device=hidden_states.device)[ + None, : + ].lt(audio_feature_len.view(-1, 1)) + attention_mask = self.get_extended_attention_mask( + attention_mask, + None, + device=hidden_states.device, + dtype=hidden_states.dtype, + ) + + if self.audio_streaming_mask is not None: + seqlen = hidden_states.size(-2) + if attention_mask is not None: + attention_mask = torch.minimum( + self.audio_streaming_mask[:, :, :seqlen, :seqlen], attention_mask + ) # merge + else: + attention_mask = self.audio_streaming_mask[:, :, :seqlen, :seqlen] + attention_mask = attention_mask.to(hidden_states.dtype) + + # check if head_mask has a correct number of layers specified if desired + if head_mask is not None: + assert head_mask.size()[0] == ( + len(self.layers) + ), f"The head_mask should be specified for {len(self.layers)} layers, but it is for {head_mask.size()[0]}."
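+ + # At this point, `attention_mask` (if set) is an additive float mask broadcastable to + # (batch, num_heads, seq, seq): 0.0 where attention is allowed, and a large negative + # value where it is blocked (padding positions and, in streaming mode, future blocks).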
+ + for idx, encoder_layer in enumerate(self.layers): + if output_hidden_states: + encoder_states = encoder_states + (hidden_states,) + # add LayerDrop (see https://arxiv.org/abs/1909.11556 for description) + to_drop = False + if self.training: + dropout_probability = torch.rand([]) + if dropout_probability < self.layerdrop: # skip the layer + to_drop = True + + if to_drop: + layer_outputs = (None, None) + else: + if self.gradient_checkpointing and self.training: + layer_outputs = self._gradient_checkpointing_func( + encoder_layer.__call__, + hidden_states, + attention_mask, + (head_mask[idx] if head_mask is not None else None), + output_attentions, + ) + else: + layer_outputs = encoder_layer( + hidden_states, + attention_mask, + layer_head_mask=( + head_mask[idx] if head_mask is not None else None + ), + output_attentions=output_attentions, + ) + + hidden_states = layer_outputs[0] + + if output_attentions: + all_attentions = all_attentions + (layer_outputs[1],) + + hidden_states = self.layer_norm(hidden_states) + if output_hidden_states: + encoder_states = encoder_states + (hidden_states,) + + if not return_dict: + return tuple( + v + for v in [hidden_states, encoder_states, all_attentions] + if v is not None + ) + return transformers.modeling_outputs.BaseModelOutput( + last_hidden_state=hidden_states, + hidden_states=encoder_states, + attentions=all_attentions, + ) + + +UltravoxConfig.register_for_auto_class() +UltravoxModel.register_for_auto_class() + +transformers.AutoConfig.register("ultravox", UltravoxConfig) +transformers.AutoModel.register(UltravoxConfig, UltravoxModel) + +transformers.activations.ACT2FN["swiglu"] = SwiGLU diff --git a/ultravox_pipeline.py b/ultravox_pipeline.py new file mode 100644 index 0000000000000000000000000000000000000000..4d84c8b677f454bce10210886a1518ea885a436b --- /dev/null +++ b/ultravox_pipeline.py @@ -0,0 +1,127 @@ +import logging +from typing import Any, Dict, List, Optional + +import numpy as np +import transformers + +# We must use relative import in this directory to allow uploading to HF Hub +# Even "from . import X" pattern doesn't work (undocumented and unclear why) +from .ultravox_model import UltravoxModel +from .ultravox_processing import UltravoxProcessor + + +class UltravoxPipeline(transformers.Pipeline): + def __init__( + self, + model: UltravoxModel, + tokenizer: Optional[transformers.PreTrainedTokenizerBase] = None, + audio_processor: Optional[transformers.ProcessorMixin] = None, + **kwargs + ): + if tokenizer is None: + try: + tokenizer = transformers.AutoTokenizer.from_pretrained( + model.config._name_or_path + ) + except Exception: + # Fall back to the text backbone's tokenizer if the repo does not ship one. + tokenizer = transformers.AutoTokenizer.from_pretrained( + model.config.text_model_id or model.config.text_config._name_or_path + ) + + if audio_processor is None: + audio_processor = transformers.AutoProcessor.from_pretrained( + model.config.audio_model_id or model.config.audio_config._name_or_path + ) + + super().__init__(model=model, tokenizer=tokenizer, **kwargs) + + self.processor = UltravoxProcessor( + audio_processor=audio_processor, + tokenizer=tokenizer, + stack_factor=model.config.stack_factor, + ) + + def _sanitize_parameters(self, **kwargs): + generation_keys = ["temperature", "max_new_tokens", "repetition_penalty"] + generation_kwargs = {k: kwargs[k] for k in kwargs if k in generation_keys} + return {}, generation_kwargs, {} + + def preprocess(self, inputs: Dict[str, Any]): + turns: list = inputs.get("turns", []) + + audio = inputs.get("audio", None) + # Convert to float32 if needed.
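+ # Integer PCM is rescaled by the magnitude of its dtype's minimum (2**15 for int16, + # 2**31 for int32) so that samples land in [-1.0, 1.0), matching float input.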
+ if isinstance(audio, np.ndarray): + if audio.dtype == np.float64: + audio = audio.astype(np.float32) + elif audio.dtype == np.int16: + audio = audio.astype(np.float32) / np.float32(32768.0) + elif audio.dtype == np.int32: + audio = audio.astype(np.float32) / np.float32(2147483648.0) + + if audio is not None and (len(turns) == 0 or turns[-1]["role"] != "user"): + prompt = inputs.get("prompt", "<|audio|>") + if "<|audio|>" not in prompt: + logging.warning( + "Prompt does not contain '<|audio|>', appending '<|audio|>' to the end of the prompt." + ) + + prompt += " <|audio|>" + turns.append({"role": "user", "content": prompt}) + + text = self.processor.tokenizer.apply_chat_template( + turns, add_generation_prompt=True, tokenize=False + ) + + if "sampling_rate" not in inputs and audio is not None: + logging.warning( + "No sampling rate provided, using default of 16kHz. We highly recommend providing the correct sampling rate." + ) + + output = self.processor( + text=text, + audio=audio, + sampling_rate=inputs.get("sampling_rate", 16000), + ) + if "audio_values" in output: + output["audio_values"] = output["audio_values"].to(self.model.dtype) + + return output + + def _forward( + self, + model_inputs: Dict[str, Any], + temperature: Optional[float] = None, + max_new_tokens: Optional[int] = None, + repetition_penalty: float = 1.1, + ) -> List[int]: + temperature = temperature or None + do_sample = temperature is not None + + terminators = [self.tokenizer.eos_token_id] + if "<|eot_id|>" in self.tokenizer.added_tokens_encoder: + terminators.append(self.tokenizer.convert_tokens_to_ids("<|eot_id|>")) + + input_len = model_inputs["input_ids"].shape[1] + + outputs = self.model.generate( + **model_inputs, + do_sample=do_sample, + temperature=temperature, + max_new_tokens=max_new_tokens, + repetition_penalty=repetition_penalty, + eos_token_id=terminators + ) + return outputs[0][input_len:] + + def postprocess(self, model_outputs) -> str: + output_text = self.tokenizer.decode(model_outputs, skip_special_tokens=True) + return output_text + + +transformers.pipelines.PIPELINE_REGISTRY.register_pipeline( + "ultravox-pipeline", + pipeline_class=UltravoxPipeline, + pt_model=transformers.AutoModel, + type="multimodal", +) diff --git a/ultravox_processing.py b/ultravox_processing.py new file mode 100644 index 0000000000000000000000000000000000000000..5cab9e8cb198172ae235c1e43ca7ca1f0c1aba49 --- /dev/null +++ b/ultravox_processing.py @@ -0,0 +1,221 @@ +from typing import Optional, Union + +import numpy as np +import torch +import transformers + +from .ultravox_config import UltravoxConfig + + +class UltravoxProcessor(transformers.ProcessorMixin): + """ + Constructs an Ultravox processor which wraps an audio processor and a tokenizer into a single processor. + + Args: + audio_processor: The audio processor for the audio encoder. + tokenizer: The tokenizer for the language model. + """ + + attributes = ["audio_processor", "tokenizer"] + audio_processor_class = ( + "Wav2Vec2Processor", + "SeamlessM4TFeatureExtractor", + "WhisperProcessor", + ) + tokenizer_class = ( + "PreTrainedTokenizer", + "PreTrainedTokenizerFast", + ) + + tokenizer: transformers.PreTrainedTokenizerBase + audio_processor: transformers.ProcessorMixin + + def __init__( + self, + audio_processor=None, + tokenizer=None, + audio_padding: str = "longest", + encoder_ds_factor: int = 320, + stack_factor: int = 8, + audio_placeholder: str = "<|audio|>", + ): + """ + Args: + audio_processor: The audio processor for the audio encoder. 
tokenizer: The tokenizer for the language model. + audio_padding: The padding strategy for the audio encoder. + encoder_ds_factor: The downsample factor of the audio encoder. + stack_factor: The factor by which the audio encoder output is stacked in the multimodal projector. + audio_placeholder: The placeholder for the audio in the text. + """ + self.audio_padding = audio_padding + self.encoder_ds_factor = encoder_ds_factor + self.stack_factor = stack_factor + self.audio_placeholder = audio_placeholder + self.audio_token_replacement = tokenizer.eos_token + assert ( + self.audio_token_replacement is not None + ), "The tokenizer has no EOS token. Cannot recover." + if tokenizer.pad_token_id is None: + tokenizer.pad_token_id = tokenizer.eos_token_id + + super().__init__(audio_processor=audio_processor, tokenizer=tokenizer) + + @classmethod + def from_pretrained(cls, pretrained_model_name_or_path: str, **kwargs): + config: UltravoxConfig = transformers.AutoConfig.from_pretrained( + pretrained_model_name_or_path, **kwargs + ) + audio_processor = transformers.AutoProcessor.from_pretrained( + config.audio_model_id + or config.audio_config._name_or_path + or "facebook/wav2vec2-base-960h" + ) + + tokenizer = transformers.AutoTokenizer.from_pretrained( + pretrained_model_name_or_path, **kwargs + ) + tokenizer.padding_side = "left" + tokenizer.pad_token = tokenizer.eos_token + + return cls( + audio_processor=audio_processor, + tokenizer=tokenizer, + stack_factor=config.stack_factor, + ) + + def __call__( + self, + text: Optional[str] = None, + audio: Optional[Union[np.ndarray, torch.Tensor]] = None, + sampling_rate: Optional[int] = None, + return_tensors: Optional[ + Union[str, transformers.TensorType] + ] = transformers.TensorType.PYTORCH, + **kwargs, + ) -> transformers.BatchFeature: + """ + Main method to prepare one text sequence and its audio for the model. This method forwards the `text` + and `kwargs` arguments to PreTrainedTokenizerFast's [`~PreTrainedTokenizerFast.__call__`] if `text` is not `None` to encode + the text. To prepare the audio(s), this method forwards the `audio`, `sampling_rate` and `kwargs` arguments to + audio processor's [`~Wav2Vec2Processor.__call__`] if `audio` is not `None`. Please refer to the docstring + of the above two methods for more information. + + Args: + text (`str`): + The sequence to be encoded. Must be a single string; batch mode is not supported yet. + audio (`np.ndarray`, `torch.Tensor`): + The audio to be prepared. Audio can be a NumPy array or a PyTorch tensor. In case of a + NumPy array/PyTorch tensor, each audio should be of shape (C, T), where C is the number of channels, and T the + sample length of the audio. + sampling_rate (`int`, *optional*, defaults to 16000): + Sampling rate of the input audio. We expect 16kHz audio. Don't change this value unless you know what + you are doing. + return_tensors (`str` or [`~utils.TensorType`], *optional*): + If set, will return tensors of a particular framework. Acceptable values are: + + - `'tf'`: Return TensorFlow `tf.constant` objects. + - `'pt'`: Return PyTorch `torch.Tensor` objects. + - `'np'`: Return NumPy `np.ndarray` objects. + - `'jax'`: Return JAX `jnp.ndarray` objects. + + Returns: + [`BatchFeature`]: A [`BatchFeature`] with the following fields: + + - **input_ids** -- List of token ids to be fed to a model. Returned when `text` is not `None`.
- **attention_mask** -- List of indices specifying which tokens should be attended to by the model (when + `return_attention_mask=True` or if *"attention_mask"* is in `self.model_input_names` and if `text` is not + `None`). + - **audio_values** -- Processed audio values to be fed to a model. Returned when `audio` is not `None`. + - **audio_token_len** -- Predicted number of audio frames: this value is guaranteed to be a close upper bound. + Returned when `audio` is not `None`. + - **audio_token_start_idx** -- The index in the tokenized text where the audio starts. Returned when `audio` is not `None`. + """ + # TODO: Add support for multiple audio and text inputs. + data = {} + audio_embed_frames = 0 + if audio is not None and len(audio) > 0: + if self.audio_padding == "max_length": + # 30 seconds is the expected length for Whisper + assert sampling_rate is not None, "Sampling rate must be provided." + audio_len = 30 * sampling_rate + else: + audio_len = audio.shape[-1] + # It's guaranteed that the number of frames is less than or equal to this amount. + # For Whisper this is exact AFAICT, but for Wav2Vec2 it's an upper bound. + # Currently, StackAudioFrames makes sure an over-estimation won't cause issues by padding the audio embeddings. + nb_encoder_frames = int(round(audio_len / self.encoder_ds_factor + 1e-4)) + audio_embed_frames = int(np.ceil(nb_encoder_frames / self.stack_factor)) + data["audio_token_len"] = [audio_embed_frames] + + # Main audio processing. The processor is model-specific. + x = self.audio_processor( + audio, + sampling_rate=sampling_rate, + padding="longest", + max_length=audio_len, + return_attention_mask=True, + **kwargs, + ) + if "input_features" in x: + data["audio_values"] = x.input_features + else: + data["audio_values"] = x.input_values + + # data["audio_len"] is the number of frames in the audio, used for creating attention masks in whisper encoder + if ( + self.audio_padding == "max_length" + ): # audio is padded to max length, so we rely on the attention mask to determine audio_len + data["audio_len"] = ( + x.attention_mask.sum(-1) - 1 + ) # Whisper attention mask includes an extra 1 at the end that needs to be subtracted + else: # audio is not padded, so we can directly use the audio length + data["audio_len"] = [torch.as_tensor(data["audio_values"]).shape[-1]] + + if text is not None: + assert isinstance( + text, str + ), "Text must be a string. Batch mode not supported yet." + if self.audio_placeholder in text: + if "audio_token_len" not in data: + raise ValueError( + f"audio must be provided when using audio placeholder ({self.audio_placeholder}) in text." + ) + + start_idx = len( + self.tokenizer.encode( + text[: text.index(self.audio_placeholder)], + add_special_tokens=False, + ) + ) + data["audio_token_start_idx"] = [start_idx] + + # Replace the audio placeholder with the audio token. + # e.g. "Transcribe\n<|audio|>" -> "Transcribe\n</s></s>...</s>" + # where the number of </s> tokens (audio_token_replacement, i.e. EOS) is the number of audio frames. + text = text.replace( + self.audio_placeholder, + self.audio_token_replacement * audio_embed_frames, + ) + + # Special tokens like BOS should already have been added by the caller.
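+ # (Worked example for the replacement above, with the defaults encoder_ds_factor=320 + # and stack_factor=8: 2 s of 16 kHz audio = 32000 samples -> round(32000 / 320) = 100 + # encoder frames -> ceil(100 / 8) = 13 audio tokens, i.e. 13 EOS placeholders.)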
+ data.update(self.tokenizer([text], add_special_tokens=False, **kwargs)) + + return transformers.BatchFeature(data=data, tensor_type=return_tensors) + + def batch_decode(self, *args, **kwargs): + return self.tokenizer.batch_decode(*args, **kwargs) + + def decode(self, *args, **kwargs): + return self.tokenizer.decode(*args, **kwargs) + + @property + def model_input_names(self): + tokenizer_input_names = self.tokenizer.model_input_names + audio_processor_input_names = self.audio_processor.model_input_names + return list(set(tokenizer_input_names + audio_processor_input_names)) + + +UltravoxProcessor.register_for_auto_class() + +transformers.AutoProcessor.register(UltravoxConfig, UltravoxProcessor)
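For reference, a minimal end-to-end usage sketch of the custom pipeline registered above. The repo id below is a placeholder (substitute the Hub path this checkpoint is published under); `trust_remote_code=True` is required because `UltravoxModel` and `UltravoxPipeline` live inside the repo, and the input is assumed to be 16 kHz mono float32 audio:

import numpy as np
import transformers

# Placeholder repo id -- replace with the actual Hub path of this checkpoint.
pipe = transformers.pipeline(
    model="<org>/<this-repo>",
    trust_remote_code=True,
)

audio = np.zeros(16000, dtype=np.float32)  # 1 second of silence at 16 kHz, for illustration
turns = [{"role": "system", "content": "You are a friendly assistant."}]
# preprocess() appends a user turn containing "<|audio|>" when the last turn is not a user turn.
print(pipe({"audio": audio, "turns": turns, "sampling_rate": 16000}, max_new_tokens=30))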