{
  "train_config": "config/is_selfssm.swe_2048.json",
  "model_name": "Selfssm",
  "devices": 8,
  "train_name": "is_selfssm.swe_2048",
  "name": "is_selfssm.swe_2048_Selfssm",
  "model_type": "selfssm",
  "nodes": 1,
  "block_size": 4096,
  "max_tokens": 20000000000,
  "global_batch_size": 512,
  "micro_batch_size": 2,
  "batch_size": 64,
  "gradient_accumulation_steps": 32,
  "learning_rate": 0.0004,
  "total_evals": 400,
  "warmup_tokens": 200000000,
  "eval_iters": 200,
  "resume_ckpt": "token-3775135744.iter-014400-ckpt.pth",
  "log_step_interval": 10,
  "save_step_interval": 300,
  "eval_step_interval": 300,
  "num_extrapol": 4,
  "weight_decay": 0.1,
  "beta1": 0.9,
  "beta2": 0.95,
  "grad_clip": 1.0,
  "decay_lr": true,
  "min_lr": 4e-05,
  "attn_type": "flash_attention_2",
  "attn_window_size": 2048,
  "output_attentions": false,
  "output_hidden_states": false,
  "load_linear_data": "next",
  "load_linear_overlap": 0,
  "load_linear_also_copy_rate": 0,
  "data_do_shuffle": false,
  "load_input_data": "",
  "log_iter_interval": 320,
  "tok": "llama2_tok",
  "transformer_hidden_size": 1024,
  "transformer_num_hidden_layers": 12,
  "transformer_intermediate_size": 4096,
  "linear_hidden_states": 2048,
  "linear_cal_loss": false,
  "linear_num_hidden_layers": 12,
  "linear_hidden_name": "output_embedding",
  "linear_multiply_self_attn_B": false,
  "linear_input_w_selfattn": false,
  "integrate_self_attn_weight": "",
  "integrate_self_attn_weight_rate": 1,
  "tie_word_embedding_layer": true,
  "linear_up_project": false,
  "linear_up_project_size": 0,
  "interact_mode": "cross_attn"
}