{
  "train_config": "config/is_selfssm.swe_2048.json",
  "model_name": "Selfssm",
  "devices": 8,
  "train_name": "is_selfssm.swe_2048",
  "name": "is_selfssm.swe_2048_Selfssm",
  "model_type": "selfssm",
  "nodes": 1,
  "block_size": 4096,
  "max_tokens": 20000000000,
  "global_batch_size": 512,
  "micro_batch_size": 2,
  "batch_size": 64,
  "gradient_accumulation_steps": 32,
  "learning_rate": 0.0004,
  "total_evals": 400,
  "warmup_tokens": 200000000,
  "eval_iters": 200,
  "resume_ckpt": "token-3775135744.iter-014400-ckpt.pth",
  "log_step_interval": 10,
  "save_step_interval": 300,
  "eval_step_interval": 300,
  "num_extrapol": 4,
  "weight_decay": 0.1,
  "beta1": 0.9,
  "beta2": 0.95,
  "grad_clip": 1.0,
  "decay_lr": true,
  "min_lr": 4e-05,
  "attn_type": "flash_attention_2",
  "attn_window_size": 2048,
  "output_attentions": false,
  "output_hidden_states": false,
  "load_linear_data": "next",
  "load_linear_overlap": 0,
  "load_linear_also_copy_rate": 0,
  "data_do_shuffle": false,
  "load_input_data": "",
  "log_iter_interval": 320,
  "tok": "llama2_tok",
  "transformer_hidden_size": 1024,
  "transformer_num_hidden_layers": 12,
  "transformer_intermediate_size": 4096,
  "linear_hidden_states": 2048,
  "linear_cal_loss": false,
  "linear_num_hidden_layers": 12,
  "linear_hidden_name": "output_embedding",
  "linear_multiply_self_attn_B": false,
  "linear_input_w_selfattn": false,
  "integrate_self_attn_weight": "",
  "integrate_self_attn_weight_rate": 1,
  "tie_word_embedding_layer": true,
  "linear_up_project": false,
  "linear_up_project_size": 0,
  "interact_mode": "cross_attn"
}