{
    "train_config": "config/is_selfssm.swe_2048.json",
    "model_name": "Selfssm",
    "devices": 8,
    "train_name": "is_selfssm.swe_2048",
    "name": "is_selfssm.swe_2048_Selfssm",
    "model_type": "selfssm",
    "nodes": 1,
    "block_size": 4096,
    "max_tokens": 20000000000,
    "global_batch_size": 512,
    "micro_batch_size": 2,
    "batch_size": 64,
    "gradient_accumulation_steps": 32,
    "learning_rate": 0.0004,
    "total_evals": 400,
    "warmup_tokens": 200000000,
    "eval_iters": 200,
    "resume_ckpt": "token-3775135744.iter-014400-ckpt.pth",
    "log_step_interval": 10,
    "save_step_interval": 300,
    "eval_step_interval": 300,
    "num_extrapol": 4,
    "weight_decay": 0.1,
    "beta1": 0.9,
    "beta2": 0.95,
    "grad_clip": 1.0,
    "decay_lr": true,
    "min_lr": 4e-05,
    "attn_type": "flash_attention_2",
    "attn_window_size": 2048,
    "output_attentions": false,
    "output_hidden_states": false,
    "load_linear_data": "next",
    "load_linear_overlap": 0,
    "load_linear_also_copy_rate": 0,
    "data_do_shuffle": false,
    "load_input_data": "",
    "log_iter_interval": 320,
    "tok": "llama2_tok",
    "transformer_hidden_size": 1024,
    "transformer_num_hidden_layers": 12,
    "transformer_intermediate_size": 4096,
    "linear_hidden_states": 2048,
    "linear_cal_loss": false,
    "linear_num_hidden_layers": 12,
    "linear_hidden_name": "output_embedding",
    "linear_multiply_self_attn_B": false,
    "linear_input_w_selfattn": false,
    "integrate_self_attn_weight": "",
    "integrate_self_attn_weight_rate": 1,
    "tie_word_embedding_layer": true,
    "linear_up_project": false,
    "linear_up_project_size": 0,
    "interact_mode": "cross_attn"
}