{
"d_model": 128,
"num_layers": 2,
"T_local": 3,
"cluster_size": 8,
"seq_len": 256,
"batch_size": 96,
"learning_rate": 4.76e-4,
"weight_decay": 0.0541,
"dropout": 0.30
}
{
"d_model": 128,
"num_layers": 2,
"T_local": 3,
"cluster_size": 8,
"seq_len": 256,
"batch_size": 96,
"learning_rate": 4.76e-4,
"weight_decay": 0.0541,
"dropout": 0.30
}