# Training hyperparameters (fragment of a TrainingArguments/SFTConfig call).
# NOTE: originally collapsed onto one line, which made the inline `#` comments
# swallow every argument after them — gradient_accumulation_steps, weight_decay,
# and lr_scheduler_type were all commented out. One kwarg per line fixes that.
warmup_steps = 5,  # linear LR warmup over the first 5 optimizer steps
num_train_epochs = 3,
learning_rate = 5e-5,
optim = "adamw_torch",  # alternative: "adamw_8bit" for lower optimizer memory
gradient_accumulation_steps = 4,  # effective batch = per-device batch * 4
weight_decay = 0.03,  # L2 regularization
lr_scheduler_type = "linear",