warmup_steps = 5,
num_train_epochs = 3,
learning_rate = 5e-5,
optim = "galore_adafactor",                           # GaLore-projected Adafactor
optim_target_modules = [r".*.attn.*", r".*.mlp.*"],   # apply GaLore to attention and MLP linear layers
weight_decay = 0.03,                                  # L2 regularization
lr_scheduler_type = "linear",                         # alternative: "reduce_lr_on_plateau"
gradient_accumulation_steps = 4,
use_liger = True,
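For context, here is a minimal sketch of how these hyperparameters might plug into a plain `transformers.TrainingArguments` and `Trainer`. The checkpoint name, toy dataset, and output directory below are placeholders, not part of the original setup; GaLore needs `galore_torch` installed, Liger needs `liger-kernel`, and `use_liger_kernel` is the `TrainingArguments` spelling of the `use_liger` flag used above.

```python
# Sketch only: wires the hyperparameters above into a vanilla Trainer run.
from datasets import Dataset
from transformers import (
    AutoModelForCausalLM,
    AutoTokenizer,
    DataCollatorForLanguageModeling,
    Trainer,
    TrainingArguments,
)

model_name = "HuggingFaceTB/SmolLM2-135M"  # placeholder checkpoint
tokenizer = AutoTokenizer.from_pretrained(model_name)
if tokenizer.pad_token is None:
    tokenizer.pad_token = tokenizer.eos_token  # needed for padding in the collator
model = AutoModelForCausalLM.from_pretrained(model_name)

# Tiny placeholder dataset so the script is self-contained.
texts = [
    "GaLore projects gradients into a low-rank subspace.",
    "Liger kernels fuse common transformer operations.",
]
train_dataset = Dataset.from_dict({"text": texts}).map(
    lambda ex: tokenizer(ex["text"], truncation=True, max_length=128),
    remove_columns=["text"],
)
collator = DataCollatorForLanguageModeling(tokenizer=tokenizer, mlm=False)

args = TrainingArguments(
    output_dir="galore-finetune",                        # placeholder output path
    warmup_steps=5,
    num_train_epochs=3,
    learning_rate=5e-5,
    optim="galore_adafactor",                            # GaLore-projected Adafactor
    optim_target_modules=[r".*.attn.*", r".*.mlp.*"],    # match attention/MLP linears
    weight_decay=0.03,                                   # L2 regularization
    lr_scheduler_type="linear",
    gradient_accumulation_steps=4,
    use_liger_kernel=True,                               # Liger fused kernels
)

trainer = Trainer(
    model=model,
    args=args,
    train_dataset=train_dataset,
    data_collator=collator,
    tokenizer=tokenizer,
)
trainer.train()
```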