{
  "output_dir": "output",
  "model_name_or_path": "models/Qwen2.5-1.5B-Instruct",
  "deepspeed": "./train_args/ds_z3_config.json",
  "train_file": "train_GPT/conversations_20000_MASK.jsonl",
  "template_name": "qwen",
  "train_mode": "full",
  "num_train_epochs": 3,
  "per_device_train_batch_size": 4,
  "gradient_accumulation_steps": 16,
  "learning_rate": 1e-05,
  "max_seq_length": 4096,
  "logging_steps": 200,
  "save_steps": 200,
  "save_total_limit": 1,
  "lr_scheduler_type": "cosine",
  "warmup_steps": 50,
  "gradient_checkpointing": false,
  "disable_tqdm": false,
  "optim": "adamw_hf",
  "seed": 42,
  "fp16": true,
  "report_to": "tensorboard",
  "dataloader_num_workers": 0,
  "save_strategy": "steps",
  "weight_decay": 0,
  "max_grad_norm": 1.0,
  "remove_unused_columns": false
}