keeeeenw committed on
Commit 63ae844 · verified · 1 Parent(s): cbc344c

Upload microllama_v2.yaml with huggingface_hub

Files changed (1)
  1. microllama_v2.yaml +121 -0
microllama_v2.yaml ADDED
@@ -0,0 +1,121 @@
# The name of the model to pretrain. Choose from names in ``litgpt.config``. Mutually exclusive with
# ``model_config``. (type: Optional[str], default: null)
model_name: micro-llama-300M-v2

# A ``litgpt.Config`` object to define the model architecture. Mutually exclusive with
# ``model_name``. (type: Optional[Config], default: null)
model_config:

# Directory in which to save checkpoints and logs. If running in a Lightning Studio Job, look for it in
# /teamspace/jobs/<job-name>/share. (type: <class 'Path'>, default: out/pretrain)
out_dir: out/pretrain/micro-llama-v2

# The precision to use for pretraining. Possible choices: "bf16-true", "bf16-mixed", "32-true". (type: Optional[str], default: null)
precision: bf16-mixed

# Optional path to a checkpoint directory to initialize the model from.
# Useful for continued pretraining. Mutually exclusive with ``resume``. (type: Optional[Path], default: null)
# initial_checkpoint_dir: /root/litgpt/out_lightning_ai/pretrain/micro-llama-v2/step-00128000/
initial_checkpoint_dir: /root/litgpt/out_lightning_ai/step-00128000-converted

# Path to a checkpoint directory to resume from in case training was interrupted, or ``True`` to resume
# from the latest checkpoint in ``out_dir``. An error will be raised if no checkpoint is found. Passing
# ``'auto'`` will resume from the latest checkpoint but not error if no checkpoint exists.
# (type: Union[bool, Literal["auto"], Path], default: False)
resume: False

# Data-related arguments. If not provided, the default is ``litgpt.data.TinyLlama``.
data: MicroLlama

# Training-related arguments. See ``litgpt.args.TrainArgs`` for details.
train:

  # Number of optimizer steps between saving checkpoints (type: Optional[int], default: 1000)
  save_interval: 1000

  # Number of iterations between logging calls (type: int, default: 1)
  log_interval: 10

  # Number of samples between optimizer steps across data-parallel ranks (type: int, default: 48)
  # Scale this number according to the number of GPUs and the memory size per GPU.
  # For example, we used 16 for 4 x 48GB L40s.
  global_batch_size: 32

  # Number of samples per data-parallel rank (type: int, default: 12)
  # Scale this number according to the memory size per GPU.
  # For example, we used 12 for a 24GB 4090.
  micro_batch_size: 4

  # Number of iterations with learning rate warmup active (type: int, default: 2000)
  lr_warmup_steps: 2000

  # Number of epochs to train on (type: Optional[int], default: null)
  epochs:

  # Total number of tokens to train on (type: Optional[int], default: 3000000000000)
  max_tokens: 3000000000000

  # Limits the number of optimizer steps to run. (type: Optional[int], default: null)
  max_steps:

  # Limits the length of samples. Off by default (type: Optional[int], default: null)
  max_seq_length: 2048

  # Whether to tie the embedding weights with the language modeling head weights. (type: Optional[bool], default: False)
  tie_embeddings:

  # (type: Optional[float], default: 1.0)
  max_norm: 1.0

  # (type: float, default: 4e-05)
  min_lr: 4.0e-05

# Evaluation-related arguments. See ``litgpt.args.EvalArgs`` for details.
eval:

  # Number of optimizer steps between evaluation calls (type: int, default: 1000)
  interval: 1000

  # Number of tokens to generate (type: Optional[int], default: null)
  max_new_tokens:

  # Number of iterations (type: int, default: 100)
  max_iters: 100

  # Whether to evaluate on the validation set at the beginning of training
  initial_validation: false

# Optimizer-related arguments
optimizer:

  class_path: torch.optim.AdamW

  init_args:

    # (type: float, default: 0.001)
    lr: 4e-4

    # (type: float, default: 0.01)
    weight_decay: 0.1

    # (type: tuple, default: (0.9,0.999))
    betas:
      - 0.9
      - 0.95

# How many devices/GPUs to use. Uses all GPUs by default. (type: Union[int, str], default: auto)
devices: auto

# How many nodes to use. (type: int, default: 1)
num_nodes: 1

# Optional path to the tokenizer dir that was used for preprocessing the dataset. Only some data
# modules require this. (type: Optional[Path], default: null)
tokenizer_dir: checkpoints/meta-llama/Llama-3.2-1B

# The name of the logger to send metrics to. (type: Literal['wandb', 'tensorboard', 'csv'], default: tensorboard)
logger_name: wandb

# The random seed to use for reproducibility. (type: int, default: 42)
seed: 42
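
For reference, below is a minimal Python sketch, illustrative only and not part of litgpt, of how the batch-size settings above combine. It assumes the values from this file and a hypothetical 4-GPU node (devices: auto resolves to whatever the machine provides): each optimizer step processes micro_batch_size x devices x accumulation_steps samples, with the accumulation count derived from global_batch_size.

# Illustrative sketch only (not part of litgpt): how the batch-size knobs in
# this config interact. Values are taken from microllama_v2.yaml; devices=4 is
# an assumed example node, not something specified by the config itself.
def accumulation_steps(global_batch_size: int, micro_batch_size: int, devices: int) -> int:
    """Samples per optimizer step = micro_batch_size * devices * accumulation steps."""
    per_forward = micro_batch_size * devices
    if global_batch_size % per_forward != 0:
        raise ValueError("global_batch_size must be divisible by micro_batch_size * devices")
    return global_batch_size // per_forward

if __name__ == "__main__":
    steps = accumulation_steps(global_batch_size=32, micro_batch_size=4, devices=4)
    print(f"gradient accumulation steps per rank: {steps}")  # 32 / (4 * 4) = 2

To launch a run from this file, the litgpt CLI accepts a config path, for example: litgpt pretrain --config microllama_v2.yaml. Verify the exact invocation against the installed litgpt version, since CLI flags can change between releases.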