billxbf committed
Commit 5035033 · verified · Parent: 774c954

Upload 9 files

config.json ADDED
@@ -0,0 +1,26 @@
+ {
+ "_name_or_path": "/data/pretrain_model/deepseek-math-7b-base",
+ "architectures": [
+ "LlamaForCausalLM"
+ ],
+ "bos_token_id": 100000,
+ "eos_token_id": 100001,
+ "hidden_act": "silu",
+ "hidden_size": 4096,
+ "initializer_range": 0.02,
+ "intermediate_size": 11008,
+ "max_position_embeddings": 4096,
+ "model_type": "llama",
+ "num_attention_heads": 32,
+ "num_hidden_layers": 30,
+ "num_key_value_heads": 32,
+ "pretraining_tp": 1,
+ "rms_norm_eps": 1e-06,
+ "rope_scaling": null,
+ "rope_theta": 10000.0,
+ "tie_word_embeddings": false,
+ "torch_dtype": "bfloat16",
+ "transformers_version": "4.33.0",
+ "use_cache": true,
+ "vocab_size": 102400
+ }
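
Note: the config above describes a 30-layer Llama-architecture checkpoint (initialized from deepseek-math-7b-base) stored in bfloat16 with a 102400-token vocabulary. A minimal loading sketch with transformers; the local path is a placeholder for wherever this repository is downloaded, not an actual repo id:

import torch
from transformers import AutoModelForCausalLM, AutoTokenizer

model_path = "./deepseek-math-7b-finetuned"  # placeholder path to a local copy of this repo
tokenizer = AutoTokenizer.from_pretrained(model_path)
model = AutoModelForCausalLM.from_pretrained(
    model_path,
    torch_dtype=torch.bfloat16,  # matches "torch_dtype": "bfloat16" in config.json
)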
generation_config.json ADDED
@@ -0,0 +1,6 @@
+ {
+ "_from_model_config": true,
+ "bos_token_id": 100000,
+ "eos_token_id": 100001,
+ "transformers_version": "4.33.0"
+ }
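
Note: the generation defaults above only pin the BOS/EOS ids already declared in config.json. A minimal generation sketch reusing the model and tokenizer from the previous snippet (the prompt is illustrative):

from transformers import GenerationConfig

gen_config = GenerationConfig.from_pretrained(model_path)  # picks up bos/eos ids 100000/100001
inputs = tokenizer("Compute the integral of x^2 from 0 to 1.", return_tensors="pt")
outputs = model.generate(**inputs, generation_config=gen_config, max_new_tokens=256)
print(tokenizer.decode(outputs[0], skip_special_tokens=True))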
logs.txt ADDED
@@ -0,0 +1,242 @@
1
+ [2024-02-27 15:55:39,803] [INFO] [real_accelerator.py:158:get_accelerator] Setting ds_accelerator to cuda (auto detect)
2
+ [2024-02-27 15:55:50,622] [INFO] [real_accelerator.py:158:get_accelerator] Setting ds_accelerator to cuda (auto detect)
3
+ [2024-02-27 15:55:50,640] [INFO] [real_accelerator.py:158:get_accelerator] Setting ds_accelerator to cuda (auto detect)
4
+ [2024-02-27 15:55:50,658] [INFO] [real_accelerator.py:158:get_accelerator] Setting ds_accelerator to cuda (auto detect)
5
+ [2024-02-27 15:55:50,683] [INFO] [real_accelerator.py:158:get_accelerator] Setting ds_accelerator to cuda (auto detect)
6
+ [2024-02-27 15:55:50,774] [INFO] [real_accelerator.py:158:get_accelerator] Setting ds_accelerator to cuda (auto detect)
7
+ [2024-02-27 15:55:50,793] [INFO] [real_accelerator.py:158:get_accelerator] Setting ds_accelerator to cuda (auto detect)
8
+ [2024-02-27 15:55:50,859] [INFO] [real_accelerator.py:158:get_accelerator] Setting ds_accelerator to cuda (auto detect)
9
+ [2024-02-27 15:55:50,879] [INFO] [real_accelerator.py:158:get_accelerator] Setting ds_accelerator to cuda (auto detect)
10
+ [2024-02-27 15:55:54,396] [INFO] [comm.py:637:init_distributed] cdb=None
11
+ [2024-02-27 15:55:54,397] [INFO] [comm.py:668:init_distributed] Initializing TorchBackend in DeepSpeed with backend nccl
12
+ [2024-02-27 15:55:54,397] [INFO] [comm.py:637:init_distributed] cdb=None
13
+ [2024-02-27 15:55:54,399] [INFO] [comm.py:637:init_distributed] cdb=None
14
+ [2024-02-27 15:55:54,438] [INFO] [comm.py:637:init_distributed] cdb=None
15
+ [2024-02-27 15:55:54,466] [INFO] [comm.py:637:init_distributed] cdb=None
16
+ [2024-02-27 15:55:54,496] [INFO] [comm.py:637:init_distributed] cdb=None
17
+ [2024-02-27 15:55:54,529] [INFO] [comm.py:637:init_distributed] cdb=None
18
+ [2024-02-27 15:55:54,621] [INFO] [comm.py:637:init_distributed] cdb=None
19
+ **************************************************************************************************** DistributedType.DEEPSPEED
20
+ **************************************************************************************************** DistributedType.DEEPSPEED
21
+ **************************************************************************************************** DistributedType.DEEPSPEED
22
+ **************************************************************************************************** DistributedType.DEEPSPEED
23
+ **************************************************************************************************** DistributedType.DEEPSPEED
24
+ **************************************************************************************************** DistributedType.DEEPSPEED
25
+ **************************************************************************************************** DistributedType.DEEPSPEED
26
+ **************************************************************************************************** DistributedType.DEEPSPEED
27
+ [2024-02-27 15:56:09,579] [INFO] [partition_parameters.py:347:__exit__] finished initializing model - num_params = 273, num_elems = 6.91B
28
+ Is prompt masked: True
29
+ Is prompt masked: True
30
+ Is prompt masked: True
31
+ Is prompt masked: True
32
+ Is prompt masked: True
33
+ Is prompt masked: True
34
+ Is prompt masked: True
35
+ Is prompt masked: True
36
+ ************************************************** train_dataset
37
+ [2024-02-27 15:56:34,622] [INFO] [logging.py:96:log_dist] [Rank 0] DeepSpeed info: version=0.11.0, git-hash=unknown, git-branch=unknown
38
+ ************************************************** train_dataset
39
+ ************************************************** train_dataset
40
+ ************************************************** train_dataset
41
+ ************************************************** train_dataset
42
+ ************************************************** train_dataset
43
+ ************************************************** train_dataset
44
+ ************************************************** train_dataset
45
+ [2024-02-27 15:56:34,797] [INFO] [logging.py:96:log_dist] [Rank 0] DeepSpeed Flops Profiler Enabled: False
46
+ [2024-02-27 15:56:34,799] [INFO] [logging.py:96:log_dist] [Rank 0] Using client Optimizer as basic optimizer
47
+ [2024-02-27 15:56:34,799] [INFO] [logging.py:96:log_dist] [Rank 0] Removing param_group that has no 'params' in the basic Optimizer
48
+ [2024-02-27 15:56:34,811] [INFO] [logging.py:96:log_dist] [Rank 0] DeepSpeed Basic Optimizer = AdamW
49
+ [2024-02-27 15:56:34,811] [INFO] [utils.py:56:is_zero_supported_optimizer] Checking ZeRO support for optimizer=AdamW type=<class 'torch.optim.adamw.AdamW'>
50
+ [2024-02-27 15:56:34,811] [INFO] [logging.py:96:log_dist] [Rank 0] Creating fp16 ZeRO stage 3 optimizer, MiCS is enabled False, Hierarchical params gather False
51
+ [2024-02-27 15:56:34,811] [INFO] [logging.py:96:log_dist] [Rank 0] Creating torch.bfloat16 ZeRO stage 3 optimizer
52
+ [2024-02-27 15:56:34,975] [INFO] [utils.py:802:see_memory_usage] Stage 3 initialize beginning
53
+ [2024-02-27 15:56:34,976] [INFO] [utils.py:803:see_memory_usage] MA 1.67 GB Max_MA 3.23 GB CA 4.47 GB Max_CA 4 GB
54
+ [2024-02-27 15:56:34,977] [INFO] [utils.py:810:see_memory_usage] CPU Virtual Memory: used = 44.68 GB, percent = 7.1%
55
+ [2024-02-27 15:56:34,978] [INFO] [stage3.py:126:__init__] Reduce bucket size 16777216
56
+ [2024-02-27 15:56:34,978] [INFO] [stage3.py:127:__init__] Prefetch bucket size 15099494
57
+ [2024-02-27 15:56:35,119] [INFO] [utils.py:802:see_memory_usage] DeepSpeedZeRoOffload initialize [begin]
58
+ [2024-02-27 15:56:35,120] [INFO] [utils.py:803:see_memory_usage] MA 1.67 GB Max_MA 1.67 GB CA 4.47 GB Max_CA 4 GB
59
+ [2024-02-27 15:56:35,120] [INFO] [utils.py:810:see_memory_usage] CPU Virtual Memory: used = 44.68 GB, percent = 7.1%
60
+ Parameter Offload: Total persistent parameters: 249856 in 61 params
61
+ [2024-02-27 15:56:35,295] [INFO] [utils.py:802:see_memory_usage] DeepSpeedZeRoOffload initialize [end]
62
+ [2024-02-27 15:56:35,296] [INFO] [utils.py:803:see_memory_usage] MA 1.67 GB Max_MA 1.67 GB CA 4.47 GB Max_CA 4 GB
63
+ [2024-02-27 15:56:35,297] [INFO] [utils.py:810:see_memory_usage] CPU Virtual Memory: used = 44.68 GB, percent = 7.1%
64
+ [2024-02-27 15:56:35,427] [INFO] [utils.py:802:see_memory_usage] Before creating fp16 partitions
65
+ [2024-02-27 15:56:35,427] [INFO] [utils.py:803:see_memory_usage] MA 1.67 GB Max_MA 1.67 GB CA 4.47 GB Max_CA 4 GB
66
+ [2024-02-27 15:56:35,428] [INFO] [utils.py:810:see_memory_usage] CPU Virtual Memory: used = 44.68 GB, percent = 7.1%
67
+ [2024-02-27 15:56:37,252] [INFO] [utils.py:802:see_memory_usage] After creating fp16 partitions: 1
68
+ [2024-02-27 15:56:37,253] [INFO] [utils.py:803:see_memory_usage] MA 1.67 GB Max_MA 1.67 GB CA 1.67 GB Max_CA 4 GB
69
+ [2024-02-27 15:56:37,253] [INFO] [utils.py:810:see_memory_usage] CPU Virtual Memory: used = 55.98 GB, percent = 8.9%
70
+ [2024-02-27 15:56:37,404] [INFO] [utils.py:802:see_memory_usage] Before creating fp32 partitions
71
+ [2024-02-27 15:56:37,404] [INFO] [utils.py:803:see_memory_usage] MA 1.67 GB Max_MA 1.67 GB CA 1.67 GB Max_CA 2 GB
72
+ [2024-02-27 15:56:37,405] [INFO] [utils.py:810:see_memory_usage] CPU Virtual Memory: used = 55.98 GB, percent = 8.9%
73
+ [2024-02-27 15:56:38,537] [INFO] [utils.py:802:see_memory_usage] After creating fp32 partitions
74
+ [2024-02-27 15:56:38,538] [INFO] [utils.py:803:see_memory_usage] MA 4.89 GB Max_MA 6.5 GB CA 6.5 GB Max_CA 6 GB
75
+ [2024-02-27 15:56:38,538] [INFO] [utils.py:810:see_memory_usage] CPU Virtual Memory: used = 52.73 GB, percent = 8.4%
76
+ [2024-02-27 15:56:39,212] [INFO] [utils.py:802:see_memory_usage] Before initializing optimizer states
77
+ [2024-02-27 15:56:39,213] [INFO] [utils.py:803:see_memory_usage] MA 4.89 GB Max_MA 4.89 GB CA 6.5 GB Max_CA 6 GB
78
+ [2024-02-27 15:56:39,213] [INFO] [utils.py:810:see_memory_usage] CPU Virtual Memory: used = 44.68 GB, percent = 7.1%
79
+ [2024-02-27 15:56:39,404] [INFO] [utils.py:802:see_memory_usage] After initializing optimizer states
80
+ [2024-02-27 15:56:39,405] [INFO] [utils.py:803:see_memory_usage] MA 11.32 GB Max_MA 20.98 GB CA 22.59 GB Max_CA 23 GB
81
+ [2024-02-27 15:56:39,405] [INFO] [utils.py:810:see_memory_usage] CPU Virtual Memory: used = 44.68 GB, percent = 7.1%
82
+ [2024-02-27 15:56:39,406] [INFO] [stage3.py:459:_setup_for_real_optimizer] optimizer state initialized
83
+ [2024-02-27 15:56:43,265] [INFO] [utils.py:802:see_memory_usage] After initializing ZeRO optimizer
84
+ [2024-02-27 15:56:43,265] [INFO] [utils.py:803:see_memory_usage] MA 12.96 GB Max_MA 14.53 GB CA 24.98 GB Max_CA 25 GB
85
+ [2024-02-27 15:56:43,266] [INFO] [utils.py:810:see_memory_usage] CPU Virtual Memory: used = 44.7 GB, percent = 7.1%
86
+ [2024-02-27 15:56:43,266] [INFO] [logging.py:96:log_dist] [Rank 0] DeepSpeed Final Optimizer = AdamW
87
+ [2024-02-27 15:56:43,266] [INFO] [logging.py:96:log_dist] [Rank 0] DeepSpeed using client LR scheduler
88
+ [2024-02-27 15:56:43,266] [INFO] [logging.py:96:log_dist] [Rank 0] DeepSpeed LR Scheduler = None
89
+ [2024-02-27 15:56:43,266] [INFO] [logging.py:96:log_dist] [Rank 0] step=0, skipped=0, lr=[0.0], mom=[(0.9, 0.999)]
90
+ [2024-02-27 15:56:43,267] [INFO] [config.py:968:print] DeepSpeedEngine configuration:
91
+ [2024-02-27 15:56:43,268] [INFO] [config.py:972:print] activation_checkpointing_config {
92
+ "partition_activations": false,
93
+ "contiguous_memory_optimization": false,
94
+ "cpu_checkpointing": false,
95
+ "number_checkpoints": null,
96
+ "synchronize_checkpoint_boundary": false,
97
+ "profile": false
98
+ }
99
+ [2024-02-27 15:56:43,268] [INFO] [config.py:972:print] aio_config ................... {'block_size': 1048576, 'queue_depth': 8, 'thread_count': 1, 'single_submit': False, 'overlap_events': True}
100
+ [2024-02-27 15:56:43,268] [INFO] [config.py:972:print] amp_enabled .................. False
101
+ [2024-02-27 15:56:43,268] [INFO] [config.py:972:print] amp_params ................... False
102
+ [2024-02-27 15:56:43,268] [INFO] [config.py:972:print] autotuning_config ............ {
103
+ "enabled": false,
104
+ "start_step": null,
105
+ "end_step": null,
106
+ "metric_path": null,
107
+ "arg_mappings": null,
108
+ "metric": "throughput",
109
+ "model_info": null,
110
+ "results_dir": "autotuning_results",
111
+ "exps_dir": "autotuning_exps",
112
+ "overwrite": true,
113
+ "fast": true,
114
+ "start_profile_step": 3,
115
+ "end_profile_step": 5,
116
+ "tuner_type": "gridsearch",
117
+ "tuner_early_stopping": 5,
118
+ "tuner_num_trials": 50,
119
+ "model_info_path": null,
120
+ "mp_size": 1,
121
+ "max_train_batch_size": null,
122
+ "min_train_batch_size": 1,
123
+ "max_train_micro_batch_size_per_gpu": 1.024000e+03,
124
+ "min_train_micro_batch_size_per_gpu": 1,
125
+ "num_tuning_micro_batch_sizes": 3
126
+ }
127
+ [2024-02-27 15:56:43,268] [INFO] [config.py:972:print] bfloat16_enabled ............. True
128
+ [2024-02-27 15:56:43,269] [INFO] [config.py:972:print] checkpoint_parallel_write_pipeline False
129
+ [2024-02-27 15:56:43,269] [INFO] [config.py:972:print] checkpoint_tag_validation_enabled True
130
+ [2024-02-27 15:56:43,269] [INFO] [config.py:972:print] checkpoint_tag_validation_fail False
131
+ [2024-02-27 15:56:43,269] [INFO] [config.py:972:print] comms_config ................. <deepspeed.comm.config.DeepSpeedCommsConfig object at 0x7f15566e2920>
132
+ [2024-02-27 15:56:43,269] [INFO] [config.py:972:print] communication_data_type ...... None
133
+ [2024-02-27 15:56:43,269] [INFO] [config.py:972:print] compression_config ........... {'weight_quantization': {'shared_parameters': {'enabled': False, 'quantizer_kernel': False, 'schedule_offset': 0, 'quantize_groups': 1, 'quantize_verbose': False, 'quantization_type': 'symmetric', 'quantize_weight_in_forward': False, 'rounding': 'nearest', 'fp16_mixed_quantize': False, 'quantize_change_ratio': 0.001}, 'different_groups': {}}, 'activation_quantization': {'shared_parameters': {'enabled': False, 'quantization_type': 'symmetric', 'range_calibration': 'dynamic', 'schedule_offset': 1000}, 'different_groups': {}}, 'sparse_pruning': {'shared_parameters': {'enabled': False, 'method': 'l1', 'schedule_offset': 1000}, 'different_groups': {}}, 'row_pruning': {'shared_parameters': {'enabled': False, 'method': 'l1', 'schedule_offset': 1000}, 'different_groups': {}}, 'head_pruning': {'shared_parameters': {'enabled': False, 'method': 'topk', 'schedule_offset': 1000}, 'different_groups': {}}, 'channel_pruning': {'shared_parameters': {'enabled': False, 'method': 'l1', 'schedule_offset': 1000}, 'different_groups': {}}, 'layer_reduction': {'enabled': False}}
134
+ [2024-02-27 15:56:43,269] [INFO] [config.py:972:print] curriculum_enabled_legacy .... False
135
+ [2024-02-27 15:56:43,269] [INFO] [config.py:972:print] curriculum_params_legacy ..... False
136
+ [2024-02-27 15:56:43,269] [INFO] [config.py:972:print] data_efficiency_config ....... {'enabled': False, 'seed': 1234, 'data_sampling': {'enabled': False, 'num_epochs': 1000, 'num_workers': 0, 'curriculum_learning': {'enabled': False}}, 'data_routing': {'enabled': False, 'random_ltd': {'enabled': False, 'layer_token_lr_schedule': {'enabled': False}}}}
137
+ [2024-02-27 15:56:43,269] [INFO] [config.py:972:print] data_efficiency_enabled ...... False
138
+ [2024-02-27 15:56:43,269] [INFO] [config.py:972:print] dataloader_drop_last ......... False
139
+ [2024-02-27 15:56:43,269] [INFO] [config.py:972:print] disable_allgather ............ False
140
+ [2024-02-27 15:56:43,269] [INFO] [config.py:972:print] dump_state ................... False
141
+ [2024-02-27 15:56:43,269] [INFO] [config.py:972:print] dynamic_loss_scale_args ...... None
142
+ [2024-02-27 15:56:43,269] [INFO] [config.py:972:print] eigenvalue_enabled ........... False
143
+ [2024-02-27 15:56:43,269] [INFO] [config.py:972:print] eigenvalue_gas_boundary_resolution 1
144
+ [2024-02-27 15:56:43,269] [INFO] [config.py:972:print] eigenvalue_layer_name ........ bert.encoder.layer
145
+ [2024-02-27 15:56:43,269] [INFO] [config.py:972:print] eigenvalue_layer_num ......... 0
146
+ [2024-02-27 15:56:43,269] [INFO] [config.py:972:print] eigenvalue_max_iter .......... 100
147
+ [2024-02-27 15:56:43,269] [INFO] [config.py:972:print] eigenvalue_stability ......... 1e-06
148
+ [2024-02-27 15:56:43,269] [INFO] [config.py:972:print] eigenvalue_tol ............... 0.01
149
+ [2024-02-27 15:56:43,269] [INFO] [config.py:972:print] eigenvalue_verbose ........... False
150
+ [2024-02-27 15:56:43,269] [INFO] [config.py:972:print] elasticity_enabled ........... False
151
+ [2024-02-27 15:56:43,270] [INFO] [config.py:972:print] flops_profiler_config ........ {
152
+ "enabled": false,
153
+ "recompute_fwd_factor": 0.0,
154
+ "profile_step": 1,
155
+ "module_depth": -1,
156
+ "top_modules": 1,
157
+ "detailed": true,
158
+ "output_file": null
159
+ }
160
+ [2024-02-27 15:56:43,270] [INFO] [config.py:972:print] fp16_auto_cast ............... None
161
+ [2024-02-27 15:56:43,270] [INFO] [config.py:972:print] fp16_enabled ................. False
162
+ [2024-02-27 15:56:43,270] [INFO] [config.py:972:print] fp16_master_weights_and_gradients False
163
+ [2024-02-27 15:56:43,270] [INFO] [config.py:972:print] global_rank .................. 0
164
+ [2024-02-27 15:56:43,270] [INFO] [config.py:972:print] grad_accum_dtype ............. None
165
+ [2024-02-27 15:56:43,270] [INFO] [config.py:972:print] gradient_accumulation_steps .. 4
166
+ [2024-02-27 15:56:43,270] [INFO] [config.py:972:print] gradient_clipping ............ 1.0
167
+ [2024-02-27 15:56:43,270] [INFO] [config.py:972:print] gradient_predivide_factor .... 1.0
168
+ [2024-02-27 15:56:43,270] [INFO] [config.py:972:print] hybrid_engine ................ enabled=False max_out_tokens=512 inference_tp_size=1 release_inference_cache=False pin_parameters=True tp_gather_partition_size=8
169
+ [2024-02-27 15:56:43,270] [INFO] [config.py:972:print] initial_dynamic_scale ........ 1
170
+ [2024-02-27 15:56:43,270] [INFO] [config.py:972:print] load_universal_checkpoint .... False
171
+ [2024-02-27 15:56:43,270] [INFO] [config.py:972:print] loss_scale ................... 1.0
172
+ [2024-02-27 15:56:43,270] [INFO] [config.py:972:print] memory_breakdown ............. False
173
+ [2024-02-27 15:56:43,270] [INFO] [config.py:972:print] mics_hierarchial_params_gather False
174
+ [2024-02-27 15:56:43,270] [INFO] [config.py:972:print] mics_shard_size .............. -1
175
+ [2024-02-27 15:56:43,270] [INFO] [config.py:972:print] monitor_config ............... tensorboard=TensorBoardConfig(enabled=False, output_path='', job_name='DeepSpeedJobName') wandb=WandbConfig(enabled=False, group=None, team=None, project='deepspeed') csv_monitor=CSVConfig(enabled=False, output_path='', job_name='DeepSpeedJobName') enabled=False
176
+ [2024-02-27 15:56:43,270] [INFO] [config.py:972:print] nebula_config ................ {
177
+ "enabled": false,
178
+ "persistent_storage_path": null,
179
+ "persistent_time_interval": 100,
180
+ "num_of_version_in_retention": 2,
181
+ "enable_nebula_load": true,
182
+ "load_path": null
183
+ }
184
+ [2024-02-27 15:56:43,270] [INFO] [config.py:972:print] optimizer_legacy_fusion ...... False
185
+ [2024-02-27 15:56:43,271] [INFO] [config.py:972:print] optimizer_name ............... None
186
+ [2024-02-27 15:56:43,271] [INFO] [config.py:972:print] optimizer_params ............. None
187
+ [2024-02-27 15:56:43,271] [INFO] [config.py:972:print] pipeline ..................... {'stages': 'auto', 'partition': 'best', 'seed_layers': False, 'activation_checkpoint_interval': 0}
188
+ [2024-02-27 15:56:43,271] [INFO] [config.py:972:print] pld_enabled .................. False
189
+ [2024-02-27 15:56:43,271] [INFO] [config.py:972:print] pld_params ................... False
190
+ [2024-02-27 15:56:43,271] [INFO] [config.py:972:print] prescale_gradients ........... False
191
+ [2024-02-27 15:56:43,271] [INFO] [config.py:972:print] scheduler_name ............... None
192
+ [2024-02-27 15:56:43,271] [INFO] [config.py:972:print] scheduler_params ............. None
193
+ [2024-02-27 15:56:43,271] [INFO] [config.py:972:print] sparse_attention ............. None
194
+ [2024-02-27 15:56:43,271] [INFO] [config.py:972:print] sparse_gradients_enabled ..... False
195
+ [2024-02-27 15:56:43,271] [INFO] [config.py:972:print] steps_per_print .............. inf
196
+ [2024-02-27 15:56:43,271] [INFO] [config.py:972:print] train_batch_size ............. 128
197
+ [2024-02-27 15:56:43,271] [INFO] [config.py:972:print] train_micro_batch_size_per_gpu 4
198
+ [2024-02-27 15:56:43,271] [INFO] [config.py:972:print] use_node_local_storage ....... False
199
+ [2024-02-27 15:56:43,271] [INFO] [config.py:972:print] wall_clock_breakdown ......... False
200
+ [2024-02-27 15:56:43,271] [INFO] [config.py:972:print] weight_quantization_config ... None
201
+ [2024-02-27 15:56:43,271] [INFO] [config.py:972:print] world_size ................... 8
202
+ [2024-02-27 15:56:43,271] [INFO] [config.py:972:print] zero_allow_untested_optimizer True
203
+ [2024-02-27 15:56:43,271] [INFO] [config.py:972:print] zero_config .................. stage=3 contiguous_gradients=True reduce_scatter=True reduce_bucket_size=16777216 allgather_partitions=True allgather_bucket_size=500,000,000 overlap_comm=True load_from_fp32_weights=True elastic_checkpoint=False offload_param=None offload_optimizer=None sub_group_size=1000000000 cpu_offload_param=None cpu_offload_use_pin_memory=None cpu_offload=None prefetch_bucket_size=15099494 param_persistence_threshold=40960 model_persistence_threshold=sys.maxsize max_live_parameters=1000000000 max_reuse_distance=1000000000 gather_16bit_weights_on_model_save=True stage3_gather_fp16_weights_on_model_save=False ignore_unused_parameters=True legacy_stage1=False round_robin_gradients=False zero_hpz_partition_size=1 zero_quantized_weights=False zero_quantized_nontrainable_weights=False zero_quantized_gradients=False mics_shard_size=-1 mics_hierarchical_params_gather=False memory_efficient_linear=True pipeline_loading_checkpoint=False override_module_apply=True
204
+ [2024-02-27 15:56:43,271] [INFO] [config.py:972:print] zero_enabled ................. True
205
+ [2024-02-27 15:56:43,271] [INFO] [config.py:972:print] zero_force_ds_cpu_optimizer .. True
206
+ [2024-02-27 15:56:43,271] [INFO] [config.py:972:print] zero_optimization_stage ...... 3
207
+ [2024-02-27 15:56:43,272] [INFO] [config.py:958:print_user_config] json = {
208
+ "bf16": {
209
+ "enabled": true
210
+ },
211
+ "zero_optimization": {
212
+ "stage": 3,
213
+ "overlap_comm": true,
214
+ "contiguous_gradients": true,
215
+ "sub_group_size": 1.000000e+09,
216
+ "reduce_bucket_size": 1.677722e+07,
217
+ "stage3_prefetch_bucket_size": 1.509949e+07,
218
+ "stage3_param_persistence_threshold": 4.096000e+04,
219
+ "stage3_max_live_parameters": 1.000000e+09,
220
+ "stage3_max_reuse_distance": 1.000000e+09,
221
+ "stage3_gather_16bit_weights_on_model_save": true
222
+ },
223
+ "gradient_accumulation_steps": 4,
224
+ "gradient_clipping": 1.0,
225
+ "steps_per_print": inf,
226
+ "train_batch_size": 128,
227
+ "train_micro_batch_size_per_gpu": 4,
228
+ "wall_clock_breakdown": false,
229
+ "fp16": {
230
+ "enabled": false
231
+ },
232
+ "zero_allow_untested_optimizer": true
233
+ }
234
+ [2024-02-27 16:14:40,687] [WARNING] [stage3.py:1947:step] 2 pytorch allocator cache flushes since last step. this happens when there is high memory pressure and is detrimental to performance. if this is happening frequently consider adjusting settings to reduce memory consumption. If you are unable to make the cache flushes go away consider adding get_accelerator().empty_cache() calls in your training loop to ensure that all ranks flush their caches at the same time
235
+ [2024-02-27 16:52:42,926] [WARNING] [stage3.py:1947:step] 2 pytorch allocator cache flushes since last step. this happens when there is high memory pressure and is detrimental to performance. if this is happening frequently consider adjusting settings to reduce memory consumption. If you are unable to make the cache flushes go away consider adding get_accelerator().empty_cache() calls in your training loop to ensure that all ranks flush their caches at the same time
236
+ [2024-02-27 16:53:38,504] [WARNING] [stage3.py:1947:step] 1 pytorch allocator cache flushes since last step. this happens when there is high memory pressure and is detrimental to performance. if this is happening frequently consider adjusting settings to reduce memory consumption. If you are unable to make the cache flushes go away consider adding get_accelerator().empty_cache() calls in your training loop to ensure that all ranks flush their caches at the same time
237
+ [2024-02-27 23:41:27,648] [WARNING] [stage3.py:1947:step] 2 pytorch allocator cache flushes since last step. this happens when there is high memory pressure and is detrimental to performance. if this is happening frequently consider adjusting settings to reduce memory consumption. If you are unable to make the cache flushes go away consider adding get_accelerator().empty_cache() calls in your training loop to ensure that all ranks flush their caches at the same time
238
+ [2024-02-27 23:52:36,654] [WARNING] [stage3.py:1947:step] 1 pytorch allocator cache flushes since last step. this happens when there is high memory pressure and is detrimental to performance. if this is happening frequently consider adjusting settings to reduce memory consumption. If you are unable to make the cache flushes go away consider adding get_accelerator().empty_cache() calls in your training loop to ensure that all ranks flush their caches at the same time
239
+ [2024-02-27 23:53:33,055] [WARNING] [stage3.py:1947:step] 1 pytorch allocator cache flushes since last step. this happens when there is high memory pressure and is detrimental to performance. if this is happening frequently consider adjusting settings to reduce memory consumption. If you are unable to make the cache flushes go away consider adding get_accelerator().empty_cache() calls in your training loop to ensure that all ranks flush their caches at the same time
240
+ [2024-02-27 23:54:28,758] [WARNING] [stage3.py:1947:step] 1 pytorch allocator cache flushes since last step. this happens when there is high memory pressure and is detrimental to performance. if this is happening frequently consider adjusting settings to reduce memory consumption. If you are unable to make the cache flushes go away consider adding get_accelerator().empty_cache() calls in your training loop to ensure that all ranks flush their caches at the same time
241
+ [2024-02-28 00:20:20,365] [WARNING] [stage3.py:1947:step] 2 pytorch allocator cache flushes since last step. this happens when there is high memory pressure and is detrimental to performance. if this is happening frequently consider adjusting settings to reduce memory consumption. If you are unable to make the cache flushes go away consider adding get_accelerator().empty_cache() calls in your training loop to ensure that all ranks flush their caches at the same time
242
+ [2024-02-28 00:24:03,965] [WARNING] [stage3.py:1947:step] 2 pytorch allocator cache flushes since last step. this happens when there is high memory pressure and is detrimental to performance. if this is happening frequently consider adjusting settings to reduce memory consumption. If you are unable to make the cache flushes go away consider adding get_accelerator().empty_cache() calls in your training loop to ensure that all ranks flush their caches at the same time
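
Note: the log records DeepSpeed ZeRO stage-3 training in bf16 across 8 ranks with a micro batch of 4 and 4 gradient-accumulation steps; the reported train_batch_size of 128 follows directly from those three numbers (quick sanity check, values copied from the log above):

micro_batch_per_gpu = 4   # train_micro_batch_size_per_gpu
grad_accum_steps = 4      # gradient_accumulation_steps
world_size = 8            # number of ranks shown in the log
assert micro_batch_per_gpu * grad_accum_steps * world_size == 128  # train_batch_size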
pytorch_model-00001-of-00002.bin ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:e7d506e2a1993e23aeedbab73cda3ecce878cdeeb2c630188102b363dc45b97e
+ size 9968194327
pytorch_model-00002-of-00002.bin ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:4b9b869a286e5b3405eb97fd410a5c3f2574d7f180dbfd591cf1b18ee4e03a87
+ size 3852630633
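
Note: both .bin entries are Git LFS pointer files (spec v1): the repository tracks only the SHA-256 object id and byte size, while the roughly 10 GB and 3.9 GB shard payloads live in LFS storage. A minimal sketch for checking a downloaded shard against its recorded oid (standard-library hashlib only; filename as in this commit):

import hashlib

def sha256_of(path, chunk_size=1 << 20):
    # stream the file in 1 MiB chunks so large shards do not load into memory at once
    h = hashlib.sha256()
    with open(path, "rb") as f:
        for chunk in iter(lambda: f.read(chunk_size), b""):
            h.update(chunk)
    return h.hexdigest()

expected = "4b9b869a286e5b3405eb97fd410a5c3f2574d7f180dbfd591cf1b18ee4e03a87"
assert sha256_of("pytorch_model-00002-of-00002.bin") == expected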
pytorch_model.bin.index.json ADDED
@@ -0,0 +1,280 @@
1
+ {
2
+ "metadata": {
3
+ "total_size": 13820731392
4
+ },
5
+ "weight_map": {
6
+ "lm_head.weight": "pytorch_model-00002-of-00002.bin",
7
+ "model.embed_tokens.weight": "pytorch_model-00001-of-00002.bin",
8
+ "model.layers.0.input_layernorm.weight": "pytorch_model-00001-of-00002.bin",
9
+ "model.layers.0.mlp.down_proj.weight": "pytorch_model-00001-of-00002.bin",
10
+ "model.layers.0.mlp.gate_proj.weight": "pytorch_model-00001-of-00002.bin",
11
+ "model.layers.0.mlp.up_proj.weight": "pytorch_model-00001-of-00002.bin",
12
+ "model.layers.0.post_attention_layernorm.weight": "pytorch_model-00001-of-00002.bin",
13
+ "model.layers.0.self_attn.k_proj.weight": "pytorch_model-00001-of-00002.bin",
14
+ "model.layers.0.self_attn.o_proj.weight": "pytorch_model-00001-of-00002.bin",
15
+ "model.layers.0.self_attn.q_proj.weight": "pytorch_model-00001-of-00002.bin",
16
+ "model.layers.0.self_attn.v_proj.weight": "pytorch_model-00001-of-00002.bin",
17
+ "model.layers.1.input_layernorm.weight": "pytorch_model-00001-of-00002.bin",
18
+ "model.layers.1.mlp.down_proj.weight": "pytorch_model-00001-of-00002.bin",
19
+ "model.layers.1.mlp.gate_proj.weight": "pytorch_model-00001-of-00002.bin",
20
+ "model.layers.1.mlp.up_proj.weight": "pytorch_model-00001-of-00002.bin",
21
+ "model.layers.1.post_attention_layernorm.weight": "pytorch_model-00001-of-00002.bin",
22
+ "model.layers.1.self_attn.k_proj.weight": "pytorch_model-00001-of-00002.bin",
23
+ "model.layers.1.self_attn.o_proj.weight": "pytorch_model-00001-of-00002.bin",
24
+ "model.layers.1.self_attn.q_proj.weight": "pytorch_model-00001-of-00002.bin",
25
+ "model.layers.1.self_attn.v_proj.weight": "pytorch_model-00001-of-00002.bin",
26
+ "model.layers.10.input_layernorm.weight": "pytorch_model-00001-of-00002.bin",
27
+ "model.layers.10.mlp.down_proj.weight": "pytorch_model-00001-of-00002.bin",
28
+ "model.layers.10.mlp.gate_proj.weight": "pytorch_model-00001-of-00002.bin",
29
+ "model.layers.10.mlp.up_proj.weight": "pytorch_model-00001-of-00002.bin",
30
+ "model.layers.10.post_attention_layernorm.weight": "pytorch_model-00001-of-00002.bin",
31
+ "model.layers.10.self_attn.k_proj.weight": "pytorch_model-00001-of-00002.bin",
32
+ "model.layers.10.self_attn.o_proj.weight": "pytorch_model-00001-of-00002.bin",
33
+ "model.layers.10.self_attn.q_proj.weight": "pytorch_model-00001-of-00002.bin",
34
+ "model.layers.10.self_attn.v_proj.weight": "pytorch_model-00001-of-00002.bin",
35
+ "model.layers.11.input_layernorm.weight": "pytorch_model-00001-of-00002.bin",
36
+ "model.layers.11.mlp.down_proj.weight": "pytorch_model-00001-of-00002.bin",
37
+ "model.layers.11.mlp.gate_proj.weight": "pytorch_model-00001-of-00002.bin",
38
+ "model.layers.11.mlp.up_proj.weight": "pytorch_model-00001-of-00002.bin",
39
+ "model.layers.11.post_attention_layernorm.weight": "pytorch_model-00001-of-00002.bin",
40
+ "model.layers.11.self_attn.k_proj.weight": "pytorch_model-00001-of-00002.bin",
41
+ "model.layers.11.self_attn.o_proj.weight": "pytorch_model-00001-of-00002.bin",
42
+ "model.layers.11.self_attn.q_proj.weight": "pytorch_model-00001-of-00002.bin",
43
+ "model.layers.11.self_attn.v_proj.weight": "pytorch_model-00001-of-00002.bin",
44
+ "model.layers.12.input_layernorm.weight": "pytorch_model-00001-of-00002.bin",
45
+ "model.layers.12.mlp.down_proj.weight": "pytorch_model-00001-of-00002.bin",
46
+ "model.layers.12.mlp.gate_proj.weight": "pytorch_model-00001-of-00002.bin",
47
+ "model.layers.12.mlp.up_proj.weight": "pytorch_model-00001-of-00002.bin",
48
+ "model.layers.12.post_attention_layernorm.weight": "pytorch_model-00001-of-00002.bin",
49
+ "model.layers.12.self_attn.k_proj.weight": "pytorch_model-00001-of-00002.bin",
50
+ "model.layers.12.self_attn.o_proj.weight": "pytorch_model-00001-of-00002.bin",
51
+ "model.layers.12.self_attn.q_proj.weight": "pytorch_model-00001-of-00002.bin",
52
+ "model.layers.12.self_attn.v_proj.weight": "pytorch_model-00001-of-00002.bin",
53
+ "model.layers.13.input_layernorm.weight": "pytorch_model-00001-of-00002.bin",
54
+ "model.layers.13.mlp.down_proj.weight": "pytorch_model-00001-of-00002.bin",
55
+ "model.layers.13.mlp.gate_proj.weight": "pytorch_model-00001-of-00002.bin",
56
+ "model.layers.13.mlp.up_proj.weight": "pytorch_model-00001-of-00002.bin",
57
+ "model.layers.13.post_attention_layernorm.weight": "pytorch_model-00001-of-00002.bin",
58
+ "model.layers.13.self_attn.k_proj.weight": "pytorch_model-00001-of-00002.bin",
59
+ "model.layers.13.self_attn.o_proj.weight": "pytorch_model-00001-of-00002.bin",
60
+ "model.layers.13.self_attn.q_proj.weight": "pytorch_model-00001-of-00002.bin",
61
+ "model.layers.13.self_attn.v_proj.weight": "pytorch_model-00001-of-00002.bin",
62
+ "model.layers.14.input_layernorm.weight": "pytorch_model-00001-of-00002.bin",
63
+ "model.layers.14.mlp.down_proj.weight": "pytorch_model-00001-of-00002.bin",
64
+ "model.layers.14.mlp.gate_proj.weight": "pytorch_model-00001-of-00002.bin",
65
+ "model.layers.14.mlp.up_proj.weight": "pytorch_model-00001-of-00002.bin",
66
+ "model.layers.14.post_attention_layernorm.weight": "pytorch_model-00001-of-00002.bin",
67
+ "model.layers.14.self_attn.k_proj.weight": "pytorch_model-00001-of-00002.bin",
68
+ "model.layers.14.self_attn.o_proj.weight": "pytorch_model-00001-of-00002.bin",
69
+ "model.layers.14.self_attn.q_proj.weight": "pytorch_model-00001-of-00002.bin",
70
+ "model.layers.14.self_attn.v_proj.weight": "pytorch_model-00001-of-00002.bin",
71
+ "model.layers.15.input_layernorm.weight": "pytorch_model-00001-of-00002.bin",
72
+ "model.layers.15.mlp.down_proj.weight": "pytorch_model-00001-of-00002.bin",
73
+ "model.layers.15.mlp.gate_proj.weight": "pytorch_model-00001-of-00002.bin",
74
+ "model.layers.15.mlp.up_proj.weight": "pytorch_model-00001-of-00002.bin",
75
+ "model.layers.15.post_attention_layernorm.weight": "pytorch_model-00001-of-00002.bin",
76
+ "model.layers.15.self_attn.k_proj.weight": "pytorch_model-00001-of-00002.bin",
77
+ "model.layers.15.self_attn.o_proj.weight": "pytorch_model-00001-of-00002.bin",
78
+ "model.layers.15.self_attn.q_proj.weight": "pytorch_model-00001-of-00002.bin",
79
+ "model.layers.15.self_attn.v_proj.weight": "pytorch_model-00001-of-00002.bin",
80
+ "model.layers.16.input_layernorm.weight": "pytorch_model-00001-of-00002.bin",
81
+ "model.layers.16.mlp.down_proj.weight": "pytorch_model-00001-of-00002.bin",
82
+ "model.layers.16.mlp.gate_proj.weight": "pytorch_model-00001-of-00002.bin",
83
+ "model.layers.16.mlp.up_proj.weight": "pytorch_model-00001-of-00002.bin",
84
+ "model.layers.16.post_attention_layernorm.weight": "pytorch_model-00001-of-00002.bin",
85
+ "model.layers.16.self_attn.k_proj.weight": "pytorch_model-00001-of-00002.bin",
86
+ "model.layers.16.self_attn.o_proj.weight": "pytorch_model-00001-of-00002.bin",
87
+ "model.layers.16.self_attn.q_proj.weight": "pytorch_model-00001-of-00002.bin",
88
+ "model.layers.16.self_attn.v_proj.weight": "pytorch_model-00001-of-00002.bin",
89
+ "model.layers.17.input_layernorm.weight": "pytorch_model-00001-of-00002.bin",
90
+ "model.layers.17.mlp.down_proj.weight": "pytorch_model-00001-of-00002.bin",
91
+ "model.layers.17.mlp.gate_proj.weight": "pytorch_model-00001-of-00002.bin",
92
+ "model.layers.17.mlp.up_proj.weight": "pytorch_model-00001-of-00002.bin",
93
+ "model.layers.17.post_attention_layernorm.weight": "pytorch_model-00001-of-00002.bin",
94
+ "model.layers.17.self_attn.k_proj.weight": "pytorch_model-00001-of-00002.bin",
95
+ "model.layers.17.self_attn.o_proj.weight": "pytorch_model-00001-of-00002.bin",
96
+ "model.layers.17.self_attn.q_proj.weight": "pytorch_model-00001-of-00002.bin",
97
+ "model.layers.17.self_attn.v_proj.weight": "pytorch_model-00001-of-00002.bin",
98
+ "model.layers.18.input_layernorm.weight": "pytorch_model-00001-of-00002.bin",
99
+ "model.layers.18.mlp.down_proj.weight": "pytorch_model-00001-of-00002.bin",
100
+ "model.layers.18.mlp.gate_proj.weight": "pytorch_model-00001-of-00002.bin",
101
+ "model.layers.18.mlp.up_proj.weight": "pytorch_model-00001-of-00002.bin",
102
+ "model.layers.18.post_attention_layernorm.weight": "pytorch_model-00001-of-00002.bin",
103
+ "model.layers.18.self_attn.k_proj.weight": "pytorch_model-00001-of-00002.bin",
104
+ "model.layers.18.self_attn.o_proj.weight": "pytorch_model-00001-of-00002.bin",
105
+ "model.layers.18.self_attn.q_proj.weight": "pytorch_model-00001-of-00002.bin",
106
+ "model.layers.18.self_attn.v_proj.weight": "pytorch_model-00001-of-00002.bin",
107
+ "model.layers.19.input_layernorm.weight": "pytorch_model-00001-of-00002.bin",
108
+ "model.layers.19.mlp.down_proj.weight": "pytorch_model-00001-of-00002.bin",
109
+ "model.layers.19.mlp.gate_proj.weight": "pytorch_model-00001-of-00002.bin",
110
+ "model.layers.19.mlp.up_proj.weight": "pytorch_model-00001-of-00002.bin",
111
+ "model.layers.19.post_attention_layernorm.weight": "pytorch_model-00001-of-00002.bin",
112
+ "model.layers.19.self_attn.k_proj.weight": "pytorch_model-00001-of-00002.bin",
113
+ "model.layers.19.self_attn.o_proj.weight": "pytorch_model-00001-of-00002.bin",
114
+ "model.layers.19.self_attn.q_proj.weight": "pytorch_model-00001-of-00002.bin",
115
+ "model.layers.19.self_attn.v_proj.weight": "pytorch_model-00001-of-00002.bin",
116
+ "model.layers.2.input_layernorm.weight": "pytorch_model-00001-of-00002.bin",
117
+ "model.layers.2.mlp.down_proj.weight": "pytorch_model-00001-of-00002.bin",
118
+ "model.layers.2.mlp.gate_proj.weight": "pytorch_model-00001-of-00002.bin",
119
+ "model.layers.2.mlp.up_proj.weight": "pytorch_model-00001-of-00002.bin",
120
+ "model.layers.2.post_attention_layernorm.weight": "pytorch_model-00001-of-00002.bin",
121
+ "model.layers.2.self_attn.k_proj.weight": "pytorch_model-00001-of-00002.bin",
122
+ "model.layers.2.self_attn.o_proj.weight": "pytorch_model-00001-of-00002.bin",
123
+ "model.layers.2.self_attn.q_proj.weight": "pytorch_model-00001-of-00002.bin",
124
+ "model.layers.2.self_attn.v_proj.weight": "pytorch_model-00001-of-00002.bin",
125
+ "model.layers.20.input_layernorm.weight": "pytorch_model-00001-of-00002.bin",
126
+ "model.layers.20.mlp.down_proj.weight": "pytorch_model-00001-of-00002.bin",
127
+ "model.layers.20.mlp.gate_proj.weight": "pytorch_model-00001-of-00002.bin",
128
+ "model.layers.20.mlp.up_proj.weight": "pytorch_model-00001-of-00002.bin",
129
+ "model.layers.20.post_attention_layernorm.weight": "pytorch_model-00001-of-00002.bin",
130
+ "model.layers.20.self_attn.k_proj.weight": "pytorch_model-00001-of-00002.bin",
131
+ "model.layers.20.self_attn.o_proj.weight": "pytorch_model-00001-of-00002.bin",
132
+ "model.layers.20.self_attn.q_proj.weight": "pytorch_model-00001-of-00002.bin",
133
+ "model.layers.20.self_attn.v_proj.weight": "pytorch_model-00001-of-00002.bin",
134
+ "model.layers.21.input_layernorm.weight": "pytorch_model-00001-of-00002.bin",
135
+ "model.layers.21.mlp.down_proj.weight": "pytorch_model-00001-of-00002.bin",
136
+ "model.layers.21.mlp.gate_proj.weight": "pytorch_model-00001-of-00002.bin",
137
+ "model.layers.21.mlp.up_proj.weight": "pytorch_model-00001-of-00002.bin",
138
+ "model.layers.21.post_attention_layernorm.weight": "pytorch_model-00001-of-00002.bin",
139
+ "model.layers.21.self_attn.k_proj.weight": "pytorch_model-00001-of-00002.bin",
140
+ "model.layers.21.self_attn.o_proj.weight": "pytorch_model-00001-of-00002.bin",
141
+ "model.layers.21.self_attn.q_proj.weight": "pytorch_model-00001-of-00002.bin",
142
+ "model.layers.21.self_attn.v_proj.weight": "pytorch_model-00001-of-00002.bin",
143
+ "model.layers.22.input_layernorm.weight": "pytorch_model-00002-of-00002.bin",
144
+ "model.layers.22.mlp.down_proj.weight": "pytorch_model-00002-of-00002.bin",
145
+ "model.layers.22.mlp.gate_proj.weight": "pytorch_model-00001-of-00002.bin",
146
+ "model.layers.22.mlp.up_proj.weight": "pytorch_model-00002-of-00002.bin",
147
+ "model.layers.22.post_attention_layernorm.weight": "pytorch_model-00002-of-00002.bin",
148
+ "model.layers.22.self_attn.k_proj.weight": "pytorch_model-00001-of-00002.bin",
149
+ "model.layers.22.self_attn.o_proj.weight": "pytorch_model-00001-of-00002.bin",
150
+ "model.layers.22.self_attn.q_proj.weight": "pytorch_model-00001-of-00002.bin",
151
+ "model.layers.22.self_attn.v_proj.weight": "pytorch_model-00001-of-00002.bin",
152
+ "model.layers.23.input_layernorm.weight": "pytorch_model-00002-of-00002.bin",
153
+ "model.layers.23.mlp.down_proj.weight": "pytorch_model-00002-of-00002.bin",
154
+ "model.layers.23.mlp.gate_proj.weight": "pytorch_model-00002-of-00002.bin",
155
+ "model.layers.23.mlp.up_proj.weight": "pytorch_model-00002-of-00002.bin",
156
+ "model.layers.23.post_attention_layernorm.weight": "pytorch_model-00002-of-00002.bin",
157
+ "model.layers.23.self_attn.k_proj.weight": "pytorch_model-00002-of-00002.bin",
158
+ "model.layers.23.self_attn.o_proj.weight": "pytorch_model-00002-of-00002.bin",
159
+ "model.layers.23.self_attn.q_proj.weight": "pytorch_model-00002-of-00002.bin",
160
+ "model.layers.23.self_attn.v_proj.weight": "pytorch_model-00002-of-00002.bin",
161
+ "model.layers.24.input_layernorm.weight": "pytorch_model-00002-of-00002.bin",
162
+ "model.layers.24.mlp.down_proj.weight": "pytorch_model-00002-of-00002.bin",
163
+ "model.layers.24.mlp.gate_proj.weight": "pytorch_model-00002-of-00002.bin",
164
+ "model.layers.24.mlp.up_proj.weight": "pytorch_model-00002-of-00002.bin",
165
+ "model.layers.24.post_attention_layernorm.weight": "pytorch_model-00002-of-00002.bin",
166
+ "model.layers.24.self_attn.k_proj.weight": "pytorch_model-00002-of-00002.bin",
167
+ "model.layers.24.self_attn.o_proj.weight": "pytorch_model-00002-of-00002.bin",
168
+ "model.layers.24.self_attn.q_proj.weight": "pytorch_model-00002-of-00002.bin",
169
+ "model.layers.24.self_attn.v_proj.weight": "pytorch_model-00002-of-00002.bin",
170
+ "model.layers.25.input_layernorm.weight": "pytorch_model-00002-of-00002.bin",
171
+ "model.layers.25.mlp.down_proj.weight": "pytorch_model-00002-of-00002.bin",
172
+ "model.layers.25.mlp.gate_proj.weight": "pytorch_model-00002-of-00002.bin",
173
+ "model.layers.25.mlp.up_proj.weight": "pytorch_model-00002-of-00002.bin",
174
+ "model.layers.25.post_attention_layernorm.weight": "pytorch_model-00002-of-00002.bin",
175
+ "model.layers.25.self_attn.k_proj.weight": "pytorch_model-00002-of-00002.bin",
176
+ "model.layers.25.self_attn.o_proj.weight": "pytorch_model-00002-of-00002.bin",
177
+ "model.layers.25.self_attn.q_proj.weight": "pytorch_model-00002-of-00002.bin",
178
+ "model.layers.25.self_attn.v_proj.weight": "pytorch_model-00002-of-00002.bin",
179
+ "model.layers.26.input_layernorm.weight": "pytorch_model-00002-of-00002.bin",
180
+ "model.layers.26.mlp.down_proj.weight": "pytorch_model-00002-of-00002.bin",
181
+ "model.layers.26.mlp.gate_proj.weight": "pytorch_model-00002-of-00002.bin",
182
+ "model.layers.26.mlp.up_proj.weight": "pytorch_model-00002-of-00002.bin",
183
+ "model.layers.26.post_attention_layernorm.weight": "pytorch_model-00002-of-00002.bin",
184
+ "model.layers.26.self_attn.k_proj.weight": "pytorch_model-00002-of-00002.bin",
185
+ "model.layers.26.self_attn.o_proj.weight": "pytorch_model-00002-of-00002.bin",
186
+ "model.layers.26.self_attn.q_proj.weight": "pytorch_model-00002-of-00002.bin",
187
+ "model.layers.26.self_attn.v_proj.weight": "pytorch_model-00002-of-00002.bin",
188
+ "model.layers.27.input_layernorm.weight": "pytorch_model-00002-of-00002.bin",
189
+ "model.layers.27.mlp.down_proj.weight": "pytorch_model-00002-of-00002.bin",
190
+ "model.layers.27.mlp.gate_proj.weight": "pytorch_model-00002-of-00002.bin",
191
+ "model.layers.27.mlp.up_proj.weight": "pytorch_model-00002-of-00002.bin",
192
+ "model.layers.27.post_attention_layernorm.weight": "pytorch_model-00002-of-00002.bin",
193
+ "model.layers.27.self_attn.k_proj.weight": "pytorch_model-00002-of-00002.bin",
194
+ "model.layers.27.self_attn.o_proj.weight": "pytorch_model-00002-of-00002.bin",
195
+ "model.layers.27.self_attn.q_proj.weight": "pytorch_model-00002-of-00002.bin",
196
+ "model.layers.27.self_attn.v_proj.weight": "pytorch_model-00002-of-00002.bin",
197
+ "model.layers.28.input_layernorm.weight": "pytorch_model-00002-of-00002.bin",
198
+ "model.layers.28.mlp.down_proj.weight": "pytorch_model-00002-of-00002.bin",
199
+ "model.layers.28.mlp.gate_proj.weight": "pytorch_model-00002-of-00002.bin",
200
+ "model.layers.28.mlp.up_proj.weight": "pytorch_model-00002-of-00002.bin",
201
+ "model.layers.28.post_attention_layernorm.weight": "pytorch_model-00002-of-00002.bin",
202
+ "model.layers.28.self_attn.k_proj.weight": "pytorch_model-00002-of-00002.bin",
203
+ "model.layers.28.self_attn.o_proj.weight": "pytorch_model-00002-of-00002.bin",
204
+ "model.layers.28.self_attn.q_proj.weight": "pytorch_model-00002-of-00002.bin",
205
+ "model.layers.28.self_attn.v_proj.weight": "pytorch_model-00002-of-00002.bin",
206
+ "model.layers.29.input_layernorm.weight": "pytorch_model-00002-of-00002.bin",
207
+ "model.layers.29.mlp.down_proj.weight": "pytorch_model-00002-of-00002.bin",
208
+ "model.layers.29.mlp.gate_proj.weight": "pytorch_model-00002-of-00002.bin",
209
+ "model.layers.29.mlp.up_proj.weight": "pytorch_model-00002-of-00002.bin",
210
+ "model.layers.29.post_attention_layernorm.weight": "pytorch_model-00002-of-00002.bin",
211
+ "model.layers.29.self_attn.k_proj.weight": "pytorch_model-00002-of-00002.bin",
212
+ "model.layers.29.self_attn.o_proj.weight": "pytorch_model-00002-of-00002.bin",
213
+ "model.layers.29.self_attn.q_proj.weight": "pytorch_model-00002-of-00002.bin",
214
+ "model.layers.29.self_attn.v_proj.weight": "pytorch_model-00002-of-00002.bin",
215
+ "model.layers.3.input_layernorm.weight": "pytorch_model-00001-of-00002.bin",
216
+ "model.layers.3.mlp.down_proj.weight": "pytorch_model-00001-of-00002.bin",
217
+ "model.layers.3.mlp.gate_proj.weight": "pytorch_model-00001-of-00002.bin",
218
+ "model.layers.3.mlp.up_proj.weight": "pytorch_model-00001-of-00002.bin",
219
+ "model.layers.3.post_attention_layernorm.weight": "pytorch_model-00001-of-00002.bin",
220
+ "model.layers.3.self_attn.k_proj.weight": "pytorch_model-00001-of-00002.bin",
221
+ "model.layers.3.self_attn.o_proj.weight": "pytorch_model-00001-of-00002.bin",
222
+ "model.layers.3.self_attn.q_proj.weight": "pytorch_model-00001-of-00002.bin",
223
+ "model.layers.3.self_attn.v_proj.weight": "pytorch_model-00001-of-00002.bin",
224
+ "model.layers.4.input_layernorm.weight": "pytorch_model-00001-of-00002.bin",
225
+ "model.layers.4.mlp.down_proj.weight": "pytorch_model-00001-of-00002.bin",
226
+ "model.layers.4.mlp.gate_proj.weight": "pytorch_model-00001-of-00002.bin",
227
+ "model.layers.4.mlp.up_proj.weight": "pytorch_model-00001-of-00002.bin",
228
+ "model.layers.4.post_attention_layernorm.weight": "pytorch_model-00001-of-00002.bin",
229
+ "model.layers.4.self_attn.k_proj.weight": "pytorch_model-00001-of-00002.bin",
230
+ "model.layers.4.self_attn.o_proj.weight": "pytorch_model-00001-of-00002.bin",
231
+ "model.layers.4.self_attn.q_proj.weight": "pytorch_model-00001-of-00002.bin",
232
+ "model.layers.4.self_attn.v_proj.weight": "pytorch_model-00001-of-00002.bin",
233
+ "model.layers.5.input_layernorm.weight": "pytorch_model-00001-of-00002.bin",
234
+ "model.layers.5.mlp.down_proj.weight": "pytorch_model-00001-of-00002.bin",
235
+ "model.layers.5.mlp.gate_proj.weight": "pytorch_model-00001-of-00002.bin",
236
+ "model.layers.5.mlp.up_proj.weight": "pytorch_model-00001-of-00002.bin",
237
+ "model.layers.5.post_attention_layernorm.weight": "pytorch_model-00001-of-00002.bin",
238
+ "model.layers.5.self_attn.k_proj.weight": "pytorch_model-00001-of-00002.bin",
239
+ "model.layers.5.self_attn.o_proj.weight": "pytorch_model-00001-of-00002.bin",
240
+ "model.layers.5.self_attn.q_proj.weight": "pytorch_model-00001-of-00002.bin",
241
+ "model.layers.5.self_attn.v_proj.weight": "pytorch_model-00001-of-00002.bin",
242
+ "model.layers.6.input_layernorm.weight": "pytorch_model-00001-of-00002.bin",
243
+ "model.layers.6.mlp.down_proj.weight": "pytorch_model-00001-of-00002.bin",
244
+ "model.layers.6.mlp.gate_proj.weight": "pytorch_model-00001-of-00002.bin",
245
+ "model.layers.6.mlp.up_proj.weight": "pytorch_model-00001-of-00002.bin",
246
+ "model.layers.6.post_attention_layernorm.weight": "pytorch_model-00001-of-00002.bin",
247
+ "model.layers.6.self_attn.k_proj.weight": "pytorch_model-00001-of-00002.bin",
248
+ "model.layers.6.self_attn.o_proj.weight": "pytorch_model-00001-of-00002.bin",
249
+ "model.layers.6.self_attn.q_proj.weight": "pytorch_model-00001-of-00002.bin",
250
+ "model.layers.6.self_attn.v_proj.weight": "pytorch_model-00001-of-00002.bin",
251
+ "model.layers.7.input_layernorm.weight": "pytorch_model-00001-of-00002.bin",
252
+ "model.layers.7.mlp.down_proj.weight": "pytorch_model-00001-of-00002.bin",
253
+ "model.layers.7.mlp.gate_proj.weight": "pytorch_model-00001-of-00002.bin",
254
+ "model.layers.7.mlp.up_proj.weight": "pytorch_model-00001-of-00002.bin",
255
+ "model.layers.7.post_attention_layernorm.weight": "pytorch_model-00001-of-00002.bin",
256
+ "model.layers.7.self_attn.k_proj.weight": "pytorch_model-00001-of-00002.bin",
257
+ "model.layers.7.self_attn.o_proj.weight": "pytorch_model-00001-of-00002.bin",
258
+ "model.layers.7.self_attn.q_proj.weight": "pytorch_model-00001-of-00002.bin",
259
+ "model.layers.7.self_attn.v_proj.weight": "pytorch_model-00001-of-00002.bin",
260
+ "model.layers.8.input_layernorm.weight": "pytorch_model-00001-of-00002.bin",
261
+ "model.layers.8.mlp.down_proj.weight": "pytorch_model-00001-of-00002.bin",
262
+ "model.layers.8.mlp.gate_proj.weight": "pytorch_model-00001-of-00002.bin",
263
+ "model.layers.8.mlp.up_proj.weight": "pytorch_model-00001-of-00002.bin",
264
+ "model.layers.8.post_attention_layernorm.weight": "pytorch_model-00001-of-00002.bin",
265
+ "model.layers.8.self_attn.k_proj.weight": "pytorch_model-00001-of-00002.bin",
266
+ "model.layers.8.self_attn.o_proj.weight": "pytorch_model-00001-of-00002.bin",
267
+ "model.layers.8.self_attn.q_proj.weight": "pytorch_model-00001-of-00002.bin",
268
+ "model.layers.8.self_attn.v_proj.weight": "pytorch_model-00001-of-00002.bin",
269
+ "model.layers.9.input_layernorm.weight": "pytorch_model-00001-of-00002.bin",
270
+ "model.layers.9.mlp.down_proj.weight": "pytorch_model-00001-of-00002.bin",
271
+ "model.layers.9.mlp.gate_proj.weight": "pytorch_model-00001-of-00002.bin",
272
+ "model.layers.9.mlp.up_proj.weight": "pytorch_model-00001-of-00002.bin",
273
+ "model.layers.9.post_attention_layernorm.weight": "pytorch_model-00001-of-00002.bin",
274
+ "model.layers.9.self_attn.k_proj.weight": "pytorch_model-00001-of-00002.bin",
275
+ "model.layers.9.self_attn.o_proj.weight": "pytorch_model-00001-of-00002.bin",
276
+ "model.layers.9.self_attn.q_proj.weight": "pytorch_model-00001-of-00002.bin",
277
+ "model.layers.9.self_attn.v_proj.weight": "pytorch_model-00001-of-00002.bin",
278
+ "model.norm.weight": "pytorch_model-00002-of-00002.bin"
279
+ }
280
+ }
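
Note: the index maps every parameter name to one of the two shards (metadata.total_size is about 13.8 GB). from_pretrained consumes this file automatically; a minimal manual sketch of how the shards could be reassembled into a single state dict, run from a local copy of the repo:

import json
import torch

with open("pytorch_model.bin.index.json") as f:
    index = json.load(f)

state_dict = {}
for shard in sorted(set(index["weight_map"].values())):
    # each shard is an ordinary torch checkpoint; load on CPU to avoid GPU memory spikes
    state_dict.update(torch.load(shard, map_location="cpu"))

missing = set(index["weight_map"]) - set(state_dict)
assert not missing, f"tensors missing from shards: {missing}"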
special_tokens_map.json ADDED
@@ -0,0 +1,17 @@
+ {
+ "bos_token": {
+ "content": "<|begin▁of▁sentence|>",
+ "lstrip": false,
+ "normalized": true,
+ "rstrip": false,
+ "single_word": false
+ },
+ "eos_token": {
+ "content": "<|end▁of▁sentence|>",
+ "lstrip": false,
+ "normalized": true,
+ "rstrip": false,
+ "single_word": false
+ },
+ "pad_token": "<|end▁of▁sentence|>"
+ }
tokenizer.json ADDED
The diff for this file is too large to render. See raw diff
 
tokenizer_config.json ADDED
@@ -0,0 +1,25 @@
+ {
+ "bos_token": {
+ "__type": "AddedToken",
+ "content": "<|begin▁of▁sentence|>",
+ "lstrip": false,
+ "normalized": true,
+ "rstrip": false,
+ "single_word": false
+ },
+ "clean_up_tokenization_spaces": false,
+ "eos_token": {
+ "__type": "AddedToken",
+ "content": "<|end▁of▁sentence|>",
+ "lstrip": false,
+ "normalized": true,
+ "rstrip": false,
+ "single_word": false
+ },
+ "legacy": true,
+ "model_max_length": 4096,
+ "sp_model_kwargs": {},
+ "tokenizer_class": "LlamaTokenizer",
+ "unk_token": null,
+ "use_default_system_prompt": true
+ }
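
Note: the tokenizer files declare a LlamaTokenizer with the DeepSeek begin/end-of-sentence markers as BOS/EOS and the EOS token reused for padding; a quick check against the ids in config.json (same placeholder path as above):

from transformers import AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained("./deepseek-math-7b-finetuned")  # placeholder path
print(tokenizer.bos_token, tokenizer.bos_token_id)  # <|begin▁of▁sentence|>, 100000
print(tokenizer.eos_token, tokenizer.eos_token_id)  # <|end▁of▁sentence|>, 100001
print(tokenizer.pad_token)                          # same as the EOS token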