import os
import yaml
from dataclasses import dataclass, field, asdict
from typing import List, Optional

@dataclass
class ModelConfig:
    type: str = "custom"
    name: str = "smollm2_transformer"
    tokenizer_name: str = "HuggingFaceTB/SmolLM2-135M"
    vocab_size: int = 49152
    hidden_size: int = 576
    num_attention_heads: int = 9
    num_key_value_heads: int = 3
    num_hidden_layers: int = 30
    intermediate_size: int = 1536
    hidden_act: str = "gelu"
    max_position_embeddings: int = 512
    initializer_range: float = 0.02
    rms_norm_eps: float = 1e-5
    use_cache: bool = True
    pad_token_id: Optional[int] = None
    max_length: int = 512

    def __post_init__(self):
        # Ensure numeric values are proper types
        self.vocab_size = int(self.vocab_size)
        self.hidden_size = int(self.hidden_size)
        self.num_attention_heads = int(self.num_attention_heads)
        self.num_key_value_heads = int(self.num_key_value_heads)
        self.num_hidden_layers = int(self.num_hidden_layers)
        self.intermediate_size = int(self.intermediate_size)
        self.max_position_embeddings = int(self.max_position_embeddings)
        self.initializer_range = float(self.initializer_range)
        self.rms_norm_eps = float(self.rms_norm_eps)
        self.max_length = int(self.max_length)

@dataclass
class OptimizerConfig:
    type: str = "adamW"
    weight_decay: float = 0.01
    adam_beta1: float = 0.9
    adam_beta2: float = 0.95
    adam_eps: float = 1e-8
    torch_adam_is_fused: bool = True
    clip_grad: float = 1.0
    accumulate_grad_in_fp32: bool = True

    def __post_init__(self):
        # Ensure numeric values are proper floats
        self.weight_decay = float(self.weight_decay)
        self.adam_beta1 = float(self.adam_beta1)
        self.adam_beta2 = float(self.adam_beta2)
        self.adam_eps = float(self.adam_eps)
        self.clip_grad = float(self.clip_grad)
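
# The optimizer fields above correspond to the arguments of torch.optim.AdamW.
# A minimal sketch of how a trainer might build the optimizer from them;
# `model` and `config` (a SmolLM2Config instance) are assumed to exist
# elsewhere and are not defined in this module:
#
#   optimizer = torch.optim.AdamW(
#       model.parameters(),
#       lr=config.scheduler.learning_rate,
#       betas=(config.optimizer.adam_beta1, config.optimizer.adam_beta2),
#       eps=config.optimizer.adam_eps,
#       weight_decay=config.optimizer.weight_decay,
#       fused=config.optimizer.torch_adam_is_fused,
#   )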

@dataclass
class SchedulerConfig:
    type: str = "one_cycle"
    learning_rate: float = 0.003
    warmup_steps: int = 100
    max_lr: float = 0.003
    pct_start: float = 0.02
    anneal_strategy: str = "cos"
    cycle_momentum: bool = False
    div_factor: float = 25.0
    final_div_factor: float = 1000.0
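
# The scheduler fields above line up with the arguments of PyTorch's
# torch.optim.lr_scheduler.OneCycleLR. A minimal sketch of how a training
# loop might consume them; `optimizer`, `total_steps`, and `config` (a
# SmolLM2Config instance) are assumed to exist elsewhere:
#
#   scheduler = torch.optim.lr_scheduler.OneCycleLR(
#       optimizer,
#       max_lr=config.scheduler.max_lr,
#       total_steps=total_steps,
#       pct_start=config.scheduler.pct_start,
#       anneal_strategy=config.scheduler.anneal_strategy,
#       cycle_momentum=config.scheduler.cycle_momentum,
#       div_factor=config.scheduler.div_factor,
#       final_div_factor=config.scheduler.final_div_factor,
#   )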

@dataclass
class TrainingConfig:
    output_dir: str = "./results"
    batch_size: int = 2
    micro_batch_size: int = 1
    gradient_accumulation_steps: int = 4
    sequence_length: int = 512
    learning_rate: float = 0.003
    max_steps: int = 5050
    first_phase_steps: int = 5000
    second_phase_steps: int = 50
    sample_frequency: int = 500
    second_phase_sample_frequency: int = 10
    logging_dir: str = "./logs"
    logging_steps: int = 1
    save_steps: int = 500
    checkpoint_dir: str = "checkpoints"
    sample_prompt: str = "Explain what machine learning is:"
    max_generate_length: int = 100

@dataclass
class HardwareConfig:
    precision: str = "16-mixed"
    accelerator: str = "gpu"
    devices: int = 1
    strategy: str = "auto"
    gradient_clip: float = 1.0
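
# HardwareConfig mirrors the kind of settings a PyTorch Lightning Trainer
# accepts (precision, accelerator, devices, strategy, gradient clipping).
# A sketch of that mapping; whether this project actually drives a Lightning
# Trainer from these fields is an assumption, not confirmed by this module:
#
#   trainer = lightning.Trainer(
#       precision=config.hardware.precision,
#       accelerator=config.hardware.accelerator,
#       devices=config.hardware.devices,
#       strategy=config.hardware.strategy,
#       gradient_clip_val=config.hardware.gradient_clip,
#   )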

@dataclass
class DatasetConfig:
    name: str
    path: str
    subset: str
    weight: float
    split_ratio: float = 1.0  # Default to using full dataset

@dataclass
class DataLoadingConfig:
    num_workers: int = 2
    batch_size: int = 32
    pin_memory: bool = True
    prefetch_factor: int = 2
    persistent_workers: bool = True

@dataclass
class DataConfig:
    datasets: List[DatasetConfig] = field(default_factory=list)
    loading: DataLoadingConfig = field(default_factory=DataLoadingConfig)

class SmolLM2Config:
    def __init__(self, config_path: Optional[str] = None):
        self.model = ModelConfig()
        self.optimizer = OptimizerConfig()
        self.scheduler = SchedulerConfig()
        self.training = TrainingConfig()
        self.hardware = HardwareConfig()
        self.data = DataConfig()

        # Default dataset configuration
        self.data.datasets = [
            DatasetConfig(
                name="wikitext",
                path="wikitext",
                subset="wikitext-2-raw-v1",
                weight=1.0
            )
        ]

        if config_path and os.path.exists(config_path):
            self.load_from_yaml(config_path)

    def load_from_yaml(self, config_path: str):
        with open(config_path, 'r') as f:
            config_dict = yaml.safe_load(f)

        # Update configurations from yaml
        if 'model' in config_dict:
            for k, v in config_dict['model'].items():
                setattr(self.model, k, v)
        if 'optimizer' in config_dict:
            for k, v in config_dict['optimizer'].items():
                setattr(self.optimizer, k, v)
        if 'scheduler' in config_dict:
            for k, v in config_dict['scheduler'].items():
                setattr(self.scheduler, k, v)
        if 'training' in config_dict:
            for k, v in config_dict['training'].items():
                setattr(self.training, k, v)
        if 'hardware' in config_dict:
            for k, v in config_dict['hardware'].items():
                setattr(self.hardware, k, v)
        if 'data' in config_dict:
            for k, v in config_dict['data'].items():
                if k == 'datasets':
                    # Datasets listed in the YAML replace the default dataset
                    # instead of being appended on top of it
                    self.data.datasets = [DatasetConfig(**dataset) for dataset in v]
                elif k == 'loading':
                    for lk, lv in v.items():
                        setattr(self.data.loading, lk, lv)
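
    # Illustrative YAML layout accepted by load_from_yaml; the keys mirror the
    # dataclass fields defined above, and the values shown here are examples
    # only, not a config file shipped with this repo:
    #
    #   model:
    #     hidden_size: 576
    #     num_hidden_layers: 30
    #   training:
    #     max_steps: 5050
    #     batch_size: 2
    #   data:
    #     datasets:
    #       - name: wikitext
    #         path: wikitext
    #         subset: wikitext-2-raw-v1
    #         weight: 1.0
    #     loading:
    #       num_workers: 2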

    def save_to_yaml(self, config_path: str):
        # asdict() converts nested dataclasses (e.g. the DatasetConfig entries
        # inside DataConfig) into plain dicts that yaml.dump can serialize
        config_dict = {
            'model': asdict(self.model),
            'optimizer': asdict(self.optimizer),
            'scheduler': asdict(self.scheduler),
            'training': asdict(self.training),
            'hardware': asdict(self.hardware),
            'data': asdict(self.data)
        }
        with open(config_path, 'w') as f:
            yaml.dump(config_dict, f, default_flow_style=False)

    def __repr__(self):
        return (
            f"SmolLM2Config(\n"
            f" model={self.model}\n"
            f" optimizer={self.optimizer}\n"
            f" scheduler={self.scheduler}\n"
            f" training={self.training}\n"
            f" hardware={self.hardware}\n"
            f" data={self.data}\n"
            f")"
        )