"""Configuration dataclasses and YAML load/save helpers for SmolLM2 training."""

import os
import yaml
from dataclasses import asdict, dataclass, field
from typing import List, Optional

@dataclass
class ModelConfig:
    type: str = "custom"
    name: str = "smollm2_transformer"
    tokenizer_name: str = "HuggingFaceTB/SmolLM2-135M"
    vocab_size: int = 49152
    hidden_size: int = 576
    num_attention_heads: int = 9
    num_key_value_heads: int = 3
    num_hidden_layers: int = 30
    intermediate_size: int = 1536
    hidden_act: str = "gelu"
    max_position_embeddings: int = 512
    initializer_range: float = 0.02
    rms_norm_eps: float = 1e-5
    use_cache: bool = True
    pad_token_id: Optional[int] = None
    max_length: int = 512

    def __post_init__(self):
        # Ensure numeric values are proper types
        self.vocab_size = int(self.vocab_size)
        self.hidden_size = int(self.hidden_size)
        self.num_attention_heads = int(self.num_attention_heads)
        self.num_key_value_heads = int(self.num_key_value_heads)
        self.num_hidden_layers = int(self.num_hidden_layers)
        self.intermediate_size = int(self.intermediate_size)
        self.max_position_embeddings = int(self.max_position_embeddings)
        self.initializer_range = float(self.initializer_range)
        self.rms_norm_eps = float(self.rms_norm_eps)
        self.max_length = int(self.max_length)
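
    # With the defaults above, each attention head is 576 / 9 = 64-dimensional,
    # and the 9 query heads share the 3 key/value heads (grouped-query attention
    # with a group size of 3). Values overridden via YAML should keep hidden_size
    # divisible by num_attention_heads and num_attention_heads divisible by
    # num_key_value_heads.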

@dataclass
class OptimizerConfig:
    type: str = "adamW"
    weight_decay: float = 0.01
    adam_beta1: float = 0.9
    adam_beta2: float = 0.95
    adam_eps: float = 1e-8
    torch_adam_is_fused: bool = True
    clip_grad: float = 1.0
    accumulate_grad_in_fp32: bool = True

    def __post_init__(self):
        # Ensure numeric values are proper floats
        self.weight_decay = float(self.weight_decay)
        self.adam_beta1 = float(self.adam_beta1)
        self.adam_beta2 = float(self.adam_beta2)
        self.adam_eps = float(self.adam_eps)
        self.clip_grad = float(self.clip_grad)

@dataclass
class SchedulerConfig:
    type: str = "one_cycle"
    learning_rate: float = 0.003
    warmup_steps: int = 100
    max_lr: float = 0.003
    pct_start: float = 0.02
    anneal_strategy: str = "cos"
    cycle_momentum: bool = False
    div_factor: float = 25.0
    final_div_factor: float = 1000.0
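
    # Interpretation note, assuming these fields are passed to PyTorch's
    # OneCycleLR scheduler: the learning rate starts at max_lr / div_factor
    # = 0.003 / 25 = 1.2e-4, rises to max_lr = 3e-3 over the first pct_start
    # fraction of steps, then anneals with cosine decay toward
    # initial_lr / final_div_factor = 1.2e-7.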

@dataclass
class TrainingConfig:
    output_dir: str = "./results"
    batch_size: int = 2
    micro_batch_size: int = 1
    gradient_accumulation_steps: int = 4
    sequence_length: int = 512
    learning_rate: float = 0.003
    max_steps: int = 5050
    first_phase_steps: int = 5000
    second_phase_steps: int = 50
    sample_frequency: int = 500
    second_phase_sample_frequency: int = 10
    logging_dir: str = "./logs"
    logging_steps: int = 1
    save_steps: int = 500
    checkpoint_dir: str = "checkpoints"
    sample_prompt: str = "Explain what machine learning is:"
    max_generate_length: int = 100

@dataclass
class HardwareConfig:
    precision: str = "16-mixed"
    accelerator: str = "gpu"
    devices: int = 1
    strategy: str = "auto"
    gradient_clip: float = 1.0

@dataclass
class DatasetConfig:
    name: str
    path: str
    subset: str
    weight: float
    split_ratio: float = 1.0  # Default to using full dataset

@dataclass
class DataLoadingConfig:
    num_workers: int = 2
    batch_size: int = 32
    pin_memory: bool = True
    prefetch_factor: int = 2
    persistent_workers: bool = True

@dataclass
class DataConfig:
    datasets: List[DatasetConfig] = field(default_factory=list)
    loading: DataLoadingConfig = field(default_factory=DataLoadingConfig)

class SmolLM2Config:
    def __init__(self, config_path: Optional[str] = None):
        self.model = ModelConfig()
        self.optimizer = OptimizerConfig()
        self.scheduler = SchedulerConfig()
        self.training = TrainingConfig()
        self.hardware = HardwareConfig()
        self.data = DataConfig()
        
        # Default dataset configuration
        self.data.datasets = [
            DatasetConfig(
                name="wikitext",
                path="wikitext",
                subset="wikitext-2-raw-v1",
                weight=1.0
            )
        ]
        
        if config_path and os.path.exists(config_path):
            self.load_from_yaml(config_path)
    
    def load_from_yaml(self, config_path: str):
        with open(config_path, 'r') as f:
            config_dict = yaml.safe_load(f)
            
        # Update configurations from yaml
        if 'model' in config_dict:
            for k, v in config_dict['model'].items():
                setattr(self.model, k, v)
                
        if 'optimizer' in config_dict:
            for k, v in config_dict['optimizer'].items():
                setattr(self.optimizer, k, v)
                
        if 'scheduler' in config_dict:
            for k, v in config_dict['scheduler'].items():
                setattr(self.scheduler, k, v)
                
        if 'training' in config_dict:
            for k, v in config_dict['training'].items():
                setattr(self.training, k, v)
                
        if 'hardware' in config_dict:
            for k, v in config_dict['hardware'].items():
                setattr(self.hardware, k, v)
                
        if 'data' in config_dict:
            for key, value in config_dict['data'].items():
                if key == 'datasets':
                    # Datasets listed in the YAML replace the default dataset
                    # list rather than being appended to it.
                    self.data.datasets = [DatasetConfig(**dataset) for dataset in value]
                elif key == 'loading':
                    for loading_key, loading_value in value.items():
                        setattr(self.data.loading, loading_key, loading_value)
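
    # A minimal example of the YAML layout load_from_yaml expects (illustrative
    # only; every section and key is optional and falls back to the defaults
    # defined above):
    #
    #   model:
    #     hidden_size: 576
    #     num_hidden_layers: 30
    #   training:
    #     batch_size: 2
    #     max_steps: 5050
    #   data:
    #     datasets:
    #       - name: wikitext
    #         path: wikitext
    #         subset: wikitext-2-raw-v1
    #         weight: 1.0
    #     loading:
    #       num_workers: 2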
    
    def save_to_yaml(self, config_path: str):
        # asdict() recursively converts the nested dataclasses (including
        # DatasetConfig and DataLoadingConfig inside DataConfig) into plain
        # dicts, so yaml.dump emits standard YAML instead of Python object
        # tags that yaml.safe_load could not read back.
        config_dict = {
            'model': asdict(self.model),
            'optimizer': asdict(self.optimizer),
            'scheduler': asdict(self.scheduler),
            'training': asdict(self.training),
            'hardware': asdict(self.hardware),
            'data': asdict(self.data)
        }
        
        with open(config_path, 'w') as f:
            yaml.dump(config_dict, f, default_flow_style=False)

    def __repr__(self):
        return f"SmolLM2Config(\n" \
               f"  model={self.model}\n" \
               f"  optimizer={self.optimizer}\n" \
               f"  scheduler={self.scheduler}\n" \
               f"  training={self.training}\n" \
               f"  hardware={self.hardware}\n" \
               f"  data={self.data}\n" \
               f")"