import os
from dataclasses import dataclass, field
from typing import List, Optional

import yaml

@dataclass
class ModelConfig:
    type: str = "custom"
    name: str = "smollm2_transformer"
    tokenizer_name: str = "HuggingFaceTB/SmolLM2-135M"
    vocab_size: int = 49152
    hidden_size: int = 576
    num_attention_heads: int = 9
    num_key_value_heads: int = 3
    num_hidden_layers: int = 30
    intermediate_size: int = 1536
    hidden_act: str = "gelu"
    max_position_embeddings: int = 512
    initializer_range: float = 0.02
    rms_norm_eps: float = 1e-5
    use_cache: bool = True
    pad_token_id: Optional[int] = None
    max_length: int = 512

    def __post_init__(self):
        # Coerce numeric fields so values that arrive as strings
        # (e.g. from YAML) become proper ints/floats
        self.vocab_size = int(self.vocab_size)
        self.hidden_size = int(self.hidden_size)
        self.num_attention_heads = int(self.num_attention_heads)
        self.num_key_value_heads = int(self.num_key_value_heads)
        self.num_hidden_layers = int(self.num_hidden_layers)
        self.intermediate_size = int(self.intermediate_size)
        self.max_position_embeddings = int(self.max_position_embeddings)
        self.initializer_range = float(self.initializer_range)
        self.rms_norm_eps = float(self.rms_norm_eps)
        self.max_length = int(self.max_length)
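        # Optional sanity checks (illustrative guards; a training stack may
        # enforce these elsewhere): the per-head dim must be an integer
        # (576 / 9 = 64), and grouped-query attention needs the query-head
        # count to be a multiple of the key/value-head count (9 / 3 = 3)
        assert self.hidden_size % self.num_attention_heads == 0
        assert self.num_attention_heads % self.num_key_value_heads == 0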

@dataclass
class OptimizerConfig:
    type: str = "adamW"
    weight_decay: float = 0.01
    adam_beta1: float = 0.9
    adam_beta2: float = 0.95
    adam_eps: float = 1e-8
    torch_adam_is_fused: bool = True
    clip_grad: float = 1.0

    def __post_init__(self):
        # Coerce numeric fields so values that arrive as strings become floats
        self.weight_decay = float(self.weight_decay)
        self.adam_beta1 = float(self.adam_beta1)
        self.adam_beta2 = float(self.adam_beta2)
        self.adam_eps = float(self.adam_eps)
        self.clip_grad = float(self.clip_grad)
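
# How these fields would typically feed torch.optim.AdamW; a hedged sketch,
# since the training loop that consumes this config is not part of this file
# (`model` and `cfg` are hypothetical names):
#
#   optimizer = torch.optim.AdamW(
#       model.parameters(),
#       lr=cfg.scheduler.learning_rate,
#       betas=(cfg.optimizer.adam_beta1, cfg.optimizer.adam_beta2),
#       eps=cfg.optimizer.adam_eps,
#       weight_decay=cfg.optimizer.weight_decay,
#       fused=cfg.optimizer.torch_adam_is_fused,
#   )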

@dataclass
class SchedulerConfig:
    type: str = "one_cycle"
    learning_rate: float = 0.003
    warmup_steps: int = 100
    max_lr: float = 0.003
    pct_start: float = 0.02
    anneal_strategy: str = "cos"
    cycle_momentum: bool = False
    div_factor: float = 25.0
    final_div_factor: float = 1000.0
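
# The field names mirror torch.optim.lr_scheduler.OneCycleLR parameters; a
# hedged wiring sketch (`optimizer` and `cfg` are hypothetical names):
#
#   scheduler = torch.optim.lr_scheduler.OneCycleLR(
#       optimizer,
#       max_lr=cfg.scheduler.max_lr,
#       total_steps=cfg.training.max_steps,
#       pct_start=cfg.scheduler.pct_start,
#       anneal_strategy=cfg.scheduler.anneal_strategy,
#       cycle_momentum=cfg.scheduler.cycle_momentum,
#       div_factor=cfg.scheduler.div_factor,
#       final_div_factor=cfg.scheduler.final_div_factor,
#   )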

@dataclass
class TrainingConfig:
    output_dir: str = "./results"
    batch_size: int = 2
    micro_batch_size: int = 1
    gradient_accumulation_steps: int = 4
    sequence_length: int = 512
    learning_rate: float = 0.003
    max_steps: int = 5050
    first_phase_steps: int = 5000
    second_phase_steps: int = 50
    sample_frequency: int = 500
    second_phase_sample_frequency: int = 10
    logging_dir: str = "./logs"
    logging_steps: int = 1
    save_steps: int = 500
    checkpoint_dir: str = "checkpoints"
    sample_prompt: str = "Explain what machine learning is:"
    max_generate_length: int = 100
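
# Note on the step arithmetic above: first_phase_steps + second_phase_steps
# (5000 + 50) matches max_steps (5050). Under the usual convention the
# effective batch per optimizer step is micro_batch_size *
# gradient_accumulation_steps; how batch_size relates to these is decided by
# the training loop, which is not part of this file.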

@dataclass
class HardwareConfig:
    precision: str = "16-mixed"
    accelerator: str = "gpu"
    devices: int = 1
    strategy: str = "auto"
    gradient_clip: float = 1.0
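
# These fields follow PyTorch Lightning Trainer arguments; a hedged wiring
# sketch (`cfg` is a hypothetical name, and gradient_clip presumably feeds
# gradient_clip_val):
#
#   trainer = lightning.Trainer(
#       precision=cfg.hardware.precision,
#       accelerator=cfg.hardware.accelerator,
#       devices=cfg.hardware.devices,
#       strategy=cfg.hardware.strategy,
#       gradient_clip_val=cfg.hardware.gradient_clip,
#   )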

@dataclass
class DatasetConfig:
    name: str
    path: str
    subset: str
    weight: float
    split_ratio: float = 1.0  # Default to using the full dataset

@dataclass
class DataLoadingConfig:
    num_workers: int = 2
    batch_size: int = 32
    pin_memory: bool = True
    prefetch_factor: int = 2
    persistent_workers: bool = True
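
# These fields mirror torch.utils.data.DataLoader keyword arguments; a hedged
# consumption sketch (`dataset` and `cfg` are hypothetical names):
#
#   loader = torch.utils.data.DataLoader(
#       dataset,
#       batch_size=cfg.data.loading.batch_size,
#       num_workers=cfg.data.loading.num_workers,
#       pin_memory=cfg.data.loading.pin_memory,
#       prefetch_factor=cfg.data.loading.prefetch_factor,
#       persistent_workers=cfg.data.loading.persistent_workers,
#   )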

@dataclass
class DataConfig:
    datasets: List[DatasetConfig] = field(default_factory=list)
    loading: DataLoadingConfig = field(default_factory=DataLoadingConfig)

class SmolLM2Config:
    def __init__(self, config_path: Optional[str] = None):
        self.model = ModelConfig()
        self.optimizer = OptimizerConfig()
        self.scheduler = SchedulerConfig()
        self.training = TrainingConfig()
        self.hardware = HardwareConfig()
        self.data = DataConfig()

        # Default dataset configuration
        self.data.datasets = [
            DatasetConfig(
                name="wikitext",
                path="wikitext",
                subset="wikitext-2-raw-v1",
                weight=1.0,
            )
        ]

        if config_path and os.path.exists(config_path):
            self.load_from_yaml(config_path)

    def load_from_yaml(self, config_path: str):
        with open(config_path, 'r') as f:
            config_dict = yaml.safe_load(f) or {}

        # Update the flat sub-configs from their YAML sections
        for section, target in (
            ('model', self.model),
            ('optimizer', self.optimizer),
            ('scheduler', self.scheduler),
            ('training', self.training),
            ('hardware', self.hardware),
        ):
            for k, v in config_dict.get(section, {}).items():
                setattr(target, k, v)

        # Re-run the dataclass coercion so numbers that YAML parsed as
        # strings (e.g. "1e-5") become proper ints/floats again
        self.model.__post_init__()
        self.optimizer.__post_init__()

        if 'data' in config_dict:
            data_dict = config_dict['data']
            if 'datasets' in data_dict:
                # Replace the default datasets rather than appending to them
                self.data.datasets = [
                    DatasetConfig(**d) for d in data_dict['datasets']
                ]
            for k, v in data_dict.get('loading', {}).items():
                setattr(self.data.loading, k, v)
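
    # Illustrative YAML layout that load_from_yaml accepts (values shown are
    # the shipped defaults; any subset of keys may be given):
    #
    #   model:
    #     hidden_size: 576
    #     num_hidden_layers: 30
    #   training:
    #     max_steps: 5050
    #   data:
    #     datasets:
    #       - name: wikitext
    #         path: wikitext
    #         subset: wikitext-2-raw-v1
    #         weight: 1.0
    #     loading:
    #       num_workers: 2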

    def save_to_yaml(self, config_path: str):
        config_dict = {
            'model': self.model.__dict__,
            'optimizer': self.optimizer.__dict__,
            'scheduler': self.scheduler.__dict__,
            'training': self.training.__dict__,
            'hardware': self.hardware.__dict__,
            # Nested dataclasses are flattened to plain dicts so yaml.dump
            # writes readable YAML instead of python/object tags
            'data': {
                'datasets': [d.__dict__ for d in self.data.datasets],
                'loading': self.data.loading.__dict__,
            },
        }
        with open(config_path, 'w') as f:
            yaml.dump(config_dict, f, default_flow_style=False)

    def __repr__(self):
        return (
            f"SmolLM2Config(\n"
            f"  model={self.model}\n"
            f"  optimizer={self.optimizer}\n"
            f"  scheduler={self.scheduler}\n"
            f"  training={self.training}\n"
            f"  hardware={self.hardware}\n"
            f"  data={self.data}\n"
            f")"
        )