import torch

# Model / optimisation settings, looked up by upper-case string key.
# NOTE(review): the meaning of each entry is inferred from its key name only;
# no consumer of this dict is visible here -- confirm against the code that
# reads it before relying on these comments.
MODEL_CONFIG = {
    # Size-related values (presumably the network dimensions -- TODO confirm).
    'VOCAB_SIZE': 50000,
    'D_MODEL': 1024,
    'N_HEADS': 32,  # 1024 / 32 divides evenly (32)
    'D_FF': 4096,
    'N_LAYERS': 32,
    'MAX_SEQ_LEN': 512,
    # Training-loop values.
    'BATCH_SIZE': 32,
    'LEARNING_RATE': 1e-4,
    'NUM_EPOCHS': 20,
    # Evaluated once, when this module is imported: 'cuda' if
    # torch.cuda.is_available() returns True at that moment, otherwise 'cpu'.
    'DEVICE': 'cuda' if torch.cuda.is_available() else 'cpu',
    # Optimiser / regularisation values.
    'WARMUP_STEPS': 4000,
    'ADAM_EPSILON': 1e-8,
    'WEIGHT_DECAY': 0.01,
    'GRADIENT_ACCUMULATION_STEPS': 2,
    'MAX_GRAD_NORM': 1.0,
    'DROPOUT': 0.1,
}

# Step-count settings for checkpointing, logging and evaluation.
# NOTE(review): presumably all *_STEPS values count optimiser/training steps
# and SAVE_TOTAL_LIMIT caps the number of retained checkpoints -- verify
# against the training loop that consumes this dict.
TRAINING_CONFIG = {
    'CHECKPOINT_SAVE_STEPS': 5000,
    'LOGGING_STEPS': 100,
    'EVAL_STEPS': 1000,
    'SAVE_TOTAL_LIMIT': 5,
}