---
license: mit
datasets:
- cjiao/minipile-train
- cjiao/wikitext-tokenized
base_model:
- jmvcoelho/GPTNeoX-160m
metrics:
- perplexity
---

#### Training Hyperparameters

```json
{
  "model_to_train": "models/GPTNeoX-160M-WikiText-512-flash_attention_2-2048-gradient_checkpointing-FT",
  "output_dir": "...",
  "run_name": "...",
  "report_to": "wandb",
  "eval_strategy": "epoch",
  "logging_steps": 1,
  "num_train_epochs": 3,
  "save_strategy": "no",
  "bf16": true,
  "seq_len": 2048,
  "attention_type": "flash_attention_2",
  "gradient_checkpointing": true,
  "dataset": "minipile",
  "deepspeed": null,
  "per_device_train_batch_size": 24,
  "learning_rate": 0.0016,
  "weight_decay": 0.0,
  "warmup_ratio": 0.1,
  "lr_scheduler_type": "cosine_with_min_lr",
  "lr_scheduler_kwargs": {
    "min_lr": 1e-6
  }
}
```
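Most of these fields map directly onto `transformers.TrainingArguments`. Below is a minimal sketch of that mapping, assuming a recent `transformers` release that supports the `cosine_with_min_lr` scheduler and `lr_scheduler_kwargs`; the remaining fields (`model_to_train`, `seq_len`, `attention_type`, `dataset`, `deepspeed`) are consumed by the training script rather than `TrainingArguments`. The `output_dir` and `run_name` values are elided in the source config and left as placeholders here.

```python
from transformers import TrainingArguments

# Sketch only: output_dir and run_name are elided ("...") in the source config.
args = TrainingArguments(
    output_dir="...",
    run_name="...",
    report_to="wandb",
    eval_strategy="epoch",  # named evaluation_strategy in older transformers releases
    logging_steps=1,
    num_train_epochs=3,
    save_strategy="no",
    bf16=True,
    gradient_checkpointing=True,
    per_device_train_batch_size=24,
    learning_rate=1.6e-3,
    weight_decay=0.0,
    warmup_ratio=0.1,
    lr_scheduler_type="cosine_with_min_lr",
    lr_scheduler_kwargs={"min_lr": 1e-6},
)
```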