{
  "chunk_size": 20,
  "dim_feedforward": 3200,
  "dim_model": 64,
  "dropout": 0.1,
  "feedforward_activation": "relu",
  "input_features": {
    "observation.images.top": {
      "shape": [
        3,
        480,
        640
      ],
      "type": "VISUAL"
    },
    "observation.state": {
      "shape": [
        14
      ],
      "type": "STATE"
    }
  },
  "kl_weight": 10.0,
  "latent_dim": 32,
  "n_action_steps": 20,
  "n_decoder_layers": 1,
  "n_encoder_layers": 4,
  "n_heads": 8,
  "n_obs_steps": 1,
  "n_vae_encoder_layers": 4,
  "normalization_mapping": {
    "ACTION": "MEAN_STD",
    "STATE": "MEAN_STD",
    "VISUAL": "MEAN_STD"
  },
  "optimizer_lr": 1e-05,
  "optimizer_lr_backbone": 1e-05,
  "optimizer_weight_decay": 0.0001,
  "output_features": {
    "action": {
      "shape": [
        14
      ],
      "type": "ACTION"
    }
  },
  "pre_norm": false,
  "pretrained_backbone_weights": "ResNet18_Weights.IMAGENET1K_V1",
  "replace_final_stride_with_dilation": false,
  "temporal_ensemble_coeff": null,
  "type": "act",
  "use_vae": true,
  "vision_backbone": "resnet18"
}