# This file defines the SGD-related parameters for Marian training runs.
# This is the teacher configuration.
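#
# Usage sketch (the file names below are illustrative, not defined here): Marian merges
# multiple --config files, so this file is typically combined with the model/data
# configuration on the command line, e.g.
#   marian --config train.sgd.yml train.model.yml --train-sets corpus.src corpus.trg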

seed: 141414

# cost
cost-type: ce-sum
label-smoothing: 0.1

# optimizer config
optimizer: adam
learn-rate: 0.0005
lr-warmup: 4000
lr-decay-inv-sqrt: 4000
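# The two 4000s above give (roughly) the standard Transformer schedule: the learning
# rate ramps up linearly over the first 4000 updates and then decays proportionally
# to 1/sqrt(update count).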
mini-batch-warmup: 4000
mini-batch-round-up: true
optimizer-params:
  - 0.9
  - 0.999
  - 1e-08
  - 0.01
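# The positional optimizer-params above are Adam's hyper-parameters: beta1, beta2,
# epsilon, and (presumably) a weight-decay term of 0.01.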
clip-norm: 0
dynamic-gradient-scaling:
  - 2
  - log
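# dynamic-gradient-scaling: presumably re-scale any gradient whose (log-space) norm
# deviates from the running average norm by more than 2 sigmas.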
exponential-smoothing: 1e-3

# alignment
guided-alignment-weight: 0

# batch-size related parameters
mini-batch-fit: true
mini-batch-fit-step: 5
maxi-batch: 1000
mini-batch: 1000
mini-batch-words: 500000
max-length: 256
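# With mini-batch-fit enabled, Marian sizes mini-batches automatically to fill the
# available workspace; mini-batch, maxi-batch and mini-batch-words then mostly act
# as upper bounds / pre-fetch sizes for length-based sorting.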

# validation-related parameters
# Note: valid-metrics is specified in code (cf. k_validMetricNames), since it is tied to the model path name.
# Note: The decoding parameters below apply only to validation decoding; the decoding parameters used in deployment are configured separately.
early-stopping: 40
valid-mini-batch: 32
beam-size: 4
normalize: 1.0
word-penalty: 0.0
valid-max-length: 1000
n-best: false
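# early-stopping: 40 stops training once the validation metric has failed to improve
# for 40 consecutive validation runs; normalize: 1.0 length-normalizes hypothesis
# scores (division by length^1.0) during the validation beam search.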

# general parameters
logical-epoch: 1Gt
after: 40e
valid-freq: 1Gt
save-freq: 1Gt
disp-freq: 100Mt
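# Suffixes used in this file for frequencies/amounts: "t" counts target labels
# (1Gt = 1e9, 100Mt = 1e8), "e" counts (logical) epochs, and "u" counts parameter updates.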
disp-label-counts: true
lr-report: true
sync-sgd: true
shuffle: batches
shuffle-in-ram: true
disp-first: 10

# multi-node sharding mode; irrelevant for single-node training
sharding: local
sync-freq: 200u

fp16: false
# https://machinetranslation.visualstudio.com/Marian/_git/autogen?path=/configs/trainingConfigTeacherPoloniumV2Top15.yml&version=GBmain&_a=contents
# for fp16 stability
cost-scaling:
  - 256.f
  - 10000
  - 1.f
  - 256.f
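# With fp16: false, this cost-scaling block is effectively dormant; it only takes
# effect when mixed-precision (fp16) training is switched on.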

# model structure
type: transformer

# Flo generates separate vocabs, so don't tie embeddings between source and target
tied-embeddings: true
tied-embeddings-all: true
tied-embeddings-src: false

# dimensions
dim-emb: 1024
enc-depth: 6
dec-depth: 6
transformer-dim-ffn: 8192
transformer-decoder-dim-ffn: 8192
transformer-depth-scaling: true
lemma-dim-emb: 0

# architecture details
transformer-decoder-autoreg: self-attention
transformer-tied-layers: []
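# self-attention is the standard autoregressive Transformer decoder sub-layer, and an
# empty transformer-tied-layers list means no parameters are shared across layers.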

# further transformer details
transformer-ffn-activation: relu

transformer-heads: 8
transformer-postprocess-emb: d
transformer-postprocess: dan
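# The postprocess strings are sequences of per-sub-layer operations: d = dropout,
# a = residual add, n = layer normalization; "d" on the embeddings applies dropout only.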

transformer-dropout: 0.1
transformer-dropout-attention: 0
transformer-dropout-ffn: 0.1

# data munging
all-caps-every: 0
english-title-case-every: 0
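# Data-augmentation knobs: a value of N would upper-case (or English-title-case) every
# Nth input line on the fly; 0 disables both.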

log-time-zone: PST8PDT

quiet-translation: true
keep-best: true
overwrite: false
interpolate-env-vars: true
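# interpolate-env-vars allows values such as the log/output paths below to contain
# ${ENV_VAR} references that are expanded at start-up.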
log: train.log
valid-log: valid.log
valid-translation-output: valid.trg.output