# This file defines the SGD-related parameters for Marian training.
# This is the teacher configuration.
seed: 141414
# cost
cost-type: ce-sum
label-smoothing: 0.1
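# Note: ce-sum sums the cross-entropy over all target labels rather than
# averaging it, so reported costs grow with batch size; label-smoothing: 0.1
# presumably redistributes 10% of the target probability mass uniformly over
# the vocabulary, as in the standard formulation.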
# optimizer config
optimizer: adam
learn-rate: 0.0005
lr-warmup: 4000
lr-decay-inv-sqrt: 4000
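# Note (assumed semantics of Marian's inverse-sqrt schedule): the learning rate
# ramps up linearly to 0.0005 over the first 4000 updates, then decays as
#   lr(t) = 0.0005 * sqrt(4000 / max(t, 4000)),
# e.g. lr(16000) = 0.0005 * sqrt(4000 / 16000) = 0.00025.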
mini-batch-warmup: 4000
mini-batch-round-up: true
optimizer-params:
- 0.9
- 0.999
- 1e-08
- 0.01
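# Note: for adam these are presumably beta1 = 0.9, beta2 = 0.999, eps = 1e-08;
# the fourth value (0.01) is assumed to be Marian's decoupled weight-decay term.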
clip-norm: 0
dynamic-gradient-scaling:
- 2
- log
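# Note (assumed semantics): updates whose gradient norm deviates by more than
# 2 sigma from the running average, measured in log space, are rescaled to the
# average norm rather than applied as-is.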
exponential-smoothing: 1e-3
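# Note: exponential smoothing maintains a running average of the weights,
# roughly avg = (1 - 1e-3) * avg + 1e-3 * w per update; validation presumably
# uses these smoothed weights.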
# alignment
guided-alignment-weight: 0
# batch-size related parameters
mini-batch-fit: true
mini-batch-fit-step: 5
maxi-batch: 1000
mini-batch: 1000
mini-batch-words: 500000
max-length: 256
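# Note: with mini-batch-fit enabled, Marian measures (in steps of
# mini-batch-fit-step sentences) how many sentences of each length fit into the
# workspace, so mini-batch and mini-batch-words presumably act only as upper
# bounds; maxi-batch: 1000 pre-loads batches for length-based sorting, and
# sentences longer than max-length are presumably skipped during training.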
# validation-related parameters
# Note: valid-metrics is specified in code (cf. k_validMetricNames), since it is tied to the model path name.
# Note: the decoding parameters below apply only to validation decoding; decoding parameters for deployment are configured separately.
early-stopping: 40
valid-mini-batch: 32
beam-size: 4
normalize: 1.0
word-penalty: 0.0
valid-max-length: 1000
n-best: false
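# Note: early-stopping: 40 halts training after 40 consecutive validations with
# no improvement; normalize: 1.0 divides each hypothesis score by length^1.0 in
# the validation beam search (beam-size: 4).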
# general parameters
logical-epoch: 1Gt
after: 40e
valid-freq: 1Gt
save-freq: 1Gt
disp-freq: 100Mt
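# Note on unit suffixes (Marian convention): t = target labels, e = epochs,
# u = updates; so 1Gt = 1e9 target labels and 100Mt = 1e8. logical-epoch: 1Gt
# redefines an "epoch" as 1e9 target labels, and after: 40e then stops training
# after 40 such logical epochs.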
disp-label-counts: true
lr-report: true
sync-sgd: true
shuffle: batches
shuffle-in-ram: true
disp-first: 10
# multi-node sharding mode, irrelevant for single-node
sharding: local
sync-freq: 200u
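# Note (assumed semantics): with local sharding, optimizer state is sharded
# within each node and cross-node state is re-synchronized every 200 updates
# (sync-freq: 200u).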
fp16: false
# https://machinetranslation.visualstudio.com/Marian/_git/autogen?path=/configs/trainingConfigTeacherPoloniumV2Top15.yml&version=GBmain&_a=contents
# for fp16 stability
cost-scaling:
- 256.f
- 10000
- 1.f
- 256.f
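# Note (assumed field order per --cost-scaling): start from a loss-scaling
# factor of 256, re-check every 10000 updates, multiply by 1 (i.e. keep the
# factor fixed), with 256 as the minimum factor; this only takes effect when
# fp16 training is enabled.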
# model structure
type: transformer
# Flo generates separate source and target vocabs, so don't tie embeddings between source and target
tied-embeddings: true
tied-embeddings-all: true
tied-embeddings-src: false
# dimensions
dim-emb: 1024
enc-depth: 6
dec-depth: 6
transformer-dim-ffn: 8192
transformer-decoder-dim-ffn: 8192
transformer-depth-scaling: true
lemma-dim-emb: 0
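# Note: transformer-depth-scaling presumably scales initialization by layer
# depth to stabilize deeper stacks; lemma-dim-emb: 0 disables the extra lemma
# embedding used with factored vocabularies.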
# architecture details
transformer-decoder-autoreg: self-attention
transformer-tied-layers: []
# further transformer details
transformer-ffn-activation: relu
transformer-heads: 8
transformer-postprocess-emb: d
transformer-postprocess: dan
transformer-dropout: 0.1
transformer-dropout-attention: 0
transformer-dropout-ffn: 0.1
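# Note: in the postprocess strings, d = dropout, a = add (residual connection),
# n = layer normalization; so each block ends with dropout, residual add, then
# layer norm ("dan"), while embeddings get dropout only ("d").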
# data munging
all-caps-every: 0
english-title-case-every: 0
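# Note: these two options would upper-case or English-title-case every Nth
# training sentence as data augmentation; 0 disables both.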
log-time-zone: PST8PDT
quiet-translation: true
keep-best: true
overwrite: false
interpolate-env-vars: true
log: train.log
valid-log: valid.log
valid-translation-output: valid.trg.output