# This file defines the SGD-related parameters for Marian training runs.
# This is the teacher configuration.
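#
# Usage sketch (the file names below are illustrative, not defined here): Marian merges
# multiple --config files, so this file is typically combined with the model/data
# configuration on the command line, e.g.
#   marian --config train.sgd.yml train.model.yml --train-sets corpus.src corpus.trg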

seed: 141414

# cost
cost-type: ce-sum
label-smoothing: 0.1

# optimizer config
optimizer: adam
learn-rate: 0.0005
lr-warmup: 4000
lr-decay-inv-sqrt: 4000
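# The two 4000s above give (roughly) the standard Transformer schedule: the learning
# rate ramps up linearly over the first 4000 updates and then decays proportionally
# to 1/sqrt(update count).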
mini-batch-warmup: 4000
mini-batch-round-up: true
optimizer-params:
  - 0.9
  - 0.999
  - 1e-08
  - 0.01
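# The positional optimizer-params above are Adam's hyper-parameters: beta1, beta2,
# epsilon, and (presumably) a weight-decay term of 0.01.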
clip-norm: 0
dynamic-gradient-scaling:
  - 2
  - log
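# dynamic-gradient-scaling: presumably re-scale any gradient whose (log-space) norm
# deviates from the running average norm by more than 2 sigmas.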
exponential-smoothing: 1e-3

# alignment
guided-alignment-weight: 0

# batch-size related parameters
mini-batch-fit: true
mini-batch-fit-step: 5
maxi-batch: 1000
mini-batch: 1000
mini-batch-words: 500000
max-length: 256
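# With mini-batch-fit enabled, Marian sizes mini-batches automatically to fill the
# available workspace; mini-batch, maxi-batch and mini-batch-words then mostly act
# as upper bounds / pre-fetch sizes for length-based sorting.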

# validation-related parameters
# Note: valid-metrics is specified in code (cf. k_validMetricNames), since it is tied to the model path name.
# Note: The decoding parameters below apply only to validation decoding; the decoding parameters used in deployment are configured separately.
early-stopping: 40
valid-mini-batch: 32
beam-size: 4
normalize: 1.0
word-penalty: 0.0
valid-max-length: 1000
n-best: false
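# early-stopping: 40 stops training once the validation metric has failed to improve
# for 40 consecutive validation runs; normalize: 1.0 length-normalizes hypothesis
# scores (division by length^1.0) during the validation beam search.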

# general parameters
logical-epoch: 1Gt
after: 40e
valid-freq: 1Gt
save-freq: 1Gt
disp-freq: 100Mt
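# Suffixes used in this file for frequencies/amounts: "t" counts target labels
# (1Gt = 1e9, 100Mt = 1e8), "e" counts (logical) epochs, and "u" counts parameter updates.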
disp-label-counts: true
lr-report: true
sync-sgd: true
shuffle: batches
shuffle-in-ram: true
disp-first: 10

# multi-node sharding mode; irrelevant for single-node training
sharding: local
sync-freq: 200u

fp16: false
# https://machinetranslation.visualstudio.com/Marian/_git/autogen?path=/configs/trainingConfigTeacherPoloniumV2Top15.yml&version=GBmain&_a=contents
# for fp16 stability
cost-scaling:
  - 256.f
  - 10000
  - 1.f
  - 256.f
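# With fp16: false, this cost-scaling block is effectively dormant; it only takes
# effect when mixed-precision (fp16) training is switched on.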

# model structure
type: transformer

# Flo generates separate vocabs, so don't tie embeddings between source and target
tied-embeddings: true
tied-embeddings-all: true
tied-embeddings-src: false

# dimensions
dim-emb: 1024
enc-depth: 6
dec-depth: 6
transformer-dim-ffn: 8192
transformer-decoder-dim-ffn: 8192
transformer-depth-scaling: true
lemma-dim-emb: 0

# architecture details
transformer-decoder-autoreg: self-attention
transformer-tied-layers: []
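# self-attention is the standard autoregressive Transformer decoder sub-layer, and an
# empty transformer-tied-layers list means no parameters are shared across layers.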

# further transformer details
transformer-ffn-activation: relu

transformer-heads: 8
transformer-postprocess-emb: d
transformer-postprocess: dan
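# The postprocess strings are sequences of per-sub-layer operations: d = dropout,
# a = residual add, n = layer normalization; "d" on the embeddings applies dropout only.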

transformer-dropout: 0.1
transformer-dropout-attention: 0
transformer-dropout-ffn: 0.1

# data munging
all-caps-every: 0
english-title-case-every: 0
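# Data-augmentation knobs: a value of N would upper-case (or English-title-case) every
# Nth input line on the fly; 0 disables both.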

log-time-zone: PST8PDT

quiet-translation: true
keep-best: true
overwrite: false
interpolate-env-vars: true
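# interpolate-env-vars allows values such as the log/output paths below to contain
# ${ENV_VAR} references that are expanded at start-up.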
log: train.log
valid-log: valid.log
valid-translation-output: valid.trg.output