rewicks
/

baseline_en-de_64k_ep2

text2text-generation

Model card Files Files and versions

rewicks committed on Feb 19

Commit

3b71bea

·

verified ·

1 Parent(s): e5d1aa0

Upload marian-config.yaml

Files changed (1) hide show

marian-config.yaml +121 -0

marian-config.yaml ADDED Viewed

	@@ -0,0 +1,121 @@

+# This file defines the SGD-related parameters for Marian training runs.
+# This is the teacher configuration.
+seed: 141414
+# cost
+cost-type: ce-sum
+label-smoothing: 0.1
+# optimizer config
+optimizer: adam
+learn-rate: 0.0005
+lr-warmup: 4000
+lr-decay-inv-sqrt: 4000
+mini-batch-warmup: 4000
+mini-batch-round-up: true
+optimizer-params:
+  - 0.9
+  - 0.999
+  - 1e-08
+  - 0.01
+clip-norm: 0
+dynamic-gradient-scaling:
+  - 2
+  - log
+exponential-smoothing: 1e-3
+# alignment
+guided-alignment-weight: 0
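+# batch-size related parameters
+# NOTE(review): max-length is set here and again under "dimensions" (same value, 256); the duplicate key should be removed from one place.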
+mini-batch-fit: true
+mini-batch-fit-step: 5
+maxi-batch: 1000
+mini-batch: 1000
+mini-batch-words: 500000
+max-length: 256
+# validation-related parameters
+# Note: valid-metrics is specified in code (cf. k_validMetricNames) because it is tied to the model pathname.
+# Note: The decoding parameters below apply only to validation decoding; deployment decoding parameters are configured separately.
+early-stopping: 40
+valid-mini-batch: 32
+beam-size: 4
+normalize: 1.0
+word-penalty: 0.0
+valid-max-length: 1000
+n-best: false
+# general parameters
+logical-epoch: 1Gt
+after: 40e
+valid-freq: 1Gt
+save-freq: 1Gt
+disp-freq: 100Mt
+disp-label-counts: true
+lr-report: true
+sync-sgd: true
+shuffle: batches
+shuffle-in-ram: true
+disp-first: 10
+# multi-node sharding mode, irrelevant for single-node
+sharding: local
+sync-freq: 200u
+fp16: false
+# https://machinetranslation.visualstudio.com/Marian/_git/autogen?path=/configs/trainingConfigTeacherPoloniumV2Top15.yml&version=GBmain&_a=contents
+# for fp16 stability
+cost-scaling:
+  - 256.f
+  - 10000
+  - 1.f
+  - 256.f
+# model structure
+type: transformer
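+# Flo generates separate vocabs, so don't tie between source and target.
+# NOTE(review): tied-embeddings-all is set to true below, which appears to contradict the line above — confirm which is intended.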
+tied-embeddings: true
+tied-embeddings-all: true
+tied-embeddings-src: false
+# dimensions
+dim-emb: 1024
+enc-depth: 6
+dec-depth: 6
+transformer-dim-ffn: 8192
+transformer-decoder-dim-ffn: 8192
+transformer-depth-scaling: true
+lemma-dim-emb: 0
+max-length: 256
+# architecture details
+transformer-decoder-autoreg: self-attention
+transformer-tied-layers: []
+# further transformer details
+transformer-ffn-activation: relu
+transformer-heads: 8
+transformer-postprocess-emb: d
+transformer-postprocess: dan
+transformer-dropout: 0.1
+transformer-dropout-attention: 0
+transformer-dropout-ffn: 0.1
+# data munging
+all-caps-every: 0
+english-title-case-every: 0
+log-time-zone: PST8PDT
+quiet-translation: true
+keep-best: true
+overwrite: false
+interpolate-env-vars: true
+log: train.log
+valid-log: valid.log
+valid-translation-output: valid.trg.output