# baseline_en-de_64k_ep40 / model.npz.yml
# rewicks: Upload model.npz.yml with huggingface_hub
# a300893 (verified)
# General runtime settings.
# NOTE(review): the key set looks like a Marian NMT training-config dump;
# confirm the consuming tool and version before editing values by hand.
authors: false
cite: false
build-info: ""
# Negative value: presumably "all available device memory minus 8000 MB";
# TODO confirm against the tool's documentation for `workspace`.
workspace: -8000
# Logging: destination file, verbosity and the time zone used for timestamps.
log: train.log
log-level: info
log-time-zone: PST8PDT
quiet: false
quiet-translation: true
# Fixed random seed.
seed: 141414
check-nan: false
interpolate-env-vars: true
# NOTE(review): with this off, relative paths such as `model_files/model.npz`
# are presumably resolved against the working directory — verify.
relative-paths: false
dump-config: ""
# Behaviour on SIGTERM, as named by the value: save, then exit.
sigterm: save-and-exit
# Model architecture settings.
# NOTE(review): the key set looks like a Marian NMT config dump; the
# per-option remarks below are hedged where the file itself cannot confirm
# them.

# Model file locations and model family.
model: model_files/model.npz
pretrained-model: ""
ignore-model-config: false
type: transformer

# Vocabulary sizes (two entries, 64000 each) and embedding width.
dim-vocabs:
- 64000
- 64000
dim-emb: 1024
factors-dim-emb: 0
factors-combine: sum
lemma-dependency: ""
lemma-dim-emb: 0

# Recurrent-cell settings and layer counts.
# NOTE(review): with `type: transformer` the gru/rnn-specific keys are
# presumably unused defaults; enc-depth/dec-depth give 6 layers each.
dim-rnn: 1024
enc-type: bidirectional
enc-cell: gru
enc-cell-depth: 1
enc-depth: 6
dec-cell: gru
dec-cell-base-depth: 2
dec-cell-high-depth: 1
dec-depth: 6
skip: false
layer-normalization: false
right-left: false
# Empty list written inline: a `[]` on its own line at column 0 is not a
# properly indented mapping value and strict YAML loaders may reject it.
input-types: []

# Embedding tying.
tied-embeddings: true
tied-embeddings-src: false
tied-embeddings-all: true
output-omit-bias: false

# Transformer geometry: 8 heads, feed-forward width 8192 on both sides.
transformer-heads: 8
transformer-no-projection: false
transformer-rnn-projection: false
transformer-pool: false
transformer-dim-ffn: 8192
transformer-decoder-dim-ffn: 8192
transformer-ffn-depth: 2
# NOTE(review): 0 presumably means "same as the encoder value" — confirm.
transformer-decoder-ffn-depth: 0
transformer-ffn-activation: relu
transformer-dim-aan: 2048
transformer-aan-depth: 2
transformer-aan-activation: swish
transformer-aan-nogate: false
transformer-decoder-autoreg: self-attention
transformer-tied-layers: []
transformer-guided-alignment-layer: last
# Pre/post-processing step strings (single-letter op codes; semantics are
# defined by the consuming tool).
transformer-preprocess: ""
transformer-postprocess-emb: d
transformer-postprocess: dan
transformer-postprocess-top: ""
transformer-train-position-embeddings: false
transformer-depth-scaling: true
transformer-no-bias: false
transformer-no-affine: false

# BERT-related settings. The symbols must stay quoted: a leading `[` would
# otherwise start a YAML flow sequence.
bert-mask-symbol: "[MASK]"
bert-sep-symbol: "[SEP]"
bert-class-symbol: "[CLS]"
bert-masking-fraction: 0.15
bert-train-type-embeddings: true
bert-type-vocab-size: 2

# COMET-related settings.
comet-final-sigmoid: false
comet-mix: false
comet-mix-norm: false
comet-dropout: 0.1
comet-mixup: 0
comet-mixup-reg: false
comet-pooler-ffn:
- 2048
- 1024
comet-prepend-zero: false

# Dropout rates.
dropout-rnn: 0
dropout-src: 0
dropout-trg: 0
transformer-dropout: 0.1
transformer-dropout-attention: 0
transformer-dropout-ffn: 0.1
# Training settings.
# NOTE(review): the key set looks like a Marian NMT config dump; the
# per-option remarks below are hedged where the file itself cannot confirm
# them.

# Loss definition and checkpoint handling.
cost-type: ce-sum
multi-loss-type: sum
unlikelihood-loss: false
overwrite: false
overwrite-checkpoint: true
no-reload: false

# Training data: read from stdin; the same `vocab` file is listed twice.
train-sets:
- stdin
vocabs:
- vocab
- vocab
# Empty list written inline: a `[]` on its own line at column 0 is not a
# properly indented mapping value and strict YAML loaders may reject it.
sentencepiece-alphas: []
sentencepiece-options: ""
sentencepiece-max-lines: 2000000
no-spm-encode: false

# Stopping criterion and reporting/saving cadence.
# NOTE(review): unit suffixes presumably follow the tool's scheduling syntax
# (e = epochs, t = target labels, u = updates; M/G = 1e6/1e9) — confirm.
after-epochs: 0
after-batches: 0
after: 40e
disp-freq: 100Mt
disp-first: 10
disp-label-counts: true
save-freq: 1Gt
logical-epoch:
- 1Gt

# Corpus reading: length limit, TSV input with 2 fields, shuffling.
max-length: 256
max-length-crop: false
tsv: true
tsv-fields: 2
shuffle: batches
no-restore-corpus: true
tempdir: /tmp
sqlite: ""
sqlite-drop: false

# Devices and synchronisation.
devices:
- 0
- 1
no-nccl: false
sharding: local
sync-freq: 200u
cpu-threads: 0

# Batch sizing.
mini-batch: 1000
mini-batch-words: 500000
mini-batch-fit: true
mini-batch-fit-step: 5
gradient-checkpointing: false
maxi-batch: 1000
maxi-batch-sort: trg
shuffle-in-ram: true
data-threads: 8
all-caps-every: 0
english-title-case-every: 0
mini-batch-words-ref: 0
mini-batch-warmup: 4000
mini-batch-track-lr: false
mini-batch-round-up: true

# Optimizer.
optimizer: adam
# NOTE(review): presumably beta1, beta2, epsilon and a fourth Adam parameter
# (weight decay?) — confirm the order against the tool's documentation.
# The third value was `1e-08`; the explicit decimal point makes YAML 1.1
# loaders read it as a float instead of a string (the number is unchanged).
optimizer-params:
- 0.9
- 0.999
- 1.0e-08
- 0.01
optimizer-delay: 1
sync-sgd: true

# Learning-rate schedule.
learn-rate: 0.0005
lr-report: true
lr-decay: 0
lr-decay-strategy: epoch+stalled
lr-decay-start:
- 10
- 1
lr-decay-freq: 50000
lr-decay-reset-optimizer: false
lr-decay-repeat-warmup: false
lr-decay-inv-sqrt:
- 4000
lr-warmup: 4000
lr-warmup-start-rate: 0
lr-warmup-cycle: false
lr-warmup-at-reload: false

# Regularisation and parameter averaging.
label-smoothing: 0.1
factor-weight: 1
clip-norm: 0
# Was `1e-3`; the explicit decimal point makes YAML 1.1 loaders read it as
# a float instead of a string (the number is unchanged).
exponential-smoothing: 1.0e-3
exponential-smoothing-replace-freq: 0

# Guided alignment and data weighting (both off: `none` / empty path).
guided-alignment: none
guided-alignment-cost: ce
guided-alignment-weight: 0
data-weighting: ""
data-weighting-type: sentence

# Pretrained embeddings (none listed).
embedding-vectors: []
embedding-normalization: false
embedding-fix-src: false
embedding-fix-trg: false

# Numeric precision and loss scaling.
precision:
- float32
- float32
# The `.f`-suffixed entries are plain strings to YAML; they are kept
# verbatim for the consuming tool to interpret.
cost-scaling:
- 256.f
- 10000
- 1.f
- 256.f
throw-on-divergence: []
custom-fallbacks: []
gradient-norm-average-window: 100
dynamic-gradient-scaling:
- 2
- log
check-gradient-nan: false
normalize-gradient: false
train-embedder-rank: []

# Quantisation (quantize-bits is 0).
quantize-bits: 0
quantize-optimization-steps: 0
quantize-log-based: false
quantize-biases: false

# ULR settings (`ulr: false`).
ulr: false
ulr-query-vectors: ""
ulr-keys-vectors: ""
ulr-trainable-transformation: false
ulr-dim-emb: 0
ulr-dropout: 0
ulr-softmax-temperature: 1
# Validation and validation-time decoding settings.
# NOTE(review): the key set looks like a Marian NMT config dump; the
# per-option remarks below are hedged where the file itself cannot confirm
# them.

# Validation data, cadence and metrics.
valid-sets:
- dev.en-de
valid-freq: 1Gt
valid-metrics:
- perplexity
- ce-mean-words
- bleu
- chrf
valid-reset-stalled: false
valid-reset-all: false

# Early stopping.
# NOTE(review): `first` presumably refers to the first entry of
# `valid-metrics` (perplexity) — confirm.
early-stopping: 40
early-stopping-epsilon:
- 0
early-stopping-on: first

# Decoding parameters.
beam-size: 4
normalize: 1.0
max-length-factor: 3
word-penalty: 0.0
allow-unk: false
n-best: false
word-scores: false
valid-mini-batch: 32
valid-max-length: 1000

# External validation script (none configured) and output files.
valid-script-path: ""
# Empty list written inline: a `[]` on its own line at column 0 is not a
# properly indented mapping value and strict YAML loaders may reject it.
valid-script-args: []
valid-translation-output: valid.trg.output
keep-best: true
valid-log: valid.log