Spaces:

rynmurdock
/

Babel

Runtime error

App Files Files Community

Babel / Optimus /code /pytorch_transformers /configuration_transfo_xl.py

rynmurdock

init

c5ca37a 11 months ago

raw

history blame

6.8 kB

	# coding=utf-8
	# Copyright 2018 Google AI, Google Brain and Carnegie Mellon University Authors and the HuggingFace Inc. team.
	# Copyright (c) 2018, NVIDIA CORPORATION. All rights reserved.
	#
	# Licensed under the Apache License, Version 2.0 (the "License");
	# you may not use this file except in compliance with the License.
	# You may obtain a copy of the License at
	#
	# http://www.apache.org/licenses/LICENSE-2.0
	#
	# Unless required by applicable law or agreed to in writing, software
	# distributed under the License is distributed on an "AS IS" BASIS,
	# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
	# See the License for the specific language governing permissions and
	# limitations under the License.
	""" Transformer XL configuration """

	from __future__ import absolute_import, division, print_function, unicode_literals

	import json
	import logging
	import sys
	from io import open

	from .configuration_utils import PretrainedConfig

	# Module-level logger, named after this module.
	logger = logging.getLogger(__name__)

	# Maps each pretrained-model shortcut name to the URL of its configuration
	# JSON file. `TransfoXLConfig` exposes this mapping as its
	# `pretrained_config_archive_map` class attribute.
	TRANSFO_XL_PRETRAINED_CONFIG_ARCHIVE_MAP = {
	    "transfo-xl-wt103": "https://s3.amazonaws.com/models.huggingface.co/bert/transfo-xl-wt103-config.json",
	}

	class TransfoXLConfig(PretrainedConfig):
	    """Configuration class to store the configuration of a `TransfoXLModel`.

	    Args:
	        vocab_size_or_config_json_file: Vocabulary size of `inputs_ids` in
	            `TransfoXLModel` (int), or the path to a configuration json
	            file (str). When a path is given, every key of the JSON object
	            is copied onto the instance and the remaining arguments below
	            are ignored.
	        cutoffs: cutoffs for the adaptive softmax. Defaults to
	            `[20000, 40000, 200000]` when left as `None`; the values are
	            copied into a new list.
	        d_model: Dimensionality of the model's hidden states.
	        d_embed: Dimensionality of the embeddings
	        n_head: Number of attention heads for each attention layer in
	            the Transformer encoder.
	        d_head: Dimensionality of the model's heads.
	        d_inner: Inner dimension in FF
	        div_val: dividend value for adaptive input and softmax
	        pre_lnorm: apply LayerNorm to the input instead of the output
	        n_layer: Number of hidden layers in the Transformer encoder.
	        tgt_len: number of tokens to predict
	        ext_len: length of the extended context
	        mem_len: length of the retained previous heads
	        clamp_len: use the same pos embeddings after clamp_len
	        same_length: use the same attn length for all tokens
	        proj_share_all_but_first: True to share all but first projs, False
	            not to share. Stored as the `tie_projs` list (one leading
	            `False`, then one flag per cutoff).
	        attn_type: attention type. 0 for Transformer-XL, 1 for Shaw et al,
	            2 for Vaswani et al, 3 for Al Rfou et al.
	        sample_softmax: number of samples in sampled softmax
	        adaptive: use adaptive softmax
	        tie_weight: tie the word embedding and softmax weights
	        dropout: The dropout probability for all fully connected
	            layers in the embeddings, encoder, and pooler.
	        dropatt: The dropout ratio for the attention probabilities.
	        untie_r: untie relative position biases
	        init: parameter initializer to use
	        init_range: parameters initialized by U(-init_range, init_range).
	        proj_init_std: parameters initialized by N(0, init_std)
	        init_std: parameters initialized by N(0, init_std)
	        **kwargs: extra keyword arguments, forwarded unchanged to
	            `PretrainedConfig.__init__`.

	    Raises:
	        ValueError: if `vocab_size_or_config_json_file` is neither an int
	            nor a string.
	    """
	    pretrained_config_archive_map = TRANSFO_XL_PRETRAINED_CONFIG_ARCHIVE_MAP

	    def __init__(self,
	                 vocab_size_or_config_json_file=267735,
	                 cutoffs=None,
	                 d_model=1024,
	                 d_embed=1024,
	                 n_head=16,
	                 d_head=64,
	                 d_inner=4096,
	                 div_val=4,
	                 pre_lnorm=False,
	                 n_layer=18,
	                 tgt_len=128,
	                 ext_len=0,
	                 mem_len=1600,
	                 clamp_len=1000,
	                 same_length=True,
	                 proj_share_all_but_first=True,
	                 attn_type=0,
	                 sample_softmax=-1,
	                 adaptive=True,
	                 tie_weight=True,
	                 dropout=0.1,
	                 dropatt=0.0,
	                 untie_r=True,
	                 init="normal",
	                 init_range=0.01,
	                 proj_init_std=0.01,
	                 init_std=0.02,
	                 **kwargs):
	        """Constructs TransfoXLConfig.

	        See the class docstring for the meaning of each argument.
	        """
	        super(TransfoXLConfig, self).__init__(**kwargs)

	        # A `None` sentinel replaces the former mutable list default; the
	        # effective default values are unchanged.
	        if cutoffs is None:
	            cutoffs = [20000, 40000, 200000]

	        # `unicode` only exists on Python 2; the version guard short-circuits
	        # on Python 3 so the name is never evaluated there.
	        if isinstance(vocab_size_or_config_json_file, str) or (sys.version_info[0] == 2
	                and isinstance(vocab_size_or_config_json_file, unicode)):
	            # Path to a JSON config: read it, close the file, then copy every
	            # key straight into the instance dict.
	            with open(vocab_size_or_config_json_file, "r", encoding='utf-8') as reader:
	                json_config = json.load(reader)
	            for key, value in json_config.items():
	                self.__dict__[key] = value
	        elif isinstance(vocab_size_or_config_json_file, int):
	            self.n_token = vocab_size_or_config_json_file
	            # Copy so the caller's sequence is never aliased by the config.
	            self.cutoffs = list(cutoffs)
	            self.tie_weight = tie_weight
	            # First entry is always False; the remaining entries (one per
	            # cutoff) all follow `proj_share_all_but_first`.
	            self.tie_projs = [False] + [bool(proj_share_all_but_first)] * len(self.cutoffs)
	            self.d_model = d_model
	            self.d_embed = d_embed
	            self.d_head = d_head
	            self.d_inner = d_inner
	            self.div_val = div_val
	            self.pre_lnorm = pre_lnorm
	            self.n_layer = n_layer
	            self.n_head = n_head
	            self.tgt_len = tgt_len
	            self.ext_len = ext_len
	            self.mem_len = mem_len
	            self.same_length = same_length
	            self.attn_type = attn_type
	            self.clamp_len = clamp_len
	            self.sample_softmax = sample_softmax
	            self.adaptive = adaptive
	            self.dropout = dropout
	            self.dropatt = dropatt
	            self.untie_r = untie_r
	            self.init = init
	            self.init_range = init_range
	            self.proj_init_std = proj_init_std
	            self.init_std = init_std
	        else:
	            raise ValueError("First argument must be either a vocabulary size (int)"
	                             " or the path to a pretrained model config file (str)")

	    @property
	    def max_position_embeddings(self):
	        """Sum of `tgt_len`, `ext_len` and `mem_len`."""
	        return self.tgt_len + self.ext_len + self.mem_len

	    @property
	    def vocab_size(self):
	        """Alias for `n_token`."""
	        return self.n_token

	    @vocab_size.setter
	    def vocab_size(self, value):
	        self.n_token = value

	    @property
	    def hidden_size(self):
	        """Alias for `d_model`."""
	        return self.d_model

	    @property
	    def num_attention_heads(self):
	        """Alias for `n_head`."""
	        return self.n_head

	    @property
	    def num_hidden_layers(self):
	        """Alias for `n_layer`."""
	        return self.n_layer