# coding=utf-8
# Copyright 2018 Google AI, Google Brain and Carnegie Mellon University Authors and the HuggingFace Inc. team.
# Copyright (c) 2018, NVIDIA CORPORATION. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
""" Transformer XL configuration """
import logging

from .configuration_utils import PretrainedConfig

logger = logging.getLogger(__name__)
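# Map from pretrained model shortcut names to the URLs of their hosted configuration files.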
TRANSFO_XL_PRETRAINED_CONFIG_ARCHIVE_MAP = {
    "transfo-xl-wt103": "https://s3.amazonaws.com/models.huggingface.co/bert/transfo-xl-wt103-config.json",
}
class TransfoXLConfig(PretrainedConfig):
    """
    This is the configuration class to store the configuration of a :class:`~transformers.TransfoXLModel`.
    It is used to instantiate a Transformer XL model according to the specified arguments, defining the model
    architecture. Instantiating a configuration with the defaults will yield a similar configuration to that of
    the `Transformer XL <https://huggingface.co/transfo-xl-wt103>`__ architecture.

    Configuration objects inherit from :class:`~transformers.PretrainedConfig` and can be used
    to control the model outputs. Read the documentation from :class:`~transformers.PretrainedConfig`
    for more information.

    Args:
        vocab_size (:obj:`int`, optional, defaults to 267735):
            Vocabulary size of the Transformer XL model. Defines the different tokens that
            can be represented by the :obj:`input_ids` passed to the forward method of :class:`~transformers.TransfoXLModel`.
        cutoffs (:obj:`List[int]`, optional, defaults to :obj:`[20000, 40000, 200000]`):
            Cutoffs for the adaptive softmax.
        d_model (:obj:`int`, optional, defaults to 1024):
            Dimensionality of the model's hidden states.
        d_embed (:obj:`int`, optional, defaults to 1024):
            Dimensionality of the embeddings.
        n_head (:obj:`int`, optional, defaults to 16):
            Number of attention heads for each attention layer in the Transformer encoder.
        d_head (:obj:`int`, optional, defaults to 64):
            Dimensionality of the model's heads.
        d_inner (:obj:`int`, optional, defaults to 4096):
            Inner dimension of the feed-forward layers.
        div_val (:obj:`int`, optional, defaults to 4):
            Divisor value for the adaptive input and softmax.
        pre_lnorm (:obj:`boolean`, optional, defaults to :obj:`False`):
            Whether to apply LayerNorm to the input instead of the output.
        n_layer (:obj:`int`, optional, defaults to 18):
            Number of hidden layers in the Transformer encoder.
        tgt_len (:obj:`int`, optional, defaults to 128):
            Number of tokens to predict.
        ext_len (:obj:`int`, optional, defaults to 0):
            Length of the extended context.
        mem_len (:obj:`int`, optional, defaults to 1600):
            Length of the retained previous hidden states (the memory).
        clamp_len (:obj:`int`, optional, defaults to 1000):
            Use the same positional embeddings after clamp_len.
        same_length (:obj:`boolean`, optional, defaults to :obj:`True`):
            Whether to use the same attention length for all tokens.
        proj_share_all_but_first (:obj:`boolean`, optional, defaults to :obj:`True`):
            True to share all but the first projections, False not to share.
        attn_type (:obj:`int`, optional, defaults to 0):
            Attention type. 0 for Transformer-XL, 1 for Shaw et al, 2 for Vaswani et al, 3 for Al Rfou et al.
        sample_softmax (:obj:`int`, optional, defaults to -1):
            Number of samples in the sampled softmax.
        adaptive (:obj:`boolean`, optional, defaults to :obj:`True`):
            Whether to use the adaptive softmax.
        tie_weight (:obj:`boolean`, optional, defaults to :obj:`True`):
            Whether to tie the word embedding and softmax weights.
        dropout (:obj:`float`, optional, defaults to 0.1):
            The dropout probability for all fully connected layers in the embeddings, encoder, and pooler.
        dropatt (:obj:`float`, optional, defaults to 0):
            The dropout ratio for the attention probabilities.
        untie_r (:obj:`boolean`, optional, defaults to :obj:`True`):
            Whether to untie the relative position biases.
        init (:obj:`string`, optional, defaults to `normal`):
            Parameter initializer to use.
        init_range (:obj:`float`, optional, defaults to 0.01):
            Parameters initialized by U(-init_range, init_range).
        proj_init_std (:obj:`float`, optional, defaults to 0.01):
            Parameters initialized by N(0, proj_init_std).
        init_std (:obj:`float`, optional, defaults to 0.02):
            Parameters initialized by N(0, init_std).
        layer_norm_epsilon (:obj:`float`, optional, defaults to 1e-5):
            The epsilon to use in the layer normalization layers.

    Example::

        from transformers import TransfoXLConfig, TransfoXLModel

        # Initializing a Transformer XL configuration
        configuration = TransfoXLConfig()

        # Initializing a model from the configuration
        model = TransfoXLModel(configuration)

        # Accessing the model configuration
        configuration = model.config

    Attributes:
        pretrained_config_archive_map (Dict[str, str]):
            A dictionary containing all the available pre-trained checkpoints.
    """
    pretrained_config_archive_map = TRANSFO_XL_PRETRAINED_CONFIG_ARCHIVE_MAP
    model_type = "transfo-xl"

    def __init__(
        self,
        vocab_size=267735,
        cutoffs=[20000, 40000, 200000],
        d_model=1024,
        d_embed=1024,
        n_head=16,
        d_head=64,
        d_inner=4096,
        div_val=4,
        pre_lnorm=False,
        n_layer=18,
        tgt_len=128,
        ext_len=0,
        mem_len=1600,
        clamp_len=1000,
        same_length=True,
        proj_share_all_but_first=True,
        attn_type=0,
        sample_softmax=-1,
        adaptive=True,
        tie_weight=True,
        dropout=0.1,
        dropatt=0.0,
        untie_r=True,
        init="normal",
        init_range=0.01,
        proj_init_std=0.01,
        init_std=0.02,
        layer_norm_epsilon=1e-5,
        **kwargs
    ):
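        """Constructs TransfoXLConfig."""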
        super().__init__(**kwargs)
        self.vocab_size = vocab_size
        self.cutoffs = []
        self.cutoffs.extend(cutoffs)
        self.tie_weight = tie_weight
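        # One flag per adaptive-softmax cluster: if True, the output projection of that
        # cluster is tied to the corresponding adaptive input embedding projection.
        # With `proj_share_all_but_first`, every cluster except the first (head) cluster
        # shares its projection.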
        if proj_share_all_but_first:
            self.tie_projs = [False] + [True] * len(self.cutoffs)
        else:
            self.tie_projs = [False] + [False] * len(self.cutoffs)
        self.d_model = d_model
        self.d_embed = d_embed
        self.d_head = d_head
        self.d_inner = d_inner
        self.div_val = div_val
        self.pre_lnorm = pre_lnorm
        self.n_layer = n_layer
        self.n_head = n_head
        self.tgt_len = tgt_len
        self.ext_len = ext_len
        self.mem_len = mem_len
        self.same_length = same_length
        self.attn_type = attn_type
        self.clamp_len = clamp_len
        self.sample_softmax = sample_softmax
        self.adaptive = adaptive
        self.dropout = dropout
        self.dropatt = dropatt
        self.untie_r = untie_r
        self.init = init
        self.init_range = init_range
        self.proj_init_std = proj_init_std
        self.init_std = init_std
        self.layer_norm_epsilon = layer_norm_epsilon
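    # The properties below map the generic names used throughout the library
    # (max_position_embeddings, hidden_size, num_attention_heads, num_hidden_layers)
    # onto Transformer-XL's native parameter names, and keep the legacy `n_token`
    # attribute working as an alias for `vocab_size`.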
    @property
    def max_position_embeddings(self):
        return self.tgt_len + self.ext_len + self.mem_len

    @property
    def n_token(self):  # Backward compatibility
        return self.vocab_size

    @n_token.setter
    def n_token(self, value):  # Backward compatibility
        self.vocab_size = value

    @property
    def hidden_size(self):
        return self.d_model

    @property
    def num_attention_heads(self):
        return self.n_head

    @property
    def num_hidden_layers(self):
        return self.n_layer
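# A minimal usage sketch, kept as comments so importing this module stays side-effect
# free. `from_pretrained` and `to_dict` are inherited from PretrainedConfig, and keyword
# arguments passed to `from_pretrained` override the corresponding configuration fields:
#
#     config = TransfoXLConfig.from_pretrained("transfo-xl-wt103", dropout=0.2)
#     assert config.dropout == 0.2
#     print(config.to_dict())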