diff --git a/.gitattributes b/.gitattributes
index fb55ab1be9cfc79e062d44758e8e54c1bf486a5d..c9695f200404aeb644165b61cf026806a2ef64b2 100644
--- a/.gitattributes
+++ b/.gitattributes
@@ -38,3 +38,4 @@ fairseq/examples/MMPT/videoclip.png filter=lfs diff=lfs merge=lfs -text
 fairseq/alignment_train_cuda_binding.cpython-310-x86_64-linux-gnu.so filter=lfs diff=lfs merge=lfs -text
 fairseq/alignment_train_cpu_binding.cpython-310-x86_64-linux-gnu.so filter=lfs diff=lfs merge=lfs -text
 fairseq/docs/fairseq.gif filter=lfs diff=lfs merge=lfs -text
+fairseq/examples/hubert/tests/6313-76958-0021.flac filter=lfs diff=lfs merge=lfs -text
diff --git a/fairseq/examples/hubert/tests/6313-76958-0021.flac b/fairseq/examples/hubert/tests/6313-76958-0021.flac
new file mode 100644
index 0000000000000000000000000000000000000000..a3de1e1e9d7ea573e77cbbc33895178908622bb5
--- /dev/null
+++ b/fairseq/examples/hubert/tests/6313-76958-0021.flac
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:0dd89e2bf8c60e05264bf2539f0ec35bc65efb55cd952b3510fa243de9cc16ff
+size 223912
diff --git a/fairseq/examples/pointer_generator/README.xsum.md b/fairseq/examples/pointer_generator/README.xsum.md
new file mode 100644
index 0000000000000000000000000000000000000000..ac3a8c3ddc96cd9810b45d49f6b361e43de1e9fb
--- /dev/null
+++ b/fairseq/examples/pointer_generator/README.xsum.md
@@ -0,0 +1,180 @@
+## Training a pointer-generator model on the Extreme Summarization dataset
+
+##### 1. Download the Extreme Summarization data and preprocess it
+
+Follow the instructions [here](https://github.com/EdinburghNLP/XSum) to obtain
+the original Extreme Summarization dataset. You should have six files,
+{train,validation,test}.{document,summary}.
+
+##### 2. Create a vocabulary and extend it with source position markers
+
+```bash
+vocab_size=10000
+position_markers=1000
+export LC_ALL=C
+cat train.document train.summary |
+  tr -s '[:space:]' '\n' |
+  sort |
+  uniq -c |
+  sort -k1,1bnr -k2 |
+  head -n "$((vocab_size - 4))" |
+  awk '{ print $2 " " $1 }' >dict.pg.txt
+python3 -c "[print('<unk-{}> 0'.format(n)) for n in range($position_markers)]" >>dict.pg.txt
+```
+
+This creates the file dict.pg.txt that contains the 10k most frequent words,
+followed by 1k source position markers:
+
+```
+the 4954867
+. 4157552
+, 3439668
+to 2212159
+a 1916857
+of 1916820
+and 1823350
+...
+<unk-0> 0
+<unk-1> 0
+<unk-2> 0
+<unk-3> 0
+<unk-4> 0
+...
+```
+
+##### 3. Preprocess the text data
+
+```bash
+./preprocess.py --source train.document --target train.summary --vocab <(cut -d' ' -f1 dict.pg.txt) --source-out train.pg.src --target-out train.pg.tgt
+./preprocess.py --source validation.document --target validation.summary --vocab <(cut -d' ' -f1 dict.pg.txt) --source-out valid.pg.src --target-out valid.pg.tgt
+./preprocess.py --source test.document --vocab <(cut -d' ' -f1 dict.pg.txt) --source-out test.pg.src
+```
+
+The data should now contain `<unk-N>` tokens in place of out-of-vocabulary words.
+
+##### 4. Binarize the dataset
+
+```bash
+fairseq-preprocess \
+    --source-lang src \
+    --target-lang tgt \
+    --trainpref train.pg \
+    --validpref valid.pg \
+    --destdir bin \
+    --workers 60 \
+    --srcdict dict.pg.txt \
+    --joined-dictionary
+```
+
+##### 5. Train a model
+
+```bash
+total_updates=20000
+warmup_updates=500
+lr=0.001
+max_tokens=4096
+update_freq=4
+pointer_layer=-2
+
+CUDA_VISIBLE_DEVICES=0,1,2,3,4,5,6,7 fairseq-train bin \
+    --user-dir examples/pointer_generator/pointer_generator_src \
+    --max-tokens "$max_tokens" \
+    --task translation \
+    --source-lang src --target-lang tgt \
+    --truncate-source \
+    --layernorm-embedding \
+    --share-all-embeddings \
+    --encoder-normalize-before \
+    --decoder-normalize-before \
+    --required-batch-size-multiple 1 \
+    --arch transformer_pointer_generator \
+    --alignment-layer "$pointer_layer" \
+    --alignment-heads 1 \
+    --source-position-markers 1000 \
+    --criterion label_smoothed_cross_entropy \
+    --label-smoothing 0.1 \
+    --dropout 0.1 --attention-dropout 0.1 \
+    --weight-decay 0.01 --optimizer adam --adam-betas "(0.9, 0.999)" --adam-eps 1e-08 \
+    --clip-norm 0.1 \
+    --lr-scheduler inverse_sqrt --lr "$lr" --max-update "$total_updates" --warmup-updates "$warmup_updates" \
+    --update-freq "$update_freq" \
+    --skip-invalid-size-inputs-valid-test
+```
+
+Above we specify that our dictionary contains 1000 source position markers, and
+that we want to use one attention head from the penultimate decoder layer for
+pointing. It should run in 5.5 hours on one node with eight 32GB V100 GPUs. The
+logged messages confirm that dictionary indices above 10000 will be mapped to
+the `<unk>` embedding:
+
+```
+2020-09-24 20:43:53 | INFO | fairseq.tasks.translation | [src] dictionary: 11000 types
+2020-09-24 20:43:53 | INFO | fairseq.tasks.translation | [tgt] dictionary: 11000 types
+2020-09-24 20:43:53 | INFO | fairseq.data.data_utils | loaded 11332 examples from: bin/valid.src-tgt.src
+2020-09-24 20:43:53 | INFO | fairseq.data.data_utils | loaded 11332 examples from: bin/valid.src-tgt.tgt
+2020-09-24 20:43:53 | INFO | fairseq.tasks.translation | bin valid src-tgt 11332 examples
+2020-09-24 20:43:53 | INFO | fairseq.models.transformer_pg | dictionary indices from 10000 to 10999 will be mapped to 3
+```
+
+##### 6. Summarize the test sequences
+
+```bash
+batch_size=32
+beam_size=6
+max_length=60
+length_penalty=1.0
+
+fairseq-interactive bin \
+    --user-dir examples/pointer_generator/pointer_generator_src \
+    --batch-size "$batch_size" \
+    --task translation \
+    --source-lang src --target-lang tgt \
+    --path checkpoints/checkpoint_last.pt \
+    --input test.pg.src \
+    --buffer-size 200 \
+    --max-len-a 0 \
+    --max-len-b "$max_length" \
+    --lenpen "$length_penalty" \
+    --beam "$beam_size" \
+    --skip-invalid-size-inputs-valid-test |
+    tee generate.out
+grep ^H generate.out | cut -f 3- >generate.hyp
+```
+
+Now you should have the generated sequences in `generate.hyp`. They contain
+`<unk-N>` tokens that the model has copied from the source sequence. In order to
+retrieve the original words, we need the unprocessed source sequences from
+`test.document`.
+
+##### 7. Process the generated output
+
+Because we skipped over-long inputs when producing `generate.hyp`, we also have
+to skip the same over-long sequences when reading `test.document`.
+
+```bash
+./postprocess.py \
+    --source <(awk 'NF<1024' test.document) \
+    --target generate.hyp \
+    --target-out generate.hyp.processed
+```
+
+Now you'll find the final sequences in `generate.hyp.processed`, with each
+`<unk-N>` token replaced by the original word from the source sequence.
+
+##### An example of a summarized sequence
+
+The original source document in `test.document`:
+
+> de roon moved to teesside in june 2016 for an initial # 8.8 m fee and played 33 premier league games last term . 
the netherlands international , 26 , scored five goals in 36 league and cup games during his spell at boro . meanwhile , manager garry monk confirmed the championship club 's interest in signing chelsea midfielder lewis baker . `` he 's a target and one of many that we 've had throughout the summer months , '' said monk . find all the latest football transfers on our dedicated page . + +The preprocessed source document in `test.src.pg`: + +> de \ moved to \ in june 2016 for an initial # \ m fee and played 33 premier league games last term . the netherlands international , 26 , scored five goals in 36 league and cup games during his spell at boro . meanwhile , manager garry monk confirmed the championship club 's interest in signing chelsea midfielder lewis baker . `` he 's a target and one of many that we 've had throughout the summer months , '' said monk . find all the latest football transfers on our dedicated page . + +The generated summary in `generate.hyp`: + +> middlesbrough striker \ de \ has joined spanish side \ on a season-long loan . + +The generated summary after postprocessing in `generate.hyp.processed`: + +> middlesbrough striker \ de roon has joined spanish side \ on a season-long loan . diff --git a/fairseq/examples/pointer_generator/pointer_generator_src/__init__.py b/fairseq/examples/pointer_generator/pointer_generator_src/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..c361ff6bd616512fe2521387665de1ad1aff66d0 --- /dev/null +++ b/fairseq/examples/pointer_generator/pointer_generator_src/__init__.py @@ -0,0 +1,6 @@ +# Copyright (c) Facebook, Inc. and its affiliates. +# +# This source code is licensed under the MIT license found in the +# LICENSE file in the root directory of this source tree. + +from . import transformer_pg # noqa diff --git a/fairseq/examples/pointer_generator/pointer_generator_src/transformer_pg.py b/fairseq/examples/pointer_generator/pointer_generator_src/transformer_pg.py new file mode 100644 index 0000000000000000000000000000000000000000..4ccf30f4eb154f8fab1e285934fb973a2d1166cb --- /dev/null +++ b/fairseq/examples/pointer_generator/pointer_generator_src/transformer_pg.py @@ -0,0 +1,518 @@ +# Copyright (c) Facebook, Inc. and its affiliates. +# +# This source code is licensed under the MIT license found in the +# LICENSE file in the root directory of this source tree. + +import logging +from typing import Any, Dict, Optional, List, Tuple + +import torch +import torch.nn as nn +from fairseq import utils +from fairseq.models import register_model, register_model_architecture +from fairseq.models.transformer import ( + DEFAULT_MAX_SOURCE_POSITIONS, + DEFAULT_MAX_TARGET_POSITIONS, + TransformerDecoder, + TransformerEncoder, + TransformerModel, + base_architecture, +) +from torch import Tensor + + +logger = logging.getLogger(__name__) + + +@register_model("transformer_pointer_generator") +class TransformerPointerGeneratorModel(TransformerModel): + """ + Transformer model from `"Attention Is All You Need" (Vaswani et al, 2017) + `_, augmented with a pointer-generator + network from `"Get To The Point: Summarization with Pointer-Generator + Networks" (See et al, 2017) `_. + + Args: + encoder (TransformerPointerGeneratorEncoder): the encoder + decoder (TransformerPointerGeneratorDecoder): the decoder + + The Transformer pointer-generator model provides the following named + architectures and command-line arguments: + + .. 
argparse:: + :ref: fairseq.models.transformer_pointer_generator_parser + :prog: + """ + + @staticmethod + def add_args(parser): + """Add model-specific arguments to the parser.""" + # fmt: off + TransformerModel.add_args(parser) + parser.add_argument('--alignment-heads', type=int, metavar='N', + help='number of attention heads to be used for ' + 'pointing') + parser.add_argument('--alignment-layer', type=int, metavar='I', + help='layer number to be used for pointing (0 ' + 'corresponding to the bottommost layer)') + parser.add_argument('--source-position-markers', type=int, metavar='N', + help='dictionary includes N additional items that ' + 'represent an OOV token at a particular input ' + 'position') + parser.add_argument('--force-generation', type=float, metavar='P', + default=None, + help='set the vocabulary distribution weight to P, ' + 'instead of predicting it from the input (1.0 ' + 'corresponding to generation, 0.0 to pointing)') + # fmt: on + + @classmethod + def build_model(cls, args, task): + """Build a new model instance.""" + + # make sure all arguments are present in older models + base_architecture(args) + + if args.encoder_layers_to_keep: + args.encoder_layers = len(args.encoder_layers_to_keep.split(",")) + if args.decoder_layers_to_keep: + args.decoder_layers = len(args.decoder_layers_to_keep.split(",")) + + if getattr(args, "max_source_positions", None) is None: + args.max_source_positions = DEFAULT_MAX_SOURCE_POSITIONS + if getattr(args, "max_target_positions", None) is None: + args.max_target_positions = DEFAULT_MAX_TARGET_POSITIONS + if getattr(args, "source_position_markers", None) is None: + args.source_position_markers = args.max_source_positions + + src_dict, tgt_dict = task.source_dictionary, task.target_dictionary + if src_dict != tgt_dict: + raise ValueError("Pointer-generator requires a joined dictionary") + + def build_embedding(dictionary, embed_dim, path=None): + # The dictionary may include additional items that can be used in + # place of the normal OOV token and that all map to the same + # embedding. Using a different token for each input position allows + # one to restore the word identities from the original source text. 
+ num_embeddings = len(dictionary) - args.source_position_markers + padding_idx = dictionary.pad() + unk_idx = dictionary.unk() + logger.info( + "dictionary indices from {0} to {1} will be mapped to {2}".format( + num_embeddings, len(dictionary) - 1, unk_idx + ) + ) + emb = Embedding(num_embeddings, embed_dim, padding_idx, unk_idx) + # if provided, load from preloaded dictionaries + if path: + embed_dict = utils.parse_embedding(path) + utils.load_embedding(embed_dict, dictionary, emb) + return emb + + if args.share_all_embeddings: + if args.encoder_embed_dim != args.decoder_embed_dim: + raise ValueError( + "--share-all-embeddings requires --encoder-embed-dim to match --decoder-embed-dim" + ) + if args.decoder_embed_path and ( + args.decoder_embed_path != args.encoder_embed_path + ): + raise ValueError( + "--share-all-embeddings not compatible with --decoder-embed-path" + ) + encoder_embed_tokens = build_embedding( + src_dict, args.encoder_embed_dim, args.encoder_embed_path + ) + decoder_embed_tokens = encoder_embed_tokens + args.share_decoder_input_output_embed = True + else: + encoder_embed_tokens = build_embedding( + src_dict, args.encoder_embed_dim, args.encoder_embed_path + ) + decoder_embed_tokens = build_embedding( + tgt_dict, args.decoder_embed_dim, args.decoder_embed_path + ) + + encoder = cls.build_encoder(args, src_dict, encoder_embed_tokens) + decoder = cls.build_decoder(args, tgt_dict, decoder_embed_tokens) + return cls(args, encoder, decoder) + + @classmethod + def build_encoder(cls, args, src_dict, embed_tokens): + return TransformerPointerGeneratorEncoder(args, src_dict, embed_tokens) + + @classmethod + def build_decoder(cls, args, tgt_dict, embed_tokens): + return TransformerPointerGeneratorDecoder(args, tgt_dict, embed_tokens) + + +class TransformerPointerGeneratorEncoder(TransformerEncoder): + """ + Transformer encoder consisting of *args.encoder_layers* layers. Each layer + is a :class:`TransformerEncoderLayer`. The pointer-generator variant adds + the source tokens to the encoder output as these are otherwise not passed + to the decoder. + """ + + def forward( + self, + src_tokens, + src_lengths: Optional[Tensor] = None, + return_all_hiddens: bool = False, + token_embeddings: Optional[Tensor] = None + ): + """ + Runs the `forward()` method of the parent Transformer class. Then adds + the source tokens into the encoder output tuple. + + While it might be more elegant that the model would pass the source + tokens to the `forward()` method of the decoder too, this would require + changes to `SequenceGenerator`. + + Args: + src_tokens (torch.LongTensor): tokens in the source language of + shape `(batch, src_len)` + src_lengths (torch.LongTensor): lengths of each source sentence of + shape `(batch)` + return_all_hiddens (bool, optional): also return all of the + intermediate hidden states (default: False). + token_embeddings (torch.Tensor, optional): precomputed embeddings + default `None` will recompute embeddings + + Returns: + namedtuple: + - **encoder_out** (Tensor): the last encoder layer's output of + shape `(src_len, batch, embed_dim)` + - **encoder_padding_mask** (ByteTensor): the positions of + padding elements of shape `(batch, src_len)` + - **encoder_embedding** (Tensor): the (scaled) embedding lookup + of shape `(batch, src_len, embed_dim)` + - **encoder_states** (List[Tensor]): all intermediate + hidden states of shape `(src_len, batch, embed_dim)`. + Only populated if *return_all_hiddens* is True. 
+ - **src_tokens** (Tensor): input token ids of shape + `(batch, src_len)` + """ + encoder_out = self.forward_scriptable(src_tokens, + src_lengths, + return_all_hiddens, + token_embeddings) + + # The Pytorch Mobile lite interpreter does not supports returning NamedTuple in + # `forward` so we use a dictionary instead. + # TorchScript does not support mixed values so the values are all lists. + # The empty list is equivalent to None. + return { + "encoder_out": encoder_out["encoder_out"], # T x B x C + "encoder_padding_mask": encoder_out["encoder_padding_mask"], # B x T + "encoder_embedding": encoder_out["encoder_embedding"], # B x T x C + "encoder_states": encoder_out["encoder_states"], # List[T x B x C] + "src_tokens": [src_tokens], # B x T + "src_lengths": [], + } + + +class TransformerPointerGeneratorDecoder(TransformerDecoder): + """ + Transformer decoder consisting of *args.decoder_layers* layers. Each layer + is a :class:`TransformerDecoderLayer`. The pointer-generator variant mixes + the output probabilities with an attention distribution in the output layer. + + Args: + args (argparse.Namespace): parsed command-line arguments + dictionary (~fairseq.data.Dictionary): decoding dictionary + embed_tokens (torch.nn.Embedding): output embedding + """ + + def __init__(self, args, dictionary, embed_tokens): + super().__init__(args, dictionary, embed_tokens, no_encoder_attn=False) + + # In the pointer-generator model these arguments define the decoder + # layer and the number of attention heads that will be averaged to + # create the alignment for pointing. + self.alignment_heads = args.alignment_heads + self.alignment_layer = args.alignment_layer + + input_embed_dim = embed_tokens.embedding_dim + + # Generation probabilities / interpolation coefficients are predicted + # from the current decoder input embedding and the decoder output, which + # is the size of output_embed_dim. + p_gen_input_size = input_embed_dim + self.output_embed_dim + self.project_p_gens = nn.Linear(p_gen_input_size, 1) + nn.init.zeros_(self.project_p_gens.bias) + + # The dictionary may include a separate entry for an OOV token in each + # input position, so that their identity can be restored from the + # original source text. 
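+        # For example, with the 10k-word vocabulary and
+        # --source-position-markers 1000 from README.xsum.md, num_types is
+        # 11000, num_oov_types is 1000 and num_embeddings is 10000; the
+        # <unk-N> entries at the end of the dictionary all share the <unk>
+        # embedding (see the Embedding class below).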
+ self.num_types = len(dictionary) + self.num_oov_types = args.source_position_markers + self.num_embeddings = self.num_types - self.num_oov_types + self.force_p_gen = args.force_generation + + def forward( + self, + prev_output_tokens, + encoder_out: Optional[Dict[str, List[Tensor]]] = None, + incremental_state: Optional[Dict[str, Dict[str, Optional[Tensor]]]] = None, + features_only: bool = False, + alignment_layer: Optional[int] = 0, + alignment_heads: Optional[int] = 1, + src_lengths: Optional[Any] = None, + return_all_hiddens: bool = False, + ): + """ + Args: + prev_output_tokens (LongTensor): previous decoder outputs of shape + `(batch, tgt_len)`, for teacher forcing + encoder_out (optional): output from the encoder, used for + encoder-side attention + incremental_state (dict, optional): dictionary used for storing + state during :ref:`Incremental decoding` + features_only (bool, optional): only return features without + applying output layer (default: False) + alignment_layer (int, optional): 0-based index of the layer to be + used for pointing (default: 0) + alignment_heads (int, optional): number of attention heads to be + used for pointing (default: 1) + + Returns: + tuple: + - the decoder's output of shape `(batch, tgt_len, vocab)` + - a dictionary with any model-specific outputs + """ + # The normal Transformer model doesn't pass the alignment_layer and + # alignment_heads parameters correctly. We use our local variables. + x, extra = self.extract_features( + prev_output_tokens, + encoder_out=encoder_out, + incremental_state=incremental_state, + alignment_layer=self.alignment_layer, + alignment_heads=self.alignment_heads, + ) + if not features_only: + # Embedding the tokens again for generation probability prediction, + # so that we don't have to reimplement the whole extract_features() + # method. + if incremental_state is not None: + prev_output_tokens = prev_output_tokens[:, -1:] + prev_output_embed = self.embed_tokens(prev_output_tokens) + prev_output_embed *= self.embed_scale + predictors = torch.cat((prev_output_embed, x), 2) + p_gens = self.project_p_gens(predictors) + p_gens = torch.sigmoid(p_gens.float()) + # Torchscript complains if encoder_out or attn are None because + # `output_layer()` signature expects tensors instead + attn: Optional[Tensor] = extra["attn"][0] + assert encoder_out is not None + assert attn is not None + x = self.output_layer(x, attn, encoder_out["src_tokens"][0], p_gens) + return x, extra + + def output_layer( + self, + features: Tensor, + attn: Tensor, + src_tokens: Tensor, + p_gens: Tensor + ) -> Tensor: + """ + Project features to the vocabulary size and mix with the attention + distributions. + """ + if self.force_p_gen is not None: + p_gens = self.force_p_gen + + # project back to size of vocabulary + if self.adaptive_softmax is None: + logits = self.output_projection(features) + else: + logits = features + + batch_size = logits.shape[0] + output_length = logits.shape[1] + assert logits.shape[2] == self.num_embeddings + assert src_tokens.shape[0] == batch_size + src_length = src_tokens.shape[1] + + # The final output distribution will be a mixture of the normal output + # distribution (softmax of logits) and attention weights. 
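+        # Concretely, for each target position and vocabulary item w this
+        # method returns
+        #     p(w) = p_gen * softmax(logits)[w]
+        #            + (1 - p_gen) * sum of attention weights on source tokens equal to w,
+        # where p_gen is the generation probability computed in forward() (or
+        # fixed via --force-generation). The two terms are computed as
+        # gen_dists and attn_dists below and summed at the end of this method.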
+ gen_dists = self.get_normalized_probs_scriptable( + (logits, None), log_probs=False, sample=None + ) + gen_dists = torch.mul(gen_dists, p_gens) + padding_size = (batch_size, output_length, self.num_oov_types) + padding = gen_dists.new_zeros(padding_size) + gen_dists = torch.cat((gen_dists, padding), 2) + assert gen_dists.shape[2] == self.num_types + + # Scatter attention distributions to distributions over the extended + # vocabulary in a tensor of shape [batch_size, output_length, + # vocab_size]. Each attention weight will be written into a location + # that is for other dimensions the same as in the index tensor, but for + # the third dimension it's the value of the index tensor (the token ID). + attn = torch.mul(attn.float(), 1 - p_gens) + index = src_tokens[:, None, :] + index = index.expand(batch_size, output_length, src_length) + attn_dists_size = (batch_size, output_length, self.num_types) + attn_dists = attn.new_zeros(attn_dists_size) + attn_dists.scatter_add_(2, index, attn.float()) + + # Final distributions, [batch_size, output_length, num_types]. + return gen_dists + attn_dists + + def get_normalized_probs( + self, + net_output: Tuple[Tensor, Optional[Dict[str, List[Optional[Tensor]]]]], + log_probs: bool, + sample: Optional[Dict[str, Tensor]] = None, + ): + """ + Get normalized probabilities (or log probs) from a net's output. + Pointer-generator network output is already normalized. + """ + probs = net_output[0] + # Make sure the probabilities are greater than zero when returning log + # probabilities. + return probs.clamp(1e-10, 1.0).log() if log_probs else probs + + +class Embedding(nn.Embedding): + r"""A simple lookup table that stores embeddings of a fixed dictionary and size. + This module is often used to store word embeddings and retrieve them using indices. + The input to the module is a list of indices, and the output is the corresponding + word embeddings. This subclass differs from the standard PyTorch Embedding class by + allowing additional vocabulary entries that will be mapped to the unknown token + embedding. + Args: + num_embeddings (int): size of the dictionary of embeddings + embedding_dim (int): the size of each embedding vector + padding_idx (int): Pads the output with the embedding vector at :attr:`padding_idx` + (initialized to zeros) whenever it encounters the index. + unk_idx (int): Maps all token indices that are greater than or equal to + num_embeddings to this index. + Attributes: + weight (Tensor): the learnable weights of the module of shape (num_embeddings, embedding_dim) + initialized from :math:`\mathcal{N}(0, 1)` + Shape: + - Input: :math:`(*)`, LongTensor of arbitrary shape containing the indices to extract + - Output: :math:`(*, H)`, where `*` is the input shape and :math:`H=\text{embedding\_dim}` + .. note:: + Keep in mind that only a limited number of optimizers support + sparse gradients: currently it's :class:`optim.SGD` (`CUDA` and `CPU`), + :class:`optim.SparseAdam` (`CUDA` and `CPU`) and :class:`optim.Adagrad` (`CPU`) + .. note:: + With :attr:`padding_idx` set, the embedding vector at + :attr:`padding_idx` is initialized to all zeros. However, note that this + vector can be modified afterwards, e.g., using a customized + initialization method, and thus changing the vector used to pad the + output. The gradient for this vector from :class:`~torch.nn.Embedding` + is always zero. 
+ """ + __constants__ = ["unk_idx"] + + # Torchscript: Inheriting from Embedding class produces an error when exporting to Torchscript + # -> RuntimeError: Unable to cast Python instance to C++ type (compile in debug mode for details + # It's happening because max_norm attribute from nn.Embedding is None by default and it cannot be + # cast to a C++ type + def __init__( + self, + num_embeddings: int, + embedding_dim: int, + padding_idx: Optional[int], + unk_idx: int, + max_norm: Optional[float] = float("inf"), + ): + super().__init__(num_embeddings, embedding_dim, padding_idx=padding_idx, max_norm=max_norm) + self.unk_idx = unk_idx + nn.init.normal_(self.weight, mean=0, std=embedding_dim ** -0.5) + nn.init.constant_(self.weight[padding_idx], 0) + + def forward(self, input): + input = torch.where( + input >= self.num_embeddings, torch.ones_like(input) * self.unk_idx, input + ) + return nn.functional.embedding( + input, self.weight, self.padding_idx, self.max_norm, + self.norm_type, self.scale_grad_by_freq, self.sparse + ) + + +@register_model_architecture( + "transformer_pointer_generator", "transformer_pointer_generator" +) +def transformer_pointer_generator(args): + args.alignment_heads = getattr(args, "alignment_heads", 1) + args.alignment_layer = getattr(args, "alignment_layer", -1) + base_architecture(args) + if args.alignment_layer < 0: + args.alignment_layer = args.decoder_layers + args.alignment_layer + + +@register_model_architecture( + "transformer_pointer_generator", "transformer_pointer_generator_iwslt_de_en" +) +def transformer_pointer_generator_iwslt_de_en(args): + args.encoder_embed_dim = getattr(args, "encoder_embed_dim", 512) + args.encoder_ffn_embed_dim = getattr(args, "encoder_ffn_embed_dim", 1024) + args.encoder_attention_heads = getattr(args, "encoder_attention_heads", 4) + args.encoder_layers = getattr(args, "encoder_layers", 6) + args.decoder_embed_dim = getattr(args, "decoder_embed_dim", 512) + args.decoder_ffn_embed_dim = getattr(args, "decoder_ffn_embed_dim", 1024) + args.decoder_attention_heads = getattr(args, "decoder_attention_heads", 4) + args.decoder_layers = getattr(args, "decoder_layers", 6) + transformer_pointer_generator(args) + + +@register_model_architecture( + "transformer_pointer_generator", "transformer_pointer_generator_wmt_en_de" +) +def transformer_pointer_generator_wmt_en_de(args): + transformer_pointer_generator(args) + + +# Transformer pointer-generator with the base Transformer parameters as used in +# the "Attention Is All You Need" paper (Vaswani et al., 2017) +@register_model_architecture( + "transformer_pointer_generator", + "transformer_pointer_generator_vaswani_wmt_en_de_big", +) +def transformer_pointer_generator_vaswani_wmt_en_de_big(args): + args.encoder_embed_dim = getattr(args, "encoder_embed_dim", 1024) + args.encoder_ffn_embed_dim = getattr(args, "encoder_ffn_embed_dim", 4096) + args.encoder_attention_heads = getattr(args, "encoder_attention_heads", 16) + args.encoder_normalize_before = getattr(args, "encoder_normalize_before", False) + args.decoder_embed_dim = getattr(args, "decoder_embed_dim", 1024) + args.decoder_ffn_embed_dim = getattr(args, "decoder_ffn_embed_dim", 4096) + args.decoder_attention_heads = getattr(args, "decoder_attention_heads", 16) + args.dropout = getattr(args, "dropout", 0.3) + transformer_pointer_generator(args) + + +@register_model_architecture( + "transformer_pointer_generator", + "transformer_pointer_generator_vaswani_wmt_en_fr_big", +) +def transformer_pointer_generator_vaswani_wmt_en_fr_big(args): + 
args.dropout = getattr(args, "dropout", 0.1)
+    transformer_pointer_generator_vaswani_wmt_en_de_big(args)
+
+
+@register_model_architecture(
+    "transformer_pointer_generator", "transformer_pointer_generator_wmt_en_de_big"
+)
+def transformer_pointer_generator_wmt_en_de_big(args):
+    args.attention_dropout = getattr(args, "attention_dropout", 0.1)
+    transformer_pointer_generator_vaswani_wmt_en_de_big(args)
+
+
+# default parameters used in tensor2tensor implementation
+@register_model_architecture(
+    "transformer_pointer_generator", "transformer_pointer_generator_wmt_en_de_big_t2t"
+)
+def transformer_pointer_generator_wmt_en_de_big_t2t(args):
+    args.encoder_normalize_before = getattr(args, "encoder_normalize_before", True)
+    args.decoder_normalize_before = getattr(args, "decoder_normalize_before", True)
+    args.attention_dropout = getattr(args, "attention_dropout", 0.1)
+    args.activation_dropout = getattr(args, "activation_dropout", 0.1)
+    transformer_pointer_generator_vaswani_wmt_en_de_big(args)
diff --git a/fairseq/examples/pointer_generator/postprocess.py b/fairseq/examples/pointer_generator/postprocess.py
new file mode 100644
index 0000000000000000000000000000000000000000..b213aed80fd1e3d86f975256fcb7d9d4c16ca857
--- /dev/null
+++ b/fairseq/examples/pointer_generator/postprocess.py
@@ -0,0 +1,96 @@
+#!/usr/bin/env python3
+# Copyright (c) Facebook, Inc. and its affiliates.
+#
+# This source code is licensed under the MIT license found in the
+# LICENSE file in the root directory of this source tree.
+
+import argparse
+import re
+import sys
+
+
+class OOVIndexError(IndexError):
+    def __init__(self, pos, source_seq, target_seq):
+        super(OOVIndexError, self).__init__(
+            "A <unk-N> tag in the target sequence refers to a position that is "
+            "outside the source sequence. Most likely there was a mismatch in "
+            "provided source and target sequences. Otherwise this would mean that "
+            "the pointing mechanism somehow attended to a position that is past "
+            "the actual sequence end."
+        )
+        self.source_pos = pos
+        self.source_seq = source_seq
+        self.target_seq = target_seq
+
+
+def replace_oovs(source_in, target_in, target_out):
+    """Replaces <unk-N> tokens in the target text with the corresponding word in
+    the source text.
+    """
+
+    oov_re = re.compile(r"^<unk-(\d+)>$")
+
+    for source_seq, target_seq in zip(source_in, target_in):
+        target_seq_out = []
+
+        pos_to_word = source_seq.strip().split()
+        for token in target_seq.strip().split():
+            m = oov_re.match(token)
+            if m:
+                pos = int(m.group(1))
+                if pos >= len(pos_to_word):
+                    raise OOVIndexError(pos, source_seq, target_seq)
+                token_out = pos_to_word[pos]
+            else:
+                token_out = token
+            target_seq_out.append(token_out)
+        target_out.write(" ".join(target_seq_out) + "\n")
+
+
+def main():
+    parser = argparse.ArgumentParser(
+        description="Replaces <unk-N> tokens in target sequences with words from "
+        "the corresponding position in the source sequence."
+ ) + parser.add_argument( + "--source", type=str, help="text file with source sequences", required=True + ) + parser.add_argument( + "--target", type=str, help="text file with target sequences", required=True + ) + parser.add_argument( + "--target-out", + type=str, + help="where to write target sequences without " "entries", + required=True, + ) + args = parser.parse_args() + + target_in = ( + open(args.target, "r", encoding="utf-8") if args.target is not None else None + ) + target_out = ( + open(args.target_out, "w", encoding="utf-8") + if args.target_out is not None + else None + ) + with open(args.source, "r", encoding="utf-8") as source_in, open( + args.target, "r", encoding="utf-8" + ) as target_in, open(args.target_out, "w", encoding="utf-8") as target_out: + replace_oovs(source_in, target_in, target_out) + + +if __name__ == "__main__": + try: + main() + except OOVIndexError as e: + print(e, file=sys.stderr) + print("Source sequence:", e.source_seq.strip(), file=sys.stderr) + print("Target sequence:", e.target_seq.strip(), file=sys.stderr) + print( + "Source sequence length:", + len(e.source_seq.strip().split()), + file=sys.stderr, + ) + print("The offending tag points to:", e.source_pos) + sys.exit(2) diff --git a/fairseq/examples/quant_noise/transformer_quantization_config.yaml b/fairseq/examples/quant_noise/transformer_quantization_config.yaml new file mode 100644 index 0000000000000000000000000000000000000000..d4be14a93a3593f8e6dc66c3b05061bfdde3e0e0 --- /dev/null +++ b/fairseq/examples/quant_noise/transformer_quantization_config.yaml @@ -0,0 +1,33 @@ +# Copyright (c) Facebook, Inc. and its affiliates. +# +# This source code is licensed under the MIT license found in the +# LICENSE file in the root directory of this source tree. + +# This file defines example configuration arguments for quantizing +# a transformer model with product quantization + +# Number of Centroids for Product Quantization, by default 256 (byte-aligned) +n_centroids: + Linear: + key: in_features + value: {"*": 256} + Embedding: + key: embedding_dim + value: {"*": 256} + +# Block Sizes for Product Quantization +# We suggest: 8 for FFN, 4 for ATTN, 4 for embedding projections, 8 for embeddings +block_sizes: + Linear: + key: fuzzy_name + value: {fc: 8, attn: 4, emb: 4} + Embedding: + key: fuzzy_name + value: {emb: 8} + +# Layers to Quantize Sequentially +# We suggest: first FFN, then EMB, then ATTN +layers_to_quantize: + - decoder\\.layers\\.\d+\\.fc[12] + - decoder\\.embed_tokens\\.embeddings\\.[012]\\.[01] + - decoder\\.layers\\.\d+\\.self_attn\\.(k_proj|v_proj|q_proj|out_proj) diff --git a/fairseq/examples/roberta/README.custom_classification.md b/fairseq/examples/roberta/README.custom_classification.md new file mode 100644 index 0000000000000000000000000000000000000000..7254bb7d178760ef5b847901bbcac3711af33ca2 --- /dev/null +++ b/fairseq/examples/roberta/README.custom_classification.md @@ -0,0 +1,168 @@ +# Finetuning RoBERTa on a custom classification task + +This example shows how to finetune RoBERTa on the IMDB dataset, but should illustrate the process for most classification tasks. + +### 1) Get the data + +```bash +wget http://ai.stanford.edu/~amaas/data/sentiment/aclImdb_v1.tar.gz +tar zxvf aclImdb_v1.tar.gz +``` + + +### 2) Format data + +`IMDB` data has one data-sample in each file, below python code-snippet converts it one file for train and valid each for ease of processing. 
+```python +import argparse +import os +import random +from glob import glob + +random.seed(0) + +def main(args): + for split in ['train', 'test']: + samples = [] + for class_label in ['pos', 'neg']: + fnames = glob(os.path.join(args.datadir, split, class_label) + '/*.txt') + for fname in fnames: + with open(fname) as fin: + line = fin.readline() + samples.append((line, 1 if class_label == 'pos' else 0)) + random.shuffle(samples) + out_fname = 'train' if split == 'train' else 'dev' + f1 = open(os.path.join(args.datadir, out_fname + '.input0'), 'w') + f2 = open(os.path.join(args.datadir, out_fname + '.label'), 'w') + for sample in samples: + f1.write(sample[0] + '\n') + f2.write(str(sample[1]) + '\n') + f1.close() + f2.close() + +if __name__ == '__main__': + parser = argparse.ArgumentParser() + parser.add_argument('--datadir', default='aclImdb') + args = parser.parse_args() + main(args) +``` + + +### 3) BPE encode + +Run `multiprocessing_bpe_encoder`, you can also do this in previous step for each sample but that might be slower. +```bash +# Download encoder.json and vocab.bpe +wget -N 'https://dl.fbaipublicfiles.com/fairseq/gpt2_bpe/encoder.json' +wget -N 'https://dl.fbaipublicfiles.com/fairseq/gpt2_bpe/vocab.bpe' + +for SPLIT in train dev; do + python -m examples.roberta.multiprocessing_bpe_encoder \ + --encoder-json encoder.json \ + --vocab-bpe vocab.bpe \ + --inputs "aclImdb/$SPLIT.input0" \ + --outputs "aclImdb/$SPLIT.input0.bpe" \ + --workers 60 \ + --keep-empty +done +``` + + +### 4) Preprocess data + +```bash +# Download fairseq dictionary. +wget -N 'https://dl.fbaipublicfiles.com/fairseq/gpt2_bpe/dict.txt' + +fairseq-preprocess \ + --only-source \ + --trainpref "aclImdb/train.input0.bpe" \ + --validpref "aclImdb/dev.input0.bpe" \ + --destdir "IMDB-bin/input0" \ + --workers 60 \ + --srcdict dict.txt + +fairseq-preprocess \ + --only-source \ + --trainpref "aclImdb/train.label" \ + --validpref "aclImdb/dev.label" \ + --destdir "IMDB-bin/label" \ + --workers 60 + +``` + + +### 5) Run training + +```bash +TOTAL_NUM_UPDATES=7812 # 10 epochs through IMDB for bsz 32 +WARMUP_UPDATES=469 # 6 percent of the number of updates +LR=1e-05 # Peak LR for polynomial LR scheduler. +HEAD_NAME=imdb_head # Custom name for the classification head. +NUM_CLASSES=2 # Number of classes for the classification task. +MAX_SENTENCES=8 # Batch size. +ROBERTA_PATH=/path/to/roberta.large/model.pt + +CUDA_VISIBLE_DEVICES=0 fairseq-train IMDB-bin/ \ + --restore-file $ROBERTA_PATH \ + --max-positions 512 \ + --batch-size $MAX_SENTENCES \ + --max-tokens 4400 \ + --task sentence_prediction \ + --reset-optimizer --reset-dataloader --reset-meters \ + --required-batch-size-multiple 1 \ + --init-token 0 --separator-token 2 \ + --arch roberta_large \ + --criterion sentence_prediction \ + --classification-head-name $HEAD_NAME \ + --num-classes $NUM_CLASSES \ + --dropout 0.1 --attention-dropout 0.1 \ + --weight-decay 0.1 --optimizer adam --adam-betas "(0.9, 0.98)" --adam-eps 1e-06 \ + --clip-norm 0.0 \ + --lr-scheduler polynomial_decay --lr $LR --total-num-update $TOTAL_NUM_UPDATES --warmup-updates $WARMUP_UPDATES \ + --fp16 --fp16-init-scale 4 --threshold-loss-scale 1 --fp16-scale-window 128 \ + --max-epoch 10 \ + --best-checkpoint-metric accuracy --maximize-best-checkpoint-metric \ + --shorten-method "truncate" \ + --find-unused-parameters \ + --update-freq 4 +``` + +The above command will finetune RoBERTa-large with an effective batch-size of 32 +sentences (`--batch-size=8 --update-freq=4`). 
The expected +`best-validation-accuracy` after 10 epochs is ~96.5%. + +If you run out of GPU memory, try decreasing `--batch-size` and increase +`--update-freq` to compensate. + + +### 6) Load model using hub interface + +Now we can load the trained model checkpoint using the RoBERTa hub interface. + +Assuming your checkpoints are stored in `checkpoints/`: +```python +from fairseq.models.roberta import RobertaModel +roberta = RobertaModel.from_pretrained( + 'checkpoints', + checkpoint_file='checkpoint_best.pt', + data_name_or_path='IMDB-bin' +) +roberta.eval() # disable dropout +``` + +Finally you can make predictions using the `imdb_head` (or whatever you set +`--classification-head-name` to during training): +```python +label_fn = lambda label: roberta.task.label_dictionary.string( + [label + roberta.task.label_dictionary.nspecial] +) + +tokens = roberta.encode('Best movie this year') +pred = label_fn(roberta.predict('imdb_head', tokens).argmax().item()) +assert pred == '1' # positive + +tokens = roberta.encode('Worst movie ever') +pred = label_fn(roberta.predict('imdb_head', tokens).argmax().item()) +assert pred == '0' # negative +``` diff --git a/fairseq/examples/roberta/README.glue.md b/fairseq/examples/roberta/README.glue.md new file mode 100644 index 0000000000000000000000000000000000000000..4f596d55af99fba3cdf58b1d5ff3d8f8dbf4383d --- /dev/null +++ b/fairseq/examples/roberta/README.glue.md @@ -0,0 +1,64 @@ +# Finetuning RoBERTa on GLUE tasks + +### 1) Download the data from GLUE website (https://gluebenchmark.com/tasks) using following commands: +```bash +wget https://gist.githubusercontent.com/W4ngatang/60c2bdb54d156a41194446737ce03e2e/raw/17b8dd0d724281ed7c3b2aeeda662b92809aadd5/download_glue_data.py +python download_glue_data.py --data_dir glue_data --tasks all +``` + +### 2) Preprocess GLUE task data: +```bash +./examples/roberta/preprocess_GLUE_tasks.sh glue_data +``` +`glue_task_name` is one of the following: +`{ALL, QQP, MNLI, QNLI, MRPC, RTE, STS-B, SST-2, CoLA}` +Use `ALL` for preprocessing all the glue tasks. + +### 3) Fine-tuning on GLUE task: +Example fine-tuning cmd for `RTE` task +```bash +ROBERTA_PATH=/path/to/roberta/model.pt + +CUDA_VISIBLE_DEVICES=0 fairseq-hydra-train -config-dir examples/roberta/config/finetuning --config-name rte \ +task.data=RTE-bin checkpoint.restore_file=$ROBERTA_PATH +``` + +There are additional config files for each of the GLUE tasks in the examples/roberta/config/finetuning directory. + +**Note:** + +a) Above cmd-args and hyperparams are tested on one Nvidia `V100` GPU with `32gb` of memory for each task. Depending on the GPU memory resources available to you, you can use increase `--update-freq` and reduce `--batch-size`. + +b) All the settings in above table are suggested settings based on our hyperparam search within a fixed search space (for careful comparison across models). You might be able to find better metrics with wider hyperparam search. 
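
As an illustration of note (a), a memory-constrained run can shrink the per-GPU
batch and compensate with gradient accumulation through hydra overrides. This is
only a sketch: `dataset.batch_size` and `optimization.update_freq` are the
standard fairseq config fields (see the pretraining tutorial), but check the
finetuning config for the exact names and defaults before relying on it.

```bash
# Sketch: halve the per-GPU batch size and accumulate gradients to keep the
# effective batch size roughly unchanged. The override names are assumptions;
# verify them against examples/roberta/config/finetuning/rte.yaml.
CUDA_VISIBLE_DEVICES=0 fairseq-hydra-train --config-dir examples/roberta/config/finetuning --config-name rte \
task.data=RTE-bin checkpoint.restore_file=$ROBERTA_PATH \
dataset.batch_size=8 optimization.update_freq='[4]'
```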
+ +### Inference on GLUE task +After training the model as mentioned in previous step, you can perform inference with checkpoints in `checkpoints/` directory using following python code snippet: + +```python +from fairseq.models.roberta import RobertaModel + +roberta = RobertaModel.from_pretrained( + 'checkpoints/', + checkpoint_file='checkpoint_best.pt', + data_name_or_path='RTE-bin' +) + +label_fn = lambda label: roberta.task.label_dictionary.string( + [label + roberta.task.label_dictionary.nspecial] +) +ncorrect, nsamples = 0, 0 +roberta.cuda() +roberta.eval() +with open('glue_data/RTE/dev.tsv') as fin: + fin.readline() + for index, line in enumerate(fin): + tokens = line.strip().split('\t') + sent1, sent2, target = tokens[1], tokens[2], tokens[3] + tokens = roberta.encode(sent1, sent2) + prediction = roberta.predict('sentence_classification_head', tokens).argmax().item() + prediction_label = label_fn(prediction) + ncorrect += int(prediction_label == target) + nsamples += 1 +print('| Accuracy: ', float(ncorrect)/float(nsamples)) + +``` diff --git a/fairseq/examples/roberta/README.md b/fairseq/examples/roberta/README.md new file mode 100644 index 0000000000000000000000000000000000000000..ed4d5df52ccea01216276054a1f253d0d16c0409 --- /dev/null +++ b/fairseq/examples/roberta/README.md @@ -0,0 +1,296 @@ +# RoBERTa: A Robustly Optimized BERT Pretraining Approach + +https://arxiv.org/abs/1907.11692 + +## Introduction + +RoBERTa iterates on BERT's pretraining procedure, including training the model longer, with bigger batches over more data; removing the next sentence prediction objective; training on longer sequences; and dynamically changing the masking pattern applied to the training data. See the associated paper for more details. + +### What's New: + +- December 2020: German model (GottBERT) is available: [GottBERT](https://github.com/pytorch/fairseq/tree/main/examples/gottbert). +- January 2020: Italian model (UmBERTo) is available from Musixmatch Research: [UmBERTo](https://github.com/musixmatchresearch/umberto). +- November 2019: French model (CamemBERT) is available: [CamemBERT](https://github.com/pytorch/fairseq/tree/main/examples/camembert). +- November 2019: Multilingual encoder (XLM-RoBERTa) is available: [XLM-R](https://github.com/pytorch/fairseq/tree/main/examples/xlmr). +- September 2019: TensorFlow and TPU support via the [transformers library](https://github.com/huggingface/transformers). +- August 2019: RoBERTa is now supported in the [pytorch-transformers library](https://github.com/huggingface/pytorch-transformers). +- August 2019: Added [tutorial for finetuning on WinoGrande](https://github.com/pytorch/fairseq/tree/main/examples/roberta/wsc#roberta-training-on-winogrande-dataset). +- August 2019: Added [tutorial for pretraining RoBERTa using your own data](README.pretraining.md). 
+ +## Pre-trained models + +Model | Description | # params | Download +---|---|---|--- +`roberta.base` | RoBERTa using the BERT-base architecture | 125M | [roberta.base.tar.gz](https://dl.fbaipublicfiles.com/fairseq/models/roberta.base.tar.gz) +`roberta.large` | RoBERTa using the BERT-large architecture | 355M | [roberta.large.tar.gz](https://dl.fbaipublicfiles.com/fairseq/models/roberta.large.tar.gz) +`roberta.large.mnli` | `roberta.large` finetuned on [MNLI](http://www.nyu.edu/projects/bowman/multinli) | 355M | [roberta.large.mnli.tar.gz](https://dl.fbaipublicfiles.com/fairseq/models/roberta.large.mnli.tar.gz) +`roberta.large.wsc` | `roberta.large` finetuned on [WSC](wsc/README.md) | 355M | [roberta.large.wsc.tar.gz](https://dl.fbaipublicfiles.com/fairseq/models/roberta.large.wsc.tar.gz) + +## Results + +**[GLUE (Wang et al., 2019)](https://gluebenchmark.com/)** +_(dev set, single model, single-task finetuning)_ + +Model | MNLI | QNLI | QQP | RTE | SST-2 | MRPC | CoLA | STS-B +---|---|---|---|---|---|---|---|--- +`roberta.base` | 87.6 | 92.8 | 91.9 | 78.7 | 94.8 | 90.2 | 63.6 | 91.2 +`roberta.large` | 90.2 | 94.7 | 92.2 | 86.6 | 96.4 | 90.9 | 68.0 | 92.4 +`roberta.large.mnli` | 90.2 | - | - | - | - | - | - | - + +**[SuperGLUE (Wang et al., 2019)](https://super.gluebenchmark.com/)** +_(dev set, single model, single-task finetuning)_ + +Model | BoolQ | CB | COPA | MultiRC | RTE | WiC | WSC +---|---|---|---|---|---|---|--- +`roberta.large` | 86.9 | 98.2 | 94.0 | 85.7 | 89.5 | 75.6 | - +`roberta.large.wsc` | - | - | - | - | - | - | 91.3 + +**[SQuAD (Rajpurkar et al., 2018)](https://rajpurkar.github.io/SQuAD-explorer/)** +_(dev set, no additional data used)_ + +Model | SQuAD 1.1 EM/F1 | SQuAD 2.0 EM/F1 +---|---|--- +`roberta.large` | 88.9/94.6 | 86.5/89.4 + +**[RACE (Lai et al., 2017)](http://www.qizhexie.com/data/RACE_leaderboard.html)** +_(test set)_ + +Model | Accuracy | Middle | High +---|---|---|--- +`roberta.large` | 83.2 | 86.5 | 81.3 + +**[HellaSwag (Zellers et al., 2019)](https://rowanzellers.com/hellaswag/)** +_(test set)_ + +Model | Overall | In-domain | Zero-shot | ActivityNet | WikiHow +---|---|---|---|---|--- +`roberta.large` | 85.2 | 87.3 | 83.1 | 74.6 | 90.9 + +**[Commonsense QA (Talmor et al., 2019)](https://www.tau-nlp.org/commonsenseqa)** +_(test set)_ + +Model | Accuracy +---|--- +`roberta.large` (single model) | 72.1 +`roberta.large` (ensemble) | 72.5 + +**[Winogrande (Sakaguchi et al., 2019)](https://arxiv.org/abs/1907.10641)** +_(test set)_ + +Model | Accuracy +---|--- +`roberta.large` | 78.1 + +**[XNLI (Conneau et al., 2018)](https://arxiv.org/abs/1809.05053)** +_(TRANSLATE-TEST)_ + +Model | en | fr | es | de | el | bg | ru | tr | ar | vi | th | zh | hi | sw | ur +---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|--- +`roberta.large.mnli` | 91.3 | 82.91 | 84.27 | 81.24 | 81.74 | 83.13 | 78.28 | 76.79 | 76.64 | 74.17 | 74.05 | 77.5 | 70.9 | 66.65 | 66.81 + +## Example usage + +##### Load RoBERTa from torch.hub (PyTorch >= 1.1): +```python +import torch +roberta = torch.hub.load('pytorch/fairseq', 'roberta.large') +roberta.eval() # disable dropout (or leave in train mode to finetune) +``` + +##### Load RoBERTa (for PyTorch 1.0 or custom models): +```python +# Download roberta.large model +wget https://dl.fbaipublicfiles.com/fairseq/models/roberta.large.tar.gz +tar -xzvf roberta.large.tar.gz + +# Load the model in fairseq +from fairseq.models.roberta import RobertaModel +roberta = RobertaModel.from_pretrained('/path/to/roberta.large', checkpoint_file='model.pt') 
+roberta.eval()  # disable dropout (or leave in train mode to finetune)
+```
+
+##### Apply Byte-Pair Encoding (BPE) to input text:
+```python
+tokens = roberta.encode('Hello world!')
+assert tokens.tolist() == [0, 31414, 232, 328, 2]
+roberta.decode(tokens)  # 'Hello world!'
+```
+
+##### Extract features from RoBERTa:
+```python
+# Extract the last layer's features
+last_layer_features = roberta.extract_features(tokens)
+assert last_layer_features.size() == torch.Size([1, 5, 1024])
+
+# Extract all layer's features (layer 0 is the embedding layer)
+all_layers = roberta.extract_features(tokens, return_all_hiddens=True)
+assert len(all_layers) == 25
+assert torch.all(all_layers[-1] == last_layer_features)
+```
+
+##### Use RoBERTa for sentence-pair classification tasks:
+```python
+# Download RoBERTa already finetuned for MNLI
+roberta = torch.hub.load('pytorch/fairseq', 'roberta.large.mnli')
+roberta.eval()  # disable dropout for evaluation
+
+# Encode a pair of sentences and make a prediction
+tokens = roberta.encode('Roberta is a heavily optimized version of BERT.', 'Roberta is not very optimized.')
+roberta.predict('mnli', tokens).argmax()  # 0: contradiction
+
+# Encode another pair of sentences
+tokens = roberta.encode('Roberta is a heavily optimized version of BERT.', 'Roberta is based on BERT.')
+roberta.predict('mnli', tokens).argmax()  # 2: entailment
+```
+
+##### Register a new (randomly initialized) classification head:
+```python
+roberta.register_classification_head('new_task', num_classes=3)
+logprobs = roberta.predict('new_task', tokens)  # tensor([[-1.1050, -1.0672, -1.1245]], grad_fn=<LogSoftmaxBackward>)
+```
+
+##### Batched prediction:
+```python
+import torch
+from fairseq.data.data_utils import collate_tokens
+
+roberta = torch.hub.load('pytorch/fairseq', 'roberta.large.mnli')
+roberta.eval()
+
+batch_of_pairs = [
+    ['Roberta is a heavily optimized version of BERT.', 'Roberta is not very optimized.'],
+    ['Roberta is a heavily optimized version of BERT.', 'Roberta is based on BERT.'],
+    ['potatoes are awesome.', 'I like to run.'],
+    ['Mars is very far from earth.', 'Mars is very close.'],
+]
+
+batch = collate_tokens(
+    [roberta.encode(pair[0], pair[1]) for pair in batch_of_pairs], pad_idx=1
+)
+
+logprobs = roberta.predict('mnli', batch)
+print(logprobs.argmax(dim=1))
+# tensor([0, 2, 1, 0])
+```
+
+##### Using the GPU:
+```python
+roberta.cuda()
+roberta.predict('new_task', tokens)  # tensor([[-1.1050, -1.0672, -1.1245]], device='cuda:0', grad_fn=<LogSoftmaxBackward>)
+```
+
+## Advanced usage
+
+#### Filling masks:
+
+RoBERTa can be used to fill `<mask>` tokens in the input. Some examples from the
+[Natural Questions dataset](https://ai.google.com/research/NaturalQuestions/):
+```python
+roberta.fill_mask('The first Star wars movie came out in <mask>', topk=3)
+# [('The first Star wars movie came out in 1977', 0.9504708051681519, ' 1977'), ('The first Star wars movie came out in 1978', 0.009986862540245056, ' 1978'), ('The first Star wars movie came out in 1979', 0.009574787691235542, ' 1979')]
+
+roberta.fill_mask('Vikram samvat calender is official in <mask>', topk=3)
+# [('Vikram samvat calender is official in India', 0.21878819167613983, ' India'), ('Vikram samvat calender is official in Delhi', 0.08547237515449524, ' Delhi'), ('Vikram samvat calender is official in Gujarat', 0.07556215673685074, ' Gujarat')]
+
+roberta.fill_mask('<mask> is the common currency of the European Union', topk=3)
+# [('Euro is the common currency of the European Union', 0.9456493854522705, 'Euro'), ('euro is the common currency of the European Union', 0.025748178362846375, 'euro'), ('€ is the common currency of the European Union', 0.011183084920048714, '€')]
+```
+
+#### Pronoun disambiguation (Winograd Schema Challenge):
+
+RoBERTa can be used to disambiguate pronouns. First install spaCy and download the English-language model:
+```bash
+pip install spacy
+python -m spacy download en_core_web_lg
+```
+
+Next load the `roberta.large.wsc` model and call the `disambiguate_pronoun`
+function. The pronoun should be surrounded by square brackets (`[]`) and the
+query referent surrounded by underscores (`_`), or left blank to return the
+predicted candidate text directly:
+```python
+roberta = torch.hub.load('pytorch/fairseq', 'roberta.large.wsc', user_dir='examples/roberta/wsc')
+roberta.cuda()  # use the GPU (optional)
+
+roberta.disambiguate_pronoun('The _trophy_ would not fit in the brown suitcase because [it] was too big.')
+# True
+roberta.disambiguate_pronoun('The trophy would not fit in the brown _suitcase_ because [it] was too big.')
+# False
+
+roberta.disambiguate_pronoun('The city councilmen refused the demonstrators a permit because [they] feared violence.')
+# 'The city councilmen'
+roberta.disambiguate_pronoun('The city councilmen refused the demonstrators a permit because [they] advocated violence.')
+# 'demonstrators'
+```
+
+See the [RoBERTa Winograd Schema Challenge (WSC) README](wsc/README.md) for more details on how to train this model.
+
+#### Extract features aligned to words:
+
+By default RoBERTa outputs one feature vector per BPE token. You can instead
+realign the features to match [spaCy's word-level tokenization](https://spacy.io/usage/linguistic-features#tokenization)
+with the `extract_features_aligned_to_words` method. This will compute a
+weighted average of the BPE-level features for each word and expose them in
+spaCy's `Token.vector` attribute:
+```python
+doc = roberta.extract_features_aligned_to_words('I said, "hello RoBERTa."')
+assert len(doc) == 10
+for tok in doc:
+    print('{:10}{} (...)'.format(str(tok), tok.vector[:5]))
+# <s>       tensor([-0.1316, -0.0386, -0.0832, -0.0477, 0.1943], grad_fn=<SliceBackward>) (...)
+# I         tensor([ 0.0559, 0.1541, -0.4832, 0.0880, 0.0120], grad_fn=<SliceBackward>) (...)
+# said      tensor([-0.1565, -0.0069, -0.8915, 0.0501, -0.0647], grad_fn=<SliceBackward>) (...)
+# ,         tensor([-0.1318, -0.0387, -0.0834, -0.0477, 0.1944], grad_fn=<SliceBackward>) (...)
+# "         tensor([-0.0486, 0.1818, -0.3946, -0.0553, 0.0981], grad_fn=<SliceBackward>) (...)
+# hello     tensor([ 0.0079, 0.1799, -0.6204, -0.0777, -0.0923], grad_fn=<SliceBackward>) (...)
+# RoBERTa   tensor([-0.2339, -0.1184, -0.7343, -0.0492, 0.5829], grad_fn=<SliceBackward>) (...)
+# .         tensor([-0.1341, -0.1203, -0.1012, -0.0621, 0.1892], grad_fn=<SliceBackward>) (...)
+# "         tensor([-0.1341, -0.1203, -0.1012, -0.0621, 0.1892], grad_fn=<SliceBackward>) (...)
+# </s>      tensor([-0.0930, -0.0392, -0.0821, 0.0158, 0.0649], grad_fn=<SliceBackward>) (...)
+```
+
+#### Evaluating the `roberta.large.mnli` model:
+
+Example python code snippet to evaluate accuracy on the MNLI `dev_matched` set.
+```python
+label_map = {0: 'contradiction', 1: 'neutral', 2: 'entailment'}
+ncorrect, nsamples = 0, 0
+roberta.cuda()
+roberta.eval()
+with open('glue_data/MNLI/dev_matched.tsv') as fin:
+    fin.readline()
+    for index, line in enumerate(fin):
+        tokens = line.strip().split('\t')
+        sent1, sent2, target = tokens[8], tokens[9], tokens[-1]
+        tokens = roberta.encode(sent1, sent2)
+        prediction = roberta.predict('mnli', tokens).argmax().item()
+        prediction_label = label_map[prediction]
+        ncorrect += int(prediction_label == target)
+        nsamples += 1
+print('| Accuracy: ', float(ncorrect)/float(nsamples))
+# Expected output: 0.9060
+```
+
+## Finetuning
+
+- [Finetuning on GLUE](README.glue.md)
+- [Finetuning on custom classification tasks (e.g., IMDB)](README.custom_classification.md)
+- [Finetuning on Winograd Schema Challenge (WSC)](wsc/README.md)
+- [Finetuning on Commonsense QA (CQA)](commonsense_qa/README.md)
+
+## Pretraining using your own data
+
+See the [tutorial for pretraining RoBERTa using your own data](README.pretraining.md).
+
+## Citation
+
+```bibtex
+@article{liu2019roberta,
+    title = {RoBERTa: A Robustly Optimized BERT Pretraining Approach},
+    author = {Yinhan Liu and Myle Ott and Naman Goyal and Jingfei Du and
+              Mandar Joshi and Danqi Chen and Omer Levy and Mike Lewis and
+              Luke Zettlemoyer and Veselin Stoyanov},
+    journal={arXiv preprint arXiv:1907.11692},
+    year = {2019},
+}
+```
diff --git a/fairseq/examples/roberta/README.pretraining.md b/fairseq/examples/roberta/README.pretraining.md
new file mode 100644
index 0000000000000000000000000000000000000000..a4e7453529111fdd198be637d911d1764cb96c0e
--- /dev/null
+++ b/fairseq/examples/roberta/README.pretraining.md
@@ -0,0 +1,84 @@
+# Pretraining RoBERTa using your own data
+
+This tutorial will walk you through pretraining RoBERTa over your own data.
+
+### 1) Preprocess the data
+
+Data should be preprocessed following the [language modeling format](/examples/language_model), i.e. each document should be separated by an empty line (only useful with `--sample-break-mode complete_doc`). Lines will be concatenated as a 1D text stream during training.
+
+We'll use the [WikiText-103 dataset](https://www.salesforce.com/products/einstein/ai-research/the-wikitext-dependency-language-modeling-dataset/)
+to demonstrate how to preprocess raw text data with the GPT-2 BPE. Of course
+this dataset is quite small, so the resulting pretrained model will perform
+poorly, but it gives the general idea.
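+
+For instance, a raw text file in this format has one document per block of lines
+with a single blank line between documents (an illustrative sketch, not actual
+WikiText-103 content):
+
+```
+The first document starts here. It can span
+several lines of plain text.
+
+The second document starts after one empty line.
+```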
+ +First download the dataset: +```bash +wget https://s3.amazonaws.com/research.metamind.io/wikitext/wikitext-103-raw-v1.zip +unzip wikitext-103-raw-v1.zip +``` + +Next encode it with the GPT-2 BPE: +```bash +mkdir -p gpt2_bpe +wget -O gpt2_bpe/encoder.json https://dl.fbaipublicfiles.com/fairseq/gpt2_bpe/encoder.json +wget -O gpt2_bpe/vocab.bpe https://dl.fbaipublicfiles.com/fairseq/gpt2_bpe/vocab.bpe +for SPLIT in train valid test; do \ + python -m examples.roberta.multiprocessing_bpe_encoder \ + --encoder-json gpt2_bpe/encoder.json \ + --vocab-bpe gpt2_bpe/vocab.bpe \ + --inputs wikitext-103-raw/wiki.${SPLIT}.raw \ + --outputs wikitext-103-raw/wiki.${SPLIT}.bpe \ + --keep-empty \ + --workers 60; \ +done +``` + +Finally preprocess/binarize the data using the GPT-2 fairseq dictionary: +```bash +wget -O gpt2_bpe/dict.txt https://dl.fbaipublicfiles.com/fairseq/gpt2_bpe/dict.txt +fairseq-preprocess \ + --only-source \ + --srcdict gpt2_bpe/dict.txt \ + --trainpref wikitext-103-raw/wiki.train.bpe \ + --validpref wikitext-103-raw/wiki.valid.bpe \ + --testpref wikitext-103-raw/wiki.test.bpe \ + --destdir data-bin/wikitext-103 \ + --workers 60 +``` + +### 2) Train RoBERTa base +```bash +DATA_DIR=data-bin/wikitext-103 + +fairseq-hydra-train -m --config-dir examples/roberta/config/pretraining \ +--config-name base task.data=$DATA_DIR +``` + +**Note:** You can optionally resume training the released RoBERTa base model by +adding `checkpoint.restore_file=/path/to/roberta.base/model.pt`. + +**Note:** The above command assumes training on 8x32GB V100 GPUs. Each GPU uses +a batch size of 16 sequences (`dataset.batch_size`) and accumulates gradients to +further increase the batch size by 16x (`optimization.update_freq`), for a total batch size +of 2048 sequences. If you have fewer GPUs or GPUs with less memory you may need +to reduce `dataset.batch_size` and increase dataset.update_freq to compensate. +Alternatively if you have more GPUs you can decrease `dataset.update_freq` accordingly +to increase training speed. + +**Note:** The learning rate and batch size are tightly connected and need to be +adjusted together. We generally recommend increasing the learning rate as you +increase the batch size according to the following table (although it's also +dataset dependent, so don't rely on the following values too closely): + +batch size | peak learning rate +---|--- +256 | 0.0001 +2048 | 0.0005 +8192 | 0.0007 + +### 3) Load your pretrained model +```python +from fairseq.models.roberta import RobertaModel +roberta = RobertaModel.from_pretrained('checkpoints', 'checkpoint_best.pt', 'path/to/data') +assert isinstance(roberta.model, torch.nn.Module) +``` diff --git a/fairseq/examples/roberta/README.race.md b/fairseq/examples/roberta/README.race.md new file mode 100644 index 0000000000000000000000000000000000000000..13c917e8eca6621e91dce541c7e41436b38cbdc1 --- /dev/null +++ b/fairseq/examples/roberta/README.race.md @@ -0,0 +1,68 @@ +# Finetuning RoBERTa on RACE tasks + +### 1) Download the data from RACE website (http://www.cs.cmu.edu/~glai1/data/race/) + +### 2) Preprocess RACE data: +```bash +python ./examples/roberta/preprocess_RACE.py --input-dir --output-dir +./examples/roberta/preprocess_RACE.sh +``` + +### 3) Fine-tuning on RACE: + +```bash +MAX_EPOCH=5 # Number of training epochs. +LR=1e-05 # Peak LR for fixed LR scheduler. +NUM_CLASSES=4 +MAX_SENTENCES=1 # Batch size per GPU. +UPDATE_FREQ=8 # Accumulate gradients to simulate training on 8 GPUs. 
+DATA_DIR=/path/to/race-output-dir +ROBERTA_PATH=/path/to/roberta/model.pt + +CUDA_VISIBLE_DEVICES=0,1 fairseq-train $DATA_DIR --ddp-backend=legacy_ddp \ + --restore-file $ROBERTA_PATH \ + --reset-optimizer --reset-dataloader --reset-meters \ + --best-checkpoint-metric accuracy --maximize-best-checkpoint-metric \ + --task sentence_ranking \ + --num-classes $NUM_CLASSES \ + --init-token 0 --separator-token 2 \ + --max-option-length 128 \ + --max-positions 512 \ + --shorten-method "truncate" \ + --arch roberta_large \ + --dropout 0.1 --attention-dropout 0.1 --weight-decay 0.01 \ + --criterion sentence_ranking \ + --optimizer adam --adam-betas '(0.9, 0.98)' --adam-eps 1e-06 \ + --clip-norm 0.0 \ + --lr-scheduler fixed --lr $LR \ + --fp16 --fp16-init-scale 4 --threshold-loss-scale 1 --fp16-scale-window 128 \ + --batch-size $MAX_SENTENCES \ + --required-batch-size-multiple 1 \ + --update-freq $UPDATE_FREQ \ + --max-epoch $MAX_EPOCH +``` + +**Note:** + +a) As contexts in RACE are relatively long, we are using smaller batch size per GPU while increasing update-freq to achieve larger effective batch size. + +b) Above cmd-args and hyperparams are tested on one Nvidia `V100` GPU with `32gb` of memory for each task. Depending on the GPU memory resources available to you, you can use increase `--update-freq` and reduce `--batch-size`. + +c) The setting in above command is based on our hyperparam search within a fixed search space (for careful comparison across models). You might be able to find better metrics with wider hyperparam search. + +### 4) Evaluation: + +``` +DATA_DIR=/path/to/race-output-dir # data directory used during training +MODEL_PATH=/path/to/checkpoint_best.pt # path to the finetuned model checkpoint +PREDS_OUT=preds.tsv # output file path to save prediction +TEST_SPLIT=test # can be test (Middle) or test1 (High) +fairseq-validate \ + $DATA_DIR \ + --valid-subset $TEST_SPLIT \ + --path $MODEL_PATH \ + --batch-size 1 \ + --task sentence_ranking \ + --criterion sentence_ranking \ + --save-predictions $PREDS_OUT +``` diff --git a/fairseq/examples/roberta/commonsense_qa/README.md b/fairseq/examples/roberta/commonsense_qa/README.md new file mode 100644 index 0000000000000000000000000000000000000000..7f386decd87d93bf701e2e313c7fea39d982224f --- /dev/null +++ b/fairseq/examples/roberta/commonsense_qa/README.md @@ -0,0 +1,99 @@ +# Finetuning RoBERTa on Commonsense QA + +We follow a similar approach to [finetuning RACE](../README.race.md). Specifically +for each question we construct five inputs, one for each of the five candidate +answer choices. Each input is constructed by concatenating the question and +candidate answer. We then encode each input and pass the resulting "[CLS]" +representations through a fully-connected layer to predict the correct answer. +We train with a standard cross-entropy loss. + +We also found it helpful to prepend a prefix of `Q:` to the question and `A:` to +the answer. The complete input format is: +``` + Q: Where would I not want a fox? A: hen house +``` + +Our final submission is based on a hyperparameter search over the learning rate +(1e-5, 2e-5, 3e-5), batch size (8, 16), number of training steps (2000, 3000, +4000) and random seed. We selected the model with the best performance on the +development set after 100 trials. 
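To make the scoring scheme concrete, here is a minimal PyTorch sketch of the per-question prediction and loss (illustrative only; the actual finetuning below uses fairseq's `sentence_ranking` criterion, and the tensor values are placeholders):

```python
import torch
import torch.nn.functional as F

# One scalar score per candidate answer, produced by the fully-connected layer
# on top of the "[CLS]" representation of each encoded "Q: ... A: ..." input.
choice_scores = torch.randn(5, requires_grad=True)  # 5 answer choices
gold_index = torch.tensor([2])                       # index of the correct answer

# Treat the five scores as logits over the candidates and train with cross-entropy.
loss = F.cross_entropy(choice_scores.unsqueeze(0), gold_index)
predicted_choice = choice_scores.argmax().item()
```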
+ +### 1) Download data from the Commonsense QA website (https://www.tau-nlp.org/commonsenseqa) +```bash +bash examples/roberta/commonsense_qa/download_cqa_data.sh +``` + +### 2) Finetune + +```bash +MAX_UPDATES=3000 # Number of training steps. +WARMUP_UPDATES=150 # Linearly increase LR over this many steps. +LR=1e-05 # Peak LR for polynomial LR scheduler. +MAX_SENTENCES=16 # Batch size. +SEED=1 # Random seed. +ROBERTA_PATH=/path/to/roberta/model.pt +DATA_DIR=data/CommonsenseQA + +# we use the --user-dir option to load the task from +# the examples/roberta/commonsense_qa directory: +FAIRSEQ_PATH=/path/to/fairseq +FAIRSEQ_USER_DIR=${FAIRSEQ_PATH}/examples/roberta/commonsense_qa + +CUDA_VISIBLE_DEVICES=0 fairseq-train --fp16 --ddp-backend=legacy_ddp \ + $DATA_DIR \ + --user-dir $FAIRSEQ_USER_DIR \ + --restore-file $ROBERTA_PATH \ + --reset-optimizer --reset-dataloader --reset-meters \ + --no-epoch-checkpoints --no-last-checkpoints --no-save-optimizer-state \ + --best-checkpoint-metric accuracy --maximize-best-checkpoint-metric \ + --task commonsense_qa --init-token 0 --bpe gpt2 \ + --arch roberta_large --max-positions 512 \ + --dropout 0.1 --attention-dropout 0.1 --weight-decay 0.01 \ + --criterion sentence_ranking --num-classes 5 \ + --optimizer adam --adam-betas '(0.9, 0.98)' --adam-eps 1e-06 --clip-norm 0.0 \ + --lr-scheduler polynomial_decay --lr $LR \ + --warmup-updates $WARMUP_UPDATES --total-num-update $MAX_UPDATES \ + --batch-size $MAX_SENTENCES \ + --max-update $MAX_UPDATES \ + --log-format simple --log-interval 25 \ + --seed $SEED +``` + +The above command assumes training on 1 GPU with 32GB of RAM. For GPUs with +less memory, decrease `--batch-size` and increase `--update-freq` +accordingly to compensate. + +### 3) Evaluate +```python +import json +import torch +from fairseq.models.roberta import RobertaModel +from examples.roberta import commonsense_qa # load the Commonsense QA task +roberta = RobertaModel.from_pretrained('checkpoints', 'checkpoint_best.pt', 'data/CommonsenseQA') +roberta.eval() # disable dropout +roberta.cuda() # use the GPU (optional) +nsamples, ncorrect = 0, 0 +with open('data/CommonsenseQA/valid.jsonl') as h: + for line in h: + example = json.loads(line) + scores = [] + for choice in example['question']['choices']: + input = roberta.encode( + 'Q: ' + example['question']['stem'], + 'A: ' + choice['text'], + no_separator=True + ) + score = roberta.predict('sentence_classification_head', input, return_logits=True) + scores.append(score) + pred = torch.cat(scores).argmax() + answer = ord(example['answerKey']) - ord('A') + nsamples += 1 + if pred == answer: + ncorrect += 1 + +print('Accuracy: ' + str(ncorrect / float(nsamples))) +# Accuracy: 0.7846027846027847 +``` + +The above snippet is not batched, which makes it quite slow. See [instructions +for batched prediction with RoBERTa](https://github.com/pytorch/fairseq/tree/main/examples/roberta#batched-prediction). diff --git a/fairseq/examples/roberta/commonsense_qa/__init__.py b/fairseq/examples/roberta/commonsense_qa/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..42d21f35eb3dd33a053dcf0edd5eadd2dff11294 --- /dev/null +++ b/fairseq/examples/roberta/commonsense_qa/__init__.py @@ -0,0 +1,6 @@ +# Copyright (c) Facebook, Inc. and its affiliates. +# +# This source code is licensed under the MIT license found in the +# LICENSE file in the root directory of this source tree. + +from . 
import commonsense_qa_task # noqa diff --git a/fairseq/examples/roberta/commonsense_qa/commonsense_qa_task.py b/fairseq/examples/roberta/commonsense_qa/commonsense_qa_task.py new file mode 100644 index 0000000000000000000000000000000000000000..7d8f8131b38772d0018dbbb3524325bcc915120d --- /dev/null +++ b/fairseq/examples/roberta/commonsense_qa/commonsense_qa_task.py @@ -0,0 +1,190 @@ +# Copyright (c) Facebook, Inc. and its affiliates. +# +# This source code is licensed under the MIT license found in the +# LICENSE file in the root directory of this source tree. + +import json +import os + +import numpy as np +import torch +from fairseq.data import ( + Dictionary, + IdDataset, + ListDataset, + NestedDictionaryDataset, + NumelDataset, + NumSamplesDataset, + RawLabelDataset, + RightPadDataset, + SortDataset, + data_utils, + encoders, +) +from fairseq.tasks import LegacyFairseqTask, register_task + + +@register_task("commonsense_qa") +class CommonsenseQATask(LegacyFairseqTask): + """Task to finetune RoBERTa for Commonsense QA.""" + + @staticmethod + def add_args(parser): + """Add task-specific arguments to the parser.""" + parser.add_argument( + "data", metavar="DIR", help="path to data directory; we load .jsonl" + ) + parser.add_argument( + "--init-token", + type=int, + default=None, + help="add token at the beginning of each batch item", + ) + parser.add_argument("--num-classes", type=int, default=5) + + def __init__(self, args, vocab): + super().__init__(args) + self.vocab = vocab + self.mask = vocab.add_symbol("") + + self.bpe = encoders.build_bpe(args) + + @classmethod + def load_dictionary(cls, filename): + """Load the dictionary from the filename + + Args: + filename (str): the filename + """ + dictionary = Dictionary.load(filename) + dictionary.add_symbol("") + return dictionary + + @classmethod + def setup_task(cls, args, **kwargs): + assert ( + args.criterion == "sentence_ranking" + ), "Must set --criterion=sentence_ranking" + + # load data and label dictionaries + vocab = cls.load_dictionary(os.path.join(args.data, "dict.txt")) + print("| dictionary: {} types".format(len(vocab))) + + return cls(args, vocab) + + def load_dataset( + self, split, epoch=1, combine=False, data_path=None, return_only=False, **kwargs + ): + """Load a given dataset split. + + Args: + split (str): name of the split (e.g., train, valid, test) + """ + + def binarize(s, append_bos=False): + if self.bpe is not None: + s = self.bpe.encode(s) + tokens = self.vocab.encode_line( + s, + append_eos=True, + add_if_not_exist=False, + ).long() + if append_bos and self.args.init_token is not None: + tokens = torch.cat([tokens.new([self.args.init_token]), tokens]) + return tokens + + if data_path is None: + data_path = os.path.join(self.args.data, split + ".jsonl") + if not os.path.exists(data_path): + raise FileNotFoundError("Cannot find data: {}".format(data_path)) + + src_tokens = [[] for i in range(self.args.num_classes)] + src_lengths = [[] for i in range(self.args.num_classes)] + labels = [] + + with open(data_path) as h: + for line in h: + example = json.loads(line.strip()) + if "answerKey" in example: + label = ord(example["answerKey"]) - ord("A") + labels.append(label) + question = example["question"]["stem"] + assert len(example["question"]["choices"]) == self.args.num_classes + # format: ` Q: Where would I not want a fox? 
A: hen house ` + question = "Q: " + question + question_toks = binarize(question, append_bos=True) + for i, choice in enumerate(example["question"]["choices"]): + src = "A: " + choice["text"] + src_bin = torch.cat([question_toks, binarize(src)]) + src_tokens[i].append(src_bin) + src_lengths[i].append(len(src_bin)) + assert all( + len(src_tokens[0]) == len(src_tokens[i]) + for i in range(self.args.num_classes) + ) + assert len(src_tokens[0]) == len(src_lengths[0]) + assert len(labels) == 0 or len(labels) == len(src_tokens[0]) + + for i in range(self.args.num_classes): + src_lengths[i] = np.array(src_lengths[i]) + src_tokens[i] = ListDataset(src_tokens[i], src_lengths[i]) + src_lengths[i] = ListDataset(src_lengths[i]) + + dataset = { + "id": IdDataset(), + "nsentences": NumSamplesDataset(), + "ntokens": NumelDataset(src_tokens[0], reduce=True), + } + + for i in range(self.args.num_classes): + dataset.update( + { + "net_input{}".format(i + 1): { + "src_tokens": RightPadDataset( + src_tokens[i], + pad_idx=self.source_dictionary.pad(), + ), + "src_lengths": src_lengths[i], + } + } + ) + + if len(labels) > 0: + dataset.update({"target": RawLabelDataset(labels)}) + + dataset = NestedDictionaryDataset( + dataset, + sizes=[np.maximum.reduce([src_token.sizes for src_token in src_tokens])], + ) + + with data_utils.numpy_seed(self.args.seed): + dataset = SortDataset( + dataset, + # shuffle + sort_order=[np.random.permutation(len(dataset))], + ) + + print("| Loaded {} with {} samples".format(split, len(dataset))) + + self.datasets[split] = dataset + return self.datasets[split] + + def build_model(self, args, from_checkpoint=False): + from fairseq import models + + model = models.build_model(args, self) + + model.register_classification_head( + "sentence_classification_head", + num_classes=1, + ) + + return model + + @property + def source_dictionary(self): + return self.vocab + + @property + def target_dictionary(self): + return self.vocab diff --git a/fairseq/examples/roberta/commonsense_qa/download_cqa_data.sh b/fairseq/examples/roberta/commonsense_qa/download_cqa_data.sh new file mode 100644 index 0000000000000000000000000000000000000000..5f300093fa0a0feb819d8b6aed307b59e3891d01 --- /dev/null +++ b/fairseq/examples/roberta/commonsense_qa/download_cqa_data.sh @@ -0,0 +1,14 @@ +#!/bin/bash +# Copyright (c) Facebook, Inc. and its affiliates. +# +# This source code is licensed under the MIT license found in the +# LICENSE file in the root directory of this source tree. + +OUTDIR=data/CommonsenseQA + +mkdir -p $OUTDIR + +wget -O $OUTDIR/train.jsonl https://s3.amazonaws.com/commensenseqa/train_rand_split.jsonl +wget -O $OUTDIR/valid.jsonl https://s3.amazonaws.com/commensenseqa/dev_rand_split.jsonl +wget -O $OUTDIR/test.jsonl https://s3.amazonaws.com/commensenseqa/test_rand_split_no_answers.jsonl +wget -O $OUTDIR/dict.txt https://dl.fbaipublicfiles.com/fairseq/gpt2_bpe/dict.txt diff --git a/fairseq/examples/roberta/config/finetuning/cola.yaml b/fairseq/examples/roberta/config/finetuning/cola.yaml new file mode 100644 index 0000000000000000000000000000000000000000..ac76611201275fcee6311b625599ea0863c92898 --- /dev/null +++ b/fairseq/examples/roberta/config/finetuning/cola.yaml @@ -0,0 +1,59 @@ +# @package _group_ + +common: + fp16: true + fp16_init_scale: 4 + threshold_loss_scale: 1 + fp16_scale_window: 128 + log_format: json + log_interval: 200 + +task: + _name: sentence_prediction + data: ??? + init_token: 0 + separator_token: 2 + num_classes: 2 + max_positions: 512 + +checkpoint: + restore_file: ??? 
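  # `???` marks a mandatory Hydra value; override it when launching, e.g.
  # checkpoint.restore_file=/path/to/roberta/model.pt (illustrative path).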
+ reset_optimizer: true + reset_dataloader: true + reset_meters: true + best_checkpoint_metric: accuracy + maximize_best_checkpoint_metric: true + no_epoch_checkpoints: true + +distributed_training: + find_unused_parameters: true + distributed_world_size: 1 + +criterion: + _name: sentence_prediction + +dataset: + batch_size: 16 + required_batch_size_multiple: 1 + max_tokens: 4400 + +optimizer: + _name: adam + weight_decay: 0.1 + adam_betas: (0.9,0.98) + adam_eps: 1e-06 + +lr_scheduler: + _name: polynomial_decay + warmup_updates: 320 + +optimization: + clip_norm: 0.0 + lr: [1e-05] + max_update: 5336 + max_epoch: 10 + +model: + _name: roberta + dropout: 0.1 + attention_dropout: 0.1 diff --git a/fairseq/examples/roberta/config/finetuning/mnli.yaml b/fairseq/examples/roberta/config/finetuning/mnli.yaml new file mode 100644 index 0000000000000000000000000000000000000000..5be10c362fdadae49e5a6018ef74095892903914 --- /dev/null +++ b/fairseq/examples/roberta/config/finetuning/mnli.yaml @@ -0,0 +1,59 @@ +# @package _group_ + +common: + fp16: true + fp16_init_scale: 4 + threshold_loss_scale: 1 + fp16_scale_window: 128 + log_format: json + log_interval: 200 + +task: + _name: sentence_prediction + data: ??? + init_token: 0 + separator_token: 2 + num_classes: 3 + max_positions: 512 + +checkpoint: + restore_file: ??? + reset_optimizer: true + reset_dataloader: true + reset_meters: true + best_checkpoint_metric: accuracy + maximize_best_checkpoint_metric: true + no_epoch_checkpoints: true + +distributed_training: + find_unused_parameters: true + distributed_world_size: 1 + +criterion: + _name: sentence_prediction + +dataset: + batch_size: 32 + required_batch_size_multiple: 1 + max_tokens: 4400 + +optimizer: + _name: adam + weight_decay: 0.1 + adam_betas: (0.9,0.98) + adam_eps: 1e-06 + +lr_scheduler: + _name: polynomial_decay + warmup_updates: 7432 + +optimization: + clip_norm: 0.0 + lr: [1e-05] + max_update: 123873 + max_epoch: 10 + +model: + _name: roberta + dropout: 0.1 + attention_dropout: 0.1 diff --git a/fairseq/examples/roberta/config/finetuning/mrpc.yaml b/fairseq/examples/roberta/config/finetuning/mrpc.yaml new file mode 100644 index 0000000000000000000000000000000000000000..aa8b7db393ed00dd9b403ba009de70bf18a75309 --- /dev/null +++ b/fairseq/examples/roberta/config/finetuning/mrpc.yaml @@ -0,0 +1,59 @@ +# @package _group_ + +common: + fp16: true + fp16_init_scale: 4 + threshold_loss_scale: 1 + fp16_scale_window: 128 + log_format: json + log_interval: 200 + +task: + _name: sentence_prediction + data: ??? + init_token: 0 + separator_token: 2 + num_classes: 2 + max_positions: 512 + +checkpoint: + restore_file: ??? 
+ reset_optimizer: true + reset_dataloader: true + reset_meters: true + best_checkpoint_metric: accuracy + maximize_best_checkpoint_metric: true + no_epoch_checkpoints: true + +distributed_training: + find_unused_parameters: true + distributed_world_size: 1 + +criterion: + _name: sentence_prediction + +dataset: + batch_size: 16 + required_batch_size_multiple: 1 + max_tokens: 4400 + +optimizer: + _name: adam + weight_decay: 0.1 + adam_betas: (0.9,0.98) + adam_eps: 1e-06 + +lr_scheduler: + _name: polynomial_decay + warmup_updates: 137 + +optimization: + clip_norm: 0.0 + lr: [1e-05] + max_update: 2296 + max_epoch: 10 + +model: + _name: roberta + dropout: 0.1 + attention_dropout: 0.1 diff --git a/fairseq/examples/roberta/config/finetuning/qnli.yaml b/fairseq/examples/roberta/config/finetuning/qnli.yaml new file mode 100644 index 0000000000000000000000000000000000000000..b4595b090ee23b74bb3924c09704702c4208e395 --- /dev/null +++ b/fairseq/examples/roberta/config/finetuning/qnli.yaml @@ -0,0 +1,59 @@ +# @package _group_ + +common: + fp16: true + fp16_init_scale: 4 + threshold_loss_scale: 1 + fp16_scale_window: 128 + log_format: json + log_interval: 200 + +task: + _name: sentence_prediction + data: ??? + init_token: 0 + separator_token: 2 + num_classes: 2 + max_positions: 512 + +checkpoint: + restore_file: ??? + reset_optimizer: true + reset_dataloader: true + reset_meters: true + best_checkpoint_metric: accuracy + maximize_best_checkpoint_metric: true + no_epoch_checkpoints: true + +distributed_training: + find_unused_parameters: true + distributed_world_size: 1 + +criterion: + _name: sentence_prediction + +dataset: + batch_size: 32 + required_batch_size_multiple: 1 + max_tokens: 4400 + +optimizer: + _name: adam + weight_decay: 0.1 + adam_betas: (0.9,0.98) + adam_eps: 1e-06 + +lr_scheduler: + _name: polynomial_decay + warmup_updates: 1986 + +optimization: + clip_norm: 0.0 + lr: [1e-05] + max_update: 33112 + max_epoch: 10 + +model: + _name: roberta + dropout: 0.1 + attention_dropout: 0.1 diff --git a/fairseq/examples/roberta/config/finetuning/qqp.yaml b/fairseq/examples/roberta/config/finetuning/qqp.yaml new file mode 100644 index 0000000000000000000000000000000000000000..5a2b2ed743963af1f558927f226d993c66fbd45c --- /dev/null +++ b/fairseq/examples/roberta/config/finetuning/qqp.yaml @@ -0,0 +1,59 @@ +# @package _group_ + +common: + fp16: true + fp16_init_scale: 4 + threshold_loss_scale: 1 + fp16_scale_window: 128 + log_format: json + log_interval: 200 + +task: + _name: sentence_prediction + data: ??? + init_token: 0 + separator_token: 2 + num_classes: 2 + max_positions: 512 + +checkpoint: + restore_file: ??? 
+ reset_optimizer: true + reset_dataloader: true + reset_meters: true + best_checkpoint_metric: accuracy + maximize_best_checkpoint_metric: true + no_epoch_checkpoints: true + +distributed_training: + find_unused_parameters: true + distributed_world_size: 1 + +criterion: + _name: sentence_prediction + +dataset: + batch_size: 32 + required_batch_size_multiple: 1 + max_tokens: 4400 + +optimizer: + _name: adam + weight_decay: 0.1 + adam_betas: (0.9,0.98) + adam_eps: 1e-06 + +lr_scheduler: + _name: polynomial_decay + warmup_updates: 28318 + +optimization: + clip_norm: 0.0 + lr: [1e-05] + max_update: 113272 + max_epoch: 10 + +model: + _name: roberta + dropout: 0.1 + attention_dropout: 0.1 diff --git a/fairseq/examples/roberta/config/finetuning/rte.yaml b/fairseq/examples/roberta/config/finetuning/rte.yaml new file mode 100644 index 0000000000000000000000000000000000000000..73184650117e5f1ce5ec4542a0076eaf3044c2a3 --- /dev/null +++ b/fairseq/examples/roberta/config/finetuning/rte.yaml @@ -0,0 +1,59 @@ +# @package _group_ + +common: + fp16: true + fp16_init_scale: 4 + threshold_loss_scale: 1 + fp16_scale_window: 128 + log_format: json + log_interval: 200 + +task: + _name: sentence_prediction + data: ??? + init_token: 0 + separator_token: 2 + num_classes: 2 + max_positions: 512 + +checkpoint: + restore_file: ??? + reset_optimizer: true + reset_dataloader: true + reset_meters: true + best_checkpoint_metric: accuracy + maximize_best_checkpoint_metric: true + no_epoch_checkpoints: true + +distributed_training: + find_unused_parameters: true + distributed_world_size: 1 + +criterion: + _name: sentence_prediction + +dataset: + batch_size: 16 + required_batch_size_multiple: 1 + max_tokens: 4400 + +optimizer: + _name: adam + weight_decay: 0.1 + adam_betas: (0.9,0.98) + adam_eps: 1e-06 + +lr_scheduler: + _name: polynomial_decay + warmup_updates: 122 + +optimization: + clip_norm: 0.0 + lr: [2e-05] + max_update: 2036 + max_epoch: 10 + +model: + _name: roberta + dropout: 0.1 + attention_dropout: 0.1 diff --git a/fairseq/examples/roberta/config/finetuning/run_config/local.yaml b/fairseq/examples/roberta/config/finetuning/run_config/local.yaml new file mode 100644 index 0000000000000000000000000000000000000000..45595f9eea7c369a1200d802bfa1883c1bdfe573 --- /dev/null +++ b/fairseq/examples/roberta/config/finetuning/run_config/local.yaml @@ -0,0 +1,15 @@ +# @package _global_ +hydra: + sweep: + dir: ${env:PWD}/tmp_dbg/${now:%H-%M-%S} + +distributed_training: + distributed_world_size: 1 + nprocs_per_node: 1 + distributed_port: -1 + +common: + log_interval: 1 + +dataset: + num_workers: 0 diff --git a/fairseq/examples/roberta/config/finetuning/run_config/slurm_1g.yaml b/fairseq/examples/roberta/config/finetuning/run_config/slurm_1g.yaml new file mode 100644 index 0000000000000000000000000000000000000000..8bc21854d406ce8eb004c9bcba4c6b88bec0bcf3 --- /dev/null +++ b/fairseq/examples/roberta/config/finetuning/run_config/slurm_1g.yaml @@ -0,0 +1,28 @@ + +# @package _global_ + +hydra: + job: + config: + override_dirname: + kv_sep: '_' + item_sep: '/' + exclude_keys: + - run_config + - distributed_training.distributed_port + sweep: + dir: /checkpoint/${env:USER}/roberta_ft/${env:PREFIX}/${hydra.job.config_name}/${env:SUFFIX} + subdir: ${hydra.job.num} + launcher: + submitit_folder: ${hydra.sweep.dir}/submitit + timeout_min: 1000 + cpus_per_task: 8 + gpus_per_node: 1 + tasks_per_node: 1 + mem_gb: 60 + nodes: 1 + name: ${env:PREFIX}_${hydra.job.config_name} + partition: devlab,learnlab,learnfair,scavenge + constraint: 
volta32gb + max_num_timeout: 30 + exclude: learnfair1381,learnfair5192,learnfair2304 diff --git a/fairseq/examples/roberta/config/finetuning/run_config/slurm_1g_aws.yaml b/fairseq/examples/roberta/config/finetuning/run_config/slurm_1g_aws.yaml new file mode 100644 index 0000000000000000000000000000000000000000..085391cffa4adc2044f2112c212221a95bb50cd9 --- /dev/null +++ b/fairseq/examples/roberta/config/finetuning/run_config/slurm_1g_aws.yaml @@ -0,0 +1,25 @@ +# @package _global_ + +hydra: + job: + config: + override_dirname: + kv_sep: '_' + item_sep: '/' + exclude_keys: + - run_config + - distributed_training.distributed_port + sweep: + dir: /fsx-wav2vec/${env:USER}/roberta_ft/${env:PREFIX}/${hydra.job.config_name}/${env:SUFFIX} + subdir: ${hydra.job.num} + launcher: + submitit_folder: ${hydra.sweep.dir}/submitit + timeout_min: 1000 + cpus_per_task: 8 + gpus_per_node: 1 + tasks_per_node: 1 + mem_gb: 0 + nodes: 1 + name: ${env:PREFIX}_${hydra.job.config_name} + partition: learnfair,wav2vec + max_num_timeout: 30 diff --git a/fairseq/examples/roberta/config/finetuning/sst_2.yaml b/fairseq/examples/roberta/config/finetuning/sst_2.yaml new file mode 100644 index 0000000000000000000000000000000000000000..a93ad2f22c4c248f043fc18d345d61e9484ed39e --- /dev/null +++ b/fairseq/examples/roberta/config/finetuning/sst_2.yaml @@ -0,0 +1,59 @@ +# @package _group_ + +common: + fp16: true + fp16_init_scale: 4 + threshold_loss_scale: 1 + fp16_scale_window: 128 + log_format: json + log_interval: 200 + +task: + _name: sentence_prediction + data: ??? + init_token: 0 + separator_token: 2 + num_classes: 2 + max_positions: 512 + +checkpoint: + restore_file: ??? + reset_optimizer: true + reset_dataloader: true + reset_meters: true + best_checkpoint_metric: accuracy + maximize_best_checkpoint_metric: true + no_epoch_checkpoints: true + +distributed_training: + find_unused_parameters: true + distributed_world_size: 1 + +criterion: + _name: sentence_prediction + +dataset: + batch_size: 32 + required_batch_size_multiple: 1 + max_tokens: 4400 + +optimizer: + _name: adam + weight_decay: 0.1 + adam_betas: (0.9,0.98) + adam_eps: 1e-06 + +lr_scheduler: + _name: polynomial_decay + warmup_updates: 1256 + +optimization: + clip_norm: 0.0 + lr: [1e-05] + max_update: 20935 + max_epoch: 10 + +model: + _name: roberta + dropout: 0.1 + attention_dropout: 0.1 diff --git a/fairseq/examples/roberta/config/finetuning/sts_b.yaml b/fairseq/examples/roberta/config/finetuning/sts_b.yaml new file mode 100644 index 0000000000000000000000000000000000000000..2d495221ad846162c0b3f15ea6e17d723e7ea754 --- /dev/null +++ b/fairseq/examples/roberta/config/finetuning/sts_b.yaml @@ -0,0 +1,58 @@ +# @package _group_ + +common: + fp16: true + fp16_init_scale: 4 + threshold_loss_scale: 1 + fp16_scale_window: 128 + log_format: json + log_interval: 200 + +task: + _name: sentence_prediction + data: ??? + init_token: 0 + separator_token: 2 + num_classes: 1 + max_positions: 512 + +checkpoint: + restore_file: ??? 
+ reset_optimizer: true + reset_dataloader: true + reset_meters: true + no_epoch_checkpoints: true + +distributed_training: + find_unused_parameters: true + distributed_world_size: 1 + +criterion: + _name: sentence_prediction + regression_target: true + +dataset: + batch_size: 16 + required_batch_size_multiple: 1 + max_tokens: 4400 + +optimizer: + _name: adam + weight_decay: 0.1 + adam_betas: (0.9,0.98) + adam_eps: 1e-06 + +lr_scheduler: + _name: polynomial_decay + warmup_updates: 214 + +optimization: + clip_norm: 0.0 + lr: [2e-05] + max_update: 3598 + max_epoch: 10 + +model: + _name: roberta + dropout: 0.1 + attention_dropout: 0.1 diff --git a/fairseq/examples/roberta/config/pretraining/base.yaml b/fairseq/examples/roberta/config/pretraining/base.yaml new file mode 100644 index 0000000000000000000000000000000000000000..97829908f740ba6813c895aa32019cc2760c1eb8 --- /dev/null +++ b/fairseq/examples/roberta/config/pretraining/base.yaml @@ -0,0 +1,42 @@ +# @package _group_ +common: + fp16: true + log_format: json + log_interval: 200 + +checkpoint: + no_epoch_checkpoints: true + +task: + _name: masked_lm + data: ??? + sample_break_mode: complete + tokens_per_sample: 512 + +criterion: masked_lm + +dataset: + batch_size: 16 + ignore_unused_valid_subsets: true + +optimizer: + _name: adam + weight_decay: 0.01 + adam_betas: (0.9,0.98) + adam_eps: 1e-06 + +lr_scheduler: + _name: polynomial_decay + warmup_updates: 10000 + +optimization: + clip_norm: 0 + lr: [0.0005] + max_update: 125000 + update_freq: [16] + +model: + _name: roberta + max_positions: 512 + dropout: 0.1 + attention_dropout: 0.1 diff --git a/fairseq/examples/roberta/config/pretraining/run_config/local.yaml b/fairseq/examples/roberta/config/pretraining/run_config/local.yaml new file mode 100644 index 0000000000000000000000000000000000000000..45595f9eea7c369a1200d802bfa1883c1bdfe573 --- /dev/null +++ b/fairseq/examples/roberta/config/pretraining/run_config/local.yaml @@ -0,0 +1,15 @@ +# @package _global_ +hydra: + sweep: + dir: ${env:PWD}/tmp_dbg/${now:%H-%M-%S} + +distributed_training: + distributed_world_size: 1 + nprocs_per_node: 1 + distributed_port: -1 + +common: + log_interval: 1 + +dataset: + num_workers: 0 diff --git a/fairseq/examples/roberta/config/pretraining/run_config/slurm_2.yaml b/fairseq/examples/roberta/config/pretraining/run_config/slurm_2.yaml new file mode 100644 index 0000000000000000000000000000000000000000..006a0f2116db5a85da558cada65873f1150eb717 --- /dev/null +++ b/fairseq/examples/roberta/config/pretraining/run_config/slurm_2.yaml @@ -0,0 +1,37 @@ +# @package _global_ + +hydra: + job: + config: + override_dirname: + kv_sep: ':' + item_sep: '/' + exclude_keys: + - run_config + - distributed_training.distributed_port + - distributed_training.distributed_world_size + - model.pretrained_model_path + - model.target_network_path + - next_script + - task.cache_in_scratch + - task.data + - checkpoint.save_interval_updates + - checkpoint.keep_interval_updates + - checkpoint.save_on_overflow + - common.log_interval + - common.user_dir + sweep: + dir: /checkpoint/${env:USER}/${env:PREFIX}/${hydra.job.config_name}_${hydra.launcher.gpus_per_node}/${hydra.job.override_dirname} + subdir: '' + launcher: + submitit_folder: ${hydra.sweep.dir} + timeout_min: 4320 + cpus_per_task: 80 + gpus_per_node: 8 + tasks_per_node: 1 + mem_gb: 450 + nodes: 2 + name: ${env:PREFIX}_${hydra.job.config_name} + partition: devlab,learnlab,learnfair,scavenge + constraint: volta32gb,ib4 + max_num_timeout: 30 diff --git 
a/fairseq/examples/roberta/config/pretraining/run_config/slurm_2_aws.yaml b/fairseq/examples/roberta/config/pretraining/run_config/slurm_2_aws.yaml new file mode 100644 index 0000000000000000000000000000000000000000..a5937ea5a825558cb62b618491e013f10a1680a7 --- /dev/null +++ b/fairseq/examples/roberta/config/pretraining/run_config/slurm_2_aws.yaml @@ -0,0 +1,39 @@ +# @package _global_ + +hydra: + job: + config: + override_dirname: + kv_sep: ':' + item_sep: '/' + exclude_keys: + - run_config + - distributed_training.distributed_port + - distributed_training.distributed_world_size + - model.pretrained_model_path + - model.target_network_path + - next_script + - task.cache_in_scratch + - task.local_cache_path + - task.data + - task.post_save_script + - checkpoint.save_interval_updates + - checkpoint.keep_interval_updates + - checkpoint.save_on_overflow + - common.log_interval + - common.user_dir + - model.model_path + sweep: + dir: /fsx-wav2vec/${env:USER}/${env:PREFIX}/${hydra.job.config_name}_${hydra.launcher.gpus_per_node}/${hydra.job.override_dirname} + subdir: '' + launcher: + submitit_folder: ${hydra.sweep.dir} + timeout_min: 4320 + cpus_per_task: 10 + gpus_per_node: 8 + tasks_per_node: 8 + mem_gb: 0 + nodes: 2 + name: ${env:PREFIX}_${hydra.job.config_name} + partition: wav2vec + max_num_timeout: 30 diff --git a/fairseq/examples/roberta/config/pretraining/run_config/slurm_3.yaml b/fairseq/examples/roberta/config/pretraining/run_config/slurm_3.yaml new file mode 100644 index 0000000000000000000000000000000000000000..0e1555d20f9380b9d82d5852be6799d35fa3d078 --- /dev/null +++ b/fairseq/examples/roberta/config/pretraining/run_config/slurm_3.yaml @@ -0,0 +1,36 @@ +# @package _global_ + +hydra: + job: + config: + override_dirname: + kv_sep: ':' + item_sep: '/' + exclude_keys: + - run_config + - distributed_training.distributed_port + - distributed_training.distributed_world_size + - model.pretrained_model_path + - model.target_network_path + - next_script + - task.cache_in_scratch + - task.data + - checkpoint.save_interval_updates + - checkpoint.keep_interval_updates + - checkpoint.save_on_overflow + - common.log_interval + sweep: + dir: /checkpoint/${env:USER}/${env:PREFIX}/${hydra.job.config_name}_${hydra.launcher.gpus_per_node}/${hydra.job.override_dirname} + subdir: '' + launcher: + submitit_folder: ${hydra.sweep.dir} + timeout_min: 4320 + cpus_per_task: 10 + gpus_per_node: 8 + tasks_per_node: 8 + mem_gb: 450 + nodes: 3 + name: ${env:PREFIX}_${hydra.job.config_name} + partition: devlab,learnlab,learnfair,scavenge + constraint: volta32gb,ib4 + max_num_timeout: 30 diff --git a/fairseq/examples/roberta/config/pretraining/run_config/slurm_4.yaml b/fairseq/examples/roberta/config/pretraining/run_config/slurm_4.yaml new file mode 100644 index 0000000000000000000000000000000000000000..c54d735fb2dc0b0af2d467caa7f64405af18ea10 --- /dev/null +++ b/fairseq/examples/roberta/config/pretraining/run_config/slurm_4.yaml @@ -0,0 +1,36 @@ +# @package _global_ + +hydra: + job: + config: + override_dirname: + kv_sep: ':' + item_sep: '/' + exclude_keys: + - run_config + - distributed_training.distributed_port + - distributed_training.distributed_world_size + - model.pretrained_model_path + - model.target_network_path + - next_script + - task.cache_in_scratch + - task.data + - checkpoint.save_interval_updates + - checkpoint.keep_interval_updates + - checkpoint.save_on_overflow + - common.log_interval + sweep: + dir: 
/checkpoint/${env:USER}/${env:PREFIX}/${hydra.job.config_name}_${hydra.launcher.gpus_per_node}/${hydra.job.override_dirname} + subdir: '' + launcher: + submitit_folder: ${hydra.sweep.dir} + timeout_min: 4320 + cpus_per_task: 10 + gpus_per_node: 8 + tasks_per_node: 8 + mem_gb: 450 + nodes: 4 + name: ${env:PREFIX}_${hydra.job.config_name} + partition: devlab,learnlab,learnfair,scavenge + constraint: volta32gb,ib4 + max_num_timeout: 30 diff --git a/fairseq/examples/roberta/fb_multilingual/README.multilingual.pretraining.md b/fairseq/examples/roberta/fb_multilingual/README.multilingual.pretraining.md new file mode 100644 index 0000000000000000000000000000000000000000..234fd747085d8fa864be2749e4acb11147394bdf --- /dev/null +++ b/fairseq/examples/roberta/fb_multilingual/README.multilingual.pretraining.md @@ -0,0 +1,26 @@ +# Multilingual pretraining RoBERTa + +This tutorial will walk you through pretraining multilingual RoBERTa. + +### 1) Preprocess the data + +```bash +DICTIONARY="/private/home/namangoyal/dataset/XLM/wiki/17/175k/vocab" +DATA_LOCATION="/private/home/namangoyal/dataset/XLM/wiki/17/175k" + +for LANG in en es it +do + fairseq-preprocess \ + --only-source \ + --srcdict $DICTIONARY \ + --trainpref "$DATA_LOCATION/train.$LANG" \ + --validpref "$DATA_LOCATION/valid.$LANG" \ + --testpref "$DATA_LOCATION/test.$LANG" \ + --destdir "wiki_17-bin/$LANG" \ + --workers 60; +done +``` + +### 2) Train RoBERTa base + +[COMING UP...] diff --git a/fairseq/examples/roberta/multiprocessing_bpe_encoder.py b/fairseq/examples/roberta/multiprocessing_bpe_encoder.py new file mode 100644 index 0000000000000000000000000000000000000000..43fe0451bf4d5762d734314075b1402c2a8db2bb --- /dev/null +++ b/fairseq/examples/roberta/multiprocessing_bpe_encoder.py @@ -0,0 +1,130 @@ +#!/usr/bin/env python +# Copyright (c) Facebook, Inc. and its affiliates. +# All rights reserved. +# +# This source code is licensed under the license found in the +# LICENSE file in the root directory of this source tree. + +import argparse +import contextlib +import sys +from collections import Counter +from multiprocessing import Pool + +from fairseq.data.encoders.gpt2_bpe import get_encoder + + +def main(): + """ + Helper script to encode raw text with the GPT-2 BPE using multiple processes. 
+ + The encoder.json and vocab.bpe files can be obtained here: + - https://dl.fbaipublicfiles.com/fairseq/gpt2_bpe/encoder.json + - https://dl.fbaipublicfiles.com/fairseq/gpt2_bpe/vocab.bpe + """ + parser = argparse.ArgumentParser() + parser.add_argument( + "--encoder-json", + help="path to encoder.json", + ) + parser.add_argument( + "--vocab-bpe", + type=str, + help="path to vocab.bpe", + ) + parser.add_argument( + "--inputs", + nargs="+", + default=["-"], + help="input files to filter/encode", + ) + parser.add_argument( + "--outputs", + nargs="+", + default=["-"], + help="path to save encoded outputs", + ) + parser.add_argument( + "--keep-empty", + action="store_true", + help="keep empty lines", + ) + parser.add_argument("--workers", type=int, default=20) + args = parser.parse_args() + + assert len(args.inputs) == len( + args.outputs + ), "number of input and output paths should match" + + with contextlib.ExitStack() as stack: + inputs = [ + stack.enter_context(open(input, "r", encoding="utf-8")) + if input != "-" + else sys.stdin + for input in args.inputs + ] + outputs = [ + stack.enter_context(open(output, "w", encoding="utf-8")) + if output != "-" + else sys.stdout + for output in args.outputs + ] + + encoder = MultiprocessingEncoder(args) + pool = Pool(args.workers, initializer=encoder.initializer) + encoded_lines = pool.imap(encoder.encode_lines, zip(*inputs), 100) + + stats = Counter() + for i, (filt, enc_lines) in enumerate(encoded_lines, start=1): + if filt == "PASS": + for enc_line, output_h in zip(enc_lines, outputs): + print(enc_line, file=output_h) + else: + stats["num_filtered_" + filt] += 1 + if i % 10000 == 0: + print("processed {} lines".format(i), file=sys.stderr) + + for k, v in stats.most_common(): + print("[{}] filtered {} lines".format(k, v), file=sys.stderr) + + +class MultiprocessingEncoder(object): + def __init__(self, args): + self.args = args + + def initializer(self): + global bpe + bpe = get_encoder(self.args.encoder_json, self.args.vocab_bpe) + + def encode(self, line): + global bpe + ids = bpe.encode(line) + return list(map(str, ids)) + + def decode(self, tokens): + global bpe + return bpe.decode(tokens) + + def encode_lines(self, lines): + """ + Encode a set of lines. All lines will be encoded together. + """ + enc_lines = [] + for line in lines: + line = line.strip() + if len(line) == 0 and not self.args.keep_empty: + return ["EMPTY", None] + tokens = self.encode(line) + enc_lines.append(" ".join(tokens)) + return ["PASS", enc_lines] + + def decode_lines(self, lines): + dec_lines = [] + for line in lines: + tokens = map(int, line.strip().split()) + dec_lines.append(self.decode(tokens)) + return ["PASS", dec_lines] + + +if __name__ == "__main__": + main() diff --git a/fairseq/examples/roberta/preprocess_GLUE_tasks.sh b/fairseq/examples/roberta/preprocess_GLUE_tasks.sh new file mode 100644 index 0000000000000000000000000000000000000000..7f215a3b53e1c4a7b1f0320102915a49d84a5015 --- /dev/null +++ b/fairseq/examples/roberta/preprocess_GLUE_tasks.sh @@ -0,0 +1,185 @@ +#!/bin/bash +# Copyright (c) Facebook, Inc. and its affiliates. +# +# This source code is licensed under the MIT license found in the +# LICENSE file in the root directory of this source tree. 
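# Example invocation (argument values are illustrative):
#   ./examples/roberta/preprocess_GLUE_tasks.sh glue_data ALL   # preprocess every GLUE task
#   ./examples/roberta/preprocess_GLUE_tasks.sh glue_data RTE   # or a single task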
+ + +# raw glue data as downloaded by glue download script (https://gist.github.com/W4ngatang/60c2bdb54d156a41194446737ce03e2e) +if [[ $# -ne 2 ]]; then + echo "Run as following:" + echo "./examples/roberta/preprocess_GLUE_tasks.sh " + exit 1 +fi + +GLUE_DATA_FOLDER=$1 + +# download bpe encoder.json, vocabulary and fairseq dictionary +wget -N 'https://dl.fbaipublicfiles.com/fairseq/gpt2_bpe/encoder.json' +wget -N 'https://dl.fbaipublicfiles.com/fairseq/gpt2_bpe/vocab.bpe' +wget -N 'https://dl.fbaipublicfiles.com/fairseq/gpt2_bpe/dict.txt' + +TASKS=$2 # QQP + +if [ "$TASKS" = "ALL" ] +then + TASKS="QQP MNLI QNLI MRPC RTE STS-B SST-2 CoLA" +fi + +for TASK in $TASKS +do + echo "Preprocessing $TASK" + + TASK_DATA_FOLDER="$GLUE_DATA_FOLDER/$TASK" + echo "Raw data as downloaded from glue website: $TASK_DATA_FOLDER" + + SPLITS="train dev test" + INPUT_COUNT=2 + if [ "$TASK" = "QQP" ] + then + INPUT_COLUMNS=( 4 5 ) + TEST_INPUT_COLUMNS=( 2 3 ) + LABEL_COLUMN=6 + elif [ "$TASK" = "MNLI" ] + then + SPLITS="train dev_matched dev_mismatched test_matched test_mismatched" + INPUT_COLUMNS=( 9 10 ) + TEST_INPUT_COLUMNS=( 9 10 ) + DEV_LABEL_COLUMN=16 + LABEL_COLUMN=12 + elif [ "$TASK" = "QNLI" ] + then + INPUT_COLUMNS=( 2 3 ) + TEST_INPUT_COLUMNS=( 2 3 ) + LABEL_COLUMN=4 + elif [ "$TASK" = "MRPC" ] + then + INPUT_COLUMNS=( 4 5 ) + TEST_INPUT_COLUMNS=( 4 5 ) + LABEL_COLUMN=1 + elif [ "$TASK" = "RTE" ] + then + INPUT_COLUMNS=( 2 3 ) + TEST_INPUT_COLUMNS=( 2 3 ) + LABEL_COLUMN=4 + elif [ "$TASK" = "STS-B" ] + then + INPUT_COLUMNS=( 8 9 ) + TEST_INPUT_COLUMNS=( 8 9 ) + LABEL_COLUMN=10 + # Following are single sentence tasks. + elif [ "$TASK" = "SST-2" ] + then + INPUT_COLUMNS=( 1 ) + TEST_INPUT_COLUMNS=( 2 ) + LABEL_COLUMN=2 + INPUT_COUNT=1 + elif [ "$TASK" = "CoLA" ] + then + INPUT_COLUMNS=( 4 ) + TEST_INPUT_COLUMNS=( 2 ) + LABEL_COLUMN=2 + INPUT_COUNT=1 + fi + + # Strip out header and filter lines that don't have expected number of fields. + rm -rf "$TASK_DATA_FOLDER/processed" + mkdir -p "$TASK_DATA_FOLDER/processed" + for SPLIT in $SPLITS + do + # CoLA train and dev doesn't have header. + if [[ ( "$TASK" = "CoLA") && ( "$SPLIT" != "test" ) ]] + then + cp "$TASK_DATA_FOLDER/$SPLIT.tsv" "$TASK_DATA_FOLDER/processed/$SPLIT.tsv.temp"; + else + tail -n +2 "$TASK_DATA_FOLDER/$SPLIT.tsv" > "$TASK_DATA_FOLDER/processed/$SPLIT.tsv.temp"; + fi + + # Remove unformatted lines from train and dev files for QQP dataset. 
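    # (Some raw QQP rows are malformed and have the wrong number of fields; the awk
    # filter below keeps only rows with exactly 6 tab-separated fields.)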
+ if [[ ( "$TASK" = "QQP") && ( "$SPLIT" != "test" ) ]] + then + awk -F '\t' -v NUM_FIELDS=6 'NF==NUM_FIELDS{print}{}' "$TASK_DATA_FOLDER/processed/$SPLIT.tsv.temp" > "$TASK_DATA_FOLDER/processed/$SPLIT.tsv"; + else + cp "$TASK_DATA_FOLDER/processed/$SPLIT.tsv.temp" "$TASK_DATA_FOLDER/processed/$SPLIT.tsv"; + fi + rm "$TASK_DATA_FOLDER/processed/$SPLIT.tsv.temp"; + done + + # Split into input0, input1 and label + for SPLIT in $SPLITS + do + for INPUT_TYPE in $(seq 0 $((INPUT_COUNT-1))) + do + if [[ "$SPLIT" != test* ]] + then + COLUMN_NUMBER=${INPUT_COLUMNS[$INPUT_TYPE]} + else + COLUMN_NUMBER=${TEST_INPUT_COLUMNS[$INPUT_TYPE]} + fi + cut -f"$COLUMN_NUMBER" "$TASK_DATA_FOLDER/processed/$SPLIT.tsv" > "$TASK_DATA_FOLDER/processed/$SPLIT.raw.input$INPUT_TYPE"; + done + + if [[ "$SPLIT" != test* ]] + then + if [ "$TASK" = "MNLI" ] && [ "$SPLIT" != "train" ] + then + cut -f"$DEV_LABEL_COLUMN" "$TASK_DATA_FOLDER/processed/$SPLIT.tsv" > "$TASK_DATA_FOLDER/processed/$SPLIT.label"; + else + cut -f"$LABEL_COLUMN" "$TASK_DATA_FOLDER/processed/$SPLIT.tsv" > "$TASK_DATA_FOLDER/processed/$SPLIT.label"; + fi + fi + + # BPE encode. + for INPUT_TYPE in $(seq 0 $((INPUT_COUNT-1))) + do + LANG="input$INPUT_TYPE" + echo "BPE encoding $SPLIT/$LANG" + python -m examples.roberta.multiprocessing_bpe_encoder \ + --encoder-json encoder.json \ + --vocab-bpe vocab.bpe \ + --inputs "$TASK_DATA_FOLDER/processed/$SPLIT.raw.$LANG" \ + --outputs "$TASK_DATA_FOLDER/processed/$SPLIT.$LANG" \ + --workers 60 \ + --keep-empty; + done + done + + # Remove output directory. + rm -rf "$TASK-bin" + + DEVPREF="$TASK_DATA_FOLDER/processed/dev.LANG" + TESTPREF="$TASK_DATA_FOLDER/processed/test.LANG" + if [ "$TASK" = "MNLI" ] + then + DEVPREF="$TASK_DATA_FOLDER/processed/dev_matched.LANG,$TASK_DATA_FOLDER/processed/dev_mismatched.LANG" + TESTPREF="$TASK_DATA_FOLDER/processed/test_matched.LANG,$TASK_DATA_FOLDER/processed/test_mismatched.LANG" + fi + + # Run fairseq preprocessing: + for INPUT_TYPE in $(seq 0 $((INPUT_COUNT-1))) + do + LANG="input$INPUT_TYPE" + fairseq-preprocess \ + --only-source \ + --trainpref "$TASK_DATA_FOLDER/processed/train.$LANG" \ + --validpref "${DEVPREF//LANG/$LANG}" \ + --testpref "${TESTPREF//LANG/$LANG}" \ + --destdir "$TASK-bin/$LANG" \ + --workers 60 \ + --srcdict dict.txt; + done + if [[ "$TASK" != "STS-B" ]] + then + fairseq-preprocess \ + --only-source \ + --trainpref "$TASK_DATA_FOLDER/processed/train.label" \ + --validpref "${DEVPREF//LANG/label}" \ + --destdir "$TASK-bin/label" \ + --workers 60; + else + # For STS-B output range is converted to be between: [0.0, 1.0] + mkdir -p "$TASK-bin/label" + awk '{print $1 / 5.0 }' "$TASK_DATA_FOLDER/processed/train.label" > "$TASK-bin/label/train.label" + awk '{print $1 / 5.0 }' "$TASK_DATA_FOLDER/processed/dev.label" > "$TASK-bin/label/valid.label" + fi +done diff --git a/fairseq/examples/roberta/preprocess_RACE.py b/fairseq/examples/roberta/preprocess_RACE.py new file mode 100644 index 0000000000000000000000000000000000000000..cdd66072718ccb6033304c97926271909a17f9d6 --- /dev/null +++ b/fairseq/examples/roberta/preprocess_RACE.py @@ -0,0 +1,102 @@ +#!/usr/bin/env python +# Copyright (c) Facebook, Inc. and its affiliates. +# All rights reserved. +# +# This source code is licensed under the license found in the +# LICENSE file in the root directory of this source tree. 
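# Example invocation (paths are illustrative):
#   python ./examples/roberta/preprocess_RACE.py --input-dir RACE --output-dir race-extracted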
+ +import argparse +import json +import os +import re + + +class InputExample: + def __init__(self, paragraph, qa_list, label): + self.paragraph = paragraph + self.qa_list = qa_list + self.label = label + + +def get_examples(data_dir, set_type): + """ + Extract paragraph and question-answer list from each json file + """ + examples = [] + + levels = ["middle", "high"] + set_type_c = set_type.split("-") + if len(set_type_c) == 2: + levels = [set_type_c[1]] + set_type = set_type_c[0] + for level in levels: + cur_dir = os.path.join(data_dir, set_type, level) + for filename in os.listdir(cur_dir): + cur_path = os.path.join(cur_dir, filename) + with open(cur_path, "r") as f: + cur_data = json.load(f) + answers = cur_data["answers"] + options = cur_data["options"] + questions = cur_data["questions"] + context = cur_data["article"].replace("\n", " ") + context = re.sub(r"\s+", " ", context) + for i in range(len(answers)): + label = ord(answers[i]) - ord("A") + qa_list = [] + question = questions[i] + for j in range(4): + option = options[i][j] + if "_" in question: + qa_cat = question.replace("_", option) + else: + qa_cat = " ".join([question, option]) + qa_cat = re.sub(r"\s+", " ", qa_cat) + qa_list.append(qa_cat) + examples.append(InputExample(context, qa_list, label)) + + return examples + + +def main(): + """ + Helper script to extract paragraphs questions and answers from RACE datasets. + """ + parser = argparse.ArgumentParser() + parser.add_argument( + "--input-dir", + help="input directory for downloaded RACE dataset", + ) + parser.add_argument( + "--output-dir", + help="output directory for extracted data", + ) + args = parser.parse_args() + + if not os.path.exists(args.output_dir): + os.makedirs(args.output_dir, exist_ok=True) + + for set_type in ["train", "dev", "test-middle", "test-high"]: + examples = get_examples(args.input_dir, set_type) + qa_file_paths = [ + os.path.join(args.output_dir, set_type + ".input" + str(i + 1)) + for i in range(4) + ] + qa_files = [open(qa_file_path, "w") for qa_file_path in qa_file_paths] + outf_context_path = os.path.join(args.output_dir, set_type + ".input0") + outf_label_path = os.path.join(args.output_dir, set_type + ".label") + outf_context = open(outf_context_path, "w") + outf_label = open(outf_label_path, "w") + for example in examples: + outf_context.write(example.paragraph + "\n") + for i in range(4): + qa_files[i].write(example.qa_list[i] + "\n") + outf_label.write(str(example.label) + "\n") + + for f in qa_files: + f.close() + outf_label.close() + outf_context.close() + + +if __name__ == "__main__": + main() diff --git a/fairseq/examples/roberta/preprocess_RACE.sh b/fairseq/examples/roberta/preprocess_RACE.sh new file mode 100644 index 0000000000000000000000000000000000000000..932d2ab6e521fecc7d0297f26a8c43857541ef3b --- /dev/null +++ b/fairseq/examples/roberta/preprocess_RACE.sh @@ -0,0 +1,59 @@ +#!/bin/bash +# Copyright (c) Facebook, Inc. and its affiliates. +# +# This source code is licensed under the MIT license found in the +# LICENSE file in the root directory of this source tree. 
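# Example invocation (paths are illustrative; the first argument is the directory written
# by preprocess_RACE.py, the second is the binarized output directory used for finetuning):
#   ./examples/roberta/preprocess_RACE.sh race-extracted race-output-dir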
+ + +# data should be downloaded and processed with reprocess_RACE.py +if [[ $# -ne 2 ]]; then + echo "Run as following:" + echo "./examples/roberta/preprocess_RACE.sh " + exit 1 +fi + +RACE_DATA_FOLDER=$1 +OUT_DATA_FOLDER=$2 + +# download bpe encoder.json, vocabulary and fairseq dictionary +wget -N 'https://dl.fbaipublicfiles.com/fairseq/gpt2_bpe/encoder.json' +wget -N 'https://dl.fbaipublicfiles.com/fairseq/gpt2_bpe/vocab.bpe' +wget -N 'https://dl.fbaipublicfiles.com/fairseq/gpt2_bpe/dict.txt' + +SPLITS="train dev test-middle test-high" +INPUT_TYPES="input0 input1 input2 input3 input4" +for INPUT_TYPE in $INPUT_TYPES +do + for SPLIT in $SPLITS + do + echo "BPE encoding $SPLIT/$INPUT_TYPE" + python -m examples.roberta.multiprocessing_bpe_encoder \ + --encoder-json encoder.json \ + --vocab-bpe vocab.bpe \ + --inputs "$RACE_DATA_FOLDER/$SPLIT.$INPUT_TYPE" \ + --outputs "$RACE_DATA_FOLDER/$SPLIT.$INPUT_TYPE.bpe" \ + --workers 10 \ + --keep-empty; + + done +done + +for INPUT_TYPE in $INPUT_TYPES + do + LANG="input$INPUT_TYPE" + fairseq-preprocess \ + --only-source \ + --trainpref "$RACE_DATA_FOLDER/train.$INPUT_TYPE.bpe" \ + --validpref "$RACE_DATA_FOLDER/dev.$INPUT_TYPE.bpe" \ + --testpref "$RACE_DATA_FOLDER/test-middle.$INPUT_TYPE.bpe,$RACE_DATA_FOLDER/test-high.$INPUT_TYPE.bpe" \ + --destdir "$OUT_DATA_FOLDER/$INPUT_TYPE" \ + --workers 10 \ + --srcdict dict.txt; +done + +rm -rf "$OUT_DATA_FOLDER/label" +mkdir -p "$OUT_DATA_FOLDER/label" +cp "$RACE_DATA_FOLDER/train.label" "$OUT_DATA_FOLDER/label/" +cp "$RACE_DATA_FOLDER/dev.label" "$OUT_DATA_FOLDER/label/valid.label" +cp "$RACE_DATA_FOLDER/test-middle.label" "$OUT_DATA_FOLDER/label/test.label" +cp "$RACE_DATA_FOLDER/test-high.label" "$OUT_DATA_FOLDER/label/test1.label" diff --git a/fairseq/examples/roberta/wsc/README.md b/fairseq/examples/roberta/wsc/README.md new file mode 100644 index 0000000000000000000000000000000000000000..21a045d999739836a17574593292e42131315ae9 --- /dev/null +++ b/fairseq/examples/roberta/wsc/README.md @@ -0,0 +1,125 @@ +# Finetuning RoBERTa on Winograd Schema Challenge (WSC) data + +The following instructions can be used to finetune RoBERTa on the WSC training +data provided by [SuperGLUE](https://super.gluebenchmark.com/). + +Note that there is high variance in the results. For our GLUE/SuperGLUE +submission we swept over the learning rate (1e-5, 2e-5, 3e-5), batch size (16, +32, 64) and total number of updates (500, 1000, 2000, 3000), as well as the +random seed. Out of ~100 runs we chose the best 7 models and ensembled them. + +**Approach:** The instructions below use a slightly different loss function than +what's described in the original RoBERTa arXiv paper. In particular, +[Kocijan et al. (2019)](https://arxiv.org/abs/1905.06290) introduce a margin +ranking loss between `(query, candidate)` pairs with tunable hyperparameters +alpha and beta. This is supported in our code as well with the `--wsc-alpha` and +`--wsc-beta` arguments. However, we achieved slightly better (and more robust) +results on the development set by instead using a single cross entropy loss term +over the log-probabilities for the query and all mined candidates. **The +candidates are mined using spaCy from each input sentence in isolation, so the +approach remains strictly pointwise.** This reduces the number of +hyperparameters and our best model achieved 92.3% development set accuracy, +compared to ~90% accuracy for the margin loss. Later versions of the RoBERTa +arXiv paper will describe this updated formulation. 
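In code, the two formulations look roughly as follows (a minimal sketch mirroring the criterion shipped in this directory; the log-probability values are placeholders):

```python
import torch
import torch.nn.functional as F

# Average log-probability of the correct (query) span and of each mined candidate span.
query_lprob = torch.tensor([-0.4], requires_grad=True)
cand_lprobs = torch.tensor([-0.9, -0.6], requires_grad=True)

# Margin ranking loss of Kocijan et al. (2019), with hyperparameters alpha and beta
# (exposed in the criterion as --wsc-margin-alpha / --wsc-margin-beta):
alpha, beta = 1.0, 0.0
margin_loss = (-query_lprob + alpha * (cand_lprobs - query_lprob + beta).clamp(min=0)).sum()

# Cross-entropy alternative (--wsc-cross-entropy): treat the query and all candidates
# as classes and require the query (index 0) to score highest.
ce_loss = F.cross_entropy(
    torch.cat([query_lprob, cand_lprobs]).unsqueeze(0),
    torch.tensor([0]),
)
```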
+ +### 1) Download the WSC data from the SuperGLUE website: +```bash +wget https://dl.fbaipublicfiles.com/glue/superglue/data/v2/WSC.zip +unzip WSC.zip + +# we also need to copy the RoBERTa dictionary into the same directory +wget -O WSC/dict.txt https://dl.fbaipublicfiles.com/fairseq/gpt2_bpe/dict.txt +``` + +### 2) Finetune over the provided training data: +```bash +TOTAL_NUM_UPDATES=2000 # Total number of training steps. +WARMUP_UPDATES=250 # Linearly increase LR over this many steps. +LR=2e-05 # Peak LR for polynomial LR scheduler. +MAX_SENTENCES=16 # Batch size per GPU. +SEED=1 # Random seed. +ROBERTA_PATH=/path/to/roberta/model.pt + +# we use the --user-dir option to load the task and criterion +# from the examples/roberta/wsc directory: +FAIRSEQ_PATH=/path/to/fairseq +FAIRSEQ_USER_DIR=${FAIRSEQ_PATH}/examples/roberta/wsc + +CUDA_VISIBLE_DEVICES=0,1,2,3 fairseq-train WSC/ \ + --restore-file $ROBERTA_PATH \ + --reset-optimizer --reset-dataloader --reset-meters \ + --no-epoch-checkpoints --no-last-checkpoints --no-save-optimizer-state \ + --best-checkpoint-metric accuracy --maximize-best-checkpoint-metric \ + --valid-subset val \ + --fp16 --ddp-backend legacy_ddp \ + --user-dir $FAIRSEQ_USER_DIR \ + --task wsc --criterion wsc --wsc-cross-entropy \ + --arch roberta_large --bpe gpt2 --max-positions 512 \ + --dropout 0.1 --attention-dropout 0.1 --weight-decay 0.01 \ + --optimizer adam --adam-betas '(0.9, 0.98)' --adam-eps 1e-06 \ + --lr-scheduler polynomial_decay --lr $LR \ + --warmup-updates $WARMUP_UPDATES --total-num-update $TOTAL_NUM_UPDATES \ + --batch-size $MAX_SENTENCES \ + --max-update $TOTAL_NUM_UPDATES \ + --log-format simple --log-interval 100 \ + --seed $SEED +``` + +The above command assumes training on 4 GPUs, but you can achieve the same +results on a single GPU by adding `--update-freq=4`. + +### 3) Evaluate +```python +from fairseq.models.roberta import RobertaModel +from examples.roberta.wsc import wsc_utils # also loads WSC task and criterion +roberta = RobertaModel.from_pretrained('checkpoints', 'checkpoint_best.pt', 'WSC/') +roberta.cuda() +nsamples, ncorrect = 0, 0 +for sentence, label in wsc_utils.jsonl_iterator('WSC/val.jsonl', eval=True): + pred = roberta.disambiguate_pronoun(sentence) + nsamples += 1 + if pred == label: + ncorrect += 1 +print('Accuracy: ' + str(ncorrect / float(nsamples))) +# Accuracy: 0.9230769230769231 +``` + +## RoBERTa training on WinoGrande dataset +We have also provided `winogrande` task and criterion for finetuning on the +[WinoGrande](https://mosaic.allenai.org/projects/winogrande) like datasets +where there are always two candidates and one is correct. +It's more efficient implementation for such subcases. + +```bash +TOTAL_NUM_UPDATES=23750 # Total number of training steps. +WARMUP_UPDATES=2375 # Linearly increase LR over this many steps. +LR=1e-05 # Peak LR for polynomial LR scheduler. +MAX_SENTENCES=32 # Batch size per GPU. +SEED=1 # Random seed. 
+ROBERTA_PATH=/path/to/roberta/model.pt + +# we use the --user-dir option to load the task and criterion +# from the examples/roberta/wsc directory: +FAIRSEQ_PATH=/path/to/fairseq +FAIRSEQ_USER_DIR=${FAIRSEQ_PATH}/examples/roberta/wsc + +cd fairseq +CUDA_VISIBLE_DEVICES=0 fairseq-train winogrande_1.0/ \ + --restore-file $ROBERTA_PATH \ + --reset-optimizer --reset-dataloader --reset-meters \ + --no-epoch-checkpoints --no-last-checkpoints --no-save-optimizer-state \ + --best-checkpoint-metric accuracy --maximize-best-checkpoint-metric \ + --valid-subset val \ + --fp16 --ddp-backend legacy_ddp \ + --user-dir $FAIRSEQ_USER_DIR \ + --task winogrande --criterion winogrande \ + --wsc-margin-alpha 5.0 --wsc-margin-beta 0.4 \ + --arch roberta_large --bpe gpt2 --max-positions 512 \ + --dropout 0.1 --attention-dropout 0.1 --weight-decay 0.01 \ + --optimizer adam --adam-betas '(0.9, 0.98)' --adam-eps 1e-06 \ + --lr-scheduler polynomial_decay --lr $LR \ + --warmup-updates $WARMUP_UPDATES --total-num-update $TOTAL_NUM_UPDATES \ + --batch-size $MAX_SENTENCES \ + --max-update $TOTAL_NUM_UPDATES \ + --log-format simple --log-interval 100 +``` diff --git a/fairseq/examples/roberta/wsc/__init__.py b/fairseq/examples/roberta/wsc/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..78afa4728eeed96142900118f6452730023466c9 --- /dev/null +++ b/fairseq/examples/roberta/wsc/__init__.py @@ -0,0 +1,7 @@ +# Copyright (c) Facebook, Inc. and its affiliates. +# +# This source code is licensed under the MIT license found in the +# LICENSE file in the root directory of this source tree. + +from . import wsc_criterion # noqa +from . import wsc_task # noqa diff --git a/fairseq/examples/roberta/wsc/wsc_criterion.py b/fairseq/examples/roberta/wsc/wsc_criterion.py new file mode 100644 index 0000000000000000000000000000000000000000..ed0251fdecc3573228ad271f1090aaf914b48cd1 --- /dev/null +++ b/fairseq/examples/roberta/wsc/wsc_criterion.py @@ -0,0 +1,167 @@ +# Copyright (c) Facebook, Inc. and its affiliates. +# +# This source code is licensed under the MIT license found in the +# LICENSE file in the root directory of this source tree. 
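# This criterion implements the margin-ranking and cross-entropy WSC losses described
# in examples/roberta/wsc/README.md (the latter selected via --wsc-cross-entropy).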
+ +import math + +import torch +import torch.nn.functional as F +from fairseq import utils +from fairseq.criterions import LegacyFairseqCriterion, register_criterion +from fairseq.data import encoders + + +@register_criterion("wsc") +class WSCCriterion(LegacyFairseqCriterion): + def __init__(self, args, task): + super().__init__(args, task) + if self.args.save_predictions is not None: + self.prediction_h = open(self.args.save_predictions, "w") + else: + self.prediction_h = None + self.bpe = encoders.build_bpe(args.bpe) + self.tokenizer = encoders.build_tokenizer(args.tokenizer) + + def __del__(self): + if self.prediction_h is not None: + self.prediction_h.close() + + @staticmethod + def add_args(parser): + """Add criterion-specific arguments to the parser.""" + parser.add_argument("--wsc-margin-alpha", type=float, metavar="A", default=1.0) + parser.add_argument("--wsc-margin-beta", type=float, metavar="B", default=0.0) + parser.add_argument( + "--wsc-cross-entropy", + action="store_true", + help="use cross entropy formulation instead of margin loss", + ) + parser.add_argument( + "--save-predictions", metavar="FILE", help="file to save predictions to" + ) + + def get_masked_input(self, tokens, mask): + masked_tokens = tokens.clone() + masked_tokens[mask] = self.task.mask + return masked_tokens + + def get_lprobs(self, model, tokens, mask): + logits, _ = model(src_tokens=self.get_masked_input(tokens, mask)) + lprobs = F.log_softmax(logits, dim=-1, dtype=torch.float) + scores = lprobs.gather(2, tokens.unsqueeze(-1)).squeeze(-1) + mask = mask.type_as(scores) + scores = (scores * mask).sum(dim=-1) / mask.sum(dim=-1) + return scores + + def get_loss(self, query_lprobs, cand_lprobs): + if self.args.wsc_cross_entropy: + return F.cross_entropy( + torch.cat([query_lprobs, cand_lprobs]).unsqueeze(0), + query_lprobs.new([0]).long(), + ) + else: + return ( + -query_lprobs + + self.args.wsc_margin_alpha + * (cand_lprobs - query_lprobs + self.args.wsc_margin_beta).clamp(min=0) + ).sum() + + def forward(self, model, sample, reduce=True): + # compute loss and accuracy + loss, nloss = 0.0, 0 + ncorrect, nqueries = 0, 0 + + for i, label in enumerate(sample["labels"]): + query_lprobs = self.get_lprobs( + model, + sample["query_tokens"][i].unsqueeze(0), + sample["query_masks"][i].unsqueeze(0), + ) + cand_lprobs = self.get_lprobs( + model, + sample["candidate_tokens"][i], + sample["candidate_masks"][i], + ) + + pred = (query_lprobs >= cand_lprobs).all().item() + + if label is not None: + label = 1 if label else 0 + ncorrect += 1 if pred == label else 0 + nqueries += 1 + + if label: + # only compute a loss for positive instances + nloss += 1 + loss += self.get_loss(query_lprobs, cand_lprobs) + + id = sample["id"][i].item() + if self.prediction_h is not None: + print("{}\t{}\t{}".format(id, pred, label), file=self.prediction_h) + + if nloss == 0: + loss = torch.tensor(0.0, requires_grad=True) + + sample_size = nqueries if nqueries > 0 else 1 + logging_output = { + "loss": utils.item(loss.data) if reduce else loss.data, + "ntokens": sample["ntokens"], + "nsentences": sample["nsentences"], + "sample_size": sample_size, + "ncorrect": ncorrect, + "nqueries": nqueries, + } + return loss, sample_size, logging_output + + @staticmethod + def aggregate_logging_outputs(logging_outputs): + """Aggregate logging outputs from data parallel training.""" + loss_sum = sum(log.get("loss", 0) for log in logging_outputs) + ntokens = sum(log.get("ntokens", 0) for log in logging_outputs) + nsentences = sum(log.get("nsentences", 0) for 
log in logging_outputs) + sample_size = sum(log.get("sample_size", 0) for log in logging_outputs) + + agg_output = { + "loss": loss_sum / sample_size / math.log(2), + "ntokens": ntokens, + "nsentences": nsentences, + "sample_size": sample_size, + } + + ncorrect = sum(log.get("ncorrect", 0) for log in logging_outputs) + nqueries = sum(log.get("nqueries", 0) for log in logging_outputs) + if nqueries > 0: + agg_output["accuracy"] = ncorrect / float(nqueries) + + return agg_output + + +@register_criterion("winogrande") +class WinograndeCriterion(WSCCriterion): + def forward(self, model, sample, reduce=True): + # compute loss and accuracy + query_lprobs = self.get_lprobs( + model, + sample["query_tokens"], + sample["query_masks"], + ) + cand_lprobs = self.get_lprobs( + model, + sample["candidate_tokens"], + sample["candidate_masks"], + ) + pred = query_lprobs >= cand_lprobs + loss = self.get_loss(query_lprobs, cand_lprobs) + + sample_size = sample["query_tokens"].size(0) + ncorrect = pred.sum().item() + logging_output = { + "loss": utils.item(loss.data) if reduce else loss.data, + "ntokens": sample["ntokens"], + "nsentences": sample["nsentences"], + "sample_size": sample_size, + "ncorrect": ncorrect, + "nqueries": sample_size, + } + return loss, sample_size, logging_output diff --git a/fairseq/examples/roberta/wsc/wsc_task.py b/fairseq/examples/roberta/wsc/wsc_task.py new file mode 100644 index 0000000000000000000000000000000000000000..602ea737ed75a33fddf44dd859e999ecfce2730d --- /dev/null +++ b/fairseq/examples/roberta/wsc/wsc_task.py @@ -0,0 +1,401 @@ +# Copyright (c) Facebook, Inc. and its affiliates. +# +# This source code is licensed under the MIT license found in the +# LICENSE file in the root directory of this source tree. + +import json +import os +import tempfile + +import numpy as np +import torch +import torch.nn.functional as F +from fairseq import utils +from fairseq.data import ( + Dictionary, + IdDataset, + ListDataset, + NestedDictionaryDataset, + NumelDataset, + NumSamplesDataset, + PadDataset, + SortDataset, + data_utils, + encoders, +) +from fairseq.tasks import LegacyFairseqTask, register_task + +from . 
import wsc_utils + + +@register_task("wsc") +class WSCTask(LegacyFairseqTask): + """Task to finetune RoBERTa for Winograd Schemas.""" + + @staticmethod + def add_args(parser): + """Add task-specific arguments to the parser.""" + parser.add_argument( + "data", metavar="DIR", help="path to data directory; we load .jsonl" + ) + parser.add_argument( + "--init-token", + type=int, + default=None, + help="add token at the beginning of each batch item", + ) + + def __init__(self, args, vocab): + super().__init__(args) + self.vocab = vocab + self.mask = vocab.add_symbol("") + + self.bpe = encoders.build_bpe(args) + self.tokenizer = encoders.build_tokenizer(args) + + # hack to handle GPT-2 BPE, which includes leading spaces + if args.bpe == "gpt2": + self.leading_space = True + self.trailing_space = False + else: + self.leading_space = False + self.trailing_space = True + + @classmethod + def load_dictionary(cls, filename): + """Load the dictionary from the filename + + Args: + filename (str): the filename + """ + dictionary = Dictionary.load(filename) + dictionary.add_symbol("") + return dictionary + + @classmethod + def setup_task(cls, args, **kwargs): + assert args.criterion == "wsc", "Must set --criterion=wsc" + + # load data and label dictionaries + vocab = cls.load_dictionary(os.path.join(args.data, "dict.txt")) + print("| dictionary: {} types".format(len(vocab))) + + return cls(args, vocab) + + def binarize(self, s: str, append_eos: bool = False): + if self.tokenizer is not None: + s = self.tokenizer.encode(s) + if self.bpe is not None: + s = self.bpe.encode(s) + tokens = self.vocab.encode_line( + s, + append_eos=append_eos, + add_if_not_exist=False, + ).long() + if self.args.init_token is not None: + tokens = torch.cat([tokens.new([self.args.init_token]), tokens]) + return tokens + + def binarize_with_mask(self, txt, prefix, suffix, leading_space, trailing_space): + toks = self.binarize( + prefix + leading_space + txt + trailing_space + suffix, + append_eos=True, + ) + mask = torch.zeros_like(toks, dtype=torch.bool) + mask_start = len(self.binarize(prefix)) + mask_size = len(self.binarize(leading_space + txt)) + mask[mask_start : mask_start + mask_size] = 1 + return toks, mask + + def load_dataset( + self, split, epoch=1, combine=False, data_path=None, return_only=False, **kwargs + ): + """Load a given dataset split. 
+ + Args: + split (str): name of the split (e.g., train, valid, test) + """ + if data_path is None: + data_path = os.path.join(self.args.data, split + ".jsonl") + if not os.path.exists(data_path): + raise FileNotFoundError("Cannot find data: {}".format(data_path)) + + query_tokens = [] + query_masks = [] + query_lengths = [] + candidate_tokens = [] + candidate_masks = [] + candidate_lengths = [] + labels = [] + + for sentence, pronoun_span, query, label in wsc_utils.jsonl_iterator(data_path): + prefix = sentence[: pronoun_span.start].text + suffix = sentence[pronoun_span.end :].text_with_ws + + # spaCy spans include trailing spaces, but we need to know about + # leading spaces for the GPT-2 BPE + leading_space = ( + " " if sentence[: pronoun_span.start].text_with_ws.endswith(" ") else "" + ) + trailing_space = " " if pronoun_span.text_with_ws.endswith(" ") else "" + + # get noun phrases, excluding pronouns and anything overlapping with the query + cand_spans = wsc_utils.filter_noun_chunks( + wsc_utils.extended_noun_chunks(sentence), + exclude_pronouns=True, + exclude_query=query, + exact_match=False, + ) + + if query is not None: + query_toks, query_mask = self.binarize_with_mask( + query, prefix, suffix, leading_space, trailing_space + ) + query_len = len(query_toks) + else: + query_toks, query_mask, query_len = None, None, 0 + + query_tokens.append(query_toks) + query_masks.append(query_mask) + query_lengths.append(query_len) + + cand_toks, cand_masks = [], [] + for cand_span in cand_spans: + toks, mask = self.binarize_with_mask( + cand_span.text, + prefix, + suffix, + leading_space, + trailing_space, + ) + cand_toks.append(toks) + cand_masks.append(mask) + + # collate candidates + cand_toks = data_utils.collate_tokens(cand_toks, pad_idx=self.vocab.pad()) + cand_masks = data_utils.collate_tokens(cand_masks, pad_idx=0) + assert cand_toks.size() == cand_masks.size() + + candidate_tokens.append(cand_toks) + candidate_masks.append(cand_masks) + candidate_lengths.append(cand_toks.size(1)) + + labels.append(label) + + query_lengths = np.array(query_lengths) + query_tokens = ListDataset(query_tokens, query_lengths) + query_masks = ListDataset(query_masks, query_lengths) + + candidate_lengths = np.array(candidate_lengths) + candidate_tokens = ListDataset(candidate_tokens, candidate_lengths) + candidate_masks = ListDataset(candidate_masks, candidate_lengths) + + labels = ListDataset(labels, [1] * len(labels)) + + dataset = { + "id": IdDataset(), + "query_tokens": query_tokens, + "query_masks": query_masks, + "candidate_tokens": candidate_tokens, + "candidate_masks": candidate_masks, + "labels": labels, + "nsentences": NumSamplesDataset(), + "ntokens": NumelDataset(query_tokens, reduce=True), + } + + nested_dataset = NestedDictionaryDataset( + dataset, + sizes=[query_lengths], + ) + + with data_utils.numpy_seed(self.args.seed): + shuffle = np.random.permutation(len(query_tokens)) + dataset = SortDataset( + nested_dataset, + # shuffle + sort_order=[shuffle], + ) + + if return_only: + return dataset + + self.datasets[split] = dataset + return self.datasets[split] + + def build_dataset_for_inference(self, sample_json): + with tempfile.NamedTemporaryFile(buffering=0) as h: + h.write((json.dumps(sample_json) + "\n").encode("utf-8")) + dataset = self.load_dataset( + "disambiguate_pronoun", + data_path=h.name, + return_only=True, + ) + return dataset + + def disambiguate_pronoun(self, model, sentence, use_cuda=False): + sample_json = wsc_utils.convert_sentence_to_json(sentence) + dataset = 
self.build_dataset_for_inference(sample_json) + sample = dataset.collater([dataset[0]]) + if use_cuda: + sample = utils.move_to_cuda(sample) + + def get_masked_input(tokens, mask): + masked_tokens = tokens.clone() + masked_tokens[mask.bool()] = self.mask + return masked_tokens + + def get_lprobs(tokens, mask): + logits, _ = model(src_tokens=get_masked_input(tokens, mask)) + lprobs = F.log_softmax(logits, dim=-1, dtype=torch.float) + scores = lprobs.gather(2, tokens.unsqueeze(-1)).squeeze(-1) + mask = mask.type_as(scores) + scores = (scores * mask).sum(dim=-1) / mask.sum(dim=-1) + return scores + + cand_lprobs = get_lprobs( + sample["candidate_tokens"][0], + sample["candidate_masks"][0], + ) + if sample["query_tokens"][0] is not None: + query_lprobs = get_lprobs( + sample["query_tokens"][0].unsqueeze(0), + sample["query_masks"][0].unsqueeze(0), + ) + return (query_lprobs >= cand_lprobs).all().item() == 1 + else: + best_idx = cand_lprobs.argmax().item() + full_cand = sample["candidate_tokens"][0][best_idx] + mask = sample["candidate_masks"][0][best_idx] + toks = full_cand[mask.bool()] + return self.bpe.decode(self.source_dictionary.string(toks)).strip() + + @property + def source_dictionary(self): + return self.vocab + + @property + def target_dictionary(self): + return self.vocab + + +@register_task("winogrande") +class WinograndeTask(WSCTask): + """ + Task for WinoGrande dataset. Efficient implementation for Winograd schema + tasks with exactly two candidates, one of which is correct. + """ + + @classmethod + def setup_task(cls, args, **kwargs): + assert args.criterion == "winogrande", "Must set --criterion=winogrande" + + # load data and label dictionaries + vocab = cls.load_dictionary(os.path.join(args.data, "dict.txt")) + print("| dictionary: {} types".format(len(vocab))) + + return cls(args, vocab) + + def load_dataset( + self, split, epoch=1, combine=False, data_path=None, return_only=False, **kwargs + ): + """Load a given dataset split. 
+ + Args: + split (str): name of the split (e.g., train, valid, test) + """ + if data_path is None: + data_path = os.path.join(self.args.data, split + ".jsonl") + if not os.path.exists(data_path): + raise FileNotFoundError("Cannot find data: {}".format(data_path)) + + query_tokens = [] + query_masks = [] + query_lengths = [] + candidate_tokens = [] + candidate_masks = [] + candidate_lengths = [] + + itr = wsc_utils.winogrande_jsonl_iterator(data_path, eval=(split == "test")) + + for sample in itr: + sentence, pronoun_span, query, cand_text = sample + prefix = sentence[: pronoun_span[0]].rstrip() + suffix = sentence[pronoun_span[1] :] + + leading_space = " " if sentence[: pronoun_span[0]].endswith(" ") else "" + trailing_space = "" + + if query is not None: + query_toks, query_mask = self.binarize_with_mask( + query, + prefix, + suffix, + leading_space, + trailing_space, + ) + query_len = len(query_toks) + else: + query_toks, query_mask, query_len = None, None, 0 + + query_tokens.append(query_toks) + query_masks.append(query_mask) + query_lengths.append(query_len) + + cand_toks, cand_mask = self.binarize_with_mask( + cand_text, + prefix, + suffix, + leading_space, + trailing_space, + ) + + candidate_tokens.append(cand_toks) + candidate_masks.append(cand_mask) + candidate_lengths.append(cand_toks.size(0)) + + query_lengths = np.array(query_lengths) + + def get_pad_dataset_fn(tokens, length, pad_idx): + return PadDataset( + ListDataset(tokens, length), + pad_idx=pad_idx, + left_pad=False, + ) + + query_tokens = get_pad_dataset_fn(query_tokens, query_lengths, self.vocab.pad()) + query_masks = get_pad_dataset_fn(query_masks, query_lengths, 0) + + candidate_lengths = np.array(candidate_lengths) + candidate_tokens = get_pad_dataset_fn( + candidate_tokens, candidate_lengths, self.vocab.pad() + ) + candidate_masks = get_pad_dataset_fn(candidate_masks, candidate_lengths, 0) + + dataset = { + "id": IdDataset(), + "query_tokens": query_tokens, + "query_masks": query_masks, + "candidate_tokens": candidate_tokens, + "candidate_masks": candidate_masks, + "nsentences": NumSamplesDataset(), + "ntokens": NumelDataset(query_tokens, reduce=True), + } + + nested_dataset = NestedDictionaryDataset( + dataset, + sizes=[query_lengths], + ) + + with data_utils.numpy_seed(self.args.seed): + shuffle = np.random.permutation(len(query_tokens)) + dataset = SortDataset( + nested_dataset, + # shuffle + sort_order=[shuffle], + ) + + if return_only: + return dataset + + self.datasets[split] = dataset + return self.datasets[split] diff --git a/fairseq/examples/roberta/wsc/wsc_utils.py b/fairseq/examples/roberta/wsc/wsc_utils.py new file mode 100644 index 0000000000000000000000000000000000000000..da6ba74383a2490e1108609f315f44ad4b3bf002 --- /dev/null +++ b/fairseq/examples/roberta/wsc/wsc_utils.py @@ -0,0 +1,241 @@ +# Copyright (c) Facebook, Inc. and its affiliates. +# +# This source code is licensed under the MIT license found in the +# LICENSE file in the root directory of this source tree. 
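+#
+# Note: helper utilities for the WSC / WinoGrande examples. They convert raw
+# .jsonl records into spaCy-parsed sentences together with the pronoun span,
+# the query text and the candidate noun chunks that the wsc and winogrande
+# tasks consume.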
+ +import json +from functools import lru_cache + + +def convert_sentence_to_json(sentence): + if "_" in sentence: + prefix, rest = sentence.split("_", 1) + query, rest = rest.split("_", 1) + query_index = len(prefix.rstrip().split(" ")) + else: + query, query_index = None, None + + prefix, rest = sentence.split("[", 1) + pronoun, rest = rest.split("]", 1) + pronoun_index = len(prefix.rstrip().split(" ")) + + sentence = sentence.replace("_", "").replace("[", "").replace("]", "") + + return { + "idx": 0, + "text": sentence, + "target": { + "span1_index": query_index, + "span1_text": query, + "span2_index": pronoun_index, + "span2_text": pronoun, + }, + } + + +def extended_noun_chunks(sentence): + noun_chunks = {(np.start, np.end) for np in sentence.noun_chunks} + np_start, cur_np = 0, "NONE" + for i, token in enumerate(sentence): + np_type = token.pos_ if token.pos_ in {"NOUN", "PROPN"} else "NONE" + if np_type != cur_np: + if cur_np != "NONE": + noun_chunks.add((np_start, i)) + if np_type != "NONE": + np_start = i + cur_np = np_type + if cur_np != "NONE": + noun_chunks.add((np_start, len(sentence))) + return [sentence[s:e] for (s, e) in sorted(noun_chunks)] + + +def find_token(sentence, start_pos): + found_tok = None + for tok in sentence: + if tok.idx == start_pos: + found_tok = tok + break + return found_tok + + +def find_span(sentence, search_text, start=0): + search_text = search_text.lower() + for tok in sentence[start:]: + remainder = sentence[tok.i :].text.lower() + if remainder.startswith(search_text): + len_to_consume = len(search_text) + start_idx = tok.idx + for next_tok in sentence[tok.i :]: + end_idx = next_tok.idx + len(next_tok.text) + if end_idx - start_idx == len_to_consume: + span = sentence[tok.i : next_tok.i + 1] + return span + return None + + +@lru_cache(maxsize=1) +def get_detokenizer(): + from sacremoses import MosesDetokenizer + + detok = MosesDetokenizer(lang="en") + return detok + + +@lru_cache(maxsize=1) +def get_spacy_nlp(): + import en_core_web_lg + + nlp = en_core_web_lg.load() + return nlp + + +def jsonl_iterator(input_fname, positive_only=False, ngram_order=3, eval=False): + detok = get_detokenizer() + nlp = get_spacy_nlp() + + with open(input_fname) as fin: + for line in fin: + sample = json.loads(line.strip()) + + if positive_only and "label" in sample and not sample["label"]: + # only consider examples where the query is correct + continue + + target = sample["target"] + + # clean up the query + query = target["span1_text"] + if query is not None: + if "\n" in query: + continue + if query.endswith(".") or query.endswith(","): + query = query[:-1] + + # split tokens + tokens = sample["text"].split(" ") + + def strip_pronoun(x): + return x.rstrip('.,"') + + # find the pronoun + pronoun_idx = target["span2_index"] + pronoun = strip_pronoun(target["span2_text"]) + if strip_pronoun(tokens[pronoun_idx]) != pronoun: + # hack: sometimes the index is misaligned + if strip_pronoun(tokens[pronoun_idx + 1]) == pronoun: + pronoun_idx += 1 + else: + raise Exception("Misaligned pronoun!") + assert strip_pronoun(tokens[pronoun_idx]) == pronoun + + # split tokens before and after the pronoun + before = tokens[:pronoun_idx] + after = tokens[pronoun_idx + 1 :] + + # the GPT BPE attaches leading spaces to tokens, so we keep track + # of whether we need spaces before or after the pronoun + leading_space = " " if pronoun_idx > 0 else "" + trailing_space = " " if len(after) > 0 else "" + + # detokenize + before = detok.detokenize(before, return_str=True) + pronoun = 
detok.detokenize([pronoun], return_str=True) + after = detok.detokenize(after, return_str=True) + + # hack: when the pronoun ends in a period (or comma), move the + # punctuation to the "after" part + if pronoun.endswith(".") or pronoun.endswith(","): + after = pronoun[-1] + trailing_space + after + pronoun = pronoun[:-1] + + # hack: when the "after" part begins with a comma or period, remove + # the trailing space + if after.startswith(".") or after.startswith(","): + trailing_space = "" + + # parse sentence with spacy + sentence = nlp(before + leading_space + pronoun + trailing_space + after) + + # find pronoun span + start = len(before + leading_space) + first_pronoun_tok = find_token(sentence, start_pos=start) + pronoun_span = find_span(sentence, pronoun, start=first_pronoun_tok.i) + assert pronoun_span.text == pronoun + + if eval: + # convert to format where pronoun is surrounded by "[]" and + # query is surrounded by "_" + query_span = find_span(sentence, query) + query_with_ws = "_{}_{}".format( + query_span.text, + (" " if query_span.text_with_ws.endswith(" ") else ""), + ) + pronoun_with_ws = "[{}]{}".format( + pronoun_span.text, + (" " if pronoun_span.text_with_ws.endswith(" ") else ""), + ) + if query_span.start < pronoun_span.start: + first = (query_span, query_with_ws) + second = (pronoun_span, pronoun_with_ws) + else: + first = (pronoun_span, pronoun_with_ws) + second = (query_span, query_with_ws) + sentence = ( + sentence[: first[0].start].text_with_ws + + first[1] + + sentence[first[0].end : second[0].start].text_with_ws + + second[1] + + sentence[second[0].end :].text + ) + yield sentence, sample.get("label", None) + else: + yield sentence, pronoun_span, query, sample.get("label", None) + + +def winogrande_jsonl_iterator(input_fname, eval=False): + with open(input_fname) as fin: + for line in fin: + sample = json.loads(line.strip()) + sentence, option1, option2 = ( + sample["sentence"], + sample["option1"], + sample["option2"], + ) + + pronoun_span = (sentence.index("_"), sentence.index("_") + 1) + + if eval: + query, cand = option1, option2 + else: + query = option1 if sample["answer"] == "1" else option2 + cand = option2 if sample["answer"] == "1" else option1 + yield sentence, pronoun_span, query, cand + + +def filter_noun_chunks( + chunks, exclude_pronouns=False, exclude_query=None, exact_match=False +): + if exclude_pronouns: + chunks = [ + np + for np in chunks + if (np.lemma_ != "-PRON-" and not all(tok.pos_ == "PRON" for tok in np)) + ] + + if exclude_query is not None: + excl_txt = [exclude_query.lower()] + filtered_chunks = [] + for chunk in chunks: + lower_chunk = chunk.text.lower() + found = False + for excl in excl_txt: + if ( + not exact_match and (lower_chunk in excl or excl in lower_chunk) + ) or lower_chunk == excl: + found = True + break + if not found: + filtered_chunks.append(chunk) + chunks = filtered_chunks + + return chunks diff --git a/fairseq/examples/rxf/README.md b/fairseq/examples/rxf/README.md new file mode 100644 index 0000000000000000000000000000000000000000..22a1cc47df23c7e0ebbf0ad805031478d1b4a95e --- /dev/null +++ b/fairseq/examples/rxf/README.md @@ -0,0 +1,52 @@ +[Better Fine-Tuning by Reducing Representational Collapse](https://arxiv.org/abs/2008.03156) +===================== +This repo contains the code to replicate all experiments from the _Better Fine-Tuning by Reducing Representational Collapse_ paper excluding the probing results. 
+ +The R3F sentence prediction criterion is registered as `sentence_prediction_r3f` while the label smoothing version of it is implemented as `label_smoothed_cross_entropy_r3f`. The R4F version of the sentence prediction criterion can be achieved by applying spectral norm to the classification head via the `--spectral-norm-classification-head` parameter. + +## Hyper-parameters +Our methods introduce 3 new hyper-parameters; `--eps` which sets the standard deviation or range of the distribution we're sampling from, `--r3f-lambda` which controls the combining of logistic loss and noisy KL loss and `--noise-type` which controls which parametric distribution we use ('normal', 'uniform'). + +For example to run R3F on RTE from GLUE + +``` +TOTAL_NUM_UPDATES=3120 +WARMUP_UPDATES=187 +LR=1e-05 +NUM_CLASSES=2 +MAX_SENTENCES=8 # Batch size. +ROBERTA_PATH=/path/to/roberta/model.pt + +CUDA_VISIBLE_DEVICES=0 fairseq-train RTE-bin \ + --restore-file $ROBERTA_PATH \ + --max-positions 512 \ + --max-sentences $MAX_SENTENCES \ + --max-tokens 4400 \ + --task sentence_prediction \ + --reset-optimizer --reset-dataloader --reset-meters \ + --required-batch-size-multiple 1 \ + --init-token 0 --separator-token 2 \ + --arch roberta_large \ + --criterion sentence_prediction_r3f \ + --num-classes $NUM_CLASSES \ + --dropout 0.1 --attention-dropout 0.1 \ + --weight-decay 0.1 --optimizer adam --adam-betas "(0.9, 0.98)" --adam-eps 1e-06 \ + --clip-norm 0.0 \ + --lr-scheduler polynomial_decay --lr $LR --total-num-update $TOTAL_NUM_UPDATES --warmup-updates $WARMUP_UPDATES \ + --fp16 --fp16-init-scale 4 --threshold-loss-scale 1 --fp16-scale-window 128 \ + --max-epoch 10 \ + --find-unused-parameters \ + --best-checkpoint-metric accuracy --maximize-best-checkpoint-metric \ + --noise-type uniform --r3f-lambda 0.7 \ + --user-dir examples/rxf/rxf_src +``` + +## Citation +```bibtex +@article{aghajanyan2020better, + title={Better Fine-Tuning by Reducing Representational Collapse}, + author={Aghajanyan, Armen and Shrivastava, Akshat and Gupta, Anchit and Goyal, Naman and Zettlemoyer, Luke and Gupta, Sonal}, + journal={arXiv preprint arXiv:2008.03156}, + year={2020} +} +``` diff --git a/fairseq/examples/rxf/__init__.py b/fairseq/examples/rxf/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..b24cb6b797b4159c9862bab1f882ee6ae95614ab --- /dev/null +++ b/fairseq/examples/rxf/__init__.py @@ -0,0 +1,6 @@ +# Copyright (c) Facebook, Inc. and its affiliates. +# +# This source code is licensed under the MIT license found in the +# LICENSE file in the root directory of this source tree. + +from . import rxf_src # noqa diff --git a/fairseq/examples/rxf/rxf_src/__init__.py b/fairseq/examples/rxf/rxf_src/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..306e232d6f386b26153864601114e162080dcee4 --- /dev/null +++ b/fairseq/examples/rxf/rxf_src/__init__.py @@ -0,0 +1,6 @@ +# Copyright (c) Facebook, Inc. and its affiliates. +# +# This source code is licensed under the MIT license found in the +# LICENSE file in the root directory of this source tree. + +from . 
import label_smoothed_cross_entropy_r3f, sentence_prediction_r3f # noqa diff --git a/fairseq/examples/rxf/rxf_src/label_smoothed_cross_entropy_r3f.py b/fairseq/examples/rxf/rxf_src/label_smoothed_cross_entropy_r3f.py new file mode 100644 index 0000000000000000000000000000000000000000..6191fd55ac54635ae23c453d064ae9a1dacb66a1 --- /dev/null +++ b/fairseq/examples/rxf/rxf_src/label_smoothed_cross_entropy_r3f.py @@ -0,0 +1,158 @@ +# Copyright (c) Facebook, Inc. and its affiliates. +# +# This source code is licensed under the MIT license found in the +# LICENSE file in the root directory of this source tree. + +import math + +import torch +import torch.nn.functional as F +from fairseq import utils +from fairseq.logging import metrics +from fairseq.criterions import FairseqCriterion, register_criterion +from fairseq.criterions.label_smoothed_cross_entropy import label_smoothed_nll_loss + + +@register_criterion("label_smoothed_cross_entropy_r3f") +class LabelSmoothedCrossEntropyR3FCriterion(FairseqCriterion): + def __init__( + self, task, sentence_avg, label_smoothing, eps, r3f_lambda, noise_type + ): + super().__init__(task) + self.sentence_avg = sentence_avg + self.label_smoothing = label_smoothing + self.eps = eps + self.r3f_lambda = r3f_lambda + self.noise_type = noise_type + if self.noise_type in {"normal"}: + self.noise_sampler = torch.distributions.normal.Normal( + loc=0.0, scale=self.eps + ) + elif self.noise_type == "uniform": + self.noise_sampler = torch.distributions.uniform.Uniform( + low=-self.eps, high=self.eps + ) + else: + raise Exception(f"unrecognized noise type {self.noise_type}") + + @staticmethod + def add_args(parser): + """Add criterion-specific arguments to the parser.""" + # fmt: off + parser.add_argument('--label-smoothing', default=0., type=float, metavar='D', + help='epsilon for label smoothing, 0 means no label smoothing') + parser.add_argument('--eps', type=float, default=1e-5, + help='noise eps') + parser.add_argument('--r3f-lambda', type=float, default=1.0, + help='lambda for combining logistic loss and noisy KL loss') + parser.add_argument('--noise-type', type=str, default='normal', + choices=['normal', 'uniform'], + help='type of noises') + # fmt: on + + def _get_symm_kl(self, noised_logits, input_logits): + return ( + F.kl_div( + F.log_softmax(noised_logits, dim=-1, dtype=torch.float32), + F.softmax(input_logits, dim=-1, dtype=torch.float32), + None, + None, + "sum", + ) + + F.kl_div( + F.log_softmax(input_logits, dim=-1, dtype=torch.float32), + F.softmax(noised_logits, dim=-1, dtype=torch.float32), + None, + None, + "sum", + ) + ) / noised_logits.size(0) + + def forward(self, model, sample, reduce=True): + """Compute the loss for the given sample. 
+ + Returns a tuple with three elements: + 1) the loss + 2) the sample size, which is used as the denominator for the gradient + 3) logging outputs to display while training + """ + token_embeddings = model.encoder.embed_tokens(sample["net_input"]["src_tokens"]) + input_logits, extra = model(**sample["net_input"]) + loss, nll_loss = self.compute_loss( + model, (input_logits, extra), sample, reduce=reduce + ) + sample_size = ( + sample["target"].size(0) if self.sentence_avg else sample["ntokens"] + ) + + if model.training: + noise = self.noise_sampler.sample(sample_shape=token_embeddings.shape).to( + token_embeddings + ) + noised_embeddings = token_embeddings.clone() + noise + + noised_logits, _ = model( + **sample["net_input"], token_embeddings=noised_embeddings + ) + symm_kl = self._get_symm_kl(noised_logits, input_logits) + + if model.training: + symm_kl = symm_kl * sample_size + loss = loss + self.r3f_lambda * symm_kl + + logging_output = { + "loss": loss.data, + "nll_loss": nll_loss.data, + "ntokens": sample["ntokens"], + "nsentences": sample["target"].size(0), + "sample_size": sample_size, + } + + if model.training: + logging_output.update( + symm_kl=utils.item(symm_kl.data) if reduce else symm_kl.data + ) + + return loss, sample_size, logging_output + + def compute_loss(self, model, net_output, sample, reduce=True): + lprobs = model.get_normalized_probs(net_output, log_probs=True) + lprobs = lprobs.view(-1, lprobs.size(-1)) + target = model.get_targets(sample, net_output).view(-1, 1) + loss, nll_loss = label_smoothed_nll_loss( + lprobs, + target, + self.label_smoothing, + ignore_index=self.padding_idx, + reduce=reduce, + ) + return loss, nll_loss + + @staticmethod + def reduce_metrics(logging_outputs) -> None: + """Aggregate logging outputs from data parallel training.""" + loss_sum = sum(log.get("loss", 0) for log in logging_outputs) + nll_loss_sum = sum(log.get("nll_loss", 0) for log in logging_outputs) + ntokens = sum(log.get("ntokens", 0) for log in logging_outputs) + sample_size = sum(log.get("sample_size", 0) for log in logging_outputs) + symm_kl_sum = sum(log.get("symm_kl", 0) for log in logging_outputs) + + metrics.log_scalar("symm_kl", symm_kl_sum / sample_size, sample_size, round=3) + metrics.log_scalar( + "loss", loss_sum / sample_size / math.log(2), sample_size, round=3 + ) + metrics.log_scalar( + "nll_loss", nll_loss_sum / ntokens / math.log(2), ntokens, round=3 + ) + metrics.log_derived( + "ppl", lambda meters: utils.get_perplexity(meters["nll_loss"].avg) + ) + + @staticmethod + def logging_outputs_can_be_summed() -> bool: + """ + Whether the logging outputs returned by `forward` can be summed + across workers prior to calling `reduce_metrics`. Setting this + to True will improves distributed training speed. + """ + return True diff --git a/fairseq/examples/rxf/rxf_src/sentence_prediction_r3f.py b/fairseq/examples/rxf/rxf_src/sentence_prediction_r3f.py new file mode 100644 index 0000000000000000000000000000000000000000..6ecffd6b143debb1c67adccd77a6aaed194ec55a --- /dev/null +++ b/fairseq/examples/rxf/rxf_src/sentence_prediction_r3f.py @@ -0,0 +1,171 @@ +# Copyright (c) Facebook, Inc. and its affiliates. +# +# This source code is licensed under the MIT license found in the +# LICENSE file in the root directory of this source tree. 
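+#
+# Note: R3F criterion for the sentence prediction task. It samples noise
+# (normal or uniform, scale --eps), adds it to the token embeddings, recomputes
+# the classification logits and penalises the symmetric KL divergence between
+# the clean and noised logits, weighted by --r3f-lambda. R4F corresponds to the
+# same criterion with a spectrally-normalised classification head
+# (--spectral-norm-classification-head).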
+ +import math + +import torch +import torch.nn.functional as F +from fairseq import utils +from fairseq.criterions import FairseqCriterion, register_criterion + + +@register_criterion("sentence_prediction_r3f") +class SentencePredictionR3F(FairseqCriterion): + def __init__( + self, + task, + eps, + r3f_lambda, + noise_type, + classification_head_name, + regression_target, + ): + super().__init__(task) + self.eps = eps + self.r3f_lambda = r3f_lambda + self.noise_type = noise_type + self.classification_head_name = classification_head_name + self.regression_target = regression_target + if self.noise_type in {"normal"}: + self.noise_sampler = torch.distributions.normal.Normal( + loc=0.0, scale=self.eps + ) + elif self.noise_type == "uniform": + self.noise_sampler = torch.distributions.uniform.Uniform( + low=-self.eps, high=self.eps + ) + else: + raise Exception(f"unrecognized noise type {self.noise_type}") + + @staticmethod + def add_args(parser): + # fmt: off + parser.add_argument('--eps', type=float, default=1e-5, + help='noise eps') + parser.add_argument('--r3f-lambda', type=float, default=1.0, + help='lambda for combining logistic loss and noisy KL loss') + parser.add_argument('--noise-type', type=str, default='uniform', + choices=['normal', 'uniform'], + help='type of noises for RXF methods') + parser.add_argument('--classification-head-name', + default='sentence_classification_head', + help='name of the classification head to use') + parser.add_argument('--regression-target', action='store_true') + # fmt: on + + def _get_symm_kl(self, noised_logits, input_logits): + return ( + F.kl_div( + F.log_softmax(noised_logits, dim=-1, dtype=torch.float32), + F.softmax(input_logits, dim=-1, dtype=torch.float32), + None, + None, + "sum", + ) + + F.kl_div( + F.log_softmax(input_logits, dim=-1, dtype=torch.float32), + F.softmax(noised_logits, dim=-1, dtype=torch.float32), + None, + None, + "sum", + ) + ) / noised_logits.size(0) + + def forward(self, model, sample, reduce=True): + """Compute the loss for the given sample. 
+ + Returns a tuple with three elements: + 1) the loss + 2) the sample size, which is used as the denominator for the gradient + 3) logging outputs to display while training + """ + assert ( + hasattr(model, "classification_heads") + and self.classification_head_name in model.classification_heads + ), "model must provide sentence classification head for --criterion=sentence_prediction" + + token_embeddings = model.encoder.sentence_encoder.embed_tokens( + sample["net_input"]["src_tokens"] + ) + input_logits, _ = model( + **sample["net_input"], + features_only=True, + classification_head_name=self.classification_head_name, + token_embeddings=token_embeddings, + ) + if model.training and self.noise_sampler: + noise = self.noise_sampler.sample(sample_shape=token_embeddings.shape).to( + token_embeddings + ) + noised_embeddings = token_embeddings.detach().clone() + noise + + noised_logits, _ = model( + **sample["net_input"], + features_only=True, + classification_head_name=self.classification_head_name, + token_embeddings=noised_embeddings, + ) + symm_kl = self._get_symm_kl(noised_logits, input_logits) + else: + symm_kl = 0 + + targets = model.get_targets(sample, [input_logits]).view(-1) + sample_size = targets.numel() + + if not self.regression_target: + loss = F.nll_loss( + F.log_softmax(input_logits, dim=-1, dtype=torch.float32), + targets, + reduction="sum", + ) + if model.training: + symm_kl = symm_kl * sample_size + loss = loss + self.r3f_lambda * symm_kl + else: + logits = input_logits.squeeze().float() + targets = targets.float() + loss = F.mse_loss(logits, targets, reduction="sum") + + logging_output = { + "loss": utils.item(loss.data) if reduce else loss.data, + "ntokens": sample["ntokens"], + "nsentences": sample_size, + "sample_size": sample_size, + } + + if not self.regression_target: + preds = input_logits.max(dim=1)[1] + logging_output.update(ncorrect=(preds == targets).sum().item()) + + if model.training and self.noise_sampler: + logging_output.update( + symm_kl=utils.item(symm_kl.data) if reduce else symm_kl.data + ) + return loss, sample_size, logging_output + + @staticmethod + def aggregate_logging_outputs(logging_outputs): + """Aggregate logging outputs from data parallel training.""" + loss_sum = sum(log.get("loss", 0) for log in logging_outputs) + symm_kl_sum = sum(log.get("symm_kl", 0) for log in logging_outputs) + ntokens = sum(log.get("ntokens", 0) for log in logging_outputs) + nsentences = sum(log.get("nsentences", 0) for log in logging_outputs) + sample_size = sum(log.get("sample_size", 0) for log in logging_outputs) + + agg_output = { + "loss": loss_sum / sample_size / math.log(2), + "symm_kl": symm_kl_sum / sample_size, + "ntokens": ntokens, + "nsentences": nsentences, + "sample_size": sample_size, + } + + if len(logging_outputs) > 0 and "ncorrect" in logging_outputs[0]: + ncorrect = sum(log.get("ncorrect", 0) for log in logging_outputs) + agg_output.update(accuracy=ncorrect / nsentences) + + if sample_size != ntokens: + agg_output["nll_loss"] = loss_sum / ntokens / math.log(2) + return agg_output diff --git a/fairseq/examples/scaling_nmt/README.md b/fairseq/examples/scaling_nmt/README.md new file mode 100644 index 0000000000000000000000000000000000000000..0cc3360c3bbd58fe35a51591db8f081fc8576877 --- /dev/null +++ b/fairseq/examples/scaling_nmt/README.md @@ -0,0 +1,114 @@ +# Scaling Neural Machine Translation (Ott et al., 2018) + +This page includes instructions for reproducing results from the paper [Scaling Neural Machine Translation (Ott et al., 
2018)](https://arxiv.org/abs/1806.00187). + +## Pre-trained models + +Model | Description | Dataset | Download +---|---|---|--- +`transformer.wmt14.en-fr` | Transformer
([Ott et al., 2018](https://arxiv.org/abs/1806.00187)) | [WMT14 English-French](http://statmt.org/wmt14/translation-task.html#Download) | model:
[download (.tar.bz2)](https://dl.fbaipublicfiles.com/fairseq/models/wmt14.en-fr.joined-dict.transformer.tar.bz2)
newstest2014:
[download (.tar.bz2)](https://dl.fbaipublicfiles.com/fairseq/data/wmt14.en-fr.joined-dict.newstest2014.tar.bz2) +`transformer.wmt16.en-de` | Transformer
([Ott et al., 2018](https://arxiv.org/abs/1806.00187)) | [WMT16 English-German](https://drive.google.com/uc?export=download&id=0B_bZck-ksdkpM25jRUN2X2UxMm8) | model:
[download (.tar.bz2)](https://dl.fbaipublicfiles.com/fairseq/models/wmt16.en-de.joined-dict.transformer.tar.bz2)
newstest2014:
[download (.tar.bz2)](https://dl.fbaipublicfiles.com/fairseq/data/wmt16.en-de.joined-dict.newstest2014.tar.bz2) + +## Training a new model on WMT'16 En-De + +First download the [preprocessed WMT'16 En-De data provided by Google](https://drive.google.com/uc?export=download&id=0B_bZck-ksdkpM25jRUN2X2UxMm8). + +Then: + +##### 1. Extract the WMT'16 En-De data +```bash +TEXT=wmt16_en_de_bpe32k +mkdir -p $TEXT +tar -xzvf wmt16_en_de.tar.gz -C $TEXT +``` + +##### 2. Preprocess the dataset with a joined dictionary +```bash +fairseq-preprocess \ + --source-lang en --target-lang de \ + --trainpref $TEXT/train.tok.clean.bpe.32000 \ + --validpref $TEXT/newstest2013.tok.bpe.32000 \ + --testpref $TEXT/newstest2014.tok.bpe.32000 \ + --destdir data-bin/wmt16_en_de_bpe32k \ + --nwordssrc 32768 --nwordstgt 32768 \ + --joined-dictionary \ + --workers 20 +``` + +##### 3. Train a model +```bash +fairseq-train \ + data-bin/wmt16_en_de_bpe32k \ + --arch transformer_vaswani_wmt_en_de_big --share-all-embeddings \ + --optimizer adam --adam-betas '(0.9, 0.98)' --clip-norm 0.0 \ + --lr 0.0005 --lr-scheduler inverse_sqrt --warmup-updates 4000 --warmup-init-lr 1e-07 \ + --dropout 0.3 --weight-decay 0.0 \ + --criterion label_smoothed_cross_entropy --label-smoothing 0.1 \ + --max-tokens 3584 \ + --fp16 +``` + +Note that the `--fp16` flag requires you have CUDA 9.1 or greater and a Volta GPU or newer. + +***IMPORTANT:*** You will get better performance by training with big batches and +increasing the learning rate. If you want to train the above model with big batches +(assuming your machine has 8 GPUs): +- add `--update-freq 16` to simulate training on 8x16=128 GPUs +- increase the learning rate; 0.001 works well for big batches + +##### 4. Evaluate + +Now we can evaluate our trained model. + +Note that the original [Attention Is All You Need](https://arxiv.org/abs/1706.03762) +paper used a couple tricks to achieve better BLEU scores. We use these same tricks in +the Scaling NMT paper, so it's important to apply them when reproducing our results. + +First, use the [average_checkpoints.py](/scripts/average_checkpoints.py) script to +average the last few checkpoints. Averaging the last 5-10 checkpoints is usually +good, but you may need to adjust this depending on how long you've trained: +```bash +python scripts/average_checkpoints \ + --inputs /path/to/checkpoints \ + --num-epoch-checkpoints 10 \ + --output checkpoint.avg10.pt +``` + +Next, generate translations using a beam width of 4 and length penalty of 0.6: +```bash +fairseq-generate \ + data-bin/wmt16_en_de_bpe32k \ + --path checkpoint.avg10.pt \ + --beam 4 --lenpen 0.6 --remove-bpe > gen.out +``` + +Finally, we apply the ["compound splitting" script](/scripts/compound_split_bleu.sh) to +add spaces around dashes. For example "Café-Liebhaber" would become three tokens: +"Café - Liebhaber". This typically results in larger BLEU scores, but it is not +appropriate to compare these inflated scores to work which does not include this trick. +This trick was used in the [original AIAYN code](https://github.com/tensorflow/tensor2tensor/blob/fc9335c0203685cbbfe2b30c92db4352d8f60779/tensor2tensor/utils/get_ende_bleu.sh), +so we used it in the Scaling NMT paper as well. That said, it's strongly advised to +report [sacrebleu](https://github.com/mjpost/sacrebleu) scores instead. 
+ +To compute "compound split" tokenized BLEU (not recommended!): +```bash +bash scripts/compound_split_bleu.sh gen.out +# BLEU4 = 29.29, 60.3/35.0/22.8/15.3 (BP=1.000, ratio=1.004, syslen=64763, reflen=64496) +``` + +To compute detokenized BLEU with sacrebleu (preferred): +```bash +bash scripts/sacrebleu.sh wmt14/full en de gen.out +# BLEU+case.mixed+lang.en-de+numrefs.1+smooth.exp+test.wmt14/full+tok.13a+version.1.4.3 = 28.6 59.3/34.3/22.1/14.9 (BP = 1.000 ratio = 1.016 hyp_len = 63666 ref_len = 62688) +``` + +## Citation + +```bibtex +@inproceedings{ott2018scaling, + title = {Scaling Neural Machine Translation}, + author = {Ott, Myle and Edunov, Sergey and Grangier, David and Auli, Michael}, + booktitle = {Proceedings of the Third Conference on Machine Translation (WMT)}, + year = 2018, +} +``` diff --git a/fairseq/examples/shuffled_word_order/README.finetuning.md b/fairseq/examples/shuffled_word_order/README.finetuning.md new file mode 100644 index 0000000000000000000000000000000000000000..ecbcb65884640c3327a2cbaef8aad4f3cfe812f7 --- /dev/null +++ b/fairseq/examples/shuffled_word_order/README.finetuning.md @@ -0,0 +1,135 @@ +# Fine-tuning details + +For each task (GLUE and PAWS), we perform hyperparam search for each model, and report the mean and standard deviation across 5 seeds of the best model. First, get the datasets following the instructions in [RoBERTa fine-tuning README](../roberta/README.glue.md). Alternatively, you can use [huggingface datasets](https://huggingface.co/docs/datasets/) to get the task data: + +```python +from datasets import load_dataset +import pandas as pd +from pathlib import Path + +key2file = { +"paws": { + "loc": "paws_data", + "columns": ["id", "sentence1", "sentence2", "label"], + "train": "train.tsv", + "validation": "dev.tsv", + "test": "test.tsv" + } +} + +task_data = load_dataset("paws", "labeled_final") +task_config = key2file["paws"] +save_path = Path(task_config["loc"]) +save_path.mkdir(exist_ok=True, parents=True) +for key, fl in task_config.items(): + if key in ["loc", "columns"]: + continue + print(f"Reading {key}") + columns = task_config["columns"] + df = pd.DataFrame(task_data[key]) + print(df.columns) + df = df[columns] + print(f"Got {len(df)} records") + save_loc = save_path / fl + print(f"Saving to : {save_loc}") + df.to_csv(save_loc, sep="\t", header=None, index=None) + +``` + +- Preprocess using RoBERTa GLUE preprocessing script, while keeping in mind the column numbers for `sentence1`, `sentence2` and `label` (which is 0,1,2 if you save the data according to the above example.) +- Then, fine-tuning is performed similarly to RoBERTa (for example, in case of RTE): + +```bash +TOTAL_NUM_UPDATES=30875 # 10 epochs through RTE for bsz 16 +WARMUP_UPDATES=1852 # 6 percent of the number of updates +LR=2e-05 # Peak LR for polynomial LR scheduler. +NUM_CLASSES=2 +MAX_SENTENCES=16 # Batch size. 
+SHUFFLED_ROBERTA_PATH=/path/to/shuffled_roberta/model.pt + +CUDA_VISIBLE_DEVICES=0 fairseq-train RTE-bin/ \ + --restore-file $SHUFFLED_ROBERTA_PATH \ + --max-positions 512 \ + --batch-size $MAX_SENTENCES \ + --max-tokens 4400 \ + --task sentence_prediction \ + --reset-optimizer --reset-dataloader --reset-meters \ + --required-batch-size-multiple 1 \ + --init-token 0 --separator-token 2 \ + --arch roberta_large \ + --criterion sentence_prediction \ + --num-classes $NUM_CLASSES \ + --dropout 0.1 --attention-dropout 0.1 \ + --weight-decay 0.1 --optimizer adam --adam-betas "(0.9, 0.98)" --adam-eps 1e-06 \ + --clip-norm 0.0 \ + --lr-scheduler polynomial_decay --lr $LR --total-num-update $TOTAL_NUM_UPDATES --warmup-updates $WARMUP_UPDATES \ + --fp16 --fp16-init-scale 4 --threshold-loss-scale 1 --fp16-scale-window 128 \ + --max-epoch 10 \ + --find-unused-parameters \ + --best-checkpoint-metric accuracy --maximize-best-checkpoint-metric; +``` + +- `TOTAL_NUM_UPDATES` is computed based on the `--batch_size` value and the dataset size. +- `WARMUP_UPDATES` is computed as 6% of `TOTAL_NUM_UPDATES` +- Best hyperparam of `--lr` and `--batch_size` is reported below: + +## `--lr` + +| | name | RTE | MRPC | SST-2 | CoLA | QQP | QNLI | MNLI | PAWS | +| --: | :----------- | ----: | ----: | ----: | ----: | ----: | ----: | ----: | ----: | +| 0 | original | 2e-05 | 2e-05 | 1e-05 | 2e-05 | 1e-05 | 1e-05 | 1e-05 | 2e-05 | +| 1 | n_1 | 2e-05 | 1e-05 | 1e-05 | 1e-05 | 3e-05 | 1e-05 | 2e-05 | 2e-05 | +| 2 | n_2 | 2e-05 | 2e-05 | 1e-05 | 1e-05 | 2e-05 | 1e-05 | 1e-05 | 3e-05 | +| 3 | n_3 | 3e-05 | 1e-05 | 2e-05 | 2e-05 | 3e-05 | 1e-05 | 1e-05 | 2e-05 | +| 4 | n_4 | 3e-05 | 1e-05 | 2e-05 | 2e-05 | 2e-05 | 1e-05 | 1e-05 | 2e-05 | +| 5 | r512 | 1e-05 | 3e-05 | 2e-05 | 2e-05 | 3e-05 | 2e-05 | 3e-05 | 2e-05 | +| 6 | rand_corpus | 2e-05 | 1e-05 | 3e-05 | 1e-05 | 3e-05 | 3e-05 | 3e-05 | 2e-05 | +| 7 | rand_uniform | 2e-05 | 1e-05 | 3e-05 | 2e-05 | 3e-05 | 3e-05 | 3e-05 | 1e-05 | +| 8 | rand_init | 1e-05 | 1e-05 | 3e-05 | 1e-05 | 1e-05 | 1e-05 | 2e-05 | 1e-05 | +| 9 | no_pos | 1e-05 | 3e-05 | 2e-05 | 1e-05 | 1e-05 | 1e-05 | 1e-05 | 1e-05 | + +## `--batch_size` + +| | name | RTE | MRPC | SST-2 | CoLA | QQP | QNLI | MNLI | PAWS | +| --: | :----------- | --: | ---: | ----: | ---: | --: | ---: | ---: | ---: | +| 0 | orig | 16 | 16 | 32 | 16 | 16 | 32 | 32 | 16 | +| 1 | n_1 | 32 | 32 | 16 | 32 | 32 | 16 | 32 | 16 | +| 2 | n_2 | 32 | 16 | 32 | 16 | 32 | 32 | 16 | 32 | +| 3 | n_3 | 32 | 32 | 16 | 32 | 32 | 16 | 32 | 32 | +| 4 | n_4 | 32 | 16 | 32 | 16 | 32 | 32 | 32 | 32 | +| 5 | r512 | 32 | 16 | 16 | 32 | 32 | 16 | 16 | 16 | +| 6 | rand_corpus | 16 | 16 | 16 | 16 | 32 | 16 | 16 | 32 | +| 7 | rand_uniform | 16 | 32 | 16 | 16 | 32 | 16 | 16 | 16 | +| 8 | rand_init | 16 | 16 | 32 | 16 | 16 | 16 | 32 | 16 | +| 9 | no_pos | 16 | 32 | 16 | 16 | 32 | 16 | 16 | 16 | + +- Perform inference similar to RoBERTa as well: + +```python +from fairseq.models.roberta import RobertaModel + +roberta = RobertaModel.from_pretrained( + 'checkpoints/', + checkpoint_file='checkpoint_best.pt', + data_name_or_path='PAWS-bin' +) + +label_fn = lambda label: roberta.task.label_dictionary.string( + [label + roberta.task.label_dictionary.nspecial] +) +ncorrect, nsamples = 0, 0 +roberta.cuda() +roberta.eval() +with open('paws_data/dev.tsv') as fin: + fin.readline() + for index, line in enumerate(fin): + tokens = line.strip().split('\t') + sent1, sent2, target = tokens[0], tokens[1], tokens[2] + tokens = roberta.encode(sent1, sent2) + prediction = 
roberta.predict('sentence_classification_head', tokens).argmax().item() + prediction_label = label_fn(prediction) + ncorrect += int(prediction_label == target) + nsamples += 1 +print('| Accuracy: ', float(ncorrect)/float(nsamples)) + +``` diff --git a/fairseq/examples/shuffled_word_order/README.md b/fairseq/examples/shuffled_word_order/README.md new file mode 100644 index 0000000000000000000000000000000000000000..6ce0b3927dcec6b375149221d0c0511e9401d70b --- /dev/null +++ b/fairseq/examples/shuffled_word_order/README.md @@ -0,0 +1,94 @@ +# Masked Language Modeling and the Distributional Hypothesis: Order Word Matters Pre-training for Little + +[https://arxiv.org/abs/2104.06644](https://arxiv.org/abs/2104.06644) + +## Introduction + +In this work, we pre-train [RoBERTa](../roberta) base on various word shuffled variants of BookWiki corpus (16GB). We observe that a word shuffled pre-trained model achieves surprisingly good scores on GLUE, PAWS and several parametric probing tasks. Please read our paper for more details on the experiments. + +## Pre-trained models + +| Model | Description | Download | +| ------------------------------------- | -------------------------------------------------------------------------------------------------- | --------------------------------------------------------------------------------------------------------------------------------------------- | +| `roberta.base.orig` | RoBERTa (base) trained on natural corpus | [roberta.base.orig.tar.gz](https://dl.fbaipublicfiles.com/unnatural_pretraining/roberta.base.orig.tar.gz) | +| `roberta.base.shuffle.n1` | RoBERTa (base) trained on n=1 gram sentence word shuffled data | [roberta.base.shuffle.n1.tar.gz](https://dl.fbaipublicfiles.com/unnatural_pretraining/roberta.base.shuffle.n1.tar.gz) | +| `roberta.base.shuffle.n2` | RoBERTa (base) trained on n=2 gram sentence word shuffled data | [roberta.base.shuffle.n2.tar.gz](https://dl.fbaipublicfiles.com/unnatural_pretraining/roberta.base.shuffle.n2.tar.gz) | +| `roberta.base.shuffle.n3` | RoBERTa (base) trained on n=3 gram sentence word shuffled data | [roberta.base.shuffle.n3.tar.gz](https://dl.fbaipublicfiles.com/unnatural_pretraining/roberta.base.shuffle.n3.tar.gz) | +| `roberta.base.shuffle.n4` | RoBERTa (base) trained on n=4 gram sentence word shuffled data | [roberta.base.shuffle.n4.tar.gz](https://dl.fbaipublicfiles.com/unnatural_pretraining/roberta.base.shuffle.n4.tar.gz) | +| `roberta.base.shuffle.512` | RoBERTa (base) trained on unigram 512 word block shuffled data | [roberta.base.shuffle.512.tar.gz](https://dl.fbaipublicfiles.com/unnatural_pretraining/roberta.base.shuffle.512.tar.gz) | +| `roberta.base.shuffle.corpus` | RoBERTa (base) trained on unigram corpus word shuffled data | [roberta.base.shuffle.corpus.tar.gz](https://dl.fbaipublicfiles.com/unnatural_pretraining/roberta.base.shuffle.corpus.tar.gz) | +| `roberta.base.shuffle.corpus_uniform` | RoBERTa (base) trained on unigram corpus word shuffled data, where all words are uniformly sampled | [roberta.base.shuffle.corpus_uniform.tar.gz](https://dl.fbaipublicfiles.com/unnatural_pretraining/roberta.base.shuffle.corpus_uniform.tar.gz) | +| `roberta.base.nopos` | RoBERTa (base) without positional embeddings, trained on natural corpus | [roberta.base.nopos.tar.gz](https://dl.fbaipublicfiles.com/unnatural_pretraining/roberta.base.nopos.tar.gz) | + +## Results + +[GLUE (Wang et al, 2019)](https://gluebenchmark.com/) & [PAWS (Zhang et al, 2019)](https://github.com/google-research-datasets/paws) _(dev set, single 
model, single-task fine-tuning, median of 5 seeds)_ + +| name | CoLA | MNLI | MRPC | PAWS | QNLI | QQP | RTE | SST-2 | +| :----------------------------------- | ----: | ----: | ----: | ----: | ----: | ----: | ----: | ----: | +| `roberta.base.orig` | 61.4 | 86.11 | 89.19 | 94.46 | 92.53 | 91.26 | 74.64 | 93.92 | +| `roberta.base.shuffle.n1` | 35.15 | 82.64 | 86 | 89.97 | 89.02 | 91.01 | 69.02 | 90.47 | +| `roberta.base.shuffle.n2` | 54.37 | 83.43 | 86.24 | 93.46 | 90.44 | 91.36 | 70.83 | 91.79 | +| `roberta.base.shuffle.n3` | 48.72 | 83.85 | 86.36 | 94.05 | 91.69 | 91.24 | 70.65 | 92.02 | +| `roberta.base.shuffle.n4` | 58.64 | 83.77 | 86.98 | 94.32 | 91.69 | 91.4 | 70.83 | 92.48 | +| `roberta.base.shuffle.512` | 12.76 | 77.52 | 79.61 | 84.77 | 85.19 | 90.2 | 56.52 | 86.34 | +| `roberta.base.shuffle.corpus` | 0 | 71.9 | 70.52 | 58.52 | 71.11 | 85.52 | 53.99 | 83.35 | +| `roberta.base.shuffle.corpus_random` | 9.19 | 72.33 | 70.76 | 58.42 | 77.76 | 85.93 | 53.99 | 84.04 | +| `roberta.base.nopos` | 0 | 63.5 | 72.73 | 57.08 | 77.72 | 87.87 | 54.35 | 83.24 | + +For more results on probing tasks, please refer to [our paper](https://arxiv.org/abs/2104.06644). + +## Example Usage + +Follow the same usage as in [RoBERTa](https://github.com/pytorch/fairseq/tree/main/examples/roberta) to load and test your models: + +```python +# Download roberta.base.shuffle.n1 model +wget https://dl.fbaipublicfiles.com/unnatural_pretraining/roberta.base.shuffle.n1.tar.gz +tar -xzvf roberta.base.shuffle.n1.tar.gz +# Copy the dictionary files +cd roberta.base.shuffle.n1.tar.gz +wget -O dict.txt https://dl.fbaipublicfiles.com/fairseq/gpt2_bpe/dict.txt && wget -O encoder.json https://dl.fbaipublicfiles.com/fairseq/gpt2_bpe/encoder.json && wget -O vocab.bpe https://dl.fbaipublicfiles.com/fairseq/gpt2_bpe/vocab.bpe +cd .. + +# Load the model in fairseq +from fairseq.models.roberta import RobertaModel +roberta = RobertaModel.from_pretrained('/path/to/roberta.base.shuffle.n1', checkpoint_file='model.pt') +roberta.eval() # disable dropout (or leave in train mode to finetune) +``` + +We have also provided a [Google Colab](https://colab.research.google.com/drive/1IJDVfNVWdvRfLjphQKBGzmob84t-OXpm) notebook to demonstrate the loading of the model. The models were trained on top of Fairseq from the following commit: [62cff008ebeeed855093837507d5e6bf52065ee6](https://github.com/pytorch/fairseq/commit/62cff008ebeeed855093837507d5e6bf52065ee6). + +**Note**: The model trained without positional embeddings (`roberta.base.nopos`) is a modified `RoBERTa` model, where the positional embeddings are not used. Thus, the typical `from_pretrained` method on fairseq version of RoBERTa will not be able to load the above model weights. To do so, construct a new `RoBERTaModel` object by setting the flag `use_positional_embeddings` to `False` (or [in the latest code](https://github.com/pytorch/fairseq/blob/main/fairseq/models/roberta/model.py#L543), set `no_token_positional_embeddings` to `True`), and then load the individual weights. + +## Fine-tuning Evaluation + +We provide the trained fine-tuned models on MNLI here for each model above for quick evaluation (1 seed for each model). Please refer to [finetuning details](README.finetuning.md) for the parameters of these models. Follow [RoBERTa](https://github.com/pytorch/fairseq/tree/main/examples/roberta) instructions to evaluate these models. 
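+
+As a rough sketch, the MNLI-matched dev accuracy of one of these checkpoints can
+be computed in the same way as the PAWS example in [finetuning details](README.finetuning.md).
+The extracted checkpoint directory, the `MNLI-bin` data path, the
+`dev_matched.tsv` column layout and the classification head name below are
+assumptions that may need to be adjusted for your setup:
+
+```python
+from fairseq.models.roberta import RobertaModel
+
+# Assumes roberta.base.orig.mnli.tar.gz was extracted to 'roberta.base.orig.mnli'
+# and that MNLI was preprocessed into 'MNLI-bin' following the RoBERTa GLUE README.
+roberta = RobertaModel.from_pretrained(
+    'roberta.base.orig.mnli',
+    checkpoint_file='model.pt',
+    data_name_or_path='MNLI-bin'
+)
+
+label_fn = lambda label: roberta.task.label_dictionary.string(
+    [label + roberta.task.label_dictionary.nspecial]
+)
+ncorrect, nsamples = 0, 0
+roberta.cuda()
+roberta.eval()
+with open('glue_data/MNLI/dev_matched.tsv') as fin:
+    fin.readline()  # skip the header
+    for line in fin:
+        tokens = line.strip().split('\t')
+        # columns 8 and 9 hold the two sentences, the last column the gold label
+        sent1, sent2, target = tokens[8], tokens[9], tokens[-1]
+        tokens = roberta.encode(sent1, sent2)
+        prediction = roberta.predict('sentence_classification_head', tokens).argmax().item()
+        ncorrect += int(label_fn(prediction) == target)
+        nsamples += 1
+print('| Accuracy: ', float(ncorrect) / float(nsamples))
+```
+
+The available fine-tuned checkpoints are listed below.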
+ +| Model | MNLI M Dev Accuracy | Link | +| :----------------------------------------- | :------------------ | :--------------------------------------------------------------------------------------------------------------- | +| `roberta.base.orig.mnli` | 86.14 | [Download](https://dl.fbaipublicfiles.com/unnatural_pretraining/roberta.base.orig.mnli.tar.gz) | +| `roberta.base.shuffle.n1.mnli` | 82.55 | [Download](https://dl.fbaipublicfiles.com/unnatural_pretraining/roberta.base.shuffle.n1.mnli.tar.gz) | +| `roberta.base.shuffle.n2.mnli` | 83.21 | [Download](https://dl.fbaipublicfiles.com/unnatural_pretraining/roberta.base.shuffle.n2.mnli.tar.gz) | +| `roberta.base.shuffle.n3.mnli` | 83.89 | [Download](https://dl.fbaipublicfiles.com/unnatural_pretraining/roberta.base.shuffle.n3.mnli.tar.gz) | +| `roberta.base.shuffle.n4.mnli` | 84.00 | [Download](https://dl.fbaipublicfiles.com/unnatural_pretraining/roberta.base.shuffle.n4.mnli.tar.gz) | +| `roberta.base.shuffle.512.mnli` | 77.22 | [Download](https://dl.fbaipublicfiles.com/unnatural_pretraining/roberta.base.shuffle.512.mnli.tar.gz) | +| `roberta.base.shuffle.corpus.mnli` | 71.88 | [Download](https://dl.fbaipublicfiles.com/unnatural_pretraining/roberta.base.shuffle.corpus.mnli.tar.gz) | +| `roberta.base.shuffle.corpus_uniform.mnli` | 72.46 | [Download](https://dl.fbaipublicfiles.com/unnatural_pretraining/roberta.base.shuffle.corpus_uniform.mnli.tar.gz) | + +## Citation + +```bibtex +@misc{sinha2021masked, + title={Masked Language Modeling and the Distributional Hypothesis: Order Word Matters Pre-training for Little}, + author={Koustuv Sinha and Robin Jia and Dieuwke Hupkes and Joelle Pineau and Adina Williams and Douwe Kiela}, + year={2021}, + eprint={2104.06644}, + archivePrefix={arXiv}, + primaryClass={cs.CL} +} +``` + +## Contact + +For questions and comments, please reach out to Koustuv Sinha (koustuv.sinha@mail.mcgill.ca). diff --git a/fairseq/examples/simultaneous_translation/README.md b/fairseq/examples/simultaneous_translation/README.md new file mode 100644 index 0000000000000000000000000000000000000000..62a005e0ec6f15af9015d335e34b45df6ed89b6c --- /dev/null +++ b/fairseq/examples/simultaneous_translation/README.md @@ -0,0 +1,5 @@ +# Simultaneous Translation +Examples of simultaneous translation in fairseq +- [English-to-Japanese text-to-text wait-k model](docs/enja-waitk.md) +- [English-to-Germen text-to-text monotonic multihead attention model](docs/ende-mma.md) +- [English-to-Germen speech-to-text simultaneous translation model](../speech_to_text/docs/simulst_mustc_example.md) diff --git a/fairseq/examples/simultaneous_translation/__init__.py b/fairseq/examples/simultaneous_translation/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..5835316ba9b23c0d99d1a8f109ee047682211546 --- /dev/null +++ b/fairseq/examples/simultaneous_translation/__init__.py @@ -0,0 +1,6 @@ +# Copyright (c) Facebook, Inc. and its affiliates. +# +# This source code is licensed under the MIT license found in the +# LICENSE file in the root directory of this source tree. + +from . 
import models # noqa diff --git a/fairseq/examples/simultaneous_translation/docs/ende-mma.md b/fairseq/examples/simultaneous_translation/docs/ende-mma.md new file mode 100644 index 0000000000000000000000000000000000000000..241d604a3b31a37755da68aad6ff47d46891d3fc --- /dev/null +++ b/fairseq/examples/simultaneous_translation/docs/ende-mma.md @@ -0,0 +1,74 @@ +# Simultaneous Machine Translation + +This directory contains the code for the paper [Monotonic Multihead Attention](https://openreview.net/forum?id=Hyg96gBKPS) + +## Prepare Data + +[Please follow the instructions to download and preprocess the WMT'15 En-De dataset.](https://github.com/pytorch/fairseq/tree/simulastsharedtask/examples/translation#prepare-wmt14en2desh) + +Another example of training an English to Japanese model can be found [here](docs/enja.md) + +## Training + +- MMA-IL + +```shell +fairseq-train \ + data-bin/wmt15_en_de_32k \ + --simul-type infinite_lookback \ + --user-dir $FAIRSEQ/example/simultaneous_translation \ + --mass-preservation \ + --criterion latency_augmented_label_smoothed_cross_entropy \ + --latency-weight-avg 0.1 \ + --max-update 50000 \ + --arch transformer_monotonic_iwslt_de_en save_dir_key=lambda \ + --optimizer adam --adam-betas '(0.9, 0.98)' \ + --lr-scheduler 'inverse_sqrt' \ + --warmup-init-lr 1e-7 --warmup-updates 4000 \ + --lr 5e-4 --stop-min-lr 1e-9 --clip-norm 0.0 --weight-decay 0.0001\ + --dropout 0.3 \ + --label-smoothing 0.1\ + --max-tokens 3584 +``` + +- MMA-H + +```shell +fairseq-train \ + data-bin/wmt15_en_de_32k \ + --simul-type hard_aligned \ + --user-dir $FAIRSEQ/example/simultaneous_translation \ + --mass-preservation \ + --criterion latency_augmented_label_smoothed_cross_entropy \ + --latency-weight-var 0.1 \ + --max-update 50000 \ + --arch transformer_monotonic_iwslt_de_en save_dir_key=lambda \ + --optimizer adam --adam-betas '(0.9, 0.98)' \ + --lr-scheduler 'inverse_sqrt' \ + --warmup-init-lr 1e-7 --warmup-updates 4000 \ + --lr 5e-4 --stop-min-lr 1e-9 --clip-norm 0.0 --weight-decay 0.0001\ + --dropout 0.3 \ + --label-smoothing 0.1\ + --max-tokens 3584 +``` + +- wait-k + +```shell +fairseq-train \ + data-bin/wmt15_en_de_32k \ + --simul-type wait-k \ + --waitk-lagging 3 \ + --user-dir $FAIRSEQ/example/simultaneous_translation \ + --mass-preservation \ + --criterion latency_augmented_label_smoothed_cross_entropy \ + --max-update 50000 \ + --arch transformer_monotonic_iwslt_de_en save_dir_key=lambda \ + --optimizer adam --adam-betas '(0.9, 0.98)' \ + --lr-scheduler 'inverse_sqrt' \ + --warmup-init-lr 1e-7 --warmup-updates 4000 \ + --lr 5e-4 --stop-min-lr 1e-9 --clip-norm 0.0 --weight-decay 0.0001\ + --dropout 0.3 \ + --label-smoothing 0.1\ + --max-tokens 3584 +``` diff --git a/fairseq/examples/simultaneous_translation/docs/enja-waitk.md b/fairseq/examples/simultaneous_translation/docs/enja-waitk.md new file mode 100644 index 0000000000000000000000000000000000000000..fb9d82576f80b4405564a99774fc98ac2fe6ad3b --- /dev/null +++ b/fairseq/examples/simultaneous_translation/docs/enja-waitk.md @@ -0,0 +1,106 @@ +# An example of English to Japaneses Simultaneous Translation System + +This is an example of training and evaluating a transformer *wait-k* English to Japanese simultaneous text-to-text translation model. + +## Data Preparation +This section introduces the data preparation for training and evaluation. 
+If you only want to evaluate the model, please jump to [Inference & Evaluation](#inference-&-evaluation) + +For illustration, we only use the following subsets of the available data from [WMT20 news translation task](http://www.statmt.org/wmt20/translation-task.html), which results in 7,815,391 sentence pairs. +- News Commentary v16 +- Wiki Titles v3 +- WikiMatrix V1 +- Japanese-English Subtitle Corpus +- The Kyoto Free Translation Task Corpus + +We use WMT20 development data as development set. Training `transformer_vaswani_wmt_en_de_big` model on such amount of data will result in 17.3 BLEU with greedy search and 19.7 with beam (10) search. Notice that a better performance can be achieved with the full WMT training data. + +We use [sentencepiece](https://github.com/google/sentencepiece) toolkit to tokenize the data with a vocabulary size of 32000. +Additionally, we filtered out the sentences longer than 200 words after tokenization. +Assuming the tokenized text data is saved at `${DATA_DIR}`, +we prepare the data binary with the following command. + +```bash +fairseq-preprocess \ + --source-lang en --target-lang ja \ + --trainpref ${DATA_DIR}/train \ + --validpref ${DATA_DIR}/dev \ + --testpref ${DATA_DIR}/test \ + --destdir ${WMT20_ENJA_DATA_BIN} \ + --nwordstgt 32000 --nwordssrc 32000 \ + --workers 20 +``` + +## Simultaneous Translation Model Training +To train a wait-k `(k=10)` model. +```bash +fairseq-train ${WMT20_ENJA_DATA_BIN} \ + --save-dir ${SAVEDIR} + --simul-type waitk \ + --waitk-lagging 10 \ + --max-epoch 70 \ + --arch transformer_monotonic_vaswani_wmt_en_de_big \ + --optimizer adam \ + --adam-betas '(0.9, 0.98)' \ + --lr-scheduler inverse_sqrt \ + --warmup-init-lr 1e-07 \ + --warmup-updates 4000 \ + --lr 0.0005 \ + --stop-min-lr 1e-09 \ + --clip-norm 10.0 \ + --dropout 0.3 \ + --weight-decay 0.0 \ + --criterion label_smoothed_cross_entropy \ + --label-smoothing 0.1 \ + --max-tokens 3584 +``` +This command is for training on 8 GPUs. Equivalently, the model can be trained on one GPU with `--update-freq 8`. + +## Inference & Evaluation +First of all, install [SimulEval](https://github.com/facebookresearch/SimulEval) for evaluation. + +```bash +git clone https://github.com/facebookresearch/SimulEval.git +cd SimulEval +pip install -e . +``` + +The following command is for the evaluation. +Assuming the source and reference files are `${SRC_FILE}` and `${REF_FILE}`, the sentencepiece model file for English is saved at `${SRC_SPM_PATH}` + + +```bash +simuleval \ + --source ${SRC_FILE} \ + --target ${TGT_FILE} \ + --data-bin ${WMT20_ENJA_DATA_BIN} \ + --sacrebleu-tokenizer ja-mecab \ + --eval-latency-unit char \ + --no-space \ + --src-splitter-type sentencepiecemodel \ + --src-splitter-path ${SRC_SPM_PATH} \ + --agent ${FAIRSEQ}/examples/simultaneous_translation/agents/simul_trans_text_agent_enja.py \ + --model-path ${SAVE_DIR}/${CHECKPOINT_FILENAME} \ + --output ${OUTPUT} \ + --scores +``` + +The `--data-bin` should be the same in previous sections if you prepare the data from the scratch. +If only for evaluation, a prepared data directory can be found [here](https://dl.fbaipublicfiles.com/simultaneous_translation/wmt20_enja_medium_databin.tgz) and a pretrained checkpoint (wait-k=10 model) can be downloaded from [here](https://dl.fbaipublicfiles.com/simultaneous_translation/wmt20_enja_medium_wait10_ckpt.pt). 
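+
+If you go that route, here is a minimal sketch of fetching the prepared data and checkpoint with `wget` (the file names come from the links above; where the archive unpacks is not specified here, so point `${WMT20_ENJA_DATA_BIN}` at the extracted directory and use the downloaded checkpoint as `${SAVE_DIR}/${CHECKPOINT_FILENAME}`):
+
+```bash
+wget https://dl.fbaipublicfiles.com/simultaneous_translation/wmt20_enja_medium_databin.tgz
+wget https://dl.fbaipublicfiles.com/simultaneous_translation/wmt20_enja_medium_wait10_ckpt.pt
+tar -xzvf wmt20_enja_medium_databin.tgz
+```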
+ +The output should look like this: +```bash +{ + "Quality": { + "BLEU": 11.442253287568398 + }, + "Latency": { + "AL": 8.6587861866951, + "AP": 0.7863304776251316, + "DAL": 9.477850951194764 + } +} +``` +The latency is evaluated by characters (`--eval-latency-unit`) on the target side. The latency is evaluated with `sacrebleu` with `MeCab` tokenizer `--sacrebleu-tokenizer ja-mecab`. `--no-space` indicates that do not add space when merging the predicted words. + +If `--output ${OUTPUT}` option is used, the detailed log and scores will be stored under the `${OUTPUT}` directory. diff --git a/fairseq/examples/simultaneous_translation/eval/agents/simul_t2t_enja.py b/fairseq/examples/simultaneous_translation/eval/agents/simul_t2t_enja.py new file mode 100644 index 0000000000000000000000000000000000000000..8f3c8703ca37398b9d389ce5181bdfac2333cdf2 --- /dev/null +++ b/fairseq/examples/simultaneous_translation/eval/agents/simul_t2t_enja.py @@ -0,0 +1,226 @@ +# Copyright (c) Facebook, Inc. and its affiliates. +# +# This source code is licensed under the MIT license found in the +# LICENSE file in the root directory of this source tree. + +import os + +from fairseq import checkpoint_utils, tasks +import sentencepiece as spm +import torch + +try: + from simuleval import READ_ACTION, WRITE_ACTION, DEFAULT_EOS + from simuleval.agents import TextAgent +except ImportError: + print("Please install simuleval 'pip install simuleval'") + + +BOS_PREFIX = "\u2581" + + +class SimulTransTextAgentJA(TextAgent): + """ + Simultaneous Translation + Text agent for Japanese + """ + def __init__(self, args): + + # Whether use gpu + self.gpu = getattr(args, "gpu", False) + + # Max len + self.max_len = args.max_len + + # Load Model + self.load_model_vocab(args) + + # build word splitter + self.build_word_splitter(args) + + self.eos = DEFAULT_EOS + + def initialize_states(self, states): + states.incremental_states = dict() + states.incremental_states["online"] = dict() + + def to_device(self, tensor): + if self.gpu: + return tensor.cuda() + else: + return tensor.cpu() + + def load_model_vocab(self, args): + + filename = args.model_path + if not os.path.exists(filename): + raise IOError("Model file not found: {}".format(filename)) + + state = checkpoint_utils.load_checkpoint_to_cpu(filename) + + task_args = state["cfg"]["task"] + task_args.data = args.data_bin + + task = tasks.setup_task(task_args) + + # build model for ensemble + state["cfg"]["model"].load_pretrained_encoder_from = None + state["cfg"]["model"].load_pretrained_decoder_from = None + + self.model = task.build_model(state["cfg"]["model"]) + self.model.load_state_dict(state["model"], strict=True) + self.model.eval() + self.model.share_memory() + + if self.gpu: + self.model.cuda() + + # Set dictionary + self.dict = {} + self.dict["tgt"] = task.target_dictionary + self.dict["src"] = task.source_dictionary + + @staticmethod + def add_args(parser): + # fmt: off + parser.add_argument('--model-path', type=str, required=True, + help='path to your pretrained model.') + parser.add_argument("--data-bin", type=str, required=True, + help="Path of data binary") + parser.add_argument("--max-len", type=int, default=100, + help="Max length of translation") + parser.add_argument("--tgt-splitter-type", type=str, default="SentencePiece", + help="Subword splitter type for target text.") + parser.add_argument("--tgt-splitter-path", type=str, default=None, + help="Subword splitter model path for target text.") + parser.add_argument("--src-splitter-type", type=str, 
default="SentencePiece", + help="Subword splitter type for source text.") + parser.add_argument("--src-splitter-path", type=str, default=None, + help="Subword splitter model path for source text.") + # fmt: on + return parser + + def build_word_splitter(self, args): + self.spm = {} + for lang in ['src', 'tgt']: + if getattr(args, f'{lang}_splitter_type', None): + path = getattr(args, f'{lang}_splitter_path', None) + if path: + self.spm[lang] = spm.SentencePieceProcessor() + self.spm[lang].Load(path) + + def segment_to_units(self, segment, states): + # Split a full word (segment) into subwords (units) + return self.spm['src'].EncodeAsPieces(segment) + + def update_model_encoder(self, states): + if len(states.units.source) == 0: + return + + src_indices = [ + self.dict['src'].index(x) + for x in states.units.source.value + ] + + if states.finish_read(): + # Append the eos index when the prediction is over + src_indices += [self.dict["tgt"].eos_index] + + src_indices = self.to_device( + torch.LongTensor(src_indices).unsqueeze(0) + ) + src_lengths = self.to_device( + torch.LongTensor([src_indices.size(1)]) + ) + + states.encoder_states = self.model.encoder(src_indices, src_lengths) + + torch.cuda.empty_cache() + + def update_states_read(self, states): + # Happens after a read action. + self.update_model_encoder(states) + + def units_to_segment(self, units, states): + # Merge sub words (units) to full word (segment). + # For Japanese, we can directly send + # the untokenized token to server except the BOS token + # with following option + # --sacrebleu-tokenizer MeCab + # --eval-latency-unit char + # --no-space + token = units.value.pop() + + if ( + token == self.dict["tgt"].eos_word + or len(states.segments.target) > self.max_len + ): + return DEFAULT_EOS + + if BOS_PREFIX == token: + return None + if token[0] == BOS_PREFIX: + return token[1:] + else: + return token + + def policy(self, states): + + if not getattr(states, "encoder_states", None): + # No encoder states, read a token first + return READ_ACTION + + # encode previous predicted target tokens + tgt_indices = self.to_device( + torch.LongTensor( + [self.model.decoder.dictionary.eos()] + + [ + self.dict['tgt'].index(x) + for x in states.units.target.value + if x is not None + ] + ).unsqueeze(0) + ) + + # Current steps + states.incremental_states["steps"] = { + "src": states.encoder_states["encoder_out"][0].size(0), + "tgt": 1 + len(states.units.target), + } + + # Online only means the reading is not finished + states.incremental_states["online"]["only"] = ( + torch.BoolTensor([not states.finish_read()]) + ) + + x, outputs = self.model.decoder.forward( + prev_output_tokens=tgt_indices, + encoder_out=states.encoder_states, + incremental_state=states.incremental_states, + ) + + states.decoder_out = x + + torch.cuda.empty_cache() + + if outputs.action == 0: + return READ_ACTION + else: + return WRITE_ACTION + + def predict(self, states): + # Predict target token from decoder states + decoder_states = states.decoder_out + + lprobs = self.model.get_normalized_probs( + [decoder_states[:, -1:]], log_probs=True + ) + + index = lprobs.argmax(dim=-1)[0, 0].item() + + if index != self.dict['tgt'].eos_index: + token = self.dict['tgt'].string([index]) + else: + token = self.dict['tgt'].eos_word + + return token diff --git a/fairseq/examples/simultaneous_translation/models/__init__.py b/fairseq/examples/simultaneous_translation/models/__init__.py new file mode 100644 index 
0000000000000000000000000000000000000000..257a96593ff7af93c206c066d8db4ad795b2ae36 --- /dev/null +++ b/fairseq/examples/simultaneous_translation/models/__init__.py @@ -0,0 +1,15 @@ +# Copyright (c) Facebook, Inc. and its affiliates. +# +# This source code is licensed under the MIT license found in the +# LICENSE file in the root directory of this source tree. + +import importlib +import os + + +for file in sorted(os.listdir(os.path.dirname(__file__))): + if file.endswith(".py") and not file.startswith("_"): + model_name = file[: file.find(".py")] + importlib.import_module( + "examples.simultaneous_translation.models." + model_name + ) diff --git a/fairseq/examples/simultaneous_translation/models/__pycache__/__init__.cpython-310.pyc b/fairseq/examples/simultaneous_translation/models/__pycache__/__init__.cpython-310.pyc new file mode 100644 index 0000000000000000000000000000000000000000..d3a5141c769c9a4b3af595e3af4992151eb1a546 Binary files /dev/null and b/fairseq/examples/simultaneous_translation/models/__pycache__/__init__.cpython-310.pyc differ diff --git a/fairseq/examples/simultaneous_translation/models/__pycache__/convtransformer_simul_trans.cpython-310.pyc b/fairseq/examples/simultaneous_translation/models/__pycache__/convtransformer_simul_trans.cpython-310.pyc new file mode 100644 index 0000000000000000000000000000000000000000..4008a8261c70c43c116531e9a29116a6cb89ed13 Binary files /dev/null and b/fairseq/examples/simultaneous_translation/models/__pycache__/convtransformer_simul_trans.cpython-310.pyc differ diff --git a/fairseq/examples/simultaneous_translation/models/__pycache__/transformer_monotonic_attention.cpython-310.pyc b/fairseq/examples/simultaneous_translation/models/__pycache__/transformer_monotonic_attention.cpython-310.pyc new file mode 100644 index 0000000000000000000000000000000000000000..3bc8eb9dc9841785ff6c70d92cb831dd9ad2be05 Binary files /dev/null and b/fairseq/examples/simultaneous_translation/models/__pycache__/transformer_monotonic_attention.cpython-310.pyc differ diff --git a/fairseq/examples/simultaneous_translation/models/convtransformer_simul_trans.py b/fairseq/examples/simultaneous_translation/models/convtransformer_simul_trans.py new file mode 100644 index 0000000000000000000000000000000000000000..4a26422f650cf13ee7d4e8d2228b50ec49876fb8 --- /dev/null +++ b/fairseq/examples/simultaneous_translation/models/convtransformer_simul_trans.py @@ -0,0 +1,204 @@ +# Copyright (c) 2017-present, Facebook, Inc. +# All rights reserved. +# +# This source code is licensed under the license found in the LICENSE file in +# the root directory of this source tree. An additional grant of patent rights +# can be found in the PATENTS file in the same directory. 
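+#
+# This module registers the simultaneous speech-to-text ConvTransformer variants
+# used in these examples ("convtransformer_simul_trans",
+# "convtransformer_augmented_memory" and "convtransformer_emformer"). Each pairs
+# a ConvTransformer-style speech encoder (plain, augmented-memory, or
+# Emformer-based) with the monotonic-attention TransformerMonotonicDecoder for
+# simultaneous decoding.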
+ +from fairseq import checkpoint_utils +from fairseq.models import ( + register_model, + register_model_architecture, +) +from fairseq.models.speech_to_text import ( + ConvTransformerModel, + convtransformer_espnet, + ConvTransformerEncoder, +) +from fairseq.models.speech_to_text.modules.augmented_memory_attention import ( + augmented_memory, + SequenceEncoder, + AugmentedMemoryConvTransformerEncoder, +) + +from torch import nn, Tensor +from typing import Dict, List +from fairseq.models.speech_to_text.modules.emformer import NoSegAugmentedMemoryTransformerEncoderLayer + +@register_model("convtransformer_simul_trans") +class SimulConvTransformerModel(ConvTransformerModel): + """ + Implementation of the paper: + + SimulMT to SimulST: Adapting Simultaneous Text Translation to + End-to-End Simultaneous Speech Translation + + https://www.aclweb.org/anthology/2020.aacl-main.58.pdf + """ + + @staticmethod + def add_args(parser): + super(SimulConvTransformerModel, SimulConvTransformerModel).add_args(parser) + parser.add_argument( + "--train-monotonic-only", + action="store_true", + default=False, + help="Only train monotonic attention", + ) + + @classmethod + def build_decoder(cls, args, task, embed_tokens): + tgt_dict = task.tgt_dict + + from examples.simultaneous_translation.models.transformer_monotonic_attention import ( + TransformerMonotonicDecoder, + ) + + decoder = TransformerMonotonicDecoder(args, tgt_dict, embed_tokens) + + if getattr(args, "load_pretrained_decoder_from", None): + decoder = checkpoint_utils.load_pretrained_component_from_model( + component=decoder, checkpoint=args.load_pretrained_decoder_from + ) + return decoder + + +@register_model_architecture( + "convtransformer_simul_trans", "convtransformer_simul_trans_espnet" +) +def convtransformer_simul_trans_espnet(args): + convtransformer_espnet(args) + + +@register_model("convtransformer_augmented_memory") +@augmented_memory +class AugmentedMemoryConvTransformerModel(SimulConvTransformerModel): + @classmethod + def build_encoder(cls, args): + encoder = SequenceEncoder(args, AugmentedMemoryConvTransformerEncoder(args)) + + if getattr(args, "load_pretrained_encoder_from", None) is not None: + encoder = checkpoint_utils.load_pretrained_component_from_model( + component=encoder, checkpoint=args.load_pretrained_encoder_from + ) + + return encoder + + +@register_model_architecture( + "convtransformer_augmented_memory", "convtransformer_augmented_memory" +) +def augmented_memory_convtransformer_espnet(args): + convtransformer_espnet(args) + + +# ============================================================================ # +# Convtransformer +# with monotonic attention decoder +# with emformer encoder +# ============================================================================ # + + +class ConvTransformerEmformerEncoder(ConvTransformerEncoder): + def __init__(self, args): + super().__init__(args) + stride = self.conv_layer_stride(args) + trf_left_context = args.segment_left_context // stride + trf_right_context = args.segment_right_context // stride + context_config = [trf_left_context, trf_right_context] + self.transformer_layers = nn.ModuleList( + [ + NoSegAugmentedMemoryTransformerEncoderLayer( + input_dim=args.encoder_embed_dim, + num_heads=args.encoder_attention_heads, + ffn_dim=args.encoder_ffn_embed_dim, + num_layers=args.encoder_layers, + dropout_in_attn=args.dropout, + dropout_on_attn=args.dropout, + dropout_on_fc1=args.dropout, + dropout_on_fc2=args.dropout, + activation_fn=args.activation_fn, + 
context_config=context_config, + segment_size=args.segment_length, + max_memory_size=args.max_memory_size, + scaled_init=True, # TODO: use constant for now. + tanh_on_mem=args.amtrf_tanh_on_mem, + ) + ] + ) + self.conv_transformer_encoder = ConvTransformerEncoder(args) + + def forward(self, src_tokens, src_lengths): + encoder_out: Dict[str, List[Tensor]] = self.conv_transformer_encoder(src_tokens, src_lengths.to(src_tokens.device)) + output = encoder_out["encoder_out"][0] + encoder_padding_masks = encoder_out["encoder_padding_mask"] + + return { + "encoder_out": [output], + # This is because that in the original implementation + # the output didn't consider the last segment as right context. + "encoder_padding_mask": [encoder_padding_masks[0][:, : output.size(0)]] if len(encoder_padding_masks) > 0 + else [], + "encoder_embedding": [], + "encoder_states": [], + "src_tokens": [], + "src_lengths": [], + } + + @staticmethod + def conv_layer_stride(args): + # TODO: make it configurable from the args + return 4 + + +@register_model("convtransformer_emformer") +class ConvtransformerEmformer(SimulConvTransformerModel): + @staticmethod + def add_args(parser): + super(ConvtransformerEmformer, ConvtransformerEmformer).add_args(parser) + + parser.add_argument( + "--segment-length", + type=int, + metavar="N", + help="length of each segment (not including left context / right context)", + ) + parser.add_argument( + "--segment-left-context", + type=int, + help="length of left context in a segment", + ) + parser.add_argument( + "--segment-right-context", + type=int, + help="length of right context in a segment", + ) + parser.add_argument( + "--max-memory-size", + type=int, + default=-1, + help="Right context for the segment.", + ) + parser.add_argument( + "--amtrf-tanh-on-mem", + default=False, + action="store_true", + help="whether to use tanh on memory vector", + ) + + @classmethod + def build_encoder(cls, args): + encoder = ConvTransformerEmformerEncoder(args) + if getattr(args, "load_pretrained_encoder_from", None): + encoder = checkpoint_utils.load_pretrained_component_from_model( + component=encoder, checkpoint=args.load_pretrained_encoder_from + ) + return encoder + + +@register_model_architecture( + "convtransformer_emformer", + "convtransformer_emformer", +) +def convtransformer_emformer_base(args): + convtransformer_espnet(args) diff --git a/fairseq/examples/simultaneous_translation/models/transformer_monotonic_attention.py b/fairseq/examples/simultaneous_translation/models/transformer_monotonic_attention.py new file mode 100644 index 0000000000000000000000000000000000000000..7b9414b0eb3b30c935478cd5b8a894168bd8cc98 --- /dev/null +++ b/fairseq/examples/simultaneous_translation/models/transformer_monotonic_attention.py @@ -0,0 +1,302 @@ +# Copyright (c) Facebook, Inc. and its affiliates. +# +# This source code is licensed under the MIT license found in the +# LICENSE file in the root directory of this source tree. 
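+#
+# This module registers the text-to-text monotonic-attention transformer models
+# ("transformer_monotonic", "transformer_unidirectional") and their named
+# architectures. During simultaneous decoding the monotonic decoder returns a
+# TransformerMonotonicDecoderOut whose `action` field is READ_ACTION (0) when
+# more source tokens are needed and WRITE_ACTION (1) when a target token can be
+# emitted.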
+ +from typing import Dict, List, NamedTuple, Optional + +import torch +import torch.nn as nn +from examples.simultaneous_translation.modules.monotonic_transformer_layer import ( + TransformerMonotonicDecoderLayer, + TransformerMonotonicEncoderLayer, +) +from fairseq.models import ( + register_model, + register_model_architecture, +) +from fairseq.models.transformer import ( + TransformerModel, + TransformerEncoder, + TransformerDecoder, + base_architecture, + transformer_iwslt_de_en, + transformer_vaswani_wmt_en_de_big, + tiny_architecture +) +from torch import Tensor + +DEFAULT_MAX_SOURCE_POSITIONS = 1024 +DEFAULT_MAX_TARGET_POSITIONS = 1024 +READ_ACTION = 0 +WRITE_ACTION = 1 + +TransformerMonotonicDecoderOut = NamedTuple( + "TransformerMonotonicDecoderOut", + [ + ("action", int), + ("p_choose", Optional[Tensor]), + ("attn_list", Optional[List[Optional[Dict[str, Tensor]]]]), + ("encoder_out", Optional[Dict[str, List[Tensor]]]), + ("encoder_padding_mask", Optional[Tensor]), + ], +) + + +@register_model("transformer_unidirectional") +class TransformerUnidirectionalModel(TransformerModel): + @classmethod + def build_encoder(cls, args, src_dict, embed_tokens): + return TransformerMonotonicEncoder(args, src_dict, embed_tokens) + + +@register_model("transformer_monotonic") +class TransformerModelSimulTrans(TransformerModel): + @classmethod + def build_encoder(cls, args, src_dict, embed_tokens): + return TransformerMonotonicEncoder(args, src_dict, embed_tokens) + + @classmethod + def build_decoder(cls, args, tgt_dict, embed_tokens): + return TransformerMonotonicDecoder(args, tgt_dict, embed_tokens) + + +class TransformerMonotonicEncoder(TransformerEncoder): + def __init__(self, args, dictionary, embed_tokens): + super().__init__(args, dictionary, embed_tokens) + + self.dictionary = dictionary + self.layers = nn.ModuleList([]) + self.layers.extend( + [ + TransformerMonotonicEncoderLayer(args) + for i in range(args.encoder_layers) + ] + ) + + +class TransformerMonotonicDecoder(TransformerDecoder): + """ + Transformer decoder consisting of *args.decoder_layers* layers. Each layer + is a :class:`TransformerDecoderLayer`. + + Args: + args (argparse.Namespace): parsed command-line arguments + dictionary (~fairseq.data.Dictionary): decoding dictionary + embed_tokens (torch.nn.Embedding): output embedding + no_encoder_attn (bool, optional): whether to attend to encoder outputs + (default: False). 
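+
+    In addition to the usual decoder features, *extract_features* returns a
+    TransformerMonotonicDecoderOut whose ``action`` field tells the caller
+    whether to read another source token (0) or write a target token (1).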
+ """ + + def __init__(self, args, dictionary, embed_tokens, no_encoder_attn=False): + super().__init__(args, dictionary, embed_tokens, no_encoder_attn=False) + + self.dictionary = dictionary + self.layers = nn.ModuleList([]) + self.layers.extend( + [ + TransformerMonotonicDecoderLayer(args) + for _ in range(args.decoder_layers) + ] + ) + self.policy_criterion = getattr(args, "policy_criterion", "any") + self.num_updates = None + + def set_num_updates(self, num_updates): + self.num_updates = num_updates + + def pre_attention( + self, + prev_output_tokens, + encoder_out_dict: Dict[str, List[Tensor]], + incremental_state: Optional[Dict[str, Dict[str, Optional[Tensor]]]] = None, + ): + positions = ( + self.embed_positions( + prev_output_tokens, + incremental_state=incremental_state, + ) + if self.embed_positions is not None + else None + ) + + if incremental_state is not None: + prev_output_tokens = prev_output_tokens[:, -1:] + if positions is not None: + positions = positions[:, -1:] + # embed tokens and positions + x = self.embed_scale * self.embed_tokens(prev_output_tokens) + + if self.project_in_dim is not None: + x = self.project_in_dim(x) + + if positions is not None: + x += positions + + x = self.dropout_module(x) + + # B x T x C -> T x B x C + x = x.transpose(0, 1) + + encoder_out = encoder_out_dict["encoder_out"][0] + + if "encoder_padding_mask" in encoder_out_dict: + encoder_padding_mask = ( + encoder_out_dict["encoder_padding_mask"][0] + if encoder_out_dict["encoder_padding_mask"] + and len(encoder_out_dict["encoder_padding_mask"]) > 0 + else None + ) + else: + encoder_padding_mask = None + + return x, encoder_out, encoder_padding_mask + + def post_attention(self, x): + if self.layer_norm is not None: + x = self.layer_norm(x) + + # T x B x C -> B x T x C + x = x.transpose(0, 1) + + if self.project_out_dim is not None: + x = self.project_out_dim(x) + + return x + + def clean_cache( + self, + incremental_state: Optional[Dict[str, Dict[str, Optional[Tensor]]]], + end_id: Optional[int] = None, + ): + """ + Clean cache in the monotonic layers. + The cache is generated because of a forward pass of decoder has run but no prediction, + so that the self attention key value in decoder is written in the incremental state. + end_id is the last idx of the layers + """ + if end_id is None: + end_id = len(self.layers) + + for index, layer in enumerate(self.layers): + if index < end_id: + layer.prune_incremental_state(incremental_state) + + def extract_features( + self, + prev_output_tokens, + encoder_out: Optional[Dict[str, List[Tensor]]], + incremental_state: Optional[Dict[str, Dict[str, Optional[Tensor]]]] = None, + full_context_alignment: bool = False, # unused + alignment_layer: Optional[int] = None, # unused + alignment_heads: Optional[int] = None, # unsed + ): + """ + Similar to *forward* but only return features. 
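+
+        During incremental (simultaneous) decoding, the method may return early
+        with ``action=0`` when, under the "any" policy, at least one monotonic
+        head decides to read more source tokens; the self-attention cache
+        written by this partial forward pass is pruned via *clean_cache*.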
+ + Returns: + tuple: + - the decoder's features of shape `(batch, tgt_len, embed_dim)` + - a dictionary with any model-specific outputs + """ + # incremental_state = None + assert encoder_out is not None + (x, encoder_outs, encoder_padding_mask) = self.pre_attention( + prev_output_tokens, encoder_out, incremental_state + ) + attn = None + inner_states = [x] + attn_list: List[Optional[Dict[str, Tensor]]] = [] + + p_choose = torch.tensor([1.0]) + + for i, layer in enumerate(self.layers): + + x, attn, _ = layer( + x=x, + encoder_out=encoder_outs, + encoder_padding_mask=encoder_padding_mask, + incremental_state=incremental_state, + self_attn_mask=self.buffered_future_mask(x) + if incremental_state is None + else None, + ) + + inner_states.append(x) + attn_list.append(attn) + + if incremental_state is not None: + if_online = incremental_state["online"]["only"] + assert if_online is not None + if if_online.to(torch.bool): + # Online indicates that the encoder states are still changing + assert attn is not None + if self.policy_criterion == "any": + # Any head decide to read than read + head_read = layer.encoder_attn._get_monotonic_buffer(incremental_state)["head_read"] + assert head_read is not None + if head_read.any(): + # We need to prune the last self_attn saved_state + # if model decide not to read + # otherwise there will be duplicated saved_state + self.clean_cache(incremental_state, i + 1) + + return x, TransformerMonotonicDecoderOut( + action=0, + p_choose=p_choose, + attn_list=None, + encoder_out=None, + encoder_padding_mask=None, + ) + + x = self.post_attention(x) + + return x, TransformerMonotonicDecoderOut( + action=1, + p_choose=p_choose, + attn_list=attn_list, + encoder_out=encoder_out, + encoder_padding_mask=encoder_padding_mask, + ) + + +@register_model_architecture("transformer_monotonic", "transformer_monotonic") +def base_monotonic_architecture(args): + base_architecture(args) + args.encoder_unidirectional = getattr(args, "encoder_unidirectional", False) + + +@register_model_architecture( + "transformer_monotonic", "transformer_monotonic_iwslt_de_en" +) +def transformer_monotonic_iwslt_de_en(args): + transformer_iwslt_de_en(args) + base_monotonic_architecture(args) + + +# parameters used in the "Attention Is All You Need" paper (Vaswani et al., 2017) +@register_model_architecture( + "transformer_monotonic", "transformer_monotonic_vaswani_wmt_en_de_big" +) +def transformer_monotonic_vaswani_wmt_en_de_big(args): + transformer_vaswani_wmt_en_de_big(args) + + +@register_model_architecture( + "transformer_monotonic", "transformer_monotonic_vaswani_wmt_en_fr_big" +) +def transformer_monotonic_vaswani_wmt_en_fr_big(args): + transformer_monotonic_vaswani_wmt_en_fr_big(args) + + +@register_model_architecture( + "transformer_unidirectional", "transformer_unidirectional_iwslt_de_en" +) +def transformer_unidirectional_iwslt_de_en(args): + transformer_iwslt_de_en(args) + + +@register_model_architecture("transformer_monotonic", "transformer_monotonic_tiny") +def monotonic_tiny_architecture(args): + tiny_architecture(args) + base_monotonic_architecture(args) diff --git a/fairseq/examples/simultaneous_translation/modules/__init__.py b/fairseq/examples/simultaneous_translation/modules/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..f5ea180f9b4cdb27cd553439b6df9d743105f18c --- /dev/null +++ b/fairseq/examples/simultaneous_translation/modules/__init__.py @@ -0,0 +1,23 @@ +# Copyright (c) Facebook, Inc. and its affiliates. 
+# +# This source code is licensed under the MIT license found in the +# LICENSE file in the root directory of this source tree. + + +import os +import importlib +from fairseq import registry + +( + build_monotonic_attention, + register_monotonic_attention, + MONOTONIC_ATTENTION_REGISTRY, + _, +) = registry.setup_registry("--simul-type") + +for file in sorted(os.listdir(os.path.dirname(__file__))): + if file.endswith(".py") and not file.startswith("_"): + model_name = file[: file.find(".py")] + importlib.import_module( + "examples.simultaneous_translation.modules." + model_name + ) diff --git a/fairseq/examples/simultaneous_translation/modules/__pycache__/__init__.cpython-310.pyc b/fairseq/examples/simultaneous_translation/modules/__pycache__/__init__.cpython-310.pyc new file mode 100644 index 0000000000000000000000000000000000000000..38f2a379101d990104e7cc27ee7ab0ecc7ad3ce6 Binary files /dev/null and b/fairseq/examples/simultaneous_translation/modules/__pycache__/__init__.cpython-310.pyc differ diff --git a/fairseq/examples/simultaneous_translation/modules/__pycache__/fixed_pre_decision.cpython-310.pyc b/fairseq/examples/simultaneous_translation/modules/__pycache__/fixed_pre_decision.cpython-310.pyc new file mode 100644 index 0000000000000000000000000000000000000000..808b2c4d2fd801bdc2fadaa03854963a61ffbcda Binary files /dev/null and b/fairseq/examples/simultaneous_translation/modules/__pycache__/fixed_pre_decision.cpython-310.pyc differ diff --git a/fairseq/examples/simultaneous_translation/modules/__pycache__/monotonic_multihead_attention.cpython-310.pyc b/fairseq/examples/simultaneous_translation/modules/__pycache__/monotonic_multihead_attention.cpython-310.pyc new file mode 100644 index 0000000000000000000000000000000000000000..479fd0a3200f064749e4a3257f46b11308dfa26e Binary files /dev/null and b/fairseq/examples/simultaneous_translation/modules/__pycache__/monotonic_multihead_attention.cpython-310.pyc differ diff --git a/fairseq/examples/simultaneous_translation/modules/__pycache__/monotonic_transformer_layer.cpython-310.pyc b/fairseq/examples/simultaneous_translation/modules/__pycache__/monotonic_transformer_layer.cpython-310.pyc new file mode 100644 index 0000000000000000000000000000000000000000..afbdbe9d4d4a5d4ab82c4b7fbae49432135fcd69 Binary files /dev/null and b/fairseq/examples/simultaneous_translation/modules/__pycache__/monotonic_transformer_layer.cpython-310.pyc differ diff --git a/fairseq/examples/simultaneous_translation/modules/fixed_pre_decision.py b/fairseq/examples/simultaneous_translation/modules/fixed_pre_decision.py new file mode 100644 index 0000000000000000000000000000000000000000..3991414aed3800f301e4097e819d3064bb549c37 --- /dev/null +++ b/fairseq/examples/simultaneous_translation/modules/fixed_pre_decision.py @@ -0,0 +1,190 @@ +from functools import partial + +import torch +from torch import Tensor +import math +import torch.nn.functional as F + +from . 
import register_monotonic_attention +from .monotonic_multihead_attention import ( + MonotonicAttention, + MonotonicInfiniteLookbackAttention, + WaitKAttention +) +from typing import Dict, Optional + + +def fixed_pooling_monotonic_attention(monotonic_attention): + def create_model(monotonic_attention, klass): + class FixedStrideMonotonicAttention(monotonic_attention): + def __init__(self, args): + self.waitk_lagging = 0 + self.num_heads = 0 + self.noise_mean = 0.0 + self.noise_var = 0.0 + super().__init__(args) + self.pre_decision_type = args.fixed_pre_decision_type + self.pre_decision_ratio = args.fixed_pre_decision_ratio + self.pre_decision_pad_threshold = args.fixed_pre_decision_pad_threshold + assert self.pre_decision_ratio > 1 + + if args.fixed_pre_decision_type == "average": + self.pooling_layer = torch.nn.AvgPool1d( + kernel_size=self.pre_decision_ratio, + stride=self.pre_decision_ratio, + ceil_mode=True, + ) + elif args.fixed_pre_decision_type == "last": + + def last(key): + if key.size(2) < self.pre_decision_ratio: + return key + else: + k = key[ + :, + :, + self.pre_decision_ratio - 1:: self.pre_decision_ratio, + ].contiguous() + if key.size(-1) % self.pre_decision_ratio != 0: + k = torch.cat([k, key[:, :, -1:]], dim=-1).contiguous() + return k + + self.pooling_layer = last + else: + raise NotImplementedError + + @staticmethod + def add_args(parser): + super( + FixedStrideMonotonicAttention, FixedStrideMonotonicAttention + ).add_args(parser) + parser.add_argument( + "--fixed-pre-decision-ratio", + type=int, + required=True, + help=( + "Ratio for the fixed pre-decision," + "indicating how many encoder steps will start" + "simultaneous decision making process." + ), + ) + parser.add_argument( + "--fixed-pre-decision-type", + default="average", + choices=["average", "last"], + help="Pooling type", + ) + parser.add_argument( + "--fixed-pre-decision-pad-threshold", + type=float, + default=0.3, + help="If a part of the sequence has pad" + ",the threshold the pooled part is a pad.", + ) + + def insert_zeros(self, x): + bsz_num_heads, tgt_len, src_len = x.size() + stride = self.pre_decision_ratio + weight = F.pad(torch.ones(1, 1, 1).to(x), (stride - 1, 0)) + x_upsample = F.conv_transpose1d( + x.view(-1, src_len).unsqueeze(1), + weight, + stride=stride, + padding=0, + ) + return x_upsample.squeeze(1).view(bsz_num_heads, tgt_len, -1) + + def p_choose( + self, + query: Optional[Tensor], + key: Optional[Tensor], + key_padding_mask: Optional[Tensor] = None, + incremental_state: Optional[Dict[str, Dict[str, Optional[Tensor]]]] = None, + ): + assert key is not None + assert query is not None + src_len = key.size(0) + tgt_len = query.size(0) + batch_size = query.size(1) + + key_pool = self.pooling_layer(key.transpose(0, 2)).transpose(0, 2) + + if key_padding_mask is not None: + key_padding_mask_pool = ( + self.pooling_layer(key_padding_mask.unsqueeze(0).float()) + .squeeze(0) + .gt(self.pre_decision_pad_threshold) + ) + # Make sure at least one element is not pad + key_padding_mask_pool[:, 0] = 0 + else: + key_padding_mask_pool = None + + if incremental_state is not None: + # The floor instead of ceil is used for inference + # But make sure the length key_pool at least 1 + if ( + max(1, math.floor(key.size(0) / self.pre_decision_ratio)) + ) < key_pool.size(0): + key_pool = key_pool[:-1] + if key_padding_mask_pool is not None: + key_padding_mask_pool = key_padding_mask_pool[:-1] + + p_choose_pooled = self.p_choose_from_qk( + query, + key_pool, + key_padding_mask_pool, + 
incremental_state=incremental_state, + ) + + # Upsample, interpolate zeros + p_choose = self.insert_zeros(p_choose_pooled) + + if p_choose.size(-1) < src_len: + # Append zeros if the upsampled p_choose is shorter than src_len + p_choose = torch.cat( + [ + p_choose, + torch.zeros( + p_choose.size(0), + tgt_len, + src_len - p_choose.size(-1) + ).to(p_choose) + ], + dim=2 + ) + else: + # can be larger than src_len because we used ceil before + p_choose = p_choose[:, :, :src_len] + p_choose[:, :, -1] = p_choose_pooled[:, :, -1] + + assert list(p_choose.size()) == [ + batch_size * self.num_heads, + tgt_len, + src_len, + ] + + return p_choose + + FixedStrideMonotonicAttention.__name__ = klass.__name__ + return FixedStrideMonotonicAttention + + return partial(create_model, monotonic_attention) + + +@register_monotonic_attention("waitk_fixed_pre_decision") +@fixed_pooling_monotonic_attention(WaitKAttention) +class WaitKAttentionFixedStride: + pass + + +@register_monotonic_attention("hard_aligned_fixed_pre_decision") +@fixed_pooling_monotonic_attention(MonotonicAttention) +class MonotonicAttentionFixedStride: + pass + + +@register_monotonic_attention("infinite_lookback_fixed_pre_decision") +@fixed_pooling_monotonic_attention(MonotonicInfiniteLookbackAttention) +class MonotonicInfiniteLookbackAttentionFixedStride: + pass diff --git a/fairseq/examples/simultaneous_translation/modules/monotonic_multihead_attention.py b/fairseq/examples/simultaneous_translation/modules/monotonic_multihead_attention.py new file mode 100644 index 0000000000000000000000000000000000000000..06d20d8d4acb8d674c2bd9e839a6560e6b33d28b --- /dev/null +++ b/fairseq/examples/simultaneous_translation/modules/monotonic_multihead_attention.py @@ -0,0 +1,520 @@ +# Copyright (c) Facebook, Inc. and its affiliates. +# +# This source code is licensed under the MIT license found in the +# LICENSE file in the root directory of this source tree. + +import math + +import torch +from torch import Tensor +import torch.nn as nn + +from examples.simultaneous_translation.utils.p_choose_strategy import ( + learnable_p_choose, + waitk_p_choose +) + +from examples.simultaneous_translation.utils.monotonic_attention import ( + expected_alignment_from_p_choose, + expected_soft_attention, + mass_preservation, +) +from fairseq.modules import MultiheadAttention + +from . 
import register_monotonic_attention +from typing import Dict, Optional + + +@register_monotonic_attention("hard_aligned") +class MonotonicAttention(MultiheadAttention): + """ + Abstract class of monotonic attentions + """ + k_in_proj: Dict[str, nn.Linear] + q_in_proj: Dict[str, nn.Linear] + + def __init__(self, args): + super().__init__( + embed_dim=args.decoder_embed_dim, + num_heads=args.decoder_attention_heads, + kdim=getattr(args, "encoder_embed_dim", None), + vdim=getattr(args, "encoder_embed_dim", None), + dropout=args.attention_dropout, + encoder_decoder_attention=True, + ) + + self.soft_attention = False + + self.eps = getattr(args, "attention_eps", True) + self.mass_preservation = getattr(args, "mass_preservation", True) + + self.noise_type = args.noise_type + self.noise_mean = args.noise_mean + self.noise_var = args.noise_var + + self.energy_bias_init = args.energy_bias_init + self.energy_bias = ( + nn.Parameter(self.energy_bias_init * torch.ones([1])) + if args.energy_bias is True + else 0 + ) + + self.k_in_proj = {"monotonic": self.k_proj} + self.q_in_proj = {"monotonic": self.q_proj} + self.chunk_size = None + + @staticmethod + def add_args(parser): + # fmt: off + parser.add_argument('--no-mass-preservation', action="store_false", + dest="mass_preservation", + help='Do not stay on the last token when decoding') + parser.add_argument('--mass-preservation', action="store_true", + dest="mass_preservation", + help='Stay on the last token when decoding') + parser.set_defaults(mass_preservation=True) + parser.add_argument('--noise-var', type=float, default=1.0, + help='Variance of discretness noise') + parser.add_argument('--noise-mean', type=float, default=0.0, + help='Mean of discretness noise') + parser.add_argument('--noise-type', type=str, default="flat", + help='Type of discretness noise') + parser.add_argument('--energy-bias', action="store_true", + default=False, + help='Bias for energy') + parser.add_argument('--energy-bias-init', type=float, default=-2.0, + help='Initial value of the bias for energy') + parser.add_argument('--attention-eps', type=float, default=1e-6, + help='Epsilon when calculating expected attention') + + def energy_from_qk( + self, + query: Tensor, + key: Tensor, + energy_type: str, + key_padding_mask: Optional[Tensor] = None, + bias: int = 0 + ): + """ + Compute energy from query and key + q_func_value is a tuple looks like + (q_proj_func, q_tensor) + q_tensor size: bsz, tgt_len, emb_dim + k_tensor size: bsz, src_len, emb_dim + key_padding_mask size: bsz, src_len + attn_mask: bsz, src_len + """ + + length, bsz, _ = query.size() + q = self.q_in_proj[energy_type].forward(query) + q = ( + q.contiguous() + .view(length, bsz * self.num_heads, self.head_dim) + .transpose(0, 1) + ) + q = q * self.scaling + length, bsz, _ = key.size() + k = self.k_in_proj[energy_type].forward(key) + k = ( + k.contiguous() + .view(length, bsz * self.num_heads, self.head_dim) + .transpose(0, 1) + ) + + energy = torch.bmm(q, k.transpose(1, 2)) + bias + + if key_padding_mask is not None: + energy = energy.masked_fill( + key_padding_mask.unsqueeze(1).to(torch.bool), + - float("inf") + ) + + return energy + + def p_choose_from_qk(self, query, key, key_padding_mask, incremental_states=None): + monotonic_energy = self.energy_from_qk( + query, + key, + "monotonic", + key_padding_mask=key_padding_mask, + bias=self.energy_bias, + ) + + p_choose = learnable_p_choose( + monotonic_energy, + self.noise_mean, + self.noise_var, + self.training + ) + return p_choose + + def p_choose(self, 
query, key, key_padding_mask, incremental_states=None): + return self.p_choose_from_qk(self, query, key, key_padding_mask) + + def monotonic_attention_process_infer( + self, + query: Optional[Tensor], + key: Optional[Tensor], + incremental_state: Optional[Dict[str, Dict[str, Optional[Tensor]]]], + ): + """ + Monotonic attention at inference time + Notice that this function is designed for simuleval not sequence_generator + """ + assert query is not None + assert key is not None + + if query.size(1) != 1: + raise RuntimeError( + "Simultaneous translation models don't support batch decoding." + ) + # 1. compute stepwise probability + p_choose = self.p_choose( + query, key, None, incremental_state + ).squeeze(1) + + # 2. Compute the alpha + src_len = key.size(0) + # Maximum steps allows in this iteration + max_steps = src_len - 1 if self.mass_preservation else src_len + monotonic_cache = self._get_monotonic_buffer(incremental_state) + # Step for each head + monotonic_step = monotonic_cache.get( + 'head_step', + p_choose.new_zeros(1, self.num_heads).long() + ) + assert monotonic_step is not None + finish_read = monotonic_step.eq(max_steps) + p_choose_i = torch.tensor(1) + + while finish_read.sum().item() < self.num_heads: + # p_choose: self.num_heads, src_len + # only choose the p at monotonic steps + # p_choose_i: 1, self.num_heads + p_choose_i = ( + p_choose.gather( + 1, + monotonic_step + .clamp(0, src_len - 1), + ) + ) + + read_one_step = ( + (p_choose_i < 0.5) + .type_as(monotonic_step) + .masked_fill(finish_read, 0) + ) + # 1 x bsz + # sample actions on unfinished seq + # 0 means stay, finish reading + # 1 means leave, continue reading + + monotonic_step += read_one_step + + finish_read = monotonic_step.eq(max_steps) | (read_one_step == 0) + + # p_choose at last steps + p_choose_i = ( + p_choose.gather( + 1, + monotonic_step + .clamp(0, src_len - 1), + ) + ) + + monotonic_cache["head_step"] = monotonic_step + # Whether a head is looking for new input + monotonic_cache["head_read"] = ( + monotonic_step.eq(max_steps) & (p_choose_i < 0.5) + ) + self._set_monotonic_buffer(incremental_state, monotonic_cache) + + # 2. Update alpha + alpha = ( + p_choose + .new_zeros([self.num_heads, src_len]) + .scatter( + 1, + (monotonic_step) + .view(self.num_heads, 1).clamp(0, src_len - 1), + 1 + ) + ) + + if not self.mass_preservation: + alpha = alpha.masked_fill( + (monotonic_step == max_steps) + .view(self.num_heads, 1), + 0 + ) + + # 4. Compute Beta + if self.soft_attention: + monotonic_step = monotonic_step.t() + beta_mask = torch.arange(src_len).expand_as(alpha).gt(monotonic_step).unsqueeze(1) + # If it's soft attention just do softmax on current context + soft_energy = self.energy_from_qk( + query, + key, + "soft" + ) + beta = torch.nn.functional.softmax( + soft_energy.masked_fill(beta_mask, -float("inf")), dim=-1 + ) + # It could happen that a head doesn't move at all + beta = beta.masked_fill(monotonic_step.eq(0).unsqueeze(1), 0) + else: + # If it's hard attention just select the last state + beta = alpha + + return p_choose, alpha, beta + + def monotonic_attention_process_train( + self, + query: Optional[Tensor], + key: Optional[Tensor], + key_padding_mask: Optional[Tensor] = None, + ): + """ + Calculating monotonic attention process for training + Including: + stepwise probability: p_choose + expected hard alignment: alpha + expected soft attention: beta + """ + assert query is not None + assert key is not None + + # 1. 
compute stepwise probability + p_choose = self.p_choose_from_qk(query, key, key_padding_mask) + + # 2. compute expected_alignment + alpha = expected_alignment_from_p_choose( + p_choose, + key_padding_mask, + eps=self.eps, + ) + + if self.mass_preservation: + alpha = mass_preservation( + alpha, key_padding_mask + ) + + # 3. compute expected soft attention (soft aligned model only) + if self.soft_attention: + soft_energy = self.energy_from_qk( + query, + key, + "soft", + key_padding_mask=None, + ) + + beta = expected_soft_attention( + alpha, + soft_energy, + padding_mask=key_padding_mask, + chunk_size=self.chunk_size, + eps=self.eps, + ) + else: + beta = alpha + soft_energy = alpha + + return p_choose, alpha, beta, soft_energy + + def forward( + self, + query: Optional[Tensor], + key: Optional[Tensor], + value: Optional[Tensor], + key_padding_mask: Optional[Tensor] = None, + attn_mask: Optional[Tensor] = None, + incremental_state: Optional[Dict[str, Dict[str, Optional[Tensor]]]] = None, + need_weights: bool = True, static_kv: bool = False, need_head_weights: bool = False, + ): + """ + query: tgt_len, bsz, embed_dim + key: src_len, bsz, embed_dim + value: src_len, bsz, embed_dim + """ + + assert attn_mask is None + assert query is not None + assert key is not None + assert value is not None + + tgt_len, bsz, embed_dim = query.size() + src_len = value.size(0) + + if key_padding_mask is not None: + assert not key_padding_mask[:, 0].any(), ( + "Only right padding is supported." + ) + key_padding_mask = ( + key_padding_mask + .unsqueeze(1) + .expand([bsz, self.num_heads, src_len]) + .contiguous() + .view(-1, src_len) + ) + + if incremental_state is not None: + # Inference + ( + p_choose, alpha, beta + ) = self.monotonic_attention_process_infer( + query, key, incremental_state + ) + soft_energy = beta + else: + # Train + ( + p_choose, alpha, beta, soft_energy + ) = self.monotonic_attention_process_train( + query, key, key_padding_mask + ) + + v = self.v_proj(value) + length, bsz, _ = v.size() + v = ( + v.contiguous() + .view(length, bsz * self.num_heads, self.head_dim) + .transpose(0, 1) + ) + + attn = torch.bmm(beta.type_as(v), v) + + attn = attn.transpose(0, 1).contiguous().view(tgt_len, bsz, embed_dim) + + attn = self.out_proj(attn) + + p_choose = p_choose.view(bsz, self.num_heads, tgt_len, src_len) + alpha = alpha.view(bsz, self.num_heads, tgt_len, src_len) + beta = beta.view(bsz, self.num_heads, tgt_len, src_len) + + return attn, { + "p_choose": p_choose, + "alpha": alpha, + "beta": beta, + "soft_energy": soft_energy, + } + + def _get_monotonic_buffer(self, incremental_state: Optional[Dict[str, Dict[str, Optional[Tensor]]]]): + maybe_incremental_state = self.get_incremental_state( + incremental_state, + 'monotonic', + ) + if maybe_incremental_state is None: + typed_empty_dict: Dict[str, Optional[Tensor]] = {} + return typed_empty_dict + else: + return maybe_incremental_state + + def _set_monotonic_buffer(self, incremental_state: Optional[Dict[str, Dict[str, Optional[Tensor]]]], buffer: Dict[str, Optional[Tensor]]): + self.set_incremental_state( + incremental_state, + 'monotonic', + buffer, + ) + + +@register_monotonic_attention("infinite_lookback") +class MonotonicInfiniteLookbackAttention( + MonotonicAttention +): + def __init__(self, args): + super().__init__(args) + self.soft_attention = True + self.init_soft_attention() + + def init_soft_attention(self): + self.k_proj_soft = nn.Linear(self.kdim, self.embed_dim, bias=True) + self.q_proj_soft = nn.Linear(self.embed_dim, self.embed_dim, 
bias=True) + self.k_in_proj["soft"] = self.k_proj_soft + self.q_in_proj["soft"] = self.q_proj_soft + + if self.qkv_same_dim: + # Empirically observed the convergence to be much better with + # the scaled initialization + nn.init.xavier_uniform_( + self.k_in_proj["soft"].weight, gain=1 / math.sqrt(2) + ) + nn.init.xavier_uniform_( + self.q_in_proj["soft"].weight, gain=1 / math.sqrt(2) + ) + else: + nn.init.xavier_uniform_(self.k_in_proj["soft"].weight) + nn.init.xavier_uniform_(self.q_in_proj["soft"].weight) + + +@register_monotonic_attention("waitk") +class WaitKAttention( + MonotonicInfiniteLookbackAttention +): + """ + STACL: Simultaneous Translation with Implicit Anticipation and + Controllable Latency using Prefix-to-Prefix Framework + https://www.aclweb.org/anthology/P19-1289/ + """ + def __init__(self, args): + super().__init__(args) + self.q_in_proj["soft"] = self.q_in_proj["monotonic"] + self.k_in_proj["soft"] = self.k_in_proj["monotonic"] + + self.waitk_lagging = args.waitk_lagging + assert self.waitk_lagging > 0, ( + f"Lagging has to been larger than 0, get {self.waitk_lagging}." + ) + + @staticmethod + def add_args(parser): + super( + MonotonicInfiniteLookbackAttention, + MonotonicInfiniteLookbackAttention + ).add_args(parser) + + parser.add_argument( + "--waitk-lagging", type=int, required=True, help="Wait K lagging" + ) + + def p_choose_from_qk( + self, + query: Optional[Tensor], + key: Optional[Tensor], + key_padding_mask: Optional[Tensor] = None, + incremental_state: Optional[Dict[str, Dict[str, Optional[Tensor]]]] = None, + ): + assert query is not None + assert key is not None + + p_choose = waitk_p_choose( + tgt_len=query.size(0), + src_len=key.size(0), + bsz=query.size(1) * self.num_heads, + waitk_lagging=self.waitk_lagging, + key_padding_mask=key_padding_mask, + incremental_state=incremental_state, + ) + + return p_choose.to(query) + + +@register_monotonic_attention("chunkwise") +class ChunkwiseAttention( + MonotonicInfiniteLookbackAttention +): + def __init__(self, args): + super().__init__(args) + self.chunk_size = args.mocha_chunk_size + assert self.chunk_size > 1 + + @staticmethod + def add_args(parser): + super( + MonotonicInfiniteLookbackAttention + ).add_args(parser) + + parser.add_argument( + "--mocha-chunk-size", type=int, + required=True, help="Mocha chunk size" + ) diff --git a/fairseq/examples/simultaneous_translation/modules/monotonic_transformer_layer.py b/fairseq/examples/simultaneous_translation/modules/monotonic_transformer_layer.py new file mode 100644 index 0000000000000000000000000000000000000000..94bd71fb9c46a64a8b6e1960f47dfc43b78dda43 --- /dev/null +++ b/fairseq/examples/simultaneous_translation/modules/monotonic_transformer_layer.py @@ -0,0 +1,182 @@ +# Copyright (c) Facebook, Inc. and its affiliates. +# +# This source code is licensed under the MIT license found in the +# LICENSE file in the root directory of this source tree. + +from fairseq.modules import TransformerDecoderLayer, TransformerEncoderLayer + +from . 
import build_monotonic_attention + +from typing import Dict, Optional, List + +from torch import Tensor +import torch + + +class TransformerMonotonicEncoderLayer(TransformerEncoderLayer): + def forward(self, x, encoder_padding_mask): + seq_len, _, _ = x.size() + attn_mask = x.new_ones([seq_len, seq_len]).triu(1) + attn_mask = attn_mask.masked_fill(attn_mask.bool(), float("-inf")) + return super().forward(x, encoder_padding_mask, attn_mask) + + +class TransformerMonotonicDecoderLayer(TransformerDecoderLayer): + def __init__(self, args): + super().__init__(args) + + assert args.simul_type is not None, "A --simul-type is needed." + self.encoder_attn = build_monotonic_attention(args) + + def prune_incremental_state( + self, + incremental_state: Optional[Dict[str, Dict[str, Optional[Tensor]]]] + ): + input_buffer = self.self_attn._get_input_buffer(incremental_state) + for key in ["prev_key", "prev_value"]: + input_buffer_key = input_buffer[key] + assert input_buffer_key is not None + if input_buffer_key.size(2) > 1: + input_buffer[key] = input_buffer_key[:, :, :-1, :] + else: + typed_empty_dict: Dict[str, Optional[Tensor]] = {} + input_buffer = typed_empty_dict + break + assert incremental_state is not None + self.self_attn._set_input_buffer(incremental_state, input_buffer) + + def forward( + self, + x, + encoder_out: Optional[Tensor] = None, + encoder_padding_mask: Optional[Tensor] = None, + incremental_state: Optional[Dict[str, Dict[str, Optional[Tensor]]]] = None, + prev_self_attn_state: Optional[List[Tensor]] = None, + prev_attn_state: Optional[List[Tensor]] = None, + self_attn_mask: Optional[Tensor] = None, + self_attn_padding_mask: Optional[Tensor] = None, + need_attn: bool = False, + need_head_weights: bool = False, + ): + """ + Args: + x (Tensor): input to the layer of shape `(seq_len, batch, embed_dim)` + encoder_padding_mask (ByteTensor, optional): binary + ByteTensor of shape `(batch, src_len)` where padding + elements are indicated by ``1``. + need_attn (bool, optional): return attention weights + need_head_weights (bool, optional): return attention weights + for each head (default: return average over heads). 
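+
+        Note that in this layer ``encoder_attn`` is the monotonic attention
+        module selected by ``--simul-type`` (built via build_monotonic_attention)
+        rather than a standard cross-attention.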
+ + Returns: + encoded output of shape `(seq_len, batch, embed_dim)` + """ + if need_head_weights: + need_attn = True + + residual = x + if self.normalize_before: + x = self.self_attn_layer_norm(x) + if prev_self_attn_state is not None: + prev_key, prev_value = prev_self_attn_state[:2] + saved_state: Dict[str, Optional[Tensor]] = { + "prev_key": prev_key, + "prev_value": prev_value, + } + if len(prev_self_attn_state) >= 3: + saved_state["prev_key_padding_mask"] = prev_self_attn_state[2] + assert incremental_state is not None + self.self_attn._set_input_buffer(incremental_state, saved_state) + _self_attn_input_buffer = self.self_attn._get_input_buffer(incremental_state) + if self.cross_self_attention and not ( + incremental_state is not None + and _self_attn_input_buffer is not None + and "prev_key" in _self_attn_input_buffer + ): + if self_attn_mask is not None: + assert encoder_out is not None + self_attn_mask = torch.cat( + (x.new_zeros(x.size(0), encoder_out.size(0)), self_attn_mask), dim=1 + ) + if self_attn_padding_mask is not None: + if encoder_padding_mask is None: + assert encoder_out is not None + encoder_padding_mask = self_attn_padding_mask.new_zeros( + encoder_out.size(1), encoder_out.size(0) + ) + self_attn_padding_mask = torch.cat( + (encoder_padding_mask, self_attn_padding_mask), dim=1 + ) + assert encoder_out is not None + y = torch.cat((encoder_out, x), dim=0) + else: + y = x + + x, attn = self.self_attn( + query=x, + key=y, + value=y, + key_padding_mask=self_attn_padding_mask, + incremental_state=incremental_state, + need_weights=False, + attn_mask=self_attn_mask, + ) + x = self.dropout_module(x) + x = self.residual_connection(x, residual) + if not self.normalize_before: + x = self.self_attn_layer_norm(x) + + assert self.encoder_attn is not None + residual = x + if self.normalize_before: + x = self.encoder_attn_layer_norm(x) + if prev_attn_state is not None: + prev_key, prev_value = prev_attn_state[:2] + saved_state: Dict[str, Optional[Tensor]] = { + "prev_key": prev_key, + "prev_value": prev_value, + } + if len(prev_attn_state) >= 3: + saved_state["prev_key_padding_mask"] = prev_attn_state[2] + assert incremental_state is not None + self.encoder_attn._set_input_buffer(incremental_state, saved_state) + + x, attn = self.encoder_attn( + query=x, + key=encoder_out, + value=encoder_out, + key_padding_mask=encoder_padding_mask, + incremental_state=incremental_state, + static_kv=True, + need_weights=need_attn or (not self.training and self.need_attn), + need_head_weights=need_head_weights, + ) + x = self.dropout_module(x) + x = self.residual_connection(x, residual) + if not self.normalize_before: + x = self.encoder_attn_layer_norm(x) + + residual = x + if self.normalize_before: + x = self.final_layer_norm(x) + + x = self.activation_fn(self.fc1(x)) + x = self.activation_dropout_module(x) + x = self.fc2(x) + x = self.dropout_module(x) + x = self.residual_connection(x, residual) + if not self.normalize_before: + x = self.final_layer_norm(x) + if self.onnx_trace and incremental_state is not None: + saved_state = self.self_attn._get_input_buffer(incremental_state) + assert saved_state is not None + if self_attn_padding_mask is not None: + self_attn_state = [ + saved_state["prev_key"], + saved_state["prev_value"], + saved_state["prev_key_padding_mask"], + ] + else: + self_attn_state = [saved_state["prev_key"], saved_state["prev_value"]] + return x, attn, self_attn_state + return x, attn, None diff --git a/fairseq/examples/simultaneous_translation/tests/test_alignment_train.py 
b/fairseq/examples/simultaneous_translation/tests/test_alignment_train.py new file mode 100644 index 0000000000000000000000000000000000000000..2ad4ef1f6d064da05530b7d6ad32e034bac5582d --- /dev/null +++ b/fairseq/examples/simultaneous_translation/tests/test_alignment_train.py @@ -0,0 +1,88 @@ +import unittest + +import numpy as np +import torch + +import hypothesis.strategies as st +from hypothesis import assume, given, settings +from torch.testing._internal.common_utils import TestCase +from examples.simultaneous_translation.utils.functions import exclusive_cumprod + + +TEST_CUDA = torch.cuda.is_available() + + +class AlignmentTrainTest(TestCase): + def _test_custom_alignment_train_ref(self, p_choose, eps): + cumprod_1mp = exclusive_cumprod(1 - p_choose, dim=2, eps=eps) + cumprod_1mp_clamp = torch.clamp(cumprod_1mp, eps, 1.0) + + bsz = p_choose.size(0) + tgt_len = p_choose.size(1) + src_len = p_choose.size(2) + + alpha_0 = p_choose.new_zeros([bsz, 1, src_len]) + alpha_0[:, :, 0] = 1.0 + + previous_alpha = [alpha_0] + + for i in range(tgt_len): + # p_choose: bsz , tgt_len, src_len + # cumprod_1mp_clamp : bsz, tgt_len, src_len + # previous_alpha[i]: bsz, 1, src_len + # alpha_i: bsz, src_len + alpha_i = ( + p_choose[:, i] + * cumprod_1mp[:, i] + * torch.cumsum( + previous_alpha[i][:, 0] / cumprod_1mp_clamp[:, i], dim=1 + ) + ).clamp(0, 1.0) + + previous_alpha.append(alpha_i.unsqueeze(1)) + + # alpha: bsz * num_heads, tgt_len, src_len + alpha = torch.cat(previous_alpha[1:], dim=1) + return alpha + + def _test_custom_alignment_train_impl(self, p_choose, alpha, eps): + if p_choose.is_cuda: + from alignment_train_cuda_binding import alignment_train_cuda # @manual=//deeplearning/projects/fairseq-py:alignment_train_cuda_binding + alignment_train_cuda(p_choose, alpha, eps) + else: + from alignment_train_cpu_binding import alignment_train_cpu # @manual=//deeplearning/projects/fairseq-py:alignment_train_cpu_binding + alignment_train_cpu(p_choose, alpha, eps) + + @settings(deadline=None) + @given( + bsz=st.integers(1, 100), + tgt_len=st.integers(1, 100), + src_len=st.integers(1, 550), + device=st.sampled_from(["cpu", "cuda"]), + ) + def test_alignment_train(self, bsz, tgt_len, src_len, device): + eps = 1e-6 + + assume(device == "cpu" or TEST_CUDA) + p_choose = torch.rand(bsz, tgt_len, src_len, device=device) + + # run the alignment with the custom operator + alpha_act = p_choose.new_zeros([bsz, tgt_len, src_len]) + self._test_custom_alignment_train_impl(p_choose, alpha_act, eps) + + # runu the alignment with the ref implementation + alpha_ref = self._test_custom_alignment_train_ref(p_choose, eps) + + # verify the results + alpha_act = alpha_act.cpu().detach().numpy() + alpha_ref = alpha_ref.cpu().detach().numpy() + np.testing.assert_allclose( + alpha_act, + alpha_ref, + atol=1e-3, + rtol=1e-3, + ) + + +if __name__ == "__main__": + unittest.main() diff --git a/fairseq/examples/simultaneous_translation/tests/test_text_models.py b/fairseq/examples/simultaneous_translation/tests/test_text_models.py new file mode 100644 index 0000000000000000000000000000000000000000..19d63563047ea018ee760dc49530aa097920a4cb --- /dev/null +++ b/fairseq/examples/simultaneous_translation/tests/test_text_models.py @@ -0,0 +1,407 @@ +import argparse +import unittest +from typing import Any, Dict + +import torch +from examples.simultaneous_translation.models import ( + transformer_monotonic_attention +) + + +from tests.test_roberta import FakeTask + + +DEFAULT_CONFIG = { + "attention_eps": 1e-6, + "mass_preservation": True, + 
"noise_type": "flat", + "noise_mean": 0.0, + "noise_var": 1.0, + "energy_bias_init": -2, + "energy_bias": True +} + + +PAD_INDEX = 1 + + +def generate_config(overrides_kv): + new_dict = {key: value for key, value in DEFAULT_CONFIG.items()} + for key, value in overrides_kv.items(): + new_dict[key] = value + return new_dict + + +def make_sample_with_padding(longer_src=False) -> Dict[str, Any]: + tokens_1 = torch.LongTensor( + [ + [2, 10, 11, 12, 13, 14, 15, 10, 11, 12, 13, 14, 15, 2], + [ + 2, 11, 12, 14, 15, 10, 11, 12, 13, 14, 15, 2, + PAD_INDEX, PAD_INDEX + ], + ] + ) + tokens_2 = torch.LongTensor( + [ + [2, 11, 12, 13, 14, 2, PAD_INDEX, PAD_INDEX], + [2, 11, 22, 33, 2, PAD_INDEX, PAD_INDEX, PAD_INDEX] + ] + ) + if longer_src: + src_tokens = tokens_1[:, 1:] + prev_output_tokens = tokens_2 + else: + src_tokens = tokens_2[:, 1:8] + prev_output_tokens = tokens_1 + + src_lengths = src_tokens.ne(PAD_INDEX).sum(dim=1).long() + + sample = { + "net_input": { + "src_tokens": src_tokens, + "prev_output_tokens": prev_output_tokens, + "src_lengths": src_lengths, + }, + "target": prev_output_tokens[:, 1:], + } + return sample + + +def build_transformer_monotonic_attention(**extra_args: Any): + overrides = { + # Use characteristics dimensions + "encoder_embed_dim": 12, + "encoder_ffn_embed_dim": 14, + "decoder_embed_dim": 12, + "decoder_ffn_embed_dim": 14, + # Disable dropout so we have comparable tests. + "dropout": 0, + "attention_dropout": 0, + "activation_dropout": 0, + "encoder_layerdrop": 0, + } + overrides.update(extra_args) + # Overrides the defaults from the parser + args = argparse.Namespace(**overrides) + transformer_monotonic_attention.monotonic_tiny_architecture(args) + + torch.manual_seed(0) + task = FakeTask(args) + return ( + transformer_monotonic_attention + .TransformerModelSimulTrans + .build_model(args, task) + ) + + +def expected_alignment_formula( + p_choose, + mass_perservation=True, + padding_mask=None +): + # Online and Linear-Time Attention by Enforcing Monotonic Alignments + # https://arxiv.org/pdf/1704.00784.pdf + # Eq 18, 19 + bsz, tgt_len, src_len = p_choose.size() + alpha = torch.zeros_like(p_choose) + + if padding_mask is not None: + bsz_pad = padding_mask.size(0) + num_heads = int(bsz / bsz_pad) + padding_mask = ( + padding_mask + .unsqueeze(1) + .expand([bsz_pad, num_heads, src_len]) + .contiguous() + .view(-1, src_len) + ) + + p_choose = p_choose.masked_fill(padding_mask.unsqueeze(1), 0) + + for bsz_i in range(bsz): + for i in range(tgt_len): + for j in range(src_len): + if i == 0: + if j == 0: + # First source token + alpha[bsz_i, i, j] = p_choose[bsz_i, i, j] + else: + # First target token + alpha[bsz_i, i, j] = ( + p_choose[bsz_i, i, j] + * torch.prod( + 1 - p_choose[bsz_i, i, :j] + ) + ) + else: + alpha[bsz_i, i, j] = alpha[bsz_i, i - 1, j] + for k in range(j): + alpha[bsz_i, i, j] += ( + alpha[bsz_i, i - 1, k] + * torch.prod( + 1 - p_choose[bsz_i, i, k:j] + ) + ) + alpha[bsz_i, i, j] *= p_choose[bsz_i, i, j] + + alpha = alpha.masked_fill(padding_mask.unsqueeze(1), 0) + + if mass_perservation: + alpha = mass_perservation_formula(alpha, False, padding_mask) + + return alpha + + +def mass_perservation_formula(alpha, left_padding=False, padding_mask=None): + if padding_mask is None or alpha.size(-1) == 1: + if alpha.size(-1) > 1: + alpha[:, :, -1] = 1 - alpha[:, :, :-1].sum(dim=-1) + return alpha + + src_lens = (padding_mask.logical_not()).sum(dim=1).long() + + bsz, tgt_len, src_len = alpha.size() + + assert ( + not left_padding + or (left_padding and (not 
padding_mask[:, 0].any())) + ) + + alpha = alpha.masked_fill(padding_mask.unsqueeze(1), 0) + + for bsz_i in range(bsz): + if left_padding: + alpha[bsz_i, :, -1] = ( + 1 - alpha[bsz_i, :, :-1].sum(dim=-1) + ) + else: + alpha[bsz_i, :, src_lens[bsz_i] - 1] = ( + 1 - alpha[bsz_i, :, :src_lens[bsz_i] - 1].sum(dim=-1) + ) + + return alpha + + +def expected_soft_attention_formula( + alpha, + soft_energy, + padding_mask=None, + chunksize=1e10, +): + # Monotonic Infinite Lookback Attention for Simultaneous Machine Translation + # https://arxiv.org/pdf/1906.05218.pdf + # Eq 14 + + # Monotonic Chunkwise Attention + # https://arxiv.org/abs/1712.05382 + # Eq 17 + bsz, tgt_len, src_len = alpha.size() + beta = torch.zeros_like(alpha) + + if padding_mask is not None: + bsz_pad = padding_mask.size(0) + num_heads = int(bsz / bsz_pad) + # Expanding for potential head dimension + padding_mask = ( + padding_mask + .unsqueeze(1) + .expand([bsz_pad, num_heads, src_len]) + .contiguous() + .view(-1, src_len) + ) + soft_energy = soft_energy.masked_fill(padding_mask.unsqueeze(1), float('-inf')) + + for bsz_i in range(bsz): + for i in range(tgt_len): + for j in range(src_len): + for k in range(j, min([src_len, j + chunksize])): + if not padding_mask[bsz_i, j]: + beta[bsz_i, i, j] += ( + alpha[bsz_i, i, k] * torch.exp(soft_energy[bsz_i, i, j]) + / torch.sum(torch.exp(soft_energy[bsz_i, i, max([0, k - chunksize + 1]):k + 1])) + ) + return beta + + +class MonotonicAttentionTestAbstractClass(object): + def test_forward(self): + sample = make_sample_with_padding() + out, _ = self.model.forward(**sample["net_input"]) + loss = out.sum() + loss.backward() + + def test_p_choose(self): + sample = make_sample_with_padding() + _, extra_out = self.model.forward(**sample["net_input"]) + for item in extra_out.attn_list: + p_choose = item["p_choose"] + self.assertTrue(p_choose.le(1.0).all()) + self.assertTrue(p_choose.ge(0.0).all()) + + def test_expected_alignment(self): + for longer_src in [True, False]: + sample = make_sample_with_padding(longer_src) + _, extra_out = self.model.forward(**sample["net_input"]) + for item in extra_out.attn_list: + p_choose = item["p_choose"] + alpha_system = item["alpha"] + self.assertTrue(p_choose.size() == alpha_system.size()) + bsz, num_head, tgt_len, src_len = alpha_system.size() + alpha_system = alpha_system.view(-1, tgt_len, src_len) + p_choose = p_choose.view(-1, tgt_len, src_len) + + alpha_real = expected_alignment_formula( + p_choose, + self.model.decoder.layers[0].encoder_attn.mass_preservation, + sample["net_input"]["src_tokens"].eq(PAD_INDEX) + ) + + self.assertTrue( + torch.abs(alpha_system - alpha_real).le(5e-5).all(), + ) + + +class HardMonotonicAttentionTestCase( + unittest.TestCase, + MonotonicAttentionTestAbstractClass +): + def setUp(self): + self.model = build_transformer_monotonic_attention( + **generate_config({"simul_type": "hard_aligned"}) + ) + + +class InfiniteLookbackTestCase( + unittest.TestCase, + MonotonicAttentionTestAbstractClass +): + def setUp(self): + self.model = build_transformer_monotonic_attention( + **generate_config( + { + "simul_type": "infinite_lookback" + } + ) + ) + self.model.train() + + def test_fp16_for_long_input(self): + sample = { + "net_input": { + "src_tokens": torch.LongTensor([7] * 1000 + [2]).cuda().unsqueeze(0), + "prev_output_tokens": torch.LongTensor([7] * 1000 + [2]).cuda().unsqueeze(0), + "src_lengths": torch.LongTensor([1000]).cuda(), + }, + "target": torch.LongTensor([2] + [7] * 1000).unsqueeze(0).cuda() + } + self.model.cuda().half() + 
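# single fp16 forward pass on a ~1000-token input; the loop below checks the returned attention tensors for NaNs caused by overflow +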
_, extra_out = self.model.forward(**sample["net_input"]) + for item in extra_out.attn_list: + for key in ["p_choose", "alpha", "beta", "soft_energy"]: + self.assertFalse(torch.isnan(item[key]).any()) + + def test_expected_attention(self): + for longer_src in [True, False]: + sample = make_sample_with_padding(longer_src) + _, extra_out = self.model.forward(**sample["net_input"]) + for item in extra_out.attn_list: + p_choose = item["p_choose"] + alpha_system = item["alpha"] + beta_system = item["beta"] + soft_energy_system = item["soft_energy"] + self.assertTrue(beta_system.size() == alpha_system.size()) + self.assertTrue(p_choose.size() == alpha_system.size()) + + bsz, num_head, tgt_len, src_len = alpha_system.size() + + alpha_system = alpha_system.view(-1, tgt_len, src_len) + beta_system = beta_system.view(-1, tgt_len, src_len) + p_choose = p_choose.view(-1, tgt_len, src_len) + soft_energy_system = soft_energy_system.view(-1, tgt_len, src_len) + + alpha_real = expected_alignment_formula( + p_choose, + self.model.decoder.layers[0].encoder_attn.mass_preservation, + sample["net_input"]["src_tokens"].eq(PAD_INDEX) + ) + + beta_real = expected_soft_attention_formula( + alpha_real, + soft_energy_system, + sample["net_input"]["src_tokens"].eq(PAD_INDEX), + chunksize=getattr( + self.model.decoder.layers[0].encoder_attn, + "chunk_size", + int(1e10) + ) or int(1e10) + ) + + self.assertTrue( + torch.abs(beta_system - beta_real).le(1e-5).all(), + ) + + +class ChunkwiswTestCase( + InfiniteLookbackTestCase +): + def setUp(self): + self.model = build_transformer_monotonic_attention( + **generate_config( + { + "simul_type": "chunkwise", + "mocha_chunk_size": 3 + } + ) + ) + + +class WaitkTestCase(InfiniteLookbackTestCase): + def setUp(self): + self.model = build_transformer_monotonic_attention( + **generate_config( + { + "simul_type": "waitk", + "waitk_lagging": 3, + } + ) + ) + + def check_waitk(self, p_choose, lagging, padding_mask): + bsz, tgt_len, src_len = p_choose.size() + for bsz_i in range(bsz): + for i in range(tgt_len): + for j in range(src_len): + if not padding_mask[bsz_i, j]: + if j - i == lagging - 1: + self.assertTrue(p_choose[bsz_i, i, j] == 1) + else: + self.assertTrue(p_choose[bsz_i, i, j] == 0) + + def test_waitk_p_choose(self): + for longer_src in [True, False]: + for k in [1, 3, 10, 20, 100]: + sample = make_sample_with_padding(longer_src) + model = build_transformer_monotonic_attention( + **generate_config( + { + "simul_type": "waitk", + "waitk_lagging": k, + } + ) + ) + model.train() + _, extra_out = model.forward(**sample["net_input"]) + for item in extra_out.attn_list: + p_choose = item["p_choose"] + bsz, num_heads, tgt_len, src_len = p_choose.size() + padding_mask = sample["net_input"]["src_tokens"].eq(PAD_INDEX) + padding_mask = ( + padding_mask + .unsqueeze(1) + .expand([bsz, num_heads, src_len]) + .contiguous() + .view(-1, src_len) + ) + p_choose = p_choose.view(bsz * num_heads, tgt_len, src_len) + self.check_waitk(p_choose, k, padding_mask) diff --git a/fairseq/examples/simultaneous_translation/utils/__init__.py b/fairseq/examples/simultaneous_translation/utils/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..1e9ce844f59a4211061392084cc81075e6bab19f --- /dev/null +++ b/fairseq/examples/simultaneous_translation/utils/__init__.py @@ -0,0 +1,14 @@ +# Copyright (c) Facebook, Inc. and its affiliates. +# +# This source code is licensed under the MIT license found in the +# LICENSE file in the root directory of this source tree. 
+ +import importlib +import os + + +# automatically import any Python files in the criterions/ directory +for file in sorted(os.listdir(os.path.dirname(__file__))): + if file.endswith(".py") and not file.startswith("_"): + module = file[: file.find(".py")] + importlib.import_module("examples.simultaneous_translation.utils." + module) diff --git a/fairseq/examples/simultaneous_translation/utils/__pycache__/monotonic_attention.cpython-310.pyc b/fairseq/examples/simultaneous_translation/utils/__pycache__/monotonic_attention.cpython-310.pyc new file mode 100644 index 0000000000000000000000000000000000000000..0520a2f84c83a43d033f612a50dc39f77b3bc672 Binary files /dev/null and b/fairseq/examples/simultaneous_translation/utils/__pycache__/monotonic_attention.cpython-310.pyc differ diff --git a/fairseq/examples/simultaneous_translation/utils/functions.py b/fairseq/examples/simultaneous_translation/utils/functions.py new file mode 100644 index 0000000000000000000000000000000000000000..590a6c11cea222ac9096b19f0e3dfe1b71b6c10b --- /dev/null +++ b/fairseq/examples/simultaneous_translation/utils/functions.py @@ -0,0 +1,125 @@ +# Copyright (c) Facebook, Inc. and its affiliates. +# +# This source code is licensed under the MIT license found in the +# LICENSE file in the root directory of this source tree. + +import torch + + +def prob_check(tensor, eps=1e-10): + assert not torch.isnan(tensor).any(), ( + "Nan in a probability tensor." + ) + # Add the eps here to prevent errors introduced by precision + assert tensor.le(1.0 + eps).all() and tensor.ge(0.0 - eps).all(), ( + "Incorrect values in a probability tensor" + ", 0.0 <= tensor <= 1.0" + ) + + +def exclusive_cumprod(tensor, dim: int, eps: float = 1e-10): + """ + Implementing exclusive cumprod. + There is cumprod in pytorch, however there is no exclusive mode. + cumprod(x) = [x1, x1x2, x2x3x4, ..., prod_{i=1}^n x_i] + exclusive means + cumprod(x) = [1, x1, x1x2, x1x2x3, ..., prod_{i=1}^{n-1} x_i] + """ + tensor_size = list(tensor.size()) + tensor_size[dim] = 1 + return_tensor = safe_cumprod( + torch.cat([torch.ones(tensor_size).type_as(tensor), tensor], dim=dim), + dim=dim, + eps=eps, + ) + + if dim == 0: + return return_tensor[:-1] + elif dim == 1: + return return_tensor[:, :-1] + elif dim == 2: + return return_tensor[:, :, :-1] + else: + raise RuntimeError( + "Cumprod on dimension 3 and more is not implemented" + ) + + +def safe_cumprod(tensor, dim: int, eps: float = 1e-10): + """ + An implementation of cumprod to prevent precision issue. + cumprod(x) + = [x1, x1x2, x1x2x3, ....] + = [exp(log(x1)), exp(log(x1) + log(x2)), exp(log(x1) + log(x2) + log(x3)), ...] + = exp(cumsum(log(x))) + """ + + if (tensor + eps < 0).any().item(): + raise RuntimeError( + "Safe cumprod can only take non-negative tensors as input." + "Consider use torch.cumprod if you want to calculate negative values." 
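# note: torch.log below would produce NaN for negative entries, hence this explicit check +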
+ ) + + log_tensor = torch.log(tensor + eps) + cumsum_log_tensor = torch.cumsum(log_tensor, dim) + exp_cumsum_log_tensor = torch.exp(cumsum_log_tensor) + return exp_cumsum_log_tensor + + +def moving_sum(x, start_idx: int, end_idx: int): + """ + From MONOTONIC CHUNKWISE ATTENTION + https://arxiv.org/pdf/1712.05382.pdf + Equation (18) + + x = [x_1, x_2, ..., x_N] + MovingSum(x, start_idx, end_idx)_n = Sigma_{m=n−(start_idx−1)}^{n+end_idx-1} x_m + for n in {1, 2, 3, ..., N} + + x : src_len, batch_size + start_idx : start idx + end_idx : end idx + + Example + src_len = 5 + batch_size = 3 + x = + [[ 0, 5, 10], + [ 1, 6, 11], + [ 2, 7, 12], + [ 3, 8, 13], + [ 4, 9, 14]] + + MovingSum(x, 3, 1) = + [[ 0, 5, 10], + [ 1, 11, 21], + [ 3, 18, 33], + [ 6, 21, 36], + [ 9, 24, 39]] + + MovingSum(x, 1, 3) = + [[ 3, 18, 33], + [ 6, 21, 36], + [ 9, 24, 39], + [ 7, 17, 27], + [ 4, 9, 14]] + """ + # TODO: Make dimension configurable + assert start_idx > 0 and end_idx > 0 + batch_size, tgt_len, src_len = x.size() + x = x.view(-1, src_len).unsqueeze(1) + # batch_size, 1, src_len + moving_sum_weight = torch.ones([1, 1, end_idx + start_idx - 1]).type_as(x) + + moving_sum = torch.nn.functional.conv1d( + x, moving_sum_weight, padding=start_idx + end_idx - 1 + ).squeeze(1) + + moving_sum = moving_sum[:, end_idx:-start_idx] + + assert src_len == moving_sum.size(1) + assert batch_size * tgt_len == moving_sum.size(0) + + moving_sum = moving_sum.view(batch_size, tgt_len, src_len) + + return moving_sum diff --git a/fairseq/examples/simultaneous_translation/utils/monotonic_attention.py b/fairseq/examples/simultaneous_translation/utils/monotonic_attention.py new file mode 100644 index 0000000000000000000000000000000000000000..3b8e0a858cbe8713d99629d535119594aeb08d18 --- /dev/null +++ b/fairseq/examples/simultaneous_translation/utils/monotonic_attention.py @@ -0,0 +1,180 @@ +from typing import Optional +import torch +from torch import Tensor + +from examples.simultaneous_translation.utils.functions import ( + exclusive_cumprod, + prob_check, + moving_sum, +) + + +def expected_alignment_from_p_choose( + p_choose: Tensor, + padding_mask: Optional[Tensor] = None, + eps: float = 1e-6 +): + """ + Calculating expected alignment for from stepwise probability + + Reference: + Online and Linear-Time Attention by Enforcing Monotonic Alignments + https://arxiv.org/pdf/1704.00784.pdf + + q_ij = (1 − p_{ij−1})q_{ij−1} + a+{i−1j} + a_ij = p_ij q_ij + + Parallel solution: + ai = p_i * cumprod(1 − pi) * cumsum(a_i / cumprod(1 − pi)) + + ============================================================ + Expected input size + p_choose: bsz, tgt_len, src_len + """ + prob_check(p_choose) + + # p_choose: bsz, tgt_len, src_len + bsz, tgt_len, src_len = p_choose.size() + dtype = p_choose.dtype + + p_choose = p_choose.float() + + if padding_mask is not None: + p_choose = p_choose.masked_fill(padding_mask.unsqueeze(1), 0.0) + + if p_choose.is_cuda: + p_choose = p_choose.contiguous() + from alignment_train_cuda_binding import alignment_train_cuda as alignment_train + else: + from alignment_train_cpu_binding import alignment_train_cpu as alignment_train + + alpha = p_choose.new_zeros([bsz, tgt_len, src_len]) + alignment_train(p_choose, alpha, eps) + + # Mix precision to prevent overflow for fp16 + alpha = alpha.type(dtype) + + prob_check(alpha) + + return alpha + + +def expected_soft_attention( + alpha: Tensor, + soft_energy: Tensor, + padding_mask: Optional[Tensor] = None, + chunk_size: Optional[int] = None, + eps: float = 1e-10 +): + """ + Function to 
compute expected soft attention for + monotonic infinite lookback attention from + expected alignment and soft energy. + + Reference: + Monotonic Chunkwise Attention + https://arxiv.org/abs/1712.05382 + + Monotonic Infinite Lookback Attention for Simultaneous Machine Translation + https://arxiv.org/abs/1906.05218 + + alpha: bsz, tgt_len, src_len + soft_energy: bsz, tgt_len, src_len + padding_mask: bsz, src_len + left_padding: bool + """ + if padding_mask is not None: + alpha = alpha.masked_fill(padding_mask.unsqueeze(1), 0.0) + soft_energy = soft_energy.masked_fill( + padding_mask.unsqueeze(1), -float("inf") + ) + + prob_check(alpha) + + dtype = alpha.dtype + + alpha = alpha.float() + soft_energy = soft_energy.float() + + soft_energy = soft_energy - soft_energy.max(dim=2, keepdim=True)[0] + exp_soft_energy = torch.exp(soft_energy) + eps + + if chunk_size is not None: + # Chunkwise + beta = ( + exp_soft_energy + * moving_sum( + alpha / (eps + moving_sum(exp_soft_energy, chunk_size, 1)), + 1, chunk_size + ) + ) + else: + # Infinite lookback + # Notice that infinite lookback is a special case of chunkwise + # where chunksize = inf + inner_items = alpha / (eps + torch.cumsum(exp_soft_energy, dim=2)) + + beta = ( + exp_soft_energy + * torch.cumsum(inner_items.flip(dims=[2]), dim=2) + .flip(dims=[2]) + ) + + if padding_mask is not None: + beta = beta.masked_fill( + padding_mask.unsqueeze(1).to(torch.bool), 0.0) + + # Mix precision to prevent overflow for fp16 + beta = beta.type(dtype) + + beta = beta.clamp(0, 1) + + prob_check(beta) + + return beta + + +def mass_preservation( + alpha: Tensor, + padding_mask: Optional[Tensor] = None, + left_padding: bool = False +): + """ + Function to compute the mass perservation for alpha. + This means that the residual weights of alpha will be assigned + to the last token. + + Reference: + Monotonic Infinite Lookback Attention for Simultaneous Machine Translation + https://arxiv.org/abs/1906.05218 + + alpha: bsz, tgt_len, src_len + padding_mask: bsz, src_len + left_padding: bool + """ + + prob_check(alpha) + + if padding_mask is not None: + if not left_padding: + assert not padding_mask[:, 0].any(), ( + "Find padding on the beginning of the sequence." 
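# right padding is assumed here; the residual probability mass is later assigned to the last non-padded source position of each sequence +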
+ ) + alpha = alpha.masked_fill(padding_mask.unsqueeze(1), 0.0) + + if left_padding or padding_mask is None: + residuals = 1 - alpha[:, :, :-1].sum(dim=-1).clamp(0, 1) + alpha[:, :, -1] = residuals + else: + # right padding + _, tgt_len, src_len = alpha.size() + residuals = 1 - alpha.sum(dim=-1, keepdim=True).clamp(0, 1) + src_lens = src_len - padding_mask.sum(dim=1, keepdim=True) + src_lens = src_lens.expand(-1, tgt_len).contiguous() + # add back the last value + residuals += alpha.gather(2, src_lens.unsqueeze(2) - 1) + alpha = alpha.scatter(2, src_lens.unsqueeze(2) - 1, residuals) + + prob_check(alpha) + + return alpha diff --git a/fairseq/examples/simultaneous_translation/utils/p_choose_strategy.py b/fairseq/examples/simultaneous_translation/utils/p_choose_strategy.py new file mode 100644 index 0000000000000000000000000000000000000000..724c6912a62d48fc61988cac1434a4f5c8754521 --- /dev/null +++ b/fairseq/examples/simultaneous_translation/utils/p_choose_strategy.py @@ -0,0 +1,126 @@ +from typing import Optional, Dict +from torch import Tensor +import torch + + +def waitk_p_choose( + tgt_len: int, + src_len: int, + bsz: int, + waitk_lagging: int, + key_padding_mask: Optional[Tensor] = None, + incremental_state: Optional[Dict[str, Dict[str, Optional[Tensor]]]] = None +): + + max_src_len = src_len + if incremental_state is not None: + # Retrieve target length from incremental states + # For inference the length of query is always 1 + max_tgt_len = incremental_state["steps"]["tgt"] + assert max_tgt_len is not None + max_tgt_len = int(max_tgt_len) + else: + max_tgt_len = tgt_len + + if max_src_len < waitk_lagging: + if incremental_state is not None: + max_tgt_len = 1 + return torch.zeros( + bsz, max_tgt_len, max_src_len + ) + + # Assuming the p_choose looks like this for wait k=3 + # src_len = 6, max_tgt_len = 5 + # [0, 0, 1, 0, 0, 0, 0] + # [0, 0, 0, 1, 0, 0, 0] + # [0, 0, 0, 0, 1, 0, 0] + # [0, 0, 0, 0, 0, 1, 0] + # [0, 0, 0, 0, 0, 0, 1] + # linearize the p_choose matrix: + # [0, 0, 1, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0...] + # The indices of linearized matrix that equals 1 is + # 2 + 6 * 0 + # 3 + 6 * 1 + # ... 
+ # n + src_len * n + k - 1 = n * (src_len + 1) + k - 1 + # n from 0 to max_tgt_len - 1 + # + # First, generate the indices (activate_indices_offset: bsz, max_tgt_len) + # Second, scatter a zeros tensor (bsz, max_tgt_len * src_len) + # with activate_indices_offset + # Third, resize the tensor to (bsz, max_tgt_len, src_len) + + activate_indices_offset = ( + ( + torch.arange(max_tgt_len) * (max_src_len + 1) + + waitk_lagging - 1 + ) + .unsqueeze(0) + .expand(bsz, max_tgt_len) + .long() + ) + + if key_padding_mask is not None: + if key_padding_mask[:, 0].any(): + # Left padding + activate_indices_offset += ( + key_padding_mask.sum(dim=1, keepdim=True) + ) + + # Need to clamp the indices that are too large + activate_indices_offset = ( + activate_indices_offset + .clamp( + 0, + min( + [ + max_tgt_len, + max_src_len - waitk_lagging + 1 + ] + ) * max_src_len - 1 + ) + ) + + p_choose = torch.zeros(bsz, max_tgt_len * max_src_len) + + p_choose = p_choose.scatter( + 1, + activate_indices_offset, + 1.0 + ).view(bsz, max_tgt_len, max_src_len) + + if key_padding_mask is not None: + p_choose = p_choose.to(key_padding_mask) + p_choose = p_choose.masked_fill(key_padding_mask.unsqueeze(1), 0) + + if incremental_state is not None: + p_choose = p_choose[:, -1:] + + return p_choose.float() + + +def learnable_p_choose( + energy, + noise_mean: float = 0.0, + noise_var: float = 0.0, + training: bool = True +): + """ + Calculating step wise prob for reading and writing + 1 to read, 0 to write + energy: bsz, tgt_len, src_len + """ + + noise = 0 + if training: + # add noise here to encourage discretness + noise = ( + torch.normal(noise_mean, noise_var, energy.size()) + .type_as(energy) + .to(energy.device) + ) + + p_choose = torch.sigmoid(energy + noise) + + # p_choose: bsz * self.num_heads, tgt_len, src_len + return p_choose
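
For reference, the following standalone sketch (not part of the files added above; the helper name `waitk_pattern` and the toy sizes are made up for illustration) spells out the wait-k read/write pattern that `waitk_p_choose` builds with a scatter over linearized indices: target step `i` writes only after source position `i + k - 1` has been read. Padding and source exhaustion are intentionally not handled here.

```python
# Minimal sketch of the wait-k p_choose pattern, written with an explicit loop
# instead of the scatter trick used in waitk_p_choose above.
import torch


def waitk_pattern(tgt_len: int, src_len: int, k: int) -> torch.Tensor:
    # Target step i (0-based) fires exactly when source position j = i + k - 1
    # has been read, matching the matrix drawn in the comments of waitk_p_choose.
    p_choose = torch.zeros(tgt_len, src_len)
    for i in range(tgt_len):
        j = i + k - 1
        if j < src_len:
            p_choose[i, j] = 1.0
    return p_choose


if __name__ == "__main__":
    print(waitk_pattern(tgt_len=5, src_len=7, k=3))
    # tensor([[0., 0., 1., 0., 0., 0., 0.],
    #         [0., 0., 0., 1., 0., 0., 0.],
    #         [0., 0., 0., 0., 1., 0., 0.],
    #         [0., 0., 0., 0., 0., 1., 0.],
    #         [0., 0., 0., 0., 0., 0., 1.]])
```

For these sizes the loop reproduces the same 0/1 matrix as the linearized-index construction in `waitk_p_choose` (batch dimension aside); the library version additionally shifts the indices for left padding and clamps them when the source is shorter than the lagging.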