# coding=utf-8
# Copyright 2022 IDEA-CCNL and The HuggingFace Inc. team. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""Tokenization classes for TransfoXLDenoise."""

import sentencepiece as spm

from transformers.tokenization_utils import PreTrainedTokenizer

VOCAB_FILES_NAMES = {"vocab_file": "spiece.model"}

PRETRAINED_VOCAB_FILES_MAP = {
    "vocab_file": {
        "transformer-xl-1b-base": (
            "https://huggingface.co/IDEA-CCNL/Bigan-Transformer-XL-denoise-1.1B/resolve/main/spiece.model"
        ),
    },
}

PRETRAINED_POSITIONAL_EMBEDDINGS_SIZES = {
    "transformer-xl-1b-base": 512,
}

SPIECE_UNDERLINE = "▁"


class TransfoXLDenoiseTokenizer(PreTrainedTokenizer):
    """
    Construct a TransfoXLDenoise tokenizer, backed by a pretrained
    [SentencePiece](https://github.com/google/sentencepiece) model.

    Args:
        vocab_file (`str`):
            Path to the SentencePiece model file that holds the vocabulary.
        unk_token (`str`, *optional*, defaults to `"<|endoftext|>"`):
            The unknown token.
        bos_token (`str`, *optional*, defaults to `"<|endoftext|>"`):
            The beginning-of-sequence token.
        eos_token (`str`, *optional*, defaults to `"<|endoftext|>"`):
            The end-of-sequence token.
    """

    vocab_files_names = VOCAB_FILES_NAMES
    pretrained_vocab_files_map = PRETRAINED_VOCAB_FILES_MAP
    max_model_input_sizes = PRETRAINED_POSITIONAL_EMBEDDINGS_SIZES
    model_input_names = ["input_ids", "attention_mask"]

    def __init__(
        self,
        vocab_file,
        unk_token="<|endoftext|>",
        bos_token="<|endoftext|>",
        eos_token="<|endoftext|>",
        **kwargs,
    ):
        # Load the SentencePiece model before calling the parent constructor:
        # `PreTrainedTokenizer.__init__` may query the vocabulary (e.g. when
        # registering special tokens), which requires `sp_model` to exist.
        self.sp_model = spm.SentencePieceProcessor()
        self.sp_model.Load(vocab_file)
        super().__init__(bos_token=bos_token, eos_token=eos_token, unk_token=unk_token, **kwargs)

    @property
    def vocab_size(self):
        """Returns the size of the vocabulary."""
        return len(self.sp_model)

    def get_vocab(self):
        """Returns the vocabulary as a dict mapping token strings to ids."""
        vocab = {self.convert_ids_to_tokens(i): i for i in range(self.vocab_size)}
        vocab.update(self.added_tokens_encoder)
        return vocab

    def _tokenize(self, text):
        """Tokenizes a string into a list of SentencePiece tokens."""
        return self.sp_model.EncodeAsPieces(text)

    def _convert_token_to_id(self, token):
        """Converts a token (str) to an id using the vocab."""
        return self.sp_model.PieceToId(token)

    def _convert_id_to_token(self, index):
        """Converts an index (integer) to a token (str) using the vocab."""
        return self.sp_model.IdToPiece(index)

    def convert_tokens_to_string(self, tokens):
        """Converts a sequence of tokens (strings) into a single string."""
        return "".join(tokens).replace(SPIECE_UNDERLINE, " ").strip()
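
# A minimal usage sketch, not part of the original module: it assumes a local
# SentencePiece model file named "spiece.model" (the filename declared in
# VOCAB_FILES_NAMES); point `vocab_file` at your own checkpoint as needed.
if __name__ == "__main__":
    tokenizer = TransfoXLDenoiseTokenizer(vocab_file="spiece.model")

    # Round-trip: text -> SentencePiece tokens -> ids -> plain string.
    tokens = tokenizer.tokenize("Hello world!")
    ids = tokenizer.convert_tokens_to_ids(tokens)
    print(tokens)  # e.g. ['▁Hello', '▁world', '!'], depending on the model
    print(ids)
    print(tokenizer.convert_tokens_to_string(tokens))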