# coding=utf-8
# Copyright 2022 IDEA-CCNL and The HuggingFace Inc. team. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""Tokenization classes for TransfoXLDenoise."""
import sentencepiece as spm
from transformers.tokenization_utils import PreTrainedTokenizer

VOCAB_FILES_NAMES = {"vocab_file": "spiece.model"}

PRETRAINED_VOCAB_FILES_MAP = {
"vocab_file": {
"transformer-xl-1b-base":
"https://huggingface.co/IDEA-CCNL/Bigan-Transformer-XL-denoise-1.1B/resolve/main/spiece.model",
},
}
PRETRAINED_POSITIONAL_EMBEDDINGS_SIZES = {
"transformer-xl-1b-base": 512,
}


class TransfoXLDenoiseTokenizer(PreTrainedTokenizer):
"""
    Construct a TransfoXLDenoise tokenizer, based on a pretrained SentencePiece model.

    Args:
vocab_file (`str`):
Path to the vocabulary file.
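
    Example (a minimal usage sketch; it assumes the SentencePiece model is available
    locally as ``spiece.model``, or has been fetched via ``from_pretrained`` from the
    IDEA-CCNL/Bigan-Transformer-XL-denoise-1.1B repository referenced above):

        >>> tokenizer = TransfoXLDenoiseTokenizer(vocab_file="spiece.model")
        >>> pieces = tokenizer.tokenize("Hello world")
        >>> ids = tokenizer.convert_tokens_to_ids(pieces)
        >>> text = tokenizer.convert_tokens_to_string(pieces)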
"""
vocab_files_names = VOCAB_FILES_NAMES
pretrained_vocab_files_map = PRETRAINED_VOCAB_FILES_MAP
max_model_input_sizes = PRETRAINED_POSITIONAL_EMBEDDINGS_SIZES
model_input_names = ["input_ids", "attention_mask"]
SPIECE_UNDERLINE = "▁"

    def __init__(
        self,
        vocab_file,
        unk_token="<|endoftext|>",
        bos_token="<|endoftext|>",
        eos_token="<|endoftext|>",
        **kwargs
    ):
        # Load the SentencePiece model before calling the base constructor: the base
        # class may resolve special tokens through `_convert_token_to_id`, which
        # requires `self.sp_model` to already be available.
        self.sp_model = spm.SentencePieceProcessor()
        self.sp_model.Load(vocab_file)
        super().__init__(bos_token=bos_token, eos_token=eos_token, unk_token=unk_token, **kwargs)

    @property
    def vocab_size(self):
        """Returns the size of the SentencePiece vocabulary."""
        return len(self.sp_model)

    def _tokenize(self, text):
        """Tokenizes a string into a list of SentencePiece pieces."""
        return self.sp_model.EncodeAsPieces(text)

    def _convert_token_to_id(self, token):
        """Converts a token (str) to an id using the vocab."""
        return self.sp_model.PieceToId(token)

    def _convert_id_to_token(self, index):
        """Converts an index (integer) to a token (str) using the vocab."""
        return self.sp_model.IdToPiece(index)

    def convert_tokens_to_string(self, tokens):
        """Converts a sequence of tokens (strings) to a single string."""
        out_string = "".join(tokens).replace(self.SPIECE_UNDERLINE, " ").strip()
        return out_string
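

# A minimal, self-contained usage sketch (not part of the original module): it assumes
# a SentencePiece model file is available at the illustrative path "spiece.model".
if __name__ == "__main__":
    tokenizer = TransfoXLDenoiseTokenizer("spiece.model")
    pieces = tokenizer.tokenize("who are you?")        # e.g. ['▁who', '▁are', '▁you', '?']
    ids = tokenizer.convert_tokens_to_ids(pieces)      # map pieces to vocabulary ids
    # The "▁" piece markers are turned back into spaces when detokenizing.
    print(tokenizer.convert_tokens_to_string(pieces))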