"""Tokenization classes for TransfoXLDenoise.""" |

import sentencepiece as spm

from transformers.tokenization_utils import PreTrainedTokenizer


VOCAB_FILES_NAMES = {"vocab_file": "spiece.model"}

PRETRAINED_VOCAB_FILES_MAP = {
    "vocab_file": {
        "transformer-xl-1b-base": (
            "https://huggingface.co/IDEA-CCNL/Bigan-Transformer-XL-denoise-1.1B/resolve/main/spiece.model"
        ),
    },
}

PRETRAINED_POSITIONAL_EMBEDDINGS_SIZES = {
    "transformer-xl-1b-base": 512,
}


class TransfoXLDenoiseTokenizer(PreTrainedTokenizer):
    """
    Construct a TransfoXLDenoise tokenizer, based on a pretrained
    [SentencePiece](https://github.com/google/sentencepiece) model.

    Args:
        vocab_file (`str`):
            Path to the SentencePiece vocabulary file (usually `spiece.model`).
    """

    vocab_files_names = VOCAB_FILES_NAMES
    pretrained_vocab_files_map = PRETRAINED_VOCAB_FILES_MAP
    max_model_input_sizes = PRETRAINED_POSITIONAL_EMBEDDINGS_SIZES
    model_input_names = ["input_ids", "attention_mask"]
    SPIECE_UNDERLINE = "▁"

    def __init__(
        self,
        vocab_file,
        unk_token="<|endoftext|>",
        bos_token="<|endoftext|>",
        eos_token="<|endoftext|>",
        **kwargs,
    ):
        # Load the SentencePiece model before calling the parent constructor,
        # since `PreTrainedTokenizer.__init__` may already query the vocabulary
        # (e.g. when registering the special tokens passed above).
        self.vocab_file = vocab_file
        self.sp_model = spm.SentencePieceProcessor()
        self.sp_model.Load(vocab_file)
        super().__init__(bos_token=bos_token, eos_token=eos_token, unk_token=unk_token, **kwargs)
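
    # Not in the original file: a sketch of pickling support, following the
    # pattern used by other SentencePiece-based tokenizers. The
    # `SentencePieceProcessor` object itself is not picklable, so it is dropped
    # from the state and reloaded from `vocab_file` on unpickling.
    def __getstate__(self):
        state = self.__dict__.copy()
        state["sp_model"] = None
        return state

    def __setstate__(self, d):
        self.__dict__ = d
        self.sp_model = spm.SentencePieceProcessor()
        self.sp_model.Load(self.vocab_file)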

    @property
    def vocab_size(self):
        """Returns the size of the SentencePiece vocabulary."""
        return len(self.sp_model)
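
    # Not in the original file: a minimal `get_vocab` sketch, assuming the
    # standard `PreTrainedTokenizer` contract of returning the full
    # token-to-id mapping, including any added tokens.
    def get_vocab(self):
        vocab = {self._convert_id_to_token(i): i for i in range(self.vocab_size)}
        vocab.update(self.added_tokens_encoder)
        return vocab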

    def _tokenize(self, text):
        """Tokenizes a string into SentencePiece sub-word pieces."""
        return self.sp_model.EncodeAsPieces(text)

    def _convert_token_to_id(self, token):
        """Converts a token (str) to an id using the vocab."""
        return self.sp_model.PieceToId(token)

    def _convert_id_to_token(self, index):
        """Converts an index (integer) to a token (str) using the vocab."""
        return self.sp_model.IdToPiece(index)

    def convert_tokens_to_string(self, tokens):
        """Converts a sequence of tokens (strings) into a single string."""
        out_string = "".join(tokens).replace(self.SPIECE_UNDERLINE, " ").strip()
        return out_string
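

# Hypothetical usage sketch (not part of the original file). It assumes the
# "transformer-xl-1b-base" shortcut registered in PRETRAINED_VOCAB_FILES_MAP
# above resolves, or that a local directory containing `spiece.model` is
# passed instead:
#
#     tokenizer = TransfoXLDenoiseTokenizer.from_pretrained("transformer-xl-1b-base")
#     tokens = tokenizer.tokenize("An example sentence.")
#     text = tokenizer.convert_tokens_to_string(tokens)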