minBERT / tokenizer.py

Transfer code from Kaggle

a0b398e 3 months ago

104 kB

	from typing import List, Optional, Tuple, Dict, Union, Any, overload, Sequence, NamedTuple
	import collections
	import os
	import re
	import unicodedata
	import itertools
	import requests
	import copy
	import json
	from contextlib import contextmanager
	from collections import OrderedDict, UserDict
	from enum import Enum
	import numpy as np
	from utils import cached_path, hf_bucket_url, is_remote_url, is_tf_available, is_torch_available
	from tokenizers import AddedToken
	from tokenizers import Encoding as EncodingFast


	# Sentinel sizes. Both go through float literals, so the resulting ints are the
	# nearest representable doubles rather than exact powers of ten.
	VERY_LARGE_INTEGER = int(1e30) # This is used to set the max input length for a model with infinite size input
	LARGE_INTEGER = int(1e20) # This is used when we need something big but slightly smaller than VERY_LARGE_INTEGER

	# File names that `PreTrainedTokenizerBase.from_pretrained` looks for in addition
	# to the class-specific vocabulary files.
	SPECIAL_TOKENS_MAP_FILE = "special_tokens_map.json"
	ADDED_TOKENS_FILE = "added_tokens.json"
	TOKENIZER_CONFIG_FILE = "tokenizer_config.json"
	FULL_TOKENIZER_FILE = "tokenizer.json"

	# Per-checkpoint tables for "bert-base-uncased".
	# NOTE(review): nothing in the visible part of this file reads these four tables;
	# presumably a BERT tokenizer class defined elsewhere plugs them into its
	# `vocab_files_names` / `pretrained_vocab_files_map` / `max_model_input_sizes` /
	# `pretrained_init_configuration` class attributes - confirm.
	VOCAB_FILES_NAMES = {"vocab_file": "vocab.txt"}
	PRETRAINED_VOCAB_FILES_MAP = {
	"vocab_file": {
	"bert-base-uncased": "https://huggingface.co/bert-base-uncased/resolve/main/vocab.txt"
	}
	}
	PRETRAINED_POSITIONAL_EMBEDDINGS_SIZES = {
	"bert-base-uncased": 512
	}
	PRETRAINED_INIT_CONFIGURATION = {
	"bert-base-uncased": {"do_lower_case": True}
	}


	# Type aliases for the input shapes named in annotations: raw text, text already
	# split into strings, lists of integer ids, and the two-sequence (pair) variants.
	TextInput = str
	PreTokenizedInput = List[str]
	EncodedInput = List[int]
	TextInputPair = Tuple[str, str]
	PreTokenizedInputPair = Tuple[List[str], List[str]]
	EncodedInputPair = Tuple[List[int], List[int]]


	class ExplicitEnum(Enum):
	    """Enum base class whose failed value lookup reports every accepted value."""

	    @classmethod
	    def _missing_(cls, value):
	        """
	        Hook invoked by ``Enum`` when ``value`` matches no member.

	        Raises:
	            ValueError: always, naming the rejected value, the enum class and
	                the list of values that would have been accepted.
	        """
	        accepted = str(list(cls._value2member_map_.keys()))
	        message = "%r is not a valid %s, please select one of %s" % (value, cls.__name__, accepted)
	        raise ValueError(message)


	class TruncationStrategy(ExplicitEnum):
	    """
	    Closed set of truncation strategy identifiers.

	    Members are looked up by their string value, e.g.
	    ``TruncationStrategy("only_first")``; an unknown string raises the
	    ``ValueError`` built by the base class.
	    """

	    ONLY_FIRST = "only_first"
	    ONLY_SECOND = "only_second"
	    LONGEST_FIRST = "longest_first"
	    DO_NOT_TRUNCATE = "do_not_truncate"


	class PaddingStrategy(ExplicitEnum):
	    """
	    Closed set of padding strategy identifiers.

	    Members are looked up by their string value, e.g.
	    ``PaddingStrategy("max_length")``; an unknown string raises the
	    ``ValueError`` built by the base class.
	    """

	    LONGEST = "longest"
	    MAX_LENGTH = "max_length"
	    DO_NOT_PAD = "do_not_pad"


	class TensorType(ExplicitEnum):
	    """
	    Short codes naming the array framework a batch can be converted to.

	    ``BatchEncoding.convert_to_tensors`` accepts either a member or its
	    string value (``"pt"``, ``"tf"``, ``"np"``, ``"jax"``).
	    """

	    PYTORCH = "pt"
	    TENSORFLOW = "tf"
	    NUMPY = "np"
	    JAX = "jax"


	class CharSpan(NamedTuple):
	    """Pair of integer positions delimiting a run of characters."""

	    # Position where the span starts.
	    start: int
	    # Position where the span ends.
	    end: int


	class TokenSpan(NamedTuple):
	    """Pair of integer positions delimiting a run of tokens."""

	    # Position where the span starts.
	    start: int
	    # Position where the span ends.
	    end: int


	def to_py_obj(obj):
	    """
	    Recursively turn tensors, arrays and containers into plain Python objects.

	    Mappings (``dict`` / ``BatchEncoding``) become dicts and lists/tuples become
	    lists, with every element converted in turn. TensorFlow tensors, PyTorch
	    tensors and NumPy arrays become nested lists. Anything else is returned
	    unchanged.
	    """
	    if isinstance(obj, (dict, BatchEncoding)):
	        return {key: to_py_obj(value) for key, value in obj.items()}
	    if isinstance(obj, (list, tuple)):
	        return list(map(to_py_obj, obj))
	    # The framework checks run only when the framework is reported as available.
	    if is_tf_available() and _is_tensorflow(obj):
	        return obj.numpy().tolist()
	    if is_torch_available() and _is_torch(obj):
	        return obj.detach().cpu().tolist()
	    if isinstance(obj, np.ndarray):
	        return obj.tolist()
	    return obj


	def _is_torch(x):
	import torch
	return isinstance(x, torch.Tensor)


	def _is_torch_device(x):
	import torch
	return isinstance(x, torch.device)


	def _is_end_of_word(text):
	    """
	    Return True when the last character of ``text`` is a control, punctuation
	    or whitespace character.

	    ``text`` must be non-empty; an empty string raises ``IndexError``.
	    """
	    last_char = text[-1]
	    # The three helpers return plain booleans, so `or` is equivalent to a bitwise OR.
	    return bool(_is_control(last_char) or _is_punctuation(last_char) or _is_whitespace(last_char))


	def _is_start_of_word(text):
	    """
	    Return True when the first character of ``text`` is a control, punctuation
	    or whitespace character.

	    ``text`` must be non-empty; an empty string raises ``IndexError``.
	    """
	    first_char = text[0]
	    # The three helpers return plain booleans, so `or` is equivalent to a bitwise OR.
	    return bool(_is_control(first_char) or _is_punctuation(first_char) or _is_whitespace(first_char))


	def _is_punctuation(char):
	cp = ord(char)
	# We treat all non-letter/number ASCII as punctuation.
	# Characters such as "^", "$", and "`" are not in the Unicode
	# Punctuation class but we treat them as punctuation anyways, for
	# consistency.
	if (cp >= 33 and cp <= 47) or (cp >= 58 and cp <= 64) or (cp >= 91 and cp <= 96) or (cp >= 123 and cp <= 126):
	return True
	cat = unicodedata.category(char)
	if cat.startswith("P"):
	return True
	return False


	def _is_whitespace(char):
	# \t, \n, and \r are technically control characters but we treat them
	# as whitespace since they are generally considered as such.
	if char == " " or char == "\t" or char == "\n" or char == "\r":
	return True
	cat = unicodedata.category(char)
	if cat == "Zs":
	return True
	return False


	def _is_control(char):
	# These are technically control characters but we count them as whitespace
	# characters.
	if char == "\t" or char == "\n" or char == "\r":
	return False
	cat = unicodedata.category(char)
	if cat.startswith("C"):
	return True
	return False


	def load_vocab(vocab_file):
	    """
	    Read a one-token-per-line vocabulary file into an ordered token -> index map.

	    The index is the zero-based line number. Only the trailing newline is
	    removed from each line, so other whitespace stays part of the token. When a
	    token occurs more than once, the last line number wins.
	    """
	    vocab = collections.OrderedDict()
	    with open(vocab_file, "r", encoding="utf-8") as reader:
	        for line_number, line in enumerate(reader):
	            vocab[line.rstrip("\n")] = line_number
	    return vocab


	def whitespace_tokenize(text):
	    """Strip ``text`` and split it on runs of whitespace; blank input gives ``[]``."""
	    stripped = text.strip()
	    return stripped.split() if stripped else []


	class BatchEncoding(UserDict):
	    """
	    Dictionary of tokenizer outputs, optionally paired with backend encodings.

	    String keys address the stored data (also reachable as attributes); integer
	    keys address the backend encodings when they were supplied. The alignment
	    helpers (``tokens``, ``word_ids``, ``token_to_word``, ...) need those
	    encodings and raise ``ValueError`` without them.

	    Args:
	        data: Mapping stored as the dictionary content.
	        encoding: One backend encoding or a sequence of them.
	        tensor_type: When given, values are converted by ``convert_to_tensors``.
	        prepend_batch_axis: Forwarded to ``convert_to_tensors``.
	        n_sequences: Number of input sequences; read from the first encoding
	            when omitted and encodings are present.
	    """

	    def __init__(
	        self,
	        data: Optional[Dict[str, Any]] = None,
	        encoding: Optional[Union[EncodingFast, Sequence[EncodingFast]]] = None,
	        tensor_type: Union[None, str, TensorType] = None,
	        prepend_batch_axis: bool = False,
	        n_sequences: Optional[int] = None,
	    ):
	        super().__init__(data)

	        # A single backend encoding is normalised to a one-element list.
	        if isinstance(encoding, EncodingFast):
	            encoding = [encoding]

	        self._encodings = encoding

	        if n_sequences is None and encoding is not None and len(encoding):
	            n_sequences = encoding[0].n_sequences

	        self._n_sequences = n_sequences

	        self.convert_to_tensors(tensor_type=tensor_type, prepend_batch_axis=prepend_batch_axis)

	    @property
	    def n_sequences(self) -> Optional[int]:
	        """Number of sequences behind this batch, or None when unknown."""
	        return self._n_sequences

	    @property
	    def is_fast(self) -> bool:
	        """True when backend encodings were attached to this batch."""
	        return self._encodings is not None

	    def __getitem__(self, item: Union[int, str]) -> Union[Any, EncodingFast]:
	        """
	        Return the stored value for a string key, or the backend encoding at an
	        integer batch index.

	        Raises:
	            KeyError: for a non-string key when no backend encodings exist.
	        """
	        if isinstance(item, str):
	            return self.data[item]
	        elif self._encodings is not None:
	            return self._encodings[item]
	        else:
	            raise KeyError(
	                "Indexing with integers (to access backend Encoding for a given batch index) "
	                "is not available when using Python based tokenizers"
	            )

	    def __getattr__(self, item: str):
	        """Expose stored keys as attributes; unknown names raise AttributeError."""
	        # `data` is absent on an instance that was created without __init__
	        # (e.g. through __new__); reading self.data below would then re-enter
	        # this method without end.
	        if item == "data":
	            raise AttributeError(item)
	        try:
	            return self.data[item]
	        except KeyError:
	            raise AttributeError(item) from None

	    def __getstate__(self):
	        """Return the picklable state: data, backend encodings and sequence count."""
	        return {"data": self.data, "encodings": self._encodings, "n_sequences": self._n_sequences}

	    def __setstate__(self, state):
	        """Restore from ``__getstate__`` output; states lacking "n_sequences" are accepted."""
	        if "data" in state:
	            self.data = state["data"]

	        if "encodings" in state:
	            self._encodings = state["encodings"]

	        self._n_sequences = state.get("n_sequences")

	    def keys(self):
	        return self.data.keys()

	    def values(self):
	        return self.data.values()

	    def items(self):
	        return self.data.items()

	    # After this point:
	    # Extended properties and methods only available for fast (Rust-based) tokenizers
	    # provided by HuggingFace tokenizers library.

	    @staticmethod
	    def _split_index(batch_or_item_index, item_index):
	        """Return ``(batch_index, item_index)``; a lone index targets batch entry 0."""
	        if item_index is not None:
	            return batch_or_item_index, item_index
	        return 0, batch_or_item_index

	    def _wrap_negative(self, batch_index, item_index):
	        """Turn negative indices into offsets from the end of the batch / token sequence."""
	        if batch_index < 0:
	            batch_index += len(self._encodings)
	        if item_index < 0:
	            # assumes the backend encoding exposes its token ids as `.ids` - TODO confirm
	            item_index += len(self._encodings[batch_index].ids)
	        return batch_index, item_index

	    @property
	    def encodings(self) -> Optional[List[EncodingFast]]:
	        """The backend encodings, or None when none were attached."""
	        return self._encodings

	    def tokens(self, batch_index: int = 0) -> List[str]:
	        """Return the ``tokens`` of the backend encoding at ``batch_index``."""
	        if not self._encodings:
	            raise ValueError("tokens() is not available when using Python-based tokenizers")
	        return self._encodings[batch_index].tokens

	    def sequence_ids(self, batch_index: int = 0) -> List[Optional[int]]:
	        """Return the ``sequence_ids`` of the backend encoding at ``batch_index``."""
	        if not self._encodings:
	            raise ValueError("sequence_ids() is not available when using Python-based tokenizers")
	        return self._encodings[batch_index].sequence_ids

	    def words(self, batch_index: int = 0) -> List[Optional[int]]:
	        """Alias of ``word_ids``."""
	        if not self._encodings:
	            raise ValueError("words() is not available when using Python-based tokenizers")
	        return self.word_ids(batch_index)

	    def word_ids(self, batch_index: int = 0) -> List[Optional[int]]:
	        """Return the ``word_ids`` of the backend encoding at ``batch_index``."""
	        if not self._encodings:
	            raise ValueError("word_ids() is not available when using Python-based tokenizers")
	        return self._encodings[batch_index].word_ids

	    def token_to_sequence(self, batch_or_token_index: int, token_index: Optional[int] = None) -> int:
	        """
	        Delegate to the backend ``token_to_sequence`` for one token.

	        Call as ``(token_index)`` for batch entry 0 or ``(batch_index, token_index)``.
	        Negative indices count from the end.
	        """
	        if not self._encodings:
	            raise ValueError("token_to_sequence() is not available when using Python based tokenizers")
	        batch_index, token_index = self._split_index(batch_or_token_index, token_index)
	        batch_index, token_index = self._wrap_negative(batch_index, token_index)
	        return self._encodings[batch_index].token_to_sequence(token_index)

	    def token_to_word(self, batch_or_token_index: int, token_index: Optional[int] = None) -> int:
	        """
	        Delegate to the backend ``token_to_word`` for one token.

	        Call as ``(token_index)`` for batch entry 0 or ``(batch_index, token_index)``.
	        Negative indices count from the end.
	        """
	        if not self._encodings:
	            raise ValueError("token_to_word() is not available when using Python based tokenizers")
	        batch_index, token_index = self._split_index(batch_or_token_index, token_index)
	        batch_index, token_index = self._wrap_negative(batch_index, token_index)
	        return self._encodings[batch_index].token_to_word(token_index)

	    def word_to_tokens(
	        self, batch_or_word_index: int, word_index: Optional[int] = None, sequence_index: int = 0
	    ) -> Optional[TokenSpan]:
	        """
	        Delegate to the backend ``word_to_tokens`` and wrap the result in a TokenSpan.

	        Call as ``(word_index)`` for batch entry 0 or ``(batch_index, word_index)``.
	        Returns None when the backend reports no span.
	        """
	        if not self._encodings:
	            raise ValueError("word_to_tokens() is not available when using Python based tokenizers")
	        batch_index, word_index = self._split_index(batch_or_word_index, word_index)
	        # NOTE(review): as in the original formula, a negative word index is
	        # offset by the token sequence length, not by a word count - confirm intent.
	        batch_index, word_index = self._wrap_negative(batch_index, word_index)
	        span = self._encodings[batch_index].word_to_tokens(word_index, sequence_index)
	        return TokenSpan(*span) if span is not None else None

	    def token_to_chars(self, batch_or_token_index: int, token_index: Optional[int] = None) -> CharSpan:
	        """
	        Delegate to the backend ``token_to_chars`` and wrap the result in a CharSpan.

	        Call as ``(token_index)`` for batch entry 0 or ``(batch_index, token_index)``.
	        """
	        if not self._encodings:
	            raise ValueError("token_to_chars() is not available when using Python based tokenizers")
	        batch_index, token_index = self._split_index(batch_or_token_index, token_index)
	        return CharSpan(*(self._encodings[batch_index].token_to_chars(token_index)))

	    def char_to_token(
	        self, batch_or_char_index: int, char_index: Optional[int] = None, sequence_index: int = 0
	    ) -> int:
	        """
	        Delegate to the backend ``char_to_token`` for one character.

	        Call as ``(char_index)`` for batch entry 0 or ``(batch_index, char_index)``.
	        """
	        if not self._encodings:
	            raise ValueError("char_to_token() is not available when using Python based tokenizers")
	        batch_index, char_index = self._split_index(batch_or_char_index, char_index)
	        return self._encodings[batch_index].char_to_token(char_index, sequence_index)

	    def word_to_chars(
	        self, batch_or_word_index: int, word_index: Optional[int] = None, sequence_index: int = 0
	    ) -> CharSpan:
	        """
	        Delegate to the backend ``word_to_chars`` and wrap the result in a CharSpan.

	        Call as ``(word_index)`` for batch entry 0 or ``(batch_index, word_index)``.
	        """
	        if not self._encodings:
	            raise ValueError("word_to_chars() is not available when using Python based tokenizers")
	        batch_index, word_index = self._split_index(batch_or_word_index, word_index)
	        return CharSpan(*(self._encodings[batch_index].word_to_chars(word_index, sequence_index)))

	    def char_to_word(self, batch_or_char_index: int, char_index: Optional[int] = None, sequence_index: int = 0) -> int:
	        """
	        Delegate to the backend ``char_to_word`` for one character.

	        Call as ``(char_index)`` for batch entry 0 or ``(batch_index, char_index)``.
	        """
	        if not self._encodings:
	            raise ValueError("char_to_word() is not available when using Python based tokenizers")
	        batch_index, char_index = self._split_index(batch_or_char_index, char_index)
	        return self._encodings[batch_index].char_to_word(char_index, sequence_index)

	    def convert_to_tensors(
	        self, tensor_type: Optional[Union[str, TensorType]] = None, prepend_batch_axis: bool = False
	    ):
	        """
	        Convert every stored value, in place, to the requested framework's arrays.

	        Args:
	            tensor_type: A ``TensorType`` member or its string value; None leaves
	                the batch untouched.
	            prepend_batch_axis: Wrap each value in a one-element list before
	                conversion.

	        Returns:
	            ``self``.

	        Raises:
	            ImportError: when the requested framework is not installed.
	            ValueError: when a value cannot be converted.
	        """
	        if tensor_type is None:
	            return self

	        if not isinstance(tensor_type, TensorType):
	            tensor_type = TensorType(tensor_type)

	        # Pick the constructor and the "already converted" predicate for the framework.
	        if tensor_type == TensorType.TENSORFLOW:
	            if not is_tf_available():
	                raise ImportError(
	                    "Unable to convert output to TensorFlow tensors format, TensorFlow is not installed."
	                )
	            import tensorflow as tf

	            as_tensor = tf.constant
	            is_tensor = tf.is_tensor
	        elif tensor_type == TensorType.PYTORCH:
	            if not is_torch_available():
	                raise ImportError("Unable to convert output to PyTorch tensors format, PyTorch is not installed.")
	            import torch

	            as_tensor = torch.tensor
	            is_tensor = torch.is_tensor
	        elif tensor_type == TensorType.JAX:
	            # Availability is probed by importing, so no helper beyond this block is needed.
	            try:
	                import jax.numpy as jnp
	            except ImportError as err:
	                raise ImportError(
	                    "Unable to convert output to JAX tensors format, JAX is not installed."
	                ) from err

	            as_tensor = jnp.array

	            def is_tensor(candidate):
	                return isinstance(candidate, jnp.ndarray)

	        else:
	            as_tensor = np.asarray

	            def is_tensor(candidate):
	                return isinstance(candidate, np.ndarray)

	        for key, value in self.items():
	            try:
	                if prepend_batch_axis:
	                    value = [value]

	                # Only values that are not already tensors are replaced, so a
	                # result from an earlier key can never be stored under this one.
	                if not is_tensor(value):
	                    self[key] = as_tensor(value)
	            except Exception as err:
	                if key == "overflowing_tokens":
	                    raise ValueError(
	                        "Unable to create tensor returning overflowing tokens of different lengths. "
	                        "Please see if a fast version of this tokenizer is available to have this feature available."
	                    ) from err
	                raise ValueError(
	                    "Unable to create tensor, you should probably activate truncation and/or padding "
	                    "with 'padding=True' 'truncation=True' to have batched tensors with the same length."
	                ) from err

	        return self

	    def to(self, device: Union[str, "torch.device"]) -> "BatchEncoding":
	        """
	        Call ``.to(device=device)`` on every stored value and return ``self``.

	        Any other kind of argument is ignored.
	        """
	        # This check catches things like APEX blindly calling "to" on all inputs to a module
	        # Otherwise it passes the casts down and casts the LongTensor containing the token idxs
	        # into a HalfTensor
	        if isinstance(device, str) or _is_torch_device(device) or isinstance(device, int):
	            self.data = {k: v.to(device=device) for k, v in self.data.items()}
	        return self


	class SpecialTokensMixin:
	    """
	    Storage and accessors for a tokenizer's special tokens.

	    Each name in ``SPECIAL_TOKENS_ATTRIBUTES`` is kept in a private ``_<name>``
	    slot and exposed through a property returning its ``str`` form, with a
	    matching ``<name>_id`` property (``additional_special_tokens_ids`` for the
	    list) that converts between tokens and ids. The conversions
	    (``convert_tokens_to_ids`` / ``convert_ids_to_tokens``) and ``_add_tokens``
	    are not defined here and must come from the concrete tokenizer class.

	    Args:
	        verbose: Stored as ``self.verbose``.
	        **kwargs: Special-token attributes to set; other keys and None values
	            are ignored.
	    """

	    SPECIAL_TOKENS_ATTRIBUTES = [
	        "bos_token",
	        "eos_token",
	        "unk_token",
	        "sep_token",
	        "pad_token",
	        "cls_token",
	        "mask_token",
	        "additional_special_tokens",
	    ]

	    def __init__(self, verbose=True, **kwargs):
	        self._bos_token = None
	        self._eos_token = None
	        self._unk_token = None
	        self._sep_token = None
	        self._pad_token = None
	        self._cls_token = None
	        self._mask_token = None
	        self._pad_token_type_id = 0
	        self._additional_special_tokens = []
	        self.verbose = verbose

	        # We directly set the hidden value to allow initialization with special tokens
	        # which are not yet in the vocabulary. Necessary for serialization/de-serialization
	        # TODO clean this up at some point (probably by switching to fast tokenizers)
	        for key, value in kwargs.items():
	            if value is None or key not in self.SPECIAL_TOKENS_ATTRIBUTES:
	                continue
	            if key == "additional_special_tokens":
	                assert isinstance(value, (list, tuple)), f"Value {value} is not a list or tuple"
	                assert all(isinstance(t, str) for t in value), "One of the tokens is not a string"
	            elif not isinstance(value, (str, AddedToken)):
	                raise TypeError(
	                    "special token {} has to be either str or AddedToken but got: {}".format(key, type(value))
	                )
	            setattr(self, key, value)

	    def sanitize_special_tokens(self) -> int:
	        """Pass every special token to ``add_tokens`` as special; return its result."""
	        return self.add_tokens(self.all_special_tokens_extended, special_tokens=True)

	    def add_special_tokens(self, special_tokens_dict: Dict[str, Union[str, AddedToken]]) -> int:
	        """
	        Set special-token attributes and register their tokens.

	        Args:
	            special_tokens_dict: Maps names from ``SPECIAL_TOKENS_ATTRIBUTES`` to a
	                token, or to a list/tuple of tokens for
	                ``"additional_special_tokens"``.

	        Returns:
	            Sum of the counts returned by ``add_tokens``; 0 for an empty mapping.

	        Raises:
	            AssertionError: for an unknown key or a value of the wrong type. The
	                offending attribute is left unchanged.
	        """
	        if not special_tokens_dict:
	            return 0

	        added_tokens = 0
	        for key, value in special_tokens_dict.items():
	            assert key in self.SPECIAL_TOKENS_ATTRIBUTES, f"Key {key} is not a special token"

	            # Validate first, so a rejected value is never stored on the instance.
	            if key == "additional_special_tokens":
	                assert isinstance(value, (list, tuple)) and all(
	                    isinstance(t, (str, AddedToken)) for t in value
	                ), f"Tokens {value} for key {key} should all be str or AddedToken instances"
	                tokens_to_register = value
	            else:
	                assert isinstance(
	                    value, (str, AddedToken)
	                ), f"Token {value} for key {key} should be a str or an AddedToken instance"
	                tokens_to_register = [value]

	            setattr(self, key, value)
	            added_tokens += self.add_tokens(tokens_to_register, special_tokens=True)

	        return added_tokens

	    def add_tokens(
	        self, new_tokens: Union[str, AddedToken, List[Union[str, AddedToken]]], special_tokens: bool = False
	    ) -> int:
	        """
	        Normalise ``new_tokens`` to a list and hand it to ``_add_tokens``.

	        Returns 0 without calling ``_add_tokens`` when ``new_tokens`` is empty.
	        """
	        if not new_tokens:
	            return 0

	        if not isinstance(new_tokens, (list, tuple)):
	            new_tokens = [new_tokens]

	        return self._add_tokens(new_tokens, special_tokens=special_tokens)

	    def _add_tokens(self, new_tokens: Union[List[str], List[AddedToken]], special_tokens: bool = False) -> int:
	        """Hook for concrete tokenizers; this base version always raises."""
	        raise NotImplementedError

	    @staticmethod
	    def _as_str(token):
	        """Render a stored token as ``str``; an unset token stays None."""
	        return None if token is None else str(token)

	    def _token_for_id(self, token_id):
	        """Map an id back to its token; None stays None."""
	        # assumes the concrete tokenizer provides convert_ids_to_tokens next to
	        # convert_tokens_to_ids - TODO confirm
	        return None if token_id is None else self.convert_ids_to_tokens(token_id)

	    @property
	    def bos_token(self) -> Optional[str]:
	        """The ``bos`` token as ``str``, or None when unset."""
	        return self._as_str(self._bos_token)

	    @property
	    def eos_token(self) -> Optional[str]:
	        """The ``eos`` token as ``str``, or None when unset."""
	        return self._as_str(self._eos_token)

	    @property
	    def unk_token(self) -> Optional[str]:
	        """The ``unk`` token as ``str``, or None when unset."""
	        return self._as_str(self._unk_token)

	    @property
	    def sep_token(self) -> Optional[str]:
	        """The ``sep`` token as ``str``, or None when unset."""
	        return self._as_str(self._sep_token)

	    @property
	    def pad_token(self) -> Optional[str]:
	        """The ``pad`` token as ``str``, or None when unset."""
	        return self._as_str(self._pad_token)

	    @property
	    def cls_token(self) -> Optional[str]:
	        """The ``cls`` token as ``str``, or None when unset."""
	        return self._as_str(self._cls_token)

	    @property
	    def mask_token(self) -> Optional[str]:
	        """The ``mask`` token as ``str``, or None when unset."""
	        return self._as_str(self._mask_token)

	    @property
	    def additional_special_tokens(self) -> Optional[List[str]]:
	        """The additional special tokens as strings, or None when the slot is None."""
	        if self._additional_special_tokens is None:
	            return None
	        return [str(tok) for tok in self._additional_special_tokens]

	    @bos_token.setter
	    def bos_token(self, value):
	        self._bos_token = value

	    @eos_token.setter
	    def eos_token(self, value):
	        self._eos_token = value

	    @unk_token.setter
	    def unk_token(self, value):
	        self._unk_token = value

	    @sep_token.setter
	    def sep_token(self, value):
	        self._sep_token = value

	    @pad_token.setter
	    def pad_token(self, value):
	        self._pad_token = value

	    @cls_token.setter
	    def cls_token(self, value):
	        self._cls_token = value

	    @mask_token.setter
	    def mask_token(self, value):
	        self._mask_token = value

	    @additional_special_tokens.setter
	    def additional_special_tokens(self, value):
	        self._additional_special_tokens = value

	    @property
	    def bos_token_id(self) -> Optional[int]:
	        """Id of the ``bos`` token, or None when unset."""
	        if self._bos_token is None:
	            return None
	        return self.convert_tokens_to_ids(self.bos_token)

	    @property
	    def eos_token_id(self) -> Optional[int]:
	        """Id of the ``eos`` token, or None when unset."""
	        if self._eos_token is None:
	            return None
	        return self.convert_tokens_to_ids(self.eos_token)

	    @property
	    def unk_token_id(self) -> Optional[int]:
	        """Id of the ``unk`` token, or None when unset."""
	        if self._unk_token is None:
	            return None
	        return self.convert_tokens_to_ids(self.unk_token)

	    @property
	    def sep_token_id(self) -> Optional[int]:
	        """Id of the ``sep`` token, or None when unset."""
	        if self._sep_token is None:
	            return None
	        return self.convert_tokens_to_ids(self.sep_token)

	    @property
	    def pad_token_id(self) -> Optional[int]:
	        """Id of the ``pad`` token, or None when unset."""
	        if self._pad_token is None:
	            return None
	        return self.convert_tokens_to_ids(self.pad_token)

	    @property
	    def pad_token_type_id(self) -> int:
	        """The stored padding token-type id (0 unless changed on the instance)."""
	        return self._pad_token_type_id

	    @property
	    def cls_token_id(self) -> Optional[int]:
	        """Id of the ``cls`` token, or None when unset."""
	        if self._cls_token is None:
	            return None
	        return self.convert_tokens_to_ids(self.cls_token)

	    @property
	    def mask_token_id(self) -> Optional[int]:
	        """Id of the ``mask`` token, or None when unset."""
	        if self._mask_token is None:
	            return None
	        return self.convert_tokens_to_ids(self.mask_token)

	    @property
	    def additional_special_tokens_ids(self) -> List[int]:
	        """Ids of the additional special tokens."""
	        return self.convert_tokens_to_ids(self.additional_special_tokens)

	    # The id setters store the TOKEN that corresponds to the given id, so the
	    # string getters above keep returning tokens rather than stringified ids.

	    @bos_token_id.setter
	    def bos_token_id(self, value):
	        self._bos_token = self._token_for_id(value)

	    @eos_token_id.setter
	    def eos_token_id(self, value):
	        self._eos_token = self._token_for_id(value)

	    @unk_token_id.setter
	    def unk_token_id(self, value):
	        self._unk_token = self._token_for_id(value)

	    @sep_token_id.setter
	    def sep_token_id(self, value):
	        self._sep_token = self._token_for_id(value)

	    @pad_token_id.setter
	    def pad_token_id(self, value):
	        self._pad_token = self._token_for_id(value)

	    @cls_token_id.setter
	    def cls_token_id(self, value):
	        self._cls_token = self._token_for_id(value)

	    @mask_token_id.setter
	    def mask_token_id(self, value):
	        self._mask_token = self._token_for_id(value)

	    @additional_special_tokens_ids.setter
	    def additional_special_tokens_ids(self, values):
	        self._additional_special_tokens = [self._token_for_id(value) for value in values]

	    @property
	    def special_tokens_map(self) -> Dict[str, Union[str, List[str]]]:
	        """
	        Map each set special-token attribute to its string form.

	        List-valued attributes become lists of strings; unset or empty
	        attributes are left out.
	        """
	        set_attr = {}
	        for attr in self.SPECIAL_TOKENS_ATTRIBUTES:
	            attr_value = getattr(self, "_" + attr)
	            if not attr_value:
	                continue
	            if isinstance(attr_value, (list, tuple)):
	                # Stringify element by element rather than the container itself.
	                set_attr[attr] = [str(token) for token in attr_value]
	            else:
	                set_attr[attr] = str(attr_value)
	        return set_attr

	    @property
	    def special_tokens_map_extended(self) -> Dict[str, Union[str, AddedToken, List[Union[str, AddedToken]]]]:
	        """Like ``special_tokens_map`` but with the stored objects left unconverted."""
	        set_attr = {}
	        for attr in self.SPECIAL_TOKENS_ATTRIBUTES:
	            attr_value = getattr(self, "_" + attr)
	            if attr_value:
	                set_attr[attr] = attr_value
	        return set_attr

	    @property
	    def all_special_tokens(self) -> List[str]:
	        """Every set special token as ``str``, de-duplicated, in attribute order."""
	        return [str(s) for s in self.all_special_tokens_extended]

	    @property
	    def all_special_tokens_extended(self) -> List[Union[str, AddedToken]]:
	        """Every set special token as stored, de-duplicated, in attribute order."""
	        all_toks = []
	        for attr_value in self.special_tokens_map_extended.values():
	            if isinstance(attr_value, (list, tuple)):
	                all_toks.extend(attr_value)
	            else:
	                all_toks.append(attr_value)
	        # OrderedDict.fromkeys drops duplicates while keeping first-seen order.
	        return list(OrderedDict.fromkeys(all_toks))

	    @property
	    def all_special_ids(self) -> List[int]:
	        """Ids of ``all_special_tokens``, as returned by ``convert_tokens_to_ids``."""
	        return self.convert_tokens_to_ids(self.all_special_tokens)


	class PreTrainedTokenizerBase(SpecialTokensMixin):
	# Class-level tables read by `from_pretrained`; all empty in this base class.
	# `vocab_files_names` maps an init argument name to a vocabulary file name,
	# `pretrained_vocab_files_map` maps that argument name to {model name: location},
	# `pretrained_init_configuration` holds per-model init kwargs and
	# `max_model_input_sizes` holds per-model length limits (its keys are also the
	# list of model names treated as known).
	vocab_files_names: Dict[str, str] = {}
	pretrained_vocab_files_map: Dict[str, Dict[str, str]] = {}
	pretrained_init_configuration: Dict[str, Dict[str, Any]] = {}
	max_model_input_sizes: Dict[str, Optional[int]] = {}

	# first name has to correspond to main model input name
	# to make sure `tokenizer.pad(...)` works correctly
	model_input_names: List[str] = ["input_ids", "token_type_ids", "attention_mask"]
	# Default padding side; `__init__` accepts "right" or "left".
	padding_side: str = "right"
	# When set, `_from_pretrained` may first build an instance of this class and pass
	# it on as the `__slow_tokenizer` init kwarg.
	slow_tokenizer_class = None

	def __init__(self, **kwargs):
	    """
	    Record the init arguments and the generic tokenizer settings.

	    ``name_or_path``, ``model_max_length`` (or legacy ``max_len``),
	    ``padding_side`` and ``model_input_names`` are consumed here; the remaining
	    keyword arguments are forwarded to the parent initialiser.
	    """
	    # inputs and kwargs for saving and re-loading (see ``from_pretrained`` and ``save_pretrained``)
	    self.init_inputs = ()
	    self.init_kwargs = copy.deepcopy(kwargs)
	    self.name_or_path = kwargs.pop("name_or_path", "")

	    # For backward compatibility `max_len` is the fallback for `model_max_length`;
	    # it is removed from kwargs whether or not it ends up being used.
	    legacy_max_len = kwargs.pop("max_len", None)
	    requested_max_length = kwargs.pop("model_max_length", legacy_max_len)
	    if requested_max_length is None:
	        requested_max_length = VERY_LARGE_INTEGER
	    self.model_max_length = requested_max_length

	    # Padding side is right by default and overridden in subclasses. If specified in the kwargs, it is changed.
	    self.padding_side = kwargs.pop("padding_side", self.padding_side)
	    assert self.padding_side in [
	        "right",
	        "left",
	    ], f"Padding side should be selected between 'right' and 'left', current value: {self.padding_side}"
	    self.model_input_names = kwargs.pop("model_input_names", self.model_input_names)

	    # Use to store when we have already noticed a deprecation warning (avoid overlogging).
	    self.deprecation_warnings = {}

	    super().__init__(**kwargs)

	@property
	def max_len_single_sentence(self) -> int:
	    """``model_max_length`` minus the special tokens added to a single sequence."""
	    length_limit = self.model_max_length
	    return length_limit - self.num_special_tokens_to_add(pair=False)

	@property
	def max_len_sentences_pair(self) -> int:
	    """``model_max_length`` minus the special tokens added to a pair of sequences."""
	    length_limit = self.model_max_length
	    return length_limit - self.num_special_tokens_to_add(pair=True)

	@max_len_single_sentence.setter
	def max_len_single_sentence(self, value) -> int:
	    """
	    Deprecated setter kept for backward compatibility.

	    Assigning the value the property already computes is tolerated (and noted
	    in ``deprecation_warnings``) when ``verbose`` is true; any other assignment
	    raises ``ValueError``.
	    """
	    computed = self.model_max_length - self.num_special_tokens_to_add(pair=False)
	    if value != computed or not self.verbose:
	        raise ValueError(
	            "Setting 'max_len_single_sentence' is now deprecated. " "This value is automatically set up."
	        )
	    self.deprecation_warnings["max_len_single_sentence"] = True

	@max_len_sentences_pair.setter
	def max_len_sentences_pair(self, value) -> int:
	    """
	    Deprecated setter kept for backward compatibility.

	    Assigning the value the property already computes is tolerated (and noted
	    in ``deprecation_warnings``) when ``verbose`` is true; any other assignment
	    raises ``ValueError``.
	    """
	    computed = self.model_max_length - self.num_special_tokens_to_add(pair=True)
	    if value != computed or not self.verbose:
	        raise ValueError(
	            "Setting 'max_len_sentences_pair' is now deprecated. " "This value is automatically set up."
	        )
	    self.deprecation_warnings["max_len_sentences_pair"] = True

	def __repr__(self) -> str:
	    """Summarise the tokenizer: class label, source, vocabulary size, limits and special tokens."""
	    class_label = "PreTrainedTokenizerFast" if self.is_fast else "PreTrainedTokenizer"
	    head = f"{class_label}(name_or_path='{self.name_or_path}', "
	    sizes = f"vocab_size={self.vocab_size}, model_max_len={self.model_max_length}, is_fast={self.is_fast}, "
	    tail = f"padding_side='{self.padding_side}', special_tokens={self.special_tokens_map_extended})"
	    return head + sizes + tail

	def get_vocab(self) -> Dict[str, int]:
	    """Return the token -> id vocabulary; this base version always raises."""
	    raise NotImplementedError()

	@classmethod
	def from_pretrained(cls, pretrained_model_name_or_path: Union[str, os.PathLike], *init_inputs, **kwargs):
	    """
	    Locate the tokenizer files for a model and build a tokenizer from them.

	    Args:
	        pretrained_model_name_or_path: A model name listed in
	            ``cls.max_model_input_sizes``, a single vocabulary file or URL, a
	            local directory, or any other identifier (resolved through
	            ``hf_bucket_url``).
	        *init_inputs: Positional arguments forwarded to ``_from_pretrained``.
	        **kwargs: ``cache_dir``, ``force_download``, ``resume_download``,
	            ``proxies``, ``local_files_only``, ``use_auth_token``, ``revision``
	            and ``subfolder`` are consumed here; everything else is forwarded
	            to ``_from_pretrained``.

	    Returns:
	        Whatever ``cls._from_pretrained`` returns.

	    Raises:
	        ValueError: when a single file/URL is given to a class that needs
	            several vocabulary files.
	        EnvironmentError: when none of the candidate files could be resolved.
	    """
	    cache_dir = kwargs.pop("cache_dir", None)
	    force_download = kwargs.pop("force_download", False)
	    resume_download = kwargs.pop("resume_download", False)
	    proxies = kwargs.pop("proxies", None)
	    local_files_only = kwargs.pop("local_files_only", False)
	    use_auth_token = kwargs.pop("use_auth_token", None)
	    revision = kwargs.pop("revision", None)
	    subfolder = kwargs.pop("subfolder", None)

	    s3_models = list(cls.max_model_input_sizes.keys())
	    pretrained_model_name_or_path = str(pretrained_model_name_or_path)
	    vocab_files = {}
	    init_configuration = {}
	    if pretrained_model_name_or_path in s3_models:
	        # Known model name: the locations come from the class-level tables.
	        for file_id, map_list in cls.pretrained_vocab_files_map.items():
	            vocab_files[file_id] = map_list[pretrained_model_name_or_path]
	        if (
	            cls.pretrained_init_configuration
	            and pretrained_model_name_or_path in cls.pretrained_init_configuration
	        ):
	            init_configuration = cls.pretrained_init_configuration[pretrained_model_name_or_path].copy()
	    elif os.path.isfile(pretrained_model_name_or_path) or is_remote_url(pretrained_model_name_or_path):
	        # A single file or URL can only stand in for a single vocabulary file.
	        if len(cls.vocab_files_names) > 1:
	            raise ValueError(
	                "Calling {}.from_pretrained() with the path to a single file or url is not supported. "
	                "Use a model identifier or the path to a directory instead.".format(cls.__name__)
	            )
	        file_id = list(cls.vocab_files_names.keys())[0]
	        vocab_files[file_id] = pretrained_model_name_or_path
	    else:
	        # At this point pretrained_model_name_or_path is either a directory or a model identifier name
	        additional_files_names = {
	            "added_tokens_file": ADDED_TOKENS_FILE,
	            "special_tokens_map_file": SPECIAL_TOKENS_MAP_FILE,
	            "tokenizer_config_file": TOKENIZER_CONFIG_FILE,
	            "tokenizer_file": FULL_TOKENIZER_FILE,
	        }
	        is_local_dir = os.path.isdir(pretrained_model_name_or_path)
	        for file_id, file_name in {**cls.vocab_files_names, **additional_files_names}.items():
	            if is_local_dir:
	                if subfolder is not None:
	                    full_file_name = os.path.join(pretrained_model_name_or_path, subfolder, file_name)
	                else:
	                    full_file_name = os.path.join(pretrained_model_name_or_path, file_name)
	                # A file missing from the directory is recorded as None.
	                if not os.path.exists(full_file_name):
	                    full_file_name = None
	            else:
	                full_file_name = hf_bucket_url(
	                    pretrained_model_name_or_path,
	                    filename=file_name,
	                    subfolder=subfolder,
	                    revision=revision,
	                    mirror=None,
	                )

	            vocab_files[file_id] = full_file_name

	    # Get files from url, cache, or disk depending on the case
	    resolved_vocab_files = {}
	    for file_id, file_path in vocab_files.items():
	        if file_path is None:
	            resolved_vocab_files[file_id] = None
	            continue
	        try:
	            resolved_vocab_files[file_id] = cached_path(
	                file_path,
	                cache_dir=cache_dir,
	                force_download=force_download,
	                proxies=proxies,
	                resume_download=resume_download,
	                local_files_only=local_files_only,
	                use_auth_token=use_auth_token,
	            )
	        except FileNotFoundError:
	            # With local_files_only a missing file is tolerated and simply left
	            # out of the resolved mapping; otherwise the error propagates.
	            if not local_files_only:
	                raise
	        except requests.exceptions.HTTPError as err:
	            # A 404 means this optional file does not exist for the model.
	            if "404 Client Error" not in str(err):
	                raise
	            resolved_vocab_files[file_id] = None

	    if all(full_file_name is None for full_file_name in resolved_vocab_files.values()):
	        msg = (
	            f"Can't load tokenizer for '{pretrained_model_name_or_path}'. Make sure that:\n\n"
	            f"- '{pretrained_model_name_or_path}' is a correct model identifier listed on 'https://huggingface.co/models'\n\n"
	            f"- or '{pretrained_model_name_or_path}' is the correct path to a directory containing relevant tokenizer files\n\n"
	        )
	        raise EnvironmentError(msg)

	    return cls._from_pretrained(
	        resolved_vocab_files, pretrained_model_name_or_path, init_configuration, *init_inputs, **kwargs
	    )

	@classmethod
	def _from_pretrained(
	    cls, resolved_vocab_files, pretrained_model_name_or_path, init_configuration, *init_inputs, **kwargs
	):
	    """
	    Instantiate the tokenizer from already-resolved files.

	    Args:
	        resolved_vocab_files: Maps file ids to local paths (or None). The
	            ``tokenizer_config_file``, ``added_tokens_file`` and
	            ``special_tokens_map_file`` entries are popped from it.
	        pretrained_model_name_or_path: Stored as ``name_or_path`` and used to
	            look up ``cls.max_model_input_sizes``.
	        init_configuration: Default init kwargs, used when no tokenizer config
	            file is present. Not modified.
	        *init_inputs: Positional init arguments; when empty, the ones saved in
	            the tokenizer config file are used.
	        **kwargs: Init kwargs that override the loaded configuration.

	    Returns:
	        The constructed tokenizer.

	    Raises:
	        OSError: when constructing the tokenizer raises ``OSError``.
	        AssertionError: when the saved added tokens are not consecutive.
	    """
	    # We instantiate fast tokenizers based on a slow tokenizer if we don't have access to the tokenizer.json
	    # file or if `from_slow` is set to True.
	    from_slow = kwargs.get("from_slow", False)
	    has_tokenizer_file = resolved_vocab_files.get("tokenizer_file", None) is not None
	    if (from_slow or not has_tokenizer_file) and cls.slow_tokenizer_class is not None:
	        slow_tokenizer = (cls.slow_tokenizer_class)._from_pretrained(
	            copy.deepcopy(resolved_vocab_files),
	            pretrained_model_name_or_path,
	            copy.deepcopy(init_configuration),
	            *init_inputs,
	            **(copy.deepcopy(kwargs)),
	        )
	    else:
	        slow_tokenizer = None

	    # Prepare tokenizer initialization kwargs: prefer a saved tokenizer config.
	    tokenizer_config_file = resolved_vocab_files.pop("tokenizer_config_file", None)
	    if tokenizer_config_file is not None:
	        with open(tokenizer_config_file, encoding="utf-8") as tokenizer_config_handle:
	            init_kwargs = json.load(tokenizer_config_handle)
	        saved_init_inputs = init_kwargs.pop("init_inputs", ())
	        if not init_inputs:
	            init_inputs = saved_init_inputs
	    else:
	        # Work on a copy so the caller's configuration is left untouched.
	        init_kwargs = dict(init_configuration)

	    # Update with newly provided kwargs
	    init_kwargs.update(kwargs)

	    # Convert AddedTokens serialized as dict to class instances
	    def convert_added_tokens(obj: Union[AddedToken, Any]):
	        if isinstance(obj, dict) and obj.get("__type") == "AddedToken":
	            # Build from a filtered copy instead of popping from the input dict.
	            return AddedToken(**{k: v for k, v in obj.items() if k != "__type"})
	        elif isinstance(obj, (list, tuple)):
	            return [convert_added_tokens(o) for o in obj]
	        elif isinstance(obj, dict):
	            return {k: convert_added_tokens(v) for k, v in obj.items()}
	        return obj

	    init_kwargs = convert_added_tokens(init_kwargs)

	    # Set max length if needed
	    if pretrained_model_name_or_path in cls.max_model_input_sizes:
	        # if we're using a pretrained model, ensure the tokenizer
	        # wont index sequences longer than the number of positional embeddings
	        model_max_length = cls.max_model_input_sizes[pretrained_model_name_or_path]
	        if model_max_length is not None and isinstance(model_max_length, (int, float)):
	            init_kwargs["model_max_length"] = min(init_kwargs.get("model_max_length", int(1e30)), model_max_length)

	    # Merge resolved_vocab_files arguments in init_kwargs.
	    added_tokens_file = resolved_vocab_files.pop("added_tokens_file", None)
	    for args_name, file_path in resolved_vocab_files.items():
	        if args_name not in init_kwargs:
	            init_kwargs[args_name] = file_path

	    if slow_tokenizer is not None:
	        init_kwargs["__slow_tokenizer"] = slow_tokenizer

	    init_kwargs["name_or_path"] = pretrained_model_name_or_path

	    # Instantiate tokenizer.
	    try:
	        tokenizer = cls(*init_inputs, **init_kwargs)
	    except OSError as err:
	        raise OSError(
	            "Unable to load vocabulary from file. "
	            "Please check that the provided vocabulary is accessible and not corrupted."
	        ) from err

	    # If there is a complementary special token map, load it
	    special_tokens_map_file = resolved_vocab_files.pop("special_tokens_map_file", None)
	    if special_tokens_map_file is not None:
	        with open(special_tokens_map_file, encoding="utf-8") as special_tokens_map_handle:
	            special_tokens_map = json.load(special_tokens_map_handle)
	        for key, value in special_tokens_map.items():
	            if isinstance(value, dict):
	                value = AddedToken(**value)
	            elif isinstance(value, list):
	                value = [AddedToken(**token) if isinstance(token, dict) else token for token in value]
	            setattr(tokenizer, key, value)

	    # Add supplementary tokens.
	    special_tokens = set(tokenizer.all_special_tokens)
	    if added_tokens_file is not None:
	        with open(added_tokens_file, encoding="utf-8") as added_tokens_handle:
	            added_tok_encoder = json.load(added_tokens_handle)

	        # Added tokens are replayed in index order; each must land on the next free index.
	        for token, index in sorted(added_tok_encoder.items(), key=lambda item: item[1]):
	            assert index == len(tokenizer), (
	                f"Non-consecutive added token '{token}' found. "
	                f"Should have index {len(tokenizer)} but has index {index} in saved vocabulary."
	            )
	            tokenizer.add_tokens(token, special_tokens=bool(token in special_tokens))

	    # Check all our special tokens are registered as "no split" token (we don't cut them) and are in the vocab
	    tokenizer.sanitize_special_tokens()

	    return tokenizer

	def save_pretrained(
	    self,
	    save_directory: Union[str, os.PathLike],
	    legacy_format: bool = True,
	    filename_prefix: Optional[str] = None,
	) -> Tuple[str]:
	    """
	    Write the tokenizer config and the special-tokens map as JSON, then delegate to ``_save_pretrained``.

	    Args:
	        save_directory: Directory to write into; it is created when missing. If the path is an
	            existing *file*, nothing is written and ``None`` is returned.
	        legacy_format: Forwarded unchanged to ``_save_pretrained``.
	        filename_prefix: When truthy, ``"<prefix>-"`` is prepended to every file name.

	    Returns:
	        The value returned by ``_save_pretrained``.
	    """
	    if os.path.isfile(save_directory):
	        return
	    os.makedirs(save_directory, exist_ok=True)

	    name_prefix = filename_prefix + "-" if filename_prefix else ""
	    special_tokens_map_path = os.path.join(save_directory, name_prefix + SPECIAL_TOKENS_MAP_FILE)
	    tokenizer_config_path = os.path.join(save_directory, name_prefix + TOKENIZER_CONFIG_FILE)

	    def to_serializable(obj: Union[AddedToken, Any], add_type_field=True):
	        # Recursively turn AddedToken instances into plain dicts so json can dump them.
	        if isinstance(obj, AddedToken):
	            state = obj.__getstate__()
	            if add_type_field:
	                state["__type"] = "AddedToken"
	            return state
	        if isinstance(obj, (list, tuple)):
	            return [to_serializable(item, add_type_field=add_type_field) for item in obj]
	        if isinstance(obj, dict):
	            return {key: to_serializable(val, add_type_field=add_type_field) for key, val in obj.items()}
	        return obj

	    # Start from the constructor kwargs and drop the vocabulary-file entries.
	    config = copy.deepcopy(self.init_kwargs)
	    if len(self.init_inputs) > 0:
	        config["init_inputs"] = copy.deepcopy(self.init_inputs)
	    for vocab_key in self.vocab_files_names.keys():
	        config.pop(vocab_key, None)

	    # The "__type" marker lets a loader tell serialized AddedTokens apart from ordinary dict kwargs.
	    config = to_serializable(config, add_type_field=True)
	    with open(tokenizer_config_path, "w", encoding="utf-8") as handle:
	        handle.write(json.dumps(config, ensure_ascii=False))

	    # The special-tokens map is written without the "__type" marker.
	    special_tokens_payload = to_serializable(self.special_tokens_map_extended, add_type_field=False)
	    with open(special_tokens_map_path, "w", encoding="utf-8") as handle:
	        handle.write(json.dumps(special_tokens_payload, ensure_ascii=False))

	    written_so_far = (tokenizer_config_path, special_tokens_map_path)
	    return self._save_pretrained(
	        save_directory=save_directory,
	        file_names=written_so_far,
	        legacy_format=legacy_format,
	        filename_prefix=filename_prefix,
	    )

	def _save_pretrained(
	    self,
	    save_directory: Union[str, os.PathLike],
	    file_names: Tuple[str],
	    legacy_format: bool = True,
	    filename_prefix: Optional[str] = None,
	) -> Tuple[str]:
	    """
	    Write the added-tokens JSON (when there are added tokens) and the vocabulary files.

	    Args:
	        save_directory: Destination directory; converted with ``str()`` before use.
	        file_names: Paths already written by the caller; they lead the returned tuple.
	        legacy_format: Must be truthy, otherwise a ``ValueError`` is raised.
	        filename_prefix: When truthy, ``"<prefix>-"`` is prepended to the added-tokens file name
	            and the prefix is forwarded to ``save_vocabulary``.

	    Returns:
	        ``file_names`` + the tuple from ``save_vocabulary`` + the added-tokens path. The
	        added-tokens path is included even when no such file was written.

	    Raises:
	        ValueError: If ``legacy_format`` is falsy.
	    """
	    if not legacy_format:
	        raise ValueError(
	            "Only fast tokenizers (instances of PretrainedTokenizerFast) can be saved in non legacy format."
	        )

	    save_directory = str(save_directory)
	    name_prefix = filename_prefix + "-" if filename_prefix else ""
	    added_tokens_file = os.path.join(save_directory, name_prefix + ADDED_TOKENS_FILE)

	    extra_vocab = self.get_added_vocab()
	    if extra_vocab:
	        with open(added_tokens_file, "w", encoding="utf-8") as handle:
	            serialized = json.dumps(extra_vocab, ensure_ascii=False)
	            handle.write(serialized)

	    vocab_files = self.save_vocabulary(save_directory, filename_prefix=filename_prefix)

	    return file_names + vocab_files + (added_tokens_file,)

	def save_vocabulary(self, save_directory: str, filename_prefix: Optional[str] = None) -> Tuple[str]:
	    """
	    Placeholder for writing the vocabulary files; this implementation always raises.

	    ``_save_pretrained`` calls it and concatenates the returned tuple with the other saved paths.

	    Raises:
	        NotImplementedError: Always.
	    """
	    raise NotImplementedError

	def tokenize(self, text: str, pair: Optional[str] = None, add_special_tokens: bool = False, **kwargs) -> List[str]:
	    """
	    Placeholder for splitting text into a list of token strings; this implementation always raises.

	    Raises:
	        NotImplementedError: Always.
	    """
	    raise NotImplementedError

	def encode(
	    self,
	    text: Union[TextInput, PreTokenizedInput, EncodedInput],
	    text_pair: Optional[Union[TextInput, PreTokenizedInput, EncodedInput]] = None,
	    add_special_tokens: bool = True,
	    padding: Union[bool, str, PaddingStrategy] = False,
	    truncation: Union[bool, str, TruncationStrategy] = False,
	    max_length: Optional[int] = None,
	    stride: int = 0,
	    return_tensors: Optional[Union[str, TensorType]] = None,
	    **kwargs
	) -> List[int]:
	    """
	    Run ``encode_plus`` with the given options and return only its ``"input_ids"`` entry.

	    All arguments, including extra ``**kwargs``, are forwarded to ``encode_plus`` unchanged.
	    """
	    forwarded_options = dict(
	        text_pair=text_pair,
	        add_special_tokens=add_special_tokens,
	        padding=padding,
	        truncation=truncation,
	        max_length=max_length,
	        stride=stride,
	        return_tensors=return_tensors,
	    )
	    encoding = self.encode_plus(text, **forwarded_options, **kwargs)
	    return encoding["input_ids"]

	def num_special_tokens_to_add(self, pair: bool = False) -> int:
	    """
	    Placeholder for the number of special tokens added to an encoding; this implementation always raises.

	    ``prepare_for_model`` adds the returned count to the total length when ``add_special_tokens`` is set.

	    Raises:
	        NotImplementedError: Always.
	    """
	    raise NotImplementedError

	def _get_padding_truncation_strategies(
	    self, padding=False, truncation=False, max_length=None, pad_to_multiple_of=None, verbose=True, **kwargs
	):
	    """
	    Resolve the user-facing ``padding`` / ``truncation`` arguments into strategy enums.

	    The legacy keys ``truncation_strategy`` and ``pad_to_max_length`` are popped from ``kwargs`` and
	    only take effect when the corresponding new-style argument is ``False``.

	    Returns:
	        A 4-tuple ``(padding_strategy, truncation_strategy, max_length, kwargs)`` where ``kwargs`` no
	        longer contains the two legacy keys.

	    Raises:
	        ValueError: If padding is requested without a usable pad token, or if both truncation and
	            padding are active and ``max_length`` is not a multiple of ``pad_to_multiple_of``.
	    """
	    legacy_truncation = kwargs.pop("truncation_strategy", "do_not_truncate")
	    legacy_pad_to_max_length = kwargs.pop("pad_to_max_length", False)

	    # Backward compatibility: a bare ``max_length`` (no padding, no truncation) turns truncation on.
	    if max_length is not None and padding is False and truncation is False:
	        if verbose:
	            self.deprecation_warnings["Truncation-not-explicitly-activated"] = True
	        truncation = "longest_first"

	    # Padding strategy.
	    if padding is False:
	        if legacy_pad_to_max_length:
	            padding_strategy = PaddingStrategy.LONGEST if max_length is None else PaddingStrategy.MAX_LENGTH
	        else:
	            padding_strategy = PaddingStrategy.DO_NOT_PAD
	    elif padding is True:
	        # ``True`` means pad to the longest sequence in the batch.
	        padding_strategy = PaddingStrategy.LONGEST
	    elif isinstance(padding, PaddingStrategy):
	        padding_strategy = padding
	    else:
	        padding_strategy = PaddingStrategy(padding)

	    # Truncation strategy.
	    if truncation is False:
	        if legacy_truncation != "do_not_truncate":
	            truncation_strategy = TruncationStrategy(legacy_truncation)
	        else:
	            truncation_strategy = TruncationStrategy.DO_NOT_TRUNCATE
	    elif truncation is True:
	        # ``True`` means truncate the longest sequence first for pairs of inputs.
	        truncation_strategy = TruncationStrategy.LONGEST_FIRST
	    elif isinstance(truncation, TruncationStrategy):
	        truncation_strategy = truncation
	    else:
	        truncation_strategy = TruncationStrategy(truncation)

	    # Without an explicit ``max_length`` fall back to the model limit, or disable the strategy
	    # when the model limit is above ``LARGE_INTEGER``.
	    if max_length is None:
	        if padding_strategy == PaddingStrategy.MAX_LENGTH:
	            if self.model_max_length > LARGE_INTEGER:
	                if verbose:
	                    self.deprecation_warnings["Asking-to-pad-to-max_length"] = True
	                padding_strategy = PaddingStrategy.DO_NOT_PAD
	            else:
	                max_length = self.model_max_length

	        if truncation_strategy != TruncationStrategy.DO_NOT_TRUNCATE:
	            if self.model_max_length > LARGE_INTEGER:
	                if verbose:
	                    self.deprecation_warnings["Asking-to-truncate-to-max_length"] = True
	                truncation_strategy = TruncationStrategy.DO_NOT_TRUNCATE
	            else:
	                max_length = self.model_max_length

	    # Padding needs a pad token with a non-negative id.
	    wants_padding = padding_strategy != PaddingStrategy.DO_NOT_PAD
	    if wants_padding and (not self.pad_token or self.pad_token_id < 0):
	        raise ValueError(
	            "Asking to pad but the tokenizer does not have a padding token. "
	            "Please select a token to use as `pad_token` `(tokenizer.pad_token = tokenizer.eos_token e.g.)` "
	            "or add a new pad token via `tokenizer.add_special_tokens({'pad_token': '[PAD]'})`."
	        )

	    # When truncating and padding together, the truncation length must be a multiple of pad_to_multiple_of.
	    wants_truncation = truncation_strategy != TruncationStrategy.DO_NOT_TRUNCATE
	    if (
	        wants_truncation
	        and wants_padding
	        and pad_to_multiple_of is not None
	        and max_length is not None
	        and (max_length % pad_to_multiple_of != 0)
	    ):
	        raise ValueError(
	            f"Truncation and padding are both activated but "
	            f"truncation length ({max_length}) is not a multiple of pad_to_multiple_of ({pad_to_multiple_of})."
	        )

	    return padding_strategy, truncation_strategy, max_length, kwargs

	def __call__(
	    self,
	    text: Union[TextInput, PreTokenizedInput, List[TextInput], List[PreTokenizedInput]],
	    text_pair: Optional[Union[TextInput, PreTokenizedInput, List[TextInput], List[PreTokenizedInput]]] = None,
	    add_special_tokens: bool = True,
	    padding: Union[bool, str, PaddingStrategy] = False,
	    truncation: Union[bool, str, TruncationStrategy] = False,
	    max_length: Optional[int] = None,
	    stride: int = 0,
	    is_split_into_words: bool = False,
	    pad_to_multiple_of: Optional[int] = None,
	    return_tensors: Optional[Union[str, TensorType]] = None,
	    return_token_type_ids: Optional[bool] = None,
	    return_attention_mask: Optional[bool] = None,
	    return_overflowing_tokens: bool = False,
	    return_special_tokens_mask: bool = False,
	    return_offsets_mapping: bool = False,
	    return_length: bool = False,
	    verbose: bool = True,
	    **kwargs
	) -> BatchEncoding:
	    """
	    Validate the input types, then dispatch to ``batch_encode_plus`` or ``encode_plus``.

	    The input counts as a batch when ``text`` is a list/tuple and either ``is_split_into_words`` is
	    false, or ``text`` is non-empty and its first element is itself a list/tuple. For a batch with
	    ``text_pair`` given, the two are zipped into a list of pairs. All other options are forwarded.

	    Raises:
	        AssertionError: If ``text`` or ``text_pair`` is not a ``str``, a list/tuple of ``str``, or a
	            list/tuple of lists/tuples of ``str`` (only the first element of each level is inspected).
	    """

	    def _looks_like_text_input(candidate):
	        # Accept str, [] / (), [str, ...], or [[str, ...], ...]; only the first element is checked.
	        if isinstance(candidate, str):
	            return True
	        if not isinstance(candidate, (list, tuple)):
	            return False
	        if len(candidate) == 0:
	            return True
	        head = candidate[0]
	        if isinstance(head, str):
	            return True
	        if isinstance(head, (list, tuple)):
	            return len(head) == 0 or isinstance(head[0], str)
	        return False

	    # Input type checking for clearer error
	    assert _looks_like_text_input(text), (
	        "text input must of type `str` (single example), `List[str]` (batch or single pretokenized example) "
	        "or `List[List[str]]` (batch of pretokenized examples)."
	    )
	    assert text_pair is None or _looks_like_text_input(text_pair), (
	        "text_pair input must of type `str` (single example), `List[str]` (batch or single pretokenized example) "
	        "or `List[List[str]]` (batch of pretokenized examples)."
	    )

	    if not isinstance(text, (list, tuple)):
	        is_batched = False
	    elif is_split_into_words:
	        is_batched = bool(text and isinstance(text[0], (list, tuple)))
	    else:
	        is_batched = True

	    shared_options = dict(
	        add_special_tokens=add_special_tokens,
	        padding=padding,
	        truncation=truncation,
	        max_length=max_length,
	        stride=stride,
	        is_split_into_words=is_split_into_words,
	        pad_to_multiple_of=pad_to_multiple_of,
	        return_tensors=return_tensors,
	        return_token_type_ids=return_token_type_ids,
	        return_attention_mask=return_attention_mask,
	        return_overflowing_tokens=return_overflowing_tokens,
	        return_special_tokens_mask=return_special_tokens_mask,
	        return_offsets_mapping=return_offsets_mapping,
	        return_length=return_length,
	        verbose=verbose,
	    )

	    if not is_batched:
	        return self.encode_plus(text=text, text_pair=text_pair, **shared_options, **kwargs)

	    if text_pair is not None:
	        batch_text_or_text_pairs = list(zip(text, text_pair))
	    else:
	        batch_text_or_text_pairs = text
	    return self.batch_encode_plus(
	        batch_text_or_text_pairs=batch_text_or_text_pairs, **shared_options, **kwargs
	    )

	def encode_plus(
	    self,
	    text: Union[TextInput, PreTokenizedInput, EncodedInput],
	    text_pair: Optional[Union[TextInput, PreTokenizedInput, EncodedInput]] = None,
	    add_special_tokens: bool = True,
	    padding: Union[bool, str, PaddingStrategy] = False,
	    truncation: Union[bool, str, TruncationStrategy] = False,
	    max_length: Optional[int] = None,
	    stride: int = 0,
	    is_split_into_words: bool = False,
	    pad_to_multiple_of: Optional[int] = None,
	    return_tensors: Optional[Union[str, TensorType]] = None,
	    return_token_type_ids: Optional[bool] = None,
	    return_attention_mask: Optional[bool] = None,
	    return_overflowing_tokens: bool = False,
	    return_special_tokens_mask: bool = False,
	    return_offsets_mapping: bool = False,
	    return_length: bool = False,
	    verbose: bool = True,
	    **kwargs
	) -> BatchEncoding:
	    """
	    Encode a single text (or text pair): resolve the padding/truncation strategies, then call ``_encode_plus``.

	    ``padding``, ``truncation`` and ``max_length`` (plus the legacy ``truncation_strategy`` /
	    ``pad_to_max_length`` kwargs) are resolved by ``_get_padding_truncation_strategies``; everything
	    else is forwarded as given.
	    """
	    # Backward compatibility for 'truncation_strategy', 'pad_to_max_length'
	    resolved = self._get_padding_truncation_strategies(
	        padding=padding,
	        truncation=truncation,
	        max_length=max_length,
	        pad_to_multiple_of=pad_to_multiple_of,
	        verbose=verbose,
	        **kwargs,
	    )
	    padding_strategy, truncation_strategy, max_length, kwargs = resolved

	    encode_options = dict(
	        add_special_tokens=add_special_tokens,
	        padding_strategy=padding_strategy,
	        truncation_strategy=truncation_strategy,
	        max_length=max_length,
	        stride=stride,
	        is_split_into_words=is_split_into_words,
	        pad_to_multiple_of=pad_to_multiple_of,
	        return_tensors=return_tensors,
	        return_token_type_ids=return_token_type_ids,
	        return_attention_mask=return_attention_mask,
	        return_overflowing_tokens=return_overflowing_tokens,
	        return_special_tokens_mask=return_special_tokens_mask,
	        return_offsets_mapping=return_offsets_mapping,
	        return_length=return_length,
	        verbose=verbose,
	    )
	    return self._encode_plus(text=text, text_pair=text_pair, **encode_options, **kwargs)

	def _encode_plus(
	    self,
	    text: Union[TextInput, PreTokenizedInput, EncodedInput],
	    text_pair: Optional[Union[TextInput, PreTokenizedInput, EncodedInput]] = None,
	    add_special_tokens: bool = True,
	    padding_strategy: PaddingStrategy = PaddingStrategy.DO_NOT_PAD,
	    truncation_strategy: TruncationStrategy = TruncationStrategy.DO_NOT_TRUNCATE,
	    max_length: Optional[int] = None,
	    stride: int = 0,
	    is_split_into_words: bool = False,
	    pad_to_multiple_of: Optional[int] = None,
	    return_tensors: Optional[Union[str, TensorType]] = None,
	    return_token_type_ids: Optional[bool] = None,
	    return_attention_mask: Optional[bool] = None,
	    return_overflowing_tokens: bool = False,
	    return_special_tokens_mask: bool = False,
	    return_offsets_mapping: bool = False,
	    return_length: bool = False,
	    verbose: bool = True,
	    **kwargs
	) -> BatchEncoding:
	    """
	    Placeholder for the single-example encoding backend called by ``encode_plus``.

	    It receives the already-resolved ``padding_strategy`` / ``truncation_strategy`` enums instead of
	    the user-facing ``padding`` / ``truncation`` arguments. This implementation always raises.

	    Raises:
	        NotImplementedError: Always.
	    """
	    raise NotImplementedError

	def batch_encode_plus(
	    self,
	    batch_text_or_text_pairs: Union[
	        List[TextInput],
	        List[TextInputPair],
	        List[PreTokenizedInput],
	        List[PreTokenizedInputPair],
	        List[EncodedInput],
	        List[EncodedInputPair],
	    ],
	    add_special_tokens: bool = True,
	    padding: Union[bool, str, PaddingStrategy] = False,
	    truncation: Union[bool, str, TruncationStrategy] = False,
	    max_length: Optional[int] = None,
	    stride: int = 0,
	    is_split_into_words: bool = False,
	    pad_to_multiple_of: Optional[int] = None,
	    return_tensors: Optional[Union[str, TensorType]] = None,
	    return_token_type_ids: Optional[bool] = None,
	    return_attention_mask: Optional[bool] = None,
	    return_overflowing_tokens: bool = False,
	    return_special_tokens_mask: bool = False,
	    return_offsets_mapping: bool = False,
	    return_length: bool = False,
	    verbose: bool = True,
	    **kwargs
	) -> BatchEncoding:
	    """
	    Encode a batch of texts (or text pairs): resolve the padding/truncation strategies, then call
	    ``_batch_encode_plus``.

	    ``padding``, ``truncation`` and ``max_length`` (plus the legacy ``truncation_strategy`` /
	    ``pad_to_max_length`` kwargs) are resolved by ``_get_padding_truncation_strategies``; everything
	    else is forwarded as given.
	    """
	    # Backward compatibility for 'truncation_strategy', 'pad_to_max_length'
	    resolved = self._get_padding_truncation_strategies(
	        padding=padding,
	        truncation=truncation,
	        max_length=max_length,
	        pad_to_multiple_of=pad_to_multiple_of,
	        verbose=verbose,
	        **kwargs,
	    )
	    padding_strategy, truncation_strategy, max_length, kwargs = resolved

	    encode_options = dict(
	        add_special_tokens=add_special_tokens,
	        padding_strategy=padding_strategy,
	        truncation_strategy=truncation_strategy,
	        max_length=max_length,
	        stride=stride,
	        is_split_into_words=is_split_into_words,
	        pad_to_multiple_of=pad_to_multiple_of,
	        return_tensors=return_tensors,
	        return_token_type_ids=return_token_type_ids,
	        return_attention_mask=return_attention_mask,
	        return_overflowing_tokens=return_overflowing_tokens,
	        return_special_tokens_mask=return_special_tokens_mask,
	        return_offsets_mapping=return_offsets_mapping,
	        return_length=return_length,
	        verbose=verbose,
	    )
	    return self._batch_encode_plus(
	        batch_text_or_text_pairs=batch_text_or_text_pairs, **encode_options, **kwargs
	    )

	def _batch_encode_plus(
	    self,
	    batch_text_or_text_pairs: Union[
	        List[TextInput],
	        List[TextInputPair],
	        List[PreTokenizedInput],
	        List[PreTokenizedInputPair],
	        List[EncodedInput],
	        List[EncodedInputPair],
	    ],
	    add_special_tokens: bool = True,
	    padding_strategy: PaddingStrategy = PaddingStrategy.DO_NOT_PAD,
	    truncation_strategy: TruncationStrategy = TruncationStrategy.DO_NOT_TRUNCATE,
	    max_length: Optional[int] = None,
	    stride: int = 0,
	    is_split_into_words: bool = False,
	    pad_to_multiple_of: Optional[int] = None,
	    return_tensors: Optional[Union[str, TensorType]] = None,
	    return_token_type_ids: Optional[bool] = None,
	    return_attention_mask: Optional[bool] = None,
	    return_overflowing_tokens: bool = False,
	    return_special_tokens_mask: bool = False,
	    return_offsets_mapping: bool = False,
	    return_length: bool = False,
	    verbose: bool = True,
	    **kwargs
	) -> BatchEncoding:
	    """
	    Placeholder for the batch encoding backend called by ``batch_encode_plus``.

	    It receives the already-resolved ``padding_strategy`` / ``truncation_strategy`` enums instead of
	    the user-facing ``padding`` / ``truncation`` arguments. This implementation always raises.

	    Raises:
	        NotImplementedError: Always.
	    """
	    raise NotImplementedError

	def pad(
	    self,
	    encoded_inputs: Union[
	        BatchEncoding,
	        List[BatchEncoding],
	        Dict[str, EncodedInput],
	        Dict[str, List[EncodedInput]],
	        List[Dict[str, EncodedInput]],
	    ],
	    padding: Union[bool, str, PaddingStrategy] = True,
	    max_length: Optional[int] = None,
	    pad_to_multiple_of: Optional[int] = None,
	    return_attention_mask: Optional[bool] = None,
	    return_tensors: Optional[Union[str, TensorType]] = None,
	    verbose: bool = True,
	) -> BatchEncoding:
	    """
	    Pad a single encoding or a batch of encodings, delegating the per-example work to ``_pad``.

	    Args:
	        encoded_inputs: A mapping (or a list/tuple of mappings) that must contain the key
	            ``self.model_input_names[0]``. A list of mappings is first converted into a mapping of lists.
	        padding: Padding selector, resolved through ``_get_padding_truncation_strategies``.
	        max_length: Target length; replaced by the longest example when the strategy is ``LONGEST``.
	        pad_to_multiple_of: Forwarded to ``_pad``.
	        return_attention_mask: Forwarded to ``_pad``.
	        return_tensors: Tensor type handed to ``BatchEncoding``. When ``None`` and the inputs are
	            TensorFlow / PyTorch / NumPy objects, it is set to ``"tf"`` / ``"pt"`` / ``"np"``.
	        verbose: Forwarded to ``_get_padding_truncation_strategies``.

	    Returns:
	        A ``BatchEncoding`` with the padded values. If the main input is empty, ``encoded_inputs``
	        itself is returned unchanged, apart from an empty ``"attention_mask"`` when requested.

	    Raises:
	        ValueError: If the main input key is missing, or the element type is not recognised.
	        AssertionError: If the entries of a batch do not all have the same batch size.
	    """
	    # If we have a list of dicts, let's convert it in a dict of lists
	    # We do this to allow using this method as a collate_fn function in PyTorch Dataloader
	    if isinstance(encoded_inputs, (list, tuple)) and isinstance(encoded_inputs[0], (dict, BatchEncoding)):
	        encoded_inputs = {key: [example[key] for example in encoded_inputs] for key in encoded_inputs[0].keys()}

	    # The model's main input name, usually `input_ids`, has to be passed for padding
	    if self.model_input_names[0] not in encoded_inputs:
	        # A separating space was missing here, which rendered as "...this methodthat includes...".
	        raise ValueError(
	            "You should supply an encoding or a list of encodings to this method "
	            f"that includes {self.model_input_names[0]}, but you provided {list(encoded_inputs.keys())}"
	        )

	    required_input = encoded_inputs[self.model_input_names[0]]

	    if not required_input:
	        if return_attention_mask:
	            encoded_inputs["attention_mask"] = []
	        return encoded_inputs

	    # If we have PyTorch/TF/NumPy tensors/arrays as inputs, we cast them as python objects
	    # and rebuild them afterwards if no return_tensors is specified
	    # Note that we lose the specific device the tensor may be on for PyTorch

	    first_element = required_input[0]
	    if isinstance(first_element, (list, tuple)):
	        # first_element might be an empty list/tuple in some edge cases, so grab the first element of
	        # the first non-empty sequence. Iterating (instead of an unbounded index scan) avoids an
	        # IndexError when every sequence in the batch is empty.
	        for sequence in required_input:
	            if len(sequence) != 0:
	                first_element = sequence[0]
	                break
	    # At this state, if `first_element` is still a list/tuple, it's an empty one so there is nothing to do.
	    if not isinstance(first_element, (int, list, tuple)):
	        if is_tf_available() and _is_tensorflow(first_element):
	            return_tensors = "tf" if return_tensors is None else return_tensors
	        elif is_torch_available() and _is_torch(first_element):
	            return_tensors = "pt" if return_tensors is None else return_tensors
	        elif isinstance(first_element, np.ndarray):
	            return_tensors = "np" if return_tensors is None else return_tensors
	        else:
	            raise ValueError(
	                f"type of {first_element} unknown: {type(first_element)}. "
	                f"Should be one of a python, numpy, pytorch or tensorflow object."
	            )

	        for key, value in encoded_inputs.items():
	            encoded_inputs[key] = to_py_obj(value)

	    # Convert padding_strategy in PaddingStrategy
	    padding_strategy, _, max_length, _ = self._get_padding_truncation_strategies(
	        padding=padding, max_length=max_length, verbose=verbose
	    )

	    # Re-read the main input: it may have been replaced by its converted form above.
	    required_input = encoded_inputs[self.model_input_names[0]]
	    if required_input and not isinstance(required_input[0], (list, tuple)):
	        # Single, un-batched example.
	        encoded_inputs = self._pad(
	            encoded_inputs,
	            max_length=max_length,
	            padding_strategy=padding_strategy,
	            pad_to_multiple_of=pad_to_multiple_of,
	            return_attention_mask=return_attention_mask,
	        )
	        return BatchEncoding(encoded_inputs, tensor_type=return_tensors)

	    batch_size = len(required_input)
	    assert all(
	        len(v) == batch_size for v in encoded_inputs.values()
	    ), "Some items in the output dictionary have a different batch size than others."

	    # LONGEST is resolved once for the whole batch so that every example gets the same target length.
	    if padding_strategy == PaddingStrategy.LONGEST:
	        max_length = max(len(inputs) for inputs in required_input)
	        padding_strategy = PaddingStrategy.MAX_LENGTH

	    batch_outputs = {}
	    for i in range(batch_size):
	        inputs = dict((k, v[i]) for k, v in encoded_inputs.items())
	        outputs = self._pad(
	            inputs,
	            max_length=max_length,
	            padding_strategy=padding_strategy,
	            pad_to_multiple_of=pad_to_multiple_of,
	            return_attention_mask=return_attention_mask,
	        )

	        for key, value in outputs.items():
	            if key not in batch_outputs:
	                batch_outputs[key] = []
	            batch_outputs[key].append(value)

	    return BatchEncoding(batch_outputs, tensor_type=return_tensors)

	def create_token_type_ids_from_sequences(
	    self, token_ids_0: List[int], token_ids_1: Optional[List[int]] = None
	) -> List[int]:
	    """
	    Build the token type ids for one or two sequences.

	    Returns:
	        One ``0`` per element of ``token_ids_0``, followed by one ``1`` per element of
	        ``token_ids_1`` when a second sequence is given.
	    """
	    segment_ids = [0] * len(token_ids_0)
	    if token_ids_1 is not None:
	        segment_ids = segment_ids + [1] * len(token_ids_1)
	    return segment_ids

	def build_inputs_with_special_tokens(
	    self, token_ids_0: List[int], token_ids_1: Optional[List[int]] = None
	) -> List[int]:
	    """
	    Combine one or two id sequences into a model input; this implementation adds no special tokens.

	    Returns:
	        ``token_ids_0`` itself (the same object) when there is no second sequence, otherwise the
	        concatenation ``token_ids_0 + token_ids_1``.
	    """
	    has_pair = token_ids_1 is not None
	    return token_ids_0 + token_ids_1 if has_pair else token_ids_0

	def prepare_for_model(
	    self,
	    ids: List[int],
	    pair_ids: Optional[List[int]] = None,
	    add_special_tokens: bool = True,
	    padding: Union[bool, str, PaddingStrategy] = False,
	    truncation: Union[bool, str, TruncationStrategy] = False,
	    max_length: Optional[int] = None,
	    stride: int = 0,
	    pad_to_multiple_of: Optional[int] = None,
	    return_tensors: Optional[Union[str, TensorType]] = None,
	    return_token_type_ids: Optional[bool] = None,
	    return_attention_mask: Optional[bool] = None,
	    return_overflowing_tokens: bool = False,
	    return_special_tokens_mask: bool = False,
	    return_offsets_mapping: bool = False,
	    return_length: bool = False,
	    verbose: bool = True,
	    prepend_batch_axis: bool = False,
	    **kwargs
	) -> BatchEncoding:
	    """
	    Turn one id sequence (or a pair) into a ``BatchEncoding``: truncate, add special tokens, pad.

	    Args:
	        ids: Ids of the first sequence.
	        pair_ids: Optional ids of the second sequence.
	        add_special_tokens: Whether to build the sequence with ``build_inputs_with_special_tokens``.
	        padding, truncation, max_length, pad_to_multiple_of, verbose: Resolved through
	            ``_get_padding_truncation_strategies`` (legacy kwargs are consumed there).
	        stride: Forwarded to ``truncate_sequences``.
	        return_tensors, prepend_batch_axis: Forwarded to ``BatchEncoding``.
	        return_token_type_ids, return_attention_mask: When ``None``, default to whether the
	            corresponding key is listed in ``self.model_input_names``.
	        return_overflowing_tokens: Adds ``"overflowing_tokens"`` and ``"num_truncated_tokens"``.
	        return_special_tokens_mask: Adds ``"special_tokens_mask"``.
	        return_offsets_mapping: Accepted but not used by this method.
	        return_length: Adds ``"length"`` (length of the final ``"input_ids"``).

	    Returns:
	        A ``BatchEncoding`` built from the assembled dictionary.

	    Raises:
	        ValueError: If ``return_token_type_ids`` is truthy while ``add_special_tokens`` is false.
	    """
	    # Backward compatibility for 'truncation_strategy', 'pad_to_max_length'
	    padding_strategy, truncation_strategy, max_length, kwargs = self._get_padding_truncation_strategies(
	        padding=padding,
	        truncation=truncation,
	        max_length=max_length,
	        pad_to_multiple_of=pad_to_multiple_of,
	        verbose=verbose,
	        **kwargs,
	    )

	    pair = bool(pair_ids is not None)
	    len_ids = len(ids)
	    len_pair_ids = len(pair_ids) if pair else 0

	    if return_token_type_ids and not add_special_tokens:
	        raise ValueError(
	            "Asking to return token_type_ids while setting add_special_tokens to False "
	            "results in an undefined behavior. Please set add_special_tokens to True or "
	            "set return_token_type_ids to None."
	        )

	    # Load from model defaults
	    if return_token_type_ids is None:
	        return_token_type_ids = "token_type_ids" in self.model_input_names
	    if return_attention_mask is None:
	        return_attention_mask = "attention_mask" in self.model_input_names

	    encoded_inputs = {}

	    # Compute the total size of the returned encodings
	    total_len = len_ids + len_pair_ids + (self.num_special_tokens_to_add(pair=pair) if add_special_tokens else 0)

	    # Truncation: Handle max sequence length
	    overflowing_tokens = []
	    if truncation_strategy != TruncationStrategy.DO_NOT_TRUNCATE and max_length and total_len > max_length:
	        ids, pair_ids, overflowing_tokens = self.truncate_sequences(
	            ids,
	            pair_ids=pair_ids,
	            num_tokens_to_remove=total_len - max_length,
	            truncation_strategy=truncation_strategy,
	            stride=stride,
	        )

	    if return_overflowing_tokens:
	        encoded_inputs["overflowing_tokens"] = overflowing_tokens
	        # ``max_length`` can still be None here (no padding/truncation requested); in that case
	        # nothing was truncated, so report 0 instead of failing on ``int - None``.
	        encoded_inputs["num_truncated_tokens"] = total_len - max_length if max_length is not None else 0

	    # Add special tokens
	    if add_special_tokens:
	        sequence = self.build_inputs_with_special_tokens(ids, pair_ids)
	        token_type_ids = self.create_token_type_ids_from_sequences(ids, pair_ids)
	    else:
	        sequence = ids + pair_ids if pair else ids
	        token_type_ids = [0] * len(ids) + ([0] * len(pair_ids) if pair else [])

	    # Build output dictionary
	    encoded_inputs["input_ids"] = sequence
	    if return_token_type_ids:
	        encoded_inputs["token_type_ids"] = token_type_ids
	    if return_special_tokens_mask:
	        if add_special_tokens:
	            encoded_inputs["special_tokens_mask"] = self.get_special_tokens_mask(ids, pair_ids)
	        else:
	            encoded_inputs["special_tokens_mask"] = [0] * len(sequence)

	    # Check lengths
	    self._eventual_warn_about_too_long_sequence(encoded_inputs["input_ids"], max_length, verbose)

	    # Padding (also taken when only the attention mask is requested, since ``pad`` builds it)
	    if padding_strategy != PaddingStrategy.DO_NOT_PAD or return_attention_mask:
	        encoded_inputs = self.pad(
	            encoded_inputs,
	            max_length=max_length,
	            padding=padding_strategy.value,
	            pad_to_multiple_of=pad_to_multiple_of,
	            return_attention_mask=return_attention_mask,
	        )

	    if return_length:
	        encoded_inputs["length"] = len(encoded_inputs["input_ids"])

	    batch_outputs = BatchEncoding(
	        encoded_inputs, tensor_type=return_tensors, prepend_batch_axis=prepend_batch_axis
	    )

	    return batch_outputs

	def truncate_sequences(
	    self,
	    ids: List[int],
	    pair_ids: Optional[List[int]] = None,
	    num_tokens_to_remove: int = 0,
	    truncation_strategy: Union[str, TruncationStrategy] = "longest_first",
	    stride: int = 0,
	) -> Tuple[List[int], List[int], List[int]]:
	    """
	    Remove ``num_tokens_to_remove`` tokens from the end of ``ids`` and/or ``pair_ids``.

	    Args:
	        ids: First sequence.
	        pair_ids: Optional second sequence.
	        num_tokens_to_remove: Number of tokens to drop; nothing happens when it is ``<= 0``.
	        truncation_strategy: A ``TruncationStrategy`` or its string value.
	            ``LONGEST_FIRST`` drops one token at a time from the first sequence while it is strictly
	            longer than the second (or there is no second), otherwise from the second.
	            ``ONLY_FIRST`` / ``ONLY_SECOND`` cut only that sequence, and leave it untouched when it is
	            not strictly longer than ``num_tokens_to_remove``.
	        stride: Widens the window of tokens copied into the overflow list.

	    Returns:
	        ``(ids, pair_ids, overflowing_tokens)``.
	    """
	    if num_tokens_to_remove <= 0:
	        return ids, pair_ids, []

	    strategy = truncation_strategy
	    if not isinstance(strategy, TruncationStrategy):
	        strategy = TruncationStrategy(strategy)

	    overflowing_tokens = []

	    if strategy == TruncationStrategy.LONGEST_FIRST:
	        for _ in range(num_tokens_to_remove):
	            trim_first = pair_ids is None or len(ids) > len(pair_ids)
	            source = ids if trim_first else pair_ids
	            # Only the very first removal copies a stride-sized window; later ones copy one token.
	            window_len = 1 if overflowing_tokens else min(len(source), stride + 1)
	            overflowing_tokens.extend(source[-window_len:])
	            if trim_first:
	                ids = source[:-1]
	            else:
	                pair_ids = source[:-1]
	    elif strategy == TruncationStrategy.ONLY_FIRST:
	        if len(ids) > num_tokens_to_remove:
	            window_len = min(len(ids), stride + num_tokens_to_remove)
	            overflowing_tokens = ids[-window_len:]
	            ids = ids[:-num_tokens_to_remove]
	    elif strategy == TruncationStrategy.ONLY_SECOND and pair_ids is not None:
	        if len(pair_ids) > num_tokens_to_remove:
	            window_len = min(len(pair_ids), stride + num_tokens_to_remove)
	            overflowing_tokens = pair_ids[-window_len:]
	            pair_ids = pair_ids[:-num_tokens_to_remove]

	    return (ids, pair_ids, overflowing_tokens)

	def _pad(
	    self,
	    encoded_inputs: Union[Dict[str, EncodedInput], BatchEncoding],
	    max_length: Optional[int] = None,
	    padding_strategy: PaddingStrategy = PaddingStrategy.DO_NOT_PAD,
	    pad_to_multiple_of: Optional[int] = None,
	    return_attention_mask: Optional[bool] = None,
	) -> dict:
	    """
	    Pad a single example in place and return it.

	    Args:
	        encoded_inputs: Mapping holding the main input under ``self.model_input_names[0]`` and
	            optionally ``"token_type_ids"`` / ``"special_tokens_mask"``.
	        max_length: Target length; with the ``LONGEST`` strategy it is replaced by the input length.
	            It is rounded up to the next multiple of ``pad_to_multiple_of`` when that is given.
	        padding_strategy: ``DO_NOT_PAD`` disables padding.
	        pad_to_multiple_of: Optional rounding granularity for ``max_length``.
	        return_attention_mask: When ``None``, defaults to whether ``"attention_mask"`` is listed in
	            ``self.model_input_names``.

	    Returns:
	        The same ``encoded_inputs`` object, padded on ``self.padding_side``.

	    Raises:
	        ValueError: If padding is needed and ``self.padding_side`` is neither ``"right"`` nor ``"left"``.
	    """
	    # Load from model defaults
	    if return_attention_mask is None:
	        return_attention_mask = "attention_mask" in self.model_input_names

	    main_key = self.model_input_names[0]
	    required_input = encoded_inputs[main_key]
	    current_len = len(required_input)

	    if padding_strategy == PaddingStrategy.LONGEST:
	        max_length = current_len

	    if max_length is not None and pad_to_multiple_of is not None and (max_length % pad_to_multiple_of != 0):
	        max_length = ((max_length // pad_to_multiple_of) + 1) * pad_to_multiple_of

	    needs_to_be_padded = padding_strategy != PaddingStrategy.DO_NOT_PAD and current_len != max_length

	    if not needs_to_be_padded:
	        # Nothing to pad: only make sure a requested attention mask exists.
	        if return_attention_mask and "attention_mask" not in encoded_inputs:
	            encoded_inputs["attention_mask"] = [1] * current_len
	        return encoded_inputs

	    difference = max_length - current_len
	    pad_on_right = self.padding_side == "right"
	    if not pad_on_right and self.padding_side != "left":
	        raise ValueError("Invalid padding strategy:" + str(self.padding_side))

	    def _with_padding(values, filler):
	        # Attach ``difference`` copies of ``filler`` on the configured side.
	        fill = [filler] * difference
	        return values + fill if pad_on_right else fill + values

	    if return_attention_mask:
	        encoded_inputs["attention_mask"] = _with_padding([1] * current_len, 0)
	    if "token_type_ids" in encoded_inputs:
	        encoded_inputs["token_type_ids"] = _with_padding(encoded_inputs["token_type_ids"], self.pad_token_type_id)
	    if "special_tokens_mask" in encoded_inputs:
	        encoded_inputs["special_tokens_mask"] = _with_padding(encoded_inputs["special_tokens_mask"], 1)
	    encoded_inputs[main_key] = _with_padding(required_input, self.pad_token_id)

	    return encoded_inputs

	def convert_tokens_to_string(self, tokens: List[str]) -> str:
	    """
	    Placeholder for joining a list of tokens into a single string; this implementation always raises.

	    Raises:
	        NotImplementedError: Always.
	    """
	    raise NotImplementedError

	def batch_decode(
	    self,
	    sequences: Union[List[int], List[List[int]], "np.ndarray", "torch.Tensor", "tf.Tensor"],
	    skip_special_tokens: bool = False,
	    clean_up_tokenization_spaces: bool = True,
	    **kwargs
	) -> List[str]:
	    """
	    Call ``decode`` on every element of ``sequences`` and collect the results in order.

	    ``skip_special_tokens``, ``clean_up_tokenization_spaces`` and any extra ``**kwargs`` are passed
	    to each ``decode`` call unchanged.
	    """
	    decoded = []
	    for sequence in sequences:
	        decoded.append(
	            self.decode(
	                sequence,
	                skip_special_tokens=skip_special_tokens,
	                clean_up_tokenization_spaces=clean_up_tokenization_spaces,
	                **kwargs,
	            )
	        )
	    return decoded

	def decode(
	    self,
	    token_ids: Union[int, List[int], "np.ndarray", "torch.Tensor", "tf.Tensor"],
	    skip_special_tokens: bool = False,
	    clean_up_tokenization_spaces: bool = True,
	    **kwargs
	) -> str:
	    """
	    Pass ``token_ids`` through ``to_py_obj`` and hand the result to ``_decode``.

	    ``skip_special_tokens``, ``clean_up_tokenization_spaces`` and any extra ``**kwargs`` are
	    forwarded to ``_decode`` unchanged.
	    """
	    # Convert inputs to python lists
	    python_ids = to_py_obj(token_ids)

	    decode_options = dict(
	        skip_special_tokens=skip_special_tokens,
	        clean_up_tokenization_spaces=clean_up_tokenization_spaces,
	    )
	    return self._decode(token_ids=python_ids, **decode_options, **kwargs)

	def _decode(
	    self,
	    token_ids: Union[int, List[int]],
	    skip_special_tokens: bool = False,
	    clean_up_tokenization_spaces: bool = True,
	    **kwargs
	) -> str:
	    """
	    Placeholder for the decoding backend called by ``decode``; this implementation always raises.

	    Raises:
	        NotImplementedError: Always.
	    """
	    raise NotImplementedError

	def get_special_tokens_mask(
	    self, token_ids_0: List[int], token_ids_1: Optional[List[int]] = None, already_has_special_tokens: bool = False
	) -> List[int]:
	    """
	    Mark which ids of an already-formatted sequence are special tokens.

	    Args:
	        token_ids_0: Ids to inspect.
	        token_ids_1: Must be ``None`` for this implementation.
	        already_has_special_tokens: Must be truthy for this implementation.

	    Returns:
	        One entry per id in ``token_ids_0``: ``1`` if the id is in ``self.all_special_ids``, else ``0``.

	    Raises:
	        AssertionError: If ``already_has_special_tokens`` is falsy or ``token_ids_1`` is given.
	    """
	    # The sentences of this message are now separated by a space; it used to read "argument.Or set".
	    assert already_has_special_tokens and token_ids_1 is None, (
	        "You cannot use ``already_has_special_tokens=False`` with this tokenizer. "
	        "Please use a slow (full python) tokenizer to activate this argument. "
	        "Or set `return_special_tokens_mask=True` when calling the encoding method "
	        "to get the special tokens mask in any tokenizer. "
	    )

	    all_special_ids = self.all_special_ids  # read the attribute once, not once per token

	    return [1 if token in all_special_ids else 0 for token in token_ids_0]

	@staticmethod
	def clean_up_tokenization(out_string: str) -> str:
	    """
	    Remove simple English tokenization artifacts: spaces before punctuation and before contractions.

	    Args:
	        out_string (:obj:`str`): The text to clean up.
	    Returns:
	        :obj:`str`: The cleaned-up string.
	    """
	    # Applied one after another, in this exact order.
	    substitutions = (
	        (" .", "."),
	        (" ?", "?"),
	        (" !", "!"),
	        (" ,", ","),
	        (" ' ", "'"),
	        (" n't", "n't"),
	        (" 'm", "'m"),
	        (" 's", "'s"),
	        (" 've", "'ve"),
	        (" 're", "'re"),
	    )
	    cleaned = out_string
	    for artifact, replacement in substitutions:
	        cleaned = cleaned.replace(artifact, replacement)
	    return cleaned

	def _eventual_warn_about_too_long_sequence(self, ids: List[int], max_length: Optional[int], verbose: bool):
	    """
	    Record a flag in ``self.deprecation_warnings`` when ``ids`` exceeds ``self.model_max_length``.

	    The flag is only set when no explicit ``max_length`` was given and ``verbose`` is truthy.
	    """
	    if max_length is not None:
	        return
	    exceeds_model_limit = len(ids) > self.model_max_length
	    if exceeds_model_limit and verbose:
	        self.deprecation_warnings["sequence-length-is-longer-than-the-specified-maximum"] = True

	@contextmanager
	def as_target_tokenizer(self):
	    """
	    Context manager that does nothing in this implementation: it yields once and changes no state.

	    ``prepare_seq2seq_batch`` encodes the target texts inside this context.
	    """
	    yield None

	def prepare_seq2seq_batch(
	    self,
	    src_texts: List[str],
	    tgt_texts: Optional[List[str]] = None,
	    max_length: Optional[int] = None,
	    max_target_length: Optional[int] = None,
	    padding: str = "longest",
	    return_tensors: str = None,
	    truncation: bool = True,
	    **kwargs,
	) -> BatchEncoding:
	    """
	    Encode source texts and, when given, target texts whose ``input_ids`` are stored as ``"labels"``.

	    Args:
	        src_texts: Source texts, encoded by calling the tokenizer itself.
	        tgt_texts: Optional target texts, encoded inside ``as_target_tokenizer()``.
	        max_length: Maximum source length; defaults to ``self.model_max_length``.
	        max_target_length: Maximum target length; defaults to the resolved ``max_length``.
	        padding, return_tensors, truncation: Forwarded to both encoding calls.
	        **kwargs: Forwarded to both encoding calls after ``src_lang`` / ``tgt_lang`` are dropped.

	    Returns:
	        The source encoding, with a ``"labels"`` entry added when ``tgt_texts`` is given.
	    """
	    # mBART-specific kwargs that should be ignored by other models.
	    for ignored_key in ("src_lang", "tgt_lang"):
	        kwargs.pop(ignored_key, None)

	    if max_length is None:
	        max_length = self.model_max_length

	    shared_options = dict(
	        add_special_tokens=True,
	        return_tensors=return_tensors,
	        padding=padding,
	        truncation=truncation,
	    )

	    model_inputs = self(src_texts, max_length=max_length, **shared_options, **kwargs)

	    if tgt_texts is not None:
	        # Process tgt_texts
	        if max_target_length is None:
	            max_target_length = max_length
	        with self.as_target_tokenizer():
	            labels = self(tgt_texts, max_length=max_target_length, **shared_options, **kwargs)
	        model_inputs["labels"] = labels["input_ids"]

	    return model_inputs


	class PreTrainedTokenizer(PreTrainedTokenizerBase):
	def __init__(self, **kwargs):
	    """
	    Initialise the containers that track tokens added on top of the base vocabulary.

	    All keyword arguments are passed to the parent constructor unchanged.
	    """
	    super().__init__(**kwargs)
	    # Added tokens are kept here for both slow and fast tokenizers
	    # until the serialization of fast tokenizers is updated.
	    self.unique_no_split_tokens: List[str] = []
	    self.added_tokens_decoder: Dict[int, str] = {}
	    self.added_tokens_encoder: Dict[str, int] = {}

	@property
	def is_fast(self) -> bool:
	    """:obj:`bool`: Always :obj:`False` for this class."""
	    return False

	@property
	def vocab_size(self) -> int:
	    """
	    :obj:`int`: Size of the base vocabulary (without the added tokens).

	    Not implemented here; accessing it on this class raises :obj:`NotImplementedError`.
	    """
	    raise NotImplementedError()

	def get_added_vocab(self) -> Dict[str, int]:
	    """
	    Return the token-to-index mapping of the added tokens.

	    The mapping returned is ``self.added_tokens_encoder`` itself, not a copy.

	    Returns:
	        :obj:`Dict[str, int]`: The added tokens.
	    """
	    added_vocab = self.added_tokens_encoder
	    return added_vocab

	def __len__(self):
	"""
	Size of the full vocabulary with the added tokens.
	"""
	return self.vocab_size + len(self.added_tokens_encoder)

	def _add_tokens(self, new_tokens: Union[List[str], List[AddedToken]], special_tokens: bool = False) -> int:
	    """
	    Register new tokens with the tokenizer.

	    Every token that is not yet known gets the next free index, counting from the current length of the
	    tokenizer. A token counts as unknown when it converts to the same id as ``unk_token``.

	    Args:
	        new_tokens (:obj:`List[str]` or :obj:`List[tokenizers.AddedToken]`):
	            Token(s) to add. Each one is first converted with :obj:`str`. Non-special tokens are lower-cased
	            when the tokenizer has a truthy ``do_lower_case`` attribute.
	        special_tokens (:obj:`bool`, `optional`, defaults to :obj:`False`):
	            Whether or not the tokens should be added as special tokens.

	    Returns:
	        :obj:`int`: The number of tokens actually added to the vocabulary.

	    Examples::
	        # Let's see how to increase the vocabulary of Bert model and tokenizer
	        tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
	        model = BertModel.from_pretrained('bert-base-uncased')
	        num_added_toks = tokenizer.add_tokens(['new_tok1', 'my_new-tok2'])
	        print('We have added', num_added_toks, 'tokens')
	        # Note: resize_token_embeddings expects the full size of the new vocabulary, i.e. len(tokenizer).
	        model.resize_token_embeddings(len(tokenizer))
	    """
	    new_tokens = [str(tok) for tok in new_tokens]

	    tokens_to_add = []
	    for token in new_tokens:
	        assert isinstance(token, str)
	        if not special_tokens and hasattr(self, "do_lower_case") and self.do_lower_case:
	            token = token.lower()
	        if token == self.unk_token:
	            continue
	        if self.convert_tokens_to_ids(token) != self.convert_tokens_to_ids(self.unk_token):
	            # The token already maps to a real id.
	            continue
	        if token in tokens_to_add:
	            continue
	        tokens_to_add.append(token)

	    added_tok_encoder = {tok: len(self) + offset for offset, tok in enumerate(tokens_to_add)}
	    added_tok_decoder = {index: tok for tok, index in added_tok_encoder.items()}
	    self.added_tokens_encoder.update(added_tok_encoder)
	    self.added_tokens_decoder.update(added_tok_decoder)

	    # Special tokens are never split, even when they were already in the vocabulary (e.g. for Albert);
	    # otherwise only the newly added tokens are protected.
	    protected = new_tokens if special_tokens else tokens_to_add
	    self.unique_no_split_tokens = sorted(set(self.unique_no_split_tokens).union(protected))

	    return len(tokens_to_add)

	def num_special_tokens_to_add(self, pair: bool = False) -> int:
	    """
	    Count the special tokens added when encoding a sequence (or a pair of sequences).

	    .. note::
	        The count is obtained by building the inputs for empty sequences and measuring the result, so this
	        is not efficient. Do not put this inside your training loop.

	    Args:
	        pair (:obj:`bool`, `optional`, defaults to :obj:`False`):
	            Whether to count for a sequence pair rather than a single sequence.

	    Returns:
	        :obj:`int`: Number of special tokens added to sequences.
	    """
	    second_sequence = [] if pair else None
	    with_specials = self.build_inputs_with_special_tokens([], second_sequence)
	    return len(with_specials)

	def tokenize(self, text: str, **kwargs) -> List[str]:
	    """
	    Converts a string in a sequence of tokens, using the tokenizer.

	    Split in words for word-based vocabulary or sub-words for sub-word-based vocabularies
	    (BPE/SentencePieces/WordPieces). Takes care of added tokens: the text is first cut around every entry of
	    ``self.unique_no_split_tokens`` and only the remaining pieces go through ``self._tokenize``.

	    When the tokenizer has a truthy ``do_lower_case`` attribute, everything except the special tokens is
	    lower-cased before splitting.

	    Args:
	        text (:obj:`str`):
	            The sequence to be encoded.
	        **kwargs (additional keyword arguments):
	            Passed along to the model-specific ``prepare_for_tokenization`` preprocessing method.

	    Returns:
	        :obj:`List[str]`: The list of tokens.
	    """
	    # Simple mapping string => AddedToken for special tokens with specific tokenization behaviors
	    all_special_tokens_extended = dict(
	        (str(t), t) for t in self.all_special_tokens_extended if isinstance(t, AddedToken)
	    )

	    text, kwargs = self.prepare_for_tokenization(text, **kwargs)

	    # TODO: should this be in the base class?
	    if hasattr(self, "do_lower_case") and self.do_lower_case:
	        # Lower-case everything except the special tokens. The alternatives must be joined with a plain
	        # "|": an escaped "\|" is a literal pipe and would make the pattern match nothing useful.
	        escaped_special_toks = [re.escape(s_tok) for s_tok in self.all_special_tokens]
	        if escaped_special_toks:
	            pattern = r"(" + r"|".join(escaped_special_toks) + r")|" + r"(.+?)"
	            text = re.sub(pattern, lambda m: m.groups()[0] or m.groups()[1].lower(), text)
	        else:
	            # Without special tokens the first group would be empty and always win the alternation,
	            # so lower-case character by character directly.
	            text = re.sub(r"(.+?)", lambda m: m.groups()[0].lower(), text)

	    def split_on_token(tok, text):
	        result = []
	        tok_extended = all_special_tokens_extended.get(tok, None)
	        split_text = text.split(tok)
	        full_word = ""
	        for i, sub_text in enumerate(split_text):
	            # AddedToken can control whitespace stripping around them.
	            # We use them for GPT2 and Roberta to have different behavior depending on the special token
	            # Cf. https://github.com/huggingface/transformers/pull/2778
	            # and https://github.com/huggingface/transformers/issues/3788
	            if isinstance(tok_extended, AddedToken):
	                if tok_extended.single_word:
	                    # Try to avoid splitting on token
	                    if (
	                        i < len(split_text) - 1
	                        and not _is_end_of_word(sub_text)
	                        and not _is_start_of_word(split_text[i + 1])
	                    ):
	                        # Don't extract the special token
	                        full_word += sub_text + tok
	                    elif full_word:
	                        full_word += sub_text
	                        result.append(full_word)
	                        full_word = ""
	                        continue
	                # Strip white spaces on the right
	                if tok_extended.rstrip and i > 0:
	                    # A bit counter-intuitive but we strip the left of the string
	                    # since tok_extended.rstrip means the special token is eating all white spaces on its right
	                    sub_text = sub_text.lstrip()
	                # Strip white spaces on the left
	                if tok_extended.lstrip and i < len(split_text) - 1:
	                    sub_text = sub_text.rstrip()  # Opposite here
	            else:
	                # We strip left and right by default
	                if i < len(split_text) - 1:
	                    sub_text = sub_text.rstrip()
	                if i > 0:
	                    sub_text = sub_text.lstrip()

	            if i == 0 and not sub_text:
	                result.append(tok)
	            elif i == len(split_text) - 1:
	                if sub_text:
	                    result.append(sub_text)
	                else:
	                    pass
	            else:
	                if sub_text:
	                    result.append(sub_text)
	                result.append(tok)
	        return result

	    def split_on_tokens(tok_list, text):
	        if not text.strip():
	            return []
	        if not tok_list:
	            return self._tokenize(text)

	        tokenized_text = []
	        text_list = [text]
	        for tok in tok_list:
	            tokenized_text = []
	            for sub_text in text_list:
	                if sub_text not in self.unique_no_split_tokens:
	                    tokenized_text.extend(split_on_token(tok, sub_text))
	                else:
	                    tokenized_text.append(sub_text)
	            text_list = tokenized_text

	        # Protected tokens are kept whole; every other piece goes through the model-specific tokenizer.
	        return list(
	            itertools.chain.from_iterable(
	                (
	                    self._tokenize(token) if token not in self.unique_no_split_tokens else [token]
	                    for token in tokenized_text
	                )
	            )
	        )

	    no_split_token = self.unique_no_split_tokens
	    tokenized_text = split_on_tokens(no_split_token, text)
	    return tokenized_text

	def _tokenize(self, text, **kwargs):
	"""
	Converts a string in a sequence of tokens (string), using the tokenizer. Split in words for word-based
	vocabulary or sub-words for sub-word-based vocabularies (BPE/SentencePieces/WordPieces).
	Do NOT take care of added tokens.
	"""
	raise NotImplementedError

	def convert_tokens_to_ids(self, tokens: Union[str, List[str]]) -> Union[int, List[int]]:
	    """
	    Map a token (or a sequence of tokens) to its id (or a list of ids), added tokens included.

	    Args:
	        tokens (:obj:`str` or :obj:`List[str]`): One or several token(s) to convert to token id(s).

	    Returns:
	        :obj:`int` or :obj:`List[int]`: The token id or list of token ids. :obj:`None` is returned
	        unchanged.
	    """
	    if tokens is None:
	        return None

	    lookup = self._convert_token_to_id_with_added_voc
	    if isinstance(tokens, str):
	        return lookup(tokens)
	    return [lookup(token) for token in tokens]

	def _convert_token_to_id_with_added_voc(self, token):
	if token is None:
	return None

	if token in self.added_tokens_encoder:
	return self.added_tokens_encoder[token]
	return self._convert_token_to_id(token)

	def _convert_token_to_id(self, token):
	raise NotImplementedError

	def _encode_plus(
	    self,
	    text: Union[TextInput, PreTokenizedInput, EncodedInput],
	    text_pair: Optional[Union[TextInput, PreTokenizedInput, EncodedInput]] = None,
	    add_special_tokens: bool = True,
	    padding_strategy: PaddingStrategy = PaddingStrategy.DO_NOT_PAD,
	    truncation_strategy: TruncationStrategy = TruncationStrategy.DO_NOT_TRUNCATE,
	    max_length: Optional[int] = None,
	    stride: int = 0,
	    is_split_into_words: bool = False,
	    pad_to_multiple_of: Optional[int] = None,
	    return_tensors: Optional[Union[str, TensorType]] = None,
	    return_token_type_ids: Optional[bool] = None,
	    return_attention_mask: Optional[bool] = None,
	    return_overflowing_tokens: bool = False,
	    return_special_tokens_mask: bool = False,
	    return_offsets_mapping: bool = False,
	    return_length: bool = False,
	    verbose: bool = True,
	    **kwargs
	) -> BatchEncoding:
	    """
	    Encode one sequence (or one pair of sequences) and hand the ids to ``prepare_for_model``.

	    Each input may be:

	    - a string, which is tokenized with ``self.tokenize`` and converted to ids;
	    - a non-empty list/tuple of strings, which is tokenized word by word when ``is_split_into_words`` is
	      set and otherwise treated as ready-made tokens;
	    - a non-empty list/tuple of integers, which is used as-is.

	    ``**kwargs`` are forwarded to ``self.tokenize``. The padding and truncation strategies are passed on by
	    their ``.value``, and a batch axis is requested (``prepend_batch_axis=True``).

	    Raises:
	        NotImplementedError: If ``return_offsets_mapping`` is requested.
	        ValueError: If an input has none of the accepted shapes.
	    """

	    def get_input_ids(text):
	        if isinstance(text, str):
	            tokens = self.tokenize(text, **kwargs)
	            return self.convert_tokens_to_ids(tokens)
	        elif isinstance(text, (list, tuple)) and len(text) > 0 and isinstance(text[0], str):
	            if is_split_into_words:
	                # Flatten the per-word token lists into one list of tokens. ``chain.from_iterable`` is
	                # needed here: ``chain(generator)`` would yield the per-word lists themselves.
	                tokens = list(
	                    itertools.chain.from_iterable(
	                        self.tokenize(word, is_split_into_words=True, **kwargs) for word in text
	                    )
	                )
	                return self.convert_tokens_to_ids(tokens)
	            else:
	                return self.convert_tokens_to_ids(text)
	        elif isinstance(text, (list, tuple)) and len(text) > 0 and isinstance(text[0], int):
	            return text
	        else:
	            if is_split_into_words:
	                raise ValueError(
	                    f"Input {text} is not valid. Should be a string or a list/tuple of strings when `is_split_into_words=True`."
	                )
	            else:
	                raise ValueError(
	                    f"Input {text} is not valid. Should be a string, a list/tuple of strings or a list/tuple of integers."
	                )

	    if return_offsets_mapping:
	        raise NotImplementedError(
	            "return_offset_mapping is not available when using Python tokenizers."
	            "To use this feature, change your tokenizer to one deriving from "
	            "transformers.PreTrainedTokenizerFast."
	            "More information on available tokenizers at "
	            "https://github.com/huggingface/transformers/pull/2674"
	        )

	    first_ids = get_input_ids(text)
	    second_ids = get_input_ids(text_pair) if text_pair is not None else None

	    return self.prepare_for_model(
	        first_ids,
	        pair_ids=second_ids,
	        add_special_tokens=add_special_tokens,
	        padding=padding_strategy.value,
	        truncation=truncation_strategy.value,
	        max_length=max_length,
	        stride=stride,
	        pad_to_multiple_of=pad_to_multiple_of,
	        return_tensors=return_tensors,
	        prepend_batch_axis=True,
	        return_attention_mask=return_attention_mask,
	        return_token_type_ids=return_token_type_ids,
	        return_overflowing_tokens=return_overflowing_tokens,
	        return_special_tokens_mask=return_special_tokens_mask,
	        return_length=return_length,
	        verbose=verbose,
	    )

	def _batch_encode_plus(
	    self,
	    batch_text_or_text_pairs: Union[
	        List[TextInput],
	        List[TextInputPair],
	        List[PreTokenizedInput],
	        List[PreTokenizedInputPair],
	        List[EncodedInput],
	        List[EncodedInputPair],
	    ],
	    add_special_tokens: bool = True,
	    padding_strategy: PaddingStrategy = PaddingStrategy.DO_NOT_PAD,
	    truncation_strategy: TruncationStrategy = TruncationStrategy.DO_NOT_TRUNCATE,
	    max_length: Optional[int] = None,
	    stride: int = 0,
	    is_split_into_words: bool = False,
	    pad_to_multiple_of: Optional[int] = None,
	    return_tensors: Optional[Union[str, TensorType]] = None,
	    return_token_type_ids: Optional[bool] = None,
	    return_attention_mask: Optional[bool] = None,
	    return_overflowing_tokens: bool = False,
	    return_special_tokens_mask: bool = False,
	    return_offsets_mapping: bool = False,
	    return_length: bool = False,
	    verbose: bool = True,
	    **kwargs
	) -> BatchEncoding:
	    """
	    Encode a batch of sequences (or of sequence pairs) and hand the ids to ``_batch_prepare_for_model``.

	    Every batch entry is either a single input or a pair of inputs. Each input may be a string (tokenized
	    with ``self.tokenize``), a non-empty list/tuple of strings (tokenized word by word when
	    ``is_split_into_words`` is set, otherwise treated as ready-made tokens) or a non-empty list/tuple of
	    integers (used as-is). ``**kwargs`` are forwarded to ``self.tokenize``.

	    Raises:
	        NotImplementedError: If ``return_offsets_mapping`` is requested.
	        ValueError: If an input has none of the accepted shapes.
	    """

	    def get_input_ids(text):
	        if isinstance(text, str):
	            tokens = self.tokenize(text, **kwargs)
	            return self.convert_tokens_to_ids(tokens)
	        elif isinstance(text, (list, tuple)) and len(text) > 0 and isinstance(text[0], str):
	            if is_split_into_words:
	                # Flatten the per-word token lists into one list of tokens. ``chain.from_iterable`` is
	                # needed here: ``chain(generator)`` would yield the per-word lists themselves.
	                tokens = list(
	                    itertools.chain.from_iterable(
	                        self.tokenize(word, is_split_into_words=True, **kwargs) for word in text
	                    )
	                )
	                return self.convert_tokens_to_ids(tokens)
	            else:
	                return self.convert_tokens_to_ids(text)
	        elif isinstance(text, (list, tuple)) and len(text) > 0 and isinstance(text[0], int):
	            return text
	        else:
	            raise ValueError(
	                "Input is not valid. Should be a string, a list/tuple of strings or a list/tuple of integers."
	            )

	    if return_offsets_mapping:
	        raise NotImplementedError(
	            "return_offset_mapping is not available when using Python tokenizers."
	            "To use this feature, change your tokenizer to one deriving from "
	            "transformers.PreTrainedTokenizerFast."
	        )

	    input_ids = []
	    for ids_or_pair_ids in batch_text_or_text_pairs:
	        if not isinstance(ids_or_pair_ids, (list, tuple)):
	            # A bare string: a single, un-paired input.
	            ids, pair_ids = ids_or_pair_ids, None
	        elif is_split_into_words and not isinstance(ids_or_pair_ids[0], (list, tuple)):
	            # A flat list of words: a single pre-tokenized input, not a pair.
	            ids, pair_ids = ids_or_pair_ids, None
	        else:
	            ids, pair_ids = ids_or_pair_ids

	        first_ids = get_input_ids(ids)
	        second_ids = get_input_ids(pair_ids) if pair_ids is not None else None
	        input_ids.append((first_ids, second_ids))

	    batch_outputs = self._batch_prepare_for_model(
	        input_ids,
	        add_special_tokens=add_special_tokens,
	        padding_strategy=padding_strategy,
	        truncation_strategy=truncation_strategy,
	        max_length=max_length,
	        stride=stride,
	        pad_to_multiple_of=pad_to_multiple_of,
	        return_attention_mask=return_attention_mask,
	        return_token_type_ids=return_token_type_ids,
	        return_overflowing_tokens=return_overflowing_tokens,
	        return_special_tokens_mask=return_special_tokens_mask,
	        return_length=return_length,
	        return_tensors=return_tensors,
	        verbose=verbose,
	    )

	    return BatchEncoding(batch_outputs)

	def _batch_prepare_for_model(
	    self,
	    batch_ids_pairs: List[Union[PreTokenizedInputPair, Tuple[List[int], None]]],
	    add_special_tokens: bool = True,
	    padding_strategy: PaddingStrategy = PaddingStrategy.DO_NOT_PAD,
	    truncation_strategy: TruncationStrategy = TruncationStrategy.DO_NOT_TRUNCATE,
	    max_length: Optional[int] = None,
	    stride: int = 0,
	    pad_to_multiple_of: Optional[int] = None,
	    return_tensors: Optional[str] = None,
	    return_token_type_ids: Optional[bool] = None,
	    return_attention_mask: Optional[bool] = None,
	    return_overflowing_tokens: bool = False,
	    return_special_tokens_mask: bool = False,
	    return_length: bool = False,
	    verbose: bool = True,
	) -> BatchEncoding:
	    """
	    Run ``prepare_for_model`` on every entry of a batch, then pad and wrap the collected outputs.

	    Each entry is prepared without padding, without attention mask and without tensor conversion; the
	    per-key results are gathered into lists, padded together with ``self.pad`` and finally wrapped in a
	    ``BatchEncoding`` built with ``tensor_type=return_tensors``.

	    Args:
	        batch_ids_pairs: list of tokenized input ids or input ids pairs
	    """
	    collected = {}
	    for first_ids, second_ids in batch_ids_pairs:
	        single_outputs = self.prepare_for_model(
	            first_ids,
	            second_ids,
	            add_special_tokens=add_special_tokens,
	            padding=PaddingStrategy.DO_NOT_PAD.value,  # we pad in batch afterward
	            truncation=truncation_strategy.value,
	            max_length=max_length,
	            stride=stride,
	            pad_to_multiple_of=None,  # we pad in batch afterward
	            return_attention_mask=False,  # we pad in batch afterward
	            return_token_type_ids=return_token_type_ids,
	            return_overflowing_tokens=return_overflowing_tokens,
	            return_special_tokens_mask=return_special_tokens_mask,
	            return_length=return_length,
	            return_tensors=None,  # We convert the whole batch to tensors at the end
	            prepend_batch_axis=False,
	            verbose=verbose,
	        )

	        # Group the values of every output key across the batch.
	        for key, value in single_outputs.items():
	            collected.setdefault(key, []).append(value)

	    padded = self.pad(
	        collected,
	        padding=padding_strategy.value,
	        max_length=max_length,
	        pad_to_multiple_of=pad_to_multiple_of,
	        return_attention_mask=return_attention_mask,
	    )

	    return BatchEncoding(padded, tensor_type=return_tensors)

	def prepare_for_tokenization(
	    self, text: str, is_split_into_words: bool = False, **kwargs
	) -> Tuple[str, Dict[str, Any]]:
	    """
	    Hook for transformations that must happen before tokenization.

	    Implementations should pop the arguments they consume from ``kwargs`` and return what is left. This
	    implementation changes nothing: the text comes back as given, together with every keyword argument
	    other than ``is_split_into_words``.

	    Args:
	        text (:obj:`str`):
	            The text to prepare.
	        is_split_into_words (:obj:`bool`, `optional`, defaults to :obj:`False`):
	            Whether or not the text has been pretokenized.
	        kwargs:
	            Keyword arguments to use for the tokenization.

	    Returns:
	        :obj:`Tuple[str, Dict[str, Any]]`: The prepared text and the unused kwargs.
	    """
	    unused_kwargs = kwargs
	    return text, unused_kwargs

	def get_special_tokens_mask(
	    self, token_ids_0: List, token_ids_1: Optional[List] = None, already_has_special_tokens: bool = False
	) -> List[int]:
	    """
	    Build the special-tokens mask for a sequence or a pair of sequences.

	    This implementation flags no position as special: it returns one ``0`` per id of ``token_ids_0`` and,
	    when given and non-empty, of ``token_ids_1``. ``already_has_special_tokens`` is not used here.

	    Args:
	        token_ids_0 (:obj:`List[int]`):
	            List of ids of the first sequence.
	        token_ids_1 (:obj:`List[int]`, `optional`):
	            List of ids of the second sequence.
	        already_has_special_tokens (:obj:`bool`, `optional`, defaults to :obj:`False`):
	            Whether or not the token list is already formatted with special tokens for the model.

	    Returns:
	        A list of integers in the range [0, 1]: 1 for a special token, 0 for a sequence token.
	    """
	    second_length = len(token_ids_1) if token_ids_1 else 0
	    total_length = second_length + len(token_ids_0)
	    return [0] * total_length

	@overload
	def convert_ids_to_tokens(self, ids: int, skip_special_tokens: bool = False) -> str:
	    ...

	@overload
	def convert_ids_to_tokens(self, ids: List[int], skip_special_tokens: bool = False) -> List[str]:
	    ...

	def convert_ids_to_tokens(
	    self, ids: Union[int, List[int]], skip_special_tokens: bool = False
	) -> Union[str, List[str]]:
	    """
	    Converts a single index or a sequence of indices in a token or a sequence of tokens, using the vocabulary and
	    added tokens.

	    Added tokens take priority over the base vocabulary. ``skip_special_tokens`` only applies to sequences:
	    a single integer id is always converted.

	    Args:
	        ids (:obj:`int` or :obj:`List[int]`):
	            The token id (or token ids) to convert to tokens.
	        skip_special_tokens (:obj:`bool`, `optional`, defaults to :obj:`False`):
	            Whether or not to remove special tokens in the decoding.

	    Returns:
	        :obj:`str` or :obj:`List[str]`: The decoded token(s).
	    """
	    if isinstance(ids, int):
	        if ids in self.added_tokens_decoder:
	            return self.added_tokens_decoder[ids]
	        return self._convert_id_to_token(ids)

	    # Read the ``all_special_ids`` property once instead of on every iteration, and only when needed.
	    skipped_ids = frozenset(self.all_special_ids) if skip_special_tokens else frozenset()

	    tokens = []
	    for index in ids:
	        index = int(index)
	        if index in skipped_ids:
	            continue
	        if index in self.added_tokens_decoder:
	            tokens.append(self.added_tokens_decoder[index])
	        else:
	            tokens.append(self._convert_id_to_token(index))
	    return tokens

	def _convert_id_to_token(self, index: int) -> str:
	raise NotImplementedError

	def convert_tokens_to_string(self, tokens: List[str]) -> str:
	    """Join the tokens into one string, separated by single spaces."""
	    separator = " "
	    return separator.join(tokens)

	def _decode(
	self,
	token_ids: List[int],
	skip_special_tokens: bool = False,
	clean_up_tokenization_spaces: bool = True,
	spaces_between_special_tokens: bool = True,
	) -> str:
	filtered_tokens = self.convert_ids_to_tokens(token_ids, skip_special_tokens=skip_special_tokens)

	# To avoid mixing byte-level and unicode for byte-level BPT
	# we need to build string separately for added tokens and byte-level tokens
	# cf. https://github.com/huggingface/transformers/issues/1133
	sub_texts = []
	current_sub_text = []
	for token in filtered_tokens:
	if skip_special_tokens and token in self.all_special_ids:
	continue
	if token in self.added_tokens_encoder:
	if current_sub_text:
	sub_texts.append(self.convert_tokens_to_string(current_sub_text))
	current_sub_text = []
	sub_texts.append(token)
	else:
	current_sub_text.append(token)
	if current_sub_text:
	sub_texts.append(self.convert_tokens_to_string(current_sub_text))

	if spaces_between_special_tokens:
	text = " ".join(sub_texts)
	else:
	text = "".join(sub_texts)

	if clean_up_tokenization_spaces:
	clean_text = self.clean_up_tokenization(text)
	return clean_text
	else:
	return text



	class BertTokenizer(PreTrainedTokenizer):
	vocab_files_names = VOCAB_FILES_NAMES
	pretrained_vocab_files_map = PRETRAINED_VOCAB_FILES_MAP
	pretrained_init_configuration = PRETRAINED_INIT_CONFIGURATION
	max_model_input_sizes = PRETRAINED_POSITIONAL_EMBEDDINGS_SIZES

	def __init__(
	    self,
	    vocab_file,
	    do_lower_case=True,
	    do_basic_tokenize=True,
	    never_split=None,
	    unk_token="[UNK]",
	    sep_token="[SEP]",
	    pad_token="[PAD]",
	    cls_token="[CLS]",
	    mask_token="[MASK]",
	    tokenize_chinese_chars=True,
	    strip_accents=None,
	    **kwargs
	):
	    """
	    Load the vocabulary and build the basic and wordpiece tokenizers.

	    Args:
	        vocab_file: Vocabulary source handed to ``load_vocab``.
	        do_lower_case: Forwarded to the parent constructor and to the ``BasicTokenizer``.
	        do_basic_tokenize: Whether a ``BasicTokenizer`` is created and run before wordpiece splitting.
	        never_split: Forwarded to the parent constructor and to the ``BasicTokenizer``.
	        unk_token, sep_token, pad_token, cls_token, mask_token: Special tokens, forwarded to the parent
	            constructor.
	        tokenize_chinese_chars: Forwarded to the parent constructor and to the ``BasicTokenizer``.
	        strip_accents: Forwarded to the parent constructor and to the ``BasicTokenizer``.
	        **kwargs: Forwarded to the parent constructor.
	    """
	    super().__init__(
	        do_lower_case=do_lower_case,
	        do_basic_tokenize=do_basic_tokenize,
	        never_split=never_split,
	        unk_token=unk_token,
	        sep_token=sep_token,
	        pad_token=pad_token,
	        cls_token=cls_token,
	        mask_token=mask_token,
	        tokenize_chinese_chars=tokenize_chinese_chars,
	        strip_accents=strip_accents,
	        **kwargs,
	    )
	    self.vocab = load_vocab(vocab_file)
	    # Reverse mapping (id -> token), in the iteration order of the vocabulary.
	    self.ids_to_tokens = collections.OrderedDict((index, token) for token, index in self.vocab.items())
	    self.do_basic_tokenize = do_basic_tokenize
	    if do_basic_tokenize:
	        basic_options = dict(
	            do_lower_case=do_lower_case,
	            never_split=never_split,
	            tokenize_chinese_chars=tokenize_chinese_chars,
	            strip_accents=strip_accents,
	        )
	        self.basic_tokenizer = BasicTokenizer(**basic_options)
	    self.wordpiece_tokenizer = WordpieceTokenizer(vocab=self.vocab, unk_token=self.unk_token)

	@property
	def do_lower_case(self):
	    """The ``do_lower_case`` setting of ``self.basic_tokenizer``."""
	    basic = self.basic_tokenizer
	    return basic.do_lower_case

	@property
	def vocab_size(self):
	    """Number of entries in ``self.vocab`` (added tokens not included)."""
	    base_vocab = self.vocab
	    return len(base_vocab)

	def get_vocab(self):
	    """
	    Return a new dict holding the base vocabulary followed by the added tokens.

	    An added token that shares its string with a base entry overrides that entry's index.
	    """
	    full_vocab = dict(self.vocab)
	    full_vocab.update(self.added_tokens_encoder)
	    return full_vocab

	def _tokenize(self, text):
	split_tokens = []
	if self.do_basic_tokenize:
	for token in self.basic_tokenizer.tokenize(text, never_split=self.all_special_tokens):

	# If the token is part of the never_split set
	if token in self.basic_tokenizer.never_split:
	split_tokens.append(token)
	else:
	split_tokens += self.wordpiece_tokenizer.tokenize(token)
	else:
	split_tokens = self.wordpiece_tokenizer.tokenize(text)
	return split_tokens

	def _convert_token_to_id(self, token):
	return self.vocab.get(token, self.vocab.get(self.unk_token))

	def _convert_id_to_token(self, index):
	return self.ids_to_tokens.get(index, self.unk_token)

	def convert_tokens_to_string(self, tokens):
	    """Join wordpiece tokens into a string, gluing every ``##`` continuation piece to the previous token."""
	    spaced = " ".join(tokens)
	    merged = spaced.replace(" ##", "")
	    return merged.strip()

	def build_inputs_with_special_tokens(
	    self, token_ids_0: List[int], token_ids_1: Optional[List[int]] = None
	) -> List[int]:
	    """
	    Wrap one or two id sequences with the classification and separator ids.

	    - single sequence: ``cls + token_ids_0 + sep``
	    - pair of sequences: ``cls + token_ids_0 + sep + token_ids_1 + sep``

	    Args:
	        token_ids_0 (:obj:`List[int]`): Ids of the first sequence.
	        token_ids_1 (:obj:`List[int]`, `optional`): Ids of the second sequence.

	    Returns:
	        :obj:`List[int]`: The ids with the special ids inserted.
	    """
	    cls = [self.cls_token_id]
	    sep = [self.sep_token_id]
	    first_part = cls + token_ids_0 + sep
	    if token_ids_1 is None:
	        return first_part
	    return first_part + token_ids_1 + sep

	def get_special_tokens_mask(
	    self, token_ids_0: List[int], token_ids_1: Optional[List[int]] = None, already_has_special_tokens: bool = False
	) -> List[int]:
	    """
	    Build a mask with ``1`` at special-token positions and ``0`` elsewhere.

	    When ``already_has_special_tokens`` is set, ``token_ids_0`` is scanned and every id equal to the
	    separator or classification id is flagged. Otherwise the mask describes the layout produced for the
	    raw sequence(s): ``[1] + zeros + [1]``, followed by ``zeros + [1]`` for a second sequence.

	    Raises:
	        ValueError: If a second sequence is supplied together with ``already_has_special_tokens=True``.
	    """
	    if already_has_special_tokens:
	        if token_ids_1 is not None:
	            raise ValueError(
	                "You should not supply a second sequence if the provided sequence of "
	                "ids is already formatted with special tokens for the model."
	            )
	        return [1 if x in (self.sep_token_id, self.cls_token_id) else 0 for x in token_ids_0]

	    mask = [1] + ([0] * len(token_ids_0)) + [1]
	    if token_ids_1 is not None:
	        mask = mask + ([0] * len(token_ids_1)) + [1]
	    return mask

	def create_token_type_ids_from_sequences(
	    self, token_ids_0: List[int], token_ids_1: Optional[List[int]] = None
	) -> List[int]:
	    """
	    Build the token type ids for one or two sequences.

	    The classification id, the first sequence and its separator get type ``0``; the second sequence and its
	    separator, when present, get type ``1``.

	    Args:
	        token_ids_0 (:obj:`List[int]`): Ids of the first sequence.
	        token_ids_1 (:obj:`List[int]`, `optional`): Ids of the second sequence.

	    Returns:
	        :obj:`List[int]`: One type id per position of the sequence(s) with special tokens.
	    """
	    sep = [self.sep_token_id]
	    cls = [self.cls_token_id]
	    first_segment = [0] * len(cls + token_ids_0 + sep)
	    if token_ids_1 is None:
	        return first_segment
	    second_segment = [1] * len(token_ids_1 + sep)
	    return first_segment + second_segment

	def save_vocabulary(self, save_directory: str, filename_prefix: Optional[str] = None) -> Tuple[str]:
	    """
	    Write the vocabulary to disk, one token per line, ordered by token id.

	    Args:
	        save_directory (:obj:`str`):
	            Either an existing directory, in which case the file is named after
	            ``VOCAB_FILES_NAMES["vocab_file"]``, or the path of the file to write.
	        filename_prefix (:obj:`str`, `optional`):
	            When given, ``"<prefix>-"`` is prepended to the file name.

	    Returns:
	        :obj:`Tuple[str]`: A one-element tuple holding the path of the written file.
	    """
	    prefix = filename_prefix + "-" if filename_prefix else ""
	    if os.path.isdir(save_directory):
	        vocab_file = os.path.join(save_directory, prefix + VOCAB_FILES_NAMES["vocab_file"])
	    elif prefix:
	        # ``save_directory`` is a file path here. The prefix belongs on the file name: putting it in front
	        # of the whole path would point into a directory that does not exist.
	        parent, file_name = os.path.split(save_directory)
	        vocab_file = os.path.join(parent, prefix + file_name)
	    else:
	        vocab_file = save_directory

	    with open(vocab_file, "w", encoding="utf-8") as writer:
	        for token, _ in sorted(self.vocab.items(), key=lambda kv: kv[1]):
	            writer.write(token + "\n")
	    return (vocab_file,)


	class BasicTokenizer(object):
	    """
	    Basic tokenization: text clean-up, optional CJK isolation, optional lower-casing and accent stripping,
	    and punctuation splitting.
	    """

	    def __init__(self, do_lower_case=True, never_split=None, tokenize_chinese_chars=True, strip_accents=None):
	        """
	        Args:
	            do_lower_case: Whether tokens are lower-cased.
	            never_split: Tokens that are left untouched; stored as a set.
	            tokenize_chinese_chars: Whether CJK characters are surrounded with spaces before splitting.
	            strip_accents: Whether accents are removed. With :obj:`None`, accents are removed exactly when
	                ``do_lower_case`` is set.
	        """
	        self.never_split = set() if never_split is None else set(never_split)
	        self.do_lower_case = do_lower_case
	        self.tokenize_chinese_chars = tokenize_chinese_chars
	        self.strip_accents = strip_accents

	    def tokenize(self, text, never_split=None):
	        """
	        Split ``text`` into basic tokens.

	        Args:
	            text: The text to tokenize.
	            never_split: Extra tokens to protect for this call, on top of ``self.never_split``.

	        Returns:
	            The list of tokens.
	        """
	        # union() returns a new set, so ``self.never_split`` itself is never modified.
	        protected = self.never_split.union(set(never_split)) if never_split else self.never_split
	        text = self._clean_text(text)

	        # This was added on November 1st, 2018 for the multilingual and Chinese
	        # models. This is also applied to the English models now, but it doesn't
	        # matter since the English models were not trained on any Chinese data
	        # and generally don't have any Chinese data in them (there are Chinese
	        # characters in the vocabulary because Wikipedia does have some Chinese
	        # words in the English Wikipedia.).
	        if self.tokenize_chinese_chars:
	            text = self._tokenize_chinese_chars(text)

	        pieces = []
	        for word in whitespace_tokenize(text):
	            if word not in protected:
	                if self.do_lower_case:
	                    word = word.lower()
	                    # When lower-casing, accents are kept only if stripping is explicitly disabled.
	                    if self.strip_accents is not False:
	                        word = self._run_strip_accents(word)
	                elif self.strip_accents:
	                    word = self._run_strip_accents(word)
	            pieces.extend(self._run_split_on_punc(word, protected))

	        return whitespace_tokenize(" ".join(pieces))

	    def _run_strip_accents(self, text):
	        """Remove accents: decompose to NFD and drop every character of category ``Mn``."""
	        decomposed = unicodedata.normalize("NFD", text)
	        return "".join(char for char in decomposed if unicodedata.category(char) != "Mn")

	    def _run_split_on_punc(self, text, never_split=None):
	        """Split ``text`` so that every punctuation character becomes a token of its own."""
	        if never_split is not None and text in never_split:
	            return [text]

	        groups = []
	        start_new_word = True
	        for char in text:
	            if _is_punctuation(char):
	                groups.append([char])
	                start_new_word = True
	                continue
	            if start_new_word:
	                groups.append([])
	            start_new_word = False
	            groups[-1].append(char)

	        return ["".join(group) for group in groups]

	    def _tokenize_chinese_chars(self, text):
	        """Put a space on both sides of every character accepted by ``_is_chinese_char``."""
	        return "".join(" " + char + " " if self._is_chinese_char(ord(char)) else char for char in text)

	    def _is_chinese_char(self, cp):
	        """Tell whether the code point ``cp`` lies in one of the CJK ranges listed below."""
	        # This defines a "chinese character" as anything in the CJK Unicode block:
	        # https://en.wikipedia.org/wiki/CJK_Unified_Ideographs_(Unicode_block)
	        #
	        # Note that the CJK Unicode block is NOT all Japanese and Korean characters,
	        # despite its name. The modern Korean Hangul alphabet is a different block,
	        # as is Japanese Hiragana and Katakana. Those alphabets are used to write
	        # space-separated words, so they are not treated specially and are handled
	        # like all of the other languages.
	        cjk_ranges = (
	            (0x4E00, 0x9FFF),
	            (0x3400, 0x4DBF),
	            (0x20000, 0x2A6DF),
	            (0x2A700, 0x2B73F),
	            (0x2B740, 0x2B81F),
	            (0x2B820, 0x2CEAF),
	            (0xF900, 0xFAFF),
	            (0x2F800, 0x2FA1F),
	        )
	        for low, high in cjk_ranges:
	            if cp >= low and cp <= high:
	                return True
	        return False

	    def _clean_text(self, text):
	        """Drop NUL, U+FFFD and control characters, and turn every whitespace character into a space."""
	        cleaned = []
	        for char in text:
	            cp = ord(char)
	            if cp == 0 or cp == 0xFFFD or _is_control(char):
	                continue
	            cleaned.append(" " if _is_whitespace(char) else char)
	        return "".join(cleaned)


	class WordpieceTokenizer(object):
	    """Splits words into wordpieces by greedy longest-match-first lookup in a vocabulary."""

	    def __init__(self, vocab, unk_token, max_input_chars_per_word=100):
	        """
	        Args:
	            vocab: Container of known pieces; only membership tests are performed on it.
	            unk_token: Token emitted for words that cannot be split or that are too long.
	            max_input_chars_per_word: Words with more characters than this become ``unk_token``.
	        """
	        self.max_input_chars_per_word = max_input_chars_per_word
	        self.unk_token = unk_token
	        self.vocab = vocab

	    def tokenize(self, text):
	        """
	        Tokenize whitespace-separated ``text`` into wordpieces.

	        For every word, the longest prefix found in the vocabulary is taken first; the remainder is matched
	        the same way with a ``##`` prefix on each piece. A word for which some position has no matching
	        piece is replaced, as a whole, by ``self.unk_token``.

	        Returns:
	            The list of wordpiece tokens.
	        """
	        output_tokens = []
	        for word in whitespace_tokenize(text):
	            word_length = len(word)
	            if word_length > self.max_input_chars_per_word:
	                output_tokens.append(self.unk_token)
	                continue

	            pieces = []
	            start = 0
	            while start < word_length:
	                # Try the longest candidate first and shrink it from the right until one is known.
	                for end in range(word_length, start, -1):
	                    candidate = word[start:end]
	                    if start > 0:
	                        candidate = "##" + candidate
	                    if candidate in self.vocab:
	                        break
	                else:
	                    # No piece starts at this position: the whole word is unknown.
	                    pieces = None
	                    break
	                pieces.append(candidate)
	                start = end

	            if pieces is None:
	                output_tokens.append(self.unk_token)
	            else:
	                output_tokens.extend(pieces)
	        return output_tokens