from abc import ABC
from abc import abstractmethod


class AbstractTokenizer(ABC):
    """Abstract class for tokenizer."""

    def __init__(self, name):
        self.name = name
        super().__init__()

    @property
    @abstractmethod
    def vocab_size(self):
        pass

    @property
    @abstractmethod
    def vocab(self):
        """Dictionary from vocab text token to id token."""
        pass

    @property
    @abstractmethod
    def inv_vocab(self):
        """Dictionary from vocab id token to text token."""
        pass

    @abstractmethod
    def tokenize(self, text):
        pass

    def detokenize(self, token_ids):
        raise NotImplementedError('detokenizer is not implemented for {} '
                                  'tokenizer'.format(self.name))

    @property
    def cls(self):
        raise NotImplementedError('CLS is not provided for {} '
                                  'tokenizer'.format(self.name))

    @property
    def sep(self):
        raise NotImplementedError('SEP is not provided for {} '
                                  'tokenizer'.format(self.name))

    @property
    def pad(self):
        raise NotImplementedError('PAD is not provided for {} '
                                  'tokenizer'.format(self.name))

    @property
    def eod(self):
        raise NotImplementedError('EOD is not provided for {} '
                                  'tokenizer'.format(self.name))

    @property
    def mask(self):
        raise NotImplementedError('MASK is not provided for {} '
                                  'tokenizer'.format(self.name))
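

# Illustrative sketch only (not part of the original module): a minimal
# concrete subclass of AbstractTokenizer, showing which members a real
# implementation must provide -- vocab_size, vocab, inv_vocab and tokenize.
# The whitespace-splitting scheme and the _ToyWhitespaceTokenizer name are
# hypothetical and exist purely for illustration.
class _ToyWhitespaceTokenizer(AbstractTokenizer):
    """Toy tokenizer that maps each whitespace-separated word to an id."""

    def __init__(self, words):
        super().__init__('ToyWhitespaceTokenizer')
        # Build a fixed vocabulary from the given word list.
        self._vocab = {w: i for i, w in enumerate(words)}
        self._inv_vocab = {i: w for w, i in self._vocab.items()}

    @property
    def vocab_size(self):
        return len(self._vocab)

    @property
    def vocab(self):
        return self._vocab

    @property
    def inv_vocab(self):
        return self._inv_vocab

    def tokenize(self, text):
        # Unknown words are simply skipped in this toy example.
        return [self._vocab[w] for w in text.split() if w in self._vocab]

    def detokenize(self, token_ids):
        return ' '.join(self._inv_vocab[i] for i in token_ids)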
					
						
class _SentencePieceTokenizer(AbstractTokenizer):
    """SentencePieceTokenizer-Megatron wrapper"""

    def __init__(self, model_file, vocab_extra_ids=0):
        name = 'SentencePieceTokenizer'
        super().__init__(name)

        import sentencepiece
        self.tokenizer = sentencepiece.SentencePieceProcessor(model_file=model_file)
        self._initalize(vocab_extra_ids)

    def _populate_vocab(self):
        self._vocab = {}
        self._inv_vocab = {}

        for i in range(len(self.tokenizer)):
            t = self.tokenizer.id_to_piece(i)
            self._inv_vocab[i] = t
            self._vocab[t] = i

    def _initalize(self, vocab_extra_ids):
        self._populate_vocab()
        self._special_tokens = {}
        self._inv_special_tokens = {}

        self._t5_tokens = []

        def _add_special_token(t):
            # Append the token to the vocabulary if the underlying
            # SentencePiece model does not already contain it.
            if t not in self._vocab:
                next_id = len(self._vocab)
                self._vocab[t] = next_id
                self._inv_vocab[next_id] = t
            self._special_tokens[t] = self._vocab[t]
            self._inv_special_tokens[self._vocab[t]] = t

        _add_special_token('<CLS>')
        self._cls_id = self._vocab['<CLS>']
        _add_special_token('<SEP>')
        self._sep_id = self._vocab['<SEP>']
        _add_special_token('<EOD>')
        self._eod_id = self._vocab['<EOD>']
        _add_special_token('<MASK>')
        self._mask_id = self._vocab['<MASK>']

        # If the model does not define pad/bos/eos pieces, id_to_piece raises
        # IndexError and a synthetic token is registered instead.
        pad_id = self.tokenizer.pad_id()
        try:
            pad_token = self.tokenizer.id_to_piece(pad_id)
        except IndexError:
            pad_token = '<PAD>'
        _add_special_token(pad_token)
        self._pad_id = self._vocab[pad_token]

        bos_id = self.tokenizer.bos_id()
        try:
            bos_token = self.tokenizer.id_to_piece(bos_id)
        except IndexError:
            bos_token = '<BOS>'
        _add_special_token(bos_token)
        self._bos_id = self._vocab[bos_token]

        eos_id = self.tokenizer.eos_id()
        try:
            eos_token = self.tokenizer.id_to_piece(eos_id)
        except IndexError:
            eos_token = '<EOS>'
        _add_special_token(eos_token)
        self._eos_id = self._vocab[eos_token]

        # T5-style sentinel tokens <extra_id_0>, <extra_id_1>, ...
        for i in range(vocab_extra_ids):
            t = "<extra_id_{}>".format(i)
            _add_special_token(t)
            self._t5_tokens += [t]
					
						
    @property
    def vocab_size(self):
        return len(self._vocab)

    @property
    def vocab(self):
        return self._vocab

    @property
    def inv_vocab(self):
        return self._inv_vocab

    @property
    def decoder(self):
        return self._inv_vocab

    @property
    def encoder(self):
        return self._vocab
					
						
    def tokenize(self, text):
        ids = []
        idx = 0

        while True:
            # Find the earliest occurrence of any registered special token.
            indices = {}
            for token in self._special_tokens:
                try:
                    indices[token] = text[idx:].index(token)
                except ValueError:
                    continue
            if len(indices) == 0:
                break

            next_token = min(indices, key=indices.get)
            next_idx = idx + indices[next_token]

            # Encode the plain text before the special token, then emit the
            # special token's reserved id directly.
            ids.extend(self.tokenizer.encode_as_ids(text[idx:next_idx]))
            ids.append(self._special_tokens[next_token])
            idx = next_idx + len(next_token)

        ids.extend(self.tokenizer.encode_as_ids(text[idx:]))
        return ids
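    # Illustration (added example, not from the original source): with the
    # special tokens registered above, a call such as
    #
    #     tokenizer.tokenize("question <MASK> answer")
    #
    # SentencePiece-encodes "question " and " answer" separately and places
    # the single reserved id of '<MASK>' between them, so special markers are
    # never broken into subword pieces. The exact subword ids depend on the
    # .model file that is loaded.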
					
						
    def detokenize(self, ids):
        text = ""
        last_i = 0

        for i, token_id in enumerate(ids):
            if token_id in self._inv_special_tokens:
                # Decode the ordinary ids accumulated so far, then splice the
                # special token back in as literal text.
                text += self.tokenizer.decode_ids(ids[last_i:i]) + " "
                text += self._inv_special_tokens[token_id] + " "
                last_i = i + 1

        text += self.tokenizer.decode_ids(ids[last_i:])
        return text
					
						
    @property
    def cls(self):
        return self._cls_id

    @property
    def sep(self):
        return self._sep_id

    @property
    def pad(self):
        return self._pad_id

    @property
    def bos_token_id(self):
        return self._bos_id

    @property
    def bos(self):
        return self._bos_id

    @property
    def eod(self):
        return self._eod_id

    @property
    def eos_token_id(self):
        return self._eos_id

    @property
    def eos(self):
        return self._eos_id

    @property
    def mask(self):
        return self._mask_id

    @property
    def additional_special_tokens_ids(self):
        return [self.vocab[k] for k in self._t5_tokens]
					
						
class _MMSentencePieceTokenizer(_SentencePieceTokenizer):
    """SentencePieceTokenizer-Megatron wrapper with additional multimodal
    special tokens (<SOA>/<EOA>, <SOV>/<EOV>, <SOI>/<EOI>, local/global and
    stage markers)."""

    def __init__(self, model_file, vocab_extra_ids=0):
        super().__init__(model_file, vocab_extra_ids)

    def _initalize(self, vocab_extra_ids):
        self._populate_vocab()
        self._special_tokens = {}
        self._inv_special_tokens = {}

        self._t5_tokens = []

        def _add_special_token(t):
            if t not in self._vocab:
                next_id = len(self._vocab)
                self._vocab[t] = next_id
                self._inv_vocab[next_id] = t
            self._special_tokens[t] = self._vocab[t]
            self._inv_special_tokens[self._vocab[t]] = t

        _add_special_token('<CLS>')
        self._cls_id = self._vocab['<CLS>']
        _add_special_token('<SEP>')
        self._sep_id = self._vocab['<SEP>']
        _add_special_token('<EOD>')
        self._eod_id = self._vocab['<EOD>']
        _add_special_token('<MASK>')
        self._mask_id = self._vocab['<MASK>']

        # Multimodal boundary tokens added on top of the parent class.
        _add_special_token('<SOA>')
        self._soa_id = self._vocab['<SOA>']
        _add_special_token('<EOA>')
        self._eoa_id = self._vocab['<EOA>']
        _add_special_token('<SOV>')
        self._sov_id = self._vocab['<SOV>']
        _add_special_token('<EOV>')
        self._eov_id = self._vocab['<EOV>']
        _add_special_token('<SOI>')
        self._soi_id = self._vocab['<SOI>']
        _add_special_token('<EOI>')
        self._eoi_id = self._vocab['<EOI>']
        _add_special_token('<s_local>')
        self._s_local_id = self._vocab['<s_local>']
        _add_special_token('<e_local>')
        self._e_local_id = self._vocab['<e_local>']
        _add_special_token('<s_global>')
        self._s_global_id = self._vocab['<s_global>']
        _add_special_token('<e_global>')
        self._e_global_id = self._vocab['<e_global>']
        _add_special_token('<stage_1>')
        self._stage_1_id = self._vocab['<stage_1>']
        _add_special_token('<stage_2>')
        self._stage_2_id = self._vocab['<stage_2>']

        pad_id = self.tokenizer.pad_id()
        try:
            pad_token = self.tokenizer.id_to_piece(pad_id)
        except IndexError:
            pad_token = '<PAD>'
        _add_special_token(pad_token)
        self._pad_id = self._vocab[pad_token]

        bos_id = self.tokenizer.bos_id()
        try:
            bos_token = self.tokenizer.id_to_piece(bos_id)
        except IndexError:
            bos_token = '<BOS>'
        _add_special_token(bos_token)
        self._bos_id = self._vocab[bos_token]

        eos_id = self.tokenizer.eos_id()
        try:
            eos_token = self.tokenizer.id_to_piece(eos_id)
        except IndexError:
            eos_token = '<EOS>'
        _add_special_token(eos_token)
        self._eos_id = self._vocab[eos_token]

        for i in range(vocab_extra_ids):
            t = "<extra_id_{}>".format(i)
            _add_special_token(t)
            self._t5_tokens += [t]

    @property
    def soa(self):
        return self._soa_id

    @property
    def eoa(self):
        return self._eoa_id

    @property
    def sov(self):
        return self._sov_id

    @property
    def eov(self):
        return self._eov_id

    @property
    def soi(self):
        return self._soi_id

    @property
    def eoi(self):
        return self._eoi_id

    @property
    def s_local(self):
        return self._s_local_id

    @property
    def e_local(self):
        return self._e_local_id

    @property
    def s_global(self):
        return self._s_global_id

    @property
    def e_global(self):
        return self._e_global_id

    @property
    def stage_1(self):
        return self._stage_1_id

    @property
    def stage_2(self):
        return self._stage_2_id
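

# Usage sketch (added illustration, not part of the original module). The
# model path below is hypothetical; any trained SentencePiece .model file
# would do.
if __name__ == '__main__':
    tokenizer = _SentencePieceTokenizer('/path/to/tokenizer.model',
                                        vocab_extra_ids=100)
    print('vocab size:', tokenizer.vocab_size)

    # Special tokens such as <MASK> survive a tokenize/detokenize round trip
    # because tokenize() maps them to their reserved ids instead of letting
    # SentencePiece split them into subwords.
    ids = tokenizer.tokenize('question <MASK> answer')
    print(ids)
    print(tokenizer.detokenize(ids))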