Japanese BERT-base (MeCab + BPE)
How to load the tokenizer
Please download the dictionary file for MeCab + BPE from our GitHub repository.
Then you can load the tokenizer by setting `dict_path` to the path of the downloaded dictionary file.
from typing import Optional
from tokenizers import Tokenizer, NormalizedString, PreTokenizedString
from tokenizers.processors import BertProcessing
from tokenizers.pre_tokenizers import PreTokenizer
from transformers import PreTrainedTokenizerFast
from MeCab import Tagger
import textspan
class MecabPreTokenizer:
    """Custom pre-tokenizer that splits text at MeCab word boundaries.

    Intended to be wrapped with ``PreTokenizer.custom`` so that a
    ``tokenizers`` pipeline segments text with MeCab before its model runs.
    """

    def __init__(self, mecab_dict_path: Optional[str] = None):
        """Create the MeCab tagger in wakati (space-separated) output mode.

        Args:
            mecab_dict_path: Dictionary path handed to MeCab's ``-d`` option.
                When ``None``, ``-d`` is omitted and MeCab falls back to its
                own default.
        """
        mecab_option = (
            f"-Owakati -d {mecab_dict_path}"
            if mecab_dict_path is not None
            else "-Owakati"
        )
        self.mecab = Tagger(mecab_option)

    def tokenize(self, sequence: str) -> list[str]:
        """Return the surface forms MeCab segments ``sequence`` into.

        Empty strings are dropped, so empty or whitespace-only output from
        MeCab yields ``[]`` rather than ``[""]``.
        """
        wakati = self.mecab.parse(sequence).strip()
        # Split on the single ASCII space MeCab emits between tokens; a bare
        # ``split()`` would also swallow tokens such as a full-width space.
        return [token for token in wakati.split(" ") if token]

    def custom_split(self, i: int, normalized_string: NormalizedString) -> list[NormalizedString]:
        """Split one piece of a pre-tokenized string at MeCab token boundaries.

        Args:
            i: Index passed in by ``PreTokenizedString.split``; unused here.
            normalized_string: The piece of text to split.

        Returns:
            Slices of ``normalized_string``, one for every character span
            that textspan aligns to a MeCab token.
        """
        text = str(normalized_string)
        tokens = self.tokenize(text)
        # Each token maps to a list of (start, end) spans in ``text``;
        # flatten them so every span becomes its own slice.
        tokens_spans = textspan.get_original_spans(tokens, text)
        return [
            normalized_string[start:end]
            for char_spans in tokens_spans
            for start, end in char_spans
        ]

    def pre_tokenize(self, pretok: PreTokenizedString):
        """Apply :meth:`custom_split` to ``pretok`` in place."""
        pretok.split(self.custom_split)
# load a tokenizer
# NOTE: placeholder path -- replace it with the location of the downloaded
# MeCab + BPE dictionary file (mecab_bpe.json).
dict_path = "/path/to/mecab_bpe.json"
tokenizer = Tokenizer.from_file(dict_path)

# load a pre-tokenizer
pre_tokenizer = MecabPreTokenizer()

# wrap every encoded sequence as "[CLS] ... [SEP]"
tokenizer.post_processor = BertProcessing(
    cls=("[CLS]", tokenizer.token_to_id("[CLS]")),
    sep=("[SEP]", tokenizer.token_to_id("[SEP]")),
)

# convert to PreTrainedTokenizerFast
tokenizer = PreTrainedTokenizerFast(
    tokenizer_object=tokenizer,
    unk_token="[UNK]",
    cls_token="[CLS]",
    sep_token="[SEP]",
    pad_token="[PAD]",
    mask_token="[MASK]",
)

# set a pre-tokenizer
# The custom pre-tokenizer is attached to the wrapped backend tokenizer only
# after the PreTrainedTokenizerFast object has been built.
tokenizer._tokenizer.pre_tokenizer = PreTokenizer.custom(pre_tokenizer)
# Test
# Encode a sample Japanese sentence and map the ids back to tokens to check
# that the MeCab + BPE pipeline is wired up correctly.
test_str = "こんにちは。私は形態素解析器について研究をしています。"
tokenizer.convert_ids_to_tokens(tokenizer(test_str).input_ids)
# -> ['[CLS]', 'こん', 'に', 'ち', 'は', '。', '私', 'は', '形態', '素', '解析', '器', 'について', '研究', 'を', 'し', 'て', 'い', 'ます', '。', '[SEP]']
How to load the model
from transformers import AutoModelForMaskedLM

# Load the pretrained masked-language-model checkpoint by its model id.
model = AutoModelForMaskedLM.from_pretrained(
    "hitachi-nlp/bert-base_mecab-bpe"
)
See our repository for more details!
- Downloads last month
- 37