# Source: Hugging Face model-card example by samleeasus ("Create README.md", commit 5bc2a52)
from transformers import LlamaTokenizer

# Load the custom OpenLLaMA tokenizer from the Hugging Face Hub.
# BOS is suppressed; an EOS token is appended to every encoded sequence.
tokenizer = LlamaTokenizer.from_pretrained(
    'ocisd4/openllama_tokenizer_v2',
    add_bos_token=False,
    add_eos_token=True,
    force_download=False,
    use_auth_token=True,  # NOTE(review): repo presumably gated/private — needs a logged-in HF token
    # additional_special_tokens=['<|spcout|>', '<|sep|>', '<|eot|>', '<|output|>']
)

print('vocab size:', tokenizer.vocab_size)
# vocab size: 51456

text = '今天天氣真好!'

# Split the sample sentence into subword pieces.
pieces = tokenizer.tokenize(text)
print(pieces)
# ['▁', '今天', '天氣', '真', '好', '!']

# Convert to token ids; the trailing 2 is the EOS id added by add_eos_token=True.
token_ids = tokenizer.encode(text)
print(token_ids)
# [29500, 32097, 32916, 30615, 30192, 30042, 2]

# Round-trip back to text; the decoded string carries a literal </s> marker.
print(tokenizer.decode(token_ids))
# 今天天氣真好!</s>