|
```python |
|
from transformers import LlamaTokenizer

# Load the OpenLLaMA v2 tokenizer from the Hugging Face Hub.
# add_bos_token=False / add_eos_token=True: encoded sequences end with </s>
# (id 2) but are NOT prefixed with <s> — matches the encode() output below.
tokenizer = LlamaTokenizer.from_pretrained(
    'ocisd4/openllama_tokenizer_v2',
    add_bos_token=False,
    add_eos_token=True,
    force_download=False,
    # `use_auth_token` is deprecated (removed in transformers v5);
    # `token=True` is the replacement and reads the token saved by
    # `huggingface-cli login`.
    token=True,
    # additional_special_tokens=['<|spcout|>', '<|sep|>', '<|eot|>', '<|output|>']
)

print('vocab size:', tokenizer.vocab_size)
# vocab size: 51456

# Traditional Chinese sample sentence: "The weather is really nice today!"
text = '今天天氣真好!'

print(tokenizer.tokenize(text))
# ['▁', '今天', '天氣', '真', '好', '!']

print(tokenizer.encode(text))
# [29500, 32097, 32916, 30615, 30192, 30042, 2]

# Round-trip: decode(encode(...)) restores the text plus the trailing </s>
# added by add_eos_token=True.
print(tokenizer.decode(tokenizer.encode(text)))
# 今天天氣真好!</s>
|
``` |