Commit
·
dd7f5ac
1
Parent(s):
a8cd25a
Update README.md
Browse files
Adding the Tokenizer usage.
README.md
CHANGED
@@ -57,6 +57,26 @@ print(fill_mask("αααΊαα―ααΊαααΊ ααΌααΊαα¬ααα―α
|
|
57 |
'sequence': 'αααΊαα―ααΊαααΊ ααΌααΊαα¬ααα―ααΊααΆα α‘ααΎα±α·ααα―ααΊαΈ ααΌα
αΊαααΊα'}]
|
58 |
```
|
59 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
60 |
## Extract text embedding from the sentence
|
61 |
```python
|
62 |
import torch
|
|
|
57 |
'sequence': 'αααΊαα―ααΊαααΊ ααΌααΊαα¬ααα―ααΊααΆα α‘ααΎα±α·ααα―ααΊαΈ ααΌα
αΊαααΊα'}]
|
58 |
```
|
59 |
|
60 |
+
## How to use only the trained tokenizer for Burmese sentences
|
61 |
+
```python
|
62 |
+
from transformers import AutoTokenizer
|
63 |
+
|
64 |
+
model_name = "saihtaungkham/BurmeseRoBERTa"
|
65 |
+
tokenizer = AutoTokenizer.from_pretrained(model_name)
|
66 |
+
text = "သဘာဝဟာသဘာဝပါ။"
|
67 |
+
|
68 |
+
# Tokenized words
|
69 |
+
print(tokenizer.tokenize(text))
|
70 |
+
# Expected Output
|
71 |
+
# ['▁', 'သဘာဝ', 'ဟာ', 'သဘာဝ', 'ပါ။']
|
72 |
+
|
73 |
+
# Tokenized IDs for training other models
|
74 |
+
print(tokenizer.encode(text))
|
75 |
+
# Expected Output
|
76 |
+
# [1, 3, 1003, 30, 1003, 62, 2]
|
77 |
+
|
78 |
+
```
|
79 |
+
|
80 |
## Extract text embedding from the sentence
|
81 |
```python
|
82 |
import torch
|