Nad1011 committed on
Commit cd81fc7 · 1 Parent(s): e08131e

Upload 21 files

first train/bpe/tokenizer.lo_bpe.model ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:57835e0a538802d5de3903f12483820a98949f25b7fccbb3ff8a73c8c0969fdb
+ size 1663887
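The three-line blocks in this diff are Git LFS pointer files: the binary itself lives in LFS storage, and the pointer records only the spec version, a SHA-256 object id, and the size in bytes. As a minimal sketch (the local paths are assumptions, not part of this commit), a pointer can be parsed and a fetched object checked against it like this:

```python
import hashlib
from pathlib import Path

def parse_lfs_pointer(pointer_path):
    """Parse a Git LFS pointer file into a {version, oid, size} dict."""
    fields = {}
    for line in Path(pointer_path).read_text().splitlines():
        if line.strip():
            key, value = line.split(" ", 1)
            fields[key] = value
    return fields

def verify_lfs_object(pointer_path, object_path):
    """Check a fetched file against the oid and size recorded in its pointer."""
    fields = parse_lfs_pointer(pointer_path)
    expected_oid = fields["oid"].split(":", 1)[1]  # drop the "sha256:" prefix
    data = Path(object_path).read_bytes()
    return hashlib.sha256(data).hexdigest() == expected_oid and len(data) == int(fields["size"])

# Hypothetical usage, assuming the LFS object was downloaded separately:
# verify_lfs_object("first train/bpe/tokenizer.lo_bpe.model", "downloads/tokenizer.lo_bpe.model")
```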
first train/bpe/tokenizer.lo_bpe.vocab ADDED
The diff for this file is too large to render. See raw diff
 
first train/bpe/tokenizer.vi_bpe.model ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:57ad603b2251531ed49a27f5aa5ddc70cc8a65841d5626a2575e2442089950f0
+ size 577154
first train/bpe/tokenizer.vi_bpe.vocab ADDED
The diff for this file is too large to render. See raw diff
 
first train/corpus.zip ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:c2fc2e449f1051612abe8795f0a8df744ead3736be8e40c3dadc181869efd965
+ size 485338322
first train/log.txt ADDED
@@ -0,0 +1,95 @@
+ Loading config ...
+ Loading data ...
+ Building vocab ...
+ Creating iterator ...
+ Building dataset ...
+ Building vocab from dataset ...
+ Load vocab from path successful
+ Building encoder and decoder ...
+ src vocab size = 29393
+ trg vocab size = 15202
+ Encoder: 34065920 parameters
+ Decoder: 40908642 parameters
+ Starting training on cuda
+ Performing training...
+ ==================================================
+ Epoch: 01 - 15.0m40.41644310951233s
+ Train Loss/PPL: 9.094 / 8899.793
+ Val Loss/PPL: 8.620 / 5540.010
+ --------------------------------------------------
+ Epoch: 02 - 15.0m38.90395212173462s
+ Train Loss/PPL: 8.185 / 3586.749
+ Val Loss/PPL: 8.069 / 3193.848
+ --------------------------------------------------
+ Epoch: 03 - 15.0m39.11497640609741s
+ Train Loss/PPL: 7.795 / 2427.533
+ Val Loss/PPL: 7.912 / 2729.002
+ --------------------------------------------------
+ Epoch: 04 - 15.0m42.52194285392761s
+ Train Loss/PPL: 7.661 / 2123.347
+ Val Loss/PPL: 7.859 / 2589.700
+ --------------------------------------------------
+ Epoch: 05 - 15.0m42.61946368217468s
+ Train Loss/PPL: 7.604 / 2005.850
+ Val Loss/PPL: 7.837 / 2532.609
+ --------------------------------------------------
+ Epoch: 06 - 15.0m40.5325984954834s
+ Train Loss/PPL: 7.570 / 1938.907
+ Val Loss/PPL: 7.822 / 2493.998
+ --------------------------------------------------
+ Epoch: 07 - 15.0m44.441715240478516s
+ Train Loss/PPL: 7.546 / 1893.262
+ Val Loss/PPL: 7.812 / 2469.149
+ --------------------------------------------------
+ Epoch: 08 - 15.0m43.27636504173279s
+ Train Loss/PPL: 7.525 / 1854.688
+ Val Loss/PPL: 7.800 / 2441.054
+ --------------------------------------------------
+ Epoch: 09 - 17.0m49.64024472236633s
+ Train Loss/PPL: 7.509 / 1823.568
+ Val Loss/PPL: 7.790 / 2415.858
+ --------------------------------------------------
+ Epoch: 10 - 15.0m41.81872010231018s
+ Train Loss/PPL: 7.492 / 1793.774
+ Val Loss/PPL: 7.780 / 2391.125
+ --------------------------------------------------
+ Epoch: 11 - 28.0m3.3641841411590576s
+ Train Loss/PPL: 7.477 / 1767.388
+ Val Loss/PPL: 7.772 / 2373.962
+ --------------------------------------------------
+ Epoch: 12 - 15.0m45.12012314796448s
+ Train Loss/PPL: 7.463 / 1742.621
+ Val Loss/PPL: 7.763 / 2350.974
+ --------------------------------------------------
+ Epoch: 13 - 15.0m42.93015956878662s
+ Train Loss/PPL: 7.449 / 1718.568
+ Val Loss/PPL: 7.756 / 2335.491
+ --------------------------------------------------
+ Epoch: 14 - 15.0m44.00054144859314s
+ Train Loss/PPL: 7.438 / 1699.215
+ Val Loss/PPL: 7.748 / 2317.051
+ --------------------------------------------------
+ Epoch: 15 - 15.0m55.463807582855225s
+ Train Loss/PPL: 7.426 / 1679.351
+ Val Loss/PPL: 7.741 / 2301.697
+ --------------------------------------------------
+ Epoch: 16 - 15.0m44.77303099632263s
+ Train Loss/PPL: 7.415 / 1660.209
+ Val Loss/PPL: 7.733 / 2282.415
+ --------------------------------------------------
+ Epoch: 17 - 15.0m44.32082152366638s
+ Train Loss/PPL: 7.405 / 1643.373
+ Val Loss/PPL: 7.726 / 2266.476
+ --------------------------------------------------
+ Epoch: 18 - 15.0m43.58943033218384s
+ Train Loss/PPL: 7.395 / 1627.018
+ Val Loss/PPL: 7.719 / 2250.929
+ --------------------------------------------------
+ Epoch: 19 - 15.0m45.2637825012207s
+ Train Loss/PPL: 7.386 / 1613.311
+ Val Loss/PPL: 7.712 / 2234.829
+ --------------------------------------------------
+ Epoch: 20 - 15.0m43.84825682640076s
+ Train Loss/PPL: 7.376 / 1597.986
+ Val Loss/PPL: 7.706 / 2220.886
+ --------------------------------------------------
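The Loss/PPL pairs reported above are consistent with perplexity being the exponential of the cross-entropy loss, up to the rounding of the printed loss; a quick check using figures copied from the log:

```python
import math

# (loss, perplexity) pairs taken from the epoch 01 train, epoch 01 val, and epoch 20 train lines of log.txt
for loss, reported_ppl in [(9.094, 8899.793), (8.620, 5540.010), (7.376, 1597.986)]:
    print(f"exp({loss}) = {math.exp(loss):.1f}  (log reports {reported_ppl})")
```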
first train/model.pkl ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:eb643ec4534bec110cbb4d9b4af5dac47ac8a540c25fd4835974e8d87c340cc1
+ size 299975138
first train/setup.md ADDED
@@ -0,0 +1,16 @@
+ Put the trained model in this directory
+
+ tokenizer.vi.model {
+ model_type=unigram
+ vocab_size=20000
+ max_sentence_length=100000
+ split_by_whitespace=false
+ input_sentence_size=700000
+ }
+ tokenizer.vi_bpe.model {
+ model_type=bpe
+ vocab_size=16000
+ max_sentence_length=100000
+ split_by_whitespace=false
+ input_sentence_size=1000000
+ }
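setup.md records the SentencePiece settings behind the two Vietnamese tokenizers. As a rough sketch of how the unigram model described by the first block could be reproduced (the input path vi.txt is a placeholder for the Vietnamese side of the corpus in corpus.zip, not a file in this commit):

```python
import sentencepiece as spm

# Settings copied from the tokenizer.vi.model block in setup.md;
# the input path is a placeholder, not part of this repository.
spm.SentencePieceTrainer.train(
    input="vi.txt",
    model_prefix="tokenizer.vi",
    model_type="unigram",
    vocab_size=20000,
    max_sentence_length=100000,
    split_by_whitespace=False,
    input_sentence_size=700000,
)
```

The BPE variant follows the same pattern with model_type="bpe", as shown in tokenizer_training.ipynb below.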
first train/tokenizer_training.ipynb ADDED
@@ -0,0 +1,179 @@
+ {
+ "nbformat": 4,
+ "nbformat_minor": 0,
+ "metadata": {
+ "colab": {
+ "provenance": [],
+ "gpuType": "T4",
+ "authorship_tag": "ABX9TyPXgKZqJoVuio+h58qyoujZ",
+ "include_colab_link": true
+ },
+ "kernelspec": {
+ "name": "python3",
+ "display_name": "Python 3"
+ },
+ "language_info": {
+ "name": "python"
+ },
+ "accelerator": "GPU"
+ },
+ "cells": [
+ {
+ "cell_type": "markdown",
+ "metadata": {
+ "id": "view-in-github",
+ "colab_type": "text"
+ },
+ "source": [
+ "<a href=\"https://colab.research.google.com/github/Erioldeth/Viet-Laos-Translator/blob/main/tokenizer_training.ipynb\" target=\"_parent\"><img src=\"https://colab.research.google.com/assets/colab-badge.svg\" alt=\"Open In Colab\"/></a>"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 1,
+ "metadata": {
+ "colab": {
+ "base_uri": "https://localhost:8080/"
+ },
+ "id": "g6isgzoaxWTr",
+ "outputId": "8ea96348-ea45-4d2e-e0a5-b76f3cfbb255"
+ },
+ "outputs": [
+ {
+ "output_type": "stream",
+ "name": "stdout",
+ "text": [
+ "Mounted at /content/drive\n"
+ ]
+ }
+ ],
+ "source": [
+ "from google.colab import drive\n",
+ "drive.mount('/content/drive')"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "source": [
+ "!pip install sentencepiece"
+ ],
+ "metadata": {
+ "colab": {
+ "base_uri": "https://localhost:8080/"
+ },
+ "id": "2ZYK4GgzzSG4",
+ "outputId": "be748680-ccff-45e0-d0b5-91f9a9887608"
+ },
+ "execution_count": 2,
+ "outputs": [
+ {
+ "output_type": "stream",
+ "name": "stdout",
+ "text": [
+ "Collecting sentencepiece\n",
+ " Downloading sentencepiece-0.1.99-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (1.3 MB)\n",
+ "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m1.3/1.3 MB\u001b[0m \u001b[31m9.7 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n",
+ "\u001b[?25hInstalling collected packages: sentencepiece\n",
+ "Successfully installed sentencepiece-0.1.99\n"
+ ]
+ }
+ ]
+ },
+ {
+ "cell_type": "code",
+ "source": [
+ "import sentencepiece as spm"
+ ],
+ "metadata": {
+ "id": "N3j11OrLzxFC"
+ },
+ "execution_count": 4,
+ "outputs": []
+ },
+ {
+ "cell_type": "code",
+ "source": [
+ "def train_sentencepiece_model(input_file, model_prefix, model_type='bpe', vocab_size=16000):\n",
+ " # Train SentencePiece model\n",
+ " spm.SentencePieceTrainer.train(\n",
+ " input=input_file,\n",
+ " model_prefix=model_prefix,\n",
+ " vocab_size=vocab_size,\n",
+ " model_type=model_type,\n",
+ " max_sentence_length=10000,\n",
+ " input_sentence_size=1000000,\n",
+ " split_by_whitespace=\"false\",\n",
+ " )"
+ ],
+ "metadata": {
+ "id": "LITW3pSpz2Vp"
+ },
+ "execution_count": 10,
+ "outputs": []
+ },
+ {
+ "cell_type": "code",
+ "source": [
+ "train_sentencepiece_model('drive/MyDrive/vi.txt', 'tokenizer.vi_bpe')"
+ ],
+ "metadata": {
+ "id": "DuMgFiV60C_K"
+ },
+ "execution_count": 11,
+ "outputs": []
+ },
+ {
+ "cell_type": "code",
+ "source": [
+ "def tokenize_text(text, model_path):\n",
+ " # Load trained model\n",
+ " sp = spm.SentencePieceProcessor()\n",
+ " sp.load(model_path)\n",
+ "\n",
+ " # Tokenize the input sentence\n",
+ " tokens = sp.encode(text, out_type=str)\n",
+ " return tokens"
+ ],
+ "metadata": {
+ "id": "n3f4z8Ky6PcQ"
+ },
+ "execution_count": 7,
+ "outputs": []
+ },
+ {
+ "cell_type": "code",
+ "source": [
+ "sentence = \"Cuộc thi sáng tác truyện tranh đến từ Nhật Bản, dành cho các họa sĩ Việt Nam!\"\n",
+ "model_path = 'tokenizer.vi_bpe.model'\n",
+ "tokens = tokenize_text(sentence, model_path)\n",
+ "print(\"Tokens:\", tokens)"
+ ],
+ "metadata": {
+ "colab": {
+ "base_uri": "https://localhost:8080/"
+ },
+ "id": "Zyjy-JJgHxUm",
+ "outputId": "e797ad00-0192-456a-963f-eed44274eae9"
+ },
+ "execution_count": 12,
+ "outputs": [
+ {
+ "output_type": "stream",
+ "name": "stdout",
+ "text": [
+ "Tokens: ['▁Cuộc▁thi', '▁sáng▁tác', '▁truyện', '▁tranh', '▁đến▁từ', '▁Nhật▁Bản', ',', '▁dành▁cho▁các', '▁họa▁sĩ', '▁Việt▁Nam', '!']\n"
+ ]
+ }
+ ]
+ },
+ {
+ "cell_type": "code",
+ "source": [],
+ "metadata": {
+ "id": "IN85a_AcHz2G"
+ },
+ "execution_count": null,
+ "outputs": []
+ }
+ ]
+ }
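The notebook only prints the encoded pieces. As a hedged usage sketch (the path assumes the committed .model file has been fetched from LFS), the same processor can also map pieces back to text and to the integer ids a translation model would consume:

```python
import sentencepiece as spm

# Load the committed Vietnamese BPE tokenizer; the path assumes the LFS object is available locally.
sp = spm.SentencePieceProcessor()
sp.load("first train/bpe/tokenizer.vi_bpe.model")

sentence = "Cuộc thi sáng tác truyện tranh đến từ Nhật Bản, dành cho các họa sĩ Việt Nam!"
pieces = sp.encode(sentence, out_type=str)  # subword pieces, as printed in the notebook
ids = sp.encode(sentence, out_type=int)     # integer ids for the encoder/decoder
print(sp.decode(pieces))                    # decoding should restore the original text
print(len(ids), "ids;", sp.get_piece_size(), "pieces in the vocabulary")
```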
first train/unigram/tokenizer.lo_unigram.model ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:729b4b7b147eeee850788ca0e173f39ec88cab8bda1dac51df7124bd1c1dddac
+ size 1762304
first train/unigram/tokenizer.lo_unigram.vocab ADDED
The diff for this file is too large to render. See raw diff
 
first train/unigram/tokenizer.vi_unigram.model ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:dcb9c84235dc09be9a349ada98e8c8d854bab276d25150ede5b53617871e0607
+ size 688055
first train/unigram/tokenizer.vi_unigram.vocab ADDED
The diff for this file is too large to render. See raw diff
 
first train/vocab.lo.pkl ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:c7aa47d8be3e140ed3c43437dc2b565e4e35f674e2ebefcfecb4fd709d3acbad
+ size 1265180
first train/vocab.vi.pkl ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:793d8ecadcfdd00471dfc16c443fdfb3e18559c36d3d587320a87da30b6f26dc
+ size 494470
second train/model.pkl ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:b91113dd55f92f3d869415ce71ad65054c4dca3b1aa4183a551ec9cd30d7a282
+ size 363893090
second train/vocab.lo.pkl ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:4f864e51737ffc6c33c02b7a8e24deac5e46b20dad29424c62efd1d6584f5437
+ size 2541857
second train/vocab.vi.pkl ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:1560cf76c6c08fff6fcd09364f4b7d25d134b165ed768c7f82805a1c73ec8093
+ size 1768831
third train/model.pkl ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:e879fd6a2ac4f18b7c7a5fab73acc358efa4a7c0bd93a29320d5ba7b5fb7d439
+ size 371094450
third train/vocab.lo.pkl ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:c189ed4f3699d6559c1b4a4a706c41da679c6c2aa34a44ee92a8b329bced0410
+ size 2648975
third train/vocab.vi.pkl ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:c3cf487737a345c68b016d47922edc183c3d4af129c29308013e454ed6b525bd
+ size 1819136