Nad1011 committed on
Commit cd81fc7 · 1 Parent(s): e08131e

Upload 21 files

first train/bpe/tokenizer.lo_bpe.model ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:57835e0a538802d5de3903f12483820a98949f25b7fccbb3ff8a73c8c0969fdb
+ size 1663887
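The three-line blocks in this diff are Git LFS pointer files: the binary itself lives in LFS storage, and the pointer records only the spec version, a SHA-256 object id, and the size in bytes. As a minimal sketch (the local paths are assumptions, not part of this commit), a pointer can be parsed and a fetched object checked against it like this:

```python
import hashlib
from pathlib import Path

def parse_lfs_pointer(pointer_path):
    """Parse a Git LFS pointer file into a {version, oid, size} dict."""
    fields = {}
    for line in Path(pointer_path).read_text().splitlines():
        if line.strip():
            key, value = line.split(" ", 1)
            fields[key] = value
    return fields

def verify_lfs_object(pointer_path, object_path):
    """Check a fetched file against the oid and size recorded in its pointer."""
    fields = parse_lfs_pointer(pointer_path)
    expected_oid = fields["oid"].split(":", 1)[1]  # drop the "sha256:" prefix
    data = Path(object_path).read_bytes()
    return hashlib.sha256(data).hexdigest() == expected_oid and len(data) == int(fields["size"])

# Hypothetical usage, assuming the LFS object was downloaded separately:
# verify_lfs_object("first train/bpe/tokenizer.lo_bpe.model", "downloads/tokenizer.lo_bpe.model")
```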
first train/bpe/tokenizer.lo_bpe.vocab ADDED
The diff for this file is too large to render. See raw diff
 
first train/bpe/tokenizer.vi_bpe.model ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:57ad603b2251531ed49a27f5aa5ddc70cc8a65841d5626a2575e2442089950f0
+ size 577154
first train/bpe/tokenizer.vi_bpe.vocab ADDED
The diff for this file is too large to render. See raw diff
 
first train/corpus.zip ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:c2fc2e449f1051612abe8795f0a8df744ead3736be8e40c3dadc181869efd965
+ size 485338322
first train/log.txt ADDED
@@ -0,0 +1,95 @@
+ Loading config ...
+ Loading data ...
+ Building vocab ...
+ Creating iterator ...
+ Building dataset ...
+ Building vocab from dataset ...
+ Load vocab from path successful
+ Building encoder and decoder ...
+ src vocab size = 29393
+ trg vocab size = 15202
+ Encoder: 34065920 parameters
+ Decoder: 40908642 parameters
+ Starting training on cuda
+ Performing training...
+ ==================================================
+ Epoch: 01 - 15.0m40.41644310951233s
+ Train Loss/PPL: 9.094 / 8899.793
+ Val Loss/PPL: 8.620 / 5540.010
+ --------------------------------------------------
+ Epoch: 02 - 15.0m38.90395212173462s
+ Train Loss/PPL: 8.185 / 3586.749
+ Val Loss/PPL: 8.069 / 3193.848
+ --------------------------------------------------
+ Epoch: 03 - 15.0m39.11497640609741s
+ Train Loss/PPL: 7.795 / 2427.533
+ Val Loss/PPL: 7.912 / 2729.002
+ --------------------------------------------------
+ Epoch: 04 - 15.0m42.52194285392761s
+ Train Loss/PPL: 7.661 / 2123.347
+ Val Loss/PPL: 7.859 / 2589.700
+ --------------------------------------------------
+ Epoch: 05 - 15.0m42.61946368217468s
+ Train Loss/PPL: 7.604 / 2005.850
+ Val Loss/PPL: 7.837 / 2532.609
+ --------------------------------------------------
+ Epoch: 06 - 15.0m40.5325984954834s
+ Train Loss/PPL: 7.570 / 1938.907
+ Val Loss/PPL: 7.822 / 2493.998
+ --------------------------------------------------
+ Epoch: 07 - 15.0m44.441715240478516s
+ Train Loss/PPL: 7.546 / 1893.262
+ Val Loss/PPL: 7.812 / 2469.149
+ --------------------------------------------------
+ Epoch: 08 - 15.0m43.27636504173279s
+ Train Loss/PPL: 7.525 / 1854.688
+ Val Loss/PPL: 7.800 / 2441.054
+ --------------------------------------------------
+ Epoch: 09 - 17.0m49.64024472236633s
+ Train Loss/PPL: 7.509 / 1823.568
+ Val Loss/PPL: 7.790 / 2415.858
+ --------------------------------------------------
+ Epoch: 10 - 15.0m41.81872010231018s
+ Train Loss/PPL: 7.492 / 1793.774
+ Val Loss/PPL: 7.780 / 2391.125
+ --------------------------------------------------
+ Epoch: 11 - 28.0m3.3641841411590576s
+ Train Loss/PPL: 7.477 / 1767.388
+ Val Loss/PPL: 7.772 / 2373.962
+ --------------------------------------------------
+ Epoch: 12 - 15.0m45.12012314796448s
+ Train Loss/PPL: 7.463 / 1742.621
+ Val Loss/PPL: 7.763 / 2350.974
+ --------------------------------------------------
+ Epoch: 13 - 15.0m42.93015956878662s
+ Train Loss/PPL: 7.449 / 1718.568
+ Val Loss/PPL: 7.756 / 2335.491
+ --------------------------------------------------
+ Epoch: 14 - 15.0m44.00054144859314s
+ Train Loss/PPL: 7.438 / 1699.215
+ Val Loss/PPL: 7.748 / 2317.051
+ --------------------------------------------------
+ Epoch: 15 - 15.0m55.463807582855225s
+ Train Loss/PPL: 7.426 / 1679.351
+ Val Loss/PPL: 7.741 / 2301.697
+ --------------------------------------------------
+ Epoch: 16 - 15.0m44.77303099632263s
+ Train Loss/PPL: 7.415 / 1660.209
+ Val Loss/PPL: 7.733 / 2282.415
+ --------------------------------------------------
+ Epoch: 17 - 15.0m44.32082152366638s
+ Train Loss/PPL: 7.405 / 1643.373
+ Val Loss/PPL: 7.726 / 2266.476
+ --------------------------------------------------
+ Epoch: 18 - 15.0m43.58943033218384s
+ Train Loss/PPL: 7.395 / 1627.018
+ Val Loss/PPL: 7.719 / 2250.929
+ --------------------------------------------------
+ Epoch: 19 - 15.0m45.2637825012207s
+ Train Loss/PPL: 7.386 / 1613.311
+ Val Loss/PPL: 7.712 / 2234.829
+ --------------------------------------------------
+ Epoch: 20 - 15.0m43.84825682640076s
+ Train Loss/PPL: 7.376 / 1597.986
+ Val Loss/PPL: 7.706 / 2220.886
+ --------------------------------------------------
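The Loss/PPL pairs reported above are consistent with perplexity being the exponential of the cross-entropy loss, up to the rounding of the printed loss; a quick check using figures copied from the log:

```python
import math

# (loss, perplexity) pairs taken from the epoch 01 train, epoch 01 val, and epoch 20 train lines of log.txt
for loss, reported_ppl in [(9.094, 8899.793), (8.620, 5540.010), (7.376, 1597.986)]:
    print(f"exp({loss}) = {math.exp(loss):.1f}  (log reports {reported_ppl})")
```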
first train/model.pkl ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:eb643ec4534bec110cbb4d9b4af5dac47ac8a540c25fd4835974e8d87c340cc1
+ size 299975138
first train/setup.md ADDED
@@ -0,0 +1,16 @@
+ Put the trained model in this directory
+
+ tokenizer.vi.model {
+ model_type=unigram
+ vocab_size=20000
+ max_sentence_length=100000
+ split_by_whitespace=false
+ input_sentence_size=700000
+ }
+ tokenizer.vi_bpe.model {
+ model_type=bpe
+ vocab_size=16000
+ max_sentence_length=100000
+ split_by_whitespace=false
+ input_sentence_size=1000000
+ }
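setup.md records the SentencePiece settings behind the two Vietnamese tokenizers. As a rough sketch of how the unigram model described by the first block could be reproduced (the input path vi.txt is a placeholder for the Vietnamese side of the corpus in corpus.zip, not a file in this commit):

```python
import sentencepiece as spm

# Settings copied from the tokenizer.vi.model block in setup.md;
# the input path is a placeholder, not part of this repository.
spm.SentencePieceTrainer.train(
    input="vi.txt",
    model_prefix="tokenizer.vi",
    model_type="unigram",
    vocab_size=20000,
    max_sentence_length=100000,
    split_by_whitespace=False,
    input_sentence_size=700000,
)
```

The BPE variant follows the same pattern with model_type="bpe", as shown in tokenizer_training.ipynb below.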
first train/tokenizer_training.ipynb ADDED
@@ -0,0 +1,179 @@
+ {
+ "nbformat": 4,
+ "nbformat_minor": 0,
+ "metadata": {
+ "colab": {
+ "provenance": [],
+ "gpuType": "T4",
+ "authorship_tag": "ABX9TyPXgKZqJoVuio+h58qyoujZ",
+ "include_colab_link": true
+ },
+ "kernelspec": {
+ "name": "python3",
+ "display_name": "Python 3"
+ },
+ "language_info": {
+ "name": "python"
+ },
+ "accelerator": "GPU"
+ },
+ "cells": [
+ {
+ "cell_type": "markdown",
+ "metadata": {
+ "id": "view-in-github",
+ "colab_type": "text"
+ },
+ "source": [
+ "<a href=\"https://colab.research.google.com/github/Erioldeth/Viet-Laos-Translator/blob/main/tokenizer_training.ipynb\" target=\"_parent\"><img src=\"https://colab.research.google.com/assets/colab-badge.svg\" alt=\"Open In Colab\"/></a>"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 1,
+ "metadata": {
+ "colab": {
+ "base_uri": "https://localhost:8080/"
+ },
+ "id": "g6isgzoaxWTr",
+ "outputId": "8ea96348-ea45-4d2e-e0a5-b76f3cfbb255"
+ },
+ "outputs": [
+ {
+ "output_type": "stream",
+ "name": "stdout",
+ "text": [
+ "Mounted at /content/drive\n"
+ ]
+ }
+ ],
+ "source": [
+ "from google.colab import drive\n",
+ "drive.mount('/content/drive')"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "source": [
+ "!pip install sentencepiece"
+ ],
+ "metadata": {
+ "colab": {
+ "base_uri": "https://localhost:8080/"
+ },
+ "id": "2ZYK4GgzzSG4",
+ "outputId": "be748680-ccff-45e0-d0b5-91f9a9887608"
+ },
+ "execution_count": 2,
+ "outputs": [
+ {
+ "output_type": "stream",
+ "name": "stdout",
+ "text": [
+ "Collecting sentencepiece\n",
+ " Downloading sentencepiece-0.1.99-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (1.3 MB)\n",
+ "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m1.3/1.3 MB\u001b[0m \u001b[31m9.7 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n",
+ "\u001b[?25hInstalling collected packages: sentencepiece\n",
+ "Successfully installed sentencepiece-0.1.99\n"
+ ]
+ }
+ ]
+ },
+ {
+ "cell_type": "code",
+ "source": [
+ "import sentencepiece as spm"
+ ],
+ "metadata": {
+ "id": "N3j11OrLzxFC"
+ },
+ "execution_count": 4,
+ "outputs": []
+ },
+ {
+ "cell_type": "code",
+ "source": [
+ "def train_sentencepiece_model(input_file, model_prefix, model_type='bpe', vocab_size=16000):\n",
+ " # Train SentencePiece model\n",
+ " spm.SentencePieceTrainer.train(\n",
+ " input=input_file,\n",
+ " model_prefix=model_prefix,\n",
+ " vocab_size=vocab_size,\n",
+ " model_type=model_type,\n",
+ " max_sentence_length=10000,\n",
+ " input_sentence_size=1000000,\n",
+ " split_by_whitespace=\"false\",\n",
+ " )"
+ ],
+ "metadata": {
+ "id": "LITW3pSpz2Vp"
+ },
+ "execution_count": 10,
+ "outputs": []
+ },
+ {
+ "cell_type": "code",
+ "source": [
+ "train_sentencepiece_model('drive/MyDrive/vi.txt', 'tokenizer.vi_bpe')"
+ ],
+ "metadata": {
+ "id": "DuMgFiV60C_K"
+ },
+ "execution_count": 11,
+ "outputs": []
+ },
+ {
+ "cell_type": "code",
+ "source": [
+ "def tokenize_text(text, model_path):\n",
+ " # Load trained model\n",
+ " sp = spm.SentencePieceProcessor()\n",
+ " sp.load(model_path)\n",
+ "\n",
+ " # Tokenize the input sentence\n",
+ " tokens = sp.encode(text, out_type=str)\n",
+ " return tokens"
+ ],
+ "metadata": {
+ "id": "n3f4z8Ky6PcQ"
+ },
+ "execution_count": 7,
+ "outputs": []
+ },
+ {
+ "cell_type": "code",
+ "source": [
+ "sentence = \"Cuộc thi sáng tác truyện tranh đến từ Nhật Bản, dành cho các họa sĩ Việt Nam!\"\n",
+ "model_path = 'tokenizer.vi_bpe.model'\n",
+ "tokens = tokenize_text(sentence, model_path)\n",
+ "print(\"Tokens:\", tokens)"
+ ],
+ "metadata": {
+ "colab": {
+ "base_uri": "https://localhost:8080/"
+ },
+ "id": "Zyjy-JJgHxUm",
+ "outputId": "e797ad00-0192-456a-963f-eed44274eae9"
+ },
+ "execution_count": 12,
+ "outputs": [
+ {
+ "output_type": "stream",
+ "name": "stdout",
+ "text": [
+ "Tokens: ['▁Cuộc▁thi', '▁sáng▁tác', '▁truyện', '▁tranh', '▁đến▁từ', '▁Nhật▁Bản', ',', '▁dành▁cho▁các', '▁họa▁sĩ', '▁Việt▁Nam', '!']\n"
+ ]
+ }
+ ]
+ },
+ {
+ "cell_type": "code",
+ "source": [],
+ "metadata": {
+ "id": "IN85a_AcHz2G"
+ },
+ "execution_count": null,
+ "outputs": []
+ }
+ ]
+ }
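The notebook only prints the encoded pieces. As a hedged usage sketch (the path assumes the committed .model file has been fetched from LFS), the same processor can also map pieces back to text and to the integer ids a translation model would consume:

```python
import sentencepiece as spm

# Load the committed Vietnamese BPE tokenizer; the path assumes the LFS object is available locally.
sp = spm.SentencePieceProcessor()
sp.load("first train/bpe/tokenizer.vi_bpe.model")

sentence = "Cuộc thi sáng tác truyện tranh đến từ Nhật Bản, dành cho các họa sĩ Việt Nam!"
pieces = sp.encode(sentence, out_type=str)  # subword pieces, as printed in the notebook
ids = sp.encode(sentence, out_type=int)     # integer ids for the encoder/decoder
print(sp.decode(pieces))                    # decoding should restore the original text
print(len(ids), "ids;", sp.get_piece_size(), "pieces in the vocabulary")
```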
first train/unigram/tokenizer.lo_unigram.model ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:729b4b7b147eeee850788ca0e173f39ec88cab8bda1dac51df7124bd1c1dddac
+ size 1762304
first train/unigram/tokenizer.lo_unigram.vocab ADDED
The diff for this file is too large to render. See raw diff
 
first train/unigram/tokenizer.vi_unigram.model ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:dcb9c84235dc09be9a349ada98e8c8d854bab276d25150ede5b53617871e0607
+ size 688055
first train/unigram/tokenizer.vi_unigram.vocab ADDED
The diff for this file is too large to render. See raw diff
 
first train/vocab.lo.pkl ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:c7aa47d8be3e140ed3c43437dc2b565e4e35f674e2ebefcfecb4fd709d3acbad
+ size 1265180
first train/vocab.vi.pkl ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:793d8ecadcfdd00471dfc16c443fdfb3e18559c36d3d587320a87da30b6f26dc
+ size 494470
second train/model.pkl ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:b91113dd55f92f3d869415ce71ad65054c4dca3b1aa4183a551ec9cd30d7a282
+ size 363893090
second train/vocab.lo.pkl ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:4f864e51737ffc6c33c02b7a8e24deac5e46b20dad29424c62efd1d6584f5437
+ size 2541857
second train/vocab.vi.pkl ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:1560cf76c6c08fff6fcd09364f4b7d25d134b165ed768c7f82805a1c73ec8093
+ size 1768831
third train/model.pkl ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:e879fd6a2ac4f18b7c7a5fab73acc358efa4a7c0bd93a29320d5ba7b5fb7d439
+ size 371094450
third train/vocab.lo.pkl ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:c189ed4f3699d6559c1b4a4a706c41da679c6c2aa34a44ee92a8b329bced0410
+ size 2648975
third train/vocab.vi.pkl ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:c3cf487737a345c68b016d47922edc183c3d4af129c29308013e454ed6b525bd
+ size 1819136