Upload folder using huggingface_hub
This view is limited to 50 files because it contains too many changes. See raw diff.
- .gitattributes +10 -0
- .vector_cache/word2vec_vi_syllables_100dims.txt.pt +3 -0
- abbreviations.json +363 -0
- bilstm_best.keras +3 -0
- bilstm_emotion_model/bilstm_model.keras +3 -0
- bilstm_emotion_model/classification_report.txt +33 -0
- bilstm_emotion_model/label_mapping.json +9 -0
- bilstm_emotion_model/vocabulary.json +0 -0
- cnn_lstm_best.keras +3 -0
- cnn_lstm_emotion_model/classification_report.txt +33 -0
- cnn_lstm_emotion_model/cnn_lstm_model.keras +3 -0
- cnn_lstm_model.keras +3 -0
- flagged/log.csv +2 -0
- logs/events.out.tfevents.1736834439.ai1gpu-virtual-machine.52042.0 +3 -0
- logs/events.out.tfevents.1736835355.ai1gpu-virtual-machine.52042.1 +3 -0
- logs/events.out.tfevents.1736835689.ai1gpu-virtual-machine.52955.0 +3 -0
- logs/events.out.tfevents.1736835769.ai1gpu-virtual-machine.53242.0 +3 -0
- logs/events.out.tfevents.1736835850.ai1gpu-virtual-machine.53528.0 +3 -0
- logs/events.out.tfevents.1736835995.ai1gpu-virtual-machine.53982.0 +3 -0
- logs/events.out.tfevents.1736836066.ai1gpu-virtual-machine.54029.0 +3 -0
- logs/events.out.tfevents.1736836768.ai1gpu-virtual-machine.55099.0 +3 -0
- logs/events.out.tfevents.1736841979.ai1gpu-virtual-machine.55099.1 +3 -0
- logs/events.out.tfevents.1736844609.ai1gpu-virtual-machine.66743.0 +3 -0
- logs/events.out.tfevents.1736852947.ai1gpu-virtual-machine.76812.0 +3 -0
- logs/events.out.tfevents.1736858105.ai1gpu-virtual-machine.76812.1 +3 -0
- logs/events.out.tfevents.1736858545.ai1gpu-virtual-machine.87908.0 +3 -0
- logs/events.out.tfevents.1736858698.ai1gpu-virtual-machine.88011.0 +3 -0
- logs/events.out.tfevents.1736864229.ai1gpu-virtual-machine.88011.1 +3 -0
- logs/events.out.tfevents.1736907563.ai1gpu-virtual-machine.145430.0 +3 -0
- logs/events.out.tfevents.1736908155.ai1gpu-virtual-machine.146675.0 +3 -0
- logs/events.out.tfevents.1736911863.ai1gpu-virtual-machine.152249.0 +3 -0
- logs/events.out.tfevents.1736916063.ai1gpu-virtual-machine.152249.1 +3 -0
- main_BILSTM.py +573 -0
- main_RNN_CNN-LSTM.py +738 -0
- main_lstm.py +289 -0
- main_phobert.py +349 -0
- main_svm.py +261 -0
- main_v1.py +494 -0
- phobert_emotion_model/classification_report.txt +23 -0
- phobert_emotion_model/confusion_matrix.png +0 -0
- phobert_emotion_model/id2label.json +9 -0
- phobert_emotion_model/phobert_emotion_model/added_tokens.json +3 -0
- phobert_emotion_model/phobert_emotion_model/bpe.codes +0 -0
- phobert_emotion_model/phobert_emotion_model/config.json +48 -0
- phobert_emotion_model/phobert_emotion_model/model.safetensors +3 -0
- phobert_emotion_model/phobert_emotion_model/special_tokens_map.json +9 -0
- phobert_emotion_model/phobert_emotion_model/tokenizer_config.json +54 -0
- phobert_emotion_model/phobert_emotion_model/vocab.txt +0 -0
- phobert_results/checkpoint-10410/added_tokens.json +3 -0
- phobert_results/checkpoint-10410/bpe.codes +0 -0
.gitattributes
CHANGED
@@ -33,3 +33,13 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
 *.zip filter=lfs diff=lfs merge=lfs -text
 *.zst filter=lfs diff=lfs merge=lfs -text
 *tfevents* filter=lfs diff=lfs merge=lfs -text
+bilstm_best.keras filter=lfs diff=lfs merge=lfs -text
+bilstm_emotion_model/bilstm_model.keras filter=lfs diff=lfs merge=lfs -text
+cnn_lstm_best.keras filter=lfs diff=lfs merge=lfs -text
+cnn_lstm_emotion_model/cnn_lstm_model.keras filter=lfs diff=lfs merge=lfs -text
+cnn_lstm_model.keras filter=lfs diff=lfs merge=lfs -text
+processed.xlsx filter=lfs diff=lfs merge=lfs -text
+processed_phobert.xlsx filter=lfs diff=lfs merge=lfs -text
+processed_svm.xlsx filter=lfs diff=lfs merge=lfs -text
+train.xlsx filter=lfs diff=lfs merge=lfs -text
+word2vec_vi_syllables_100dims.txt filter=lfs diff=lfs merge=lfs -text
.vector_cache/word2vec_vi_syllables_100dims.txt.pt
ADDED
version https://git-lfs.github.com/spec/v1
oid sha256:3390520329ebe14cddb38384d80bd8b6e4948e023977ba5dbe32235b4a3503e7
size 418631353
abbreviations.json
ADDED
{
  "ad": ["admin", "quản trị viên"],
  "bb": ["bye bye", "tạm biệt"],
  "bl": ["bình luận"],
  "bth": ["bình thường"],
  "bmn": ["bạn muốn"],
  "cxk": ["cũng không"],
  "đm": ["đ** m**"],
  "gg": ["good game", "Google"],
  "hc": ["học"],
  "kq": ["kết quả"],
  "kb": ["kết bạn"],
  "khá": ["khá là"],
  "lq": ["liên quan"],
  "lmh": ["làm gì thế"],
  "ng": ["người"],
  "nsao": ["nói sao"],
  "nv": ["nhân vật"],
  "nvay": ["như vậy"],
  "nxk": ["nói không"],
  "ob": ["ông bà"],
  "pc": ["phải không"],
  "ph": ["phim"],
  "ql": ["quản lý"],
  "qt": ["quá trời"],
  "sdt": ["số điện thoại"],
  "sk": ["sức khỏe"],
  "tc": ["tài chính"],
  "td": ["tâm điểm", "tập đoàn"],
  "th": ["thôi"],
  "tl": ["trả lời"],
  "ty": ["tình yêu"],
  "up": ["cập nhật", "update"],
  "xđ": ["xác định"],
  "zui": ["vui"],
  "zời": ["trời"],
  "hdsd": ["hướng dẫn sử dụng"],
  "bbq": ["barbecue", "tiệc nướng"],
  "cx": ["chắc chắn", "cũng"],
  "vkc": ["vãi kinh"],
  "kt": ["kiểm tra", "không thèm"],
  "tks": ["thanks", "cảm ơn"],
  "đg": ["đang"],
  "qa": ["quá"],
  "ht": ["học tập", "hoàn tất"],
  "clgt": ["cái l** gì thế"],
  "pls": ["please", "làm ơn"],
  "qtqđ": ["quá trời quá đất"],
  "klq": ["không liên quan"],
  "mn": ["mọi người"],
  "vc": ["vãi chưởng", "vợ chồng"],
  "vch": ["vãi chưởng"],
  "cđ": ["cuộc đời"],
  "đhs": ["đ** hiểu sao"],
  "ib": ["inbox", "nhắn tin"],
  "ttyl": ["talk to you later", "nói chuyện sau"],
  "stt": ["status", "trạng thái"],
  "sr": ["sorry", "xin lỗi"],
  "bn": ["bao nhiêu", "bạn"],
  "ckmnl": ["chào cả nhà mình nha l"],
  "cr": ["crush"],
  "mng": ["mọi người"],
  "vl": ["vãi l", "rất"],
  "khbn": ["không biết nữa"],
  "qtq": ["quá trời quá"],
  "sml": ["sấp mặt luôn"],
  "ns": ["nói"],
  "ăn h": ["ăn hành"],
  "qh": ["quan hệ"],
  "ăn b": ["ăn bánh"],
  "hph": ["hạnh phúc"],
  "ngta": ["người ta"],
  "mnk": ["mọi người không"],
  "ahihi": ["cười đùa"],
  "chz": ["chuyện"],
  "vđ": ["vấn đề"],
  "pp": ["bye bye", "tạm biệt"],
  "dc": ["được"],
  "nt": ["nhắn tin"],
  "thik": ["thích"],
  "bt": ["biết", "bình thường"],
  "kp": ["không phải"],
  "mik": ["mình"],
  "lm": ["làm"],
  "nx": ["nữa"],
  "mk": ["mình", "mày"],
  "cmt": ["comment", "bình luận"],
  "rep": ["trả lời", "phản hồi"],
  "fa": ["độc thân", "forever alone"],
  "chx": ["chưa"],
  "qlq": ["quản lý quán"],
  "a": ["anh"],
  "e": ["em"],
  "ko": ["không"],
  "kh": ["không"],
  "z": ["vậy"],
  "ny": ["người yêu"],
  "l": ["là"],
  "sn": ["sinh nhật"],
  "ckk": ["chúc ngủ ngon"],
  "hpbd": ["happy birthday"],
  "tt": ["thông tin", "tương tác"],
  "ms": ["mới"],
  "k": ["không"],
  "vk": ["vợ"],
  "ck": ["chồng"],
  "j": ["gì"],
  "m": ["mày"],
  "t": ["tao"],
  "sgk": ["sách giáo khoa"],
  "cv": ["công việc"],
  "pv": ["phục vụ"],
  "dth": ["dễ thương"],
  "gato": ["ghen ăn tức ở"]
}
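Each key in abbreviations.json maps a chat-style shorthand to one or more expansions; the preprocessing code in main_BILSTM.py (replace_abbreviations) substitutes a matching token with all of its expansions joined by spaces. A minimal sketch of that lookup, assuming the JSON file sits next to the script:

import json

# Load the abbreviation dictionary committed above.
with open("abbreviations.json", encoding="utf-8") as f:
    abbreviations = json.load(f)

def expand_abbreviations(sentence: str) -> str:
    # Mirrors replace_abbreviations in main_BILSTM.py: any whitespace-separated
    # token found in the dictionary is replaced by all of its expansions.
    return " ".join(
        " ".join(abbreviations[w]) if w in abbreviations else w
        for w in sentence.split()
    )

print(expand_abbreviations("mik ko bt"))  # -> "mình không biết bình thường"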
bilstm_best.keras
ADDED
version https://git-lfs.github.com/spec/v1
oid sha256:085cb3b7394a3db69287c6ede56834dfc9d6e56e2f169c5a05e49ffb5267fb6a
size 13203552
bilstm_emotion_model/bilstm_model.keras
ADDED
version https://git-lfs.github.com/spec/v1
oid sha256:40715c89bc3bc193a953c792527898450dd10979bd0bcd62ed32b8df471fa2bb
size 13203552
bilstm_emotion_model/classification_report.txt
ADDED
========== BiLSTM Classification Report ==========
              precision    recall  f1-score   support

   Enjoyment     0.6490    0.7296    0.6869       991
        Fear     0.5580    0.4709    0.5108       327
     Sadness     0.4580    0.4747    0.4662       356
       Anger     0.6587    0.6748    0.6667       369
       Other     0.6601    0.6733    0.6667       600
     Disgust     0.4967    0.4488    0.4715       332
    Surprise     0.4683    0.3620    0.4083       326

    accuracy                         0.5956      3301
   macro avg     0.5641    0.5477    0.5539      3301
weighted avg     0.5893    0.5956    0.5905      3301

========== Additional Metrics ==========
Test Loss: 2.0363
Test Accuracy: 0.5956
Precision (Macro): 0.5641
Precision (Weighted): 0.5893
Recall (Macro): 0.5477
Recall (Weighted): 0.5956
F1-Score (Macro): 0.5539
F1-Score (Weighted): 0.5905

========== Confusion Matrix ==========
[[723  23  83   3  81  29  49]
 [ 38 154  26  72  10  14  13]
 [108  14 169   2  30  23  10]
 [ 13  42  12 249  14  29  10]
 [110   9  30   9 404  18  20]
 [ 32  25  26  30  38 149  32]
 [ 90   9  23  13  35  38 118]]
bilstm_emotion_model/label_mapping.json
ADDED
{
    "Enjoyment": 0,
    "Fear": 1,
    "Sadness": 2,
    "Anger": 3,
    "Other": 4,
    "Disgust": 5,
    "Surprise": 6
}
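label_mapping.json records the label-to-index assignment produced during training; at inference main_BILSTM.py inverts it to turn an argmax index back into an emotion name. A minimal decoding sketch, assuming the file above has been committed as shown:

import json

with open("bilstm_emotion_model/label_mapping.json", encoding="utf-8") as f:
    label_mapping = json.load(f)  # {"Enjoyment": 0, ..., "Surprise": 6}

# Invert the mapping to decode model outputs (class index -> label).
id2label = {idx: label for label, idx in label_mapping.items()}
print(id2label[3])  # -> "Anger"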
bilstm_emotion_model/vocabulary.json
ADDED
The diff for this file is too large to render. See raw diff.
cnn_lstm_best.keras
ADDED
version https://git-lfs.github.com/spec/v1
oid sha256:e98590341cdfcc831873ee3fddc3c17f16a350085df1e302e2e22a4eda0c03ad
size 13535600
cnn_lstm_emotion_model/classification_report.txt
ADDED
========== CNN-LSTM Classification Report ==========
              precision    recall  f1-score   support

   Enjoyment     0.6977    0.7265    0.7118       991
        Fear     0.5526    0.6269    0.5874       327
     Sadness     0.4955    0.4663    0.4805       356
       Anger     0.7022    0.6070    0.6512       369
       Other     0.6740    0.7650    0.7166       600
     Disgust     0.5194    0.4849    0.5016       332
    Surprise     0.5020    0.3896    0.4387       326

    accuracy                         0.6247      3301
   macro avg     0.5919    0.5809    0.5840      3301
weighted avg     0.6204    0.6247    0.6205      3301

========== Additional Metrics ==========
Test Loss: 1.6124
Test Accuracy: 0.6247
Precision (Macro): 0.5919
Precision (Weighted): 0.6204
Recall (Macro): 0.5809
Recall (Weighted): 0.6247
F1-Score (Macro): 0.5840
F1-Score (Weighted): 0.6205

========== Confusion Matrix ==========
[[720  28  69  11  93  37  33]
 [ 34 205  13  39  10  14  12]
 [ 92  22 166   7  31  19  19]
 [ 13  62  13 224  17  34   6]
 [ 56  15  29   6 459  10  25]
 [ 34  21  22  27  36 161  31]
 [ 83  18  23   5  35  35 127]]
cnn_lstm_emotion_model/cnn_lstm_model.keras
ADDED
version https://git-lfs.github.com/spec/v1
oid sha256:c45256b322b2360c9ba9e0c5da5fd42705f7d4395f6c1d4c6a94035e43bf05d0
size 13535600
cnn_lstm_model.keras
ADDED
version https://git-lfs.github.com/spec/v1
oid sha256:78c966f03f234f409270b699f84a635d98128de271d8492ee25776026312cd24
size 13535600
flagged/log.csv
ADDED
Nhập câu cần phân loại cảm xúc,Kết quả dự đoán,flag,username,timestamp
"Hôm nay là ngày đẹp trời, tôi muốn có người yêu 😊",Disgust,,,2025-01-14 13:57:25.419643
logs/events.out.tfevents.1736834439.ai1gpu-virtual-machine.52042.0
ADDED
version https://git-lfs.github.com/spec/v1
oid sha256:aeb26f251abccb92c7342c443b6b7c7faa2b0d0c41976053706f1c002754680a
size 23650

logs/events.out.tfevents.1736835355.ai1gpu-virtual-machine.52042.1
ADDED
version https://git-lfs.github.com/spec/v1
oid sha256:72bc950b1e422eb9db07cba8ad85db543521c38025579fcc2cce1dd799313233
size 411

logs/events.out.tfevents.1736835689.ai1gpu-virtual-machine.52955.0
ADDED
version https://git-lfs.github.com/spec/v1
oid sha256:744768bef1c4f7e54446c6a7925c8b770d2d5af70f6f76016fab9805a3802b6f
size 346

logs/events.out.tfevents.1736835769.ai1gpu-virtual-machine.53242.0
ADDED
version https://git-lfs.github.com/spec/v1
oid sha256:0843cbd924008b8a37ef65480d32b8e16241e9e059a3784b0b8ce6d097a0d0c5
size 346

logs/events.out.tfevents.1736835850.ai1gpu-virtual-machine.53528.0
ADDED
version https://git-lfs.github.com/spec/v1
oid sha256:9c3fc1113ddc32236fc69e785dfa73481178e728dd02e131bad5add13004729f
size 346

logs/events.out.tfevents.1736835995.ai1gpu-virtual-machine.53982.0
ADDED
version https://git-lfs.github.com/spec/v1
oid sha256:d3de874ab406b8d42e3f02443b3ae8fce7228cffb61c6845aab400981d1263b0
size 5228

logs/events.out.tfevents.1736836066.ai1gpu-virtual-machine.54029.0
ADDED
version https://git-lfs.github.com/spec/v1
oid sha256:2f927f800053a89cf20a14bf5a48c6343b31d9a49d5e670a4fc48ad7fb676874
size 8712

logs/events.out.tfevents.1736836768.ai1gpu-virtual-machine.55099.0
ADDED
version https://git-lfs.github.com/spec/v1
oid sha256:f2816f60b911788c30bc43168dbbe689eee10a119e1e450767e54f521cb5f03c
size 81906

logs/events.out.tfevents.1736841979.ai1gpu-virtual-machine.55099.1
ADDED
version https://git-lfs.github.com/spec/v1
oid sha256:426ac92bb076d56fd8130e04ac0064542681f9ddd70fbeb64779f10b8521bb1d
size 417

logs/events.out.tfevents.1736844609.ai1gpu-virtual-machine.66743.0
ADDED
version https://git-lfs.github.com/spec/v1
oid sha256:7ff2d9a713d3ea47e04c6361df3c62d551e983cd170de4a163798e58eed51111
size 346

logs/events.out.tfevents.1736852947.ai1gpu-virtual-machine.76812.0
ADDED
version https://git-lfs.github.com/spec/v1
oid sha256:c2cea1a1f21eb664b3b5ae8f09ae76a38a3c7a37560a4432c805772a8afb171b
size 83399

logs/events.out.tfevents.1736858105.ai1gpu-virtual-machine.76812.1
ADDED
version https://git-lfs.github.com/spec/v1
oid sha256:4e9817a200d06938057f30fdac643b1480e734857bb5337aa4f494b29d199245
size 569

logs/events.out.tfevents.1736858545.ai1gpu-virtual-machine.87908.0
ADDED
version https://git-lfs.github.com/spec/v1
oid sha256:48134c412b09adeae17bc7aac0295e48dce80cf72ce2a1f4109c159ee99819b1
size 486

logs/events.out.tfevents.1736858698.ai1gpu-virtual-machine.88011.0
ADDED
version https://git-lfs.github.com/spec/v1
oid sha256:f0165be0e6c2731ce32b3e3cbe11b5a6997120211c06d0d04c264b5c69c8f9f2
size 83399

logs/events.out.tfevents.1736864229.ai1gpu-virtual-machine.88011.1
ADDED
version https://git-lfs.github.com/spec/v1
oid sha256:e468b0b65d952e3df6c9eb4f53bb8a8f867532828522b13b8229b53ea2787f9a
size 569

logs/events.out.tfevents.1736907563.ai1gpu-virtual-machine.145430.0
ADDED
version https://git-lfs.github.com/spec/v1
oid sha256:a67cb94b4913d02142ea7fb0bbad62005700059dc0bc6670464999d33dce0daf
size 7756

logs/events.out.tfevents.1736908155.ai1gpu-virtual-machine.146675.0
ADDED
version https://git-lfs.github.com/spec/v1
oid sha256:2a882fae8ea63fa2ecf17da9e9c44bcd33568c5a998b11da0ceb6c537857223c
size 7367

logs/events.out.tfevents.1736911863.ai1gpu-virtual-machine.152249.0
ADDED
version https://git-lfs.github.com/spec/v1
oid sha256:1dcadbf84e08ca0d1c9cf9f877233b857eb144b8aa92bd28291827220a0f7ea6
size 85351

logs/events.out.tfevents.1736916063.ai1gpu-virtual-machine.152249.1
ADDED
version https://git-lfs.github.com/spec/v1
oid sha256:6eee809e23d4dd927f9c3dffb75d8184a24ae246cd0380fc93894bccc415d632
size 766
main_BILSTM.py
ADDED
# thesis.py
# -*- coding: utf-8 -*-

import pandas as pd
import emoji
import json
import re
import numpy as np
from underthesea import word_tokenize
from tqdm import tqdm
import torch
from torchtext.vocab import Vectors
from sklearn.model_selection import train_test_split
from sklearn.utils import resample
from sklearn.metrics import (
    accuracy_score,
    classification_report,
    precision_score,
    recall_score,
    f1_score,
    confusion_matrix
)
from tensorflow.keras.preprocessing.sequence import pad_sequences
from torch.utils.data import DataLoader, TensorDataset
import torch.nn as nn
import torch.optim as optim
import tensorflow as tf
import os

# ========== PREPROCESSING FUNCTIONS ==========

def preprocess_sentence(sentence, abbreviations, emoji_mapping):
    """
    Preprocess one sentence: lowercase, replace emojis, remove profanity,
    strip special characters, normalize whitespace, etc.
    """
    sentence = sentence.lower()
    sentence = replace_emojis(sentence, emoji_mapping)
    sentence = remove_profanity(sentence)
    sentence = remove_special_characters(sentence)
    sentence = normalize_whitespace(sentence)
    sentence = replace_abbreviations(sentence, abbreviations)
    sentence = remove_repeated_characters(sentence)
    sentence = replace_numbers(sentence)
    sentence = tokenize_sentence(sentence)
    return sentence

def replace_emojis(sentence, emoji_mapping):
    processed_sentence = []
    for char in sentence:
        if char in emoji_mapping:
            processed_sentence.append(emoji_mapping[char])
        elif not emoji.is_emoji(char):
            processed_sentence.append(char)
    return ''.join(processed_sentence)

def remove_profanity(sentence):
    profane_words = ["loz", "vloz", "vl", "dm", "đm", "clgt", "dmm", "cc", "vc", "đù mé", "vãi"]
    words = sentence.split()
    filtered_words = [word for word in words if word.lower() not in profane_words]
    return ' '.join(filtered_words)

def remove_special_characters(sentence):
    return re.sub(r"[\^\*@#&$%<>~{}|\\]", "", sentence)

def normalize_whitespace(sentence):
    return ' '.join(sentence.split())

def replace_abbreviations(sentence, abbreviations):
    words = sentence.split()
    replaced_words = [
        " ".join(abbreviations[word]) if word in abbreviations else word
        for word in words
    ]
    return ' '.join(replaced_words)

def remove_repeated_characters(sentence):
    # Example: "đẹp quáaaaaaa" -> "đẹp quá"
    return re.sub(r"(.)\1{2,}", r"\1", sentence)

def replace_numbers(sentence):
    # Replace all digits with the token [number]
    return re.sub(r"\d+", "[number]", sentence)

def tokenize_sentence(sentence):
    # Word segmentation with underthesea
    return ' '.join(word_tokenize(sentence))

# ========== VOCABULARY CLASS ==========

class Vocabulary:
    def __init__(self):
        self.word2id = {}
        self.word2id['<pad>'] = 0
        self.word2id['<unk>'] = 1
        self.unk_id = 1
        self.id2word = {0: '<pad>', 1: '<unk>'}

    def __getitem__(self, word):
        return self.word2id.get(word, self.unk_id)

    def __contains__(self, word):
        return word in self.word2id

    def __len__(self):
        return len(self.word2id)

    def lookup_tokens(self, indices):
        return [self.id2word[idx] for idx in indices]

    def add(self, word):
        if word not in self.word2id:
            idx = len(self.word2id)
            self.word2id[word] = idx
            self.id2word[idx] = word

    @staticmethod
    def tokenize_corpus(corpus):
        tokenized_corpus = []
        for doc in tqdm(corpus, desc="Tokenizing Corpus"):
            tokens = [w.replace(" ", "_") for w in word_tokenize(doc)]
            tokenized_corpus.append(tokens)
        return tokenized_corpus

    def corpus_to_tensor(self, corpus, is_tokenized=False):
        """
        corpus: list of sentences (strings) or list of token lists (if is_tokenized=True)
        return: list[list[int]], each sentence becomes a list of token indices
        """
        tokenized_corpus = (
            self.tokenize_corpus(corpus) if not is_tokenized else corpus
        )
        return [
            [self[token] for token in doc]
            for doc in tokenized_corpus
        ]

# ========== EMOJI MAPPING ==========

emoji_mapping = {
    "😀": "[joy]", "😃": "[joy]", "😄": "[joy]", "😁": "[joy]", "😆": "[joy]", "😅": "[joy]", "😂": "[joy]", "🤣": "[joy]",
    "🙂": "[love]", "🙃": "[love]", "😉": "[love]", "😊": "[love]", "😇": "[love]", "🥰": "[love]", "😍": "[love]",
    "🤩": "[love]", "😘": "[love]", "😗": "[love]", "☺": "[love]", "😚": "[love]", "😙": "[love]",
    "😋": "[satisfaction]", "😛": "[satisfaction]", "😜": "[satisfaction]", "🤪": "[satisfaction]", "😝": "[satisfaction]",
    "🤑": "[satisfaction]",
    "🤐": "[neutral]", "🤨": "[neutral]", "😐": "[neutral]", "😑": "[neutral]", "😶": "[neutral]",
    "😏": "[sarcasm]",
    "😒": "[disappointment]", "🙄": "[disappointment]", "😬": "[disappointment]",
    "😔": "[sadness]", "😪": "[sadness]", "😢": "[sadness]", "😭": "[sadness]", "😥": "[sadness]", "😓": "[sadness]",
    "😩": "[tiredness]", "😫": "[tiredness]", "🥱": "[tiredness]",
    "🤤": "[discomfort]", "🤢": "[discomfort]", "🤮": "[discomfort]", "🤧": "[discomfort]", "🥵": "[discomfort]",
    "🥶": "[discomfort]", "🥴": "[discomfort]", "😵": "[discomfort]", "🤯": "[discomfort]",
    "😕": "[confused]", "😟": "[confused]", "🙁": "[confused]", "☹": "[confused]",
    "😮": "[surprise]", "😯": "[surprise]", "😲": "[surprise]", "😳": "[surprise]", "🥺": "[pleading]",
    "😦": "[fear]", "😧": "[fear]", "😨": "[fear]", "😰": "[fear]", "😱": "[fear]",
    "😖": "[confusion]", "😣": "[confusion]", "😞": "[confusion]",
    "😤": "[anger]", "😡": "[anger]", "😠": "[anger]", "🤬": "[anger]", "😈": "[mischievous]", "👿": "[mischievous]"
}

# ========== DATA MANAGER ==========

class DataManager:
    def __init__(self, file_path, abbreviations_path, word2vec_path):
        self.file_path = file_path
        self.abbreviations_path = abbreviations_path
        self.word2vec_path = word2vec_path
        self.vocabulary = None
        self.word_embeddings = None
        self.abbreviations = None
        self.load_abbreviations()

    def load_abbreviations(self):
        with open(self.abbreviations_path, "r", encoding="utf-8") as f:
            self.abbreviations = json.load(f)

    def load_word2vec(self):
        """
        Load the vectors from the word2vec file,
        using torchtext.Vectors to load the pretrained embeddings.
        """
        self.word_embeddings = Vectors(
            name=self.word2vec_path,
            unk_init=torch.Tensor.normal_
        )

    def create_vocab_from_corpus(self, corpus, max_vocab_size=30000):
        """
        Build the vocabulary from the corpus, keeping only the top max_vocab_size words.
        """
        vocab = Vocabulary()
        from collections import Counter
        counter = Counter()

        for sent in corpus:
            for token in sent.split():
                counter[token] += 1

        most_common = counter.most_common(max_vocab_size)
        for word, _freq in most_common:
            vocab.add(word)

        return vocab

    def preprocess_data(self):
        df = pd.read_excel(self.file_path)
        if "Sentence" not in df.columns:
            raise ValueError("Cột 'Sentence' không tồn tại trong dataset!")

        # Preprocess each sentence
        df["processed_sentence"] = df["Sentence"].apply(
            lambda x: preprocess_sentence(str(x), self.abbreviations, emoji_mapping)
        )

        # Drop empty rows
        df = df[df["processed_sentence"].str.strip().astype(bool)]

        # Build the vocabulary from the data itself
        all_sentences = df["processed_sentence"].tolist()
        self.vocabulary = self.create_vocab_from_corpus(all_sentences, max_vocab_size=30000)

        # Load word2vec
        self.load_word2vec()

        return df

    def build_pretrained_embedding_matrix(self, embedding_dim=100):
        """
        Build a weight_matrix (numpy) of shape (vocab_size x embedding_dim)
        filled with the pretrained weights.
        """
        vocab_size = len(self.vocabulary)
        weight_matrix = np.random.normal(
            scale=0.1, size=(vocab_size, embedding_dim)
        ).astype(np.float32)

        # Copy the pretrained vectors
        for word, idx in self.vocabulary.word2id.items():
            if word in self.word_embeddings.stoi:
                weight_matrix[idx] = self.word_embeddings.vectors[
                    self.word_embeddings.stoi[word]
                ]

        return weight_matrix

    def split_and_convert(
        self, df, label_column="Emotion", maxlen=400, test_size=0.2,
        for_keras=False, batch_size=32
    ):
        """
        Split the data into train/test or train/val/test.
        - for_keras=False → return train_loader, val_loader, test_loader, label_mapping (PyTorch)
        - for_keras=True → return X_train, X_test, y_train_onehot, y_test_onehot, label_mapping (Keras)
        """
        if label_column not in df.columns:
            raise ValueError(
                f"Cột '{label_column}' không tồn tại. Hiện có: {df.columns.tolist()}"
            )

        # Build the label -> index mapping
        label_mapping = {label: idx for idx, label in enumerate(df[label_column].unique())}
        df[label_column] = df[label_column].map(label_mapping)
        if df[label_column].isnull().any():
            missing = df[df[label_column].isnull()][label_column].unique()
            raise ValueError(f"Những nhãn cảm xúc sau không có trong label_mapping: {missing}")

        X = df["processed_sentence"].tolist()
        y = df[label_column].tolist()

        # Stratify to preserve the class distribution
        X_train, X_test, y_train, y_test = train_test_split(
            X, y, test_size=test_size, random_state=42, stratify=y
        )

        if not for_keras:
            # Split train into train and validation
            X_train, X_val, y_train, y_val = train_test_split(
                X_train, y_train, test_size=0.1, random_state=42, stratify=y_train
            )

        # Convert text -> index
        X_train_ids = self.vocabulary.corpus_to_tensor(X_train, is_tokenized=False)
        X_test_ids = self.vocabulary.corpus_to_tensor(X_test, is_tokenized=False)

        if not for_keras:
            X_val_ids = self.vocabulary.corpus_to_tensor(X_val, is_tokenized=False)

        # Pad
        X_train_padded = pad_sequences(X_train_ids, maxlen=maxlen, padding='post', truncating='post')
        X_test_padded = pad_sequences(X_test_ids, maxlen=maxlen, padding='post', truncating='post')

        if not for_keras:
            X_val_padded = pad_sequences(X_val_ids, maxlen=maxlen, padding='post', truncating='post')

        print(">>> Debug Split and Convert:")
        print("X_train_padded.shape:", X_train_padded.shape)
        print("X_val_padded.shape: ", X_val_padded.shape if not for_keras else "N/A")
        print("X_test_padded.shape: ", X_test_padded.shape)
        print("y_train length:", len(y_train))
        print("y_val length: ", len(y_val) if not for_keras else "N/A")
        print("y_test length: ", len(y_test))
        print("vocab_size:", len(self.vocabulary))

        if for_keras:
            num_classes = len(label_mapping)
            y_train_onehot = tf.keras.utils.to_categorical(
                y_train,
                num_classes=num_classes
            )
            y_test_onehot = tf.keras.utils.to_categorical(
                y_test,
                num_classes=num_classes
            )

            print("y_train_onehot.shape:", y_train_onehot.shape)
            print("y_test_onehot.shape: ", y_test_onehot.shape)

            return X_train_padded, X_test_padded, y_train_onehot, y_test_onehot, label_mapping
        else:
            # Convert the validation set
            X_val_ids = self.vocabulary.corpus_to_tensor(X_val, is_tokenized=False)
            X_val_padded = pad_sequences(X_val_ids, maxlen=maxlen, padding='post', truncating='post')

            X_train_t = torch.tensor(X_train_padded, dtype=torch.long)
            X_val_t = torch.tensor(X_val_padded, dtype=torch.long)
            X_test_t = torch.tensor(X_test_padded, dtype=torch.long)
            y_train_t = torch.tensor(y_train, dtype=torch.long)
            y_val_t = torch.tensor(y_val, dtype=torch.long)
            y_test_t = torch.tensor(y_test, dtype=torch.long)

            train_ds = TensorDataset(X_train_t, y_train_t)
            val_ds = TensorDataset(X_val_t, y_val_t)
            test_ds = TensorDataset(X_test_t, y_test_t)

            train_loader = DataLoader(train_ds, batch_size=batch_size, shuffle=True)
            val_loader = DataLoader(val_ds, batch_size=batch_size, shuffle=False)
            test_loader = DataLoader(test_ds, batch_size=batch_size, shuffle=False)

            return train_loader, val_loader, test_loader, label_mapping

# ========== KERAS BI-LSTM MODEL ==========

def predict_emotion_bilstm(model, text, data_manager, label_mapping):
    processed_text = preprocess_sentence(text, data_manager.abbreviations, emoji_mapping)
    tokenized_text = data_manager.vocabulary.tokenize_corpus([processed_text])
    text_ids = data_manager.vocabulary.corpus_to_tensor(tokenized_text, is_tokenized=True)
    text_padded = pad_sequences(text_ids, maxlen=400, padding='post', truncating='post')
    output = model.predict(text_padded)
    pred = output.argmax(axis=1)[0]
    rev_map = {v: k for k, v in label_mapping.items()}
    return rev_map[pred]

# ========== MAIN ==========

if __name__ == "__main__":
    from keras.models import Model
    from keras.layers import (
        Input, Embedding, Dense, Dropout, Bidirectional, LSTM
    )
    from keras.optimizers import Adam
    from keras.callbacks import ModelCheckpoint, EarlyStopping

    # -------- PATHS ----------
    file_path = "train.xlsx"
    abbreviations_path = "abbreviations.json"
    word2vec_path = "word2vec_vi_syllables_100dims.txt"
    output_path = "processed.xlsx"

    # Initialize the DataManager
    data_manager = DataManager(
        file_path=file_path,
        abbreviations_path=abbreviations_path,
        word2vec_path=word2vec_path
    )

    # 1) Preprocess, build the vocab, load word2vec
    df = data_manager.preprocess_data()
    print("Trước khi cân bằng lớp (undersampling/oversampling):")
    print(df["Emotion"].value_counts())

    # 2) Balance the classes (example: oversample 'Other' up to 3000)
    # Adjust this to your own needs
    df_enjoyment = df[df["Emotion"] == "Enjoyment"]
    df_other = df[df["Emotion"] == "Other"]
    df_anger = df[df["Emotion"] == "Anger"]
    df_sadness = df[df["Emotion"] == "Sadness"]
    df_disgust = df[df["Emotion"] == "Disgust"]
    df_fear = df[df["Emotion"] == "Fear"]
    df_surprise = df[df["Emotion"] == "Surprise"]

    # Oversample the 'Other' class up to 3000 (illustration only)
    if len(df_other) < 3000:
        df_other_oversampled = resample(
            df_other,
            replace=True,
            n_samples=3000,
            random_state=42
        )
    else:
        df_other_oversampled = df_other

    # Keep the other classes as-is (or oversample them as desired)
    df_balanced = pd.concat([
        df_enjoyment,
        df_other_oversampled,
        df_anger,
        df_sadness,
        df_disgust,
        df_fear,
        df_surprise
    ], axis=0)

    df_balanced = df_balanced.sample(frac=1, random_state=42).reset_index(drop=True)
    df = df_balanced

    print("\nSau khi cân bằng lớp (demo oversample):")
    print(df["Emotion"].value_counts())

    # Export to file (optional)
    df.to_excel(output_path, index=False)

    # ========== TRAIN BI-LSTM KERAS ==========

    print("\n========== Training Keras BiLSTM ==========")

    # Build the pretrained embedding matrix for Keras
    pretrained_matrix = data_manager.build_pretrained_embedding_matrix(embedding_dim=100)
    pretrained_matrix_keras = pretrained_matrix.astype(np.float32)

    # Split data for Keras
    X_train, X_test, y_train, y_test, label_mapping = data_manager.split_and_convert(
        df, label_column="Emotion", maxlen=400,
        test_size=0.2, for_keras=True
    )

    num_classes = len(label_mapping)
    input_dim = len(data_manager.vocabulary)
    embedding_dim = pretrained_matrix.shape[1]
    maxlen = 400

    # Define BiLSTM Model
    def create_bilstm_model():
        input_layer = Input(shape=(maxlen,), dtype='int32', name='main_input')
        emb_layer = Embedding(
            input_dim=input_dim,
            output_dim=embedding_dim,
            weights=[pretrained_matrix_keras],
            input_length=maxlen,
            trainable=True  # Set to False if you do not want to fine-tune the embeddings
        )(input_layer)

        bilstm = Bidirectional(LSTM(128, dropout=0.5, recurrent_dropout=0.5))(emb_layer)
        dense1 = Dense(64, activation='relu')(bilstm)
        dropout1 = Dropout(0.5)(dense1)
        dense2 = Dense(32, activation='relu')(dropout1)
        dropout2 = Dropout(0.5)(dense2)
        output_layer = Dense(num_classes, activation='softmax')(dropout2)

        model = Model(inputs=input_layer, outputs=output_layer)
        model.compile(
            loss='categorical_crossentropy',
            optimizer=Adam(lr=1e-3),
            metrics=['accuracy']
        )
        return model

    # Create model
    model_bilstm = create_bilstm_model()
    model_bilstm.summary()

    # Define callbacks
    checkpoint = ModelCheckpoint(
        'bilstm_best.keras',
        save_best_only=True,
        monitor='val_accuracy',
        mode='max'
    )
    early_stopping = EarlyStopping(
        monitor='val_accuracy',
        patience=5,
        restore_best_weights=True
    )

    # Train model
    history = model_bilstm.fit(
        X_train, y_train,
        validation_data=(X_test, y_test),
        epochs=100,
        batch_size=32,
        callbacks=[checkpoint, early_stopping]
    )

    # Evaluate on the test set with detailed metrics
    loss, acc = model_bilstm.evaluate(X_test, y_test)
    print(f"BiLSTM Test Loss: {loss:.4f}, Test Accuracy: {acc:.4f}")

    # Collect predictions and compute the metrics
    y_pred_bilstm = model_bilstm.predict(X_test)
    y_pred_bilstm = np.argmax(y_pred_bilstm, axis=1)
    y_true_bilstm = np.argmax(y_test, axis=1)

    test_accuracy_bilstm = accuracy_score(y_true_bilstm, y_pred_bilstm)
    precision_macro_bilstm = precision_score(y_true_bilstm, y_pred_bilstm, average='macro', zero_division=0)
    precision_weighted_bilstm = precision_score(y_true_bilstm, y_pred_bilstm, average='weighted', zero_division=0)
    recall_macro_bilstm = recall_score(y_true_bilstm, y_pred_bilstm, average='macro', zero_division=0)
    recall_weighted_bilstm = recall_score(y_true_bilstm, y_pred_bilstm, average='weighted', zero_division=0)
    f1_macro_bilstm = f1_score(y_true_bilstm, y_pred_bilstm, average='macro', zero_division=0)
    f1_weighted_bilstm = f1_score(y_true_bilstm, y_pred_bilstm, average='weighted', zero_division=0)
    report_bilstm = classification_report(y_true_bilstm, y_pred_bilstm, target_names=label_mapping.keys(), digits=4)
    conf_matrix_bilstm = confusion_matrix(y_true_bilstm, y_pred_bilstm)

    # Print the metrics
    print(f"\nBiLSTM Test Accuracy: {test_accuracy_bilstm:.4f}")
    print(f"Precision (Macro): {precision_macro_bilstm:.4f}")
    print(f"Precision (Weighted): {precision_weighted_bilstm:.4f}")
    print(f"Recall (Macro): {recall_macro_bilstm:.4f}")
    print(f"Recall (Weighted): {recall_weighted_bilstm:.4f}")
    print(f"F1-Score (Macro): {f1_macro_bilstm:.4f}")
    print(f"F1-Score (Weighted): {f1_weighted_bilstm:.4f}")

    print("\n========== BiLSTM Classification Report ==========")
    print(report_bilstm)

    print("\n========== BiLSTM Confusion Matrix ==========")
    print(conf_matrix_bilstm)

    # Save the report to a file
    bilstm_report_dir = "bilstm_emotion_model"
    os.makedirs(bilstm_report_dir, exist_ok=True)
    with open(os.path.join(bilstm_report_dir, "classification_report.txt"), "w", encoding="utf-8") as f:
        f.write("========== BiLSTM Classification Report ==========\n")
        f.write(report_bilstm)
        f.write("\n========== Additional Metrics ==========\n")
        f.write(f"Test Loss: {loss:.4f}\n")
        f.write(f"Test Accuracy: {test_accuracy_bilstm:.4f}\n")
        f.write(f"Precision (Macro): {precision_macro_bilstm:.4f}\n")
        f.write(f"Precision (Weighted): {precision_weighted_bilstm:.4f}\n")
        f.write(f"Recall (Macro): {recall_macro_bilstm:.4f}\n")
        f.write(f"Recall (Weighted): {recall_weighted_bilstm:.4f}\n")
        f.write(f"F1-Score (Macro): {f1_macro_bilstm:.4f}\n")
        f.write(f"F1-Score (Weighted): {f1_weighted_bilstm:.4f}\n")
        f.write("\n========== Confusion Matrix ==========\n")
        f.write(np.array2string(conf_matrix_bilstm))

    print("\n========== BiLSTM Classification Report saved to 'bilstm_emotion_model/classification_report.txt' ==========")

    # Save the BiLSTM model
    model_bilstm.save(os.path.join(bilstm_report_dir, 'bilstm_model.keras'))
    print(f"========== BiLSTM Model saved to '{bilstm_report_dir}/bilstm_model.keras' ==========")

    # ========== DEMO: PREDICT ONE NEW SENTENCE ==========

    custom_text = "Tôi rất vui khi sử dụng dịch vụ này!"

    # BiLSTM (Keras)
    emotion_bilstm = predict_emotion_bilstm(
        model_bilstm, custom_text, data_manager, label_mapping
    )
    print(f"Predicted Emotion (BiLSTM): {emotion_bilstm}")

    # Check TF and GPU
    print("TF version:", tf.__version__)
    print("GPU devices:", tf.config.list_physical_devices("GPU"))
    # os.system("nvidia-smi")  # if you want to inspect GPU info

    # ========== SAVE LABEL MAPPING AND VOCABULARY ==========
    # Save label_mapping and vocabulary for BiLSTM
    with open(os.path.join(bilstm_report_dir, "label_mapping.json"), "w", encoding="utf-8") as f:
        json.dump(label_mapping, f, ensure_ascii=False, indent=4)

    with open(os.path.join(bilstm_report_dir, "vocabulary.json"), "w", encoding="utf-8") as f:
        json.dump(data_manager.vocabulary.word2id, f, ensure_ascii=False, indent=4)

    print("========== Label Mapping and Vocabulary saved ==========")
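The script above leaves everything needed for standalone inference in bilstm_emotion_model/: the saved .keras model, vocabulary.json (the word-to-id table) and label_mapping.json. A minimal reload-and-predict sketch, assuming it runs next to main_BILSTM.py with the same dependencies and a Keras version able to read the saved file; the helpers are reused from that module rather than reimplemented:

import json
from keras.models import load_model
from tensorflow.keras.preprocessing.sequence import pad_sequences
from main_BILSTM import preprocess_sentence, emoji_mapping, Vocabulary

model = load_model("bilstm_emotion_model/bilstm_model.keras")

with open("bilstm_emotion_model/label_mapping.json", encoding="utf-8") as f:
    label_mapping = json.load(f)
with open("bilstm_emotion_model/vocabulary.json", encoding="utf-8") as f:
    word2id = json.load(f)
with open("abbreviations.json", encoding="utf-8") as f:
    abbreviations = json.load(f)

# Rebuild the Vocabulary object from the saved word -> id table.
vocab = Vocabulary()
vocab.word2id = word2id
vocab.id2word = {idx: w for w, idx in word2id.items()}

text = "Tôi rất vui khi sử dụng dịch vụ này!"
processed = preprocess_sentence(text, abbreviations, emoji_mapping)
ids = vocab.corpus_to_tensor([processed], is_tokenized=False)
padded = pad_sequences(ids, maxlen=400, padding="post", truncating="post")

pred = model.predict(padded).argmax(axis=1)[0]
id2label = {v: k for k, v in label_mapping.items()}
print("Predicted emotion:", id2label[pred])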
main_RNN_CNN-LSTM.py
ADDED
|
@@ -0,0 +1,738 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
# thesis.py
# -*- coding: utf-8 -*-

import pandas as pd
import emoji
import json
import re
import numpy as np
from underthesea import word_tokenize
from tqdm import tqdm
import torch
from torchtext.vocab import Vectors
from sklearn.model_selection import train_test_split
from sklearn.utils import resample
from sklearn.metrics import (
    accuracy_score,
    classification_report,
    precision_score,
    recall_score,
    f1_score,
    confusion_matrix
)
from tensorflow.keras.preprocessing.sequence import pad_sequences
from torch.utils.data import DataLoader, TensorDataset
import torch.nn as nn
import torch.optim as optim
import tensorflow as tf
import os
import joblib

# ========== PREPROCESSING FUNCTIONS ==========

def preprocess_sentence(sentence, abbreviations, emoji_mapping):
    """
    Preprocess a single sentence: lowercase, replace emojis, remove profanity
    and special characters, normalize whitespace, etc.
    """
    sentence = sentence.lower()
    sentence = replace_emojis(sentence, emoji_mapping)
    sentence = remove_profanity(sentence)
    sentence = remove_special_characters(sentence)
    sentence = normalize_whitespace(sentence)
    sentence = replace_abbreviations(sentence, abbreviations)
    sentence = remove_repeated_characters(sentence)
    sentence = replace_numbers(sentence)
    sentence = tokenize_sentence(sentence)
    return sentence

def replace_emojis(sentence, emoji_mapping):
    processed_sentence = []
    for char in sentence:
        if char in emoji_mapping:
            processed_sentence.append(emoji_mapping[char])
        elif not emoji.is_emoji(char):
            processed_sentence.append(char)
    return ''.join(processed_sentence)

def remove_profanity(sentence):
    profane_words = ["loz", "vloz", "vl", "dm", "đm", "clgt", "dmm", "cc", "vc", "đù mé", "vãi"]
    words = sentence.split()
    filtered_words = [word for word in words if word.lower() not in profane_words]
    return ' '.join(filtered_words)

def remove_special_characters(sentence):
    return re.sub(r"[\^\*@#&$%<>~{}|\\]", "", sentence)

def normalize_whitespace(sentence):
    return ' '.join(sentence.split())

def replace_abbreviations(sentence, abbreviations):
    words = sentence.split()
    replaced_words = [
        " ".join(abbreviations[word]) if word in abbreviations else word
        for word in words
    ]
    return ' '.join(replaced_words)

def remove_repeated_characters(sentence):
    # Example: "đẹp quáaaaaaa" -> "đẹp quá"
    return re.sub(r"(.)\1{2,}", r"\1", sentence)

def replace_numbers(sentence):
    # Replace every number with the [number] token
    return re.sub(r"\d+", "[number]", sentence)

def tokenize_sentence(sentence):
    # Word-segment with underthesea
    return ' '.join(word_tokenize(sentence))

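# --- Illustrative sketch (not part of the original pipeline) ---
# A tiny, self-contained example of what the helpers above do; the sample
# sentence and the one-entry abbreviation dictionary are assumptions, not
# taken from abbreviations.json.
def _demo_preprocessing_steps():
    demo_abbreviations = {"ko": ["không"]}                    # hypothetical entry
    step = "dịch vụ tốtttt, 10 điểm ko chê"
    step = replace_abbreviations(step, demo_abbreviations)    # "ko" -> "không"
    step = remove_repeated_characters(step)                   # "tốtttt" -> "tốt"
    step = replace_numbers(step)                              # "10" -> "[number]"
    return step  # -> "dịch vụ tốt, [number] điểm không chê"
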
# ========== VOCABULARY CLASS ==========

class Vocabulary:
    def __init__(self):
        self.word2id = {}
        self.word2id['<pad>'] = 0
        self.word2id['<unk>'] = 1
        self.unk_id = 1
        self.id2word = {0: '<pad>', 1: '<unk>'}

    def __getitem__(self, word):
        return self.word2id.get(word, self.unk_id)

    def __contains__(self, word):
        return word in self.word2id

    def __len__(self):
        return len(self.word2id)

    def lookup_tokens(self, indices):
        return [self.id2word[idx] for idx in indices]

    def add(self, word):
        if word not in self.word2id:
            idx = len(self.word2id)
            self.word2id[word] = idx
            self.id2word[idx] = word

    @staticmethod
    def tokenize_corpus(corpus):
        tokenized_corpus = []
        for doc in tqdm(corpus, desc="Tokenizing Corpus"):
            tokens = [w.replace(" ", "_") for w in word_tokenize(doc)]
            tokenized_corpus.append(tokens)
        return tokenized_corpus

    def corpus_to_tensor(self, corpus, is_tokenized=False):
        """
        corpus: list of sentences (strings) or list of token lists (if is_tokenized=True)
        return: list[list[int]], one list of token indices per sentence
        """
        tokenized_corpus = (
            self.tokenize_corpus(corpus) if not is_tokenized else corpus
        )
        return [
            [self[token] for token in doc]
            for doc in tokenized_corpus
        ]

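# --- Illustrative sketch (not part of the original script) ---
# How the Vocabulary maps tokens to ids and unseen tokens to <unk>;
# the example tokens are assumptions.
def _demo_vocabulary_lookup():
    vocab = Vocabulary()
    for token in ["dịch_vụ", "tốt"]:
        vocab.add(token)
    # Known tokens get their own ids, unseen tokens fall back to unk_id (1).
    ids = vocab.corpus_to_tensor([["dịch_vụ", "tốt", "tuyệt_vời"]], is_tokenized=True)
    return ids  # [[2, 3, 1]]
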
# ========== EMOJI MAPPING ==========

emoji_mapping = {
    "😀": "[joy]", "😃": "[joy]", "😄": "[joy]", "😁": "[joy]", "😆": "[joy]", "😅": "[joy]", "😂": "[joy]", "🤣": "[joy]",
    "🙂": "[love]", "🙃": "[love]", "😉": "[love]", "😊": "[love]", "😇": "[love]", "🥰": "[love]", "😍": "[love]",
    "🤩": "[love]", "😘": "[love]", "😗": "[love]", "☺": "[love]", "😚": "[love]", "😙": "[love]",
    "😋": "[satisfaction]", "😛": "[satisfaction]", "😜": "[satisfaction]", "🤪": "[satisfaction]", "😝": "[satisfaction]",
    "🤑": "[satisfaction]",
    "🤐": "[neutral]", "🤨": "[neutral]", "😐": "[neutral]", "😑": "[neutral]", "😶": "[neutral]",
    "😏": "[sarcasm]",
    "😒": "[disappointment]", "🙄": "[disappointment]", "😬": "[disappointment]",
    "😔": "[sadness]", "😪": "[sadness]", "😢": "[sadness]", "😭": "[sadness]", "😥": "[sadness]", "😓": "[sadness]",
    "😩": "[tiredness]", "😫": "[tiredness]", "🥱": "[tiredness]",
    "🤤": "[discomfort]", "🤢": "[discomfort]", "🤮": "[discomfort]", "🤧": "[discomfort]", "🥵": "[discomfort]",
    "🥶": "[discomfort]", "🥴": "[discomfort]", "😵": "[discomfort]", "🤯": "[discomfort]",
    "😕": "[confused]", "😟": "[confused]", "🙁": "[confused]", "☹": "[confused]",
    "😮": "[surprise]", "😯": "[surprise]", "😲": "[surprise]", "😳": "[surprise]", "🥺": "[pleading]",
    "😦": "[fear]", "😧": "[fear]", "😨": "[fear]", "😰": "[fear]", "😱": "[fear]",
    "😖": "[confusion]", "😣": "[confusion]", "😞": "[confusion]",
    "😤": "[anger]", "😡": "[anger]", "😠": "[anger]", "🤬": "[anger]", "😈": "[mischievous]", "👿": "[mischievous]"
}

def load_abbreviations(path):
    with open(path, "r", encoding="utf-8") as f:
        return json.load(f)


# ========== DATA MANAGER ==========

class DataManager:
    def __init__(self, file_path, abbreviations_path, word2vec_path):
        self.file_path = file_path
        self.abbreviations_path = abbreviations_path
        self.word2vec_path = word2vec_path
        self.vocabulary = None
        self.word_embeddings = None
        self.abbreviations = None
        self.load_abbreviations()

    def load_abbreviations(self):
        with open(self.abbreviations_path, "r", encoding="utf-8") as f:
            self.abbreviations = json.load(f)

    def load_word2vec(self):
        """
        Load vectors from the word2vec file; torchtext.Vectors is used to
        load the pretrained embeddings.
        """
        self.word_embeddings = Vectors(
            name=self.word2vec_path,
            unk_init=torch.Tensor.normal_
        )

    def create_vocab_from_corpus(self, corpus, max_vocab_size=30000):
        """
        Build the vocabulary from the corpus, keeping only the top
        max_vocab_size words.
        """
        vocab = Vocabulary()
        from collections import Counter
        counter = Counter()

        for sent in corpus:
            for token in sent.split():
                counter[token] += 1

        most_common = counter.most_common(max_vocab_size)
        for word, _freq in most_common:
            vocab.add(word)

        return vocab

    def preprocess_data(self):
        df = pd.read_excel(self.file_path)
        if "Sentence" not in df.columns:
            raise ValueError("Cột 'Sentence' không tồn tại trong dataset!")

        # Preprocess each sentence
        df["processed_sentence"] = df["Sentence"].apply(
            lambda x: preprocess_sentence(str(x), self.abbreviations, emoji_mapping)
        )

        # Drop empty rows
        df = df[df["processed_sentence"].str.strip().astype(bool)]

        # Build the vocabulary from the data itself
        all_sentences = df["processed_sentence"].tolist()
        self.vocabulary = self.create_vocab_from_corpus(all_sentences, max_vocab_size=30000)

        # Load word2vec
        self.load_word2vec()

        return df

    def build_pretrained_embedding_matrix(self, embedding_dim=100):
        """
        Build a numpy weight matrix (vocab_size x embedding_dim) initialized
        with the pretrained weights.
        """
        vocab_size = len(self.vocabulary)
        weight_matrix = np.random.normal(
            scale=0.1, size=(vocab_size, embedding_dim)
        ).astype(np.float32)

        # Copy the pretrained vectors where available
        for word, idx in self.vocabulary.word2id.items():
            if word in self.word_embeddings.stoi:
                weight_matrix[idx] = self.word_embeddings.vectors[
                    self.word_embeddings.stoi[word]
                ]

        return weight_matrix

    def split_and_convert(
        self, df, label_column="Emotion", maxlen=400, test_size=0.2,
        for_keras=False, batch_size=32
    ):
        """
        Split the data into train/test.
        - for_keras=False → return train_loader, test_loader, label_mapping (PyTorch)
        - for_keras=True → return X_train, X_test, y_train_onehot, y_test_onehot, label_mapping (Keras)
        """
        if label_column not in df.columns:
            raise ValueError(
                f"Cột '{label_column}' không tồn tại. Hiện có: {df.columns.tolist()}"
            )

        # Map labels -> integer ids
        label_mapping = {label: idx for idx, label in enumerate(df[label_column].unique())}
        df[label_column] = df[label_column].map(label_mapping)
        if df[label_column].isnull().any():
            missing = df[df[label_column].isnull()][label_column].unique()
            raise ValueError(f"Những nhãn cảm xúc sau không có trong label_mapping: {missing}")

        X = df["processed_sentence"].tolist()
        y = df[label_column].tolist()

        # Stratify to maintain class distribution
        X_train, X_test, y_train, y_test = train_test_split(
            X, y, test_size=test_size, random_state=42, stratify=y
        )

        # Convert text -> index
        X_train_ids = self.vocabulary.corpus_to_tensor(X_train, is_tokenized=False)
        X_test_ids = self.vocabulary.corpus_to_tensor(X_test, is_tokenized=False)

        # Pad
        X_train_padded = pad_sequences(X_train_ids, maxlen=maxlen, padding='post', truncating='post')
        X_test_padded = pad_sequences(X_test_ids, maxlen=maxlen, padding='post', truncating='post')

        print(">>> Debug Split and Convert:")
        print("X_train_padded.shape:", X_train_padded.shape)
        print("X_test_padded.shape: ", X_test_padded.shape)
        print("y_train length:", len(y_train))
        print("y_test length: ", len(y_test))
        print("vocab_size:", len(self.vocabulary))

        if for_keras:
            num_classes = len(label_mapping)
            y_train_onehot = torch.nn.functional.one_hot(
                torch.tensor(y_train),
                num_classes=num_classes
            ).numpy()
            y_test_onehot = torch.nn.functional.one_hot(
                torch.tensor(y_test),
                num_classes=num_classes
            ).numpy()

            print("y_train_onehot.shape:", y_train_onehot.shape)
            print("y_test_onehot.shape: ", y_test_onehot.shape)

            return X_train_padded, X_test_padded, y_train_onehot, y_test_onehot, label_mapping
        else:
            # Return DataLoaders
            X_train_t = torch.tensor(X_train_padded, dtype=torch.long)
            X_test_t = torch.tensor(X_test_padded, dtype=torch.long)
            y_train_t = torch.tensor(y_train, dtype=torch.long)
            y_test_t = torch.tensor(y_test, dtype=torch.long)

            train_ds = TensorDataset(X_train_t, y_train_t)
            test_ds = TensorDataset(X_test_t, y_test_t)

            train_loader = DataLoader(train_ds, batch_size=batch_size, shuffle=True)
            test_loader = DataLoader(test_ds, batch_size=batch_size, shuffle=False)

            return train_loader, test_loader, label_mapping

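# --- Illustrative sketch (not part of the original script) ---
# What the padding step inside split_and_convert produces: sequences shorter
# than maxlen are post-padded with 0 (<pad>), longer ones are post-truncated.
# The toy id lists and maxlen=6 are assumptions for illustration only.
def _demo_padding():
    toy_ids = [[2, 3, 1], [4, 5, 6, 7, 8, 9, 10]]
    padded = pad_sequences(toy_ids, maxlen=6, padding='post', truncating='post')
    # padded ->
    # [[2 3 1 0 0 0]
    #  [4 5 6 7 8 9]]
    return padded
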
# ========== PYTORCH RNN MODEL ==========

class SimpleRNN(nn.Module):
    def __init__(self, pretrained_weight, hidden_dim, output_dim, dropout=0.3):
        super(SimpleRNN, self).__init__()
        vocab_size, embedding_dim = pretrained_weight.shape
        # Build nn.Embedding from pretrained_weight
        self.embedding = nn.Embedding.from_pretrained(
            torch.from_numpy(pretrained_weight),
            freeze=False  # True to keep the embedding fixed
        )
        self.rnn = nn.LSTM(embedding_dim, hidden_dim, batch_first=True)
        self.dropout = nn.Dropout(dropout)
        self.fc = nn.Linear(hidden_dim, output_dim)

    def forward(self, x):
        embedded = self.dropout(self.embedding(x))
        _, (hidden, _) = self.rnn(embedded)
        hidden = self.dropout(hidden.squeeze(0))
        output = self.fc(hidden)
        return output


def predict_emotion_rnn(model, text, data_manager, label_mapping, device):
    model.eval()
    with torch.no_grad():
        processed_text = preprocess_sentence(text, data_manager.abbreviations, emoji_mapping)
        tokenized_text = data_manager.vocabulary.tokenize_corpus([processed_text])
        text_ids = data_manager.vocabulary.corpus_to_tensor(tokenized_text, is_tokenized=True)
        text_padded = pad_sequences(text_ids, maxlen=400, padding='post', truncating='post')
        text_tensor = torch.tensor(
            text_padded,
            dtype=torch.long
        ).to(device)

        output = model(text_tensor)
        _, predicted = torch.max(output, 1)
        rev_map = {v: k for k, v in label_mapping.items()}
        return rev_map[predicted.item()]

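# --- Illustrative sketch (not part of the original script) ---
# Expected tensor shapes through SimpleRNN, using a small random embedding
# matrix; the sizes (vocab 10, dim 8, hidden 16, 3 classes) are assumptions.
def _demo_simple_rnn_shapes():
    demo_weights = np.random.normal(scale=0.1, size=(10, 8)).astype(np.float32)
    demo_model = SimpleRNN(demo_weights, hidden_dim=16, output_dim=3)
    demo_batch = torch.randint(0, 10, (2, 5))   # (batch=2, seq_len=5) token ids
    logits = demo_model(demo_batch)             # embedding -> LSTM -> last hidden -> fc
    return logits.shape                         # torch.Size([2, 3])
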
# ========== KERAS CNN-LSTM MODEL ==========

def predict_emotion_cnn_lstm(model, text, data_manager, label_mapping):
    processed_text = preprocess_sentence(text, data_manager.abbreviations, emoji_mapping)
    tokenized_text = data_manager.vocabulary.tokenize_corpus([processed_text])
    text_ids = data_manager.vocabulary.corpus_to_tensor(tokenized_text, is_tokenized=True)
    text_padded = pad_sequences(text_ids, maxlen=400, padding='post', truncating='post')
    output = model.predict(text_padded)
    pred = output.argmax(axis=1)[0]
    rev_map = {v: k for k, v in label_mapping.items()}
    return rev_map[pred]


# ========== MAIN ==========

if __name__ == "__main__":
    from keras.models import Model
    from keras.layers import (
        Input, Embedding, Convolution1D, LSTM, Dense, Dropout, Lambda, concatenate
    )
    from keras.optimizers import Adam
    from keras.callbacks import ModelCheckpoint, EarlyStopping

    # -------- PATHS ----------
    file_path = "train.xlsx"
    abbreviations_path = "abbreviations.json"
    word2vec_path = "word2vec_vi_syllables_100dims.txt"
    output_path = "processed.xlsx"

    # Initialize the DataManager
    data_manager = DataManager(
        file_path=file_path,
        abbreviations_path=abbreviations_path,
        word2vec_path=word2vec_path
    )

    # 1) Preprocess, build the vocab, load word2vec
    df = data_manager.preprocess_data()
    print("Trước khi cân bằng lớp (undersampling/oversampling):")
    print(df["Emotion"].value_counts())

    # 2) Balance the classes (example: oversample 'Other' up to 3000)
    # Adjust to your own needs
    df_enjoyment = df[df["Emotion"] == "Enjoyment"]
    df_other = df[df["Emotion"] == "Other"]
    df_anger = df[df["Emotion"] == "Anger"]
    df_sadness = df[df["Emotion"] == "Sadness"]
    df_disgust = df[df["Emotion"] == "Disgust"]
    df_fear = df[df["Emotion"] == "Fear"]
    df_surprise = df[df["Emotion"] == "Surprise"]

    # Oversample the 'Other' class up to 3000 (illustration only)
    if len(df_other) < 3000:
        df_other_oversampled = resample(
            df_other,
            replace=True,
            n_samples=3000,
            random_state=42
        )
    else:
        df_other_oversampled = df_other

    # Keep the remaining classes as-is (or oversample them as needed)
    df_balanced = pd.concat([
        df_enjoyment,
        df_other_oversampled,
        df_anger,
        df_sadness,
        df_disgust,
        df_fear,
        df_surprise
    ], axis=0)

    df_balanced = df_balanced.sample(frac=1, random_state=42).reset_index(drop=True)
    df = df_balanced

    print("\nSau khi cân bằng lớp (demo oversample):")
    print(df["Emotion"].value_counts())

    # Export to file (optional)
    df.to_excel(output_path, index=False)

    # ========== TRAIN RNN PYTORCH ==========

    print("\n========== Training PyTorch SimpleRNN ==========")

    # Build the pretrained embedding matrix
    pretrained_matrix = data_manager.build_pretrained_embedding_matrix(embedding_dim=100)

    # Split and convert the data into DataLoaders
    train_loader, test_loader, label_mapping = data_manager.split_and_convert(
        df, label_column="Emotion", maxlen=400, test_size=0.2,
        for_keras=False, batch_size=32
    )

    hidden_dim = 128
    output_dim = len(label_mapping)

    model_rnn = SimpleRNN(pretrained_weight=pretrained_matrix,
                          hidden_dim=hidden_dim,
                          output_dim=output_dim,
                          dropout=0.3)
    criterion = nn.CrossEntropyLoss()
    optimizer = optim.Adam(model_rnn.parameters(), lr=1e-3)

    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    model_rnn.to(device)

    num_epochs = 20
    for epoch in range(num_epochs):
        model_rnn.train()
        epoch_loss = 0
        correct = 0
        total = 0

        for X_batch, y_batch in train_loader:
            X_batch = X_batch.to(device)
            y_batch = y_batch.to(device)

            optimizer.zero_grad()
            preds = model_rnn(X_batch)
            loss = criterion(preds, y_batch)
            loss.backward()
            optimizer.step()

            epoch_loss += loss.item()
            _, pred_label = torch.max(preds, 1)
            correct += (pred_label == y_batch).sum().item()
            total += y_batch.size(0)

        epoch_accuracy = correct / total
        epoch_loss_avg = epoch_loss / len(train_loader)
        print(f"Epoch {epoch+1}/{num_epochs}, "
              f"Loss: {epoch_loss_avg:.4f}, "
              f"Accuracy: {epoch_accuracy:.4f}")

    # Evaluate on the test set with detailed metrics
    model_rnn.eval()
    test_loss = 0
    correct = 0
    total = 0
    y_true = []
    y_pred = []
    with torch.no_grad():
        for X_batch, y_batch in test_loader:
            X_batch = X_batch.to(device)
            y_batch = y_batch.to(device)
            preds = model_rnn(X_batch)
            loss = criterion(preds, y_batch)
            test_loss += loss.item()

            _, predicted = torch.max(preds, 1)
            correct += (predicted == y_batch).sum().item()
            total += y_batch.size(0)

            y_true.extend(y_batch.cpu().numpy())
            y_pred.extend(predicted.cpu().numpy())

    test_accuracy = accuracy_score(y_true, y_pred)
    test_loss_avg = test_loss / len(test_loader)
    precision_macro = precision_score(y_true, y_pred, average='macro', zero_division=0)
    precision_weighted = precision_score(y_true, y_pred, average='weighted', zero_division=0)
    recall_macro = recall_score(y_true, y_pred, average='macro', zero_division=0)
    recall_weighted = recall_score(y_true, y_pred, average='weighted', zero_division=0)
    f1_macro = f1_score(y_true, y_pred, average='macro', zero_division=0)
    f1_weighted = f1_score(y_true, y_pred, average='weighted', zero_division=0)
    report = classification_report(y_true, y_pred, target_names=label_mapping.keys(), digits=4)
    conf_matrix = confusion_matrix(y_true, y_pred)

    # Print the metrics
    print(f"\nTest Loss: {test_loss_avg:.4f}, Test Accuracy: {test_accuracy:.4f}")
    print(f"Precision (Macro): {precision_macro:.4f}")
    print(f"Precision (Weighted): {precision_weighted:.4f}")
    print(f"Recall (Macro): {recall_macro:.4f}")
    print(f"Recall (Weighted): {recall_weighted:.4f}")
    print(f"F1-Score (Macro): {f1_macro:.4f}")
    print(f"F1-Score (Weighted): {f1_weighted:.4f}")

    print("\n========== Classification Report ==========")
    print(report)

    print("\n========== Confusion Matrix ==========")
    print(conf_matrix)

    # Save the report to a file
    rnn_report_dir = "rnn_emotion_model"
    os.makedirs(rnn_report_dir, exist_ok=True)
    with open(os.path.join(rnn_report_dir, "classification_report.txt"), "w", encoding="utf-8") as f:
        f.write("========== Classification Report ==========\n")
        f.write(report)
        f.write("\n========== Additional Metrics ==========\n")
        f.write(f"Test Loss: {test_loss_avg:.4f}\n")
        f.write(f"Test Accuracy: {test_accuracy:.4f}\n")
        f.write(f"Precision (Macro): {precision_macro:.4f}\n")
        f.write(f"Precision (Weighted): {precision_weighted:.4f}\n")
        f.write(f"Recall (Macro): {recall_macro:.4f}\n")
        f.write(f"Recall (Weighted): {recall_weighted:.4f}\n")
        f.write(f"F1-Score (Macro): {f1_macro:.4f}\n")
        f.write(f"F1-Score (Weighted): {f1_weighted:.4f}\n")
        f.write("\n========== Confusion Matrix ==========\n")
        f.write(np.array2string(conf_matrix))

    print("\n========== Classification Report saved to 'rnn_emotion_model/classification_report.txt' ==========")

    # Save the RNN model
    torch.save(model_rnn.state_dict(), os.path.join(rnn_report_dir, "simple_rnn.pth"))
    print("========== RNN Model saved to 'rnn_emotion_model/simple_rnn.pth' ==========")

    # ========== TRAIN CNN-LSTM KERAS ==========

    print("\n========== Training CNN-LSTM (Keras) ==========")

    # Pretrained embeddings for Keras: pretrained_matrix (num_vocab x 100)
    # is passed to the Embedding layer via weights=[...]
    X_train_keras, X_test_keras, y_train_keras, y_test_keras, label_mapping_keras = data_manager.split_and_convert(
        df, label_column="Emotion", maxlen=400, test_size=0.2,
        for_keras=True
    )

    maxlen = 400
    vocab_size, embedding_dim = pretrained_matrix.shape

    # Cast pretrained_matrix to float32 so Keras accepts it
    pretrained_matrix_keras = pretrained_matrix.astype(np.float32)

    input_layer = Input(shape=(maxlen,), dtype='int32', name='main_input')
    emb_layer = Embedding(
        input_dim=vocab_size,
        output_dim=embedding_dim,
        weights=[pretrained_matrix_keras],
        trainable=True  # True or False depending on whether to fine-tune the embedding
    )(input_layer)

    def max_1d(X):
        return tf.reduce_max(X, axis=1)

    con3 = Convolution1D(150, kernel_size=3, activation='relu')(emb_layer)
    pool_con3 = Lambda(max_1d, output_shape=(150,))(con3)

    con5 = Convolution1D(150, kernel_size=5, activation='relu')(emb_layer)
    pool_con5 = Lambda(max_1d, output_shape=(150,))(con5)

    lstm_out = LSTM(128, dropout=0.3)(emb_layer)

    merged = concatenate([pool_con3, pool_con5, lstm_out])
    dense = Dense(100, activation='relu')(merged)
    drop = Dropout(0.3)(dense)
    output = Dense(output_dim, activation='softmax')(drop)

    model_cnn_lstm = Model(inputs=input_layer, outputs=output)
    model_cnn_lstm.compile(
        loss='categorical_crossentropy',
        optimizer=Adam(learning_rate=1e-3),
        metrics=['accuracy']
    )

    checkpoint = ModelCheckpoint(
        'cnn_lstm_best.keras',
        save_best_only=True,
        monitor='val_accuracy',
        mode='max'
    )
    early_stopping = EarlyStopping(
        monitor='val_accuracy',
        patience=5,
        restore_best_weights=True
    )

    history = model_cnn_lstm.fit(
        X_train_keras, y_train_keras,
        validation_data=(X_test_keras, y_test_keras),
        epochs=30,
        batch_size=32,
        callbacks=[checkpoint, early_stopping]
    )

    # Evaluate on the test set with detailed metrics
    loss, acc = model_cnn_lstm.evaluate(X_test_keras, y_test_keras)
    print(f"CNN-LSTM Test Loss: {loss:.4f}, Test Accuracy: {acc:.4f}")

    # Collect predictions and compute the metrics
    y_pred_cnn_lstm = model_cnn_lstm.predict(X_test_keras)
    y_pred_cnn_lstm = np.argmax(y_pred_cnn_lstm, axis=1)
    y_true_cnn_lstm = np.argmax(y_test_keras, axis=1)

    test_accuracy_cnn_lstm = accuracy_score(y_true_cnn_lstm, y_pred_cnn_lstm)
    precision_macro_cnn_lstm = precision_score(y_true_cnn_lstm, y_pred_cnn_lstm, average='macro', zero_division=0)
    precision_weighted_cnn_lstm = precision_score(y_true_cnn_lstm, y_pred_cnn_lstm, average='weighted', zero_division=0)
    recall_macro_cnn_lstm = recall_score(y_true_cnn_lstm, y_pred_cnn_lstm, average='macro', zero_division=0)
    recall_weighted_cnn_lstm = recall_score(y_true_cnn_lstm, y_pred_cnn_lstm, average='weighted', zero_division=0)
    f1_macro_cnn_lstm = f1_score(y_true_cnn_lstm, y_pred_cnn_lstm, average='macro', zero_division=0)
    f1_weighted_cnn_lstm = f1_score(y_true_cnn_lstm, y_pred_cnn_lstm, average='weighted', zero_division=0)
    report_cnn_lstm = classification_report(y_true_cnn_lstm, y_pred_cnn_lstm, target_names=label_mapping.keys(), digits=4)
    conf_matrix_cnn_lstm = confusion_matrix(y_true_cnn_lstm, y_pred_cnn_lstm)

    # Print the metrics
    print(f"\nCNN-LSTM Test Accuracy: {test_accuracy_cnn_lstm:.4f}")
    print(f"Precision (Macro): {precision_macro_cnn_lstm:.4f}")
    print(f"Precision (Weighted): {precision_weighted_cnn_lstm:.4f}")
    print(f"Recall (Macro): {recall_macro_cnn_lstm:.4f}")
    print(f"Recall (Weighted): {recall_weighted_cnn_lstm:.4f}")
    print(f"F1-Score (Macro): {f1_macro_cnn_lstm:.4f}")
    print(f"F1-Score (Weighted): {f1_weighted_cnn_lstm:.4f}")

    print("\n========== CNN-LSTM Classification Report ==========")
    print(report_cnn_lstm)

    print("\n========== CNN-LSTM Confusion Matrix ==========")
    print(conf_matrix_cnn_lstm)

    # Save the report to a file
    cnn_lstm_report_dir = "cnn_lstm_emotion_model"
    os.makedirs(cnn_lstm_report_dir, exist_ok=True)
    with open(os.path.join(cnn_lstm_report_dir, "classification_report.txt"), "w", encoding="utf-8") as f:
        f.write("========== CNN-LSTM Classification Report ==========\n")
        f.write(report_cnn_lstm)
        f.write("\n========== Additional Metrics ==========\n")
        f.write(f"Test Loss: {loss:.4f}\n")
        f.write(f"Test Accuracy: {test_accuracy_cnn_lstm:.4f}\n")
        f.write(f"Precision (Macro): {precision_macro_cnn_lstm:.4f}\n")
        f.write(f"Precision (Weighted): {precision_weighted_cnn_lstm:.4f}\n")
        f.write(f"Recall (Macro): {recall_macro_cnn_lstm:.4f}\n")
        f.write(f"Recall (Weighted): {recall_weighted_cnn_lstm:.4f}\n")
        f.write(f"F1-Score (Macro): {f1_macro_cnn_lstm:.4f}\n")
        f.write(f"F1-Score (Weighted): {f1_weighted_cnn_lstm:.4f}\n")
        f.write("\n========== Confusion Matrix ==========\n")
        f.write(np.array2string(conf_matrix_cnn_lstm))

    print("\n========== CNN-LSTM Classification Report saved to 'cnn_lstm_emotion_model/classification_report.txt' ==========")

    # Save the CNN-LSTM model
    model_cnn_lstm.save(os.path.join(cnn_lstm_report_dir, 'cnn_lstm_model.keras'))
    print(f"========== CNN-LSTM Model saved to '{cnn_lstm_report_dir}/cnn_lstm_model.keras' ==========")

    # ========== SAVE LABEL MAPPING AND VOCABULARY ==========
    # Save label_mapping and vocabulary for the RNN
    with open(os.path.join(rnn_report_dir, "label_mapping.json"), "w", encoding="utf-8") as f:
        json.dump(label_mapping, f, ensure_ascii=False, indent=4)

    with open(os.path.join(rnn_report_dir, "vocabulary.json"), "w", encoding="utf-8") as f:
        json.dump(data_manager.vocabulary.word2id, f, ensure_ascii=False, indent=4)

    # Save label_mapping and vocabulary for the CNN-LSTM.
    # Assuming they are identical to the RNN's, a single copy is enough;
    # adjust accordingly if they differ.

    print("========== Label Mapping and Vocabulary saved ==========")

    # ========== DEMO: PREDICT ONE NEW SENTENCE ==========

    custom_text = "Tôi rất vui khi sử dụng dịch vụ này!"

    # RNN (PyTorch)
    emotion_rnn = predict_emotion_rnn(
        model_rnn, custom_text, data_manager, label_mapping, device
    )
    print(f"Predicted Emotion (RNN): {emotion_rnn}")

    # CNN-LSTM (Keras)
    cnn_lstm_loaded = tf.keras.models.load_model(os.path.join(cnn_lstm_report_dir, 'cnn_lstm_model.keras'))
    emotion_cnn_lstm = predict_emotion_cnn_lstm(
        cnn_lstm_loaded, custom_text, data_manager, label_mapping
    )
    print(f"Predicted Emotion (CNN-LSTM): {emotion_cnn_lstm}")

    # Check TF and GPU
    print("TF version:", tf.__version__)
    print("GPU devices:", tf.config.list_physical_devices("GPU"))
    # os.system("nvidia-smi")  # to inspect GPU info
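    # --- Illustrative sketch (not part of the original script) ---
    # Reloading the saved state_dict for later inference would look roughly
    # like this (kept as a comment so the training flow above is unchanged):
    #   model_loaded = SimpleRNN(pretrained_matrix, hidden_dim=128, output_dim=len(label_mapping))
    #   model_loaded.load_state_dict(torch.load("rnn_emotion_model/simple_rnn.pth", map_location=device))
    #   model_loaded.to(device).eval()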
main_lstm.py ADDED
@@ -0,0 +1,289 @@
# lstm_emotion_classifier.py
# -*- coding: utf-8 -*-

import re
import emoji
import json
import pandas as pd
import numpy as np
import tensorflow as tf
from underthesea import word_tokenize
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
from sklearn.utils import resample
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, LSTM, Dense, Dropout
from tensorflow.keras.callbacks import EarlyStopping
import joblib
import os
import matplotlib.pyplot as plt
import seaborn as sns

########################
# PREPROCESSING
########################

def replace_emojis(sentence, emoji_mapping):
    processed_sentence = []
    for char in sentence:
        if char in emoji_mapping:
            processed_sentence.append(emoji_mapping[char])
        elif not emoji.is_emoji(char):
            processed_sentence.append(char)
    return ''.join(processed_sentence)

def remove_profanity(sentence):
    profane_words = ["loz", "vloz", "vl", "dm", "đm", "clgt", "dmm", "cc", "vc", "đù mé", "vãi"]
    words = sentence.split()
    filtered = [w for w in words if w.lower() not in profane_words]
    return ' '.join(filtered)

def remove_special_characters(sentence):
    return re.sub(r"[\^\*@#&$%<>~{}|\\]", "", sentence)

def normalize_whitespace(sentence):
    return ' '.join(sentence.split())

def remove_repeated_characters(sentence):
    return re.sub(r"(.)\1{2,}", r"\1", sentence)

def replace_numbers(sentence):
    return re.sub(r"\d+", "[number]", sentence)

def tokenize_underthesea(sentence):
    tokens = word_tokenize(sentence)
    return " ".join(tokens)

def preprocess_sentence(sentence, abbreviations, emoji_mapping):
    sentence = sentence.lower()
    sentence = replace_emojis(sentence, emoji_mapping)
    sentence = remove_profanity(sentence)
    sentence = remove_special_characters(sentence)
    sentence = normalize_whitespace(sentence)
    # Expand abbreviations
    words = sentence.split()
    replaced = []
    for w in words:
        if w in abbreviations:
            replaced.append(" ".join(abbreviations[w]))
        else:
            replaced.append(w)
    sentence = " ".join(replaced)
    sentence = remove_repeated_characters(sentence)
    sentence = replace_numbers(sentence)
    # Vietnamese word segmentation
    sentence = tokenize_underthesea(sentence)
    return sentence

emoji_mapping = {
    "😀": "[joy]", "😃": "[joy]", "😄": "[joy]", "😁": "[joy]", "😆": "[joy]", "😅": "[joy]", "😂": "[joy]", "🤣": "[joy]",
    "🙂": "[love]", "🙃": "[love]", "😉": "[love]", "😊": "[love]", "😇": "[love]", "🥰": "[love]", "😍": "[love]",
    "🤩": "[love]", "😘": "[love]", "😗": "[love]", "☺": "[love]", "😚": "[love]", "😙": "[love]",
    "😋": "[satisfaction]", "😛": "[satisfaction]", "😜": "[satisfaction]", "🤪": "[satisfaction]", "😝": "[satisfaction]",
    "🤑": "[satisfaction]",
    "🤐": "[neutral]", "🤨": "[neutral]", "😐": "[neutral]", "😑": "[neutral]", "😶": "[neutral]",
    "😏": "[sarcasm]",
    "😒": "[disappointment]", "🙄": "[disappointment]", "😬": "[disappointment]",
    "😔": "[sadness]", "😪": "[sadness]", "😢": "[sadness]", "😭": "[sadness]", "😥": "[sadness]", "😓": "[sadness]",
    "😩": "[tiredness]", "😫": "[tiredness]", "🥱": "[tiredness]",
    "🤤": "[discomfort]", "🤢": "[discomfort]", "🤮": "[discomfort]", "🤧": "[discomfort]", "🥵": "[discomfort]",
    "🥶": "[discomfort]", "🥴": "[discomfort]", "😵": "[discomfort]", "🤯": "[discomfort]",
    "😕": "[confused]", "😟": "[confused]", "🙁": "[confused]", "☹": "[confused]",
    "😮": "[surprise]", "😯": "[surprise]", "😲": "[surprise]", "😳": "[surprise]", "🥺": "[pleading]",
    "😦": "[fear]", "😧": "[fear]", "😨": "[fear]", "😰": "[fear]", "😱": "[fear]",
    "😖": "[confusion]", "😣": "[confusion]", "😞": "[confusion]",
    "😤": "[anger]", "😡": "[anger]", "😠": "[anger]", "🤬": "[anger]", "😈": "[mischievous]", "👿": "[mischievous]"
}

def load_abbreviations(path):
    with open(path, "r", encoding="utf-8") as f:
        return json.load(f)

###################################
# MAIN
###################################
if __name__ == "__main__":
    file_path = "train.xlsx"
    abbreviations_path = "abbreviations.json"
    output_path = "processed_phobert.xlsx"

    abbreviations = load_abbreviations(abbreviations_path)

    df = pd.read_excel(file_path)
    if "Sentence" not in df.columns or "Emotion" not in df.columns:
        raise ValueError("Dataset phải chứa cột 'Sentence' và 'Emotion'!")

    # Preprocessing
    df["processed_sentence"] = df["Sentence"].apply(
        lambda x: preprocess_sentence(str(x), abbreviations, emoji_mapping)
    )
    # Drop empty rows
    df = df[df["processed_sentence"].str.strip().astype(bool)]

    print("Trước khi cân bằng:")
    print(df["Emotion"].value_counts())

    # =========== BALANCE ALL CLASSES =============
    # Size of the largest class
    max_count = df["Emotion"].value_counts().max()

    df_balanced_list = []
    for emo in df["Emotion"].unique():
        df_emo = df[df["Emotion"] == emo]
        if len(df_emo) < max_count:
            # Oversample up to max_count
            df_emo_oversampled = resample(
                df_emo,
                replace=True,
                n_samples=max_count,
                random_state=42
            )
            df_balanced_list.append(df_emo_oversampled)
        else:
            # Already at max_count, keep as-is
            df_balanced_list.append(df_emo)

    df = pd.concat(df_balanced_list, axis=0)
    df = df.sample(frac=1, random_state=42).reset_index(drop=True)

    print("\nSau khi cân bằng tất cả lớp:")
    print(df["Emotion"].value_counts())

    df.to_excel(output_path, index=False)

    # Build label2id and id2label in the fixed order provided below
    custom_id2label = {
        0: 'Anger',
        1: 'Disgust',
        2: 'Enjoyment',
        3: 'Fear',
        4: 'Other',
        5: 'Sadness',
        6: 'Surprise'
    }
    label2id = {label: idx for idx, label in enumerate(custom_id2label.values())}
    id2label = {v: k for k, v in label2id.items()}

    df["label_id"] = df["Emotion"].map(label2id)

    # Train/test split
    train_df, test_df = train_test_split(df, test_size=0.2, random_state=42, stratify=df["label_id"])
    print(f"Train size = {len(train_df)}, Test size = {len(test_df)}")

    # Feature extraction with Tokenizer and padding
    tokenizer = Tokenizer(num_words=5000, oov_token="<OOV>")
    tokenizer.fit_on_texts(train_df["processed_sentence"])

    X_train_seq = tokenizer.texts_to_sequences(train_df["processed_sentence"])
    X_test_seq = tokenizer.texts_to_sequences(test_df["processed_sentence"])

    max_length = 256
    X_train = pad_sequences(X_train_seq, maxlen=max_length, padding='post', truncating='post')
    X_test = pad_sequences(X_test_seq, maxlen=max_length, padding='post', truncating='post')

    y_train = train_df["label_id"].values
    y_test = test_df["label_id"].values

    # Convert labels to one-hot encoding
    num_classes = len(custom_id2label)
    y_train = tf.keras.utils.to_categorical(y_train, num_classes=num_classes)
    y_test = tf.keras.utils.to_categorical(y_test, num_classes=num_classes)

    # Build the LSTM model
    model = Sequential([
        Embedding(input_dim=5000, output_dim=128, input_length=max_length),
        LSTM(128, dropout=0.2, recurrent_dropout=0.2),
        Dense(64, activation='relu'),
        Dropout(0.5),
        Dense(num_classes, activation='softmax')
    ])

    model.compile(loss='categorical_crossentropy',
                  optimizer='adam',
                  metrics=['accuracy'])

    model.summary()

    # Train the model
    early_stop = EarlyStopping(monitor='val_accuracy', patience=3, restore_best_weights=True)

    history = model.fit(
        X_train, y_train,
        epochs=10,
        batch_size=32,
        validation_data=(X_test, y_test),
        callbacks=[early_stop],
        verbose=1
    )

    # Evaluate the model
    print("\n========== Evaluate on Test set ==========")
    loss, accuracy = model.evaluate(X_test, y_test, verbose=0)
    print(f"Test Accuracy: {accuracy:.4f}")

    # Predict and print the classification report
    y_pred_probs = model.predict(X_test)
    y_pred = np.argmax(y_pred_probs, axis=1)
    y_true = np.argmax(y_test, axis=1)

    # Print the classification report
    print("\nClassification Report:")
    report = classification_report(y_true, y_pred, target_names=custom_id2label.values())
    print(report)

    # Compute and print the confusion matrix
    conf_matrix = confusion_matrix(y_true, y_pred)
    print("\nConfusion Matrix:")
    print(conf_matrix)

    # Plot the confusion matrix
    # (create the output directory first so savefig does not fail when
    # "lstm_emotion_model" does not exist yet)
    os.makedirs("lstm_emotion_model", exist_ok=True)
    plt.figure(figsize=(10, 8))
    sns.heatmap(conf_matrix, annot=True, fmt='d', cmap='Blues',
                xticklabels=custom_id2label.values(),
                yticklabels=custom_id2label.values())
    plt.ylabel('Actual')
    plt.xlabel('Predicted')
    plt.title('Confusion Matrix')
    plt.tight_layout()
    plt.savefig(os.path.join("lstm_emotion_model", "confusion_matrix.png"))
    plt.close()
    print("\nConfusion Matrix plot saved to 'lstm_emotion_model/confusion_matrix.png'")

    # Save the classification report to a file
    report_path = os.path.join("lstm_emotion_model", "classification_report.txt")
    with open(report_path, "w", encoding="utf-8") as f:
        f.write("========== Classification Report ==========\n")
        f.write(report)
        f.write("\n========== Confusion Matrix ==========\n")
        f.write(np.array2string(conf_matrix))

    print(f"\nClassification Report saved to '{report_path}'")

    # Save the model and tokenizer
    model_output_dir = "./lstm_emotion_model"
    os.makedirs(model_output_dir, exist_ok=True)
    model.save(os.path.join(model_output_dir, "lstm_emotion_model.h5"))
    joblib.dump(tokenizer, os.path.join(model_output_dir, "tokenizer.joblib"))
    with open(os.path.join(model_output_dir, "id2label.json"), "w", encoding="utf-8") as f:
        json.dump(id2label, f, ensure_ascii=False, indent=4)

    print("\n========== Model and Tokenizer saved ==========")

    # Predict a single sentence (example)
    def predict_text(text):
        text_proc = preprocess_sentence(text, abbreviations, emoji_mapping)
        seq = tokenizer.texts_to_sequences([text_proc])
        padded = pad_sequences(seq, maxlen=max_length, padding='post', truncating='post')
        pred_prob = model.predict(padded)
        pred_id = np.argmax(pred_prob, axis=1)[0]
        label = custom_id2label[pred_id]
        return label

    custom_text = "Tôi rất vui khi sử dụng dịch vụ này!"
    emotion_pred = predict_text(custom_text)
    print("\nCâu ví dụ:", custom_text)
    print("Dự đoán cảm xúc:", emotion_pred)

    print("\nHoàn thành demo LSTM với cân bằng dữ liệu & nhiều epoch hơn!")
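    # --- Illustrative sketch (not part of the original script) ---
    # Reloading the artifacts saved above for later inference would look
    # roughly like this (kept as a comment so the demo flow is unchanged):
    #   loaded_model = tf.keras.models.load_model("lstm_emotion_model/lstm_emotion_model.h5")
    #   loaded_tokenizer = joblib.load("lstm_emotion_model/tokenizer.joblib")
    #   with open("lstm_emotion_model/id2label.json", encoding="utf-8") as f:
    #       loaded_id2label = json.load(f)
    #   # then: preprocess, tokenize, pad to max_length, predict, and map the
    #   # argmax through loaded_id2label (keys are strings after JSON round-trip)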
main_phobert.py ADDED
@@ -0,0 +1,349 @@
# phobert_emotion_balanced.py
# -*- coding: utf-8 -*-

import re
import emoji
import json
import pandas as pd
import torch
import numpy as np
import os
import matplotlib.pyplot as plt
import seaborn as sns

from transformers import (
    AutoTokenizer,
    AutoConfig,
    AutoModelForSequenceClassification,
    Trainer,
    TrainingArguments
)

from sklearn.model_selection import train_test_split
from sklearn.utils import resample
from sklearn.metrics import precision_score, recall_score, f1_score, accuracy_score, classification_report, confusion_matrix

########################
# PREPROCESSING
########################

def replace_emojis(sentence, emoji_mapping):
    processed_sentence = []
    for char in sentence:
        if char in emoji_mapping:
            processed_sentence.append(emoji_mapping[char])
        elif not emoji.is_emoji(char):
            processed_sentence.append(char)
    return ''.join(processed_sentence)

def remove_profanity(sentence):
    profane_words = ["loz", "vloz", "vl", "dm", "đm", "clgt", "dmm", "cc", "vc", "đù mé", "vãi"]
    words = sentence.split()
    filtered = [w for w in words if w.lower() not in profane_words]
    return ' '.join(filtered)

def remove_special_characters(sentence):
    return re.sub(r"[\^\*@#&$%<>~{}|\\]", "", sentence)

def normalize_whitespace(sentence):
    return ' '.join(sentence.split())

def remove_repeated_characters(sentence):
    return re.sub(r"(.)\1{2,}", r"\1", sentence)

def replace_numbers(sentence):
    return re.sub(r"\d+", "[number]", sentence)

def tokenize_underthesea(sentence):
    from underthesea import word_tokenize
    tokens = word_tokenize(sentence)
    return " ".join(tokens)

def preprocess_sentence(sentence, abbreviations, emoji_mapping):
    sentence = sentence.lower()
    sentence = replace_emojis(sentence, emoji_mapping)
    sentence = remove_profanity(sentence)
    sentence = remove_special_characters(sentence)
    sentence = normalize_whitespace(sentence)
    # Expand abbreviations
    words = sentence.split()
    replaced = []
    for w in words:
        if w in abbreviations:
            replaced.append(" ".join(abbreviations[w]))
        else:
            replaced.append(w)
    sentence = " ".join(replaced)
    sentence = remove_repeated_characters(sentence)
    sentence = replace_numbers(sentence)
    # Tokenize
    sentence = tokenize_underthesea(sentence)
    return sentence

emoji_mapping = {
    "😀": "[joy]", "😃": "[joy]", "😄": "[joy]", "😁": "[joy]", "😆": "[joy]", "😅": "[joy]", "😂": "[joy]", "🤣": "[joy]",
    "🙂": "[love]", "🙃": "[love]", "😉": "[love]", "😊": "[love]", "😇": "[love]", "🥰": "[love]", "😍": "[love]",
    "🤩": "[love]", "😘": "[love]", "😗": "[love]", "☺": "[love]", "😚": "[love]", "😙": "[love]",
    "😋": "[satisfaction]", "😛": "[satisfaction]", "😜": "[satisfaction]", "🤪": "[satisfaction]", "😝": "[satisfaction]",
    "🤑": "[satisfaction]",
    "🤐": "[neutral]", "🤨": "[neutral]", "😐": "[neutral]", "😑": "[neutral]", "😶": "[neutral]",
    "😏": "[sarcasm]",
    "😒": "[disappointment]", "🙄": "[disappointment]", "😬": "[disappointment]",
    "😔": "[sadness]", "😪": "[sadness]", "😢": "[sadness]", "😭": "[sadness]", "😥": "[sadness]", "😓": "[sadness]",
    "😩": "[tiredness]", "😫": "[tiredness]", "🥱": "[tiredness]",
    "🤤": "[discomfort]", "🤢": "[discomfort]", "🤮": "[discomfort]", "🤧": "[discomfort]", "🥵": "[discomfort]",
    "🥶": "[discomfort]", "🥴": "[discomfort]", "😵": "[discomfort]", "🤯": "[discomfort]",
    "😕": "[confused]", "😟": "[confused]", "🙁": "[confused]", "☹": "[confused]",
    "😮": "[surprise]", "😯": "[surprise]", "😲": "[surprise]", "😳": "[surprise]", "🥺": "[pleading]",
    "😦": "[fear]", "😧": "[fear]", "😨": "[fear]", "😰": "[fear]", "😱": "[fear]",
    "😖": "[confusion]", "😣": "[confusion]", "😞": "[confusion]",
    "😤": "[anger]", "😡": "[anger]", "😠": "[anger]", "🤬": "[anger]", "😈": "[mischievous]", "👿": "[mischievous]"
}

def load_abbreviations(path):
    with open(path, "r", encoding="utf-8") as f:
        return json.load(f)

# HF-style Dataset wrapper
class PhoBertEmotionDataset(torch.utils.data.Dataset):
    def __init__(self, encodings, labels):
        self.encodings = encodings
        self.labels = labels

    def __len__(self):
        return len(self.labels)

    def __getitem__(self, idx):
        item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}
        item["labels"] = torch.tensor(self.labels[idx])
        return item

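# --- Illustrative sketch (not part of the original script) ---
# How tokenizer output feeds PhoBertEmotionDataset: each item is a dict of
# encoded tensors plus a "labels" tensor. The example text and label id are
# assumptions, and calling this requires downloading the PhoBERT tokenizer.
def _demo_dataset_item():
    demo_tokenizer = AutoTokenizer.from_pretrained("vinai/phobert-base")
    demo_enc = demo_tokenizer(["dịch_vụ rất tốt"], padding=True, truncation=True, max_length=256)
    demo_ds = PhoBertEmotionDataset(demo_enc, labels=[2])
    item = demo_ds[0]
    return sorted(item.keys())  # e.g. ['attention_mask', 'input_ids', 'labels']
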
###################################
|
| 122 |
+
# MAIN
|
| 123 |
+
###################################
|
| 124 |
+
if __name__ == "__main__":
|
| 125 |
+
file_path = "train.xlsx"
|
| 126 |
+
abbreviations_path = "abbreviations.json"
|
| 127 |
+
output_path = "processed_phobert.xlsx"
|
| 128 |
+
|
| 129 |
+
abbreviations = load_abbreviations(abbreviations_path)
|
| 130 |
+
|
| 131 |
+
df = pd.read_excel(file_path)
|
| 132 |
+
if "Sentence" not in df.columns or "Emotion" not in df.columns:
|
| 133 |
+
raise ValueError("Dataset phải chứa cột 'Sentence' và 'Emotion'!")
|
| 134 |
+
|
| 135 |
+
# Preprocessing
|
| 136 |
+
df["processed_sentence"] = df["Sentence"].apply(
|
| 137 |
+
lambda x: preprocess_sentence(str(x), abbreviations, emoji_mapping)
|
| 138 |
+
)
|
| 139 |
+
# Drop empty rows
|
| 140 |
+
df = df[df["processed_sentence"].str.strip().astype(bool)]
|
| 141 |
+
|
| 142 |
+
print("Trước khi cân bằng:")
|
| 143 |
+
print(df["Emotion"].value_counts())
|
| 144 |
+
|
| 145 |
+
# =========== BALANCE ALL CLASSES =============
|
| 146 |
+
# Get the largest class count
|
| 147 |
+
max_count = df["Emotion"].value_counts().max()
|
| 148 |
+
|
| 149 |
+
df_balanced_list = []
|
| 150 |
+
for emo in df["Emotion"].unique():
|
| 151 |
+
df_emo = df[df["Emotion"] == emo]
|
| 152 |
+
if len(df_emo) < max_count:
|
| 153 |
+
# Oversample up to max_count
|
| 154 |
+
df_emo_oversampled = resample(
|
| 155 |
+
df_emo,
|
| 156 |
+
replace=True,
|
| 157 |
+
n_samples=max_count,
|
| 158 |
+
random_state=42
|
| 159 |
+
)
|
| 160 |
+
df_balanced_list.append(df_emo_oversampled)
|
| 161 |
+
else:
|
| 162 |
+
# This class already has max_count samples, keep it as is
|
| 163 |
+
df_balanced_list.append(df_emo)
|
| 164 |
+
|
| 165 |
+
df = pd.concat(df_balanced_list, axis=0)
|
| 166 |
+
df = df.sample(frac=1, random_state=42).reset_index(drop=True)
|
| 167 |
+
|
| 168 |
+
print("\nSau khi cân bằng tất cả lớp:")
|
| 169 |
+
print(df["Emotion"].value_counts())
|
| 170 |
+
|
| 171 |
+
df.to_excel(output_path, index=False)
|
| 172 |
+
|
| 173 |
+
# Build label2id
|
| 174 |
+
unique_labels = sorted(df["Emotion"].unique())  # sort for a stable ordering
|
| 175 |
+
label2id = {label: i for i, label in enumerate(unique_labels)}
|
| 176 |
+
id2label = {v: k for k, v in label2id.items()}
|
| 177 |
+
|
| 178 |
+
df["label_id"] = df["Emotion"].map(label2id)
|
| 179 |
+
|
| 180 |
+
# Train/test split
|
| 181 |
+
train_df, test_df = train_test_split(df, test_size=0.2, random_state=42, stratify=df["label_id"])
|
| 182 |
+
|
| 183 |
+
print(f"Train size = {len(train_df)}, Test size = {len(test_df)}")
|
| 184 |
+
|
| 185 |
+
# Load tokenizer
|
| 186 |
+
checkpoint = "vinai/phobert-base"
|
| 187 |
+
tokenizer = AutoTokenizer.from_pretrained(checkpoint)
|
| 188 |
+
|
| 189 |
+
def tokenize_texts(texts):
|
| 190 |
+
return tokenizer(
|
| 191 |
+
texts,
|
| 192 |
+
padding=True,
|
| 193 |
+
truncation=True,
|
| 194 |
+
max_length=256
|
| 195 |
+
)
|
| 196 |
+
|
| 197 |
+
train_texts = train_df["processed_sentence"].tolist()
|
| 198 |
+
train_labels = train_df["label_id"].tolist()
|
| 199 |
+
test_texts = test_df["processed_sentence"].tolist()
|
| 200 |
+
test_labels = test_df["label_id"].tolist()
|
| 201 |
+
|
| 202 |
+
train_encodings = tokenize_texts(train_texts)
|
| 203 |
+
test_encodings = tokenize_texts(test_texts)
|
| 204 |
+
|
| 205 |
+
train_dataset = PhoBertEmotionDataset(train_encodings, train_labels)
|
| 206 |
+
test_dataset = PhoBertEmotionDataset(test_encodings, test_labels)
|
| 207 |
+
|
| 208 |
+
# Load model
|
| 209 |
+
config = AutoConfig.from_pretrained(checkpoint)
|
| 210 |
+
config.num_labels = len(label2id)
|
| 211 |
+
model = AutoModelForSequenceClassification.from_pretrained(
|
| 212 |
+
checkpoint,
|
| 213 |
+
config=config
|
| 214 |
+
)
|
| 215 |
+
|
| 216 |
+
# More epochs (10), LR = 2e-5
|
| 217 |
+
training_args = TrainingArguments(
|
| 218 |
+
output_dir="./phobert_results_v2",
|
| 219 |
+
overwrite_output_dir=True,
|
| 220 |
+
do_train=True,
|
| 221 |
+
do_eval=True,
|
| 222 |
+
evaluation_strategy="epoch",
|
| 223 |
+
save_strategy="epoch",
|
| 224 |
+
num_train_epochs=10,  # more epochs
|
| 225 |
+
per_device_train_batch_size=16,
|
| 226 |
+
per_device_eval_batch_size=16,
|
| 227 |
+
learning_rate=2e-5,
|
| 228 |
+
logging_dir="./logs",
|
| 229 |
+
logging_steps=50,
|
| 230 |
+
load_best_model_at_end=True,
|
| 231 |
+
metric_for_best_model="f1_weighted",  # metric used to keep the best model
|
| 232 |
+
greater_is_better=True,
|
| 233 |
+
seed=42
|
| 234 |
+
)
|
| 235 |
+
|
| 236 |
+
# Define compute_metrics with additional metrics
|
| 237 |
+
def compute_metrics(eval_pred):
|
| 238 |
+
logits, labels = eval_pred
|
| 239 |
+
preds = np.argmax(logits, axis=-1)
|
| 240 |
+
precision_weighted = precision_score(labels, preds, average='weighted', zero_division=0)
|
| 241 |
+
recall_weighted = recall_score(labels, preds, average='weighted', zero_division=0)
|
| 242 |
+
f1_weighted = f1_score(labels, preds, average='weighted', zero_division=0)
|
| 243 |
+
precision_macro = precision_score(labels, preds, average='macro', zero_division=0)
|
| 244 |
+
recall_macro = recall_score(labels, preds, average='macro', zero_division=0)
|
| 245 |
+
f1_macro = f1_score(labels, preds, average='macro', zero_division=0)
|
| 246 |
+
accuracy = accuracy_score(labels, preds)
|
| 247 |
+
return {
|
| 248 |
+
"accuracy": accuracy,
|
| 249 |
+
"precision_weighted": precision_weighted,
|
| 250 |
+
"recall_weighted": recall_weighted,
|
| 251 |
+
"f1_weighted": f1_weighted,
|
| 252 |
+
"precision_macro": precision_macro,
|
| 253 |
+
"recall_macro": recall_macro,
|
| 254 |
+
"f1_macro": f1_macro
|
| 255 |
+
}
|
| 256 |
+
|
| 257 |
+
trainer = Trainer(
|
| 258 |
+
model=model,
|
| 259 |
+
args=training_args,
|
| 260 |
+
train_dataset=train_dataset,
|
| 261 |
+
eval_dataset=test_dataset,
|
| 262 |
+
tokenizer=tokenizer,
|
| 263 |
+
compute_metrics=compute_metrics
|
| 264 |
+
)
|
| 265 |
+
|
| 266 |
+
print("\n========== Training PhoBERT (balanced, more epochs) ==========")
|
| 267 |
+
trainer.train()
|
| 268 |
+
|
| 269 |
+
print("\n========== Evaluate on Test set ==========")
|
| 270 |
+
results = trainer.evaluate(test_dataset)
|
| 271 |
+
print("Test results:", results)
|
| 272 |
+
|
| 273 |
+
# Extract additional metrics
|
| 274 |
+
print("\n========== Additional Metrics ==========")
|
| 275 |
+
print(f"Test Loss: {results.get('eval_loss'):.4f}")
|
| 276 |
+
print(f"Test Accuracy: {results.get('eval_accuracy'):.4f}")
|
| 277 |
+
print(f"Precision (Macro): {results.get('eval_precision_macro'):.4f}")
|
| 278 |
+
print(f"Precision (Weighted): {results.get('eval_precision_weighted'):.4f}")
|
| 279 |
+
print(f"Recall (Macro): {results.get('eval_recall_macro'):.4f}")
|
| 280 |
+
print(f"Recall (Weighted): {results.get('eval_recall_weighted'):.4f}")
|
| 281 |
+
print(f"F1-Score (Macro): {results.get('eval_f1_macro'):.4f}")
|
| 282 |
+
print(f"F1-Score (Weighted): {results.get('eval_f1_weighted'):.4f}")
|
| 283 |
+
|
| 284 |
+
# Generate detailed classification report
|
| 285 |
+
print("\n========== Detailed Classification Report ==========")
|
| 286 |
+
predictions, labels, _ = trainer.predict(test_dataset)
|
| 287 |
+
preds = np.argmax(predictions, axis=1)
|
| 288 |
+
report = classification_report(labels, preds, target_names=unique_labels, digits=4)
|
| 289 |
+
print(report)
|
| 290 |
+
|
| 291 |
+
# Compute the confusion matrix
|
| 292 |
+
conf_matrix = confusion_matrix(labels, preds)
|
| 293 |
+
print("\nConfusion Matrix:")
|
| 294 |
+
print(conf_matrix)
|
| 295 |
+
|
| 296 |
+
# Plot the confusion matrix
|
| 297 |
+
plt.figure(figsize=(10, 8))
|
| 298 |
+
sns.heatmap(conf_matrix, annot=True, fmt='d', cmap='Blues',
|
| 299 |
+
xticklabels=unique_labels,
|
| 300 |
+
yticklabels=unique_labels)
|
| 301 |
+
plt.ylabel('Actual')
|
| 302 |
+
plt.xlabel('Predicted')
|
| 303 |
+
plt.title('Confusion Matrix')
|
| 304 |
+
plt.tight_layout()
|
| 305 |
+
confusion_matrix_path = os.path.join("phobert_emotion_model", "confusion_matrix.png")
|
| 306 |
+
os.makedirs("phobert_emotion_model", exist_ok=True)
|
| 307 |
+
plt.savefig(confusion_matrix_path)
|
| 308 |
+
plt.close()
|
| 309 |
+
print(f"\nConfusion Matrix plot saved to '{confusion_matrix_path}'")
|
| 310 |
+
|
| 311 |
+
# Save the classification report to a file
|
| 312 |
+
report_path = os.path.join("phobert_emotion_model", "classification_report.txt")
|
| 313 |
+
with open(report_path, "w", encoding="utf-8") as f:
|
| 314 |
+
f.write("========== Classification Report ==========\n")
|
| 315 |
+
f.write(report)
|
| 316 |
+
f.write("\n========== Confusion Matrix ==========\n")
|
| 317 |
+
f.write(np.array2string(conf_matrix))
|
| 318 |
+
|
| 319 |
+
print(f"\nClassification Report saved to '{report_path}'")
|
| 320 |
+
|
| 321 |
+
# Save the model and tokenizer
|
| 322 |
+
model_output_dir = "./phobert_emotion_model"
|
| 323 |
+
os.makedirs(model_output_dir, exist_ok=True)
|
| 324 |
+
model.save_pretrained(os.path.join(model_output_dir, "phobert_emotion_model"))
|
| 325 |
+
tokenizer.save_pretrained(os.path.join(model_output_dir, "phobert_emotion_model"))
|
| 326 |
+
with open(os.path.join(model_output_dir, "id2label.json"), "w", encoding="utf-8") as f:
|
| 327 |
+
json.dump(id2label, f, ensure_ascii=False, indent=4)
|
| 328 |
+
|
| 329 |
+
print("\n========== Model and Tokenizer saved ==========")
|
| 330 |
+
|
| 331 |
+
# Predict one example sentence
|
| 332 |
+
device = "cuda" if torch.cuda.is_available() else "cpu"
|
| 333 |
+
model.to(device)
|
| 334 |
+
|
| 335 |
+
def predict_text(text):
|
| 336 |
+
text_proc = preprocess_sentence(text, abbreviations, emoji_mapping)
|
| 337 |
+
enc = tokenizer(text_proc, padding=True, truncation=True, max_length=256, return_tensors="pt")
|
| 338 |
+
enc = {k: v.to(device) for k, v in enc.items()}
|
| 339 |
+
with torch.no_grad():
|
| 340 |
+
out = model(**enc)
|
| 341 |
+
pred_id = out.logits.argmax(dim=-1).item()
|
| 342 |
+
return id2label[pred_id]
|
| 343 |
+
|
| 344 |
+
custom_text = "Tôi rất vui khi sử dụng dịch vụ này!"
|
| 345 |
+
emotion_pred = predict_text(custom_text)
|
| 346 |
+
print("\nCâu ví dụ:", custom_text)
|
| 347 |
+
print("Dự đoán cảm xúc:", emotion_pred)
|
| 348 |
+
|
| 349 |
+
print("\nHoàn thành demo PhoBERT với cân bằng dữ liệu & nhiều epoch hơn!")
|
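A minimal reload sketch (not part of main_phobert.py above), assuming the save paths the script writes to; note it skips the training-time preprocess_sentence step, so real inputs should be preprocessed the same way before tokenization.

import json
import torch
from transformers import AutoTokenizer, AutoModelForSequenceClassification

model_dir = "phobert_emotion_model/phobert_emotion_model"   # written by save_pretrained above
tokenizer = AutoTokenizer.from_pretrained(model_dir)
model = AutoModelForSequenceClassification.from_pretrained(model_dir).eval()

with open("phobert_emotion_model/id2label.json", encoding="utf-8") as f:
    id2label = {int(k): v for k, v in json.load(f).items()}  # JSON keys come back as strings

def predict(text: str) -> str:
    enc = tokenizer(text, truncation=True, max_length=256, return_tensors="pt")
    with torch.no_grad():
        logits = model(**enc).logits
    return id2label[logits.argmax(dim=-1).item()]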
main_svm.py
ADDED
|
@@ -0,0 +1,261 @@
|
| 1 |
+
# svm_emotion_classifier.py
|
| 2 |
+
# -*- coding: utf-8 -*-
|
| 3 |
+
|
| 4 |
+
import re
|
| 5 |
+
import emoji
|
| 6 |
+
import json
|
| 7 |
+
import pandas as pd
|
| 8 |
+
import numpy as np
|
| 9 |
+
import torch  # not strictly needed for SVM, kept in case it is useful later
|
| 10 |
+
from underthesea import word_tokenize
|
| 11 |
+
from sklearn.feature_extraction.text import TfidfVectorizer
|
| 12 |
+
from sklearn.svm import SVC
|
| 13 |
+
from sklearn.model_selection import train_test_split
|
| 14 |
+
from sklearn.metrics import (
|
| 15 |
+
accuracy_score,
|
| 16 |
+
classification_report,
|
| 17 |
+
precision_score,
|
| 18 |
+
recall_score,
|
| 19 |
+
f1_score,
|
| 20 |
+
confusion_matrix
|
| 21 |
+
)
|
| 22 |
+
from sklearn.utils import resample
|
| 23 |
+
import joblib
|
| 24 |
+
import os
|
| 25 |
+
|
| 26 |
+
########################
|
| 27 |
+
# PREPROCESSING
|
| 28 |
+
########################
|
| 29 |
+
|
| 30 |
+
def replace_emojis(sentence, emoji_mapping):
|
| 31 |
+
processed_sentence = []
|
| 32 |
+
for char in sentence:
|
| 33 |
+
if char in emoji_mapping:
|
| 34 |
+
processed_sentence.append(emoji_mapping[char])
|
| 35 |
+
elif not emoji.is_emoji(char):
|
| 36 |
+
processed_sentence.append(char)
|
| 37 |
+
return ''.join(processed_sentence)
|
| 38 |
+
|
| 39 |
+
def remove_profanity(sentence):
|
| 40 |
+
profane_words = ["loz", "vloz", "vl", "dm", "đm", "clgt", "dmm", "cc", "vc", "đù mé", "vãi"]
|
| 41 |
+
words = sentence.split()
|
| 42 |
+
filtered = [w for w in words if w.lower() not in profane_words]
|
| 43 |
+
return ' '.join(filtered)
|
| 44 |
+
|
| 45 |
+
def remove_special_characters(sentence):
|
| 46 |
+
return re.sub(r"[\^\*@#&$%<>~{}|\\]", "", sentence)
|
| 47 |
+
|
| 48 |
+
def normalize_whitespace(sentence):
|
| 49 |
+
return ' '.join(sentence.split())
|
| 50 |
+
|
| 51 |
+
def remove_repeated_characters(sentence):
|
| 52 |
+
return re.sub(r"(.)\1{2,}", r"\1", sentence)
|
| 53 |
+
|
| 54 |
+
def replace_numbers(sentence):
|
| 55 |
+
return re.sub(r"\d+", "[number]", sentence)
|
| 56 |
+
|
| 57 |
+
def tokenize_underthesea(sentence):
|
| 58 |
+
tokens = word_tokenize(sentence)
|
| 59 |
+
return " ".join(tokens)
|
| 60 |
+
|
| 61 |
+
def preprocess_sentence(sentence, abbreviations, emoji_mapping):
|
| 62 |
+
sentence = sentence.lower()
|
| 63 |
+
sentence = replace_emojis(sentence, emoji_mapping)
|
| 64 |
+
sentence = remove_profanity(sentence)
|
| 65 |
+
sentence = remove_special_characters(sentence)
|
| 66 |
+
sentence = normalize_whitespace(sentence)
|
| 67 |
+
# Replace abbreviations
|
| 68 |
+
words = sentence.split()
|
| 69 |
+
replaced = []
|
| 70 |
+
for w in words:
|
| 71 |
+
if w in abbreviations:
|
| 72 |
+
replaced.append(" ".join(abbreviations[w]))
|
| 73 |
+
else:
|
| 74 |
+
replaced.append(w)
|
| 75 |
+
sentence = " ".join(replaced)
|
| 76 |
+
sentence = remove_repeated_characters(sentence)
|
| 77 |
+
sentence = replace_numbers(sentence)
|
| 78 |
+
# Vietnamese word segmentation
|
| 79 |
+
sentence = tokenize_underthesea(sentence)
|
| 80 |
+
return sentence
|
| 81 |
+
|
| 82 |
+
emoji_mapping = {
|
| 83 |
+
"😀": "[joy]", "😃": "[joy]", "😄": "[joy]", "😁": "[joy]", "😆": "[joy]", "😅": "[joy]", "😂": "[joy]", "🤣": "[joy]",
|
| 84 |
+
"🙂": "[love]", "🙃": "[love]", "😉": "[love]", "😊": "[love]", "😇": "[love]", "🥰": "[love]", "😍": "[love]",
|
| 85 |
+
"🤩": "[love]", "😘": "[love]", "😗": "[love]", "☺": "[love]", "😚": "[love]", "😙": "[love]",
|
| 86 |
+
"😋": "[satisfaction]", "😛": "[satisfaction]", "😜": "[satisfaction]", "🤪": "[satisfaction]", "😝": "[satisfaction]",
|
| 87 |
+
"🤑": "[satisfaction]",
|
| 88 |
+
"🤐": "[neutral]", "🤨": "[neutral]", "😐": "[neutral]", "😑": "[neutral]", "😶": "[neutral]",
|
| 89 |
+
"😏": "[sarcasm]",
|
| 90 |
+
"😒": "[disappointment]", "🙄": "[disappointment]", "😬": "[disappointment]",
|
| 91 |
+
"😔": "[sadness]", "😪": "[sadness]", "😢": "[sadness]", "😭": "[sadness]", "😥": "[sadness]", "😓": "[sadness]",
|
| 92 |
+
"😩": "[tiredness]", "😫": "[tiredness]", "🥱": "[tiredness]",
|
| 93 |
+
"🤤": "[discomfort]", "🤢": "[discomfort]", "🤮": "[discomfort]", "🤧": "[discomfort]", "🥵": "[discomfort]",
|
| 94 |
+
"🥶": "[discomfort]", "🥴": "[discomfort]", "😵": "[discomfort]", "🤯": "[discomfort]",
|
| 95 |
+
"😕": "[confused]", "😟": "[confused]", "🙁": "[confused]", "☹": "[confused]",
|
| 96 |
+
"😮": "[surprise]", "😯": "[surprise]", "😲": "[surprise]", "😳": "[surprise]", "🥺": "[pleading]",
|
| 97 |
+
"😦": "[fear]", "😧": "[fear]", "😨": "[fear]", "😰": "[fear]", "😱": "[fear]",
|
| 98 |
+
"😖": "[confusion]", "😣": "[confusion]", "😞": "[confusion]",
|
| 99 |
+
"😤": "[anger]", "😡": "[anger]", "😠": "[anger]", "🤬": "[anger]", "😈": "[mischievous]", "👿": "[mischievous]"
|
| 100 |
+
}
|
| 101 |
+
|
| 102 |
+
def load_abbreviations(path):
|
| 103 |
+
with open(path, "r", encoding="utf-8") as f:
|
| 104 |
+
return json.load(f)
|
| 105 |
+
|
| 106 |
+
###################################
|
| 107 |
+
# MAIN
|
| 108 |
+
###################################
|
| 109 |
+
if __name__ == "__main__":
|
| 110 |
+
file_path = "train.xlsx"
|
| 111 |
+
abbreviations_path = "abbreviations.json"
|
| 112 |
+
output_path = "processed_svm.xlsx" # Changed output filename to reflect SVM
|
| 113 |
+
|
| 114 |
+
abbreviations = load_abbreviations(abbreviations_path)
|
| 115 |
+
|
| 116 |
+
df = pd.read_excel(file_path)
|
| 117 |
+
if "Sentence" not in df.columns or "Emotion" not in df.columns:
|
| 118 |
+
raise ValueError("Dataset phải chứa cột 'Sentence' và 'Emotion'!")
|
| 119 |
+
|
| 120 |
+
# Preprocessing
|
| 121 |
+
df["processed_sentence"] = df["Sentence"].apply(
|
| 122 |
+
lambda x: preprocess_sentence(str(x), abbreviations, emoji_mapping)
|
| 123 |
+
)
|
| 124 |
+
# Drop empty rows
|
| 125 |
+
df = df[df["processed_sentence"].str.strip().astype(bool)]
|
| 126 |
+
|
| 127 |
+
print("Trước khi cân bằng:")
|
| 128 |
+
print(df["Emotion"].value_counts())
|
| 129 |
+
|
| 130 |
+
# =========== BALANCE ALL CLASSES =============
|
| 131 |
+
# Get the largest class count
|
| 132 |
+
max_count = df["Emotion"].value_counts().max()
|
| 133 |
+
|
| 134 |
+
df_balanced_list = []
|
| 135 |
+
for emo in df["Emotion"].unique():
|
| 136 |
+
df_emo = df[df["Emotion"] == emo]
|
| 137 |
+
if len(df_emo) < max_count:
|
| 138 |
+
# Oversample up to max_count
|
| 139 |
+
df_emo_oversampled = resample(
|
| 140 |
+
df_emo,
|
| 141 |
+
replace=True,
|
| 142 |
+
n_samples=max_count,
|
| 143 |
+
random_state=42
|
| 144 |
+
)
|
| 145 |
+
df_balanced_list.append(df_emo_oversampled)
|
| 146 |
+
else:
|
| 147 |
+
# This class already has max_count samples, keep it as is
|
| 148 |
+
df_balanced_list.append(df_emo)
|
| 149 |
+
|
| 150 |
+
df = pd.concat(df_balanced_list, axis=0)
|
| 151 |
+
df = df.sample(frac=1, random_state=42).reset_index(drop=True)
|
| 152 |
+
|
| 153 |
+
print("\nSau khi cân bằng tất cả lớp:")
|
| 154 |
+
print(df["Emotion"].value_counts())
|
| 155 |
+
|
| 156 |
+
df.to_excel(output_path, index=False)
|
| 157 |
+
|
| 158 |
+
# Build label2id and id2label in the fixed order given below
|
| 159 |
+
custom_id2label = {
|
| 160 |
+
0: 'Anger',
|
| 161 |
+
1: 'Disgust',
|
| 162 |
+
2: 'Enjoyment',
|
| 163 |
+
3: 'Fear',
|
| 164 |
+
4: 'Other',
|
| 165 |
+
5: 'Sadness',
|
| 166 |
+
6: 'Surprise'
|
| 167 |
+
}
|
| 168 |
+
label2id = {label: idx for idx, label in custom_id2label.items()}
|
| 169 |
+
id2label = {v: k for k, v in label2id.items()}
|
| 170 |
+
|
| 171 |
+
df["label_id"] = df["Emotion"].map(label2id)
|
| 172 |
+
if df["label_id"].isnull().any():
|
| 173 |
+
missing = df[df["label_id"].isnull()]["Emotion"].unique()
|
| 174 |
+
raise ValueError(f"Những nhãn cảm xúc sau không có trong label2id: {missing}")
|
| 175 |
+
|
| 176 |
+
# Train/test split
|
| 177 |
+
train_df, test_df = train_test_split(df, test_size=0.2, random_state=42, stratify=df["label_id"])
|
| 178 |
+
|
| 179 |
+
print(f"Train size = {len(train_df)}, Test size = {len(test_df)}")
|
| 180 |
+
|
| 181 |
+
# Feature extraction with TF-IDF
|
| 182 |
+
vectorizer = TfidfVectorizer(max_features=5000)
|
| 183 |
+
X_train = vectorizer.fit_transform(train_df["processed_sentence"])
|
| 184 |
+
X_test = vectorizer.transform(test_df["processed_sentence"])
|
| 185 |
+
y_train = train_df["label_id"].values
|
| 186 |
+
y_test = test_df["label_id"].values
|
| 187 |
+
|
| 188 |
+
# Train the SVM model
|
| 189 |
+
svm_classifier = SVC(kernel='linear', probability=True, random_state=42)
|
| 190 |
+
print("\n========== Training SVM ==========")
|
| 191 |
+
svm_classifier.fit(X_train, y_train)
|
| 192 |
+
|
| 193 |
+
# Evaluate the model
|
| 194 |
+
print("\n========== Evaluate on Test set ==========")
|
| 195 |
+
y_pred = svm_classifier.predict(X_test)
|
| 196 |
+
|
| 197 |
+
# Compute metrics
|
| 198 |
+
accuracy = accuracy_score(y_test, y_pred)
|
| 199 |
+
precision_macro = precision_score(y_test, y_pred, average='macro', zero_division=0)
|
| 200 |
+
precision_weighted = precision_score(y_test, y_pred, average='weighted', zero_division=0)
|
| 201 |
+
recall_macro = recall_score(y_test, y_pred, average='macro', zero_division=0)
|
| 202 |
+
recall_weighted = recall_score(y_test, y_pred, average='weighted', zero_division=0)
|
| 203 |
+
f1_macro = f1_score(y_test, y_pred, average='macro', zero_division=0)
|
| 204 |
+
f1_weighted = f1_score(y_test, y_pred, average='weighted', zero_division=0)
|
| 205 |
+
conf_matrix = confusion_matrix(y_test, y_pred)
|
| 206 |
+
|
| 207 |
+
# Print metrics
|
| 208 |
+
print(f"Test Accuracy: {accuracy:.4f}")
|
| 209 |
+
print(f"Precision (Macro): {precision_macro:.4f}")
|
| 210 |
+
print(f"Precision (Weighted): {precision_weighted:.4f}")
|
| 211 |
+
print(f"Recall (Macro): {recall_macro:.4f}")
|
| 212 |
+
print(f"Recall (Weighted): {recall_weighted:.4f}")
|
| 213 |
+
print(f"F1-Score (Macro): {f1_macro:.4f}")
|
| 214 |
+
print(f"F1-Score (Weighted): {f1_weighted:.4f}")
|
| 215 |
+
|
| 216 |
+
print("\n========== Classification Report ==========")
|
| 217 |
+
report = classification_report(y_test, y_pred, target_names=custom_id2label.values(), digits=4)
|
| 218 |
+
print(report)
|
| 219 |
+
|
| 220 |
+
# Save the report to a file
|
| 221 |
+
report_path = os.path.join("svm_emotion_model", "classification_report.txt")
|
| 222 |
+
os.makedirs(os.path.dirname(report_path), exist_ok=True)
|
| 223 |
+
with open(report_path, "w", encoding="utf-8") as f:
|
| 224 |
+
f.write("========== Classification Report ==========\n")
|
| 225 |
+
f.write(report)
|
| 226 |
+
f.write("\n========== Additional Metrics ==========\n")
|
| 227 |
+
f.write(f"Accuracy: {accuracy:.4f}\n")
|
| 228 |
+
f.write(f"Precision (Macro): {precision_macro:.4f}\n")
|
| 229 |
+
f.write(f"Precision (Weighted): {precision_weighted:.4f}\n")
|
| 230 |
+
f.write(f"Recall (Macro): {recall_macro:.4f}\n")
|
| 231 |
+
f.write(f"Recall (Weighted): {recall_weighted:.4f}\n")
|
| 232 |
+
f.write(f"F1-Score (Macro): {f1_macro:.4f}\n")
|
| 233 |
+
f.write(f"F1-Score (Weighted): {f1_weighted:.4f}\n")
|
| 234 |
+
f.write("\n========== Confusion Matrix ==========\n")
|
| 235 |
+
f.write(np.array2string(conf_matrix))
|
| 236 |
+
|
| 237 |
+
print("\n========== Classification Report saved to 'svm_emotion_model/classification_report.txt' ==========")
|
| 238 |
+
|
| 239 |
+
# Save the model and the required components
|
| 240 |
+
model_output_dir = "./svm_emotion_model"
|
| 241 |
+
os.makedirs(model_output_dir, exist_ok=True)
|
| 242 |
+
joblib.dump(svm_classifier, os.path.join(model_output_dir, "svm_classifier.joblib"))
|
| 243 |
+
joblib.dump(vectorizer, os.path.join(model_output_dir, "tfidf_vectorizer.joblib"))
|
| 244 |
+
joblib.dump(id2label, os.path.join(model_output_dir, "id2label.json"))  # note: a joblib pickle despite the .json extension
|
| 245 |
+
|
| 246 |
+
print("\n========== Model and Vectorizer saved ==========")
|
| 247 |
+
|
| 248 |
+
# Predict one example sentence
|
| 249 |
+
def predict_text(text):
|
| 250 |
+
text_proc = preprocess_sentence(text, abbreviations, emoji_mapping)
|
| 251 |
+
X = vectorizer.transform([text_proc])
|
| 252 |
+
pred_id = svm_classifier.predict(X)[0]
|
| 253 |
+
label = custom_id2label[pred_id]
|
| 254 |
+
return label
|
| 255 |
+
|
| 256 |
+
custom_text = "Tôi rất vui khi sử dụng dịch vụ này!"
|
| 257 |
+
emotion_pred = predict_text(custom_text)
|
| 258 |
+
print("\nCâu ví dụ:", custom_text)
|
| 259 |
+
print("Dự đoán cảm xúc:", emotion_pred)
|
| 260 |
+
|
| 261 |
+
print("\nHoàn thành demo SVM với cân bằng dữ liệu & nhiều chỉ số đánh giá!")
|
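A minimal reload sketch for the SVM artifacts saved above (not part of main_svm.py); since id2label was written with joblib despite the .json filename, it is read back with joblib.load here.

import joblib

model_dir = "svm_emotion_model"
svm_classifier = joblib.load(f"{model_dir}/svm_classifier.joblib")
vectorizer = joblib.load(f"{model_dir}/tfidf_vectorizer.joblib")
id2label = joblib.load(f"{model_dir}/id2label.json")  # a joblib pickle, not real JSON

def predict(processed_text: str) -> str:
    # processed_text should have gone through the same preprocess_sentence pipeline as training
    X = vectorizer.transform([processed_text])
    return id2label[int(svm_classifier.predict(X)[0])]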
main_v1.py
ADDED
|
@@ -0,0 +1,494 @@
|
| 1 |
+
# thesis.py
|
| 2 |
+
# -*- coding: utf-8 -*-
|
| 3 |
+
|
| 4 |
+
import pandas as pd
|
| 5 |
+
import emoji
|
| 6 |
+
import json
|
| 7 |
+
import re
|
| 8 |
+
from underthesea import word_tokenize
|
| 9 |
+
from tqdm import tqdm
|
| 10 |
+
import torch
|
| 11 |
+
from torchtext.vocab import Vectors
|
| 12 |
+
from sklearn.model_selection import train_test_split
|
| 13 |
+
from tensorflow.keras.preprocessing.sequence import pad_sequences
|
| 14 |
+
from torch.utils.data import DataLoader, TensorDataset
|
| 15 |
+
import torch.nn as nn
|
| 16 |
+
import torch.optim as optim
|
| 17 |
+
import numpy as np
|
| 18 |
+
import tensorflow as tf
|
| 19 |
+
|
| 20 |
+
# ========== PREPROCESSING FUNCTIONS ==========
|
| 21 |
+
|
| 22 |
+
def preprocess_sentence(sentence, abbreviations, emoji_mapping):
|
| 23 |
+
"""
|
| 24 |
+
Preprocess one sentence: lowercase, replace emojis, remove profanity,
|
| 25 |
+
special characters, normalize whitespace, etc.
|
| 26 |
+
"""
|
| 27 |
+
sentence = sentence.lower()
|
| 28 |
+
sentence = replace_emojis(sentence, emoji_mapping)
|
| 29 |
+
sentence = remove_profanity(sentence)
|
| 30 |
+
sentence = remove_special_characters(sentence)
|
| 31 |
+
sentence = normalize_whitespace(sentence)
|
| 32 |
+
sentence = replace_abbreviations(sentence, abbreviations)
|
| 33 |
+
sentence = remove_repeated_characters(sentence)
|
| 34 |
+
sentence = replace_numbers(sentence)
|
| 35 |
+
sentence = tokenize_sentence(sentence)
|
| 36 |
+
return sentence
|
| 37 |
+
|
| 38 |
+
def replace_emojis(sentence, emoji_mapping):
|
| 39 |
+
processed_sentence = []
|
| 40 |
+
for char in sentence:
|
| 41 |
+
if char in emoji_mapping:
|
| 42 |
+
processed_sentence.append(emoji_mapping[char])
|
| 43 |
+
elif not emoji.is_emoji(char):
|
| 44 |
+
processed_sentence.append(char)
|
| 45 |
+
return ''.join(processed_sentence)
|
| 46 |
+
|
| 47 |
+
def remove_profanity(sentence):
|
| 48 |
+
profane_words = ["loz", "vloz", "vl", "dm", "đm", "clgt", "dmm", "cc", "vc", "đù mé", "vãi"]
|
| 49 |
+
words = sentence.split()
|
| 50 |
+
filtered_words = [word for word in words if word.lower() not in profane_words]
|
| 51 |
+
return ' '.join(filtered_words)
|
| 52 |
+
|
| 53 |
+
def remove_special_characters(sentence):
|
| 54 |
+
return re.sub(r"[\^\*@#&$%<>~{}|\\]", "", sentence)
|
| 55 |
+
|
| 56 |
+
def normalize_whitespace(sentence):
|
| 57 |
+
return ' '.join(sentence.split())
|
| 58 |
+
|
| 59 |
+
def replace_abbreviations(sentence, abbreviations):
|
| 60 |
+
words = sentence.split()
|
| 61 |
+
replaced_words = [
|
| 62 |
+
" ".join(abbreviations[word]) if word in abbreviations else word
|
| 63 |
+
for word in words
|
| 64 |
+
]
|
| 65 |
+
return ' '.join(replaced_words)
|
| 66 |
+
|
| 67 |
+
def remove_repeated_characters(sentence):
|
| 68 |
+
return re.sub(r"(.)\1{2,}", r"\1", sentence)
|
| 69 |
+
|
| 70 |
+
def replace_numbers(sentence):
|
| 71 |
+
return re.sub(r"\d+", "[number]", sentence)
|
| 72 |
+
|
| 73 |
+
def tokenize_sentence(sentence):
|
| 74 |
+
return ' '.join(word_tokenize(sentence))
|
| 75 |
+
|
| 76 |
+
|
| 77 |
+
# ========== DATA MANAGER CLASS ==========
|
| 78 |
+
|
| 79 |
+
class DataManager:
|
| 80 |
+
def __init__(self, file_path, abbreviations_path, word2vec_path):
|
| 81 |
+
self.file_path = file_path
|
| 82 |
+
self.abbreviations_path = abbreviations_path
|
| 83 |
+
self.word2vec_path = word2vec_path
|
| 84 |
+
self.load_abbreviations()
|
| 85 |
+
self.load_word2vec()
|
| 86 |
+
|
| 87 |
+
def load_abbreviations(self):
|
| 88 |
+
with open(self.abbreviations_path, "r", encoding="utf-8") as file:
|
| 89 |
+
self.abbreviations = json.load(file)
|
| 90 |
+
|
| 91 |
+
def load_word2vec(self):
|
| 92 |
+
# Load vectors from the word2vec file; unk_init gives out-of-vocabulary words random normal vectors
|
| 93 |
+
self.word_embeddings = Vectors(name=self.word2vec_path, unk_init=torch.Tensor.normal_)
|
| 94 |
+
self.vocabulary = self.create_vocab_from_word2vec()
|
| 95 |
+
|
| 96 |
+
def create_vocab_from_word2vec(self):
|
| 97 |
+
vocab = Vocabulary()
|
| 98 |
+
words_list = list(self.word_embeddings.stoi.keys())
|
| 99 |
+
for word in words_list:
|
| 100 |
+
vocab.add(word)
|
| 101 |
+
return vocab
|
| 102 |
+
|
| 103 |
+
def preprocess_data(self):
|
| 104 |
+
df = pd.read_excel(self.file_path)
|
| 105 |
+
if "Sentence" not in df.columns:
|
| 106 |
+
raise ValueError("Cột 'Sentence' không tồn tại trong dataset!")
|
| 107 |
+
|
| 108 |
+
# Preprocess each sentence
|
| 109 |
+
df["processed_sentence"] = df["Sentence"].apply(
|
| 110 |
+
lambda x: preprocess_sentence(str(x), self.abbreviations, emoji_mapping)
|
| 111 |
+
)
|
| 112 |
+
|
| 113 |
+
# Drop rows that are empty after preprocessing
|
| 114 |
+
df = df[df["processed_sentence"].str.strip().astype(bool)]
|
| 115 |
+
return df
|
| 116 |
+
|
| 117 |
+
def split_and_convert(
|
| 118 |
+
self, df, label_column="Emotion", maxlen=400, test_size=0.2,
|
| 119 |
+
for_keras=False, batch_size=32
|
| 120 |
+
):
|
| 121 |
+
"""
|
| 122 |
+
Split the data into train/test. Returns:
|
| 123 |
+
- If for_keras=False: train_loader, test_loader, label_mapping (PyTorch)
|
| 124 |
+
- If for_keras=True: X_train, X_test, y_train_onehot, y_test_onehot, label_mapping (Keras)
|
| 125 |
+
"""
|
| 126 |
+
|
| 127 |
+
if label_column not in df.columns:
|
| 128 |
+
raise ValueError(
|
| 129 |
+
f"Cột '{label_column}' không tồn tại trong DataFrame. "
|
| 130 |
+
f"Các cột hiện có: {df.columns.tolist()}"
|
| 131 |
+
)
|
| 132 |
+
|
| 133 |
+
# Map labels to integer ids
|
| 134 |
+
label_mapping = {label: idx for idx, label in enumerate(df[label_column].unique())}
|
| 135 |
+
df[label_column] = df[label_column].map(label_mapping)
|
| 136 |
+
|
| 137 |
+
X = df["processed_sentence"].tolist()
|
| 138 |
+
y = df[label_column].tolist()
|
| 139 |
+
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=test_size, random_state=42)
|
| 140 |
+
|
| 141 |
+
# Convert text to index tensors
|
| 142 |
+
X_train_tensors = self.vocabulary.corpus_to_tensor(X_train, is_tokenized=False)
|
| 143 |
+
X_test_tensors = self.vocabulary.corpus_to_tensor(X_test, is_tokenized=False)
|
| 144 |
+
|
| 145 |
+
# Pad sequences
|
| 146 |
+
X_train_padded = pad_sequences(X_train_tensors, maxlen=maxlen)
|
| 147 |
+
X_test_padded = pad_sequences(X_test_tensors, maxlen=maxlen)
|
| 148 |
+
|
| 149 |
+
# Debug info
|
| 150 |
+
print(">>> Debug Split and Convert:")
|
| 151 |
+
print("X_train_padded.shape:", X_train_padded.shape)
|
| 152 |
+
print("X_test_padded.shape: ", X_test_padded.shape)
|
| 153 |
+
print("y_train length:", len(y_train))
|
| 154 |
+
print("y_test length: ", len(y_test))
|
| 155 |
+
|
| 156 |
+
# Check min/max token indices
|
| 157 |
+
max_token_train = np.max(X_train_padded) if X_train_padded.size > 0 else None
|
| 158 |
+
min_token_train = np.min(X_train_padded) if X_train_padded.size > 0 else None
|
| 159 |
+
max_token_test = np.max(X_test_padded) if X_test_padded.size > 0 else None
|
| 160 |
+
min_token_test = np.min(X_test_padded) if X_test_padded.size > 0 else None
|
| 161 |
+
|
| 162 |
+
vocab_size = len(self.vocabulary)
|
| 163 |
+
print(f"vocab_size: {vocab_size}")
|
| 164 |
+
print(f"max_token_train: {max_token_train}, min_token_train: {min_token_train}")
|
| 165 |
+
print(f"max_token_test: {max_token_test}, min_token_test: {min_token_test}")
|
| 166 |
+
|
| 167 |
+
if for_keras:
|
| 168 |
+
num_classes = len(label_mapping)
|
| 169 |
+
# One-hot encode the labels
|
| 170 |
+
y_train_onehot = torch.nn.functional.one_hot(torch.tensor(y_train), num_classes=num_classes).numpy()
|
| 171 |
+
y_test_onehot = torch.nn.functional.one_hot(torch.tensor(y_test), num_classes=num_classes).numpy()
|
| 172 |
+
|
| 173 |
+
# Debug
|
| 174 |
+
print("y_train_onehot.shape:", y_train_onehot.shape)
|
| 175 |
+
print("y_test_onehot.shape: ", y_test_onehot.shape)
|
| 176 |
+
|
| 177 |
+
return X_train_padded, X_test_padded, y_train_onehot, y_test_onehot, label_mapping
|
| 178 |
+
else:
|
| 179 |
+
# Return PyTorch DataLoaders
|
| 180 |
+
X_train_tensor = torch.tensor(X_train_padded, dtype=torch.long)
|
| 181 |
+
X_test_tensor = torch.tensor(X_test_padded, dtype=torch.long)
|
| 182 |
+
y_train_tensor = torch.tensor(y_train, dtype=torch.long)
|
| 183 |
+
y_test_tensor = torch.tensor(y_test, dtype=torch.long)
|
| 184 |
+
|
| 185 |
+
train_dataset = TensorDataset(X_train_tensor, y_train_tensor)
|
| 186 |
+
test_dataset = TensorDataset(X_test_tensor, y_test_tensor)
|
| 187 |
+
|
| 188 |
+
train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
|
| 189 |
+
test_loader = DataLoader(test_dataset, batch_size=batch_size, shuffle=False)
|
| 190 |
+
return train_loader, test_loader, label_mapping
|
| 191 |
+
|
| 192 |
+
|
| 193 |
+
# ========== VOCABULARY CLASS ==========
|
| 194 |
+
|
| 195 |
+
class Vocabulary:
|
| 196 |
+
def __init__(self):
|
| 197 |
+
self.word2id = {}
|
| 198 |
+
self.word2id['<pad>'] = 0
|
| 199 |
+
self.word2id['<unk>'] = 1
|
| 200 |
+
self.unk_id = self.word2id['<unk>']
|
| 201 |
+
self.id2word = {0: '<pad>', 1: '<unk>'}
|
| 202 |
+
|
| 203 |
+
def __getitem__(self, word):
|
| 204 |
+
return self.word2id.get(word, self.unk_id)
|
| 205 |
+
|
| 206 |
+
def __contains__(self, word):
|
| 207 |
+
return word in self.word2id
|
| 208 |
+
|
| 209 |
+
def __len__(self):
|
| 210 |
+
return len(self.word2id)
|
| 211 |
+
|
| 212 |
+
def lookup_tokens(self, word_indexes: list):
|
| 213 |
+
return [self.id2word[word_index] for word_index in word_indexes]
|
| 214 |
+
|
| 215 |
+
def add(self, word):
|
| 216 |
+
if word not in self:
|
| 217 |
+
word_index = len(self.word2id)
|
| 218 |
+
self.word2id[word] = word_index
|
| 219 |
+
self.id2word[word_index] = word
|
| 220 |
+
return word_index
|
| 221 |
+
else:
|
| 222 |
+
return self[word]
|
| 223 |
+
|
| 224 |
+
@staticmethod
|
| 225 |
+
def tokenize_corpus(corpus):
|
| 226 |
+
tokenized_corpus = []
|
| 227 |
+
for document in tqdm(corpus):
|
| 228 |
+
tokenized_document = [word.replace(" ", "_") for word in word_tokenize(document)]
|
| 229 |
+
tokenized_corpus.append(tokenized_document)
|
| 230 |
+
return tokenized_corpus
|
| 231 |
+
|
| 232 |
+
def corpus_to_tensor(self, corpus, is_tokenized=False):
|
| 233 |
+
tokenized_corpus = self.tokenize_corpus(corpus) if not is_tokenized else corpus
|
| 234 |
+
return [
|
| 235 |
+
[self[word] for word in document]
|
| 236 |
+
for document in tokenized_corpus
|
| 237 |
+
]
|
| 238 |
+
|
| 239 |
+
|
| 240 |
+
# ========== EMOJI => LABEL MAPPING ==========
|
| 241 |
+
|
| 242 |
+
emoji_mapping = {
|
| 243 |
+
"😀": "[joy]", "😃": "[joy]", "😄": "[joy]", "😁": "[joy]", "😆": "[joy]", "😅": "[joy]", "😂": "[joy]", "🤣": "[joy]",
|
| 244 |
+
"🙂": "[love]", "🙃": "[love]", "😉": "[love]", "😊": "[love]", "😇": "[love]", "🥰": "[love]", "😍": "[love]",
|
| 245 |
+
"🤩": "[love]", "😘": "[love]", "😗": "[love]", "☺": "[love]", "😚": "[love]", "😙": "[love]",
|
| 246 |
+
"😋": "[satisfaction]", "😛": "[satisfaction]", "😜": "[satisfaction]", "🤪": "[satisfaction]", "😝": "[satisfaction]",
|
| 247 |
+
"🤑": "[satisfaction]",
|
| 248 |
+
"🤐": "[neutral]", "🤨": "[neutral]", "😐": "[neutral]", "😑": "[neutral]", "😶": "[neutral]",
|
| 249 |
+
"😏": "[sarcasm]",
|
| 250 |
+
"😒": "[disappointment]", "🙄": "[disappointment]", "😬": "[disappointment]",
|
| 251 |
+
"😔": "[sadness]", "😪": "[sadness]", "😢": "[sadness]", "😭": "[sadness]", "😥": "[sadness]", "😓": "[sadness]",
|
| 252 |
+
"😩": "[tiredness]", "😫": "[tiredness]", "🥱": "[tiredness]",
|
| 253 |
+
"🤤": "[discomfort]", "🤢": "[discomfort]", "🤮": "[discomfort]", "🤧": "[discomfort]", "🥵": "[discomfort]",
|
| 254 |
+
"🥶": "[discomfort]", "🥴": "[discomfort]", "😵": "[discomfort]", "🤯": "[discomfort]",
|
| 255 |
+
"😕": "[confused]", "😟": "[confused]", "🙁": "[confused]", "☹": "[confused]",
|
| 256 |
+
"😮": "[surprise]", "😯": "[surprise]", "😲": "[surprise]", "😳": "[surprise]", "🥺": "[pleading]",
|
| 257 |
+
"😦": "[fear]", "😧": "[fear]", "😨": "[fear]", "😰": "[fear]", "😱": "[fear]",
|
| 258 |
+
"😖": "[confusion]", "😣": "[confusion]", "😞": "[confusion]",
|
| 259 |
+
"😤": "[anger]", "😡": "[anger]", "😠": "[anger]", "🤬": "[anger]", "😈": "[mischievous]", "👿": "[mischievous]"
|
| 260 |
+
}
|
| 261 |
+
|
| 262 |
+
|
| 263 |
+
# ========== PYTORCH RNN MODEL DEFINITION ==========
|
| 264 |
+
|
| 265 |
+
class SimpleRNN(nn.Module):
|
| 266 |
+
def __init__(self, vocab_size, embedding_dim, hidden_dim, output_dim):
|
| 267 |
+
super(SimpleRNN, self).__init__()
|
| 268 |
+
self.embedding = nn.Embedding(vocab_size, embedding_dim)
|
| 269 |
+
self.rnn = nn.LSTM(embedding_dim, hidden_dim, batch_first=True)
|
| 270 |
+
self.fc = nn.Linear(hidden_dim, output_dim)
|
| 271 |
+
|
| 272 |
+
def forward(self, x):
|
| 273 |
+
embedded = self.embedding(x)
|
| 274 |
+
_, (hidden, _) = self.rnn(embedded)
|
| 275 |
+
return self.fc(hidden.squeeze(0))
|
| 276 |
+
|
| 277 |
+
|
| 278 |
+
# ========== PREDICTION WITH THE PYTORCH RNN MODEL ==========
|
| 279 |
+
|
| 280 |
+
def predict_emotion_rnn(model, text, data_manager, label_mapping, device):
|
| 281 |
+
model.eval()
|
| 282 |
+
with torch.no_grad():
|
| 283 |
+
processed_text = preprocess_sentence(text, data_manager.abbreviations, emoji_mapping)
|
| 284 |
+
tokenized_text = data_manager.vocabulary.tokenize_corpus([processed_text])
|
| 285 |
+
text_tensor = torch.tensor(
|
| 286 |
+
pad_sequences(data_manager.vocabulary.corpus_to_tensor(tokenized_text, is_tokenized=True), maxlen=400),
|
| 287 |
+
dtype=torch.long
|
| 288 |
+
).to(device)
|
| 289 |
+
|
| 290 |
+
output = model(text_tensor)
|
| 291 |
+
_, predicted = torch.max(output, 1)
|
| 292 |
+
reverse_label_mapping = {v: k for k, v in label_mapping.items()}
|
| 293 |
+
return reverse_label_mapping[predicted.item()]
|
| 294 |
+
|
| 295 |
+
|
| 296 |
+
# ========== PREDICTION WITH THE KERAS CNN-LSTM MODEL ==========
|
| 297 |
+
|
| 298 |
+
def predict_emotion_cnn_lstm(model, text, data_manager, label_mapping):
|
| 299 |
+
processed_text = preprocess_sentence(text, data_manager.abbreviations, emoji_mapping)
|
| 300 |
+
tokenized_text = data_manager.vocabulary.tokenize_corpus([processed_text])
|
| 301 |
+
text_tensor = pad_sequences(data_manager.vocabulary.corpus_to_tensor(tokenized_text, is_tokenized=True), maxlen=400)
|
| 302 |
+
output = model.predict(text_tensor)
|
| 303 |
+
predicted = output.argmax(axis=1)[0]
|
| 304 |
+
reverse_label_mapping = {v: k for k, v in label_mapping.items()}
|
| 305 |
+
return reverse_label_mapping[predicted]
|
| 306 |
+
|
| 307 |
+
|
| 308 |
+
# ========== MAIN (DEMO RUN) ==========
|
| 309 |
+
|
| 310 |
+
if __name__ == "__main__":
|
| 311 |
+
# --------------------------
|
| 312 |
+
# Set your paths here:
|
| 313 |
+
# --------------------------
|
| 314 |
+
file_path = "train.xlsx" # file Excel gốc (chứa cột "Sentence", "Emotion", ...)
|
| 315 |
+
abbreviations_path = "abbreviations.json"
|
| 316 |
+
word2vec_path = "/home/datpham/datpham/thesis-ngtram/word2vec_vi_syllables_100dims.txt"
|
| 317 |
+
output_path = "processed.xlsx"
|
| 318 |
+
|
| 319 |
+
data_manager = DataManager(
|
| 320 |
+
file_path=file_path,
|
| 321 |
+
abbreviations_path=abbreviations_path,
|
| 322 |
+
word2vec_path=word2vec_path
|
| 323 |
+
)
|
| 324 |
+
|
| 325 |
+
# 1) Read and preprocess
|
| 326 |
+
df = data_manager.preprocess_data()
|
| 327 |
+
print("Trước khi undersampling:")
|
| 328 |
+
print(df["Emotion"].value_counts())
|
| 329 |
+
|
| 330 |
+
# 2) UNDERSAMPLING (example)
|
| 331 |
+
# Adjust the emotion names to match your dataset
|
| 332 |
+
df_enjoyment = df[df["Emotion"] == "Enjoyment"]
|
| 333 |
+
df_other = df[df["Emotion"] == "Other"]
|
| 334 |
+
df_anger = df[df["Emotion"] == "Anger"]
|
| 335 |
+
df_sadness = df[df["Emotion"] == "Sadness"]
|
| 336 |
+
df_disgust = df[df["Emotion"] == "Disgust"]
|
| 337 |
+
df_fear = df[df["Emotion"] == "Fear"]
|
| 338 |
+
df_surprise = df[df["Emotion"] == "Surprise"]
|
| 339 |
+
|
| 340 |
+
# Example: keep 2000 samples for 'Enjoyment'
|
| 341 |
+
if len(df_enjoyment) > 2000:
|
| 342 |
+
df_enjoyment_undersampled = df_enjoyment.sample(n=2000, random_state=42)
|
| 343 |
+
else:
|
| 344 |
+
df_enjoyment_undersampled = df_enjoyment
|
| 345 |
+
|
| 346 |
+
df_balanced = pd.concat([
|
| 347 |
+
df_enjoyment_undersampled,
|
| 348 |
+
df_other,
|
| 349 |
+
df_anger,
|
| 350 |
+
df_sadness,
|
| 351 |
+
df_disgust,
|
| 352 |
+
df_fear,
|
| 353 |
+
df_surprise
|
| 354 |
+
], axis=0)
|
| 355 |
+
|
| 356 |
+
df_balanced = df_balanced.sample(frac=1, random_state=42).reset_index(drop=True)
|
| 357 |
+
df = df_balanced
|
| 358 |
+
|
| 359 |
+
print("\nSau khi undersampling:")
|
| 360 |
+
print(df["Emotion"].value_counts())
|
| 361 |
+
|
| 362 |
+
df.to_excel(output_path, index=False)
|
| 363 |
+
|
| 364 |
+
# 3) Build PyTorch data loaders
|
| 365 |
+
train_loader, test_loader, label_mapping = data_manager.split_and_convert(
|
| 366 |
+
df, label_column="Emotion", for_keras=False
|
| 367 |
+
)
|
| 368 |
+
|
| 369 |
+
vocab_size = len(data_manager.vocabulary)
|
| 370 |
+
embedding_dim = 100
|
| 371 |
+
hidden_dim = 128
|
| 372 |
+
output_dim = len(label_mapping)
|
| 373 |
+
|
| 374 |
+
model_rnn = SimpleRNN(vocab_size, embedding_dim, hidden_dim, output_dim)
|
| 375 |
+
criterion = nn.CrossEntropyLoss()
|
| 376 |
+
optimizer = optim.Adam(model_rnn.parameters())
|
| 377 |
+
|
| 378 |
+
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
|
| 379 |
+
model_rnn.to(device)
|
| 380 |
+
|
| 381 |
+
num_epochs = 20
|
| 382 |
+
for epoch in range(num_epochs):
|
| 383 |
+
model_rnn.train()
|
| 384 |
+
epoch_loss = 0
|
| 385 |
+
correct = 0
|
| 386 |
+
total = 0
|
| 387 |
+
for X_batch, y_batch in train_loader:
|
| 388 |
+
X_batch, y_batch = X_batch.to(device), y_batch.to(device)
|
| 389 |
+
|
| 390 |
+
optimizer.zero_grad()
|
| 391 |
+
predictions = model_rnn(X_batch)
|
| 392 |
+
loss = criterion(predictions, y_batch)
|
| 393 |
+
loss.backward()
|
| 394 |
+
optimizer.step()
|
| 395 |
+
|
| 396 |
+
epoch_loss += loss.item()
|
| 397 |
+
_, predicted = torch.max(predictions, 1)
|
| 398 |
+
correct += (predicted == y_batch).sum().item()
|
| 399 |
+
total += y_batch.size(0)
|
| 400 |
+
|
| 401 |
+
print(f"Epoch {epoch+1}/{num_epochs}, "
|
| 402 |
+
f"Loss: {epoch_loss/len(train_loader):.4f}, "
|
| 403 |
+
f"Accuracy: {correct/total:.4f}")
|
| 404 |
+
|
| 405 |
+
# Evaluate the RNN on the test set
|
| 406 |
+
model_rnn.eval()
|
| 407 |
+
test_loss = 0
|
| 408 |
+
correct = 0
|
| 409 |
+
total = 0
|
| 410 |
+
with torch.no_grad():
|
| 411 |
+
for X_batch, y_batch in test_loader:
|
| 412 |
+
X_batch, y_batch = X_batch.to(device), y_batch.to(device)
|
| 413 |
+
predictions = model_rnn(X_batch)
|
| 414 |
+
loss = criterion(predictions, y_batch)
|
| 415 |
+
test_loss += loss.item()
|
| 416 |
+
|
| 417 |
+
_, predicted = torch.max(predictions, 1)
|
| 418 |
+
correct += (predicted == y_batch).sum().item()
|
| 419 |
+
total += y_batch.size(0)
|
| 420 |
+
|
| 421 |
+
print(f"Test Loss: {test_loss/len(test_loader):.4f}, "
|
| 422 |
+
f"Test Accuracy: {correct/total:.4f}")
|
| 423 |
+
|
| 424 |
+
|
| 425 |
+
# ========== CNN-LSTM (Keras) ==========
|
| 426 |
+
|
| 427 |
+
from keras.models import Model
|
| 428 |
+
from keras.layers import Input, Embedding, Convolution1D, LSTM, Dense, Dropout, Lambda, concatenate
|
| 429 |
+
from keras.optimizers import Adam
|
| 430 |
+
from keras.callbacks import ModelCheckpoint
|
| 431 |
+
|
| 432 |
+
print("Training CNN-LSTM...")
|
| 433 |
+
|
| 434 |
+
X_train, X_test, y_train, y_test, label_mapping = data_manager.split_and_convert(
|
| 435 |
+
df, label_column="Emotion", for_keras=True
|
| 436 |
+
)
|
| 437 |
+
|
| 438 |
+
maxlen = 400
|
| 439 |
+
|
| 440 |
+
input_layer = Input(shape=(maxlen,), dtype='int32', name='main_input')
|
| 441 |
+
emb_layer = Embedding(len(data_manager.vocabulary), embedding_dim)(input_layer)
|
| 442 |
+
|
| 443 |
+
def max_1d(X):
|
| 444 |
+
return tf.reduce_max(X, axis=1)
|
| 445 |
+
|
| 446 |
+
con3_layer = Convolution1D(150, kernel_size=3, activation='relu')(emb_layer)
|
| 447 |
+
pool_con3_layer = Lambda(max_1d, output_shape=(150,))(con3_layer)
|
| 448 |
+
|
| 449 |
+
con5_layer = Convolution1D(150, kernel_size=5, activation='relu')(emb_layer)
|
| 450 |
+
pool_con5_layer = Lambda(max_1d, output_shape=(150,))(con5_layer)
|
| 451 |
+
|
| 452 |
+
lstm_layer = LSTM(128)(emb_layer)
|
| 453 |
+
|
| 454 |
+
cnn_lstm_layer = concatenate([pool_con3_layer, pool_con5_layer, lstm_layer])
|
| 455 |
+
|
| 456 |
+
dense_layer = Dense(100, activation='relu')(cnn_lstm_layer)
|
| 457 |
+
dropout_layer = Dropout(0.2)(dense_layer)
|
| 458 |
+
output_layer = Dense(len(label_mapping), activation='softmax')(dropout_layer)
|
| 459 |
+
|
| 460 |
+
model_cnn_lstm = Model(inputs=input_layer, outputs=output_layer)
|
| 461 |
+
model_cnn_lstm.compile(loss='categorical_crossentropy', optimizer=Adam(), metrics=['accuracy'])
|
| 462 |
+
|
| 463 |
+
checkpoint = ModelCheckpoint('cnn_lstm_best.keras', save_best_only=True, monitor='val_accuracy', mode='max')
|
| 464 |
+
model_cnn_lstm.fit(
|
| 465 |
+
X_train, y_train,
|
| 466 |
+
validation_data=(X_test, y_test),
|
| 467 |
+
batch_size=32,
|
| 468 |
+
epochs=20,
|
| 469 |
+
callbacks=[checkpoint]
|
| 470 |
+
)
|
| 471 |
+
|
| 472 |
+
model_cnn_lstm.save('cnn_lstm_model.keras')
|
| 473 |
+
|
| 474 |
+
loss, accuracy = model_cnn_lstm.evaluate(X_test, y_test)
|
| 475 |
+
print(f"CNN-LSTM Test Loss: {loss:.4f}, Test Accuracy: {accuracy:.4f}")
|
| 476 |
+
|
| 477 |
+
# Demo: predict a new sentence
|
| 478 |
+
custom_text = "Tôi rất vui khi sử dụng dịch vụ này!"
|
| 479 |
+
|
| 480 |
+
# RNN (PyTorch)
|
| 481 |
+
emotion_rnn = predict_emotion_rnn(model_rnn, custom_text, data_manager, label_mapping, device)
|
| 482 |
+
print(f"Predicted Emotion (RNN): {emotion_rnn}")
|
| 483 |
+
|
| 484 |
+
# CNN-LSTM (Keras)
|
| 485 |
+
cnn_lstm_model = tf.keras.models.load_model('cnn_lstm_model.keras')
|
| 486 |
+
emotion_cnn_lstm = predict_emotion_cnn_lstm(cnn_lstm_model, custom_text, data_manager, label_mapping)
|
| 487 |
+
print(f"Predicted Emotion (CNN-LSTM): {emotion_cnn_lstm}")
|
| 488 |
+
|
| 489 |
+
# Check TF version and GPU
|
| 490 |
+
print("TF version:", tf.__version__)
|
| 491 |
+
print("GPU devices:", tf.config.list_physical_devices("GPU"))
|
| 492 |
+
# Optionally, check CUDA/GPU with a system command:
|
| 493 |
+
# import os
|
| 494 |
+
# os.system("nvidia-smi")
|
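A small side note on the CNN-LSTM above (not part of main_v1.py): the Lambda(max_1d) layers compute a max over the time axis, which is what Keras' built-in GlobalMaxPooling1D does, and the built-in layer also serializes more cleanly when the model is saved and reloaded. A quick self-contained check of the equivalence:

import numpy as np
import tensorflow as tf

x = np.random.rand(2, 400, 150).astype("float32")            # (batch, time, channels)
via_lambda = tf.reduce_max(x, axis=1)                         # what max_1d computes
via_layer = tf.keras.layers.GlobalMaxPooling1D()(x)
print(np.allclose(via_lambda.numpy(), np.asarray(via_layer)))  # True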
phobert_emotion_model/classification_report.txt
ADDED
|
@@ -0,0 +1,23 @@
|
| 1 |
+
========== Classification Report ==========
|
| 2 |
+
precision recall f1-score support
|
| 3 |
+
|
| 4 |
+
Anger 0.9768 0.9788 0.9778 991
|
| 5 |
+
Disgust 0.9457 0.9657 0.9556 991
|
| 6 |
+
Enjoyment 0.9166 0.8204 0.8658 991
|
| 7 |
+
Fear 0.9771 0.9879 0.9825 992
|
| 8 |
+
Other 0.9026 0.9253 0.9138 991
|
| 9 |
+
Sadness 0.9302 0.9677 0.9486 991
|
| 10 |
+
Surprise 0.9448 0.9496 0.9472 992
|
| 11 |
+
|
| 12 |
+
accuracy 0.9422 6939
|
| 13 |
+
macro avg 0.9420 0.9422 0.9416 6939
|
| 14 |
+
weighted avg 0.9420 0.9422 0.9416 6939
|
| 15 |
+
|
| 16 |
+
========== Confusion Matrix ==========
|
| 17 |
+
[[970 9 3 4 2 2 1]
|
| 18 |
+
[ 12 957 2 3 7 5 5]
|
| 19 |
+
[ 5 16 813 9 67 42 39]
|
| 20 |
+
[ 2 2 6 980 1 1 0]
|
| 21 |
+
[ 3 13 33 2 917 13 10]
|
| 22 |
+
[ 1 7 17 3 4 959 0]
|
| 23 |
+
[ 0 8 13 2 18 9 942]]
|
phobert_emotion_model/confusion_matrix.png
ADDED
|
phobert_emotion_model/id2label.json
ADDED
|
@@ -0,0 +1,9 @@
|
| 1 |
+
{
|
| 2 |
+
"0": "Anger",
|
| 3 |
+
"1": "Disgust",
|
| 4 |
+
"2": "Enjoyment",
|
| 5 |
+
"3": "Fear",
|
| 6 |
+
"4": "Other",
|
| 7 |
+
"5": "Sadness",
|
| 8 |
+
"6": "Surprise"
|
| 9 |
+
}
|
phobert_emotion_model/phobert_emotion_model/added_tokens.json
ADDED
|
@@ -0,0 +1,3 @@
|
| 1 |
+
{
|
| 2 |
+
"<mask>": 64000
|
| 3 |
+
}
|
phobert_emotion_model/phobert_emotion_model/bpe.codes
ADDED
|
The diff for this file is too large to render.
See raw diff
|
|
|
phobert_emotion_model/phobert_emotion_model/config.json
ADDED
|
@@ -0,0 +1,48 @@
|
| 1 |
+
{
|
| 2 |
+
"_name_or_path": "vinai/phobert-base",
|
| 3 |
+
"architectures": [
|
| 4 |
+
"RobertaForSequenceClassification"
|
| 5 |
+
],
|
| 6 |
+
"attention_probs_dropout_prob": 0.1,
|
| 7 |
+
"bos_token_id": 0,
|
| 8 |
+
"classifier_dropout": null,
|
| 9 |
+
"eos_token_id": 2,
|
| 10 |
+
"gradient_checkpointing": false,
|
| 11 |
+
"hidden_act": "gelu",
|
| 12 |
+
"hidden_dropout_prob": 0.1,
|
| 13 |
+
"hidden_size": 768,
|
| 14 |
+
"id2label": {
|
| 15 |
+
"0": "LABEL_0",
|
| 16 |
+
"1": "LABEL_1",
|
| 17 |
+
"2": "LABEL_2",
|
| 18 |
+
"3": "LABEL_3",
|
| 19 |
+
"4": "LABEL_4",
|
| 20 |
+
"5": "LABEL_5",
|
| 21 |
+
"6": "LABEL_6"
|
| 22 |
+
},
|
| 23 |
+
"initializer_range": 0.02,
|
| 24 |
+
"intermediate_size": 3072,
|
| 25 |
+
"label2id": {
|
| 26 |
+
"LABEL_0": 0,
|
| 27 |
+
"LABEL_1": 1,
|
| 28 |
+
"LABEL_2": 2,
|
| 29 |
+
"LABEL_3": 3,
|
| 30 |
+
"LABEL_4": 4,
|
| 31 |
+
"LABEL_5": 5,
|
| 32 |
+
"LABEL_6": 6
|
| 33 |
+
},
|
| 34 |
+
"layer_norm_eps": 1e-05,
|
| 35 |
+
"max_position_embeddings": 258,
|
| 36 |
+
"model_type": "roberta",
|
| 37 |
+
"num_attention_heads": 12,
|
| 38 |
+
"num_hidden_layers": 12,
|
| 39 |
+
"pad_token_id": 1,
|
| 40 |
+
"position_embedding_type": "absolute",
|
| 41 |
+
"problem_type": "single_label_classification",
|
| 42 |
+
"tokenizer_class": "PhobertTokenizer",
|
| 43 |
+
"torch_dtype": "float32",
|
| 44 |
+
"transformers_version": "4.40.0",
|
| 45 |
+
"type_vocab_size": 1,
|
| 46 |
+
"use_cache": true,
|
| 47 |
+
"vocab_size": 64001
|
| 48 |
+
}
|
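One observation on the saved config above (a hedged note, not from the original scripts): id2label/label2id still carry the generic LABEL_0..LABEL_6 names because the training script only set config.num_labels; passing the emotion names when building the config would bake them into config.json and make the separate id2label.json unnecessary. A sketch of how that would look:

from transformers import AutoConfig

id2label = {0: "Anger", 1: "Disgust", 2: "Enjoyment", 3: "Fear",
            4: "Other", 5: "Sadness", 6: "Surprise"}
config = AutoConfig.from_pretrained(
    "vinai/phobert-base",
    num_labels=len(id2label),
    id2label=id2label,
    label2id={v: k for k, v in id2label.items()},
)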
phobert_emotion_model/phobert_emotion_model/model.safetensors
ADDED
|
@@ -0,0 +1,3 @@
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:23cc285ab489e07145436eebb67247d71cd67c817155cc65eb5a7e52e78ed4f0
|
| 3 |
+
size 540038764
|
phobert_emotion_model/phobert_emotion_model/special_tokens_map.json
ADDED
|
@@ -0,0 +1,9 @@
|
| 1 |
+
{
|
| 2 |
+
"bos_token": "<s>",
|
| 3 |
+
"cls_token": "<s>",
|
| 4 |
+
"eos_token": "</s>",
|
| 5 |
+
"mask_token": "<mask>",
|
| 6 |
+
"pad_token": "<pad>",
|
| 7 |
+
"sep_token": "</s>",
|
| 8 |
+
"unk_token": "<unk>"
|
| 9 |
+
}
|
phobert_emotion_model/phobert_emotion_model/tokenizer_config.json
ADDED
|
@@ -0,0 +1,54 @@
|
| 1 |
+
{
|
| 2 |
+
"added_tokens_decoder": {
|
| 3 |
+
"0": {
|
| 4 |
+
"content": "<s>",
|
| 5 |
+
"lstrip": false,
|
| 6 |
+
"normalized": false,
|
| 7 |
+
"rstrip": false,
|
| 8 |
+
"single_word": false,
|
| 9 |
+
"special": true
|
| 10 |
+
},
|
| 11 |
+
"1": {
|
| 12 |
+
"content": "<pad>",
|
| 13 |
+
"lstrip": false,
|
| 14 |
+
"normalized": false,
|
| 15 |
+
"rstrip": false,
|
| 16 |
+
"single_word": false,
|
| 17 |
+
"special": true
|
| 18 |
+
},
|
| 19 |
+
"2": {
|
| 20 |
+
"content": "</s>",
|
| 21 |
+
"lstrip": false,
|
| 22 |
+
"normalized": false,
|
| 23 |
+
"rstrip": false,
|
| 24 |
+
"single_word": false,
|
| 25 |
+
"special": true
|
| 26 |
+
},
|
| 27 |
+
"3": {
|
| 28 |
+
"content": "<unk>",
|
| 29 |
+
"lstrip": false,
|
| 30 |
+
"normalized": false,
|
| 31 |
+
"rstrip": false,
|
| 32 |
+
"single_word": false,
|
| 33 |
+
"special": true
|
| 34 |
+
},
|
| 35 |
+
"64000": {
|
| 36 |
+
"content": "<mask>",
|
| 37 |
+
"lstrip": false,
|
| 38 |
+
"normalized": false,
|
| 39 |
+
"rstrip": false,
|
| 40 |
+
"single_word": false,
|
| 41 |
+
"special": true
|
| 42 |
+
}
|
| 43 |
+
},
|
| 44 |
+
"bos_token": "<s>",
|
| 45 |
+
"clean_up_tokenization_spaces": true,
|
| 46 |
+
"cls_token": "<s>",
|
| 47 |
+
"eos_token": "</s>",
|
| 48 |
+
"mask_token": "<mask>",
|
| 49 |
+
"model_max_length": 1000000000000000019884624838656,
|
| 50 |
+
"pad_token": "<pad>",
|
| 51 |
+
"sep_token": "</s>",
|
| 52 |
+
"tokenizer_class": "PhobertTokenizer",
|
| 53 |
+
"unk_token": "<unk>"
|
| 54 |
+
}
|
phobert_emotion_model/phobert_emotion_model/vocab.txt
ADDED
|
The diff for this file is too large to render.
See raw diff
|
|
|
phobert_results/checkpoint-10410/added_tokens.json
ADDED
|
@@ -0,0 +1,3 @@
|
| 1 |
+
{
|
| 2 |
+
"<mask>": 64000
|
| 3 |
+
}
|
phobert_results/checkpoint-10410/bpe.codes
ADDED
|
The diff for this file is too large to render.
See raw diff
|
|
|