Upload folder using huggingface_hub
This view is limited to 50 files because it contains too many changes. See raw diff.
- .gitattributes +10 -0
- .vector_cache/word2vec_vi_syllables_100dims.txt.pt +3 -0
- abbreviations.json +363 -0
- bilstm_best.keras +3 -0
- bilstm_emotion_model/bilstm_model.keras +3 -0
- bilstm_emotion_model/classification_report.txt +33 -0
- bilstm_emotion_model/label_mapping.json +9 -0
- bilstm_emotion_model/vocabulary.json +0 -0
- cnn_lstm_best.keras +3 -0
- cnn_lstm_emotion_model/classification_report.txt +33 -0
- cnn_lstm_emotion_model/cnn_lstm_model.keras +3 -0
- cnn_lstm_model.keras +3 -0
- flagged/log.csv +2 -0
- logs/events.out.tfevents.1736834439.ai1gpu-virtual-machine.52042.0 +3 -0
- logs/events.out.tfevents.1736835355.ai1gpu-virtual-machine.52042.1 +3 -0
- logs/events.out.tfevents.1736835689.ai1gpu-virtual-machine.52955.0 +3 -0
- logs/events.out.tfevents.1736835769.ai1gpu-virtual-machine.53242.0 +3 -0
- logs/events.out.tfevents.1736835850.ai1gpu-virtual-machine.53528.0 +3 -0
- logs/events.out.tfevents.1736835995.ai1gpu-virtual-machine.53982.0 +3 -0
- logs/events.out.tfevents.1736836066.ai1gpu-virtual-machine.54029.0 +3 -0
- logs/events.out.tfevents.1736836768.ai1gpu-virtual-machine.55099.0 +3 -0
- logs/events.out.tfevents.1736841979.ai1gpu-virtual-machine.55099.1 +3 -0
- logs/events.out.tfevents.1736844609.ai1gpu-virtual-machine.66743.0 +3 -0
- logs/events.out.tfevents.1736852947.ai1gpu-virtual-machine.76812.0 +3 -0
- logs/events.out.tfevents.1736858105.ai1gpu-virtual-machine.76812.1 +3 -0
- logs/events.out.tfevents.1736858545.ai1gpu-virtual-machine.87908.0 +3 -0
- logs/events.out.tfevents.1736858698.ai1gpu-virtual-machine.88011.0 +3 -0
- logs/events.out.tfevents.1736864229.ai1gpu-virtual-machine.88011.1 +3 -0
- logs/events.out.tfevents.1736907563.ai1gpu-virtual-machine.145430.0 +3 -0
- logs/events.out.tfevents.1736908155.ai1gpu-virtual-machine.146675.0 +3 -0
- logs/events.out.tfevents.1736911863.ai1gpu-virtual-machine.152249.0 +3 -0
- logs/events.out.tfevents.1736916063.ai1gpu-virtual-machine.152249.1 +3 -0
- main_BILSTM.py +573 -0
- main_RNN_CNN-LSTM.py +738 -0
- main_lstm.py +289 -0
- main_phobert.py +349 -0
- main_svm.py +261 -0
- main_v1.py +494 -0
- phobert_emotion_model/classification_report.txt +23 -0
- phobert_emotion_model/confusion_matrix.png +0 -0
- phobert_emotion_model/id2label.json +9 -0
- phobert_emotion_model/phobert_emotion_model/added_tokens.json +3 -0
- phobert_emotion_model/phobert_emotion_model/bpe.codes +0 -0
- phobert_emotion_model/phobert_emotion_model/config.json +48 -0
- phobert_emotion_model/phobert_emotion_model/model.safetensors +3 -0
- phobert_emotion_model/phobert_emotion_model/special_tokens_map.json +9 -0
- phobert_emotion_model/phobert_emotion_model/tokenizer_config.json +54 -0
- phobert_emotion_model/phobert_emotion_model/vocab.txt +0 -0
- phobert_results/checkpoint-10410/added_tokens.json +3 -0
- phobert_results/checkpoint-10410/bpe.codes +0 -0
.gitattributes
CHANGED
@@ -33,3 +33,13 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
 *.zip filter=lfs diff=lfs merge=lfs -text
 *.zst filter=lfs diff=lfs merge=lfs -text
 *tfevents* filter=lfs diff=lfs merge=lfs -text
+bilstm_best.keras filter=lfs diff=lfs merge=lfs -text
+bilstm_emotion_model/bilstm_model.keras filter=lfs diff=lfs merge=lfs -text
+cnn_lstm_best.keras filter=lfs diff=lfs merge=lfs -text
+cnn_lstm_emotion_model/cnn_lstm_model.keras filter=lfs diff=lfs merge=lfs -text
+cnn_lstm_model.keras filter=lfs diff=lfs merge=lfs -text
+processed.xlsx filter=lfs diff=lfs merge=lfs -text
+processed_phobert.xlsx filter=lfs diff=lfs merge=lfs -text
+processed_svm.xlsx filter=lfs diff=lfs merge=lfs -text
+train.xlsx filter=lfs diff=lfs merge=lfs -text
+word2vec_vi_syllables_100dims.txt filter=lfs diff=lfs merge=lfs -text
.vector_cache/word2vec_vi_syllables_100dims.txt.pt
ADDED
version https://git-lfs.github.com/spec/v1
oid sha256:3390520329ebe14cddb38384d80bd8b6e4948e023977ba5dbe32235b4a3503e7
size 418631353
abbreviations.json
ADDED
{
  "ad": ["admin", "quản trị viên"],
  "bb": ["bye bye", "tạm biệt"],
  "bl": ["bình luận"],
  "bth": ["bình thường"],
  "bmn": ["bạn muốn"],
  "cxk": ["cũng không"],
  "đm": ["đ** m**"],
  "gg": ["good game", "Google"],
  "hc": ["học"],
  "kq": ["kết quả"],
  "kb": ["kết bạn"],
  "khá": ["khá là"],
  "lq": ["liên quan"],
  "lmh": ["làm gì thế"],
  "ng": ["người"],
  "nsao": ["nói sao"],
  "nv": ["nhân vật"],
  "nvay": ["như vậy"],
  "nxk": ["nói không"],
  "ob": ["ông bà"],
  "pc": ["phải không"],
  "ph": ["phim"],
  "ql": ["quản lý"],
  "qt": ["quá trời"],
  "sdt": ["số điện thoại"],
  "sk": ["sức khỏe"],
  "tc": ["tài chính"],
  "td": ["tâm điểm", "tập đoàn"],
  "th": ["thôi"],
  "tl": ["trả lời"],
  "ty": ["tình yêu"],
  "up": ["cập nhật", "update"],
  "xđ": ["xác định"],
  "zui": ["vui"],
  "zời": ["trời"],
  "hdsd": ["hướng dẫn sử dụng"],
  "bbq": ["barbecue", "tiệc nướng"],
  "cx": ["chắc chắn", "cũng"],
  "vkc": ["vãi kinh"],
  "kt": ["kiểm tra", "không thèm"],
  "tks": ["thanks", "cảm ơn"],
  "đg": ["đang"],
  "qa": ["quá"],
  "ht": ["học tập", "hoàn tất"],
  "clgt": ["cái l** gì thế"],
  "pls": ["please", "làm ơn"],
  "qtqđ": ["quá trời quá đất"],
  "klq": ["không liên quan"],
  "mn": ["mọi người"],
  "vc": ["vãi chưởng", "vợ chồng"],
  "vch": ["vãi chưởng"],
  "cđ": ["cuộc đời"],
  "đhs": ["đ** hiểu sao"],
  "ib": ["inbox", "nhắn tin"],
  "ttyl": ["talk to you later", "nói chuyện sau"],
  "stt": ["status", "trạng thái"],
  "sr": ["sorry", "xin lỗi"],
  "bn": ["bao nhiêu", "bạn"],
  "ckmnl": ["chào cả nhà mình nha l"],
  "cr": ["crush"],
  "mng": ["mọi người"],
  "vl": ["vãi l", "rất"],
  "khbn": ["không biết nữa"],
  "qtq": ["quá trời quá"],
  "sml": ["sấp mặt luôn"],
  "ns": ["nói"],
  "ăn h": ["ăn hành"],
  "qh": ["quan hệ"],
  "ăn b": ["ăn bánh"],
  "hph": ["hạnh phúc"],
  "ngta": ["người ta"],
  "mnk": ["mọi người không"],
  "ahihi": ["cười đùa"],
  "chz": ["chuyện"],
  "vđ": ["vấn đề"],
  "pp": ["bye bye", "tạm biệt"],
  "dc": ["được"],
  "nt": ["nhắn tin"],
  "thik": ["thích"],
  "bt": ["biết", "bình thường"],
  "kp": ["không phải"],
  "mik": ["mình"],
  "lm": ["làm"],
  "nx": ["nữa"],
  "mk": ["mình", "mày"],
  "cmt": ["comment", "bình luận"],
  "rep": ["trả lời", "phản hồi"],
  "fa": ["độc thân", "forever alone"],
  "chx": ["chưa"],
  "qlq": ["quản lý quán"],
  "a": ["anh"],
  "e": ["em"],
  "ko": ["không"],
  "kh": ["không"],
  "z": ["vậy"],
  "ny": ["người yêu"],
  "l": ["là"],
  "sn": ["sinh nhật"],
  "ckk": ["chúc ngủ ngon"],
  "hpbd": ["happy birthday"],
  "tt": ["thông tin", "tương tác"],
  "ms": ["mới"],
  "k": ["không"],
  "vk": ["vợ"],
  "ck": ["chồng"],
  "j": ["gì"],
  "m": ["mày"],
  "t": ["tao"],
  "sgk": ["sách giáo khoa"],
  "cv": ["công việc"],
  "pv": ["phục vụ"],
  "dth": ["dễ thương"],
  "gato": ["ghen ăn tức ở"]
}
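Each key in abbreviations.json maps a chat-style shorthand to one or more expansions; the preprocessing code in main_BILSTM.py (replace_abbreviations) substitutes a matching token with all of its expansions joined by spaces. A minimal sketch of that lookup, assuming the JSON file sits next to the script:

import json

# Load the abbreviation dictionary committed above.
with open("abbreviations.json", encoding="utf-8") as f:
    abbreviations = json.load(f)

def expand_abbreviations(sentence: str) -> str:
    # Mirrors replace_abbreviations in main_BILSTM.py: any whitespace-separated
    # token found in the dictionary is replaced by all of its expansions.
    return " ".join(
        " ".join(abbreviations[w]) if w in abbreviations else w
        for w in sentence.split()
    )

print(expand_abbreviations("mik ko bt"))  # -> "mình không biết bình thường"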
bilstm_best.keras
ADDED
version https://git-lfs.github.com/spec/v1
oid sha256:085cb3b7394a3db69287c6ede56834dfc9d6e56e2f169c5a05e49ffb5267fb6a
size 13203552
bilstm_emotion_model/bilstm_model.keras
ADDED
version https://git-lfs.github.com/spec/v1
oid sha256:40715c89bc3bc193a953c792527898450dd10979bd0bcd62ed32b8df471fa2bb
size 13203552
bilstm_emotion_model/classification_report.txt
ADDED
========== BiLSTM Classification Report ==========
              precision    recall  f1-score   support

   Enjoyment     0.6490    0.7296    0.6869       991
        Fear     0.5580    0.4709    0.5108       327
     Sadness     0.4580    0.4747    0.4662       356
       Anger     0.6587    0.6748    0.6667       369
       Other     0.6601    0.6733    0.6667       600
     Disgust     0.4967    0.4488    0.4715       332
    Surprise     0.4683    0.3620    0.4083       326

    accuracy                         0.5956      3301
   macro avg     0.5641    0.5477    0.5539      3301
weighted avg     0.5893    0.5956    0.5905      3301

========== Additional Metrics ==========
Test Loss: 2.0363
Test Accuracy: 0.5956
Precision (Macro): 0.5641
Precision (Weighted): 0.5893
Recall (Macro): 0.5477
Recall (Weighted): 0.5956
F1-Score (Macro): 0.5539
F1-Score (Weighted): 0.5905

========== Confusion Matrix ==========
[[723  23  83   3  81  29  49]
 [ 38 154  26  72  10  14  13]
 [108  14 169   2  30  23  10]
 [ 13  42  12 249  14  29  10]
 [110   9  30   9 404  18  20]
 [ 32  25  26  30  38 149  32]
 [ 90   9  23  13  35  38 118]]
bilstm_emotion_model/label_mapping.json
ADDED
{
    "Enjoyment": 0,
    "Fear": 1,
    "Sadness": 2,
    "Anger": 3,
    "Other": 4,
    "Disgust": 5,
    "Surprise": 6
}
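label_mapping.json records the label-to-index assignment produced during training; at inference main_BILSTM.py inverts it to turn an argmax index back into an emotion name. A minimal decoding sketch, assuming the file above has been committed as shown:

import json

with open("bilstm_emotion_model/label_mapping.json", encoding="utf-8") as f:
    label_mapping = json.load(f)  # {"Enjoyment": 0, ..., "Surprise": 6}

# Invert the mapping to decode model outputs (class index -> label).
id2label = {idx: label for label, idx in label_mapping.items()}
print(id2label[3])  # -> "Anger"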
bilstm_emotion_model/vocabulary.json
ADDED
The diff for this file is too large to render. See raw diff.
cnn_lstm_best.keras
ADDED
version https://git-lfs.github.com/spec/v1
oid sha256:e98590341cdfcc831873ee3fddc3c17f16a350085df1e302e2e22a4eda0c03ad
size 13535600
cnn_lstm_emotion_model/classification_report.txt
ADDED
========== CNN-LSTM Classification Report ==========
              precision    recall  f1-score   support

   Enjoyment     0.6977    0.7265    0.7118       991
        Fear     0.5526    0.6269    0.5874       327
     Sadness     0.4955    0.4663    0.4805       356
       Anger     0.7022    0.6070    0.6512       369
       Other     0.6740    0.7650    0.7166       600
     Disgust     0.5194    0.4849    0.5016       332
    Surprise     0.5020    0.3896    0.4387       326

    accuracy                         0.6247      3301
   macro avg     0.5919    0.5809    0.5840      3301
weighted avg     0.6204    0.6247    0.6205      3301

========== Additional Metrics ==========
Test Loss: 1.6124
Test Accuracy: 0.6247
Precision (Macro): 0.5919
Precision (Weighted): 0.6204
Recall (Macro): 0.5809
Recall (Weighted): 0.6247
F1-Score (Macro): 0.5840
F1-Score (Weighted): 0.6205

========== Confusion Matrix ==========
[[720  28  69  11  93  37  33]
 [ 34 205  13  39  10  14  12]
 [ 92  22 166   7  31  19  19]
 [ 13  62  13 224  17  34   6]
 [ 56  15  29   6 459  10  25]
 [ 34  21  22  27  36 161  31]
 [ 83  18  23   5  35  35 127]]
cnn_lstm_emotion_model/cnn_lstm_model.keras
ADDED
version https://git-lfs.github.com/spec/v1
oid sha256:c45256b322b2360c9ba9e0c5da5fd42705f7d4395f6c1d4c6a94035e43bf05d0
size 13535600
cnn_lstm_model.keras
ADDED
version https://git-lfs.github.com/spec/v1
oid sha256:78c966f03f234f409270b699f84a635d98128de271d8492ee25776026312cd24
size 13535600
flagged/log.csv
ADDED
Nhập câu cần phân loại cảm xúc,Kết quả dự đoán,flag,username,timestamp
"Hôm nay là ngày đẹp trời, tôi muốn có người yêu 😊",Disgust,,,2025-01-14 13:57:25.419643
logs/events.out.tfevents.1736834439.ai1gpu-virtual-machine.52042.0
ADDED
version https://git-lfs.github.com/spec/v1
oid sha256:aeb26f251abccb92c7342c443b6b7c7faa2b0d0c41976053706f1c002754680a
size 23650

logs/events.out.tfevents.1736835355.ai1gpu-virtual-machine.52042.1
ADDED
version https://git-lfs.github.com/spec/v1
oid sha256:72bc950b1e422eb9db07cba8ad85db543521c38025579fcc2cce1dd799313233
size 411

logs/events.out.tfevents.1736835689.ai1gpu-virtual-machine.52955.0
ADDED
version https://git-lfs.github.com/spec/v1
oid sha256:744768bef1c4f7e54446c6a7925c8b770d2d5af70f6f76016fab9805a3802b6f
size 346

logs/events.out.tfevents.1736835769.ai1gpu-virtual-machine.53242.0
ADDED
version https://git-lfs.github.com/spec/v1
oid sha256:0843cbd924008b8a37ef65480d32b8e16241e9e059a3784b0b8ce6d097a0d0c5
size 346

logs/events.out.tfevents.1736835850.ai1gpu-virtual-machine.53528.0
ADDED
version https://git-lfs.github.com/spec/v1
oid sha256:9c3fc1113ddc32236fc69e785dfa73481178e728dd02e131bad5add13004729f
size 346

logs/events.out.tfevents.1736835995.ai1gpu-virtual-machine.53982.0
ADDED
version https://git-lfs.github.com/spec/v1
oid sha256:d3de874ab406b8d42e3f02443b3ae8fce7228cffb61c6845aab400981d1263b0
size 5228

logs/events.out.tfevents.1736836066.ai1gpu-virtual-machine.54029.0
ADDED
version https://git-lfs.github.com/spec/v1
oid sha256:2f927f800053a89cf20a14bf5a48c6343b31d9a49d5e670a4fc48ad7fb676874
size 8712

logs/events.out.tfevents.1736836768.ai1gpu-virtual-machine.55099.0
ADDED
version https://git-lfs.github.com/spec/v1
oid sha256:f2816f60b911788c30bc43168dbbe689eee10a119e1e450767e54f521cb5f03c
size 81906

logs/events.out.tfevents.1736841979.ai1gpu-virtual-machine.55099.1
ADDED
version https://git-lfs.github.com/spec/v1
oid sha256:426ac92bb076d56fd8130e04ac0064542681f9ddd70fbeb64779f10b8521bb1d
size 417

logs/events.out.tfevents.1736844609.ai1gpu-virtual-machine.66743.0
ADDED
version https://git-lfs.github.com/spec/v1
oid sha256:7ff2d9a713d3ea47e04c6361df3c62d551e983cd170de4a163798e58eed51111
size 346

logs/events.out.tfevents.1736852947.ai1gpu-virtual-machine.76812.0
ADDED
version https://git-lfs.github.com/spec/v1
oid sha256:c2cea1a1f21eb664b3b5ae8f09ae76a38a3c7a37560a4432c805772a8afb171b
size 83399

logs/events.out.tfevents.1736858105.ai1gpu-virtual-machine.76812.1
ADDED
version https://git-lfs.github.com/spec/v1
oid sha256:4e9817a200d06938057f30fdac643b1480e734857bb5337aa4f494b29d199245
size 569

logs/events.out.tfevents.1736858545.ai1gpu-virtual-machine.87908.0
ADDED
version https://git-lfs.github.com/spec/v1
oid sha256:48134c412b09adeae17bc7aac0295e48dce80cf72ce2a1f4109c159ee99819b1
size 486

logs/events.out.tfevents.1736858698.ai1gpu-virtual-machine.88011.0
ADDED
version https://git-lfs.github.com/spec/v1
oid sha256:f0165be0e6c2731ce32b3e3cbe11b5a6997120211c06d0d04c264b5c69c8f9f2
size 83399

logs/events.out.tfevents.1736864229.ai1gpu-virtual-machine.88011.1
ADDED
version https://git-lfs.github.com/spec/v1
oid sha256:e468b0b65d952e3df6c9eb4f53bb8a8f867532828522b13b8229b53ea2787f9a
size 569

logs/events.out.tfevents.1736907563.ai1gpu-virtual-machine.145430.0
ADDED
version https://git-lfs.github.com/spec/v1
oid sha256:a67cb94b4913d02142ea7fb0bbad62005700059dc0bc6670464999d33dce0daf
size 7756

logs/events.out.tfevents.1736908155.ai1gpu-virtual-machine.146675.0
ADDED
version https://git-lfs.github.com/spec/v1
oid sha256:2a882fae8ea63fa2ecf17da9e9c44bcd33568c5a998b11da0ceb6c537857223c
size 7367

logs/events.out.tfevents.1736911863.ai1gpu-virtual-machine.152249.0
ADDED
version https://git-lfs.github.com/spec/v1
oid sha256:1dcadbf84e08ca0d1c9cf9f877233b857eb144b8aa92bd28291827220a0f7ea6
size 85351

logs/events.out.tfevents.1736916063.ai1gpu-virtual-machine.152249.1
ADDED
version https://git-lfs.github.com/spec/v1
oid sha256:6eee809e23d4dd927f9c3dffb75d8184a24ae246cd0380fc93894bccc415d632
size 766
main_BILSTM.py
ADDED
# thesis.py
# -*- coding: utf-8 -*-

import pandas as pd
import emoji
import json
import re
import numpy as np
from underthesea import word_tokenize
from tqdm import tqdm
import torch
from torchtext.vocab import Vectors
from sklearn.model_selection import train_test_split
from sklearn.utils import resample
from sklearn.metrics import (
    accuracy_score,
    classification_report,
    precision_score,
    recall_score,
    f1_score,
    confusion_matrix
)
from tensorflow.keras.preprocessing.sequence import pad_sequences
from torch.utils.data import DataLoader, TensorDataset
import torch.nn as nn
import torch.optim as optim
import tensorflow as tf
import os

# ========== PREPROCESSING FUNCTIONS ==========

def preprocess_sentence(sentence, abbreviations, emoji_mapping):
    """
    Preprocess one sentence: lowercase, replace emojis, remove profanity,
    strip special characters, normalize whitespace, etc.
    """
    sentence = sentence.lower()
    sentence = replace_emojis(sentence, emoji_mapping)
    sentence = remove_profanity(sentence)
    sentence = remove_special_characters(sentence)
    sentence = normalize_whitespace(sentence)
    sentence = replace_abbreviations(sentence, abbreviations)
    sentence = remove_repeated_characters(sentence)
    sentence = replace_numbers(sentence)
    sentence = tokenize_sentence(sentence)
    return sentence

def replace_emojis(sentence, emoji_mapping):
    processed_sentence = []
    for char in sentence:
        if char in emoji_mapping:
            processed_sentence.append(emoji_mapping[char])
        elif not emoji.is_emoji(char):
            processed_sentence.append(char)
    return ''.join(processed_sentence)

def remove_profanity(sentence):
    profane_words = ["loz", "vloz", "vl", "dm", "đm", "clgt", "dmm", "cc", "vc", "đù mé", "vãi"]
    words = sentence.split()
    filtered_words = [word for word in words if word.lower() not in profane_words]
    return ' '.join(filtered_words)

def remove_special_characters(sentence):
    return re.sub(r"[\^\*@#&$%<>~{}|\\]", "", sentence)

def normalize_whitespace(sentence):
    return ' '.join(sentence.split())

def replace_abbreviations(sentence, abbreviations):
    words = sentence.split()
    replaced_words = [
        " ".join(abbreviations[word]) if word in abbreviations else word
        for word in words
    ]
    return ' '.join(replaced_words)

def remove_repeated_characters(sentence):
    # Example: "đẹp quáaaaaaa" -> "đẹp quá"
    return re.sub(r"(.)\1{2,}", r"\1", sentence)

def replace_numbers(sentence):
    # Replace all digits with the token [number]
    return re.sub(r"\d+", "[number]", sentence)

def tokenize_sentence(sentence):
    # Word segmentation with underthesea
    return ' '.join(word_tokenize(sentence))

# ========== VOCABULARY CLASS ==========

class Vocabulary:
    def __init__(self):
        self.word2id = {}
        self.word2id['<pad>'] = 0
        self.word2id['<unk>'] = 1
        self.unk_id = 1
        self.id2word = {0: '<pad>', 1: '<unk>'}

    def __getitem__(self, word):
        return self.word2id.get(word, self.unk_id)

    def __contains__(self, word):
        return word in self.word2id

    def __len__(self):
        return len(self.word2id)

    def lookup_tokens(self, indices):
        return [self.id2word[idx] for idx in indices]

    def add(self, word):
        if word not in self.word2id:
            idx = len(self.word2id)
            self.word2id[word] = idx
            self.id2word[idx] = word

    @staticmethod
    def tokenize_corpus(corpus):
        tokenized_corpus = []
        for doc in tqdm(corpus, desc="Tokenizing Corpus"):
            tokens = [w.replace(" ", "_") for w in word_tokenize(doc)]
            tokenized_corpus.append(tokens)
        return tokenized_corpus

    def corpus_to_tensor(self, corpus, is_tokenized=False):
        """
        corpus: list of sentences (strings) or list of token lists (if is_tokenized=True)
        return: list[list[int]], each sentence becomes a list of token indices
        """
        tokenized_corpus = (
            self.tokenize_corpus(corpus) if not is_tokenized else corpus
        )
        return [
            [self[token] for token in doc]
            for doc in tokenized_corpus
        ]

# ========== EMOJI MAPPING ==========

emoji_mapping = {
    "😀": "[joy]", "😃": "[joy]", "😄": "[joy]", "😁": "[joy]", "😆": "[joy]", "😅": "[joy]", "😂": "[joy]", "🤣": "[joy]",
    "🙂": "[love]", "🙃": "[love]", "😉": "[love]", "😊": "[love]", "😇": "[love]", "🥰": "[love]", "😍": "[love]",
    "🤩": "[love]", "😘": "[love]", "😗": "[love]", "☺": "[love]", "😚": "[love]", "😙": "[love]",
    "😋": "[satisfaction]", "😛": "[satisfaction]", "😜": "[satisfaction]", "🤪": "[satisfaction]", "😝": "[satisfaction]",
    "🤑": "[satisfaction]",
    "🤐": "[neutral]", "🤨": "[neutral]", "😐": "[neutral]", "😑": "[neutral]", "😶": "[neutral]",
    "😏": "[sarcasm]",
    "😒": "[disappointment]", "🙄": "[disappointment]", "😬": "[disappointment]",
    "😔": "[sadness]", "😪": "[sadness]", "😢": "[sadness]", "😭": "[sadness]", "😥": "[sadness]", "😓": "[sadness]",
    "😩": "[tiredness]", "😫": "[tiredness]", "🥱": "[tiredness]",
    "🤤": "[discomfort]", "🤢": "[discomfort]", "🤮": "[discomfort]", "🤧": "[discomfort]", "🥵": "[discomfort]",
    "🥶": "[discomfort]", "🥴": "[discomfort]", "😵": "[discomfort]", "🤯": "[discomfort]",
    "😕": "[confused]", "😟": "[confused]", "🙁": "[confused]", "☹": "[confused]",
    "😮": "[surprise]", "😯": "[surprise]", "😲": "[surprise]", "😳": "[surprise]", "🥺": "[pleading]",
    "😦": "[fear]", "😧": "[fear]", "😨": "[fear]", "😰": "[fear]", "😱": "[fear]",
    "😖": "[confusion]", "😣": "[confusion]", "😞": "[confusion]",
    "😤": "[anger]", "😡": "[anger]", "😠": "[anger]", "🤬": "[anger]", "😈": "[mischievous]", "👿": "[mischievous]"
}

# ========== DATA MANAGER ==========

class DataManager:
    def __init__(self, file_path, abbreviations_path, word2vec_path):
        self.file_path = file_path
        self.abbreviations_path = abbreviations_path
        self.word2vec_path = word2vec_path
        self.vocabulary = None
        self.word_embeddings = None
        self.abbreviations = None
        self.load_abbreviations()

    def load_abbreviations(self):
        with open(self.abbreviations_path, "r", encoding="utf-8") as f:
            self.abbreviations = json.load(f)

    def load_word2vec(self):
        """
        Load the vectors from the word2vec file,
        using torchtext.Vectors to load the pretrained embeddings.
        """
        self.word_embeddings = Vectors(
            name=self.word2vec_path,
            unk_init=torch.Tensor.normal_
        )

    def create_vocab_from_corpus(self, corpus, max_vocab_size=30000):
        """
        Build the vocabulary from the corpus, keeping only the top max_vocab_size words.
        """
        vocab = Vocabulary()
        from collections import Counter
        counter = Counter()

        for sent in corpus:
            for token in sent.split():
                counter[token] += 1

        most_common = counter.most_common(max_vocab_size)
        for word, _freq in most_common:
            vocab.add(word)

        return vocab

    def preprocess_data(self):
        df = pd.read_excel(self.file_path)
        if "Sentence" not in df.columns:
            raise ValueError("Cột 'Sentence' không tồn tại trong dataset!")

        # Preprocess each sentence
        df["processed_sentence"] = df["Sentence"].apply(
            lambda x: preprocess_sentence(str(x), self.abbreviations, emoji_mapping)
        )

        # Drop empty rows
        df = df[df["processed_sentence"].str.strip().astype(bool)]

        # Build the vocabulary from the data itself
        all_sentences = df["processed_sentence"].tolist()
        self.vocabulary = self.create_vocab_from_corpus(all_sentences, max_vocab_size=30000)

        # Load word2vec
        self.load_word2vec()

        return df

    def build_pretrained_embedding_matrix(self, embedding_dim=100):
        """
        Build a weight_matrix (numpy) of shape (vocab_size x embedding_dim)
        filled with the pretrained weights.
        """
        vocab_size = len(self.vocabulary)
        weight_matrix = np.random.normal(
            scale=0.1, size=(vocab_size, embedding_dim)
        ).astype(np.float32)

        # Copy the pretrained vectors
        for word, idx in self.vocabulary.word2id.items():
            if word in self.word_embeddings.stoi:
                weight_matrix[idx] = self.word_embeddings.vectors[
                    self.word_embeddings.stoi[word]
                ]

        return weight_matrix

    def split_and_convert(
        self, df, label_column="Emotion", maxlen=400, test_size=0.2,
        for_keras=False, batch_size=32
    ):
        """
        Split the data into train/test or train/val/test.
        - for_keras=False → return train_loader, val_loader, test_loader, label_mapping (PyTorch)
        - for_keras=True → return X_train, X_test, y_train_onehot, y_test_onehot, label_mapping (Keras)
        """
        if label_column not in df.columns:
            raise ValueError(
                f"Cột '{label_column}' không tồn tại. Hiện có: {df.columns.tolist()}"
            )

        # Build the label -> index mapping
        label_mapping = {label: idx for idx, label in enumerate(df[label_column].unique())}
        df[label_column] = df[label_column].map(label_mapping)
        if df[label_column].isnull().any():
            missing = df[df[label_column].isnull()][label_column].unique()
            raise ValueError(f"Những nhãn cảm xúc sau không có trong label_mapping: {missing}")

        X = df["processed_sentence"].tolist()
        y = df[label_column].tolist()

        # Stratify to preserve the class distribution
        X_train, X_test, y_train, y_test = train_test_split(
            X, y, test_size=test_size, random_state=42, stratify=y
        )

        if not for_keras:
            # Split train into train and validation
            X_train, X_val, y_train, y_val = train_test_split(
                X_train, y_train, test_size=0.1, random_state=42, stratify=y_train
            )

        # Convert text -> index
        X_train_ids = self.vocabulary.corpus_to_tensor(X_train, is_tokenized=False)
        X_test_ids = self.vocabulary.corpus_to_tensor(X_test, is_tokenized=False)

        if not for_keras:
            X_val_ids = self.vocabulary.corpus_to_tensor(X_val, is_tokenized=False)

        # Pad
        X_train_padded = pad_sequences(X_train_ids, maxlen=maxlen, padding='post', truncating='post')
        X_test_padded = pad_sequences(X_test_ids, maxlen=maxlen, padding='post', truncating='post')

        if not for_keras:
            X_val_padded = pad_sequences(X_val_ids, maxlen=maxlen, padding='post', truncating='post')

        print(">>> Debug Split and Convert:")
        print("X_train_padded.shape:", X_train_padded.shape)
        print("X_val_padded.shape: ", X_val_padded.shape if not for_keras else "N/A")
        print("X_test_padded.shape: ", X_test_padded.shape)
        print("y_train length:", len(y_train))
        print("y_val length: ", len(y_val) if not for_keras else "N/A")
        print("y_test length: ", len(y_test))
        print("vocab_size:", len(self.vocabulary))

        if for_keras:
            num_classes = len(label_mapping)
            y_train_onehot = tf.keras.utils.to_categorical(
                y_train,
                num_classes=num_classes
            )
            y_test_onehot = tf.keras.utils.to_categorical(
                y_test,
                num_classes=num_classes
            )

            print("y_train_onehot.shape:", y_train_onehot.shape)
            print("y_test_onehot.shape: ", y_test_onehot.shape)

            return X_train_padded, X_test_padded, y_train_onehot, y_test_onehot, label_mapping
        else:
            # Convert the validation set
            X_val_ids = self.vocabulary.corpus_to_tensor(X_val, is_tokenized=False)
            X_val_padded = pad_sequences(X_val_ids, maxlen=maxlen, padding='post', truncating='post')

            X_train_t = torch.tensor(X_train_padded, dtype=torch.long)
            X_val_t = torch.tensor(X_val_padded, dtype=torch.long)
            X_test_t = torch.tensor(X_test_padded, dtype=torch.long)
            y_train_t = torch.tensor(y_train, dtype=torch.long)
            y_val_t = torch.tensor(y_val, dtype=torch.long)
            y_test_t = torch.tensor(y_test, dtype=torch.long)

            train_ds = TensorDataset(X_train_t, y_train_t)
            val_ds = TensorDataset(X_val_t, y_val_t)
            test_ds = TensorDataset(X_test_t, y_test_t)

            train_loader = DataLoader(train_ds, batch_size=batch_size, shuffle=True)
            val_loader = DataLoader(val_ds, batch_size=batch_size, shuffle=False)
            test_loader = DataLoader(test_ds, batch_size=batch_size, shuffle=False)

            return train_loader, val_loader, test_loader, label_mapping

# ========== KERAS BI-LSTM MODEL ==========

def predict_emotion_bilstm(model, text, data_manager, label_mapping):
    processed_text = preprocess_sentence(text, data_manager.abbreviations, emoji_mapping)
    tokenized_text = data_manager.vocabulary.tokenize_corpus([processed_text])
    text_ids = data_manager.vocabulary.corpus_to_tensor(tokenized_text, is_tokenized=True)
    text_padded = pad_sequences(text_ids, maxlen=400, padding='post', truncating='post')
    output = model.predict(text_padded)
    pred = output.argmax(axis=1)[0]
    rev_map = {v: k for k, v in label_mapping.items()}
    return rev_map[pred]

# ========== MAIN ==========

if __name__ == "__main__":
    from keras.models import Model
    from keras.layers import (
        Input, Embedding, Dense, Dropout, Bidirectional, LSTM
    )
    from keras.optimizers import Adam
    from keras.callbacks import ModelCheckpoint, EarlyStopping

    # -------- PATHS ----------
    file_path = "train.xlsx"
    abbreviations_path = "abbreviations.json"
    word2vec_path = "word2vec_vi_syllables_100dims.txt"
    output_path = "processed.xlsx"

    # Initialize the DataManager
    data_manager = DataManager(
        file_path=file_path,
        abbreviations_path=abbreviations_path,
        word2vec_path=word2vec_path
    )

    # 1) Preprocess, build the vocab, load word2vec
    df = data_manager.preprocess_data()
    print("Trước khi cân bằng lớp (undersampling/oversampling):")
    print(df["Emotion"].value_counts())

    # 2) Balance the classes (example: oversample 'Other' up to 3000)
    # Adjust this to your own needs
    df_enjoyment = df[df["Emotion"] == "Enjoyment"]
    df_other = df[df["Emotion"] == "Other"]
    df_anger = df[df["Emotion"] == "Anger"]
    df_sadness = df[df["Emotion"] == "Sadness"]
    df_disgust = df[df["Emotion"] == "Disgust"]
    df_fear = df[df["Emotion"] == "Fear"]
    df_surprise = df[df["Emotion"] == "Surprise"]

    # Oversample the 'Other' class up to 3000 (illustration only)
    if len(df_other) < 3000:
        df_other_oversampled = resample(
            df_other,
            replace=True,
            n_samples=3000,
            random_state=42
        )
    else:
        df_other_oversampled = df_other

    # Keep the other classes as-is (or oversample them as desired)
    df_balanced = pd.concat([
        df_enjoyment,
        df_other_oversampled,
        df_anger,
        df_sadness,
        df_disgust,
        df_fear,
        df_surprise
    ], axis=0)

    df_balanced = df_balanced.sample(frac=1, random_state=42).reset_index(drop=True)
    df = df_balanced

    print("\nSau khi cân bằng lớp (demo oversample):")
    print(df["Emotion"].value_counts())

    # Export to file (optional)
    df.to_excel(output_path, index=False)

    # ========== TRAIN BI-LSTM KERAS ==========

    print("\n========== Training Keras BiLSTM ==========")

    # Build the pretrained embedding matrix for Keras
    pretrained_matrix = data_manager.build_pretrained_embedding_matrix(embedding_dim=100)
    pretrained_matrix_keras = pretrained_matrix.astype(np.float32)

    # Split data for Keras
    X_train, X_test, y_train, y_test, label_mapping = data_manager.split_and_convert(
        df, label_column="Emotion", maxlen=400,
        test_size=0.2, for_keras=True
    )

    num_classes = len(label_mapping)
    input_dim = len(data_manager.vocabulary)
    embedding_dim = pretrained_matrix.shape[1]
    maxlen = 400

    # Define BiLSTM Model
    def create_bilstm_model():
        input_layer = Input(shape=(maxlen,), dtype='int32', name='main_input')
        emb_layer = Embedding(
            input_dim=input_dim,
            output_dim=embedding_dim,
            weights=[pretrained_matrix_keras],
            input_length=maxlen,
            trainable=True  # Set to False if you do not want to fine-tune the embeddings
        )(input_layer)

        bilstm = Bidirectional(LSTM(128, dropout=0.5, recurrent_dropout=0.5))(emb_layer)
        dense1 = Dense(64, activation='relu')(bilstm)
        dropout1 = Dropout(0.5)(dense1)
        dense2 = Dense(32, activation='relu')(dropout1)
        dropout2 = Dropout(0.5)(dense2)
        output_layer = Dense(num_classes, activation='softmax')(dropout2)

        model = Model(inputs=input_layer, outputs=output_layer)
        model.compile(
            loss='categorical_crossentropy',
            optimizer=Adam(lr=1e-3),
            metrics=['accuracy']
        )
        return model

    # Create model
    model_bilstm = create_bilstm_model()
    model_bilstm.summary()

    # Define callbacks
    checkpoint = ModelCheckpoint(
        'bilstm_best.keras',
        save_best_only=True,
        monitor='val_accuracy',
        mode='max'
    )
    early_stopping = EarlyStopping(
        monitor='val_accuracy',
        patience=5,
        restore_best_weights=True
    )

    # Train model
    history = model_bilstm.fit(
        X_train, y_train,
        validation_data=(X_test, y_test),
        epochs=100,
        batch_size=32,
        callbacks=[checkpoint, early_stopping]
    )

    # Evaluate on the test set with detailed metrics
    loss, acc = model_bilstm.evaluate(X_test, y_test)
    print(f"BiLSTM Test Loss: {loss:.4f}, Test Accuracy: {acc:.4f}")

    # Collect predictions and compute the metrics
    y_pred_bilstm = model_bilstm.predict(X_test)
    y_pred_bilstm = np.argmax(y_pred_bilstm, axis=1)
    y_true_bilstm = np.argmax(y_test, axis=1)

    test_accuracy_bilstm = accuracy_score(y_true_bilstm, y_pred_bilstm)
    precision_macro_bilstm = precision_score(y_true_bilstm, y_pred_bilstm, average='macro', zero_division=0)
    precision_weighted_bilstm = precision_score(y_true_bilstm, y_pred_bilstm, average='weighted', zero_division=0)
    recall_macro_bilstm = recall_score(y_true_bilstm, y_pred_bilstm, average='macro', zero_division=0)
    recall_weighted_bilstm = recall_score(y_true_bilstm, y_pred_bilstm, average='weighted', zero_division=0)
    f1_macro_bilstm = f1_score(y_true_bilstm, y_pred_bilstm, average='macro', zero_division=0)
    f1_weighted_bilstm = f1_score(y_true_bilstm, y_pred_bilstm, average='weighted', zero_division=0)
    report_bilstm = classification_report(y_true_bilstm, y_pred_bilstm, target_names=label_mapping.keys(), digits=4)
    conf_matrix_bilstm = confusion_matrix(y_true_bilstm, y_pred_bilstm)

    # Print the metrics
    print(f"\nBiLSTM Test Accuracy: {test_accuracy_bilstm:.4f}")
    print(f"Precision (Macro): {precision_macro_bilstm:.4f}")
    print(f"Precision (Weighted): {precision_weighted_bilstm:.4f}")
    print(f"Recall (Macro): {recall_macro_bilstm:.4f}")
    print(f"Recall (Weighted): {recall_weighted_bilstm:.4f}")
    print(f"F1-Score (Macro): {f1_macro_bilstm:.4f}")
    print(f"F1-Score (Weighted): {f1_weighted_bilstm:.4f}")

    print("\n========== BiLSTM Classification Report ==========")
    print(report_bilstm)

    print("\n========== BiLSTM Confusion Matrix ==========")
    print(conf_matrix_bilstm)

    # Save the report to a file
    bilstm_report_dir = "bilstm_emotion_model"
    os.makedirs(bilstm_report_dir, exist_ok=True)
    with open(os.path.join(bilstm_report_dir, "classification_report.txt"), "w", encoding="utf-8") as f:
        f.write("========== BiLSTM Classification Report ==========\n")
        f.write(report_bilstm)
        f.write("\n========== Additional Metrics ==========\n")
        f.write(f"Test Loss: {loss:.4f}\n")
        f.write(f"Test Accuracy: {test_accuracy_bilstm:.4f}\n")
        f.write(f"Precision (Macro): {precision_macro_bilstm:.4f}\n")
        f.write(f"Precision (Weighted): {precision_weighted_bilstm:.4f}\n")
        f.write(f"Recall (Macro): {recall_macro_bilstm:.4f}\n")
        f.write(f"Recall (Weighted): {recall_weighted_bilstm:.4f}\n")
        f.write(f"F1-Score (Macro): {f1_macro_bilstm:.4f}\n")
        f.write(f"F1-Score (Weighted): {f1_weighted_bilstm:.4f}\n")
        f.write("\n========== Confusion Matrix ==========\n")
        f.write(np.array2string(conf_matrix_bilstm))

    print("\n========== BiLSTM Classification Report saved to 'bilstm_emotion_model/classification_report.txt' ==========")

    # Save the BiLSTM model
    model_bilstm.save(os.path.join(bilstm_report_dir, 'bilstm_model.keras'))
    print(f"========== BiLSTM Model saved to '{bilstm_report_dir}/bilstm_model.keras' ==========")

    # ========== DEMO: PREDICT ONE NEW SENTENCE ==========

    custom_text = "Tôi rất vui khi sử dụng dịch vụ này!"

    # BiLSTM (Keras)
    emotion_bilstm = predict_emotion_bilstm(
        model_bilstm, custom_text, data_manager, label_mapping
    )
    print(f"Predicted Emotion (BiLSTM): {emotion_bilstm}")

    # Check TF and GPU
    print("TF version:", tf.__version__)
    print("GPU devices:", tf.config.list_physical_devices("GPU"))
    # os.system("nvidia-smi")  # if you want to inspect GPU info

    # ========== SAVE LABEL MAPPING AND VOCABULARY ==========
    # Save label_mapping and vocabulary for BiLSTM
    with open(os.path.join(bilstm_report_dir, "label_mapping.json"), "w", encoding="utf-8") as f:
        json.dump(label_mapping, f, ensure_ascii=False, indent=4)

    with open(os.path.join(bilstm_report_dir, "vocabulary.json"), "w", encoding="utf-8") as f:
        json.dump(data_manager.vocabulary.word2id, f, ensure_ascii=False, indent=4)

    print("========== Label Mapping and Vocabulary saved ==========")
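The script above leaves everything needed for standalone inference in bilstm_emotion_model/: the saved .keras model, vocabulary.json (the word-to-id table) and label_mapping.json. A minimal reload-and-predict sketch, assuming it runs next to main_BILSTM.py with the same dependencies and a Keras version able to read the saved file; the helpers are reused from that module rather than reimplemented:

import json
from keras.models import load_model
from tensorflow.keras.preprocessing.sequence import pad_sequences
from main_BILSTM import preprocess_sentence, emoji_mapping, Vocabulary

model = load_model("bilstm_emotion_model/bilstm_model.keras")

with open("bilstm_emotion_model/label_mapping.json", encoding="utf-8") as f:
    label_mapping = json.load(f)
with open("bilstm_emotion_model/vocabulary.json", encoding="utf-8") as f:
    word2id = json.load(f)
with open("abbreviations.json", encoding="utf-8") as f:
    abbreviations = json.load(f)

# Rebuild the Vocabulary object from the saved word -> id table.
vocab = Vocabulary()
vocab.word2id = word2id
vocab.id2word = {idx: w for w, idx in word2id.items()}

text = "Tôi rất vui khi sử dụng dịch vụ này!"
processed = preprocess_sentence(text, abbreviations, emoji_mapping)
ids = vocab.corpus_to_tensor([processed], is_tokenized=False)
padded = pad_sequences(ids, maxlen=400, padding="post", truncating="post")

pred = model.predict(padded).argmax(axis=1)[0]
id2label = {v: k for k, v in label_mapping.items()}
print("Predicted emotion:", id2label[pred])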
main_RNN_CNN-LSTM.py
ADDED
|
@@ -0,0 +1,738 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
# thesis.py
# -*- coding: utf-8 -*-

import pandas as pd
import emoji
import json
import re
import numpy as np
from underthesea import word_tokenize
from tqdm import tqdm
import torch
from torchtext.vocab import Vectors
from sklearn.model_selection import train_test_split
from sklearn.utils import resample
from sklearn.metrics import (
    accuracy_score,
    classification_report,
    precision_score,
    recall_score,
    f1_score,
    confusion_matrix
)
from tensorflow.keras.preprocessing.sequence import pad_sequences
from torch.utils.data import DataLoader, TensorDataset
import torch.nn as nn
import torch.optim as optim
import tensorflow as tf
import os
import joblib

# ========== PREPROCESSING FUNCTIONS ==========

def preprocess_sentence(sentence, abbreviations, emoji_mapping):
    """
    Preprocess a single sentence: lowercase, replace emojis, remove profanity
    and special characters, normalize whitespace, etc.
    """
    sentence = sentence.lower()
    sentence = replace_emojis(sentence, emoji_mapping)
    sentence = remove_profanity(sentence)
    sentence = remove_special_characters(sentence)
    sentence = normalize_whitespace(sentence)
    sentence = replace_abbreviations(sentence, abbreviations)
    sentence = remove_repeated_characters(sentence)
    sentence = replace_numbers(sentence)
    sentence = tokenize_sentence(sentence)
    return sentence

def replace_emojis(sentence, emoji_mapping):
    processed_sentence = []
    for char in sentence:
        if char in emoji_mapping:
            processed_sentence.append(emoji_mapping[char])
        elif not emoji.is_emoji(char):
            processed_sentence.append(char)
    return ''.join(processed_sentence)

def remove_profanity(sentence):
    profane_words = ["loz", "vloz", "vl", "dm", "đm", "clgt", "dmm", "cc", "vc", "đù mé", "vãi"]
    words = sentence.split()
    filtered_words = [word for word in words if word.lower() not in profane_words]
    return ' '.join(filtered_words)

def remove_special_characters(sentence):
    return re.sub(r"[\^\*@#&$%<>~{}|\\]", "", sentence)

def normalize_whitespace(sentence):
    return ' '.join(sentence.split())

def replace_abbreviations(sentence, abbreviations):
    words = sentence.split()
    replaced_words = [
        " ".join(abbreviations[word]) if word in abbreviations else word
        for word in words
    ]
    return ' '.join(replaced_words)

def remove_repeated_characters(sentence):
    # Example: "đẹp quáaaaaaa" -> "đẹp quá"
    return re.sub(r"(.)\1{2,}", r"\1", sentence)

def replace_numbers(sentence):
    # Replace every number with the [number] token
    return re.sub(r"\d+", "[number]", sentence)

def tokenize_sentence(sentence):
    # Word-segment with underthesea
    return ' '.join(word_tokenize(sentence))

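# --- Illustrative sketch (not part of the original pipeline) ---
# A tiny, self-contained example of what the helpers above do; the sample
# sentence and the one-entry abbreviation dictionary are assumptions, not
# taken from abbreviations.json.
def _demo_preprocessing_steps():
    demo_abbreviations = {"ko": ["không"]}                    # hypothetical entry
    step = "dịch vụ tốtttt, 10 điểm ko chê"
    step = replace_abbreviations(step, demo_abbreviations)    # "ko" -> "không"
    step = remove_repeated_characters(step)                   # "tốtttt" -> "tốt"
    step = replace_numbers(step)                              # "10" -> "[number]"
    return step  # -> "dịch vụ tốt, [number] điểm không chê"
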
# ========== VOCABULARY CLASS ==========

class Vocabulary:
    def __init__(self):
        self.word2id = {}
        self.word2id['<pad>'] = 0
        self.word2id['<unk>'] = 1
        self.unk_id = 1
        self.id2word = {0: '<pad>', 1: '<unk>'}

    def __getitem__(self, word):
        return self.word2id.get(word, self.unk_id)

    def __contains__(self, word):
        return word in self.word2id

    def __len__(self):
        return len(self.word2id)

    def lookup_tokens(self, indices):
        return [self.id2word[idx] for idx in indices]

    def add(self, word):
        if word not in self.word2id:
            idx = len(self.word2id)
            self.word2id[word] = idx
            self.id2word[idx] = word

    @staticmethod
    def tokenize_corpus(corpus):
        tokenized_corpus = []
        for doc in tqdm(corpus, desc="Tokenizing Corpus"):
            tokens = [w.replace(" ", "_") for w in word_tokenize(doc)]
            tokenized_corpus.append(tokens)
        return tokenized_corpus

    def corpus_to_tensor(self, corpus, is_tokenized=False):
        """
        corpus: list of sentences (strings) or list of token lists (if is_tokenized=True)
        return: list[list[int]], one list of token indices per sentence
        """
        tokenized_corpus = (
            self.tokenize_corpus(corpus) if not is_tokenized else corpus
        )
        return [
            [self[token] for token in doc]
            for doc in tokenized_corpus
        ]

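# --- Illustrative sketch (not part of the original script) ---
# How the Vocabulary maps tokens to ids and unseen tokens to <unk>;
# the example tokens are assumptions.
def _demo_vocabulary_lookup():
    vocab = Vocabulary()
    for token in ["dịch_vụ", "tốt"]:
        vocab.add(token)
    # Known tokens get their own ids, unseen tokens fall back to unk_id (1).
    ids = vocab.corpus_to_tensor([["dịch_vụ", "tốt", "tuyệt_vời"]], is_tokenized=True)
    return ids  # [[2, 3, 1]]
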
# ========== EMOJI MAPPING ==========

emoji_mapping = {
    "😀": "[joy]", "😃": "[joy]", "😄": "[joy]", "😁": "[joy]", "😆": "[joy]", "😅": "[joy]", "😂": "[joy]", "🤣": "[joy]",
    "🙂": "[love]", "🙃": "[love]", "😉": "[love]", "😊": "[love]", "😇": "[love]", "🥰": "[love]", "😍": "[love]",
    "🤩": "[love]", "😘": "[love]", "😗": "[love]", "☺": "[love]", "😚": "[love]", "😙": "[love]",
    "😋": "[satisfaction]", "😛": "[satisfaction]", "😜": "[satisfaction]", "🤪": "[satisfaction]", "😝": "[satisfaction]",
    "🤑": "[satisfaction]",
    "🤐": "[neutral]", "🤨": "[neutral]", "😐": "[neutral]", "😑": "[neutral]", "😶": "[neutral]",
    "😏": "[sarcasm]",
    "😒": "[disappointment]", "🙄": "[disappointment]", "😬": "[disappointment]",
    "😔": "[sadness]", "😪": "[sadness]", "😢": "[sadness]", "😭": "[sadness]", "😥": "[sadness]", "😓": "[sadness]",
    "😩": "[tiredness]", "😫": "[tiredness]", "🥱": "[tiredness]",
    "🤤": "[discomfort]", "🤢": "[discomfort]", "🤮": "[discomfort]", "🤧": "[discomfort]", "🥵": "[discomfort]",
    "🥶": "[discomfort]", "🥴": "[discomfort]", "😵": "[discomfort]", "🤯": "[discomfort]",
    "😕": "[confused]", "😟": "[confused]", "🙁": "[confused]", "☹": "[confused]",
    "😮": "[surprise]", "😯": "[surprise]", "😲": "[surprise]", "😳": "[surprise]", "🥺": "[pleading]",
    "😦": "[fear]", "😧": "[fear]", "😨": "[fear]", "😰": "[fear]", "😱": "[fear]",
    "😖": "[confusion]", "😣": "[confusion]", "😞": "[confusion]",
    "😤": "[anger]", "😡": "[anger]", "😠": "[anger]", "🤬": "[anger]", "😈": "[mischievous]", "👿": "[mischievous]"
}

def load_abbreviations(path):
    with open(path, "r", encoding="utf-8") as f:
        return json.load(f)


# ========== DATA MANAGER ==========

class DataManager:
    def __init__(self, file_path, abbreviations_path, word2vec_path):
        self.file_path = file_path
        self.abbreviations_path = abbreviations_path
        self.word2vec_path = word2vec_path
        self.vocabulary = None
        self.word_embeddings = None
        self.abbreviations = None
        self.load_abbreviations()

    def load_abbreviations(self):
        with open(self.abbreviations_path, "r", encoding="utf-8") as f:
            self.abbreviations = json.load(f)

    def load_word2vec(self):
        """
        Load vectors from the word2vec file; torchtext.Vectors is used to
        load the pretrained embeddings.
        """
        self.word_embeddings = Vectors(
            name=self.word2vec_path,
            unk_init=torch.Tensor.normal_
        )

    def create_vocab_from_corpus(self, corpus, max_vocab_size=30000):
        """
        Build the vocabulary from the corpus, keeping only the top
        max_vocab_size words.
        """
        vocab = Vocabulary()
        from collections import Counter
        counter = Counter()

        for sent in corpus:
            for token in sent.split():
                counter[token] += 1

        most_common = counter.most_common(max_vocab_size)
        for word, _freq in most_common:
            vocab.add(word)

        return vocab

    def preprocess_data(self):
        df = pd.read_excel(self.file_path)
        if "Sentence" not in df.columns:
            raise ValueError("Cột 'Sentence' không tồn tại trong dataset!")

        # Preprocess each sentence
        df["processed_sentence"] = df["Sentence"].apply(
            lambda x: preprocess_sentence(str(x), self.abbreviations, emoji_mapping)
        )

        # Drop empty rows
        df = df[df["processed_sentence"].str.strip().astype(bool)]

        # Build the vocabulary from the data itself
        all_sentences = df["processed_sentence"].tolist()
        self.vocabulary = self.create_vocab_from_corpus(all_sentences, max_vocab_size=30000)

        # Load word2vec
        self.load_word2vec()

        return df

    def build_pretrained_embedding_matrix(self, embedding_dim=100):
        """
        Build a numpy weight matrix (vocab_size x embedding_dim) initialized
        with the pretrained weights.
        """
        vocab_size = len(self.vocabulary)
        weight_matrix = np.random.normal(
            scale=0.1, size=(vocab_size, embedding_dim)
        ).astype(np.float32)

        # Copy the pretrained vectors where available
        for word, idx in self.vocabulary.word2id.items():
            if word in self.word_embeddings.stoi:
                weight_matrix[idx] = self.word_embeddings.vectors[
                    self.word_embeddings.stoi[word]
                ]

        return weight_matrix

    def split_and_convert(
        self, df, label_column="Emotion", maxlen=400, test_size=0.2,
        for_keras=False, batch_size=32
    ):
        """
        Split the data into train/test.
        - for_keras=False → return train_loader, test_loader, label_mapping (PyTorch)
        - for_keras=True → return X_train, X_test, y_train_onehot, y_test_onehot, label_mapping (Keras)
        """
        if label_column not in df.columns:
            raise ValueError(
                f"Cột '{label_column}' không tồn tại. Hiện có: {df.columns.tolist()}"
            )

        # Map labels -> integer ids
        label_mapping = {label: idx for idx, label in enumerate(df[label_column].unique())}
        df[label_column] = df[label_column].map(label_mapping)
        if df[label_column].isnull().any():
            missing = df[df[label_column].isnull()][label_column].unique()
            raise ValueError(f"Những nhãn cảm xúc sau không có trong label_mapping: {missing}")

        X = df["processed_sentence"].tolist()
        y = df[label_column].tolist()

        # Stratify to maintain class distribution
        X_train, X_test, y_train, y_test = train_test_split(
            X, y, test_size=test_size, random_state=42, stratify=y
        )

        # Convert text -> index
        X_train_ids = self.vocabulary.corpus_to_tensor(X_train, is_tokenized=False)
        X_test_ids = self.vocabulary.corpus_to_tensor(X_test, is_tokenized=False)

        # Pad
        X_train_padded = pad_sequences(X_train_ids, maxlen=maxlen, padding='post', truncating='post')
        X_test_padded = pad_sequences(X_test_ids, maxlen=maxlen, padding='post', truncating='post')

        print(">>> Debug Split and Convert:")
        print("X_train_padded.shape:", X_train_padded.shape)
        print("X_test_padded.shape: ", X_test_padded.shape)
        print("y_train length:", len(y_train))
        print("y_test length: ", len(y_test))
        print("vocab_size:", len(self.vocabulary))

        if for_keras:
            num_classes = len(label_mapping)
            y_train_onehot = torch.nn.functional.one_hot(
                torch.tensor(y_train),
                num_classes=num_classes
            ).numpy()
            y_test_onehot = torch.nn.functional.one_hot(
                torch.tensor(y_test),
                num_classes=num_classes
            ).numpy()

            print("y_train_onehot.shape:", y_train_onehot.shape)
            print("y_test_onehot.shape: ", y_test_onehot.shape)

            return X_train_padded, X_test_padded, y_train_onehot, y_test_onehot, label_mapping
        else:
            # Return DataLoaders
            X_train_t = torch.tensor(X_train_padded, dtype=torch.long)
            X_test_t = torch.tensor(X_test_padded, dtype=torch.long)
            y_train_t = torch.tensor(y_train, dtype=torch.long)
            y_test_t = torch.tensor(y_test, dtype=torch.long)

            train_ds = TensorDataset(X_train_t, y_train_t)
            test_ds = TensorDataset(X_test_t, y_test_t)

            train_loader = DataLoader(train_ds, batch_size=batch_size, shuffle=True)
            test_loader = DataLoader(test_ds, batch_size=batch_size, shuffle=False)

            return train_loader, test_loader, label_mapping

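# --- Illustrative sketch (not part of the original script) ---
# What the padding step inside split_and_convert produces: sequences shorter
# than maxlen are post-padded with 0 (<pad>), longer ones are post-truncated.
# The toy id lists and maxlen=6 are assumptions for illustration only.
def _demo_padding():
    toy_ids = [[2, 3, 1], [4, 5, 6, 7, 8, 9, 10]]
    padded = pad_sequences(toy_ids, maxlen=6, padding='post', truncating='post')
    # padded ->
    # [[2 3 1 0 0 0]
    #  [4 5 6 7 8 9]]
    return padded
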
# ========== PYTORCH RNN MODEL ==========

class SimpleRNN(nn.Module):
    def __init__(self, pretrained_weight, hidden_dim, output_dim, dropout=0.3):
        super(SimpleRNN, self).__init__()
        vocab_size, embedding_dim = pretrained_weight.shape
        # Build nn.Embedding from pretrained_weight
        self.embedding = nn.Embedding.from_pretrained(
            torch.from_numpy(pretrained_weight),
            freeze=False  # True to keep the embedding fixed
        )
        self.rnn = nn.LSTM(embedding_dim, hidden_dim, batch_first=True)
        self.dropout = nn.Dropout(dropout)
        self.fc = nn.Linear(hidden_dim, output_dim)

    def forward(self, x):
        embedded = self.dropout(self.embedding(x))
        _, (hidden, _) = self.rnn(embedded)
        hidden = self.dropout(hidden.squeeze(0))
        output = self.fc(hidden)
        return output


def predict_emotion_rnn(model, text, data_manager, label_mapping, device):
    model.eval()
    with torch.no_grad():
        processed_text = preprocess_sentence(text, data_manager.abbreviations, emoji_mapping)
        tokenized_text = data_manager.vocabulary.tokenize_corpus([processed_text])
        text_ids = data_manager.vocabulary.corpus_to_tensor(tokenized_text, is_tokenized=True)
        text_padded = pad_sequences(text_ids, maxlen=400, padding='post', truncating='post')
        text_tensor = torch.tensor(
            text_padded,
            dtype=torch.long
        ).to(device)

        output = model(text_tensor)
        _, predicted = torch.max(output, 1)
        rev_map = {v: k for k, v in label_mapping.items()}
        return rev_map[predicted.item()]

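# --- Illustrative sketch (not part of the original script) ---
# Expected tensor shapes through SimpleRNN, using a small random embedding
# matrix; the sizes (vocab 10, dim 8, hidden 16, 3 classes) are assumptions.
def _demo_simple_rnn_shapes():
    demo_weights = np.random.normal(scale=0.1, size=(10, 8)).astype(np.float32)
    demo_model = SimpleRNN(demo_weights, hidden_dim=16, output_dim=3)
    demo_batch = torch.randint(0, 10, (2, 5))   # (batch=2, seq_len=5) token ids
    logits = demo_model(demo_batch)             # embedding -> LSTM -> last hidden -> fc
    return logits.shape                         # torch.Size([2, 3])
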
# ========== KERAS CNN-LSTM MODEL ==========

def predict_emotion_cnn_lstm(model, text, data_manager, label_mapping):
    processed_text = preprocess_sentence(text, data_manager.abbreviations, emoji_mapping)
    tokenized_text = data_manager.vocabulary.tokenize_corpus([processed_text])
    text_ids = data_manager.vocabulary.corpus_to_tensor(tokenized_text, is_tokenized=True)
    text_padded = pad_sequences(text_ids, maxlen=400, padding='post', truncating='post')
    output = model.predict(text_padded)
    pred = output.argmax(axis=1)[0]
    rev_map = {v: k for k, v in label_mapping.items()}
    return rev_map[pred]


# ========== MAIN ==========

if __name__ == "__main__":
    from keras.models import Model
    from keras.layers import (
        Input, Embedding, Convolution1D, LSTM, Dense, Dropout, Lambda, concatenate
    )
    from keras.optimizers import Adam
    from keras.callbacks import ModelCheckpoint, EarlyStopping

    # -------- PATHS ----------
    file_path = "train.xlsx"
    abbreviations_path = "abbreviations.json"
    word2vec_path = "word2vec_vi_syllables_100dims.txt"
    output_path = "processed.xlsx"

    # Initialize the DataManager
    data_manager = DataManager(
        file_path=file_path,
        abbreviations_path=abbreviations_path,
        word2vec_path=word2vec_path
    )

    # 1) Preprocess, build the vocab, load word2vec
    df = data_manager.preprocess_data()
    print("Trước khi cân bằng lớp (undersampling/oversampling):")
    print(df["Emotion"].value_counts())

    # 2) Balance the classes (example: oversample 'Other' up to 3000)
    # Adjust to your own needs
    df_enjoyment = df[df["Emotion"] == "Enjoyment"]
    df_other = df[df["Emotion"] == "Other"]
    df_anger = df[df["Emotion"] == "Anger"]
    df_sadness = df[df["Emotion"] == "Sadness"]
    df_disgust = df[df["Emotion"] == "Disgust"]
    df_fear = df[df["Emotion"] == "Fear"]
    df_surprise = df[df["Emotion"] == "Surprise"]

    # Oversample the 'Other' class up to 3000 (illustration only)
    if len(df_other) < 3000:
        df_other_oversampled = resample(
            df_other,
            replace=True,
            n_samples=3000,
            random_state=42
        )
    else:
        df_other_oversampled = df_other

    # Keep the remaining classes as-is (or oversample them as needed)
    df_balanced = pd.concat([
        df_enjoyment,
        df_other_oversampled,
        df_anger,
        df_sadness,
        df_disgust,
        df_fear,
        df_surprise
    ], axis=0)

    df_balanced = df_balanced.sample(frac=1, random_state=42).reset_index(drop=True)
    df = df_balanced

    print("\nSau khi cân bằng lớp (demo oversample):")
    print(df["Emotion"].value_counts())

    # Export to file (optional)
    df.to_excel(output_path, index=False)

    # ========== TRAIN RNN PYTORCH ==========

    print("\n========== Training PyTorch SimpleRNN ==========")

    # Build the pretrained embedding matrix
    pretrained_matrix = data_manager.build_pretrained_embedding_matrix(embedding_dim=100)

    # Split and convert the data into DataLoaders
    train_loader, test_loader, label_mapping = data_manager.split_and_convert(
        df, label_column="Emotion", maxlen=400, test_size=0.2,
        for_keras=False, batch_size=32
    )

    hidden_dim = 128
    output_dim = len(label_mapping)

    model_rnn = SimpleRNN(pretrained_weight=pretrained_matrix,
                          hidden_dim=hidden_dim,
                          output_dim=output_dim,
                          dropout=0.3)
    criterion = nn.CrossEntropyLoss()
    optimizer = optim.Adam(model_rnn.parameters(), lr=1e-3)

    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    model_rnn.to(device)

    num_epochs = 20
    for epoch in range(num_epochs):
        model_rnn.train()
        epoch_loss = 0
        correct = 0
        total = 0

        for X_batch, y_batch in train_loader:
            X_batch = X_batch.to(device)
            y_batch = y_batch.to(device)

            optimizer.zero_grad()
            preds = model_rnn(X_batch)
            loss = criterion(preds, y_batch)
            loss.backward()
            optimizer.step()

            epoch_loss += loss.item()
            _, pred_label = torch.max(preds, 1)
            correct += (pred_label == y_batch).sum().item()
            total += y_batch.size(0)

        epoch_accuracy = correct / total
        epoch_loss_avg = epoch_loss / len(train_loader)
        print(f"Epoch {epoch+1}/{num_epochs}, "
              f"Loss: {epoch_loss_avg:.4f}, "
              f"Accuracy: {epoch_accuracy:.4f}")

    # Evaluate on the test set with detailed metrics
    model_rnn.eval()
    test_loss = 0
    correct = 0
    total = 0
    y_true = []
    y_pred = []
    with torch.no_grad():
        for X_batch, y_batch in test_loader:
            X_batch = X_batch.to(device)
            y_batch = y_batch.to(device)
            preds = model_rnn(X_batch)
            loss = criterion(preds, y_batch)
            test_loss += loss.item()

            _, predicted = torch.max(preds, 1)
            correct += (predicted == y_batch).sum().item()
            total += y_batch.size(0)

            y_true.extend(y_batch.cpu().numpy())
            y_pred.extend(predicted.cpu().numpy())

    test_accuracy = accuracy_score(y_true, y_pred)
    test_loss_avg = test_loss / len(test_loader)
    precision_macro = precision_score(y_true, y_pred, average='macro', zero_division=0)
    precision_weighted = precision_score(y_true, y_pred, average='weighted', zero_division=0)
    recall_macro = recall_score(y_true, y_pred, average='macro', zero_division=0)
    recall_weighted = recall_score(y_true, y_pred, average='weighted', zero_division=0)
    f1_macro = f1_score(y_true, y_pred, average='macro', zero_division=0)
    f1_weighted = f1_score(y_true, y_pred, average='weighted', zero_division=0)
    report = classification_report(y_true, y_pred, target_names=label_mapping.keys(), digits=4)
    conf_matrix = confusion_matrix(y_true, y_pred)

    # Print the metrics
    print(f"\nTest Loss: {test_loss_avg:.4f}, Test Accuracy: {test_accuracy:.4f}")
    print(f"Precision (Macro): {precision_macro:.4f}")
    print(f"Precision (Weighted): {precision_weighted:.4f}")
    print(f"Recall (Macro): {recall_macro:.4f}")
    print(f"Recall (Weighted): {recall_weighted:.4f}")
    print(f"F1-Score (Macro): {f1_macro:.4f}")
    print(f"F1-Score (Weighted): {f1_weighted:.4f}")

    print("\n========== Classification Report ==========")
    print(report)

    print("\n========== Confusion Matrix ==========")
    print(conf_matrix)

    # Save the report to a file
    rnn_report_dir = "rnn_emotion_model"
    os.makedirs(rnn_report_dir, exist_ok=True)
    with open(os.path.join(rnn_report_dir, "classification_report.txt"), "w", encoding="utf-8") as f:
        f.write("========== Classification Report ==========\n")
        f.write(report)
        f.write("\n========== Additional Metrics ==========\n")
        f.write(f"Test Loss: {test_loss_avg:.4f}\n")
        f.write(f"Test Accuracy: {test_accuracy:.4f}\n")
        f.write(f"Precision (Macro): {precision_macro:.4f}\n")
        f.write(f"Precision (Weighted): {precision_weighted:.4f}\n")
        f.write(f"Recall (Macro): {recall_macro:.4f}\n")
        f.write(f"Recall (Weighted): {recall_weighted:.4f}\n")
        f.write(f"F1-Score (Macro): {f1_macro:.4f}\n")
        f.write(f"F1-Score (Weighted): {f1_weighted:.4f}\n")
        f.write("\n========== Confusion Matrix ==========\n")
        f.write(np.array2string(conf_matrix))

    print("\n========== Classification Report saved to 'rnn_emotion_model/classification_report.txt' ==========")

    # Save the RNN model
    torch.save(model_rnn.state_dict(), os.path.join(rnn_report_dir, "simple_rnn.pth"))
    print("========== RNN Model saved to 'rnn_emotion_model/simple_rnn.pth' ==========")

    # ========== TRAIN CNN-LSTM KERAS ==========

    print("\n========== Training CNN-LSTM (Keras) ==========")

    # Pretrained embeddings for Keras: pretrained_matrix (num_vocab x 100)
    # is passed to the Embedding layer via weights=[...]
    X_train_keras, X_test_keras, y_train_keras, y_test_keras, label_mapping_keras = data_manager.split_and_convert(
        df, label_column="Emotion", maxlen=400, test_size=0.2,
        for_keras=True
    )

    maxlen = 400
    vocab_size, embedding_dim = pretrained_matrix.shape

    # Cast pretrained_matrix to float32 so Keras accepts it
    pretrained_matrix_keras = pretrained_matrix.astype(np.float32)

    input_layer = Input(shape=(maxlen,), dtype='int32', name='main_input')
    emb_layer = Embedding(
        input_dim=vocab_size,
        output_dim=embedding_dim,
        weights=[pretrained_matrix_keras],
        trainable=True  # True or False depending on whether to fine-tune the embedding
    )(input_layer)

    def max_1d(X):
        return tf.reduce_max(X, axis=1)

    con3 = Convolution1D(150, kernel_size=3, activation='relu')(emb_layer)
    pool_con3 = Lambda(max_1d, output_shape=(150,))(con3)

    con5 = Convolution1D(150, kernel_size=5, activation='relu')(emb_layer)
    pool_con5 = Lambda(max_1d, output_shape=(150,))(con5)

    lstm_out = LSTM(128, dropout=0.3)(emb_layer)

    merged = concatenate([pool_con3, pool_con5, lstm_out])
    dense = Dense(100, activation='relu')(merged)
    drop = Dropout(0.3)(dense)
    output = Dense(output_dim, activation='softmax')(drop)

    model_cnn_lstm = Model(inputs=input_layer, outputs=output)
    model_cnn_lstm.compile(
        loss='categorical_crossentropy',
        optimizer=Adam(learning_rate=1e-3),
        metrics=['accuracy']
    )

    checkpoint = ModelCheckpoint(
        'cnn_lstm_best.keras',
        save_best_only=True,
        monitor='val_accuracy',
        mode='max'
    )
    early_stopping = EarlyStopping(
        monitor='val_accuracy',
        patience=5,
        restore_best_weights=True
    )

    history = model_cnn_lstm.fit(
        X_train_keras, y_train_keras,
        validation_data=(X_test_keras, y_test_keras),
        epochs=30,
        batch_size=32,
        callbacks=[checkpoint, early_stopping]
    )

    # Evaluate on the test set with detailed metrics
    loss, acc = model_cnn_lstm.evaluate(X_test_keras, y_test_keras)
    print(f"CNN-LSTM Test Loss: {loss:.4f}, Test Accuracy: {acc:.4f}")

    # Collect predictions and compute the metrics
    y_pred_cnn_lstm = model_cnn_lstm.predict(X_test_keras)
    y_pred_cnn_lstm = np.argmax(y_pred_cnn_lstm, axis=1)
    y_true_cnn_lstm = np.argmax(y_test_keras, axis=1)

    test_accuracy_cnn_lstm = accuracy_score(y_true_cnn_lstm, y_pred_cnn_lstm)
    precision_macro_cnn_lstm = precision_score(y_true_cnn_lstm, y_pred_cnn_lstm, average='macro', zero_division=0)
    precision_weighted_cnn_lstm = precision_score(y_true_cnn_lstm, y_pred_cnn_lstm, average='weighted', zero_division=0)
    recall_macro_cnn_lstm = recall_score(y_true_cnn_lstm, y_pred_cnn_lstm, average='macro', zero_division=0)
    recall_weighted_cnn_lstm = recall_score(y_true_cnn_lstm, y_pred_cnn_lstm, average='weighted', zero_division=0)
    f1_macro_cnn_lstm = f1_score(y_true_cnn_lstm, y_pred_cnn_lstm, average='macro', zero_division=0)
    f1_weighted_cnn_lstm = f1_score(y_true_cnn_lstm, y_pred_cnn_lstm, average='weighted', zero_division=0)
    report_cnn_lstm = classification_report(y_true_cnn_lstm, y_pred_cnn_lstm, target_names=label_mapping.keys(), digits=4)
    conf_matrix_cnn_lstm = confusion_matrix(y_true_cnn_lstm, y_pred_cnn_lstm)

    # Print the metrics
    print(f"\nCNN-LSTM Test Accuracy: {test_accuracy_cnn_lstm:.4f}")
    print(f"Precision (Macro): {precision_macro_cnn_lstm:.4f}")
    print(f"Precision (Weighted): {precision_weighted_cnn_lstm:.4f}")
    print(f"Recall (Macro): {recall_macro_cnn_lstm:.4f}")
    print(f"Recall (Weighted): {recall_weighted_cnn_lstm:.4f}")
    print(f"F1-Score (Macro): {f1_macro_cnn_lstm:.4f}")
    print(f"F1-Score (Weighted): {f1_weighted_cnn_lstm:.4f}")

    print("\n========== CNN-LSTM Classification Report ==========")
    print(report_cnn_lstm)

    print("\n========== CNN-LSTM Confusion Matrix ==========")
    print(conf_matrix_cnn_lstm)

    # Save the report to a file
    cnn_lstm_report_dir = "cnn_lstm_emotion_model"
    os.makedirs(cnn_lstm_report_dir, exist_ok=True)
    with open(os.path.join(cnn_lstm_report_dir, "classification_report.txt"), "w", encoding="utf-8") as f:
        f.write("========== CNN-LSTM Classification Report ==========\n")
        f.write(report_cnn_lstm)
        f.write("\n========== Additional Metrics ==========\n")
        f.write(f"Test Loss: {loss:.4f}\n")
        f.write(f"Test Accuracy: {test_accuracy_cnn_lstm:.4f}\n")
        f.write(f"Precision (Macro): {precision_macro_cnn_lstm:.4f}\n")
        f.write(f"Precision (Weighted): {precision_weighted_cnn_lstm:.4f}\n")
        f.write(f"Recall (Macro): {recall_macro_cnn_lstm:.4f}\n")
        f.write(f"Recall (Weighted): {recall_weighted_cnn_lstm:.4f}\n")
        f.write(f"F1-Score (Macro): {f1_macro_cnn_lstm:.4f}\n")
        f.write(f"F1-Score (Weighted): {f1_weighted_cnn_lstm:.4f}\n")
        f.write("\n========== Confusion Matrix ==========\n")
        f.write(np.array2string(conf_matrix_cnn_lstm))

    print("\n========== CNN-LSTM Classification Report saved to 'cnn_lstm_emotion_model/classification_report.txt' ==========")

    # Save the CNN-LSTM model
    model_cnn_lstm.save(os.path.join(cnn_lstm_report_dir, 'cnn_lstm_model.keras'))
    print(f"========== CNN-LSTM Model saved to '{cnn_lstm_report_dir}/cnn_lstm_model.keras' ==========")

    # ========== SAVE LABEL MAPPING AND VOCABULARY ==========
    # Save label_mapping and vocabulary for the RNN
    with open(os.path.join(rnn_report_dir, "label_mapping.json"), "w", encoding="utf-8") as f:
        json.dump(label_mapping, f, ensure_ascii=False, indent=4)

    with open(os.path.join(rnn_report_dir, "vocabulary.json"), "w", encoding="utf-8") as f:
        json.dump(data_manager.vocabulary.word2id, f, ensure_ascii=False, indent=4)

    # Save label_mapping and vocabulary for the CNN-LSTM.
    # Assuming they are identical to the RNN's, a single copy is enough;
    # adjust accordingly if they differ.

    print("========== Label Mapping and Vocabulary saved ==========")

    # ========== DEMO: PREDICT ONE NEW SENTENCE ==========

    custom_text = "Tôi rất vui khi sử dụng dịch vụ này!"

    # RNN (PyTorch)
    emotion_rnn = predict_emotion_rnn(
        model_rnn, custom_text, data_manager, label_mapping, device
    )
    print(f"Predicted Emotion (RNN): {emotion_rnn}")

    # CNN-LSTM (Keras)
    cnn_lstm_loaded = tf.keras.models.load_model(os.path.join(cnn_lstm_report_dir, 'cnn_lstm_model.keras'))
    emotion_cnn_lstm = predict_emotion_cnn_lstm(
        cnn_lstm_loaded, custom_text, data_manager, label_mapping
    )
    print(f"Predicted Emotion (CNN-LSTM): {emotion_cnn_lstm}")

    # Check TF and GPU
    print("TF version:", tf.__version__)
    print("GPU devices:", tf.config.list_physical_devices("GPU"))
    # os.system("nvidia-smi")  # to inspect GPU info
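    # --- Illustrative sketch (not part of the original script) ---
    # Reloading the saved state_dict for later inference would look roughly
    # like this (kept as a comment so the training flow above is unchanged):
    #   model_loaded = SimpleRNN(pretrained_matrix, hidden_dim=128, output_dim=len(label_mapping))
    #   model_loaded.load_state_dict(torch.load("rnn_emotion_model/simple_rnn.pth", map_location=device))
    #   model_loaded.to(device).eval()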
main_lstm.py ADDED
@@ -0,0 +1,289 @@
# lstm_emotion_classifier.py
# -*- coding: utf-8 -*-

import re
import emoji
import json
import pandas as pd
import numpy as np
import tensorflow as tf
from underthesea import word_tokenize
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
from sklearn.utils import resample
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, LSTM, Dense, Dropout
from tensorflow.keras.callbacks import EarlyStopping
import joblib
import os
import matplotlib.pyplot as plt
import seaborn as sns

########################
# PREPROCESSING
########################

def replace_emojis(sentence, emoji_mapping):
    processed_sentence = []
    for char in sentence:
        if char in emoji_mapping:
            processed_sentence.append(emoji_mapping[char])
        elif not emoji.is_emoji(char):
            processed_sentence.append(char)
    return ''.join(processed_sentence)

def remove_profanity(sentence):
    profane_words = ["loz", "vloz", "vl", "dm", "đm", "clgt", "dmm", "cc", "vc", "đù mé", "vãi"]
    words = sentence.split()
    filtered = [w for w in words if w.lower() not in profane_words]
    return ' '.join(filtered)

def remove_special_characters(sentence):
    return re.sub(r"[\^\*@#&$%<>~{}|\\]", "", sentence)

def normalize_whitespace(sentence):
    return ' '.join(sentence.split())

def remove_repeated_characters(sentence):
    return re.sub(r"(.)\1{2,}", r"\1", sentence)

def replace_numbers(sentence):
    return re.sub(r"\d+", "[number]", sentence)

def tokenize_underthesea(sentence):
    tokens = word_tokenize(sentence)
    return " ".join(tokens)

def preprocess_sentence(sentence, abbreviations, emoji_mapping):
    sentence = sentence.lower()
    sentence = replace_emojis(sentence, emoji_mapping)
    sentence = remove_profanity(sentence)
    sentence = remove_special_characters(sentence)
    sentence = normalize_whitespace(sentence)
    # Expand abbreviations
    words = sentence.split()
    replaced = []
    for w in words:
        if w in abbreviations:
            replaced.append(" ".join(abbreviations[w]))
        else:
            replaced.append(w)
    sentence = " ".join(replaced)
    sentence = remove_repeated_characters(sentence)
    sentence = replace_numbers(sentence)
    # Vietnamese word segmentation
    sentence = tokenize_underthesea(sentence)
    return sentence

emoji_mapping = {
    "😀": "[joy]", "😃": "[joy]", "😄": "[joy]", "😁": "[joy]", "😆": "[joy]", "😅": "[joy]", "😂": "[joy]", "🤣": "[joy]",
    "🙂": "[love]", "🙃": "[love]", "😉": "[love]", "😊": "[love]", "😇": "[love]", "🥰": "[love]", "😍": "[love]",
    "🤩": "[love]", "😘": "[love]", "😗": "[love]", "☺": "[love]", "😚": "[love]", "😙": "[love]",
    "😋": "[satisfaction]", "😛": "[satisfaction]", "😜": "[satisfaction]", "🤪": "[satisfaction]", "😝": "[satisfaction]",
    "🤑": "[satisfaction]",
    "🤐": "[neutral]", "🤨": "[neutral]", "😐": "[neutral]", "😑": "[neutral]", "😶": "[neutral]",
    "😏": "[sarcasm]",
    "😒": "[disappointment]", "🙄": "[disappointment]", "😬": "[disappointment]",
    "😔": "[sadness]", "😪": "[sadness]", "😢": "[sadness]", "😭": "[sadness]", "😥": "[sadness]", "😓": "[sadness]",
    "😩": "[tiredness]", "😫": "[tiredness]", "🥱": "[tiredness]",
    "🤤": "[discomfort]", "🤢": "[discomfort]", "🤮": "[discomfort]", "🤧": "[discomfort]", "🥵": "[discomfort]",
    "🥶": "[discomfort]", "🥴": "[discomfort]", "😵": "[discomfort]", "🤯": "[discomfort]",
    "😕": "[confused]", "😟": "[confused]", "🙁": "[confused]", "☹": "[confused]",
    "😮": "[surprise]", "😯": "[surprise]", "😲": "[surprise]", "😳": "[surprise]", "🥺": "[pleading]",
    "😦": "[fear]", "😧": "[fear]", "😨": "[fear]", "😰": "[fear]", "😱": "[fear]",
    "😖": "[confusion]", "😣": "[confusion]", "😞": "[confusion]",
    "😤": "[anger]", "😡": "[anger]", "😠": "[anger]", "🤬": "[anger]", "😈": "[mischievous]", "👿": "[mischievous]"
}

def load_abbreviations(path):
    with open(path, "r", encoding="utf-8") as f:
        return json.load(f)

###################################
# MAIN
###################################
if __name__ == "__main__":
    file_path = "train.xlsx"
    abbreviations_path = "abbreviations.json"
    output_path = "processed_phobert.xlsx"

    abbreviations = load_abbreviations(abbreviations_path)

    df = pd.read_excel(file_path)
    if "Sentence" not in df.columns or "Emotion" not in df.columns:
        raise ValueError("Dataset phải chứa cột 'Sentence' và 'Emotion'!")

    # Preprocessing
    df["processed_sentence"] = df["Sentence"].apply(
        lambda x: preprocess_sentence(str(x), abbreviations, emoji_mapping)
    )
    # Drop empty rows
    df = df[df["processed_sentence"].str.strip().astype(bool)]

    print("Trước khi cân bằng:")
    print(df["Emotion"].value_counts())

    # =========== BALANCE ALL CLASSES =============
    # Size of the largest class
    max_count = df["Emotion"].value_counts().max()

    df_balanced_list = []
    for emo in df["Emotion"].unique():
        df_emo = df[df["Emotion"] == emo]
        if len(df_emo) < max_count:
            # Oversample up to max_count
            df_emo_oversampled = resample(
                df_emo,
                replace=True,
                n_samples=max_count,
                random_state=42
            )
            df_balanced_list.append(df_emo_oversampled)
        else:
            # Already at max_count, keep as-is
            df_balanced_list.append(df_emo)

    df = pd.concat(df_balanced_list, axis=0)
    df = df.sample(frac=1, random_state=42).reset_index(drop=True)

    print("\nSau khi cân bằng tất cả lớp:")
    print(df["Emotion"].value_counts())

    df.to_excel(output_path, index=False)

    # Build label2id and id2label in the fixed order provided below
    custom_id2label = {
        0: 'Anger',
        1: 'Disgust',
        2: 'Enjoyment',
        3: 'Fear',
        4: 'Other',
        5: 'Sadness',
        6: 'Surprise'
    }
    label2id = {label: idx for idx, label in enumerate(custom_id2label.values())}
    id2label = {v: k for k, v in label2id.items()}

    df["label_id"] = df["Emotion"].map(label2id)

    # Train/test split
    train_df, test_df = train_test_split(df, test_size=0.2, random_state=42, stratify=df["label_id"])
    print(f"Train size = {len(train_df)}, Test size = {len(test_df)}")

    # Feature extraction with Tokenizer and padding
    tokenizer = Tokenizer(num_words=5000, oov_token="<OOV>")
    tokenizer.fit_on_texts(train_df["processed_sentence"])

    X_train_seq = tokenizer.texts_to_sequences(train_df["processed_sentence"])
    X_test_seq = tokenizer.texts_to_sequences(test_df["processed_sentence"])

    max_length = 256
    X_train = pad_sequences(X_train_seq, maxlen=max_length, padding='post', truncating='post')
    X_test = pad_sequences(X_test_seq, maxlen=max_length, padding='post', truncating='post')

    y_train = train_df["label_id"].values
    y_test = test_df["label_id"].values

    # Convert labels to one-hot encoding
    num_classes = len(custom_id2label)
    y_train = tf.keras.utils.to_categorical(y_train, num_classes=num_classes)
    y_test = tf.keras.utils.to_categorical(y_test, num_classes=num_classes)

    # Build the LSTM model
    model = Sequential([
        Embedding(input_dim=5000, output_dim=128, input_length=max_length),
        LSTM(128, dropout=0.2, recurrent_dropout=0.2),
        Dense(64, activation='relu'),
        Dropout(0.5),
        Dense(num_classes, activation='softmax')
    ])

    model.compile(loss='categorical_crossentropy',
                  optimizer='adam',
                  metrics=['accuracy'])

    model.summary()

    # Train the model
    early_stop = EarlyStopping(monitor='val_accuracy', patience=3, restore_best_weights=True)

    history = model.fit(
        X_train, y_train,
        epochs=10,
        batch_size=32,
        validation_data=(X_test, y_test),
        callbacks=[early_stop],
        verbose=1
    )

    # Evaluate the model
    print("\n========== Evaluate on Test set ==========")
    loss, accuracy = model.evaluate(X_test, y_test, verbose=0)
    print(f"Test Accuracy: {accuracy:.4f}")

    # Predict and print the classification report
    y_pred_probs = model.predict(X_test)
    y_pred = np.argmax(y_pred_probs, axis=1)
    y_true = np.argmax(y_test, axis=1)

    # Print the classification report
    print("\nClassification Report:")
    report = classification_report(y_true, y_pred, target_names=custom_id2label.values())
    print(report)

    # Compute and print the confusion matrix
    conf_matrix = confusion_matrix(y_true, y_pred)
    print("\nConfusion Matrix:")
    print(conf_matrix)

    # Plot the confusion matrix
    # (create the output directory first so savefig does not fail when
    # "lstm_emotion_model" does not exist yet)
    os.makedirs("lstm_emotion_model", exist_ok=True)
    plt.figure(figsize=(10, 8))
    sns.heatmap(conf_matrix, annot=True, fmt='d', cmap='Blues',
                xticklabels=custom_id2label.values(),
                yticklabels=custom_id2label.values())
    plt.ylabel('Actual')
    plt.xlabel('Predicted')
    plt.title('Confusion Matrix')
    plt.tight_layout()
    plt.savefig(os.path.join("lstm_emotion_model", "confusion_matrix.png"))
    plt.close()
    print("\nConfusion Matrix plot saved to 'lstm_emotion_model/confusion_matrix.png'")

    # Save the classification report to a file
    report_path = os.path.join("lstm_emotion_model", "classification_report.txt")
    with open(report_path, "w", encoding="utf-8") as f:
        f.write("========== Classification Report ==========\n")
        f.write(report)
        f.write("\n========== Confusion Matrix ==========\n")
        f.write(np.array2string(conf_matrix))

    print(f"\nClassification Report saved to '{report_path}'")

    # Save the model and tokenizer
    model_output_dir = "./lstm_emotion_model"
    os.makedirs(model_output_dir, exist_ok=True)
    model.save(os.path.join(model_output_dir, "lstm_emotion_model.h5"))
    joblib.dump(tokenizer, os.path.join(model_output_dir, "tokenizer.joblib"))
    with open(os.path.join(model_output_dir, "id2label.json"), "w", encoding="utf-8") as f:
        json.dump(id2label, f, ensure_ascii=False, indent=4)

    print("\n========== Model and Tokenizer saved ==========")

    # Predict a single sentence (example)
    def predict_text(text):
        text_proc = preprocess_sentence(text, abbreviations, emoji_mapping)
        seq = tokenizer.texts_to_sequences([text_proc])
        padded = pad_sequences(seq, maxlen=max_length, padding='post', truncating='post')
        pred_prob = model.predict(padded)
        pred_id = np.argmax(pred_prob, axis=1)[0]
        label = custom_id2label[pred_id]
        return label

    custom_text = "Tôi rất vui khi sử dụng dịch vụ này!"
    emotion_pred = predict_text(custom_text)
    print("\nCâu ví dụ:", custom_text)
    print("Dự đoán cảm xúc:", emotion_pred)

    print("\nHoàn thành demo LSTM với cân bằng dữ liệu & nhiều epoch hơn!")
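    # --- Illustrative sketch (not part of the original script) ---
    # Reloading the artifacts saved above for later inference would look
    # roughly like this (kept as a comment so the demo flow is unchanged):
    #   loaded_model = tf.keras.models.load_model("lstm_emotion_model/lstm_emotion_model.h5")
    #   loaded_tokenizer = joblib.load("lstm_emotion_model/tokenizer.joblib")
    #   with open("lstm_emotion_model/id2label.json", encoding="utf-8") as f:
    #       loaded_id2label = json.load(f)
    #   # then: preprocess, tokenize, pad to max_length, predict, and map the
    #   # argmax through loaded_id2label (keys are strings after JSON round-trip)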
main_phobert.py ADDED
@@ -0,0 +1,349 @@
# phobert_emotion_balanced.py
# -*- coding: utf-8 -*-

import re
import emoji
import json
import pandas as pd
import torch
import numpy as np
import os
import matplotlib.pyplot as plt
import seaborn as sns

from transformers import (
    AutoTokenizer,
    AutoConfig,
    AutoModelForSequenceClassification,
    Trainer,
    TrainingArguments
)

from sklearn.model_selection import train_test_split
from sklearn.utils import resample
from sklearn.metrics import precision_score, recall_score, f1_score, accuracy_score, classification_report, confusion_matrix

########################
# PREPROCESSING
########################

def replace_emojis(sentence, emoji_mapping):
    processed_sentence = []
    for char in sentence:
        if char in emoji_mapping:
            processed_sentence.append(emoji_mapping[char])
        elif not emoji.is_emoji(char):
            processed_sentence.append(char)
    return ''.join(processed_sentence)

def remove_profanity(sentence):
    profane_words = ["loz", "vloz", "vl", "dm", "đm", "clgt", "dmm", "cc", "vc", "đù mé", "vãi"]
    words = sentence.split()
    filtered = [w for w in words if w.lower() not in profane_words]
    return ' '.join(filtered)

def remove_special_characters(sentence):
    return re.sub(r"[\^\*@#&$%<>~{}|\\]", "", sentence)

def normalize_whitespace(sentence):
    return ' '.join(sentence.split())

def remove_repeated_characters(sentence):
    return re.sub(r"(.)\1{2,}", r"\1", sentence)

def replace_numbers(sentence):
    return re.sub(r"\d+", "[number]", sentence)

def tokenize_underthesea(sentence):
    from underthesea import word_tokenize
    tokens = word_tokenize(sentence)
    return " ".join(tokens)

def preprocess_sentence(sentence, abbreviations, emoji_mapping):
    sentence = sentence.lower()
    sentence = replace_emojis(sentence, emoji_mapping)
    sentence = remove_profanity(sentence)
    sentence = remove_special_characters(sentence)
    sentence = normalize_whitespace(sentence)
    # Expand abbreviations
    words = sentence.split()
    replaced = []
    for w in words:
        if w in abbreviations:
            replaced.append(" ".join(abbreviations[w]))
        else:
            replaced.append(w)
    sentence = " ".join(replaced)
    sentence = remove_repeated_characters(sentence)
    sentence = replace_numbers(sentence)
    # Tokenize
    sentence = tokenize_underthesea(sentence)
    return sentence

emoji_mapping = {
    "😀": "[joy]", "😃": "[joy]", "😄": "[joy]", "😁": "[joy]", "😆": "[joy]", "😅": "[joy]", "😂": "[joy]", "🤣": "[joy]",
    "🙂": "[love]", "🙃": "[love]", "😉": "[love]", "😊": "[love]", "😇": "[love]", "🥰": "[love]", "😍": "[love]",
    "🤩": "[love]", "😘": "[love]", "😗": "[love]", "☺": "[love]", "😚": "[love]", "😙": "[love]",
    "😋": "[satisfaction]", "😛": "[satisfaction]", "😜": "[satisfaction]", "🤪": "[satisfaction]", "😝": "[satisfaction]",
    "🤑": "[satisfaction]",
    "🤐": "[neutral]", "🤨": "[neutral]", "😐": "[neutral]", "😑": "[neutral]", "😶": "[neutral]",
    "😏": "[sarcasm]",
    "😒": "[disappointment]", "🙄": "[disappointment]", "😬": "[disappointment]",
    "😔": "[sadness]", "😪": "[sadness]", "😢": "[sadness]", "😭": "[sadness]", "😥": "[sadness]", "😓": "[sadness]",
    "😩": "[tiredness]", "😫": "[tiredness]", "🥱": "[tiredness]",
    "🤤": "[discomfort]", "🤢": "[discomfort]", "🤮": "[discomfort]", "🤧": "[discomfort]", "🥵": "[discomfort]",
    "🥶": "[discomfort]", "🥴": "[discomfort]", "😵": "[discomfort]", "🤯": "[discomfort]",
    "😕": "[confused]", "😟": "[confused]", "🙁": "[confused]", "☹": "[confused]",
    "😮": "[surprise]", "😯": "[surprise]", "😲": "[surprise]", "😳": "[surprise]", "🥺": "[pleading]",
    "😦": "[fear]", "😧": "[fear]", "😨": "[fear]", "😰": "[fear]", "😱": "[fear]",
    "😖": "[confusion]", "😣": "[confusion]", "😞": "[confusion]",
    "😤": "[anger]", "😡": "[anger]", "😠": "[anger]", "🤬": "[anger]", "😈": "[mischievous]", "👿": "[mischievous]"
}

def load_abbreviations(path):
    with open(path, "r", encoding="utf-8") as f:
        return json.load(f)

# HF-style Dataset wrapper
class PhoBertEmotionDataset(torch.utils.data.Dataset):
    def __init__(self, encodings, labels):
        self.encodings = encodings
        self.labels = labels

    def __len__(self):
        return len(self.labels)

    def __getitem__(self, idx):
        item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}
        item["labels"] = torch.tensor(self.labels[idx])
        return item

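# --- Illustrative sketch (not part of the original script) ---
# How tokenizer output feeds PhoBertEmotionDataset: each item is a dict of
# encoded tensors plus a "labels" tensor. The example text and label id are
# assumptions, and calling this requires downloading the PhoBERT tokenizer.
def _demo_dataset_item():
    demo_tokenizer = AutoTokenizer.from_pretrained("vinai/phobert-base")
    demo_enc = demo_tokenizer(["dịch_vụ rất tốt"], padding=True, truncation=True, max_length=256)
    demo_ds = PhoBertEmotionDataset(demo_enc, labels=[2])
    item = demo_ds[0]
    return sorted(item.keys())  # e.g. ['attention_mask', 'input_ids', 'labels']
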
###################################
|
| 122 |
+
# MAIN
|
| 123 |
+
###################################
|
| 124 |
+
if __name__ == "__main__":
|
| 125 |
+
file_path = "train.xlsx"
|
| 126 |
+
abbreviations_path = "abbreviations.json"
|
| 127 |
+
output_path = "processed_phobert.xlsx"
|
| 128 |
+
|
| 129 |
+
abbreviations = load_abbreviations(abbreviations_path)
|
| 130 |
+
|
| 131 |
+
df = pd.read_excel(file_path)
|
| 132 |
+
if "Sentence" not in df.columns or "Emotion" not in df.columns:
|
| 133 |
+
raise ValueError("Dataset phải chứa cột 'Sentence' và 'Emotion'!")
|
| 134 |
+
|
| 135 |
+
# Preprocessing
|
| 136 |
+
df["processed_sentence"] = df["Sentence"].apply(
|
| 137 |
+
lambda x: preprocess_sentence(str(x), abbreviations, emoji_mapping)
|
| 138 |
+
)
|
| 139 |
+
# Drop empty rows
|
| 140 |
+
df = df[df["processed_sentence"].str.strip().astype(bool)]
|
| 141 |
+
|
| 142 |
+
print("Trước khi cân bằng:")
|
| 143 |
+
print(df["Emotion"].value_counts())
|
| 144 |
+
|
| 145 |
+
# =========== BALANCE ALL CLASSES =============
|
| 146 |
+
# Get the largest class count
|
| 147 |
+
max_count = df["Emotion"].value_counts().max()
|
| 148 |
+
|
| 149 |
+
df_balanced_list = []
|
| 150 |
+
for emo in df["Emotion"].unique():
|
| 151 |
+
df_emo = df[df["Emotion"] == emo]
|
| 152 |
+
if len(df_emo) < max_count:
|
| 153 |
+
# Oversample up to max_count
|
| 154 |
+
df_emo_oversampled = resample(
|
| 155 |
+
df_emo,
|
| 156 |
+
replace=True,
|
| 157 |
+
n_samples=max_count,
|
| 158 |
+
random_state=42
|
| 159 |
+
)
|
| 160 |
+
df_balanced_list.append(df_emo_oversampled)
|
| 161 |
+
else:
|
| 162 |
+
# This class already has max_count samples, keep it as is
|
| 163 |
+
df_balanced_list.append(df_emo)
|
| 164 |
+
|
| 165 |
+
df = pd.concat(df_balanced_list, axis=0)
|
| 166 |
+
df = df.sample(frac=1, random_state=42).reset_index(drop=True)
|
| 167 |
+
|
| 168 |
+
print("\nSau khi cân bằng tất cả lớp:")
|
| 169 |
+
print(df["Emotion"].value_counts())
|
| 170 |
+
|
| 171 |
+
df.to_excel(output_path, index=False)
|
| 172 |
+
|
| 173 |
+
# Build label2id
|
| 174 |
+
unique_labels = sorted(df["Emotion"].unique())  # sort for a stable ordering
|
| 175 |
+
label2id = {label: i for i, label in enumerate(unique_labels)}
|
| 176 |
+
id2label = {v: k for k, v in label2id.items()}
|
| 177 |
+
|
| 178 |
+
df["label_id"] = df["Emotion"].map(label2id)
|
| 179 |
+
|
| 180 |
+
# Train/test split
|
| 181 |
+
train_df, test_df = train_test_split(df, test_size=0.2, random_state=42, stratify=df["label_id"])
|
| 182 |
+
|
| 183 |
+
print(f"Train size = {len(train_df)}, Test size = {len(test_df)}")
|
| 184 |
+
|
| 185 |
+
# Load tokenizer
|
| 186 |
+
checkpoint = "vinai/phobert-base"
|
| 187 |
+
tokenizer = AutoTokenizer.from_pretrained(checkpoint)
|
| 188 |
+
|
| 189 |
+
def tokenize_texts(texts):
|
| 190 |
+
return tokenizer(
|
| 191 |
+
texts,
|
| 192 |
+
padding=True,
|
| 193 |
+
truncation=True,
|
| 194 |
+
max_length=256
|
| 195 |
+
)
|
| 196 |
+
|
| 197 |
+
train_texts = train_df["processed_sentence"].tolist()
|
| 198 |
+
train_labels = train_df["label_id"].tolist()
|
| 199 |
+
test_texts = test_df["processed_sentence"].tolist()
|
| 200 |
+
test_labels = test_df["label_id"].tolist()
|
| 201 |
+
|
| 202 |
+
train_encodings = tokenize_texts(train_texts)
|
| 203 |
+
test_encodings = tokenize_texts(test_texts)
|
| 204 |
+
|
| 205 |
+
train_dataset = PhoBertEmotionDataset(train_encodings, train_labels)
|
| 206 |
+
test_dataset = PhoBertEmotionDataset(test_encodings, test_labels)
|
| 207 |
+
|
| 208 |
+
# Load model
|
| 209 |
+
config = AutoConfig.from_pretrained(checkpoint)
|
| 210 |
+
config.num_labels = len(label2id)
|
| 211 |
+
model = AutoModelForSequenceClassification.from_pretrained(
|
| 212 |
+
checkpoint,
|
| 213 |
+
config=config
|
| 214 |
+
)
|
| 215 |
+
|
| 216 |
+
# More epochs (10), LR = 2e-5
|
| 217 |
+
training_args = TrainingArguments(
|
| 218 |
+
output_dir="./phobert_results_v2",
|
| 219 |
+
overwrite_output_dir=True,
|
| 220 |
+
do_train=True,
|
| 221 |
+
do_eval=True,
|
| 222 |
+
evaluation_strategy="epoch",
|
| 223 |
+
save_strategy="epoch",
|
| 224 |
+
num_train_epochs=10,  # more epochs
|
| 225 |
+
per_device_train_batch_size=16,
|
| 226 |
+
per_device_eval_batch_size=16,
|
| 227 |
+
learning_rate=2e-5,
|
| 228 |
+
logging_dir="./logs",
|
| 229 |
+
logging_steps=50,
|
| 230 |
+
load_best_model_at_end=True,
|
| 231 |
+
metric_for_best_model="f1_weighted",  # metric used to keep the best model
|
| 232 |
+
greater_is_better=True,
|
| 233 |
+
seed=42
|
| 234 |
+
)
|
| 235 |
+
|
| 236 |
+
# Define compute_metrics with additional metrics
|
| 237 |
+
def compute_metrics(eval_pred):
|
| 238 |
+
logits, labels = eval_pred
|
| 239 |
+
preds = np.argmax(logits, axis=-1)
|
| 240 |
+
precision_weighted = precision_score(labels, preds, average='weighted', zero_division=0)
|
| 241 |
+
recall_weighted = recall_score(labels, preds, average='weighted', zero_division=0)
|
| 242 |
+
f1_weighted = f1_score(labels, preds, average='weighted', zero_division=0)
|
| 243 |
+
precision_macro = precision_score(labels, preds, average='macro', zero_division=0)
|
| 244 |
+
recall_macro = recall_score(labels, preds, average='macro', zero_division=0)
|
| 245 |
+
f1_macro = f1_score(labels, preds, average='macro', zero_division=0)
|
| 246 |
+
accuracy = accuracy_score(labels, preds)
|
| 247 |
+
return {
|
| 248 |
+
"accuracy": accuracy,
|
| 249 |
+
"precision_weighted": precision_weighted,
|
| 250 |
+
"recall_weighted": recall_weighted,
|
| 251 |
+
"f1_weighted": f1_weighted,
|
| 252 |
+
"precision_macro": precision_macro,
|
| 253 |
+
"recall_macro": recall_macro,
|
| 254 |
+
"f1_macro": f1_macro
|
| 255 |
+
}
|
| 256 |
+
|
| 257 |
+
trainer = Trainer(
|
| 258 |
+
model=model,
|
| 259 |
+
args=training_args,
|
| 260 |
+
train_dataset=train_dataset,
|
| 261 |
+
eval_dataset=test_dataset,
|
| 262 |
+
tokenizer=tokenizer,
|
| 263 |
+
compute_metrics=compute_metrics
|
| 264 |
+
)
|
| 265 |
+
|
| 266 |
+
print("\n========== Training PhoBERT (balanced, more epochs) ==========")
|
| 267 |
+
trainer.train()
|
| 268 |
+
|
| 269 |
+
print("\n========== Evaluate on Test set ==========")
|
| 270 |
+
results = trainer.evaluate(test_dataset)
|
| 271 |
+
print("Test results:", results)
|
| 272 |
+
|
| 273 |
+
# Extract additional metrics
|
| 274 |
+
print("\n========== Additional Metrics ==========")
|
| 275 |
+
print(f"Test Loss: {results.get('eval_loss'):.4f}")
|
| 276 |
+
print(f"Test Accuracy: {results.get('eval_accuracy'):.4f}")
|
| 277 |
+
print(f"Precision (Macro): {results.get('eval_precision_macro'):.4f}")
|
| 278 |
+
print(f"Precision (Weighted): {results.get('eval_precision_weighted'):.4f}")
|
| 279 |
+
print(f"Recall (Macro): {results.get('eval_recall_macro'):.4f}")
|
| 280 |
+
print(f"Recall (Weighted): {results.get('eval_recall_weighted'):.4f}")
|
| 281 |
+
print(f"F1-Score (Macro): {results.get('eval_f1_macro'):.4f}")
|
| 282 |
+
print(f"F1-Score (Weighted): {results.get('eval_f1_weighted'):.4f}")
|
| 283 |
+
|
| 284 |
+
# Generate detailed classification report
|
| 285 |
+
print("\n========== Detailed Classification Report ==========")
|
| 286 |
+
predictions, labels, _ = trainer.predict(test_dataset)
|
| 287 |
+
preds = np.argmax(predictions, axis=1)
|
| 288 |
+
report = classification_report(labels, preds, target_names=unique_labels, digits=4)
|
| 289 |
+
print(report)
|
| 290 |
+
|
| 291 |
+
# Compute the confusion matrix
|
| 292 |
+
conf_matrix = confusion_matrix(labels, preds)
|
| 293 |
+
print("\nConfusion Matrix:")
|
| 294 |
+
print(conf_matrix)
|
| 295 |
+
|
| 296 |
+
# Plot the confusion matrix
|
| 297 |
+
plt.figure(figsize=(10, 8))
|
| 298 |
+
sns.heatmap(conf_matrix, annot=True, fmt='d', cmap='Blues',
|
| 299 |
+
xticklabels=unique_labels,
|
| 300 |
+
yticklabels=unique_labels)
|
| 301 |
+
plt.ylabel('Actual')
|
| 302 |
+
plt.xlabel('Predicted')
|
| 303 |
+
plt.title('Confusion Matrix')
|
| 304 |
+
plt.tight_layout()
|
| 305 |
+
confusion_matrix_path = os.path.join("phobert_emotion_model", "confusion_matrix.png")
|
| 306 |
+
os.makedirs("phobert_emotion_model", exist_ok=True)
|
| 307 |
+
plt.savefig(confusion_matrix_path)
|
| 308 |
+
plt.close()
|
| 309 |
+
print(f"\nConfusion Matrix plot saved to '{confusion_matrix_path}'")
|
| 310 |
+
|
| 311 |
+
# Save the classification report to a file
|
| 312 |
+
report_path = os.path.join("phobert_emotion_model", "classification_report.txt")
|
| 313 |
+
with open(report_path, "w", encoding="utf-8") as f:
|
| 314 |
+
f.write("========== Classification Report ==========\n")
|
| 315 |
+
f.write(report)
|
| 316 |
+
f.write("\n========== Confusion Matrix ==========\n")
|
| 317 |
+
f.write(np.array2string(conf_matrix))
|
| 318 |
+
|
| 319 |
+
print(f"\nClassification Report saved to '{report_path}'")
|
| 320 |
+
|
| 321 |
+
# Save the model and tokenizer
|
| 322 |
+
model_output_dir = "./phobert_emotion_model"
|
| 323 |
+
os.makedirs(model_output_dir, exist_ok=True)
|
| 324 |
+
model.save_pretrained(os.path.join(model_output_dir, "phobert_emotion_model"))
|
| 325 |
+
tokenizer.save_pretrained(os.path.join(model_output_dir, "phobert_emotion_model"))
|
| 326 |
+
with open(os.path.join(model_output_dir, "id2label.json"), "w", encoding="utf-8") as f:
|
| 327 |
+
json.dump(id2label, f, ensure_ascii=False, indent=4)
|
| 328 |
+
|
| 329 |
+
print("\n========== Model and Tokenizer saved ==========")
|
| 330 |
+
|
| 331 |
+
# Predict one example sentence
|
| 332 |
+
device = "cuda" if torch.cuda.is_available() else "cpu"
|
| 333 |
+
model.to(device)
|
| 334 |
+
|
| 335 |
+
def predict_text(text):
|
| 336 |
+
text_proc = preprocess_sentence(text, abbreviations, emoji_mapping)
|
| 337 |
+
enc = tokenizer(text_proc, padding=True, truncation=True, max_length=256, return_tensors="pt")
|
| 338 |
+
enc = {k: v.to(device) for k, v in enc.items()}
|
| 339 |
+
with torch.no_grad():
|
| 340 |
+
out = model(**enc)
|
| 341 |
+
pred_id = out.logits.argmax(dim=-1).item()
|
| 342 |
+
return id2label[pred_id]
|
| 343 |
+
|
| 344 |
+
custom_text = "Tôi rất vui khi sử dụng dịch vụ này!"
|
| 345 |
+
emotion_pred = predict_text(custom_text)
|
| 346 |
+
print("\nCâu ví dụ:", custom_text)
|
| 347 |
+
print("Dự đoán cảm xúc:", emotion_pred)
|
| 348 |
+
|
| 349 |
+
print("\nHoàn thành demo PhoBERT với cân bằng dữ liệu & nhiều epoch hơn!")
|
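A minimal reload sketch (not part of main_phobert.py above), assuming the save paths the script writes to; note it skips the training-time preprocess_sentence step, so real inputs should be preprocessed the same way before tokenization.

import json
import torch
from transformers import AutoTokenizer, AutoModelForSequenceClassification

model_dir = "phobert_emotion_model/phobert_emotion_model"   # written by save_pretrained above
tokenizer = AutoTokenizer.from_pretrained(model_dir)
model = AutoModelForSequenceClassification.from_pretrained(model_dir).eval()

with open("phobert_emotion_model/id2label.json", encoding="utf-8") as f:
    id2label = {int(k): v for k, v in json.load(f).items()}  # JSON keys come back as strings

def predict(text: str) -> str:
    enc = tokenizer(text, truncation=True, max_length=256, return_tensors="pt")
    with torch.no_grad():
        logits = model(**enc).logits
    return id2label[logits.argmax(dim=-1).item()]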
main_svm.py
ADDED
|
@@ -0,0 +1,261 @@
|
| 1 |
+
# svm_emotion_classifier.py
|
| 2 |
+
# -*- coding: utf-8 -*-
|
| 3 |
+
|
| 4 |
+
import re
|
| 5 |
+
import emoji
|
| 6 |
+
import json
|
| 7 |
+
import pandas as pd
|
| 8 |
+
import numpy as np
|
| 9 |
+
import torch  # not strictly needed for SVM, kept in case it is useful later
|
| 10 |
+
from underthesea import word_tokenize
|
| 11 |
+
from sklearn.feature_extraction.text import TfidfVectorizer
|
| 12 |
+
from sklearn.svm import SVC
|
| 13 |
+
from sklearn.model_selection import train_test_split
|
| 14 |
+
from sklearn.metrics import (
|
| 15 |
+
accuracy_score,
|
| 16 |
+
classification_report,
|
| 17 |
+
precision_score,
|
| 18 |
+
recall_score,
|
| 19 |
+
f1_score,
|
| 20 |
+
confusion_matrix
|
| 21 |
+
)
|
| 22 |
+
from sklearn.utils import resample
|
| 23 |
+
import joblib
|
| 24 |
+
import os
|
| 25 |
+
|
| 26 |
+
########################
|
| 27 |
+
# PREPROCESSING
|
| 28 |
+
########################
|
| 29 |
+
|
| 30 |
+
def replace_emojis(sentence, emoji_mapping):
|
| 31 |
+
processed_sentence = []
|
| 32 |
+
for char in sentence:
|
| 33 |
+
if char in emoji_mapping:
|
| 34 |
+
processed_sentence.append(emoji_mapping[char])
|
| 35 |
+
elif not emoji.is_emoji(char):
|
| 36 |
+
processed_sentence.append(char)
|
| 37 |
+
return ''.join(processed_sentence)
|
| 38 |
+
|
| 39 |
+
def remove_profanity(sentence):
|
| 40 |
+
profane_words = ["loz", "vloz", "vl", "dm", "đm", "clgt", "dmm", "cc", "vc", "đù mé", "vãi"]
|
| 41 |
+
words = sentence.split()
|
| 42 |
+
filtered = [w for w in words if w.lower() not in profane_words]
|
| 43 |
+
return ' '.join(filtered)
|
| 44 |
+
|
| 45 |
+
def remove_special_characters(sentence):
|
| 46 |
+
return re.sub(r"[\^\*@#&$%<>~{}|\\]", "", sentence)
|
| 47 |
+
|
| 48 |
+
def normalize_whitespace(sentence):
|
| 49 |
+
return ' '.join(sentence.split())
|
| 50 |
+
|
| 51 |
+
def remove_repeated_characters(sentence):
|
| 52 |
+
return re.sub(r"(.)\1{2,}", r"\1", sentence)
|
| 53 |
+
|
| 54 |
+
def replace_numbers(sentence):
|
| 55 |
+
return re.sub(r"\d+", "[number]", sentence)
|
| 56 |
+
|
| 57 |
+
def tokenize_underthesea(sentence):
|
| 58 |
+
tokens = word_tokenize(sentence)
|
| 59 |
+
return " ".join(tokens)
|
| 60 |
+
|
| 61 |
+
def preprocess_sentence(sentence, abbreviations, emoji_mapping):
|
| 62 |
+
sentence = sentence.lower()
|
| 63 |
+
sentence = replace_emojis(sentence, emoji_mapping)
|
| 64 |
+
sentence = remove_profanity(sentence)
|
| 65 |
+
sentence = remove_special_characters(sentence)
|
| 66 |
+
sentence = normalize_whitespace(sentence)
|
| 67 |
+
# Replace abbreviations
|
| 68 |
+
words = sentence.split()
|
| 69 |
+
replaced = []
|
| 70 |
+
for w in words:
|
| 71 |
+
if w in abbreviations:
|
| 72 |
+
replaced.append(" ".join(abbreviations[w]))
|
| 73 |
+
else:
|
| 74 |
+
replaced.append(w)
|
| 75 |
+
sentence = " ".join(replaced)
|
| 76 |
+
sentence = remove_repeated_characters(sentence)
|
| 77 |
+
sentence = replace_numbers(sentence)
|
| 78 |
+
# Vietnamese word segmentation
|
| 79 |
+
sentence = tokenize_underthesea(sentence)
|
| 80 |
+
return sentence
|
| 81 |
+
|
| 82 |
+
emoji_mapping = {
|
| 83 |
+
"😀": "[joy]", "😃": "[joy]", "😄": "[joy]", "😁": "[joy]", "😆": "[joy]", "😅": "[joy]", "😂": "[joy]", "🤣": "[joy]",
|
| 84 |
+
"🙂": "[love]", "🙃": "[love]", "😉": "[love]", "😊": "[love]", "😇": "[love]", "🥰": "[love]", "😍": "[love]",
|
| 85 |
+
"🤩": "[love]", "😘": "[love]", "😗": "[love]", "☺": "[love]", "😚": "[love]", "😙": "[love]",
|
| 86 |
+
"😋": "[satisfaction]", "😛": "[satisfaction]", "😜": "[satisfaction]", "🤪": "[satisfaction]", "😝": "[satisfaction]",
|
| 87 |
+
"🤑": "[satisfaction]",
|
| 88 |
+
"🤐": "[neutral]", "🤨": "[neutral]", "😐": "[neutral]", "😑": "[neutral]", "😶": "[neutral]",
|
| 89 |
+
"😏": "[sarcasm]",
|
| 90 |
+
"😒": "[disappointment]", "🙄": "[disappointment]", "😬": "[disappointment]",
|
| 91 |
+
"😔": "[sadness]", "😪": "[sadness]", "😢": "[sadness]", "😭": "[sadness]", "😥": "[sadness]", "😓": "[sadness]",
|
| 92 |
+
"😩": "[tiredness]", "😫": "[tiredness]", "🥱": "[tiredness]",
|
| 93 |
+
"🤤": "[discomfort]", "🤢": "[discomfort]", "🤮": "[discomfort]", "🤧": "[discomfort]", "🥵": "[discomfort]",
|
| 94 |
+
"🥶": "[discomfort]", "🥴": "[discomfort]", "😵": "[discomfort]", "🤯": "[discomfort]",
|
| 95 |
+
"😕": "[confused]", "😟": "[confused]", "🙁": "[confused]", "☹": "[confused]",
|
| 96 |
+
"😮": "[surprise]", "😯": "[surprise]", "😲": "[surprise]", "😳": "[surprise]", "🥺": "[pleading]",
|
| 97 |
+
"😦": "[fear]", "😧": "[fear]", "😨": "[fear]", "😰": "[fear]", "😱": "[fear]",
|
| 98 |
+
"😖": "[confusion]", "😣": "[confusion]", "😞": "[confusion]",
|
| 99 |
+
"😤": "[anger]", "😡": "[anger]", "😠": "[anger]", "🤬": "[anger]", "😈": "[mischievous]", "👿": "[mischievous]"
|
| 100 |
+
}
|
| 101 |
+
|
| 102 |
+
def load_abbreviations(path):
|
| 103 |
+
with open(path, "r", encoding="utf-8") as f:
|
| 104 |
+
return json.load(f)
|
| 105 |
+
|
| 106 |
+
###################################
|
| 107 |
+
# MAIN
|
| 108 |
+
###################################
|
| 109 |
+
if __name__ == "__main__":
|
| 110 |
+
file_path = "train.xlsx"
|
| 111 |
+
abbreviations_path = "abbreviations.json"
|
| 112 |
+
output_path = "processed_svm.xlsx" # Changed output filename to reflect SVM
|
| 113 |
+
|
| 114 |
+
abbreviations = load_abbreviations(abbreviations_path)
|
| 115 |
+
|
| 116 |
+
df = pd.read_excel(file_path)
|
| 117 |
+
if "Sentence" not in df.columns or "Emotion" not in df.columns:
|
| 118 |
+
raise ValueError("Dataset phải chứa cột 'Sentence' và 'Emotion'!")
|
| 119 |
+
|
| 120 |
+
# Preprocessing
|
| 121 |
+
df["processed_sentence"] = df["Sentence"].apply(
|
| 122 |
+
lambda x: preprocess_sentence(str(x), abbreviations, emoji_mapping)
|
| 123 |
+
)
|
| 124 |
+
# Drop empty rows
|
| 125 |
+
df = df[df["processed_sentence"].str.strip().astype(bool)]
|
| 126 |
+
|
| 127 |
+
print("Trước khi cân bằng:")
|
| 128 |
+
print(df["Emotion"].value_counts())
|
| 129 |
+
|
| 130 |
+
# =========== BALANCE ALL CLASSES =============
|
| 131 |
+
# Get the largest class count
|
| 132 |
+
max_count = df["Emotion"].value_counts().max()
|
| 133 |
+
|
| 134 |
+
df_balanced_list = []
|
| 135 |
+
for emo in df["Emotion"].unique():
|
| 136 |
+
df_emo = df[df["Emotion"] == emo]
|
| 137 |
+
if len(df_emo) < max_count:
|
| 138 |
+
# Oversample up to max_count
|
| 139 |
+
df_emo_oversampled = resample(
|
| 140 |
+
df_emo,
|
| 141 |
+
replace=True,
|
| 142 |
+
n_samples=max_count,
|
| 143 |
+
random_state=42
|
| 144 |
+
)
|
| 145 |
+
df_balanced_list.append(df_emo_oversampled)
|
| 146 |
+
else:
|
| 147 |
+
# This class already has max_count samples, keep it as is
|
| 148 |
+
df_balanced_list.append(df_emo)
|
| 149 |
+
|
| 150 |
+
df = pd.concat(df_balanced_list, axis=0)
|
| 151 |
+
df = df.sample(frac=1, random_state=42).reset_index(drop=True)
|
| 152 |
+
|
| 153 |
+
print("\nSau khi cân bằng tất cả lớp:")
|
| 154 |
+
print(df["Emotion"].value_counts())
|
| 155 |
+
|
| 156 |
+
df.to_excel(output_path, index=False)
|
| 157 |
+
|
| 158 |
+
# Build label2id and id2label in the fixed order given below
|
| 159 |
+
custom_id2label = {
|
| 160 |
+
0: 'Anger',
|
| 161 |
+
1: 'Disgust',
|
| 162 |
+
2: 'Enjoyment',
|
| 163 |
+
3: 'Fear',
|
| 164 |
+
4: 'Other',
|
| 165 |
+
5: 'Sadness',
|
| 166 |
+
6: 'Surprise'
|
| 167 |
+
}
|
| 168 |
+
label2id = {label: idx for idx, label in custom_id2label.items()}
|
| 169 |
+
id2label = {v: k for k, v in label2id.items()}
|
| 170 |
+
|
| 171 |
+
df["label_id"] = df["Emotion"].map(label2id)
|
| 172 |
+
if df["label_id"].isnull().any():
|
| 173 |
+
missing = df[df["label_id"].isnull()]["Emotion"].unique()
|
| 174 |
+
raise ValueError(f"Những nhãn cảm xúc sau không có trong label2id: {missing}")
|
| 175 |
+
|
| 176 |
+
# Train/test split
|
| 177 |
+
train_df, test_df = train_test_split(df, test_size=0.2, random_state=42, stratify=df["label_id"])
|
| 178 |
+
|
| 179 |
+
print(f"Train size = {len(train_df)}, Test size = {len(test_df)}")
|
| 180 |
+
|
| 181 |
+
# Feature extraction with TF-IDF
|
| 182 |
+
vectorizer = TfidfVectorizer(max_features=5000)
|
| 183 |
+
X_train = vectorizer.fit_transform(train_df["processed_sentence"])
|
| 184 |
+
X_test = vectorizer.transform(test_df["processed_sentence"])
|
| 185 |
+
y_train = train_df["label_id"].values
|
| 186 |
+
y_test = test_df["label_id"].values
|
| 187 |
+
|
| 188 |
+
# Train the SVM model
|
| 189 |
+
svm_classifier = SVC(kernel='linear', probability=True, random_state=42)
|
| 190 |
+
print("\n========== Training SVM ==========")
|
| 191 |
+
svm_classifier.fit(X_train, y_train)
|
| 192 |
+
|
| 193 |
+
# Evaluate the model
|
| 194 |
+
print("\n========== Evaluate on Test set ==========")
|
| 195 |
+
y_pred = svm_classifier.predict(X_test)
|
| 196 |
+
|
| 197 |
+
# Compute metrics
|
| 198 |
+
accuracy = accuracy_score(y_test, y_pred)
|
| 199 |
+
precision_macro = precision_score(y_test, y_pred, average='macro', zero_division=0)
|
| 200 |
+
precision_weighted = precision_score(y_test, y_pred, average='weighted', zero_division=0)
|
| 201 |
+
recall_macro = recall_score(y_test, y_pred, average='macro', zero_division=0)
|
| 202 |
+
recall_weighted = recall_score(y_test, y_pred, average='weighted', zero_division=0)
|
| 203 |
+
f1_macro = f1_score(y_test, y_pred, average='macro', zero_division=0)
|
| 204 |
+
f1_weighted = f1_score(y_test, y_pred, average='weighted', zero_division=0)
|
| 205 |
+
conf_matrix = confusion_matrix(y_test, y_pred)
|
| 206 |
+
|
| 207 |
+
# Print metrics
|
| 208 |
+
print(f"Test Accuracy: {accuracy:.4f}")
|
| 209 |
+
print(f"Precision (Macro): {precision_macro:.4f}")
|
| 210 |
+
print(f"Precision (Weighted): {precision_weighted:.4f}")
|
| 211 |
+
print(f"Recall (Macro): {recall_macro:.4f}")
|
| 212 |
+
print(f"Recall (Weighted): {recall_weighted:.4f}")
|
| 213 |
+
print(f"F1-Score (Macro): {f1_macro:.4f}")
|
| 214 |
+
print(f"F1-Score (Weighted): {f1_weighted:.4f}")
|
| 215 |
+
|
| 216 |
+
print("\n========== Classification Report ==========")
|
| 217 |
+
report = classification_report(y_test, y_pred, target_names=custom_id2label.values(), digits=4)
|
| 218 |
+
print(report)
|
| 219 |
+
|
| 220 |
+
# Save the report to a file
|
| 221 |
+
report_path = os.path.join("svm_emotion_model", "classification_report.txt")
|
| 222 |
+
os.makedirs(os.path.dirname(report_path), exist_ok=True)
|
| 223 |
+
with open(report_path, "w", encoding="utf-8") as f:
|
| 224 |
+
f.write("========== Classification Report ==========\n")
|
| 225 |
+
f.write(report)
|
| 226 |
+
f.write("\n========== Additional Metrics ==========\n")
|
| 227 |
+
f.write(f"Accuracy: {accuracy:.4f}\n")
|
| 228 |
+
f.write(f"Precision (Macro): {precision_macro:.4f}\n")
|
| 229 |
+
f.write(f"Precision (Weighted): {precision_weighted:.4f}\n")
|
| 230 |
+
f.write(f"Recall (Macro): {recall_macro:.4f}\n")
|
| 231 |
+
f.write(f"Recall (Weighted): {recall_weighted:.4f}\n")
|
| 232 |
+
f.write(f"F1-Score (Macro): {f1_macro:.4f}\n")
|
| 233 |
+
f.write(f"F1-Score (Weighted): {f1_weighted:.4f}\n")
|
| 234 |
+
f.write("\n========== Confusion Matrix ==========\n")
|
| 235 |
+
f.write(np.array2string(conf_matrix))
|
| 236 |
+
|
| 237 |
+
print("\n========== Classification Report saved to 'svm_emotion_model/classification_report.txt' ==========")
|
| 238 |
+
|
| 239 |
+
# Save the model and the required components
|
| 240 |
+
model_output_dir = "./svm_emotion_model"
|
| 241 |
+
os.makedirs(model_output_dir, exist_ok=True)
|
| 242 |
+
joblib.dump(svm_classifier, os.path.join(model_output_dir, "svm_classifier.joblib"))
|
| 243 |
+
joblib.dump(vectorizer, os.path.join(model_output_dir, "tfidf_vectorizer.joblib"))
|
| 244 |
+
joblib.dump(id2label, os.path.join(model_output_dir, "id2label.json"))  # note: a joblib pickle despite the .json extension
|
| 245 |
+
|
| 246 |
+
print("\n========== Model and Vectorizer saved ==========")
|
| 247 |
+
|
| 248 |
+
# Predict one example sentence
|
| 249 |
+
def predict_text(text):
|
| 250 |
+
text_proc = preprocess_sentence(text, abbreviations, emoji_mapping)
|
| 251 |
+
X = vectorizer.transform([text_proc])
|
| 252 |
+
pred_id = svm_classifier.predict(X)[0]
|
| 253 |
+
label = custom_id2label[pred_id]
|
| 254 |
+
return label
|
| 255 |
+
|
| 256 |
+
custom_text = "Tôi rất vui khi sử dụng dịch vụ này!"
|
| 257 |
+
emotion_pred = predict_text(custom_text)
|
| 258 |
+
print("\nCâu ví dụ:", custom_text)
|
| 259 |
+
print("Dự đoán cảm xúc:", emotion_pred)
|
| 260 |
+
|
| 261 |
+
print("\nHoàn thành demo SVM với cân bằng dữ liệu & nhiều chỉ số đánh giá!")
|
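A minimal reload sketch for the SVM artifacts saved above (not part of main_svm.py); since id2label was written with joblib despite the .json filename, it is read back with joblib.load here.

import joblib

model_dir = "svm_emotion_model"
svm_classifier = joblib.load(f"{model_dir}/svm_classifier.joblib")
vectorizer = joblib.load(f"{model_dir}/tfidf_vectorizer.joblib")
id2label = joblib.load(f"{model_dir}/id2label.json")  # a joblib pickle, not real JSON

def predict(processed_text: str) -> str:
    # processed_text should have gone through the same preprocess_sentence pipeline as training
    X = vectorizer.transform([processed_text])
    return id2label[int(svm_classifier.predict(X)[0])]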
main_v1.py
ADDED
|
@@ -0,0 +1,494 @@
|
| 1 |
+
# thesis.py
|
| 2 |
+
# -*- coding: utf-8 -*-
|
| 3 |
+
|
| 4 |
+
import pandas as pd
|
| 5 |
+
import emoji
|
| 6 |
+
import json
|
| 7 |
+
import re
|
| 8 |
+
from underthesea import word_tokenize
|
| 9 |
+
from tqdm import tqdm
|
| 10 |
+
import torch
|
| 11 |
+
from torchtext.vocab import Vectors
|
| 12 |
+
from sklearn.model_selection import train_test_split
|
| 13 |
+
from tensorflow.keras.preprocessing.sequence import pad_sequences
|
| 14 |
+
from torch.utils.data import DataLoader, TensorDataset
|
| 15 |
+
import torch.nn as nn
|
| 16 |
+
import torch.optim as optim
|
| 17 |
+
import numpy as np
|
| 18 |
+
import tensorflow as tf
|
| 19 |
+
|
| 20 |
+
# ========== PREPROCESSING FUNCTIONS ==========
|
| 21 |
+
|
| 22 |
+
def preprocess_sentence(sentence, abbreviations, emoji_mapping):
|
| 23 |
+
"""
|
| 24 |
+
Preprocess one sentence: lowercase, replace emojis, remove profanity,
|
| 25 |
+
special characters, normalize whitespace, etc.
|
| 26 |
+
"""
|
| 27 |
+
sentence = sentence.lower()
|
| 28 |
+
sentence = replace_emojis(sentence, emoji_mapping)
|
| 29 |
+
sentence = remove_profanity(sentence)
|
| 30 |
+
sentence = remove_special_characters(sentence)
|
| 31 |
+
sentence = normalize_whitespace(sentence)
|
| 32 |
+
sentence = replace_abbreviations(sentence, abbreviations)
|
| 33 |
+
sentence = remove_repeated_characters(sentence)
|
| 34 |
+
sentence = replace_numbers(sentence)
|
| 35 |
+
sentence = tokenize_sentence(sentence)
|
| 36 |
+
return sentence
|
| 37 |
+
|
| 38 |
+
def replace_emojis(sentence, emoji_mapping):
|
| 39 |
+
processed_sentence = []
|
| 40 |
+
for char in sentence:
|
| 41 |
+
if char in emoji_mapping:
|
| 42 |
+
processed_sentence.append(emoji_mapping[char])
|
| 43 |
+
elif not emoji.is_emoji(char):
|
| 44 |
+
processed_sentence.append(char)
|
| 45 |
+
return ''.join(processed_sentence)
|
| 46 |
+
|
| 47 |
+
def remove_profanity(sentence):
|
| 48 |
+
profane_words = ["loz", "vloz", "vl", "dm", "đm", "clgt", "dmm", "cc", "vc", "đù mé", "vãi"]
|
| 49 |
+
words = sentence.split()
|
| 50 |
+
filtered_words = [word for word in words if word.lower() not in profane_words]
|
| 51 |
+
return ' '.join(filtered_words)
|
| 52 |
+
|
| 53 |
+
def remove_special_characters(sentence):
|
| 54 |
+
return re.sub(r"[\^\*@#&$%<>~{}|\\]", "", sentence)
|
| 55 |
+
|
| 56 |
+
def normalize_whitespace(sentence):
|
| 57 |
+
return ' '.join(sentence.split())
|
| 58 |
+
|
| 59 |
+
def replace_abbreviations(sentence, abbreviations):
|
| 60 |
+
words = sentence.split()
|
| 61 |
+
replaced_words = [
|
| 62 |
+
" ".join(abbreviations[word]) if word in abbreviations else word
|
| 63 |
+
for word in words
|
| 64 |
+
]
|
| 65 |
+
return ' '.join(replaced_words)
|
| 66 |
+
|
| 67 |
+
def remove_repeated_characters(sentence):
|
| 68 |
+
return re.sub(r"(.)\1{2,}", r"\1", sentence)
|
| 69 |
+
|
| 70 |
+
def replace_numbers(sentence):
|
| 71 |
+
return re.sub(r"\d+", "[number]", sentence)
|
| 72 |
+
|
| 73 |
+
def tokenize_sentence(sentence):
|
| 74 |
+
return ' '.join(word_tokenize(sentence))
|
| 75 |
+
|
| 76 |
+
|
| 77 |
+
# ========== DATA MANAGER CLASS ==========
|
| 78 |
+
|
| 79 |
+
class DataManager:
|
| 80 |
+
def __init__(self, file_path, abbreviations_path, word2vec_path):
|
| 81 |
+
self.file_path = file_path
|
| 82 |
+
self.abbreviations_path = abbreviations_path
|
| 83 |
+
self.word2vec_path = word2vec_path
|
| 84 |
+
self.load_abbreviations()
|
| 85 |
+
self.load_word2vec()
|
| 86 |
+
|
| 87 |
+
def load_abbreviations(self):
|
| 88 |
+
with open(self.abbreviations_path, "r", encoding="utf-8") as file:
|
| 89 |
+
self.abbreviations = json.load(file)
|
| 90 |
+
|
| 91 |
+
def load_word2vec(self):
|
| 92 |
+
# Load vectors from the word2vec file; unk_init gives out-of-vocabulary words random normal vectors
|
| 93 |
+
self.word_embeddings = Vectors(name=self.word2vec_path, unk_init=torch.Tensor.normal_)
|
| 94 |
+
self.vocabulary = self.create_vocab_from_word2vec()
|
| 95 |
+
|
| 96 |
+
def create_vocab_from_word2vec(self):
|
| 97 |
+
vocab = Vocabulary()
|
| 98 |
+
words_list = list(self.word_embeddings.stoi.keys())
|
| 99 |
+
for word in words_list:
|
| 100 |
+
vocab.add(word)
|
| 101 |
+
return vocab
|
| 102 |
+
|
| 103 |
+
def preprocess_data(self):
|
| 104 |
+
df = pd.read_excel(self.file_path)
|
| 105 |
+
if "Sentence" not in df.columns:
|
| 106 |
+
raise ValueError("Cột 'Sentence' không tồn tại trong dataset!")
|
| 107 |
+
|
| 108 |
+
# Preprocess each sentence
|
| 109 |
+
df["processed_sentence"] = df["Sentence"].apply(
|
| 110 |
+
lambda x: preprocess_sentence(str(x), self.abbreviations, emoji_mapping)
|
| 111 |
+
)
|
| 112 |
+
|
| 113 |
+
# Drop rows that are empty after preprocessing
|
| 114 |
+
df = df[df["processed_sentence"].str.strip().astype(bool)]
|
| 115 |
+
return df
|
| 116 |
+
|
| 117 |
+
def split_and_convert(
|
| 118 |
+
self, df, label_column="Emotion", maxlen=400, test_size=0.2,
|
| 119 |
+
for_keras=False, batch_size=32
|
| 120 |
+
):
|
| 121 |
+
"""
|
| 122 |
+
Split the data into train/test. Returns:
|
| 123 |
+
- If for_keras=False: train_loader, test_loader, label_mapping (PyTorch)
|
| 124 |
+
- If for_keras=True: X_train, X_test, y_train_onehot, y_test_onehot, label_mapping (Keras)
|
| 125 |
+
"""
|
| 126 |
+
|
| 127 |
+
if label_column not in df.columns:
|
| 128 |
+
raise ValueError(
|
| 129 |
+
f"Cột '{label_column}' không tồn tại trong DataFrame. "
|
| 130 |
+
f"Các cột hiện có: {df.columns.tolist()}"
|
| 131 |
+
)
|
| 132 |
+
|
| 133 |
+
# Map labels to integer ids
|
| 134 |
+
label_mapping = {label: idx for idx, label in enumerate(df[label_column].unique())}
|
| 135 |
+
df[label_column] = df[label_column].map(label_mapping)
|
| 136 |
+
|
| 137 |
+
X = df["processed_sentence"].tolist()
|
| 138 |
+
y = df[label_column].tolist()
|
| 139 |
+
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=test_size, random_state=42)
|
| 140 |
+
|
| 141 |
+
# Convert text to index tensors
|
| 142 |
+
X_train_tensors = self.vocabulary.corpus_to_tensor(X_train, is_tokenized=False)
|
| 143 |
+
X_test_tensors = self.vocabulary.corpus_to_tensor(X_test, is_tokenized=False)
|
| 144 |
+
|
| 145 |
+
# Pad sequences
|
| 146 |
+
X_train_padded = pad_sequences(X_train_tensors, maxlen=maxlen)
|
| 147 |
+
X_test_padded = pad_sequences(X_test_tensors, maxlen=maxlen)
|
| 148 |
+
|
| 149 |
+
# Debug info
|
| 150 |
+
print(">>> Debug Split and Convert:")
|
| 151 |
+
print("X_train_padded.shape:", X_train_padded.shape)
|
| 152 |
+
print("X_test_padded.shape: ", X_test_padded.shape)
|
| 153 |
+
print("y_train length:", len(y_train))
|
| 154 |
+
print("y_test length: ", len(y_test))
|
| 155 |
+
|
| 156 |
+
# Check min/max token indices
|
| 157 |
+
max_token_train = np.max(X_train_padded) if X_train_padded.size > 0 else None
|
| 158 |
+
min_token_train = np.min(X_train_padded) if X_train_padded.size > 0 else None
|
| 159 |
+
max_token_test = np.max(X_test_padded) if X_test_padded.size > 0 else None
|
| 160 |
+
min_token_test = np.min(X_test_padded) if X_test_padded.size > 0 else None
|
| 161 |
+
|
| 162 |
+
vocab_size = len(self.vocabulary)
|
| 163 |
+
print(f"vocab_size: {vocab_size}")
|
| 164 |
+
print(f"max_token_train: {max_token_train}, min_token_train: {min_token_train}")
|
| 165 |
+
print(f"max_token_test: {max_token_test}, min_token_test: {min_token_test}")
|
| 166 |
+
|
| 167 |
+
if for_keras:
|
| 168 |
+
num_classes = len(label_mapping)
|
| 169 |
+
# One-hot encode the labels
|
| 170 |
+
y_train_onehot = torch.nn.functional.one_hot(torch.tensor(y_train), num_classes=num_classes).numpy()
|
| 171 |
+
y_test_onehot = torch.nn.functional.one_hot(torch.tensor(y_test), num_classes=num_classes).numpy()
|
| 172 |
+
|
| 173 |
+
# Debug
|
| 174 |
+
print("y_train_onehot.shape:", y_train_onehot.shape)
|
| 175 |
+
print("y_test_onehot.shape: ", y_test_onehot.shape)
|
| 176 |
+
|
| 177 |
+
return X_train_padded, X_test_padded, y_train_onehot, y_test_onehot, label_mapping
|
| 178 |
+
else:
|
| 179 |
+
# Return PyTorch DataLoaders
|
| 180 |
+
X_train_tensor = torch.tensor(X_train_padded, dtype=torch.long)
|
| 181 |
+
X_test_tensor = torch.tensor(X_test_padded, dtype=torch.long)
|
| 182 |
+
y_train_tensor = torch.tensor(y_train, dtype=torch.long)
|
| 183 |
+
y_test_tensor = torch.tensor(y_test, dtype=torch.long)
|
| 184 |
+
|
| 185 |
+
train_dataset = TensorDataset(X_train_tensor, y_train_tensor)
|
| 186 |
+
test_dataset = TensorDataset(X_test_tensor, y_test_tensor)
|
| 187 |
+
|
| 188 |
+
train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
|
| 189 |
+
test_loader = DataLoader(test_dataset, batch_size=batch_size, shuffle=False)
|
| 190 |
+
return train_loader, test_loader, label_mapping
|
| 191 |
+
|
| 192 |
+
|
| 193 |
+
# ========== VOCABULARY CLASS ==========
|
| 194 |
+
|
| 195 |
+
class Vocabulary:
|
| 196 |
+
def __init__(self):
|
| 197 |
+
self.word2id = {}
|
| 198 |
+
self.word2id['<pad>'] = 0
|
| 199 |
+
self.word2id['<unk>'] = 1
|
| 200 |
+
self.unk_id = self.word2id['<unk>']
|
| 201 |
+
self.id2word = {0: '<pad>', 1: '<unk>'}
|
| 202 |
+
|
| 203 |
+
def __getitem__(self, word):
|
| 204 |
+
return self.word2id.get(word, self.unk_id)
|
| 205 |
+
|
| 206 |
+
def __contains__(self, word):
|
| 207 |
+
return word in self.word2id
|
| 208 |
+
|
| 209 |
+
def __len__(self):
|
| 210 |
+
return len(self.word2id)
|
| 211 |
+
|
| 212 |
+
def lookup_tokens(self, word_indexes: list):
|
| 213 |
+
return [self.id2word[word_index] for word_index in word_indexes]
|
| 214 |
+
|
| 215 |
+
def add(self, word):
|
| 216 |
+
if word not in self:
|
| 217 |
+
word_index = len(self.word2id)
|
| 218 |
+
self.word2id[word] = word_index
|
| 219 |
+
self.id2word[word_index] = word
|
| 220 |
+
return word_index
|
| 221 |
+
else:
|
| 222 |
+
return self[word]
|
| 223 |
+
|
| 224 |
+
@staticmethod
|
| 225 |
+
def tokenize_corpus(corpus):
|
| 226 |
+
tokenized_corpus = []
|
| 227 |
+
for document in tqdm(corpus):
|
| 228 |
+
tokenized_document = [word.replace(" ", "_") for word in word_tokenize(document)]
|
| 229 |
+
tokenized_corpus.append(tokenized_document)
|
| 230 |
+
return tokenized_corpus
|
| 231 |
+
|
| 232 |
+
def corpus_to_tensor(self, corpus, is_tokenized=False):
|
| 233 |
+
tokenized_corpus = self.tokenize_corpus(corpus) if not is_tokenized else corpus
|
| 234 |
+
return [
|
| 235 |
+
[self[word] for word in document]
|
| 236 |
+
for document in tokenized_corpus
|
| 237 |
+
]
|
| 238 |
+
|
| 239 |
+
|
| 240 |
+
# ========== EMOJI => LABEL MAPPING ==========
|
| 241 |
+
|
| 242 |
+
emoji_mapping = {
|
| 243 |
+
"😀": "[joy]", "😃": "[joy]", "😄": "[joy]", "😁": "[joy]", "😆": "[joy]", "😅": "[joy]", "😂": "[joy]", "🤣": "[joy]",
|
| 244 |
+
"🙂": "[love]", "🙃": "[love]", "😉": "[love]", "😊": "[love]", "😇": "[love]", "🥰": "[love]", "😍": "[love]",
|
| 245 |
+
"🤩": "[love]", "😘": "[love]", "😗": "[love]", "☺": "[love]", "😚": "[love]", "😙": "[love]",
|
| 246 |
+
"😋": "[satisfaction]", "😛": "[satisfaction]", "😜": "[satisfaction]", "🤪": "[satisfaction]", "😝": "[satisfaction]",
|
| 247 |
+
"🤑": "[satisfaction]",
|
| 248 |
+
"🤐": "[neutral]", "🤨": "[neutral]", "😐": "[neutral]", "😑": "[neutral]", "😶": "[neutral]",
|
| 249 |
+
"😏": "[sarcasm]",
|
| 250 |
+
"😒": "[disappointment]", "🙄": "[disappointment]", "😬": "[disappointment]",
|
| 251 |
+
"😔": "[sadness]", "😪": "[sadness]", "😢": "[sadness]", "😭": "[sadness]", "😥": "[sadness]", "😓": "[sadness]",
|
| 252 |
+
"😩": "[tiredness]", "😫": "[tiredness]", "🥱": "[tiredness]",
|
| 253 |
+
"🤤": "[discomfort]", "🤢": "[discomfort]", "🤮": "[discomfort]", "🤧": "[discomfort]", "🥵": "[discomfort]",
|
| 254 |
+
"🥶": "[discomfort]", "🥴": "[discomfort]", "😵": "[discomfort]", "🤯": "[discomfort]",
|
| 255 |
+
"😕": "[confused]", "😟": "[confused]", "🙁": "[confused]", "☹": "[confused]",
|
| 256 |
+
"😮": "[surprise]", "😯": "[surprise]", "😲": "[surprise]", "😳": "[surprise]", "🥺": "[pleading]",
|
| 257 |
+
"😦": "[fear]", "😧": "[fear]", "😨": "[fear]", "😰": "[fear]", "😱": "[fear]",
|
| 258 |
+
"😖": "[confusion]", "😣": "[confusion]", "😞": "[confusion]",
|
| 259 |
+
"😤": "[anger]", "😡": "[anger]", "😠": "[anger]", "🤬": "[anger]", "😈": "[mischievous]", "👿": "[mischievous]"
|
| 260 |
+
}
|
| 261 |
+
|
| 262 |
+
|
| 263 |
+
# ========== PYTORCH RNN MODEL DEFINITION ==========
|
| 264 |
+
|
| 265 |
+
class SimpleRNN(nn.Module):
|
| 266 |
+
def __init__(self, vocab_size, embedding_dim, hidden_dim, output_dim):
|
| 267 |
+
super(SimpleRNN, self).__init__()
|
| 268 |
+
self.embedding = nn.Embedding(vocab_size, embedding_dim)
|
| 269 |
+
self.rnn = nn.LSTM(embedding_dim, hidden_dim, batch_first=True)
|
| 270 |
+
self.fc = nn.Linear(hidden_dim, output_dim)
|
| 271 |
+
|
| 272 |
+
def forward(self, x):
|
| 273 |
+
embedded = self.embedding(x)
|
| 274 |
+
_, (hidden, _) = self.rnn(embedded)
|
| 275 |
+
return self.fc(hidden.squeeze(0))
|
| 276 |
+
|
| 277 |
+
|
| 278 |
+
# ========== PREDICTION WITH THE PYTORCH RNN MODEL ==========
|
| 279 |
+
|
| 280 |
+
def predict_emotion_rnn(model, text, data_manager, label_mapping, device):
|
| 281 |
+
model.eval()
|
| 282 |
+
with torch.no_grad():
|
| 283 |
+
processed_text = preprocess_sentence(text, data_manager.abbreviations, emoji_mapping)
|
| 284 |
+
tokenized_text = data_manager.vocabulary.tokenize_corpus([processed_text])
|
| 285 |
+
text_tensor = torch.tensor(
|
| 286 |
+
pad_sequences(data_manager.vocabulary.corpus_to_tensor(tokenized_text, is_tokenized=True), maxlen=400),
|
| 287 |
+
dtype=torch.long
|
| 288 |
+
).to(device)
|
| 289 |
+
|
| 290 |
+
output = model(text_tensor)
|
| 291 |
+
_, predicted = torch.max(output, 1)
|
| 292 |
+
reverse_label_mapping = {v: k for k, v in label_mapping.items()}
|
| 293 |
+
return reverse_label_mapping[predicted.item()]
|
| 294 |
+
|
| 295 |
+
|
| 296 |
+
# ========== PREDICTION WITH THE KERAS CNN-LSTM MODEL ==========
|
| 297 |
+
|
| 298 |
+
def predict_emotion_cnn_lstm(model, text, data_manager, label_mapping):
|
| 299 |
+
processed_text = preprocess_sentence(text, data_manager.abbreviations, emoji_mapping)
|
| 300 |
+
tokenized_text = data_manager.vocabulary.tokenize_corpus([processed_text])
|
| 301 |
+
text_tensor = pad_sequences(data_manager.vocabulary.corpus_to_tensor(tokenized_text, is_tokenized=True), maxlen=400)
|
| 302 |
+
output = model.predict(text_tensor)
|
| 303 |
+
predicted = output.argmax(axis=1)[0]
|
| 304 |
+
reverse_label_mapping = {v: k for k, v in label_mapping.items()}
|
| 305 |
+
return reverse_label_mapping[predicted]
|
| 306 |
+
|
| 307 |
+
|
| 308 |
+
# ========== MAIN (DEMO RUN) ==========
|
| 309 |
+
|
| 310 |
+
if __name__ == "__main__":
|
| 311 |
+
# --------------------------
|
| 312 |
+
# Set your paths here:
|
| 313 |
+
# --------------------------
|
| 314 |
+
file_path = "train.xlsx" # file Excel gốc (chứa cột "Sentence", "Emotion", ...)
|
| 315 |
+
abbreviations_path = "abbreviations.json"
|
| 316 |
+
word2vec_path = "/home/datpham/datpham/thesis-ngtram/word2vec_vi_syllables_100dims.txt"
|
| 317 |
+
output_path = "processed.xlsx"
|
| 318 |
+
|
| 319 |
+
data_manager = DataManager(
|
| 320 |
+
file_path=file_path,
|
| 321 |
+
abbreviations_path=abbreviations_path,
|
| 322 |
+
word2vec_path=word2vec_path
|
| 323 |
+
)
|
| 324 |
+
|
| 325 |
+
# 1) Read and preprocess
|
| 326 |
+
df = data_manager.preprocess_data()
|
| 327 |
+
print("Trước khi undersampling:")
|
| 328 |
+
print(df["Emotion"].value_counts())
|
| 329 |
+
|
| 330 |
+
# 2) UNDERSAMPLING (example)
|
| 331 |
+
# Adjust the emotion names to match your dataset
|
| 332 |
+
df_enjoyment = df[df["Emotion"] == "Enjoyment"]
|
| 333 |
+
df_other = df[df["Emotion"] == "Other"]
|
| 334 |
+
df_anger = df[df["Emotion"] == "Anger"]
|
| 335 |
+
df_sadness = df[df["Emotion"] == "Sadness"]
|
| 336 |
+
df_disgust = df[df["Emotion"] == "Disgust"]
|
| 337 |
+
df_fear = df[df["Emotion"] == "Fear"]
|
| 338 |
+
df_surprise = df[df["Emotion"] == "Surprise"]
|
| 339 |
+
|
| 340 |
+
# Example: keep 2000 samples for 'Enjoyment'
|
| 341 |
+
if len(df_enjoyment) > 2000:
|
| 342 |
+
df_enjoyment_undersampled = df_enjoyment.sample(n=2000, random_state=42)
|
| 343 |
+
else:
|
| 344 |
+
df_enjoyment_undersampled = df_enjoyment
|
| 345 |
+
|
| 346 |
+
df_balanced = pd.concat([
|
| 347 |
+
df_enjoyment_undersampled,
|
| 348 |
+
df_other,
|
| 349 |
+
df_anger,
|
| 350 |
+
df_sadness,
|
| 351 |
+
df_disgust,
|
| 352 |
+
df_fear,
|
| 353 |
+
df_surprise
|
| 354 |
+
], axis=0)
|
| 355 |
+
|
| 356 |
+
df_balanced = df_balanced.sample(frac=1, random_state=42).reset_index(drop=True)
|
| 357 |
+
df = df_balanced
|
| 358 |
+
|
| 359 |
+
print("\nSau khi undersampling:")
|
| 360 |
+
print(df["Emotion"].value_counts())
|
| 361 |
+
|
| 362 |
+
df.to_excel(output_path, index=False)
|
| 363 |
+
|
| 364 |
+
# 3) Build PyTorch data loaders
|
| 365 |
+
train_loader, test_loader, label_mapping = data_manager.split_and_convert(
|
| 366 |
+
df, label_column="Emotion", for_keras=False
|
| 367 |
+
)
|
| 368 |
+
|
| 369 |
+
vocab_size = len(data_manager.vocabulary)
|
| 370 |
+
embedding_dim = 100
|
| 371 |
+
hidden_dim = 128
|
| 372 |
+
output_dim = len(label_mapping)
|
| 373 |
+
|
| 374 |
+
model_rnn = SimpleRNN(vocab_size, embedding_dim, hidden_dim, output_dim)
|
| 375 |
+
criterion = nn.CrossEntropyLoss()
|
| 376 |
+
optimizer = optim.Adam(model_rnn.parameters())
|
| 377 |
+
|
| 378 |
+
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
|
| 379 |
+
model_rnn.to(device)
|
| 380 |
+
|
| 381 |
+
num_epochs = 20
|
| 382 |
+
for epoch in range(num_epochs):
|
| 383 |
+
model_rnn.train()
|
| 384 |
+
epoch_loss = 0
|
| 385 |
+
correct = 0
|
| 386 |
+
total = 0
|
| 387 |
+
for X_batch, y_batch in train_loader:
|
| 388 |
+
X_batch, y_batch = X_batch.to(device), y_batch.to(device)
|
| 389 |
+
|
| 390 |
+
optimizer.zero_grad()
|
| 391 |
+
predictions = model_rnn(X_batch)
|
| 392 |
+
loss = criterion(predictions, y_batch)
|
| 393 |
+
loss.backward()
|
| 394 |
+
optimizer.step()
|
| 395 |
+
|
| 396 |
+
epoch_loss += loss.item()
|
| 397 |
+
_, predicted = torch.max(predictions, 1)
|
| 398 |
+
correct += (predicted == y_batch).sum().item()
|
| 399 |
+
total += y_batch.size(0)
|
| 400 |
+
|
| 401 |
+
print(f"Epoch {epoch+1}/{num_epochs}, "
|
| 402 |
+
f"Loss: {epoch_loss/len(train_loader):.4f}, "
|
| 403 |
+
f"Accuracy: {correct/total:.4f}")
|
| 404 |
+
|
| 405 |
+
# Evaluate the RNN on the test set
|
| 406 |
+
model_rnn.eval()
|
| 407 |
+
test_loss = 0
|
| 408 |
+
correct = 0
|
| 409 |
+
total = 0
|
| 410 |
+
with torch.no_grad():
|
| 411 |
+
for X_batch, y_batch in test_loader:
|
| 412 |
+
X_batch, y_batch = X_batch.to(device), y_batch.to(device)
|
| 413 |
+
predictions = model_rnn(X_batch)
|
| 414 |
+
loss = criterion(predictions, y_batch)
|
| 415 |
+
test_loss += loss.item()
|
| 416 |
+
|
| 417 |
+
_, predicted = torch.max(predictions, 1)
|
| 418 |
+
correct += (predicted == y_batch).sum().item()
|
| 419 |
+
total += y_batch.size(0)
|
| 420 |
+
|
| 421 |
+
print(f"Test Loss: {test_loss/len(test_loader):.4f}, "
|
| 422 |
+
f"Test Accuracy: {correct/total:.4f}")
|
| 423 |
+
|
| 424 |
+
|
| 425 |
+
# ========== CNN-LSTM (Keras) ==========
|
| 426 |
+
|
| 427 |
+
from keras.models import Model
|
| 428 |
+
from keras.layers import Input, Embedding, Convolution1D, LSTM, Dense, Dropout, Lambda, concatenate
|
| 429 |
+
from keras.optimizers import Adam
|
| 430 |
+
from keras.callbacks import ModelCheckpoint
|
| 431 |
+
|
| 432 |
+
print("Training CNN-LSTM...")
|
| 433 |
+
|
| 434 |
+
X_train, X_test, y_train, y_test, label_mapping = data_manager.split_and_convert(
|
| 435 |
+
df, label_column="Emotion", for_keras=True
|
| 436 |
+
)
|
| 437 |
+
|
| 438 |
+
maxlen = 400
|
| 439 |
+
|
| 440 |
+
input_layer = Input(shape=(maxlen,), dtype='int32', name='main_input')
|
| 441 |
+
emb_layer = Embedding(len(data_manager.vocabulary), embedding_dim)(input_layer)
|
| 442 |
+
|
| 443 |
+
def max_1d(X):
|
| 444 |
+
return tf.reduce_max(X, axis=1)
|
| 445 |
+
|
| 446 |
+
con3_layer = Convolution1D(150, kernel_size=3, activation='relu')(emb_layer)
|
| 447 |
+
pool_con3_layer = Lambda(max_1d, output_shape=(150,))(con3_layer)
|
| 448 |
+
|
| 449 |
+
con5_layer = Convolution1D(150, kernel_size=5, activation='relu')(emb_layer)
|
| 450 |
+
pool_con5_layer = Lambda(max_1d, output_shape=(150,))(con5_layer)
|
| 451 |
+
|
| 452 |
+
lstm_layer = LSTM(128)(emb_layer)
|
| 453 |
+
|
| 454 |
+
cnn_lstm_layer = concatenate([pool_con3_layer, pool_con5_layer, lstm_layer])
|
| 455 |
+
|
| 456 |
+
dense_layer = Dense(100, activation='relu')(cnn_lstm_layer)
|
| 457 |
+
dropout_layer = Dropout(0.2)(dense_layer)
|
| 458 |
+
output_layer = Dense(len(label_mapping), activation='softmax')(dropout_layer)
|
| 459 |
+
|
| 460 |
+
model_cnn_lstm = Model(inputs=input_layer, outputs=output_layer)
|
| 461 |
+
model_cnn_lstm.compile(loss='categorical_crossentropy', optimizer=Adam(), metrics=['accuracy'])
|
| 462 |
+
|
| 463 |
+
checkpoint = ModelCheckpoint('cnn_lstm_best.keras', save_best_only=True, monitor='val_accuracy', mode='max')
|
| 464 |
+
model_cnn_lstm.fit(
|
| 465 |
+
X_train, y_train,
|
| 466 |
+
validation_data=(X_test, y_test),
|
| 467 |
+
batch_size=32,
|
| 468 |
+
epochs=20,
|
| 469 |
+
callbacks=[checkpoint]
|
| 470 |
+
)
|
| 471 |
+
|
| 472 |
+
model_cnn_lstm.save('cnn_lstm_model.keras')
|
| 473 |
+
|
| 474 |
+
loss, accuracy = model_cnn_lstm.evaluate(X_test, y_test)
|
| 475 |
+
print(f"CNN-LSTM Test Loss: {loss:.4f}, Test Accuracy: {accuracy:.4f}")
|
| 476 |
+
|
| 477 |
+
# Demo: predict a new sentence
|
| 478 |
+
custom_text = "Tôi rất vui khi sử dụng dịch vụ này!"
|
| 479 |
+
|
| 480 |
+
# RNN (PyTorch)
|
| 481 |
+
emotion_rnn = predict_emotion_rnn(model_rnn, custom_text, data_manager, label_mapping, device)
|
| 482 |
+
print(f"Predicted Emotion (RNN): {emotion_rnn}")
|
| 483 |
+
|
| 484 |
+
# CNN-LSTM (Keras)
|
| 485 |
+
cnn_lstm_model = tf.keras.models.load_model('cnn_lstm_model.keras')
|
| 486 |
+
emotion_cnn_lstm = predict_emotion_cnn_lstm(cnn_lstm_model, custom_text, data_manager, label_mapping)
|
| 487 |
+
print(f"Predicted Emotion (CNN-LSTM): {emotion_cnn_lstm}")
|
| 488 |
+
|
| 489 |
+
# Check TF version and GPU
|
| 490 |
+
print("TF version:", tf.__version__)
|
| 491 |
+
print("GPU devices:", tf.config.list_physical_devices("GPU"))
|
| 492 |
+
# Optionally, check CUDA/GPU with a system command:
|
| 493 |
+
# import os
|
| 494 |
+
# os.system("nvidia-smi")
|
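A small side note on the CNN-LSTM above (not part of main_v1.py): the Lambda(max_1d) layers compute a max over the time axis, which is what Keras' built-in GlobalMaxPooling1D does, and the built-in layer also serializes more cleanly when the model is saved and reloaded. A quick self-contained check of the equivalence:

import numpy as np
import tensorflow as tf

x = np.random.rand(2, 400, 150).astype("float32")            # (batch, time, channels)
via_lambda = tf.reduce_max(x, axis=1)                         # what max_1d computes
via_layer = tf.keras.layers.GlobalMaxPooling1D()(x)
print(np.allclose(via_lambda.numpy(), np.asarray(via_layer)))  # True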
phobert_emotion_model/classification_report.txt
ADDED
|
@@ -0,0 +1,23 @@
|
| 1 |
+
========== Classification Report ==========
|
| 2 |
+
precision recall f1-score support
|
| 3 |
+
|
| 4 |
+
Anger 0.9768 0.9788 0.9778 991
|
| 5 |
+
Disgust 0.9457 0.9657 0.9556 991
|
| 6 |
+
Enjoyment 0.9166 0.8204 0.8658 991
|
| 7 |
+
Fear 0.9771 0.9879 0.9825 992
|
| 8 |
+
Other 0.9026 0.9253 0.9138 991
|
| 9 |
+
Sadness 0.9302 0.9677 0.9486 991
|
| 10 |
+
Surprise 0.9448 0.9496 0.9472 992
|
| 11 |
+
|
| 12 |
+
accuracy 0.9422 6939
|
| 13 |
+
macro avg 0.9420 0.9422 0.9416 6939
|
| 14 |
+
weighted avg 0.9420 0.9422 0.9416 6939
|
| 15 |
+
|
| 16 |
+
========== Confusion Matrix ==========
|
| 17 |
+
[[970 9 3 4 2 2 1]
|
| 18 |
+
[ 12 957 2 3 7 5 5]
|
| 19 |
+
[ 5 16 813 9 67 42 39]
|
| 20 |
+
[ 2 2 6 980 1 1 0]
|
| 21 |
+
[ 3 13 33 2 917 13 10]
|
| 22 |
+
[ 1 7 17 3 4 959 0]
|
| 23 |
+
[ 0 8 13 2 18 9 942]]
|
phobert_emotion_model/confusion_matrix.png
ADDED
|
phobert_emotion_model/id2label.json
ADDED
|
@@ -0,0 +1,9 @@
|
| 1 |
+
{
|
| 2 |
+
"0": "Anger",
|
| 3 |
+
"1": "Disgust",
|
| 4 |
+
"2": "Enjoyment",
|
| 5 |
+
"3": "Fear",
|
| 6 |
+
"4": "Other",
|
| 7 |
+
"5": "Sadness",
|
| 8 |
+
"6": "Surprise"
|
| 9 |
+
}
|
phobert_emotion_model/phobert_emotion_model/added_tokens.json
ADDED
|
@@ -0,0 +1,3 @@
|
| 1 |
+
{
|
| 2 |
+
"<mask>": 64000
|
| 3 |
+
}
|
phobert_emotion_model/phobert_emotion_model/bpe.codes
ADDED
|
The diff for this file is too large to render.
See raw diff
|
|
|
phobert_emotion_model/phobert_emotion_model/config.json
ADDED
|
@@ -0,0 +1,48 @@
|
| 1 |
+
{
|
| 2 |
+
"_name_or_path": "vinai/phobert-base",
|
| 3 |
+
"architectures": [
|
| 4 |
+
"RobertaForSequenceClassification"
|
| 5 |
+
],
|
| 6 |
+
"attention_probs_dropout_prob": 0.1,
|
| 7 |
+
"bos_token_id": 0,
|
| 8 |
+
"classifier_dropout": null,
|
| 9 |
+
"eos_token_id": 2,
|
| 10 |
+
"gradient_checkpointing": false,
|
| 11 |
+
"hidden_act": "gelu",
|
| 12 |
+
"hidden_dropout_prob": 0.1,
|
| 13 |
+
"hidden_size": 768,
|
| 14 |
+
"id2label": {
|
| 15 |
+
"0": "LABEL_0",
|
| 16 |
+
"1": "LABEL_1",
|
| 17 |
+
"2": "LABEL_2",
|
| 18 |
+
"3": "LABEL_3",
|
| 19 |
+
"4": "LABEL_4",
|
| 20 |
+
"5": "LABEL_5",
|
| 21 |
+
"6": "LABEL_6"
|
| 22 |
+
},
|
| 23 |
+
"initializer_range": 0.02,
|
| 24 |
+
"intermediate_size": 3072,
|
| 25 |
+
"label2id": {
|
| 26 |
+
"LABEL_0": 0,
|
| 27 |
+
"LABEL_1": 1,
|
| 28 |
+
"LABEL_2": 2,
|
| 29 |
+
"LABEL_3": 3,
|
| 30 |
+
"LABEL_4": 4,
|
| 31 |
+
"LABEL_5": 5,
|
| 32 |
+
"LABEL_6": 6
|
| 33 |
+
},
|
| 34 |
+
"layer_norm_eps": 1e-05,
|
| 35 |
+
"max_position_embeddings": 258,
|
| 36 |
+
"model_type": "roberta",
|
| 37 |
+
"num_attention_heads": 12,
|
| 38 |
+
"num_hidden_layers": 12,
|
| 39 |
+
"pad_token_id": 1,
|
| 40 |
+
"position_embedding_type": "absolute",
|
| 41 |
+
"problem_type": "single_label_classification",
|
| 42 |
+
"tokenizer_class": "PhobertTokenizer",
|
| 43 |
+
"torch_dtype": "float32",
|
| 44 |
+
"transformers_version": "4.40.0",
|
| 45 |
+
"type_vocab_size": 1,
|
| 46 |
+
"use_cache": true,
|
| 47 |
+
"vocab_size": 64001
|
| 48 |
+
}
|
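One observation on the saved config above (a hedged note, not from the original scripts): id2label/label2id still carry the generic LABEL_0..LABEL_6 names because the training script only set config.num_labels; passing the emotion names when building the config would bake them into config.json and make the separate id2label.json unnecessary. A sketch of how that would look:

from transformers import AutoConfig

id2label = {0: "Anger", 1: "Disgust", 2: "Enjoyment", 3: "Fear",
            4: "Other", 5: "Sadness", 6: "Surprise"}
config = AutoConfig.from_pretrained(
    "vinai/phobert-base",
    num_labels=len(id2label),
    id2label=id2label,
    label2id={v: k for k, v in id2label.items()},
)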
phobert_emotion_model/phobert_emotion_model/model.safetensors
ADDED
|
@@ -0,0 +1,3 @@
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:23cc285ab489e07145436eebb67247d71cd67c817155cc65eb5a7e52e78ed4f0
|
| 3 |
+
size 540038764
|
phobert_emotion_model/phobert_emotion_model/special_tokens_map.json
ADDED
|
@@ -0,0 +1,9 @@
|
| 1 |
+
{
|
| 2 |
+
"bos_token": "<s>",
|
| 3 |
+
"cls_token": "<s>",
|
| 4 |
+
"eos_token": "</s>",
|
| 5 |
+
"mask_token": "<mask>",
|
| 6 |
+
"pad_token": "<pad>",
|
| 7 |
+
"sep_token": "</s>",
|
| 8 |
+
"unk_token": "<unk>"
|
| 9 |
+
}
|
phobert_emotion_model/phobert_emotion_model/tokenizer_config.json
ADDED
|
@@ -0,0 +1,54 @@
|
| 1 |
+
{
|
| 2 |
+
"added_tokens_decoder": {
|
| 3 |
+
"0": {
|
| 4 |
+
"content": "<s>",
|
| 5 |
+
"lstrip": false,
|
| 6 |
+
"normalized": false,
|
| 7 |
+
"rstrip": false,
|
| 8 |
+
"single_word": false,
|
| 9 |
+
"special": true
|
| 10 |
+
},
|
| 11 |
+
"1": {
|
| 12 |
+
"content": "<pad>",
|
| 13 |
+
"lstrip": false,
|
| 14 |
+
"normalized": false,
|
| 15 |
+
"rstrip": false,
|
| 16 |
+
"single_word": false,
|
| 17 |
+
"special": true
|
| 18 |
+
},
|
| 19 |
+
"2": {
|
| 20 |
+
"content": "</s>",
|
| 21 |
+
"lstrip": false,
|
| 22 |
+
"normalized": false,
|
| 23 |
+
"rstrip": false,
|
| 24 |
+
"single_word": false,
|
| 25 |
+
"special": true
|
| 26 |
+
},
|
| 27 |
+
"3": {
|
| 28 |
+
"content": "<unk>",
|
| 29 |
+
"lstrip": false,
|
| 30 |
+
"normalized": false,
|
| 31 |
+
"rstrip": false,
|
| 32 |
+
"single_word": false,
|
| 33 |
+
"special": true
|
| 34 |
+
},
|
| 35 |
+
"64000": {
|
| 36 |
+
"content": "<mask>",
|
| 37 |
+
"lstrip": false,
|
| 38 |
+
"normalized": false,
|
| 39 |
+
"rstrip": false,
|
| 40 |
+
"single_word": false,
|
| 41 |
+
"special": true
|
| 42 |
+
}
|
| 43 |
+
},
|
| 44 |
+
"bos_token": "<s>",
|
| 45 |
+
"clean_up_tokenization_spaces": true,
|
| 46 |
+
"cls_token": "<s>",
|
| 47 |
+
"eos_token": "</s>",
|
| 48 |
+
"mask_token": "<mask>",
|
| 49 |
+
"model_max_length": 1000000000000000019884624838656,
|
| 50 |
+
"pad_token": "<pad>",
|
| 51 |
+
"sep_token": "</s>",
|
| 52 |
+
"tokenizer_class": "PhobertTokenizer",
|
| 53 |
+
"unk_token": "<unk>"
|
| 54 |
+
}
|
phobert_emotion_model/phobert_emotion_model/vocab.txt
ADDED
|
The diff for this file is too large to render.
See raw diff
|
|
|
phobert_results/checkpoint-10410/added_tokens.json
ADDED
|
@@ -0,0 +1,3 @@
|
| 1 |
+
{
|
| 2 |
+
"<mask>": 64000
|
| 3 |
+
}
|
phobert_results/checkpoint-10410/bpe.codes
ADDED
|
The diff for this file is too large to render.
See raw diff
|
|
|