Initial commit

Browse files

Files changed (8) hide show

README.md +96 -1
config.json +50 -0
evaluation_results_on_test_split.csv +6 -0
model.safetensors +3 -0
special_tokens_map.json +7 -0
tokenizer.json +0 -0
tokenizer_config.json +55 -0
vocab.txt +0 -0

README.md CHANGED Viewed

@@ -1,3 +1,98 @@
 ---
-license: apache-2.0
 ---

 ---
+license: mit
+datasets:
+- SkyWater21/lv_go_emotions
+language:
+- lv
 ---
+Fine-tuned [multilingual BERT](https://huggingface.co/google-bert/bert-base-multilingual-cased) for the multi-label emotion classification task.
+The model was trained on the [lv_go_emotions](https://huggingface.co/datasets/SkyWater21/lv_go_emotions) dataset, which is a Latvian translation of the [GoEmotions](https://huggingface.co/datasets/go_emotions) dataset. Google Translate was used to generate the machine translation.
+The original 27 emotions were mapped to 6 base emotions as per Dr. Ekman's theory; the neutral label was kept as a separate seventh class.
+Labels predicted by classifier:
+```yaml
+0: anger
+1: disgust
+2: fear
+3: joy
+4: sadness
+5: surprise
+6: neutral
+```
+Label mapping from the 27 GoEmotions emotions to the 6 base emotions as per Dr. Ekman's theory (neutral is kept as is):
+|GoEmotion|Ekman|
+|---|---|
+| admiration | joy|
+| amusement | joy|
+| anger | anger|
+| annoyance | anger|
+| approval | joy|
+| caring | joy|
+| confusion | surprise|
+| curiosity | surprise|
+| desire | joy|
+| disappointment | sadness|
+| disapproval | anger|
+| disgust | disgust|
+| embarrassment | sadness|
+| excitement | joy|
+| fear | fear|
+| gratitude | joy|
+| grief | sadness|
+| joy | joy|
+| love | joy|
+| nervousness | fear|
+| optimism | joy|
+| pride | joy|
+| realization | surprise|
+| relief | joy|
+| remorse | sadness|
+| sadness | sadness|
+| surprise | surprise|
+| neutral | neutral|
+The random number generators (Python `random`, NumPy, and PyTorch, including CUDA when available) were seeded with 42:
+```python
+def set_seed(seed=42):
+    random.seed(seed)
+    np.random.seed(seed)
+    torch.manual_seed(seed)
+    if torch.cuda.is_available():
+        torch.cuda.manual_seed_all(seed)
+```
+Training parameters:
+```yaml
+max_length: null
+batch_size: 64
+shuffle: True
+num_workers: 8
+pin_memory: False
+drop_last: False
+optimizer: adam
+lr: 0.00001
+weight_decay: 0
+problem_type: multi_label_classification
+num_epochs: 4
+```
+Evaluation results on the test split of [lv_go_emotions](https://huggingface.co/datasets/SkyWater21/lv_go_emotions/viewer/simplified_ekman):
+|              |Precision|Recall|F1-Score|AUC-ROC|Support|
+|--------------|---------|------|--------|-------|-------|
+|anger         |     0.58|  0.36|    0.45|   0.83|   726|
+|disgust       |     0.88|  0.12|    0.21|   0.90|   123|
+|fear          |     0.75|  0.48|    0.58|   0.93|    98|
+|joy           |     0.82|  0.76|    0.79|   0.90|  2104|
+|sadness       |     0.69|  0.46|    0.55|   0.88|   379|
+|surprise      |     0.61|  0.51|    0.55|   0.87|   677|
+|neutral       |     0.65|  0.62|    0.64|   0.83|  1787|
+|micro avg     |     0.71|  0.60|    0.65|   0.92|  5894|
+|macro avg     |     0.71|  0.47|    0.54|   0.88|  5894|
+|weighted avg  |     0.71|  0.60|    0.64|   0.87|  5894|
+|samples avg   |     0.63|  0.62|    0.62|    nan|  5894|

config.json ADDED Viewed

	@@ -0,0 +1,50 @@

+{
+  "_name_or_path": "google-bert/bert-base-multilingual-cased",
+  "architectures": [
+    "BertForSequenceClassification"
+  ],
+  "attention_probs_dropout_prob": 0.1,
+  "classifier_dropout": null,
+  "directionality": "bidi",
+  "hidden_act": "gelu",
+  "hidden_dropout_prob": 0.1,
+  "hidden_size": 768,
+  "id2label": {
+    "0": "anger",
+    "1": "disgust",
+    "2": "fear",
+    "3": "joy",
+    "4": "sadness",
+    "5": "surprise",
+    "6": "neutral"
+  },
+  "initializer_range": 0.02,
+  "intermediate_size": 3072,
+  "label2id": {
+    "anger": 0,
+    "disgust": 1,
+    "fear": 2,
+    "joy": 3,
+    "neutral": 6,
+    "sadness": 4,
+    "surprise": 5
+  },
+  "layer_norm_eps": 1e-12,
+  "max_position_embeddings": 512,
+  "model_type": "bert",
+  "num_attention_heads": 12,
+  "num_hidden_layers": 12,
+  "pad_token_id": 0,
+  "pooler_fc_size": 768,
+  "pooler_num_attention_heads": 12,
+  "pooler_num_fc_layers": 3,
+  "pooler_size_per_head": 128,
+  "pooler_type": "first_token_transform",
+  "position_embedding_type": "absolute",
+  "problem_type": "multi_label_classification",
+  "torch_dtype": "float32",
+  "transformers_version": "4.45.1",
+  "type_vocab_size": 2,
+  "use_cache": true,
+  "vocab_size": 119547
+}

evaluation_results_on_test_split.csv ADDED Viewed

	@@ -0,0 +1,6 @@

+,anger,disgust,fear,joy,sadness,surprise,neutral,micro avg,macro avg,weighted avg,samples avg
+precision,0.58,0.88,0.75,0.82,0.69,0.61,0.65,0.71,0.71,0.71,0.63
+recall,0.36,0.12,0.48,0.76,0.46,0.51,0.62,0.6,0.47,0.6,0.62
+f1-score,0.45,0.21,0.58,0.79,0.55,0.55,0.64,0.65,0.54,0.64,0.62
+support,726.0,123.0,98.0,2104.0,379.0,677.0,1787.0,5894.0,5894.0,5894.0,5894.0
+auc-roc,0.83,0.9,0.93,0.9,0.88,0.87,0.83,0.92,0.88,0.87,

model.safetensors ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:c81861f6623de9da59fe8cda0c9f2a402fbb8866cf9362403d19bb0a9ed2be4f
+size 711458836

special_tokens_map.json ADDED Viewed

	@@ -0,0 +1,7 @@

+{
+  "cls_token": "[CLS]",
+  "mask_token": "[MASK]",
+  "pad_token": "[PAD]",
+  "sep_token": "[SEP]",
+  "unk_token": "[UNK]"
+}

tokenizer.json ADDED Viewed

The diff for this file is too large to render. See raw diff

tokenizer_config.json ADDED Viewed

	@@ -0,0 +1,55 @@

+{
+  "added_tokens_decoder": {
+    "0": {
+      "content": "[PAD]",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "100": {
+      "content": "[UNK]",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "101": {
+      "content": "[CLS]",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "102": {
+      "content": "[SEP]",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "103": {
+      "content": "[MASK]",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    }
+  },
+  "clean_up_tokenization_spaces": false,
+  "cls_token": "[CLS]",
+  "do_lower_case": false,
+  "mask_token": "[MASK]",
+  "model_max_length": 512,
+  "pad_token": "[PAD]",
+  "sep_token": "[SEP]",
+  "strip_accents": null,
+  "tokenize_chinese_chars": true,
+  "tokenizer_class": "BertTokenizer",
+  "unk_token": "[UNK]"
+}

vocab.txt ADDED Viewed

The diff for this file is too large to render. See raw diff