luis-espinosa committed
Commit 07d8df4 · verified · 1 Parent(s): e55db0a

Upload trained SetFit model

1_Pooling/config.json CHANGED
@@ -1,5 +1,5 @@
 {
-  "word_embedding_dimension": 768,
+  "word_embedding_dimension": 896,
   "pooling_mode_cls_token": false,
   "pooling_mode_mean_tokens": true,
   "pooling_mode_max_tokens": false,
README.md CHANGED
@@ -5,21 +5,20 @@ tags:
 - text-classification
 - generated_from_setfit_trainer
 widget:
-- text: Utilita Energy to pay £175,000 after failing to meet carbon emission reduction
-  obligations
-- text: Ofgem appoints preferred bidder for Burbo Bank Extension offshore transmission
-  assets
-- text: Aveni secures £11m to build LLM for financial services
-- text: LG CNS, Aeon to collaborate on EdTech in Japan
-- text: US to Buy Norwegian Joint Strike Missile for its F-35A Stealth Aircraft
+- text: Be.EV partners with Paua to add more than 700 charge points to the Paua network
+- text: UAE’s Artificial Intelligence Office, Mastercard and First Abu Dhabi Bank
+  Launch Joint AI Challenge
+- text: 'Supply Licence Review: Ofgem''s role in enforcing industry codes'
+- text: Air Astana, Neos Enter into Strategic Partnership
+- text: Ofgem protects customers of failed supplier Rutherford Energy Supply Limited
 metrics:
 - accuracy
 pipeline_tag: text-classification
 library_name: setfit
 inference: false
-base_model: infgrad/stella-base-en-v2
+base_model: HIT-TMG/KaLM-embedding-multilingual-mini-instruct-v1.5
 model-index:
-- name: SetFit with infgrad/stella-base-en-v2
+- name: SetFit with HIT-TMG/KaLM-embedding-multilingual-mini-instruct-v1.5
   results:
   - task:
       type: text-classification
@@ -30,13 +29,13 @@ model-index:
       split: test
       metrics:
       - type: accuracy
-        value: 0.6353790613718412
+        value: 0.6441441441441441
        name: Accuracy
 ---
 
-# SetFit with infgrad/stella-base-en-v2
+# SetFit with HIT-TMG/KaLM-embedding-multilingual-mini-instruct-v1.5
 
-This is a [SetFit](https://github.com/huggingface/setfit) model that can be used for Text Classification. This SetFit model uses [infgrad/stella-base-en-v2](https://huggingface.co/infgrad/stella-base-en-v2) as the Sentence Transformer embedding model. A OneVsRestClassifier instance is used for classification.
+This is a [SetFit](https://github.com/huggingface/setfit) model that can be used for Text Classification. This SetFit model uses [HIT-TMG/KaLM-embedding-multilingual-mini-instruct-v1.5](https://huggingface.co/HIT-TMG/KaLM-embedding-multilingual-mini-instruct-v1.5) as the Sentence Transformer embedding model. A OneVsRestClassifier instance is used for classification.
 
 The model has been trained using an efficient few-shot learning technique that involves:
 
@@ -47,9 +46,9 @@ The model has been trained using an efficient few-shot learning technique that i
 
 ### Model Description
 - **Model Type:** SetFit
-- **Sentence Transformer body:** [infgrad/stella-base-en-v2](https://huggingface.co/infgrad/stella-base-en-v2)
+- **Sentence Transformer body:** [HIT-TMG/KaLM-embedding-multilingual-mini-instruct-v1.5](https://huggingface.co/HIT-TMG/KaLM-embedding-multilingual-mini-instruct-v1.5)
 - **Classification head:** a OneVsRestClassifier instance
-- **Maximum Sequence Length:** 512 tokens
+- **Maximum Sequence Length:** 32768 tokens
 <!-- - **Number of Classes:** Unknown -->
 <!-- - **Training Dataset:** [Unknown](https://huggingface.co/datasets/unknown) -->
 <!-- - **Language:** Unknown -->
@@ -66,7 +65,7 @@ The model has been trained using an efficient few-shot learning technique that i
 ### Metrics
 | Label | Accuracy |
 |:--------|:---------|
-| **all** | 0.6354 |
+| **all** | 0.6441 |
 
 ## Uses
 
@@ -86,7 +85,7 @@ from setfit import SetFitModel
 # Download from the 🤗 Hub
 model = SetFitModel.from_pretrained("amplyfi/all-labels")
 # Run inference
-preds = model("LG CNS, Aeon to collaborate on EdTech in Japan")
+preds = model("Air Astana, Neos Enter into Strategic Partnership")
 ```
 
 <!--
@@ -118,14 +117,14 @@ preds = model("LG CNS, Aeon to collaborate on EdTech in Japan")
 ### Training Set Metrics
 | Training set | Min | Median | Max |
 |:-------------|:----|:-------|:----|
-| Word count | 4 | 9.9566 | 29 |
+| Word count | 4 | 9.9797 | 30 |
 
 ### Training Hyperparameters
 - batch_size: (16, 16)
 - num_epochs: (2, 2)
 - max_steps: -1
 - sampling_strategy: oversampling
-- num_iterations: 10
+- num_iterations: 5
 - body_learning_rate: (2e-05, 2e-05)
 - head_learning_rate: 2e-05
 - loss: CosineSimilarityLoss
@@ -142,48 +141,29 @@ preds = model("LG CNS, Aeon to collaborate on EdTech in Japan")
 ### Training Results
 | Epoch | Step | Training Loss | Validation Loss |
 |:------:|:----:|:-------------:|:---------------:|
-| 0.0010 | 1 | 0.1474 | - |
-| 0.0482 | 50 | 0.2165 | - |
-| 0.0963 | 100 | 0.1969 | - |
-| 0.1445 | 150 | 0.1609 | - |
-| 0.1927 | 200 | 0.1175 | - |
-| 0.2408 | 250 | 0.0956 | - |
-| 0.2890 | 300 | 0.0783 | - |
-| 0.3372 | 350 | 0.0689 | - |
-| 0.3854 | 400 | 0.0513 | - |
-| 0.4335 | 450 | 0.0486 | - |
-| 0.4817 | 500 | 0.0651 | - |
-| 0.5299 | 550 | 0.0612 | - |
-| 0.5780 | 600 | 0.0537 | - |
-| 0.6262 | 650 | 0.0363 | - |
-| 0.6744 | 700 | 0.0408 | - |
-| 0.7225 | 750 | 0.0413 | - |
-| 0.7707 | 800 | 0.0373 | - |
-| 0.8189 | 850 | 0.0327 | - |
-| 0.8671 | 900 | 0.0278 | - |
-| 0.9152 | 950 | 0.0357 | - |
-| 0.9634 | 1000 | 0.0291 | - |
-| 1.0116 | 1050 | 0.0227 | - |
-| 1.0597 | 1100 | 0.0178 | - |
-| 1.1079 | 1150 | 0.0224 | - |
-| 1.1561 | 1200 | 0.0193 | - |
-| 1.2042 | 1250 | 0.0205 | - |
-| 1.2524 | 1300 | 0.019 | - |
-| 1.3006 | 1350 | 0.0176 | - |
-| 1.3487 | 1400 | 0.0196 | - |
-| 1.3969 | 1450 | 0.0147 | - |
-| 1.4451 | 1500 | 0.0209 | - |
-| 1.4933 | 1550 | 0.0161 | - |
-| 1.5414 | 1600 | 0.0164 | - |
-| 1.5896 | 1650 | 0.0188 | - |
-| 1.6378 | 1700 | 0.0153 | - |
-| 1.6859 | 1750 | 0.0167 | - |
-| 1.7341 | 1800 | 0.0198 | - |
-| 1.7823 | 1850 | 0.0157 | - |
-| 1.8304 | 1900 | 0.0168 | - |
-| 1.8786 | 1950 | 0.0128 | - |
-| 1.9268 | 2000 | 0.0165 | - |
-| 1.9750 | 2050 | 0.0121 | - |
+| 0.0018 | 1 | 0.3185 | - |
+| 0.0903 | 50 | 0.2296 | - |
+| 0.1805 | 100 | 0.1307 | - |
+| 0.2708 | 150 | 0.0955 | - |
+| 0.3610 | 200 | 0.08 | - |
+| 0.4513 | 250 | 0.0687 | - |
+| 0.5415 | 300 | 0.0591 | - |
+| 0.6318 | 350 | 0.0545 | - |
+| 0.7220 | 400 | 0.0538 | - |
+| 0.8123 | 450 | 0.0482 | - |
+| 0.9025 | 500 | 0.0327 | - |
+| 0.9928 | 550 | 0.0332 | - |
+| 1.0830 | 600 | 0.0315 | - |
+| 1.1733 | 650 | 0.0188 | - |
+| 1.2635 | 700 | 0.016 | - |
+| 1.3538 | 750 | 0.016 | - |
+| 1.4440 | 800 | 0.0167 | - |
+| 1.5343 | 850 | 0.0128 | - |
+| 1.6245 | 900 | 0.0182 | - |
+| 1.7148 | 950 | 0.0113 | - |
+| 1.8051 | 1000 | 0.014 | - |
+| 1.8953 | 1050 | 0.0151 | - |
+| 1.9856 | 1100 | 0.0153 | - |
 
 ### Framework Versions
 - Python: 3.10.12
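
Because the head is a OneVsRestClassifier, each headline can carry several labels at once. Beyond the README's `preds = model(...)` call, per-label scores are available through `predict_proba`; a sketch under the same loading assumptions as above:

```python
from setfit import SetFitModel

model = SetFitModel.from_pretrained("amplyfi/all-labels")

headlines = [
    "Air Astana, Neos Enter into Strategic Partnership",
    "Ofgem protects customers of failed supplier Rutherford Energy Supply Limited",
]
# predict() returns one multi-hot vector per input for a OneVsRest head;
# predict_proba() exposes the underlying per-label probabilities.
probs = model.predict_proba(headlines)
print(probs.shape)  # (2, num_labels)
```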
added_tokens.json ADDED
@@ -0,0 +1,5 @@
+{
+  "<|endoftext|>": 151643,
+  "<|im_end|>": 151645,
+  "<|im_start|>": 151644
+}
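
These are the Qwen2 control tokens that ship with the new backbone. A quick check that the uploaded tokenizer resolves them to the expected ids (a sketch; `trust_remote_code=True` is assumed to be needed because of the tokenizer's `auto_map`):

```python
from transformers import AutoTokenizer

tok = AutoTokenizer.from_pretrained("amplyfi/all-labels", trust_remote_code=True)
# Ids should mirror added_tokens.json exactly.
assert tok.convert_tokens_to_ids("<|endoftext|>") == 151643
assert tok.convert_tokens_to_ids("<|im_start|>") == 151644
assert tok.convert_tokens_to_ids("<|im_end|>") == 151645
```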
config.json CHANGED
@@ -1,32 +1,28 @@
 {
-  "_name_or_path": "infgrad/stella-base-en-v2",
+  "_name_or_path": "HIT-TMG/KaLM-embedding-multilingual-mini-instruct-v1.5",
   "architectures": [
-    "BertModel"
+    "Qwen2Model"
   ],
-  "attention_probs_dropout_prob": 0.1,
-  "classifier_dropout": null,
-  "gradient_checkpointing": false,
-  "hidden_act": "gelu",
-  "hidden_dropout_prob": 0.1,
-  "hidden_size": 768,
-  "id2label": {
-    "0": "LABEL_0"
-  },
+  "attention_dropout": 0.0,
+  "bos_token_id": 151643,
+  "eos_token_id": 151643,
+  "hidden_act": "silu",
+  "hidden_size": 896,
   "initializer_range": 0.02,
-  "intermediate_size": 3072,
-  "label2id": {
-    "LABEL_0": 0
-  },
-  "layer_norm_eps": 1e-12,
-  "max_position_embeddings": 512,
-  "model_type": "bert",
-  "num_attention_heads": 12,
-  "num_hidden_layers": 12,
-  "pad_token_id": 0,
-  "position_embedding_type": "absolute",
+  "intermediate_size": 4864,
+  "max_position_embeddings": 131072,
+  "max_window_layers": 24,
+  "model_type": "qwen2",
+  "num_attention_heads": 14,
+  "num_hidden_layers": 24,
+  "num_key_value_heads": 2,
+  "rms_norm_eps": 1e-06,
+  "rope_theta": 1000000.0,
+  "sliding_window": 131072,
+  "tie_word_embeddings": true,
   "torch_dtype": "float32",
   "transformers_version": "4.42.2",
-  "type_vocab_size": 2,
-  "use_cache": true,
-  "vocab_size": 30522
+  "use_cache": false,
+  "use_sliding_window": false,
+  "vocab_size": 151936
 }
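
The embedding body changes family outright: a 12-layer BERT encoder gives way to a 24-layer Qwen2 decoder with grouped-query attention (14 query heads, 2 key/value heads). A config-only check that avoids downloading the weights, as a sketch:

```python
from transformers import AutoConfig

cfg = AutoConfig.from_pretrained("amplyfi/all-labels")
print(cfg.model_type)   # "qwen2"
print(cfg.hidden_size)  # 896, matching 1_Pooling/config.json
print(cfg.num_attention_heads, cfg.num_key_value_heads)  # 14, 2 (GQA)
```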
config_sentence_transformers.json CHANGED
@@ -4,7 +4,10 @@
     "transformers": "4.42.2",
     "pytorch": "2.5.1+cu124"
   },
-  "prompts": {},
+  "prompts": {
+    "query": "",
+    "document": ""
+  },
   "default_prompt_name": null,
   "similarity_fn_name": "cosine"
 }
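
The new config registers `query` and `document` prompt slots, both empty strings, so prompt-aware encoding is currently a no-op but the names are wired up. A sketch of how sentence-transformers would use them, assuming the Sentence Transformer body loads standalone from this repo:

```python
from sentence_transformers import SentenceTransformer

st = SentenceTransformer("amplyfi/all-labels", trust_remote_code=True)
# Both prompts are empty in this checkpoint, so the two embeddings match;
# the mechanism exists for checkpoints with non-empty instruction prompts.
q = st.encode("Ofgem supplier licence review", prompt_name="query")
d = st.encode("Ofgem supplier licence review", prompt_name="document")
```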
merges.txt ADDED
The diff for this file is too large to render. See raw diff
 
model.safetensors CHANGED
@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:2773ffa701d578cb282d1aa6b7ad5b80c4af07f73d90af030ec284cc37479f32
-size 437951328
+oid sha256:5292fdb77075fc7c073101ce5d1de5a8519e07ce5428101ed6891b827ea82938
+size 1976161736
model_head.pkl CHANGED
@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:4e5ad649e8942c9b81f6de27817d0415a1cc3bc89445f6d253e44e1ce9262c1e
-size 117652
+oid sha256:f68b0434348a1d9f0132b0f79b1ddd5f3caafbc7d4de9225d0a8bd1bae5fd447
+size 136084
modules.json CHANGED
@@ -10,5 +10,11 @@
     "name": "1",
     "path": "1_Pooling",
     "type": "sentence_transformers.models.Pooling"
+  },
+  {
+    "idx": 2,
+    "name": "2",
+    "path": "2_Normalize",
+    "type": "sentence_transformers.models.Normalize"
   }
 ]
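
A Normalize module is appended to the pipeline, so embeddings leave the body unit-length and cosine similarity reduces to a dot product. A sketch verifying that, under the same loading assumptions as above:

```python
import numpy as np
from sentence_transformers import SentenceTransformer

st = SentenceTransformer("amplyfi/all-labels", trust_remote_code=True)
emb = st.encode(["Aveni secures £11m to build LLM for financial services"])
# With the Normalize module in place, every row should have L2 norm ~1.0.
print(np.linalg.norm(emb, axis=1))
```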
sentence_bert_config.json CHANGED
@@ -1,4 +1,4 @@
 {
-  "max_seq_length": 512,
+  "max_seq_length": 32768,
   "do_lower_case": false
 }
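
The sequence cap jumps from 512 to the backbone's 32768 tokens, while the training headlines peak at 30 words, so nothing here needs that window. If memory matters at inference time, the cap can be lowered after loading; a sketch:

```python
from sentence_transformers import SentenceTransformer

st = SentenceTransformer("amplyfi/all-labels", trust_remote_code=True)
# Short headlines never approach 32768 tokens; a small cap trims the
# attention cost without changing outputs for in-range inputs.
st.max_seq_length = 128
```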
special_tokens_map.json CHANGED
@@ -1,34 +1,17 @@
 {
-  "cls_token": {
-    "content": "[CLS]",
-    "lstrip": false,
-    "normalized": false,
-    "rstrip": false,
-    "single_word": false
-  },
-  "mask_token": {
-    "content": "[MASK]",
+  "additional_special_tokens": [
+    "<|im_start|>",
+    "<|im_end|>"
+  ],
+  "eos_token": {
+    "content": "<|endoftext|>",
     "lstrip": false,
     "normalized": false,
     "rstrip": false,
     "single_word": false
   },
   "pad_token": {
-    "content": "[PAD]",
-    "lstrip": false,
-    "normalized": false,
-    "rstrip": false,
-    "single_word": false
-  },
-  "sep_token": {
-    "content": "[SEP]",
-    "lstrip": false,
-    "normalized": false,
-    "rstrip": false,
-    "single_word": false
-  },
-  "unk_token": {
-    "content": "[UNK]",
+    "content": "<|endoftext|>",
     "lstrip": false,
     "normalized": false,
     "rstrip": false,
tokenizer.json CHANGED
The diff for this file is too large to render. See raw diff
 
tokenizer_config.json CHANGED
@@ -1,39 +1,24 @@
 {
+  "add_prefix_space": false,
   "added_tokens_decoder": {
-    "0": {
-      "content": "[PAD]",
+    "151643": {
+      "content": "<|endoftext|>",
       "lstrip": false,
       "normalized": false,
       "rstrip": false,
       "single_word": false,
       "special": true
     },
-    "100": {
-      "content": "[UNK]",
+    "151644": {
+      "content": "<|im_start|>",
       "lstrip": false,
       "normalized": false,
       "rstrip": false,
       "single_word": false,
       "special": true
     },
-    "101": {
-      "content": "[CLS]",
-      "lstrip": false,
-      "normalized": false,
-      "rstrip": false,
-      "single_word": false,
-      "special": true
-    },
-    "102": {
-      "content": "[SEP]",
-      "lstrip": false,
-      "normalized": false,
-      "rstrip": false,
-      "single_word": false,
-      "special": true
-    },
-    "103": {
-      "content": "[MASK]",
+    "151645": {
+      "content": "<|im_end|>",
       "lstrip": false,
       "normalized": false,
       "rstrip": false,
@@ -41,17 +26,31 @@
       "special": true
     }
   },
-  "clean_up_tokenization_spaces": true,
-  "cls_token": "[CLS]",
-  "do_basic_tokenize": true,
-  "do_lower_case": true,
-  "mask_token": "[MASK]",
-  "model_max_length": 512,
-  "never_split": null,
-  "pad_token": "[PAD]",
-  "sep_token": "[SEP]",
-  "strip_accents": null,
-  "tokenize_chinese_chars": true,
-  "tokenizer_class": "BertTokenizer",
-  "unk_token": "[UNK]"
+  "additional_special_tokens": [
+    "<|im_start|>",
+    "<|im_end|>"
+  ],
+  "auto_map": {
+    "AutoTokenizer": [
+      "HIT-TMG/KaLM-embedding-multilingual-mini-instruct-v1.5--tokenization_qwen.Qwen2Tokenizer",
+      "HIT-TMG/KaLM-embedding-multilingual-mini-instruct-v1.5--tokenization_qwen.Qwen2TokenizerFast"
+    ]
+  },
+  "bos_token": null,
+  "chat_template": "{% for message in messages %}{% if loop.first and messages[0]['role'] != 'system' %}{{ '<|im_start|>system\nYou are a helpful assistant<|im_end|>\n' }}{% endif %}{{'<|im_start|>' + message['role'] + '\n' + message['content'] + '<|im_end|>' + '\n'}}{% endfor %}{% if add_generation_prompt %}{{ '<|im_start|>assistant\n' }}{% endif %}",
+  "clean_up_tokenization_spaces": false,
+  "eos_token": "<|endoftext|>",
+  "errors": "replace",
+  "max_length": 512,
+  "model_max_length": 32768,
+  "pad_to_multiple_of": null,
+  "pad_token": "<|endoftext|>",
+  "pad_token_type_id": 0,
+  "padding_side": "left",
+  "split_special_tokens": false,
+  "stride": 0,
+  "tokenizer_class": "Qwen2Tokenizer",
+  "truncation_side": "right",
+  "truncation_strategy": "longest_first",
+  "unk_token": null
 }
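
Two settings worth noting for batched inference: `padding_side` is now `left`, as is typical for decoder-style models, and `model_max_length` is 32768. A sketch showing where the pad tokens land, under the same loading assumptions:

```python
from transformers import AutoTokenizer

tok = AutoTokenizer.from_pretrained("amplyfi/all-labels", trust_remote_code=True)
batch = tok(["short one", "a somewhat longer headline here"], padding=True)
# With padding_side="left", pad ids (151643) prefix the shorter sequence.
print(batch["input_ids"][0][:3])
```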
vocab.json ADDED
The diff for this file is too large to render. See raw diff