Uploading TinyMistral-248M-v3 to HF

Files changed (8) hide show

.gitattributes +35 -0
README.md +244 -0
config.json +27 -0
generation_config.json +6 -0
model.safetensors +3 -0
special_tokens_map.json +34 -0
tokenizer.json +0 -0
tokenizer_config.json +91 -0

.gitattributes ADDED Viewed

	@@ -0,0 +1,35 @@

+*.7z filter=lfs diff=lfs merge=lfs -text
+*.arrow filter=lfs diff=lfs merge=lfs -text
+*.bin filter=lfs diff=lfs merge=lfs -text
+*.bz2 filter=lfs diff=lfs merge=lfs -text
+*.ckpt filter=lfs diff=lfs merge=lfs -text
+*.ftz filter=lfs diff=lfs merge=lfs -text
+*.gz filter=lfs diff=lfs merge=lfs -text
+*.h5 filter=lfs diff=lfs merge=lfs -text
+*.joblib filter=lfs diff=lfs merge=lfs -text
+*.lfs.* filter=lfs diff=lfs merge=lfs -text
+*.mlmodel filter=lfs diff=lfs merge=lfs -text
+*.model filter=lfs diff=lfs merge=lfs -text
+*.msgpack filter=lfs diff=lfs merge=lfs -text
+*.npy filter=lfs diff=lfs merge=lfs -text
+*.npz filter=lfs diff=lfs merge=lfs -text
+*.onnx filter=lfs diff=lfs merge=lfs -text
+*.ot filter=lfs diff=lfs merge=lfs -text
+*.parquet filter=lfs diff=lfs merge=lfs -text
+*.pb filter=lfs diff=lfs merge=lfs -text
+*.pickle filter=lfs diff=lfs merge=lfs -text
+*.pkl filter=lfs diff=lfs merge=lfs -text
+*.pt filter=lfs diff=lfs merge=lfs -text
+*.pth filter=lfs diff=lfs merge=lfs -text
+*.rar filter=lfs diff=lfs merge=lfs -text
+*.safetensors filter=lfs diff=lfs merge=lfs -text
+saved_model/**/* filter=lfs diff=lfs merge=lfs -text
+*.tar.* filter=lfs diff=lfs merge=lfs -text
+*.tar filter=lfs diff=lfs merge=lfs -text
+*.tflite filter=lfs diff=lfs merge=lfs -text
+*.tgz filter=lfs diff=lfs merge=lfs -text
+*.wasm filter=lfs diff=lfs merge=lfs -text
+*.xz filter=lfs diff=lfs merge=lfs -text
+*.zip filter=lfs diff=lfs merge=lfs -text
+*.zst filter=lfs diff=lfs merge=lfs -text
+*tfevents* filter=lfs diff=lfs merge=lfs -text

README.md ADDED Viewed

	@@ -0,0 +1,244 @@

+---
+language:
+- en
+license: apache-2.0
+datasets:
+- Locutusque/TM-DATA-V2
+- LLM360/TxT360
+- mlfoundations/dclm-baseline-1.0
+- Skylion007/openwebtext
+- JeanKaddour/minipile
+- eminorhan/gutenberg_en
+model-index:
+- name: TinyMistral-248M-v3
+  results:
+  - task:
+      type: text-generation
+      name: Text Generation
+    dataset:
+      name: IFEval (0-Shot)
+      type: HuggingFaceH4/ifeval
+      args:
+        num_few_shot: 0
+    metrics:
+    - type: inst_level_strict_acc and prompt_level_strict_acc
+      value: 16.39
+      name: strict accuracy
+    source:
+      url: https://huggingface.co/spaces/open-llm-leaderboard/open_llm_leaderboard?query=M4-ai/TinyMistral-248M-v3
+      name: Open LLM Leaderboard
+  - task:
+      type: text-generation
+      name: Text Generation
+    dataset:
+      name: BBH (3-Shot)
+      type: BBH
+      args:
+        num_few_shot: 3
+    metrics:
+    - type: acc_norm
+      value: 1.78
+      name: normalized accuracy
+    source:
+      url: https://huggingface.co/spaces/open-llm-leaderboard/open_llm_leaderboard?query=M4-ai/TinyMistral-248M-v3
+      name: Open LLM Leaderboard
+  - task:
+      type: text-generation
+      name: Text Generation
+    dataset:
+      name: MATH Lvl 5 (4-Shot)
+      type: hendrycks/competition_math
+      args:
+        num_few_shot: 4
+    metrics:
+    - type: exact_match
+      value: 0.0
+      name: exact match
+    source:
+      url: https://huggingface.co/spaces/open-llm-leaderboard/open_llm_leaderboard?query=M4-ai/TinyMistral-248M-v3
+      name: Open LLM Leaderboard
+  - task:
+      type: text-generation
+      name: Text Generation
+    dataset:
+      name: GPQA (0-shot)
+      type: Idavidrein/gpqa
+      args:
+        num_few_shot: 0
+    metrics:
+    - type: acc_norm
+      value: 0.0
+      name: acc_norm
+    source:
+      url: https://huggingface.co/spaces/open-llm-leaderboard/open_llm_leaderboard?query=M4-ai/TinyMistral-248M-v3
+      name: Open LLM Leaderboard
+  - task:
+      type: text-generation
+      name: Text Generation
+    dataset:
+      name: MuSR (0-shot)
+      type: TAUR-Lab/MuSR
+      args:
+        num_few_shot: 0
+    metrics:
+    - type: acc_norm
+      value: 5.15
+      name: acc_norm
+    source:
+      url: https://huggingface.co/spaces/open-llm-leaderboard/open_llm_leaderboard?query=M4-ai/TinyMistral-248M-v3
+      name: Open LLM Leaderboard
+  - task:
+      type: text-generation
+      name: Text Generation
+    dataset:
+      name: MMLU-PRO (5-shot)
+      type: TIGER-Lab/MMLU-Pro
+      config: main
+      split: test
+      args:
+        num_few_shot: 5
+    metrics:
+    - type: acc
+      value: 1.47
+      name: accuracy
+    source:
+      url: https://huggingface.co/spaces/open-llm-leaderboard/open_llm_leaderboard?query=M4-ai/TinyMistral-248M-v3
+      name: Open LLM Leaderboard
+---
+Still in training. Trained on roughly 21 billion tokens so far.
+
+|                 Tasks                  |Version|     Filter     |n-shot|  Metric   |   | Value |   |Stderr|
+|----------------------------------------|-------|----------------|-----:|-----------|---|------:|---|-----:|
+|Open LLM Leaderboard                    |    N/A|                |      |           |   |       |   |      |
+| - arc_challenge                        |      1|none            |    25|acc        |↑  | 0.2005|±  |0.0117|
+|                                        |       |none            |    25|acc_norm   |↑  | 0.2406|±  |0.0125|
+| - gsm8k                                |      3|flexible-extract|     5|exact_match|↑  | 0.0083|±  |0.0025|
+|                                        |       |strict-match    |     5|exact_match|↑  | 0.0000|±  |0.0000|
+| - hellaswag                            |      1|none            |    10|acc        |↑  | 0.2724|±  |0.0044|
+|                                        |       |none            |    10|acc_norm   |↑  | 0.2838|±  |0.0045|
+| - mmlu                                 |      2|none            |      |acc        |↑  | 0.2290|±  |0.0035|
+|  - humanities                          |      2|none            |      |acc        |↑  | 0.2380|±  |0.0062|
+|   - formal_logic                       |      1|none            |     5|acc        |↑  | 0.2460|±  |0.0385|
+|   - high_school_european_history       |      1|none            |     5|acc        |↑  | 0.1818|±  |0.0301|
+|   - high_school_us_history             |      1|none            |     5|acc        |↑  | 0.2647|±  |0.0310|
+|   - high_school_world_history          |      1|none            |     5|acc        |↑  | 0.2911|±  |0.0296|
+|   - international_law                  |      1|none            |     5|acc        |↑  | 0.2149|±  |0.0375|
+|   - jurisprudence                      |      1|none            |     5|acc        |↑  | 0.2685|±  |0.0428|
+|   - logical_fallacies                  |      1|none            |     5|acc        |↑  | 0.2209|±  |0.0326|
+|   - moral_disputes                     |      1|none            |     5|acc        |↑  | 0.2457|±  |0.0232|
+|   - moral_scenarios                    |      1|none            |     5|acc        |↑  | 0.2369|±  |0.0142|
+|   - philosophy                         |      1|none            |     5|acc        |↑  | 0.1865|±  |0.0221|
+|   - prehistory                         |      1|none            |     5|acc        |↑  | 0.1975|±  |0.0222|
+|   - professional_law                   |      1|none            |     5|acc        |↑  | 0.2432|±  |0.0110|
+|   - world_religions                    |      1|none            |     5|acc        |↑  | 0.3099|±  |0.0355|
+|  - other                               |      2|none            |      |acc        |↑  | 0.2375|±  |0.0076|
+|   - business_ethics                    |      1|none            |     5|acc        |↑  | 0.3200|±  |0.0469|
+|   - clinical_knowledge                 |      1|none            |     5|acc        |↑  | 0.2226|±  |0.0256|
+|   - college_medicine                   |      1|none            |     5|acc        |↑  | 0.1965|±  |0.0303|
+|   - global_facts                       |      1|none            |     5|acc        |↑  | 0.1800|±  |0.0386|
+|   - human_aging                        |      1|none            |     5|acc        |↑  | 0.3004|±  |0.0308|
+|   - management                         |      1|none            |     5|acc        |↑  | 0.1942|±  |0.0392|
+|   - marketing                          |      1|none            |     5|acc        |↑  | 0.2735|±  |0.0292|
+|   - medical_genetics                   |      1|none            |     5|acc        |↑  | 0.3000|±  |0.0461|
+|   - miscellaneous                      |      1|none            |     5|acc        |↑  | 0.2478|±  |0.0154|
+|   - nutrition                          |      1|none            |     5|acc        |↑  | 0.2222|±  |0.0238|
+|   - professional_accounting            |      1|none            |     5|acc        |↑  | 0.2021|±  |0.0240|
+|   - professional_medicine              |      1|none            |     5|acc        |↑  | 0.1912|±  |0.0239|
+|   - virology                           |      1|none            |     5|acc        |↑  | 0.2590|±  |0.0341|
+|  - social sciences                     |      2|none            |      |acc        |↑  | 0.2203|±  |0.0075|
+|   - econometrics                       |      1|none            |     5|acc        |↑  | 0.2368|±  |0.0400|
+|   - high_school_geography              |      1|none            |     5|acc        |↑  | 0.2020|±  |0.0286|
+|   - high_school_government_and_politics|      1|none            |     5|acc        |↑  | 0.1865|±  |0.0281|
+|   - high_school_macroeconomics         |      1|none            |     5|acc        |↑  | 0.2205|±  |0.0210|
+|   - high_school_microeconomics         |      1|none            |     5|acc        |↑  | 0.2143|±  |0.0267|
+|   - high_school_psychology             |      1|none            |     5|acc        |↑  | 0.1908|±  |0.0168|
+|   - human_sexuality                    |      1|none            |     5|acc        |↑  | 0.2672|±  |0.0388|
+|   - professional_psychology            |      1|none            |     5|acc        |↑  | 0.2386|±  |0.0172|
+|   - public_relations                   |      1|none            |     5|acc        |↑  | 0.1727|±  |0.0362|
+|   - security_studies                   |      1|none            |     5|acc        |↑  | 0.2367|±  |0.0272|
+|   - sociology                          |      1|none            |     5|acc        |↑  | 0.2488|±  |0.0306|
+|   - us_foreign_policy                  |      1|none            |     5|acc        |↑  | 0.2600|±  |0.0441|
+|  - stem                                |      2|none            |      |acc        |↑  | 0.2157|±  |0.0073|
+|   - abstract_algebra                   |      1|none            |     5|acc        |↑  | 0.2200|±  |0.0416|
+|   - anatomy                            |      1|none            |     5|acc        |↑  | 0.1778|±  |0.0330|
+|   - astronomy                          |      1|none            |     5|acc        |↑  | 0.1908|±  |0.0320|
+|   - college_biology                    |      1|none            |     5|acc        |↑  | 0.2778|±  |0.0375|
+|   - college_chemistry                  |      1|none            |     5|acc        |↑  | 0.2200|±  |0.0416|
+|   - college_computer_science           |      1|none            |     5|acc        |↑  | 0.2100|±  |0.0409|
+|   - college_mathematics                |      1|none            |     5|acc        |↑  | 0.2100|±  |0.0409|
+|   - college_physics                    |      1|none            |     5|acc        |↑  | 0.2157|±  |0.0409|
+|   - computer_security                  |      1|none            |     5|acc        |↑  | 0.2700|±  |0.0446|
+|   - conceptual_physics                 |      1|none            |     5|acc        |↑  | 0.2638|±  |0.0288|
+|   - electrical_engineering             |      1|none            |     5|acc        |↑  | 0.2483|±  |0.0360|
+|   - elementary_mathematics             |      1|none            |     5|acc        |↑  | 0.2037|±  |0.0207|
+|   - high_school_biology                |      1|none            |     5|acc        |↑  | 0.1774|±  |0.0217|
+|   - high_school_chemistry              |      1|none            |     5|acc        |↑  | 0.2020|±  |0.0282|
+|   - high_school_computer_science       |      1|none            |     5|acc        |↑  | 0.2500|±  |0.0435|
+|   - high_school_mathematics            |      1|none            |     5|acc        |↑  | 0.2148|±  |0.0250|
+|   - high_school_physics                |      1|none            |     5|acc        |↑  | 0.2053|±  |0.0330|
+|   - high_school_statistics             |      1|none            |     5|acc        |↑  | 0.1481|±  |0.0242|
+|   - machine_learning                   |      1|none            |     5|acc        |↑  | 0.3125|±  |0.0440|
+| - truthfulqa_gen                       |      3|none            |     0|bleu_acc   |↑  | 0.2362|±  |0.0149|
+|                                        |       |none            |     0|bleu_diff  |↑  |-1.0138|±  |0.2569|
+|                                        |       |none            |     0|bleu_max   |↑  | 7.9522|±  |0.4088|
+|                                        |       |none            |     0|rouge1_acc |↑  | 0.2595|±  |0.0153|
+|                                        |       |none            |     0|rouge1_diff|↑  |-1.9129|±  |0.4349|
+|                                        |       |none            |     0|rouge1_max |↑  |21.7885|±  |0.7307|
+|                                        |       |none            |     0|rouge2_acc |↑  | 0.1200|±  |0.0114|
+|                                        |       |none            |     0|rouge2_diff|↑  |-1.9771|±  |0.3475|
+|                                        |       |none            |     0|rouge2_max |↑  | 9.0199|±  |0.5842|
+|                                        |       |none            |     0|rougeL_acc |↑  | 0.2570|±  |0.0153|
+|                                        |       |none            |     0|rougeL_diff|↑  |-1.8812|±  |0.4185|
+|                                        |       |none            |     0|rougeL_max |↑  |19.6284|±  |0.6850|
+| - truthfulqa_mc1                       |      2|none            |     0|acc        |↑  | 0.1983|±  |0.0140|
+| - truthfulqa_mc2                       |      2|none            |     0|acc        |↑  | 0.3861|±  |0.0147|
+| - winogrande                           |      1|none            |     5|acc        |↑  | 0.4972|±  |0.0141|
+
+|      Groups       |Version|Filter|n-shot|Metric|   |Value |   |Stderr|
+|-------------------|------:|------|------|------|---|-----:|---|-----:|
+| - mmlu            |      2|none  |      |acc   |↑  |0.2290|±  |0.0035|
+|  - humanities     |      2|none  |      |acc   |↑  |0.2380|±  |0.0062|
+|  - other          |      2|none  |      |acc   |↑  |0.2375|±  |0.0076|
+|  - social sciences|      2|none  |      |acc   |↑  |0.2203|±  |0.0075|
+|  - stem           |      2|none  |      |acc   |↑  |0.2157|±  |0.0073|
+
+|              Tasks              |Version|Filter|n-shot| Metric |   |Value |   |Stderr|
+|---------------------------------|------:|------|-----:|--------|---|-----:|---|-----:|
+|agieval_nous                     |      0|none  |      |acc_norm|↑  |0.2133|±  |0.0081|
+| - agieval_aqua_rat              |      1|none  |     0|acc     |↑  |0.2047|±  |0.0254|
+|                                 |       |none  |     0|acc_norm|↑  |0.1969|±  |0.0250|
+| - agieval_logiqa_en             |      1|none  |     0|acc     |↑  |0.2043|±  |0.0158|
+|                                 |       |none  |     0|acc_norm|↑  |0.2304|±  |0.0165|
+| - agieval_lsat_ar               |      1|none  |     0|acc     |↑  |0.1739|±  |0.0250|
+|                                 |       |none  |     0|acc_norm|↑  |0.1957|±  |0.0262|
+| - agieval_lsat_lr               |      1|none  |     0|acc     |↑  |0.1549|±  |0.0160|
+|                                 |       |none  |     0|acc_norm|↑  |0.1608|±  |0.0163|
+| - agieval_lsat_rc               |      1|none  |     0|acc     |↑  |0.1636|±  |0.0226|
+|                                 |       |none  |     0|acc_norm|↑  |0.2119|±  |0.0250|
+| - agieval_sat_en                |      1|none  |     0|acc     |↑  |0.2670|±  |0.0309|
+|                                 |       |none  |     0|acc_norm|↑  |0.2621|±  |0.0307|
+| - agieval_sat_en_without_passage|      1|none  |     0|acc     |↑  |0.2670|±  |0.0309|
+|                                 |       |none  |     0|acc_norm|↑  |0.2621|±  |0.0307|
+| - agieval_sat_math              |      1|none  |     0|acc     |↑  |0.2182|±  |0.0279|
+|                                 |       |none  |     0|acc_norm|↑  |0.2318|±  |0.0285|
+|arc_challenge                    |      1|none  |     0|acc     |↑  |0.1945|±  |0.0116|
+|                                 |       |none  |     0|acc_norm|↑  |0.2372|±  |0.0124|
+|truthfulqa_mc2                   |      2|none  |     0|acc     |↑  |0.3861|±  |0.0147|
+
+|   Groups   |Version|Filter|n-shot| Metric |   |Value |   |Stderr|
+|------------|------:|------|------|--------|---|-----:|---|-----:|
+|agieval_nous|      0|none  |      |acc_norm|↑  |0.2133|±  |0.0081|
+
+# [Open LLM Leaderboard Evaluation Results](https://huggingface.co/spaces/open-llm-leaderboard/open_llm_leaderboard)
+
+Detailed results can be found [here](https://huggingface.co/datasets/open-llm-leaderboard/details_M4-ai__TinyMistral-248M-v3).
+
+|      Metric       |Value|
+|-------------------|----:|
+|Avg.               | 4.13|
+|IFEval (0-Shot)    |16.39|
+|BBH (3-Shot)       | 1.78|
+|MATH Lvl 5 (4-Shot)| 0.00|
+|GPQA (0-shot)      | 0.00|
+|MuSR (0-shot)      | 5.15|
+|MMLU-PRO (5-shot)  | 1.47|

config.json ADDED Viewed

	@@ -0,0 +1,27 @@

+{
+  "_name_or_path": "M4-ai/TinyMistral-248M-v3",
+  "architectures": [
+    "MistralForCausalLM"
+  ],
+  "attention_dropout": 0.0,
+  "bos_token_id": 1,
+  "eos_token_id": 2,
+  "head_dim": 32,
+  "hidden_act": "silu",
+  "hidden_size": 1024,
+  "initializer_range": 0.02,
+  "intermediate_size": 4096,
+  "max_position_embeddings": 32768,
+  "model_type": "mistral",
+  "num_attention_heads": 32,
+  "num_hidden_layers": 12,
+  "num_key_value_heads": 8,
+  "rms_norm_eps": 1e-06,
+  "rope_theta": 10000.0,
+  "sliding_window": 32,
+  "tie_word_embeddings": false,
+  "torch_dtype": "bfloat16",
+  "transformers_version": "4.45.2",
+  "use_cache": true,
+  "vocab_size": 32005
+}

generation_config.json ADDED Viewed

	@@ -0,0 +1,6 @@

+{
+  "_from_model_config": true,
+  "bos_token_id": 1,
+  "eos_token_id": 2,
+  "transformers_version": "4.45.2"
+}

model.safetensors ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:9432ee6e0681473a9ed513e43362d9911832f9a5c7faded76f46ec66c55a9d3b
+size 496060688

special_tokens_map.json ADDED Viewed

	@@ -0,0 +1,34 @@

+{
+  "additional_special_tokens": [
+    "<|ASSISTANT|>",
+    "<|USER|>"
+  ],
+  "bos_token": {
+    "content": "<|bos|>",
+    "lstrip": false,
+    "normalized": false,
+    "rstrip": false,
+    "single_word": false
+  },
+  "eos_token": {
+    "content": "<|endoftext|>",
+    "lstrip": false,
+    "normalized": false,
+    "rstrip": false,
+    "single_word": false
+  },
+  "pad_token": {
+    "content": "[PAD]",
+    "lstrip": false,
+    "normalized": false,
+    "rstrip": false,
+    "single_word": false
+  },
+  "unk_token": {
+    "content": "<unk>",
+    "lstrip": false,
+    "normalized": false,
+    "rstrip": false,
+    "single_word": false
+  }
+}

tokenizer.json ADDED Viewed

The diff for this file is too large to render. See raw diff

tokenizer_config.json ADDED Viewed

	@@ -0,0 +1,91 @@

+{
+  "add_bos_token": true,
+  "add_eos_token": false,
+  "add_prefix_space": null,
+  "added_tokens_decoder": {
+    "0": {
+      "content": "<unk>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "1": {
+      "content": "<s>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "2": {
+      "content": "</s>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "32000": {
+      "content": "<|bos|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "32001": {
+      "content": "<|endoftext|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "32002": {
+      "content": "[PAD]",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "32003": {
+      "content": "<|ASSISTANT|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "32004": {
+      "content": "<|USER|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    }
+  },
+  "additional_special_tokens": [
+    "<|ASSISTANT|>",
+    "<|USER|>"
+  ],
+  "bos_token": "<|bos|>",
+  "chat_template": "{%- set ns = namespace(found=false) -%}{%- for message in messages -%}{%- if message['role'] == 'system' -%}{%- set ns.found = true -%}{%- endif -%}{%- endfor -%}{%- for message in messages %}{%- if message['role'] == 'system' -%}{{- '<|im_start|>system\n' + message['content'].rstrip() + '<|im_end|>\n' -}}{%- else -%}{%- if message['role'] == 'user' -%}{{-'<|im_start|>user\n' + message['content'].rstrip() + '<|im_end|>\n'-}}{%- else -%}{{-'<|im_start|>assistant\n' + message['content'] + '<|im_end|>\n' -}}{%- endif -%}{%- endif -%}{%- endfor -%}{%- if add_generation_prompt -%}{{-'<|im_start|>assistant\n'-}}{%- endif -%}",
+  "clean_up_tokenization_spaces": false,
+  "eos_token": "<|endoftext|>",
+  "legacy": true,
+  "max_length": 1536,
+  "model_max_length": 1000000000000000019884624838656,
+  "pad_token": "[PAD]",
+  "sp_model_kwargs": {},
+  "spaces_between_special_tokens": false,
+  "stride": 0,
+  "tokenizer_class": "LlamaTokenizer",
+  "truncation_side": "right",
+  "truncation_strategy": "longest_first",
+  "unk_token": "<unk>",
+  "use_default_system_prompt": true
+}