File size: 11,517 Bytes
9e9a431 |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 |
| Tasks |Version| Filter |n-shot| Metric | |Value | |Stderr| |------------------|-------|----------------|-----:|-----------------------|---|-----:|---|------| |gsm8k | 3|flexible-extract| 5|exact_match |↑ |0.8400|± |0.0101| | | |strict-match | 5|exact_match |↑ |0.8378|± |0.0102| |hrm8k | N/A| | | | | | | | | - hrm8k_gsm8k | 1|none | 0|exact_match |↑ |0.8196|± |0.0106| | - hrm8k_ksm | 1|none | 0|exact_match |↑ |0.0511|± |0.0058| | - hrm8k_math | 1|none | 0|exact_match |↑ |0.5539|± |0.0093| | - hrm8k_mmmlu | 1|none | 0|exact_match |↑ |0.5362|± |0.0230| | - hrm8k_omni_math| 1|none | 0|exact_match |↑ |0.1812|± |0.0088| |ifeval | 4|none | 0|inst_level_loose_acc |↑ |0.8753|± | N/A| | | |none | 0|inst_level_strict_acc |↑ |0.8609|± | N/A| | | |none | 0|prompt_level_loose_acc |↑ |0.8244|± |0.0164| | | |none | 0|prompt_level_strict_acc|↑ |0.8078|± |0.0170| | Tasks |Version|Filter|n-shot| Metric | |Value | |Stderr| |-------------------------------|------:|------|-----:|--------|---|-----:|---|------| |haerae | 1|none | |acc |↑ |0.6654|± |0.0140| | | |none | |acc_norm|↑ |0.6654|± |0.0140| | - haerae_general_knowledge | 1|none | 5|acc |↑ |0.4943|± |0.0378| | | |none | 5|acc_norm|↑ |0.4943|± |0.0378| | - haerae_history | 1|none | 5|acc |↑ |0.5585|± |0.0363| | | |none | 5|acc_norm|↑ |0.5585|± |0.0363| | - haerae_loan_word | 1|none | 5|acc |↑ |0.7456|± |0.0336| | | |none | 5|acc_norm|↑ |0.7456|± |0.0336| | - haerae_rare_word | 1|none | 5|acc |↑ |0.7160|± |0.0224| | | |none | 5|acc_norm|↑ |0.7160|± |0.0224| | - haerae_standard_nomenclature| 1|none | 5|acc |↑ |0.7712|± |0.0341| | | |none | 5|acc_norm|↑ |0.7712|± |0.0341| |kobest | 1|none | |acc |↑ |0.7768|± |0.0057| | | |none | |acc_norm|↑ |0.5880|± |0.0220| | | |none | |f1 |↑ |0.7764|± | N/A| | - kobest_boolq | 1|none | 5|acc |↑ |0.9252|± |0.0070| | | |none | 5|f1 |↑ |0.9252|± | N/A| | - kobest_copa | 1|none | 5|acc |↑ |0.6980|± |0.0145| | | |none | 5|f1 |↑ |0.6975|± | N/A| | - kobest_hellaswag | 1|none | 5|acc |↑ |0.4440|± |0.0222| | | |none | 5|acc_norm|↑ |0.5880|± |0.0220| | | |none | 5|f1 |↑ |0.4419|± | N/A| | - kobest_sentineg | 1|none | 5|acc |↑ |0.9622|± |0.0096| | | |none | 5|f1 |↑ |0.9622|± | N/A| | - kobest_wic | 1|none | 5|acc |↑ |0.7476|± |0.0122| | | |none | 5|f1 |↑ |0.7473|± | N/A| |Groups|Version|Filter|n-shot| Metric | |Value | |Stderr| |------|------:|------|------|--------|---|-----:|---|------| |haerae| 1|none | |acc |↑ |0.6654|± |0.0140| | | |none | |acc_norm|↑ |0.6654|± |0.0140| |kobest| 1|none | |acc |↑ |0.7768|± |0.0057| | | |none | |acc_norm|↑ |0.5880|± |0.0220| | | |none | |f1 |↑ |0.7764|± | N/A| vllm (pretrained=/data/public/glim/workspace/mergekit/output,max_model_len=8192,tensor_parallel_size=2,enable_chunked_prefill=True), gen_kwargs: (None), limit: None, num_fewshot: 5, batch_size: auto | Tasks |Version|Filter|n-shot| Metric | |Value | |Stderr| |-----------------------------------------------------------|------:|------|-----:|-----------|---|-----:|---|-----:| |kmmlu_direct | 2|none | |exact_match|↑ |0.5212|± |0.0026| | - kmmlu_direct_applied_science | 2|none | |exact_match|↑ |0.4997|± |0.0046| | - kmmlu_direct_aviation_engineering_and_maintenance | 2|none | 5|exact_match|↑ |0.5190|± |0.0158| | - kmmlu_direct_electronics_engineering | 2|none | 5|exact_match|↑ |0.6420|± |0.0152| | - kmmlu_direct_energy_management | 2|none | 5|exact_match|↑ |0.4160|± |0.0156| | - kmmlu_direct_environmental_science | 2|none | 5|exact_match|↑ |0.3710|± |0.0153| | - kmmlu_direct_gas_technology_and_engineering | 2|none | 5|exact_match|↑ |0.4240|± |0.0156| | - kmmlu_direct_geomatics | 2|none | 5|exact_match|↑ |0.4580|± |0.0158| | - kmmlu_direct_industrial_engineer | 2|none | 5|exact_match|↑ |0.5080|± |0.0158| | - kmmlu_direct_machine_design_and_manufacturing | 2|none | 5|exact_match|↑ |0.5360|± |0.0158| | - kmmlu_direct_maritime_engineering | 2|none | 5|exact_match|↑ |0.5117|± |0.0204| | - kmmlu_direct_nondestructive_testing | 2|none | 5|exact_match|↑ |0.5460|± |0.0158| | - kmmlu_direct_railway_and_automotive_engineering | 2|none | 5|exact_match|↑ |0.4310|± |0.0157| | - kmmlu_direct_telecommunications_and_wireless_technology| 2|none | 5|exact_match|↑ |0.6380|± |0.0152| | - kmmlu_direct_humss | 2|none | |exact_match|↑ |0.5365|± |0.0068| | - kmmlu_direct_accounting | 2|none | 5|exact_match|↑ |0.5400|± |0.0501| | - kmmlu_direct_criminal_law | 2|none | 5|exact_match|↑ |0.3800|± |0.0344| | - kmmlu_direct_economics | 2|none | 5|exact_match|↑ |0.5462|± |0.0438| | - kmmlu_direct_education | 2|none | 5|exact_match|↑ |0.6200|± |0.0488| | - kmmlu_direct_korean_history | 2|none | 5|exact_match|↑ |0.2700|± |0.0446| | - kmmlu_direct_law | 2|none | 5|exact_match|↑ |0.4730|± |0.0158| | - kmmlu_direct_management | 2|none | 5|exact_match|↑ |0.6120|± |0.0154| | - kmmlu_direct_political_science_and_sociology | 2|none | 5|exact_match|↑ |0.5500|± |0.0288| | - kmmlu_direct_psychology | 2|none | 5|exact_match|↑ |0.4590|± |0.0158| | - kmmlu_direct_social_welfare | 2|none | 5|exact_match|↑ |0.6710|± |0.0149| | - kmmlu_direct_taxation | 2|none | 5|exact_match|↑ |0.4100|± |0.0349| | - kmmlu_direct_other | 2|none | |exact_match|↑ |0.5130|± |0.0053| | - kmmlu_direct_agricultural_sciences | 2|none | 5|exact_match|↑ |0.3920|± |0.0154| | - kmmlu_direct_construction | 2|none | 5|exact_match|↑ |0.4320|± |0.0157| | - kmmlu_direct_fashion | 2|none | 5|exact_match|↑ |0.5010|± |0.0158| | - kmmlu_direct_food_processing | 2|none | 5|exact_match|↑ |0.4700|± |0.0158| | - kmmlu_direct_health | 2|none | 5|exact_match|↑ |0.6800|± |0.0469| | - kmmlu_direct_interior_architecture_and_design | 2|none | 5|exact_match|↑ |0.6110|± |0.0154| | - kmmlu_direct_marketing | 2|none | 5|exact_match|↑ |0.8010|± |0.0126| | - kmmlu_direct_patent | 2|none | 5|exact_match|↑ |0.3900|± |0.0490| | - kmmlu_direct_public_safety | 2|none | 5|exact_match|↑ |0.4660|± |0.0158| | - kmmlu_direct_real_estate | 2|none | 5|exact_match|↑ |0.4850|± |0.0354| | - kmmlu_direct_refrigerating_machinery | 2|none | 5|exact_match|↑ |0.4320|± |0.0157| | - kmmlu_direct_stem | 2|none | |exact_match|↑ |0.5455|± |0.0048| | - kmmlu_direct_biology | 2|none | 5|exact_match|↑ |0.4140|± |0.0156| | - kmmlu_direct_chemical_engineering | 2|none | 5|exact_match|↑ |0.5580|± |0.0157| | - kmmlu_direct_chemistry | 2|none | 5|exact_match|↑ |0.5550|± |0.0203| | - kmmlu_direct_civil_engineering | 2|none | 5|exact_match|↑ |0.4750|± |0.0158| | - kmmlu_direct_computer_science | 2|none | 5|exact_match|↑ |0.7780|± |0.0131| | - kmmlu_direct_ecology | 2|none | 5|exact_match|↑ |0.5430|± |0.0158| | - kmmlu_direct_electrical_engineering | 2|none | 5|exact_match|↑ |0.4100|± |0.0156| | - kmmlu_direct_information_technology | 2|none | 5|exact_match|↑ |0.7860|± |0.0130| | - kmmlu_direct_materials_engineering | 2|none | 5|exact_match|↑ |0.5250|± |0.0158| | - kmmlu_direct_math | 2|none | 5|exact_match|↑ |0.3267|± |0.0271| | - kmmlu_direct_mechanical_engineering | 2|none | 5|exact_match|↑ |0.4800|± |0.0158| | Groups |Version|Filter|n-shot| Metric | |Value | |Stderr| |-------------------------------|------:|------|------|-----------|---|-----:|---|-----:| |kmmlu_direct | 2|none | |exact_match|↑ |0.5212|± |0.0026| | - kmmlu_direct_applied_science| 2|none | |exact_match|↑ |0.4997|± |0.0046| | - kmmlu_direct_humss | 2|none | |exact_match|↑ |0.5365|± |0.0068| | - kmmlu_direct_other | 2|none | |exact_match|↑ |0.5130|± |0.0053| | - kmmlu_direct_stem | 2|none | |exact_match|↑ |0.5455|± |0.0048| |