vllm (pretrained=/root/autodl-tmp/QwQ-32B,add_bos_token=true,max_model_len=4096,dtype=bfloat16), gen_kwargs: (None), limit: 250.0, num_fewshot: 5, batch_size: auto

| Tasks | Version | Filter           | n-shot | Metric      |   | Value |   | Stderr |
|-------|--------:|------------------|-------:|-------------|---|------:|---|-------:|
| gsm8k |       3 | flexible-extract |      5 | exact_match | ↑ | 0.432 | ± | 0.0314 |
|       |         | strict-match     |      5 | exact_match | ↑ | 0.744 | ± | 0.0277 |

vllm (pretrained=/root/autodl-tmp/QwQ-32B,add_bos_token=true,max_model_len=4096,dtype=bfloat16), gen_kwargs: (None), limit: 500.0, num_fewshot: 5, batch_size: auto

| Tasks | Version | Filter           | n-shot | Metric      |   | Value |   | Stderr |
|-------|--------:|------------------|-------:|-------------|---|------:|---|-------:|
| gsm8k |       3 | flexible-extract |      5 | exact_match | ↑ | 0.444 | ± | 0.0222 |
|       |         | strict-match     |      5 | exact_match | ↑ | 0.716 | ± | 0.0202 |


| Groups            | Version | Filter | n-shot | Metric |   | Value  |   | Stderr |
|-------------------|--------:|--------|-------:|--------|---|-------:|---|-------:|
| mmlu              |       2 | none   |        | acc    | ↑ | 0.8140 | ± | 0.0125 |
| - humanities      |       2 | none   |        | acc    | ↑ | 0.8359 | ± | 0.0251 |
| - other           |       2 | none   |        | acc    | ↑ | 0.8103 | ± | 0.0269 |
| - social sciences |       2 | none   |        | acc    | ↑ | 0.8889 | ± | 0.0222 |
| - stem            |       2 | none   |        | acc    | ↑ | 0.7544 | ± | 0.0238 |

vllm (pretrained=/root/autodl-tmp/QwQ-32B-abliterated,add_bos_token=true,max_model_len=4096,dtype=bfloat16), gen_kwargs: (None), limit: 250.0, num_fewshot: 5, batch_size: auto

| Tasks | Version | Filter           | n-shot | Metric      |   | Value |   | Stderr |
|-------|--------:|------------------|-------:|-------------|---|------:|---|-------:|
| gsm8k |       3 | flexible-extract |      5 | exact_match | ↑ | 0.528 | ± | 0.0316 |
|       |         | strict-match     |      5 | exact_match | ↑ | 0.740 | ± | 0.0278 |

vllm (pretrained=/root/autodl-tmp/QwQ-32B-abliterated,add_bos_token=true,max_model_len=4096,dtype=bfloat16), gen_kwargs: (None), limit: 500.0, num_fewshot: 5, batch_size: auto

| Tasks | Version | Filter           | n-shot | Metric      |   | Value |   | Stderr |
|-------|--------:|------------------|-------:|-------------|---|------:|---|-------:|
| gsm8k |       3 | flexible-extract |      5 | exact_match | ↑ | 0.492 | ± | 0.0224 |
|       |         | strict-match     |      5 | exact_match | ↑ | 0.742 | ± | 0.0196 |

vllm (pretrained=/root/autodl-tmp/QwQ-32B-abliterated,add_bos_token=true,max_model_len=4096,dtype=bfloat16,max_num_seqs=3), gen_kwargs: (None), limit: 15.0, num_fewshot: None, batch_size: 1

| Groups            | Version | Filter | n-shot | Metric |   | Value  |   | Stderr |
|-------------------|--------:|--------|-------:|--------|---|-------:|---|-------:|
| mmlu              |       2 | none   |        | acc    | ↑ | 0.8152 | ± | 0.0126 |
| - humanities      |       2 | none   |        | acc    | ↑ | 0.8359 | ± | 0.0253 |
| - other           |       2 | none   |        | acc    | ↑ | 0.8000 | ± | 0.0276 |
| - social sciences |       2 | none   |        | acc    | ↑ | 0.8722 | ± | 0.0240 |
| - stem            |       2 | none   |        | acc    | ↑ | 0.7754 | ± | 0.0232 |

vllm (pretrained=/root/autodl-tmp/QwQ-32B-abliterated-awq,add_bos_token=true,max_model_len=4096,dtype=bfloat16), gen_kwargs: (None), limit: 250.0, num_fewshot: 5, batch_size: auto

| Tasks | Version | Filter           | n-shot | Metric      |   | Value |   | Stderr |
|-------|--------:|------------------|-------:|-------------|---|------:|---|-------:|
| gsm8k |       3 | flexible-extract |      5 | exact_match | ↑ | 0.476 | ± | 0.0316 |
|       |         | strict-match     |      5 | exact_match | ↑ | 0.752 | ± | 0.0274 |

vllm (pretrained=/root/autodl-tmp/QwQ-32B-abliterated-awq,add_bos_token=true,max_model_len=4096,dtype=bfloat16), gen_kwargs: (None), limit: 500.0, num_fewshot: 5, batch_size: auto

| Tasks | Version | Filter           | n-shot | Metric      |   | Value |   | Stderr |
|-------|--------:|------------------|-------:|-------------|---|------:|---|-------:|
| gsm8k |       3 | flexible-extract |      5 | exact_match | ↑ | 0.524 | ± | 0.0224 |
|       |         | strict-match     |      5 | exact_match | ↑ | 0.716 | ± | 0.0202 |


| Groups            | Version | Filter | n-shot | Metric |   | Value  |   | Stderr |
|-------------------|--------:|--------|-------:|--------|---|-------:|---|-------:|
| mmlu              |       2 | none   |        | acc    | ↑ | 0.8023 | ± | 0.0130 |
| - humanities      |       2 | none   |        | acc    | ↑ | 0.8000 | ± | 0.0266 |
| - other           |       2 | none   |        | acc    | ↑ | 0.7949 | ± | 0.0284 |
| - social sciences |       2 | none   |        | acc    | ↑ | 0.8500 | ± | 0.0258 |
| - stem            |       2 | none   |        | acc    | ↑ | 0.7789 | ± | 0.0235 |
