|
{"task_name": "arc_easy::olmes", "task_hash": "c02b46502ed310af2d8f73ddc068f6bd", "model_hash": "03418cf8091a9882619950ffb07429a5", "model_config": {"model": "hf_checkpoints/dclm-pool-1b-1x-h-fasttext-open_lm_1b_swiglutorch-warm5000-lr0p003-wd0p033-cd3e-05-bs256-mult1-seed124-tokens28795904000", "revision": null, "trust_remote_code": null, "max_length": 2048, "model_path": null, "model_type": "hf"}, "task_config": {"task_name": "arc_easy::olmes", "task_core": "arc_easy", "limit": 1000, "split": "test", "num_shots": 5, "fewshot_seed": 1234, "primary_metric": "mc_or_rc", "random_subsample_seed": 1234, "context_kwargs": null, "generation_kwargs": null, "metric_kwargs": null, "native_id_field": "id", "fewshot_source": "OLMES:ARC-Easy", "dataset_path": "ai2_arc", "dataset_name": "ARC-Easy", "use_chat_format": null, "version": 0, "revision": null, "metadata": {"num_tasks": 2, "description": "Best of MC vs RC", "used_mc_or_rc": "rc", "alias": "arc_easy::olmes"}}, "compute_config": {"batch_size": "16", "max_batch_size": 32, "output_dir": "hf_checkpoints/dclm-pool-1b-1x-h-fasttext-open_lm_1b_swiglutorch-warm5000-lr0p003-wd0p033-cd3e-05-bs256-mult1-seed124-tokens28795904000/olmes_fp32/core_9mcqa", "num_recorded_inputs": 3, "save_raw_requests": true, "recompute_metrics": false, "check_datalake": false, "push_datalake": false, "no_datalake": false}, "processing_time": 68.17251586914062, "current_date": "2024-11-19 21:10:13 UTC", "num_instances": 2000, "beaker_info": {}, "metrics": {"primary_score": 0.714}, "task_idx": null} |
|
{"task_name": "arc_challenge::olmes", "task_hash": "11d27cc9476c8b7bf020c4361973aaa5", "model_hash": "03418cf8091a9882619950ffb07429a5", "model_config": {"model": "hf_checkpoints/dclm-pool-1b-1x-h-fasttext-open_lm_1b_swiglutorch-warm5000-lr0p003-wd0p033-cd3e-05-bs256-mult1-seed124-tokens28795904000", "revision": null, "trust_remote_code": null, "max_length": 2048, "model_path": null, "model_type": "hf"}, "task_config": {"task_name": "arc_challenge::olmes", "task_core": "arc_challenge", "limit": null, "split": "test", "num_shots": 5, "fewshot_seed": 1234, "primary_metric": "mc_or_rc", "random_subsample_seed": 1234, "context_kwargs": null, "generation_kwargs": null, "metric_kwargs": null, "native_id_field": "id", "fewshot_source": "OLMES:ARC-Challenge", "dataset_path": "ai2_arc", "dataset_name": "ARC-Challenge", "use_chat_format": null, "version": 0, "revision": null, "metadata": {"num_tasks": 2, "description": "Best of MC vs RC", "used_mc_or_rc": "rc", "alias": "arc_challenge::olmes"}}, "compute_config": {"batch_size": "16", "max_batch_size": 32, "output_dir": "hf_checkpoints/dclm-pool-1b-1x-h-fasttext-open_lm_1b_swiglutorch-warm5000-lr0p003-wd0p033-cd3e-05-bs256-mult1-seed124-tokens28795904000/olmes_fp32/core_9mcqa", "num_recorded_inputs": 3, "save_raw_requests": true, "recompute_metrics": false, "check_datalake": false, "push_datalake": false, "no_datalake": false}, "processing_time": 90.32264161109924, "current_date": "2024-11-19 21:11:22 UTC", "num_instances": 2344, "beaker_info": {}, "metrics": {"primary_score": 0.3984641638225256}, "task_idx": null} |
|
{"task_name": "boolq::olmes", "task_hash": "da41fcb8eeb8d860801247f30fee2e77", "model_hash": "03418cf8091a9882619950ffb07429a5", "model_config": {"model": "hf_checkpoints/dclm-pool-1b-1x-h-fasttext-open_lm_1b_swiglutorch-warm5000-lr0p003-wd0p033-cd3e-05-bs256-mult1-seed124-tokens28795904000", "revision": null, "trust_remote_code": null, "max_length": 2048, "model_path": null, "model_type": "hf"}, "task_config": {"task_name": "boolq::olmes", "task_core": "boolq", "limit": 1000, "split": "validation", "num_shots": 5, "fewshot_seed": 1234, "primary_metric": "mc_or_rc", "random_subsample_seed": 1234, "context_kwargs": null, "generation_kwargs": null, "metric_kwargs": {"uncond_docid_offset": null}, "native_id_field": "idx", "fewshot_source": "OLMES:BoolQ", "dataset_path": "super_glue", "dataset_name": "boolq", "use_chat_format": null, "version": 0, "revision": null, "metadata": {"num_tasks": 2, "description": "Best of MC vs RC", "used_mc_or_rc": "mc", "alias": "boolq::olmes"}}, "compute_config": {"batch_size": "16", "max_batch_size": 32, "output_dir": "hf_checkpoints/dclm-pool-1b-1x-h-fasttext-open_lm_1b_swiglutorch-warm5000-lr0p003-wd0p033-cd3e-05-bs256-mult1-seed124-tokens28795904000/olmes_fp32/core_9mcqa", "num_recorded_inputs": 3, "save_raw_requests": true, "recompute_metrics": false, "check_datalake": false, "push_datalake": false, "no_datalake": false}, "processing_time": 126.84859204292297, "current_date": "2024-11-19 21:12:52 UTC", "num_instances": 2000, "beaker_info": {}, "metrics": {"primary_score": 0.638}, "task_idx": null} |
|
{"task_name": "csqa::olmes", "task_hash": "148a28cc5b845794bb841274ea09e6f6", "model_hash": "03418cf8091a9882619950ffb07429a5", "model_config": {"model": "hf_checkpoints/dclm-pool-1b-1x-h-fasttext-open_lm_1b_swiglutorch-warm5000-lr0p003-wd0p033-cd3e-05-bs256-mult1-seed124-tokens28795904000", "revision": null, "trust_remote_code": null, "max_length": 2048, "model_path": null, "model_type": "hf"}, "task_config": {"task_name": "csqa::olmes", "task_core": "csqa", "limit": null, "split": "validation", "num_shots": 5, "fewshot_seed": 1234, "primary_metric": "mc_or_rc", "random_subsample_seed": 1234, "context_kwargs": null, "generation_kwargs": null, "metric_kwargs": null, "native_id_field": "id", "fewshot_source": "OLMES:commonsense_qa", "dataset_path": "commonsense_qa", "dataset_name": null, "use_chat_format": null, "version": 0, "revision": null, "metadata": {"num_tasks": 2, "description": "Best of MC vs RC", "used_mc_or_rc": "rc", "alias": "csqa::olmes"}}, "compute_config": {"batch_size": "16", "max_batch_size": 32, "output_dir": "hf_checkpoints/dclm-pool-1b-1x-h-fasttext-open_lm_1b_swiglutorch-warm5000-lr0p003-wd0p033-cd3e-05-bs256-mult1-seed124-tokens28795904000/olmes_fp32/core_9mcqa", "num_recorded_inputs": 3, "save_raw_requests": true, "recompute_metrics": false, "check_datalake": false, "push_datalake": false, "no_datalake": false}, "processing_time": 70.88607287406921, "current_date": "2024-11-19 21:14:59 UTC", "num_instances": 2442, "beaker_info": {}, "metrics": {"primary_score": 0.6322686322686323}, "task_idx": null} |
|
{"task_name": "hellaswag::olmes", "task_hash": "f4206b2ad682263984ece6a64d6d9271", "model_hash": "03418cf8091a9882619950ffb07429a5", "model_config": {"model": "hf_checkpoints/dclm-pool-1b-1x-h-fasttext-open_lm_1b_swiglutorch-warm5000-lr0p003-wd0p033-cd3e-05-bs256-mult1-seed124-tokens28795904000", "revision": null, "trust_remote_code": null, "max_length": 2048, "model_path": null, "model_type": "hf"}, "task_config": {"task_name": "hellaswag::olmes", "task_core": "hellaswag", "limit": 1000, "split": "validation", "num_shots": 5, "fewshot_seed": 1234, "primary_metric": "mc_or_rc", "random_subsample_seed": 1234, "context_kwargs": null, "generation_kwargs": null, "metric_kwargs": null, "native_id_field": "ind", "fewshot_source": "OLMES:hellaswag", "dataset_path": "hellaswag", "dataset_name": null, "use_chat_format": null, "version": 0, "revision": null, "metadata": {"num_tasks": 2, "description": "Best of MC vs RC", "used_mc_or_rc": "rc", "alias": "hellaswag::olmes"}}, "compute_config": {"batch_size": "16", "max_batch_size": 32, "output_dir": "hf_checkpoints/dclm-pool-1b-1x-h-fasttext-open_lm_1b_swiglutorch-warm5000-lr0p003-wd0p033-cd3e-05-bs256-mult1-seed124-tokens28795904000/olmes_fp32/core_9mcqa", "num_recorded_inputs": 3, "save_raw_requests": true, "recompute_metrics": false, "check_datalake": false, "push_datalake": false, "no_datalake": false}, "processing_time": 203.0478916168213, "current_date": "2024-11-19 21:16:11 UTC", "num_instances": 2000, "beaker_info": {}, "metrics": {"primary_score": 0.59}, "task_idx": null} |
|
{"task_name": "openbookqa::olmes", "task_hash": "d5df7a559abb9f3a09e5a30be3037e2a", "model_hash": "03418cf8091a9882619950ffb07429a5", "model_config": {"model": "hf_checkpoints/dclm-pool-1b-1x-h-fasttext-open_lm_1b_swiglutorch-warm5000-lr0p003-wd0p033-cd3e-05-bs256-mult1-seed124-tokens28795904000", "revision": null, "trust_remote_code": null, "max_length": 2048, "model_path": null, "model_type": "hf"}, "task_config": {"task_name": "openbookqa::olmes", "task_core": "openbookqa", "limit": null, "split": "test", "num_shots": 5, "fewshot_seed": 1234, "primary_metric": "mc_or_rc", "random_subsample_seed": 1234, "context_kwargs": null, "generation_kwargs": null, "metric_kwargs": null, "native_id_field": "id", "fewshot_source": "OLMES:openbookqa", "dataset_path": "openbookqa", "dataset_name": "main", "use_chat_format": null, "version": 0, "revision": null, "metadata": {"num_tasks": 2, "description": "Best of MC vs RC", "used_mc_or_rc": "rc", "alias": "openbookqa::olmes"}}, "compute_config": {"batch_size": "16", "max_batch_size": 32, "output_dir": "hf_checkpoints/dclm-pool-1b-1x-h-fasttext-open_lm_1b_swiglutorch-warm5000-lr0p003-wd0p033-cd3e-05-bs256-mult1-seed124-tokens28795904000/olmes_fp32/core_9mcqa", "num_recorded_inputs": 3, "save_raw_requests": true, "recompute_metrics": false, "check_datalake": false, "push_datalake": false, "no_datalake": false}, "processing_time": 23.034831047058105, "current_date": "2024-11-19 21:19:33 UTC", "num_instances": 1000, "beaker_info": {}, "metrics": {"primary_score": 0.488}, "task_idx": null} |
|
{"task_name": "piqa::olmes", "task_hash": "9361bd3526bac064874231d85f849e47", "model_hash": "03418cf8091a9882619950ffb07429a5", "model_config": {"model": "hf_checkpoints/dclm-pool-1b-1x-h-fasttext-open_lm_1b_swiglutorch-warm5000-lr0p003-wd0p033-cd3e-05-bs256-mult1-seed124-tokens28795904000", "revision": null, "trust_remote_code": null, "max_length": 2048, "model_path": null, "model_type": "hf"}, "task_config": {"task_name": "piqa::olmes", "task_core": "piqa", "limit": 1000, "split": "validation", "num_shots": 5, "fewshot_seed": 1234, "primary_metric": "mc_or_rc", "random_subsample_seed": 1234, "context_kwargs": null, "generation_kwargs": null, "metric_kwargs": null, "native_id_field": "index", "fewshot_source": "OLMES:piqa", "dataset_path": "piqa", "dataset_name": null, "use_chat_format": null, "version": 0, "revision": null, "metadata": {"num_tasks": 2, "description": "Best of MC vs RC", "used_mc_or_rc": "rc", "alias": "piqa::olmes"}}, "compute_config": {"batch_size": "16", "max_batch_size": 32, "output_dir": "hf_checkpoints/dclm-pool-1b-1x-h-fasttext-open_lm_1b_swiglutorch-warm5000-lr0p003-wd0p033-cd3e-05-bs256-mult1-seed124-tokens28795904000/olmes_fp32/core_9mcqa", "num_recorded_inputs": 3, "save_raw_requests": true, "recompute_metrics": false, "check_datalake": false, "push_datalake": false, "no_datalake": false}, "processing_time": 54.42874765396118, "current_date": "2024-11-19 21:19:56 UTC", "num_instances": 2000, "beaker_info": {}, "metrics": {"primary_score": 0.705}, "task_idx": null} |
|
{"task_name": "socialiqa::olmes", "task_hash": "57d3935fe101216b9f4012980add4fed", "model_hash": "03418cf8091a9882619950ffb07429a5", "model_config": {"model": "hf_checkpoints/dclm-pool-1b-1x-h-fasttext-open_lm_1b_swiglutorch-warm5000-lr0p003-wd0p033-cd3e-05-bs256-mult1-seed124-tokens28795904000", "revision": null, "trust_remote_code": null, "max_length": 2048, "model_path": null, "model_type": "hf"}, "task_config": {"task_name": "socialiqa::olmes", "task_core": "socialiqa", "limit": 1000, "split": "validation", "num_shots": 5, "fewshot_seed": 1234, "primary_metric": "mc_or_rc", "random_subsample_seed": 1234, "context_kwargs": null, "generation_kwargs": null, "metric_kwargs": null, "native_id_field": "index", "fewshot_source": "OLMES:social_i_qa", "dataset_path": "social_i_qa", "dataset_name": null, "use_chat_format": null, "version": 0, "revision": null, "metadata": {"num_tasks": 2, "description": "Best of MC vs RC", "used_mc_or_rc": "rc", "alias": "socialiqa::olmes"}}, "compute_config": {"batch_size": "16", "max_batch_size": 32, "output_dir": "hf_checkpoints/dclm-pool-1b-1x-h-fasttext-open_lm_1b_swiglutorch-warm5000-lr0p003-wd0p033-cd3e-05-bs256-mult1-seed124-tokens28795904000/olmes_fp32/core_9mcqa", "num_recorded_inputs": 3, "save_raw_requests": true, "recompute_metrics": false, "check_datalake": false, "push_datalake": false, "no_datalake": false}, "processing_time": 61.53313398361206, "current_date": "2024-11-19 21:20:51 UTC", "num_instances": 2000, "beaker_info": {}, "metrics": {"primary_score": 0.507}, "task_idx": null} |
|
{"task_name": "winogrande::olmes", "task_hash": "011ccb1214c83646d4781be4fc32f744", "model_hash": "03418cf8091a9882619950ffb07429a5", "model_config": {"model": "hf_checkpoints/dclm-pool-1b-1x-h-fasttext-open_lm_1b_swiglutorch-warm5000-lr0p003-wd0p033-cd3e-05-bs256-mult1-seed124-tokens28795904000", "revision": null, "trust_remote_code": null, "max_length": 2048, "model_path": null, "model_type": "hf"}, "task_config": {"task_name": "winogrande::olmes", "task_core": "winogrande", "limit": null, "split": "validation", "num_shots": 5, "fewshot_seed": 1234, "primary_metric": "mc_or_rc", "random_subsample_seed": 1234, "context_kwargs": null, "generation_kwargs": null, "metric_kwargs": null, "native_id_field": "index", "fewshot_source": "OLMES:winogrande", "dataset_path": "winogrande", "dataset_name": "winogrande_xl", "use_chat_format": null, "version": 0, "revision": null, "metadata": {"num_tasks": 2, "description": "Best of MC vs RC", "used_mc_or_rc": "rc", "alias": "winogrande::olmes"}}, "compute_config": {"batch_size": "16", "max_batch_size": 32, "output_dir": "hf_checkpoints/dclm-pool-1b-1x-h-fasttext-open_lm_1b_swiglutorch-warm5000-lr0p003-wd0p033-cd3e-05-bs256-mult1-seed124-tokens28795904000/olmes_fp32/core_9mcqa", "num_recorded_inputs": 3, "save_raw_requests": true, "recompute_metrics": false, "check_datalake": false, "push_datalake": false, "no_datalake": false}, "processing_time": 42.78815579414368, "current_date": "2024-11-19 21:21:52 UTC", "num_instances": 2534, "beaker_info": {}, "metrics": {"primary_score": 0.5880031570639306}, "task_idx": null} |
|
{"task_name": "core_9mcqa::olmes", "task_hash": "1b3207764f3554af7e5d19097a4b7263", "model_hash": "03418cf8091a9882619950ffb07429a5", "model_config": {"model": "hf_checkpoints/dclm-pool-1b-1x-h-fasttext-open_lm_1b_swiglutorch-warm5000-lr0p003-wd0p033-cd3e-05-bs256-mult1-seed124-tokens28795904000", "revision": null, "trust_remote_code": null, "max_length": 2048, "model_path": null, "model_type": "hf"}, "task_config": {"task_name": "core_9mcqa::olmes", "task_core": "arc_easy", "limit": 1000, "split": "test", "num_shots": 5, "fewshot_seed": 1234, "primary_metric": "macro", "random_subsample_seed": 1234, "context_kwargs": null, "generation_kwargs": null, "metric_kwargs": null, "native_id_field": "id", "fewshot_source": "OLMES:ARC-Easy", "dataset_path": "ai2_arc", "dataset_name": "ARC-Easy", "use_chat_format": null, "version": 0, "revision": null, "metadata": {"num_tasks": 9, "description": "Aggregate metric", "alias": "core_9mcqa::olmes"}}, "compute_config": {"batch_size": "16", "max_batch_size": 32, "output_dir": "hf_checkpoints/dclm-pool-1b-1x-h-fasttext-open_lm_1b_swiglutorch-warm5000-lr0p003-wd0p033-cd3e-05-bs256-mult1-seed124-tokens28795904000/olmes_fp32/core_9mcqa", "num_recorded_inputs": 3, "save_raw_requests": true, "recompute_metrics": false, "check_datalake": false, "push_datalake": false, "no_datalake": false}, "processing_time": 741.0625824928284, "current_date": "2024-11-19 21:10:13 UTC", "num_instances": 18320, "beaker_info": {}, "metrics": {"primary_score_micro": 0.5875545851528384, "primary_score_macro": 0.584526217017232, "primary_score": 0.584526217017232}, "task_idx": null} |
|
{"task_name": "core_9mcqa:rc::olmes", "task_hash": "9fcc2b2273b1681109643b68a8545dc0", "model_hash": "03418cf8091a9882619950ffb07429a5", "model_config": {"model": "hf_checkpoints/dclm-pool-1b-1x-h-fasttext-open_lm_1b_swiglutorch-warm5000-lr0p003-wd0p033-cd3e-05-bs256-mult1-seed124-tokens28795904000", "revision": null, "trust_remote_code": null, "max_length": 2048, "model_path": null, "model_type": "hf"}, "task_config": {"task_name": "core_9mcqa:rc::olmes", "task_core": "arc_easy", "limit": 1000, "split": "test", "num_shots": 5, "fewshot_seed": 1234, "primary_metric": "macro", "random_subsample_seed": 1234, "context_kwargs": {"description": null}, "generation_kwargs": null, "metric_kwargs": {"uncond_docid_offset": 1000000}, "native_id_field": "id", "fewshot_source": "OLMES:ARC-Easy", "dataset_path": "ai2_arc", "dataset_name": "ARC-Easy", "use_chat_format": null, "version": 0, "revision": null, "metadata": {"num_tasks": 9, "description": "Aggregate metric", "alias": "core_9mcqa:rc::olmes"}}, "compute_config": {"batch_size": "16", "max_batch_size": 32, "output_dir": "hf_checkpoints/dclm-pool-1b-1x-h-fasttext-open_lm_1b_swiglutorch-warm5000-lr0p003-wd0p033-cd3e-05-bs256-mult1-seed124-tokens28795904000/olmes_fp32/core_9mcqa", "num_recorded_inputs": 3, "save_raw_requests": true, "recompute_metrics": false, "check_datalake": false, "push_datalake": false, "no_datalake": false}, "processing_time": 440.35932326316833, "current_date": "2024-11-19 21:10:37 UTC", "num_instances": 9160, "beaker_info": {}, "metrics": {"acc_per_char_micro": 0.5692139737991266, "acc_per_char_macro": 0.5589227242908723, "correct_loss_per_char_micro": 0.5937509490662316, "correct_loss_per_char_macro": 0.6029771456048766, "primary_score_micro": 0.5772925764192139, "primary_score_macro": 0.5740817725727875, "incorrect_loss_per_token_micro": 3.7928495433620943, "incorrect_loss_per_token_macro": 3.8230539499749536, "acc_raw_micro": 0.5324235807860263, "acc_raw_macro": 0.517849553384937, "incorrect_loss_per_char_micro": 0.7578683910506546, "incorrect_loss_per_char_macro": 0.7611783181725845, "correct_loss_per_token_micro": 2.8967751889357443, "correct_loss_per_token_macro": 2.9568383042246413, "acc_per_token_micro": 0.5605895196506551, "acc_per_token_macro": 0.5529338517695767, "correct_loss_raw_micro": 23.022728316323207, "correct_loss_raw_macro": 23.347966431864265, "incorrect_loss_raw_micro": 26.831048833962193, "incorrect_loss_raw_macro": 27.03290257617091, "primary_score": 0.5740817725727875}, "task_idx": null} |
|
{"task_name": "arc_easy:mc", "task_hash": "ee0799a85be6dba03938d8980a14bc3a", "model_hash": "03418cf8091a9882619950ffb07429a5", "model_config": {"model": "hf_checkpoints/dclm-pool-1b-1x-h-fasttext-open_lm_1b_swiglutorch-warm5000-lr0p003-wd0p033-cd3e-05-bs256-mult1-seed124-tokens28795904000", "revision": null, "trust_remote_code": null, "max_length": 2048, "model_path": null, "model_type": "hf"}, "task_config": {"task_name": "arc_easy:mc", "task_core": "arc_easy", "limit": 1000, "split": "test", "num_shots": 5, "fewshot_seed": 1234, "primary_metric": "acc_raw", "random_subsample_seed": 1234, "context_kwargs": null, "generation_kwargs": null, "metric_kwargs": null, "native_id_field": "id", "fewshot_source": "OLMES:ARC-Easy", "dataset_path": "ai2_arc", "dataset_name": "ARC-Easy", "use_chat_format": null, "version": 0, "revision": null, "metadata": {"description": "ARC-Easy (MC) using OLMES-v0.1", "regimes": ["OLMES-v0.1"], "alias": "arc_easy:mc::olmes"}}, "compute_config": {"batch_size": "16", "max_batch_size": 32, "output_dir": "hf_checkpoints/dclm-pool-1b-1x-h-fasttext-open_lm_1b_swiglutorch-warm5000-lr0p003-wd0p033-cd3e-05-bs256-mult1-seed124-tokens28795904000/olmes_fp32/core_9mcqa", "num_recorded_inputs": 3, "save_raw_requests": true, "recompute_metrics": false, "check_datalake": false, "push_datalake": false, "no_datalake": false}, "processing_time": 23.879162788391113, "current_date": "2024-11-19 21:10:13 UTC", "num_instances": 1000, "beaker_info": {}, "metrics": {"acc_raw": 0.246, "acc_per_token": 0.246, "acc_per_char": 0.246, "correct_loss_raw": 1.3999989740848542, "incorrect_loss_raw": 1.397326016485691, "correct_loss_per_token": 1.3999989740848542, "incorrect_loss_per_token": 1.397326016485691, "correct_loss_per_char": 0.6999994870424271, "incorrect_loss_per_char": 0.6986630082428456, "primary_score": 0.246}, "task_idx": 0} |
|
{"task_name": "arc_easy", "task_hash": "ed6704ae05bb260463787386ca9d78ee", "model_hash": "03418cf8091a9882619950ffb07429a5", "model_config": {"model": "hf_checkpoints/dclm-pool-1b-1x-h-fasttext-open_lm_1b_swiglutorch-warm5000-lr0p003-wd0p033-cd3e-05-bs256-mult1-seed124-tokens28795904000", "revision": null, "trust_remote_code": null, "max_length": 2048, "model_path": null, "model_type": "hf"}, "task_config": {"task_name": "arc_easy", "task_core": "arc_easy", "limit": 1000, "split": "test", "num_shots": 5, "fewshot_seed": 1234, "primary_metric": "acc_per_char", "random_subsample_seed": 1234, "context_kwargs": {"description": null}, "generation_kwargs": null, "metric_kwargs": {"uncond_docid_offset": 1000000}, "native_id_field": "id", "fewshot_source": "OLMES:ARC-Easy", "dataset_path": "ai2_arc", "dataset_name": "ARC-Easy", "use_chat_format": null, "version": 0, "revision": null, "metadata": {"description": "ARC-Easy (RC) using OLMES-v0.1", "regimes": ["OLMES-v0.1"], "alias": "arc_easy:rc::olmes"}}, "compute_config": {"batch_size": "16", "max_batch_size": 32, "output_dir": "hf_checkpoints/dclm-pool-1b-1x-h-fasttext-open_lm_1b_swiglutorch-warm5000-lr0p003-wd0p033-cd3e-05-bs256-mult1-seed124-tokens28795904000/olmes_fp32/core_9mcqa", "num_recorded_inputs": 3, "save_raw_requests": true, "recompute_metrics": false, "check_datalake": false, "push_datalake": false, "no_datalake": false}, "processing_time": 44.29335308074951, "current_date": "2024-11-19 21:10:37 UTC", "num_instances": 1000, "beaker_info": {}, "metrics": {"acc_raw": 0.694, "acc_per_token": 0.693, "acc_per_char": 0.714, "correct_loss_raw": 9.330107917129993, "incorrect_loss_raw": 14.000815374672404, "correct_loss_per_token": 2.2515681965954655, "incorrect_loss_per_token": 3.9243185699953256, "correct_loss_per_char": 0.41603878481733886, "incorrect_loss_per_char": 0.7056315904639505, "acc_uncond": 0.646, "correct_loss_uncond": -13.524798529326915, "incorrect_loss_uncond": -10.111074344178036, "primary_score": 0.714}, "task_idx": 1} |
|
{"task_name": "arc_challenge:mc", "task_hash": "867052288d273ab0fe8a12a6c5c548e6", "model_hash": "03418cf8091a9882619950ffb07429a5", "model_config": {"model": "hf_checkpoints/dclm-pool-1b-1x-h-fasttext-open_lm_1b_swiglutorch-warm5000-lr0p003-wd0p033-cd3e-05-bs256-mult1-seed124-tokens28795904000", "revision": null, "trust_remote_code": null, "max_length": 2048, "model_path": null, "model_type": "hf"}, "task_config": {"task_name": "arc_challenge:mc", "task_core": "arc_challenge", "limit": null, "split": "test", "num_shots": 5, "fewshot_seed": 1234, "primary_metric": "acc_raw", "random_subsample_seed": 1234, "context_kwargs": null, "generation_kwargs": null, "metric_kwargs": null, "native_id_field": "id", "fewshot_source": "OLMES:ARC-Challenge", "dataset_path": "ai2_arc", "dataset_name": "ARC-Challenge", "use_chat_format": null, "version": 0, "revision": null, "metadata": {"regimes": ["OLMES-v0.1"], "alias": "arc_challenge:mc::olmes"}}, "compute_config": {"batch_size": "16", "max_batch_size": 32, "output_dir": "hf_checkpoints/dclm-pool-1b-1x-h-fasttext-open_lm_1b_swiglutorch-warm5000-lr0p003-wd0p033-cd3e-05-bs256-mult1-seed124-tokens28795904000/olmes_fp32/core_9mcqa", "num_recorded_inputs": 3, "save_raw_requests": true, "recompute_metrics": false, "check_datalake": false, "push_datalake": false, "no_datalake": false}, "processing_time": 31.17244529724121, "current_date": "2024-11-19 21:11:22 UTC", "num_instances": 1172, "beaker_info": {}, "metrics": {"acc_raw": 0.25, "acc_per_token": 0.25, "acc_per_char": 0.25, "correct_loss_raw": 1.39782169339193, "incorrect_loss_raw": 1.3961642627880828, "correct_loss_per_token": 1.39782169339193, "incorrect_loss_per_token": 1.3961642627880828, "correct_loss_per_char": 0.698910846695965, "incorrect_loss_per_char": 0.6980821313940414, "primary_score": 0.25}, "task_idx": 2} |
|
{"task_name": "arc_challenge", "task_hash": "3059c7ebe96cbc93600127cf3237e8e1", "model_hash": "03418cf8091a9882619950ffb07429a5", "model_config": {"model": "hf_checkpoints/dclm-pool-1b-1x-h-fasttext-open_lm_1b_swiglutorch-warm5000-lr0p003-wd0p033-cd3e-05-bs256-mult1-seed124-tokens28795904000", "revision": null, "trust_remote_code": null, "max_length": 2048, "model_path": null, "model_type": "hf"}, "task_config": {"task_name": "arc_challenge", "task_core": "arc_challenge", "limit": null, "split": "test", "num_shots": 5, "fewshot_seed": 1234, "primary_metric": "acc_uncond", "random_subsample_seed": 1234, "context_kwargs": null, "generation_kwargs": null, "metric_kwargs": {"uncond_docid_offset": 1000000}, "native_id_field": "id", "fewshot_source": "OLMES:ARC-Challenge", "dataset_path": "ai2_arc", "dataset_name": "ARC-Challenge", "use_chat_format": null, "version": 0, "revision": null, "metadata": {"regimes": ["OLMES-v0.1"], "alias": "arc_challenge:rc::olmes"}}, "compute_config": {"batch_size": "16", "max_batch_size": 32, "output_dir": "hf_checkpoints/dclm-pool-1b-1x-h-fasttext-open_lm_1b_swiglutorch-warm5000-lr0p003-wd0p033-cd3e-05-bs256-mult1-seed124-tokens28795904000/olmes_fp32/core_9mcqa", "num_recorded_inputs": 3, "save_raw_requests": true, "recompute_metrics": false, "check_datalake": false, "push_datalake": false, "no_datalake": false}, "processing_time": 59.15019631385803, "current_date": "2024-11-19 21:11:53 UTC", "num_instances": 1172, "beaker_info": {}, "metrics": {"acc_raw": 0.33532423208191126, "acc_per_token": 0.36945392491467577, "acc_per_char": 0.36177474402730375, "correct_loss_raw": 15.167813213612032, "incorrect_loss_raw": 16.08963039536049, "correct_loss_per_token": 2.7816373910187298, "incorrect_loss_per_token": 3.1465591771719446, "correct_loss_per_char": 0.5648754563167282, "incorrect_loss_per_char": 0.6338645292376848, "acc_uncond": 0.3984641638225256, "correct_loss_uncond": -13.69655239866862, "incorrect_loss_uncond": -12.153245893880248, "primary_score": 0.3984641638225256}, "task_idx": 3} |
|
{"task_name": "boolq:mc", "task_hash": "e6a86116b0573ade267bddc6598da6f4", "model_hash": "03418cf8091a9882619950ffb07429a5", "model_config": {"model": "hf_checkpoints/dclm-pool-1b-1x-h-fasttext-open_lm_1b_swiglutorch-warm5000-lr0p003-wd0p033-cd3e-05-bs256-mult1-seed124-tokens28795904000", "revision": null, "trust_remote_code": null, "max_length": 2048, "model_path": null, "model_type": "hf"}, "task_config": {"task_name": "boolq:mc", "task_core": "boolq", "limit": 1000, "split": "validation", "num_shots": 5, "fewshot_seed": 1234, "primary_metric": "acc_raw", "random_subsample_seed": 1234, "context_kwargs": null, "generation_kwargs": null, "metric_kwargs": {"uncond_docid_offset": null}, "native_id_field": "idx", "fewshot_source": "OLMES:BoolQ", "dataset_path": "super_glue", "dataset_name": "boolq", "use_chat_format": null, "version": 0, "revision": null, "metadata": {"regimes": ["OLMES-v0.1"], "alias": "boolq:mc::olmes"}}, "compute_config": {"batch_size": "16", "max_batch_size": 32, "output_dir": "hf_checkpoints/dclm-pool-1b-1x-h-fasttext-open_lm_1b_swiglutorch-warm5000-lr0p003-wd0p033-cd3e-05-bs256-mult1-seed124-tokens28795904000/olmes_fp32/core_9mcqa", "num_recorded_inputs": 3, "save_raw_requests": true, "recompute_metrics": false, "check_datalake": false, "push_datalake": false, "no_datalake": false}, "processing_time": 64.98181390762329, "current_date": "2024-11-19 21:12:52 UTC", "num_instances": 1000, "beaker_info": {}, "metrics": {"acc_raw": 0.638, "acc_per_token": 0.638, "acc_per_char": 0.638, "correct_loss_raw": 0.7280895302966237, "incorrect_loss_raw": 1.1170413957461716, "correct_loss_per_token": 0.7280895302966237, "incorrect_loss_per_token": 1.1170413957461716, "correct_loss_per_char": 0.36404476514831186, "incorrect_loss_per_char": 0.5585206978730858, "primary_score": 0.638}, "task_idx": 4} |
|
{"task_name": "boolq", "task_hash": "116b9d7a3c43d4d92986e54a7cec0bd5", "model_hash": "03418cf8091a9882619950ffb07429a5", "model_config": {"model": "hf_checkpoints/dclm-pool-1b-1x-h-fasttext-open_lm_1b_swiglutorch-warm5000-lr0p003-wd0p033-cd3e-05-bs256-mult1-seed124-tokens28795904000", "revision": null, "trust_remote_code": null, "max_length": 2048, "model_path": null, "model_type": "hf"}, "task_config": {"task_name": "boolq", "task_core": "boolq", "limit": 1000, "split": "validation", "num_shots": 5, "fewshot_seed": 1234, "primary_metric": "acc_raw", "random_subsample_seed": 1234, "context_kwargs": null, "generation_kwargs": null, "metric_kwargs": {"uncond_docid_offset": null}, "native_id_field": "idx", "fewshot_source": "OLMES:BoolQ", "dataset_path": "super_glue", "dataset_name": "boolq", "use_chat_format": null, "version": 0, "revision": null, "metadata": {"regimes": ["OLMES-v0.1"], "alias": "boolq:rc::olmes"}}, "compute_config": {"batch_size": "16", "max_batch_size": 32, "output_dir": "hf_checkpoints/dclm-pool-1b-1x-h-fasttext-open_lm_1b_swiglutorch-warm5000-lr0p003-wd0p033-cd3e-05-bs256-mult1-seed124-tokens28795904000/olmes_fp32/core_9mcqa", "num_recorded_inputs": 3, "save_raw_requests": true, "recompute_metrics": false, "check_datalake": false, "push_datalake": false, "no_datalake": false}, "processing_time": 61.86677813529968, "current_date": "2024-11-19 21:13:57 UTC", "num_instances": 1000, "beaker_info": {}, "metrics": {"acc_raw": 0.544, "acc_per_token": 0.544, "acc_per_char": 0.603, "correct_loss_raw": 0.7550902778506279, "incorrect_loss_raw": 0.8332378754168749, "correct_loss_per_token": 0.7550902778506279, "incorrect_loss_per_token": 0.8332378754168749, "correct_loss_per_char": 0.2051628428623079, "incorrect_loss_per_char": 0.2452210317080221, "primary_score": 0.544}, "task_idx": 5} |
|
{"task_name": "csqa:mc", "task_hash": "7dd00b56a8058d62c908535d927b9cda", "model_hash": "03418cf8091a9882619950ffb07429a5", "model_config": {"model": "hf_checkpoints/dclm-pool-1b-1x-h-fasttext-open_lm_1b_swiglutorch-warm5000-lr0p003-wd0p033-cd3e-05-bs256-mult1-seed124-tokens28795904000", "revision": null, "trust_remote_code": null, "max_length": 2048, "model_path": null, "model_type": "hf"}, "task_config": {"task_name": "csqa:mc", "task_core": "csqa", "limit": null, "split": "validation", "num_shots": 5, "fewshot_seed": 1234, "primary_metric": "acc_raw", "random_subsample_seed": 1234, "context_kwargs": null, "generation_kwargs": null, "metric_kwargs": null, "native_id_field": "id", "fewshot_source": "OLMES:commonsense_qa", "dataset_path": "commonsense_qa", "dataset_name": null, "use_chat_format": null, "version": 0, "revision": null, "metadata": {"regimes": ["OLMES-v0.1"], "alias": "csqa:mc::olmes"}}, "compute_config": {"batch_size": "16", "max_batch_size": 32, "output_dir": "hf_checkpoints/dclm-pool-1b-1x-h-fasttext-open_lm_1b_swiglutorch-warm5000-lr0p003-wd0p033-cd3e-05-bs256-mult1-seed124-tokens28795904000/olmes_fp32/core_9mcqa", "num_recorded_inputs": 3, "save_raw_requests": true, "recompute_metrics": false, "check_datalake": false, "push_datalake": false, "no_datalake": false}, "processing_time": 27.32917022705078, "current_date": "2024-11-19 21:14:59 UTC", "num_instances": 1221, "beaker_info": {}, "metrics": {"acc_raw": 0.19492219492219492, "acc_per_token": 0.19492219492219492, "acc_per_char": 0.19492219492219492, "correct_loss_raw": 1.6432164040478794, "incorrect_loss_raw": 1.6387123266367714, "correct_loss_per_token": 1.6432164040478794, "incorrect_loss_per_token": 1.6387123266367714, "correct_loss_per_char": 0.8216082020239397, "incorrect_loss_per_char": 0.8193561633183857, "primary_score": 0.19492219492219492}, "task_idx": 6} |
|
{"task_name": "csqa", "task_hash": "648cdcc5233e8fead60944b3946367f7", "model_hash": "03418cf8091a9882619950ffb07429a5", "model_config": {"model": "hf_checkpoints/dclm-pool-1b-1x-h-fasttext-open_lm_1b_swiglutorch-warm5000-lr0p003-wd0p033-cd3e-05-bs256-mult1-seed124-tokens28795904000", "revision": null, "trust_remote_code": null, "max_length": 2048, "model_path": null, "model_type": "hf"}, "task_config": {"task_name": "csqa", "task_core": "csqa", "limit": null, "split": "validation", "num_shots": 5, "fewshot_seed": 1234, "primary_metric": "acc_uncond", "random_subsample_seed": 1234, "context_kwargs": null, "generation_kwargs": null, "metric_kwargs": {"uncond_docid_offset": 1000000}, "native_id_field": "id", "fewshot_source": "OLMES:commonsense_qa", "dataset_path": "commonsense_qa", "dataset_name": null, "use_chat_format": null, "version": 0, "revision": null, "metadata": {"regimes": ["OLMES-v0.1"], "alias": "csqa:rc::olmes"}}, "compute_config": {"batch_size": "16", "max_batch_size": 32, "output_dir": "hf_checkpoints/dclm-pool-1b-1x-h-fasttext-open_lm_1b_swiglutorch-warm5000-lr0p003-wd0p033-cd3e-05-bs256-mult1-seed124-tokens28795904000/olmes_fp32/core_9mcqa", "num_recorded_inputs": 3, "save_raw_requests": true, "recompute_metrics": false, "check_datalake": false, "push_datalake": false, "no_datalake": false}, "processing_time": 43.55690264701843, "current_date": "2024-11-19 21:15:26 UTC", "num_instances": 1221, "beaker_info": {}, "metrics": {"acc_raw": 0.5913185913185913, "acc_per_token": 0.583947583947584, "acc_per_char": 0.6175266175266175, "correct_loss_raw": 6.534973341700096, "incorrect_loss_raw": 11.175633807903816, "correct_loss_per_token": 4.077766012058472, "incorrect_loss_per_token": 7.26478272882345, "correct_loss_per_char": 0.6641299097690206, "incorrect_loss_per_char": 1.2084717048723157, "acc_uncond": 0.6322686322686323, "correct_loss_uncond": -9.932422045554224, "incorrect_loss_uncond": -5.273424961569854, "primary_score": 0.6322686322686323}, "task_idx": 7} |
|
{"task_name": "hellaswag:mc", "task_hash": "75631579605ae5f677bf3e10716878f8", "model_hash": "03418cf8091a9882619950ffb07429a5", "model_config": {"model": "hf_checkpoints/dclm-pool-1b-1x-h-fasttext-open_lm_1b_swiglutorch-warm5000-lr0p003-wd0p033-cd3e-05-bs256-mult1-seed124-tokens28795904000", "revision": null, "trust_remote_code": null, "max_length": 2048, "model_path": null, "model_type": "hf"}, "task_config": {"task_name": "hellaswag:mc", "task_core": "hellaswag", "limit": 1000, "split": "validation", "num_shots": 5, "fewshot_seed": 1234, "primary_metric": "acc_raw", "random_subsample_seed": 1234, "context_kwargs": null, "generation_kwargs": null, "metric_kwargs": null, "native_id_field": "ind", "fewshot_source": "OLMES:hellaswag", "dataset_path": "hellaswag", "dataset_name": null, "use_chat_format": null, "version": 0, "revision": null, "metadata": {"regimes": ["OLMES-v0.1"], "alias": "hellaswag:mc::olmes"}}, "compute_config": {"batch_size": "16", "max_batch_size": 32, "output_dir": "hf_checkpoints/dclm-pool-1b-1x-h-fasttext-open_lm_1b_swiglutorch-warm5000-lr0p003-wd0p033-cd3e-05-bs256-mult1-seed124-tokens28795904000/olmes_fp32/core_9mcqa", "num_recorded_inputs": 3, "save_raw_requests": true, "recompute_metrics": false, "check_datalake": false, "push_datalake": false, "no_datalake": false}, "processing_time": 77.1323413848877, "current_date": "2024-11-19 21:16:11 UTC", "num_instances": 1000, "beaker_info": {}, "metrics": {"acc_raw": 0.265, "acc_per_token": 0.265, "acc_per_char": 0.265, "correct_loss_raw": 1.4350044332146645, "incorrect_loss_raw": 1.4458212533990522, "correct_loss_per_token": 1.4350044332146645, "incorrect_loss_per_token": 1.4458212533990522, "correct_loss_per_char": 0.7175022166073323, "incorrect_loss_per_char": 0.7229106266995261, "primary_score": 0.265}, "task_idx": 8} |
|
{"task_name": "hellaswag", "task_hash": "8312d0c6fac4c6da5cc98a431402ea60", "model_hash": "03418cf8091a9882619950ffb07429a5", "model_config": {"model": "hf_checkpoints/dclm-pool-1b-1x-h-fasttext-open_lm_1b_swiglutorch-warm5000-lr0p003-wd0p033-cd3e-05-bs256-mult1-seed124-tokens28795904000", "revision": null, "trust_remote_code": null, "max_length": 2048, "model_path": null, "model_type": "hf"}, "task_config": {"task_name": "hellaswag", "task_core": "hellaswag", "limit": 1000, "split": "validation", "num_shots": 5, "fewshot_seed": 1234, "primary_metric": "acc_per_char", "random_subsample_seed": 1234, "context_kwargs": null, "generation_kwargs": null, "metric_kwargs": {"uncond_docid_offset": 1000000}, "native_id_field": "ind", "fewshot_source": "OLMES:hellaswag", "dataset_path": "hellaswag", "dataset_name": null, "use_chat_format": null, "version": 0, "revision": null, "metadata": {"regimes": ["OLMES-v0.1"], "alias": "hellaswag:rc::olmes"}}, "compute_config": {"batch_size": "16", "max_batch_size": 32, "output_dir": "hf_checkpoints/dclm-pool-1b-1x-h-fasttext-open_lm_1b_swiglutorch-warm5000-lr0p003-wd0p033-cd3e-05-bs256-mult1-seed124-tokens28795904000/olmes_fp32/core_9mcqa", "num_recorded_inputs": 3, "save_raw_requests": true, "recompute_metrics": false, "check_datalake": false, "push_datalake": false, "no_datalake": false}, "processing_time": 125.9155502319336, "current_date": "2024-11-19 21:17:28 UTC", "num_instances": 1000, "beaker_info": {}, "metrics": {"acc_raw": 0.469, "acc_per_token": 0.592, "acc_per_char": 0.59, "correct_loss_raw": 72.27628855609893, "incorrect_loss_raw": 89.32188357480365, "correct_loss_per_token": 2.3943869034698344, "incorrect_loss_per_token": 2.9809331095903073, "correct_loss_per_char": 0.5284280503451793, "incorrect_loss_per_char": 0.6609210587038847, "acc_uncond": 0.478, "correct_loss_uncond": -26.180681183815004, "incorrect_loss_uncond": -20.375546882629408, "primary_score": 0.59}, "task_idx": 9} |
|
{"task_name": "openbookqa:mc", "task_hash": "aec5918df9c1126cd5bd8e2000fae9f7", "model_hash": "03418cf8091a9882619950ffb07429a5", "model_config": {"model": "hf_checkpoints/dclm-pool-1b-1x-h-fasttext-open_lm_1b_swiglutorch-warm5000-lr0p003-wd0p033-cd3e-05-bs256-mult1-seed124-tokens28795904000", "revision": null, "trust_remote_code": null, "max_length": 2048, "model_path": null, "model_type": "hf"}, "task_config": {"task_name": "openbookqa:mc", "task_core": "openbookqa", "limit": null, "split": "test", "num_shots": 5, "fewshot_seed": 1234, "primary_metric": "acc_raw", "random_subsample_seed": 1234, "context_kwargs": null, "generation_kwargs": null, "metric_kwargs": null, "native_id_field": "id", "fewshot_source": "OLMES:openbookqa", "dataset_path": "openbookqa", "dataset_name": "main", "use_chat_format": null, "version": 0, "revision": null, "metadata": {"regimes": ["OLMES-v0.1"], "alias": "openbookqa:mc::olmes"}}, "compute_config": {"batch_size": "16", "max_batch_size": 32, "output_dir": "hf_checkpoints/dclm-pool-1b-1x-h-fasttext-open_lm_1b_swiglutorch-warm5000-lr0p003-wd0p033-cd3e-05-bs256-mult1-seed124-tokens28795904000/olmes_fp32/core_9mcqa", "num_recorded_inputs": 3, "save_raw_requests": true, "recompute_metrics": false, "check_datalake": false, "push_datalake": false, "no_datalake": false}, "processing_time": 9.500998497009277, "current_date": "2024-11-19 21:19:33 UTC", "num_instances": 500, "beaker_info": {}, "metrics": {"acc_raw": 0.27, "acc_per_token": 0.27, "acc_per_char": 0.27, "correct_loss_raw": 1.3968519463539124, "incorrect_loss_raw": 1.4042580502430602, "correct_loss_per_token": 1.3968519463539124, "incorrect_loss_per_token": 1.4042580502430602, "correct_loss_per_char": 0.6984259731769562, "incorrect_loss_per_char": 0.7021290251215301, "primary_score": 0.27}, "task_idx": 10} |
|
{"task_name": "openbookqa", "task_hash": "bcd3c6e0e23954870d75bd4cd800afc9", "model_hash": "03418cf8091a9882619950ffb07429a5", "model_config": {"model": "hf_checkpoints/dclm-pool-1b-1x-h-fasttext-open_lm_1b_swiglutorch-warm5000-lr0p003-wd0p033-cd3e-05-bs256-mult1-seed124-tokens28795904000", "revision": null, "trust_remote_code": null, "max_length": 2048, "model_path": null, "model_type": "hf"}, "task_config": {"task_name": "openbookqa", "task_core": "openbookqa", "limit": null, "split": "test", "num_shots": 5, "fewshot_seed": 1234, "primary_metric": "acc_uncond", "random_subsample_seed": 1234, "context_kwargs": {"no_prefix": false}, "generation_kwargs": null, "metric_kwargs": {"uncond_docid_offset": 1000000}, "native_id_field": "id", "fewshot_source": "OLMES:openbookqa", "dataset_path": "openbookqa", "dataset_name": "main", "use_chat_format": null, "version": 0, "revision": null, "metadata": {"regimes": ["OLMES-v0.1"], "alias": "openbookqa:rc::olmes"}}, "compute_config": {"batch_size": "16", "max_batch_size": 32, "output_dir": "hf_checkpoints/dclm-pool-1b-1x-h-fasttext-open_lm_1b_swiglutorch-warm5000-lr0p003-wd0p033-cd3e-05-bs256-mult1-seed124-tokens28795904000/olmes_fp32/core_9mcqa", "num_recorded_inputs": 3, "save_raw_requests": true, "recompute_metrics": false, "check_datalake": false, "push_datalake": false, "no_datalake": false}, "processing_time": 13.533832550048828, "current_date": "2024-11-19 21:19:43 UTC", "num_instances": 500, "beaker_info": {}, "metrics": {"acc_raw": 0.258, "acc_per_token": 0.382, "acc_per_char": 0.344, "correct_loss_raw": 15.555774765849113, "incorrect_loss_raw": 14.663584057887403, "correct_loss_per_token": 4.5521495499962, "incorrect_loss_per_token": 5.301223985967573, "correct_loss_per_char": 0.8821060165202664, "incorrect_loss_per_char": 0.9975226397142319, "acc_uncond": 0.488, "correct_loss_uncond": -9.155967952847481, "incorrect_loss_uncond": -7.157924043258035, "primary_score": 0.488}, "task_idx": 11} |
|
{"task_name": "piqa:mc", "task_hash": "3dfbe656dca31c364b396de69bc710a0", "model_hash": "03418cf8091a9882619950ffb07429a5", "model_config": {"model": "hf_checkpoints/dclm-pool-1b-1x-h-fasttext-open_lm_1b_swiglutorch-warm5000-lr0p003-wd0p033-cd3e-05-bs256-mult1-seed124-tokens28795904000", "revision": null, "trust_remote_code": null, "max_length": 2048, "model_path": null, "model_type": "hf"}, "task_config": {"task_name": "piqa:mc", "task_core": "piqa", "limit": 1000, "split": "validation", "num_shots": 5, "fewshot_seed": 1234, "primary_metric": "acc_raw", "random_subsample_seed": 1234, "context_kwargs": null, "generation_kwargs": null, "metric_kwargs": null, "native_id_field": "index", "fewshot_source": "OLMES:piqa", "dataset_path": "piqa", "dataset_name": null, "use_chat_format": null, "version": 0, "revision": null, "metadata": {"regimes": ["OLMES-v0.1"], "alias": "piqa:mc::olmes"}}, "compute_config": {"batch_size": "16", "max_batch_size": 32, "output_dir": "hf_checkpoints/dclm-pool-1b-1x-h-fasttext-open_lm_1b_swiglutorch-warm5000-lr0p003-wd0p033-cd3e-05-bs256-mult1-seed124-tokens28795904000/olmes_fp32/core_9mcqa", "num_recorded_inputs": 3, "save_raw_requests": true, "recompute_metrics": false, "check_datalake": false, "push_datalake": false, "no_datalake": false}, "processing_time": 24.298110961914062, "current_date": "2024-11-19 21:19:56 UTC", "num_instances": 1000, "beaker_info": {}, "metrics": {"acc_raw": 0.503, "acc_per_token": 0.503, "acc_per_char": 0.503, "correct_loss_raw": 0.8549434743523597, "incorrect_loss_raw": 0.8426201480925083, "correct_loss_per_token": 0.8549434743523597, "incorrect_loss_per_token": 0.8426201480925083, "correct_loss_per_char": 0.42747173717617987, "incorrect_loss_per_char": 0.42131007404625415, "primary_score": 0.503}, "task_idx": 12} |
|
{"task_name": "piqa", "task_hash": "96a9ff13e8416d1762b937f64a13d416", "model_hash": "03418cf8091a9882619950ffb07429a5", "model_config": {"model": "hf_checkpoints/dclm-pool-1b-1x-h-fasttext-open_lm_1b_swiglutorch-warm5000-lr0p003-wd0p033-cd3e-05-bs256-mult1-seed124-tokens28795904000", "revision": null, "trust_remote_code": null, "max_length": 2048, "model_path": null, "model_type": "hf"}, "task_config": {"task_name": "piqa", "task_core": "piqa", "limit": 1000, "split": "validation", "num_shots": 5, "fewshot_seed": 1234, "primary_metric": "acc_per_char", "random_subsample_seed": 1234, "context_kwargs": null, "generation_kwargs": null, "metric_kwargs": {"uncond_docid_offset": 1000000}, "native_id_field": "index", "fewshot_source": "OLMES:piqa", "dataset_path": "piqa", "dataset_name": null, "use_chat_format": null, "version": 0, "revision": null, "metadata": {"regimes": ["OLMES-v0.1"], "alias": "piqa:rc::olmes"}}, "compute_config": {"batch_size": "16", "max_batch_size": 32, "output_dir": "hf_checkpoints/dclm-pool-1b-1x-h-fasttext-open_lm_1b_swiglutorch-warm5000-lr0p003-wd0p033-cd3e-05-bs256-mult1-seed124-tokens28795904000/olmes_fp32/core_9mcqa", "num_recorded_inputs": 3, "save_raw_requests": true, "recompute_metrics": false, "check_datalake": false, "push_datalake": false, "no_datalake": false}, "processing_time": 30.13063669204712, "current_date": "2024-11-19 21:20:21 UTC", "num_instances": 1000, "beaker_info": {}, "metrics": {"acc_raw": 0.711, "acc_per_token": 0.714, "acc_per_char": 0.705, "correct_loss_raw": 60.29878288650513, "incorrect_loss_raw": 64.27842769479751, "correct_loss_per_token": 2.959425113945048, "incorrect_loss_per_token": 3.183489216339498, "correct_loss_per_char": 0.6886968636692701, "incorrect_loss_per_char": 0.7379695185160277, "acc_uncond": 0.615, "correct_loss_uncond": -16.600970502853393, "incorrect_loss_uncond": -15.603905320644378, "primary_score": 0.705}, "task_idx": 13} |
|
{"task_name": "socialiqa:mc", "task_hash": "8997a05d7b8e86a4026d0cac0d26653e", "model_hash": "03418cf8091a9882619950ffb07429a5", "model_config": {"model": "hf_checkpoints/dclm-pool-1b-1x-h-fasttext-open_lm_1b_swiglutorch-warm5000-lr0p003-wd0p033-cd3e-05-bs256-mult1-seed124-tokens28795904000", "revision": null, "trust_remote_code": null, "max_length": 2048, "model_path": null, "model_type": "hf"}, "task_config": {"task_name": "socialiqa:mc", "task_core": "socialiqa", "limit": 1000, "split": "validation", "num_shots": 5, "fewshot_seed": 1234, "primary_metric": "acc_raw", "random_subsample_seed": 1234, "context_kwargs": null, "generation_kwargs": null, "metric_kwargs": null, "native_id_field": "index", "fewshot_source": "OLMES:social_i_qa", "dataset_path": "social_i_qa", "dataset_name": null, "use_chat_format": null, "version": 0, "revision": null, "metadata": {"regimes": ["OLMES-v0.1"], "alias": "socialiqa:mc::olmes"}}, "compute_config": {"batch_size": "16", "max_batch_size": 32, "output_dir": "hf_checkpoints/dclm-pool-1b-1x-h-fasttext-open_lm_1b_swiglutorch-warm5000-lr0p003-wd0p033-cd3e-05-bs256-mult1-seed124-tokens28795904000/olmes_fp32/core_9mcqa", "num_recorded_inputs": 3, "save_raw_requests": true, "recompute_metrics": false, "check_datalake": false, "push_datalake": false, "no_datalake": false}, "processing_time": 22.3986234664917, "current_date": "2024-11-19 21:20:51 UTC", "num_instances": 1000, "beaker_info": {}, "metrics": {"acc_raw": 0.315, "acc_per_token": 0.315, "acc_per_char": 0.315, "correct_loss_raw": 1.1634808651208877, "incorrect_loss_raw": 1.155260512650013, "correct_loss_per_token": 1.1634808651208877, "incorrect_loss_per_token": 1.155260512650013, "correct_loss_per_char": 0.5817404325604438, "incorrect_loss_per_char": 0.5776302563250065, "primary_score": 0.315}, "task_idx": 14} |
|
{"task_name": "socialiqa", "task_hash": "919d1b7d9249f469506576d515a7e379", "model_hash": "03418cf8091a9882619950ffb07429a5", "model_config": {"model": "hf_checkpoints/dclm-pool-1b-1x-h-fasttext-open_lm_1b_swiglutorch-warm5000-lr0p003-wd0p033-cd3e-05-bs256-mult1-seed124-tokens28795904000", "revision": null, "trust_remote_code": null, "max_length": 2048, "model_path": null, "model_type": "hf"}, "task_config": {"task_name": "socialiqa", "task_core": "socialiqa", "limit": 1000, "split": "validation", "num_shots": 5, "fewshot_seed": 1234, "primary_metric": "acc_per_char", "random_subsample_seed": 1234, "context_kwargs": null, "generation_kwargs": null, "metric_kwargs": {"uncond_docid_offset": 1000000}, "native_id_field": "index", "fewshot_source": "OLMES:social_i_qa", "dataset_path": "social_i_qa", "dataset_name": null, "use_chat_format": null, "version": 0, "revision": null, "metadata": {"regimes": ["OLMES-v0.1"], "alias": "socialiqa:rc::olmes"}}, "compute_config": {"batch_size": "16", "max_batch_size": 32, "output_dir": "hf_checkpoints/dclm-pool-1b-1x-h-fasttext-open_lm_1b_swiglutorch-warm5000-lr0p003-wd0p033-cd3e-05-bs256-mult1-seed124-tokens28795904000/olmes_fp32/core_9mcqa", "num_recorded_inputs": 3, "save_raw_requests": true, "recompute_metrics": false, "check_datalake": false, "push_datalake": false, "no_datalake": false}, "processing_time": 39.13451051712036, "current_date": "2024-11-19 21:21:13 UTC", "num_instances": 1000, "beaker_info": {}, "metrics": {"acc_raw": 0.47, "acc_per_token": 0.51, "acc_per_char": 0.507, "correct_loss_raw": 13.429017143726348, "incorrect_loss_raw": 15.813642004132271, "correct_loss_per_token": 3.7707465825720146, "incorrect_loss_per_token": 4.630981714700234, "correct_loss_per_char": 0.6942623684752544, "incorrect_loss_per_char": 0.8590028932502934, "acc_uncond": 0.495, "correct_loss_uncond": -12.363556597232819, "incorrect_loss_uncond": -10.141750838160515, "primary_score": 0.507}, "task_idx": 15} |
|
{"task_name": "winogrande:mc", "task_hash": "b50e2ed910dee64ac741bdaac81c6b91", "model_hash": "03418cf8091a9882619950ffb07429a5", "model_config": {"model": "hf_checkpoints/dclm-pool-1b-1x-h-fasttext-open_lm_1b_swiglutorch-warm5000-lr0p003-wd0p033-cd3e-05-bs256-mult1-seed124-tokens28795904000", "revision": null, "trust_remote_code": null, "max_length": 2048, "model_path": null, "model_type": "hf"}, "task_config": {"task_name": "winogrande:mc", "task_core": "winogrande", "limit": null, "split": "validation", "num_shots": 5, "fewshot_seed": 1234, "primary_metric": "acc_raw", "random_subsample_seed": 1234, "context_kwargs": null, "generation_kwargs": null, "metric_kwargs": null, "native_id_field": "index", "fewshot_source": "OLMES:winogrande", "dataset_path": "winogrande", "dataset_name": "winogrande_xl", "use_chat_format": null, "version": 0, "revision": null, "metadata": {"regimes": ["OLMES-v0.1"], "alias": "winogrande:mc::olmes"}}, "compute_config": {"batch_size": "16", "max_batch_size": 32, "output_dir": "hf_checkpoints/dclm-pool-1b-1x-h-fasttext-open_lm_1b_swiglutorch-warm5000-lr0p003-wd0p033-cd3e-05-bs256-mult1-seed124-tokens28795904000/olmes_fp32/core_9mcqa", "num_recorded_inputs": 3, "save_raw_requests": true, "recompute_metrics": false, "check_datalake": false, "push_datalake": false, "no_datalake": false}, "processing_time": 20.010592699050903, "current_date": "2024-11-19 21:21:52 UTC", "num_instances": 1267, "beaker_info": {}, "metrics": {"acc_raw": 0.4956590370955012, "acc_per_token": 0.4956590370955012, "acc_per_char": 0.4956590370955012, "correct_loss_raw": 0.9217955738882333, "incorrect_loss_raw": 0.9242580053591314, "correct_loss_per_token": 0.9217955738882333, "incorrect_loss_per_token": 0.9242580053591314, "correct_loss_per_char": 0.46089778694411665, "incorrect_loss_per_char": 0.4621290026795657, "primary_score": 0.4956590370955012}, "task_idx": 16} |
|
{"task_name": "winogrande", "task_hash": "5f81ea18813293043c23fa7f73ff85b2", "model_hash": "03418cf8091a9882619950ffb07429a5", "model_config": {"model": "hf_checkpoints/dclm-pool-1b-1x-h-fasttext-open_lm_1b_swiglutorch-warm5000-lr0p003-wd0p033-cd3e-05-bs256-mult1-seed124-tokens28795904000", "revision": null, "trust_remote_code": null, "max_length": 2048, "model_path": null, "model_type": "hf"}, "task_config": {"task_name": "winogrande", "task_core": "winogrande", "limit": null, "split": "validation", "num_shots": 5, "fewshot_seed": 1234, "primary_metric": "acc_raw", "random_subsample_seed": 1234, "context_kwargs": null, "generation_kwargs": null, "metric_kwargs": null, "native_id_field": null, "fewshot_source": "OLMES:winogrande", "dataset_path": "winogrande", "dataset_name": "winogrande_xl", "use_chat_format": null, "version": 0, "revision": null, "metadata": {"regimes": ["OLMES-v0.1"], "alias": "winogrande:rc::olmes"}}, "compute_config": {"batch_size": "16", "max_batch_size": 32, "output_dir": "hf_checkpoints/dclm-pool-1b-1x-h-fasttext-open_lm_1b_swiglutorch-warm5000-lr0p003-wd0p033-cd3e-05-bs256-mult1-seed124-tokens28795904000/olmes_fp32/core_9mcqa", "num_recorded_inputs": 3, "save_raw_requests": true, "recompute_metrics": false, "check_datalake": false, "push_datalake": false, "no_datalake": false}, "processing_time": 22.777563095092773, "current_date": "2024-11-19 21:22:12 UTC", "num_instances": 1267, "beaker_info": {}, "metrics": {"acc_raw": 0.5880031570639306, "acc_per_token": 0.5880031570639306, "acc_per_char": 0.5880031570639306, "correct_loss_raw": 16.783849784306117, "incorrect_loss_raw": 17.11926840056372, "correct_loss_per_token": 3.0687747105153775, "incorrect_loss_per_token": 3.1419591717693787, "correct_loss_per_char": 0.7830940176685236, "incorrect_loss_per_char": 0.8019998970868495, "primary_score": 0.5880031570639306}, "task_idx": 17} |
|
|