princeton-nlp's picture
Upload folder using huggingface_hub
fc4aea2 verified
{"all_primary_scores": ["arc_easy::olmes: 0.714", "arc_challenge::olmes: 0.398464", "boolq::olmes: 0.638", "csqa::olmes: 0.632269", "hellaswag::olmes: 0.59", "openbookqa::olmes: 0.488", "piqa::olmes: 0.705", "socialiqa::olmes: 0.507", "winogrande::olmes: 0.588003", "core_9mcqa::olmes: 0.584526", "core_9mcqa:rc::olmes: 0.574082", "arc_easy:mc::olmes: 0.246", "arc_easy:rc::olmes: 0.714", "arc_challenge:mc::olmes: 0.25", "arc_challenge:rc::olmes: 0.398464", "boolq:mc::olmes: 0.638", "boolq:rc::olmes: 0.544", "csqa:mc::olmes: 0.194922", "csqa:rc::olmes: 0.632269", "hellaswag:mc::olmes: 0.265", "hellaswag:rc::olmes: 0.59", "openbookqa:mc::olmes: 0.27", "openbookqa:rc::olmes: 0.488", "piqa:mc::olmes: 0.503", "piqa:rc::olmes: 0.705", "socialiqa:mc::olmes: 0.315", "socialiqa:rc::olmes: 0.507", "winogrande:mc::olmes: 0.495659", "winogrande:rc::olmes: 0.588003"], "metrics": [{"task": "arc_easy::olmes", "primary_score": 0.714, "num_instances": 2000, "task_config": {"task_name": "arc_easy::olmes", "task_core": "arc_easy", "limit": 1000, "split": "test", "num_shots": 5, "fewshot_seed": 1234, "primary_metric": "mc_or_rc", "random_subsample_seed": 1234, "context_kwargs": null, "generation_kwargs": null, "metric_kwargs": null, "native_id_field": "id", "fewshot_source": "OLMES:ARC-Easy", "dataset_path": "ai2_arc", "dataset_name": "ARC-Easy", "use_chat_format": null, "version": 0, "revision": null, "metadata": {"num_tasks": 2, "description": "Best of MC vs RC", "used_mc_or_rc": "rc", "alias": "arc_easy::olmes"}}}, {"task": "arc_challenge::olmes", "primary_score": 0.3984641638225256, "num_instances": 2344, "task_config": {"task_name": "arc_challenge::olmes", "task_core": "arc_challenge", "limit": null, "split": "test", "num_shots": 5, "fewshot_seed": 1234, "primary_metric": "mc_or_rc", "random_subsample_seed": 1234, "context_kwargs": null, "generation_kwargs": null, "metric_kwargs": null, "native_id_field": "id", "fewshot_source": "OLMES:ARC-Challenge", "dataset_path": "ai2_arc", "dataset_name": "ARC-Challenge", "use_chat_format": null, "version": 0, "revision": null, "metadata": {"num_tasks": 2, "description": "Best of MC vs RC", "used_mc_or_rc": "rc", "alias": "arc_challenge::olmes"}}}, {"task": "boolq::olmes", "primary_score": 0.638, "num_instances": 2000, "task_config": {"task_name": "boolq::olmes", "task_core": "boolq", "limit": 1000, "split": "validation", "num_shots": 5, "fewshot_seed": 1234, "primary_metric": "mc_or_rc", "random_subsample_seed": 1234, "context_kwargs": null, "generation_kwargs": null, "metric_kwargs": {"uncond_docid_offset": null}, "native_id_field": "idx", "fewshot_source": "OLMES:BoolQ", "dataset_path": "super_glue", "dataset_name": "boolq", "use_chat_format": null, "version": 0, "revision": null, "metadata": {"num_tasks": 2, "description": "Best of MC vs RC", "used_mc_or_rc": "mc", "alias": "boolq::olmes"}}}, {"task": "csqa::olmes", "primary_score": 0.6322686322686323, "num_instances": 2442, "task_config": {"task_name": "csqa::olmes", "task_core": "csqa", "limit": null, "split": "validation", "num_shots": 5, "fewshot_seed": 1234, "primary_metric": "mc_or_rc", "random_subsample_seed": 1234, "context_kwargs": null, "generation_kwargs": null, "metric_kwargs": null, "native_id_field": "id", "fewshot_source": "OLMES:commonsense_qa", "dataset_path": "commonsense_qa", "dataset_name": null, "use_chat_format": null, "version": 0, "revision": null, "metadata": {"num_tasks": 2, "description": "Best of MC vs RC", "used_mc_or_rc": "rc", "alias": "csqa::olmes"}}}, {"task": "hellaswag::olmes", "primary_score": 0.59, "num_instances": 2000, "task_config": {"task_name": "hellaswag::olmes", "task_core": "hellaswag", "limit": 1000, "split": "validation", "num_shots": 5, "fewshot_seed": 1234, "primary_metric": "mc_or_rc", "random_subsample_seed": 1234, "context_kwargs": null, "generation_kwargs": null, "metric_kwargs": null, "native_id_field": "ind", "fewshot_source": "OLMES:hellaswag", "dataset_path": "hellaswag", "dataset_name": null, "use_chat_format": null, "version": 0, "revision": null, "metadata": {"num_tasks": 2, "description": "Best of MC vs RC", "used_mc_or_rc": "rc", "alias": "hellaswag::olmes"}}}, {"task": "openbookqa::olmes", "primary_score": 0.488, "num_instances": 1000, "task_config": {"task_name": "openbookqa::olmes", "task_core": "openbookqa", "limit": null, "split": "test", "num_shots": 5, "fewshot_seed": 1234, "primary_metric": "mc_or_rc", "random_subsample_seed": 1234, "context_kwargs": null, "generation_kwargs": null, "metric_kwargs": null, "native_id_field": "id", "fewshot_source": "OLMES:openbookqa", "dataset_path": "openbookqa", "dataset_name": "main", "use_chat_format": null, "version": 0, "revision": null, "metadata": {"num_tasks": 2, "description": "Best of MC vs RC", "used_mc_or_rc": "rc", "alias": "openbookqa::olmes"}}}, {"task": "piqa::olmes", "primary_score": 0.705, "num_instances": 2000, "task_config": {"task_name": "piqa::olmes", "task_core": "piqa", "limit": 1000, "split": "validation", "num_shots": 5, "fewshot_seed": 1234, "primary_metric": "mc_or_rc", "random_subsample_seed": 1234, "context_kwargs": null, "generation_kwargs": null, "metric_kwargs": null, "native_id_field": "index", "fewshot_source": "OLMES:piqa", "dataset_path": "piqa", "dataset_name": null, "use_chat_format": null, "version": 0, "revision": null, "metadata": {"num_tasks": 2, "description": "Best of MC vs RC", "used_mc_or_rc": "rc", "alias": "piqa::olmes"}}}, {"task": "socialiqa::olmes", "primary_score": 0.507, "num_instances": 2000, "task_config": {"task_name": "socialiqa::olmes", "task_core": "socialiqa", "limit": 1000, "split": "validation", "num_shots": 5, "fewshot_seed": 1234, "primary_metric": "mc_or_rc", "random_subsample_seed": 1234, "context_kwargs": null, "generation_kwargs": null, "metric_kwargs": null, "native_id_field": "index", "fewshot_source": "OLMES:social_i_qa", "dataset_path": "social_i_qa", "dataset_name": null, "use_chat_format": null, "version": 0, "revision": null, "metadata": {"num_tasks": 2, "description": "Best of MC vs RC", "used_mc_or_rc": "rc", "alias": "socialiqa::olmes"}}}, {"task": "winogrande::olmes", "primary_score": 0.5880031570639306, "num_instances": 2534, "task_config": {"task_name": "winogrande::olmes", "task_core": "winogrande", "limit": null, "split": "validation", "num_shots": 5, "fewshot_seed": 1234, "primary_metric": "mc_or_rc", "random_subsample_seed": 1234, "context_kwargs": null, "generation_kwargs": null, "metric_kwargs": null, "native_id_field": "index", "fewshot_source": "OLMES:winogrande", "dataset_path": "winogrande", "dataset_name": "winogrande_xl", "use_chat_format": null, "version": 0, "revision": null, "metadata": {"num_tasks": 2, "description": "Best of MC vs RC", "used_mc_or_rc": "rc", "alias": "winogrande::olmes"}}}, {"task": "core_9mcqa::olmes", "primary_score_micro": 0.5875545851528384, "primary_score_macro": 0.584526217017232, "primary_score": 0.584526217017232, "num_instances": 18320, "task_config": {"task_name": "core_9mcqa::olmes", "task_core": "arc_easy", "limit": 1000, "split": "test", "num_shots": 5, "fewshot_seed": 1234, "primary_metric": "macro", "random_subsample_seed": 1234, "context_kwargs": null, "generation_kwargs": null, "metric_kwargs": null, "native_id_field": "id", "fewshot_source": "OLMES:ARC-Easy", "dataset_path": "ai2_arc", "dataset_name": "ARC-Easy", "use_chat_format": null, "version": 0, "revision": null, "metadata": {"num_tasks": 9, "description": "Aggregate metric", "alias": "core_9mcqa::olmes"}}}, {"task": "core_9mcqa:rc::olmes", "acc_per_char_micro": 0.5692139737991266, "acc_per_char_macro": 0.5589227242908723, "correct_loss_per_char_micro": 0.5937509490662316, "correct_loss_per_char_macro": 0.6029771456048766, "primary_score_micro": 0.5772925764192139, "primary_score_macro": 0.5740817725727875, "incorrect_loss_per_token_micro": 3.7928495433620943, "incorrect_loss_per_token_macro": 3.8230539499749536, "acc_raw_micro": 0.5324235807860263, "acc_raw_macro": 0.517849553384937, "incorrect_loss_per_char_micro": 0.7578683910506546, "incorrect_loss_per_char_macro": 0.7611783181725845, "correct_loss_per_token_micro": 2.8967751889357443, "correct_loss_per_token_macro": 2.9568383042246413, "acc_per_token_micro": 0.5605895196506551, "acc_per_token_macro": 0.5529338517695767, "correct_loss_raw_micro": 23.022728316323207, "correct_loss_raw_macro": 23.347966431864265, "incorrect_loss_raw_micro": 26.831048833962193, "incorrect_loss_raw_macro": 27.03290257617091, "primary_score": 0.5740817725727875, "num_instances": 9160, "task_config": {"task_name": "core_9mcqa:rc::olmes", "task_core": "arc_easy", "limit": 1000, "split": "test", "num_shots": 5, "fewshot_seed": 1234, "primary_metric": "macro", "random_subsample_seed": 1234, "context_kwargs": {"description": null}, "generation_kwargs": null, "metric_kwargs": {"uncond_docid_offset": 1000000}, "native_id_field": "id", "fewshot_source": "OLMES:ARC-Easy", "dataset_path": "ai2_arc", "dataset_name": "ARC-Easy", "use_chat_format": null, "version": 0, "revision": null, "metadata": {"num_tasks": 9, "description": "Aggregate metric", "alias": "core_9mcqa:rc::olmes"}}}, {"task": "arc_easy:mc", "acc_raw": 0.246, "acc_per_token": 0.246, "acc_per_char": 0.246, "correct_loss_raw": 1.3999989740848542, "incorrect_loss_raw": 1.397326016485691, "correct_loss_per_token": 1.3999989740848542, "incorrect_loss_per_token": 1.397326016485691, "correct_loss_per_char": 0.6999994870424271, "incorrect_loss_per_char": 0.6986630082428456, "primary_score": 0.246, "num_instances": 1000, "task_config": {"task_name": "arc_easy:mc", "task_core": "arc_easy", "limit": 1000, "split": "test", "num_shots": 5, "fewshot_seed": 1234, "primary_metric": "acc_raw", "random_subsample_seed": 1234, "context_kwargs": null, "generation_kwargs": null, "metric_kwargs": null, "native_id_field": "id", "fewshot_source": "OLMES:ARC-Easy", "dataset_path": "ai2_arc", "dataset_name": "ARC-Easy", "use_chat_format": null, "version": 0, "revision": null, "metadata": {"description": "ARC-Easy (MC) using OLMES-v0.1", "regimes": ["OLMES-v0.1"], "alias": "arc_easy:mc::olmes"}}}, {"task": "arc_easy", "acc_raw": 0.694, "acc_per_token": 0.693, "acc_per_char": 0.714, "correct_loss_raw": 9.330107917129993, "incorrect_loss_raw": 14.000815374672404, "correct_loss_per_token": 2.2515681965954655, "incorrect_loss_per_token": 3.9243185699953256, "correct_loss_per_char": 0.41603878481733886, "incorrect_loss_per_char": 0.7056315904639505, "acc_uncond": 0.646, "correct_loss_uncond": -13.524798529326915, "incorrect_loss_uncond": -10.111074344178036, "primary_score": 0.714, "num_instances": 1000, "task_config": {"task_name": "arc_easy", "task_core": "arc_easy", "limit": 1000, "split": "test", "num_shots": 5, "fewshot_seed": 1234, "primary_metric": "acc_per_char", "random_subsample_seed": 1234, "context_kwargs": {"description": null}, "generation_kwargs": null, "metric_kwargs": {"uncond_docid_offset": 1000000}, "native_id_field": "id", "fewshot_source": "OLMES:ARC-Easy", "dataset_path": "ai2_arc", "dataset_name": "ARC-Easy", "use_chat_format": null, "version": 0, "revision": null, "metadata": {"description": "ARC-Easy (RC) using OLMES-v0.1", "regimes": ["OLMES-v0.1"], "alias": "arc_easy:rc::olmes"}}}, {"task": "arc_challenge:mc", "acc_raw": 0.25, "acc_per_token": 0.25, "acc_per_char": 0.25, "correct_loss_raw": 1.39782169339193, "incorrect_loss_raw": 1.3961642627880828, "correct_loss_per_token": 1.39782169339193, "incorrect_loss_per_token": 1.3961642627880828, "correct_loss_per_char": 0.698910846695965, "incorrect_loss_per_char": 0.6980821313940414, "primary_score": 0.25, "num_instances": 1172, "task_config": {"task_name": "arc_challenge:mc", "task_core": "arc_challenge", "limit": null, "split": "test", "num_shots": 5, "fewshot_seed": 1234, "primary_metric": "acc_raw", "random_subsample_seed": 1234, "context_kwargs": null, "generation_kwargs": null, "metric_kwargs": null, "native_id_field": "id", "fewshot_source": "OLMES:ARC-Challenge", "dataset_path": "ai2_arc", "dataset_name": "ARC-Challenge", "use_chat_format": null, "version": 0, "revision": null, "metadata": {"regimes": ["OLMES-v0.1"], "alias": "arc_challenge:mc::olmes"}}}, {"task": "arc_challenge", "acc_raw": 0.33532423208191126, "acc_per_token": 0.36945392491467577, "acc_per_char": 0.36177474402730375, "correct_loss_raw": 15.167813213612032, "incorrect_loss_raw": 16.08963039536049, "correct_loss_per_token": 2.7816373910187298, "incorrect_loss_per_token": 3.1465591771719446, "correct_loss_per_char": 0.5648754563167282, "incorrect_loss_per_char": 0.6338645292376848, "acc_uncond": 0.3984641638225256, "correct_loss_uncond": -13.69655239866862, "incorrect_loss_uncond": -12.153245893880248, "primary_score": 0.3984641638225256, "num_instances": 1172, "task_config": {"task_name": "arc_challenge", "task_core": "arc_challenge", "limit": null, "split": "test", "num_shots": 5, "fewshot_seed": 1234, "primary_metric": "acc_uncond", "random_subsample_seed": 1234, "context_kwargs": null, "generation_kwargs": null, "metric_kwargs": {"uncond_docid_offset": 1000000}, "native_id_field": "id", "fewshot_source": "OLMES:ARC-Challenge", "dataset_path": "ai2_arc", "dataset_name": "ARC-Challenge", "use_chat_format": null, "version": 0, "revision": null, "metadata": {"regimes": ["OLMES-v0.1"], "alias": "arc_challenge:rc::olmes"}}}, {"task": "boolq:mc", "acc_raw": 0.638, "acc_per_token": 0.638, "acc_per_char": 0.638, "correct_loss_raw": 0.7280895302966237, "incorrect_loss_raw": 1.1170413957461716, "correct_loss_per_token": 0.7280895302966237, "incorrect_loss_per_token": 1.1170413957461716, "correct_loss_per_char": 0.36404476514831186, "incorrect_loss_per_char": 0.5585206978730858, "primary_score": 0.638, "num_instances": 1000, "task_config": {"task_name": "boolq:mc", "task_core": "boolq", "limit": 1000, "split": "validation", "num_shots": 5, "fewshot_seed": 1234, "primary_metric": "acc_raw", "random_subsample_seed": 1234, "context_kwargs": null, "generation_kwargs": null, "metric_kwargs": {"uncond_docid_offset": null}, "native_id_field": "idx", "fewshot_source": "OLMES:BoolQ", "dataset_path": "super_glue", "dataset_name": "boolq", "use_chat_format": null, "version": 0, "revision": null, "metadata": {"regimes": ["OLMES-v0.1"], "alias": "boolq:mc::olmes"}}}, {"task": "boolq", "acc_raw": 0.544, "acc_per_token": 0.544, "acc_per_char": 0.603, "correct_loss_raw": 0.7550902778506279, "incorrect_loss_raw": 0.8332378754168749, "correct_loss_per_token": 0.7550902778506279, "incorrect_loss_per_token": 0.8332378754168749, "correct_loss_per_char": 0.2051628428623079, "incorrect_loss_per_char": 0.2452210317080221, "primary_score": 0.544, "num_instances": 1000, "task_config": {"task_name": "boolq", "task_core": "boolq", "limit": 1000, "split": "validation", "num_shots": 5, "fewshot_seed": 1234, "primary_metric": "acc_raw", "random_subsample_seed": 1234, "context_kwargs": null, "generation_kwargs": null, "metric_kwargs": {"uncond_docid_offset": null}, "native_id_field": "idx", "fewshot_source": "OLMES:BoolQ", "dataset_path": "super_glue", "dataset_name": "boolq", "use_chat_format": null, "version": 0, "revision": null, "metadata": {"regimes": ["OLMES-v0.1"], "alias": "boolq:rc::olmes"}}}, {"task": "csqa:mc", "acc_raw": 0.19492219492219492, "acc_per_token": 0.19492219492219492, "acc_per_char": 0.19492219492219492, "correct_loss_raw": 1.6432164040478794, "incorrect_loss_raw": 1.6387123266367714, "correct_loss_per_token": 1.6432164040478794, "incorrect_loss_per_token": 1.6387123266367714, "correct_loss_per_char": 0.8216082020239397, "incorrect_loss_per_char": 0.8193561633183857, "primary_score": 0.19492219492219492, "num_instances": 1221, "task_config": {"task_name": "csqa:mc", "task_core": "csqa", "limit": null, "split": "validation", "num_shots": 5, "fewshot_seed": 1234, "primary_metric": "acc_raw", "random_subsample_seed": 1234, "context_kwargs": null, "generation_kwargs": null, "metric_kwargs": null, "native_id_field": "id", "fewshot_source": "OLMES:commonsense_qa", "dataset_path": "commonsense_qa", "dataset_name": null, "use_chat_format": null, "version": 0, "revision": null, "metadata": {"regimes": ["OLMES-v0.1"], "alias": "csqa:mc::olmes"}}}, {"task": "csqa", "acc_raw": 0.5913185913185913, "acc_per_token": 0.583947583947584, "acc_per_char": 0.6175266175266175, "correct_loss_raw": 6.534973341700096, "incorrect_loss_raw": 11.175633807903816, "correct_loss_per_token": 4.077766012058472, "incorrect_loss_per_token": 7.26478272882345, "correct_loss_per_char": 0.6641299097690206, "incorrect_loss_per_char": 1.2084717048723157, "acc_uncond": 0.6322686322686323, "correct_loss_uncond": -9.932422045554224, "incorrect_loss_uncond": -5.273424961569854, "primary_score": 0.6322686322686323, "num_instances": 1221, "task_config": {"task_name": "csqa", "task_core": "csqa", "limit": null, "split": "validation", "num_shots": 5, "fewshot_seed": 1234, "primary_metric": "acc_uncond", "random_subsample_seed": 1234, "context_kwargs": null, "generation_kwargs": null, "metric_kwargs": {"uncond_docid_offset": 1000000}, "native_id_field": "id", "fewshot_source": "OLMES:commonsense_qa", "dataset_path": "commonsense_qa", "dataset_name": null, "use_chat_format": null, "version": 0, "revision": null, "metadata": {"regimes": ["OLMES-v0.1"], "alias": "csqa:rc::olmes"}}}, {"task": "hellaswag:mc", "acc_raw": 0.265, "acc_per_token": 0.265, "acc_per_char": 0.265, "correct_loss_raw": 1.4350044332146645, "incorrect_loss_raw": 1.4458212533990522, "correct_loss_per_token": 1.4350044332146645, "incorrect_loss_per_token": 1.4458212533990522, "correct_loss_per_char": 0.7175022166073323, "incorrect_loss_per_char": 0.7229106266995261, "primary_score": 0.265, "num_instances": 1000, "task_config": {"task_name": "hellaswag:mc", "task_core": "hellaswag", "limit": 1000, "split": "validation", "num_shots": 5, "fewshot_seed": 1234, "primary_metric": "acc_raw", "random_subsample_seed": 1234, "context_kwargs": null, "generation_kwargs": null, "metric_kwargs": null, "native_id_field": "ind", "fewshot_source": "OLMES:hellaswag", "dataset_path": "hellaswag", "dataset_name": null, "use_chat_format": null, "version": 0, "revision": null, "metadata": {"regimes": ["OLMES-v0.1"], "alias": "hellaswag:mc::olmes"}}}, {"task": "hellaswag", "acc_raw": 0.469, "acc_per_token": 0.592, "acc_per_char": 0.59, "correct_loss_raw": 72.27628855609893, "incorrect_loss_raw": 89.32188357480365, "correct_loss_per_token": 2.3943869034698344, "incorrect_loss_per_token": 2.9809331095903073, "correct_loss_per_char": 0.5284280503451793, "incorrect_loss_per_char": 0.6609210587038847, "acc_uncond": 0.478, "correct_loss_uncond": -26.180681183815004, "incorrect_loss_uncond": -20.375546882629408, "primary_score": 0.59, "num_instances": 1000, "task_config": {"task_name": "hellaswag", "task_core": "hellaswag", "limit": 1000, "split": "validation", "num_shots": 5, "fewshot_seed": 1234, "primary_metric": "acc_per_char", "random_subsample_seed": 1234, "context_kwargs": null, "generation_kwargs": null, "metric_kwargs": {"uncond_docid_offset": 1000000}, "native_id_field": "ind", "fewshot_source": "OLMES:hellaswag", "dataset_path": "hellaswag", "dataset_name": null, "use_chat_format": null, "version": 0, "revision": null, "metadata": {"regimes": ["OLMES-v0.1"], "alias": "hellaswag:rc::olmes"}}}, {"task": "openbookqa:mc", "acc_raw": 0.27, "acc_per_token": 0.27, "acc_per_char": 0.27, "correct_loss_raw": 1.3968519463539124, "incorrect_loss_raw": 1.4042580502430602, "correct_loss_per_token": 1.3968519463539124, "incorrect_loss_per_token": 1.4042580502430602, "correct_loss_per_char": 0.6984259731769562, "incorrect_loss_per_char": 0.7021290251215301, "primary_score": 0.27, "num_instances": 500, "task_config": {"task_name": "openbookqa:mc", "task_core": "openbookqa", "limit": null, "split": "test", "num_shots": 5, "fewshot_seed": 1234, "primary_metric": "acc_raw", "random_subsample_seed": 1234, "context_kwargs": null, "generation_kwargs": null, "metric_kwargs": null, "native_id_field": "id", "fewshot_source": "OLMES:openbookqa", "dataset_path": "openbookqa", "dataset_name": "main", "use_chat_format": null, "version": 0, "revision": null, "metadata": {"regimes": ["OLMES-v0.1"], "alias": "openbookqa:mc::olmes"}}}, {"task": "openbookqa", "acc_raw": 0.258, "acc_per_token": 0.382, "acc_per_char": 0.344, "correct_loss_raw": 15.555774765849113, "incorrect_loss_raw": 14.663584057887403, "correct_loss_per_token": 4.5521495499962, "incorrect_loss_per_token": 5.301223985967573, "correct_loss_per_char": 0.8821060165202664, "incorrect_loss_per_char": 0.9975226397142319, "acc_uncond": 0.488, "correct_loss_uncond": -9.155967952847481, "incorrect_loss_uncond": -7.157924043258035, "primary_score": 0.488, "num_instances": 500, "task_config": {"task_name": "openbookqa", "task_core": "openbookqa", "limit": null, "split": "test", "num_shots": 5, "fewshot_seed": 1234, "primary_metric": "acc_uncond", "random_subsample_seed": 1234, "context_kwargs": {"no_prefix": false}, "generation_kwargs": null, "metric_kwargs": {"uncond_docid_offset": 1000000}, "native_id_field": "id", "fewshot_source": "OLMES:openbookqa", "dataset_path": "openbookqa", "dataset_name": "main", "use_chat_format": null, "version": 0, "revision": null, "metadata": {"regimes": ["OLMES-v0.1"], "alias": "openbookqa:rc::olmes"}}}, {"task": "piqa:mc", "acc_raw": 0.503, "acc_per_token": 0.503, "acc_per_char": 0.503, "correct_loss_raw": 0.8549434743523597, "incorrect_loss_raw": 0.8426201480925083, "correct_loss_per_token": 0.8549434743523597, "incorrect_loss_per_token": 0.8426201480925083, "correct_loss_per_char": 0.42747173717617987, "incorrect_loss_per_char": 0.42131007404625415, "primary_score": 0.503, "num_instances": 1000, "task_config": {"task_name": "piqa:mc", "task_core": "piqa", "limit": 1000, "split": "validation", "num_shots": 5, "fewshot_seed": 1234, "primary_metric": "acc_raw", "random_subsample_seed": 1234, "context_kwargs": null, "generation_kwargs": null, "metric_kwargs": null, "native_id_field": "index", "fewshot_source": "OLMES:piqa", "dataset_path": "piqa", "dataset_name": null, "use_chat_format": null, "version": 0, "revision": null, "metadata": {"regimes": ["OLMES-v0.1"], "alias": "piqa:mc::olmes"}}}, {"task": "piqa", "acc_raw": 0.711, "acc_per_token": 0.714, "acc_per_char": 0.705, "correct_loss_raw": 60.29878288650513, "incorrect_loss_raw": 64.27842769479751, "correct_loss_per_token": 2.959425113945048, "incorrect_loss_per_token": 3.183489216339498, "correct_loss_per_char": 0.6886968636692701, "incorrect_loss_per_char": 0.7379695185160277, "acc_uncond": 0.615, "correct_loss_uncond": -16.600970502853393, "incorrect_loss_uncond": -15.603905320644378, "primary_score": 0.705, "num_instances": 1000, "task_config": {"task_name": "piqa", "task_core": "piqa", "limit": 1000, "split": "validation", "num_shots": 5, "fewshot_seed": 1234, "primary_metric": "acc_per_char", "random_subsample_seed": 1234, "context_kwargs": null, "generation_kwargs": null, "metric_kwargs": {"uncond_docid_offset": 1000000}, "native_id_field": "index", "fewshot_source": "OLMES:piqa", "dataset_path": "piqa", "dataset_name": null, "use_chat_format": null, "version": 0, "revision": null, "metadata": {"regimes": ["OLMES-v0.1"], "alias": "piqa:rc::olmes"}}}, {"task": "socialiqa:mc", "acc_raw": 0.315, "acc_per_token": 0.315, "acc_per_char": 0.315, "correct_loss_raw": 1.1634808651208877, "incorrect_loss_raw": 1.155260512650013, "correct_loss_per_token": 1.1634808651208877, "incorrect_loss_per_token": 1.155260512650013, "correct_loss_per_char": 0.5817404325604438, "incorrect_loss_per_char": 0.5776302563250065, "primary_score": 0.315, "num_instances": 1000, "task_config": {"task_name": "socialiqa:mc", "task_core": "socialiqa", "limit": 1000, "split": "validation", "num_shots": 5, "fewshot_seed": 1234, "primary_metric": "acc_raw", "random_subsample_seed": 1234, "context_kwargs": null, "generation_kwargs": null, "metric_kwargs": null, "native_id_field": "index", "fewshot_source": "OLMES:social_i_qa", "dataset_path": "social_i_qa", "dataset_name": null, "use_chat_format": null, "version": 0, "revision": null, "metadata": {"regimes": ["OLMES-v0.1"], "alias": "socialiqa:mc::olmes"}}}, {"task": "socialiqa", "acc_raw": 0.47, "acc_per_token": 0.51, "acc_per_char": 0.507, "correct_loss_raw": 13.429017143726348, "incorrect_loss_raw": 15.813642004132271, "correct_loss_per_token": 3.7707465825720146, "incorrect_loss_per_token": 4.630981714700234, "correct_loss_per_char": 0.6942623684752544, "incorrect_loss_per_char": 0.8590028932502934, "acc_uncond": 0.495, "correct_loss_uncond": -12.363556597232819, "incorrect_loss_uncond": -10.141750838160515, "primary_score": 0.507, "num_instances": 1000, "task_config": {"task_name": "socialiqa", "task_core": "socialiqa", "limit": 1000, "split": "validation", "num_shots": 5, "fewshot_seed": 1234, "primary_metric": "acc_per_char", "random_subsample_seed": 1234, "context_kwargs": null, "generation_kwargs": null, "metric_kwargs": {"uncond_docid_offset": 1000000}, "native_id_field": "index", "fewshot_source": "OLMES:social_i_qa", "dataset_path": "social_i_qa", "dataset_name": null, "use_chat_format": null, "version": 0, "revision": null, "metadata": {"regimes": ["OLMES-v0.1"], "alias": "socialiqa:rc::olmes"}}}, {"task": "winogrande:mc", "acc_raw": 0.4956590370955012, "acc_per_token": 0.4956590370955012, "acc_per_char": 0.4956590370955012, "correct_loss_raw": 0.9217955738882333, "incorrect_loss_raw": 0.9242580053591314, "correct_loss_per_token": 0.9217955738882333, "incorrect_loss_per_token": 0.9242580053591314, "correct_loss_per_char": 0.46089778694411665, "incorrect_loss_per_char": 0.4621290026795657, "primary_score": 0.4956590370955012, "num_instances": 1267, "task_config": {"task_name": "winogrande:mc", "task_core": "winogrande", "limit": null, "split": "validation", "num_shots": 5, "fewshot_seed": 1234, "primary_metric": "acc_raw", "random_subsample_seed": 1234, "context_kwargs": null, "generation_kwargs": null, "metric_kwargs": null, "native_id_field": "index", "fewshot_source": "OLMES:winogrande", "dataset_path": "winogrande", "dataset_name": "winogrande_xl", "use_chat_format": null, "version": 0, "revision": null, "metadata": {"regimes": ["OLMES-v0.1"], "alias": "winogrande:mc::olmes"}}}, {"task": "winogrande", "acc_raw": 0.5880031570639306, "acc_per_token": 0.5880031570639306, "acc_per_char": 0.5880031570639306, "correct_loss_raw": 16.783849784306117, "incorrect_loss_raw": 17.11926840056372, "correct_loss_per_token": 3.0687747105153775, "incorrect_loss_per_token": 3.1419591717693787, "correct_loss_per_char": 0.7830940176685236, "incorrect_loss_per_char": 0.8019998970868495, "primary_score": 0.5880031570639306, "num_instances": 1267, "task_config": {"task_name": "winogrande", "task_core": "winogrande", "limit": null, "split": "validation", "num_shots": 5, "fewshot_seed": 1234, "primary_metric": "acc_raw", "random_subsample_seed": 1234, "context_kwargs": null, "generation_kwargs": null, "metric_kwargs": null, "native_id_field": null, "fewshot_source": "OLMES:winogrande", "dataset_path": "winogrande", "dataset_name": "winogrande_xl", "use_chat_format": null, "version": 0, "revision": null, "metadata": {"regimes": ["OLMES-v0.1"], "alias": "winogrande:rc::olmes"}}}], "model_config": {"model": "hf_checkpoints/dclm-pool-1b-1x-h-fasttext-open_lm_1b_swiglutorch-warm5000-lr0p003-wd0p033-cd3e-05-bs256-mult1-seed124-tokens28795904000"}}