{"task_name": "mmlu:mc::olmes", "task_hash": "f0f05cd4953d75d76242750a66e32adb", "model_hash": "03418cf8091a9882619950ffb07429a5", "model_config": {"model": "hf_checkpoints/dclm-pool-1b-1x-h-fasttext-open_lm_1b_swiglutorch-warm5000-lr0p003-wd0p033-cd3e-05-bs256-mult1-seed124-tokens28795904000", "revision": null, "trust_remote_code": null, "max_length": 2048, "model_path": null, "model_type": "hf"}, "task_config": {"task_name": "mmlu:mc::olmes", "task_core": "mmlu_abstract_algebra", "limit": null, "split": "test", "num_shots": 5, "fewshot_seed": 1234, "primary_metric": "macro", "random_subsample_seed": 1234, "context_kwargs": {"no_label_prefix_space": false}, "generation_kwargs": null, "metric_kwargs": null, "native_id_field": "index", "fewshot_source": null, "dataset_path": "cais/mmlu", "dataset_name": "abstract_algebra", "use_chat_format": null, "version": 1, "revision": null, "metadata": {"num_tasks": 57, "description": "Aggregate metric", "alias": "mmlu:mc::olmes"}}, "compute_config": {"batch_size": "16", "max_batch_size": 32, "output_dir": "hf_checkpoints/dclm-pool-1b-1x-h-fasttext-open_lm_1b_swiglutorch-warm5000-lr0p003-wd0p033-cd3e-05-bs256-mult1-seed124-tokens28795904000/olmes_fp32/mmlu", "num_recorded_inputs": 3, "save_raw_requests": true, "recompute_metrics": false, "check_datalake": false, "push_datalake": false, "no_datalake": false}, "processing_time": 742.4591648578644, "current_date": "2024-11-19 21:10:14 UTC", "num_instances": 14042, "beaker_info": {}, "metrics": {"correct_loss_per_token_micro": 1.399370581556837, "correct_loss_per_token_macro": 1.4004924818471334, "incorrect_loss_per_token_micro": 1.4053478479492902, "incorrect_loss_per_token_macro": 1.4081410191080233, "acc_raw_micro": 0.25658738071499787, "acc_raw_macro": 0.25454956238045445, "correct_loss_raw_micro": 1.399370581556837, "correct_loss_raw_macro": 1.4004924818471334, "acc_per_char_micro": 0.25658738071499787, "acc_per_char_macro": 0.25454956238045445, "incorrect_loss_raw_micro": 1.4053478479492902, "incorrect_loss_raw_macro": 1.4081410191080233, "correct_loss_per_char_micro": 0.6996852907784185, "correct_loss_per_char_macro": 0.7002462409235667, "primary_score_micro": 0.25658738071499787, "primary_score_macro": 0.25454956238045445, "incorrect_loss_per_char_micro": 0.7026739239746451, "incorrect_loss_per_char_macro": 0.7040705095540116, "acc_per_token_micro": 0.25658738071499787, "acc_per_token_macro": 0.25454956238045445, "primary_score": 0.25454956238045445}, "task_idx": null} {"task_name": "mmlu:rc::olmes", "task_hash": "d3fcbcac54951cec9ca2867583e71aa6", "model_hash": "03418cf8091a9882619950ffb07429a5", "model_config": {"model": "hf_checkpoints/dclm-pool-1b-1x-h-fasttext-open_lm_1b_swiglutorch-warm5000-lr0p003-wd0p033-cd3e-05-bs256-mult1-seed124-tokens28795904000", "revision": null, "trust_remote_code": null, "max_length": 2048, "model_path": null, "model_type": "hf"}, "task_config": {"task_name": "mmlu:rc::olmes", "task_core": "mmlu_abstract_algebra", "limit": null, "split": "test", "num_shots": 5, "fewshot_seed": 1234, "primary_metric": "macro", "random_subsample_seed": 1234, "context_kwargs": null, "generation_kwargs": null, "metric_kwargs": {"uncond_docid_offset": 1000000}, "native_id_field": "index", "fewshot_source": null, "dataset_path": "cais/mmlu", "dataset_name": "abstract_algebra", "use_chat_format": null, "version": 1, "revision": null, "metadata": {"num_tasks": 57, "description": "Aggregate metric", "alias": "mmlu:rc::olmes"}}, "compute_config": {"batch_size": "16", "max_batch_size": 32, "output_dir": "hf_checkpoints/dclm-pool-1b-1x-h-fasttext-open_lm_1b_swiglutorch-warm5000-lr0p003-wd0p033-cd3e-05-bs256-mult1-seed124-tokens28795904000/olmes_fp32/mmlu", "num_recorded_inputs": 3, "save_raw_requests": true, "recompute_metrics": false, "check_datalake": false, "push_datalake": false, "no_datalake": false}, "processing_time": 1744.0068137645721, "current_date": "2024-11-19 21:22:37 UTC", "num_instances": 14042, "beaker_info": {}, "metrics": {"correct_loss_per_token_micro": 2.6432887331562633, "correct_loss_per_token_macro": 2.7731874534749923, "correct_loss_uncond_micro": -15.04736503639418, "correct_loss_uncond_macro": -13.507832404085681, "incorrect_loss_uncond_micro": -14.061741272932458, "incorrect_loss_uncond_macro": -12.577336096784808, "incorrect_loss_per_token_micro": 2.956806828783771, "incorrect_loss_per_token_macro": 3.083547324785819, "acc_raw_micro": 0.317048853439681, "acc_raw_macro": 0.3165782466450824, "correct_loss_raw_micro": 21.79074947781205, "correct_loss_raw_macro": 20.761361040112263, "acc_per_char_micro": 0.33150548354935194, "acc_per_char_macro": 0.333915463567101, "incorrect_loss_raw_micro": 22.025241633518036, "incorrect_loss_raw_macro": 20.90722468526756, "correct_loss_per_char_micro": 0.622242090275648, "correct_loss_per_char_macro": 0.6912111226964194, "primary_score_micro": 0.33150548354935194, "primary_score_macro": 0.333915463567101, "incorrect_loss_per_char_micro": 0.6791941747100239, "incorrect_loss_per_char_macro": 0.7448731712983696, "acc_uncond_micro": 0.3470303375587523, "acc_uncond_macro": 0.34845882687855845, "acc_per_token_micro": 0.3336419313488107, "acc_per_token_macro": 0.3358986998185535, "primary_score": 0.333915463567101}, "task_idx": null} {"task_name": "mmlu::olmes", "task_hash": "f5ac6da68d1e2b6ae02dda443aa04648", "model_hash": "03418cf8091a9882619950ffb07429a5", "model_config": {"model": "hf_checkpoints/dclm-pool-1b-1x-h-fasttext-open_lm_1b_swiglutorch-warm5000-lr0p003-wd0p033-cd3e-05-bs256-mult1-seed124-tokens28795904000", "revision": null, "trust_remote_code": null, "max_length": 2048, "model_path": null, "model_type": "hf"}, "task_config": {"task_name": "mmlu::olmes", "task_core": "mmlu_abstract_algebra", "limit": null, "split": "test", "num_shots": 5, "fewshot_seed": 1234, "primary_metric": "mc_or_rc", "random_subsample_seed": 1234, "context_kwargs": {"no_label_prefix_space": false}, "generation_kwargs": null, "metric_kwargs": null, "native_id_field": "index", "fewshot_source": null, "dataset_path": "cais/mmlu", "dataset_name": "abstract_algebra", "use_chat_format": null, "version": 1, "revision": null, "metadata": {"num_tasks": 2, "description": "Best of MC vs RC", "used_mc_or_rc": "rc", "alias": "mmlu::olmes"}}, "compute_config": {"batch_size": "16", "max_batch_size": 32, "output_dir": "hf_checkpoints/dclm-pool-1b-1x-h-fasttext-open_lm_1b_swiglutorch-warm5000-lr0p003-wd0p033-cd3e-05-bs256-mult1-seed124-tokens28795904000/olmes_fp32/mmlu", "num_recorded_inputs": 3, "save_raw_requests": true, "recompute_metrics": false, "check_datalake": false, "push_datalake": false, "no_datalake": false}, "processing_time": 2486.4659786224365, "current_date": "2024-11-19 21:10:14 UTC", "num_instances": 28084, "beaker_info": {}, "metrics": {"primary_score": 0.333915463567101}, "task_idx": null} {"task_name": "mmlu_abstract_algebra:mc", "task_hash": "bdde3fee40ebc8ddc5786c67975c5b31", "model_hash": "03418cf8091a9882619950ffb07429a5", "model_config": {"model": "hf_checkpoints/dclm-pool-1b-1x-h-fasttext-open_lm_1b_swiglutorch-warm5000-lr0p003-wd0p033-cd3e-05-bs256-mult1-seed124-tokens28795904000", "revision": null, "trust_remote_code": null, "max_length": 2048, "model_path": null, "model_type": "hf"}, "task_config": {"task_name": "mmlu_abstract_algebra:mc", "task_core": "mmlu_abstract_algebra", "limit": null, "split": "test", "num_shots": 5, "fewshot_seed": 1234, "primary_metric": "acc_raw", "random_subsample_seed": 1234, "context_kwargs": {"no_label_prefix_space": false}, "generation_kwargs": null, "metric_kwargs": null, "native_id_field": "index", "fewshot_source": null, "dataset_path": "cais/mmlu", "dataset_name": "abstract_algebra", "use_chat_format": null, "version": 1, "revision": null, "metadata": {"regimes": ["OLMES-v0.1"], "alias": "mmlu_abstract_algebra:mc::olmes"}}, "compute_config": {"batch_size": "16", "max_batch_size": 32, "output_dir": "hf_checkpoints/dclm-pool-1b-1x-h-fasttext-open_lm_1b_swiglutorch-warm5000-lr0p003-wd0p033-cd3e-05-bs256-mult1-seed124-tokens28795904000/olmes_fp32/mmlu", "num_recorded_inputs": 3, "save_raw_requests": true, "recompute_metrics": false, "check_datalake": false, "push_datalake": false, "no_datalake": false}, "processing_time": 2.9220430850982666, "current_date": "2024-11-19 21:10:14 UTC", "num_instances": 100, "beaker_info": {}, "metrics": {"acc_raw": 0.27, "acc_per_token": 0.27, "acc_per_char": 0.27, "correct_loss_raw": 1.5122956645488739, "incorrect_loss_raw": 1.508113094369571, "correct_loss_per_token": 1.5122956645488739, "incorrect_loss_per_token": 1.508113094369571, "correct_loss_per_char": 0.7561478322744369, "incorrect_loss_per_char": 0.7540565471847855, "primary_score": 0.27}, "task_idx": 0} {"task_name": "mmlu_anatomy:mc", "task_hash": "ba9ed92a6ef8f2c40aa5551bfc77b5e7", "model_hash": "03418cf8091a9882619950ffb07429a5", "model_config": {"model": "hf_checkpoints/dclm-pool-1b-1x-h-fasttext-open_lm_1b_swiglutorch-warm5000-lr0p003-wd0p033-cd3e-05-bs256-mult1-seed124-tokens28795904000", "revision": null, "trust_remote_code": null, "max_length": 2048, "model_path": null, "model_type": "hf"}, "task_config": {"task_name": "mmlu_anatomy:mc", "task_core": "mmlu_anatomy", "limit": null, "split": "test", "num_shots": 5, "fewshot_seed": 1234, "primary_metric": "acc_raw", "random_subsample_seed": 1234, "context_kwargs": {"no_label_prefix_space": false}, "generation_kwargs": null, "metric_kwargs": null, "native_id_field": "index", "fewshot_source": null, "dataset_path": "cais/mmlu", "dataset_name": "anatomy", "use_chat_format": null, "version": 1, "revision": null, "metadata": {"regimes": ["OLMES-v0.1"], "alias": "mmlu_anatomy:mc::olmes"}}, "compute_config": {"batch_size": "16", "max_batch_size": 32, "output_dir": "hf_checkpoints/dclm-pool-1b-1x-h-fasttext-open_lm_1b_swiglutorch-warm5000-lr0p003-wd0p033-cd3e-05-bs256-mult1-seed124-tokens28795904000/olmes_fp32/mmlu", "num_recorded_inputs": 3, "save_raw_requests": true, "recompute_metrics": false, "check_datalake": false, "push_datalake": false, "no_datalake": false}, "processing_time": 3.611966133117676, "current_date": "2024-11-19 21:10:16 UTC", "num_instances": 135, "beaker_info": {}, "metrics": {"acc_raw": 0.2, "acc_per_token": 0.2, "acc_per_char": 0.2, "correct_loss_raw": 1.405913030659711, "incorrect_loss_raw": 1.394696780192999, "correct_loss_per_token": 1.405913030659711, "incorrect_loss_per_token": 1.394696780192999, "correct_loss_per_char": 0.7029565153298555, "incorrect_loss_per_char": 0.6973483900964995, "primary_score": 0.2}, "task_idx": 1} {"task_name": "mmlu_astronomy:mc", "task_hash": "e7ca8a8921c02622e23c99b7d90379f7", "model_hash": "03418cf8091a9882619950ffb07429a5", "model_config": {"model": "hf_checkpoints/dclm-pool-1b-1x-h-fasttext-open_lm_1b_swiglutorch-warm5000-lr0p003-wd0p033-cd3e-05-bs256-mult1-seed124-tokens28795904000", "revision": null, "trust_remote_code": null, "max_length": 2048, "model_path": null, "model_type": "hf"}, "task_config": {"task_name": "mmlu_astronomy:mc", "task_core": "mmlu_astronomy", "limit": null, "split": "test", "num_shots": 5, "fewshot_seed": 1234, "primary_metric": "acc_raw", "random_subsample_seed": 1234, "context_kwargs": {"no_label_prefix_space": false}, "generation_kwargs": null, "metric_kwargs": null, "native_id_field": "index", "fewshot_source": null, "dataset_path": "cais/mmlu", "dataset_name": "astronomy", "use_chat_format": null, "version": 1, "revision": null, "metadata": {"regimes": ["OLMES-v0.1"], "alias": "mmlu_astronomy:mc::olmes"}}, "compute_config": {"batch_size": "16", "max_batch_size": 32, "output_dir": "hf_checkpoints/dclm-pool-1b-1x-h-fasttext-open_lm_1b_swiglutorch-warm5000-lr0p003-wd0p033-cd3e-05-bs256-mult1-seed124-tokens28795904000/olmes_fp32/mmlu", "num_recorded_inputs": 3, "save_raw_requests": true, "recompute_metrics": false, "check_datalake": false, "push_datalake": false, "no_datalake": false}, "processing_time": 6.6969826221466064, "current_date": "2024-11-19 21:10:20 UTC", "num_instances": 152, "beaker_info": {}, "metrics": {"acc_raw": 0.19078947368421054, "acc_per_token": 0.19078947368421054, "acc_per_char": 0.19078947368421054, "correct_loss_raw": 1.4413342103362083, "incorrect_loss_raw": 1.3992736855881265, "correct_loss_per_token": 1.4413342103362083, "incorrect_loss_per_token": 1.3992736855881265, "correct_loss_per_char": 0.7206671051681042, "incorrect_loss_per_char": 0.6996368427940632, "primary_score": 0.19078947368421054}, "task_idx": 2} {"task_name": "mmlu_business_ethics:mc", "task_hash": "7de417726ca2cc155dd1475a38afc381", "model_hash": "03418cf8091a9882619950ffb07429a5", "model_config": {"model": "hf_checkpoints/dclm-pool-1b-1x-h-fasttext-open_lm_1b_swiglutorch-warm5000-lr0p003-wd0p033-cd3e-05-bs256-mult1-seed124-tokens28795904000", "revision": null, "trust_remote_code": null, "max_length": 2048, "model_path": null, "model_type": "hf"}, "task_config": {"task_name": "mmlu_business_ethics:mc", "task_core": "mmlu_business_ethics", "limit": null, "split": "test", "num_shots": 5, "fewshot_seed": 1234, "primary_metric": "acc_raw", "random_subsample_seed": 1234, "context_kwargs": {"no_label_prefix_space": false}, "generation_kwargs": null, "metric_kwargs": null, "native_id_field": "index", "fewshot_source": null, "dataset_path": "cais/mmlu", "dataset_name": "business_ethics", "use_chat_format": null, "version": 1, "revision": null, "metadata": {"regimes": ["OLMES-v0.1"], "alias": "mmlu_business_ethics:mc::olmes"}}, "compute_config": {"batch_size": "16", "max_batch_size": 32, "output_dir": "hf_checkpoints/dclm-pool-1b-1x-h-fasttext-open_lm_1b_swiglutorch-warm5000-lr0p003-wd0p033-cd3e-05-bs256-mult1-seed124-tokens28795904000/olmes_fp32/mmlu", "num_recorded_inputs": 3, "save_raw_requests": true, "recompute_metrics": false, "check_datalake": false, "push_datalake": false, "no_datalake": false}, "processing_time": 4.340962648391724, "current_date": "2024-11-19 21:10:27 UTC", "num_instances": 100, "beaker_info": {}, "metrics": {"acc_raw": 0.2, "acc_per_token": 0.2, "acc_per_char": 0.2, "correct_loss_raw": 1.4059372037649154, "incorrect_loss_raw": 1.4068191585938135, "correct_loss_per_token": 1.4059372037649154, "incorrect_loss_per_token": 1.4068191585938135, "correct_loss_per_char": 0.7029686018824577, "incorrect_loss_per_char": 0.7034095792969067, "primary_score": 0.2}, "task_idx": 3} {"task_name": "mmlu_clinical_knowledge:mc", "task_hash": "221ee08c4359ce7072b8d66f1c37f500", "model_hash": "03418cf8091a9882619950ffb07429a5", "model_config": {"model": "hf_checkpoints/dclm-pool-1b-1x-h-fasttext-open_lm_1b_swiglutorch-warm5000-lr0p003-wd0p033-cd3e-05-bs256-mult1-seed124-tokens28795904000", "revision": null, "trust_remote_code": null, "max_length": 2048, "model_path": null, "model_type": "hf"}, "task_config": {"task_name": "mmlu_clinical_knowledge:mc", "task_core": "mmlu_clinical_knowledge", "limit": null, "split": "test", "num_shots": 5, "fewshot_seed": 1234, "primary_metric": "acc_raw", "random_subsample_seed": 1234, "context_kwargs": {"no_label_prefix_space": false}, "generation_kwargs": null, "metric_kwargs": null, "native_id_field": "index", "fewshot_source": null, "dataset_path": "cais/mmlu", "dataset_name": "clinical_knowledge", "use_chat_format": null, "version": 1, "revision": null, "metadata": {"regimes": ["OLMES-v0.1"], "alias": "mmlu_clinical_knowledge:mc::olmes"}}, "compute_config": {"batch_size": "16", "max_batch_size": 32, "output_dir": "hf_checkpoints/dclm-pool-1b-1x-h-fasttext-open_lm_1b_swiglutorch-warm5000-lr0p003-wd0p033-cd3e-05-bs256-mult1-seed124-tokens28795904000/olmes_fp32/mmlu", "num_recorded_inputs": 3, "save_raw_requests": true, "recompute_metrics": false, "check_datalake": false, "push_datalake": false, "no_datalake": false}, "processing_time": 7.778644561767578, "current_date": "2024-11-19 21:10:31 UTC", "num_instances": 265, "beaker_info": {}, "metrics": {"acc_raw": 0.21132075471698114, "acc_per_token": 0.21132075471698114, "acc_per_char": 0.21132075471698114, "correct_loss_raw": 1.4041231951623592, "incorrect_loss_raw": 1.3956608722044994, "correct_loss_per_token": 1.4041231951623592, "incorrect_loss_per_token": 1.3956608722044994, "correct_loss_per_char": 0.7020615975811796, "incorrect_loss_per_char": 0.6978304361022497, "primary_score": 0.21132075471698114}, "task_idx": 4} {"task_name": "mmlu_college_biology:mc", "task_hash": "aaf0bf4441359de8ffba70cefb786807", "model_hash": "03418cf8091a9882619950ffb07429a5", "model_config": {"model": "hf_checkpoints/dclm-pool-1b-1x-h-fasttext-open_lm_1b_swiglutorch-warm5000-lr0p003-wd0p033-cd3e-05-bs256-mult1-seed124-tokens28795904000", "revision": null, "trust_remote_code": null, "max_length": 2048, "model_path": null, "model_type": "hf"}, "task_config": {"task_name": "mmlu_college_biology:mc", "task_core": "mmlu_college_biology", "limit": null, "split": "test", "num_shots": 5, "fewshot_seed": 1234, "primary_metric": "acc_raw", "random_subsample_seed": 1234, "context_kwargs": {"no_label_prefix_space": false}, "generation_kwargs": null, "metric_kwargs": null, "native_id_field": "index", "fewshot_source": null, "dataset_path": "cais/mmlu", "dataset_name": "college_biology", "use_chat_format": null, "version": 1, "revision": null, "metadata": {"regimes": ["OLMES-v0.1"], "alias": "mmlu_college_biology:mc::olmes"}}, "compute_config": {"batch_size": "16", "max_batch_size": 32, "output_dir": "hf_checkpoints/dclm-pool-1b-1x-h-fasttext-open_lm_1b_swiglutorch-warm5000-lr0p003-wd0p033-cd3e-05-bs256-mult1-seed124-tokens28795904000/olmes_fp32/mmlu", "num_recorded_inputs": 3, "save_raw_requests": true, "recompute_metrics": false, "check_datalake": false, "push_datalake": false, "no_datalake": false}, "processing_time": 5.012893915176392, "current_date": "2024-11-19 21:10:39 UTC", "num_instances": 144, "beaker_info": {}, "metrics": {"acc_raw": 0.2013888888888889, "acc_per_token": 0.2013888888888889, "acc_per_char": 0.2013888888888889, "correct_loss_raw": 1.3990219914250903, "incorrect_loss_raw": 1.4011454527024867, "correct_loss_per_token": 1.3990219914250903, "incorrect_loss_per_token": 1.4011454527024867, "correct_loss_per_char": 0.6995109957125452, "incorrect_loss_per_char": 0.7005727263512433, "primary_score": 0.2013888888888889}, "task_idx": 5} {"task_name": "mmlu_college_chemistry:mc", "task_hash": "1980c88e607a6dea06d45f27c60e3365", "model_hash": "03418cf8091a9882619950ffb07429a5", "model_config": {"model": "hf_checkpoints/dclm-pool-1b-1x-h-fasttext-open_lm_1b_swiglutorch-warm5000-lr0p003-wd0p033-cd3e-05-bs256-mult1-seed124-tokens28795904000", "revision": null, "trust_remote_code": null, "max_length": 2048, "model_path": null, "model_type": "hf"}, "task_config": {"task_name": "mmlu_college_chemistry:mc", "task_core": "mmlu_college_chemistry", "limit": null, "split": "test", "num_shots": 5, "fewshot_seed": 1234, "primary_metric": "acc_raw", "random_subsample_seed": 1234, "context_kwargs": {"no_label_prefix_space": false}, "generation_kwargs": null, "metric_kwargs": null, "native_id_field": "index", "fewshot_source": null, "dataset_path": "cais/mmlu", "dataset_name": "college_chemistry", "use_chat_format": null, "version": 1, "revision": null, "metadata": {"regimes": ["OLMES-v0.1"], "alias": "mmlu_college_chemistry:mc::olmes"}}, "compute_config": {"batch_size": "16", "max_batch_size": 32, "output_dir": "hf_checkpoints/dclm-pool-1b-1x-h-fasttext-open_lm_1b_swiglutorch-warm5000-lr0p003-wd0p033-cd3e-05-bs256-mult1-seed124-tokens28795904000/olmes_fp32/mmlu", "num_recorded_inputs": 3, "save_raw_requests": true, "recompute_metrics": false, "check_datalake": false, "push_datalake": false, "no_datalake": false}, "processing_time": 3.94285249710083, "current_date": "2024-11-19 21:10:44 UTC", "num_instances": 100, "beaker_info": {}, "metrics": {"acc_raw": 0.21, "acc_per_token": 0.21, "acc_per_char": 0.21, "correct_loss_raw": 1.4002672040462494, "incorrect_loss_raw": 1.3961779685815174, "correct_loss_per_token": 1.4002672040462494, "incorrect_loss_per_token": 1.3961779685815174, "correct_loss_per_char": 0.7001336020231247, "incorrect_loss_per_char": 0.6980889842907587, "primary_score": 0.21}, "task_idx": 6} {"task_name": "mmlu_college_computer_science:mc", "task_hash": "9d5570c603bbcb33a0727904a22ef997", "model_hash": "03418cf8091a9882619950ffb07429a5", "model_config": {"model": "hf_checkpoints/dclm-pool-1b-1x-h-fasttext-open_lm_1b_swiglutorch-warm5000-lr0p003-wd0p033-cd3e-05-bs256-mult1-seed124-tokens28795904000", "revision": null, "trust_remote_code": null, "max_length": 2048, "model_path": null, "model_type": "hf"}, "task_config": {"task_name": "mmlu_college_computer_science:mc", "task_core": "mmlu_college_computer_science", "limit": null, "split": "test", "num_shots": 5, "fewshot_seed": 1234, "primary_metric": "acc_raw", "random_subsample_seed": 1234, "context_kwargs": {"no_label_prefix_space": false}, "generation_kwargs": null, "metric_kwargs": null, "native_id_field": "index", "fewshot_source": null, "dataset_path": "cais/mmlu", "dataset_name": "college_computer_science", "use_chat_format": null, "version": 1, "revision": null, "metadata": {"regimes": ["OLMES-v0.1"], "alias": "mmlu_college_computer_science:mc::olmes"}}, "compute_config": {"batch_size": "16", "max_batch_size": 32, "output_dir": "hf_checkpoints/dclm-pool-1b-1x-h-fasttext-open_lm_1b_swiglutorch-warm5000-lr0p003-wd0p033-cd3e-05-bs256-mult1-seed124-tokens28795904000/olmes_fp32/mmlu", "num_recorded_inputs": 3, "save_raw_requests": true, "recompute_metrics": false, "check_datalake": false, "push_datalake": false, "no_datalake": false}, "processing_time": 6.416605710983276, "current_date": "2024-11-19 21:10:48 UTC", "num_instances": 100, "beaker_info": {}, "metrics": {"acc_raw": 0.34, "acc_per_token": 0.34, "acc_per_char": 0.34, "correct_loss_raw": 1.3831730151176453, "incorrect_loss_raw": 1.4183711288372678, "correct_loss_per_token": 1.3831730151176453, "incorrect_loss_per_token": 1.4183711288372678, "correct_loss_per_char": 0.6915865075588227, "incorrect_loss_per_char": 0.7091855644186339, "primary_score": 0.34}, "task_idx": 7} {"task_name": "mmlu_college_mathematics:mc", "task_hash": "264fbafdeceacfd7588ca20ca3546113", "model_hash": "03418cf8091a9882619950ffb07429a5", "model_config": {"model": "hf_checkpoints/dclm-pool-1b-1x-h-fasttext-open_lm_1b_swiglutorch-warm5000-lr0p003-wd0p033-cd3e-05-bs256-mult1-seed124-tokens28795904000", "revision": null, "trust_remote_code": null, "max_length": 2048, "model_path": null, "model_type": "hf"}, "task_config": {"task_name": "mmlu_college_mathematics:mc", "task_core": "mmlu_college_mathematics", "limit": null, "split": "test", "num_shots": 5, "fewshot_seed": 1234, "primary_metric": "acc_raw", "random_subsample_seed": 1234, "context_kwargs": {"no_label_prefix_space": false}, "generation_kwargs": null, "metric_kwargs": null, "native_id_field": "index", "fewshot_source": null, "dataset_path": "cais/mmlu", "dataset_name": "college_mathematics", "use_chat_format": null, "version": 1, "revision": null, "metadata": {"regimes": ["OLMES-v0.1"], "alias": "mmlu_college_mathematics:mc::olmes"}}, "compute_config": {"batch_size": "16", "max_batch_size": 32, "output_dir": "hf_checkpoints/dclm-pool-1b-1x-h-fasttext-open_lm_1b_swiglutorch-warm5000-lr0p003-wd0p033-cd3e-05-bs256-mult1-seed124-tokens28795904000/olmes_fp32/mmlu", "num_recorded_inputs": 3, "save_raw_requests": true, "recompute_metrics": false, "check_datalake": false, "push_datalake": false, "no_datalake": false}, "processing_time": 4.413100004196167, "current_date": "2024-11-19 21:10:54 UTC", "num_instances": 100, "beaker_info": {}, "metrics": {"acc_raw": 0.33, "acc_per_token": 0.33, "acc_per_char": 0.33, "correct_loss_raw": 1.3795828753709793, "incorrect_loss_raw": 1.4324306764205301, "correct_loss_per_token": 1.3795828753709793, "incorrect_loss_per_token": 1.4324306764205301, "correct_loss_per_char": 0.6897914376854897, "incorrect_loss_per_char": 0.7162153382102651, "primary_score": 0.33}, "task_idx": 8} {"task_name": "mmlu_college_medicine:mc", "task_hash": "9b3c95bd3bbac8771701a5abc3ab28ba", "model_hash": "03418cf8091a9882619950ffb07429a5", "model_config": {"model": "hf_checkpoints/dclm-pool-1b-1x-h-fasttext-open_lm_1b_swiglutorch-warm5000-lr0p003-wd0p033-cd3e-05-bs256-mult1-seed124-tokens28795904000", "revision": null, "trust_remote_code": null, "max_length": 2048, "model_path": null, "model_type": "hf"}, "task_config": {"task_name": "mmlu_college_medicine:mc", "task_core": "mmlu_college_medicine", "limit": null, "split": "test", "num_shots": 5, "fewshot_seed": 1234, "primary_metric": "acc_raw", "random_subsample_seed": 1234, "context_kwargs": {"no_label_prefix_space": false}, "generation_kwargs": null, "metric_kwargs": null, "native_id_field": "index", "fewshot_source": null, "dataset_path": "cais/mmlu", "dataset_name": "college_medicine", "use_chat_format": null, "version": 1, "revision": null, "metadata": {"regimes": ["OLMES-v0.1"], "alias": "mmlu_college_medicine:mc::olmes"}}, "compute_config": {"batch_size": "16", "max_batch_size": 32, "output_dir": "hf_checkpoints/dclm-pool-1b-1x-h-fasttext-open_lm_1b_swiglutorch-warm5000-lr0p003-wd0p033-cd3e-05-bs256-mult1-seed124-tokens28795904000/olmes_fp32/mmlu", "num_recorded_inputs": 3, "save_raw_requests": true, "recompute_metrics": false, "check_datalake": false, "push_datalake": false, "no_datalake": false}, "processing_time": 7.131869077682495, "current_date": "2024-11-19 21:10:59 UTC", "num_instances": 173, "beaker_info": {}, "metrics": {"acc_raw": 0.23121387283236994, "acc_per_token": 0.23121387283236994, "acc_per_char": 0.23121387283236994, "correct_loss_raw": 1.4082309800076347, "incorrect_loss_raw": 1.3981844719664442, "correct_loss_per_token": 1.4082309800076347, "incorrect_loss_per_token": 1.3981844719664442, "correct_loss_per_char": 0.7041154900038173, "incorrect_loss_per_char": 0.6990922359832221, "primary_score": 0.23121387283236994}, "task_idx": 9} {"task_name": "mmlu_college_physics:mc", "task_hash": "2c97b2d8aac8dff8cd2656474c1dfb86", "model_hash": "03418cf8091a9882619950ffb07429a5", "model_config": {"model": "hf_checkpoints/dclm-pool-1b-1x-h-fasttext-open_lm_1b_swiglutorch-warm5000-lr0p003-wd0p033-cd3e-05-bs256-mult1-seed124-tokens28795904000", "revision": null, "trust_remote_code": null, "max_length": 2048, "model_path": null, "model_type": "hf"}, "task_config": {"task_name": "mmlu_college_physics:mc", "task_core": "mmlu_college_physics", "limit": null, "split": "test", "num_shots": 5, "fewshot_seed": 1234, "primary_metric": "acc_raw", "random_subsample_seed": 1234, "context_kwargs": {"no_label_prefix_space": false}, "generation_kwargs": null, "metric_kwargs": null, "native_id_field": "index", "fewshot_source": null, "dataset_path": "cais/mmlu", "dataset_name": "college_physics", "use_chat_format": null, "version": 1, "revision": null, "metadata": {"regimes": ["OLMES-v0.1"], "alias": "mmlu_college_physics:mc::olmes"}}, "compute_config": {"batch_size": "16", "max_batch_size": 32, "output_dir": "hf_checkpoints/dclm-pool-1b-1x-h-fasttext-open_lm_1b_swiglutorch-warm5000-lr0p003-wd0p033-cd3e-05-bs256-mult1-seed124-tokens28795904000/olmes_fp32/mmlu", "num_recorded_inputs": 3, "save_raw_requests": true, "recompute_metrics": false, "check_datalake": false, "push_datalake": false, "no_datalake": false}, "processing_time": 3.303727149963379, "current_date": "2024-11-19 21:11:06 UTC", "num_instances": 102, "beaker_info": {}, "metrics": {"acc_raw": 0.23529411764705882, "acc_per_token": 0.23529411764705882, "acc_per_char": 0.23529411764705882, "correct_loss_raw": 1.3708963253918816, "incorrect_loss_raw": 1.408653272522821, "correct_loss_per_token": 1.3708963253918816, "incorrect_loss_per_token": 1.408653272522821, "correct_loss_per_char": 0.6854481626959408, "incorrect_loss_per_char": 0.7043266362614105, "primary_score": 0.23529411764705882}, "task_idx": 10} {"task_name": "mmlu_computer_security:mc", "task_hash": "6d7c3f721bf97797f0e660d896f4585b", "model_hash": "03418cf8091a9882619950ffb07429a5", "model_config": {"model": "hf_checkpoints/dclm-pool-1b-1x-h-fasttext-open_lm_1b_swiglutorch-warm5000-lr0p003-wd0p033-cd3e-05-bs256-mult1-seed124-tokens28795904000", "revision": null, "trust_remote_code": null, "max_length": 2048, "model_path": null, "model_type": "hf"}, "task_config": {"task_name": "mmlu_computer_security:mc", "task_core": "mmlu_computer_security", "limit": null, "split": "test", "num_shots": 5, "fewshot_seed": 1234, "primary_metric": "acc_raw", "random_subsample_seed": 1234, "context_kwargs": {"no_label_prefix_space": false}, "generation_kwargs": null, "metric_kwargs": null, "native_id_field": "index", "fewshot_source": null, "dataset_path": "cais/mmlu", "dataset_name": "computer_security", "use_chat_format": null, "version": 1, "revision": null, "metadata": {"regimes": ["OLMES-v0.1"], "alias": "mmlu_computer_security:mc::olmes"}}, "compute_config": {"batch_size": "16", "max_batch_size": 32, "output_dir": "hf_checkpoints/dclm-pool-1b-1x-h-fasttext-open_lm_1b_swiglutorch-warm5000-lr0p003-wd0p033-cd3e-05-bs256-mult1-seed124-tokens28795904000/olmes_fp32/mmlu", "num_recorded_inputs": 3, "save_raw_requests": true, "recompute_metrics": false, "check_datalake": false, "push_datalake": false, "no_datalake": false}, "processing_time": 3.0471487045288086, "current_date": "2024-11-19 21:11:09 UTC", "num_instances": 100, "beaker_info": {}, "metrics": {"acc_raw": 0.21, "acc_per_token": 0.21, "acc_per_char": 0.21, "correct_loss_raw": 1.4048889875411987, "incorrect_loss_raw": 1.404950787623724, "correct_loss_per_token": 1.4048889875411987, "incorrect_loss_per_token": 1.404950787623724, "correct_loss_per_char": 0.7024444937705994, "incorrect_loss_per_char": 0.702475393811862, "primary_score": 0.21}, "task_idx": 11} {"task_name": "mmlu_conceptual_physics:mc", "task_hash": "ffbb5f78c71ff87a70f5b59d313a380d", "model_hash": "03418cf8091a9882619950ffb07429a5", "model_config": {"model": "hf_checkpoints/dclm-pool-1b-1x-h-fasttext-open_lm_1b_swiglutorch-warm5000-lr0p003-wd0p033-cd3e-05-bs256-mult1-seed124-tokens28795904000", "revision": null, "trust_remote_code": null, "max_length": 2048, "model_path": null, "model_type": "hf"}, "task_config": {"task_name": "mmlu_conceptual_physics:mc", "task_core": "mmlu_conceptual_physics", "limit": null, "split": "test", "num_shots": 5, "fewshot_seed": 1234, "primary_metric": "acc_raw", "random_subsample_seed": 1234, "context_kwargs": {"no_label_prefix_space": false}, "generation_kwargs": null, "metric_kwargs": null, "native_id_field": "index", "fewshot_source": null, "dataset_path": "cais/mmlu", "dataset_name": "conceptual_physics", "use_chat_format": null, "version": 1, "revision": null, "metadata": {"regimes": ["OLMES-v0.1"], "alias": "mmlu_conceptual_physics:mc::olmes"}}, "compute_config": {"batch_size": "16", "max_batch_size": 32, "output_dir": "hf_checkpoints/dclm-pool-1b-1x-h-fasttext-open_lm_1b_swiglutorch-warm5000-lr0p003-wd0p033-cd3e-05-bs256-mult1-seed124-tokens28795904000/olmes_fp32/mmlu", "num_recorded_inputs": 3, "save_raw_requests": true, "recompute_metrics": false, "check_datalake": false, "push_datalake": false, "no_datalake": false}, "processing_time": 5.555780649185181, "current_date": "2024-11-19 21:11:12 UTC", "num_instances": 235, "beaker_info": {}, "metrics": {"acc_raw": 0.33191489361702126, "acc_per_token": 0.33191489361702126, "acc_per_char": 0.33191489361702126, "correct_loss_raw": 1.3766544839169117, "incorrect_loss_raw": 1.4139737217138846, "correct_loss_per_token": 1.3766544839169117, "incorrect_loss_per_token": 1.4139737217138846, "correct_loss_per_char": 0.6883272419584558, "incorrect_loss_per_char": 0.7069868608569423, "primary_score": 0.33191489361702126}, "task_idx": 12} {"task_name": "mmlu_econometrics:mc", "task_hash": "c69ca4807df1205e806299e8e20218af", "model_hash": "03418cf8091a9882619950ffb07429a5", "model_config": {"model": "hf_checkpoints/dclm-pool-1b-1x-h-fasttext-open_lm_1b_swiglutorch-warm5000-lr0p003-wd0p033-cd3e-05-bs256-mult1-seed124-tokens28795904000", "revision": null, "trust_remote_code": null, "max_length": 2048, "model_path": null, "model_type": "hf"}, "task_config": {"task_name": "mmlu_econometrics:mc", "task_core": "mmlu_econometrics", "limit": null, "split": "test", "num_shots": 5, "fewshot_seed": 1234, "primary_metric": "acc_raw", "random_subsample_seed": 1234, "context_kwargs": {"no_label_prefix_space": false}, "generation_kwargs": null, "metric_kwargs": null, "native_id_field": "index", "fewshot_source": null, "dataset_path": "cais/mmlu", "dataset_name": "econometrics", "use_chat_format": null, "version": 1, "revision": null, "metadata": {"regimes": ["OLMES-v0.1"], "alias": "mmlu_econometrics:mc::olmes"}}, "compute_config": {"batch_size": "16", "max_batch_size": 32, "output_dir": "hf_checkpoints/dclm-pool-1b-1x-h-fasttext-open_lm_1b_swiglutorch-warm5000-lr0p003-wd0p033-cd3e-05-bs256-mult1-seed124-tokens28795904000/olmes_fp32/mmlu", "num_recorded_inputs": 3, "save_raw_requests": true, "recompute_metrics": false, "check_datalake": false, "push_datalake": false, "no_datalake": false}, "processing_time": 5.1991126537323, "current_date": "2024-11-19 21:11:18 UTC", "num_instances": 114, "beaker_info": {}, "metrics": {"acc_raw": 0.21052631578947367, "acc_per_token": 0.21052631578947367, "acc_per_char": 0.21052631578947367, "correct_loss_raw": 1.4257322455707349, "incorrect_loss_raw": 1.4038150355829833, "correct_loss_per_token": 1.4257322455707349, "incorrect_loss_per_token": 1.4038150355829833, "correct_loss_per_char": 0.7128661227853674, "incorrect_loss_per_char": 0.7019075177914916, "primary_score": 0.21052631578947367}, "task_idx": 13} {"task_name": "mmlu_electrical_engineering:mc", "task_hash": "c279f61638992683680ca9604e20fa4d", "model_hash": "03418cf8091a9882619950ffb07429a5", "model_config": {"model": "hf_checkpoints/dclm-pool-1b-1x-h-fasttext-open_lm_1b_swiglutorch-warm5000-lr0p003-wd0p033-cd3e-05-bs256-mult1-seed124-tokens28795904000", "revision": null, "trust_remote_code": null, "max_length": 2048, "model_path": null, "model_type": "hf"}, "task_config": {"task_name": "mmlu_electrical_engineering:mc", "task_core": "mmlu_electrical_engineering", "limit": null, "split": "test", "num_shots": 5, "fewshot_seed": 1234, "primary_metric": "acc_raw", "random_subsample_seed": 1234, "context_kwargs": {"no_label_prefix_space": false}, "generation_kwargs": null, "metric_kwargs": null, "native_id_field": "index", "fewshot_source": null, "dataset_path": "cais/mmlu", "dataset_name": "electrical_engineering", "use_chat_format": null, "version": 1, "revision": null, "metadata": {"regimes": ["OLMES-v0.1"], "alias": "mmlu_electrical_engineering:mc::olmes"}}, "compute_config": {"batch_size": "16", "max_batch_size": 32, "output_dir": "hf_checkpoints/dclm-pool-1b-1x-h-fasttext-open_lm_1b_swiglutorch-warm5000-lr0p003-wd0p033-cd3e-05-bs256-mult1-seed124-tokens28795904000/olmes_fp32/mmlu", "num_recorded_inputs": 3, "save_raw_requests": true, "recompute_metrics": false, "check_datalake": false, "push_datalake": false, "no_datalake": false}, "processing_time": 4.6134796142578125, "current_date": "2024-11-19 21:11:23 UTC", "num_instances": 145, "beaker_info": {}, "metrics": {"acc_raw": 0.2206896551724138, "acc_per_token": 0.2206896551724138, "acc_per_char": 0.2206896551724138, "correct_loss_raw": 1.4267057862775079, "incorrect_loss_raw": 1.3974592317109829, "correct_loss_per_token": 1.4267057862775079, "incorrect_loss_per_token": 1.3974592317109829, "correct_loss_per_char": 0.7133528931387539, "incorrect_loss_per_char": 0.6987296158554914, "primary_score": 0.2206896551724138}, "task_idx": 14} {"task_name": "mmlu_elementary_mathematics:mc", "task_hash": "35b6f0933f711770d09fb00b45905c5c", "model_hash": "03418cf8091a9882619950ffb07429a5", "model_config": {"model": "hf_checkpoints/dclm-pool-1b-1x-h-fasttext-open_lm_1b_swiglutorch-warm5000-lr0p003-wd0p033-cd3e-05-bs256-mult1-seed124-tokens28795904000", "revision": null, "trust_remote_code": null, "max_length": 2048, "model_path": null, "model_type": "hf"}, "task_config": {"task_name": "mmlu_elementary_mathematics:mc", "task_core": "mmlu_elementary_mathematics", "limit": null, "split": "test", "num_shots": 5, "fewshot_seed": 1234, "primary_metric": "acc_raw", "random_subsample_seed": 1234, "context_kwargs": {"no_label_prefix_space": false}, "generation_kwargs": null, "metric_kwargs": null, "native_id_field": "index", "fewshot_source": null, "dataset_path": "cais/mmlu", "dataset_name": "elementary_mathematics", "use_chat_format": null, "version": 1, "revision": null, "metadata": {"regimes": ["OLMES-v0.1"], "alias": "mmlu_elementary_mathematics:mc::olmes"}}, "compute_config": {"batch_size": "16", "max_batch_size": 32, "output_dir": "hf_checkpoints/dclm-pool-1b-1x-h-fasttext-open_lm_1b_swiglutorch-warm5000-lr0p003-wd0p033-cd3e-05-bs256-mult1-seed124-tokens28795904000/olmes_fp32/mmlu", "num_recorded_inputs": 3, "save_raw_requests": true, "recompute_metrics": false, "check_datalake": false, "push_datalake": false, "no_datalake": false}, "processing_time": 13.865192413330078, "current_date": "2024-11-19 21:11:28 UTC", "num_instances": 378, "beaker_info": {}, "metrics": {"acc_raw": 0.2698412698412698, "acc_per_token": 0.2698412698412698, "acc_per_char": 0.2698412698412698, "correct_loss_raw": 1.40157401782495, "incorrect_loss_raw": 1.3962224588928183, "correct_loss_per_token": 1.40157401782495, "incorrect_loss_per_token": 1.3962224588928183, "correct_loss_per_char": 0.700787008912475, "incorrect_loss_per_char": 0.6981112294464091, "primary_score": 0.2698412698412698}, "task_idx": 15} {"task_name": "mmlu_formal_logic:mc", "task_hash": "74d8e6a1f297e0274243d2bbb7df4d1b", "model_hash": "03418cf8091a9882619950ffb07429a5", "model_config": {"model": "hf_checkpoints/dclm-pool-1b-1x-h-fasttext-open_lm_1b_swiglutorch-warm5000-lr0p003-wd0p033-cd3e-05-bs256-mult1-seed124-tokens28795904000", "revision": null, "trust_remote_code": null, "max_length": 2048, "model_path": null, "model_type": "hf"}, "task_config": {"task_name": "mmlu_formal_logic:mc", "task_core": "mmlu_formal_logic", "limit": null, "split": "test", "num_shots": 5, "fewshot_seed": 1234, "primary_metric": "acc_raw", "random_subsample_seed": 1234, "context_kwargs": {"no_label_prefix_space": false}, "generation_kwargs": null, "metric_kwargs": null, "native_id_field": "index", "fewshot_source": null, "dataset_path": "cais/mmlu", "dataset_name": "formal_logic", "use_chat_format": null, "version": 1, "revision": null, "metadata": {"regimes": ["OLMES-v0.1"], "alias": "mmlu_formal_logic:mc::olmes"}}, "compute_config": {"batch_size": "16", "max_batch_size": 32, "output_dir": "hf_checkpoints/dclm-pool-1b-1x-h-fasttext-open_lm_1b_swiglutorch-warm5000-lr0p003-wd0p033-cd3e-05-bs256-mult1-seed124-tokens28795904000/olmes_fp32/mmlu", "num_recorded_inputs": 3, "save_raw_requests": true, "recompute_metrics": false, "check_datalake": false, "push_datalake": false, "no_datalake": false}, "processing_time": 6.130831003189087, "current_date": "2024-11-19 21:11:41 UTC", "num_instances": 126, "beaker_info": {}, "metrics": {"acc_raw": 0.25396825396825395, "acc_per_token": 0.25396825396825395, "acc_per_char": 0.25396825396825395, "correct_loss_raw": 1.4213087558746338, "incorrect_loss_raw": 1.4132544647448908, "correct_loss_per_token": 1.4213087558746338, "incorrect_loss_per_token": 1.4132544647448908, "correct_loss_per_char": 0.7106543779373169, "incorrect_loss_per_char": 0.7066272323724454, "primary_score": 0.25396825396825395}, "task_idx": 16} {"task_name": "mmlu_global_facts:mc", "task_hash": "4f14cfa253ea56a8d3b0d2c805ccdb28", "model_hash": "03418cf8091a9882619950ffb07429a5", "model_config": {"model": "hf_checkpoints/dclm-pool-1b-1x-h-fasttext-open_lm_1b_swiglutorch-warm5000-lr0p003-wd0p033-cd3e-05-bs256-mult1-seed124-tokens28795904000", "revision": null, "trust_remote_code": null, "max_length": 2048, "model_path": null, "model_type": "hf"}, "task_config": {"task_name": "mmlu_global_facts:mc", "task_core": "mmlu_global_facts", "limit": null, "split": "test", "num_shots": 5, "fewshot_seed": 1234, "primary_metric": "acc_raw", "random_subsample_seed": 1234, "context_kwargs": {"no_label_prefix_space": false}, "generation_kwargs": null, "metric_kwargs": null, "native_id_field": "index", "fewshot_source": null, "dataset_path": "cais/mmlu", "dataset_name": "global_facts", "use_chat_format": null, "version": 1, "revision": null, "metadata": {"regimes": ["OLMES-v0.1"], "alias": "mmlu_global_facts:mc::olmes"}}, "compute_config": {"batch_size": "16", "max_batch_size": 32, "output_dir": "hf_checkpoints/dclm-pool-1b-1x-h-fasttext-open_lm_1b_swiglutorch-warm5000-lr0p003-wd0p033-cd3e-05-bs256-mult1-seed124-tokens28795904000/olmes_fp32/mmlu", "num_recorded_inputs": 3, "save_raw_requests": true, "recompute_metrics": false, "check_datalake": false, "push_datalake": false, "no_datalake": false}, "processing_time": 3.08803653717041, "current_date": "2024-11-19 21:11:48 UTC", "num_instances": 100, "beaker_info": {}, "metrics": {"acc_raw": 0.18, "acc_per_token": 0.18, "acc_per_char": 0.18, "correct_loss_raw": 1.4756081253290176, "incorrect_loss_raw": 1.4302349406480783, "correct_loss_per_token": 1.4756081253290176, "incorrect_loss_per_token": 1.4302349406480783, "correct_loss_per_char": 0.7378040626645088, "incorrect_loss_per_char": 0.7151174703240392, "primary_score": 0.18}, "task_idx": 17} {"task_name": "mmlu_high_school_biology:mc", "task_hash": "055cfa37938a062655e6ce08f80c7765", "model_hash": "03418cf8091a9882619950ffb07429a5", "model_config": {"model": "hf_checkpoints/dclm-pool-1b-1x-h-fasttext-open_lm_1b_swiglutorch-warm5000-lr0p003-wd0p033-cd3e-05-bs256-mult1-seed124-tokens28795904000", "revision": null, "trust_remote_code": null, "max_length": 2048, "model_path": null, "model_type": "hf"}, "task_config": {"task_name": "mmlu_high_school_biology:mc", "task_core": "mmlu_high_school_biology", "limit": null, "split": "test", "num_shots": 5, "fewshot_seed": 1234, "primary_metric": "acc_raw", "random_subsample_seed": 1234, "context_kwargs": {"no_label_prefix_space": false}, "generation_kwargs": null, "metric_kwargs": null, "native_id_field": "index", "fewshot_source": null, "dataset_path": "cais/mmlu", "dataset_name": "high_school_biology", "use_chat_format": null, "version": 1, "revision": null, "metadata": {"regimes": ["OLMES-v0.1"], "alias": "mmlu_high_school_biology:mc::olmes"}}, "compute_config": {"batch_size": "16", "max_batch_size": 32, "output_dir": "hf_checkpoints/dclm-pool-1b-1x-h-fasttext-open_lm_1b_swiglutorch-warm5000-lr0p003-wd0p033-cd3e-05-bs256-mult1-seed124-tokens28795904000/olmes_fp32/mmlu", "num_recorded_inputs": 3, "save_raw_requests": true, "recompute_metrics": false, "check_datalake": false, "push_datalake": false, "no_datalake": false}, "processing_time": 11.56223464012146, "current_date": "2024-11-19 21:11:51 UTC", "num_instances": 310, "beaker_info": {}, "metrics": {"acc_raw": 0.2903225806451613, "acc_per_token": 0.2903225806451613, "acc_per_char": 0.2903225806451613, "correct_loss_raw": 1.3885792163110549, "incorrect_loss_raw": 1.40228969813675, "correct_loss_per_token": 1.3885792163110549, "incorrect_loss_per_token": 1.40228969813675, "correct_loss_per_char": 0.6942896081555274, "incorrect_loss_per_char": 0.701144849068375, "primary_score": 0.2903225806451613}, "task_idx": 18} {"task_name": "mmlu_high_school_chemistry:mc", "task_hash": "6cef5e5a35451e467b97a8cf773fb61c", "model_hash": "03418cf8091a9882619950ffb07429a5", "model_config": {"model": "hf_checkpoints/dclm-pool-1b-1x-h-fasttext-open_lm_1b_swiglutorch-warm5000-lr0p003-wd0p033-cd3e-05-bs256-mult1-seed124-tokens28795904000", "revision": null, "trust_remote_code": null, "max_length": 2048, "model_path": null, "model_type": "hf"}, "task_config": {"task_name": "mmlu_high_school_chemistry:mc", "task_core": "mmlu_high_school_chemistry", "limit": null, "split": "test", "num_shots": 5, "fewshot_seed": 1234, "primary_metric": "acc_raw", "random_subsample_seed": 1234, "context_kwargs": {"no_label_prefix_space": false}, "generation_kwargs": null, "metric_kwargs": null, "native_id_field": "index", "fewshot_source": null, "dataset_path": "cais/mmlu", "dataset_name": "high_school_chemistry", "use_chat_format": null, "version": 1, "revision": null, "metadata": {"regimes": ["OLMES-v0.1"], "alias": "mmlu_high_school_chemistry:mc::olmes"}}, "compute_config": {"batch_size": "16", "max_batch_size": 32, "output_dir": "hf_checkpoints/dclm-pool-1b-1x-h-fasttext-open_lm_1b_swiglutorch-warm5000-lr0p003-wd0p033-cd3e-05-bs256-mult1-seed124-tokens28795904000/olmes_fp32/mmlu", "num_recorded_inputs": 3, "save_raw_requests": true, "recompute_metrics": false, "check_datalake": false, "push_datalake": false, "no_datalake": false}, "processing_time": 7.159745454788208, "current_date": "2024-11-19 21:12:02 UTC", "num_instances": 203, "beaker_info": {}, "metrics": {"acc_raw": 0.2019704433497537, "acc_per_token": 0.2019704433497537, "acc_per_char": 0.2019704433497537, "correct_loss_raw": 1.3932797168863231, "incorrect_loss_raw": 1.3984861059729097, "correct_loss_per_token": 1.3932797168863231, "incorrect_loss_per_token": 1.3984861059729097, "correct_loss_per_char": 0.6966398584431616, "incorrect_loss_per_char": 0.6992430529864548, "primary_score": 0.2019704433497537}, "task_idx": 19} {"task_name": "mmlu_high_school_computer_science:mc", "task_hash": "31a39a79632638f209cd0a9c599f158d", "model_hash": "03418cf8091a9882619950ffb07429a5", "model_config": {"model": "hf_checkpoints/dclm-pool-1b-1x-h-fasttext-open_lm_1b_swiglutorch-warm5000-lr0p003-wd0p033-cd3e-05-bs256-mult1-seed124-tokens28795904000", "revision": null, "trust_remote_code": null, "max_length": 2048, "model_path": null, "model_type": "hf"}, "task_config": {"task_name": "mmlu_high_school_computer_science:mc", "task_core": "mmlu_high_school_computer_science", "limit": null, "split": "test", "num_shots": 5, "fewshot_seed": 1234, "primary_metric": "acc_raw", "random_subsample_seed": 1234, "context_kwargs": {"no_label_prefix_space": false}, "generation_kwargs": null, "metric_kwargs": null, "native_id_field": "index", "fewshot_source": null, "dataset_path": "cais/mmlu", "dataset_name": "high_school_computer_science", "use_chat_format": null, "version": 1, "revision": null, "metadata": {"regimes": ["OLMES-v0.1"], "alias": "mmlu_high_school_computer_science:mc::olmes"}}, "compute_config": {"batch_size": "16", "max_batch_size": 32, "output_dir": "hf_checkpoints/dclm-pool-1b-1x-h-fasttext-open_lm_1b_swiglutorch-warm5000-lr0p003-wd0p033-cd3e-05-bs256-mult1-seed124-tokens28795904000/olmes_fp32/mmlu", "num_recorded_inputs": 3, "save_raw_requests": true, "recompute_metrics": false, "check_datalake": false, "push_datalake": false, "no_datalake": false}, "processing_time": 6.891598463058472, "current_date": "2024-11-19 21:12:09 UTC", "num_instances": 100, "beaker_info": {}, "metrics": {"acc_raw": 0.26, "acc_per_token": 0.26, "acc_per_char": 0.26, "correct_loss_raw": 1.3843735754489899, "incorrect_loss_raw": 1.4085347572962434, "correct_loss_per_token": 1.3843735754489899, "incorrect_loss_per_token": 1.4085347572962434, "correct_loss_per_char": 0.6921867877244949, "incorrect_loss_per_char": 0.7042673786481217, "primary_score": 0.26}, "task_idx": 20} {"task_name": "mmlu_high_school_european_history:mc", "task_hash": "e8f2a29738091af55efa8a7194452ac2", "model_hash": "03418cf8091a9882619950ffb07429a5", "model_config": {"model": "hf_checkpoints/dclm-pool-1b-1x-h-fasttext-open_lm_1b_swiglutorch-warm5000-lr0p003-wd0p033-cd3e-05-bs256-mult1-seed124-tokens28795904000", "revision": null, "trust_remote_code": null, "max_length": 2048, "model_path": null, "model_type": "hf"}, "task_config": {"task_name": "mmlu_high_school_european_history:mc", "task_core": "mmlu_high_school_european_history", "limit": null, "split": "test", "num_shots": 5, "fewshot_seed": 1234, "primary_metric": "acc_raw", "random_subsample_seed": 1234, "context_kwargs": {"no_label_prefix_space": false}, "generation_kwargs": null, "metric_kwargs": null, "native_id_field": "index", "fewshot_source": null, "dataset_path": "cais/mmlu", "dataset_name": "high_school_european_history", "use_chat_format": null, "version": 1, "revision": null, "metadata": {"regimes": ["OLMES-v0.1"], "alias": "mmlu_high_school_european_history:mc::olmes"}}, "compute_config": {"batch_size": "16", "max_batch_size": 32, "output_dir": "hf_checkpoints/dclm-pool-1b-1x-h-fasttext-open_lm_1b_swiglutorch-warm5000-lr0p003-wd0p033-cd3e-05-bs256-mult1-seed124-tokens28795904000/olmes_fp32/mmlu", "num_recorded_inputs": 3, "save_raw_requests": true, "recompute_metrics": false, "check_datalake": false, "push_datalake": false, "no_datalake": false}, "processing_time": 27.430134773254395, "current_date": "2024-11-19 21:12:16 UTC", "num_instances": 165, "beaker_info": {}, "metrics": {"acc_raw": 0.2, "acc_per_token": 0.2, "acc_per_char": 0.2, "correct_loss_raw": 1.4093764514634104, "incorrect_loss_raw": 1.3966222804002089, "correct_loss_per_token": 1.4093764514634104, "incorrect_loss_per_token": 1.3966222804002089, "correct_loss_per_char": 0.7046882257317052, "incorrect_loss_per_char": 0.6983111402001044, "primary_score": 0.2}, "task_idx": 21} {"task_name": "mmlu_high_school_geography:mc", "task_hash": "6a43a92b543ec77afeeda9d5011e0c36", "model_hash": "03418cf8091a9882619950ffb07429a5", "model_config": {"model": "hf_checkpoints/dclm-pool-1b-1x-h-fasttext-open_lm_1b_swiglutorch-warm5000-lr0p003-wd0p033-cd3e-05-bs256-mult1-seed124-tokens28795904000", "revision": null, "trust_remote_code": null, "max_length": 2048, "model_path": null, "model_type": "hf"}, "task_config": {"task_name": "mmlu_high_school_geography:mc", "task_core": "mmlu_high_school_geography", "limit": null, "split": "test", "num_shots": 5, "fewshot_seed": 1234, "primary_metric": "acc_raw", "random_subsample_seed": 1234, "context_kwargs": {"no_label_prefix_space": false}, "generation_kwargs": null, "metric_kwargs": null, "native_id_field": "index", "fewshot_source": null, "dataset_path": "cais/mmlu", "dataset_name": "high_school_geography", "use_chat_format": null, "version": 1, "revision": null, "metadata": {"regimes": ["OLMES-v0.1"], "alias": "mmlu_high_school_geography:mc::olmes"}}, "compute_config": {"batch_size": "16", "max_batch_size": 32, "output_dir": "hf_checkpoints/dclm-pool-1b-1x-h-fasttext-open_lm_1b_swiglutorch-warm5000-lr0p003-wd0p033-cd3e-05-bs256-mult1-seed124-tokens28795904000/olmes_fp32/mmlu", "num_recorded_inputs": 3, "save_raw_requests": true, "recompute_metrics": false, "check_datalake": false, "push_datalake": false, "no_datalake": false}, "processing_time": 6.009889602661133, "current_date": "2024-11-19 21:12:44 UTC", "num_instances": 198, "beaker_info": {}, "metrics": {"acc_raw": 0.18686868686868688, "acc_per_token": 0.18686868686868688, "acc_per_char": 0.18686868686868688, "correct_loss_raw": 1.4153487002006684, "incorrect_loss_raw": 1.3935150254254387, "correct_loss_per_token": 1.4153487002006684, "incorrect_loss_per_token": 1.3935150254254387, "correct_loss_per_char": 0.7076743501003342, "incorrect_loss_per_char": 0.6967575127127194, "primary_score": 0.18686868686868688}, "task_idx": 22} {"task_name": "mmlu_high_school_government_and_politics:mc", "task_hash": "65cdc0b1dc4018c2017fc6023e9bb862", "model_hash": "03418cf8091a9882619950ffb07429a5", "model_config": {"model": "hf_checkpoints/dclm-pool-1b-1x-h-fasttext-open_lm_1b_swiglutorch-warm5000-lr0p003-wd0p033-cd3e-05-bs256-mult1-seed124-tokens28795904000", "revision": null, "trust_remote_code": null, "max_length": 2048, "model_path": null, "model_type": "hf"}, "task_config": {"task_name": "mmlu_high_school_government_and_politics:mc", "task_core": "mmlu_high_school_government_and_politics", "limit": null, "split": "test", "num_shots": 5, "fewshot_seed": 1234, "primary_metric": "acc_raw", "random_subsample_seed": 1234, "context_kwargs": {"no_label_prefix_space": false}, "generation_kwargs": null, "metric_kwargs": null, "native_id_field": "index", "fewshot_source": null, "dataset_path": "cais/mmlu", "dataset_name": "high_school_government_and_politics", "use_chat_format": null, "version": 1, "revision": null, "metadata": {"regimes": ["OLMES-v0.1"], "alias": "mmlu_high_school_government_and_politics:mc::olmes"}}, "compute_config": {"batch_size": "16", "max_batch_size": 32, "output_dir": "hf_checkpoints/dclm-pool-1b-1x-h-fasttext-open_lm_1b_swiglutorch-warm5000-lr0p003-wd0p033-cd3e-05-bs256-mult1-seed124-tokens28795904000/olmes_fp32/mmlu", "num_recorded_inputs": 3, "save_raw_requests": true, "recompute_metrics": false, "check_datalake": false, "push_datalake": false, "no_datalake": false}, "processing_time": 7.022237539291382, "current_date": "2024-11-19 21:12:50 UTC", "num_instances": 193, "beaker_info": {}, "metrics": {"acc_raw": 0.3005181347150259, "acc_per_token": 0.3005181347150259, "acc_per_char": 0.3005181347150259, "correct_loss_raw": 1.3870787317888724, "incorrect_loss_raw": 1.4027198924713595, "correct_loss_per_token": 1.3870787317888724, "incorrect_loss_per_token": 1.4027198924713595, "correct_loss_per_char": 0.6935393658944362, "incorrect_loss_per_char": 0.7013599462356798, "primary_score": 0.3005181347150259}, "task_idx": 23} {"task_name": "mmlu_high_school_macroeconomics:mc", "task_hash": "177b3e0ec28ae90f76d191ba937fb328", "model_hash": "03418cf8091a9882619950ffb07429a5", "model_config": {"model": "hf_checkpoints/dclm-pool-1b-1x-h-fasttext-open_lm_1b_swiglutorch-warm5000-lr0p003-wd0p033-cd3e-05-bs256-mult1-seed124-tokens28795904000", "revision": null, "trust_remote_code": null, "max_length": 2048, "model_path": null, "model_type": "hf"}, "task_config": {"task_name": "mmlu_high_school_macroeconomics:mc", "task_core": "mmlu_high_school_macroeconomics", "limit": null, "split": "test", "num_shots": 5, "fewshot_seed": 1234, "primary_metric": "acc_raw", "random_subsample_seed": 1234, "context_kwargs": {"no_label_prefix_space": false}, "generation_kwargs": null, "metric_kwargs": null, "native_id_field": "index", "fewshot_source": null, "dataset_path": "cais/mmlu", "dataset_name": "high_school_macroeconomics", "use_chat_format": null, "version": 1, "revision": null, "metadata": {"regimes": ["OLMES-v0.1"], "alias": "mmlu_high_school_macroeconomics:mc::olmes"}}, "compute_config": {"batch_size": "16", "max_batch_size": 32, "output_dir": "hf_checkpoints/dclm-pool-1b-1x-h-fasttext-open_lm_1b_swiglutorch-warm5000-lr0p003-wd0p033-cd3e-05-bs256-mult1-seed124-tokens28795904000/olmes_fp32/mmlu", "num_recorded_inputs": 3, "save_raw_requests": true, "recompute_metrics": false, "check_datalake": false, "push_datalake": false, "no_datalake": false}, "processing_time": 11.615823030471802, "current_date": "2024-11-19 21:12:57 UTC", "num_instances": 390, "beaker_info": {}, "metrics": {"acc_raw": 0.3384615384615385, "acc_per_token": 0.3384615384615385, "acc_per_char": 0.3384615384615385, "correct_loss_raw": 1.3830373278030983, "incorrect_loss_raw": 1.4065138395525443, "correct_loss_per_token": 1.3830373278030983, "incorrect_loss_per_token": 1.4065138395525443, "correct_loss_per_char": 0.6915186639015491, "incorrect_loss_per_char": 0.7032569197762721, "primary_score": 0.3384615384615385}, "task_idx": 24} {"task_name": "mmlu_high_school_mathematics:mc", "task_hash": "934371e2cf927fc449e77df454d85d2d", "model_hash": "03418cf8091a9882619950ffb07429a5", "model_config": {"model": "hf_checkpoints/dclm-pool-1b-1x-h-fasttext-open_lm_1b_swiglutorch-warm5000-lr0p003-wd0p033-cd3e-05-bs256-mult1-seed124-tokens28795904000", "revision": null, "trust_remote_code": null, "max_length": 2048, "model_path": null, "model_type": "hf"}, "task_config": {"task_name": "mmlu_high_school_mathematics:mc", "task_core": "mmlu_high_school_mathematics", "limit": null, "split": "test", "num_shots": 5, "fewshot_seed": 1234, "primary_metric": "acc_raw", "random_subsample_seed": 1234, "context_kwargs": {"no_label_prefix_space": false}, "generation_kwargs": null, "metric_kwargs": null, "native_id_field": "index", "fewshot_source": null, "dataset_path": "cais/mmlu", "dataset_name": "high_school_mathematics", "use_chat_format": null, "version": 1, "revision": null, "metadata": {"regimes": ["OLMES-v0.1"], "alias": "mmlu_high_school_mathematics:mc::olmes"}}, "compute_config": {"batch_size": "16", "max_batch_size": 32, "output_dir": "hf_checkpoints/dclm-pool-1b-1x-h-fasttext-open_lm_1b_swiglutorch-warm5000-lr0p003-wd0p033-cd3e-05-bs256-mult1-seed124-tokens28795904000/olmes_fp32/mmlu", "num_recorded_inputs": 3, "save_raw_requests": true, "recompute_metrics": false, "check_datalake": false, "push_datalake": false, "no_datalake": false}, "processing_time": 9.789054155349731, "current_date": "2024-11-19 21:13:08 UTC", "num_instances": 270, "beaker_info": {}, "metrics": {"acc_raw": 0.25555555555555554, "acc_per_token": 0.25555555555555554, "acc_per_char": 0.25555555555555554, "correct_loss_raw": 1.425081949322312, "incorrect_loss_raw": 1.418366907702551, "correct_loss_per_token": 1.425081949322312, "incorrect_loss_per_token": 1.418366907702551, "correct_loss_per_char": 0.712540974661156, "incorrect_loss_per_char": 0.7091834538512755, "primary_score": 0.25555555555555554}, "task_idx": 25} {"task_name": "mmlu_high_school_microeconomics:mc", "task_hash": "3738e45ad1235f9f0a4825ae099697cb", "model_hash": "03418cf8091a9882619950ffb07429a5", "model_config": {"model": "hf_checkpoints/dclm-pool-1b-1x-h-fasttext-open_lm_1b_swiglutorch-warm5000-lr0p003-wd0p033-cd3e-05-bs256-mult1-seed124-tokens28795904000", "revision": null, "trust_remote_code": null, "max_length": 2048, "model_path": null, "model_type": "hf"}, "task_config": {"task_name": "mmlu_high_school_microeconomics:mc", "task_core": "mmlu_high_school_microeconomics", "limit": null, "split": "test", "num_shots": 5, "fewshot_seed": 1234, "primary_metric": "acc_raw", "random_subsample_seed": 1234, "context_kwargs": {"no_label_prefix_space": false}, "generation_kwargs": null, "metric_kwargs": null, "native_id_field": "index", "fewshot_source": null, "dataset_path": "cais/mmlu", "dataset_name": "high_school_microeconomics", "use_chat_format": null, "version": 1, "revision": null, "metadata": {"regimes": ["OLMES-v0.1"], "alias": "mmlu_high_school_microeconomics:mc::olmes"}}, "compute_config": {"batch_size": "16", "max_batch_size": 32, "output_dir": "hf_checkpoints/dclm-pool-1b-1x-h-fasttext-open_lm_1b_swiglutorch-warm5000-lr0p003-wd0p033-cd3e-05-bs256-mult1-seed124-tokens28795904000/olmes_fp32/mmlu", "num_recorded_inputs": 3, "save_raw_requests": true, "recompute_metrics": false, "check_datalake": false, "push_datalake": false, "no_datalake": false}, "processing_time": 7.382247686386108, "current_date": "2024-11-19 21:13:18 UTC", "num_instances": 238, "beaker_info": {}, "metrics": {"acc_raw": 0.3025210084033613, "acc_per_token": 0.3025210084033613, "acc_per_char": 0.3025210084033613, "correct_loss_raw": 1.3908371038797522, "incorrect_loss_raw": 1.399872533580502, "correct_loss_per_token": 1.3908371038797522, "incorrect_loss_per_token": 1.399872533580502, "correct_loss_per_char": 0.6954185519398761, "incorrect_loss_per_char": 0.699936266790251, "primary_score": 0.3025210084033613}, "task_idx": 26} {"task_name": "mmlu_high_school_physics:mc", "task_hash": "583350c5b48fd28100732ad06943489f", "model_hash": "03418cf8091a9882619950ffb07429a5", "model_config": {"model": "hf_checkpoints/dclm-pool-1b-1x-h-fasttext-open_lm_1b_swiglutorch-warm5000-lr0p003-wd0p033-cd3e-05-bs256-mult1-seed124-tokens28795904000", "revision": null, "trust_remote_code": null, "max_length": 2048, "model_path": null, "model_type": "hf"}, "task_config": {"task_name": "mmlu_high_school_physics:mc", "task_core": "mmlu_high_school_physics", "limit": null, "split": "test", "num_shots": 5, "fewshot_seed": 1234, "primary_metric": "acc_raw", "random_subsample_seed": 1234, "context_kwargs": {"no_label_prefix_space": false}, "generation_kwargs": null, "metric_kwargs": null, "native_id_field": "index", "fewshot_source": null, "dataset_path": "cais/mmlu", "dataset_name": "high_school_physics", "use_chat_format": null, "version": 1, "revision": null, "metadata": {"regimes": ["OLMES-v0.1"], "alias": "mmlu_high_school_physics:mc::olmes"}}, "compute_config": {"batch_size": "16", "max_batch_size": 32, "output_dir": "hf_checkpoints/dclm-pool-1b-1x-h-fasttext-open_lm_1b_swiglutorch-warm5000-lr0p003-wd0p033-cd3e-05-bs256-mult1-seed124-tokens28795904000/olmes_fp32/mmlu", "num_recorded_inputs": 3, "save_raw_requests": true, "recompute_metrics": false, "check_datalake": false, "push_datalake": false, "no_datalake": false}, "processing_time": 6.19127893447876, "current_date": "2024-11-19 21:13:26 UTC", "num_instances": 151, "beaker_info": {}, "metrics": {"acc_raw": 0.31788079470198677, "acc_per_token": 0.31788079470198677, "acc_per_char": 0.31788079470198677, "correct_loss_raw": 1.3750574825615283, "incorrect_loss_raw": 1.4153292002793698, "correct_loss_per_token": 1.3750574825615283, "incorrect_loss_per_token": 1.4153292002793698, "correct_loss_per_char": 0.6875287412807641, "incorrect_loss_per_char": 0.7076646001396849, "primary_score": 0.31788079470198677}, "task_idx": 27} {"task_name": "mmlu_high_school_psychology:mc", "task_hash": "accf1559d013b1e7ac36647c1fe9dd67", "model_hash": "03418cf8091a9882619950ffb07429a5", "model_config": {"model": "hf_checkpoints/dclm-pool-1b-1x-h-fasttext-open_lm_1b_swiglutorch-warm5000-lr0p003-wd0p033-cd3e-05-bs256-mult1-seed124-tokens28795904000", "revision": null, "trust_remote_code": null, "max_length": 2048, "model_path": null, "model_type": "hf"}, "task_config": {"task_name": "mmlu_high_school_psychology:mc", "task_core": "mmlu_high_school_psychology", "limit": null, "split": "test", "num_shots": 5, "fewshot_seed": 1234, "primary_metric": "acc_raw", "random_subsample_seed": 1234, "context_kwargs": {"no_label_prefix_space": false}, "generation_kwargs": null, "metric_kwargs": null, "native_id_field": "index", "fewshot_source": null, "dataset_path": "cais/mmlu", "dataset_name": "high_school_psychology", "use_chat_format": null, "version": 1, "revision": null, "metadata": {"regimes": ["OLMES-v0.1"], "alias": "mmlu_high_school_psychology:mc::olmes"}}, "compute_config": {"batch_size": "16", "max_batch_size": 32, "output_dir": "hf_checkpoints/dclm-pool-1b-1x-h-fasttext-open_lm_1b_swiglutorch-warm5000-lr0p003-wd0p033-cd3e-05-bs256-mult1-seed124-tokens28795904000/olmes_fp32/mmlu", "num_recorded_inputs": 3, "save_raw_requests": true, "recompute_metrics": false, "check_datalake": false, "push_datalake": false, "no_datalake": false}, "processing_time": 20.13453221321106, "current_date": "2024-11-19 21:13:32 UTC", "num_instances": 545, "beaker_info": {}, "metrics": {"acc_raw": 0.23302752293577983, "acc_per_token": 0.23302752293577983, "acc_per_char": 0.23302752293577983, "correct_loss_raw": 1.4060063779900926, "incorrect_loss_raw": 1.3952868980005249, "correct_loss_per_token": 1.4060063779900926, "incorrect_loss_per_token": 1.3952868980005249, "correct_loss_per_char": 0.7030031889950463, "incorrect_loss_per_char": 0.6976434490002624, "primary_score": 0.23302752293577983}, "task_idx": 28} {"task_name": "mmlu_high_school_statistics:mc", "task_hash": "7bd3b2133806936ee947ebd9c9890647", "model_hash": "03418cf8091a9882619950ffb07429a5", "model_config": {"model": "hf_checkpoints/dclm-pool-1b-1x-h-fasttext-open_lm_1b_swiglutorch-warm5000-lr0p003-wd0p033-cd3e-05-bs256-mult1-seed124-tokens28795904000", "revision": null, "trust_remote_code": null, "max_length": 2048, "model_path": null, "model_type": "hf"}, "task_config": {"task_name": "mmlu_high_school_statistics:mc", "task_core": "mmlu_high_school_statistics", "limit": null, "split": "test", "num_shots": 5, "fewshot_seed": 1234, "primary_metric": "acc_raw", "random_subsample_seed": 1234, "context_kwargs": {"no_label_prefix_space": false}, "generation_kwargs": null, "metric_kwargs": null, "native_id_field": "index", "fewshot_source": null, "dataset_path": "cais/mmlu", "dataset_name": "high_school_statistics", "use_chat_format": null, "version": 1, "revision": null, "metadata": {"regimes": ["OLMES-v0.1"], "alias": "mmlu_high_school_statistics:mc::olmes"}}, "compute_config": {"batch_size": "16", "max_batch_size": 32, "output_dir": "hf_checkpoints/dclm-pool-1b-1x-h-fasttext-open_lm_1b_swiglutorch-warm5000-lr0p003-wd0p033-cd3e-05-bs256-mult1-seed124-tokens28795904000/olmes_fp32/mmlu", "num_recorded_inputs": 3, "save_raw_requests": true, "recompute_metrics": false, "check_datalake": false, "push_datalake": false, "no_datalake": false}, "processing_time": 12.637912511825562, "current_date": "2024-11-19 21:13:52 UTC", "num_instances": 216, "beaker_info": {}, "metrics": {"acc_raw": 0.44907407407407407, "acc_per_token": 0.44907407407407407, "acc_per_char": 0.44907407407407407, "correct_loss_raw": 1.32589447553511, "incorrect_loss_raw": 1.451398033786703, "correct_loss_per_token": 1.32589447553511, "incorrect_loss_per_token": 1.451398033786703, "correct_loss_per_char": 0.662947237767555, "incorrect_loss_per_char": 0.7256990168933515, "primary_score": 0.44907407407407407}, "task_idx": 29} {"task_name": "mmlu_high_school_us_history:mc", "task_hash": "8097dc2c4728e3ef312c10bfcc9a0c47", "model_hash": "03418cf8091a9882619950ffb07429a5", "model_config": {"model": "hf_checkpoints/dclm-pool-1b-1x-h-fasttext-open_lm_1b_swiglutorch-warm5000-lr0p003-wd0p033-cd3e-05-bs256-mult1-seed124-tokens28795904000", "revision": null, "trust_remote_code": null, "max_length": 2048, "model_path": null, "model_type": "hf"}, "task_config": {"task_name": "mmlu_high_school_us_history:mc", "task_core": "mmlu_high_school_us_history", "limit": null, "split": "test", "num_shots": 5, "fewshot_seed": 1234, "primary_metric": "acc_raw", "random_subsample_seed": 1234, "context_kwargs": {"no_label_prefix_space": false}, "generation_kwargs": null, "metric_kwargs": null, "native_id_field": "index", "fewshot_source": null, "dataset_path": "cais/mmlu", "dataset_name": "high_school_us_history", "use_chat_format": null, "version": 1, "revision": null, "metadata": {"regimes": ["OLMES-v0.1"], "alias": "mmlu_high_school_us_history:mc::olmes"}}, "compute_config": {"batch_size": "16", "max_batch_size": 32, "output_dir": "hf_checkpoints/dclm-pool-1b-1x-h-fasttext-open_lm_1b_swiglutorch-warm5000-lr0p003-wd0p033-cd3e-05-bs256-mult1-seed124-tokens28795904000/olmes_fp32/mmlu", "num_recorded_inputs": 3, "save_raw_requests": true, "recompute_metrics": false, "check_datalake": false, "push_datalake": false, "no_datalake": false}, "processing_time": 32.43279314041138, "current_date": "2024-11-19 21:14:05 UTC", "num_instances": 204, "beaker_info": {}, "metrics": {"acc_raw": 0.25980392156862747, "acc_per_token": 0.25980392156862747, "acc_per_char": 0.25980392156862747, "correct_loss_raw": 1.3946236991414838, "incorrect_loss_raw": 1.4083286948453368, "correct_loss_per_token": 1.3946236991414838, "incorrect_loss_per_token": 1.4083286948453368, "correct_loss_per_char": 0.6973118495707419, "incorrect_loss_per_char": 0.7041643474226684, "primary_score": 0.25980392156862747}, "task_idx": 30} {"task_name": "mmlu_high_school_world_history:mc", "task_hash": "4c9689dbb0e9effb2991bc98e1364c03", "model_hash": "03418cf8091a9882619950ffb07429a5", "model_config": {"model": "hf_checkpoints/dclm-pool-1b-1x-h-fasttext-open_lm_1b_swiglutorch-warm5000-lr0p003-wd0p033-cd3e-05-bs256-mult1-seed124-tokens28795904000", "revision": null, "trust_remote_code": null, "max_length": 2048, "model_path": null, "model_type": "hf"}, "task_config": {"task_name": "mmlu_high_school_world_history:mc", "task_core": "mmlu_high_school_world_history", "limit": null, "split": "test", "num_shots": 5, "fewshot_seed": 1234, "primary_metric": "acc_raw", "random_subsample_seed": 1234, "context_kwargs": {"no_label_prefix_space": false}, "generation_kwargs": null, "metric_kwargs": null, "native_id_field": "index", "fewshot_source": null, "dataset_path": "cais/mmlu", "dataset_name": "high_school_world_history", "use_chat_format": null, "version": 1, "revision": null, "metadata": {"regimes": ["OLMES-v0.1"], "alias": "mmlu_high_school_world_history:mc::olmes"}}, "compute_config": {"batch_size": "16", "max_batch_size": 32, "output_dir": "hf_checkpoints/dclm-pool-1b-1x-h-fasttext-open_lm_1b_swiglutorch-warm5000-lr0p003-wd0p033-cd3e-05-bs256-mult1-seed124-tokens28795904000/olmes_fp32/mmlu", "num_recorded_inputs": 3, "save_raw_requests": true, "recompute_metrics": false, "check_datalake": false, "push_datalake": false, "no_datalake": false}, "processing_time": 26.605832815170288, "current_date": "2024-11-19 21:14:37 UTC", "num_instances": 237, "beaker_info": {}, "metrics": {"acc_raw": 0.24050632911392406, "acc_per_token": 0.24050632911392406, "acc_per_char": 0.24050632911392406, "correct_loss_raw": 1.389097487876184, "incorrect_loss_raw": 1.3955407057130385, "correct_loss_per_token": 1.389097487876184, "incorrect_loss_per_token": 1.3955407057130385, "correct_loss_per_char": 0.694548743938092, "incorrect_loss_per_char": 0.6977703528565192, "primary_score": 0.24050632911392406}, "task_idx": 31} {"task_name": "mmlu_human_aging:mc", "task_hash": "aed6dc4e5de4b465852e8add68f1e1c7", "model_hash": "03418cf8091a9882619950ffb07429a5", "model_config": {"model": "hf_checkpoints/dclm-pool-1b-1x-h-fasttext-open_lm_1b_swiglutorch-warm5000-lr0p003-wd0p033-cd3e-05-bs256-mult1-seed124-tokens28795904000", "revision": null, "trust_remote_code": null, "max_length": 2048, "model_path": null, "model_type": "hf"}, "task_config": {"task_name": "mmlu_human_aging:mc", "task_core": "mmlu_human_aging", "limit": null, "split": "test", "num_shots": 5, "fewshot_seed": 1234, "primary_metric": "acc_raw", "random_subsample_seed": 1234, "context_kwargs": {"no_label_prefix_space": false}, "generation_kwargs": null, "metric_kwargs": null, "native_id_field": "index", "fewshot_source": null, "dataset_path": "cais/mmlu", "dataset_name": "human_aging", "use_chat_format": null, "version": 1, "revision": null, "metadata": {"regimes": ["OLMES-v0.1"], "alias": "mmlu_human_aging:mc::olmes"}}, "compute_config": {"batch_size": "16", "max_batch_size": 32, "output_dir": "hf_checkpoints/dclm-pool-1b-1x-h-fasttext-open_lm_1b_swiglutorch-warm5000-lr0p003-wd0p033-cd3e-05-bs256-mult1-seed124-tokens28795904000/olmes_fp32/mmlu", "num_recorded_inputs": 3, "save_raw_requests": true, "recompute_metrics": false, "check_datalake": false, "push_datalake": false, "no_datalake": false}, "processing_time": 5.512657880783081, "current_date": "2024-11-19 21:15:04 UTC", "num_instances": 223, "beaker_info": {}, "metrics": {"acc_raw": 0.3452914798206278, "acc_per_token": 0.3452914798206278, "acc_per_char": 0.3452914798206278, "correct_loss_raw": 1.3652815257487276, "incorrect_loss_raw": 1.4116136404074542, "correct_loss_per_token": 1.3652815257487276, "incorrect_loss_per_token": 1.4116136404074542, "correct_loss_per_char": 0.6826407628743638, "incorrect_loss_per_char": 0.7058068202037271, "primary_score": 0.3452914798206278}, "task_idx": 32} {"task_name": "mmlu_human_sexuality:mc", "task_hash": "40c85ccce055746bdd1f28232f48f0fa", "model_hash": "03418cf8091a9882619950ffb07429a5", "model_config": {"model": "hf_checkpoints/dclm-pool-1b-1x-h-fasttext-open_lm_1b_swiglutorch-warm5000-lr0p003-wd0p033-cd3e-05-bs256-mult1-seed124-tokens28795904000", "revision": null, "trust_remote_code": null, "max_length": 2048, "model_path": null, "model_type": "hf"}, "task_config": {"task_name": "mmlu_human_sexuality:mc", "task_core": "mmlu_human_sexuality", "limit": null, "split": "test", "num_shots": 5, "fewshot_seed": 1234, "primary_metric": "acc_raw", "random_subsample_seed": 1234, "context_kwargs": {"no_label_prefix_space": false}, "generation_kwargs": null, "metric_kwargs": null, "native_id_field": "index", "fewshot_source": null, "dataset_path": "cais/mmlu", "dataset_name": "human_sexuality", "use_chat_format": null, "version": 1, "revision": null, "metadata": {"regimes": ["OLMES-v0.1"], "alias": "mmlu_human_sexuality:mc::olmes"}}, "compute_config": {"batch_size": "16", "max_batch_size": 32, "output_dir": "hf_checkpoints/dclm-pool-1b-1x-h-fasttext-open_lm_1b_swiglutorch-warm5000-lr0p003-wd0p033-cd3e-05-bs256-mult1-seed124-tokens28795904000/olmes_fp32/mmlu", "num_recorded_inputs": 3, "save_raw_requests": true, "recompute_metrics": false, "check_datalake": false, "push_datalake": false, "no_datalake": false}, "processing_time": 3.6009631156921387, "current_date": "2024-11-19 21:15:09 UTC", "num_instances": 131, "beaker_info": {}, "metrics": {"acc_raw": 0.2595419847328244, "acc_per_token": 0.2595419847328244, "acc_per_char": 0.2595419847328244, "correct_loss_raw": 1.4200853368708195, "incorrect_loss_raw": 1.4141683085577486, "correct_loss_per_token": 1.4200853368708195, "incorrect_loss_per_token": 1.4141683085577486, "correct_loss_per_char": 0.7100426684354098, "incorrect_loss_per_char": 0.7070841542788743, "primary_score": 0.2595419847328244}, "task_idx": 33} {"task_name": "mmlu_international_law:mc", "task_hash": "3cfc657dd55e3ad96d5c3e9cd17bc346", "model_hash": "03418cf8091a9882619950ffb07429a5", "model_config": {"model": "hf_checkpoints/dclm-pool-1b-1x-h-fasttext-open_lm_1b_swiglutorch-warm5000-lr0p003-wd0p033-cd3e-05-bs256-mult1-seed124-tokens28795904000", "revision": null, "trust_remote_code": null, "max_length": 2048, "model_path": null, "model_type": "hf"}, "task_config": {"task_name": "mmlu_international_law:mc", "task_core": "mmlu_international_law", "limit": null, "split": "test", "num_shots": 5, "fewshot_seed": 1234, "primary_metric": "acc_raw", "random_subsample_seed": 1234, "context_kwargs": {"no_label_prefix_space": false}, "generation_kwargs": null, "metric_kwargs": null, "native_id_field": "index", "fewshot_source": null, "dataset_path": "cais/mmlu", "dataset_name": "international_law", "use_chat_format": null, "version": 1, "revision": null, "metadata": {"regimes": ["OLMES-v0.1"], "alias": "mmlu_international_law:mc::olmes"}}, "compute_config": {"batch_size": "16", "max_batch_size": 32, "output_dir": "hf_checkpoints/dclm-pool-1b-1x-h-fasttext-open_lm_1b_swiglutorch-warm5000-lr0p003-wd0p033-cd3e-05-bs256-mult1-seed124-tokens28795904000/olmes_fp32/mmlu", "num_recorded_inputs": 3, "save_raw_requests": true, "recompute_metrics": false, "check_datalake": false, "push_datalake": false, "no_datalake": false}, "processing_time": 5.969613552093506, "current_date": "2024-11-19 21:15:13 UTC", "num_instances": 121, "beaker_info": {}, "metrics": {"acc_raw": 0.2396694214876033, "acc_per_token": 0.2396694214876033, "acc_per_char": 0.2396694214876033, "correct_loss_raw": 1.3773366005952694, "incorrect_loss_raw": 1.4053297351542908, "correct_loss_per_token": 1.3773366005952694, "incorrect_loss_per_token": 1.4053297351542908, "correct_loss_per_char": 0.6886683002976347, "incorrect_loss_per_char": 0.7026648675771454, "primary_score": 0.2396694214876033}, "task_idx": 34} {"task_name": "mmlu_jurisprudence:mc", "task_hash": "ca4ac71f0fd702b39c6245be2ab32061", "model_hash": "03418cf8091a9882619950ffb07429a5", "model_config": {"model": "hf_checkpoints/dclm-pool-1b-1x-h-fasttext-open_lm_1b_swiglutorch-warm5000-lr0p003-wd0p033-cd3e-05-bs256-mult1-seed124-tokens28795904000", "revision": null, "trust_remote_code": null, "max_length": 2048, "model_path": null, "model_type": "hf"}, "task_config": {"task_name": "mmlu_jurisprudence:mc", "task_core": "mmlu_jurisprudence", "limit": null, "split": "test", "num_shots": 5, "fewshot_seed": 1234, "primary_metric": "acc_raw", "random_subsample_seed": 1234, "context_kwargs": {"no_label_prefix_space": false}, "generation_kwargs": null, "metric_kwargs": null, "native_id_field": "index", "fewshot_source": null, "dataset_path": "cais/mmlu", "dataset_name": "jurisprudence", "use_chat_format": null, "version": 1, "revision": null, "metadata": {"regimes": ["OLMES-v0.1"], "alias": "mmlu_jurisprudence:mc::olmes"}}, "compute_config": {"batch_size": "16", "max_batch_size": 32, "output_dir": "hf_checkpoints/dclm-pool-1b-1x-h-fasttext-open_lm_1b_swiglutorch-warm5000-lr0p003-wd0p033-cd3e-05-bs256-mult1-seed124-tokens28795904000/olmes_fp32/mmlu", "num_recorded_inputs": 3, "save_raw_requests": true, "recompute_metrics": false, "check_datalake": false, "push_datalake": false, "no_datalake": false}, "processing_time": 3.5692873001098633, "current_date": "2024-11-19 21:15:19 UTC", "num_instances": 108, "beaker_info": {}, "metrics": {"acc_raw": 0.25, "acc_per_token": 0.25, "acc_per_char": 0.25, "correct_loss_raw": 1.4049190200037427, "incorrect_loss_raw": 1.400600313220495, "correct_loss_per_token": 1.4049190200037427, "incorrect_loss_per_token": 1.400600313220495, "correct_loss_per_char": 0.7024595100018713, "incorrect_loss_per_char": 0.7003001566102475, "primary_score": 0.25}, "task_idx": 35} {"task_name": "mmlu_logical_fallacies:mc", "task_hash": "a4b3c214c3cb1c10bfa4042dd0e9df92", "model_hash": "03418cf8091a9882619950ffb07429a5", "model_config": {"model": "hf_checkpoints/dclm-pool-1b-1x-h-fasttext-open_lm_1b_swiglutorch-warm5000-lr0p003-wd0p033-cd3e-05-bs256-mult1-seed124-tokens28795904000", "revision": null, "trust_remote_code": null, "max_length": 2048, "model_path": null, "model_type": "hf"}, "task_config": {"task_name": "mmlu_logical_fallacies:mc", "task_core": "mmlu_logical_fallacies", "limit": null, "split": "test", "num_shots": 5, "fewshot_seed": 1234, "primary_metric": "acc_raw", "random_subsample_seed": 1234, "context_kwargs": {"no_label_prefix_space": false}, "generation_kwargs": null, "metric_kwargs": null, "native_id_field": "index", "fewshot_source": null, "dataset_path": "cais/mmlu", "dataset_name": "logical_fallacies", "use_chat_format": null, "version": 1, "revision": null, "metadata": {"regimes": ["OLMES-v0.1"], "alias": "mmlu_logical_fallacies:mc::olmes"}}, "compute_config": {"batch_size": "16", "max_batch_size": 32, "output_dir": "hf_checkpoints/dclm-pool-1b-1x-h-fasttext-open_lm_1b_swiglutorch-warm5000-lr0p003-wd0p033-cd3e-05-bs256-mult1-seed124-tokens28795904000/olmes_fp32/mmlu", "num_recorded_inputs": 3, "save_raw_requests": true, "recompute_metrics": false, "check_datalake": false, "push_datalake": false, "no_datalake": false}, "processing_time": 5.709029197692871, "current_date": "2024-11-19 21:15:22 UTC", "num_instances": 163, "beaker_info": {}, "metrics": {"acc_raw": 0.25153374233128833, "acc_per_token": 0.25153374233128833, "acc_per_char": 0.25153374233128833, "correct_loss_raw": 1.3859041256407287, "incorrect_loss_raw": 1.4001630442274127, "correct_loss_per_token": 1.3859041256407287, "incorrect_loss_per_token": 1.4001630442274127, "correct_loss_per_char": 0.6929520628203644, "incorrect_loss_per_char": 0.7000815221137063, "primary_score": 0.25153374233128833}, "task_idx": 36} {"task_name": "mmlu_machine_learning:mc", "task_hash": "43ad1436fc44eed0bc66cc7239ecd94b", "model_hash": "03418cf8091a9882619950ffb07429a5", "model_config": {"model": "hf_checkpoints/dclm-pool-1b-1x-h-fasttext-open_lm_1b_swiglutorch-warm5000-lr0p003-wd0p033-cd3e-05-bs256-mult1-seed124-tokens28795904000", "revision": null, "trust_remote_code": null, "max_length": 2048, "model_path": null, "model_type": "hf"}, "task_config": {"task_name": "mmlu_machine_learning:mc", "task_core": "mmlu_machine_learning", "limit": null, "split": "test", "num_shots": 5, "fewshot_seed": 1234, "primary_metric": "acc_raw", "random_subsample_seed": 1234, "context_kwargs": {"no_label_prefix_space": false}, "generation_kwargs": null, "metric_kwargs": null, "native_id_field": "index", "fewshot_source": null, "dataset_path": "cais/mmlu", "dataset_name": "machine_learning", "use_chat_format": null, "version": 1, "revision": null, "metadata": {"regimes": ["OLMES-v0.1"], "alias": "mmlu_machine_learning:mc::olmes"}}, "compute_config": {"batch_size": "16", "max_batch_size": 32, "output_dir": "hf_checkpoints/dclm-pool-1b-1x-h-fasttext-open_lm_1b_swiglutorch-warm5000-lr0p003-wd0p033-cd3e-05-bs256-mult1-seed124-tokens28795904000/olmes_fp32/mmlu", "num_recorded_inputs": 3, "save_raw_requests": true, "recompute_metrics": false, "check_datalake": false, "push_datalake": false, "no_datalake": false}, "processing_time": 5.5937018394470215, "current_date": "2024-11-19 21:15:28 UTC", "num_instances": 112, "beaker_info": {}, "metrics": {"acc_raw": 0.30357142857142855, "acc_per_token": 0.30357142857142855, "acc_per_char": 0.30357142857142855, "correct_loss_raw": 1.3839700307164873, "incorrect_loss_raw": 1.4140378013253208, "correct_loss_per_token": 1.3839700307164873, "incorrect_loss_per_token": 1.4140378013253208, "correct_loss_per_char": 0.6919850153582436, "incorrect_loss_per_char": 0.7070189006626604, "primary_score": 0.30357142857142855}, "task_idx": 37} {"task_name": "mmlu_management:mc", "task_hash": "f565b650124e104d5d59b40491bde8e7", "model_hash": "03418cf8091a9882619950ffb07429a5", "model_config": {"model": "hf_checkpoints/dclm-pool-1b-1x-h-fasttext-open_lm_1b_swiglutorch-warm5000-lr0p003-wd0p033-cd3e-05-bs256-mult1-seed124-tokens28795904000", "revision": null, "trust_remote_code": null, "max_length": 2048, "model_path": null, "model_type": "hf"}, "task_config": {"task_name": "mmlu_management:mc", "task_core": "mmlu_management", "limit": null, "split": "test", "num_shots": 5, "fewshot_seed": 1234, "primary_metric": "acc_raw", "random_subsample_seed": 1234, "context_kwargs": {"no_label_prefix_space": false}, "generation_kwargs": null, "metric_kwargs": null, "native_id_field": "index", "fewshot_source": null, "dataset_path": "cais/mmlu", "dataset_name": "management", "use_chat_format": null, "version": 1, "revision": null, "metadata": {"regimes": ["OLMES-v0.1"], "alias": "mmlu_management:mc::olmes"}}, "compute_config": {"batch_size": "16", "max_batch_size": 32, "output_dir": "hf_checkpoints/dclm-pool-1b-1x-h-fasttext-open_lm_1b_swiglutorch-warm5000-lr0p003-wd0p033-cd3e-05-bs256-mult1-seed124-tokens28795904000/olmes_fp32/mmlu", "num_recorded_inputs": 3, "save_raw_requests": true, "recompute_metrics": false, "check_datalake": false, "push_datalake": false, "no_datalake": false}, "processing_time": 2.505298376083374, "current_date": "2024-11-19 21:15:34 UTC", "num_instances": 103, "beaker_info": {}, "metrics": {"acc_raw": 0.14563106796116504, "acc_per_token": 0.14563106796116504, "acc_per_char": 0.14563106796116504, "correct_loss_raw": 1.4238631702163844, "incorrect_loss_raw": 1.3957783520028815, "correct_loss_per_token": 1.4238631702163844, "incorrect_loss_per_token": 1.3957783520028815, "correct_loss_per_char": 0.7119315851081922, "incorrect_loss_per_char": 0.6978891760014407, "primary_score": 0.14563106796116504}, "task_idx": 38} {"task_name": "mmlu_marketing:mc", "task_hash": "63c7c7a1863fe3aaf961947124cbd4c3", "model_hash": "03418cf8091a9882619950ffb07429a5", "model_config": {"model": "hf_checkpoints/dclm-pool-1b-1x-h-fasttext-open_lm_1b_swiglutorch-warm5000-lr0p003-wd0p033-cd3e-05-bs256-mult1-seed124-tokens28795904000", "revision": null, "trust_remote_code": null, "max_length": 2048, "model_path": null, "model_type": "hf"}, "task_config": {"task_name": "mmlu_marketing:mc", "task_core": "mmlu_marketing", "limit": null, "split": "test", "num_shots": 5, "fewshot_seed": 1234, "primary_metric": "acc_raw", "random_subsample_seed": 1234, "context_kwargs": {"no_label_prefix_space": false}, "generation_kwargs": null, "metric_kwargs": null, "native_id_field": "index", "fewshot_source": null, "dataset_path": "cais/mmlu", "dataset_name": "marketing", "use_chat_format": null, "version": 1, "revision": null, "metadata": {"regimes": ["OLMES-v0.1"], "alias": "mmlu_marketing:mc::olmes"}}, "compute_config": {"batch_size": "16", "max_batch_size": 32, "output_dir": "hf_checkpoints/dclm-pool-1b-1x-h-fasttext-open_lm_1b_swiglutorch-warm5000-lr0p003-wd0p033-cd3e-05-bs256-mult1-seed124-tokens28795904000/olmes_fp32/mmlu", "num_recorded_inputs": 3, "save_raw_requests": true, "recompute_metrics": false, "check_datalake": false, "push_datalake": false, "no_datalake": false}, "processing_time": 7.7376954555511475, "current_date": "2024-11-19 21:15:36 UTC", "num_instances": 234, "beaker_info": {}, "metrics": {"acc_raw": 0.29914529914529914, "acc_per_token": 0.29914529914529914, "acc_per_char": 0.29914529914529914, "correct_loss_raw": 1.3953857554329767, "incorrect_loss_raw": 1.410645197174828, "correct_loss_per_token": 1.3953857554329767, "incorrect_loss_per_token": 1.410645197174828, "correct_loss_per_char": 0.6976928777164884, "incorrect_loss_per_char": 0.705322598587414, "primary_score": 0.29914529914529914}, "task_idx": 39} {"task_name": "mmlu_medical_genetics:mc", "task_hash": "11f7f7576f9aeb3dae4cc770e7a06c98", "model_hash": "03418cf8091a9882619950ffb07429a5", "model_config": {"model": "hf_checkpoints/dclm-pool-1b-1x-h-fasttext-open_lm_1b_swiglutorch-warm5000-lr0p003-wd0p033-cd3e-05-bs256-mult1-seed124-tokens28795904000", "revision": null, "trust_remote_code": null, "max_length": 2048, "model_path": null, "model_type": "hf"}, "task_config": {"task_name": "mmlu_medical_genetics:mc", "task_core": "mmlu_medical_genetics", "limit": null, "split": "test", "num_shots": 5, "fewshot_seed": 1234, "primary_metric": "acc_raw", "random_subsample_seed": 1234, "context_kwargs": {"no_label_prefix_space": false}, "generation_kwargs": null, "metric_kwargs": null, "native_id_field": "index", "fewshot_source": null, "dataset_path": "cais/mmlu", "dataset_name": "medical_genetics", "use_chat_format": null, "version": 1, "revision": null, "metadata": {"regimes": ["OLMES-v0.1"], "alias": "mmlu_medical_genetics:mc::olmes"}}, "compute_config": {"batch_size": "16", "max_batch_size": 32, "output_dir": "hf_checkpoints/dclm-pool-1b-1x-h-fasttext-open_lm_1b_swiglutorch-warm5000-lr0p003-wd0p033-cd3e-05-bs256-mult1-seed124-tokens28795904000/olmes_fp32/mmlu", "num_recorded_inputs": 3, "save_raw_requests": true, "recompute_metrics": false, "check_datalake": false, "push_datalake": false, "no_datalake": false}, "processing_time": 2.685650587081909, "current_date": "2024-11-19 21:15:44 UTC", "num_instances": 100, "beaker_info": {}, "metrics": {"acc_raw": 0.29, "acc_per_token": 0.29, "acc_per_char": 0.29, "correct_loss_raw": 1.3824806594848633, "incorrect_loss_raw": 1.4173768124977746, "correct_loss_per_token": 1.3824806594848633, "incorrect_loss_per_token": 1.4173768124977746, "correct_loss_per_char": 0.6912403297424317, "incorrect_loss_per_char": 0.7086884062488873, "primary_score": 0.29}, "task_idx": 40} {"task_name": "mmlu_miscellaneous:mc", "task_hash": "d9c892ba8631049d773d6fa3dc5dca82", "model_hash": "03418cf8091a9882619950ffb07429a5", "model_config": {"model": "hf_checkpoints/dclm-pool-1b-1x-h-fasttext-open_lm_1b_swiglutorch-warm5000-lr0p003-wd0p033-cd3e-05-bs256-mult1-seed124-tokens28795904000", "revision": null, "trust_remote_code": null, "max_length": 2048, "model_path": null, "model_type": "hf"}, "task_config": {"task_name": "mmlu_miscellaneous:mc", "task_core": "mmlu_miscellaneous", "limit": null, "split": "test", "num_shots": 5, "fewshot_seed": 1234, "primary_metric": "acc_raw", "random_subsample_seed": 1234, "context_kwargs": {"no_label_prefix_space": false}, "generation_kwargs": null, "metric_kwargs": null, "native_id_field": "index", "fewshot_source": null, "dataset_path": "cais/mmlu", "dataset_name": "miscellaneous", "use_chat_format": null, "version": 1, "revision": null, "metadata": {"regimes": ["OLMES-v0.1"], "alias": "mmlu_miscellaneous:mc::olmes"}}, "compute_config": {"batch_size": "16", "max_batch_size": 32, "output_dir": "hf_checkpoints/dclm-pool-1b-1x-h-fasttext-open_lm_1b_swiglutorch-warm5000-lr0p003-wd0p033-cd3e-05-bs256-mult1-seed124-tokens28795904000/olmes_fp32/mmlu", "num_recorded_inputs": 3, "save_raw_requests": true, "recompute_metrics": false, "check_datalake": false, "push_datalake": false, "no_datalake": false}, "processing_time": 18.94162631034851, "current_date": "2024-11-19 21:15:47 UTC", "num_instances": 783, "beaker_info": {}, "metrics": {"acc_raw": 0.27330779054916987, "acc_per_token": 0.27330779054916987, "acc_per_char": 0.27330779054916987, "correct_loss_raw": 1.3916616130758215, "incorrect_loss_raw": 1.4047548287013272, "correct_loss_per_token": 1.3916616130758215, "incorrect_loss_per_token": 1.4047548287013272, "correct_loss_per_char": 0.6958308065379107, "incorrect_loss_per_char": 0.7023774143506636, "primary_score": 0.27330779054916987}, "task_idx": 41} {"task_name": "mmlu_moral_disputes:mc", "task_hash": "d05901af9b9e012ab9e4ce8bb28c2bb8", "model_hash": "03418cf8091a9882619950ffb07429a5", "model_config": {"model": "hf_checkpoints/dclm-pool-1b-1x-h-fasttext-open_lm_1b_swiglutorch-warm5000-lr0p003-wd0p033-cd3e-05-bs256-mult1-seed124-tokens28795904000", "revision": null, "trust_remote_code": null, "max_length": 2048, "model_path": null, "model_type": "hf"}, "task_config": {"task_name": "mmlu_moral_disputes:mc", "task_core": "mmlu_moral_disputes", "limit": null, "split": "test", "num_shots": 5, "fewshot_seed": 1234, "primary_metric": "acc_raw", "random_subsample_seed": 1234, "context_kwargs": {"no_label_prefix_space": false}, "generation_kwargs": null, "metric_kwargs": null, "native_id_field": "index", "fewshot_source": null, "dataset_path": "cais/mmlu", "dataset_name": "moral_disputes", "use_chat_format": null, "version": 1, "revision": null, "metadata": {"regimes": ["OLMES-v0.1"], "alias": "mmlu_moral_disputes:mc::olmes"}}, "compute_config": {"batch_size": "16", "max_batch_size": 32, "output_dir": "hf_checkpoints/dclm-pool-1b-1x-h-fasttext-open_lm_1b_swiglutorch-warm5000-lr0p003-wd0p033-cd3e-05-bs256-mult1-seed124-tokens28795904000/olmes_fp32/mmlu", "num_recorded_inputs": 3, "save_raw_requests": true, "recompute_metrics": false, "check_datalake": false, "push_datalake": false, "no_datalake": false}, "processing_time": 12.779247283935547, "current_date": "2024-11-19 21:16:06 UTC", "num_instances": 346, "beaker_info": {}, "metrics": {"acc_raw": 0.24277456647398843, "acc_per_token": 0.24277456647398843, "acc_per_char": 0.24277456647398843, "correct_loss_raw": 1.4040873453796254, "incorrect_loss_raw": 1.4014373959144413, "correct_loss_per_token": 1.4040873453796254, "incorrect_loss_per_token": 1.4014373959144413, "correct_loss_per_char": 0.7020436726898127, "incorrect_loss_per_char": 0.7007186979572206, "primary_score": 0.24277456647398843}, "task_idx": 42} {"task_name": "mmlu_moral_scenarios:mc", "task_hash": "33949ee763bf0ed37a82aa7796d56cd6", "model_hash": "03418cf8091a9882619950ffb07429a5", "model_config": {"model": "hf_checkpoints/dclm-pool-1b-1x-h-fasttext-open_lm_1b_swiglutorch-warm5000-lr0p003-wd0p033-cd3e-05-bs256-mult1-seed124-tokens28795904000", "revision": null, "trust_remote_code": null, "max_length": 2048, "model_path": null, "model_type": "hf"}, "task_config": {"task_name": "mmlu_moral_scenarios:mc", "task_core": "mmlu_moral_scenarios", "limit": null, "split": "test", "num_shots": 5, "fewshot_seed": 1234, "primary_metric": "acc_raw", "random_subsample_seed": 1234, "context_kwargs": {"no_label_prefix_space": false}, "generation_kwargs": null, "metric_kwargs": null, "native_id_field": "index", "fewshot_source": null, "dataset_path": "cais/mmlu", "dataset_name": "moral_scenarios", "use_chat_format": null, "version": 1, "revision": null, "metadata": {"regimes": ["OLMES-v0.1"], "alias": "mmlu_moral_scenarios:mc::olmes"}}, "compute_config": {"batch_size": "16", "max_batch_size": 32, "output_dir": "hf_checkpoints/dclm-pool-1b-1x-h-fasttext-open_lm_1b_swiglutorch-warm5000-lr0p003-wd0p033-cd3e-05-bs256-mult1-seed124-tokens28795904000/olmes_fp32/mmlu", "num_recorded_inputs": 3, "save_raw_requests": true, "recompute_metrics": false, "check_datalake": false, "push_datalake": false, "no_datalake": false}, "processing_time": 43.35043215751648, "current_date": "2024-11-19 21:16:19 UTC", "num_instances": 895, "beaker_info": {}, "metrics": {"acc_raw": 0.2424581005586592, "acc_per_token": 0.2424581005586592, "acc_per_char": 0.2424581005586592, "correct_loss_raw": 1.4125996712199802, "incorrect_loss_raw": 1.4057185177474494, "correct_loss_per_token": 1.4125996712199802, "incorrect_loss_per_token": 1.4057185177474494, "correct_loss_per_char": 0.7062998356099901, "incorrect_loss_per_char": 0.7028592588737247, "primary_score": 0.2424581005586592}, "task_idx": 43} {"task_name": "mmlu_nutrition:mc", "task_hash": "e68f4b08d1adc45a7ab0ea385d987849", "model_hash": "03418cf8091a9882619950ffb07429a5", "model_config": {"model": "hf_checkpoints/dclm-pool-1b-1x-h-fasttext-open_lm_1b_swiglutorch-warm5000-lr0p003-wd0p033-cd3e-05-bs256-mult1-seed124-tokens28795904000", "revision": null, "trust_remote_code": null, "max_length": 2048, "model_path": null, "model_type": "hf"}, "task_config": {"task_name": "mmlu_nutrition:mc", "task_core": "mmlu_nutrition", "limit": null, "split": "test", "num_shots": 5, "fewshot_seed": 1234, "primary_metric": "acc_raw", "random_subsample_seed": 1234, "context_kwargs": {"no_label_prefix_space": false}, "generation_kwargs": null, "metric_kwargs": null, "native_id_field": "index", "fewshot_source": null, "dataset_path": "cais/mmlu", "dataset_name": "nutrition", "use_chat_format": null, "version": 1, "revision": null, "metadata": {"regimes": ["OLMES-v0.1"], "alias": "mmlu_nutrition:mc::olmes"}}, "compute_config": {"batch_size": "16", "max_batch_size": 32, "output_dir": "hf_checkpoints/dclm-pool-1b-1x-h-fasttext-open_lm_1b_swiglutorch-warm5000-lr0p003-wd0p033-cd3e-05-bs256-mult1-seed124-tokens28795904000/olmes_fp32/mmlu", "num_recorded_inputs": 3, "save_raw_requests": true, "recompute_metrics": false, "check_datalake": false, "push_datalake": false, "no_datalake": false}, "processing_time": 12.897084951400757, "current_date": "2024-11-19 21:17:02 UTC", "num_instances": 306, "beaker_info": {}, "metrics": {"acc_raw": 0.21568627450980393, "acc_per_token": 0.21568627450980393, "acc_per_char": 0.21568627450980393, "correct_loss_raw": 1.4100196493996515, "incorrect_loss_raw": 1.3950737732947516, "correct_loss_per_token": 1.4100196493996515, "incorrect_loss_per_token": 1.3950737732947516, "correct_loss_per_char": 0.7050098246998258, "incorrect_loss_per_char": 0.6975368866473758, "primary_score": 0.21568627450980393}, "task_idx": 44} {"task_name": "mmlu_philosophy:mc", "task_hash": "dd14a2446c6e46449cd5b14ee7982b73", "model_hash": "03418cf8091a9882619950ffb07429a5", "model_config": {"model": "hf_checkpoints/dclm-pool-1b-1x-h-fasttext-open_lm_1b_swiglutorch-warm5000-lr0p003-wd0p033-cd3e-05-bs256-mult1-seed124-tokens28795904000", "revision": null, "trust_remote_code": null, "max_length": 2048, "model_path": null, "model_type": "hf"}, "task_config": {"task_name": "mmlu_philosophy:mc", "task_core": "mmlu_philosophy", "limit": null, "split": "test", "num_shots": 5, "fewshot_seed": 1234, "primary_metric": "acc_raw", "random_subsample_seed": 1234, "context_kwargs": {"no_label_prefix_space": false}, "generation_kwargs": null, "metric_kwargs": null, "native_id_field": "index", "fewshot_source": null, "dataset_path": "cais/mmlu", "dataset_name": "philosophy", "use_chat_format": null, "version": 1, "revision": null, "metadata": {"regimes": ["OLMES-v0.1"], "alias": "mmlu_philosophy:mc::olmes"}}, "compute_config": {"batch_size": "16", "max_batch_size": 32, "output_dir": "hf_checkpoints/dclm-pool-1b-1x-h-fasttext-open_lm_1b_swiglutorch-warm5000-lr0p003-wd0p033-cd3e-05-bs256-mult1-seed124-tokens28795904000/olmes_fp32/mmlu", "num_recorded_inputs": 3, "save_raw_requests": true, "recompute_metrics": false, "check_datalake": false, "push_datalake": false, "no_datalake": false}, "processing_time": 8.110023021697998, "current_date": "2024-11-19 21:17:15 UTC", "num_instances": 311, "beaker_info": {}, "metrics": {"acc_raw": 0.24115755627009647, "acc_per_token": 0.24115755627009647, "acc_per_char": 0.24115755627009647, "correct_loss_raw": 1.3945161211528962, "incorrect_loss_raw": 1.3982760004164196, "correct_loss_per_token": 1.3945161211528962, "incorrect_loss_per_token": 1.3982760004164196, "correct_loss_per_char": 0.6972580605764481, "incorrect_loss_per_char": 0.6991380002082098, "primary_score": 0.24115755627009647}, "task_idx": 45} {"task_name": "mmlu_prehistory:mc", "task_hash": "d65b3e5cf8049b1c1442537b281f5a72", "model_hash": "03418cf8091a9882619950ffb07429a5", "model_config": {"model": "hf_checkpoints/dclm-pool-1b-1x-h-fasttext-open_lm_1b_swiglutorch-warm5000-lr0p003-wd0p033-cd3e-05-bs256-mult1-seed124-tokens28795904000", "revision": null, "trust_remote_code": null, "max_length": 2048, "model_path": null, "model_type": "hf"}, "task_config": {"task_name": "mmlu_prehistory:mc", "task_core": "mmlu_prehistory", "limit": null, "split": "test", "num_shots": 5, "fewshot_seed": 1234, "primary_metric": "acc_raw", "random_subsample_seed": 1234, "context_kwargs": {"no_label_prefix_space": false}, "generation_kwargs": null, "metric_kwargs": null, "native_id_field": "index", "fewshot_source": null, "dataset_path": "cais/mmlu", "dataset_name": "prehistory", "use_chat_format": null, "version": 1, "revision": null, "metadata": {"regimes": ["OLMES-v0.1"], "alias": "mmlu_prehistory:mc::olmes"}}, "compute_config": {"batch_size": "16", "max_batch_size": 32, "output_dir": "hf_checkpoints/dclm-pool-1b-1x-h-fasttext-open_lm_1b_swiglutorch-warm5000-lr0p003-wd0p033-cd3e-05-bs256-mult1-seed124-tokens28795904000/olmes_fp32/mmlu", "num_recorded_inputs": 3, "save_raw_requests": true, "recompute_metrics": false, "check_datalake": false, "push_datalake": false, "no_datalake": false}, "processing_time": 13.024940729141235, "current_date": "2024-11-19 21:17:23 UTC", "num_instances": 324, "beaker_info": {}, "metrics": {"acc_raw": 0.2191358024691358, "acc_per_token": 0.2191358024691358, "acc_per_char": 0.2191358024691358, "correct_loss_raw": 1.409603499703937, "incorrect_loss_raw": 1.4012677543815766, "correct_loss_per_token": 1.409603499703937, "incorrect_loss_per_token": 1.4012677543815766, "correct_loss_per_char": 0.7048017498519685, "incorrect_loss_per_char": 0.7006338771907883, "primary_score": 0.2191358024691358}, "task_idx": 46} {"task_name": "mmlu_professional_accounting:mc", "task_hash": "2d9464b5e5a5ee20a777a37004dd3a2d", "model_hash": "03418cf8091a9882619950ffb07429a5", "model_config": {"model": "hf_checkpoints/dclm-pool-1b-1x-h-fasttext-open_lm_1b_swiglutorch-warm5000-lr0p003-wd0p033-cd3e-05-bs256-mult1-seed124-tokens28795904000", "revision": null, "trust_remote_code": null, "max_length": 2048, "model_path": null, "model_type": "hf"}, "task_config": {"task_name": "mmlu_professional_accounting:mc", "task_core": "mmlu_professional_accounting", "limit": null, "split": "test", "num_shots": 5, "fewshot_seed": 1234, "primary_metric": "acc_raw", "random_subsample_seed": 1234, "context_kwargs": {"no_label_prefix_space": false}, "generation_kwargs": null, "metric_kwargs": null, "native_id_field": "index", "fewshot_source": null, "dataset_path": "cais/mmlu", "dataset_name": "professional_accounting", "use_chat_format": null, "version": 1, "revision": null, "metadata": {"regimes": ["OLMES-v0.1"], "alias": "mmlu_professional_accounting:mc::olmes"}}, "compute_config": {"batch_size": "16", "max_batch_size": 32, "output_dir": "hf_checkpoints/dclm-pool-1b-1x-h-fasttext-open_lm_1b_swiglutorch-warm5000-lr0p003-wd0p033-cd3e-05-bs256-mult1-seed124-tokens28795904000/olmes_fp32/mmlu", "num_recorded_inputs": 3, "save_raw_requests": true, "recompute_metrics": false, "check_datalake": false, "push_datalake": false, "no_datalake": false}, "processing_time": 13.726781129837036, "current_date": "2024-11-19 21:17:36 UTC", "num_instances": 282, "beaker_info": {}, "metrics": {"acc_raw": 0.23049645390070922, "acc_per_token": 0.23049645390070922, "acc_per_char": 0.23049645390070922, "correct_loss_raw": 1.4010881396895605, "incorrect_loss_raw": 1.3946880787681466, "correct_loss_per_token": 1.4010881396895605, "incorrect_loss_per_token": 1.3946880787681466, "correct_loss_per_char": 0.7005440698447802, "incorrect_loss_per_char": 0.6973440393840733, "primary_score": 0.23049645390070922}, "task_idx": 47} {"task_name": "mmlu_professional_law:mc", "task_hash": "c4dd4f89898c6498217d79776e68bb06", "model_hash": "03418cf8091a9882619950ffb07429a5", "model_config": {"model": "hf_checkpoints/dclm-pool-1b-1x-h-fasttext-open_lm_1b_swiglutorch-warm5000-lr0p003-wd0p033-cd3e-05-bs256-mult1-seed124-tokens28795904000", "revision": null, "trust_remote_code": null, "max_length": 2048, "model_path": null, "model_type": "hf"}, "task_config": {"task_name": "mmlu_professional_law:mc", "task_core": "mmlu_professional_law", "limit": null, "split": "test", "num_shots": 5, "fewshot_seed": 1234, "primary_metric": "acc_raw", "random_subsample_seed": 1234, "context_kwargs": {"no_label_prefix_space": false}, "generation_kwargs": null, "metric_kwargs": null, "native_id_field": "index", "fewshot_source": null, "dataset_path": "cais/mmlu", "dataset_name": "professional_law", "use_chat_format": null, "version": 1, "revision": null, "metadata": {"regimes": ["OLMES-v0.1"], "alias": "mmlu_professional_law:mc::olmes"}}, "compute_config": {"batch_size": "16", "max_batch_size": 32, "output_dir": "hf_checkpoints/dclm-pool-1b-1x-h-fasttext-open_lm_1b_swiglutorch-warm5000-lr0p003-wd0p033-cd3e-05-bs256-mult1-seed124-tokens28795904000/olmes_fp32/mmlu", "num_recorded_inputs": 3, "save_raw_requests": true, "recompute_metrics": false, "check_datalake": false, "push_datalake": false, "no_datalake": false}, "processing_time": 192.40008306503296, "current_date": "2024-11-19 21:17:50 UTC", "num_instances": 1534, "beaker_info": {}, "metrics": {"acc_raw": 0.24511082138200782, "acc_per_token": 0.24511082138200782, "acc_per_char": 0.24511082138200782, "correct_loss_raw": 1.4002322044801525, "incorrect_loss_raw": 1.3988230905031325, "correct_loss_per_token": 1.4002322044801525, "incorrect_loss_per_token": 1.3988230905031325, "correct_loss_per_char": 0.7001161022400763, "incorrect_loss_per_char": 0.6994115452515662, "primary_score": 0.24511082138200782}, "task_idx": 48} {"task_name": "mmlu_professional_medicine:mc", "task_hash": "8b8aa33e03e2f1b4abff4cbb3dd56cd7", "model_hash": "03418cf8091a9882619950ffb07429a5", "model_config": {"model": "hf_checkpoints/dclm-pool-1b-1x-h-fasttext-open_lm_1b_swiglutorch-warm5000-lr0p003-wd0p033-cd3e-05-bs256-mult1-seed124-tokens28795904000", "revision": null, "trust_remote_code": null, "max_length": 2048, "model_path": null, "model_type": "hf"}, "task_config": {"task_name": "mmlu_professional_medicine:mc", "task_core": "mmlu_professional_medicine", "limit": null, "split": "test", "num_shots": 5, "fewshot_seed": 1234, "primary_metric": "acc_raw", "random_subsample_seed": 1234, "context_kwargs": {"no_label_prefix_space": false}, "generation_kwargs": null, "metric_kwargs": null, "native_id_field": "index", "fewshot_source": null, "dataset_path": "cais/mmlu", "dataset_name": "professional_medicine", "use_chat_format": null, "version": 1, "revision": null, "metadata": {"regimes": ["OLMES-v0.1"], "alias": "mmlu_professional_medicine:mc::olmes"}}, "compute_config": {"batch_size": "16", "max_batch_size": 32, "output_dir": "hf_checkpoints/dclm-pool-1b-1x-h-fasttext-open_lm_1b_swiglutorch-warm5000-lr0p003-wd0p033-cd3e-05-bs256-mult1-seed124-tokens28795904000/olmes_fp32/mmlu", "num_recorded_inputs": 3, "save_raw_requests": true, "recompute_metrics": false, "check_datalake": false, "push_datalake": false, "no_datalake": false}, "processing_time": 21.888848781585693, "current_date": "2024-11-19 21:21:02 UTC", "num_instances": 272, "beaker_info": {}, "metrics": {"acc_raw": 0.4411764705882353, "acc_per_token": 0.4411764705882353, "acc_per_char": 0.4411764705882353, "correct_loss_raw": 1.3388401828706264, "incorrect_loss_raw": 1.446392762748634, "correct_loss_per_token": 1.3388401828706264, "incorrect_loss_per_token": 1.446392762748634, "correct_loss_per_char": 0.6694200914353132, "incorrect_loss_per_char": 0.723196381374317, "primary_score": 0.4411764705882353}, "task_idx": 49} {"task_name": "mmlu_professional_psychology:mc", "task_hash": "3094d326fde18b55836110e1d0f8f241", "model_hash": "03418cf8091a9882619950ffb07429a5", "model_config": {"model": "hf_checkpoints/dclm-pool-1b-1x-h-fasttext-open_lm_1b_swiglutorch-warm5000-lr0p003-wd0p033-cd3e-05-bs256-mult1-seed124-tokens28795904000", "revision": null, "trust_remote_code": null, "max_length": 2048, "model_path": null, "model_type": "hf"}, "task_config": {"task_name": "mmlu_professional_psychology:mc", "task_core": "mmlu_professional_psychology", "limit": null, "split": "test", "num_shots": 5, "fewshot_seed": 1234, "primary_metric": "acc_raw", "random_subsample_seed": 1234, "context_kwargs": {"no_label_prefix_space": false}, "generation_kwargs": null, "metric_kwargs": null, "native_id_field": "index", "fewshot_source": null, "dataset_path": "cais/mmlu", "dataset_name": "professional_psychology", "use_chat_format": null, "version": 1, "revision": null, "metadata": {"regimes": ["OLMES-v0.1"], "alias": "mmlu_professional_psychology:mc::olmes"}}, "compute_config": {"batch_size": "16", "max_batch_size": 32, "output_dir": "hf_checkpoints/dclm-pool-1b-1x-h-fasttext-open_lm_1b_swiglutorch-warm5000-lr0p003-wd0p033-cd3e-05-bs256-mult1-seed124-tokens28795904000/olmes_fp32/mmlu", "num_recorded_inputs": 3, "save_raw_requests": true, "recompute_metrics": false, "check_datalake": false, "push_datalake": false, "no_datalake": false}, "processing_time": 26.77055311203003, "current_date": "2024-11-19 21:21:24 UTC", "num_instances": 612, "beaker_info": {}, "metrics": {"acc_raw": 0.24019607843137256, "acc_per_token": 0.24019607843137256, "acc_per_char": 0.24019607843137256, "correct_loss_raw": 1.4043345170862533, "incorrect_loss_raw": 1.4039143988120013, "correct_loss_per_token": 1.4043345170862533, "incorrect_loss_per_token": 1.4039143988120013, "correct_loss_per_char": 0.7021672585431267, "incorrect_loss_per_char": 0.7019571994060007, "primary_score": 0.24019607843137256}, "task_idx": 50} {"task_name": "mmlu_public_relations:mc", "task_hash": "b10f684a09888253de5b2778544ace3d", "model_hash": "03418cf8091a9882619950ffb07429a5", "model_config": {"model": "hf_checkpoints/dclm-pool-1b-1x-h-fasttext-open_lm_1b_swiglutorch-warm5000-lr0p003-wd0p033-cd3e-05-bs256-mult1-seed124-tokens28795904000", "revision": null, "trust_remote_code": null, "max_length": 2048, "model_path": null, "model_type": "hf"}, "task_config": {"task_name": "mmlu_public_relations:mc", "task_core": "mmlu_public_relations", "limit": null, "split": "test", "num_shots": 5, "fewshot_seed": 1234, "primary_metric": "acc_raw", "random_subsample_seed": 1234, "context_kwargs": {"no_label_prefix_space": false}, "generation_kwargs": null, "metric_kwargs": null, "native_id_field": "index", "fewshot_source": null, "dataset_path": "cais/mmlu", "dataset_name": "public_relations", "use_chat_format": null, "version": 1, "revision": null, "metadata": {"regimes": ["OLMES-v0.1"], "alias": "mmlu_public_relations:mc::olmes"}}, "compute_config": {"batch_size": "16", "max_batch_size": 32, "output_dir": "hf_checkpoints/dclm-pool-1b-1x-h-fasttext-open_lm_1b_swiglutorch-warm5000-lr0p003-wd0p033-cd3e-05-bs256-mult1-seed124-tokens28795904000/olmes_fp32/mmlu", "num_recorded_inputs": 3, "save_raw_requests": true, "recompute_metrics": false, "check_datalake": false, "push_datalake": false, "no_datalake": false}, "processing_time": 3.7334680557250977, "current_date": "2024-11-19 21:21:51 UTC", "num_instances": 110, "beaker_info": {}, "metrics": {"acc_raw": 0.24545454545454545, "acc_per_token": 0.24545454545454545, "acc_per_char": 0.24545454545454545, "correct_loss_raw": 1.3907646802338687, "incorrect_loss_raw": 1.4154231024510933, "correct_loss_per_token": 1.3907646802338687, "incorrect_loss_per_token": 1.4154231024510933, "correct_loss_per_char": 0.6953823401169343, "incorrect_loss_per_char": 0.7077115512255466, "primary_score": 0.24545454545454545}, "task_idx": 51} {"task_name": "mmlu_security_studies:mc", "task_hash": "1f8f03c4608bfc16b773b6789dff3612", "model_hash": "03418cf8091a9882619950ffb07429a5", "model_config": {"model": "hf_checkpoints/dclm-pool-1b-1x-h-fasttext-open_lm_1b_swiglutorch-warm5000-lr0p003-wd0p033-cd3e-05-bs256-mult1-seed124-tokens28795904000", "revision": null, "trust_remote_code": null, "max_length": 2048, "model_path": null, "model_type": "hf"}, "task_config": {"task_name": "mmlu_security_studies:mc", "task_core": "mmlu_security_studies", "limit": null, "split": "test", "num_shots": 5, "fewshot_seed": 1234, "primary_metric": "acc_raw", "random_subsample_seed": 1234, "context_kwargs": {"no_label_prefix_space": false}, "generation_kwargs": null, "metric_kwargs": null, "native_id_field": "index", "fewshot_source": null, "dataset_path": "cais/mmlu", "dataset_name": "security_studies", "use_chat_format": null, "version": 1, "revision": null, "metadata": {"regimes": ["OLMES-v0.1"], "alias": "mmlu_security_studies:mc::olmes"}}, "compute_config": {"batch_size": "16", "max_batch_size": 32, "output_dir": "hf_checkpoints/dclm-pool-1b-1x-h-fasttext-open_lm_1b_swiglutorch-warm5000-lr0p003-wd0p033-cd3e-05-bs256-mult1-seed124-tokens28795904000/olmes_fp32/mmlu", "num_recorded_inputs": 3, "save_raw_requests": true, "recompute_metrics": false, "check_datalake": false, "push_datalake": false, "no_datalake": false}, "processing_time": 23.006799936294556, "current_date": "2024-11-19 21:21:55 UTC", "num_instances": 245, "beaker_info": {}, "metrics": {"acc_raw": 0.1836734693877551, "acc_per_token": 0.1836734693877551, "acc_per_char": 0.1836734693877551, "correct_loss_raw": 1.4304668886320933, "incorrect_loss_raw": 1.3972869594891864, "correct_loss_per_token": 1.4304668886320933, "incorrect_loss_per_token": 1.3972869594891864, "correct_loss_per_char": 0.7152334443160466, "incorrect_loss_per_char": 0.6986434797445932, "primary_score": 0.1836734693877551}, "task_idx": 52} {"task_name": "mmlu_sociology:mc", "task_hash": "8febc5ac38c21f5a0811d42006faf2ea", "model_hash": "03418cf8091a9882619950ffb07429a5", "model_config": {"model": "hf_checkpoints/dclm-pool-1b-1x-h-fasttext-open_lm_1b_swiglutorch-warm5000-lr0p003-wd0p033-cd3e-05-bs256-mult1-seed124-tokens28795904000", "revision": null, "trust_remote_code": null, "max_length": 2048, "model_path": null, "model_type": "hf"}, "task_config": {"task_name": "mmlu_sociology:mc", "task_core": "mmlu_sociology", "limit": null, "split": "test", "num_shots": 5, "fewshot_seed": 1234, "primary_metric": "acc_raw", "random_subsample_seed": 1234, "context_kwargs": {"no_label_prefix_space": false}, "generation_kwargs": null, "metric_kwargs": null, "native_id_field": "index", "fewshot_source": null, "dataset_path": "cais/mmlu", "dataset_name": "sociology", "use_chat_format": null, "version": 1, "revision": null, "metadata": {"regimes": ["OLMES-v0.1"], "alias": "mmlu_sociology:mc::olmes"}}, "compute_config": {"batch_size": "16", "max_batch_size": 32, "output_dir": "hf_checkpoints/dclm-pool-1b-1x-h-fasttext-open_lm_1b_swiglutorch-warm5000-lr0p003-wd0p033-cd3e-05-bs256-mult1-seed124-tokens28795904000/olmes_fp32/mmlu", "num_recorded_inputs": 3, "save_raw_requests": true, "recompute_metrics": false, "check_datalake": false, "push_datalake": false, "no_datalake": false}, "processing_time": 7.014625310897827, "current_date": "2024-11-19 21:22:18 UTC", "num_instances": 201, "beaker_info": {}, "metrics": {"acc_raw": 0.20398009950248755, "acc_per_token": 0.20398009950248755, "acc_per_char": 0.20398009950248755, "correct_loss_raw": 1.4055783861312108, "incorrect_loss_raw": 1.4032114724417035, "correct_loss_per_token": 1.4055783861312108, "incorrect_loss_per_token": 1.4032114724417035, "correct_loss_per_char": 0.7027891930656054, "incorrect_loss_per_char": 0.7016057362208518, "primary_score": 0.20398009950248755}, "task_idx": 53} {"task_name": "mmlu_us_foreign_policy:mc", "task_hash": "cceb9539ca6356676c1a014a74093ec9", "model_hash": "03418cf8091a9882619950ffb07429a5", "model_config": {"model": "hf_checkpoints/dclm-pool-1b-1x-h-fasttext-open_lm_1b_swiglutorch-warm5000-lr0p003-wd0p033-cd3e-05-bs256-mult1-seed124-tokens28795904000", "revision": null, "trust_remote_code": null, "max_length": 2048, "model_path": null, "model_type": "hf"}, "task_config": {"task_name": "mmlu_us_foreign_policy:mc", "task_core": "mmlu_us_foreign_policy", "limit": null, "split": "test", "num_shots": 5, "fewshot_seed": 1234, "primary_metric": "acc_raw", "random_subsample_seed": 1234, "context_kwargs": {"no_label_prefix_space": false}, "generation_kwargs": null, "metric_kwargs": null, "native_id_field": "index", "fewshot_source": null, "dataset_path": "cais/mmlu", "dataset_name": "us_foreign_policy", "use_chat_format": null, "version": 1, "revision": null, "metadata": {"regimes": ["OLMES-v0.1"], "alias": "mmlu_us_foreign_policy:mc::olmes"}}, "compute_config": {"batch_size": "16", "max_batch_size": 32, "output_dir": "hf_checkpoints/dclm-pool-1b-1x-h-fasttext-open_lm_1b_swiglutorch-warm5000-lr0p003-wd0p033-cd3e-05-bs256-mult1-seed124-tokens28795904000/olmes_fp32/mmlu", "num_recorded_inputs": 3, "save_raw_requests": true, "recompute_metrics": false, "check_datalake": false, "push_datalake": false, "no_datalake": false}, "processing_time": 3.496429920196533, "current_date": "2024-11-19 21:22:25 UTC", "num_instances": 100, "beaker_info": {}, "metrics": {"acc_raw": 0.25, "acc_per_token": 0.25, "acc_per_char": 0.25, "correct_loss_raw": 1.3994287383556365, "incorrect_loss_raw": 1.401831651926041, "correct_loss_per_token": 1.3994287383556365, "incorrect_loss_per_token": 1.401831651926041, "correct_loss_per_char": 0.6997143691778183, "incorrect_loss_per_char": 0.7009158259630205, "primary_score": 0.25}, "task_idx": 54} {"task_name": "mmlu_virology:mc", "task_hash": "1b216fb4e04c61029da5dfb32810fabc", "model_hash": "03418cf8091a9882619950ffb07429a5", "model_config": {"model": "hf_checkpoints/dclm-pool-1b-1x-h-fasttext-open_lm_1b_swiglutorch-warm5000-lr0p003-wd0p033-cd3e-05-bs256-mult1-seed124-tokens28795904000", "revision": null, "trust_remote_code": null, "max_length": 2048, "model_path": null, "model_type": "hf"}, "task_config": {"task_name": "mmlu_virology:mc", "task_core": "mmlu_virology", "limit": null, "split": "test", "num_shots": 5, "fewshot_seed": 1234, "primary_metric": "acc_raw", "random_subsample_seed": 1234, "context_kwargs": {"no_label_prefix_space": false}, "generation_kwargs": null, "metric_kwargs": null, "native_id_field": "index", "fewshot_source": null, "dataset_path": "cais/mmlu", "dataset_name": "virology", "use_chat_format": null, "version": 1, "revision": null, "metadata": {"regimes": ["OLMES-v0.1"], "alias": "mmlu_virology:mc::olmes"}}, "compute_config": {"batch_size": "16", "max_batch_size": 32, "output_dir": "hf_checkpoints/dclm-pool-1b-1x-h-fasttext-open_lm_1b_swiglutorch-warm5000-lr0p003-wd0p033-cd3e-05-bs256-mult1-seed124-tokens28795904000/olmes_fp32/mmlu", "num_recorded_inputs": 3, "save_raw_requests": true, "recompute_metrics": false, "check_datalake": false, "push_datalake": false, "no_datalake": false}, "processing_time": 4.60969090461731, "current_date": "2024-11-19 21:22:28 UTC", "num_instances": 166, "beaker_info": {}, "metrics": {"acc_raw": 0.27710843373493976, "acc_per_token": 0.27710843373493976, "acc_per_char": 0.27710843373493976, "correct_loss_raw": 1.3888891016144351, "incorrect_loss_raw": 1.4161239694878751, "correct_loss_per_token": 1.3888891016144351, "incorrect_loss_per_token": 1.4161239694878751, "correct_loss_per_char": 0.6944445508072176, "incorrect_loss_per_char": 0.7080619847439376, "primary_score": 0.27710843373493976}, "task_idx": 55} {"task_name": "mmlu_world_religions:mc", "task_hash": "223d634e4c9d91a64ed77b7e259d7010", "model_hash": "03418cf8091a9882619950ffb07429a5", "model_config": {"model": "hf_checkpoints/dclm-pool-1b-1x-h-fasttext-open_lm_1b_swiglutorch-warm5000-lr0p003-wd0p033-cd3e-05-bs256-mult1-seed124-tokens28795904000", "revision": null, "trust_remote_code": null, "max_length": 2048, "model_path": null, "model_type": "hf"}, "task_config": {"task_name": "mmlu_world_religions:mc", "task_core": "mmlu_world_religions", "limit": null, "split": "test", "num_shots": 5, "fewshot_seed": 1234, "primary_metric": "acc_raw", "random_subsample_seed": 1234, "context_kwargs": {"no_label_prefix_space": false}, "generation_kwargs": null, "metric_kwargs": null, "native_id_field": "index", "fewshot_source": null, "dataset_path": "cais/mmlu", "dataset_name": "world_religions", "use_chat_format": null, "version": 1, "revision": null, "metadata": {"regimes": ["OLMES-v0.1"], "alias": "mmlu_world_religions:mc::olmes"}}, "compute_config": {"batch_size": "16", "max_batch_size": 32, "output_dir": "hf_checkpoints/dclm-pool-1b-1x-h-fasttext-open_lm_1b_swiglutorch-warm5000-lr0p003-wd0p033-cd3e-05-bs256-mult1-seed124-tokens28795904000/olmes_fp32/mmlu", "num_recorded_inputs": 3, "save_raw_requests": true, "recompute_metrics": false, "check_datalake": false, "push_datalake": false, "no_datalake": false}, "processing_time": 3.8880889415740967, "current_date": "2024-11-19 21:22:33 UTC", "num_instances": 171, "beaker_info": {}, "metrics": {"acc_raw": 0.23976608187134502, "acc_per_token": 0.23976608187134502, "acc_per_char": 0.23976608187134502, "correct_loss_raw": 1.3958141071754588, "incorrect_loss_raw": 1.39785935144443, "correct_loss_per_token": 1.3958141071754588, "incorrect_loss_per_token": 1.39785935144443, "correct_loss_per_char": 0.6979070535877294, "incorrect_loss_per_char": 0.698929675722215, "primary_score": 0.23976608187134502}, "task_idx": 56} {"task_name": "mmlu_abstract_algebra", "task_hash": "c85fa3ca2628093d327501718793d07b", "model_hash": "03418cf8091a9882619950ffb07429a5", "model_config": {"model": "hf_checkpoints/dclm-pool-1b-1x-h-fasttext-open_lm_1b_swiglutorch-warm5000-lr0p003-wd0p033-cd3e-05-bs256-mult1-seed124-tokens28795904000", "revision": null, "trust_remote_code": null, "max_length": 2048, "model_path": null, "model_type": "hf"}, "task_config": {"task_name": "mmlu_abstract_algebra", "task_core": "mmlu_abstract_algebra", "limit": null, "split": "test", "num_shots": 5, "fewshot_seed": 1234, "primary_metric": "acc_per_char", "random_subsample_seed": 1234, "context_kwargs": null, "generation_kwargs": null, "metric_kwargs": {"uncond_docid_offset": 1000000}, "native_id_field": "index", "fewshot_source": null, "dataset_path": "cais/mmlu", "dataset_name": "abstract_algebra", "use_chat_format": null, "version": 1, "revision": null, "metadata": {"regimes": ["OLMES-v0.1"], "alias": "mmlu_abstract_algebra:rc::olmes"}}, "compute_config": {"batch_size": "16", "max_batch_size": 32, "output_dir": "hf_checkpoints/dclm-pool-1b-1x-h-fasttext-open_lm_1b_swiglutorch-warm5000-lr0p003-wd0p033-cd3e-05-bs256-mult1-seed124-tokens28795904000/olmes_fp32/mmlu", "num_recorded_inputs": 3, "save_raw_requests": true, "recompute_metrics": false, "check_datalake": false, "push_datalake": false, "no_datalake": false}, "processing_time": 3.784212112426758, "current_date": "2024-11-19 21:22:37 UTC", "num_instances": 100, "beaker_info": {}, "metrics": {"acc_raw": 0.22, "acc_per_token": 0.27, "acc_per_char": 0.25, "correct_loss_raw": 6.4551828312873845, "incorrect_loss_raw": 4.987657780647279, "correct_loss_per_token": 1.7790700636836112, "incorrect_loss_per_token": 1.9649607666646502, "correct_loss_per_char": 0.7085478552367768, "incorrect_loss_per_char": 0.7159901483865085, "acc_uncond": 0.3, "correct_loss_uncond": -9.357519872188568, "incorrect_loss_uncond": -8.862104341189067, "primary_score": 0.25}, "task_idx": 57} {"task_name": "mmlu_anatomy", "task_hash": "3f9b02c965eba1bd23b0446d0e9deff4", "model_hash": "03418cf8091a9882619950ffb07429a5", "model_config": {"model": "hf_checkpoints/dclm-pool-1b-1x-h-fasttext-open_lm_1b_swiglutorch-warm5000-lr0p003-wd0p033-cd3e-05-bs256-mult1-seed124-tokens28795904000", "revision": null, "trust_remote_code": null, "max_length": 2048, "model_path": null, "model_type": "hf"}, "task_config": {"task_name": "mmlu_anatomy", "task_core": "mmlu_anatomy", "limit": null, "split": "test", "num_shots": 5, "fewshot_seed": 1234, "primary_metric": "acc_per_char", "random_subsample_seed": 1234, "context_kwargs": null, "generation_kwargs": null, "metric_kwargs": {"uncond_docid_offset": 1000000}, "native_id_field": "index", "fewshot_source": null, "dataset_path": "cais/mmlu", "dataset_name": "anatomy", "use_chat_format": null, "version": 1, "revision": null, "metadata": {"regimes": ["OLMES-v0.1"], "alias": "mmlu_anatomy:rc::olmes"}}, "compute_config": {"batch_size": "16", "max_batch_size": 32, "output_dir": "hf_checkpoints/dclm-pool-1b-1x-h-fasttext-open_lm_1b_swiglutorch-warm5000-lr0p003-wd0p033-cd3e-05-bs256-mult1-seed124-tokens28795904000/olmes_fp32/mmlu", "num_recorded_inputs": 3, "save_raw_requests": true, "recompute_metrics": false, "check_datalake": false, "push_datalake": false, "no_datalake": false}, "processing_time": 6.426770448684692, "current_date": "2024-11-19 21:22:40 UTC", "num_instances": 135, "beaker_info": {}, "metrics": {"acc_raw": 0.34074074074074073, "acc_per_token": 0.2962962962962963, "acc_per_char": 0.34074074074074073, "correct_loss_raw": 17.91354572199009, "incorrect_loss_raw": 18.324018736238834, "correct_loss_per_token": 2.270532707861062, "incorrect_loss_per_token": 2.6184480049899297, "correct_loss_per_char": 0.5162493756103963, "incorrect_loss_per_char": 0.5941078473379293, "acc_uncond": 0.31851851851851853, "correct_loss_uncond": -14.984284937381744, "incorrect_loss_uncond": -14.478863314640376, "primary_score": 0.34074074074074073}, "task_idx": 58} {"task_name": "mmlu_astronomy", "task_hash": "d9e63c18cde7815546c5a54ffadb81f9", "model_hash": "03418cf8091a9882619950ffb07429a5", "model_config": {"model": "hf_checkpoints/dclm-pool-1b-1x-h-fasttext-open_lm_1b_swiglutorch-warm5000-lr0p003-wd0p033-cd3e-05-bs256-mult1-seed124-tokens28795904000", "revision": null, "trust_remote_code": null, "max_length": 2048, "model_path": null, "model_type": "hf"}, "task_config": {"task_name": "mmlu_astronomy", "task_core": "mmlu_astronomy", "limit": null, "split": "test", "num_shots": 5, "fewshot_seed": 1234, "primary_metric": "acc_per_char", "random_subsample_seed": 1234, "context_kwargs": null, "generation_kwargs": null, "metric_kwargs": {"uncond_docid_offset": 1000000}, "native_id_field": "index", "fewshot_source": null, "dataset_path": "cais/mmlu", "dataset_name": "astronomy", "use_chat_format": null, "version": 1, "revision": null, "metadata": {"regimes": ["OLMES-v0.1"], "alias": "mmlu_astronomy:rc::olmes"}}, "compute_config": {"batch_size": "16", "max_batch_size": 32, "output_dir": "hf_checkpoints/dclm-pool-1b-1x-h-fasttext-open_lm_1b_swiglutorch-warm5000-lr0p003-wd0p033-cd3e-05-bs256-mult1-seed124-tokens28795904000/olmes_fp32/mmlu", "num_recorded_inputs": 3, "save_raw_requests": true, "recompute_metrics": false, "check_datalake": false, "push_datalake": false, "no_datalake": false}, "processing_time": 10.074229717254639, "current_date": "2024-11-19 21:22:47 UTC", "num_instances": 152, "beaker_info": {}, "metrics": {"acc_raw": 0.3223684210526316, "acc_per_token": 0.39473684210526316, "acc_per_char": 0.4144736842105263, "correct_loss_raw": 25.177843918925838, "incorrect_loss_raw": 25.155770634872887, "correct_loss_per_token": 2.324497508223701, "incorrect_loss_per_token": 2.811709351684318, "correct_loss_per_char": 0.5685864552887461, "incorrect_loss_per_char": 0.6720247721294323, "acc_uncond": 0.42105263157894735, "correct_loss_uncond": -15.207309873480545, "incorrect_loss_uncond": -14.05902164703921, "primary_score": 0.4144736842105263}, "task_idx": 59} {"task_name": "mmlu_business_ethics", "task_hash": "dbbf5c673a31d657513075cc70e4f670", "model_hash": "03418cf8091a9882619950ffb07429a5", "model_config": {"model": "hf_checkpoints/dclm-pool-1b-1x-h-fasttext-open_lm_1b_swiglutorch-warm5000-lr0p003-wd0p033-cd3e-05-bs256-mult1-seed124-tokens28795904000", "revision": null, "trust_remote_code": null, "max_length": 2048, "model_path": null, "model_type": "hf"}, "task_config": {"task_name": "mmlu_business_ethics", "task_core": "mmlu_business_ethics", "limit": null, "split": "test", "num_shots": 5, "fewshot_seed": 1234, "primary_metric": "acc_per_char", "random_subsample_seed": 1234, "context_kwargs": null, "generation_kwargs": null, "metric_kwargs": {"uncond_docid_offset": 1000000}, "native_id_field": "index", "fewshot_source": null, "dataset_path": "cais/mmlu", "dataset_name": "business_ethics", "use_chat_format": null, "version": 1, "revision": null, "metadata": {"regimes": ["OLMES-v0.1"], "alias": "mmlu_business_ethics:rc::olmes"}}, "compute_config": {"batch_size": "16", "max_batch_size": 32, "output_dir": "hf_checkpoints/dclm-pool-1b-1x-h-fasttext-open_lm_1b_swiglutorch-warm5000-lr0p003-wd0p033-cd3e-05-bs256-mult1-seed124-tokens28795904000/olmes_fp32/mmlu", "num_recorded_inputs": 3, "save_raw_requests": true, "recompute_metrics": false, "check_datalake": false, "push_datalake": false, "no_datalake": false}, "processing_time": 8.859542608261108, "current_date": "2024-11-19 21:22:57 UTC", "num_instances": 100, "beaker_info": {}, "metrics": {"acc_raw": 0.53, "acc_per_token": 0.45, "acc_per_char": 0.46, "correct_loss_raw": 22.757138235569002, "incorrect_loss_raw": 25.77794682184854, "correct_loss_per_token": 3.1831077655639537, "incorrect_loss_per_token": 3.552855390475315, "correct_loss_per_char": 0.8752598337381101, "incorrect_loss_per_char": 0.9435633335962644, "acc_uncond": 0.37, "correct_loss_uncond": -12.232631995677949, "incorrect_loss_uncond": -11.214106442133586, "primary_score": 0.46}, "task_idx": 60} {"task_name": "mmlu_clinical_knowledge", "task_hash": "940022f2e7983e3f56cfc7196b310a7f", "model_hash": "03418cf8091a9882619950ffb07429a5", "model_config": {"model": "hf_checkpoints/dclm-pool-1b-1x-h-fasttext-open_lm_1b_swiglutorch-warm5000-lr0p003-wd0p033-cd3e-05-bs256-mult1-seed124-tokens28795904000", "revision": null, "trust_remote_code": null, "max_length": 2048, "model_path": null, "model_type": "hf"}, "task_config": {"task_name": "mmlu_clinical_knowledge", "task_core": "mmlu_clinical_knowledge", "limit": null, "split": "test", "num_shots": 5, "fewshot_seed": 1234, "primary_metric": "acc_per_char", "random_subsample_seed": 1234, "context_kwargs": null, "generation_kwargs": null, "metric_kwargs": {"uncond_docid_offset": 1000000}, "native_id_field": "index", "fewshot_source": null, "dataset_path": "cais/mmlu", "dataset_name": "clinical_knowledge", "use_chat_format": null, "version": 1, "revision": null, "metadata": {"regimes": ["OLMES-v0.1"], "alias": "mmlu_clinical_knowledge:rc::olmes"}}, "compute_config": {"batch_size": "16", "max_batch_size": 32, "output_dir": "hf_checkpoints/dclm-pool-1b-1x-h-fasttext-open_lm_1b_swiglutorch-warm5000-lr0p003-wd0p033-cd3e-05-bs256-mult1-seed124-tokens28795904000/olmes_fp32/mmlu", "num_recorded_inputs": 3, "save_raw_requests": true, "recompute_metrics": false, "check_datalake": false, "push_datalake": false, "no_datalake": false}, "processing_time": 15.071310758590698, "current_date": "2024-11-19 21:23:06 UTC", "num_instances": 265, "beaker_info": {}, "metrics": {"acc_raw": 0.2792452830188679, "acc_per_token": 0.3433962264150943, "acc_per_char": 0.3660377358490566, "correct_loss_raw": 22.166429183618078, "incorrect_loss_raw": 20.489821842631446, "correct_loss_per_token": 2.5494427778993702, "incorrect_loss_per_token": 2.8195764623307342, "correct_loss_per_char": 0.6004814047788665, "incorrect_loss_per_char": 0.6851914042506143, "acc_uncond": 0.33962264150943394, "correct_loss_uncond": -13.612649080438434, "incorrect_loss_uncond": -12.529704328303072, "primary_score": 0.3660377358490566}, "task_idx": 61} {"task_name": "mmlu_college_biology", "task_hash": "0b879b8081c2b7d376a6abd76697f553", "model_hash": "03418cf8091a9882619950ffb07429a5", "model_config": {"model": "hf_checkpoints/dclm-pool-1b-1x-h-fasttext-open_lm_1b_swiglutorch-warm5000-lr0p003-wd0p033-cd3e-05-bs256-mult1-seed124-tokens28795904000", "revision": null, "trust_remote_code": null, "max_length": 2048, "model_path": null, "model_type": "hf"}, "task_config": {"task_name": "mmlu_college_biology", "task_core": "mmlu_college_biology", "limit": null, "split": "test", "num_shots": 5, "fewshot_seed": 1234, "primary_metric": "acc_per_char", "random_subsample_seed": 1234, "context_kwargs": null, "generation_kwargs": null, "metric_kwargs": {"uncond_docid_offset": 1000000}, "native_id_field": "index", "fewshot_source": null, "dataset_path": "cais/mmlu", "dataset_name": "college_biology", "use_chat_format": null, "version": 1, "revision": null, "metadata": {"regimes": ["OLMES-v0.1"], "alias": "mmlu_college_biology:rc::olmes"}}, "compute_config": {"batch_size": "16", "max_batch_size": 32, "output_dir": "hf_checkpoints/dclm-pool-1b-1x-h-fasttext-open_lm_1b_swiglutorch-warm5000-lr0p003-wd0p033-cd3e-05-bs256-mult1-seed124-tokens28795904000/olmes_fp32/mmlu", "num_recorded_inputs": 3, "save_raw_requests": true, "recompute_metrics": false, "check_datalake": false, "push_datalake": false, "no_datalake": false}, "processing_time": 10.725754976272583, "current_date": "2024-11-19 21:23:21 UTC", "num_instances": 144, "beaker_info": {}, "metrics": {"acc_raw": 0.3680555555555556, "acc_per_token": 0.3611111111111111, "acc_per_char": 0.3819444444444444, "correct_loss_raw": 19.9105933378968, "incorrect_loss_raw": 21.72963875973667, "correct_loss_per_token": 2.5496391006038817, "incorrect_loss_per_token": 2.985122184704055, "correct_loss_per_char": 0.4947993088349139, "incorrect_loss_per_char": 0.5861087337984283, "acc_uncond": 0.3472222222222222, "correct_loss_uncond": -15.96084564882848, "incorrect_loss_uncond": -14.809113546654025, "primary_score": 0.3819444444444444}, "task_idx": 62} {"task_name": "mmlu_college_chemistry", "task_hash": "0ed8a28c3b6ceca7f72f02bc9b87d236", "model_hash": "03418cf8091a9882619950ffb07429a5", "model_config": {"model": "hf_checkpoints/dclm-pool-1b-1x-h-fasttext-open_lm_1b_swiglutorch-warm5000-lr0p003-wd0p033-cd3e-05-bs256-mult1-seed124-tokens28795904000", "revision": null, "trust_remote_code": null, "max_length": 2048, "model_path": null, "model_type": "hf"}, "task_config": {"task_name": "mmlu_college_chemistry", "task_core": "mmlu_college_chemistry", "limit": null, "split": "test", "num_shots": 5, "fewshot_seed": 1234, "primary_metric": "acc_per_char", "random_subsample_seed": 1234, "context_kwargs": null, "generation_kwargs": null, "metric_kwargs": {"uncond_docid_offset": 1000000}, "native_id_field": "index", "fewshot_source": null, "dataset_path": "cais/mmlu", "dataset_name": "college_chemistry", "use_chat_format": null, "version": 1, "revision": null, "metadata": {"regimes": ["OLMES-v0.1"], "alias": "mmlu_college_chemistry:rc::olmes"}}, "compute_config": {"batch_size": "16", "max_batch_size": 32, "output_dir": "hf_checkpoints/dclm-pool-1b-1x-h-fasttext-open_lm_1b_swiglutorch-warm5000-lr0p003-wd0p033-cd3e-05-bs256-mult1-seed124-tokens28795904000/olmes_fp32/mmlu", "num_recorded_inputs": 3, "save_raw_requests": true, "recompute_metrics": false, "check_datalake": false, "push_datalake": false, "no_datalake": false}, "processing_time": 8.232069253921509, "current_date": "2024-11-19 21:23:32 UTC", "num_instances": 100, "beaker_info": {}, "metrics": {"acc_raw": 0.22, "acc_per_token": 0.28, "acc_per_char": 0.26, "correct_loss_raw": 18.67529359340668, "incorrect_loss_raw": 18.623361633221297, "correct_loss_per_token": 2.913132373962731, "incorrect_loss_per_token": 2.9985381063616385, "correct_loss_per_char": 1.1796746788592554, "incorrect_loss_per_char": 1.1639814629662026, "acc_uncond": 0.26, "correct_loss_uncond": -12.370259900093078, "incorrect_loss_uncond": -12.005320732196175, "primary_score": 0.26}, "task_idx": 63} {"task_name": "mmlu_college_computer_science", "task_hash": "563c1a7e8c030ab92f3c9359a1196891", "model_hash": "03418cf8091a9882619950ffb07429a5", "model_config": {"model": "hf_checkpoints/dclm-pool-1b-1x-h-fasttext-open_lm_1b_swiglutorch-warm5000-lr0p003-wd0p033-cd3e-05-bs256-mult1-seed124-tokens28795904000", "revision": null, "trust_remote_code": null, "max_length": 2048, "model_path": null, "model_type": "hf"}, "task_config": {"task_name": "mmlu_college_computer_science", "task_core": "mmlu_college_computer_science", "limit": null, "split": "test", "num_shots": 5, "fewshot_seed": 1234, "primary_metric": "acc_per_char", "random_subsample_seed": 1234, "context_kwargs": null, "generation_kwargs": null, "metric_kwargs": {"uncond_docid_offset": 1000000}, "native_id_field": "index", "fewshot_source": null, "dataset_path": "cais/mmlu", "dataset_name": "college_computer_science", "use_chat_format": null, "version": 1, "revision": null, "metadata": {"regimes": ["OLMES-v0.1"], "alias": "mmlu_college_computer_science:rc::olmes"}}, "compute_config": {"batch_size": "16", "max_batch_size": 32, "output_dir": "hf_checkpoints/dclm-pool-1b-1x-h-fasttext-open_lm_1b_swiglutorch-warm5000-lr0p003-wd0p033-cd3e-05-bs256-mult1-seed124-tokens28795904000/olmes_fp32/mmlu", "num_recorded_inputs": 3, "save_raw_requests": true, "recompute_metrics": false, "check_datalake": false, "push_datalake": false, "no_datalake": false}, "processing_time": 17.076542854309082, "current_date": "2024-11-19 21:23:40 UTC", "num_instances": 100, "beaker_info": {}, "metrics": {"acc_raw": 0.23, "acc_per_token": 0.24, "acc_per_char": 0.29, "correct_loss_raw": 17.125564770698546, "incorrect_loss_raw": 16.317387603123983, "correct_loss_per_token": 2.396058769024732, "incorrect_loss_per_token": 2.5570406531858145, "correct_loss_per_char": 0.7816295440981262, "incorrect_loss_per_char": 0.7795670748224239, "acc_uncond": 0.27, "correct_loss_uncond": -12.883329787254333, "incorrect_loss_uncond": -13.138652811050415, "primary_score": 0.29}, "task_idx": 64} {"task_name": "mmlu_college_mathematics", "task_hash": "97a6ddef0d69128d9260dd1f8c82521c", "model_hash": "03418cf8091a9882619950ffb07429a5", "model_config": {"model": "hf_checkpoints/dclm-pool-1b-1x-h-fasttext-open_lm_1b_swiglutorch-warm5000-lr0p003-wd0p033-cd3e-05-bs256-mult1-seed124-tokens28795904000", "revision": null, "trust_remote_code": null, "max_length": 2048, "model_path": null, "model_type": "hf"}, "task_config": {"task_name": "mmlu_college_mathematics", "task_core": "mmlu_college_mathematics", "limit": null, "split": "test", "num_shots": 5, "fewshot_seed": 1234, "primary_metric": "acc_per_char", "random_subsample_seed": 1234, "context_kwargs": null, "generation_kwargs": null, "metric_kwargs": {"uncond_docid_offset": 1000000}, "native_id_field": "index", "fewshot_source": null, "dataset_path": "cais/mmlu", "dataset_name": "college_mathematics", "use_chat_format": null, "version": 1, "revision": null, "metadata": {"regimes": ["OLMES-v0.1"], "alias": "mmlu_college_mathematics:rc::olmes"}}, "compute_config": {"batch_size": "16", "max_batch_size": 32, "output_dir": "hf_checkpoints/dclm-pool-1b-1x-h-fasttext-open_lm_1b_swiglutorch-warm5000-lr0p003-wd0p033-cd3e-05-bs256-mult1-seed124-tokens28795904000/olmes_fp32/mmlu", "num_recorded_inputs": 3, "save_raw_requests": true, "recompute_metrics": false, "check_datalake": false, "push_datalake": false, "no_datalake": false}, "processing_time": 9.083509683609009, "current_date": "2024-11-19 21:23:57 UTC", "num_instances": 100, "beaker_info": {}, "metrics": {"acc_raw": 0.15, "acc_per_token": 0.22, "acc_per_char": 0.19, "correct_loss_raw": 11.40063785791397, "incorrect_loss_raw": 10.084527692000073, "correct_loss_per_token": 2.8536593351442985, "incorrect_loss_per_token": 2.8226579726819767, "correct_loss_per_char": 1.1760315024355075, "incorrect_loss_per_char": 1.1311194345135664, "acc_uncond": 0.3, "correct_loss_uncond": -9.223767383098602, "incorrect_loss_uncond": -8.866132938861842, "primary_score": 0.19}, "task_idx": 65} {"task_name": "mmlu_college_medicine", "task_hash": "483a77ff3415e8b126e8e83fda055b39", "model_hash": "03418cf8091a9882619950ffb07429a5", "model_config": {"model": "hf_checkpoints/dclm-pool-1b-1x-h-fasttext-open_lm_1b_swiglutorch-warm5000-lr0p003-wd0p033-cd3e-05-bs256-mult1-seed124-tokens28795904000", "revision": null, "trust_remote_code": null, "max_length": 2048, "model_path": null, "model_type": "hf"}, "task_config": {"task_name": "mmlu_college_medicine", "task_core": "mmlu_college_medicine", "limit": null, "split": "test", "num_shots": 5, "fewshot_seed": 1234, "primary_metric": "acc_per_char", "random_subsample_seed": 1234, "context_kwargs": null, "generation_kwargs": null, "metric_kwargs": {"uncond_docid_offset": 1000000}, "native_id_field": "index", "fewshot_source": null, "dataset_path": "cais/mmlu", "dataset_name": "college_medicine", "use_chat_format": null, "version": 1, "revision": null, "metadata": {"regimes": ["OLMES-v0.1"], "alias": "mmlu_college_medicine:rc::olmes"}}, "compute_config": {"batch_size": "16", "max_batch_size": 32, "output_dir": "hf_checkpoints/dclm-pool-1b-1x-h-fasttext-open_lm_1b_swiglutorch-warm5000-lr0p003-wd0p033-cd3e-05-bs256-mult1-seed124-tokens28795904000/olmes_fp32/mmlu", "num_recorded_inputs": 3, "save_raw_requests": true, "recompute_metrics": false, "check_datalake": false, "push_datalake": false, "no_datalake": false}, "processing_time": 14.27556300163269, "current_date": "2024-11-19 21:24:06 UTC", "num_instances": 173, "beaker_info": {}, "metrics": {"acc_raw": 0.30057803468208094, "acc_per_token": 0.3063583815028902, "acc_per_char": 0.28901734104046245, "correct_loss_raw": 19.81116359357889, "incorrect_loss_raw": 19.46911873684222, "correct_loss_per_token": 2.6087844780290466, "incorrect_loss_per_token": 2.7511619527855347, "correct_loss_per_char": 0.6122792239099126, "incorrect_loss_per_char": 0.6498799604226275, "acc_uncond": 0.3352601156069364, "correct_loss_uncond": -13.740935204346055, "incorrect_loss_uncond": -13.052801668529113, "primary_score": 0.28901734104046245}, "task_idx": 66} {"task_name": "mmlu_college_physics", "task_hash": "db149cec3fe17117a3fa544e9ea18d10", "model_hash": "03418cf8091a9882619950ffb07429a5", "model_config": {"model": "hf_checkpoints/dclm-pool-1b-1x-h-fasttext-open_lm_1b_swiglutorch-warm5000-lr0p003-wd0p033-cd3e-05-bs256-mult1-seed124-tokens28795904000", "revision": null, "trust_remote_code": null, "max_length": 2048, "model_path": null, "model_type": "hf"}, "task_config": {"task_name": "mmlu_college_physics", "task_core": "mmlu_college_physics", "limit": null, "split": "test", "num_shots": 5, "fewshot_seed": 1234, "primary_metric": "acc_per_char", "random_subsample_seed": 1234, "context_kwargs": null, "generation_kwargs": null, "metric_kwargs": {"uncond_docid_offset": 1000000}, "native_id_field": "index", "fewshot_source": null, "dataset_path": "cais/mmlu", "dataset_name": "college_physics", "use_chat_format": null, "version": 1, "revision": null, "metadata": {"regimes": ["OLMES-v0.1"], "alias": "mmlu_college_physics:rc::olmes"}}, "compute_config": {"batch_size": "16", "max_batch_size": 32, "output_dir": "hf_checkpoints/dclm-pool-1b-1x-h-fasttext-open_lm_1b_swiglutorch-warm5000-lr0p003-wd0p033-cd3e-05-bs256-mult1-seed124-tokens28795904000/olmes_fp32/mmlu", "num_recorded_inputs": 3, "save_raw_requests": true, "recompute_metrics": false, "check_datalake": false, "push_datalake": false, "no_datalake": false}, "processing_time": 7.697943687438965, "current_date": "2024-11-19 21:24:20 UTC", "num_instances": 102, "beaker_info": {}, "metrics": {"acc_raw": 0.24509803921568626, "acc_per_token": 0.2647058823529412, "acc_per_char": 0.24509803921568626, "correct_loss_raw": 12.496067006213993, "incorrect_loss_raw": 10.78362273858264, "correct_loss_per_token": 2.6446954790713253, "incorrect_loss_per_token": 2.480259769965165, "correct_loss_per_char": 1.0753204948654511, "incorrect_loss_per_char": 0.9993043140143509, "acc_uncond": 0.2549019607843137, "correct_loss_uncond": -11.738114963559543, "incorrect_loss_uncond": -11.791085632797936, "primary_score": 0.24509803921568626}, "task_idx": 67} {"task_name": "mmlu_computer_security", "task_hash": "4a7052996611caebbf6877da200249e9", "model_hash": "03418cf8091a9882619950ffb07429a5", "model_config": {"model": "hf_checkpoints/dclm-pool-1b-1x-h-fasttext-open_lm_1b_swiglutorch-warm5000-lr0p003-wd0p033-cd3e-05-bs256-mult1-seed124-tokens28795904000", "revision": null, "trust_remote_code": null, "max_length": 2048, "model_path": null, "model_type": "hf"}, "task_config": {"task_name": "mmlu_computer_security", "task_core": "mmlu_computer_security", "limit": null, "split": "test", "num_shots": 5, "fewshot_seed": 1234, "primary_metric": "acc_per_char", "random_subsample_seed": 1234, "context_kwargs": null, "generation_kwargs": null, "metric_kwargs": {"uncond_docid_offset": 1000000}, "native_id_field": "index", "fewshot_source": null, "dataset_path": "cais/mmlu", "dataset_name": "computer_security", "use_chat_format": null, "version": 1, "revision": null, "metadata": {"regimes": ["OLMES-v0.1"], "alias": "mmlu_computer_security:rc::olmes"}}, "compute_config": {"batch_size": "16", "max_batch_size": 32, "output_dir": "hf_checkpoints/dclm-pool-1b-1x-h-fasttext-open_lm_1b_swiglutorch-warm5000-lr0p003-wd0p033-cd3e-05-bs256-mult1-seed124-tokens28795904000/olmes_fp32/mmlu", "num_recorded_inputs": 3, "save_raw_requests": true, "recompute_metrics": false, "check_datalake": false, "push_datalake": false, "no_datalake": false}, "processing_time": 34.999040365219116, "current_date": "2024-11-19 21:24:58 UTC", "num_instances": 100, "beaker_info": {}, "metrics": {"acc_raw": 0.44, "acc_per_token": 0.39, "acc_per_char": 0.43, "correct_loss_raw": 21.590673285126687, "incorrect_loss_raw": 21.24347382863363, "correct_loss_per_token": 3.266507198900353, "incorrect_loss_per_token": 3.9848627693716905, "correct_loss_per_char": 0.7648403250653502, "incorrect_loss_per_char": 0.9002014855324758, "acc_uncond": 0.48, "correct_loss_uncond": -11.743729857802391, "incorrect_loss_uncond": -9.172414719263715, "primary_score": 0.43}, "task_idx": 68} {"task_name": "mmlu_conceptual_physics", "task_hash": "f183468e707d67350aa3143009a25cb4", "model_hash": "03418cf8091a9882619950ffb07429a5", "model_config": {"model": "hf_checkpoints/dclm-pool-1b-1x-h-fasttext-open_lm_1b_swiglutorch-warm5000-lr0p003-wd0p033-cd3e-05-bs256-mult1-seed124-tokens28795904000", "revision": null, "trust_remote_code": null, "max_length": 2048, "model_path": null, "model_type": "hf"}, "task_config": {"task_name": "mmlu_conceptual_physics", "task_core": "mmlu_conceptual_physics", "limit": null, "split": "test", "num_shots": 5, "fewshot_seed": 1234, "primary_metric": "acc_per_char", "random_subsample_seed": 1234, "context_kwargs": null, "generation_kwargs": null, "metric_kwargs": {"uncond_docid_offset": 1000000}, "native_id_field": "index", "fewshot_source": null, "dataset_path": "cais/mmlu", "dataset_name": "conceptual_physics", "use_chat_format": null, "version": 1, "revision": null, "metadata": {"regimes": ["OLMES-v0.1"], "alias": "mmlu_conceptual_physics:rc::olmes"}}, "compute_config": {"batch_size": "16", "max_batch_size": 32, "output_dir": "hf_checkpoints/dclm-pool-1b-1x-h-fasttext-open_lm_1b_swiglutorch-warm5000-lr0p003-wd0p033-cd3e-05-bs256-mult1-seed124-tokens28795904000/olmes_fp32/mmlu", "num_recorded_inputs": 3, "save_raw_requests": true, "recompute_metrics": false, "check_datalake": false, "push_datalake": false, "no_datalake": false}, "processing_time": 9.768489122390747, "current_date": "2024-11-19 21:25:03 UTC", "num_instances": 235, "beaker_info": {}, "metrics": {"acc_raw": 0.4297872340425532, "acc_per_token": 0.3872340425531915, "acc_per_char": 0.37872340425531914, "correct_loss_raw": 8.800380459364424, "incorrect_loss_raw": 10.790937607085455, "correct_loss_per_token": 2.8105938294911996, "incorrect_loss_per_token": 3.5675698091199823, "correct_loss_per_char": 0.5748582898927789, "incorrect_loss_per_char": 0.6984745809759881, "acc_uncond": 0.35319148936170214, "correct_loss_uncond": -11.130239910648225, "incorrect_loss_uncond": -9.603436176404886, "primary_score": 0.37872340425531914}, "task_idx": 69} {"task_name": "mmlu_econometrics", "task_hash": "f07b012d85c15887c3dce1c9c732f2cd", "model_hash": "03418cf8091a9882619950ffb07429a5", "model_config": {"model": "hf_checkpoints/dclm-pool-1b-1x-h-fasttext-open_lm_1b_swiglutorch-warm5000-lr0p003-wd0p033-cd3e-05-bs256-mult1-seed124-tokens28795904000", "revision": null, "trust_remote_code": null, "max_length": 2048, "model_path": null, "model_type": "hf"}, "task_config": {"task_name": "mmlu_econometrics", "task_core": "mmlu_econometrics", "limit": null, "split": "test", "num_shots": 5, "fewshot_seed": 1234, "primary_metric": "acc_per_char", "random_subsample_seed": 1234, "context_kwargs": null, "generation_kwargs": null, "metric_kwargs": {"uncond_docid_offset": 1000000}, "native_id_field": "index", "fewshot_source": null, "dataset_path": "cais/mmlu", "dataset_name": "econometrics", "use_chat_format": null, "version": 1, "revision": null, "metadata": {"regimes": ["OLMES-v0.1"], "alias": "mmlu_econometrics:rc::olmes"}}, "compute_config": {"batch_size": "16", "max_batch_size": 32, "output_dir": "hf_checkpoints/dclm-pool-1b-1x-h-fasttext-open_lm_1b_swiglutorch-warm5000-lr0p003-wd0p033-cd3e-05-bs256-mult1-seed124-tokens28795904000/olmes_fp32/mmlu", "num_recorded_inputs": 3, "save_raw_requests": true, "recompute_metrics": false, "check_datalake": false, "push_datalake": false, "no_datalake": false}, "processing_time": 12.96303415298462, "current_date": "2024-11-19 21:25:13 UTC", "num_instances": 114, "beaker_info": {}, "metrics": {"acc_raw": 0.2719298245614035, "acc_per_token": 0.2894736842105263, "acc_per_char": 0.2807017543859649, "correct_loss_raw": 20.503092958216083, "incorrect_loss_raw": 21.490892941840208, "correct_loss_per_token": 2.2654041138946712, "incorrect_loss_per_token": 2.2292898427550645, "correct_loss_per_char": 0.5412424556774481, "incorrect_loss_per_char": 0.557237568210862, "acc_uncond": 0.2894736842105263, "correct_loss_uncond": -15.166060527165731, "incorrect_loss_uncond": -14.935436847962826, "primary_score": 0.2807017543859649}, "task_idx": 70} {"task_name": "mmlu_electrical_engineering", "task_hash": "4dd791561a029e99d7a01f69b382e913", "model_hash": "03418cf8091a9882619950ffb07429a5", "model_config": {"model": "hf_checkpoints/dclm-pool-1b-1x-h-fasttext-open_lm_1b_swiglutorch-warm5000-lr0p003-wd0p033-cd3e-05-bs256-mult1-seed124-tokens28795904000", "revision": null, "trust_remote_code": null, "max_length": 2048, "model_path": null, "model_type": "hf"}, "task_config": {"task_name": "mmlu_electrical_engineering", "task_core": "mmlu_electrical_engineering", "limit": null, "split": "test", "num_shots": 5, "fewshot_seed": 1234, "primary_metric": "acc_per_char", "random_subsample_seed": 1234, "context_kwargs": null, "generation_kwargs": null, "metric_kwargs": {"uncond_docid_offset": 1000000}, "native_id_field": "index", "fewshot_source": null, "dataset_path": "cais/mmlu", "dataset_name": "electrical_engineering", "use_chat_format": null, "version": 1, "revision": null, "metadata": {"regimes": ["OLMES-v0.1"], "alias": "mmlu_electrical_engineering:rc::olmes"}}, "compute_config": {"batch_size": "16", "max_batch_size": 32, "output_dir": "hf_checkpoints/dclm-pool-1b-1x-h-fasttext-open_lm_1b_swiglutorch-warm5000-lr0p003-wd0p033-cd3e-05-bs256-mult1-seed124-tokens28795904000/olmes_fp32/mmlu", "num_recorded_inputs": 3, "save_raw_requests": true, "recompute_metrics": false, "check_datalake": false, "push_datalake": false, "no_datalake": false}, "processing_time": 10.668545246124268, "current_date": "2024-11-19 21:25:26 UTC", "num_instances": 145, "beaker_info": {}, "metrics": {"acc_raw": 0.25517241379310346, "acc_per_token": 0.31724137931034485, "acc_per_char": 0.2482758620689655, "correct_loss_raw": 13.839519211341594, "incorrect_loss_raw": 13.613122390056475, "correct_loss_per_token": 3.3820394688821844, "incorrect_loss_per_token": 3.688959328273953, "correct_loss_per_char": 0.9524712168695203, "incorrect_loss_per_char": 0.9578910470850094, "acc_uncond": 0.23448275862068965, "correct_loss_uncond": -8.453801628638958, "incorrect_loss_uncond": -9.51819067960498, "primary_score": 0.2482758620689655}, "task_idx": 71} {"task_name": "mmlu_elementary_mathematics", "task_hash": "34eb4bd85bcf6cf6a0740154b20610f9", "model_hash": "03418cf8091a9882619950ffb07429a5", "model_config": {"model": "hf_checkpoints/dclm-pool-1b-1x-h-fasttext-open_lm_1b_swiglutorch-warm5000-lr0p003-wd0p033-cd3e-05-bs256-mult1-seed124-tokens28795904000", "revision": null, "trust_remote_code": null, "max_length": 2048, "model_path": null, "model_type": "hf"}, "task_config": {"task_name": "mmlu_elementary_mathematics", "task_core": "mmlu_elementary_mathematics", "limit": null, "split": "test", "num_shots": 5, "fewshot_seed": 1234, "primary_metric": "acc_per_char", "random_subsample_seed": 1234, "context_kwargs": null, "generation_kwargs": null, "metric_kwargs": {"uncond_docid_offset": 1000000}, "native_id_field": "index", "fewshot_source": null, "dataset_path": "cais/mmlu", "dataset_name": "elementary_mathematics", "use_chat_format": null, "version": 1, "revision": null, "metadata": {"regimes": ["OLMES-v0.1"], "alias": "mmlu_elementary_mathematics:rc::olmes"}}, "compute_config": {"batch_size": "16", "max_batch_size": 32, "output_dir": "hf_checkpoints/dclm-pool-1b-1x-h-fasttext-open_lm_1b_swiglutorch-warm5000-lr0p003-wd0p033-cd3e-05-bs256-mult1-seed124-tokens28795904000/olmes_fp32/mmlu", "num_recorded_inputs": 3, "save_raw_requests": true, "recompute_metrics": false, "check_datalake": false, "push_datalake": false, "no_datalake": false}, "processing_time": 23.200291633605957, "current_date": "2024-11-19 21:25:36 UTC", "num_instances": 378, "beaker_info": {}, "metrics": {"acc_raw": 0.24338624338624337, "acc_per_token": 0.24338624338624337, "acc_per_char": 0.2751322751322751, "correct_loss_raw": 11.930546489657548, "incorrect_loss_raw": 12.10207574045847, "correct_loss_per_token": 3.8344038402479708, "incorrect_loss_per_token": 3.9704004893855487, "correct_loss_per_char": 1.5278709612794505, "incorrect_loss_per_char": 1.5513099371206358, "acc_uncond": 0.26455026455026454, "correct_loss_uncond": -8.801304420781515, "incorrect_loss_uncond": -8.569474821473346, "primary_score": 0.2751322751322751}, "task_idx": 72} {"task_name": "mmlu_formal_logic", "task_hash": "edba816f035a5a7d7df7dae63a847ed4", "model_hash": "03418cf8091a9882619950ffb07429a5", "model_config": {"model": "hf_checkpoints/dclm-pool-1b-1x-h-fasttext-open_lm_1b_swiglutorch-warm5000-lr0p003-wd0p033-cd3e-05-bs256-mult1-seed124-tokens28795904000", "revision": null, "trust_remote_code": null, "max_length": 2048, "model_path": null, "model_type": "hf"}, "task_config": {"task_name": "mmlu_formal_logic", "task_core": "mmlu_formal_logic", "limit": null, "split": "test", "num_shots": 5, "fewshot_seed": 1234, "primary_metric": "acc_per_char", "random_subsample_seed": 1234, "context_kwargs": null, "generation_kwargs": null, "metric_kwargs": {"uncond_docid_offset": 1000000}, "native_id_field": "index", "fewshot_source": null, "dataset_path": "cais/mmlu", "dataset_name": "formal_logic", "use_chat_format": null, "version": 1, "revision": null, "metadata": {"regimes": ["OLMES-v0.1"], "alias": "mmlu_formal_logic:rc::olmes"}}, "compute_config": {"batch_size": "16", "max_batch_size": 32, "output_dir": "hf_checkpoints/dclm-pool-1b-1x-h-fasttext-open_lm_1b_swiglutorch-warm5000-lr0p003-wd0p033-cd3e-05-bs256-mult1-seed124-tokens28795904000/olmes_fp32/mmlu", "num_recorded_inputs": 3, "save_raw_requests": true, "recompute_metrics": false, "check_datalake": false, "push_datalake": false, "no_datalake": false}, "processing_time": 13.340330123901367, "current_date": "2024-11-19 21:26:00 UTC", "num_instances": 126, "beaker_info": {}, "metrics": {"acc_raw": 0.30952380952380953, "acc_per_token": 0.2857142857142857, "acc_per_char": 0.29365079365079366, "correct_loss_raw": 23.540814636245607, "incorrect_loss_raw": 24.555815297459816, "correct_loss_per_token": 2.5186289372363673, "incorrect_loss_per_token": 2.5082171115158514, "correct_loss_per_char": 1.1576052453427086, "incorrect_loss_per_char": 1.1826633041442995, "acc_uncond": 0.2619047619047619, "correct_loss_uncond": -25.623597778971234, "incorrect_loss_uncond": -25.936248189557798, "primary_score": 0.29365079365079366}, "task_idx": 73} {"task_name": "mmlu_global_facts", "task_hash": "83faa1c084d9844ed22d2f870171a354", "model_hash": "03418cf8091a9882619950ffb07429a5", "model_config": {"model": "hf_checkpoints/dclm-pool-1b-1x-h-fasttext-open_lm_1b_swiglutorch-warm5000-lr0p003-wd0p033-cd3e-05-bs256-mult1-seed124-tokens28795904000", "revision": null, "trust_remote_code": null, "max_length": 2048, "model_path": null, "model_type": "hf"}, "task_config": {"task_name": "mmlu_global_facts", "task_core": "mmlu_global_facts", "limit": null, "split": "test", "num_shots": 5, "fewshot_seed": 1234, "primary_metric": "acc_per_char", "random_subsample_seed": 1234, "context_kwargs": null, "generation_kwargs": null, "metric_kwargs": {"uncond_docid_offset": 1000000}, "native_id_field": "index", "fewshot_source": null, "dataset_path": "cais/mmlu", "dataset_name": "global_facts", "use_chat_format": null, "version": 1, "revision": null, "metadata": {"regimes": ["OLMES-v0.1"], "alias": "mmlu_global_facts:rc::olmes"}}, "compute_config": {"batch_size": "16", "max_batch_size": 32, "output_dir": "hf_checkpoints/dclm-pool-1b-1x-h-fasttext-open_lm_1b_swiglutorch-warm5000-lr0p003-wd0p033-cd3e-05-bs256-mult1-seed124-tokens28795904000/olmes_fp32/mmlu", "num_recorded_inputs": 3, "save_raw_requests": true, "recompute_metrics": false, "check_datalake": false, "push_datalake": false, "no_datalake": false}, "processing_time": 5.964112281799316, "current_date": "2024-11-19 21:26:13 UTC", "num_instances": 100, "beaker_info": {}, "metrics": {"acc_raw": 0.29, "acc_per_token": 0.27, "acc_per_char": 0.29, "correct_loss_raw": 7.879513807296753, "incorrect_loss_raw": 8.528802576065067, "correct_loss_per_token": 2.7290880534026933, "incorrect_loss_per_token": 2.750701302564733, "correct_loss_per_char": 1.0806024398000555, "incorrect_loss_per_char": 1.1004074488770166, "acc_uncond": 0.28, "correct_loss_uncond": -7.5427256774902345, "incorrect_loss_uncond": -7.546341579755147, "primary_score": 0.29}, "task_idx": 74} {"task_name": "mmlu_high_school_biology", "task_hash": "40305e6449b4c634cf3858f0cb1a9ea0", "model_hash": "03418cf8091a9882619950ffb07429a5", "model_config": {"model": "hf_checkpoints/dclm-pool-1b-1x-h-fasttext-open_lm_1b_swiglutorch-warm5000-lr0p003-wd0p033-cd3e-05-bs256-mult1-seed124-tokens28795904000", "revision": null, "trust_remote_code": null, "max_length": 2048, "model_path": null, "model_type": "hf"}, "task_config": {"task_name": "mmlu_high_school_biology", "task_core": "mmlu_high_school_biology", "limit": null, "split": "test", "num_shots": 5, "fewshot_seed": 1234, "primary_metric": "acc_per_char", "random_subsample_seed": 1234, "context_kwargs": null, "generation_kwargs": null, "metric_kwargs": {"uncond_docid_offset": 1000000}, "native_id_field": "index", "fewshot_source": null, "dataset_path": "cais/mmlu", "dataset_name": "high_school_biology", "use_chat_format": null, "version": 1, "revision": null, "metadata": {"regimes": ["OLMES-v0.1"], "alias": "mmlu_high_school_biology:rc::olmes"}}, "compute_config": {"batch_size": "16", "max_batch_size": 32, "output_dir": "hf_checkpoints/dclm-pool-1b-1x-h-fasttext-open_lm_1b_swiglutorch-warm5000-lr0p003-wd0p033-cd3e-05-bs256-mult1-seed124-tokens28795904000/olmes_fp32/mmlu", "num_recorded_inputs": 3, "save_raw_requests": true, "recompute_metrics": false, "check_datalake": false, "push_datalake": false, "no_datalake": false}, "processing_time": 22.704880952835083, "current_date": "2024-11-19 21:26:19 UTC", "num_instances": 310, "beaker_info": {}, "metrics": {"acc_raw": 0.34516129032258064, "acc_per_token": 0.3870967741935484, "acc_per_char": 0.3903225806451613, "correct_loss_raw": 21.363152931582544, "incorrect_loss_raw": 21.78373695650408, "correct_loss_per_token": 2.5451589527874114, "incorrect_loss_per_token": 2.929591839395843, "correct_loss_per_char": 0.5453848861018671, "incorrect_loss_per_char": 0.5849483112641993, "acc_uncond": 0.3774193548387097, "correct_loss_uncond": -14.032325724632509, "incorrect_loss_uncond": -12.629724754312988, "primary_score": 0.3903225806451613}, "task_idx": 75} {"task_name": "mmlu_high_school_chemistry", "task_hash": "c148a2f0c73c4d2e8a363125f171f603", "model_hash": "03418cf8091a9882619950ffb07429a5", "model_config": {"model": "hf_checkpoints/dclm-pool-1b-1x-h-fasttext-open_lm_1b_swiglutorch-warm5000-lr0p003-wd0p033-cd3e-05-bs256-mult1-seed124-tokens28795904000", "revision": null, "trust_remote_code": null, "max_length": 2048, "model_path": null, "model_type": "hf"}, "task_config": {"task_name": "mmlu_high_school_chemistry", "task_core": "mmlu_high_school_chemistry", "limit": null, "split": "test", "num_shots": 5, "fewshot_seed": 1234, "primary_metric": "acc_per_char", "random_subsample_seed": 1234, "context_kwargs": null, "generation_kwargs": null, "metric_kwargs": {"uncond_docid_offset": 1000000}, "native_id_field": "index", "fewshot_source": null, "dataset_path": "cais/mmlu", "dataset_name": "high_school_chemistry", "use_chat_format": null, "version": 1, "revision": null, "metadata": {"regimes": ["OLMES-v0.1"], "alias": "mmlu_high_school_chemistry:rc::olmes"}}, "compute_config": {"batch_size": "16", "max_batch_size": 32, "output_dir": "hf_checkpoints/dclm-pool-1b-1x-h-fasttext-open_lm_1b_swiglutorch-warm5000-lr0p003-wd0p033-cd3e-05-bs256-mult1-seed124-tokens28795904000/olmes_fp32/mmlu", "num_recorded_inputs": 3, "save_raw_requests": true, "recompute_metrics": false, "check_datalake": false, "push_datalake": false, "no_datalake": false}, "processing_time": 16.593919038772583, "current_date": "2024-11-19 21:26:42 UTC", "num_instances": 203, "beaker_info": {}, "metrics": {"acc_raw": 0.19704433497536947, "acc_per_token": 0.2315270935960591, "acc_per_char": 0.21674876847290642, "correct_loss_raw": 20.89802763467939, "incorrect_loss_raw": 18.928701400561092, "correct_loss_per_token": 2.7035825977303256, "incorrect_loss_per_token": 2.617592187327384, "correct_loss_per_char": 0.9237622521546132, "incorrect_loss_per_char": 0.9029845173498553, "acc_uncond": 0.28078817733990147, "correct_loss_uncond": -13.191290159237209, "incorrect_loss_uncond": -13.121985284565705, "primary_score": 0.21674876847290642}, "task_idx": 76} {"task_name": "mmlu_high_school_computer_science", "task_hash": "7f237d33901391c40fe99221b7fc7df2", "model_hash": "03418cf8091a9882619950ffb07429a5", "model_config": {"model": "hf_checkpoints/dclm-pool-1b-1x-h-fasttext-open_lm_1b_swiglutorch-warm5000-lr0p003-wd0p033-cd3e-05-bs256-mult1-seed124-tokens28795904000", "revision": null, "trust_remote_code": null, "max_length": 2048, "model_path": null, "model_type": "hf"}, "task_config": {"task_name": "mmlu_high_school_computer_science", "task_core": "mmlu_high_school_computer_science", "limit": null, "split": "test", "num_shots": 5, "fewshot_seed": 1234, "primary_metric": "acc_per_char", "random_subsample_seed": 1234, "context_kwargs": null, "generation_kwargs": null, "metric_kwargs": {"uncond_docid_offset": 1000000}, "native_id_field": "index", "fewshot_source": null, "dataset_path": "cais/mmlu", "dataset_name": "high_school_computer_science", "use_chat_format": null, "version": 1, "revision": null, "metadata": {"regimes": ["OLMES-v0.1"], "alias": "mmlu_high_school_computer_science:rc::olmes"}}, "compute_config": {"batch_size": "16", "max_batch_size": 32, "output_dir": "hf_checkpoints/dclm-pool-1b-1x-h-fasttext-open_lm_1b_swiglutorch-warm5000-lr0p003-wd0p033-cd3e-05-bs256-mult1-seed124-tokens28795904000/olmes_fp32/mmlu", "num_recorded_inputs": 3, "save_raw_requests": true, "recompute_metrics": false, "check_datalake": false, "push_datalake": false, "no_datalake": false}, "processing_time": 13.148508071899414, "current_date": "2024-11-19 21:26:58 UTC", "num_instances": 100, "beaker_info": {}, "metrics": {"acc_raw": 0.24, "acc_per_token": 0.25, "acc_per_char": 0.3, "correct_loss_raw": 23.00562346100807, "incorrect_loss_raw": 23.226041058301917, "correct_loss_per_token": 2.5115057369712956, "incorrect_loss_per_token": 2.739964992086951, "correct_loss_per_char": 0.7942636222370791, "incorrect_loss_per_char": 0.8636869777631533, "acc_uncond": 0.32, "correct_loss_uncond": -15.524907387495041, "incorrect_loss_uncond": -15.119951387643816, "primary_score": 0.3}, "task_idx": 77} {"task_name": "mmlu_high_school_european_history", "task_hash": "bce04ae918d4f75bd0e71aeb5508ea76", "model_hash": "03418cf8091a9882619950ffb07429a5", "model_config": {"model": "hf_checkpoints/dclm-pool-1b-1x-h-fasttext-open_lm_1b_swiglutorch-warm5000-lr0p003-wd0p033-cd3e-05-bs256-mult1-seed124-tokens28795904000", "revision": null, "trust_remote_code": null, "max_length": 2048, "model_path": null, "model_type": "hf"}, "task_config": {"task_name": "mmlu_high_school_european_history", "task_core": "mmlu_high_school_european_history", "limit": null, "split": "test", "num_shots": 5, "fewshot_seed": 1234, "primary_metric": "acc_per_char", "random_subsample_seed": 1234, "context_kwargs": null, "generation_kwargs": null, "metric_kwargs": {"uncond_docid_offset": 1000000}, "native_id_field": "index", "fewshot_source": null, "dataset_path": "cais/mmlu", "dataset_name": "high_school_european_history", "use_chat_format": null, "version": 1, "revision": null, "metadata": {"regimes": ["OLMES-v0.1"], "alias": "mmlu_high_school_european_history:rc::olmes"}}, "compute_config": {"batch_size": "16", "max_batch_size": 32, "output_dir": "hf_checkpoints/dclm-pool-1b-1x-h-fasttext-open_lm_1b_swiglutorch-warm5000-lr0p003-wd0p033-cd3e-05-bs256-mult1-seed124-tokens28795904000/olmes_fp32/mmlu", "num_recorded_inputs": 3, "save_raw_requests": true, "recompute_metrics": false, "check_datalake": false, "push_datalake": false, "no_datalake": false}, "processing_time": 91.33902597427368, "current_date": "2024-11-19 21:27:12 UTC", "num_instances": 165, "beaker_info": {}, "metrics": {"acc_raw": 0.3515151515151515, "acc_per_token": 0.45454545454545453, "acc_per_char": 0.4484848484848485, "correct_loss_raw": 27.50270097797567, "incorrect_loss_raw": 27.05405940070296, "correct_loss_per_token": 2.4626567790590443, "incorrect_loss_per_token": 3.073838508885275, "correct_loss_per_char": 0.44729345815871113, "incorrect_loss_per_char": 0.552629013230489, "acc_uncond": 0.4, "correct_loss_uncond": -15.371065504984422, "incorrect_loss_uncond": -13.31542071308752, "primary_score": 0.4484848484848485}, "task_idx": 78} {"task_name": "mmlu_high_school_geography", "task_hash": "2451a97e8ea5ba8e49d0f60db615137b", "model_hash": "03418cf8091a9882619950ffb07429a5", "model_config": {"model": "hf_checkpoints/dclm-pool-1b-1x-h-fasttext-open_lm_1b_swiglutorch-warm5000-lr0p003-wd0p033-cd3e-05-bs256-mult1-seed124-tokens28795904000", "revision": null, "trust_remote_code": null, "max_length": 2048, "model_path": null, "model_type": "hf"}, "task_config": {"task_name": "mmlu_high_school_geography", "task_core": "mmlu_high_school_geography", "limit": null, "split": "test", "num_shots": 5, "fewshot_seed": 1234, "primary_metric": "acc_per_char", "random_subsample_seed": 1234, "context_kwargs": null, "generation_kwargs": null, "metric_kwargs": {"uncond_docid_offset": 1000000}, "native_id_field": "index", "fewshot_source": null, "dataset_path": "cais/mmlu", "dataset_name": "high_school_geography", "use_chat_format": null, "version": 1, "revision": null, "metadata": {"regimes": ["OLMES-v0.1"], "alias": "mmlu_high_school_geography:rc::olmes"}}, "compute_config": {"batch_size": "16", "max_batch_size": 32, "output_dir": "hf_checkpoints/dclm-pool-1b-1x-h-fasttext-open_lm_1b_swiglutorch-warm5000-lr0p003-wd0p033-cd3e-05-bs256-mult1-seed124-tokens28795904000/olmes_fp32/mmlu", "num_recorded_inputs": 3, "save_raw_requests": true, "recompute_metrics": false, "check_datalake": false, "push_datalake": false, "no_datalake": false}, "processing_time": 10.76505422592163, "current_date": "2024-11-19 21:28:43 UTC", "num_instances": 198, "beaker_info": {}, "metrics": {"acc_raw": 0.3838383838383838, "acc_per_token": 0.4292929292929293, "acc_per_char": 0.42424242424242425, "correct_loss_raw": 14.490383876694573, "incorrect_loss_raw": 14.81266781717839, "correct_loss_per_token": 3.1006659141581205, "incorrect_loss_per_token": 3.6240699168801833, "correct_loss_per_char": 0.5874396804149111, "incorrect_loss_per_char": 0.7109279375675324, "acc_uncond": 0.41414141414141414, "correct_loss_uncond": -10.85394283135732, "incorrect_loss_uncond": -9.224186990999613, "primary_score": 0.42424242424242425}, "task_idx": 79} {"task_name": "mmlu_high_school_government_and_politics", "task_hash": "432e3dd431e2137bb51952baabfe8d40", "model_hash": "03418cf8091a9882619950ffb07429a5", "model_config": {"model": "hf_checkpoints/dclm-pool-1b-1x-h-fasttext-open_lm_1b_swiglutorch-warm5000-lr0p003-wd0p033-cd3e-05-bs256-mult1-seed124-tokens28795904000", "revision": null, "trust_remote_code": null, "max_length": 2048, "model_path": null, "model_type": "hf"}, "task_config": {"task_name": "mmlu_high_school_government_and_politics", "task_core": "mmlu_high_school_government_and_politics", "limit": null, "split": "test", "num_shots": 5, "fewshot_seed": 1234, "primary_metric": "acc_per_char", "random_subsample_seed": 1234, "context_kwargs": null, "generation_kwargs": null, "metric_kwargs": {"uncond_docid_offset": 1000000}, "native_id_field": "index", "fewshot_source": null, "dataset_path": "cais/mmlu", "dataset_name": "high_school_government_and_politics", "use_chat_format": null, "version": 1, "revision": null, "metadata": {"regimes": ["OLMES-v0.1"], "alias": "mmlu_high_school_government_and_politics:rc::olmes"}}, "compute_config": {"batch_size": "16", "max_batch_size": 32, "output_dir": "hf_checkpoints/dclm-pool-1b-1x-h-fasttext-open_lm_1b_swiglutorch-warm5000-lr0p003-wd0p033-cd3e-05-bs256-mult1-seed124-tokens28795904000/olmes_fp32/mmlu", "num_recorded_inputs": 3, "save_raw_requests": true, "recompute_metrics": false, "check_datalake": false, "push_datalake": false, "no_datalake": false}, "processing_time": 12.682172775268555, "current_date": "2024-11-19 21:28:54 UTC", "num_instances": 193, "beaker_info": {}, "metrics": {"acc_raw": 0.39896373056994816, "acc_per_token": 0.47150259067357514, "acc_per_char": 0.45077720207253885, "correct_loss_raw": 21.419602970655408, "incorrect_loss_raw": 23.24519727143599, "correct_loss_per_token": 2.236870015309941, "incorrect_loss_per_token": 2.7710672404875227, "correct_loss_per_char": 0.3740266779895104, "incorrect_loss_per_char": 0.46483421379674705, "acc_uncond": 0.45595854922279794, "correct_loss_uncond": -16.38661623900573, "incorrect_loss_uncond": -13.596990226258063, "primary_score": 0.45077720207253885}, "task_idx": 80} {"task_name": "mmlu_high_school_macroeconomics", "task_hash": "fa28d7d574940324e3f18cc755314008", "model_hash": "03418cf8091a9882619950ffb07429a5", "model_config": {"model": "hf_checkpoints/dclm-pool-1b-1x-h-fasttext-open_lm_1b_swiglutorch-warm5000-lr0p003-wd0p033-cd3e-05-bs256-mult1-seed124-tokens28795904000", "revision": null, "trust_remote_code": null, "max_length": 2048, "model_path": null, "model_type": "hf"}, "task_config": {"task_name": "mmlu_high_school_macroeconomics", "task_core": "mmlu_high_school_macroeconomics", "limit": null, "split": "test", "num_shots": 5, "fewshot_seed": 1234, "primary_metric": "acc_per_char", "random_subsample_seed": 1234, "context_kwargs": null, "generation_kwargs": null, "metric_kwargs": {"uncond_docid_offset": 1000000}, "native_id_field": "index", "fewshot_source": null, "dataset_path": "cais/mmlu", "dataset_name": "high_school_macroeconomics", "use_chat_format": null, "version": 1, "revision": null, "metadata": {"regimes": ["OLMES-v0.1"], "alias": "mmlu_high_school_macroeconomics:rc::olmes"}}, "compute_config": {"batch_size": "16", "max_batch_size": 32, "output_dir": "hf_checkpoints/dclm-pool-1b-1x-h-fasttext-open_lm_1b_swiglutorch-warm5000-lr0p003-wd0p033-cd3e-05-bs256-mult1-seed124-tokens28795904000/olmes_fp32/mmlu", "num_recorded_inputs": 3, "save_raw_requests": true, "recompute_metrics": false, "check_datalake": false, "push_datalake": false, "no_datalake": false}, "processing_time": 20.654499769210815, "current_date": "2024-11-19 21:29:06 UTC", "num_instances": 390, "beaker_info": {}, "metrics": {"acc_raw": 0.29743589743589743, "acc_per_token": 0.33589743589743587, "acc_per_char": 0.3153846153846154, "correct_loss_raw": 21.07840545391425, "incorrect_loss_raw": 21.375357272074773, "correct_loss_per_token": 2.608882898964855, "incorrect_loss_per_token": 2.824826016663295, "correct_loss_per_char": 0.562448383317162, "incorrect_loss_per_char": 0.5901391587490764, "acc_uncond": 0.3384615384615385, "correct_loss_uncond": -15.114438359859662, "incorrect_loss_uncond": -14.110496433372163, "primary_score": 0.3153846153846154}, "task_idx": 81} {"task_name": "mmlu_high_school_mathematics", "task_hash": "d35dafac7b92c7adc6cb83bfcf827620", "model_hash": "03418cf8091a9882619950ffb07429a5", "model_config": {"model": "hf_checkpoints/dclm-pool-1b-1x-h-fasttext-open_lm_1b_swiglutorch-warm5000-lr0p003-wd0p033-cd3e-05-bs256-mult1-seed124-tokens28795904000", "revision": null, "trust_remote_code": null, "max_length": 2048, "model_path": null, "model_type": "hf"}, "task_config": {"task_name": "mmlu_high_school_mathematics", "task_core": "mmlu_high_school_mathematics", "limit": null, "split": "test", "num_shots": 5, "fewshot_seed": 1234, "primary_metric": "acc_per_char", "random_subsample_seed": 1234, "context_kwargs": null, "generation_kwargs": null, "metric_kwargs": {"uncond_docid_offset": 1000000}, "native_id_field": "index", "fewshot_source": null, "dataset_path": "cais/mmlu", "dataset_name": "high_school_mathematics", "use_chat_format": null, "version": 1, "revision": null, "metadata": {"regimes": ["OLMES-v0.1"], "alias": "mmlu_high_school_mathematics:rc::olmes"}}, "compute_config": {"batch_size": "16", "max_batch_size": 32, "output_dir": "hf_checkpoints/dclm-pool-1b-1x-h-fasttext-open_lm_1b_swiglutorch-warm5000-lr0p003-wd0p033-cd3e-05-bs256-mult1-seed124-tokens28795904000/olmes_fp32/mmlu", "num_recorded_inputs": 3, "save_raw_requests": true, "recompute_metrics": false, "check_datalake": false, "push_datalake": false, "no_datalake": false}, "processing_time": 16.023837089538574, "current_date": "2024-11-19 21:29:27 UTC", "num_instances": 270, "beaker_info": {}, "metrics": {"acc_raw": 0.15925925925925927, "acc_per_token": 0.17777777777777778, "acc_per_char": 0.16296296296296298, "correct_loss_raw": 9.28662220283791, "incorrect_loss_raw": 8.163666827148862, "correct_loss_per_token": 4.178258001474584, "incorrect_loss_per_token": 3.917127378658242, "correct_loss_per_char": 1.7174042732985988, "incorrect_loss_per_char": 1.616340709285345, "acc_uncond": 0.2518518518518518, "correct_loss_uncond": -5.568703559592918, "incorrect_loss_uncond": -5.421064988919243, "primary_score": 0.16296296296296298}, "task_idx": 82} {"task_name": "mmlu_high_school_microeconomics", "task_hash": "9b84847fb5a13e1e48dfd2e71e7dfdc5", "model_hash": "03418cf8091a9882619950ffb07429a5", "model_config": {"model": "hf_checkpoints/dclm-pool-1b-1x-h-fasttext-open_lm_1b_swiglutorch-warm5000-lr0p003-wd0p033-cd3e-05-bs256-mult1-seed124-tokens28795904000", "revision": null, "trust_remote_code": null, "max_length": 2048, "model_path": null, "model_type": "hf"}, "task_config": {"task_name": "mmlu_high_school_microeconomics", "task_core": "mmlu_high_school_microeconomics", "limit": null, "split": "test", "num_shots": 5, "fewshot_seed": 1234, "primary_metric": "acc_per_char", "random_subsample_seed": 1234, "context_kwargs": null, "generation_kwargs": null, "metric_kwargs": {"uncond_docid_offset": 1000000}, "native_id_field": "index", "fewshot_source": null, "dataset_path": "cais/mmlu", "dataset_name": "high_school_microeconomics", "use_chat_format": null, "version": 1, "revision": null, "metadata": {"regimes": ["OLMES-v0.1"], "alias": "mmlu_high_school_microeconomics:rc::olmes"}}, "compute_config": {"batch_size": "16", "max_batch_size": 32, "output_dir": "hf_checkpoints/dclm-pool-1b-1x-h-fasttext-open_lm_1b_swiglutorch-warm5000-lr0p003-wd0p033-cd3e-05-bs256-mult1-seed124-tokens28795904000/olmes_fp32/mmlu", "num_recorded_inputs": 3, "save_raw_requests": true, "recompute_metrics": false, "check_datalake": false, "push_datalake": false, "no_datalake": false}, "processing_time": 13.288214445114136, "current_date": "2024-11-19 21:29:44 UTC", "num_instances": 238, "beaker_info": {}, "metrics": {"acc_raw": 0.31092436974789917, "acc_per_token": 0.35294117647058826, "acc_per_char": 0.36134453781512604, "correct_loss_raw": 24.63945012733716, "incorrect_loss_raw": 24.164019934436524, "correct_loss_per_token": 2.602873918455077, "incorrect_loss_per_token": 2.8529698773609264, "correct_loss_per_char": 0.5514129796237556, "incorrect_loss_per_char": 0.5945063509843619, "acc_uncond": 0.29411764705882354, "correct_loss_uncond": -15.441406973269807, "incorrect_loss_uncond": -14.559387908429335, "primary_score": 0.36134453781512604}, "task_idx": 83} {"task_name": "mmlu_high_school_physics", "task_hash": "2438f80fa949fdfba5fd0982a3e13ce8", "model_hash": "03418cf8091a9882619950ffb07429a5", "model_config": {"model": "hf_checkpoints/dclm-pool-1b-1x-h-fasttext-open_lm_1b_swiglutorch-warm5000-lr0p003-wd0p033-cd3e-05-bs256-mult1-seed124-tokens28795904000", "revision": null, "trust_remote_code": null, "max_length": 2048, "model_path": null, "model_type": "hf"}, "task_config": {"task_name": "mmlu_high_school_physics", "task_core": "mmlu_high_school_physics", "limit": null, "split": "test", "num_shots": 5, "fewshot_seed": 1234, "primary_metric": "acc_per_char", "random_subsample_seed": 1234, "context_kwargs": null, "generation_kwargs": null, "metric_kwargs": {"uncond_docid_offset": 1000000}, "native_id_field": "index", "fewshot_source": null, "dataset_path": "cais/mmlu", "dataset_name": "high_school_physics", "use_chat_format": null, "version": 1, "revision": null, "metadata": {"regimes": ["OLMES-v0.1"], "alias": "mmlu_high_school_physics:rc::olmes"}}, "compute_config": {"batch_size": "16", "max_batch_size": 32, "output_dir": "hf_checkpoints/dclm-pool-1b-1x-h-fasttext-open_lm_1b_swiglutorch-warm5000-lr0p003-wd0p033-cd3e-05-bs256-mult1-seed124-tokens28795904000/olmes_fp32/mmlu", "num_recorded_inputs": 3, "save_raw_requests": true, "recompute_metrics": false, "check_datalake": false, "push_datalake": false, "no_datalake": false}, "processing_time": 14.445171356201172, "current_date": "2024-11-19 21:29:56 UTC", "num_instances": 151, "beaker_info": {}, "metrics": {"acc_raw": 0.25165562913907286, "acc_per_token": 0.23178807947019867, "acc_per_char": 0.24503311258278146, "correct_loss_raw": 21.13985337484751, "incorrect_loss_raw": 20.35369520997897, "correct_loss_per_token": 2.4328990738547174, "incorrect_loss_per_token": 2.3560301577896197, "correct_loss_per_char": 0.8585223176217057, "incorrect_loss_per_char": 0.8402890421816588, "acc_uncond": 0.2781456953642384, "correct_loss_uncond": -15.777207565623403, "incorrect_loss_uncond": -16.102421425299404, "primary_score": 0.24503311258278146}, "task_idx": 84} {"task_name": "mmlu_high_school_psychology", "task_hash": "e5c6b909fb842973d0ba75f8fad285a1", "model_hash": "03418cf8091a9882619950ffb07429a5", "model_config": {"model": "hf_checkpoints/dclm-pool-1b-1x-h-fasttext-open_lm_1b_swiglutorch-warm5000-lr0p003-wd0p033-cd3e-05-bs256-mult1-seed124-tokens28795904000", "revision": null, "trust_remote_code": null, "max_length": 2048, "model_path": null, "model_type": "hf"}, "task_config": {"task_name": "mmlu_high_school_psychology", "task_core": "mmlu_high_school_psychology", "limit": null, "split": "test", "num_shots": 5, "fewshot_seed": 1234, "primary_metric": "acc_per_char", "random_subsample_seed": 1234, "context_kwargs": null, "generation_kwargs": null, "metric_kwargs": {"uncond_docid_offset": 1000000}, "native_id_field": "index", "fewshot_source": null, "dataset_path": "cais/mmlu", "dataset_name": "high_school_psychology", "use_chat_format": null, "version": 1, "revision": null, "metadata": {"regimes": ["OLMES-v0.1"], "alias": "mmlu_high_school_psychology:rc::olmes"}}, "compute_config": {"batch_size": "16", "max_batch_size": 32, "output_dir": "hf_checkpoints/dclm-pool-1b-1x-h-fasttext-open_lm_1b_swiglutorch-warm5000-lr0p003-wd0p033-cd3e-05-bs256-mult1-seed124-tokens28795904000/olmes_fp32/mmlu", "num_recorded_inputs": 3, "save_raw_requests": true, "recompute_metrics": false, "check_datalake": false, "push_datalake": false, "no_datalake": false}, "processing_time": 40.20431876182556, "current_date": "2024-11-19 21:30:11 UTC", "num_instances": 545, "beaker_info": {}, "metrics": {"acc_raw": 0.48807339449541287, "acc_per_token": 0.47706422018348627, "acc_per_char": 0.47522935779816516, "correct_loss_raw": 14.156356584588321, "incorrect_loss_raw": 16.74465744896401, "correct_loss_per_token": 2.888265642439932, "incorrect_loss_per_token": 3.7921417257225736, "correct_loss_per_char": 0.48106282032416087, "incorrect_loss_per_char": 0.6332678164009538, "acc_uncond": 0.47155963302752296, "correct_loss_uncond": -13.520115094994186, "incorrect_loss_uncond": -11.29164081728057, "primary_score": 0.47522935779816516}, "task_idx": 85} {"task_name": "mmlu_high_school_statistics", "task_hash": "c5e879c445098b25ee27496e3b91777c", "model_hash": "03418cf8091a9882619950ffb07429a5", "model_config": {"model": "hf_checkpoints/dclm-pool-1b-1x-h-fasttext-open_lm_1b_swiglutorch-warm5000-lr0p003-wd0p033-cd3e-05-bs256-mult1-seed124-tokens28795904000", "revision": null, "trust_remote_code": null, "max_length": 2048, "model_path": null, "model_type": "hf"}, "task_config": {"task_name": "mmlu_high_school_statistics", "task_core": "mmlu_high_school_statistics", "limit": null, "split": "test", "num_shots": 5, "fewshot_seed": 1234, "primary_metric": "acc_per_char", "random_subsample_seed": 1234, "context_kwargs": null, "generation_kwargs": null, "metric_kwargs": {"uncond_docid_offset": 1000000}, "native_id_field": "index", "fewshot_source": null, "dataset_path": "cais/mmlu", "dataset_name": "high_school_statistics", "use_chat_format": null, "version": 1, "revision": null, "metadata": {"regimes": ["OLMES-v0.1"], "alias": "mmlu_high_school_statistics:rc::olmes"}}, "compute_config": {"batch_size": "16", "max_batch_size": 32, "output_dir": "hf_checkpoints/dclm-pool-1b-1x-h-fasttext-open_lm_1b_swiglutorch-warm5000-lr0p003-wd0p033-cd3e-05-bs256-mult1-seed124-tokens28795904000/olmes_fp32/mmlu", "num_recorded_inputs": 3, "save_raw_requests": true, "recompute_metrics": false, "check_datalake": false, "push_datalake": false, "no_datalake": false}, "processing_time": 26.864049673080444, "current_date": "2024-11-19 21:30:51 UTC", "num_instances": 216, "beaker_info": {}, "metrics": {"acc_raw": 0.27314814814814814, "acc_per_token": 0.2916666666666667, "acc_per_char": 0.2777777777777778, "correct_loss_raw": 26.131407388382488, "incorrect_loss_raw": 26.658040807938868, "correct_loss_per_token": 2.5982270382700445, "incorrect_loss_per_token": 2.6451773859738483, "correct_loss_per_char": 0.786524301711884, "incorrect_loss_per_char": 0.8176619046460267, "acc_uncond": 0.3055555555555556, "correct_loss_uncond": -17.335551079224658, "incorrect_loss_uncond": -16.501133336329172, "primary_score": 0.2777777777777778}, "task_idx": 86} {"task_name": "mmlu_high_school_us_history", "task_hash": "07edfc83a12773340cdb716671b46541", "model_hash": "03418cf8091a9882619950ffb07429a5", "model_config": {"model": "hf_checkpoints/dclm-pool-1b-1x-h-fasttext-open_lm_1b_swiglutorch-warm5000-lr0p003-wd0p033-cd3e-05-bs256-mult1-seed124-tokens28795904000", "revision": null, "trust_remote_code": null, "max_length": 2048, "model_path": null, "model_type": "hf"}, "task_config": {"task_name": "mmlu_high_school_us_history", "task_core": "mmlu_high_school_us_history", "limit": null, "split": "test", "num_shots": 5, "fewshot_seed": 1234, "primary_metric": "acc_per_char", "random_subsample_seed": 1234, "context_kwargs": null, "generation_kwargs": null, "metric_kwargs": {"uncond_docid_offset": 1000000}, "native_id_field": "index", "fewshot_source": null, "dataset_path": "cais/mmlu", "dataset_name": "high_school_us_history", "use_chat_format": null, "version": 1, "revision": null, "metadata": {"regimes": ["OLMES-v0.1"], "alias": "mmlu_high_school_us_history:rc::olmes"}}, "compute_config": {"batch_size": "16", "max_batch_size": 32, "output_dir": "hf_checkpoints/dclm-pool-1b-1x-h-fasttext-open_lm_1b_swiglutorch-warm5000-lr0p003-wd0p033-cd3e-05-bs256-mult1-seed124-tokens28795904000/olmes_fp32/mmlu", "num_recorded_inputs": 3, "save_raw_requests": true, "recompute_metrics": false, "check_datalake": false, "push_datalake": false, "no_datalake": false}, "processing_time": 111.23037767410278, "current_date": "2024-11-19 21:31:18 UTC", "num_instances": 204, "beaker_info": {}, "metrics": {"acc_raw": 0.3284313725490196, "acc_per_token": 0.3431372549019608, "acc_per_char": 0.37745098039215685, "correct_loss_raw": 25.274763485672427, "incorrect_loss_raw": 26.57934519395329, "correct_loss_per_token": 2.455718281654319, "incorrect_loss_per_token": 2.757877330223681, "correct_loss_per_char": 0.4598392848516849, "incorrect_loss_per_char": 0.5243689544489788, "acc_uncond": 0.4264705882352941, "correct_loss_uncond": -13.92219268603652, "incorrect_loss_uncond": -11.630784141861536, "primary_score": 0.37745098039215685}, "task_idx": 87} {"task_name": "mmlu_high_school_world_history", "task_hash": "38f161e2f228b6acfe7cb1aa36d0d3ef", "model_hash": "03418cf8091a9882619950ffb07429a5", "model_config": {"model": "hf_checkpoints/dclm-pool-1b-1x-h-fasttext-open_lm_1b_swiglutorch-warm5000-lr0p003-wd0p033-cd3e-05-bs256-mult1-seed124-tokens28795904000", "revision": null, "trust_remote_code": null, "max_length": 2048, "model_path": null, "model_type": "hf"}, "task_config": {"task_name": "mmlu_high_school_world_history", "task_core": "mmlu_high_school_world_history", "limit": null, "split": "test", "num_shots": 5, "fewshot_seed": 1234, "primary_metric": "acc_per_char", "random_subsample_seed": 1234, "context_kwargs": null, "generation_kwargs": null, "metric_kwargs": {"uncond_docid_offset": 1000000}, "native_id_field": "index", "fewshot_source": null, "dataset_path": "cais/mmlu", "dataset_name": "high_school_world_history", "use_chat_format": null, "version": 1, "revision": null, "metadata": {"regimes": ["OLMES-v0.1"], "alias": "mmlu_high_school_world_history:rc::olmes"}}, "compute_config": {"batch_size": "16", "max_batch_size": 32, "output_dir": "hf_checkpoints/dclm-pool-1b-1x-h-fasttext-open_lm_1b_swiglutorch-warm5000-lr0p003-wd0p033-cd3e-05-bs256-mult1-seed124-tokens28795904000/olmes_fp32/mmlu", "num_recorded_inputs": 3, "save_raw_requests": true, "recompute_metrics": false, "check_datalake": false, "push_datalake": false, "no_datalake": false}, "processing_time": 80.03247952461243, "current_date": "2024-11-19 21:33:09 UTC", "num_instances": 237, "beaker_info": {}, "metrics": {"acc_raw": 0.3206751054852321, "acc_per_token": 0.37130801687763715, "acc_per_char": 0.37130801687763715, "correct_loss_raw": 29.43805429251385, "incorrect_loss_raw": 29.626125245154633, "correct_loss_per_token": 2.7216699601477736, "incorrect_loss_per_token": 3.136590812911396, "correct_loss_per_char": 0.5012342985644117, "incorrect_loss_per_char": 0.5617093694188149, "acc_uncond": 0.3924050632911392, "correct_loss_uncond": -14.541466581670544, "incorrect_loss_uncond": -12.60329805916037, "primary_score": 0.37130801687763715}, "task_idx": 88} {"task_name": "mmlu_human_aging", "task_hash": "8c66e7db317c293ebcd7cd3ad67b5840", "model_hash": "03418cf8091a9882619950ffb07429a5", "model_config": {"model": "hf_checkpoints/dclm-pool-1b-1x-h-fasttext-open_lm_1b_swiglutorch-warm5000-lr0p003-wd0p033-cd3e-05-bs256-mult1-seed124-tokens28795904000", "revision": null, "trust_remote_code": null, "max_length": 2048, "model_path": null, "model_type": "hf"}, "task_config": {"task_name": "mmlu_human_aging", "task_core": "mmlu_human_aging", "limit": null, "split": "test", "num_shots": 5, "fewshot_seed": 1234, "primary_metric": "acc_per_char", "random_subsample_seed": 1234, "context_kwargs": null, "generation_kwargs": null, "metric_kwargs": {"uncond_docid_offset": 1000000}, "native_id_field": "index", "fewshot_source": null, "dataset_path": "cais/mmlu", "dataset_name": "human_aging", "use_chat_format": null, "version": 1, "revision": null, "metadata": {"regimes": ["OLMES-v0.1"], "alias": "mmlu_human_aging:rc::olmes"}}, "compute_config": {"batch_size": "16", "max_batch_size": 32, "output_dir": "hf_checkpoints/dclm-pool-1b-1x-h-fasttext-open_lm_1b_swiglutorch-warm5000-lr0p003-wd0p033-cd3e-05-bs256-mult1-seed124-tokens28795904000/olmes_fp32/mmlu", "num_recorded_inputs": 3, "save_raw_requests": true, "recompute_metrics": false, "check_datalake": false, "push_datalake": false, "no_datalake": false}, "processing_time": 11.173273801803589, "current_date": "2024-11-19 21:34:29 UTC", "num_instances": 223, "beaker_info": {}, "metrics": {"acc_raw": 0.39461883408071746, "acc_per_token": 0.39461883408071746, "acc_per_char": 0.37668161434977576, "correct_loss_raw": 12.677635809231232, "incorrect_loss_raw": 15.47025020391595, "correct_loss_per_token": 3.046415314731483, "incorrect_loss_per_token": 3.5851252066525, "correct_loss_per_char": 0.5667559513387225, "incorrect_loss_per_char": 0.7035341763206064, "acc_uncond": 0.4260089686098655, "correct_loss_uncond": -10.355107204261916, "incorrect_loss_uncond": -8.973779054589693, "primary_score": 0.37668161434977576}, "task_idx": 89} {"task_name": "mmlu_human_sexuality", "task_hash": "f3dcb40d784b716dae889d9bf3c62232", "model_hash": "03418cf8091a9882619950ffb07429a5", "model_config": {"model": "hf_checkpoints/dclm-pool-1b-1x-h-fasttext-open_lm_1b_swiglutorch-warm5000-lr0p003-wd0p033-cd3e-05-bs256-mult1-seed124-tokens28795904000", "revision": null, "trust_remote_code": null, "max_length": 2048, "model_path": null, "model_type": "hf"}, "task_config": {"task_name": "mmlu_human_sexuality", "task_core": "mmlu_human_sexuality", "limit": null, "split": "test", "num_shots": 5, "fewshot_seed": 1234, "primary_metric": "acc_per_char", "random_subsample_seed": 1234, "context_kwargs": null, "generation_kwargs": null, "metric_kwargs": {"uncond_docid_offset": 1000000}, "native_id_field": "index", "fewshot_source": null, "dataset_path": "cais/mmlu", "dataset_name": "human_sexuality", "use_chat_format": null, "version": 1, "revision": null, "metadata": {"regimes": ["OLMES-v0.1"], "alias": "mmlu_human_sexuality:rc::olmes"}}, "compute_config": {"batch_size": "16", "max_batch_size": 32, "output_dir": "hf_checkpoints/dclm-pool-1b-1x-h-fasttext-open_lm_1b_swiglutorch-warm5000-lr0p003-wd0p033-cd3e-05-bs256-mult1-seed124-tokens28795904000/olmes_fp32/mmlu", "num_recorded_inputs": 3, "save_raw_requests": true, "recompute_metrics": false, "check_datalake": false, "push_datalake": false, "no_datalake": false}, "processing_time": 6.652626037597656, "current_date": "2024-11-19 21:34:40 UTC", "num_instances": 131, "beaker_info": {}, "metrics": {"acc_raw": 0.4122137404580153, "acc_per_token": 0.4198473282442748, "acc_per_char": 0.3969465648854962, "correct_loss_raw": 14.669895948799512, "incorrect_loss_raw": 16.163360671991004, "correct_loss_per_token": 3.0384140234268244, "incorrect_loss_per_token": 3.6212319350738484, "correct_loss_per_char": 0.6311531693924813, "incorrect_loss_per_char": 0.6829634331608414, "acc_uncond": 0.3282442748091603, "correct_loss_uncond": -10.704758468020053, "incorrect_loss_uncond": -11.668367686920798, "primary_score": 0.3969465648854962}, "task_idx": 90} {"task_name": "mmlu_international_law", "task_hash": "b4d3ab839d093262fe791e56c98053df", "model_hash": "03418cf8091a9882619950ffb07429a5", "model_config": {"model": "hf_checkpoints/dclm-pool-1b-1x-h-fasttext-open_lm_1b_swiglutorch-warm5000-lr0p003-wd0p033-cd3e-05-bs256-mult1-seed124-tokens28795904000", "revision": null, "trust_remote_code": null, "max_length": 2048, "model_path": null, "model_type": "hf"}, "task_config": {"task_name": "mmlu_international_law", "task_core": "mmlu_international_law", "limit": null, "split": "test", "num_shots": 5, "fewshot_seed": 1234, "primary_metric": "acc_per_char", "random_subsample_seed": 1234, "context_kwargs": null, "generation_kwargs": null, "metric_kwargs": {"uncond_docid_offset": 1000000}, "native_id_field": "index", "fewshot_source": null, "dataset_path": "cais/mmlu", "dataset_name": "international_law", "use_chat_format": null, "version": 1, "revision": null, "metadata": {"regimes": ["OLMES-v0.1"], "alias": "mmlu_international_law:rc::olmes"}}, "compute_config": {"batch_size": "16", "max_batch_size": 32, "output_dir": "hf_checkpoints/dclm-pool-1b-1x-h-fasttext-open_lm_1b_swiglutorch-warm5000-lr0p003-wd0p033-cd3e-05-bs256-mult1-seed124-tokens28795904000/olmes_fp32/mmlu", "num_recorded_inputs": 3, "save_raw_requests": true, "recompute_metrics": false, "check_datalake": false, "push_datalake": false, "no_datalake": false}, "processing_time": 10.200043439865112, "current_date": "2024-11-19 21:34:47 UTC", "num_instances": 121, "beaker_info": {}, "metrics": {"acc_raw": 0.17355371900826447, "acc_per_token": 0.30578512396694213, "acc_per_char": 0.256198347107438, "correct_loss_raw": 48.5408869026121, "incorrect_loss_raw": 35.460322001420444, "correct_loss_per_token": 2.4756060714120736, "incorrect_loss_per_token": 2.6451336287890466, "correct_loss_per_char": 0.453038230927997, "incorrect_loss_per_char": 0.4743127227123939, "acc_uncond": 0.371900826446281, "correct_loss_uncond": -25.168442221712475, "incorrect_loss_uncond": -22.507250311946084, "primary_score": 0.256198347107438}, "task_idx": 91} {"task_name": "mmlu_jurisprudence", "task_hash": "a5a3583aea5dbd6ece8896b0140522f5", "model_hash": "03418cf8091a9882619950ffb07429a5", "model_config": {"model": "hf_checkpoints/dclm-pool-1b-1x-h-fasttext-open_lm_1b_swiglutorch-warm5000-lr0p003-wd0p033-cd3e-05-bs256-mult1-seed124-tokens28795904000", "revision": null, "trust_remote_code": null, "max_length": 2048, "model_path": null, "model_type": "hf"}, "task_config": {"task_name": "mmlu_jurisprudence", "task_core": "mmlu_jurisprudence", "limit": null, "split": "test", "num_shots": 5, "fewshot_seed": 1234, "primary_metric": "acc_per_char", "random_subsample_seed": 1234, "context_kwargs": null, "generation_kwargs": null, "metric_kwargs": {"uncond_docid_offset": 1000000}, "native_id_field": "index", "fewshot_source": null, "dataset_path": "cais/mmlu", "dataset_name": "jurisprudence", "use_chat_format": null, "version": 1, "revision": null, "metadata": {"regimes": ["OLMES-v0.1"], "alias": "mmlu_jurisprudence:rc::olmes"}}, "compute_config": {"batch_size": "16", "max_batch_size": 32, "output_dir": "hf_checkpoints/dclm-pool-1b-1x-h-fasttext-open_lm_1b_swiglutorch-warm5000-lr0p003-wd0p033-cd3e-05-bs256-mult1-seed124-tokens28795904000/olmes_fp32/mmlu", "num_recorded_inputs": 3, "save_raw_requests": true, "recompute_metrics": false, "check_datalake": false, "push_datalake": false, "no_datalake": false}, "processing_time": 6.7869322299957275, "current_date": "2024-11-19 21:34:57 UTC", "num_instances": 108, "beaker_info": {}, "metrics": {"acc_raw": 0.23148148148148148, "acc_per_token": 0.26851851851851855, "acc_per_char": 0.3148148148148148, "correct_loss_raw": 26.89270097238046, "incorrect_loss_raw": 22.6379110702762, "correct_loss_per_token": 3.0341406842080265, "incorrect_loss_per_token": 3.5132505140292585, "correct_loss_per_char": 0.5889211461775832, "incorrect_loss_per_char": 0.6462895365021206, "acc_uncond": 0.3888888888888889, "correct_loss_uncond": -14.154463432453296, "incorrect_loss_uncond": -12.54157188718702, "primary_score": 0.3148148148148148}, "task_idx": 92} {"task_name": "mmlu_logical_fallacies", "task_hash": "87754a93f67c5e3682212e20e26d138f", "model_hash": "03418cf8091a9882619950ffb07429a5", "model_config": {"model": "hf_checkpoints/dclm-pool-1b-1x-h-fasttext-open_lm_1b_swiglutorch-warm5000-lr0p003-wd0p033-cd3e-05-bs256-mult1-seed124-tokens28795904000", "revision": null, "trust_remote_code": null, "max_length": 2048, "model_path": null, "model_type": "hf"}, "task_config": {"task_name": "mmlu_logical_fallacies", "task_core": "mmlu_logical_fallacies", "limit": null, "split": "test", "num_shots": 5, "fewshot_seed": 1234, "primary_metric": "acc_per_char", "random_subsample_seed": 1234, "context_kwargs": null, "generation_kwargs": null, "metric_kwargs": {"uncond_docid_offset": 1000000}, "native_id_field": "index", "fewshot_source": null, "dataset_path": "cais/mmlu", "dataset_name": "logical_fallacies", "use_chat_format": null, "version": 1, "revision": null, "metadata": {"regimes": ["OLMES-v0.1"], "alias": "mmlu_logical_fallacies:rc::olmes"}}, "compute_config": {"batch_size": "16", "max_batch_size": 32, "output_dir": "hf_checkpoints/dclm-pool-1b-1x-h-fasttext-open_lm_1b_swiglutorch-warm5000-lr0p003-wd0p033-cd3e-05-bs256-mult1-seed124-tokens28795904000/olmes_fp32/mmlu", "num_recorded_inputs": 3, "save_raw_requests": true, "recompute_metrics": false, "check_datalake": false, "push_datalake": false, "no_datalake": false}, "processing_time": 10.951525688171387, "current_date": "2024-11-19 21:35:04 UTC", "num_instances": 163, "beaker_info": {}, "metrics": {"acc_raw": 0.31901840490797545, "acc_per_token": 0.3312883435582822, "acc_per_char": 0.3067484662576687, "correct_loss_raw": 23.010878557076484, "incorrect_loss_raw": 22.950006944757778, "correct_loss_per_token": 3.1891809132318185, "incorrect_loss_per_token": 3.5550170354697945, "correct_loss_per_char": 0.5582601865429742, "incorrect_loss_per_char": 0.6406159326677359, "acc_uncond": 0.39263803680981596, "correct_loss_uncond": -13.189598791438378, "incorrect_loss_uncond": -11.083331307506748, "primary_score": 0.3067484662576687}, "task_idx": 93} {"task_name": "mmlu_machine_learning", "task_hash": "c7a50715045d63764fe2fc8c95f84e4e", "model_hash": "03418cf8091a9882619950ffb07429a5", "model_config": {"model": "hf_checkpoints/dclm-pool-1b-1x-h-fasttext-open_lm_1b_swiglutorch-warm5000-lr0p003-wd0p033-cd3e-05-bs256-mult1-seed124-tokens28795904000", "revision": null, "trust_remote_code": null, "max_length": 2048, "model_path": null, "model_type": "hf"}, "task_config": {"task_name": "mmlu_machine_learning", "task_core": "mmlu_machine_learning", "limit": null, "split": "test", "num_shots": 5, "fewshot_seed": 1234, "primary_metric": "acc_per_char", "random_subsample_seed": 1234, "context_kwargs": null, "generation_kwargs": null, "metric_kwargs": {"uncond_docid_offset": 1000000}, "native_id_field": "index", "fewshot_source": null, "dataset_path": "cais/mmlu", "dataset_name": "machine_learning", "use_chat_format": null, "version": 1, "revision": null, "metadata": {"regimes": ["OLMES-v0.1"], "alias": "mmlu_machine_learning:rc::olmes"}}, "compute_config": {"batch_size": "16", "max_batch_size": 32, "output_dir": "hf_checkpoints/dclm-pool-1b-1x-h-fasttext-open_lm_1b_swiglutorch-warm5000-lr0p003-wd0p033-cd3e-05-bs256-mult1-seed124-tokens28795904000/olmes_fp32/mmlu", "num_recorded_inputs": 3, "save_raw_requests": true, "recompute_metrics": false, "check_datalake": false, "push_datalake": false, "no_datalake": false}, "processing_time": 9.213104248046875, "current_date": "2024-11-19 21:35:15 UTC", "num_instances": 112, "beaker_info": {}, "metrics": {"acc_raw": 0.25, "acc_per_token": 0.20535714285714285, "acc_per_char": 0.2767857142857143, "correct_loss_raw": 18.307162577552454, "incorrect_loss_raw": 18.575598238834324, "correct_loss_per_token": 3.585282597547379, "incorrect_loss_per_token": 3.5459706425587236, "correct_loss_per_char": 0.9230253304795293, "incorrect_loss_per_char": 0.9122802462172885, "acc_uncond": 0.24107142857142858, "correct_loss_uncond": -8.379302962550096, "incorrect_loss_uncond": -7.790933533438611, "primary_score": 0.2767857142857143}, "task_idx": 94} {"task_name": "mmlu_management", "task_hash": "bb2a328db2333c8df600dba174c2c4f7", "model_hash": "03418cf8091a9882619950ffb07429a5", "model_config": {"model": "hf_checkpoints/dclm-pool-1b-1x-h-fasttext-open_lm_1b_swiglutorch-warm5000-lr0p003-wd0p033-cd3e-05-bs256-mult1-seed124-tokens28795904000", "revision": null, "trust_remote_code": null, "max_length": 2048, "model_path": null, "model_type": "hf"}, "task_config": {"task_name": "mmlu_management", "task_core": "mmlu_management", "limit": null, "split": "test", "num_shots": 5, "fewshot_seed": 1234, "primary_metric": "acc_per_char", "random_subsample_seed": 1234, "context_kwargs": null, "generation_kwargs": null, "metric_kwargs": {"uncond_docid_offset": 1000000}, "native_id_field": "index", "fewshot_source": null, "dataset_path": "cais/mmlu", "dataset_name": "management", "use_chat_format": null, "version": 1, "revision": null, "metadata": {"regimes": ["OLMES-v0.1"], "alias": "mmlu_management:rc::olmes"}}, "compute_config": {"batch_size": "16", "max_batch_size": 32, "output_dir": "hf_checkpoints/dclm-pool-1b-1x-h-fasttext-open_lm_1b_swiglutorch-warm5000-lr0p003-wd0p033-cd3e-05-bs256-mult1-seed124-tokens28795904000/olmes_fp32/mmlu", "num_recorded_inputs": 3, "save_raw_requests": true, "recompute_metrics": false, "check_datalake": false, "push_datalake": false, "no_datalake": false}, "processing_time": 4.426297426223755, "current_date": "2024-11-19 21:35:24 UTC", "num_instances": 103, "beaker_info": {}, "metrics": {"acc_raw": 0.36893203883495146, "acc_per_token": 0.4368932038834951, "acc_per_char": 0.5145631067961165, "correct_loss_raw": 13.354300859700707, "incorrect_loss_raw": 14.035566178726148, "correct_loss_per_token": 3.448802221833666, "incorrect_loss_per_token": 3.9716159753555917, "correct_loss_per_char": 0.5887009092409919, "incorrect_loss_per_char": 0.6663220082502314, "acc_uncond": 0.4174757281553398, "correct_loss_uncond": -9.784205390773351, "incorrect_loss_uncond": -8.292069842514481, "primary_score": 0.5145631067961165}, "task_idx": 95} {"task_name": "mmlu_marketing", "task_hash": "58c595b7c49dba71f3aa397880a13a84", "model_hash": "03418cf8091a9882619950ffb07429a5", "model_config": {"model": "hf_checkpoints/dclm-pool-1b-1x-h-fasttext-open_lm_1b_swiglutorch-warm5000-lr0p003-wd0p033-cd3e-05-bs256-mult1-seed124-tokens28795904000", "revision": null, "trust_remote_code": null, "max_length": 2048, "model_path": null, "model_type": "hf"}, "task_config": {"task_name": "mmlu_marketing", "task_core": "mmlu_marketing", "limit": null, "split": "test", "num_shots": 5, "fewshot_seed": 1234, "primary_metric": "acc_per_char", "random_subsample_seed": 1234, "context_kwargs": null, "generation_kwargs": null, "metric_kwargs": {"uncond_docid_offset": 1000000}, "native_id_field": "index", "fewshot_source": null, "dataset_path": "cais/mmlu", "dataset_name": "marketing", "use_chat_format": null, "version": 1, "revision": null, "metadata": {"regimes": ["OLMES-v0.1"], "alias": "mmlu_marketing:rc::olmes"}}, "compute_config": {"batch_size": "16", "max_batch_size": 32, "output_dir": "hf_checkpoints/dclm-pool-1b-1x-h-fasttext-open_lm_1b_swiglutorch-warm5000-lr0p003-wd0p033-cd3e-05-bs256-mult1-seed124-tokens28795904000/olmes_fp32/mmlu", "num_recorded_inputs": 3, "save_raw_requests": true, "recompute_metrics": false, "check_datalake": false, "push_datalake": false, "no_datalake": false}, "processing_time": 15.254076957702637, "current_date": "2024-11-19 21:35:29 UTC", "num_instances": 234, "beaker_info": {}, "metrics": {"acc_raw": 0.5213675213675214, "acc_per_token": 0.5042735042735043, "acc_per_char": 0.5427350427350427, "correct_loss_raw": 13.019673931293, "incorrect_loss_raw": 16.553809684735754, "correct_loss_per_token": 2.6529033287731636, "incorrect_loss_per_token": 3.59070879885069, "correct_loss_per_char": 0.5415795828438098, "incorrect_loss_per_char": 0.7426038180350188, "acc_uncond": 0.5299145299145299, "correct_loss_uncond": -12.220347578708942, "incorrect_loss_uncond": -9.38660909935959, "primary_score": 0.5427350427350427}, "task_idx": 96} {"task_name": "mmlu_medical_genetics", "task_hash": "36a9fec8301b47f23d8ced742c53d402", "model_hash": "03418cf8091a9882619950ffb07429a5", "model_config": {"model": "hf_checkpoints/dclm-pool-1b-1x-h-fasttext-open_lm_1b_swiglutorch-warm5000-lr0p003-wd0p033-cd3e-05-bs256-mult1-seed124-tokens28795904000", "revision": null, "trust_remote_code": null, "max_length": 2048, "model_path": null, "model_type": "hf"}, "task_config": {"task_name": "mmlu_medical_genetics", "task_core": "mmlu_medical_genetics", "limit": null, "split": "test", "num_shots": 5, "fewshot_seed": 1234, "primary_metric": "acc_per_char", "random_subsample_seed": 1234, "context_kwargs": null, "generation_kwargs": null, "metric_kwargs": {"uncond_docid_offset": 1000000}, "native_id_field": "index", "fewshot_source": null, "dataset_path": "cais/mmlu", "dataset_name": "medical_genetics", "use_chat_format": null, "version": 1, "revision": null, "metadata": {"regimes": ["OLMES-v0.1"], "alias": "mmlu_medical_genetics:rc::olmes"}}, "compute_config": {"batch_size": "16", "max_batch_size": 32, "output_dir": "hf_checkpoints/dclm-pool-1b-1x-h-fasttext-open_lm_1b_swiglutorch-warm5000-lr0p003-wd0p033-cd3e-05-bs256-mult1-seed124-tokens28795904000/olmes_fp32/mmlu", "num_recorded_inputs": 3, "save_raw_requests": true, "recompute_metrics": false, "check_datalake": false, "push_datalake": false, "no_datalake": false}, "processing_time": 4.191201686859131, "current_date": "2024-11-19 21:35:44 UTC", "num_instances": 100, "beaker_info": {}, "metrics": {"acc_raw": 0.38, "acc_per_token": 0.42, "acc_per_char": 0.43, "correct_loss_raw": 14.743131907582283, "incorrect_loss_raw": 14.271686580975851, "correct_loss_per_token": 2.4686744897557973, "incorrect_loss_per_token": 2.9509285705860315, "correct_loss_per_char": 0.6537800260975594, "incorrect_loss_per_char": 0.7681347658972387, "acc_uncond": 0.37, "correct_loss_uncond": -13.33009413421154, "incorrect_loss_uncond": -11.562404470443727, "primary_score": 0.43}, "task_idx": 97} {"task_name": "mmlu_miscellaneous", "task_hash": "3ce7aa82135b0926faa1a6d49e1f073f", "model_hash": "03418cf8091a9882619950ffb07429a5", "model_config": {"model": "hf_checkpoints/dclm-pool-1b-1x-h-fasttext-open_lm_1b_swiglutorch-warm5000-lr0p003-wd0p033-cd3e-05-bs256-mult1-seed124-tokens28795904000", "revision": null, "trust_remote_code": null, "max_length": 2048, "model_path": null, "model_type": "hf"}, "task_config": {"task_name": "mmlu_miscellaneous", "task_core": "mmlu_miscellaneous", "limit": null, "split": "test", "num_shots": 5, "fewshot_seed": 1234, "primary_metric": "acc_per_char", "random_subsample_seed": 1234, "context_kwargs": null, "generation_kwargs": null, "metric_kwargs": {"uncond_docid_offset": 1000000}, "native_id_field": "index", "fewshot_source": null, "dataset_path": "cais/mmlu", "dataset_name": "miscellaneous", "use_chat_format": null, "version": 1, "revision": null, "metadata": {"regimes": ["OLMES-v0.1"], "alias": "mmlu_miscellaneous:rc::olmes"}}, "compute_config": {"batch_size": "16", "max_batch_size": 32, "output_dir": "hf_checkpoints/dclm-pool-1b-1x-h-fasttext-open_lm_1b_swiglutorch-warm5000-lr0p003-wd0p033-cd3e-05-bs256-mult1-seed124-tokens28795904000/olmes_fp32/mmlu", "num_recorded_inputs": 3, "save_raw_requests": true, "recompute_metrics": false, "check_datalake": false, "push_datalake": false, "no_datalake": false}, "processing_time": 31.331737995147705, "current_date": "2024-11-19 21:35:48 UTC", "num_instances": 783, "beaker_info": {}, "metrics": {"acc_raw": 0.4827586206896552, "acc_per_token": 0.4840357598978289, "acc_per_char": 0.48020434227330777, "correct_loss_raw": 9.973246936668495, "incorrect_loss_raw": 12.443472592563705, "correct_loss_per_token": 2.8331898448315176, "incorrect_loss_per_token": 4.036465953696992, "correct_loss_per_char": 0.6163125125320722, "incorrect_loss_per_char": 0.8791243173215028, "acc_uncond": 0.4840357598978289, "correct_loss_uncond": -10.355883860273114, "incorrect_loss_uncond": -7.90547576311955, "primary_score": 0.48020434227330777}, "task_idx": 98} {"task_name": "mmlu_moral_disputes", "task_hash": "643b3f1a385bb8b4ce6a53105fffb3de", "model_hash": "03418cf8091a9882619950ffb07429a5", "model_config": {"model": "hf_checkpoints/dclm-pool-1b-1x-h-fasttext-open_lm_1b_swiglutorch-warm5000-lr0p003-wd0p033-cd3e-05-bs256-mult1-seed124-tokens28795904000", "revision": null, "trust_remote_code": null, "max_length": 2048, "model_path": null, "model_type": "hf"}, "task_config": {"task_name": "mmlu_moral_disputes", "task_core": "mmlu_moral_disputes", "limit": null, "split": "test", "num_shots": 5, "fewshot_seed": 1234, "primary_metric": "acc_per_char", "random_subsample_seed": 1234, "context_kwargs": null, "generation_kwargs": null, "metric_kwargs": {"uncond_docid_offset": 1000000}, "native_id_field": "index", "fewshot_source": null, "dataset_path": "cais/mmlu", "dataset_name": "moral_disputes", "use_chat_format": null, "version": 1, "revision": null, "metadata": {"regimes": ["OLMES-v0.1"], "alias": "mmlu_moral_disputes:rc::olmes"}}, "compute_config": {"batch_size": "16", "max_batch_size": 32, "output_dir": "hf_checkpoints/dclm-pool-1b-1x-h-fasttext-open_lm_1b_swiglutorch-warm5000-lr0p003-wd0p033-cd3e-05-bs256-mult1-seed124-tokens28795904000/olmes_fp32/mmlu", "num_recorded_inputs": 3, "save_raw_requests": true, "recompute_metrics": false, "check_datalake": false, "push_datalake": false, "no_datalake": false}, "processing_time": 23.96148133277893, "current_date": "2024-11-19 21:36:20 UTC", "num_instances": 346, "beaker_info": {}, "metrics": {"acc_raw": 0.2861271676300578, "acc_per_token": 0.2832369942196532, "acc_per_char": 0.2514450867052023, "correct_loss_raw": 26.762201377249866, "incorrect_loss_raw": 24.295487502177107, "correct_loss_per_token": 2.8838279812730208, "incorrect_loss_per_token": 3.0668071172461397, "correct_loss_per_char": 0.5684422696554423, "incorrect_loss_per_char": 0.5808929020841671, "acc_uncond": 0.3063583815028902, "correct_loss_uncond": -13.697513322092894, "incorrect_loss_uncond": -13.334110145164594, "primary_score": 0.2514450867052023}, "task_idx": 99} {"task_name": "mmlu_moral_scenarios", "task_hash": "49d4bc1cb20a4596312dda1c40b5467e", "model_hash": "03418cf8091a9882619950ffb07429a5", "model_config": {"model": "hf_checkpoints/dclm-pool-1b-1x-h-fasttext-open_lm_1b_swiglutorch-warm5000-lr0p003-wd0p033-cd3e-05-bs256-mult1-seed124-tokens28795904000", "revision": null, "trust_remote_code": null, "max_length": 2048, "model_path": null, "model_type": "hf"}, "task_config": {"task_name": "mmlu_moral_scenarios", "task_core": "mmlu_moral_scenarios", "limit": null, "split": "test", "num_shots": 5, "fewshot_seed": 1234, "primary_metric": "acc_per_char", "random_subsample_seed": 1234, "context_kwargs": null, "generation_kwargs": null, "metric_kwargs": {"uncond_docid_offset": 1000000}, "native_id_field": "index", "fewshot_source": null, "dataset_path": "cais/mmlu", "dataset_name": "moral_scenarios", "use_chat_format": null, "version": 1, "revision": null, "metadata": {"regimes": ["OLMES-v0.1"], "alias": "mmlu_moral_scenarios:rc::olmes"}}, "compute_config": {"batch_size": "16", "max_batch_size": 32, "output_dir": "hf_checkpoints/dclm-pool-1b-1x-h-fasttext-open_lm_1b_swiglutorch-warm5000-lr0p003-wd0p033-cd3e-05-bs256-mult1-seed124-tokens28795904000/olmes_fp32/mmlu", "num_recorded_inputs": 3, "save_raw_requests": true, "recompute_metrics": false, "check_datalake": false, "push_datalake": false, "no_datalake": false}, "processing_time": 116.89065456390381, "current_date": "2024-11-19 21:36:44 UTC", "num_instances": 895, "beaker_info": {}, "metrics": {"acc_raw": 0.23798882681564246, "acc_per_token": 0.24134078212290502, "acc_per_char": 0.24022346368715083, "correct_loss_raw": 1.7945288913210011, "incorrect_loss_raw": 1.797925722465836, "correct_loss_per_token": 0.4390254018226819, "incorrect_loss_per_token": 0.44364589957625594, "correct_loss_per_char": 0.10330200633329756, "incorrect_loss_per_char": 0.10433826118203812, "acc_uncond": 0.27262569832402234, "correct_loss_uncond": -20.481219156960535, "incorrect_loss_uncond": -20.232290562770878, "primary_score": 0.24022346368715083}, "task_idx": 100} {"task_name": "mmlu_nutrition", "task_hash": "96b6d39ad9e2a3d1f6444ca444eafe21", "model_hash": "03418cf8091a9882619950ffb07429a5", "model_config": {"model": "hf_checkpoints/dclm-pool-1b-1x-h-fasttext-open_lm_1b_swiglutorch-warm5000-lr0p003-wd0p033-cd3e-05-bs256-mult1-seed124-tokens28795904000", "revision": null, "trust_remote_code": null, "max_length": 2048, "model_path": null, "model_type": "hf"}, "task_config": {"task_name": "mmlu_nutrition", "task_core": "mmlu_nutrition", "limit": null, "split": "test", "num_shots": 5, "fewshot_seed": 1234, "primary_metric": "acc_per_char", "random_subsample_seed": 1234, "context_kwargs": null, "generation_kwargs": null, "metric_kwargs": {"uncond_docid_offset": 1000000}, "native_id_field": "index", "fewshot_source": null, "dataset_path": "cais/mmlu", "dataset_name": "nutrition", "use_chat_format": null, "version": 1, "revision": null, "metadata": {"regimes": ["OLMES-v0.1"], "alias": "mmlu_nutrition:rc::olmes"}}, "compute_config": {"batch_size": "16", "max_batch_size": 32, "output_dir": "hf_checkpoints/dclm-pool-1b-1x-h-fasttext-open_lm_1b_swiglutorch-warm5000-lr0p003-wd0p033-cd3e-05-bs256-mult1-seed124-tokens28795904000/olmes_fp32/mmlu", "num_recorded_inputs": 3, "save_raw_requests": true, "recompute_metrics": false, "check_datalake": false, "push_datalake": false, "no_datalake": false}, "processing_time": 24.05600881576538, "current_date": "2024-11-19 21:38:40 UTC", "num_instances": 306, "beaker_info": {}, "metrics": {"acc_raw": 0.2647058823529412, "acc_per_token": 0.3137254901960784, "acc_per_char": 0.32679738562091504, "correct_loss_raw": 26.710010400589773, "incorrect_loss_raw": 23.848608736768515, "correct_loss_per_token": 2.6501611735502424, "incorrect_loss_per_token": 2.902847647897238, "correct_loss_per_char": 0.5759508009400872, "incorrect_loss_per_char": 0.6318093526945697, "acc_uncond": 0.3333333333333333, "correct_loss_uncond": -11.975636066369761, "incorrect_loss_uncond": -11.679631917954525, "primary_score": 0.32679738562091504}, "task_idx": 101} {"task_name": "mmlu_philosophy", "task_hash": "e8a8e079a41710f36b2b11993287bbfb", "model_hash": "03418cf8091a9882619950ffb07429a5", "model_config": {"model": "hf_checkpoints/dclm-pool-1b-1x-h-fasttext-open_lm_1b_swiglutorch-warm5000-lr0p003-wd0p033-cd3e-05-bs256-mult1-seed124-tokens28795904000", "revision": null, "trust_remote_code": null, "max_length": 2048, "model_path": null, "model_type": "hf"}, "task_config": {"task_name": "mmlu_philosophy", "task_core": "mmlu_philosophy", "limit": null, "split": "test", "num_shots": 5, "fewshot_seed": 1234, "primary_metric": "acc_per_char", "random_subsample_seed": 1234, "context_kwargs": null, "generation_kwargs": null, "metric_kwargs": {"uncond_docid_offset": 1000000}, "native_id_field": "index", "fewshot_source": null, "dataset_path": "cais/mmlu", "dataset_name": "philosophy", "use_chat_format": null, "version": 1, "revision": null, "metadata": {"regimes": ["OLMES-v0.1"], "alias": "mmlu_philosophy:rc::olmes"}}, "compute_config": {"batch_size": "16", "max_batch_size": 32, "output_dir": "hf_checkpoints/dclm-pool-1b-1x-h-fasttext-open_lm_1b_swiglutorch-warm5000-lr0p003-wd0p033-cd3e-05-bs256-mult1-seed124-tokens28795904000/olmes_fp32/mmlu", "num_recorded_inputs": 3, "save_raw_requests": true, "recompute_metrics": false, "check_datalake": false, "push_datalake": false, "no_datalake": false}, "processing_time": 16.853014945983887, "current_date": "2024-11-19 21:39:05 UTC", "num_instances": 311, "beaker_info": {}, "metrics": {"acc_raw": 0.2990353697749196, "acc_per_token": 0.31511254019292606, "acc_per_char": 0.3022508038585209, "correct_loss_raw": 22.036362408154265, "incorrect_loss_raw": 20.702107290363003, "correct_loss_per_token": 3.1140666275385387, "incorrect_loss_per_token": 3.35463497433172, "correct_loss_per_char": 0.6169163719681281, "incorrect_loss_per_char": 0.6499129989768088, "acc_uncond": 0.3633440514469453, "correct_loss_uncond": -13.029168019244908, "incorrect_loss_uncond": -12.086915756345185, "primary_score": 0.3022508038585209}, "task_idx": 102} {"task_name": "mmlu_prehistory", "task_hash": "7b3aeaaf8c8020231ef7fed4751f86c2", "model_hash": "03418cf8091a9882619950ffb07429a5", "model_config": {"model": "hf_checkpoints/dclm-pool-1b-1x-h-fasttext-open_lm_1b_swiglutorch-warm5000-lr0p003-wd0p033-cd3e-05-bs256-mult1-seed124-tokens28795904000", "revision": null, "trust_remote_code": null, "max_length": 2048, "model_path": null, "model_type": "hf"}, "task_config": {"task_name": "mmlu_prehistory", "task_core": "mmlu_prehistory", "limit": null, "split": "test", "num_shots": 5, "fewshot_seed": 1234, "primary_metric": "acc_per_char", "random_subsample_seed": 1234, "context_kwargs": null, "generation_kwargs": null, "metric_kwargs": {"uncond_docid_offset": 1000000}, "native_id_field": "index", "fewshot_source": null, "dataset_path": "cais/mmlu", "dataset_name": "prehistory", "use_chat_format": null, "version": 1, "revision": null, "metadata": {"regimes": ["OLMES-v0.1"], "alias": "mmlu_prehistory:rc::olmes"}}, "compute_config": {"batch_size": "16", "max_batch_size": 32, "output_dir": "hf_checkpoints/dclm-pool-1b-1x-h-fasttext-open_lm_1b_swiglutorch-warm5000-lr0p003-wd0p033-cd3e-05-bs256-mult1-seed124-tokens28795904000/olmes_fp32/mmlu", "num_recorded_inputs": 3, "save_raw_requests": true, "recompute_metrics": false, "check_datalake": false, "push_datalake": false, "no_datalake": false}, "processing_time": 20.799691200256348, "current_date": "2024-11-19 21:39:21 UTC", "num_instances": 324, "beaker_info": {}, "metrics": {"acc_raw": 0.404320987654321, "acc_per_token": 0.36728395061728397, "acc_per_char": 0.3425925925925926, "correct_loss_raw": 21.146028854044868, "incorrect_loss_raw": 22.864884013746988, "correct_loss_per_token": 2.632545019073288, "incorrect_loss_per_token": 2.9579714207120884, "correct_loss_per_char": 0.6003120828855978, "incorrect_loss_per_char": 0.6647959973037921, "acc_uncond": 0.38580246913580246, "correct_loss_uncond": -14.831230773050109, "incorrect_loss_uncond": -14.000013917807197, "primary_score": 0.3425925925925926}, "task_idx": 103} {"task_name": "mmlu_professional_accounting", "task_hash": "271a9bf402980f6076d2237f6c3d56d5", "model_hash": "03418cf8091a9882619950ffb07429a5", "model_config": {"model": "hf_checkpoints/dclm-pool-1b-1x-h-fasttext-open_lm_1b_swiglutorch-warm5000-lr0p003-wd0p033-cd3e-05-bs256-mult1-seed124-tokens28795904000", "revision": null, "trust_remote_code": null, "max_length": 2048, "model_path": null, "model_type": "hf"}, "task_config": {"task_name": "mmlu_professional_accounting", "task_core": "mmlu_professional_accounting", "limit": null, "split": "test", "num_shots": 5, "fewshot_seed": 1234, "primary_metric": "acc_per_char", "random_subsample_seed": 1234, "context_kwargs": null, "generation_kwargs": null, "metric_kwargs": {"uncond_docid_offset": 1000000}, "native_id_field": "index", "fewshot_source": null, "dataset_path": "cais/mmlu", "dataset_name": "professional_accounting", "use_chat_format": null, "version": 1, "revision": null, "metadata": {"regimes": ["OLMES-v0.1"], "alias": "mmlu_professional_accounting:rc::olmes"}}, "compute_config": {"batch_size": "16", "max_batch_size": 32, "output_dir": "hf_checkpoints/dclm-pool-1b-1x-h-fasttext-open_lm_1b_swiglutorch-warm5000-lr0p003-wd0p033-cd3e-05-bs256-mult1-seed124-tokens28795904000/olmes_fp32/mmlu", "num_recorded_inputs": 3, "save_raw_requests": true, "recompute_metrics": false, "check_datalake": false, "push_datalake": false, "no_datalake": false}, "processing_time": 35.48146605491638, "current_date": "2024-11-19 21:39:42 UTC", "num_instances": 282, "beaker_info": {}, "metrics": {"acc_raw": 0.2907801418439716, "acc_per_token": 0.2695035460992908, "acc_per_char": 0.2624113475177305, "correct_loss_raw": 25.166415042911016, "incorrect_loss_raw": 25.396868566953817, "correct_loss_per_token": 2.9057916461264224, "incorrect_loss_per_token": 2.9578965180040386, "correct_loss_per_char": 0.7851858868344537, "incorrect_loss_per_char": 0.815302295690538, "acc_uncond": 0.3120567375886525, "correct_loss_uncond": -12.864156899722756, "incorrect_loss_uncond": -12.295960299089451, "primary_score": 0.2624113475177305}, "task_idx": 104} {"task_name": "mmlu_professional_law", "task_hash": "9cf2ca304d70aaad2023633d91fbfefa", "model_hash": "03418cf8091a9882619950ffb07429a5", "model_config": {"model": "hf_checkpoints/dclm-pool-1b-1x-h-fasttext-open_lm_1b_swiglutorch-warm5000-lr0p003-wd0p033-cd3e-05-bs256-mult1-seed124-tokens28795904000", "revision": null, "trust_remote_code": null, "max_length": 2048, "model_path": null, "model_type": "hf"}, "task_config": {"task_name": "mmlu_professional_law", "task_core": "mmlu_professional_law", "limit": null, "split": "test", "num_shots": 5, "fewshot_seed": 1234, "primary_metric": "acc_per_char", "random_subsample_seed": 1234, "context_kwargs": null, "generation_kwargs": null, "metric_kwargs": {"uncond_docid_offset": 1000000}, "native_id_field": "index", "fewshot_source": null, "dataset_path": "cais/mmlu", "dataset_name": "professional_law", "use_chat_format": null, "version": 1, "revision": null, "metadata": {"regimes": ["OLMES-v0.1"], "alias": "mmlu_professional_law:rc::olmes"}}, "compute_config": {"batch_size": "16", "max_batch_size": 32, "output_dir": "hf_checkpoints/dclm-pool-1b-1x-h-fasttext-open_lm_1b_swiglutorch-warm5000-lr0p003-wd0p033-cd3e-05-bs256-mult1-seed124-tokens28795904000/olmes_fp32/mmlu", "num_recorded_inputs": 3, "save_raw_requests": true, "recompute_metrics": false, "check_datalake": false, "push_datalake": false, "no_datalake": false}, "processing_time": 499.405357837677, "current_date": "2024-11-19 21:40:18 UTC", "num_instances": 1534, "beaker_info": {}, "metrics": {"acc_raw": 0.25097783572359844, "acc_per_token": 0.25684485006518903, "acc_per_char": 0.2757496740547588, "correct_loss_raw": 41.34747552964958, "incorrect_loss_raw": 39.70492565160419, "correct_loss_per_token": 2.2535205880807645, "incorrect_loss_per_token": 2.2561388734950083, "correct_loss_per_char": 0.4481341781759717, "incorrect_loss_per_char": 0.4478595031156273, "acc_uncond": 0.2848761408083442, "correct_loss_uncond": -26.798888672439496, "incorrect_loss_uncond": -25.853866037106012, "primary_score": 0.2757496740547588}, "task_idx": 105} {"task_name": "mmlu_professional_medicine", "task_hash": "e76678f3aea053cba7bbb3fe152ff642", "model_hash": "03418cf8091a9882619950ffb07429a5", "model_config": {"model": "hf_checkpoints/dclm-pool-1b-1x-h-fasttext-open_lm_1b_swiglutorch-warm5000-lr0p003-wd0p033-cd3e-05-bs256-mult1-seed124-tokens28795904000", "revision": null, "trust_remote_code": null, "max_length": 2048, "model_path": null, "model_type": "hf"}, "task_config": {"task_name": "mmlu_professional_medicine", "task_core": "mmlu_professional_medicine", "limit": null, "split": "test", "num_shots": 5, "fewshot_seed": 1234, "primary_metric": "acc_per_char", "random_subsample_seed": 1234, "context_kwargs": null, "generation_kwargs": null, "metric_kwargs": {"uncond_docid_offset": 1000000}, "native_id_field": "index", "fewshot_source": null, "dataset_path": "cais/mmlu", "dataset_name": "professional_medicine", "use_chat_format": null, "version": 1, "revision": null, "metadata": {"regimes": ["OLMES-v0.1"], "alias": "mmlu_professional_medicine:rc::olmes"}}, "compute_config": {"batch_size": "16", "max_batch_size": 32, "output_dir": "hf_checkpoints/dclm-pool-1b-1x-h-fasttext-open_lm_1b_swiglutorch-warm5000-lr0p003-wd0p033-cd3e-05-bs256-mult1-seed124-tokens28795904000/olmes_fp32/mmlu", "num_recorded_inputs": 3, "save_raw_requests": true, "recompute_metrics": false, "check_datalake": false, "push_datalake": false, "no_datalake": false}, "processing_time": 65.07992219924927, "current_date": "2024-11-19 21:48:37 UTC", "num_instances": 272, "beaker_info": {}, "metrics": {"acc_raw": 0.28308823529411764, "acc_per_token": 0.3125, "acc_per_char": 0.2977941176470588, "correct_loss_raw": 14.847494749023634, "incorrect_loss_raw": 15.73212240358779, "correct_loss_per_token": 2.668888486909859, "incorrect_loss_per_token": 2.8228289677717346, "correct_loss_per_char": 0.5463753252705837, "incorrect_loss_per_char": 0.5865243782959754, "acc_uncond": 0.3713235294117647, "correct_loss_uncond": -11.705239387557787, "incorrect_loss_uncond": -10.756183107255723, "primary_score": 0.2977941176470588}, "task_idx": 106} {"task_name": "mmlu_professional_psychology", "task_hash": "1f11cdabb27186bb3d09781f9a2bce87", "model_hash": "03418cf8091a9882619950ffb07429a5", "model_config": {"model": "hf_checkpoints/dclm-pool-1b-1x-h-fasttext-open_lm_1b_swiglutorch-warm5000-lr0p003-wd0p033-cd3e-05-bs256-mult1-seed124-tokens28795904000", "revision": null, "trust_remote_code": null, "max_length": 2048, "model_path": null, "model_type": "hf"}, "task_config": {"task_name": "mmlu_professional_psychology", "task_core": "mmlu_professional_psychology", "limit": null, "split": "test", "num_shots": 5, "fewshot_seed": 1234, "primary_metric": "acc_per_char", "random_subsample_seed": 1234, "context_kwargs": null, "generation_kwargs": null, "metric_kwargs": {"uncond_docid_offset": 1000000}, "native_id_field": "index", "fewshot_source": null, "dataset_path": "cais/mmlu", "dataset_name": "professional_psychology", "use_chat_format": null, "version": 1, "revision": null, "metadata": {"regimes": ["OLMES-v0.1"], "alias": "mmlu_professional_psychology:rc::olmes"}}, "compute_config": {"batch_size": "16", "max_batch_size": 32, "output_dir": "hf_checkpoints/dclm-pool-1b-1x-h-fasttext-open_lm_1b_swiglutorch-warm5000-lr0p003-wd0p033-cd3e-05-bs256-mult1-seed124-tokens28795904000/olmes_fp32/mmlu", "num_recorded_inputs": 3, "save_raw_requests": true, "recompute_metrics": false, "check_datalake": false, "push_datalake": false, "no_datalake": false}, "processing_time": 51.8912353515625, "current_date": "2024-11-19 21:49:43 UTC", "num_instances": 612, "beaker_info": {}, "metrics": {"acc_raw": 0.29901960784313725, "acc_per_token": 0.33986928104575165, "acc_per_char": 0.3202614379084967, "correct_loss_raw": 25.4097203849967, "incorrect_loss_raw": 26.151661396351244, "correct_loss_per_token": 3.2427745371115555, "incorrect_loss_per_token": 3.5245931682352443, "correct_loss_per_char": 0.587391723247063, "incorrect_loss_per_char": 0.6356098136551943, "acc_uncond": 0.3104575163398693, "correct_loss_uncond": -15.352141680281147, "incorrect_loss_uncond": -14.66829736215877, "primary_score": 0.3202614379084967}, "task_idx": 107} {"task_name": "mmlu_public_relations", "task_hash": "f4f7d9efa5b14b632f1bb8cf53a780d0", "model_hash": "03418cf8091a9882619950ffb07429a5", "model_config": {"model": "hf_checkpoints/dclm-pool-1b-1x-h-fasttext-open_lm_1b_swiglutorch-warm5000-lr0p003-wd0p033-cd3e-05-bs256-mult1-seed124-tokens28795904000", "revision": null, "trust_remote_code": null, "max_length": 2048, "model_path": null, "model_type": "hf"}, "task_config": {"task_name": "mmlu_public_relations", "task_core": "mmlu_public_relations", "limit": null, "split": "test", "num_shots": 5, "fewshot_seed": 1234, "primary_metric": "acc_per_char", "random_subsample_seed": 1234, "context_kwargs": null, "generation_kwargs": null, "metric_kwargs": {"uncond_docid_offset": 1000000}, "native_id_field": "index", "fewshot_source": null, "dataset_path": "cais/mmlu", "dataset_name": "public_relations", "use_chat_format": null, "version": 1, "revision": null, "metadata": {"regimes": ["OLMES-v0.1"], "alias": "mmlu_public_relations:rc::olmes"}}, "compute_config": {"batch_size": "16", "max_batch_size": 32, "output_dir": "hf_checkpoints/dclm-pool-1b-1x-h-fasttext-open_lm_1b_swiglutorch-warm5000-lr0p003-wd0p033-cd3e-05-bs256-mult1-seed124-tokens28795904000/olmes_fp32/mmlu", "num_recorded_inputs": 3, "save_raw_requests": true, "recompute_metrics": false, "check_datalake": false, "push_datalake": false, "no_datalake": false}, "processing_time": 5.817285776138306, "current_date": "2024-11-19 21:50:34 UTC", "num_instances": 110, "beaker_info": {}, "metrics": {"acc_raw": 0.44545454545454544, "acc_per_token": 0.39090909090909093, "acc_per_char": 0.2909090909090909, "correct_loss_raw": 13.839453978430141, "incorrect_loss_raw": 16.5949248881051, "correct_loss_per_token": 3.9873044605859063, "incorrect_loss_per_token": 4.544503142075553, "correct_loss_per_char": 0.7093345693575296, "incorrect_loss_per_char": 0.773644059946458, "acc_uncond": 0.34545454545454546, "correct_loss_uncond": -9.499438776211305, "incorrect_loss_uncond": -8.281265567649495, "primary_score": 0.2909090909090909}, "task_idx": 108} {"task_name": "mmlu_security_studies", "task_hash": "ae4ffe7cce87e733dc815d013b44ec75", "model_hash": "03418cf8091a9882619950ffb07429a5", "model_config": {"model": "hf_checkpoints/dclm-pool-1b-1x-h-fasttext-open_lm_1b_swiglutorch-warm5000-lr0p003-wd0p033-cd3e-05-bs256-mult1-seed124-tokens28795904000", "revision": null, "trust_remote_code": null, "max_length": 2048, "model_path": null, "model_type": "hf"}, "task_config": {"task_name": "mmlu_security_studies", "task_core": "mmlu_security_studies", "limit": null, "split": "test", "num_shots": 5, "fewshot_seed": 1234, "primary_metric": "acc_per_char", "random_subsample_seed": 1234, "context_kwargs": null, "generation_kwargs": null, "metric_kwargs": {"uncond_docid_offset": 1000000}, "native_id_field": "index", "fewshot_source": null, "dataset_path": "cais/mmlu", "dataset_name": "security_studies", "use_chat_format": null, "version": 1, "revision": null, "metadata": {"regimes": ["OLMES-v0.1"], "alias": "mmlu_security_studies:rc::olmes"}}, "compute_config": {"batch_size": "16", "max_batch_size": 32, "output_dir": "hf_checkpoints/dclm-pool-1b-1x-h-fasttext-open_lm_1b_swiglutorch-warm5000-lr0p003-wd0p033-cd3e-05-bs256-mult1-seed124-tokens28795904000/olmes_fp32/mmlu", "num_recorded_inputs": 3, "save_raw_requests": true, "recompute_metrics": false, "check_datalake": false, "push_datalake": false, "no_datalake": false}, "processing_time": 27.682515382766724, "current_date": "2024-11-19 21:50:40 UTC", "num_instances": 245, "beaker_info": {}, "metrics": {"acc_raw": 0.30612244897959184, "acc_per_token": 0.31020408163265306, "acc_per_char": 0.27346938775510204, "correct_loss_raw": 88.36162767215652, "incorrect_loss_raw": 98.62264835591225, "correct_loss_per_token": 3.15054932583337, "incorrect_loss_per_token": 3.0993470106979735, "correct_loss_per_char": 0.6036736416103822, "incorrect_loss_per_char": 0.5617384804719124, "acc_uncond": 0.2653061224489796, "correct_loss_uncond": -16.647231608021016, "incorrect_loss_uncond": -19.05951665372265, "primary_score": 0.27346938775510204}, "task_idx": 109} {"task_name": "mmlu_sociology", "task_hash": "66633d3e396945e27b4489e2e582b958", "model_hash": "03418cf8091a9882619950ffb07429a5", "model_config": {"model": "hf_checkpoints/dclm-pool-1b-1x-h-fasttext-open_lm_1b_swiglutorch-warm5000-lr0p003-wd0p033-cd3e-05-bs256-mult1-seed124-tokens28795904000", "revision": null, "trust_remote_code": null, "max_length": 2048, "model_path": null, "model_type": "hf"}, "task_config": {"task_name": "mmlu_sociology", "task_core": "mmlu_sociology", "limit": null, "split": "test", "num_shots": 5, "fewshot_seed": 1234, "primary_metric": "acc_per_char", "random_subsample_seed": 1234, "context_kwargs": null, "generation_kwargs": null, "metric_kwargs": {"uncond_docid_offset": 1000000}, "native_id_field": "index", "fewshot_source": null, "dataset_path": "cais/mmlu", "dataset_name": "sociology", "use_chat_format": null, "version": 1, "revision": null, "metadata": {"regimes": ["OLMES-v0.1"], "alias": "mmlu_sociology:rc::olmes"}}, "compute_config": {"batch_size": "16", "max_batch_size": 32, "output_dir": "hf_checkpoints/dclm-pool-1b-1x-h-fasttext-open_lm_1b_swiglutorch-warm5000-lr0p003-wd0p033-cd3e-05-bs256-mult1-seed124-tokens28795904000/olmes_fp32/mmlu", "num_recorded_inputs": 3, "save_raw_requests": true, "recompute_metrics": false, "check_datalake": false, "push_datalake": false, "no_datalake": false}, "processing_time": 11.714619159698486, "current_date": "2024-11-19 21:51:08 UTC", "num_instances": 201, "beaker_info": {}, "metrics": {"acc_raw": 0.27860696517412936, "acc_per_token": 0.34328358208955223, "acc_per_char": 0.27860696517412936, "correct_loss_raw": 29.919023765260306, "incorrect_loss_raw": 30.6563104515645, "correct_loss_per_token": 3.2267842207330983, "incorrect_loss_per_token": 3.550673093473738, "correct_loss_per_char": 0.5550078604496379, "incorrect_loss_per_char": 0.5870597591181631, "acc_uncond": 0.43283582089552236, "correct_loss_uncond": -14.502825765467401, "incorrect_loss_uncond": -13.354575955848006, "primary_score": 0.27860696517412936}, "task_idx": 110} {"task_name": "mmlu_us_foreign_policy", "task_hash": "bd1ffb65bcdfb1582c6b60bcdbd3d533", "model_hash": "03418cf8091a9882619950ffb07429a5", "model_config": {"model": "hf_checkpoints/dclm-pool-1b-1x-h-fasttext-open_lm_1b_swiglutorch-warm5000-lr0p003-wd0p033-cd3e-05-bs256-mult1-seed124-tokens28795904000", "revision": null, "trust_remote_code": null, "max_length": 2048, "model_path": null, "model_type": "hf"}, "task_config": {"task_name": "mmlu_us_foreign_policy", "task_core": "mmlu_us_foreign_policy", "limit": null, "split": "test", "num_shots": 5, "fewshot_seed": 1234, "primary_metric": "acc_per_char", "random_subsample_seed": 1234, "context_kwargs": null, "generation_kwargs": null, "metric_kwargs": {"uncond_docid_offset": 1000000}, "native_id_field": "index", "fewshot_source": null, "dataset_path": "cais/mmlu", "dataset_name": "us_foreign_policy", "use_chat_format": null, "version": 1, "revision": null, "metadata": {"regimes": ["OLMES-v0.1"], "alias": "mmlu_us_foreign_policy:rc::olmes"}}, "compute_config": {"batch_size": "16", "max_batch_size": 32, "output_dir": "hf_checkpoints/dclm-pool-1b-1x-h-fasttext-open_lm_1b_swiglutorch-warm5000-lr0p003-wd0p033-cd3e-05-bs256-mult1-seed124-tokens28795904000/olmes_fp32/mmlu", "num_recorded_inputs": 3, "save_raw_requests": true, "recompute_metrics": false, "check_datalake": false, "push_datalake": false, "no_datalake": false}, "processing_time": 6.119236469268799, "current_date": "2024-11-19 21:51:19 UTC", "num_instances": 100, "beaker_info": {}, "metrics": {"acc_raw": 0.37, "acc_per_token": 0.34, "acc_per_char": 0.37, "correct_loss_raw": 21.595972112417222, "incorrect_loss_raw": 20.498132687409726, "correct_loss_per_token": 2.4566456377095482, "incorrect_loss_per_token": 2.835236189122907, "correct_loss_per_char": 0.48015732518820037, "incorrect_loss_per_char": 0.5316416496468698, "acc_uncond": 0.42, "correct_loss_uncond": -13.404697128534316, "incorrect_loss_uncond": -11.991759479045868, "primary_score": 0.37}, "task_idx": 111} {"task_name": "mmlu_virology", "task_hash": "ea10babc381c242bef7bc631f8d422d2", "model_hash": "03418cf8091a9882619950ffb07429a5", "model_config": {"model": "hf_checkpoints/dclm-pool-1b-1x-h-fasttext-open_lm_1b_swiglutorch-warm5000-lr0p003-wd0p033-cd3e-05-bs256-mult1-seed124-tokens28795904000", "revision": null, "trust_remote_code": null, "max_length": 2048, "model_path": null, "model_type": "hf"}, "task_config": {"task_name": "mmlu_virology", "task_core": "mmlu_virology", "limit": null, "split": "test", "num_shots": 5, "fewshot_seed": 1234, "primary_metric": "acc_per_char", "random_subsample_seed": 1234, "context_kwargs": null, "generation_kwargs": null, "metric_kwargs": {"uncond_docid_offset": 1000000}, "native_id_field": "index", "fewshot_source": null, "dataset_path": "cais/mmlu", "dataset_name": "virology", "use_chat_format": null, "version": 1, "revision": null, "metadata": {"regimes": ["OLMES-v0.1"], "alias": "mmlu_virology:rc::olmes"}}, "compute_config": {"batch_size": "16", "max_batch_size": 32, "output_dir": "hf_checkpoints/dclm-pool-1b-1x-h-fasttext-open_lm_1b_swiglutorch-warm5000-lr0p003-wd0p033-cd3e-05-bs256-mult1-seed124-tokens28795904000/olmes_fp32/mmlu", "num_recorded_inputs": 3, "save_raw_requests": true, "recompute_metrics": false, "check_datalake": false, "push_datalake": false, "no_datalake": false}, "processing_time": 8.70515513420105, "current_date": "2024-11-19 21:51:26 UTC", "num_instances": 166, "beaker_info": {}, "metrics": {"acc_raw": 0.23493975903614459, "acc_per_token": 0.3795180722891566, "acc_per_char": 0.30120481927710846, "correct_loss_raw": 18.997218740991798, "incorrect_loss_raw": 19.065596443821622, "correct_loss_per_token": 3.2663271122144337, "incorrect_loss_per_token": 3.717375781578531, "correct_loss_per_char": 0.6537507042217882, "incorrect_loss_per_char": 0.7220563400961199, "acc_uncond": 0.28313253012048195, "correct_loss_uncond": -10.576083596930447, "incorrect_loss_uncond": -10.448030103880724, "primary_score": 0.30120481927710846}, "task_idx": 112} {"task_name": "mmlu_world_religions", "task_hash": "7b18e63e9c2a47f065dce28de478a8c0", "model_hash": "03418cf8091a9882619950ffb07429a5", "model_config": {"model": "hf_checkpoints/dclm-pool-1b-1x-h-fasttext-open_lm_1b_swiglutorch-warm5000-lr0p003-wd0p033-cd3e-05-bs256-mult1-seed124-tokens28795904000", "revision": null, "trust_remote_code": null, "max_length": 2048, "model_path": null, "model_type": "hf"}, "task_config": {"task_name": "mmlu_world_religions", "task_core": "mmlu_world_religions", "limit": null, "split": "test", "num_shots": 5, "fewshot_seed": 1234, "primary_metric": "acc_per_char", "random_subsample_seed": 1234, "context_kwargs": null, "generation_kwargs": null, "metric_kwargs": {"uncond_docid_offset": 1000000}, "native_id_field": "index", "fewshot_source": null, "dataset_path": "cais/mmlu", "dataset_name": "world_religions", "use_chat_format": null, "version": 1, "revision": null, "metadata": {"regimes": ["OLMES-v0.1"], "alias": "mmlu_world_religions:rc::olmes"}}, "compute_config": {"batch_size": "16", "max_batch_size": 32, "output_dir": "hf_checkpoints/dclm-pool-1b-1x-h-fasttext-open_lm_1b_swiglutorch-warm5000-lr0p003-wd0p033-cd3e-05-bs256-mult1-seed124-tokens28795904000/olmes_fp32/mmlu", "num_recorded_inputs": 3, "save_raw_requests": true, "recompute_metrics": false, "check_datalake": false, "push_datalake": false, "no_datalake": false}, "processing_time": 7.180548667907715, "current_date": "2024-11-19 21:51:34 UTC", "num_instances": 171, "beaker_info": {}, "metrics": {"acc_raw": 0.4678362573099415, "acc_per_token": 0.49707602339181284, "acc_per_char": 0.5029239766081871, "correct_loss_raw": 8.47612531526744, "incorrect_loss_raw": 10.434313575542925, "correct_loss_per_token": 2.4943425273595117, "incorrect_loss_per_token": 3.6110528083283655, "correct_loss_per_char": 0.6646023540305831, "incorrect_loss_per_char": 0.9210858793720432, "acc_uncond": 0.5380116959064327, "correct_loss_uncond": -10.584433940767545, "incorrect_loss_uncond": -7.8370108089716535, "primary_score": 0.5029239766081871}, "task_idx": 113}