diff --git a/4b284b12bc4opt4/evaluation/generation/agg.4b284b12bc4opt4_GEM-web_nlg_en_PALM_prompt_0.json b/4b284b12bc4opt4/evaluation/generation/agg.4b284b12bc4opt4_GEM-web_nlg_en_PALM_prompt_0.json new file mode 100644 index 0000000000000000000000000000000000000000..19eb526860b5a27acb3bd8c33eac9dc06337cfa9 --- /dev/null +++ b/4b284b12bc4opt4/evaluation/generation/agg.4b284b12bc4opt4_GEM-web_nlg_en_PALM_prompt_0.json @@ -0,0 +1 @@ +{"results": [{"task_name": "GEM/web_nlg_en", "prompt_name": "PALM_prompt", "bleu": 0.397060070126241, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "3e41305c-5461-4cf3-853d-8a6fb5747623", "prompt_jinja": "I will verbalize an abstract representation of a sentence in natural language. To do so, I will first show the representation and then the natural language. The text needs to include all of the information in the representation.\n\n{{input | join(\", \")}} {% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "bleu_stderr": 0.028476177083435063}, {"task_name": "GEM/web_nlg_en", "prompt_name": "PALM_prompt", "rouge1_precision": 0.08196959243431225, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "3e41305c-5461-4cf3-853d-8a6fb5747623", "prompt_jinja": "I will verbalize an abstract representation of a sentence in natural language. To do so, I will first show the representation and then the natural language. The text needs to include all of the information in the representation.\n\n{{input | join(\", \")}} {% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rouge1_precision_stderr": 0.00195116466928937}, {"task_name": "GEM/web_nlg_en", "prompt_name": "PALM_prompt", "rouge1_recall": 0.33108642729266935, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "3e41305c-5461-4cf3-853d-8a6fb5747623", "prompt_jinja": "I will verbalize an abstract representation of a sentence in natural language. To do so, I will first show the representation and then the natural language. The text needs to include all of the information in the representation.\n\n{{input | join(\", \")}} {% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rouge1_recall_stderr": 0.004857070190117945}, {"task_name": "GEM/web_nlg_en", "prompt_name": "PALM_prompt", "rouge1_fmeasure": 0.11979264664781694, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "3e41305c-5461-4cf3-853d-8a6fb5747623", "prompt_jinja": "I will verbalize an abstract representation of a sentence in natural language. To do so, I will first show the representation and then the natural language. The text needs to include all of the information in the representation.\n\n{{input | join(\", \")}} {% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rouge1_fmeasure_stderr": 0.002158921431338679}, {"task_name": "GEM/web_nlg_en", "prompt_name": "PALM_prompt", "rouge2_precision": 0.038077755175156366, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "3e41305c-5461-4cf3-853d-8a6fb5747623", "prompt_jinja": "I will verbalize an abstract representation of a sentence in natural language. 
To do so, I will first show the representation and then the natural language. The text needs to include all of the information in the representation.\n\n{{input | join(\", \")}} {% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rouge2_precision_stderr": 0.0011930918163582719}, {"task_name": "GEM/web_nlg_en", "prompt_name": "PALM_prompt", "rouge2_recall": 0.16233506295685138, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "3e41305c-5461-4cf3-853d-8a6fb5747623", "prompt_jinja": "I will verbalize an abstract representation of a sentence in natural language. To do so, I will first show the representation and then the natural language. The text needs to include all of the information in the representation.\n\n{{input | join(\", \")}} {% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rouge2_recall_stderr": 0.0033998382128665604}, {"task_name": "GEM/web_nlg_en", "prompt_name": "PALM_prompt", "rouge2_fmeasure": 0.056570429414871386, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "3e41305c-5461-4cf3-853d-8a6fb5747623", "prompt_jinja": "I will verbalize an abstract representation of a sentence in natural language. To do so, I will first show the representation and then the natural language. The text needs to include all of the information in the representation.\n\n{{input | join(\", \")}} {% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rouge2_fmeasure_stderr": 0.0013679132720736964}, {"task_name": "GEM/web_nlg_en", "prompt_name": "PALM_prompt", "rougeL_precision": 0.07704554627752275, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "3e41305c-5461-4cf3-853d-8a6fb5747623", "prompt_jinja": "I will verbalize an abstract representation of a sentence in natural language. To do so, I will first show the representation and then the natural language. The text needs to include all of the information in the representation.\n\n{{input | join(\", \")}} {% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rougeL_precision_stderr": 0.001768283838194329}, {"task_name": "GEM/web_nlg_en", "prompt_name": "PALM_prompt", "rougeL_recall": 0.31731175550944546, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "3e41305c-5461-4cf3-853d-8a6fb5747623", "prompt_jinja": "I will verbalize an abstract representation of a sentence in natural language. To do so, I will first show the representation and then the natural language. The text needs to include all of the information in the representation.\n\n{{input | join(\", \")}} {% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rougeL_recall_stderr": 0.004676502198993326}, {"task_name": "GEM/web_nlg_en", "prompt_name": "PALM_prompt", "rougeL_fmeasure": 0.11313932552316201, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "3e41305c-5461-4cf3-853d-8a6fb5747623", "prompt_jinja": "I will verbalize an abstract representation of a sentence in natural language. To do so, I will first show the representation and then the natural language. 
The text needs to include all of the information in the representation.\n\n{{input | join(\", \")}} {% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rougeL_fmeasure_stderr": 0.0019283064282019343}, {"task_name": "GEM/web_nlg_en", "prompt_name": "PALM_prompt", "rougeLsum_precision": 0.07702671368860729, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "3e41305c-5461-4cf3-853d-8a6fb5747623", "prompt_jinja": "I will verbalize an abstract representation of a sentence in natural language. To do so, I will first show the representation and then the natural language. The text needs to include all of the information in the representation.\n\n{{input | join(\", \")}} {% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rougeLsum_precision_stderr": 0.001838053091960503}, {"task_name": "GEM/web_nlg_en", "prompt_name": "PALM_prompt", "rougeLsum_recall": 0.3112451044141329, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "3e41305c-5461-4cf3-853d-8a6fb5747623", "prompt_jinja": "I will verbalize an abstract representation of a sentence in natural language. To do so, I will first show the representation and then the natural language. The text needs to include all of the information in the representation.\n\n{{input | join(\", \")}} {% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rougeLsum_recall_stderr": 0.00447435213067716}, {"task_name": "GEM/web_nlg_en", "prompt_name": "PALM_prompt", "rougeLsum_fmeasure": 0.11240293894137914, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "3e41305c-5461-4cf3-853d-8a6fb5747623", "prompt_jinja": "I will verbalize an abstract representation of a sentence in natural language. To do so, I will first show the representation and then the natural language. 
The text needs to include all of the information in the representation.\n\n{{input | join(\", \")}} {% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rougeLsum_fmeasure_stderr": 0.001986128142601576}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b12bc4opt4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 0, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b12bc4opt4/evaluation/generation/agg.4b284b12bc4opt4_GEM-web_nlg_en_PALM_prompt_1.json b/4b284b12bc4opt4/evaluation/generation/agg.4b284b12bc4opt4_GEM-web_nlg_en_PALM_prompt_1.json new file mode 100644 index 0000000000000000000000000000000000000000..d4e9057c3479ed796e815bca9d1d0c4511160b9a --- /dev/null +++ b/4b284b12bc4opt4/evaluation/generation/agg.4b284b12bc4opt4_GEM-web_nlg_en_PALM_prompt_1.json @@ -0,0 +1 @@ +{"results": [{"task_name": "GEM/web_nlg_en", "prompt_name": "PALM_prompt", "bleu": 0.38706003560361096, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "3e41305c-5461-4cf3-853d-8a6fb5747623", "prompt_jinja": "I will verbalize an abstract representation of a sentence in natural language. To do so, I will first show the representation and then the natural language. The text needs to include all of the information in the representation.\n\n{{input | join(\", \")}} {% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "bleu_stderr": 0.02717161163589533}, {"task_name": "GEM/web_nlg_en", "prompt_name": "PALM_prompt", "rouge1_precision": 0.07729769477815421, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "3e41305c-5461-4cf3-853d-8a6fb5747623", "prompt_jinja": "I will verbalize an abstract representation of a sentence in natural language. To do so, I will first show the representation and then the natural language. The text needs to include all of the information in the representation.\n\n{{input | join(\", \")}} {% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rouge1_precision_stderr": 0.001771654452916722}, {"task_name": "GEM/web_nlg_en", "prompt_name": "PALM_prompt", "rouge1_recall": 0.32003526042529734, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "3e41305c-5461-4cf3-853d-8a6fb5747623", "prompt_jinja": "I will verbalize an abstract representation of a sentence in natural language. To do so, I will first show the representation and then the natural language. 
The text needs to include all of the information in the representation.\n\n{{input | join(\", \")}} {% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rouge1_recall_stderr": 0.004812626649605805}, {"task_name": "GEM/web_nlg_en", "prompt_name": "PALM_prompt", "rouge1_fmeasure": 0.11542957589833107, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "3e41305c-5461-4cf3-853d-8a6fb5747623", "prompt_jinja": "I will verbalize an abstract representation of a sentence in natural language. To do so, I will first show the representation and then the natural language. The text needs to include all of the information in the representation.\n\n{{input | join(\", \")}} {% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rouge1_fmeasure_stderr": 0.0021300218455718844}, {"task_name": "GEM/web_nlg_en", "prompt_name": "PALM_prompt", "rouge2_precision": 0.036414818702198135, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "3e41305c-5461-4cf3-853d-8a6fb5747623", "prompt_jinja": "I will verbalize an abstract representation of a sentence in natural language. To do so, I will first show the representation and then the natural language. The text needs to include all of the information in the representation.\n\n{{input | join(\", \")}} {% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rouge2_precision_stderr": 0.001111989991849411}, {"task_name": "GEM/web_nlg_en", "prompt_name": "PALM_prompt", "rouge2_recall": 0.15727962630352726, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "3e41305c-5461-4cf3-853d-8a6fb5747623", "prompt_jinja": "I will verbalize an abstract representation of a sentence in natural language. To do so, I will first show the representation and then the natural language. The text needs to include all of the information in the representation.\n\n{{input | join(\", \")}} {% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rouge2_recall_stderr": 0.0033348788622806114}, {"task_name": "GEM/web_nlg_en", "prompt_name": "PALM_prompt", "rouge2_fmeasure": 0.05476675039227378, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "3e41305c-5461-4cf3-853d-8a6fb5747623", "prompt_jinja": "I will verbalize an abstract representation of a sentence in natural language. To do so, I will first show the representation and then the natural language. The text needs to include all of the information in the representation.\n\n{{input | join(\", \")}} {% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rouge2_fmeasure_stderr": 0.0013633806053571877}, {"task_name": "GEM/web_nlg_en", "prompt_name": "PALM_prompt", "rougeL_precision": 0.0729909459967392, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "3e41305c-5461-4cf3-853d-8a6fb5747623", "prompt_jinja": "I will verbalize an abstract representation of a sentence in natural language. To do so, I will first show the representation and then the natural language. 
The text needs to include all of the information in the representation.\n\n{{input | join(\", \")}} {% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rougeL_precision_stderr": 0.0016200898181591467}, {"task_name": "GEM/web_nlg_en", "prompt_name": "PALM_prompt", "rougeL_recall": 0.30701344301265726, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "3e41305c-5461-4cf3-853d-8a6fb5747623", "prompt_jinja": "I will verbalize an abstract representation of a sentence in natural language. To do so, I will first show the representation and then the natural language. The text needs to include all of the information in the representation.\n\n{{input | join(\", \")}} {% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rougeL_recall_stderr": 0.004658701295786162}, {"task_name": "GEM/web_nlg_en", "prompt_name": "PALM_prompt", "rougeL_fmeasure": 0.10950390722831477, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "3e41305c-5461-4cf3-853d-8a6fb5747623", "prompt_jinja": "I will verbalize an abstract representation of a sentence in natural language. To do so, I will first show the representation and then the natural language. The text needs to include all of the information in the representation.\n\n{{input | join(\", \")}} {% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rougeL_fmeasure_stderr": 0.0019529650137034304}, {"task_name": "GEM/web_nlg_en", "prompt_name": "PALM_prompt", "rougeLsum_precision": 0.07274875626049643, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "3e41305c-5461-4cf3-853d-8a6fb5747623", "prompt_jinja": "I will verbalize an abstract representation of a sentence in natural language. To do so, I will first show the representation and then the natural language. The text needs to include all of the information in the representation.\n\n{{input | join(\", \")}} {% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rougeLsum_precision_stderr": 0.0016668638545293297}, {"task_name": "GEM/web_nlg_en", "prompt_name": "PALM_prompt", "rougeLsum_recall": 0.30071552450108774, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "3e41305c-5461-4cf3-853d-8a6fb5747623", "prompt_jinja": "I will verbalize an abstract representation of a sentence in natural language. To do so, I will first show the representation and then the natural language. The text needs to include all of the information in the representation.\n\n{{input | join(\", \")}} {% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rougeLsum_recall_stderr": 0.004440903116419319}, {"task_name": "GEM/web_nlg_en", "prompt_name": "PALM_prompt", "rougeLsum_fmeasure": 0.1085381091863139, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "3e41305c-5461-4cf3-853d-8a6fb5747623", "prompt_jinja": "I will verbalize an abstract representation of a sentence in natural language. To do so, I will first show the representation and then the natural language. 
The text needs to include all of the information in the representation.\n\n{{input | join(\", \")}} {% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rougeLsum_fmeasure_stderr": 0.0019824670507400145}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b12bc4opt4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 1, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b12bc4opt4/evaluation/generation/agg.4b284b12bc4opt4_GEM-web_nlg_en_PALM_prompt_2.json b/4b284b12bc4opt4/evaluation/generation/agg.4b284b12bc4opt4_GEM-web_nlg_en_PALM_prompt_2.json new file mode 100644 index 0000000000000000000000000000000000000000..8ea596527717eeb77492af76d58459ffc864eef7 --- /dev/null +++ b/4b284b12bc4opt4/evaluation/generation/agg.4b284b12bc4opt4_GEM-web_nlg_en_PALM_prompt_2.json @@ -0,0 +1 @@ +{"results": [{"task_name": "GEM/web_nlg_en", "prompt_name": "PALM_prompt", "bleu": 0.35329168677191275, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "3e41305c-5461-4cf3-853d-8a6fb5747623", "prompt_jinja": "I will verbalize an abstract representation of a sentence in natural language. To do so, I will first show the representation and then the natural language. The text needs to include all of the information in the representation.\n\n{{input | join(\", \")}} {% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "bleu_stderr": 0.03390701669686637}, {"task_name": "GEM/web_nlg_en", "prompt_name": "PALM_prompt", "rouge1_precision": 0.07636623292264108, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "3e41305c-5461-4cf3-853d-8a6fb5747623", "prompt_jinja": "I will verbalize an abstract representation of a sentence in natural language. To do so, I will first show the representation and then the natural language. The text needs to include all of the information in the representation.\n\n{{input | join(\", \")}} {% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rouge1_precision_stderr": 0.0016638254935029849}, {"task_name": "GEM/web_nlg_en", "prompt_name": "PALM_prompt", "rouge1_recall": 0.3287961533258121, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "3e41305c-5461-4cf3-853d-8a6fb5747623", "prompt_jinja": "I will verbalize an abstract representation of a sentence in natural language. To do so, I will first show the representation and then the natural language. 
The text needs to include all of the information in the representation.\n\n{{input | join(\", \")}} {% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rouge1_recall_stderr": 0.004764326327026456}, {"task_name": "GEM/web_nlg_en", "prompt_name": "PALM_prompt", "rouge1_fmeasure": 0.11535960559109319, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "3e41305c-5461-4cf3-853d-8a6fb5747623", "prompt_jinja": "I will verbalize an abstract representation of a sentence in natural language. To do so, I will first show the representation and then the natural language. The text needs to include all of the information in the representation.\n\n{{input | join(\", \")}} {% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rouge1_fmeasure_stderr": 0.002056790821407062}, {"task_name": "GEM/web_nlg_en", "prompt_name": "PALM_prompt", "rouge2_precision": 0.035234171439883326, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "3e41305c-5461-4cf3-853d-8a6fb5747623", "prompt_jinja": "I will verbalize an abstract representation of a sentence in natural language. To do so, I will first show the representation and then the natural language. The text needs to include all of the information in the representation.\n\n{{input | join(\", \")}} {% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rouge2_precision_stderr": 0.0009656833181951659}, {"task_name": "GEM/web_nlg_en", "prompt_name": "PALM_prompt", "rouge2_recall": 0.15891884434217565, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "3e41305c-5461-4cf3-853d-8a6fb5747623", "prompt_jinja": "I will verbalize an abstract representation of a sentence in natural language. To do so, I will first show the representation and then the natural language. The text needs to include all of the information in the representation.\n\n{{input | join(\", \")}} {% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rouge2_recall_stderr": 0.003347326475949424}, {"task_name": "GEM/web_nlg_en", "prompt_name": "PALM_prompt", "rouge2_fmeasure": 0.05384033369196494, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "3e41305c-5461-4cf3-853d-8a6fb5747623", "prompt_jinja": "I will verbalize an abstract representation of a sentence in natural language. To do so, I will first show the representation and then the natural language. The text needs to include all of the information in the representation.\n\n{{input | join(\", \")}} {% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rouge2_fmeasure_stderr": 0.0013019177568197539}, {"task_name": "GEM/web_nlg_en", "prompt_name": "PALM_prompt", "rougeL_precision": 0.07191370056044478, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "3e41305c-5461-4cf3-853d-8a6fb5747623", "prompt_jinja": "I will verbalize an abstract representation of a sentence in natural language. To do so, I will first show the representation and then the natural language. 
The text needs to include all of the information in the representation.\n\n{{input | join(\", \")}} {% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rougeL_precision_stderr": 0.00151309831707199}, {"task_name": "GEM/web_nlg_en", "prompt_name": "PALM_prompt", "rougeL_recall": 0.3139667684953094, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "3e41305c-5461-4cf3-853d-8a6fb5747623", "prompt_jinja": "I will verbalize an abstract representation of a sentence in natural language. To do so, I will first show the representation and then the natural language. The text needs to include all of the information in the representation.\n\n{{input | join(\", \")}} {% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rougeL_recall_stderr": 0.004595120348556269}, {"task_name": "GEM/web_nlg_en", "prompt_name": "PALM_prompt", "rougeL_fmeasure": 0.10900458168967593, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "3e41305c-5461-4cf3-853d-8a6fb5747623", "prompt_jinja": "I will verbalize an abstract representation of a sentence in natural language. To do so, I will first show the representation and then the natural language. The text needs to include all of the information in the representation.\n\n{{input | join(\", \")}} {% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rougeL_fmeasure_stderr": 0.0018752253984042533}, {"task_name": "GEM/web_nlg_en", "prompt_name": "PALM_prompt", "rougeLsum_precision": 0.07180086441278799, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "3e41305c-5461-4cf3-853d-8a6fb5747623", "prompt_jinja": "I will verbalize an abstract representation of a sentence in natural language. To do so, I will first show the representation and then the natural language. The text needs to include all of the information in the representation.\n\n{{input | join(\", \")}} {% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rougeLsum_precision_stderr": 0.0015621362000957913}, {"task_name": "GEM/web_nlg_en", "prompt_name": "PALM_prompt", "rougeLsum_recall": 0.308419107397528, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "3e41305c-5461-4cf3-853d-8a6fb5747623", "prompt_jinja": "I will verbalize an abstract representation of a sentence in natural language. To do so, I will first show the representation and then the natural language. The text needs to include all of the information in the representation.\n\n{{input | join(\", \")}} {% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rougeLsum_recall_stderr": 0.004350924818476407}, {"task_name": "GEM/web_nlg_en", "prompt_name": "PALM_prompt", "rougeLsum_fmeasure": 0.10835178632862826, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "3e41305c-5461-4cf3-853d-8a6fb5747623", "prompt_jinja": "I will verbalize an abstract representation of a sentence in natural language. To do so, I will first show the representation and then the natural language. 
The text needs to include all of the information in the representation.\n\n{{input | join(\", \")}} {% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rougeLsum_fmeasure_stderr": 0.0019132850106397554}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b12bc4opt4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 2, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b12bc4opt4/evaluation/generation/agg.4b284b12bc4opt4_GEM-web_nlg_en_PALM_prompt_3.json b/4b284b12bc4opt4/evaluation/generation/agg.4b284b12bc4opt4_GEM-web_nlg_en_PALM_prompt_3.json new file mode 100644 index 0000000000000000000000000000000000000000..16790a371e1c84359051c5b3cd5258d134a66385 --- /dev/null +++ b/4b284b12bc4opt4/evaluation/generation/agg.4b284b12bc4opt4_GEM-web_nlg_en_PALM_prompt_3.json @@ -0,0 +1 @@ +{"results": [{"task_name": "GEM/web_nlg_en", "prompt_name": "PALM_prompt", "bleu": 0.39131628859321554, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "3e41305c-5461-4cf3-853d-8a6fb5747623", "prompt_jinja": "I will verbalize an abstract representation of a sentence in natural language. To do so, I will first show the representation and then the natural language. The text needs to include all of the information in the representation.\n\n{{input | join(\", \")}} {% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "bleu_stderr": 0.026313116002416113}, {"task_name": "GEM/web_nlg_en", "prompt_name": "PALM_prompt", "rouge1_precision": 0.07564940705521785, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "3e41305c-5461-4cf3-853d-8a6fb5747623", "prompt_jinja": "I will verbalize an abstract representation of a sentence in natural language. To do so, I will first show the representation and then the natural language. The text needs to include all of the information in the representation.\n\n{{input | join(\", \")}} {% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rouge1_precision_stderr": 0.0016901779366449113}, {"task_name": "GEM/web_nlg_en", "prompt_name": "PALM_prompt", "rouge1_recall": 0.3252629322591383, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "3e41305c-5461-4cf3-853d-8a6fb5747623", "prompt_jinja": "I will verbalize an abstract representation of a sentence in natural language. To do so, I will first show the representation and then the natural language. 
The text needs to include all of the information in the representation.\n\n{{input | join(\", \")}} {% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rouge1_recall_stderr": 0.0047859412578955705}, {"task_name": "GEM/web_nlg_en", "prompt_name": "PALM_prompt", "rouge1_fmeasure": 0.11391076313881403, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "3e41305c-5461-4cf3-853d-8a6fb5747623", "prompt_jinja": "I will verbalize an abstract representation of a sentence in natural language. To do so, I will first show the representation and then the natural language. The text needs to include all of the information in the representation.\n\n{{input | join(\", \")}} {% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rouge1_fmeasure_stderr": 0.0020936343693923527}, {"task_name": "GEM/web_nlg_en", "prompt_name": "PALM_prompt", "rouge2_precision": 0.03538953748428128, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "3e41305c-5461-4cf3-853d-8a6fb5747623", "prompt_jinja": "I will verbalize an abstract representation of a sentence in natural language. To do so, I will first show the representation and then the natural language. The text needs to include all of the information in the representation.\n\n{{input | join(\", \")}} {% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rouge2_precision_stderr": 0.0010004009583260043}, {"task_name": "GEM/web_nlg_en", "prompt_name": "PALM_prompt", "rouge2_recall": 0.1567691792380991, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "3e41305c-5461-4cf3-853d-8a6fb5747623", "prompt_jinja": "I will verbalize an abstract representation of a sentence in natural language. To do so, I will first show the representation and then the natural language. The text needs to include all of the information in the representation.\n\n{{input | join(\", \")}} {% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rouge2_recall_stderr": 0.003336003242576354}, {"task_name": "GEM/web_nlg_en", "prompt_name": "PALM_prompt", "rouge2_fmeasure": 0.05369363828556127, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "3e41305c-5461-4cf3-853d-8a6fb5747623", "prompt_jinja": "I will verbalize an abstract representation of a sentence in natural language. To do so, I will first show the representation and then the natural language. The text needs to include all of the information in the representation.\n\n{{input | join(\", \")}} {% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rouge2_fmeasure_stderr": 0.0013209945591844108}, {"task_name": "GEM/web_nlg_en", "prompt_name": "PALM_prompt", "rougeL_precision": 0.07119946415387049, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "3e41305c-5461-4cf3-853d-8a6fb5747623", "prompt_jinja": "I will verbalize an abstract representation of a sentence in natural language. To do so, I will first show the representation and then the natural language. 
The text needs to include all of the information in the representation.\n\n{{input | join(\", \")}} {% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rougeL_precision_stderr": 0.0015306573606216481}, {"task_name": "GEM/web_nlg_en", "prompt_name": "PALM_prompt", "rougeL_recall": 0.3103716691791309, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "3e41305c-5461-4cf3-853d-8a6fb5747623", "prompt_jinja": "I will verbalize an abstract representation of a sentence in natural language. To do so, I will first show the representation and then the natural language. The text needs to include all of the information in the representation.\n\n{{input | join(\", \")}} {% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rougeL_recall_stderr": 0.004600670503840457}, {"task_name": "GEM/web_nlg_en", "prompt_name": "PALM_prompt", "rougeL_fmeasure": 0.1076674209928578, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "3e41305c-5461-4cf3-853d-8a6fb5747623", "prompt_jinja": "I will verbalize an abstract representation of a sentence in natural language. To do so, I will first show the representation and then the natural language. The text needs to include all of the information in the representation.\n\n{{input | join(\", \")}} {% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rougeL_fmeasure_stderr": 0.0019026187637810328}, {"task_name": "GEM/web_nlg_en", "prompt_name": "PALM_prompt", "rougeLsum_precision": 0.07110671643080219, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "3e41305c-5461-4cf3-853d-8a6fb5747623", "prompt_jinja": "I will verbalize an abstract representation of a sentence in natural language. To do so, I will first show the representation and then the natural language. The text needs to include all of the information in the representation.\n\n{{input | join(\", \")}} {% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rougeLsum_precision_stderr": 0.0015812267457522326}, {"task_name": "GEM/web_nlg_en", "prompt_name": "PALM_prompt", "rougeLsum_recall": 0.30565042167460654, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "3e41305c-5461-4cf3-853d-8a6fb5747623", "prompt_jinja": "I will verbalize an abstract representation of a sentence in natural language. To do so, I will first show the representation and then the natural language. The text needs to include all of the information in the representation.\n\n{{input | join(\", \")}} {% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rougeLsum_recall_stderr": 0.004393496455416831}, {"task_name": "GEM/web_nlg_en", "prompt_name": "PALM_prompt", "rougeLsum_fmeasure": 0.10699621950964268, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "3e41305c-5461-4cf3-853d-8a6fb5747623", "prompt_jinja": "I will verbalize an abstract representation of a sentence in natural language. To do so, I will first show the representation and then the natural language. 
The text needs to include all of the information in the representation.\n\n{{input | join(\", \")}} {% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rougeLsum_fmeasure_stderr": 0.0019384608416758112}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b12bc4opt4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 3, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b12bc4opt4/evaluation/generation/agg.4b284b12bc4opt4_GEM-web_nlg_en_PALM_prompt_4.json b/4b284b12bc4opt4/evaluation/generation/agg.4b284b12bc4opt4_GEM-web_nlg_en_PALM_prompt_4.json new file mode 100644 index 0000000000000000000000000000000000000000..c04c3896a8d5e98e3c4ce71c507817e6208763cc --- /dev/null +++ b/4b284b12bc4opt4/evaluation/generation/agg.4b284b12bc4opt4_GEM-web_nlg_en_PALM_prompt_4.json @@ -0,0 +1 @@ +{"results": [{"task_name": "GEM/web_nlg_en", "prompt_name": "PALM_prompt", "bleu": 0.37052755017666045, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "3e41305c-5461-4cf3-853d-8a6fb5747623", "prompt_jinja": "I will verbalize an abstract representation of a sentence in natural language. To do so, I will first show the representation and then the natural language. The text needs to include all of the information in the representation.\n\n{{input | join(\", \")}} {% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "bleu_stderr": 0.022042441052051973}, {"task_name": "GEM/web_nlg_en", "prompt_name": "PALM_prompt", "rouge1_precision": 0.0748616883094487, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "3e41305c-5461-4cf3-853d-8a6fb5747623", "prompt_jinja": "I will verbalize an abstract representation of a sentence in natural language. To do so, I will first show the representation and then the natural language. The text needs to include all of the information in the representation.\n\n{{input | join(\", \")}} {% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rouge1_precision_stderr": 0.0016268791916451487}, {"task_name": "GEM/web_nlg_en", "prompt_name": "PALM_prompt", "rouge1_recall": 0.32859336280130663, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "3e41305c-5461-4cf3-853d-8a6fb5747623", "prompt_jinja": "I will verbalize an abstract representation of a sentence in natural language. To do so, I will first show the representation and then the natural language. 
The text needs to include all of the information in the representation.\n\n{{input | join(\", \")}} {% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rouge1_recall_stderr": 0.004763314479505647}, {"task_name": "GEM/web_nlg_en", "prompt_name": "PALM_prompt", "rouge1_fmeasure": 0.11339049154327531, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "3e41305c-5461-4cf3-853d-8a6fb5747623", "prompt_jinja": "I will verbalize an abstract representation of a sentence in natural language. To do so, I will first show the representation and then the natural language. The text needs to include all of the information in the representation.\n\n{{input | join(\", \")}} {% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rouge1_fmeasure_stderr": 0.002096303472052317}, {"task_name": "GEM/web_nlg_en", "prompt_name": "PALM_prompt", "rouge2_precision": 0.03464054004561626, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "3e41305c-5461-4cf3-853d-8a6fb5747623", "prompt_jinja": "I will verbalize an abstract representation of a sentence in natural language. To do so, I will first show the representation and then the natural language. The text needs to include all of the information in the representation.\n\n{{input | join(\", \")}} {% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rouge2_precision_stderr": 0.0009949762918573578}, {"task_name": "GEM/web_nlg_en", "prompt_name": "PALM_prompt", "rouge2_recall": 0.157960165962179, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "3e41305c-5461-4cf3-853d-8a6fb5747623", "prompt_jinja": "I will verbalize an abstract representation of a sentence in natural language. To do so, I will first show the representation and then the natural language. The text needs to include all of the information in the representation.\n\n{{input | join(\", \")}} {% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rouge2_recall_stderr": 0.003381278298836932}, {"task_name": "GEM/web_nlg_en", "prompt_name": "PALM_prompt", "rouge2_fmeasure": 0.052775854126686714, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "3e41305c-5461-4cf3-853d-8a6fb5747623", "prompt_jinja": "I will verbalize an abstract representation of a sentence in natural language. To do so, I will first show the representation and then the natural language. The text needs to include all of the information in the representation.\n\n{{input | join(\", \")}} {% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rouge2_fmeasure_stderr": 0.0013204579265960609}, {"task_name": "GEM/web_nlg_en", "prompt_name": "PALM_prompt", "rougeL_precision": 0.0700069922960969, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "3e41305c-5461-4cf3-853d-8a6fb5747623", "prompt_jinja": "I will verbalize an abstract representation of a sentence in natural language. To do so, I will first show the representation and then the natural language. 
The text needs to include all of the information in the representation.\n\n{{input | join(\", \")}} {% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rougeL_precision_stderr": 0.001456781799696283}, {"task_name": "GEM/web_nlg_en", "prompt_name": "PALM_prompt", "rougeL_recall": 0.31134498667272115, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "3e41305c-5461-4cf3-853d-8a6fb5747623", "prompt_jinja": "I will verbalize an abstract representation of a sentence in natural language. To do so, I will first show the representation and then the natural language. The text needs to include all of the information in the representation.\n\n{{input | join(\", \")}} {% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rougeL_recall_stderr": 0.004535027041515096}, {"task_name": "GEM/web_nlg_en", "prompt_name": "PALM_prompt", "rougeL_fmeasure": 0.1064438415118096, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "3e41305c-5461-4cf3-853d-8a6fb5747623", "prompt_jinja": "I will verbalize an abstract representation of a sentence in natural language. To do so, I will first show the representation and then the natural language. The text needs to include all of the information in the representation.\n\n{{input | join(\", \")}} {% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rougeL_fmeasure_stderr": 0.001899753706201498}, {"task_name": "GEM/web_nlg_en", "prompt_name": "PALM_prompt", "rougeLsum_precision": 0.07049420755664745, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "3e41305c-5461-4cf3-853d-8a6fb5747623", "prompt_jinja": "I will verbalize an abstract representation of a sentence in natural language. To do so, I will first show the representation and then the natural language. The text needs to include all of the information in the representation.\n\n{{input | join(\", \")}} {% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rougeLsum_precision_stderr": 0.001517990950279307}, {"task_name": "GEM/web_nlg_en", "prompt_name": "PALM_prompt", "rougeLsum_recall": 0.3098059888344558, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "3e41305c-5461-4cf3-853d-8a6fb5747623", "prompt_jinja": "I will verbalize an abstract representation of a sentence in natural language. To do so, I will first show the representation and then the natural language. The text needs to include all of the information in the representation.\n\n{{input | join(\", \")}} {% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rougeLsum_recall_stderr": 0.004434437168281575}, {"task_name": "GEM/web_nlg_en", "prompt_name": "PALM_prompt", "rougeLsum_fmeasure": 0.10675864203685105, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "3e41305c-5461-4cf3-853d-8a6fb5747623", "prompt_jinja": "I will verbalize an abstract representation of a sentence in natural language. To do so, I will first show the representation and then the natural language. 
The text needs to include all of the information in the representation.\n\n{{input | join(\", \")}} {% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rougeLsum_fmeasure_stderr": 0.0019563539894181738}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b12bc4opt4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 4, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b12bc4opt4/evaluation/generation/agg.4b284b12bc4opt4_GEM-web_nlg_en_PALM_prompt_5.json b/4b284b12bc4opt4/evaluation/generation/agg.4b284b12bc4opt4_GEM-web_nlg_en_PALM_prompt_5.json new file mode 100644 index 0000000000000000000000000000000000000000..d1b68c473eecd728f3b1efd1f5f24d9968707121 --- /dev/null +++ b/4b284b12bc4opt4/evaluation/generation/agg.4b284b12bc4opt4_GEM-web_nlg_en_PALM_prompt_5.json @@ -0,0 +1 @@ +{"results": [{"task_name": "GEM/web_nlg_en", "prompt_name": "PALM_prompt", "bleu": 0.35995256263213665, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "3e41305c-5461-4cf3-853d-8a6fb5747623", "prompt_jinja": "I will verbalize an abstract representation of a sentence in natural language. To do so, I will first show the representation and then the natural language. The text needs to include all of the information in the representation.\n\n{{input | join(\", \")}} {% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "bleu_stderr": 0.03764670650407796}, {"task_name": "GEM/web_nlg_en", "prompt_name": "PALM_prompt", "rouge1_precision": 0.07365790588568459, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "3e41305c-5461-4cf3-853d-8a6fb5747623", "prompt_jinja": "I will verbalize an abstract representation of a sentence in natural language. To do so, I will first show the representation and then the natural language. The text needs to include all of the information in the representation.\n\n{{input | join(\", \")}} {% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rouge1_precision_stderr": 0.0017121949707532308}, {"task_name": "GEM/web_nlg_en", "prompt_name": "PALM_prompt", "rouge1_recall": 0.3228153530836222, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "3e41305c-5461-4cf3-853d-8a6fb5747623", "prompt_jinja": "I will verbalize an abstract representation of a sentence in natural language. To do so, I will first show the representation and then the natural language. 
The text needs to include all of the information in the representation.\n\n{{input | join(\", \")}} {% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rouge1_recall_stderr": 0.004694269135211919}, {"task_name": "GEM/web_nlg_en", "prompt_name": "PALM_prompt", "rouge1_fmeasure": 0.11087540816858653, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "3e41305c-5461-4cf3-853d-8a6fb5747623", "prompt_jinja": "I will verbalize an abstract representation of a sentence in natural language. To do so, I will first show the representation and then the natural language. The text needs to include all of the information in the representation.\n\n{{input | join(\", \")}} {% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rouge1_fmeasure_stderr": 0.0020724670013026}, {"task_name": "GEM/web_nlg_en", "prompt_name": "PALM_prompt", "rouge2_precision": 0.03370563835754997, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "3e41305c-5461-4cf3-853d-8a6fb5747623", "prompt_jinja": "I will verbalize an abstract representation of a sentence in natural language. To do so, I will first show the representation and then the natural language. The text needs to include all of the information in the representation.\n\n{{input | join(\", \")}} {% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rouge2_precision_stderr": 0.000982560364849555}, {"task_name": "GEM/web_nlg_en", "prompt_name": "PALM_prompt", "rouge2_recall": 0.15192330946847016, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "3e41305c-5461-4cf3-853d-8a6fb5747623", "prompt_jinja": "I will verbalize an abstract representation of a sentence in natural language. To do so, I will first show the representation and then the natural language. The text needs to include all of the information in the representation.\n\n{{input | join(\", \")}} {% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rouge2_recall_stderr": 0.0033144184826216097}, {"task_name": "GEM/web_nlg_en", "prompt_name": "PALM_prompt", "rouge2_fmeasure": 0.05131186762124571, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "3e41305c-5461-4cf3-853d-8a6fb5747623", "prompt_jinja": "I will verbalize an abstract representation of a sentence in natural language. To do so, I will first show the representation and then the natural language. The text needs to include all of the information in the representation.\n\n{{input | join(\", \")}} {% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rouge2_fmeasure_stderr": 0.001301751075510561}, {"task_name": "GEM/web_nlg_en", "prompt_name": "PALM_prompt", "rougeL_precision": 0.06873278486701989, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "3e41305c-5461-4cf3-853d-8a6fb5747623", "prompt_jinja": "I will verbalize an abstract representation of a sentence in natural language. To do so, I will first show the representation and then the natural language. 
The text needs to include all of the information in the representation.\n\n{{input | join(\", \")}} {% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rougeL_precision_stderr": 0.0014938317734199177}, {"task_name": "GEM/web_nlg_en", "prompt_name": "PALM_prompt", "rougeL_recall": 0.3061441130232553, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "3e41305c-5461-4cf3-853d-8a6fb5747623", "prompt_jinja": "I will verbalize an abstract representation of a sentence in natural language. To do so, I will first show the representation and then the natural language. The text needs to include all of the information in the representation.\n\n{{input | join(\", \")}} {% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rougeL_recall_stderr": 0.00448586985770301}, {"task_name": "GEM/web_nlg_en", "prompt_name": "PALM_prompt", "rougeL_fmeasure": 0.10413194280026901, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "3e41305c-5461-4cf3-853d-8a6fb5747623", "prompt_jinja": "I will verbalize an abstract representation of a sentence in natural language. To do so, I will first show the representation and then the natural language. The text needs to include all of the information in the representation.\n\n{{input | join(\", \")}} {% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rougeL_fmeasure_stderr": 0.0018759453675962236}, {"task_name": "GEM/web_nlg_en", "prompt_name": "PALM_prompt", "rougeLsum_precision": 0.06886164410317586, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "3e41305c-5461-4cf3-853d-8a6fb5747623", "prompt_jinja": "I will verbalize an abstract representation of a sentence in natural language. To do so, I will first show the representation and then the natural language. The text needs to include all of the information in the representation.\n\n{{input | join(\", \")}} {% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rougeLsum_precision_stderr": 0.0015474980166708774}, {"task_name": "GEM/web_nlg_en", "prompt_name": "PALM_prompt", "rougeLsum_recall": 0.30297784804468764, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "3e41305c-5461-4cf3-853d-8a6fb5747623", "prompt_jinja": "I will verbalize an abstract representation of a sentence in natural language. To do so, I will first show the representation and then the natural language. The text needs to include all of the information in the representation.\n\n{{input | join(\", \")}} {% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rougeLsum_recall_stderr": 0.004342846233116827}, {"task_name": "GEM/web_nlg_en", "prompt_name": "PALM_prompt", "rougeLsum_fmeasure": 0.10388096228686042, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "3e41305c-5461-4cf3-853d-8a6fb5747623", "prompt_jinja": "I will verbalize an abstract representation of a sentence in natural language. To do so, I will first show the representation and then the natural language. 
The text needs to include all of the information in the representation.\n\n{{input | join(\", \")}} {% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rougeLsum_fmeasure_stderr": 0.0019168887238182983}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b12bc4opt4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 5, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b12bc4opt4/evaluation/generation/agg.4b284b12bc4opt4_GEM-wiki_lingua_en_tldr_en_0.json b/4b284b12bc4opt4/evaluation/generation/agg.4b284b12bc4opt4_GEM-wiki_lingua_en_tldr_en_0.json new file mode 100644 index 0000000000000000000000000000000000000000..b984289ba814768fcd4cfaef0cc8bb92f958f8fd --- /dev/null +++ b/4b284b12bc4opt4/evaluation/generation/agg.4b284b12bc4opt4_GEM-wiki_lingua_en_tldr_en_0.json @@ -0,0 +1 @@ +{"results": [{"task_name": "GEM/wiki_lingua_en", "prompt_name": "tldr_en", "rouge1_precision": 0.06974904696556859, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "d3c5baa3-5e37-46f8-b1b2-5b834181c9da", "prompt_jinja": "{{source}}\n\nTL;DR in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge1_precision_stderr": 0.0011614879719000074}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "tldr_en", "rouge1_recall": 0.11241125442105249, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "d3c5baa3-5e37-46f8-b1b2-5b834181c9da", "prompt_jinja": "{{source}}\n\nTL;DR in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge1_recall_stderr": 0.0014600463110962813}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "tldr_en", "rouge1_fmeasure": 0.07993097912178979, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "d3c5baa3-5e37-46f8-b1b2-5b834181c9da", "prompt_jinja": "{{source}}\n\nTL;DR in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge1_fmeasure_stderr": 0.0011136701948783593}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "tldr_en", "rouge2_precision": 0.0038814664404484087, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "d3c5baa3-5e37-46f8-b1b2-5b834181c9da", "prompt_jinja": "{{source}}\n\nTL;DR in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge2_precision_stderr": 0.00020315036956781887}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "tldr_en", "rouge2_recall": 0.005354585862023678, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "d3c5baa3-5e37-46f8-b1b2-5b834181c9da", "prompt_jinja": "{{source}}\n\nTL;DR in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge2_recall_stderr": 0.0002836354871500975}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "tldr_en", "rouge2_fmeasure": 0.004179116314891411, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": 
"d3c5baa3-5e37-46f8-b1b2-5b834181c9da", "prompt_jinja": "{{source}}\n\nTL;DR in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge2_fmeasure_stderr": 0.00020937586745221478}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "tldr_en", "rougeL_precision": 0.06608672332133624, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "d3c5baa3-5e37-46f8-b1b2-5b834181c9da", "prompt_jinja": "{{source}}\n\nTL;DR in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeL_precision_stderr": 0.001045278233772761}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "tldr_en", "rougeL_recall": 0.10847163328970093, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "d3c5baa3-5e37-46f8-b1b2-5b834181c9da", "prompt_jinja": "{{source}}\n\nTL;DR in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeL_recall_stderr": 0.0014019598783824951}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "tldr_en", "rougeL_fmeasure": 0.0763332930371866, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "d3c5baa3-5e37-46f8-b1b2-5b834181c9da", "prompt_jinja": "{{source}}\n\nTL;DR in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeL_fmeasure_stderr": 0.001024817413877872}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "tldr_en", "rougeLsum_precision": 0.06486651184386755, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "d3c5baa3-5e37-46f8-b1b2-5b834181c9da", "prompt_jinja": "{{source}}\n\nTL;DR in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeLsum_precision_stderr": 0.001063656610383382}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "tldr_en", "rougeLsum_recall": 0.10550878356686602, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "d3c5baa3-5e37-46f8-b1b2-5b834181c9da", "prompt_jinja": "{{source}}\n\nTL;DR in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeLsum_recall_stderr": 0.001363292887440063}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "tldr_en", "rougeLsum_fmeasure": 0.07455065795690266, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "d3c5baa3-5e37-46f8-b1b2-5b834181c9da", "prompt_jinja": "{{source}}\n\nTL;DR in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeLsum_fmeasure_stderr": 0.0010195929073391948}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "tldr_en", "bleu": 0.06405097746568524, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "d3c5baa3-5e37-46f8-b1b2-5b834181c9da", "prompt_jinja": "{{source}}\n\nTL;DR in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "bleu_stderr": 0.014294784040984042}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b12bc4opt4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 0, "batch_size": 16, "device": "cuda", "use_cache": false, 
"limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b12bc4opt4/evaluation/generation/agg.4b284b12bc4opt4_GEM-wiki_lingua_en_tldr_en_1.json b/4b284b12bc4opt4/evaluation/generation/agg.4b284b12bc4opt4_GEM-wiki_lingua_en_tldr_en_1.json new file mode 100644 index 0000000000000000000000000000000000000000..bfc44d719ba8f24e961ccd4fafed754498acf199 --- /dev/null +++ b/4b284b12bc4opt4/evaluation/generation/agg.4b284b12bc4opt4_GEM-wiki_lingua_en_tldr_en_1.json @@ -0,0 +1 @@ +{"results": [{"task_name": "GEM/wiki_lingua_en", "prompt_name": "tldr_en", "rouge1_precision": 0.07967563985483712, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "d3c5baa3-5e37-46f8-b1b2-5b834181c9da", "prompt_jinja": "{{source}}\n\nTL;DR in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge1_precision_stderr": 0.0011981791486243532}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "tldr_en", "rouge1_recall": 0.13333453575974324, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "d3c5baa3-5e37-46f8-b1b2-5b834181c9da", "prompt_jinja": "{{source}}\n\nTL;DR in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge1_recall_stderr": 0.0015456880808194433}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "tldr_en", "rouge1_fmeasure": 0.09272818816907549, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "d3c5baa3-5e37-46f8-b1b2-5b834181c9da", "prompt_jinja": "{{source}}\n\nTL;DR in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge1_fmeasure_stderr": 0.0011490248194294334}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "tldr_en", "rouge2_precision": 0.0031182028953535613, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "d3c5baa3-5e37-46f8-b1b2-5b834181c9da", "prompt_jinja": "{{source}}\n\nTL;DR in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge2_precision_stderr": 0.0001763228556558668}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "tldr_en", "rouge2_recall": 0.005182712565249269, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "d3c5baa3-5e37-46f8-b1b2-5b834181c9da", "prompt_jinja": "{{source}}\n\nTL;DR in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge2_recall_stderr": 0.00033464377765989186}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "tldr_en", "rouge2_fmeasure": 0.003531715774833695, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "d3c5baa3-5e37-46f8-b1b2-5b834181c9da", "prompt_jinja": "{{source}}\n\nTL;DR in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge2_fmeasure_stderr": 0.00019054232118964935}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "tldr_en", "rougeL_precision": 0.07295443320214035, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "d3c5baa3-5e37-46f8-b1b2-5b834181c9da", "prompt_jinja": "{{source}}\n\nTL;DR in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeL_precision_stderr": 0.0010133973838849122}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": 
"tldr_en", "rougeL_recall": 0.1250877049808009, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "d3c5baa3-5e37-46f8-b1b2-5b834181c9da", "prompt_jinja": "{{source}}\n\nTL;DR in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeL_recall_stderr": 0.0014330876586568309}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "tldr_en", "rougeL_fmeasure": 0.08573108085907154, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "d3c5baa3-5e37-46f8-b1b2-5b834181c9da", "prompt_jinja": "{{source}}\n\nTL;DR in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeL_fmeasure_stderr": 0.0009959675791518386}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "tldr_en", "rougeLsum_precision": 0.07428908435430764, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "d3c5baa3-5e37-46f8-b1b2-5b834181c9da", "prompt_jinja": "{{source}}\n\nTL;DR in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeLsum_precision_stderr": 0.0011046337929190013}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "tldr_en", "rougeLsum_recall": 0.1251193589727116, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "d3c5baa3-5e37-46f8-b1b2-5b834181c9da", "prompt_jinja": "{{source}}\n\nTL;DR in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeLsum_recall_stderr": 0.0014467114410570768}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "tldr_en", "rougeLsum_fmeasure": 0.08666221587131816, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "d3c5baa3-5e37-46f8-b1b2-5b834181c9da", "prompt_jinja": "{{source}}\n\nTL;DR in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeLsum_fmeasure_stderr": 0.0010615368892029633}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "tldr_en", "bleu": 0.08853135918871324, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "d3c5baa3-5e37-46f8-b1b2-5b834181c9da", "prompt_jinja": "{{source}}\n\nTL;DR in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "bleu_stderr": 0.020831959008675835}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b12bc4opt4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 1, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b12bc4opt4/evaluation/generation/agg.4b284b12bc4opt4_GEM-wiki_lingua_en_tldr_en_2.json b/4b284b12bc4opt4/evaluation/generation/agg.4b284b12bc4opt4_GEM-wiki_lingua_en_tldr_en_2.json new file mode 100644 index 0000000000000000000000000000000000000000..e91aae08ecaf812fe9314a2ebd6d8b69b968584c --- /dev/null +++ b/4b284b12bc4opt4/evaluation/generation/agg.4b284b12bc4opt4_GEM-wiki_lingua_en_tldr_en_2.json @@ -0,0 +1 @@ +{"results": [{"task_name": "GEM/wiki_lingua_en", "prompt_name": "tldr_en", "rouge1_precision": 0.0821044769548865, "fixed_answer_choice_list": null, 
"dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "d3c5baa3-5e37-46f8-b1b2-5b834181c9da", "prompt_jinja": "{{source}}\n\nTL;DR in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge1_precision_stderr": 0.0012109023007897738}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "tldr_en", "rouge1_recall": 0.13827872380494466, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "d3c5baa3-5e37-46f8-b1b2-5b834181c9da", "prompt_jinja": "{{source}}\n\nTL;DR in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge1_recall_stderr": 0.0016474383605086832}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "tldr_en", "rouge1_fmeasure": 0.09561825103521294, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "d3c5baa3-5e37-46f8-b1b2-5b834181c9da", "prompt_jinja": "{{source}}\n\nTL;DR in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge1_fmeasure_stderr": 0.0011752321521538633}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "tldr_en", "rouge2_precision": 0.0037438880758047005, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "d3c5baa3-5e37-46f8-b1b2-5b834181c9da", "prompt_jinja": "{{source}}\n\nTL;DR in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge2_precision_stderr": 0.00021045724159333664}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "tldr_en", "rouge2_recall": 0.006616313525097841, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "d3c5baa3-5e37-46f8-b1b2-5b834181c9da", "prompt_jinja": "{{source}}\n\nTL;DR in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge2_recall_stderr": 0.00042267274141401527}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "tldr_en", "rouge2_fmeasure": 0.004325117399126492, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "d3c5baa3-5e37-46f8-b1b2-5b834181c9da", "prompt_jinja": "{{source}}\n\nTL;DR in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge2_fmeasure_stderr": 0.0002352094266785428}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "tldr_en", "rougeL_precision": 0.07475764548912422, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "d3c5baa3-5e37-46f8-b1b2-5b834181c9da", "prompt_jinja": "{{source}}\n\nTL;DR in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeL_precision_stderr": 0.0010236395736361035}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "tldr_en", "rougeL_recall": 0.1281885979875213, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "d3c5baa3-5e37-46f8-b1b2-5b834181c9da", "prompt_jinja": "{{source}}\n\nTL;DR in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeL_recall_stderr": 0.0014803317472057867}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "tldr_en", "rougeL_fmeasure": 0.08768328186526594, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "d3c5baa3-5e37-46f8-b1b2-5b834181c9da", "prompt_jinja": "{{source}}\n\nTL;DR in 
English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeL_fmeasure_stderr": 0.0010046872049285587}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "tldr_en", "rougeLsum_precision": 0.07537568417123489, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "d3c5baa3-5e37-46f8-b1b2-5b834181c9da", "prompt_jinja": "{{source}}\n\nTL;DR in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeLsum_precision_stderr": 0.0011026893753818762}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "tldr_en", "rougeLsum_recall": 0.12800436919065455, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "d3c5baa3-5e37-46f8-b1b2-5b834181c9da", "prompt_jinja": "{{source}}\n\nTL;DR in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeLsum_recall_stderr": 0.0015289279450751802}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "tldr_en", "rougeLsum_fmeasure": 0.08802189261932684, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "d3c5baa3-5e37-46f8-b1b2-5b834181c9da", "prompt_jinja": "{{source}}\n\nTL;DR in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeLsum_fmeasure_stderr": 0.0010727115935503444}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "tldr_en", "bleu": 0.16015334774145826, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "d3c5baa3-5e37-46f8-b1b2-5b834181c9da", "prompt_jinja": "{{source}}\n\nTL;DR in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "bleu_stderr": 0.023734817784982588}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b12bc4opt4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 2, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b12bc4opt4/evaluation/generation/agg.4b284b12bc4opt4_GEM-wiki_lingua_en_tldr_en_3.json b/4b284b12bc4opt4/evaluation/generation/agg.4b284b12bc4opt4_GEM-wiki_lingua_en_tldr_en_3.json new file mode 100644 index 0000000000000000000000000000000000000000..6a1232065526430ad01c279ec8b14d3c4a7dbd08 --- /dev/null +++ b/4b284b12bc4opt4/evaluation/generation/agg.4b284b12bc4opt4_GEM-wiki_lingua_en_tldr_en_3.json @@ -0,0 +1 @@ +{"results": [{"task_name": "GEM/wiki_lingua_en", "prompt_name": "tldr_en", "rouge1_precision": 0.07398914178321388, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "d3c5baa3-5e37-46f8-b1b2-5b834181c9da", "prompt_jinja": "{{source}}\n\nTL;DR in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge1_precision_stderr": 0.0014407291212392703}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "tldr_en", "rouge1_recall": 0.1182771258048201, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "d3c5baa3-5e37-46f8-b1b2-5b834181c9da", "prompt_jinja": "{{source}}\n\nTL;DR in English: ||| {{target}}", "prompt_original_task": true, "comment": "", 
"rouge1_recall_stderr": 0.0018198887142112982}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "tldr_en", "rouge1_fmeasure": 0.08208715110302196, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "d3c5baa3-5e37-46f8-b1b2-5b834181c9da", "prompt_jinja": "{{source}}\n\nTL;DR in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge1_fmeasure_stderr": 0.0012831764702565588}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "tldr_en", "rouge2_precision": 0.004020145992604983, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "d3c5baa3-5e37-46f8-b1b2-5b834181c9da", "prompt_jinja": "{{source}}\n\nTL;DR in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge2_precision_stderr": 0.00026592766536485463}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "tldr_en", "rouge2_recall": 0.006665406872296463, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "d3c5baa3-5e37-46f8-b1b2-5b834181c9da", "prompt_jinja": "{{source}}\n\nTL;DR in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge2_recall_stderr": 0.0004654137487344057}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "tldr_en", "rouge2_fmeasure": 0.004456026661162212, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "d3c5baa3-5e37-46f8-b1b2-5b834181c9da", "prompt_jinja": "{{source}}\n\nTL;DR in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge2_fmeasure_stderr": 0.0002758499131281591}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "tldr_en", "rougeL_precision": 0.06723906063892546, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "d3c5baa3-5e37-46f8-b1b2-5b834181c9da", "prompt_jinja": "{{source}}\n\nTL;DR in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeL_precision_stderr": 0.0012458979372848018}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "tldr_en", "rougeL_recall": 0.10954540525765066, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "d3c5baa3-5e37-46f8-b1b2-5b834181c9da", "prompt_jinja": "{{source}}\n\nTL;DR in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeL_recall_stderr": 0.0016400029407562262}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "tldr_en", "rougeL_fmeasure": 0.07512642856203387, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "d3c5baa3-5e37-46f8-b1b2-5b834181c9da", "prompt_jinja": "{{source}}\n\nTL;DR in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeL_fmeasure_stderr": 0.001099007590251846}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "tldr_en", "rougeLsum_precision": 0.0688188354810642, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "d3c5baa3-5e37-46f8-b1b2-5b834181c9da", "prompt_jinja": "{{source}}\n\nTL;DR in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeLsum_precision_stderr": 0.0013457162556628375}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "tldr_en", "rougeLsum_recall": 0.11040591385464732, 
"fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "d3c5baa3-5e37-46f8-b1b2-5b834181c9da", "prompt_jinja": "{{source}}\n\nTL;DR in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeLsum_recall_stderr": 0.0017010360876035554}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "tldr_en", "rougeLsum_fmeasure": 0.07638882046568143, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "d3c5baa3-5e37-46f8-b1b2-5b834181c9da", "prompt_jinja": "{{source}}\n\nTL;DR in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeLsum_fmeasure_stderr": 0.0011908718046903589}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "tldr_en", "bleu": 0.2614442095298893, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "d3c5baa3-5e37-46f8-b1b2-5b834181c9da", "prompt_jinja": "{{source}}\n\nTL;DR in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "bleu_stderr": 0.028570775000506844}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b12bc4opt4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 3, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b12bc4opt4/evaluation/generation/agg.4b284b12bc4opt4_GEM-wiki_lingua_en_tldr_en_4.json b/4b284b12bc4opt4/evaluation/generation/agg.4b284b12bc4opt4_GEM-wiki_lingua_en_tldr_en_4.json new file mode 100644 index 0000000000000000000000000000000000000000..6721885fa826a55ecab05c1a0b48b61b96abe0b2 --- /dev/null +++ b/4b284b12bc4opt4/evaluation/generation/agg.4b284b12bc4opt4_GEM-wiki_lingua_en_tldr_en_4.json @@ -0,0 +1 @@ +{"results": [{"task_name": "GEM/wiki_lingua_en", "prompt_name": "tldr_en", "rouge1_precision": 0.0280761104871091, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "d3c5baa3-5e37-46f8-b1b2-5b834181c9da", "prompt_jinja": "{{source}}\n\nTL;DR in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge1_precision_stderr": 0.0012926400012494954}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "tldr_en", "rouge1_recall": 0.04256550951472789, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "d3c5baa3-5e37-46f8-b1b2-5b834181c9da", "prompt_jinja": "{{source}}\n\nTL;DR in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge1_recall_stderr": 0.0015662670220772453}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "tldr_en", "rouge1_fmeasure": 0.02864898102166578, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "d3c5baa3-5e37-46f8-b1b2-5b834181c9da", "prompt_jinja": "{{source}}\n\nTL;DR in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge1_fmeasure_stderr": 0.0010448235312037137}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "tldr_en", "rouge2_precision": 0.0015031467154335884, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": 
null, "prompt_id": "d3c5baa3-5e37-46f8-b1b2-5b834181c9da", "prompt_jinja": "{{source}}\n\nTL;DR in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge2_precision_stderr": 0.00016818655996840296}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "tldr_en", "rouge2_recall": 0.002661081711747737, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "d3c5baa3-5e37-46f8-b1b2-5b834181c9da", "prompt_jinja": "{{source}}\n\nTL;DR in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge2_recall_stderr": 0.0003367092996858654}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "tldr_en", "rouge2_fmeasure": 0.0016218547111402145, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "d3c5baa3-5e37-46f8-b1b2-5b834181c9da", "prompt_jinja": "{{source}}\n\nTL;DR in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge2_fmeasure_stderr": 0.00016643077121423323}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "tldr_en", "rougeL_precision": 0.025283329619297263, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "d3c5baa3-5e37-46f8-b1b2-5b834181c9da", "prompt_jinja": "{{source}}\n\nTL;DR in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeL_precision_stderr": 0.00116903072674806}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "tldr_en", "rougeL_recall": 0.039170970833320586, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "d3c5baa3-5e37-46f8-b1b2-5b834181c9da", "prompt_jinja": "{{source}}\n\nTL;DR in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeL_recall_stderr": 0.0014262971628748195}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "tldr_en", "rougeL_fmeasure": 0.025906000131277095, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "d3c5baa3-5e37-46f8-b1b2-5b834181c9da", "prompt_jinja": "{{source}}\n\nTL;DR in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeL_fmeasure_stderr": 0.0009115419100498881}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "tldr_en", "rougeLsum_precision": 0.026006848089608563, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "d3c5baa3-5e37-46f8-b1b2-5b834181c9da", "prompt_jinja": "{{source}}\n\nTL;DR in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeLsum_precision_stderr": 0.0011953058942308498}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "tldr_en", "rougeLsum_recall": 0.039653057024522397, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "d3c5baa3-5e37-46f8-b1b2-5b834181c9da", "prompt_jinja": "{{source}}\n\nTL;DR in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeLsum_recall_stderr": 0.0014650811160356375}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "tldr_en", "rougeLsum_fmeasure": 0.02654490920386107, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "d3c5baa3-5e37-46f8-b1b2-5b834181c9da", "prompt_jinja": "{{source}}\n\nTL;DR in English: ||| {{target}}", "prompt_original_task": true, 
"comment": "", "rougeLsum_fmeasure_stderr": 0.0009630631658391304}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "tldr_en", "bleu": 0.06670432998840774, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "d3c5baa3-5e37-46f8-b1b2-5b834181c9da", "prompt_jinja": "{{source}}\n\nTL;DR in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "bleu_stderr": 0.010834402499150991}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b12bc4opt4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 4, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b12bc4opt4/evaluation/generation/agg.4b284b12bc4opt4_GEM-wiki_lingua_en_tldr_en_5.json b/4b284b12bc4opt4/evaluation/generation/agg.4b284b12bc4opt4_GEM-wiki_lingua_en_tldr_en_5.json new file mode 100644 index 0000000000000000000000000000000000000000..22b1434ad59afc7d44d6a22189f928f9c10e9560 --- /dev/null +++ b/4b284b12bc4opt4/evaluation/generation/agg.4b284b12bc4opt4_GEM-wiki_lingua_en_tldr_en_5.json @@ -0,0 +1 @@ +{"results": [{"task_name": "GEM/wiki_lingua_en", "prompt_name": "tldr_en", "rouge1_precision": 0.0049355335861678085, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "d3c5baa3-5e37-46f8-b1b2-5b834181c9da", "prompt_jinja": "{{source}}\n\nTL;DR in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge1_precision_stderr": 0.0005700406667338665}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "tldr_en", "rouge1_recall": 0.007089024132379281, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "d3c5baa3-5e37-46f8-b1b2-5b834181c9da", "prompt_jinja": "{{source}}\n\nTL;DR in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge1_recall_stderr": 0.0007092663255506791}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "tldr_en", "rouge1_fmeasure": 0.004918846421495546, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "d3c5baa3-5e37-46f8-b1b2-5b834181c9da", "prompt_jinja": "{{source}}\n\nTL;DR in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge1_fmeasure_stderr": 0.0005110461047249682}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "tldr_en", "rouge2_precision": 0.000304156454074688, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "d3c5baa3-5e37-46f8-b1b2-5b834181c9da", "prompt_jinja": "{{source}}\n\nTL;DR in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge2_precision_stderr": 0.00010109123411176751}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "tldr_en", "rouge2_recall": 0.0004177630400024621, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "d3c5baa3-5e37-46f8-b1b2-5b834181c9da", "prompt_jinja": "{{source}}\n\nTL;DR in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge2_recall_stderr": 0.0001288667888548727}, {"task_name": "GEM/wiki_lingua_en", 
"prompt_name": "tldr_en", "rouge2_fmeasure": 0.0002967006936469681, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "d3c5baa3-5e37-46f8-b1b2-5b834181c9da", "prompt_jinja": "{{source}}\n\nTL;DR in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge2_fmeasure_stderr": 8.974945913013664e-05}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "tldr_en", "rougeL_precision": 0.004402996255292156, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "d3c5baa3-5e37-46f8-b1b2-5b834181c9da", "prompt_jinja": "{{source}}\n\nTL;DR in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeL_precision_stderr": 0.00048523970345924354}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "tldr_en", "rougeL_recall": 0.0065174776893015506, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "d3c5baa3-5e37-46f8-b1b2-5b834181c9da", "prompt_jinja": "{{source}}\n\nTL;DR in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeL_recall_stderr": 0.0006367984657316963}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "tldr_en", "rougeL_fmeasure": 0.00441760659640342, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "d3c5baa3-5e37-46f8-b1b2-5b834181c9da", "prompt_jinja": "{{source}}\n\nTL;DR in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeL_fmeasure_stderr": 0.00043438930707966116}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "tldr_en", "rougeLsum_precision": 0.004598567341994099, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "d3c5baa3-5e37-46f8-b1b2-5b834181c9da", "prompt_jinja": "{{source}}\n\nTL;DR in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeLsum_precision_stderr": 0.0005286763909225495}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "tldr_en", "rougeLsum_recall": 0.006622774214457113, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "d3c5baa3-5e37-46f8-b1b2-5b834181c9da", "prompt_jinja": "{{source}}\n\nTL;DR in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeLsum_recall_stderr": 0.0006621175203479033}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "tldr_en", "rougeLsum_fmeasure": 0.0045788303091290556, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "d3c5baa3-5e37-46f8-b1b2-5b834181c9da", "prompt_jinja": "{{source}}\n\nTL;DR in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeLsum_fmeasure_stderr": 0.0004707439731624039}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "tldr_en", "bleu": 1.6156436086606036e-07, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "d3c5baa3-5e37-46f8-b1b2-5b834181c9da", "prompt_jinja": "{{source}}\n\nTL;DR in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "bleu_stderr": 3.807467277955187e-07}], "config": {"model": "hf-causal", "model_args": 
"pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b12bc4opt4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 5, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b12bc4opt4/evaluation/generation/agg.4b284b12bc4opt4_e2e_nlg_cleaned_generate_text_restaurant_0.json b/4b284b12bc4opt4/evaluation/generation/agg.4b284b12bc4opt4_e2e_nlg_cleaned_generate_text_restaurant_0.json new file mode 100644 index 0000000000000000000000000000000000000000..9c7da4f88564a0e715b42e184334d09a113186e5 --- /dev/null +++ b/4b284b12bc4opt4/evaluation/generation/agg.4b284b12bc4opt4_e2e_nlg_cleaned_generate_text_restaurant_0.json @@ -0,0 +1 @@ +{"results": [{"task_name": "e2e_nlg_cleaned", "prompt_name": "generate_text_restaurant", "bleu": 0.023831548566168672, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "1acabbc3-c9b9-4624-a684-29faeccff46f", "prompt_jinja": "Given the following data about a restaurant:\n{% for feature in meaning_representation.split(\"]\") %} {% set key = feature.split(\"[\")[0].replace(\",\",\"\") %} {% set value = feature.replace(\",\",\"\").replace(key+\"[\", '''') %}\n{% if value != \"\" %} {{key}} : {{value}} {% endif %}\n{%- endfor %}\nGenerate some text about this restaurant. ||| {{human_reference}}", "prompt_original_task": true, "comment": "", "bleu_stderr": 0.00013669989610831787}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "generate_text_restaurant", "rouge1_precision": 0.05449305285889657, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "1acabbc3-c9b9-4624-a684-29faeccff46f", "prompt_jinja": "Given the following data about a restaurant:\n{% for feature in meaning_representation.split(\"]\") %} {% set key = feature.split(\"[\")[0].replace(\",\",\"\") %} {% set value = feature.replace(\",\",\"\").replace(key+\"[\", '''') %}\n{% if value != \"\" %} {{key}} : {{value}} {% endif %}\n{%- endfor %}\nGenerate some text about this restaurant. ||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rouge1_precision_stderr": 0.0007682171874626243}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "generate_text_restaurant", "rouge1_recall": 0.09611446455866401, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "1acabbc3-c9b9-4624-a684-29faeccff46f", "prompt_jinja": "Given the following data about a restaurant:\n{% for feature in meaning_representation.split(\"]\") %} {% set key = feature.split(\"[\")[0].replace(\",\",\"\") %} {% set value = feature.replace(\",\",\"\").replace(key+\"[\", '''') %}\n{% if value != \"\" %} {{key}} : {{value}} {% endif %}\n{%- endfor %}\nGenerate some text about this restaurant. 
||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rouge1_recall_stderr": 0.0009371035323738266}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "generate_text_restaurant", "rouge1_fmeasure": 0.06253160637445583, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "1acabbc3-c9b9-4624-a684-29faeccff46f", "prompt_jinja": "Given the following data about a restaurant:\n{% for feature in meaning_representation.split(\"]\") %} {% set key = feature.split(\"[\")[0].replace(\",\",\"\") %} {% set value = feature.replace(\",\",\"\").replace(key+\"[\", '''') %}\n{% if value != \"\" %} {{key}} : {{value}} {% endif %}\n{%- endfor %}\nGenerate some text about this restaurant. ||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rouge1_fmeasure_stderr": 0.0006035775811340659}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "generate_text_restaurant", "rouge2_precision": 0.007752886867383771, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "1acabbc3-c9b9-4624-a684-29faeccff46f", "prompt_jinja": "Given the following data about a restaurant:\n{% for feature in meaning_representation.split(\"]\") %} {% set key = feature.split(\"[\")[0].replace(\",\",\"\") %} {% set value = feature.replace(\",\",\"\").replace(key+\"[\", '''') %}\n{% if value != \"\" %} {{key}} : {{value}} {% endif %}\n{%- endfor %}\nGenerate some text about this restaurant. ||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rouge2_precision_stderr": 0.00022369481483085416}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "generate_text_restaurant", "rouge2_recall": 0.015703783259365008, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "1acabbc3-c9b9-4624-a684-29faeccff46f", "prompt_jinja": "Given the following data about a restaurant:\n{% for feature in meaning_representation.split(\"]\") %} {% set key = feature.split(\"[\")[0].replace(\",\",\"\") %} {% set value = feature.replace(\",\",\"\").replace(key+\"[\", '''') %}\n{% if value != \"\" %} {{key}} : {{value}} {% endif %}\n{%- endfor %}\nGenerate some text about this restaurant. ||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rouge2_recall_stderr": 0.0004099918025863708}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "generate_text_restaurant", "rouge2_fmeasure": 0.010133641195144875, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "1acabbc3-c9b9-4624-a684-29faeccff46f", "prompt_jinja": "Given the following data about a restaurant:\n{% for feature in meaning_representation.split(\"]\") %} {% set key = feature.split(\"[\")[0].replace(\",\",\"\") %} {% set value = feature.replace(\",\",\"\").replace(key+\"[\", '''') %}\n{% if value != \"\" %} {{key}} : {{value}} {% endif %}\n{%- endfor %}\nGenerate some text about this restaurant. 
||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rouge2_fmeasure_stderr": 0.0002601808393175606}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "generate_text_restaurant", "rougeL_precision": 0.05437436599020971, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "1acabbc3-c9b9-4624-a684-29faeccff46f", "prompt_jinja": "Given the following data about a restaurant:\n{% for feature in meaning_representation.split(\"]\") %} {% set key = feature.split(\"[\")[0].replace(\",\",\"\") %} {% set value = feature.replace(\",\",\"\").replace(key+\"[\", '''') %}\n{% if value != \"\" %} {{key}} : {{value}} {% endif %}\n{%- endfor %}\nGenerate some text about this restaurant. ||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rougeL_precision_stderr": 0.0007584161707705096}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "generate_text_restaurant", "rougeL_recall": 0.09607649971755518, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "1acabbc3-c9b9-4624-a684-29faeccff46f", "prompt_jinja": "Given the following data about a restaurant:\n{% for feature in meaning_representation.split(\"]\") %} {% set key = feature.split(\"[\")[0].replace(\",\",\"\") %} {% set value = feature.replace(\",\",\"\").replace(key+\"[\", '''') %}\n{% if value != \"\" %} {{key}} : {{value}} {% endif %}\n{%- endfor %}\nGenerate some text about this restaurant. ||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rougeL_recall_stderr": 0.0009370784251490342}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "generate_text_restaurant", "rougeL_fmeasure": 0.06247837072122016, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "1acabbc3-c9b9-4624-a684-29faeccff46f", "prompt_jinja": "Given the following data about a restaurant:\n{% for feature in meaning_representation.split(\"]\") %} {% set key = feature.split(\"[\")[0].replace(\",\",\"\") %} {% set value = feature.replace(\",\",\"\").replace(key+\"[\", '''') %}\n{% if value != \"\" %} {{key}} : {{value}} {% endif %}\n{%- endfor %}\nGenerate some text about this restaurant. ||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rougeL_fmeasure_stderr": 0.0006020906398104155}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "generate_text_restaurant", "rougeLsum_precision": 0.050869285779818474, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "1acabbc3-c9b9-4624-a684-29faeccff46f", "prompt_jinja": "Given the following data about a restaurant:\n{% for feature in meaning_representation.split(\"]\") %} {% set key = feature.split(\"[\")[0].replace(\",\",\"\") %} {% set value = feature.replace(\",\",\"\").replace(key+\"[\", '''') %}\n{% if value != \"\" %} {{key}} : {{value}} {% endif %}\n{%- endfor %}\nGenerate some text about this restaurant. 
||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rougeLsum_precision_stderr": 0.0007452830238007919}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "generate_text_restaurant", "rougeLsum_recall": 0.08872183965001415, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "1acabbc3-c9b9-4624-a684-29faeccff46f", "prompt_jinja": "Given the following data about a restaurant:\n{% for feature in meaning_representation.split(\"]\") %} {% set key = feature.split(\"[\")[0].replace(\",\",\"\") %} {% set value = feature.replace(\",\",\"\").replace(key+\"[\", '''') %}\n{% if value != \"\" %} {{key}} : {{value}} {% endif %}\n{%- endfor %}\nGenerate some text about this restaurant. ||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rougeLsum_recall_stderr": 0.0008251902983264386}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "generate_text_restaurant", "rougeLsum_fmeasure": 0.05776879458207875, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "1acabbc3-c9b9-4624-a684-29faeccff46f", "prompt_jinja": "Given the following data about a restaurant:\n{% for feature in meaning_representation.split(\"]\") %} {% set key = feature.split(\"[\")[0].replace(\",\",\"\") %} {% set value = feature.replace(\",\",\"\").replace(key+\"[\", '''') %}\n{% if value != \"\" %} {{key}} : {{value}} {% endif %}\n{%- endfor %}\nGenerate some text about this restaurant. ||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rougeLsum_fmeasure_stderr": 0.000537142122378467}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b12bc4opt4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 0, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b12bc4opt4/evaluation/generation/agg.4b284b12bc4opt4_e2e_nlg_cleaned_generate_text_restaurant_1.json b/4b284b12bc4opt4/evaluation/generation/agg.4b284b12bc4opt4_e2e_nlg_cleaned_generate_text_restaurant_1.json new file mode 100644 index 0000000000000000000000000000000000000000..a6ddd2abda783e577e7db1b257f44b9bcd07b925 --- /dev/null +++ b/4b284b12bc4opt4/evaluation/generation/agg.4b284b12bc4opt4_e2e_nlg_cleaned_generate_text_restaurant_1.json @@ -0,0 +1 @@ +{"results": [{"task_name": "e2e_nlg_cleaned", "prompt_name": "generate_text_restaurant", "bleu": 7.5716891789423135, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "1acabbc3-c9b9-4624-a684-29faeccff46f", "prompt_jinja": "Given the following data about a restaurant:\n{% for feature in meaning_representation.split(\"]\") %} {% set key = feature.split(\"[\")[0].replace(\",\",\"\") %} {% set value = feature.replace(\",\",\"\").replace(key+\"[\", '''') %}\n{% if value != \"\" %} {{key}} : {{value}} {% endif %}\n{%- endfor %}\nGenerate some text about this restaurant. 
||| {{human_reference}}", "prompt_original_task": true, "comment": "", "bleu_stderr": 0.07657233541931847}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "generate_text_restaurant", "rouge1_precision": 0.37675073670067255, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "1acabbc3-c9b9-4624-a684-29faeccff46f", "prompt_jinja": "Given the following data about a restaurant:\n{% for feature in meaning_representation.split(\"]\") %} {% set key = feature.split(\"[\")[0].replace(\",\",\"\") %} {% set value = feature.replace(\",\",\"\").replace(key+\"[\", '''') %}\n{% if value != \"\" %} {{key}} : {{value}} {% endif %}\n{%- endfor %}\nGenerate some text about this restaurant. ||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rouge1_precision_stderr": 0.0024649179184822933}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "generate_text_restaurant", "rouge1_recall": 0.4196033967756775, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "1acabbc3-c9b9-4624-a684-29faeccff46f", "prompt_jinja": "Given the following data about a restaurant:\n{% for feature in meaning_representation.split(\"]\") %} {% set key = feature.split(\"[\")[0].replace(\",\",\"\") %} {% set value = feature.replace(\",\",\"\").replace(key+\"[\", '''') %}\n{% if value != \"\" %} {{key}} : {{value}} {% endif %}\n{%- endfor %}\nGenerate some text about this restaurant. ||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rouge1_recall_stderr": 0.002554821342455347}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "generate_text_restaurant", "rouge1_fmeasure": 0.378589775338143, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "1acabbc3-c9b9-4624-a684-29faeccff46f", "prompt_jinja": "Given the following data about a restaurant:\n{% for feature in meaning_representation.split(\"]\") %} {% set key = feature.split(\"[\")[0].replace(\",\",\"\") %} {% set value = feature.replace(\",\",\"\").replace(key+\"[\", '''') %}\n{% if value != \"\" %} {{key}} : {{value}} {% endif %}\n{%- endfor %}\nGenerate some text about this restaurant. ||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rouge1_fmeasure_stderr": 0.0019484786742807038}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "generate_text_restaurant", "rouge2_precision": 0.1515787193445361, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "1acabbc3-c9b9-4624-a684-29faeccff46f", "prompt_jinja": "Given the following data about a restaurant:\n{% for feature in meaning_representation.split(\"]\") %} {% set key = feature.split(\"[\")[0].replace(\",\",\"\") %} {% set value = feature.replace(\",\",\"\").replace(key+\"[\", '''') %}\n{% if value != \"\" %} {{key}} : {{value}} {% endif %}\n{%- endfor %}\nGenerate some text about this restaurant. 
||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rouge2_precision_stderr": 0.0016212252559065935}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "generate_text_restaurant", "rouge2_recall": 0.16960708192000204, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "1acabbc3-c9b9-4624-a684-29faeccff46f", "prompt_jinja": "Given the following data about a restaurant:\n{% for feature in meaning_representation.split(\"]\") %} {% set key = feature.split(\"[\")[0].replace(\",\",\"\") %} {% set value = feature.replace(\",\",\"\").replace(key+\"[\", '''') %}\n{% if value != \"\" %} {{key}} : {{value}} {% endif %}\n{%- endfor %}\nGenerate some text about this restaurant. ||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rouge2_recall_stderr": 0.001740739006568578}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "generate_text_restaurant", "rouge2_fmeasure": 0.15192303671918772, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "1acabbc3-c9b9-4624-a684-29faeccff46f", "prompt_jinja": "Given the following data about a restaurant:\n{% for feature in meaning_representation.split(\"]\") %} {% set key = feature.split(\"[\")[0].replace(\",\",\"\") %} {% set value = feature.replace(\",\",\"\").replace(key+\"[\", '''') %}\n{% if value != \"\" %} {{key}} : {{value}} {% endif %}\n{%- endfor %}\nGenerate some text about this restaurant. ||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rouge2_fmeasure_stderr": 0.0014410254284747698}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "generate_text_restaurant", "rougeL_precision": 0.2562623240157523, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "1acabbc3-c9b9-4624-a684-29faeccff46f", "prompt_jinja": "Given the following data about a restaurant:\n{% for feature in meaning_representation.split(\"]\") %} {% set key = feature.split(\"[\")[0].replace(\",\",\"\") %} {% set value = feature.replace(\",\",\"\").replace(key+\"[\", '''') %}\n{% if value != \"\" %} {{key}} : {{value}} {% endif %}\n{%- endfor %}\nGenerate some text about this restaurant. ||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rougeL_precision_stderr": 0.0018156767746927076}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "generate_text_restaurant", "rougeL_recall": 0.2883651325055159, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "1acabbc3-c9b9-4624-a684-29faeccff46f", "prompt_jinja": "Given the following data about a restaurant:\n{% for feature in meaning_representation.split(\"]\") %} {% set key = feature.split(\"[\")[0].replace(\",\",\"\") %} {% set value = feature.replace(\",\",\"\").replace(key+\"[\", '''') %}\n{% if value != \"\" %} {{key}} : {{value}} {% endif %}\n{%- endfor %}\nGenerate some text about this restaurant. 
||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rougeL_recall_stderr": 0.002017743649477281}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "generate_text_restaurant", "rougeL_fmeasure": 0.25806597930638026, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "1acabbc3-c9b9-4624-a684-29faeccff46f", "prompt_jinja": "Given the following data about a restaurant:\n{% for feature in meaning_representation.split(\"]\") %} {% set key = feature.split(\"[\")[0].replace(\",\",\"\") %} {% set value = feature.replace(\",\",\"\").replace(key+\"[\", '''') %}\n{% if value != \"\" %} {{key}} : {{value}} {% endif %}\n{%- endfor %}\nGenerate some text about this restaurant. ||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rougeL_fmeasure_stderr": 0.0014774627085457283}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "generate_text_restaurant", "rougeLsum_precision": 0.3119796155675665, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "1acabbc3-c9b9-4624-a684-29faeccff46f", "prompt_jinja": "Given the following data about a restaurant:\n{% for feature in meaning_representation.split(\"]\") %} {% set key = feature.split(\"[\")[0].replace(\",\",\"\") %} {% set value = feature.replace(\",\",\"\").replace(key+\"[\", '''') %}\n{% if value != \"\" %} {{key}} : {{value}} {% endif %}\n{%- endfor %}\nGenerate some text about this restaurant. ||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rougeLsum_precision_stderr": 0.002197372081552806}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "generate_text_restaurant", "rougeLsum_recall": 0.34777409514067237, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "1acabbc3-c9b9-4624-a684-29faeccff46f", "prompt_jinja": "Given the following data about a restaurant:\n{% for feature in meaning_representation.split(\"]\") %} {% set key = feature.split(\"[\")[0].replace(\",\",\"\") %} {% set value = feature.replace(\",\",\"\").replace(key+\"[\", '''') %}\n{% if value != \"\" %} {{key}} : {{value}} {% endif %}\n{%- endfor %}\nGenerate some text about this restaurant. ||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rougeLsum_recall_stderr": 0.0023378237968652736}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "generate_text_restaurant", "rougeLsum_fmeasure": 0.3135332346518358, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "1acabbc3-c9b9-4624-a684-29faeccff46f", "prompt_jinja": "Given the following data about a restaurant:\n{% for feature in meaning_representation.split(\"]\") %} {% set key = feature.split(\"[\")[0].replace(\",\",\"\") %} {% set value = feature.replace(\",\",\"\").replace(key+\"[\", '''') %}\n{% if value != \"\" %} {{key}} : {{value}} {% endif %}\n{%- endfor %}\nGenerate some text about this restaurant. 
||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rougeLsum_fmeasure_stderr": 0.0018186234521200878}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b12bc4opt4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 1, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b12bc4opt4/evaluation/generation/agg.4b284b12bc4opt4_e2e_nlg_cleaned_generate_text_restaurant_2.json b/4b284b12bc4opt4/evaluation/generation/agg.4b284b12bc4opt4_e2e_nlg_cleaned_generate_text_restaurant_2.json new file mode 100644 index 0000000000000000000000000000000000000000..f549ec91fc0ca76e040e28194df1780157219daa --- /dev/null +++ b/4b284b12bc4opt4/evaluation/generation/agg.4b284b12bc4opt4_e2e_nlg_cleaned_generate_text_restaurant_2.json @@ -0,0 +1 @@ +{"results": [{"task_name": "e2e_nlg_cleaned", "prompt_name": "generate_text_restaurant", "bleu": 8.415825253964716, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "1acabbc3-c9b9-4624-a684-29faeccff46f", "prompt_jinja": "Given the following data about a restaurant:\n{% for feature in meaning_representation.split(\"]\") %} {% set key = feature.split(\"[\")[0].replace(\",\",\"\") %} {% set value = feature.replace(\",\",\"\").replace(key+\"[\", '''') %}\n{% if value != \"\" %} {{key}} : {{value}} {% endif %}\n{%- endfor %}\nGenerate some text about this restaurant. ||| {{human_reference}}", "prompt_original_task": true, "comment": "", "bleu_stderr": 0.15234172654187825}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "generate_text_restaurant", "rouge1_precision": 0.39718229879297745, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "1acabbc3-c9b9-4624-a684-29faeccff46f", "prompt_jinja": "Given the following data about a restaurant:\n{% for feature in meaning_representation.split(\"]\") %} {% set key = feature.split(\"[\")[0].replace(\",\",\"\") %} {% set value = feature.replace(\",\",\"\").replace(key+\"[\", '''') %}\n{% if value != \"\" %} {{key}} : {{value}} {% endif %}\n{%- endfor %}\nGenerate some text about this restaurant. ||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rouge1_precision_stderr": 0.0023267859874187644}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "generate_text_restaurant", "rouge1_recall": 0.431821762098323, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "1acabbc3-c9b9-4624-a684-29faeccff46f", "prompt_jinja": "Given the following data about a restaurant:\n{% for feature in meaning_representation.split(\"]\") %} {% set key = feature.split(\"[\")[0].replace(\",\",\"\") %} {% set value = feature.replace(\",\",\"\").replace(key+\"[\", '''') %}\n{% if value != \"\" %} {{key}} : {{value}} {% endif %}\n{%- endfor %}\nGenerate some text about this restaurant. 
||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rouge1_recall_stderr": 0.0024735326081453987}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "generate_text_restaurant", "rouge1_fmeasure": 0.39680880458063933, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "1acabbc3-c9b9-4624-a684-29faeccff46f", "prompt_jinja": "Given the following data about a restaurant:\n{% for feature in meaning_representation.split(\"]\") %} {% set key = feature.split(\"[\")[0].replace(\",\",\"\") %} {% set value = feature.replace(\",\",\"\").replace(key+\"[\", '''') %}\n{% if value != \"\" %} {{key}} : {{value}} {% endif %}\n{%- endfor %}\nGenerate some text about this restaurant. ||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rouge1_fmeasure_stderr": 0.0018153515680263594}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "generate_text_restaurant", "rouge2_precision": 0.16935200061749783, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "1acabbc3-c9b9-4624-a684-29faeccff46f", "prompt_jinja": "Given the following data about a restaurant:\n{% for feature in meaning_representation.split(\"]\") %} {% set key = feature.split(\"[\")[0].replace(\",\",\"\") %} {% set value = feature.replace(\",\",\"\").replace(key+\"[\", '''') %}\n{% if value != \"\" %} {{key}} : {{value}} {% endif %}\n{%- endfor %}\nGenerate some text about this restaurant. ||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rouge2_precision_stderr": 0.0016483718369393143}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "generate_text_restaurant", "rouge2_recall": 0.1866765784720031, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "1acabbc3-c9b9-4624-a684-29faeccff46f", "prompt_jinja": "Given the following data about a restaurant:\n{% for feature in meaning_representation.split(\"]\") %} {% set key = feature.split(\"[\")[0].replace(\",\",\"\") %} {% set value = feature.replace(\",\",\"\").replace(key+\"[\", '''') %}\n{% if value != \"\" %} {{key}} : {{value}} {% endif %}\n{%- endfor %}\nGenerate some text about this restaurant. ||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rouge2_recall_stderr": 0.001801459763033518}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "generate_text_restaurant", "rouge2_fmeasure": 0.16948726304174097, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "1acabbc3-c9b9-4624-a684-29faeccff46f", "prompt_jinja": "Given the following data about a restaurant:\n{% for feature in meaning_representation.split(\"]\") %} {% set key = feature.split(\"[\")[0].replace(\",\",\"\") %} {% set value = feature.replace(\",\",\"\").replace(key+\"[\", '''') %}\n{% if value != \"\" %} {{key}} : {{value}} {% endif %}\n{%- endfor %}\nGenerate some text about this restaurant. 
||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rouge2_fmeasure_stderr": 0.0014659320427177205}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "generate_text_restaurant", "rougeL_precision": 0.26901190247512224, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "1acabbc3-c9b9-4624-a684-29faeccff46f", "prompt_jinja": "Given the following data about a restaurant:\n{% for feature in meaning_representation.split(\"]\") %} {% set key = feature.split(\"[\")[0].replace(\",\",\"\") %} {% set value = feature.replace(\",\",\"\").replace(key+\"[\", '''') %}\n{% if value != \"\" %} {{key}} : {{value}} {% endif %}\n{%- endfor %}\nGenerate some text about this restaurant. ||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rougeL_precision_stderr": 0.0018220891410013944}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "generate_text_restaurant", "rougeL_recall": 0.2939323384016312, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "1acabbc3-c9b9-4624-a684-29faeccff46f", "prompt_jinja": "Given the following data about a restaurant:\n{% for feature in meaning_representation.split(\"]\") %} {% set key = feature.split(\"[\")[0].replace(\",\",\"\") %} {% set value = feature.replace(\",\",\"\").replace(key+\"[\", '''') %}\n{% if value != \"\" %} {{key}} : {{value}} {% endif %}\n{%- endfor %}\nGenerate some text about this restaurant. ||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rougeL_recall_stderr": 0.002026079597108763}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "generate_text_restaurant", "rougeL_fmeasure": 0.2687904361168554, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "1acabbc3-c9b9-4624-a684-29faeccff46f", "prompt_jinja": "Given the following data about a restaurant:\n{% for feature in meaning_representation.split(\"]\") %} {% set key = feature.split(\"[\")[0].replace(\",\",\"\") %} {% set value = feature.replace(\",\",\"\").replace(key+\"[\", '''') %}\n{% if value != \"\" %} {{key}} : {{value}} {% endif %}\n{%- endfor %}\nGenerate some text about this restaurant. ||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rougeL_fmeasure_stderr": 0.0014803733340222947}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "generate_text_restaurant", "rougeLsum_precision": 0.32665277498400797, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "1acabbc3-c9b9-4624-a684-29faeccff46f", "prompt_jinja": "Given the following data about a restaurant:\n{% for feature in meaning_representation.split(\"]\") %} {% set key = feature.split(\"[\")[0].replace(\",\",\"\") %} {% set value = feature.replace(\",\",\"\").replace(key+\"[\", '''') %}\n{% if value != \"\" %} {{key}} : {{value}} {% endif %}\n{%- endfor %}\nGenerate some text about this restaurant. 
||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rougeLsum_precision_stderr": 0.0021157313290731425}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "generate_text_restaurant", "rougeLsum_recall": 0.35541359697456837, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "1acabbc3-c9b9-4624-a684-29faeccff46f", "prompt_jinja": "Given the following data about a restaurant:\n{% for feature in meaning_representation.split(\"]\") %} {% set key = feature.split(\"[\")[0].replace(\",\",\"\") %} {% set value = feature.replace(\",\",\"\").replace(key+\"[\", '''') %}\n{% if value != \"\" %} {{key}} : {{value}} {% endif %}\n{%- endfor %}\nGenerate some text about this restaurant. ||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rougeLsum_recall_stderr": 0.0022949279797004926}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "generate_text_restaurant", "rougeLsum_fmeasure": 0.32628931648104437, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "1acabbc3-c9b9-4624-a684-29faeccff46f", "prompt_jinja": "Given the following data about a restaurant:\n{% for feature in meaning_representation.split(\"]\") %} {% set key = feature.split(\"[\")[0].replace(\",\",\"\") %} {% set value = feature.replace(\",\",\"\").replace(key+\"[\", '''') %}\n{% if value != \"\" %} {{key}} : {{value}} {% endif %}\n{%- endfor %}\nGenerate some text about this restaurant. ||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rougeLsum_fmeasure_stderr": 0.0017352819339899982}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b12bc4opt4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 2, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b12bc4opt4/evaluation/generation/agg.4b284b12bc4opt4_e2e_nlg_cleaned_generate_text_restaurant_3.json b/4b284b12bc4opt4/evaluation/generation/agg.4b284b12bc4opt4_e2e_nlg_cleaned_generate_text_restaurant_3.json new file mode 100644 index 0000000000000000000000000000000000000000..9e4917187cc82fa755146dd67e96058f501d8922 --- /dev/null +++ b/4b284b12bc4opt4/evaluation/generation/agg.4b284b12bc4opt4_e2e_nlg_cleaned_generate_text_restaurant_3.json @@ -0,0 +1 @@ +{"results": [{"task_name": "e2e_nlg_cleaned", "prompt_name": "generate_text_restaurant", "bleu": 9.053715150693384, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "1acabbc3-c9b9-4624-a684-29faeccff46f", "prompt_jinja": "Given the following data about a restaurant:\n{% for feature in meaning_representation.split(\"]\") %} {% set key = feature.split(\"[\")[0].replace(\",\",\"\") %} {% set value = feature.replace(\",\",\"\").replace(key+\"[\", '''') %}\n{% if value != \"\" %} {{key}} : {{value}} {% endif %}\n{%- endfor %}\nGenerate some text about this restaurant. 
||| {{human_reference}}", "prompt_original_task": true, "comment": "", "bleu_stderr": 0.06980100462430686}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "generate_text_restaurant", "rouge1_precision": 0.402912961249351, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "1acabbc3-c9b9-4624-a684-29faeccff46f", "prompt_jinja": "Given the following data about a restaurant:\n{% for feature in meaning_representation.split(\"]\") %} {% set key = feature.split(\"[\")[0].replace(\",\",\"\") %} {% set value = feature.replace(\",\",\"\").replace(key+\"[\", '''') %}\n{% if value != \"\" %} {{key}} : {{value}} {% endif %}\n{%- endfor %}\nGenerate some text about this restaurant. ||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rouge1_precision_stderr": 0.002311451156821765}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "generate_text_restaurant", "rouge1_recall": 0.4381203339806578, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "1acabbc3-c9b9-4624-a684-29faeccff46f", "prompt_jinja": "Given the following data about a restaurant:\n{% for feature in meaning_representation.split(\"]\") %} {% set key = feature.split(\"[\")[0].replace(\",\",\"\") %} {% set value = feature.replace(\",\",\"\").replace(key+\"[\", '''') %}\n{% if value != \"\" %} {{key}} : {{value}} {% endif %}\n{%- endfor %}\nGenerate some text about this restaurant. ||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rouge1_recall_stderr": 0.002475121033893737}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "generate_text_restaurant", "rouge1_fmeasure": 0.40313857771935285, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "1acabbc3-c9b9-4624-a684-29faeccff46f", "prompt_jinja": "Given the following data about a restaurant:\n{% for feature in meaning_representation.split(\"]\") %} {% set key = feature.split(\"[\")[0].replace(\",\",\"\") %} {% set value = feature.replace(\",\",\"\").replace(key+\"[\", '''') %}\n{% if value != \"\" %} {{key}} : {{value}} {% endif %}\n{%- endfor %}\nGenerate some text about this restaurant. ||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rouge1_fmeasure_stderr": 0.001805737386969938}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "generate_text_restaurant", "rouge2_precision": 0.17782567314062503, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "1acabbc3-c9b9-4624-a684-29faeccff46f", "prompt_jinja": "Given the following data about a restaurant:\n{% for feature in meaning_representation.split(\"]\") %} {% set key = feature.split(\"[\")[0].replace(\",\",\"\") %} {% set value = feature.replace(\",\",\"\").replace(key+\"[\", '''') %}\n{% if value != \"\" %} {{key}} : {{value}} {% endif %}\n{%- endfor %}\nGenerate some text about this restaurant. 
||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rouge2_precision_stderr": 0.0016671508958511524}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "generate_text_restaurant", "rouge2_recall": 0.19584958268974514, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "1acabbc3-c9b9-4624-a684-29faeccff46f", "prompt_jinja": "Given the following data about a restaurant:\n{% for feature in meaning_representation.split(\"]\") %} {% set key = feature.split(\"[\")[0].replace(\",\",\"\") %} {% set value = feature.replace(\",\",\"\").replace(key+\"[\", '''') %}\n{% if value != \"\" %} {{key}} : {{value}} {% endif %}\n{%- endfor %}\nGenerate some text about this restaurant. ||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rouge2_recall_stderr": 0.0018895308376705284}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "generate_text_restaurant", "rouge2_fmeasure": 0.178157082665128, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "1acabbc3-c9b9-4624-a684-29faeccff46f", "prompt_jinja": "Given the following data about a restaurant:\n{% for feature in meaning_representation.split(\"]\") %} {% set key = feature.split(\"[\")[0].replace(\",\",\"\") %} {% set value = feature.replace(\",\",\"\").replace(key+\"[\", '''') %}\n{% if value != \"\" %} {{key}} : {{value}} {% endif %}\n{%- endfor %}\nGenerate some text about this restaurant. ||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rouge2_fmeasure_stderr": 0.0015223176148801163}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "generate_text_restaurant", "rougeL_precision": 0.2736571491639734, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "1acabbc3-c9b9-4624-a684-29faeccff46f", "prompt_jinja": "Given the following data about a restaurant:\n{% for feature in meaning_representation.split(\"]\") %} {% set key = feature.split(\"[\")[0].replace(\",\",\"\") %} {% set value = feature.replace(\",\",\"\").replace(key+\"[\", '''') %}\n{% if value != \"\" %} {{key}} : {{value}} {% endif %}\n{%- endfor %}\nGenerate some text about this restaurant. ||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rougeL_precision_stderr": 0.0018305403535992246}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "generate_text_restaurant", "rougeL_recall": 0.2995352347018479, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "1acabbc3-c9b9-4624-a684-29faeccff46f", "prompt_jinja": "Given the following data about a restaurant:\n{% for feature in meaning_representation.split(\"]\") %} {% set key = feature.split(\"[\")[0].replace(\",\",\"\") %} {% set value = feature.replace(\",\",\"\").replace(key+\"[\", '''') %}\n{% if value != \"\" %} {{key}} : {{value}} {% endif %}\n{%- endfor %}\nGenerate some text about this restaurant. 
||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rougeL_recall_stderr": 0.0020969000115146907}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "generate_text_restaurant", "rougeL_fmeasure": 0.27413718639696194, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "1acabbc3-c9b9-4624-a684-29faeccff46f", "prompt_jinja": "Given the following data about a restaurant:\n{% for feature in meaning_representation.split(\"]\") %} {% set key = feature.split(\"[\")[0].replace(\",\",\"\") %} {% set value = feature.replace(\",\",\"\").replace(key+\"[\", '''') %}\n{% if value != \"\" %} {{key}} : {{value}} {% endif %}\n{%- endfor %}\nGenerate some text about this restaurant. ||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rougeL_fmeasure_stderr": 0.0015353598026852135}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "generate_text_restaurant", "rougeLsum_precision": 0.3307097275531657, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "1acabbc3-c9b9-4624-a684-29faeccff46f", "prompt_jinja": "Given the following data about a restaurant:\n{% for feature in meaning_representation.split(\"]\") %} {% set key = feature.split(\"[\")[0].replace(\",\",\"\") %} {% set value = feature.replace(\",\",\"\").replace(key+\"[\", '''') %}\n{% if value != \"\" %} {{key}} : {{value}} {% endif %}\n{%- endfor %}\nGenerate some text about this restaurant. ||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rougeLsum_precision_stderr": 0.002114092970242258}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "generate_text_restaurant", "rougeLsum_recall": 0.3606968160198456, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "1acabbc3-c9b9-4624-a684-29faeccff46f", "prompt_jinja": "Given the following data about a restaurant:\n{% for feature in meaning_representation.split(\"]\") %} {% set key = feature.split(\"[\")[0].replace(\",\",\"\") %} {% set value = feature.replace(\",\",\"\").replace(key+\"[\", '''') %}\n{% if value != \"\" %} {{key}} : {{value}} {% endif %}\n{%- endfor %}\nGenerate some text about this restaurant. ||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rougeLsum_recall_stderr": 0.0023518211594742116}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "generate_text_restaurant", "rougeLsum_fmeasure": 0.3312412837590075, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "1acabbc3-c9b9-4624-a684-29faeccff46f", "prompt_jinja": "Given the following data about a restaurant:\n{% for feature in meaning_representation.split(\"]\") %} {% set key = feature.split(\"[\")[0].replace(\",\",\"\") %} {% set value = feature.replace(\",\",\"\").replace(key+\"[\", '''') %}\n{% if value != \"\" %} {{key}} : {{value}} {% endif %}\n{%- endfor %}\nGenerate some text about this restaurant. 
||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rougeLsum_fmeasure_stderr": 0.0017721110137833955}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b12bc4opt4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 3, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b12bc4opt4/evaluation/generation/agg.4b284b12bc4opt4_e2e_nlg_cleaned_generate_text_restaurant_4.json b/4b284b12bc4opt4/evaluation/generation/agg.4b284b12bc4opt4_e2e_nlg_cleaned_generate_text_restaurant_4.json new file mode 100644 index 0000000000000000000000000000000000000000..1bebe4fa799da53a490eeb10678d5c1253828f30 --- /dev/null +++ b/4b284b12bc4opt4/evaluation/generation/agg.4b284b12bc4opt4_e2e_nlg_cleaned_generate_text_restaurant_4.json @@ -0,0 +1 @@ +{"results": [{"task_name": "e2e_nlg_cleaned", "prompt_name": "generate_text_restaurant", "bleu": 8.992864407073405, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "1acabbc3-c9b9-4624-a684-29faeccff46f", "prompt_jinja": "Given the following data about a restaurant:\n{% for feature in meaning_representation.split(\"]\") %} {% set key = feature.split(\"[\")[0].replace(\",\",\"\") %} {% set value = feature.replace(\",\",\"\").replace(key+\"[\", '''') %}\n{% if value != \"\" %} {{key}} : {{value}} {% endif %}\n{%- endfor %}\nGenerate some text about this restaurant. ||| {{human_reference}}", "prompt_original_task": true, "comment": "", "bleu_stderr": 0.08770475097615464}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "generate_text_restaurant", "rouge1_precision": 0.4035902410295868, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "1acabbc3-c9b9-4624-a684-29faeccff46f", "prompt_jinja": "Given the following data about a restaurant:\n{% for feature in meaning_representation.split(\"]\") %} {% set key = feature.split(\"[\")[0].replace(\",\",\"\") %} {% set value = feature.replace(\",\",\"\").replace(key+\"[\", '''') %}\n{% if value != \"\" %} {{key}} : {{value}} {% endif %}\n{%- endfor %}\nGenerate some text about this restaurant. ||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rouge1_precision_stderr": 0.002314326089880622}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "generate_text_restaurant", "rouge1_recall": 0.43366816643506345, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "1acabbc3-c9b9-4624-a684-29faeccff46f", "prompt_jinja": "Given the following data about a restaurant:\n{% for feature in meaning_representation.split(\"]\") %} {% set key = feature.split(\"[\")[0].replace(\",\",\"\") %} {% set value = feature.replace(\",\",\"\").replace(key+\"[\", '''') %}\n{% if value != \"\" %} {{key}} : {{value}} {% endif %}\n{%- endfor %}\nGenerate some text about this restaurant. 
||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rouge1_recall_stderr": 0.002417145513082443}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "generate_text_restaurant", "rouge1_fmeasure": 0.4013041193656425, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "1acabbc3-c9b9-4624-a684-29faeccff46f", "prompt_jinja": "Given the following data about a restaurant:\n{% for feature in meaning_representation.split(\"]\") %} {% set key = feature.split(\"[\")[0].replace(\",\",\"\") %} {% set value = feature.replace(\",\",\"\").replace(key+\"[\", '''') %}\n{% if value != \"\" %} {{key}} : {{value}} {% endif %}\n{%- endfor %}\nGenerate some text about this restaurant. ||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rouge1_fmeasure_stderr": 0.0018012199210851237}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "generate_text_restaurant", "rouge2_precision": 0.17901955155397473, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "1acabbc3-c9b9-4624-a684-29faeccff46f", "prompt_jinja": "Given the following data about a restaurant:\n{% for feature in meaning_representation.split(\"]\") %} {% set key = feature.split(\"[\")[0].replace(\",\",\"\") %} {% set value = feature.replace(\",\",\"\").replace(key+\"[\", '''') %}\n{% if value != \"\" %} {{key}} : {{value}} {% endif %}\n{%- endfor %}\nGenerate some text about this restaurant. ||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rouge2_precision_stderr": 0.0016801289447732556}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "generate_text_restaurant", "rouge2_recall": 0.19422772465440907, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "1acabbc3-c9b9-4624-a684-29faeccff46f", "prompt_jinja": "Given the following data about a restaurant:\n{% for feature in meaning_representation.split(\"]\") %} {% set key = feature.split(\"[\")[0].replace(\",\",\"\") %} {% set value = feature.replace(\",\",\"\").replace(key+\"[\", '''') %}\n{% if value != \"\" %} {{key}} : {{value}} {% endif %}\n{%- endfor %}\nGenerate some text about this restaurant. ||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rouge2_recall_stderr": 0.001846206730549891}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "generate_text_restaurant", "rouge2_fmeasure": 0.17801635513525366, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "1acabbc3-c9b9-4624-a684-29faeccff46f", "prompt_jinja": "Given the following data about a restaurant:\n{% for feature in meaning_representation.split(\"]\") %} {% set key = feature.split(\"[\")[0].replace(\",\",\"\") %} {% set value = feature.replace(\",\",\"\").replace(key+\"[\", '''') %}\n{% if value != \"\" %} {{key}} : {{value}} {% endif %}\n{%- endfor %}\nGenerate some text about this restaurant. 
||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rouge2_fmeasure_stderr": 0.0015174150039881623}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "generate_text_restaurant", "rougeL_precision": 0.2749826532734801, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "1acabbc3-c9b9-4624-a684-29faeccff46f", "prompt_jinja": "Given the following data about a restaurant:\n{% for feature in meaning_representation.split(\"]\") %} {% set key = feature.split(\"[\")[0].replace(\",\",\"\") %} {% set value = feature.replace(\",\",\"\").replace(key+\"[\", '''') %}\n{% if value != \"\" %} {{key}} : {{value}} {% endif %}\n{%- endfor %}\nGenerate some text about this restaurant. ||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rougeL_precision_stderr": 0.0018385761944167643}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "generate_text_restaurant", "rougeL_recall": 0.29799763054527917, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "1acabbc3-c9b9-4624-a684-29faeccff46f", "prompt_jinja": "Given the following data about a restaurant:\n{% for feature in meaning_representation.split(\"]\") %} {% set key = feature.split(\"[\")[0].replace(\",\",\"\") %} {% set value = feature.replace(\",\",\"\").replace(key+\"[\", '''') %}\n{% if value != \"\" %} {{key}} : {{value}} {% endif %}\n{%- endfor %}\nGenerate some text about this restaurant. ||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rougeL_recall_stderr": 0.00209039090294883}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "generate_text_restaurant", "rougeL_fmeasure": 0.2740436937683373, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "1acabbc3-c9b9-4624-a684-29faeccff46f", "prompt_jinja": "Given the following data about a restaurant:\n{% for feature in meaning_representation.split(\"]\") %} {% set key = feature.split(\"[\")[0].replace(\",\",\"\") %} {% set value = feature.replace(\",\",\"\").replace(key+\"[\", '''') %}\n{% if value != \"\" %} {{key}} : {{value}} {% endif %}\n{%- endfor %}\nGenerate some text about this restaurant. ||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rougeL_fmeasure_stderr": 0.0015578070729464124}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "generate_text_restaurant", "rougeLsum_precision": 0.3308350462575971, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "1acabbc3-c9b9-4624-a684-29faeccff46f", "prompt_jinja": "Given the following data about a restaurant:\n{% for feature in meaning_representation.split(\"]\") %} {% set key = feature.split(\"[\")[0].replace(\",\",\"\") %} {% set value = feature.replace(\",\",\"\").replace(key+\"[\", '''') %}\n{% if value != \"\" %} {{key}} : {{value}} {% endif %}\n{%- endfor %}\nGenerate some text about this restaurant. 
||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rougeLsum_precision_stderr": 0.002138455916629113}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "generate_text_restaurant", "rougeLsum_recall": 0.35699098749445574, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "1acabbc3-c9b9-4624-a684-29faeccff46f", "prompt_jinja": "Given the following data about a restaurant:\n{% for feature in meaning_representation.split(\"]\") %} {% set key = feature.split(\"[\")[0].replace(\",\",\"\") %} {% set value = feature.replace(\",\",\"\").replace(key+\"[\", '''') %}\n{% if value != \"\" %} {{key}} : {{value}} {% endif %}\n{%- endfor %}\nGenerate some text about this restaurant. ||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rougeLsum_recall_stderr": 0.0023463522382987587}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "generate_text_restaurant", "rougeLsum_fmeasure": 0.3294316503713183, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "1acabbc3-c9b9-4624-a684-29faeccff46f", "prompt_jinja": "Given the following data about a restaurant:\n{% for feature in meaning_representation.split(\"]\") %} {% set key = feature.split(\"[\")[0].replace(\",\",\"\") %} {% set value = feature.replace(\",\",\"\").replace(key+\"[\", '''') %}\n{% if value != \"\" %} {{key}} : {{value}} {% endif %}\n{%- endfor %}\nGenerate some text about this restaurant. ||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rougeLsum_fmeasure_stderr": 0.0017961731530620674}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b12bc4opt4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 4, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b12bc4opt4/evaluation/generation/agg.4b284b12bc4opt4_e2e_nlg_cleaned_generate_text_restaurant_5.json b/4b284b12bc4opt4/evaluation/generation/agg.4b284b12bc4opt4_e2e_nlg_cleaned_generate_text_restaurant_5.json new file mode 100644 index 0000000000000000000000000000000000000000..3d61cf169e002626140e7509b9b0f8329a52130d --- /dev/null +++ b/4b284b12bc4opt4/evaluation/generation/agg.4b284b12bc4opt4_e2e_nlg_cleaned_generate_text_restaurant_5.json @@ -0,0 +1 @@ +{"results": [{"task_name": "e2e_nlg_cleaned", "prompt_name": "generate_text_restaurant", "bleu": 8.814480003636467, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "1acabbc3-c9b9-4624-a684-29faeccff46f", "prompt_jinja": "Given the following data about a restaurant:\n{% for feature in meaning_representation.split(\"]\") %} {% set key = feature.split(\"[\")[0].replace(\",\",\"\") %} {% set value = feature.replace(\",\",\"\").replace(key+\"[\", '''') %}\n{% if value != \"\" %} {{key}} : {{value}} {% endif %}\n{%- endfor %}\nGenerate some text about this restaurant. 
||| {{human_reference}}", "prompt_original_task": true, "comment": "", "bleu_stderr": 0.16384652517645307}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "generate_text_restaurant", "rouge1_precision": 0.40351181993098634, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "1acabbc3-c9b9-4624-a684-29faeccff46f", "prompt_jinja": "Given the following data about a restaurant:\n{% for feature in meaning_representation.split(\"]\") %} {% set key = feature.split(\"[\")[0].replace(\",\",\"\") %} {% set value = feature.replace(\",\",\"\").replace(key+\"[\", '''') %}\n{% if value != \"\" %} {{key}} : {{value}} {% endif %}\n{%- endfor %}\nGenerate some text about this restaurant. ||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rouge1_precision_stderr": 0.0023784770515124913}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "generate_text_restaurant", "rouge1_recall": 0.430473586962648, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "1acabbc3-c9b9-4624-a684-29faeccff46f", "prompt_jinja": "Given the following data about a restaurant:\n{% for feature in meaning_representation.split(\"]\") %} {% set key = feature.split(\"[\")[0].replace(\",\",\"\") %} {% set value = feature.replace(\",\",\"\").replace(key+\"[\", '''') %}\n{% if value != \"\" %} {{key}} : {{value}} {% endif %}\n{%- endfor %}\nGenerate some text about this restaurant. ||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rouge1_recall_stderr": 0.002372480142494689}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "generate_text_restaurant", "rouge1_fmeasure": 0.3997657675145631, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "1acabbc3-c9b9-4624-a684-29faeccff46f", "prompt_jinja": "Given the following data about a restaurant:\n{% for feature in meaning_representation.split(\"]\") %} {% set key = feature.split(\"[\")[0].replace(\",\",\"\") %} {% set value = feature.replace(\",\",\"\").replace(key+\"[\", '''') %}\n{% if value != \"\" %} {{key}} : {{value}} {% endif %}\n{%- endfor %}\nGenerate some text about this restaurant. ||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rouge1_fmeasure_stderr": 0.0018195701621308699}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "generate_text_restaurant", "rouge2_precision": 0.1784847314937123, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "1acabbc3-c9b9-4624-a684-29faeccff46f", "prompt_jinja": "Given the following data about a restaurant:\n{% for feature in meaning_representation.split(\"]\") %} {% set key = feature.split(\"[\")[0].replace(\",\",\"\") %} {% set value = feature.replace(\",\",\"\").replace(key+\"[\", '''') %}\n{% if value != \"\" %} {{key}} : {{value}} {% endif %}\n{%- endfor %}\nGenerate some text about this restaurant. 
||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rouge2_precision_stderr": 0.0016941969329593459}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "generate_text_restaurant", "rouge2_recall": 0.19174384124754476, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "1acabbc3-c9b9-4624-a684-29faeccff46f", "prompt_jinja": "Given the following data about a restaurant:\n{% for feature in meaning_representation.split(\"]\") %} {% set key = feature.split(\"[\")[0].replace(\",\",\"\") %} {% set value = feature.replace(\",\",\"\").replace(key+\"[\", '''') %}\n{% if value != \"\" %} {{key}} : {{value}} {% endif %}\n{%- endfor %}\nGenerate some text about this restaurant. ||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rouge2_recall_stderr": 0.0018048604307058427}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "generate_text_restaurant", "rouge2_fmeasure": 0.17668001136142425, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "1acabbc3-c9b9-4624-a684-29faeccff46f", "prompt_jinja": "Given the following data about a restaurant:\n{% for feature in meaning_representation.split(\"]\") %} {% set key = feature.split(\"[\")[0].replace(\",\",\"\") %} {% set value = feature.replace(\",\",\"\").replace(key+\"[\", '''') %}\n{% if value != \"\" %} {{key}} : {{value}} {% endif %}\n{%- endfor %}\nGenerate some text about this restaurant. ||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rouge2_fmeasure_stderr": 0.0015118431601118265}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "generate_text_restaurant", "rougeL_precision": 0.27762946990242493, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "1acabbc3-c9b9-4624-a684-29faeccff46f", "prompt_jinja": "Given the following data about a restaurant:\n{% for feature in meaning_representation.split(\"]\") %} {% set key = feature.split(\"[\")[0].replace(\",\",\"\") %} {% set value = feature.replace(\",\",\"\").replace(key+\"[\", '''') %}\n{% if value != \"\" %} {{key}} : {{value}} {% endif %}\n{%- endfor %}\nGenerate some text about this restaurant. ||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rougeL_precision_stderr": 0.0018754875427661021}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "generate_text_restaurant", "rougeL_recall": 0.2983245741357427, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "1acabbc3-c9b9-4624-a684-29faeccff46f", "prompt_jinja": "Given the following data about a restaurant:\n{% for feature in meaning_representation.split(\"]\") %} {% set key = feature.split(\"[\")[0].replace(\",\",\"\") %} {% set value = feature.replace(\",\",\"\").replace(key+\"[\", '''') %}\n{% if value != \"\" %} {{key}} : {{value}} {% endif %}\n{%- endfor %}\nGenerate some text about this restaurant. 
||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rougeL_recall_stderr": 0.0020162838897830222}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "generate_text_restaurant", "rougeL_fmeasure": 0.27542991361833463, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "1acabbc3-c9b9-4624-a684-29faeccff46f", "prompt_jinja": "Given the following data about a restaurant:\n{% for feature in meaning_representation.split(\"]\") %} {% set key = feature.split(\"[\")[0].replace(\",\",\"\") %} {% set value = feature.replace(\",\",\"\").replace(key+\"[\", '''') %}\n{% if value != \"\" %} {{key}} : {{value}} {% endif %}\n{%- endfor %}\nGenerate some text about this restaurant. ||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rougeL_fmeasure_stderr": 0.0015311724080949638}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "generate_text_restaurant", "rougeLsum_precision": 0.3318650598753267, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "1acabbc3-c9b9-4624-a684-29faeccff46f", "prompt_jinja": "Given the following data about a restaurant:\n{% for feature in meaning_representation.split(\"]\") %} {% set key = feature.split(\"[\")[0].replace(\",\",\"\") %} {% set value = feature.replace(\",\",\"\").replace(key+\"[\", '''') %}\n{% if value != \"\" %} {{key}} : {{value}} {% endif %}\n{%- endfor %}\nGenerate some text about this restaurant. ||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rougeLsum_precision_stderr": 0.0021960345461577626}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "generate_text_restaurant", "rougeLsum_recall": 0.3541002511978268, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "1acabbc3-c9b9-4624-a684-29faeccff46f", "prompt_jinja": "Given the following data about a restaurant:\n{% for feature in meaning_representation.split(\"]\") %} {% set key = feature.split(\"[\")[0].replace(\",\",\"\") %} {% set value = feature.replace(\",\",\"\").replace(key+\"[\", '''') %}\n{% if value != \"\" %} {{key}} : {{value}} {% endif %}\n{%- endfor %}\nGenerate some text about this restaurant. ||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rougeLsum_recall_stderr": 0.0022290840532188166}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "generate_text_restaurant", "rougeLsum_fmeasure": 0.3287311122356614, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "1acabbc3-c9b9-4624-a684-29faeccff46f", "prompt_jinja": "Given the following data about a restaurant:\n{% for feature in meaning_representation.split(\"]\") %} {% set key = feature.split(\"[\")[0].replace(\",\",\"\") %} {% set value = feature.replace(\",\",\"\").replace(key+\"[\", '''') %}\n{% if value != \"\" %} {{key}} : {{value}} {% endif %}\n{%- endfor %}\nGenerate some text about this restaurant. 
||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rougeLsum_fmeasure_stderr": 0.001782259806323467}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b12bc4opt4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 5, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b12bc4opt4/evaluation/generation/agg.4b284b12bc4opt4_gem_xsum_article_DOC_summary_0.json b/4b284b12bc4opt4/evaluation/generation/agg.4b284b12bc4opt4_gem_xsum_article_DOC_summary_0.json new file mode 100644 index 0000000000000000000000000000000000000000..1b9be35273e62a844485b1bd4af2c88f45d2af69 --- /dev/null +++ b/4b284b12bc4opt4/evaluation/generation/agg.4b284b12bc4opt4_gem_xsum_article_DOC_summary_0.json @@ -0,0 +1 @@ +{"results": [{"task_name": "gem_xsum", "prompt_name": "article_DOC_summary", "rouge1_precision": 0.07925362926398048, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "a8d4ecfa-c944-44d5-878c-04fd5db59e64", "prompt_jinja": "Article: {{document}}\n\nSummary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge1_precision_stderr": 0.0011370372081177169}, {"task_name": "gem_xsum", "prompt_name": "article_DOC_summary", "rouge1_recall": 0.18098825098858384, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "a8d4ecfa-c944-44d5-878c-04fd5db59e64", "prompt_jinja": "Article: {{document}}\n\nSummary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge1_recall_stderr": 0.0023006406434685736}, {"task_name": "gem_xsum", "prompt_name": "article_DOC_summary", "rouge1_fmeasure": 0.10882608435694785, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "a8d4ecfa-c944-44d5-878c-04fd5db59e64", "prompt_jinja": "Article: {{document}}\n\nSummary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge1_fmeasure_stderr": 0.001462837537770478}, {"task_name": "gem_xsum", "prompt_name": "article_DOC_summary", "rouge2_precision": 0.0049500024419751225, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "a8d4ecfa-c944-44d5-878c-04fd5db59e64", "prompt_jinja": "Article: {{document}}\n\nSummary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge2_precision_stderr": 0.0003221010027610884}, {"task_name": "gem_xsum", "prompt_name": "article_DOC_summary", "rouge2_recall": 0.011519060394160629, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "a8d4ecfa-c944-44d5-878c-04fd5db59e64", "prompt_jinja": "Article: {{document}}\n\nSummary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge2_recall_stderr": 0.0007552222759138258}, {"task_name": "gem_xsum", "prompt_name": "article_DOC_summary", "rouge2_fmeasure": 0.006837673192649714, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "a8d4ecfa-c944-44d5-878c-04fd5db59e64", "prompt_jinja": "Article: {{document}}\n\nSummary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge2_fmeasure_stderr": 
0.000442612615175873}, {"task_name": "gem_xsum", "prompt_name": "article_DOC_summary", "rougeL_precision": 0.06892683313731025, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "a8d4ecfa-c944-44d5-878c-04fd5db59e64", "prompt_jinja": "Article: {{document}}\n\nSummary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeL_precision_stderr": 0.000932076327601901}, {"task_name": "gem_xsum", "prompt_name": "article_DOC_summary", "rougeL_recall": 0.15911626707843118, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "a8d4ecfa-c944-44d5-878c-04fd5db59e64", "prompt_jinja": "Article: {{document}}\n\nSummary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeL_recall_stderr": 0.0020012317128335327}, {"task_name": "gem_xsum", "prompt_name": "article_DOC_summary", "rougeL_fmeasure": 0.09494170160398116, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "a8d4ecfa-c944-44d5-878c-04fd5db59e64", "prompt_jinja": "Article: {{document}}\n\nSummary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeL_fmeasure_stderr": 0.0012132340582403285}, {"task_name": "gem_xsum", "prompt_name": "article_DOC_summary", "rougeLsum_precision": 0.06534032138757281, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "a8d4ecfa-c944-44d5-878c-04fd5db59e64", "prompt_jinja": "Article: {{document}}\n\nSummary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeLsum_precision_stderr": 0.000925024745716427}, {"task_name": "gem_xsum", "prompt_name": "article_DOC_summary", "rougeLsum_recall": 0.1505792139257626, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "a8d4ecfa-c944-44d5-878c-04fd5db59e64", "prompt_jinja": "Article: {{document}}\n\nSummary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeLsum_recall_stderr": 0.001967931673316114}, {"task_name": "gem_xsum", "prompt_name": "article_DOC_summary", "rougeLsum_fmeasure": 0.0899394725428077, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "a8d4ecfa-c944-44d5-878c-04fd5db59e64", "prompt_jinja": "Article: {{document}}\n\nSummary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeLsum_fmeasure_stderr": 0.0012023093408387983}, {"task_name": "gem_xsum", "prompt_name": "article_DOC_summary", "bleu": 0.12092109985195117, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "a8d4ecfa-c944-44d5-878c-04fd5db59e64", "prompt_jinja": "Article: {{document}}\n\nSummary: ||| {{target}}", "prompt_original_task": true, "comment": "", "bleu_stderr": 0.034246881577404256}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b12bc4opt4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 0, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b12bc4opt4/evaluation/generation/agg.4b284b12bc4opt4_gem_xsum_article_DOC_summary_1.json 
b/4b284b12bc4opt4/evaluation/generation/agg.4b284b12bc4opt4_gem_xsum_article_DOC_summary_1.json new file mode 100644 index 0000000000000000000000000000000000000000..7f28fc4cff80f27c9aec40230538a18a52981509 --- /dev/null +++ b/4b284b12bc4opt4/evaluation/generation/agg.4b284b12bc4opt4_gem_xsum_article_DOC_summary_1.json @@ -0,0 +1 @@ +{"results": [{"task_name": "gem_xsum", "prompt_name": "article_DOC_summary", "rouge1_precision": 0.07136700780151199, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "a8d4ecfa-c944-44d5-878c-04fd5db59e64", "prompt_jinja": "Article: {{document}}\n\nSummary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge1_precision_stderr": 0.0011530148021735068}, {"task_name": "gem_xsum", "prompt_name": "article_DOC_summary", "rouge1_recall": 0.17061111634547732, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "a8d4ecfa-c944-44d5-878c-04fd5db59e64", "prompt_jinja": "Article: {{document}}\n\nSummary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge1_recall_stderr": 0.0024515335014419462}, {"task_name": "gem_xsum", "prompt_name": "article_DOC_summary", "rouge1_fmeasure": 0.09933976208790961, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "a8d4ecfa-c944-44d5-878c-04fd5db59e64", "prompt_jinja": "Article: {{document}}\n\nSummary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge1_fmeasure_stderr": 0.0015105268621052712}, {"task_name": "gem_xsum", "prompt_name": "article_DOC_summary", "rouge2_precision": 0.005417921285791925, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "a8d4ecfa-c944-44d5-878c-04fd5db59e64", "prompt_jinja": "Article: {{document}}\n\nSummary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge2_precision_stderr": 0.00033592492027509244}, {"task_name": "gem_xsum", "prompt_name": "article_DOC_summary", "rouge2_recall": 0.013060194907812957, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "a8d4ecfa-c944-44d5-878c-04fd5db59e64", "prompt_jinja": "Article: {{document}}\n\nSummary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge2_recall_stderr": 0.0008110410377500807}, {"task_name": "gem_xsum", "prompt_name": "article_DOC_summary", "rouge2_fmeasure": 0.007569232441595493, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "a8d4ecfa-c944-44d5-878c-04fd5db59e64", "prompt_jinja": "Article: {{document}}\n\nSummary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge2_fmeasure_stderr": 0.0004654857633003587}, {"task_name": "gem_xsum", "prompt_name": "article_DOC_summary", "rougeL_precision": 0.06437885934626987, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "a8d4ecfa-c944-44d5-878c-04fd5db59e64", "prompt_jinja": "Article: {{document}}\n\nSummary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeL_precision_stderr": 0.0009594061202846054}, {"task_name": "gem_xsum", "prompt_name": "article_DOC_summary", "rougeL_recall": 0.1554012098707474, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "a8d4ecfa-c944-44d5-878c-04fd5db59e64", "prompt_jinja": "Article: 
{{document}}\n\nSummary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeL_recall_stderr": 0.0021362266435883126}, {"task_name": "gem_xsum", "prompt_name": "article_DOC_summary", "rougeL_fmeasure": 0.08986801887882968, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "a8d4ecfa-c944-44d5-878c-04fd5db59e64", "prompt_jinja": "Article: {{document}}\n\nSummary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeL_fmeasure_stderr": 0.0012683121784435572}, {"task_name": "gem_xsum", "prompt_name": "article_DOC_summary", "rougeLsum_precision": 0.059106334424700216, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "a8d4ecfa-c944-44d5-878c-04fd5db59e64", "prompt_jinja": "Article: {{document}}\n\nSummary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeLsum_precision_stderr": 0.000921780599854098}, {"task_name": "gem_xsum", "prompt_name": "article_DOC_summary", "rougeLsum_recall": 0.14253712577743585, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "a8d4ecfa-c944-44d5-878c-04fd5db59e64", "prompt_jinja": "Article: {{document}}\n\nSummary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeLsum_recall_stderr": 0.002053226455737736}, {"task_name": "gem_xsum", "prompt_name": "article_DOC_summary", "rougeLsum_fmeasure": 0.08245663882625462, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "a8d4ecfa-c944-44d5-878c-04fd5db59e64", "prompt_jinja": "Article: {{document}}\n\nSummary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeLsum_fmeasure_stderr": 0.0012172551546565829}, {"task_name": "gem_xsum", "prompt_name": "article_DOC_summary", "bleu": 0.13415861030855933, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "a8d4ecfa-c944-44d5-878c-04fd5db59e64", "prompt_jinja": "Article: {{document}}\n\nSummary: ||| {{target}}", "prompt_original_task": true, "comment": "", "bleu_stderr": 0.03178941606459842}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b12bc4opt4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 1, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b12bc4opt4/evaluation/generation/agg.4b284b12bc4opt4_gem_xsum_article_DOC_summary_2.json b/4b284b12bc4opt4/evaluation/generation/agg.4b284b12bc4opt4_gem_xsum_article_DOC_summary_2.json new file mode 100644 index 0000000000000000000000000000000000000000..35720f6baaf4f28d0be9f35e6162ffb04bfaebf0 --- /dev/null +++ b/4b284b12bc4opt4/evaluation/generation/agg.4b284b12bc4opt4_gem_xsum_article_DOC_summary_2.json @@ -0,0 +1 @@ +{"results": [{"task_name": "gem_xsum", "prompt_name": "article_DOC_summary", "rouge1_precision": 0.07045755744978599, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "a8d4ecfa-c944-44d5-878c-04fd5db59e64", "prompt_jinja": "Article: {{document}}\n\nSummary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge1_precision_stderr": 
0.0011197061035798305}, {"task_name": "gem_xsum", "prompt_name": "article_DOC_summary", "rouge1_recall": 0.1695166104908529, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "a8d4ecfa-c944-44d5-878c-04fd5db59e64", "prompt_jinja": "Article: {{document}}\n\nSummary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge1_recall_stderr": 0.0024490112684960375}, {"task_name": "gem_xsum", "prompt_name": "article_DOC_summary", "rouge1_fmeasure": 0.09827283354103876, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "a8d4ecfa-c944-44d5-878c-04fd5db59e64", "prompt_jinja": "Article: {{document}}\n\nSummary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge1_fmeasure_stderr": 0.0014779249302482205}, {"task_name": "gem_xsum", "prompt_name": "article_DOC_summary", "rouge2_precision": 0.005139282932076346, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "a8d4ecfa-c944-44d5-878c-04fd5db59e64", "prompt_jinja": "Article: {{document}}\n\nSummary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge2_precision_stderr": 0.00031501935520477996}, {"task_name": "gem_xsum", "prompt_name": "article_DOC_summary", "rouge2_recall": 0.01241481071541199, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "a8d4ecfa-c944-44d5-878c-04fd5db59e64", "prompt_jinja": "Article: {{document}}\n\nSummary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge2_recall_stderr": 0.0007816833218457778}, {"task_name": "gem_xsum", "prompt_name": "article_DOC_summary", "rouge2_fmeasure": 0.007191687689267013, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "a8d4ecfa-c944-44d5-878c-04fd5db59e64", "prompt_jinja": "Article: {{document}}\n\nSummary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge2_fmeasure_stderr": 0.0004416289821760009}, {"task_name": "gem_xsum", "prompt_name": "article_DOC_summary", "rougeL_precision": 0.06394661959092474, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "a8d4ecfa-c944-44d5-878c-04fd5db59e64", "prompt_jinja": "Article: {{document}}\n\nSummary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeL_precision_stderr": 0.0009610100837351401}, {"task_name": "gem_xsum", "prompt_name": "article_DOC_summary", "rougeL_recall": 0.15483855207760336, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "a8d4ecfa-c944-44d5-878c-04fd5db59e64", "prompt_jinja": "Article: {{document}}\n\nSummary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeL_recall_stderr": 0.0021492840237945345}, {"task_name": "gem_xsum", "prompt_name": "article_DOC_summary", "rougeL_fmeasure": 0.08935199909306543, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "a8d4ecfa-c944-44d5-878c-04fd5db59e64", "prompt_jinja": "Article: {{document}}\n\nSummary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeL_fmeasure_stderr": 0.0012717361942556741}, {"task_name": "gem_xsum", "prompt_name": "article_DOC_summary", "rougeLsum_precision": 0.0585812823989726, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, 
"subset": "", "prompt_id": "a8d4ecfa-c944-44d5-878c-04fd5db59e64", "prompt_jinja": "Article: {{document}}\n\nSummary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeLsum_precision_stderr": 0.0009045085352227196}, {"task_name": "gem_xsum", "prompt_name": "article_DOC_summary", "rougeLsum_recall": 0.14210183912770377, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "a8d4ecfa-c944-44d5-878c-04fd5db59e64", "prompt_jinja": "Article: {{document}}\n\nSummary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeLsum_recall_stderr": 0.0020497549943373035}, {"task_name": "gem_xsum", "prompt_name": "article_DOC_summary", "rougeLsum_fmeasure": 0.08188744465829662, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "a8d4ecfa-c944-44d5-878c-04fd5db59e64", "prompt_jinja": "Article: {{document}}\n\nSummary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeLsum_fmeasure_stderr": 0.0012028366586011006}, {"task_name": "gem_xsum", "prompt_name": "article_DOC_summary", "bleu": 0.12926835061658584, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "a8d4ecfa-c944-44d5-878c-04fd5db59e64", "prompt_jinja": "Article: {{document}}\n\nSummary: ||| {{target}}", "prompt_original_task": true, "comment": "", "bleu_stderr": 0.031144890827347994}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b12bc4opt4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 2, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b12bc4opt4/evaluation/generation/agg.4b284b12bc4opt4_gem_xsum_article_DOC_summary_3.json b/4b284b12bc4opt4/evaluation/generation/agg.4b284b12bc4opt4_gem_xsum_article_DOC_summary_3.json new file mode 100644 index 0000000000000000000000000000000000000000..db45f1b3bd43d7cdd8b5c25594146104952a923f --- /dev/null +++ b/4b284b12bc4opt4/evaluation/generation/agg.4b284b12bc4opt4_gem_xsum_article_DOC_summary_3.json @@ -0,0 +1 @@ +{"results": [{"task_name": "gem_xsum", "prompt_name": "article_DOC_summary", "rouge1_precision": 0.07075379884252816, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "a8d4ecfa-c944-44d5-878c-04fd5db59e64", "prompt_jinja": "Article: {{document}}\n\nSummary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge1_precision_stderr": 0.001485432353726185}, {"task_name": "gem_xsum", "prompt_name": "article_DOC_summary", "rouge1_recall": 0.15768815281906948, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "a8d4ecfa-c944-44d5-878c-04fd5db59e64", "prompt_jinja": "Article: {{document}}\n\nSummary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge1_recall_stderr": 0.00265924403959227}, {"task_name": "gem_xsum", "prompt_name": "article_DOC_summary", "rouge1_fmeasure": 0.09447682363371723, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "a8d4ecfa-c944-44d5-878c-04fd5db59e64", "prompt_jinja": "Article: {{document}}\n\nSummary: ||| 
{{target}}", "prompt_original_task": true, "comment": "", "rouge1_fmeasure_stderr": 0.0016497035224024978}, {"task_name": "gem_xsum", "prompt_name": "article_DOC_summary", "rouge2_precision": 0.005384789457002967, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "a8d4ecfa-c944-44d5-878c-04fd5db59e64", "prompt_jinja": "Article: {{document}}\n\nSummary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge2_precision_stderr": 0.00038433889596432803}, {"task_name": "gem_xsum", "prompt_name": "article_DOC_summary", "rouge2_recall": 0.01255287060081809, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "a8d4ecfa-c944-44d5-878c-04fd5db59e64", "prompt_jinja": "Article: {{document}}\n\nSummary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge2_recall_stderr": 0.0008379579770883962}, {"task_name": "gem_xsum", "prompt_name": "article_DOC_summary", "rouge2_fmeasure": 0.007295985375796452, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "a8d4ecfa-c944-44d5-878c-04fd5db59e64", "prompt_jinja": "Article: {{document}}\n\nSummary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge2_fmeasure_stderr": 0.00047973371921657647}, {"task_name": "gem_xsum", "prompt_name": "article_DOC_summary", "rougeL_precision": 0.06325092101401272, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "a8d4ecfa-c944-44d5-878c-04fd5db59e64", "prompt_jinja": "Article: {{document}}\n\nSummary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeL_precision_stderr": 0.0012434648427272113}, {"task_name": "gem_xsum", "prompt_name": "article_DOC_summary", "rougeL_recall": 0.14272917123385326, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "a8d4ecfa-c944-44d5-878c-04fd5db59e64", "prompt_jinja": "Article: {{document}}\n\nSummary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeL_recall_stderr": 0.0022892155927241712}, {"task_name": "gem_xsum", "prompt_name": "article_DOC_summary", "rougeL_fmeasure": 0.08484720398405765, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "a8d4ecfa-c944-44d5-878c-04fd5db59e64", "prompt_jinja": "Article: {{document}}\n\nSummary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeL_fmeasure_stderr": 0.0013808160236295816}, {"task_name": "gem_xsum", "prompt_name": "article_DOC_summary", "rougeLsum_precision": 0.05848048220762729, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "a8d4ecfa-c944-44d5-878c-04fd5db59e64", "prompt_jinja": "Article: {{document}}\n\nSummary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeLsum_precision_stderr": 0.0012154422756537036}, {"task_name": "gem_xsum", "prompt_name": "article_DOC_summary", "rougeLsum_recall": 0.13155323370681296, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "a8d4ecfa-c944-44d5-878c-04fd5db59e64", "prompt_jinja": "Article: {{document}}\n\nSummary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeLsum_recall_stderr": 0.0022039878122268345}, {"task_name": "gem_xsum", "prompt_name": "article_DOC_summary", "rougeLsum_fmeasure": 
0.07823287562318222, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "a8d4ecfa-c944-44d5-878c-04fd5db59e64", "prompt_jinja": "Article: {{document}}\n\nSummary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeLsum_fmeasure_stderr": 0.001328680406843208}, {"task_name": "gem_xsum", "prompt_name": "article_DOC_summary", "bleu": 0.19243170077336127, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "a8d4ecfa-c944-44d5-878c-04fd5db59e64", "prompt_jinja": "Article: {{document}}\n\nSummary: ||| {{target}}", "prompt_original_task": true, "comment": "", "bleu_stderr": 0.029553440444076876}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b12bc4opt4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 3, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b12bc4opt4/evaluation/generation/agg.4b284b12bc4opt4_gem_xsum_article_DOC_summary_4.json b/4b284b12bc4opt4/evaluation/generation/agg.4b284b12bc4opt4_gem_xsum_article_DOC_summary_4.json new file mode 100644 index 0000000000000000000000000000000000000000..a67cf72c4620fc074fe1d3f94ff9817e398e8eb5 --- /dev/null +++ b/4b284b12bc4opt4/evaluation/generation/agg.4b284b12bc4opt4_gem_xsum_article_DOC_summary_4.json @@ -0,0 +1 @@ +{"results": [{"task_name": "gem_xsum", "prompt_name": "article_DOC_summary", "rouge1_precision": 0.0230606329148841, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "a8d4ecfa-c944-44d5-878c-04fd5db59e64", "prompt_jinja": "Article: {{document}}\n\nSummary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge1_precision_stderr": 0.001669556268708695}, {"task_name": "gem_xsum", "prompt_name": "article_DOC_summary", "rouge1_recall": 0.03931050113194901, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "a8d4ecfa-c944-44d5-878c-04fd5db59e64", "prompt_jinja": "Article: {{document}}\n\nSummary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge1_recall_stderr": 0.002276662240791683}, {"task_name": "gem_xsum", "prompt_name": "article_DOC_summary", "rouge1_fmeasure": 0.02608363281745486, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "a8d4ecfa-c944-44d5-878c-04fd5db59e64", "prompt_jinja": "Article: {{document}}\n\nSummary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge1_fmeasure_stderr": 0.0015378702997621066}, {"task_name": "gem_xsum", "prompt_name": "article_DOC_summary", "rouge2_precision": 0.0015648182940910817, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "a8d4ecfa-c944-44d5-878c-04fd5db59e64", "prompt_jinja": "Article: {{document}}\n\nSummary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge2_precision_stderr": 0.0002556263421133481}, {"task_name": "gem_xsum", "prompt_name": "article_DOC_summary", "rouge2_recall": 0.002830258508910678, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": 
"a8d4ecfa-c944-44d5-878c-04fd5db59e64", "prompt_jinja": "Article: {{document}}\n\nSummary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge2_recall_stderr": 0.0003872563374046439}, {"task_name": "gem_xsum", "prompt_name": "article_DOC_summary", "rouge2_fmeasure": 0.0018619726116248108, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "a8d4ecfa-c944-44d5-878c-04fd5db59e64", "prompt_jinja": "Article: {{document}}\n\nSummary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge2_fmeasure_stderr": 0.00026399269918420587}, {"task_name": "gem_xsum", "prompt_name": "article_DOC_summary", "rougeL_precision": 0.020423048767215433, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "a8d4ecfa-c944-44d5-878c-04fd5db59e64", "prompt_jinja": "Article: {{document}}\n\nSummary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeL_precision_stderr": 0.0013721487737836427}, {"task_name": "gem_xsum", "prompt_name": "article_DOC_summary", "rougeL_recall": 0.03591218470026906, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "a8d4ecfa-c944-44d5-878c-04fd5db59e64", "prompt_jinja": "Article: {{document}}\n\nSummary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeL_recall_stderr": 0.00206779739972093}, {"task_name": "gem_xsum", "prompt_name": "article_DOC_summary", "rougeL_fmeasure": 0.02355261113960695, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "a8d4ecfa-c944-44d5-878c-04fd5db59e64", "prompt_jinja": "Article: {{document}}\n\nSummary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeL_fmeasure_stderr": 0.001364331759867035}, {"task_name": "gem_xsum", "prompt_name": "article_DOC_summary", "rougeLsum_precision": 0.01899394741047602, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "a8d4ecfa-c944-44d5-878c-04fd5db59e64", "prompt_jinja": "Article: {{document}}\n\nSummary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeLsum_precision_stderr": 0.0013253063930260193}, {"task_name": "gem_xsum", "prompt_name": "article_DOC_summary", "rougeLsum_recall": 0.032752320378036076, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "a8d4ecfa-c944-44d5-878c-04fd5db59e64", "prompt_jinja": "Article: {{document}}\n\nSummary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeLsum_recall_stderr": 0.0019045379991497096}, {"task_name": "gem_xsum", "prompt_name": "article_DOC_summary", "rougeLsum_fmeasure": 0.021609618867255508, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "a8d4ecfa-c944-44d5-878c-04fd5db59e64", "prompt_jinja": "Article: {{document}}\n\nSummary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeLsum_fmeasure_stderr": 0.0012726837713074035}, {"task_name": "gem_xsum", "prompt_name": "article_DOC_summary", "bleu": 0.052615863248391834, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "a8d4ecfa-c944-44d5-878c-04fd5db59e64", "prompt_jinja": "Article: {{document}}\n\nSummary: ||| {{target}}", "prompt_original_task": true, "comment": "", "bleu_stderr": 0.009113330815053633}], "config": 
{"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b12bc4opt4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 4, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b12bc4opt4/evaluation/generation/agg.4b284b12bc4opt4_gem_xsum_article_DOC_summary_5.json b/4b284b12bc4opt4/evaluation/generation/agg.4b284b12bc4opt4_gem_xsum_article_DOC_summary_5.json new file mode 100644 index 0000000000000000000000000000000000000000..aca9cd4d7c5660e66b78b5af7e0a6a79239d1c17 --- /dev/null +++ b/4b284b12bc4opt4/evaluation/generation/agg.4b284b12bc4opt4_gem_xsum_article_DOC_summary_5.json @@ -0,0 +1 @@ +{"results": [{"task_name": "gem_xsum", "prompt_name": "article_DOC_summary", "rouge1_precision": 0.002042535086594319, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "a8d4ecfa-c944-44d5-878c-04fd5db59e64", "prompt_jinja": "Article: {{document}}\n\nSummary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge1_precision_stderr": 0.0005339034695671533}, {"task_name": "gem_xsum", "prompt_name": "article_DOC_summary", "rouge1_recall": 0.0017891870168785233, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "a8d4ecfa-c944-44d5-878c-04fd5db59e64", "prompt_jinja": "Article: {{document}}\n\nSummary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge1_recall_stderr": 0.0004800337144125977}, {"task_name": "gem_xsum", "prompt_name": "article_DOC_summary", "rouge1_fmeasure": 0.0018545887592808419, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "a8d4ecfa-c944-44d5-878c-04fd5db59e64", "prompt_jinja": "Article: {{document}}\n\nSummary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge1_fmeasure_stderr": 0.0004835971603513197}, {"task_name": "gem_xsum", "prompt_name": "article_DOC_summary", "rouge2_precision": 5.360205831903945e-05, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "a8d4ecfa-c944-44d5-878c-04fd5db59e64", "prompt_jinja": "Article: {{document}}\n\nSummary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge2_precision_stderr": 5.360205831903959e-05}, {"task_name": "gem_xsum", "prompt_name": "article_DOC_summary", "rouge2_recall": 5.360205831903945e-05, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "a8d4ecfa-c944-44d5-878c-04fd5db59e64", "prompt_jinja": "Article: {{document}}\n\nSummary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge2_recall_stderr": 5.360205831903959e-05}, {"task_name": "gem_xsum", "prompt_name": "article_DOC_summary", "rouge2_fmeasure": 5.360205831903945e-05, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "a8d4ecfa-c944-44d5-878c-04fd5db59e64", "prompt_jinja": "Article: {{document}}\n\nSummary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge2_fmeasure_stderr": 5.360205831903959e-05}, {"task_name": "gem_xsum", "prompt_name": "article_DOC_summary", "rougeL_precision": 0.0016326522715622547, 
"fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "a8d4ecfa-c944-44d5-878c-04fd5db59e64", "prompt_jinja": "Article: {{document}}\n\nSummary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeL_precision_stderr": 0.00043592654264051184}, {"task_name": "gem_xsum", "prompt_name": "article_DOC_summary", "rougeL_recall": 0.001419705972958375, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "a8d4ecfa-c944-44d5-878c-04fd5db59e64", "prompt_jinja": "Article: {{document}}\n\nSummary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeL_recall_stderr": 0.00037990440625202293}, {"task_name": "gem_xsum", "prompt_name": "article_DOC_summary", "rougeL_fmeasure": 0.0014859696373791518, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "a8d4ecfa-c944-44d5-878c-04fd5db59e64", "prompt_jinja": "Article: {{document}}\n\nSummary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeL_fmeasure_stderr": 0.00039548761545587586}, {"task_name": "gem_xsum", "prompt_name": "article_DOC_summary", "rougeLsum_precision": 0.0016749881249852043, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "a8d4ecfa-c944-44d5-878c-04fd5db59e64", "prompt_jinja": "Article: {{document}}\n\nSummary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeLsum_precision_stderr": 0.00045147167577573214}, {"task_name": "gem_xsum", "prompt_name": "article_DOC_summary", "rougeLsum_recall": 0.0014642325505038205, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "a8d4ecfa-c944-44d5-878c-04fd5db59e64", "prompt_jinja": "Article: {{document}}\n\nSummary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeLsum_recall_stderr": 0.00039829208519008764}, {"task_name": "gem_xsum", "prompt_name": "article_DOC_summary", "rougeLsum_fmeasure": 0.0015279556773790943, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "a8d4ecfa-c944-44d5-878c-04fd5db59e64", "prompt_jinja": "Article: {{document}}\n\nSummary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeLsum_fmeasure_stderr": 0.00041167209717655475}, {"task_name": "gem_xsum", "prompt_name": "article_DOC_summary", "bleu": 5.10720007616051e-38, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "a8d4ecfa-c944-44d5-878c-04fd5db59e64", "prompt_jinja": "Article: {{document}}\n\nSummary: ||| {{target}}", "prompt_original_task": true, "comment": "", "bleu_stderr": 3.031931667498303e-33}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b12bc4opt4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 5, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b12bc4opt4/evaluation/generation/examples.4b284b12bc4opt4_GEM-web_nlg_en_PALM_prompt_0.jsonl b/4b284b12bc4opt4/evaluation/generation/examples.4b284b12bc4opt4_GEM-web_nlg_en_PALM_prompt_0.jsonl index 
e69de29bb2d1d6434b8b29ae775ad8c2e48c5391..b19252c554e8319ba1db20c1ca8e40f7b700c15c 100644 --- a/4b284b12bc4opt4/evaluation/generation/examples.4b284b12bc4opt4_GEM-web_nlg_en_PALM_prompt_0.jsonl +++ b/4b284b12bc4opt4/evaluation/generation/examples.4b284b12bc4opt4_GEM-web_nlg_en_PALM_prompt_0.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:78b985794effc9cd5b1a15b686641061f8e0c3067ea8d3c61896529152179286 +size 4044426 diff --git a/4b284b12bc4opt4/evaluation/generation/examples.4b284b12bc4opt4_GEM-web_nlg_en_PALM_prompt_1.jsonl b/4b284b12bc4opt4/evaluation/generation/examples.4b284b12bc4opt4_GEM-web_nlg_en_PALM_prompt_1.jsonl index e69de29bb2d1d6434b8b29ae775ad8c2e48c5391..c64f5e4e2681079ae57fe4dc0a707f1113f9cea0 100644 --- a/4b284b12bc4opt4/evaluation/generation/examples.4b284b12bc4opt4_GEM-web_nlg_en_PALM_prompt_1.jsonl +++ b/4b284b12bc4opt4/evaluation/generation/examples.4b284b12bc4opt4_GEM-web_nlg_en_PALM_prompt_1.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:6fe04195ce5a51bf8669e161633da6568acc663887848151046ea2150c4d875f +size 4971066 diff --git a/4b284b12bc4opt4/evaluation/generation/examples.4b284b12bc4opt4_GEM-web_nlg_en_PALM_prompt_2.jsonl b/4b284b12bc4opt4/evaluation/generation/examples.4b284b12bc4opt4_GEM-web_nlg_en_PALM_prompt_2.jsonl index e69de29bb2d1d6434b8b29ae775ad8c2e48c5391..090c14512c7d32a5733d4aac1f1867c22abaad49 100644 --- a/4b284b12bc4opt4/evaluation/generation/examples.4b284b12bc4opt4_GEM-web_nlg_en_PALM_prompt_2.jsonl +++ b/4b284b12bc4opt4/evaluation/generation/examples.4b284b12bc4opt4_GEM-web_nlg_en_PALM_prompt_2.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:0e068fc882587791a38584fd6766aad1de4d2385bafb0cace8f0ca67db7e9051 +size 5885360 diff --git a/4b284b12bc4opt4/evaluation/generation/examples.4b284b12bc4opt4_GEM-web_nlg_en_PALM_prompt_3.jsonl b/4b284b12bc4opt4/evaluation/generation/examples.4b284b12bc4opt4_GEM-web_nlg_en_PALM_prompt_3.jsonl index e69de29bb2d1d6434b8b29ae775ad8c2e48c5391..b7dc710fcfe1e1ff5c3686b2a4dadb736defdcf5 100644 --- a/4b284b12bc4opt4/evaluation/generation/examples.4b284b12bc4opt4_GEM-web_nlg_en_PALM_prompt_3.jsonl +++ b/4b284b12bc4opt4/evaluation/generation/examples.4b284b12bc4opt4_GEM-web_nlg_en_PALM_prompt_3.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:a9a8dfd050ba48bca7433ad4fcd180fc895591be0ce2c52477c9075a6a749d64 +size 6804421 diff --git a/4b284b12bc4opt4/evaluation/generation/examples.4b284b12bc4opt4_GEM-web_nlg_en_PALM_prompt_4.jsonl b/4b284b12bc4opt4/evaluation/generation/examples.4b284b12bc4opt4_GEM-web_nlg_en_PALM_prompt_4.jsonl index e69de29bb2d1d6434b8b29ae775ad8c2e48c5391..f87836ee8461a1b533256eb8fc6971779c88ad13 100644 --- a/4b284b12bc4opt4/evaluation/generation/examples.4b284b12bc4opt4_GEM-web_nlg_en_PALM_prompt_4.jsonl +++ b/4b284b12bc4opt4/evaluation/generation/examples.4b284b12bc4opt4_GEM-web_nlg_en_PALM_prompt_4.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:9266c8104eeca44a270fed31768d1c21333b1d7b3446a78115c09a3cbe31d0e3 +size 7736101 diff --git a/4b284b12bc4opt4/evaluation/generation/examples.4b284b12bc4opt4_GEM-web_nlg_en_PALM_prompt_5.jsonl b/4b284b12bc4opt4/evaluation/generation/examples.4b284b12bc4opt4_GEM-web_nlg_en_PALM_prompt_5.jsonl index e69de29bb2d1d6434b8b29ae775ad8c2e48c5391..08d421a3ad6a0be4510f1f60ddf1ce2edd4a974f 100644 --- a/4b284b12bc4opt4/evaluation/generation/examples.4b284b12bc4opt4_GEM-web_nlg_en_PALM_prompt_5.jsonl +++ 
b/4b284b12bc4opt4/evaluation/generation/examples.4b284b12bc4opt4_GEM-web_nlg_en_PALM_prompt_5.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:a59aae5d51cca0d6b3d99c4344f66b64ff3aea6dc198cb9a89e78c5e79c5313e +size 8644612 diff --git a/4b284b12bc4opt4/evaluation/generation/examples.4b284b12bc4opt4_GEM-wiki_lingua_en_tldr_en_0.jsonl b/4b284b12bc4opt4/evaluation/generation/examples.4b284b12bc4opt4_GEM-wiki_lingua_en_tldr_en_0.jsonl index e69de29bb2d1d6434b8b29ae775ad8c2e48c5391..8b2134bb481207ddf78b31109967ee7904b737dd 100644 --- a/4b284b12bc4opt4/evaluation/generation/examples.4b284b12bc4opt4_GEM-wiki_lingua_en_tldr_en_0.jsonl +++ b/4b284b12bc4opt4/evaluation/generation/examples.4b284b12bc4opt4_GEM-wiki_lingua_en_tldr_en_0.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:dc4a435aedc62b6f0694b520aafe875643efbff76de3544091b190b9ff55f1d7 +size 7726092 diff --git a/4b284b12bc4opt4/evaluation/generation/examples.4b284b12bc4opt4_GEM-wiki_lingua_en_tldr_en_1.jsonl b/4b284b12bc4opt4/evaluation/generation/examples.4b284b12bc4opt4_GEM-wiki_lingua_en_tldr_en_1.jsonl index e69de29bb2d1d6434b8b29ae775ad8c2e48c5391..5dad2ad3b319d8f386b18d382aa8b3104f537e00 100644 --- a/4b284b12bc4opt4/evaluation/generation/examples.4b284b12bc4opt4_GEM-wiki_lingua_en_tldr_en_1.jsonl +++ b/4b284b12bc4opt4/evaluation/generation/examples.4b284b12bc4opt4_GEM-wiki_lingua_en_tldr_en_1.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:5619713ccc049b3276c87e152d63831709eb512cda33c79ae31302b9c8066e6e +size 13280881 diff --git a/4b284b12bc4opt4/evaluation/generation/examples.4b284b12bc4opt4_GEM-wiki_lingua_en_tldr_en_2.jsonl b/4b284b12bc4opt4/evaluation/generation/examples.4b284b12bc4opt4_GEM-wiki_lingua_en_tldr_en_2.jsonl index e69de29bb2d1d6434b8b29ae775ad8c2e48c5391..1f2756691c7c746d8b33c7806dfdd905c6a4dbf8 100644 --- a/4b284b12bc4opt4/evaluation/generation/examples.4b284b12bc4opt4_GEM-wiki_lingua_en_tldr_en_2.jsonl +++ b/4b284b12bc4opt4/evaluation/generation/examples.4b284b12bc4opt4_GEM-wiki_lingua_en_tldr_en_2.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:f5878d59e522e16365fc6a3c4683d2dc9b71894ba93b3d9b190db9a1180dbb1f +size 18879243 diff --git a/4b284b12bc4opt4/evaluation/generation/examples.4b284b12bc4opt4_GEM-wiki_lingua_en_tldr_en_3.jsonl b/4b284b12bc4opt4/evaluation/generation/examples.4b284b12bc4opt4_GEM-wiki_lingua_en_tldr_en_3.jsonl index e69de29bb2d1d6434b8b29ae775ad8c2e48c5391..95df5ff5b6c9610432c703879a967e59366c3fbd 100644 --- a/4b284b12bc4opt4/evaluation/generation/examples.4b284b12bc4opt4_GEM-wiki_lingua_en_tldr_en_3.jsonl +++ b/4b284b12bc4opt4/evaluation/generation/examples.4b284b12bc4opt4_GEM-wiki_lingua_en_tldr_en_3.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:bcd88404f7d4d1a3ba7a8adc958c7283a9204dd26df08be280721a5a617bac33 +size 24313465 diff --git a/4b284b12bc4opt4/evaluation/generation/examples.4b284b12bc4opt4_GEM-wiki_lingua_en_tldr_en_4.jsonl b/4b284b12bc4opt4/evaluation/generation/examples.4b284b12bc4opt4_GEM-wiki_lingua_en_tldr_en_4.jsonl index e69de29bb2d1d6434b8b29ae775ad8c2e48c5391..be61160e0618d8de833e59ba962fa9f8e6e80839 100644 --- a/4b284b12bc4opt4/evaluation/generation/examples.4b284b12bc4opt4_GEM-wiki_lingua_en_tldr_en_4.jsonl +++ b/4b284b12bc4opt4/evaluation/generation/examples.4b284b12bc4opt4_GEM-wiki_lingua_en_tldr_en_4.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid 
sha256:210ea5d6ade9f36355c7365eecab67c75bd4da0f6ecb77c2308fe17bde458906 +size 29468402 diff --git a/4b284b12bc4opt4/evaluation/generation/examples.4b284b12bc4opt4_GEM-wiki_lingua_en_tldr_en_5.jsonl b/4b284b12bc4opt4/evaluation/generation/examples.4b284b12bc4opt4_GEM-wiki_lingua_en_tldr_en_5.jsonl index e69de29bb2d1d6434b8b29ae775ad8c2e48c5391..c313d4910747f71b12634970107142f4076ada84 100644 --- a/4b284b12bc4opt4/evaluation/generation/examples.4b284b12bc4opt4_GEM-wiki_lingua_en_tldr_en_5.jsonl +++ b/4b284b12bc4opt4/evaluation/generation/examples.4b284b12bc4opt4_GEM-wiki_lingua_en_tldr_en_5.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:fb4dec510bc05db8956c6f89133f2cf7399781364de2644aee6a2475f4112d53 +size 34798517 diff --git a/4b284b12bc4opt4/evaluation/generation/examples.4b284b12bc4opt4_e2e_nlg_cleaned_generate_text_restaurant_0.jsonl b/4b284b12bc4opt4/evaluation/generation/examples.4b284b12bc4opt4_e2e_nlg_cleaned_generate_text_restaurant_0.jsonl index e69de29bb2d1d6434b8b29ae775ad8c2e48c5391..2a895b4c040f62c45673802af1aadcc99fad814b 100644 --- a/4b284b12bc4opt4/evaluation/generation/examples.4b284b12bc4opt4_e2e_nlg_cleaned_generate_text_restaurant_0.jsonl +++ b/4b284b12bc4opt4/evaluation/generation/examples.4b284b12bc4opt4_e2e_nlg_cleaned_generate_text_restaurant_0.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:6bc244a05f19aecd66149302aca8e595cea9c861897d3fbf57c1642073cbe1e7 +size 4390411 diff --git a/4b284b12bc4opt4/evaluation/generation/examples.4b284b12bc4opt4_e2e_nlg_cleaned_generate_text_restaurant_1.jsonl b/4b284b12bc4opt4/evaluation/generation/examples.4b284b12bc4opt4_e2e_nlg_cleaned_generate_text_restaurant_1.jsonl index e69de29bb2d1d6434b8b29ae775ad8c2e48c5391..7d7c5959fd7f2994072cf2e12eb4e68052b43f6b 100644 --- a/4b284b12bc4opt4/evaluation/generation/examples.4b284b12bc4opt4_e2e_nlg_cleaned_generate_text_restaurant_1.jsonl +++ b/4b284b12bc4opt4/evaluation/generation/examples.4b284b12bc4opt4_e2e_nlg_cleaned_generate_text_restaurant_1.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:c9b59715551f4ae6306fc81d22c9b091e3a472648c2e4d47a560c96278316707 +size 5187145 diff --git a/4b284b12bc4opt4/evaluation/generation/examples.4b284b12bc4opt4_e2e_nlg_cleaned_generate_text_restaurant_2.jsonl b/4b284b12bc4opt4/evaluation/generation/examples.4b284b12bc4opt4_e2e_nlg_cleaned_generate_text_restaurant_2.jsonl index e69de29bb2d1d6434b8b29ae775ad8c2e48c5391..cf683ddf8d0b734e6692bddcb67c908ebe4e9c3e 100644 --- a/4b284b12bc4opt4/evaluation/generation/examples.4b284b12bc4opt4_e2e_nlg_cleaned_generate_text_restaurant_2.jsonl +++ b/4b284b12bc4opt4/evaluation/generation/examples.4b284b12bc4opt4_e2e_nlg_cleaned_generate_text_restaurant_2.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:d3d7a990b9c4de02fa5bc70997e779cfd206a079c953fe8f52fa711688e0d4b9 +size 6251291 diff --git a/4b284b12bc4opt4/evaluation/generation/examples.4b284b12bc4opt4_e2e_nlg_cleaned_generate_text_restaurant_3.jsonl b/4b284b12bc4opt4/evaluation/generation/examples.4b284b12bc4opt4_e2e_nlg_cleaned_generate_text_restaurant_3.jsonl index e69de29bb2d1d6434b8b29ae775ad8c2e48c5391..5209319b32cf199f6cd6ef14f84eeaeecfd4abb5 100644 --- a/4b284b12bc4opt4/evaluation/generation/examples.4b284b12bc4opt4_e2e_nlg_cleaned_generate_text_restaurant_3.jsonl +++ b/4b284b12bc4opt4/evaluation/generation/examples.4b284b12bc4opt4_e2e_nlg_cleaned_generate_text_restaurant_3.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 
+oid sha256:3df8fc186d1b456f06e7bec99f5297487b10980c27a9907ac524aae058228c77 +size 7331415 diff --git a/4b284b12bc4opt4/evaluation/generation/examples.4b284b12bc4opt4_e2e_nlg_cleaned_generate_text_restaurant_4.jsonl b/4b284b12bc4opt4/evaluation/generation/examples.4b284b12bc4opt4_e2e_nlg_cleaned_generate_text_restaurant_4.jsonl index e69de29bb2d1d6434b8b29ae775ad8c2e48c5391..52503d5172bcafa4d0cec8ab623b6f7853dc6dfc 100644 --- a/4b284b12bc4opt4/evaluation/generation/examples.4b284b12bc4opt4_e2e_nlg_cleaned_generate_text_restaurant_4.jsonl +++ b/4b284b12bc4opt4/evaluation/generation/examples.4b284b12bc4opt4_e2e_nlg_cleaned_generate_text_restaurant_4.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:6191dda5c17ccacd52060334ffa8b37643fd5f42dbb6db131da3f9c1056dce9c +size 8406414 diff --git a/4b284b12bc4opt4/evaluation/generation/examples.4b284b12bc4opt4_e2e_nlg_cleaned_generate_text_restaurant_5.jsonl b/4b284b12bc4opt4/evaluation/generation/examples.4b284b12bc4opt4_e2e_nlg_cleaned_generate_text_restaurant_5.jsonl index e69de29bb2d1d6434b8b29ae775ad8c2e48c5391..f6de3d9ff271583a95c69bc86c64dc51048f36c9 100644 --- a/4b284b12bc4opt4/evaluation/generation/examples.4b284b12bc4opt4_e2e_nlg_cleaned_generate_text_restaurant_5.jsonl +++ b/4b284b12bc4opt4/evaluation/generation/examples.4b284b12bc4opt4_e2e_nlg_cleaned_generate_text_restaurant_5.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:fda8739310063c9ef4f8e8a3cc25c396dd6a7af3e293b042c7bed89e4cb080e0 +size 9494361 diff --git a/4b284b12bc4opt4/evaluation/generation/examples.4b284b12bc4opt4_gem_xsum_article_DOC_summary_0.jsonl b/4b284b12bc4opt4/evaluation/generation/examples.4b284b12bc4opt4_gem_xsum_article_DOC_summary_0.jsonl index e69de29bb2d1d6434b8b29ae775ad8c2e48c5391..27a5efd9d1806965a792bc3eb6508c85cd2f91f3 100644 --- a/4b284b12bc4opt4/evaluation/generation/examples.4b284b12bc4opt4_gem_xsum_article_DOC_summary_0.jsonl +++ b/4b284b12bc4opt4/evaluation/generation/examples.4b284b12bc4opt4_gem_xsum_article_DOC_summary_0.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:804ae7e14838239fef7a1a4ec531605a38944c6d0ddfcc0da13b5a65a5774f42 +size 2825587 diff --git a/4b284b12bc4opt4/evaluation/generation/examples.4b284b12bc4opt4_gem_xsum_article_DOC_summary_1.jsonl b/4b284b12bc4opt4/evaluation/generation/examples.4b284b12bc4opt4_gem_xsum_article_DOC_summary_1.jsonl index e69de29bb2d1d6434b8b29ae775ad8c2e48c5391..1d289114a09c1ca0e28a3cb98e1689fa38db5bd3 100644 --- a/4b284b12bc4opt4/evaluation/generation/examples.4b284b12bc4opt4_gem_xsum_article_DOC_summary_1.jsonl +++ b/4b284b12bc4opt4/evaluation/generation/examples.4b284b12bc4opt4_gem_xsum_article_DOC_summary_1.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:3435399eee49540f8339b6e036a6bebeaf3593e68076711f76414a4d66b04ff3 +size 5095798 diff --git a/4b284b12bc4opt4/evaluation/generation/examples.4b284b12bc4opt4_gem_xsum_article_DOC_summary_2.jsonl b/4b284b12bc4opt4/evaluation/generation/examples.4b284b12bc4opt4_gem_xsum_article_DOC_summary_2.jsonl index e69de29bb2d1d6434b8b29ae775ad8c2e48c5391..baf53e5637bc3aeb42699e98e01d8e6ff5780e1e 100644 --- a/4b284b12bc4opt4/evaluation/generation/examples.4b284b12bc4opt4_gem_xsum_article_DOC_summary_2.jsonl +++ b/4b284b12bc4opt4/evaluation/generation/examples.4b284b12bc4opt4_gem_xsum_article_DOC_summary_2.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:b78446a15139be02ec98211ac1ecbc9b75b759f46d1919f168600e931fef592d +size 
7371672 diff --git a/4b284b12bc4opt4/evaluation/generation/examples.4b284b12bc4opt4_gem_xsum_article_DOC_summary_3.jsonl b/4b284b12bc4opt4/evaluation/generation/examples.4b284b12bc4opt4_gem_xsum_article_DOC_summary_3.jsonl index e69de29bb2d1d6434b8b29ae775ad8c2e48c5391..9bedd5fa1fb351bf48638dfda0d9fcf8b8a11d67 100644 --- a/4b284b12bc4opt4/evaluation/generation/examples.4b284b12bc4opt4_gem_xsum_article_DOC_summary_3.jsonl +++ b/4b284b12bc4opt4/evaluation/generation/examples.4b284b12bc4opt4_gem_xsum_article_DOC_summary_3.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:2d17a886b9773eb0676f223d5ebb99d42f0dca94d63ede7b6f6afb1ac3eae821 +size 9636637 diff --git a/4b284b12bc4opt4/evaluation/generation/examples.4b284b12bc4opt4_gem_xsum_article_DOC_summary_4.jsonl b/4b284b12bc4opt4/evaluation/generation/examples.4b284b12bc4opt4_gem_xsum_article_DOC_summary_4.jsonl index e69de29bb2d1d6434b8b29ae775ad8c2e48c5391..c276769d7d3d733e9dedf21cfdaf304612177fbf 100644 --- a/4b284b12bc4opt4/evaluation/generation/examples.4b284b12bc4opt4_gem_xsum_article_DOC_summary_4.jsonl +++ b/4b284b12bc4opt4/evaluation/generation/examples.4b284b12bc4opt4_gem_xsum_article_DOC_summary_4.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:9b449410fefabf66fc62683a1a1da3ea8732082255fe9b16d804f51aae9d4691 +size 11670446 diff --git a/4b284b12bc4opt4/evaluation/generation/examples.4b284b12bc4opt4_gem_xsum_article_DOC_summary_5.jsonl b/4b284b12bc4opt4/evaluation/generation/examples.4b284b12bc4opt4_gem_xsum_article_DOC_summary_5.jsonl index e69de29bb2d1d6434b8b29ae775ad8c2e48c5391..71c77e3926978009d17905e1f36d4b785e9ced55 100644 --- a/4b284b12bc4opt4/evaluation/generation/examples.4b284b12bc4opt4_gem_xsum_article_DOC_summary_5.jsonl +++ b/4b284b12bc4opt4/evaluation/generation/examples.4b284b12bc4opt4_gem_xsum_article_DOC_summary_5.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:4852169b0e5d08dcc43eaf186675789d67fc2abbbad17cf60f7fd3a64c760bad +size 13897509 diff --git a/4b284b12bc4opt4/evaluation/generation/merged.csv b/4b284b12bc4opt4/evaluation/generation/merged.csv new file mode 100644 index 0000000000000000000000000000000000000000..c578de07f1212176b0bdcb4d409370bc27207b2a --- /dev/null +++ b/4b284b12bc4opt4/evaluation/generation/merged.csv @@ -0,0 +1,53 @@ +dataset,fewshots,prompt,metric,value +e2e_nlg_cleaned,0,generate_text_restaurant,rouge2_fmeasure,0.010133641195144875 +e2e_nlg_cleaned,0,median,rouge2_fmeasure,0.010133641195144875 +e2e_nlg_cleaned,1,generate_text_restaurant,rouge2_fmeasure,0.15192303671918772 +e2e_nlg_cleaned,1,median,rouge2_fmeasure,0.15192303671918772 +e2e_nlg_cleaned,2,generate_text_restaurant,rouge2_fmeasure,0.16948726304174097 +e2e_nlg_cleaned,2,median,rouge2_fmeasure,0.16948726304174097 +e2e_nlg_cleaned,3,generate_text_restaurant,rouge2_fmeasure,0.178157082665128 +e2e_nlg_cleaned,3,median,rouge2_fmeasure,0.178157082665128 +e2e_nlg_cleaned,4,generate_text_restaurant,rouge2_fmeasure,0.17801635513525366 +e2e_nlg_cleaned,4,median,rouge2_fmeasure,0.17801635513525366 +e2e_nlg_cleaned,5,generate_text_restaurant,rouge2_fmeasure,0.17668001136142425 +e2e_nlg_cleaned,5,median,rouge2_fmeasure,0.17668001136142425 +e2e_nlg_cleaned,5,average,multiple,0.14406623168631325 +gem_xsum,0,article_DOC_summary,rouge2_fmeasure,0.006837673192649714 +gem_xsum,0,median,rouge2_fmeasure,0.006837673192649714 +gem_xsum,1,article_DOC_summary,rouge2_fmeasure,0.007569232441595493 +gem_xsum,1,median,rouge2_fmeasure,0.007569232441595493 
+gem_xsum,2,article_DOC_summary,rouge2_fmeasure,0.007191687689267013 +gem_xsum,2,median,rouge2_fmeasure,0.007191687689267013 +gem_xsum,3,article_DOC_summary,rouge2_fmeasure,0.007295985375796452 +gem_xsum,3,median,rouge2_fmeasure,0.007295985375796452 +gem_xsum,4,article_DOC_summary,rouge2_fmeasure,0.0018619726116248108 +gem_xsum,4,median,rouge2_fmeasure,0.0018619726116248108 +gem_xsum,5,article_DOC_summary,rouge2_fmeasure,5.360205831903945e-05 +gem_xsum,5,median,rouge2_fmeasure,5.360205831903945e-05 +gem_xsum,5,average,multiple,0.005135025561542087 +web_nlg_en,0,PALM_prompt,rouge2_fmeasure,0.056570429414871386 +web_nlg_en,0,median,rouge2_fmeasure,0.056570429414871386 +web_nlg_en,1,PALM_prompt,rouge2_fmeasure,0.05476675039227378 +web_nlg_en,1,median,rouge2_fmeasure,0.05476675039227378 +web_nlg_en,2,PALM_prompt,rouge2_fmeasure,0.05384033369196494 +web_nlg_en,2,median,rouge2_fmeasure,0.05384033369196494 +web_nlg_en,3,PALM_prompt,rouge2_fmeasure,0.05369363828556127 +web_nlg_en,3,median,rouge2_fmeasure,0.05369363828556127 +web_nlg_en,4,PALM_prompt,rouge2_fmeasure,0.052775854126686714 +web_nlg_en,4,median,rouge2_fmeasure,0.052775854126686714 +web_nlg_en,5,PALM_prompt,rouge2_fmeasure,0.05131186762124571 +web_nlg_en,5,median,rouge2_fmeasure,0.05131186762124571 +web_nlg_en,5,average,multiple,0.053826478922100636 +wiki_lingua_en,0,tldr_en,rouge2_fmeasure,0.004179116314891411 +wiki_lingua_en,0,median,rouge2_fmeasure,0.004179116314891411 +wiki_lingua_en,1,tldr_en,rouge2_fmeasure,0.003531715774833695 +wiki_lingua_en,1,median,rouge2_fmeasure,0.003531715774833695 +wiki_lingua_en,2,tldr_en,rouge2_fmeasure,0.004325117399126492 +wiki_lingua_en,2,median,rouge2_fmeasure,0.004325117399126492 +wiki_lingua_en,3,tldr_en,rouge2_fmeasure,0.004456026661162212 +wiki_lingua_en,3,median,rouge2_fmeasure,0.004456026661162212 +wiki_lingua_en,4,tldr_en,rouge2_fmeasure,0.0016218547111402145 +wiki_lingua_en,4,median,rouge2_fmeasure,0.0016218547111402145 +wiki_lingua_en,5,tldr_en,rouge2_fmeasure,0.0002967006936469681 +wiki_lingua_en,5,median,rouge2_fmeasure,0.0002967006936469681 +wiki_lingua_en,5,average,multiple,0.0030684219258001653 diff --git a/4b284b12bc4opt4/evaluation/generation/merged.json b/4b284b12bc4opt4/evaluation/generation/merged.json new file mode 100644 index 0000000000000000000000000000000000000000..18ff67a44595f2d64498af032767718bfb31eb9d --- /dev/null +++ b/4b284b12bc4opt4/evaluation/generation/merged.json @@ -0,0 +1 @@ +{"GEM/web_nlg_en": {"0": {"PALM_prompt": {"bleu": 0.397060070126241, "bleu_stderr": 0.028476177083435063, "rouge1_fmeasure": 0.11979264664781694, "rouge1_fmeasure_stderr": 0.002158921431338679, "rouge1_precision": 0.08196959243431225, "rouge1_precision_stderr": 0.00195116466928937, "rouge1_recall": 0.33108642729266935, "rouge1_recall_stderr": 0.004857070190117945, "rouge2_fmeasure": 0.056570429414871386, "rouge2_fmeasure_stderr": 0.0013679132720736964, "rouge2_precision": 0.038077755175156366, "rouge2_precision_stderr": 0.0011930918163582719, "rouge2_recall": 0.16233506295685138, "rouge2_recall_stderr": 0.0033998382128665604, "rougeL_fmeasure": 0.11313932552316201, "rougeL_fmeasure_stderr": 0.0019283064282019343, "rougeL_precision": 0.07704554627752275, "rougeL_precision_stderr": 0.001768283838194329, "rougeL_recall": 0.31731175550944546, "rougeL_recall_stderr": 0.004676502198993326, "rougeLsum_fmeasure": 0.11240293894137914, "rougeLsum_fmeasure_stderr": 0.001986128142601576, "rougeLsum_precision": 0.07702671368860729, "rougeLsum_precision_stderr": 0.001838053091960503, "rougeLsum_recall": 
0.3112451044141329, "rougeLsum_recall_stderr": 0.00447435213067716}}, "1": {"PALM_prompt": {"bleu": 0.38706003560361096, "bleu_stderr": 0.02717161163589533, "rouge1_fmeasure": 0.11542957589833107, "rouge1_fmeasure_stderr": 0.0021300218455718844, "rouge1_precision": 0.07729769477815421, "rouge1_precision_stderr": 0.001771654452916722, "rouge1_recall": 0.32003526042529734, "rouge1_recall_stderr": 0.004812626649605805, "rouge2_fmeasure": 0.05476675039227378, "rouge2_fmeasure_stderr": 0.0013633806053571877, "rouge2_precision": 0.036414818702198135, "rouge2_precision_stderr": 0.001111989991849411, "rouge2_recall": 0.15727962630352726, "rouge2_recall_stderr": 0.0033348788622806114, "rougeL_fmeasure": 0.10950390722831477, "rougeL_fmeasure_stderr": 0.0019529650137034304, "rougeL_precision": 0.0729909459967392, "rougeL_precision_stderr": 0.0016200898181591467, "rougeL_recall": 0.30701344301265726, "rougeL_recall_stderr": 0.004658701295786162, "rougeLsum_fmeasure": 0.1085381091863139, "rougeLsum_fmeasure_stderr": 0.0019824670507400145, "rougeLsum_precision": 0.07274875626049643, "rougeLsum_precision_stderr": 0.0016668638545293297, "rougeLsum_recall": 0.30071552450108774, "rougeLsum_recall_stderr": 0.004440903116419319}}, "2": {"PALM_prompt": {"bleu": 0.35329168677191275, "bleu_stderr": 0.03390701669686637, "rouge1_fmeasure": 0.11535960559109319, "rouge1_fmeasure_stderr": 0.002056790821407062, "rouge1_precision": 0.07636623292264108, "rouge1_precision_stderr": 0.0016638254935029849, "rouge1_recall": 0.3287961533258121, "rouge1_recall_stderr": 0.004764326327026456, "rouge2_fmeasure": 0.05384033369196494, "rouge2_fmeasure_stderr": 0.0013019177568197539, "rouge2_precision": 0.035234171439883326, "rouge2_precision_stderr": 0.0009656833181951659, "rouge2_recall": 0.15891884434217565, "rouge2_recall_stderr": 0.003347326475949424, "rougeL_fmeasure": 0.10900458168967593, "rougeL_fmeasure_stderr": 0.0018752253984042533, "rougeL_precision": 0.07191370056044478, "rougeL_precision_stderr": 0.00151309831707199, "rougeL_recall": 0.3139667684953094, "rougeL_recall_stderr": 0.004595120348556269, "rougeLsum_fmeasure": 0.10835178632862826, "rougeLsum_fmeasure_stderr": 0.0019132850106397554, "rougeLsum_precision": 0.07180086441278799, "rougeLsum_precision_stderr": 0.0015621362000957913, "rougeLsum_recall": 0.308419107397528, "rougeLsum_recall_stderr": 0.004350924818476407}}, "3": {"PALM_prompt": {"bleu": 0.39131628859321554, "bleu_stderr": 0.026313116002416113, "rouge1_fmeasure": 0.11391076313881403, "rouge1_fmeasure_stderr": 0.0020936343693923527, "rouge1_precision": 0.07564940705521785, "rouge1_precision_stderr": 0.0016901779366449113, "rouge1_recall": 0.3252629322591383, "rouge1_recall_stderr": 0.0047859412578955705, "rouge2_fmeasure": 0.05369363828556127, "rouge2_fmeasure_stderr": 0.0013209945591844108, "rouge2_precision": 0.03538953748428128, "rouge2_precision_stderr": 0.0010004009583260043, "rouge2_recall": 0.1567691792380991, "rouge2_recall_stderr": 0.003336003242576354, "rougeL_fmeasure": 0.1076674209928578, "rougeL_fmeasure_stderr": 0.0019026187637810328, "rougeL_precision": 0.07119946415387049, "rougeL_precision_stderr": 0.0015306573606216481, "rougeL_recall": 0.3103716691791309, "rougeL_recall_stderr": 0.004600670503840457, "rougeLsum_fmeasure": 0.10699621950964268, "rougeLsum_fmeasure_stderr": 0.0019384608416758112, "rougeLsum_precision": 0.07110671643080219, "rougeLsum_precision_stderr": 0.0015812267457522326, "rougeLsum_recall": 0.30565042167460654, "rougeLsum_recall_stderr": 0.004393496455416831}}, "4": 
{"PALM_prompt": {"bleu": 0.37052755017666045, "bleu_stderr": 0.022042441052051973, "rouge1_fmeasure": 0.11339049154327531, "rouge1_fmeasure_stderr": 0.002096303472052317, "rouge1_precision": 0.0748616883094487, "rouge1_precision_stderr": 0.0016268791916451487, "rouge1_recall": 0.32859336280130663, "rouge1_recall_stderr": 0.004763314479505647, "rouge2_fmeasure": 0.052775854126686714, "rouge2_fmeasure_stderr": 0.0013204579265960609, "rouge2_precision": 0.03464054004561626, "rouge2_precision_stderr": 0.0009949762918573578, "rouge2_recall": 0.157960165962179, "rouge2_recall_stderr": 0.003381278298836932, "rougeL_fmeasure": 0.1064438415118096, "rougeL_fmeasure_stderr": 0.001899753706201498, "rougeL_precision": 0.0700069922960969, "rougeL_precision_stderr": 0.001456781799696283, "rougeL_recall": 0.31134498667272115, "rougeL_recall_stderr": 0.004535027041515096, "rougeLsum_fmeasure": 0.10675864203685105, "rougeLsum_fmeasure_stderr": 0.0019563539894181738, "rougeLsum_precision": 0.07049420755664745, "rougeLsum_precision_stderr": 0.001517990950279307, "rougeLsum_recall": 0.3098059888344558, "rougeLsum_recall_stderr": 0.004434437168281575}}, "5": {"PALM_prompt": {"bleu": 0.35995256263213665, "bleu_stderr": 0.03764670650407796, "rouge1_fmeasure": 0.11087540816858653, "rouge1_fmeasure_stderr": 0.0020724670013026, "rouge1_precision": 0.07365790588568459, "rouge1_precision_stderr": 0.0017121949707532308, "rouge1_recall": 0.3228153530836222, "rouge1_recall_stderr": 0.004694269135211919, "rouge2_fmeasure": 0.05131186762124571, "rouge2_fmeasure_stderr": 0.001301751075510561, "rouge2_precision": 0.03370563835754997, "rouge2_precision_stderr": 0.000982560364849555, "rouge2_recall": 0.15192330946847016, "rouge2_recall_stderr": 0.0033144184826216097, "rougeL_fmeasure": 0.10413194280026901, "rougeL_fmeasure_stderr": 0.0018759453675962236, "rougeL_precision": 0.06873278486701989, "rougeL_precision_stderr": 0.0014938317734199177, "rougeL_recall": 0.3061441130232553, "rougeL_recall_stderr": 0.00448586985770301, "rougeLsum_fmeasure": 0.10388096228686042, "rougeLsum_fmeasure_stderr": 0.0019168887238182983, "rougeLsum_precision": 0.06886164410317586, "rougeLsum_precision_stderr": 0.0015474980166708774, "rougeLsum_recall": 0.30297784804468764, "rougeLsum_recall_stderr": 0.004342846233116827}}}, "GEM/wiki_lingua_en": {"0": {"tldr_en": {"bleu": 0.06405097746568524, "bleu_stderr": 0.014294784040984042, "rouge1_fmeasure": 0.07993097912178979, "rouge1_fmeasure_stderr": 0.0011136701948783593, "rouge1_precision": 0.06974904696556859, "rouge1_precision_stderr": 0.0011614879719000074, "rouge1_recall": 0.11241125442105249, "rouge1_recall_stderr": 0.0014600463110962813, "rouge2_fmeasure": 0.004179116314891411, "rouge2_fmeasure_stderr": 0.00020937586745221478, "rouge2_precision": 0.0038814664404484087, "rouge2_precision_stderr": 0.00020315036956781887, "rouge2_recall": 0.005354585862023678, "rouge2_recall_stderr": 0.0002836354871500975, "rougeL_fmeasure": 0.0763332930371866, "rougeL_fmeasure_stderr": 0.001024817413877872, "rougeL_precision": 0.06608672332133624, "rougeL_precision_stderr": 0.001045278233772761, "rougeL_recall": 0.10847163328970093, "rougeL_recall_stderr": 0.0014019598783824951, "rougeLsum_fmeasure": 0.07455065795690266, "rougeLsum_fmeasure_stderr": 0.0010195929073391948, "rougeLsum_precision": 0.06486651184386755, "rougeLsum_precision_stderr": 0.001063656610383382, "rougeLsum_recall": 0.10550878356686602, "rougeLsum_recall_stderr": 0.001363292887440063}}, "1": {"tldr_en": {"bleu": 0.08853135918871324, 
"bleu_stderr": 0.020831959008675835, "rouge1_fmeasure": 0.09272818816907549, "rouge1_fmeasure_stderr": 0.0011490248194294334, "rouge1_precision": 0.07967563985483712, "rouge1_precision_stderr": 0.0011981791486243532, "rouge1_recall": 0.13333453575974324, "rouge1_recall_stderr": 0.0015456880808194433, "rouge2_fmeasure": 0.003531715774833695, "rouge2_fmeasure_stderr": 0.00019054232118964935, "rouge2_precision": 0.0031182028953535613, "rouge2_precision_stderr": 0.0001763228556558668, "rouge2_recall": 0.005182712565249269, "rouge2_recall_stderr": 0.00033464377765989186, "rougeL_fmeasure": 0.08573108085907154, "rougeL_fmeasure_stderr": 0.0009959675791518386, "rougeL_precision": 0.07295443320214035, "rougeL_precision_stderr": 0.0010133973838849122, "rougeL_recall": 0.1250877049808009, "rougeL_recall_stderr": 0.0014330876586568309, "rougeLsum_fmeasure": 0.08666221587131816, "rougeLsum_fmeasure_stderr": 0.0010615368892029633, "rougeLsum_precision": 0.07428908435430764, "rougeLsum_precision_stderr": 0.0011046337929190013, "rougeLsum_recall": 0.1251193589727116, "rougeLsum_recall_stderr": 0.0014467114410570768}}, "2": {"tldr_en": {"bleu": 0.16015334774145826, "bleu_stderr": 0.023734817784982588, "rouge1_fmeasure": 0.09561825103521294, "rouge1_fmeasure_stderr": 0.0011752321521538633, "rouge1_precision": 0.0821044769548865, "rouge1_precision_stderr": 0.0012109023007897738, "rouge1_recall": 0.13827872380494466, "rouge1_recall_stderr": 0.0016474383605086832, "rouge2_fmeasure": 0.004325117399126492, "rouge2_fmeasure_stderr": 0.0002352094266785428, "rouge2_precision": 0.0037438880758047005, "rouge2_precision_stderr": 0.00021045724159333664, "rouge2_recall": 0.006616313525097841, "rouge2_recall_stderr": 0.00042267274141401527, "rougeL_fmeasure": 0.08768328186526594, "rougeL_fmeasure_stderr": 0.0010046872049285587, "rougeL_precision": 0.07475764548912422, "rougeL_precision_stderr": 0.0010236395736361035, "rougeL_recall": 0.1281885979875213, "rougeL_recall_stderr": 0.0014803317472057867, "rougeLsum_fmeasure": 0.08802189261932684, "rougeLsum_fmeasure_stderr": 0.0010727115935503444, "rougeLsum_precision": 0.07537568417123489, "rougeLsum_precision_stderr": 0.0011026893753818762, "rougeLsum_recall": 0.12800436919065455, "rougeLsum_recall_stderr": 0.0015289279450751802}}, "3": {"tldr_en": {"bleu": 0.2614442095298893, "bleu_stderr": 0.028570775000506844, "rouge1_fmeasure": 0.08208715110302196, "rouge1_fmeasure_stderr": 0.0012831764702565588, "rouge1_precision": 0.07398914178321388, "rouge1_precision_stderr": 0.0014407291212392703, "rouge1_recall": 0.1182771258048201, "rouge1_recall_stderr": 0.0018198887142112982, "rouge2_fmeasure": 0.004456026661162212, "rouge2_fmeasure_stderr": 0.0002758499131281591, "rouge2_precision": 0.004020145992604983, "rouge2_precision_stderr": 0.00026592766536485463, "rouge2_recall": 0.006665406872296463, "rouge2_recall_stderr": 0.0004654137487344057, "rougeL_fmeasure": 0.07512642856203387, "rougeL_fmeasure_stderr": 0.001099007590251846, "rougeL_precision": 0.06723906063892546, "rougeL_precision_stderr": 0.0012458979372848018, "rougeL_recall": 0.10954540525765066, "rougeL_recall_stderr": 0.0016400029407562262, "rougeLsum_fmeasure": 0.07638882046568143, "rougeLsum_fmeasure_stderr": 0.0011908718046903589, "rougeLsum_precision": 0.0688188354810642, "rougeLsum_precision_stderr": 0.0013457162556628375, "rougeLsum_recall": 0.11040591385464732, "rougeLsum_recall_stderr": 0.0017010360876035554}}, "4": {"tldr_en": {"bleu": 0.06670432998840774, "bleu_stderr": 0.010834402499150991, 
"rouge1_fmeasure": 0.02864898102166578, "rouge1_fmeasure_stderr": 0.0010448235312037137, "rouge1_precision": 0.0280761104871091, "rouge1_precision_stderr": 0.0012926400012494954, "rouge1_recall": 0.04256550951472789, "rouge1_recall_stderr": 0.0015662670220772453, "rouge2_fmeasure": 0.0016218547111402145, "rouge2_fmeasure_stderr": 0.00016643077121423323, "rouge2_precision": 0.0015031467154335884, "rouge2_precision_stderr": 0.00016818655996840296, "rouge2_recall": 0.002661081711747737, "rouge2_recall_stderr": 0.0003367092996858654, "rougeL_fmeasure": 0.025906000131277095, "rougeL_fmeasure_stderr": 0.0009115419100498881, "rougeL_precision": 0.025283329619297263, "rougeL_precision_stderr": 0.00116903072674806, "rougeL_recall": 0.039170970833320586, "rougeL_recall_stderr": 0.0014262971628748195, "rougeLsum_fmeasure": 0.02654490920386107, "rougeLsum_fmeasure_stderr": 0.0009630631658391304, "rougeLsum_precision": 0.026006848089608563, "rougeLsum_precision_stderr": 0.0011953058942308498, "rougeLsum_recall": 0.039653057024522397, "rougeLsum_recall_stderr": 0.0014650811160356375}}, "5": {"tldr_en": {"bleu": 1.6156436086606036e-07, "bleu_stderr": 3.807467277955187e-07, "rouge1_fmeasure": 0.004918846421495546, "rouge1_fmeasure_stderr": 0.0005110461047249682, "rouge1_precision": 0.0049355335861678085, "rouge1_precision_stderr": 0.0005700406667338665, "rouge1_recall": 0.007089024132379281, "rouge1_recall_stderr": 0.0007092663255506791, "rouge2_fmeasure": 0.0002967006936469681, "rouge2_fmeasure_stderr": 8.974945913013664e-05, "rouge2_precision": 0.000304156454074688, "rouge2_precision_stderr": 0.00010109123411176751, "rouge2_recall": 0.0004177630400024621, "rouge2_recall_stderr": 0.0001288667888548727, "rougeL_fmeasure": 0.00441760659640342, "rougeL_fmeasure_stderr": 0.00043438930707966116, "rougeL_precision": 0.004402996255292156, "rougeL_precision_stderr": 0.00048523970345924354, "rougeL_recall": 0.0065174776893015506, "rougeL_recall_stderr": 0.0006367984657316963, "rougeLsum_fmeasure": 0.0045788303091290556, "rougeLsum_fmeasure_stderr": 0.0004707439731624039, "rougeLsum_precision": 0.004598567341994099, "rougeLsum_precision_stderr": 0.0005286763909225495, "rougeLsum_recall": 0.006622774214457113, "rougeLsum_recall_stderr": 0.0006621175203479033}}}, "e2e_nlg_cleaned": {"0": {"generate_text_restaurant": {"bleu": 0.023831548566168672, "bleu_stderr": 0.00013669989610831787, "rouge1_fmeasure": 0.06253160637445583, "rouge1_fmeasure_stderr": 0.0006035775811340659, "rouge1_precision": 0.05449305285889657, "rouge1_precision_stderr": 0.0007682171874626243, "rouge1_recall": 0.09611446455866401, "rouge1_recall_stderr": 0.0009371035323738266, "rouge2_fmeasure": 0.010133641195144875, "rouge2_fmeasure_stderr": 0.0002601808393175606, "rouge2_precision": 0.007752886867383771, "rouge2_precision_stderr": 0.00022369481483085416, "rouge2_recall": 0.015703783259365008, "rouge2_recall_stderr": 0.0004099918025863708, "rougeL_fmeasure": 0.06247837072122016, "rougeL_fmeasure_stderr": 0.0006020906398104155, "rougeL_precision": 0.05437436599020971, "rougeL_precision_stderr": 0.0007584161707705096, "rougeL_recall": 0.09607649971755518, "rougeL_recall_stderr": 0.0009370784251490342, "rougeLsum_fmeasure": 0.05776879458207875, "rougeLsum_fmeasure_stderr": 0.000537142122378467, "rougeLsum_precision": 0.050869285779818474, "rougeLsum_precision_stderr": 0.0007452830238007919, "rougeLsum_recall": 0.08872183965001415, "rougeLsum_recall_stderr": 0.0008251902983264386}}, "1": {"generate_text_restaurant": {"bleu": 7.5716891789423135, 
"bleu_stderr": 0.07657233541931847, "rouge1_fmeasure": 0.378589775338143, "rouge1_fmeasure_stderr": 0.0019484786742807038, "rouge1_precision": 0.37675073670067255, "rouge1_precision_stderr": 0.0024649179184822933, "rouge1_recall": 0.4196033967756775, "rouge1_recall_stderr": 0.002554821342455347, "rouge2_fmeasure": 0.15192303671918772, "rouge2_fmeasure_stderr": 0.0014410254284747698, "rouge2_precision": 0.1515787193445361, "rouge2_precision_stderr": 0.0016212252559065935, "rouge2_recall": 0.16960708192000204, "rouge2_recall_stderr": 0.001740739006568578, "rougeL_fmeasure": 0.25806597930638026, "rougeL_fmeasure_stderr": 0.0014774627085457283, "rougeL_precision": 0.2562623240157523, "rougeL_precision_stderr": 0.0018156767746927076, "rougeL_recall": 0.2883651325055159, "rougeL_recall_stderr": 0.002017743649477281, "rougeLsum_fmeasure": 0.3135332346518358, "rougeLsum_fmeasure_stderr": 0.0018186234521200878, "rougeLsum_precision": 0.3119796155675665, "rougeLsum_precision_stderr": 0.002197372081552806, "rougeLsum_recall": 0.34777409514067237, "rougeLsum_recall_stderr": 0.0023378237968652736}}, "2": {"generate_text_restaurant": {"bleu": 8.415825253964716, "bleu_stderr": 0.15234172654187825, "rouge1_fmeasure": 0.39680880458063933, "rouge1_fmeasure_stderr": 0.0018153515680263594, "rouge1_precision": 0.39718229879297745, "rouge1_precision_stderr": 0.0023267859874187644, "rouge1_recall": 0.431821762098323, "rouge1_recall_stderr": 0.0024735326081453987, "rouge2_fmeasure": 0.16948726304174097, "rouge2_fmeasure_stderr": 0.0014659320427177205, "rouge2_precision": 0.16935200061749783, "rouge2_precision_stderr": 0.0016483718369393143, "rouge2_recall": 0.1866765784720031, "rouge2_recall_stderr": 0.001801459763033518, "rougeL_fmeasure": 0.2687904361168554, "rougeL_fmeasure_stderr": 0.0014803733340222947, "rougeL_precision": 0.26901190247512224, "rougeL_precision_stderr": 0.0018220891410013944, "rougeL_recall": 0.2939323384016312, "rougeL_recall_stderr": 0.002026079597108763, "rougeLsum_fmeasure": 0.32628931648104437, "rougeLsum_fmeasure_stderr": 0.0017352819339899982, "rougeLsum_precision": 0.32665277498400797, "rougeLsum_precision_stderr": 0.0021157313290731425, "rougeLsum_recall": 0.35541359697456837, "rougeLsum_recall_stderr": 0.0022949279797004926}}, "3": {"generate_text_restaurant": {"bleu": 9.053715150693384, "bleu_stderr": 0.06980100462430686, "rouge1_fmeasure": 0.40313857771935285, "rouge1_fmeasure_stderr": 0.001805737386969938, "rouge1_precision": 0.402912961249351, "rouge1_precision_stderr": 0.002311451156821765, "rouge1_recall": 0.4381203339806578, "rouge1_recall_stderr": 0.002475121033893737, "rouge2_fmeasure": 0.178157082665128, "rouge2_fmeasure_stderr": 0.0015223176148801163, "rouge2_precision": 0.17782567314062503, "rouge2_precision_stderr": 0.0016671508958511524, "rouge2_recall": 0.19584958268974514, "rouge2_recall_stderr": 0.0018895308376705284, "rougeL_fmeasure": 0.27413718639696194, "rougeL_fmeasure_stderr": 0.0015353598026852135, "rougeL_precision": 0.2736571491639734, "rougeL_precision_stderr": 0.0018305403535992246, "rougeL_recall": 0.2995352347018479, "rougeL_recall_stderr": 0.0020969000115146907, "rougeLsum_fmeasure": 0.3312412837590075, "rougeLsum_fmeasure_stderr": 0.0017721110137833955, "rougeLsum_precision": 0.3307097275531657, "rougeLsum_precision_stderr": 0.002114092970242258, "rougeLsum_recall": 0.3606968160198456, "rougeLsum_recall_stderr": 0.0023518211594742116}}, "4": {"generate_text_restaurant": {"bleu": 8.992864407073405, "bleu_stderr": 0.08770475097615464, 
"rouge1_fmeasure": 0.4013041193656425, "rouge1_fmeasure_stderr": 0.0018012199210851237, "rouge1_precision": 0.4035902410295868, "rouge1_precision_stderr": 0.002314326089880622, "rouge1_recall": 0.43366816643506345, "rouge1_recall_stderr": 0.002417145513082443, "rouge2_fmeasure": 0.17801635513525366, "rouge2_fmeasure_stderr": 0.0015174150039881623, "rouge2_precision": 0.17901955155397473, "rouge2_precision_stderr": 0.0016801289447732556, "rouge2_recall": 0.19422772465440907, "rouge2_recall_stderr": 0.001846206730549891, "rougeL_fmeasure": 0.2740436937683373, "rougeL_fmeasure_stderr": 0.0015578070729464124, "rougeL_precision": 0.2749826532734801, "rougeL_precision_stderr": 0.0018385761944167643, "rougeL_recall": 0.29799763054527917, "rougeL_recall_stderr": 0.00209039090294883, "rougeLsum_fmeasure": 0.3294316503713183, "rougeLsum_fmeasure_stderr": 0.0017961731530620674, "rougeLsum_precision": 0.3308350462575971, "rougeLsum_precision_stderr": 0.002138455916629113, "rougeLsum_recall": 0.35699098749445574, "rougeLsum_recall_stderr": 0.0023463522382987587}}, "5": {"generate_text_restaurant": {"bleu": 8.814480003636467, "bleu_stderr": 0.16384652517645307, "rouge1_fmeasure": 0.3997657675145631, "rouge1_fmeasure_stderr": 0.0018195701621308699, "rouge1_precision": 0.40351181993098634, "rouge1_precision_stderr": 0.0023784770515124913, "rouge1_recall": 0.430473586962648, "rouge1_recall_stderr": 0.002372480142494689, "rouge2_fmeasure": 0.17668001136142425, "rouge2_fmeasure_stderr": 0.0015118431601118265, "rouge2_precision": 0.1784847314937123, "rouge2_precision_stderr": 0.0016941969329593459, "rouge2_recall": 0.19174384124754476, "rouge2_recall_stderr": 0.0018048604307058427, "rougeL_fmeasure": 0.27542991361833463, "rougeL_fmeasure_stderr": 0.0015311724080949638, "rougeL_precision": 0.27762946990242493, "rougeL_precision_stderr": 0.0018754875427661021, "rougeL_recall": 0.2983245741357427, "rougeL_recall_stderr": 0.0020162838897830222, "rougeLsum_fmeasure": 0.3287311122356614, "rougeLsum_fmeasure_stderr": 0.001782259806323467, "rougeLsum_precision": 0.3318650598753267, "rougeLsum_precision_stderr": 0.0021960345461577626, "rougeLsum_recall": 0.3541002511978268, "rougeLsum_recall_stderr": 0.0022290840532188166}}}, "gem_xsum": {"0": {"article_DOC_summary": {"bleu": 0.12092109985195117, "bleu_stderr": 0.034246881577404256, "rouge1_fmeasure": 0.10882608435694785, "rouge1_fmeasure_stderr": 0.001462837537770478, "rouge1_precision": 0.07925362926398048, "rouge1_precision_stderr": 0.0011370372081177169, "rouge1_recall": 0.18098825098858384, "rouge1_recall_stderr": 0.0023006406434685736, "rouge2_fmeasure": 0.006837673192649714, "rouge2_fmeasure_stderr": 0.000442612615175873, "rouge2_precision": 0.0049500024419751225, "rouge2_precision_stderr": 0.0003221010027610884, "rouge2_recall": 0.011519060394160629, "rouge2_recall_stderr": 0.0007552222759138258, "rougeL_fmeasure": 0.09494170160398116, "rougeL_fmeasure_stderr": 0.0012132340582403285, "rougeL_precision": 0.06892683313731025, "rougeL_precision_stderr": 0.000932076327601901, "rougeL_recall": 0.15911626707843118, "rougeL_recall_stderr": 0.0020012317128335327, "rougeLsum_fmeasure": 0.0899394725428077, "rougeLsum_fmeasure_stderr": 0.0012023093408387983, "rougeLsum_precision": 0.06534032138757281, "rougeLsum_precision_stderr": 0.000925024745716427, "rougeLsum_recall": 0.1505792139257626, "rougeLsum_recall_stderr": 0.001967931673316114}}, "1": {"article_DOC_summary": {"bleu": 0.13415861030855933, "bleu_stderr": 0.03178941606459842, "rouge1_fmeasure": 
0.09933976208790961, "rouge1_fmeasure_stderr": 0.0015105268621052712, "rouge1_precision": 0.07136700780151199, "rouge1_precision_stderr": 0.0011530148021735068, "rouge1_recall": 0.17061111634547732, "rouge1_recall_stderr": 0.0024515335014419462, "rouge2_fmeasure": 0.007569232441595493, "rouge2_fmeasure_stderr": 0.0004654857633003587, "rouge2_precision": 0.005417921285791925, "rouge2_precision_stderr": 0.00033592492027509244, "rouge2_recall": 0.013060194907812957, "rouge2_recall_stderr": 0.0008110410377500807, "rougeL_fmeasure": 0.08986801887882968, "rougeL_fmeasure_stderr": 0.0012683121784435572, "rougeL_precision": 0.06437885934626987, "rougeL_precision_stderr": 0.0009594061202846054, "rougeL_recall": 0.1554012098707474, "rougeL_recall_stderr": 0.0021362266435883126, "rougeLsum_fmeasure": 0.08245663882625462, "rougeLsum_fmeasure_stderr": 0.0012172551546565829, "rougeLsum_precision": 0.059106334424700216, "rougeLsum_precision_stderr": 0.000921780599854098, "rougeLsum_recall": 0.14253712577743585, "rougeLsum_recall_stderr": 0.002053226455737736}}, "2": {"article_DOC_summary": {"bleu": 0.12926835061658584, "bleu_stderr": 0.031144890827347994, "rouge1_fmeasure": 0.09827283354103876, "rouge1_fmeasure_stderr": 0.0014779249302482205, "rouge1_precision": 0.07045755744978599, "rouge1_precision_stderr": 0.0011197061035798305, "rouge1_recall": 0.1695166104908529, "rouge1_recall_stderr": 0.0024490112684960375, "rouge2_fmeasure": 0.007191687689267013, "rouge2_fmeasure_stderr": 0.0004416289821760009, "rouge2_precision": 0.005139282932076346, "rouge2_precision_stderr": 0.00031501935520477996, "rouge2_recall": 0.01241481071541199, "rouge2_recall_stderr": 0.0007816833218457778, "rougeL_fmeasure": 0.08935199909306543, "rougeL_fmeasure_stderr": 0.0012717361942556741, "rougeL_precision": 0.06394661959092474, "rougeL_precision_stderr": 0.0009610100837351401, "rougeL_recall": 0.15483855207760336, "rougeL_recall_stderr": 0.0021492840237945345, "rougeLsum_fmeasure": 0.08188744465829662, "rougeLsum_fmeasure_stderr": 0.0012028366586011006, "rougeLsum_precision": 0.0585812823989726, "rougeLsum_precision_stderr": 0.0009045085352227196, "rougeLsum_recall": 0.14210183912770377, "rougeLsum_recall_stderr": 0.0020497549943373035}}, "3": {"article_DOC_summary": {"bleu": 0.19243170077336127, "bleu_stderr": 0.029553440444076876, "rouge1_fmeasure": 0.09447682363371723, "rouge1_fmeasure_stderr": 0.0016497035224024978, "rouge1_precision": 0.07075379884252816, "rouge1_precision_stderr": 0.001485432353726185, "rouge1_recall": 0.15768815281906948, "rouge1_recall_stderr": 0.00265924403959227, "rouge2_fmeasure": 0.007295985375796452, "rouge2_fmeasure_stderr": 0.00047973371921657647, "rouge2_precision": 0.005384789457002967, "rouge2_precision_stderr": 0.00038433889596432803, "rouge2_recall": 0.01255287060081809, "rouge2_recall_stderr": 0.0008379579770883962, "rougeL_fmeasure": 0.08484720398405765, "rougeL_fmeasure_stderr": 0.0013808160236295816, "rougeL_precision": 0.06325092101401272, "rougeL_precision_stderr": 0.0012434648427272113, "rougeL_recall": 0.14272917123385326, "rougeL_recall_stderr": 0.0022892155927241712, "rougeLsum_fmeasure": 0.07823287562318222, "rougeLsum_fmeasure_stderr": 0.001328680406843208, "rougeLsum_precision": 0.05848048220762729, "rougeLsum_precision_stderr": 0.0012154422756537036, "rougeLsum_recall": 0.13155323370681296, "rougeLsum_recall_stderr": 0.0022039878122268345}}, "4": {"article_DOC_summary": {"bleu": 0.052615863248391834, "bleu_stderr": 0.009113330815053633, "rouge1_fmeasure": 0.02608363281745486, 
"rouge1_fmeasure_stderr": 0.0015378702997621066, "rouge1_precision": 0.0230606329148841, "rouge1_precision_stderr": 0.001669556268708695, "rouge1_recall": 0.03931050113194901, "rouge1_recall_stderr": 0.002276662240791683, "rouge2_fmeasure": 0.0018619726116248108, "rouge2_fmeasure_stderr": 0.00026399269918420587, "rouge2_precision": 0.0015648182940910817, "rouge2_precision_stderr": 0.0002556263421133481, "rouge2_recall": 0.002830258508910678, "rouge2_recall_stderr": 0.0003872563374046439, "rougeL_fmeasure": 0.02355261113960695, "rougeL_fmeasure_stderr": 0.001364331759867035, "rougeL_precision": 0.020423048767215433, "rougeL_precision_stderr": 0.0013721487737836427, "rougeL_recall": 0.03591218470026906, "rougeL_recall_stderr": 0.00206779739972093, "rougeLsum_fmeasure": 0.021609618867255508, "rougeLsum_fmeasure_stderr": 0.0012726837713074035, "rougeLsum_precision": 0.01899394741047602, "rougeLsum_precision_stderr": 0.0013253063930260193, "rougeLsum_recall": 0.032752320378036076, "rougeLsum_recall_stderr": 0.0019045379991497096}}, "5": {"article_DOC_summary": {"bleu": 5.10720007616051e-38, "bleu_stderr": 3.031931667498303e-33, "rouge1_fmeasure": 0.0018545887592808419, "rouge1_fmeasure_stderr": 0.0004835971603513197, "rouge1_precision": 0.002042535086594319, "rouge1_precision_stderr": 0.0005339034695671533, "rouge1_recall": 0.0017891870168785233, "rouge1_recall_stderr": 0.0004800337144125977, "rouge2_fmeasure": 5.360205831903945e-05, "rouge2_fmeasure_stderr": 5.360205831903959e-05, "rouge2_precision": 5.360205831903945e-05, "rouge2_precision_stderr": 5.360205831903959e-05, "rouge2_recall": 5.360205831903945e-05, "rouge2_recall_stderr": 5.360205831903959e-05, "rougeL_fmeasure": 0.0014859696373791518, "rougeL_fmeasure_stderr": 0.00039548761545587586, "rougeL_precision": 0.0016326522715622547, "rougeL_precision_stderr": 0.00043592654264051184, "rougeL_recall": 0.001419705972958375, "rougeL_recall_stderr": 0.00037990440625202293, "rougeLsum_fmeasure": 0.0015279556773790943, "rougeLsum_fmeasure_stderr": 0.00041167209717655475, "rougeLsum_precision": 0.0016749881249852043, "rougeLsum_precision_stderr": 0.00045147167577573214, "rougeLsum_recall": 0.0014642325505038205, "rougeLsum_recall_stderr": 0.00039829208519008764}}}} \ No newline at end of file diff --git a/4b284b12bc4opt4/evaluation/generation/slim.4b284b12bc4opt4_GEM-web_nlg_en_PALM_prompt_0.json b/4b284b12bc4opt4/evaluation/generation/slim.4b284b12bc4opt4_GEM-web_nlg_en_PALM_prompt_0.json new file mode 100644 index 0000000000000000000000000000000000000000..8a8eb884125c0c00433daa13359d67ab16b52110 --- /dev/null +++ b/4b284b12bc4opt4/evaluation/generation/slim.4b284b12bc4opt4_GEM-web_nlg_en_PALM_prompt_0.json @@ -0,0 +1,133 @@ +{ + "results": [ + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "PALM_prompt", + "bleu": 0.397060070126241, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "bleu_stderr": 0.028476177083435063 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "PALM_prompt", + "rouge1_precision": 0.08196959243431225, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rouge1_precision_stderr": 0.00195116466928937 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "PALM_prompt", + "rouge1_recall": 0.33108642729266935, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rouge1_recall_stderr": 0.004857070190117945 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "PALM_prompt", + "rouge1_fmeasure": 0.11979264664781694, + 
"dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rouge1_fmeasure_stderr": 0.002158921431338679 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "PALM_prompt", + "rouge2_precision": 0.038077755175156366, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rouge2_precision_stderr": 0.0011930918163582719 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "PALM_prompt", + "rouge2_recall": 0.16233506295685138, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rouge2_recall_stderr": 0.0033998382128665604 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "PALM_prompt", + "rouge2_fmeasure": 0.056570429414871386, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rouge2_fmeasure_stderr": 0.0013679132720736964 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "PALM_prompt", + "rougeL_precision": 0.07704554627752275, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rougeL_precision_stderr": 0.001768283838194329 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "PALM_prompt", + "rougeL_recall": 0.31731175550944546, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rougeL_recall_stderr": 0.004676502198993326 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "PALM_prompt", + "rougeL_fmeasure": 0.11313932552316201, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rougeL_fmeasure_stderr": 0.0019283064282019343 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "PALM_prompt", + "rougeLsum_precision": 0.07702671368860729, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rougeLsum_precision_stderr": 0.001838053091960503 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "PALM_prompt", + "rougeLsum_recall": 0.3112451044141329, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rougeLsum_recall_stderr": 0.00447435213067716 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "PALM_prompt", + "rougeLsum_fmeasure": 0.11240293894137914, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rougeLsum_fmeasure_stderr": 0.001986128142601576 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b12bc4opt4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 0, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b12bc4opt4/evaluation/generation/slim.4b284b12bc4opt4_GEM-web_nlg_en_PALM_prompt_1.json b/4b284b12bc4opt4/evaluation/generation/slim.4b284b12bc4opt4_GEM-web_nlg_en_PALM_prompt_1.json new file mode 100644 index 0000000000000000000000000000000000000000..7b7e3b8b327ede0d6e94ebb262e24fb5319f11b0 --- /dev/null +++ b/4b284b12bc4opt4/evaluation/generation/slim.4b284b12bc4opt4_GEM-web_nlg_en_PALM_prompt_1.json @@ -0,0 +1,133 @@ +{ + "results": [ + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "PALM_prompt", + "bleu": 0.38706003560361096, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "bleu_stderr": 0.02717161163589533 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": 
"PALM_prompt", + "rouge1_precision": 0.07729769477815421, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rouge1_precision_stderr": 0.001771654452916722 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "PALM_prompt", + "rouge1_recall": 0.32003526042529734, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rouge1_recall_stderr": 0.004812626649605805 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "PALM_prompt", + "rouge1_fmeasure": 0.11542957589833107, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rouge1_fmeasure_stderr": 0.0021300218455718844 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "PALM_prompt", + "rouge2_precision": 0.036414818702198135, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rouge2_precision_stderr": 0.001111989991849411 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "PALM_prompt", + "rouge2_recall": 0.15727962630352726, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rouge2_recall_stderr": 0.0033348788622806114 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "PALM_prompt", + "rouge2_fmeasure": 0.05476675039227378, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rouge2_fmeasure_stderr": 0.0013633806053571877 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "PALM_prompt", + "rougeL_precision": 0.0729909459967392, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rougeL_precision_stderr": 0.0016200898181591467 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "PALM_prompt", + "rougeL_recall": 0.30701344301265726, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rougeL_recall_stderr": 0.004658701295786162 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "PALM_prompt", + "rougeL_fmeasure": 0.10950390722831477, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rougeL_fmeasure_stderr": 0.0019529650137034304 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "PALM_prompt", + "rougeLsum_precision": 0.07274875626049643, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rougeLsum_precision_stderr": 0.0016668638545293297 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "PALM_prompt", + "rougeLsum_recall": 0.30071552450108774, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rougeLsum_recall_stderr": 0.004440903116419319 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "PALM_prompt", + "rougeLsum_fmeasure": 0.1085381091863139, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rougeLsum_fmeasure_stderr": 0.0019824670507400145 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b12bc4opt4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 1, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b12bc4opt4/evaluation/generation/slim.4b284b12bc4opt4_GEM-web_nlg_en_PALM_prompt_2.json 
b/4b284b12bc4opt4/evaluation/generation/slim.4b284b12bc4opt4_GEM-web_nlg_en_PALM_prompt_2.json new file mode 100644 index 0000000000000000000000000000000000000000..acc2d34bee3aae1c32f3762e63c962288da36f30 --- /dev/null +++ b/4b284b12bc4opt4/evaluation/generation/slim.4b284b12bc4opt4_GEM-web_nlg_en_PALM_prompt_2.json @@ -0,0 +1,133 @@ +{ + "results": [ + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "PALM_prompt", + "bleu": 0.35329168677191275, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "bleu_stderr": 0.03390701669686637 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "PALM_prompt", + "rouge1_precision": 0.07636623292264108, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rouge1_precision_stderr": 0.0016638254935029849 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "PALM_prompt", + "rouge1_recall": 0.3287961533258121, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rouge1_recall_stderr": 0.004764326327026456 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "PALM_prompt", + "rouge1_fmeasure": 0.11535960559109319, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rouge1_fmeasure_stderr": 0.002056790821407062 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "PALM_prompt", + "rouge2_precision": 0.035234171439883326, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rouge2_precision_stderr": 0.0009656833181951659 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "PALM_prompt", + "rouge2_recall": 0.15891884434217565, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rouge2_recall_stderr": 0.003347326475949424 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "PALM_prompt", + "rouge2_fmeasure": 0.05384033369196494, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rouge2_fmeasure_stderr": 0.0013019177568197539 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "PALM_prompt", + "rougeL_precision": 0.07191370056044478, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rougeL_precision_stderr": 0.00151309831707199 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "PALM_prompt", + "rougeL_recall": 0.3139667684953094, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rougeL_recall_stderr": 0.004595120348556269 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "PALM_prompt", + "rougeL_fmeasure": 0.10900458168967593, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rougeL_fmeasure_stderr": 0.0018752253984042533 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "PALM_prompt", + "rougeLsum_precision": 0.07180086441278799, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rougeLsum_precision_stderr": 0.0015621362000957913 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "PALM_prompt", + "rougeLsum_recall": 0.308419107397528, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rougeLsum_recall_stderr": 0.004350924818476407 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "PALM_prompt", + "rougeLsum_fmeasure": 0.10835178632862826, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rougeLsum_fmeasure_stderr": 0.0019132850106397554 + } + ], + "config": { + "model": "hf-causal", + "model_args": 
"pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b12bc4opt4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 2, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b12bc4opt4/evaluation/generation/slim.4b284b12bc4opt4_GEM-web_nlg_en_PALM_prompt_3.json b/4b284b12bc4opt4/evaluation/generation/slim.4b284b12bc4opt4_GEM-web_nlg_en_PALM_prompt_3.json new file mode 100644 index 0000000000000000000000000000000000000000..dfdde8f441bdc25c23fb1623523fa5bf8ae76251 --- /dev/null +++ b/4b284b12bc4opt4/evaluation/generation/slim.4b284b12bc4opt4_GEM-web_nlg_en_PALM_prompt_3.json @@ -0,0 +1,133 @@ +{ + "results": [ + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "PALM_prompt", + "bleu": 0.39131628859321554, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "bleu_stderr": 0.026313116002416113 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "PALM_prompt", + "rouge1_precision": 0.07564940705521785, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rouge1_precision_stderr": 0.0016901779366449113 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "PALM_prompt", + "rouge1_recall": 0.3252629322591383, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rouge1_recall_stderr": 0.0047859412578955705 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "PALM_prompt", + "rouge1_fmeasure": 0.11391076313881403, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rouge1_fmeasure_stderr": 0.0020936343693923527 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "PALM_prompt", + "rouge2_precision": 0.03538953748428128, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rouge2_precision_stderr": 0.0010004009583260043 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "PALM_prompt", + "rouge2_recall": 0.1567691792380991, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rouge2_recall_stderr": 0.003336003242576354 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "PALM_prompt", + "rouge2_fmeasure": 0.05369363828556127, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rouge2_fmeasure_stderr": 0.0013209945591844108 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "PALM_prompt", + "rougeL_precision": 0.07119946415387049, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rougeL_precision_stderr": 0.0015306573606216481 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "PALM_prompt", + "rougeL_recall": 0.3103716691791309, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rougeL_recall_stderr": 0.004600670503840457 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "PALM_prompt", + "rougeL_fmeasure": 0.1076674209928578, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rougeL_fmeasure_stderr": 0.0019026187637810328 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "PALM_prompt", + "rougeLsum_precision": 0.07110671643080219, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rougeLsum_precision_stderr": 0.0015812267457522326 + }, + { + 
"task_name": "GEM/web_nlg_en", + "prompt_name": "PALM_prompt", + "rougeLsum_recall": 0.30565042167460654, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rougeLsum_recall_stderr": 0.004393496455416831 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "PALM_prompt", + "rougeLsum_fmeasure": 0.10699621950964268, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rougeLsum_fmeasure_stderr": 0.0019384608416758112 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b12bc4opt4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 3, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b12bc4opt4/evaluation/generation/slim.4b284b12bc4opt4_GEM-web_nlg_en_PALM_prompt_4.json b/4b284b12bc4opt4/evaluation/generation/slim.4b284b12bc4opt4_GEM-web_nlg_en_PALM_prompt_4.json new file mode 100644 index 0000000000000000000000000000000000000000..9717aaa30888a2ddb06482b870dcc369f8bf4eb0 --- /dev/null +++ b/4b284b12bc4opt4/evaluation/generation/slim.4b284b12bc4opt4_GEM-web_nlg_en_PALM_prompt_4.json @@ -0,0 +1,133 @@ +{ + "results": [ + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "PALM_prompt", + "bleu": 0.37052755017666045, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "bleu_stderr": 0.022042441052051973 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "PALM_prompt", + "rouge1_precision": 0.0748616883094487, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rouge1_precision_stderr": 0.0016268791916451487 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "PALM_prompt", + "rouge1_recall": 0.32859336280130663, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rouge1_recall_stderr": 0.004763314479505647 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "PALM_prompt", + "rouge1_fmeasure": 0.11339049154327531, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rouge1_fmeasure_stderr": 0.002096303472052317 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "PALM_prompt", + "rouge2_precision": 0.03464054004561626, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rouge2_precision_stderr": 0.0009949762918573578 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "PALM_prompt", + "rouge2_recall": 0.157960165962179, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rouge2_recall_stderr": 0.003381278298836932 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "PALM_prompt", + "rouge2_fmeasure": 0.052775854126686714, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rouge2_fmeasure_stderr": 0.0013204579265960609 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "PALM_prompt", + "rougeL_precision": 0.0700069922960969, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rougeL_precision_stderr": 0.001456781799696283 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "PALM_prompt", + "rougeL_recall": 0.31134498667272115, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + 
"rougeL_recall_stderr": 0.004535027041515096 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "PALM_prompt", + "rougeL_fmeasure": 0.1064438415118096, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rougeL_fmeasure_stderr": 0.001899753706201498 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "PALM_prompt", + "rougeLsum_precision": 0.07049420755664745, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rougeLsum_precision_stderr": 0.001517990950279307 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "PALM_prompt", + "rougeLsum_recall": 0.3098059888344558, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rougeLsum_recall_stderr": 0.004434437168281575 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "PALM_prompt", + "rougeLsum_fmeasure": 0.10675864203685105, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rougeLsum_fmeasure_stderr": 0.0019563539894181738 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b12bc4opt4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 4, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b12bc4opt4/evaluation/generation/slim.4b284b12bc4opt4_GEM-web_nlg_en_PALM_prompt_5.json b/4b284b12bc4opt4/evaluation/generation/slim.4b284b12bc4opt4_GEM-web_nlg_en_PALM_prompt_5.json new file mode 100644 index 0000000000000000000000000000000000000000..4cb756163b713b6c91e904716b6764454af16858 --- /dev/null +++ b/4b284b12bc4opt4/evaluation/generation/slim.4b284b12bc4opt4_GEM-web_nlg_en_PALM_prompt_5.json @@ -0,0 +1,133 @@ +{ + "results": [ + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "PALM_prompt", + "bleu": 0.35995256263213665, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "bleu_stderr": 0.03764670650407796 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "PALM_prompt", + "rouge1_precision": 0.07365790588568459, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rouge1_precision_stderr": 0.0017121949707532308 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "PALM_prompt", + "rouge1_recall": 0.3228153530836222, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rouge1_recall_stderr": 0.004694269135211919 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "PALM_prompt", + "rouge1_fmeasure": 0.11087540816858653, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rouge1_fmeasure_stderr": 0.0020724670013026 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "PALM_prompt", + "rouge2_precision": 0.03370563835754997, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rouge2_precision_stderr": 0.000982560364849555 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "PALM_prompt", + "rouge2_recall": 0.15192330946847016, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rouge2_recall_stderr": 0.0033144184826216097 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "PALM_prompt", + "rouge2_fmeasure": 0.05131186762124571, + "dataset_path": "GEM/web_nlg", 
+ "dataset_name": "en", + "subset": null, + "rouge2_fmeasure_stderr": 0.001301751075510561 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "PALM_prompt", + "rougeL_precision": 0.06873278486701989, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rougeL_precision_stderr": 0.0014938317734199177 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "PALM_prompt", + "rougeL_recall": 0.3061441130232553, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rougeL_recall_stderr": 0.00448586985770301 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "PALM_prompt", + "rougeL_fmeasure": 0.10413194280026901, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rougeL_fmeasure_stderr": 0.0018759453675962236 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "PALM_prompt", + "rougeLsum_precision": 0.06886164410317586, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rougeLsum_precision_stderr": 0.0015474980166708774 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "PALM_prompt", + "rougeLsum_recall": 0.30297784804468764, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rougeLsum_recall_stderr": 0.004342846233116827 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "PALM_prompt", + "rougeLsum_fmeasure": 0.10388096228686042, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rougeLsum_fmeasure_stderr": 0.0019168887238182983 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b12bc4opt4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 5, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b12bc4opt4/evaluation/generation/slim.4b284b12bc4opt4_GEM-wiki_lingua_en_tldr_en_0.json b/4b284b12bc4opt4/evaluation/generation/slim.4b284b12bc4opt4_GEM-wiki_lingua_en_tldr_en_0.json new file mode 100644 index 0000000000000000000000000000000000000000..8f8a2c301bc6b9994d52fa83af7c387e419a802d --- /dev/null +++ b/4b284b12bc4opt4/evaluation/generation/slim.4b284b12bc4opt4_GEM-wiki_lingua_en_tldr_en_0.json @@ -0,0 +1,133 @@ +{ + "results": [ + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "tldr_en", + "rouge1_precision": 0.06974904696556859, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rouge1_precision_stderr": 0.0011614879719000074 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "tldr_en", + "rouge1_recall": 0.11241125442105249, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rouge1_recall_stderr": 0.0014600463110962813 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "tldr_en", + "rouge1_fmeasure": 0.07993097912178979, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rouge1_fmeasure_stderr": 0.0011136701948783593 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "tldr_en", + "rouge2_precision": 0.0038814664404484087, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rouge2_precision_stderr": 0.00020315036956781887 + }, + { + "task_name": "GEM/wiki_lingua_en", + 
"prompt_name": "tldr_en", + "rouge2_recall": 0.005354585862023678, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rouge2_recall_stderr": 0.0002836354871500975 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "tldr_en", + "rouge2_fmeasure": 0.004179116314891411, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rouge2_fmeasure_stderr": 0.00020937586745221478 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "tldr_en", + "rougeL_precision": 0.06608672332133624, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rougeL_precision_stderr": 0.001045278233772761 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "tldr_en", + "rougeL_recall": 0.10847163328970093, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rougeL_recall_stderr": 0.0014019598783824951 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "tldr_en", + "rougeL_fmeasure": 0.0763332930371866, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rougeL_fmeasure_stderr": 0.001024817413877872 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "tldr_en", + "rougeLsum_precision": 0.06486651184386755, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rougeLsum_precision_stderr": 0.001063656610383382 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "tldr_en", + "rougeLsum_recall": 0.10550878356686602, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rougeLsum_recall_stderr": 0.001363292887440063 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "tldr_en", + "rougeLsum_fmeasure": 0.07455065795690266, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rougeLsum_fmeasure_stderr": 0.0010195929073391948 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "tldr_en", + "bleu": 0.06405097746568524, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "bleu_stderr": 0.014294784040984042 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b12bc4opt4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 0, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b12bc4opt4/evaluation/generation/slim.4b284b12bc4opt4_GEM-wiki_lingua_en_tldr_en_1.json b/4b284b12bc4opt4/evaluation/generation/slim.4b284b12bc4opt4_GEM-wiki_lingua_en_tldr_en_1.json new file mode 100644 index 0000000000000000000000000000000000000000..47b7297016356f4b23c9999b88c90947d7248c84 --- /dev/null +++ b/4b284b12bc4opt4/evaluation/generation/slim.4b284b12bc4opt4_GEM-wiki_lingua_en_tldr_en_1.json @@ -0,0 +1,133 @@ +{ + "results": [ + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "tldr_en", + "rouge1_precision": 0.07967563985483712, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rouge1_precision_stderr": 0.0011981791486243532 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "tldr_en", + "rouge1_recall": 0.13333453575974324, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, 
+ "rouge1_recall_stderr": 0.0015456880808194433 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "tldr_en", + "rouge1_fmeasure": 0.09272818816907549, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rouge1_fmeasure_stderr": 0.0011490248194294334 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "tldr_en", + "rouge2_precision": 0.0031182028953535613, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rouge2_precision_stderr": 0.0001763228556558668 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "tldr_en", + "rouge2_recall": 0.005182712565249269, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rouge2_recall_stderr": 0.00033464377765989186 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "tldr_en", + "rouge2_fmeasure": 0.003531715774833695, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rouge2_fmeasure_stderr": 0.00019054232118964935 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "tldr_en", + "rougeL_precision": 0.07295443320214035, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rougeL_precision_stderr": 0.0010133973838849122 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "tldr_en", + "rougeL_recall": 0.1250877049808009, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rougeL_recall_stderr": 0.0014330876586568309 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "tldr_en", + "rougeL_fmeasure": 0.08573108085907154, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rougeL_fmeasure_stderr": 0.0009959675791518386 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "tldr_en", + "rougeLsum_precision": 0.07428908435430764, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rougeLsum_precision_stderr": 0.0011046337929190013 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "tldr_en", + "rougeLsum_recall": 0.1251193589727116, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rougeLsum_recall_stderr": 0.0014467114410570768 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "tldr_en", + "rougeLsum_fmeasure": 0.08666221587131816, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rougeLsum_fmeasure_stderr": 0.0010615368892029633 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "tldr_en", + "bleu": 0.08853135918871324, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "bleu_stderr": 0.020831959008675835 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b12bc4opt4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 1, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b12bc4opt4/evaluation/generation/slim.4b284b12bc4opt4_GEM-wiki_lingua_en_tldr_en_2.json b/4b284b12bc4opt4/evaluation/generation/slim.4b284b12bc4opt4_GEM-wiki_lingua_en_tldr_en_2.json new file mode 100644 index 
0000000000000000000000000000000000000000..a7a5b665a98556626444bafd3572443be0e8ec4e --- /dev/null +++ b/4b284b12bc4opt4/evaluation/generation/slim.4b284b12bc4opt4_GEM-wiki_lingua_en_tldr_en_2.json @@ -0,0 +1,133 @@ +{ + "results": [ + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "tldr_en", + "rouge1_precision": 0.0821044769548865, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rouge1_precision_stderr": 0.0012109023007897738 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "tldr_en", + "rouge1_recall": 0.13827872380494466, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rouge1_recall_stderr": 0.0016474383605086832 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "tldr_en", + "rouge1_fmeasure": 0.09561825103521294, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rouge1_fmeasure_stderr": 0.0011752321521538633 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "tldr_en", + "rouge2_precision": 0.0037438880758047005, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rouge2_precision_stderr": 0.00021045724159333664 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "tldr_en", + "rouge2_recall": 0.006616313525097841, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rouge2_recall_stderr": 0.00042267274141401527 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "tldr_en", + "rouge2_fmeasure": 0.004325117399126492, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rouge2_fmeasure_stderr": 0.0002352094266785428 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "tldr_en", + "rougeL_precision": 0.07475764548912422, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rougeL_precision_stderr": 0.0010236395736361035 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "tldr_en", + "rougeL_recall": 0.1281885979875213, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rougeL_recall_stderr": 0.0014803317472057867 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "tldr_en", + "rougeL_fmeasure": 0.08768328186526594, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rougeL_fmeasure_stderr": 0.0010046872049285587 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "tldr_en", + "rougeLsum_precision": 0.07537568417123489, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rougeLsum_precision_stderr": 0.0011026893753818762 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "tldr_en", + "rougeLsum_recall": 0.12800436919065455, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rougeLsum_recall_stderr": 0.0015289279450751802 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "tldr_en", + "rougeLsum_fmeasure": 0.08802189261932684, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rougeLsum_fmeasure_stderr": 0.0010727115935503444 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "tldr_en", + "bleu": 0.16015334774145826, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "bleu_stderr": 0.023734817784982588 + } + ], + "config": { + "model": "hf-causal", + "model_args": 
"pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b12bc4opt4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 2, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b12bc4opt4/evaluation/generation/slim.4b284b12bc4opt4_GEM-wiki_lingua_en_tldr_en_3.json b/4b284b12bc4opt4/evaluation/generation/slim.4b284b12bc4opt4_GEM-wiki_lingua_en_tldr_en_3.json new file mode 100644 index 0000000000000000000000000000000000000000..2e7c15aaf517a8901c50e50903eed3b1eda1e642 --- /dev/null +++ b/4b284b12bc4opt4/evaluation/generation/slim.4b284b12bc4opt4_GEM-wiki_lingua_en_tldr_en_3.json @@ -0,0 +1,133 @@ +{ + "results": [ + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "tldr_en", + "rouge1_precision": 0.07398914178321388, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rouge1_precision_stderr": 0.0014407291212392703 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "tldr_en", + "rouge1_recall": 0.1182771258048201, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rouge1_recall_stderr": 0.0018198887142112982 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "tldr_en", + "rouge1_fmeasure": 0.08208715110302196, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rouge1_fmeasure_stderr": 0.0012831764702565588 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "tldr_en", + "rouge2_precision": 0.004020145992604983, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rouge2_precision_stderr": 0.00026592766536485463 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "tldr_en", + "rouge2_recall": 0.006665406872296463, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rouge2_recall_stderr": 0.0004654137487344057 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "tldr_en", + "rouge2_fmeasure": 0.004456026661162212, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rouge2_fmeasure_stderr": 0.0002758499131281591 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "tldr_en", + "rougeL_precision": 0.06723906063892546, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rougeL_precision_stderr": 0.0012458979372848018 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "tldr_en", + "rougeL_recall": 0.10954540525765066, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rougeL_recall_stderr": 0.0016400029407562262 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "tldr_en", + "rougeL_fmeasure": 0.07512642856203387, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rougeL_fmeasure_stderr": 0.001099007590251846 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "tldr_en", + "rougeLsum_precision": 0.0688188354810642, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rougeLsum_precision_stderr": 0.0013457162556628375 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "tldr_en", + "rougeLsum_recall": 0.11040591385464732, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": 
null, + "rougeLsum_recall_stderr": 0.0017010360876035554 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "tldr_en", + "rougeLsum_fmeasure": 0.07638882046568143, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rougeLsum_fmeasure_stderr": 0.0011908718046903589 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "tldr_en", + "bleu": 0.2614442095298893, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "bleu_stderr": 0.028570775000506844 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b12bc4opt4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 3, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b12bc4opt4/evaluation/generation/slim.4b284b12bc4opt4_GEM-wiki_lingua_en_tldr_en_4.json b/4b284b12bc4opt4/evaluation/generation/slim.4b284b12bc4opt4_GEM-wiki_lingua_en_tldr_en_4.json new file mode 100644 index 0000000000000000000000000000000000000000..0091a501de35f3eb4f2f8d2115c38a280da893d1 --- /dev/null +++ b/4b284b12bc4opt4/evaluation/generation/slim.4b284b12bc4opt4_GEM-wiki_lingua_en_tldr_en_4.json @@ -0,0 +1,133 @@ +{ + "results": [ + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "tldr_en", + "rouge1_precision": 0.0280761104871091, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rouge1_precision_stderr": 0.0012926400012494954 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "tldr_en", + "rouge1_recall": 0.04256550951472789, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rouge1_recall_stderr": 0.0015662670220772453 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "tldr_en", + "rouge1_fmeasure": 0.02864898102166578, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rouge1_fmeasure_stderr": 0.0010448235312037137 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "tldr_en", + "rouge2_precision": 0.0015031467154335884, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rouge2_precision_stderr": 0.00016818655996840296 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "tldr_en", + "rouge2_recall": 0.002661081711747737, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rouge2_recall_stderr": 0.0003367092996858654 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "tldr_en", + "rouge2_fmeasure": 0.0016218547111402145, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rouge2_fmeasure_stderr": 0.00016643077121423323 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "tldr_en", + "rougeL_precision": 0.025283329619297263, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rougeL_precision_stderr": 0.00116903072674806 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "tldr_en", + "rougeL_recall": 0.039170970833320586, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rougeL_recall_stderr": 0.0014262971628748195 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "tldr_en", + 
"rougeL_fmeasure": 0.025906000131277095, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rougeL_fmeasure_stderr": 0.0009115419100498881 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "tldr_en", + "rougeLsum_precision": 0.026006848089608563, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rougeLsum_precision_stderr": 0.0011953058942308498 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "tldr_en", + "rougeLsum_recall": 0.039653057024522397, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rougeLsum_recall_stderr": 0.0014650811160356375 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "tldr_en", + "rougeLsum_fmeasure": 0.02654490920386107, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rougeLsum_fmeasure_stderr": 0.0009630631658391304 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "tldr_en", + "bleu": 0.06670432998840774, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "bleu_stderr": 0.010834402499150991 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b12bc4opt4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 4, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b12bc4opt4/evaluation/generation/slim.4b284b12bc4opt4_GEM-wiki_lingua_en_tldr_en_5.json b/4b284b12bc4opt4/evaluation/generation/slim.4b284b12bc4opt4_GEM-wiki_lingua_en_tldr_en_5.json new file mode 100644 index 0000000000000000000000000000000000000000..8c404fef72964344a6d6cf055e191c11406762aa --- /dev/null +++ b/4b284b12bc4opt4/evaluation/generation/slim.4b284b12bc4opt4_GEM-wiki_lingua_en_tldr_en_5.json @@ -0,0 +1,133 @@ +{ + "results": [ + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "tldr_en", + "rouge1_precision": 0.0049355335861678085, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rouge1_precision_stderr": 0.0005700406667338665 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "tldr_en", + "rouge1_recall": 0.007089024132379281, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rouge1_recall_stderr": 0.0007092663255506791 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "tldr_en", + "rouge1_fmeasure": 0.004918846421495546, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rouge1_fmeasure_stderr": 0.0005110461047249682 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "tldr_en", + "rouge2_precision": 0.000304156454074688, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rouge2_precision_stderr": 0.00010109123411176751 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "tldr_en", + "rouge2_recall": 0.0004177630400024621, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rouge2_recall_stderr": 0.0001288667888548727 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "tldr_en", + "rouge2_fmeasure": 0.0002967006936469681, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + 
"rouge2_fmeasure_stderr": 8.974945913013664e-05 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "tldr_en", + "rougeL_precision": 0.004402996255292156, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rougeL_precision_stderr": 0.00048523970345924354 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "tldr_en", + "rougeL_recall": 0.0065174776893015506, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rougeL_recall_stderr": 0.0006367984657316963 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "tldr_en", + "rougeL_fmeasure": 0.00441760659640342, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rougeL_fmeasure_stderr": 0.00043438930707966116 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "tldr_en", + "rougeLsum_precision": 0.004598567341994099, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rougeLsum_precision_stderr": 0.0005286763909225495 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "tldr_en", + "rougeLsum_recall": 0.006622774214457113, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rougeLsum_recall_stderr": 0.0006621175203479033 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "tldr_en", + "rougeLsum_fmeasure": 0.0045788303091290556, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rougeLsum_fmeasure_stderr": 0.0004707439731624039 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "tldr_en", + "bleu": 1.6156436086606036e-07, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "bleu_stderr": 3.807467277955187e-07 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b12bc4opt4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 5, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b12bc4opt4/evaluation/generation/slim.4b284b12bc4opt4_e2e_nlg_cleaned_generate_text_restaurant_0.json b/4b284b12bc4opt4/evaluation/generation/slim.4b284b12bc4opt4_e2e_nlg_cleaned_generate_text_restaurant_0.json new file mode 100644 index 0000000000000000000000000000000000000000..040b3cc8ca1afeb31a275dba4413109349112af5 --- /dev/null +++ b/4b284b12bc4opt4/evaluation/generation/slim.4b284b12bc4opt4_e2e_nlg_cleaned_generate_text_restaurant_0.json @@ -0,0 +1,133 @@ +{ + "results": [ + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "generate_text_restaurant", + "bleu": 0.023831548566168672, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "bleu_stderr": 0.00013669989610831787 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "generate_text_restaurant", + "rouge1_precision": 0.05449305285889657, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rouge1_precision_stderr": 0.0007682171874626243 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "generate_text_restaurant", + "rouge1_recall": 0.09611446455866401, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rouge1_recall_stderr": 0.0009371035323738266 + }, + { + 
"task_name": "e2e_nlg_cleaned", + "prompt_name": "generate_text_restaurant", + "rouge1_fmeasure": 0.06253160637445583, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rouge1_fmeasure_stderr": 0.0006035775811340659 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "generate_text_restaurant", + "rouge2_precision": 0.007752886867383771, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rouge2_precision_stderr": 0.00022369481483085416 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "generate_text_restaurant", + "rouge2_recall": 0.015703783259365008, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rouge2_recall_stderr": 0.0004099918025863708 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "generate_text_restaurant", + "rouge2_fmeasure": 0.010133641195144875, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rouge2_fmeasure_stderr": 0.0002601808393175606 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "generate_text_restaurant", + "rougeL_precision": 0.05437436599020971, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rougeL_precision_stderr": 0.0007584161707705096 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "generate_text_restaurant", + "rougeL_recall": 0.09607649971755518, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rougeL_recall_stderr": 0.0009370784251490342 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "generate_text_restaurant", + "rougeL_fmeasure": 0.06247837072122016, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rougeL_fmeasure_stderr": 0.0006020906398104155 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "generate_text_restaurant", + "rougeLsum_precision": 0.050869285779818474, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rougeLsum_precision_stderr": 0.0007452830238007919 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "generate_text_restaurant", + "rougeLsum_recall": 0.08872183965001415, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rougeLsum_recall_stderr": 0.0008251902983264386 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "generate_text_restaurant", + "rougeLsum_fmeasure": 0.05776879458207875, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rougeLsum_fmeasure_stderr": 0.000537142122378467 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b12bc4opt4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 0, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b12bc4opt4/evaluation/generation/slim.4b284b12bc4opt4_e2e_nlg_cleaned_generate_text_restaurant_1.json b/4b284b12bc4opt4/evaluation/generation/slim.4b284b12bc4opt4_e2e_nlg_cleaned_generate_text_restaurant_1.json new file mode 100644 index 0000000000000000000000000000000000000000..40824a1d292d19732fc6b114832d7c4127c24125 --- /dev/null +++ 
b/4b284b12bc4opt4/evaluation/generation/slim.4b284b12bc4opt4_e2e_nlg_cleaned_generate_text_restaurant_1.json @@ -0,0 +1,133 @@ +{ + "results": [ + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "generate_text_restaurant", + "bleu": 7.5716891789423135, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "bleu_stderr": 0.07657233541931847 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "generate_text_restaurant", + "rouge1_precision": 0.37675073670067255, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rouge1_precision_stderr": 0.0024649179184822933 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "generate_text_restaurant", + "rouge1_recall": 0.4196033967756775, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rouge1_recall_stderr": 0.002554821342455347 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "generate_text_restaurant", + "rouge1_fmeasure": 0.378589775338143, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rouge1_fmeasure_stderr": 0.0019484786742807038 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "generate_text_restaurant", + "rouge2_precision": 0.1515787193445361, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rouge2_precision_stderr": 0.0016212252559065935 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "generate_text_restaurant", + "rouge2_recall": 0.16960708192000204, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rouge2_recall_stderr": 0.001740739006568578 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "generate_text_restaurant", + "rouge2_fmeasure": 0.15192303671918772, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rouge2_fmeasure_stderr": 0.0014410254284747698 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "generate_text_restaurant", + "rougeL_precision": 0.2562623240157523, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rougeL_precision_stderr": 0.0018156767746927076 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "generate_text_restaurant", + "rougeL_recall": 0.2883651325055159, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rougeL_recall_stderr": 0.002017743649477281 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "generate_text_restaurant", + "rougeL_fmeasure": 0.25806597930638026, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rougeL_fmeasure_stderr": 0.0014774627085457283 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "generate_text_restaurant", + "rougeLsum_precision": 0.3119796155675665, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rougeLsum_precision_stderr": 0.002197372081552806 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "generate_text_restaurant", + "rougeLsum_recall": 0.34777409514067237, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rougeLsum_recall_stderr": 0.0023378237968652736 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "generate_text_restaurant", + "rougeLsum_fmeasure": 0.3135332346518358, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rougeLsum_fmeasure_stderr": 0.0018186234521200878 + } + ], + "config": { + "model": "hf-causal", + "model_args": 
"pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b12bc4opt4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 1, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b12bc4opt4/evaluation/generation/slim.4b284b12bc4opt4_e2e_nlg_cleaned_generate_text_restaurant_2.json b/4b284b12bc4opt4/evaluation/generation/slim.4b284b12bc4opt4_e2e_nlg_cleaned_generate_text_restaurant_2.json new file mode 100644 index 0000000000000000000000000000000000000000..4e4d8352afa20d39b22aeb1916329eb45dcc7891 --- /dev/null +++ b/4b284b12bc4opt4/evaluation/generation/slim.4b284b12bc4opt4_e2e_nlg_cleaned_generate_text_restaurant_2.json @@ -0,0 +1,133 @@ +{ + "results": [ + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "generate_text_restaurant", + "bleu": 8.415825253964716, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "bleu_stderr": 0.15234172654187825 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "generate_text_restaurant", + "rouge1_precision": 0.39718229879297745, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rouge1_precision_stderr": 0.0023267859874187644 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "generate_text_restaurant", + "rouge1_recall": 0.431821762098323, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rouge1_recall_stderr": 0.0024735326081453987 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "generate_text_restaurant", + "rouge1_fmeasure": 0.39680880458063933, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rouge1_fmeasure_stderr": 0.0018153515680263594 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "generate_text_restaurant", + "rouge2_precision": 0.16935200061749783, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rouge2_precision_stderr": 0.0016483718369393143 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "generate_text_restaurant", + "rouge2_recall": 0.1866765784720031, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rouge2_recall_stderr": 0.001801459763033518 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "generate_text_restaurant", + "rouge2_fmeasure": 0.16948726304174097, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rouge2_fmeasure_stderr": 0.0014659320427177205 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "generate_text_restaurant", + "rougeL_precision": 0.26901190247512224, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rougeL_precision_stderr": 0.0018220891410013944 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "generate_text_restaurant", + "rougeL_recall": 0.2939323384016312, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rougeL_recall_stderr": 0.002026079597108763 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "generate_text_restaurant", + "rougeL_fmeasure": 0.2687904361168554, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rougeL_fmeasure_stderr": 0.0014803733340222947 + }, + { + "task_name": "e2e_nlg_cleaned", + 
"prompt_name": "generate_text_restaurant", + "rougeLsum_precision": 0.32665277498400797, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rougeLsum_precision_stderr": 0.0021157313290731425 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "generate_text_restaurant", + "rougeLsum_recall": 0.35541359697456837, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rougeLsum_recall_stderr": 0.0022949279797004926 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "generate_text_restaurant", + "rougeLsum_fmeasure": 0.32628931648104437, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rougeLsum_fmeasure_stderr": 0.0017352819339899982 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b12bc4opt4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 2, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b12bc4opt4/evaluation/generation/slim.4b284b12bc4opt4_e2e_nlg_cleaned_generate_text_restaurant_3.json b/4b284b12bc4opt4/evaluation/generation/slim.4b284b12bc4opt4_e2e_nlg_cleaned_generate_text_restaurant_3.json new file mode 100644 index 0000000000000000000000000000000000000000..38399eb365030986359c112acda8fb4abad2f76e --- /dev/null +++ b/4b284b12bc4opt4/evaluation/generation/slim.4b284b12bc4opt4_e2e_nlg_cleaned_generate_text_restaurant_3.json @@ -0,0 +1,133 @@ +{ + "results": [ + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "generate_text_restaurant", + "bleu": 9.053715150693384, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "bleu_stderr": 0.06980100462430686 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "generate_text_restaurant", + "rouge1_precision": 0.402912961249351, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rouge1_precision_stderr": 0.002311451156821765 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "generate_text_restaurant", + "rouge1_recall": 0.4381203339806578, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rouge1_recall_stderr": 0.002475121033893737 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "generate_text_restaurant", + "rouge1_fmeasure": 0.40313857771935285, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rouge1_fmeasure_stderr": 0.001805737386969938 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "generate_text_restaurant", + "rouge2_precision": 0.17782567314062503, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rouge2_precision_stderr": 0.0016671508958511524 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "generate_text_restaurant", + "rouge2_recall": 0.19584958268974514, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rouge2_recall_stderr": 0.0018895308376705284 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "generate_text_restaurant", + "rouge2_fmeasure": 0.178157082665128, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rouge2_fmeasure_stderr": 0.0015223176148801163 + }, + { + 
"task_name": "e2e_nlg_cleaned", + "prompt_name": "generate_text_restaurant", + "rougeL_precision": 0.2736571491639734, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rougeL_precision_stderr": 0.0018305403535992246 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "generate_text_restaurant", + "rougeL_recall": 0.2995352347018479, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rougeL_recall_stderr": 0.0020969000115146907 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "generate_text_restaurant", + "rougeL_fmeasure": 0.27413718639696194, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rougeL_fmeasure_stderr": 0.0015353598026852135 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "generate_text_restaurant", + "rougeLsum_precision": 0.3307097275531657, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rougeLsum_precision_stderr": 0.002114092970242258 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "generate_text_restaurant", + "rougeLsum_recall": 0.3606968160198456, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rougeLsum_recall_stderr": 0.0023518211594742116 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "generate_text_restaurant", + "rougeLsum_fmeasure": 0.3312412837590075, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rougeLsum_fmeasure_stderr": 0.0017721110137833955 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b12bc4opt4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 3, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b12bc4opt4/evaluation/generation/slim.4b284b12bc4opt4_e2e_nlg_cleaned_generate_text_restaurant_4.json b/4b284b12bc4opt4/evaluation/generation/slim.4b284b12bc4opt4_e2e_nlg_cleaned_generate_text_restaurant_4.json new file mode 100644 index 0000000000000000000000000000000000000000..4cab900c2b1ed6dce029a1a7de7ec227e6e8e119 --- /dev/null +++ b/4b284b12bc4opt4/evaluation/generation/slim.4b284b12bc4opt4_e2e_nlg_cleaned_generate_text_restaurant_4.json @@ -0,0 +1,133 @@ +{ + "results": [ + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "generate_text_restaurant", + "bleu": 8.992864407073405, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "bleu_stderr": 0.08770475097615464 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "generate_text_restaurant", + "rouge1_precision": 0.4035902410295868, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rouge1_precision_stderr": 0.002314326089880622 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "generate_text_restaurant", + "rouge1_recall": 0.43366816643506345, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rouge1_recall_stderr": 0.002417145513082443 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "generate_text_restaurant", + "rouge1_fmeasure": 0.4013041193656425, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rouge1_fmeasure_stderr": 
0.0018012199210851237 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "generate_text_restaurant", + "rouge2_precision": 0.17901955155397473, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rouge2_precision_stderr": 0.0016801289447732556 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "generate_text_restaurant", + "rouge2_recall": 0.19422772465440907, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rouge2_recall_stderr": 0.001846206730549891 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "generate_text_restaurant", + "rouge2_fmeasure": 0.17801635513525366, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rouge2_fmeasure_stderr": 0.0015174150039881623 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "generate_text_restaurant", + "rougeL_precision": 0.2749826532734801, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rougeL_precision_stderr": 0.0018385761944167643 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "generate_text_restaurant", + "rougeL_recall": 0.29799763054527917, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rougeL_recall_stderr": 0.00209039090294883 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "generate_text_restaurant", + "rougeL_fmeasure": 0.2740436937683373, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rougeL_fmeasure_stderr": 0.0015578070729464124 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "generate_text_restaurant", + "rougeLsum_precision": 0.3308350462575971, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rougeLsum_precision_stderr": 0.002138455916629113 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "generate_text_restaurant", + "rougeLsum_recall": 0.35699098749445574, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rougeLsum_recall_stderr": 0.0023463522382987587 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "generate_text_restaurant", + "rougeLsum_fmeasure": 0.3294316503713183, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rougeLsum_fmeasure_stderr": 0.0017961731530620674 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b12bc4opt4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 4, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b12bc4opt4/evaluation/generation/slim.4b284b12bc4opt4_e2e_nlg_cleaned_generate_text_restaurant_5.json b/4b284b12bc4opt4/evaluation/generation/slim.4b284b12bc4opt4_e2e_nlg_cleaned_generate_text_restaurant_5.json new file mode 100644 index 0000000000000000000000000000000000000000..974c800a94dc4fad2672f3ff5abdf943d622b57f --- /dev/null +++ b/4b284b12bc4opt4/evaluation/generation/slim.4b284b12bc4opt4_e2e_nlg_cleaned_generate_text_restaurant_5.json @@ -0,0 +1,133 @@ +{ + "results": [ + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "generate_text_restaurant", + "bleu": 8.814480003636467, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + 
"subset": null, + "bleu_stderr": 0.16384652517645307 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "generate_text_restaurant", + "rouge1_precision": 0.40351181993098634, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rouge1_precision_stderr": 0.0023784770515124913 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "generate_text_restaurant", + "rouge1_recall": 0.430473586962648, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rouge1_recall_stderr": 0.002372480142494689 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "generate_text_restaurant", + "rouge1_fmeasure": 0.3997657675145631, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rouge1_fmeasure_stderr": 0.0018195701621308699 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "generate_text_restaurant", + "rouge2_precision": 0.1784847314937123, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rouge2_precision_stderr": 0.0016941969329593459 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "generate_text_restaurant", + "rouge2_recall": 0.19174384124754476, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rouge2_recall_stderr": 0.0018048604307058427 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "generate_text_restaurant", + "rouge2_fmeasure": 0.17668001136142425, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rouge2_fmeasure_stderr": 0.0015118431601118265 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "generate_text_restaurant", + "rougeL_precision": 0.27762946990242493, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rougeL_precision_stderr": 0.0018754875427661021 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "generate_text_restaurant", + "rougeL_recall": 0.2983245741357427, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rougeL_recall_stderr": 0.0020162838897830222 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "generate_text_restaurant", + "rougeL_fmeasure": 0.27542991361833463, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rougeL_fmeasure_stderr": 0.0015311724080949638 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "generate_text_restaurant", + "rougeLsum_precision": 0.3318650598753267, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rougeLsum_precision_stderr": 0.0021960345461577626 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "generate_text_restaurant", + "rougeLsum_recall": 0.3541002511978268, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rougeLsum_recall_stderr": 0.0022290840532188166 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "generate_text_restaurant", + "rougeLsum_fmeasure": 0.3287311122356614, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rougeLsum_fmeasure_stderr": 0.001782259806323467 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b12bc4opt4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 5, + "batch_size": 16, + 
"device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b12bc4opt4/evaluation/generation/slim.4b284b12bc4opt4_gem_xsum_article_DOC_summary_0.json b/4b284b12bc4opt4/evaluation/generation/slim.4b284b12bc4opt4_gem_xsum_article_DOC_summary_0.json new file mode 100644 index 0000000000000000000000000000000000000000..7f8a14036be3bac7b6f1c709076e95658b1fcd4c --- /dev/null +++ b/4b284b12bc4opt4/evaluation/generation/slim.4b284b12bc4opt4_gem_xsum_article_DOC_summary_0.json @@ -0,0 +1,133 @@ +{ + "results": [ + { + "task_name": "gem_xsum", + "prompt_name": "article_DOC_summary", + "rouge1_precision": 0.07925362926398048, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rouge1_precision_stderr": 0.0011370372081177169 + }, + { + "task_name": "gem_xsum", + "prompt_name": "article_DOC_summary", + "rouge1_recall": 0.18098825098858384, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rouge1_recall_stderr": 0.0023006406434685736 + }, + { + "task_name": "gem_xsum", + "prompt_name": "article_DOC_summary", + "rouge1_fmeasure": 0.10882608435694785, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rouge1_fmeasure_stderr": 0.001462837537770478 + }, + { + "task_name": "gem_xsum", + "prompt_name": "article_DOC_summary", + "rouge2_precision": 0.0049500024419751225, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rouge2_precision_stderr": 0.0003221010027610884 + }, + { + "task_name": "gem_xsum", + "prompt_name": "article_DOC_summary", + "rouge2_recall": 0.011519060394160629, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rouge2_recall_stderr": 0.0007552222759138258 + }, + { + "task_name": "gem_xsum", + "prompt_name": "article_DOC_summary", + "rouge2_fmeasure": 0.006837673192649714, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rouge2_fmeasure_stderr": 0.000442612615175873 + }, + { + "task_name": "gem_xsum", + "prompt_name": "article_DOC_summary", + "rougeL_precision": 0.06892683313731025, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rougeL_precision_stderr": 0.000932076327601901 + }, + { + "task_name": "gem_xsum", + "prompt_name": "article_DOC_summary", + "rougeL_recall": 0.15911626707843118, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rougeL_recall_stderr": 0.0020012317128335327 + }, + { + "task_name": "gem_xsum", + "prompt_name": "article_DOC_summary", + "rougeL_fmeasure": 0.09494170160398116, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rougeL_fmeasure_stderr": 0.0012132340582403285 + }, + { + "task_name": "gem_xsum", + "prompt_name": "article_DOC_summary", + "rougeLsum_precision": 0.06534032138757281, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rougeLsum_precision_stderr": 0.000925024745716427 + }, + { + "task_name": "gem_xsum", + "prompt_name": "article_DOC_summary", + "rougeLsum_recall": 0.1505792139257626, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rougeLsum_recall_stderr": 0.001967931673316114 + }, + { + "task_name": "gem_xsum", + "prompt_name": "article_DOC_summary", + "rougeLsum_fmeasure": 0.0899394725428077, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rougeLsum_fmeasure_stderr": 0.0012023093408387983 + }, + { + "task_name": "gem_xsum", + "prompt_name": "article_DOC_summary", + "bleu": 
0.12092109985195117, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "bleu_stderr": 0.034246881577404256 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b12bc4opt4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 0, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b12bc4opt4/evaluation/generation/slim.4b284b12bc4opt4_gem_xsum_article_DOC_summary_1.json b/4b284b12bc4opt4/evaluation/generation/slim.4b284b12bc4opt4_gem_xsum_article_DOC_summary_1.json new file mode 100644 index 0000000000000000000000000000000000000000..34cb69b79955e6627f9b4c331aa53bca357bdb79 --- /dev/null +++ b/4b284b12bc4opt4/evaluation/generation/slim.4b284b12bc4opt4_gem_xsum_article_DOC_summary_1.json @@ -0,0 +1,133 @@ +{ + "results": [ + { + "task_name": "gem_xsum", + "prompt_name": "article_DOC_summary", + "rouge1_precision": 0.07136700780151199, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rouge1_precision_stderr": 0.0011530148021735068 + }, + { + "task_name": "gem_xsum", + "prompt_name": "article_DOC_summary", + "rouge1_recall": 0.17061111634547732, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rouge1_recall_stderr": 0.0024515335014419462 + }, + { + "task_name": "gem_xsum", + "prompt_name": "article_DOC_summary", + "rouge1_fmeasure": 0.09933976208790961, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rouge1_fmeasure_stderr": 0.0015105268621052712 + }, + { + "task_name": "gem_xsum", + "prompt_name": "article_DOC_summary", + "rouge2_precision": 0.005417921285791925, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rouge2_precision_stderr": 0.00033592492027509244 + }, + { + "task_name": "gem_xsum", + "prompt_name": "article_DOC_summary", + "rouge2_recall": 0.013060194907812957, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rouge2_recall_stderr": 0.0008110410377500807 + }, + { + "task_name": "gem_xsum", + "prompt_name": "article_DOC_summary", + "rouge2_fmeasure": 0.007569232441595493, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rouge2_fmeasure_stderr": 0.0004654857633003587 + }, + { + "task_name": "gem_xsum", + "prompt_name": "article_DOC_summary", + "rougeL_precision": 0.06437885934626987, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rougeL_precision_stderr": 0.0009594061202846054 + }, + { + "task_name": "gem_xsum", + "prompt_name": "article_DOC_summary", + "rougeL_recall": 0.1554012098707474, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rougeL_recall_stderr": 0.0021362266435883126 + }, + { + "task_name": "gem_xsum", + "prompt_name": "article_DOC_summary", + "rougeL_fmeasure": 0.08986801887882968, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rougeL_fmeasure_stderr": 0.0012683121784435572 + }, + { + "task_name": "gem_xsum", + "prompt_name": "article_DOC_summary", + "rougeLsum_precision": 0.059106334424700216, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rougeLsum_precision_stderr": 0.000921780599854098 + }, + { + "task_name": "gem_xsum", + "prompt_name": 
"article_DOC_summary", + "rougeLsum_recall": 0.14253712577743585, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rougeLsum_recall_stderr": 0.002053226455737736 + }, + { + "task_name": "gem_xsum", + "prompt_name": "article_DOC_summary", + "rougeLsum_fmeasure": 0.08245663882625462, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rougeLsum_fmeasure_stderr": 0.0012172551546565829 + }, + { + "task_name": "gem_xsum", + "prompt_name": "article_DOC_summary", + "bleu": 0.13415861030855933, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "bleu_stderr": 0.03178941606459842 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b12bc4opt4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 1, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b12bc4opt4/evaluation/generation/slim.4b284b12bc4opt4_gem_xsum_article_DOC_summary_2.json b/4b284b12bc4opt4/evaluation/generation/slim.4b284b12bc4opt4_gem_xsum_article_DOC_summary_2.json new file mode 100644 index 0000000000000000000000000000000000000000..4ae683e99e7610794d5da7d21f39b9cd9727bf1d --- /dev/null +++ b/4b284b12bc4opt4/evaluation/generation/slim.4b284b12bc4opt4_gem_xsum_article_DOC_summary_2.json @@ -0,0 +1,133 @@ +{ + "results": [ + { + "task_name": "gem_xsum", + "prompt_name": "article_DOC_summary", + "rouge1_precision": 0.07045755744978599, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rouge1_precision_stderr": 0.0011197061035798305 + }, + { + "task_name": "gem_xsum", + "prompt_name": "article_DOC_summary", + "rouge1_recall": 0.1695166104908529, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rouge1_recall_stderr": 0.0024490112684960375 + }, + { + "task_name": "gem_xsum", + "prompt_name": "article_DOC_summary", + "rouge1_fmeasure": 0.09827283354103876, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rouge1_fmeasure_stderr": 0.0014779249302482205 + }, + { + "task_name": "gem_xsum", + "prompt_name": "article_DOC_summary", + "rouge2_precision": 0.005139282932076346, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rouge2_precision_stderr": 0.00031501935520477996 + }, + { + "task_name": "gem_xsum", + "prompt_name": "article_DOC_summary", + "rouge2_recall": 0.01241481071541199, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rouge2_recall_stderr": 0.0007816833218457778 + }, + { + "task_name": "gem_xsum", + "prompt_name": "article_DOC_summary", + "rouge2_fmeasure": 0.007191687689267013, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rouge2_fmeasure_stderr": 0.0004416289821760009 + }, + { + "task_name": "gem_xsum", + "prompt_name": "article_DOC_summary", + "rougeL_precision": 0.06394661959092474, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rougeL_precision_stderr": 0.0009610100837351401 + }, + { + "task_name": "gem_xsum", + "prompt_name": "article_DOC_summary", + "rougeL_recall": 0.15483855207760336, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rougeL_recall_stderr": 0.0021492840237945345 + }, + { + "task_name": 
"gem_xsum", + "prompt_name": "article_DOC_summary", + "rougeL_fmeasure": 0.08935199909306543, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rougeL_fmeasure_stderr": 0.0012717361942556741 + }, + { + "task_name": "gem_xsum", + "prompt_name": "article_DOC_summary", + "rougeLsum_precision": 0.0585812823989726, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rougeLsum_precision_stderr": 0.0009045085352227196 + }, + { + "task_name": "gem_xsum", + "prompt_name": "article_DOC_summary", + "rougeLsum_recall": 0.14210183912770377, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rougeLsum_recall_stderr": 0.0020497549943373035 + }, + { + "task_name": "gem_xsum", + "prompt_name": "article_DOC_summary", + "rougeLsum_fmeasure": 0.08188744465829662, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rougeLsum_fmeasure_stderr": 0.0012028366586011006 + }, + { + "task_name": "gem_xsum", + "prompt_name": "article_DOC_summary", + "bleu": 0.12926835061658584, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "bleu_stderr": 0.031144890827347994 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b12bc4opt4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 2, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b12bc4opt4/evaluation/generation/slim.4b284b12bc4opt4_gem_xsum_article_DOC_summary_3.json b/4b284b12bc4opt4/evaluation/generation/slim.4b284b12bc4opt4_gem_xsum_article_DOC_summary_3.json new file mode 100644 index 0000000000000000000000000000000000000000..f2faafd30effd7bada4117d621b74c84b02037ca --- /dev/null +++ b/4b284b12bc4opt4/evaluation/generation/slim.4b284b12bc4opt4_gem_xsum_article_DOC_summary_3.json @@ -0,0 +1,133 @@ +{ + "results": [ + { + "task_name": "gem_xsum", + "prompt_name": "article_DOC_summary", + "rouge1_precision": 0.07075379884252816, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rouge1_precision_stderr": 0.001485432353726185 + }, + { + "task_name": "gem_xsum", + "prompt_name": "article_DOC_summary", + "rouge1_recall": 0.15768815281906948, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rouge1_recall_stderr": 0.00265924403959227 + }, + { + "task_name": "gem_xsum", + "prompt_name": "article_DOC_summary", + "rouge1_fmeasure": 0.09447682363371723, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rouge1_fmeasure_stderr": 0.0016497035224024978 + }, + { + "task_name": "gem_xsum", + "prompt_name": "article_DOC_summary", + "rouge2_precision": 0.005384789457002967, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rouge2_precision_stderr": 0.00038433889596432803 + }, + { + "task_name": "gem_xsum", + "prompt_name": "article_DOC_summary", + "rouge2_recall": 0.01255287060081809, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rouge2_recall_stderr": 0.0008379579770883962 + }, + { + "task_name": "gem_xsum", + "prompt_name": "article_DOC_summary", + "rouge2_fmeasure": 0.007295985375796452, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rouge2_fmeasure_stderr": 
0.00047973371921657647 + }, + { + "task_name": "gem_xsum", + "prompt_name": "article_DOC_summary", + "rougeL_precision": 0.06325092101401272, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rougeL_precision_stderr": 0.0012434648427272113 + }, + { + "task_name": "gem_xsum", + "prompt_name": "article_DOC_summary", + "rougeL_recall": 0.14272917123385326, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rougeL_recall_stderr": 0.0022892155927241712 + }, + { + "task_name": "gem_xsum", + "prompt_name": "article_DOC_summary", + "rougeL_fmeasure": 0.08484720398405765, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rougeL_fmeasure_stderr": 0.0013808160236295816 + }, + { + "task_name": "gem_xsum", + "prompt_name": "article_DOC_summary", + "rougeLsum_precision": 0.05848048220762729, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rougeLsum_precision_stderr": 0.0012154422756537036 + }, + { + "task_name": "gem_xsum", + "prompt_name": "article_DOC_summary", + "rougeLsum_recall": 0.13155323370681296, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rougeLsum_recall_stderr": 0.0022039878122268345 + }, + { + "task_name": "gem_xsum", + "prompt_name": "article_DOC_summary", + "rougeLsum_fmeasure": 0.07823287562318222, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rougeLsum_fmeasure_stderr": 0.001328680406843208 + }, + { + "task_name": "gem_xsum", + "prompt_name": "article_DOC_summary", + "bleu": 0.19243170077336127, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "bleu_stderr": 0.029553440444076876 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b12bc4opt4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 3, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b12bc4opt4/evaluation/generation/slim.4b284b12bc4opt4_gem_xsum_article_DOC_summary_4.json b/4b284b12bc4opt4/evaluation/generation/slim.4b284b12bc4opt4_gem_xsum_article_DOC_summary_4.json new file mode 100644 index 0000000000000000000000000000000000000000..9923ab969905e1dd0c357013f50b70c9049e7d12 --- /dev/null +++ b/4b284b12bc4opt4/evaluation/generation/slim.4b284b12bc4opt4_gem_xsum_article_DOC_summary_4.json @@ -0,0 +1,133 @@ +{ + "results": [ + { + "task_name": "gem_xsum", + "prompt_name": "article_DOC_summary", + "rouge1_precision": 0.0230606329148841, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rouge1_precision_stderr": 0.001669556268708695 + }, + { + "task_name": "gem_xsum", + "prompt_name": "article_DOC_summary", + "rouge1_recall": 0.03931050113194901, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rouge1_recall_stderr": 0.002276662240791683 + }, + { + "task_name": "gem_xsum", + "prompt_name": "article_DOC_summary", + "rouge1_fmeasure": 0.02608363281745486, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rouge1_fmeasure_stderr": 0.0015378702997621066 + }, + { + "task_name": "gem_xsum", + "prompt_name": "article_DOC_summary", + "rouge2_precision": 0.0015648182940910817, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": 
"", + "rouge2_precision_stderr": 0.0002556263421133481 + }, + { + "task_name": "gem_xsum", + "prompt_name": "article_DOC_summary", + "rouge2_recall": 0.002830258508910678, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rouge2_recall_stderr": 0.0003872563374046439 + }, + { + "task_name": "gem_xsum", + "prompt_name": "article_DOC_summary", + "rouge2_fmeasure": 0.0018619726116248108, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rouge2_fmeasure_stderr": 0.00026399269918420587 + }, + { + "task_name": "gem_xsum", + "prompt_name": "article_DOC_summary", + "rougeL_precision": 0.020423048767215433, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rougeL_precision_stderr": 0.0013721487737836427 + }, + { + "task_name": "gem_xsum", + "prompt_name": "article_DOC_summary", + "rougeL_recall": 0.03591218470026906, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rougeL_recall_stderr": 0.00206779739972093 + }, + { + "task_name": "gem_xsum", + "prompt_name": "article_DOC_summary", + "rougeL_fmeasure": 0.02355261113960695, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rougeL_fmeasure_stderr": 0.001364331759867035 + }, + { + "task_name": "gem_xsum", + "prompt_name": "article_DOC_summary", + "rougeLsum_precision": 0.01899394741047602, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rougeLsum_precision_stderr": 0.0013253063930260193 + }, + { + "task_name": "gem_xsum", + "prompt_name": "article_DOC_summary", + "rougeLsum_recall": 0.032752320378036076, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rougeLsum_recall_stderr": 0.0019045379991497096 + }, + { + "task_name": "gem_xsum", + "prompt_name": "article_DOC_summary", + "rougeLsum_fmeasure": 0.021609618867255508, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rougeLsum_fmeasure_stderr": 0.0012726837713074035 + }, + { + "task_name": "gem_xsum", + "prompt_name": "article_DOC_summary", + "bleu": 0.052615863248391834, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "bleu_stderr": 0.009113330815053633 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b12bc4opt4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 4, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b12bc4opt4/evaluation/generation/slim.4b284b12bc4opt4_gem_xsum_article_DOC_summary_5.json b/4b284b12bc4opt4/evaluation/generation/slim.4b284b12bc4opt4_gem_xsum_article_DOC_summary_5.json new file mode 100644 index 0000000000000000000000000000000000000000..1bcfbe2114bff448dc5f2a69533b8a4b443fe95a --- /dev/null +++ b/4b284b12bc4opt4/evaluation/generation/slim.4b284b12bc4opt4_gem_xsum_article_DOC_summary_5.json @@ -0,0 +1,133 @@ +{ + "results": [ + { + "task_name": "gem_xsum", + "prompt_name": "article_DOC_summary", + "rouge1_precision": 0.002042535086594319, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rouge1_precision_stderr": 0.0005339034695671533 + }, + { + "task_name": "gem_xsum", + "prompt_name": "article_DOC_summary", + "rouge1_recall": 0.0017891870168785233, + "dataset_path": 
"GEM/xsum", + "dataset_name": null, + "subset": "", + "rouge1_recall_stderr": 0.0004800337144125977 + }, + { + "task_name": "gem_xsum", + "prompt_name": "article_DOC_summary", + "rouge1_fmeasure": 0.0018545887592808419, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rouge1_fmeasure_stderr": 0.0004835971603513197 + }, + { + "task_name": "gem_xsum", + "prompt_name": "article_DOC_summary", + "rouge2_precision": 5.360205831903945e-05, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rouge2_precision_stderr": 5.360205831903959e-05 + }, + { + "task_name": "gem_xsum", + "prompt_name": "article_DOC_summary", + "rouge2_recall": 5.360205831903945e-05, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rouge2_recall_stderr": 5.360205831903959e-05 + }, + { + "task_name": "gem_xsum", + "prompt_name": "article_DOC_summary", + "rouge2_fmeasure": 5.360205831903945e-05, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rouge2_fmeasure_stderr": 5.360205831903959e-05 + }, + { + "task_name": "gem_xsum", + "prompt_name": "article_DOC_summary", + "rougeL_precision": 0.0016326522715622547, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rougeL_precision_stderr": 0.00043592654264051184 + }, + { + "task_name": "gem_xsum", + "prompt_name": "article_DOC_summary", + "rougeL_recall": 0.001419705972958375, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rougeL_recall_stderr": 0.00037990440625202293 + }, + { + "task_name": "gem_xsum", + "prompt_name": "article_DOC_summary", + "rougeL_fmeasure": 0.0014859696373791518, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rougeL_fmeasure_stderr": 0.00039548761545587586 + }, + { + "task_name": "gem_xsum", + "prompt_name": "article_DOC_summary", + "rougeLsum_precision": 0.0016749881249852043, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rougeLsum_precision_stderr": 0.00045147167577573214 + }, + { + "task_name": "gem_xsum", + "prompt_name": "article_DOC_summary", + "rougeLsum_recall": 0.0014642325505038205, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rougeLsum_recall_stderr": 0.00039829208519008764 + }, + { + "task_name": "gem_xsum", + "prompt_name": "article_DOC_summary", + "rougeLsum_fmeasure": 0.0015279556773790943, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rougeLsum_fmeasure_stderr": 0.00041167209717655475 + }, + { + "task_name": "gem_xsum", + "prompt_name": "article_DOC_summary", + "bleu": 5.10720007616051e-38, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "bleu_stderr": 3.031931667498303e-33 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b12bc4opt4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 5, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b12bc4opt4/evaluation/rankeval/4b284b12bc4opt4_0.csv b/4b284b12bc4opt4/evaluation/rankeval/4b284b12bc4opt4_0.csv new file mode 100644 index 0000000000000000000000000000000000000000..e1985e2cc51b408af17dac63da67cba1aa1a7808 --- /dev/null +++ 
b/4b284b12bc4opt4/evaluation/rankeval/4b284b12bc4opt4_0.csv @@ -0,0 +1,21 @@ +task,metric,value,err,version +anli_r1,acc,0.345,0.015039986742055237,0 +anli_r2,acc,0.338,0.01496596071022449,0 +anli_r3,acc,0.3441666666666667,0.013720551062295756,0 +arc_challenge,acc,0.2713310580204778,0.012993807727545797,0 +arc_challenge,acc_norm,0.2909556313993174,0.013273077865907573,0 +arc_easy,acc,0.5753367003367004,0.01014265368748041,0 +arc_easy,acc_norm,0.515993265993266,0.01025453358928818,0 +boolq,acc,0.5091743119266054,0.00874358274491014,1 +cb,acc,0.375,0.06527912098338669,1 +cb,f1,0.1986111111111111,,1 +copa,acc,0.8,0.040201512610368445,0 +hellaswag,acc,0.4736108344951205,0.004982826916687145,0 +hellaswag,acc_norm,0.6177056363274248,0.0048495478191344825,0 +piqa,acc,0.7388465723612623,0.01024873864993558,0 +piqa,acc_norm,0.7437431991294886,0.010185787831565058,0 +rte,acc,0.5523465703971119,0.02993107036293953,0 +sciq,acc,0.842,0.01153989467755957,0 +sciq,acc_norm,0.751,0.013681600278702301,0 +storycloze_2016,acc,0.7049706039551042,0.010546232606962287,0 +winogrande,acc,0.5872138910812944,0.013837060648682103,0 diff --git a/4b284b12bc4opt4/evaluation/rankeval/4b284b12bc4opt4_0_lm-eval_global_step109672_2023-05-15-10-12-47_0shots_backup.json b/4b284b12bc4opt4/evaluation/rankeval/4b284b12bc4opt4_0_lm-eval_global_step109672_2023-05-15-10-12-47_0shots_backup.json deleted file mode 100644 index 9d0211230c963ffdb61f0be07289c69317364714..0000000000000000000000000000000000000000 --- a/4b284b12bc4opt4/evaluation/rankeval/4b284b12bc4opt4_0_lm-eval_global_step109672_2023-05-15-10-12-47_0shots_backup.json +++ /dev/null @@ -1,87 +0,0 @@ -{ - "results": { - "anli_r1": { - "acc": 0.345, - "acc_stderr": 0.015039986742055237 - }, - "anli_r2": { - "acc": 0.338, - "acc_stderr": 0.01496596071022449 - }, - "anli_r3": { - "acc": 0.3441666666666667, - "acc_stderr": 0.013720551062295756 - }, - "cb": { - "acc": 0.375, - "acc_stderr": 0.06527912098338669, - "f1": 0.1986111111111111 - }, - "copa": { - "acc": 0.8, - "acc_stderr": 0.040201512610368445 - }, - "hellaswag": { - "acc": 0.4736108344951205, - "acc_stderr": 0.004982826916687145, - "acc_norm": 0.6177056363274248, - "acc_norm_stderr": 0.0048495478191344825 - }, - "rte": { - "acc": 0.5523465703971119, - "acc_stderr": 0.02993107036293953 - }, - "winogrande": { - "acc": 0.5872138910812944, - "acc_stderr": 0.013837060648682103 - }, - "storycloze_2016": { - "acc": 0.7049706039551042, - "acc_stderr": 0.010546232606962287 - }, - "boolq": { - "acc": 0.5091743119266054, - "acc_stderr": 0.00874358274491014 - }, - "arc_easy": { - "acc": 0.5753367003367004, - "acc_stderr": 0.01014265368748041, - "acc_norm": 0.515993265993266, - "acc_norm_stderr": 0.01025453358928818 - }, - "arc_challenge": { - "acc": 0.2713310580204778, - "acc_stderr": 0.012993807727545797, - "acc_norm": 0.2909556313993174, - "acc_norm_stderr": 0.013273077865907573 - }, - "sciq": { - "acc": 0.842, - "acc_stderr": 0.01153989467755957, - "acc_norm": 0.751, - "acc_norm_stderr": 0.013681600278702301 - }, - "piqa": { - "acc": 0.7388465723612623, - "acc_stderr": 0.01024873864993558, - "acc_norm": 0.7437431991294886, - "acc_norm_stderr": 0.010185787831565058 - } - }, - "versions": { - "anli_r1": 0, - "anli_r2": 0, - "anli_r3": 0, - "cb": 1, - "copa": 0, - "hellaswag": 0, - "rte": 0, - "winogrande": 0, - "storycloze_2016": 0, - "boolq": 1, - "arc_easy": 0, - "arc_challenge": 0, - "sciq": 0, - "piqa": 0 - } -} \ No newline at end of file diff --git a/4b284b12bc4opt4/evaluation/rankeval/4b284b12bc4opt4_1.csv 
b/4b284b12bc4opt4/evaluation/rankeval/4b284b12bc4opt4_1.csv new file mode 100644 index 0000000000000000000000000000000000000000..782191860f687e2c13f1f64e9d245f391f423c40 --- /dev/null +++ b/4b284b12bc4opt4/evaluation/rankeval/4b284b12bc4opt4_1.csv @@ -0,0 +1,21 @@ +task,metric,value,err,version +anli_r1,acc,0.327,0.014842213153411239,0 +anli_r2,acc,0.318,0.014734079309311901,0 +anli_r3,acc,0.34,0.013680495725767784,0 +arc_challenge,acc,0.28242320819112626,0.013155456884097224,0 +arc_challenge,acc_norm,0.310580204778157,0.013522292098053054,0 +arc_easy,acc,0.5934343434343434,0.01007905641922353,0 +arc_easy,acc_norm,0.5534511784511784,0.010200990076245307,0 +boolq,acc,0.536085626911315,0.008722250102078083,1 +cb,acc,0.42857142857142855,0.06672848092813057,1 +cb,f1,0.2385663082437276,,1 +copa,acc,0.82,0.038612291966536955,0 +hellaswag,acc,0.4735112527384983,0.004982774293927781,0 +hellaswag,acc_norm,0.6200955984863573,0.0048437085503865145,0 +piqa,acc,0.7404787812840044,0.010227939888173922,0 +piqa,acc_norm,0.7578890097932536,0.009994371269104397,0 +rte,acc,0.5415162454873647,0.029992535385373314,0 +sciq,acc,0.85,0.0112972398234093,0 +sciq,acc_norm,0.776,0.013190830072364464,0 +storycloze_2016,acc,0.703901656867985,0.010557307688475123,0 +winogrande,acc,0.5872138910812944,0.013837060648682106,0 diff --git a/4b284b12bc4opt4/evaluation/rankeval/4b284b12bc4opt4_1_lm-eval_global_step109672_2023-05-15-10-12-47_1shots_backup.json b/4b284b12bc4opt4/evaluation/rankeval/4b284b12bc4opt4_1_lm-eval_global_step109672_2023-05-15-10-12-47_1shots_backup.json deleted file mode 100644 index c671d993b3f175016326e72c2944d0e7b897f216..0000000000000000000000000000000000000000 --- a/4b284b12bc4opt4/evaluation/rankeval/4b284b12bc4opt4_1_lm-eval_global_step109672_2023-05-15-10-12-47_1shots_backup.json +++ /dev/null @@ -1,87 +0,0 @@ -{ - "results": { - "anli_r1": { - "acc": 0.327, - "acc_stderr": 0.014842213153411239 - }, - "anli_r2": { - "acc": 0.318, - "acc_stderr": 0.014734079309311901 - }, - "anli_r3": { - "acc": 0.34, - "acc_stderr": 0.013680495725767784 - }, - "cb": { - "acc": 0.42857142857142855, - "acc_stderr": 0.06672848092813057, - "f1": 0.2385663082437276 - }, - "copa": { - "acc": 0.82, - "acc_stderr": 0.038612291966536955 - }, - "hellaswag": { - "acc": 0.4735112527384983, - "acc_stderr": 0.004982774293927781, - "acc_norm": 0.6200955984863573, - "acc_norm_stderr": 0.0048437085503865145 - }, - "rte": { - "acc": 0.5415162454873647, - "acc_stderr": 0.029992535385373314 - }, - "winogrande": { - "acc": 0.5872138910812944, - "acc_stderr": 0.013837060648682106 - }, - "storycloze_2016": { - "acc": 0.703901656867985, - "acc_stderr": 0.010557307688475123 - }, - "boolq": { - "acc": 0.536085626911315, - "acc_stderr": 0.008722250102078083 - }, - "arc_easy": { - "acc": 0.5934343434343434, - "acc_stderr": 0.01007905641922353, - "acc_norm": 0.5534511784511784, - "acc_norm_stderr": 0.010200990076245307 - }, - "arc_challenge": { - "acc": 0.28242320819112626, - "acc_stderr": 0.013155456884097224, - "acc_norm": 0.310580204778157, - "acc_norm_stderr": 0.013522292098053054 - }, - "sciq": { - "acc": 0.85, - "acc_stderr": 0.0112972398234093, - "acc_norm": 0.776, - "acc_norm_stderr": 0.013190830072364464 - }, - "piqa": { - "acc": 0.7404787812840044, - "acc_stderr": 0.010227939888173922, - "acc_norm": 0.7578890097932536, - "acc_norm_stderr": 0.009994371269104397 - } - }, - "versions": { - "anli_r1": 0, - "anli_r2": 0, - "anli_r3": 0, - "cb": 1, - "copa": 0, - "hellaswag": 0, - "rte": 0, - "winogrande": 0, - "storycloze_2016": 0, - 
"boolq": 1, - "arc_easy": 0, - "arc_challenge": 0, - "sciq": 0, - "piqa": 0 - } -} \ No newline at end of file diff --git a/4b284b12bc4opt4/evaluation/rankeval/4b284b12bc4opt4_2.csv b/4b284b12bc4opt4/evaluation/rankeval/4b284b12bc4opt4_2.csv new file mode 100644 index 0000000000000000000000000000000000000000..4162155c1effef2894213b55d32103305fe88363 --- /dev/null +++ b/4b284b12bc4opt4/evaluation/rankeval/4b284b12bc4opt4_2.csv @@ -0,0 +1,21 @@ +task,metric,value,err,version +anli_r1,acc,0.334,0.014922019523732963,0 +anli_r2,acc,0.325,0.014818724459095526,0 +anli_r3,acc,0.35083333333333333,0.013782212417178197,0 +arc_challenge,acc,0.2841296928327645,0.013179442447653886,0 +arc_challenge,acc_norm,0.30887372013651876,0.013501770929344003,0 +arc_easy,acc,0.609006734006734,0.010012992232540636,0 +arc_easy,acc_norm,0.5614478114478114,0.010182010275471116,0 +boolq,acc,0.5461773700305811,0.008707680082127857,1 +cb,acc,0.4107142857142857,0.06633634150359541,1 +cb,f1,0.23484848484848486,,1 +copa,acc,0.82,0.03861229196653694,0 +hellaswag,acc,0.4720175263891655,0.004981961097590806,0 +hellaswag,acc_norm,0.6194981079466242,0.0048451800342716195,0 +piqa,acc,0.7464635473340587,0.010150090834551786,0 +piqa,acc_norm,0.7557127312295974,0.010024765172284247,0 +rte,acc,0.5018050541516246,0.030096267148976626,0 +sciq,acc,0.853,0.011203415395160336,0 +sciq,acc_norm,0.783,0.01304151375727071,0 +storycloze_2016,acc,0.7135221806520577,0.01045510591863303,0 +winogrande,acc,0.584846093133386,0.013848684086658588,0 diff --git a/4b284b12bc4opt4/evaluation/rankeval/4b284b12bc4opt4_2_lm-eval_global_step109672_2023-05-15-10-12-47_2shots_backup.json b/4b284b12bc4opt4/evaluation/rankeval/4b284b12bc4opt4_2_lm-eval_global_step109672_2023-05-15-10-12-47_2shots_backup.json deleted file mode 100644 index e783231b163f78dca877152314faebbfdcda03b3..0000000000000000000000000000000000000000 --- a/4b284b12bc4opt4/evaluation/rankeval/4b284b12bc4opt4_2_lm-eval_global_step109672_2023-05-15-10-12-47_2shots_backup.json +++ /dev/null @@ -1,87 +0,0 @@ -{ - "results": { - "anli_r1": { - "acc": 0.334, - "acc_stderr": 0.014922019523732963 - }, - "anli_r2": { - "acc": 0.325, - "acc_stderr": 0.014818724459095526 - }, - "anli_r3": { - "acc": 0.35083333333333333, - "acc_stderr": 0.013782212417178197 - }, - "cb": { - "acc": 0.4107142857142857, - "acc_stderr": 0.06633634150359541, - "f1": 0.23484848484848486 - }, - "copa": { - "acc": 0.82, - "acc_stderr": 0.03861229196653694 - }, - "hellaswag": { - "acc": 0.4720175263891655, - "acc_stderr": 0.004981961097590806, - "acc_norm": 0.6194981079466242, - "acc_norm_stderr": 0.0048451800342716195 - }, - "rte": { - "acc": 0.5018050541516246, - "acc_stderr": 0.030096267148976626 - }, - "winogrande": { - "acc": 0.584846093133386, - "acc_stderr": 0.013848684086658588 - }, - "storycloze_2016": { - "acc": 0.7135221806520577, - "acc_stderr": 0.01045510591863303 - }, - "boolq": { - "acc": 0.5461773700305811, - "acc_stderr": 0.008707680082127857 - }, - "arc_easy": { - "acc": 0.609006734006734, - "acc_stderr": 0.010012992232540636, - "acc_norm": 0.5614478114478114, - "acc_norm_stderr": 0.010182010275471116 - }, - "arc_challenge": { - "acc": 0.2841296928327645, - "acc_stderr": 0.013179442447653886, - "acc_norm": 0.30887372013651876, - "acc_norm_stderr": 0.013501770929344003 - }, - "sciq": { - "acc": 0.853, - "acc_stderr": 0.011203415395160336, - "acc_norm": 0.783, - "acc_norm_stderr": 0.01304151375727071 - }, - "piqa": { - "acc": 0.7464635473340587, - "acc_stderr": 0.010150090834551786, - "acc_norm": 0.7557127312295974, 
- "acc_norm_stderr": 0.010024765172284247 - } - }, - "versions": { - "anli_r1": 0, - "anli_r2": 0, - "anli_r3": 0, - "cb": 1, - "copa": 0, - "hellaswag": 0, - "rte": 0, - "winogrande": 0, - "storycloze_2016": 0, - "boolq": 1, - "arc_easy": 0, - "arc_challenge": 0, - "sciq": 0, - "piqa": 0 - } -} \ No newline at end of file diff --git a/4b284b12bc4opt4/evaluation/rankeval/4b284b12bc4opt4_3.csv b/4b284b12bc4opt4/evaluation/rankeval/4b284b12bc4opt4_3.csv new file mode 100644 index 0000000000000000000000000000000000000000..ac7ef5e3afa35c72e569bc4ffc1709a0143ebb5e --- /dev/null +++ b/4b284b12bc4opt4/evaluation/rankeval/4b284b12bc4opt4_3.csv @@ -0,0 +1,21 @@ +task,metric,value,err,version +anli_r1,acc,0.329,0.014865395385928367,0 +anli_r2,acc,0.325,0.014818724459095526,0 +anli_r3,acc,0.3458333333333333,0.013736245342311012,0 +arc_challenge,acc,0.2781569965870307,0.013094469919538812,0 +arc_challenge,acc_norm,0.3037542662116041,0.01343890918477876,0 +arc_easy,acc,0.5997474747474747,0.010053550119896129,0 +arc_easy,acc_norm,0.5749158249158249,0.010143966195717845,0 +boolq,acc,0.5645259938837921,0.008671927333703594,1 +cb,acc,0.44642857142857145,0.06703189227942398,1 +cb,f1,0.3862433862433863,,1 +copa,acc,0.85,0.03588702812826371,0 +hellaswag,acc,0.4733120892252539,0.004982668452118941,0 +hellaswag,acc_norm,0.6216889065923122,0.004839746491523517,0 +piqa,acc,0.7480957562568009,0.010128421335088681,0 +piqa,acc_norm,0.7562568008705114,0.010017199471500609,0 +rte,acc,0.516245487364621,0.030080573208738064,0 +sciq,acc,0.853,0.011203415395160331,0 +sciq,acc_norm,0.785,0.012997843819031827,0 +storycloze_2016,acc,0.711918760021379,0.010472537019822582,0 +winogrande,acc,0.5785319652722968,0.0138780723774976,0 diff --git a/4b284b12bc4opt4/evaluation/rankeval/4b284b12bc4opt4_3.json b/4b284b12bc4opt4/evaluation/rankeval/4b284b12bc4opt4_3.json index 44a3731d4f97cf432abe6e355c1f661d513bc059..5f43f3e65f5a96c4844fad36b94e6b4779f06a88 100644 --- a/4b284b12bc4opt4/evaluation/rankeval/4b284b12bc4opt4_3.json +++ b/4b284b12bc4opt4/evaluation/rankeval/4b284b12bc4opt4_3.json @@ -54,6 +54,18 @@ "acc_stderr": 0.013094469919538812, "acc_norm": 0.3037542662116041, "acc_norm_stderr": 0.01343890918477876 + }, + "sciq": { + "acc": 0.853, + "acc_stderr": 0.011203415395160331, + "acc_norm": 0.785, + "acc_norm_stderr": 0.012997843819031827 + }, + "piqa": { + "acc": 0.7480957562568009, + "acc_stderr": 0.010128421335088681, + "acc_norm": 0.7562568008705114, + "acc_norm_stderr": 0.010017199471500609 } }, "versions": { @@ -68,6 +80,8 @@ "storycloze_2016": 0, "boolq": 1, "arc_easy": 0, - "arc_challenge": 0 + "arc_challenge": 0, + "sciq": 0, + "piqa": 0 } } \ No newline at end of file diff --git a/4b284b12bc4opt4/evaluation/rankeval/4b284b12bc4opt4_3_lm-eval_global_step109672_2023-05-15-10-12-47_3shots_backup.json b/4b284b12bc4opt4/evaluation/rankeval/4b284b12bc4opt4_3_lm-eval_global_step109672_2023-05-15-10-12-47_3shots_backup.json deleted file mode 100644 index 44a3731d4f97cf432abe6e355c1f661d513bc059..0000000000000000000000000000000000000000 --- a/4b284b12bc4opt4/evaluation/rankeval/4b284b12bc4opt4_3_lm-eval_global_step109672_2023-05-15-10-12-47_3shots_backup.json +++ /dev/null @@ -1,73 +0,0 @@ -{ - "results": { - "anli_r1": { - "acc": 0.329, - "acc_stderr": 0.014865395385928367 - }, - "anli_r2": { - "acc": 0.325, - "acc_stderr": 0.014818724459095526 - }, - "anli_r3": { - "acc": 0.3458333333333333, - "acc_stderr": 0.013736245342311012 - }, - "cb": { - "acc": 0.44642857142857145, - "acc_stderr": 0.06703189227942398, - "f1": 
0.3862433862433863 - }, - "copa": { - "acc": 0.85, - "acc_stderr": 0.03588702812826371 - }, - "hellaswag": { - "acc": 0.4733120892252539, - "acc_stderr": 0.004982668452118941, - "acc_norm": 0.6216889065923122, - "acc_norm_stderr": 0.004839746491523517 - }, - "rte": { - "acc": 0.516245487364621, - "acc_stderr": 0.030080573208738064 - }, - "winogrande": { - "acc": 0.5785319652722968, - "acc_stderr": 0.0138780723774976 - }, - "storycloze_2016": { - "acc": 0.711918760021379, - "acc_stderr": 0.010472537019822582 - }, - "boolq": { - "acc": 0.5645259938837921, - "acc_stderr": 0.008671927333703594 - }, - "arc_easy": { - "acc": 0.5997474747474747, - "acc_stderr": 0.010053550119896129, - "acc_norm": 0.5749158249158249, - "acc_norm_stderr": 0.010143966195717845 - }, - "arc_challenge": { - "acc": 0.2781569965870307, - "acc_stderr": 0.013094469919538812, - "acc_norm": 0.3037542662116041, - "acc_norm_stderr": 0.01343890918477876 - } - }, - "versions": { - "anli_r1": 0, - "anli_r2": 0, - "anli_r3": 0, - "cb": 1, - "copa": 0, - "hellaswag": 0, - "rte": 0, - "winogrande": 0, - "storycloze_2016": 0, - "boolq": 1, - "arc_easy": 0, - "arc_challenge": 0 - } -} \ No newline at end of file diff --git a/4b284b12bc4opt4/evaluation/rankeval/4b284b12bc4opt4_4.csv b/4b284b12bc4opt4/evaluation/rankeval/4b284b12bc4opt4_4.csv new file mode 100644 index 0000000000000000000000000000000000000000..13bda48c81df7047a87e7bc3faf4bcd9049993f2 --- /dev/null +++ b/4b284b12bc4opt4/evaluation/rankeval/4b284b12bc4opt4_4.csv @@ -0,0 +1,21 @@ +task,metric,value,err,version +anli_r1,acc,0.342,0.015008706182121731,0 +anli_r2,acc,0.318,0.014734079309311901,0 +anli_r3,acc,0.3275,0.013553211167251944,0 +arc_challenge,acc,0.28924914675767915,0.013250012579393441,0 +arc_challenge,acc_norm,0.302901023890785,0.013428241573185349,0 +arc_easy,acc,0.6031144781144782,0.010039236800583206,0 +arc_easy,acc_norm,0.5622895622895623,0.010179856486006899,0 +boolq,acc,0.5581039755351682,0.008685806399014942,1 +cb,acc,0.4642857142857143,0.0672477765493766,1 +cb,f1,0.3282312925170068,,1 +copa,acc,0.8,0.040201512610368445,0 +hellaswag,acc,0.47161919936267677,0.004981736689518751,0 +hellaswag,acc_norm,0.622087233618801,0.004838747305783333,0 +piqa,acc,0.7464635473340587,0.010150090834551784,0 +piqa,acc_norm,0.764961915125136,0.00989314668880531,0 +rte,acc,0.5126353790613718,0.030086851767188564,0 +sciq,acc,0.847,0.011389500459665537,0 +sciq,acc_norm,0.795,0.012772554096113116,0 +storycloze_2016,acc,0.7076429716729022,0.01051823972978774,0 +winogrande,acc,0.5864246250986582,0.013840971763195306,0 diff --git a/4b284b12bc4opt4/evaluation/rankeval/4b284b12bc4opt4_4.json b/4b284b12bc4opt4/evaluation/rankeval/4b284b12bc4opt4_4.json index 45eb55cd7d14d7e819eb23c666eaba7531d90c13..84563ff29783edf63bb237f1e35652530a3e6ec4 100644 --- a/4b284b12bc4opt4/evaluation/rankeval/4b284b12bc4opt4_4.json +++ b/4b284b12bc4opt4/evaluation/rankeval/4b284b12bc4opt4_4.json @@ -42,6 +42,30 @@ "boolq": { "acc": 0.5581039755351682, "acc_stderr": 0.008685806399014942 + }, + "arc_easy": { + "acc": 0.6031144781144782, + "acc_stderr": 0.010039236800583206, + "acc_norm": 0.5622895622895623, + "acc_norm_stderr": 0.010179856486006899 + }, + "arc_challenge": { + "acc": 0.28924914675767915, + "acc_stderr": 0.013250012579393441, + "acc_norm": 0.302901023890785, + "acc_norm_stderr": 0.013428241573185349 + }, + "sciq": { + "acc": 0.847, + "acc_stderr": 0.011389500459665537, + "acc_norm": 0.795, + "acc_norm_stderr": 0.012772554096113116 + }, + "piqa": { + "acc": 0.7464635473340587, + "acc_stderr": 
0.010150090834551784, + "acc_norm": 0.764961915125136, + "acc_norm_stderr": 0.00989314668880531 } }, "versions": { @@ -54,6 +78,10 @@ "rte": 0, "winogrande": 0, "storycloze_2016": 0, - "boolq": 1 + "boolq": 1, + "arc_easy": 0, + "arc_challenge": 0, + "sciq": 0, + "piqa": 0 } } \ No newline at end of file diff --git a/4b284b12bc4opt4/evaluation/rankeval/4b284b12bc4opt4_4_lm-eval_global_step109672_2023-05-15-10-12-47_4shots_backup.json b/4b284b12bc4opt4/evaluation/rankeval/4b284b12bc4opt4_4_lm-eval_global_step109672_2023-05-15-10-12-47_4shots_backup.json deleted file mode 100644 index 45eb55cd7d14d7e819eb23c666eaba7531d90c13..0000000000000000000000000000000000000000 --- a/4b284b12bc4opt4/evaluation/rankeval/4b284b12bc4opt4_4_lm-eval_global_step109672_2023-05-15-10-12-47_4shots_backup.json +++ /dev/null @@ -1,59 +0,0 @@ -{ - "results": { - "anli_r1": { - "acc": 0.342, - "acc_stderr": 0.015008706182121731 - }, - "anli_r2": { - "acc": 0.318, - "acc_stderr": 0.014734079309311901 - }, - "anli_r3": { - "acc": 0.3275, - "acc_stderr": 0.013553211167251944 - }, - "cb": { - "acc": 0.4642857142857143, - "acc_stderr": 0.0672477765493766, - "f1": 0.3282312925170068 - }, - "copa": { - "acc": 0.8, - "acc_stderr": 0.040201512610368445 - }, - "hellaswag": { - "acc": 0.47161919936267677, - "acc_stderr": 0.004981736689518751, - "acc_norm": 0.622087233618801, - "acc_norm_stderr": 0.004838747305783333 - }, - "rte": { - "acc": 0.5126353790613718, - "acc_stderr": 0.030086851767188564 - }, - "winogrande": { - "acc": 0.5864246250986582, - "acc_stderr": 0.013840971763195306 - }, - "storycloze_2016": { - "acc": 0.7076429716729022, - "acc_stderr": 0.01051823972978774 - }, - "boolq": { - "acc": 0.5581039755351682, - "acc_stderr": 0.008685806399014942 - } - }, - "versions": { - "anli_r1": 0, - "anli_r2": 0, - "anli_r3": 0, - "cb": 1, - "copa": 0, - "hellaswag": 0, - "rte": 0, - "winogrande": 0, - "storycloze_2016": 0, - "boolq": 1 - } -} \ No newline at end of file diff --git a/4b284b12bc4opt4/evaluation/rankeval/4b284b12bc4opt4_5.csv b/4b284b12bc4opt4/evaluation/rankeval/4b284b12bc4opt4_5.csv new file mode 100644 index 0000000000000000000000000000000000000000..b2a3073fc16a320f097d98b5c8d47f42ba8cb5cf --- /dev/null +++ b/4b284b12bc4opt4/evaluation/rankeval/4b284b12bc4opt4_5.csv @@ -0,0 +1,21 @@ +task,metric,value,err,version +anli_r1,acc,0.318,0.014734079309311901,0 +anli_r2,acc,0.322,0.014782913600996662,0 +anli_r3,acc,0.3525,0.01379716491891836,0 +arc_challenge,acc,0.29266211604095566,0.013295916103619411,0 +arc_challenge,acc_norm,0.3046075085324232,0.013449522109932492,0 +arc_easy,acc,0.6077441077441077,0.010018744689650042,0 +arc_easy,acc_norm,0.5673400673400674,0.010166307932642874,0 +boolq,acc,0.5532110091743119,0.008695392261996197,1 +cb,acc,0.4642857142857143,0.06724777654937658,1 +cb,f1,0.32575201760821715,,1 +copa,acc,0.76,0.04292346959909283,0 +hellaswag,acc,0.4700258912567218,0.0049808072311367515,0 +hellaswag,acc_norm,0.6182035451105358,0.0048483415604921335,0 +piqa,acc,0.7459194776931447,0.010157271999135044,0 +piqa,acc_norm,0.7568008705114254,0.010009611953858919,0 +rte,acc,0.5090252707581228,0.030091559826331334,0 +sciq,acc,0.844,0.011480235006122363,0 +sciq,acc_norm,0.79,0.012886662332274548,0 +storycloze_2016,acc,0.7071084981293426,0.010523873293246305,0 +winogrande,acc,0.5714285714285714,0.013908353814606696,0 diff --git a/4b284b12bc4opt4/evaluation/rankeval/4b284b12bc4opt4_5.json b/4b284b12bc4opt4/evaluation/rankeval/4b284b12bc4opt4_5.json index 
d59a41925dd1655d53a24104f2fcab52a068afda..cc94ca0ed948e22678baf5db83b954a6cd2ca1f1 100644 --- a/4b284b12bc4opt4/evaluation/rankeval/4b284b12bc4opt4_5.json +++ b/4b284b12bc4opt4/evaluation/rankeval/4b284b12bc4opt4_5.json @@ -38,6 +38,34 @@ "storycloze_2016": { "acc": 0.7071084981293426, "acc_stderr": 0.010523873293246305 + }, + "boolq": { + "acc": 0.5532110091743119, + "acc_stderr": 0.008695392261996197 + }, + "arc_easy": { + "acc": 0.6077441077441077, + "acc_stderr": 0.010018744689650042, + "acc_norm": 0.5673400673400674, + "acc_norm_stderr": 0.010166307932642874 + }, + "arc_challenge": { + "acc": 0.29266211604095566, + "acc_stderr": 0.013295916103619411, + "acc_norm": 0.3046075085324232, + "acc_norm_stderr": 0.013449522109932492 + }, + "sciq": { + "acc": 0.844, + "acc_stderr": 0.011480235006122363, + "acc_norm": 0.79, + "acc_norm_stderr": 0.012886662332274548 + }, + "piqa": { + "acc": 0.7459194776931447, + "acc_stderr": 0.010157271999135044, + "acc_norm": 0.7568008705114254, + "acc_norm_stderr": 0.010009611953858919 } }, "versions": { @@ -49,6 +77,11 @@ "hellaswag": 0, "rte": 0, "winogrande": 0, - "storycloze_2016": 0 + "storycloze_2016": 0, + "boolq": 1, + "arc_easy": 0, + "arc_challenge": 0, + "sciq": 0, + "piqa": 0 } } \ No newline at end of file diff --git a/4b284b12bc4opt4/evaluation/rankeval/4b284b12bc4opt4_5_lm-eval_global_step109672_2023-05-15-10-12-47_5shots_backup.json b/4b284b12bc4opt4/evaluation/rankeval/4b284b12bc4opt4_5_lm-eval_global_step109672_2023-05-15-10-12-47_5shots_backup.json deleted file mode 100644 index d59a41925dd1655d53a24104f2fcab52a068afda..0000000000000000000000000000000000000000 --- a/4b284b12bc4opt4/evaluation/rankeval/4b284b12bc4opt4_5_lm-eval_global_step109672_2023-05-15-10-12-47_5shots_backup.json +++ /dev/null @@ -1,54 +0,0 @@ -{ - "results": { - "anli_r1": { - "acc": 0.318, - "acc_stderr": 0.014734079309311901 - }, - "anli_r2": { - "acc": 0.322, - "acc_stderr": 0.014782913600996662 - }, - "anli_r3": { - "acc": 0.3525, - "acc_stderr": 0.01379716491891836 - }, - "cb": { - "acc": 0.4642857142857143, - "acc_stderr": 0.06724777654937658, - "f1": 0.32575201760821715 - }, - "copa": { - "acc": 0.76, - "acc_stderr": 0.04292346959909283 - }, - "hellaswag": { - "acc": 0.4700258912567218, - "acc_stderr": 0.0049808072311367515, - "acc_norm": 0.6182035451105358, - "acc_norm_stderr": 0.0048483415604921335 - }, - "rte": { - "acc": 0.5090252707581228, - "acc_stderr": 0.030091559826331334 - }, - "winogrande": { - "acc": 0.5714285714285714, - "acc_stderr": 0.013908353814606696 - }, - "storycloze_2016": { - "acc": 0.7071084981293426, - "acc_stderr": 0.010523873293246305 - } - }, - "versions": { - "anli_r1": 0, - "anli_r2": 0, - "anli_r3": 0, - "cb": 1, - "copa": 0, - "hellaswag": 0, - "rte": 0, - "winogrande": 0, - "storycloze_2016": 0 - } -} \ No newline at end of file
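
Note: the CSVs added in this diff all share the schema task,metric,value,err,version, with one file per few-shot setting (suffix _0 through _5). Below is a minimal sketch of how one might aggregate them to track a metric across shot counts. The helper name and loop are illustrative, not part of this repo; the base path and schema are taken from the diff, and the empty-err guard reflects rows like cb/f1 above, which carry no stderr.

# Hypothetical helper (not part of this repo): collect the rankeval CSVs
# added in this diff into {(task, metric): (value, stderr)} per shot count.
import csv
from pathlib import Path

BASE = Path("4b284b12bc4opt4/evaluation/rankeval")  # path as it appears in the diff

def load_shot(n_shots: int) -> dict:
    """Parse one few-shot CSV (schema: task,metric,value,err,version)."""
    rows = {}
    with open(BASE / f"4b284b12bc4opt4_{n_shots}.csv", newline="") as f:
        for row in csv.DictReader(f):
            # 'err' is empty for some metrics (e.g. cb/f1), so guard the cast
            err = float(row["err"]) if row["err"] else None
            rows[(row["task"], row["metric"])] = (float(row["value"]), err)
    return rows

# Example: arc_easy accuracy from 0-shot to 5-shot
for shots in range(6):
    value, err = load_shot(shots)[("arc_easy", "acc")]
    print(f"{shots}-shot arc_easy acc = {value:.4f} +/- {err:.4f}")

Reading the files this way, rather than the deleted *_backup.json variants, keeps the analysis on the CSVs that this diff establishes as the surviving format.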