Commit: b277454
Parent(s): c631903
Commit message: Add

This view is limited to 50 files because the commit contains too many changes; see the raw diff for the full change set.
- .gitattributes +86 -0
- 2b855b9bc4seed1/evaluation/generation/agg.2b855b9bc4seed1_e2e_nlg_cleaned_generate_text_restaurant_2.json +1 -0
- 2b855b9bc4seed1/evaluation/generation/agg.2b855b9bc4seed1_e2e_nlg_cleaned_generate_text_restaurant_3.json +1 -0
- 2b855b9bc4seed1/evaluation/generation/agg.2b855b9bc4seed1_e2e_nlg_cleaned_generate_text_restaurant_4.json +1 -0
- 2b855b9bc4seed1/evaluation/generation/agg.2b855b9bc4seed1_e2e_nlg_cleaned_generate_text_restaurant_5.json +1 -0
- 2b855b9bc4seed1/evaluation/generation/agg.2b855b9bc4seed1_gem_xsum_article_DOC_summary_3.json +1 -0
- 2b855b9bc4seed1/evaluation/generation/agg.2b855b9bc4seed1_gem_xsum_article_DOC_summary_4.json +1 -0
- 2b855b9bc4seed1/evaluation/generation/agg.2b855b9bc4seed1_gem_xsum_article_DOC_summary_5.json +1 -0
- 2b855b9bc4seed1/evaluation/generation/examples.2b855b9bc4seed1_GEM-web_nlg_en_PALM_prompt_2.jsonl +0 -0
- 2b855b9bc4seed1/evaluation/generation/examples.2b855b9bc4seed1_GEM-web_nlg_en_PALM_prompt_3.jsonl +0 -0
- 2b855b9bc4seed1/evaluation/generation/examples.2b855b9bc4seed1_GEM-web_nlg_en_PALM_prompt_4.jsonl +0 -0
- 2b855b9bc4seed1/evaluation/generation/examples.2b855b9bc4seed1_GEM-web_nlg_en_PALM_prompt_5.jsonl +0 -0
- 2b855b9bc4seed1/evaluation/generation/examples.2b855b9bc4seed1_GEM-wiki_lingua_en_tldr_en_3.jsonl +0 -0
- 2b855b9bc4seed1/evaluation/generation/examples.2b855b9bc4seed1_GEM-wiki_lingua_en_tldr_en_4.jsonl +0 -0
- 2b855b9bc4seed1/evaluation/generation/examples.2b855b9bc4seed1_GEM-wiki_lingua_en_tldr_en_5.jsonl +0 -0
- 2b855b9bc4seed1/evaluation/generation/examples.2b855b9bc4seed1_e2e_nlg_cleaned_generate_text_restaurant_2.jsonl +3 -0
- 2b855b9bc4seed1/evaluation/generation/examples.2b855b9bc4seed1_e2e_nlg_cleaned_generate_text_restaurant_3.jsonl +3 -0
- 2b855b9bc4seed1/evaluation/generation/examples.2b855b9bc4seed1_e2e_nlg_cleaned_generate_text_restaurant_4.jsonl +3 -0
- 2b855b9bc4seed1/evaluation/generation/examples.2b855b9bc4seed1_e2e_nlg_cleaned_generate_text_restaurant_5.jsonl +3 -0
- 2b855b9bc4seed1/evaluation/generation/examples.2b855b9bc4seed1_gem_xsum_article_DOC_summary_3.jsonl +3 -0
- 2b855b9bc4seed1/evaluation/generation/examples.2b855b9bc4seed1_gem_xsum_article_DOC_summary_4.jsonl +3 -0
- 2b855b9bc4seed1/evaluation/generation/examples.2b855b9bc4seed1_gem_xsum_article_DOC_summary_5.jsonl +3 -0
- 2b855b9bc4seed1/evaluation/generation/merged.csv +39 -0
- 2b855b9bc4seed1/evaluation/generation/merged.json +1 -0
- 2b855b9bc4seed1/evaluation/generation/slim.2b855b9bc4seed1_e2e_nlg_cleaned_generate_text_restaurant_2.json +133 -0
- 2b855b9bc4seed1/evaluation/generation/slim.2b855b9bc4seed1_e2e_nlg_cleaned_generate_text_restaurant_3.json +133 -0
- 2b855b9bc4seed1/evaluation/generation/slim.2b855b9bc4seed1_e2e_nlg_cleaned_generate_text_restaurant_4.json +133 -0
- 2b855b9bc4seed1/evaluation/generation/slim.2b855b9bc4seed1_e2e_nlg_cleaned_generate_text_restaurant_5.json +133 -0
- 2b855b9bc4seed1/evaluation/generation/slim.2b855b9bc4seed1_gem_xsum_article_DOC_summary_3.json +133 -0
- 2b855b9bc4seed1/evaluation/generation/slim.2b855b9bc4seed1_gem_xsum_article_DOC_summary_4.json +133 -0
- 2b855b9bc4seed1/evaluation/generation/slim.2b855b9bc4seed1_gem_xsum_article_DOC_summary_5.json +133 -0
- 2b855b9bc4seed1/evaluation/rankeval/2b855b9bc4seed1_0.csv +21 -0
- 2b855b9bc4seed1/evaluation/rankeval/2b855b9bc4seed1_0_lm-eval_global_step52452_2023-02-24-23-57-47_0shots_backup.json +0 -87
- 2b855b9bc4seed1/evaluation/rankeval/2b855b9bc4seed1_1.csv +21 -0
- 2b855b9bc4seed1/evaluation/rankeval/2b855b9bc4seed1_1.json +15 -1
- 2b855b9bc4seed1/evaluation/rankeval/2b855b9bc4seed1_1_lm-eval_global_step52452_2023-02-24-23-57-47_1shots_backup.json +0 -73
- 2b855b9bc4seed1/evaluation/rankeval/2b855b9bc4seed1_2.csv +21 -0
- 2b855b9bc4seed1/evaluation/rankeval/2b855b9bc4seed1_2.json +34 -1
- 2b855b9bc4seed1/evaluation/rankeval/2b855b9bc4seed1_2_lm-eval_global_step52452_2023-02-24-23-57-47_2shots_backup.json +0 -54
- 2b855b9bc4seed1/evaluation/rankeval/2b855b9bc4seed1_3.csv +21 -0
- 2b855b9bc4seed1/evaluation/rankeval/2b855b9bc4seed1_3.json +49 -1
- 2b855b9bc4seed1/evaluation/rankeval/2b855b9bc4seed1_3_lm-eval_global_step52452_2023-02-24-23-57-47_3shots_backup.json +0 -39
- 2b855b9bc4seed1/evaluation/rankeval/2b855b9bc4seed1_4.csv +21 -0
- 2b855b9bc4seed1/evaluation/rankeval/2b855b9bc4seed1_4.json +56 -1
- 2b855b9bc4seed1/evaluation/rankeval/2b855b9bc4seed1_5.csv +21 -0
- 2b855b9bc4seed1/evaluation/rankeval/2b855b9bc4seed1_5.json +56 -1
- 2b855b9bc4seed2/evaluation/generation/agg.2b855b9bc4seed2_GEM-web_nlg_en_PALM_prompt_0.json +1 -0
- 2b855b9bc4seed2/evaluation/generation/agg.2b855b9bc4seed2_GEM-wiki_lingua_en_tldr_en_0.json +1 -0
- 2b855b9bc4seed2/evaluation/generation/agg.2b855b9bc4seed2_GEM-wiki_lingua_en_tldr_en_1.json +1 -0
- 2b855b9bc4seed2/evaluation/generation/agg.2b855b9bc4seed2_e2e_nlg_cleaned_generate_text_restaurant_0.json +1 -0
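The file names above encode the run they belong to: `<kind>.<variant>_<task-and-prompt>_<shots>.json[l]`, where the kind is `agg` (aggregated metrics), `slim` (trimmed summary), or `examples` (raw generations), and the trailing digit is the few-shot count. A minimal Python sketch of parsing that convention (the regex and helper are illustrative assumptions, not part of this repo):

```python
import re
from pathlib import Path

# Assumed naming convention, e.g.
# "agg.2b855b9bc4seed1_e2e_nlg_cleaned_generate_text_restaurant_2.json"
# -> kind="agg", variant="2b855b9bc4seed1", shots="2".
NAME_RE = re.compile(
    r"^(?P<kind>agg|examples|slim)\."      # file kind
    r"(?P<variant>[0-9a-z]+seed\d+)_"      # model variant, e.g. 2b855b9bc4seed1
    r"(?P<task_prompt>.+)_"                # task and prompt, underscore-joined
    r"(?P<shots>\d+)"                      # few-shot count (0-5)
    r"\.jsonl?$"
)

def parse_eval_filename(path: Path) -> dict | None:
    """Hypothetical helper: split an evaluation file name into its fields."""
    m = NAME_RE.match(path.name)
    return m.groupdict() if m else None

for p in sorted(Path("2b855b9bc4seed1/evaluation/generation").glob("agg.*.json")):
    info = parse_eval_filename(p)
    if info:
        print(info["variant"], info["task_prompt"], info["shots"])
```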
.gitattributes
CHANGED
@@ -733,3 +733,89 @@ evaluation/seed2/generation/examples.limited=3000.model=seed2.task=GEM-wiki_ling
 2b855b9bc4seed1/evaluation/generation/examples.2b855b9bc4seed1_GEM-wiki_lingua_en_tldr_en_0.jsonl filter=lfs diff=lfs merge=lfs -text
 2b855b9bc4seed1/evaluation/generation/examples.2b855b9bc4seed1_GEM-wiki_lingua_en_tldr_en_1.jsonl filter=lfs diff=lfs merge=lfs -text
 2b855b9bc4seed1/evaluation/generation/examples.2b855b9bc4seed1_gem_xsum_article_DOC_summary_1.jsonl filter=lfs diff=lfs merge=lfs -text
+2b855b9bc4seed2/evaluation/generation/examples.2b855b9bc4seed2_e2e_nlg_cleaned_generate_text_restaurant_1.jsonl filter=lfs diff=lfs merge=lfs -text
+2b855b9bc4seed2/evaluation/generation/examples.2b855b9bc4seed2_GEM-web_nlg_en_PALM_prompt_5.jsonl filter=lfs diff=lfs merge=lfs -text
+2b855b9bc4seed3/evaluation/generation/examples.2b855b9bc4seed3_e2e_nlg_cleaned_generate_text_restaurant_1.jsonl filter=lfs diff=lfs merge=lfs -text
+2b855b9bc4seed3/evaluation/generation/examples.2b855b9bc4seed3_GEM-web_nlg_en_PALM_prompt_0.jsonl filter=lfs diff=lfs merge=lfs -text
+2b855b9bc4seed3/evaluation/generation/examples.2b855b9bc4seed3_GEM-wiki_lingua_en_tldr_en_5.jsonl filter=lfs diff=lfs merge=lfs -text
+2b855b9bc4seed3/evaluation/generation/examples.2b855b9bc4seed3_gem_xsum_article_DOC_summary_5.jsonl filter=lfs diff=lfs merge=lfs -text
+2b855b9bc4seed4/evaluation/generation/examples.2b855b9bc4seed4_GEM-wiki_lingua_en_tldr_en_0.jsonl filter=lfs diff=lfs merge=lfs -text
+2b855b9bc4seed1/evaluation/generation/examples.2b855b9bc4seed1_gem_xsum_article_DOC_summary_4.jsonl filter=lfs diff=lfs merge=lfs -text
+2b855b9bc4seed4/evaluation/generation/examples.2b855b9bc4seed4_gem_xsum_article_DOC_summary_5.jsonl filter=lfs diff=lfs merge=lfs -text
+2b855b9bc4seed3/evaluation/generation/examples.2b855b9bc4seed3_GEM-web_nlg_en_PALM_prompt_3.jsonl filter=lfs diff=lfs merge=lfs -text
+2b855b9bc4seed3/evaluation/generation/examples.2b855b9bc4seed3_gem_xsum_article_DOC_summary_2.jsonl filter=lfs diff=lfs merge=lfs -text
+2b855b9bc4seed4/evaluation/generation/examples.2b855b9bc4seed4_e2e_nlg_cleaned_generate_text_restaurant_4.jsonl filter=lfs diff=lfs merge=lfs -text
+2b855b9bc4seed4/evaluation/generation/examples.2b855b9bc4seed4_GEM-wiki_lingua_en_tldr_en_2.jsonl filter=lfs diff=lfs merge=lfs -text
+2b855b9bc4seed4/evaluation/generation/examples.2b855b9bc4seed4_gem_xsum_article_DOC_summary_2.jsonl filter=lfs diff=lfs merge=lfs -text
+2b855b9bc4seed3/evaluation/generation/examples.2b855b9bc4seed3_GEM-web_nlg_en_PALM_prompt_1.jsonl filter=lfs diff=lfs merge=lfs -text
+2b855b9bc4seed2/evaluation/generation/examples.2b855b9bc4seed2_GEM-web_nlg_en_PALM_prompt_4.jsonl filter=lfs diff=lfs merge=lfs -text
+2b855b9bc4seed2/evaluation/generation/examples.2b855b9bc4seed2_GEM-wiki_lingua_en_tldr_en_3.jsonl filter=lfs diff=lfs merge=lfs -text
+2b855b9bc4seed2/evaluation/generation/examples.2b855b9bc4seed2_gem_xsum_article_DOC_summary_2.jsonl filter=lfs diff=lfs merge=lfs -text
+2b855b9bc4seed3/evaluation/generation/examples.2b855b9bc4seed3_GEM-wiki_lingua_en_tldr_en_4.jsonl filter=lfs diff=lfs merge=lfs -text
+2b855b9bc4seed1/evaluation/generation/examples.2b855b9bc4seed1_e2e_nlg_cleaned_generate_text_restaurant_5.jsonl filter=lfs diff=lfs merge=lfs -text
+2b855b9bc4seed2/evaluation/generation/examples.2b855b9bc4seed2_e2e_nlg_cleaned_generate_text_restaurant_5.jsonl filter=lfs diff=lfs merge=lfs -text
+2b855b9bc4seed2/evaluation/generation/examples.2b855b9bc4seed2_gem_xsum_article_DOC_summary_5.jsonl filter=lfs diff=lfs merge=lfs -text
+2b855b9bc4seed3/evaluation/generation/examples.2b855b9bc4seed3_e2e_nlg_cleaned_generate_text_restaurant_0.jsonl filter=lfs diff=lfs merge=lfs -text
+2b855b9bc4seed3/evaluation/generation/examples.2b855b9bc4seed3_e2e_nlg_cleaned_generate_text_restaurant_5.jsonl filter=lfs diff=lfs merge=lfs -text
+2b855b9bc4seed3/evaluation/generation/examples.2b855b9bc4seed3_GEM-web_nlg_en_PALM_prompt_2.jsonl filter=lfs diff=lfs merge=lfs -text
+2b855b9bc4seed2/evaluation/generation/examples.2b855b9bc4seed2_e2e_nlg_cleaned_generate_text_restaurant_0.jsonl filter=lfs diff=lfs merge=lfs -text
+2b855b9bc4seed2/evaluation/generation/examples.2b855b9bc4seed2_GEM-wiki_lingua_en_tldr_en_1.jsonl filter=lfs diff=lfs merge=lfs -text
+2b855b9bc4seed2/evaluation/generation/examples.2b855b9bc4seed2_gem_xsum_article_DOC_summary_3.jsonl filter=lfs diff=lfs merge=lfs -text
+2b855b9bc4seed3/evaluation/generation/examples.2b855b9bc4seed3_gem_xsum_article_DOC_summary_1.jsonl filter=lfs diff=lfs merge=lfs -text
+2b855b9bc4seed4/evaluation/generation/examples.2b855b9bc4seed4_GEM-wiki_lingua_en_tldr_en_4.jsonl filter=lfs diff=lfs merge=lfs -text
+2b855b9bc4seed4/evaluation/generation/examples.2b855b9bc4seed4_gem_xsum_article_DOC_summary_1.jsonl filter=lfs diff=lfs merge=lfs -text
+2b855b9bc4seed1/evaluation/generation/examples.2b855b9bc4seed1_GEM-web_nlg_en_PALM_prompt_3.jsonl filter=lfs diff=lfs merge=lfs -text
+2b855b9bc4seed4/evaluation/generation/examples.2b855b9bc4seed4_GEM-web_nlg_en_PALM_prompt_2.jsonl filter=lfs diff=lfs merge=lfs -text
+2b855b9bc4seed1/evaluation/generation/examples.2b855b9bc4seed1_GEM-wiki_lingua_en_tldr_en_5.jsonl filter=lfs diff=lfs merge=lfs -text
+2b855b9bc4seed2/evaluation/generation/examples.2b855b9bc4seed2_e2e_nlg_cleaned_generate_text_restaurant_2.jsonl filter=lfs diff=lfs merge=lfs -text
+2b855b9bc4seed3/evaluation/generation/examples.2b855b9bc4seed3_e2e_nlg_cleaned_generate_text_restaurant_2.jsonl filter=lfs diff=lfs merge=lfs -text
+2b855b9bc4seed3/evaluation/generation/examples.2b855b9bc4seed3_GEM-wiki_lingua_en_tldr_en_0.jsonl filter=lfs diff=lfs merge=lfs -text
+2b855b9bc4seed4/evaluation/generation/examples.2b855b9bc4seed4_e2e_nlg_cleaned_generate_text_restaurant_3.jsonl filter=lfs diff=lfs merge=lfs -text
+2b855b9bc4seed4/evaluation/generation/examples.2b855b9bc4seed4_GEM-web_nlg_en_PALM_prompt_0.jsonl filter=lfs diff=lfs merge=lfs -text
+2b855b9bc4seed4/evaluation/generation/examples.2b855b9bc4seed4_GEM-wiki_lingua_en_tldr_en_5.jsonl filter=lfs diff=lfs merge=lfs -text
+2b855b9bc4seed1/evaluation/generation/examples.2b855b9bc4seed1_GEM-web_nlg_en_PALM_prompt_2.jsonl filter=lfs diff=lfs merge=lfs -text
+2b855b9bc4seed1/evaluation/generation/examples.2b855b9bc4seed1_GEM-wiki_lingua_en_tldr_en_4.jsonl filter=lfs diff=lfs merge=lfs -text
+2b855b9bc4seed2/evaluation/generation/examples.2b855b9bc4seed2_gem_xsum_article_DOC_summary_1.jsonl filter=lfs diff=lfs merge=lfs -text
+2b855b9bc4seed3/evaluation/generation/examples.2b855b9bc4seed3_GEM-wiki_lingua_en_tldr_en_2.jsonl filter=lfs diff=lfs merge=lfs -text
+2b855b9bc4seed3/evaluation/generation/examples.2b855b9bc4seed3_gem_xsum_article_DOC_summary_3.jsonl filter=lfs diff=lfs merge=lfs -text
+2b855b9bc4seed4/evaluation/generation/examples.2b855b9bc4seed4_GEM-web_nlg_en_PALM_prompt_1.jsonl filter=lfs diff=lfs merge=lfs -text
+2b855b9bc4seed4/evaluation/generation/examples.2b855b9bc4seed4_GEM-web_nlg_en_PALM_prompt_3.jsonl filter=lfs diff=lfs merge=lfs -text
+2b855b9bc4seed4/evaluation/generation/examples.2b855b9bc4seed4_gem_xsum_article_DOC_summary_3.jsonl filter=lfs diff=lfs merge=lfs -text
+2b855b9bc4seed1/evaluation/generation/examples.2b855b9bc4seed1_e2e_nlg_cleaned_generate_text_restaurant_2.jsonl filter=lfs diff=lfs merge=lfs -text
+2b855b9bc4seed2/evaluation/generation/examples.2b855b9bc4seed2_e2e_nlg_cleaned_generate_text_restaurant_3.jsonl filter=lfs diff=lfs merge=lfs -text
+2b855b9bc4seed2/evaluation/generation/examples.2b855b9bc4seed2_GEM-web_nlg_en_PALM_prompt_2.jsonl filter=lfs diff=lfs merge=lfs -text
+2b855b9bc4seed2/evaluation/generation/examples.2b855b9bc4seed2_gem_xsum_article_DOC_summary_0.jsonl filter=lfs diff=lfs merge=lfs -text
+2b855b9bc4seed3/evaluation/generation/examples.2b855b9bc4seed3_e2e_nlg_cleaned_generate_text_restaurant_3.jsonl filter=lfs diff=lfs merge=lfs -text
+2b855b9bc4seed4/evaluation/generation/examples.2b855b9bc4seed4_e2e_nlg_cleaned_generate_text_restaurant_2.jsonl filter=lfs diff=lfs merge=lfs -text
+2b855b9bc4seed1/evaluation/generation/examples.2b855b9bc4seed1_gem_xsum_article_DOC_summary_3.jsonl filter=lfs diff=lfs merge=lfs -text
+2b855b9bc4seed2/evaluation/generation/examples.2b855b9bc4seed2_GEM-wiki_lingua_en_tldr_en_4.jsonl filter=lfs diff=lfs merge=lfs -text
+2b855b9bc4seed3/evaluation/generation/examples.2b855b9bc4seed3_GEM-web_nlg_en_PALM_prompt_4.jsonl filter=lfs diff=lfs merge=lfs -text
+2b855b9bc4seed3/evaluation/generation/examples.2b855b9bc4seed3_GEM-wiki_lingua_en_tldr_en_3.jsonl filter=lfs diff=lfs merge=lfs -text
+2b855b9bc4seed4/evaluation/generation/examples.2b855b9bc4seed4_GEM-wiki_lingua_en_tldr_en_1.jsonl filter=lfs diff=lfs merge=lfs -text
+2b855b9bc4seed1/evaluation/generation/examples.2b855b9bc4seed1_e2e_nlg_cleaned_generate_text_restaurant_3.jsonl filter=lfs diff=lfs merge=lfs -text
+2b855b9bc4seed2/evaluation/generation/examples.2b855b9bc4seed2_GEM-web_nlg_en_PALM_prompt_3.jsonl filter=lfs diff=lfs merge=lfs -text
+2b855b9bc4seed2/evaluation/generation/examples.2b855b9bc4seed2_GEM-web_nlg_en_PALM_prompt_1.jsonl filter=lfs diff=lfs merge=lfs -text
+2b855b9bc4seed2/evaluation/generation/examples.2b855b9bc4seed2_GEM-wiki_lingua_en_tldr_en_5.jsonl filter=lfs diff=lfs merge=lfs -text
+2b855b9bc4seed3/evaluation/generation/examples.2b855b9bc4seed3_GEM-web_nlg_en_PALM_prompt_5.jsonl filter=lfs diff=lfs merge=lfs -text
+2b855b9bc4seed3/evaluation/generation/examples.2b855b9bc4seed3_gem_xsum_article_DOC_summary_0.jsonl filter=lfs diff=lfs merge=lfs -text
+2b855b9bc4seed4/evaluation/generation/examples.2b855b9bc4seed4_gem_xsum_article_DOC_summary_0.jsonl filter=lfs diff=lfs merge=lfs -text
+2b855b9bc4seed2/evaluation/generation/examples.2b855b9bc4seed2_GEM-web_nlg_en_PALM_prompt_0.jsonl filter=lfs diff=lfs merge=lfs -text
+2b855b9bc4seed1/evaluation/generation/examples.2b855b9bc4seed1_GEM-wiki_lingua_en_tldr_en_3.jsonl filter=lfs diff=lfs merge=lfs -text
+2b855b9bc4seed2/evaluation/generation/examples.2b855b9bc4seed2_GEM-wiki_lingua_en_tldr_en_2.jsonl filter=lfs diff=lfs merge=lfs -text
+2b855b9bc4seed2/evaluation/generation/examples.2b855b9bc4seed2_gem_xsum_article_DOC_summary_4.jsonl filter=lfs diff=lfs merge=lfs -text
+2b855b9bc4seed1/evaluation/generation/examples.2b855b9bc4seed1_GEM-web_nlg_en_PALM_prompt_4.jsonl filter=lfs diff=lfs merge=lfs -text
+2b855b9bc4seed4/evaluation/generation/examples.2b855b9bc4seed4_e2e_nlg_cleaned_generate_text_restaurant_0.jsonl filter=lfs diff=lfs merge=lfs -text
+2b855b9bc4seed4/evaluation/generation/examples.2b855b9bc4seed4_e2e_nlg_cleaned_generate_text_restaurant_5.jsonl filter=lfs diff=lfs merge=lfs -text
+2b855b9bc4seed4/evaluation/generation/examples.2b855b9bc4seed4_GEM-web_nlg_en_PALM_prompt_5.jsonl filter=lfs diff=lfs merge=lfs -text
+2b855b9bc4seed2/evaluation/generation/examples.2b855b9bc4seed2_GEM-wiki_lingua_en_tldr_en_0.jsonl filter=lfs diff=lfs merge=lfs -text
+2b855b9bc4seed1/evaluation/generation/examples.2b855b9bc4seed1_GEM-web_nlg_en_PALM_prompt_5.jsonl filter=lfs diff=lfs merge=lfs -text
+2b855b9bc4seed4/evaluation/generation/examples.2b855b9bc4seed4_e2e_nlg_cleaned_generate_text_restaurant_1.jsonl filter=lfs diff=lfs merge=lfs -text
+2b855b9bc4seed1/evaluation/generation/examples.2b855b9bc4seed1_e2e_nlg_cleaned_generate_text_restaurant_4.jsonl filter=lfs diff=lfs merge=lfs -text
+2b855b9bc4seed2/evaluation/generation/examples.2b855b9bc4seed2_e2e_nlg_cleaned_generate_text_restaurant_4.jsonl filter=lfs diff=lfs merge=lfs -text
+2b855b9bc4seed3/evaluation/generation/examples.2b855b9bc4seed3_e2e_nlg_cleaned_generate_text_restaurant_4.jsonl filter=lfs diff=lfs merge=lfs -text
+2b855b9bc4seed3/evaluation/generation/examples.2b855b9bc4seed3_GEM-wiki_lingua_en_tldr_en_1.jsonl filter=lfs diff=lfs merge=lfs -text
+2b855b9bc4seed3/evaluation/generation/examples.2b855b9bc4seed3_gem_xsum_article_DOC_summary_4.jsonl filter=lfs diff=lfs merge=lfs -text
+2b855b9bc4seed4/evaluation/generation/examples.2b855b9bc4seed4_GEM-web_nlg_en_PALM_prompt_4.jsonl filter=lfs diff=lfs merge=lfs -text
+2b855b9bc4seed4/evaluation/generation/examples.2b855b9bc4seed4_GEM-wiki_lingua_en_tldr_en_3.jsonl filter=lfs diff=lfs merge=lfs -text
+2b855b9bc4seed4/evaluation/generation/examples.2b855b9bc4seed4_gem_xsum_article_DOC_summary_4.jsonl filter=lfs diff=lfs merge=lfs -text
+2b855b9bc4seed1/evaluation/generation/examples.2b855b9bc4seed1_gem_xsum_article_DOC_summary_5.jsonl filter=lfs diff=lfs merge=lfs -text
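Every added line follows the same pattern: one path plus `filter=lfs diff=lfs merge=lfs -text`, which routes that file through Git LFS. Entries like these are normally produced with `git lfs track "<path>"`; below is a hypothetical Python sketch (not part of this commit) that would regenerate such rules in bulk, assuming it runs from the repo root:

```python
from pathlib import Path

# Append one LFS rule per examples.*.jsonl file under the seed directories,
# mirroring the lines added in the hunk above.
rules = []
for seed_dir in sorted(Path(".").glob("2b855b9bc4seed*")):
    for f in sorted((seed_dir / "evaluation" / "generation").glob("examples.*.jsonl")):
        rules.append(f"{f.as_posix()} filter=lfs diff=lfs merge=lfs -text")

with open(".gitattributes", "a") as fh:
    fh.write("\n".join(rules) + "\n")
```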
2b855b9bc4seed1/evaluation/generation/agg.2b855b9bc4seed1_e2e_nlg_cleaned_generate_text_restaurant_2.json
ADDED
@@ -0,0 +1 @@
{"results": [{"task_name": "e2e_nlg_cleaned", "prompt_name": "generate_text_restaurant", "bleu": 6.018041940580495, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "1acabbc3-c9b9-4624-a684-29faeccff46f", "prompt_jinja": "Given the following data about a restaurant:\n{% for feature in meaning_representation.split(\"]\") %} {% set key = feature.split(\"[\")[0].replace(\",\",\"\") %} {% set value = feature.replace(\",\",\"\").replace(key+\"[\", '''') %}\n{% if value != \"\" %} {{key}} : {{value}} {% endif %}\n{%- endfor %}\nGenerate some text about this restaurant. ||| {{human_reference}}", "prompt_original_task": true, "comment": "", "bleu_stderr": 0.07197788580096429}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "generate_text_restaurant", "rouge1_precision": 0.26958144890836977, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "1acabbc3-c9b9-4624-a684-29faeccff46f", "prompt_jinja": "Given the following data about a restaurant:\n{% for feature in meaning_representation.split(\"]\") %} {% set key = feature.split(\"[\")[0].replace(\",\",\"\") %} {% set value = feature.replace(\",\",\"\").replace(key+\"[\", '''') %}\n{% if value != \"\" %} {{key}} : {{value}} {% endif %}\n{%- endfor %}\nGenerate some text about this restaurant. ||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rouge1_precision_stderr": 0.0021540233079166827}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "generate_text_restaurant", "rouge1_recall": 0.4588728570611039, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "1acabbc3-c9b9-4624-a684-29faeccff46f", "prompt_jinja": "Given the following data about a restaurant:\n{% for feature in meaning_representation.split(\"]\") %} {% set key = feature.split(\"[\")[0].replace(\",\",\"\") %} {% set value = feature.replace(\",\",\"\").replace(key+\"[\", '''') %}\n{% if value != \"\" %} {{key}} : {{value}} {% endif %}\n{%- endfor %}\nGenerate some text about this restaurant. ||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rouge1_recall_stderr": 0.0027449063068318643}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "generate_text_restaurant", "rouge1_fmeasure": 0.32521240877335217, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "1acabbc3-c9b9-4624-a684-29faeccff46f", "prompt_jinja": "Given the following data about a restaurant:\n{% for feature in meaning_representation.split(\"]\") %} {% set key = feature.split(\"[\")[0].replace(\",\",\"\") %} {% set value = feature.replace(\",\",\"\").replace(key+\"[\", '''') %}\n{% if value != \"\" %} {{key}} : {{value}} {% endif %}\n{%- endfor %}\nGenerate some text about this restaurant. 
||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rouge1_fmeasure_stderr": 0.0019278160903662194}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "generate_text_restaurant", "rouge2_precision": 0.11570444930629259, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "1acabbc3-c9b9-4624-a684-29faeccff46f", "prompt_jinja": "Given the following data about a restaurant:\n{% for feature in meaning_representation.split(\"]\") %} {% set key = feature.split(\"[\")[0].replace(\",\",\"\") %} {% set value = feature.replace(\",\",\"\").replace(key+\"[\", '''') %}\n{% if value != \"\" %} {{key}} : {{value}} {% endif %}\n{%- endfor %}\nGenerate some text about this restaurant. ||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rouge2_precision_stderr": 0.0014199073558424482}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "generate_text_restaurant", "rouge2_recall": 0.2004243711941686, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "1acabbc3-c9b9-4624-a684-29faeccff46f", "prompt_jinja": "Given the following data about a restaurant:\n{% for feature in meaning_representation.split(\"]\") %} {% set key = feature.split(\"[\")[0].replace(\",\",\"\") %} {% set value = feature.replace(\",\",\"\").replace(key+\"[\", '''') %}\n{% if value != \"\" %} {{key}} : {{value}} {% endif %}\n{%- endfor %}\nGenerate some text about this restaurant. ||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rouge2_recall_stderr": 0.0021042737791853398}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "generate_text_restaurant", "rouge2_fmeasure": 0.13955166174444095, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "1acabbc3-c9b9-4624-a684-29faeccff46f", "prompt_jinja": "Given the following data about a restaurant:\n{% for feature in meaning_representation.split(\"]\") %} {% set key = feature.split(\"[\")[0].replace(\",\",\"\") %} {% set value = feature.replace(\",\",\"\").replace(key+\"[\", '''') %}\n{% if value != \"\" %} {{key}} : {{value}} {% endif %}\n{%- endfor %}\nGenerate some text about this restaurant. ||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rouge2_fmeasure_stderr": 0.0014269312475032364}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "generate_text_restaurant", "rougeL_precision": 0.2156185303251572, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "1acabbc3-c9b9-4624-a684-29faeccff46f", "prompt_jinja": "Given the following data about a restaurant:\n{% for feature in meaning_representation.split(\"]\") %} {% set key = feature.split(\"[\")[0].replace(\",\",\"\") %} {% set value = feature.replace(\",\",\"\").replace(key+\"[\", '''') %}\n{% if value != \"\" %} {{key}} : {{value}} {% endif %}\n{%- endfor %}\nGenerate some text about this restaurant. 
||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rougeL_precision_stderr": 0.0015614536366490293}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "generate_text_restaurant", "rougeL_recall": 0.3759398467983639, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "1acabbc3-c9b9-4624-a684-29faeccff46f", "prompt_jinja": "Given the following data about a restaurant:\n{% for feature in meaning_representation.split(\"]\") %} {% set key = feature.split(\"[\")[0].replace(\",\",\"\") %} {% set value = feature.replace(\",\",\"\").replace(key+\"[\", '''') %}\n{% if value != \"\" %} {{key}} : {{value}} {% endif %}\n{%- endfor %}\nGenerate some text about this restaurant. ||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rougeL_recall_stderr": 0.002381221921078589}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "generate_text_restaurant", "rougeL_fmeasure": 0.262564054866777, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "1acabbc3-c9b9-4624-a684-29faeccff46f", "prompt_jinja": "Given the following data about a restaurant:\n{% for feature in meaning_representation.split(\"]\") %} {% set key = feature.split(\"[\")[0].replace(\",\",\"\") %} {% set value = feature.replace(\",\",\"\").replace(key+\"[\", '''') %}\n{% if value != \"\" %} {{key}} : {{value}} {% endif %}\n{%- endfor %}\nGenerate some text about this restaurant. ||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rougeL_fmeasure_stderr": 0.0014381342603292985}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "generate_text_restaurant", "rougeLsum_precision": 0.22224277917141733, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "1acabbc3-c9b9-4624-a684-29faeccff46f", "prompt_jinja": "Given the following data about a restaurant:\n{% for feature in meaning_representation.split(\"]\") %} {% set key = feature.split(\"[\")[0].replace(\",\",\"\") %} {% set value = feature.replace(\",\",\"\").replace(key+\"[\", '''') %}\n{% if value != \"\" %} {{key}} : {{value}} {% endif %}\n{%- endfor %}\nGenerate some text about this restaurant. ||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rougeLsum_precision_stderr": 0.0019418664704165772}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "generate_text_restaurant", "rougeLsum_recall": 0.3788810896543758, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "1acabbc3-c9b9-4624-a684-29faeccff46f", "prompt_jinja": "Given the following data about a restaurant:\n{% for feature in meaning_representation.split(\"]\") %} {% set key = feature.split(\"[\")[0].replace(\",\",\"\") %} {% set value = feature.replace(\",\",\"\").replace(key+\"[\", '''') %}\n{% if value != \"\" %} {{key}} : {{value}} {% endif %}\n{%- endfor %}\nGenerate some text about this restaurant. 
||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rougeLsum_recall_stderr": 0.0026592931094796103}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "generate_text_restaurant", "rougeLsum_fmeasure": 0.2680607212245618, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "1acabbc3-c9b9-4624-a684-29faeccff46f", "prompt_jinja": "Given the following data about a restaurant:\n{% for feature in meaning_representation.split(\"]\") %} {% set key = feature.split(\"[\")[0].replace(\",\",\"\") %} {% set value = feature.replace(\",\",\"\").replace(key+\"[\", '''') %}\n{% if value != \"\" %} {{key}} : {{value}} {% endif %}\n{%- endfor %}\nGenerate some text about this restaurant. ||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rougeLsum_fmeasure_stderr": 0.0018300140862428237}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-2b8-55b-c4seeds/2b855b9bc4seed1/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 2, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}}
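Each `agg.*.json` file is a single JSON object: a `results` list with one entry per metric (the value plus a `*_stderr` bootstrap estimate, computed with `bootstrap_iters: 10`), and a `config` block recording the model path, `num_fewshot`, `batch_size`, and the 3000-example limit. A minimal sketch of reading the headline numbers back out of the file just shown:

```python
import json

# Load one aggregated result file added in this commit and print each metric
# together with its bootstrap standard error (keys ending in "_stderr").
path = "2b855b9bc4seed1/evaluation/generation/agg.2b855b9bc4seed1_e2e_nlg_cleaned_generate_text_restaurant_2.json"
with open(path) as fh:
    data = json.load(fh)

for entry in data["results"]:
    for key, value in entry.items():
        if isinstance(value, float) and not key.endswith("_stderr"):
            stderr = entry.get(f"{key}_stderr")
            line = f'{entry["task_name"]}/{entry["prompt_name"]} {key}: {value:.4f}'
            if stderr is not None:
                line += f" ± {stderr:.4f}"
            print(line)

print("num_fewshot:", data["config"]["num_fewshot"])  # 2 for this file
```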
2b855b9bc4seed1/evaluation/generation/agg.2b855b9bc4seed1_e2e_nlg_cleaned_generate_text_restaurant_3.json
ADDED
@@ -0,0 +1 @@
{"results": [{"task_name": "e2e_nlg_cleaned", "prompt_name": "generate_text_restaurant", "bleu": 6.247143410389527, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "1acabbc3-c9b9-4624-a684-29faeccff46f", "prompt_jinja": "Given the following data about a restaurant:\n{% for feature in meaning_representation.split(\"]\") %} {% set key = feature.split(\"[\")[0].replace(\",\",\"\") %} {% set value = feature.replace(\",\",\"\").replace(key+\"[\", '''') %}\n{% if value != \"\" %} {{key}} : {{value}} {% endif %}\n{%- endfor %}\nGenerate some text about this restaurant. ||| {{human_reference}}", "prompt_original_task": true, "comment": "", "bleu_stderr": 0.09336005465042321}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "generate_text_restaurant", "rouge1_precision": 0.25907184832717167, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "1acabbc3-c9b9-4624-a684-29faeccff46f", "prompt_jinja": "Given the following data about a restaurant:\n{% for feature in meaning_representation.split(\"]\") %} {% set key = feature.split(\"[\")[0].replace(\",\",\"\") %} {% set value = feature.replace(\",\",\"\").replace(key+\"[\", '''') %}\n{% if value != \"\" %} {{key}} : {{value}} {% endif %}\n{%- endfor %}\nGenerate some text about this restaurant. ||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rouge1_precision_stderr": 0.001803996044494107}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "generate_text_restaurant", "rouge1_recall": 0.4710593035922337, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "1acabbc3-c9b9-4624-a684-29faeccff46f", "prompt_jinja": "Given the following data about a restaurant:\n{% for feature in meaning_representation.split(\"]\") %} {% set key = feature.split(\"[\")[0].replace(\",\",\"\") %} {% set value = feature.replace(\",\",\"\").replace(key+\"[\", '''') %}\n{% if value != \"\" %} {{key}} : {{value}} {% endif %}\n{%- endfor %}\nGenerate some text about this restaurant. ||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rouge1_recall_stderr": 0.0027215741063189363}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "generate_text_restaurant", "rouge1_fmeasure": 0.32338937321619443, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "1acabbc3-c9b9-4624-a684-29faeccff46f", "prompt_jinja": "Given the following data about a restaurant:\n{% for feature in meaning_representation.split(\"]\") %} {% set key = feature.split(\"[\")[0].replace(\",\",\"\") %} {% set value = feature.replace(\",\",\"\").replace(key+\"[\", '''') %}\n{% if value != \"\" %} {{key}} : {{value}} {% endif %}\n{%- endfor %}\nGenerate some text about this restaurant. 
||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rouge1_fmeasure_stderr": 0.0017934056667561753}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "generate_text_restaurant", "rouge2_precision": 0.11204434793169746, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "1acabbc3-c9b9-4624-a684-29faeccff46f", "prompt_jinja": "Given the following data about a restaurant:\n{% for feature in meaning_representation.split(\"]\") %} {% set key = feature.split(\"[\")[0].replace(\",\",\"\") %} {% set value = feature.replace(\",\",\"\").replace(key+\"[\", '''') %}\n{% if value != \"\" %} {{key}} : {{value}} {% endif %}\n{%- endfor %}\nGenerate some text about this restaurant. ||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rouge2_precision_stderr": 0.0012125669003284166}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "generate_text_restaurant", "rouge2_recall": 0.21007032469689518, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "1acabbc3-c9b9-4624-a684-29faeccff46f", "prompt_jinja": "Given the following data about a restaurant:\n{% for feature in meaning_representation.split(\"]\") %} {% set key = feature.split(\"[\")[0].replace(\",\",\"\") %} {% set value = feature.replace(\",\",\"\").replace(key+\"[\", '''') %}\n{% if value != \"\" %} {{key}} : {{value}} {% endif %}\n{%- endfor %}\nGenerate some text about this restaurant. ||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rouge2_recall_stderr": 0.0021257583048451844}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "generate_text_restaurant", "rouge2_fmeasure": 0.14093386031313695, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "1acabbc3-c9b9-4624-a684-29faeccff46f", "prompt_jinja": "Given the following data about a restaurant:\n{% for feature in meaning_representation.split(\"]\") %} {% set key = feature.split(\"[\")[0].replace(\",\",\"\") %} {% set value = feature.replace(\",\",\"\").replace(key+\"[\", '''') %}\n{% if value != \"\" %} {{key}} : {{value}} {% endif %}\n{%- endfor %}\nGenerate some text about this restaurant. ||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rouge2_fmeasure_stderr": 0.0013638568945346288}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "generate_text_restaurant", "rougeL_precision": 0.2101184628317088, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "1acabbc3-c9b9-4624-a684-29faeccff46f", "prompt_jinja": "Given the following data about a restaurant:\n{% for feature in meaning_representation.split(\"]\") %} {% set key = feature.split(\"[\")[0].replace(\",\",\"\") %} {% set value = feature.replace(\",\",\"\").replace(key+\"[\", '''') %}\n{% if value != \"\" %} {{key}} : {{value}} {% endif %}\n{%- endfor %}\nGenerate some text about this restaurant. 
||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rougeL_precision_stderr": 0.001355833935975265}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "generate_text_restaurant", "rougeL_recall": 0.3886278435049077, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "1acabbc3-c9b9-4624-a684-29faeccff46f", "prompt_jinja": "Given the following data about a restaurant:\n{% for feature in meaning_representation.split(\"]\") %} {% set key = feature.split(\"[\")[0].replace(\",\",\"\") %} {% set value = feature.replace(\",\",\"\").replace(key+\"[\", '''') %}\n{% if value != \"\" %} {{key}} : {{value}} {% endif %}\n{%- endfor %}\nGenerate some text about this restaurant. ||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rougeL_recall_stderr": 0.0023407273478071285}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "generate_text_restaurant", "rougeL_fmeasure": 0.2638096769634126, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "1acabbc3-c9b9-4624-a684-29faeccff46f", "prompt_jinja": "Given the following data about a restaurant:\n{% for feature in meaning_representation.split(\"]\") %} {% set key = feature.split(\"[\")[0].replace(\",\",\"\") %} {% set value = feature.replace(\",\",\"\").replace(key+\"[\", '''') %}\n{% if value != \"\" %} {{key}} : {{value}} {% endif %}\n{%- endfor %}\nGenerate some text about this restaurant. ||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rougeL_fmeasure_stderr": 0.0013632619425518314}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "generate_text_restaurant", "rougeLsum_precision": 0.21561354041284292, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "1acabbc3-c9b9-4624-a684-29faeccff46f", "prompt_jinja": "Given the following data about a restaurant:\n{% for feature in meaning_representation.split(\"]\") %} {% set key = feature.split(\"[\")[0].replace(\",\",\"\") %} {% set value = feature.replace(\",\",\"\").replace(key+\"[\", '''') %}\n{% if value != \"\" %} {{key}} : {{value}} {% endif %}\n{%- endfor %}\nGenerate some text about this restaurant. ||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rougeLsum_precision_stderr": 0.0016830247054440658}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "generate_text_restaurant", "rougeLsum_recall": 0.3919032879874159, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "1acabbc3-c9b9-4624-a684-29faeccff46f", "prompt_jinja": "Given the following data about a restaurant:\n{% for feature in meaning_representation.split(\"]\") %} {% set key = feature.split(\"[\")[0].replace(\",\",\"\") %} {% set value = feature.replace(\",\",\"\").replace(key+\"[\", '''') %}\n{% if value != \"\" %} {{key}} : {{value}} {% endif %}\n{%- endfor %}\nGenerate some text about this restaurant. 
||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rougeLsum_recall_stderr": 0.002632582004466588}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "generate_text_restaurant", "rougeLsum_fmeasure": 0.2689516848427162, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "1acabbc3-c9b9-4624-a684-29faeccff46f", "prompt_jinja": "Given the following data about a restaurant:\n{% for feature in meaning_representation.split(\"]\") %} {% set key = feature.split(\"[\")[0].replace(\",\",\"\") %} {% set value = feature.replace(\",\",\"\").replace(key+\"[\", '''') %}\n{% if value != \"\" %} {{key}} : {{value}} {% endif %}\n{%- endfor %}\nGenerate some text about this restaurant. ||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rougeLsum_fmeasure_stderr": 0.0017385224354802351}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-2b8-55b-c4seeds/2b855b9bc4seed1/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 3, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}}
2b855b9bc4seed1/evaluation/generation/agg.2b855b9bc4seed1_e2e_nlg_cleaned_generate_text_restaurant_4.json
ADDED
@@ -0,0 +1 @@
{"results": [{"task_name": "e2e_nlg_cleaned", "prompt_name": "generate_text_restaurant", "bleu": 6.208222380745828, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "1acabbc3-c9b9-4624-a684-29faeccff46f", "prompt_jinja": "Given the following data about a restaurant:\n{% for feature in meaning_representation.split(\"]\") %} {% set key = feature.split(\"[\")[0].replace(\",\",\"\") %} {% set value = feature.replace(\",\",\"\").replace(key+\"[\", '''') %}\n{% if value != \"\" %} {{key}} : {{value}} {% endif %}\n{%- endfor %}\nGenerate some text about this restaurant. ||| {{human_reference}}", "prompt_original_task": true, "comment": "", "bleu_stderr": 0.09813424389228141}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "generate_text_restaurant", "rouge1_precision": 0.25403488264023616, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "1acabbc3-c9b9-4624-a684-29faeccff46f", "prompt_jinja": "Given the following data about a restaurant:\n{% for feature in meaning_representation.split(\"]\") %} {% set key = feature.split(\"[\")[0].replace(\",\",\"\") %} {% set value = feature.replace(\",\",\"\").replace(key+\"[\", '''') %}\n{% if value != \"\" %} {{key}} : {{value}} {% endif %}\n{%- endfor %}\nGenerate some text about this restaurant. ||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rouge1_precision_stderr": 0.00174969774558582}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "generate_text_restaurant", "rouge1_recall": 0.4709476911476187, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "1acabbc3-c9b9-4624-a684-29faeccff46f", "prompt_jinja": "Given the following data about a restaurant:\n{% for feature in meaning_representation.split(\"]\") %} {% set key = feature.split(\"[\")[0].replace(\",\",\"\") %} {% set value = feature.replace(\",\",\"\").replace(key+\"[\", '''') %}\n{% if value != \"\" %} {{key}} : {{value}} {% endif %}\n{%- endfor %}\nGenerate some text about this restaurant. ||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rouge1_recall_stderr": 0.002655138227534261}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "generate_text_restaurant", "rouge1_fmeasure": 0.32023650433333456, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "1acabbc3-c9b9-4624-a684-29faeccff46f", "prompt_jinja": "Given the following data about a restaurant:\n{% for feature in meaning_representation.split(\"]\") %} {% set key = feature.split(\"[\")[0].replace(\",\",\"\") %} {% set value = feature.replace(\",\",\"\").replace(key+\"[\", '''') %}\n{% if value != \"\" %} {{key}} : {{value}} {% endif %}\n{%- endfor %}\nGenerate some text about this restaurant. 
||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rouge1_fmeasure_stderr": 0.0017784408554274436}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "generate_text_restaurant", "rouge2_precision": 0.11058685749210603, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "1acabbc3-c9b9-4624-a684-29faeccff46f", "prompt_jinja": "Given the following data about a restaurant:\n{% for feature in meaning_representation.split(\"]\") %} {% set key = feature.split(\"[\")[0].replace(\",\",\"\") %} {% set value = feature.replace(\",\",\"\").replace(key+\"[\", '''') %}\n{% if value != \"\" %} {{key}} : {{value}} {% endif %}\n{%- endfor %}\nGenerate some text about this restaurant. ||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rouge2_precision_stderr": 0.0012080055063255909}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "generate_text_restaurant", "rouge2_recall": 0.21167766216290337, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "1acabbc3-c9b9-4624-a684-29faeccff46f", "prompt_jinja": "Given the following data about a restaurant:\n{% for feature in meaning_representation.split(\"]\") %} {% set key = feature.split(\"[\")[0].replace(\",\",\"\") %} {% set value = feature.replace(\",\",\"\").replace(key+\"[\", '''') %}\n{% if value != \"\" %} {{key}} : {{value}} {% endif %}\n{%- endfor %}\nGenerate some text about this restaurant. ||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rouge2_recall_stderr": 0.002144496564072756}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "generate_text_restaurant", "rouge2_fmeasure": 0.1403526845297047, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "1acabbc3-c9b9-4624-a684-29faeccff46f", "prompt_jinja": "Given the following data about a restaurant:\n{% for feature in meaning_representation.split(\"]\") %} {% set key = feature.split(\"[\")[0].replace(\",\",\"\") %} {% set value = feature.replace(\",\",\"\").replace(key+\"[\", '''') %}\n{% if value != \"\" %} {{key}} : {{value}} {% endif %}\n{%- endfor %}\nGenerate some text about this restaurant. ||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rouge2_fmeasure_stderr": 0.0013607857847417898}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "generate_text_restaurant", "rougeL_precision": 0.20576507091031662, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "1acabbc3-c9b9-4624-a684-29faeccff46f", "prompt_jinja": "Given the following data about a restaurant:\n{% for feature in meaning_representation.split(\"]\") %} {% set key = feature.split(\"[\")[0].replace(\",\",\"\") %} {% set value = feature.replace(\",\",\"\").replace(key+\"[\", '''') %}\n{% if value != \"\" %} {{key}} : {{value}} {% endif %}\n{%- endfor %}\nGenerate some text about this restaurant. 
||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rougeL_precision_stderr": 0.0012824323500063948}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "generate_text_restaurant", "rougeL_recall": 0.3889751930979724, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "1acabbc3-c9b9-4624-a684-29faeccff46f", "prompt_jinja": "Given the following data about a restaurant:\n{% for feature in meaning_representation.split(\"]\") %} {% set key = feature.split(\"[\")[0].replace(\",\",\"\") %} {% set value = feature.replace(\",\",\"\").replace(key+\"[\", '''') %}\n{% if value != \"\" %} {{key}} : {{value}} {% endif %}\n{%- endfor %}\nGenerate some text about this restaurant. ||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rougeL_recall_stderr": 0.0023221115722203652}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "generate_text_restaurant", "rougeL_fmeasure": 0.26122109578349895, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "1acabbc3-c9b9-4624-a684-29faeccff46f", "prompt_jinja": "Given the following data about a restaurant:\n{% for feature in meaning_representation.split(\"]\") %} {% set key = feature.split(\"[\")[0].replace(\",\",\"\") %} {% set value = feature.replace(\",\",\"\").replace(key+\"[\", '''') %}\n{% if value != \"\" %} {{key}} : {{value}} {% endif %}\n{%- endfor %}\nGenerate some text about this restaurant. ||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rougeL_fmeasure_stderr": 0.0013494634641885617}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "generate_text_restaurant", "rougeLsum_precision": 0.21178966636346616, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "1acabbc3-c9b9-4624-a684-29faeccff46f", "prompt_jinja": "Given the following data about a restaurant:\n{% for feature in meaning_representation.split(\"]\") %} {% set key = feature.split(\"[\")[0].replace(\",\",\"\") %} {% set value = feature.replace(\",\",\"\").replace(key+\"[\", '''') %}\n{% if value != \"\" %} {{key}} : {{value}} {% endif %}\n{%- endfor %}\nGenerate some text about this restaurant. ||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rougeLsum_precision_stderr": 0.0016557432144235155}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "generate_text_restaurant", "rougeLsum_recall": 0.393450062046956, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "1acabbc3-c9b9-4624-a684-29faeccff46f", "prompt_jinja": "Given the following data about a restaurant:\n{% for feature in meaning_representation.split(\"]\") %} {% set key = feature.split(\"[\")[0].replace(\",\",\"\") %} {% set value = feature.replace(\",\",\"\").replace(key+\"[\", '''') %}\n{% if value != \"\" %} {{key}} : {{value}} {% endif %}\n{%- endfor %}\nGenerate some text about this restaurant. 
||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rougeLsum_recall_stderr": 0.002663580316470773}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "generate_text_restaurant", "rougeLsum_fmeasure": 0.2670143871011222, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "1acabbc3-c9b9-4624-a684-29faeccff46f", "prompt_jinja": "Given the following data about a restaurant:\n{% for feature in meaning_representation.split(\"]\") %} {% set key = feature.split(\"[\")[0].replace(\",\",\"\") %} {% set value = feature.replace(\",\",\"\").replace(key+\"[\", '''') %}\n{% if value != \"\" %} {{key}} : {{value}} {% endif %}\n{%- endfor %}\nGenerate some text about this restaurant. ||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rougeLsum_fmeasure_stderr": 0.0017625078468513325}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-2b8-55b-c4seeds/2b855b9bc4seed1/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 4, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}}
2b855b9bc4seed1/evaluation/generation/agg.2b855b9bc4seed1_e2e_nlg_cleaned_generate_text_restaurant_5.json
ADDED
@@ -0,0 +1 @@
{"results": [{"task_name": "e2e_nlg_cleaned", "prompt_name": "generate_text_restaurant", "bleu": 6.148604351164331, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "1acabbc3-c9b9-4624-a684-29faeccff46f", "prompt_jinja": "Given the following data about a restaurant:\n{% for feature in meaning_representation.split(\"]\") %} {% set key = feature.split(\"[\")[0].replace(\",\",\"\") %} {% set value = feature.replace(\",\",\"\").replace(key+\"[\", '''') %}\n{% if value != \"\" %} {{key}} : {{value}} {% endif %}\n{%- endfor %}\nGenerate some text about this restaurant. ||| {{human_reference}}", "prompt_original_task": true, "comment": "", "bleu_stderr": 0.11104852380350659}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "generate_text_restaurant", "rouge1_precision": 0.2528456956077577, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "1acabbc3-c9b9-4624-a684-29faeccff46f", "prompt_jinja": "Given the following data about a restaurant:\n{% for feature in meaning_representation.split(\"]\") %} {% set key = feature.split(\"[\")[0].replace(\",\",\"\") %} {% set value = feature.replace(\",\",\"\").replace(key+\"[\", '''') %}\n{% if value != \"\" %} {{key}} : {{value}} {% endif %}\n{%- endfor %}\nGenerate some text about this restaurant. ||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rouge1_precision_stderr": 0.0017424978255548198}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "generate_text_restaurant", "rouge1_recall": 0.473017800721126, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "1acabbc3-c9b9-4624-a684-29faeccff46f", "prompt_jinja": "Given the following data about a restaurant:\n{% for feature in meaning_representation.split(\"]\") %} {% set key = feature.split(\"[\")[0].replace(\",\",\"\") %} {% set value = feature.replace(\",\",\"\").replace(key+\"[\", '''') %}\n{% if value != \"\" %} {{key}} : {{value}} {% endif %}\n{%- endfor %}\nGenerate some text about this restaurant. ||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rouge1_recall_stderr": 0.0026123356126015254}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "generate_text_restaurant", "rouge1_fmeasure": 0.3199925141836084, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "1acabbc3-c9b9-4624-a684-29faeccff46f", "prompt_jinja": "Given the following data about a restaurant:\n{% for feature in meaning_representation.split(\"]\") %} {% set key = feature.split(\"[\")[0].replace(\",\",\"\") %} {% set value = feature.replace(\",\",\"\").replace(key+\"[\", '''') %}\n{% if value != \"\" %} {{key}} : {{value}} {% endif %}\n{%- endfor %}\nGenerate some text about this restaurant. 
||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rouge1_fmeasure_stderr": 0.0017551605827543929}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "generate_text_restaurant", "rouge2_precision": 0.10974089008299018, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "1acabbc3-c9b9-4624-a684-29faeccff46f", "prompt_jinja": "Given the following data about a restaurant:\n{% for feature in meaning_representation.split(\"]\") %} {% set key = feature.split(\"[\")[0].replace(\",\",\"\") %} {% set value = feature.replace(\",\",\"\").replace(key+\"[\", '''') %}\n{% if value != \"\" %} {{key}} : {{value}} {% endif %}\n{%- endfor %}\nGenerate some text about this restaurant. ||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rouge2_precision_stderr": 0.0011789832834562743}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "generate_text_restaurant", "rouge2_recall": 0.2119589387667116, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "1acabbc3-c9b9-4624-a684-29faeccff46f", "prompt_jinja": "Given the following data about a restaurant:\n{% for feature in meaning_representation.split(\"]\") %} {% set key = feature.split(\"[\")[0].replace(\",\",\"\") %} {% set value = feature.replace(\",\",\"\").replace(key+\"[\", '''') %}\n{% if value != \"\" %} {{key}} : {{value}} {% endif %}\n{%- endfor %}\nGenerate some text about this restaurant. ||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rouge2_recall_stderr": 0.0020903039724611107}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "generate_text_restaurant", "rouge2_fmeasure": 0.13979285541625697, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "1acabbc3-c9b9-4624-a684-29faeccff46f", "prompt_jinja": "Given the following data about a restaurant:\n{% for feature in meaning_representation.split(\"]\") %} {% set key = feature.split(\"[\")[0].replace(\",\",\"\") %} {% set value = feature.replace(\",\",\"\").replace(key+\"[\", '''') %}\n{% if value != \"\" %} {{key}} : {{value}} {% endif %}\n{%- endfor %}\nGenerate some text about this restaurant. ||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rouge2_fmeasure_stderr": 0.0013089259606601385}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "generate_text_restaurant", "rougeL_precision": 0.20463239752897472, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "1acabbc3-c9b9-4624-a684-29faeccff46f", "prompt_jinja": "Given the following data about a restaurant:\n{% for feature in meaning_representation.split(\"]\") %} {% set key = feature.split(\"[\")[0].replace(\",\",\"\") %} {% set value = feature.replace(\",\",\"\").replace(key+\"[\", '''') %}\n{% if value != \"\" %} {{key}} : {{value}} {% endif %}\n{%- endfor %}\nGenerate some text about this restaurant. ||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rougeL_precision_stderr": 0.0012743661964013022}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "generate_text_restaurant", "rougeL_recall": 0.3904493381375762, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "1acabbc3-c9b9-4624-a684-29faeccff46f", "prompt_jinja": "Given the following data about a restaurant:\n{% for feature in meaning_representation.split(\"]\") %} {% set key = feature.split(\"[\")[0].replace(\",\",\"\") %} {% set value = feature.replace(\",\",\"\").replace(key+\"[\", '''') %}\n{% if value != \"\" %} {{key}} : {{value}} {% endif %}\n{%- endfor %}\nGenerate some text about this restaurant. ||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rougeL_recall_stderr": 0.002278919766787325}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "generate_text_restaurant", "rougeL_fmeasure": 0.2608746206749859, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "1acabbc3-c9b9-4624-a684-29faeccff46f", "prompt_jinja": "Given the following data about a restaurant:\n{% for feature in meaning_representation.split(\"]\") %} {% set key = feature.split(\"[\")[0].replace(\",\",\"\") %} {% set value = feature.replace(\",\",\"\").replace(key+\"[\", '''') %}\n{% if value != \"\" %} {{key}} : {{value}} {% endif %}\n{%- endfor %}\nGenerate some text about this restaurant. ||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rougeL_fmeasure_stderr": 0.0013206910936061742}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "generate_text_restaurant", "rougeLsum_precision": 0.21056870894114044, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "1acabbc3-c9b9-4624-a684-29faeccff46f", "prompt_jinja": "Given the following data about a restaurant:\n{% for feature in meaning_representation.split(\"]\") %} {% set key = feature.split(\"[\")[0].replace(\",\",\"\") %} {% set value = feature.replace(\",\",\"\").replace(key+\"[\", '''') %}\n{% if value != \"\" %} {{key}} : {{value}} {% endif %}\n{%- endfor %}\nGenerate some text about this restaurant. ||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rougeLsum_precision_stderr": 0.001637983619116459}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "generate_text_restaurant", "rougeLsum_recall": 0.39500142434961627, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "1acabbc3-c9b9-4624-a684-29faeccff46f", "prompt_jinja": "Given the following data about a restaurant:\n{% for feature in meaning_representation.split(\"]\") %} {% set key = feature.split(\"[\")[0].replace(\",\",\"\") %} {% set value = feature.replace(\",\",\"\").replace(key+\"[\", '''') %}\n{% if value != \"\" %} {{key}} : {{value}} {% endif %}\n{%- endfor %}\nGenerate some text about this restaurant. ||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rougeLsum_recall_stderr": 0.002625905199940031}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "generate_text_restaurant", "rougeLsum_fmeasure": 0.26667287137591805, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "1acabbc3-c9b9-4624-a684-29faeccff46f", "prompt_jinja": "Given the following data about a restaurant:\n{% for feature in meaning_representation.split(\"]\") %} {% set key = feature.split(\"[\")[0].replace(\",\",\"\") %} {% set value = feature.replace(\",\",\"\").replace(key+\"[\", '''') %}\n{% if value != \"\" %} {{key}} : {{value}} {% endif %}\n{%- endfor %}\nGenerate some text about this restaurant. ||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rougeLsum_fmeasure_stderr": 0.001737453548353309}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-2b8-55b-c4seeds/2b855b9bc4seed1/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 5, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}}
2b855b9bc4seed1/evaluation/generation/agg.2b855b9bc4seed1_gem_xsum_article_DOC_summary_3.json
ADDED
@@ -0,0 +1 @@
{"results": [{"task_name": "gem_xsum", "prompt_name": "article_DOC_summary", "rouge1_precision": 0.1131657921685037, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "a8d4ecfa-c944-44d5-878c-04fd5db59e64", "prompt_jinja": "Article: {{document}}\n\nSummary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge1_precision_stderr": 0.0018249204798791675}, {"task_name": "gem_xsum", "prompt_name": "article_DOC_summary", "rouge1_recall": 0.2722598710120147, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "a8d4ecfa-c944-44d5-878c-04fd5db59e64", "prompt_jinja": "Article: {{document}}\n\nSummary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge1_recall_stderr": 0.004133665114298229}, {"task_name": "gem_xsum", "prompt_name": "article_DOC_summary", "rouge1_fmeasure": 0.15647200121804894, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "a8d4ecfa-c944-44d5-878c-04fd5db59e64", "prompt_jinja": "Article: {{document}}\n\nSummary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge1_fmeasure_stderr": 0.0023727238595388817}, {"task_name": "gem_xsum", "prompt_name": "article_DOC_summary", "rouge2_precision": 0.019363405482896792, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "a8d4ecfa-c944-44d5-878c-04fd5db59e64", "prompt_jinja": "Article: {{document}}\n\nSummary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge2_precision_stderr": 0.0009105840932812386}, {"task_name": "gem_xsum", "prompt_name": "article_DOC_summary", "rouge2_recall": 0.049210613825466105, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "a8d4ecfa-c944-44d5-878c-04fd5db59e64", "prompt_jinja": "Article: {{document}}\n\nSummary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge2_recall_stderr": 0.002366221262626497}, {"task_name": "gem_xsum", "prompt_name": "article_DOC_summary", "rouge2_fmeasure": 0.027201929614022874, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "a8d4ecfa-c944-44d5-878c-04fd5db59e64", "prompt_jinja": "Article: {{document}}\n\nSummary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge2_fmeasure_stderr": 0.0012539141722579825}, {"task_name": "gem_xsum", "prompt_name": "article_DOC_summary", "rougeL_precision": 0.08876222177060916, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "a8d4ecfa-c944-44d5-878c-04fd5db59e64", "prompt_jinja": "Article: {{document}}\n\nSummary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeL_precision_stderr": 0.0013761155158951413}, {"task_name": "gem_xsum", "prompt_name": "article_DOC_summary", "rougeL_recall": 0.21448631404181784, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "a8d4ecfa-c944-44d5-878c-04fd5db59e64", "prompt_jinja": "Article: {{document}}\n\nSummary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeL_recall_stderr": 0.003198193852945419}, {"task_name": "gem_xsum", "prompt_name": "article_DOC_summary", "rougeL_fmeasure": 0.12268909574632023, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", 
"prompt_id": "a8d4ecfa-c944-44d5-878c-04fd5db59e64", "prompt_jinja": "Article: {{document}}\n\nSummary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeL_fmeasure_stderr": 0.001773225086338}, {"task_name": "gem_xsum", "prompt_name": "article_DOC_summary", "rougeLsum_precision": 0.09102722612426341, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "a8d4ecfa-c944-44d5-878c-04fd5db59e64", "prompt_jinja": "Article: {{document}}\n\nSummary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeLsum_precision_stderr": 0.0014951062687195294}, {"task_name": "gem_xsum", "prompt_name": "article_DOC_summary", "rougeLsum_recall": 0.22021791608209987, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "a8d4ecfa-c944-44d5-878c-04fd5db59e64", "prompt_jinja": "Article: {{document}}\n\nSummary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeLsum_recall_stderr": 0.003544530658562865}, {"task_name": "gem_xsum", "prompt_name": "article_DOC_summary", "rougeLsum_fmeasure": 0.12593169516392616, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "a8d4ecfa-c944-44d5-878c-04fd5db59e64", "prompt_jinja": "Article: {{document}}\n\nSummary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeLsum_fmeasure_stderr": 0.001962773566689985}, {"task_name": "gem_xsum", "prompt_name": "article_DOC_summary", "bleu": 1.1532375468354903, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "a8d4ecfa-c944-44d5-878c-04fd5db59e64", "prompt_jinja": "Article: {{document}}\n\nSummary: ||| {{target}}", "prompt_original_task": true, "comment": "", "bleu_stderr": 0.09169402373692534}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-2b8-55b-c4seeds/2b855b9bc4seed1/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 3, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}}
2b855b9bc4seed1/evaluation/generation/agg.2b855b9bc4seed1_gem_xsum_article_DOC_summary_4.json
ADDED
@@ -0,0 +1 @@
{"results": [{"task_name": "gem_xsum", "prompt_name": "article_DOC_summary", "rouge1_precision": 0.0383955897314317, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "a8d4ecfa-c944-44d5-878c-04fd5db59e64", "prompt_jinja": "Article: {{document}}\n\nSummary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge1_precision_stderr": 0.0025422863710305266}, {"task_name": "gem_xsum", "prompt_name": "article_DOC_summary", "rouge1_recall": 0.06915759782304308, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "a8d4ecfa-c944-44d5-878c-04fd5db59e64", "prompt_jinja": "Article: {{document}}\n\nSummary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge1_recall_stderr": 0.0039608171509673586}, {"task_name": "gem_xsum", "prompt_name": "article_DOC_summary", "rouge1_fmeasure": 0.04426046707797172, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "a8d4ecfa-c944-44d5-878c-04fd5db59e64", "prompt_jinja": "Article: {{document}}\n\nSummary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge1_fmeasure_stderr": 0.0024997989992097624}, {"task_name": "gem_xsum", "prompt_name": "article_DOC_summary", "rouge2_precision": 0.006481358010154669, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "a8d4ecfa-c944-44d5-878c-04fd5db59e64", "prompt_jinja": "Article: {{document}}\n\nSummary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge2_precision_stderr": 0.0007393187434393857}, {"task_name": "gem_xsum", "prompt_name": "article_DOC_summary", "rouge2_recall": 0.013044537342091751, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "a8d4ecfa-c944-44d5-878c-04fd5db59e64", "prompt_jinja": "Article: {{document}}\n\nSummary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge2_recall_stderr": 0.00125152402986618}, {"task_name": "gem_xsum", "prompt_name": "article_DOC_summary", "rouge2_fmeasure": 0.008141029291990268, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "a8d4ecfa-c944-44d5-878c-04fd5db59e64", "prompt_jinja": "Article: {{document}}\n\nSummary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge2_fmeasure_stderr": 0.0008202019731008721}, {"task_name": "gem_xsum", "prompt_name": "article_DOC_summary", "rougeL_precision": 0.031113719615522017, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "a8d4ecfa-c944-44d5-878c-04fd5db59e64", "prompt_jinja": "Article: {{document}}\n\nSummary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeL_precision_stderr": 0.0021898843752975964}, {"task_name": "gem_xsum", "prompt_name": "article_DOC_summary", "rougeL_recall": 0.055197273257753664, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "a8d4ecfa-c944-44d5-878c-04fd5db59e64", "prompt_jinja": "Article: {{document}}\n\nSummary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeL_recall_stderr": 0.003191098630834803}, {"task_name": "gem_xsum", "prompt_name": "article_DOC_summary", "rougeL_fmeasure": 0.035265847972026074, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", 
"prompt_id": "a8d4ecfa-c944-44d5-878c-04fd5db59e64", "prompt_jinja": "Article: {{document}}\n\nSummary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeL_fmeasure_stderr": 0.0020006364168593496}, {"task_name": "gem_xsum", "prompt_name": "article_DOC_summary", "rougeLsum_precision": 0.03210804984465074, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "a8d4ecfa-c944-44d5-878c-04fd5db59e64", "prompt_jinja": "Article: {{document}}\n\nSummary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeLsum_precision_stderr": 0.0022623416009452243}, {"task_name": "gem_xsum", "prompt_name": "article_DOC_summary", "rougeLsum_recall": 0.05680048575290953, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "a8d4ecfa-c944-44d5-878c-04fd5db59e64", "prompt_jinja": "Article: {{document}}\n\nSummary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeLsum_recall_stderr": 0.0032931638988068055}, {"task_name": "gem_xsum", "prompt_name": "article_DOC_summary", "rougeLsum_fmeasure": 0.03632930315349291, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "a8d4ecfa-c944-44d5-878c-04fd5db59e64", "prompt_jinja": "Article: {{document}}\n\nSummary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeLsum_fmeasure_stderr": 0.0020756841173550696}, {"task_name": "gem_xsum", "prompt_name": "article_DOC_summary", "bleu": 0.6055404023350197, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "a8d4ecfa-c944-44d5-878c-04fd5db59e64", "prompt_jinja": "Article: {{document}}\n\nSummary: ||| {{target}}", "prompt_original_task": true, "comment": "", "bleu_stderr": 0.09409517819503949}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-2b8-55b-c4seeds/2b855b9bc4seed1/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 4, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}}
2b855b9bc4seed1/evaluation/generation/agg.2b855b9bc4seed1_gem_xsum_article_DOC_summary_5.json
ADDED
@@ -0,0 +1 @@
{"results": [{"task_name": "gem_xsum", "prompt_name": "article_DOC_summary", "rouge1_precision": 0.002821793157093116, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "a8d4ecfa-c944-44d5-878c-04fd5db59e64", "prompt_jinja": "Article: {{document}}\n\nSummary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge1_precision_stderr": 0.0008106295472902334}, {"task_name": "gem_xsum", "prompt_name": "article_DOC_summary", "rouge1_recall": 0.0022851464113943067, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "a8d4ecfa-c944-44d5-878c-04fd5db59e64", "prompt_jinja": "Article: {{document}}\n\nSummary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge1_recall_stderr": 0.0006484962567584845}, {"task_name": "gem_xsum", "prompt_name": "article_DOC_summary", "rouge1_fmeasure": 0.0024831420596845987, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "a8d4ecfa-c944-44d5-878c-04fd5db59e64", "prompt_jinja": "Article: {{document}}\n\nSummary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge1_fmeasure_stderr": 0.0007070331562075694}, {"task_name": "gem_xsum", "prompt_name": "article_DOC_summary", "rouge2_precision": 0.0002914830883754302, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "a8d4ecfa-c944-44d5-878c-04fd5db59e64", "prompt_jinja": "Article: {{document}}\n\nSummary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge2_precision_stderr": 0.00020340968454234058}, {"task_name": "gem_xsum", "prompt_name": "article_DOC_summary", "rouge2_recall": 0.00022401756585996315, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "a8d4ecfa-c944-44d5-878c-04fd5db59e64", "prompt_jinja": "Article: {{document}}\n\nSummary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge2_recall_stderr": 0.0001545711517822221}, {"task_name": "gem_xsum", "prompt_name": "article_DOC_summary", "rouge2_fmeasure": 0.0002519246692387647, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "a8d4ecfa-c944-44d5-878c-04fd5db59e64", "prompt_jinja": "Article: {{document}}\n\nSummary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge2_fmeasure_stderr": 0.00017508831809454146}, {"task_name": "gem_xsum", "prompt_name": "article_DOC_summary", "rougeL_precision": 0.0021547453202339587, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "a8d4ecfa-c944-44d5-878c-04fd5db59e64", "prompt_jinja": "Article: {{document}}\n\nSummary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeL_precision_stderr": 0.0006563220446223475}, {"task_name": "gem_xsum", "prompt_name": "article_DOC_summary", "rougeL_recall": 0.0017708458456009723, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "a8d4ecfa-c944-44d5-878c-04fd5db59e64", "prompt_jinja": "Article: {{document}}\n\nSummary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeL_recall_stderr": 0.0005325445996971974}, {"task_name": "gem_xsum", "prompt_name": "article_DOC_summary", "rougeL_fmeasure": 0.001910475794160408, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": 
null, "subset": "", "prompt_id": "a8d4ecfa-c944-44d5-878c-04fd5db59e64", "prompt_jinja": "Article: {{document}}\n\nSummary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeL_fmeasure_stderr": 0.0005775932958294451}, {"task_name": "gem_xsum", "prompt_name": "article_DOC_summary", "rougeLsum_precision": 0.002307213397230338, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "a8d4ecfa-c944-44d5-878c-04fd5db59e64", "prompt_jinja": "Article: {{document}}\n\nSummary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeLsum_precision_stderr": 0.0006934611042748366}, {"task_name": "gem_xsum", "prompt_name": "article_DOC_summary", "rougeLsum_recall": 0.001892317665185999, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "a8d4ecfa-c944-44d5-878c-04fd5db59e64", "prompt_jinja": "Article: {{document}}\n\nSummary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeLsum_recall_stderr": 0.0005664327179419651}, {"task_name": "gem_xsum", "prompt_name": "article_DOC_summary", "rougeLsum_fmeasure": 0.0020447516574242644, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "a8d4ecfa-c944-44d5-878c-04fd5db59e64", "prompt_jinja": "Article: {{document}}\n\nSummary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeLsum_fmeasure_stderr": 0.000612874204929875}, {"task_name": "gem_xsum", "prompt_name": "article_DOC_summary", "bleu": 1.1094899638874875e-37, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "a8d4ecfa-c944-44d5-878c-04fd5db59e64", "prompt_jinja": "Article: {{document}}\n\nSummary: ||| {{target}}", "prompt_original_task": true, "comment": "", "bleu_stderr": 1.0662006738968273e-31}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-2b8-55b-c4seeds/2b855b9bc4seed1/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 5, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}}
2b855b9bc4seed1/evaluation/generation/examples.2b855b9bc4seed1_GEM-web_nlg_en_PALM_prompt_2.jsonl
ADDED
File without changes
2b855b9bc4seed1/evaluation/generation/examples.2b855b9bc4seed1_GEM-web_nlg_en_PALM_prompt_3.jsonl
ADDED
File without changes
2b855b9bc4seed1/evaluation/generation/examples.2b855b9bc4seed1_GEM-web_nlg_en_PALM_prompt_4.jsonl
ADDED
File without changes
2b855b9bc4seed1/evaluation/generation/examples.2b855b9bc4seed1_GEM-web_nlg_en_PALM_prompt_5.jsonl
ADDED
File without changes
2b855b9bc4seed1/evaluation/generation/examples.2b855b9bc4seed1_GEM-wiki_lingua_en_tldr_en_3.jsonl
ADDED
File without changes
2b855b9bc4seed1/evaluation/generation/examples.2b855b9bc4seed1_GEM-wiki_lingua_en_tldr_en_4.jsonl
ADDED
File without changes
2b855b9bc4seed1/evaluation/generation/examples.2b855b9bc4seed1_GEM-wiki_lingua_en_tldr_en_5.jsonl
ADDED
File without changes
2b855b9bc4seed1/evaluation/generation/examples.2b855b9bc4seed1_e2e_nlg_cleaned_generate_text_restaurant_2.jsonl
ADDED
@@ -0,0 +1,3 @@
version https://git-lfs.github.com/spec/v1
oid sha256:b9cf841b226161291c93621cdea07f208242bd698e73f01ef6161119990c2a8f
size 6555356
2b855b9bc4seed1/evaluation/generation/examples.2b855b9bc4seed1_e2e_nlg_cleaned_generate_text_restaurant_3.jsonl
ADDED
@@ -0,0 +1,3 @@
version https://git-lfs.github.com/spec/v1
oid sha256:1e33e5982a5cb9c474ac101018fe39437d6bd70bb49464fc9647dfc6e2394e29
size 7663191
2b855b9bc4seed1/evaluation/generation/examples.2b855b9bc4seed1_e2e_nlg_cleaned_generate_text_restaurant_4.jsonl
ADDED
@@ -0,0 +1,3 @@
version https://git-lfs.github.com/spec/v1
oid sha256:981366570b6f5164479911044af041ce49524c88389257a45059020bc2633b5e
size 8754161
2b855b9bc4seed1/evaluation/generation/examples.2b855b9bc4seed1_e2e_nlg_cleaned_generate_text_restaurant_5.jsonl
ADDED
@@ -0,0 +1,3 @@
version https://git-lfs.github.com/spec/v1
oid sha256:87012952085cdbecc185ca76605cce02fbcf84a1aad91feeb23fc3607bba6b81
size 9848861
2b855b9bc4seed1/evaluation/generation/examples.2b855b9bc4seed1_gem_xsum_article_DOC_summary_3.jsonl
ADDED
@@ -0,0 +1,3 @@
version https://git-lfs.github.com/spec/v1
oid sha256:84c17a6680e2df8d7ebd172c08a43c279992784452d9950bb1b0f49c9c9354f4
size 9642676
2b855b9bc4seed1/evaluation/generation/examples.2b855b9bc4seed1_gem_xsum_article_DOC_summary_4.jsonl
ADDED
@@ -0,0 +1,3 @@
version https://git-lfs.github.com/spec/v1
oid sha256:3538e8bc295d76ee9ad83063c989e8b90b4343cf7ffcedd2b6312b91f4d7ff32
size 11671805
2b855b9bc4seed1/evaluation/generation/examples.2b855b9bc4seed1_gem_xsum_article_DOC_summary_5.jsonl
ADDED
@@ -0,0 +1,3 @@
version https://git-lfs.github.com/spec/v1
oid sha256:51f6f0ca88e208ba87adb2a5d3bc46de505c984fec840ca0efd9388e72636cf6
size 13897450
2b855b9bc4seed1/evaluation/generation/merged.csv
ADDED
@@ -0,0 +1,39 @@
dataset,fewshots,prompt,metric,value
e2e_nlg_cleaned,0,generate_text_restaurant,rouge2_fmeasure,0.0025241396159283234
e2e_nlg_cleaned,0,median,rouge2_fmeasure,0.0025241396159283234
e2e_nlg_cleaned,1,generate_text_restaurant,rouge2_fmeasure,0.162134939165695
e2e_nlg_cleaned,1,median,rouge2_fmeasure,0.162134939165695
e2e_nlg_cleaned,2,generate_text_restaurant,rouge2_fmeasure,0.13955166174444095
e2e_nlg_cleaned,2,median,rouge2_fmeasure,0.13955166174444095
e2e_nlg_cleaned,3,generate_text_restaurant,rouge2_fmeasure,0.14093386031313695
e2e_nlg_cleaned,3,median,rouge2_fmeasure,0.14093386031313695
e2e_nlg_cleaned,4,generate_text_restaurant,rouge2_fmeasure,0.1403526845297047
e2e_nlg_cleaned,4,median,rouge2_fmeasure,0.1403526845297047
e2e_nlg_cleaned,5,generate_text_restaurant,rouge2_fmeasure,0.13979285541625697
e2e_nlg_cleaned,5,median,rouge2_fmeasure,0.13979285541625697
e2e_nlg_cleaned,5,average,multiple,0.12088169013086048
gem_xsum,0,article_DOC_summary,rouge2_fmeasure,0.04580671342088997
gem_xsum,0,median,rouge2_fmeasure,0.04580671342088997
gem_xsum,1,article_DOC_summary,rouge2_fmeasure,0.03052402695525932
gem_xsum,1,median,rouge2_fmeasure,0.03052402695525932
gem_xsum,2,article_DOC_summary,rouge2_fmeasure,0.029102694813621793
gem_xsum,2,median,rouge2_fmeasure,0.029102694813621793
gem_xsum,3,article_DOC_summary,rouge2_fmeasure,0.027201929614022874
gem_xsum,3,median,rouge2_fmeasure,0.027201929614022874
gem_xsum,4,article_DOC_summary,rouge2_fmeasure,0.008141029291990268
gem_xsum,4,median,rouge2_fmeasure,0.008141029291990268
gem_xsum,5,article_DOC_summary,rouge2_fmeasure,0.0002519246692387647
gem_xsum,5,median,rouge2_fmeasure,0.0002519246692387647
gem_xsum,5,average,multiple,0.023504719794170497
web_nlg_en,0,PALM_prompt,rouge2_fmeasure,0.05142765596627262
web_nlg_en,0,median,rouge2_fmeasure,0.05142765596627262
web_nlg_en,1,PALM_prompt,rouge2_fmeasure,0.051755190143320286
web_nlg_en,1,median,rouge2_fmeasure,0.051755190143320286
web_nlg_en,1,average,multiple,0.051591423054796456
wiki_lingua_en,0,tldr_en,rouge2_fmeasure,0.03487145839395421
wiki_lingua_en,0,median,rouge2_fmeasure,0.03487145839395421
wiki_lingua_en,1,tldr_en,rouge2_fmeasure,0.03614437414445121
wiki_lingua_en,1,median,rouge2_fmeasure,0.03614437414445121
wiki_lingua_en,2,tldr_en,rouge2_fmeasure,0.04349341038860359
wiki_lingua_en,2,median,rouge2_fmeasure,0.04349341038860359
wiki_lingua_en,2,average,multiple,0.038169747642336334
2b855b9bc4seed1/evaluation/generation/merged.json
ADDED
@@ -0,0 +1 @@
{"GEM/web_nlg_en": {"0": {"PALM_prompt": {"bleu": 0.4066206711982134, "bleu_stderr": 0.033723013965742, "rouge1_fmeasure": 0.10895883012469396, "rouge1_fmeasure_stderr": 0.002008528704603682, "rouge1_precision": 0.07192732542952907, "rouge1_precision_stderr": 0.0016382475971692088, "rouge1_recall": 0.30547187741815907, "rouge1_recall_stderr": 0.004666013432977962, "rouge2_fmeasure": 0.05142765596627262, "rouge2_fmeasure_stderr": 0.001246824543408159, "rouge2_precision": 0.034124224521766514, "rouge2_precision_stderr": 0.0011258346949061335, "rouge2_recall": 0.14893550564787877, "rouge2_recall_stderr": 0.0032632079712954696, "rougeL_fmeasure": 0.10554163962469583, "rougeL_fmeasure_stderr": 0.0018999401584976951, "rougeL_precision": 0.06951188299150508, "rougeL_precision_stderr": 0.0015536321007724944, "rougeL_recall": 0.29797998285094185, "rougeL_recall_stderr": 0.004578539464668976, "rougeLsum_fmeasure": 0.10415553154282955, "rougeLsum_fmeasure_stderr": 0.0018954370177080594, "rougeLsum_precision": 0.06880186502940373, "rougeLsum_precision_stderr": 0.0015660770018247742, "rougeLsum_recall": 0.2918595743249444, "rougeLsum_recall_stderr": 0.004365973340593178}}, "1": {"PALM_prompt": {"bleu": 0.441015227234624, "bleu_stderr": 0.02811928663369506, "rouge1_fmeasure": 0.11213595098636865, "rouge1_fmeasure_stderr": 0.001929589035018206, "rouge1_precision": 0.0726548726317011, "rouge1_precision_stderr": 0.0015123203660414558, "rouge1_recall": 0.3480211924440434, "rouge1_recall_stderr": 0.005007162611700053, "rouge2_fmeasure": 0.051755190143320286, "rouge2_fmeasure_stderr": 0.0012231844221965455, "rouge2_precision": 0.0335432322481532, "rouge2_precision_stderr": 0.0009695182541133993, "rouge2_recall": 0.16639593533541566, "rouge2_recall_stderr": 0.0034320944741141886, "rougeL_fmeasure": 0.10765664842911221, "rougeL_fmeasure_stderr": 0.001806223035008682, "rougeL_precision": 0.06967949917271966, "rougeL_precision_stderr": 0.0014170582927231956, "rougeL_recall": 0.3335173366403848, "rougeL_recall_stderr": 0.004760042112091419, "rougeLsum_fmeasure": 0.10716257387185534, "rougeLsum_fmeasure_stderr": 0.0018203621728808315, "rougeLsum_precision": 0.06949682130009485, "rougeLsum_precision_stderr": 0.00144169867509152, "rougeLsum_recall": 0.33110876740142015, "rougeLsum_recall_stderr": 0.004635864354540981}}}, "GEM/wiki_lingua_en": {"0": {"tldr_en": {"bleu": 1.5382561696068602, "bleu_stderr": 0.06399544328700199, "rouge1_fmeasure": 0.17482546253918968, "rouge1_fmeasure_stderr": 0.0017962093241495317, "rouge1_precision": 0.1493431846788089, "rouge1_precision_stderr": 0.0018651057951166279, "rouge1_recall": 0.2548583336571623, "rouge1_recall_stderr": 0.002528299389518057, "rouge2_fmeasure": 0.03487145839395421, "rouge2_fmeasure_stderr": 0.0008257165713416542, "rouge2_precision": 0.02958059388980271, "rouge2_precision_stderr": 0.0007357126571931123, "rouge2_recall": 0.0521633146578729, "rouge2_recall_stderr": 0.0013585554854051705, "rougeL_fmeasure": 0.13645058466848356, "rougeL_fmeasure_stderr": 0.0012505081899530457, "rougeL_precision": 0.11482322910631781, "rougeL_precision_stderr": 0.0012610070217646853, "rougeL_recall": 0.2047896774759962, "rougeL_recall_stderr": 0.0020475160072816, "rougeLsum_fmeasure": 0.16137882388270616, "rougeLsum_fmeasure_stderr": 0.0016412506052086466, "rougeLsum_precision": 0.13765195894138185, "rougeLsum_precision_stderr": 0.0017075576824449126, "rougeLsum_recall": 0.23641320862844858, "rougeLsum_recall_stderr": 0.0023610992069355388}}, "1": {"tldr_en": {"bleu": 
1.7101242197452051, "bleu_stderr": 0.04333761308856783, "rouge1_fmeasure": 0.1796806784514, "rouge1_fmeasure_stderr": 0.0018714401391125607, "rouge1_precision": 0.1550521213351438, "rouge1_precision_stderr": 0.0019541763971361132, "rouge1_recall": 0.2591201747750158, "rouge1_recall_stderr": 0.0026478655932746374, "rouge2_fmeasure": 0.03614437414445121, "rouge2_fmeasure_stderr": 0.0008460043362571757, "rouge2_precision": 0.030990812889466937, "rouge2_precision_stderr": 0.0007609017578133211, "rouge2_recall": 0.05362608829041054, "rouge2_recall_stderr": 0.001393078096579126, "rougeL_fmeasure": 0.13484320380645512, "rougeL_fmeasure_stderr": 0.001257828140076275, "rougeL_precision": 0.11485232716958933, "rougeL_precision_stderr": 0.0012878987216988057, "rougeL_recall": 0.19958644094767267, "rougeL_recall_stderr": 0.002041519133152489, "rougeLsum_fmeasure": 0.16735135112097876, "rougeLsum_fmeasure_stderr": 0.0017317067736812232, "rougeLsum_precision": 0.14415005003230233, "rougeLsum_precision_stderr": 0.0018020269635249566, "rougeLsum_recall": 0.24228585475636266, "rougeLsum_recall_stderr": 0.0024919558768600326}}, "2": {"tldr_en": {"bleu": 2.1448539044725874, "bleu_stderr": 0.06277677171239805, "rouge1_fmeasure": 0.1970048673630101, "rouge1_fmeasure_stderr": 0.001873251264352212, "rouge1_precision": 0.1705870674804116, "rouge1_precision_stderr": 0.002048796485044356, "rouge1_recall": 0.28464437226221473, "rouge1_recall_stderr": 0.0026857267394890393, "rouge2_fmeasure": 0.04349341038860359, "rouge2_fmeasure_stderr": 0.000895910412734044, "rouge2_precision": 0.0375931104436251, "rouge2_precision_stderr": 0.0008379763934403864, "rouge2_recall": 0.0642856259647273, "rouge2_recall_stderr": 0.0014690192240657321, "rougeL_fmeasure": 0.1456514344157347, "rougeL_fmeasure_stderr": 0.0012777745388170482, "rougeL_precision": 0.12469459823199261, "rougeL_precision_stderr": 0.001380245091026989, "rougeL_recall": 0.21573877115274695, "rougeL_recall_stderr": 0.002116828674796055, "rougeLsum_fmeasure": 0.18373940262798896, "rougeLsum_fmeasure_stderr": 0.001747803293779092, "rougeLsum_precision": 0.15874261945387053, "rougeLsum_precision_stderr": 0.0018989313183848545, "rougeLsum_recall": 0.26633944675442933, "rougeLsum_recall_stderr": 0.0025440410206595122}}}, "e2e_nlg_cleaned": {"0": {"generate_text_restaurant": {"bleu": 0.23848794691656622, "bleu_stderr": 0.03586833580767261, "rouge1_fmeasure": 0.02476955743708725, "rouge1_fmeasure_stderr": 0.0007710556690163732, "rouge1_precision": 0.019286698361945742, "rouge1_precision_stderr": 0.0006077660476531964, "rouge1_recall": 0.03771027274112491, "rouge1_recall_stderr": 0.0012643509148260043, "rouge2_fmeasure": 0.0025241396159283234, "rouge2_fmeasure_stderr": 0.00028150795832003375, "rouge2_precision": 0.0018866790365766187, "rouge2_precision_stderr": 0.0002143326315912907, "rouge2_recall": 0.004268812953176167, "rouge2_recall_stderr": 0.00046738906149789073, "rougeL_fmeasure": 0.02408262386824008, "rougeL_fmeasure_stderr": 0.0007136995922629838, "rougeL_precision": 0.018711596296599558, "rougeL_precision_stderr": 0.000549656942001388, "rougeL_recall": 0.036704663603431754, "rougeL_recall_stderr": 0.0011874696198706125, "rougeLsum_fmeasure": 0.022774229967586195, "rougeLsum_fmeasure_stderr": 0.0006671086701287083, "rougeLsum_precision": 0.017739797167011156, "rougeLsum_precision_stderr": 0.0005177890835136993, "rougeLsum_recall": 0.03456546163273569, "rougeLsum_recall_stderr": 0.0011034522576739757}}, "1": {"generate_text_restaurant": {"bleu": 7.611947046679122, 
"bleu_stderr": 0.1143274370287527, "rouge1_fmeasure": 0.37951927025471477, "rouge1_fmeasure_stderr": 0.002437652542254183, "rouge1_precision": 0.39546550497235883, "rouge1_precision_stderr": 0.00338263349515896, "rouge1_recall": 0.41781588842973927, "rouge1_recall_stderr": 0.0028231707574303336, "rouge2_fmeasure": 0.162134939165695, "rouge2_fmeasure_stderr": 0.0018063315065409062, "rouge2_precision": 0.17176180426713916, "rouge2_precision_stderr": 0.002231883030057573, "rouge2_recall": 0.17748075850051998, "rouge2_recall_stderr": 0.0020235877252369762, "rougeL_fmeasure": 0.2827204639054135, "rougeL_fmeasure_stderr": 0.001840892993788695, "rougeL_precision": 0.29255451415473116, "rougeL_precision_stderr": 0.0025475028016928546, "rougeL_recall": 0.31556464887995433, "rougeL_recall_stderr": 0.0023200773160819157, "rougeLsum_fmeasure": 0.3085637675341545, "rougeLsum_fmeasure_stderr": 0.002263042771686147, "rougeLsum_precision": 0.3218545869658081, "rougeLsum_precision_stderr": 0.0030063312376711593, "rougeLsum_recall": 0.33940243915149054, "rougeLsum_recall_stderr": 0.002593279963684452}}, "2": {"generate_text_restaurant": {"bleu": 6.018041940580495, "bleu_stderr": 0.07197788580096429, "rouge1_fmeasure": 0.32521240877335217, "rouge1_fmeasure_stderr": 0.0019278160903662194, "rouge1_precision": 0.26958144890836977, "rouge1_precision_stderr": 0.0021540233079166827, "rouge1_recall": 0.4588728570611039, "rouge1_recall_stderr": 0.0027449063068318643, "rouge2_fmeasure": 0.13955166174444095, "rouge2_fmeasure_stderr": 0.0014269312475032364, "rouge2_precision": 0.11570444930629259, "rouge2_precision_stderr": 0.0014199073558424482, "rouge2_recall": 0.2004243711941686, "rouge2_recall_stderr": 0.0021042737791853398, "rougeL_fmeasure": 0.262564054866777, "rougeL_fmeasure_stderr": 0.0014381342603292985, "rougeL_precision": 0.2156185303251572, "rougeL_precision_stderr": 0.0015614536366490293, "rougeL_recall": 0.3759398467983639, "rougeL_recall_stderr": 0.002381221921078589, "rougeLsum_fmeasure": 0.2680607212245618, "rougeLsum_fmeasure_stderr": 0.0018300140862428237, "rougeLsum_precision": 0.22224277917141733, "rougeLsum_precision_stderr": 0.0019418664704165772, "rougeLsum_recall": 0.3788810896543758, "rougeLsum_recall_stderr": 0.0026592931094796103}}, "3": {"generate_text_restaurant": {"bleu": 6.247143410389527, "bleu_stderr": 0.09336005465042321, "rouge1_fmeasure": 0.32338937321619443, "rouge1_fmeasure_stderr": 0.0017934056667561753, "rouge1_precision": 0.25907184832717167, "rouge1_precision_stderr": 0.001803996044494107, "rouge1_recall": 0.4710593035922337, "rouge1_recall_stderr": 0.0027215741063189363, "rouge2_fmeasure": 0.14093386031313695, "rouge2_fmeasure_stderr": 0.0013638568945346288, "rouge2_precision": 0.11204434793169746, "rouge2_precision_stderr": 0.0012125669003284166, "rouge2_recall": 0.21007032469689518, "rouge2_recall_stderr": 0.0021257583048451844, "rougeL_fmeasure": 0.2638096769634126, "rougeL_fmeasure_stderr": 0.0013632619425518314, "rougeL_precision": 0.2101184628317088, "rougeL_precision_stderr": 0.001355833935975265, "rougeL_recall": 0.3886278435049077, "rougeL_recall_stderr": 0.0023407273478071285, "rougeLsum_fmeasure": 0.2689516848427162, "rougeLsum_fmeasure_stderr": 0.0017385224354802351, "rougeLsum_precision": 0.21561354041284292, "rougeLsum_precision_stderr": 0.0016830247054440658, "rougeLsum_recall": 0.3919032879874159, "rougeLsum_recall_stderr": 0.002632582004466588}}, "4": {"generate_text_restaurant": {"bleu": 6.208222380745828, "bleu_stderr": 0.09813424389228141, 
"rouge1_fmeasure": 0.32023650433333456, "rouge1_fmeasure_stderr": 0.0017784408554274436, "rouge1_precision": 0.25403488264023616, "rouge1_precision_stderr": 0.00174969774558582, "rouge1_recall": 0.4709476911476187, "rouge1_recall_stderr": 0.002655138227534261, "rouge2_fmeasure": 0.1403526845297047, "rouge2_fmeasure_stderr": 0.0013607857847417898, "rouge2_precision": 0.11058685749210603, "rouge2_precision_stderr": 0.0012080055063255909, "rouge2_recall": 0.21167766216290337, "rouge2_recall_stderr": 0.002144496564072756, "rougeL_fmeasure": 0.26122109578349895, "rougeL_fmeasure_stderr": 0.0013494634641885617, "rougeL_precision": 0.20576507091031662, "rougeL_precision_stderr": 0.0012824323500063948, "rougeL_recall": 0.3889751930979724, "rougeL_recall_stderr": 0.0023221115722203652, "rougeLsum_fmeasure": 0.2670143871011222, "rougeLsum_fmeasure_stderr": 0.0017625078468513325, "rougeLsum_precision": 0.21178966636346616, "rougeLsum_precision_stderr": 0.0016557432144235155, "rougeLsum_recall": 0.393450062046956, "rougeLsum_recall_stderr": 0.002663580316470773}}, "5": {"generate_text_restaurant": {"bleu": 6.148604351164331, "bleu_stderr": 0.11104852380350659, "rouge1_fmeasure": 0.3199925141836084, "rouge1_fmeasure_stderr": 0.0017551605827543929, "rouge1_precision": 0.2528456956077577, "rouge1_precision_stderr": 0.0017424978255548198, "rouge1_recall": 0.473017800721126, "rouge1_recall_stderr": 0.0026123356126015254, "rouge2_fmeasure": 0.13979285541625697, "rouge2_fmeasure_stderr": 0.0013089259606601385, "rouge2_precision": 0.10974089008299018, "rouge2_precision_stderr": 0.0011789832834562743, "rouge2_recall": 0.2119589387667116, "rouge2_recall_stderr": 0.0020903039724611107, "rougeL_fmeasure": 0.2608746206749859, "rougeL_fmeasure_stderr": 0.0013206910936061742, "rougeL_precision": 0.20463239752897472, "rougeL_precision_stderr": 0.0012743661964013022, "rougeL_recall": 0.3904493381375762, "rougeL_recall_stderr": 0.002278919766787325, "rougeLsum_fmeasure": 0.26667287137591805, "rougeLsum_fmeasure_stderr": 0.001737453548353309, "rougeLsum_precision": 0.21056870894114044, "rougeLsum_precision_stderr": 0.001637983619116459, "rougeLsum_recall": 0.39500142434961627, "rougeLsum_recall_stderr": 0.002625905199940031}}}, "gem_xsum": {"0": {"article_DOC_summary": {"bleu": 1.8439516808341079, "bleu_stderr": 0.09422907248075703, "rouge1_fmeasure": 0.20643072459177322, "rouge1_fmeasure_stderr": 0.0023952712703594957, "rouge1_precision": 0.1489888413499883, "rouge1_precision_stderr": 0.0018521966676135623, "rouge1_recall": 0.35598499223601365, "rouge1_recall_stderr": 0.004207211166343017, "rouge2_fmeasure": 0.04580671342088997, "rouge2_fmeasure_stderr": 0.0015030605732593261, "rouge2_precision": 0.0324623282960817, "rouge2_precision_stderr": 0.0010674592282863239, "rouge2_recall": 0.0820809502686687, "rouge2_recall_stderr": 0.002813424567852905, "rougeL_fmeasure": 0.15483993335255772, "rougeL_fmeasure_stderr": 0.0018017433912559451, "rougeL_precision": 0.11153813287033311, "rougeL_precision_stderr": 0.001374306634500298, "rougeL_recall": 0.26864922067574026, "rougeL_recall_stderr": 0.0033157904418926283, "rougeLsum_fmeasure": 0.16399947665135114, "rougeLsum_fmeasure_stderr": 0.002023909905331339, "rougeLsum_precision": 0.1179757836593057, "rougeLsum_precision_stderr": 0.0015154466049348198, "rougeLsum_recall": 0.2847650880272033, "rougeLsum_recall_stderr": 0.003712939966528145}}, "1": {"article_DOC_summary": {"bleu": 1.247854057178413, "bleu_stderr": 0.053793972634237734, "rouge1_fmeasure": 0.1655189451131417, 
"rouge1_fmeasure_stderr": 0.0023018941372401455, "rouge1_precision": 0.1177467243515202, "rouge1_precision_stderr": 0.0017455506749855451, "rouge1_recall": 0.29171927966859307, "rouge1_recall_stderr": 0.003919612515883716, "rouge2_fmeasure": 0.03052402695525932, "rouge2_fmeasure_stderr": 0.0012776222243206263, "rouge2_precision": 0.02141171153130294, "rouge2_precision_stderr": 0.0008955978971257212, "rouge2_recall": 0.05531529147706192, "rouge2_recall_stderr": 0.002372274102893343, "rougeL_fmeasure": 0.13085315336542225, "rougeL_fmeasure_stderr": 0.0017506812383595754, "rougeL_precision": 0.0927268161322543, "rougeL_precision_stderr": 0.0012892522930530307, "rougeL_recall": 0.232421966160188, "rougeL_recall_stderr": 0.0031136409947685123, "rougeLsum_fmeasure": 0.13257722034008568, "rougeLsum_fmeasure_stderr": 0.0018892826906714033, "rougeLsum_precision": 0.09395615061271292, "rougeLsum_precision_stderr": 0.0013824641506762928, "rougeLsum_recall": 0.23524314570007646, "rougeLsum_recall_stderr": 0.0033502788795521077}}, "2": {"article_DOC_summary": {"bleu": 1.1975588661664014, "bleu_stderr": 0.06263558549987819, "rouge1_fmeasure": 0.165998653508353, "rouge1_fmeasure_stderr": 0.002288522145343238, "rouge1_precision": 0.1177626659310641, "rouge1_precision_stderr": 0.001707946986197866, "rouge1_recall": 0.2928721346826048, "rouge1_recall_stderr": 0.0039006436966328977, "rouge2_fmeasure": 0.029102694813621793, "rouge2_fmeasure_stderr": 0.0012269354923261043, "rouge2_precision": 0.020528446471707117, "rouge2_precision_stderr": 0.000867911954165022, "rouge2_recall": 0.052169823470780564, "rouge2_recall_stderr": 0.0022808799038575088, "rougeL_fmeasure": 0.12917404609126437, "rougeL_fmeasure_stderr": 0.0016814421892826735, "rougeL_precision": 0.09145048851843915, "rougeL_precision_stderr": 0.0012506279990524979, "rougeL_recall": 0.22942628081026484, "rougeL_recall_stderr": 0.002973056999961359, "rougeLsum_fmeasure": 0.13296050139540747, "rougeLsum_fmeasure_stderr": 0.001845998235299578, "rougeLsum_precision": 0.0940902852266538, "rougeLsum_precision_stderr": 0.0013601724232084945, "rougeLsum_recall": 0.23630864651546127, "rougeLsum_recall_stderr": 0.003290166668177434}}, "3": {"article_DOC_summary": {"bleu": 1.1532375468354903, "bleu_stderr": 0.09169402373692534, "rouge1_fmeasure": 0.15647200121804894, "rouge1_fmeasure_stderr": 0.0023727238595388817, "rouge1_precision": 0.1131657921685037, "rouge1_precision_stderr": 0.0018249204798791675, "rouge1_recall": 0.2722598710120147, "rouge1_recall_stderr": 0.004133665114298229, "rouge2_fmeasure": 0.027201929614022874, "rouge2_fmeasure_stderr": 0.0012539141722579825, "rouge2_precision": 0.019363405482896792, "rouge2_precision_stderr": 0.0009105840932812386, "rouge2_recall": 0.049210613825466105, "rouge2_recall_stderr": 0.002366221262626497, "rougeL_fmeasure": 0.12268909574632023, "rougeL_fmeasure_stderr": 0.001773225086338, "rougeL_precision": 0.08876222177060916, "rougeL_precision_stderr": 0.0013761155158951413, "rougeL_recall": 0.21448631404181784, "rougeL_recall_stderr": 0.003198193852945419, "rougeLsum_fmeasure": 0.12593169516392616, "rougeLsum_fmeasure_stderr": 0.001962773566689985, "rougeLsum_precision": 0.09102722612426341, "rougeLsum_precision_stderr": 0.0014951062687195294, "rougeLsum_recall": 0.22021791608209987, "rougeLsum_recall_stderr": 0.003544530658562865}}, "4": {"article_DOC_summary": {"bleu": 0.6055404023350197, "bleu_stderr": 0.09409517819503949, "rouge1_fmeasure": 0.04426046707797172, "rouge1_fmeasure_stderr": 0.0024997989992097624, 
"rouge1_precision": 0.0383955897314317, "rouge1_precision_stderr": 0.0025422863710305266, "rouge1_recall": 0.06915759782304308, "rouge1_recall_stderr": 0.0039608171509673586, "rouge2_fmeasure": 0.008141029291990268, "rouge2_fmeasure_stderr": 0.0008202019731008721, "rouge2_precision": 0.006481358010154669, "rouge2_precision_stderr": 0.0007393187434393857, "rouge2_recall": 0.013044537342091751, "rouge2_recall_stderr": 0.00125152402986618, "rougeL_fmeasure": 0.035265847972026074, "rougeL_fmeasure_stderr": 0.0020006364168593496, "rougeL_precision": 0.031113719615522017, "rougeL_precision_stderr": 0.0021898843752975964, "rougeL_recall": 0.055197273257753664, "rougeL_recall_stderr": 0.003191098630834803, "rougeLsum_fmeasure": 0.03632930315349291, "rougeLsum_fmeasure_stderr": 0.0020756841173550696, "rougeLsum_precision": 0.03210804984465074, "rougeLsum_precision_stderr": 0.0022623416009452243, "rougeLsum_recall": 0.05680048575290953, "rougeLsum_recall_stderr": 0.0032931638988068055}}, "5": {"article_DOC_summary": {"bleu": 1.1094899638874875e-37, "bleu_stderr": 1.0662006738968273e-31, "rouge1_fmeasure": 0.0024831420596845987, "rouge1_fmeasure_stderr": 0.0007070331562075694, "rouge1_precision": 0.002821793157093116, "rouge1_precision_stderr": 0.0008106295472902334, "rouge1_recall": 0.0022851464113943067, "rouge1_recall_stderr": 0.0006484962567584845, "rouge2_fmeasure": 0.0002519246692387647, "rouge2_fmeasure_stderr": 0.00017508831809454146, "rouge2_precision": 0.0002914830883754302, "rouge2_precision_stderr": 0.00020340968454234058, "rouge2_recall": 0.00022401756585996315, "rouge2_recall_stderr": 0.0001545711517822221, "rougeL_fmeasure": 0.001910475794160408, "rougeL_fmeasure_stderr": 0.0005775932958294451, "rougeL_precision": 0.0021547453202339587, "rougeL_precision_stderr": 0.0006563220446223475, "rougeL_recall": 0.0017708458456009723, "rougeL_recall_stderr": 0.0005325445996971974, "rougeLsum_fmeasure": 0.0020447516574242644, "rougeLsum_fmeasure_stderr": 0.000612874204929875, "rougeLsum_precision": 0.002307213397230338, "rougeLsum_precision_stderr": 0.0006934611042748366, "rougeLsum_recall": 0.001892317665185999, "rougeLsum_recall_stderr": 0.0005664327179419651}}}}
2b855b9bc4seed1/evaluation/generation/slim.2b855b9bc4seed1_e2e_nlg_cleaned_generate_text_restaurant_2.json
ADDED
@@ -0,0 +1,133 @@
{
  "results": [
    {
      "task_name": "e2e_nlg_cleaned",
      "prompt_name": "generate_text_restaurant",
      "bleu": 6.018041940580495,
      "dataset_path": "e2e_nlg_cleaned",
      "dataset_name": null,
      "subset": null,
      "bleu_stderr": 0.07197788580096429
    },
    {
      "task_name": "e2e_nlg_cleaned",
      "prompt_name": "generate_text_restaurant",
      "rouge1_precision": 0.26958144890836977,
      "dataset_path": "e2e_nlg_cleaned",
      "dataset_name": null,
      "subset": null,
      "rouge1_precision_stderr": 0.0021540233079166827
    },
    {
      "task_name": "e2e_nlg_cleaned",
      "prompt_name": "generate_text_restaurant",
      "rouge1_recall": 0.4588728570611039,
      "dataset_path": "e2e_nlg_cleaned",
      "dataset_name": null,
      "subset": null,
      "rouge1_recall_stderr": 0.0027449063068318643
    },
    {
      "task_name": "e2e_nlg_cleaned",
      "prompt_name": "generate_text_restaurant",
      "rouge1_fmeasure": 0.32521240877335217,
      "dataset_path": "e2e_nlg_cleaned",
      "dataset_name": null,
      "subset": null,
      "rouge1_fmeasure_stderr": 0.0019278160903662194
    },
    {
      "task_name": "e2e_nlg_cleaned",
      "prompt_name": "generate_text_restaurant",
      "rouge2_precision": 0.11570444930629259,
      "dataset_path": "e2e_nlg_cleaned",
      "dataset_name": null,
      "subset": null,
      "rouge2_precision_stderr": 0.0014199073558424482
    },
    {
      "task_name": "e2e_nlg_cleaned",
      "prompt_name": "generate_text_restaurant",
      "rouge2_recall": 0.2004243711941686,
      "dataset_path": "e2e_nlg_cleaned",
      "dataset_name": null,
      "subset": null,
      "rouge2_recall_stderr": 0.0021042737791853398
    },
    {
      "task_name": "e2e_nlg_cleaned",
      "prompt_name": "generate_text_restaurant",
      "rouge2_fmeasure": 0.13955166174444095,
      "dataset_path": "e2e_nlg_cleaned",
      "dataset_name": null,
      "subset": null,
      "rouge2_fmeasure_stderr": 0.0014269312475032364
    },
    {
      "task_name": "e2e_nlg_cleaned",
      "prompt_name": "generate_text_restaurant",
      "rougeL_precision": 0.2156185303251572,
      "dataset_path": "e2e_nlg_cleaned",
      "dataset_name": null,
      "subset": null,
      "rougeL_precision_stderr": 0.0015614536366490293
    },
    {
      "task_name": "e2e_nlg_cleaned",
      "prompt_name": "generate_text_restaurant",
      "rougeL_recall": 0.3759398467983639,
      "dataset_path": "e2e_nlg_cleaned",
      "dataset_name": null,
      "subset": null,
      "rougeL_recall_stderr": 0.002381221921078589
    },
    {
      "task_name": "e2e_nlg_cleaned",
      "prompt_name": "generate_text_restaurant",
      "rougeL_fmeasure": 0.262564054866777,
      "dataset_path": "e2e_nlg_cleaned",
      "dataset_name": null,
      "subset": null,
      "rougeL_fmeasure_stderr": 0.0014381342603292985
    },
    {
      "task_name": "e2e_nlg_cleaned",
      "prompt_name": "generate_text_restaurant",
      "rougeLsum_precision": 0.22224277917141733,
      "dataset_path": "e2e_nlg_cleaned",
      "dataset_name": null,
      "subset": null,
      "rougeLsum_precision_stderr": 0.0019418664704165772
    },
    {
      "task_name": "e2e_nlg_cleaned",
      "prompt_name": "generate_text_restaurant",
      "rougeLsum_recall": 0.3788810896543758,
      "dataset_path": "e2e_nlg_cleaned",
      "dataset_name": null,
      "subset": null,
      "rougeLsum_recall_stderr": 0.0026592931094796103
    },
    {
      "task_name": "e2e_nlg_cleaned",
      "prompt_name": "generate_text_restaurant",
      "rougeLsum_fmeasure": 0.2680607212245618,
      "dataset_path": "e2e_nlg_cleaned",
      "dataset_name": null,
      "subset": null,
      "rougeLsum_fmeasure_stderr": 0.0018300140862428237
    }
  ],
  "config": {
    "model": "hf-causal",
    "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-2b8-55b-c4seeds/2b855b9bc4seed1/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16",
    "task_args": "",
    "num_fewshot": 2,
    "batch_size": 16,
    "device": "cuda",
    "use_cache": false,
    "limit": 3000,
    "bootstrap_iters": 10,
    "seed": 1234
  }
}
2b855b9bc4seed1/evaluation/generation/slim.2b855b9bc4seed1_e2e_nlg_cleaned_generate_text_restaurant_3.json
ADDED
@@ -0,0 +1,133 @@
{
  "results": [
    {
      "task_name": "e2e_nlg_cleaned",
      "prompt_name": "generate_text_restaurant",
      "bleu": 6.247143410389527,
      "dataset_path": "e2e_nlg_cleaned",
      "dataset_name": null,
      "subset": null,
      "bleu_stderr": 0.09336005465042321
    },
    {
      "task_name": "e2e_nlg_cleaned",
      "prompt_name": "generate_text_restaurant",
      "rouge1_precision": 0.25907184832717167,
      "dataset_path": "e2e_nlg_cleaned",
      "dataset_name": null,
      "subset": null,
      "rouge1_precision_stderr": 0.001803996044494107
    },
    {
      "task_name": "e2e_nlg_cleaned",
      "prompt_name": "generate_text_restaurant",
      "rouge1_recall": 0.4710593035922337,
      "dataset_path": "e2e_nlg_cleaned",
      "dataset_name": null,
      "subset": null,
      "rouge1_recall_stderr": 0.0027215741063189363
    },
    {
      "task_name": "e2e_nlg_cleaned",
      "prompt_name": "generate_text_restaurant",
      "rouge1_fmeasure": 0.32338937321619443,
      "dataset_path": "e2e_nlg_cleaned",
      "dataset_name": null,
      "subset": null,
      "rouge1_fmeasure_stderr": 0.0017934056667561753
    },
    {
      "task_name": "e2e_nlg_cleaned",
      "prompt_name": "generate_text_restaurant",
      "rouge2_precision": 0.11204434793169746,
      "dataset_path": "e2e_nlg_cleaned",
      "dataset_name": null,
      "subset": null,
      "rouge2_precision_stderr": 0.0012125669003284166
    },
    {
      "task_name": "e2e_nlg_cleaned",
      "prompt_name": "generate_text_restaurant",
      "rouge2_recall": 0.21007032469689518,
      "dataset_path": "e2e_nlg_cleaned",
      "dataset_name": null,
      "subset": null,
      "rouge2_recall_stderr": 0.0021257583048451844
    },
    {
      "task_name": "e2e_nlg_cleaned",
      "prompt_name": "generate_text_restaurant",
      "rouge2_fmeasure": 0.14093386031313695,
      "dataset_path": "e2e_nlg_cleaned",
      "dataset_name": null,
      "subset": null,
      "rouge2_fmeasure_stderr": 0.0013638568945346288
    },
    {
      "task_name": "e2e_nlg_cleaned",
      "prompt_name": "generate_text_restaurant",
      "rougeL_precision": 0.2101184628317088,
      "dataset_path": "e2e_nlg_cleaned",
      "dataset_name": null,
      "subset": null,
      "rougeL_precision_stderr": 0.001355833935975265
    },
    {
      "task_name": "e2e_nlg_cleaned",
      "prompt_name": "generate_text_restaurant",
      "rougeL_recall": 0.3886278435049077,
      "dataset_path": "e2e_nlg_cleaned",
      "dataset_name": null,
      "subset": null,
      "rougeL_recall_stderr": 0.0023407273478071285
    },
    {
      "task_name": "e2e_nlg_cleaned",
      "prompt_name": "generate_text_restaurant",
      "rougeL_fmeasure": 0.2638096769634126,
      "dataset_path": "e2e_nlg_cleaned",
      "dataset_name": null,
      "subset": null,
      "rougeL_fmeasure_stderr": 0.0013632619425518314
    },
    {
      "task_name": "e2e_nlg_cleaned",
      "prompt_name": "generate_text_restaurant",
      "rougeLsum_precision": 0.21561354041284292,
      "dataset_path": "e2e_nlg_cleaned",
      "dataset_name": null,
      "subset": null,
      "rougeLsum_precision_stderr": 0.0016830247054440658
    },
    {
      "task_name": "e2e_nlg_cleaned",
      "prompt_name": "generate_text_restaurant",
      "rougeLsum_recall": 0.3919032879874159,
      "dataset_path": "e2e_nlg_cleaned",
      "dataset_name": null,
      "subset": null,
      "rougeLsum_recall_stderr": 0.002632582004466588
    },
    {
      "task_name": "e2e_nlg_cleaned",
      "prompt_name": "generate_text_restaurant",
      "rougeLsum_fmeasure": 0.2689516848427162,
      "dataset_path": "e2e_nlg_cleaned",
      "dataset_name": null,
      "subset": null,
      "rougeLsum_fmeasure_stderr": 0.0017385224354802351
    }
  ],
  "config": {
    "model": "hf-causal",
    "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-2b8-55b-c4seeds/2b855b9bc4seed1/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16",
    "task_args": "",
    "num_fewshot": 3,
    "batch_size": 16,
    "device": "cuda",
    "use_cache": false,
    "limit": 3000,
    "bootstrap_iters": 10,
    "seed": 1234
  }
}
2b855b9bc4seed1/evaluation/generation/slim.2b855b9bc4seed1_e2e_nlg_cleaned_generate_text_restaurant_4.json
ADDED
@@ -0,0 +1,133 @@
+{
+  "results": [
+    {
+      "task_name": "e2e_nlg_cleaned",
+      "prompt_name": "generate_text_restaurant",
+      "bleu": 6.208222380745828,
+      "dataset_path": "e2e_nlg_cleaned",
+      "dataset_name": null,
+      "subset": null,
+      "bleu_stderr": 0.09813424389228141
+    },
+    {
+      "task_name": "e2e_nlg_cleaned",
+      "prompt_name": "generate_text_restaurant",
+      "rouge1_precision": 0.25403488264023616,
+      "dataset_path": "e2e_nlg_cleaned",
+      "dataset_name": null,
+      "subset": null,
+      "rouge1_precision_stderr": 0.00174969774558582
+    },
+    {
+      "task_name": "e2e_nlg_cleaned",
+      "prompt_name": "generate_text_restaurant",
+      "rouge1_recall": 0.4709476911476187,
+      "dataset_path": "e2e_nlg_cleaned",
+      "dataset_name": null,
+      "subset": null,
+      "rouge1_recall_stderr": 0.002655138227534261
+    },
+    {
+      "task_name": "e2e_nlg_cleaned",
+      "prompt_name": "generate_text_restaurant",
+      "rouge1_fmeasure": 0.32023650433333456,
+      "dataset_path": "e2e_nlg_cleaned",
+      "dataset_name": null,
+      "subset": null,
+      "rouge1_fmeasure_stderr": 0.0017784408554274436
+    },
+    {
+      "task_name": "e2e_nlg_cleaned",
+      "prompt_name": "generate_text_restaurant",
+      "rouge2_precision": 0.11058685749210603,
+      "dataset_path": "e2e_nlg_cleaned",
+      "dataset_name": null,
+      "subset": null,
+      "rouge2_precision_stderr": 0.0012080055063255909
+    },
+    {
+      "task_name": "e2e_nlg_cleaned",
+      "prompt_name": "generate_text_restaurant",
+      "rouge2_recall": 0.21167766216290337,
+      "dataset_path": "e2e_nlg_cleaned",
+      "dataset_name": null,
+      "subset": null,
+      "rouge2_recall_stderr": 0.002144496564072756
+    },
+    {
+      "task_name": "e2e_nlg_cleaned",
+      "prompt_name": "generate_text_restaurant",
+      "rouge2_fmeasure": 0.1403526845297047,
+      "dataset_path": "e2e_nlg_cleaned",
+      "dataset_name": null,
+      "subset": null,
+      "rouge2_fmeasure_stderr": 0.0013607857847417898
+    },
+    {
+      "task_name": "e2e_nlg_cleaned",
+      "prompt_name": "generate_text_restaurant",
+      "rougeL_precision": 0.20576507091031662,
+      "dataset_path": "e2e_nlg_cleaned",
+      "dataset_name": null,
+      "subset": null,
+      "rougeL_precision_stderr": 0.0012824323500063948
+    },
+    {
+      "task_name": "e2e_nlg_cleaned",
+      "prompt_name": "generate_text_restaurant",
+      "rougeL_recall": 0.3889751930979724,
+      "dataset_path": "e2e_nlg_cleaned",
+      "dataset_name": null,
+      "subset": null,
+      "rougeL_recall_stderr": 0.0023221115722203652
+    },
+    {
+      "task_name": "e2e_nlg_cleaned",
+      "prompt_name": "generate_text_restaurant",
+      "rougeL_fmeasure": 0.26122109578349895,
+      "dataset_path": "e2e_nlg_cleaned",
+      "dataset_name": null,
+      "subset": null,
+      "rougeL_fmeasure_stderr": 0.0013494634641885617
+    },
+    {
+      "task_name": "e2e_nlg_cleaned",
+      "prompt_name": "generate_text_restaurant",
+      "rougeLsum_precision": 0.21178966636346616,
+      "dataset_path": "e2e_nlg_cleaned",
+      "dataset_name": null,
+      "subset": null,
+      "rougeLsum_precision_stderr": 0.0016557432144235155
+    },
+    {
+      "task_name": "e2e_nlg_cleaned",
+      "prompt_name": "generate_text_restaurant",
+      "rougeLsum_recall": 0.393450062046956,
+      "dataset_path": "e2e_nlg_cleaned",
+      "dataset_name": null,
+      "subset": null,
+      "rougeLsum_recall_stderr": 0.002663580316470773
+    },
+    {
+      "task_name": "e2e_nlg_cleaned",
+      "prompt_name": "generate_text_restaurant",
+      "rougeLsum_fmeasure": 0.2670143871011222,
+      "dataset_path": "e2e_nlg_cleaned",
+      "dataset_name": null,
+      "subset": null,
+      "rougeLsum_fmeasure_stderr": 0.0017625078468513325
+    }
+  ],
+  "config": {
+    "model": "hf-causal",
+    "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-2b8-55b-c4seeds/2b855b9bc4seed1/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16",
+    "task_args": "",
+    "num_fewshot": 4,
+    "batch_size": 16,
+    "device": "cuda",
+    "use_cache": false,
+    "limit": 3000,
+    "bootstrap_iters": 10,
+    "seed": 1234
+  }
+}
2b855b9bc4seed1/evaluation/generation/slim.2b855b9bc4seed1_e2e_nlg_cleaned_generate_text_restaurant_5.json
ADDED
@@ -0,0 +1,133 @@
+{
+  "results": [
+    {
+      "task_name": "e2e_nlg_cleaned",
+      "prompt_name": "generate_text_restaurant",
+      "bleu": 6.148604351164331,
+      "dataset_path": "e2e_nlg_cleaned",
+      "dataset_name": null,
+      "subset": null,
+      "bleu_stderr": 0.11104852380350659
+    },
+    {
+      "task_name": "e2e_nlg_cleaned",
+      "prompt_name": "generate_text_restaurant",
+      "rouge1_precision": 0.2528456956077577,
+      "dataset_path": "e2e_nlg_cleaned",
+      "dataset_name": null,
+      "subset": null,
+      "rouge1_precision_stderr": 0.0017424978255548198
+    },
+    {
+      "task_name": "e2e_nlg_cleaned",
+      "prompt_name": "generate_text_restaurant",
+      "rouge1_recall": 0.473017800721126,
+      "dataset_path": "e2e_nlg_cleaned",
+      "dataset_name": null,
+      "subset": null,
+      "rouge1_recall_stderr": 0.0026123356126015254
+    },
+    {
+      "task_name": "e2e_nlg_cleaned",
+      "prompt_name": "generate_text_restaurant",
+      "rouge1_fmeasure": 0.3199925141836084,
+      "dataset_path": "e2e_nlg_cleaned",
+      "dataset_name": null,
+      "subset": null,
+      "rouge1_fmeasure_stderr": 0.0017551605827543929
+    },
+    {
+      "task_name": "e2e_nlg_cleaned",
+      "prompt_name": "generate_text_restaurant",
+      "rouge2_precision": 0.10974089008299018,
+      "dataset_path": "e2e_nlg_cleaned",
+      "dataset_name": null,
+      "subset": null,
+      "rouge2_precision_stderr": 0.0011789832834562743
+    },
+    {
+      "task_name": "e2e_nlg_cleaned",
+      "prompt_name": "generate_text_restaurant",
+      "rouge2_recall": 0.2119589387667116,
+      "dataset_path": "e2e_nlg_cleaned",
+      "dataset_name": null,
+      "subset": null,
+      "rouge2_recall_stderr": 0.0020903039724611107
+    },
+    {
+      "task_name": "e2e_nlg_cleaned",
+      "prompt_name": "generate_text_restaurant",
+      "rouge2_fmeasure": 0.13979285541625697,
+      "dataset_path": "e2e_nlg_cleaned",
+      "dataset_name": null,
+      "subset": null,
+      "rouge2_fmeasure_stderr": 0.0013089259606601385
+    },
+    {
+      "task_name": "e2e_nlg_cleaned",
+      "prompt_name": "generate_text_restaurant",
+      "rougeL_precision": 0.20463239752897472,
+      "dataset_path": "e2e_nlg_cleaned",
+      "dataset_name": null,
+      "subset": null,
+      "rougeL_precision_stderr": 0.0012743661964013022
+    },
+    {
+      "task_name": "e2e_nlg_cleaned",
+      "prompt_name": "generate_text_restaurant",
+      "rougeL_recall": 0.3904493381375762,
+      "dataset_path": "e2e_nlg_cleaned",
+      "dataset_name": null,
+      "subset": null,
+      "rougeL_recall_stderr": 0.002278919766787325
+    },
+    {
+      "task_name": "e2e_nlg_cleaned",
+      "prompt_name": "generate_text_restaurant",
+      "rougeL_fmeasure": 0.2608746206749859,
+      "dataset_path": "e2e_nlg_cleaned",
+      "dataset_name": null,
+      "subset": null,
+      "rougeL_fmeasure_stderr": 0.0013206910936061742
+    },
+    {
+      "task_name": "e2e_nlg_cleaned",
+      "prompt_name": "generate_text_restaurant",
+      "rougeLsum_precision": 0.21056870894114044,
+      "dataset_path": "e2e_nlg_cleaned",
+      "dataset_name": null,
+      "subset": null,
+      "rougeLsum_precision_stderr": 0.001637983619116459
+    },
+    {
+      "task_name": "e2e_nlg_cleaned",
+      "prompt_name": "generate_text_restaurant",
+      "rougeLsum_recall": 0.39500142434961627,
+      "dataset_path": "e2e_nlg_cleaned",
+      "dataset_name": null,
+      "subset": null,
+      "rougeLsum_recall_stderr": 0.002625905199940031
+    },
+    {
+      "task_name": "e2e_nlg_cleaned",
+      "prompt_name": "generate_text_restaurant",
+      "rougeLsum_fmeasure": 0.26667287137591805,
+      "dataset_path": "e2e_nlg_cleaned",
+      "dataset_name": null,
+      "subset": null,
+      "rougeLsum_fmeasure_stderr": 0.001737453548353309
+    }
+  ],
+  "config": {
+    "model": "hf-causal",
+    "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-2b8-55b-c4seeds/2b855b9bc4seed1/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16",
+    "task_args": "",
+    "num_fewshot": 5,
+    "batch_size": 16,
+    "device": "cuda",
+    "use_cache": false,
+    "limit": 3000,
+    "bootstrap_iters": 10,
+    "seed": 1234
+  }
+}
2b855b9bc4seed1/evaluation/generation/slim.2b855b9bc4seed1_gem_xsum_article_DOC_summary_3.json
ADDED
@@ -0,0 +1,133 @@
+{
+  "results": [
+    {
+      "task_name": "gem_xsum",
+      "prompt_name": "article_DOC_summary",
+      "rouge1_precision": 0.1131657921685037,
+      "dataset_path": "GEM/xsum",
+      "dataset_name": null,
+      "subset": "",
+      "rouge1_precision_stderr": 0.0018249204798791675
+    },
+    {
+      "task_name": "gem_xsum",
+      "prompt_name": "article_DOC_summary",
+      "rouge1_recall": 0.2722598710120147,
+      "dataset_path": "GEM/xsum",
+      "dataset_name": null,
+      "subset": "",
+      "rouge1_recall_stderr": 0.004133665114298229
+    },
+    {
+      "task_name": "gem_xsum",
+      "prompt_name": "article_DOC_summary",
+      "rouge1_fmeasure": 0.15647200121804894,
+      "dataset_path": "GEM/xsum",
+      "dataset_name": null,
+      "subset": "",
+      "rouge1_fmeasure_stderr": 0.0023727238595388817
+    },
+    {
+      "task_name": "gem_xsum",
+      "prompt_name": "article_DOC_summary",
+      "rouge2_precision": 0.019363405482896792,
+      "dataset_path": "GEM/xsum",
+      "dataset_name": null,
+      "subset": "",
+      "rouge2_precision_stderr": 0.0009105840932812386
+    },
+    {
+      "task_name": "gem_xsum",
+      "prompt_name": "article_DOC_summary",
+      "rouge2_recall": 0.049210613825466105,
+      "dataset_path": "GEM/xsum",
+      "dataset_name": null,
+      "subset": "",
+      "rouge2_recall_stderr": 0.002366221262626497
+    },
+    {
+      "task_name": "gem_xsum",
+      "prompt_name": "article_DOC_summary",
+      "rouge2_fmeasure": 0.027201929614022874,
+      "dataset_path": "GEM/xsum",
+      "dataset_name": null,
+      "subset": "",
+      "rouge2_fmeasure_stderr": 0.0012539141722579825
+    },
+    {
+      "task_name": "gem_xsum",
+      "prompt_name": "article_DOC_summary",
+      "rougeL_precision": 0.08876222177060916,
+      "dataset_path": "GEM/xsum",
+      "dataset_name": null,
+      "subset": "",
+      "rougeL_precision_stderr": 0.0013761155158951413
+    },
+    {
+      "task_name": "gem_xsum",
+      "prompt_name": "article_DOC_summary",
+      "rougeL_recall": 0.21448631404181784,
+      "dataset_path": "GEM/xsum",
+      "dataset_name": null,
+      "subset": "",
+      "rougeL_recall_stderr": 0.003198193852945419
+    },
+    {
+      "task_name": "gem_xsum",
+      "prompt_name": "article_DOC_summary",
+      "rougeL_fmeasure": 0.12268909574632023,
+      "dataset_path": "GEM/xsum",
+      "dataset_name": null,
+      "subset": "",
+      "rougeL_fmeasure_stderr": 0.001773225086338
+    },
+    {
+      "task_name": "gem_xsum",
+      "prompt_name": "article_DOC_summary",
+      "rougeLsum_precision": 0.09102722612426341,
+      "dataset_path": "GEM/xsum",
+      "dataset_name": null,
+      "subset": "",
+      "rougeLsum_precision_stderr": 0.0014951062687195294
+    },
+    {
+      "task_name": "gem_xsum",
+      "prompt_name": "article_DOC_summary",
+      "rougeLsum_recall": 0.22021791608209987,
+      "dataset_path": "GEM/xsum",
+      "dataset_name": null,
+      "subset": "",
+      "rougeLsum_recall_stderr": 0.003544530658562865
+    },
+    {
+      "task_name": "gem_xsum",
+      "prompt_name": "article_DOC_summary",
+      "rougeLsum_fmeasure": 0.12593169516392616,
+      "dataset_path": "GEM/xsum",
+      "dataset_name": null,
+      "subset": "",
+      "rougeLsum_fmeasure_stderr": 0.001962773566689985
+    },
+    {
+      "task_name": "gem_xsum",
+      "prompt_name": "article_DOC_summary",
+      "bleu": 1.1532375468354903,
+      "dataset_path": "GEM/xsum",
+      "dataset_name": null,
+      "subset": "",
+      "bleu_stderr": 0.09169402373692534
+    }
+  ],
+  "config": {
+    "model": "hf-causal",
+    "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-2b8-55b-c4seeds/2b855b9bc4seed1/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16",
+    "task_args": "",
+    "num_fewshot": 3,
+    "batch_size": 16,
+    "device": "cuda",
+    "use_cache": false,
+    "limit": 3000,
+    "bootstrap_iters": 10,
+    "seed": 1234
+  }
+}
2b855b9bc4seed1/evaluation/generation/slim.2b855b9bc4seed1_gem_xsum_article_DOC_summary_4.json
ADDED
@@ -0,0 +1,133 @@
+{
+  "results": [
+    {
+      "task_name": "gem_xsum",
+      "prompt_name": "article_DOC_summary",
+      "rouge1_precision": 0.0383955897314317,
+      "dataset_path": "GEM/xsum",
+      "dataset_name": null,
+      "subset": "",
+      "rouge1_precision_stderr": 0.0025422863710305266
+    },
+    {
+      "task_name": "gem_xsum",
+      "prompt_name": "article_DOC_summary",
+      "rouge1_recall": 0.06915759782304308,
+      "dataset_path": "GEM/xsum",
+      "dataset_name": null,
+      "subset": "",
+      "rouge1_recall_stderr": 0.0039608171509673586
+    },
+    {
+      "task_name": "gem_xsum",
+      "prompt_name": "article_DOC_summary",
+      "rouge1_fmeasure": 0.04426046707797172,
+      "dataset_path": "GEM/xsum",
+      "dataset_name": null,
+      "subset": "",
+      "rouge1_fmeasure_stderr": 0.0024997989992097624
+    },
+    {
+      "task_name": "gem_xsum",
+      "prompt_name": "article_DOC_summary",
+      "rouge2_precision": 0.006481358010154669,
+      "dataset_path": "GEM/xsum",
+      "dataset_name": null,
+      "subset": "",
+      "rouge2_precision_stderr": 0.0007393187434393857
+    },
+    {
+      "task_name": "gem_xsum",
+      "prompt_name": "article_DOC_summary",
+      "rouge2_recall": 0.013044537342091751,
+      "dataset_path": "GEM/xsum",
+      "dataset_name": null,
+      "subset": "",
+      "rouge2_recall_stderr": 0.00125152402986618
+    },
+    {
+      "task_name": "gem_xsum",
+      "prompt_name": "article_DOC_summary",
+      "rouge2_fmeasure": 0.008141029291990268,
+      "dataset_path": "GEM/xsum",
+      "dataset_name": null,
+      "subset": "",
+      "rouge2_fmeasure_stderr": 0.0008202019731008721
+    },
+    {
+      "task_name": "gem_xsum",
+      "prompt_name": "article_DOC_summary",
+      "rougeL_precision": 0.031113719615522017,
+      "dataset_path": "GEM/xsum",
+      "dataset_name": null,
+      "subset": "",
+      "rougeL_precision_stderr": 0.0021898843752975964
+    },
+    {
+      "task_name": "gem_xsum",
+      "prompt_name": "article_DOC_summary",
+      "rougeL_recall": 0.055197273257753664,
+      "dataset_path": "GEM/xsum",
+      "dataset_name": null,
+      "subset": "",
+      "rougeL_recall_stderr": 0.003191098630834803
+    },
+    {
+      "task_name": "gem_xsum",
+      "prompt_name": "article_DOC_summary",
+      "rougeL_fmeasure": 0.035265847972026074,
+      "dataset_path": "GEM/xsum",
+      "dataset_name": null,
+      "subset": "",
+      "rougeL_fmeasure_stderr": 0.0020006364168593496
+    },
+    {
+      "task_name": "gem_xsum",
+      "prompt_name": "article_DOC_summary",
+      "rougeLsum_precision": 0.03210804984465074,
+      "dataset_path": "GEM/xsum",
+      "dataset_name": null,
+      "subset": "",
+      "rougeLsum_precision_stderr": 0.0022623416009452243
+    },
+    {
+      "task_name": "gem_xsum",
+      "prompt_name": "article_DOC_summary",
+      "rougeLsum_recall": 0.05680048575290953,
+      "dataset_path": "GEM/xsum",
+      "dataset_name": null,
+      "subset": "",
+      "rougeLsum_recall_stderr": 0.0032931638988068055
+    },
+    {
+      "task_name": "gem_xsum",
+      "prompt_name": "article_DOC_summary",
+      "rougeLsum_fmeasure": 0.03632930315349291,
+      "dataset_path": "GEM/xsum",
+      "dataset_name": null,
+      "subset": "",
+      "rougeLsum_fmeasure_stderr": 0.0020756841173550696
+    },
+    {
+      "task_name": "gem_xsum",
+      "prompt_name": "article_DOC_summary",
+      "bleu": 0.6055404023350197,
+      "dataset_path": "GEM/xsum",
+      "dataset_name": null,
+      "subset": "",
+      "bleu_stderr": 0.09409517819503949
+    }
+  ],
+  "config": {
+    "model": "hf-causal",
+    "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-2b8-55b-c4seeds/2b855b9bc4seed1/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16",
+    "task_args": "",
+    "num_fewshot": 4,
+    "batch_size": 16,
+    "device": "cuda",
+    "use_cache": false,
+    "limit": 3000,
+    "bootstrap_iters": 10,
+    "seed": 1234
+  }
+}
2b855b9bc4seed1/evaluation/generation/slim.2b855b9bc4seed1_gem_xsum_article_DOC_summary_5.json
ADDED
@@ -0,0 +1,133 @@
+{
+  "results": [
+    {
+      "task_name": "gem_xsum",
+      "prompt_name": "article_DOC_summary",
+      "rouge1_precision": 0.002821793157093116,
+      "dataset_path": "GEM/xsum",
+      "dataset_name": null,
+      "subset": "",
+      "rouge1_precision_stderr": 0.0008106295472902334
+    },
+    {
+      "task_name": "gem_xsum",
+      "prompt_name": "article_DOC_summary",
+      "rouge1_recall": 0.0022851464113943067,
+      "dataset_path": "GEM/xsum",
+      "dataset_name": null,
+      "subset": "",
+      "rouge1_recall_stderr": 0.0006484962567584845
+    },
+    {
+      "task_name": "gem_xsum",
+      "prompt_name": "article_DOC_summary",
+      "rouge1_fmeasure": 0.0024831420596845987,
+      "dataset_path": "GEM/xsum",
+      "dataset_name": null,
+      "subset": "",
+      "rouge1_fmeasure_stderr": 0.0007070331562075694
+    },
+    {
+      "task_name": "gem_xsum",
+      "prompt_name": "article_DOC_summary",
+      "rouge2_precision": 0.0002914830883754302,
+      "dataset_path": "GEM/xsum",
+      "dataset_name": null,
+      "subset": "",
+      "rouge2_precision_stderr": 0.00020340968454234058
+    },
+    {
+      "task_name": "gem_xsum",
+      "prompt_name": "article_DOC_summary",
+      "rouge2_recall": 0.00022401756585996315,
+      "dataset_path": "GEM/xsum",
+      "dataset_name": null,
+      "subset": "",
+      "rouge2_recall_stderr": 0.0001545711517822221
+    },
+    {
+      "task_name": "gem_xsum",
+      "prompt_name": "article_DOC_summary",
+      "rouge2_fmeasure": 0.0002519246692387647,
+      "dataset_path": "GEM/xsum",
+      "dataset_name": null,
+      "subset": "",
+      "rouge2_fmeasure_stderr": 0.00017508831809454146
+    },
+    {
+      "task_name": "gem_xsum",
+      "prompt_name": "article_DOC_summary",
+      "rougeL_precision": 0.0021547453202339587,
+      "dataset_path": "GEM/xsum",
+      "dataset_name": null,
+      "subset": "",
+      "rougeL_precision_stderr": 0.0006563220446223475
+    },
+    {
+      "task_name": "gem_xsum",
+      "prompt_name": "article_DOC_summary",
+      "rougeL_recall": 0.0017708458456009723,
+      "dataset_path": "GEM/xsum",
+      "dataset_name": null,
+      "subset": "",
+      "rougeL_recall_stderr": 0.0005325445996971974
+    },
+    {
+      "task_name": "gem_xsum",
+      "prompt_name": "article_DOC_summary",
+      "rougeL_fmeasure": 0.001910475794160408,
+      "dataset_path": "GEM/xsum",
+      "dataset_name": null,
+      "subset": "",
+      "rougeL_fmeasure_stderr": 0.0005775932958294451
+    },
+    {
+      "task_name": "gem_xsum",
+      "prompt_name": "article_DOC_summary",
+      "rougeLsum_precision": 0.002307213397230338,
+      "dataset_path": "GEM/xsum",
+      "dataset_name": null,
+      "subset": "",
+      "rougeLsum_precision_stderr": 0.0006934611042748366
+    },
+    {
+      "task_name": "gem_xsum",
+      "prompt_name": "article_DOC_summary",
+      "rougeLsum_recall": 0.001892317665185999,
+      "dataset_path": "GEM/xsum",
+      "dataset_name": null,
+      "subset": "",
+      "rougeLsum_recall_stderr": 0.0005664327179419651
+    },
+    {
+      "task_name": "gem_xsum",
+      "prompt_name": "article_DOC_summary",
+      "rougeLsum_fmeasure": 0.0020447516574242644,
+      "dataset_path": "GEM/xsum",
+      "dataset_name": null,
+      "subset": "",
+      "rougeLsum_fmeasure_stderr": 0.000612874204929875
+    },
+    {
+      "task_name": "gem_xsum",
+      "prompt_name": "article_DOC_summary",
+      "bleu": 1.1094899638874875e-37,
+      "dataset_path": "GEM/xsum",
+      "dataset_name": null,
+      "subset": "",
+      "bleu_stderr": 1.0662006738968273e-31
+    }
+  ],
+  "config": {
+    "model": "hf-causal",
+    "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-2b8-55b-c4seeds/2b855b9bc4seed1/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16",
+    "task_args": "",
+    "num_fewshot": 5,
+    "batch_size": 16,
+    "device": "cuda",
+    "use_cache": false,
+    "limit": 3000,
+    "bootstrap_iters": 10,
+    "seed": 1234
+  }
+}
2b855b9bc4seed1/evaluation/rankeval/2b855b9bc4seed1_0.csv
ADDED
@@ -0,0 +1,21 @@
+task,metric,value,err,version
+anli_r1,acc,0.34,0.014987482264363937,0
+anli_r2,acc,0.333,0.014910846164229864,0
+anli_r3,acc,0.3433333333333333,0.01371263383046586,0
+arc_challenge,acc,0.2525597269624573,0.012696728980207708,0
+arc_challenge,acc_norm,0.28071672354948807,0.013131238126975583,0
+arc_easy,acc,0.5631313131313131,0.010177672928157685,0
+arc_easy,acc_norm,0.49747474747474746,0.01025965266878347,0
+boolq,acc,0.5351681957186545,0.008723396352960192,1
+cb,acc,0.44642857142857145,0.06703189227942398,1
+cb,f1,0.2956393200295639,,1
+copa,acc,0.71,0.04560480215720684,0
+hellaswag,acc,0.43328022306313485,0.004945157565218188,0
+hellaswag,acc_norm,0.5569607647878908,0.004957296691391566,0
+piqa,acc,0.7442872687704026,0.010178690109459855,0
+piqa,acc_norm,0.750272034820457,0.010099232969867469,0
+rte,acc,0.5415162454873647,0.029992535385373317,0
+sciq,acc,0.805,0.012535235623319329,0
+sciq,acc_norm,0.71,0.014356395999905684,0
+storycloze_2016,acc,0.6916087653661144,0.0106797344454878,0
+winogrande,acc,0.5477505919494869,0.013988256216606007,0
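The rankeval CSVs in this commit share the header task,metric,value,err,version; the cb,f1 row leaves err empty. A short sketch (file path illustrative, not part of the commit) for reading one:

import csv

def read_rankeval(path):
    # Yield (task, metric, value, err-or-None, version) per data row.
    with open(path, newline="") as f:
        for row in csv.DictReader(f):
            err = float(row["err"]) if row["err"] else None
            yield row["task"], row["metric"], float(row["value"]), err, int(row["version"])

for task, metric, value, err, version in read_rankeval("2b855b9bc4seed1_0.csv"):
    print(task, metric, value, err, version)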
2b855b9bc4seed1/evaluation/rankeval/2b855b9bc4seed1_0_lm-eval_global_step52452_2023-02-24-23-57-47_0shots_backup.json
DELETED
@@ -1,87 +0,0 @@
-{
-  "results": {
-    "anli_r1": {
-      "acc": 0.34,
-      "acc_stderr": 0.014987482264363937
-    },
-    "anli_r2": {
-      "acc": 0.333,
-      "acc_stderr": 0.014910846164229864
-    },
-    "anli_r3": {
-      "acc": 0.3433333333333333,
-      "acc_stderr": 0.01371263383046586
-    },
-    "cb": {
-      "acc": 0.44642857142857145,
-      "acc_stderr": 0.06703189227942398,
-      "f1": 0.2956393200295639
-    },
-    "copa": {
-      "acc": 0.71,
-      "acc_stderr": 0.04560480215720684
-    },
-    "hellaswag": {
-      "acc": 0.43328022306313485,
-      "acc_stderr": 0.004945157565218188,
-      "acc_norm": 0.5569607647878908,
-      "acc_norm_stderr": 0.004957296691391566
-    },
-    "rte": {
-      "acc": 0.5415162454873647,
-      "acc_stderr": 0.029992535385373317
-    },
-    "winogrande": {
-      "acc": 0.5477505919494869,
-      "acc_stderr": 0.013988256216606007
-    },
-    "storycloze_2016": {
-      "acc": 0.6916087653661144,
-      "acc_stderr": 0.0106797344454878
-    },
-    "boolq": {
-      "acc": 0.5351681957186545,
-      "acc_stderr": 0.008723396352960192
-    },
-    "arc_easy": {
-      "acc": 0.5631313131313131,
-      "acc_stderr": 0.010177672928157685,
-      "acc_norm": 0.49747474747474746,
-      "acc_norm_stderr": 0.01025965266878347
-    },
-    "arc_challenge": {
-      "acc": 0.2525597269624573,
-      "acc_stderr": 0.012696728980207708,
-      "acc_norm": 0.28071672354948807,
-      "acc_norm_stderr": 0.013131238126975583
-    },
-    "sciq": {
-      "acc": 0.805,
-      "acc_stderr": 0.012535235623319329,
-      "acc_norm": 0.71,
-      "acc_norm_stderr": 0.014356395999905684
-    },
-    "piqa": {
-      "acc": 0.7442872687704026,
-      "acc_stderr": 0.010178690109459855,
-      "acc_norm": 0.750272034820457,
-      "acc_norm_stderr": 0.010099232969867469
-    }
-  },
-  "versions": {
-    "anli_r1": 0,
-    "anli_r2": 0,
-    "anli_r3": 0,
-    "cb": 1,
-    "copa": 0,
-    "hellaswag": 0,
-    "rte": 0,
-    "winogrande": 0,
-    "storycloze_2016": 0,
-    "boolq": 1,
-    "arc_easy": 0,
-    "arc_challenge": 0,
-    "sciq": 0,
-    "piqa": 0
-  }
-}
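The deleted *_backup.json above holds the same numbers as the 2b855b9bc4seed1_0.csv added before it, just nested under "results" and "versions". A sketch of a converter that would reproduce the CSV from lm-eval JSON of this shape (an assumption about how the CSVs could be derived, not a script from the repository):

import csv, json

def json_to_rows(path):
    with open(path) as f:
        data = json.load(f)
    for task in sorted(data["results"]):  # CSV rows appear sorted by task
        for metric, value in data["results"][task].items():
            if metric.endswith("_stderr"):
                continue
            err = data["results"][task].get(metric + "_stderr", "")
            yield [task, metric, value, err, data["versions"][task]]

with open("2b855b9bc4seed1_0.csv", "w", newline="") as f:
    w = csv.writer(f)
    w.writerow(["task", "metric", "value", "err", "version"])
    w.writerows(json_to_rows("2b855b9bc4seed1_0_lm-eval_global_step52452_2023-02-24-23-57-47_0shots_backup.json"))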
2b855b9bc4seed1/evaluation/rankeval/2b855b9bc4seed1_1.csv
ADDED
@@ -0,0 +1,21 @@
+task,metric,value,err,version
+anli_r1,acc,0.323,0.014794927843348635,0
+anli_r2,acc,0.348,0.01507060460376841,0
+anli_r3,acc,0.3416666666666667,0.013696658778002519,0
+arc_challenge,acc,0.26023890784982934,0.012821930225112566,0
+arc_challenge,acc_norm,0.27986348122866894,0.013119040897725923,0
+arc_easy,acc,0.5740740740740741,0.010146568651002255,0
+arc_easy,acc_norm,0.5332491582491582,0.010237073872130745,0
+boolq,acc,0.555045871559633,0.008691897543539221,1
+cb,acc,0.5178571428571429,0.06737697508644648,1
+cb,f1,0.3656150648080215,,1
+copa,acc,0.74,0.04408440022768078,0
+hellaswag,acc,0.4282015534754033,0.004938068627349495,0
+hellaswag,acc_norm,0.555964947221669,0.0049584261524818945,0
+piqa,acc,0.7306855277475517,0.010350004070588758,0
+piqa,acc_norm,0.7383025027203483,0.01025563077270823,0
+rte,acc,0.5415162454873647,0.029992535385373314,0
+sciq,acc,0.845,0.011450157470799471,0
+sciq,acc_norm,0.811,0.012386784588117714,0
+storycloze_2016,acc,0.6787814003206841,0.010798029402794916,0
+winogrande,acc,0.5564325177584846,0.0139626949076204,0
2b855b9bc4seed1/evaluation/rankeval/2b855b9bc4seed1_1.json
CHANGED
@@ -54,6 +54,18 @@
       "acc_stderr": 0.012821930225112566,
       "acc_norm": 0.27986348122866894,
       "acc_norm_stderr": 0.013119040897725923
+    },
+    "sciq": {
+      "acc": 0.845,
+      "acc_stderr": 0.011450157470799471,
+      "acc_norm": 0.811,
+      "acc_norm_stderr": 0.012386784588117714
+    },
+    "piqa": {
+      "acc": 0.7306855277475517,
+      "acc_stderr": 0.010350004070588758,
+      "acc_norm": 0.7383025027203483,
+      "acc_norm_stderr": 0.01025563077270823
     }
   },
   "versions": {
@@ -68,6 +80,8 @@
     "storycloze_2016": 0,
     "boolq": 1,
     "arc_easy": 0,
-    "arc_challenge": 0
+    "arc_challenge": 0,
+    "sciq": 0,
+    "piqa": 0
   }
 }
2b855b9bc4seed1/evaluation/rankeval/2b855b9bc4seed1_1_lm-eval_global_step52452_2023-02-24-23-57-47_1shots_backup.json
DELETED
@@ -1,73 +0,0 @@
-{
-  "results": {
-    "anli_r1": {
-      "acc": 0.323,
-      "acc_stderr": 0.014794927843348635
-    },
-    "anli_r2": {
-      "acc": 0.348,
-      "acc_stderr": 0.01507060460376841
-    },
-    "anli_r3": {
-      "acc": 0.3416666666666667,
-      "acc_stderr": 0.013696658778002519
-    },
-    "cb": {
-      "acc": 0.5178571428571429,
-      "acc_stderr": 0.06737697508644648,
-      "f1": 0.3656150648080215
-    },
-    "copa": {
-      "acc": 0.74,
-      "acc_stderr": 0.04408440022768078
-    },
-    "hellaswag": {
-      "acc": 0.4282015534754033,
-      "acc_stderr": 0.004938068627349495,
-      "acc_norm": 0.555964947221669,
-      "acc_norm_stderr": 0.0049584261524818945
-    },
-    "rte": {
-      "acc": 0.5415162454873647,
-      "acc_stderr": 0.029992535385373314
-    },
-    "winogrande": {
-      "acc": 0.5564325177584846,
-      "acc_stderr": 0.0139626949076204
-    },
-    "storycloze_2016": {
-      "acc": 0.6787814003206841,
-      "acc_stderr": 0.010798029402794916
-    },
-    "boolq": {
-      "acc": 0.555045871559633,
-      "acc_stderr": 0.008691897543539221
-    },
-    "arc_easy": {
-      "acc": 0.5740740740740741,
-      "acc_stderr": 0.010146568651002255,
-      "acc_norm": 0.5332491582491582,
-      "acc_norm_stderr": 0.010237073872130745
-    },
-    "arc_challenge": {
-      "acc": 0.26023890784982934,
-      "acc_stderr": 0.012821930225112566,
-      "acc_norm": 0.27986348122866894,
-      "acc_norm_stderr": 0.013119040897725923
-    }
-  },
-  "versions": {
-    "anli_r1": 0,
-    "anli_r2": 0,
-    "anli_r3": 0,
-    "cb": 1,
-    "copa": 0,
-    "hellaswag": 0,
-    "rte": 0,
-    "winogrande": 0,
-    "storycloze_2016": 0,
-    "boolq": 1,
-    "arc_easy": 0,
-    "arc_challenge": 0
-  }
-}
2b855b9bc4seed1/evaluation/rankeval/2b855b9bc4seed1_2.csv
ADDED
@@ -0,0 +1,21 @@
+task,metric,value,err,version
+anli_r1,acc,0.32,0.014758652303574874,0
+anli_r2,acc,0.334,0.014922019523732968,0
+anli_r3,acc,0.345,0.013728421539454878,0
+arc_challenge,acc,0.24658703071672355,0.012595726268790122,0
+arc_challenge,acc_norm,0.29180887372013653,0.01328452529240351,0
+arc_easy,acc,0.5862794612794613,0.010105878530238144,0
+arc_easy,acc_norm,0.569023569023569,0.010161552863493746,0
+boolq,acc,0.5504587155963303,0.008700409761350796,1
+cb,acc,0.42857142857142855,0.06672848092813058,1
+cb,f1,0.291005291005291,,1
+copa,acc,0.71,0.045604802157206845,0
+hellaswag,acc,0.42929695279824737,0.004939642460172587,0
+hellaswag,acc_norm,0.5593507269468233,0.004954503606471611,0
+piqa,acc,0.7328618063112078,0.010323440492612437,0
+piqa,acc_norm,0.735582154515778,0.010289787244767156,0
+rte,acc,0.49097472924187724,0.030091559826331334,0
+sciq,acc,0.859,0.011010914595992443,0
+sciq,acc_norm,0.83,0.011884495834541663,0
+storycloze_2016,acc,0.686798503474078,0.010725209422929404,0
+winogrande,acc,0.5611681136543015,0.013946933444507032,0
2b855b9bc4seed1/evaluation/rankeval/2b855b9bc4seed1_2.json
CHANGED
@@ -38,6 +38,34 @@
     "storycloze_2016": {
       "acc": 0.686798503474078,
       "acc_stderr": 0.010725209422929404
+    },
+    "boolq": {
+      "acc": 0.5504587155963303,
+      "acc_stderr": 0.008700409761350796
+    },
+    "arc_easy": {
+      "acc": 0.5862794612794613,
+      "acc_stderr": 0.010105878530238144,
+      "acc_norm": 0.569023569023569,
+      "acc_norm_stderr": 0.010161552863493746
+    },
+    "arc_challenge": {
+      "acc": 0.24658703071672355,
+      "acc_stderr": 0.012595726268790122,
+      "acc_norm": 0.29180887372013653,
+      "acc_norm_stderr": 0.01328452529240351
+    },
+    "sciq": {
+      "acc": 0.859,
+      "acc_stderr": 0.011010914595992443,
+      "acc_norm": 0.83,
+      "acc_norm_stderr": 0.011884495834541663
+    },
+    "piqa": {
+      "acc": 0.7328618063112078,
+      "acc_stderr": 0.010323440492612437,
+      "acc_norm": 0.735582154515778,
+      "acc_norm_stderr": 0.010289787244767156
     }
   },
   "versions": {
@@ -49,6 +77,11 @@
     "hellaswag": 0,
     "rte": 0,
     "winogrande": 0,
-    "storycloze_2016": 0
+    "storycloze_2016": 0,
+    "boolq": 1,
+    "arc_easy": 0,
+    "arc_challenge": 0,
+    "sciq": 0,
+    "piqa": 0
   }
 }
2b855b9bc4seed1/evaluation/rankeval/2b855b9bc4seed1_2_lm-eval_global_step52452_2023-02-24-23-57-47_2shots_backup.json
DELETED
@@ -1,54 +0,0 @@
-{
-  "results": {
-    "anli_r1": {
-      "acc": 0.32,
-      "acc_stderr": 0.014758652303574874
-    },
-    "anli_r2": {
-      "acc": 0.334,
-      "acc_stderr": 0.014922019523732968
-    },
-    "anli_r3": {
-      "acc": 0.345,
-      "acc_stderr": 0.013728421539454878
-    },
-    "cb": {
-      "acc": 0.42857142857142855,
-      "acc_stderr": 0.06672848092813058,
-      "f1": 0.291005291005291
-    },
-    "copa": {
-      "acc": 0.71,
-      "acc_stderr": 0.045604802157206845
-    },
-    "hellaswag": {
-      "acc": 0.42929695279824737,
-      "acc_stderr": 0.004939642460172587,
-      "acc_norm": 0.5593507269468233,
-      "acc_norm_stderr": 0.004954503606471611
-    },
-    "rte": {
-      "acc": 0.49097472924187724,
-      "acc_stderr": 0.030091559826331334
-    },
-    "winogrande": {
-      "acc": 0.5611681136543015,
-      "acc_stderr": 0.013946933444507032
-    },
-    "storycloze_2016": {
-      "acc": 0.686798503474078,
-      "acc_stderr": 0.010725209422929404
-    }
-  },
-  "versions": {
-    "anli_r1": 0,
-    "anli_r2": 0,
-    "anli_r3": 0,
-    "cb": 1,
-    "copa": 0,
-    "hellaswag": 0,
-    "rte": 0,
-    "winogrande": 0,
-    "storycloze_2016": 0
-  }
-}
2b855b9bc4seed1/evaluation/rankeval/2b855b9bc4seed1_3.csv
ADDED
@@ -0,0 +1,21 @@
+task,metric,value,err,version
+anli_r1,acc,0.325,0.014818724459095524,0
+anli_r2,acc,0.35,0.015090650341444231,0
+anli_r3,acc,0.33166666666666667,0.013596836729485157,0
+arc_challenge,acc,0.26109215017064846,0.012835523909473841,0
+arc_challenge,acc_norm,0.2815699658703072,0.013143376735009022,0
+arc_easy,acc,0.5925925925925926,0.010082326627832858,0
+arc_easy,acc_norm,0.5728114478114478,0.010150415974210871,0
+boolq,acc,0.5648318042813456,0.008671229580582113,1
+cb,acc,0.5357142857142857,0.06724777654937658,1
+cb,f1,0.4369505854187708,,1
+copa,acc,0.75,0.04351941398892446,0
+hellaswag,acc,0.427504481179048,0.0049370542337115715,0
+hellaswag,acc_norm,0.5574586735710018,0.004956724392646532,0
+piqa,acc,0.7404787812840044,0.010227939888173918,0
+piqa,acc_norm,0.7372143634385201,0.010269354068140776,0
+rte,acc,0.5523465703971119,0.02993107036293953,0
+sciq,acc,0.857,0.011075814808567038,0
+sciq,acc_norm,0.843,0.011510146979230187,0
+storycloze_2016,acc,0.6835916622127205,0.01075478009794089,0
+winogrande,acc,0.5430149960536701,0.01400038676159829,0
2b855b9bc4seed1/evaluation/rankeval/2b855b9bc4seed1_3.json
CHANGED
@@ -26,6 +26,46 @@
       "acc_stderr": 0.0049370542337115715,
       "acc_norm": 0.5574586735710018,
       "acc_norm_stderr": 0.004956724392646532
+    },
+    "rte": {
+      "acc": 0.5523465703971119,
+      "acc_stderr": 0.02993107036293953
+    },
+    "winogrande": {
+      "acc": 0.5430149960536701,
+      "acc_stderr": 0.01400038676159829
+    },
+    "storycloze_2016": {
+      "acc": 0.6835916622127205,
+      "acc_stderr": 0.01075478009794089
+    },
+    "boolq": {
+      "acc": 0.5648318042813456,
+      "acc_stderr": 0.008671229580582113
+    },
+    "arc_easy": {
+      "acc": 0.5925925925925926,
+      "acc_stderr": 0.010082326627832858,
+      "acc_norm": 0.5728114478114478,
+      "acc_norm_stderr": 0.010150415974210871
+    },
+    "arc_challenge": {
+      "acc": 0.26109215017064846,
+      "acc_stderr": 0.012835523909473841,
+      "acc_norm": 0.2815699658703072,
+      "acc_norm_stderr": 0.013143376735009022
+    },
+    "sciq": {
+      "acc": 0.857,
+      "acc_stderr": 0.011075814808567038,
+      "acc_norm": 0.843,
+      "acc_norm_stderr": 0.011510146979230187
+    },
+    "piqa": {
+      "acc": 0.7404787812840044,
+      "acc_stderr": 0.010227939888173918,
+      "acc_norm": 0.7372143634385201,
+      "acc_norm_stderr": 0.010269354068140776
     }
   },
   "versions": {
@@ -34,6 +74,14 @@
     "anli_r3": 0,
     "cb": 1,
     "copa": 0,
-    "hellaswag": 0
+    "hellaswag": 0,
+    "rte": 0,
+    "winogrande": 0,
+    "storycloze_2016": 0,
+    "boolq": 1,
+    "arc_easy": 0,
+    "arc_challenge": 0,
+    "sciq": 0,
+    "piqa": 0
   }
 }
2b855b9bc4seed1/evaluation/rankeval/2b855b9bc4seed1_3_lm-eval_global_step52452_2023-02-24-23-57-47_3shots_backup.json
DELETED
@@ -1,39 +0,0 @@
-{
-  "results": {
-    "anli_r1": {
-      "acc": 0.325,
-      "acc_stderr": 0.014818724459095524
-    },
-    "anli_r2": {
-      "acc": 0.35,
-      "acc_stderr": 0.015090650341444231
-    },
-    "anli_r3": {
-      "acc": 0.33166666666666667,
-      "acc_stderr": 0.013596836729485157
-    },
-    "cb": {
-      "acc": 0.5357142857142857,
-      "acc_stderr": 0.06724777654937658,
-      "f1": 0.4369505854187708
-    },
-    "copa": {
-      "acc": 0.75,
-      "acc_stderr": 0.04351941398892446
-    },
-    "hellaswag": {
-      "acc": 0.427504481179048,
-      "acc_stderr": 0.0049370542337115715,
-      "acc_norm": 0.5574586735710018,
-      "acc_norm_stderr": 0.004956724392646532
-    }
-  },
-  "versions": {
-    "anli_r1": 0,
-    "anli_r2": 0,
-    "anli_r3": 0,
-    "cb": 1,
-    "copa": 0,
-    "hellaswag": 0
-  }
-}
2b855b9bc4seed1/evaluation/rankeval/2b855b9bc4seed1_4.csv
ADDED
@@ -0,0 +1,21 @@
+task,metric,value,err,version
+anli_r1,acc,0.342,0.01500870618212173,0
+anli_r2,acc,0.367,0.015249378464171754,0
+anli_r3,acc,0.35333333333333333,0.01380457216231493,0
+arc_challenge,acc,0.26109215017064846,0.012835523909473848,0
+arc_challenge,acc_norm,0.29266211604095566,0.013295916103619417,0
+arc_easy,acc,0.593013468013468,0.010080695355466598,0
+arc_easy,acc_norm,0.5707070707070707,0.010156678075911096,0
+boolq,acc,0.5724770642201835,0.008652692997177334,1
+cb,acc,0.44642857142857145,0.067031892279424,1
+cb,f1,0.26007168458781366,,1
+copa,acc,0.72,0.04512608598542127,0
+hellaswag,acc,0.42710615415255926,0.004936470085238484,0
+hellaswag,acc_norm,0.5613423620792671,0.004952087083128899,0
+piqa,acc,0.7323177366702938,0.010330111189370432,0
+piqa,acc_norm,0.7366702937976061,0.010276185322196764,0
+rte,acc,0.4981949458483754,0.030096267148976626,0
+sciq,acc,0.866,0.010777762298369685,0
+sciq,acc_norm,0.858,0.011043457699378227,0
+storycloze_2016,acc,0.6873329770176376,0.01072022317295317,0
+winogrande,acc,0.5501183898973955,0.01398171190404973,0
2b855b9bc4seed1/evaluation/rankeval/2b855b9bc4seed1_4.json
CHANGED
@@ -20,6 +20,52 @@
     "copa": {
       "acc": 0.72,
       "acc_stderr": 0.04512608598542127
+    },
+    "hellaswag": {
+      "acc": 0.42710615415255926,
+      "acc_stderr": 0.004936470085238484,
+      "acc_norm": 0.5613423620792671,
+      "acc_norm_stderr": 0.004952087083128899
+    },
+    "rte": {
+      "acc": 0.4981949458483754,
+      "acc_stderr": 0.030096267148976626
+    },
+    "winogrande": {
+      "acc": 0.5501183898973955,
+      "acc_stderr": 0.01398171190404973
+    },
+    "storycloze_2016": {
+      "acc": 0.6873329770176376,
+      "acc_stderr": 0.01072022317295317
+    },
+    "boolq": {
+      "acc": 0.5724770642201835,
+      "acc_stderr": 0.008652692997177334
+    },
+    "arc_easy": {
+      "acc": 0.593013468013468,
+      "acc_stderr": 0.010080695355466598,
+      "acc_norm": 0.5707070707070707,
+      "acc_norm_stderr": 0.010156678075911096
+    },
+    "arc_challenge": {
+      "acc": 0.26109215017064846,
+      "acc_stderr": 0.012835523909473848,
+      "acc_norm": 0.29266211604095566,
+      "acc_norm_stderr": 0.013295916103619417
+    },
+    "sciq": {
+      "acc": 0.866,
+      "acc_stderr": 0.010777762298369685,
+      "acc_norm": 0.858,
+      "acc_norm_stderr": 0.011043457699378227
+    },
+    "piqa": {
+      "acc": 0.7323177366702938,
+      "acc_stderr": 0.010330111189370432,
+      "acc_norm": 0.7366702937976061,
+      "acc_norm_stderr": 0.010276185322196764
     }
   },
   "versions": {
@@ -27,6 +73,15 @@
     "anli_r2": 0,
     "anli_r3": 0,
     "cb": 1,
-    "copa": 0
+    "copa": 0,
+    "hellaswag": 0,
+    "rte": 0,
+    "winogrande": 0,
+    "storycloze_2016": 0,
+    "boolq": 1,
+    "arc_easy": 0,
+    "arc_challenge": 0,
+    "sciq": 0,
+    "piqa": 0
   }
 }
2b855b9bc4seed1/evaluation/rankeval/2b855b9bc4seed1_5.csv
ADDED
@@ -0,0 +1,21 @@
+task,metric,value,err,version
+anli_r1,acc,0.346,0.015050266127564448,0
+anli_r2,acc,0.344,0.015029633724408947,0
+anli_r3,acc,0.3516666666666667,0.013789711695404789,0
+arc_challenge,acc,0.27047781569965873,0.012980954547659554,0
+arc_challenge,acc_norm,0.2909556313993174,0.013273077865907593,0
+arc_easy,acc,0.5862794612794613,0.010105878530238132,0
+arc_easy,acc_norm,0.5740740740740741,0.010146568651002255,0
+boolq,acc,0.5685015290519878,0.008662594569027314,1
+cb,acc,0.5357142857142857,0.06724777654937658,1
+cb,f1,0.31399711399711405,,1
+copa,acc,0.72,0.04512608598542127,0
+hellaswag,acc,0.4305915156343358,0.004941470620074864,0
+hellaswag,acc_norm,0.5628360884285999,0.004950221546187576,0
+piqa,acc,0.7312295973884657,0.010343392940090011,0
+piqa,acc_norm,0.7415669205658324,0.0102139716367733,0
+rte,acc,0.5451263537906137,0.029973636495415252,0
+sciq,acc,0.866,0.01077776229836968,0
+sciq,acc_norm,0.861,0.010945263761042968,0
+storycloze_2016,acc,0.6814537680384821,0.010774165229761353,0
+winogrande,acc,0.5548539857932123,0.013967662954355491,0
2b855b9bc4seed1/evaluation/rankeval/2b855b9bc4seed1_5.json
CHANGED
@@ -20,6 +20,52 @@
     "copa": {
      "acc": 0.72,
      "acc_stderr": 0.04512608598542127
+    },
+    "hellaswag": {
+      "acc": 0.4305915156343358,
+      "acc_stderr": 0.004941470620074864,
+      "acc_norm": 0.5628360884285999,
+      "acc_norm_stderr": 0.004950221546187576
+    },
+    "rte": {
+      "acc": 0.5451263537906137,
+      "acc_stderr": 0.029973636495415252
+    },
+    "winogrande": {
+      "acc": 0.5548539857932123,
+      "acc_stderr": 0.013967662954355491
+    },
+    "storycloze_2016": {
+      "acc": 0.6814537680384821,
+      "acc_stderr": 0.010774165229761353
+    },
+    "boolq": {
+      "acc": 0.5685015290519878,
+      "acc_stderr": 0.008662594569027314
+    },
+    "arc_easy": {
+      "acc": 0.5862794612794613,
+      "acc_stderr": 0.010105878530238132,
+      "acc_norm": 0.5740740740740741,
+      "acc_norm_stderr": 0.010146568651002255
+    },
+    "arc_challenge": {
+      "acc": 0.27047781569965873,
+      "acc_stderr": 0.012980954547659554,
+      "acc_norm": 0.2909556313993174,
+      "acc_norm_stderr": 0.013273077865907593
+    },
+    "sciq": {
+      "acc": 0.866,
+      "acc_stderr": 0.01077776229836968,
+      "acc_norm": 0.861,
+      "acc_norm_stderr": 0.010945263761042968
+    },
+    "piqa": {
+      "acc": 0.7312295973884657,
+      "acc_stderr": 0.010343392940090011,
+      "acc_norm": 0.7415669205658324,
+      "acc_norm_stderr": 0.0102139716367733
     }
   },
   "versions": {
@@ -27,6 +73,15 @@
     "anli_r2": 0,
     "anli_r3": 0,
     "cb": 1,
-    "copa": 0
+    "copa": 0,
+    "hellaswag": 0,
+    "rte": 0,
+    "winogrande": 0,
+    "storycloze_2016": 0,
+    "boolq": 1,
+    "arc_easy": 0,
+    "arc_challenge": 0,
+    "sciq": 0,
+    "piqa": 0
   }
 }
2b855b9bc4seed2/evaluation/generation/agg.2b855b9bc4seed2_GEM-web_nlg_en_PALM_prompt_0.json
ADDED
@@ -0,0 +1 @@
|
|
|
|
|
1 |
+
{"results": [{"task_name": "GEM/web_nlg_en", "prompt_name": "PALM_prompt", "bleu": 0.3750956835220534, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "3e41305c-5461-4cf3-853d-8a6fb5747623", "prompt_jinja": "I will verbalize an abstract representation of a sentence in natural language. To do so, I will first show the representation and then the natural language. The text needs to include all of the information in the representation.\n\n{{input | join(\", \")}} {% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "bleu_stderr": 0.03303866348324263}, {"task_name": "GEM/web_nlg_en", "prompt_name": "PALM_prompt", "rouge1_precision": 0.06729841211664779, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "3e41305c-5461-4cf3-853d-8a6fb5747623", "prompt_jinja": "I will verbalize an abstract representation of a sentence in natural language. To do so, I will first show the representation and then the natural language. The text needs to include all of the information in the representation.\n\n{{input | join(\", \")}} {% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rouge1_precision_stderr": 0.0014779125598332707}, {"task_name": "GEM/web_nlg_en", "prompt_name": "PALM_prompt", "rouge1_recall": 0.29304332001997996, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "3e41305c-5461-4cf3-853d-8a6fb5747623", "prompt_jinja": "I will verbalize an abstract representation of a sentence in natural language. To do so, I will first show the representation and then the natural language. The text needs to include all of the information in the representation.\n\n{{input | join(\", \")}} {% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rouge1_recall_stderr": 0.004946222762455597}, {"task_name": "GEM/web_nlg_en", "prompt_name": "PALM_prompt", "rouge1_fmeasure": 0.1034915916030452, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "3e41305c-5461-4cf3-853d-8a6fb5747623", "prompt_jinja": "I will verbalize an abstract representation of a sentence in natural language. To do so, I will first show the representation and then the natural language. The text needs to include all of the information in the representation.\n\n{{input | join(\", \")}} {% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rouge1_fmeasure_stderr": 0.0020463645545355925}, {"task_name": "GEM/web_nlg_en", "prompt_name": "PALM_prompt", "rouge2_precision": 0.031583798963941895, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "3e41305c-5461-4cf3-853d-8a6fb5747623", "prompt_jinja": "I will verbalize an abstract representation of a sentence in natural language. To do so, I will first show the representation and then the natural language. 
The text needs to include all of the information in the representation.\n\n{{input | join(\", \")}} {% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rouge2_precision_stderr": 0.0008797642983980407}, {"task_name": "GEM/web_nlg_en", "prompt_name": "PALM_prompt", "rouge2_recall": 0.14233929411373747, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "3e41305c-5461-4cf3-853d-8a6fb5747623", "prompt_jinja": "I will verbalize an abstract representation of a sentence in natural language. To do so, I will first show the representation and then the natural language. The text needs to include all of the information in the representation.\n\n{{input | join(\", \")}} {% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rouge2_recall_stderr": 0.003345640165065958}, {"task_name": "GEM/web_nlg_en", "prompt_name": "PALM_prompt", "rouge2_fmeasure": 0.048745998009714464, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "3e41305c-5461-4cf3-853d-8a6fb5747623", "prompt_jinja": "I will verbalize an abstract representation of a sentence in natural language. To do so, I will first show the representation and then the natural language. The text needs to include all of the information in the representation.\n\n{{input | join(\", \")}} {% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rouge2_fmeasure_stderr": 0.0012510511387985711}, {"task_name": "GEM/web_nlg_en", "prompt_name": "PALM_prompt", "rougeL_precision": 0.06496957050414408, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "3e41305c-5461-4cf3-853d-8a6fb5747623", "prompt_jinja": "I will verbalize an abstract representation of a sentence in natural language. To do so, I will first show the representation and then the natural language. The text needs to include all of the information in the representation.\n\n{{input | join(\", \")}} {% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rougeL_precision_stderr": 0.0013738739239028245}, {"task_name": "GEM/web_nlg_en", "prompt_name": "PALM_prompt", "rougeL_recall": 0.2861388731751563, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "3e41305c-5461-4cf3-853d-8a6fb5747623", "prompt_jinja": "I will verbalize an abstract representation of a sentence in natural language. To do so, I will first show the representation and then the natural language. The text needs to include all of the information in the representation.\n\n{{input | join(\", \")}} {% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rougeL_recall_stderr": 0.004839232451958966}, {"task_name": "GEM/web_nlg_en", "prompt_name": "PALM_prompt", "rougeL_fmeasure": 0.10020083541023005, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "3e41305c-5461-4cf3-853d-8a6fb5747623", "prompt_jinja": "I will verbalize an abstract representation of a sentence in natural language. To do so, I will first show the representation and then the natural language. 
The text needs to include all of the information in the representation.\n\n{{input | join(\", \")}} {% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rougeL_fmeasure_stderr": 0.0019268832996130537}, {"task_name": "GEM/web_nlg_en", "prompt_name": "PALM_prompt", "rougeLsum_precision": 0.06418206519400182, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "3e41305c-5461-4cf3-853d-8a6fb5747623", "prompt_jinja": "I will verbalize an abstract representation of a sentence in natural language. To do so, I will first show the representation and then the natural language. The text needs to include all of the information in the representation.\n\n{{input | join(\", \")}} {% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rougeLsum_precision_stderr": 0.001390783570882245}, {"task_name": "GEM/web_nlg_en", "prompt_name": "PALM_prompt", "rougeLsum_recall": 0.2794417864990063, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "3e41305c-5461-4cf3-853d-8a6fb5747623", "prompt_jinja": "I will verbalize an abstract representation of a sentence in natural language. To do so, I will first show the representation and then the natural language. The text needs to include all of the information in the representation.\n\n{{input | join(\", \")}} {% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rougeLsum_recall_stderr": 0.004636157223173928}, {"task_name": "GEM/web_nlg_en", "prompt_name": "PALM_prompt", "rougeLsum_fmeasure": 0.09872549675778793, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "3e41305c-5461-4cf3-853d-8a6fb5747623", "prompt_jinja": "I will verbalize an abstract representation of a sentence in natural language. To do so, I will first show the representation and then the natural language. The text needs to include all of the information in the representation.\n\n{{input | join(\", \")}} {% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rougeLsum_fmeasure_stderr": 0.0019292683793847847}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-2b8-55b-c4seeds/2b855b9bc4seed2/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 0, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}}
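Note on the blob above: every entry in "results" repeats the same promptsource metadata, and the prompt_jinja field carries the full PALM_prompt template, where everything before the ||| separator is shown to the model and everything after it is a reference. A minimal sketch of rendering that template with jinja2 follows; the WebNLG triple and reference are invented for illustration and do not come from the evaluation data.

from jinja2 import Template

# Template text copied from the prompt_jinja field above.
PALM_PROMPT = (
    "I will verbalize an abstract representation of a sentence in natural "
    "language. To do so, I will first show the representation and then the "
    "natural language. The text needs to include all of the information in "
    "the representation.\n\n"
    '{{input | join(", ")}} {% for i in references %}\n ||| {{ i }} \n{% endfor %}'
)

# Hypothetical WebNLG-style example; the keys match the template variables.
example = {
    "input": ["Alan_Bean | birthPlace | Wheeler,_Texas"],
    "references": ["Alan Bean was born in Wheeler, Texas."],
}

rendered = Template(PALM_PROMPT).render(**example)
prompt, reference = rendered.split("|||")  # the model only sees the left side
print(prompt.strip())
print("reference:", reference.strip())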
2b855b9bc4seed2/evaluation/generation/agg.2b855b9bc4seed2_GEM-wiki_lingua_en_tldr_en_0.json
ADDED
@@ -0,0 +1 @@
+{"results": [{"task_name": "GEM/wiki_lingua_en", "prompt_name": "tldr_en", "rouge1_precision": 0.15312528607963066, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "d3c5baa3-5e37-46f8-b1b2-5b834181c9da", "prompt_jinja": "{{source}}\n\nTL;DR in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge1_precision_stderr": 0.0018475398053558751}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "tldr_en", "rouge1_recall": 0.26069676849871193, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "d3c5baa3-5e37-46f8-b1b2-5b834181c9da", "prompt_jinja": "{{source}}\n\nTL;DR in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge1_recall_stderr": 0.002553411390285405}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "tldr_en", "rouge1_fmeasure": 0.17949455554206634, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "d3c5baa3-5e37-46f8-b1b2-5b834181c9da", "prompt_jinja": "{{source}}\n\nTL;DR in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge1_fmeasure_stderr": 0.001796797335172746}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "tldr_en", "rouge2_precision": 0.03025768195820637, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "d3c5baa3-5e37-46f8-b1b2-5b834181c9da", "prompt_jinja": "{{source}}\n\nTL;DR in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge2_precision_stderr": 0.000737285788609388}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "tldr_en", "rouge2_recall": 0.0526377509392788, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "d3c5baa3-5e37-46f8-b1b2-5b834181c9da", "prompt_jinja": "{{source}}\n\nTL;DR in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge2_recall_stderr": 0.0013707148216665472}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "tldr_en", "rouge2_fmeasure": 0.03551854752059051, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "d3c5baa3-5e37-46f8-b1b2-5b834181c9da", "prompt_jinja": "{{source}}\n\nTL;DR in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge2_fmeasure_stderr": 0.0008299460804507645}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "tldr_en", "rougeL_precision": 0.11900796171626439, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "d3c5baa3-5e37-46f8-b1b2-5b834181c9da", "prompt_jinja": "{{source}}\n\nTL;DR in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeL_precision_stderr": 0.0012961585979789718}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "tldr_en", "rougeL_recall": 0.21022152388268012, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "d3c5baa3-5e37-46f8-b1b2-5b834181c9da", "prompt_jinja": "{{source}}\n\nTL;DR in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeL_recall_stderr": 0.0020859201845126016}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "tldr_en", "rougeL_fmeasure": 0.14125944815133307, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "d3c5baa3-5e37-46f8-b1b2-5b834181c9da", "prompt_jinja": "{{source}}\n\nTL;DR in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeL_fmeasure_stderr": 0.001293606648023059}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "tldr_en", "rougeLsum_precision": 0.14065190421878745, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "d3c5baa3-5e37-46f8-b1b2-5b834181c9da", "prompt_jinja": "{{source}}\n\nTL;DR in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeLsum_precision_stderr": 0.0016987560973759559}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "tldr_en", "rougeLsum_recall": 0.24028450248920846, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "d3c5baa3-5e37-46f8-b1b2-5b834181c9da", "prompt_jinja": "{{source}}\n\nTL;DR in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeLsum_recall_stderr": 0.0023647123157988985}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "tldr_en", "rougeLsum_fmeasure": 0.16495917517256573, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "d3c5baa3-5e37-46f8-b1b2-5b834181c9da", "prompt_jinja": "{{source}}\n\nTL;DR in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeLsum_fmeasure_stderr": 0.001644741818116919}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "tldr_en", "bleu": 1.5392368463230133, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "d3c5baa3-5e37-46f8-b1b2-5b834181c9da", "prompt_jinja": "{{source}}\n\nTL;DR in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "bleu_stderr": 0.053948912749046035}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-2b8-55b-c4seeds/2b855b9bc4seed2/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 0, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}}
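All of these agg.*.json files share one layout: a "results" list in which each entry carries a single metric value plus its bootstrap stderr alongside repeated prompt metadata, and a "config" block recording the harness invocation (model_args, num_fewshot, limit, and so on). A rough sketch for flattening one file into metric/stderr pairs follows; the load_agg helper is hypothetical and not part of this repo, only the key layout is taken from the JSON above.

import json

# Keys that describe the prompt rather than carrying a score.
METADATA_KEYS = {
    "task_name", "prompt_name", "fixed_answer_choice_list", "dataset_path",
    "dataset_name", "subset", "prompt_id", "prompt_jinja",
    "prompt_original_task", "comment",
}

def load_agg(path):
    """Return ({metric: (value, stderr)}, config) for one agg.*.json file."""
    with open(path) as f:
        data = json.load(f)
    metrics = {}
    for entry in data["results"]:
        for key, value in entry.items():
            if key in METADATA_KEYS or key.endswith("_stderr"):
                continue
            metrics[key] = (value, entry.get(key + "_stderr"))
    return metrics, data["config"]

metrics, config = load_agg(
    "2b855b9bc4seed2/evaluation/generation/"
    "agg.2b855b9bc4seed2_GEM-wiki_lingua_en_tldr_en_0.json")
for name, (value, stderr) in sorted(metrics.items()):
    print(f"{name:>22}: {value:.4f} +/- {stderr:.4f}")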
2b855b9bc4seed2/evaluation/generation/agg.2b855b9bc4seed2_GEM-wiki_lingua_en_tldr_en_1.json
ADDED
@@ -0,0 +1 @@
+{"results": [{"task_name": "GEM/wiki_lingua_en", "prompt_name": "tldr_en", "rouge1_precision": 0.1668110350174811, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "d3c5baa3-5e37-46f8-b1b2-5b834181c9da", "prompt_jinja": "{{source}}\n\nTL;DR in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge1_precision_stderr": 0.0019877097402536763}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "tldr_en", "rouge1_recall": 0.28257427077519554, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "d3c5baa3-5e37-46f8-b1b2-5b834181c9da", "prompt_jinja": "{{source}}\n\nTL;DR in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge1_recall_stderr": 0.002760368419127019}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "tldr_en", "rouge1_fmeasure": 0.19472791234999667, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "d3c5baa3-5e37-46f8-b1b2-5b834181c9da", "prompt_jinja": "{{source}}\n\nTL;DR in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge1_fmeasure_stderr": 0.001906191600167485}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "tldr_en", "rouge2_precision": 0.035928254592281224, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "d3c5baa3-5e37-46f8-b1b2-5b834181c9da", "prompt_jinja": "{{source}}\n\nTL;DR in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge2_precision_stderr": 0.0008382143224304085}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "tldr_en", "rouge2_recall": 0.06299590355552512, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "d3c5baa3-5e37-46f8-b1b2-5b834181c9da", "prompt_jinja": "{{source}}\n\nTL;DR in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge2_recall_stderr": 0.0015297554624304605}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "tldr_en", "rouge2_fmeasure": 0.04208923831822898, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "d3c5baa3-5e37-46f8-b1b2-5b834181c9da", "prompt_jinja": "{{source}}\n\nTL;DR in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge2_fmeasure_stderr": 0.0009197645366087788}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "tldr_en", "rougeL_precision": 0.12071476283575872, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "d3c5baa3-5e37-46f8-b1b2-5b834181c9da", "prompt_jinja": "{{source}}\n\nTL;DR in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeL_precision_stderr": 0.0013170745626523695}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "tldr_en", "rougeL_recall": 0.21195380139438252, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "d3c5baa3-5e37-46f8-b1b2-5b834181c9da", "prompt_jinja": "{{source}}\n\nTL;DR in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeL_recall_stderr": 0.002156914307318323}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "tldr_en", "rougeL_fmeasure": 0.14254660444247075, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "d3c5baa3-5e37-46f8-b1b2-5b834181c9da", "prompt_jinja": "{{source}}\n\nTL;DR in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeL_fmeasure_stderr": 0.0012952641600081254}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "tldr_en", "rougeLsum_precision": 0.15552475435698954, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "d3c5baa3-5e37-46f8-b1b2-5b834181c9da", "prompt_jinja": "{{source}}\n\nTL;DR in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeLsum_precision_stderr": 0.0018425071564360774}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "tldr_en", "rougeLsum_recall": 0.2646754231072801, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "d3c5baa3-5e37-46f8-b1b2-5b834181c9da", "prompt_jinja": "{{source}}\n\nTL;DR in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeLsum_recall_stderr": 0.0026025376230503126}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "tldr_en", "rougeLsum_fmeasure": 0.18187224601097693, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "d3c5baa3-5e37-46f8-b1b2-5b834181c9da", "prompt_jinja": "{{source}}\n\nTL;DR in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeLsum_fmeasure_stderr": 0.0017770105186546616}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "tldr_en", "bleu": 2.0614970786142583, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "d3c5baa3-5e37-46f8-b1b2-5b834181c9da", "prompt_jinja": "{{source}}\n\nTL;DR in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "bleu_stderr": 0.09284907950143785}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-2b8-55b-c4seeds/2b855b9bc4seed2/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 1, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}}
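This file differs from the 0-shot one above only in "num_fewshot": 1 and in the scores themselves. Putting the two side by side (values copied verbatim from the two JSON blobs, nothing recomputed) shows the one-shot prompt improving every reported f-measure and BLEU:

# f-measure / BLEU values copied from the two wiki_lingua agg files above.
zero_shot = {"rouge1_fmeasure": 0.17949455554206634,
             "rouge2_fmeasure": 0.03551854752059051,
             "rougeL_fmeasure": 0.14125944815133307,
             "bleu": 1.5392368463230133}
one_shot = {"rouge1_fmeasure": 0.19472791234999667,
            "rouge2_fmeasure": 0.04208923831822898,
            "rougeL_fmeasure": 0.14254660444247075,
            "bleu": 2.0614970786142583}
for metric, base in zero_shot.items():
    delta = one_shot[metric] - base
    print(f"{metric:>16}: {base:.4f} -> {one_shot[metric]:.4f} ({delta:+.4f})")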
2b855b9bc4seed2/evaluation/generation/agg.2b855b9bc4seed2_e2e_nlg_cleaned_generate_text_restaurant_0.json
ADDED
@@ -0,0 +1 @@
+{"results": [{"task_name": "e2e_nlg_cleaned", "prompt_name": "generate_text_restaurant", "bleu": 0.027496856259569415, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "1acabbc3-c9b9-4624-a684-29faeccff46f", "prompt_jinja": "Given the following data about a restaurant:\n{% for feature in meaning_representation.split(\"]\") %} {% set key = feature.split(\"[\")[0].replace(\",\",\"\") %} {% set value = feature.replace(\",\",\"\").replace(key+\"[\", '''') %}\n{% if value != \"\" %} {{key}} : {{value}} {% endif %}\n{%- endfor %}\nGenerate some text about this restaurant. ||| {{human_reference}}", "prompt_original_task": true, "comment": "", "bleu_stderr": 0.007385963274542756}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "generate_text_restaurant", "rouge1_precision": 0.047807397913058214, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "1acabbc3-c9b9-4624-a684-29faeccff46f", "prompt_jinja": "Given the following data about a restaurant:\n{% for feature in meaning_representation.split(\"]\") %} {% set key = feature.split(\"[\")[0].replace(\",\",\"\") %} {% set value = feature.replace(\",\",\"\").replace(key+\"[\", '''') %}\n{% if value != \"\" %} {{key}} : {{value}} {% endif %}\n{%- endfor %}\nGenerate some text about this restaurant. ||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rouge1_precision_stderr": 0.0011364617294874694}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "generate_text_restaurant", "rouge1_recall": 0.022673512097835077, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "1acabbc3-c9b9-4624-a684-29faeccff46f", "prompt_jinja": "Given the following data about a restaurant:\n{% for feature in meaning_representation.split(\"]\") %} {% set key = feature.split(\"[\")[0].replace(\",\",\"\") %} {% set value = feature.replace(\",\",\"\").replace(key+\"[\", '''') %}\n{% if value != \"\" %} {{key}} : {{value}} {% endif %}\n{%- endfor %}\nGenerate some text about this restaurant. ||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rouge1_recall_stderr": 0.0005137246819540603}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "generate_text_restaurant", "rouge1_fmeasure": 0.02786087179209838, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "1acabbc3-c9b9-4624-a684-29faeccff46f", "prompt_jinja": "Given the following data about a restaurant:\n{% for feature in meaning_representation.split(\"]\") %} {% set key = feature.split(\"[\")[0].replace(\",\",\"\") %} {% set value = feature.replace(\",\",\"\").replace(key+\"[\", '''') %}\n{% if value != \"\" %} {{key}} : {{value}} {% endif %}\n{%- endfor %}\nGenerate some text about this restaurant. ||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rouge1_fmeasure_stderr": 0.0005858506970554961}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "generate_text_restaurant", "rouge2_precision": 0.0005045565387850128, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "1acabbc3-c9b9-4624-a684-29faeccff46f", "prompt_jinja": "Given the following data about a restaurant:\n{% for feature in meaning_representation.split(\"]\") %} {% set key = feature.split(\"[\")[0].replace(\",\",\"\") %} {% set value = feature.replace(\",\",\"\").replace(key+\"[\", '''') %}\n{% if value != \"\" %} {{key}} : {{value}} {% endif %}\n{%- endfor %}\nGenerate some text about this restaurant. ||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rouge2_precision_stderr": 0.00014549502538483447}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "generate_text_restaurant", "rouge2_recall": 0.00021403935275164058, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "1acabbc3-c9b9-4624-a684-29faeccff46f", "prompt_jinja": "Given the following data about a restaurant:\n{% for feature in meaning_representation.split(\"]\") %} {% set key = feature.split(\"[\")[0].replace(\",\",\"\") %} {% set value = feature.replace(\",\",\"\").replace(key+\"[\", '''') %}\n{% if value != \"\" %} {{key}} : {{value}} {% endif %}\n{%- endfor %}\nGenerate some text about this restaurant. ||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rouge2_recall_stderr": 5.468859488917244e-05}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "generate_text_restaurant", "rouge2_fmeasure": 0.0002573048402040541, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "1acabbc3-c9b9-4624-a684-29faeccff46f", "prompt_jinja": "Given the following data about a restaurant:\n{% for feature in meaning_representation.split(\"]\") %} {% set key = feature.split(\"[\")[0].replace(\",\",\"\") %} {% set value = feature.replace(\",\",\"\").replace(key+\"[\", '''') %}\n{% if value != \"\" %} {{key}} : {{value}} {% endif %}\n{%- endfor %}\nGenerate some text about this restaurant. ||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rouge2_fmeasure_stderr": 6.575032923202268e-05}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "generate_text_restaurant", "rougeL_precision": 0.04750467841033873, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "1acabbc3-c9b9-4624-a684-29faeccff46f", "prompt_jinja": "Given the following data about a restaurant:\n{% for feature in meaning_representation.split(\"]\") %} {% set key = feature.split(\"[\")[0].replace(\",\",\"\") %} {% set value = feature.replace(\",\",\"\").replace(key+\"[\", '''') %}\n{% if value != \"\" %} {{key}} : {{value}} {% endif %}\n{%- endfor %}\nGenerate some text about this restaurant. ||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rougeL_precision_stderr": 0.0011193107075921842}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "generate_text_restaurant", "rougeL_recall": 0.022556340907519876, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "1acabbc3-c9b9-4624-a684-29faeccff46f", "prompt_jinja": "Given the following data about a restaurant:\n{% for feature in meaning_representation.split(\"]\") %} {% set key = feature.split(\"[\")[0].replace(\",\",\"\") %} {% set value = feature.replace(\",\",\"\").replace(key+\"[\", '''') %}\n{% if value != \"\" %} {{key}} : {{value}} {% endif %}\n{%- endfor %}\nGenerate some text about this restaurant. ||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rougeL_recall_stderr": 0.000501594662926671}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "generate_text_restaurant", "rougeL_fmeasure": 0.02773258214801112, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "1acabbc3-c9b9-4624-a684-29faeccff46f", "prompt_jinja": "Given the following data about a restaurant:\n{% for feature in meaning_representation.split(\"]\") %} {% set key = feature.split(\"[\")[0].replace(\",\",\"\") %} {% set value = feature.replace(\",\",\"\").replace(key+\"[\", '''') %}\n{% if value != \"\" %} {{key}} : {{value}} {% endif %}\n{%- endfor %}\nGenerate some text about this restaurant. ||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rougeL_fmeasure_stderr": 0.0005799438210362486}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "generate_text_restaurant", "rougeLsum_precision": 0.04681714649780676, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "1acabbc3-c9b9-4624-a684-29faeccff46f", "prompt_jinja": "Given the following data about a restaurant:\n{% for feature in meaning_representation.split(\"]\") %} {% set key = feature.split(\"[\")[0].replace(\",\",\"\") %} {% set value = feature.replace(\",\",\"\").replace(key+\"[\", '''') %}\n{% if value != \"\" %} {{key}} : {{value}} {% endif %}\n{%- endfor %}\nGenerate some text about this restaurant. ||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rougeLsum_precision_stderr": 0.0011159994745820552}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "generate_text_restaurant", "rougeLsum_recall": 0.02201246000410449, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "1acabbc3-c9b9-4624-a684-29faeccff46f", "prompt_jinja": "Given the following data about a restaurant:\n{% for feature in meaning_representation.split(\"]\") %} {% set key = feature.split(\"[\")[0].replace(\",\",\"\") %} {% set value = feature.replace(\",\",\"\").replace(key+\"[\", '''') %}\n{% if value != \"\" %} {{key}} : {{value}} {% endif %}\n{%- endfor %}\nGenerate some text about this restaurant. ||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rougeLsum_recall_stderr": 0.00047810257232395407}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "generate_text_restaurant", "rougeLsum_fmeasure": 0.027138243313024377, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "1acabbc3-c9b9-4624-a684-29faeccff46f", "prompt_jinja": "Given the following data about a restaurant:\n{% for feature in meaning_representation.split(\"]\") %} {% set key = feature.split(\"[\")[0].replace(\",\",\"\") %} {% set value = feature.replace(\",\",\"\").replace(key+\"[\", '''') %}\n{% if value != \"\" %} {{key}} : {{value}} {% endif %}\n{%- endfor %}\nGenerate some text about this restaurant. ||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rougeLsum_fmeasure_stderr": 0.0005613131933490365}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-2b8-55b-c4seeds/2b855b9bc4seed2/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 0, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}}
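The generate_text_restaurant template is the most involved of the three prompts here: it re-parses the E2E meaning representation string into "key : value" lines inside Jinja before asking for a description. A sketch of what the rendered prompt looks like follows; the meaning representation and reference are invented examples in the usual E2E attribute[value] format, and only the template text comes from the JSON above.

from jinja2 import Template

# Template copied from the prompt_jinja field above, with the doubled quotes
# in replace(key+"[", '') written as ordinary string literals.
E2E_TEMPLATE = (
    "Given the following data about a restaurant:\n"
    '{% for feature in meaning_representation.split("]") %}'
    ' {% set key = feature.split("[")[0].replace(",","") %}'
    " {% set value = feature.replace(',','').replace(key+'[', '') %}\n"
    '{% if value != "" %} {{key}} : {{value}} {% endif %}\n'
    "{%- endfor %}\n"
    "Generate some text about this restaurant. ||| {{human_reference}}"
)

# Hypothetical input in the standard E2E attribute[value] format.
example = {
    "meaning_representation": "name[The Eagle], eatType[coffee shop], area[riverside]",
    "human_reference": "The Eagle is a coffee shop by the riverside.",
}

prompt, target = Template(E2E_TEMPLATE).render(**example).split("|||")
print(prompt.strip())
print("reference:", target.strip())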