Muennighoff committed
Commit b277454 · 1 Parent(s): c631903
This view is limited to 50 files because it contains too many changes. See the raw diff for the complete change set.
Files changed (50)
  1. .gitattributes +86 -0
  2. 2b855b9bc4seed1/evaluation/generation/agg.2b855b9bc4seed1_e2e_nlg_cleaned_generate_text_restaurant_2.json +1 -0
  3. 2b855b9bc4seed1/evaluation/generation/agg.2b855b9bc4seed1_e2e_nlg_cleaned_generate_text_restaurant_3.json +1 -0
  4. 2b855b9bc4seed1/evaluation/generation/agg.2b855b9bc4seed1_e2e_nlg_cleaned_generate_text_restaurant_4.json +1 -0
  5. 2b855b9bc4seed1/evaluation/generation/agg.2b855b9bc4seed1_e2e_nlg_cleaned_generate_text_restaurant_5.json +1 -0
  6. 2b855b9bc4seed1/evaluation/generation/agg.2b855b9bc4seed1_gem_xsum_article_DOC_summary_3.json +1 -0
  7. 2b855b9bc4seed1/evaluation/generation/agg.2b855b9bc4seed1_gem_xsum_article_DOC_summary_4.json +1 -0
  8. 2b855b9bc4seed1/evaluation/generation/agg.2b855b9bc4seed1_gem_xsum_article_DOC_summary_5.json +1 -0
  9. 2b855b9bc4seed1/evaluation/generation/examples.2b855b9bc4seed1_GEM-web_nlg_en_PALM_prompt_2.jsonl +0 -0
  10. 2b855b9bc4seed1/evaluation/generation/examples.2b855b9bc4seed1_GEM-web_nlg_en_PALM_prompt_3.jsonl +0 -0
  11. 2b855b9bc4seed1/evaluation/generation/examples.2b855b9bc4seed1_GEM-web_nlg_en_PALM_prompt_4.jsonl +0 -0
  12. 2b855b9bc4seed1/evaluation/generation/examples.2b855b9bc4seed1_GEM-web_nlg_en_PALM_prompt_5.jsonl +0 -0
  13. 2b855b9bc4seed1/evaluation/generation/examples.2b855b9bc4seed1_GEM-wiki_lingua_en_tldr_en_3.jsonl +0 -0
  14. 2b855b9bc4seed1/evaluation/generation/examples.2b855b9bc4seed1_GEM-wiki_lingua_en_tldr_en_4.jsonl +0 -0
  15. 2b855b9bc4seed1/evaluation/generation/examples.2b855b9bc4seed1_GEM-wiki_lingua_en_tldr_en_5.jsonl +0 -0
  16. 2b855b9bc4seed1/evaluation/generation/examples.2b855b9bc4seed1_e2e_nlg_cleaned_generate_text_restaurant_2.jsonl +3 -0
  17. 2b855b9bc4seed1/evaluation/generation/examples.2b855b9bc4seed1_e2e_nlg_cleaned_generate_text_restaurant_3.jsonl +3 -0
  18. 2b855b9bc4seed1/evaluation/generation/examples.2b855b9bc4seed1_e2e_nlg_cleaned_generate_text_restaurant_4.jsonl +3 -0
  19. 2b855b9bc4seed1/evaluation/generation/examples.2b855b9bc4seed1_e2e_nlg_cleaned_generate_text_restaurant_5.jsonl +3 -0
  20. 2b855b9bc4seed1/evaluation/generation/examples.2b855b9bc4seed1_gem_xsum_article_DOC_summary_3.jsonl +3 -0
  21. 2b855b9bc4seed1/evaluation/generation/examples.2b855b9bc4seed1_gem_xsum_article_DOC_summary_4.jsonl +3 -0
  22. 2b855b9bc4seed1/evaluation/generation/examples.2b855b9bc4seed1_gem_xsum_article_DOC_summary_5.jsonl +3 -0
  23. 2b855b9bc4seed1/evaluation/generation/merged.csv +39 -0
  24. 2b855b9bc4seed1/evaluation/generation/merged.json +1 -0
  25. 2b855b9bc4seed1/evaluation/generation/slim.2b855b9bc4seed1_e2e_nlg_cleaned_generate_text_restaurant_2.json +133 -0
  26. 2b855b9bc4seed1/evaluation/generation/slim.2b855b9bc4seed1_e2e_nlg_cleaned_generate_text_restaurant_3.json +133 -0
  27. 2b855b9bc4seed1/evaluation/generation/slim.2b855b9bc4seed1_e2e_nlg_cleaned_generate_text_restaurant_4.json +133 -0
  28. 2b855b9bc4seed1/evaluation/generation/slim.2b855b9bc4seed1_e2e_nlg_cleaned_generate_text_restaurant_5.json +133 -0
  29. 2b855b9bc4seed1/evaluation/generation/slim.2b855b9bc4seed1_gem_xsum_article_DOC_summary_3.json +133 -0
  30. 2b855b9bc4seed1/evaluation/generation/slim.2b855b9bc4seed1_gem_xsum_article_DOC_summary_4.json +133 -0
  31. 2b855b9bc4seed1/evaluation/generation/slim.2b855b9bc4seed1_gem_xsum_article_DOC_summary_5.json +133 -0
  32. 2b855b9bc4seed1/evaluation/rankeval/2b855b9bc4seed1_0.csv +21 -0
  33. 2b855b9bc4seed1/evaluation/rankeval/2b855b9bc4seed1_0_lm-eval_global_step52452_2023-02-24-23-57-47_0shots_backup.json +0 -87
  34. 2b855b9bc4seed1/evaluation/rankeval/2b855b9bc4seed1_1.csv +21 -0
  35. 2b855b9bc4seed1/evaluation/rankeval/2b855b9bc4seed1_1.json +15 -1
  36. 2b855b9bc4seed1/evaluation/rankeval/2b855b9bc4seed1_1_lm-eval_global_step52452_2023-02-24-23-57-47_1shots_backup.json +0 -73
  37. 2b855b9bc4seed1/evaluation/rankeval/2b855b9bc4seed1_2.csv +21 -0
  38. 2b855b9bc4seed1/evaluation/rankeval/2b855b9bc4seed1_2.json +34 -1
  39. 2b855b9bc4seed1/evaluation/rankeval/2b855b9bc4seed1_2_lm-eval_global_step52452_2023-02-24-23-57-47_2shots_backup.json +0 -54
  40. 2b855b9bc4seed1/evaluation/rankeval/2b855b9bc4seed1_3.csv +21 -0
  41. 2b855b9bc4seed1/evaluation/rankeval/2b855b9bc4seed1_3.json +49 -1
  42. 2b855b9bc4seed1/evaluation/rankeval/2b855b9bc4seed1_3_lm-eval_global_step52452_2023-02-24-23-57-47_3shots_backup.json +0 -39
  43. 2b855b9bc4seed1/evaluation/rankeval/2b855b9bc4seed1_4.csv +21 -0
  44. 2b855b9bc4seed1/evaluation/rankeval/2b855b9bc4seed1_4.json +56 -1
  45. 2b855b9bc4seed1/evaluation/rankeval/2b855b9bc4seed1_5.csv +21 -0
  46. 2b855b9bc4seed1/evaluation/rankeval/2b855b9bc4seed1_5.json +56 -1
  47. 2b855b9bc4seed2/evaluation/generation/agg.2b855b9bc4seed2_GEM-web_nlg_en_PALM_prompt_0.json +1 -0
  48. 2b855b9bc4seed2/evaluation/generation/agg.2b855b9bc4seed2_GEM-wiki_lingua_en_tldr_en_0.json +1 -0
  49. 2b855b9bc4seed2/evaluation/generation/agg.2b855b9bc4seed2_GEM-wiki_lingua_en_tldr_en_1.json +1 -0
  50. 2b855b9bc4seed2/evaluation/generation/agg.2b855b9bc4seed2_e2e_nlg_cleaned_generate_text_restaurant_0.json +1 -0
.gitattributes CHANGED
@@ -733,3 +733,89 @@ evaluation/seed2/generation/examples.limited=3000.model=seed2.task=GEM-wiki_ling
  2b855b9bc4seed1/evaluation/generation/examples.2b855b9bc4seed1_GEM-wiki_lingua_en_tldr_en_0.jsonl filter=lfs diff=lfs merge=lfs -text
  2b855b9bc4seed1/evaluation/generation/examples.2b855b9bc4seed1_GEM-wiki_lingua_en_tldr_en_1.jsonl filter=lfs diff=lfs merge=lfs -text
  2b855b9bc4seed1/evaluation/generation/examples.2b855b9bc4seed1_gem_xsum_article_DOC_summary_1.jsonl filter=lfs diff=lfs merge=lfs -text
+ 2b855b9bc4seed2/evaluation/generation/examples.2b855b9bc4seed2_e2e_nlg_cleaned_generate_text_restaurant_1.jsonl filter=lfs diff=lfs merge=lfs -text
+ 2b855b9bc4seed2/evaluation/generation/examples.2b855b9bc4seed2_GEM-web_nlg_en_PALM_prompt_5.jsonl filter=lfs diff=lfs merge=lfs -text
+ 2b855b9bc4seed3/evaluation/generation/examples.2b855b9bc4seed3_e2e_nlg_cleaned_generate_text_restaurant_1.jsonl filter=lfs diff=lfs merge=lfs -text
+ 2b855b9bc4seed3/evaluation/generation/examples.2b855b9bc4seed3_GEM-web_nlg_en_PALM_prompt_0.jsonl filter=lfs diff=lfs merge=lfs -text
+ 2b855b9bc4seed3/evaluation/generation/examples.2b855b9bc4seed3_GEM-wiki_lingua_en_tldr_en_5.jsonl filter=lfs diff=lfs merge=lfs -text
+ 2b855b9bc4seed3/evaluation/generation/examples.2b855b9bc4seed3_gem_xsum_article_DOC_summary_5.jsonl filter=lfs diff=lfs merge=lfs -text
+ 2b855b9bc4seed4/evaluation/generation/examples.2b855b9bc4seed4_GEM-wiki_lingua_en_tldr_en_0.jsonl filter=lfs diff=lfs merge=lfs -text
+ 2b855b9bc4seed1/evaluation/generation/examples.2b855b9bc4seed1_gem_xsum_article_DOC_summary_4.jsonl filter=lfs diff=lfs merge=lfs -text
+ 2b855b9bc4seed4/evaluation/generation/examples.2b855b9bc4seed4_gem_xsum_article_DOC_summary_5.jsonl filter=lfs diff=lfs merge=lfs -text
+ 2b855b9bc4seed3/evaluation/generation/examples.2b855b9bc4seed3_GEM-web_nlg_en_PALM_prompt_3.jsonl filter=lfs diff=lfs merge=lfs -text
+ 2b855b9bc4seed3/evaluation/generation/examples.2b855b9bc4seed3_gem_xsum_article_DOC_summary_2.jsonl filter=lfs diff=lfs merge=lfs -text
+ 2b855b9bc4seed4/evaluation/generation/examples.2b855b9bc4seed4_e2e_nlg_cleaned_generate_text_restaurant_4.jsonl filter=lfs diff=lfs merge=lfs -text
+ 2b855b9bc4seed4/evaluation/generation/examples.2b855b9bc4seed4_GEM-wiki_lingua_en_tldr_en_2.jsonl filter=lfs diff=lfs merge=lfs -text
+ 2b855b9bc4seed4/evaluation/generation/examples.2b855b9bc4seed4_gem_xsum_article_DOC_summary_2.jsonl filter=lfs diff=lfs merge=lfs -text
+ 2b855b9bc4seed3/evaluation/generation/examples.2b855b9bc4seed3_GEM-web_nlg_en_PALM_prompt_1.jsonl filter=lfs diff=lfs merge=lfs -text
+ 2b855b9bc4seed2/evaluation/generation/examples.2b855b9bc4seed2_GEM-web_nlg_en_PALM_prompt_4.jsonl filter=lfs diff=lfs merge=lfs -text
+ 2b855b9bc4seed2/evaluation/generation/examples.2b855b9bc4seed2_GEM-wiki_lingua_en_tldr_en_3.jsonl filter=lfs diff=lfs merge=lfs -text
+ 2b855b9bc4seed2/evaluation/generation/examples.2b855b9bc4seed2_gem_xsum_article_DOC_summary_2.jsonl filter=lfs diff=lfs merge=lfs -text
+ 2b855b9bc4seed3/evaluation/generation/examples.2b855b9bc4seed3_GEM-wiki_lingua_en_tldr_en_4.jsonl filter=lfs diff=lfs merge=lfs -text
+ 2b855b9bc4seed1/evaluation/generation/examples.2b855b9bc4seed1_e2e_nlg_cleaned_generate_text_restaurant_5.jsonl filter=lfs diff=lfs merge=lfs -text
+ 2b855b9bc4seed2/evaluation/generation/examples.2b855b9bc4seed2_e2e_nlg_cleaned_generate_text_restaurant_5.jsonl filter=lfs diff=lfs merge=lfs -text
+ 2b855b9bc4seed2/evaluation/generation/examples.2b855b9bc4seed2_gem_xsum_article_DOC_summary_5.jsonl filter=lfs diff=lfs merge=lfs -text
+ 2b855b9bc4seed3/evaluation/generation/examples.2b855b9bc4seed3_e2e_nlg_cleaned_generate_text_restaurant_0.jsonl filter=lfs diff=lfs merge=lfs -text
+ 2b855b9bc4seed3/evaluation/generation/examples.2b855b9bc4seed3_e2e_nlg_cleaned_generate_text_restaurant_5.jsonl filter=lfs diff=lfs merge=lfs -text
+ 2b855b9bc4seed3/evaluation/generation/examples.2b855b9bc4seed3_GEM-web_nlg_en_PALM_prompt_2.jsonl filter=lfs diff=lfs merge=lfs -text
+ 2b855b9bc4seed2/evaluation/generation/examples.2b855b9bc4seed2_e2e_nlg_cleaned_generate_text_restaurant_0.jsonl filter=lfs diff=lfs merge=lfs -text
+ 2b855b9bc4seed2/evaluation/generation/examples.2b855b9bc4seed2_GEM-wiki_lingua_en_tldr_en_1.jsonl filter=lfs diff=lfs merge=lfs -text
+ 2b855b9bc4seed2/evaluation/generation/examples.2b855b9bc4seed2_gem_xsum_article_DOC_summary_3.jsonl filter=lfs diff=lfs merge=lfs -text
+ 2b855b9bc4seed3/evaluation/generation/examples.2b855b9bc4seed3_gem_xsum_article_DOC_summary_1.jsonl filter=lfs diff=lfs merge=lfs -text
+ 2b855b9bc4seed4/evaluation/generation/examples.2b855b9bc4seed4_GEM-wiki_lingua_en_tldr_en_4.jsonl filter=lfs diff=lfs merge=lfs -text
+ 2b855b9bc4seed4/evaluation/generation/examples.2b855b9bc4seed4_gem_xsum_article_DOC_summary_1.jsonl filter=lfs diff=lfs merge=lfs -text
+ 2b855b9bc4seed1/evaluation/generation/examples.2b855b9bc4seed1_GEM-web_nlg_en_PALM_prompt_3.jsonl filter=lfs diff=lfs merge=lfs -text
+ 2b855b9bc4seed4/evaluation/generation/examples.2b855b9bc4seed4_GEM-web_nlg_en_PALM_prompt_2.jsonl filter=lfs diff=lfs merge=lfs -text
+ 2b855b9bc4seed1/evaluation/generation/examples.2b855b9bc4seed1_GEM-wiki_lingua_en_tldr_en_5.jsonl filter=lfs diff=lfs merge=lfs -text
+ 2b855b9bc4seed2/evaluation/generation/examples.2b855b9bc4seed2_e2e_nlg_cleaned_generate_text_restaurant_2.jsonl filter=lfs diff=lfs merge=lfs -text
+ 2b855b9bc4seed3/evaluation/generation/examples.2b855b9bc4seed3_e2e_nlg_cleaned_generate_text_restaurant_2.jsonl filter=lfs diff=lfs merge=lfs -text
+ 2b855b9bc4seed3/evaluation/generation/examples.2b855b9bc4seed3_GEM-wiki_lingua_en_tldr_en_0.jsonl filter=lfs diff=lfs merge=lfs -text
+ 2b855b9bc4seed4/evaluation/generation/examples.2b855b9bc4seed4_e2e_nlg_cleaned_generate_text_restaurant_3.jsonl filter=lfs diff=lfs merge=lfs -text
+ 2b855b9bc4seed4/evaluation/generation/examples.2b855b9bc4seed4_GEM-web_nlg_en_PALM_prompt_0.jsonl filter=lfs diff=lfs merge=lfs -text
+ 2b855b9bc4seed4/evaluation/generation/examples.2b855b9bc4seed4_GEM-wiki_lingua_en_tldr_en_5.jsonl filter=lfs diff=lfs merge=lfs -text
+ 2b855b9bc4seed1/evaluation/generation/examples.2b855b9bc4seed1_GEM-web_nlg_en_PALM_prompt_2.jsonl filter=lfs diff=lfs merge=lfs -text
+ 2b855b9bc4seed1/evaluation/generation/examples.2b855b9bc4seed1_GEM-wiki_lingua_en_tldr_en_4.jsonl filter=lfs diff=lfs merge=lfs -text
+ 2b855b9bc4seed2/evaluation/generation/examples.2b855b9bc4seed2_gem_xsum_article_DOC_summary_1.jsonl filter=lfs diff=lfs merge=lfs -text
+ 2b855b9bc4seed3/evaluation/generation/examples.2b855b9bc4seed3_GEM-wiki_lingua_en_tldr_en_2.jsonl filter=lfs diff=lfs merge=lfs -text
+ 2b855b9bc4seed3/evaluation/generation/examples.2b855b9bc4seed3_gem_xsum_article_DOC_summary_3.jsonl filter=lfs diff=lfs merge=lfs -text
+ 2b855b9bc4seed4/evaluation/generation/examples.2b855b9bc4seed4_GEM-web_nlg_en_PALM_prompt_1.jsonl filter=lfs diff=lfs merge=lfs -text
+ 2b855b9bc4seed4/evaluation/generation/examples.2b855b9bc4seed4_GEM-web_nlg_en_PALM_prompt_3.jsonl filter=lfs diff=lfs merge=lfs -text
+ 2b855b9bc4seed4/evaluation/generation/examples.2b855b9bc4seed4_gem_xsum_article_DOC_summary_3.jsonl filter=lfs diff=lfs merge=lfs -text
+ 2b855b9bc4seed1/evaluation/generation/examples.2b855b9bc4seed1_e2e_nlg_cleaned_generate_text_restaurant_2.jsonl filter=lfs diff=lfs merge=lfs -text
+ 2b855b9bc4seed2/evaluation/generation/examples.2b855b9bc4seed2_e2e_nlg_cleaned_generate_text_restaurant_3.jsonl filter=lfs diff=lfs merge=lfs -text
+ 2b855b9bc4seed2/evaluation/generation/examples.2b855b9bc4seed2_GEM-web_nlg_en_PALM_prompt_2.jsonl filter=lfs diff=lfs merge=lfs -text
+ 2b855b9bc4seed2/evaluation/generation/examples.2b855b9bc4seed2_gem_xsum_article_DOC_summary_0.jsonl filter=lfs diff=lfs merge=lfs -text
+ 2b855b9bc4seed3/evaluation/generation/examples.2b855b9bc4seed3_e2e_nlg_cleaned_generate_text_restaurant_3.jsonl filter=lfs diff=lfs merge=lfs -text
+ 2b855b9bc4seed4/evaluation/generation/examples.2b855b9bc4seed4_e2e_nlg_cleaned_generate_text_restaurant_2.jsonl filter=lfs diff=lfs merge=lfs -text
+ 2b855b9bc4seed1/evaluation/generation/examples.2b855b9bc4seed1_gem_xsum_article_DOC_summary_3.jsonl filter=lfs diff=lfs merge=lfs -text
+ 2b855b9bc4seed2/evaluation/generation/examples.2b855b9bc4seed2_GEM-wiki_lingua_en_tldr_en_4.jsonl filter=lfs diff=lfs merge=lfs -text
+ 2b855b9bc4seed3/evaluation/generation/examples.2b855b9bc4seed3_GEM-web_nlg_en_PALM_prompt_4.jsonl filter=lfs diff=lfs merge=lfs -text
+ 2b855b9bc4seed3/evaluation/generation/examples.2b855b9bc4seed3_GEM-wiki_lingua_en_tldr_en_3.jsonl filter=lfs diff=lfs merge=lfs -text
+ 2b855b9bc4seed4/evaluation/generation/examples.2b855b9bc4seed4_GEM-wiki_lingua_en_tldr_en_1.jsonl filter=lfs diff=lfs merge=lfs -text
+ 2b855b9bc4seed1/evaluation/generation/examples.2b855b9bc4seed1_e2e_nlg_cleaned_generate_text_restaurant_3.jsonl filter=lfs diff=lfs merge=lfs -text
+ 2b855b9bc4seed2/evaluation/generation/examples.2b855b9bc4seed2_GEM-web_nlg_en_PALM_prompt_3.jsonl filter=lfs diff=lfs merge=lfs -text
+ 2b855b9bc4seed2/evaluation/generation/examples.2b855b9bc4seed2_GEM-web_nlg_en_PALM_prompt_1.jsonl filter=lfs diff=lfs merge=lfs -text
+ 2b855b9bc4seed2/evaluation/generation/examples.2b855b9bc4seed2_GEM-wiki_lingua_en_tldr_en_5.jsonl filter=lfs diff=lfs merge=lfs -text
+ 2b855b9bc4seed3/evaluation/generation/examples.2b855b9bc4seed3_GEM-web_nlg_en_PALM_prompt_5.jsonl filter=lfs diff=lfs merge=lfs -text
+ 2b855b9bc4seed3/evaluation/generation/examples.2b855b9bc4seed3_gem_xsum_article_DOC_summary_0.jsonl filter=lfs diff=lfs merge=lfs -text
+ 2b855b9bc4seed4/evaluation/generation/examples.2b855b9bc4seed4_gem_xsum_article_DOC_summary_0.jsonl filter=lfs diff=lfs merge=lfs -text
+ 2b855b9bc4seed2/evaluation/generation/examples.2b855b9bc4seed2_GEM-web_nlg_en_PALM_prompt_0.jsonl filter=lfs diff=lfs merge=lfs -text
+ 2b855b9bc4seed1/evaluation/generation/examples.2b855b9bc4seed1_GEM-wiki_lingua_en_tldr_en_3.jsonl filter=lfs diff=lfs merge=lfs -text
+ 2b855b9bc4seed2/evaluation/generation/examples.2b855b9bc4seed2_GEM-wiki_lingua_en_tldr_en_2.jsonl filter=lfs diff=lfs merge=lfs -text
+ 2b855b9bc4seed2/evaluation/generation/examples.2b855b9bc4seed2_gem_xsum_article_DOC_summary_4.jsonl filter=lfs diff=lfs merge=lfs -text
+ 2b855b9bc4seed1/evaluation/generation/examples.2b855b9bc4seed1_GEM-web_nlg_en_PALM_prompt_4.jsonl filter=lfs diff=lfs merge=lfs -text
+ 2b855b9bc4seed4/evaluation/generation/examples.2b855b9bc4seed4_e2e_nlg_cleaned_generate_text_restaurant_0.jsonl filter=lfs diff=lfs merge=lfs -text
+ 2b855b9bc4seed4/evaluation/generation/examples.2b855b9bc4seed4_e2e_nlg_cleaned_generate_text_restaurant_5.jsonl filter=lfs diff=lfs merge=lfs -text
+ 2b855b9bc4seed4/evaluation/generation/examples.2b855b9bc4seed4_GEM-web_nlg_en_PALM_prompt_5.jsonl filter=lfs diff=lfs merge=lfs -text
+ 2b855b9bc4seed2/evaluation/generation/examples.2b855b9bc4seed2_GEM-wiki_lingua_en_tldr_en_0.jsonl filter=lfs diff=lfs merge=lfs -text
+ 2b855b9bc4seed1/evaluation/generation/examples.2b855b9bc4seed1_GEM-web_nlg_en_PALM_prompt_5.jsonl filter=lfs diff=lfs merge=lfs -text
+ 2b855b9bc4seed4/evaluation/generation/examples.2b855b9bc4seed4_e2e_nlg_cleaned_generate_text_restaurant_1.jsonl filter=lfs diff=lfs merge=lfs -text
+ 2b855b9bc4seed1/evaluation/generation/examples.2b855b9bc4seed1_e2e_nlg_cleaned_generate_text_restaurant_4.jsonl filter=lfs diff=lfs merge=lfs -text
+ 2b855b9bc4seed2/evaluation/generation/examples.2b855b9bc4seed2_e2e_nlg_cleaned_generate_text_restaurant_4.jsonl filter=lfs diff=lfs merge=lfs -text
+ 2b855b9bc4seed3/evaluation/generation/examples.2b855b9bc4seed3_e2e_nlg_cleaned_generate_text_restaurant_4.jsonl filter=lfs diff=lfs merge=lfs -text
+ 2b855b9bc4seed3/evaluation/generation/examples.2b855b9bc4seed3_GEM-wiki_lingua_en_tldr_en_1.jsonl filter=lfs diff=lfs merge=lfs -text
+ 2b855b9bc4seed3/evaluation/generation/examples.2b855b9bc4seed3_gem_xsum_article_DOC_summary_4.jsonl filter=lfs diff=lfs merge=lfs -text
+ 2b855b9bc4seed4/evaluation/generation/examples.2b855b9bc4seed4_GEM-web_nlg_en_PALM_prompt_4.jsonl filter=lfs diff=lfs merge=lfs -text
+ 2b855b9bc4seed4/evaluation/generation/examples.2b855b9bc4seed4_GEM-wiki_lingua_en_tldr_en_3.jsonl filter=lfs diff=lfs merge=lfs -text
+ 2b855b9bc4seed4/evaluation/generation/examples.2b855b9bc4seed4_gem_xsum_article_DOC_summary_4.jsonl filter=lfs diff=lfs merge=lfs -text
+ 2b855b9bc4seed1/evaluation/generation/examples.2b855b9bc4seed1_gem_xsum_article_DOC_summary_5.jsonl filter=lfs diff=lfs merge=lfs -text
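All of the entries added above use the standard Git LFS attribute form `<path> filter=lfs diff=lfs merge=lfs -text`, so the example `.jsonl` files are stored as LFS pointers rather than as plain Git blobs. As a hypothetical illustration (not part of this commit), a minimal Python sketch along the following lines could list which paths a checked-out `.gitattributes` routes through LFS; the default filename and the simple whitespace parsing are assumptions.

```python
# Hypothetical helper (not part of this commit): list the path patterns that a
# .gitattributes file routes through Git LFS, assuming the one-entry-per-line
# "<pattern> filter=lfs diff=lfs merge=lfs -text" format shown above.
from pathlib import Path

def lfs_tracked_patterns(gitattributes_path: str = ".gitattributes") -> list[str]:
    patterns = []
    for line in Path(gitattributes_path).read_text().splitlines():
        parts = line.split()
        # Keep the leading pattern of any line that carries the LFS filter attribute.
        if parts and "filter=lfs" in parts[1:]:
            patterns.append(parts[0])
    return patterns

if __name__ == "__main__":
    for pattern in lfs_tracked_patterns():
        print(pattern)
```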
2b855b9bc4seed1/evaluation/generation/agg.2b855b9bc4seed1_e2e_nlg_cleaned_generate_text_restaurant_2.json ADDED
@@ -0,0 +1 @@
+ {"results": [{"task_name": "e2e_nlg_cleaned", "prompt_name": "generate_text_restaurant", "bleu": 6.018041940580495, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "1acabbc3-c9b9-4624-a684-29faeccff46f", "prompt_jinja": "Given the following data about a restaurant:\n{% for feature in meaning_representation.split(\"]\") %} {% set key = feature.split(\"[\")[0].replace(\",\",\"\") %} {% set value = feature.replace(\",\",\"\").replace(key+\"[\", '''') %}\n{% if value != \"\" %} {{key}} : {{value}} {% endif %}\n{%- endfor %}\nGenerate some text about this restaurant. ||| {{human_reference}}", "prompt_original_task": true, "comment": "", "bleu_stderr": 0.07197788580096429}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "generate_text_restaurant", "rouge1_precision": 0.26958144890836977, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "1acabbc3-c9b9-4624-a684-29faeccff46f", "prompt_jinja": "Given the following data about a restaurant:\n{% for feature in meaning_representation.split(\"]\") %} {% set key = feature.split(\"[\")[0].replace(\",\",\"\") %} {% set value = feature.replace(\",\",\"\").replace(key+\"[\", '''') %}\n{% if value != \"\" %} {{key}} : {{value}} {% endif %}\n{%- endfor %}\nGenerate some text about this restaurant. ||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rouge1_precision_stderr": 0.0021540233079166827}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "generate_text_restaurant", "rouge1_recall": 0.4588728570611039, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "1acabbc3-c9b9-4624-a684-29faeccff46f", "prompt_jinja": "Given the following data about a restaurant:\n{% for feature in meaning_representation.split(\"]\") %} {% set key = feature.split(\"[\")[0].replace(\",\",\"\") %} {% set value = feature.replace(\",\",\"\").replace(key+\"[\", '''') %}\n{% if value != \"\" %} {{key}} : {{value}} {% endif %}\n{%- endfor %}\nGenerate some text about this restaurant. ||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rouge1_recall_stderr": 0.0027449063068318643}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "generate_text_restaurant", "rouge1_fmeasure": 0.32521240877335217, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "1acabbc3-c9b9-4624-a684-29faeccff46f", "prompt_jinja": "Given the following data about a restaurant:\n{% for feature in meaning_representation.split(\"]\") %} {% set key = feature.split(\"[\")[0].replace(\",\",\"\") %} {% set value = feature.replace(\",\",\"\").replace(key+\"[\", '''') %}\n{% if value != \"\" %} {{key}} : {{value}} {% endif %}\n{%- endfor %}\nGenerate some text about this restaurant. 
||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rouge1_fmeasure_stderr": 0.0019278160903662194}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "generate_text_restaurant", "rouge2_precision": 0.11570444930629259, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "1acabbc3-c9b9-4624-a684-29faeccff46f", "prompt_jinja": "Given the following data about a restaurant:\n{% for feature in meaning_representation.split(\"]\") %} {% set key = feature.split(\"[\")[0].replace(\",\",\"\") %} {% set value = feature.replace(\",\",\"\").replace(key+\"[\", '''') %}\n{% if value != \"\" %} {{key}} : {{value}} {% endif %}\n{%- endfor %}\nGenerate some text about this restaurant. ||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rouge2_precision_stderr": 0.0014199073558424482}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "generate_text_restaurant", "rouge2_recall": 0.2004243711941686, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "1acabbc3-c9b9-4624-a684-29faeccff46f", "prompt_jinja": "Given the following data about a restaurant:\n{% for feature in meaning_representation.split(\"]\") %} {% set key = feature.split(\"[\")[0].replace(\",\",\"\") %} {% set value = feature.replace(\",\",\"\").replace(key+\"[\", '''') %}\n{% if value != \"\" %} {{key}} : {{value}} {% endif %}\n{%- endfor %}\nGenerate some text about this restaurant. ||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rouge2_recall_stderr": 0.0021042737791853398}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "generate_text_restaurant", "rouge2_fmeasure": 0.13955166174444095, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "1acabbc3-c9b9-4624-a684-29faeccff46f", "prompt_jinja": "Given the following data about a restaurant:\n{% for feature in meaning_representation.split(\"]\") %} {% set key = feature.split(\"[\")[0].replace(\",\",\"\") %} {% set value = feature.replace(\",\",\"\").replace(key+\"[\", '''') %}\n{% if value != \"\" %} {{key}} : {{value}} {% endif %}\n{%- endfor %}\nGenerate some text about this restaurant. ||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rouge2_fmeasure_stderr": 0.0014269312475032364}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "generate_text_restaurant", "rougeL_precision": 0.2156185303251572, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "1acabbc3-c9b9-4624-a684-29faeccff46f", "prompt_jinja": "Given the following data about a restaurant:\n{% for feature in meaning_representation.split(\"]\") %} {% set key = feature.split(\"[\")[0].replace(\",\",\"\") %} {% set value = feature.replace(\",\",\"\").replace(key+\"[\", '''') %}\n{% if value != \"\" %} {{key}} : {{value}} {% endif %}\n{%- endfor %}\nGenerate some text about this restaurant. 
||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rougeL_precision_stderr": 0.0015614536366490293}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "generate_text_restaurant", "rougeL_recall": 0.3759398467983639, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "1acabbc3-c9b9-4624-a684-29faeccff46f", "prompt_jinja": "Given the following data about a restaurant:\n{% for feature in meaning_representation.split(\"]\") %} {% set key = feature.split(\"[\")[0].replace(\",\",\"\") %} {% set value = feature.replace(\",\",\"\").replace(key+\"[\", '''') %}\n{% if value != \"\" %} {{key}} : {{value}} {% endif %}\n{%- endfor %}\nGenerate some text about this restaurant. ||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rougeL_recall_stderr": 0.002381221921078589}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "generate_text_restaurant", "rougeL_fmeasure": 0.262564054866777, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "1acabbc3-c9b9-4624-a684-29faeccff46f", "prompt_jinja": "Given the following data about a restaurant:\n{% for feature in meaning_representation.split(\"]\") %} {% set key = feature.split(\"[\")[0].replace(\",\",\"\") %} {% set value = feature.replace(\",\",\"\").replace(key+\"[\", '''') %}\n{% if value != \"\" %} {{key}} : {{value}} {% endif %}\n{%- endfor %}\nGenerate some text about this restaurant. ||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rougeL_fmeasure_stderr": 0.0014381342603292985}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "generate_text_restaurant", "rougeLsum_precision": 0.22224277917141733, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "1acabbc3-c9b9-4624-a684-29faeccff46f", "prompt_jinja": "Given the following data about a restaurant:\n{% for feature in meaning_representation.split(\"]\") %} {% set key = feature.split(\"[\")[0].replace(\",\",\"\") %} {% set value = feature.replace(\",\",\"\").replace(key+\"[\", '''') %}\n{% if value != \"\" %} {{key}} : {{value}} {% endif %}\n{%- endfor %}\nGenerate some text about this restaurant. ||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rougeLsum_precision_stderr": 0.0019418664704165772}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "generate_text_restaurant", "rougeLsum_recall": 0.3788810896543758, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "1acabbc3-c9b9-4624-a684-29faeccff46f", "prompt_jinja": "Given the following data about a restaurant:\n{% for feature in meaning_representation.split(\"]\") %} {% set key = feature.split(\"[\")[0].replace(\",\",\"\") %} {% set value = feature.replace(\",\",\"\").replace(key+\"[\", '''') %}\n{% if value != \"\" %} {{key}} : {{value}} {% endif %}\n{%- endfor %}\nGenerate some text about this restaurant. 
||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rougeLsum_recall_stderr": 0.0026592931094796103}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "generate_text_restaurant", "rougeLsum_fmeasure": 0.2680607212245618, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "1acabbc3-c9b9-4624-a684-29faeccff46f", "prompt_jinja": "Given the following data about a restaurant:\n{% for feature in meaning_representation.split(\"]\") %} {% set key = feature.split(\"[\")[0].replace(\",\",\"\") %} {% set value = feature.replace(\",\",\"\").replace(key+\"[\", '''') %}\n{% if value != \"\" %} {{key}} : {{value}} {% endif %}\n{%- endfor %}\nGenerate some text about this restaurant. ||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rougeLsum_fmeasure_stderr": 0.0018300140862428237}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-2b8-55b-c4seeds/2b855b9bc4seed1/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 2, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}}
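Each `agg.*.json` file added in this commit is a single JSON line: a `results` list with one entry per metric (bleu, rouge1_precision, rouge1_recall, and so on, each paired with a `*_stderr` field) and a `config` block recording the model path, tokenizer, `num_fewshot`, batch size, and seed. A minimal, hypothetical sketch for pulling the scores out of one such file might look like the following; the example path is one of the files listed above, and the float-based filtering is an assumption about the format rather than an official reader.

```python
# Hypothetical reader (not part of this commit) for the agg.*.json result files:
# each file holds one JSON object with a "results" list and a "config" block.
import json

def load_agg_scores(path: str) -> dict:
    """Collect metric -> (value, stderr) pairs from one agg.*.json file."""
    with open(path) as f:
        data = json.load(f)

    scores = {}
    for entry in data["results"]:
        for key, value in entry.items():
            # Metric values are floats; metadata (task_name, prompt_jinja, ...) is not.
            if isinstance(value, float) and not key.endswith("_stderr"):
                scores[key] = (value, entry.get(f"{key}_stderr"))
    return scores

if __name__ == "__main__":
    # Example path taken from this commit's file list; adjust to your checkout.
    path = "2b855b9bc4seed1/evaluation/generation/agg.2b855b9bc4seed1_e2e_nlg_cleaned_generate_text_restaurant_2.json"
    for metric, (value, stderr) in load_agg_scores(path).items():
        err = f" ± {stderr:.4f}" if stderr is not None else ""
        print(f"{metric}: {value:.4f}{err}")
```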
2b855b9bc4seed1/evaluation/generation/agg.2b855b9bc4seed1_e2e_nlg_cleaned_generate_text_restaurant_3.json ADDED
@@ -0,0 +1 @@
+ {"results": [{"task_name": "e2e_nlg_cleaned", "prompt_name": "generate_text_restaurant", "bleu": 6.247143410389527, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "1acabbc3-c9b9-4624-a684-29faeccff46f", "prompt_jinja": "Given the following data about a restaurant:\n{% for feature in meaning_representation.split(\"]\") %} {% set key = feature.split(\"[\")[0].replace(\",\",\"\") %} {% set value = feature.replace(\",\",\"\").replace(key+\"[\", '''') %}\n{% if value != \"\" %} {{key}} : {{value}} {% endif %}\n{%- endfor %}\nGenerate some text about this restaurant. ||| {{human_reference}}", "prompt_original_task": true, "comment": "", "bleu_stderr": 0.09336005465042321}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "generate_text_restaurant", "rouge1_precision": 0.25907184832717167, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "1acabbc3-c9b9-4624-a684-29faeccff46f", "prompt_jinja": "Given the following data about a restaurant:\n{% for feature in meaning_representation.split(\"]\") %} {% set key = feature.split(\"[\")[0].replace(\",\",\"\") %} {% set value = feature.replace(\",\",\"\").replace(key+\"[\", '''') %}\n{% if value != \"\" %} {{key}} : {{value}} {% endif %}\n{%- endfor %}\nGenerate some text about this restaurant. ||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rouge1_precision_stderr": 0.001803996044494107}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "generate_text_restaurant", "rouge1_recall": 0.4710593035922337, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "1acabbc3-c9b9-4624-a684-29faeccff46f", "prompt_jinja": "Given the following data about a restaurant:\n{% for feature in meaning_representation.split(\"]\") %} {% set key = feature.split(\"[\")[0].replace(\",\",\"\") %} {% set value = feature.replace(\",\",\"\").replace(key+\"[\", '''') %}\n{% if value != \"\" %} {{key}} : {{value}} {% endif %}\n{%- endfor %}\nGenerate some text about this restaurant. ||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rouge1_recall_stderr": 0.0027215741063189363}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "generate_text_restaurant", "rouge1_fmeasure": 0.32338937321619443, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "1acabbc3-c9b9-4624-a684-29faeccff46f", "prompt_jinja": "Given the following data about a restaurant:\n{% for feature in meaning_representation.split(\"]\") %} {% set key = feature.split(\"[\")[0].replace(\",\",\"\") %} {% set value = feature.replace(\",\",\"\").replace(key+\"[\", '''') %}\n{% if value != \"\" %} {{key}} : {{value}} {% endif %}\n{%- endfor %}\nGenerate some text about this restaurant. 
||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rouge1_fmeasure_stderr": 0.0017934056667561753}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "generate_text_restaurant", "rouge2_precision": 0.11204434793169746, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "1acabbc3-c9b9-4624-a684-29faeccff46f", "prompt_jinja": "Given the following data about a restaurant:\n{% for feature in meaning_representation.split(\"]\") %} {% set key = feature.split(\"[\")[0].replace(\",\",\"\") %} {% set value = feature.replace(\",\",\"\").replace(key+\"[\", '''') %}\n{% if value != \"\" %} {{key}} : {{value}} {% endif %}\n{%- endfor %}\nGenerate some text about this restaurant. ||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rouge2_precision_stderr": 0.0012125669003284166}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "generate_text_restaurant", "rouge2_recall": 0.21007032469689518, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "1acabbc3-c9b9-4624-a684-29faeccff46f", "prompt_jinja": "Given the following data about a restaurant:\n{% for feature in meaning_representation.split(\"]\") %} {% set key = feature.split(\"[\")[0].replace(\",\",\"\") %} {% set value = feature.replace(\",\",\"\").replace(key+\"[\", '''') %}\n{% if value != \"\" %} {{key}} : {{value}} {% endif %}\n{%- endfor %}\nGenerate some text about this restaurant. ||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rouge2_recall_stderr": 0.0021257583048451844}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "generate_text_restaurant", "rouge2_fmeasure": 0.14093386031313695, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "1acabbc3-c9b9-4624-a684-29faeccff46f", "prompt_jinja": "Given the following data about a restaurant:\n{% for feature in meaning_representation.split(\"]\") %} {% set key = feature.split(\"[\")[0].replace(\",\",\"\") %} {% set value = feature.replace(\",\",\"\").replace(key+\"[\", '''') %}\n{% if value != \"\" %} {{key}} : {{value}} {% endif %}\n{%- endfor %}\nGenerate some text about this restaurant. ||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rouge2_fmeasure_stderr": 0.0013638568945346288}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "generate_text_restaurant", "rougeL_precision": 0.2101184628317088, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "1acabbc3-c9b9-4624-a684-29faeccff46f", "prompt_jinja": "Given the following data about a restaurant:\n{% for feature in meaning_representation.split(\"]\") %} {% set key = feature.split(\"[\")[0].replace(\",\",\"\") %} {% set value = feature.replace(\",\",\"\").replace(key+\"[\", '''') %}\n{% if value != \"\" %} {{key}} : {{value}} {% endif %}\n{%- endfor %}\nGenerate some text about this restaurant. 
||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rougeL_precision_stderr": 0.001355833935975265}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "generate_text_restaurant", "rougeL_recall": 0.3886278435049077, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "1acabbc3-c9b9-4624-a684-29faeccff46f", "prompt_jinja": "Given the following data about a restaurant:\n{% for feature in meaning_representation.split(\"]\") %} {% set key = feature.split(\"[\")[0].replace(\",\",\"\") %} {% set value = feature.replace(\",\",\"\").replace(key+\"[\", '''') %}\n{% if value != \"\" %} {{key}} : {{value}} {% endif %}\n{%- endfor %}\nGenerate some text about this restaurant. ||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rougeL_recall_stderr": 0.0023407273478071285}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "generate_text_restaurant", "rougeL_fmeasure": 0.2638096769634126, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "1acabbc3-c9b9-4624-a684-29faeccff46f", "prompt_jinja": "Given the following data about a restaurant:\n{% for feature in meaning_representation.split(\"]\") %} {% set key = feature.split(\"[\")[0].replace(\",\",\"\") %} {% set value = feature.replace(\",\",\"\").replace(key+\"[\", '''') %}\n{% if value != \"\" %} {{key}} : {{value}} {% endif %}\n{%- endfor %}\nGenerate some text about this restaurant. ||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rougeL_fmeasure_stderr": 0.0013632619425518314}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "generate_text_restaurant", "rougeLsum_precision": 0.21561354041284292, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "1acabbc3-c9b9-4624-a684-29faeccff46f", "prompt_jinja": "Given the following data about a restaurant:\n{% for feature in meaning_representation.split(\"]\") %} {% set key = feature.split(\"[\")[0].replace(\",\",\"\") %} {% set value = feature.replace(\",\",\"\").replace(key+\"[\", '''') %}\n{% if value != \"\" %} {{key}} : {{value}} {% endif %}\n{%- endfor %}\nGenerate some text about this restaurant. ||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rougeLsum_precision_stderr": 0.0016830247054440658}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "generate_text_restaurant", "rougeLsum_recall": 0.3919032879874159, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "1acabbc3-c9b9-4624-a684-29faeccff46f", "prompt_jinja": "Given the following data about a restaurant:\n{% for feature in meaning_representation.split(\"]\") %} {% set key = feature.split(\"[\")[0].replace(\",\",\"\") %} {% set value = feature.replace(\",\",\"\").replace(key+\"[\", '''') %}\n{% if value != \"\" %} {{key}} : {{value}} {% endif %}\n{%- endfor %}\nGenerate some text about this restaurant. 
||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rougeLsum_recall_stderr": 0.002632582004466588}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "generate_text_restaurant", "rougeLsum_fmeasure": 0.2689516848427162, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "1acabbc3-c9b9-4624-a684-29faeccff46f", "prompt_jinja": "Given the following data about a restaurant:\n{% for feature in meaning_representation.split(\"]\") %} {% set key = feature.split(\"[\")[0].replace(\",\",\"\") %} {% set value = feature.replace(\",\",\"\").replace(key+\"[\", '''') %}\n{% if value != \"\" %} {{key}} : {{value}} {% endif %}\n{%- endfor %}\nGenerate some text about this restaurant. ||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rougeLsum_fmeasure_stderr": 0.0017385224354802351}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-2b8-55b-c4seeds/2b855b9bc4seed1/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 3, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}}
2b855b9bc4seed1/evaluation/generation/agg.2b855b9bc4seed1_e2e_nlg_cleaned_generate_text_restaurant_4.json ADDED
@@ -0,0 +1 @@
+ {"results": [{"task_name": "e2e_nlg_cleaned", "prompt_name": "generate_text_restaurant", "bleu": 6.208222380745828, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "1acabbc3-c9b9-4624-a684-29faeccff46f", "prompt_jinja": "Given the following data about a restaurant:\n{% for feature in meaning_representation.split(\"]\") %} {% set key = feature.split(\"[\")[0].replace(\",\",\"\") %} {% set value = feature.replace(\",\",\"\").replace(key+\"[\", '''') %}\n{% if value != \"\" %} {{key}} : {{value}} {% endif %}\n{%- endfor %}\nGenerate some text about this restaurant. ||| {{human_reference}}", "prompt_original_task": true, "comment": "", "bleu_stderr": 0.09813424389228141}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "generate_text_restaurant", "rouge1_precision": 0.25403488264023616, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "1acabbc3-c9b9-4624-a684-29faeccff46f", "prompt_jinja": "Given the following data about a restaurant:\n{% for feature in meaning_representation.split(\"]\") %} {% set key = feature.split(\"[\")[0].replace(\",\",\"\") %} {% set value = feature.replace(\",\",\"\").replace(key+\"[\", '''') %}\n{% if value != \"\" %} {{key}} : {{value}} {% endif %}\n{%- endfor %}\nGenerate some text about this restaurant. ||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rouge1_precision_stderr": 0.00174969774558582}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "generate_text_restaurant", "rouge1_recall": 0.4709476911476187, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "1acabbc3-c9b9-4624-a684-29faeccff46f", "prompt_jinja": "Given the following data about a restaurant:\n{% for feature in meaning_representation.split(\"]\") %} {% set key = feature.split(\"[\")[0].replace(\",\",\"\") %} {% set value = feature.replace(\",\",\"\").replace(key+\"[\", '''') %}\n{% if value != \"\" %} {{key}} : {{value}} {% endif %}\n{%- endfor %}\nGenerate some text about this restaurant. ||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rouge1_recall_stderr": 0.002655138227534261}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "generate_text_restaurant", "rouge1_fmeasure": 0.32023650433333456, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "1acabbc3-c9b9-4624-a684-29faeccff46f", "prompt_jinja": "Given the following data about a restaurant:\n{% for feature in meaning_representation.split(\"]\") %} {% set key = feature.split(\"[\")[0].replace(\",\",\"\") %} {% set value = feature.replace(\",\",\"\").replace(key+\"[\", '''') %}\n{% if value != \"\" %} {{key}} : {{value}} {% endif %}\n{%- endfor %}\nGenerate some text about this restaurant. 
||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rouge1_fmeasure_stderr": 0.0017784408554274436}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "generate_text_restaurant", "rouge2_precision": 0.11058685749210603, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "1acabbc3-c9b9-4624-a684-29faeccff46f", "prompt_jinja": "Given the following data about a restaurant:\n{% for feature in meaning_representation.split(\"]\") %} {% set key = feature.split(\"[\")[0].replace(\",\",\"\") %} {% set value = feature.replace(\",\",\"\").replace(key+\"[\", '''') %}\n{% if value != \"\" %} {{key}} : {{value}} {% endif %}\n{%- endfor %}\nGenerate some text about this restaurant. ||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rouge2_precision_stderr": 0.0012080055063255909}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "generate_text_restaurant", "rouge2_recall": 0.21167766216290337, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "1acabbc3-c9b9-4624-a684-29faeccff46f", "prompt_jinja": "Given the following data about a restaurant:\n{% for feature in meaning_representation.split(\"]\") %} {% set key = feature.split(\"[\")[0].replace(\",\",\"\") %} {% set value = feature.replace(\",\",\"\").replace(key+\"[\", '''') %}\n{% if value != \"\" %} {{key}} : {{value}} {% endif %}\n{%- endfor %}\nGenerate some text about this restaurant. ||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rouge2_recall_stderr": 0.002144496564072756}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "generate_text_restaurant", "rouge2_fmeasure": 0.1403526845297047, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "1acabbc3-c9b9-4624-a684-29faeccff46f", "prompt_jinja": "Given the following data about a restaurant:\n{% for feature in meaning_representation.split(\"]\") %} {% set key = feature.split(\"[\")[0].replace(\",\",\"\") %} {% set value = feature.replace(\",\",\"\").replace(key+\"[\", '''') %}\n{% if value != \"\" %} {{key}} : {{value}} {% endif %}\n{%- endfor %}\nGenerate some text about this restaurant. ||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rouge2_fmeasure_stderr": 0.0013607857847417898}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "generate_text_restaurant", "rougeL_precision": 0.20576507091031662, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "1acabbc3-c9b9-4624-a684-29faeccff46f", "prompt_jinja": "Given the following data about a restaurant:\n{% for feature in meaning_representation.split(\"]\") %} {% set key = feature.split(\"[\")[0].replace(\",\",\"\") %} {% set value = feature.replace(\",\",\"\").replace(key+\"[\", '''') %}\n{% if value != \"\" %} {{key}} : {{value}} {% endif %}\n{%- endfor %}\nGenerate some text about this restaurant. 
||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rougeL_precision_stderr": 0.0012824323500063948}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "generate_text_restaurant", "rougeL_recall": 0.3889751930979724, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "1acabbc3-c9b9-4624-a684-29faeccff46f", "prompt_jinja": "Given the following data about a restaurant:\n{% for feature in meaning_representation.split(\"]\") %} {% set key = feature.split(\"[\")[0].replace(\",\",\"\") %} {% set value = feature.replace(\",\",\"\").replace(key+\"[\", '''') %}\n{% if value != \"\" %} {{key}} : {{value}} {% endif %}\n{%- endfor %}\nGenerate some text about this restaurant. ||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rougeL_recall_stderr": 0.0023221115722203652}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "generate_text_restaurant", "rougeL_fmeasure": 0.26122109578349895, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "1acabbc3-c9b9-4624-a684-29faeccff46f", "prompt_jinja": "Given the following data about a restaurant:\n{% for feature in meaning_representation.split(\"]\") %} {% set key = feature.split(\"[\")[0].replace(\",\",\"\") %} {% set value = feature.replace(\",\",\"\").replace(key+\"[\", '''') %}\n{% if value != \"\" %} {{key}} : {{value}} {% endif %}\n{%- endfor %}\nGenerate some text about this restaurant. ||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rougeL_fmeasure_stderr": 0.0013494634641885617}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "generate_text_restaurant", "rougeLsum_precision": 0.21178966636346616, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "1acabbc3-c9b9-4624-a684-29faeccff46f", "prompt_jinja": "Given the following data about a restaurant:\n{% for feature in meaning_representation.split(\"]\") %} {% set key = feature.split(\"[\")[0].replace(\",\",\"\") %} {% set value = feature.replace(\",\",\"\").replace(key+\"[\", '''') %}\n{% if value != \"\" %} {{key}} : {{value}} {% endif %}\n{%- endfor %}\nGenerate some text about this restaurant. ||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rougeLsum_precision_stderr": 0.0016557432144235155}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "generate_text_restaurant", "rougeLsum_recall": 0.393450062046956, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "1acabbc3-c9b9-4624-a684-29faeccff46f", "prompt_jinja": "Given the following data about a restaurant:\n{% for feature in meaning_representation.split(\"]\") %} {% set key = feature.split(\"[\")[0].replace(\",\",\"\") %} {% set value = feature.replace(\",\",\"\").replace(key+\"[\", '''') %}\n{% if value != \"\" %} {{key}} : {{value}} {% endif %}\n{%- endfor %}\nGenerate some text about this restaurant. 
||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rougeLsum_recall_stderr": 0.002663580316470773}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "generate_text_restaurant", "rougeLsum_fmeasure": 0.2670143871011222, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "1acabbc3-c9b9-4624-a684-29faeccff46f", "prompt_jinja": "Given the following data about a restaurant:\n{% for feature in meaning_representation.split(\"]\") %} {% set key = feature.split(\"[\")[0].replace(\",\",\"\") %} {% set value = feature.replace(\",\",\"\").replace(key+\"[\", '''') %}\n{% if value != \"\" %} {{key}} : {{value}} {% endif %}\n{%- endfor %}\nGenerate some text about this restaurant. ||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rougeLsum_fmeasure_stderr": 0.0017625078468513325}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-2b8-55b-c4seeds/2b855b9bc4seed1/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 4, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}}
2b855b9bc4seed1/evaluation/generation/agg.2b855b9bc4seed1_e2e_nlg_cleaned_generate_text_restaurant_5.json ADDED
@@ -0,0 +1 @@
+ {"results": [{"task_name": "e2e_nlg_cleaned", "prompt_name": "generate_text_restaurant", "bleu": 6.148604351164331, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "1acabbc3-c9b9-4624-a684-29faeccff46f", "prompt_jinja": "Given the following data about a restaurant:\n{% for feature in meaning_representation.split(\"]\") %} {% set key = feature.split(\"[\")[0].replace(\",\",\"\") %} {% set value = feature.replace(\",\",\"\").replace(key+\"[\", '''') %}\n{% if value != \"\" %} {{key}} : {{value}} {% endif %}\n{%- endfor %}\nGenerate some text about this restaurant. ||| {{human_reference}}", "prompt_original_task": true, "comment": "", "bleu_stderr": 0.11104852380350659}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "generate_text_restaurant", "rouge1_precision": 0.2528456956077577, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "1acabbc3-c9b9-4624-a684-29faeccff46f", "prompt_jinja": "Given the following data about a restaurant:\n{% for feature in meaning_representation.split(\"]\") %} {% set key = feature.split(\"[\")[0].replace(\",\",\"\") %} {% set value = feature.replace(\",\",\"\").replace(key+\"[\", '''') %}\n{% if value != \"\" %} {{key}} : {{value}} {% endif %}\n{%- endfor %}\nGenerate some text about this restaurant. ||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rouge1_precision_stderr": 0.0017424978255548198}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "generate_text_restaurant", "rouge1_recall": 0.473017800721126, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "1acabbc3-c9b9-4624-a684-29faeccff46f", "prompt_jinja": "Given the following data about a restaurant:\n{% for feature in meaning_representation.split(\"]\") %} {% set key = feature.split(\"[\")[0].replace(\",\",\"\") %} {% set value = feature.replace(\",\",\"\").replace(key+\"[\", '''') %}\n{% if value != \"\" %} {{key}} : {{value}} {% endif %}\n{%- endfor %}\nGenerate some text about this restaurant. ||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rouge1_recall_stderr": 0.0026123356126015254}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "generate_text_restaurant", "rouge1_fmeasure": 0.3199925141836084, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "1acabbc3-c9b9-4624-a684-29faeccff46f", "prompt_jinja": "Given the following data about a restaurant:\n{% for feature in meaning_representation.split(\"]\") %} {% set key = feature.split(\"[\")[0].replace(\",\",\"\") %} {% set value = feature.replace(\",\",\"\").replace(key+\"[\", '''') %}\n{% if value != \"\" %} {{key}} : {{value}} {% endif %}\n{%- endfor %}\nGenerate some text about this restaurant. 
||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rouge1_fmeasure_stderr": 0.0017551605827543929}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "generate_text_restaurant", "rouge2_precision": 0.10974089008299018, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "1acabbc3-c9b9-4624-a684-29faeccff46f", "prompt_jinja": "Given the following data about a restaurant:\n{% for feature in meaning_representation.split(\"]\") %} {% set key = feature.split(\"[\")[0].replace(\",\",\"\") %} {% set value = feature.replace(\",\",\"\").replace(key+\"[\", '''') %}\n{% if value != \"\" %} {{key}} : {{value}} {% endif %}\n{%- endfor %}\nGenerate some text about this restaurant. ||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rouge2_precision_stderr": 0.0011789832834562743}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "generate_text_restaurant", "rouge2_recall": 0.2119589387667116, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "1acabbc3-c9b9-4624-a684-29faeccff46f", "prompt_jinja": "Given the following data about a restaurant:\n{% for feature in meaning_representation.split(\"]\") %} {% set key = feature.split(\"[\")[0].replace(\",\",\"\") %} {% set value = feature.replace(\",\",\"\").replace(key+\"[\", '''') %}\n{% if value != \"\" %} {{key}} : {{value}} {% endif %}\n{%- endfor %}\nGenerate some text about this restaurant. ||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rouge2_recall_stderr": 0.0020903039724611107}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "generate_text_restaurant", "rouge2_fmeasure": 0.13979285541625697, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "1acabbc3-c9b9-4624-a684-29faeccff46f", "prompt_jinja": "Given the following data about a restaurant:\n{% for feature in meaning_representation.split(\"]\") %} {% set key = feature.split(\"[\")[0].replace(\",\",\"\") %} {% set value = feature.replace(\",\",\"\").replace(key+\"[\", '''') %}\n{% if value != \"\" %} {{key}} : {{value}} {% endif %}\n{%- endfor %}\nGenerate some text about this restaurant. ||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rouge2_fmeasure_stderr": 0.0013089259606601385}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "generate_text_restaurant", "rougeL_precision": 0.20463239752897472, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "1acabbc3-c9b9-4624-a684-29faeccff46f", "prompt_jinja": "Given the following data about a restaurant:\n{% for feature in meaning_representation.split(\"]\") %} {% set key = feature.split(\"[\")[0].replace(\",\",\"\") %} {% set value = feature.replace(\",\",\"\").replace(key+\"[\", '''') %}\n{% if value != \"\" %} {{key}} : {{value}} {% endif %}\n{%- endfor %}\nGenerate some text about this restaurant. 
||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rougeL_precision_stderr": 0.0012743661964013022}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "generate_text_restaurant", "rougeL_recall": 0.3904493381375762, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "1acabbc3-c9b9-4624-a684-29faeccff46f", "prompt_jinja": "Given the following data about a restaurant:\n{% for feature in meaning_representation.split(\"]\") %} {% set key = feature.split(\"[\")[0].replace(\",\",\"\") %} {% set value = feature.replace(\",\",\"\").replace(key+\"[\", '''') %}\n{% if value != \"\" %} {{key}} : {{value}} {% endif %}\n{%- endfor %}\nGenerate some text about this restaurant. ||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rougeL_recall_stderr": 0.002278919766787325}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "generate_text_restaurant", "rougeL_fmeasure": 0.2608746206749859, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "1acabbc3-c9b9-4624-a684-29faeccff46f", "prompt_jinja": "Given the following data about a restaurant:\n{% for feature in meaning_representation.split(\"]\") %} {% set key = feature.split(\"[\")[0].replace(\",\",\"\") %} {% set value = feature.replace(\",\",\"\").replace(key+\"[\", '''') %}\n{% if value != \"\" %} {{key}} : {{value}} {% endif %}\n{%- endfor %}\nGenerate some text about this restaurant. ||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rougeL_fmeasure_stderr": 0.0013206910936061742}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "generate_text_restaurant", "rougeLsum_precision": 0.21056870894114044, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "1acabbc3-c9b9-4624-a684-29faeccff46f", "prompt_jinja": "Given the following data about a restaurant:\n{% for feature in meaning_representation.split(\"]\") %} {% set key = feature.split(\"[\")[0].replace(\",\",\"\") %} {% set value = feature.replace(\",\",\"\").replace(key+\"[\", '''') %}\n{% if value != \"\" %} {{key}} : {{value}} {% endif %}\n{%- endfor %}\nGenerate some text about this restaurant. ||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rougeLsum_precision_stderr": 0.001637983619116459}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "generate_text_restaurant", "rougeLsum_recall": 0.39500142434961627, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "1acabbc3-c9b9-4624-a684-29faeccff46f", "prompt_jinja": "Given the following data about a restaurant:\n{% for feature in meaning_representation.split(\"]\") %} {% set key = feature.split(\"[\")[0].replace(\",\",\"\") %} {% set value = feature.replace(\",\",\"\").replace(key+\"[\", '''') %}\n{% if value != \"\" %} {{key}} : {{value}} {% endif %}\n{%- endfor %}\nGenerate some text about this restaurant. 
||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rougeLsum_recall_stderr": 0.002625905199940031}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "generate_text_restaurant", "rougeLsum_fmeasure": 0.26667287137591805, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "1acabbc3-c9b9-4624-a684-29faeccff46f", "prompt_jinja": "Given the following data about a restaurant:\n{% for feature in meaning_representation.split(\"]\") %} {% set key = feature.split(\"[\")[0].replace(\",\",\"\") %} {% set value = feature.replace(\",\",\"\").replace(key+\"[\", '''') %}\n{% if value != \"\" %} {{key}} : {{value}} {% endif %}\n{%- endfor %}\nGenerate some text about this restaurant. ||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rougeLsum_fmeasure_stderr": 0.001737453548353309}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-2b8-55b-c4seeds/2b855b9bc4seed1/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 5, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}}
2b855b9bc4seed1/evaluation/generation/agg.2b855b9bc4seed1_gem_xsum_article_DOC_summary_3.json ADDED
@@ -0,0 +1 @@
+ {"results": [{"task_name": "gem_xsum", "prompt_name": "article_DOC_summary", "rouge1_precision": 0.1131657921685037, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "a8d4ecfa-c944-44d5-878c-04fd5db59e64", "prompt_jinja": "Article: {{document}}\n\nSummary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge1_precision_stderr": 0.0018249204798791675}, {"task_name": "gem_xsum", "prompt_name": "article_DOC_summary", "rouge1_recall": 0.2722598710120147, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "a8d4ecfa-c944-44d5-878c-04fd5db59e64", "prompt_jinja": "Article: {{document}}\n\nSummary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge1_recall_stderr": 0.004133665114298229}, {"task_name": "gem_xsum", "prompt_name": "article_DOC_summary", "rouge1_fmeasure": 0.15647200121804894, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "a8d4ecfa-c944-44d5-878c-04fd5db59e64", "prompt_jinja": "Article: {{document}}\n\nSummary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge1_fmeasure_stderr": 0.0023727238595388817}, {"task_name": "gem_xsum", "prompt_name": "article_DOC_summary", "rouge2_precision": 0.019363405482896792, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "a8d4ecfa-c944-44d5-878c-04fd5db59e64", "prompt_jinja": "Article: {{document}}\n\nSummary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge2_precision_stderr": 0.0009105840932812386}, {"task_name": "gem_xsum", "prompt_name": "article_DOC_summary", "rouge2_recall": 0.049210613825466105, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "a8d4ecfa-c944-44d5-878c-04fd5db59e64", "prompt_jinja": "Article: {{document}}\n\nSummary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge2_recall_stderr": 0.002366221262626497}, {"task_name": "gem_xsum", "prompt_name": "article_DOC_summary", "rouge2_fmeasure": 0.027201929614022874, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "a8d4ecfa-c944-44d5-878c-04fd5db59e64", "prompt_jinja": "Article: {{document}}\n\nSummary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge2_fmeasure_stderr": 0.0012539141722579825}, {"task_name": "gem_xsum", "prompt_name": "article_DOC_summary", "rougeL_precision": 0.08876222177060916, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "a8d4ecfa-c944-44d5-878c-04fd5db59e64", "prompt_jinja": "Article: {{document}}\n\nSummary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeL_precision_stderr": 0.0013761155158951413}, {"task_name": "gem_xsum", "prompt_name": "article_DOC_summary", "rougeL_recall": 0.21448631404181784, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "a8d4ecfa-c944-44d5-878c-04fd5db59e64", "prompt_jinja": "Article: {{document}}\n\nSummary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeL_recall_stderr": 0.003198193852945419}, {"task_name": "gem_xsum", "prompt_name": "article_DOC_summary", "rougeL_fmeasure": 0.12268909574632023, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", 
"prompt_id": "a8d4ecfa-c944-44d5-878c-04fd5db59e64", "prompt_jinja": "Article: {{document}}\n\nSummary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeL_fmeasure_stderr": 0.001773225086338}, {"task_name": "gem_xsum", "prompt_name": "article_DOC_summary", "rougeLsum_precision": 0.09102722612426341, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "a8d4ecfa-c944-44d5-878c-04fd5db59e64", "prompt_jinja": "Article: {{document}}\n\nSummary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeLsum_precision_stderr": 0.0014951062687195294}, {"task_name": "gem_xsum", "prompt_name": "article_DOC_summary", "rougeLsum_recall": 0.22021791608209987, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "a8d4ecfa-c944-44d5-878c-04fd5db59e64", "prompt_jinja": "Article: {{document}}\n\nSummary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeLsum_recall_stderr": 0.003544530658562865}, {"task_name": "gem_xsum", "prompt_name": "article_DOC_summary", "rougeLsum_fmeasure": 0.12593169516392616, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "a8d4ecfa-c944-44d5-878c-04fd5db59e64", "prompt_jinja": "Article: {{document}}\n\nSummary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeLsum_fmeasure_stderr": 0.001962773566689985}, {"task_name": "gem_xsum", "prompt_name": "article_DOC_summary", "bleu": 1.1532375468354903, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "a8d4ecfa-c944-44d5-878c-04fd5db59e64", "prompt_jinja": "Article: {{document}}\n\nSummary: ||| {{target}}", "prompt_original_task": true, "comment": "", "bleu_stderr": 0.09169402373692534}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-2b8-55b-c4seeds/2b855b9bc4seed1/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 3, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}}
2b855b9bc4seed1/evaluation/generation/agg.2b855b9bc4seed1_gem_xsum_article_DOC_summary_4.json ADDED
@@ -0,0 +1 @@
+ {"results": [{"task_name": "gem_xsum", "prompt_name": "article_DOC_summary", "rouge1_precision": 0.0383955897314317, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "a8d4ecfa-c944-44d5-878c-04fd5db59e64", "prompt_jinja": "Article: {{document}}\n\nSummary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge1_precision_stderr": 0.0025422863710305266}, {"task_name": "gem_xsum", "prompt_name": "article_DOC_summary", "rouge1_recall": 0.06915759782304308, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "a8d4ecfa-c944-44d5-878c-04fd5db59e64", "prompt_jinja": "Article: {{document}}\n\nSummary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge1_recall_stderr": 0.0039608171509673586}, {"task_name": "gem_xsum", "prompt_name": "article_DOC_summary", "rouge1_fmeasure": 0.04426046707797172, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "a8d4ecfa-c944-44d5-878c-04fd5db59e64", "prompt_jinja": "Article: {{document}}\n\nSummary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge1_fmeasure_stderr": 0.0024997989992097624}, {"task_name": "gem_xsum", "prompt_name": "article_DOC_summary", "rouge2_precision": 0.006481358010154669, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "a8d4ecfa-c944-44d5-878c-04fd5db59e64", "prompt_jinja": "Article: {{document}}\n\nSummary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge2_precision_stderr": 0.0007393187434393857}, {"task_name": "gem_xsum", "prompt_name": "article_DOC_summary", "rouge2_recall": 0.013044537342091751, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "a8d4ecfa-c944-44d5-878c-04fd5db59e64", "prompt_jinja": "Article: {{document}}\n\nSummary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge2_recall_stderr": 0.00125152402986618}, {"task_name": "gem_xsum", "prompt_name": "article_DOC_summary", "rouge2_fmeasure": 0.008141029291990268, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "a8d4ecfa-c944-44d5-878c-04fd5db59e64", "prompt_jinja": "Article: {{document}}\n\nSummary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge2_fmeasure_stderr": 0.0008202019731008721}, {"task_name": "gem_xsum", "prompt_name": "article_DOC_summary", "rougeL_precision": 0.031113719615522017, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "a8d4ecfa-c944-44d5-878c-04fd5db59e64", "prompt_jinja": "Article: {{document}}\n\nSummary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeL_precision_stderr": 0.0021898843752975964}, {"task_name": "gem_xsum", "prompt_name": "article_DOC_summary", "rougeL_recall": 0.055197273257753664, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "a8d4ecfa-c944-44d5-878c-04fd5db59e64", "prompt_jinja": "Article: {{document}}\n\nSummary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeL_recall_stderr": 0.003191098630834803}, {"task_name": "gem_xsum", "prompt_name": "article_DOC_summary", "rougeL_fmeasure": 0.035265847972026074, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": 
"", "prompt_id": "a8d4ecfa-c944-44d5-878c-04fd5db59e64", "prompt_jinja": "Article: {{document}}\n\nSummary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeL_fmeasure_stderr": 0.0020006364168593496}, {"task_name": "gem_xsum", "prompt_name": "article_DOC_summary", "rougeLsum_precision": 0.03210804984465074, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "a8d4ecfa-c944-44d5-878c-04fd5db59e64", "prompt_jinja": "Article: {{document}}\n\nSummary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeLsum_precision_stderr": 0.0022623416009452243}, {"task_name": "gem_xsum", "prompt_name": "article_DOC_summary", "rougeLsum_recall": 0.05680048575290953, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "a8d4ecfa-c944-44d5-878c-04fd5db59e64", "prompt_jinja": "Article: {{document}}\n\nSummary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeLsum_recall_stderr": 0.0032931638988068055}, {"task_name": "gem_xsum", "prompt_name": "article_DOC_summary", "rougeLsum_fmeasure": 0.03632930315349291, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "a8d4ecfa-c944-44d5-878c-04fd5db59e64", "prompt_jinja": "Article: {{document}}\n\nSummary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeLsum_fmeasure_stderr": 0.0020756841173550696}, {"task_name": "gem_xsum", "prompt_name": "article_DOC_summary", "bleu": 0.6055404023350197, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "a8d4ecfa-c944-44d5-878c-04fd5db59e64", "prompt_jinja": "Article: {{document}}\n\nSummary: ||| {{target}}", "prompt_original_task": true, "comment": "", "bleu_stderr": 0.09409517819503949}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-2b8-55b-c4seeds/2b855b9bc4seed1/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 4, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}}
2b855b9bc4seed1/evaluation/generation/agg.2b855b9bc4seed1_gem_xsum_article_DOC_summary_5.json ADDED
@@ -0,0 +1 @@
+ {"results": [{"task_name": "gem_xsum", "prompt_name": "article_DOC_summary", "rouge1_precision": 0.002821793157093116, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "a8d4ecfa-c944-44d5-878c-04fd5db59e64", "prompt_jinja": "Article: {{document}}\n\nSummary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge1_precision_stderr": 0.0008106295472902334}, {"task_name": "gem_xsum", "prompt_name": "article_DOC_summary", "rouge1_recall": 0.0022851464113943067, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "a8d4ecfa-c944-44d5-878c-04fd5db59e64", "prompt_jinja": "Article: {{document}}\n\nSummary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge1_recall_stderr": 0.0006484962567584845}, {"task_name": "gem_xsum", "prompt_name": "article_DOC_summary", "rouge1_fmeasure": 0.0024831420596845987, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "a8d4ecfa-c944-44d5-878c-04fd5db59e64", "prompt_jinja": "Article: {{document}}\n\nSummary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge1_fmeasure_stderr": 0.0007070331562075694}, {"task_name": "gem_xsum", "prompt_name": "article_DOC_summary", "rouge2_precision": 0.0002914830883754302, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "a8d4ecfa-c944-44d5-878c-04fd5db59e64", "prompt_jinja": "Article: {{document}}\n\nSummary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge2_precision_stderr": 0.00020340968454234058}, {"task_name": "gem_xsum", "prompt_name": "article_DOC_summary", "rouge2_recall": 0.00022401756585996315, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "a8d4ecfa-c944-44d5-878c-04fd5db59e64", "prompt_jinja": "Article: {{document}}\n\nSummary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge2_recall_stderr": 0.0001545711517822221}, {"task_name": "gem_xsum", "prompt_name": "article_DOC_summary", "rouge2_fmeasure": 0.0002519246692387647, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "a8d4ecfa-c944-44d5-878c-04fd5db59e64", "prompt_jinja": "Article: {{document}}\n\nSummary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge2_fmeasure_stderr": 0.00017508831809454146}, {"task_name": "gem_xsum", "prompt_name": "article_DOC_summary", "rougeL_precision": 0.0021547453202339587, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "a8d4ecfa-c944-44d5-878c-04fd5db59e64", "prompt_jinja": "Article: {{document}}\n\nSummary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeL_precision_stderr": 0.0006563220446223475}, {"task_name": "gem_xsum", "prompt_name": "article_DOC_summary", "rougeL_recall": 0.0017708458456009723, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "a8d4ecfa-c944-44d5-878c-04fd5db59e64", "prompt_jinja": "Article: {{document}}\n\nSummary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeL_recall_stderr": 0.0005325445996971974}, {"task_name": "gem_xsum", "prompt_name": "article_DOC_summary", "rougeL_fmeasure": 0.001910475794160408, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": 
null, "subset": "", "prompt_id": "a8d4ecfa-c944-44d5-878c-04fd5db59e64", "prompt_jinja": "Article: {{document}}\n\nSummary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeL_fmeasure_stderr": 0.0005775932958294451}, {"task_name": "gem_xsum", "prompt_name": "article_DOC_summary", "rougeLsum_precision": 0.002307213397230338, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "a8d4ecfa-c944-44d5-878c-04fd5db59e64", "prompt_jinja": "Article: {{document}}\n\nSummary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeLsum_precision_stderr": 0.0006934611042748366}, {"task_name": "gem_xsum", "prompt_name": "article_DOC_summary", "rougeLsum_recall": 0.001892317665185999, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "a8d4ecfa-c944-44d5-878c-04fd5db59e64", "prompt_jinja": "Article: {{document}}\n\nSummary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeLsum_recall_stderr": 0.0005664327179419651}, {"task_name": "gem_xsum", "prompt_name": "article_DOC_summary", "rougeLsum_fmeasure": 0.0020447516574242644, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "a8d4ecfa-c944-44d5-878c-04fd5db59e64", "prompt_jinja": "Article: {{document}}\n\nSummary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeLsum_fmeasure_stderr": 0.000612874204929875}, {"task_name": "gem_xsum", "prompt_name": "article_DOC_summary", "bleu": 1.1094899638874875e-37, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "a8d4ecfa-c944-44d5-878c-04fd5db59e64", "prompt_jinja": "Article: {{document}}\n\nSummary: ||| {{target}}", "prompt_original_task": true, "comment": "", "bleu_stderr": 1.0662006738968273e-31}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-2b8-55b-c4seeds/2b855b9bc4seed1/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 5, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}}
2b855b9bc4seed1/evaluation/generation/examples.2b855b9bc4seed1_GEM-web_nlg_en_PALM_prompt_2.jsonl ADDED
File without changes
2b855b9bc4seed1/evaluation/generation/examples.2b855b9bc4seed1_GEM-web_nlg_en_PALM_prompt_3.jsonl ADDED
File without changes
2b855b9bc4seed1/evaluation/generation/examples.2b855b9bc4seed1_GEM-web_nlg_en_PALM_prompt_4.jsonl ADDED
File without changes
2b855b9bc4seed1/evaluation/generation/examples.2b855b9bc4seed1_GEM-web_nlg_en_PALM_prompt_5.jsonl ADDED
File without changes
2b855b9bc4seed1/evaluation/generation/examples.2b855b9bc4seed1_GEM-wiki_lingua_en_tldr_en_3.jsonl ADDED
File without changes
2b855b9bc4seed1/evaluation/generation/examples.2b855b9bc4seed1_GEM-wiki_lingua_en_tldr_en_4.jsonl ADDED
File without changes
2b855b9bc4seed1/evaluation/generation/examples.2b855b9bc4seed1_GEM-wiki_lingua_en_tldr_en_5.jsonl ADDED
File without changes
2b855b9bc4seed1/evaluation/generation/examples.2b855b9bc4seed1_e2e_nlg_cleaned_generate_text_restaurant_2.jsonl ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:b9cf841b226161291c93621cdea07f208242bd698e73f01ef6161119990c2a8f
+ size 6555356
2b855b9bc4seed1/evaluation/generation/examples.2b855b9bc4seed1_e2e_nlg_cleaned_generate_text_restaurant_3.jsonl ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:1e33e5982a5cb9c474ac101018fe39437d6bd70bb49464fc9647dfc6e2394e29
+ size 7663191
2b855b9bc4seed1/evaluation/generation/examples.2b855b9bc4seed1_e2e_nlg_cleaned_generate_text_restaurant_4.jsonl ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:981366570b6f5164479911044af041ce49524c88389257a45059020bc2633b5e
+ size 8754161
2b855b9bc4seed1/evaluation/generation/examples.2b855b9bc4seed1_e2e_nlg_cleaned_generate_text_restaurant_5.jsonl ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:87012952085cdbecc185ca76605cce02fbcf84a1aad91feeb23fc3607bba6b81
+ size 9848861
2b855b9bc4seed1/evaluation/generation/examples.2b855b9bc4seed1_gem_xsum_article_DOC_summary_3.jsonl ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:84c17a6680e2df8d7ebd172c08a43c279992784452d9950bb1b0f49c9c9354f4
+ size 9642676
2b855b9bc4seed1/evaluation/generation/examples.2b855b9bc4seed1_gem_xsum_article_DOC_summary_4.jsonl ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:3538e8bc295d76ee9ad83063c989e8b90b4343cf7ffcedd2b6312b91f4d7ff32
+ size 11671805
2b855b9bc4seed1/evaluation/generation/examples.2b855b9bc4seed1_gem_xsum_article_DOC_summary_5.jsonl ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:51f6f0ca88e208ba87adb2a5d3bc46de505c984fec840ca0efd9388e72636cf6
+ size 13897450
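The examples.*.jsonl entries above are stored through Git LFS, so the diff only shows the three-line pointer stub (spec version, sha256 oid, and byte size); the actual per-example generations are fetched separately, for instance with git lfs pull. A small, illustrative parser for such a stub, assuming a local clone in which the file still holds the pointer rather than the payload:

# Minimal sketch (assumption: repo cloned locally; file still contains the LFS pointer stub).
def read_lfs_pointer(path):
    fields = {}
    with open(path) as f:
        for line in f:
            key, _, value = line.strip().partition(" ")
            fields[key] = value
    return fields

pointer = read_lfs_pointer(
    "2b855b9bc4seed1/evaluation/generation/examples.2b855b9bc4seed1_gem_xsum_article_DOC_summary_5.jsonl"
)
print(pointer["oid"], "expected bytes:", pointer["size"])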
2b855b9bc4seed1/evaluation/generation/merged.csv ADDED
@@ -0,0 +1,39 @@
+ dataset,fewshots,prompt,metric,value
+ e2e_nlg_cleaned,0,generate_text_restaurant,rouge2_fmeasure,0.0025241396159283234
+ e2e_nlg_cleaned,0,median,rouge2_fmeasure,0.0025241396159283234
+ e2e_nlg_cleaned,1,generate_text_restaurant,rouge2_fmeasure,0.162134939165695
+ e2e_nlg_cleaned,1,median,rouge2_fmeasure,0.162134939165695
+ e2e_nlg_cleaned,2,generate_text_restaurant,rouge2_fmeasure,0.13955166174444095
+ e2e_nlg_cleaned,2,median,rouge2_fmeasure,0.13955166174444095
+ e2e_nlg_cleaned,3,generate_text_restaurant,rouge2_fmeasure,0.14093386031313695
+ e2e_nlg_cleaned,3,median,rouge2_fmeasure,0.14093386031313695
+ e2e_nlg_cleaned,4,generate_text_restaurant,rouge2_fmeasure,0.1403526845297047
+ e2e_nlg_cleaned,4,median,rouge2_fmeasure,0.1403526845297047
+ e2e_nlg_cleaned,5,generate_text_restaurant,rouge2_fmeasure,0.13979285541625697
+ e2e_nlg_cleaned,5,median,rouge2_fmeasure,0.13979285541625697
+ e2e_nlg_cleaned,5,average,multiple,0.12088169013086048
+ gem_xsum,0,article_DOC_summary,rouge2_fmeasure,0.04580671342088997
+ gem_xsum,0,median,rouge2_fmeasure,0.04580671342088997
+ gem_xsum,1,article_DOC_summary,rouge2_fmeasure,0.03052402695525932
+ gem_xsum,1,median,rouge2_fmeasure,0.03052402695525932
+ gem_xsum,2,article_DOC_summary,rouge2_fmeasure,0.029102694813621793
+ gem_xsum,2,median,rouge2_fmeasure,0.029102694813621793
+ gem_xsum,3,article_DOC_summary,rouge2_fmeasure,0.027201929614022874
+ gem_xsum,3,median,rouge2_fmeasure,0.027201929614022874
+ gem_xsum,4,article_DOC_summary,rouge2_fmeasure,0.008141029291990268
+ gem_xsum,4,median,rouge2_fmeasure,0.008141029291990268
+ gem_xsum,5,article_DOC_summary,rouge2_fmeasure,0.0002519246692387647
+ gem_xsum,5,median,rouge2_fmeasure,0.0002519246692387647
+ gem_xsum,5,average,multiple,0.023504719794170497
+ web_nlg_en,0,PALM_prompt,rouge2_fmeasure,0.05142765596627262
+ web_nlg_en,0,median,rouge2_fmeasure,0.05142765596627262
+ web_nlg_en,1,PALM_prompt,rouge2_fmeasure,0.051755190143320286
+ web_nlg_en,1,median,rouge2_fmeasure,0.051755190143320286
+ web_nlg_en,1,average,multiple,0.051591423054796456
+ wiki_lingua_en,0,tldr_en,rouge2_fmeasure,0.03487145839395421
+ wiki_lingua_en,0,median,rouge2_fmeasure,0.03487145839395421
+ wiki_lingua_en,1,tldr_en,rouge2_fmeasure,0.03614437414445121
+ wiki_lingua_en,1,median,rouge2_fmeasure,0.03614437414445121
+ wiki_lingua_en,2,tldr_en,rouge2_fmeasure,0.04349341038860359
+ wiki_lingua_en,2,median,rouge2_fmeasure,0.04349341038860359
+ wiki_lingua_en,2,average,multiple,0.038169747642336334
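merged.csv flattens the per-shot results to one metric per row (rouge2_fmeasure for each prompt, its per-shot median, and a cross-shot average), which makes it the most convenient file for quick comparisons. A minimal pandas sketch, assuming the repository is available locally and pandas is installed:

import pandas as pd

# Minimal sketch (assumption: repo cloned locally and pandas installed).
df = pd.read_csv("2b855b9bc4seed1/evaluation/generation/merged.csv")
# One median row exists per (dataset, fewshots) pair, so a plain pivot works.
medians = df[df["prompt"] == "median"]
print(medians.pivot(index="fewshots", columns="dataset", values="value").round(3))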
2b855b9bc4seed1/evaluation/generation/merged.json ADDED
@@ -0,0 +1 @@
+ {"GEM/web_nlg_en": {"0": {"PALM_prompt": {"bleu": 0.4066206711982134, "bleu_stderr": 0.033723013965742, "rouge1_fmeasure": 0.10895883012469396, "rouge1_fmeasure_stderr": 0.002008528704603682, "rouge1_precision": 0.07192732542952907, "rouge1_precision_stderr": 0.0016382475971692088, "rouge1_recall": 0.30547187741815907, "rouge1_recall_stderr": 0.004666013432977962, "rouge2_fmeasure": 0.05142765596627262, "rouge2_fmeasure_stderr": 0.001246824543408159, "rouge2_precision": 0.034124224521766514, "rouge2_precision_stderr": 0.0011258346949061335, "rouge2_recall": 0.14893550564787877, "rouge2_recall_stderr": 0.0032632079712954696, "rougeL_fmeasure": 0.10554163962469583, "rougeL_fmeasure_stderr": 0.0018999401584976951, "rougeL_precision": 0.06951188299150508, "rougeL_precision_stderr": 0.0015536321007724944, "rougeL_recall": 0.29797998285094185, "rougeL_recall_stderr": 0.004578539464668976, "rougeLsum_fmeasure": 0.10415553154282955, "rougeLsum_fmeasure_stderr": 0.0018954370177080594, "rougeLsum_precision": 0.06880186502940373, "rougeLsum_precision_stderr": 0.0015660770018247742, "rougeLsum_recall": 0.2918595743249444, "rougeLsum_recall_stderr": 0.004365973340593178}}, "1": {"PALM_prompt": {"bleu": 0.441015227234624, "bleu_stderr": 0.02811928663369506, "rouge1_fmeasure": 0.11213595098636865, "rouge1_fmeasure_stderr": 0.001929589035018206, "rouge1_precision": 0.0726548726317011, "rouge1_precision_stderr": 0.0015123203660414558, "rouge1_recall": 0.3480211924440434, "rouge1_recall_stderr": 0.005007162611700053, "rouge2_fmeasure": 0.051755190143320286, "rouge2_fmeasure_stderr": 0.0012231844221965455, "rouge2_precision": 0.0335432322481532, "rouge2_precision_stderr": 0.0009695182541133993, "rouge2_recall": 0.16639593533541566, "rouge2_recall_stderr": 0.0034320944741141886, "rougeL_fmeasure": 0.10765664842911221, "rougeL_fmeasure_stderr": 0.001806223035008682, "rougeL_precision": 0.06967949917271966, "rougeL_precision_stderr": 0.0014170582927231956, "rougeL_recall": 0.3335173366403848, "rougeL_recall_stderr": 0.004760042112091419, "rougeLsum_fmeasure": 0.10716257387185534, "rougeLsum_fmeasure_stderr": 0.0018203621728808315, "rougeLsum_precision": 0.06949682130009485, "rougeLsum_precision_stderr": 0.00144169867509152, "rougeLsum_recall": 0.33110876740142015, "rougeLsum_recall_stderr": 0.004635864354540981}}}, "GEM/wiki_lingua_en": {"0": {"tldr_en": {"bleu": 1.5382561696068602, "bleu_stderr": 0.06399544328700199, "rouge1_fmeasure": 0.17482546253918968, "rouge1_fmeasure_stderr": 0.0017962093241495317, "rouge1_precision": 0.1493431846788089, "rouge1_precision_stderr": 0.0018651057951166279, "rouge1_recall": 0.2548583336571623, "rouge1_recall_stderr": 0.002528299389518057, "rouge2_fmeasure": 0.03487145839395421, "rouge2_fmeasure_stderr": 0.0008257165713416542, "rouge2_precision": 0.02958059388980271, "rouge2_precision_stderr": 0.0007357126571931123, "rouge2_recall": 0.0521633146578729, "rouge2_recall_stderr": 0.0013585554854051705, "rougeL_fmeasure": 0.13645058466848356, "rougeL_fmeasure_stderr": 0.0012505081899530457, "rougeL_precision": 0.11482322910631781, "rougeL_precision_stderr": 0.0012610070217646853, "rougeL_recall": 0.2047896774759962, "rougeL_recall_stderr": 0.0020475160072816, "rougeLsum_fmeasure": 0.16137882388270616, "rougeLsum_fmeasure_stderr": 0.0016412506052086466, "rougeLsum_precision": 0.13765195894138185, "rougeLsum_precision_stderr": 0.0017075576824449126, "rougeLsum_recall": 0.23641320862844858, "rougeLsum_recall_stderr": 0.0023610992069355388}}, "1": {"tldr_en": {"bleu": 
1.7101242197452051, "bleu_stderr": 0.04333761308856783, "rouge1_fmeasure": 0.1796806784514, "rouge1_fmeasure_stderr": 0.0018714401391125607, "rouge1_precision": 0.1550521213351438, "rouge1_precision_stderr": 0.0019541763971361132, "rouge1_recall": 0.2591201747750158, "rouge1_recall_stderr": 0.0026478655932746374, "rouge2_fmeasure": 0.03614437414445121, "rouge2_fmeasure_stderr": 0.0008460043362571757, "rouge2_precision": 0.030990812889466937, "rouge2_precision_stderr": 0.0007609017578133211, "rouge2_recall": 0.05362608829041054, "rouge2_recall_stderr": 0.001393078096579126, "rougeL_fmeasure": 0.13484320380645512, "rougeL_fmeasure_stderr": 0.001257828140076275, "rougeL_precision": 0.11485232716958933, "rougeL_precision_stderr": 0.0012878987216988057, "rougeL_recall": 0.19958644094767267, "rougeL_recall_stderr": 0.002041519133152489, "rougeLsum_fmeasure": 0.16735135112097876, "rougeLsum_fmeasure_stderr": 0.0017317067736812232, "rougeLsum_precision": 0.14415005003230233, "rougeLsum_precision_stderr": 0.0018020269635249566, "rougeLsum_recall": 0.24228585475636266, "rougeLsum_recall_stderr": 0.0024919558768600326}}, "2": {"tldr_en": {"bleu": 2.1448539044725874, "bleu_stderr": 0.06277677171239805, "rouge1_fmeasure": 0.1970048673630101, "rouge1_fmeasure_stderr": 0.001873251264352212, "rouge1_precision": 0.1705870674804116, "rouge1_precision_stderr": 0.002048796485044356, "rouge1_recall": 0.28464437226221473, "rouge1_recall_stderr": 0.0026857267394890393, "rouge2_fmeasure": 0.04349341038860359, "rouge2_fmeasure_stderr": 0.000895910412734044, "rouge2_precision": 0.0375931104436251, "rouge2_precision_stderr": 0.0008379763934403864, "rouge2_recall": 0.0642856259647273, "rouge2_recall_stderr": 0.0014690192240657321, "rougeL_fmeasure": 0.1456514344157347, "rougeL_fmeasure_stderr": 0.0012777745388170482, "rougeL_precision": 0.12469459823199261, "rougeL_precision_stderr": 0.001380245091026989, "rougeL_recall": 0.21573877115274695, "rougeL_recall_stderr": 0.002116828674796055, "rougeLsum_fmeasure": 0.18373940262798896, "rougeLsum_fmeasure_stderr": 0.001747803293779092, "rougeLsum_precision": 0.15874261945387053, "rougeLsum_precision_stderr": 0.0018989313183848545, "rougeLsum_recall": 0.26633944675442933, "rougeLsum_recall_stderr": 0.0025440410206595122}}}, "e2e_nlg_cleaned": {"0": {"generate_text_restaurant": {"bleu": 0.23848794691656622, "bleu_stderr": 0.03586833580767261, "rouge1_fmeasure": 0.02476955743708725, "rouge1_fmeasure_stderr": 0.0007710556690163732, "rouge1_precision": 0.019286698361945742, "rouge1_precision_stderr": 0.0006077660476531964, "rouge1_recall": 0.03771027274112491, "rouge1_recall_stderr": 0.0012643509148260043, "rouge2_fmeasure": 0.0025241396159283234, "rouge2_fmeasure_stderr": 0.00028150795832003375, "rouge2_precision": 0.0018866790365766187, "rouge2_precision_stderr": 0.0002143326315912907, "rouge2_recall": 0.004268812953176167, "rouge2_recall_stderr": 0.00046738906149789073, "rougeL_fmeasure": 0.02408262386824008, "rougeL_fmeasure_stderr": 0.0007136995922629838, "rougeL_precision": 0.018711596296599558, "rougeL_precision_stderr": 0.000549656942001388, "rougeL_recall": 0.036704663603431754, "rougeL_recall_stderr": 0.0011874696198706125, "rougeLsum_fmeasure": 0.022774229967586195, "rougeLsum_fmeasure_stderr": 0.0006671086701287083, "rougeLsum_precision": 0.017739797167011156, "rougeLsum_precision_stderr": 0.0005177890835136993, "rougeLsum_recall": 0.03456546163273569, "rougeLsum_recall_stderr": 0.0011034522576739757}}, "1": {"generate_text_restaurant": {"bleu": 7.611947046679122, 
"bleu_stderr": 0.1143274370287527, "rouge1_fmeasure": 0.37951927025471477, "rouge1_fmeasure_stderr": 0.002437652542254183, "rouge1_precision": 0.39546550497235883, "rouge1_precision_stderr": 0.00338263349515896, "rouge1_recall": 0.41781588842973927, "rouge1_recall_stderr": 0.0028231707574303336, "rouge2_fmeasure": 0.162134939165695, "rouge2_fmeasure_stderr": 0.0018063315065409062, "rouge2_precision": 0.17176180426713916, "rouge2_precision_stderr": 0.002231883030057573, "rouge2_recall": 0.17748075850051998, "rouge2_recall_stderr": 0.0020235877252369762, "rougeL_fmeasure": 0.2827204639054135, "rougeL_fmeasure_stderr": 0.001840892993788695, "rougeL_precision": 0.29255451415473116, "rougeL_precision_stderr": 0.0025475028016928546, "rougeL_recall": 0.31556464887995433, "rougeL_recall_stderr": 0.0023200773160819157, "rougeLsum_fmeasure": 0.3085637675341545, "rougeLsum_fmeasure_stderr": 0.002263042771686147, "rougeLsum_precision": 0.3218545869658081, "rougeLsum_precision_stderr": 0.0030063312376711593, "rougeLsum_recall": 0.33940243915149054, "rougeLsum_recall_stderr": 0.002593279963684452}}, "2": {"generate_text_restaurant": {"bleu": 6.018041940580495, "bleu_stderr": 0.07197788580096429, "rouge1_fmeasure": 0.32521240877335217, "rouge1_fmeasure_stderr": 0.0019278160903662194, "rouge1_precision": 0.26958144890836977, "rouge1_precision_stderr": 0.0021540233079166827, "rouge1_recall": 0.4588728570611039, "rouge1_recall_stderr": 0.0027449063068318643, "rouge2_fmeasure": 0.13955166174444095, "rouge2_fmeasure_stderr": 0.0014269312475032364, "rouge2_precision": 0.11570444930629259, "rouge2_precision_stderr": 0.0014199073558424482, "rouge2_recall": 0.2004243711941686, "rouge2_recall_stderr": 0.0021042737791853398, "rougeL_fmeasure": 0.262564054866777, "rougeL_fmeasure_stderr": 0.0014381342603292985, "rougeL_precision": 0.2156185303251572, "rougeL_precision_stderr": 0.0015614536366490293, "rougeL_recall": 0.3759398467983639, "rougeL_recall_stderr": 0.002381221921078589, "rougeLsum_fmeasure": 0.2680607212245618, "rougeLsum_fmeasure_stderr": 0.0018300140862428237, "rougeLsum_precision": 0.22224277917141733, "rougeLsum_precision_stderr": 0.0019418664704165772, "rougeLsum_recall": 0.3788810896543758, "rougeLsum_recall_stderr": 0.0026592931094796103}}, "3": {"generate_text_restaurant": {"bleu": 6.247143410389527, "bleu_stderr": 0.09336005465042321, "rouge1_fmeasure": 0.32338937321619443, "rouge1_fmeasure_stderr": 0.0017934056667561753, "rouge1_precision": 0.25907184832717167, "rouge1_precision_stderr": 0.001803996044494107, "rouge1_recall": 0.4710593035922337, "rouge1_recall_stderr": 0.0027215741063189363, "rouge2_fmeasure": 0.14093386031313695, "rouge2_fmeasure_stderr": 0.0013638568945346288, "rouge2_precision": 0.11204434793169746, "rouge2_precision_stderr": 0.0012125669003284166, "rouge2_recall": 0.21007032469689518, "rouge2_recall_stderr": 0.0021257583048451844, "rougeL_fmeasure": 0.2638096769634126, "rougeL_fmeasure_stderr": 0.0013632619425518314, "rougeL_precision": 0.2101184628317088, "rougeL_precision_stderr": 0.001355833935975265, "rougeL_recall": 0.3886278435049077, "rougeL_recall_stderr": 0.0023407273478071285, "rougeLsum_fmeasure": 0.2689516848427162, "rougeLsum_fmeasure_stderr": 0.0017385224354802351, "rougeLsum_precision": 0.21561354041284292, "rougeLsum_precision_stderr": 0.0016830247054440658, "rougeLsum_recall": 0.3919032879874159, "rougeLsum_recall_stderr": 0.002632582004466588}}, "4": {"generate_text_restaurant": {"bleu": 6.208222380745828, "bleu_stderr": 0.09813424389228141, 
"rouge1_fmeasure": 0.32023650433333456, "rouge1_fmeasure_stderr": 0.0017784408554274436, "rouge1_precision": 0.25403488264023616, "rouge1_precision_stderr": 0.00174969774558582, "rouge1_recall": 0.4709476911476187, "rouge1_recall_stderr": 0.002655138227534261, "rouge2_fmeasure": 0.1403526845297047, "rouge2_fmeasure_stderr": 0.0013607857847417898, "rouge2_precision": 0.11058685749210603, "rouge2_precision_stderr": 0.0012080055063255909, "rouge2_recall": 0.21167766216290337, "rouge2_recall_stderr": 0.002144496564072756, "rougeL_fmeasure": 0.26122109578349895, "rougeL_fmeasure_stderr": 0.0013494634641885617, "rougeL_precision": 0.20576507091031662, "rougeL_precision_stderr": 0.0012824323500063948, "rougeL_recall": 0.3889751930979724, "rougeL_recall_stderr": 0.0023221115722203652, "rougeLsum_fmeasure": 0.2670143871011222, "rougeLsum_fmeasure_stderr": 0.0017625078468513325, "rougeLsum_precision": 0.21178966636346616, "rougeLsum_precision_stderr": 0.0016557432144235155, "rougeLsum_recall": 0.393450062046956, "rougeLsum_recall_stderr": 0.002663580316470773}}, "5": {"generate_text_restaurant": {"bleu": 6.148604351164331, "bleu_stderr": 0.11104852380350659, "rouge1_fmeasure": 0.3199925141836084, "rouge1_fmeasure_stderr": 0.0017551605827543929, "rouge1_precision": 0.2528456956077577, "rouge1_precision_stderr": 0.0017424978255548198, "rouge1_recall": 0.473017800721126, "rouge1_recall_stderr": 0.0026123356126015254, "rouge2_fmeasure": 0.13979285541625697, "rouge2_fmeasure_stderr": 0.0013089259606601385, "rouge2_precision": 0.10974089008299018, "rouge2_precision_stderr": 0.0011789832834562743, "rouge2_recall": 0.2119589387667116, "rouge2_recall_stderr": 0.0020903039724611107, "rougeL_fmeasure": 0.2608746206749859, "rougeL_fmeasure_stderr": 0.0013206910936061742, "rougeL_precision": 0.20463239752897472, "rougeL_precision_stderr": 0.0012743661964013022, "rougeL_recall": 0.3904493381375762, "rougeL_recall_stderr": 0.002278919766787325, "rougeLsum_fmeasure": 0.26667287137591805, "rougeLsum_fmeasure_stderr": 0.001737453548353309, "rougeLsum_precision": 0.21056870894114044, "rougeLsum_precision_stderr": 0.001637983619116459, "rougeLsum_recall": 0.39500142434961627, "rougeLsum_recall_stderr": 0.002625905199940031}}}, "gem_xsum": {"0": {"article_DOC_summary": {"bleu": 1.8439516808341079, "bleu_stderr": 0.09422907248075703, "rouge1_fmeasure": 0.20643072459177322, "rouge1_fmeasure_stderr": 0.0023952712703594957, "rouge1_precision": 0.1489888413499883, "rouge1_precision_stderr": 0.0018521966676135623, "rouge1_recall": 0.35598499223601365, "rouge1_recall_stderr": 0.004207211166343017, "rouge2_fmeasure": 0.04580671342088997, "rouge2_fmeasure_stderr": 0.0015030605732593261, "rouge2_precision": 0.0324623282960817, "rouge2_precision_stderr": 0.0010674592282863239, "rouge2_recall": 0.0820809502686687, "rouge2_recall_stderr": 0.002813424567852905, "rougeL_fmeasure": 0.15483993335255772, "rougeL_fmeasure_stderr": 0.0018017433912559451, "rougeL_precision": 0.11153813287033311, "rougeL_precision_stderr": 0.001374306634500298, "rougeL_recall": 0.26864922067574026, "rougeL_recall_stderr": 0.0033157904418926283, "rougeLsum_fmeasure": 0.16399947665135114, "rougeLsum_fmeasure_stderr": 0.002023909905331339, "rougeLsum_precision": 0.1179757836593057, "rougeLsum_precision_stderr": 0.0015154466049348198, "rougeLsum_recall": 0.2847650880272033, "rougeLsum_recall_stderr": 0.003712939966528145}}, "1": {"article_DOC_summary": {"bleu": 1.247854057178413, "bleu_stderr": 0.053793972634237734, "rouge1_fmeasure": 0.1655189451131417, 
"rouge1_fmeasure_stderr": 0.0023018941372401455, "rouge1_precision": 0.1177467243515202, "rouge1_precision_stderr": 0.0017455506749855451, "rouge1_recall": 0.29171927966859307, "rouge1_recall_stderr": 0.003919612515883716, "rouge2_fmeasure": 0.03052402695525932, "rouge2_fmeasure_stderr": 0.0012776222243206263, "rouge2_precision": 0.02141171153130294, "rouge2_precision_stderr": 0.0008955978971257212, "rouge2_recall": 0.05531529147706192, "rouge2_recall_stderr": 0.002372274102893343, "rougeL_fmeasure": 0.13085315336542225, "rougeL_fmeasure_stderr": 0.0017506812383595754, "rougeL_precision": 0.0927268161322543, "rougeL_precision_stderr": 0.0012892522930530307, "rougeL_recall": 0.232421966160188, "rougeL_recall_stderr": 0.0031136409947685123, "rougeLsum_fmeasure": 0.13257722034008568, "rougeLsum_fmeasure_stderr": 0.0018892826906714033, "rougeLsum_precision": 0.09395615061271292, "rougeLsum_precision_stderr": 0.0013824641506762928, "rougeLsum_recall": 0.23524314570007646, "rougeLsum_recall_stderr": 0.0033502788795521077}}, "2": {"article_DOC_summary": {"bleu": 1.1975588661664014, "bleu_stderr": 0.06263558549987819, "rouge1_fmeasure": 0.165998653508353, "rouge1_fmeasure_stderr": 0.002288522145343238, "rouge1_precision": 0.1177626659310641, "rouge1_precision_stderr": 0.001707946986197866, "rouge1_recall": 0.2928721346826048, "rouge1_recall_stderr": 0.0039006436966328977, "rouge2_fmeasure": 0.029102694813621793, "rouge2_fmeasure_stderr": 0.0012269354923261043, "rouge2_precision": 0.020528446471707117, "rouge2_precision_stderr": 0.000867911954165022, "rouge2_recall": 0.052169823470780564, "rouge2_recall_stderr": 0.0022808799038575088, "rougeL_fmeasure": 0.12917404609126437, "rougeL_fmeasure_stderr": 0.0016814421892826735, "rougeL_precision": 0.09145048851843915, "rougeL_precision_stderr": 0.0012506279990524979, "rougeL_recall": 0.22942628081026484, "rougeL_recall_stderr": 0.002973056999961359, "rougeLsum_fmeasure": 0.13296050139540747, "rougeLsum_fmeasure_stderr": 0.001845998235299578, "rougeLsum_precision": 0.0940902852266538, "rougeLsum_precision_stderr": 0.0013601724232084945, "rougeLsum_recall": 0.23630864651546127, "rougeLsum_recall_stderr": 0.003290166668177434}}, "3": {"article_DOC_summary": {"bleu": 1.1532375468354903, "bleu_stderr": 0.09169402373692534, "rouge1_fmeasure": 0.15647200121804894, "rouge1_fmeasure_stderr": 0.0023727238595388817, "rouge1_precision": 0.1131657921685037, "rouge1_precision_stderr": 0.0018249204798791675, "rouge1_recall": 0.2722598710120147, "rouge1_recall_stderr": 0.004133665114298229, "rouge2_fmeasure": 0.027201929614022874, "rouge2_fmeasure_stderr": 0.0012539141722579825, "rouge2_precision": 0.019363405482896792, "rouge2_precision_stderr": 0.0009105840932812386, "rouge2_recall": 0.049210613825466105, "rouge2_recall_stderr": 0.002366221262626497, "rougeL_fmeasure": 0.12268909574632023, "rougeL_fmeasure_stderr": 0.001773225086338, "rougeL_precision": 0.08876222177060916, "rougeL_precision_stderr": 0.0013761155158951413, "rougeL_recall": 0.21448631404181784, "rougeL_recall_stderr": 0.003198193852945419, "rougeLsum_fmeasure": 0.12593169516392616, "rougeLsum_fmeasure_stderr": 0.001962773566689985, "rougeLsum_precision": 0.09102722612426341, "rougeLsum_precision_stderr": 0.0014951062687195294, "rougeLsum_recall": 0.22021791608209987, "rougeLsum_recall_stderr": 0.003544530658562865}}, "4": {"article_DOC_summary": {"bleu": 0.6055404023350197, "bleu_stderr": 0.09409517819503949, "rouge1_fmeasure": 0.04426046707797172, "rouge1_fmeasure_stderr": 0.0024997989992097624, 
"rouge1_precision": 0.0383955897314317, "rouge1_precision_stderr": 0.0025422863710305266, "rouge1_recall": 0.06915759782304308, "rouge1_recall_stderr": 0.0039608171509673586, "rouge2_fmeasure": 0.008141029291990268, "rouge2_fmeasure_stderr": 0.0008202019731008721, "rouge2_precision": 0.006481358010154669, "rouge2_precision_stderr": 0.0007393187434393857, "rouge2_recall": 0.013044537342091751, "rouge2_recall_stderr": 0.00125152402986618, "rougeL_fmeasure": 0.035265847972026074, "rougeL_fmeasure_stderr": 0.0020006364168593496, "rougeL_precision": 0.031113719615522017, "rougeL_precision_stderr": 0.0021898843752975964, "rougeL_recall": 0.055197273257753664, "rougeL_recall_stderr": 0.003191098630834803, "rougeLsum_fmeasure": 0.03632930315349291, "rougeLsum_fmeasure_stderr": 0.0020756841173550696, "rougeLsum_precision": 0.03210804984465074, "rougeLsum_precision_stderr": 0.0022623416009452243, "rougeLsum_recall": 0.05680048575290953, "rougeLsum_recall_stderr": 0.0032931638988068055}}, "5": {"article_DOC_summary": {"bleu": 1.1094899638874875e-37, "bleu_stderr": 1.0662006738968273e-31, "rouge1_fmeasure": 0.0024831420596845987, "rouge1_fmeasure_stderr": 0.0007070331562075694, "rouge1_precision": 0.002821793157093116, "rouge1_precision_stderr": 0.0008106295472902334, "rouge1_recall": 0.0022851464113943067, "rouge1_recall_stderr": 0.0006484962567584845, "rouge2_fmeasure": 0.0002519246692387647, "rouge2_fmeasure_stderr": 0.00017508831809454146, "rouge2_precision": 0.0002914830883754302, "rouge2_precision_stderr": 0.00020340968454234058, "rouge2_recall": 0.00022401756585996315, "rouge2_recall_stderr": 0.0001545711517822221, "rougeL_fmeasure": 0.001910475794160408, "rougeL_fmeasure_stderr": 0.0005775932958294451, "rougeL_precision": 0.0021547453202339587, "rougeL_precision_stderr": 0.0006563220446223475, "rougeL_recall": 0.0017708458456009723, "rougeL_recall_stderr": 0.0005325445996971974, "rougeLsum_fmeasure": 0.0020447516574242644, "rougeLsum_fmeasure_stderr": 0.000612874204929875, "rougeLsum_precision": 0.002307213397230338, "rougeLsum_precision_stderr": 0.0006934611042748366, "rougeLsum_recall": 0.001892317665185999, "rougeLsum_recall_stderr": 0.0005664327179419651}}}}
2b855b9bc4seed1/evaluation/generation/slim.2b855b9bc4seed1_e2e_nlg_cleaned_generate_text_restaurant_2.json ADDED
@@ -0,0 +1,133 @@
+ {
2
+ "results": [
3
+ {
4
+ "task_name": "e2e_nlg_cleaned",
5
+ "prompt_name": "generate_text_restaurant",
6
+ "bleu": 6.018041940580495,
7
+ "dataset_path": "e2e_nlg_cleaned",
8
+ "dataset_name": null,
9
+ "subset": null,
10
+ "bleu_stderr": 0.07197788580096429
11
+ },
12
+ {
13
+ "task_name": "e2e_nlg_cleaned",
14
+ "prompt_name": "generate_text_restaurant",
15
+ "rouge1_precision": 0.26958144890836977,
16
+ "dataset_path": "e2e_nlg_cleaned",
17
+ "dataset_name": null,
18
+ "subset": null,
19
+ "rouge1_precision_stderr": 0.0021540233079166827
20
+ },
21
+ {
22
+ "task_name": "e2e_nlg_cleaned",
23
+ "prompt_name": "generate_text_restaurant",
24
+ "rouge1_recall": 0.4588728570611039,
25
+ "dataset_path": "e2e_nlg_cleaned",
26
+ "dataset_name": null,
27
+ "subset": null,
28
+ "rouge1_recall_stderr": 0.0027449063068318643
29
+ },
30
+ {
31
+ "task_name": "e2e_nlg_cleaned",
32
+ "prompt_name": "generate_text_restaurant",
33
+ "rouge1_fmeasure": 0.32521240877335217,
34
+ "dataset_path": "e2e_nlg_cleaned",
35
+ "dataset_name": null,
36
+ "subset": null,
37
+ "rouge1_fmeasure_stderr": 0.0019278160903662194
38
+ },
39
+ {
40
+ "task_name": "e2e_nlg_cleaned",
41
+ "prompt_name": "generate_text_restaurant",
42
+ "rouge2_precision": 0.11570444930629259,
43
+ "dataset_path": "e2e_nlg_cleaned",
44
+ "dataset_name": null,
45
+ "subset": null,
46
+ "rouge2_precision_stderr": 0.0014199073558424482
47
+ },
48
+ {
49
+ "task_name": "e2e_nlg_cleaned",
50
+ "prompt_name": "generate_text_restaurant",
51
+ "rouge2_recall": 0.2004243711941686,
52
+ "dataset_path": "e2e_nlg_cleaned",
53
+ "dataset_name": null,
54
+ "subset": null,
55
+ "rouge2_recall_stderr": 0.0021042737791853398
56
+ },
57
+ {
58
+ "task_name": "e2e_nlg_cleaned",
59
+ "prompt_name": "generate_text_restaurant",
60
+ "rouge2_fmeasure": 0.13955166174444095,
61
+ "dataset_path": "e2e_nlg_cleaned",
62
+ "dataset_name": null,
63
+ "subset": null,
64
+ "rouge2_fmeasure_stderr": 0.0014269312475032364
65
+ },
66
+ {
67
+ "task_name": "e2e_nlg_cleaned",
68
+ "prompt_name": "generate_text_restaurant",
69
+ "rougeL_precision": 0.2156185303251572,
70
+ "dataset_path": "e2e_nlg_cleaned",
71
+ "dataset_name": null,
72
+ "subset": null,
73
+ "rougeL_precision_stderr": 0.0015614536366490293
74
+ },
75
+ {
76
+ "task_name": "e2e_nlg_cleaned",
77
+ "prompt_name": "generate_text_restaurant",
78
+ "rougeL_recall": 0.3759398467983639,
79
+ "dataset_path": "e2e_nlg_cleaned",
80
+ "dataset_name": null,
81
+ "subset": null,
82
+ "rougeL_recall_stderr": 0.002381221921078589
83
+ },
84
+ {
85
+ "task_name": "e2e_nlg_cleaned",
86
+ "prompt_name": "generate_text_restaurant",
87
+ "rougeL_fmeasure": 0.262564054866777,
88
+ "dataset_path": "e2e_nlg_cleaned",
89
+ "dataset_name": null,
90
+ "subset": null,
91
+ "rougeL_fmeasure_stderr": 0.0014381342603292985
92
+ },
93
+ {
94
+ "task_name": "e2e_nlg_cleaned",
95
+ "prompt_name": "generate_text_restaurant",
96
+ "rougeLsum_precision": 0.22224277917141733,
97
+ "dataset_path": "e2e_nlg_cleaned",
98
+ "dataset_name": null,
99
+ "subset": null,
100
+ "rougeLsum_precision_stderr": 0.0019418664704165772
101
+ },
102
+ {
103
+ "task_name": "e2e_nlg_cleaned",
104
+ "prompt_name": "generate_text_restaurant",
105
+ "rougeLsum_recall": 0.3788810896543758,
106
+ "dataset_path": "e2e_nlg_cleaned",
107
+ "dataset_name": null,
108
+ "subset": null,
109
+ "rougeLsum_recall_stderr": 0.0026592931094796103
110
+ },
111
+ {
112
+ "task_name": "e2e_nlg_cleaned",
113
+ "prompt_name": "generate_text_restaurant",
114
+ "rougeLsum_fmeasure": 0.2680607212245618,
115
+ "dataset_path": "e2e_nlg_cleaned",
116
+ "dataset_name": null,
117
+ "subset": null,
118
+ "rougeLsum_fmeasure_stderr": 0.0018300140862428237
119
+ }
120
+ ],
121
+ "config": {
122
+ "model": "hf-causal",
123
+ "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-2b8-55b-c4seeds/2b855b9bc4seed1/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16",
124
+ "task_args": "",
125
+ "num_fewshot": 2,
126
+ "batch_size": 16,
127
+ "device": "cuda",
128
+ "use_cache": false,
129
+ "limit": 3000,
130
+ "bootstrap_iters": 10,
131
+ "seed": 1234
132
+ }
133
+ }
2b855b9bc4seed1/evaluation/generation/slim.2b855b9bc4seed1_e2e_nlg_cleaned_generate_text_restaurant_3.json ADDED
@@ -0,0 +1,133 @@
+ {
2
+ "results": [
3
+ {
4
+ "task_name": "e2e_nlg_cleaned",
5
+ "prompt_name": "generate_text_restaurant",
6
+ "bleu": 6.247143410389527,
7
+ "dataset_path": "e2e_nlg_cleaned",
8
+ "dataset_name": null,
9
+ "subset": null,
10
+ "bleu_stderr": 0.09336005465042321
11
+ },
12
+ {
13
+ "task_name": "e2e_nlg_cleaned",
14
+ "prompt_name": "generate_text_restaurant",
15
+ "rouge1_precision": 0.25907184832717167,
16
+ "dataset_path": "e2e_nlg_cleaned",
17
+ "dataset_name": null,
18
+ "subset": null,
19
+ "rouge1_precision_stderr": 0.001803996044494107
20
+ },
21
+ {
22
+ "task_name": "e2e_nlg_cleaned",
23
+ "prompt_name": "generate_text_restaurant",
24
+ "rouge1_recall": 0.4710593035922337,
25
+ "dataset_path": "e2e_nlg_cleaned",
26
+ "dataset_name": null,
27
+ "subset": null,
28
+ "rouge1_recall_stderr": 0.0027215741063189363
29
+ },
30
+ {
31
+ "task_name": "e2e_nlg_cleaned",
32
+ "prompt_name": "generate_text_restaurant",
33
+ "rouge1_fmeasure": 0.32338937321619443,
34
+ "dataset_path": "e2e_nlg_cleaned",
35
+ "dataset_name": null,
36
+ "subset": null,
37
+ "rouge1_fmeasure_stderr": 0.0017934056667561753
38
+ },
39
+ {
40
+ "task_name": "e2e_nlg_cleaned",
41
+ "prompt_name": "generate_text_restaurant",
42
+ "rouge2_precision": 0.11204434793169746,
43
+ "dataset_path": "e2e_nlg_cleaned",
44
+ "dataset_name": null,
45
+ "subset": null,
46
+ "rouge2_precision_stderr": 0.0012125669003284166
47
+ },
48
+ {
49
+ "task_name": "e2e_nlg_cleaned",
50
+ "prompt_name": "generate_text_restaurant",
51
+ "rouge2_recall": 0.21007032469689518,
52
+ "dataset_path": "e2e_nlg_cleaned",
53
+ "dataset_name": null,
54
+ "subset": null,
55
+ "rouge2_recall_stderr": 0.0021257583048451844
56
+ },
57
+ {
58
+ "task_name": "e2e_nlg_cleaned",
59
+ "prompt_name": "generate_text_restaurant",
60
+ "rouge2_fmeasure": 0.14093386031313695,
61
+ "dataset_path": "e2e_nlg_cleaned",
62
+ "dataset_name": null,
63
+ "subset": null,
64
+ "rouge2_fmeasure_stderr": 0.0013638568945346288
65
+ },
66
+ {
67
+ "task_name": "e2e_nlg_cleaned",
68
+ "prompt_name": "generate_text_restaurant",
69
+ "rougeL_precision": 0.2101184628317088,
70
+ "dataset_path": "e2e_nlg_cleaned",
71
+ "dataset_name": null,
72
+ "subset": null,
73
+ "rougeL_precision_stderr": 0.001355833935975265
74
+ },
75
+ {
76
+ "task_name": "e2e_nlg_cleaned",
77
+ "prompt_name": "generate_text_restaurant",
78
+ "rougeL_recall": 0.3886278435049077,
79
+ "dataset_path": "e2e_nlg_cleaned",
80
+ "dataset_name": null,
81
+ "subset": null,
82
+ "rougeL_recall_stderr": 0.0023407273478071285
83
+ },
84
+ {
85
+ "task_name": "e2e_nlg_cleaned",
86
+ "prompt_name": "generate_text_restaurant",
87
+ "rougeL_fmeasure": 0.2638096769634126,
88
+ "dataset_path": "e2e_nlg_cleaned",
89
+ "dataset_name": null,
90
+ "subset": null,
91
+ "rougeL_fmeasure_stderr": 0.0013632619425518314
92
+ },
93
+ {
94
+ "task_name": "e2e_nlg_cleaned",
95
+ "prompt_name": "generate_text_restaurant",
96
+ "rougeLsum_precision": 0.21561354041284292,
97
+ "dataset_path": "e2e_nlg_cleaned",
98
+ "dataset_name": null,
99
+ "subset": null,
100
+ "rougeLsum_precision_stderr": 0.0016830247054440658
101
+ },
102
+ {
103
+ "task_name": "e2e_nlg_cleaned",
104
+ "prompt_name": "generate_text_restaurant",
105
+ "rougeLsum_recall": 0.3919032879874159,
106
+ "dataset_path": "e2e_nlg_cleaned",
107
+ "dataset_name": null,
108
+ "subset": null,
109
+ "rougeLsum_recall_stderr": 0.002632582004466588
110
+ },
111
+ {
112
+ "task_name": "e2e_nlg_cleaned",
113
+ "prompt_name": "generate_text_restaurant",
114
+ "rougeLsum_fmeasure": 0.2689516848427162,
115
+ "dataset_path": "e2e_nlg_cleaned",
116
+ "dataset_name": null,
117
+ "subset": null,
118
+ "rougeLsum_fmeasure_stderr": 0.0017385224354802351
119
+ }
120
+ ],
121
+ "config": {
122
+ "model": "hf-causal",
123
+ "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-2b8-55b-c4seeds/2b855b9bc4seed1/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16",
124
+ "task_args": "",
125
+ "num_fewshot": 3,
126
+ "batch_size": 16,
127
+ "device": "cuda",
128
+ "use_cache": false,
129
+ "limit": 3000,
130
+ "bootstrap_iters": 10,
131
+ "seed": 1234
132
+ }
133
+ }
2b855b9bc4seed1/evaluation/generation/slim.2b855b9bc4seed1_e2e_nlg_cleaned_generate_text_restaurant_4.json ADDED
@@ -0,0 +1,133 @@
+ {
2
+ "results": [
3
+ {
4
+ "task_name": "e2e_nlg_cleaned",
5
+ "prompt_name": "generate_text_restaurant",
6
+ "bleu": 6.208222380745828,
7
+ "dataset_path": "e2e_nlg_cleaned",
8
+ "dataset_name": null,
9
+ "subset": null,
10
+ "bleu_stderr": 0.09813424389228141
11
+ },
12
+ {
13
+ "task_name": "e2e_nlg_cleaned",
14
+ "prompt_name": "generate_text_restaurant",
15
+ "rouge1_precision": 0.25403488264023616,
16
+ "dataset_path": "e2e_nlg_cleaned",
17
+ "dataset_name": null,
18
+ "subset": null,
19
+ "rouge1_precision_stderr": 0.00174969774558582
20
+ },
21
+ {
22
+ "task_name": "e2e_nlg_cleaned",
23
+ "prompt_name": "generate_text_restaurant",
24
+ "rouge1_recall": 0.4709476911476187,
25
+ "dataset_path": "e2e_nlg_cleaned",
26
+ "dataset_name": null,
27
+ "subset": null,
28
+ "rouge1_recall_stderr": 0.002655138227534261
29
+ },
30
+ {
31
+ "task_name": "e2e_nlg_cleaned",
32
+ "prompt_name": "generate_text_restaurant",
33
+ "rouge1_fmeasure": 0.32023650433333456,
34
+ "dataset_path": "e2e_nlg_cleaned",
35
+ "dataset_name": null,
36
+ "subset": null,
37
+ "rouge1_fmeasure_stderr": 0.0017784408554274436
38
+ },
39
+ {
40
+ "task_name": "e2e_nlg_cleaned",
41
+ "prompt_name": "generate_text_restaurant",
42
+ "rouge2_precision": 0.11058685749210603,
43
+ "dataset_path": "e2e_nlg_cleaned",
44
+ "dataset_name": null,
45
+ "subset": null,
46
+ "rouge2_precision_stderr": 0.0012080055063255909
47
+ },
48
+ {
49
+ "task_name": "e2e_nlg_cleaned",
50
+ "prompt_name": "generate_text_restaurant",
51
+ "rouge2_recall": 0.21167766216290337,
52
+ "dataset_path": "e2e_nlg_cleaned",
53
+ "dataset_name": null,
54
+ "subset": null,
55
+ "rouge2_recall_stderr": 0.002144496564072756
56
+ },
57
+ {
58
+ "task_name": "e2e_nlg_cleaned",
59
+ "prompt_name": "generate_text_restaurant",
60
+ "rouge2_fmeasure": 0.1403526845297047,
61
+ "dataset_path": "e2e_nlg_cleaned",
62
+ "dataset_name": null,
63
+ "subset": null,
64
+ "rouge2_fmeasure_stderr": 0.0013607857847417898
65
+ },
66
+ {
67
+ "task_name": "e2e_nlg_cleaned",
68
+ "prompt_name": "generate_text_restaurant",
69
+ "rougeL_precision": 0.20576507091031662,
70
+ "dataset_path": "e2e_nlg_cleaned",
71
+ "dataset_name": null,
72
+ "subset": null,
73
+ "rougeL_precision_stderr": 0.0012824323500063948
74
+ },
75
+ {
76
+ "task_name": "e2e_nlg_cleaned",
77
+ "prompt_name": "generate_text_restaurant",
78
+ "rougeL_recall": 0.3889751930979724,
79
+ "dataset_path": "e2e_nlg_cleaned",
80
+ "dataset_name": null,
81
+ "subset": null,
82
+ "rougeL_recall_stderr": 0.0023221115722203652
83
+ },
84
+ {
85
+ "task_name": "e2e_nlg_cleaned",
86
+ "prompt_name": "generate_text_restaurant",
87
+ "rougeL_fmeasure": 0.26122109578349895,
88
+ "dataset_path": "e2e_nlg_cleaned",
89
+ "dataset_name": null,
90
+ "subset": null,
91
+ "rougeL_fmeasure_stderr": 0.0013494634641885617
92
+ },
93
+ {
94
+ "task_name": "e2e_nlg_cleaned",
95
+ "prompt_name": "generate_text_restaurant",
96
+ "rougeLsum_precision": 0.21178966636346616,
97
+ "dataset_path": "e2e_nlg_cleaned",
98
+ "dataset_name": null,
99
+ "subset": null,
100
+ "rougeLsum_precision_stderr": 0.0016557432144235155
101
+ },
102
+ {
103
+ "task_name": "e2e_nlg_cleaned",
104
+ "prompt_name": "generate_text_restaurant",
105
+ "rougeLsum_recall": 0.393450062046956,
106
+ "dataset_path": "e2e_nlg_cleaned",
107
+ "dataset_name": null,
108
+ "subset": null,
109
+ "rougeLsum_recall_stderr": 0.002663580316470773
110
+ },
111
+ {
112
+ "task_name": "e2e_nlg_cleaned",
113
+ "prompt_name": "generate_text_restaurant",
114
+ "rougeLsum_fmeasure": 0.2670143871011222,
115
+ "dataset_path": "e2e_nlg_cleaned",
116
+ "dataset_name": null,
117
+ "subset": null,
118
+ "rougeLsum_fmeasure_stderr": 0.0017625078468513325
119
+ }
120
+ ],
121
+ "config": {
122
+ "model": "hf-causal",
123
+ "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-2b8-55b-c4seeds/2b855b9bc4seed1/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16",
124
+ "task_args": "",
125
+ "num_fewshot": 4,
126
+ "batch_size": 16,
127
+ "device": "cuda",
128
+ "use_cache": false,
129
+ "limit": 3000,
130
+ "bootstrap_iters": 10,
131
+ "seed": 1234
132
+ }
133
+ }
2b855b9bc4seed1/evaluation/generation/slim.2b855b9bc4seed1_e2e_nlg_cleaned_generate_text_restaurant_5.json ADDED
@@ -0,0 +1,133 @@
1
+ {
2
+ "results": [
3
+ {
4
+ "task_name": "e2e_nlg_cleaned",
5
+ "prompt_name": "generate_text_restaurant",
6
+ "bleu": 6.148604351164331,
7
+ "dataset_path": "e2e_nlg_cleaned",
8
+ "dataset_name": null,
9
+ "subset": null,
10
+ "bleu_stderr": 0.11104852380350659
11
+ },
12
+ {
13
+ "task_name": "e2e_nlg_cleaned",
14
+ "prompt_name": "generate_text_restaurant",
15
+ "rouge1_precision": 0.2528456956077577,
16
+ "dataset_path": "e2e_nlg_cleaned",
17
+ "dataset_name": null,
18
+ "subset": null,
19
+ "rouge1_precision_stderr": 0.0017424978255548198
20
+ },
21
+ {
22
+ "task_name": "e2e_nlg_cleaned",
23
+ "prompt_name": "generate_text_restaurant",
24
+ "rouge1_recall": 0.473017800721126,
25
+ "dataset_path": "e2e_nlg_cleaned",
26
+ "dataset_name": null,
27
+ "subset": null,
28
+ "rouge1_recall_stderr": 0.0026123356126015254
29
+ },
30
+ {
31
+ "task_name": "e2e_nlg_cleaned",
32
+ "prompt_name": "generate_text_restaurant",
33
+ "rouge1_fmeasure": 0.3199925141836084,
34
+ "dataset_path": "e2e_nlg_cleaned",
35
+ "dataset_name": null,
36
+ "subset": null,
37
+ "rouge1_fmeasure_stderr": 0.0017551605827543929
38
+ },
39
+ {
40
+ "task_name": "e2e_nlg_cleaned",
41
+ "prompt_name": "generate_text_restaurant",
42
+ "rouge2_precision": 0.10974089008299018,
43
+ "dataset_path": "e2e_nlg_cleaned",
44
+ "dataset_name": null,
45
+ "subset": null,
46
+ "rouge2_precision_stderr": 0.0011789832834562743
47
+ },
48
+ {
49
+ "task_name": "e2e_nlg_cleaned",
50
+ "prompt_name": "generate_text_restaurant",
51
+ "rouge2_recall": 0.2119589387667116,
52
+ "dataset_path": "e2e_nlg_cleaned",
53
+ "dataset_name": null,
54
+ "subset": null,
55
+ "rouge2_recall_stderr": 0.0020903039724611107
56
+ },
57
+ {
58
+ "task_name": "e2e_nlg_cleaned",
59
+ "prompt_name": "generate_text_restaurant",
60
+ "rouge2_fmeasure": 0.13979285541625697,
61
+ "dataset_path": "e2e_nlg_cleaned",
62
+ "dataset_name": null,
63
+ "subset": null,
64
+ "rouge2_fmeasure_stderr": 0.0013089259606601385
65
+ },
66
+ {
67
+ "task_name": "e2e_nlg_cleaned",
68
+ "prompt_name": "generate_text_restaurant",
69
+ "rougeL_precision": 0.20463239752897472,
70
+ "dataset_path": "e2e_nlg_cleaned",
71
+ "dataset_name": null,
72
+ "subset": null,
73
+ "rougeL_precision_stderr": 0.0012743661964013022
74
+ },
75
+ {
76
+ "task_name": "e2e_nlg_cleaned",
77
+ "prompt_name": "generate_text_restaurant",
78
+ "rougeL_recall": 0.3904493381375762,
79
+ "dataset_path": "e2e_nlg_cleaned",
80
+ "dataset_name": null,
81
+ "subset": null,
82
+ "rougeL_recall_stderr": 0.002278919766787325
83
+ },
84
+ {
85
+ "task_name": "e2e_nlg_cleaned",
86
+ "prompt_name": "generate_text_restaurant",
87
+ "rougeL_fmeasure": 0.2608746206749859,
88
+ "dataset_path": "e2e_nlg_cleaned",
89
+ "dataset_name": null,
90
+ "subset": null,
91
+ "rougeL_fmeasure_stderr": 0.0013206910936061742
92
+ },
93
+ {
94
+ "task_name": "e2e_nlg_cleaned",
95
+ "prompt_name": "generate_text_restaurant",
96
+ "rougeLsum_precision": 0.21056870894114044,
97
+ "dataset_path": "e2e_nlg_cleaned",
98
+ "dataset_name": null,
99
+ "subset": null,
100
+ "rougeLsum_precision_stderr": 0.001637983619116459
101
+ },
102
+ {
103
+ "task_name": "e2e_nlg_cleaned",
104
+ "prompt_name": "generate_text_restaurant",
105
+ "rougeLsum_recall": 0.39500142434961627,
106
+ "dataset_path": "e2e_nlg_cleaned",
107
+ "dataset_name": null,
108
+ "subset": null,
109
+ "rougeLsum_recall_stderr": 0.002625905199940031
110
+ },
111
+ {
112
+ "task_name": "e2e_nlg_cleaned",
113
+ "prompt_name": "generate_text_restaurant",
114
+ "rougeLsum_fmeasure": 0.26667287137591805,
115
+ "dataset_path": "e2e_nlg_cleaned",
116
+ "dataset_name": null,
117
+ "subset": null,
118
+ "rougeLsum_fmeasure_stderr": 0.001737453548353309
119
+ }
120
+ ],
121
+ "config": {
122
+ "model": "hf-causal",
123
+ "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-2b8-55b-c4seeds/2b855b9bc4seed1/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16",
124
+ "task_args": "",
125
+ "num_fewshot": 5,
126
+ "batch_size": 16,
127
+ "device": "cuda",
128
+ "use_cache": false,
129
+ "limit": 3000,
130
+ "bootstrap_iters": 10,
131
+ "seed": 1234
132
+ }
133
+ }
2b855b9bc4seed1/evaluation/generation/slim.2b855b9bc4seed1_gem_xsum_article_DOC_summary_3.json ADDED
@@ -0,0 +1,133 @@
1
+ {
2
+ "results": [
3
+ {
4
+ "task_name": "gem_xsum",
5
+ "prompt_name": "article_DOC_summary",
6
+ "rouge1_precision": 0.1131657921685037,
7
+ "dataset_path": "GEM/xsum",
8
+ "dataset_name": null,
9
+ "subset": "",
10
+ "rouge1_precision_stderr": 0.0018249204798791675
11
+ },
12
+ {
13
+ "task_name": "gem_xsum",
14
+ "prompt_name": "article_DOC_summary",
15
+ "rouge1_recall": 0.2722598710120147,
16
+ "dataset_path": "GEM/xsum",
17
+ "dataset_name": null,
18
+ "subset": "",
19
+ "rouge1_recall_stderr": 0.004133665114298229
20
+ },
21
+ {
22
+ "task_name": "gem_xsum",
23
+ "prompt_name": "article_DOC_summary",
24
+ "rouge1_fmeasure": 0.15647200121804894,
25
+ "dataset_path": "GEM/xsum",
26
+ "dataset_name": null,
27
+ "subset": "",
28
+ "rouge1_fmeasure_stderr": 0.0023727238595388817
29
+ },
30
+ {
31
+ "task_name": "gem_xsum",
32
+ "prompt_name": "article_DOC_summary",
33
+ "rouge2_precision": 0.019363405482896792,
34
+ "dataset_path": "GEM/xsum",
35
+ "dataset_name": null,
36
+ "subset": "",
37
+ "rouge2_precision_stderr": 0.0009105840932812386
38
+ },
39
+ {
40
+ "task_name": "gem_xsum",
41
+ "prompt_name": "article_DOC_summary",
42
+ "rouge2_recall": 0.049210613825466105,
43
+ "dataset_path": "GEM/xsum",
44
+ "dataset_name": null,
45
+ "subset": "",
46
+ "rouge2_recall_stderr": 0.002366221262626497
47
+ },
48
+ {
49
+ "task_name": "gem_xsum",
50
+ "prompt_name": "article_DOC_summary",
51
+ "rouge2_fmeasure": 0.027201929614022874,
52
+ "dataset_path": "GEM/xsum",
53
+ "dataset_name": null,
54
+ "subset": "",
55
+ "rouge2_fmeasure_stderr": 0.0012539141722579825
56
+ },
57
+ {
58
+ "task_name": "gem_xsum",
59
+ "prompt_name": "article_DOC_summary",
60
+ "rougeL_precision": 0.08876222177060916,
61
+ "dataset_path": "GEM/xsum",
62
+ "dataset_name": null,
63
+ "subset": "",
64
+ "rougeL_precision_stderr": 0.0013761155158951413
65
+ },
66
+ {
67
+ "task_name": "gem_xsum",
68
+ "prompt_name": "article_DOC_summary",
69
+ "rougeL_recall": 0.21448631404181784,
70
+ "dataset_path": "GEM/xsum",
71
+ "dataset_name": null,
72
+ "subset": "",
73
+ "rougeL_recall_stderr": 0.003198193852945419
74
+ },
75
+ {
76
+ "task_name": "gem_xsum",
77
+ "prompt_name": "article_DOC_summary",
78
+ "rougeL_fmeasure": 0.12268909574632023,
79
+ "dataset_path": "GEM/xsum",
80
+ "dataset_name": null,
81
+ "subset": "",
82
+ "rougeL_fmeasure_stderr": 0.001773225086338
83
+ },
84
+ {
85
+ "task_name": "gem_xsum",
86
+ "prompt_name": "article_DOC_summary",
87
+ "rougeLsum_precision": 0.09102722612426341,
88
+ "dataset_path": "GEM/xsum",
89
+ "dataset_name": null,
90
+ "subset": "",
91
+ "rougeLsum_precision_stderr": 0.0014951062687195294
92
+ },
93
+ {
94
+ "task_name": "gem_xsum",
95
+ "prompt_name": "article_DOC_summary",
96
+ "rougeLsum_recall": 0.22021791608209987,
97
+ "dataset_path": "GEM/xsum",
98
+ "dataset_name": null,
99
+ "subset": "",
100
+ "rougeLsum_recall_stderr": 0.003544530658562865
101
+ },
102
+ {
103
+ "task_name": "gem_xsum",
104
+ "prompt_name": "article_DOC_summary",
105
+ "rougeLsum_fmeasure": 0.12593169516392616,
106
+ "dataset_path": "GEM/xsum",
107
+ "dataset_name": null,
108
+ "subset": "",
109
+ "rougeLsum_fmeasure_stderr": 0.001962773566689985
110
+ },
111
+ {
112
+ "task_name": "gem_xsum",
113
+ "prompt_name": "article_DOC_summary",
114
+ "bleu": 1.1532375468354903,
115
+ "dataset_path": "GEM/xsum",
116
+ "dataset_name": null,
117
+ "subset": "",
118
+ "bleu_stderr": 0.09169402373692534
119
+ }
120
+ ],
121
+ "config": {
122
+ "model": "hf-causal",
123
+ "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-2b8-55b-c4seeds/2b855b9bc4seed1/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16",
124
+ "task_args": "",
125
+ "num_fewshot": 3,
126
+ "batch_size": 16,
127
+ "device": "cuda",
128
+ "use_cache": false,
129
+ "limit": 3000,
130
+ "bootstrap_iters": 10,
131
+ "seed": 1234
132
+ }
133
+ }
2b855b9bc4seed1/evaluation/generation/slim.2b855b9bc4seed1_gem_xsum_article_DOC_summary_4.json ADDED
@@ -0,0 +1,133 @@
1
+ {
2
+ "results": [
3
+ {
4
+ "task_name": "gem_xsum",
5
+ "prompt_name": "article_DOC_summary",
6
+ "rouge1_precision": 0.0383955897314317,
7
+ "dataset_path": "GEM/xsum",
8
+ "dataset_name": null,
9
+ "subset": "",
10
+ "rouge1_precision_stderr": 0.0025422863710305266
11
+ },
12
+ {
13
+ "task_name": "gem_xsum",
14
+ "prompt_name": "article_DOC_summary",
15
+ "rouge1_recall": 0.06915759782304308,
16
+ "dataset_path": "GEM/xsum",
17
+ "dataset_name": null,
18
+ "subset": "",
19
+ "rouge1_recall_stderr": 0.0039608171509673586
20
+ },
21
+ {
22
+ "task_name": "gem_xsum",
23
+ "prompt_name": "article_DOC_summary",
24
+ "rouge1_fmeasure": 0.04426046707797172,
25
+ "dataset_path": "GEM/xsum",
26
+ "dataset_name": null,
27
+ "subset": "",
28
+ "rouge1_fmeasure_stderr": 0.0024997989992097624
29
+ },
30
+ {
31
+ "task_name": "gem_xsum",
32
+ "prompt_name": "article_DOC_summary",
33
+ "rouge2_precision": 0.006481358010154669,
34
+ "dataset_path": "GEM/xsum",
35
+ "dataset_name": null,
36
+ "subset": "",
37
+ "rouge2_precision_stderr": 0.0007393187434393857
38
+ },
39
+ {
40
+ "task_name": "gem_xsum",
41
+ "prompt_name": "article_DOC_summary",
42
+ "rouge2_recall": 0.013044537342091751,
43
+ "dataset_path": "GEM/xsum",
44
+ "dataset_name": null,
45
+ "subset": "",
46
+ "rouge2_recall_stderr": 0.00125152402986618
47
+ },
48
+ {
49
+ "task_name": "gem_xsum",
50
+ "prompt_name": "article_DOC_summary",
51
+ "rouge2_fmeasure": 0.008141029291990268,
52
+ "dataset_path": "GEM/xsum",
53
+ "dataset_name": null,
54
+ "subset": "",
55
+ "rouge2_fmeasure_stderr": 0.0008202019731008721
56
+ },
57
+ {
58
+ "task_name": "gem_xsum",
59
+ "prompt_name": "article_DOC_summary",
60
+ "rougeL_precision": 0.031113719615522017,
61
+ "dataset_path": "GEM/xsum",
62
+ "dataset_name": null,
63
+ "subset": "",
64
+ "rougeL_precision_stderr": 0.0021898843752975964
65
+ },
66
+ {
67
+ "task_name": "gem_xsum",
68
+ "prompt_name": "article_DOC_summary",
69
+ "rougeL_recall": 0.055197273257753664,
70
+ "dataset_path": "GEM/xsum",
71
+ "dataset_name": null,
72
+ "subset": "",
73
+ "rougeL_recall_stderr": 0.003191098630834803
74
+ },
75
+ {
76
+ "task_name": "gem_xsum",
77
+ "prompt_name": "article_DOC_summary",
78
+ "rougeL_fmeasure": 0.035265847972026074,
79
+ "dataset_path": "GEM/xsum",
80
+ "dataset_name": null,
81
+ "subset": "",
82
+ "rougeL_fmeasure_stderr": 0.0020006364168593496
83
+ },
84
+ {
85
+ "task_name": "gem_xsum",
86
+ "prompt_name": "article_DOC_summary",
87
+ "rougeLsum_precision": 0.03210804984465074,
88
+ "dataset_path": "GEM/xsum",
89
+ "dataset_name": null,
90
+ "subset": "",
91
+ "rougeLsum_precision_stderr": 0.0022623416009452243
92
+ },
93
+ {
94
+ "task_name": "gem_xsum",
95
+ "prompt_name": "article_DOC_summary",
96
+ "rougeLsum_recall": 0.05680048575290953,
97
+ "dataset_path": "GEM/xsum",
98
+ "dataset_name": null,
99
+ "subset": "",
100
+ "rougeLsum_recall_stderr": 0.0032931638988068055
101
+ },
102
+ {
103
+ "task_name": "gem_xsum",
104
+ "prompt_name": "article_DOC_summary",
105
+ "rougeLsum_fmeasure": 0.03632930315349291,
106
+ "dataset_path": "GEM/xsum",
107
+ "dataset_name": null,
108
+ "subset": "",
109
+ "rougeLsum_fmeasure_stderr": 0.0020756841173550696
110
+ },
111
+ {
112
+ "task_name": "gem_xsum",
113
+ "prompt_name": "article_DOC_summary",
114
+ "bleu": 0.6055404023350197,
115
+ "dataset_path": "GEM/xsum",
116
+ "dataset_name": null,
117
+ "subset": "",
118
+ "bleu_stderr": 0.09409517819503949
119
+ }
120
+ ],
121
+ "config": {
122
+ "model": "hf-causal",
123
+ "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-2b8-55b-c4seeds/2b855b9bc4seed1/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16",
124
+ "task_args": "",
125
+ "num_fewshot": 4,
126
+ "batch_size": 16,
127
+ "device": "cuda",
128
+ "use_cache": false,
129
+ "limit": 3000,
130
+ "bootstrap_iters": 10,
131
+ "seed": 1234
132
+ }
133
+ }
2b855b9bc4seed1/evaluation/generation/slim.2b855b9bc4seed1_gem_xsum_article_DOC_summary_5.json ADDED
@@ -0,0 +1,133 @@
1
+ {
2
+ "results": [
3
+ {
4
+ "task_name": "gem_xsum",
5
+ "prompt_name": "article_DOC_summary",
6
+ "rouge1_precision": 0.002821793157093116,
7
+ "dataset_path": "GEM/xsum",
8
+ "dataset_name": null,
9
+ "subset": "",
10
+ "rouge1_precision_stderr": 0.0008106295472902334
11
+ },
12
+ {
13
+ "task_name": "gem_xsum",
14
+ "prompt_name": "article_DOC_summary",
15
+ "rouge1_recall": 0.0022851464113943067,
16
+ "dataset_path": "GEM/xsum",
17
+ "dataset_name": null,
18
+ "subset": "",
19
+ "rouge1_recall_stderr": 0.0006484962567584845
20
+ },
21
+ {
22
+ "task_name": "gem_xsum",
23
+ "prompt_name": "article_DOC_summary",
24
+ "rouge1_fmeasure": 0.0024831420596845987,
25
+ "dataset_path": "GEM/xsum",
26
+ "dataset_name": null,
27
+ "subset": "",
28
+ "rouge1_fmeasure_stderr": 0.0007070331562075694
29
+ },
30
+ {
31
+ "task_name": "gem_xsum",
32
+ "prompt_name": "article_DOC_summary",
33
+ "rouge2_precision": 0.0002914830883754302,
34
+ "dataset_path": "GEM/xsum",
35
+ "dataset_name": null,
36
+ "subset": "",
37
+ "rouge2_precision_stderr": 0.00020340968454234058
38
+ },
39
+ {
40
+ "task_name": "gem_xsum",
41
+ "prompt_name": "article_DOC_summary",
42
+ "rouge2_recall": 0.00022401756585996315,
43
+ "dataset_path": "GEM/xsum",
44
+ "dataset_name": null,
45
+ "subset": "",
46
+ "rouge2_recall_stderr": 0.0001545711517822221
47
+ },
48
+ {
49
+ "task_name": "gem_xsum",
50
+ "prompt_name": "article_DOC_summary",
51
+ "rouge2_fmeasure": 0.0002519246692387647,
52
+ "dataset_path": "GEM/xsum",
53
+ "dataset_name": null,
54
+ "subset": "",
55
+ "rouge2_fmeasure_stderr": 0.00017508831809454146
56
+ },
57
+ {
58
+ "task_name": "gem_xsum",
59
+ "prompt_name": "article_DOC_summary",
60
+ "rougeL_precision": 0.0021547453202339587,
61
+ "dataset_path": "GEM/xsum",
62
+ "dataset_name": null,
63
+ "subset": "",
64
+ "rougeL_precision_stderr": 0.0006563220446223475
65
+ },
66
+ {
67
+ "task_name": "gem_xsum",
68
+ "prompt_name": "article_DOC_summary",
69
+ "rougeL_recall": 0.0017708458456009723,
70
+ "dataset_path": "GEM/xsum",
71
+ "dataset_name": null,
72
+ "subset": "",
73
+ "rougeL_recall_stderr": 0.0005325445996971974
74
+ },
75
+ {
76
+ "task_name": "gem_xsum",
77
+ "prompt_name": "article_DOC_summary",
78
+ "rougeL_fmeasure": 0.001910475794160408,
79
+ "dataset_path": "GEM/xsum",
80
+ "dataset_name": null,
81
+ "subset": "",
82
+ "rougeL_fmeasure_stderr": 0.0005775932958294451
83
+ },
84
+ {
85
+ "task_name": "gem_xsum",
86
+ "prompt_name": "article_DOC_summary",
87
+ "rougeLsum_precision": 0.002307213397230338,
88
+ "dataset_path": "GEM/xsum",
89
+ "dataset_name": null,
90
+ "subset": "",
91
+ "rougeLsum_precision_stderr": 0.0006934611042748366
92
+ },
93
+ {
94
+ "task_name": "gem_xsum",
95
+ "prompt_name": "article_DOC_summary",
96
+ "rougeLsum_recall": 0.001892317665185999,
97
+ "dataset_path": "GEM/xsum",
98
+ "dataset_name": null,
99
+ "subset": "",
100
+ "rougeLsum_recall_stderr": 0.0005664327179419651
101
+ },
102
+ {
103
+ "task_name": "gem_xsum",
104
+ "prompt_name": "article_DOC_summary",
105
+ "rougeLsum_fmeasure": 0.0020447516574242644,
106
+ "dataset_path": "GEM/xsum",
107
+ "dataset_name": null,
108
+ "subset": "",
109
+ "rougeLsum_fmeasure_stderr": 0.000612874204929875
110
+ },
111
+ {
112
+ "task_name": "gem_xsum",
113
+ "prompt_name": "article_DOC_summary",
114
+ "bleu": 1.1094899638874875e-37,
115
+ "dataset_path": "GEM/xsum",
116
+ "dataset_name": null,
117
+ "subset": "",
118
+ "bleu_stderr": 1.0662006738968273e-31
119
+ }
120
+ ],
121
+ "config": {
122
+ "model": "hf-causal",
123
+ "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-2b8-55b-c4seeds/2b855b9bc4seed1/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16",
124
+ "task_args": "",
125
+ "num_fewshot": 5,
126
+ "batch_size": 16,
127
+ "device": "cuda",
128
+ "use_cache": false,
129
+ "limit": 3000,
130
+ "bootstrap_iters": 10,
131
+ "seed": 1234
132
+ }
133
+ }
2b855b9bc4seed1/evaluation/rankeval/2b855b9bc4seed1_0.csv ADDED
@@ -0,0 +1,21 @@
1
+ task,metric,value,err,version
2
+ anli_r1,acc,0.34,0.014987482264363937,0
3
+ anli_r2,acc,0.333,0.014910846164229864,0
4
+ anli_r3,acc,0.3433333333333333,0.01371263383046586,0
5
+ arc_challenge,acc,0.2525597269624573,0.012696728980207708,0
6
+ arc_challenge,acc_norm,0.28071672354948807,0.013131238126975583,0
7
+ arc_easy,acc,0.5631313131313131,0.010177672928157685,0
8
+ arc_easy,acc_norm,0.49747474747474746,0.01025965266878347,0
9
+ boolq,acc,0.5351681957186545,0.008723396352960192,1
10
+ cb,acc,0.44642857142857145,0.06703189227942398,1
11
+ cb,f1,0.2956393200295639,,1
12
+ copa,acc,0.71,0.04560480215720684,0
13
+ hellaswag,acc,0.43328022306313485,0.004945157565218188,0
14
+ hellaswag,acc_norm,0.5569607647878908,0.004957296691391566,0
15
+ piqa,acc,0.7442872687704026,0.010178690109459855,0
16
+ piqa,acc_norm,0.750272034820457,0.010099232969867469,0
17
+ rte,acc,0.5415162454873647,0.029992535385373317,0
18
+ sciq,acc,0.805,0.012535235623319329,0
19
+ sciq,acc_norm,0.71,0.014356395999905684,0
20
+ storycloze_2016,acc,0.6916087653661144,0.0106797344454878,0
21
+ winogrande,acc,0.5477505919494869,0.013988256216606007,0
2b855b9bc4seed1/evaluation/rankeval/2b855b9bc4seed1_0_lm-eval_global_step52452_2023-02-24-23-57-47_0shots_backup.json DELETED
@@ -1,87 +0,0 @@
1
- {
2
- "results": {
3
- "anli_r1": {
4
- "acc": 0.34,
5
- "acc_stderr": 0.014987482264363937
6
- },
7
- "anli_r2": {
8
- "acc": 0.333,
9
- "acc_stderr": 0.014910846164229864
10
- },
11
- "anli_r3": {
12
- "acc": 0.3433333333333333,
13
- "acc_stderr": 0.01371263383046586
14
- },
15
- "cb": {
16
- "acc": 0.44642857142857145,
17
- "acc_stderr": 0.06703189227942398,
18
- "f1": 0.2956393200295639
19
- },
20
- "copa": {
21
- "acc": 0.71,
22
- "acc_stderr": 0.04560480215720684
23
- },
24
- "hellaswag": {
25
- "acc": 0.43328022306313485,
26
- "acc_stderr": 0.004945157565218188,
27
- "acc_norm": 0.5569607647878908,
28
- "acc_norm_stderr": 0.004957296691391566
29
- },
30
- "rte": {
31
- "acc": 0.5415162454873647,
32
- "acc_stderr": 0.029992535385373317
33
- },
34
- "winogrande": {
35
- "acc": 0.5477505919494869,
36
- "acc_stderr": 0.013988256216606007
37
- },
38
- "storycloze_2016": {
39
- "acc": 0.6916087653661144,
40
- "acc_stderr": 0.0106797344454878
41
- },
42
- "boolq": {
43
- "acc": 0.5351681957186545,
44
- "acc_stderr": 0.008723396352960192
45
- },
46
- "arc_easy": {
47
- "acc": 0.5631313131313131,
48
- "acc_stderr": 0.010177672928157685,
49
- "acc_norm": 0.49747474747474746,
50
- "acc_norm_stderr": 0.01025965266878347
51
- },
52
- "arc_challenge": {
53
- "acc": 0.2525597269624573,
54
- "acc_stderr": 0.012696728980207708,
55
- "acc_norm": 0.28071672354948807,
56
- "acc_norm_stderr": 0.013131238126975583
57
- },
58
- "sciq": {
59
- "acc": 0.805,
60
- "acc_stderr": 0.012535235623319329,
61
- "acc_norm": 0.71,
62
- "acc_norm_stderr": 0.014356395999905684
63
- },
64
- "piqa": {
65
- "acc": 0.7442872687704026,
66
- "acc_stderr": 0.010178690109459855,
67
- "acc_norm": 0.750272034820457,
68
- "acc_norm_stderr": 0.010099232969867469
69
- }
70
- },
71
- "versions": {
72
- "anli_r1": 0,
73
- "anli_r2": 0,
74
- "anli_r3": 0,
75
- "cb": 1,
76
- "copa": 0,
77
- "hellaswag": 0,
78
- "rte": 0,
79
- "winogrande": 0,
80
- "storycloze_2016": 0,
81
- "boolq": 1,
82
- "arc_easy": 0,
83
- "arc_challenge": 0,
84
- "sciq": 0,
85
- "piqa": 0
86
- }
87
- }
2b855b9bc4seed1/evaluation/rankeval/2b855b9bc4seed1_1.csv ADDED
@@ -0,0 +1,21 @@
1
+ task,metric,value,err,version
2
+ anli_r1,acc,0.323,0.014794927843348635,0
3
+ anli_r2,acc,0.348,0.01507060460376841,0
4
+ anli_r3,acc,0.3416666666666667,0.013696658778002519,0
5
+ arc_challenge,acc,0.26023890784982934,0.012821930225112566,0
6
+ arc_challenge,acc_norm,0.27986348122866894,0.013119040897725923,0
7
+ arc_easy,acc,0.5740740740740741,0.010146568651002255,0
8
+ arc_easy,acc_norm,0.5332491582491582,0.010237073872130745,0
9
+ boolq,acc,0.555045871559633,0.008691897543539221,1
10
+ cb,acc,0.5178571428571429,0.06737697508644648,1
11
+ cb,f1,0.3656150648080215,,1
12
+ copa,acc,0.74,0.04408440022768078,0
13
+ hellaswag,acc,0.4282015534754033,0.004938068627349495,0
14
+ hellaswag,acc_norm,0.555964947221669,0.0049584261524818945,0
15
+ piqa,acc,0.7306855277475517,0.010350004070588758,0
16
+ piqa,acc_norm,0.7383025027203483,0.01025563077270823,0
17
+ rte,acc,0.5415162454873647,0.029992535385373314,0
18
+ sciq,acc,0.845,0.011450157470799471,0
19
+ sciq,acc_norm,0.811,0.012386784588117714,0
20
+ storycloze_2016,acc,0.6787814003206841,0.010798029402794916,0
21
+ winogrande,acc,0.5564325177584846,0.0139626949076204,0
2b855b9bc4seed1/evaluation/rankeval/2b855b9bc4seed1_1.json CHANGED
@@ -54,6 +54,18 @@
54
  "acc_stderr": 0.012821930225112566,
55
  "acc_norm": 0.27986348122866894,
56
  "acc_norm_stderr": 0.013119040897725923
57
  }
58
  },
59
  "versions": {
@@ -68,6 +80,8 @@
68
  "storycloze_2016": 0,
69
  "boolq": 1,
70
  "arc_easy": 0,
71
- "arc_challenge": 0
72
  }
73
  }
 
54
  "acc_stderr": 0.012821930225112566,
55
  "acc_norm": 0.27986348122866894,
56
  "acc_norm_stderr": 0.013119040897725923
57
+ },
58
+ "sciq": {
59
+ "acc": 0.845,
60
+ "acc_stderr": 0.011450157470799471,
61
+ "acc_norm": 0.811,
62
+ "acc_norm_stderr": 0.012386784588117714
63
+ },
64
+ "piqa": {
65
+ "acc": 0.7306855277475517,
66
+ "acc_stderr": 0.010350004070588758,
67
+ "acc_norm": 0.7383025027203483,
68
+ "acc_norm_stderr": 0.01025563077270823
69
  }
70
  },
71
  "versions": {
 
80
  "storycloze_2016": 0,
81
  "boolq": 1,
82
  "arc_easy": 0,
83
+ "arc_challenge": 0,
84
+ "sciq": 0,
85
+ "piqa": 0
86
  }
87
  }
2b855b9bc4seed1/evaluation/rankeval/2b855b9bc4seed1_1_lm-eval_global_step52452_2023-02-24-23-57-47_1shots_backup.json DELETED
@@ -1,73 +0,0 @@
1
- {
2
- "results": {
3
- "anli_r1": {
4
- "acc": 0.323,
5
- "acc_stderr": 0.014794927843348635
6
- },
7
- "anli_r2": {
8
- "acc": 0.348,
9
- "acc_stderr": 0.01507060460376841
10
- },
11
- "anli_r3": {
12
- "acc": 0.3416666666666667,
13
- "acc_stderr": 0.013696658778002519
14
- },
15
- "cb": {
16
- "acc": 0.5178571428571429,
17
- "acc_stderr": 0.06737697508644648,
18
- "f1": 0.3656150648080215
19
- },
20
- "copa": {
21
- "acc": 0.74,
22
- "acc_stderr": 0.04408440022768078
23
- },
24
- "hellaswag": {
25
- "acc": 0.4282015534754033,
26
- "acc_stderr": 0.004938068627349495,
27
- "acc_norm": 0.555964947221669,
28
- "acc_norm_stderr": 0.0049584261524818945
29
- },
30
- "rte": {
31
- "acc": 0.5415162454873647,
32
- "acc_stderr": 0.029992535385373314
33
- },
34
- "winogrande": {
35
- "acc": 0.5564325177584846,
36
- "acc_stderr": 0.0139626949076204
37
- },
38
- "storycloze_2016": {
39
- "acc": 0.6787814003206841,
40
- "acc_stderr": 0.010798029402794916
41
- },
42
- "boolq": {
43
- "acc": 0.555045871559633,
44
- "acc_stderr": 0.008691897543539221
45
- },
46
- "arc_easy": {
47
- "acc": 0.5740740740740741,
48
- "acc_stderr": 0.010146568651002255,
49
- "acc_norm": 0.5332491582491582,
50
- "acc_norm_stderr": 0.010237073872130745
51
- },
52
- "arc_challenge": {
53
- "acc": 0.26023890784982934,
54
- "acc_stderr": 0.012821930225112566,
55
- "acc_norm": 0.27986348122866894,
56
- "acc_norm_stderr": 0.013119040897725923
57
- }
58
- },
59
- "versions": {
60
- "anli_r1": 0,
61
- "anli_r2": 0,
62
- "anli_r3": 0,
63
- "cb": 1,
64
- "copa": 0,
65
- "hellaswag": 0,
66
- "rte": 0,
67
- "winogrande": 0,
68
- "storycloze_2016": 0,
69
- "boolq": 1,
70
- "arc_easy": 0,
71
- "arc_challenge": 0
72
- }
73
- }
2b855b9bc4seed1/evaluation/rankeval/2b855b9bc4seed1_2.csv ADDED
@@ -0,0 +1,21 @@
1
+ task,metric,value,err,version
2
+ anli_r1,acc,0.32,0.014758652303574874,0
3
+ anli_r2,acc,0.334,0.014922019523732968,0
4
+ anli_r3,acc,0.345,0.013728421539454878,0
5
+ arc_challenge,acc,0.24658703071672355,0.012595726268790122,0
6
+ arc_challenge,acc_norm,0.29180887372013653,0.01328452529240351,0
7
+ arc_easy,acc,0.5862794612794613,0.010105878530238144,0
8
+ arc_easy,acc_norm,0.569023569023569,0.010161552863493746,0
9
+ boolq,acc,0.5504587155963303,0.008700409761350796,1
10
+ cb,acc,0.42857142857142855,0.06672848092813058,1
11
+ cb,f1,0.291005291005291,,1
12
+ copa,acc,0.71,0.045604802157206845,0
13
+ hellaswag,acc,0.42929695279824737,0.004939642460172587,0
14
+ hellaswag,acc_norm,0.5593507269468233,0.004954503606471611,0
15
+ piqa,acc,0.7328618063112078,0.010323440492612437,0
16
+ piqa,acc_norm,0.735582154515778,0.010289787244767156,0
17
+ rte,acc,0.49097472924187724,0.030091559826331334,0
18
+ sciq,acc,0.859,0.011010914595992443,0
19
+ sciq,acc_norm,0.83,0.011884495834541663,0
20
+ storycloze_2016,acc,0.686798503474078,0.010725209422929404,0
21
+ winogrande,acc,0.5611681136543015,0.013946933444507032,0
2b855b9bc4seed1/evaluation/rankeval/2b855b9bc4seed1_2.json CHANGED
@@ -38,6 +38,34 @@
38
  "storycloze_2016": {
39
  "acc": 0.686798503474078,
40
  "acc_stderr": 0.010725209422929404
41
  }
42
  },
43
  "versions": {
@@ -49,6 +77,11 @@
49
  "hellaswag": 0,
50
  "rte": 0,
51
  "winogrande": 0,
52
- "storycloze_2016": 0
53
  }
54
  }
 
38
  "storycloze_2016": {
39
  "acc": 0.686798503474078,
40
  "acc_stderr": 0.010725209422929404
41
+ },
42
+ "boolq": {
43
+ "acc": 0.5504587155963303,
44
+ "acc_stderr": 0.008700409761350796
45
+ },
46
+ "arc_easy": {
47
+ "acc": 0.5862794612794613,
48
+ "acc_stderr": 0.010105878530238144,
49
+ "acc_norm": 0.569023569023569,
50
+ "acc_norm_stderr": 0.010161552863493746
51
+ },
52
+ "arc_challenge": {
53
+ "acc": 0.24658703071672355,
54
+ "acc_stderr": 0.012595726268790122,
55
+ "acc_norm": 0.29180887372013653,
56
+ "acc_norm_stderr": 0.01328452529240351
57
+ },
58
+ "sciq": {
59
+ "acc": 0.859,
60
+ "acc_stderr": 0.011010914595992443,
61
+ "acc_norm": 0.83,
62
+ "acc_norm_stderr": 0.011884495834541663
63
+ },
64
+ "piqa": {
65
+ "acc": 0.7328618063112078,
66
+ "acc_stderr": 0.010323440492612437,
67
+ "acc_norm": 0.735582154515778,
68
+ "acc_norm_stderr": 0.010289787244767156
69
  }
70
  },
71
  "versions": {
 
77
  "hellaswag": 0,
78
  "rte": 0,
79
  "winogrande": 0,
80
+ "storycloze_2016": 0,
81
+ "boolq": 1,
82
+ "arc_easy": 0,
83
+ "arc_challenge": 0,
84
+ "sciq": 0,
85
+ "piqa": 0
86
  }
87
  }
2b855b9bc4seed1/evaluation/rankeval/2b855b9bc4seed1_2_lm-eval_global_step52452_2023-02-24-23-57-47_2shots_backup.json DELETED
@@ -1,54 +0,0 @@
1
- {
2
- "results": {
3
- "anli_r1": {
4
- "acc": 0.32,
5
- "acc_stderr": 0.014758652303574874
6
- },
7
- "anli_r2": {
8
- "acc": 0.334,
9
- "acc_stderr": 0.014922019523732968
10
- },
11
- "anli_r3": {
12
- "acc": 0.345,
13
- "acc_stderr": 0.013728421539454878
14
- },
15
- "cb": {
16
- "acc": 0.42857142857142855,
17
- "acc_stderr": 0.06672848092813058,
18
- "f1": 0.291005291005291
19
- },
20
- "copa": {
21
- "acc": 0.71,
22
- "acc_stderr": 0.045604802157206845
23
- },
24
- "hellaswag": {
25
- "acc": 0.42929695279824737,
26
- "acc_stderr": 0.004939642460172587,
27
- "acc_norm": 0.5593507269468233,
28
- "acc_norm_stderr": 0.004954503606471611
29
- },
30
- "rte": {
31
- "acc": 0.49097472924187724,
32
- "acc_stderr": 0.030091559826331334
33
- },
34
- "winogrande": {
35
- "acc": 0.5611681136543015,
36
- "acc_stderr": 0.013946933444507032
37
- },
38
- "storycloze_2016": {
39
- "acc": 0.686798503474078,
40
- "acc_stderr": 0.010725209422929404
41
- }
42
- },
43
- "versions": {
44
- "anli_r1": 0,
45
- "anli_r2": 0,
46
- "anli_r3": 0,
47
- "cb": 1,
48
- "copa": 0,
49
- "hellaswag": 0,
50
- "rte": 0,
51
- "winogrande": 0,
52
- "storycloze_2016": 0
53
- }
54
- }
2b855b9bc4seed1/evaluation/rankeval/2b855b9bc4seed1_3.csv ADDED
@@ -0,0 +1,21 @@
1
+ task,metric,value,err,version
2
+ anli_r1,acc,0.325,0.014818724459095524,0
3
+ anli_r2,acc,0.35,0.015090650341444231,0
4
+ anli_r3,acc,0.33166666666666667,0.013596836729485157,0
5
+ arc_challenge,acc,0.26109215017064846,0.012835523909473841,0
6
+ arc_challenge,acc_norm,0.2815699658703072,0.013143376735009022,0
7
+ arc_easy,acc,0.5925925925925926,0.010082326627832858,0
8
+ arc_easy,acc_norm,0.5728114478114478,0.010150415974210871,0
9
+ boolq,acc,0.5648318042813456,0.008671229580582113,1
10
+ cb,acc,0.5357142857142857,0.06724777654937658,1
11
+ cb,f1,0.4369505854187708,,1
12
+ copa,acc,0.75,0.04351941398892446,0
13
+ hellaswag,acc,0.427504481179048,0.0049370542337115715,0
14
+ hellaswag,acc_norm,0.5574586735710018,0.004956724392646532,0
15
+ piqa,acc,0.7404787812840044,0.010227939888173918,0
16
+ piqa,acc_norm,0.7372143634385201,0.010269354068140776,0
17
+ rte,acc,0.5523465703971119,0.02993107036293953,0
18
+ sciq,acc,0.857,0.011075814808567038,0
19
+ sciq,acc_norm,0.843,0.011510146979230187,0
20
+ storycloze_2016,acc,0.6835916622127205,0.01075478009794089,0
21
+ winogrande,acc,0.5430149960536701,0.01400038676159829,0
2b855b9bc4seed1/evaluation/rankeval/2b855b9bc4seed1_3.json CHANGED
@@ -26,6 +26,46 @@
26
  "acc_stderr": 0.0049370542337115715,
27
  "acc_norm": 0.5574586735710018,
28
  "acc_norm_stderr": 0.004956724392646532
29
  }
30
  },
31
  "versions": {
@@ -34,6 +74,14 @@
34
  "anli_r3": 0,
35
  "cb": 1,
36
  "copa": 0,
37
- "hellaswag": 0
38
  }
39
  }
 
26
  "acc_stderr": 0.0049370542337115715,
27
  "acc_norm": 0.5574586735710018,
28
  "acc_norm_stderr": 0.004956724392646532
29
+ },
30
+ "rte": {
31
+ "acc": 0.5523465703971119,
32
+ "acc_stderr": 0.02993107036293953
33
+ },
34
+ "winogrande": {
35
+ "acc": 0.5430149960536701,
36
+ "acc_stderr": 0.01400038676159829
37
+ },
38
+ "storycloze_2016": {
39
+ "acc": 0.6835916622127205,
40
+ "acc_stderr": 0.01075478009794089
41
+ },
42
+ "boolq": {
43
+ "acc": 0.5648318042813456,
44
+ "acc_stderr": 0.008671229580582113
45
+ },
46
+ "arc_easy": {
47
+ "acc": 0.5925925925925926,
48
+ "acc_stderr": 0.010082326627832858,
49
+ "acc_norm": 0.5728114478114478,
50
+ "acc_norm_stderr": 0.010150415974210871
51
+ },
52
+ "arc_challenge": {
53
+ "acc": 0.26109215017064846,
54
+ "acc_stderr": 0.012835523909473841,
55
+ "acc_norm": 0.2815699658703072,
56
+ "acc_norm_stderr": 0.013143376735009022
57
+ },
58
+ "sciq": {
59
+ "acc": 0.857,
60
+ "acc_stderr": 0.011075814808567038,
61
+ "acc_norm": 0.843,
62
+ "acc_norm_stderr": 0.011510146979230187
63
+ },
64
+ "piqa": {
65
+ "acc": 0.7404787812840044,
66
+ "acc_stderr": 0.010227939888173918,
67
+ "acc_norm": 0.7372143634385201,
68
+ "acc_norm_stderr": 0.010269354068140776
69
  }
70
  },
71
  "versions": {
 
74
  "anli_r3": 0,
75
  "cb": 1,
76
  "copa": 0,
77
+ "hellaswag": 0,
78
+ "rte": 0,
79
+ "winogrande": 0,
80
+ "storycloze_2016": 0,
81
+ "boolq": 1,
82
+ "arc_easy": 0,
83
+ "arc_challenge": 0,
84
+ "sciq": 0,
85
+ "piqa": 0
86
  }
87
  }
2b855b9bc4seed1/evaluation/rankeval/2b855b9bc4seed1_3_lm-eval_global_step52452_2023-02-24-23-57-47_3shots_backup.json DELETED
@@ -1,39 +0,0 @@
1
- {
2
- "results": {
3
- "anli_r1": {
4
- "acc": 0.325,
5
- "acc_stderr": 0.014818724459095524
6
- },
7
- "anli_r2": {
8
- "acc": 0.35,
9
- "acc_stderr": 0.015090650341444231
10
- },
11
- "anli_r3": {
12
- "acc": 0.33166666666666667,
13
- "acc_stderr": 0.013596836729485157
14
- },
15
- "cb": {
16
- "acc": 0.5357142857142857,
17
- "acc_stderr": 0.06724777654937658,
18
- "f1": 0.4369505854187708
19
- },
20
- "copa": {
21
- "acc": 0.75,
22
- "acc_stderr": 0.04351941398892446
23
- },
24
- "hellaswag": {
25
- "acc": 0.427504481179048,
26
- "acc_stderr": 0.0049370542337115715,
27
- "acc_norm": 0.5574586735710018,
28
- "acc_norm_stderr": 0.004956724392646532
29
- }
30
- },
31
- "versions": {
32
- "anli_r1": 0,
33
- "anli_r2": 0,
34
- "anli_r3": 0,
35
- "cb": 1,
36
- "copa": 0,
37
- "hellaswag": 0
38
- }
39
- }
2b855b9bc4seed1/evaluation/rankeval/2b855b9bc4seed1_4.csv ADDED
@@ -0,0 +1,21 @@
1
+ task,metric,value,err,version
2
+ anli_r1,acc,0.342,0.01500870618212173,0
3
+ anli_r2,acc,0.367,0.015249378464171754,0
4
+ anli_r3,acc,0.35333333333333333,0.01380457216231493,0
5
+ arc_challenge,acc,0.26109215017064846,0.012835523909473848,0
6
+ arc_challenge,acc_norm,0.29266211604095566,0.013295916103619417,0
7
+ arc_easy,acc,0.593013468013468,0.010080695355466598,0
8
+ arc_easy,acc_norm,0.5707070707070707,0.010156678075911096,0
9
+ boolq,acc,0.5724770642201835,0.008652692997177334,1
10
+ cb,acc,0.44642857142857145,0.067031892279424,1
11
+ cb,f1,0.26007168458781366,,1
12
+ copa,acc,0.72,0.04512608598542127,0
13
+ hellaswag,acc,0.42710615415255926,0.004936470085238484,0
14
+ hellaswag,acc_norm,0.5613423620792671,0.004952087083128899,0
15
+ piqa,acc,0.7323177366702938,0.010330111189370432,0
16
+ piqa,acc_norm,0.7366702937976061,0.010276185322196764,0
17
+ rte,acc,0.4981949458483754,0.030096267148976626,0
18
+ sciq,acc,0.866,0.010777762298369685,0
19
+ sciq,acc_norm,0.858,0.011043457699378227,0
20
+ storycloze_2016,acc,0.6873329770176376,0.01072022317295317,0
21
+ winogrande,acc,0.5501183898973955,0.01398171190404973,0
2b855b9bc4seed1/evaluation/rankeval/2b855b9bc4seed1_4.json CHANGED
@@ -20,6 +20,52 @@
20
  "copa": {
21
  "acc": 0.72,
22
  "acc_stderr": 0.04512608598542127
23
  }
24
  },
25
  "versions": {
@@ -27,6 +73,15 @@
27
  "anli_r2": 0,
28
  "anli_r3": 0,
29
  "cb": 1,
30
- "copa": 0
31
  }
32
  }
 
20
  "copa": {
21
  "acc": 0.72,
22
  "acc_stderr": 0.04512608598542127
23
+ },
24
+ "hellaswag": {
25
+ "acc": 0.42710615415255926,
26
+ "acc_stderr": 0.004936470085238484,
27
+ "acc_norm": 0.5613423620792671,
28
+ "acc_norm_stderr": 0.004952087083128899
29
+ },
30
+ "rte": {
31
+ "acc": 0.4981949458483754,
32
+ "acc_stderr": 0.030096267148976626
33
+ },
34
+ "winogrande": {
35
+ "acc": 0.5501183898973955,
36
+ "acc_stderr": 0.01398171190404973
37
+ },
38
+ "storycloze_2016": {
39
+ "acc": 0.6873329770176376,
40
+ "acc_stderr": 0.01072022317295317
41
+ },
42
+ "boolq": {
43
+ "acc": 0.5724770642201835,
44
+ "acc_stderr": 0.008652692997177334
45
+ },
46
+ "arc_easy": {
47
+ "acc": 0.593013468013468,
48
+ "acc_stderr": 0.010080695355466598,
49
+ "acc_norm": 0.5707070707070707,
50
+ "acc_norm_stderr": 0.010156678075911096
51
+ },
52
+ "arc_challenge": {
53
+ "acc": 0.26109215017064846,
54
+ "acc_stderr": 0.012835523909473848,
55
+ "acc_norm": 0.29266211604095566,
56
+ "acc_norm_stderr": 0.013295916103619417
57
+ },
58
+ "sciq": {
59
+ "acc": 0.866,
60
+ "acc_stderr": 0.010777762298369685,
61
+ "acc_norm": 0.858,
62
+ "acc_norm_stderr": 0.011043457699378227
63
+ },
64
+ "piqa": {
65
+ "acc": 0.7323177366702938,
66
+ "acc_stderr": 0.010330111189370432,
67
+ "acc_norm": 0.7366702937976061,
68
+ "acc_norm_stderr": 0.010276185322196764
69
  }
70
  },
71
  "versions": {
 
73
  "anli_r2": 0,
74
  "anli_r3": 0,
75
  "cb": 1,
76
+ "copa": 0,
77
+ "hellaswag": 0,
78
+ "rte": 0,
79
+ "winogrande": 0,
80
+ "storycloze_2016": 0,
81
+ "boolq": 1,
82
+ "arc_easy": 0,
83
+ "arc_challenge": 0,
84
+ "sciq": 0,
85
+ "piqa": 0
86
  }
87
  }
2b855b9bc4seed1/evaluation/rankeval/2b855b9bc4seed1_5.csv ADDED
@@ -0,0 +1,21 @@
1
+ task,metric,value,err,version
2
+ anli_r1,acc,0.346,0.015050266127564448,0
3
+ anli_r2,acc,0.344,0.015029633724408947,0
4
+ anli_r3,acc,0.3516666666666667,0.013789711695404789,0
5
+ arc_challenge,acc,0.27047781569965873,0.012980954547659554,0
6
+ arc_challenge,acc_norm,0.2909556313993174,0.013273077865907593,0
7
+ arc_easy,acc,0.5862794612794613,0.010105878530238132,0
8
+ arc_easy,acc_norm,0.5740740740740741,0.010146568651002255,0
9
+ boolq,acc,0.5685015290519878,0.008662594569027314,1
10
+ cb,acc,0.5357142857142857,0.06724777654937658,1
11
+ cb,f1,0.31399711399711405,,1
12
+ copa,acc,0.72,0.04512608598542127,0
13
+ hellaswag,acc,0.4305915156343358,0.004941470620074864,0
14
+ hellaswag,acc_norm,0.5628360884285999,0.004950221546187576,0
15
+ piqa,acc,0.7312295973884657,0.010343392940090011,0
16
+ piqa,acc_norm,0.7415669205658324,0.0102139716367733,0
17
+ rte,acc,0.5451263537906137,0.029973636495415252,0
18
+ sciq,acc,0.866,0.01077776229836968,0
19
+ sciq,acc_norm,0.861,0.010945263761042968,0
20
+ storycloze_2016,acc,0.6814537680384821,0.010774165229761353,0
21
+ winogrande,acc,0.5548539857932123,0.013967662954355491,0
2b855b9bc4seed1/evaluation/rankeval/2b855b9bc4seed1_5.json CHANGED
@@ -20,6 +20,52 @@
20
  "copa": {
21
  "acc": 0.72,
22
  "acc_stderr": 0.04512608598542127
23
  }
24
  },
25
  "versions": {
@@ -27,6 +73,15 @@
27
  "anli_r2": 0,
28
  "anli_r3": 0,
29
  "cb": 1,
30
- "copa": 0
31
  }
32
  }
 
20
  "copa": {
21
  "acc": 0.72,
22
  "acc_stderr": 0.04512608598542127
23
+ },
24
+ "hellaswag": {
25
+ "acc": 0.4305915156343358,
26
+ "acc_stderr": 0.004941470620074864,
27
+ "acc_norm": 0.5628360884285999,
28
+ "acc_norm_stderr": 0.004950221546187576
29
+ },
30
+ "rte": {
31
+ "acc": 0.5451263537906137,
32
+ "acc_stderr": 0.029973636495415252
33
+ },
34
+ "winogrande": {
35
+ "acc": 0.5548539857932123,
36
+ "acc_stderr": 0.013967662954355491
37
+ },
38
+ "storycloze_2016": {
39
+ "acc": 0.6814537680384821,
40
+ "acc_stderr": 0.010774165229761353
41
+ },
42
+ "boolq": {
43
+ "acc": 0.5685015290519878,
44
+ "acc_stderr": 0.008662594569027314
45
+ },
46
+ "arc_easy": {
47
+ "acc": 0.5862794612794613,
48
+ "acc_stderr": 0.010105878530238132,
49
+ "acc_norm": 0.5740740740740741,
50
+ "acc_norm_stderr": 0.010146568651002255
51
+ },
52
+ "arc_challenge": {
53
+ "acc": 0.27047781569965873,
54
+ "acc_stderr": 0.012980954547659554,
55
+ "acc_norm": 0.2909556313993174,
56
+ "acc_norm_stderr": 0.013273077865907593
57
+ },
58
+ "sciq": {
59
+ "acc": 0.866,
60
+ "acc_stderr": 0.01077776229836968,
61
+ "acc_norm": 0.861,
62
+ "acc_norm_stderr": 0.010945263761042968
63
+ },
64
+ "piqa": {
65
+ "acc": 0.7312295973884657,
66
+ "acc_stderr": 0.010343392940090011,
67
+ "acc_norm": 0.7415669205658324,
68
+ "acc_norm_stderr": 0.0102139716367733
69
  }
70
  },
71
  "versions": {
 
73
  "anli_r2": 0,
74
  "anli_r3": 0,
75
  "cb": 1,
76
+ "copa": 0,
77
+ "hellaswag": 0,
78
+ "rte": 0,
79
+ "winogrande": 0,
80
+ "storycloze_2016": 0,
81
+ "boolq": 1,
82
+ "arc_easy": 0,
83
+ "arc_challenge": 0,
84
+ "sciq": 0,
85
+ "piqa": 0
86
  }
87
  }
2b855b9bc4seed2/evaluation/generation/agg.2b855b9bc4seed2_GEM-web_nlg_en_PALM_prompt_0.json ADDED
@@ -0,0 +1 @@
1
+ {"results": [{"task_name": "GEM/web_nlg_en", "prompt_name": "PALM_prompt", "bleu": 0.3750956835220534, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "3e41305c-5461-4cf3-853d-8a6fb5747623", "prompt_jinja": "I will verbalize an abstract representation of a sentence in natural language. To do so, I will first show the representation and then the natural language. The text needs to include all of the information in the representation.\n\n{{input | join(\", \")}} {% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "bleu_stderr": 0.03303866348324263}, {"task_name": "GEM/web_nlg_en", "prompt_name": "PALM_prompt", "rouge1_precision": 0.06729841211664779, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "3e41305c-5461-4cf3-853d-8a6fb5747623", "prompt_jinja": "I will verbalize an abstract representation of a sentence in natural language. To do so, I will first show the representation and then the natural language. The text needs to include all of the information in the representation.\n\n{{input | join(\", \")}} {% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rouge1_precision_stderr": 0.0014779125598332707}, {"task_name": "GEM/web_nlg_en", "prompt_name": "PALM_prompt", "rouge1_recall": 0.29304332001997996, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "3e41305c-5461-4cf3-853d-8a6fb5747623", "prompt_jinja": "I will verbalize an abstract representation of a sentence in natural language. To do so, I will first show the representation and then the natural language. The text needs to include all of the information in the representation.\n\n{{input | join(\", \")}} {% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rouge1_recall_stderr": 0.004946222762455597}, {"task_name": "GEM/web_nlg_en", "prompt_name": "PALM_prompt", "rouge1_fmeasure": 0.1034915916030452, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "3e41305c-5461-4cf3-853d-8a6fb5747623", "prompt_jinja": "I will verbalize an abstract representation of a sentence in natural language. To do so, I will first show the representation and then the natural language. The text needs to include all of the information in the representation.\n\n{{input | join(\", \")}} {% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rouge1_fmeasure_stderr": 0.0020463645545355925}, {"task_name": "GEM/web_nlg_en", "prompt_name": "PALM_prompt", "rouge2_precision": 0.031583798963941895, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "3e41305c-5461-4cf3-853d-8a6fb5747623", "prompt_jinja": "I will verbalize an abstract representation of a sentence in natural language. To do so, I will first show the representation and then the natural language. 
The text needs to include all of the information in the representation.\n\n{{input | join(\", \")}} {% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rouge2_precision_stderr": 0.0008797642983980407}, {"task_name": "GEM/web_nlg_en", "prompt_name": "PALM_prompt", "rouge2_recall": 0.14233929411373747, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "3e41305c-5461-4cf3-853d-8a6fb5747623", "prompt_jinja": "I will verbalize an abstract representation of a sentence in natural language. To do so, I will first show the representation and then the natural language. The text needs to include all of the information in the representation.\n\n{{input | join(\", \")}} {% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rouge2_recall_stderr": 0.003345640165065958}, {"task_name": "GEM/web_nlg_en", "prompt_name": "PALM_prompt", "rouge2_fmeasure": 0.048745998009714464, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "3e41305c-5461-4cf3-853d-8a6fb5747623", "prompt_jinja": "I will verbalize an abstract representation of a sentence in natural language. To do so, I will first show the representation and then the natural language. The text needs to include all of the information in the representation.\n\n{{input | join(\", \")}} {% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rouge2_fmeasure_stderr": 0.0012510511387985711}, {"task_name": "GEM/web_nlg_en", "prompt_name": "PALM_prompt", "rougeL_precision": 0.06496957050414408, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "3e41305c-5461-4cf3-853d-8a6fb5747623", "prompt_jinja": "I will verbalize an abstract representation of a sentence in natural language. To do so, I will first show the representation and then the natural language. The text needs to include all of the information in the representation.\n\n{{input | join(\", \")}} {% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rougeL_precision_stderr": 0.0013738739239028245}, {"task_name": "GEM/web_nlg_en", "prompt_name": "PALM_prompt", "rougeL_recall": 0.2861388731751563, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "3e41305c-5461-4cf3-853d-8a6fb5747623", "prompt_jinja": "I will verbalize an abstract representation of a sentence in natural language. To do so, I will first show the representation and then the natural language. The text needs to include all of the information in the representation.\n\n{{input | join(\", \")}} {% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rougeL_recall_stderr": 0.004839232451958966}, {"task_name": "GEM/web_nlg_en", "prompt_name": "PALM_prompt", "rougeL_fmeasure": 0.10020083541023005, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "3e41305c-5461-4cf3-853d-8a6fb5747623", "prompt_jinja": "I will verbalize an abstract representation of a sentence in natural language. To do so, I will first show the representation and then the natural language. 
The text needs to include all of the information in the representation.\n\n{{input | join(\", \")}} {% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rougeL_fmeasure_stderr": 0.0019268832996130537}, {"task_name": "GEM/web_nlg_en", "prompt_name": "PALM_prompt", "rougeLsum_precision": 0.06418206519400182, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "3e41305c-5461-4cf3-853d-8a6fb5747623", "prompt_jinja": "I will verbalize an abstract representation of a sentence in natural language. To do so, I will first show the representation and then the natural language. The text needs to include all of the information in the representation.\n\n{{input | join(\", \")}} {% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rougeLsum_precision_stderr": 0.001390783570882245}, {"task_name": "GEM/web_nlg_en", "prompt_name": "PALM_prompt", "rougeLsum_recall": 0.2794417864990063, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "3e41305c-5461-4cf3-853d-8a6fb5747623", "prompt_jinja": "I will verbalize an abstract representation of a sentence in natural language. To do so, I will first show the representation and then the natural language. The text needs to include all of the information in the representation.\n\n{{input | join(\", \")}} {% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rougeLsum_recall_stderr": 0.004636157223173928}, {"task_name": "GEM/web_nlg_en", "prompt_name": "PALM_prompt", "rougeLsum_fmeasure": 0.09872549675778793, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "3e41305c-5461-4cf3-853d-8a6fb5747623", "prompt_jinja": "I will verbalize an abstract representation of a sentence in natural language. To do so, I will first show the representation and then the natural language. The text needs to include all of the information in the representation.\n\n{{input | join(\", \")}} {% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rougeLsum_fmeasure_stderr": 0.0019292683793847847}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-2b8-55b-c4seeds/2b855b9bc4seed2/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 0, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}}
2b855b9bc4seed2/evaluation/generation/agg.2b855b9bc4seed2_GEM-wiki_lingua_en_tldr_en_0.json ADDED
@@ -0,0 +1 @@
1
+ {"results": [{"task_name": "GEM/wiki_lingua_en", "prompt_name": "tldr_en", "rouge1_precision": 0.15312528607963066, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "d3c5baa3-5e37-46f8-b1b2-5b834181c9da", "prompt_jinja": "{{source}}\n\nTL;DR in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge1_precision_stderr": 0.0018475398053558751}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "tldr_en", "rouge1_recall": 0.26069676849871193, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "d3c5baa3-5e37-46f8-b1b2-5b834181c9da", "prompt_jinja": "{{source}}\n\nTL;DR in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge1_recall_stderr": 0.002553411390285405}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "tldr_en", "rouge1_fmeasure": 0.17949455554206634, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "d3c5baa3-5e37-46f8-b1b2-5b834181c9da", "prompt_jinja": "{{source}}\n\nTL;DR in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge1_fmeasure_stderr": 0.001796797335172746}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "tldr_en", "rouge2_precision": 0.03025768195820637, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "d3c5baa3-5e37-46f8-b1b2-5b834181c9da", "prompt_jinja": "{{source}}\n\nTL;DR in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge2_precision_stderr": 0.000737285788609388}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "tldr_en", "rouge2_recall": 0.0526377509392788, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "d3c5baa3-5e37-46f8-b1b2-5b834181c9da", "prompt_jinja": "{{source}}\n\nTL;DR in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge2_recall_stderr": 0.0013707148216665472}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "tldr_en", "rouge2_fmeasure": 0.03551854752059051, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "d3c5baa3-5e37-46f8-b1b2-5b834181c9da", "prompt_jinja": "{{source}}\n\nTL;DR in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge2_fmeasure_stderr": 0.0008299460804507645}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "tldr_en", "rougeL_precision": 0.11900796171626439, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "d3c5baa3-5e37-46f8-b1b2-5b834181c9da", "prompt_jinja": "{{source}}\n\nTL;DR in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeL_precision_stderr": 0.0012961585979789718}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "tldr_en", "rougeL_recall": 0.21022152388268012, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "d3c5baa3-5e37-46f8-b1b2-5b834181c9da", "prompt_jinja": "{{source}}\n\nTL;DR in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeL_recall_stderr": 0.0020859201845126016}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "tldr_en", "rougeL_fmeasure": 0.14125944815133307, "fixed_answer_choice_list": null, "dataset_path": 
"GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "d3c5baa3-5e37-46f8-b1b2-5b834181c9da", "prompt_jinja": "{{source}}\n\nTL;DR in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeL_fmeasure_stderr": 0.001293606648023059}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "tldr_en", "rougeLsum_precision": 0.14065190421878745, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "d3c5baa3-5e37-46f8-b1b2-5b834181c9da", "prompt_jinja": "{{source}}\n\nTL;DR in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeLsum_precision_stderr": 0.0016987560973759559}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "tldr_en", "rougeLsum_recall": 0.24028450248920846, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "d3c5baa3-5e37-46f8-b1b2-5b834181c9da", "prompt_jinja": "{{source}}\n\nTL;DR in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeLsum_recall_stderr": 0.0023647123157988985}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "tldr_en", "rougeLsum_fmeasure": 0.16495917517256573, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "d3c5baa3-5e37-46f8-b1b2-5b834181c9da", "prompt_jinja": "{{source}}\n\nTL;DR in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeLsum_fmeasure_stderr": 0.001644741818116919}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "tldr_en", "bleu": 1.5392368463230133, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "d3c5baa3-5e37-46f8-b1b2-5b834181c9da", "prompt_jinja": "{{source}}\n\nTL;DR in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "bleu_stderr": 0.053948912749046035}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-2b8-55b-c4seeds/2b855b9bc4seed2/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 0, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}}
2b855b9bc4seed2/evaluation/generation/agg.2b855b9bc4seed2_GEM-wiki_lingua_en_tldr_en_1.json ADDED
@@ -0,0 +1 @@
+ {"results": [{"task_name": "GEM/wiki_lingua_en", "prompt_name": "tldr_en", "rouge1_precision": 0.1668110350174811, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "d3c5baa3-5e37-46f8-b1b2-5b834181c9da", "prompt_jinja": "{{source}}\n\nTL;DR in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge1_precision_stderr": 0.0019877097402536763}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "tldr_en", "rouge1_recall": 0.28257427077519554, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "d3c5baa3-5e37-46f8-b1b2-5b834181c9da", "prompt_jinja": "{{source}}\n\nTL;DR in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge1_recall_stderr": 0.002760368419127019}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "tldr_en", "rouge1_fmeasure": 0.19472791234999667, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "d3c5baa3-5e37-46f8-b1b2-5b834181c9da", "prompt_jinja": "{{source}}\n\nTL;DR in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge1_fmeasure_stderr": 0.001906191600167485}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "tldr_en", "rouge2_precision": 0.035928254592281224, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "d3c5baa3-5e37-46f8-b1b2-5b834181c9da", "prompt_jinja": "{{source}}\n\nTL;DR in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge2_precision_stderr": 0.0008382143224304085}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "tldr_en", "rouge2_recall": 0.06299590355552512, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "d3c5baa3-5e37-46f8-b1b2-5b834181c9da", "prompt_jinja": "{{source}}\n\nTL;DR in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge2_recall_stderr": 0.0015297554624304605}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "tldr_en", "rouge2_fmeasure": 0.04208923831822898, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "d3c5baa3-5e37-46f8-b1b2-5b834181c9da", "prompt_jinja": "{{source}}\n\nTL;DR in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge2_fmeasure_stderr": 0.0009197645366087788}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "tldr_en", "rougeL_precision": 0.12071476283575872, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "d3c5baa3-5e37-46f8-b1b2-5b834181c9da", "prompt_jinja": "{{source}}\n\nTL;DR in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeL_precision_stderr": 0.0013170745626523695}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "tldr_en", "rougeL_recall": 0.21195380139438252, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "d3c5baa3-5e37-46f8-b1b2-5b834181c9da", "prompt_jinja": "{{source}}\n\nTL;DR in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeL_recall_stderr": 0.002156914307318323}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "tldr_en", "rougeL_fmeasure": 0.14254660444247075, "fixed_answer_choice_list": null, "dataset_path": 
"GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "d3c5baa3-5e37-46f8-b1b2-5b834181c9da", "prompt_jinja": "{{source}}\n\nTL;DR in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeL_fmeasure_stderr": 0.0012952641600081254}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "tldr_en", "rougeLsum_precision": 0.15552475435698954, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "d3c5baa3-5e37-46f8-b1b2-5b834181c9da", "prompt_jinja": "{{source}}\n\nTL;DR in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeLsum_precision_stderr": 0.0018425071564360774}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "tldr_en", "rougeLsum_recall": 0.2646754231072801, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "d3c5baa3-5e37-46f8-b1b2-5b834181c9da", "prompt_jinja": "{{source}}\n\nTL;DR in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeLsum_recall_stderr": 0.0026025376230503126}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "tldr_en", "rougeLsum_fmeasure": 0.18187224601097693, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "d3c5baa3-5e37-46f8-b1b2-5b834181c9da", "prompt_jinja": "{{source}}\n\nTL;DR in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeLsum_fmeasure_stderr": 0.0017770105186546616}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "tldr_en", "bleu": 2.0614970786142583, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "d3c5baa3-5e37-46f8-b1b2-5b834181c9da", "prompt_jinja": "{{source}}\n\nTL;DR in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "bleu_stderr": 0.09284907950143785}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-2b8-55b-c4seeds/2b855b9bc4seed2/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 1, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}}
2b855b9bc4seed2/evaluation/generation/agg.2b855b9bc4seed2_e2e_nlg_cleaned_generate_text_restaurant_0.json ADDED
@@ -0,0 +1 @@
+ {"results": [{"task_name": "e2e_nlg_cleaned", "prompt_name": "generate_text_restaurant", "bleu": 0.027496856259569415, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "1acabbc3-c9b9-4624-a684-29faeccff46f", "prompt_jinja": "Given the following data about a restaurant:\n{% for feature in meaning_representation.split(\"]\") %} {% set key = feature.split(\"[\")[0].replace(\",\",\"\") %} {% set value = feature.replace(\",\",\"\").replace(key+\"[\", '''') %}\n{% if value != \"\" %} {{key}} : {{value}} {% endif %}\n{%- endfor %}\nGenerate some text about this restaurant. ||| {{human_reference}}", "prompt_original_task": true, "comment": "", "bleu_stderr": 0.007385963274542756}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "generate_text_restaurant", "rouge1_precision": 0.047807397913058214, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "1acabbc3-c9b9-4624-a684-29faeccff46f", "prompt_jinja": "Given the following data about a restaurant:\n{% for feature in meaning_representation.split(\"]\") %} {% set key = feature.split(\"[\")[0].replace(\",\",\"\") %} {% set value = feature.replace(\",\",\"\").replace(key+\"[\", '''') %}\n{% if value != \"\" %} {{key}} : {{value}} {% endif %}\n{%- endfor %}\nGenerate some text about this restaurant. ||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rouge1_precision_stderr": 0.0011364617294874694}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "generate_text_restaurant", "rouge1_recall": 0.022673512097835077, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "1acabbc3-c9b9-4624-a684-29faeccff46f", "prompt_jinja": "Given the following data about a restaurant:\n{% for feature in meaning_representation.split(\"]\") %} {% set key = feature.split(\"[\")[0].replace(\",\",\"\") %} {% set value = feature.replace(\",\",\"\").replace(key+\"[\", '''') %}\n{% if value != \"\" %} {{key}} : {{value}} {% endif %}\n{%- endfor %}\nGenerate some text about this restaurant. ||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rouge1_recall_stderr": 0.0005137246819540603}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "generate_text_restaurant", "rouge1_fmeasure": 0.02786087179209838, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "1acabbc3-c9b9-4624-a684-29faeccff46f", "prompt_jinja": "Given the following data about a restaurant:\n{% for feature in meaning_representation.split(\"]\") %} {% set key = feature.split(\"[\")[0].replace(\",\",\"\") %} {% set value = feature.replace(\",\",\"\").replace(key+\"[\", '''') %}\n{% if value != \"\" %} {{key}} : {{value}} {% endif %}\n{%- endfor %}\nGenerate some text about this restaurant. 
||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rouge1_fmeasure_stderr": 0.0005858506970554961}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "generate_text_restaurant", "rouge2_precision": 0.0005045565387850128, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "1acabbc3-c9b9-4624-a684-29faeccff46f", "prompt_jinja": "Given the following data about a restaurant:\n{% for feature in meaning_representation.split(\"]\") %} {% set key = feature.split(\"[\")[0].replace(\",\",\"\") %} {% set value = feature.replace(\",\",\"\").replace(key+\"[\", '''') %}\n{% if value != \"\" %} {{key}} : {{value}} {% endif %}\n{%- endfor %}\nGenerate some text about this restaurant. ||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rouge2_precision_stderr": 0.00014549502538483447}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "generate_text_restaurant", "rouge2_recall": 0.00021403935275164058, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "1acabbc3-c9b9-4624-a684-29faeccff46f", "prompt_jinja": "Given the following data about a restaurant:\n{% for feature in meaning_representation.split(\"]\") %} {% set key = feature.split(\"[\")[0].replace(\",\",\"\") %} {% set value = feature.replace(\",\",\"\").replace(key+\"[\", '''') %}\n{% if value != \"\" %} {{key}} : {{value}} {% endif %}\n{%- endfor %}\nGenerate some text about this restaurant. ||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rouge2_recall_stderr": 5.468859488917244e-05}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "generate_text_restaurant", "rouge2_fmeasure": 0.0002573048402040541, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "1acabbc3-c9b9-4624-a684-29faeccff46f", "prompt_jinja": "Given the following data about a restaurant:\n{% for feature in meaning_representation.split(\"]\") %} {% set key = feature.split(\"[\")[0].replace(\",\",\"\") %} {% set value = feature.replace(\",\",\"\").replace(key+\"[\", '''') %}\n{% if value != \"\" %} {{key}} : {{value}} {% endif %}\n{%- endfor %}\nGenerate some text about this restaurant. ||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rouge2_fmeasure_stderr": 6.575032923202268e-05}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "generate_text_restaurant", "rougeL_precision": 0.04750467841033873, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "1acabbc3-c9b9-4624-a684-29faeccff46f", "prompt_jinja": "Given the following data about a restaurant:\n{% for feature in meaning_representation.split(\"]\") %} {% set key = feature.split(\"[\")[0].replace(\",\",\"\") %} {% set value = feature.replace(\",\",\"\").replace(key+\"[\", '''') %}\n{% if value != \"\" %} {{key}} : {{value}} {% endif %}\n{%- endfor %}\nGenerate some text about this restaurant. 
||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rougeL_precision_stderr": 0.0011193107075921842}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "generate_text_restaurant", "rougeL_recall": 0.022556340907519876, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "1acabbc3-c9b9-4624-a684-29faeccff46f", "prompt_jinja": "Given the following data about a restaurant:\n{% for feature in meaning_representation.split(\"]\") %} {% set key = feature.split(\"[\")[0].replace(\",\",\"\") %} {% set value = feature.replace(\",\",\"\").replace(key+\"[\", '''') %}\n{% if value != \"\" %} {{key}} : {{value}} {% endif %}\n{%- endfor %}\nGenerate some text about this restaurant. ||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rougeL_recall_stderr": 0.000501594662926671}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "generate_text_restaurant", "rougeL_fmeasure": 0.02773258214801112, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "1acabbc3-c9b9-4624-a684-29faeccff46f", "prompt_jinja": "Given the following data about a restaurant:\n{% for feature in meaning_representation.split(\"]\") %} {% set key = feature.split(\"[\")[0].replace(\",\",\"\") %} {% set value = feature.replace(\",\",\"\").replace(key+\"[\", '''') %}\n{% if value != \"\" %} {{key}} : {{value}} {% endif %}\n{%- endfor %}\nGenerate some text about this restaurant. ||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rougeL_fmeasure_stderr": 0.0005799438210362486}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "generate_text_restaurant", "rougeLsum_precision": 0.04681714649780676, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "1acabbc3-c9b9-4624-a684-29faeccff46f", "prompt_jinja": "Given the following data about a restaurant:\n{% for feature in meaning_representation.split(\"]\") %} {% set key = feature.split(\"[\")[0].replace(\",\",\"\") %} {% set value = feature.replace(\",\",\"\").replace(key+\"[\", '''') %}\n{% if value != \"\" %} {{key}} : {{value}} {% endif %}\n{%- endfor %}\nGenerate some text about this restaurant. ||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rougeLsum_precision_stderr": 0.0011159994745820552}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "generate_text_restaurant", "rougeLsum_recall": 0.02201246000410449, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "1acabbc3-c9b9-4624-a684-29faeccff46f", "prompt_jinja": "Given the following data about a restaurant:\n{% for feature in meaning_representation.split(\"]\") %} {% set key = feature.split(\"[\")[0].replace(\",\",\"\") %} {% set value = feature.replace(\",\",\"\").replace(key+\"[\", '''') %}\n{% if value != \"\" %} {{key}} : {{value}} {% endif %}\n{%- endfor %}\nGenerate some text about this restaurant. 
||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rougeLsum_recall_stderr": 0.00047810257232395407}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "generate_text_restaurant", "rougeLsum_fmeasure": 0.027138243313024377, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "1acabbc3-c9b9-4624-a684-29faeccff46f", "prompt_jinja": "Given the following data about a restaurant:\n{% for feature in meaning_representation.split(\"]\") %} {% set key = feature.split(\"[\")[0].replace(\",\",\"\") %} {% set value = feature.replace(\",\",\"\").replace(key+\"[\", '''') %}\n{% if value != \"\" %} {{key}} : {{value}} {% endif %}\n{%- endfor %}\nGenerate some text about this restaurant. ||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rougeLsum_fmeasure_stderr": 0.0005613131933490365}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-2b8-55b-c4seeds/2b855b9bc4seed2/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 0, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}}