baebee committed on
Commit
1541bbe
·
1 Parent(s): 5a1b1d5

Update README.md

Browse files
Files changed (1) hide show
  1. README.md +68 -1
README.md CHANGED
@@ -37,4 +37,71 @@ tags:
37
  ## Limitations
38
 
39
  - May hallucinate or generate incorrect information
40
- - Large model size leads to high compute requirements
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
37
  ## Limitations
38
 
39
  - May hallucinate or generate incorrect information
40
+ - Large model size leads to high compute requirements
41
+
42
+
43
+ @misc{open-llm-leaderboard,
44
+ author = {Edward Beeching and Clémentine Fourrier and Nathan Habib and Sheon Han and Nathan Lambert and Nazneen Rajani and Omar Sanseviero and Lewis Tunstall and Thomas Wolf},
45
+ title = {Open {LLM} Leaderboard},
46
+ year = {2023},
47
+ publisher = {Hugging Face},
48
+ howpublished = "\url{https://huggingface.co/spaces/HuggingFaceH4/open_llm_leaderboard}"
49
+ }
50
+ @software{eval-harness,
51
+ author = {Gao, Leo and
52
+ Tow, Jonathan and
53
+ Biderman, Stella and
54
+ Black, Sid and
55
+ DiPofi, Anthony and
56
+ Foster, Charles and
57
+ Golding, Laurence and
58
+ Hsu, Jeffrey and
59
+ McDonell, Kyle and
60
+ Muennighoff, Niklas and
61
+ Phang, Jason and
62
+ Reynolds, Laria and
63
+ Tang, Eric and
64
+ Thite, Anish and
65
+ Wang, Ben and
66
+ Wang, Kevin and
67
+ Zou, Andy},
68
+ title = {A framework for few-shot language model evaluation},
69
+ month = sep,
70
+ year = 2021,
71
+ publisher = {Zenodo},
72
+ version = {v0.0.1},
73
+ doi = {10.5281/zenodo.5371628},
74
+ url = {https://doi.org/10.5281/zenodo.5371628}
75
+ }
76
+ @misc{clark2018think,
77
+ title={Think you have Solved Question Answering? Try {ARC}, the {AI2} Reasoning Challenge},
78
+ author={Peter Clark and Isaac Cowhey and Oren Etzioni and Tushar Khot and Ashish Sabharwal and Carissa Schoenick and Oyvind Tafjord},
79
+ year={2018},
80
+ eprint={1803.05457},
81
+ archivePrefix={arXiv},
82
+ primaryClass={cs.AI}
83
+ }
84
+ @misc{zellers2019hellaswag,
85
+ title={{HellaSwag}: Can a Machine Really Finish Your Sentence?},
86
+ author={Rowan Zellers and Ari Holtzman and Yonatan Bisk and Ali Farhadi and Yejin Choi},
87
+ year={2019},
88
+ eprint={1905.07830},
89
+ archivePrefix={arXiv},
90
+ primaryClass={cs.CL}
91
+ }
92
+ @misc{hendrycks2021measuring,
93
+ title={Measuring Massive Multitask Language Understanding},
94
+ author={Dan Hendrycks and Collin Burns and Steven Basart and Andy Zou and Mantas Mazeika and Dawn Song and Jacob Steinhardt},
95
+ year={2021},
96
+ eprint={2009.03300},
97
+ archivePrefix={arXiv},
98
+ primaryClass={cs.CY}
99
+ }
100
+ @misc{lin2022truthfulqa,
101
+ title={{TruthfulQA}: Measuring How Models Mimic Human Falsehoods},
102
+ author={Stephanie Lin and Jacob Hilton and Owain Evans},
103
+ year={2022},
104
+ eprint={2109.07958},
105
+ archivePrefix={arXiv},
106
+ primaryClass={cs.CL}
107
+ }