Update app.py
app.py CHANGED

@@ -25,7 +25,7 @@ def make_leaderboard_md(elo_results):
 - [MT-Bench](https://arxiv.org/abs/2306.05685) - a set of challenging multi-turn questions. We use GPT-4 to grade the model responses.
 - [MMLU](https://arxiv.org/abs/2009.03300) (5-shot) - a test to measure a model's multitask accuracy on 57 tasks.
 
-💻 Code: The Arena Elo ratings are computed by this [notebook]({notebook_url}). The MT-bench scores (single-answer grading on a scale of 10) are computed by [fastchat.llm_judge](https://github.com/lm-sys/FastChat/tree/main/fastchat/llm_judge). The MMLU scores are computed by [InstructEval](https://github.com/declare-lab/instruct-eval) and [Chain-of-Thought Hub](https://github.com/FranxYao/chain-of-thought-hub). Higher values are better for all benchmarks. Empty cells mean not available.
+💻 Code: The Arena Elo ratings are computed by this [notebook]({notebook_url}). The MT-bench scores (single-answer grading on a scale of 10) are computed by [fastchat.llm_judge](https://github.com/lm-sys/FastChat/tree/main/fastchat/llm_judge). The MMLU scores are computed by [InstructEval](https://github.com/declare-lab/instruct-eval) and [Chain-of-Thought Hub](https://github.com/FranxYao/chain-of-thought-hub). Higher values are better for all benchmarks. Empty cells mean not available. Last updated: Sept, 2023.
 """
     return leaderboard_md
 
@@ -233,6 +233,9 @@ Please note that you may see different orders from different ranking methods. Th
                 "#### Figure 4: Average Win Rate Against All Other Models (Assuming Uniform Sampling and No Ties)"
             )
             plot_4 = gr.Plot(p4, show_label=False)
+
+    gr.Markdown(acknowledgment_md)
+
     return [md_1, plot_1, plot_2, plot_3, plot_4]
 
 block_css = """
@@ -294,7 +297,6 @@ def build_demo(elo_results_file, leaderboard_table_file):
         leader_components = build_leaderboard_tab(
             elo_results_file, leaderboard_table_file
         )
-        gr.Markdown(acknowledgment_md)
 
     return demo
 
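Taken together, the commit does two things: it appends "Last updated: Sept, 2023." to the leaderboard description string, and it moves gr.Markdown(acknowledgment_md) from build_demo into build_leaderboard_tab, so the acknowledgment now renders inside the leaderboard tab right before its components are returned. Below is a minimal sketch of that structure; the placeholder strings and the stripped-down function bodies are illustrative only, not the actual app.py implementation.

import gradio as gr

# Placeholder strings standing in for the real markdown in app.py (illustrative only).
leaderboard_md = "## Leaderboard\n\nLast updated: Sept, 2023."
acknowledgment_md = "### Acknowledgment\n\nThanks to our sponsors and contributors."


def build_leaderboard_tab():
    # Components are created inside the enclosing gr.Blocks() context.
    md_1 = gr.Markdown(leaderboard_md)
    plot_4 = gr.Plot(show_label=False)  # the real tab builds four plots

    # After this commit, the acknowledgment is rendered at the bottom of the
    # leaderboard tab, just before the components are returned.
    gr.Markdown(acknowledgment_md)

    return [md_1, plot_4]


def build_demo():
    with gr.Blocks(title="Leaderboard sketch") as demo:
        build_leaderboard_tab()
        # Before this commit, gr.Markdown(acknowledgment_md) was emitted here,
        # after build_leaderboard_tab() had returned.
    return demo


if __name__ == "__main__":
    build_demo().launch()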