"""Task definitions and display text for the leaderboard.

Defines the ``Task`` record, the ``Tasks`` enumeration of evaluated tasks,
and the module-level string constants (title, subtitle, introduction, and
the other text blocks) defined below.
"""

from dataclasses import dataclass
from enum import Enum


@dataclass
class Task:
    """One leaderboard task: where to find its score and how to label it."""

    benchmark: str  # task_key in the json file
    metric: str  # metric_key in the json file
    col_name: str  # name to display in the leaderboard


# Init: to update with your specific keys
class Tasks(Enum):
    """Enumeration of the tasks shown on the leaderboard.

    Each member's value is a ``Task`` built as
    ``Task(task_key, metric_key, display_name)``.
    """

    # task_key in the json file, metric_key in the json file,
    # name to display in the leaderboard
    task0 = Task("task_name1", "metric_name", "First task")
    task1 = Task("task_name2", "metric_name", "Second task")


# Your leaderboard name
TITLE = """

🏆 Auto Arena of LLMs

"""

# subtitle
SUB_TITLE = """

Automating LLM Evaluations with Agent Peer-battles and Committee Discussions

"""

# What does your leaderboard evaluate?
# NOTE(review): the "[paper]()" link below has an empty target; fill in the
# URL once it is known.
INTRODUCTION_TEXT = (
    " This leaderboard is from a completely automated large language model "
    "(LLM) evaluation framework by employing various LLM agents in "
    "peer-battles and committee discussions. You can find more details from "
    "the [project page](https://auto-arena.github.io/) and our [paper](). "
)
# NOTE(review): the commented lines below look like leftover introduction
# text; kept verbatim.
# For additional details such as datasets, evaluation criteria, and reproducibility, please refer to the "📝 About" tab.
# Stay tuned for the *SeaBench leaderboard* - focusing on evaluating the model's ability to respond to general human instructions in real-world multi-turn settings.
# """

# Which evaluations are you running? how can people reproduce what you have?
# NOTE(review): the value is only an opening code fence; presumably a
# placeholder to be filled in. Plain string: there is nothing to interpolate.
LLM_BENCHMARKS_TEXT = """ ``` """
# You can find the detailed numerical results in the results Hugging Face dataset: https://huggingface.co/datasets/SeaLLMs/SeaExam-results

EVALUATION_QUEUE_TEXT = """ """

CITATION_BUTTON_LABEL = ""
CITATION_BUTTON_TEXT = r""" """

# Plain string: there is nothing to interpolate.
CONTACT_TEXT = """ ## Contact """