wip added primary category tasks
Browse files
- src/about.py +14 -4
- src/display/utils.py +43 -23
- src/envs.py +2 -2
- src/leaderboard/read_evals.py +4 -0
src/about.py
CHANGED
@@ -10,10 +10,20 @@ class Task:
|
|
10 |
|
11 |
# Select your tasks here
|
12 |
# ---------------------------------------------------
|
13 |
-
|
14 |
-
|
15 |
-
|
16 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
17 |
|
18 |
NUM_FEWSHOT = 0 # Change with your few shot
|
19 |
# ---------------------------------------------------
|
|
|
10 |
|
11 |
# Select your tasks here
|
12 |
# ---------------------------------------------------
|
13 |
+
if False:
|
14 |
+
class Tasks(Enum):
|
15 |
+
# task_key in the json file, metric_key in the json file, name to display in the leaderboard
|
16 |
+
task0 = Task("anli_r1", "acc", "ANLI")
|
17 |
+
task1 = Task("logiqa", "acc_norm", "LogiQA")
|
18 |
+
else:
|
19 |
+
class Tasks(Enum):
|
20 |
+
# task_key in the json file, metric_key in the json file, name to display in the leaderboard
|
21 |
+
# NOTE: the task_key must be unique
|
22 |
+
task0 = Task("default_primary_subfield_accuracy", "primary_subfield_accuracy", "Accuracy of predicting the released primary category on the default split")
|
23 |
+
task1 = Task("default_primary_top_3_tpr", "primary_top_3_tpr", "Occurrence of the released primary category in the top-3 predictions on the default split")
|
24 |
+
task2 = Task("default_primary_top_5_tpr", "primary_top_5_tpr", "Occurrence of the released primary category in the top-5 predictions on the default split")
|
25 |
+
task3 = Task("default_primary_top_10_tpr", "primary_top_10_tpr", "Occurrence of the released primary category in the top-10 predictions on the default split")
|
26 |
+
# task1 = Task("all2023_v2", "acc", "Acc on all 2023 papers")
|
27 |
|
28 |
NUM_FEWSHOT = 0 # Change with your few shot
|
29 |
# ---------------------------------------------------
|
src/display/utils.py
CHANGED
@@ -1,4 +1,4 @@
|
|
1 |
-
from dataclasses import dataclass, make_dataclass
|
2 |
from enum import Enum
|
3 |
|
4 |
import pandas as pd
|
@@ -21,27 +21,46 @@ class ColumnContent:
|
|
21 |
never_hidden: bool = False
|
22 |
|
23 |
## Leaderboard columns
|
24 |
-
|
25 |
-
|
26 |
-
|
27 |
-
auto_eval_column_dict.append(["model", ColumnContent, ColumnContent("Model", "markdown", True, never_hidden=True)])
|
28 |
-
|
29 |
-
|
30 |
-
|
31 |
-
|
32 |
-
|
33 |
-
|
34 |
-
auto_eval_column_dict.append(["architecture", ColumnContent, ColumnContent("Architecture", "str", False)])
|
35 |
-
auto_eval_column_dict.append(["weight_type", ColumnContent, ColumnContent("Weight type", "str", False, True)])
|
36 |
-
auto_eval_column_dict.append(["precision", ColumnContent, ColumnContent("Precision", "str", False)])
|
37 |
-
auto_eval_column_dict.append(["license", ColumnContent, ColumnContent("Hub License", "str", False)])
|
38 |
-
auto_eval_column_dict.append(["params", ColumnContent, ColumnContent("#Params (B)", "number", False)])
|
39 |
-
auto_eval_column_dict.append(["likes", ColumnContent, ColumnContent("Hub ❤️", "number", False)])
|
40 |
-
auto_eval_column_dict.append(["still_on_hub", ColumnContent, ColumnContent("Available on the hub", "bool", False)])
|
41 |
-
auto_eval_column_dict.append(["revision", ColumnContent, ColumnContent("Model sha", "str", False, False)])
|
42 |
-
|
43 |
-
# We use make dataclass to dynamically fill the scores from Tasks
|
44 |
-
AutoEvalColumn = make_dataclass("AutoEvalColumn", auto_eval_column_dict, frozen=True)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
45 |
|
46 |
## For the queue columns in the submission tab
|
47 |
@dataclass(frozen=True)
|
@@ -107,4 +126,5 @@ EVAL_COLS = [c.name for c in fields(EvalQueueColumn)]
|
|
107 |
EVAL_TYPES = [c.type for c in fields(EvalQueueColumn)]
|
108 |
|
109 |
BENCHMARK_COLS = [t.value.col_name for t in Tasks]
|
110 |
-
|
|
|
|
1 |
+
from dataclasses import dataclass, make_dataclass, field
|
2 |
from enum import Enum
|
3 |
|
4 |
import pandas as pd
|
|
|
21 |
never_hidden: bool = False
|
22 |
|
23 |
## Leaderboard columns
|
24 |
+
if False:
|
25 |
+
auto_eval_column_dict = []
|
26 |
+
# Init
|
27 |
+
auto_eval_column_dict.append(["model_type_symbol", ColumnContent, ColumnContent("T", "str", True, never_hidden=True)])
|
28 |
+
auto_eval_column_dict.append(["model", ColumnContent, ColumnContent("Model", "markdown", True, never_hidden=True)])
|
29 |
+
#Scores
|
30 |
+
auto_eval_column_dict.append(["average", ColumnContent, ColumnContent("Average ⬆️", "number", True)])
|
31 |
+
for task in Tasks:
|
32 |
+
auto_eval_column_dict.append([task.name, ColumnContent, ColumnContent(task.value.col_name, "number", True)])
|
33 |
+
# Model information
|
34 |
+
auto_eval_column_dict.append(["model_type", ColumnContent, ColumnContent("Type", "str", False)])
|
35 |
+
auto_eval_column_dict.append(["architecture", ColumnContent, ColumnContent("Architecture", "str", False)])
|
36 |
+
auto_eval_column_dict.append(["weight_type", ColumnContent, ColumnContent("Weight type", "str", False, True)])
|
37 |
+
auto_eval_column_dict.append(["precision", ColumnContent, ColumnContent("Precision", "str", False)])
|
38 |
+
auto_eval_column_dict.append(["license", ColumnContent, ColumnContent("Hub License", "str", False)])
|
39 |
+
auto_eval_column_dict.append(["params", ColumnContent, ColumnContent("#Params (B)", "number", False)])
|
40 |
+
auto_eval_column_dict.append(["likes", ColumnContent, ColumnContent("Hub ❤️", "number", False)])
|
41 |
+
auto_eval_column_dict.append(["still_on_hub", ColumnContent, ColumnContent("Available on the hub", "bool", False)])
|
42 |
+
auto_eval_column_dict.append(["revision", ColumnContent, ColumnContent("Model sha", "str", False, False)])
|
43 |
+
# We use make dataclass to dynamically fill the scores from Tasks
|
44 |
+
AutoEvalColumn = make_dataclass("AutoEvalColumn", auto_eval_column_dict, frozen=True)
|
45 |
+
|
46 |
+
@dataclass(frozen=True)
|
47 |
+
class AutoEvalColumn:
|
48 |
+
model_type_symbol = ColumnContent("T", "str", True, never_hidden=True)
|
49 |
+
model = ColumnContent("Model", "markdown", True, never_hidden=True)
|
50 |
+
average = ColumnContent("Average ⬆️", "number", True)
|
51 |
+
model_type = ColumnContent("Type", "str", False)
|
52 |
+
architecture = ColumnContent("Architecture", "str", False)
|
53 |
+
weight_type = ColumnContent("Weight type", "str", False, True)
|
54 |
+
precision = ColumnContent("Precision", "str", False)
|
55 |
+
license = ColumnContent("Hub License", "str", False)
|
56 |
+
params = ColumnContent("#Params (B)", "number", False)
|
57 |
+
likes = ColumnContent("Hub ❤️", "number", False)
|
58 |
+
still_on_hub = ColumnContent("Available on the hub", "bool", False)
|
59 |
+
revision = ColumnContent("Model sha", "str", False, False)
|
60 |
+
# Dynamically add task columns
|
61 |
+
def __init__(self):
|
62 |
+
for task in Tasks:
|
63 |
+
setattr(self.__class__, task.name, ColumnContent(task.value.col_name, "number", True))
|
64 |
|
65 |
## For the queue columns in the submission tab
|
66 |
@dataclass(frozen=True)
|
|
|
126 |
EVAL_TYPES = [c.type for c in fields(EvalQueueColumn)]
|
127 |
|
128 |
BENCHMARK_COLS = [t.value.col_name for t in Tasks]
|
129 |
+
# NOTE: add the benchmark columns to the COLS since they are dynamically added
|
130 |
+
COLS += BENCHMARK_COLS
|
src/envs.py
CHANGED
@@ -11,8 +11,8 @@ OWNER = "mlcore"
|
|
11 |
# ----------------------------------
|
12 |
|
13 |
REPO_ID = f"{OWNER}/arxiv-classifier-leaderboard"
|
14 |
-
QUEUE_REPO = f"{OWNER}/requests"
|
15 |
-
RESULTS_REPO = f"{OWNER}/results"
|
16 |
|
17 |
# If you setup a cache later, just change HF_HOME
|
18 |
CACHE_PATH=os.getenv("HF_HOME", ".")
|
|
|
11 |
# ----------------------------------
|
12 |
|
13 |
REPO_ID = f"{OWNER}/arxiv-classifier-leaderboard"
|
14 |
+
QUEUE_REPO = f"{OWNER}/arxiv-classifier-leaderboard-requests"
|
15 |
+
RESULTS_REPO = f"{OWNER}/arxiv-classifier-leaderboard-results"
|
16 |
|
17 |
# If you setup a cache later, just change HF_HOME
|
18 |
CACHE_PATH=os.getenv("HF_HOME", ".")
|
src/leaderboard/read_evals.py
CHANGED
@@ -176,6 +176,9 @@ def get_raw_eval_results(results_path: str, requests_path: str) -> list[EvalResu
|
|
176 |
for model_result_filepath in model_result_filepaths:
|
177 |
# Creation of result
|
178 |
eval_result = EvalResult.init_from_json_file(model_result_filepath)
|
|
|
|
|
|
|
179 |
eval_result.update_with_request_file(requests_path)
|
180 |
|
181 |
# Store results of same eval together
|
@@ -187,6 +190,7 @@ def get_raw_eval_results(results_path: str, requests_path: str) -> list[EvalResu
|
|
187 |
|
188 |
results = []
|
189 |
for v in eval_results.values():
|
|
|
190 |
try:
|
191 |
v.to_dict() # we test if the dict version is complete
|
192 |
results.append(v)
|
|
|
176 |
for model_result_filepath in model_result_filepaths:
|
177 |
# Creation of result
|
178 |
eval_result = EvalResult.init_from_json_file(model_result_filepath)
|
179 |
+
# TODO: populate requests repo with request files corresponding to llama3-8b_primary
|
180 |
+
# Current output of `python app.py`:
|
181 |
+
# Could not find request file for None/llama3-8b_primary with precision ?
|
182 |
eval_result.update_with_request_file(requests_path)
|
183 |
|
184 |
# Store results of same eval together
|
|
|
190 |
|
191 |
results = []
|
192 |
for v in eval_results.values():
|
193 |
+
# import pdb; pdb.set_trace()
|
194 |
try:
|
195 |
v.to_dict() # we test if the dict version is complete
|
196 |
results.append(v)
|