ag2435 committed on
Commit
e568bbb
·
1 Parent(s): 5635685

wip added primary category tasks

Browse files
src/about.py CHANGED
@@ -10,10 +10,20 @@ class Task:
10
 
11
  # Select your tasks here
12
  # ---------------------------------------------------
13
- class Tasks(Enum):
14
- # task_key in the json file, metric_key in the json file, name to display in the leaderboard
15
- task0 = Task("anli_r1", "acc", "ANLI")
16
- task1 = Task("logiqa", "acc_norm", "LogiQA")
 
 
 
 
 
 
 
 
 
 
17
 
18
  NUM_FEWSHOT = 0 # Change with your few shot
19
  # ---------------------------------------------------
 
10
 
11
  # Select your tasks here
12
  # ---------------------------------------------------
13
+ if False:
14
+ class Tasks(Enum):
15
+ # task_key in the json file, metric_key in the json file, name to display in the leaderboard
16
+ task0 = Task("anli_r1", "acc", "ANLI")
17
+ task1 = Task("logiqa", "acc_norm", "LogiQA")
18
+ else:
19
+ class Tasks(Enum):
20
+ # task_key in the json file, metric_key in the json file, name to display in the leaderboard
21
+ # NOTE: the task_key must be unique
22
+ task0 = Task("default_primary_subfield_accuracy", "primary_subfield_accuracy", "Accuracy of predicting the released primary category on the default split")
23
+ task1 = Task("default_primary_top_3_tpr", "primary_top_3_tpr", "Occurrence of the released primary category in the top-3 predictions on the default split")
24
+ task2 = Task("default_primary_top_5_tpr", "primary_top_5_tpr", "Occurrence of the released primary category in the top-5 predictions on the default split")
25
+ task3 = Task("default_primary_top_10_tpr", "primary_top_10_tpr", "Occurrence of the released primary category in the top-10 predictions on the default split")
26
+ # task1 = Task("all2023_v2", "acc", "Acc on all 2023 papers")
27
 
28
  NUM_FEWSHOT = 0 # Change with your few shot
29
  # ---------------------------------------------------
src/display/utils.py CHANGED
@@ -1,4 +1,4 @@
1
- from dataclasses import dataclass, make_dataclass
2
  from enum import Enum
3
 
4
  import pandas as pd
@@ -21,27 +21,46 @@ class ColumnContent:
21
  never_hidden: bool = False
22
 
23
  ## Leaderboard columns
24
- auto_eval_column_dict = []
25
- # Init
26
- auto_eval_column_dict.append(["model_type_symbol", ColumnContent, ColumnContent("T", "str", True, never_hidden=True)])
27
- auto_eval_column_dict.append(["model", ColumnContent, ColumnContent("Model", "markdown", True, never_hidden=True)])
28
- #Scores
29
- auto_eval_column_dict.append(["average", ColumnContent, ColumnContent("Average ⬆️", "number", True)])
30
- for task in Tasks:
31
- auto_eval_column_dict.append([task.name, ColumnContent, ColumnContent(task.value.col_name, "number", True)])
32
- # Model information
33
- auto_eval_column_dict.append(["model_type", ColumnContent, ColumnContent("Type", "str", False)])
34
- auto_eval_column_dict.append(["architecture", ColumnContent, ColumnContent("Architecture", "str", False)])
35
- auto_eval_column_dict.append(["weight_type", ColumnContent, ColumnContent("Weight type", "str", False, True)])
36
- auto_eval_column_dict.append(["precision", ColumnContent, ColumnContent("Precision", "str", False)])
37
- auto_eval_column_dict.append(["license", ColumnContent, ColumnContent("Hub License", "str", False)])
38
- auto_eval_column_dict.append(["params", ColumnContent, ColumnContent("#Params (B)", "number", False)])
39
- auto_eval_column_dict.append(["likes", ColumnContent, ColumnContent("Hub ❤️", "number", False)])
40
- auto_eval_column_dict.append(["still_on_hub", ColumnContent, ColumnContent("Available on the hub", "bool", False)])
41
- auto_eval_column_dict.append(["revision", ColumnContent, ColumnContent("Model sha", "str", False, False)])
42
-
43
- # We use make dataclass to dynamically fill the scores from Tasks
44
- AutoEvalColumn = make_dataclass("AutoEvalColumn", auto_eval_column_dict, frozen=True)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
45
 
46
  ## For the queue columns in the submission tab
47
  @dataclass(frozen=True)
@@ -107,4 +126,5 @@ EVAL_COLS = [c.name for c in fields(EvalQueueColumn)]
107
  EVAL_TYPES = [c.type for c in fields(EvalQueueColumn)]
108
 
109
  BENCHMARK_COLS = [t.value.col_name for t in Tasks]
110
-
 
 
1
+ from dataclasses import dataclass, make_dataclass, field
2
  from enum import Enum
3
 
4
  import pandas as pd
 
21
  never_hidden: bool = False
22
 
23
  ## Leaderboard columns
24
+ if False:
25
+ auto_eval_column_dict = []
26
+ # Init
27
+ auto_eval_column_dict.append(["model_type_symbol", ColumnContent, ColumnContent("T", "str", True, never_hidden=True)])
28
+ auto_eval_column_dict.append(["model", ColumnContent, ColumnContent("Model", "markdown", True, never_hidden=True)])
29
+ #Scores
30
+ auto_eval_column_dict.append(["average", ColumnContent, ColumnContent("Average ⬆️", "number", True)])
31
+ for task in Tasks:
32
+ auto_eval_column_dict.append([task.name, ColumnContent, ColumnContent(task.value.col_name, "number", True)])
33
+ # Model information
34
+ auto_eval_column_dict.append(["model_type", ColumnContent, ColumnContent("Type", "str", False)])
35
+ auto_eval_column_dict.append(["architecture", ColumnContent, ColumnContent("Architecture", "str", False)])
36
+ auto_eval_column_dict.append(["weight_type", ColumnContent, ColumnContent("Weight type", "str", False, True)])
37
+ auto_eval_column_dict.append(["precision", ColumnContent, ColumnContent("Precision", "str", False)])
38
+ auto_eval_column_dict.append(["license", ColumnContent, ColumnContent("Hub License", "str", False)])
39
+ auto_eval_column_dict.append(["params", ColumnContent, ColumnContent("#Params (B)", "number", False)])
40
+ auto_eval_column_dict.append(["likes", ColumnContent, ColumnContent("Hub ❤️", "number", False)])
41
+ auto_eval_column_dict.append(["still_on_hub", ColumnContent, ColumnContent("Available on the hub", "bool", False)])
42
+ auto_eval_column_dict.append(["revision", ColumnContent, ColumnContent("Model sha", "str", False, False)])
43
+ # We use make dataclass to dynamically fill the scores from Tasks
44
+ AutoEvalColumn = make_dataclass("AutoEvalColumn", auto_eval_column_dict, frozen=True)
45
+
46
+ @dataclass(frozen=True)
47
+ class AutoEvalColumn:
48
+ model_type_symbol = ColumnContent("T", "str", True, never_hidden=True)
49
+ model = ColumnContent("Model", "markdown", True, never_hidden=True)
50
+ average = ColumnContent("Average ⬆️", "number", True)
51
+ model_type = ColumnContent("Type", "str", False)
52
+ architecture = ColumnContent("Architecture", "str", False)
53
+ weight_type = ColumnContent("Weight type", "str", False, True)
54
+ precision = ColumnContent("Precision", "str", False)
55
+ license = ColumnContent("Hub License", "str", False)
56
+ params = ColumnContent("#Params (B)", "number", False)
57
+ likes = ColumnContent("Hub ❤️", "number", False)
58
+ still_on_hub = ColumnContent("Available on the hub", "bool", False)
59
+ revision = ColumnContent("Model sha", "str", False, False)
60
+ # Dynamically add task columns
61
+ def __init__(self):
62
+ for task in Tasks:
63
+ setattr(self.__class__, task.name, ColumnContent(task.value.col_name, "number", True))
64
 
65
  ## For the queue columns in the submission tab
66
  @dataclass(frozen=True)
 
126
  EVAL_TYPES = [c.type for c in fields(EvalQueueColumn)]
127
 
128
  BENCHMARK_COLS = [t.value.col_name for t in Tasks]
129
+ # NOTE: add the benchmark columns to the COLS since they are dynamically added
130
+ COLS += BENCHMARK_COLS
src/envs.py CHANGED
@@ -11,8 +11,8 @@ OWNER = "mlcore"
11
  # ----------------------------------
12
 
13
  REPO_ID = f"{OWNER}/arxiv-classifier-leaderboard"
14
- QUEUE_REPO = f"{OWNER}/requests"
15
- RESULTS_REPO = f"{OWNER}/results"
16
 
17
  # If you setup a cache later, just change HF_HOME
18
  CACHE_PATH=os.getenv("HF_HOME", ".")
 
11
  # ----------------------------------
12
 
13
  REPO_ID = f"{OWNER}/arxiv-classifier-leaderboard"
14
+ QUEUE_REPO = f"{OWNER}/arxiv-classifier-leaderboard-requests"
15
+ RESULTS_REPO = f"{OWNER}/arxiv-classifier-leaderboard-results"
16
 
17
  # If you setup a cache later, just change HF_HOME
18
  CACHE_PATH=os.getenv("HF_HOME", ".")
src/leaderboard/read_evals.py CHANGED
@@ -176,6 +176,9 @@ def get_raw_eval_results(results_path: str, requests_path: str) -> list[EvalResu
176
  for model_result_filepath in model_result_filepaths:
177
  # Creation of result
178
  eval_result = EvalResult.init_from_json_file(model_result_filepath)
 
 
 
179
  eval_result.update_with_request_file(requests_path)
180
 
181
  # Store results of same eval together
@@ -187,6 +190,7 @@ def get_raw_eval_results(results_path: str, requests_path: str) -> list[EvalResu
187
 
188
  results = []
189
  for v in eval_results.values():
 
190
  try:
191
  v.to_dict() # we test if the dict version is complete
192
  results.append(v)
 
176
  for model_result_filepath in model_result_filepaths:
177
  # Creation of result
178
  eval_result = EvalResult.init_from_json_file(model_result_filepath)
179
+ # TODO: populate requests repo with request files corresponding to llama3-8b_primary
180
+ # Current output of `python app.py`:
181
+ # Could not find request file for None/llama3-8b_primary with precision ?
182
  eval_result.update_with_request_file(requests_path)
183
 
184
  # Store results of same eval together
 
190
 
191
  results = []
192
  for v in eval_results.values():
193
+ # import pdb; pdb.set_trace()
194
  try:
195
  v.to_dict() # we test if the dict version is complete
196
  results.append(v)