Commit 78301f0
Parent(s): 3b7b492
solved group and singular problem hierarchically

Files changed:
- app.py +3 -3
- src/leaderboard/read_evals.py +114 -3
- src/populate.py +55 -2
app.py CHANGED

@@ -29,7 +29,7 @@ from src.display.utils import (
     AutoEvalColumnGroup,
 )
 from src.envs import API, EVAL_REQUESTS_PATH, EVAL_RESULTS_PATH, TOKEN, QUEUE_REPO, REPO_ID, RESULTS_REPO, EVAL_RESULTS_GROUP_PATH, RESULTS_GROUP_REPO
-from src.populate import get_evaluation_queue_df, get_leaderboard_df
+from src.populate import get_evaluation_queue_df, get_leaderboard_df, get_evaluation_queue_df_group, get_leaderboard_group_df
 from src.submission.submit import add_new_eval


@@ -65,7 +65,7 @@ except Exception:


 raw_data, original_df = get_leaderboard_df(EVAL_RESULTS_PATH, COLS, BENCHMARK_COLS)
-raw_data_grouped, original_df_grouped =
+raw_data_grouped, original_df_grouped = get_leaderboard_group_df(EVAL_RESULTS_GROUP_PATH, COLS_GROUP, BENCHMARK_COLS_GROUP)

 leaderboard_grouped_df = original_df_grouped.copy()
 leaderboard_df = original_df.copy()

@@ -81,7 +81,7 @@ leaderboard_df = original_df.copy()
     finished_eval_queue_g_df,
     running_eval_queue_g_df,
     pending_eval_queue_g_df,
-) =
+) = get_evaluation_queue_df_group(EVAL_REQUESTS_PATH, EVAL_COLS_GROUP)

 # Searching and filtering
 def update_table(
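Note: the new app.py lines reference COLS_GROUP, BENCHMARK_COLS_GROUP and EVAL_COLS_GROUP, which are not defined anywhere in this diff. Below is a minimal, self-contained sketch of the shape such column lists could take, assuming they mirror the existing COLS / BENCHMARK_COLS / EVAL_COLS pattern next to AutoEvalColumnGroup in src/display/utils.py; every class name, column name and group name in the sketch is an illustrative stand-in, not the repository's actual definition.

from dataclasses import dataclass, fields


# Hypothetical stand-ins (not part of this commit) for the grouped column
# definitions that app.py expects from src/display/utils.py.
@dataclass(frozen=True)
class ColumnContent:
    name: str
    type: str
    displayed_by_default: bool = True
    hidden: bool = False


@dataclass(frozen=True)
class AutoEvalColumnGroupSketch:
    model: ColumnContent = ColumnContent("Model", "markdown")
    average: ColumnContent = ColumnContent("Average", "number")
    knowledge: ColumnContent = ColumnContent("Knowledge", "number")  # one column per group
    reasoning: ColumnContent = ColumnContent("Reasoning", "number")
    dummy: ColumnContent = ColumnContent("model_name_for_query", "str", False, True)


@dataclass(frozen=True)
class EvalQueueColumnGroupSketch:
    model: ColumnContent = ColumnContent("model", "markdown")
    status: ColumnContent = ColumnContent("status", "str")


# The lists app.py passes to get_leaderboard_group_df / get_evaluation_queue_df_group.
COLS_GROUP = [f.default.name for f in fields(AutoEvalColumnGroupSketch) if not f.default.hidden]
BENCHMARK_COLS_GROUP = ["Knowledge", "Reasoning"]
EVAL_COLS_GROUP = [f.default.name for f in fields(EvalQueueColumnGroupSketch)]

In the real Space these lists would presumably be imported from src.display.utils alongside AutoEvalColumnGroup, as the import hunk above suggests.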
src/leaderboard/read_evals.py CHANGED

@@ -8,7 +8,7 @@ import dateutil
 import numpy as np

 from src.display.formatting import make_clickable_model
-from src.display.utils import AutoEvalColumn, Tasks
+from src.display.utils import AutoEvalColumn, Tasks, Groups

 @dataclass
 class EvalResult:
@@ -40,7 +40,6 @@ class EvalResult:
         # Extract results available in this file (some results are split in several files)
         results = {}
         for task in Tasks:
-            task = task.value

             # We average all scores of a given metric (not all metrics are present in all files)
             accs = np.array([v.get(task.metric, None) for k, v in data["results"].items() if task.benchmark == k])
@@ -72,11 +71,85 @@ class EvalResult:
         }

         for task in Tasks:
-            data_dict[task.
+            data_dict[task.col_name] = self.results[task.benchmark]

         return data_dict


+
+
+@dataclass
+class EvalResultGroup:
+    eval_name: str  # org_model_date (uid)
+    full_model: str  # org/model (path on hub)
+    org: str
+    model: str
+    results: dict
+    date: str = ""  # submission date of request file
+
+    @classmethod
+    def init_from_json_file(self, json_filepath):
+        """Inits the result from the specific model result file"""
+        with open(json_filepath) as fp:
+            data = json.load(fp)
+
+        config = data.get("config")
+
+        # Get model and org
+        org_and_model = config.get("model_name", None)
+        org_and_model = org_and_model.split("/", 1)
+
+        org = org_and_model[0]
+        model = org_and_model[1]
+        date = config.get("submitted_time", None)
+        result_key = f"{org}_{model}_{date}"
+        full_model = "/".join(org_and_model)
+
+        # Extract results available in this file (some results are split in several files)
+        results = {}
+        for task in Groups:
+
+            # We average all scores of a given metric (not all metrics are present in all files)
+            accs = np.array([v.get(task.metric, None) for k, v in data["results"].items() if task.benchmark == k])
+            if accs.size == 0 or any([acc is None for acc in accs]):
+                continue
+
+            mean_acc = np.mean(accs) * 100.0
+            results[task.benchmark] = mean_acc
+
+        return self(
+            eval_name=result_key,
+            full_model=full_model,
+            org=org,
+            model=model,
+            results=results,
+            date=date
+        )
+
+    def to_dict(self):
+        """Converts the Eval Result to a dict compatible with our dataframe display"""
+        average = sum([v for v in self.results.values() if v is not None]) / len(Groups)
+        data_dict = {
+            "eval_name": self.eval_name,  # not a column, just a save name,
+            AutoEvalColumn.model_submission_date.name: self.date,
+            AutoEvalColumn.model.name: make_clickable_model(self.full_model),
+            AutoEvalColumn.dummy.name: self.full_model,
+            AutoEvalColumn.average.name: average,
+        }
+
+        for task in Groups:
+            data_dict[task.col_name] = self.results[task.benchmark]
+
+        return data_dict
+
+
 def get_raw_eval_results(results_path: str) -> list[EvalResult]:
     """From the path of the results folder root, extract all needed info for results"""
     model_result_filepaths = []
@@ -110,3 +183,41 @@ def get_raw_eval_results(results_path: str) -> list[EvalResult]:
             continue

     return results
+
+
+def get_group_eval_results(results_path: str) -> list[EvalResultGroup]:
+    """From the path of the results folder root, extract all needed info for results"""
+    model_result_filepaths = []
+
+    for root, _, files in os.walk(results_path):
+        # We should only have json files in model results
+        if len(files) == 0 or any([not f.endswith(".json") for f in files]):
+            continue
+
+        # Sort the files by date
+        files.sort(key=lambda x: x.removesuffix(".json").removeprefix("results_")[:-7])
+
+        for file in files:
+            model_result_filepaths.append(os.path.join(root, file))
+
+    eval_results = {}
+    for model_result_filepath in model_result_filepaths:
+        # Creation of result
+        eval_result = EvalResultGroup.init_from_json_file(model_result_filepath)
+
+        # Store results of same eval together
+        eval_name = eval_result.eval_name
+        eval_results[eval_name] = eval_result
+
+    results = []
+    for v in eval_results.values():
+        try:
+            v.to_dict()  # we test if the dict version is complete
+            results.append(v)
+        except KeyError:  # not all eval values present
+            continue
+
+    return results
+
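Note: EvalResultGroup iterates `for task in Groups:` and reads `task.benchmark`, `task.metric` and `task.col_name`, but Groups itself is only imported here, not defined in this diff. A minimal sketch of a Groups definition that would satisfy that access pattern, assuming it mirrors the Tasks enum in src/display/utils.py; the group names and metric below are placeholders, not the leaderboard's real groups.

from dataclasses import dataclass
from enum import Enum


# Hypothetical sketch, not part of this commit: one possible shape for Groups.
@dataclass(frozen=True)
class Group:
    benchmark: str  # key under "results" in the result JSON
    metric: str     # metric name inside that results entry
    col_name: str   # column header shown on the grouped leaderboard


class Groups(Enum):
    # Placeholder group definitions.
    knowledge = Group("knowledge_group", "acc", "Knowledge")
    reasoning = Group("reasoning_group", "acc", "Reasoning")

    # Convenience accessors so `for task in Groups: task.metric` works on the
    # enum member directly, matching how EvalResultGroup iterates Groups.
    @property
    def benchmark(self) -> str:
        return self.value.benchmark

    @property
    def metric(self) -> str:
        return self.value.metric

    @property
    def col_name(self) -> str:
        return self.value.col_name


if __name__ == "__main__":
    for task in Groups:
        print(task.col_name, task.benchmark, task.metric)

Alternatively, Groups could be a plain list of Group instances; the new code only needs iteration, len(), and those three attributes.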
src/populate.py CHANGED

@@ -4,8 +4,8 @@ import os
 import pandas as pd

 from src.display.formatting import has_no_nan_values, make_clickable_model
-from src.display.utils import AutoEvalColumn, EvalQueueColumn
-from src.leaderboard.read_evals import get_raw_eval_results
+from src.display.utils import AutoEvalColumn, EvalQueueColumn, AutoEvalColumnGroup, EvalQueueColumnGroup
+from src.leaderboard.read_evals import get_raw_eval_results, get_group_eval_results


 def get_leaderboard_df(results_path: str, cols: list, benchmark_cols: list) -> pd.DataFrame:
@@ -21,6 +21,21 @@ def get_leaderboard_df(results_path: str, cols: list, benchmark_cols: list) -> pd.DataFrame:
     return raw_data, df


+def get_leaderboard_group_df(results_path: str, cols: list, benchmark_cols: list) -> pd.DataFrame:
+    raw_data = get_group_eval_results(results_path)
+    all_data_json = [v.to_dict() for v in raw_data]
+
+    df = pd.DataFrame.from_records(all_data_json)
+    df = df.sort_values(by=[AutoEvalColumnGroup.average.name], ascending=False)
+    df = df[cols].round(decimals=2)
+
+    # filter out if any of the benchmarks have not been produced
+    df = df[has_no_nan_values(df, benchmark_cols)]
+    return raw_data, df
+
+
 def get_evaluation_queue_df(save_path: str, cols: list) -> list[pd.DataFrame]:
     entries = [entry for entry in os.listdir(save_path) if not entry.startswith(".")]
     all_evals = []
@@ -54,3 +69,41 @@ def get_evaluation_queue_df(save_path: str, cols: list) -> list[pd.DataFrame]:
     df_finished = pd.DataFrame.from_records(finished_list, columns=cols)

     return df_finished[cols], df_running[cols], df_pending[cols]
+
+
+def get_evaluation_queue_df_group(save_path: str, cols: list) -> list[pd.DataFrame]:
+    entries = [entry for entry in os.listdir(save_path) if not entry.startswith(".")]
+    all_evals = []
+
+    for entry in entries:
+        if ".json" in entry:
+            file_path = os.path.join(save_path, entry)
+            with open(file_path) as fp:
+                data = json.load(fp)
+
+            data[EvalQueueColumnGroup.model.name] = make_clickable_model(data["model"])
+
+            all_evals.append(data)
+        elif ".md" not in entry:
+            # this is a folder
+            sub_entries = [e for e in os.listdir(f"{save_path}/{entry}") if not e.startswith(".")]
+            for sub_entry in sub_entries:
+                file_path = os.path.join(save_path, entry, sub_entry)
+                with open(file_path) as fp:
+                    data = json.load(fp)
+
+                data[EvalQueueColumnGroup.model.name] = make_clickable_model(data["model"])
+                all_evals.append(data)
+
+    pending_list = [e for e in all_evals if e["status"] == "PENDING"]
+    running_list = [e for e in all_evals if e["status"] == "RUNNING"]
+    finished_list = [e for e in all_evals if e["status"] == "FINISHED"]
+
+    df_pending = pd.DataFrame.from_records(pending_list, columns=cols)
+    df_running = pd.DataFrame.from_records(running_list, columns=cols)
+    df_finished = pd.DataFrame.from_records(finished_list, columns=cols)
+
+    return df_finished[cols], df_running[cols], df_pending[cols]
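Note: a possible local smoke test for the new grouped results pipeline. Nothing below is part of this commit; it assumes the repository root is on PYTHONPATH, and the "knowledge_group" / "reasoning_group" keys are placeholders that would have to match the Groups.benchmark values actually defined in src/display/utils.py, since entries whose results do not cover every group are dropped by the KeyError check in get_group_eval_results. The JSON layout follows what EvalResultGroup.init_from_json_file reads: config.model_name, config.submitted_time, and a results dict keyed by group benchmark with the metric name inside each entry.

import json
import os
import tempfile

from src.leaderboard.read_evals import get_group_eval_results

with tempfile.TemporaryDirectory() as results_dir:
    # One result file per model, nested under org/model as get_group_eval_results walks it.
    model_dir = os.path.join(results_dir, "some-org", "some-model")
    os.makedirs(model_dir)

    result = {
        "config": {
            "model_name": "some-org/some-model",
            "submitted_time": "2024-01-01T00:00:00Z",
        },
        # Keys must match Group.benchmark; values must contain Group.metric.
        "results": {
            "knowledge_group": {"acc": 0.61},
            "reasoning_group": {"acc": 0.47},
        },
    }
    with open(os.path.join(model_dir, "results_2024-01-01T00-00-00.json"), "w") as fp:
        json.dump(result, fp)

    for eval_result in get_group_eval_results(results_dir):
        print(eval_result.eval_name, eval_result.to_dict())

get_leaderboard_group_df(results_dir, COLS_GROUP, BENCHMARK_COLS_GROUP) would then turn the same folder into the grouped dataframe that app.py displays, with one column per group and the average sorted descending.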