MirakramAghalarov committed
Commit 78301f0 · 1 Parent(s): 3b7b492

solved group and singular problem hierarchically

Files changed (3)
  1. app.py +3 -3
  2. src/leaderboard/read_evals.py +114 -3
  3. src/populate.py +55 -2
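Note on what the diff assumes: the grouped code paths added below iterate over a Groups enum (imported from src.display.utils alongside Tasks) and read .benchmark, .metric, and .col_name directly from each member, with AutoEvalColumnGroup / EvalQueueColumnGroup mirroring AutoEvalColumn / EvalQueueColumn for the grouped tables. None of those definitions appear in this diff, so the following is only a minimal sketch of the shape they are assumed to take (member names and values are hypothetical), using a dataclass-mixin enum so that direct attribute access works without the `task = task.value` indirection this commit removes:

from dataclasses import dataclass
from enum import Enum


@dataclass
class TaskInfo:
    benchmark: str  # key expected under "results" in a result JSON file
    metric: str     # metric name looked up inside that entry
    col_name: str   # column title shown on the leaderboard


class Groups(TaskInfo, Enum):
    # Hypothetical members; the real ones live in src/display/utils.py.
    # Because TaskInfo is mixed in, `for task in Groups: task.metric` works directly.
    KNOWLEDGE = "knowledge_group", "acc", "Knowledge"
    REASONING = "reasoning_group", "acc", "Reasoning"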
app.py CHANGED
@@ -29,7 +29,7 @@ from src.display.utils import (
     AutoEvalColumnGroup,
 )
 from src.envs import API, EVAL_REQUESTS_PATH, EVAL_RESULTS_PATH, TOKEN, QUEUE_REPO, REPO_ID, RESULTS_REPO, EVAL_RESULTS_GROUP_PATH, RESULTS_GROUP_REPO
-from src.populate import get_evaluation_queue_df, get_leaderboard_df
+from src.populate import get_evaluation_queue_df, get_leaderboard_df, get_evaluation_queue_df_group, get_leaderboard_group_df
 from src.submission.submit import add_new_eval
 
 
@@ -65,7 +65,7 @@ except Exception:
 
 
 raw_data, original_df = get_leaderboard_df(EVAL_RESULTS_PATH, COLS, BENCHMARK_COLS)
-raw_data_grouped, original_df_grouped = get_leaderboard_df(EVAL_RESULTS_GROUP_PATH, COLS_GROUP, BENCHMARK_COLS_GROUP)
+raw_data_grouped, original_df_grouped = get_leaderboard_group_df(EVAL_RESULTS_GROUP_PATH, COLS_GROUP, BENCHMARK_COLS_GROUP)
 
 leaderboard_grouped_df = original_df_grouped.copy()
 leaderboard_df = original_df.copy()
@@ -81,7 +81,7 @@ leaderboard_df = original_df.copy()
     finished_eval_queue_g_df,
     running_eval_queue_g_df,
     pending_eval_queue_g_df,
-) = get_evaluation_queue_df(EVAL_REQUESTS_PATH, EVAL_COLS_GROUP)
+) = get_evaluation_queue_df_group(EVAL_REQUESTS_PATH, EVAL_COLS_GROUP)
 
 # Searching and filtering
 def update_table(
src/leaderboard/read_evals.py CHANGED
@@ -8,7 +8,7 @@ import dateutil
 import numpy as np
 
 from src.display.formatting import make_clickable_model
-from src.display.utils import AutoEvalColumn, Tasks
+from src.display.utils import AutoEvalColumn, Tasks, Groups
 
 @dataclass
 class EvalResult:
@@ -40,7 +40,6 @@ class EvalResult:
         # Extract results available in this file (some results are split in several files)
         results = {}
         for task in Tasks:
-            task = task.value
 
             # We average all scores of a given metric (not all metrics are present in all files)
             accs = np.array([v.get(task.metric, None) for k, v in data["results"].items() if task.benchmark == k])
@@ -72,11 +71,85 @@ class EvalResult:
         }
 
         for task in Tasks:
-            data_dict[task.value.col_name] = self.results[task.value.benchmark]
+            data_dict[task.col_name] = self.results[task.benchmark]
 
         return data_dict
 
 
+
+
+@dataclass
+class EvalResultGroup:
+    eval_name: str  # org_model_date (uid)
+    full_model: str  # org/model (path on hub)
+    org: str
+    model: str
+    results: dict
+    date: str = ""  # submission date of request file
+
+    @classmethod
+    def init_from_json_file(self, json_filepath):
+        """Inits the result from the specific model result file"""
+        with open(json_filepath) as fp:
+            data = json.load(fp)
+
+        config = data.get("config")
+
+        # Get model and org
+        org_and_model = config.get("model_name", None)
+        org_and_model = org_and_model.split("/", 1)
+
+        org = org_and_model[0]
+        model = org_and_model[1]
+        date = config.get("submitted_time", None)
+        result_key = f"{org}_{model}_{date}"
+        full_model = "/".join(org_and_model)
+
+        # Extract results available in this file (some results are split in several files)
+        results = {}
+        for task in Groups:
+
+            # We average all scores of a given metric (not all metrics are present in all files)
+            accs = np.array([v.get(task.metric, None) for k, v in data["results"].items() if task.benchmark == k])
+            if accs.size == 0 or any([acc is None for acc in accs]):
+                continue
+
+            mean_acc = np.mean(accs) * 100.0
+            results[task.benchmark] = mean_acc
+
+        return self(
+            eval_name=result_key,
+            full_model=full_model,
+            org=org,
+            model=model,
+            results=results,
+            date=date
+        )
+
+
+    def to_dict(self):
+        """Converts the Eval Result to a dict compatible with our dataframe display"""
+        average = sum([v for v in self.results.values() if v is not None]) / len(Groups)
+        data_dict = {
+            "eval_name": self.eval_name,  # not a column, just a save name,
+            AutoEvalColumn.model_submission_date.name: self.date,
+            AutoEvalColumn.model.name: make_clickable_model(self.full_model),
+            AutoEvalColumn.dummy.name: self.full_model,
+            AutoEvalColumn.average.name: average,
+        }
+
+        for task in Groups:
+            data_dict[task.col_name] = self.results[task.benchmark]
+
+        return data_dict
+
+
+
+
+
+
+
+
+
 def get_raw_eval_results(results_path: str) -> list[EvalResult]:
     """From the path of the results folder root, extract all needed info for results"""
     model_result_filepaths = []
@@ -110,3 +183,41 @@ def get_raw_eval_results(results_path: str) -> list[EvalResult]:
             continue
 
     return results
+
+
+
+
+def get_group_eval_results(results_path: str) -> list[EvalResultGroup]:
+    """From the path of the results folder root, extract all needed info for results"""
+    model_result_filepaths = []
+
+    for root, _, files in os.walk(results_path):
+        # We should only have json files in model results
+        if len(files) == 0 or any([not f.endswith(".json") for f in files]):
+            continue
+
+        # Sort the files by date
+        files.sort(key=lambda x: x.removesuffix(".json").removeprefix("results_")[:-7])
+
+        for file in files:
+            model_result_filepaths.append(os.path.join(root, file))
+
+    eval_results = {}
+    for model_result_filepath in model_result_filepaths:
+        # Creation of result
+        eval_result = EvalResultGroup.init_from_json_file(model_result_filepath)
+
+        # Store results of same eval together
+        eval_name = eval_result.eval_name
+        eval_results[eval_name] = eval_result
+
+    results = []
+    for v in eval_results.values():
+        try:
+            v.to_dict()  # we test if the dict version is complete
+            results.append(v)
+        except KeyError:  # not all eval values present
+            continue
+
+    return results
+
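For context, EvalResultGroup.init_from_json_file only reads config.model_name, config.submitted_time, and one entry under "results" per Groups member, keyed by that member's benchmark and holding its metric. A result file that would pass the completeness check in get_group_eval_results could look like the sketch below (every name, key, and number is hypothetical):

# Hypothetical content of one JSON file under EVAL_RESULTS_GROUP_PATH.
# Every Groups member's .benchmark must appear under "results" with its .metric,
# otherwise to_dict() raises KeyError and get_group_eval_results drops the entry.
example_group_result = {
    "config": {
        "model_name": "some-org/some-model",       # hypothetical hub id -> org, model, full_model
        "submitted_time": "2024-01-01T00:00:00Z",  # becomes the submission date and uid suffix
    },
    "results": {
        "knowledge_group": {"acc": 0.42},  # hypothetical group key and metric
        "reasoning_group": {"acc": 0.37},
    },
}

get_group_eval_results(EVAL_RESULTS_GROUP_PATH) would then yield one EvalResultGroup per model, deduplicated by its org_model_date key.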
src/populate.py CHANGED
@@ -4,8 +4,8 @@ import os
 import pandas as pd
 
 from src.display.formatting import has_no_nan_values, make_clickable_model
-from src.display.utils import AutoEvalColumn, EvalQueueColumn
-from src.leaderboard.read_evals import get_raw_eval_results
+from src.display.utils import AutoEvalColumn, EvalQueueColumn, AutoEvalColumnGroup, EvalQueueColumnGroup
+from src.leaderboard.read_evals import get_raw_eval_results, get_group_eval_results
 
 
 def get_leaderboard_df(results_path: str, cols: list, benchmark_cols: list) -> pd.DataFrame:
@@ -21,6 +21,21 @@ def get_leaderboard_df(results_path: str, cols: list, benchmark_cols: list) -> pd.DataFrame:
     return raw_data, df
 
 
+def get_leaderboard_group_df(results_path: str, cols: list, benchmark_cols: list) -> pd.DataFrame:
+    raw_data = get_group_eval_results(results_path)
+    all_data_json = [v.to_dict() for v in raw_data]
+
+    df = pd.DataFrame.from_records(all_data_json)
+    df = df.sort_values(by=[AutoEvalColumnGroup.average.name], ascending=False)
+    df = df[cols].round(decimals=2)
+
+    # filter out if any of the benchmarks have not been produced
+    df = df[has_no_nan_values(df, benchmark_cols)]
+    return raw_data, df
+
+
+
+
 def get_evaluation_queue_df(save_path: str, cols: list) -> list[pd.DataFrame]:
     entries = [entry for entry in os.listdir(save_path) if not entry.startswith(".")]
     all_evals = []
@@ -54,3 +69,41 @@ def get_evaluation_queue_df(save_path: str, cols: list) -> list[pd.DataFrame]:
     df_finished = pd.DataFrame.from_records(finished_list, columns=cols)
 
     return df_finished[cols], df_running[cols], df_pending[cols]
+
+
+
+
+
+def get_evaluation_queue_df_group(save_path: str, cols: list) -> list[pd.DataFrame]:
+    entries = [entry for entry in os.listdir(save_path) if not entry.startswith(".")]
+    all_evals = []
+
+    for entry in entries:
+        if ".json" in entry:
+            file_path = os.path.join(save_path, entry)
+            with open(file_path) as fp:
+                data = json.load(fp)
+
+            data[EvalQueueColumnGroup.model.name] = make_clickable_model(data["model"])
+
+            all_evals.append(data)
+        elif ".md" not in entry:
+            # this is a folder
+            sub_entries = [e for e in os.listdir(f"{save_path}/{entry}") if not e.startswith(".")]
+            for sub_entry in sub_entries:
+                file_path = os.path.join(save_path, entry, sub_entry)
+                with open(file_path) as fp:
+                    data = json.load(fp)
+
+                data[EvalQueueColumnGroup.model.name] = make_clickable_model(data["model"])
+                all_evals.append(data)
+
+    pending_list = [e for e in all_evals if e["status"] == "PENDING"]
+    running_list = [e for e in all_evals if e["status"] == "RUNNING"]
+    finished_list = [e for e in all_evals if e["status"] == "FINISHED"]
+
+    df_pending = pd.DataFrame.from_records(pending_list, columns=cols)
+    df_running = pd.DataFrame.from_records(running_list, columns=cols)
+    df_finished = pd.DataFrame.from_records(finished_list, columns=cols)
+
+    return df_finished[cols], df_running[cols], df_pending[cols]
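On the queue side, get_evaluation_queue_df_group mirrors get_evaluation_queue_df: each request JSON under EVAL_REQUESTS_PATH (or one of its sub-folders) needs at least a "model" field, a "status" of PENDING, RUNNING, or FINISHED, and whatever other columns EVAL_COLS_GROUP selects. A hypothetical request entry, just to make the expected shape concrete:

# Hypothetical eval-request entry as consumed by get_evaluation_queue_df_group.
# "model" is rewritten into a clickable link under EvalQueueColumnGroup.model.name;
# "status" routes the row into the pending / running / finished dataframes.
example_group_request = {
    "model": "some-org/some-model",  # hypothetical hub id
    "status": "PENDING",             # one of PENDING, RUNNING, FINISHED
    # plus any other fields listed in EVAL_COLS_GROUP
}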