haiengchuihaian committed on
Commit 99ec88b · 1 Parent(s): 4ea4ae9

leaderboard pipeline

app.py CHANGED
@@ -33,20 +33,20 @@ from src.submission.submit import add_new_eval, upload_file
33
  def restart_space():
34
  API.restart_space(repo_id=REPO_ID, token=TOKEN)
35
 
36
- try:
37
- print(EVAL_REQUESTS_PATH)
38
- snapshot_download(
39
- repo_id=QUEUE_REPO, local_dir=EVAL_REQUESTS_PATH, repo_type="dataset", tqdm_class=None, etag_timeout=30
40
- )
41
- except Exception:
42
- restart_space()
43
- try:
44
- print(EVAL_RESULTS_PATH)
45
- snapshot_download(
46
- repo_id=RESULTS_REPO, local_dir=EVAL_RESULTS_PATH, repo_type="dataset", tqdm_class=None, etag_timeout=30
47
- )
48
- except Exception:
49
- restart_space()
50
 
51
 
52
  raw_data, original_df = get_leaderboard_df(EVAL_RESULTS_PATH, EVAL_REQUESTS_PATH, COLS, BENCHMARK_COLS)
@@ -56,86 +56,7 @@ value=[ c.name for c in fields(AutoEvalColumn)
56
 
57
  leaderboard_df = original_df.copy()
58
 
59
- # (
60
- # finished_eval_queue_df,
61
- # running_eval_queue_df,
62
- # pending_eval_queue_df,
63
- # ) = get_evaluation_queue_df(EVAL_REQUESTS_PATH, EVAL_COLS)
64
 
65
-
66
- # Searching and filtering
67
- # def update_table(
68
- # hidden_df: pd.DataFrame,
69
- # columns: list,
70
- # type_query: list,
71
- # precision_query: str,
72
- # size_query: list,
73
- # show_deleted: bool,
74
- # query: str,
75
- # ):
76
- # filtered_df = filter_models(hidden_df, type_query, size_query, precision_query, show_deleted)
77
- # filtered_df = filter_queries(query, filtered_df)
78
- # df = select_columns(filtered_df, columns)
79
- # return df
80
-
81
-
82
- # def search_table(df: pd.DataFrame, query: str) -> pd.DataFrame:
83
- # return df[(df[AutoEvalColumn.dummy.name].str.contains(query, case=False))]
84
-
85
-
86
- # def select_columns(df: pd.DataFrame, columns: list) -> pd.DataFrame:
87
- # always_here_cols = [
88
- # AutoEvalColumn.model_type_symbol.name,
89
- # AutoEvalColumn.model.name,
90
- # ]
91
- # # We use COLS to maintain sorting
92
- # filtered_df = df[
93
- # always_here_cols + [c for c in COLS if c in df.columns and c in columns] + [AutoEvalColumn.dummy.name]
94
- # ]
95
- # return filtered_df
96
-
97
-
98
- # def filter_queries(query: str, filtered_df: pd.DataFrame) -> pd.DataFrame:
99
- # final_df = []
100
- # if query != "":
101
- # queries = [q.strip() for q in query.split(";")]
102
- # for _q in queries:
103
- # _q = _q.strip()
104
- # if _q != "":
105
- # temp_filtered_df = search_table(filtered_df, _q)
106
- # if len(temp_filtered_df) > 0:
107
- # final_df.append(temp_filtered_df)
108
- # if len(final_df) > 0:
109
- # filtered_df = pd.concat(final_df)
110
- # filtered_df = filtered_df.drop_duplicates(
111
- # subset=[AutoEvalColumn.model.name, AutoEvalColumn.precision.name, AutoEvalColumn.revision.name]
112
- # )
113
-
114
- # return filtered_df
115
-
116
-
117
- # def filter_models(
118
- # df: pd.DataFrame, type_query: list, size_query: list, precision_query: list, show_deleted: bool
119
- # ) -> pd.DataFrame:
120
- # # Show all models
121
- # if show_deleted:
122
- # filtered_df = df
123
- # else: # Show only still on the hub models
124
- # filtered_df = df[df[AutoEvalColumn.still_on_hub.name] == True]
125
-
126
- # type_emoji = [t[0] for t in type_query]
127
- # filtered_df = filtered_df.loc[df[AutoEvalColumn.model_type_symbol.name].isin(type_emoji)]
128
- # filtered_df = filtered_df.loc[df[AutoEvalColumn.precision.name].isin(precision_query + ["None"])]
129
-
130
- # numeric_interval = pd.IntervalIndex(sorted([NUMERIC_INTERVALS[s] for s in size_query]))
131
- # params_column = pd.to_numeric(df[AutoEvalColumn.params.name], errors="coerce")
132
- # mask = params_column.apply(lambda x: any(numeric_interval.contains(x)))
133
- # filtered_df = filtered_df.loc[mask]
134
-
135
- # return filtered_df
136
-
137
-
138
- # print([c.name for c in fields(AutoEvalColumn) if c.never_hidden])
139
  demo = gr.Blocks(css=custom_css)
140
  with demo:
141
  gr.HTML(TITLE)
@@ -143,58 +64,7 @@ with demo:
143
 
144
  with gr.Tabs(elem_classes="tab-buttons") as tabs:
145
  with gr.TabItem("🏅 LLM Benchmark", elem_id="llm-benchmark-tab-table", id=0):
146
- # with gr.Row():
147
- # with gr.Column():
148
- # with gr.Row():
149
- # search_bar = gr.Textbox(
150
- # placeholder=" 🔍 Search for your model (separate multiple queries with `;`) and press ENTER...",
151
- # show_label=False,
152
- # elem_id="search-bar",
153
- # )
154
- # with gr.Row():
155
- # shown_columns = gr.CheckboxGroup(
156
- # choices=[
157
- # c.name
158
- # for c in fields(AutoEvalColumn)
159
- # if not c.hidden and not c.never_hidden and not c.dummy
160
- # ],
161
- # value=[
162
- # c.name
163
- # for c in fields(AutoEvalColumn)
164
- # if c.displayed_by_default and not c.hidden and not c.never_hidden
165
- # ],
166
- # label="Select columns to show",
167
- # elem_id="column-select",
168
- # interactive=True,
169
- # )
170
- # with gr.Row():
171
- # deleted_models_visibility = gr.Checkbox(
172
- # value=False, label="Show gated/private/deleted models", interactive=True
173
- # )
174
- # with gr.Column(min_width=320):
175
- # #with gr.Box(elem_id="box-filter"):
176
- # filter_columns_type = gr.CheckboxGroup(
177
- # label="Model types",
178
- # choices=[t.to_str() for t in ModelType],
179
- # value=[t.to_str() for t in ModelType],
180
- # interactive=True,
181
- # elem_id="filter-columns-type",
182
- # )
183
- # filter_columns_precision = gr.CheckboxGroup(
184
- # label="Precision",
185
- # choices=[i.value.name for i in Precision],
186
- # value=[i.value.name for i in Precision],
187
- # interactive=True,
188
- # elem_id="filter-columns-precision",
189
- # )
190
- # filter_columns_size = gr.CheckboxGroup(
191
- # label="Model sizes (in billions of parameters)",
192
- # choices=list(NUMERIC_INTERVALS.keys()),
193
- # value=list(NUMERIC_INTERVALS.keys()),
194
- # interactive=True,
195
- # elem_id="filter-columns-size",
196
- # )
197
-
198
  leaderboard_table = gr.components.Dataframe(
199
  value=leaderboard_df[
200
  [c.name for c in fields(AutoEvalColumn) if c.never_hidden] + value
@@ -215,34 +85,7 @@ with demo:
215
  datatype=TYPES,
216
  visible=False,
217
  )
218
- # search_bar.submit(
219
- # update_table,
220
- # [
221
- # hidden_leaderboard_table_for_search,
222
- # shown_columns,
223
- # filter_columns_type,
224
- # filter_columns_precision,
225
- # filter_columns_size,
226
- # deleted_models_visibility,
227
- # search_bar,
228
- # ],
229
- # leaderboard_table,
230
- # )
231
- # for selector in [shown_columns, filter_columns_type, filter_columns_precision, filter_columns_size, deleted_models_visibility]:
232
- # selector.change(
233
- # update_table,
234
- # [
235
- # hidden_leaderboard_table_for_search,
236
- # shown_columns,
237
- # filter_columns_type,
238
- # filter_columns_precision,
239
- # filter_columns_size,
240
- # deleted_models_visibility,
241
- # search_bar,
242
- # ],
243
- # leaderboard_table,
244
- # queue=True,
245
- # )
246
 
247
  with gr.TabItem("📝 About", elem_id="llm-benchmark-tab-table", id=2):
248
  gr.Markdown(LLM_BENCHMARKS_TEXT, elem_classes="markdown-text")
@@ -252,89 +95,26 @@ with demo:
252
  with gr.Row():
253
  gr.Markdown(EVALUATION_QUEUE_TEXT, elem_classes="markdown-text")
254
 
255
- # with gr.Column():
256
- # with gr.Accordion(
257
- # f"✅ Finished Evaluations ({len(finished_eval_queue_df)})",
258
- # open=False,
259
- # ):
260
- # with gr.Row():
261
- # finished_eval_table = gr.components.Dataframe(
262
- # value=finished_eval_queue_df,
263
- # headers=EVAL_COLS,
264
- # datatype=EVAL_TYPES,
265
- # row_count=5,
266
- # )
267
- # with gr.Accordion(
268
- # f"🔄 Running Evaluation Queue ({len(running_eval_queue_df)})",
269
- # open=False,
270
- # ):
271
- # with gr.Row():
272
- # running_eval_table = gr.components.Dataframe(
273
- # value=running_eval_queue_df,
274
- # headers=EVAL_COLS,
275
- # datatype=EVAL_TYPES,
276
- # row_count=5,
277
- # )
278
-
279
- # with gr.Accordion(
280
- # f"⏳ Pending Evaluation Queue ({len(pending_eval_queue_df)})",
281
- # open=False,
282
- # ):
283
- # with gr.Row():
284
- # pending_eval_table = gr.components.Dataframe(
285
- # value=pending_eval_queue_df,
286
- # headers=EVAL_COLS,
287
- # datatype=EVAL_TYPES,
288
- # row_count=5,
289
- # )
290
  with gr.Row():
291
  gr.Markdown("# ✉️✨ Submit your files here!", elem_classes="markdown-text")
292
293
  with gr.Row():
294
- upload = gr.Interface(fn=upload_file,inputs="file" ,outputs=None)
295
- # with gr.Column():
296
- # model_name_textbox = gr.Textbox(label="Model name")
297
- # revision_name_textbox = gr.Textbox(label="Revision commit", placeholder="main")
298
- # model_type = gr.Dropdown(
299
- # choices=[t.to_str(" : ") for t in ModelType if t != ModelType.Unknown],
300
- # label="Model type",
301
- # multiselect=False,
302
- # value=None,
303
- # interactive=True,
304
- # )
305
-
306
- # with gr.Column():
307
- # precision = gr.Dropdown(
308
- # choices=[i.value.name for i in Precision if i != Precision.Unknown],
309
- # label="Precision",
310
- # multiselect=False,
311
- # value="float16",
312
- # interactive=True,
313
- # )
314
- # weight_type = gr.Dropdown(
315
- # choices=[i.value.name for i in WeightType],
316
- # label="Weights type",
317
- # multiselect=False,
318
- # value="Original",
319
- # interactive=True,
320
- # )
321
- # base_model_name_textbox = gr.Textbox(label="Base model (for delta or adapter weights)")
322
-
323
- # submit_button = gr.Button("Submit Eval")
324
- # submission_result = gr.Markdown()
325
- # submit_button.click(
326
- # add_new_eval,
327
- # [
328
- # model_name_textbox,
329
- # base_model_name_textbox,
330
- # revision_name_textbox,
331
- # precision,
332
- # weight_type,
333
- # model_type,
334
- # ],
335
- # submission_result,
336
- # )
337
-
338
  with gr.Row():
339
  with gr.Accordion("📙 Citation", open=False):
340
  citation_button = gr.Textbox(
@@ -347,6 +127,6 @@ with demo:
347
 
348
  scheduler = BackgroundScheduler()
349
  scheduler.add_job(restart_space, "interval", seconds=30)
350
- scheduler.start()
351
 
352
  demo.queue(default_concurrency_limit=40).launch()
 
33
  def restart_space():
34
  API.restart_space(repo_id=REPO_ID, token=TOKEN)
35
 
36
+ # try:
37
+ # print(EVAL_REQUESTS_PATH)
38
+ # snapshot_download(
39
+ # repo_id=QUEUE_REPO, local_dir=EVAL_REQUESTS_PATH, repo_type="dataset", tqdm_class=None, etag_timeout=30
40
+ # )
41
+ # except Exception:
42
+ # restart_space()
43
+ # try:
44
+ # print(EVAL_RESULTS_PATH)
45
+ # snapshot_download(
46
+ # repo_id=RESULTS_REPO, local_dir=EVAL_RESULTS_PATH, repo_type="dataset", tqdm_class=None, etag_timeout=30
47
+ # )
48
+ # except Exception:
49
+ # restart_space()
50
 
51
 
52
  raw_data, original_df = get_leaderboard_df(EVAL_RESULTS_PATH, EVAL_REQUESTS_PATH, COLS, BENCHMARK_COLS)
 
56
 
57
  leaderboard_df = original_df.copy()
58
 
 
 
 
 
 
59
60
  demo = gr.Blocks(css=custom_css)
61
  with demo:
62
  gr.HTML(TITLE)
 
64
 
65
  with gr.Tabs(elem_classes="tab-buttons") as tabs:
66
  with gr.TabItem("🏅 LLM Benchmark", elem_id="llm-benchmark-tab-table", id=0):
67
+
68
  leaderboard_table = gr.components.Dataframe(
69
  value=leaderboard_df[
70
  [c.name for c in fields(AutoEvalColumn) if c.never_hidden] + value
 
85
  datatype=TYPES,
86
  visible=False,
87
  )
88
+
89
 
90
  with gr.TabItem("📝 About", elem_id="llm-benchmark-tab-table", id=2):
91
  gr.Markdown(LLM_BENCHMARKS_TEXT, elem_classes="markdown-text")
 
95
  with gr.Row():
96
  gr.Markdown(EVALUATION_QUEUE_TEXT, elem_classes="markdown-text")
97
 
98
+
99
  with gr.Row():
100
  gr.Markdown("# ✉️✨ Submit your files here!", elem_classes="markdown-text")
101
 
102
+ # Refresh the leaderboard after a results file is uploaded: re-read the results and rebuild the displayed table.
+ def update_leaderboard(file_obj):
103
+ upload_file(file_obj)
104
+ raw_data, original_df = get_leaderboard_df(EVAL_RESULTS_PATH, EVAL_REQUESTS_PATH, COLS, BENCHMARK_COLS)
105
+ value=[ c.name for c in fields(AutoEvalColumn)
106
+ if c.displayed_by_default and not c.hidden and not c.never_hidden]
107
+
108
+ leaderboard_df = original_df.copy()
109
+ leaderboard_table = leaderboard_df[
110
+ [c.name for c in fields(AutoEvalColumn) if c.never_hidden] + value
111
+ + [AutoEvalColumn.dummy.name]
112
+ ]
113
+ return leaderboard_table
114
+
115
  with gr.Row():
116
+ upload = gr.Interface(fn=update_leaderboard, inputs="file", outputs=leaderboard_table)
117
+
118
  with gr.Row():
119
  with gr.Accordion("📙 Citation", open=False):
120
  citation_button = gr.Textbox(
 
127
 
128
  scheduler = BackgroundScheduler()
129
  scheduler.add_job(restart_space, "interval", seconds=30)
130
+ # scheduler.start()
131
 
132
  demo.queue(default_concurrency_limit=40).launch()
src/display/about.py CHANGED
@@ -11,8 +11,11 @@ class Task:
11
  # Init: to update with your specific keys
12
  class Tasks(Enum):
13
  # task_key in the json file, metric_key in the json file, name to display in the leaderboard
14
- task0 = Task("task_name1", "metric_name", "First task")
15
- task1 = Task("task_name2", "metric_name", "Second task")
 
 
 
16
 
17
 
18
  # Your leaderboard name
@@ -25,43 +28,54 @@ Intro text
25
 
26
  # Which evaluations are you running? how can people reproduce what you have?
27
  LLM_BENCHMARKS_TEXT = f"""
28
- ## TEST
29
 
30
- ## Reproducibility
31
- To reproduce our results, here is the commands you can run:
 
 
32
 
33
  """
34
 
35
  EVALUATION_QUEUE_TEXT = """
36
- ## Some good practices before submitting a model
37
-
38
- ### 1) Make sure you can load your model and tokenizer using AutoClasses:
39
- ```python
40
- from transformers import AutoConfig, AutoModel, AutoTokenizer
41
- config = AutoConfig.from_pretrained("your model name", revision=revision)
42
- model = AutoModel.from_pretrained("your model name", revision=revision)
43
- tokenizer = AutoTokenizer.from_pretrained("your model name", revision=revision)
44
  ```
45
- If this step fails, follow the error messages to debug your model before submitting it. It's likely your model has been improperly uploaded.
46
-
47
- Note: make sure your model is public!
48
- Note: if your model needs `use_remote_code=True`, we do not support this option yet but we are working on adding it, stay posted!
49
-
50
- ### 2) Convert your model weights to [safetensors](https://huggingface.co/docs/safetensors/index)
51
- It's a new format for storing weights which is safer and faster to load and use. It will also allow us to add the number of parameters of your model to the `Extended Viewer`!
52
-
53
- ### 3) Make sure your model has an open license!
54
- This is a leaderboard for Open LLMs, and we'd love for as many people as possible to know they can use your model 🤗
55
-
56
- ### 4) Fill up your model card
57
- When we add extra information about models to the leaderboard, it will be automatically taken from the model card
58
 
59
- ## In case of model failure
60
- If your model is displayed in the `FAILED` category, its execution stopped.
61
- Make sure you have followed the above steps first.
62
- If everything is done, check you can launch the EleutherAIHarness on your model locally, using the above command without modifications (you can add `--limit` to limit the number of examples per task).
 
63
  """
64
 
65
  CITATION_BUTTON_LABEL = "Copy the following snippet to cite these results"
66
- CITATION_BUTTON_TEXT = r"""
67
  """
 
11
  # Init: to update with your specific keys
12
  class Tasks(Enum):
13
  # task_key in the json file, metric_key in the json file, name to display in the leaderboard
14
+ task0 = Task("fetaqa", "bleu", "fetaqa")
15
+ task1 = Task("hitab_ng", "bleu", "hitab_ng")
16
+ task2 = Task("qtsumm", "bleu", "qtsumm")
17
+ task3 = Task("scigen", "bleu", "scigen")
18
+ task4 = Task("totto", "bleu", "totto")
19
 
20
 
21
  # Your leaderboard name
 
28
 
29
  # Which evaluations are you running? how can people reproduce what you have?
30
  LLM_BENCHMARKS_TEXT = f"""
31
+ ## Introduction
32
+ ### OPENT2T: the first open-source toolkit for table-to-text generation
33
+ - designed to reproduce existing table pre-training models and to expedite the development of new models
34
+ - enables performance comparison: we implemented and compared
35
+   - 19 types of large language models under zero- and few-shot settings
36
+   - 7 fine-tuned models
39
 
40
+ Currently, we have released evaluation results on the "fetaqa", "hitab_ng", "qtsumm", "scigen", and "totto" datasets.
41
+
42
+ ## Notice
43
+ Currently, we only display local evaluation results on the leaderboard. We plan to add automatic evaluation scripts in the future. At that point, you will be able to upload your model output for evaluation.
44
 
45
  """
46
 
47
  EVALUATION_QUEUE_TEXT = """
48
+ ## Submission file format
49
+ Currently, we only accept final results files. Your submission should be a JSON file in the following format:
50
+ ```json
51
+ {
52
+   "config": {
53
+     "model_dtype": <model_dtype>,
54
+     "model_name": <model_name>
55
+   },
56
+   "results": {
57
+     "task_name": {
58
+       "bleu": <score>
59
+     },
60
+
61
+     ...
62
+   }
63
+ }
64
```
65
 
66
+ ## Please make sure
67
+ #### 1) Your uploaded file is in the requested format
68
+ Failure to do so may lead to unexpected errors.
69
+ #### 2) Your results cover all tasks.
70
+ If some tasks do not have results, please assign a score of zero to those categories.
71
  """
72
 
73
  CITATION_BUTTON_LABEL = "Copy the following snippet to cite these results"
74
+ CITATION_BUTTON_TEXT = r"""
75
+ @article{zhang2024opent2t,
76
+ title={OPENT2T: An Open-Source Toolkit for Table-to-Text Generation},
77
+ author={Zhang, Haowei and Si, Shengyun and Zhao, Yilun and Wang, Pengcheng and Nan, Linyong and Tang, Xiangru and Radev, Dragomir and Cohan, Arman},
78
+ journal={ACL},
79
+ year={2024},
80
+ }
81
  """
src/display/utils.py CHANGED
@@ -69,7 +69,7 @@ class ModelType(Enum):
69
  FT = ModelDetails(name="fine-tuned", symbol="🔶")
70
  IFT = ModelDetails(name="instruction-tuned", symbol="⭕")
71
  RL = ModelDetails(name="RL-tuned", symbol="🟦")
72
- Unknown = ModelDetails(name="", symbol="?")
73
 
74
  def to_str(self, separator=" "):
75
  return f"{self.value.symbol}{separator}{self.value.name}"
 
69
  FT = ModelDetails(name="fine-tuned", symbol="🔶")
70
  IFT = ModelDetails(name="instruction-tuned", symbol="⭕")
71
  RL = ModelDetails(name="RL-tuned", symbol="🟦")
72
+ Unknown = ModelDetails(name="", symbol="")
73
 
74
  def to_str(self, separator=" "):
75
  return f"{self.value.symbol}{separator}{self.value.name}"
src/envs.py CHANGED
@@ -8,8 +8,8 @@ TOKEN = os.environ.get("TOKEN", None)
8
  OWNER = "huohua0314"
9
  TEST = "demo-leaderboard"
10
  REPO_ID = f"{OWNER}/OPENT2T"
11
- QUEUE_REPO = f"{TEST}/requests"
12
- RESULTS_REPO = f"{TEST}/results"
13
 
14
  CACHE_PATH=os.getenv("HF_HOME", ".")
15
 
 
8
  OWNER = "huohua0314"
9
  TEST = "demo-leaderboard"
10
  REPO_ID = f"{OWNER}/OPENT2T"
11
+ QUEUE_REPO = f"{OWNER}/requests"
12
+ RESULTS_REPO = f"{OWNER}/opent2t_results"
13
 
14
  CACHE_PATH=os.getenv("HF_HOME", ".")
15
 
src/leaderboard/read_evals.py CHANGED
@@ -107,14 +107,9 @@ class EvalResult:
107
 
108
  def to_dict(self):
109
  """Converts the Eval Result to a dict compatible with our dataframe display"""
110
- print(Tasks)
111
- for item in Tasks:
112
- print(item)
113
 
114
- print(AutoEvalColumn.model_type_symbol.name)
115
- print(AutoEvalColumn.model.name)
116
- print(AutoEvalColumn.dummy.name)
117
- print(AutoEvalColumn.average.name)
118
  average = sum([v for v in self.results.values() if v is not None]) / len(Tasks)
119
  data_dict = {
120
  "eval_name": self.eval_name, # not a column, just a save name,
@@ -133,11 +128,9 @@ class EvalResult:
133
  # AutoEvalColumn.still_on_hub.name: self.still_on_hub,
134
  }
135
 
136
- print("zzzz")
137
- print(self.results)
138
  for task in Tasks:
139
  data_dict[task.value.col_name] = self.results[task.value.benchmark]
140
- print("tttttzzzz")
141
 
142
  return data_dict
143
 
@@ -169,7 +162,7 @@ def get_raw_eval_results(results_path: str, requests_path: str) -> list[EvalResu
169
  model_result_filepaths = []
170
 
171
  for root, _, files in os.walk(results_path):
172
- print(files)
173
  # We should only have json files in model results
174
  if len(files) == 0 or any([not f.endswith(".json") for f in files]):
175
  continue
@@ -197,16 +190,11 @@ def get_raw_eval_results(results_path: str, requests_path: str) -> list[EvalResu
197
  eval_results[eval_name] = eval_result
198
 
199
  results = []
200
- # print(eval_results)
201
  for v in eval_results.values():
202
- print(f"v : {v}")
203
  try:
204
  v.to_dict() # we test if the dict version is complete
205
- print("aaa")
206
  results.append(v)
207
  except KeyError as e: # not all eval values present
208
- print(f"error : {e}")
209
- print("nono")
210
  continue
211
 
212
  return results
 
107
 
108
  def to_dict(self):
109
  """Converts the Eval Result to a dict compatible with our dataframe display"""
110
+
 
 
111
 
112
+
 
 
 
113
  average = sum([v for v in self.results.values() if v is not None]) / len(Tasks)
114
  data_dict = {
115
  "eval_name": self.eval_name, # not a column, just a save name,
 
128
  # AutoEvalColumn.still_on_hub.name: self.still_on_hub,
129
  }
130
 
131
+
 
132
  for task in Tasks:
133
  data_dict[task.value.col_name] = self.results[task.value.benchmark]
 
134
 
135
  return data_dict
136
 
 
162
  model_result_filepaths = []
163
 
164
  for root, _, files in os.walk(results_path):
165
+
166
  # We should only have json files in model results
167
  if len(files) == 0 or any([not f.endswith(".json") for f in files]):
168
  continue
 
190
  eval_results[eval_name] = eval_result
191
 
192
  results = []
 
193
  for v in eval_results.values():
 
194
  try:
195
  v.to_dict() # we test if the dict version is complete
 
196
  results.append(v)
197
  except KeyError as e: # not all eval values present
 
 
198
  continue
199
 
200
  return results
src/populate.py CHANGED
@@ -14,8 +14,7 @@ def get_leaderboard_df(results_path: str, requests_path: str, cols: list, benchm
14
  all_data_json = [v.to_dict() for v in raw_data]
15
 
16
  df = pd.DataFrame.from_records(all_data_json)
17
- print(df)
18
- print(AutoEvalColumn.average.name)
19
  df = df.sort_values(by=[AutoEvalColumn.average.name], ascending=False)
20
 
21
  df = df[cols].round(decimals=2)
 
14
  all_data_json = [v.to_dict() for v in raw_data]
15
 
16
  df = pd.DataFrame.from_records(all_data_json)
17
+
 
18
  df = df.sort_values(by=[AutoEvalColumn.average.name], ascending=False)
19
 
20
  df = df[cols].round(decimals=2)