haiengchuihaian committed
Commit 99ec88b · 1 parent: 4ea4ae9
leaderboard pipeline
Browse files
- app.py +33 -253
- src/display/about.py +45 -31
- src/display/utils.py +1 -1
- src/envs.py +2 -2
- src/leaderboard/read_evals.py +4 -16
- src/populate.py +1 -2
app.py
CHANGED
@@ -33,20 +33,20 @@ from src.submission.submit import add_new_eval, upload_file
 def restart_space():
     API.restart_space(repo_id=REPO_ID, token=TOKEN)
 
-try:
-    print(EVAL_REQUESTS_PATH)
-    snapshot_download(
-        repo_id=QUEUE_REPO, local_dir=EVAL_REQUESTS_PATH, repo_type="dataset", tqdm_class=None, etag_timeout=30
-    )
-except Exception:
-    restart_space()
-try:
-    print(EVAL_RESULTS_PATH)
-    snapshot_download(
-        repo_id=RESULTS_REPO, local_dir=EVAL_RESULTS_PATH, repo_type="dataset", tqdm_class=None, etag_timeout=30
-    )
-except Exception:
-    restart_space()
+# try:
+#     print(EVAL_REQUESTS_PATH)
+#     snapshot_download(
+#         repo_id=QUEUE_REPO, local_dir=EVAL_REQUESTS_PATH, repo_type="dataset", tqdm_class=None, etag_timeout=30
+#     )
+# except Exception:
+#     restart_space()
+# try:
+#     print(EVAL_RESULTS_PATH)
+#     snapshot_download(
+#         repo_id=RESULTS_REPO, local_dir=EVAL_RESULTS_PATH, repo_type="dataset", tqdm_class=None, etag_timeout=30
+#     )
+# except Exception:
+#     restart_space()
 
 
 raw_data, original_df = get_leaderboard_df(EVAL_RESULTS_PATH, EVAL_REQUESTS_PATH, COLS, BENCHMARK_COLS)
@@ -56,86 +56,7 @@ value=[ c.name for c in fields(AutoEvalColumn)
 
 leaderboard_df = original_df.copy()
 
-# (
-#     finished_eval_queue_df,
-#     running_eval_queue_df,
-#     pending_eval_queue_df,
-# ) = get_evaluation_queue_df(EVAL_REQUESTS_PATH, EVAL_COLS)
-
-# Searching and filtering
-# def update_table(
-#     hidden_df: pd.DataFrame,
-#     columns: list,
-#     type_query: list,
-#     precision_query: str,
-#     size_query: list,
-#     show_deleted: bool,
-#     query: str,
-# ):
-#     filtered_df = filter_models(hidden_df, type_query, size_query, precision_query, show_deleted)
-#     filtered_df = filter_queries(query, filtered_df)
-#     df = select_columns(filtered_df, columns)
-#     return df
-
-
-# def search_table(df: pd.DataFrame, query: str) -> pd.DataFrame:
-#     return df[(df[AutoEvalColumn.dummy.name].str.contains(query, case=False))]
-
-
-# def select_columns(df: pd.DataFrame, columns: list) -> pd.DataFrame:
-#     always_here_cols = [
-#         AutoEvalColumn.model_type_symbol.name,
-#         AutoEvalColumn.model.name,
-#     ]
-#     # We use COLS to maintain sorting
-#     filtered_df = df[
-#         always_here_cols + [c for c in COLS if c in df.columns and c in columns] + [AutoEvalColumn.dummy.name]
-#     ]
-#     return filtered_df
-
-
-# def filter_queries(query: str, filtered_df: pd.DataFrame) -> pd.DataFrame:
-#     final_df = []
-#     if query != "":
-#         queries = [q.strip() for q in query.split(";")]
-#         for _q in queries:
-#             _q = _q.strip()
-#             if _q != "":
-#                 temp_filtered_df = search_table(filtered_df, _q)
-#                 if len(temp_filtered_df) > 0:
-#                     final_df.append(temp_filtered_df)
-#         if len(final_df) > 0:
-#             filtered_df = pd.concat(final_df)
-#             filtered_df = filtered_df.drop_duplicates(
-#                 subset=[AutoEvalColumn.model.name, AutoEvalColumn.precision.name, AutoEvalColumn.revision.name]
-#             )
-
-#     return filtered_df
-
-
-# def filter_models(
-#     df: pd.DataFrame, type_query: list, size_query: list, precision_query: list, show_deleted: bool
-# ) -> pd.DataFrame:
-#     # Show all models
-#     if show_deleted:
-#         filtered_df = df
-#     else:  # Show only still on the hub models
-#         filtered_df = df[df[AutoEvalColumn.still_on_hub.name] == True]
-
-#     type_emoji = [t[0] for t in type_query]
-#     filtered_df = filtered_df.loc[df[AutoEvalColumn.model_type_symbol.name].isin(type_emoji)]
-#     filtered_df = filtered_df.loc[df[AutoEvalColumn.precision.name].isin(precision_query + ["None"])]
-
-#     numeric_interval = pd.IntervalIndex(sorted([NUMERIC_INTERVALS[s] for s in size_query]))
-#     params_column = pd.to_numeric(df[AutoEvalColumn.params.name], errors="coerce")
-#     mask = params_column.apply(lambda x: any(numeric_interval.contains(x)))
-#     filtered_df = filtered_df.loc[mask]
-
-#     return filtered_df
-
-
-# print([c.name for c in fields(AutoEvalColumn) if c.never_hidden])
+
 demo = gr.Blocks(css=custom_css)
 with demo:
     gr.HTML(TITLE)
@@ -143,58 +64,7 @@ with demo:
 
     with gr.Tabs(elem_classes="tab-buttons") as tabs:
         with gr.TabItem("🏅 LLM Benchmark", elem_id="llm-benchmark-tab-table", id=0):
-
-            # with gr.Column():
-            #     with gr.Row():
-            #         search_bar = gr.Textbox(
-            #             placeholder=" 🔍 Search for your model (separate multiple queries with `;`) and press ENTER...",
-            #             show_label=False,
-            #             elem_id="search-bar",
-            #         )
-            #     with gr.Row():
-            #         shown_columns = gr.CheckboxGroup(
-            #             choices=[
-            #                 c.name
-            #                 for c in fields(AutoEvalColumn)
-            #                 if not c.hidden and not c.never_hidden and not c.dummy
-            #             ],
-            #             value=[
-            #                 c.name
-            #                 for c in fields(AutoEvalColumn)
-            #                 if c.displayed_by_default and not c.hidden and not c.never_hidden
-            #             ],
-            #             label="Select columns to show",
-            #             elem_id="column-select",
-            #             interactive=True,
-            #         )
-            #     with gr.Row():
-            #         deleted_models_visibility = gr.Checkbox(
-            #             value=False, label="Show gated/private/deleted models", interactive=True
-            #         )
-            # with gr.Column(min_width=320):
-            #     #with gr.Box(elem_id="box-filter"):
-            #     filter_columns_type = gr.CheckboxGroup(
-            #         label="Model types",
-            #         choices=[t.to_str() for t in ModelType],
-            #         value=[t.to_str() for t in ModelType],
-            #         interactive=True,
-            #         elem_id="filter-columns-type",
-            #     )
-            #     filter_columns_precision = gr.CheckboxGroup(
-            #         label="Precision",
-            #         choices=[i.value.name for i in Precision],
-            #         value=[i.value.name for i in Precision],
-            #         interactive=True,
-            #         elem_id="filter-columns-precision",
-            #     )
-            #     filter_columns_size = gr.CheckboxGroup(
-            #         label="Model sizes (in billions of parameters)",
-            #         choices=list(NUMERIC_INTERVALS.keys()),
-            #         value=list(NUMERIC_INTERVALS.keys()),
-            #         interactive=True,
-            #         elem_id="filter-columns-size",
-            #     )
-
+
             leaderboard_table = gr.components.Dataframe(
                 value=leaderboard_df[
                     [c.name for c in fields(AutoEvalColumn) if c.never_hidden] + value
@@ -215,34 +85,7 @@ with demo:
                 datatype=TYPES,
                 visible=False,
             )
-
-            # update_table,
-            # [
-            #     hidden_leaderboard_table_for_search,
-            #     shown_columns,
-            #     filter_columns_type,
-            #     filter_columns_precision,
-            #     filter_columns_size,
-            #     deleted_models_visibility,
-            #     search_bar,
-            # ],
-            # leaderboard_table,
-            # )
-            # for selector in [shown_columns, filter_columns_type, filter_columns_precision, filter_columns_size, deleted_models_visibility]:
-            #     selector.change(
-            #         update_table,
-            #         [
-            #             hidden_leaderboard_table_for_search,
-            #             shown_columns,
-            #             filter_columns_type,
-            #             filter_columns_precision,
-            #             filter_columns_size,
-            #             deleted_models_visibility,
-            #             search_bar,
-            #         ],
-            #         leaderboard_table,
-            #         queue=True,
-            #     )
+
 
         with gr.TabItem("📝 About", elem_id="llm-benchmark-tab-table", id=2):
             gr.Markdown(LLM_BENCHMARKS_TEXT, elem_classes="markdown-text")
@@ -252,89 +95,26 @@ with demo:
             with gr.Row():
                 gr.Markdown(EVALUATION_QUEUE_TEXT, elem_classes="markdown-text")
 
-
-            # with gr.Accordion(
-            #     f"✅ Finished Evaluations ({len(finished_eval_queue_df)})",
-            #     open=False,
-            # ):
-            #     with gr.Row():
-            #         finished_eval_table = gr.components.Dataframe(
-            #             value=finished_eval_queue_df,
-            #             headers=EVAL_COLS,
-            #             datatype=EVAL_TYPES,
-            #             row_count=5,
-            #         )
-            # with gr.Accordion(
-            #     f"🔄 Running Evaluation Queue ({len(running_eval_queue_df)})",
-            #     open=False,
-            # ):
-            #     with gr.Row():
-            #         running_eval_table = gr.components.Dataframe(
-            #             value=running_eval_queue_df,
-            #             headers=EVAL_COLS,
-            #             datatype=EVAL_TYPES,
-            #             row_count=5,
-            #         )
-
-            # with gr.Accordion(
-            #     f"⏳ Pending Evaluation Queue ({len(pending_eval_queue_df)})",
-            #     open=False,
-            # ):
-            #     with gr.Row():
-            #         pending_eval_table = gr.components.Dataframe(
-            #             value=pending_eval_queue_df,
-            #             headers=EVAL_COLS,
-            #             datatype=EVAL_TYPES,
-            #             row_count=5,
-            #         )
+
             with gr.Row():
                 gr.Markdown("# ✉️✨ Submit your files here!", elem_classes="markdown-text")
 
+            def update_leaderboard(file_obj):
+                upload_file(file_obj)
+                raw_data, original_df = get_leaderboard_df(EVAL_RESULTS_PATH, EVAL_REQUESTS_PATH, COLS, BENCHMARK_COLS)
+                value = [c.name for c in fields(AutoEvalColumn)
+                         if c.displayed_by_default and not c.hidden and not c.never_hidden]
+
+                leaderboard_df = original_df.copy()
+                leaderboard_table = leaderboard_df[
+                    [c.name for c in fields(AutoEvalColumn) if c.never_hidden] + value
+                    + [AutoEvalColumn.dummy.name]
+                ]
+                return leaderboard_table
+
             with gr.Row():
-                upload = gr.Interface(fn=
-
-                # model_name_textbox = gr.Textbox(label="Model name")
-                # revision_name_textbox = gr.Textbox(label="Revision commit", placeholder="main")
-                # model_type = gr.Dropdown(
-                #     choices=[t.to_str(" : ") for t in ModelType if t != ModelType.Unknown],
-                #     label="Model type",
-                #     multiselect=False,
-                #     value=None,
-                #     interactive=True,
-                # )
-
-                # with gr.Column():
-                #     precision = gr.Dropdown(
-                #         choices=[i.value.name for i in Precision if i != Precision.Unknown],
-                #         label="Precision",
-                #         multiselect=False,
-                #         value="float16",
-                #         interactive=True,
-                #     )
-                #     weight_type = gr.Dropdown(
-                #         choices=[i.value.name for i in WeightType],
-                #         label="Weights type",
-                #         multiselect=False,
-                #         value="Original",
-                #         interactive=True,
-                #     )
-                #     base_model_name_textbox = gr.Textbox(label="Base model (for delta or adapter weights)")
-
-                # submit_button = gr.Button("Submit Eval")
-                # submission_result = gr.Markdown()
-                # submit_button.click(
-                #     add_new_eval,
-                #     [
-                #         model_name_textbox,
-                #         base_model_name_textbox,
-                #         revision_name_textbox,
-                #         precision,
-                #         weight_type,
-                #         model_type,
-                #     ],
-                #     submission_result,
-                # )
-
+                upload = gr.Interface(fn=update_leaderboard, inputs="file", outputs=leaderboard_table)
+
     with gr.Row():
         with gr.Accordion("📙 Citation", open=False):
             citation_button = gr.Textbox(
@@ -347,6 +127,6 @@ with demo:
 
 scheduler = BackgroundScheduler()
 scheduler.add_job(restart_space, "interval", seconds=30)
-scheduler.start()
+# scheduler.start()
 
 demo.queue(default_concurrency_limit=40).launch()
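Net effect of the app.py changes: the startup snapshot_download calls, the search/filter helpers, and the model-submission form are all commented out, and a plain file upload is wired straight to the leaderboard table through the new update_leaderboard function. The snippet below is a minimal, self-contained sketch of that upload-and-refresh pattern, not the repo's code: rebuild_leaderboard stands in for upload_file + get_leaderboard_df, the JSON layout follows EVALUATION_QUEUE_TEXT, and it assumes a recent Gradio where gr.File exposes an upload event.

```python
# Minimal sketch of the upload -> rebuild-table pattern this commit wires into app.py.
# rebuild_leaderboard is a stand-in for the repo's upload_file() + get_leaderboard_df().
import json

import gradio as gr
import pandas as pd


def rebuild_leaderboard(file_obj) -> pd.DataFrame:
    if file_obj is None:
        return pd.DataFrame()
    # Gradio hands the upload over as a temp-file path (or an object exposing .name).
    path = getattr(file_obj, "name", file_obj)
    with open(path) as f:
        submission = json.load(f)

    row = {"model": submission.get("config", {}).get("model_name", "unknown")}
    for task, scores in submission.get("results", {}).items():
        row[task] = scores.get("bleu", 0.0)

    # The real app would merge this row into the stored results before re-sorting.
    return pd.DataFrame([row]).round(2)


with gr.Blocks() as demo:
    leaderboard_table = gr.Dataframe(label="Leaderboard")
    results_file = gr.File(label="Upload a results JSON")
    # Refresh the table whenever a new file is uploaded.
    results_file.upload(rebuild_leaderboard, inputs=results_file, outputs=leaderboard_table)

if __name__ == "__main__":
    demo.launch()
```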
src/display/about.py
CHANGED
@@ -11,8 +11,11 @@ class Task:
 # Init: to update with your specific keys
 class Tasks(Enum):
     # task_key in the json file, metric_key in the json file, name to display in the leaderboard
-    task0 = Task("
-    task1 = Task("
+    task0 = Task("fetaqa", "bleu", "fetaqa")
+    task1 = Task("hitab_ng", "bleu", "hitab_ng")
+    task2 = Task("qtsumm", "bleu", "qtsumm")
+    task3 = Task("scigen", "bleu", "scigen")
+    task4 = Task("totto", "bleu", "totto")
 
 
 # Your leaderboard name
@@ -25,43 +28,54 @@ Intro text
 
 # Which evaluations are you running? how can people reproduce what you have?
 LLM_BENCHMARKS_TEXT = f"""
-##
+## Introduction
+### OPENT2T: the first open-source toolkit for table-to-text generation
+- designed to reproduce existing table pre-training models and to expedite the development of new models
+- performance comparison: we implemented and compared
+  - 19 types of large language models under zero- and few-shot settings
+  - 7 fine-tuned models
 
-
-
+Currently we have released evaluation results on the "fetaqa", "hitab_ng", "qtsumm", "scigen", and "totto" datasets.
+
+## Notice
+Currently, we only display local evaluation results on the leaderboard. We plan to add automatic evaluation scripts in the future; at that point, you will be able to upload your model outputs for evaluation.
 
 """
 
 EVALUATION_QUEUE_TEXT = """
-##
-
-
-
-
-
-
+## Your submission file format
+Currently, we only accept final results files. Your submission should be a JSON file in the following format:
+```json
+{
+    "config": {
+        "model_dtype": <model_dtype>,
+        "model_name": <model_name>,
+    },
+    "results": {
+        "task_name": {
+            "bleu": <score>
+        },
+
+        ...
+    }
+}
 ```
-If this step fails, follow the error messages to debug your model before submitting it. It's likely your model has been improperly uploaded.
-
-Note: make sure your model is public!
-Note: if your model needs `use_remote_code=True`, we do not support this option yet but we are working on adding it, stay posted!
-
-### 2) Convert your model weights to [safetensors](https://huggingface.co/docs/safetensors/index)
-It's a new format for storing weights which is safer and faster to load and use. It will also allow us to add the number of parameters of your model to the `Extended Viewer`!
-
-### 3) Make sure your model has an open license!
-This is a leaderboard for Open LLMs, and we'd love for as many people as possible to know they can use your model 🤗
-
-### 4) Fill up your model card
-When we add extra information about models to the leaderboard, it will be automatically taken from the model card
 
-##
-
-
-
+## Please make sure
+#### 1) Your uploaded file is in the requested format
+Failure to do so may lead to unexpected errors.
+#### 2) Your results cover all tasks
+If some tasks do not have results, please assign a score of zero to those categories.
 """
 
 CITATION_BUTTON_LABEL = "Copy the following snippet to cite these results"
-CITATION_BUTTON_TEXT =
+CITATION_BUTTON_TEXT = r"""
+@article{zhang2024opent2t,
+  title={OPENT2T: An Open-Source Toolkit for Table-to-Text Generation},
+  author={Zhang, Haowei and Si, Shengyun and Zhao, Yilun and Wang, Pengcheng and Nan, Linyong and Tang, Xiangru and Radev, Dragomir and Cohan, Arman},
+  journal={ACL},
+  year={2024},
+}
 """
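The submission layout documented in the new EVALUATION_QUEUE_TEXT can be produced with a few lines of Python. The helper below is illustrative, not part of the repo: write_submission, the model name, and the scores are made up; it just fills every task from the Tasks enum above and defaults missing ones to zero, as the text requests.

```python
# Hypothetical helper that emits a submission file in the format described by
# EVALUATION_QUEUE_TEXT. Scores are placeholders; tasks without results get 0.0.
import json

TASKS = ["fetaqa", "hitab_ng", "qtsumm", "scigen", "totto"]


def write_submission(model_name: str, scores: dict, path: str = "results.json") -> None:
    submission = {
        "config": {"model_dtype": "float16", "model_name": model_name},
        "results": {task: {"bleu": float(scores.get(task, 0.0))} for task in TASKS},
    }
    with open(path, "w") as f:
        json.dump(submission, f, indent=2)


write_submission("my-org/my-table2text-model", {"fetaqa": 28.5, "totto": 41.2})
```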
src/display/utils.py
CHANGED
@@ -69,7 +69,7 @@ class ModelType(Enum):
     FT = ModelDetails(name="fine-tuned", symbol="🔶")
     IFT = ModelDetails(name="instruction-tuned", symbol="⭕")
     RL = ModelDetails(name="RL-tuned", symbol="🟦")
-    Unknown = ModelDetails(name="", symbol="
+    Unknown = ModelDetails(name="", symbol="★")
 
     def to_str(self, separator=" "):
         return f"{self.value.symbol}{separator}{self.value.name}"
src/envs.py
CHANGED
@@ -8,8 +8,8 @@ TOKEN = os.environ.get("TOKEN", None)
 OWNER = "huohua0314"
 TEST = "demo-leaderboard"
 REPO_ID = f"{OWNER}/OPENT2T"
-QUEUE_REPO = f"{
-RESULTS_REPO = f"{
+QUEUE_REPO = f"{OWNER}/requests"
+RESULTS_REPO = f"{OWNER}/opent2t_results"
 
 CACHE_PATH=os.getenv("HF_HOME", ".")
 
src/leaderboard/read_evals.py
CHANGED
@@ -107,14 +107,9 @@ class EvalResult:
 
     def to_dict(self):
         """Converts the Eval Result to a dict compatible with our dataframe display"""
-
-        for item in Tasks:
-            print(item)
+
 
-
-        print(AutoEvalColumn.model.name)
-        print(AutoEvalColumn.dummy.name)
-        print(AutoEvalColumn.average.name)
+
         average = sum([v for v in self.results.values() if v is not None]) / len(Tasks)
         data_dict = {
             "eval_name": self.eval_name,  # not a column, just a save name,
@@ -133,11 +128,9 @@ class EvalResult:
             # AutoEvalColumn.still_on_hub.name: self.still_on_hub,
         }
 
-
-        print(self.results)
+
         for task in Tasks:
             data_dict[task.value.col_name] = self.results[task.value.benchmark]
-        print("tttttzzzz")
 
         return data_dict
 
@@ -169,7 +162,7 @@ def get_raw_eval_results(results_path: str, requests_path: str) -> list[EvalResult]:
     model_result_filepaths = []
 
     for root, _, files in os.walk(results_path):
-
+
         # We should only have json files in model results
         if len(files) == 0 or any([not f.endswith(".json") for f in files]):
             continue
@@ -197,16 +190,11 @@ def get_raw_eval_results(results_path: str, requests_path: str) -> list[EvalResult]:
         eval_results[eval_name] = eval_result
 
     results = []
-    # print(eval_results)
     for v in eval_results.values():
-        print(f"v : {v}")
         try:
             v.to_dict()  # we test if the dict version is complete
-            print("aaa")
             results.append(v)
         except KeyError as e:  # not all eval values present
-            print(f"error : {e}")
-            print("nono")
             continue
 
     return results
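One detail worth noting in EvalResult.to_dict (unchanged here apart from the removed debug prints): the average divides by len(Tasks) rather than by the number of scores actually present, so a missing task drags the average down as if it scored zero, which matches the "assign a score of zero" rule in about.py. A small illustration with invented scores:

```python
# Averaging rule from EvalResult.to_dict(): None scores are skipped in the sum,
# but the denominator stays len(Tasks), so a missing task counts as zero.
TASKS = ["fetaqa", "hitab_ng", "qtsumm", "scigen", "totto"]
results = {"fetaqa": 28.5, "hitab_ng": 30.1, "qtsumm": None, "scigen": 12.0, "totto": 41.2}

average = sum(v for v in results.values() if v is not None) / len(TASKS)
print(round(average, 2))  # 22.36 -- not 27.95, the mean of the four present scores
```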
src/populate.py
CHANGED
@@ -14,8 +14,7 @@ def get_leaderboard_df(results_path: str, requests_path: str, cols: list, benchmark_cols: list) -> pd.DataFrame:
     all_data_json = [v.to_dict() for v in raw_data]
 
     df = pd.DataFrame.from_records(all_data_json)
-
-    print(AutoEvalColumn.average.name)
+
     df = df.sort_values(by=[AutoEvalColumn.average.name], ascending=False)
 
     df = df[cols].round(decimals=2)
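After this cleanup, get_leaderboard_df reduces to: build records from the EvalResult dicts, sort by the average column, and round. A standalone pandas sketch of that step, with made-up records and column names standing in for the AutoEvalColumn fields:

```python
# Standalone sketch of the populate step: records -> DataFrame -> sort -> round.
# "model", "average", and the task columns are illustrative stand-ins for AutoEvalColumn names.
import pandas as pd

records = [
    {"model": "model-a", "average": 22.357, "fetaqa": 28.5, "totto": 41.2},
    {"model": "model-b", "average": 30.123, "fetaqa": 33.0, "totto": 45.9},
]
cols = ["model", "average", "fetaqa", "totto"]

df = pd.DataFrame.from_records(records)
df = df.sort_values(by=["average"], ascending=False)
df = df[cols].round(decimals=2)
print(df)
```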