Spaces:
Running
Running
Commit
·
6e57415
1
Parent(s):
8171dbf
update 14B
Browse files- app.py +74 -86
- data/2024-05/14b.xlsx +0 -0
- data/2024-06/14b.xlsx +0 -0
- data/2024-07/14b.xlsx +0 -0
app.py
CHANGED
|
@@ -17,6 +17,7 @@ load_dotenv()
|
|
| 17 |
webhook_url = os.environ.get("WEBHOOK_URL")
|
| 18 |
|
| 19 |
file_name_list = [
|
|
|
|
| 20 |
'9b',
|
| 21 |
'7b',
|
| 22 |
'3b',
|
|
@@ -36,6 +37,7 @@ metric_list = [
|
|
| 36 |
]
|
| 37 |
|
| 38 |
model_size_list = [
|
|
|
|
| 39 |
'~9B',
|
| 40 |
'~7B',
|
| 41 |
'~3B',
|
|
@@ -49,25 +51,13 @@ metric_to_sheet = {
|
|
| 49 |
}
|
| 50 |
|
| 51 |
model_size_to_file_name = {
|
|
|
|
| 52 |
'~9B': '9b',
|
| 53 |
'~7B': '7b',
|
| 54 |
'~3B': '3b',
|
| 55 |
'~1.5B': '1b5',
|
| 56 |
}
|
| 57 |
|
| 58 |
-
css = """
|
| 59 |
-
.gr-dataframe table {
|
| 60 |
-
table-layout: fixed;
|
| 61 |
-
width: 100%; /* Ensures the table fills its container */
|
| 62 |
-
}
|
| 63 |
-
.gr-dataframe th, .gr-dataframe td {
|
| 64 |
-
width: 100px; /* Set the exact width of each cell */
|
| 65 |
-
overflow: hidden; /* Ensures the content doesn't overflow */
|
| 66 |
-
text-overflow: ellipsis; /* Adds an ellipsis (...) if the text overflows */
|
| 67 |
-
white-space: nowrap; /* Keeps the content on a single line */
|
| 68 |
-
}
|
| 69 |
-
"""
|
| 70 |
-
|
| 71 |
about_md = """
|
| 72 |
# Uncheatable Eval
|
| 73 |
|
|
@@ -167,61 +157,49 @@ def update_table(period: str,
|
|
| 167 |
if 'Average (The lower the better)' in combined_data.columns:
|
| 168 |
relevant_columns = [col for col in visible_columns if
|
| 169 |
col not in ['Name', 'Parameters Count (B)', 'Average (The lower the better)']]
|
| 170 |
-
|
| 171 |
-
|
| 172 |
-
|
| 173 |
-
|
| 174 |
-
|
| 175 |
-
|
| 176 |
-
|
| 177 |
-
|
| 178 |
-
|
| 179 |
-
|
| 180 |
-
|
| 181 |
-
|
| 182 |
-
|
| 183 |
-
|
| 184 |
-
|
| 185 |
-
|
| 186 |
-
|
| 187 |
-
|
| 188 |
-
|
| 189 |
-
|
| 190 |
-
|
| 191 |
-
|
| 192 |
-
|
| 193 |
-
|
| 194 |
-
|
| 195 |
-
|
| 196 |
-
|
| 197 |
-
|
| 198 |
-
|
| 199 |
-
|
| 200 |
-
|
| 201 |
-
|
| 202 |
-
|
| 203 |
-
|
| 204 |
-
|
| 205 |
-
|
| 206 |
-
|
| 207 |
-
|
| 208 |
-
|
| 209 |
-
|
| 210 |
-
|
| 211 |
-
|
| 212 |
-
|
| 213 |
-
target_color_columns = []
|
| 214 |
-
if 'Average' in color_columns:
|
| 215 |
-
target_color_columns.append('Average (lower=better)')
|
| 216 |
-
if 'Individual Tests' in color_columns:
|
| 217 |
-
target_color_columns.extend([col for col in filtered_data.columns if
|
| 218 |
-
col not in ['Name', 'Parameters Count (B)', 'Average (lower=better)']])
|
| 219 |
-
|
| 220 |
-
styler = filtered_data.style.format(formatter).map(color_cell, subset=['Parameters Count (B)'])
|
| 221 |
-
for column in target_color_columns:
|
| 222 |
-
styler = styler.background_gradient(cmap=cmap, subset=[column], vmin=vmin[column], vmax=vmax[column])
|
| 223 |
-
|
| 224 |
-
return styler
|
| 225 |
else:
|
| 226 |
return pd.DataFrame()
|
| 227 |
|
|
@@ -334,7 +312,8 @@ def create_scaling_plot(all_data, period):
|
|
| 334 |
y_tick_text = [f"{val:.1f}" for val in y_tick_vals]
|
| 335 |
|
| 336 |
fig.update_xaxes(tickvals=np.log(x_tick_vals), ticktext=x_tick_text, title='Params(B)')
|
| 337 |
-
fig.update_yaxes(tickvals=np.log(y_tick_vals), ticktext=y_tick_text, title='Compression Rate (%)',
|
|
|
|
| 338 |
|
| 339 |
fig.update_layout(
|
| 340 |
xaxis=dict(showgrid=True, zeroline=False),
|
|
@@ -346,20 +325,26 @@ def create_scaling_plot(all_data, period):
|
|
| 346 |
return fig
|
| 347 |
|
| 348 |
|
| 349 |
-
|
| 350 |
-
|
| 351 |
-
|
| 352 |
-
|
| 353 |
-
|
| 354 |
-
|
| 355 |
-
all_data
|
| 356 |
-
|
| 357 |
-
|
| 358 |
-
all_data
|
| 359 |
-
|
| 360 |
-
|
| 361 |
-
|
| 362 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 363 |
|
| 364 |
initial_fig = create_scaling_plot(all_data, time_list[-1])
|
| 365 |
|
|
@@ -385,10 +370,12 @@ css = '''
|
|
| 385 |
|
| 386 |
'''
|
| 387 |
|
|
|
|
|
|
|
|
|
|
| 388 |
with gr.Blocks(css=css) as demo:
|
| 389 |
-
gr.HTML(
|
| 390 |
-
gr.HTML(
|
| 391 |
-
"<h1 style='text-align:center'><span style='font-size:0.8em'>Welcome to Uncheatable Eval LLM Compression Leaderboard, where fancy fine-tuning and cheating won’t work 🚫; only compute 💻, data 📊, and real innovation 🔥 can prevail!</span></h1>")
|
| 392 |
with gr.Tabs() as tabs:
|
| 393 |
with gr.Tab("π Leaderboard"):
|
| 394 |
with gr.Row():
|
|
@@ -430,6 +417,7 @@ with gr.Blocks(css=css) as demo:
|
|
| 430 |
with gr.Tab("π Scaling Law"):
|
| 431 |
period_selector_2 = gr.Dropdown(label="Period", choices=time_list, value=time_list[0])
|
| 432 |
|
|
|
|
| 433 |
def update_plot(period):
    """Return the figure that ``create_scaling_plot`` builds for ``period``.

    The module-level ``all_data`` mapping is passed through unchanged.
    """
    return create_scaling_plot(all_data, period)
|
|
|
|
| 17 |
webhook_url = os.environ.get("WEBHOOK_URL")
|
| 18 |
|
| 19 |
file_name_list = [
|
| 20 |
+
'14b',
|
| 21 |
'9b',
|
| 22 |
'7b',
|
| 23 |
'3b',
|
|
|
|
| 37 |
]
|
| 38 |
|
| 39 |
model_size_list = [
|
| 40 |
+
'~14B',
|
| 41 |
'~9B',
|
| 42 |
'~7B',
|
| 43 |
'~3B',
|
|
|
|
| 51 |
}
|
| 52 |
|
| 53 |
model_size_to_file_name = {
|
| 54 |
+
'~14B': '14b',
|
| 55 |
'~9B': '9b',
|
| 56 |
'~7B': '7b',
|
| 57 |
'~3B': '3b',
|
| 58 |
'~1.5B': '1b5',
|
| 59 |
}
|
| 60 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 61 |
about_md = """
|
| 62 |
# Uncheatable Eval
|
| 63 |
|
|
|
|
| 157 |
if 'Average (The lower the better)' in combined_data.columns:
|
| 158 |
relevant_columns = [col for col in visible_columns if
|
| 159 |
col not in ['Name', 'Parameters Count (B)', 'Average (The lower the better)']]
|
| 160 |
+
if len(combined_data) > 0:
|
| 161 |
+
combined_data['Average (The lower the better)'] = round(combined_data[relevant_columns].mean(axis=1), 3)
|
| 162 |
+
|
| 163 |
+
if len(combined_data) > 0:
|
| 164 |
+
sorted_data = combined_data.sort_values(by=sort_by, ascending=ascending)
|
| 165 |
+
sorted_data = sorted_data.rename(columns={'Average (The lower the better)': 'Average (lower=better)'})
|
| 166 |
+
visible_columns = ['Name', 'Parameters Count (B)', 'Average (lower=better)'] + visible_columns
|
| 167 |
+
filtered_data = sorted_data[visible_columns]
|
| 168 |
+
|
| 169 |
+
filtered_data.columns = [col.replace('_', ' ') for col in filtered_data.columns]
|
| 170 |
+
|
| 171 |
+
formatter = {col: "{:.3f}" for col in filtered_data.columns if
|
| 172 |
+
filtered_data[col].dtype in ['float64', 'float32']}
|
| 173 |
+
|
| 174 |
+
# color gradient
|
| 175 |
+
colors = ["#63be7b", "#ffffff", "#f8696b"]
|
| 176 |
+
cmap = LinearSegmentedColormap.from_list("custom_cmap", colors)
|
| 177 |
+
vmin = {}
|
| 178 |
+
vmax = {}
|
| 179 |
+
for column in filtered_data.columns:
|
| 180 |
+
if column in ['Name', 'Parameters Count (B)']:
|
| 181 |
+
continue
|
| 182 |
+
col_values = filtered_data[column]
|
| 183 |
+
if len(col_values) > 1:
|
| 184 |
+
second_largest = col_values.nlargest(2).iloc[-1]
|
| 185 |
+
vmin[column] = col_values.min()
|
| 186 |
+
vmax[column] = second_largest
|
| 187 |
+
|
| 188 |
+
target_color_columns = []
|
| 189 |
+
if 'Average' in color_columns:
|
| 190 |
+
target_color_columns.append('Average (lower=better)')
|
| 191 |
+
if 'Individual Tests' in color_columns:
|
| 192 |
+
target_color_columns.extend([col for col in filtered_data.columns if
|
| 193 |
+
col not in ['Name', 'Parameters Count (B)', 'Average (lower=better)']])
|
| 194 |
+
|
| 195 |
+
styler = filtered_data.style.format(formatter)
|
| 196 |
+
for column in target_color_columns:
|
| 197 |
+
if column in vmin and column in vmax: # Ensure that the vmin and vmax dicts contain the column
|
| 198 |
+
styler = styler.background_gradient(cmap=cmap, subset=[column], vmin=vmin[column], vmax=vmax[column])
|
| 199 |
+
|
| 200 |
+
return styler
|
| 201 |
+
else:
|
| 202 |
+
return pd.DataFrame()
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 203 |
else:
|
| 204 |
return pd.DataFrame()
|
| 205 |
|
|
|
|
| 312 |
y_tick_text = [f"{val:.1f}" for val in y_tick_vals]
|
| 313 |
|
| 314 |
fig.update_xaxes(tickvals=np.log(x_tick_vals), ticktext=x_tick_text, title='Params(B)')
|
| 315 |
+
fig.update_yaxes(tickvals=np.log(y_tick_vals), ticktext=y_tick_text, title='Compression Rate (%)',
|
| 316 |
+
autorange='reversed')
|
| 317 |
|
| 318 |
fig.update_layout(
|
| 319 |
xaxis=dict(showgrid=True, zeroline=False),
|
|
|
|
| 325 |
return fig
|
| 326 |
|
| 327 |
|
| 328 |
+
def read_all_data(folder_name):
    """Load every workbook found under ``folder_name`` into a nested dict.

    Each folder returned by ``get_folders_matching_format(folder_name)`` is
    treated as one period, identified by the folder's basename. For every
    entry of ``file_name_list`` the workbook ``<folder>/<file_name>.xlsx`` is
    read, and each sheet named in ``sheet_name_list`` is passed through
    ``rename_columns`` before being stored.

    Args:
        folder_name: Root directory handed to ``get_folders_matching_format``.

    Returns:
        A tuple ``(all_data, time_list)`` where
        ``all_data[period][file_name][sheet_name]`` is the value returned by
        ``rename_columns`` for that sheet, and ``time_list`` holds the period
        names in the order the folders were yielded.

    Raises:
        Whatever ``pd.read_excel`` raises when a workbook or one of the
        requested sheets is missing.
    """
    all_data = {}
    time_list = []
    for folder in get_folders_matching_format(folder_name):
        # Use a separate name so the ``folder_name`` argument is not rebound
        # while the loop is still running.
        period = os.path.basename(folder)
        time_list.append(period)
        # The dict is keyed by basename, so the "already present?" check must
        # use the same key; setdefault keeps any data stored for this period.
        period_data = all_data.setdefault(period, {})
        for file_name in file_name_list:
            workbook_path = os.path.join(folder, file_name) + '.xlsx'
            # Passing a list of sheet names makes pandas open the workbook
            # once and return a dict keyed by sheet name.
            sheets = pd.read_excel(workbook_path, sheet_name=list(sheet_name_list))
            file_data = period_data.setdefault(file_name, {})
            for sheet_name in sheet_name_list:
                file_data[sheet_name] = rename_columns(sheets[sheet_name])

    return all_data, time_list
|
| 345 |
+
|
| 346 |
+
|
| 347 |
+
all_data, time_list = read_all_data('data')
|
| 348 |
|
| 349 |
initial_fig = create_scaling_plot(all_data, time_list[-1])
|
| 350 |
|
|
|
|
| 370 |
|
| 371 |
'''
|
| 372 |
|
| 373 |
+
TITLE_HTML = '<h1 style="text-align:center"><span style="font-size:1.3em">π LLM Compression Leaderboard</span></h1>'
|
| 374 |
+
SUBTITLE_HTML = "<h1 style='text-align:center'><span style='font-size:0.8em'>Welcome to Uncheatable Eval LLM Compression Leaderboard, where fancy fine-tuning and cheating won’t work 🚫; only compute 💻, data 📊, and real innovation 🔥 can prevail!</span></h1>"
|
| 375 |
+
|
| 376 |
with gr.Blocks(css=css) as demo:
|
| 377 |
+
gr.HTML(TITLE_HTML)
|
| 378 |
+
gr.HTML(SUBTITLE_HTML)
|
|
|
|
| 379 |
with gr.Tabs() as tabs:
|
| 380 |
with gr.Tab("π Leaderboard"):
|
| 381 |
with gr.Row():
|
|
|
|
| 417 |
with gr.Tab("π Scaling Law"):
|
| 418 |
period_selector_2 = gr.Dropdown(label="Period", choices=time_list, value=time_list[0])
|
| 419 |
|
| 420 |
+
|
| 421 |
def update_plot(period):
    """Return the figure that ``create_scaling_plot`` builds for ``period``.

    The module-level ``all_data`` mapping is passed through unchanged.
    """
    return create_scaling_plot(all_data, period)
|
data/2024-05/14b.xlsx
ADDED
|
Binary file (10.5 kB). View file
|
|
|
data/2024-06/14b.xlsx
ADDED
|
Binary file (10.5 kB). View file
|
|
|
data/2024-07/14b.xlsx
ADDED
|
Binary file (11.3 kB). View file
|
|
|