LLMArena committed on
Commit f7beec8 · verified · 1 parent: f515676

Create app.py

Files changed (1): app.py +712 -0
app.py ADDED
@@ -0,0 +1,712 @@
import argparse
import ast
import glob
import pickle
import traceback
from datetime import datetime

import pandas as pd
import gradio as gr
import numpy as np


basic_component_values = [None] * 6
leader_component_values = [None] * 5


def make_default_md_1():
    leaderboard_md = """
# 🏆 LLM Arena in Russian: Leaderboard
"""
    return leaderboard_md


def make_default_md_2():
    leaderboard_md = """
The LLM Arena platform is an open crowdsourcing platform for evaluating large language models (LLMs) in Russian. We collect pairwise comparisons from people to rank LLMs with the Bradley-Terry model and display model ratings on the Elo scale.
Chatbot Arena in Russian depends on community participation, so please contribute by casting your vote!

- To **add your model** to the comparison, contact us on Telegram: [Group](https://t.me/+bFEOl-Bdmok4NGUy)
- If you **found a bug** or **have a suggestion**, contact us: [Roman](https://t.me/roman_kucev)
- You can cast your vote at llmarena.ru!
"""
    return leaderboard_md


def make_arena_leaderboard_md(arena_df, last_updated_time):
    total_votes = sum(arena_df["num_battles"])
    total_models = len(arena_df)
    space = "   "

    leaderboard_md = f"""
Total # of models: **{total_models}**.{space} Total # of votes: **{total_votes:,}**.{space} Last updated: {last_updated_time}.

***Rank (UB)**: model's rank (upper bound), determined as one plus the number of models that are statistically better than the target model.
Model A is statistically better than Model B when the lower bound of Model A's rating is greater than the upper bound of Model B's rating (with a 95% confidence interval).
See Figure 1 below for a visualization of the confidence intervals of model ratings.
"""
    return leaderboard_md


def make_category_arena_leaderboard_md(arena_df, arena_subset_df, name="Overall"):
    total_votes = sum(arena_df["num_battles"])
    total_models = len(arena_df)
    space = "   "
    total_subset_votes = sum(arena_subset_df["num_battles"])
    total_subset_models = len(arena_subset_df)
    leaderboard_md = f"""### {cat_name_to_explanation[name]}
#### {space} #models: **{total_subset_models} ({round(total_subset_models / total_models * 100)}%)** {space} #votes: **{total_subset_votes:,} ({round(total_subset_votes / total_votes * 100)}%)**{space}
"""
    return leaderboard_md


def model_hyperlink(model_name, link):
    return f'<a target="_blank" href="{link}" style="color: var(--link-text-color); text-decoration: underline; text-decoration-style: dotted;">{model_name}</a>'


def load_leaderboard_table_csv(filename, add_hyperlink=True):
    lines = open(filename).readlines()
    heads = [v.strip() for v in lines[0].split(",")]
    rows = []
    for i in range(1, len(lines)):
        row = [v.strip() for v in lines[i].split(",")]
        # build exactly one record per CSV row, converting numeric columns
        item = {}
        for h, v in zip(heads, row):
            if h == "Arena Elo rating":
                if v != "-":
                    v = int(ast.literal_eval(v))
                else:
                    v = np.nan
            elif h == "MMLU":
                if v != "-":
                    v = round(ast.literal_eval(v) * 100, 1)
                else:
                    v = np.nan
            elif h == "MT-bench (win rate %)":
                if v != "-":
                    v = round(ast.literal_eval(v[:-1]), 1)
                else:
                    v = np.nan
            elif h == "MT-bench (score)":
                if v != "-":
                    v = round(ast.literal_eval(v), 2)
                else:
                    v = np.nan
            item[h] = v
        if add_hyperlink:
            item["Model"] = model_hyperlink(item["Model"], item["Link"])
        rows.append(item)

    return rows
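
# Note: load_leaderboard_table_csv expects the CSV to provide at least the
# columns referenced elsewhere in this file: "key", "Model", "Link",
# "Organization", "License", and "Knowledge cutoff date". A hypothetical
# header row (illustrative only, column order not guaranteed) might look like:
#   key,Model,Link,Organization,License,Knowledge cutoff date,Arena Elo rating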


def create_ranking_str(ranking, ranking_difference):
    if ranking_difference > 0:
        return f"{int(ranking)} \u2191"
    elif ranking_difference < 0:
        return f"{int(ranking)} \u2193"
    else:
        return f"{int(ranking)}"


def recompute_final_ranking(arena_df):
    # compute ranking based on CI: rank = 1 + the number of models whose
    # lower rating bound exceeds this model's upper rating bound
    ranking = {}
    for i, model_a in enumerate(arena_df.index):
        ranking[model_a] = 1
        for j, model_b in enumerate(arena_df.index):
            if i == j:
                continue
            if (
                arena_df.loc[model_b]["rating_q025"]
                > arena_df.loc[model_a]["rating_q975"]
            ):
                ranking[model_a] += 1
    return list(ranking.values())
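
# Illustrative example (hypothetical numbers, not app data): with 95% CIs
#   A: [1210, 1250], B: [1190, 1230], C: [1100, 1140]
# neither A nor B statistically beats the other (their intervals overlap), so
# both get rank 1, while C is beaten by both (1210 > 1140 and 1190 > 1140)
# and gets rank 1 + 2 = 3. This is the "Rank (UB)" shown on the leaderboard.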


def get_arena_table(arena_df, model_table_df, arena_subset_df=None):
    arena_df = arena_df.sort_values(
        by=["final_ranking", "rating"], ascending=[True, False]
    )
    arena_df["final_ranking"] = recompute_final_ranking(arena_df)
    arena_df = arena_df.sort_values(
        by=["final_ranking", "rating"], ascending=[True, False]
    )

    if arena_subset_df is not None:
        # filter out models not in the arena_df
        arena_subset_df = arena_subset_df[arena_subset_df.index.isin(arena_df.index)]
        arena_subset_df = arena_subset_df.sort_values(by=["rating"], ascending=False)
        arena_subset_df["final_ranking"] = recompute_final_ranking(arena_subset_df)
        # keep only the subset's models in arena_df and recompute final_ranking
        arena_df = arena_df[arena_df.index.isin(arena_subset_df.index)]
        arena_df["final_ranking"] = recompute_final_ranking(arena_df)

        # assign tie-free rankings by display order
        arena_subset_df["final_ranking_no_tie"] = range(1, len(arena_subset_df) + 1)
        arena_df["final_ranking_no_tie"] = range(1, len(arena_df) + 1)
        # join arena_df and arena_subset_df on index
        arena_df = arena_subset_df.join(
            arena_df["final_ranking"], rsuffix="_global", how="inner"
        )
        arena_df["ranking_difference"] = (
            arena_df["final_ranking_global"] - arena_df["final_ranking"]
        )

        arena_df = arena_df.sort_values(
            by=["final_ranking", "rating"], ascending=[True, False]
        )
        arena_df["final_ranking"] = arena_df.apply(
            lambda x: create_ranking_str(x["final_ranking"], x["ranking_difference"]),
            axis=1,
        )

    arena_df["final_ranking"] = arena_df["final_ranking"].astype(str)

    values = []
    for i in range(len(arena_df)):
        row = []
        model_key = arena_df.index[i]
        try:
            model_name = model_table_df[model_table_df["key"] == model_key][
                "Model"
            ].values[0]
            ranking = arena_df.iloc[i].get("final_ranking") or i + 1
            row.append(ranking)
            if arena_subset_df is not None:
                row.append(arena_df.iloc[i].get("ranking_difference") or 0)
            row.append(model_name)
            row.append(round(arena_df.iloc[i]["rating"]))
            upper_diff = round(
                arena_df.iloc[i]["rating_q975"] - arena_df.iloc[i]["rating"]
            )
            lower_diff = round(
                arena_df.iloc[i]["rating"] - arena_df.iloc[i]["rating_q025"]
            )
            row.append(f"+{upper_diff}/-{lower_diff}")
            row.append(round(arena_df.iloc[i]["num_battles"]))
            row.append(
                model_table_df[model_table_df["key"] == model_key][
                    "Organization"
                ].values[0]
            )
            row.append(
                model_table_df[model_table_df["key"] == model_key]["License"].values[0]
            )
            cutoff_date = model_table_df[model_table_df["key"] == model_key][
                "Knowledge cutoff date"
            ].values[0]
            if cutoff_date == "-":
                row.append("Unknown")
            else:
                row.append(cutoff_date)
            values.append(row)
        except Exception as e:
            traceback.print_exc()
            print(f"{model_key} - {e}")
    return values
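
# The rows produced by get_arena_table are positional and must stay aligned
# with the Dataframe headers declared in build_leaderboard_tab:
#   Rank* (UB), [Delta - category views only,] Model, Arena Elo, 95% CI,
#   Votes, Organization, License, Knowledge Cutoff.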


key_to_category_name = {
    "full": "Overall",
    "crowdsourcing/simple_prompts": "crowdsourcing/simple_prompts",
    "site_visitors/medium_prompts": "site_visitors/medium_prompts",
    "site_visitors/medium_prompts:style control": "site_visitors/medium_prompts:style control",
}
cat_name_to_explanation = {
    "Overall": "All queries",
    "crowdsourcing/simple_prompts": "Queries collected through crowdsourcing. Mostly simple ones.",
    "site_visitors/medium_prompts": "Queries from website visitors. Contain more complex prompts.",
    "site_visitors/medium_prompts:style control": "Queries from website visitors. Contain more complex prompts. [Reduced stylistic influence](https://lmsys.org/blog/2024-08-28-style-control/) of the response on the rating.",
}

cat_name_to_baseline = {
    "Hard Prompts (English)": "English",
}

actual_categories = [
    "Overall",
    "crowdsourcing/simple_prompts",
    "site_visitors/medium_prompts",
    "site_visitors/medium_prompts:style control",
]


def read_elo_file(elo_results_file, leaderboard_table_file):
    arena_dfs = {}
    category_elo_results = {}
    with open(elo_results_file, "rb") as fin:
        elo_results = pickle.load(fin)
    last_updated_time = None
    if "full" in elo_results:
        last_updated_time = elo_results["full"]["last_updated_datetime"].split(" ")[0]
    for k in key_to_category_name.keys():
        if k not in elo_results:
            continue
        arena_dfs[key_to_category_name[k]] = elo_results[k]["leaderboard_table_df"]
        category_elo_results[key_to_category_name[k]] = elo_results[k]

    data = load_leaderboard_table_csv(leaderboard_table_file)
    model_table_df = pd.DataFrame(data)

    return (
        last_updated_time,
        arena_dfs,
        category_elo_results,
        elo_results,
        model_table_df,
    )
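
# Sketch of the pickle layout this reader assumes (inferred from the lookups
# above and in build_leaderboard_tab; not a guaranteed schema): a dict keyed
# by category, e.g.
#   {"full": {"leaderboard_table_df": ..., "last_updated_datetime": "...",
#             "win_fraction_heatmap": ..., "battle_count_heatmap": ...,
#             "bootstrap_elo_rating": ..., "average_win_rate_bar": ...}, ...}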


def build_leaderboard_tab(
    elo_results_file, leaderboard_table_file, show_plot=False, mirror=False
):
    (
        last_updated_time,
        arena_dfs,
        category_elo_results,
        elo_results,
        model_table_df,
    ) = read_elo_file(elo_results_file, leaderboard_table_file)

    p1 = category_elo_results["Overall"]["win_fraction_heatmap"]
    p2 = category_elo_results["Overall"]["battle_count_heatmap"]
    p3 = category_elo_results["Overall"]["bootstrap_elo_rating"]
    p4 = category_elo_results["Overall"]["average_win_rate_bar"]
    arena_df = arena_dfs["Overall"]
    default_md = make_default_md_1()
    default_md_2 = make_default_md_2()

    with gr.Row():
        with gr.Column(scale=4):
            md_1 = gr.Markdown(default_md, elem_id="leaderboard_markdown")
        with gr.Column(scale=1):
            vote_button = gr.Button("Vote!", link="https://llmarena.ru")
    md_2 = gr.Markdown(default_md_2, elem_id="leaderboard_markdown")

    if leaderboard_table_file:
        data = load_leaderboard_table_csv(leaderboard_table_file)
        model_table_df = pd.DataFrame(data)

        with gr.Tabs() as tabs:
            arena_table_vals = get_arena_table(arena_df, model_table_df)

            with gr.Tab("Arena", id=0):
                md = make_arena_leaderboard_md(arena_df, last_updated_time)
                lb_description = gr.Markdown(md, elem_id="leaderboard_markdown")

                with gr.Row():
                    with gr.Column(scale=2):
                        category_dropdown = gr.Dropdown(
                            choices=actual_categories,
                            label="Category",
                            value="Overall",
                        )
                        default_category_details = make_category_arena_leaderboard_md(
                            arena_df, arena_df, name="Overall"
                        )
                    with gr.Column(scale=4, variant="panel"):
                        category_deets = gr.Markdown(
                            default_category_details, elem_id="category_deets"
                        )

                arena_vals = pd.DataFrame(
                    arena_table_vals,
                    columns=[
                        "Rank* (UB)",
                        "Model",
                        "Arena Elo",
                        "95% CI",
                        "Votes",
                        "Organization",
                        "License",
                        "Knowledge Cutoff",
                    ],
                )
                elo_display_df = gr.Dataframe(
                    headers=[
                        "Rank* (UB)",
                        "Model",
                        "Arena Elo",
                        "95% CI",
                        "Votes",
                        "Organization",
                        "License",
                        "Knowledge Cutoff",
                    ],
                    datatype=[
                        "str",
                        "markdown",
                        "number",
                        "str",
                        "number",
                        "str",
                        "str",
                        "str",
                    ],
                    value=arena_vals.style,
                    elem_id="arena_leaderboard_dataframe",
                    height=700,
                    column_widths=[70, 190, 100, 100, 90, 130, 150, 100],
                    wrap=True,
                )

                gr.Markdown(
                    elem_id="leaderboard_markdown",
                )

    leader_component_values[:] = [default_md, p1, p2, p3, p4]

    if show_plot:
        more_stats_md = gr.Markdown(
            "## More Statistics for Chatbot Arena",
            elem_id="leaderboard_header_markdown",
        )
        with gr.Row():
            with gr.Column():
                gr.Markdown(
                    "#### Figure 1: Confidence Intervals on Model Strength (via Bootstrapping)",
                    elem_id="plot-title",
                )
                plot_3 = gr.Plot(p3, show_label=False)
            with gr.Column():
                gr.Markdown(
                    "#### Figure 2: Average Win Rate Against All Other Models (Assuming Uniform Sampling and No Ties)",
                    elem_id="plot-title",
                )
                plot_4 = gr.Plot(p4, show_label=False)
        with gr.Row():
            with gr.Column():
                gr.Markdown(
                    "#### Figure 3: Fraction of Model A Wins for All Non-tied A vs. B Battles",
                    elem_id="plot-title",
                )
                plot_1 = gr.Plot(p1, show_label=False, elem_id="plot-container")
            with gr.Column():
                gr.Markdown(
                    "#### Figure 4: Battle Count for Each Combination of Models (without Ties)",
                    elem_id="plot-title",
                )
                plot_2 = gr.Plot(p2, show_label=False)
    else:
        gr.Markdown(
            "",
            elem_id="leaderboard_markdown",
        )

    def update_leaderboard_df(arena_table_vals):
        elo_dataframe = pd.DataFrame(
            arena_table_vals,
            columns=[
                "Rank* (UB)",
                "Delta",
                "Model",
                "Arena Elo",
                "95% CI",
                "Votes",
                "Organization",
                "License",
                "Knowledge Cutoff",
            ],
        )

        def highlight_max(s):
            return [
                "color: green; font-weight: bold"
                if "\u2191" in v
                else "color: red; font-weight: bold"
                if "\u2193" in v
                else ""
                for v in s
            ]

        def highlight_rank_max(s):
            return [
                "color: green; font-weight: bold"
                if v > 0
                else "color: red; font-weight: bold"
                if v < 0
                else ""
                for v in s
            ]

        return elo_dataframe.style.apply(highlight_max, subset=["Rank* (UB)"]).apply(
            highlight_rank_max, subset=["Delta"]
        )
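
    # In category views, a positive Delta (green, with the "↑" added by
    # create_ranking_str) means the model ranks better in the selected
    # category than on the overall leaderboard; negative (red, "↓") means
    # it ranks worse.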

    def update_leaderboard_and_plots(category):
        _, arena_dfs, category_elo_results, _, model_table_df = read_elo_file(
            elo_results_file, leaderboard_table_file
        )

        arena_subset_df = arena_dfs[category]
        # only show models with enough votes in this category
        arena_subset_df = arena_subset_df[arena_subset_df["num_battles"] > 300]
        elo_subset_results = category_elo_results[category]

        baseline_category = cat_name_to_baseline.get(category, "Overall")
        arena_df = arena_dfs[baseline_category]
        arena_values = get_arena_table(
            arena_df,
            model_table_df,
            arena_subset_df=arena_subset_df if category != "Overall" else None,
        )
        if category != "Overall":
            arena_values = update_leaderboard_df(arena_values)
            arena_values = gr.Dataframe(
                headers=[
                    "Rank* (UB)",
                    "Delta",
                    "Model",
                    "Arena Elo",
                    "95% CI",
                    "Votes",
                    "Organization",
                    "License",
                    "Knowledge Cutoff",
                ],
                datatype=[
                    "str",
                    "number",
                    "markdown",
                    "number",
                    "str",
                    "number",
                    "str",
                    "str",
                    "str",
                ],
                value=arena_values,
                elem_id="arena_leaderboard_dataframe",
                height=700,
                column_widths=[70, 70, 200, 90, 100, 90, 120, 150, 100],
                wrap=True,
            )
        else:
            arena_values = gr.Dataframe(
                headers=[
                    "Rank* (UB)",
                    "Model",
                    "Arena Elo",
                    "95% CI",
                    "Votes",
                    "Organization",
                    "License",
                    "Knowledge Cutoff",
                ],
                datatype=[
                    "str",
                    "markdown",
                    "number",
                    "str",
                    "number",
                    "str",
                    "str",
                    "str",
                ],
                value=arena_values,
                elem_id="arena_leaderboard_dataframe",
                height=700,
                column_widths=[70, 190, 100, 100, 90, 140, 150, 100],
                wrap=True,
            )

        p1 = elo_subset_results["win_fraction_heatmap"]
        p2 = elo_subset_results["battle_count_heatmap"]
        p3 = elo_subset_results["bootstrap_elo_rating"]
        p4 = elo_subset_results["average_win_rate_bar"]
        more_stats_md = f"""## More Statistics for Chatbot Arena - {category}
"""
        leaderboard_md = make_category_arena_leaderboard_md(
            arena_df, arena_subset_df, name=category
        )
        return arena_values, p1, p2, p3, p4, more_stats_md, leaderboard_md

    if leaderboard_table_file:
        category_dropdown.change(
            fn=update_leaderboard_and_plots,
            inputs=[category_dropdown],
            outputs=[
                elo_display_df,
                plot_1,
                plot_2,
                plot_3,
                plot_4,
                more_stats_md,
                category_deets,
            ],
        )

    if show_plot and leaderboard_table_file:
        return [
            md_1,
            md_2,
            lb_description,
            category_deets,
            elo_display_df,
            plot_1,
            plot_2,
            plot_3,
            plot_4,
        ]
    return [md_1]


def build_demo(elo_results_file, leaderboard_table_file):
    text_size = gr.themes.sizes.text_lg
    theme = gr.themes.Default.load("theme.json")
    theme.text_size = text_size
    theme.set(
        button_large_text_size="40px",
        button_small_text_size="40px",
        button_large_text_weight="1000",
        button_small_text_weight="1000",
        button_shadow="*shadow_drop_lg",
        button_shadow_hover="*shadow_drop_lg",
        checkbox_label_shadow="*shadow_drop_lg",
        button_shadow_active="*shadow_inset",
        button_secondary_background_fill="*primary_300",
        button_secondary_background_fill_dark="*primary_700",
        button_secondary_background_fill_hover="*primary_200",
        button_secondary_background_fill_hover_dark="*primary_500",
        button_secondary_text_color="*primary_800",
        button_secondary_text_color_dark="white",
    )

    with gr.Blocks(
        title="LLM arena: leaderboard",
        theme=theme,
        css=block_css,
    ) as demo:
        build_leaderboard_tab(
            elo_results_file, leaderboard_table_file, show_plot=True, mirror=True
        )
    return demo


block_css = """
#notice_markdown .prose {
    font-size: 110% !important;
}
#notice_markdown th {
    display: none;
}
#notice_markdown td {
    padding-top: 6px;
    padding-bottom: 6px;
}
#arena_leaderboard_dataframe table {
    font-size: 110%;
}
#full_leaderboard_dataframe table {
    font-size: 110%;
}
#model_description_markdown {
    font-size: 110% !important;
}
#leaderboard_markdown .prose {
    font-size: 110% !important;
}
#leaderboard_markdown td {
    padding-top: 6px;
    padding-bottom: 6px;
}
#leaderboard_dataframe td {
    line-height: 0.1em;
}
#about_markdown .prose {
    font-size: 110% !important;
}
#ack_markdown .prose {
    font-size: 110% !important;
}
#chatbot .prose {
    font-size: 105% !important;
}
.sponsor-image-about img {
    margin: 0 20px;
    margin-top: 20px;
    height: 40px;
    max-height: 100%;
    width: auto;
    float: left;
}

.chatbot h1, h2, h3 {
    margin-top: 8px;
    margin-bottom: 0px;
    padding-bottom: 0px;
}

.chatbot h1 {
    font-size: 130%;
}
.chatbot h2 {
    font-size: 120%;
}
.chatbot h3 {
    font-size: 110%;
}
.chatbot p:not(:first-child) {
    margin-top: 8px;
}

.typing {
    display: inline-block;
}

.cursor {
    display: inline-block;
    width: 7px;
    height: 1em;
    background-color: black;
    vertical-align: middle;
    animation: blink 1s infinite;
}

.dark .cursor {
    display: inline-block;
    width: 7px;
    height: 1em;
    background-color: white;
    vertical-align: middle;
    animation: blink 1s infinite;
}

@keyframes blink {
    0%, 50% { opacity: 1; }
    50.1%, 100% { opacity: 0; }
}

.app {
    max-width: 100% !important;
    padding: 20px !important;
}

a {
    color: #1976D2; /* link color, a shade of blue */
    text-decoration: none; /* no underline by default */
}
a:hover {
    color: #63A4FF; /* hover color */
    text-decoration: underline; /* underline on hover */
}
"""


if __name__ == "__main__":
    parser = argparse.ArgumentParser()
    parser.add_argument("--share", action="store_true")
    parser.add_argument("--host", default="0.0.0.0")
    parser.add_argument("--port", type=int, default=7860)
    args = parser.parse_args()

    # pick the most recent results files by their numeric suffix
    elo_result_files = glob.glob("elo_results_*.pkl")
    elo_result_files.sort(key=lambda x: int(x[12:-4]))
    elo_result_file = elo_result_files[-1]

    leaderboard_table_files = glob.glob("leaderboard_table_*.csv")
    leaderboard_table_files.sort(key=lambda x: int(x[18:-4]))
    leaderboard_table_file = leaderboard_table_files[-1]

    demo = build_demo(elo_result_file, leaderboard_table_file)
    # honor the parsed CLI arguments when launching
    demo.launch(
        server_name=args.host,
        server_port=args.port,
        share=args.share,
        show_api=False,
    )
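
# Example invocation (assumes elo_results_<N>.pkl, leaderboard_table_<N>.csv,
# and theme.json sit next to app.py, where <N> is an integer such as a
# YYYYMMDD date - the newest file by <N> is picked up automatically):
#   python app.py --host 0.0.0.0 --port 7860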