Ahmad Shallouf committed
Commit 7a759bb · 1 Parent(s): 5eb3c1d

deleted ignored file

Files changed (1)
  1. app_wildbench.py +0 -526
app_wildbench.py DELETED
@@ -1,526 +0,0 @@
- """A gradio app that renders a static leaderboard. This is used for Hugging Face Space."""
- import ast
- import argparse
- import glob
- import pickle
- import plotly
- import gradio as gr
- import numpy as np
- import pandas as pd
- import gradio as gr
- import pandas as pd
- from pathlib import Path
- import json
- from constants import BANNER, CITATION_TEXT, WINRATE_HEATMAP, css, js_code, all_task_types, DEFAULT_LP, TASK_TYPE_STR, \
-     js_light
- from datetime import datetime, timezone
- from data_utils import load_eval_results, sample_an_eval_result, apply_length_penalty, post_processing, add_winrates, \
-     add_winrates_tasks
- # from gradio.themes.utils import colors, fonts, sizes
- from themes import Seafoam
- from huggingface_hub import HfApi
- # from datasets import Dataset, load_dataset, concatenate_datasets
- import os, uuid
- from utils_display import model_info
-
- # get the last updated time from the elo_ranks.all.jsonl file
- LAST_UPDATED = None
- with open("_intro.md", "r") as f:
-     INTRO_MD = f.read()
-
- with open("_about_us.md", "r") as f:
-     ABOUT_MD = f.read()
-
- with open("_header.md", "r") as f:
-     HEADER_MD = f.read()
-
- original_df, ablation_df = None, None
- eval_results = load_eval_results()
-
- available_models = [] # to be filled in later
-
-
- def display_chat_history(model_selections, task_selections):
-     eval_item = sample_an_eval_result(eval_results, model_selections, task_selections)
-     session_id = eval_item["session_id"]
-     chats = [x["content"] for x in eval_item['conversation_input']]
-     # form a list of tuples of two adjacent messages in chats
-     chats_common = chats[:] + [None]
-     # chats_modelA = ["Model A Output"] + [eval_item["model_A_output"]]
-     # chats_modelB = ["Model B Output"] + [eval_item["model_B_output"]]
-     chats_modelA = [None] + [eval_item["model_A_output"]]
-     chats_modelB = [None] + [eval_item["model_B_output"]]
-     message_history_common = [(chats_common[i], chats_common[i + 1]) for i in range(0, len(chats_common) - 1, 2)]
-     message_history_model_A = [(chats_modelA[i], chats_modelA[i + 1]) for i in range(0, len(chats_modelA) - 1, 2)]
-     message_history_model_B = [(chats_modelB[i], chats_modelB[i + 1]) for i in range(0, len(chats_modelB) - 1, 2)]
-     checklist_string = ""
-     for item in eval_item["checklist"]:
-         checklist_string += f"1. {item}\n"
-     list_reasons = eval_item["reason"].strip().split(". ")
-     # remove the last one if it is empty
-     if list_reasons[-1] == "":
-         list_reasons = list_reasons[:-1]
-     list_reasons = "\n".join([f"- {item}." for item in list_reasons])
-     gpt4_reason = f"### Choice: {eval_item['choice']}. Reason: ⬇️\n" + list_reasons
-     assignment_string = f"Model A: {eval_item['model_A']} | Model B: {eval_item['model_B']}"
-     user_intent = f"- 🆔: `{session_id}` \n- 💬 **User Intent:** {eval_item['intent']} \n- ⚙️ **Task category**: {', '.join(eval_item['all_tags'])}"
-     return session_id, user_intent, message_history_common, message_history_model_A, message_history_model_B, gpt4_reason, checklist_string, assignment_string
-
-
- def slider_change_main(length_penalty):
-     global original_df, ablation_df
-     adjusted_df = apply_length_penalty(original_df, ablation_df, length_penalty)
-     adjusted_df = adjusted_df[["Model", "Overall Elo", "Task-Avg Elo", "# battles", "Length"]]
-     adjusted_df = adjusted_df.sort_values(by="Overall Elo", ascending=False)
-     adjusted_df = add_winrates(adjusted_df)
-     adjusted_df = adjusted_df.drop(columns=["Length"])
-     return adjusted_df
-
-
- def slider_change_full(length_penalty, show_winrate):
-     global original_df, ablation_df
-     adjusted_df = apply_length_penalty(original_df, ablation_df, length_penalty)
-     # sort the model by the "Task-Avg Elo" column
-     adjusted_df = adjusted_df.sort_values(by="Task-Avg Elo", ascending=False)
-     adjusted_df.drop(columns=["Overall Elo", "Task-Avg Elo", "# battles", "Length"], inplace=True)
-     if show_winrate == "none":
-         return adjusted_df
-     elif show_winrate == "gpt-3.5":
-         adjusted_df = add_winrates_tasks(adjusted_df, ref="gpt-3.5")
-     elif show_winrate == "gpt-4":
-         adjusted_df = add_winrates_tasks(adjusted_df, ref="gpt-4")
-     return adjusted_df
-
-
- seafoam = Seafoam()
-
-
- def build_demo(TYPES):
-     global original_df, ablation_df, skip_empty_original_df, skip_empty_ablation_df, available_models
-     with gr.Blocks(theme=gr.themes.Soft(), css=css, js=js_light) as demo:
-         # with gr.Blocks(theme=seafoam, css=css) as demo:
-         gr.HTML(BANNER, elem_id="banner")
-         # gr.Markdown("### Work in progress. Please do not share.", elem_classes="markdown-text") # TODO: remove this later.
-         gr.Markdown(HEADER_MD, elem_classes="markdown-text")
-
-         with gr.Tabs(elem_classes="tab-buttons") as tabs:
-             with gr.TabItem("🏅 Leaderboard", elem_id="od-benchmark-tab-table", id=0):
-                 gr.Markdown(
-                     f"**Version**: WildBench (v1.0; 2024.03.07) | **# Examples**: 1024 | **# Models**: {len(available_models)} | **# Comparisons**: 26k",
-                     elem_classes="markdown-text")
-
-                 with gr.TabItem("Main Table", elem_id="od-benchmark-tab-table-ablation", id=0, elem_classes="subtab"):
-                     # original_df, ablation_df = skip_empty_original_df, skip_empty_ablation_df
-                     default_main_df = apply_length_penalty(original_df, ablation_df, length_penalty=DEFAULT_LP)
-                     default_main_df = default_main_df[["Model", "Overall Elo", "Task-Avg Elo", "# battles", "Length"]]
-                     default_main_df = add_winrates(default_main_df)
-                     default_main_df = default_main_df.drop(columns=["Length"])
-                     # TODO: add the win rate for GPT-4 and GPT-3.5T
-                     with gr.Row():
-                         with gr.Column(scale=4):
-                             gr.Markdown(
-                                 "**Overall Elo**: [Standard Elo rating with boostrap.](https://en.wikipedia.org/wiki/Elo_rating_system). | **Task-Avg Elo**: Compute Elo on subsets of each task type and then take avg. | **Win Rates**: [Estimated by Elo differences](https://www.hexwiki.net/index.php/Elo_rating#Definition). | **Length penalty**: Models w/ longer outputs are penalized. (Plz check 📖 **Details**.)",
-                                 elem_classes="markdown-text-small top-left-LP")
-                         with gr.Column(scale=0.8):
-                             length_penlty_slider = gr.Slider(minimum=0.1, maximum=1, step=0.1, value=DEFAULT_LP,
-                                 label="Length Penalty", elem_id="length-penalty-slider")
-                     # checkbox_skip_empty = gr.Checkbox(label="Skip empty results", value=False, elem_id="skip-empty-checkbox", scale=2)
-                     leaderboard_table = gr.components.Dataframe(
-                         value=default_main_df,
-                         datatype=TYPES,
-                         # max_rows=None,
-                         height=1000,
-                         elem_id="leaderboard-table",
-                         interactive=False,
-                         visible=True,
-                         min_width=60,
-                     )
-                     length_penlty_slider.change(fn=slider_change_main, inputs=[length_penlty_slider],
-                         outputs=[leaderboard_table])
-
-                 with gr.TabItem("All Tasks (Win% vs GPT-3.5T)", elem_id="od-benchmark-tab-table-ablation", id=1):
-                     with gr.Row():
-                         with gr.Column(scale=4):
-                             gr.Markdown(TASK_TYPE_STR, elem_classes="markdown-text-small top-left-LP")
-                         with gr.Column(scale=0.8):
-                             length_penlty_slider_full = gr.Slider(minimum=0.1, maximum=1, step=0.1, value=DEFAULT_LP,
-                                 label="Length Penalty",
-                                 elem_id="length-penalty-slider")
-                     default_full_df = apply_length_penalty(original_df, ablation_df, length_penalty=DEFAULT_LP)
-                     # do not show the "# battles" column here
-                     default_full_df = default_full_df.drop(
-                         columns=["Overall Elo", "Task-Avg Elo", "# battles", "Length"])
-                     default_full_df = add_winrates_tasks(default_full_df, ref="gpt-3.5")
-
-                     leaderboard_table_full = gr.components.Dataframe(
-                         value=default_full_df,
-                         datatype=TYPES,
-                         # max_rows=None,
-                         height=1000,
-                         elem_id="leaderboard-table-full_table",
-                         interactive=False,
-                         visible=True,
-                         min_width=60,
-                     )
-                     show_winrate = gr.Checkbox(value="gpt-3.5", visible=False)
-                     length_penlty_slider_full.change(fn=slider_change_full,
-                         inputs=[length_penlty_slider_full, show_winrate],
-                         outputs=[leaderboard_table_full])
-
-                 with gr.TabItem("All Tasks (Win% vs GPT-4)", elem_id="od-benchmark-tab-table-ablation", id=2):
-                     with gr.Row():
-                         with gr.Column(scale=4):
-                             gr.Markdown(TASK_TYPE_STR, elem_classes="markdown-text-small top-left-LP")
-                         with gr.Column(scale=0.8):
-                             length_penlty_slider_full = gr.Slider(minimum=0.1, maximum=1, step=0.1, value=DEFAULT_LP,
-                                 label="Length Penalty",
-                                 elem_id="length-penalty-slider")
-                     default_full_df = apply_length_penalty(original_df, ablation_df, length_penalty=DEFAULT_LP)
-                     # do not show the "# battles" column here
-                     default_full_df = default_full_df.drop(
-                         columns=["Overall Elo", "Task-Avg Elo", "# battles", "Length"])
-                     default_full_df = add_winrates_tasks(default_full_df, ref="gpt-4")
-                     leaderboard_table_full = gr.components.Dataframe(
-                         value=default_full_df,
-                         datatype=TYPES,
-                         # max_rows=None,
-                         height=1000,
-                         elem_id="leaderboard-table-full_table",
-                         interactive=False,
-                         visible=True,
-                         min_width=60,
-                     )
-                     show_winrate = gr.Checkbox(value="gpt-4", visible=False)
-                     length_penlty_slider_full.change(fn=slider_change_full,
-                         inputs=[length_penlty_slider_full, show_winrate],
-                         outputs=[leaderboard_table_full])
-
-                 with gr.TabItem("All Tasks (Elo)", elem_id="od-benchmark-tab-table-ablation", id=3):
-                     with gr.Row():
-                         with gr.Column(scale=4):
-                             gr.Markdown(TASK_TYPE_STR, elem_classes="markdown-text-small top-left-LP")
-                         with gr.Column(scale=0.8):
-                             length_penlty_slider_full = gr.Slider(minimum=0.1, maximum=1, step=0.1, value=DEFAULT_LP,
-                                 label="Length Penalty",
-                                 elem_id="length-penalty-slider")
-                     default_full_df = apply_length_penalty(original_df, ablation_df, length_penalty=DEFAULT_LP)
-                     # do not show the "# battles" column here
-                     default_full_df = default_full_df.drop(
-                         columns=["Overall Elo", "Task-Avg Elo", "# battles", "Length"])
-                     leaderboard_table_full = gr.components.Dataframe(
-                         value=default_full_df,
-                         datatype=TYPES,
-                         # max_rows=None,
-                         height=1000,
-                         elem_id="leaderboard-table-full_table",
-                         interactive=False,
-                         visible=True,
-                         min_width=60,
-                     )
-                     show_winrate = gr.Checkbox(value="none", visible=False)
-                     length_penlty_slider_full.change(fn=slider_change_full,
-                         inputs=[length_penlty_slider_full, show_winrate],
-                         outputs=[leaderboard_table_full])
-
-                 # with gr.TabItem("Pairwise Win Rates", elem_id="od-benchmark-tab-table-ablation", id=4):
-                 # # TODO: show all winrate
-                 # # winrates_heatmap = pickle.load(open("data_dir/pairwise_win_fractions.pkl", "rb"))
-                 # # gr.Plot(value=winrates_heatmap, scale=2, min_width=800, container=False, elem_classes="plotly-plot", visible=True)
-                 # gr.HTML(WINRATE_HEATMAP, visible=True)
-
-             with gr.TabItem("📖 Details", elem_id="od-benchmark-tab-table", id=1):
-                 gr.Markdown(INTRO_MD, elem_classes="markdown-text-details")
-
-             with gr.TabItem("🔍 Explore | 🆚 Evaluate", elem_id="od-benchmark-tab-table", id=2):
-
-                 with gr.Row():
-                     btn_show_history = gr.Button("🎲 Click here to sample an example + a pair of LLM outputs! ",
-                         elem_classes="sample_button")
-
-                 with gr.Row():
-                     with gr.Column(scale=1.5):
-                         with gr.Accordion("Choose models to sample from", open=False, elem_classes="accordion-label"):
-                             model_options = available_models
-                             selected_models = gr.CheckboxGroup(model_options, info="", value=model_options,
-                                 show_label=False, elem_id="select-models")
-                             clear_button = gr.Button("Clear", elem_classes="btn_boderline_gray", scale=1)
-                             # clear the selected_models
-                             clear_button.click(lambda: {selected_models: {"value": [], "__type__": "update"}},
-                                 inputs=[], outputs=[selected_models])
-                     with gr.Column(scale=1):
-                         with gr.Accordion("Choose task types to sample from", open=False,
-                                 elem_classes="accordion-label"):
-                             select_tasks = gr.CheckboxGroup(all_task_types, info="", value=all_task_types,
-                                 show_label=False, elem_id="select-tasks")
-                             clear_task_button = gr.Button("Clear", elem_classes="btn_boderline_gray", scale=1)
-                             # clear the select_tasks
-                             clear_task_button.click(lambda: {select_tasks: {"value": [], "__type__": "update"}},
-                                 inputs=[], outputs=[select_tasks])
-
-                 with gr.Row():
-                     with gr.Column():
-                         gr.Markdown("## 📢 Chat History", elem_classes="markdown-text")
-                         Chatbot_Common = gr.Chatbot(avatar_images=["human_icon.jpeg", "ai_icon.png"], height="auto",
-                             container=False, label="Common Chat History", likeable=False,
-                             show_share_button=False, show_label=True,
-                             elem_classes="chat-common", layout="bubble")
-                         Chatbot_Common.change(lambda x: x, inputs=[], outputs=[], scroll_to_output=False, js=js_code)
-                         with gr.Accordion("✍️ Task Annotation", elem_classes="accordion-label", open=False):
-                             user_intent = gr.Markdown("", elem_classes="markdown-text-small")
-                 # two columns for the two models
-                 with gr.Row():
-                     # https://www.gradio.app/docs/chatbot
-                     with gr.Column():
-                         gr.Markdown("## ⬅️ Model A Output", elem_classes="markdown-text")
-                         Chatbot_A = gr.Chatbot(height="auto", container=False, label="Model A Output", likeable=False,
-                             show_share_button=False, show_label=True, elem_classes="chat-specific",
-                             layout="bubble")
-                         Chatbot_A.change(lambda x: x, inputs=[], outputs=[], scroll_to_output=False, js=js_code)
-                     with gr.Column():
-                         # add a Markdown to show this is for Model B
-                         gr.Markdown("## ➡️ Model B Output", elem_classes="markdown-text")
-                         Chatbot_B = gr.Chatbot(height="auto", container=False, label="Model B Output", likeable=False,
-                             show_share_button=False, show_label=True, elem_classes="chat-specific",
-                             layout="bubble")
-                         Chatbot_B.change(lambda x: x, inputs=[], outputs=[], scroll_to_output=False, js=js_code)
-                 with gr.Row():
-                     # Here we can show the GPT-4 judgement for the model outputs
-                     # show a textarea
-                     with gr.Column():
-                         with gr.Accordion("⏱️ Checklist", open=False, elem_classes="accordion-label"):
-                             checklist = gr.Markdown("### Checklist: \n Will be shown later.",
-                                 elem_classes="markdown-text-tiny")
-                         with gr.Accordion("⚖️ GPT-4 Judgement", open=False,
-                                 elem_classes="accordion-label") as gpt4_accordion:
-                             # gpt4_reason = gr.TextArea(label="GPT-4 Judgement", placeholder="Will be shown later.", type="text", elem_classes="", max_lines=10, show_copy_button=True)
-                             gpt4_reason = gr.Markdown("Will be shown later.", elem_classes="markdown-text-tiny")
-
-                 with gr.Row():
-                     # show buttons for user to choose which model output is better or Tie
-                     btn_model_A = gr.Button("⬅️ Model A is better! ", elem_classes="btn_boderline_gray", scale=2,
-                         interactive=False)
-                     btn_tie = gr.Button("🟰 Tie", elem_classes="btn_boderline_gray", scale=2, interactive=False)
-                     btn_model_B = gr.Button("➡️ Model B is better!", elem_classes="btn_boderline_gray", scale=2,
-                         interactive=False)
-                 with gr.Row():
-                     with gr.Column(scale=2):
-                         reason_textbox = gr.Textbox(label="Reason", placeholder="Please input your reason here.",
-                             type="text", elem_classes="", max_lines=10, lines=8,
-                             show_copy_button=False, visible=True, scale=4, interactive=True)
-                     with gr.Column():
-                         with gr.Row():
-                             user_choice = gr.Markdown("Your choice: N/A", elem_classes="markdown-text", visible=True)
-                             btn_pass = gr.Button("🔁 Next", elem_classes="btn_boderline_next", scale=1)
-                         user_name = gr.Textbox(label="Your HF Username", placeholder="Your HuggingFace username",
-                             type="text", elem_classes="", max_lines=1, show_copy_button=False,
-                             visible=True, interactive=True, show_label=False)
-                         # login_btn = gr.LoginButton(visible=False, interactive=True, elem_classes="btn_boderline")
-                         submit_button = gr.Button("Submit your feedback! 🚀", elem_classes="btn_boderline", visible=True,
-                             interactive=False)
-                         assignment = gr.Markdown("Model A: | Model B: ", elem_classes="markdown-text-tiny-red",
-                             visible=False)
-
-                 session_id = gr.Textbox(label="Session ID", placeholder="N/A.", type="text", elem_classes="",
-                     max_lines=10, show_copy_button=False, visible=False)
-
-                 def show_reason_and_submit(session_id, user_name_text, btn, request: gr.Request):
-
-                     if request.username is not None:
-                         user_name_text = request.username
-                     result_dict = {
-                         reason_textbox: {"visible": True, "__type__": "update"},
-                         submit_button: {"visible": True, "__type__": "update", "interactive": True},
-                         user_name: {"visible": True, "__type__": "update", "value": user_name_text},
-                     }
-                     if "Model A" in btn:
-                         choice = "Model A"
-                         result_dict.update({
-                             user_choice: {"value": f"Your choice: **{choice}**", "__type__": "update", "visible": True},
-                             btn_model_A: {"elem_classes": "btn_boderline_selected", "__type__": "update"},
-                             btn_model_B: {"elem_classes": "btn_boderline", "__type__": "update"},
-                             btn_tie: {"elem_classes": "btn_boderline", "__type__": "update"},
-                         })
-                     elif "Model B" in btn:
-                         choice = "Model B"
-                         result_dict.update({
-                             user_choice: {"value": f"Your choice: **{choice}**", "__type__": "update", "visible": True},
-                             btn_model_B: {"elem_classes": "btn_boderline_selected", "__type__": "update"},
-                             btn_model_A: {"elem_classes": "btn_boderline", "__type__": "update"},
-                             btn_tie: {"elem_classes": "btn_boderline", "__type__": "update"},
-                         })
-                     elif "Tie" in btn:
-                         choice = "Tie"
-                         result_dict.update({
-                             user_choice: {"value": f"Your choice: **{choice}**", "__type__": "update", "visible": True},
-                             btn_tie: {"elem_classes": "btn_boderline_selected", "__type__": "update"},
-                             btn_model_A: {"elem_classes": "btn_boderline", "__type__": "update"},
-                             btn_model_B: {"elem_classes": "btn_boderline", "__type__": "update"},
-                         })
-                     else:
-                         choice = "N/A"
-                         result_dict.update({
-                             user_choice: {"value": f"Your choice: **{choice}**", "__type__": "update", "visible": True},
-                         })
-                     return result_dict
-
-                 btn_model_A.click(show_reason_and_submit, inputs=[session_id, user_name, btn_model_A],
-                     outputs=[user_choice, reason_textbox, submit_button, user_name, btn_model_A, btn_tie,
-                         btn_model_B])
-                 btn_tie.click(show_reason_and_submit, inputs=[session_id, user_name, btn_tie],
-                     outputs=[user_choice, reason_textbox, submit_button, user_name, btn_model_A, btn_tie,
-                         btn_model_B])
-                 btn_model_B.click(show_reason_and_submit, inputs=[session_id, user_name, btn_model_B],
-                     outputs=[user_choice, reason_textbox, submit_button, user_name, btn_model_A, btn_tie,
-                         btn_model_B])
-
-                 def submit_feedback(session_id, user_reason, user_choice, user_name_text, assignment_string,
-                         request: gr.Request):
-                     if "N/A" in session_id or "N/A" in user_choice:
-                         # send a message to the user to sample an example and select a choice first
-                         return {
-                             submit_button: {"interactive": True, "__type__": "update",
-                                 "value": "Submit your feedback! 🚀 Please sample an example and select a choice!"},
-                         }
-                     # create a jsonl file and upload it to hf
-                     choice_str = ""
-                     if "Model A" in user_choice:
-                         choice_str = "Model A"
-                     elif "Model B" in user_choice:
-                         choice_str = "Model B"
-                     elif "Tie" in user_choice:
-                         choice_str = "Tie"
-                     else:
-                         choice_str = "N/A"
-                     if user_name_text == "" and request.username is None:
-                         user_name_text = "Anonymous"
-                     if request.username is not None:
-                         user_name_text = request.username
-                     feedback_item = {
-                         "session_id": session_id,
-                         "user_name": user_name_text,
-                         "user_reason": user_reason,
-                         "user_choice": choice_str,
-                         "ip": request.client.host,
-                         "assignment_string": assignment_string
-                     }
-                     jsonl_str = json.dumps(feedback_item)
-                     api = HfApi()
-                     token = os.getenv("HF_TOKEN")
-                     if token is None:
-                         raise ValueError(
-                             "Hugging Face token not found. Ensure the HF_TOKEN environment variable is set.")
-
-                     # Generate a random filename using UUID
-                     filename = f"{uuid.uuid4()}.json"
-
-                     # Define the repository
-                     repo_id = "WildEval/WildBench-HumanFeedback"
-
-                     # Upload the json_str as a file directly to the specified path in your dataset repository
-                     api.upload_file(
-                         token=token,
-                         repo_id=repo_id,
-                         repo_type="dataset",
-                         path_or_fileobj=jsonl_str.encode("utf-8"), # Convert string to bytes
-                         path_in_repo=filename,
-                         commit_message=f"Add user feedback for session_id: {session_id}. Assignment: {assignment_string}",
-                     )
-                     return {
-                         submit_button: {"interactive": False, "__type__": "update",
-                             "value": "Submitted! ✅ \n Please click 🔁 Next."},
-                         reason_textbox: {"interactive": False, "__type__": "update"},
-                         btn_model_A: {"interactive": False, "__type__": "update"},
-                         btn_tie: {"interactive": False, "__type__": "update"},
-                         btn_model_B: {"interactive": False, "__type__": "update"},
-                         user_name: {"interactive": False, "__type__": "update"},
-                         assignment: {"visible": True, "__type__": "update"}
-                     }
-
-                 def reset_submission(session_id):
-                     return {
-                         submit_button: {"interactive": False, "__type__": "update", "value": "Submit your feedback! 🚀"},
-                         reason_textbox: {"interactive": True, "__type__": "update", "value": ""},
-                         btn_model_A: {"interactive": True, "__type__": "update", "elem_classes": "btn_boderline_gray"},
-                         btn_tie: {"interactive": True, "__type__": "update", "elem_classes": "btn_boderline_gray"},
-                         btn_model_B: {"interactive": True, "__type__": "update", "elem_classes": "btn_boderline_gray"},
-                         user_name: {"interactive": True, "__type__": "update"},
-                         user_choice: {"value": "Your choice: N/A", "__type__": "update"},
-                         assignment: {"__type__": "update", "visible": False},
-                         gpt4_accordion: {"__type__": "update", "open": False},
-                     }
-
-                 # reset the reason_textbox, submit_button, and btn_model_A
-                 session_id.change(reset_submission, inputs=[session_id],
-                     outputs=[submit_button, reason_textbox, btn_model_A, btn_tie, btn_model_B, user_name,
-                         user_choice, assignment, gpt4_accordion])
-                 submit_button.click(submit_feedback,
-                     inputs=[session_id, reason_textbox, user_choice, user_name, assignment],
-                     outputs=[submit_button, reason_textbox, btn_model_A, btn_tie, btn_model_B,
-                         user_name, assignment])
-
-                 # Display chat history when button is clicked
-                 # TODO: add the model list and tag list
-                 btn_show_history.click(fn=display_chat_history, inputs=[selected_models, select_tasks],
-                     outputs=[session_id, user_intent, Chatbot_Common, Chatbot_A, Chatbot_B,
-                         gpt4_reason, checklist, assignment])
-                 btn_pass.click(fn=display_chat_history, inputs=[selected_models, select_tasks],
-                     outputs=[session_id, user_intent, Chatbot_Common, Chatbot_A, Chatbot_B, gpt4_reason,
-                         checklist,
-                         assignment]) # the pass button will be the same function of resampling
-
-             with gr.TabItem("📮 About Us", elem_id="od-benchmark-tab-table", id=3):
-                 gr.Markdown(ABOUT_MD, elem_classes="markdown-text")
-                 gr.Markdown(f"Last updated on **{LAST_UPDATED}**", elem_classes="markdown-text-small")
-
-         with gr.Row():
-             with gr.Accordion("📙 Citation", open=False, elem_classes="accordion-label"):
-                 gr.Textbox(
-                     value=CITATION_TEXT,
-                     lines=7,
-                     label="Copy the BibTeX snippet to cite this source",
-                     elem_id="citation-button",
-                     show_copy_button=True)
-                 # ).style(show_copy_button=True)
-
-     return demo
-
-
- if __name__ == "__main__":
-     parser = argparse.ArgumentParser()
-     parser.add_argument("--share", action="store_true")
-     parser.add_argument("--result_file", help="Path to results table", default="data_dir/elo_ranks.all.jsonl")
-     parser.add_argument("--length_balation_file", help="Path to results table",
-         default="data_dir/elo_ranks.length_ablation.all.jsonl")
-     parser.add_argument("--skip_empty_result_file", help="Path to results table",
-         default="data_dir/elo_ranks.skip_empty.all.jsonl")
-     parser.add_argument("--skip_empty_length_balation_file", help="Path to results table",
-         default="data_dir/elo_ranks.skip_empty.length_ablation.all.jsonl")
-     args = parser.parse_args()
-
-     LAST_UPDATED = datetime.fromtimestamp(Path(args.result_file).stat().st_mtime, tz=timezone.utc).strftime(
-         "%Y-%m-%d %H:%M:%S")
-
-     original_df = pd.read_json(args.result_file, lines=True)
-     ablation_df = pd.read_json(args.length_balation_file, lines=True)
-     skip_empty_original_df = pd.read_json(args.skip_empty_result_file, lines=True)
-     skip_empty_ablation_df = pd.read_json(args.skip_empty_length_balation_file, lines=True)
-
-     # available_models = sorted(list(set(list(original_df["model name "]))))
-     available_models = list(model_info.keys())
-     # remove the rows where the model name is not in the available_models
-     original_df = original_df[original_df["model name "].isin(available_models)]
-     ablation_df = ablation_df[ablation_df["model name "].isin(available_models)]
-     skip_empty_ablation_df = skip_empty_ablation_df[skip_empty_ablation_df["model name "].isin(available_models)]
-     skip_empty_original_df = skip_empty_original_df[skip_empty_original_df["model name "].isin(available_models)]
-
-     model_len_info = json.load(open("model_len_info.json", "r"))
-
-     original_df = post_processing(original_df, model_len_info)
-     ablation_df = post_processing(ablation_df, model_len_info)
-     skip_empty_original_df = post_processing(skip_empty_original_df, model_len_info)
-     skip_empty_ablation_df = post_processing(skip_empty_ablation_df, model_len_info)
-
-     TYPES = ["markdown", "number"]
-
-     demo = build_demo(TYPES)
-     demo.launch(share=args.share, height=1000)