Ahmad Shallouf committed
Commit 7a759bb · 1 Parent(s): 5eb3c1d

deleted ignored file

Files changed (1)
  1. app_wildbench.py +0 -526
app_wildbench.py DELETED
@@ -1,526 +0,0 @@
- """A gradio app that renders a static leaderboard. This is used for Hugging Face Space."""
- import ast
- import argparse
- import glob
- import pickle
- import plotly
- import gradio as gr
- import numpy as np
- import pandas as pd
- import gradio as gr
- import pandas as pd
- from pathlib import Path
- import json
- from constants import BANNER, CITATION_TEXT, WINRATE_HEATMAP, css, js_code, all_task_types, DEFAULT_LP, TASK_TYPE_STR, \
-     js_light
- from datetime import datetime, timezone
- from data_utils import load_eval_results, sample_an_eval_result, apply_length_penalty, post_processing, add_winrates, \
-     add_winrates_tasks
- # from gradio.themes.utils import colors, fonts, sizes
- from themes import Seafoam
- from huggingface_hub import HfApi
- # from datasets import Dataset, load_dataset, concatenate_datasets
- import os, uuid
- from utils_display import model_info
-
- # get the last updated time from the elo_ranks.all.jsonl file
- LAST_UPDATED = None
- with open("_intro.md", "r") as f:
-     INTRO_MD = f.read()
-
- with open("_about_us.md", "r") as f:
-     ABOUT_MD = f.read()
-
- with open("_header.md", "r") as f:
-     HEADER_MD = f.read()
-
- original_df, ablation_df = None, None
- eval_results = load_eval_results()
-
- available_models = [] # to be filled in later
-
-
- def display_chat_history(model_selections, task_selections):
-     eval_item = sample_an_eval_result(eval_results, model_selections, task_selections)
-     session_id = eval_item["session_id"]
-     chats = [x["content"] for x in eval_item['conversation_input']]
-     # form a list of tuples of two adjacent messages in chats
-     chats_common = chats[:] + [None]
-     # chats_modelA = ["Model A Output"] + [eval_item["model_A_output"]]
-     # chats_modelB = ["Model B Output"] + [eval_item["model_B_output"]]
-     chats_modelA = [None] + [eval_item["model_A_output"]]
-     chats_modelB = [None] + [eval_item["model_B_output"]]
-     message_history_common = [(chats_common[i], chats_common[i + 1]) for i in range(0, len(chats_common) - 1, 2)]
-     message_history_model_A = [(chats_modelA[i], chats_modelA[i + 1]) for i in range(0, len(chats_modelA) - 1, 2)]
-     message_history_model_B = [(chats_modelB[i], chats_modelB[i + 1]) for i in range(0, len(chats_modelB) - 1, 2)]
-     checklist_string = ""
-     for item in eval_item["checklist"]:
-         checklist_string += f"1. {item}\n"
-     list_reasons = eval_item["reason"].strip().split(". ")
-     # remove the last one if it is empty
-     if list_reasons[-1] == "":
-         list_reasons = list_reasons[:-1]
-     list_reasons = "\n".join([f"- {item}." for item in list_reasons])
-     gpt4_reason = f"### Choice: {eval_item['choice']}. Reason: ⬇️\n" + list_reasons
-     assignment_string = f"Model A: {eval_item['model_A']} | Model B: {eval_item['model_B']}"
-     user_intent = f"- 🆔: `{session_id}` \n- 💬 **User Intent:** {eval_item['intent']} \n- ⚙️ **Task category**: {', '.join(eval_item['all_tags'])}"
-     return session_id, user_intent, message_history_common, message_history_model_A, message_history_model_B, gpt4_reason, checklist_string, assignment_string
-
-
- def slider_change_main(length_penalty):
-     global original_df, ablation_df
-     adjusted_df = apply_length_penalty(original_df, ablation_df, length_penalty)
-     adjusted_df = adjusted_df[["Model", "Overall Elo", "Task-Avg Elo", "# battles", "Length"]]
-     adjusted_df = adjusted_df.sort_values(by="Overall Elo", ascending=False)
-     adjusted_df = add_winrates(adjusted_df)
-     adjusted_df = adjusted_df.drop(columns=["Length"])
-     return adjusted_df
-
-
- def slider_change_full(length_penalty, show_winrate):
-     global original_df, ablation_df
-     adjusted_df = apply_length_penalty(original_df, ablation_df, length_penalty)
-     # sort the model by the "Task-Avg Elo" column
-     adjusted_df = adjusted_df.sort_values(by="Task-Avg Elo", ascending=False)
-     adjusted_df.drop(columns=["Overall Elo", "Task-Avg Elo", "# battles", "Length"], inplace=True)
-     if show_winrate == "none":
-         return adjusted_df
-     elif show_winrate == "gpt-3.5":
-         adjusted_df = add_winrates_tasks(adjusted_df, ref="gpt-3.5")
-     elif show_winrate == "gpt-4":
-         adjusted_df = add_winrates_tasks(adjusted_df, ref="gpt-4")
-     return adjusted_df
-
-
- seafoam = Seafoam()
-
-
- def build_demo(TYPES):
-     global original_df, ablation_df, skip_empty_original_df, skip_empty_ablation_df, available_models
-     with gr.Blocks(theme=gr.themes.Soft(), css=css, js=js_light) as demo:
-         # with gr.Blocks(theme=seafoam, css=css) as demo:
-         gr.HTML(BANNER, elem_id="banner")
-         # gr.Markdown("### Work in progress. Please do not share.", elem_classes="markdown-text") # TODO: remove this later.
-         gr.Markdown(HEADER_MD, elem_classes="markdown-text")
-
-         with gr.Tabs(elem_classes="tab-buttons") as tabs:
-             with gr.TabItem("🏅 Leaderboard", elem_id="od-benchmark-tab-table", id=0):
-                 gr.Markdown(
-                     f"**Version**: WildBench (v1.0; 2024.03.07) | **# Examples**: 1024 | **# Models**: {len(available_models)} | **# Comparisons**: 26k",
-                     elem_classes="markdown-text")
-
-                 with gr.TabItem("Main Table", elem_id="od-benchmark-tab-table-ablation", id=0, elem_classes="subtab"):
-                     # original_df, ablation_df = skip_empty_original_df, skip_empty_ablation_df
-                     default_main_df = apply_length_penalty(original_df, ablation_df, length_penalty=DEFAULT_LP)
-                     default_main_df = default_main_df[["Model", "Overall Elo", "Task-Avg Elo", "# battles", "Length"]]
-                     default_main_df = add_winrates(default_main_df)
-                     default_main_df = default_main_df.drop(columns=["Length"])
-                     # TODO: add the win rate for GPT-4 and GPT-3.5T
-                     with gr.Row():
-                         with gr.Column(scale=4):
-                             gr.Markdown(
-                                 "**Overall Elo**: [Standard Elo rating with boostrap.](https://en.wikipedia.org/wiki/Elo_rating_system). | **Task-Avg Elo**: Compute Elo on subsets of each task type and then take avg. | **Win Rates**: [Estimated by Elo differences](https://www.hexwiki.net/index.php/Elo_rating#Definition). | **Length penalty**: Models w/ longer outputs are penalized. (Plz check 📖 **Details**.)",
-                                 elem_classes="markdown-text-small top-left-LP")
-                         with gr.Column(scale=0.8):
-                             length_penlty_slider = gr.Slider(minimum=0.1, maximum=1, step=0.1, value=DEFAULT_LP,
-                                 label="Length Penalty", elem_id="length-penalty-slider")
-                     # checkbox_skip_empty = gr.Checkbox(label="Skip empty results", value=False, elem_id="skip-empty-checkbox", scale=2)
-                     leaderboard_table = gr.components.Dataframe(
-                         value=default_main_df,
-                         datatype=TYPES,
-                         # max_rows=None,
-                         height=1000,
-                         elem_id="leaderboard-table",
-                         interactive=False,
-                         visible=True,
-                         min_width=60,
-                     )
-                     length_penlty_slider.change(fn=slider_change_main, inputs=[length_penlty_slider],
-                         outputs=[leaderboard_table])
-
-                 with gr.TabItem("All Tasks (Win% vs GPT-3.5T)", elem_id="od-benchmark-tab-table-ablation", id=1):
-                     with gr.Row():
-                         with gr.Column(scale=4):
-                             gr.Markdown(TASK_TYPE_STR, elem_classes="markdown-text-small top-left-LP")
-                         with gr.Column(scale=0.8):
-                             length_penlty_slider_full = gr.Slider(minimum=0.1, maximum=1, step=0.1, value=DEFAULT_LP,
-                                 label="Length Penalty",
-                                 elem_id="length-penalty-slider")
-                     default_full_df = apply_length_penalty(original_df, ablation_df, length_penalty=DEFAULT_LP)
-                     # do not show the "# battles" column here
-                     default_full_df = default_full_df.drop(
-                         columns=["Overall Elo", "Task-Avg Elo", "# battles", "Length"])
-                     default_full_df = add_winrates_tasks(default_full_df, ref="gpt-3.5")
-
-                     leaderboard_table_full = gr.components.Dataframe(
-                         value=default_full_df,
-                         datatype=TYPES,
-                         # max_rows=None,
-                         height=1000,
-                         elem_id="leaderboard-table-full_table",
-                         interactive=False,
-                         visible=True,
-                         min_width=60,
-                     )
-                     show_winrate = gr.Checkbox(value="gpt-3.5", visible=False)
-                     length_penlty_slider_full.change(fn=slider_change_full,
-                         inputs=[length_penlty_slider_full, show_winrate],
-                         outputs=[leaderboard_table_full])
-
-                 with gr.TabItem("All Tasks (Win% vs GPT-4)", elem_id="od-benchmark-tab-table-ablation", id=2):
-                     with gr.Row():
-                         with gr.Column(scale=4):
-                             gr.Markdown(TASK_TYPE_STR, elem_classes="markdown-text-small top-left-LP")
-                         with gr.Column(scale=0.8):
-                             length_penlty_slider_full = gr.Slider(minimum=0.1, maximum=1, step=0.1, value=DEFAULT_LP,
-                                 label="Length Penalty",
-                                 elem_id="length-penalty-slider")
-                     default_full_df = apply_length_penalty(original_df, ablation_df, length_penalty=DEFAULT_LP)
-                     # do not show the "# battles" column here
-                     default_full_df = default_full_df.drop(
-                         columns=["Overall Elo", "Task-Avg Elo", "# battles", "Length"])
-                     default_full_df = add_winrates_tasks(default_full_df, ref="gpt-4")
-                     leaderboard_table_full = gr.components.Dataframe(
-                         value=default_full_df,
-                         datatype=TYPES,
-                         # max_rows=None,
-                         height=1000,
-                         elem_id="leaderboard-table-full_table",
-                         interactive=False,
-                         visible=True,
-                         min_width=60,
-                     )
-                     show_winrate = gr.Checkbox(value="gpt-4", visible=False)
-                     length_penlty_slider_full.change(fn=slider_change_full,
-                         inputs=[length_penlty_slider_full, show_winrate],
-                         outputs=[leaderboard_table_full])
-
-                 with gr.TabItem("All Tasks (Elo)", elem_id="od-benchmark-tab-table-ablation", id=3):
-                     with gr.Row():
-                         with gr.Column(scale=4):
-                             gr.Markdown(TASK_TYPE_STR, elem_classes="markdown-text-small top-left-LP")
-                         with gr.Column(scale=0.8):
-                             length_penlty_slider_full = gr.Slider(minimum=0.1, maximum=1, step=0.1, value=DEFAULT_LP,
-                                 label="Length Penalty",
-                                 elem_id="length-penalty-slider")
-                     default_full_df = apply_length_penalty(original_df, ablation_df, length_penalty=DEFAULT_LP)
-                     # do not show the "# battles" column here
-                     default_full_df = default_full_df.drop(
-                         columns=["Overall Elo", "Task-Avg Elo", "# battles", "Length"])
-                     leaderboard_table_full = gr.components.Dataframe(
-                         value=default_full_df,
-                         datatype=TYPES,
-                         # max_rows=None,
-                         height=1000,
-                         elem_id="leaderboard-table-full_table",
-                         interactive=False,
-                         visible=True,
-                         min_width=60,
-                     )
-                     show_winrate = gr.Checkbox(value="none", visible=False)
-                     length_penlty_slider_full.change(fn=slider_change_full,
-                         inputs=[length_penlty_slider_full, show_winrate],
-                         outputs=[leaderboard_table_full])
-
-                 # with gr.TabItem("Pairwise Win Rates", elem_id="od-benchmark-tab-table-ablation", id=4):
-                 # # TODO: show all winrate
-                 # # winrates_heatmap = pickle.load(open("data_dir/pairwise_win_fractions.pkl", "rb"))
-                 # # gr.Plot(value=winrates_heatmap, scale=2, min_width=800, container=False, elem_classes="plotly-plot", visible=True)
-                 # gr.HTML(WINRATE_HEATMAP, visible=True)
-
-             with gr.TabItem("📖 Details", elem_id="od-benchmark-tab-table", id=1):
-                 gr.Markdown(INTRO_MD, elem_classes="markdown-text-details")
-
-             with gr.TabItem("🔍 Explore | 🆚 Evaluate", elem_id="od-benchmark-tab-table", id=2):
-
-                 with gr.Row():
-                     btn_show_history = gr.Button("🎲 Click here to sample an example + a pair of LLM outputs! ",
-                         elem_classes="sample_button")
-
-                 with gr.Row():
-                     with gr.Column(scale=1.5):
-                         with gr.Accordion("Choose models to sample from", open=False, elem_classes="accordion-label"):
-                             model_options = available_models
-                             selected_models = gr.CheckboxGroup(model_options, info="", value=model_options,
-                                 show_label=False, elem_id="select-models")
-                             clear_button = gr.Button("Clear", elem_classes="btn_boderline_gray", scale=1)
-                             # clear the selected_models
-                             clear_button.click(lambda: {selected_models: {"value": [], "__type__": "update"}},
-                                 inputs=[], outputs=[selected_models])
-                     with gr.Column(scale=1):
-                         with gr.Accordion("Choose task types to sample from", open=False,
-                                 elem_classes="accordion-label"):
-                             select_tasks = gr.CheckboxGroup(all_task_types, info="", value=all_task_types,
-                                 show_label=False, elem_id="select-tasks")
-                             clear_task_button = gr.Button("Clear", elem_classes="btn_boderline_gray", scale=1)
-                             # clear the select_tasks
-                             clear_task_button.click(lambda: {select_tasks: {"value": [], "__type__": "update"}},
-                                 inputs=[], outputs=[select_tasks])
-
-                 with gr.Row():
-                     with gr.Column():
-                         gr.Markdown("## 📢 Chat History", elem_classes="markdown-text")
-                         Chatbot_Common = gr.Chatbot(avatar_images=["human_icon.jpeg", "ai_icon.png"], height="auto",
-                             container=False, label="Common Chat History", likeable=False,
-                             show_share_button=False, show_label=True,
-                             elem_classes="chat-common", layout="bubble")
-                         Chatbot_Common.change(lambda x: x, inputs=[], outputs=[], scroll_to_output=False, js=js_code)
-                         with gr.Accordion("✍️ Task Annotation", elem_classes="accordion-label", open=False):
-                             user_intent = gr.Markdown("", elem_classes="markdown-text-small")
-                 # two columns for the two models
-                 with gr.Row():
-                     # https://www.gradio.app/docs/chatbot
-                     with gr.Column():
-                         gr.Markdown("## ⬅️ Model A Output", elem_classes="markdown-text")
-                         Chatbot_A = gr.Chatbot(height="auto", container=False, label="Model A Output", likeable=False,
-                             show_share_button=False, show_label=True, elem_classes="chat-specific",
-                             layout="bubble")
-                         Chatbot_A.change(lambda x: x, inputs=[], outputs=[], scroll_to_output=False, js=js_code)
-                     with gr.Column():
-                         # add a Markdown to show this is for Model B
-                         gr.Markdown("## ➡️ Model B Output", elem_classes="markdown-text")
-                         Chatbot_B = gr.Chatbot(height="auto", container=False, label="Model B Output", likeable=False,
-                             show_share_button=False, show_label=True, elem_classes="chat-specific",
-                             layout="bubble")
-                         Chatbot_B.change(lambda x: x, inputs=[], outputs=[], scroll_to_output=False, js=js_code)
-                 with gr.Row():
-                     # Here we can show the GPT-4 judgement for the model outputs
-                     # show a textarea
-                     with gr.Column():
-                         with gr.Accordion("⏱️ Checklist", open=False, elem_classes="accordion-label"):
-                             checklist = gr.Markdown("### Checklist: \n Will be shown later.",
-                                 elem_classes="markdown-text-tiny")
-                         with gr.Accordion("⚖️ GPT-4 Judgement", open=False,
-                                 elem_classes="accordion-label") as gpt4_accordion:
-                             # gpt4_reason = gr.TextArea(label="GPT-4 Judgement", placeholder="Will be shown later.", type="text", elem_classes="", max_lines=10, show_copy_button=True)
-                             gpt4_reason = gr.Markdown("Will be shown later.", elem_classes="markdown-text-tiny")
-
-                 with gr.Row():
-                     # show buttons for user to choose which model output is better or Tie
-                     btn_model_A = gr.Button("⬅️ Model A is better! ", elem_classes="btn_boderline_gray", scale=2,
-                         interactive=False)
-                     btn_tie = gr.Button("🟰 Tie", elem_classes="btn_boderline_gray", scale=2, interactive=False)
-                     btn_model_B = gr.Button("➡️ Model B is better!", elem_classes="btn_boderline_gray", scale=2,
-                         interactive=False)
-                 with gr.Row():
-                     with gr.Column(scale=2):
-                         reason_textbox = gr.Textbox(label="Reason", placeholder="Please input your reason here.",
-                             type="text", elem_classes="", max_lines=10, lines=8,
-                             show_copy_button=False, visible=True, scale=4, interactive=True)
-                     with gr.Column():
-                         with gr.Row():
-                             user_choice = gr.Markdown("Your choice: N/A", elem_classes="markdown-text", visible=True)
-                             btn_pass = gr.Button("🔁 Next", elem_classes="btn_boderline_next", scale=1)
-                         user_name = gr.Textbox(label="Your HF Username", placeholder="Your HuggingFace username",
-                             type="text", elem_classes="", max_lines=1, show_copy_button=False,
-                             visible=True, interactive=True, show_label=False)
-                         # login_btn = gr.LoginButton(visible=False, interactive=True, elem_classes="btn_boderline")
-                         submit_button = gr.Button("Submit your feedback! 🚀", elem_classes="btn_boderline", visible=True,
-                             interactive=False)
-                         assignment = gr.Markdown("Model A: | Model B: ", elem_classes="markdown-text-tiny-red",
-                             visible=False)
-
-                 session_id = gr.Textbox(label="Session ID", placeholder="N/A.", type="text", elem_classes="",
-                     max_lines=10, show_copy_button=False, visible=False)
-
-                 def show_reason_and_submit(session_id, user_name_text, btn, request: gr.Request):
-
-                     if request.username is not None:
-                         user_name_text = request.username
-                     result_dict = {
-                         reason_textbox: {"visible": True, "__type__": "update"},
-                         submit_button: {"visible": True, "__type__": "update", "interactive": True},
-                         user_name: {"visible": True, "__type__": "update", "value": user_name_text},
-                     }
-                     if "Model A" in btn:
-                         choice = "Model A"
-                         result_dict.update({
-                             user_choice: {"value": f"Your choice: **{choice}**", "__type__": "update", "visible": True},
-                             btn_model_A: {"elem_classes": "btn_boderline_selected", "__type__": "update"},
-                             btn_model_B: {"elem_classes": "btn_boderline", "__type__": "update"},
-                             btn_tie: {"elem_classes": "btn_boderline", "__type__": "update"},
-                         })
-                     elif "Model B" in btn:
-                         choice = "Model B"
-                         result_dict.update({
-                             user_choice: {"value": f"Your choice: **{choice}**", "__type__": "update", "visible": True},
-                             btn_model_B: {"elem_classes": "btn_boderline_selected", "__type__": "update"},
-                             btn_model_A: {"elem_classes": "btn_boderline", "__type__": "update"},
-                             btn_tie: {"elem_classes": "btn_boderline", "__type__": "update"},
-                         })
-                     elif "Tie" in btn:
-                         choice = "Tie"
-                         result_dict.update({
-                             user_choice: {"value": f"Your choice: **{choice}**", "__type__": "update", "visible": True},
-                             btn_tie: {"elem_classes": "btn_boderline_selected", "__type__": "update"},
-                             btn_model_A: {"elem_classes": "btn_boderline", "__type__": "update"},
-                             btn_model_B: {"elem_classes": "btn_boderline", "__type__": "update"},
-                         })
-                     else:
-                         choice = "N/A"
-                         result_dict.update({
-                             user_choice: {"value": f"Your choice: **{choice}**", "__type__": "update", "visible": True},
-                         })
-                     return result_dict
-
-                 btn_model_A.click(show_reason_and_submit, inputs=[session_id, user_name, btn_model_A],
-                     outputs=[user_choice, reason_textbox, submit_button, user_name, btn_model_A, btn_tie,
-                         btn_model_B])
-                 btn_tie.click(show_reason_and_submit, inputs=[session_id, user_name, btn_tie],
-                     outputs=[user_choice, reason_textbox, submit_button, user_name, btn_model_A, btn_tie,
-                         btn_model_B])
-                 btn_model_B.click(show_reason_and_submit, inputs=[session_id, user_name, btn_model_B],
-                     outputs=[user_choice, reason_textbox, submit_button, user_name, btn_model_A, btn_tie,
-                         btn_model_B])
-
-                 def submit_feedback(session_id, user_reason, user_choice, user_name_text, assignment_string,
-                         request: gr.Request):
-                     if "N/A" in session_id or "N/A" in user_choice:
-                         # send a message to the user to sample an example and select a choice first
-                         return {
-                             submit_button: {"interactive": True, "__type__": "update",
-                                 "value": "Submit your feedback! 🚀 Please sample an example and select a choice!"},
-                         }
-                     # create a jsonl file and upload it to hf
-                     choice_str = ""
-                     if "Model A" in user_choice:
-                         choice_str = "Model A"
-                     elif "Model B" in user_choice:
-                         choice_str = "Model B"
-                     elif "Tie" in user_choice:
-                         choice_str = "Tie"
-                     else:
-                         choice_str = "N/A"
-                     if user_name_text == "" and request.username is None:
-                         user_name_text = "Anonymous"
-                     if request.username is not None:
-                         user_name_text = request.username
-                     feedback_item = {
-                         "session_id": session_id,
-                         "user_name": user_name_text,
-                         "user_reason": user_reason,
-                         "user_choice": choice_str,
-                         "ip": request.client.host,
-                         "assignment_string": assignment_string
-                     }
-                     jsonl_str = json.dumps(feedback_item)
-                     api = HfApi()
-                     token = os.getenv("HF_TOKEN")
-                     if token is None:
-                         raise ValueError(
-                             "Hugging Face token not found. Ensure the HF_TOKEN environment variable is set.")
-
-                     # Generate a random filename using UUID
-                     filename = f"{uuid.uuid4()}.json"
-
-                     # Define the repository
-                     repo_id = "WildEval/WildBench-HumanFeedback"
-
-                     # Upload the json_str as a file directly to the specified path in your dataset repository
-                     api.upload_file(
-                         token=token,
-                         repo_id=repo_id,
-                         repo_type="dataset",
-                         path_or_fileobj=jsonl_str.encode("utf-8"), # Convert string to bytes
-                         path_in_repo=filename,
-                         commit_message=f"Add user feedback for session_id: {session_id}. Assignment: {assignment_string}",
-                     )
-                     return {
-                         submit_button: {"interactive": False, "__type__": "update",
-                             "value": "Submitted! ✅ \n Please click 🔁 Next."},
-                         reason_textbox: {"interactive": False, "__type__": "update"},
-                         btn_model_A: {"interactive": False, "__type__": "update"},
-                         btn_tie: {"interactive": False, "__type__": "update"},
-                         btn_model_B: {"interactive": False, "__type__": "update"},
-                         user_name: {"interactive": False, "__type__": "update"},
-                         assignment: {"visible": True, "__type__": "update"}
-                     }
-
-                 def reset_submission(session_id):
-                     return {
-                         submit_button: {"interactive": False, "__type__": "update", "value": "Submit your feedback! 🚀"},
-                         reason_textbox: {"interactive": True, "__type__": "update", "value": ""},
-                         btn_model_A: {"interactive": True, "__type__": "update", "elem_classes": "btn_boderline_gray"},
-                         btn_tie: {"interactive": True, "__type__": "update", "elem_classes": "btn_boderline_gray"},
-                         btn_model_B: {"interactive": True, "__type__": "update", "elem_classes": "btn_boderline_gray"},
-                         user_name: {"interactive": True, "__type__": "update"},
-                         user_choice: {"value": "Your choice: N/A", "__type__": "update"},
-                         assignment: {"__type__": "update", "visible": False},
-                         gpt4_accordion: {"__type__": "update", "open": False},
-                     }
-
-                 # reset the reason_textbox, submit_button, and btn_model_A
-                 session_id.change(reset_submission, inputs=[session_id],
-                     outputs=[submit_button, reason_textbox, btn_model_A, btn_tie, btn_model_B, user_name,
-                         user_choice, assignment, gpt4_accordion])
-                 submit_button.click(submit_feedback,
-                     inputs=[session_id, reason_textbox, user_choice, user_name, assignment],
-                     outputs=[submit_button, reason_textbox, btn_model_A, btn_tie, btn_model_B,
-                         user_name, assignment])
-
-                 # Display chat history when button is clicked
-                 # TODO: add the model list and tag list
-                 btn_show_history.click(fn=display_chat_history, inputs=[selected_models, select_tasks],
-                     outputs=[session_id, user_intent, Chatbot_Common, Chatbot_A, Chatbot_B,
-                         gpt4_reason, checklist, assignment])
-                 btn_pass.click(fn=display_chat_history, inputs=[selected_models, select_tasks],
-                     outputs=[session_id, user_intent, Chatbot_Common, Chatbot_A, Chatbot_B, gpt4_reason,
-                         checklist,
-                         assignment]) # the pass button will be the same function of resampling
-
-             with gr.TabItem("📮 About Us", elem_id="od-benchmark-tab-table", id=3):
-                 gr.Markdown(ABOUT_MD, elem_classes="markdown-text")
-                 gr.Markdown(f"Last updated on **{LAST_UPDATED}**", elem_classes="markdown-text-small")
-
-         with gr.Row():
-             with gr.Accordion("📙 Citation", open=False, elem_classes="accordion-label"):
-                 gr.Textbox(
-                     value=CITATION_TEXT,
-                     lines=7,
-                     label="Copy the BibTeX snippet to cite this source",
-                     elem_id="citation-button",
-                     show_copy_button=True)
-                 # ).style(show_copy_button=True)
-
-     return demo
-
-
- if __name__ == "__main__":
-     parser = argparse.ArgumentParser()
-     parser.add_argument("--share", action="store_true")
-     parser.add_argument("--result_file", help="Path to results table", default="data_dir/elo_ranks.all.jsonl")
-     parser.add_argument("--length_balation_file", help="Path to results table",
-         default="data_dir/elo_ranks.length_ablation.all.jsonl")
-     parser.add_argument("--skip_empty_result_file", help="Path to results table",
-         default="data_dir/elo_ranks.skip_empty.all.jsonl")
-     parser.add_argument("--skip_empty_length_balation_file", help="Path to results table",
-         default="data_dir/elo_ranks.skip_empty.length_ablation.all.jsonl")
-     args = parser.parse_args()
-
-     LAST_UPDATED = datetime.fromtimestamp(Path(args.result_file).stat().st_mtime, tz=timezone.utc).strftime(
-         "%Y-%m-%d %H:%M:%S")
-
-     original_df = pd.read_json(args.result_file, lines=True)
-     ablation_df = pd.read_json(args.length_balation_file, lines=True)
-     skip_empty_original_df = pd.read_json(args.skip_empty_result_file, lines=True)
-     skip_empty_ablation_df = pd.read_json(args.skip_empty_length_balation_file, lines=True)
-
-     # available_models = sorted(list(set(list(original_df["model name "]))))
-     available_models = list(model_info.keys())
-     # remove the rows where the model name is not in the available_models
-     original_df = original_df[original_df["model name "].isin(available_models)]
-     ablation_df = ablation_df[ablation_df["model name "].isin(available_models)]
-     skip_empty_ablation_df = skip_empty_ablation_df[skip_empty_ablation_df["model name "].isin(available_models)]
-     skip_empty_original_df = skip_empty_original_df[skip_empty_original_df["model name "].isin(available_models)]
-
-     model_len_info = json.load(open("model_len_info.json", "r"))
-
-     original_df = post_processing(original_df, model_len_info)
-     ablation_df = post_processing(ablation_df, model_len_info)
-     skip_empty_original_df = post_processing(skip_empty_original_df, model_len_info)
-     skip_empty_ablation_df = post_processing(skip_empty_ablation_df, model_len_info)
-
-     TYPES = ["markdown", "number"]
-
-     demo = build_demo(TYPES)
-     demo.launch(share=args.share, height=1000)