Spaces:

ahmadshalloufuhh
/

CompUGE

Paused

App Files Files Community

Ahmad Shallouf committed on Mar 24, 2024

Commit

2df6a4f

1 Parent(s): b766929

added initial design

Browse files

Files changed (9) hide show

.gitignore +1 -0
.idea/.gitignore +2 -0
.idea/ComparativeQA-Benchmark.iml +3 -1
.idea/misc.xml +3 -0
CQI_Leaderboard.csv +3 -0
DataProcessing.ipynb +90 -0
__pycache__/app.cpython-39.pyc +0 -0
app.py +46 -4
app_wildbench.py +526 -0

.gitignore ADDED Viewed

	@@ -0,0 +1 @@


1	+ /venv/

.idea/.gitignore CHANGED Viewed

@@ -6,3 +6,5 @@
 # Datasource local storage ignored files
 /dataSources/
 /dataSources.local.xml

 # Datasource local storage ignored files
 /dataSources/
 /dataSources.local.xml
+# GitHub Copilot persisted chat sessions
+/copilot/chatSessions

.idea/ComparativeQA-Benchmark.iml CHANGED Viewed

@@ -1,7 +1,9 @@
 <?xml version="1.0" encoding="UTF-8"?>
 <module type="PYTHON_MODULE" version="4">
   <component name="NewModuleRootManager">
-    <content url="file://$MODULE_DIR$" />
     <orderEntry type="jdk" jdkName="Python 3.9" jdkType="Python SDK" />
     <orderEntry type="sourceFolder" forTests="false" />
   </component>

 <?xml version="1.0" encoding="UTF-8"?>
 <module type="PYTHON_MODULE" version="4">
   <component name="NewModuleRootManager">
+    <content url="file://$MODULE_DIR$">
+      <excludeFolder url="file://$MODULE_DIR$/.idea/copilot/chatSessions" />
+    </content>
     <orderEntry type="jdk" jdkName="Python 3.9" jdkType="Python SDK" />
     <orderEntry type="sourceFolder" forTests="false" />
   </component>

.idea/misc.xml CHANGED Viewed

@@ -4,4 +4,7 @@
     <option name="sdkName" value="Python 3.9 (CSI)" />
   </component>
   <component name="ProjectRootManager" version="2" project-jdk-name="Python 3.9" project-jdk-type="Python SDK" />
 </project>

     <option name="sdkName" value="Python 3.9 (CSI)" />
   </component>
   <component name="ProjectRootManager" version="2" project-jdk-name="Python 3.9" project-jdk-type="Python SDK" />
+  <component name="PythonCompatibilityInspectionAdvertiser">
+    <option name="version" value="3" />
+  </component>
 </project>

CQI_Leaderboard.csv ADDED Viewed

	@@ -0,0 +1,3 @@

+Model,Accuracy,Precision,Recall,F1 Score,Evaluation Time,Overall Score
+Dummy,0.5,0.5,0.5,0.5,0.5,0.5
+Dummy2,0.6,0.6,0.6,0.6,0.6,0.6

DataProcessing.ipynb ADDED Viewed

	@@ -0,0 +1,90 @@

+{
+ "cells": [
+  {
+   "cell_type": "code",
+   "execution_count": 3,
+   "id": "initial_id",
+   "metadata": {
+    "collapsed": true,
+    "ExecuteTime": {
+     "end_time": "2024-03-24T11:48:41.895997Z",
+     "start_time": "2024-03-24T11:48:41.863555Z"
+    }
+   },
+   "outputs": [
+    {
+     "data": {
+      "text/plain": "    Model  Accuracy  Precision  Recall  F1 Score  Evaluation Time  \\\n0   Dummy       0.5        0.5     0.5       0.5              0.5   \n0  Dummy2       0.6        0.6     0.6       0.6              0.6   \n\n   Overall Score  \n0            0.5  \n0            0.6  ",
+      "text/html": "<div>\n<style scoped>\n    .dataframe tbody tr th:only-of-type {\n        vertical-align: middle;\n    }\n\n    .dataframe tbody tr th {\n        vertical-align: top;\n    }\n\n    .dataframe thead th {\n        text-align: right;\n    }\n</style>\n<table border=\"1\" class=\"dataframe\">\n  <thead>\n    <tr style=\"text-align: right;\">\n      <th></th>\n      <th>Model</th>\n      <th>Accuracy</th>\n      <th>Precision</th>\n      <th>Recall</th>\n      <th>F1 Score</th>\n      <th>Evaluation Time</th>\n      <th>Overall Score</th>\n    </tr>\n  </thead>\n  <tbody>\n    <tr>\n      <th>0</th>\n      <td>Dummy</td>\n      <td>0.5</td>\n      <td>0.5</td>\n      <td>0.5</td>\n      <td>0.5</td>\n      <td>0.5</td>\n      <td>0.5</td>\n    </tr>\n    <tr>\n      <th>0</th>\n      <td>Dummy2</td>\n      <td>0.6</td>\n      <td>0.6</td>\n      <td>0.6</td>\n      <td>0.6</td>\n      <td>0.6</td>\n      <td>0.6</td>\n    </tr>\n  </tbody>\n</table>\n</div>"
+     },
+     "execution_count": 3,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "import pandas as pd\n",
+    "import numpy as np\n",
+    "\n",
+    "# Build a dataframe with Model, Accuracy, Precision, Recall, F1 Score, Evaluation Time, Overall Score\n",
+    "\n",
+    "model_results = pd.DataFrame(columns=['Model', 'Accuracy', 'Precision', 'Recall', 'F1 Score', 'Evaluation Time', 'Overall Score'])\n",
+    "\n",
+    "# Add dummy data using concat\n",
+    "model_results = pd.concat([model_results, pd.DataFrame([['Dummy', 0.5, 0.5, 0.5, 0.5, 0.5, 0.5]], columns=['Model', 'Accuracy', 'Precision', 'Recall', 'F1 Score', 'Evaluation Time', 'Overall Score'])])\n",
+    "\n",
+    "# add more dummy data\n",
+    "model_results = pd.concat([model_results, pd.DataFrame([['Dummy2', 0.6, 0.6, 0.6, 0.6, 0.6, 0.6]], columns=['Model', 'Accuracy', 'Precision', 'Recall', 'F1 Score', 'Evaluation Time', 'Overall Score'])])\n",
+    "\n",
+    "model_results"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "outputs": [],
+   "source": [
+    "# Save the model results to a csv file\n",
+    "model_results.to_csv('CQI_Leaderboard.csv', index=False)"
+   ],
+   "metadata": {
+    "collapsed": false,
+    "ExecuteTime": {
+     "end_time": "2024-03-24T11:49:23.687615Z",
+     "start_time": "2024-03-24T11:49:23.602354Z"
+    }
+   },
+   "id": "d6d288e1af91dd1d",
+   "execution_count": 4
+  },
+  {
+   "cell_type": "code",
+   "outputs": [],
+   "source": [],
+   "metadata": {
+    "collapsed": false
+   },
+   "id": "f164c55726b7cbaf"
+  }
+ ],
+ "metadata": {
+  "kernelspec": {
+   "display_name": "Python 3",
+   "language": "python",
+   "name": "python3"
+  },
+  "language_info": {
+   "codemirror_mode": {
+    "name": "ipython",
+    "version": 2
+   },
+   "file_extension": ".py",
+   "mimetype": "text/x-python",
+   "name": "python",
+   "nbconvert_exporter": "python",
+   "pygments_lexer": "ipython2",
+   "version": "2.7.6"
+  }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 5
+}

__pycache__/app.cpython-39.pyc ADDED Viewed

Binary file (1.67 kB). View file

app.py CHANGED Viewed

@@ -1,9 +1,51 @@
 import gradio as gr
-def greet(name):
-    return "Hello " + name + "!!"
-iface = gr.Interface(fn=greet, inputs="text", outputs="text")
-iface.launch()

 import gradio as gr
+import pandas as pd
+import numpy as np
+# UI Root
+with gr.Blocks() as demo:
+    gr.Markdown("## CompUGE-Bench: Comparative Understanding and Generation Evaluation Benchmarks")
+    # Main Tabs
+    with gr.Tab("Leaderboards"):
+        gr.Markdown("### Leaderboards")
+        # CQI Tab
+        with gr.Tab("CQI"):
+            gr.Markdown("### Comparative Question Identification Leaderboard")
+            # read dataframe from CQI_Learboard.csv
+            # TODO: replace the following line with the actual leaderboard file
+            CQI_Leaderboard = pd.read_csv("CQI_Leaderboard.csv")
+            cqi_leaderboard = gr.components.Dataframe(CQI_Leaderboard)
+        # OAI Tab
+        with gr.Tab("OAI"):
+            gr.Markdown("### Object & Aspect Identification Leaderboard")
+            gr.Markdown("The OAI leaderboard will be opened soon!")
+        # SC Tab
+        with gr.Tab("SC"):
+            gr.Markdown("### Stance Clasification Leaderboard")
+            gr.Markdown("The SC leaderboard will be opened soon!")
+        # SG Tab
+        with gr.Tab("SG"):
+            gr.Markdown("### Summary Generation Leaderboard")
+            gr.Markdown("The Summary Generation leaderboard will be opened soon!")
+    # Model Submissions Tab
+    with gr.Tab("Model Submissions"):
+        gr.Markdown("### Submission")
+        gr.Markdown("The submission will be opened soon!")
+    # About Tab
+    with gr.Tab("About"):
+        gr.Markdown("### About")
+        gr.Markdown("CompUGE-Bench is a benchmark for comparative understanding and generation evaluation.")
+    # Contact Tab
+    with gr.Tab("Contact"):
+        gr.Markdown("### Contact")
+        gr.Markdown("For any questions, please contact us at [email protected]")
+# Launch public demo
+demo.launch(share=True)

app_wildbench.py ADDED Viewed

	@@ -0,0 +1,526 @@

+"""A gradio app that renders a static leaderboard. This is used for Hugging Face Space."""
+import ast
+import argparse
+import glob
+import pickle
+import plotly
+import gradio as gr
+import numpy as np
+import pandas as pd
+import gradio as gr
+import pandas as pd
+from pathlib import Path
+import json
+from constants import BANNER, CITATION_TEXT, WINRATE_HEATMAP, css, js_code, all_task_types, DEFAULT_LP, TASK_TYPE_STR, \
+    js_light
+from datetime import datetime, timezone
+from data_utils import load_eval_results, sample_an_eval_result, apply_length_penalty, post_processing, add_winrates, \
+    add_winrates_tasks
+# from gradio.themes.utils import colors, fonts, sizes
+from themes import Seafoam
+from huggingface_hub import HfApi
+# from datasets import Dataset, load_dataset, concatenate_datasets
+import os, uuid
+from utils_display import model_info
+# get the last updated time from the elo_ranks.all.jsonl file
+LAST_UPDATED = None
+with open("_intro.md", "r") as f:
+    INTRO_MD = f.read()
+with open("_about_us.md", "r") as f:
+    ABOUT_MD = f.read()
+with open("_header.md", "r") as f:
+    HEADER_MD = f.read()
+original_df, ablation_df = None, None
+eval_results = load_eval_results()
+available_models = []  # to be filled in later
+def display_chat_history(model_selections, task_selections):
+    eval_item = sample_an_eval_result(eval_results, model_selections, task_selections)
+    session_id = eval_item["session_id"]
+    chats = [x["content"] for x in eval_item['conversation_input']]
+    # form a list of tuples of two adjacent messages in chats
+    chats_common = chats[:] + [None]
+    # chats_modelA = ["Model A Output"] + [eval_item["model_A_output"]]
+    # chats_modelB = ["Model B Output"] + [eval_item["model_B_output"]]
+    chats_modelA = [None] + [eval_item["model_A_output"]]
+    chats_modelB = [None] + [eval_item["model_B_output"]]
+    message_history_common = [(chats_common[i], chats_common[i + 1]) for i in range(0, len(chats_common) - 1, 2)]
+    message_history_model_A = [(chats_modelA[i], chats_modelA[i + 1]) for i in range(0, len(chats_modelA) - 1, 2)]
+    message_history_model_B = [(chats_modelB[i], chats_modelB[i + 1]) for i in range(0, len(chats_modelB) - 1, 2)]
+    checklist_string = ""
+    for item in eval_item["checklist"]:
+        checklist_string += f"1. {item}\n"
+    list_reasons = eval_item["reason"].strip().split(". ")
+    # remove the last one if it is empty
+    if list_reasons[-1] == "":
+        list_reasons = list_reasons[:-1]
+    list_reasons = "\n".join([f"- {item}." for item in list_reasons])
+    gpt4_reason = f"### Choice: {eval_item['choice']}.  Reason: ⬇️\n" + list_reasons
+    assignment_string = f"Model A: {eval_item['model_A']} | Model B: {eval_item['model_B']}"
+    user_intent = f"- 🆔: `{session_id}` \n- 💬 **User Intent:** {eval_item['intent']} \n- ⚙️ **Task category**: {', '.join(eval_item['all_tags'])}"
+    return session_id, user_intent, message_history_common, message_history_model_A, message_history_model_B, gpt4_reason, checklist_string, assignment_string
+def slider_change_main(length_penalty):
+    global original_df, ablation_df
+    adjusted_df = apply_length_penalty(original_df, ablation_df, length_penalty)
+    adjusted_df = adjusted_df[["Model", "Overall Elo", "Task-Avg Elo", "# battles", "Length"]]
+    adjusted_df = adjusted_df.sort_values(by="Overall Elo", ascending=False)
+    adjusted_df = add_winrates(adjusted_df)
+    adjusted_df = adjusted_df.drop(columns=["Length"])
+    return adjusted_df
+def slider_change_full(length_penalty, show_winrate):
+    global original_df, ablation_df
+    adjusted_df = apply_length_penalty(original_df, ablation_df, length_penalty)
+    # sort the model by the "Task-Avg Elo" column
+    adjusted_df = adjusted_df.sort_values(by="Task-Avg Elo", ascending=False)
+    adjusted_df.drop(columns=["Overall Elo", "Task-Avg Elo", "# battles", "Length"], inplace=True)
+    if show_winrate == "none":
+        return adjusted_df
+    elif show_winrate == "gpt-3.5":
+        adjusted_df = add_winrates_tasks(adjusted_df, ref="gpt-3.5")
+    elif show_winrate == "gpt-4":
+        adjusted_df = add_winrates_tasks(adjusted_df, ref="gpt-4")
+    return adjusted_df
+# Instance of the Seafoam theme imported from the local ``themes`` module. The
+# only visible reference to it is a commented-out gr.Blocks(theme=seafoam, ...)
+# call in build_demo, which currently uses gr.themes.Soft() instead.
+seafoam = Seafoam()
+def build_demo(TYPES):
+    global original_df, ablation_df, skip_empty_original_df, skip_empty_ablation_df, available_models
+    with gr.Blocks(theme=gr.themes.Soft(), css=css, js=js_light) as demo:
+        # with gr.Blocks(theme=seafoam, css=css) as demo:
+        gr.HTML(BANNER, elem_id="banner")
+        # gr.Markdown("### Work in progress. Please do not share.", elem_classes="markdown-text") # TODO: remove this later.
+        gr.Markdown(HEADER_MD, elem_classes="markdown-text")
+        with gr.Tabs(elem_classes="tab-buttons") as tabs:
+            with gr.TabItem("🏅 Leaderboard", elem_id="od-benchmark-tab-table", id=0):
+                gr.Markdown(
+                    f"**Version**: WildBench (v1.0; 2024.03.07) | **# Examples**: 1024 | **# Models**: {len(available_models)} | **# Comparisons**: 26k",
+                    elem_classes="markdown-text")
+                with gr.TabItem("Main Table", elem_id="od-benchmark-tab-table-ablation", id=0, elem_classes="subtab"):
+                    # original_df, ablation_df = skip_empty_original_df, skip_empty_ablation_df
+                    default_main_df = apply_length_penalty(original_df, ablation_df, length_penalty=DEFAULT_LP)
+                    default_main_df = default_main_df[["Model", "Overall Elo", "Task-Avg Elo", "# battles", "Length"]]
+                    default_main_df = add_winrates(default_main_df)
+                    default_main_df = default_main_df.drop(columns=["Length"])
+                    # TODO: add the win rate for GPT-4 and GPT-3.5T
+                    with gr.Row():
+                        with gr.Column(scale=4):
+                            gr.Markdown(
+                                "**Overall Elo**: [Standard Elo rating with boostrap.](https://en.wikipedia.org/wiki/Elo_rating_system). | **Task-Avg Elo**: Compute Elo on subsets of each task type and then take avg. | **Win Rates**: [Estimated by Elo differences](https://www.hexwiki.net/index.php/Elo_rating#Definition). | **Length penalty**: Models w/ longer outputs are penalized. (Plz check 📖 **Details**.)",
+                                elem_classes="markdown-text-small top-left-LP")
+                        with gr.Column(scale=0.8):
+                            length_penlty_slider = gr.Slider(minimum=0.1, maximum=1, step=0.1, value=DEFAULT_LP,
+                                                             label="Length Penalty", elem_id="length-penalty-slider")
+                            # checkbox_skip_empty = gr.Checkbox(label="Skip empty results", value=False, elem_id="skip-empty-checkbox", scale=2)
+                    leaderboard_table = gr.components.Dataframe(
+                        value=default_main_df,
+                        datatype=TYPES,
+                        # max_rows=None,
+                        height=1000,
+                        elem_id="leaderboard-table",
+                        interactive=False,
+                        visible=True,
+                        min_width=60,
+                    )
+                    length_penlty_slider.change(fn=slider_change_main, inputs=[length_penlty_slider],
+                                                outputs=[leaderboard_table])
+                with gr.TabItem("All Tasks (Win% vs GPT-3.5T)", elem_id="od-benchmark-tab-table-ablation", id=1):
+                    with gr.Row():
+                        with gr.Column(scale=4):
+                            gr.Markdown(TASK_TYPE_STR, elem_classes="markdown-text-small top-left-LP")
+                        with gr.Column(scale=0.8):
+                            length_penlty_slider_full = gr.Slider(minimum=0.1, maximum=1, step=0.1, value=DEFAULT_LP,
+                                                                  label="Length Penalty",
+                                                                  elem_id="length-penalty-slider")
+                    default_full_df = apply_length_penalty(original_df, ablation_df, length_penalty=DEFAULT_LP)
+                    # do not show the "# battles" column here
+                    default_full_df = default_full_df.drop(
+                        columns=["Overall Elo", "Task-Avg Elo", "# battles", "Length"])
+                    default_full_df = add_winrates_tasks(default_full_df, ref="gpt-3.5")
+                    leaderboard_table_full = gr.components.Dataframe(
+                        value=default_full_df,
+                        datatype=TYPES,
+                        # max_rows=None,
+                        height=1000,
+                        elem_id="leaderboard-table-full_table",
+                        interactive=False,
+                        visible=True,
+                        min_width=60,
+                    )
+                    show_winrate = gr.Checkbox(value="gpt-3.5", visible=False)
+                    length_penlty_slider_full.change(fn=slider_change_full,
+                                                     inputs=[length_penlty_slider_full, show_winrate],
+                                                     outputs=[leaderboard_table_full])
+                with gr.TabItem("All Tasks (Win% vs GPT-4)", elem_id="od-benchmark-tab-table-ablation", id=2):
+                    with gr.Row():
+                        with gr.Column(scale=4):
+                            gr.Markdown(TASK_TYPE_STR, elem_classes="markdown-text-small top-left-LP")
+                        with gr.Column(scale=0.8):
+                            length_penlty_slider_full = gr.Slider(minimum=0.1, maximum=1, step=0.1, value=DEFAULT_LP,
+                                                                  label="Length Penalty",
+                                                                  elem_id="length-penalty-slider")
+                    default_full_df = apply_length_penalty(original_df, ablation_df, length_penalty=DEFAULT_LP)
+                    # do not show the "# battles" column here
+                    default_full_df = default_full_df.drop(
+                        columns=["Overall Elo", "Task-Avg Elo", "# battles", "Length"])
+                    default_full_df = add_winrates_tasks(default_full_df, ref="gpt-4")
+                    leaderboard_table_full = gr.components.Dataframe(
+                        value=default_full_df,
+                        datatype=TYPES,
+                        # max_rows=None,
+                        height=1000,
+                        elem_id="leaderboard-table-full_table",
+                        interactive=False,
+                        visible=True,
+                        min_width=60,
+                    )
+                    show_winrate = gr.Checkbox(value="gpt-4", visible=False)
+                    length_penlty_slider_full.change(fn=slider_change_full,
+                                                     inputs=[length_penlty_slider_full, show_winrate],
+                                                     outputs=[leaderboard_table_full])
+                with gr.TabItem("All Tasks (Elo)", elem_id="od-benchmark-tab-table-ablation", id=3):
+                    with gr.Row():
+                        with gr.Column(scale=4):
+                            gr.Markdown(TASK_TYPE_STR, elem_classes="markdown-text-small top-left-LP")
+                        with gr.Column(scale=0.8):
+                            length_penlty_slider_full = gr.Slider(minimum=0.1, maximum=1, step=0.1, value=DEFAULT_LP,
+                                                                  label="Length Penalty",
+                                                                  elem_id="length-penalty-slider")
+                    default_full_df = apply_length_penalty(original_df, ablation_df, length_penalty=DEFAULT_LP)
+                    # do not show the "# battles" column here
+                    default_full_df = default_full_df.drop(
+                        columns=["Overall Elo", "Task-Avg Elo", "# battles", "Length"])
+                    leaderboard_table_full = gr.components.Dataframe(
+                        value=default_full_df,
+                        datatype=TYPES,
+                        # max_rows=None,
+                        height=1000,
+                        elem_id="leaderboard-table-full_table",
+                        interactive=False,
+                        visible=True,
+                        min_width=60,
+                    )
+                    show_winrate = gr.Checkbox(value="none", visible=False)
+                    length_penlty_slider_full.change(fn=slider_change_full,
+                                                     inputs=[length_penlty_slider_full, show_winrate],
+                                                     outputs=[leaderboard_table_full])
+                # with gr.TabItem("Pairwise Win Rates", elem_id="od-benchmark-tab-table-ablation", id=4):
+                #     # TODO: show all winrate
+                #     # winrates_heatmap = pickle.load(open("data_dir/pairwise_win_fractions.pkl", "rb"))
+                #     # gr.Plot(value=winrates_heatmap, scale=2, min_width=800, container=False, elem_classes="plotly-plot", visible=True)
+                #     gr.HTML(WINRATE_HEATMAP, visible=True)
+            with gr.TabItem("📖 Details", elem_id="od-benchmark-tab-table", id=1):
+                gr.Markdown(INTRO_MD, elem_classes="markdown-text-details")
+            with gr.TabItem("🔍 Explore | 🆚 Evaluate", elem_id="od-benchmark-tab-table", id=2):
+                with gr.Row():
+                    btn_show_history = gr.Button("🎲  Click here to sample an example + a pair of LLM outputs! ",
+                                                 elem_classes="sample_button")
+                with gr.Row():
+                    with gr.Column(scale=1.5):
+                        with gr.Accordion("Choose models to sample from", open=False, elem_classes="accordion-label"):
+                            model_options = available_models
+                            selected_models = gr.CheckboxGroup(model_options, info="", value=model_options,
+                                                               show_label=False, elem_id="select-models")
+                            clear_button = gr.Button("Clear", elem_classes="btn_boderline_gray", scale=1)
+                            # clear the selected_models
+                            clear_button.click(lambda: {selected_models: {"value": [], "__type__": "update"}},
+                                               inputs=[], outputs=[selected_models])
+                    with gr.Column(scale=1):
+                        with gr.Accordion("Choose task types to sample from", open=False,
+                                          elem_classes="accordion-label"):
+                            select_tasks = gr.CheckboxGroup(all_task_types, info="", value=all_task_types,
+                                                            show_label=False, elem_id="select-tasks")
+                            clear_task_button = gr.Button("Clear", elem_classes="btn_boderline_gray", scale=1)
+                            # clear the select_tasks
+                            clear_task_button.click(lambda: {select_tasks: {"value": [], "__type__": "update"}},
+                                                    inputs=[], outputs=[select_tasks])
+                with gr.Row():
+                    with gr.Column():
+                        gr.Markdown("## 📢 Chat History", elem_classes="markdown-text")
+                        Chatbot_Common = gr.Chatbot(avatar_images=["human_icon.jpeg", "ai_icon.png"], height="auto",
+                                                    container=False, label="Common Chat History", likeable=False,
+                                                    show_share_button=False, show_label=True,
+                                                    elem_classes="chat-common", layout="bubble")
+                        Chatbot_Common.change(lambda x: x, inputs=[], outputs=[], scroll_to_output=False, js=js_code)
+                        with gr.Accordion("✍️ Task Annotation", elem_classes="accordion-label", open=False):
+                            user_intent = gr.Markdown("", elem_classes="markdown-text-small")
+                # two columns for the two models
+                with gr.Row():
+                    # https://www.gradio.app/docs/chatbot
+                    with gr.Column():
+                        gr.Markdown("## ⬅️ Model A Output", elem_classes="markdown-text")
+                        Chatbot_A = gr.Chatbot(height="auto", container=False, label="Model A Output", likeable=False,
+                                               show_share_button=False, show_label=True, elem_classes="chat-specific",
+                                               layout="bubble")
+                        Chatbot_A.change(lambda x: x, inputs=[], outputs=[], scroll_to_output=False, js=js_code)
+                    with gr.Column():
+                        # add a Markdown to show this is for Model B
+                        gr.Markdown("## ➡️ Model B Output", elem_classes="markdown-text")
+                        Chatbot_B = gr.Chatbot(height="auto", container=False, label="Model B Output", likeable=False,
+                                               show_share_button=False, show_label=True, elem_classes="chat-specific",
+                                               layout="bubble")
+                        Chatbot_B.change(lambda x: x, inputs=[], outputs=[], scroll_to_output=False, js=js_code)
+                with gr.Row():
+                    # Here we can show the GPT-4 judgement for the model outputs
+                    # show a textarea
+                    with gr.Column():
+                        with gr.Accordion("⏱️ Checklist", open=False, elem_classes="accordion-label"):
+                            checklist = gr.Markdown("### Checklist: \n Will be shown later.",
+                                                    elem_classes="markdown-text-tiny")
+                        with gr.Accordion("⚖️ GPT-4 Judgement", open=False,
+                                          elem_classes="accordion-label") as gpt4_accordion:
+                            # gpt4_reason = gr.TextArea(label="GPT-4 Judgement", placeholder="Will be shown later.", type="text", elem_classes="", max_lines=10, show_copy_button=True)
+                            gpt4_reason = gr.Markdown("Will be shown later.", elem_classes="markdown-text-tiny")
+                with gr.Row():
+                    # show buttons for user to choose which model output is better or Tie
+                    btn_model_A = gr.Button("⬅️ Model A is better! ", elem_classes="btn_boderline_gray", scale=2,
+                                            interactive=False)
+                    btn_tie = gr.Button("🟰 Tie", elem_classes="btn_boderline_gray", scale=2, interactive=False)
+                    btn_model_B = gr.Button("➡️ Model B is better!", elem_classes="btn_boderline_gray", scale=2,
+                                            interactive=False)
+                with gr.Row():
+                    with gr.Column(scale=2):
+                        reason_textbox = gr.Textbox(label="Reason", placeholder="Please input your reason here.",
+                                                    type="text", elem_classes="", max_lines=10, lines=8,
+                                                    show_copy_button=False, visible=True, scale=4, interactive=True)
+                    with gr.Column():
+                        with gr.Row():
+                            user_choice = gr.Markdown("Your choice: N/A", elem_classes="markdown-text", visible=True)
+                            btn_pass = gr.Button("🔁 Next", elem_classes="btn_boderline_next", scale=1)
+                        user_name = gr.Textbox(label="Your HF Username", placeholder="Your HuggingFace username",
+                                               type="text", elem_classes="", max_lines=1, show_copy_button=False,
+                                               visible=True, interactive=True, show_label=False)
+                        # login_btn = gr.LoginButton(visible=False, interactive=True, elem_classes="btn_boderline")
+                        submit_button = gr.Button("Submit your feedback! 🚀", elem_classes="btn_boderline", visible=True,
+                                                  interactive=False)
+                        assignment = gr.Markdown("Model A: | Model B: ", elem_classes="markdown-text-tiny-red",
+                                                 visible=False)
+                session_id = gr.Textbox(label="Session ID", placeholder="N/A.", type="text", elem_classes="",
+                                        max_lines=10, show_copy_button=False, visible=False)
+                def show_reason_and_submit(session_id, user_name_text, btn, request: gr.Request):
+                    if request.username is not None:
+                        user_name_text = request.username
+                    result_dict = {
+                        reason_textbox: {"visible": True, "__type__": "update"},
+                        submit_button: {"visible": True, "__type__": "update", "interactive": True},
+                        user_name: {"visible": True, "__type__": "update", "value": user_name_text},
+                    }
+                    if "Model A" in btn:
+                        choice = "Model A"
+                        result_dict.update({
+                            user_choice: {"value": f"Your choice: **{choice}**", "__type__": "update", "visible": True},
+                            btn_model_A: {"elem_classes": "btn_boderline_selected", "__type__": "update"},
+                            btn_model_B: {"elem_classes": "btn_boderline", "__type__": "update"},
+                            btn_tie: {"elem_classes": "btn_boderline", "__type__": "update"},
+                        })
+                    elif "Model B" in btn:
+                        choice = "Model B"
+                        result_dict.update({
+                            user_choice: {"value": f"Your choice: **{choice}**", "__type__": "update", "visible": True},
+                            btn_model_B: {"elem_classes": "btn_boderline_selected", "__type__": "update"},
+                            btn_model_A: {"elem_classes": "btn_boderline", "__type__": "update"},
+                            btn_tie: {"elem_classes": "btn_boderline", "__type__": "update"},
+                        })
+                    elif "Tie" in btn:
+                        choice = "Tie"
+                        result_dict.update({
+                            user_choice: {"value": f"Your choice: **{choice}**", "__type__": "update", "visible": True},
+                            btn_tie: {"elem_classes": "btn_boderline_selected", "__type__": "update"},
+                            btn_model_A: {"elem_classes": "btn_boderline", "__type__": "update"},
+                            btn_model_B: {"elem_classes": "btn_boderline", "__type__": "update"},
+                        })
+                    else:
+                        choice = "N/A"
+                        result_dict.update({
+                            user_choice: {"value": f"Your choice: **{choice}**", "__type__": "update", "visible": True},
+                        })
+                    return result_dict
+                btn_model_A.click(show_reason_and_submit, inputs=[session_id, user_name, btn_model_A],
+                                  outputs=[user_choice, reason_textbox, submit_button, user_name, btn_model_A, btn_tie,
+                                           btn_model_B])
+                btn_tie.click(show_reason_and_submit, inputs=[session_id, user_name, btn_tie],
+                              outputs=[user_choice, reason_textbox, submit_button, user_name, btn_model_A, btn_tie,
+                                       btn_model_B])
+                btn_model_B.click(show_reason_and_submit, inputs=[session_id, user_name, btn_model_B],
+                                  outputs=[user_choice, reason_textbox, submit_button, user_name, btn_model_A, btn_tie,
+                                           btn_model_B])
+                def submit_feedback(session_id, user_reason, user_choice, user_name_text, assignment_string,
+                                    request: gr.Request):
+                    if "N/A" in session_id or "N/A" in user_choice:
+                        # send a message to the user to sample an example and select a choice first
+                        return {
+                            submit_button: {"interactive": True, "__type__": "update",
+                                            "value": "Submit your feedback! 🚀 Please sample an example and select a choice!"},
+                        }
+                        # create a jsonl file and upload it to hf
+                    choice_str = ""
+                    if "Model A" in user_choice:
+                        choice_str = "Model A"
+                    elif "Model B" in user_choice:
+                        choice_str = "Model B"
+                    elif "Tie" in user_choice:
+                        choice_str = "Tie"
+                    else:
+                        choice_str = "N/A"
+                    if user_name_text == "" and request.username is None:
+                        user_name_text = "Anonymous"
+                    if request.username is not None:
+                        user_name_text = request.username
+                    feedback_item = {
+                        "session_id": session_id,
+                        "user_name": user_name_text,
+                        "user_reason": user_reason,
+                        "user_choice": choice_str,
+                        "ip": request.client.host,
+                        "assignment_string": assignment_string
+                    }
+                    jsonl_str = json.dumps(feedback_item)
+                    api = HfApi()
+                    token = os.getenv("HF_TOKEN")
+                    if token is None:
+                        raise ValueError(
+                            "Hugging Face token not found. Ensure the HF_TOKEN environment variable is set.")
+                    # Generate a random filename using UUID
+                    filename = f"{uuid.uuid4()}.json"
+                    # Define the repository
+                    repo_id = "WildEval/WildBench-HumanFeedback"
+                    # Upload the json_str as a file directly to the specified path in your dataset repository
+                    api.upload_file(
+                        token=token,
+                        repo_id=repo_id,
+                        repo_type="dataset",
+                        path_or_fileobj=jsonl_str.encode("utf-8"),  # Convert string to bytes
+                        path_in_repo=filename,
+                        commit_message=f"Add user feedback for session_id: {session_id}. Assignment: {assignment_string}",
+                    )
+                    return {
+                        submit_button: {"interactive": False, "__type__": "update",
+                                        "value": "Submitted! ✅ \n Please click 🔁 Next."},
+                        reason_textbox: {"interactive": False, "__type__": "update"},
+                        btn_model_A: {"interactive": False, "__type__": "update"},
+                        btn_tie: {"interactive": False, "__type__": "update"},
+                        btn_model_B: {"interactive": False, "__type__": "update"},
+                        user_name: {"interactive": False, "__type__": "update"},
+                        assignment: {"visible": True, "__type__": "update"}
+                    }
+                def reset_submission(session_id):
+                    return {
+                        submit_button: {"interactive": False, "__type__": "update", "value": "Submit your feedback! 🚀"},
+                        reason_textbox: {"interactive": True, "__type__": "update", "value": ""},
+                        btn_model_A: {"interactive": True, "__type__": "update", "elem_classes": "btn_boderline_gray"},
+                        btn_tie: {"interactive": True, "__type__": "update", "elem_classes": "btn_boderline_gray"},
+                        btn_model_B: {"interactive": True, "__type__": "update", "elem_classes": "btn_boderline_gray"},
+                        user_name: {"interactive": True, "__type__": "update"},
+                        user_choice: {"value": "Your choice: N/A", "__type__": "update"},
+                        assignment: {"__type__": "update", "visible": False},
+                        gpt4_accordion: {"__type__": "update", "open": False},
+                    }
+                # reset the reason_textbox, submit_button, and btn_model_A
+                session_id.change(reset_submission, inputs=[session_id],
+                                  outputs=[submit_button, reason_textbox, btn_model_A, btn_tie, btn_model_B, user_name,
+                                           user_choice, assignment, gpt4_accordion])
+                submit_button.click(submit_feedback,
+                                    inputs=[session_id, reason_textbox, user_choice, user_name, assignment],
+                                    outputs=[submit_button, reason_textbox, btn_model_A, btn_tie, btn_model_B,
+                                             user_name, assignment])
+                # Display chat history when button is clicked
+                # TODO: add the model list and tag list
+                btn_show_history.click(fn=display_chat_history, inputs=[selected_models, select_tasks],
+                                       outputs=[session_id, user_intent, Chatbot_Common, Chatbot_A, Chatbot_B,
+                                                gpt4_reason, checklist, assignment])
+                btn_pass.click(fn=display_chat_history, inputs=[selected_models, select_tasks],
+                               outputs=[session_id, user_intent, Chatbot_Common, Chatbot_A, Chatbot_B, gpt4_reason,
+                                        checklist,
+                                        assignment])  # the pass button will be the same function of resampling
+            # "About Us" tab: static Markdown rendered from ABOUT_MD.
+            with gr.TabItem("📮 About Us", elem_id="od-benchmark-tab-table", id=3):
+                gr.Markdown(ABOUT_MD, elem_classes="markdown-text")
+        # Footer line with the data timestamp; LAST_UPDATED is a module-level name
+        # assigned in the __main__ block from the result file's modification time.
+        gr.Markdown(f"Last updated on **{LAST_UPDATED}**", elem_classes="markdown-text-small")
+        with gr.Row():
+            # Citation box, collapsed by default (open=False), with a copy button.
+            with gr.Accordion("📙 Citation", open=False, elem_classes="accordion-label"):
+                gr.Textbox(
+                    value=CITATION_TEXT,
+                    lines=7,
+                    label="Copy the BibTeX snippet to cite this source",
+                    elem_id="citation-button",
+                    show_copy_button=True)
+                # ).style(show_copy_button=True)
+    # Hand the assembled UI object back to the caller, which launches it.
+    return demo
+if __name__ == "__main__":
+    parser = argparse.ArgumentParser()
+    parser.add_argument("--share", action="store_true")
+    parser.add_argument("--result_file", help="Path to results table", default="data_dir/elo_ranks.all.jsonl")
+    parser.add_argument("--length_balation_file", help="Path to results table",
+                        default="data_dir/elo_ranks.length_ablation.all.jsonl")
+    parser.add_argument("--skip_empty_result_file", help="Path to results table",
+                        default="data_dir/elo_ranks.skip_empty.all.jsonl")
+    parser.add_argument("--skip_empty_length_balation_file", help="Path to results table",
+                        default="data_dir/elo_ranks.skip_empty.length_ablation.all.jsonl")
+    args = parser.parse_args()
+    LAST_UPDATED = datetime.fromtimestamp(Path(args.result_file).stat().st_mtime, tz=timezone.utc).strftime(
+        "%Y-%m-%d %H:%M:%S")
+    original_df = pd.read_json(args.result_file, lines=True)
+    ablation_df = pd.read_json(args.length_balation_file, lines=True)
+    skip_empty_original_df = pd.read_json(args.skip_empty_result_file, lines=True)
+    skip_empty_ablation_df = pd.read_json(args.skip_empty_length_balation_file, lines=True)
+    # available_models = sorted(list(set(list(original_df["model name "]))))
+    available_models = list(model_info.keys())
+    # remove the rows where the model name is not in the available_models
+    original_df = original_df[original_df["model name "].isin(available_models)]
+    ablation_df = ablation_df[ablation_df["model name "].isin(available_models)]
+    skip_empty_ablation_df = skip_empty_ablation_df[skip_empty_ablation_df["model name "].isin(available_models)]
+    skip_empty_original_df = skip_empty_original_df[skip_empty_original_df["model name "].isin(available_models)]
+    model_len_info = json.load(open("model_len_info.json", "r"))
+    original_df = post_processing(original_df, model_len_info)
+    ablation_df = post_processing(ablation_df, model_len_info)
+    skip_empty_original_df = post_processing(skip_empty_original_df, model_len_info)
+    skip_empty_ablation_df = post_processing(skip_empty_ablation_df, model_len_info)
+    TYPES = ["markdown", "number"]
+    demo = build_demo(TYPES)
+    demo.launch(share=args.share, height=1000)