Ahmad Shallouf committed
Commit · 2df6a4f
Parent(s): b766929

added initial design
Files changed:
- .gitignore +1 -0
- .idea/.gitignore +2 -0
- .idea/ComparativeQA-Benchmark.iml +3 -1
- .idea/misc.xml +3 -0
- CQI_Leaderboard.csv +3 -0
- DataProcessing.ipynb +90 -0
- __pycache__/app.cpython-39.pyc +0 -0
- app.py +46 -4
- app_wildbench.py +526 -0
.gitignore ADDED
@@ -0,0 +1 @@
/venv/
.idea/.gitignore CHANGED
@@ -6,3 +6,5 @@
 # Datasource local storage ignored files
 /dataSources/
 /dataSources.local.xml
+# GitHub Copilot persisted chat sessions
+/copilot/chatSessions
.idea/ComparativeQA-Benchmark.iml CHANGED
@@ -1,7 +1,9 @@
 <?xml version="1.0" encoding="UTF-8"?>
 <module type="PYTHON_MODULE" version="4">
   <component name="NewModuleRootManager">
-    <content url="file://$MODULE_DIR$" />
+    <content url="file://$MODULE_DIR$">
+      <excludeFolder url="file://$MODULE_DIR$/.idea/copilot/chatSessions" />
+    </content>
     <orderEntry type="jdk" jdkName="Python 3.9" jdkType="Python SDK" />
     <orderEntry type="sourceFolder" forTests="false" />
   </component>
.idea/misc.xml CHANGED
@@ -4,4 +4,7 @@
     <option name="sdkName" value="Python 3.9 (CSI)" />
   </component>
   <component name="ProjectRootManager" version="2" project-jdk-name="Python 3.9" project-jdk-type="Python SDK" />
+  <component name="PythonCompatibilityInspectionAdvertiser">
+    <option name="version" value="3" />
+  </component>
 </project>
CQI_Leaderboard.csv ADDED
@@ -0,0 +1,3 @@
Model,Accuracy,Precision,Recall,F1 Score,Evaluation Time,Overall Score
Dummy,0.5,0.5,0.5,0.5,0.5,0.5
Dummy2,0.6,0.6,0.6,0.6,0.6,0.6
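As a side note, here is a minimal pandas sketch (not part of this commit) of how this placeholder table can be read back and ranked; sorting on "Overall Score" is an assumption about how ranking would be done, since app.py below displays the file unsorted:

import pandas as pd

# Load the placeholder leaderboard committed above.
cqi = pd.read_csv("CQI_Leaderboard.csv")

# Assumed ranking rule: higher Overall Score is better.
cqi = cqi.sort_values(by="Overall Score", ascending=False)
print(cqi.to_string(index=False))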
DataProcessing.ipynb ADDED
@@ -0,0 +1,90 @@
{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": 3,
   "id": "initial_id",
   "metadata": {
    "collapsed": true,
    "ExecuteTime": {
     "end_time": "2024-03-24T11:48:41.895997Z",
     "start_time": "2024-03-24T11:48:41.863555Z"
    }
   },
   "outputs": [
    {
     "data": {
      "text/plain": " Model Accuracy Precision Recall F1 Score Evaluation Time \\\n0 Dummy 0.5 0.5 0.5 0.5 0.5 \n0 Dummy2 0.6 0.6 0.6 0.6 0.6 \n\n Overall Score \n0 0.5 \n0 0.6 ",
      "text/html": "<div>\n<style scoped>\n .dataframe tbody tr th:only-of-type {\n vertical-align: middle;\n }\n\n .dataframe tbody tr th {\n vertical-align: top;\n }\n\n .dataframe thead th {\n text-align: right;\n }\n</style>\n<table border=\"1\" class=\"dataframe\">\n <thead>\n <tr style=\"text-align: right;\">\n <th></th>\n <th>Model</th>\n <th>Accuracy</th>\n <th>Precision</th>\n <th>Recall</th>\n <th>F1 Score</th>\n <th>Evaluation Time</th>\n <th>Overall Score</th>\n </tr>\n </thead>\n <tbody>\n <tr>\n <th>0</th>\n <td>Dummy</td>\n <td>0.5</td>\n <td>0.5</td>\n <td>0.5</td>\n <td>0.5</td>\n <td>0.5</td>\n <td>0.5</td>\n </tr>\n <tr>\n <th>0</th>\n <td>Dummy2</td>\n <td>0.6</td>\n <td>0.6</td>\n <td>0.6</td>\n <td>0.6</td>\n <td>0.6</td>\n <td>0.6</td>\n </tr>\n </tbody>\n</table>\n</div>"
     },
     "execution_count": 3,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "import pandas as pd\n",
    "import numpy as np\n",
    "\n",
    "# Build a dataframe with Model, Accuracy, Precision, Recall, F1 Score, Evaluation Time, Overall Score\n",
    "\n",
    "model_results = pd.DataFrame(columns=['Model', 'Accuracy', 'Precision', 'Recall', 'F1 Score', 'Evaluation Time', 'Overall Score'])\n",
    "\n",
    "# Add dummy data using concat\n",
    "model_results = pd.concat([model_results, pd.DataFrame([['Dummy', 0.5, 0.5, 0.5, 0.5, 0.5, 0.5]], columns=['Model', 'Accuracy', 'Precision', 'Recall', 'F1 Score', 'Evaluation Time', 'Overall Score'])])\n",
    "\n",
    "# add more dummy data\n",
    "model_results = pd.concat([model_results, pd.DataFrame([['Dummy2', 0.6, 0.6, 0.6, 0.6, 0.6, 0.6]], columns=['Model', 'Accuracy', 'Precision', 'Recall', 'F1 Score', 'Evaluation Time', 'Overall Score'])])\n",
    "\n",
    "model_results"
   ]
  },
  {
   "cell_type": "code",
   "outputs": [],
   "source": [
    "# Save the model results to a csv file\n",
    "model_results.to_csv('CQI_Leaderboard.csv', index=False)"
   ],
   "metadata": {
    "collapsed": false,
    "ExecuteTime": {
     "end_time": "2024-03-24T11:49:23.687615Z",
     "start_time": "2024-03-24T11:49:23.602354Z"
    }
   },
   "id": "d6d288e1af91dd1d",
   "execution_count": 4
  },
  {
   "cell_type": "code",
   "outputs": [],
   "source": [],
   "metadata": {
    "collapsed": false
   },
   "id": "f164c55726b7cbaf"
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 2
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython2",
   "version": "2.7.6"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 5
}
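The notebook builds the table row by row with pd.concat and then writes CQI_Leaderboard.csv, which app.py below reads. A short sketch of how a further entry could be appended in the same style; the "NewModel" row and its scores are hypothetical, not real results:

import pandas as pd

COLUMNS = ['Model', 'Accuracy', 'Precision', 'Recall', 'F1 Score', 'Evaluation Time', 'Overall Score']

# Load the committed leaderboard and append one more (made-up) row with pd.concat,
# mirroring the pattern used in the notebook cells above.
model_results = pd.read_csv("CQI_Leaderboard.csv")
new_row = pd.DataFrame([['NewModel', 0.7, 0.7, 0.7, 0.7, 0.7, 0.7]], columns=COLUMNS)
model_results = pd.concat([model_results, new_row], ignore_index=True)

# Overwrite the CSV that the Gradio app displays.
model_results.to_csv('CQI_Leaderboard.csv', index=False)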
__pycache__/app.cpython-39.pyc ADDED
Binary file (1.67 kB)
app.py CHANGED
(The four lines removed from the previous version are not rendered in the source view; only the unchanged first line and the added lines are shown.)
@@ -1,9 +1,51 @@
 import gradio as gr
+import pandas as pd
+import numpy as np

+# UI Root
+with gr.Blocks() as demo:
+    gr.Markdown("## CompUGE-Bench: Comparative Understanding and Generation Evaluation Benchmarks")

+    # Main Tabs
+    with gr.Tab("Leaderboards"):
+        gr.Markdown("### Leaderboards")

+        # CQI Tab
+        with gr.Tab("CQI"):
+            gr.Markdown("### Comparative Question Identification Leaderboard")
+            # read dataframe from CQI_Leaderboard.csv
+            # TODO: replace the following line with the actual leaderboard file
+            CQI_Leaderboard = pd.read_csv("CQI_Leaderboard.csv")
+            cqi_leaderboard = gr.components.Dataframe(CQI_Leaderboard)

+        # OAI Tab
+        with gr.Tab("OAI"):
+            gr.Markdown("### Object & Aspect Identification Leaderboard")
+            gr.Markdown("The OAI leaderboard will be opened soon!")
+
+        # SC Tab
+        with gr.Tab("SC"):
+            gr.Markdown("### Stance Classification Leaderboard")
+            gr.Markdown("The SC leaderboard will be opened soon!")
+
+        # SG Tab
+        with gr.Tab("SG"):
+            gr.Markdown("### Summary Generation Leaderboard")
+            gr.Markdown("The Summary Generation leaderboard will be opened soon!")
+
+    # Model Submissions Tab
+    with gr.Tab("Model Submissions"):
+        gr.Markdown("### Submission")
+        gr.Markdown("The submission will be opened soon!")
+
+    # About Tab
+    with gr.Tab("About"):
+        gr.Markdown("### About")
+        gr.Markdown("CompUGE-Bench is a benchmark for comparative understanding and generation evaluation.")
+
+    # Contact Tab
+    with gr.Tab("Contact"):
+        gr.Markdown("### Contact")
+        gr.Markdown("For any questions, please contact us at [email protected]")
+
+# Launch public demo
+demo.launch(share=True)
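One limitation worth noting: app.py reads CQI_Leaderboard.csv once while the Blocks layout is being built, so the table only changes when the Space restarts. Below is a small sketch of a refreshable variant; this is an assumption about a possible follow-up, not code from this commit, and the load_cqi_leaderboard helper and Refresh button are illustrative names:

import gradio as gr
import pandas as pd

def load_cqi_leaderboard():
    # Re-read the CSV on every call so newly added rows appear without a restart.
    return pd.read_csv("CQI_Leaderboard.csv")

with gr.Blocks() as demo:
    with gr.Tab("CQI"):
        gr.Markdown("### Comparative Question Identification Leaderboard")
        table = gr.Dataframe(load_cqi_leaderboard())
        refresh = gr.Button("Refresh")
        # Clicking the button re-runs the loader and pushes the result into the table.
        refresh.click(fn=load_cqi_leaderboard, inputs=[], outputs=[table])

demo.launch()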
app_wildbench.py ADDED
@@ -0,0 +1,526 @@
"""A gradio app that renders a static leaderboard. This is used for Hugging Face Space."""
import ast
import argparse
import glob
import pickle
import plotly
import gradio as gr
import numpy as np
import pandas as pd
import gradio as gr
import pandas as pd
from pathlib import Path
import json
from constants import BANNER, CITATION_TEXT, WINRATE_HEATMAP, css, js_code, all_task_types, DEFAULT_LP, TASK_TYPE_STR, \
    js_light
from datetime import datetime, timezone
from data_utils import load_eval_results, sample_an_eval_result, apply_length_penalty, post_processing, add_winrates, \
    add_winrates_tasks
# from gradio.themes.utils import colors, fonts, sizes
from themes import Seafoam
from huggingface_hub import HfApi
# from datasets import Dataset, load_dataset, concatenate_datasets
import os, uuid
from utils_display import model_info

# get the last updated time from the elo_ranks.all.jsonl file
LAST_UPDATED = None
with open("_intro.md", "r") as f:
    INTRO_MD = f.read()

with open("_about_us.md", "r") as f:
    ABOUT_MD = f.read()

with open("_header.md", "r") as f:
    HEADER_MD = f.read()

original_df, ablation_df = None, None
eval_results = load_eval_results()

available_models = []  # to be filled in later


def display_chat_history(model_selections, task_selections):
    eval_item = sample_an_eval_result(eval_results, model_selections, task_selections)
    session_id = eval_item["session_id"]
    chats = [x["content"] for x in eval_item['conversation_input']]
    # form a list of tuples of two adjacent messages in chats
    chats_common = chats[:] + [None]
    # chats_modelA = ["Model A Output"] + [eval_item["model_A_output"]]
    # chats_modelB = ["Model B Output"] + [eval_item["model_B_output"]]
    chats_modelA = [None] + [eval_item["model_A_output"]]
    chats_modelB = [None] + [eval_item["model_B_output"]]
    message_history_common = [(chats_common[i], chats_common[i + 1]) for i in range(0, len(chats_common) - 1, 2)]
    message_history_model_A = [(chats_modelA[i], chats_modelA[i + 1]) for i in range(0, len(chats_modelA) - 1, 2)]
    message_history_model_B = [(chats_modelB[i], chats_modelB[i + 1]) for i in range(0, len(chats_modelB) - 1, 2)]
    checklist_string = ""
    for item in eval_item["checklist"]:
        checklist_string += f"1. {item}\n"
    list_reasons = eval_item["reason"].strip().split(". ")
    # remove the last one if it is empty
    if list_reasons[-1] == "":
        list_reasons = list_reasons[:-1]
    list_reasons = "\n".join([f"- {item}." for item in list_reasons])
    gpt4_reason = f"### Choice: {eval_item['choice']}. Reason: ⬇️\n" + list_reasons
    assignment_string = f"Model A: {eval_item['model_A']} | Model B: {eval_item['model_B']}"
    user_intent = f"- 🆔: `{session_id}` \n- 💬 **User Intent:** {eval_item['intent']} \n- ✍️ **Task category**: {', '.join(eval_item['all_tags'])}"
    return session_id, user_intent, message_history_common, message_history_model_A, message_history_model_B, gpt4_reason, checklist_string, assignment_string


def slider_change_main(length_penalty):
    global original_df, ablation_df
    adjusted_df = apply_length_penalty(original_df, ablation_df, length_penalty)
    adjusted_df = adjusted_df[["Model", "Overall Elo", "Task-Avg Elo", "# battles", "Length"]]
    adjusted_df = adjusted_df.sort_values(by="Overall Elo", ascending=False)
    adjusted_df = add_winrates(adjusted_df)
    adjusted_df = adjusted_df.drop(columns=["Length"])
    return adjusted_df


def slider_change_full(length_penalty, show_winrate):
    global original_df, ablation_df
    adjusted_df = apply_length_penalty(original_df, ablation_df, length_penalty)
    # sort the model by the "Task-Avg Elo" column
    adjusted_df = adjusted_df.sort_values(by="Task-Avg Elo", ascending=False)
    adjusted_df.drop(columns=["Overall Elo", "Task-Avg Elo", "# battles", "Length"], inplace=True)
    if show_winrate == "none":
        return adjusted_df
    elif show_winrate == "gpt-3.5":
        adjusted_df = add_winrates_tasks(adjusted_df, ref="gpt-3.5")
    elif show_winrate == "gpt-4":
        adjusted_df = add_winrates_tasks(adjusted_df, ref="gpt-4")
    return adjusted_df


seafoam = Seafoam()


def build_demo(TYPES):
    global original_df, ablation_df, skip_empty_original_df, skip_empty_ablation_df, available_models
    with gr.Blocks(theme=gr.themes.Soft(), css=css, js=js_light) as demo:
        # with gr.Blocks(theme=seafoam, css=css) as demo:
        gr.HTML(BANNER, elem_id="banner")
        # gr.Markdown("### Work in progress. Please do not share.", elem_classes="markdown-text")  # TODO: remove this later.
        gr.Markdown(HEADER_MD, elem_classes="markdown-text")

        with gr.Tabs(elem_classes="tab-buttons") as tabs:
            with gr.TabItem("🏆 Leaderboard", elem_id="od-benchmark-tab-table", id=0):
                gr.Markdown(
                    f"**Version**: WildBench (v1.0; 2024.03.07) | **# Examples**: 1024 | **# Models**: {len(available_models)} | **# Comparisons**: 26k",
                    elem_classes="markdown-text")

                with gr.TabItem("Main Table", elem_id="od-benchmark-tab-table-ablation", id=0, elem_classes="subtab"):
                    # original_df, ablation_df = skip_empty_original_df, skip_empty_ablation_df
                    default_main_df = apply_length_penalty(original_df, ablation_df, length_penalty=DEFAULT_LP)
                    default_main_df = default_main_df[["Model", "Overall Elo", "Task-Avg Elo", "# battles", "Length"]]
                    default_main_df = add_winrates(default_main_df)
                    default_main_df = default_main_df.drop(columns=["Length"])
                    # TODO: add the win rate for GPT-4 and GPT-3.5T
                    with gr.Row():
                        with gr.Column(scale=4):
                            gr.Markdown(
                                "**Overall Elo**: [Standard Elo rating with bootstrap.](https://en.wikipedia.org/wiki/Elo_rating_system). | **Task-Avg Elo**: Compute Elo on subsets of each task type and then take avg. | **Win Rates**: [Estimated by Elo differences](https://www.hexwiki.net/index.php/Elo_rating#Definition). | **Length penalty**: Models w/ longer outputs are penalized. (Plz check 📖 **Details**.)",
                                elem_classes="markdown-text-small top-left-LP")
                        with gr.Column(scale=0.8):
                            length_penlty_slider = gr.Slider(minimum=0.1, maximum=1, step=0.1, value=DEFAULT_LP,
                                                             label="Length Penalty", elem_id="length-penalty-slider")
                    # checkbox_skip_empty = gr.Checkbox(label="Skip empty results", value=False, elem_id="skip-empty-checkbox", scale=2)
                    leaderboard_table = gr.components.Dataframe(
                        value=default_main_df,
                        datatype=TYPES,
                        # max_rows=None,
                        height=1000,
                        elem_id="leaderboard-table",
                        interactive=False,
                        visible=True,
                        min_width=60,
                    )
                    length_penlty_slider.change(fn=slider_change_main, inputs=[length_penlty_slider],
                                                outputs=[leaderboard_table])

                with gr.TabItem("All Tasks (Win% vs GPT-3.5T)", elem_id="od-benchmark-tab-table-ablation", id=1):
                    with gr.Row():
                        with gr.Column(scale=4):
                            gr.Markdown(TASK_TYPE_STR, elem_classes="markdown-text-small top-left-LP")
                        with gr.Column(scale=0.8):
                            length_penlty_slider_full = gr.Slider(minimum=0.1, maximum=1, step=0.1, value=DEFAULT_LP,
                                                                  label="Length Penalty",
                                                                  elem_id="length-penalty-slider")
                    default_full_df = apply_length_penalty(original_df, ablation_df, length_penalty=DEFAULT_LP)
                    # do not show the "# battles" column here
                    default_full_df = default_full_df.drop(
                        columns=["Overall Elo", "Task-Avg Elo", "# battles", "Length"])
                    default_full_df = add_winrates_tasks(default_full_df, ref="gpt-3.5")

                    leaderboard_table_full = gr.components.Dataframe(
                        value=default_full_df,
                        datatype=TYPES,
                        # max_rows=None,
                        height=1000,
                        elem_id="leaderboard-table-full_table",
                        interactive=False,
                        visible=True,
                        min_width=60,
                    )
                    show_winrate = gr.Checkbox(value="gpt-3.5", visible=False)
                    length_penlty_slider_full.change(fn=slider_change_full,
                                                     inputs=[length_penlty_slider_full, show_winrate],
                                                     outputs=[leaderboard_table_full])

                with gr.TabItem("All Tasks (Win% vs GPT-4)", elem_id="od-benchmark-tab-table-ablation", id=2):
                    with gr.Row():
                        with gr.Column(scale=4):
                            gr.Markdown(TASK_TYPE_STR, elem_classes="markdown-text-small top-left-LP")
                        with gr.Column(scale=0.8):
                            length_penlty_slider_full = gr.Slider(minimum=0.1, maximum=1, step=0.1, value=DEFAULT_LP,
                                                                  label="Length Penalty",
                                                                  elem_id="length-penalty-slider")
                    default_full_df = apply_length_penalty(original_df, ablation_df, length_penalty=DEFAULT_LP)
                    # do not show the "# battles" column here
                    default_full_df = default_full_df.drop(
                        columns=["Overall Elo", "Task-Avg Elo", "# battles", "Length"])
                    default_full_df = add_winrates_tasks(default_full_df, ref="gpt-4")
                    leaderboard_table_full = gr.components.Dataframe(
                        value=default_full_df,
                        datatype=TYPES,
                        # max_rows=None,
                        height=1000,
                        elem_id="leaderboard-table-full_table",
                        interactive=False,
                        visible=True,
                        min_width=60,
                    )
                    show_winrate = gr.Checkbox(value="gpt-4", visible=False)
                    length_penlty_slider_full.change(fn=slider_change_full,
                                                     inputs=[length_penlty_slider_full, show_winrate],
                                                     outputs=[leaderboard_table_full])

                with gr.TabItem("All Tasks (Elo)", elem_id="od-benchmark-tab-table-ablation", id=3):
                    with gr.Row():
                        with gr.Column(scale=4):
                            gr.Markdown(TASK_TYPE_STR, elem_classes="markdown-text-small top-left-LP")
                        with gr.Column(scale=0.8):
                            length_penlty_slider_full = gr.Slider(minimum=0.1, maximum=1, step=0.1, value=DEFAULT_LP,
                                                                  label="Length Penalty",
                                                                  elem_id="length-penalty-slider")
                    default_full_df = apply_length_penalty(original_df, ablation_df, length_penalty=DEFAULT_LP)
                    # do not show the "# battles" column here
                    default_full_df = default_full_df.drop(
                        columns=["Overall Elo", "Task-Avg Elo", "# battles", "Length"])
                    leaderboard_table_full = gr.components.Dataframe(
                        value=default_full_df,
                        datatype=TYPES,
                        # max_rows=None,
                        height=1000,
                        elem_id="leaderboard-table-full_table",
                        interactive=False,
                        visible=True,
                        min_width=60,
                    )
                    show_winrate = gr.Checkbox(value="none", visible=False)
                    length_penlty_slider_full.change(fn=slider_change_full,
                                                     inputs=[length_penlty_slider_full, show_winrate],
                                                     outputs=[leaderboard_table_full])

                # with gr.TabItem("Pairwise Win Rates", elem_id="od-benchmark-tab-table-ablation", id=4):
                #     # TODO: show all winrate
                #     # winrates_heatmap = pickle.load(open("data_dir/pairwise_win_fractions.pkl", "rb"))
                #     # gr.Plot(value=winrates_heatmap, scale=2, min_width=800, container=False, elem_classes="plotly-plot", visible=True)
                #     gr.HTML(WINRATE_HEATMAP, visible=True)

            with gr.TabItem("📖 Details", elem_id="od-benchmark-tab-table", id=1):
                gr.Markdown(INTRO_MD, elem_classes="markdown-text-details")

            with gr.TabItem("🔍 Explore | 🆚 Evaluate", elem_id="od-benchmark-tab-table", id=2):

                with gr.Row():
                    btn_show_history = gr.Button("🎲 Click here to sample an example + a pair of LLM outputs! ",
                                                 elem_classes="sample_button")

                with gr.Row():
                    with gr.Column(scale=1.5):
                        with gr.Accordion("Choose models to sample from", open=False, elem_classes="accordion-label"):
                            model_options = available_models
                            selected_models = gr.CheckboxGroup(model_options, info="", value=model_options,
                                                               show_label=False, elem_id="select-models")
                            clear_button = gr.Button("Clear", elem_classes="btn_boderline_gray", scale=1)
                            # clear the selected_models
                            clear_button.click(lambda: {selected_models: {"value": [], "__type__": "update"}},
                                               inputs=[], outputs=[selected_models])
                    with gr.Column(scale=1):
                        with gr.Accordion("Choose task types to sample from", open=False,
                                          elem_classes="accordion-label"):
                            select_tasks = gr.CheckboxGroup(all_task_types, info="", value=all_task_types,
                                                            show_label=False, elem_id="select-tasks")
                            clear_task_button = gr.Button("Clear", elem_classes="btn_boderline_gray", scale=1)
                            # clear the select_tasks
                            clear_task_button.click(lambda: {select_tasks: {"value": [], "__type__": "update"}},
                                                    inputs=[], outputs=[select_tasks])

                with gr.Row():
                    with gr.Column():
                        gr.Markdown("## 📢 Chat History", elem_classes="markdown-text")
                        Chatbot_Common = gr.Chatbot(avatar_images=["human_icon.jpeg", "ai_icon.png"], height="auto",
                                                    container=False, label="Common Chat History", likeable=False,
                                                    show_share_button=False, show_label=True,
                                                    elem_classes="chat-common", layout="bubble")
                        Chatbot_Common.change(lambda x: x, inputs=[], outputs=[], scroll_to_output=False, js=js_code)
                        with gr.Accordion("✍️ Task Annotation", elem_classes="accordion-label", open=False):
                            user_intent = gr.Markdown("", elem_classes="markdown-text-small")
                # two columns for the two models
                with gr.Row():
                    # https://www.gradio.app/docs/chatbot
                    with gr.Column():
                        gr.Markdown("## ⬅️ Model A Output", elem_classes="markdown-text")
                        Chatbot_A = gr.Chatbot(height="auto", container=False, label="Model A Output", likeable=False,
                                               show_share_button=False, show_label=True, elem_classes="chat-specific",
                                               layout="bubble")
                        Chatbot_A.change(lambda x: x, inputs=[], outputs=[], scroll_to_output=False, js=js_code)
                    with gr.Column():
                        # add a Markdown to show this is for Model B
                        gr.Markdown("## ➡️ Model B Output", elem_classes="markdown-text")
                        Chatbot_B = gr.Chatbot(height="auto", container=False, label="Model B Output", likeable=False,
                                               show_share_button=False, show_label=True, elem_classes="chat-specific",
                                               layout="bubble")
                        Chatbot_B.change(lambda x: x, inputs=[], outputs=[], scroll_to_output=False, js=js_code)
                with gr.Row():
                    # Here we can show the GPT-4 judgement for the model outputs
                    # show a textarea
                    with gr.Column():
                        with gr.Accordion("⏱️ Checklist", open=False, elem_classes="accordion-label"):
                            checklist = gr.Markdown("### Checklist: \n Will be shown later.",
                                                    elem_classes="markdown-text-tiny")
                        with gr.Accordion("⚖️ GPT-4 Judgement", open=False,
                                          elem_classes="accordion-label") as gpt4_accordion:
                            # gpt4_reason = gr.TextArea(label="GPT-4 Judgement", placeholder="Will be shown later.", type="text", elem_classes="", max_lines=10, show_copy_button=True)
                            gpt4_reason = gr.Markdown("Will be shown later.", elem_classes="markdown-text-tiny")

                with gr.Row():
                    # show buttons for user to choose which model output is better or Tie
                    btn_model_A = gr.Button("⬅️ Model A is better! ", elem_classes="btn_boderline_gray", scale=2,
                                            interactive=False)
                    btn_tie = gr.Button("🟰 Tie", elem_classes="btn_boderline_gray", scale=2, interactive=False)
                    btn_model_B = gr.Button("➡️ Model B is better!", elem_classes="btn_boderline_gray", scale=2,
                                            interactive=False)
                with gr.Row():
                    with gr.Column(scale=2):
                        reason_textbox = gr.Textbox(label="Reason", placeholder="Please input your reason here.",
                                                    type="text", elem_classes="", max_lines=10, lines=8,
                                                    show_copy_button=False, visible=True, scale=4, interactive=True)
                    with gr.Column():
                        with gr.Row():
                            user_choice = gr.Markdown("Your choice: N/A", elem_classes="markdown-text", visible=True)
                            btn_pass = gr.Button("🔁 Next", elem_classes="btn_boderline_next", scale=1)
                        user_name = gr.Textbox(label="Your HF Username", placeholder="Your HuggingFace username",
                                               type="text", elem_classes="", max_lines=1, show_copy_button=False,
                                               visible=True, interactive=True, show_label=False)
                        # login_btn = gr.LoginButton(visible=False, interactive=True, elem_classes="btn_boderline")
                        submit_button = gr.Button("Submit your feedback! 🚀", elem_classes="btn_boderline", visible=True,
                                                  interactive=False)
                        assignment = gr.Markdown("Model A: | Model B: ", elem_classes="markdown-text-tiny-red",
                                                 visible=False)

                session_id = gr.Textbox(label="Session ID", placeholder="N/A.", type="text", elem_classes="",
                                        max_lines=10, show_copy_button=False, visible=False)

                def show_reason_and_submit(session_id, user_name_text, btn, request: gr.Request):

                    if request.username is not None:
                        user_name_text = request.username
                    result_dict = {
                        reason_textbox: {"visible": True, "__type__": "update"},
                        submit_button: {"visible": True, "__type__": "update", "interactive": True},
                        user_name: {"visible": True, "__type__": "update", "value": user_name_text},
                    }
                    if "Model A" in btn:
                        choice = "Model A"
                        result_dict.update({
                            user_choice: {"value": f"Your choice: **{choice}**", "__type__": "update", "visible": True},
                            btn_model_A: {"elem_classes": "btn_boderline_selected", "__type__": "update"},
                            btn_model_B: {"elem_classes": "btn_boderline", "__type__": "update"},
                            btn_tie: {"elem_classes": "btn_boderline", "__type__": "update"},
                        })
                    elif "Model B" in btn:
                        choice = "Model B"
                        result_dict.update({
                            user_choice: {"value": f"Your choice: **{choice}**", "__type__": "update", "visible": True},
                            btn_model_B: {"elem_classes": "btn_boderline_selected", "__type__": "update"},
                            btn_model_A: {"elem_classes": "btn_boderline", "__type__": "update"},
                            btn_tie: {"elem_classes": "btn_boderline", "__type__": "update"},
                        })
                    elif "Tie" in btn:
                        choice = "Tie"
                        result_dict.update({
                            user_choice: {"value": f"Your choice: **{choice}**", "__type__": "update", "visible": True},
                            btn_tie: {"elem_classes": "btn_boderline_selected", "__type__": "update"},
                            btn_model_A: {"elem_classes": "btn_boderline", "__type__": "update"},
                            btn_model_B: {"elem_classes": "btn_boderline", "__type__": "update"},
                        })
                    else:
                        choice = "N/A"
                        result_dict.update({
                            user_choice: {"value": f"Your choice: **{choice}**", "__type__": "update", "visible": True},
                        })
                    return result_dict

                btn_model_A.click(show_reason_and_submit, inputs=[session_id, user_name, btn_model_A],
                                  outputs=[user_choice, reason_textbox, submit_button, user_name, btn_model_A, btn_tie,
                                           btn_model_B])
                btn_tie.click(show_reason_and_submit, inputs=[session_id, user_name, btn_tie],
                              outputs=[user_choice, reason_textbox, submit_button, user_name, btn_model_A, btn_tie,
                                       btn_model_B])
                btn_model_B.click(show_reason_and_submit, inputs=[session_id, user_name, btn_model_B],
                                  outputs=[user_choice, reason_textbox, submit_button, user_name, btn_model_A, btn_tie,
                                           btn_model_B])

                def submit_feedback(session_id, user_reason, user_choice, user_name_text, assignment_string,
                                    request: gr.Request):
                    if "N/A" in session_id or "N/A" in user_choice:
                        # send a message to the user to sample an example and select a choice first
                        return {
                            submit_button: {"interactive": True, "__type__": "update",
                                            "value": "Submit your feedback! 🚀 Please sample an example and select a choice!"},
                        }
                    # create a jsonl file and upload it to hf
                    choice_str = ""
                    if "Model A" in user_choice:
                        choice_str = "Model A"
                    elif "Model B" in user_choice:
                        choice_str = "Model B"
                    elif "Tie" in user_choice:
                        choice_str = "Tie"
                    else:
                        choice_str = "N/A"
                    if user_name_text == "" and request.username is None:
                        user_name_text = "Anonymous"
                    if request.username is not None:
                        user_name_text = request.username
                    feedback_item = {
                        "session_id": session_id,
                        "user_name": user_name_text,
                        "user_reason": user_reason,
                        "user_choice": choice_str,
                        "ip": request.client.host,
                        "assignment_string": assignment_string
                    }
                    jsonl_str = json.dumps(feedback_item)
                    api = HfApi()
                    token = os.getenv("HF_TOKEN")
                    if token is None:
                        raise ValueError(
                            "Hugging Face token not found. Ensure the HF_TOKEN environment variable is set.")

                    # Generate a random filename using UUID
                    filename = f"{uuid.uuid4()}.json"

                    # Define the repository
                    repo_id = "WildEval/WildBench-HumanFeedback"

                    # Upload the json_str as a file directly to the specified path in your dataset repository
                    api.upload_file(
                        token=token,
                        repo_id=repo_id,
                        repo_type="dataset",
                        path_or_fileobj=jsonl_str.encode("utf-8"),  # Convert string to bytes
                        path_in_repo=filename,
                        commit_message=f"Add user feedback for session_id: {session_id}. Assignment: {assignment_string}",
                    )
                    return {
                        submit_button: {"interactive": False, "__type__": "update",
                                        "value": "Submitted! ✅\n Please click 🔁 Next."},
                        reason_textbox: {"interactive": False, "__type__": "update"},
                        btn_model_A: {"interactive": False, "__type__": "update"},
                        btn_tie: {"interactive": False, "__type__": "update"},
                        btn_model_B: {"interactive": False, "__type__": "update"},
                        user_name: {"interactive": False, "__type__": "update"},
                        assignment: {"visible": True, "__type__": "update"}
                    }

                def reset_submission(session_id):
                    return {
                        submit_button: {"interactive": False, "__type__": "update", "value": "Submit your feedback! 🚀"},
                        reason_textbox: {"interactive": True, "__type__": "update", "value": ""},
                        btn_model_A: {"interactive": True, "__type__": "update", "elem_classes": "btn_boderline_gray"},
                        btn_tie: {"interactive": True, "__type__": "update", "elem_classes": "btn_boderline_gray"},
                        btn_model_B: {"interactive": True, "__type__": "update", "elem_classes": "btn_boderline_gray"},
                        user_name: {"interactive": True, "__type__": "update"},
                        user_choice: {"value": "Your choice: N/A", "__type__": "update"},
                        assignment: {"__type__": "update", "visible": False},
                        gpt4_accordion: {"__type__": "update", "open": False},
                    }

                # reset the reason_textbox, submit_button, and btn_model_A
                session_id.change(reset_submission, inputs=[session_id],
                                  outputs=[submit_button, reason_textbox, btn_model_A, btn_tie, btn_model_B, user_name,
                                           user_choice, assignment, gpt4_accordion])
                submit_button.click(submit_feedback,
                                    inputs=[session_id, reason_textbox, user_choice, user_name, assignment],
                                    outputs=[submit_button, reason_textbox, btn_model_A, btn_tie, btn_model_B,
                                             user_name, assignment])

                # Display chat history when button is clicked
                # TODO: add the model list and tag list
                btn_show_history.click(fn=display_chat_history, inputs=[selected_models, select_tasks],
                                       outputs=[session_id, user_intent, Chatbot_Common, Chatbot_A, Chatbot_B,
                                                gpt4_reason, checklist, assignment])
                btn_pass.click(fn=display_chat_history, inputs=[selected_models, select_tasks],
                               outputs=[session_id, user_intent, Chatbot_Common, Chatbot_A, Chatbot_B, gpt4_reason,
                                        checklist,
                                        assignment])  # the pass button will be the same function of resampling

            with gr.TabItem("🔮 About Us", elem_id="od-benchmark-tab-table", id=3):
                gr.Markdown(ABOUT_MD, elem_classes="markdown-text")
        gr.Markdown(f"Last updated on **{LAST_UPDATED}**", elem_classes="markdown-text-small")

        with gr.Row():
            with gr.Accordion("📙 Citation", open=False, elem_classes="accordion-label"):
                gr.Textbox(
                    value=CITATION_TEXT,
                    lines=7,
                    label="Copy the BibTeX snippet to cite this source",
                    elem_id="citation-button",
                    show_copy_button=True)
                # ).style(show_copy_button=True)

    return demo


if __name__ == "__main__":
    parser = argparse.ArgumentParser()
    parser.add_argument("--share", action="store_true")
    parser.add_argument("--result_file", help="Path to results table", default="data_dir/elo_ranks.all.jsonl")
    parser.add_argument("--length_balation_file", help="Path to results table",
                        default="data_dir/elo_ranks.length_ablation.all.jsonl")
    parser.add_argument("--skip_empty_result_file", help="Path to results table",
                        default="data_dir/elo_ranks.skip_empty.all.jsonl")
    parser.add_argument("--skip_empty_length_balation_file", help="Path to results table",
                        default="data_dir/elo_ranks.skip_empty.length_ablation.all.jsonl")
    args = parser.parse_args()

    LAST_UPDATED = datetime.fromtimestamp(Path(args.result_file).stat().st_mtime, tz=timezone.utc).strftime(
        "%Y-%m-%d %H:%M:%S")

    original_df = pd.read_json(args.result_file, lines=True)
    ablation_df = pd.read_json(args.length_balation_file, lines=True)
    skip_empty_original_df = pd.read_json(args.skip_empty_result_file, lines=True)
    skip_empty_ablation_df = pd.read_json(args.skip_empty_length_balation_file, lines=True)

    # available_models = sorted(list(set(list(original_df["model name "]))))
    available_models = list(model_info.keys())
    # remove the rows where the model name is not in the available_models
    original_df = original_df[original_df["model name "].isin(available_models)]
    ablation_df = ablation_df[ablation_df["model name "].isin(available_models)]
    skip_empty_ablation_df = skip_empty_ablation_df[skip_empty_ablation_df["model name "].isin(available_models)]
    skip_empty_original_df = skip_empty_original_df[skip_empty_original_df["model name "].isin(available_models)]

    model_len_info = json.load(open("model_len_info.json", "r"))

    original_df = post_processing(original_df, model_len_info)
    ablation_df = post_processing(ablation_df, model_len_info)
    skip_empty_original_df = post_processing(skip_empty_original_df, model_len_info)
    skip_empty_ablation_df = post_processing(skip_empty_ablation_df, model_len_info)

    TYPES = ["markdown", "number"]

    demo = build_demo(TYPES)
    demo.launch(share=args.share, height=1000)
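Usage note: app_wildbench.py is configured entirely through its argparse flags, and the defaults expect the Elo ranking tables under data_dir/ plus a model_len_info.json next to the script, none of which are part of this commit. Assuming those files exist, a typical local launch could look like:

python app_wildbench.py --share --result_file data_dir/elo_ranks.all.jsonl --length_balation_file data_dir/elo_ranks.length_ablation.all.jsonl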