Spaces:
LinB203 committed
Commit · 850ad91
1 Parent(s): ebc5bbb
demo
Browse files:
- app.py +5 -45
- constants.py +9 -14
- file/result.csv +10 -2
- file/sample_to_upload.csv +1 -1
- src/__pycache__/utils_display.cpython-38.pyc +0 -0
- src/auto_leaderboard/__pycache__/model_metadata_type.cpython-38.pyc +0 -0
app.py
CHANGED
@@ -21,25 +21,16 @@ def add_new_eval(
     input_file,
     model_name_textbox: str,
     revision_name_textbox: str,
-    model_type: str,
     model_link: str,
-    LLM_type: str,
-    LLM_name_textbox: str,
 ):
     if input_file is None:
         return "Error! Empty file!"
     else:
         input_data = input_file.decode("utf-8").split('\n')[1].split(',')
-        input_data = [
+        input_data = [float(i) for i in input_data]
 
         csv_data = pd.read_csv(CSV_DIR)
 
-
-        if LLM_type == 'Other':
-            LLM_name = LLM_name_textbox
-        else:
-            LLM_name = LLM_type
-
         if revision_name_textbox == '':
             col = csv_data.shape[0]
             model_name = model_name_textbox
@@ -50,7 +41,7 @@ def add_new_eval(
         if revision_name_textbox not in name_list:
             col = csv_data.shape[0]
         else:
-            col = name_list.index(revision_name_textbox)
+            col = name_list.index(revision_name_textbox)
 
         if model_link == '':
             model_name = model_name  # no url
@@ -59,9 +50,7 @@ def add_new_eval(
 
         # add new data
         new_data = [
-
-            model_name,
-            LLM_name,
+            model_name,
             input_data[0],
             input_data[1],
             input_data[2],
@@ -86,14 +75,14 @@ def add_new_eval(
 
 def get_baseline_df():
     # pdb.set_trace()
-    df = pd.read_csv(CSV_DIR
+    df = pd.read_csv(CSV_DIR)
     df = df.sort_values(by="Avg. All", ascending=False)
     present_columns = MODEL_INFO + checkbox_group.value
     df = df[present_columns]
     return df
 
 def get_all_df():
-    df = pd.read_csv(CSV_DIR
+    df = pd.read_csv(CSV_DIR)
     df = df.sort_values(by="Avg. All", ascending=False)
     return df
 
@@ -182,32 +171,6 @@ with block:
             revision_name_textbox = gr.Textbox(
                 label="Revision Model Name", placeholder="LLaMA-7B"
             )
-            model_type = gr.Dropdown(
-                choices=[
-                    "LLM",
-                    "ImageLLM",
-                    "VideoLLM",
-                    "Other",
-                ],
-                label="Model type",
-                multiselect=False,
-                value="ImageLLM",
-                interactive=True,
-            )
-
-            with gr.Column():
-
-                LLM_type = gr.Dropdown(
-                    choices=["Vicuna-7B", "Flan-T5-XL", "LLaMA-7B", "Other"],
-                    label="LLM type",
-                    multiselect=False,
-                    value="LLaMA-7B",
-                    interactive=True,
-                )
-                LLM_name_textbox = gr.Textbox(
-                    label="LLM model (for Other)",
-                    placeholder="LLaMA-13B"
-                )
             model_link = gr.Textbox(
                 label="Model Link", placeholder="https://huggingface.co/decapoda-research/llama-7b-hf"
             )
@@ -224,10 +187,7 @@ with block:
             input_file,
             model_name_textbox,
             revision_name_textbox,
-            model_type,
             model_link,
-            LLM_type,
-            LLM_name_textbox,
         ],
         # outputs = submission_result,
     )
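Taken together, these hunks cut the submission form down to four inputs and make add_new_eval treat the uploaded file as pure numbers. A minimal sketch of the resulting flow, assuming pandas and the unchanged parts of app.py; the write-back and the return value here are illustrative stand-ins, not lines from the commit:

import pandas as pd

CSV_DIR = "./file/result.csv"  # from constants.py

def add_new_eval(input_file, model_name_textbox: str,
                 revision_name_textbox: str, model_link: str):
    if input_file is None:
        return "Error! Empty file!"
    # The upload is a header line plus one row of scores: keep the
    # second line and coerce every field to float, as the new + line does.
    input_data = input_file.decode("utf-8").split('\n')[1].split(',')
    input_data = [float(i) for i in input_data]

    csv_data = pd.read_csv(CSV_DIR)
    # Illustrative write-back: append a fresh row (the real code also
    # reuses an existing row's index when the revision name is known).
    csv_data.loc[csv_data.shape[0]] = [model_name_textbox] + input_data
    csv_data.to_csv(CSV_DIR, index=False)
    return "Success!"  # hypothetical; the commit does not show the return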
constants.py
CHANGED
@@ -1,5 +1,5 @@
 # this is .py for store constants
-MODEL_INFO = ["Model
+MODEL_INFO = ["Model"]
 
 TASK_INFO_v2 = ["Avg. All", "Avg. Video-Exclusive", "Avg. Prior-Knowledge QA", "Avg. Decision-Making",
                 "ActivityNet", "MSVD", "MSRVTT", "TGIF", "Youcook2", "Ucfcrime", "MOT",
@@ -7,16 +7,16 @@ TASK_INFO_v2 = ["Avg. All", "Avg. Video-Exclusive", "Avg. Prior-Knowledge QA", "
                 "Driving-exam", "Driving-decision-making", "SQA3D"]
 
 AVG_INFO = ["Avg. All", "Avg. Video-Exclusive", "Avg. Prior-Knowledge QA", "Avg. Decision-Making"]
-DATA_TITILE_TYPE = ["markdown",
+DATA_TITILE_TYPE = ["markdown",
                     "number", "number", "number", "number", "number", "number", "number",
                     "number", "number", "number",
-                    "number", "number", "number"
+                    "number", "number", "number",
+                    "number", "number", "number", "number", ]
 CSV_DIR = "./file/result.csv"
 
 # COLUMN_NAMES = MODEL_INFO + TASK_INFO
 COLUMN_NAMES = MODEL_INFO + TASK_INFO_v2
 
-
 LEADERBORAD_INTRODUCTION = """# Video-Bench Leaderboard
 
 Welcome to the leaderboard of the Video-Bench!
@@ -35,14 +35,11 @@ SUBMIT_INTRODUCTION = """# Submit Introduction
 5. After clicking 'Submit Eval', you can click 'Refresh' to obtain the latest result in the leaderboard.
 
 ## Submit Example
-For example, if you want to upload
-1. Fill in '
-2. Fill in '
-
-
-4. Select 'Flan-T5-XL' in 'LLM Type'.
-5. Select 'All' in 'Evaluation Dimension'.
-6. Upload results.json.
+For example, if you want to upload Video-ChatGPT's result in the leaderboard, you need to:
+1. Fill in 'Video-ChatGPT' in 'Model Name' if it is your first time to submit your result (You can leave 'Revision Model Name' blank).
+2. Fill in 'Video-ChatGPT' in 'Revision Model Name' if you want to update your result (You can leave 'Model Name' blank).
+3. Fill in 'https://github.com/x/x' in 'Model Link'.
+6. Upload results.csv.
 7. Click the 'Submit Eval' button.
 8. Click 'Refresh' to obtain the uploaded leaderboard.
 """
@@ -61,8 +58,6 @@ LEADERBORAD_INFO = """
 By revealing the limitations of existing MLLMs through evaluation results, we aim for SEED-Bench to provide insights for motivating future research.
 """
 
-
-
 CITATION_BUTTON_LABEL = "Copy the following snippet to cite these results"
 CITATION_BUTTON_TEXT = r"""@misc{ning2023videobench,
     title={Video-Bench: A Comprehensive Benchmark and Toolkit for Evaluating Video-based Large Language Models},
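The net effect in constants.py is that DATA_TITILE_TYPE regains its closing bracket and four more "number" entries, so it once again pairs one display type with each of the 18 leaderboard columns (the markdown Model column plus 17 numeric task columns). A standalone sanity check, with values copied from constants.py and the result.csv header; the assert itself is illustrative, not part of the repo:

MODEL_INFO = ["Model"]
TASK_INFO_v2 = ["Avg. All", "Avg. Video-Exclusive", "Avg. Prior-Knowledge QA", "Avg. Decision-Making",
                "ActivityNet", "MSVD", "MSRVTT", "TGIF", "Youcook2", "Ucfcrime", "MOT",
                "TVQA", "MV", "NBA",
                "Driving-exam", "Driving-decision-making", "SQA3D"]
COLUMN_NAMES = MODEL_INFO + TASK_INFO_v2            # 18 columns in total
DATA_TITILE_TYPE = ["markdown"] + ["number"] * 17   # 18 types after this commit

# Each leaderboard column needs exactly one datatype entry.
assert len(DATA_TITILE_TYPE) == len(COLUMN_NAMES) == 18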
file/result.csv
CHANGED
@@ -1,2 +1,10 @@
-Model
-
+Model,Avg. All,Avg. Video-Exclusive,Avg. Prior-Knowledge QA,Avg. Decision-Making,ActivityNet,MSVD,MSRVTT,TGIF,Youcook2,Ucfcrime,MOT,TVQA,MV,NBA,Driving-exam,Driving-decision-making,SQA3D
+Random,28.45459441,25.84861538,24.47045673,35.04471112,0.3458,0.26224,0.265,0.22377,0.25,0.25,0.1667,0.2,0.26151895,0.272594752,0.368055556,0.44209,0.25
+GPT-VideoChat,35.41215477,34.12376923,29.60966667,42.5030284,0.4455,0.4215,0.374,0.33744,0.27663,0.2241,0.27775,0.2615,0.34109,0.2857,0.388888,0.553846,0.31428571
+GPT-Video-ChatGPT,38.5186297,39.81651709,29.244,46.495372,0.466,0.575,0.463,0.3559,0.348,0.2413,0.277747222,0.28764,0.3652,0.22448,0.4166666,0.582051,0.372
+GPT-Otter,37.47000387,37.51728162,32.99,41.90273,0.443,0.5495,0.4695,0.34266,0.3265,0.22413,0.166666611,0.2765,0.370635,0.342565,0.5277777,0.4871794,0.2965
+GPT-PandaGPT,37.52393217,37.53914677,31.98733333,43.0453164,0.449624,0.5042521,0.44594594,0.29663,0.33016,0.3301,0.166665,0.2785,0.37063,0.31049,0.4166,0.5602564,0.30757651
+GPT-Valley,33.95521521,28.38772829,29.20933333,44.268584,0.381,0.32032,0.2802802,0.3141,0.2905,0.203448,0.111108278,0.237,0.32587,0.31341,0.41666,0.5653846,0.333
+GPT-mPLUG-owl,33.14659856,33.16526701,26.39762867,39.8769,0.41470735,0.4245,0.363,0.31656,0.2705,0.2275862,0.277777611,0.2395,0.3017,0.25072886,0.333333,0.510256,0.32
+GPT-VideoLLaMA,32.83174044,32.48401966,27.79906667,38.212135,0.3985,0.4115,0.3405,0.312766,0.289,0.275862,0.166666556,0.2475,0.324082,0.26239,0.30555555,0.4910256,0.3115
+GPT-Chat-UniVi,35.31147004,37.87,27.43,40.64,0.49,0.486,0.4165,0.413,0.29,0.2827,0.166666649,0.2305,0.3357,0.2566,0.3889,0.5308,0.2907
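With result.csv now populated, the corrected get_all_df/get_baseline_df calls in app.py reduce to a read-and-sort over this file. A minimal standalone sketch; the head(3) print is only for illustration:

import pandas as pd

df = pd.read_csv("./file/result.csv")
df = df.sort_values(by="Avg. All", ascending=False)
print(df[["Model", "Avg. All"]].head(3))
# Expected top three from the rows above: GPT-Video-ChatGPT (~38.52),
# GPT-PandaGPT (~37.52), GPT-Otter (~37.47).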
file/sample_to_upload.csv
CHANGED
@@ -1,2 +1,2 @@
 Avg. All,Avg. Video-Exclusive,Avg. Prior-Knowledge QA,Avg. Decision-Making,ActivityNet,MSVD,MSRVTT,TGIF,Youcook2,Ucfcrime,MOT,TVQA,MV,NBA,Driving-exam,Driving-decision-making,SQA3D
-
+40,2,3,1,1,1,1,1,1,1,1,1,1,1,1,1,1
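This sample row is exactly what the updated parser in add_new_eval expects: skip the header line, then coerce the 17 comma-separated scores to floats. A self-contained check, with raw standing in for the bytes gradio hands to the callback:

raw = ("Avg. All,Avg. Video-Exclusive,Avg. Prior-Knowledge QA,Avg. Decision-Making,"
       "ActivityNet,MSVD,MSRVTT,TGIF,Youcook2,Ucfcrime,MOT,TVQA,MV,NBA,"
       "Driving-exam,Driving-decision-making,SQA3D\n"
       "40,2,3,1,1,1,1,1,1,1,1,1,1,1,1,1,1").encode("utf-8")

input_data = raw.decode("utf-8").split('\n')[1].split(',')
input_data = [float(i) for i in input_data]
assert len(input_data) == 17 and input_data[0] == 40.0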
src/__pycache__/utils_display.cpython-38.pyc
DELETED
Binary file (4.22 kB)
src/auto_leaderboard/__pycache__/model_metadata_type.cpython-38.pyc
DELETED
Binary file (1.17 kB)