LinB203 committed on
Commit 850ad91 · 1 Parent(s): ebc5bbb
app.py CHANGED
@@ -21,25 +21,16 @@ def add_new_eval(
    input_file,
    model_name_textbox: str,
    revision_name_textbox: str,
-   model_type: str,
    model_link: str,
-   LLM_type: str,
-   LLM_name_textbox: str,
):
    if input_file is None:
        return "Error! Empty file!"
    else:
        input_data = input_file.decode("utf-8").split('\n')[1].split(',')
-       input_data = [str(i) for i in input_data]
+       input_data = [float(i) for i in input_data]

        csv_data = pd.read_csv(CSV_DIR)

-
-       if LLM_type == 'Other':
-           LLM_name = LLM_name_textbox
-       else:
-           LLM_name = LLM_type
-
        if revision_name_textbox == '':
            col = csv_data.shape[0]
            model_name = model_name_textbox
@@ -50,7 +41,7 @@ def add_new_eval(
            if revision_name_textbox not in name_list:
                col = csv_data.shape[0]
            else:
-               col = name_list.index(revision_name_textbox)
+               col = name_list.index(revision_name_textbox)

        if model_link == '':
            model_name = model_name  # no url
@@ -59,9 +50,7 @@ def add_new_eval(

        # add new data
        new_data = [
-           model_type,
-           model_name,
-           LLM_name,
+           model_name,
            input_data[0],
            input_data[1],
            input_data[2],
@@ -86,14 +75,14 @@

def get_baseline_df():
    # pdb.set_trace()
-   df = pd.read_csv(CSV_DIR, dtype=str)
+   df = pd.read_csv(CSV_DIR)
    df = df.sort_values(by="Avg. All", ascending=False)
    present_columns = MODEL_INFO + checkbox_group.value
    df = df[present_columns]
    return df

def get_all_df():
-   df = pd.read_csv(CSV_DIR, dtype=str)
+   df = pd.read_csv(CSV_DIR)
    df = df.sort_values(by="Avg. All", ascending=False)
    return df
@@ -182,32 +171,6 @@ with block:
        revision_name_textbox = gr.Textbox(
            label="Revision Model Name", placeholder="LLaMA-7B"
        )
-       model_type = gr.Dropdown(
-           choices=[
-               "LLM",
-               "ImageLLM",
-               "VideoLLM",
-               "Other",
-           ],
-           label="Model type",
-           multiselect=False,
-           value="ImageLLM",
-           interactive=True,
-       )
-
-       with gr.Column():
-           LLM_type = gr.Dropdown(
-               choices=["Vicuna-7B", "Flan-T5-XL", "LLaMA-7B", "Other"],
-               label="LLM type",
-               multiselect=False,
-               value="LLaMA-7B",
-               interactive=True,
-           )
-           LLM_name_textbox = gr.Textbox(
-               label="LLM model (for Other)",
-               placeholder="LLaMA-13B"
-           )
        model_link = gr.Textbox(
            label="Model Link", placeholder="https://huggingface.co/decapoda-research/llama-7b-hf"
        )
@@ -224,10 +187,7 @@ with block:
        input_file,
        model_name_textbox,
        revision_name_textbox,
-       model_type,
        model_link,
-       LLM_type,
-       LLM_name_textbox,
    ],
    # outputs = submission_result,
    )
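
The substantive change in add_new_eval is the parsing of the uploaded file: scores are now converted to floats instead of being kept as strings, matching the numeric result.csv format introduced below. A minimal sketch of the new parsing path, assuming an upload shaped like file/sample_to_upload.csv (raw_bytes is an illustrative stand-in for the bytes Gradio passes in as input_file, not a name from the commit):

# Illustrative stand-in for the bytes Gradio hands to add_new_eval as input_file.
raw_bytes = b"Avg. All,Avg. Video-Exclusive,Avg. Prior-Knowledge QA\n40,2,3\n"

# Same parsing steps as the committed code: skip the header row, split the score row.
score_row = raw_bytes.decode("utf-8").split('\n')[1].split(',')

# New behavior: floats instead of strings, so downstream numeric handling is sound.
input_data = [float(v) for v in score_row]
print(input_data)  # [40.0, 2.0, 3.0]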
constants.py CHANGED
@@ -1,5 +1,5 @@
 # this is .py for store constants
-MODEL_INFO = ["Model Type", "Model", "Language Model"]
+MODEL_INFO = ["Model"]

TASK_INFO_v2 = ["Avg. All", "Avg. Video-Exclusive", "Avg. Prior-Knowledge QA", "Avg. Decision-Making",
                "ActivityNet", "MSVD", "MSRVTT", "TGIF", "Youcook2", "Ucfcrime", "MOT",
@@ -7,16 +7,16 @@ TASK_INFO_v2 = ["Avg. All", "Avg. Video-Exclusive", "Avg. Prior-Knowledge QA", "
                "Driving-exam", "Driving-decision-making", "SQA3D"]

AVG_INFO = ["Avg. All", "Avg. Video-Exclusive", "Avg. Prior-Knowledge QA", "Avg. Decision-Making"]
-DATA_TITILE_TYPE = ["markdown", "markdown", "markdown", "markdown",
+DATA_TITILE_TYPE = ["markdown",
                    "number", "number", "number", "number", "number", "number", "number",
                    "number", "number", "number",
-                   "number", "number", "number"]
+                   "number", "number", "number",
+                   "number", "number", "number", "number", ]
CSV_DIR = "./file/result.csv"

# COLUMN_NAMES = MODEL_INFO + TASK_INFO
COLUMN_NAMES = MODEL_INFO + TASK_INFO_v2

-
LEADERBORAD_INTRODUCTION = """# Video-Bench Leaderboard

Welcome to the leaderboard of the Video-Bench! 🏆
@@ -35,14 +35,11 @@ SUBMIT_INTRODUCTION = """# Submit Introduction
5. After clicking 'Submit Eval', you can click 'Refresh' to obtain the latest result in the leaderboard.

## Submit Example
-For example, if you want to upload InstructBLIP's result in the leaderboard, you need to:
-1. Fill in 'InstructBLIP' in 'Model Name' if it is your first time to submit your result (You can leave 'Revision Model Name' blank).
-2. Fill in 'InstructBLIP' in 'Revision Model Name' if you want to update your result (You can leave 'Model Name' blank).
-2. Select 'ImageLLM' in 'Model Type'.
-3. Fill in 'https://github.com/salesforce/LAVIS' in 'Model Link'.
-4. Select 'Flan-T5-XL' in 'LLM Type'.
-5. Select 'All' in 'Evaluation Dimension'.
-6. Upload results.json.
-7. Click the 'Submit Eval' button.
-8. Click 'Refresh' to obtain the uploaded leaderboard.
+For example, if you want to upload Video-ChatGPT's result to the leaderboard, you need to:
+1. Fill in 'Video-ChatGPT' in 'Model Name' if this is your first submission (you can leave 'Revision Model Name' blank).
+2. Fill in 'Video-ChatGPT' in 'Revision Model Name' if you want to update your result (you can leave 'Model Name' blank).
+3. Fill in 'https://github.com/x/x' in 'Model Link'.
+4. Upload results.csv.
+5. Click the 'Submit Eval' button.
+6. Click 'Refresh' to obtain the uploaded leaderboard.
"""
@@ -61,8 +58,6 @@ LEADERBORAD_INFO = """
By revealing the limitations of existing MLLMs through evaluation results, we aim for SEED-Bench to provide insights for motivating future research.
"""

-
-
CITATION_BUTTON_LABEL = "Copy the following snippet to cite these results"
CITATION_BUTTON_TEXT = r"""@misc{ning2023videobench,
    title={Video-Bench: A Comprehensive Benchmark and Toolkit for Evaluating Video-based Large Language Models},
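
Since the commit drops the 'Model Type' and 'Language Model' columns, a leaderboard row is now one markdown cell plus seventeen numeric cells, so DATA_TITILE_TYPE must stay the same length as COLUMN_NAMES. A small alignment check reconstructed from the constants above (the middle entries of TASK_INFO_v2 hidden by the diff are filled in from the result.csv header; the assert is illustrative, not part of the commit):

MODEL_INFO = ["Model"]
TASK_INFO_v2 = ["Avg. All", "Avg. Video-Exclusive", "Avg. Prior-Knowledge QA", "Avg. Decision-Making",
                "ActivityNet", "MSVD", "MSRVTT", "TGIF", "Youcook2", "Ucfcrime", "MOT",
                "TVQA", "MV", "NBA",
                "Driving-exam", "Driving-decision-making", "SQA3D"]

# One "markdown" type for the Model column, one "number" per score column.
DATA_TITILE_TYPE = ["markdown"] + ["number"] * len(TASK_INFO_v2)

COLUMN_NAMES = MODEL_INFO + TASK_INFO_v2
assert len(DATA_TITILE_TYPE) == len(COLUMN_NAMES)  # 18 columns either way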
file/result.csv CHANGED
@@ -1,2 +1,10 @@
-Model Type,Model,Language Model,Avg. All,Avg. Video-Exclusive,Avg. Prior-Knowledge QA,Avg. Decision-Making,ActivityNet,MSVD,MSRVTT,TGIF,Youcook2,Ucfcrime,MOT,TVQA,MV,NBA,Driving-exam,Driving-decision-making,SQA3D
-LLM,[Flan-T5](https://huggingface.co/google/flan-t5-xl),Flan-T5-XL,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1
+Model,Avg. All,Avg. Video-Exclusive,Avg. Prior-Knowledge QA,Avg. Decision-Making,ActivityNet,MSVD,MSRVTT,TGIF,Youcook2,Ucfcrime,MOT,TVQA,MV,NBA,Driving-exam,Driving-decision-making,SQA3D
+Random,28.45459441,25.84861538,24.47045673,35.04471112,0.3458,0.26224,0.265,0.22377,0.25,0.25,0.1667,0.2,0.26151895,0.272594752,0.368055556,0.44209,0.25
+GPT-VideoChat,35.41215477,34.12376923,29.60966667,42.5030284,0.4455,0.4215,0.374,0.33744,0.27663,0.2241,0.27775,0.2615,0.34109,0.2857,0.388888,0.553846,0.31428571
+GPT-Video-ChatGPT,38.5186297,39.81651709,29.244,46.495372,0.466,0.575,0.463,0.3559,0.348,0.2413,0.277747222,0.28764,0.3652,0.22448,0.4166666,0.582051,0.372
+GPT-Otter,37.47000387,37.51728162,32.99,41.90273,0.443,0.5495,0.4695,0.34266,0.3265,0.22413,0.166666611,0.2765,0.370635,0.342565,0.5277777,0.4871794,0.2965
+GPT-PandaGPT,37.52393217,37.53914677,31.98733333,43.0453164,0.449624,0.5042521,0.44594594,0.29663,0.33016,0.3301,0.166665,0.2785,0.37063,0.31049,0.4166,0.5602564,0.30757651
+GPT-Valley,33.95521521,28.38772829,29.20933333,44.268584,0.381,0.32032,0.2802802,0.3141,0.2905,0.203448,0.111108278,0.237,0.32587,0.31341,0.41666,0.5653846,0.333
+GPT-mPLUG-owl,33.14659856,33.16526701,26.39762867,39.8769,0.41470735,0.4245,0.363,0.31656,0.2705,0.2275862,0.277777611,0.2395,0.3017,0.25072886,0.333333,0.510256,0.32
+GPT-VideoLLaMA,32.83174044,32.48401966,27.79906667,38.212135,0.3985,0.4115,0.3405,0.312766,0.289,0.275862,0.166666556,0.2475,0.324082,0.26239,0.30555555,0.4910256,0.3115
+GPT-Chat-UniVi,35.31147004,37.87,27.43,40.64,0.49,0.486,0.4165,0.413,0.29,0.2827,0.166666649,0.2305,0.3357,0.2566,0.3889,0.5308,0.2907
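
Dropping dtype=str in get_baseline_df and get_all_df is what makes the ranking of these rows correct: sorted as strings, "Avg. All" values compare character by character, so a hypothetical score of 9.0 would outrank 38.5. A standalone demonstration of the difference (not the app's code):

import pandas as pd

scores = pd.Series(["9.0", "38.5186297", "28.45459441"])

# String dtype: lexicographic order puts "9.0" first, which is wrong.
print(scores.sort_values(ascending=False).tolist())
# ['9.0', '38.5186297', '28.45459441']

# Numeric dtype (the behavior after this commit): 38.5... correctly ranks highest.
print(scores.astype(float).sort_values(ascending=False).tolist())
# [38.5186297, 9.0, 28.45459441]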
file/sample_to_upload.csv CHANGED
@@ -1,2 +1,2 @@
 Avg. All,Avg. Video-Exclusive,Avg. Prior-Knowledge QA,Avg. Decision-Making,ActivityNet,MSVD,MSRVTT,TGIF,Youcook2,Ucfcrime,MOT,TVQA,MV,NBA,Driving-exam,Driving-decision-making,SQA3D
-4,2,3,1,1,1,1,1,1,1,1,1,1,1,1,1,1
+40,2,3,1,1,1,1,1,1,1,1,1,1,1,1,1,1
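
For submitters, the sample file shows the exact shape add_new_eval expects: a header row with the seventeen score columns, then a single row of numbers. A hedged sketch of generating such a file with pandas (the score values are placeholders mirroring the sample row, not real results):

import pandas as pd

columns = ["Avg. All", "Avg. Video-Exclusive", "Avg. Prior-Knowledge QA", "Avg. Decision-Making",
           "ActivityNet", "MSVD", "MSRVTT", "TGIF", "Youcook2", "Ucfcrime", "MOT",
           "TVQA", "MV", "NBA", "Driving-exam", "Driving-decision-making", "SQA3D"]
scores = [40, 2, 3] + [1] * 14  # placeholder values, one per column

# One header row plus one score row, matching file/sample_to_upload.csv.
pd.DataFrame([scores], columns=columns).to_csv("results.csv", index=False)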
src/__pycache__/utils_display.cpython-38.pyc DELETED
Binary file (4.22 kB)
 
src/auto_leaderboard/__pycache__/model_metadata_type.cpython-38.pyc DELETED
Binary file (1.17 kB)