Spaces:
LinB203 committed
Commit · 850ad91
1 Parent(s): ebc5bbb
demo
Browse files:
- app.py +5 -45
- constants.py +9 -14
- file/result.csv +10 -2
- file/sample_to_upload.csv +1 -1
- src/__pycache__/utils_display.cpython-38.pyc +0 -0
- src/auto_leaderboard/__pycache__/model_metadata_type.cpython-38.pyc +0 -0
app.py
CHANGED
@@ -21,25 +21,16 @@ def add_new_eval(
     input_file,
     model_name_textbox: str,
     revision_name_textbox: str,
-    model_type: str,
     model_link: str,
-    LLM_type: str,
-    LLM_name_textbox: str,
 ):
     if input_file is None:
         return "Error! Empty file!"
     else:
         input_data = input_file.decode("utf-8").split('\n')[1].split(',')
-        input_data = [
+        input_data = [float(i) for i in input_data]
 
         csv_data = pd.read_csv(CSV_DIR)
 
-
-        if LLM_type == 'Other':
-            LLM_name = LLM_name_textbox
-        else:
-            LLM_name = LLM_type
-
         if revision_name_textbox == '':
             col = csv_data.shape[0]
             model_name = model_name_textbox
@@ -50,7 +41,7 @@ def add_new_eval(
         if revision_name_textbox not in name_list:
             col = csv_data.shape[0]
         else:
-            col = name_list.index(revision_name_textbox)
+            col = name_list.index(revision_name_textbox)
 
         if model_link == '':
             model_name = model_name  # no url
@@ -59,9 +50,7 @@ def add_new_eval(
 
         # add new data
         new_data = [
-
-            model_name,
-            LLM_name,
+            model_name,
             input_data[0],
             input_data[1],
             input_data[2],
@@ -86,14 +75,14 @@ def add_new_eval(
 
 def get_baseline_df():
     # pdb.set_trace()
-    df = pd.read_csv(CSV_DIR
+    df = pd.read_csv(CSV_DIR)
     df = df.sort_values(by="Avg. All", ascending=False)
     present_columns = MODEL_INFO + checkbox_group.value
     df = df[present_columns]
     return df
 
 def get_all_df():
-    df = pd.read_csv(CSV_DIR
+    df = pd.read_csv(CSV_DIR)
     df = df.sort_values(by="Avg. All", ascending=False)
     return df
 
@@ -182,32 +171,6 @@ with block:
             revision_name_textbox = gr.Textbox(
                 label="Revision Model Name", placeholder="LLaMA-7B"
             )
-            model_type = gr.Dropdown(
-                choices=[
-                    "LLM",
-                    "ImageLLM",
-                    "VideoLLM",
-                    "Other",
-                ],
-                label="Model type",
-                multiselect=False,
-                value="ImageLLM",
-                interactive=True,
-            )
-
-            with gr.Column():
-
-                LLM_type = gr.Dropdown(
-                    choices=["Vicuna-7B", "Flan-T5-XL", "LLaMA-7B", "Other"],
-                    label="LLM type",
-                    multiselect=False,
-                    value="LLaMA-7B",
-                    interactive=True,
-                )
-                LLM_name_textbox = gr.Textbox(
-                    label="LLM model (for Other)",
-                    placeholder="LLaMA-13B"
-                )
             model_link = gr.Textbox(
                 label="Model Link", placeholder="https://huggingface.co/decapoda-research/llama-7b-hf"
             )
@@ -224,10 +187,7 @@ with block:
             input_file,
             model_name_textbox,
             revision_name_textbox,
-            model_type,
             model_link,
-            LLM_type,
-            LLM_name_textbox,
         ],
         # outputs = submission_result,
     )
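Taken together, these hunks cut the submission form down to four inputs and make add_new_eval treat the uploaded file as pure numbers. A minimal sketch of the resulting flow, assuming pandas and the unchanged parts of app.py; the write-back and the return value here are illustrative stand-ins, not lines from the commit:

import pandas as pd

CSV_DIR = "./file/result.csv"  # from constants.py

def add_new_eval(input_file, model_name_textbox: str,
                 revision_name_textbox: str, model_link: str):
    if input_file is None:
        return "Error! Empty file!"
    # The upload is a header line plus one row of scores: keep the
    # second line and coerce every field to float, as the new + line does.
    input_data = input_file.decode("utf-8").split('\n')[1].split(',')
    input_data = [float(i) for i in input_data]

    csv_data = pd.read_csv(CSV_DIR)
    # Illustrative write-back: append a fresh row (the real code also
    # reuses an existing row's index when the revision name is known).
    csv_data.loc[csv_data.shape[0]] = [model_name_textbox] + input_data
    csv_data.to_csv(CSV_DIR, index=False)
    return "Success!"  # hypothetical; the commit does not show the return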
constants.py
CHANGED
@@ -1,5 +1,5 @@
 # this is .py for store constants
-MODEL_INFO = ["Model
+MODEL_INFO = ["Model"]
 
 TASK_INFO_v2 = ["Avg. All", "Avg. Video-Exclusive", "Avg. Prior-Knowledge QA", "Avg. Decision-Making",
                 "ActivityNet", "MSVD", "MSRVTT", "TGIF", "Youcook2", "Ucfcrime", "MOT",
@@ -7,16 +7,16 @@ TASK_INFO_v2 = ["Avg. All", "Avg. Video-Exclusive", "Avg. Prior-Knowledge QA", "
                 "Driving-exam", "Driving-decision-making", "SQA3D"]
 
 AVG_INFO = ["Avg. All", "Avg. Video-Exclusive", "Avg. Prior-Knowledge QA", "Avg. Decision-Making"]
-DATA_TITILE_TYPE = ["markdown",
+DATA_TITILE_TYPE = ["markdown",
                     "number", "number", "number", "number", "number", "number", "number",
                     "number", "number", "number",
-                    "number", "number", "number"
+                    "number", "number", "number",
+                    "number", "number", "number", "number", ]
 CSV_DIR = "./file/result.csv"
 
 # COLUMN_NAMES = MODEL_INFO + TASK_INFO
 COLUMN_NAMES = MODEL_INFO + TASK_INFO_v2
 
-
 LEADERBORAD_INTRODUCTION = """# Video-Bench Leaderboard
 
 Welcome to the leaderboard of the Video-Bench!
@@ -35,14 +35,11 @@ SUBMIT_INTRODUCTION = """# Submit Introduction
 5. After clicking 'Submit Eval', you can click 'Refresh' to obtain the latest result in the leaderboard.
 
 ## Submit Example
-For example, if you want to upload
-1. Fill in '
-2. Fill in '
-
-
-4. Select 'Flan-T5-XL' in 'LLM Type'.
-5. Select 'All' in 'Evaluation Dimension'.
-6. Upload results.json.
+For example, if you want to upload Video-ChatGPT's result in the leaderboard, you need to:
+1. Fill in 'Video-ChatGPT' in 'Model Name' if it is your first time to submit your result (You can leave 'Revision Model Name' blank).
+2. Fill in 'Video-ChatGPT' in 'Revision Model Name' if you want to update your result (You can leave 'Model Name' blank).
+3. Fill in 'https://github.com/x/x' in 'Model Link'.
+6. Upload results.csv.
 7. Click the 'Submit Eval' button.
 8. Click 'Refresh' to obtain the uploaded leaderboard.
 """
@@ -61,8 +58,6 @@ LEADERBORAD_INFO = """
 By revealing the limitations of existing MLLMs through evaluation results, we aim for SEED-Bench to provide insights for motivating future research.
 """
 
-
-
 CITATION_BUTTON_LABEL = "Copy the following snippet to cite these results"
 CITATION_BUTTON_TEXT = r"""@misc{ning2023videobench,
     title={Video-Bench: A Comprehensive Benchmark and Toolkit for Evaluating Video-based Large Language Models},
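The net effect in constants.py is that DATA_TITILE_TYPE regains its closing bracket and four more "number" entries, so it once again pairs one display type with each of the 18 leaderboard columns (the markdown Model column plus 17 numeric task columns). A standalone sanity check, with values copied from constants.py and the result.csv header; the assert itself is illustrative, not part of the repo:

MODEL_INFO = ["Model"]
TASK_INFO_v2 = ["Avg. All", "Avg. Video-Exclusive", "Avg. Prior-Knowledge QA", "Avg. Decision-Making",
                "ActivityNet", "MSVD", "MSRVTT", "TGIF", "Youcook2", "Ucfcrime", "MOT",
                "TVQA", "MV", "NBA",
                "Driving-exam", "Driving-decision-making", "SQA3D"]
COLUMN_NAMES = MODEL_INFO + TASK_INFO_v2            # 18 columns in total
DATA_TITILE_TYPE = ["markdown"] + ["number"] * 17   # 18 types after this commit

# Each leaderboard column needs exactly one datatype entry.
assert len(DATA_TITILE_TYPE) == len(COLUMN_NAMES) == 18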
file/result.csv
CHANGED
@@ -1,2 +1,10 @@
-Model
-
+Model,Avg. All,Avg. Video-Exclusive,Avg. Prior-Knowledge QA,Avg. Decision-Making,ActivityNet,MSVD,MSRVTT,TGIF,Youcook2,Ucfcrime,MOT,TVQA,MV,NBA,Driving-exam,Driving-decision-making,SQA3D
+Random,28.45459441,25.84861538,24.47045673,35.04471112,0.3458,0.26224,0.265,0.22377,0.25,0.25,0.1667,0.2,0.26151895,0.272594752,0.368055556,0.44209,0.25
+GPT-VideoChat,35.41215477,34.12376923,29.60966667,42.5030284,0.4455,0.4215,0.374,0.33744,0.27663,0.2241,0.27775,0.2615,0.34109,0.2857,0.388888,0.553846,0.31428571
+GPT-Video-ChatGPT,38.5186297,39.81651709,29.244,46.495372,0.466,0.575,0.463,0.3559,0.348,0.2413,0.277747222,0.28764,0.3652,0.22448,0.4166666,0.582051,0.372
+GPT-Otter,37.47000387,37.51728162,32.99,41.90273,0.443,0.5495,0.4695,0.34266,0.3265,0.22413,0.166666611,0.2765,0.370635,0.342565,0.5277777,0.4871794,0.2965
+GPT-PandaGPT,37.52393217,37.53914677,31.98733333,43.0453164,0.449624,0.5042521,0.44594594,0.29663,0.33016,0.3301,0.166665,0.2785,0.37063,0.31049,0.4166,0.5602564,0.30757651
+GPT-Valley,33.95521521,28.38772829,29.20933333,44.268584,0.381,0.32032,0.2802802,0.3141,0.2905,0.203448,0.111108278,0.237,0.32587,0.31341,0.41666,0.5653846,0.333
+GPT-mPLUG-owl,33.14659856,33.16526701,26.39762867,39.8769,0.41470735,0.4245,0.363,0.31656,0.2705,0.2275862,0.277777611,0.2395,0.3017,0.25072886,0.333333,0.510256,0.32
+GPT-VideoLLaMA,32.83174044,32.48401966,27.79906667,38.212135,0.3985,0.4115,0.3405,0.312766,0.289,0.275862,0.166666556,0.2475,0.324082,0.26239,0.30555555,0.4910256,0.3115
+GPT-Chat-UniVi,35.31147004,37.87,27.43,40.64,0.49,0.486,0.4165,0.413,0.29,0.2827,0.166666649,0.2305,0.3357,0.2566,0.3889,0.5308,0.2907
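With result.csv now populated, the corrected get_all_df/get_baseline_df calls in app.py reduce to a read-and-sort over this file. A minimal standalone sketch; the head(3) print is only for illustration:

import pandas as pd

df = pd.read_csv("./file/result.csv")
df = df.sort_values(by="Avg. All", ascending=False)
print(df[["Model", "Avg. All"]].head(3))
# Expected top three from the rows above: GPT-Video-ChatGPT (~38.52),
# GPT-PandaGPT (~37.52), GPT-Otter (~37.47).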
file/sample_to_upload.csv
CHANGED
@@ -1,2 +1,2 @@
 Avg. All,Avg. Video-Exclusive,Avg. Prior-Knowledge QA,Avg. Decision-Making,ActivityNet,MSVD,MSRVTT,TGIF,Youcook2,Ucfcrime,MOT,TVQA,MV,NBA,Driving-exam,Driving-decision-making,SQA3D
-
+40,2,3,1,1,1,1,1,1,1,1,1,1,1,1,1,1
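This sample row is exactly what the updated parser in add_new_eval expects: skip the header line, then coerce the 17 comma-separated scores to floats. A self-contained check, with raw standing in for the bytes gradio hands to the callback:

raw = ("Avg. All,Avg. Video-Exclusive,Avg. Prior-Knowledge QA,Avg. Decision-Making,"
       "ActivityNet,MSVD,MSRVTT,TGIF,Youcook2,Ucfcrime,MOT,TVQA,MV,NBA,"
       "Driving-exam,Driving-decision-making,SQA3D\n"
       "40,2,3,1,1,1,1,1,1,1,1,1,1,1,1,1,1").encode("utf-8")

input_data = raw.decode("utf-8").split('\n')[1].split(',')
input_data = [float(i) for i in input_data]
assert len(input_data) == 17 and input_data[0] == 40.0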
src/__pycache__/utils_display.cpython-38.pyc
DELETED
Binary file (4.22 kB)
src/auto_leaderboard/__pycache__/model_metadata_type.cpython-38.pyc
DELETED
Binary file (1.17 kB)