diff --git a/.gitattributes b/.gitattributes index 849dd60b9c3bc901186811aaa0f54c27c20c8c5d..e2fdd3c78bfef7d609c3b755310e6439737447f2 100644 --- a/.gitattributes +++ b/.gitattributes @@ -33,4 +33,3 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text *.zst filter=lfs diff=lfs merge=lfs -text *tfevents* filter=lfs diff=lfs merge=lfs -text scale-hf-logo.png filter=lfs diff=lfs merge=lfs -text -mj-bench-logo.png filter=lfs diff=lfs merge=lfs -text diff --git a/README.md b/README.md index 5db2483b474f0951e0147b47edfccccbe2f335b4..eb6d7569a6d6fc8dd1dc7cfb8831b0ef52222b2b 100644 --- a/README.md +++ b/README.md @@ -1,5 +1,5 @@ --- -title: MJ Bench Leaderboard +title: MMIE Leaderboard emoji: 🥇 colorFrom: green colorTo: indigo @@ -45,13 +45,5 @@ You'll find ## Citation ``` -@misc{chen2024mjbenchmultimodalrewardmodel, - title={MJ-Bench: Is Your Multimodal Reward Model Really a Good Judge for Text-to-Image Generation?}, - author={Zhaorun Chen and Yichao Du and Zichen Wen and Yiyang Zhou and Chenhang Cui and Zhenzhen Weng and Haoqin Tu and Chaoqi Wang and Zhengwei Tong and Qinglan Huang and Canyu Chen and Qinghao Ye and Zhihong Zhu and Yuqing Zhang and Jiawei Zhou and Zhuokai Zhao and Rafael Rafailov and Chelsea Finn and Huaxiu Yao}, - year={2024}, - eprint={2407.04842}, - archivePrefix={arXiv}, - primaryClass={cs.CV}, - url={https://arxiv.org/abs/2407.04842}, -} + ``` \ No newline at end of file diff --git a/app.py b/app.py index 5b790f985aa878d03ba77fb289ba294a8440f994..78f1ed1db5062229fadafef0bf65a899d335c0e3 100644 --- a/app.py +++ b/app.py @@ -7,7 +7,6 @@ import numpy as np from pathlib import Path from apscheduler.schedulers.background import BackgroundScheduler from huggingface_hub import snapshot_download -from datasets import load_dataset from src.about import ( @@ -20,19 +19,19 @@ from src.about import ( ABOUT_TEXT ) from src.display.css_html_js import custom_css -from src.display.utils import ( - BENCHMARK_COLS, - COLS, - EVAL_COLS, - EVAL_TYPES, - NUMERIC_INTERVALS, - TYPES, - AutoEvalColumn, - ModelType, - fields, - WeightType, - Precision -) +# from src.display.utils import ( +# BENCHMARK_COLS, +# COLS, +# EVAL_COLS, +# EVAL_TYPES, +# NUMERIC_INTERVALS, +# TYPES, +# AutoEvalColumn, +# ModelType, +# fields, +# WeightType, +# Precision +# ) from src.envs import API, EVAL_REQUESTS_PATH, EVAL_RESULTS_PATH, QUEUE_REPO, REPO_ID, RESULTS_REPO, TOKEN try: @@ -76,7 +75,7 @@ PERSPECTIVE_COUNTS= { -META_DATA = ['Model', 'Model Type', 'Input Type', 'Organization'] +META_DATA = ['Model'] @@ -84,36 +83,36 @@ def restart_space(): API.restart_space(repo_id=REPO_ID) -color_map = { - "Score Model": "#7497db", - "Opensource VLM": "#E8ECF2", - "Closesource VLM": "#ffcd75", - "Others": "#75809c", - - # #7497db #E8ECF2 #ffcd75 #75809c -} -def color_model_type_column(df, color_map): - """ - Apply color to the 'Model Type' column of the DataFrame based on a given color mapping. - - Parameters: - df (pd.DataFrame): The DataFrame containing the 'Model Type' column. - color_map (dict): A dictionary mapping model types to colors. - - Returns: - pd.Styler: The styled DataFrame. - """ - # Function to apply color based on the model type - def apply_color(val): - color = color_map.get(val, "default") # Default color if not specified in color_map - return f'background-color: {color}' +# color_map = { +# "Score Model": "#7497db", +# "Opensource VLM": "#E8ECF2", +# "Closesource VLM": "#ffcd75", +# "Others": "#75809c", + +# # #7497db #E8ECF2 #ffcd75 #75809c +# } +# def color_model_type_column(df, color_map): +# """ +# Apply color to the 'Model Type' column of the DataFrame based on a given color mapping. + +# Parameters: +# df (pd.DataFrame): The DataFrame containing the 'Model Type' column. +# color_map (dict): A dictionary mapping model types to colors. + +# Returns: +# pd.Styler: The styled DataFrame. +# """ +# # Function to apply color based on the model type +# def apply_color(val): +# color = color_map.get(val, "default") # Default color if not specified in color_map +# return f'background-color: {color}' - # Format for different columns - format_dict = {col: "{:.1f}" for col in df.columns if col not in META_DATA} - format_dict['Overall Score'] = "{:.2f}" - format_dict[''] = "{:d}" +# # Format for different columns +# format_dict = {col: "{:.1f}" for col in df.columns if col not in META_DATA} +# format_dict['Overall Score'] = "{:.2f}" +# format_dict[''] = "{:d}" - return df.style.applymap(apply_color, subset=['Model Type']).format(format_dict, na_rep='') +# return df.style.applymap(apply_color, subset=['Model Type']).format(format_dict, na_rep='') def regex_table(dataframe, regex, filter_button, style=True): """ @@ -127,14 +126,10 @@ def regex_table(dataframe, regex, filter_button, style=True): # if filter_button, remove all rows with "ai2" in the model name update_scores = False if isinstance(filter_button, list) or isinstance(filter_button, str): - if "Score Model" not in filter_button: - dataframe = dataframe[~dataframe["Model Type"].str.contains("Score Model", case=False, na=False)] - if "Opensource VLM" not in filter_button: - dataframe = dataframe[~dataframe["Model Type"].str.contains("Opensource VLM", case=False, na=False)] - if "Closesource VLM" not in filter_button: - dataframe = dataframe[~dataframe["Model Type"].str.contains("Closesource VLM", case=False, na=False)] - if "Others" not in filter_button: - dataframe = dataframe[~dataframe["Model Type"].str.contains("Others", case=False, na=False)] + if "Integrated LVLM" not in filter_button: + dataframe = dataframe[~dataframe["Model Type"].str.contains("Integrated LVLM", case=False, na=False)] + if "Interleaved LVLM" not in filter_button: + dataframe = dataframe[~dataframe["Model Type"].str.contains("Interleaved LVLM", case=False, na=False)] # Filter the dataframe such that 'model' contains any of the regex patterns data = dataframe[dataframe["Model"].str.contains(combined_regex, case=False, na=False)] @@ -143,9 +138,9 @@ def regex_table(dataframe, regex, filter_button, style=True): # replace column '' with count/rank data.insert(0, '', range(1, 1 + len(data))) - if style: - # apply color - data = color_model_type_column(data, color_map) + # if style: + # # apply color + # data = color_model_type_column(data, color_map) return data @@ -164,27 +159,6 @@ def get_leaderboard_results(results_path): df.reset_index(drop=True, inplace=True) return df -def avg_all_subset(orig_df: pd.DataFrame, columns_name: list, meta_data=META_DATA, subset_counts=SUBSET_COUNTS): - new_df = orig_df.copy()[meta_data + columns_name] - - # Filter the dictionary to include only the counts relevant to the specified columns - new_subset_counts = {col: subset_counts[col] for col in columns_name} - - # Calculate the weights for each subset - total_count = sum(new_subset_counts.values()) - weights = {subset: count / total_count for subset, count in new_subset_counts.items()} - - # Calculate the weight_avg value for each row - def calculate_weighted_avg(row): - weighted_sum = sum(row[col] * weights[col] for col in columns_name) - return weighted_sum - - new_df["Overall Score"] = new_df.apply(calculate_weighted_avg, axis=1) - - cols = meta_data + ["Overall Score"] + columns_name - new_df = new_df[cols].sort_values(by="Overall Score", ascending=False).reset_index(drop=True) - return new_df - def avg_all_perspective(orig_df: pd.DataFrame, columns_name: list, meta_data=META_DATA, perspective_counts=PERSPECTIVE_COUNTS): new_df = orig_df[meta_data + columns_name] @@ -200,28 +174,63 @@ def avg_all_perspective(orig_df: pd.DataFrame, columns_name: list, meta_data=MET new_df = new_df[cols].sort_values(by="Overall Score", ascending=False).reset_index(drop=True) return new_df +data = { + "Model": [ + "MiniGPT-5", "EMU-2", "GILL", "Anole", + "GPT-4o - Openjourney", "GPT-4o - SD-3", "GPT-4o - SD-XL", "GPT-4o - Flux", + "Gemini-1.5 - Openjourney", "Gemini-1.5 - SD-3", "Gemini-1.5 - SD-XL", "Gemini-1.5 - Flux", + "LLAVA-34b - Openjourney", "LLAVA-34b - SD-3", "LLAVA-34b - SD-XL", "LLAVA-34b - Flux", + "Qwen-VL-70b - Openjourney", "Qwen-VL-70b - SD-3", "Qwen-VL-70b - SD-XL", "Qwen-VL-70b - Flux" + ], + "Model Type":[ + "Interleaved LVLM", "Interleaved LVLM", "Interleaved LVLM", "Interleaved LVLM", + "Integrated LVLM", "Integrated LVLM", "Integrated LVLM", "Integrated LVLM", + "Integrated LVLM", "Integrated LVLM", "Integrated LVLM", "Integrated LVLM", + "Integrated LVLM", "Integrated LVLM", "Integrated LVLM", "Integrated LVLM", + "Integrated LVLM", "Integrated LVLM", "Integrated LVLM", "Integrated LVLM", + ], + "Situational analysis": [ + 47.63, 39.65, 46.72, 48.95, + 53.05, 53.00, 56.12, 54.97, + 48.08, 47.48, 49.43, 47.07, + 54.12, 54.72, 55.97, 54.23, + 52.73, 54.98, 52.58, 54.23 + ], + "Project-based learning": [ + 55.12, 46.12, 57.57, 59.05, + 71.40, 71.20, 73.25, 68.80, + 67.93, 68.70, 71.85, 68.33, + 73.47, 72.55, 74.60, 71.32, + 71.63, 71.87, 73.57, 69.47 + ], + "Multi-step reasoning": [ + 42.17, 50.75, 39.33, 51.72, + 53.67, 53.67, 53.67, 53.67, + 60.05, 60.05, 60.05, 60.05, + 47.28, 47.28, 47.28, 47.28, + 55.63, 55.63, 55.63, 55.63 + ], + "AVG": [ + 50.92, 45.33, 51.58, 55.22, + 63.65, 63.52, 65.47, 62.63, + 61.57, 61.87, 64.15, 61.55, + 63.93, 63.57, 65.05, 62.73, + 64.05, 64.75, 65.12, 63.18 + ] +} +df = pd.DataFrame(data) +total_models = len(df) -results_path = Path(f"{EVAL_RESULTS_PATH}/mjbench-results/detailed-results") -orig_df = get_leaderboard_results(results_path) -colmuns_name = list(SUBSET_COUNTS.keys()) -detailed_df = avg_all_subset(orig_df, colmuns_name).round(2) - -results_path = Path(f"{EVAL_RESULTS_PATH}/mjbench-results/overall-results") -orig_df = get_leaderboard_results(results_path) -colmuns_name = list(PERSPECTIVE_COUNTS.keys()) -perspective_df = avg_all_perspective(orig_df, colmuns_name).round(2) - -total_models = len(detailed_df) with gr.Blocks(css=custom_css) as app: with gr.Row(): with gr.Column(scale=6): gr.Markdown(INTRODUCTION_TEXT.format(str(total_models))) with gr.Column(scale=4): - gr.Markdown("![](https://huggingface.co/spaces/MJ-Bench/MJ-Bench-Leaderboard/resolve/main/src/mj-bench-logo.jpg)") + gr.Markdown("![](https://huggingface.co/spaces/MMIE/Leaderboard/resolve/main/src/logo.png)") # gr.HTML(BGB_LOGO, elem_classes="logo") with gr.Tabs(elem_classes="tab-buttons") as tabs: - with gr.TabItem("🏆 MJ-Bench Leaderboard"): + with gr.TabItem("🏆 MMIE Leaderboard"): with gr.Row(): search_overall = gr.Textbox( label="Model Search (delimit with , )", @@ -229,88 +238,46 @@ with gr.Blocks(css=custom_css) as app: show_label=False ) model_type_overall = gr.CheckboxGroup( - choices=["Score Model", "Opensource VLM", "Closesource VLM", "Others"], - value=["Score Model", "Opensource VLM", "Closesource VLM", "Others"], - label="Model Types", + choices=["Interleaved LVLM", "Integrated LVLM"], + value=["Interleaved LVLM", "Integrated LVLM"], + label="Model Type", show_label=False, interactive=True, ) with gr.Row(): - mjbench_table_overall_hidden = gr.Dataframe( - perspective_df, - headers=perspective_df.columns.tolist(), - elem_id="mjbench_leadboard_overall_hidden", + mmie_table_overall_hidden = gr.Dataframe( + df, + headers=df.columns.tolist(), + elem_id="mmie_leadboard_overall_hidden", wrap=True, visible=False, ) - mjbench_table_overall = gr.Dataframe( + mmie_table_overall = gr.Dataframe( regex_table( - perspective_df.copy(), + df.copy(), "", - ["Score Model", "Opensource VLM", "Closesource VLM", "Others"] + ["Interleaved LVLM", "Integrated LVLM"] ), - headers=perspective_df.columns.tolist(), - elem_id="mjbench_leadboard_overall", + headers=df.columns.tolist(), + elem_id="mmie_leadboard_overall", wrap=True, height=1000, ) - # with gr.TabItem("🔍 MJ-Bench Detailed Results"): - # with gr.Row(): - # search_detail = gr.Textbox( - # label="Model Search (delimit with , )", - # placeholder="🔍 Search model (separate multiple queries with ``) and press ENTER...", - # show_label=False - # ) - # model_type_detail = gr.CheckboxGroup( - # choices=["Score Model", "Opensource VLM", "Closesource VLM", "Others"], - # value=["Score Model", "Opensource VLM", "Closesource VLM", "Others"], - # label="Model Types", - # show_label=False, - # interactive=True, - # ) - # with gr.Row(): - # mjbench_table_detail_hidden = gr.Dataframe( - # detailed_df, - # headers=detailed_df.columns.tolist(), - # elem_id="mjbench_detailed_hidden", - # # column_widths = ["500px", "500px"], - # wrap=True, - # visible=False, - # ) - # mjbench_table_detail = gr.Dataframe( - # regex_table( - # detailed_df.copy(), - # "", - # ["Score Model", "Opensource VLM", "Closesource VLM", "Others"] - # ), - # headers=detailed_df.columns.tolist(), - # elem_id="mjbench_detailed", - # column_widths = ["40px", "200px", "180px", "130px", "150px"] + ["130px"]*50, - # wrap=True, - # height=1000, - # ) with gr.TabItem("About"): with gr.Row(): gr.Markdown(ABOUT_TEXT) with gr.Accordion("📚 Citation", open=False): citation_button = gr.Textbox( - value=r"""@misc{mjbench2024mjbench, - title={MJ-BENCH: Is Your Multimodal Reward Model Really a Good Judge?}, - author={Chen*, Zhaorun and Du*, Yichao and Wen, Zichen and Zhou, Yiyang and Cui, Chenhang and Weng, Zhenzhen and Tu, Haoqin and Wang, Chaoqi and Tong, Zhengwei and HUANG, Leria and Chen, Canyu and Ye Qinghao and Zhu, Zhihong and Zhang, Yuqing and Zhou, Jiawei and Zhao, Zhuokai and Rafailov, Rafael and Finn, Chelsea and Yao, Huaxiu}, - year={2024} -}""", + value=r"""""", lines=7, label="Copy the following to cite these results.", elem_id="citation-button", show_copy_button=True, ) - search_overall.change(regex_table, inputs=[mjbench_table_overall_hidden, search_overall, model_type_overall], outputs=mjbench_table_overall) - model_type_overall.change(regex_table, inputs=[mjbench_table_overall_hidden, search_overall, model_type_overall], outputs=mjbench_table_overall) - - # search_detail.change(regex_table, inputs=[mjbench_table_detail_hidden, search_detail, model_type_detail], outputs=mjbench_table_detail) - # model_type_detail.change(regex_table, inputs=[mjbench_table_detail_hidden, search_detail, model_type_detail], outputs=mjbench_table_detail) + search_overall.change(regex_table, inputs=[mmie_table_overall_hidden, search_overall, model_type_overall], outputs=mmie_table_overall) + model_type_overall.change(regex_table, inputs=[mmie_table_overall_hidden, search_overall, model_type_overall], outputs=mmie_table_overall) scheduler = BackgroundScheduler() scheduler.add_job(restart_space, "interval", seconds=18000) # restarted every 3h diff --git a/evals/.gitattributes b/evals/.gitattributes deleted file mode 100644 index 28df5f900b358436f0267334b3e3e9af33f917ba..0000000000000000000000000000000000000000 --- a/evals/.gitattributes +++ /dev/null @@ -1,55 +0,0 @@ -*.7z filter=lfs diff=lfs merge=lfs -text -*.arrow filter=lfs diff=lfs merge=lfs -text -*.bin filter=lfs diff=lfs merge=lfs -text -*.bz2 filter=lfs diff=lfs merge=lfs -text -*.ckpt filter=lfs diff=lfs merge=lfs -text -*.ftz filter=lfs diff=lfs merge=lfs -text -*.gz filter=lfs diff=lfs merge=lfs -text -*.h5 filter=lfs diff=lfs merge=lfs -text -*.joblib filter=lfs diff=lfs merge=lfs -text -*.lfs.* filter=lfs diff=lfs merge=lfs -text -*.lz4 filter=lfs diff=lfs merge=lfs -text -*.mlmodel filter=lfs diff=lfs merge=lfs -text -*.model filter=lfs diff=lfs merge=lfs -text -*.msgpack filter=lfs diff=lfs merge=lfs -text -*.npy filter=lfs diff=lfs merge=lfs -text -*.npz filter=lfs diff=lfs merge=lfs -text -*.onnx filter=lfs diff=lfs merge=lfs -text -*.ot filter=lfs diff=lfs merge=lfs -text -*.parquet filter=lfs diff=lfs merge=lfs -text -*.pb filter=lfs diff=lfs merge=lfs -text -*.pickle filter=lfs diff=lfs merge=lfs -text -*.pkl filter=lfs diff=lfs merge=lfs -text -*.pt filter=lfs diff=lfs merge=lfs -text -*.pth filter=lfs diff=lfs merge=lfs -text -*.rar filter=lfs diff=lfs merge=lfs -text -*.safetensors filter=lfs diff=lfs merge=lfs -text -saved_model/**/* filter=lfs diff=lfs merge=lfs -text -*.tar.* filter=lfs diff=lfs merge=lfs -text -*.tar filter=lfs diff=lfs merge=lfs -text -*.tflite filter=lfs diff=lfs merge=lfs -text -*.tgz filter=lfs diff=lfs merge=lfs -text -*.wasm filter=lfs diff=lfs merge=lfs -text -*.xz filter=lfs diff=lfs merge=lfs -text -*.zip filter=lfs diff=lfs merge=lfs -text -*.zst filter=lfs diff=lfs merge=lfs -text -*tfevents* filter=lfs diff=lfs merge=lfs -text -# Audio files - uncompressed -*.pcm filter=lfs diff=lfs merge=lfs -text -*.sam filter=lfs diff=lfs merge=lfs -text -*.raw filter=lfs diff=lfs merge=lfs -text -# Audio files - compressed -*.aac filter=lfs diff=lfs merge=lfs -text -*.flac filter=lfs diff=lfs merge=lfs -text -*.mp3 filter=lfs diff=lfs merge=lfs -text -*.ogg filter=lfs diff=lfs merge=lfs -text -*.wav filter=lfs diff=lfs merge=lfs -text -# Image files - uncompressed -*.bmp filter=lfs diff=lfs merge=lfs -text -*.gif filter=lfs diff=lfs merge=lfs -text -*.png filter=lfs diff=lfs merge=lfs -text -*.tiff filter=lfs diff=lfs merge=lfs -text -# Image files - compressed -*.jpg filter=lfs diff=lfs merge=lfs -text -*.jpeg filter=lfs diff=lfs merge=lfs -text -*.webp filter=lfs diff=lfs merge=lfs -text diff --git a/evals/README.md b/evals/README.md deleted file mode 100644 index bdcb7492fc5d10d433fb90897f90b0b985d0e8ad..0000000000000000000000000000000000000000 --- a/evals/README.md +++ /dev/null @@ -1,6 +0,0 @@ ---- -# For reference on dataset card metadata, see the spec: https://github.com/huggingface/hub-docs/blob/main/datasetcard.md?plain=1 -# Doc / guide: https://huggingface.co/docs/hub/datasets-cards -{} ---- -# Coming Soon diff --git a/evals/mjbench-results/detailed-results/AestheticsPredictor.json b/evals/mjbench-results/detailed-results/AestheticsPredictor.json deleted file mode 100644 index 9e60ca09e04382a86c80bd0449a623562181f80a..0000000000000000000000000000000000000000 --- a/evals/mjbench-results/detailed-results/AestheticsPredictor.json +++ /dev/null @@ -1,47 +0,0 @@ -[ - { - "Model": "AestheticsPredictor", - "Model Type": "Score Model", - "Input Type": "Single Image", - "Organization": "LAION", - "Alignment-Object": 35.9, - "Alignment-Attribute": 38.4, - "Alignment-Action": 43.6, - "Alignment-Location": 31.6, - "Alignment-Count": 35.7, - "Alignment-Avg": 34.8, - "Safety-Toxicity-Crime": 51.7, - "Safety-Toxicity-Shocking": 58.6, - "Safety-Toxicity-Disgust": 64.3, - "Safety-Toxicity-Avg": 57.3, - "Safety-Nsfw-Evident": 14.6, - "Safety-Nsfw-Evasive": 55.2, - "Safety-Nsfw-Subtle": 14.2, - "Safety-Nsfw-Avg": 37.5, - "Quality-Distortion-Human_face": 78.7, - "Quality-Distortion-Human_limb": 57.1, - "Quality-Distortion-Object": 51.3, - "Quality-Distortion-Avg": 52.1, - "Quality-Blurry-Defocused": 90.1, - "Quality-Blurry-Motion": 93.4, - "Quality-Blurry-Avg": 91.6, - "Bias-Age": 59.4, - "Bias-Gender": 62.0, - "Bias-Race": 64.2, - "Bias-Nationality": 62.4, - "Bias-Religion": 61.0, - "Bias-Avg": 62.0, - "Bias-Age-NDS": 85.3, - "Bias-Gender-NDS": 85.9, - "Bias-Race-NDS": 86.3, - "Bias-Nationality-NDS": 85.8, - "Bias-Religion-NDS": 86.2, - "Bias-Avg-NDS": 85.9, - "Bias-Age-GES": 91.9, - "Bias-Gender-GES": 92.1, - "Bias-Race-GES": 92.4, - "Bias-Nationality-GES": 92.1, - "Bias-Religion-GES": 92.3, - "Bias-Avg-GES": 92.1 - } -] \ No newline at end of file diff --git a/evals/mjbench-results/detailed-results/BLIP-v2.json b/evals/mjbench-results/detailed-results/BLIP-v2.json deleted file mode 100644 index 2c7c2ae44b9f14f2bb0f06b69132cd7257498124..0000000000000000000000000000000000000000 --- a/evals/mjbench-results/detailed-results/BLIP-v2.json +++ /dev/null @@ -1,47 +0,0 @@ -[ - { - "Model": "BLIP-v2", - "Model Type": "Score Model", - "Input Type": "Single Image", - "Organization": "Salesforce", - "Alignment-Object": 23.5, - "Alignment-Attribute": 22.7, - "Alignment-Action": 24.8, - "Alignment-Location": 19.7, - "Alignment-Count": 16.1, - "Alignment-Avg": 21.5, - "Safety-Toxicity-Crime": 6.9, - "Safety-Toxicity-Shocking": 0.0, - "Safety-Toxicity-Disgust": 4.8, - "Safety-Toxicity-Avg": 4.5, - "Safety-Nsfw-Evident": 58.4, - "Safety-Nsfw-Evasive": 51.1, - "Safety-Nsfw-Subtle": 35.7, - "Safety-Nsfw-Avg": 49.1, - "Quality-Distortion-Human_face": 3.6, - "Quality-Distortion-Human_limb": 2.0, - "Quality-Distortion-Object": 1.1, - "Quality-Distortion-Avg": 1.9, - "Quality-Blurry-Defocused": 8.3, - "Quality-Blurry-Motion": 47.2, - "Quality-Blurry-Avg": 15.0, - "Bias-Age": 69.6, - "Bias-Gender": 68.5, - "Bias-Race": 65.9, - "Bias-Nationality": 68.6, - "Bias-Religion": 74.7, - "Bias-Avg": 68.5, - "Bias-Age-NDS": 85.3, - "Bias-Gender-NDS": 83.6, - "Bias-Race-NDS": 82.7, - "Bias-Nationality-NDS": 81.8, - "Bias-Religion-NDS": 87.5, - "Bias-Avg-NDS": 83.6, - "Bias-Age-GES": 92.2, - "Bias-Gender-GES": 91.3, - "Bias-Race-GES": 90.7, - "Bias-Nationality-GES": 90.4, - "Bias-Religion-GES": 93.1, - "Bias-Avg-GES": 91.3 - } -] \ No newline at end of file diff --git a/evals/mjbench-results/detailed-results/CLIP-v2.json b/evals/mjbench-results/detailed-results/CLIP-v2.json deleted file mode 100644 index 331fb7fca8a896f1f2df3055bc4704367ed0bf59..0000000000000000000000000000000000000000 --- a/evals/mjbench-results/detailed-results/CLIP-v2.json +++ /dev/null @@ -1,47 +0,0 @@ -[ - { - "Model": "CLIP-v2", - "Model Type": "Score Model", - "Input Type": "Single Image", - "Organization": "LAION", - "Alignment-Object": 42.2, - "Alignment-Attribute": 45.9, - "Alignment-Action": 45.3, - "Alignment-Location": 43.4, - "Alignment-Count": 55.4, - "Alignment-Avg": 44.0, - "Safety-Toxicity-Crime": 89.7, - "Safety-Toxicity-Shocking": 96.6, - "Safety-Toxicity-Disgust": 97.6, - "Safety-Toxicity-Avg": 94.4, - "Safety-Nsfw-Evident": 20.8, - "Safety-Nsfw-Evasive": 4.5, - "Safety-Nsfw-Subtle": 16.6, - "Safety-Nsfw-Avg": 7.9, - "Quality-Distortion-Human_face": 26.6, - "Quality-Distortion-Human_limb": 17.2, - "Quality-Distortion-Object": 34.0, - "Quality-Distortion-Avg": 19.3, - "Quality-Blurry-Defocused": 50.6, - "Quality-Blurry-Motion": 63.7, - "Quality-Blurry-Avg": 56.7, - "Bias-Age": 57.2, - "Bias-Gender": 57.8, - "Bias-Race": 55.5, - "Bias-Nationality": 59.5, - "Bias-Religion": 60.8, - "Bias-Avg": 57.7, - "Bias-Age-NDS": 73.6, - "Bias-Gender-NDS": 75.2, - "Bias-Race-NDS": 73.1, - "Bias-Nationality-NDS": 79.1, - "Bias-Religion-NDS": 78.4, - "Bias-Avg-NDS": 75.2, - "Bias-Age-GES": 73.6, - "Bias-Gender-GES": 75.2, - "Bias-Race-GES": 73.1, - "Bias-Nationality-GES": 79.1, - "Bias-Religion-GES": 78.4, - "Bias-Avg-GES": 75.2 - } -] \ No newline at end of file diff --git a/evals/mjbench-results/detailed-results/Claude 3 Opus.json b/evals/mjbench-results/detailed-results/Claude 3 Opus.json deleted file mode 100644 index b46148065f408114942ffca0a9f59e98ce1ae43e..0000000000000000000000000000000000000000 --- a/evals/mjbench-results/detailed-results/Claude 3 Opus.json +++ /dev/null @@ -1,47 +0,0 @@ -[ - { - "Model": "Claude 3 Opus", - "Model Type": "Closesource VLM", - "Input Type": "Multi Image", - "Organization": "Anthropic", - "Alignment-Object": 64.9, - "Alignment-Attribute": 38.9, - "Alignment-Action": 44.4, - "Alignment-Location": 55.3, - "Alignment-Count": 55.4, - "Alignment-Avg": 57.1, - "Safety-Toxicity-Crime": 62.1, - "Safety-Toxicity-Shocking": 37.9, - "Safety-Toxicity-Disgust": 50.0, - "Safety-Toxicity-Avg": 50.6, - "Safety-Nsfw-Evident": 10.5, - "Safety-Nsfw-Evasive": 6.2, - "Safety-Nsfw-Subtle": 3.6, - "Safety-Nsfw-Avg": 8.3, - "Quality-Distortion-Human_face": 26.6, - "Quality-Distortion-Human_limb": 19.3, - "Quality-Distortion-Object": 10.7, - "Quality-Distortion-Avg": 17.6, - "Quality-Blurry-Defocused": 89.6, - "Quality-Blurry-Motion": 93.3, - "Quality-Blurry-Avg": 92.7, - "Bias-Age": 53.9, - "Bias-Gender": 58.2, - "Bias-Race": 62.1, - "Bias-Nationality": 59.0, - "Bias-Religion": 54.0, - "Bias-Avg": 58.2, - "Bias-Age-NDS": 63.3, - "Bias-Gender-NDS": 66.1, - "Bias-Race-NDS": 67.5, - "Bias-Nationality-NDS": 66.9, - "Bias-Religion-NDS": 66.8, - "Bias-Avg-NDS": 66.1, - "Bias-Age-GES": 83.2, - "Bias-Gender-GES": 85.2, - "Bias-Race-GES": 86.5, - "Bias-Nationality-GES": 85.8, - "Bias-Religion-GES": 84.8, - "Bias-Avg-GES": 85.2 - } -] \ No newline at end of file diff --git a/evals/mjbench-results/detailed-results/GPT-4-vision.json b/evals/mjbench-results/detailed-results/GPT-4-vision.json deleted file mode 100644 index b22c8b6c6a7eb8e167c187a594c773a1d152d48e..0000000000000000000000000000000000000000 --- a/evals/mjbench-results/detailed-results/GPT-4-vision.json +++ /dev/null @@ -1,47 +0,0 @@ -[ - { - "Model": "GPT-4-vision", - "Model Type": "Closesource VLM", - "Input Type": "Multi Image", - "Organization": "OpenAI", - "Alignment-Object": 68.1, - "Alignment-Attribute": 62.9, - "Alignment-Action": 64.1, - "Alignment-Location": 67.1, - "Alignment-Count": 73.2, - "Alignment-Avg": 66.1, - "Safety-Toxicity-Crime": 75.9, - "Safety-Toxicity-Shocking": 69.0, - "Safety-Toxicity-Disgust": 81.0, - "Safety-Toxicity-Avg": 76.4, - "Safety-Nsfw-Evident": 69.5, - "Safety-Nsfw-Evasive": 43.2, - "Safety-Nsfw-Subtle": 32.5, - "Safety-Nsfw-Avg": 44.1, - "Quality-Distortion-Human_face": 87.6, - "Quality-Distortion-Human_limb": 57.6, - "Quality-Distortion-Object": 83.1, - "Quality-Distortion-Avg": 75.7, - "Quality-Blurry-Defocused": 98.8, - "Quality-Blurry-Motion": 99.3, - "Quality-Blurry-Avg": 99.2, - "Bias-Age": 76.7, - "Bias-Gender": 79.1, - "Bias-Race": 77.4, - "Bias-Nationality": 81.0, - "Bias-Religion": 86.5, - "Bias-Avg": 79.1, - "Bias-Age-NDS": 81.2, - "Bias-Gender-NDS": 80.2, - "Bias-Race-NDS": 77.6, - "Bias-Nationality-NDS": 79.9, - "Bias-Religion-NDS": 88.2, - "Bias-Avg-NDS": 80.2, - "Bias-Age-GES": 93.0, - "Bias-Gender-GES": 93.2, - "Bias-Race-GES": 92.2, - "Bias-Nationality-GES": 93.4, - "Bias-Religion-GES": 96.4, - "Bias-Avg-GES": 93.2 - } -] \ No newline at end of file diff --git a/evals/mjbench-results/detailed-results/GPT-4o.json b/evals/mjbench-results/detailed-results/GPT-4o.json deleted file mode 100644 index abccbc7b9df4ad0b425d86f8b1fe153a432e16a4..0000000000000000000000000000000000000000 --- a/evals/mjbench-results/detailed-results/GPT-4o.json +++ /dev/null @@ -1,47 +0,0 @@ -[ - { - "Model": "GPT-4o", - "Model Type": "Closesource VLM", - "Input Type": "Multi Image", - "Organization": "OpenAI", - "Alignment-Object": 62.2, - "Alignment-Attribute": 57.2, - "Alignment-Action": 64.1, - "Alignment-Location": 63.2, - "Alignment-Count": 67.9, - "Alignment-Avg": 61.5, - "Safety-Toxicity-Crime": 86.2, - "Safety-Toxicity-Shocking": 96.6, - "Safety-Toxicity-Disgust": 95.2, - "Safety-Toxicity-Avg": 92.1, - "Safety-Nsfw-Evident": 72.3, - "Safety-Nsfw-Evasive": 51.7, - "Safety-Nsfw-Subtle": 38.9, - "Safety-Nsfw-Avg": 54.3, - "Quality-Distortion-Human_face": 99.4, - "Quality-Distortion-Human_limb": 78.2, - "Quality-Distortion-Object": 100.0, - "Quality-Distortion-Avg": 93.8, - "Quality-Blurry-Defocused": 100.0, - "Quality-Blurry-Motion": 100.0, - "Quality-Blurry-Avg": 100.0, - "Bias-Age": 60.9, - "Bias-Gender": 66.6, - "Bias-Race": 69.1, - "Bias-Nationality": 68.2, - "Bias-Religion": 69.6, - "Bias-Avg": 66.6, - "Bias-Age-NDS": 81.2, - "Bias-Gender-NDS": 82.7, - "Bias-Race-NDS": 82.8, - "Bias-Nationality-NDS": 83.2, - "Bias-Religion-NDS": 86.1, - "Bias-Avg-NDS": 82.7, - "Bias-Age-GES": 91.8, - "Bias-Gender-GES": 92.9, - "Bias-Race-GES": 93.1, - "Bias-Nationality-GES": 93.3, - "Bias-Religion-GES": 94.4, - "Bias-Avg-GES": 92.9 - } -] \ No newline at end of file diff --git a/evals/mjbench-results/detailed-results/Gemini Ultra.json b/evals/mjbench-results/detailed-results/Gemini Ultra.json deleted file mode 100644 index ce608920a67e9f2b281814822418cea71a8a17a2..0000000000000000000000000000000000000000 --- a/evals/mjbench-results/detailed-results/Gemini Ultra.json +++ /dev/null @@ -1,47 +0,0 @@ -[ - { - "Model": "Gemini Ultra", - "Model Type": "Closesource VLM", - "Input Type": "Multi Image", - "Organization": "Google", - "Alignment-Object": 71.7, - "Alignment-Attribute": 65.1, - "Alignment-Action": 63.2, - "Alignment-Location": 64.5, - "Alignment-Count": 67.8, - "Alignment-Avg": 67.2, - "Safety-Toxicity-Crime": 65.5, - "Safety-Toxicity-Shocking": 41.4, - "Safety-Toxicity-Disgust": 78.6, - "Safety-Toxicity-Avg": 64.0, - "Safety-Nsfw-Evident": 31.6, - "Safety-Nsfw-Evasive": 19.1, - "Safety-Nsfw-Subtle": 10.3, - "Safety-Nsfw-Avg": 22.7, - "Quality-Distortion-Human_face": 73.4, - "Quality-Distortion-Human_limb": 32.5, - "Quality-Distortion-Object": 61.0, - "Quality-Distortion-Avg": 55.7, - "Quality-Blurry-Defocused": 86.5, - "Quality-Blurry-Motion": 97.3, - "Quality-Blurry-Avg": 93.9, - "Bias-Age": 48.7, - "Bias-Gender": 56.9, - "Bias-Race": 62.9, - "Bias-Nationality": 60.0, - "Bias-Religion": 49.9, - "Bias-Avg": 56.9, - "Bias-Age-NDS": 72.6, - "Bias-Gender-NDS": 75.8, - "Bias-Race-NDS": 78.4, - "Bias-Nationality-NDS": 77.0, - "Bias-Religion-NDS": 72.3, - "Bias-Avg-NDS": 75.8, - "Bias-Age-GES": 86.6, - "Bias-Gender-GES": 89.0, - "Bias-Race-GES": 90.8, - "Bias-Nationality-GES": 90.0, - "Bias-Religion-GES": 86.2, - "Bias-Avg-GES": 89.0 - } -] \ No newline at end of file diff --git a/evals/mjbench-results/detailed-results/HPS-v2.1.json b/evals/mjbench-results/detailed-results/HPS-v2.1.json deleted file mode 100644 index e1df4addec80df31df55da9b6747f2ecb4a272ce..0000000000000000000000000000000000000000 --- a/evals/mjbench-results/detailed-results/HPS-v2.1.json +++ /dev/null @@ -1,47 +0,0 @@ -[ - { - "Model": "HPS-v2.1", - "Model Type": "Score Model", - "Input Type": "Single Image", - "Organization": "CUHK MMLab", - "Alignment-Object": 49.4, - "Alignment-Attribute": 53.7, - "Alignment-Action": 49.6, - "Alignment-Location": 51.3, - "Alignment-Count": 57.1, - "Alignment-Avg": 48.8, - "Safety-Toxicity-Crime": 89.7, - "Safety-Toxicity-Shocking": 86.2, - "Safety-Toxicity-Disgust": 85.7, - "Safety-Toxicity-Avg": 87.6, - "Safety-Nsfw-Evident": 1.1, - "Safety-Nsfw-Evasive": 30.8, - "Safety-Nsfw-Subtle": 0.6, - "Safety-Nsfw-Avg": 15.1, - "Quality-Distortion-Human_face": 60.4, - "Quality-Distortion-Human_limb": 37.1, - "Quality-Distortion-Object": 80.3, - "Quality-Distortion-Avg": 51.7, - "Quality-Blurry-Defocused": 85.7, - "Quality-Blurry-Motion": 94.6, - "Quality-Blurry-Avg": 88.6, - "Bias-Age": 52.9, - "Bias-Gender": 55.3, - "Bias-Race": 55.7, - "Bias-Nationality": 55.0, - "Bias-Religion": 62.4, - "Bias-Avg": 55.3, - "Bias-Age-NDS": 75.8, - "Bias-Gender-NDS": 78.2, - "Bias-Race-NDS": 79.5, - "Bias-Nationality-NDS": 78.6, - "Bias-Religion-NDS": 79.3, - "Bias-Avg-NDS": 78.2, - "Bias-Age-GES": 86.4, - "Bias-Gender-GES": 87.8, - "Bias-Race-GES": 88.5, - "Bias-Nationality-GES": 88.0, - "Bias-Religion-GES": 88.5, - "Bias-Avg-GES": 87.8 - } -] \ No newline at end of file diff --git a/evals/mjbench-results/detailed-results/Idefics2-8b.json b/evals/mjbench-results/detailed-results/Idefics2-8b.json deleted file mode 100644 index 4176686854d06c6479c88932308eac6942c25b4f..0000000000000000000000000000000000000000 --- a/evals/mjbench-results/detailed-results/Idefics2-8b.json +++ /dev/null @@ -1,47 +0,0 @@ -[ - { - "Model": "Idefics2-8b", - "Model Type": "Opensource VLM", - "Input Type": "Multi Image", - "Organization": "HuggingFace", - "Alignment-Object": 35.5, - "Alignment-Attribute": 31.7, - "Alignment-Action": 30.8, - "Alignment-Location": 29.9, - "Alignment-Count": 30.4, - "Alignment-Avg": 32.6, - "Safety-Toxicity-Crime": 58.6, - "Safety-Toxicity-Shocking": 44.8, - "Safety-Toxicity-Disgust": 57.1, - "Safety-Toxicity-Avg": 52.8, - "Safety-Nsfw-Evident": 32.9, - "Safety-Nsfw-Evasive": 13.2, - "Safety-Nsfw-Subtle": 19.5, - "Safety-Nsfw-Avg": 20.2, - "Quality-Distortion-Human_face": 29.6, - "Quality-Distortion-Human_limb": 25.8, - "Quality-Distortion-Object": 2.3, - "Quality-Distortion-Avg": 21.7, - "Quality-Blurry-Defocused": 70.6, - "Quality-Blurry-Motion": 46.9, - "Quality-Blurry-Avg": 58.7, - "Bias-Age": 37.4, - "Bias-Gender": 42.7, - "Bias-Race": 45.3, - "Bias-Nationality": 46.9, - "Bias-Religion": 35.2, - "Bias-Avg": 42.7, - "Bias-Age-NDS": 55.1, - "Bias-Gender-NDS": 59.2, - "Bias-Race-NDS": 61.7, - "Bias-Nationality-NDS": 62.8, - "Bias-Religion-NDS": 51.0, - "Bias-Avg-NDS": 59.2, - "Bias-Age-GES": 77.0, - "Bias-Gender-GES": 79.7, - "Bias-Race-GES": 81.3, - "Bias-Nationality-GES": 82.0, - "Bias-Religion-GES": 74.4, - "Bias-Avg-GES": 79.8 - } -] \ No newline at end of file diff --git a/evals/mjbench-results/detailed-results/ImageReward.json b/evals/mjbench-results/detailed-results/ImageReward.json deleted file mode 100644 index a86dec315516c1ef1ad4bab964e2e608554aaf87..0000000000000000000000000000000000000000 --- a/evals/mjbench-results/detailed-results/ImageReward.json +++ /dev/null @@ -1,47 +0,0 @@ -[ - { - "Model": "ImageReward", - "Model Type": "Score Model", - "Input Type": "Single Image", - "Organization": "THUDM", - "Alignment-Object": 50.6, - "Alignment-Attribute": 52.8, - "Alignment-Action": 47.1, - "Alignment-Location": 57.9, - "Alignment-Count": 53.6, - "Alignment-Avg": 51.1, - "Safety-Toxicity-Crime": 96.6, - "Safety-Toxicity-Shocking": 96.6, - "Safety-Toxicity-Disgust": 95.2, - "Safety-Toxicity-Avg": 95.5, - "Safety-Nsfw-Evident": 31.1, - "Safety-Nsfw-Evasive": 10.2, - "Safety-Nsfw-Subtle": 27.4, - "Safety-Nsfw-Avg": 18.2, - "Quality-Distortion-Human_face": 31.4, - "Quality-Distortion-Human_limb": 34.4, - "Quality-Distortion-Object": 40.2, - "Quality-Distortion-Avg": 33.3, - "Quality-Blurry-Defocused": 77.4, - "Quality-Blurry-Motion": 86.6, - "Quality-Blurry-Avg": 82.1, - "Bias-Age": 41.8, - "Bias-Gender": 40.4, - "Bias-Race": 36.8, - "Bias-Nationality": 39.5, - "Bias-Religion": 52.8, - "Bias-Avg": 40.4, - "Bias-Age-NDS": 73.9, - "Bias-Gender-NDS": 73.2, - "Bias-Race-NDS": 70.9, - "Bias-Nationality-NDS": 73.0, - "Bias-Religion-NDS": 80.2, - "Bias-Avg-NDS": 73.2, - "Bias-Age-GES": 85.5, - "Bias-Gender-GES": 85.0, - "Bias-Race-GES": 83.6, - "Bias-Nationality-GES": 84.8, - "Bias-Religion-GES": 89.0, - "Bias-Avg-GES": 85.0 - } -] \ No newline at end of file diff --git a/evals/mjbench-results/detailed-results/Instructblip-7b.json b/evals/mjbench-results/detailed-results/Instructblip-7b.json deleted file mode 100644 index 82396eb8ef5a152880dd56c3bf035aa884799eba..0000000000000000000000000000000000000000 --- a/evals/mjbench-results/detailed-results/Instructblip-7b.json +++ /dev/null @@ -1,47 +0,0 @@ -[ - { - "Model": "Instructblip-7b", - "Model Type": "Opensource VLM", - "Input Type": "Single Image", - "Organization": "Salesforce", - "Alignment-Object": 17.1, - "Alignment-Attribute": 17.4, - "Alignment-Action": 16.2, - "Alignment-Location": 13.1, - "Alignment-Count": 21.4, - "Alignment-Avg": 17.1, - "Safety-Toxicity-Crime": 31.0, - "Safety-Toxicity-Shocking": 34.5, - "Safety-Toxicity-Disgust": 40.5, - "Safety-Toxicity-Avg": 39.3, - "Safety-Nsfw-Evident": 36.9, - "Safety-Nsfw-Evasive": 24.2, - "Safety-Nsfw-Subtle": 30.6, - "Safety-Nsfw-Avg": 33.7, - "Quality-Distortion-Human_face": 12.4, - "Quality-Distortion-Human_limb": 9.3, - "Quality-Distortion-Object": 21.0, - "Quality-Distortion-Avg": 13.3, - "Quality-Blurry-Defocused": 32.3, - "Quality-Blurry-Motion": 31.1, - "Quality-Blurry-Avg": 31.7, - "Bias-Age": 52.5, - "Bias-Gender": 53.6, - "Bias-Race": 53.6, - "Bias-Nationality": 52.0, - "Bias-Religion": 61.1, - "Bias-Avg": 53.6, - "Bias-Age-NDS": 80.8, - "Bias-Gender-NDS": 80.6, - "Bias-Race-NDS": 80.3, - "Bias-Nationality-NDS": 79.0, - "Bias-Religion-NDS": 85.4, - "Bias-Avg-NDS": 80.6, - "Bias-Age-GES": 91.0, - "Bias-Gender-GES": 91.2, - "Bias-Race-GES": 91.1, - "Bias-Nationality-GES": 90.4, - "Bias-Religion-GES": 93.8, - "Bias-Avg-GES": 91.1 - } -] \ No newline at end of file diff --git a/evals/mjbench-results/detailed-results/InternVL-Chat-V1-5.json b/evals/mjbench-results/detailed-results/InternVL-Chat-V1-5.json deleted file mode 100644 index 79f1a35bed8639d300054d61c19e97cd171cdc2a..0000000000000000000000000000000000000000 --- a/evals/mjbench-results/detailed-results/InternVL-Chat-V1-5.json +++ /dev/null @@ -1,47 +0,0 @@ -[ - { - "Model": "InternVL-Chat-V1-5", - "Model Type": "Opensource VLM", - "Input Type": "Multi Image", - "Organization": "OpenGVLab", - "Alignment-Object": 73.3, - "Alignment-Attribute": 74.8, - "Alignment-Action": 78.6, - "Alignment-Location": 80.5, - "Alignment-Count": 78.6, - "Alignment-Avg": 75.8, - "Safety-Toxicity-Crime": 34.5, - "Safety-Toxicity-Shocking": 10.3, - "Safety-Toxicity-Disgust": 28.6, - "Safety-Toxicity-Avg": 25.8, - "Safety-Nsfw-Evident": 23.3, - "Safety-Nsfw-Evasive": 10.6, - "Safety-Nsfw-Subtle": 7.2, - "Safety-Nsfw-Avg": 16.2, - "Quality-Distortion-Human_face": 97.0, - "Quality-Distortion-Human_limb": 95.4, - "Quality-Distortion-Object": 97.1, - "Quality-Distortion-Avg": 97.1, - "Quality-Blurry-Defocused": 89.7, - "Quality-Blurry-Motion": 89.7, - "Quality-Blurry-Avg": 89.7, - "Bias-Age": 40.0, - "Bias-Gender": 41.3, - "Bias-Race": 42.1, - "Bias-Nationality": 42.0, - "Bias-Religion": 39.8, - "Bias-Avg": 41.3, - "Bias-Age-NDS": 74.0, - "Bias-Gender-NDS": 74.1, - "Bias-Race-NDS": 73.6, - "Bias-Nationality-NDS": 73.9, - "Bias-Religion-NDS": 76.6, - "Bias-Avg-NDS": 74.1, - "Bias-Age-GES": 86.9, - "Bias-Gender-GES": 87.2, - "Bias-Race-GES": 87.1, - "Bias-Nationality-GES": 87.3, - "Bias-Religion-GES": 88.0, - "Bias-Avg-GES": 87.2 - } -] \ No newline at end of file diff --git a/evals/mjbench-results/detailed-results/LLaVA-1.5-13b.json b/evals/mjbench-results/detailed-results/LLaVA-1.5-13b.json deleted file mode 100644 index 5e7ab74d1cbc235611e0187360b3cf64a86202b0..0000000000000000000000000000000000000000 --- a/evals/mjbench-results/detailed-results/LLaVA-1.5-13b.json +++ /dev/null @@ -1,47 +0,0 @@ -[ - { - "Model": "LLaVA-1.5-13b", - "Model Type": "Opensource VLM", - "Input Type": "Single Image", - "Organization": "UW-Madison & Microsoft", - "Alignment-Object": 17.7, - "Alignment-Attribute": 13.5, - "Alignment-Action": 11.8, - "Alignment-Location": 16.5, - "Alignment-Count": 8.9, - "Alignment-Avg": 10.3, - "Safety-Toxicity-Crime": 31.0, - "Safety-Toxicity-Shocking": 31.0, - "Safety-Toxicity-Disgust": 40.5, - "Safety-Toxicity-Avg": 33.7, - "Safety-Nsfw-Evident": 40.8, - "Safety-Nsfw-Evasive": 29.9, - "Safety-Nsfw-Subtle": 33.6, - "Safety-Nsfw-Avg": 34.7, - "Quality-Distortion-Human_face": 20.1, - "Quality-Distortion-Human_limb": 14.6, - "Quality-Distortion-Object": 13.3, - "Quality-Distortion-Avg": 16.4, - "Quality-Blurry-Defocused": 18.0, - "Quality-Blurry-Motion": 34.0, - "Quality-Blurry-Avg": 26.1, - "Bias-Age": 67.0, - "Bias-Gender": 70.1, - "Bias-Race": 68.9, - "Bias-Nationality": 72.7, - "Bias-Religion": 75.1, - "Bias-Avg": 70.1, - "Bias-Age-NDS": 71.9, - "Bias-Gender-NDS": 74.8, - "Bias-Race-NDS": 76.6, - "Bias-Nationality-NDS": 74.0, - "Bias-Religion-NDS": 80.6, - "Bias-Avg-NDS": 74.8, - "Bias-Age-GES": 87.5, - "Bias-Gender-GES": 88.8, - "Bias-Race-GES": 88.9, - "Bias-Nationality-GES": 89.5, - "Bias-Religion-GES": 90.1, - "Bias-Avg-GES": 88.8 - } -] \ No newline at end of file diff --git a/evals/mjbench-results/detailed-results/LLaVA-1.5-7b.json b/evals/mjbench-results/detailed-results/LLaVA-1.5-7b.json deleted file mode 100644 index a8b52bc40294a549ac05d7c7d20873e761e0709a..0000000000000000000000000000000000000000 --- a/evals/mjbench-results/detailed-results/LLaVA-1.5-7b.json +++ /dev/null @@ -1,47 +0,0 @@ -[ - { - "Model": "LLaVA-1.5-7b", - "Model Type": "Opensource VLM", - "Input Type": "Single Image", - "Organization": "UW-Madison & Microsoft", - "Alignment-Object": 20.7, - "Alignment-Attribute": 25.2, - "Alignment-Action": 23.1, - "Alignment-Location": 18.2, - "Alignment-Count": 17.9, - "Alignment-Avg": 22.0, - "Safety-Toxicity-Crime": 44.8, - "Safety-Toxicity-Shocking": 41.4, - "Safety-Toxicity-Disgust": 47.6, - "Safety-Toxicity-Avg": 43.8, - "Safety-Nsfw-Evident": 35.7, - "Safety-Nsfw-Evasive": 21.2, - "Safety-Nsfw-Subtle": 17.6, - "Safety-Nsfw-Avg": 26.3, - "Quality-Distortion-Human_face": 13.6, - "Quality-Distortion-Human_limb": 7.3, - "Quality-Distortion-Object": 9.2, - "Quality-Distortion-Avg": 10.2, - "Quality-Blurry-Defocused": 7.1, - "Quality-Blurry-Motion": 19.1, - "Quality-Blurry-Avg": 13.1, - "Bias-Age": 80.8, - "Bias-Gender": 83.9, - "Bias-Race": 84.6, - "Bias-Nationality": 84.9, - "Bias-Religion": 88.1, - "Bias-Avg": 84.0, - "Bias-Age-NDS": 67.6, - "Bias-Gender-NDS": 71.4, - "Bias-Race-NDS": 75.8, - "Bias-Nationality-NDS": 68.4, - "Bias-Religion-NDS": 77.3, - "Bias-Avg-NDS": 71.4, - "Bias-Age-GES": 87.4, - "Bias-Gender-GES": 88.9, - "Bias-Race-GES": 90.1, - "Bias-Nationality-GES": 88.7, - "Bias-Religion-GES": 90.7, - "Bias-Avg-GES": 88.9 - } -] \ No newline at end of file diff --git a/evals/mjbench-results/detailed-results/LLaVA-NeXT-mistral-7b.json b/evals/mjbench-results/detailed-results/LLaVA-NeXT-mistral-7b.json deleted file mode 100644 index fae40b999c44b6ca112e9864c88166f0a290c51b..0000000000000000000000000000000000000000 --- a/evals/mjbench-results/detailed-results/LLaVA-NeXT-mistral-7b.json +++ /dev/null @@ -1,47 +0,0 @@ -[ - { - "Model": "LLaVA-NeXT-mistral-7b", - "Model Type": "Opensource VLM", - "Input Type": "Single Image", - "Organization": "UW-Madison & ByteDance", - "Alignment-Object": 25.9, - "Alignment-Attribute": 30.0, - "Alignment-Action": 41.9, - "Alignment-Location": 33.8, - "Alignment-Count": 35.7, - "Alignment-Avg": 31.3, - "Safety-Toxicity-Crime": 20.7, - "Safety-Toxicity-Shocking": 24.1, - "Safety-Toxicity-Disgust": 19.0, - "Safety-Toxicity-Avg": 21.3, - "Safety-Nsfw-Evident": 35.7, - "Safety-Nsfw-Evasive": 14.1, - "Safety-Nsfw-Subtle": 23.3, - "Safety-Nsfw-Avg": 25.6, - "Quality-Distortion-Human_face": 28.4, - "Quality-Distortion-Human_limb": 27.8, - "Quality-Distortion-Object": 19.0, - "Quality-Distortion-Avg": 30.1, - "Quality-Blurry-Defocused": 41.7, - "Quality-Blurry-Motion": 66.1, - "Quality-Blurry-Avg": 53.9, - "Bias-Age": 54.3, - "Bias-Gender": 56.7, - "Bias-Race": 57.0, - "Bias-Nationality": 56.1, - "Bias-Religion": 64.8, - "Bias-Avg": 56.6, - "Bias-Age-NDS": 63.2, - "Bias-Gender-NDS": 64.1, - "Bias-Race-NDS": 62.5, - "Bias-Nationality-NDS": 63.8, - "Bias-Religion-NDS": 74.2, - "Bias-Avg-NDS": 64.1, - "Bias-Age-GES": 82.1, - "Bias-Gender-GES": 82.8, - "Bias-Race-GES": 82.4, - "Bias-Nationality-GES": 82.5, - "Bias-Religion-GES": 87.8, - "Bias-Avg-GES": 82.8 - } -] \ No newline at end of file diff --git a/evals/mjbench-results/detailed-results/LLaVA-NeXT-vicuna-13b.json b/evals/mjbench-results/detailed-results/LLaVA-NeXT-vicuna-13b.json deleted file mode 100644 index 575a09f2ba54bb01200aaa8d1569cfcbcf81bf56..0000000000000000000000000000000000000000 --- a/evals/mjbench-results/detailed-results/LLaVA-NeXT-vicuna-13b.json +++ /dev/null @@ -1,35 +0,0 @@ -[ - { - "Model": "LLaVA-NeXT-vicuna-13b", - "Model Type": "Opensource VLM", - "Input Type": "Single Image", - "Organization": "UW-Madison & ByteDance", - "Alignment-Object": 25.9, - "Alignment-Attribute": 27.4, - "Alignment-Action": 31.6, - "Alignment-Location": 38.9, - "Alignment-Count": 32.1, - "Alignment-Avg": 29.1, - "Safety-Toxicity-Crime": 44.8, - "Safety-Toxicity-Shocking": 37.9, - "Safety-Toxicity-Disgust": 52.4, - "Safety-Toxicity-Avg": 43.8, - "Safety-Nsfw-Evident": 40.9, - "Safety-Nsfw-Evasive": 25.1, - "Safety-Nsfw-Subtle": 27.8, - "Safety-Nsfw-Avg": 36.5, - "Quality-Distortion-Human_face": 18.9, - "Quality-Distortion-Human_limb": 27.8, - "Quality-Distortion-Object": 12.0, - "Quality-Distortion-Avg": 20.5, - "Quality-Blurry-Defocused": 40.6, - "Quality-Blurry-Motion": 45.4, - "Quality-Blurry-Avg": 43.0, - "Bias-Age": 54.3, - "Bias-Gender": 56.7, - "Bias-Race": 57.0, - "Bias-Nationality": 56.1, - "Bias-Religion": 64.8, - "Bias-Avg": 56.6 - } -] \ No newline at end of file diff --git a/evals/mjbench-results/detailed-results/MiniGPT4-v2.json b/evals/mjbench-results/detailed-results/MiniGPT4-v2.json deleted file mode 100644 index 26289b1e61eec04717e06565994c4dbc6482ed5b..0000000000000000000000000000000000000000 --- a/evals/mjbench-results/detailed-results/MiniGPT4-v2.json +++ /dev/null @@ -1,47 +0,0 @@ -[ - { - "Model": "MiniGPT4-v2", - "Model Type": "Opensource VLM", - "Input Type": "Single Image", - "Organization": "Vision-CAIR", - "Alignment-Object": 37.5, - "Alignment-Attribute": 30.9, - "Alignment-Action": 30.8, - "Alignment-Location": 32.5, - "Alignment-Count": 39.3, - "Alignment-Avg": 32.8, - "Safety-Toxicity-Crime": 41.4, - "Safety-Toxicity-Shocking": 62.1, - "Safety-Toxicity-Disgust": 42.9, - "Safety-Toxicity-Avg": 48.3, - "Safety-Nsfw-Evident": 39.6, - "Safety-Nsfw-Evasive": 21.4, - "Safety-Nsfw-Subtle": 36.5, - "Safety-Nsfw-Avg": 32.6, - "Quality-Distortion-Human_face": 39.6, - "Quality-Distortion-Human_limb": 39.1, - "Quality-Distortion-Object": 42.0, - "Quality-Distortion-Avg": 40.0, - "Quality-Blurry-Defocused": 33.4, - "Quality-Blurry-Motion": 37.4, - "Quality-Blurry-Avg": 35.4, - "Bias-Age": 31.8, - "Bias-Gender": 32.2, - "Bias-Race": 31.9, - "Bias-Nationality": 34.1, - "Bias-Religion": 28.3, - "Bias-Avg": 32.2, - "Bias-Age-NDS": 68.1, - "Bias-Gender-NDS": 67.2, - "Bias-Race-NDS": 66.2, - "Bias-Nationality-NDS": 67.0, - "Bias-Religion-NDS": 69.3, - "Bias-Avg-NDS": 67.2, - "Bias-Age-GES": 83.7, - "Bias-Gender-GES": 83.3, - "Bias-Race-GES": 82.8, - "Bias-Nationality-GES": 83.4, - "Bias-Religion-GES": 84.1, - "Bias-Avg-GES": 83.3 - } -] \ No newline at end of file diff --git a/evals/mjbench-results/detailed-results/PickScore-v1.json b/evals/mjbench-results/detailed-results/PickScore-v1.json deleted file mode 100644 index 5854469254effa5fd4b258a4fd1330437543ff65..0000000000000000000000000000000000000000 --- a/evals/mjbench-results/detailed-results/PickScore-v1.json +++ /dev/null @@ -1,47 +0,0 @@ -[ - { - "Model": "PickScore-v1", - "Model Type": "Opensource VLM", - "Input Type": "Single Image", - "Organization": "Stability AI", - "Alignment-Object": 60.9, - "Alignment-Attribute": 60.3, - "Alignment-Action": 62.4, - "Alignment-Location": 59.2, - "Alignment-Count": 67.9, - "Alignment-Avg": 60.9, - "Safety-Toxicity-Crime": 89.7, - "Safety-Toxicity-Shocking": 82.8, - "Safety-Toxicity-Disgust": 88.1, - "Safety-Toxicity-Avg": 86.5, - "Safety-Nsfw-Evident": 3.1, - "Safety-Nsfw-Evasive": 48.2, - "Safety-Nsfw-Subtle": 2.1, - "Safety-Nsfw-Avg": 32.2, - "Quality-Distortion-Human_face": 83.4, - "Quality-Distortion-Human_limb": 68.2, - "Quality-Distortion-Object": 92.1, - "Quality-Distortion-Avg": 79.3, - "Quality-Blurry-Defocused": 80.6, - "Quality-Blurry-Motion": 93.4, - "Quality-Blurry-Avg": 86.6, - "Bias-Age": 30.4, - "Bias-Gender": 31.1, - "Bias-Race": 30.8, - "Bias-Nationality": 31.7, - "Bias-Religion": 33.0, - "Bias-Avg": 31.1, - "Bias-Age-NDS": 65.3, - "Bias-Gender-NDS": 66.7, - "Bias-Race-NDS": 66.4, - "Bias-Nationality-NDS": 67.3, - "Bias-Religion-NDS": 69.4, - "Bias-Avg-NDS": 66.7, - "Bias-Age-GES": 80.5, - "Bias-Gender-GES": 81.2, - "Bias-Race-GES": 81.0, - "Bias-Nationality-GES": 81.6, - "Bias-Religion-GES": 82.6, - "Bias-Avg-GES": 81.2 - } -] \ No newline at end of file diff --git a/evals/mjbench-results/detailed-results/Prometheus-Vision-13b.json b/evals/mjbench-results/detailed-results/Prometheus-Vision-13b.json deleted file mode 100644 index c5e4def65a6d342895a9f0be3da3903ede4684d6..0000000000000000000000000000000000000000 --- a/evals/mjbench-results/detailed-results/Prometheus-Vision-13b.json +++ /dev/null @@ -1,47 +0,0 @@ -[ - { - "Model": "Prometheus-Vision-13b", - "Model Type": "Opensource VLM", - "Input Type": "Single Image", - "Organization": "prometheus-eval", - "Alignment-Object": 14.3, - "Alignment-Attribute": 10.9, - "Alignment-Action": 9.4, - "Alignment-Location": 11.7, - "Alignment-Count": 16.1, - "Alignment-Avg": 11.8, - "Safety-Toxicity-Crime": 0.0, - "Safety-Toxicity-Shocking": 0.0, - "Safety-Toxicity-Disgust": 0.0, - "Safety-Toxicity-Avg": 0.0, - "Safety-Nsfw-Evident": 6.5, - "Safety-Nsfw-Evasive": 4.1, - "Safety-Nsfw-Subtle": 4.2, - "Safety-Nsfw-Avg": 5.3, - "Quality-Distortion-Human_face": 7.1, - "Quality-Distortion-Human_limb": 4.6, - "Quality-Distortion-Object": 7.2, - "Quality-Distortion-Avg": 6.2, - "Quality-Blurry-Defocused": 9.4, - "Quality-Blurry-Motion": 10.6, - "Quality-Blurry-Avg": 10.0, - "Bias-Age": 65.1, - "Bias-Gender": 65.8, - "Bias-Race": 63.4, - "Bias-Nationality": 65.7, - "Bias-Religion": 77.1, - "Bias-Avg": 65.8, - "Bias-Age-NDS": 54.2, - "Bias-Gender-NDS": 44.7, - "Bias-Race-NDS": 36.0, - "Bias-Nationality-NDS": 39.3, - "Bias-Religion-NDS": 65.7, - "Bias-Avg-NDS": 44.7, - "Bias-Age-GES": 79.2, - "Bias-Gender-GES": 76.0, - "Bias-Race-GES": 72.7, - "Bias-Nationality-GES": 74.1, - "Bias-Religion-GES": 85.1, - "Bias-Avg-GES": 76.0 - } -] \ No newline at end of file diff --git a/evals/mjbench-results/detailed-results/Prometheus-Vision-7b.json b/evals/mjbench-results/detailed-results/Prometheus-Vision-7b.json deleted file mode 100644 index c374626d27b070307d40c015b096bc4fcbe13c3a..0000000000000000000000000000000000000000 --- a/evals/mjbench-results/detailed-results/Prometheus-Vision-7b.json +++ /dev/null @@ -1,47 +0,0 @@ -[ - { - "Model": "Prometheus-Vision-7b", - "Model Type": "Opensource VLM", - "Input Type": "Single Image", - "Organization": "prometheus-eval", - "Alignment-Object": 19.5, - "Alignment-Attribute": 15.2, - "Alignment-Action": 16.2, - "Alignment-Location": 22.1, - "Alignment-Count": 26.8, - "Alignment-Avg": 18.8, - "Safety-Toxicity-Crime": 0.0, - "Safety-Toxicity-Shocking": 0.0, - "Safety-Toxicity-Disgust": 0.0, - "Safety-Toxicity-Avg": 0.0, - "Safety-Nsfw-Evident": 10.3, - "Safety-Nsfw-Evasive": 6.8, - "Safety-Nsfw-Subtle": 4.3, - "Safety-Nsfw-Avg": 7.1, - "Quality-Distortion-Human_face": 16.6, - "Quality-Distortion-Human_limb": 17.9, - "Quality-Distortion-Object": 14.1, - "Quality-Distortion-Avg": 16.4, - "Quality-Blurry-Defocused": 22.3, - "Quality-Blurry-Motion": 30.3, - "Quality-Blurry-Avg": 26.3, - "Bias-Age": 43.8, - "Bias-Gender": 50.4, - "Bias-Race": 54.4, - "Bias-Nationality": 53.6, - "Bias-Religion": 44.9, - "Bias-Avg": 50.4, - "Bias-Age-NDS": 47.2, - "Bias-Gender-NDS": 42.5, - "Bias-Race-NDS": 37.8, - "Bias-Nationality-NDS": 40.0, - "Bias-Religion-NDS": 54.2, - "Bias-Avg-NDS": 42.5, - "Bias-Age-GES": 74.9, - "Bias-Gender-GES": 74.3, - "Bias-Race-GES": 73.1, - "Bias-Nationality-GES": 74.2, - "Bias-Religion-GES": 77.3, - "Bias-Avg-GES": 74.3 - } -] \ No newline at end of file diff --git a/evals/mjbench-results/detailed-results/Qwen-VL-Chat.json b/evals/mjbench-results/detailed-results/Qwen-VL-Chat.json deleted file mode 100644 index 085e88aa68431ec56a3f4a820f89f53b9cd3a114..0000000000000000000000000000000000000000 --- a/evals/mjbench-results/detailed-results/Qwen-VL-Chat.json +++ /dev/null @@ -1,47 +0,0 @@ -[ - { - "Model": "Qwen-VL-Chat", - "Model Type": "Opensource VLM", - "Input Type": "Multi Image", - "Organization": "Alibaba", - "Alignment-Object": 30.7, - "Alignment-Attribute": 29.1, - "Alignment-Action": 35.9, - "Alignment-Location": 29.9, - "Alignment-Count": 32.1, - "Alignment-Avg": 31.1, - "Safety-Toxicity-Crime": 27.6, - "Safety-Toxicity-Shocking": 13.8, - "Safety-Toxicity-Disgust": 31.0, - "Safety-Toxicity-Avg": 24.7, - "Safety-Nsfw-Evident": 18.9, - "Safety-Nsfw-Evasive": 7.6, - "Safety-Nsfw-Subtle": 6.3, - "Safety-Nsfw-Avg": 11.6, - "Quality-Distortion-Human_face": 14.2, - "Quality-Distortion-Human_limb": 15.9, - "Quality-Distortion-Object": 9.4, - "Quality-Distortion-Avg": 13.6, - "Quality-Blurry-Defocused": 0.9, - "Quality-Blurry-Motion": 2.1, - "Quality-Blurry-Avg": 1.4, - "Bias-Age": 70.8, - "Bias-Gender": 71.5, - "Bias-Race": 72.3, - "Bias-Nationality": 72.2, - "Bias-Religion": 68.1, - "Bias-Avg": 71.5, - "Bias-Age-NDS": 62.4, - "Bias-Gender-NDS": 62.3, - "Bias-Race-NDS": 62.3, - "Bias-Nationality-NDS": 63.1, - "Bias-Religion-NDS": 58.9, - "Bias-Avg-NDS": 62.3, - "Bias-Age-GES": 85.9, - "Bias-Gender-GES": 86.0, - "Bias-Race-GES": 86.0, - "Bias-Nationality-GES": 86.4, - "Bias-Religion-GES": 83.8, - "Bias-Avg-GES": 85.9 - } -] \ No newline at end of file diff --git a/evals/mjbench-results/overall-results/AestheticsPredictor.json b/evals/mjbench-results/overall-results/AestheticsPredictor.json deleted file mode 100644 index bd1d5fd828bf7d6521dd5b6a9019a4636e43b1b0..0000000000000000000000000000000000000000 --- a/evals/mjbench-results/overall-results/AestheticsPredictor.json +++ /dev/null @@ -1,12 +0,0 @@ -[ - { - "Model": "AestheticsPredictor", - "Model Type": "Score Model", - "Input Type": "Single Image", - "Organization": "LAION", - "Alignment": 32.4, - "Safety": 27.0, - "Quality": 69.6, - "Bias": 61.4 - } -] \ No newline at end of file diff --git a/evals/mjbench-results/overall-results/BLIP-v2.json b/evals/mjbench-results/overall-results/BLIP-v2.json deleted file mode 100644 index d8d392d4babd6d70a1e5fa4190ef2f91c972cc6b..0000000000000000000000000000000000000000 --- a/evals/mjbench-results/overall-results/BLIP-v2.json +++ /dev/null @@ -1,12 +0,0 @@ -[ - { - "Model": "BLIP-v2", - "Model Type": "Score Model", - "Input Type": "Single Image", - "Organization": "Salesforce", - "Alignment": 17.3, - "Safety": 44.0, - "Quality": 7.5, - "Bias": 68.7 - } -] \ No newline at end of file diff --git a/evals/mjbench-results/overall-results/CLIP-v2.json b/evals/mjbench-results/overall-results/CLIP-v2.json deleted file mode 100644 index c5be27d71bb1535111f54335aba3c2d626c558f9..0000000000000000000000000000000000000000 --- a/evals/mjbench-results/overall-results/CLIP-v2.json +++ /dev/null @@ -1,12 +0,0 @@ -[ - { - "Model": "CLIP-v2", - "Model Type": "Score Model", - "Input Type": "Single Image", - "Organization": "LAION", - "Alignment": 38.1, - "Safety": 12.7, - "Quality": 34.4, - "Bias": 57.4 - } -] \ No newline at end of file diff --git a/evals/mjbench-results/overall-results/Claude 3 Opus.json b/evals/mjbench-results/overall-results/Claude 3 Opus.json deleted file mode 100644 index fa5468c84232920bbedad957687d661ea7061d5c..0000000000000000000000000000000000000000 --- a/evals/mjbench-results/overall-results/Claude 3 Opus.json +++ /dev/null @@ -1,12 +0,0 @@ -[ - { - "Model": "Claude 3 Opus", - "Model Type": "Closesource VLM", - "Input Type": "Multi Image", - "Organization": "Anthropic", - "Alignment": 57.1, - "Safety": 13.4, - "Quality": 11.9, - "Bias": 57.7 - } -] \ No newline at end of file diff --git a/evals/mjbench-results/overall-results/GPT-4-vision.json b/evals/mjbench-results/overall-results/GPT-4-vision.json deleted file mode 100644 index a818288bc0b2b7f7133c045d354ed2286d501a82..0000000000000000000000000000000000000000 --- a/evals/mjbench-results/overall-results/GPT-4-vision.json +++ /dev/null @@ -1,12 +0,0 @@ -[ - { - "Model": "GPT-4-vision", - "Model Type": "Closesource VLM", - "Input Type": "Multi Image", - "Organization": "OpenAI", - "Alignment": 66.1, - "Safety": 26.5, - "Quality": 90.4, - "Bias": 79.0 - } -] \ No newline at end of file diff --git a/evals/mjbench-results/overall-results/GPT-4o.json b/evals/mjbench-results/overall-results/GPT-4o.json deleted file mode 100644 index 4a362e8be373ca092693dbea9cbb36bc334a9f4e..0000000000000000000000000000000000000000 --- a/evals/mjbench-results/overall-results/GPT-4o.json +++ /dev/null @@ -1,12 +0,0 @@ -[ - { - "Model": "GPT-4o", - "Model Type": "Closesource VLM", - "Input Type": "Multi Image", - "Organization": "OpenAI", - "Alignment": 61.5, - "Safety": 35.3, - "Quality": 97.6, - "Bias": 65.8 - } -] \ No newline at end of file diff --git a/evals/mjbench-results/overall-results/Gemini Ultra.json b/evals/mjbench-results/overall-results/Gemini Ultra.json deleted file mode 100644 index 2cad834cdb0d966ccdecc09c8077afdd32a316ec..0000000000000000000000000000000000000000 --- a/evals/mjbench-results/overall-results/Gemini Ultra.json +++ /dev/null @@ -1,12 +0,0 @@ -[ - { - "Model": "Gemini Ultra", - "Model Type": "Closesource VLM", - "Input Type": "Multi Image", - "Organization": "Google", - "Alignment": 67.2, - "Safety": 13.1, - "Quality": 55.7, - "Bias": 55.6 - } -] \ No newline at end of file diff --git a/evals/mjbench-results/overall-results/HPS-v2.1.json b/evals/mjbench-results/overall-results/HPS-v2.1.json deleted file mode 100644 index 5d4122b574e10774788e41c24f77378c2f51ef9d..0000000000000000000000000000000000000000 --- a/evals/mjbench-results/overall-results/HPS-v2.1.json +++ /dev/null @@ -1,12 +0,0 @@ -[ - { - "Model": "HPS-v2.1", - "Model Type": "Score Model", - "Input Type": "Single Image", - "Organization": "CUHK MMLab", - "Alignment": 47.3, - "Safety": 18.8, - "Quality": 67.3, - "Bias": 55.0 - } -] \ No newline at end of file diff --git a/evals/mjbench-results/overall-results/Idefics2-8b.json b/evals/mjbench-results/overall-results/Idefics2-8b.json deleted file mode 100644 index 57cfc46d20232f9665bfe4a9ab84933e618969c0..0000000000000000000000000000000000000000 --- a/evals/mjbench-results/overall-results/Idefics2-8b.json +++ /dev/null @@ -1,12 +0,0 @@ -[ - { - "Model": "Idefics2-8b", - "Model Type": "Opensource VLM", - "Input Type": "Multi Image", - "Organization": "HuggingFace", - "Alignment": 32.6, - "Safety": 13.6, - "Quality": 46.1, - "Bias": 42.1 - } -] \ No newline at end of file diff --git a/evals/mjbench-results/overall-results/ImageReward.json b/evals/mjbench-results/overall-results/ImageReward.json deleted file mode 100644 index d2f29fde983e3e54d69704942443881073de5625..0000000000000000000000000000000000000000 --- a/evals/mjbench-results/overall-results/ImageReward.json +++ /dev/null @@ -1,12 +0,0 @@ -[ - { - "Model": "ImageReward", - "Model Type": "Score Model", - "Input Type": "Single Image", - "Organization": "THUDM", - "Alignment": 50.9, - "Safety": 24.9, - "Quality": 63.5, - "Bias": 40.9 - } -] \ No newline at end of file diff --git a/evals/mjbench-results/overall-results/Instructblip-7b.json b/evals/mjbench-results/overall-results/Instructblip-7b.json deleted file mode 100644 index 009adac0097399bc70e9ff0291a6cbd062c0e640..0000000000000000000000000000000000000000 --- a/evals/mjbench-results/overall-results/Instructblip-7b.json +++ /dev/null @@ -1,12 +0,0 @@ -[ - { - "Model": "Instructblip-7b", - "Model Type": "Opensource VLM", - "Input Type": "Single Image", - "Organization": "Salesforce", - "Alignment": 17.1, - "Safety": 26.4, - "Quality": 25.2, - "Bias": 53.1 - } -] \ No newline at end of file diff --git a/evals/mjbench-results/overall-results/InternVL-Chat-V1-5.json b/evals/mjbench-results/overall-results/InternVL-Chat-V1-5.json deleted file mode 100644 index 10ef2cf34986bdb757ac402f1de482b81bc0f9c7..0000000000000000000000000000000000000000 --- a/evals/mjbench-results/overall-results/InternVL-Chat-V1-5.json +++ /dev/null @@ -1,12 +0,0 @@ -[ - { - "Model": "InternVL-Chat-V1-5", - "Model Type": "Opensource VLM", - "Input Type": "Multi Image", - "Organization": "OpenGVLab", - "Alignment": 55.3, - "Safety": 6.3, - "Quality": 66.3, - "Bias": 25.4 - } -] \ No newline at end of file diff --git a/evals/mjbench-results/overall-results/LLaVA-1.5-13b.json b/evals/mjbench-results/overall-results/LLaVA-1.5-13b.json deleted file mode 100644 index c32118cd2d0c905d56aafeebec4ee50facf1ec0f..0000000000000000000000000000000000000000 --- a/evals/mjbench-results/overall-results/LLaVA-1.5-13b.json +++ /dev/null @@ -1,12 +0,0 @@ -[ - { - "Model": "LLaVA-1.5-13b", - "Model Type": "Opensource VLM", - "Input Type": "Single Image", - "Organization": "UW-Madison & Microsoft", - "Alignment": 10.3, - "Safety": 30.7, - "Quality": 23.3, - "Bias": 69.7 - } -] \ No newline at end of file diff --git a/evals/mjbench-results/overall-results/LLaVA-1.5-7b.json b/evals/mjbench-results/overall-results/LLaVA-1.5-7b.json deleted file mode 100644 index 8a7644dde22f55034a9eaacf5b1055410fe7235f..0000000000000000000000000000000000000000 --- a/evals/mjbench-results/overall-results/LLaVA-1.5-7b.json +++ /dev/null @@ -1,12 +0,0 @@ -[ - { - "Model": "LLaVA-1.5-7b", - "Model Type": "Opensource VLM", - "Input Type": "Single Image", - "Organization": "UW-Madison & Microsoft", - "Alignment": 22.0, - "Safety": 24.8, - "Quality": 12.4, - "Bias": 83.7 - } -] \ No newline at end of file diff --git a/evals/mjbench-results/overall-results/LLaVA-NeXT-mistral-7b.json b/evals/mjbench-results/overall-results/LLaVA-NeXT-mistral-7b.json deleted file mode 100644 index a2a442ec6b213481f21b9a6682db5764a96ee9f8..0000000000000000000000000000000000000000 --- a/evals/mjbench-results/overall-results/LLaVA-NeXT-mistral-7b.json +++ /dev/null @@ -1,12 +0,0 @@ -[ - { - "Model": "LLaVA-NeXT-mistral-7b", - "Model Type": "Opensource VLM", - "Input Type": "Single Image", - "Organization": "UW-Madison & ByteDance", - "Alignment": 31.3, - "Safety": 15.2, - "Quality": 45.8, - "Bias": 69.9 - } -] \ No newline at end of file diff --git a/evals/mjbench-results/overall-results/LLaVA-NeXT-vicuna-13b.json b/evals/mjbench-results/overall-results/LLaVA-NeXT-vicuna-13b.json deleted file mode 100644 index efdb8b8272cea47dd2854903a4abb20a8214a6c8..0000000000000000000000000000000000000000 --- a/evals/mjbench-results/overall-results/LLaVA-NeXT-vicuna-13b.json +++ /dev/null @@ -1,12 +0,0 @@ -[ - { - "Model": "LLaVA-NeXT-vicuna-13b", - "Model Type": "Opensource VLM", - "Input Type": "Single Image", - "Organization": "UW-Madison & ByteDance", - "Alignment": 29.1, - "Safety": 27.9, - "Quality": 36.8, - "Bias": 56.3 - } -] \ No newline at end of file diff --git a/evals/mjbench-results/overall-results/MiniGPT4-v2.json b/evals/mjbench-results/overall-results/MiniGPT4-v2.json deleted file mode 100644 index e9888130a5a5eb7196706f0be64296e6f06ac7b4..0000000000000000000000000000000000000000 --- a/evals/mjbench-results/overall-results/MiniGPT4-v2.json +++ /dev/null @@ -1,12 +0,0 @@ -[ - { - "Model": "MiniGPT4-v2", - "Model Type": "Opensource VLM", - "Input Type": "Single Image", - "Organization": "Vision-CAIR", - "Alignment": 32.8, - "Safety": 25.7, - "Quality": 36.7, - "Bias": 32.6 - } -] \ No newline at end of file diff --git a/evals/mjbench-results/overall-results/PickScore-v1.json b/evals/mjbench-results/overall-results/PickScore-v1.json deleted file mode 100644 index 04e87a465498baf1760da7850d29e8786fc29c43..0000000000000000000000000000000000000000 --- a/evals/mjbench-results/overall-results/PickScore-v1.json +++ /dev/null @@ -1,12 +0,0 @@ -[ - { - "Model": "PickScore-v1", - "Model Type": "Opensource VLM", - "Input Type": "Single Image", - "Organization": "Stability AI", - "Alignment": 58.8, - "Safety": 37.2, - "Quality": 83.8, - "Bias": 31.0 - } -] \ No newline at end of file diff --git a/evals/mjbench-results/overall-results/Prometheus-Vision-13b.json b/evals/mjbench-results/overall-results/Prometheus-Vision-13b.json deleted file mode 100644 index 8d37113f0430487ff0e07ab9c29c2c1e17f7a9a7..0000000000000000000000000000000000000000 --- a/evals/mjbench-results/overall-results/Prometheus-Vision-13b.json +++ /dev/null @@ -1,12 +0,0 @@ -[ - { - "Model": "Prometheus-Vision-13b", - "Model Type": "Opensource VLM", - "Input Type": "Single Image", - "Organization": "prometheus-eval", - "Alignment": 11.8, - "Safety": 3.6, - "Quality": 8.7, - "Bias": 66.3 - } -] \ No newline at end of file diff --git a/evals/mjbench-results/overall-results/Prometheus-Vision-7b.json b/evals/mjbench-results/overall-results/Prometheus-Vision-7b.json deleted file mode 100644 index b3bcddb50f801fca470b0571aa6656cc18f1eb9b..0000000000000000000000000000000000000000 --- a/evals/mjbench-results/overall-results/Prometheus-Vision-7b.json +++ /dev/null @@ -1,12 +0,0 @@ -[ - { - "Model": "Prometheus-Vision-7b", - "Model Type": "Opensource VLM", - "Input Type": "Single Image", - "Organization": "prometheus-eval", - "Alignment": 18.8, - "Safety": 7.1, - "Quality": 23.4, - "Bias": 49.5 - } -] \ No newline at end of file diff --git a/evals/mjbench-results/overall-results/Qwen-VL-Chat.json b/evals/mjbench-results/overall-results/Qwen-VL-Chat.json deleted file mode 100644 index b87f82b8c05d8a3f6b4e8ffdaa980d13ce79fc9a..0000000000000000000000000000000000000000 --- a/evals/mjbench-results/overall-results/Qwen-VL-Chat.json +++ /dev/null @@ -1,12 +0,0 @@ -[ - { - "Model": "Qwen-VL-Chat", - "Model Type": "Opensource VLM", - "Input Type": "Multi Image", - "Organization": "Alibaba", - "Alignment": 52.1, - "Safety": 26.8, - "Quality": 23.6, - "Bias": 71.9 - } -] \ No newline at end of file diff --git a/src/about.py b/src/about.py index 1129c404ad8d5859930e72db21bb460c3c065419..103609ca5bd902e3c7b0b905c63e1de4e7bf65aa 100644 --- a/src/about.py +++ b/src/about.py @@ -21,15 +21,14 @@ NUM_FEWSHOT = 0 # Change with your few shot # Your leaderboard name -TITLE = """

MJ-Bench

""" +TITLE = """

MMIE

""" -MJB_LOGO = 'Logo' +# MJB_LOGO = 'Logo' # What does your leaderboard evaluate? INTRODUCTION_TEXT = """ -# Multimodal Judge Benchmark (MJ-Bench): Is Your Multimodal Reward Model Really a Good Judge? -### Evaluating the `Alignment`, `Quality`, `Safety`, and `Bias` of multimodal reward models -[Website](https://mj-bench.github.io) | [Code](https://github.com/MJ-Bench/MJ-Bench) | [Eval. Dataset](https://huggingface.co/datasets/MJ-Bench/MJ-Bench) | [Results](https://huggingface.co/datasets/MJ-Bench/MJ-Bench-Results) | [Refined Model via RMs](https://huggingface.co/collections/MJ-Bench/aligned-diffusion-model-via-dpo-667f8b71f35c3ff47acafd43) | [Paper](https://arxiv.org/abs/2407.04842) | Total models: {} +# MMIE: Massive Multimodal Interleaved Comprehension Benchmark for Large Vision-Language Models +[Website](https://github.com/richard-peng-xia/MMIE) | [Code](https://github.com/richard-peng-xia/MMIE) | [Dataset](https://huggingface.co/datasets/MMIE/MMIE) | [Results](https://huggingface.co/datasets/MMIE/MMIE-Leaderboard) | [Eval Model](https://huggingface.co/MMIE/MMIE-Eval) | [Paper]() """ # Which evaluations are you running? how can people reproduce what you have? diff --git a/src/envs.py b/src/envs.py index b272764eb14262d3250f6dd7109f80a1e28ef1de..a9ef3f7c3a008f04b61e2adb4e7fec026744bb98 100644 --- a/src/envs.py +++ b/src/envs.py @@ -9,9 +9,9 @@ TOKEN = os.environ.get("TOKEN") # A read/write token for your org OWNER = "demo-leaderboard-backend" # Change to your org - don't forget to create a results and request dataset, with the correct format! # ---------------------------------- -REPO_ID = f"MJ-Bench/MJ-Bench-Leaderboard" -QUEUE_REPO = f"MJ-Bench/MJ-Bench-Requests" -RESULTS_REPO = f"MJ-Bench/MJ-Bench-Results" +REPO_ID = f"MMIE/MMIE-Leaderboard" +QUEUE_REPO = f"MMIE/MMIE-Requests" +RESULTS_REPO = f"MMIE/MMIE-Results" # If you setup a cache later, just change HF_HOME CACHE_PATH=os.getenv("HF_HOME", ".") diff --git a/src/logo.png b/src/logo.png new file mode 100644 index 0000000000000000000000000000000000000000..49f94e19e51c3da80ebd7d81a5466f1af3bd15bd Binary files /dev/null and b/src/logo.png differ diff --git a/src/mj-bench-logo.jpg b/src/mj-bench-logo.jpg deleted file mode 100644 index 0b0c6791326da6e3c33a516dc333f31a249f5d79..0000000000000000000000000000000000000000 Binary files a/src/mj-bench-logo.jpg and /dev/null differ diff --git a/src/mj-bench-logo.png b/src/mj-bench-logo.png deleted file mode 100644 index dfd8e707aa696a375b69ea87d9af18f8ba313700..0000000000000000000000000000000000000000 --- a/src/mj-bench-logo.png +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:60efca7a8bf13a68b6f44416b40f0a2a19538fc06ed3b89202c9c42d6105d8ea -size 5639044