|
import pandas as pd |
|
import gradio as gr |
|
import csv |
|
import json |
|
import os |
|
import shutil |
|
from huggingface_hub import Repository |
|
import numpy as np |
|
|
|
|
|
# Per-model keyword statistics. Elsewhere in this file it is indexed as
# MODEL_DATA[model][dimension][keyword]["average_score"].
# The encoding is pinned to UTF-8 so that decoding does not depend on the
# platform's locale default.
with open("./static/eval_results/all_model_keywords_stats.json", "r", encoding="utf-8") as f:
    MODEL_DATA = json.load(f)

# Per-model summary scores. Elsewhere in this file it is read via
# SUMMARY_DATA[model]["overall_score"] and the "macro_mean_score" entries of
# the "core_noncot", "core_cot" and "open" sections.
with open("./static/eval_results/all_summary.json", "r", encoding="utf-8") as f:
    SUMMARY_DATA = json.load(f)
|
|
|
|
|
|
|
# Maps the model identifiers used as keys in the evaluation data to the
# names shown in the leaderboard. Identifiers missing from this map are
# displayed unchanged (see get_display_model_name).
MODEL_NAME_MAP = {
    # Larger models
    "GPT_4o": "GPT-4o (0513)",
    "Claude_3.5": "Claude-3.5-Sonnet",
    "Gemini_1.5_pro_002": "Gemini-1.5-Pro-002",
    "InternVL2_76B": "InternVL2-Llama3-76B",
    "Qwen2_VL_72B": "Qwen2-VL-72B",
    "llava_onevision_72B": "Llava-OneVision-72B",
    # Smaller models
    "GPT_4o_mini": "GPT-4o mini",
    "Gemini_1.5_flash_002": "Gemini-1.5-Flash-002",
    "Pixtral_12B": "Pixtral 12B",
    "Qwen2_VL_7B": "Qwen2-VL-7B",
    "InternVL2_8B": "InternVL2-8B",
    "llava_onevision_7B": "Llava-OneVision-7B",
    "Llama_3_2_11B": "Llama-3.2-11B",
    "Phi-3.5-vision": "Phi-3.5-Vision",
    "MiniCPM_v2.6": "MiniCPM-V2.6",
    "Idefics3": "Idefics3-8B-Llama3",
}
|
|
|
|
|
# Maps the dimension keys used in the keyword statistics to the labels shown
# in the UI. get_original_dimension performs the reverse lookup, so the
# labels must stay unique.
DIMENSION_NAME_MAP = {
    "skills": "Skills",
    "input_format": "Input Format",
    "output_format": "Output Format",
    "input_num": "Visual Input Number",
    "app": "Application",
}
|
|
|
# Maps the keyword names used in the keyword statistics to the short column
# labels shown in the leaderboard. Keywords missing from this map are shown
# unchanged. get_original_keyword performs the reverse lookup, so the labels
# must stay unique.
KEYWORD_NAME_MAP = {
    # Skill keywords
    "Object Recognition and Classification": "Object Recognition",
    "Text Recognition (OCR)": "OCR",
    "Language Understanding and Generation": "Language",
    "Scene and Event Understanding": "Scene/Event",
    "Mathematical and Logical Reasoning": "Math/Logic",
    "Commonsense and Social Reasoning": "Commonsense",
    "Ethical and Safety Reasoning": "Ethics/Safety",
    "Domain-Specific Knowledge and Skills": "Domain-Specific",
    "Spatial and Temporal Reasoning": "Spatial/Temporal",
    "Planning and Decision Making": "Planning/Decision",

    # Visual input type keywords
    "User Interface Screenshots": "UI related",
    "Text-Based Images and Documents": "Documents",
    "Diagrams and Data Visualizations": "Infographics",
    "Videos": "Videos",
    "Artistic and Creative Content": "Arts/Creative",
    "Photographs": "Photographs",
    "3D Models and Aerial Imagery": "3D related",

    # Application keywords
    "Information_Extraction": "Info Extraction",
    "Planning": "Planning",
    "Coding": "Coding",
    "Perception": "Perception",
    "Metrics": "Metrics",
    "Science": "Science",
    "Knowledge": "Knowledge",
    "Mathematics": "Math",

    # Output format keywords
    "contextual_formatted_text": "Contextual",  # label was misspelled "Contexual"
    "structured_output": "Structured",
    "exact_text": "Exact",
    "numerical_data": "Numerical",
    "open_ended_output": "Open-ended",
    "multiple_choice": "MC",

    # Number-of-visual-inputs keywords
    "6-8 images": "6-8 imgs",
    "1-image": "1 img",
    "2-3 images": "2-3 imgs",
    "4-5 images": "4-5 imgs",
    "9-image or more": "9+ imgs",
    "video": "Video",
}
|
|
|
|
|
# Display dimension -> list of display keywords. The structure is taken from
# the first model in MODEL_DATA.
# NOTE(review): this presumes every model shares the same dimensions and
# keywords as the first one - confirm against the data files.
SUPER_GROUPS = {
    DIMENSION_NAME_MAP[dimension]: [KEYWORD_NAME_MAP.get(name, name) for name in keyword_stats]
    for dimension, keyword_stats in MODEL_DATA[next(iter(MODEL_DATA))].items()
}
|
|
|
# Name of the Hugging Face dataset repository that receives submissions.
SUBMISSION_NAME = "test_leaderboard_submission"

# URLs always use "/" as the separator, so the address is formatted directly
# instead of going through os.path.join, which is a filesystem-path helper.
SUBMISSION_URL = f"https://huggingface.co/datasets/cccjc/{SUBMISSION_NAME}"

# Despite the "_DIR" suffix, this is the path of the results CSV file itself.
CSV_DIR = "./test_leaderboard_submission/results.csv"
|
|
|
def get_original_dimension(mapped_dimension):
    """Return the data key whose display label is ``mapped_dimension``.

    Performs a reverse lookup in ``DIMENSION_NAME_MAP``.

    Args:
        mapped_dimension: A display label such as ``"Input Format"``.

    Returns:
        The matching key of ``DIMENSION_NAME_MAP`` (e.g. ``"input_format"``).

    Raises:
        ValueError: If no dimension has that display label. A bare ``next()``
            would leak ``StopIteration`` here, which gives no hint about the
            cause and is turned into ``RuntimeError`` when it escapes inside
            a generator.
    """
    for original, display in DIMENSION_NAME_MAP.items():
        if display == mapped_dimension:
            return original
    raise ValueError(f"Unknown dimension display name: {mapped_dimension!r}")
|
|
|
def get_original_keyword(mapped_keyword):
    """Return the data keyword whose display label is ``mapped_keyword``.

    Performs a reverse lookup in ``KEYWORD_NAME_MAP``. A label with no entry
    in the map is returned unchanged, mirroring the ``.get(k, k)`` fallback
    used when the display labels are built.
    """
    for original, display in KEYWORD_NAME_MAP.items():
        if display == mapped_keyword:
            return original
    return mapped_keyword
|
|
|
|
|
# Named selections of model identifiers (keys of MODEL_DATA) offered as
# leaderboard filters. The group names are used as lookup keys by get_df, so
# they are kept exactly as they were.
MODEL_GROUPS = {
    "All": list(MODEL_DATA),
    "Flagship Models": [
        'GPT_4o', 'Claude_3.5', 'Gemini_1.5_pro_002',
        'Qwen2_VL_72B', 'InternVL2_76B', 'llava_onevision_72B',
    ],
    "Efficiency Models": [
        'Gemini_1.5_flash_002', 'GPT_4o_mini',
        'Qwen2_VL_7B', 'Pixtral_12B', 'InternVL2_8B', 'Phi-3.5-vision',
        'MiniCPM_v2.6', 'llava_onevision_7B', 'Llama_3_2_11B', 'Idefics3',
    ],
    "Proprietary Flagship models": ['GPT_4o', 'Claude_3.5', 'Gemini_1.5_pro_002'],
    "Open-source Efficiency Models": [
        'Qwen2_VL_7B', 'Pixtral_12B', 'InternVL2_8B', 'Phi-3.5-vision',
        'MiniCPM_v2.6', 'llava_onevision_7B', 'Llama_3_2_11B', 'Idefics3',
    ],
    "Open-source Flagship Models": ['Qwen2_VL_72B', 'InternVL2_76B', 'llava_onevision_72B'],
    # Only the efficiency models that are NOT in the open-source list. This
    # entry used to be a copy of the whole "Efficiency Models" list and so
    # wrongly included the open-source models.
    "Proprietary Efficiency Models": ['Gemini_1.5_flash_002', 'GPT_4o_mini'],
}
|
|
|
def get_display_model_name(model_name):
    """Return the leaderboard display name for ``model_name``.

    Identifiers without an entry in ``MODEL_NAME_MAP`` are returned as-is.
    """
    try:
        return MODEL_NAME_MAP[model_name]
    except KeyError:
        return model_name
|
|
|
def get_df(selected_super_group, selected_model_group):
    """Build the leaderboard table for one dimension and one model group.

    Args:
        selected_super_group: Display label of a dimension (a key of
            ``SUPER_GROUPS``).
        selected_model_group: Name of a model group (a key of
            ``MODEL_GROUPS``).

    Returns:
        A ``pandas.DataFrame`` sorted by "Overall" in descending order. Its
        columns are "Models", "Overall", "Core", "Open-ended", followed by
        one column per keyword of the selected dimension, in the order of
        ``SUPER_GROUPS[selected_super_group]``. Scores are percentages
        rounded to two decimals; a keyword a model has no entry for is left
        empty.

    Notes:
        Rows are assembled positionally rather than as dicts keyed by column
        label. A keyword label can equal one of the fixed column names
        (``KEYWORD_NAME_MAP`` maps ``open_ended_output`` to "Open-ended"),
        and with dict rows the keyword score silently overwrote the summary
        score. In that case the frame now carries two columns with the same
        label, each holding its own value.
    """
    original_dimension = get_original_dimension(selected_super_group)
    keywords = SUPER_GROUPS[selected_super_group]
    # The reverse keyword lookup does not depend on the model, so it is done
    # once here instead of once per model inside the loop.
    original_keywords = [get_original_keyword(keyword) for keyword in keywords]
    columns = ["Models", "Overall", "Core", "Open-ended"] + keywords

    rows = []
    for model in MODEL_GROUPS[selected_model_group]:
        dimension_stats = MODEL_DATA[model].get(original_dimension, {})
        summary = SUMMARY_DATA[model]
        # "Core" reports the better of the two core macro mean scores.
        core_score = max(
            summary["core_noncot"]["macro_mean_score"],
            summary["core_cot"]["macro_mean_score"],
        )
        row = [
            get_display_model_name(model),
            round(summary["overall_score"] * 100, 2),
            round(core_score * 100, 2),
            round(summary["open"]["macro_mean_score"] * 100, 2),
        ]
        for original_keyword in original_keywords:
            if original_keyword in dimension_stats:
                row.append(round(dimension_stats[original_keyword]["average_score"] * 100, 2))
            else:
                row.append(None)
        rows.append(row)

    # Passing the columns explicitly keeps "Overall" present even when the
    # model group is empty, so the sort below cannot fail on a missing column.
    df = pd.DataFrame(rows, columns=columns)
    return df.sort_values(by="Overall", ascending=False)


def get_leaderboard_data(selected_super_group, selected_model_group):
    """Return the leaderboard as ``(headers, data)`` lists.

    ``headers`` is the list of column labels and ``data`` is a list of rows
    (lists of cell values) in the order produced by ``get_df``.
    """
    df = get_df(selected_super_group, selected_model_group)
    # get_df already emits the columns in header order. Reading them
    # positionally avoids label-based selection, which would pick a column
    # twice whenever a keyword label repeats a fixed column name.
    headers = df.columns.tolist()
    data = df.values.tolist()
    return headers, data
|
|