diff --git "a/static/eval_results/Default/Gemini_1.5_flash_002/task_results.json" "b/static/eval_results/Default/Gemini_1.5_flash_002/task_results.json"
new file mode 100644--- /dev/null
+++ "b/static/eval_results/Default/Gemini_1.5_flash_002/task_results.json"
@@ -0,0 +1,7756 @@
+[
+    {
+        "name": "monthly_weather_days_count",
+        "score": 0.3095238095238095,
+        "eval_type": "rule",
+        "num_demo": 1,
+        "num_query": 14,
+        "skills": [
+            "Object Recognition and Classification",
+            "Text Recognition (OCR)"
+        ],
+        "input_format": "User Interface Screenshots",
+        "app": "Perception",
+        "output_format": "structured_output",
+        "num_input": "1-image"
+    },
+    {
+        "name": "planning_visual_floortile",
+        "score": 0.0,
+        "eval_type": "rule",
+        "num_demo": 1,
+        "num_query": 15,
+        "skills": [
+            "Planning and Decision Making",
+            "Spatial and Temporal Reasoning"
+        ],
+        "input_format": "Diagrams and Data Visualizations",
+        "app": "Planning",
+        "output_format": "structured_output",
+        "num_input": "2-3 images"
+    },
+    {
+        "name": "geometry_reasoning_overlapped_circle",
+        "score": 0.6428571428571429,
+        "eval_type": "rule",
+        "num_demo": 1,
+        "num_query": 14,
+        "skills": [
+            "Object Recognition and Classification",
+            "Spatial and Temporal Reasoning",
+            "Mathematical and Logical Reasoning"
+        ],
+        "input_format": "Diagrams and Data Visualizations",
+        "app": "Mathematics",
+        "output_format": "structured_output",
+        "num_input": "1-image"
+    },
+    {
+        "name": "geometry_reasoning_grid",
+        "score": 0.6785714285714286,
+        "eval_type": "rule",
+        "num_demo": 1,
+        "num_query": 14,
+        "skills": [
+            "Text Recognition (OCR)",
+            "Spatial and Temporal Reasoning",
+            "Mathematical and Logical Reasoning"
+        ],
+        "input_format": "Diagrams and Data Visualizations",
+        "app": "Mathematics",
+        "output_format": "structured_output",
+        "num_input": "1-image"
+    },
+    {
+        "name": "webpage_code_understanding",
+        "score": 0.8888888888888888,
+        "eval_type": "rule",
+        "num_demo": 1,
+        "num_query": 9,
+        "skills": [
+            "Object Recognition and Classification",
+            "Domain-Specific Knowledge and Skills",
+            "Language Understanding and Generation"
+        ],
+        "input_format": "User Interface Screenshots",
+        "app": "Coding",
+        "output_format": "multiple_choice",
+        "num_input": "1-image"
+    },
+    {
+        "name": "planning_visual_barman",
+        "score": 0.0,
+        "eval_type": "rule",
+        "num_demo": 1,
+        "num_query": 15,
+        "skills": [
+            "Planning and Decision Making",
+            "Spatial and Temporal Reasoning",
+            "Object Recognition and Classification"
+        ],
+        "input_format": "Diagrams and Data Visualizations",
+        "app": "Planning",
+        "output_format": "structured_output",
+        "num_input": "2-3 images"
+    },
+    {
+        "name": "transit_map_intersection_points",
+        "score": 0.4068877551020408,
+        "eval_type": "rule",
+        "num_demo": 1,
+        "num_query": 14,
+        "skills": [
+            "Object Recognition and Classification",
+            "Spatial and Temporal Reasoning",
+            "Text Recognition (OCR)"
+        ],
+        "input_format": "Diagrams and Data Visualizations",
+        "app": "Perception",
+        "output_format": "structured_output",
+        "num_input": "1-image"
+    },
+    {
+        "name": "geometry_reasoning_nested_squares",
+        "score": 0.4642857142857143,
+        "eval_type": "rule",
+        "num_demo": 1,
+        "num_query": 14,
+        "skills": [
+            "Object Recognition and Classification",
+            "Spatial and Temporal Reasoning",
+            "Mathematical and Logical Reasoning"
+        ],
+        "input_format": "Diagrams and Data Visualizations",
+        "app": "Mathematics",
+        "output_format": "structured_output",
+        "num_input": "1-image"
+    },
+    {
+        "name": "scibench_fundamental_wo_solution",
+        "score": 0.3469387755102041,
+        "eval_type": "rule",
+        "num_demo": 1,
+        "num_query": 49,
+        "skills": [
+            "Mathematical and Logical Reasoning",
+            "Domain-Specific Knowledge and Skills"
+        ],
+        "input_format": "Diagrams and Data Visualizations",
+        "app": "Science",
+        "output_format": "numerical_data",
+        "num_input": "1-image"
+    },
+    {
+        "name": "GUI_Act_Web_Single",
+        "score": 0.03886509470801488,
+        "eval_type": "rule",
+        "num_demo": 1,
+        "num_query": 14,
+        "skills": [
+            "Object Recognition and Classification",
+            "Planning and Decision Making"
+        ],
+        "input_format": "User Interface Screenshots",
+        "app": "Planning",
+        "output_format": "numerical_data",
+        "num_input": "1-image"
+    },
+    {
+        "name": "remaining_playback_time_calculation",
+        "score": 0.07142857142857142,
+        "eval_type": "rule",
+        "num_demo": 1,
+        "num_query": 14,
+        "skills": [
+            "Object Recognition and Classification",
+            "Text Recognition (OCR)",
+            "Mathematical and Logical Reasoning"
+        ],
+        "input_format": "User Interface Screenshots",
+        "app": "Information_Extraction",
+        "output_format": "exact_text",
+        "num_input": "1-image"
+    },
+    {
+        "name": "planning_screenshot_blocksworld",
+        "score": 0.06666666666666667,
+        "eval_type": "rule",
+        "num_demo": 1,
+        "num_query": 15,
+        "skills": [
+            "Planning and Decision Making",
+            "Spatial and Temporal Reasoning"
+        ],
+        "input_format": "Text-Based Images and Documents",
+        "app": "Planning",
+        "output_format": "structured_output",
+        "num_input": "1-image"
+    },
+    {
+        "name": "extract_webpage_headline",
+        "score": 0.7142857142857143,
+        "eval_type": "rule",
+        "num_demo": 1,
+        "num_query": 14,
+        "skills": [
+            "Text Recognition (OCR)"
+        ],
+        "input_format": "User Interface Screenshots",
+        "app": "Perception",
+        "output_format": "contextual_formatted_text",
+        "num_input": "1-image"
+    },
+    {
+        "name": "entertainment_web_game_style",
+        "score": 0.7857142857142857,
+        "eval_type": "rule",
+        "num_demo": 1,
+        "num_query": 14,
+        "skills": [
+            "Object Recognition and Classification",
+            "Text Recognition (OCR)",
+            "Scene and Event Understanding"
+        ],
+        "input_format": "User Interface Screenshots",
+        "app": "Information_Extraction",
+        "output_format": "structured_output",
+        "num_input": "1-image"
+    },
+    {
+        "name": "logical_reasoning_find_odd_one_out",
+        "score": 0.07142857142857142,
+        "eval_type": "rule",
+        "num_demo": 1,
+        "num_query": 14,
+        "skills": [
+            "Object Recognition and Classification",
+            "Mathematical and Logical Reasoning",
+            "Spatial and Temporal Reasoning"
+        ],
+        "input_format": "Artistic and Creative Content",
+        "app": "Planning",
+        "output_format": "structured_output",
+        "num_input": "1-image"
+    },
+    {
+        "name": "relative_reflectance_of_different_regions",
+        "score": 0.21428571428571427,
+        "eval_type": "rule",
+        "num_demo": 1,
+        "num_query": 14,
+        "skills": [
+            "Object Recognition and Classification",
+            "Scene and Event Understanding"
+        ],
+        "input_format": "Photographs",
+        "app": "Perception",
+        "output_format": "structured_output",
+        "num_input": "1-image"
+    },
+    {
+        "name": "ascii_art_understanding",
+        "score": 0.6428571428571429,
+        "eval_type": "rule",
+        "num_demo": 1,
+        "num_query": 14,
+        "skills": [
+            "Object Recognition and Classification",
+            "Language Understanding and Generation"
+        ],
+        "input_format": "Text-Based Images and Documents",
+        "app": "Perception",
+        "output_format": "multiple_choice",
+        "num_input": "1-image"
+    },
+    {
+        "name": "TRANCE_physics_reasoning_event",
+        "score": 0.14285714285714285,
+        "eval_type": "rule",
+        "num_demo": 1,
+        "num_query": 14,
+        "skills": [
+            "Object Recognition and Classification",
+            "Spatial and Temporal Reasoning",
+            "Mathematical and Logical Reasoning"
+        ],
+        "input_format": "3D Models and Aerial Imagery",
+        "app": "Perception",
+        "output_format": "multiple_choice",
+        "num_input": "2-3 images"
+    },
+    {
+        "name": "music_sheet_format_QA",
+        "score": 0.5714285714285714,
+        "eval_type": "rule",
+        "num_demo": 1,
+        "num_query": 14,
+        "skills": [
+            "Object Recognition and Classification",
+            "Text Recognition (OCR)"
+        ],
+        "input_format": "Text-Based Images and Documents",
+        "app": "Knowledge",
+        "output_format": "numerical_data",
+        "num_input": "1-image"
+    },
+    {
+        "name": "recover_masked_word_in_figure",
+        "score": 0.21428571428571427,
+        "eval_type": "rule",
+        "num_demo": 1,
+        "num_query": 14,
+        "skills": [
+            "Text Recognition (OCR)",
+            "Language Understanding and Generation",
+            "Domain-Specific Knowledge and Skills"
+        ],
+        "input_format": "Diagrams and Data Visualizations",
+        "app": "Perception",
+        "output_format": "contextual_formatted_text",
+        "num_input": "1-image"
+    },
+    {
+        "name": "medical_content_based_retrieval_radiology",
+        "score": 0.7857142857142857,
+        "eval_type": "rule",
+        "num_demo": 1,
+        "num_query": 14,
+        "skills": [
+            "Object Recognition and Classification",
+            "Domain-Specific Knowledge and Skills"
+        ],
+        "input_format": "Text-Based Images and Documents",
+        "app": "Science",
+        "output_format": "multiple_choice",
+        "num_input": "4-5 images"
+    },
+    {
+        "name": "GUI_Act_Mobile_tap",
+        "score": 0.07142857142857142,
+        "eval_type": "rule",
+        "num_demo": 1,
+        "num_query": 14,
+        "skills": [
+            "Object Recognition and Classification",
+            "Spatial and Temporal Reasoning",
+            "Language Understanding and Generation"
+        ],
+        "input_format": "User Interface Screenshots",
+        "app": "Planning",
+        "output_format": "numerical_data",
+        "num_input": "1-image"
+    },
+    {
+        "name": "dish_ingredient_match",
+        "score": 0.5,
+        "eval_type": "rule",
+        "num_demo": 1,
+        "num_query": 14,
+        "skills": [
+            "Object Recognition and Classification",
+            "Scene and Event Understanding"
+        ],
+        "input_format": "Photographs",
+        "app": "Knowledge",
+        "output_format": "multiple_choice",
+        "num_input": "9-image or more"
+    },
+    {
+        "name": "code_retrieval",
+        "score": 0.21428571428571427,
+        "eval_type": "rule",
+        "num_demo": 1,
+        "num_query": 14,
+        "skills": [
+            "Text Recognition (OCR)",
+            "Domain-Specific Knowledge and Skills"
+        ],
+        "input_format": "Text-Based Images and Documents",
+        "app": "Coding",
+        "output_format": "exact_text",
+        "num_input": "2-3 images"
+    },
+    {
+        "name": "mahjong",
+        "score": 0.0,
+        "eval_type": "rule",
+        "num_demo": 1,
+        "num_query": 15,
+        "skills": [
+            "Object Recognition and Classification",
+            "Mathematical and Logical Reasoning"
+        ],
+        "input_format": "Photographs",
+        "app": "Planning",
+        "output_format": "exact_text",
+        "num_input": "1-image"
+    },
+    {
+        "name": "web_action_grounding",
+        "score": 0.5,
+        "eval_type": "rule",
+        "num_demo": 1,
+        "num_query": 14,
+        "skills": [
+            "Object Recognition and Classification",
+            "Language Understanding and Generation"
+        ],
+        "input_format": "User Interface Screenshots",
+        "app": "Planning",
+        "output_format": "multiple_choice",
+        "num_input": "1-image"
+    },
+    {
+        "name": "TRANCE_physics_reasoning_view",
+        "score": 0.21428571428571427,
+        "eval_type": "rule",
+        "num_demo": 1,
+        "num_query": 14,
+        "skills": [
+            "Object Recognition and Classification",
+            "Spatial and Temporal Reasoning",
+            "Mathematical and Logical Reasoning"
+        ],
+        "input_format": "3D Models and Aerial Imagery",
+        "app": "Perception",
+        "output_format": "multiple_choice",
+        "num_input": "2-3 images"
+    },
+    {
+        "name": "planning_screenshot_tyreworld",
+        "score": 0.06666666666666667,
+        "eval_type": "rule",
+        "num_demo": 1,
+        "num_query": 15,
+        "skills": [
+            "Planning and Decision Making",
+            "Spatial and Temporal Reasoning"
+        ],
+        "input_format": "User Interface Screenshots",
+        "app": "Planning",
+        "output_format": "structured_output",
+        "num_input": "1-image"
+    },
+    {
+        "name": "rocks_samples_identify",
+        "score": 0.2,
+        "eval_type": "rule",
+        "num_demo": 1,
+        "num_query": 15,
+        "skills": [
+            "Object Recognition and Classification",
+            "Domain-Specific Knowledge and Skills"
+        ],
+        "input_format": "Photographs",
+        "app": "Perception",
+        "output_format": "contextual_formatted_text",
+        "num_input": "4-5 images"
+    },
+    {
+        "name": "counting",
+        "score": 0.42857142857142855,
+        "eval_type": "rule",
+        "num_demo": 1,
+        "num_query": 14,
+        "skills": [
+            "Object Recognition and Classification",
+            "Spatial and Temporal Reasoning"
+        ],
+        "input_format": "Diagrams and Data Visualizations",
+        "app": "Perception",
+        "output_format": "numerical_data",
+        "num_input": "1-image"
+    },
+    {
+        "name": "medical_parasite_detection",
+        "score": 0.6428571428571429,
+        "eval_type": "rule",
+        "num_demo": 1,
+        "num_query": 14,
+        "skills": [
+            "Object Recognition and Classification",
+            "Domain-Specific Knowledge and Skills"
+        ],
+        "input_format": "Photographs",
+        "app": "Science",
+        "output_format": "structured_output",
+        "num_input": "6-8 images"
+    },
+    {
+        "name": "interpret_force_perspective_illusion",
+        "score": 0.7333333333333333,
+        "eval_type": "rule",
+        "num_demo": 1,
+        "num_query": 15,
+        "skills": [
+            "Object Recognition and Classification",
+            "Spatial and Temporal Reasoning",
+            "Scene and Event Understanding"
+        ],
+        "input_format": "Photographs",
+        "app": "Knowledge",
+        "output_format": "exact_text",
+        "num_input": "1-image"
+    },
+    {
+        "name": "chinese_idiom_recognition",
+        "score": 0.2857142857142857,
+        "eval_type": "rule",
+        "num_demo": 1,
+        "num_query": 14,
+        "skills": [
+            "Object Recognition and Classification",
+            "Scene and Event Understanding",
+            "Language Understanding and Generation",
+            "Commonsense and Social Reasoning"
+        ],
+        "input_format": "Artistic and Creative Content",
+        "app": "Knowledge",
+        "output_format": "exact_text",
+        "num_input": "1-image"
+    },
+    {
+        "name": "2d_image_jigsaw_puzzle_easy",
+        "score": 0.14999999999999997,
+        "eval_type": "rule",
+        "num_demo": 1,
+        "num_query": 14,
+        "skills": [
+            "Object Recognition and Classification",
+            "Spatial and Temporal Reasoning"
+        ],
+        "input_format": "Photographs",
+        "app": "Perception",
+        "output_format": "structured_output",
+        "num_input": "2-3 images"
+    },
+    {
+        "name": "product_ocr_qa",
+        "score": 0.2857142857142857,
+        "eval_type": "rule",
+        "num_demo": 1,
+        "num_query": 14,
+        "skills": [
+            "Text Recognition (OCR)",
+            "Language Understanding and Generation"
+        ],
+        "input_format": "Photographs",
+        "app": "Information_Extraction",
+        "output_format": "exact_text",
+        "num_input": "1-image"
+    },
+    {
+        "name": "knowledge_sign_recognition",
+        "score": 0.3333333333333333,
+        "eval_type": "rule",
+        "num_demo": 1,
+        "num_query": 9,
+        "skills": [
+            "Object Recognition and Classification",
+            "Text Recognition (OCR)"
+        ],
+        "input_format": "Text-Based Images and Documents",
+        "app": "Knowledge",
+        "output_format": "multiple_choice",
+        "num_input": "6-8 images"
+    },
+    {
+        "name": "medical_multi_organ_segmentation_rater",
+        "score": 0.35714285714285715,
+        "eval_type": "rule",
+        "num_demo": 1,
+        "num_query": 14,
+        "skills": [
+            "Object Recognition and Classification",
+            "Domain-Specific Knowledge and Skills"
+        ],
+        "input_format": "Text-Based Images and Documents",
+        "app": "Science",
+        "output_format": "multiple_choice",
+        "num_input": "1-image"
+    },
+    {
+        "name": "song_title_identification_from_lyrics",
+        "score": 0.35714285714285715,
+        "eval_type": "rule",
+        "num_demo": 1,
+        "num_query": 14,
+        "skills": [
+            "Text Recognition (OCR)",
+            "Language Understanding and Generation"
+        ],
+        "input_format": "User Interface Screenshots",
+        "app": "Knowledge",
+        "output_format": "structured_output",
+        "num_input": "1-image"
+    },
+    {
+        "name": "signage_navigation",
+        "score": 0.6666666666666666,
+        "eval_type": "rule",
+        "num_demo": 1,
+        "num_query": 15,
+        "skills": [
+            "Object Recognition and Classification",
+            "Text Recognition (OCR)",
+            "Spatial and Temporal Reasoning"
+        ],
+        "input_format": "Photographs",
+        "app": "Knowledge",
+        "output_format": "exact_text",
+        "num_input": "1-image"
+    },
+    {
+        "name": "ishihara_test",
+        "score": 0.2857142857142857,
+        "eval_type": "rule",
+        "num_demo": 1,
+        "num_query": 14,
+        "skills": [
+            "Object Recognition and Classification",
+            "Domain-Specific Knowledge and Skills"
+        ],
+        "input_format": "Diagrams and Data Visualizations",
+        "app": "Knowledge",
+        "output_format": "structured_output",
+        "num_input": "1-image"
+    },
+    {
+        "name": "booking_web_recommendation",
+        "score": 0.6203514739229025,
+        "eval_type": "rule",
+        "num_demo": 1,
+        "num_query": 14,
+        "skills": [
+            "Object Recognition and Classification",
+            "Scene and Event Understanding",
+            "Text Recognition (OCR)"
+        ],
+        "input_format": "User Interface Screenshots",
+        "app": "Information_Extraction",
+        "output_format": "contextual_formatted_text",
+        "num_input": "9-image or more"
+    },
+    {
+        "name": "visual_correspondance_in_two_images",
+        "score": 0.35714285714285715,
+        "eval_type": "rule",
+        "num_demo": 1,
+        "num_query": 14,
+        "skills": [
+            "Object Recognition and Classification",
+            "Scene and Event Understanding"
+        ],
+        "input_format": "Photographs",
+        "app": "Perception",
+        "output_format": "structured_output",
+        "num_input": "2-3 images"
+    },
+    {
+        "name": "rocks_samples_compare",
+        "score": 0.5714285714285714,
+        "eval_type": "rule",
+        "num_demo": 1,
+        "num_query": 14,
+        "skills": [
+            "Object Recognition and Classification",
+            "Domain-Specific Knowledge and Skills"
+        ],
+        "input_format": "Photographs",
+        "app": "Knowledge",
+        "output_format": "contextual_formatted_text",
+        "num_input": "2-3 images"
+    },
+    {
+        "name": "pokemon_3D_recognition",
+        "score": 0.8333333333333334,
+        "eval_type": "rule",
+        "num_demo": 1,
+        "num_query": 15,
+        "skills": [
+            "Object Recognition and Classification",
+            "Text Recognition (OCR)"
+        ],
+        "input_format": "3D Models and Aerial Imagery",
+        "app": "Perception",
+        "output_format": "structured_output",
+        "num_input": "6-8 images"
+    },
+    {
+        "name": "worldle",
+        "score": 0.35558727927939476,
+        "eval_type": "rule",
+        "num_demo": 1,
+        "num_query": 14,
+        "skills": [
+            "Object Recognition and Classification",
+            "Scene and Event Understanding",
+            "Commonsense and Social Reasoning"
+        ],
+        "input_format": "Photographs",
+        "app": "Knowledge",
+        "output_format": "structured_output",
+        "num_input": "2-3 images"
+    },
+    {
+        "name": "orchestra_score_recognition",
+        "score": 0.03571428571428571,
+        "eval_type": "rule",
+        "num_demo": 1,
+        "num_query": 14,
+        "skills": [
+            "Text Recognition (OCR)",
+            "Domain-Specific Knowledge and Skills"
+        ],
+        "input_format": "Text-Based Images and Documents",
+        "app": "Knowledge",
+        "output_format": "structured_output",
+        "num_input": "1-image"
+    },
+    {
+        "name": "logical_reasoning_fit_pattern",
+        "score": 0.21428571428571427,
+        "eval_type": "rule",
+        "num_demo": 1,
+        "num_query": 14,
+        "skills": [
+            "Mathematical and Logical Reasoning",
+            "Spatial and Temporal Reasoning"
+        ],
+        "input_format": "Diagrams and Data Visualizations",
+        "app": "Planning",
+        "output_format": "multiple_choice",
+        "num_input": "1-image"
+    },
+    {
+        "name": "distinguish_ai_generated_image",
+        "score": 0.8421052631578947,
+        "eval_type": "rule",
+        "num_demo": 1,
+        "num_query": 19,
+        "skills": [
+            "Object Recognition and Classification",
+            "Scene and Event Understanding"
+        ],
+        "input_format": "Photographs",
+        "app": "Knowledge",
+        "output_format": "exact_text",
+        "num_input": "1-image"
+    },
+    {
+        "name": "ancient_map_understanding",
+        "score": 0.35714285714285715,
+        "eval_type": "rule",
+        "num_demo": 1,
+        "num_query": 14,
+        "skills": [
+            "Object Recognition and Classification",
+            "Text Recognition (OCR)",
+            "Domain-Specific Knowledge and Skills"
+        ],
+        "input_format": "Diagrams and Data Visualizations",
+        "app": "Knowledge",
+        "output_format": "exact_text",
+        "num_input": "2-3 images"
+    },
+    {
+        "name": "weather_map_climate_type_temperature_parsing",
+        "score": 0.6785714285714286,
+        "eval_type": "rule",
+        "num_demo": 1,
+        "num_query": 14,
+        "skills": [
+            "Object Recognition and Classification",
+            "Scene and Event Understanding",
+            "Text Recognition (OCR)"
+        ],
+        "input_format": "Diagrams and Data Visualizations",
+        "app": "Information_Extraction",
+        "output_format": "structured_output",
+        "num_input": "1-image"
+    },
+    {
+        "name": "soccer_offside",
+        "score": 0.1111111111111111,
+        "eval_type": "rule",
+        "num_demo": 1,
+        "num_query": 9,
+        "skills": [
+            "Object Recognition and Classification",
+            "Spatial and Temporal Reasoning",
+            "Scene and Event Understanding"
+        ],
+        "input_format": "Photographs",
+        "app": "Knowledge",
+        "output_format": "multiple_choice",
+        "num_input": "1-image"
+    },
+    {
+        "name": "icon_arithmetic_puzzle",
+        "score": 0.7142857142857143,
+        "eval_type": "rule",
+        "num_demo": 1,
+        "num_query": 14,
+        "skills": [
+            "Object Recognition and Classification",
+            "Mathematical and Logical Reasoning"
+        ],
+        "input_format": "Diagrams and Data Visualizations",
+        "app": "Planning",
+        "output_format": "structured_output",
+        "num_input": "1-image"
+    },
+    {
+        "name": "code_translation_Python",
+        "score": 0.41666666666666663,
+        "eval_type": "rule",
+        "num_demo": 1,
+        "num_query": 16,
+        "skills": [
+            "Text Recognition (OCR)",
+            "Domain-Specific Knowledge and Skills",
+            "Language Understanding and Generation"
+        ],
+        "input_format": "Text-Based Images and Documents",
+        "app": "Coding",
+        "output_format": "structured_output",
+        "num_input": "2-3 images"
+    },
+    {
+        "name": "road_map_find_highway_between_two_place",
+        "score": 0.6470588235294118,
+        "eval_type": "rule",
+        "num_demo": 1,
+        "num_query": 17,
+        "skills": [
+            "Object Recognition and Classification",
+            "Spatial and Temporal Reasoning"
+        ],
+        "input_format": "Diagrams and Data Visualizations",
+        "app": "Perception",
+        "output_format": "exact_text",
+        "num_input": "1-image"
+    },
+    {
+        "name": "GUI_Act_Web_Multi",
+        "score": 0.22271751659129607,
+        "eval_type": "rule",
+        "num_demo": 1,
+        "num_query": 14,
+        "skills": [
+            "Object Recognition and Classification",
+            "Planning and Decision Making"
+        ],
+        "input_format": "User Interface Screenshots",
+        "app": "Planning",
+        "output_format": "structured_output",
+        "num_input": "1-image"
+    },
+    {
+        "name": "code_translation_hard",
+        "score": 0.0,
+        "eval_type": "rule",
+        "num_demo": 1,
+        "num_query": 14,
+        "skills": [
+            "Text Recognition (OCR)",
+            "Domain-Specific Knowledge and Skills",
+            "Language Understanding and Generation"
+        ],
+        "input_format": "Text-Based Images and Documents",
+        "app": "Coding",
+        "output_format": "structured_output",
+        "num_input": "6-8 images"
+    },
+    {
+        "name": "media_QA_web_stackoverflow",
+        "score": 0.5476190476190476,
+        "eval_type": "rule",
+        "num_demo": 1,
+        "num_query": 14,
+        "skills": [
+            "Text Recognition (OCR)",
+            "Domain-Specific Knowledge and Skills",
+            "Mathematical and Logical Reasoning"
+        ],
+        "input_format": "User Interface Screenshots",
+        "app": "Coding",
+        "output_format": "structured_output",
+        "num_input": "1-image"
+    },
+    {
+        "name": "relative_depth_of_different_points",
+        "score": 0.5,
+        "eval_type": "rule",
+        "num_demo": 1,
+        "num_query": 14,
+        "skills": [
+            "Spatial and Temporal Reasoning",
+            "Scene and Event Understanding"
+        ],
+        "input_format": "Photographs",
+        "app": "Perception",
+        "output_format": "multiple_choice",
+        "num_input": "1-image"
+    },
+    {
+        "name": "comic_page_ordering",
+        "score": 0.14285714285714285,
+        "eval_type": "rule",
+        "num_demo": 1,
+        "num_query": 14,
+        "skills": [
+            "Scene and Event Understanding",
+            "Spatial and Temporal Reasoning",
+            "Language Understanding and Generation"
+        ],
+        "input_format": "Artistic and Creative Content",
+        "app": "Planning",
+        "output_format": "contextual_formatted_text",
+        "num_input": "6-8 images"
+    },
+    {
+        "name": "code_execution",
+        "score": 0.4375,
+        "eval_type": "rule",
+        "num_demo": 1,
+        "num_query": 16,
+        "skills": [
+            "Mathematical and Logical Reasoning",
+            "Domain-Specific Knowledge and Skills"
+        ],
+        "input_format": "Text-Based Images and Documents",
+        "app": "Coding",
+        "output_format": "contextual_formatted_text",
+        "num_input": "1-image"
+    },
+    {
+        "name": "paper_review_rating",
+        "score": 0.7558635964363686,
+        "eval_type": "rule",
+        "num_demo": 1,
+        "num_query": 15,
+        "skills": [
+            "Object Recognition and Classification",
+            "Text Recognition (OCR)",
+            "Language Understanding and Generation",
+            "Domain-Specific Knowledge and Skills"
+        ],
+        "input_format": "Text-Based Images and Documents",
+        "app": "Metrics",
+        "output_format": "numerical_data",
+        "num_input": "4-5 images"
+    },
+    {
+        "name": "location_vqa",
+        "score": 0.6428571428571429,
+        "eval_type": "rule",
+        "num_demo": 1,
+        "num_query": 14,
+        "skills": [
+            "Scene and Event Understanding",
+            "Domain-Specific Knowledge and Skills"
+        ],
+        "input_format": "Photographs",
+        "app": "Knowledge",
+        "output_format": "exact_text",
+        "num_input": "1-image"
+    },
+    {
+        "name": "mensa_iq_test",
+        "score": 0.47990196078431374,
+        "eval_type": "rule",
+        "num_demo": 1,
+        "num_query": 17,
+        "skills": [
+            "Object Recognition and Classification",
+            "Mathematical and Logical Reasoning"
+        ],
+        "input_format": "Diagrams and Data Visualizations",
+        "app": "Planning",
+        "output_format": "structured_output",
+        "num_input": "1-image"
+    },
+    {
+        "name": "MMMU_pro_exam_screenshot",
+        "score": 0.2727272727272727,
+        "eval_type": "rule",
+        "num_demo": 1,
+        "num_query": 99,
+        "skills": [
+            "Text Recognition (OCR)",
+            "Mathematical and Logical Reasoning",
+            "Domain-Specific Knowledge and Skills"
+        ],
+        "input_format": "User Interface Screenshots",
+        "app": "Science",
+        "output_format": "multiple_choice",
+        "num_input": "1-image"
+    },
+    {
+        "name": "llavaguard",
+        "score": 0.7857142857142857,
+        "eval_type": "rule",
+        "num_demo": 1,
+        "num_query": 14,
+        "skills": [
+            "Ethical and Safety Reasoning",
+            "Scene and Event Understanding",
+            "Commonsense and Social Reasoning"
+        ],
+        "input_format": "Photographs",
+        "app": "Knowledge",
+        "output_format": "structured_output",
+        "num_input": "1-image"
+    },
+    {
+        "name": "geometry_reasoning_count_line_intersections",
+        "score": 0.32142857142857145,
+        "eval_type": "rule",
+        "num_demo": 1,
+        "num_query": 14,
+        "skills": [
+            "Object Recognition and Classification",
+            "Mathematical and Logical Reasoning",
+            "Spatial and Temporal Reasoning"
+        ],
+        "input_format": "Diagrams and Data Visualizations",
+        "app": "Mathematics",
+        "output_format": "structured_output",
+        "num_input": "1-image"
+    },
+    {
+        "name": "geometry_reasoning_circled_letter",
+        "score": 0.7142857142857143,
+        "eval_type": "rule",
+        "num_demo": 1,
+        "num_query": 14,
+        "skills": [
+            "Object Recognition and Classification",
+            "Text Recognition (OCR)",
+            "Spatial and Temporal Reasoning"
+        ],
+        "input_format": "Text-Based Images and Documents",
+        "app": "Perception",
+        "output_format": "structured_output",
+        "num_input": "1-image"
+    },
+    {
+        "name": "video_eval_dynamic_pref",
+        "score": 0.75,
+        "eval_type": "rule",
+        "num_demo": 1,
+        "num_query": 16,
+        "skills": [
+            "Object Recognition and Classification",
+            "Scene and Event Understanding",
+            "Spatial and Temporal Reasoning"
+        ],
+        "input_format": "Videos",
+        "app": "Metrics",
+        "output_format": "multiple_choice",
+        "num_input": "video"
+    },
+    {
+        "name": "video_eval_factual_pref",
+        "score": 0.6428571428571429,
+        "eval_type": "rule",
+        "num_demo": 1,
+        "num_query": 14,
+        "skills": [
+            "Scene and Event Understanding",
+            "Commonsense and Social Reasoning",
+            "Ethical and Safety Reasoning"
+        ],
+        "input_format": "Videos",
+        "app": "Metrics",
+        "output_format": "multiple_choice",
+        "num_input": "video"
+    },
+    {
+        "name": "vln_identify_location",
+        "score": 0.2303030303030303,
+        "eval_type": "rule",
+        "num_demo": 1,
+        "num_query": 15,
+        "skills": [
+            "Scene and Event Understanding",
+            "Spatial and Temporal Reasoning",
+            "Language Understanding and Generation"
+        ],
+        "input_format": "Photographs",
+        "app": "Planning",
+        "output_format": "structured_output",
+        "num_input": "4-5 images"
+    },
+    {
+        "name": "video_segments_reordering",
+        "score": 0.0,
+        "eval_type": "rule",
+        "num_demo": 1,
+        "num_query": 14,
+        "skills": [
+            "Spatial and Temporal Reasoning",
+            "Scene and Event Understanding"
+        ],
+        "input_format": "Videos",
+        "app": "Perception",
+        "output_format": "structured_output",
+        "num_input": "video"
+    },
+    {
+        "name": "planning_screenshot_termes",
+        "score": 0.0,
+        "eval_type": "rule",
+        "num_demo": 1,
+        "num_query": 15,
+        "skills": [
+            "Planning and Decision Making",
+            "Spatial and Temporal Reasoning"
+        ],
+        "input_format": "User Interface Screenshots",
+        "app": "Planning",
+        "output_format": "structured_output",
+        "num_input": "1-image"
+    },
+    {
+        "name": "image_translation_en2cn",
+        "score": 0.3100359127375053,
+        "eval_type": "rule",
+        "num_demo": 1,
+        "num_query": 9,
+        "skills": [
+            "Text Recognition (OCR)",
+            "Language Understanding and Generation"
+        ],
+        "input_format": "Text-Based Images and Documents",
+        "app": "Information_Extraction",
+        "output_format": "contextual_formatted_text",
+        "num_input": "1-image"
+    },
+    {
+        "name": "circuit_diagram_understanding",
+        "score": 0.13333333333333333,
+        "eval_type": "rule",
+        "num_demo": 1,
+        "num_query": 15,
+        "skills": [
+            "Object Recognition and Classification",
+            "Mathematical and Logical Reasoning",
+            "Domain-Specific Knowledge and Skills"
+        ],
+        "input_format": "Diagrams and Data Visualizations",
+        "app": "Science",
+        "output_format": "numerical_data",
+        "num_input": "1-image"
+    },
+    {
+        "name": "polygon_interior_angles",
+        "score": 0.0,
+        "eval_type": "rule",
+        "num_demo": 1,
+        "num_query": 14,
+        "skills": [
+            "Object Recognition and Classification",
+            "Mathematical and Logical Reasoning",
+            "Spatial and Temporal Reasoning"
+        ],
+        "input_format": "Diagrams and Data Visualizations",
+        "app": "Mathematics",
+        "output_format": "numerical_data",
+        "num_input": "1-image"
+    },
+    {
+        "name": "planning_screenshot_storage",
+        "score": 0.06666666666666667,
+        "eval_type": "rule",
+        "num_demo": 1,
+        "num_query": 15,
+        "skills": [
+            "Planning and Decision Making",
+            "Spatial and Temporal Reasoning"
+        ],
+        "input_format": "Text-Based Images and Documents",
+        "app": "Planning",
+        "output_format": "structured_output",
+        "num_input": "1-image"
+    },
+    {
+        "name": "music_sheet_name",
+        "score": 0.13333333333333333,
+        "eval_type": "rule",
+        "num_demo": 1,
+        "num_query": 15,
+        "skills": [
+            "Object Recognition and Classification",
+            "Domain-Specific Knowledge and Skills"
+        ],
+        "input_format": "Text-Based Images and Documents",
+        "app": "Knowledge",
+        "output_format": "exact_text",
+        "num_input": "4-5 images"
+    },
+    {
+        "name": "code_solution_compare",
+        "score": 0.21428571428571427,
+        "eval_type": "rule",
+        "num_demo": 1,
+        "num_query": 14,
+        "skills": [
+            "Mathematical and Logical Reasoning",
+            "Language Understanding and Generation",
+            "Text Recognition (OCR)"
+        ],
+        "input_format": "User Interface Screenshots",
+        "app": "Coding",
+        "output_format": "exact_text",
+        "num_input": "2-3 images"
+    },
+    {
+        "name": "logical_reasoning_2d_folding",
+        "score": 0.14285714285714285,
+        "eval_type": "rule",
+        "num_demo": 1,
+        "num_query": 14,
+        "skills": [
+            "Spatial and Temporal Reasoning",
+            "Mathematical and Logical Reasoning"
+        ],
+        "input_format": "Diagrams and Data Visualizations",
+        "app": "Planning",
+        "output_format": "multiple_choice",
+        "num_input": "4-5 images"
+    },
+    {
+        "name": "font_recognition",
+        "score": 0.0,
+        "eval_type": "rule",
+        "num_demo": 1,
+        "num_query": 14,
+        "skills": [
+            "Object Recognition and Classification",
+            "Text Recognition (OCR)"
+        ],
+        "input_format": "Text-Based Images and Documents",
+        "app": "Knowledge",
+        "output_format": "exact_text",
+        "num_input": "1-image"
+    },
+    {
+        "name": "planning_visual_blocksworld",
+        "score": 0.13333333333333333,
+        "eval_type": "rule",
+        "num_demo": 1,
+        "num_query": 15,
+        "skills": [
+            "Planning and Decision Making",
+            "Spatial and Temporal Reasoning",
+            "Object Recognition and Classification"
+        ],
+        "input_format": "Diagrams and Data Visualizations",
+        "app": "Planning",
+        "output_format": "structured_output",
+        "num_input": "2-3 images"
+    },
+    {
+        "name": "video_action_recognition",
+        "score": 0.8214285714285714,
+        "eval_type": "rule",
+        "num_demo": 1,
+        "num_query": 14,
+        "skills": [
+            "Scene and Event Understanding",
+            "Spatial and Temporal Reasoning"
+        ],
+        "input_format": "Videos",
+        "app": "Perception",
+        "output_format": "structured_output",
+        "num_input": "video"
+    },
+    {
+        "name": "code_visualization_output_understanding",
+        "score": 0.2,
+        "eval_type": "rule",
+        "num_demo": 1,
+        "num_query": 10,
+        "skills": [
+            "Object Recognition and Classification",
+            "Mathematical and Logical Reasoning"
+        ],
+        "input_format": "Diagrams and Data Visualizations",
+        "app": "Coding",
+        "output_format": "multiple_choice",
+        "num_input": "4-5 images"
+    },
+    {
+        "name": "autorater_3d_model_texturing",
+        "score": 0.7142857142857143,
+        "eval_type": "rule",
+        "num_demo": 1,
+        "num_query": 14,
+        "skills": [
+            "Object Recognition and Classification",
+            "Scene and Event Understanding"
+        ],
+        "input_format": "3D Models and Aerial Imagery",
+        "app": "Metrics",
+        "output_format": "contextual_formatted_text",
+        "num_input": "2-3 images"
+    },
+    {
+        "name": "google_streetview_line_reasoning",
+        "score": 0.2,
+        "eval_type": "rule",
+        "num_demo": 1,
+        "num_query": 15,
+        "skills": [
+            "Scene and Event Understanding",
+            "Spatial and Temporal Reasoning"
+        ],
+        "input_format": "Photographs",
+        "app": "Perception",
+        "output_format": "multiple_choice",
+        "num_input": "9-image or more"
+    },
+    {
+        "name": "medical_polyp_segmentation_single_object_rater",
+        "score": 0.2857142857142857,
+        "eval_type": "rule",
+        "num_demo": 1,
+        "num_query": 14,
+        "skills": [
+            "Object Recognition and Classification",
+            "Domain-Specific Knowledge and Skills"
+        ],
+        "input_format": "Text-Based Images and Documents",
+        "app": "Science",
+        "output_format": "structured_output",
+        "num_input": "4-5 images"
+    },
+    {
+        "name": "media_recommend_solutions_stackoverflow",
+        "score": 0.5,
+        "eval_type": "rule",
+        "num_demo": 1,
+        "num_query": 14,
+        "skills": [
+            "Language Understanding and Generation",
+            "Domain-Specific Knowledge and Skills"
+        ],
+        "input_format": "User Interface Screenshots",
+        "app": "Coding",
+        "output_format": "multiple_choice",
+        "num_input": "4-5 images"
+    },
+    {
+        "name": "medical_keywords_based_retrieval_non_radiology",
+        "score": 0.8571428571428571,
+        "eval_type": "rule",
+        "num_demo": 1,
+        "num_query": 14,
+        "skills": [
+            "Object Recognition and Classification",
+            "Domain-Specific Knowledge and Skills"
+        ],
+        "input_format": "Photographs",
+        "app": "Science",
+        "output_format": "exact_text",
+        "num_input": "4-5 images"
+    },
+    {
+        "name": "autorater_artifact",
+        "score": 0.21428571428571427,
+        "eval_type": "rule",
+        "num_demo": 1,
+        "num_query": 14,
+        "skills": [
+            "Object Recognition and Classification",
+            "Scene and Event Understanding"
+        ],
+        "input_format": "Photographs",
+        "app": "Metrics",
+        "output_format": "multiple_choice",
+        "num_input": "1-image"
+    },
+    {
+        "name": "planning_screenshot_floortile",
+        "score": 0.0,
+        "eval_type": "rule",
+        "num_demo": 1,
+        "num_query": 15,
+        "skills": [
+            "Planning and Decision Making",
+            "Spatial and Temporal Reasoning"
+        ],
+        "input_format": "User Interface Screenshots",
+        "app": "Planning",
+        "output_format": "structured_output",
+        "num_input": "1-image"
+    },
+    {
+        "name": "knowledge_graph_understanding",
+        "score": 0.6,
+        "eval_type": "rule",
+        "num_demo": 1,
+        "num_query": 15,
+        "skills": [
+            "Object Recognition and Classification",
+            "Commonsense and Social Reasoning"
+        ],
+        "input_format": "Diagrams and Data Visualizations",
+        "app": "Perception",
+        "output_format": "contextual_formatted_text",
+        "num_input": "1-image"
+    },
+    {
+        "name": "CLEVRER_physics",
+        "score": 0.4,
+        "eval_type": "rule",
+        "num_demo": 1,
+        "num_query": 20,
+        "skills": [
+            "Object Recognition and Classification",
+            "Spatial and Temporal Reasoning"
+        ],
+        "input_format": "Photographs",
+        "app": "Perception",
+        "output_format": "numerical_data",
+        "num_input": "1-image"
+    },
+    {
+        "name": "multiview_reasoning_camera_moving",
+        "score": 0.42857142857142855,
+        "eval_type": "rule",
+        "num_demo": 1,
+        "num_query": 14,
+        "skills": [
+            "Spatial and Temporal Reasoning",
+            "Scene and Event Understanding"
+        ],
+        "input_format": "Photographs",
+        "app": "Perception",
+        "output_format": "exact_text",
+        "num_input": "2-3 images"
+    },
+    {
+        "name": "TRANCE_physics_reasoning_basic",
+        "score": 0.47058823529411764,
+        "eval_type": "rule",
+        "num_demo": 1,
+        "num_query": 17,
+        "skills": [
+            "Object Recognition and Classification",
+            "Spatial and Temporal Reasoning",
+            "Mathematical and Logical Reasoning"
+        ],
+        "input_format": "3D Models and Aerial Imagery",
+        "app": "Perception",
+        "output_format": "exact_text",
+        "num_input": "2-3 images"
+    },
+    {
+        "name": "highest_discount_game_price_identification",
+        "score": 0.6428571428571429,
+        "eval_type": "rule",
+        "num_demo": 1,
+        "num_query": 14,
+        "skills": [
+            "Object Recognition and Classification",
+            "Text Recognition (OCR)",
+            "Mathematical and Logical Reasoning"
+        ],
+        "input_format": "User Interface Screenshots",
+        "app": "Information_Extraction",
+        "output_format": "numerical_data",
+        "num_input": "1-image"
+    },
+    {
+        "name": "chess_find_legal_moves",
+        "score": 0.0319296239070534,
+        "eval_type": "rule",
+        "num_demo": 1,
+        "num_query": 14,
+        "skills": [
+            "Object Recognition and Classification",
+            "Spatial and Temporal Reasoning",
+            "Planning and Decision Making"
+        ],
+        "input_format": "User Interface Screenshots",
+        "app": "Planning",
+        "output_format": "exact_text",
+        "num_input": "1-image"
+    },
+    {
+        "name": "autonomous_driving_scene_analysis",
+        "score": 0.9285714285714286,
+        "eval_type": "rule",
+        "num_demo": 1,
+        "num_query": 14,
+        "skills": [
+            "Object Recognition and Classification",
+            "Text Recognition (OCR)",
+            "Scene and Event Understanding"
+        ],
+        "input_format": "Photographs",
+        "app": "Perception",
+        "output_format": "exact_text",
+        "num_input": "1-image"
+    },
+    {
+        "name": "autorater_artifact_reason",
+        "score": 0.6,
+        "eval_type": "rule",
+        "num_demo": 0,
+        "num_query": 15,
+        "skills": [
+            "Object Recognition and Classification",
+            "Scene and Event Understanding"
+        ],
+        "input_format": "Photographs",
+        "app": "Metrics",
+        "output_format": "open_ended_output",
+        "num_input": "1-image"
+    },
+    {
+        "name": "video_grounding_spatial",
+        "score": 0.42857142857142855,
+        "eval_type": "rule",
+        "num_demo": 1,
+        "num_query": 14,
+        "skills": [
+            "Object Recognition and Classification",
+            "Scene and Event Understanding",
+            "Spatial and Temporal Reasoning"
+        ],
+        "input_format": "Videos",
+        "app": "Perception",
+        "output_format": "contextual_formatted_text",
+        "num_input": "video"
+    },
+    {
+        "name": "Movie_retrieval_by_actor",
+        "score": 0.7142857142857143,
+        "eval_type": "rule",
+        "num_demo": 1,
+        "num_query": 14,
+        "skills": [
+            "Object Recognition and Classification",
+            "Language Understanding and Generation"
+        ],
+        "input_format": "User Interface Screenshots",
+        "app": "Information_Extraction",
+        "output_format": "contextual_formatted_text",
+        "num_input": "9-image or more"
+    },
+    {
+        "name": "autorater_unmask",
+        "score": 0.5714285714285714,
+        "eval_type": "rule",
+        "num_demo": 1,
+        "num_query": 14,
+        "skills": [
+            "Object Recognition and Classification",
+            "Scene and Event Understanding"
+        ],
+        "input_format": "Artistic and Creative Content",
+        "app": "Metrics",
+        "output_format": "exact_text",
+        "num_input": "2-3 images"
+    },
+    {
+        "name": "ocr_article_journal",
+        "score": 0.21428571428571427,
+        "eval_type": "rule",
+        "num_demo": 1,
+        "num_query": 14,
+        "skills": [
+            "Text Recognition (OCR)"
+        ],
+        "input_format": "Text-Based Images and Documents",
+        "app": "Perception",
+        "output_format": "contextual_formatted_text",
+        "num_input": "1-image"
+    },
+    {
+        "name": "recipe_image_ordering",
+        "score": 0.35714285714285715,
+        "eval_type": "rule",
+        "num_demo": 1,
+        "num_query": 14,
+        "skills": [
+            "Object Recognition and Classification",
+            "Scene and Event Understanding",
+            "Spatial and Temporal Reasoning",
+            "Language Understanding and Generation"
+        ],
+        "input_format": "Photographs",
+        "app": "Planning",
+        "output_format": "multiple_choice",
+        "num_input": "6-8 images"
+    },
+    {
+        "name": "ocr_math_equation",
+        "score": 0.42857142857142855,
+        "eval_type": "rule",
+        "num_demo": 1,
+        "num_query": 14,
+        "skills": [
+            "Text Recognition (OCR)",
+            "Mathematical and Logical Reasoning"
+        ],
+        "input_format": "Text-Based Images and Documents",
+        "app": "Coding",
+        "output_format": "contextual_formatted_text",
+        "num_input": "1-image"
+    },
+    {
+        "name": "autorater_motion_guided_editing",
+        "score": 0.0,
+        "eval_type": "rule",
+        "num_demo": 1,
+        "num_query": 14,
+        "skills": [
+            "Object Recognition and Classification",
+            "Scene and Event Understanding",
+            "Spatial and Temporal Reasoning"
+        ],
+        "input_format": "Photographs",
+        "app": "Metrics",
+        "output_format": "multiple_choice",
+        "num_input": "4-5 images"
+    },
+    {
+        "name": "google_streetview_circle_sorting",
+        "score": 0.0,
+        "eval_type": "rule",
+        "num_demo": 1,
+        "num_query": 15,
+        "skills": [
+            "Scene and Event Understanding",
+            "Spatial and Temporal Reasoning"
+        ],
+        "input_format": "Photographs",
+        "app": "Perception",
+        "output_format": "structured_output",
+        "num_input": "9-image or more"
+    },
+    {
+        "name": "ocr_resume_skill_plain",
+        "score": 0.2857142857142857,
+        "eval_type": "rule",
+        "num_demo": 1,
+        "num_query": 14,
+        "skills": [
+            "Text Recognition (OCR)",
+            "Language Understanding and Generation"
+        ],
+        "input_format": "Text-Based Images and Documents",
+        "app": "Perception",
+        "output_format": "contextual_formatted_text",
+        "num_input": "1-image"
+    },
+    {
+        "name": "ocr_table_to_markdown",
+        "score": 0.8571428571428571,
+        "eval_type": "rule",
+        "num_demo": 1,
+        "num_query": 14,
+        "skills": [
+            "Text Recognition (OCR)",
+            "Language Understanding and Generation"
+        ],
+        "input_format": "Diagrams and Data Visualizations",
+        "app": "Coding",
+        "output_format": "structured_output",
+        "num_input": "1-image"
+    },
+    {
+        "name": "photoshop_operation",
+        "score": 0.2857142857142857,
+        "eval_type": "rule",
+        "num_demo": 1,
+        "num_query": 14,
+        "skills": [
+            "Object Recognition and Classification",
+            "Scene and Event Understanding"
+        ],
+        "input_format": "Photographs",
+        "app": "Perception",
+        "output_format": "structured_output",
+        "num_input": "2-3 images"
+    },
+    {
+        "name": "autorater_semantics",
+        "score": 0.8571428571428571,
+        "eval_type": "rule",
+        "num_demo": 1,
+        "num_query": 14,
+        "skills": [
+            "Object Recognition and Classification",
+            "Language Understanding and Generation"
+        ],
+        "input_format": "Photographs",
+        "app": "Metrics",
+        "output_format": "exact_text",
+        "num_input": "2-3 images"
+    },
+    {
+        "name": "ocr_table_to_html",
+        "score": 0.6428571428571429,
+        "eval_type": "rule",
+        "num_demo": 1,
+        "num_query": 14,
+        "skills": [
+            "Text Recognition (OCR)",
+            "Language Understanding and Generation"
+        ],
+        "input_format": "Text-Based Images and Documents",
+        "app": "Coding",
+        "output_format": "structured_output",
+        "num_input": "1-image"
+    },
+    {
+        "name": "code_translation_advanced",
+        "score": 0.0,
+        "eval_type": "rule",
+        "num_demo": 1,
+        "num_query": 14,
+        "skills": [
+            "Text Recognition (OCR)",
+            "Domain-Specific Knowledge and Skills",
+            "Language Understanding and Generation"
+        ],
+        "input_format": "Text-Based Images and Documents",
+        "app": "Coding",
+        "output_format": "structured_output",
+        "num_input": "4-5 images"
+    },
+    {
+        "name": "counting_multi_image",
+        "score": 0.5714285714285714,
+        "eval_type": "rule",
+        "num_demo": 1,
+        "num_query": 14,
+        "skills": [
+            "Object Recognition and Classification",
+            "Scene and Event Understanding"
+        ],
+        "input_format": "Photographs",
+        "app": "Perception",
+        "output_format": "numerical_data",
+        "num_input": "2-3 images"
+    },
+    {
+        "name": "ocr_math_text_latex",
+        "score": 0.0,
+        "eval_type": "rule",
+        "num_demo": 1,
+        "num_query": 14,
+        "skills": [
+            "Text Recognition (OCR)",
+            "Mathematical and Logical Reasoning",
+            "Language Understanding and Generation"
+        ],
+        "input_format": "Text-Based Images and Documents",
+        "app": "Coding",
+        "output_format": "contextual_formatted_text",
+        "num_input": "1-image"
+    },
+    {
+        "name": "video_intent_recognition",
+        "score": 0.6428571428571429,
+        "eval_type": "rule",
+        "num_demo": 1,
+        "num_query": 14,
+        "skills": [
+            "Scene and Event Understanding",
+            "Language Understanding and Generation",
+            "Commonsense and Social Reasoning"
+        ],
+        "input_format": "Videos",
+        "app": "Knowledge",
+        "output_format": "contextual_formatted_text",
+        "num_input": "video"
+    },
+    {
+        "name": "ocr_table_to_latex",
+        "score": 0.6428571428571429,
+        "eval_type": "rule",
+        "num_demo": 1,
+        "num_query": 14,
+        "skills": [
+            "Text Recognition (OCR)",
+            "Language Understanding and Generation"
+        ],
+        "input_format": "Diagrams and Data Visualizations",
+        "app": "Coding",
+        "output_format": "structured_output",
+        "num_input": "1-image"
+    },
+    {
+        "name": "ocr_resume_employer_plain",
+        "score": 0.2857142857142857,
+        "eval_type": "rule",
+        "num_demo": 1,
+        "num_query": 14,
+        "skills": [
+            "Text Recognition (OCR)"
+        ],
+        "input_format": "Text-Based Images and Documents",
+        "app": "Perception",
+        "output_format": "contextual_formatted_text",
+        "num_input": "1-image"
+    },
+    {
+        "name": "music_sheet_sentiment",
+        "score": 0.35714285714285715,
+        "eval_type": "rule",
+        "num_demo": 1,
+        "num_query": 14,
+        "skills": [
+            "Object Recognition and Classification",
+            "Commonsense and Social Reasoning"
+        ],
+        "input_format": "Text-Based Images and Documents",
+        "app": "Knowledge",
+        "output_format": "exact_text",
+        "num_input": "4-5 images"
+    },
+    {
+        "name": "video_eval_visual_pref",
+        "score": 0.5,
+        "eval_type": "rule",
+        "num_demo": 1,
+        "num_query": 16,
+        "skills": [
+            "Object Recognition and Classification",
+            "Scene and Event Understanding"
+        ],
+        "input_format": "Videos",
+        "app": "Metrics",
+        "output_format": "multiple_choice",
+        "num_input": "video"
+    },
+    {
+        "name": "autorater_aesthetics",
+        "score": 0.5714285714285714,
+        "eval_type": "rule",
+        "num_demo": 1,
+        "num_query": 14,
+        "skills": [
+            "Object Recognition and Classification",
+            "Scene and Event Understanding"
+        ],
+        "input_format": "Photographs",
+        "app": "Metrics",
+        "output_format": "exact_text",
+        "num_input": "2-3 images"
+    },
+    {
+        "name": "planning_screenshot_barman",
+        "score": 0.0,
+        "eval_type": "rule",
+        "num_demo": 1,
+        "num_query": 15,
+        "skills": [
+            "Planning and Decision Making",
+            "Spatial and Temporal Reasoning"
+        ],
+        "input_format": "User Interface Screenshots",
+        "app": "Planning",
+        "output_format": "structured_output",
+        "num_input": "1-image"
+    },
+    {
+        "name": "ocr_resume_experience_plain",
+        "score": 0.35714285714285715,
+        "eval_type": "rule",
+        "num_demo": 1,
+        "num_query": 14,
+        "skills": [
+            "Text Recognition (OCR)",
+            "Language Understanding and Generation"
+        ],
+        "input_format": "Text-Based Images and Documents",
+        "app": "Perception",
+        "output_format": "contextual_formatted_text",
+        "num_input": "1-image"
+    },
+    {
+        "name": "autorater_control",
+        "score": 0.6428571428571429,
+        "eval_type": "rule",
+        "num_demo": 1,
+        "num_query": 14,
+        "skills": [
+            "Object Recognition and Classification",
+            "Scene and Event Understanding",
+            "Spatial and Temporal Reasoning"
+        ],
+        "input_format": "Photographs",
+        "app": "Metrics",
+        "output_format": "exact_text",
+        "num_input": "2-3 images"
+    },
+    {
+        "name": "topological_sort",
+        "score": 0.0,
+        "eval_type": "rule",
+        "num_demo": 1,
+        "num_query": 14,
+        "skills": [
+            "Mathematical and Logical Reasoning",
+            "Spatial and Temporal Reasoning"
+        ],
+        "input_format": "Diagrams and Data Visualizations",
+        "app": "Mathematics",
+        "output_format": "structured_output",
+        "num_input": "1-image"
+    },
+    {
+        "name": "ocr_article_authors",
+        "score": 0.35714285714285715,
+        "eval_type": "rule",
+        "num_demo": 1,
+        "num_query": 14,
+        "skills": [
+            "Text Recognition (OCR)"
+        ],
+        "input_format": "Text-Based Images and Documents",
+        "app": "Information_Extraction",
+        "output_format": "structured_output",
+        "num_input": "1-image"
+    },
+    {
+        "name": "hashtag_recommendation",
+        "score": 0.9404761904761905,
+        "eval_type": "rule",
+        "num_demo": 1,
+        "num_query": 14,
+        "skills": [
+            "Object Recognition and Classification",
+            "Language Understanding and Generation",
+            "Scene and Event Understanding"
+        ],
+        "input_format": "Photographs",
+        "app": "Knowledge",
+        "output_format": "structured_output",
+        "num_input": "1-image"
+    },
+    {
+        "name": "planning_visual_grippers",
+        "score": 0.13333333333333333,
+        "eval_type": "rule",
+        "num_demo": 1,
+        "num_query": 15,
+        "skills": [
+            "Planning and Decision Making",
+            "Spatial and Temporal Reasoning"
+        ],
+        "input_format": "Artistic and Creative Content",
+        "app": "Planning",
+        "output_format": "structured_output",
+        "num_input": "2-3 images"
+    },
+    {
+        "name": "semantic_matching_of_two_images",
+        "score": 0.14285714285714285,
+        "eval_type": "rule",
+        "num_demo": 1,
+        "num_query": 14,
+        "skills": [
+            "Object Recognition and Classification",
+            "Scene and Event Understanding"
+        ],
+        "input_format": "Photographs",
+        "app": "Perception",
+        "output_format": "structured_output",
+        "num_input": "2-3 images"
+    },
+    {
+        "name": "healthcare_info_judgement",
+        "score": 0.9285714285714286,
+        "eval_type": "rule",
+        "num_demo": 1,
+        "num_query": 14,
+        "skills": [
+            "Text Recognition (OCR)",
+            "Domain-Specific Knowledge and Skills",
+            "Ethical and Safety Reasoning"
+        ],
+        "input_format": "User Interface Screenshots",
+        "app": "Science",
+        "output_format": "multiple_choice",
+        "num_input": "1-image"
+    },
+    {
+        "name": "game_platform_support_identification",
+        "score": 0.7857142857142857,
+        "eval_type": "rule",
+        "num_demo": 1,
+        "num_query": 14,
+        "skills": [
+            "Text Recognition (OCR)",
+            "Object Recognition and Classification"
+        ],
+        "input_format": "User Interface Screenshots",
+        "app": "Information_Extraction",
+        "output_format": "structured_output",
+        "num_input": "1-image"
+    },
+    {
+        "name": "app_interactive_operations_word",
+        "score": 0.42857142857142855,
+        "eval_type": "rule",
+        "num_demo": 1,
+        "num_query": 14,
+        "skills": [
+            "Object Recognition and Classification",
+            "Planning and Decision Making"
+        ],
+        "input_format": "User Interface Screenshots",
+        "app": "Planning",
+        "output_format": "multiple_choice",
+        "num_input": "9-image or more"
+    },
+    {
+        "name": "booking_web_rating",
+        "score": 0.8571428571428571,
+        "eval_type": "rule",
+        "num_demo": 1,
+        "num_query": 14,
+        "skills": [
+            "Object Recognition and Classification",
+            "Text Recognition (OCR)",
+            "Scene and Event Understanding",
+            "Language Understanding and Generation"
+        ],
+        "input_format": "User Interface Screenshots",
+        "app": "Information_Extraction",
+        "output_format": "structured_output",
+        "num_input": "9-image or more"
+    },
+    {
+        "name": "position_relationship",
+        "score": 0.6,
+        "eval_type": "rule",
+        "num_demo": 1,
+        "num_query": 15,
+        "skills": [
+            "Spatial and Temporal Reasoning",
+            "Mathematical and Logical Reasoning"
+        ],
+        "input_format": "Diagrams and Data Visualizations",
+        "app": "Perception",
+        "output_format": "exact_text",
+        "num_input": "1-image"
+    },
+    {
+        "name": "number_comparison",
+        "score": 0.9285714285714286,
+        "eval_type": "rule",
+        "num_demo": 1,
+        "num_query": 14,
+        "skills": [
+            "Mathematical and Logical Reasoning"
+        ],
+        "input_format": "Text-Based Images and Documents",
+        "app": "Mathematics",
+        "output_format": "numerical_data",
+        "num_input": "1-image"
+    },
+    {
+        "name": "autorater_subject",
+        "score": 0.6428571428571429,
+        "eval_type": "rule",
+        "num_demo": 1,
+        "num_query": 14,
+        "skills": [
+            "Object Recognition and Classification",
+            "Scene and Event Understanding",
+            "Commonsense and Social Reasoning"
+        ],
+        "input_format": "Photographs",
+        "app": "Metrics",
+        "output_format": "exact_text",
+        "num_input": "2-3 images"
+    },
+    {
+        "name": "app_interactive_operations_amazon",
+        "score": 0.5,
+        "eval_type": "rule",
+        "num_demo": 1,
+        "num_query": 14,
+        "skills": [
+            "Object Recognition and Classification",
+            "Text Recognition (OCR)"
+        ],
+        "input_format": "User Interface Screenshots",
+        "app": "Planning",
+        "output_format": "multiple_choice",
+        "num_input": "9-image or more"
+    },
+    {
+        "name": "sign_language",
+        "score": 0.0,
+        "eval_type": "rule",
+        "num_demo": 1,
+        "num_query": 14,
+        "skills": [
+            "Object Recognition and Classification",
+            "Scene and Event Understanding",
+            "Language Understanding and Generation"
+        ],
+        "input_format": "Videos",
+        "app": "Knowledge",
+        "output_format": "contextual_formatted_text",
+        "num_input": "video"
+    },
+    {
+        "name": "media_homepage_profile",
+        "score": 0.21282182729551152,
+        "eval_type": "rule",
+        "num_demo": 1,
+        "num_query": 14,
+        "skills": [
+            "Text Recognition (OCR)",
+            "Language Understanding and Generation"
+        ],
+        "input_format": "User Interface Screenshots",
+        "app": "Information_Extraction",
+        "output_format": "structured_output",
+        "num_input": "9-image or more"
+    },
+    {
+        "name": "pictionary_genai_output_chinese",
+        "score": 0.21428571428571427,
+        "eval_type": "rule",
+        "num_demo": 1,
+        "num_query": 14,
+        "skills": [
+            "Object Recognition and Classification",
+            "Language Understanding and Generation"
+        ],
+        "input_format": "Artistic and Creative Content",
+        "app": "Planning",
+        "output_format": "exact_text",
+        "num_input": "1-image"
+    },
+    {
+        "name": "top_video_creator_identification",
+        "score": 0.07142857142857142,
+        "eval_type": "rule",
+        "num_demo": 1,
+        "num_query": 14,
+        "skills": [
+            "Object Recognition and Classification",
+            "Text Recognition (OCR)",
+            "Scene and Event Understanding"
+        ],
+        "input_format": "User Interface Screenshots",
+        "app": "Information_Extraction",
+        "output_format": "exact_text",
+        "num_input": "1-image"
+    },
+    {
+        "name": "autorater_mask",
+        "score": 0.7142857142857143,
+        "eval_type": "rule",
+        "num_demo": 1,
+        "num_query": 14,
+        "skills": [
+            "Object Recognition and Classification",
+            "Scene and Event Understanding"
+        ],
+        "input_format": "Photographs",
+        "app": "Metrics",
+        "output_format": "exact_text",
+        "num_input": "2-3 images"
+    },
+    {
+        "name": "video_camera_motion_description",
+        "score": 0.35714285714285715,
+        "eval_type": "rule",
+        "num_demo": 1,
+        "num_query": 14,
+        "skills": [
+            "Scene and Event Understanding",
+            "Spatial and Temporal Reasoning"
+        ],
+        "input_format": "Videos",
+        "app": "Perception",
+        "output_format": "exact_text",
+        "num_input": "video"
+    },
+    {
+        "name": "pictionary_cartoon_drawing_guess",
+        "score": 0.7857142857142857,
+        "eval_type": "rule",
+        "num_demo": 1,
+        "num_query": 14,
+        "skills": [
+            "Object Recognition and Classification",
+            "Language Understanding and Generation"
+        ],
+        "input_format": "Artistic and Creative Content",
+        "app": "Planning",
+        "output_format": "exact_text",
+        "num_input": "1-image"
+    },
+    {
+        "name": "number_puzzle_kakuro_5x5",
+        "score": 0.0,
+        "eval_type": "rule",
+        "num_demo": 1,
+        "num_query": 15,
+        "skills": [
+            "Mathematical and Logical Reasoning",
+            "Spatial and Temporal Reasoning"
+        ],
+        "input_format": "Diagrams and Data Visualizations",
+        "app": "Planning",
+        "output_format": "exact_text",
+        "num_input": "1-image"
+    },
+    {
+        "name": "pictionary_skribbl_io",
+        "score": 0.15,
+        "eval_type": "rule",
+        "num_demo": 1,
+        "num_query": 20,
+        "skills": [
+            "Object Recognition and Classification",
+            "Language Understanding and Generation"
+        ],
+        "input_format": "Artistic and Creative Content",
+        "app": "Planning",
+        "output_format": "exact_text",
+        "num_input": "1-image"
+    },
+    {
+        "name": "Ad_count_detection",
+        "score": 0.21428571428571427,
+        "eval_type": "rule",
+        "num_demo": 1,
+        "num_query": 14,
+        "skills": [
+            "Object Recognition and Classification",
+            "Scene and Event Understanding"
+        ],
+        "input_format": "User Interface Screenshots",
+        "app": "Perception",
+        "output_format": "numerical_data",
+        "num_input": "1-image"
+    },
+    {
+        "name": "web_action_prediction",
+        "score": 0.5714285714285714,
+        "eval_type": "rule",
+        "num_demo": 1,
+        "num_query": 14,
+        "skills": [
+            "Object Recognition and Classification",
+            "Spatial and Temporal Reasoning",
+            "Language Understanding and Generation"
+        ],
+        "input_format": "User Interface Screenshots",
+        "app": "Perception",
+        "output_format": "multiple_choice",
+        "num_input": "1-image"
+    },
+    {
+        "name": "app_interactive_operations_instagram",
+        "score": 0.6428571428571429,
+        "eval_type": "rule",
+        "num_demo": 1,
+        "num_query": 14,
+        "skills": [
+            "Object Recognition and Classification",
+            "Scene and Event Understanding"
+        ],
+        "input_format": "User Interface Screenshots",
+        "app": "Planning",
+        "output_format": "multiple_choice",
+        "num_input": "9-image or more"
+    },
+    {
+        "name": "number_puzzle_sudoku",
+        "score": 0.0,
+        "eval_type": "rule",
+        "num_demo": 1,
+        "num_query": 15,
+        "skills": [
+            "Mathematical and Logical Reasoning",
+            "Spatial and Temporal Reasoning"
+        ],
+        "input_format": "Diagrams and Data Visualizations",
+        "app": "Planning",
+        "output_format": "contextual_formatted_text",
+        "num_input": "1-image"
+    },
+    {
+        "name": "pictionary_doodle_guess",
+        "score": 0.8,
+        "eval_type": "rule",
+        "num_demo": 1,
+        "num_query": 15,
+        "skills": [
+            "Object Recognition and Classification",
+            "Language Understanding and Generation"
+        ],
+        "input_format": "Artistic and Creative Content",
+        "app": "Planning",
+        "output_format": "exact_text",
+        "num_input": "1-image"
+    },
+    {
+        "name": "google_streetview_circle_reasoning",
+        "score": 0.06666666666666667,
+        "eval_type": "rule",
+        "num_demo": 1,
+        "num_query": 15,
+        "skills": [
+            "Spatial and Temporal Reasoning",
+            "Scene and Event Understanding"
+        ],
+        "input_format": "Photographs",
+        "app": "Perception",
+        "output_format": "multiple_choice",
+        "num_input": "9-image or more"
+    },
+    {
+        "name": "poetry_shakespearean_sonnet",
+        "score": 0.0,
+        "eval_type": "rule",
+        "num_demo": 0,
+        "num_query": 15,
+        "skills": [
+            "Language Understanding and Generation",
+            "Object Recognition and Classification"
+        ],
+        "input_format": "Photographs",
+        "app": "Knowledge",
+        "output_format": "open_ended_output",
+        "num_input": "1-image"
+    },
+    {
+        "name": "paper_review_acceptance",
+        "score": 0.4,
+        "eval_type": "rule",
+        "num_demo": 1,
+        "num_query": 15,
+        "skills": [
+            "Object Recognition and Classification",
+            "Text Recognition (OCR)",
+            "Language Understanding and Generation",
+            "Domain-Specific Knowledge and Skills"
+        ],
+        "input_format": "Text-Based Images and Documents",
+        "app": "Metrics",
+        "output_format": "exact_text",
+        "num_input": "4-5 images"
+    },
+    {
+        "name": "functionality_matching_in_different_objects",
+        "score": 0.4642857142857143,
+        "eval_type": "rule",
+        "num_demo": 1,
+        "num_query": 14,
+        "skills": [
+            "Object Recognition and Classification",
+            "Spatial and Temporal Reasoning"
+        ],
+        "input_format": "Photographs",
+        "app": "Perception",
+        "output_format": "structured_output",
+        "num_input": "2-3 images"
+    },
+    {
+        "name": "ocr_table_to_csv",
+        "score": 0.5714285714285714,
+        "eval_type": "rule",
+        "num_demo": 1,
+        "num_query": 14,
+        "skills": [
+            "Text Recognition (OCR)",
+            "Language Understanding and Generation"
+        ],
+        "input_format": "Text-Based Images and Documents",
+        "app": "Coding",
+        "output_format": "structured_output",
+        "num_input": "1-image"
+    },
+    {
+        "name": "app_layout_understanding_twitter",
+        "score": 0.21428571428571427,
+        "eval_type": "rule",
+        "num_demo": 1,
+        "num_query": 14,
+        "skills": [
+            "Object Recognition and Classification",
+            "Scene and Event Understanding",
+            "Language Understanding and Generation"
+        ],
+        "input_format": "User Interface Screenshots",
+        "app": "Information_Extraction",
+        "output_format": "exact_text",
+        "num_input": "1-image"
+    },
+    {
+        "name": "app_interactive_operations_zoom",
+        "score": 0.4666666666666667,
+        "eval_type": "rule",
+        "num_demo": 1,
+        "num_query": 15,
+        "skills": [
+            "Object Recognition and Classification",
+            "Planning and Decision Making"
+        ],
+        "input_format": "User Interface Screenshots",
+        "app": "Planning",
+        "output_format": "multiple_choice",
+        "num_input": "9-image or more"
+    },
+    {
+        "name": "multilingual_news_qa",
+        "score": 0.6428571428571429,
+        "eval_type": "rule",
+        "num_demo": 1,
+        "num_query": 14,
+        "skills": [
+            "Language Understanding and Generation",
+            "Text Recognition (OCR)"
+        ],
+        "input_format": "Photographs",
+        "app": "Information_Extraction",
+        "output_format": "contextual_formatted_text",
+        "num_input": "9-image or more"
+    },
+    {
+        "name": "annoying_word_search",
+        "score": 0.0,
+        "eval_type": "rule",
+        "num_demo": 1,
+        "num_query": 14,
+        "skills": [
+            "Text Recognition (OCR)",
+            "Spatial and Temporal Reasoning"
+        ],
+        "input_format": "Text-Based Images and Documents",
+        "app": "Planning",
+        "output_format": "contextual_formatted_text",
+        "num_input": "1-image"
+    },
+    {
+        "name": "GUI_Act_Mobile_swipe",
+        "score": 0.49714178831993683,
+        "eval_type": "rule",
+        "num_demo": 1,
+        "num_query": 14,
+        "skills": [
+            "Spatial and Temporal Reasoning",
+            "Planning and Decision Making"
+        ],
+        "input_format": "User Interface Screenshots",
+        "app": "Planning",
+        "output_format": "structured_output",
+        "num_input": "1-image"
+    },
+    {
+        "name": "actor_recognition_in_Movie",
+        "score": 0.42857142857142855,
+        "eval_type": "rule",
+        "num_demo": 1,
+        "num_query": 14,
+        "skills": [
+            "Object Recognition and Classification",
+            "Language Understanding and Generation"
+        ],
+        "input_format": "User Interface Screenshots",
+        "app": "Knowledge",
+        "output_format": "exact_text",
+        "num_input": "1-image"
+    },
+    {
+        "name": "flowchart_code_generation",
+        "score": 0.4444444444444444,
+        "eval_type": "rule",
+        "num_demo": 1,
+        "num_query": 9,
+        "skills": [
+            "Mathematical and Logical Reasoning",
+            "Language Understanding and Generation"
+        ],
+        "input_format": "Diagrams and Data Visualizations",
+        "app": "Coding",
+        "output_format": "multiple_choice",
+        "num_input": "1-image"
+    },
+    {
+        "name": "ocr_resume_school_plain",
+        "score": 0.6428571428571429,
+        "eval_type": "rule",
+        "num_demo": 1,
+        "num_query": 14,
+        "skills": [
+            "Text Recognition (OCR)"
+        ],
+        "input_format": "Text-Based Images and Documents",
+        "app": "Information_Extraction",
+        "output_format": "contextual_formatted_text",
+        "num_input": "1-image"
+    },
+    {
+        "name": "calendar_schedule_suggestion",
+        "score": 0.07142857142857142,
+        "eval_type": "rule",
+        "num_demo": 1,
+        "num_query": 14,
+        "skills": [
+            "Spatial and Temporal Reasoning",
+            "Planning and Decision Making"
+        ],
+        "input_format": "User Interface Screenshots",
+        "app": "Planning",
+        "output_format": "contextual_formatted_text",
+        "num_input": "1-image"
+    },
+    {
+        "name": "table_understanding_fact_verification",
+        "score": 0.7261904761904762,
+        "eval_type": "rule",
+        "num_demo": 1,
+        "num_query": 14,
+        "skills": [
+            "Text Recognition (OCR)",
+            "Mathematical and Logical Reasoning",
+            "Language Understanding and Generation"
+        ],
+        "input_format": "Diagrams and Data Visualizations",
+        "app": "Perception",
+        "output_format": "contextual_formatted_text",
+        "num_input": "1-image"
+    },
+    {
+        "name": "app_layout_understanding_ppt",
+        "score": 0.5,
+        "eval_type": "rule",
+        "num_demo": 1,
+        "num_query": 14,
+        "skills": [
+            "Object Recognition and Classification",
+            "Text Recognition (OCR)",
+            "Language Understanding and Generation"
+        ],
+        "input_format": "User Interface Screenshots",
+        "app": "Information_Extraction",
+        "output_format": "exact_text",
+        "num_input": "1-image"
+    },
+    {
+        "name": "research_website_parsing_homepage",
+        "score": 0.21428571428571427,
+        "eval_type": "rule",
+        "num_demo": 1,
+        "num_query": 14,
+        "skills": [
+            "Text Recognition (OCR)",
+            "Language Understanding and Generation",
+            "Domain-Specific Knowledge and Skills"
+        ],
+        "input_format": "Text-Based Images and Documents",
+        "app": "Information_Extraction",
+        "output_format": "contextual_formatted_text",
+        "num_input": "1-image"
+    },
+    {
+        "name": "app_interactive_operations_iphone_settings",
+        "score": 0.5714285714285714,
+        "eval_type": "rule",
+        "num_demo": 1,
+        "num_query": 14,
+        "skills": [
+            "Object Recognition and Classification",
+            "Planning and Decision Making"
+        ],
+        "input_format": "User Interface Screenshots",
+        "app": "Planning",
+        "output_format": "multiple_choice",
+        "num_input": "9-image or more"
+    },
+    {
+        "name": "play_go_capture_stone",
+        "score": 0.26666666666666666,
+        "eval_type": "rule",
+        "num_demo": 1,
+        "num_query": 15,
+        "skills": [
+            "Spatial and Temporal Reasoning",
+            "Planning and Decision Making"
+        ],
+        "input_format": "Artistic and Creative Content",
+        "app": "Planning",
+        "output_format": "exact_text",
+        "num_input": "1-image"
+    },
+    {
+        "name": "research_website_parsing_blogpost",
+        "score": 0.07142857142857142,
+        "eval_type": "rule",
+        "num_demo": 1,
+        "num_query": 14,
+        "skills": [
+            "Text Recognition (OCR)",
+            "Language Understanding and Generation"
+        ],
+        "input_format": "Text-Based Images and Documents",
+        "app": "Information_Extraction",
+        "output_format": "contextual_formatted_text",
+        "num_input": "1-image"
+    },
+    {
+        "name": "pictionary_chinese_food_img2en",
+        "score": 0.6428571428571429,
+        "eval_type": "rule",
+        "num_demo": 1,
+        "num_query": 14,
+        "skills": [
+            "Object Recognition and Classification",
+            "Language Understanding and Generation"
+        ],
+        "input_format": "Photographs",
+        "app": "Planning",
+        "output_format": "exact_text",
+        "num_input": "1-image"
+    },
+    {
+        "name": "research_website_parsing_publication",
+        "score": 0.07142857142857142,
+        "eval_type": "rule",
+        "num_demo": 1,
+        "num_query": 14,
+        "skills": [
+            "Text Recognition (OCR)",
+            "Language Understanding and Generation"
+        ],
+        "input_format": "Text-Based Images and Documents",
+        "app": "Information_Extraction",
+        "output_format": "contextual_formatted_text",
+        "num_input": "1-image"
+    },
+    {
+        "name": "poetry_haiku",
+        "score": 0.7333333333333333,
+        "eval_type": "rule",
+        "num_demo": 0,
+        "num_query": 15,
+        "skills": [
+            "Object Recognition and Classification",
+            "Language Understanding and Generation"
+        ],
+        "input_format": "Photographs",
+        "app": "Knowledge",
+        "output_format": "open_ended_output",
+        "num_input": "1-image"
+    },
+    {
+        "name": "app_layout_understanding_tiktok",
+        "score": 0.5,
+        "eval_type": "rule",
+        "num_demo": 1,
+        "num_query": 14,
+        "skills": [
+            "Object Recognition and Classification",
+            "Text Recognition (OCR)"
+        ],
+        "input_format": "User Interface Screenshots",
+        "app": "Information_Extraction",
+        "output_format": "exact_text",
+        "num_input": "1-image"
+    },
+    {
+        "name": "MFC_Bench_check_clip_stable_diffusion_generate",
+        "score": 0.42857142857142855,
+        "eval_type": "rule",
+        "num_demo": 1,
+        "num_query": 14,
+        "skills": [
+            "Object Recognition and Classification",
+            "Scene and Event Understanding"
+        ],
+        "input_format": "Photographs",
+        "app": "Knowledge",
+        "output_format": "multiple_choice",
+        "num_input": "1-image"
+    },
+    {
+        "name": "scibench_calculus_wo_solution",
+        "score": 0.30612244897959184,
+        "eval_type": "rule",
+        "num_demo": 1,
+        "num_query": 49,
+        "skills": [
+            "Mathematical and Logical Reasoning",
+            "Spatial and Temporal Reasoning"
+        ],
+        "input_format": "Diagrams and Data Visualizations",
+        "app": "Mathematics",
+        "output_format": "numerical_data",
+        "num_input": "1-image"
+    },
+    {
+        "name": "app_layout_understanding_alipay",
+        "score": 0.23529411764705882,
+        "eval_type": "rule",
+        "num_demo": 1,
+        "num_query": 17,
+        "skills": [
+            "Object Recognition and Classification",
+            "Text Recognition (OCR)",
+            "Language Understanding and Generation"
+        ],
+        "input_format": "User Interface Screenshots",
+        "app": "Information_Extraction",
+        "output_format": "exact_text",
+        "num_input": "1-image"
+    },
+    {
+        "name": "counterfactual_arithmetic",
+        "score": 0.35714285714285715,
+        "eval_type": "rule",
+        "num_demo": 1,
+        "num_query": 14,
+        "skills": [
+            "Mathematical and Logical Reasoning"
+        ],
+        "input_format": "Text-Based Images and Documents",
+        "app": "Mathematics",
+        "output_format": "numerical_data",
+        "num_input": "1-image"
+    },
+    {
+        "name": "app_interactive_operations_leetcode",
+        "score": 0.5,
+        "eval_type": "rule",
+        "num_demo": 1,
+        "num_query": 14,
+        "skills": [
+            "Object Recognition and Classification",
+            "Planning and Decision Making"
+        ],
+        "input_format": "User Interface Screenshots",
+        "app": "Planning",
+        "output_format": "multiple_choice",
+        "num_input": "9-image or more"
+    },
+    {
+        "name": "3d_indoor_scene_text_bbox_prediction",
+        "score": 0.09826063389901919,
+        "eval_type": "rule",
+        "num_demo": 1,
+        "num_query": 14,
+        "skills": [
+            "Object Recognition and Classification",
+            "Scene and Event Understanding",
+            "Spatial and Temporal Reasoning"
+        ],
+        "input_format": "3D Models and Aerial Imagery",
+        "app": "Perception",
+        "output_format": "numerical_data",
+        "num_input": "1-image"
+    },
+    {
+        "name": "stock_info_retrieval",
+        "score": 0.5,
+        "eval_type": "rule",
+        "num_demo": 1,
+        "num_query": 14,
+        "skills": [
+            "Object Recognition and Classification",
+            "Text Recognition (OCR)"
+        ],
+        "input_format": "User Interface Screenshots",
+        "app": "Information_Extraction",
+        "output_format": "contextual_formatted_text",
+        "num_input": "9-image or more"
+    },
+    {
+        "name": "app_layout_understanding_youtube",
+        "score": 0.6428571428571429,
+        "eval_type": "rule",
+        "num_demo": 1,
+        "num_query": 14,
+        "skills": [
+            "Object Recognition and Classification",
+            "Scene and Event Understanding",
+            "Text Recognition (OCR)"
+        ],
+        "input_format": "User Interface Screenshots",
+        "app": "Information_Extraction",
+        "output_format": "exact_text",
+        "num_input": "1-image"
+    },
+    {
+        "name": "dvqa",
+        "score": 0.9473684210526315,
+        "eval_type": "rule",
+        "num_demo": 1,
+        "num_query": 19,
+        "skills": [
+            "Object Recognition and Classification",
+            "Mathematical and Logical Reasoning",
+            "Text Recognition (OCR)"
+        ],
+        "input_format": "Diagrams and Data Visualizations",
+        "app": "Perception",
+        "output_format": "numerical_data",
+        "num_input": "1-image"
+    },
+    {
+        "name": "license_plate_recognition",
+        "score": 0.8571428571428571,
+        "eval_type": "rule",
+        "num_demo": 1,
+        "num_query": 14,
+        "skills": [
+            "Text Recognition (OCR)",
+            "Object Recognition and Classification"
+        ],
+        "input_format": "Photographs",
+        "app": "Perception",
+        "output_format": "exact_text",
+        "num_input": "1-image"
+    },
+    {
+        "name": "MFC_Bench_check_background_change",
+        "score": 0.7857142857142857,
+        "eval_type": "rule",
+        "num_demo": 1,
+        "num_query": 14,
+        "skills": [
+            "Scene and Event Understanding",
+            "Object Recognition and Classification"
+        ],
+        "input_format": "Photographs",
+        "app": "Knowledge",
+        "output_format": "multiple_choice",
+        "num_input": "1-image"
+    },
+    {
+        "name": "ti_fused_vqa_math",
+        "score": 0.35714285714285715,
+        "eval_type": "rule",
+        "num_demo": 1,
+        "num_query": 14,
+        "skills": [
+            "Mathematical and Logical Reasoning",
+            "Text Recognition (OCR)",
+            "Language Understanding and Generation"
+        ],
+        "input_format": "Diagrams and Data Visualizations",
+        "app": "Mathematics",
+        "output_format": "multiple_choice",
+        "num_input": "1-image"
+    },
+    {
+        "name": "super_clevr",
+        "score": 0.42857142857142855,
+        "eval_type": "rule",
+        "num_demo": 1,
+        "num_query": 14,
+        "skills": [
+            "Object Recognition and Classification",
+            "Spatial and Temporal Reasoning",
+            "Mathematical and Logical Reasoning"
+        ],
+        "input_format": "Photographs",
+        "app": "Perception",
+        "output_format": "contextual_formatted_text",
+        "num_input": "1-image"
+    },
+    {
+        "name": "geometry_analytic",
+        "score": 0.07142857142857142,
+        "eval_type": "rule",
+        "num_demo": 1,
+        "num_query": 14,
+        "skills": [
+            "Object Recognition and Classification",
+            "Spatial and Temporal Reasoning",
+            "Mathematical and Logical Reasoning"
+        ],
+        "input_format": "Diagrams and Data Visualizations",
+        "app": "Mathematics",
+        "output_format": "contextual_formatted_text",
+        "num_input": "1-image"
+    },
+    {
+        "name": "MFC_Bench_check_face_attribute_edit",
+        "score": 0.42857142857142855,
+        "eval_type": "rule",
+        "num_demo": 1,
+        "num_query": 14,
+        "skills": [
+            "Object Recognition and Classification",
+            "Scene and Event Understanding",
+            "Commonsense and Social Reasoning"
+        ],
+        "input_format": "Photographs",
+        "app": "Knowledge",
+        "output_format": "multiple_choice",
+        "num_input": "1-image"
+    },
+    {
+        "name": "app_layout_understanding_amazon",
+        "score": 0.0,
+        "eval_type": "rule",
+        "num_demo": 1,
+        "num_query": 14,
+        "skills": [
+            "Object Recognition and Classification",
+            "Text Recognition (OCR)",
+            "Scene and Event Understanding"
+        ],
+        "input_format": "User Interface Screenshots",
+        "app": "Information_Extraction",
+        "output_format": "exact_text",
+        "num_input": "1-image"
+    },
+    {
+        "name": "arc_agi",
+        "score": 0.0,
+        "eval_type": "rule",
+        "num_demo": 1,
+        "num_query": 14,
+        "skills": [
+            "Object Recognition and Classification",
+            "Mathematical and Logical Reasoning",
+            "Spatial and Temporal Reasoning"
+        ],
+        "input_format": "Diagrams and Data Visualizations",
+        "app": "Planning",
+        "output_format": "exact_text",
+        "num_input": "4-5 images"
+    },
+    {
+        "name": "multilingual_movie_info_parsing",
+        "score": 0.5408163265306122,
+        "eval_type": "rule",
+        "num_demo": 1,
+        "num_query": 14,
+        "skills": [
+            "Text Recognition (OCR)",
+            "Language Understanding and Generation"
+        ],
+        "input_format": "User Interface Screenshots",
+        "app": "Information_Extraction",
+        "output_format": "structured_output",
+        "num_input": "1-image"
+    },
+    {
+        "name": "newspaper_ocr_in_query_box",
+        "score": 0.2,
+        "eval_type": "rule",
+        "num_demo": 1,
+        "num_query": 15,
+        "skills": [
+            "Text Recognition (OCR)",
+            "Spatial and Temporal Reasoning"
+        ],
+        "input_format": "Text-Based Images and Documents",
+        "app": "Perception",
+        "output_format": "contextual_formatted_text",
+        "num_input": "1-image"
+    },
+    {
+        "name": "app_interactive_operations_tiktok",
+        "score": 0.5,
+        "eval_type": "rule",
+        "num_demo": 1,
+        "num_query": 14,
+        "skills": [
+            "Object Recognition and Classification",
+            "Text Recognition (OCR)"
+        ],
+        "input_format": "User Interface Screenshots",
+        "app": "Planning",
+        "output_format": "multiple_choice",
+        "num_input": "9-image or more"
+    },
+    {
+        "name": "poetry_custom_rhyming_scheme",
+        "score": 0.13333333333333333,
+        "eval_type": "rule",
+        "num_demo": 0,
+        "num_query": 15,
+        "skills": [
+            "Object Recognition and Classification",
+            "Language Understanding and Generation"
+        ],
+        "input_format": "Photographs",
+        "app": "Knowledge",
+        "output_format": "open_ended_output",
+        "num_input": "1-image"
+    },
+    {
+        "name": "insect_order_classification",
+        "score": 0.4,
+        "eval_type": "rule",
+        "num_demo": 1,
+        "num_query": 15,
+        "skills": [
+            "Object Recognition and Classification",
+            "Domain-Specific Knowledge and Skills"
+        ],
+        "input_format": "Photographs",
+        "app": "Knowledge",
+        "output_format": "contextual_formatted_text",
+        "num_input": "1-image"
+    },
+    {
+        "name": "app_layout_understanding_word",
+        "score": 0.42857142857142855,
+        "eval_type": "rule",
+        "num_demo": 1,
+        "num_query": 14,
+        "skills": [
+            "Object Recognition and Classification",
+            "Text Recognition (OCR)",
+            "Language Understanding and Generation"
+        ],
+        "input_format": "User Interface Screenshots",
+        "app": "Information_Extraction",
+        "output_format": "exact_text",
+        "num_input": "1-image"
+    },
+    {
+        "name": "MFC_Bench_check_out_of_context",
+        "score": 0.8571428571428571,
+        "eval_type": "rule",
+        "num_demo": 1,
+        "num_query": 14,
+        "skills": [
+            "Scene and Event Understanding",
+            "Language Understanding and Generation",
+            "Commonsense and Social Reasoning"
+        ],
+        "input_format": "Photographs",
+        "app": "Knowledge",
+        "output_format": "multiple_choice",
+        "num_input": "1-image"
+    },
+    {
+        "name": "long_string_number_recognition",
+        "score": 0.5714285714285714,
+        "eval_type": "rule",
+        "num_demo": 1,
+        "num_query": 14,
+        "skills": [
+            "Text Recognition (OCR)"
+        ],
+        "input_format": "Text-Based Images and Documents",
+        "app": "Perception",
+        "output_format": "exact_text",
+        "num_input": "1-image"
+    },
+    {
+        "name": "coco_person_detection",
+        "score": 0.5916519873131821,
+        "eval_type": "rule",
+        "num_demo": 1,
+        "num_query": 14,
+        "skills": [
+            "Object Recognition and Classification"
+        ],
+        "input_format": "Photographs",
+        "app": "Perception",
+        "output_format": "numerical_data",
+        "num_input": "1-image"
+    },
+    {
+        "name": "app_interactive_operations_youtube",
+        "score": 0.5714285714285714,
+        "eval_type": "rule",
+        "num_demo": 1,
+        "num_query": 14,
+        "skills": [
+            "Object Recognition and Classification",
+            "Language Understanding and Generation"
+        ],
+        "input_format": "User Interface Screenshots",
+        "app": "Planning",
+        "output_format": "multiple_choice",
+        "num_input": "9-image or more"
+    },
+    {
+        "name": "chess_puzzles_checkmate",
+        "score": 0.0,
+        "eval_type": "rule",
+        "num_demo": 1,
+        "num_query": 14,
+        "skills": [
+            "Spatial and Temporal Reasoning",
+            "Planning and Decision Making",
+            "Mathematical and Logical Reasoning"
+        ],
+        "input_format": "User Interface Screenshots",
+        "app": "Planning",
+        "output_format": "structured_output",
+        "num_input": "1-image"
+    },
+    {
+        "name": "vibe_eval_short_phrase",
+        "score": 0.2857142857142857,
+        "eval_type": "rule",
+        "num_demo": 1,
+        "num_query": 14,
+        "skills": [
+            "Object Recognition and Classification",
+            "Commonsense and Social Reasoning"
+        ],
+        "input_format": "Photographs",
+        "app": "Knowledge",
+        "output_format": "contextual_formatted_text",
+        "num_input": "1-image"
+    },
+    {
+        "name": "IAM_line_ocr_and_locate",
+        "score": 0.6702481953279147,
+        "eval_type": "rule",
+        "num_demo": 1,
+        "num_query": 14,
+        "skills": [
+            "Text Recognition (OCR)",
+            "Spatial and Temporal Reasoning"
+        ],
+        "input_format": "Text-Based Images and Documents",
+        "app": "Perception",
+        "output_format": "structured_output",
+        "num_input": "2-3 images"
+    },
+    {
+        "name": "image_style_recognition",
+        "score": 1.0,
+        "eval_type": "rule",
+        "num_demo": 1,
+        "num_query": 15,
+        "skills": [
+            "Object Recognition and Classification",
+            "Domain-Specific Knowledge and Skills"
+        ],
+        "input_format": "Artistic and Creative Content",
+        "app": "Perception",
+        "output_format": "exact_text",
+        "num_input": "1-image"
+    },
+    {
+        "name": "memorization_famous_treaty",
+        "score": 0.6785714285714286,
+        "eval_type": "rule",
+        "num_demo": 1,
+        "num_query": 14,
+        "skills": [
+            "Object Recognition and Classification",
+            "Domain-Specific Knowledge and Skills"
+        ],
+        "input_format": "Artistic and Creative Content",
+        "app": "Knowledge",
+        "output_format": "structured_output",
+        "num_input": "1-image"
+    },
+    {
+        "name": "MFC_Bench_check_face_swap",
+        "score": 0.5,
+        "eval_type": "rule",
+        "num_demo": 1,
+        "num_query": 14,
+        "skills": [
+            "Object Recognition and Classification",
+            "Scene and Event Understanding",
+            "Language Understanding and Generation",
+            "Commonsense and Social Reasoning"
+        ],
+        "input_format": "Photographs",
+        "app": "Knowledge",
+        "output_format": "multiple_choice",
+        "num_input": "1-image"
+    },
+    {
+        "name": "constrained_generation_multi_contain_position_only",
+        "score": 0.2,
+        "eval_type": "rule",
+        "num_demo": 0,
+        "num_query": 15,
+        "skills": [
+            "Language Understanding and Generation",
+            "Scene and Event Understanding"
+        ],
+        "input_format": "Photographs",
+        "app": "Perception",
+        "output_format": "open_ended_output",
+        "num_input": "2-3 images"
+    },
+    {
+        "name": "chess_puzzles_crushing",
+        "score": 0.0,
+        "eval_type": "rule",
+        "num_demo": 1,
+        "num_query": 14,
+        "skills": [
+            "Planning and Decision Making",
+            "Mathematical and Logical Reasoning"
+        ],
+        "input_format": "User Interface Screenshots",
+        "app": "Planning",
+        "output_format": "exact_text",
+        "num_input": "1-image"
+    },
+    {
+        "name": "app_interactive_operations_notes",
+        "score": 0.2857142857142857,
+        "eval_type": "rule",
+        "num_demo": 1,
+        "num_query": 14,
+        "skills": [
+            "Object Recognition and Classification",
+            "Planning and Decision Making"
+        ],
+        "input_format": "User Interface Screenshots",
+        "app": "Planning",
+        "output_format": "multiple_choice",
+        "num_input": "9-image or more"
+    },
+    {
+        "name": "app_layout_understanding_instagram",
+        "score": 0.42857142857142855,
+        "eval_type": "rule",
+        "num_demo": 1,
+        "num_query": 14,
+        "skills": [
+            "Object Recognition and Classification",
+            "Scene and Event Understanding"
+        ],
+        "input_format": "User Interface Screenshots",
+        "app": "Information_Extraction",
+        "output_format": "exact_text",
+        "num_input": "1-image"
+    },
+    {
+        "name": "ti_fused_vqa_chemistry",
+        "score": 0.35714285714285715,
+        "eval_type": "rule",
+        "num_demo": 1,
+        "num_query": 14,
+        "skills": [
+            "Text Recognition (OCR)",
+            "Domain-Specific Knowledge and Skills"
+        ],
+        "input_format": "Diagrams and Data Visualizations",
+        "app": "Science",
+        "output_format": "multiple_choice",
+        "num_input": "1-image"
+    },
+    {
+        "name": "MFC_Bench_check_text_style",
+        "score": 0.6428571428571429,
+        "eval_type": "rule",
+        "num_demo": 1,
+        "num_query": 14,
+        "skills": [
+            "Text Recognition (OCR)",
+            "Language Understanding and Generation"
+        ],
+        "input_format": "Photographs",
+        "app": "Knowledge",
+        "output_format": "multiple_choice",
+        "num_input": "1-image"
+    },
+    {
+        "name": "chess_puzzles_equality",
+        "score": 0.0,
+        "eval_type": "rule",
+        "num_demo": 1,
+        "num_query": 15,
+        "skills": [
+            "Spatial and Temporal Reasoning",
+            "Planning and Decision Making",
+            "Mathematical and Logical Reasoning"
+        ],
+        "input_format": "User Interface Screenshots",
+        "app": "Planning",
+        "output_format": "structured_output",
+        "num_input": "1-image"
+    },
+    {
+        "name": "music_info_parsing",
+        "score": 0.39285714285714285,
+        "eval_type": "rule",
+        "num_demo": 1,
+        "num_query": 14,
+        "skills": [
+            "Text Recognition (OCR)",
+            "Object Recognition and Classification"
+        ],
+        "input_format": "User Interface Screenshots",
+        "app": "Information_Extraction",
+        "output_format": "structured_output",
+        "num_input": "1-image"
+    },
+    {
+        "name": "logo2k_same_type_logo_retrieval",
+        "score": 0.8214285714285714,
+        "eval_type": "rule",
+        "num_demo": 1,
+        "num_query": 14,
+        "skills": [
+            "Object Recognition and Classification"
+        ],
+        "input_format": "Text-Based Images and Documents",
+        "app": "Knowledge",
+        "output_format": "structured_output",
+        "num_input": "6-8 images"
+    },
+    {
+        "name": "MFC_Bench_check_text_entity_replace",
+        "score": 0.7142857142857143,
+        "eval_type": "rule",
+        "num_demo": 1,
+        "num_query": 14,
+        "skills": [
+            "Object Recognition and Classification",
+            "Language Understanding and Generation",
+            "Commonsense and Social Reasoning"
+        ],
+        "input_format": "Photographs",
+        "app": "Knowledge",
+        "output_format": "multiple_choice",
+        "num_input": "1-image"
+    },
+    {
+        "name": "deciphering_oracle_bone",
+        "score": 0.0,
+        "eval_type": "rule",
+        "num_demo": 1,
+        "num_query": 14,
+        "skills": [
+            "Text Recognition (OCR)",
+            "Domain-Specific Knowledge and Skills"
+        ],
+        "input_format": "Text-Based Images and Documents",
+        "app": "Knowledge",
+        "output_format": "exact_text",
+        "num_input": "1-image"
+    },
+    {
+        "name": "app_layout_understanding_zoom",
+        "score": 0.5333333333333333,
+        "eval_type": "rule",
+        "num_demo": 1,
+        "num_query": 15,
+        "skills": [
+            "Object Recognition and Classification",
+            "Language Understanding and Generation"
+        ],
+        "input_format": "User Interface Screenshots",
+        "app": "Information_Extraction",
+        "output_format": "exact_text",
+        "num_input": "1-image"
+    },
+    {
+        "name": "ocr_math_TheoremQA",
+        "score": 0.35714285714285715,
+        "eval_type": "rule",
+        "num_demo": 1,
+        "num_query": 14,
+        "skills": [
+            "Text Recognition (OCR)",
+            "Mathematical and Logical Reasoning"
+        ],
+        "input_format": "Text-Based Images and Documents",
+        "app": "Mathematics",
+        "output_format": "contextual_formatted_text",
+        "num_input": "1-image"
+    },
+    {
+        "name": "memorization_chinese_celebrity",
+        "score": 0.2857142857142857,
+        "eval_type": "rule",
+        "num_demo": 1,
+        "num_query": 14,
+        "skills": [
+            "Object Recognition and Classification",
+            "Language Understanding and Generation"
+        ],
+        "input_format": "Photographs",
+        "app": "Knowledge",
+        "output_format": "structured_output",
+        "num_input": "1-image"
+    },
+    {
+        "name": "waldo",
+        "score": 0.0002062628914307136,
+        "eval_type": "rule",
+        "num_demo": 1,
+        "num_query": 18,
+        "skills": [
+            "Object Recognition and Classification",
+            "Spatial and Temporal Reasoning"
+        ],
+        "input_format": "Artistic and Creative Content",
+        "app": "Perception",
+        "output_format": "structured_output",
+        "num_input": "2-3 images"
+    },
+    {
+        "name": "app_layout_understanding_iphone_settings",
+        "score": 0.5,
+        "eval_type": "rule",
+        "num_demo": 1,
+        "num_query": 14,
+        "skills": [
+            "Object Recognition and Classification",
+            "Scene and Event Understanding"
+        ],
+        "input_format": "User Interface Screenshots",
+        "app": "Information_Extraction",
+        "output_format": "exact_text",
+        "num_input": "1-image"
+    },
+    {
+        "name": "signboard_identification",
+        "score": 0.4666666666666667,
+        "eval_type": "rule",
+        "num_demo": 1,
+        "num_query": 15,
+        "skills": [
+            "Text Recognition (OCR)",
+            "Language Understanding and Generation"
+        ],
+        "input_format": "Photographs",
+        "app": "Perception",
+        "output_format": "contextual_formatted_text",
+        "num_input": "1-image"
+    },
+    {
+        "name": "visualdial_visual_dialog_image_guessing",
+        "score": 0.7333333333333333,
+        "eval_type": "rule",
+        "num_demo": 1,
+        "num_query": 15,
+        "skills": [
+            "Object Recognition and Classification",
+            "Scene and Event Understanding",
+            "Language Understanding and Generation"
+        ],
+        "input_format": "Photographs",
+        "app": "Perception",
+        "output_format": "multiple_choice",
+        "num_input": "9-image or more"
+    },
+    {
+        "name": "graph_shortest_path_planar",
+        "score": 0.35714285714285715,
+        "eval_type": "rule",
+        "num_demo": 1,
+        "num_query": 14,
+        "skills": [
+            "Spatial and Temporal Reasoning",
+            "Mathematical and Logical Reasoning"
+        ],
+        "input_format": "Diagrams and Data Visualizations",
+        "app": "Mathematics",
+        "output_format": "numerical_data",
+        "num_input": "1-image"
+    },
+    {
+        "name": "Bongard_Problem",
+        "score": 0.2894736842105263,
+        "eval_type": "rule",
+        "num_demo": 1,
+        "num_query": 19,
+        "skills": [
+            "Object Recognition and Classification",
+            "Mathematical and Logical Reasoning"
+        ],
+        "input_format": "Diagrams and Data Visualizations",
+        "app": "Planning",
+        "output_format": "contextual_formatted_text",
+        "num_input": "1-image"
+    },
+    {
+        "name": "memorization_indian_celebrity",
+        "score": 0.2857142857142857,
+        "eval_type": "rule",
+        "num_demo": 1,
+        "num_query": 14,
+        "skills": [
+            "Object Recognition and Classification",
+            "Domain-Specific Knowledge and Skills"
+        ],
+        "input_format": "Photographs",
+        "app": "Knowledge",
+        "output_format": "structured_output",
+        "num_input": "1-image"
+    },
+    {
+        "name": "graph_chordless_cycle",
+        "score": 0.35714285714285715,
+        "eval_type": "rule",
+        "num_demo": 1,
+        "num_query": 14,
+        "skills": [
+            "Mathematical and Logical Reasoning",
+            "Spatial and Temporal Reasoning"
+        ],
+        "input_format": "Diagrams and Data Visualizations",
+        "app": "Mathematics",
+        "output_format": "numerical_data",
+        "num_input": "1-image"
+    },
+    {
+        "name": "clevr_arithmetic",
+        "score": 0.2631578947368421,
+        "eval_type": "rule",
+        "num_demo": 1,
+        "num_query": 19,
+        "skills": [
+            "Object Recognition and Classification",
+            "Mathematical and Logical Reasoning"
+        ],
+        "input_format": "Photographs",
+        "app": "Mathematics",
+        "output_format": "numerical_data",
+        "num_input": "2-3 images"
+    },
+    {
+        "name": "average_humidity_estimate_plot",
+        "score": 0.5600000000000002,
+        "eval_type": "rule",
+        "num_demo": 1,
+        "num_query": 15,
+        "skills": [
+            "Mathematical and Logical Reasoning",
+            "Spatial and Temporal Reasoning"
+        ],
+        "input_format": "Diagrams and Data Visualizations",
+        "app": "Perception",
+        "output_format": "numerical_data",
+        "num_input": "1-image"
+    },
+    {
+        "name": "app_layout_understanding_leetcode",
+        "score": 0.2857142857142857,
+        "eval_type": "rule",
+        "num_demo": 1,
+        "num_query": 14,
+        "skills": [
+            "Object Recognition and Classification",
+            "Text Recognition (OCR)"
+        ],
+        "input_format": "User Interface Screenshots",
+        "app": "Information_Extraction",
+        "output_format": "exact_text",
+        "num_input": "1-image"
+    },
+    {
+        "name": "graph_hamiltonian_cycle",
+        "score": 0.5089285714285714,
+        "eval_type": "rule",
+        "num_demo": 1,
+        "num_query": 14,
+        "skills": [
+            "Mathematical and Logical Reasoning",
+            "Spatial and Temporal Reasoning"
+        ],
+        "input_format": "Diagrams and Data Visualizations",
+        "app": "Mathematics",
+        "output_format": "structured_output",
+        "num_input": "6-8 images"
+    },
+    {
+        "name": "memorization_papers",
+        "score": 0.06666666666666667,
+        "eval_type": "rule",
+        "num_demo": 1,
+        "num_query": 15,
+        "skills": [
+            "Text Recognition (OCR)",
+            "Domain-Specific Knowledge and Skills"
+        ],
+        "input_format": "Diagrams and Data Visualizations",
+        "app": "Knowledge",
+        "output_format": "structured_output",
+        "num_input": "1-image"
+    },
+    {
+        "name": "long_string_letter_recognition",
+        "score": 0.0,
+        "eval_type": "rule",
+        "num_demo": 1,
+        "num_query": 14,
+        "skills": [
+            "Text Recognition (OCR)"
+        ],
+        "input_format": "Text-Based Images and Documents",
+        "app": "Perception",
+        "output_format": "exact_text",
+        "num_input": "1-image"
+    },
+    {
+        "name": "face_keypoint_detection",
+        "score": 0.746390336033466,
+        "eval_type": "rule",
+        "num_demo": 1,
+        "num_query": 14,
+        "skills": [
+            "Object Recognition and Classification",
+            "Scene and Event Understanding"
+        ],
+        "input_format": "Photographs",
+        "app": "Perception",
+        "output_format": "structured_output",
+        "num_input": "1-image"
+    },
+    {
+        "name": "math_breakpoint",
+        "score": 0.8666666666666667,
+        "eval_type": "rule",
+        "num_demo": 1,
+        "num_query": 15,
+        "skills": [
+            "Mathematical and Logical Reasoning",
+            "Spatial and Temporal Reasoning"
+        ],
+        "input_format": "Diagrams and Data Visualizations",
+        "app": "Mathematics",
+        "output_format": "numerical_data",
+        "num_input": "1-image"
+    },
+    {
+        "name": "nlvr2_two_image_compare_qa",
+        "score": 0.8571428571428571,
+        "eval_type": "rule",
+        "num_demo": 1,
+        "num_query": 14,
+        "skills": [
+            "Object Recognition and Classification",
+            "Scene and Event Understanding",
+            "Language Understanding and Generation"
+        ],
+        "input_format": "Photographs",
+        "app": "Perception",
+        "output_format": "multiple_choice",
+        "num_input": "1-image"
+    },
+    {
+        "name": "iconqa",
+        "score": 0.3157894736842105,
+        "eval_type": "rule",
+        "num_demo": 1,
+        "num_query": 19,
+        "skills": [
+            "Object Recognition and Classification",
+            "Mathematical and Logical Reasoning"
+        ],
+        "input_format": "Artistic and Creative Content",
+        "app": "Mathematics",
+        "output_format": "numerical_data",
+        "num_input": "1-image"
+    },
+    {
+        "name": "exchange_rate_estimate_plot",
+        "score": 0.9621285714285712,
+        "eval_type": "rule",
+        "num_demo": 1,
+        "num_query": 14,
+        "skills": [
+            "Mathematical and Logical Reasoning",
+            "Spatial and Temporal Reasoning"
+        ],
+        "input_format": "Diagrams and Data Visualizations",
+        "app": "Perception",
+        "output_format": "numerical_data",
+        "num_input": "1-image"
+    },
+    {
+        "name": "math_parity",
+        "score": 0.7333333333333333,
+        "eval_type": "rule",
+        "num_demo": 1,
+        "num_query": 15,
+        "skills": [
+            "Mathematical and Logical Reasoning",
+            "Spatial and Temporal Reasoning"
+        ],
+        "input_format": "Diagrams and Data Visualizations",
+        "app": "Mathematics",
+        "output_format": "multiple_choice",
+        "num_input": "1-image"
+    },
+    {
+        "name": "algebra",
+        "score": 0.2857142857142857,
+        "eval_type": "rule",
+        "num_demo": 1,
+        "num_query": 14,
+        "skills": [
+            "Mathematical and Logical Reasoning",
+            "Spatial and Temporal Reasoning"
+        ],
+        "input_format": "Diagrams and Data Visualizations",
+        "app": "Mathematics",
+        "output_format": "contextual_formatted_text",
+        "num_input": "1-image"
+    },
+    {
+        "name": "quizlet_question_solving",
+        "score": 0.42857142857142855,
+        "eval_type": "rule",
+        "num_demo": 1,
+        "num_query": 14,
+        "skills": [
+            "Text Recognition (OCR)",
+            "Mathematical and Logical Reasoning"
+        ],
+        "input_format": "User Interface Screenshots",
+        "app": "Science",
+        "output_format": "contextual_formatted_text",
+        "num_input": "1-image"
+    },
+    {
+        "name": "electricity_future_prediction_from_table",
+        "score": 0.7057894736842105,
+        "eval_type": "rule",
+        "num_demo": 1,
+        "num_query": 19,
+        "skills": [
+            "Mathematical and Logical Reasoning",
+            "Spatial and Temporal Reasoning"
+        ],
+        "input_format": "Diagrams and Data Visualizations",
+        "app": "Perception",
+        "output_format": "numerical_data",
+        "num_input": "1-image"
+    },
+    {
+        "name": "places365_similar_scene_retrieval",
+        "score": 0.6428571428571429,
+        "eval_type": "rule",
+        "num_demo": 1,
+        "num_query": 14,
+        "skills": [
+            "Scene and Event Understanding",
+            "Object Recognition and Classification"
+        ],
+        "input_format": "Photographs",
+        "app": "Information_Extraction",
+        "output_format": "multiple_choice",
+        "num_input": "9-image or more"
+    },
+    {
+        "name": "app_layout_understanding_excel",
+        "score": 0.35714285714285715,
+        "eval_type": "rule",
+        "num_demo": 1,
+        "num_query": 14,
+        "skills": [
+            "Object Recognition and Classification",
+            "Text Recognition (OCR)"
+        ],
+        "input_format": "User Interface Screenshots",
+        "app": "Information_Extraction",
+        "output_format": "exact_text",
+        "num_input": "1-image"
+    },
+    {
+        "name": "ti_fused_vqa_biology",
+        "score": 0.5,
+        "eval_type": "rule",
+        "num_demo": 1,
+        "num_query": 14,
+        "skills": [
+            "Object Recognition and Classification",
+            "Domain-Specific Knowledge and Skills"
+        ],
+        "input_format": "Diagrams and Data Visualizations",
+        "app": "Science",
+        "output_format": "contextual_formatted_text",
+        "num_input": "1-image"
+    },
+    {
+        "name": "movie_info_retrieval",
+        "score": 0.7142857142857143,
+        "eval_type": "rule",
+        "num_demo": 1,
+        "num_query": 14,
+        "skills": [
+            "Text Recognition (OCR)",
+            "Object Recognition and Classification",
+            "Language Understanding and Generation"
+        ],
+        "input_format": "User Interface Screenshots",
+        "app": "Information_Extraction",
+        "output_format": "contextual_formatted_text",
+        "num_input": "9-image or more"
+    },
+    {
+        "name": "question_solution_solving",
+        "score": 0.2857142857142857,
+        "eval_type": "rule",
+        "num_demo": 1,
+        "num_query": 14,
+        "skills": [
+            "Text Recognition (OCR)",
+            "Mathematical and Logical Reasoning",
+            "Language Understanding and Generation"
+        ],
+        "input_format": "User Interface Screenshots",
+        "app": "Science",
+        "output_format": "contextual_formatted_text",
+        "num_input": "1-image"
+    },
+    {
+        "name": "coco_ood_global_image_retrieval_by_query_property",
+        "score": 0.681547619047619,
+        "eval_type": "rule",
+        "num_demo": 1,
+        "num_query": 14,
+        "skills": [
+            "Object Recognition and Classification",
+            "Scene and Event Understanding"
+        ],
+        "input_format": "Artistic and Creative Content",
+        "app": "Information_Extraction",
+        "output_format": "structured_output",
+        "num_input": "9-image or more"
+    },
+    {
+        "name": "action_sequence",
+        "score": 0.6428571428571429,
+        "eval_type": "rule",
+        "num_demo": 1,
+        "num_query": 14,
+        "skills": [
+            "Scene and Event Understanding",
+            "Spatial and Temporal Reasoning"
+        ],
+        "input_format": "Photographs",
+        "app": "Perception",
+        "output_format": "exact_text",
+        "num_input": "9-image or more"
+    },
+    {
+        "name": "science_molecule_chemistry",
+        "score": 0.9333333333333333,
+        "eval_type": "rule",
+        "num_demo": 1,
+        "num_query": 15,
+        "skills": [
+            "Mathematical and Logical Reasoning",
+            "Domain-Specific Knowledge and Skills"
+        ],
+        "input_format": "Diagrams and Data Visualizations",
+        "app": "Science",
+        "output_format": "contextual_formatted_text",
+        "num_input": "1-image"
+    },
+    {
+        "name": "graph_shortest_path_kamada_kawai",
+        "score": 0.42857142857142855,
+        "eval_type": "rule",
+        "num_demo": 1,
+        "num_query": 14,
+        "skills": [
+            "Mathematical and Logical Reasoning",
+            "Spatial and Temporal Reasoning"
+        ],
+        "input_format": "Diagrams and Data Visualizations",
+        "app": "Mathematics",
+        "output_format": "numerical_data",
+        "num_input": "1-image"
+    },
+    {
+        "name": "graph_isomorphism",
+        "score": 0.4666666666666667,
+        "eval_type": "rule",
+        "num_demo": 1,
+        "num_query": 15,
+        "skills": [
+            "Mathematical and Logical Reasoning",
+            "Spatial and Temporal Reasoning"
+        ],
+        "input_format": "Diagrams and Data Visualizations",
+        "app": "Mathematics",
+        "output_format": "multiple_choice",
+        "num_input": "1-image"
+    },
+    {
+        "name": "geometry_length",
+        "score": 0.14285714285714285,
+        "eval_type": "rule",
+        "num_demo": 1,
+        "num_query": 14,
+        "skills": [
+            "Object Recognition and Classification",
+            "Mathematical and Logical Reasoning"
+        ],
+        "input_format": "Diagrams and Data Visualizations",
+        "app": "Mathematics",
+        "output_format": "contextual_formatted_text",
+        "num_input": "1-image"
+    },
+    {
+        "name": "clevrer_moving_direction_video",
+        "score": 0.125,
+        "eval_type": "rule",
+        "num_demo": 1,
+        "num_query": 16,
+        "skills": [
+            "Object Recognition and Classification",
+            "Spatial and Temporal Reasoning",
+            "Scene and Event Understanding"
+        ],
+        "input_format": "Videos",
+        "app": "Perception",
+        "output_format": "contextual_formatted_text",
+        "num_input": "video"
+    },
+    {
+        "name": "multi_load_type_prediction_from_plot",
+        "score": 0.5357142857142856,
+        "eval_type": "rule",
+        "num_demo": 1,
+        "num_query": 14,
+        "skills": [
+            "Object Recognition and Classification",
+            "Mathematical and Logical Reasoning"
+        ],
+        "input_format": "Diagrams and Data Visualizations",
+        "app": "Perception",
+        "output_format": "multiple_choice",
+        "num_input": "6-8 images"
+    },
+    {
+        "name": "landmark_check_two_images",
+        "score": 0.7555555555555556,
+        "eval_type": "rule",
+        "num_demo": 1,
+        "num_query": 15,
+        "skills": [
+            "Object Recognition and Classification",
+            "Scene and Event Understanding"
+        ],
+        "input_format": "Artistic and Creative Content",
+        "app": "Knowledge",
+        "output_format": "structured_output",
+        "num_input": "2-3 images"
+    },
+    {
+        "name": "av_vehicle_multiview_counting",
+        "score": 0.26666666666666666,
+        "eval_type": "rule",
+        "num_demo": 1,
+        "num_query": 15,
+        "skills": [
+            "Object Recognition and Classification",
+            "Spatial and Temporal Reasoning",
+            "Scene and Event Understanding"
+        ],
+        "input_format": "Photographs",
+        "app": "Perception",
+        "output_format": "numerical_data",
+        "num_input": "6-8 images"
+    },
+    {
+        "name": "brand_logo_recognition_and_elaboration",
+        "score": 0.8,
+        "eval_type": "rule",
+        "num_demo": 1,
+        "num_query": 25,
+        "skills": [
+            "Object Recognition and Classification",
+            "Domain-Specific Knowledge and Skills"
+        ],
+        "input_format": "Photographs",
+        "app": "Knowledge",
+        "output_format": "structured_output",
+        "num_input": "1-image"
+    },
+    {
+        "name": "coco_object_detection_by_query_property",
+        "score": 0.5798723155227672,
+        "eval_type": "rule",
+        "num_demo": 1,
+        "num_query": 14,
+        "skills": [
+            "Object Recognition and Classification",
+            "Scene and Event Understanding"
+        ],
+        "input_format": "Photographs",
+        "app": "Perception",
+        "output_format": "numerical_data",
+        "num_input": "1-image"
+    },
+    {
+        "name": "youtube_video_info_parsing",
+        "score": 0.5,
+        "eval_type": "rule",
+        "num_demo": 1,
+        "num_query": 14,
+        "skills": [
+            "Text Recognition (OCR)",
+            "Object Recognition and Classification"
+        ],
+        "input_format": "User Interface Screenshots",
+        "app": "Information_Extraction",
+        "output_format": "structured_output",
+        "num_input": "1-image"
+    },
+    {
+        "name": "electricity_plot_future_prediction",
+        "score": 0.9017526315789473,
+        "eval_type": "rule",
+        "num_demo": 1,
+        "num_query": 19,
+        "skills": [
+            "Mathematical and Logical Reasoning",
+            "Spatial and Temporal Reasoning"
+        ],
+        "input_format": "Diagrams and Data Visualizations",
+        "app": "Perception",
+        "output_format": "numerical_data",
+        "num_input": "1-image"
+    },
+    {
+        "name": "figureqa",
+        "score": 0.5714285714285714,
+        "eval_type": "rule",
+        "num_demo": 1,
+        "num_query": 14,
+        "skills": [
+            "Object Recognition and Classification",
+            "Mathematical and Logical Reasoning"
+        ],
+        "input_format": "Diagrams and Data Visualizations",
+        "app": "Perception",
+        "output_format": "multiple_choice",
+        "num_input": "1-image"
+    },
+    {
+        "name": "stock_info_parsing",
+        "score": 0.7478991596638657,
+        "eval_type": "rule",
+        "num_demo": 1,
+        "num_query": 14,
+        "skills": [
+            "Text Recognition (OCR)",
+            "Domain-Specific Knowledge and Skills"
+        ],
+        "input_format": "User Interface Screenshots",
+        "app": "Information_Extraction",
+        "output_format": "structured_output",
+        "num_input": "1-image"
+    },
+    {
+        "name": "clevrer_video_moving_object_property_recognition",
+        "score": 0.5,
+        "eval_type": "rule",
+        "num_demo": 1,
+        "num_query": 16,
+        "skills": [
+            "Object Recognition and Classification",
+            "Spatial and Temporal Reasoning"
+        ],
+        "input_format": "Videos",
+        "app": "Perception",
+        "output_format": "contextual_formatted_text",
+        "num_input": "video"
+    },
+    {
+        "name": "movie_info_parsing",
+        "score": 0.6428571428571429,
+        "eval_type": "rule",
+        "num_demo": 1,
+        "num_query": 14,
+        "skills": [
+            "Text Recognition (OCR)",
+            "Object Recognition and Classification"
+        ],
+        "input_format": "User Interface Screenshots",
+        "app": "Information_Extraction",
+        "output_format": "structured_output",
+        "num_input": "1-image"
+    },
+    {
+        "name": "landmark_recognition_and_qa",
+        "score": 0.5555555555555555,
+        "eval_type": "rule",
+        "num_demo": 1,
+        "num_query": 15,
+        "skills": [
+            "Object Recognition and Classification",
+            "Scene and Event Understanding",
+            "Language Understanding and Generation"
+        ],
+        "input_format": "Photographs",
+        "app": "Knowledge",
+        "output_format": "structured_output",
+        "num_input": "1-image"
+    },
+    {
+        "name": "3d_fragments_understanding",
+        "score": 0.2857142857142857,
+        "eval_type": "rule",
+        "num_demo": 1,
+        "num_query": 14,
+        "skills": [
+            "Object Recognition and Classification",
+            "Spatial and Temporal Reasoning"
+        ],
+        "input_format": "3D Models and Aerial Imagery",
+        "app": "Perception",
+        "output_format": "numerical_data",
+        "num_input": "2-3 images"
+    },
+    {
+        "name": "widerface_face_count_and_event_classification",
+        "score": 0.5357142857142857,
+        "eval_type": "rule",
+        "num_demo": 1,
+        "num_query": 14,
+        "skills": [
+            "Object Recognition and Classification",
+            "Scene and Event Understanding"
+        ],
+        "input_format": "Photographs",
+        "app": "Perception",
+        "output_format": "structured_output",
+        "num_input": "1-image"
+    },
+    {
+        "name": "clevrer_object_existence_video",
+        "score": 0.375,
+        "eval_type": "rule",
+        "num_demo": 1,
+        "num_query": 16,
+        "skills": [
+            "Object Recognition and Classification",
+            "Spatial and Temporal Reasoning"
+        ],
+        "input_format": "Videos",
+        "app": "Perception",
+        "output_format": "multiple_choice",
+        "num_input": "video"
+    },
+    {
+        "name": "geometry_transformation",
+        "score": 0.2857142857142857,
+        "eval_type": "rule",
+        "num_demo": 1,
+        "num_query": 14,
+        "skills": [
+            "Spatial and Temporal Reasoning",
+            "Mathematical and Logical Reasoning"
+        ],
+        "input_format": "Diagrams and Data Visualizations",
+        "app": "Mathematics",
+        "output_format": "contextual_formatted_text",
+        "num_input": "1-image"
+    },
+    {
+        "name": "math_convexity_value_estimation",
+        "score": 0.570486129111546,
+        "eval_type": "rule",
+        "num_demo": 1,
+        "num_query": 15,
+        "skills": [
+            "Mathematical and Logical Reasoning",
+            "Object Recognition and Classification"
+        ],
+        "input_format": "Diagrams and Data Visualizations",
+        "app": "Mathematics",
+        "output_format": "structured_output",
+        "num_input": "1-image"
+    },
+    {
+        "name": "stock_price_future_prediction",
+        "score": 0.7672857142857143,
+        "eval_type": "rule",
+        "num_demo": 1,
+        "num_query": 14,
+        "skills": [
+            "Mathematical and Logical Reasoning",
+            "Spatial and Temporal Reasoning"
+        ],
+        "input_format": "Diagrams and Data Visualizations",
+        "app": "Perception",
+        "output_format": "contextual_formatted_text",
+        "num_input": "1-image"
+    },
+    {
+        "name": "multilingual_game_info_parsing",
+        "score": 0.4642857142857143,
+        "eval_type": "rule",
+        "num_demo": 1,
+        "num_query": 14,
+        "skills": [
+            "Text Recognition (OCR)",
+            "Language Understanding and Generation"
+        ],
+        "input_format": "User Interface Screenshots",
+        "app": "Information_Extraction",
+        "output_format": "structured_output",
+        "num_input": "1-image"
+    },
+    {
+        "name": "vlnqa_egocentric_navigation_video",
+        "score": 0.3125,
+        "eval_type": "rule",
+        "num_demo": 1,
+        "num_query": 16,
+        "skills": [
+            "Spatial and Temporal Reasoning",
+            "Scene and Event Understanding",
+            "Language Understanding and Generation"
+        ],
+        "input_format": "Videos",
+        "app": "Planning",
+        "output_format": "contextual_formatted_text",
+        "num_input": "video"
+    },
+    {
+        "name": "geometry_descriptive",
+        "score": 0.07142857142857142,
+        "eval_type": "rule",
+        "num_demo": 1,
+        "num_query": 14,
+        "skills": [
+            "Spatial and Temporal Reasoning",
+            "Mathematical and Logical Reasoning"
+        ],
+        "input_format": "Diagrams and Data Visualizations",
+        "app": "Mathematics",
+        "output_format": "contextual_formatted_text",
+        "num_input": "1-image"
+    },
+    {
+        "name": "physical_property_reasoning",
+        "score": 0.9285714285714286,
+        "eval_type": "rule",
+        "num_demo": 1,
+        "num_query": 14,
+        "skills": [
+            "Object Recognition and Classification",
+            "Mathematical and Logical Reasoning"
+        ],
+        "input_format": "Photographs",
+        "app": "Perception",
+        "output_format": "contextual_formatted_text",
+        "num_input": "1-image"
+    },
+    {
+        "name": "mnist_pattern",
+        "score": 0.42857142857142855,
+        "eval_type": "rule",
+        "num_demo": 1,
+        "num_query": 14,
+        "skills": [
+            "Object Recognition and Classification",
+            "Mathematical and Logical Reasoning"
+        ],
+        "input_format": "Text-Based Images and Documents",
+        "app": "Planning",
+        "output_format": "numerical_data",
+        "num_input": "1-image"
+    },
+    {
+        "name": "code_programming_test_advanced",
+        "score": 0.0,
+        "eval_type": "rule",
+        "num_demo": 1,
+        "num_query": 18,
+        "skills": [
+            "Text Recognition (OCR)",
+            "Mathematical and Logical Reasoning",
+            "Domain-Specific Knowledge and Skills"
+        ],
+        "input_format": "Text-Based Images and Documents",
+        "app": "Coding",
+        "output_format": "structured_output",
+        "num_input": "2-3 images"
+    },
+    {
+        "name": "multiple_states_identify_americas",
+        "score": 0.21428571428571433,
+        "eval_type": "rule",
+        "num_demo": 1,
+        "num_query": 14,
+        "skills": [
+            "Object Recognition and Classification",
+            "Domain-Specific Knowledge and Skills"
+        ],
+        "input_format": "Diagrams and Data Visualizations",
+        "app": "Knowledge",
+        "output_format": "contextual_formatted_text",
+        "num_input": "1-image"
+    },
+    {
+        "name": "traffic_future_prediction_from_line_plot",
+        "score": 0.7220526315789474,
+        "eval_type": "rule",
+        "num_demo": 1,
+        "num_query": 19,
+        "skills": [
+            "Spatial and Temporal Reasoning",
+            "Mathematical and Logical Reasoning"
+        ],
+        "input_format": "Diagrams and Data Visualizations",
+        "app": "Perception",
+        "output_format": "numerical_data",
+        "num_input": "1-image"
+    },
+    {
+        "name": "single_person_pose_estimation",
+        "score": 0.24564101770091742,
+        "eval_type": "rule",
+        "num_demo": 1,
+        "num_query": 14,
+        "skills": [
+            "Object Recognition and Classification",
+            "Scene and Event Understanding"
+        ],
+        "input_format": "Photographs",
+        "app": "Perception",
+        "output_format": "structured_output",
+        "num_input": "1-image"
+    },
+    {
+        "name": "chart_vqa",
+        "score": 0.5714285714285714,
+        "eval_type": "rule",
+        "num_demo": 1,
+        "num_query": 14,
+        "skills": [
+            "Object Recognition and Classification",
+            "Mathematical and Logical Reasoning"
+        ],
+        "input_format": "Diagrams and Data Visualizations",
+        "app": "Perception",
+        "output_format": "numerical_data",
+        "num_input": "1-image"
+    },
+    {
+        "name": "newspaper_page_parse_and_count",
+        "score": 0.3333333333333333,
+        "eval_type": "rule",
+        "num_demo": 1,
+        "num_query": 15,
+        "skills": [
+            "Object Recognition and Classification",
+            "Text Recognition (OCR)",
+            "Scene and Event Understanding"
+        ],
+        "input_format": "Text-Based Images and Documents",
+        "app": "Information_Extraction",
+        "output_format": "structured_output",
+        "num_input": "1-image"
+    },
+    {
+        "name": "graph_connectivity",
+        "score": 0.6,
+        "eval_type": "rule",
+        "num_demo": 1,
+        "num_query": 15,
+        "skills": [
+            "Spatial and Temporal Reasoning",
+            "Mathematical and Logical Reasoning"
+        ],
+        "input_format": "Diagrams and Data Visualizations",
+        "app": "Mathematics",
+        "output_format": "structured_output",
+        "num_input": "1-image"
+    },
+    {
+        "name": "symbolic_graphics_programs_scalable_vector_graphics",
+        "score": 0.1111111111111111,
+        "eval_type": "rule",
+        "num_demo": 1,
+        "num_query": 18,
+        "skills": [
+            "Object Recognition and Classification",
+            "Text Recognition (OCR)"
+        ],
+        "input_format": "Text-Based Images and Documents",
+        "app": "Coding",
+        "output_format": "contextual_formatted_text",
+        "num_input": "1-image"
+    },
+    {
+        "name": "map_diagram_qa",
+        "score": 0.35714285714285715,
+        "eval_type": "rule",
+        "num_demo": 1,
+        "num_query": 14,
+        "skills": [
+            "Object Recognition and Classification",
+            "Text Recognition (OCR)",
+            "Spatial and Temporal Reasoning"
+        ],
+        "input_format": "Diagrams and Data Visualizations",
+        "app": "Perception",
+        "output_format": "contextual_formatted_text",
+        "num_input": "1-image"
+    },
+    {
+        "name": "graph_maxflow",
+        "score": 0.13333333333333333,
+        "eval_type": "rule",
+        "num_demo": 1,
+        "num_query": 15,
+        "skills": [
+            "Mathematical and Logical Reasoning",
+            "Spatial and Temporal Reasoning"
+        ],
+        "input_format": "Diagrams and Data Visualizations",
+        "app": "Mathematics",
+        "output_format": "numerical_data",
+        "num_input": "1-image"
+    },
+    {
+        "name": "symbolic_graphics_programs_computer_aided_design",
+        "score": 0.5,
+        "eval_type": "rule",
+        "num_demo": 1,
+        "num_query": 14,
+        "skills": [
+            "Mathematical and Logical Reasoning",
+            "Domain-Specific Knowledge and Skills"
+        ],
+        "input_format": "Text-Based Images and Documents",
+        "app": "Coding",
+        "output_format": "contextual_formatted_text",
+        "num_input": "1-image"
+    },
+    {
+        "name": "places365_scene_type_classification",
+        "score": 0.8571428571428571,
+        "eval_type": "rule",
+        "num_demo": 1,
+        "num_query": 14,
+        "skills": [
+            "Object Recognition and Classification",
+            "Scene and Event Understanding"
+        ],
+        "input_format": "Photographs",
+        "app": "Perception",
+        "output_format": "exact_text",
+        "num_input": "1-image"
+    },
+    {
+        "name": "geometry_solid",
+        "score": 0.14285714285714285,
+        "eval_type": "rule",
+        "num_demo": 1,
+        "num_query": 14,
+        "skills": [
+            "Object Recognition and Classification",
+            "Mathematical and Logical Reasoning",
+            "Spatial and Temporal Reasoning"
+        ],
+        "input_format": "Diagrams and Data Visualizations",
+        "app": "Mathematics",
+        "output_format": "contextual_formatted_text",
+        "num_input": "1-image"
+    },
+    {
+        "name": "av_multicamera_tracking_predict_bbox",
+        "score": 0.0017402394162957552,
+        "eval_type": "rule",
+        "num_demo": 1,
+        "num_query": 14,
+        "skills": [
+            "Object Recognition and Classification",
+            "Spatial and Temporal Reasoning"
+        ],
+        "input_format": "Photographs",
+        "app": "Perception",
+        "output_format": "numerical_data",
+        "num_input": "9-image or more"
+    },
+    {
+        "name": "cvbench_adapted_cvbench_relation",
+        "score": 0.42857142857142855,
+        "eval_type": "rule",
+        "num_demo": 1,
+        "num_query": 14,
+        "skills": [
+            "Object Recognition and Classification",
+            "Spatial and Temporal Reasoning"
+        ],
+        "input_format": "Photographs",
+        "app": "Perception",
+        "output_format": "exact_text",
+        "num_input": "1-image"
+    },
+    {
+        "name": "ili_ratio_future_prediction",
+        "score": 0.11578571428571437,
+        "eval_type": "rule",
+        "num_demo": 1,
+        "num_query": 14,
+        "skills": [
+            "Mathematical and Logical Reasoning",
+            "Spatial and Temporal Reasoning"
+        ],
+        "input_format": "Diagrams and Data Visualizations",
+        "app": "Perception",
+        "output_format": "numerical_data",
+        "num_input": "1-image"
+    },
+    {
+        "name": "game_info_parsing",
+        "score": 0.7727272727272726,
+        "eval_type": "rule",
+        "num_demo": 1,
+        "num_query": 14,
+        "skills": [
+            "Object Recognition and Classification",
+            "Text Recognition (OCR)"
+        ],
+        "input_format": "User Interface Screenshots",
+        "app": "Information_Extraction",
+        "output_format": "structured_output",
+        "num_input": "1-image"
+    },
+    {
+        "name": "av_human_multiview_counting",
+        "score": 0.26666666666666666,
+        "eval_type": "rule",
+        "num_demo": 1,
+        "num_query": 15,
+        "skills": [
+            "Object Recognition and Classification",
+            "Spatial and Temporal Reasoning",
+            "Scene and Event Understanding"
+        ],
+        "input_format": "Photographs",
+        "app": "Perception",
+        "output_format": "numerical_data",
+        "num_input": "6-8 images"
+    },
+    {
+        "name": "game_info_retrieval",
+        "score": 0.35714285714285715,
+        "eval_type": "rule",
+        "num_demo": 1,
+        "num_query": 14,
+        "skills": [
+            "Object Recognition and Classification",
+            "Text Recognition (OCR)",
+            "Language Understanding and Generation"
+        ],
+        "input_format": "User Interface Screenshots",
+        "app": "Information_Extraction",
+        "output_format": "contextual_formatted_text",
+        "num_input": "9-image or more"
+    },
+    {
+        "name": "emotion_recognition",
+        "score": 0.6428571428571429,
+        "eval_type": "rule",
+        "num_demo": 1,
+        "num_query": 14,
+        "skills": [
+            "Object Recognition and Classification",
+            "Commonsense and Social Reasoning"
+        ],
+        "input_format": "Photographs",
+        "app": "Knowledge",
+        "output_format": "exact_text",
+        "num_input": "6-8 images"
+    },
+    {
+        "name": "visual_prediction_rater_novel_view_synthesis",
+        "score": 0.07142857142857142,
+        "eval_type": "rule",
+        "num_demo": 1,
+        "num_query": 14,
+        "skills": [
+            "Object Recognition and Classification",
+            "Scene and Event Understanding",
+            "Spatial and Temporal Reasoning"
+        ],
+        "input_format": "3D Models and Aerial Imagery",
+        "app": "Perception",
+        "output_format": "multiple_choice",
+        "num_input": "4-5 images"
+    },
+    {
+        "name": "top_rated_hotel_identification",
+        "score": 0.5714285714285714,
+        "eval_type": "rule",
+        "num_demo": 1,
+        "num_query": 14,
+        "skills": [
+            "Object Recognition and Classification",
+            "Text Recognition (OCR)"
+        ],
+        "input_format": "User Interface Screenshots",
+        "app": "Information_Extraction",
+        "output_format": "contextual_formatted_text",
+        "num_input": "1-image"
+    },
+    {
+        "name": "cvbench_adapted_cvbench_distance",
+        "score": 0.7142857142857143,
+        "eval_type": "rule",
+        "num_demo": 1,
+        "num_query": 14,
+        "skills": [
+            "Object Recognition and Classification",
+            "Spatial and Temporal Reasoning"
+        ],
+        "input_format": "Photographs",
+        "app": "Perception",
+        "output_format": "exact_text",
+        "num_input": "1-image"
+    },
+    {
+        "name": "av_view_identification",
+        "score": 0.2333333333333333,
+        "eval_type": "rule",
+        "num_demo": 1,
+        "num_query": 15,
+        "skills": [
+            "Object Recognition and Classification",
+            "Scene and Event Understanding",
+            "Spatial and Temporal Reasoning"
+        ],
+        "input_format": "Photographs",
+        "app": "Perception",
+        "output_format": "contextual_formatted_text",
+        "num_input": "6-8 images"
+    },
+    {
+        "name": "visual_prediction_rater_openable_part_segmentation",
+        "score": 0.0,
+        "eval_type": "rule",
+        "num_demo": 1,
+        "num_query": 14,
+        "skills": [
+            "Object Recognition and Classification",
+            "Scene and Event Understanding",
+            "Spatial and Temporal Reasoning"
+        ],
+        "input_format": "Photographs",
+        "app": "Perception",
+        "output_format": "multiple_choice",
+        "num_input": "4-5 images"
+    },
+    {
+        "name": "cheapest_flight_identification",
+        "score": 0.0,
+        "eval_type": "rule",
+        "num_demo": 1,
+        "num_query": 14,
+        "skills": [
+            "Object Recognition and Classification",
+            "Text Recognition (OCR)"
+        ],
+        "input_format": "User Interface Screenshots",
+        "app": "Information_Extraction",
+        "output_format": "contextual_formatted_text",
+        "num_input": "6-8 images"
+    },
+    {
+        "name": "pmc_vqa_medical_image_qa",
+        "score": 0.5263157894736842,
+        "eval_type": "rule",
+        "num_demo": 1,
+        "num_query": 19,
+        "skills": [
+            "Object Recognition and Classification",
+            "Domain-Specific Knowledge and Skills"
+        ],
+        "input_format": "Diagrams and Data Visualizations",
+        "app": "Science",
+        "output_format": "contextual_formatted_text",
+        "num_input": "1-image"
+    },
+    {
+        "name": "cvbench_adapted_cvbench_count",
+        "score": 0.35714285714285715,
+        "eval_type": "rule",
+        "num_demo": 1,
+        "num_query": 14,
+        "skills": [
+            "Object Recognition and Classification",
+            "Scene and Event Understanding"
+        ],
+        "input_format": "Photographs",
+        "app": "Perception",
+        "output_format": "numerical_data",
+        "num_input": "1-image"
+    },
+    {
+        "name": "visual_prediction_rater_depth_estimation",
+        "score": 0.21428571428571427,
+        "eval_type": "rule",
+        "num_demo": 1,
+        "num_query": 14,
+        "skills": [
+            "Object Recognition and Classification",
+            "Scene and Event Understanding",
+            "Spatial and Temporal Reasoning"
+        ],
+        "input_format": "Photographs",
+        "app": "Perception",
+        "output_format": "multiple_choice",
+        "num_input": "4-5 images"
+    },
+    {
+        "name": "cvbench_adapted_cvbench_depth",
+        "score": 0.8571428571428571,
+        "eval_type": "rule",
+        "num_demo": 1,
+        "num_query": 14,
+        "skills": [
+            "Object Recognition and Classification",
+            "Spatial and Temporal Reasoning"
+        ],
+        "input_format": "Photographs",
+        "app": "Perception",
+        "output_format": "exact_text",
+        "num_input": "1-image"
+    },
+    {
+        "name": "visual_prediction_rater_plane_segmentation",
+        "score": 0.28888888888888886,
+        "eval_type": "rule",
+        "num_demo": 1,
+        "num_query": 15,
+        "skills": [
+            "Object Recognition and Classification",
+            "Scene and Event Understanding",
+            "Spatial and Temporal Reasoning"
+        ],
+        "input_format": "Photographs",
+        "app": "Perception",
+        "output_format": "multiple_choice",
+        "num_input": "4-5 images"
+    },
+    {
+        "name": "visual_prediction_rater_semantic_segmentation",
+        "score": 0.39583333333333326,
+        "eval_type": "rule",
+        "num_demo": 1,
+        "num_query": 16,
+        "skills": [
+            "Object Recognition and Classification",
+            "Scene and Event Understanding"
+        ],
+        "input_format": "Photographs",
+        "app": "Perception",
+        "output_format": "multiple_choice",
+        "num_input": "4-5 images"
+    },
+    {
+        "name": "video_to_camera_trajectory_retrieval",
+        "score": 0.07142857142857142,
+        "eval_type": "rule",
+        "num_demo": 1,
+        "num_query": 14,
+        "skills": [
+            "Spatial and Temporal Reasoning",
+            "Scene and Event Understanding"
+        ],
+        "input_format": "Videos",
+        "app": "Perception",
+        "output_format": "multiple_choice",
+        "num_input": "video"
+    },
+    {
+        "name": "visual_prediction_rater_panoptic_segmentation",
+        "score": 0.2619047619047619,
+        "eval_type": "rule",
+        "num_demo": 1,
+        "num_query": 14,
+        "skills": [
+            "Object Recognition and Classification",
+            "Scene and Event Understanding",
+            "Spatial and Temporal Reasoning"
+        ],
+        "input_format": "Photographs",
+        "app": "Perception",
+        "output_format": "multiple_choice",
+        "num_input": "4-5 images"
+    },
+    {
+        "name": "human_relationship_reasoning",
+        "score": 0.9375,
+        "eval_type": "rule",
+        "num_demo": 1,
+        "num_query": 16,
+        "skills": [
+            "Commonsense and Social Reasoning",
+            "Object Recognition and Classification"
+        ],
+        "input_format": "Photographs",
+        "app": "Knowledge",
+        "output_format": "contextual_formatted_text",
+        "num_input": "1-image"
+    },
+    {
+        "name": "visual_prediction_rater_3d_assembled_quality_understanding",
+        "score": 0.5714285714285714,
+        "eval_type": "rule",
+        "num_demo": 1,
+        "num_query": 14,
+        "skills": [
+            "Object Recognition and Classification",
+            "Spatial and Temporal Reasoning"
+        ],
+        "input_format": "3D Models and Aerial Imagery",
+        "app": "Perception",
+        "output_format": "multiple_choice",
+        "num_input": "2-3 images"
+    },
+    {
+        "name": "humor_understand_caption_match",
+        "score": 0.6666666666666666,
+        "eval_type": "rule",
+        "num_demo": 1,
+        "num_query": 15,
+        "skills": [
+            "Commonsense and Social Reasoning",
+            "Language Understanding and Generation"
+        ],
+        "input_format": "Artistic and Creative Content",
+        "app": "Knowledge",
+        "output_format": "exact_text",
+        "num_input": "1-image"
+    },
+    {
+        "name": "vln_english_next_step",
+        "score": 0.2,
+        "eval_type": "rule",
+        "num_demo": 1,
+        "num_query": 15,
+        "skills": [
+            "Scene and Event Understanding",
+            "Spatial and Temporal Reasoning",
+            "Language Understanding and Generation"
+        ],
+        "input_format": "Photographs",
+        "app": "Planning",
+        "output_format": "contextual_formatted_text",
+        "num_input": "9-image or more"
+    },
+    {
+        "name": "video_motion_matching_real_3D",
+        "score": 0.2857142857142857,
+        "eval_type": "rule",
+        "num_demo": 1,
+        "num_query": 14,
+        "skills": [
+            "Scene and Event Understanding",
+            "Spatial and Temporal Reasoning"
+        ],
+        "input_format": "Videos",
+        "app": "Perception",
+        "output_format": "multiple_choice",
+        "num_input": "video"
+    },
+    {
+        "name": "logical_reasoning_2D_views_of_3D_shapes",
+        "score": 0.35714285714285715,
+        "eval_type": "rule",
+        "num_demo": 1,
+        "num_query": 14,
+        "skills": [
+            "Spatial and Temporal Reasoning",
+            "Mathematical and Logical Reasoning"
+        ],
+        "input_format": "Diagrams and Data Visualizations",
+        "app": "Planning",
+        "output_format": "structured_output",
+        "num_input": "4-5 images"
+    },
+    {
+        "name": "move_pos_to_pos_hanoi_4_pole",
+        "score": 0.0,
+        "eval_type": "rule",
+        "num_demo": 1,
+        "num_query": 14,
+        "skills": [
+            "Spatial and Temporal Reasoning",
+            "Planning and Decision Making",
+            "Mathematical and Logical Reasoning"
+        ],
+        "input_format": "Diagrams and Data Visualizations",
+        "app": "Planning",
+        "output_format": "structured_output",
+        "num_input": "2-3 images"
+    },
+    {
+        "name": "cam_traj_to_video_selection",
+        "score": 0.6428571428571429,
+        "eval_type": "rule",
+        "num_demo": 1,
+        "num_query": 14,
+        "skills": [
+            "Spatial and Temporal Reasoning",
+            "Scene and Event Understanding"
+        ],
+        "input_format": "Videos",
+        "app": "Perception",
+        "output_format": "contextual_formatted_text",
+        "num_input": "video"
+    },
+    {
+        "name": "photo_sharing_image_retrieval",
+        "score": 0.5714285714285714,
+        "eval_type": "rule",
+        "num_demo": 1,
+        "num_query": 14,
+        "skills": [
+            "Object Recognition and Classification",
+            "Scene and Event Understanding",
+            "Language Understanding and Generation"
+        ],
+        "input_format": "Photographs",
+        "app": "Perception",
+        "output_format": "multiple_choice",
+        "num_input": "9-image or more"
+    },
+    {
+        "name": "visual_prediction_rater_surface_normal_estimation",
+        "score": 0.33333333333333337,
+        "eval_type": "rule",
+        "num_demo": 1,
+        "num_query": 14,
+        "skills": [
+            "Object Recognition and Classification",
+            "Spatial and Temporal Reasoning",
+            "Scene and Event Understanding"
+        ],
+        "input_format": "Photographs",
+        "app": "Perception",
+        "output_format": "multiple_choice",
+        "num_input": "4-5 images"
+    },
+    {
+        "name": "chess_puzzle_single_step",
+        "score": 0.0,
+        "eval_type": "rule",
+        "num_demo": 1,
+        "num_query": 15,
+        "skills": [
+            "Spatial and Temporal Reasoning",
+            "Planning and Decision Making",
+            "Mathematical and Logical Reasoning"
+        ],
+        "input_format": "Photographs",
+        "app": "Planning",
+        "output_format": "exact_text",
+        "num_input": "1-image"
+    },
+    {
+        "name": "geographic_remote_sensing_land_cover",
+        "score": 0.42857142857142855,
+        "eval_type": "rule",
+        "num_demo": 1,
+        "num_query": 14,
+        "skills": [
+            "Object Recognition and Classification",
+            "Scene and Event Understanding"
+        ],
+        "input_format": "3D Models and Aerial Imagery",
+        "app": "Perception",
+        "output_format": "contextual_formatted_text",
+        "num_input": "2-3 images"
+    },
+    {
+        "name": "weather_info_parsing",
+        "score": 0.7539682539682538,
+        "eval_type": "rule",
+        "num_demo": 1,
+        "num_query": 14,
+        "skills": [
+            "Text Recognition (OCR)",
+            "Object Recognition and Classification"
+        ],
+        "input_format": "User Interface Screenshots",
+        "app": "Information_Extraction",
+        "output_format": "structured_output",
+        "num_input": "1-image"
+    },
+    {
+        "name": "egocentric_spatial_reasoning",
+        "score": 0.6666666666666666,
+        "eval_type": "rule",
+        "num_demo": 1,
+        "num_query": 9,
+        "skills": [
+            "Spatial and Temporal Reasoning",
+            "Object Recognition and Classification"
+        ],
+        "input_format": "Photographs",
+        "app": "Perception",
+        "output_format": "contextual_formatted_text",
+        "num_input": "1-image"
+    },
+    {
+        "name": "sta_action_localization_video",
+        "score": 0.3125,
+        "eval_type": "rule",
+        "num_demo": 1,
+        "num_query": 16,
+        "skills": [
+            "Scene and Event Understanding",
+            "Spatial and Temporal Reasoning"
+        ],
+        "input_format": "Videos",
+        "app": "Perception",
+        "output_format": "contextual_formatted_text",
+        "num_input": "video"
+    },
+    {
+        "name": "google_streetview_direction_understanding",
+        "score": 0.35714285714285715,
+        "eval_type": "rule",
+        "num_demo": 1,
+        "num_query": 14,
+        "skills": [
+            "Scene and Event Understanding",
+            "Spatial and Temporal Reasoning"
+        ],
+        "input_format": "Photographs",
+        "app": "Perception",
+        "output_format": "exact_text",
+        "num_input": "2-3 images"
+    },
+    {
+        "name": "snli_ve_visual_entailment",
+        "score": 0.6,
+        "eval_type": "rule",
+        "num_demo": 1,
+        "num_query": 15,
+        "skills": [
+            "Scene and Event Understanding",
+            "Language Understanding and Generation",
+            "Commonsense and Social Reasoning"
+        ],
+        "input_format": "Photographs",
+        "app": "Perception",
+        "output_format": "exact_text",
+        "num_input": "1-image"
+    },
+    {
+        "name": "code_translation_easy",
+        "score": 0.21428571428571427,
+        "eval_type": "rule",
+        "num_demo": 1,
+        "num_query": 14,
+        "skills": [
+            "Text Recognition (OCR)",
+            "Domain-Specific Knowledge and Skills"
+        ],
+        "input_format": "Text-Based Images and Documents",
+        "app": "Coding",
+        "output_format": "structured_output",
+        "num_input": "2-3 images"
+    },
+    {
+        "name": "code_match_problem",
+        "score": 0.7857142857142857,
+        "eval_type": "rule",
+        "num_demo": 1,
+        "num_query": 14,
+        "skills": [
+            "Text Recognition (OCR)",
+            "Domain-Specific Knowledge and Skills"
+        ],
+        "input_format": "Text-Based Images and Documents",
+        "app": "Coding",
+        "output_format": "exact_text",
+        "num_input": "4-5 images"
+    },
+    {
+        "name": "electricity_load_estimate_plot",
+        "score": 0.589357142857143,
+        "eval_type": "rule",
+        "num_demo": 1,
+        "num_query": 14,
+        "skills": [
+            "Mathematical and Logical Reasoning",
+            "Spatial and Temporal Reasoning"
+        ],
+        "input_format": "Diagrams and Data Visualizations",
+        "app": "Perception",
+        "output_format": "numerical_data",
+        "num_input": "1-image"
+    },
+    {
+        "name": "muma_theory_of_mind_belief_of_goal",
+        "score": 0.5333333333333333,
+        "eval_type": "rule",
+        "num_demo": 1,
+        "num_query": 15,
+        "skills": [
+            "Scene and Event Understanding",
+            "Commonsense and Social Reasoning",
+            "Language Understanding and Generation"
+        ],
+        "input_format": "Videos",
+        "app": "Knowledge",
+        "output_format": "contextual_formatted_text",
+        "num_input": "video"
+    },
+    {
+        "name": "code_programming_test_easy",
+        "score": 0.1875,
+        "eval_type": "rule",
+        "num_demo": 1,
+        "num_query": 24,
+        "skills": [
+            "Text Recognition (OCR)",
+            "Domain-Specific Knowledge and Skills"
+        ],
+        "input_format": "Text-Based Images and Documents",
+        "app": "Coding",
+        "output_format": "structured_output",
+        "num_input": "1-image"
+    },
+    {
+        "name": "perception_test_video_action_count",
+        "score": 0.25,
+        "eval_type": "rule",
+        "num_demo": 1,
+        "num_query": 16,
+        "skills": [
+            "Scene and Event Understanding",
+            "Spatial and Temporal Reasoning"
+        ],
+        "input_format": "Videos",
+        "app": "Perception",
+        "output_format": "numerical_data",
+        "num_input": "video"
+    },
+    {
+        "name": "famous_building_recognition",
+        "score": 0.84375,
+        "eval_type": "rule",
+        "num_demo": 1,
+        "num_query": 16,
+        "skills": [
+            "Object Recognition and Classification",
+            "Domain-Specific Knowledge and Skills"
+        ],
+        "input_format": "Photographs",
+        "app": "Knowledge",
+        "output_format": "structured_output",
+        "num_input": "1-image"
+    },
+    {
+        "name": "chess_winner_identification",
+        "score": 0.4,
+        "eval_type": "rule",
+        "num_demo": 1,
+        "num_query": 15,
+        "skills": [
+            "Object Recognition and Classification",
+            "Spatial and Temporal Reasoning",
+            "Mathematical and Logical Reasoning"
+        ],
+        "input_format": "Diagrams and Data Visualizations",
+        "app": "Planning",
+        "output_format": "exact_text",
+        "num_input": "1-image"
+    },
+    {
+        "name": "code_programming_test_hard",
+        "score": 0.14285714285714285,
+        "eval_type": "rule",
+        "num_demo": 1,
+        "num_query": 14,
+        "skills": [
+            "Text Recognition (OCR)",
+            "Mathematical and Logical Reasoning",
+            "Domain-Specific Knowledge and Skills"
+        ],
+        "input_format": "Text-Based Images and Documents",
+        "app": "Coding",
+        "output_format": "structured_output",
+        "num_input": "4-5 images"
+    },
+    {
+        "name": "google_streetview_line_sorting",
+        "score": 0.06666666666666667,
+        "eval_type": "rule",
+        "num_demo": 1,
+        "num_query": 15,
+        "skills": [
+            "Scene and Event Understanding",
+            "Spatial and Temporal Reasoning"
+        ],
+        "input_format": "Photographs",
+        "app": "Perception",
+        "output_format": "structured_output",
+        "num_input": "9-image or more"
+    },
+    {
+        "name": "planning_visual_termes",
+        "score": 0.0,
+        "eval_type": "rule",
+        "num_demo": 1,
+        "num_query": 15,
+        "skills": [
+            "Planning and Decision Making",
+            "Spatial and Temporal Reasoning"
+        ],
+        "input_format": "Artistic and Creative Content",
+        "app": "Planning",
+        "output_format": "structured_output",
+        "num_input": "2-3 images"
+    },
+    {
+        "name": "perception_test_object_shuffle_video",
+        "score": 0.4375,
+        "eval_type": "rule",
+        "num_demo": 1,
+        "num_query": 16,
+        "skills": [
+            "Object Recognition and Classification",
+            "Spatial and Temporal Reasoning",
+            "Scene and Event Understanding"
+        ],
+        "input_format": "Videos",
+        "app": "Planning",
+        "output_format": "multiple_choice",
+        "num_input": "video"
+    },
+    {
+        "name": "hotel_booking_confirmation_parsing",
+        "score": 0.3928571428571428,
+        "eval_type": "rule",
+        "num_demo": 1,
+        "num_query": 14,
+        "skills": [
+            "Text Recognition (OCR)",
+            "Object Recognition and Classification"
+        ],
+        "input_format": "User Interface Screenshots",
+        "app": "Information_Extraction",
+        "output_format": "structured_output",
+        "num_input": "1-image"
+    },
+    {
+        "name": "perception_test_video_character_order",
+        "score": 0.875,
+        "eval_type": "rule",
+        "num_demo": 1,
+        "num_query": 16,
+        "skills": [
+            "Object Recognition and Classification",
+            "Text Recognition (OCR)",
+            "Spatial and Temporal Reasoning"
+        ],
+        "input_format": "Videos",
+        "app": "Planning",
+        "output_format": "contextual_formatted_text",
+        "num_input": "video"
+    },
+    {
+        "name": "clevrer_video_moving_object_count",
+        "score": 0.42857142857142855,
+        "eval_type": "rule",
+        "num_demo": 1,
+        "num_query": 21,
+        "skills": [
+            "Object Recognition and Classification",
+            "Spatial and Temporal Reasoning"
+        ],
+        "input_format": "Videos",
+        "app": "Perception",
+        "output_format": "numerical_data",
+        "num_input": "video"
+    },
+    {
+        "name": "geometry_area",
+        "score": 0.42857142857142855,
+        "eval_type": "rule",
+        "num_demo": 1,
+        "num_query": 14,
+        "skills": [
+            "Mathematical and Logical Reasoning",
+            "Spatial and Temporal Reasoning"
+        ],
+        "input_format": "Diagrams and Data Visualizations",
+        "app": "Mathematics",
+        "output_format": "numerical_data",
+        "num_input": "1-image"
+    },
+    {
+        "name": "star_object_interaction_video",
+        "score": 0.5,
+        "eval_type": "rule",
+        "num_demo": 1,
+        "num_query": 16,
+        "skills": [
+            "Object Recognition and Classification",
+            "Scene and Event Understanding"
+        ],
+        "input_format": "Videos",
+        "app": "Information_Extraction",
+        "output_format": "contextual_formatted_text",
+        "num_input": "video"
+    },
+    {
+        "name": "multiple_states_identify_africa",
+        "score": 0.19999999999999998,
+        "eval_type": "rule",
+        "num_demo": 1,
+        "num_query": 14,
+        "skills": [
+            "Object Recognition and Classification",
+            "Domain-Specific Knowledge and Skills"
+        ],
+        "input_format": "Diagrams and Data Visualizations",
+        "app": "Knowledge",
+        "output_format": "contextual_formatted_text",
+        "num_input": "1-image"
+    },
+    {
+        "name": "ti_fused_vqa_physics",
+        "score": 0.35714285714285715,
+        "eval_type": "rule",
+        "num_demo": 1,
+        "num_query": 14,
+        "skills": [
+            "Mathematical and Logical Reasoning",
+            "Domain-Specific Knowledge and Skills"
+        ],
+        "input_format": "Diagrams and Data Visualizations",
+        "app": "Science",
+        "output_format": "contextual_formatted_text",
+        "num_input": "1-image"
+    },
+    {
+        "name": "multiple_states_identify_europe",
+        "score": 0.15714285714285717,
+        "eval_type": "rule",
+        "num_demo": 1,
+        "num_query": 14,
+        "skills": [
+            "Object Recognition and Classification",
+            "Domain-Specific Knowledge and Skills"
+        ],
+        "input_format": "Diagrams and Data Visualizations",
+        "app": "Knowledge",
+        "output_format": "contextual_formatted_text",
+        "num_input": "1-image"
+    },
+    {
+        "name": "code_error_line_identification",
+        "score": 0.21428571428571427,
+        "eval_type": "rule",
+        "num_demo": 1,
+        "num_query": 14,
+        "skills": [
+            "Domain-Specific Knowledge and Skills",
+            "Mathematical and Logical Reasoning"
+        ],
+        "input_format": "Text-Based Images and Documents",
+        "app": "Coding",
+        "output_format": "multiple_choice",
+        "num_input": "2-3 images"
+    },
+    {
+        "name": "multiple_states_identify_asia",
+        "score": 0.4714285714285714,
+        "eval_type": "rule",
+        "num_demo": 1,
+        "num_query": 14,
+        "skills": [
+            "Object Recognition and Classification",
+            "Domain-Specific Knowledge and Skills"
+        ],
+        "input_format": "Diagrams and Data Visualizations",
+        "app": "Knowledge",
+        "output_format": "contextual_formatted_text",
+        "num_input": "1-image"
+    },
+    {
+        "name": "muma_theory_of_mind_social_goal",
+        "score": 0.4666666666666667,
+        "eval_type": "rule",
+        "num_demo": 1,
+        "num_query": 15,
+        "skills": [
+            "Scene and Event Understanding",
+            "Commonsense and Social Reasoning",
+            "Spatial and Temporal Reasoning"
+        ],
+        "input_format": "Videos",
+        "app": "Knowledge",
+        "output_format": "contextual_formatted_text",
+        "num_input": "video"
+    },
+    {
+        "name": "science_basic_physics",
+        "score": 0.8,
+        "eval_type": "rule",
+        "num_demo": 1,
+        "num_query": 15,
+        "skills": [
+            "Mathematical and Logical Reasoning",
+            "Domain-Specific Knowledge and Skills"
+        ],
+        "input_format": "Diagrams and Data Visualizations",
+        "app": "Science",
+        "output_format": "contextual_formatted_text",
+        "num_input": "1-image"
+    },
+    {
+        "name": "code_programming_extremely_hard",
+        "score": 0.0,
+        "eval_type": "rule",
+        "num_demo": 1,
+        "num_query": 16,
+        "skills": [
+            "Text Recognition (OCR)",
+            "Mathematical and Logical Reasoning",
+            "Planning and Decision Making"
+        ],
+        "input_format": "Text-Based Images and Documents",
+        "app": "Coding",
+        "output_format": "structured_output",
+        "num_input": "4-5 images"
+    },
+    {
+        "name": "animal_pose_estimation",
+        "score": 0.24492301011444534,
+        "eval_type": "rule",
+        "num_demo": 1,
+        "num_query": 14,
+        "skills": [
+            "Object Recognition and Classification",
+            "Scene and Event Understanding"
+        ],
+        "input_format": "Photographs",
+        "app": "Perception",
+        "output_format": "numerical_data",
+        "num_input": "1-image"
+    },
+    {
+        "name": "music_info_retrieval",
+        "score": 0.21428571428571427,
+        "eval_type": "rule",
+        "num_demo": 1,
+        "num_query": 14,
+        "skills": [
+            "Object Recognition and Classification",
+            "Text Recognition (OCR)"
+        ],
+        "input_format": "User Interface Screenshots",
+        "app": "Information_Extraction",
+        "output_format": "contextual_formatted_text",
+        "num_input": "9-image or more"
+    },
+    {
+        "name": "face_identity_matching",
+        "score": 0.7333333333333333,
+        "eval_type": "rule",
+        "num_demo": 1,
+        "num_query": 15,
+        "skills": [
+            "Object Recognition and Classification",
+            "Scene and Event Understanding"
+        ],
+        "input_format": "Photographs",
+        "app": "Perception",
+        "output_format": "numerical_data",
+        "num_input": "4-5 images"
+    },
+    {
+        "name": "action_prediction",
+        "score": 0.5,
+        "eval_type": "rule",
+        "num_demo": 1,
+        "num_query": 14,
+        "skills": [
+            "Spatial and Temporal Reasoning",
+            "Scene and Event Understanding"
+        ],
+        "input_format": "Videos",
+        "app": "Perception",
+        "output_format": "multiple_choice",
+        "num_input": "video"
+    },
+    {
+        "name": "nextqa_mc",
+        "score": 0.8947368421052632,
+        "eval_type": "rule",
+        "num_demo": 1,
+        "num_query": 19,
+        "skills": [
+            "Object Recognition and Classification",
+            "Scene and Event Understanding"
+        ],
+        "input_format": "Videos",
+        "app": "Information_Extraction",
+        "output_format": "multiple_choice",
+        "num_input": "video"
+    },
+    {
+        "name": "mvsa_sentiment_classification",
+        "score": 0.7857142857142857,
+        "eval_type": "rule",
+        "num_demo": 1,
+        "num_query": 14,
+        "skills": [
+            "Object Recognition and Classification",
+            "Language Understanding and Generation",
+            "Commonsense and Social Reasoning"
+        ],
+        "input_format": "Photographs",
+        "app": "Knowledge",
+        "output_format": "multiple_choice",
+        "num_input": "1-image"
+    },
+    {
+        "name": "weather_info_retrieval",
+        "score": 0.21428571428571427,
+        "eval_type": "rule",
+        "num_demo": 1,
+        "num_query": 14,
+        "skills": [
+            "Object Recognition and Classification",
+            "Text Recognition (OCR)"
+        ],
+        "input_format": "User Interface Screenshots",
+        "app": "Information_Extraction",
+        "output_format": "contextual_formatted_text",
+        "num_input": "9-image or more"
+    },
+    {
+        "name": "egocentric_analysis_single_image",
+        "score": 0.5555555555555556,
+        "eval_type": "rule",
+        "num_demo": 1,
+        "num_query": 9,
+        "skills": [
+            "Object Recognition and Classification",
+            "Scene and Event Understanding"
+        ],
+        "input_format": "Photographs",
+        "app": "Perception",
+        "output_format": "exact_text",
+        "num_input": "1-image"
+    },
+    {
+        "name": "video_content_reasoning",
+        "score": 0.8888888888888888,
+        "eval_type": "rule",
+        "num_demo": 1,
+        "num_query": 9,
+        "skills": [
+            "Object Recognition and Classification",
+            "Scene and Event Understanding",
+            "Spatial and Temporal Reasoning"
+        ],
+        "input_format": "Videos",
+        "app": "Information_Extraction",
+        "output_format": "contextual_formatted_text",
+        "num_input": "video"
+    },
+    {
+        "name": "TV_show_info_parsing",
+        "score": 0.753968253968254,
+        "eval_type": "rule",
+        "num_demo": 1,
+        "num_query": 14,
+        "skills": [
+            "Text Recognition (OCR)",
+            "Object Recognition and Classification",
+            "Language Understanding and Generation"
+        ],
+        "input_format": "User Interface Screenshots",
+        "app": "Information_Extraction",
+        "output_format": "structured_output",
+        "num_input": "1-image"
+    },
+    {
+        "name": "mindmap_elements_parsing",
+        "score": 0.21428571428571427,
+        "eval_type": "rule",
+        "num_demo": 1,
+        "num_query": 14,
+        "skills": [
+            "Object Recognition and Classification",
+            "Scene and Event Understanding",
+            "Language Understanding and Generation"
+        ],
+        "input_format": "Artistic and Creative Content",
+        "app": "Perception",
+        "output_format": "structured_output",
+        "num_input": "1-image"
+    },
+    {
+        "name": "rebus",
+        "score": 0.30434782608695654,
+        "eval_type": "rule",
+        "num_demo": 1,
+        "num_query": 23,
+        "skills": [
+            "Text Recognition (OCR)",
+            "Language Understanding and Generation",
+            "Commonsense and Social Reasoning"
+        ],
+        "input_format": "Text-Based Images and Documents",
+        "app": "Planning",
+        "output_format": "contextual_formatted_text",
+        "num_input": "1-image"
+    },
+    {
+        "name": "Forensic_Detection_of_different_images",
+        "score": 0.42857142857142855,
+        "eval_type": "rule",
+        "num_demo": 1,
+        "num_query": 14,
+        "skills": [
+            "Object Recognition and Classification",
+            "Scene and Event Understanding"
+        ],
+        "input_format": "Photographs",
+        "app": "Knowledge",
+        "output_format": "multiple_choice",
+        "num_input": "4-5 images"
+    },
+    {
+        "name": "video_grounding_temporal",
+        "score": 0.3333333333333333,
+        "eval_type": "rule",
+        "num_demo": 1,
+        "num_query": 15,
+        "skills": [
+            "Scene and Event Understanding",
+            "Spatial and Temporal Reasoning"
+        ],
+        "input_format": "Videos",
+        "app": "Perception",
+        "output_format": "multiple_choice",
+        "num_input": "video"
+    },
+    {
+        "name": "paper_vqa",
+        "score": 0.07142857142857142,
+        "eval_type": "rule",
+        "num_demo": 1,
+        "num_query": 14,
+        "skills": [
+            "Text Recognition (OCR)",
+            "Mathematical and Logical Reasoning"
+        ],
+        "input_format": "Text-Based Images and Documents",
+        "app": "Perception",
+        "output_format": "contextual_formatted_text",
+        "num_input": "1-image"
+    },
+    {
+        "name": "poetry_petrarchian_sonnet_optional_meter",
+        "score": 0.0,
+        "eval_type": "rule",
+        "num_demo": 0,
+        "num_query": 15,
+        "skills": [
+            "Language Understanding and Generation",
+            "Object Recognition and Classification"
+        ],
+        "input_format": "Photographs",
+        "app": "Knowledge",
+        "output_format": "open_ended_output",
+        "num_input": "1-image"
+    },
+    {
+        "name": "poetry_acrostic_alliteration",
+        "score": 0.13333333333333333,
+        "eval_type": "rule",
+        "num_demo": 0,
+        "num_query": 15,
+        "skills": [
+            "Language Understanding and Generation",
+            "Object Recognition and Classification"
+        ],
+        "input_format": "Photographs",
+        "app": "Knowledge",
+        "output_format": "open_ended_output",
+        "num_input": "1-image"
+    },
+    {
+        "name": "vln_identify_robot",
+        "score": 0.5333333333333333,
+        "eval_type": "rule",
+        "num_demo": 1,
+        "num_query": 15,
+        "skills": [
+            "Object Recognition and Classification",
+            "Scene and Event Understanding",
+            "Spatial and Temporal Reasoning"
+        ],
+        "input_format": "Photographs",
+        "app": "Planning",
+        "output_format": "exact_text",
+        "num_input": "9-image or more"
+    },
+    {
+        "name": "poetry_limerick",
+        "score": 0.4666666666666667,
+        "eval_type": "rule",
+        "num_demo": 0,
+        "num_query": 15,
+        "skills": [
+            "Object Recognition and Classification",
+            "Language Understanding and Generation"
+        ],
+        "input_format": "Photographs",
+        "app": "Knowledge",
+        "output_format": "open_ended_output",
+        "num_input": "1-image"
+    },
+    {
+        "name": "poetry_acrostic",
+        "score": 0.5333333333333333,
+        "eval_type": "rule",
+        "num_demo": 0,
+        "num_query": 15,
+        "skills": [
+            "Object Recognition and Classification",
+            "Language Understanding and Generation"
+        ],
+        "input_format": "Photographs",
+        "app": "Knowledge",
+        "output_format": "open_ended_output",
+        "num_input": "1-image"
+    },
+    {
+        "name": "app_interactive_operations_excel",
+        "score": 0.2857142857142857,
+        "eval_type": "rule",
+        "num_demo": 1,
+        "num_query": 14,
+        "skills": [
+            "Object Recognition and Classification",
+            "Language Understanding and Generation"
+        ],
+        "input_format": "User Interface Screenshots",
+        "app": "Planning",
+        "output_format": "multiple_choice",
+        "num_input": "9-image or more"
+    },
+    {
+        "name": "constrained_generation_contain_length",
+        "score": 0.4,
+        "eval_type": "rule",
+        "num_demo": 0,
+        "num_query": 15,
+        "skills": [
+            "Object Recognition and Classification",
+            "Language Understanding and Generation"
+        ],
+        "input_format": "Photographs",
+        "app": "Perception",
+        "output_format": "open_ended_output",
+        "num_input": "1-image"
+    },
+    {
+        "name": "constrained_generation_contain_contain_length",
+        "score": 0.9333333333333333,
+        "eval_type": "rule",
+        "num_demo": 0,
+        "num_query": 15,
+        "skills": [
+            "Object Recognition and Classification",
+            "Language Understanding and Generation"
+        ],
+        "input_format": "Photographs",
+        "app": "Perception",
+        "output_format": "open_ended_output",
+        "num_input": "1-image"
+    },
+    {
+        "name": "MMSoc_HatefulMemes",
+        "score": 0.7857142857142857,
+        "eval_type": "rule",
+        "num_demo": 1,
+        "num_query": 14,
+        "skills": [
+            "Ethical and Safety Reasoning",
+            "Commonsense and Social Reasoning",
+            "Text Recognition (OCR)"
+        ],
+        "input_format": "Photographs",
+        "app": "Knowledge",
+        "output_format": "multiple_choice",
+        "num_input": "1-image"
+    },
+    {
+        "name": "constrained_generation_contain_repeat_length",
+        "score": 0.2,
+        "eval_type": "rule",
+        "num_demo": 0,
+        "num_query": 15,
+        "skills": [
+            "Object Recognition and Classification",
+            "Language Understanding and Generation"
+        ],
+        "input_format": "Photographs",
+        "app": "Perception",
+        "output_format": "open_ended_output",
+        "num_input": "1-image"
+    },
+    {
+        "name": "constrained_generation_contain_position_length",
+        "score": 0.8,
+        "eval_type": "rule",
+        "num_demo": 0,
+        "num_query": 15,
+        "skills": [
+            "Object Recognition and Classification",
+            "Language Understanding and Generation"
+        ],
+        "input_format": "Photographs",
+        "app": "Perception",
+        "output_format": "open_ended_output",
+        "num_input": "1-image"
+    },
+    {
+        "name": "constrained_generation_multi_contain_repeat_position_only_length",
+        "score": 0.13333333333333333,
+        "eval_type": "rule",
+        "num_demo": 0,
+        "num_query": 15,
+        "skills": [
+            "Object Recognition and Classification",
+            "Language Understanding and Generation",
+            "Spatial and Temporal Reasoning"
+        ],
+        "input_format": "Photographs",
+        "app": "Perception",
+        "output_format": "open_ended_output",
+        "num_input": "2-3 images"
+    },
+    {
+        "name": "constrained_generation_contain_position_images",
+        "score": 0.6,
+        "eval_type": "rule",
+        "num_demo": 0,
+        "num_query": 15,
+        "skills": [
+            "Object Recognition and Classification",
+            "Language Understanding and Generation",
+            "Scene and Event Understanding"
+        ],
+        "input_format": "Photographs",
+        "app": "Perception",
+        "output_format": "open_ended_output",
+        "num_input": "2-3 images"
+    },
+    {
+        "name": "graph_theory",
+        "score": 0.35714285714285715,
+        "eval_type": "rule",
+        "num_demo": 1,
+        "num_query": 14,
+        "skills": [
+            "Spatial and Temporal Reasoning",
+            "Mathematical and Logical Reasoning"
+        ],
+        "input_format": "Diagrams and Data Visualizations",
+        "app": "Mathematics",
+        "output_format": "exact_text",
+        "num_input": "1-image"
+    },
+    {
+        "name": "constrained_generation_xor_images",
+        "score": 0.7333333333333333,
+        "eval_type": "rule",
+        "num_demo": 0,
+        "num_query": 15,
+        "skills": [
+            "Object Recognition and Classification",
+            "Language Understanding and Generation"
+        ],
+        "input_format": "Photographs",
+        "app": "Perception",
+        "output_format": "open_ended_output",
+        "num_input": "2-3 images"
+    },
+    {
+        "name": "constrained_generation_multi_contain_repeat",
+        "score": 0.0,
+        "eval_type": "rule",
+        "num_demo": 0,
+        "num_query": 15,
+        "skills": [
+            "Language Understanding and Generation",
+            "Object Recognition and Classification"
+        ],
+        "input_format": "Photographs",
+        "app": "Perception",
+        "output_format": "open_ended_output",
+        "num_input": "2-3 images"
+    },
+    {
+        "name": "constrained_generation_contain_contain_images",
+        "score": 0.8666666666666667,
+        "eval_type": "rule",
+        "num_demo": 0,
+        "num_query": 15,
+        "skills": [
+            "Object Recognition and Classification",
+            "Language Understanding and Generation"
+        ],
+        "input_format": "Photographs",
+        "app": "Perception",
+        "output_format": "open_ended_output",
+        "num_input": "2-3 images"
+    },
+    {
+        "name": "vizwiz_quality_accessment_for_blind",
+        "score": 0.14285714285714285,
+        "eval_type": "rule",
+        "num_demo": 1,
+        "num_query": 14,
+        "skills": [
+            "Object Recognition and Classification",
+            "Scene and Event Understanding"
+        ],
+        "input_format": "Photographs",
+        "app": "Metrics",
+        "output_format": "contextual_formatted_text",
+        "num_input": "6-8 images"
+    },
+    {
+        "name": "ball_cup_swap_3",
+        "score": 0.2857142857142857,
+        "eval_type": "rule",
+        "num_demo": 1,
+        "num_query": 14,
+        "skills": [
+            "Object Recognition and Classification",
+            "Spatial and Temporal Reasoning"
+        ],
+        "input_format": "Artistic and Creative Content",
+        "app": "Planning",
+        "output_format": "multiple_choice",
+        "num_input": "9-image or more"
+    },
+    {
+        "name": "vln_hindi_next_step",
+        "score": 0.13333333333333333,
+        "eval_type": "rule",
+        "num_demo": 1,
+        "num_query": 15,
+        "skills": [
+            "Object Recognition and Classification",
+            "Scene and Event Understanding",
+            "Spatial and Temporal Reasoning",
+            "Language Understanding and Generation"
+        ],
+        "input_format": "Photographs",
+        "app": "Planning",
+        "output_format": "contextual_formatted_text",
+        "num_input": "9-image or more"
+    },
+    {
+        "name": "video_motion_matching_3D_real",
+        "score": 0.3333333333333333,
+        "eval_type": "rule",
+        "num_demo": 1,
+        "num_query": 15,
+        "skills": [
+            "Scene and Event Understanding",
+            "Spatial and Temporal Reasoning"
+        ],
+        "input_format": "Videos",
+        "app": "Perception",
+        "output_format": "multiple_choice",
+        "num_input": "video"
+    },
+    {
+        "name": "chess_sygyzy_endgames",
+        "score": 0.07619047619047618,
+        "eval_type": "rule",
+        "num_demo": 1,
+        "num_query": 14,
+        "skills": [
+            "Planning and Decision Making",
+            "Mathematical and Logical Reasoning"
+        ],
+        "input_format": "User Interface Screenshots",
+        "app": "Planning",
+        "output_format": "exact_text",
+        "num_input": "1-image"
+    },
+    {
+        "name": "medical_counting_lymphocytes",
+        "score": 0.07142857142857142,
+        "eval_type": "rule",
+        "num_demo": 1,
+        "num_query": 14,
+        "skills": [
+            "Object Recognition and Classification",
+            "Domain-Specific Knowledge and Skills"
+        ],
+        "input_format": "Text-Based Images and Documents",
+        "app": "Science",
+        "output_format": "numerical_data",
+        "num_input": "1-image"
+    },
+    {
+        "name": "LaTeX_complex_formula_convertion",
+        "score": 0.0,
+        "eval_type": "rule",
+        "num_demo": 1,
+        "num_query": 17,
+        "skills": [
+            "Text Recognition (OCR)",
+            "Mathematical and Logical Reasoning"
+        ],
+        "input_format": "Text-Based Images and Documents",
+        "app": "Coding",
+        "output_format": "structured_output",
+        "num_input": "1-image"
+    },
+    {
+        "name": "TV_show_retrieval_by_character",
+        "score": 0.9285714285714286,
+        "eval_type": "rule",
+        "num_demo": 1,
+        "num_query": 14,
+        "skills": [
+            "Object Recognition and Classification",
+            "Text Recognition (OCR)",
+            "Language Understanding and Generation"
+        ],
+        "input_format": "User Interface Screenshots",
+        "app": "Information_Extraction",
+        "output_format": "contextual_formatted_text",
+        "num_input": "9-image or more"
+    },
+    {
+        "name": "medical_image_artifacts_indentification",
+        "score": 0.35714285714285715,
+        "eval_type": "rule",
+        "num_demo": 1,
+        "num_query": 14,
+        "skills": [
+            "Object Recognition and Classification",
+            "Domain-Specific Knowledge and Skills"
+        ],
+        "input_format": "Photographs",
+        "app": "Science",
+        "output_format": "exact_text",
+        "num_input": "1-image"
+    },
+    {
+        "name": "planning_screenshot_grippers",
+        "score": 0.2,
+        "eval_type": "rule",
+        "num_demo": 1,
+        "num_query": 15,
+        "skills": [
+            "Planning and Decision Making",
+            "Spatial and Temporal Reasoning"
+        ],
+        "input_format": "Text-Based Images and Documents",
+        "app": "Planning",
+        "output_format": "structured_output",
+        "num_input": "1-image"
+    },
+    {
+        "name": "planning_visual_storage",
+        "score": 0.0,
+        "eval_type": "rule",
+        "num_demo": 1,
+        "num_query": 15,
+        "skills": [
+            "Planning and Decision Making",
+            "Spatial and Temporal Reasoning"
+        ],
+        "input_format": "Diagrams and Data Visualizations",
+        "app": "Planning",
+        "output_format": "structured_output",
+        "num_input": "1-image"
+    },
+    {
+        "name": "music_sheet_author",
+        "score": 0.25,
+        "eval_type": "rule",
+        "num_demo": 1,
+        "num_query": 16,
+        "skills": [
+            "Object Recognition and Classification",
+            "Domain-Specific Knowledge and Skills"
+        ],
+        "input_format": "Text-Based Images and Documents",
+        "app": "Knowledge",
+        "output_format": "exact_text",
+        "num_input": "2-3 images"
+    },
+    {
+        "name": "music_sheet_note_count",
+        "score": 0.058823529411764705,
+        "eval_type": "rule",
+        "num_demo": 1,
+        "num_query": 17,
+        "skills": [
+            "Object Recognition and Classification",
+            "Text Recognition (OCR)",
+            "Mathematical and Logical Reasoning"
+        ],
+        "input_format": "Text-Based Images and Documents",
+        "app": "Knowledge",
+        "output_format": "numerical_data",
+        "num_input": "1-image"
+    },
+    {
+        "name": "medical_abdomen_MRI_organ_recognition",
+        "score": 0.27380952380952384,
+        "eval_type": "rule",
+        "num_demo": 1,
+        "num_query": 14,
+        "skills": [
+            "Object Recognition and Classification",
+            "Domain-Specific Knowledge and Skills"
+        ],
+        "input_format": "Text-Based Images and Documents",
+        "app": "Science",
+        "output_format": "contextual_formatted_text",
+        "num_input": "4-5 images"
+    },
+    {
+        "name": "medical_cell_recognition",
+        "score": 0.5,
+        "eval_type": "rule",
+        "num_demo": 1,
+        "num_query": 14,
+        "skills": [
+            "Object Recognition and Classification",
+            "Domain-Specific Knowledge and Skills"
+        ],
+        "input_format": "Photographs",
+        "app": "Science",
+        "output_format": "exact_text",
+        "num_input": "1-image"
+    },
+    {
+        "name": "app_interactive_operations_twitter",
+        "score": 0.21428571428571427,
+        "eval_type": "rule",
+        "num_demo": 1,
+        "num_query": 14,
+        "skills": [
+            "Object Recognition and Classification",
+            "Scene and Event Understanding"
+        ],
+        "input_format": "User Interface Screenshots",
+        "app": "Planning",
+        "output_format": "multiple_choice",
+        "num_input": "9-image or more"
+    },
+    {
+        "name": "panel_images_single_question",
+        "score": 0.7857142857142857,
+        "eval_type": "rule",
+        "num_demo": 1,
+        "num_query": 14,
+        "skills": [
+            "Object Recognition and Classification",
+            "Scene and Event Understanding"
+        ],
+        "input_format": "Artistic and Creative Content",
+        "app": "Perception",
+        "output_format": "multiple_choice",
+        "num_input": "1-image"
+    },
+    {
+        "name": "app_interactive_operations_ppt",
+        "score": 0.5714285714285714,
+        "eval_type": "rule",
+        "num_demo": 1,
+        "num_query": 14,
+        "skills": [
+            "Object Recognition and Classification",
+            "Language Understanding and Generation"
+        ],
+        "input_format": "User Interface Screenshots",
+        "app": "Planning",
+        "output_format": "multiple_choice",
+        "num_input": "9-image or more"
+    },
+    {
+        "name": "panel_images_multi_question",
+        "score": 0.6666666666666666,
+        "eval_type": "rule",
+        "num_demo": 1,
+        "num_query": 14,
+        "skills": [
+            "Object Recognition and Classification",
+            "Scene and Event Understanding"
+        ],
+        "input_format": "Artistic and Creative Content",
+        "app": "Perception",
+        "output_format": "structured_output",
+        "num_input": "1-image"
+    },
+    {
+        "name": "crossword_mini_5x5",
+        "score": 0.35000000000000003,
+        "eval_type": "rule",
+        "num_demo": 1,
+        "num_query": 14,
+        "skills": [
+            "Language Understanding and Generation",
+            "Mathematical and Logical Reasoning"
+        ],
+        "input_format": "Text-Based Images and Documents",
+        "app": "Planning",
+        "output_format": "structured_output",
+        "num_input": "1-image"
+    },
+    {
+        "name": "app_interactive_operations_alipay",
+        "score": 0.4117647058823529,
+        "eval_type": "rule",
+        "num_demo": 1,
+        "num_query": 17,
+        "skills": [
+            "Object Recognition and Classification",
+            "Planning and Decision Making"
+        ],
+        "input_format": "User Interface Screenshots",
+        "app": "Planning",
+        "output_format": "multiple_choice",
+        "num_input": "9-image or more"
+    },
+    {
+        "name": "maze_2d_8x8",
+        "score": 0.0,
+        "eval_type": "rule",
+        "num_demo": 1,
+        "num_query": 14,
+        "skills": [
+            "Spatial and Temporal Reasoning",
+            "Planning and Decision Making"
+        ],
+        "input_format": "Diagrams and Data Visualizations",
+        "app": "Planning",
+        "output_format": "exact_text",
+        "num_input": "1-image"
+    },
+    {
+        "name": "table_understanding_complex_question_answering",
+        "score": 0.2857142857142857,
+        "eval_type": "rule",
+        "num_demo": 1,
+        "num_query": 14,
+        "skills": [
+            "Text Recognition (OCR)",
+            "Mathematical and Logical Reasoning"
+        ],
+        "input_format": "Diagrams and Data Visualizations",
+        "app": "Perception",
+        "output_format": "contextual_formatted_text",
+        "num_input": "1-image"
+    },
+    {
+        "name": "reward_models_I2T_reward",
+        "score": 0.5,
+        "eval_type": "rule",
+        "num_demo": 1,
+        "num_query": 14,
+        "skills": [
+            "Object Recognition and Classification",
+            "Language Understanding and Generation",
+            "Commonsense and Social Reasoning"
+        ],
+        "input_format": "Photographs",
+        "app": "Metrics",
+        "output_format": "exact_text",
+        "num_input": "1-image"
+    },
+    {
+        "name": "medical_retrieval_given_surgeon_activity",
+        "score": 0.5714285714285714,
+        "eval_type": "rule",
+        "num_demo": 1,
+        "num_query": 14,
+        "skills": [
+            "Object Recognition and Classification",
+            "Scene and Event Understanding",
+            "Domain-Specific Knowledge and Skills"
+        ],
+        "input_format": "Videos",
+        "app": "Science",
+        "output_format": "multiple_choice",
+        "num_input": "video"
+    },
+    {
+        "name": "shape_composition_shapes",
+        "score": 0.3137755102040816,
+        "eval_type": "rule",
+        "num_demo": 1,
+        "num_query": 14,
+        "skills": [
+            "Object Recognition and Classification",
+            "Spatial and Temporal Reasoning"
+        ],
+        "input_format": "Artistic and Creative Content",
+        "app": "Perception",
+        "output_format": "structured_output",
+        "num_input": "1-image"
+    },
+    {
+        "name": "shape_composition_colours",
+        "score": 0.2828798185941043,
+        "eval_type": "rule",
+        "num_demo": 1,
+        "num_query": 14,
+        "skills": [
+            "Object Recognition and Classification"
+        ],
+        "input_format": "Artistic and Creative Content",
+        "app": "Perception",
+        "output_format": "structured_output",
+        "num_input": "1-image"
+    },
+    {
+        "name": "MMSoc_Misinformation_PolitiFact",
+        "score": 0.8571428571428571,
+        "eval_type": "rule",
+        "num_demo": 1,
+        "num_query": 14,
+        "skills": [
+            "Commonsense and Social Reasoning",
+            "Language Understanding and Generation",
+            "Ethical and Safety Reasoning"
+        ],
+        "input_format": "Photographs",
+        "app": "Knowledge",
+        "output_format": "multiple_choice",
+        "num_input": "1-image"
+    },
+    {
+        "name": "MMSoc_Memotion",
+        "score": 0.6000000000000001,
+        "eval_type": "rule",
+        "num_demo": 1,
+        "num_query": 17,
+        "skills": [
+            "Commonsense and Social Reasoning",
+            "Language Understanding and Generation"
+        ],
+        "input_format": "Artistic and Creative Content",
+        "app": "Knowledge",
+        "output_format": "structured_output",
+        "num_input": "1-image"
+    },
+    {
+        "name": "MFC_Bench_check_veracity",
+        "score": 0.7857142857142857,
+        "eval_type": "rule",
+        "num_demo": 1,
+        "num_query": 14,
+        "skills": [
+            "Object Recognition and Classification",
+            "Scene and Event Understanding",
+            "Text Recognition (OCR)"
+        ],
+        "input_format": "Diagrams and Data Visualizations",
+        "app": "Knowledge",
+        "output_format": "multiple_choice",
+        "num_input": "1-image"
+    },
+    {
+        "name": "MMSoc_Misinformation_GossipCop",
+        "score": 0.7857142857142857,
+        "eval_type": "rule",
+        "num_demo": 1,
+        "num_query": 14,
+        "skills": [
+            "Commonsense and Social Reasoning",
+            "Language Understanding and Generation",
+            "Object Recognition and Classification",
+            "Scene and Event Understanding",
+            "Ethical and Safety Reasoning"
+        ],
+        "input_format": "Photographs",
+        "app": "Knowledge",
+        "output_format": "multiple_choice",
+        "num_input": "1-image"
+    },
+    {
+        "name": "waybill_number_sequence_extraction",
+        "score": 0.21428571428571427,
+        "eval_type": "rule",
+        "num_demo": 1,
+        "num_query": 14,
+        "skills": [
+            "Text Recognition (OCR)"
+        ],
+        "input_format": "Text-Based Images and Documents",
+        "app": "Perception",
+        "output_format": "contextual_formatted_text",
+        "num_input": "1-image"
+    },
+    {
+        "name": "ocr_math_MATH",
+        "score": 0.5333333333333333,
+        "eval_type": "rule",
+        "num_demo": 1,
+        "num_query": 15,
+        "skills": [
+            "Text Recognition (OCR)",
+            "Mathematical and Logical Reasoning"
+        ],
+        "input_format": "Text-Based Images and Documents",
+        "app": "Mathematics",
+        "output_format": "numerical_data",
+        "num_input": "1-image"
+    },
+    {
+        "name": "funsd_document_qa",
+        "score": 0.6428571428571429,
+        "eval_type": "rule",
+        "num_demo": 1,
+        "num_query": 14,
+        "skills": [
+            "Text Recognition (OCR)",
+            "Language Understanding and Generation"
+        ],
+        "input_format": "Text-Based Images and Documents",
+        "app": "Perception",
+        "output_format": "contextual_formatted_text",
+        "num_input": "1-image"
+    },
+    {
+        "name": "medical_abdomen_endscopy_organ_recognition",
+        "score": 0.09523809523809523,
+        "eval_type": "rule",
+        "num_demo": 1,
+        "num_query": 14,
+        "skills": [
+            "Object Recognition and Classification",
+            "Domain-Specific Knowledge and Skills"
+        ],
+        "input_format": "Photographs",
+        "app": "Science",
+        "output_format": "contextual_formatted_text",
+        "num_input": "4-5 images"
+    },
+    {
+        "name": "reward_models_T2I_reward",
+        "score": 0.5,
+        "eval_type": "rule",
+        "num_demo": 1,
+        "num_query": 14,
+        "skills": [
+            "Object Recognition and Classification",
+            "Scene and Event Understanding"
+        ],
+        "input_format": "Photographs",
+        "app": "Metrics",
+        "output_format": "exact_text",
+        "num_input": "2-3 images"
+    },
+    {
+        "name": "graph_hamiltonian_path",
+        "score": 0.3654761904761905,
+        "eval_type": "rule",
+        "num_demo": 1,
+        "num_query": 14,
+        "skills": [
+            "Mathematical and Logical Reasoning",
+            "Spatial and Temporal Reasoning"
+        ],
+        "input_format": "Diagrams and Data Visualizations",
+        "app": "Mathematics",
+        "output_format": "structured_output",
+        "num_input": "6-8 images"
+    },
+    {
+        "name": "vln_tegulu_next_step",
+        "score": 0.26666666666666666,
+        "eval_type": "rule",
+        "num_demo": 1,
+        "num_query": 15,
+        "skills": [
+            "Object Recognition and Classification",
+            "Scene and Event Understanding",
+            "Spatial and Temporal Reasoning",
+            "Language Understanding and Generation"
+        ],
+        "input_format": "Photographs",
+        "app": "Planning",
+        "output_format": "structured_output",
+        "num_input": "9-image or more"
+    },
+    {
+        "name": "medical_blood_vessels_recognition",
+        "score": 0.6785714285714286,
+        "eval_type": "rule",
+        "num_demo": 1,
+        "num_query": 14,
+        "skills": [
+            "Object Recognition and Classification",
+            "Domain-Specific Knowledge and Skills"
+        ],
+        "input_format": "Text-Based Images and Documents",
+        "app": "Science",
+        "output_format": "structured_output",
+        "num_input": "1-image"
+    },
+    {
+        "name": "cultural_vqa",
+        "score": 0.3333333333333333,
+        "eval_type": "rule",
+        "num_demo": 1,
+        "num_query": 15,
+        "skills": [
+            "Object Recognition and Classification",
+            "Commonsense and Social Reasoning"
+        ],
+        "input_format": "Photographs",
+        "app": "Knowledge",
+        "output_format": "contextual_formatted_text",
+        "num_input": "1-image"
+    },
+    {
+        "name": "character_recognition_in_TV_shows",
+        "score": 0.35714285714285715,
+        "eval_type": "rule",
+        "num_demo": 1,
+        "num_query": 14,
+        "skills": [
+            "Object Recognition and Classification",
+            "Language Understanding and Generation"
+        ],
+        "input_format": "User Interface Screenshots",
+        "app": "Knowledge",
+        "output_format": "contextual_formatted_text",
+        "num_input": "1-image"
+    },
+    {
+        "name": "code_output_result",
+        "score": 0.5,
+        "eval_type": "rule",
+        "num_demo": 1,
+        "num_query": 14,
+        "skills": [
+            "Text Recognition (OCR)",
+            "Mathematical and Logical Reasoning",
+            "Domain-Specific Knowledge and Skills"
+        ],
+        "input_format": "Text-Based Images and Documents",
+        "app": "Coding",
+        "output_format": "exact_text",
+        "num_input": "4-5 images"
+    },
+    {
+        "name": "kvqa_knowledge_aware_qa",
+        "score": 0.47368421052631576,
+        "eval_type": "rule",
+        "num_demo": 1,
+        "num_query": 19,
+        "skills": [
+            "Object Recognition and Classification",
+            "Language Understanding and Generation",
+            "Commonsense and Social Reasoning"
+        ],
+        "input_format": "Photographs",
+        "app": "Knowledge",
+        "output_format": "contextual_formatted_text",
+        "num_input": "1-image"
+    },
+    {
+        "name": "tqa_textbook_qa",
+        "score": 0.7857142857142857,
+        "eval_type": "rule",
+        "num_demo": 1,
+        "num_query": 14,
+        "skills": [
+            "Object Recognition and Classification",
+            "Scene and Event Understanding",
+            "Text Recognition (OCR)"
+        ],
+        "input_format": "Diagrams and Data Visualizations",
+        "app": "Science",
+        "output_format": "contextual_formatted_text",
+        "num_input": "1-image"
+    },
+    {
+        "name": "arxiv_vqa",
+        "score": 0.7857142857142857,
+        "eval_type": "rule",
+        "num_demo": 1,
+        "num_query": 14,
+        "skills": [
+            "Object Recognition and Classification",
+            "Text Recognition (OCR)",
+            "Mathematical and Logical Reasoning",
+            "Language Understanding and Generation"
+        ],
+        "input_format": "Diagrams and Data Visualizations",
+        "app": "Science",
+        "output_format": "multiple_choice",
+        "num_input": "1-image"
+    },
+    {
+        "name": "painting_QA",
+        "score": 0.7857142857142857,
+        "eval_type": "rule",
+        "num_demo": 1,
+        "num_query": 14,
+        "skills": [
+            "Object Recognition and Classification",
+            "Domain-Specific Knowledge and Skills"
+        ],
+        "input_format": "Artistic and Creative Content",
+        "app": "Knowledge",
+        "output_format": "exact_text",
+        "num_input": "4-5 images"
+    },
+    {
+        "name": "realworld_qa_en2cn",
+        "score": 0.6428571428571429,
+        "eval_type": "rule",
+        "num_demo": 1,
+        "num_query": 14,
+        "skills": [
+            "Object Recognition and Classification",
+            "Scene and Event Understanding",
+            "Language Understanding and Generation"
+        ],
+        "input_format": "Photographs",
+        "app": "Information_Extraction",
+        "output_format": "contextual_formatted_text",
+        "num_input": "1-image"
+    },
+    {
+        "name": "MMMU_physics_chemistry_MCQ",
+        "score": 0.5714285714285714,
+        "eval_type": "rule",
+        "num_demo": 1,
+        "num_query": 14,
+        "skills": [
+            "Mathematical and Logical Reasoning",
+            "Domain-Specific Knowledge and Skills"
+        ],
+        "input_format": "Diagrams and Data Visualizations",
+        "app": "Science",
+        "output_format": "exact_text",
+        "num_input": "1-image"
+    },
+    {
+        "name": "code_add_tag",
+        "score": 0.4,
+        "eval_type": "rule",
+        "num_demo": 1,
+        "num_query": 15,
+        "skills": [
+            "Text Recognition (OCR)",
+            "Domain-Specific Knowledge and Skills"
+        ],
+        "input_format": "Text-Based Images and Documents",
+        "app": "Coding",
+        "output_format": "contextual_formatted_text",
+        "num_input": "2-3 images"
+    },
+    {
+        "name": "handwritten_math_expression_extraction",
+        "score": 0.35714285714285715,
+        "eval_type": "rule",
+        "num_demo": 1,
+        "num_query": 14,
+        "skills": [
+            "Text Recognition (OCR)",
+            "Mathematical and Logical Reasoning"
+        ],
+        "input_format": "Text-Based Images and Documents",
+        "app": "Perception",
+        "output_format": "contextual_formatted_text",
+        "num_input": "1-image"
+    },
+    {
+        "name": "3d_indoor_scene_text_bbox_selection",
+        "score": 0.5714285714285714,
+        "eval_type": "rule",
+        "num_demo": 1,
+        "num_query": 14,
+        "skills": [
+            "Object Recognition and Classification",
+            "Scene and Event Understanding",
+            "Spatial and Temporal Reasoning"
+        ],
+        "input_format": "3D Models and Aerial Imagery",
+        "app": "Perception",
+        "output_format": "multiple_choice",
+        "num_input": "1-image"
+    },
+    {
+        "name": "app_layout_understanding_notes",
+        "score": 0.35714285714285715,
+        "eval_type": "rule",
+        "num_demo": 1,
+        "num_query": 14,
+        "skills": [
+            "Object Recognition and Classification",
+            "Language Understanding and Generation",
+            "Text Recognition (OCR)"
+        ],
+        "input_format": "User Interface Screenshots",
+        "app": "Information_Extraction",
+        "output_format": "exact_text",
+        "num_input": "1-image"
+    },
+    {
+        "name": "table_understanding",
+        "score": 0.5,
+        "eval_type": "rule",
+        "num_demo": 1,
+        "num_query": 14,
+        "skills": [
+            "Mathematical and Logical Reasoning",
+            "Text Recognition (OCR)"
+        ],
+        "input_format": "Diagrams and Data Visualizations",
+        "app": "Perception",
+        "output_format": "numerical_data",
+        "num_input": "1-image"
+    },
+    {
+        "name": "ascii_art_30",
+        "score": 0.0,
+        "eval_type": "llm",
+        "num_demo": 1,
+        "num_query": 14,
+        "skills": [
+            "Object Recognition and Classification",
+            "Language Understanding and Generation"
+        ],
+        "input_format": "Photographs",
+        "app": "Knowledge",
+        "output_format": "contextual_formatted_text",
+        "num_input": "1-image"
+    },
+    {
+        "name": "table2latex_complex",
+        "score": 0.7666666666666668,
+        "eval_type": "llm",
+        "num_demo": 1,
+        "num_query": 9,
+        "skills": [
+            "Text Recognition (OCR)",
+            "Language Understanding and Generation"
+        ],
+        "input_format": "Text-Based Images and Documents",
+        "app": "Coding",
+        "output_format": "structured_output",
+        "num_input": "1-image"
+    },
+    {
+        "name": "meme_explain",
+        "score": 0.8571428571428571,
+        "eval_type": "llm",
+        "num_demo": 1,
+        "num_query": 14,
+        "skills": [
+            "Commonsense and Social Reasoning",
+            "Scene and Event Understanding",
+            "Language Understanding and Generation"
+        ],
+        "input_format": "Photographs",
+        "app": "Knowledge",
+        "output_format": "open_ended_output",
+        "num_input": "1-image"
+    },
+    {
+        "name": "generated_video_artifacts",
+        "score": 0.35624999999999996,
+        "eval_type": "llm",
+        "num_demo": 1,
+        "num_query": 16,
+        "skills": [
+            "Object Recognition and Classification",
+            "Scene and Event Understanding",
+            "Commonsense and Social Reasoning"
+        ],
+        "input_format": "Videos",
+        "app": "Metrics",
+        "output_format": "open_ended_output",
+        "num_input": "video"
+    },
+    {
+        "name": "funny_image_title",
+        "score": 0.5928571428571429,
+        "eval_type": "llm",
+        "num_demo": 1,
+        "num_query": 14,
+        "skills": [
+            "Object Recognition and Classification",
+            "Language Understanding and Generation",
+            "Commonsense and Social Reasoning"
+        ],
+        "input_format": "Photographs",
+        "app": "Knowledge",
+        "output_format": "open_ended_output",
+        "num_input": "1-image"
+    },
+    {
+        "name": "wikihow_complex_task_completion",
+        "score": 0.8222222222222222,
+        "eval_type": "llm",
+        "num_demo": 1,
+        "num_query": 9,
+        "skills": [
+            "Object Recognition and Classification",
+            "Scene and Event Understanding",
+            "Language Understanding and Generation"
+        ],
+        "input_format": "Artistic and Creative Content",
+        "app": "Planning",
+        "output_format": "open_ended_output",
+        "num_input": "9-image or more"
+    },
+    {
+        "name": "sceneqa_scene_transition_video",
+        "score": 0.3,
+        "eval_type": "llm",
+        "num_demo": 1,
+        "num_query": 14,
+        "skills": [
+            "Scene and Event Understanding",
+            "Spatial and Temporal Reasoning",
+            "Language Understanding and Generation"
+        ],
+        "input_format": "Videos",
+        "app": "Perception",
+        "output_format": "open_ended_output",
+        "num_input": "video"
+    },
+    {
+        "name": "video_summary",
+        "score": 0.6642857142857144,
+        "eval_type": "llm",
+        "num_demo": 1,
+        "num_query": 14,
+        "skills": [
+            "Scene and Event Understanding",
+            "Language Understanding and Generation"
+        ],
+        "input_format": "Videos",
+        "app": "Information_Extraction",
+        "output_format": "open_ended_output",
+        "num_input": "video"
+    },
+    {
+        "name": "funqa_unexpected_action_magic_video",
+        "score": 0.5199999999999999,
+        "eval_type": "llm",
+        "num_demo": 1,
+        "num_query": 15,
+        "skills": [
+            "Scene and Event Understanding",
+            "Spatial and Temporal Reasoning",
+            "Language Understanding and Generation"
+        ],
+        "input_format": "Videos",
+        "app": "Information_Extraction",
+        "output_format": "open_ended_output",
+        "num_input": "video"
+    },
+    {
+        "name": "video_detail_description",
+        "score": 0.563157894736842,
+        "eval_type": "llm",
+        "num_demo": 1,
+        "num_query": 19,
+        "skills": [
+            "Object Recognition and Classification",
+            "Scene and Event Understanding",
+            "Spatial and Temporal Reasoning"
+        ],
+        "input_format": "Videos",
+        "app": "Perception",
+        "output_format": "open_ended_output",
+        "num_input": "video"
+    },
+    {
+        "name": "paper_review_writing",
+        "score": 0.6199999999999999,
+        "eval_type": "llm",
+        "num_demo": 1,
+        "num_query": 15,
+        "skills": [
+            "Object Recognition and Classification",
+            "Text Recognition (OCR)",
+            "Language Understanding and Generation",
+            "Domain-Specific Knowledge and Skills"
+        ],
+        "input_format": "Text-Based Images and Documents",
+        "app": "Metrics",
+        "output_format": "open_ended_output",
+        "num_input": "4-5 images"
+    },
+    {
+        "name": "funqa_unexpected_action_creative_video",
+        "score": 0.3466666666666667,
+        "eval_type": "llm",
+        "num_demo": 1,
+        "num_query": 15,
+        "skills": [
+            "Object Recognition and Classification",
+            "Scene and Event Understanding",
+            "Commonsense and Social Reasoning"
+        ],
+        "input_format": "Videos",
+        "app": "Information_Extraction",
+        "output_format": "open_ended_output",
+        "num_input": "video"
+    },
+    {
+        "name": "guess_image_generation_prompt",
+        "score": 0.8263157894736842,
+        "eval_type": "llm",
+        "num_demo": 1,
+        "num_query": 19,
+        "skills": [
+            "Scene and Event Understanding",
+            "Language Understanding and Generation"
+        ],
+        "input_format": "Artistic and Creative Content",
+        "app": "Perception",
+        "output_format": "open_ended_output",
+        "num_input": "1-image"
+    },
+    {
+        "name": "traffic_accident_analysis",
+        "score": 0.6214285714285716,
+        "eval_type": "llm",
+        "num_demo": 1,
+        "num_query": 14,
+        "skills": [
+            "Object Recognition and Classification",
+            "Scene and Event Understanding",
+            "Commonsense and Social Reasoning"
+        ],
+        "input_format": "Diagrams and Data Visualizations",
+        "app": "Knowledge",
+        "output_format": "open_ended_output",
+        "num_input": "1-image"
+    },
+    {
+        "name": "activitynetqa",
+        "score": 0.5052631578947369,
+        "eval_type": "llm",
+        "num_demo": 1,
+        "num_query": 19,
+        "skills": [
+            "Scene and Event Understanding",
+            "Object Recognition and Classification"
+        ],
+        "input_format": "Videos",
+        "app": "Information_Extraction",
+        "output_format": "open_ended_output",
+        "num_input": "video"
+    },
+    {
+        "name": "scibench_w_solution_open_ended",
+        "score": 0.45000000000000007,
+        "eval_type": "llm",
+        "num_demo": 1,
+        "num_query": 25,
+        "skills": [
+            "Mathematical and Logical Reasoning",
+            "Domain-Specific Knowledge and Skills",
+            "Language Understanding and Generation"
+        ],
+        "input_format": "Diagrams and Data Visualizations",
+        "app": "Science",
+        "output_format": "open_ended_output",
+        "num_input": "1-image"
+    },
+    {
+        "name": "vibe-eval",
+        "score": 0.6071428571428571,
+        "eval_type": "llm",
+        "num_demo": 1,
+        "num_query": 14,
+        "skills": [
+            "Object Recognition and Classification",
+            "Ethical and Safety Reasoning",
+            "Commonsense and Social Reasoning"
+        ],
+        "input_format": "Photographs",
+        "app": "Knowledge",
+        "output_format": "open_ended_output",
+        "num_input": "1-image"
+    },
+    {
+        "name": "video_qa",
+        "score": 0.8571428571428573,
+        "eval_type": "llm",
+        "num_demo": 1,
+        "num_query": 14,
+        "skills": [
+            "Object Recognition and Classification",
+            "Scene and Event Understanding",
+            "Spatial and Temporal Reasoning",
+            "Language Understanding and Generation"
+        ],
+        "input_format": "Videos",
+        "app": "Information_Extraction",
+        "output_format": "open_ended_output",
+        "num_input": "video"
+    },
+    {
+        "name": "image_humor_understanding",
+        "score": 0.9068965517241379,
+        "eval_type": "llm",
+        "num_demo": 1,
+        "num_query": 29,
+        "skills": [
+            "Commonsense and Social Reasoning",
+            "Scene and Event Understanding",
+            "Language Understanding and Generation"
+        ],
+        "input_format": "Photographs",
+        "app": "Knowledge",
+        "output_format": "open_ended_output",
+        "num_input": "1-image"
+    },
+    {
+        "name": "doc_vqa",
+        "score": 0.8187500000000001,
+        "eval_type": "llm",
+        "num_demo": 1,
+        "num_query": 16,
+        "skills": [
+            "Text Recognition (OCR)",
+            "Language Understanding and Generation"
+        ],
+        "input_format": "Text-Based Images and Documents",
+        "app": "Perception",
+        "output_format": "open_ended_output",
+        "num_input": "4-5 images"
+    },
+    {
+        "name": "docci_image_description_long",
+        "score": 0.7642857142857143,
+        "eval_type": "llm",
+        "num_demo": 1,
+        "num_query": 14,
+        "skills": [
+            "Object Recognition and Classification",
+            "Scene and Event Understanding",
+            "Language Understanding and Generation"
+        ],
+        "input_format": "Photographs",
+        "app": "Perception",
+        "output_format": "open_ended_output",
+        "num_input": "1-image"
+    },
+    {
+        "name": "humor_explanation",
+        "score": 0.8533333333333335,
+        "eval_type": "llm",
+        "num_demo": 1,
+        "num_query": 15,
+        "skills": [
+            "Commonsense and Social Reasoning",
+            "Language Understanding and Generation"
+        ],
+        "input_format": "Artistic and Creative Content",
+        "app": "Knowledge",
+        "output_format": "open_ended_output",
+        "num_input": "1-image"
+    },
+    {
+        "name": "nextqa_oe",
+        "score": 0.32105263157894737,
+        "eval_type": "llm",
+        "num_demo": 1,
+        "num_query": 19,
+        "skills": [
+            "Object Recognition and Classification",
+            "Scene and Event Understanding",
+            "Language Understanding and Generation"
+        ],
+        "input_format": "Videos",
+        "app": "Information_Extraction",
+        "output_format": "open_ended_output",
+        "num_input": "video"
+    },
+    {
+        "name": "graph_interpretation",
+        "score": 0.789655172413793,
+        "eval_type": "llm",
+        "num_demo": 1,
+        "num_query": 29,
+        "skills": [
+            "Mathematical and Logical Reasoning",
+            "Language Understanding and Generation",
+            "Domain-Specific Knowledge and Skills"
+        ],
+        "input_format": "Diagrams and Data Visualizations",
+        "app": "Perception",
+        "output_format": "open_ended_output",
+        "num_input": "1-image"
+    },
+    {
+        "name": "science_figure_explanation",
+        "score": 0.8551724137931035,
+        "eval_type": "llm",
+        "num_demo": 1,
+        "num_query": 29,
+        "skills": [
+            "Object Recognition and Classification",
+            "Scene and Event Understanding",
+            "Domain-Specific Knowledge and Skills"
+        ],
+        "input_format": "Diagrams and Data Visualizations",
+        "app": "Perception",
+        "output_format": "open_ended_output",
+        "num_input": "1-image"
+    },
+    {
+        "name": "ocrqa",
+        "score": 0.7758620689655171,
+        "eval_type": "llm",
+        "num_demo": 1,
+        "num_query": 29,
+        "skills": [
+            "Text Recognition (OCR)",
+            "Language Understanding and Generation"
+        ],
+        "input_format": "Text-Based Images and Documents",
+        "app": "Perception",
+        "output_format": "open_ended_output",
+        "num_input": "1-image"
+    },
+    {
+        "name": "bar_chart_interpretation",
+        "score": 0.6310344827586206,
+        "eval_type": "llm",
+        "num_demo": 1,
+        "num_query": 29,
+        "skills": [
+            "Object Recognition and Classification",
+            "Mathematical and Logical Reasoning",
+            "Language Understanding and Generation"
+        ],
+        "input_format": "Diagrams and Data Visualizations",
+        "app": "Perception",
+        "output_format": "open_ended_output",
+        "num_input": "1-image"
+    },
+    {
+        "name": "multi_lingual_manual_explanation_scooter_Spanish",
+        "score": 0.3428571428571428,
+        "eval_type": "llm",
+        "num_demo": 1,
+        "num_query": 14,
+        "skills": [
+            "Object Recognition and Classification",
+            "Text Recognition (OCR)",
+            "Language Understanding and Generation",
+            "Scene and Event Understanding",
+            "Commonsense and Social Reasoning"
+        ],
+        "input_format": "Text-Based Images and Documents",
+        "app": "Information_Extraction",
+        "output_format": "open_ended_output",
+        "num_input": "6-8 images"
+    },
+    {
+        "name": "multi_lingual_manual_explanation_scooter_Russian",
+        "score": 0.24285714285714283,
+        "eval_type": "llm",
+        "num_demo": 1,
+        "num_query": 14,
+        "skills": [
+            "Object Recognition and Classification",
+            "Language Understanding and Generation",
+            "Ethical and Safety Reasoning"
+        ],
+        "input_format": "Diagrams and Data Visualizations",
+        "app": "Information_Extraction",
+        "output_format": "open_ended_output",
+        "num_input": "6-8 images"
+    },
+    {
+        "name": "multi_lingual_manual_explanation_scooter_Arabic",
+        "score": 0.34285714285714286,
+        "eval_type": "llm",
+        "num_demo": 1,
+        "num_query": 14,
+        "skills": [
+            "Object Recognition and Classification",
+            "Language Understanding and Generation",
+            "Ethical and Safety Reasoning"
+        ],
+        "input_format": "Text-Based Images and Documents",
+        "app": "Information_Extraction",
+        "output_format": "open_ended_output",
+        "num_input": "6-8 images"
+    },
+    {
+        "name": "multi_lingual_manual_explanation_scooter_Chinese",
+        "score": 0.3142857142857142,
+        "eval_type": "llm",
+        "num_demo": 1,
+        "num_query": 14,
+        "skills": [
+            "Language Understanding and Generation",
+            "Commonsense and Social Reasoning",
+            "Ethical and Safety Reasoning",
+            "Domain-Specific Knowledge and Skills"
+        ],
+        "input_format": "Text-Based Images and Documents",
+        "app": "Information_Extraction",
+        "output_format": "open_ended_output",
+        "num_input": "6-8 images"
+    },
+    {
+        "name": "funqa_unexpected_action_humor_video",
+        "score": 0.39333333333333337,
+        "eval_type": "llm",
+        "num_demo": 1,
+        "num_query": 15,
+        "skills": [
+            "Object Recognition and Classification",
+            "Scene and Event Understanding",
+            "Commonsense and Social Reasoning"
+        ],
+        "input_format": "Videos",
+        "app": "Information_Extraction",
+        "output_format": "open_ended_output",
+        "num_input": "video"
+    },
+    {
+        "name": "multi_lingual_manual_explanation_scooter_French",
+        "score": 0.36428571428571427,
+        "eval_type": "llm",
+        "num_demo": 1,
+        "num_query": 14,
+        "skills": [
+            "Object Recognition and Classification",
+            "Text Recognition (OCR)",
+            "Language Understanding and Generation",
+            "Ethical and Safety Reasoning"
+        ],
+        "input_format": "Text-Based Images and Documents",
+        "app": "Information_Extraction",
+        "output_format": "open_ended_output",
+        "num_input": "6-8 images"
+    },
+    {
+        "name": "multi_lingual_Ruozhiba_expalnation_Spanish",
+        "score": 0.3071428571428572,
+        "eval_type": "llm",
+        "num_demo": 1,
+        "num_query": 14,
+        "skills": [
+            "Language Understanding and Generation",
+            "Commonsense and Social Reasoning"
+        ],
+        "input_format": "User Interface Screenshots",
+        "app": "Knowledge",
+        "output_format": "open_ended_output",
+        "num_input": "1-image"
+    },
+    {
+        "name": "figurative_speech_explanation",
+        "score": 0.8137931034482758,
+        "eval_type": "llm",
+        "num_demo": 1,
+        "num_query": 29,
+        "skills": [
+            "Scene and Event Understanding",
+            "Language Understanding and Generation",
+            "Commonsense and Social Reasoning"
+        ],
+        "input_format": "Artistic and Creative Content",
+        "app": "Knowledge",
+        "output_format": "open_ended_output",
+        "num_input": "1-image"
+    },
+    {
+        "name": "multi_lingual_Ruozhiba_expalnation_English",
+        "score": 0.1857142857142857,
+        "eval_type": "llm",
+        "num_demo": 1,
+        "num_query": 14,
+        "skills": [
+            "Language Understanding and Generation",
+            "Commonsense and Social Reasoning"
+        ],
+        "input_format": "User Interface Screenshots",
+        "app": "Knowledge",
+        "output_format": "open_ended_output",
+        "num_input": "1-image"
+    },
+    {
+        "name": "multi_lingual_Ruozhiba_expalnation_Russian",
+        "score": 0.22142857142857145,
+        "eval_type": "llm",
+        "num_demo": 1,
+        "num_query": 14,
+        "skills": [
+            "Commonsense and Social Reasoning",
+            "Language Understanding and Generation"
+        ],
+        "input_format": "User Interface Screenshots",
+        "app": "Knowledge",
+        "output_format": "open_ended_output",
+        "num_input": "1-image"
+    },
+    {
+        "name": "multi_lingual_Ruozhiba_expalnation_Arabic",
+        "score": 0.29999999999999993,
+        "eval_type": "llm",
+        "num_demo": 1,
+        "num_query": 14,
+        "skills": [
+            "Language Understanding and Generation",
+            "Commonsense and Social Reasoning"
+        ],
+        "input_format": "User Interface Screenshots",
+        "app": "Knowledge",
+        "output_format": "open_ended_output",
+        "num_input": "1-image"
+    },
+    {
+        "name": "defeasible_reasoning",
+        "score": 0.8448275862068967,
+        "eval_type": "llm",
+        "num_demo": 1,
+        "num_query": 29,
+        "skills": [
+            "Scene and Event Understanding",
+            "Language Understanding and Generation",
+            "Domain-Specific Knowledge and Skills"
+        ],
+        "input_format": "Diagrams and Data Visualizations",
+        "app": "Knowledge",
+        "output_format": "open_ended_output",
+        "num_input": "1-image"
+    },
+    {
+        "name": "multi_lingual_Ruozhiba_expalnation_Japanese",
+        "score": 0.33571428571428574,
+        "eval_type": "llm",
+        "num_demo": 1,
+        "num_query": 14,
+        "skills": [
+            "Language Understanding and Generation",
+            "Commonsense and Social Reasoning"
+        ],
+        "input_format": "User Interface Screenshots",
+        "app": "Knowledge",
+        "output_format": "open_ended_output",
+        "num_input": "1-image"
+    },
+    {
+        "name": "image_captioning_with_additional_requirements",
+        "score": 0.8285714285714286,
+        "eval_type": "llm",
+        "num_demo": 1,
+        "num_query": 14,
+        "skills": [
+            "Object Recognition and Classification",
+            "Scene and Event Understanding",
+            "Text Recognition (OCR)",
+            "Language Understanding and Generation"
+        ],
+        "input_format": "Diagrams and Data Visualizations",
+        "app": "Perception",
+        "output_format": "open_ended_output",
+        "num_input": "1-image"
+    },
+    {
+        "name": "multi_lingual_Ruozhiba_expalnation_French",
+        "score": 0.2785714285714286,
+        "eval_type": "llm",
+        "num_demo": 1,
+        "num_query": 14,
+        "skills": [
+            "Language Understanding and Generation",
+            "Commonsense and Social Reasoning"
+        ],
+        "input_format": "User Interface Screenshots",
+        "app": "Knowledge",
+        "output_format": "open_ended_output",
+        "num_input": "1-image"
+    },
+    {
+        "name": "visualization_with_code",
+        "score": 0.5714285714285714,
+        "eval_type": "llm",
+        "num_demo": 1,
+        "num_query": 14,
+        "skills": [
+            "Mathematical and Logical Reasoning",
+            "Domain-Specific Knowledge and Skills"
+        ],
+        "input_format": "Diagrams and Data Visualizations",
+        "app": "Coding",
+        "output_format": "structured_output",
+        "num_input": "1-image"
+    },
+    {
+        "name": "table_understanding_fetaqa",
+        "score": 0.47857142857142865,
+        "eval_type": "llm",
+        "num_demo": 1,
+        "num_query": 14,
+        "skills": [
+            "Text Recognition (OCR)",
+            "Language Understanding and Generation"
+        ],
+        "input_format": "Diagrams and Data Visualizations",
+        "app": "Perception",
+        "output_format": "open_ended_output",
+        "num_input": "1-image"
+    },
+    {
+        "name": "red_teaming_jailbreak",
+        "score": 0.8300000000000001,
+        "eval_type": "llm",
+        "num_demo": 0,
+        "num_query": 20,
+        "skills": [
+            "Text Recognition (OCR)",
+            "Ethical and Safety Reasoning"
+        ],
+        "input_format": "Text-Based Images and Documents",
+        "app": "Knowledge",
+        "output_format": "open_ended_output",
+        "num_input": "1-image"
+    },
+    {
+        "name": "red_teaming_celebrity",
+        "score": 0.8500000000000002,
+        "eval_type": "llm",
+        "num_demo": 0,
+        "num_query": 20,
+        "skills": [
+            "Commonsense and Social Reasoning",
+            "Language Understanding and Generation"
+        ],
+        "input_format": "Photographs",
+        "app": "Knowledge",
+        "output_format": "open_ended_output",
+        "num_input": "1-image"
+    },
+    {
+        "name": "GUI_Chat_Easy",
+        "score": 0.6884615384615385,
+        "eval_type": "llm",
+        "num_demo": 1,
+        "num_query": 26,
+        "skills": [
+            "Text Recognition (OCR)",
+            "Language Understanding and Generation"
+        ],
+        "input_format": "User Interface Screenshots",
+        "app": "Information_Extraction",
+        "output_format": "open_ended_output",
+        "num_input": "1-image"
+    },
+    {
+        "name": "video_short_title",
+        "score": 0.6642857142857143,
+        "eval_type": "llm",
+        "num_demo": 1,
+        "num_query": 14,
+        "skills": [
+            "Language Understanding and Generation",
+            "Scene and Event Understanding"
+        ],
+        "input_format": "Videos",
+        "app": "Information_Extraction",
+        "output_format": "open_ended_output",
+        "num_input": "video"
+    },
+    {
+        "name": "video_content_follow_up",
+        "score": 0.8214285714285715,
+        "eval_type": "llm",
+        "num_demo": 1,
+        "num_query": 14,
+        "skills": [
+            "Scene and Event Understanding",
+            "Language Understanding and Generation",
+            "Planning and Decision Making"
+        ],
+        "input_format": "Videos",
+        "app": "Knowledge",
+        "output_format": "open_ended_output",
+        "num_input": "video"
+    },
+    {
+        "name": "GUI_Chat_Hard",
+        "score": 0.3806451612903227,
+        "eval_type": "llm",
+        "num_demo": 1,
+        "num_query": 31,
+        "skills": [
+            "Object Recognition and Classification",
+            "Text Recognition (OCR)",
+            "Language Understanding and Generation"
+        ],
+        "input_format": "User Interface Screenshots",
+        "app": "Information_Extraction",
+        "output_format": "open_ended_output",
+        "num_input": "1-image"
+    },
+    {
+        "name": "red_teaming_politics",
+        "score": 0.705,
+        "eval_type": "llm",
+        "num_demo": 0,
+        "num_query": 20,
+        "skills": [
+            "Scene and Event Understanding",
+            "Commonsense and Social Reasoning",
+            "Ethical and Safety Reasoning"
+        ],
+        "input_format": "Photographs",
+        "app": "Knowledge",
+        "output_format": "open_ended_output",
+        "num_input": "1-image"
+    },
+    {
+        "name": "electrocardiogram",
+        "score": 0.24285714285714285,
+        "eval_type": "llm",
+        "num_demo": 1,
+        "num_query": 14,
+        "skills": [
+            "Object Recognition and Classification",
+            "Domain-Specific Knowledge and Skills"
+        ],
+        "input_format": "Text-Based Images and Documents",
+        "app": "Science",
+        "output_format": "open_ended_output",
+        "num_input": "1-image"
+    },
+    {
+        "name": "tweets_captioning",
+        "score": 0.5214285714285714,
+        "eval_type": "llm",
+        "num_demo": 1,
+        "num_query": 14,
+        "skills": [
+            "Object Recognition and Classification",
+            "Text Recognition (OCR)",
+            "Language Understanding and Generation",
+            "Commonsense and Social Reasoning"
+        ],
+        "input_format": "Photographs",
+        "app": "Perception",
+        "output_format": "open_ended_output",
+        "num_input": "1-image"
+    },
+    {
+        "name": "red_teaming_captcha",
+        "score": 0.10000000000000003,
+        "eval_type": "llm",
+        "num_demo": 1,
+        "num_query": 19,
+        "skills": [
+            "Text Recognition (OCR)"
+        ],
+        "input_format": "Text-Based Images and Documents",
+        "app": "Knowledge",
+        "output_format": "open_ended_output",
+        "num_input": "1-image"
+    },
+    {
+        "name": "red_teaming_racial",
+        "score": 0.7850000000000004,
+        "eval_type": "llm",
+        "num_demo": 0,
+        "num_query": 20,
+        "skills": [
+            "Ethical and Safety Reasoning",
+            "Scene and Event Understanding",
+            "Object Recognition and Classification"
+        ],
+        "input_format": "Photographs",
+        "app": "Knowledge",
+        "output_format": "open_ended_output",
+        "num_input": "1-image"
+    },
+    {
+        "name": "red_teaming_visualmisleading",
+        "score": 0.8789473684210528,
+        "eval_type": "llm",
+        "num_demo": 1,
+        "num_query": 19,
+        "skills": [
+            "Ethical and Safety Reasoning",
+            "Commonsense and Social Reasoning"
+        ],
+        "input_format": "Artistic and Creative Content",
+        "app": "Knowledge",
+        "output_format": "open_ended_output",
+        "num_input": "1-image"
+    },
+    {
+        "name": "bridge_strategies_worldclass",
+        "score": 0.15,
+        "eval_type": "llm",
+        "num_demo": 1,
+        "num_query": 14,
+        "skills": [
+            "Planning and Decision Making",
+            "Mathematical and Logical Reasoning",
+            "Domain-Specific Knowledge and Skills"
+        ],
+        "input_format": "Diagrams and Data Visualizations",
+        "app": "Planning",
+        "output_format": "open_ended_output",
+        "num_input": "1-image"
+    },
+    {
+        "name": "bridge_strategies_expert",
+        "score": 0.32142857142857134,
+        "eval_type": "llm",
+        "num_demo": 1,
+        "num_query": 14,
+        "skills": [
+            "Object Recognition and Classification",
+            "Spatial and Temporal Reasoning",
+            "Planning and Decision Making"
+        ],
+        "input_format": "User Interface Screenshots",
+        "app": "Planning",
+        "output_format": "open_ended_output",
+        "num_input": "1-image"
+    },
+    {
+        "name": "video2notes",
+        "score": 0.7071428571428572,
+        "eval_type": "llm",
+        "num_demo": 1,
+        "num_query": 14,
+        "skills": [
+            "Scene and Event Understanding",
+            "Language Understanding and Generation",
+            "Object Recognition and Classification"
+        ],
+        "input_format": "Videos",
+        "app": "Information_Extraction",
+        "output_format": "open_ended_output",
+        "num_input": "video"
+    },
+    {
+        "name": "iq_test",
+        "score": 0.6482758620689654,
+        "eval_type": "llm",
+        "num_demo": 1,
+        "num_query": 29,
+        "skills": [
+            "Object Recognition and Classification",
+            "Mathematical and Logical Reasoning",
+            "Spatial and Temporal Reasoning"
+        ],
+        "input_format": "Diagrams and Data Visualizations",
+        "app": "Planning",
+        "output_format": "open_ended_output",
+        "num_input": "1-image"
+    },
+    {
+        "name": "unusual_images",
+        "score": 0.8689655172413793,
+        "eval_type": "llm",
+        "num_demo": 1,
+        "num_query": 29,
+        "skills": [
+            "Scene and Event Understanding",
+            "Spatial and Temporal Reasoning",
+            "Language Understanding and Generation"
+        ],
+        "input_format": "Photographs",
+        "app": "Knowledge",
+        "output_format": "open_ended_output",
+        "num_input": "1-image"
+    },
+    {
+        "name": "red_teaming_visual_order_B",
+        "score": 0.9,
+        "eval_type": "llm",
+        "num_demo": 1,
+        "num_query": 19,
+        "skills": [
+            "Object Recognition and Classification",
+            "Domain-Specific Knowledge and Skills"
+        ],
+        "input_format": "Photographs",
+        "app": "Knowledge",
+        "output_format": "open_ended_output",
+        "num_input": "2-3 images"
+    },
+    {
+        "name": "art_explanation",
+        "score": 0.7068965517241379,
+        "eval_type": "llm",
+        "num_demo": 1,
+        "num_query": 29,
+        "skills": [
+            "Language Understanding and Generation",
+            "Scene and Event Understanding",
+            "Commonsense and Social Reasoning"
+        ],
+        "input_format": "Artistic and Creative Content",
+        "app": "Knowledge",
+        "output_format": "open_ended_output",
+        "num_input": "1-image"
+    },
+    {
+        "name": "red_teaming_visual_order_A",
+        "score": 0.9,
+        "eval_type": "llm",
+        "num_demo": 1,
+        "num_query": 19,
+        "skills": [
+            "Object Recognition and Classification",
+            "Domain-Specific Knowledge and Skills"
+        ],
+        "input_format": "Photographs",
+        "app": "Knowledge",
+        "output_format": "open_ended_output",
+        "num_input": "2-3 images"
+    },
+    {
+        "name": "bridge_strategies_advanced",
+        "score": 0.1642857142857143,
+        "eval_type": "llm",
+        "num_demo": 1,
+        "num_query": 14,
+        "skills": [
+            "Object Recognition and Classification",
+            "Spatial and Temporal Reasoning",
+            "Planning and Decision Making"
+        ],
+        "input_format": "User Interface Screenshots",
+        "app": "Planning",
+        "output_format": "open_ended_output",
+        "num_input": "1-image"
+    }
+]
\ No newline at end of file