[ { "name": "code_translation_Python", "score": 0.6458333333333334, "eval_type": "rule", "num_demo": 1, "num_query": 16, "skills": [ "Text Recognition (OCR)", "Domain-Specific Knowledge and Skills", "Language Understanding and Generation" ], "input_format": "Text-Based Images and Documents", "app": "Coding", "output_format": "structured_output", "num_input": "2-3 images" }, { "name": "vln_identify_robot", "score": 0.13333333333333333, "eval_type": "rule", "num_demo": 1, "num_query": 15, "skills": [ "Object Recognition and Classification", "Scene and Event Understanding", "Spatial and Temporal Reasoning" ], "input_format": "Photographs", "app": "Planning", "output_format": "exact_text", "num_input": "9-image or more" }, { "name": "google_streetview_line_sorting", "score": 0.0, "eval_type": "rule", "num_demo": 1, "num_query": 15, "skills": [ "Scene and Event Understanding", "Spatial and Temporal Reasoning" ], "input_format": "Photographs", "app": "Perception", "output_format": "structured_output", "num_input": "9-image or more" }, { "name": "dish_ingredient_match", "score": 0.7142857142857143, "eval_type": "rule", "num_demo": 1, "num_query": 14, "skills": [ "Object Recognition and Classification", "Scene and Event Understanding" ], "input_format": "Photographs", "app": "Knowledge", "output_format": "multiple_choice", "num_input": "9-image or more" }, { "name": "vln_identify_location", "score": 0.3878787878787879, "eval_type": "rule", "num_demo": 1, "num_query": 15, "skills": [ "Scene and Event Understanding", "Spatial and Temporal Reasoning", "Language Understanding and Generation" ], "input_format": "Photographs", "app": "Planning", "output_format": "structured_output", "num_input": "4-5 images" }, { "name": "video_eval_visual_pref", "score": 0.75, "eval_type": "rule", "num_demo": 1, "num_query": 16, "skills": [ "Object Recognition and Classification", "Scene and Event Understanding" ], "input_format": "Videos", "app": "Metrics", "output_format": "multiple_choice", "num_input": "video" }, { "name": "image_translation_en2cn", "score": 0.47189890122171807, "eval_type": "rule", "num_demo": 1, "num_query": 9, "skills": [ "Text Recognition (OCR)", "Language Understanding and Generation" ], "input_format": "Text-Based Images and Documents", "app": "Information_Extraction", "output_format": "contextual_formatted_text", "num_input": "1-image" }, { "name": "CLEVRER_physics", "score": 0.25, "eval_type": "rule", "num_demo": 1, "num_query": 20, "skills": [ "Object Recognition and Classification", "Spatial and Temporal Reasoning" ], "input_format": "Photographs", "app": "Perception", "output_format": "numerical_data", "num_input": "1-image" }, { "name": "media_homepage_profile", "score": 0.4997371675943104, "eval_type": "rule", "num_demo": 1, "num_query": 14, "skills": [ "Text Recognition (OCR)", "Language Understanding and Generation" ], "input_format": "User Interface Screenshots", "app": "Information_Extraction", "output_format": "structured_output", "num_input": "9-image or more" }, { "name": "logical_reasoning_find_odd_one_out", "score": 0.8928571428571429, "eval_type": "rule", "num_demo": 1, "num_query": 14, "skills": [ "Object Recognition and Classification", "Mathematical and Logical Reasoning", "Spatial and Temporal Reasoning" ], "input_format": "Artistic and Creative Content", "app": "Planning", "output_format": "structured_output", "num_input": "1-image" }, { "name": "geometry_reasoning_overlapped_circle", "score": 0.7857142857142857, "eval_type": "rule", "num_demo": 1, "num_query": 14, "skills": [ "Object Recognition and Classification", "Spatial and Temporal Reasoning", "Mathematical and Logical Reasoning" ], "input_format": "Diagrams and Data Visualizations", "app": "Mathematics", "output_format": "structured_output", "num_input": "1-image" }, { "name": "GUI_Act_Mobile_tap", "score": 0.35714285714285715, "eval_type": "rule", "num_demo": 1, "num_query": 14, "skills": [ "Object Recognition and Classification", "Spatial and Temporal Reasoning", "Language Understanding and Generation" ], "input_format": "User Interface Screenshots", "app": "Planning", "output_format": "numerical_data", "num_input": "1-image" }, { "name": "TRANCE_physics_reasoning_basic", "score": 0.8823529411764706, "eval_type": "rule", "num_demo": 1, "num_query": 17, "skills": [ "Object Recognition and Classification", "Spatial and Temporal Reasoning", "Mathematical and Logical Reasoning" ], "input_format": "3D Models and Aerial Imagery", "app": "Perception", "output_format": "exact_text", "num_input": "2-3 images" }, { "name": "booking_web_rating", "score": 1.0, "eval_type": "rule", "num_demo": 1, "num_query": 14, "skills": [ "Object Recognition and Classification", "Text Recognition (OCR)", "Scene and Event Understanding", "Language Understanding and Generation" ], "input_format": "User Interface Screenshots", "app": "Information_Extraction", "output_format": "structured_output", "num_input": "9-image or more" }, { "name": "logical_reasoning_fit_pattern", "score": 0.42857142857142855, "eval_type": "rule", "num_demo": 1, "num_query": 14, "skills": [ "Mathematical and Logical Reasoning", "Spatial and Temporal Reasoning" ], "input_format": "Diagrams and Data Visualizations", "app": "Planning", "output_format": "multiple_choice", "num_input": "1-image" }, { "name": "knowledge_sign_recognition", "score": 0.3333333333333333, "eval_type": "rule", "num_demo": 1, "num_query": 9, "skills": [ "Object Recognition and Classification", "Text Recognition (OCR)" ], "input_format": "Text-Based Images and Documents", "app": "Knowledge", "output_format": "multiple_choice", "num_input": "6-8 images" }, { "name": "ishihara_test", "score": 0.07142857142857142, "eval_type": "rule", "num_demo": 1, "num_query": 14, "skills": [ "Object Recognition and Classification", "Domain-Specific Knowledge and Skills" ], "input_format": "Diagrams and Data Visualizations", "app": "Knowledge", "output_format": "structured_output", "num_input": "1-image" }, { "name": "relative_depth_of_different_points", "score": 0.7142857142857143, "eval_type": "rule", "num_demo": 1, "num_query": 14, "skills": [ "Spatial and Temporal Reasoning", "Scene and Event Understanding" ], "input_format": "Photographs", "app": "Perception", "output_format": "multiple_choice", "num_input": "1-image" }, { "name": "autonomous_driving_scene_analysis", "score": 1.0, "eval_type": "rule", "num_demo": 1, "num_query": 14, "skills": [ "Object Recognition and Classification", "Text Recognition (OCR)", "Scene and Event Understanding" ], "input_format": "Photographs", "app": "Perception", "output_format": "exact_text", "num_input": "1-image" }, { "name": "geometry_reasoning_count_line_intersections", "score": 0.5, "eval_type": "rule", "num_demo": 1, "num_query": 14, "skills": [ "Object Recognition and Classification", "Mathematical and Logical Reasoning", "Spatial and Temporal Reasoning" ], "input_format": "Diagrams and Data Visualizations", "app": "Mathematics", "output_format": "structured_output", "num_input": "1-image" }, { "name": "code_translation_hard", "score": 0.0, "eval_type": "rule", "num_demo": 1, "num_query": 14, "skills": [ "Text Recognition (OCR)", "Domain-Specific Knowledge and Skills", "Language Understanding and Generation" ], "input_format": "Text-Based Images and Documents", "app": "Coding", "output_format": "structured_output", "num_input": "6-8 images" }, { "name": "llavaguard", "score": 0.0, "eval_type": "rule", "num_demo": 1, "num_query": 14, "skills": [ "Ethical and Safety Reasoning", "Scene and Event Understanding", "Commonsense and Social Reasoning" ], "input_format": "Photographs", "app": "Knowledge", "output_format": "structured_output", "num_input": "1-image" }, { "name": "knowledge_graph_understanding", "score": 0.4666666666666667, "eval_type": "rule", "num_demo": 1, "num_query": 15, "skills": [ "Object Recognition and Classification", "Commonsense and Social Reasoning" ], "input_format": "Diagrams and Data Visualizations", "app": "Perception", "output_format": "contextual_formatted_text", "num_input": "1-image" }, { "name": "game_platform_support_identification", "score": 0.9285714285714286, "eval_type": "rule", "num_demo": 1, "num_query": 14, "skills": [ "Text Recognition (OCR)", "Object Recognition and Classification" ], "input_format": "User Interface Screenshots", "app": "Information_Extraction", "output_format": "structured_output", "num_input": "1-image" }, { "name": "medical_content_based_retrieval_radiology", "score": 0.8571428571428571, "eval_type": "rule", "num_demo": 1, "num_query": 14, "skills": [ "Object Recognition and Classification", "Domain-Specific Knowledge and Skills" ], "input_format": "Text-Based Images and Documents", "app": "Science", "output_format": "multiple_choice", "num_input": "4-5 images" }, { "name": "planning_visual_blocksworld", "score": 0.3333333333333333, "eval_type": "rule", "num_demo": 1, "num_query": 15, "skills": [ "Planning and Decision Making", "Spatial and Temporal Reasoning", "Object Recognition and Classification" ], "input_format": "Diagrams and Data Visualizations", "app": "Planning", "output_format": "structured_output", "num_input": "2-3 images" }, { "name": "recover_masked_word_in_figure", "score": 0.42857142857142855, "eval_type": "rule", "num_demo": 1, "num_query": 14, "skills": [ "Text Recognition (OCR)", "Language Understanding and Generation", "Domain-Specific Knowledge and Skills" ], "input_format": "Diagrams and Data Visualizations", "app": "Perception", "output_format": "contextual_formatted_text", "num_input": "1-image" }, { "name": "healthcare_info_judgement", "score": 0.7142857142857143, "eval_type": "rule", "num_demo": 1, "num_query": 14, "skills": [ "Text Recognition (OCR)", "Domain-Specific Knowledge and Skills", "Ethical and Safety Reasoning" ], "input_format": "User Interface Screenshots", "app": "Science", "output_format": "multiple_choice", "num_input": "1-image" }, { "name": "photoshop_operation", "score": 0.32857142857142857, "eval_type": "rule", "num_demo": 1, "num_query": 14, "skills": [ "Object Recognition and Classification", "Scene and Event Understanding" ], "input_format": "Photographs", "app": "Perception", "output_format": "structured_output", "num_input": "2-3 images" }, { "name": "pokemon_3D_recognition", "score": 0.8, "eval_type": "rule", "num_demo": 1, "num_query": 15, "skills": [ "Object Recognition and Classification", "Text Recognition (OCR)" ], "input_format": "3D Models and Aerial Imagery", "app": "Perception", "output_format": "structured_output", "num_input": "6-8 images" }, { "name": "multilingual_news_qa", "score": 0.8571428571428571, "eval_type": "rule", "num_demo": 1, "num_query": 14, "skills": [ "Language Understanding and Generation", "Text Recognition (OCR)" ], "input_format": "Photographs", "app": "Information_Extraction", "output_format": "contextual_formatted_text", "num_input": "9-image or more" }, { "name": "worldle", "score": 0.31144102130193474, "eval_type": "rule", "num_demo": 1, "num_query": 14, "skills": [ "Object Recognition and Classification", "Scene and Event Understanding", "Commonsense and Social Reasoning" ], "input_format": "Photographs", "app": "Knowledge", "output_format": "structured_output", "num_input": "2-3 images" }, { "name": "rocks_samples_identify", "score": 0.2, "eval_type": "rule", "num_demo": 1, "num_query": 15, "skills": [ "Object Recognition and Classification", "Domain-Specific Knowledge and Skills" ], "input_format": "Photographs", "app": "Perception", "output_format": "contextual_formatted_text", "num_input": "4-5 images" }, { "name": "circuit_diagram_understanding", "score": 0.3333333333333333, "eval_type": "rule", "num_demo": 1, "num_query": 15, "skills": [ "Object Recognition and Classification", "Mathematical and Logical Reasoning", "Domain-Specific Knowledge and Skills" ], "input_format": "Diagrams and Data Visualizations", "app": "Science", "output_format": "numerical_data", "num_input": "1-image" }, { "name": "paper_vqa", "score": 0.6428571428571429, "eval_type": "rule", "num_demo": 1, "num_query": 14, "skills": [ "Text Recognition (OCR)", "Mathematical and Logical Reasoning" ], "input_format": "Text-Based Images and Documents", "app": "Perception", "output_format": "contextual_formatted_text", "num_input": "1-image" }, { "name": "product_ocr_qa", "score": 0.2857142857142857, "eval_type": "rule", "num_demo": 1, "num_query": 14, "skills": [ "Text Recognition (OCR)", "Language Understanding and Generation" ], "input_format": "Photographs", "app": "Information_Extraction", "output_format": "exact_text", "num_input": "1-image" }, { "name": "planning_screenshot_barman", "score": 0.0, "eval_type": "rule", "num_demo": 1, "num_query": 15, "skills": [ "Planning and Decision Making", "Spatial and Temporal Reasoning" ], "input_format": "User Interface Screenshots", "app": "Planning", "output_format": "structured_output", "num_input": "1-image" }, { "name": "counting", "score": 0.7142857142857143, "eval_type": "rule", "num_demo": 1, "num_query": 14, "skills": [ "Object Recognition and Classification", "Spatial and Temporal Reasoning" ], "input_format": "Diagrams and Data Visualizations", "app": "Perception", "output_format": "numerical_data", "num_input": "1-image" }, { "name": "TV_show_retrieval_by_character", "score": 0.9285714285714286, "eval_type": "rule", "num_demo": 1, "num_query": 14, "skills": [ "Object Recognition and Classification", "Text Recognition (OCR)", "Language Understanding and Generation" ], "input_format": "User Interface Screenshots", "app": "Information_Extraction", "output_format": "contextual_formatted_text", "num_input": "9-image or more" }, { "name": "planning_visual_barman", "score": 0.0, "eval_type": "rule", "num_demo": 1, "num_query": 15, "skills": [ "Planning and Decision Making", "Spatial and Temporal Reasoning", "Object Recognition and Classification" ], "input_format": "Diagrams and Data Visualizations", "app": "Planning", "output_format": "structured_output", "num_input": "2-3 images" }, { "name": "multiview_reasoning_camera_moving", "score": 0.5714285714285714, "eval_type": "rule", "num_demo": 1, "num_query": 14, "skills": [ "Spatial and Temporal Reasoning", "Scene and Event Understanding" ], "input_format": "Photographs", "app": "Perception", "output_format": "exact_text", "num_input": "2-3 images" }, { "name": "visual_correspondance_in_two_images", "score": 0.7857142857142857, "eval_type": "rule", "num_demo": 1, "num_query": 14, "skills": [ "Object Recognition and Classification", "Scene and Event Understanding" ], "input_format": "Photographs", "app": "Perception", "output_format": "structured_output", "num_input": "2-3 images" }, { "name": "music_sheet_note_count", "score": 0.0, "eval_type": "rule", "num_demo": 1, "num_query": 17, "skills": [ "Object Recognition and Classification", "Text Recognition (OCR)", "Mathematical and Logical Reasoning" ], "input_format": "Text-Based Images and Documents", "app": "Knowledge", "output_format": "numerical_data", "num_input": "1-image" }, { "name": "video_segments_reordering", "score": 0.07142857142857142, "eval_type": "rule", "num_demo": 1, "num_query": 14, "skills": [ "Spatial and Temporal Reasoning", "Scene and Event Understanding" ], "input_format": "Videos", "app": "Perception", "output_format": "structured_output", "num_input": "video" }, { "name": "code_match_problem", "score": 0.9285714285714286, "eval_type": "rule", "num_demo": 1, "num_query": 14, "skills": [ "Text Recognition (OCR)", "Domain-Specific Knowledge and Skills" ], "input_format": "Text-Based Images and Documents", "app": "Coding", "output_format": "exact_text", "num_input": "4-5 images" }, { "name": "media_recommend_solutions_stackoverflow", "score": 0.8571428571428571, "eval_type": "rule", "num_demo": 1, "num_query": 14, "skills": [ "Language Understanding and Generation", "Domain-Specific Knowledge and Skills" ], "input_format": "User Interface Screenshots", "app": "Coding", "output_format": "multiple_choice", "num_input": "4-5 images" }, { "name": "code_translation_advanced", "score": 0.07142857142857142, "eval_type": "rule", "num_demo": 1, "num_query": 14, "skills": [ "Text Recognition (OCR)", "Domain-Specific Knowledge and Skills", "Language Understanding and Generation" ], "input_format": "Text-Based Images and Documents", "app": "Coding", "output_format": "structured_output", "num_input": "4-5 images" }, { "name": "planning_visual_floortile", "score": 0.0, "eval_type": "rule", "num_demo": 1, "num_query": 15, "skills": [ "Planning and Decision Making", "Spatial and Temporal Reasoning" ], "input_format": "Diagrams and Data Visualizations", "app": "Planning", "output_format": "structured_output", "num_input": "2-3 images" }, { "name": "signage_navigation", "score": 0.8666666666666667, "eval_type": "rule", "num_demo": 1, "num_query": 15, "skills": [ "Object Recognition and Classification", "Text Recognition (OCR)", "Spatial and Temporal Reasoning" ], "input_format": "Photographs", "app": "Knowledge", "output_format": "exact_text", "num_input": "1-image" }, { "name": "google_streetview_circle_reasoning", "score": 0.3333333333333333, "eval_type": "rule", "num_demo": 1, "num_query": 15, "skills": [ "Spatial and Temporal Reasoning", "Scene and Event Understanding" ], "input_format": "Photographs", "app": "Perception", "output_format": "multiple_choice", "num_input": "9-image or more" }, { "name": "media_QA_web_stackoverflow", "score": 0.6666666666666667, "eval_type": "rule", "num_demo": 1, "num_query": 14, "skills": [ "Text Recognition (OCR)", "Domain-Specific Knowledge and Skills", "Mathematical and Logical Reasoning" ], "input_format": "User Interface Screenshots", "app": "Coding", "output_format": "structured_output", "num_input": "1-image" }, { "name": "vln_english_next_step", "score": 0.0, "eval_type": "rule", "num_demo": 1, "num_query": 15, "skills": [ "Scene and Event Understanding", "Spatial and Temporal Reasoning", "Language Understanding and Generation" ], "input_format": "Photographs", "app": "Planning", "output_format": "contextual_formatted_text", "num_input": "9-image or more" }, { "name": "sign_language", "score": 0.14285714285714285, "eval_type": "rule", "num_demo": 1, "num_query": 14, "skills": [ "Object Recognition and Classification", "Scene and Event Understanding", "Language Understanding and Generation" ], "input_format": "Videos", "app": "Knowledge", "output_format": "contextual_formatted_text", "num_input": "video" }, { "name": "video_grounding_spatial", "score": 0.42857142857142855, "eval_type": "rule", "num_demo": 1, "num_query": 14, "skills": [ "Object Recognition and Classification", "Scene and Event Understanding", "Spatial and Temporal Reasoning" ], "input_format": "Videos", "app": "Perception", "output_format": "contextual_formatted_text", "num_input": "video" }, { "name": "Ad_count_detection", "score": 0.42857142857142855, "eval_type": "rule", "num_demo": 1, "num_query": 14, "skills": [ "Object Recognition and Classification", "Scene and Event Understanding" ], "input_format": "User Interface Screenshots", "app": "Perception", "output_format": "numerical_data", "num_input": "1-image" }, { "name": "move_pos_to_pos_hanoi_4_pole", "score": 0.003968253968253968, "eval_type": "rule", "num_demo": 1, "num_query": 14, "skills": [ "Spatial and Temporal Reasoning", "Planning and Decision Making", "Mathematical and Logical Reasoning" ], "input_format": "Diagrams and Data Visualizations", "app": "Planning", "output_format": "structured_output", "num_input": "2-3 images" }, { "name": "geometry_reasoning_grid", "score": 0.9642857142857143, "eval_type": "rule", "num_demo": 1, "num_query": 14, "skills": [ "Text Recognition (OCR)", "Spatial and Temporal Reasoning", "Mathematical and Logical Reasoning" ], "input_format": "Diagrams and Data Visualizations", "app": "Mathematics", "output_format": "structured_output", "num_input": "1-image" }, { "name": "2d_image_jigsaw_puzzle_easy", "score": 0.3428571428571429, "eval_type": "rule", "num_demo": 1, "num_query": 14, "skills": [ "Object Recognition and Classification", "Spatial and Temporal Reasoning" ], "input_format": "Photographs", "app": "Perception", "output_format": "structured_output", "num_input": "2-3 images" }, { "name": "location_vqa", "score": 0.5714285714285714, "eval_type": "rule", "num_demo": 1, "num_query": 14, "skills": [ "Scene and Event Understanding", "Domain-Specific Knowledge and Skills" ], "input_format": "Photographs", "app": "Knowledge", "output_format": "exact_text", "num_input": "1-image" }, { "name": "flowchart_code_generation", "score": 0.6666666666666666, "eval_type": "rule", "num_demo": 1, "num_query": 9, "skills": [ "Mathematical and Logical Reasoning", "Language Understanding and Generation" ], "input_format": "Diagrams and Data Visualizations", "app": "Coding", "output_format": "multiple_choice", "num_input": "1-image" }, { "name": "medical_polyp_segmentation_single_object_rater", "score": 0.21428571428571427, "eval_type": "rule", "num_demo": 1, "num_query": 14, "skills": [ "Object Recognition and Classification", "Domain-Specific Knowledge and Skills" ], "input_format": "Text-Based Images and Documents", "app": "Science", "output_format": "structured_output", "num_input": "4-5 images" }, { "name": "geometry_reasoning_circled_letter", "score": 0.8214285714285714, "eval_type": "rule", "num_demo": 1, "num_query": 14, "skills": [ "Object Recognition and Classification", "Text Recognition (OCR)", "Spatial and Temporal Reasoning" ], "input_format": "Text-Based Images and Documents", "app": "Perception", "output_format": "structured_output", "num_input": "1-image" }, { "name": "song_title_identification_from_lyrics", "score": 0.5714285714285714, "eval_type": "rule", "num_demo": 1, "num_query": 14, "skills": [ "Text Recognition (OCR)", "Language Understanding and Generation" ], "input_format": "User Interface Screenshots", "app": "Knowledge", "output_format": "structured_output", "num_input": "1-image" }, { "name": "transit_map_intersection_points", "score": 0.7172619047619049, "eval_type": "rule", "num_demo": 1, "num_query": 14, "skills": [ "Object Recognition and Classification", "Spatial and Temporal Reasoning", "Text Recognition (OCR)" ], "input_format": "Diagrams and Data Visualizations", "app": "Perception", "output_format": "structured_output", "num_input": "1-image" }, { "name": "webpage_code_understanding", "score": 0.7777777777777778, "eval_type": "rule", "num_demo": 1, "num_query": 9, "skills": [ "Object Recognition and Classification", "Domain-Specific Knowledge and Skills", "Language Understanding and Generation" ], "input_format": "User Interface Screenshots", "app": "Coding", "output_format": "multiple_choice", "num_input": "1-image" }, { "name": "google_streetview_direction_understanding", "score": 0.6428571428571429, "eval_type": "rule", "num_demo": 1, "num_query": 14, "skills": [ "Scene and Event Understanding", "Spatial and Temporal Reasoning" ], "input_format": "Photographs", "app": "Perception", "output_format": "exact_text", "num_input": "2-3 images" }, { "name": "chess_find_legal_moves", "score": 0.06698805429719713, "eval_type": "rule", "num_demo": 1, "num_query": 14, "skills": [ "Object Recognition and Classification", "Spatial and Temporal Reasoning", "Planning and Decision Making" ], "input_format": "User Interface Screenshots", "app": "Planning", "output_format": "exact_text", "num_input": "1-image" }, { "name": "topological_sort", "score": 0.07142857142857142, "eval_type": "rule", "num_demo": 1, "num_query": 14, "skills": [ "Mathematical and Logical Reasoning", "Spatial and Temporal Reasoning" ], "input_format": "Diagrams and Data Visualizations", "app": "Mathematics", "output_format": "structured_output", "num_input": "1-image" }, { "name": "MMMU_pro_exam_screenshot", "score": 0.42424242424242425, "eval_type": "rule", "num_demo": 1, "num_query": 99, "skills": [ "Text Recognition (OCR)", "Mathematical and Logical Reasoning", "Domain-Specific Knowledge and Skills" ], "input_format": "User Interface Screenshots", "app": "Science", "output_format": "multiple_choice", "num_input": "1-image" }, { "name": "Forensic_Detection_of_different_images", "score": 0.2857142857142857, "eval_type": "rule", "num_demo": 1, "num_query": 14, "skills": [ "Object Recognition and Classification", "Scene and Event Understanding" ], "input_format": "Photographs", "app": "Knowledge", "output_format": "multiple_choice", "num_input": "4-5 images" }, { "name": "TRANCE_physics_reasoning_event", "score": 0.21428571428571427, "eval_type": "rule", "num_demo": 1, "num_query": 14, "skills": [ "Object Recognition and Classification", "Spatial and Temporal Reasoning", "Mathematical and Logical Reasoning" ], "input_format": "3D Models and Aerial Imagery", "app": "Perception", "output_format": "multiple_choice", "num_input": "2-3 images" }, { "name": "monthly_weather_days_count", "score": 0.35714285714285715, "eval_type": "rule", "num_demo": 1, "num_query": 14, "skills": [ "Object Recognition and Classification", "Text Recognition (OCR)" ], "input_format": "User Interface Screenshots", "app": "Perception", "output_format": "structured_output", "num_input": "1-image" }, { "name": "mindmap_elements_parsing", "score": 0.2857142857142857, "eval_type": "rule", "num_demo": 1, "num_query": 14, "skills": [ "Object Recognition and Classification", "Scene and Event Understanding", "Language Understanding and Generation" ], "input_format": "Artistic and Creative Content", "app": "Perception", "output_format": "structured_output", "num_input": "1-image" }, { "name": "interpret_force_perspective_illusion", "score": 1.0, "eval_type": "rule", "num_demo": 1, "num_query": 15, "skills": [ "Object Recognition and Classification", "Spatial and Temporal Reasoning", "Scene and Event Understanding" ], "input_format": "Photographs", "app": "Knowledge", "output_format": "exact_text", "num_input": "1-image" }, { "name": "code_solution_compare", "score": 0.35714285714285715, "eval_type": "rule", "num_demo": 1, "num_query": 14, "skills": [ "Mathematical and Logical Reasoning", "Language Understanding and Generation", "Text Recognition (OCR)" ], "input_format": "User Interface Screenshots", "app": "Coding", "output_format": "exact_text", "num_input": "2-3 images" }, { "name": "functionality_matching_in_different_objects", "score": 0.5357142857142857, "eval_type": "rule", "num_demo": 1, "num_query": 14, "skills": [ "Object Recognition and Classification", "Spatial and Temporal Reasoning" ], "input_format": "Photographs", "app": "Perception", "output_format": "structured_output", "num_input": "2-3 images" }, { "name": "annoying_word_search", "score": 0.0035714285714285718, "eval_type": "rule", "num_demo": 1, "num_query": 14, "skills": [ "Text Recognition (OCR)", "Spatial and Temporal Reasoning" ], "input_format": "Text-Based Images and Documents", "app": "Planning", "output_format": "contextual_formatted_text", "num_input": "1-image" }, { "name": "planning_visual_storage", "score": 0.0, "eval_type": "rule", "num_demo": 1, "num_query": 15, "skills": [ "Planning and Decision Making", "Spatial and Temporal Reasoning" ], "input_format": "Diagrams and Data Visualizations", "app": "Planning", "output_format": "structured_output", "num_input": "1-image" }, { "name": "google_streetview_line_reasoning", "score": 0.4, "eval_type": "rule", "num_demo": 1, "num_query": 15, "skills": [ "Scene and Event Understanding", "Spatial and Temporal Reasoning" ], "input_format": "Photographs", "app": "Perception", "output_format": "multiple_choice", "num_input": "9-image or more" }, { "name": "geometry_reasoning_nested_squares", "score": 0.6428571428571429, "eval_type": "rule", "num_demo": 1, "num_query": 14, "skills": [ "Object Recognition and Classification", "Spatial and Temporal Reasoning", "Mathematical and Logical Reasoning" ], "input_format": "Diagrams and Data Visualizations", "app": "Mathematics", "output_format": "structured_output", "num_input": "1-image" }, { "name": "polygon_interior_angles", "score": 0.0, "eval_type": "rule", "num_demo": 1, "num_query": 14, "skills": [ "Object Recognition and Classification", "Mathematical and Logical Reasoning", "Spatial and Temporal Reasoning" ], "input_format": "Diagrams and Data Visualizations", "app": "Mathematics", "output_format": "numerical_data", "num_input": "1-image" }, { "name": "ancient_map_understanding", "score": 0.7142857142857143, "eval_type": "rule", "num_demo": 1, "num_query": 14, "skills": [ "Object Recognition and Classification", "Text Recognition (OCR)", "Domain-Specific Knowledge and Skills" ], "input_format": "Diagrams and Data Visualizations", "app": "Knowledge", "output_format": "exact_text", "num_input": "2-3 images" }, { "name": "GUI_Act_Web_Multi", "score": 0.4642857142857143, "eval_type": "rule", "num_demo": 1, "num_query": 14, "skills": [ "Object Recognition and Classification", "Planning and Decision Making" ], "input_format": "User Interface Screenshots", "app": "Planning", "output_format": "structured_output", "num_input": "1-image" }, { "name": "rocks_samples_compare", "score": 0.5, "eval_type": "rule", "num_demo": 1, "num_query": 14, "skills": [ "Object Recognition and Classification", "Domain-Specific Knowledge and Skills" ], "input_format": "Photographs", "app": "Knowledge", "output_format": "contextual_formatted_text", "num_input": "2-3 images" }, { "name": "mensa_iq_test", "score": 0.5495098039215687, "eval_type": "rule", "num_demo": 1, "num_query": 17, "skills": [ "Object Recognition and Classification", "Mathematical and Logical Reasoning" ], "input_format": "Diagrams and Data Visualizations", "app": "Planning", "output_format": "structured_output", "num_input": "1-image" }, { "name": "top_video_creator_identification", "score": 0.8571428571428571, "eval_type": "rule", "num_demo": 1, "num_query": 14, "skills": [ "Object Recognition and Classification", "Text Recognition (OCR)", "Scene and Event Understanding" ], "input_format": "User Interface Screenshots", "app": "Information_Extraction", "output_format": "exact_text", "num_input": "1-image" }, { "name": "logical_reasoning_2D_views_of_3D_shapes", "score": 0.21428571428571427, "eval_type": "rule", "num_demo": 1, "num_query": 14, "skills": [ "Spatial and Temporal Reasoning", "Mathematical and Logical Reasoning" ], "input_format": "Diagrams and Data Visualizations", "app": "Planning", "output_format": "structured_output", "num_input": "4-5 images" }, { "name": "music_sheet_sentiment", "score": 0.2857142857142857, "eval_type": "rule", "num_demo": 1, "num_query": 14, "skills": [ "Object Recognition and Classification", "Commonsense and Social Reasoning" ], "input_format": "Text-Based Images and Documents", "app": "Knowledge", "output_format": "exact_text", "num_input": "4-5 images" }, { "name": "vln_tegulu_next_step", "score": 0.06666666666666667, "eval_type": "rule", "num_demo": 1, "num_query": 15, "skills": [ "Object Recognition and Classification", "Scene and Event Understanding", "Spatial and Temporal Reasoning", "Language Understanding and Generation" ], "input_format": "Photographs", "app": "Planning", "output_format": "structured_output", "num_input": "9-image or more" }, { "name": "paper_review_rating", "score": 0.6543300312736264, "eval_type": "rule", "num_demo": 1, "num_query": 15, "skills": [ "Object Recognition and Classification", "Text Recognition (OCR)", "Language Understanding and Generation", "Domain-Specific Knowledge and Skills" ], "input_format": "Text-Based Images and Documents", "app": "Metrics", "output_format": "numerical_data", "num_input": "4-5 images" }, { "name": "video_camera_motion_description", "score": 0.21428571428571427, "eval_type": "rule", "num_demo": 1, "num_query": 14, "skills": [ "Scene and Event Understanding", "Spatial and Temporal Reasoning" ], "input_format": "Videos", "app": "Perception", "output_format": "exact_text", "num_input": "video" }, { "name": "play_go_capture_stone", "score": 0.26666666666666666, "eval_type": "rule", "num_demo": 1, "num_query": 15, "skills": [ "Spatial and Temporal Reasoning", "Planning and Decision Making" ], "input_format": "Artistic and Creative Content", "app": "Planning", "output_format": "exact_text", "num_input": "1-image" }, { "name": "medical_multi_organ_segmentation_rater", "score": 0.42857142857142855, "eval_type": "rule", "num_demo": 1, "num_query": 14, "skills": [ "Object Recognition and Classification", "Domain-Specific Knowledge and Skills" ], "input_format": "Text-Based Images and Documents", "app": "Science", "output_format": "multiple_choice", "num_input": "1-image" }, { "name": "ascii_art_understanding", "score": 0.35714285714285715, "eval_type": "rule", "num_demo": 1, "num_query": 14, "skills": [ "Object Recognition and Classification", "Language Understanding and Generation" ], "input_format": "Text-Based Images and Documents", "app": "Perception", "output_format": "multiple_choice", "num_input": "1-image" }, { "name": "GUI_Act_Web_Single", "score": 0.07140372068949602, "eval_type": "rule", "num_demo": 1, "num_query": 14, "skills": [ "Object Recognition and Classification", "Planning and Decision Making" ], "input_format": "User Interface Screenshots", "app": "Planning", "output_format": "numerical_data", "num_input": "1-image" }, { "name": "video_grounding_temporal", "score": 0.13333333333333333, "eval_type": "rule", "num_demo": 1, "num_query": 15, "skills": [ "Scene and Event Understanding", "Spatial and Temporal Reasoning" ], "input_format": "Videos", "app": "Perception", "output_format": "multiple_choice", "num_input": "video" }, { "name": "planning_screenshot_grippers", "score": 0.7333333333333333, "eval_type": "rule", "num_demo": 1, "num_query": 15, "skills": [ "Planning and Decision Making", "Spatial and Temporal Reasoning" ], "input_format": "Text-Based Images and Documents", "app": "Planning", "output_format": "structured_output", "num_input": "1-image" }, { "name": "video_intent_recognition", "score": 0.5714285714285714, "eval_type": "rule", "num_demo": 1, "num_query": 14, "skills": [ "Scene and Event Understanding", "Language Understanding and Generation", "Commonsense and Social Reasoning" ], "input_format": "Videos", "app": "Knowledge", "output_format": "contextual_formatted_text", "num_input": "video" }, { "name": "counting_multi_image", "score": 0.6428571428571429, "eval_type": "rule", "num_demo": 1, "num_query": 14, "skills": [ "Object Recognition and Classification", "Scene and Event Understanding" ], "input_format": "Photographs", "app": "Perception", "output_format": "numerical_data", "num_input": "2-3 images" }, { "name": "calendar_schedule_suggestion", "score": 0.6428571428571429, "eval_type": "rule", "num_demo": 1, "num_query": 14, "skills": [ "Spatial and Temporal Reasoning", "Planning and Decision Making" ], "input_format": "User Interface Screenshots", "app": "Planning", "output_format": "contextual_formatted_text", "num_input": "1-image" }, { "name": "chinese_idiom_recognition", "score": 0.21428571428571427, "eval_type": "rule", "num_demo": 1, "num_query": 14, "skills": [ "Object Recognition and Classification", "Scene and Event Understanding", "Language Understanding and Generation", "Commonsense and Social Reasoning" ], "input_format": "Artistic and Creative Content", "app": "Knowledge", "output_format": "exact_text", "num_input": "1-image" }, { "name": "web_action_prediction", "score": 0.7857142857142857, "eval_type": "rule", "num_demo": 1, "num_query": 14, "skills": [ "Object Recognition and Classification", "Spatial and Temporal Reasoning", "Language Understanding and Generation" ], "input_format": "User Interface Screenshots", "app": "Perception", "output_format": "multiple_choice", "num_input": "1-image" }, { "name": "video_action_recognition", "score": 0.8214285714285714, "eval_type": "rule", "num_demo": 1, "num_query": 14, "skills": [ "Scene and Event Understanding", "Spatial and Temporal Reasoning" ], "input_format": "Videos", "app": "Perception", "output_format": "structured_output", "num_input": "video" }, { "name": "highest_discount_game_price_identification", "score": 0.9285714285714286, "eval_type": "rule", "num_demo": 1, "num_query": 14, "skills": [ "Object Recognition and Classification", "Text Recognition (OCR)", "Mathematical and Logical Reasoning" ], "input_format": "User Interface Screenshots", "app": "Information_Extraction", "output_format": "numerical_data", "num_input": "1-image" }, { "name": "GUI_Act_Mobile_swipe", "score": 0.5487385867546344, "eval_type": "rule", "num_demo": 1, "num_query": 14, "skills": [ "Spatial and Temporal Reasoning", "Planning and Decision Making" ], "input_format": "User Interface Screenshots", "app": "Planning", "output_format": "structured_output", "num_input": "1-image" }, { "name": "video_eval_factual_pref", "score": 0.7857142857142857, "eval_type": "rule", "num_demo": 1, "num_query": 14, "skills": [ "Scene and Event Understanding", "Commonsense and Social Reasoning", "Ethical and Safety Reasoning" ], "input_format": "Videos", "app": "Metrics", "output_format": "multiple_choice", "num_input": "video" }, { "name": "road_map_find_highway_between_two_place", "score": 0.8235294117647058, "eval_type": "rule", "num_demo": 1, "num_query": 17, "skills": [ "Object Recognition and Classification", "Spatial and Temporal Reasoning" ], "input_format": "Diagrams and Data Visualizations", "app": "Perception", "output_format": "exact_text", "num_input": "1-image" }, { "name": "code_execution", "score": 0.5, "eval_type": "rule", "num_demo": 1, "num_query": 16, "skills": [ "Mathematical and Logical Reasoning", "Domain-Specific Knowledge and Skills" ], "input_format": "Text-Based Images and Documents", "app": "Coding", "output_format": "contextual_formatted_text", "num_input": "1-image" }, { "name": "code_translation_easy", "score": 0.7142857142857143, "eval_type": "rule", "num_demo": 1, "num_query": 14, "skills": [ "Text Recognition (OCR)", "Domain-Specific Knowledge and Skills" ], "input_format": "Text-Based Images and Documents", "app": "Coding", "output_format": "structured_output", "num_input": "2-3 images" }, { "name": "music_sheet_format_QA", "score": 0.6428571428571429, "eval_type": "rule", "num_demo": 1, "num_query": 14, "skills": [ "Object Recognition and Classification", "Text Recognition (OCR)" ], "input_format": "Text-Based Images and Documents", "app": "Knowledge", "output_format": "numerical_data", "num_input": "1-image" }, { "name": "position_relationship", "score": 0.4666666666666667, "eval_type": "rule", "num_demo": 1, "num_query": 15, "skills": [ "Spatial and Temporal Reasoning", "Mathematical and Logical Reasoning" ], "input_format": "Diagrams and Data Visualizations", "app": "Perception", "output_format": "exact_text", "num_input": "1-image" }, { "name": "booking_web_recommendation", "score": 0.7624716553287981, "eval_type": "rule", "num_demo": 1, "num_query": 14, "skills": [ "Object Recognition and Classification", "Scene and Event Understanding", "Text Recognition (OCR)" ], "input_format": "User Interface Screenshots", "app": "Information_Extraction", "output_format": "contextual_formatted_text", "num_input": "9-image or more" }, { "name": "planning_screenshot_termes", "score": 0.0, "eval_type": "rule", "num_demo": 1, "num_query": 15, "skills": [ "Planning and Decision Making", "Spatial and Temporal Reasoning" ], "input_format": "User Interface Screenshots", "app": "Planning", "output_format": "structured_output", "num_input": "1-image" }, { "name": "music_sheet_author", "score": 0.5, "eval_type": "rule", "num_demo": 1, "num_query": 16, "skills": [ "Object Recognition and Classification", "Domain-Specific Knowledge and Skills" ], "input_format": "Text-Based Images and Documents", "app": "Knowledge", "output_format": "exact_text", "num_input": "2-3 images" }, { "name": "actor_recognition_in_Movie", "score": 0.14285714285714285, "eval_type": "rule", "num_demo": 1, "num_query": 14, "skills": [ "Object Recognition and Classification", "Language Understanding and Generation" ], "input_format": "User Interface Screenshots", "app": "Knowledge", "output_format": "exact_text", "num_input": "1-image" }, { "name": "font_recognition", "score": 0.07142857142857142, "eval_type": "rule", "num_demo": 1, "num_query": 14, "skills": [ "Object Recognition and Classification", "Text Recognition (OCR)" ], "input_format": "Text-Based Images and Documents", "app": "Knowledge", "output_format": "exact_text", "num_input": "1-image" }, { "name": "extract_webpage_headline", "score": 0.5714285714285714, "eval_type": "rule", "num_demo": 1, "num_query": 14, "skills": [ "Text Recognition (OCR)" ], "input_format": "User Interface Screenshots", "app": "Perception", "output_format": "contextual_formatted_text", "num_input": "1-image" }, { "name": "hashtag_recommendation", "score": 0.9583333333333333, "eval_type": "rule", "num_demo": 1, "num_query": 14, "skills": [ "Object Recognition and Classification", "Language Understanding and Generation", "Scene and Event Understanding" ], "input_format": "Photographs", "app": "Knowledge", "output_format": "structured_output", "num_input": "1-image" }, { "name": "Bongard_Problem", "score": 0.3157894736842105, "eval_type": "rule", "num_demo": 1, "num_query": 19, "skills": [ "Object Recognition and Classification", "Mathematical and Logical Reasoning" ], "input_format": "Diagrams and Data Visualizations", "app": "Planning", "output_format": "contextual_formatted_text", "num_input": "1-image" }, { "name": "relative_reflectance_of_different_regions", "score": 0.35714285714285715, "eval_type": "rule", "num_demo": 1, "num_query": 14, "skills": [ "Object Recognition and Classification", "Scene and Event Understanding" ], "input_format": "Photographs", "app": "Perception", "output_format": "structured_output", "num_input": "1-image" }, { "name": "ti_fused_vqa_physics", "score": 0.2857142857142857, "eval_type": "rule", "num_demo": 1, "num_query": 14, "skills": [ "Mathematical and Logical Reasoning", "Domain-Specific Knowledge and Skills" ], "input_format": "Diagrams and Data Visualizations", "app": "Science", "output_format": "contextual_formatted_text", "num_input": "1-image" }, { "name": "coco_ood_global_image_retrieval_by_query_property", "score": 0.8601190476190477, "eval_type": "rule", "num_demo": 1, "num_query": 14, "skills": [ "Object Recognition and Classification", "Scene and Event Understanding" ], "input_format": "Artistic and Creative Content", "app": "Information_Extraction", "output_format": "structured_output", "num_input": "9-image or more" }, { "name": "multilingual_movie_info_parsing", "score": 0.7448979591836732, "eval_type": "rule", "num_demo": 1, "num_query": 14, "skills": [ "Text Recognition (OCR)", "Language Understanding and Generation" ], "input_format": "User Interface Screenshots", "app": "Information_Extraction", "output_format": "structured_output", "num_input": "1-image" }, { "name": "planning_screenshot_floortile", "score": 0.0, "eval_type": "rule", "num_demo": 1, "num_query": 15, "skills": [ "Planning and Decision Making", "Spatial and Temporal Reasoning" ], "input_format": "User Interface Screenshots", "app": "Planning", "output_format": "structured_output", "num_input": "1-image" }, { "name": "cheapest_flight_identification", "score": 0.5, "eval_type": "rule", "num_demo": 1, "num_query": 14, "skills": [ "Object Recognition and Classification", "Text Recognition (OCR)" ], "input_format": "User Interface Screenshots", "app": "Information_Extraction", "output_format": "contextual_formatted_text", "num_input": "6-8 images" }, { "name": "planning_screenshot_blocksworld", "score": 0.3333333333333333, "eval_type": "rule", "num_demo": 1, "num_query": 15, "skills": [ "Planning and Decision Making", "Spatial and Temporal Reasoning" ], "input_format": "Text-Based Images and Documents", "app": "Planning", "output_format": "structured_output", "num_input": "1-image" }, { "name": "soccer_offside", "score": 0.2222222222222222, "eval_type": "rule", "num_demo": 1, "num_query": 9, "skills": [ "Object Recognition and Classification", "Spatial and Temporal Reasoning", "Scene and Event Understanding" ], "input_format": "Photographs", "app": "Knowledge", "output_format": "multiple_choice", "num_input": "1-image" }, { "name": "electricity_load_estimate_plot", "score": 0.6787142857142856, "eval_type": "rule", "num_demo": 1, "num_query": 14, "skills": [ "Mathematical and Logical Reasoning", "Spatial and Temporal Reasoning" ], "input_format": "Diagrams and Data Visualizations", "app": "Perception", "output_format": "numerical_data", "num_input": "1-image" }, { "name": "pmc_vqa_medical_image_qa", "score": 0.7894736842105263, "eval_type": "rule", "num_demo": 1, "num_query": 19, "skills": [ "Object Recognition and Classification", "Domain-Specific Knowledge and Skills" ], "input_format": "Diagrams and Data Visualizations", "app": "Science", "output_format": "contextual_formatted_text", "num_input": "1-image" }, { "name": "waldo", "score": 0.0, "eval_type": "rule", "num_demo": 1, "num_query": 18, "skills": [ "Object Recognition and Classification", "Spatial and Temporal Reasoning" ], "input_format": "Artistic and Creative Content", "app": "Perception", "output_format": "structured_output", "num_input": "2-3 images" }, { "name": "number_comparison", "score": 1.0, "eval_type": "rule", "num_demo": 1, "num_query": 14, "skills": [ "Mathematical and Logical Reasoning" ], "input_format": "Text-Based Images and Documents", "app": "Mathematics", "output_format": "numerical_data", "num_input": "1-image" }, { "name": "video_to_camera_trajectory_retrieval", "score": 0.42857142857142855, "eval_type": "rule", "num_demo": 1, "num_query": 14, "skills": [ "Spatial and Temporal Reasoning", "Scene and Event Understanding" ], "input_format": "Videos", "app": "Perception", "output_format": "multiple_choice", "num_input": "video" }, { "name": "medical_keywords_based_retrieval_non_radiology", "score": 1.0, "eval_type": "rule", "num_demo": 1, "num_query": 14, "skills": [ "Object Recognition and Classification", "Domain-Specific Knowledge and Skills" ], "input_format": "Photographs", "app": "Science", "output_format": "exact_text", "num_input": "4-5 images" }, { "name": "movie_info_parsing", "score": 0.7321428571428571, "eval_type": "rule", "num_demo": 1, "num_query": 14, "skills": [ "Text Recognition (OCR)", "Object Recognition and Classification" ], "input_format": "User Interface Screenshots", "app": "Information_Extraction", "output_format": "structured_output", "num_input": "1-image" }, { "name": "paper_review_acceptance", "score": 0.4666666666666667, "eval_type": "rule", "num_demo": 1, "num_query": 15, "skills": [ "Object Recognition and Classification", "Text Recognition (OCR)", "Language Understanding and Generation", "Domain-Specific Knowledge and Skills" ], "input_format": "Text-Based Images and Documents", "app": "Metrics", "output_format": "exact_text", "num_input": "4-5 images" }, { "name": "code_programming_test_easy", "score": 0.5416666666666666, "eval_type": "rule", "num_demo": 1, "num_query": 24, "skills": [ "Text Recognition (OCR)", "Domain-Specific Knowledge and Skills" ], "input_format": "Text-Based Images and Documents", "app": "Coding", "output_format": "structured_output", "num_input": "1-image" }, { "name": "planning_screenshot_storage", "score": 0.13333333333333333, "eval_type": "rule", "num_demo": 1, "num_query": 15, "skills": [ "Planning and Decision Making", "Spatial and Temporal Reasoning" ], "input_format": "Text-Based Images and Documents", "app": "Planning", "output_format": "structured_output", "num_input": "1-image" }, { "name": "perception_test_object_shuffle_video", "score": 0.1875, "eval_type": "rule", "num_demo": 1, "num_query": 16, "skills": [ "Object Recognition and Classification", "Spatial and Temporal Reasoning", "Scene and Event Understanding" ], "input_format": "Videos", "app": "Planning", "output_format": "multiple_choice", "num_input": "video" }, { "name": "insect_order_classification", "score": 0.13333333333333333, "eval_type": "rule", "num_demo": 1, "num_query": 15, "skills": [ "Object Recognition and Classification", "Domain-Specific Knowledge and Skills" ], "input_format": "Photographs", "app": "Knowledge", "output_format": "contextual_formatted_text", "num_input": "1-image" }, { "name": "face_identity_matching", "score": 0.5333333333333333, "eval_type": "rule", "num_demo": 1, "num_query": 15, "skills": [ "Object Recognition and Classification", "Scene and Event Understanding" ], "input_format": "Photographs", "app": "Perception", "output_format": "numerical_data", "num_input": "4-5 images" }, { "name": "scibench_calculus_wo_solution", "score": 0.22448979591836735, "eval_type": "rule", "num_demo": 1, "num_query": 49, "skills": [ "Mathematical and Logical Reasoning", "Spatial and Temporal Reasoning" ], "input_format": "Diagrams and Data Visualizations", "app": "Mathematics", "output_format": "numerical_data", "num_input": "1-image" }, { "name": "graph_shortest_path_kamada_kawai", "score": 0.2857142857142857, "eval_type": "rule", "num_demo": 1, "num_query": 14, "skills": [ "Mathematical and Logical Reasoning", "Spatial and Temporal Reasoning" ], "input_format": "Diagrams and Data Visualizations", "app": "Mathematics", "output_format": "numerical_data", "num_input": "1-image" }, { "name": "medical_parasite_detection", "score": 0.6428571428571429, "eval_type": "rule", "num_demo": 1, "num_query": 14, "skills": [ "Object Recognition and Classification", "Domain-Specific Knowledge and Skills" ], "input_format": "Photographs", "app": "Science", "output_format": "structured_output", "num_input": "6-8 images" }, { "name": "places365_similar_scene_retrieval", "score": 0.7142857142857143, "eval_type": "rule", "num_demo": 1, "num_query": 14, "skills": [ "Scene and Event Understanding", "Object Recognition and Classification" ], "input_format": "Photographs", "app": "Information_Extraction", "output_format": "multiple_choice", "num_input": "9-image or more" }, { "name": "mahjong", "score": 0.06666666666666667, "eval_type": "rule", "num_demo": 1, "num_query": 15, "skills": [ "Object Recognition and Classification", "Mathematical and Logical Reasoning" ], "input_format": "Photographs", "app": "Planning", "output_format": "exact_text", "num_input": "1-image" }, { "name": "multi_load_type_prediction_from_plot", "score": 0.46428571428571425, "eval_type": "rule", "num_demo": 1, "num_query": 14, "skills": [ "Object Recognition and Classification", "Mathematical and Logical Reasoning" ], "input_format": "Diagrams and Data Visualizations", "app": "Perception", "output_format": "multiple_choice", "num_input": "6-8 images" }, { "name": "code_visualization_output_understanding", "score": 0.6, "eval_type": "rule", "num_demo": 1, "num_query": 10, "skills": [ "Object Recognition and Classification", "Mathematical and Logical Reasoning" ], "input_format": "Diagrams and Data Visualizations", "app": "Coding", "output_format": "multiple_choice", "num_input": "4-5 images" }, { "name": "chess_sygyzy_endgames", "score": 0.09714285714285713, "eval_type": "rule", "num_demo": 1, "num_query": 14, "skills": [ "Planning and Decision Making", "Mathematical and Logical Reasoning" ], "input_format": "User Interface Screenshots", "app": "Planning", "output_format": "exact_text", "num_input": "1-image" }, { "name": "human_relationship_reasoning", "score": 1.0, "eval_type": "rule", "num_demo": 1, "num_query": 16, "skills": [ "Commonsense and Social Reasoning", "Object Recognition and Classification" ], "input_format": "Photographs", "app": "Knowledge", "output_format": "contextual_formatted_text", "num_input": "1-image" }, { "name": "Movie_retrieval_by_actor", "score": 0.7857142857142857, "eval_type": "rule", "num_demo": 1, "num_query": 14, "skills": [ "Object Recognition and Classification", "Language Understanding and Generation" ], "input_format": "User Interface Screenshots", "app": "Information_Extraction", "output_format": "contextual_formatted_text", "num_input": "9-image or more" }, { "name": "rebus", "score": 0.5217391304347826, "eval_type": "rule", "num_demo": 1, "num_query": 23, "skills": [ "Text Recognition (OCR)", "Language Understanding and Generation", "Commonsense and Social Reasoning" ], "input_format": "Text-Based Images and Documents", "app": "Planning", "output_format": "contextual_formatted_text", "num_input": "1-image" }, { "name": "muma_theory_of_mind_social_goal", "score": 0.4666666666666667, "eval_type": "rule", "num_demo": 1, "num_query": 15, "skills": [ "Scene and Event Understanding", "Commonsense and Social Reasoning", "Spatial and Temporal Reasoning" ], "input_format": "Videos", "app": "Knowledge", "output_format": "contextual_formatted_text", "num_input": "video" }, { "name": "graph_shortest_path_planar", "score": 0.2857142857142857, "eval_type": "rule", "num_demo": 1, "num_query": 14, "skills": [ "Spatial and Temporal Reasoning", "Mathematical and Logical Reasoning" ], "input_format": "Diagrams and Data Visualizations", "app": "Mathematics", "output_format": "numerical_data", "num_input": "1-image" }, { "name": "comic_page_ordering", "score": 0.07142857142857142, "eval_type": "rule", "num_demo": 1, "num_query": 14, "skills": [ "Scene and Event Understanding", "Spatial and Temporal Reasoning", "Language Understanding and Generation" ], "input_format": "Artistic and Creative Content", "app": "Planning", "output_format": "contextual_formatted_text", "num_input": "6-8 images" }, { "name": "iconqa", "score": 0.5263157894736842, "eval_type": "rule", "num_demo": 1, "num_query": 19, "skills": [ "Object Recognition and Classification", "Mathematical and Logical Reasoning" ], "input_format": "Artistic and Creative Content", "app": "Mathematics", "output_format": "numerical_data", "num_input": "1-image" }, { "name": "LaTeX_complex_formula_convertion", "score": 0.23529411764705882, "eval_type": "rule", "num_demo": 1, "num_query": 17, "skills": [ "Text Recognition (OCR)", "Mathematical and Logical Reasoning" ], "input_format": "Text-Based Images and Documents", "app": "Coding", "output_format": "structured_output", "num_input": "1-image" }, { "name": "stock_info_parsing", "score": 0.9747899159663866, "eval_type": "rule", "num_demo": 1, "num_query": 14, "skills": [ "Text Recognition (OCR)", "Domain-Specific Knowledge and Skills" ], "input_format": "User Interface Screenshots", "app": "Information_Extraction", "output_format": "structured_output", "num_input": "1-image" }, { "name": "icon_arithmetic_puzzle", "score": 0.9285714285714286, "eval_type": "rule", "num_demo": 1, "num_query": 14, "skills": [ "Object Recognition and Classification", "Mathematical and Logical Reasoning" ], "input_format": "Diagrams and Data Visualizations", "app": "Planning", "output_format": "structured_output", "num_input": "1-image" }, { "name": "stock_price_future_prediction", "score": 0.8250714285714286, "eval_type": "rule", "num_demo": 1, "num_query": 14, "skills": [ "Mathematical and Logical Reasoning", "Spatial and Temporal Reasoning" ], "input_format": "Diagrams and Data Visualizations", "app": "Perception", "output_format": "contextual_formatted_text", "num_input": "1-image" }, { "name": "clevrer_moving_direction_video", "score": 0.375, "eval_type": "rule", "num_demo": 1, "num_query": 16, "skills": [ "Object Recognition and Classification", "Spatial and Temporal Reasoning", "Scene and Event Understanding" ], "input_format": "Videos", "app": "Perception", "output_format": "contextual_formatted_text", "num_input": "video" }, { "name": "video_eval_dynamic_pref", "score": 0.875, "eval_type": "rule", "num_demo": 1, "num_query": 16, "skills": [ "Object Recognition and Classification", "Scene and Event Understanding", "Spatial and Temporal Reasoning" ], "input_format": "Videos", "app": "Metrics", "output_format": "multiple_choice", "num_input": "video" }, { "name": "geometry_length", "score": 0.35714285714285715, "eval_type": "rule", "num_demo": 1, "num_query": 14, "skills": [ "Object Recognition and Classification", "Mathematical and Logical Reasoning" ], "input_format": "Diagrams and Data Visualizations", "app": "Mathematics", "output_format": "contextual_formatted_text", "num_input": "1-image" }, { "name": "orchestra_score_recognition", "score": 0.25, "eval_type": "rule", "num_demo": 1, "num_query": 14, "skills": [ "Text Recognition (OCR)", "Domain-Specific Knowledge and Skills" ], "input_format": "Text-Based Images and Documents", "app": "Knowledge", "output_format": "structured_output", "num_input": "1-image" }, { "name": "3d_fragments_understanding", "score": 0.07142857142857142, "eval_type": "rule", "num_demo": 1, "num_query": 14, "skills": [ "Object Recognition and Classification", "Spatial and Temporal Reasoning" ], "input_format": "3D Models and Aerial Imagery", "app": "Perception", "output_format": "numerical_data", "num_input": "2-3 images" }, { "name": "web_action_grounding", "score": 0.7857142857142857, "eval_type": "rule", "num_demo": 1, "num_query": 14, "skills": [ "Object Recognition and Classification", "Language Understanding and Generation" ], "input_format": "User Interface Screenshots", "app": "Planning", "output_format": "multiple_choice", "num_input": "1-image" }, { "name": "vizwiz_quality_accessment_for_blind", "score": 0.2857142857142857, "eval_type": "rule", "num_demo": 1, "num_query": 14, "skills": [ "Object Recognition and Classification", "Scene and Event Understanding" ], "input_format": "Photographs", "app": "Metrics", "output_format": "contextual_formatted_text", "num_input": "6-8 images" }, { "name": "logical_reasoning_2d_folding", "score": 0.07142857142857142, "eval_type": "rule", "num_demo": 1, "num_query": 14, "skills": [ "Spatial and Temporal Reasoning", "Mathematical and Logical Reasoning" ], "input_format": "Diagrams and Data Visualizations", "app": "Planning", "output_format": "multiple_choice", "num_input": "4-5 images" }, { "name": "traffic_future_prediction_from_line_plot", "score": 0.6207368421052633, "eval_type": "rule", "num_demo": 1, "num_query": 19, "skills": [ "Spatial and Temporal Reasoning", "Mathematical and Logical Reasoning" ], "input_format": "Diagrams and Data Visualizations", "app": "Perception", "output_format": "numerical_data", "num_input": "1-image" }, { "name": "ti_fused_vqa_biology", "score": 0.42857142857142855, "eval_type": "rule", "num_demo": 1, "num_query": 14, "skills": [ "Object Recognition and Classification", "Domain-Specific Knowledge and Skills" ], "input_format": "Diagrams and Data Visualizations", "app": "Science", "output_format": "contextual_formatted_text", "num_input": "1-image" }, { "name": "vln_hindi_next_step", "score": 0.0, "eval_type": "rule", "num_demo": 1, "num_query": 15, "skills": [ "Object Recognition and Classification", "Scene and Event Understanding", "Spatial and Temporal Reasoning", "Language Understanding and Generation" ], "input_format": "Photographs", "app": "Planning", "output_format": "contextual_formatted_text", "num_input": "9-image or more" }, { "name": "remaining_playback_time_calculation", "score": 0.21428571428571427, "eval_type": "rule", "num_demo": 1, "num_query": 14, "skills": [ "Object Recognition and Classification", "Text Recognition (OCR)", "Mathematical and Logical Reasoning" ], "input_format": "User Interface Screenshots", "app": "Information_Extraction", "output_format": "exact_text", "num_input": "1-image" }, { "name": "muma_theory_of_mind_belief_of_goal", "score": 0.6666666666666666, "eval_type": "rule", "num_demo": 1, "num_query": 15, "skills": [ "Scene and Event Understanding", "Commonsense and Social Reasoning", "Language Understanding and Generation" ], "input_format": "Videos", "app": "Knowledge", "output_format": "contextual_formatted_text", "num_input": "video" }, { "name": "animal_pose_estimation", "score": 0.2688508092335989, "eval_type": "rule", "num_demo": 1, "num_query": 14, "skills": [ "Object Recognition and Classification", "Scene and Event Understanding" ], "input_format": "Photographs", "app": "Perception", "output_format": "numerical_data", "num_input": "1-image" }, { "name": "weather_info_retrieval", "score": 0.7142857142857143, "eval_type": "rule", "num_demo": 1, "num_query": 14, "skills": [ "Object Recognition and Classification", "Text Recognition (OCR)" ], "input_format": "User Interface Screenshots", "app": "Information_Extraction", "output_format": "contextual_formatted_text", "num_input": "9-image or more" }, { "name": "average_humidity_estimate_plot", "score": 0.7853333333333332, "eval_type": "rule", "num_demo": 1, "num_query": 15, "skills": [ "Mathematical and Logical Reasoning", "Spatial and Temporal Reasoning" ], "input_format": "Diagrams and Data Visualizations", "app": "Perception", "output_format": "numerical_data", "num_input": "1-image" }, { "name": "vlnqa_egocentric_navigation_video", "score": 0.5625, "eval_type": "rule", "num_demo": 1, "num_query": 16, "skills": [ "Spatial and Temporal Reasoning", "Scene and Event Understanding", "Language Understanding and Generation" ], "input_format": "Videos", "app": "Planning", "output_format": "contextual_formatted_text", "num_input": "video" }, { "name": "music_info_parsing", "score": 0.7053571428571429, "eval_type": "rule", "num_demo": 1, "num_query": 14, "skills": [ "Text Recognition (OCR)", "Object Recognition and Classification" ], "input_format": "User Interface Screenshots", "app": "Information_Extraction", "output_format": "structured_output", "num_input": "1-image" }, { "name": "code_programming_test_hard", "score": 0.2857142857142857, "eval_type": "rule", "num_demo": 1, "num_query": 14, "skills": [ "Text Recognition (OCR)", "Mathematical and Logical Reasoning", "Domain-Specific Knowledge and Skills" ], "input_format": "Text-Based Images and Documents", "app": "Coding", "output_format": "structured_output", "num_input": "4-5 images" }, { "name": "nlvr2_two_image_compare_qa", "score": 0.5714285714285714, "eval_type": "rule", "num_demo": 1, "num_query": 14, "skills": [ "Object Recognition and Classification", "Scene and Event Understanding", "Language Understanding and Generation" ], "input_format": "Photographs", "app": "Perception", "output_format": "multiple_choice", "num_input": "1-image" }, { "name": "geometry_transformation", "score": 0.14285714285714285, "eval_type": "rule", "num_demo": 1, "num_query": 14, "skills": [ "Spatial and Temporal Reasoning", "Mathematical and Logical Reasoning" ], "input_format": "Diagrams and Data Visualizations", "app": "Mathematics", "output_format": "contextual_formatted_text", "num_input": "1-image" }, { "name": "multilingual_game_info_parsing", "score": 0.8303571428571429, "eval_type": "rule", "num_demo": 1, "num_query": 14, "skills": [ "Text Recognition (OCR)", "Language Understanding and Generation" ], "input_format": "User Interface Screenshots", "app": "Information_Extraction", "output_format": "structured_output", "num_input": "1-image" }, { "name": "egocentric_analysis_single_image", "score": 0.3333333333333333, "eval_type": "rule", "num_demo": 1, "num_query": 9, "skills": [ "Object Recognition and Classification", "Scene and Event Understanding" ], "input_format": "Photographs", "app": "Perception", "output_format": "exact_text", "num_input": "1-image" }, { "name": "landmark_recognition_and_qa", "score": 0.6, "eval_type": "rule", "num_demo": 1, "num_query": 15, "skills": [ "Object Recognition and Classification", "Scene and Event Understanding", "Language Understanding and Generation" ], "input_format": "Photographs", "app": "Knowledge", "output_format": "structured_output", "num_input": "1-image" }, { "name": "action_sequence", "score": 0.7142857142857143, "eval_type": "rule", "num_demo": 1, "num_query": 14, "skills": [ "Scene and Event Understanding", "Spatial and Temporal Reasoning" ], "input_format": "Photographs", "app": "Perception", "output_format": "exact_text", "num_input": "9-image or more" }, { "name": "graph_connectivity", "score": 1.0, "eval_type": "rule", "num_demo": 1, "num_query": 15, "skills": [ "Spatial and Temporal Reasoning", "Mathematical and Logical Reasoning" ], "input_format": "Diagrams and Data Visualizations", "app": "Mathematics", "output_format": "structured_output", "num_input": "1-image" }, { "name": "youtube_video_info_parsing", "score": 0.6904761904761906, "eval_type": "rule", "num_demo": 1, "num_query": 14, "skills": [ "Text Recognition (OCR)", "Object Recognition and Classification" ], "input_format": "User Interface Screenshots", "app": "Information_Extraction", "output_format": "structured_output", "num_input": "1-image" }, { "name": "physical_property_reasoning", "score": 0.8571428571428571, "eval_type": "rule", "num_demo": 1, "num_query": 14, "skills": [ "Object Recognition and Classification", "Mathematical and Logical Reasoning" ], "input_format": "Photographs", "app": "Perception", "output_format": "contextual_formatted_text", "num_input": "1-image" }, { "name": "av_vehicle_multiview_counting", "score": 0.06666666666666667, "eval_type": "rule", "num_demo": 1, "num_query": 15, "skills": [ "Object Recognition and Classification", "Spatial and Temporal Reasoning", "Scene and Event Understanding" ], "input_format": "Photographs", "app": "Perception", "output_format": "numerical_data", "num_input": "6-8 images" }, { "name": "arc_agi", "score": 0.07142857142857142, "eval_type": "rule", "num_demo": 1, "num_query": 14, "skills": [ "Object Recognition and Classification", "Mathematical and Logical Reasoning", "Spatial and Temporal Reasoning" ], "input_format": "Diagrams and Data Visualizations", "app": "Planning", "output_format": "exact_text", "num_input": "4-5 images" }, { "name": "IAM_line_ocr_and_locate", "score": 0.8184827502429544, "eval_type": "rule", "num_demo": 1, "num_query": 14, "skills": [ "Text Recognition (OCR)", "Spatial and Temporal Reasoning" ], "input_format": "Text-Based Images and Documents", "app": "Perception", "output_format": "structured_output", "num_input": "2-3 images" }, { "name": "license_plate_recognition", "score": 0.6428571428571429, "eval_type": "rule", "num_demo": 1, "num_query": 14, "skills": [ "Text Recognition (OCR)", "Object Recognition and Classification" ], "input_format": "Photographs", "app": "Perception", "output_format": "exact_text", "num_input": "1-image" }, { "name": "av_human_multiview_counting", "score": 0.06666666666666667, "eval_type": "rule", "num_demo": 1, "num_query": 15, "skills": [ "Object Recognition and Classification", "Spatial and Temporal Reasoning", "Scene and Event Understanding" ], "input_format": "Photographs", "app": "Perception", "output_format": "numerical_data", "num_input": "6-8 images" }, { "name": "clevrer_object_existence_video", "score": 0.5625, "eval_type": "rule", "num_demo": 1, "num_query": 16, "skills": [ "Object Recognition and Classification", "Spatial and Temporal Reasoning" ], "input_format": "Videos", "app": "Perception", "output_format": "multiple_choice", "num_input": "video" }, { "name": "famous_building_recognition", "score": 0.84375, "eval_type": "rule", "num_demo": 1, "num_query": 16, "skills": [ "Object Recognition and Classification", "Domain-Specific Knowledge and Skills" ], "input_format": "Photographs", "app": "Knowledge", "output_format": "structured_output", "num_input": "1-image" }, { "name": "emotion_recognition", "score": 0.7142857142857143, "eval_type": "rule", "num_demo": 1, "num_query": 14, "skills": [ "Object Recognition and Classification", "Commonsense and Social Reasoning" ], "input_format": "Photographs", "app": "Knowledge", "output_format": "exact_text", "num_input": "6-8 images" }, { "name": "TV_show_info_parsing", "score": 0.7698412698412698, "eval_type": "rule", "num_demo": 1, "num_query": 14, "skills": [ "Text Recognition (OCR)", "Object Recognition and Classification", "Language Understanding and Generation" ], "input_format": "User Interface Screenshots", "app": "Information_Extraction", "output_format": "structured_output", "num_input": "1-image" }, { "name": "image_style_recognition", "score": 1.0, "eval_type": "rule", "num_demo": 1, "num_query": 15, "skills": [ "Object Recognition and Classification", "Domain-Specific Knowledge and Skills" ], "input_format": "Artistic and Creative Content", "app": "Perception", "output_format": "exact_text", "num_input": "1-image" }, { "name": "graph_theory", "score": 0.2857142857142857, "eval_type": "rule", "num_demo": 1, "num_query": 14, "skills": [ "Spatial and Temporal Reasoning", "Mathematical and Logical Reasoning" ], "input_format": "Diagrams and Data Visualizations", "app": "Mathematics", "output_format": "exact_text", "num_input": "1-image" }, { "name": "electricity_plot_future_prediction", "score": 0.7095421052631579, "eval_type": "rule", "num_demo": 1, "num_query": 19, "skills": [ "Mathematical and Logical Reasoning", "Spatial and Temporal Reasoning" ], "input_format": "Diagrams and Data Visualizations", "app": "Perception", "output_format": "numerical_data", "num_input": "1-image" }, { "name": "game_info_retrieval", "score": 0.5714285714285714, "eval_type": "rule", "num_demo": 1, "num_query": 14, "skills": [ "Object Recognition and Classification", "Text Recognition (OCR)", "Language Understanding and Generation" ], "input_format": "User Interface Screenshots", "app": "Information_Extraction", "output_format": "contextual_formatted_text", "num_input": "9-image or more" }, { "name": "mnist_pattern", "score": 0.0, "eval_type": "rule", "num_demo": 1, "num_query": 14, "skills": [ "Object Recognition and Classification", "Mathematical and Logical Reasoning" ], "input_format": "Text-Based Images and Documents", "app": "Planning", "output_format": "numerical_data", "num_input": "1-image" }, { "name": "graph_isomorphism", "score": 0.8, "eval_type": "rule", "num_demo": 1, "num_query": 15, "skills": [ "Mathematical and Logical Reasoning", "Spatial and Temporal Reasoning" ], "input_format": "Diagrams and Data Visualizations", "app": "Mathematics", "output_format": "multiple_choice", "num_input": "1-image" }, { "name": "landmark_check_two_images", "score": 0.7777777777777779, "eval_type": "rule", "num_demo": 1, "num_query": 15, "skills": [ "Object Recognition and Classification", "Scene and Event Understanding" ], "input_format": "Artistic and Creative Content", "app": "Knowledge", "output_format": "structured_output", "num_input": "2-3 images" }, { "name": "nextqa_mc", "score": 0.8421052631578947, "eval_type": "rule", "num_demo": 1, "num_query": 19, "skills": [ "Object Recognition and Classification", "Scene and Event Understanding" ], "input_format": "Videos", "app": "Information_Extraction", "output_format": "multiple_choice", "num_input": "video" }, { "name": "graph_hamiltonian_cycle", "score": 0.37499999999999994, "eval_type": "rule", "num_demo": 1, "num_query": 14, "skills": [ "Mathematical and Logical Reasoning", "Spatial and Temporal Reasoning" ], "input_format": "Diagrams and Data Visualizations", "app": "Mathematics", "output_format": "structured_output", "num_input": "6-8 images" }, { "name": "stock_info_retrieval", "score": 0.7857142857142857, "eval_type": "rule", "num_demo": 1, "num_query": 14, "skills": [ "Object Recognition and Classification", "Text Recognition (OCR)" ], "input_format": "User Interface Screenshots", "app": "Information_Extraction", "output_format": "contextual_formatted_text", "num_input": "9-image or more" }, { "name": "long_string_letter_recognition", "score": 0.0, "eval_type": "rule", "num_demo": 1, "num_query": 14, "skills": [ "Text Recognition (OCR)" ], "input_format": "Text-Based Images and Documents", "app": "Perception", "output_format": "exact_text", "num_input": "1-image" }, { "name": "action_prediction", "score": 0.5, "eval_type": "rule", "num_demo": 1, "num_query": 14, "skills": [ "Spatial and Temporal Reasoning", "Scene and Event Understanding" ], "input_format": "Videos", "app": "Perception", "output_format": "multiple_choice", "num_input": "video" }, { "name": "geometry_area", "score": 0.35714285714285715, "eval_type": "rule", "num_demo": 1, "num_query": 14, "skills": [ "Mathematical and Logical Reasoning", "Spatial and Temporal Reasoning" ], "input_format": "Diagrams and Data Visualizations", "app": "Mathematics", "output_format": "numerical_data", "num_input": "1-image" }, { "name": "av_view_identification", "score": 0.27777777777777773, "eval_type": "rule", "num_demo": 1, "num_query": 15, "skills": [ "Object Recognition and Classification", "Scene and Event Understanding", "Spatial and Temporal Reasoning" ], "input_format": "Photographs", "app": "Perception", "output_format": "contextual_formatted_text", "num_input": "6-8 images" }, { "name": "newspaper_ocr_in_query_box", "score": 0.6, "eval_type": "rule", "num_demo": 1, "num_query": 15, "skills": [ "Text Recognition (OCR)", "Spatial and Temporal Reasoning" ], "input_format": "Text-Based Images and Documents", "app": "Perception", "output_format": "contextual_formatted_text", "num_input": "1-image" }, { "name": "code_programming_test_advanced", "score": 0.24074074074074073, "eval_type": "rule", "num_demo": 1, "num_query": 18, "skills": [ "Text Recognition (OCR)", "Mathematical and Logical Reasoning", "Domain-Specific Knowledge and Skills" ], "input_format": "Text-Based Images and Documents", "app": "Coding", "output_format": "structured_output", "num_input": "2-3 images" }, { "name": "code_programming_extremely_hard", "score": 0.0625, "eval_type": "rule", "num_demo": 1, "num_query": 16, "skills": [ "Text Recognition (OCR)", "Mathematical and Logical Reasoning", "Planning and Decision Making" ], "input_format": "Text-Based Images and Documents", "app": "Coding", "output_format": "structured_output", "num_input": "4-5 images" }, { "name": "ti_fused_vqa_math", "score": 0.42857142857142855, "eval_type": "rule", "num_demo": 1, "num_query": 14, "skills": [ "Mathematical and Logical Reasoning", "Text Recognition (OCR)", "Language Understanding and Generation" ], "input_format": "Diagrams and Data Visualizations", "app": "Mathematics", "output_format": "multiple_choice", "num_input": "1-image" }, { "name": "perception_test_video_character_order", "score": 0.6875, "eval_type": "rule", "num_demo": 1, "num_query": 16, "skills": [ "Object Recognition and Classification", "Text Recognition (OCR)", "Spatial and Temporal Reasoning" ], "input_format": "Videos", "app": "Planning", "output_format": "contextual_formatted_text", "num_input": "video" }, { "name": "coco_object_detection_by_query_property", "score": 0.5565966568582713, "eval_type": "rule", "num_demo": 1, "num_query": 14, "skills": [ "Object Recognition and Classification", "Scene and Event Understanding" ], "input_format": "Photographs", "app": "Perception", "output_format": "numerical_data", "num_input": "1-image" }, { "name": "3d_indoor_scene_text_bbox_prediction", "score": 0.04739437903890144, "eval_type": "rule", "num_demo": 1, "num_query": 14, "skills": [ "Object Recognition and Classification", "Scene and Event Understanding", "Spatial and Temporal Reasoning" ], "input_format": "3D Models and Aerial Imagery", "app": "Perception", "output_format": "numerical_data", "num_input": "1-image" }, { "name": "top_rated_hotel_identification", "score": 0.7857142857142857, "eval_type": "rule", "num_demo": 1, "num_query": 14, "skills": [ "Object Recognition and Classification", "Text Recognition (OCR)" ], "input_format": "User Interface Screenshots", "app": "Information_Extraction", "output_format": "contextual_formatted_text", "num_input": "1-image" }, { "name": "widerface_face_count_and_event_classification", "score": 0.7142857142857143, "eval_type": "rule", "num_demo": 1, "num_query": 14, "skills": [ "Object Recognition and Classification", "Scene and Event Understanding" ], "input_format": "Photographs", "app": "Perception", "output_format": "structured_output", "num_input": "1-image" }, { "name": "math_parity", "score": 0.6666666666666666, "eval_type": "rule", "num_demo": 1, "num_query": 15, "skills": [ "Mathematical and Logical Reasoning", "Spatial and Temporal Reasoning" ], "input_format": "Diagrams and Data Visualizations", "app": "Mathematics", "output_format": "multiple_choice", "num_input": "1-image" }, { "name": "snli_ve_visual_entailment", "score": 0.8, "eval_type": "rule", "num_demo": 1, "num_query": 15, "skills": [ "Scene and Event Understanding", "Language Understanding and Generation", "Commonsense and Social Reasoning" ], "input_format": "Photographs", "app": "Perception", "output_format": "exact_text", "num_input": "1-image" }, { "name": "science_basic_physics", "score": 0.8, "eval_type": "rule", "num_demo": 1, "num_query": 15, "skills": [ "Mathematical and Logical Reasoning", "Domain-Specific Knowledge and Skills" ], "input_format": "Diagrams and Data Visualizations", "app": "Science", "output_format": "contextual_formatted_text", "num_input": "1-image" }, { "name": "face_keypoint_detection", "score": 0.5987447167547407, "eval_type": "rule", "num_demo": 1, "num_query": 14, "skills": [ "Object Recognition and Classification", "Scene and Event Understanding" ], "input_format": "Photographs", "app": "Perception", "output_format": "structured_output", "num_input": "1-image" }, { "name": "music_info_retrieval", "score": 0.35714285714285715, "eval_type": "rule", "num_demo": 1, "num_query": 14, "skills": [ "Object Recognition and Classification", "Text Recognition (OCR)" ], "input_format": "User Interface Screenshots", "app": "Information_Extraction", "output_format": "contextual_formatted_text", "num_input": "9-image or more" }, { "name": "figureqa", "score": 0.6428571428571429, "eval_type": "rule", "num_demo": 1, "num_query": 14, "skills": [ "Object Recognition and Classification", "Mathematical and Logical Reasoning" ], "input_format": "Diagrams and Data Visualizations", "app": "Perception", "output_format": "multiple_choice", "num_input": "1-image" }, { "name": "chess_winner_identification", "score": 0.4666666666666667, "eval_type": "rule", "num_demo": 1, "num_query": 15, "skills": [ "Object Recognition and Classification", "Spatial and Temporal Reasoning", "Mathematical and Logical Reasoning" ], "input_format": "Diagrams and Data Visualizations", "app": "Planning", "output_format": "exact_text", "num_input": "1-image" }, { "name": "algebra", "score": 0.35714285714285715, "eval_type": "rule", "num_demo": 1, "num_query": 14, "skills": [ "Mathematical and Logical Reasoning", "Spatial and Temporal Reasoning" ], "input_format": "Diagrams and Data Visualizations", "app": "Mathematics", "output_format": "contextual_formatted_text", "num_input": "1-image" }, { "name": "clevrer_video_moving_object_count", "score": 0.6666666666666666, "eval_type": "rule", "num_demo": 1, "num_query": 21, "skills": [ "Object Recognition and Classification", "Spatial and Temporal Reasoning" ], "input_format": "Videos", "app": "Perception", "output_format": "numerical_data", "num_input": "video" }, { "name": "math_convexity_value_estimation", "score": 0.5753130452443872, "eval_type": "rule", "num_demo": 1, "num_query": 15, "skills": [ "Mathematical and Logical Reasoning", "Object Recognition and Classification" ], "input_format": "Diagrams and Data Visualizations", "app": "Mathematics", "output_format": "structured_output", "num_input": "1-image" }, { "name": "map_diagram_qa", "score": 0.7857142857142857, "eval_type": "rule", "num_demo": 1, "num_query": 14, "skills": [ "Object Recognition and Classification", "Text Recognition (OCR)", "Spatial and Temporal Reasoning" ], "input_format": "Diagrams and Data Visualizations", "app": "Perception", "output_format": "contextual_formatted_text", "num_input": "1-image" }, { "name": "deciphering_oracle_bone", "score": 0.07142857142857142, "eval_type": "rule", "num_demo": 1, "num_query": 14, "skills": [ "Text Recognition (OCR)", "Domain-Specific Knowledge and Skills" ], "input_format": "Text-Based Images and Documents", "app": "Knowledge", "output_format": "exact_text", "num_input": "1-image" }, { "name": "funsd_document_qa", "score": 0.8571428571428571, "eval_type": "rule", "num_demo": 1, "num_query": 14, "skills": [ "Text Recognition (OCR)", "Language Understanding and Generation" ], "input_format": "Text-Based Images and Documents", "app": "Perception", "output_format": "contextual_formatted_text", "num_input": "1-image" }, { "name": "newspaper_page_parse_and_count", "score": 0.5333333333333333, "eval_type": "rule", "num_demo": 1, "num_query": 15, "skills": [ "Object Recognition and Classification", "Text Recognition (OCR)", "Scene and Event Understanding" ], "input_format": "Text-Based Images and Documents", "app": "Information_Extraction", "output_format": "structured_output", "num_input": "1-image" }, { "name": "weather_info_parsing", "score": 0.9087301587301589, "eval_type": "rule", "num_demo": 1, "num_query": 14, "skills": [ "Text Recognition (OCR)", "Object Recognition and Classification" ], "input_format": "User Interface Screenshots", "app": "Information_Extraction", "output_format": "structured_output", "num_input": "1-image" }, { "name": "vibe_eval_short_phrase", "score": 0.2857142857142857, "eval_type": "rule", "num_demo": 1, "num_query": 14, "skills": [ "Object Recognition and Classification", "Commonsense and Social Reasoning" ], "input_format": "Photographs", "app": "Knowledge", "output_format": "contextual_formatted_text", "num_input": "1-image" }, { "name": "signboard_identification", "score": 0.7333333333333333, "eval_type": "rule", "num_demo": 1, "num_query": 15, "skills": [ "Text Recognition (OCR)", "Language Understanding and Generation" ], "input_format": "Photographs", "app": "Perception", "output_format": "contextual_formatted_text", "num_input": "1-image" }, { "name": "game_info_parsing", "score": 0.9415584415584416, "eval_type": "rule", "num_demo": 1, "num_query": 14, "skills": [ "Object Recognition and Classification", "Text Recognition (OCR)" ], "input_format": "User Interface Screenshots", "app": "Information_Extraction", "output_format": "structured_output", "num_input": "1-image" }, { "name": "cam_traj_to_video_selection", "score": 0.5714285714285714, "eval_type": "rule", "num_demo": 1, "num_query": 14, "skills": [ "Spatial and Temporal Reasoning", "Scene and Event Understanding" ], "input_format": "Videos", "app": "Perception", "output_format": "contextual_formatted_text", "num_input": "video" }, { "name": "clevrer_video_moving_object_property_recognition", "score": 0.75, "eval_type": "rule", "num_demo": 1, "num_query": 16, "skills": [ "Object Recognition and Classification", "Spatial and Temporal Reasoning" ], "input_format": "Videos", "app": "Perception", "output_format": "contextual_formatted_text", "num_input": "video" }, { "name": "geometry_analytic", "score": 0.21428571428571427, "eval_type": "rule", "num_demo": 1, "num_query": 14, "skills": [ "Object Recognition and Classification", "Spatial and Temporal Reasoning", "Mathematical and Logical Reasoning" ], "input_format": "Diagrams and Data Visualizations", "app": "Mathematics", "output_format": "contextual_formatted_text", "num_input": "1-image" }, { "name": "long_string_number_recognition", "score": 0.35714285714285715, "eval_type": "rule", "num_demo": 1, "num_query": 14, "skills": [ "Text Recognition (OCR)" ], "input_format": "Text-Based Images and Documents", "app": "Perception", "output_format": "exact_text", "num_input": "1-image" }, { "name": "waybill_number_sequence_extraction", "score": 0.5714285714285714, "eval_type": "rule", "num_demo": 1, "num_query": 14, "skills": [ "Text Recognition (OCR)" ], "input_format": "Text-Based Images and Documents", "app": "Perception", "output_format": "contextual_formatted_text", "num_input": "1-image" }, { "name": "egocentric_spatial_reasoning", "score": 0.3333333333333333, "eval_type": "rule", "num_demo": 1, "num_query": 9, "skills": [ "Spatial and Temporal Reasoning", "Object Recognition and Classification" ], "input_format": "Photographs", "app": "Perception", "output_format": "contextual_formatted_text", "num_input": "1-image" }, { "name": "code_error_line_identification", "score": 0.2857142857142857, "eval_type": "rule", "num_demo": 1, "num_query": 14, "skills": [ "Domain-Specific Knowledge and Skills", "Mathematical and Logical Reasoning" ], "input_format": "Text-Based Images and Documents", "app": "Coding", "output_format": "multiple_choice", "num_input": "2-3 images" }, { "name": "single_person_pose_estimation", "score": 0.333520279485717, "eval_type": "rule", "num_demo": 1, "num_query": 14, "skills": [ "Object Recognition and Classification", "Scene and Event Understanding" ], "input_format": "Photographs", "app": "Perception", "output_format": "structured_output", "num_input": "1-image" }, { "name": "photo_sharing_image_retrieval", "score": 0.6428571428571429, "eval_type": "rule", "num_demo": 1, "num_query": 14, "skills": [ "Object Recognition and Classification", "Scene and Event Understanding", "Language Understanding and Generation" ], "input_format": "Photographs", "app": "Perception", "output_format": "multiple_choice", "num_input": "9-image or more" }, { "name": "quizlet_question_solving", "score": 0.7142857142857143, "eval_type": "rule", "num_demo": 1, "num_query": 14, "skills": [ "Text Recognition (OCR)", "Mathematical and Logical Reasoning" ], "input_format": "User Interface Screenshots", "app": "Science", "output_format": "contextual_formatted_text", "num_input": "1-image" }, { "name": "chart_vqa", "score": 0.5714285714285714, "eval_type": "rule", "num_demo": 1, "num_query": 14, "skills": [ "Object Recognition and Classification", "Mathematical and Logical Reasoning" ], "input_format": "Diagrams and Data Visualizations", "app": "Perception", "output_format": "numerical_data", "num_input": "1-image" }, { "name": "electricity_future_prediction_from_table", "score": 0.7636842105263157, "eval_type": "rule", "num_demo": 1, "num_query": 19, "skills": [ "Mathematical and Logical Reasoning", "Spatial and Temporal Reasoning" ], "input_format": "Diagrams and Data Visualizations", "app": "Perception", "output_format": "numerical_data", "num_input": "1-image" }, { "name": "humor_understand_caption_match", "score": 0.6, "eval_type": "rule", "num_demo": 1, "num_query": 15, "skills": [ "Commonsense and Social Reasoning", "Language Understanding and Generation" ], "input_format": "Artistic and Creative Content", "app": "Knowledge", "output_format": "exact_text", "num_input": "1-image" }, { "name": "hotel_booking_confirmation_parsing", "score": 0.7071428571428571, "eval_type": "rule", "num_demo": 1, "num_query": 14, "skills": [ "Text Recognition (OCR)", "Object Recognition and Classification" ], "input_format": "User Interface Screenshots", "app": "Information_Extraction", "output_format": "structured_output", "num_input": "1-image" }, { "name": "sta_action_localization_video", "score": 0.3125, "eval_type": "rule", "num_demo": 1, "num_query": 16, "skills": [ "Scene and Event Understanding", "Spatial and Temporal Reasoning" ], "input_format": "Videos", "app": "Perception", "output_format": "contextual_formatted_text", "num_input": "video" }, { "name": "geometry_descriptive", "score": 0.21428571428571427, "eval_type": "rule", "num_demo": 1, "num_query": 14, "skills": [ "Spatial and Temporal Reasoning", "Mathematical and Logical Reasoning" ], "input_format": "Diagrams and Data Visualizations", "app": "Mathematics", "output_format": "contextual_formatted_text", "num_input": "1-image" }, { "name": "coco_person_detection", "score": 0.5531252543894322, "eval_type": "rule", "num_demo": 1, "num_query": 14, "skills": [ "Object Recognition and Classification" ], "input_format": "Photographs", "app": "Perception", "output_format": "numerical_data", "num_input": "1-image" }, { "name": "video_content_reasoning", "score": 0.7777777777777778, "eval_type": "rule", "num_demo": 1, "num_query": 9, "skills": [ "Object Recognition and Classification", "Scene and Event Understanding", "Spatial and Temporal Reasoning" ], "input_format": "Videos", "app": "Information_Extraction", "output_format": "contextual_formatted_text", "num_input": "video" }, { "name": "graph_maxflow", "score": 0.4666666666666667, "eval_type": "rule", "num_demo": 1, "num_query": 15, "skills": [ "Mathematical and Logical Reasoning", "Spatial and Temporal Reasoning" ], "input_format": "Diagrams and Data Visualizations", "app": "Mathematics", "output_format": "numerical_data", "num_input": "1-image" }, { "name": "places365_scene_type_classification", "score": 0.6428571428571429, "eval_type": "rule", "num_demo": 1, "num_query": 14, "skills": [ "Object Recognition and Classification", "Scene and Event Understanding" ], "input_format": "Photographs", "app": "Perception", "output_format": "exact_text", "num_input": "1-image" }, { "name": "research_website_parsing_blogpost", "score": 0.07142857142857142, "eval_type": "rule", "num_demo": 1, "num_query": 14, "skills": [ "Text Recognition (OCR)", "Language Understanding and Generation" ], "input_format": "Text-Based Images and Documents", "app": "Information_Extraction", "output_format": "contextual_formatted_text", "num_input": "1-image" }, { "name": "research_website_parsing_publication", "score": 0.14285714285714285, "eval_type": "rule", "num_demo": 1, "num_query": 14, "skills": [ "Text Recognition (OCR)", "Language Understanding and Generation" ], "input_format": "Text-Based Images and Documents", "app": "Information_Extraction", "output_format": "contextual_formatted_text", "num_input": "1-image" }, { "name": "research_website_parsing_homepage", "score": 0.14285714285714285, "eval_type": "rule", "num_demo": 1, "num_query": 14, "skills": [ "Text Recognition (OCR)", "Language Understanding and Generation", "Domain-Specific Knowledge and Skills" ], "input_format": "Text-Based Images and Documents", "app": "Information_Extraction", "output_format": "contextual_formatted_text", "num_input": "1-image" }, { "name": "reward_models_I2T_reward", "score": 0.5, "eval_type": "rule", "num_demo": 1, "num_query": 14, "skills": [ "Object Recognition and Classification", "Language Understanding and Generation", "Commonsense and Social Reasoning" ], "input_format": "Photographs", "app": "Metrics", "output_format": "exact_text", "num_input": "1-image" }, { "name": "reward_models_T2I_reward", "score": 0.6428571428571429, "eval_type": "rule", "num_demo": 1, "num_query": 14, "skills": [ "Object Recognition and Classification", "Scene and Event Understanding" ], "input_format": "Photographs", "app": "Metrics", "output_format": "exact_text", "num_input": "2-3 images" }, { "name": "constrained_generation_contain_contain_images", "score": 0.9333333333333333, "eval_type": "rule", "num_demo": 0, "num_query": 15, "skills": [ "Object Recognition and Classification", "Language Understanding and Generation" ], "input_format": "Photographs", "app": "Perception", "output_format": "open_ended_output", "num_input": "2-3 images" }, { "name": "constrained_generation_contain_repeat_length", "score": 0.2, "eval_type": "rule", "num_demo": 0, "num_query": 15, "skills": [ "Object Recognition and Classification", "Language Understanding and Generation" ], "input_format": "Photographs", "app": "Perception", "output_format": "open_ended_output", "num_input": "1-image" }, { "name": "constrained_generation_multi_contain_repeat_position_only_length", "score": 0.2, "eval_type": "rule", "num_demo": 0, "num_query": 15, "skills": [ "Object Recognition and Classification", "Language Understanding and Generation", "Spatial and Temporal Reasoning" ], "input_format": "Photographs", "app": "Perception", "output_format": "open_ended_output", "num_input": "2-3 images" }, { "name": "constrained_generation_contain_length", "score": 0.0, "eval_type": "rule", "num_demo": 0, "num_query": 15, "skills": [ "Object Recognition and Classification", "Language Understanding and Generation" ], "input_format": "Photographs", "app": "Perception", "output_format": "open_ended_output", "num_input": "1-image" }, { "name": "constrained_generation_contain_position_images", "score": 0.9333333333333333, "eval_type": "rule", "num_demo": 0, "num_query": 15, "skills": [ "Object Recognition and Classification", "Language Understanding and Generation", "Scene and Event Understanding" ], "input_format": "Photographs", "app": "Perception", "output_format": "open_ended_output", "num_input": "2-3 images" }, { "name": "constrained_generation_contain_position_length", "score": 0.26666666666666666, "eval_type": "rule", "num_demo": 0, "num_query": 15, "skills": [ "Object Recognition and Classification", "Language Understanding and Generation" ], "input_format": "Photographs", "app": "Perception", "output_format": "open_ended_output", "num_input": "1-image" }, { "name": "constrained_generation_xor_images", "score": 0.8, "eval_type": "rule", "num_demo": 0, "num_query": 15, "skills": [ "Object Recognition and Classification", "Language Understanding and Generation" ], "input_format": "Photographs", "app": "Perception", "output_format": "open_ended_output", "num_input": "2-3 images" }, { "name": "constrained_generation_multi_contain_repeat", "score": 0.0, "eval_type": "rule", "num_demo": 0, "num_query": 15, "skills": [ "Language Understanding and Generation", "Object Recognition and Classification" ], "input_format": "Photographs", "app": "Perception", "output_format": "open_ended_output", "num_input": "2-3 images" }, { "name": "constrained_generation_contain_contain_length", "score": 0.6666666666666666, "eval_type": "rule", "num_demo": 0, "num_query": 15, "skills": [ "Object Recognition and Classification", "Language Understanding and Generation" ], "input_format": "Photographs", "app": "Perception", "output_format": "open_ended_output", "num_input": "1-image" }, { "name": "constrained_generation_multi_contain_position_only", "score": 0.2, "eval_type": "rule", "num_demo": 0, "num_query": 15, "skills": [ "Language Understanding and Generation", "Scene and Event Understanding" ], "input_format": "Photographs", "app": "Perception", "output_format": "open_ended_output", "num_input": "2-3 images" }, { "name": "panel_images_single_question", "score": 0.7142857142857143, "eval_type": "rule", "num_demo": 1, "num_query": 14, "skills": [ "Object Recognition and Classification", "Scene and Event Understanding" ], "input_format": "Artistic and Creative Content", "app": "Perception", "output_format": "multiple_choice", "num_input": "1-image" }, { "name": "panel_images_multi_question", "score": 0.8095238095238094, "eval_type": "rule", "num_demo": 1, "num_query": 14, "skills": [ "Object Recognition and Classification", "Scene and Event Understanding" ], "input_format": "Artistic and Creative Content", "app": "Perception", "output_format": "structured_output", "num_input": "1-image" }, { "name": "chess_puzzles_checkmate", "score": 0.0, "eval_type": "rule", "num_demo": 1, "num_query": 14, "skills": [ "Spatial and Temporal Reasoning", "Planning and Decision Making", "Mathematical and Logical Reasoning" ], "input_format": "User Interface Screenshots", "app": "Planning", "output_format": "structured_output", "num_input": "1-image" }, { "name": "chess_puzzles_equality", "score": 0.0, "eval_type": "rule", "num_demo": 1, "num_query": 15, "skills": [ "Spatial and Temporal Reasoning", "Planning and Decision Making", "Mathematical and Logical Reasoning" ], "input_format": "User Interface Screenshots", "app": "Planning", "output_format": "structured_output", "num_input": "1-image" }, { "name": "chess_puzzles_crushing", "score": 0.0, "eval_type": "rule", "num_demo": 1, "num_query": 14, "skills": [ "Planning and Decision Making", "Mathematical and Logical Reasoning" ], "input_format": "User Interface Screenshots", "app": "Planning", "output_format": "exact_text", "num_input": "1-image" }, { "name": "table_understanding_fact_verification", "score": 0.9047619047619049, "eval_type": "rule", "num_demo": 1, "num_query": 14, "skills": [ "Text Recognition (OCR)", "Mathematical and Logical Reasoning", "Language Understanding and Generation" ], "input_format": "Diagrams and Data Visualizations", "app": "Perception", "output_format": "contextual_formatted_text", "num_input": "1-image" }, { "name": "table_understanding_complex_question_answering", "score": 0.7142857142857143, "eval_type": "rule", "num_demo": 1, "num_query": 14, "skills": [ "Text Recognition (OCR)", "Mathematical and Logical Reasoning" ], "input_format": "Diagrams and Data Visualizations", "app": "Perception", "output_format": "contextual_formatted_text", "num_input": "1-image" }, { "name": "number_puzzle_sudoku", "score": 0.0, "eval_type": "rule", "num_demo": 1, "num_query": 15, "skills": [ "Mathematical and Logical Reasoning", "Spatial and Temporal Reasoning" ], "input_format": "Diagrams and Data Visualizations", "app": "Planning", "output_format": "contextual_formatted_text", "num_input": "1-image" }, { "name": "number_puzzle_kakuro_5x5", "score": 0.0, "eval_type": "rule", "num_demo": 1, "num_query": 15, "skills": [ "Mathematical and Logical Reasoning", "Spatial and Temporal Reasoning" ], "input_format": "Diagrams and Data Visualizations", "app": "Planning", "output_format": "exact_text", "num_input": "1-image" }, { "name": "pictionary_chinese_food_img2en", "score": 0.7857142857142857, "eval_type": "rule", "num_demo": 1, "num_query": 14, "skills": [ "Object Recognition and Classification", "Language Understanding and Generation" ], "input_format": "Photographs", "app": "Planning", "output_format": "exact_text", "num_input": "1-image" }, { "name": "pictionary_skribbl_io", "score": 0.15, "eval_type": "rule", "num_demo": 1, "num_query": 20, "skills": [ "Object Recognition and Classification", "Language Understanding and Generation" ], "input_format": "Artistic and Creative Content", "app": "Planning", "output_format": "exact_text", "num_input": "1-image" }, { "name": "pictionary_genai_output_chinese", "score": 0.35714285714285715, "eval_type": "rule", "num_demo": 1, "num_query": 14, "skills": [ "Object Recognition and Classification", "Language Understanding and Generation" ], "input_format": "Artistic and Creative Content", "app": "Planning", "output_format": "exact_text", "num_input": "1-image" }, { "name": "pictionary_doodle_guess", "score": 0.8, "eval_type": "rule", "num_demo": 1, "num_query": 15, "skills": [ "Object Recognition and Classification", "Language Understanding and Generation" ], "input_format": "Artistic and Creative Content", "app": "Planning", "output_format": "exact_text", "num_input": "1-image" }, { "name": "pictionary_cartoon_drawing_guess", "score": 0.6428571428571429, "eval_type": "rule", "num_demo": 1, "num_query": 14, "skills": [ "Object Recognition and Classification", "Language Understanding and Generation" ], "input_format": "Artistic and Creative Content", "app": "Planning", "output_format": "exact_text", "num_input": "1-image" }, { "name": "MFC_Bench_check_face_swap", "score": 0.5, "eval_type": "rule", "num_demo": 1, "num_query": 14, "skills": [ "Object Recognition and Classification", "Scene and Event Understanding", "Language Understanding and Generation", "Commonsense and Social Reasoning" ], "input_format": "Photographs", "app": "Knowledge", "output_format": "multiple_choice", "num_input": "1-image" }, { "name": "MFC_Bench_check_veracity", "score": 0.7857142857142857, "eval_type": "rule", "num_demo": 1, "num_query": 14, "skills": [ "Object Recognition and Classification", "Scene and Event Understanding", "Text Recognition (OCR)" ], "input_format": "Diagrams and Data Visualizations", "app": "Knowledge", "output_format": "multiple_choice", "num_input": "1-image" }, { "name": "MFC_Bench_check_out_of_context", "score": 0.5714285714285714, "eval_type": "rule", "num_demo": 1, "num_query": 14, "skills": [ "Scene and Event Understanding", "Language Understanding and Generation", "Commonsense and Social Reasoning" ], "input_format": "Photographs", "app": "Knowledge", "output_format": "multiple_choice", "num_input": "1-image" }, { "name": "MFC_Bench_check_background_change", "score": 0.7142857142857143, "eval_type": "rule", "num_demo": 1, "num_query": 14, "skills": [ "Scene and Event Understanding", "Object Recognition and Classification" ], "input_format": "Photographs", "app": "Knowledge", "output_format": "multiple_choice", "num_input": "1-image" }, { "name": "MFC_Bench_check_clip_stable_diffusion_generate", "score": 0.42857142857142855, "eval_type": "rule", "num_demo": 1, "num_query": 14, "skills": [ "Object Recognition and Classification", "Scene and Event Understanding" ], "input_format": "Photographs", "app": "Knowledge", "output_format": "multiple_choice", "num_input": "1-image" }, { "name": "MFC_Bench_check_text_style", "score": 0.5714285714285714, "eval_type": "rule", "num_demo": 1, "num_query": 14, "skills": [ "Text Recognition (OCR)", "Language Understanding and Generation" ], "input_format": "Photographs", "app": "Knowledge", "output_format": "multiple_choice", "num_input": "1-image" }, { "name": "MFC_Bench_check_text_entity_replace", "score": 0.7142857142857143, "eval_type": "rule", "num_demo": 1, "num_query": 14, "skills": [ "Object Recognition and Classification", "Language Understanding and Generation", "Commonsense and Social Reasoning" ], "input_format": "Photographs", "app": "Knowledge", "output_format": "multiple_choice", "num_input": "1-image" }, { "name": "MFC_Bench_check_face_attribute_edit", "score": 0.5, "eval_type": "rule", "num_demo": 1, "num_query": 14, "skills": [ "Object Recognition and Classification", "Scene and Event Understanding", "Commonsense and Social Reasoning" ], "input_format": "Photographs", "app": "Knowledge", "output_format": "multiple_choice", "num_input": "1-image" }, { "name": "app_interactive_operations_leetcode", "score": 0.2857142857142857, "eval_type": "rule", "num_demo": 1, "num_query": 14, "skills": [ "Object Recognition and Classification", "Planning and Decision Making" ], "input_format": "User Interface Screenshots", "app": "Planning", "output_format": "multiple_choice", "num_input": "9-image or more" }, { "name": "app_interactive_operations_instagram", "score": 0.7142857142857143, "eval_type": "rule", "num_demo": 1, "num_query": 14, "skills": [ "Object Recognition and Classification", "Scene and Event Understanding" ], "input_format": "User Interface Screenshots", "app": "Planning", "output_format": "multiple_choice", "num_input": "9-image or more" }, { "name": "app_interactive_operations_iphone_settings", "score": 0.5714285714285714, "eval_type": "rule", "num_demo": 1, "num_query": 14, "skills": [ "Object Recognition and Classification", "Planning and Decision Making" ], "input_format": "User Interface Screenshots", "app": "Planning", "output_format": "multiple_choice", "num_input": "9-image or more" }, { "name": "app_interactive_operations_ppt", "score": 0.5714285714285714, "eval_type": "rule", "num_demo": 1, "num_query": 14, "skills": [ "Object Recognition and Classification", "Language Understanding and Generation" ], "input_format": "User Interface Screenshots", "app": "Planning", "output_format": "multiple_choice", "num_input": "9-image or more" }, { "name": "app_interactive_operations_notes", "score": 0.2857142857142857, "eval_type": "rule", "num_demo": 1, "num_query": 14, "skills": [ "Object Recognition and Classification", "Planning and Decision Making" ], "input_format": "User Interface Screenshots", "app": "Planning", "output_format": "multiple_choice", "num_input": "9-image or more" }, { "name": "app_interactive_operations_amazon", "score": 0.5, "eval_type": "rule", "num_demo": 1, "num_query": 14, "skills": [ "Object Recognition and Classification", "Text Recognition (OCR)" ], "input_format": "User Interface Screenshots", "app": "Planning", "output_format": "multiple_choice", "num_input": "9-image or more" }, { "name": "app_interactive_operations_excel", "score": 0.5, "eval_type": "rule", "num_demo": 1, "num_query": 14, "skills": [ "Object Recognition and Classification", "Language Understanding and Generation" ], "input_format": "User Interface Screenshots", "app": "Planning", "output_format": "multiple_choice", "num_input": "9-image or more" }, { "name": "app_interactive_operations_youtube", "score": 0.7142857142857143, "eval_type": "rule", "num_demo": 1, "num_query": 14, "skills": [ "Object Recognition and Classification", "Language Understanding and Generation" ], "input_format": "User Interface Screenshots", "app": "Planning", "output_format": "multiple_choice", "num_input": "9-image or more" }, { "name": "app_interactive_operations_twitter", "score": 0.8571428571428571, "eval_type": "rule", "num_demo": 1, "num_query": 14, "skills": [ "Object Recognition and Classification", "Scene and Event Understanding" ], "input_format": "User Interface Screenshots", "app": "Planning", "output_format": "multiple_choice", "num_input": "9-image or more" }, { "name": "app_interactive_operations_alipay", "score": 0.6470588235294118, "eval_type": "rule", "num_demo": 1, "num_query": 17, "skills": [ "Object Recognition and Classification", "Planning and Decision Making" ], "input_format": "User Interface Screenshots", "app": "Planning", "output_format": "multiple_choice", "num_input": "9-image or more" }, { "name": "app_interactive_operations_zoom", "score": 0.4, "eval_type": "rule", "num_demo": 1, "num_query": 15, "skills": [ "Object Recognition and Classification", "Planning and Decision Making" ], "input_format": "User Interface Screenshots", "app": "Planning", "output_format": "multiple_choice", "num_input": "9-image or more" }, { "name": "app_interactive_operations_word", "score": 0.2857142857142857, "eval_type": "rule", "num_demo": 1, "num_query": 14, "skills": [ "Object Recognition and Classification", "Planning and Decision Making" ], "input_format": "User Interface Screenshots", "app": "Planning", "output_format": "multiple_choice", "num_input": "9-image or more" }, { "name": "app_interactive_operations_tiktok", "score": 0.7142857142857143, "eval_type": "rule", "num_demo": 1, "num_query": 14, "skills": [ "Object Recognition and Classification", "Text Recognition (OCR)" ], "input_format": "User Interface Screenshots", "app": "Planning", "output_format": "multiple_choice", "num_input": "9-image or more" }, { "name": "crossword_mini_5x5", "score": 0.7714285714285715, "eval_type": "rule", "num_demo": 1, "num_query": 14, "skills": [ "Language Understanding and Generation", "Mathematical and Logical Reasoning" ], "input_format": "Text-Based Images and Documents", "app": "Planning", "output_format": "structured_output", "num_input": "1-image" }, { "name": "ball_cup_swap_3", "score": 0.5, "eval_type": "rule", "num_demo": 1, "num_query": 14, "skills": [ "Object Recognition and Classification", "Spatial and Temporal Reasoning" ], "input_format": "Artistic and Creative Content", "app": "Planning", "output_format": "multiple_choice", "num_input": "9-image or more" }, { "name": "music_sheet_name", "score": 0.4, "eval_type": "rule", "num_demo": 1, "num_query": 15, "skills": [ "Object Recognition and Classification", "Domain-Specific Knowledge and Skills" ], "input_format": "Text-Based Images and Documents", "app": "Knowledge", "output_format": "exact_text", "num_input": "4-5 images" }, { "name": "code_retrieval", "score": 0.14285714285714285, "eval_type": "rule", "num_demo": 1, "num_query": 14, "skills": [ "Text Recognition (OCR)", "Domain-Specific Knowledge and Skills" ], "input_format": "Text-Based Images and Documents", "app": "Coding", "output_format": "exact_text", "num_input": "2-3 images" }, { "name": "weather_map_climate_type_temperature_parsing", "score": 0.8214285714285714, "eval_type": "rule", "num_demo": 1, "num_query": 14, "skills": [ "Object Recognition and Classification", "Scene and Event Understanding", "Text Recognition (OCR)" ], "input_format": "Diagrams and Data Visualizations", "app": "Information_Extraction", "output_format": "structured_output", "num_input": "1-image" }, { "name": "planning_visual_termes", "score": 0.0, "eval_type": "rule", "num_demo": 1, "num_query": 15, "skills": [ "Planning and Decision Making", "Spatial and Temporal Reasoning" ], "input_format": "Artistic and Creative Content", "app": "Planning", "output_format": "structured_output", "num_input": "2-3 images" }, { "name": "recipe_image_ordering", "score": 0.6428571428571429, "eval_type": "rule", "num_demo": 1, "num_query": 14, "skills": [ "Object Recognition and Classification", "Scene and Event Understanding", "Spatial and Temporal Reasoning", "Language Understanding and Generation" ], "input_format": "Photographs", "app": "Planning", "output_format": "multiple_choice", "num_input": "6-8 images" }, { "name": "distinguish_ai_generated_image", "score": 0.631578947368421, "eval_type": "rule", "num_demo": 1, "num_query": 19, "skills": [ "Object Recognition and Classification", "Scene and Event Understanding" ], "input_format": "Photographs", "app": "Knowledge", "output_format": "exact_text", "num_input": "1-image" }, { "name": "planning_screenshot_tyreworld", "score": 0.0, "eval_type": "rule", "num_demo": 1, "num_query": 15, "skills": [ "Planning and Decision Making", "Spatial and Temporal Reasoning" ], "input_format": "User Interface Screenshots", "app": "Planning", "output_format": "structured_output", "num_input": "1-image" }, { "name": "google_streetview_circle_sorting", "score": 0.0, "eval_type": "rule", "num_demo": 1, "num_query": 15, "skills": [ "Scene and Event Understanding", "Spatial and Temporal Reasoning" ], "input_format": "Photographs", "app": "Perception", "output_format": "structured_output", "num_input": "9-image or more" }, { "name": "semantic_matching_of_two_images", "score": 0.5, "eval_type": "rule", "num_demo": 1, "num_query": 14, "skills": [ "Object Recognition and Classification", "Scene and Event Understanding" ], "input_format": "Photographs", "app": "Perception", "output_format": "structured_output", "num_input": "2-3 images" }, { "name": "TRANCE_physics_reasoning_view", "score": 0.2857142857142857, "eval_type": "rule", "num_demo": 1, "num_query": 14, "skills": [ "Object Recognition and Classification", "Spatial and Temporal Reasoning", "Mathematical and Logical Reasoning" ], "input_format": "3D Models and Aerial Imagery", "app": "Perception", "output_format": "multiple_choice", "num_input": "2-3 images" }, { "name": "entertainment_web_game_style", "score": 0.7857142857142857, "eval_type": "rule", "num_demo": 1, "num_query": 14, "skills": [ "Object Recognition and Classification", "Text Recognition (OCR)", "Scene and Event Understanding" ], "input_format": "User Interface Screenshots", "app": "Information_Extraction", "output_format": "structured_output", "num_input": "1-image" }, { "name": "scibench_fundamental_wo_solution", "score": 0.2857142857142857, "eval_type": "rule", "num_demo": 1, "num_query": 49, "skills": [ "Mathematical and Logical Reasoning", "Domain-Specific Knowledge and Skills" ], "input_format": "Diagrams and Data Visualizations", "app": "Science", "output_format": "numerical_data", "num_input": "1-image" }, { "name": "planning_visual_grippers", "score": 0.6666666666666666, "eval_type": "rule", "num_demo": 1, "num_query": 15, "skills": [ "Planning and Decision Making", "Spatial and Temporal Reasoning" ], "input_format": "Artistic and Creative Content", "app": "Planning", "output_format": "structured_output", "num_input": "2-3 images" }, { "name": "brand_logo_recognition_and_elaboration", "score": 0.82, "eval_type": "rule", "num_demo": 1, "num_query": 25, "skills": [ "Object Recognition and Classification", "Domain-Specific Knowledge and Skills" ], "input_format": "Photographs", "app": "Knowledge", "output_format": "structured_output", "num_input": "1-image" }, { "name": "logo2k_same_type_logo_retrieval", "score": 0.9285714285714286, "eval_type": "rule", "num_demo": 1, "num_query": 14, "skills": [ "Object Recognition and Classification" ], "input_format": "Text-Based Images and Documents", "app": "Knowledge", "output_format": "structured_output", "num_input": "6-8 images" }, { "name": "clevr_arithmetic", "score": 0.631578947368421, "eval_type": "rule", "num_demo": 1, "num_query": 19, "skills": [ "Object Recognition and Classification", "Mathematical and Logical Reasoning" ], "input_format": "Photographs", "app": "Mathematics", "output_format": "numerical_data", "num_input": "2-3 images" }, { "name": "super_clevr", "score": 0.35714285714285715, "eval_type": "rule", "num_demo": 1, "num_query": 14, "skills": [ "Object Recognition and Classification", "Spatial and Temporal Reasoning", "Mathematical and Logical Reasoning" ], "input_format": "Photographs", "app": "Perception", "output_format": "contextual_formatted_text", "num_input": "1-image" }, { "name": "visualdial_visual_dialog_image_guessing", "score": 1.0, "eval_type": "rule", "num_demo": 1, "num_query": 15, "skills": [ "Object Recognition and Classification", "Scene and Event Understanding", "Language Understanding and Generation" ], "input_format": "Photographs", "app": "Perception", "output_format": "multiple_choice", "num_input": "9-image or more" }, { "name": "science_molecule_chemistry", "score": 0.9333333333333333, "eval_type": "rule", "num_demo": 1, "num_query": 15, "skills": [ "Mathematical and Logical Reasoning", "Domain-Specific Knowledge and Skills" ], "input_format": "Diagrams and Data Visualizations", "app": "Science", "output_format": "contextual_formatted_text", "num_input": "1-image" }, { "name": "ti_fused_vqa_chemistry", "score": 0.5, "eval_type": "rule", "num_demo": 1, "num_query": 14, "skills": [ "Text Recognition (OCR)", "Domain-Specific Knowledge and Skills" ], "input_format": "Diagrams and Data Visualizations", "app": "Science", "output_format": "multiple_choice", "num_input": "1-image" }, { "name": "graph_hamiltonian_path", "score": 0.375, "eval_type": "rule", "num_demo": 1, "num_query": 14, "skills": [ "Mathematical and Logical Reasoning", "Spatial and Temporal Reasoning" ], "input_format": "Diagrams and Data Visualizations", "app": "Mathematics", "output_format": "structured_output", "num_input": "6-8 images" }, { "name": "perception_test_video_action_count", "score": 0.25, "eval_type": "rule", "num_demo": 1, "num_query": 16, "skills": [ "Scene and Event Understanding", "Spatial and Temporal Reasoning" ], "input_format": "Videos", "app": "Perception", "output_format": "numerical_data", "num_input": "video" }, { "name": "star_object_interaction_video", "score": 0.4375, "eval_type": "rule", "num_demo": 1, "num_query": 16, "skills": [ "Object Recognition and Classification", "Scene and Event Understanding" ], "input_format": "Videos", "app": "Information_Extraction", "output_format": "contextual_formatted_text", "num_input": "video" }, { "name": "chess_puzzle_single_step", "score": 0.0, "eval_type": "rule", "num_demo": 1, "num_query": 15, "skills": [ "Spatial and Temporal Reasoning", "Planning and Decision Making", "Mathematical and Logical Reasoning" ], "input_format": "Photographs", "app": "Planning", "output_format": "exact_text", "num_input": "1-image" }, { "name": "movie_info_retrieval", "score": 0.9285714285714286, "eval_type": "rule", "num_demo": 1, "num_query": 14, "skills": [ "Text Recognition (OCR)", "Object Recognition and Classification", "Language Understanding and Generation" ], "input_format": "User Interface Screenshots", "app": "Information_Extraction", "output_format": "contextual_formatted_text", "num_input": "9-image or more" }, { "name": "exchange_rate_estimate_plot", "score": 0.9841571428571427, "eval_type": "rule", "num_demo": 1, "num_query": 14, "skills": [ "Mathematical and Logical Reasoning", "Spatial and Temporal Reasoning" ], "input_format": "Diagrams and Data Visualizations", "app": "Perception", "output_format": "numerical_data", "num_input": "1-image" }, { "name": "graph_chordless_cycle", "score": 0.21428571428571427, "eval_type": "rule", "num_demo": 1, "num_query": 14, "skills": [ "Mathematical and Logical Reasoning", "Spatial and Temporal Reasoning" ], "input_format": "Diagrams and Data Visualizations", "app": "Mathematics", "output_format": "numerical_data", "num_input": "1-image" }, { "name": "av_multicamera_tracking_predict_bbox", "score": 0.03751549483739501, "eval_type": "rule", "num_demo": 1, "num_query": 14, "skills": [ "Object Recognition and Classification", "Spatial and Temporal Reasoning" ], "input_format": "Photographs", "app": "Perception", "output_format": "numerical_data", "num_input": "9-image or more" }, { "name": "mvsa_sentiment_classification", "score": 0.7142857142857143, "eval_type": "rule", "num_demo": 1, "num_query": 14, "skills": [ "Object Recognition and Classification", "Language Understanding and Generation", "Commonsense and Social Reasoning" ], "input_format": "Photographs", "app": "Knowledge", "output_format": "multiple_choice", "num_input": "1-image" }, { "name": "math_breakpoint", "score": 0.4666666666666667, "eval_type": "rule", "num_demo": 1, "num_query": 15, "skills": [ "Mathematical and Logical Reasoning", "Spatial and Temporal Reasoning" ], "input_format": "Diagrams and Data Visualizations", "app": "Mathematics", "output_format": "numerical_data", "num_input": "1-image" }, { "name": "ili_ratio_future_prediction", "score": 0.3397142857142856, "eval_type": "rule", "num_demo": 1, "num_query": 14, "skills": [ "Mathematical and Logical Reasoning", "Spatial and Temporal Reasoning" ], "input_format": "Diagrams and Data Visualizations", "app": "Perception", "output_format": "numerical_data", "num_input": "1-image" }, { "name": "dvqa", "score": 1.0, "eval_type": "rule", "num_demo": 1, "num_query": 19, "skills": [ "Object Recognition and Classification", "Mathematical and Logical Reasoning", "Text Recognition (OCR)" ], "input_format": "Diagrams and Data Visualizations", "app": "Perception", "output_format": "numerical_data", "num_input": "1-image" }, { "name": "geometry_solid", "score": 0.14285714285714285, "eval_type": "rule", "num_demo": 1, "num_query": 14, "skills": [ "Object Recognition and Classification", "Mathematical and Logical Reasoning", "Spatial and Temporal Reasoning" ], "input_format": "Diagrams and Data Visualizations", "app": "Mathematics", "output_format": "contextual_formatted_text", "num_input": "1-image" }, { "name": "question_solution_solving", "score": 0.2857142857142857, "eval_type": "rule", "num_demo": 1, "num_query": 14, "skills": [ "Text Recognition (OCR)", "Mathematical and Logical Reasoning", "Language Understanding and Generation" ], "input_format": "User Interface Screenshots", "app": "Science", "output_format": "contextual_formatted_text", "num_input": "1-image" }, { "name": "autorater_3d_model_texturing", "score": 0.8571428571428571, "eval_type": "rule", "num_demo": 1, "num_query": 14, "skills": [ "Object Recognition and Classification", "Scene and Event Understanding" ], "input_format": "3D Models and Aerial Imagery", "app": "Metrics", "output_format": "contextual_formatted_text", "num_input": "2-3 images" }, { "name": "autorater_aesthetics", "score": 0.7857142857142857, "eval_type": "rule", "num_demo": 1, "num_query": 14, "skills": [ "Object Recognition and Classification", "Scene and Event Understanding" ], "input_format": "Photographs", "app": "Metrics", "output_format": "exact_text", "num_input": "2-3 images" }, { "name": "autorater_artifact_reason", "score": 0.4666666666666667, "eval_type": "rule", "num_demo": 0, "num_query": 15, "skills": [ "Object Recognition and Classification", "Scene and Event Understanding" ], "input_format": "Photographs", "app": "Metrics", "output_format": "open_ended_output", "num_input": "1-image" }, { "name": "autorater_subject", "score": 0.42857142857142855, "eval_type": "rule", "num_demo": 1, "num_query": 14, "skills": [ "Object Recognition and Classification", "Scene and Event Understanding", "Commonsense and Social Reasoning" ], "input_format": "Photographs", "app": "Metrics", "output_format": "exact_text", "num_input": "2-3 images" }, { "name": "autorater_motion_guided_editing", "score": 0.35714285714285715, "eval_type": "rule", "num_demo": 1, "num_query": 14, "skills": [ "Object Recognition and Classification", "Scene and Event Understanding", "Spatial and Temporal Reasoning" ], "input_format": "Photographs", "app": "Metrics", "output_format": "multiple_choice", "num_input": "4-5 images" }, { "name": "autorater_unmask", "score": 0.42857142857142855, "eval_type": "rule", "num_demo": 1, "num_query": 14, "skills": [ "Object Recognition and Classification", "Scene and Event Understanding" ], "input_format": "Artistic and Creative Content", "app": "Metrics", "output_format": "exact_text", "num_input": "2-3 images" }, { "name": "autorater_semantics", "score": 0.9285714285714286, "eval_type": "rule", "num_demo": 1, "num_query": 14, "skills": [ "Object Recognition and Classification", "Language Understanding and Generation" ], "input_format": "Photographs", "app": "Metrics", "output_format": "exact_text", "num_input": "2-3 images" }, { "name": "autorater_mask", "score": 0.5714285714285714, "eval_type": "rule", "num_demo": 1, "num_query": 14, "skills": [ "Object Recognition and Classification", "Scene and Event Understanding" ], "input_format": "Photographs", "app": "Metrics", "output_format": "exact_text", "num_input": "2-3 images" }, { "name": "autorater_control", "score": 0.6428571428571429, "eval_type": "rule", "num_demo": 1, "num_query": 14, "skills": [ "Object Recognition and Classification", "Scene and Event Understanding", "Spatial and Temporal Reasoning" ], "input_format": "Photographs", "app": "Metrics", "output_format": "exact_text", "num_input": "2-3 images" }, { "name": "autorater_artifact", "score": 0.2857142857142857, "eval_type": "rule", "num_demo": 1, "num_query": 14, "skills": [ "Object Recognition and Classification", "Scene and Event Understanding" ], "input_format": "Photographs", "app": "Metrics", "output_format": "multiple_choice", "num_input": "1-image" }, { "name": "counterfactual_arithmetic", "score": 0.7857142857142857, "eval_type": "rule", "num_demo": 1, "num_query": 14, "skills": [ "Mathematical and Logical Reasoning" ], "input_format": "Text-Based Images and Documents", "app": "Mathematics", "output_format": "numerical_data", "num_input": "1-image" }, { "name": "poetry_acrostic_alliteration", "score": 0.3333333333333333, "eval_type": "rule", "num_demo": 0, "num_query": 15, "skills": [ "Language Understanding and Generation", "Object Recognition and Classification" ], "input_format": "Photographs", "app": "Knowledge", "output_format": "open_ended_output", "num_input": "1-image" }, { "name": "poetry_shakespearean_sonnet", "score": 0.0, "eval_type": "rule", "num_demo": 0, "num_query": 15, "skills": [ "Language Understanding and Generation", "Object Recognition and Classification" ], "input_format": "Photographs", "app": "Knowledge", "output_format": "open_ended_output", "num_input": "1-image" }, { "name": "poetry_haiku", "score": 0.4, "eval_type": "rule", "num_demo": 0, "num_query": 15, "skills": [ "Object Recognition and Classification", "Language Understanding and Generation" ], "input_format": "Photographs", "app": "Knowledge", "output_format": "open_ended_output", "num_input": "1-image" }, { "name": "poetry_petrarchian_sonnet_optional_meter", "score": 0.0, "eval_type": "rule", "num_demo": 0, "num_query": 15, "skills": [ "Language Understanding and Generation", "Object Recognition and Classification" ], "input_format": "Photographs", "app": "Knowledge", "output_format": "open_ended_output", "num_input": "1-image" }, { "name": "poetry_acrostic", "score": 0.7333333333333333, "eval_type": "rule", "num_demo": 0, "num_query": 15, "skills": [ "Object Recognition and Classification", "Language Understanding and Generation" ], "input_format": "Photographs", "app": "Knowledge", "output_format": "open_ended_output", "num_input": "1-image" }, { "name": "poetry_limerick", "score": 0.5333333333333333, "eval_type": "rule", "num_demo": 0, "num_query": 15, "skills": [ "Object Recognition and Classification", "Language Understanding and Generation" ], "input_format": "Photographs", "app": "Knowledge", "output_format": "open_ended_output", "num_input": "1-image" }, { "name": "poetry_custom_rhyming_scheme", "score": 0.26666666666666666, "eval_type": "rule", "num_demo": 0, "num_query": 15, "skills": [ "Object Recognition and Classification", "Language Understanding and Generation" ], "input_format": "Photographs", "app": "Knowledge", "output_format": "open_ended_output", "num_input": "1-image" }, { "name": "shape_composition_shapes", "score": 0.5346938775510204, "eval_type": "rule", "num_demo": 1, "num_query": 14, "skills": [ "Object Recognition and Classification", "Spatial and Temporal Reasoning" ], "input_format": "Artistic and Creative Content", "app": "Perception", "output_format": "structured_output", "num_input": "1-image" }, { "name": "shape_composition_colours", "score": 0.4522108843537415, "eval_type": "rule", "num_demo": 1, "num_query": 14, "skills": [ "Object Recognition and Classification" ], "input_format": "Artistic and Creative Content", "app": "Perception", "output_format": "structured_output", "num_input": "1-image" }, { "name": "ocr_article_authors", "score": 0.7857142857142857, "eval_type": "rule", "num_demo": 1, "num_query": 14, "skills": [ "Text Recognition (OCR)" ], "input_format": "Text-Based Images and Documents", "app": "Information_Extraction", "output_format": "structured_output", "num_input": "1-image" }, { "name": "ocr_table_to_html", "score": 0.7857142857142857, "eval_type": "rule", "num_demo": 1, "num_query": 14, "skills": [ "Text Recognition (OCR)", "Language Understanding and Generation" ], "input_format": "Text-Based Images and Documents", "app": "Coding", "output_format": "structured_output", "num_input": "1-image" }, { "name": "ocr_article_journal", "score": 0.6428571428571429, "eval_type": "rule", "num_demo": 1, "num_query": 14, "skills": [ "Text Recognition (OCR)" ], "input_format": "Text-Based Images and Documents", "app": "Perception", "output_format": "contextual_formatted_text", "num_input": "1-image" }, { "name": "ocr_resume_skill_plain", "score": 0.14285714285714285, "eval_type": "rule", "num_demo": 1, "num_query": 14, "skills": [ "Text Recognition (OCR)", "Language Understanding and Generation" ], "input_format": "Text-Based Images and Documents", "app": "Perception", "output_format": "contextual_formatted_text", "num_input": "1-image" }, { "name": "ocr_math_equation", "score": 0.5714285714285714, "eval_type": "rule", "num_demo": 1, "num_query": 14, "skills": [ "Text Recognition (OCR)", "Mathematical and Logical Reasoning" ], "input_format": "Text-Based Images and Documents", "app": "Coding", "output_format": "contextual_formatted_text", "num_input": "1-image" }, { "name": "ocr_table_to_latex", "score": 0.7142857142857143, "eval_type": "rule", "num_demo": 1, "num_query": 14, "skills": [ "Text Recognition (OCR)", "Language Understanding and Generation" ], "input_format": "Diagrams and Data Visualizations", "app": "Coding", "output_format": "structured_output", "num_input": "1-image" }, { "name": "ocr_resume_experience_plain", "score": 0.6428571428571429, "eval_type": "rule", "num_demo": 1, "num_query": 14, "skills": [ "Text Recognition (OCR)", "Language Understanding and Generation" ], "input_format": "Text-Based Images and Documents", "app": "Perception", "output_format": "contextual_formatted_text", "num_input": "1-image" }, { "name": "ocr_resume_employer_plain", "score": 0.5, "eval_type": "rule", "num_demo": 1, "num_query": 14, "skills": [ "Text Recognition (OCR)" ], "input_format": "Text-Based Images and Documents", "app": "Perception", "output_format": "contextual_formatted_text", "num_input": "1-image" }, { "name": "ocr_math_text_latex", "score": 0.6428571428571429, "eval_type": "rule", "num_demo": 1, "num_query": 14, "skills": [ "Text Recognition (OCR)", "Mathematical and Logical Reasoning", "Language Understanding and Generation" ], "input_format": "Text-Based Images and Documents", "app": "Coding", "output_format": "contextual_formatted_text", "num_input": "1-image" }, { "name": "ocr_table_to_markdown", "score": 0.9285714285714286, "eval_type": "rule", "num_demo": 1, "num_query": 14, "skills": [ "Text Recognition (OCR)", "Language Understanding and Generation" ], "input_format": "Diagrams and Data Visualizations", "app": "Coding", "output_format": "structured_output", "num_input": "1-image" }, { "name": "ocr_resume_school_plain", "score": 0.7857142857142857, "eval_type": "rule", "num_demo": 1, "num_query": 14, "skills": [ "Text Recognition (OCR)" ], "input_format": "Text-Based Images and Documents", "app": "Information_Extraction", "output_format": "contextual_formatted_text", "num_input": "1-image" }, { "name": "ocr_table_to_csv", "score": 0.6428571428571429, "eval_type": "rule", "num_demo": 1, "num_query": 14, "skills": [ "Text Recognition (OCR)", "Language Understanding and Generation" ], "input_format": "Text-Based Images and Documents", "app": "Coding", "output_format": "structured_output", "num_input": "1-image" }, { "name": "memorization_indian_celebrity", "score": 0.0, "eval_type": "rule", "num_demo": 1, "num_query": 14, "skills": [ "Object Recognition and Classification", "Domain-Specific Knowledge and Skills" ], "input_format": "Photographs", "app": "Knowledge", "output_format": "structured_output", "num_input": "1-image" }, { "name": "memorization_chinese_celebrity", "score": 0.0, "eval_type": "rule", "num_demo": 1, "num_query": 14, "skills": [ "Object Recognition and Classification", "Language Understanding and Generation" ], "input_format": "Photographs", "app": "Knowledge", "output_format": "structured_output", "num_input": "1-image" }, { "name": "memorization_famous_treaty", "score": 0.5357142857142857, "eval_type": "rule", "num_demo": 1, "num_query": 14, "skills": [ "Object Recognition and Classification", "Domain-Specific Knowledge and Skills" ], "input_format": "Artistic and Creative Content", "app": "Knowledge", "output_format": "structured_output", "num_input": "1-image" }, { "name": "memorization_papers", "score": 0.5333333333333333, "eval_type": "rule", "num_demo": 1, "num_query": 15, "skills": [ "Text Recognition (OCR)", "Domain-Specific Knowledge and Skills" ], "input_format": "Diagrams and Data Visualizations", "app": "Knowledge", "output_format": "structured_output", "num_input": "1-image" }, { "name": "ocr_math_MATH", "score": 0.7333333333333333, "eval_type": "rule", "num_demo": 1, "num_query": 15, "skills": [ "Text Recognition (OCR)", "Mathematical and Logical Reasoning" ], "input_format": "Text-Based Images and Documents", "app": "Mathematics", "output_format": "numerical_data", "num_input": "1-image" }, { "name": "ocr_math_TheoremQA", "score": 0.7857142857142857, "eval_type": "rule", "num_demo": 1, "num_query": 14, "skills": [ "Text Recognition (OCR)", "Mathematical and Logical Reasoning" ], "input_format": "Text-Based Images and Documents", "app": "Mathematics", "output_format": "contextual_formatted_text", "num_input": "1-image" }, { "name": "MMSoc_HatefulMemes", "score": 0.7857142857142857, "eval_type": "rule", "num_demo": 1, "num_query": 14, "skills": [ "Ethical and Safety Reasoning", "Commonsense and Social Reasoning", "Text Recognition (OCR)" ], "input_format": "Photographs", "app": "Knowledge", "output_format": "multiple_choice", "num_input": "1-image" }, { "name": "MMSoc_Misinformation_PolitiFact", "score": 0.8571428571428571, "eval_type": "rule", "num_demo": 1, "num_query": 14, "skills": [ "Commonsense and Social Reasoning", "Language Understanding and Generation", "Ethical and Safety Reasoning" ], "input_format": "Photographs", "app": "Knowledge", "output_format": "multiple_choice", "num_input": "1-image" }, { "name": "MMSoc_Misinformation_GossipCop", "score": 0.7142857142857143, "eval_type": "rule", "num_demo": 1, "num_query": 14, "skills": [ "Commonsense and Social Reasoning", "Language Understanding and Generation", "Object Recognition and Classification", "Scene and Event Understanding", "Ethical and Safety Reasoning" ], "input_format": "Photographs", "app": "Knowledge", "output_format": "multiple_choice", "num_input": "1-image" }, { "name": "MMSoc_Memotion", "score": 0.5529411764705884, "eval_type": "rule", "num_demo": 1, "num_query": 17, "skills": [ "Commonsense and Social Reasoning", "Language Understanding and Generation" ], "input_format": "Artistic and Creative Content", "app": "Knowledge", "output_format": "structured_output", "num_input": "1-image" }, { "name": "app_layout_understanding_leetcode", "score": 0.5714285714285714, "eval_type": "rule", "num_demo": 1, "num_query": 14, "skills": [ "Object Recognition and Classification", "Text Recognition (OCR)" ], "input_format": "User Interface Screenshots", "app": "Information_Extraction", "output_format": "exact_text", "num_input": "1-image" }, { "name": "app_layout_understanding_instagram", "score": 0.7142857142857143, "eval_type": "rule", "num_demo": 1, "num_query": 14, "skills": [ "Object Recognition and Classification", "Scene and Event Understanding" ], "input_format": "User Interface Screenshots", "app": "Information_Extraction", "output_format": "exact_text", "num_input": "1-image" }, { "name": "app_layout_understanding_iphone_settings", "score": 0.8571428571428571, "eval_type": "rule", "num_demo": 1, "num_query": 14, "skills": [ "Object Recognition and Classification", "Scene and Event Understanding" ], "input_format": "User Interface Screenshots", "app": "Information_Extraction", "output_format": "exact_text", "num_input": "1-image" }, { "name": "app_layout_understanding_ppt", "score": 0.6428571428571429, "eval_type": "rule", "num_demo": 1, "num_query": 14, "skills": [ "Object Recognition and Classification", "Text Recognition (OCR)", "Language Understanding and Generation" ], "input_format": "User Interface Screenshots", "app": "Information_Extraction", "output_format": "exact_text", "num_input": "1-image" }, { "name": "medical_abdomen_MRI_organ_recognition", "score": 0.25, "eval_type": "rule", "num_demo": 1, "num_query": 14, "skills": [ "Object Recognition and Classification", "Domain-Specific Knowledge and Skills" ], "input_format": "Text-Based Images and Documents", "app": "Science", "output_format": "contextual_formatted_text", "num_input": "4-5 images" }, { "name": "medical_cell_recognition", "score": 0.5, "eval_type": "rule", "num_demo": 1, "num_query": 14, "skills": [ "Object Recognition and Classification", "Domain-Specific Knowledge and Skills" ], "input_format": "Photographs", "app": "Science", "output_format": "exact_text", "num_input": "1-image" }, { "name": "medical_image_artifacts_indentification", "score": 0.07142857142857142, "eval_type": "rule", "num_demo": 1, "num_query": 14, "skills": [ "Object Recognition and Classification", "Domain-Specific Knowledge and Skills" ], "input_format": "Photographs", "app": "Science", "output_format": "exact_text", "num_input": "1-image" }, { "name": "medical_abdomen_endscopy_organ_recognition", "score": 0.34523809523809523, "eval_type": "rule", "num_demo": 1, "num_query": 14, "skills": [ "Object Recognition and Classification", "Domain-Specific Knowledge and Skills" ], "input_format": "Photographs", "app": "Science", "output_format": "contextual_formatted_text", "num_input": "4-5 images" }, { "name": "medical_retrieval_given_surgeon_activity", "score": 0.2857142857142857, "eval_type": "rule", "num_demo": 1, "num_query": 14, "skills": [ "Object Recognition and Classification", "Scene and Event Understanding", "Domain-Specific Knowledge and Skills" ], "input_format": "Videos", "app": "Science", "output_format": "multiple_choice", "num_input": "video" }, { "name": "medical_counting_lymphocytes", "score": 0.14285714285714285, "eval_type": "rule", "num_demo": 1, "num_query": 14, "skills": [ "Object Recognition and Classification", "Domain-Specific Knowledge and Skills" ], "input_format": "Text-Based Images and Documents", "app": "Science", "output_format": "numerical_data", "num_input": "1-image" }, { "name": "medical_blood_vessels_recognition", "score": 0.6428571428571429, "eval_type": "rule", "num_demo": 1, "num_query": 14, "skills": [ "Object Recognition and Classification", "Domain-Specific Knowledge and Skills" ], "input_format": "Text-Based Images and Documents", "app": "Science", "output_format": "structured_output", "num_input": "1-image" }, { "name": "app_layout_understanding_amazon", "score": 0.8571428571428571, "eval_type": "rule", "num_demo": 1, "num_query": 14, "skills": [ "Object Recognition and Classification", "Text Recognition (OCR)", "Scene and Event Understanding" ], "input_format": "User Interface Screenshots", "app": "Information_Extraction", "output_format": "exact_text", "num_input": "1-image" }, { "name": "app_layout_understanding_excel", "score": 0.6428571428571429, "eval_type": "rule", "num_demo": 1, "num_query": 14, "skills": [ "Object Recognition and Classification", "Text Recognition (OCR)" ], "input_format": "User Interface Screenshots", "app": "Information_Extraction", "output_format": "exact_text", "num_input": "1-image" }, { "name": "app_layout_understanding_youtube", "score": 0.7857142857142857, "eval_type": "rule", "num_demo": 1, "num_query": 14, "skills": [ "Object Recognition and Classification", "Scene and Event Understanding", "Text Recognition (OCR)" ], "input_format": "User Interface Screenshots", "app": "Information_Extraction", "output_format": "exact_text", "num_input": "1-image" }, { "name": "app_layout_understanding_twitter", "score": 0.7142857142857143, "eval_type": "rule", "num_demo": 1, "num_query": 14, "skills": [ "Object Recognition and Classification", "Scene and Event Understanding", "Language Understanding and Generation" ], "input_format": "User Interface Screenshots", "app": "Information_Extraction", "output_format": "exact_text", "num_input": "1-image" }, { "name": "app_layout_understanding_alipay", "score": 0.8235294117647058, "eval_type": "rule", "num_demo": 1, "num_query": 17, "skills": [ "Object Recognition and Classification", "Text Recognition (OCR)", "Language Understanding and Generation" ], "input_format": "User Interface Screenshots", "app": "Information_Extraction", "output_format": "exact_text", "num_input": "1-image" }, { "name": "app_layout_understanding_zoom", "score": 0.6666666666666666, "eval_type": "rule", "num_demo": 1, "num_query": 15, "skills": [ "Object Recognition and Classification", "Language Understanding and Generation" ], "input_format": "User Interface Screenshots", "app": "Information_Extraction", "output_format": "exact_text", "num_input": "1-image" }, { "name": "app_layout_understanding_word", "score": 0.6428571428571429, "eval_type": "rule", "num_demo": 1, "num_query": 14, "skills": [ "Object Recognition and Classification", "Text Recognition (OCR)", "Language Understanding and Generation" ], "input_format": "User Interface Screenshots", "app": "Information_Extraction", "output_format": "exact_text", "num_input": "1-image" }, { "name": "app_layout_understanding_tiktok", "score": 0.7142857142857143, "eval_type": "rule", "num_demo": 1, "num_query": 14, "skills": [ "Object Recognition and Classification", "Text Recognition (OCR)" ], "input_format": "User Interface Screenshots", "app": "Information_Extraction", "output_format": "exact_text", "num_input": "1-image" }, { "name": "maze_2d_8x8", "score": 0.0, "eval_type": "rule", "num_demo": 1, "num_query": 14, "skills": [ "Spatial and Temporal Reasoning", "Planning and Decision Making" ], "input_format": "Diagrams and Data Visualizations", "app": "Planning", "output_format": "exact_text", "num_input": "1-image" }, { "name": "visual_prediction_rater_depth_estimation", "score": 0.47619047619047616, "eval_type": "rule", "num_demo": 1, "num_query": 14, "skills": [ "Object Recognition and Classification", "Scene and Event Understanding", "Spatial and Temporal Reasoning" ], "input_format": "Photographs", "app": "Perception", "output_format": "multiple_choice", "num_input": "4-5 images" }, { "name": "visual_prediction_rater_plane_segmentation", "score": 0.6222222222222221, "eval_type": "rule", "num_demo": 1, "num_query": 15, "skills": [ "Object Recognition and Classification", "Scene and Event Understanding", "Spatial and Temporal Reasoning" ], "input_format": "Photographs", "app": "Perception", "output_format": "multiple_choice", "num_input": "4-5 images" }, { "name": "visual_prediction_rater_openable_part_segmentation", "score": 0.0, "eval_type": "rule", "num_demo": 1, "num_query": 14, "skills": [ "Object Recognition and Classification", "Scene and Event Understanding", "Spatial and Temporal Reasoning" ], "input_format": "Photographs", "app": "Perception", "output_format": "multiple_choice", "num_input": "4-5 images" }, { "name": "visual_prediction_rater_panoptic_segmentation", "score": 0.5952380952380952, "eval_type": "rule", "num_demo": 1, "num_query": 14, "skills": [ "Object Recognition and Classification", "Scene and Event Understanding", "Spatial and Temporal Reasoning" ], "input_format": "Photographs", "app": "Perception", "output_format": "multiple_choice", "num_input": "4-5 images" }, { "name": "visual_prediction_rater_surface_normal_estimation", "score": 0.5714285714285714, "eval_type": "rule", "num_demo": 1, "num_query": 14, "skills": [ "Object Recognition and Classification", "Spatial and Temporal Reasoning", "Scene and Event Understanding" ], "input_format": "Photographs", "app": "Perception", "output_format": "multiple_choice", "num_input": "4-5 images" }, { "name": "visual_prediction_rater_3d_assembled_quality_understanding", "score": 0.5714285714285714, "eval_type": "rule", "num_demo": 1, "num_query": 14, "skills": [ "Object Recognition and Classification", "Spatial and Temporal Reasoning" ], "input_format": "3D Models and Aerial Imagery", "app": "Perception", "output_format": "multiple_choice", "num_input": "2-3 images" }, { "name": "visual_prediction_rater_novel_view_synthesis", "score": 0.0, "eval_type": "rule", "num_demo": 1, "num_query": 14, "skills": [ "Object Recognition and Classification", "Scene and Event Understanding", "Spatial and Temporal Reasoning" ], "input_format": "3D Models and Aerial Imagery", "app": "Perception", "output_format": "multiple_choice", "num_input": "4-5 images" }, { "name": "visual_prediction_rater_semantic_segmentation", "score": 0.4166666666666667, "eval_type": "rule", "num_demo": 1, "num_query": 16, "skills": [ "Object Recognition and Classification", "Scene and Event Understanding" ], "input_format": "Photographs", "app": "Perception", "output_format": "multiple_choice", "num_input": "4-5 images" }, { "name": "cvbench_adapted_cvbench_relation", "score": 0.7142857142857143, "eval_type": "rule", "num_demo": 1, "num_query": 14, "skills": [ "Object Recognition and Classification", "Spatial and Temporal Reasoning" ], "input_format": "Photographs", "app": "Perception", "output_format": "exact_text", "num_input": "1-image" }, { "name": "cvbench_adapted_cvbench_distance", "score": 0.8571428571428571, "eval_type": "rule", "num_demo": 1, "num_query": 14, "skills": [ "Object Recognition and Classification", "Spatial and Temporal Reasoning" ], "input_format": "Photographs", "app": "Perception", "output_format": "exact_text", "num_input": "1-image" }, { "name": "cvbench_adapted_cvbench_depth", "score": 1.0, "eval_type": "rule", "num_demo": 1, "num_query": 14, "skills": [ "Object Recognition and Classification", "Spatial and Temporal Reasoning" ], "input_format": "Photographs", "app": "Perception", "output_format": "exact_text", "num_input": "1-image" }, { "name": "cvbench_adapted_cvbench_count", "score": 0.35714285714285715, "eval_type": "rule", "num_demo": 1, "num_query": 14, "skills": [ "Object Recognition and Classification", "Scene and Event Understanding" ], "input_format": "Photographs", "app": "Perception", "output_format": "numerical_data", "num_input": "1-image" }, { "name": "symbolic_graphics_programs_scalable_vector_graphics", "score": 0.1111111111111111, "eval_type": "rule", "num_demo": 1, "num_query": 18, "skills": [ "Object Recognition and Classification", "Text Recognition (OCR)" ], "input_format": "Text-Based Images and Documents", "app": "Coding", "output_format": "contextual_formatted_text", "num_input": "1-image" }, { "name": "symbolic_graphics_programs_computer_aided_design", "score": 0.6428571428571429, "eval_type": "rule", "num_demo": 1, "num_query": 14, "skills": [ "Mathematical and Logical Reasoning", "Domain-Specific Knowledge and Skills" ], "input_format": "Text-Based Images and Documents", "app": "Coding", "output_format": "contextual_formatted_text", "num_input": "1-image" }, { "name": "multiple_states_identify_africa", "score": 0.8142857142857142, "eval_type": "rule", "num_demo": 1, "num_query": 14, "skills": [ "Object Recognition and Classification", "Domain-Specific Knowledge and Skills" ], "input_format": "Diagrams and Data Visualizations", "app": "Knowledge", "output_format": "contextual_formatted_text", "num_input": "1-image" }, { "name": "multiple_states_identify_europe", "score": 0.7, "eval_type": "rule", "num_demo": 1, "num_query": 14, "skills": [ "Object Recognition and Classification", "Domain-Specific Knowledge and Skills" ], "input_format": "Diagrams and Data Visualizations", "app": "Knowledge", "output_format": "contextual_formatted_text", "num_input": "1-image" }, { "name": "multiple_states_identify_asia", "score": 0.5857142857142856, "eval_type": "rule", "num_demo": 1, "num_query": 14, "skills": [ "Object Recognition and Classification", "Domain-Specific Knowledge and Skills" ], "input_format": "Diagrams and Data Visualizations", "app": "Knowledge", "output_format": "contextual_formatted_text", "num_input": "1-image" }, { "name": "multiple_states_identify_americas", "score": 0.7, "eval_type": "rule", "num_demo": 1, "num_query": 14, "skills": [ "Object Recognition and Classification", "Domain-Specific Knowledge and Skills" ], "input_format": "Diagrams and Data Visualizations", "app": "Knowledge", "output_format": "contextual_formatted_text", "num_input": "1-image" }, { "name": "geographic_remote_sensing_land_cover", "score": 0.5, "eval_type": "rule", "num_demo": 1, "num_query": 14, "skills": [ "Object Recognition and Classification", "Scene and Event Understanding" ], "input_format": "3D Models and Aerial Imagery", "app": "Perception", "output_format": "contextual_formatted_text", "num_input": "2-3 images" }, { "name": "video_motion_matching_3D_real", "score": 0.4666666666666667, "eval_type": "rule", "num_demo": 1, "num_query": 15, "skills": [ "Scene and Event Understanding", "Spatial and Temporal Reasoning" ], "input_format": "Videos", "app": "Perception", "output_format": "multiple_choice", "num_input": "video" }, { "name": "video_motion_matching_real_3D", "score": 0.35714285714285715, "eval_type": "rule", "num_demo": 1, "num_query": 14, "skills": [ "Scene and Event Understanding", "Spatial and Temporal Reasoning" ], "input_format": "Videos", "app": "Perception", "output_format": "multiple_choice", "num_input": "video" }, { "name": "cultural_vqa", "score": 0.4, "eval_type": "rule", "num_demo": 1, "num_query": 15, "skills": [ "Object Recognition and Classification", "Commonsense and Social Reasoning" ], "input_format": "Photographs", "app": "Knowledge", "output_format": "contextual_formatted_text", "num_input": "1-image" }, { "name": "character_recognition_in_TV_shows", "score": 0.6428571428571429, "eval_type": "rule", "num_demo": 1, "num_query": 14, "skills": [ "Object Recognition and Classification", "Language Understanding and Generation" ], "input_format": "User Interface Screenshots", "app": "Knowledge", "output_format": "contextual_formatted_text", "num_input": "1-image" }, { "name": "code_output_result", "score": 0.6428571428571429, "eval_type": "rule", "num_demo": 1, "num_query": 14, "skills": [ "Text Recognition (OCR)", "Mathematical and Logical Reasoning", "Domain-Specific Knowledge and Skills" ], "input_format": "Text-Based Images and Documents", "app": "Coding", "output_format": "exact_text", "num_input": "4-5 images" }, { "name": "kvqa_knowledge_aware_qa", "score": 0.3157894736842105, "eval_type": "rule", "num_demo": 1, "num_query": 19, "skills": [ "Object Recognition and Classification", "Language Understanding and Generation", "Commonsense and Social Reasoning" ], "input_format": "Photographs", "app": "Knowledge", "output_format": "contextual_formatted_text", "num_input": "1-image" }, { "name": "tqa_textbook_qa", "score": 0.8571428571428571, "eval_type": "rule", "num_demo": 1, "num_query": 14, "skills": [ "Object Recognition and Classification", "Scene and Event Understanding", "Text Recognition (OCR)" ], "input_format": "Diagrams and Data Visualizations", "app": "Science", "output_format": "contextual_formatted_text", "num_input": "1-image" }, { "name": "painting_QA", "score": 0.8571428571428571, "eval_type": "rule", "num_demo": 1, "num_query": 14, "skills": [ "Object Recognition and Classification", "Domain-Specific Knowledge and Skills" ], "input_format": "Artistic and Creative Content", "app": "Knowledge", "output_format": "exact_text", "num_input": "4-5 images" }, { "name": "MMMU_physics_chemistry_MCQ", "score": 0.6428571428571429, "eval_type": "rule", "num_demo": 1, "num_query": 14, "skills": [ "Mathematical and Logical Reasoning", "Domain-Specific Knowledge and Skills" ], "input_format": "Diagrams and Data Visualizations", "app": "Science", "output_format": "exact_text", "num_input": "1-image" }, { "name": "arxiv_vqa", "score": 0.9285714285714286, "eval_type": "rule", "num_demo": 1, "num_query": 14, "skills": [ "Object Recognition and Classification", "Text Recognition (OCR)", "Mathematical and Logical Reasoning", "Language Understanding and Generation" ], "input_format": "Diagrams and Data Visualizations", "app": "Science", "output_format": "multiple_choice", "num_input": "1-image" }, { "name": "realworld_qa_en2cn", "score": 0.42857142857142855, "eval_type": "rule", "num_demo": 1, "num_query": 14, "skills": [ "Object Recognition and Classification", "Scene and Event Understanding", "Language Understanding and Generation" ], "input_format": "Photographs", "app": "Information_Extraction", "output_format": "contextual_formatted_text", "num_input": "1-image" }, { "name": "code_add_tag", "score": 0.4666666666666667, "eval_type": "rule", "num_demo": 1, "num_query": 15, "skills": [ "Text Recognition (OCR)", "Domain-Specific Knowledge and Skills" ], "input_format": "Text-Based Images and Documents", "app": "Coding", "output_format": "contextual_formatted_text", "num_input": "2-3 images" }, { "name": "table_understanding", "score": 0.6428571428571429, "eval_type": "rule", "num_demo": 1, "num_query": 14, "skills": [ "Mathematical and Logical Reasoning", "Text Recognition (OCR)" ], "input_format": "Diagrams and Data Visualizations", "app": "Perception", "output_format": "numerical_data", "num_input": "1-image" }, { "name": "handwritten_math_expression_extraction", "score": 0.42857142857142855, "eval_type": "rule", "num_demo": 1, "num_query": 14, "skills": [ "Text Recognition (OCR)", "Mathematical and Logical Reasoning" ], "input_format": "Text-Based Images and Documents", "app": "Perception", "output_format": "contextual_formatted_text", "num_input": "1-image" }, { "name": "3d_indoor_scene_text_bbox_selection", "score": 0.6428571428571429, "eval_type": "rule", "num_demo": 1, "num_query": 14, "skills": [ "Object Recognition and Classification", "Scene and Event Understanding", "Spatial and Temporal Reasoning" ], "input_format": "3D Models and Aerial Imagery", "app": "Perception", "output_format": "multiple_choice", "num_input": "1-image" }, { "name": "app_layout_understanding_notes", "score": 0.6428571428571429, "eval_type": "rule", "num_demo": 1, "num_query": 14, "skills": [ "Object Recognition and Classification", "Language Understanding and Generation", "Text Recognition (OCR)" ], "input_format": "User Interface Screenshots", "app": "Information_Extraction", "output_format": "exact_text", "num_input": "1-image" }, { "name": "red_teaming_racial", "score": 0.8450000000000001, "eval_type": "llm", "num_demo": 0, "num_query": 20, "skills": [ "Ethical and Safety Reasoning", "Scene and Event Understanding", "Object Recognition and Classification" ], "input_format": "Photographs", "app": "Knowledge", "output_format": "open_ended_output", "num_input": "1-image" }, { "name": "ascii_art_30", "score": 0.2857142857142857, "eval_type": "llm", "num_demo": 1, "num_query": 14, "skills": [ "Object Recognition and Classification", "Language Understanding and Generation" ], "input_format": "Photographs", "app": "Knowledge", "output_format": "contextual_formatted_text", "num_input": "1-image" }, { "name": "table2latex_complex", "score": 0.7222222222222222, "eval_type": "llm", "num_demo": 1, "num_query": 9, "skills": [ "Text Recognition (OCR)", "Language Understanding and Generation" ], "input_format": "Text-Based Images and Documents", "app": "Coding", "output_format": "structured_output", "num_input": "1-image" }, { "name": "meme_explain", "score": 0.8142857142857143, "eval_type": "llm", "num_demo": 1, "num_query": 14, "skills": [ "Commonsense and Social Reasoning", "Scene and Event Understanding", "Language Understanding and Generation" ], "input_format": "Photographs", "app": "Knowledge", "output_format": "open_ended_output", "num_input": "1-image" }, { "name": "video_summary", "score": 0.6357142857142858, "eval_type": "llm", "num_demo": 1, "num_query": 14, "skills": [ "Scene and Event Understanding", "Language Understanding and Generation" ], "input_format": "Videos", "app": "Information_Extraction", "output_format": "open_ended_output", "num_input": "video" }, { "name": "funqa_unexpected_action_magic_video", "score": 0.58, "eval_type": "llm", "num_demo": 1, "num_query": 15, "skills": [ "Scene and Event Understanding", "Spatial and Temporal Reasoning", "Language Understanding and Generation" ], "input_format": "Videos", "app": "Information_Extraction", "output_format": "open_ended_output", "num_input": "video" }, { "name": "paper_review_writing", "score": 0.58, "eval_type": "llm", "num_demo": 1, "num_query": 15, "skills": [ "Object Recognition and Classification", "Text Recognition (OCR)", "Language Understanding and Generation", "Domain-Specific Knowledge and Skills" ], "input_format": "Text-Based Images and Documents", "app": "Metrics", "output_format": "open_ended_output", "num_input": "4-5 images" }, { "name": "activitynetqa", "score": 0.4, "eval_type": "llm", "num_demo": 1, "num_query": 19, "skills": [ "Scene and Event Understanding", "Object Recognition and Classification" ], "input_format": "Videos", "app": "Information_Extraction", "output_format": "open_ended_output", "num_input": "video" }, { "name": "scibench_w_solution_open_ended", "score": 0.258, "eval_type": "llm", "num_demo": 1, "num_query": 25, "skills": [ "Mathematical and Logical Reasoning", "Domain-Specific Knowledge and Skills", "Language Understanding and Generation" ], "input_format": "Diagrams and Data Visualizations", "app": "Science", "output_format": "open_ended_output", "num_input": "1-image" }, { "name": "generated_video_artifacts", "score": 0.325, "eval_type": "llm", "num_demo": 1, "num_query": 16, "skills": [ "Object Recognition and Classification", "Scene and Event Understanding", "Commonsense and Social Reasoning" ], "input_format": "Videos", "app": "Metrics", "output_format": "open_ended_output", "num_input": "video" }, { "name": "funny_image_title", "score": 0.6928571428571428, "eval_type": "llm", "num_demo": 1, "num_query": 14, "skills": [ "Object Recognition and Classification", "Language Understanding and Generation", "Commonsense and Social Reasoning" ], "input_format": "Photographs", "app": "Knowledge", "output_format": "open_ended_output", "num_input": "1-image" }, { "name": "wikihow_complex_task_completion", "score": 0.8444444444444446, "eval_type": "llm", "num_demo": 1, "num_query": 9, "skills": [ "Object Recognition and Classification", "Scene and Event Understanding", "Language Understanding and Generation" ], "input_format": "Artistic and Creative Content", "app": "Planning", "output_format": "open_ended_output", "num_input": "9-image or more" }, { "name": "video_detail_description", "score": 0.38947368421052636, "eval_type": "llm", "num_demo": 1, "num_query": 19, "skills": [ "Object Recognition and Classification", "Scene and Event Understanding", "Spatial and Temporal Reasoning" ], "input_format": "Videos", "app": "Perception", "output_format": "open_ended_output", "num_input": "video" }, { "name": "funqa_unexpected_action_creative_video", "score": 0.32666666666666655, "eval_type": "llm", "num_demo": 1, "num_query": 15, "skills": [ "Object Recognition and Classification", "Scene and Event Understanding", "Commonsense and Social Reasoning" ], "input_format": "Videos", "app": "Information_Extraction", "output_format": "open_ended_output", "num_input": "video" }, { "name": "guess_image_generation_prompt", "score": 0.8473684210526317, "eval_type": "llm", "num_demo": 1, "num_query": 19, "skills": [ "Scene and Event Understanding", "Language Understanding and Generation" ], "input_format": "Artistic and Creative Content", "app": "Perception", "output_format": "open_ended_output", "num_input": "1-image" }, { "name": "traffic_accident_analysis", "score": 0.5357142857142857, "eval_type": "llm", "num_demo": 1, "num_query": 14, "skills": [ "Object Recognition and Classification", "Scene and Event Understanding", "Commonsense and Social Reasoning" ], "input_format": "Diagrams and Data Visualizations", "app": "Knowledge", "output_format": "open_ended_output", "num_input": "1-image" }, { "name": "video_qa", "score": 0.8857142857142859, "eval_type": "llm", "num_demo": 1, "num_query": 14, "skills": [ "Object Recognition and Classification", "Scene and Event Understanding", "Spatial and Temporal Reasoning", "Language Understanding and Generation" ], "input_format": "Videos", "app": "Information_Extraction", "output_format": "open_ended_output", "num_input": "video" }, { "name": "vibe-eval", "score": 0.4928571428571428, "eval_type": "llm", "num_demo": 1, "num_query": 14, "skills": [ "Object Recognition and Classification", "Ethical and Safety Reasoning", "Commonsense and Social Reasoning" ], "input_format": "Photographs", "app": "Knowledge", "output_format": "open_ended_output", "num_input": "1-image" }, { "name": "nextqa_oe", "score": 0.3684210526315789, "eval_type": "llm", "num_demo": 1, "num_query": 19, "skills": [ "Object Recognition and Classification", "Scene and Event Understanding", "Language Understanding and Generation" ], "input_format": "Videos", "app": "Information_Extraction", "output_format": "open_ended_output", "num_input": "video" }, { "name": "video_short_title", "score": 0.7500000000000001, "eval_type": "llm", "num_demo": 1, "num_query": 14, "skills": [ "Language Understanding and Generation", "Scene and Event Understanding" ], "input_format": "Videos", "app": "Information_Extraction", "output_format": "open_ended_output", "num_input": "video" }, { "name": "video2notes", "score": 0.7142857142857143, "eval_type": "llm", "num_demo": 1, "num_query": 14, "skills": [ "Scene and Event Understanding", "Language Understanding and Generation", "Object Recognition and Classification" ], "input_format": "Videos", "app": "Information_Extraction", "output_format": "open_ended_output", "num_input": "video" }, { "name": "image_humor_understanding", "score": 0.8931034482758619, "eval_type": "llm", "num_demo": 1, "num_query": 29, "skills": [ "Commonsense and Social Reasoning", "Scene and Event Understanding", "Language Understanding and Generation" ], "input_format": "Photographs", "app": "Knowledge", "output_format": "open_ended_output", "num_input": "1-image" }, { "name": "graph_interpretation", "score": 0.882758620689655, "eval_type": "llm", "num_demo": 1, "num_query": 29, "skills": [ "Mathematical and Logical Reasoning", "Language Understanding and Generation", "Domain-Specific Knowledge and Skills" ], "input_format": "Diagrams and Data Visualizations", "app": "Perception", "output_format": "open_ended_output", "num_input": "1-image" }, { "name": "science_figure_explanation", "score": 0.8551724137931034, "eval_type": "llm", "num_demo": 1, "num_query": 29, "skills": [ "Object Recognition and Classification", "Scene and Event Understanding", "Domain-Specific Knowledge and Skills" ], "input_format": "Diagrams and Data Visualizations", "app": "Perception", "output_format": "open_ended_output", "num_input": "1-image" }, { "name": "ocrqa", "score": 0.8827586206896549, "eval_type": "llm", "num_demo": 1, "num_query": 29, "skills": [ "Text Recognition (OCR)", "Language Understanding and Generation" ], "input_format": "Text-Based Images and Documents", "app": "Perception", "output_format": "open_ended_output", "num_input": "1-image" }, { "name": "electrocardiogram", "score": 0.3285714285714286, "eval_type": "llm", "num_demo": 1, "num_query": 14, "skills": [ "Object Recognition and Classification", "Domain-Specific Knowledge and Skills" ], "input_format": "Text-Based Images and Documents", "app": "Science", "output_format": "open_ended_output", "num_input": "1-image" }, { "name": "GUI_Chat_Easy", "score": 0.7307692307692307, "eval_type": "llm", "num_demo": 1, "num_query": 26, "skills": [ "Text Recognition (OCR)", "Language Understanding and Generation" ], "input_format": "User Interface Screenshots", "app": "Information_Extraction", "output_format": "open_ended_output", "num_input": "1-image" }, { "name": "doc_vqa", "score": 0.8750000000000001, "eval_type": "llm", "num_demo": 1, "num_query": 16, "skills": [ "Text Recognition (OCR)", "Language Understanding and Generation" ], "input_format": "Text-Based Images and Documents", "app": "Perception", "output_format": "open_ended_output", "num_input": "4-5 images" }, { "name": "docci_image_description_long", "score": 0.7928571428571428, "eval_type": "llm", "num_demo": 1, "num_query": 14, "skills": [ "Object Recognition and Classification", "Scene and Event Understanding", "Language Understanding and Generation" ], "input_format": "Photographs", "app": "Perception", "output_format": "open_ended_output", "num_input": "1-image" }, { "name": "humor_explanation", "score": 0.5866666666666666, "eval_type": "llm", "num_demo": 1, "num_query": 15, "skills": [ "Commonsense and Social Reasoning", "Language Understanding and Generation" ], "input_format": "Artistic and Creative Content", "app": "Knowledge", "output_format": "open_ended_output", "num_input": "1-image" }, { "name": "video_content_follow_up", "score": 0.8500000000000002, "eval_type": "llm", "num_demo": 1, "num_query": 14, "skills": [ "Scene and Event Understanding", "Language Understanding and Generation", "Planning and Decision Making" ], "input_format": "Videos", "app": "Knowledge", "output_format": "open_ended_output", "num_input": "video" }, { "name": "GUI_Chat_Hard", "score": 0.45806451612903226, "eval_type": "llm", "num_demo": 1, "num_query": 31, "skills": [ "Object Recognition and Classification", "Text Recognition (OCR)", "Language Understanding and Generation" ], "input_format": "User Interface Screenshots", "app": "Information_Extraction", "output_format": "open_ended_output", "num_input": "1-image" }, { "name": "iq_test", "score": 0.6482758620689654, "eval_type": "llm", "num_demo": 1, "num_query": 29, "skills": [ "Object Recognition and Classification", "Mathematical and Logical Reasoning", "Spatial and Temporal Reasoning" ], "input_format": "Diagrams and Data Visualizations", "app": "Planning", "output_format": "open_ended_output", "num_input": "1-image" }, { "name": "unusual_images", "score": 0.8931034482758619, "eval_type": "llm", "num_demo": 1, "num_query": 29, "skills": [ "Scene and Event Understanding", "Spatial and Temporal Reasoning", "Language Understanding and Generation" ], "input_format": "Photographs", "app": "Knowledge", "output_format": "open_ended_output", "num_input": "1-image" }, { "name": "tweets_captioning", "score": 0.6499999999999998, "eval_type": "llm", "num_demo": 1, "num_query": 14, "skills": [ "Object Recognition and Classification", "Text Recognition (OCR)", "Language Understanding and Generation", "Commonsense and Social Reasoning" ], "input_format": "Photographs", "app": "Perception", "output_format": "open_ended_output", "num_input": "1-image" }, { "name": "art_explanation", "score": 0.7517241379310345, "eval_type": "llm", "num_demo": 1, "num_query": 29, "skills": [ "Language Understanding and Generation", "Scene and Event Understanding", "Commonsense and Social Reasoning" ], "input_format": "Artistic and Creative Content", "app": "Knowledge", "output_format": "open_ended_output", "num_input": "1-image" }, { "name": "bar_chart_interpretation", "score": 0.6931034482758621, "eval_type": "llm", "num_demo": 1, "num_query": 29, "skills": [ "Object Recognition and Classification", "Mathematical and Logical Reasoning", "Language Understanding and Generation" ], "input_format": "Diagrams and Data Visualizations", "app": "Perception", "output_format": "open_ended_output", "num_input": "1-image" }, { "name": "funqa_unexpected_action_humor_video", "score": 0.3733333333333333, "eval_type": "llm", "num_demo": 1, "num_query": 15, "skills": [ "Object Recognition and Classification", "Scene and Event Understanding", "Commonsense and Social Reasoning" ], "input_format": "Videos", "app": "Information_Extraction", "output_format": "open_ended_output", "num_input": "video" }, { "name": "figurative_speech_explanation", "score": 0.8310344827586205, "eval_type": "llm", "num_demo": 1, "num_query": 29, "skills": [ "Scene and Event Understanding", "Language Understanding and Generation", "Commonsense and Social Reasoning" ], "input_format": "Artistic and Creative Content", "app": "Knowledge", "output_format": "open_ended_output", "num_input": "1-image" }, { "name": "defeasible_reasoning", "score": 0.8551724137931035, "eval_type": "llm", "num_demo": 1, "num_query": 29, "skills": [ "Scene and Event Understanding", "Language Understanding and Generation", "Domain-Specific Knowledge and Skills" ], "input_format": "Diagrams and Data Visualizations", "app": "Knowledge", "output_format": "open_ended_output", "num_input": "1-image" }, { "name": "image_captioning_with_additional_requirements", "score": 0.9357142857142858, "eval_type": "llm", "num_demo": 1, "num_query": 14, "skills": [ "Object Recognition and Classification", "Scene and Event Understanding", "Text Recognition (OCR)", "Language Understanding and Generation" ], "input_format": "Diagrams and Data Visualizations", "app": "Perception", "output_format": "open_ended_output", "num_input": "1-image" }, { "name": "visualization_with_code", "score": 0.65, "eval_type": "llm", "num_demo": 1, "num_query": 14, "skills": [ "Mathematical and Logical Reasoning", "Domain-Specific Knowledge and Skills" ], "input_format": "Diagrams and Data Visualizations", "app": "Coding", "output_format": "structured_output", "num_input": "1-image" }, { "name": "table_understanding_fetaqa", "score": 0.7071428571428572, "eval_type": "llm", "num_demo": 1, "num_query": 14, "skills": [ "Text Recognition (OCR)", "Language Understanding and Generation" ], "input_format": "Diagrams and Data Visualizations", "app": "Perception", "output_format": "open_ended_output", "num_input": "1-image" }, { "name": "red_teaming_jailbreak", "score": 0.9349999999999999, "eval_type": "llm", "num_demo": 0, "num_query": 20, "skills": [ "Text Recognition (OCR)", "Ethical and Safety Reasoning" ], "input_format": "Text-Based Images and Documents", "app": "Knowledge", "output_format": "open_ended_output", "num_input": "1-image" }, { "name": "red_teaming_celebrity", "score": 0.8850000000000001, "eval_type": "llm", "num_demo": 0, "num_query": 20, "skills": [ "Commonsense and Social Reasoning", "Language Understanding and Generation" ], "input_format": "Photographs", "app": "Knowledge", "output_format": "open_ended_output", "num_input": "1-image" }, { "name": "red_teaming_captcha", "score": 0.10000000000000003, "eval_type": "llm", "num_demo": 1, "num_query": 19, "skills": [ "Text Recognition (OCR)" ], "input_format": "Text-Based Images and Documents", "app": "Knowledge", "output_format": "open_ended_output", "num_input": "1-image" }, { "name": "red_teaming_visual_order_B", "score": 0.905263157894737, "eval_type": "llm", "num_demo": 1, "num_query": 19, "skills": [ "Object Recognition and Classification", "Domain-Specific Knowledge and Skills" ], "input_format": "Photographs", "app": "Knowledge", "output_format": "open_ended_output", "num_input": "2-3 images" }, { "name": "red_teaming_politics", "score": 0.8100000000000002, "eval_type": "llm", "num_demo": 0, "num_query": 20, "skills": [ "Scene and Event Understanding", "Commonsense and Social Reasoning", "Ethical and Safety Reasoning" ], "input_format": "Photographs", "app": "Knowledge", "output_format": "open_ended_output", "num_input": "1-image" }, { "name": "red_teaming_visual_order_A", "score": 0.905263157894737, "eval_type": "llm", "num_demo": 1, "num_query": 19, "skills": [ "Object Recognition and Classification", "Domain-Specific Knowledge and Skills" ], "input_format": "Photographs", "app": "Knowledge", "output_format": "open_ended_output", "num_input": "2-3 images" }, { "name": "red_teaming_visualmisleading", "score": 0.8789473684210528, "eval_type": "llm", "num_demo": 1, "num_query": 19, "skills": [ "Ethical and Safety Reasoning", "Commonsense and Social Reasoning" ], "input_format": "Artistic and Creative Content", "app": "Knowledge", "output_format": "open_ended_output", "num_input": "1-image" }, { "name": "bridge_strategies_worldclass", "score": 0.33571428571428574, "eval_type": "llm", "num_demo": 1, "num_query": 14, "skills": [ "Planning and Decision Making", "Mathematical and Logical Reasoning", "Domain-Specific Knowledge and Skills" ], "input_format": "Diagrams and Data Visualizations", "app": "Planning", "output_format": "open_ended_output", "num_input": "1-image" }, { "name": "bridge_strategies_advanced", "score": 0.3071428571428571, "eval_type": "llm", "num_demo": 1, "num_query": 14, "skills": [ "Object Recognition and Classification", "Spatial and Temporal Reasoning", "Planning and Decision Making" ], "input_format": "User Interface Screenshots", "app": "Planning", "output_format": "open_ended_output", "num_input": "1-image" }, { "name": "bridge_strategies_expert", "score": 0.4142857142857143, "eval_type": "llm", "num_demo": 1, "num_query": 14, "skills": [ "Object Recognition and Classification", "Spatial and Temporal Reasoning", "Planning and Decision Making" ], "input_format": "User Interface Screenshots", "app": "Planning", "output_format": "open_ended_output", "num_input": "1-image" }, { "name": "multi_lingual_manual_explanation_scooter_Spanish", "score": 0.5428571428571429, "eval_type": "llm", "num_demo": 1, "num_query": 14, "skills": [ "Object Recognition and Classification", "Text Recognition (OCR)", "Language Understanding and Generation", "Scene and Event Understanding", "Commonsense and Social Reasoning" ], "input_format": "Text-Based Images and Documents", "app": "Information_Extraction", "output_format": "open_ended_output", "num_input": "6-8 images" }, { "name": "multi_lingual_manual_explanation_scooter_Russian", "score": 0.7000000000000001, "eval_type": "llm", "num_demo": 1, "num_query": 14, "skills": [ "Object Recognition and Classification", "Language Understanding and Generation", "Ethical and Safety Reasoning" ], "input_format": "Diagrams and Data Visualizations", "app": "Information_Extraction", "output_format": "open_ended_output", "num_input": "6-8 images" }, { "name": "multi_lingual_manual_explanation_scooter_Arabic", "score": 0.6071428571428571, "eval_type": "llm", "num_demo": 1, "num_query": 14, "skills": [ "Object Recognition and Classification", "Language Understanding and Generation", "Ethical and Safety Reasoning" ], "input_format": "Text-Based Images and Documents", "app": "Information_Extraction", "output_format": "open_ended_output", "num_input": "6-8 images" }, { "name": "multi_lingual_manual_explanation_scooter_Chinese", "score": 0.6928571428571428, "eval_type": "llm", "num_demo": 1, "num_query": 14, "skills": [ "Language Understanding and Generation", "Commonsense and Social Reasoning", "Ethical and Safety Reasoning", "Domain-Specific Knowledge and Skills" ], "input_format": "Text-Based Images and Documents", "app": "Information_Extraction", "output_format": "open_ended_output", "num_input": "6-8 images" }, { "name": "multi_lingual_manual_explanation_scooter_French", "score": 0.6357142857142858, "eval_type": "llm", "num_demo": 1, "num_query": 14, "skills": [ "Object Recognition and Classification", "Text Recognition (OCR)", "Language Understanding and Generation", "Ethical and Safety Reasoning" ], "input_format": "Text-Based Images and Documents", "app": "Information_Extraction", "output_format": "open_ended_output", "num_input": "6-8 images" }, { "name": "multi_lingual_Ruozhiba_expalnation_Spanish", "score": 0.4928571428571429, "eval_type": "llm", "num_demo": 1, "num_query": 14, "skills": [ "Language Understanding and Generation", "Commonsense and Social Reasoning" ], "input_format": "User Interface Screenshots", "app": "Knowledge", "output_format": "open_ended_output", "num_input": "1-image" }, { "name": "multi_lingual_Ruozhiba_expalnation_English", "score": 0.4714285714285714, "eval_type": "llm", "num_demo": 1, "num_query": 14, "skills": [ "Language Understanding and Generation", "Commonsense and Social Reasoning" ], "input_format": "User Interface Screenshots", "app": "Knowledge", "output_format": "open_ended_output", "num_input": "1-image" }, { "name": "multi_lingual_Ruozhiba_expalnation_Russian", "score": 0.4357142857142858, "eval_type": "llm", "num_demo": 1, "num_query": 14, "skills": [ "Commonsense and Social Reasoning", "Language Understanding and Generation" ], "input_format": "User Interface Screenshots", "app": "Knowledge", "output_format": "open_ended_output", "num_input": "1-image" }, { "name": "multi_lingual_Ruozhiba_expalnation_Arabic", "score": 0.5785714285714285, "eval_type": "llm", "num_demo": 1, "num_query": 14, "skills": [ "Language Understanding and Generation", "Commonsense and Social Reasoning" ], "input_format": "User Interface Screenshots", "app": "Knowledge", "output_format": "open_ended_output", "num_input": "1-image" }, { "name": "multi_lingual_Ruozhiba_expalnation_Japanese", "score": 0.5714285714285714, "eval_type": "llm", "num_demo": 1, "num_query": 14, "skills": [ "Language Understanding and Generation", "Commonsense and Social Reasoning" ], "input_format": "User Interface Screenshots", "app": "Knowledge", "output_format": "open_ended_output", "num_input": "1-image" }, { "name": "multi_lingual_Ruozhiba_expalnation_French", "score": 0.47857142857142854, "eval_type": "llm", "num_demo": 1, "num_query": 14, "skills": [ "Language Understanding and Generation", "Commonsense and Social Reasoning" ], "input_format": "User Interface Screenshots", "app": "Knowledge", "output_format": "open_ended_output", "num_input": "1-image" }, { "name": "sceneqa_scene_transition_video", "score": 0.32857142857142857, "eval_type": "llm", "num_demo": 1, "num_query": 14, "skills": [ "Scene and Event Understanding", "Spatial and Temporal Reasoning", "Language Understanding and Generation" ], "input_format": "Videos", "app": "Perception", "output_format": "open_ended_output", "num_input": "video" } ]