Spaces:
Running
Running
| import matplotlib | |
| matplotlib.use('Agg') # Use Agg backend for thread safety | |
| import matplotlib.pyplot as plt | |
| import numpy as np | |
| import pandas as pd | |
| import seaborn as sns | |
| import json | |
| import os | |
| from leaderboard_utils import ( | |
| get_organization, | |
| get_mario_leaderboard, | |
| get_sokoban_leaderboard, | |
| get_2048_leaderboard, | |
| get_candy_leaderboard, | |
| get_tetris_leaderboard, | |
| get_tetris_planning_leaderboard, | |
| get_combined_leaderboard, | |
| GAME_ORDER | |
| ) | |
# Load the model -> color mapping consulted when coloring bars per model.
# The path is relative to the current working directory; a missing or
# malformed file raises here, at import time.
# The encoding is pinned because JSON text is UTF-8 and open() would otherwise
# fall back to the platform's locale encoding.
with open('assets/model_color.json', 'r', encoding='utf-8') as f:
    MODEL_COLORS = json.load(f)
# Maps each game name to the name of the column that holds its score.
GAME_SCORE_COLUMNS = dict([
    ("Super Mario Bros", "Score"),
    ("Sokoban", "Levels Cracked"),
    ("2048", "Score"),
    ("Candy Crash", "Average Score"),
    ("Tetris (complete)", "Score"),
    ("Tetris (planning only)", "Score"),
])
def normalize_values(values, mean, std):
    """
    Map raw values onto a 0-100 scale via their z-scores.

    A value equal to the mean lands on 50, each standard deviation shifts it
    by 30 points, and the result is clamped to the [0, 100] interval.

    Args:
        values (list): Values to normalize
        mean (float): Mean used to center the values
        std (float): Standard deviation used to scale the values
    Returns:
        list: One normalized value per input value
    """
    if std == 0:
        # No spread to measure against: positive values sit at the midpoint,
        # everything else at zero.
        return [50 if v > 0 else 0 for v in values]

    def _to_scale(value):
        z_score = (value - mean) / std
        return max(0, min(100, (z_score * 30) + 50))

    return [_to_scale(v) for v in values]
def simplify_model_name(model_name):
    """
    Shorten a model name for use as a chart label.

    Names made of three or more '-'-separated parts keep only their first
    three parts; any other name is cut to its first 11 characters.
    """
    pieces = model_name.split('-')
    if len(pieces) < 3:
        return model_name[:11]
    return '-'.join(pieces[:3])
def create_horizontal_bar_chart(df, game_name):
    """
    Create horizontal bar chart for detailed game view

    Args:
        df (pd.DataFrame): DataFrame containing game data. It must provide
            'Player' and 'Organization' columns plus the score column of the
            requested game ('Levels Cracked' for Sokoban). The frame passed in
            is not modified.
        game_name (str): Name of the game to display
    Returns:
        matplotlib.figure.Figure: The generated bar chart figure, or None when
        game_name is not one of the supported games
    """
    def get_max_level(levels_str):
        """Return the highest level in a ';'-separated string, or 0 if it cannot be parsed."""
        try:
            # Split by semicolon, strip whitespace, drop empty pieces, convert to integers
            levels = [int(x.strip()) for x in levels_str.split(";") if x.strip()]
        except (AttributeError, TypeError, ValueError):
            # Non-string cells and non-numeric pieces count as "no level cracked"
            return 0
        return max(levels) if levels else 0

    def format_player_name(player, org):
        """Build the y-axis label: shortened player name followed by '[organization]'."""
        max_length = 40  # Maximum length for player name
        if len(player) > max_length:
            parts = player.split('-')
            if len(parts) > 3:
                # Keep the first two parts and the last part of the name
                formatted = f"{parts[0]}-{parts[1]}-...{parts[-1]}"
            else:
                formatted = player[:max_length-3] + "..."
        else:
            formatted = player
        return f"{formatted} [{org}]"

    # Close any existing figures to prevent memory leaks
    plt.close('all')
    # Set style
    plt.style.use('default')

    # Column that is sorted on and plotted for each supported game
    score_columns = {
        "Super Mario Bros": "Score",
        "Sokoban": "Max Level",
        "2048": "Score",
        "Candy Crash": "Average Score",
        "Tetris (complete)": "Score",
        "Tetris (planning only)": "Score",
    }
    score_col = score_columns.get(game_name)
    if score_col is None:
        # Unknown game: return before a figure is created so none is left open
        return None

    if game_name == "Sokoban":
        # assign() returns a new frame, so the caller's DataFrame does not
        # gain a 'Max Level' column as a side effect of plotting.
        df = df.assign(**{'Max Level': df['Levels Cracked'].apply(get_max_level)})
    df_sorted = df.sort_values(by=score_col, ascending=True)

    # Wide figure to accommodate long model names
    fig, ax = plt.subplots(figsize=(20, 11))

    # Create color gradient
    colors = plt.cm.viridis(np.linspace(0.2, 0.8, len(df_sorted)))
    # Create horizontal bars
    bars = ax.barh(range(len(df_sorted)), df_sorted[score_col], color=colors)
    # Add more space for labels on the left
    plt.subplots_adjust(left=0.3)

    # Customize the chart
    ax.set_yticks(range(len(df_sorted)))
    player_labels = [
        format_player_name(player, org)
        for player, org in zip(df_sorted['Player'], df_sorted['Organization'])
    ]
    ax.set_yticklabels(player_labels, fontsize=9)

    # Add value labels at the end of the bars
    for bar in bars:
        width = bar.get_width()
        if game_name == "Candy Crash":
            score_text = f'{width:.1f}'
        else:
            score_text = f'{width:.0f}'
        ax.text(width, bar.get_y() + bar.get_height()/2,
                score_text,
                ha='left', va='center',
                fontsize=10,
                fontweight='bold',
                color='white',
                bbox=dict(facecolor=(0, 0, 0, 0.3),
                          edgecolor='none',
                          alpha=0.5,
                          pad=2))

    # Set title and labels
    ax.set_title(f"{game_name} Performance",
                 pad=20,
                 fontsize=14,
                 fontweight='bold',
                 color='#2c3e50')
    x_label = "Maximum Level Reached" if game_name == "Sokoban" else score_col
    ax.set_xlabel(x_label,
                  fontsize=12,
                  fontweight='bold',
                  color='#2c3e50',
                  labelpad=10)

    # Add grid lines
    ax.grid(True, axis='x', linestyle='--', alpha=0.3)
    # Remove top and right spines
    ax.spines['top'].set_visible(False)
    ax.spines['right'].set_visible(False)
    # Adjust layout
    plt.tight_layout()
    return fig
def create_radar_charts(df):
    """
    Create two radar charts with improved normalization using z-scores

    The left chart shows the rows whose 'Player' is in the hard-coded list of
    reasoning models, the right chart shows all remaining rows. Every column
    whose name ends in ' Score' becomes one axis. Values are normalized per
    column with normalize_values, using the mean and standard deviation of
    that column over the whole of df (both groups together).

    Args:
        df (pd.DataFrame): Leaderboard data with a 'Player' column and one
            '<game> Score' column per game. Cells equal to '_' or not
            convertible to float are treated as 0.
    Returns:
        matplotlib.figure.Figure: Figure holding the two polar subplots
    """
    # Close any existing figures to prevent memory leaks
    plt.close('all')

    # Define reasoning models
    reasoning_models = [
        'claude-3-7-sonnet-20250219(thinking)',
        'o1-2024-12-17',
        'gemini-2.0-flash-thinking-exp-1219',
        'o3-mini-2025-01-31(medium)',
        'gemini-2.5-pro-exp-03-25',
        'o1-mini-2024-09-12',
        'deepseek-r1'
    ]

    # Line-broken axis labels for the games with long names
    category_labels = {
        "Tetris (planning only)": "Tetris\n(planning)",
        "Tetris (complete)": "Tetris\n(complete)",
        "Super Mario Bros": "Super\nMario",
        "Candy Crash": "Candy\nCrash",
    }

    # Colors for the radar plots, cycled when there are more rows than colors
    colors = ['#1f77b4', '#d62728', '#2ca02c', '#ff7f0e', '#9467bd', '#8c564b']

    def to_score(val):
        """Convert a leaderboard cell to float; '_' and unparsable cells become 0."""
        if isinstance(val, str) and val == '_':
            return 0
        try:
            return float(val)
        except (TypeError, ValueError):
            return 0

    # Split dataframe into reasoning and non-reasoning models
    is_reasoning = df['Player'].isin(reasoning_models)
    df_reasoning = df[is_reasoning]
    df_others = df[~is_reasoning]

    # Get game columns
    game_columns = [col for col in df.columns if col.endswith(' Score')]
    categories = [col.replace(' Score', '') for col in game_columns]

    # Per-game mean and std over all rows, computed once and shared by both
    # subplots so the two charts use the same scale.
    game_stats = {}
    for col in game_columns:
        column_values = [to_score(val) for val in df[col]]
        game_stats[col] = (np.mean(column_values), np.std(column_values))

    # Create figure with two subplots
    fig, (ax1, ax2) = plt.subplots(1, 2, figsize=(10, 6), subplot_kw=dict(projection='polar'))
    fig.patch.set_facecolor('white')  # Set figure background to white

    def setup_radar_plot(ax, data, title):
        """Draw one radar subplot for the rows of data under the given title."""
        ax.set_facecolor('white')  # Set subplot background to white
        ax.set_title(title,
                     pad=20,
                     fontsize=11,
                     color='#202020',
                     fontweight='bold')

        num_vars = len(categories)
        if num_vars == 0:
            # No '<game> Score' columns: there are no axes to draw, and
            # closing the polygon below would index an empty array.
            return

        angles = np.linspace(0, 2*np.pi, num_vars, endpoint=False)
        # Repeat the first angle so each polygon closes on itself
        angles = np.concatenate((angles, [angles[0]]))

        # Radial grid with dark, semi-transparent labels and lines
        grid_values = [10, 30, 50, 70, 90]
        ax.set_rgrids(grid_values,
                      labels=grid_values,
                      angle=45,
                      fontsize=6,
                      alpha=0.7,
                      color='#404040')
        ax.grid(True, color='#404040', alpha=0.3)

        # One outlined, filled polygon per row
        for idx, (_, row) in enumerate(data.iterrows()):
            normalized_values = []
            for col in game_columns:
                mean, std = game_stats[col]
                normalized_values.append(
                    normalize_values([to_score(row[col])], mean, std)[0])
            # Complete the circular plot
            normalized_values = np.concatenate((normalized_values, [normalized_values[0]]))
            line_color = colors[idx % len(colors)]
            ax.plot(angles, normalized_values, 'o-', linewidth=2.0,
                    label=simplify_model_name(row['Player']),
                    color=line_color,
                    markersize=4)
            ax.fill(angles, normalized_values,
                    alpha=0.3,
                    color=line_color)

        # Format categories
        formatted_categories = [category_labels.get(game, game) for game in categories]
        ax.set_xticks(angles[:-1])
        ax.set_xticklabels(formatted_categories,
                           fontsize=8,
                           color='#202020',
                           fontweight='bold')
        ax.tick_params(pad=10, colors='#202020')

        if len(data) > 0:
            # Only request a legend when at least one labelled line exists
            ax.legend(loc='upper right',
                      bbox_to_anchor=(1.3, 1.1),
                      fontsize=7,
                      framealpha=0.9,
                      edgecolor='#404040',
                      ncol=1)
        ax.set_ylim(0, 105)
        ax.spines['polar'].set_color('#404040')
        ax.spines['polar'].set_alpha(0.5)

    # Setup both plots
    setup_radar_plot(ax1, df_reasoning, "Reasoning Models")
    setup_radar_plot(ax2, df_others, "Non-Reasoning Models")
    plt.subplots_adjust(right=0.85, wspace=0.3)
    return fig
def get_combined_leaderboard_with_radar(rank_data, selected_games):
    """
    Build the combined leaderboard and the radar charts drawn from it

    Args:
        rank_data: Rank data forwarded to get_combined_leaderboard
        selected_games: Game selection forwarded to get_combined_leaderboard
    Returns:
        tuple: (leaderboard DataFrame, figure returned by create_radar_charts)
    """
    leaderboard = get_combined_leaderboard(rank_data, selected_games)
    return leaderboard, create_radar_charts(leaderboard)
def create_organization_radar_chart(rank_data):
    """
    Create radar chart comparing organizations

    Each organization becomes one row holding, per game, the mean score of its
    rows in the combined leaderboard ('_' entries count as 0). The organization
    name is stored in the 'Player' column and the means in '<game> Score'
    columns, because that is the layout create_radar_charts reads.
    Organization names are not expected to match that function's list of
    reasoning models, so they would all be drawn in its second subplot.

    Args:
        rank_data: Rank data forwarded to get_combined_leaderboard
    Returns:
        The figure returned by create_radar_charts
    """
    # Get combined leaderboard with all games
    df = get_combined_leaderboard(rank_data, {game: True for game in GAME_ORDER})
    score_columns = [f"{game} Score" for game in GAME_ORDER]

    # Group by organization and calculate average scores
    rows = []
    for org in df["Organization"].unique():
        org_df = df[df["Organization"] == org]
        row = {"Player": org}
        for score_col in score_columns:
            game_scores = org_df[score_col].apply(lambda x: float(x) if x != "_" else 0)
            row[score_col] = game_scores.mean()
        rows.append(row)

    # Passing columns= keeps the expected layout even if there are no rows
    org_df_for_chart = pd.DataFrame(rows, columns=["Player"] + score_columns)
    return create_radar_charts(org_df_for_chart)
def create_top_players_radar_chart(rank_data, n=5):
    """
    Create radar chart for top N players

    "Top" means the first n rows of the combined leaderboard as returned by
    get_combined_leaderboard; this function does no sorting of its own.

    Args:
        rank_data: Rank data forwarded to get_combined_leaderboard
        n (int): Number of leading rows whose players are charted
    Returns:
        The figure returned by create_radar_charts
    """
    # Request every game from the combined leaderboard
    all_games = dict.fromkeys(GAME_ORDER, True)
    leaderboard = get_combined_leaderboard(rank_data, all_games)

    # Keep every row whose player name appears among the first n rows
    leading_names = leaderboard["Player"].head(n).tolist()
    is_leading = leaderboard["Player"].isin(leading_names)
    return create_radar_charts(leaderboard[is_leading])
def create_player_radar_chart(rank_data, player_name):
    """
    Create radar chart for a specific player

    Args:
        rank_data: Rank data forwarded to get_combined_leaderboard
        player_name (str): Exact 'Player' value to look up
    Returns:
        The figure returned by create_radar_charts, or None when no
        leaderboard row matches player_name
    """
    # Request every game from the combined leaderboard
    all_games = dict.fromkeys(GAME_ORDER, True)
    leaderboard = get_combined_leaderboard(rank_data, all_games)

    matches = leaderboard[leaderboard["Player"] == player_name]
    return None if matches.empty else create_radar_charts(matches)
def create_group_bar_chart(df):
    """
    Create a grouped bar chart comparing AI model performance across different games

    One group of bars is drawn per game in GAME_ORDER that has a
    '<game> Score' column in df. Within a game only models with a positive
    numeric score get a bar; those scores are normalized with normalize_values
    using the mean and standard deviation of that game's plotted scores, and
    the bars are ordered from highest to lowest.

    Args:
        df (pd.DataFrame): DataFrame containing the combined leaderboard data
            ('Player' column plus '<game> Score' columns)
    Returns:
        matplotlib.figure.Figure: The generated group bar chart figure (empty
        when df has no score column for any game)
    """
    # Close any existing figures to prevent memory leaks
    plt.close('all')
    # Create figure and axis with better styling
    sns.set_style("whitegrid")
    fig = plt.figure(figsize=(20, 11))
    ax = plt.subplot(111)
    # Leave room for the title, the rotated tick labels and the legend
    plt.subplots_adjust(top=0.90,
                        bottom=0.15,
                        right=0.85,
                        left=0.05)

    # Get active games (those that have score columns in the DataFrame)
    active_games = [game for game in GAME_ORDER if f"{game} Score" in df.columns]
    n_games = len(active_games)
    if n_games == 0:
        return fig  # Return empty figure if no games are selected

    # One row per model (the first one if a name repeats), in leaderboard order
    first_rows = df.dropna(subset=['Player']).drop_duplicates(subset='Player')
    models = list(first_rows['Player'])

    # Fallback colors are keyed by the model's position in the leaderboard so
    # a model missing from MODEL_COLORS keeps one color across all games and
    # in the legend.
    fallback_colors = {model: f"C{idx % 10}" for idx, model in enumerate(models)}

    def valid_scores(score_col):
        """Return [(model, score)] in leaderboard order for positive numeric scores."""
        found = []
        for model, raw in zip(models, first_rows[score_col]):
            try:
                score = float(raw)
            except (TypeError, ValueError):
                # '_' placeholders and other non-numeric cells have no bar
                continue
            if score > 0:
                found.append((model, score))
        return found

    legend_handles = {}  # model -> bar container of its first appearance
    model_order = []     # models by first appearance across active games

    for game_idx, game in enumerate(active_games):
        game_scores = valid_scores(f"{game} Score")
        if not game_scores:  # Skip if no valid scores for this game
            continue

        # Record legend order before sorting: leaderboard order within a game
        for model, _ in game_scores:
            if model not in legend_handles and model not in model_order:
                model_order.append(model)

        # Sort scores from highest to lowest
        game_scores.sort(key=lambda x: x[1], reverse=True)
        sorted_models = [x[0] for x in game_scores]
        scores = [x[1] for x in game_scores]

        # Normalize against this game's own mean and std
        normalized_scores = normalize_values(scores, np.mean(scores), np.std(scores))

        # The group is 0.8 wide and centered on the game's tick
        bar_width = 0.8 / len(sorted_models)
        group_left = game_idx - 0.4

        for i, (model, score) in enumerate(zip(sorted_models, normalized_scores)):
            first_appearance = model not in legend_handles
            color = MODEL_COLORS.get(model, fallback_colors[model])
            # ax.bar centers each bar on x, hence the half-width offset
            container = ax.bar(group_left + (i + 0.5) * bar_width, score,
                               width=bar_width,
                               label=model if first_appearance else "",
                               color=color,
                               alpha=0.8)
            if first_appearance:
                legend_handles[model] = container

    # Customize the plot
    ax.set_xticks(np.arange(n_games))
    ax.set_xticklabels(active_games, rotation=45, ha='right', fontsize=10)
    ax.set_ylabel('Normalized Performance Score', fontsize=12)
    ax.set_title('AI Model Performance Comparison Across Gaming Tasks',
                 fontsize=14, pad=20)
    # Add grid lines
    ax.grid(True, axis='y', linestyle='--', alpha=0.3)

    # One legend entry per model, ordered by first appearance
    ax.legend([legend_handles[model] for model in model_order], model_order,
              bbox_to_anchor=(1.00, 1),  # Anchor the legend just right of the axes
              loc='upper left',
              fontsize=9,
              title='AI Models',
              title_fontsize=10)
    # No need for tight_layout() as we're manually controlling the spacing
    return fig
def get_combined_leaderboard_with_group_bar(rank_data, selected_games):
    """
    Build the combined leaderboard and the group bar chart drawn from it

    Args:
        rank_data (dict): Dictionary containing rank data
        selected_games (dict): Dictionary of game names and their selection status
    Returns:
        tuple: (DataFrame, matplotlib.figure.Figure) with the leaderboard data
        and the figure returned by create_group_bar_chart
    """
    leaderboard = get_combined_leaderboard(rank_data, selected_games)
    return leaderboard, create_group_bar_chart(leaderboard)
def save_visualization(fig, filename):
    """
    Write a figure to disk

    Args:
        fig: Object exposing savefig(), e.g. a matplotlib Figure
        filename: Destination passed straight to fig.savefig
    """
    # 300 dpi output, cropped tightly around the drawn content
    export_options = {'bbox_inches': 'tight', 'dpi': 300}
    fig.savefig(filename, **export_options)