lmgame_bench

Running

lmgame_bench / data_visualization.py

Yuxuan-Zhang-Dexter

update gradio app

6ebb0fb 7 months ago

20 kB

	import matplotlib
	matplotlib.use('Agg') # Use Agg backend for thread safety
	import matplotlib.pyplot as plt
	import numpy as np
	import pandas as pd
	import seaborn as sns
	import json
	import os
	from leaderboard_utils import (
	get_organization,
	get_mario_leaderboard,
	get_sokoban_leaderboard,
	get_2048_leaderboard,
	get_candy_leaderboard,
	get_tetris_leaderboard,
	get_tetris_planning_leaderboard,
	get_combined_leaderboard,
	GAME_ORDER
	)

	# Load the model-name -> color mapping consulted when drawing per-model bars.
	# The relative path is resolved against the current working directory.
	# JSON text is UTF-8, so decode it explicitly instead of relying on the
	# platform's locale encoding.
	with open('assets/model_color.json', 'r', encoding='utf-8') as f:
	    MODEL_COLORS = json.load(f)

	# Game name -> name of the column that carries that game's score.
	# Built from ordered pairs so the iteration order of the mapping is explicit.
	GAME_SCORE_COLUMNS = dict([
	    ("Super Mario Bros", "Score"),
	    ("Sokoban", "Levels Cracked"),
	    ("2048", "Score"),
	    ("Candy Crash", "Average Score"),
	    ("Tetris (complete)", "Score"),
	    ("Tetris (planning only)", "Score"),
	])

	def normalize_values(values, mean, std):
	    """
	    Map raw values onto a 0-100 scale through their z-scores.

	    A value equal to ``mean`` lands on 50, every standard deviation away
	    from the mean shifts it by 30 points, and the result is clamped to
	    the closed range [0, 100].

	    Args:
	        values (list): Values to normalize
	        mean (float): Mean used to centre the values
	        std (float): Standard deviation used to scale the values

	    Returns:
	        list: One normalized value per input value
	    """
	    # With no spread a z-score is undefined: positive values get the
	    # midpoint, everything else gets 0.
	    if std == 0:
	        flat = []
	        for value in values:
	            flat.append(50 if value > 0 else 0)
	        return flat

	    def _rescale(value):
	        z = (value - mean) / std
	        # 30 points per standard deviation around a midpoint of 50.
	        return max(0, min(100, (z * 30) + 50))

	    return [_rescale(value) for value in values]

	def simplify_model_name(model_name):
	    """
	    Shorten a model name for use as a legend label.

	    Names made of three or more '-'-separated pieces keep only the first
	    three pieces; any other name is cut to its first 11 characters.
	    """
	    pieces = model_name.split('-')
	    if len(pieces) < 3:
	        return model_name[:11]
	    return '-'.join(pieces[:3])

	def create_horizontal_bar_chart(df, game_name):
	    """
	    Create horizontal bar chart for detailed game view

	    Rows are sorted ascending by the game's score column and drawn as one
	    horizontal bar per row. For "Sokoban" the plotted value is the largest
	    integer found in the semicolon-separated 'Levels Cracked' string
	    (0 when nothing can be parsed).

	    Args:
	        df (pd.DataFrame): DataFrame containing game data. It must provide
	            'Player', 'Organization' and the game's score column. The
	            frame passed in is not modified.
	        game_name (str): Name of the game to display

	    Returns:
	        matplotlib.figure.Figure: The generated bar chart figure, or None
	        when game_name is not a key of GAME_SCORE_COLUMNS
	    """
	    def get_max_level(levels_str):
	        # Split by semicolon, strip whitespace, drop empty pieces and
	        # convert to integers. Non-string cells (AttributeError/TypeError)
	        # and non-numeric pieces (ValueError) count as "no level".
	        try:
	            levels = [int(x.strip()) for x in levels_str.split(";") if x.strip()]
	        except (AttributeError, TypeError, ValueError):
	            return 0
	        return max(levels) if levels else 0

	    def format_player_name(player, org):
	        # Shorten over-long names but always keep the organization tag.
	        max_length = 40  # Maximum length for player name
	        if len(player) > max_length:
	            parts = player.split('-')
	            if len(parts) > 3:
	                # Keep the first two pieces and the last piece of the name
	                formatted = f"{parts[0]}-{parts[1]}-...{parts[-1]}"
	            else:
	                formatted = player[:max_length-3] + "..."
	        else:
	            formatted = player
	        return f"{formatted} [{org}]"

	    # Close any existing figures to prevent memory leaks
	    plt.close('all')

	    # Reject unknown games before a figure is created so that no orphan
	    # figure is left registered with pyplot.
	    if game_name not in GAME_SCORE_COLUMNS:
	        return None

	    # Pick the score column and sort by it
	    if game_name == "Sokoban":
	        # Derive the numeric score on a copy; assigning the column on `df`
	        # itself would leak a 'Max Level' column into the caller's frame.
	        score_col = 'Max Level'
	        scored = df.assign(**{score_col: df['Levels Cracked'].apply(get_max_level)})
	    else:
	        score_col = GAME_SCORE_COLUMNS[game_name]
	        scored = df
	    df_sorted = scored.sort_values(by=score_col, ascending=True)

	    # Set style
	    plt.style.use('default')
	    # Increase figure width to accommodate long model names
	    fig, ax = plt.subplots(figsize=(20, 11))

	    # Create color gradient
	    colors = plt.cm.viridis(np.linspace(0.2, 0.8, len(df_sorted)))

	    # Create horizontal bars
	    positions = range(len(df_sorted))
	    bars = ax.barh(positions, df_sorted[score_col], color=colors)

	    # Add more space for labels on the left
	    fig.subplots_adjust(left=0.3)

	    # Customize the chart
	    ax.set_yticks(positions)
	    player_labels = [format_player_name(row['Player'], row['Organization'])
	                     for _, row in df_sorted.iterrows()]
	    ax.set_yticklabels(player_labels, fontsize=9)

	    # Add value labels on the bars (one decimal for Candy Crash averages)
	    value_format = '.1f' if game_name == "Candy Crash" else '.0f'
	    for bar in bars:
	        width = bar.get_width()
	        ax.text(width, bar.get_y() + bar.get_height()/2,
	                format(width, value_format),
	                ha='left', va='center',
	                fontsize=10,
	                fontweight='bold',
	                color='white',
	                bbox=dict(facecolor=(0, 0, 0, 0.3),
	                          edgecolor='none',
	                          alpha=0.5,
	                          pad=2))

	    # Set title and labels
	    ax.set_title(f"{game_name} Performance",
	                 pad=20,
	                 fontsize=14,
	                 fontweight='bold',
	                 color='#2c3e50')

	    x_label = "Maximum Level Reached" if game_name == "Sokoban" else score_col
	    ax.set_xlabel(x_label,
	                  fontsize=12,
	                  fontweight='bold',
	                  color='#2c3e50',
	                  labelpad=10)

	    # Add grid lines
	    ax.grid(True, axis='x', linestyle='--', alpha=0.3)

	    # Remove top and right spines
	    ax.spines['top'].set_visible(False)
	    ax.spines['right'].set_visible(False)

	    # Adjust layout
	    fig.tight_layout()

	    return fig

	def create_radar_charts(df):
	    """
	    Create two radar charts with improved normalization using z-scores

	    The left chart shows the rows whose 'Player' is in the hard-coded list
	    of reasoning models, the right chart shows every other row. Each
	    '<game> Score' column becomes one axis. Values are normalized with the
	    mean and standard deviation of that column over the whole of `df`, so
	    both charts share the same scale.

	    Args:
	        df (pd.DataFrame): Frame with a 'Player' column and zero or more
	            columns whose name ends in ' Score'. Cells that cannot be
	            converted to float (for example '_') are treated as 0.

	    Returns:
	        matplotlib.figure.Figure: Figure holding the two polar subplots.
	        When `df` has no ' Score' columns the subplots are left empty.
	    """
	    # Close any existing figures to prevent memory leaks
	    plt.close('all')

	    # Define reasoning models
	    reasoning_models = [
	        'claude-3-7-sonnet-20250219(thinking)',
	        'o1-2024-12-17',
	        'gemini-2.0-flash-thinking-exp-1219',
	        'o3-mini-2025-01-31(medium)',
	        'gemini-2.5-pro-exp-03-25',
	        'o1-mini-2024-09-12',
	        'deepseek-r1'
	    ]

	    # Split dataframe into reasoning and non-reasoning models
	    is_reasoning = df['Player'].isin(reasoning_models)
	    df_reasoning = df[is_reasoning]
	    df_others = df[~is_reasoning]

	    # Get game columns
	    game_columns = [col for col in df.columns if col.endswith(' Score')]
	    categories = [col.replace(' Score', '') for col in game_columns]

	    # Create figure with two subplots - adjusted size for new layout
	    fig, (ax1, ax2) = plt.subplots(1, 2, figsize=(10, 6), subplot_kw=dict(projection='polar'))
	    fig.patch.set_facecolor('white')  # Set figure background to white

	    # Without any game column there are no axes to draw; indexing the
	    # empty angle array below would raise IndexError, so stop here.
	    if not game_columns:
	        return fig

	    def to_float(val):
	        # Missing markers such as '_' and any other non-numeric cell count as 0
	        try:
	            return float(val)
	        except (TypeError, ValueError):
	            return 0

	    def get_game_stats(game_col):
	        """
	        Get mean and std for a game column, handling missing values
	        """
	        values = [to_float(val) for val in df[game_col]]
	        if not values:
	            # Empty frame: avoid NaN statistics (and numpy warnings)
	            return 0.0, 0.0
	        return np.mean(values), np.std(values)

	    # Calculate game statistics once; they are shared by both subplots
	    game_stats = {col: get_game_stats(col) for col in game_columns}

	    # Angles of the axes, with the first angle repeated to close the polygon
	    num_vars = len(categories)
	    angles = np.linspace(0, 2*np.pi, num_vars, endpoint=False)
	    angles = np.concatenate((angles, [angles[0]]))

	    # Multi-line axis labels for the longer game names
	    label_overrides = {
	        "Tetris (planning only)": "Tetris\n(planning)",
	        "Tetris (complete)": "Tetris\n(complete)",
	        "Super Mario Bros": "Super\nMario",
	        "Candy Crash": "Candy\nCrash",
	    }
	    formatted_categories = [label_overrides.get(game, game) for game in categories]

	    # Darker, more vibrant colors for the radar plots (cycled per row)
	    colors = ['#1f77b4', '#d62728', '#2ca02c', '#ff7f0e', '#9467bd', '#8c564b']

	    def setup_radar_plot(ax, data, title):
	        ax.set_facecolor('white')  # Set subplot background to white

	        # Plot grid lines with darker color
	        grid_values = [10, 30, 50, 70, 90]
	        ax.set_rgrids(grid_values,
	                      labels=grid_values,
	                      angle=45,
	                      fontsize=6,
	                      alpha=0.7,
	                      color='#404040')

	        # Make grid lines darker but still subtle
	        ax.grid(True, color='#404040', alpha=0.3)

	        # One closed polygon per row
	        for idx, (_, row) in enumerate(data.iterrows()):
	            normalized_values = []
	            for col in game_columns:
	                mean, std = game_stats[col]
	                normalized_values.append(
	                    normalize_values([to_float(row[col])], mean, std)[0])

	            # Complete the circular plot
	            normalized_values = np.concatenate((normalized_values, [normalized_values[0]]))

	            color = colors[idx % len(colors)]
	            ax.plot(angles, normalized_values, 'o-', linewidth=2.0,
	                    label=simplify_model_name(row['Player']),
	                    color=color,
	                    markersize=4)
	            ax.fill(angles, normalized_values,
	                    alpha=0.3,
	                    color=color)

	        ax.set_xticks(angles[:-1])
	        ax.set_xticklabels(formatted_categories,
	                           fontsize=8,
	                           color='#202020',
	                           fontweight='bold')
	        ax.tick_params(pad=10, colors='#202020')

	        ax.set_title(title,
	                     pad=20,
	                     fontsize=11,
	                     color='#202020',
	                     fontweight='bold')

	        # A legend without any labelled line only triggers a matplotlib
	        # warning, so draw it only when something was plotted.
	        if not data.empty:
	            ax.legend(loc='upper right',
	                      bbox_to_anchor=(1.3, 1.1),
	                      fontsize=7,
	                      framealpha=0.9,
	                      edgecolor='#404040',
	                      ncol=1)

	        ax.set_ylim(0, 105)
	        ax.spines['polar'].set_color('#404040')
	        ax.spines['polar'].set_alpha(0.5)

	    # Setup both plots
	    setup_radar_plot(ax1, df_reasoning, "Reasoning Models")
	    setup_radar_plot(ax2, df_others, "Non-Reasoning Models")

	    fig.subplots_adjust(right=0.85, wspace=0.3)

	    return fig

	def get_combined_leaderboard_with_radar(rank_data, selected_games):
	    """
	    Build the combined leaderboard and the radar charts drawn from it.

	    Both arguments are forwarded unchanged to get_combined_leaderboard.

	    Returns:
	        tuple: (leaderboard returned by get_combined_leaderboard,
	        figure returned by create_radar_charts for that leaderboard)
	    """
	    leaderboard = get_combined_leaderboard(rank_data, selected_games)
	    return leaderboard, create_radar_charts(leaderboard)

	def create_organization_radar_chart(rank_data):
	    """
	    Create radar chart comparing organizations

	    Every organization found in the combined leaderboard becomes one row
	    whose '<game> Score' cells hold the mean score of that organization's
	    models ('_' cells count as 0). The organization name is stored in the
	    'Player' column because create_radar_charts reads the row label and
	    the ' Score' columns from there.

	    Args:
	        rank_data: Rank data forwarded to get_combined_leaderboard

	    Returns:
	        matplotlib.figure.Figure: Figure produced by create_radar_charts
	    """
	    # Get combined leaderboard with all games
	    df = get_combined_leaderboard(rank_data, {game: True for game in GAME_ORDER})

	    # Only aggregate games that actually have a score column
	    score_columns = [f"{game} Score" for game in GAME_ORDER
	                     if f"{game} Score" in df.columns]

	    # Group by organization and calculate average scores
	    org_rows = []
	    for org in df["Organization"].unique():
	        org_df = df[df["Organization"] == org]
	        row = {"Player": org}
	        for col in score_columns:
	            game_scores = org_df[col].apply(lambda x: float(x) if x != "_" else 0)
	            row[col] = game_scores.mean()
	        org_rows.append(row)

	    # One row per organization, in the layout create_radar_charts expects.
	    # NOTE(review): organization names are not in its reasoning-model list,
	    # so they will presumably all land in the second subplot - confirm that
	    # this is the intended presentation.
	    org_df_for_chart = pd.DataFrame(org_rows, columns=["Player", *score_columns])
	    return create_radar_charts(org_df_for_chart)

	def create_top_players_radar_chart(rank_data, n=5):
	    """
	    Create radar charts restricted to the leading players.

	    The players are the ones named in the first ``n`` rows of the combined
	    leaderboard built with every game in GAME_ORDER enabled.
	    """
	    every_game = dict.fromkeys(GAME_ORDER, True)
	    leaderboard = get_combined_leaderboard(rank_data, every_game)

	    # Names appearing in the first n rows, then every row carrying one of them
	    leading_names = leaderboard["Player"].head(n).tolist()
	    is_leading = leaderboard["Player"].isin(leading_names)

	    return create_radar_charts(leaderboard[is_leading])

	def create_player_radar_chart(rank_data, player_name):
	    """
	    Create radar charts for a single player.

	    Looks the player up in the combined leaderboard built with every game
	    in GAME_ORDER enabled. Returns None when no row has that player name,
	    otherwise the figure produced by create_radar_charts.
	    """
	    every_game = dict.fromkeys(GAME_ORDER, True)
	    leaderboard = get_combined_leaderboard(rank_data, every_game)

	    # Rows belonging to the requested player
	    matches = leaderboard[leaderboard["Player"] == player_name]

	    return None if matches.empty else create_radar_charts(matches)

	def create_group_bar_chart(df):
	    """
	    Create a grouped bar chart comparing AI model performance across different games

	    For every game in GAME_ORDER that has a '<game> Score' column, the
	    models with a positive score are ranked from highest to lowest and
	    drawn as one group of bars centred on the game's tick. Bar heights are
	    the scores normalized with normalize_values, using the mean and
	    standard deviation of that game's positive scores.

	    Args:
	        df (pd.DataFrame): DataFrame containing the combined leaderboard
	            data: a 'Player' column plus '<game> Score' columns. Only the
	            first row of each player is used; '_' and other non-numeric or
	            non-positive cells are skipped.

	    Returns:
	        matplotlib.figure.Figure: The generated group bar chart figure
	        (an empty figure when no game column is present)
	    """
	    # Close any existing figures to prevent memory leaks
	    plt.close('all')

	    # Create figure and axis with better styling
	    sns.set_style("whitegrid")
	    fig = plt.figure(figsize=(20, 11))
	    ax = fig.add_subplot(111)

	    # Manual spacing: extra room on the right for the legend
	    fig.subplots_adjust(top=0.90,
	                        bottom=0.15,
	                        right=0.85,
	                        left=0.05)

	    # Get active games (those that have score columns in the DataFrame)
	    active_games = [game for game in GAME_ORDER if f"{game} Score" in df.columns]
	    n_games = len(active_games)
	    if n_games == 0:
	        return fig  # Return empty figure if no games are selected

	    def parse_positive(raw):
	        # Return the score as a float, or None when the cell is the '_'
	        # placeholder, is not numeric, or is not greater than zero.
	        if isinstance(raw, str) and raw == '_':
	            return None
	        try:
	            value = float(raw)
	        except (TypeError, ValueError):
	            return None
	        return value if value > 0 else None

	    # First row of each model, in order of first appearance. Looking the
	    # rows up once replaces a DataFrame filter per model and per game.
	    first_rows = df[df['Player'].notna()].drop_duplicates(subset='Player')
	    models = first_rows['Player'].tolist()

	    # Fallback colors are tied to the model, not to its rank within a
	    # game, so a model missing from MODEL_COLORS keeps one color everywhere
	    # and the legend swatch matches all of its bars.
	    fallback_colors = {model: f"C{k % 10}" for k, model in enumerate(models)}

	    # Models in order of their first appearance in the active games
	    model_order = []
	    seen_models = set()

	    for game_idx, game in enumerate(active_games):
	        score_col = f"{game} Score"

	        # Collect (model, score) pairs with a usable score for this game
	        game_scores = []
	        for model, raw in zip(models, first_rows[score_col]):
	            value = parse_positive(raw)
	            if value is None:
	                continue
	            game_scores.append((model, value))
	            if model not in seen_models:
	                seen_models.add(model)
	                model_order.append(model)

	        if not game_scores:  # Skip if no valid scores for this game
	            continue

	        # Sort scores from highest to lowest
	        game_scores.sort(key=lambda pair: pair[1], reverse=True)
	        sorted_models = [pair[0] for pair in game_scores]
	        scores = [pair[1] for pair in game_scores]

	        # Normalize against this game's own mean and std
	        normalized_scores = normalize_values(scores, np.mean(scores), np.std(scores))

	        # The whole group occupies 0.8 of the slot between two ticks
	        n_models_in_game = len(sorted_models)
	        bar_width = 0.8 / n_models_in_game

	        for i, (model, score) in enumerate(zip(sorted_models, normalized_scores)):
	            # Offset so the group is centred on the game's tick
	            x_pos = game_idx + (i - (n_models_in_game - 1) / 2) * bar_width
	            ax.bar(x_pos, score,
	                   width=bar_width,
	                   label=model,  # duplicates are collapsed when the legend is built
	                   color=MODEL_COLORS.get(model, fallback_colors[model]),
	                   alpha=0.8)

	    # Customize the plot
	    ax.set_xticks(np.arange(n_games))
	    ax.set_xticklabels(active_games, rotation=45, ha='right', fontsize=10)
	    ax.set_ylabel('Normalized Performance Score', fontsize=12)
	    ax.set_title('AI Model Performance Comparison Across Gaming Tasks',
	                 fontsize=14, pad=20)

	    # Add grid lines
	    ax.grid(True, axis='y', linestyle='--', alpha=0.3)

	    # Create legend with unique entries, ordered by first appearance
	    handles, labels = ax.get_legend_handles_labels()
	    by_label = dict(zip(labels, handles))
	    sorted_labels = [model for model in model_order if model in by_label]
	    sorted_handles = [by_label[model] for model in sorted_labels]

	    ax.legend(sorted_handles, sorted_labels,
	              bbox_to_anchor=(1.00, 1),  # Anchor the legend at the right edge of the axes
	              loc='upper left',
	              fontsize=9,
	              title='AI Models',
	              title_fontsize=10)

	    # No need for tight_layout() as we're manually controlling the spacing

	    return fig

	def get_combined_leaderboard_with_group_bar(rank_data, selected_games):
	    """
	    Build the combined leaderboard and the group bar chart drawn from it.

	    Args:
	        rank_data (dict): Dictionary containing rank data
	        selected_games (dict): Dictionary of game names and their selection status

	    Returns:
	        tuple: (leaderboard returned by get_combined_leaderboard,
	        figure returned by create_group_bar_chart for that leaderboard)
	    """
	    leaderboard = get_combined_leaderboard(rank_data, selected_games)
	    return leaderboard, create_group_bar_chart(leaderboard)

	def save_visualization(fig, filename):
	    """
	    Write a figure to ``filename`` via its ``savefig`` method.

	    The figure is saved at 300 dpi with a tight bounding box.
	    """
	    save_options = {'dpi': 300, 'bbox_inches': 'tight'}
	    fig.savefig(filename, **save_options)