# app.py - Taxi-v3 Q-Learning Dashboard (repo: DebatableMiracle, commit 31bef6c)
import streamlit as st
import numpy as np
import gymnasium as gym
import matplotlib.pyplot as plt
import seaborn as sns
import pandas as pd
import plotly.express as px
import plotly.graph_objects as go
from plotly.subplots import make_subplots
import base64
from io import BytesIO
from PIL import Image
import time
# Use session state to persist data across reruns
if 'trained_qtable' not in st.session_state:
    st.session_state.trained_qtable = None
if 'agent_videos' not in st.session_state:
    st.session_state.agent_videos = {}
if 'training_completed' not in st.session_state:
    st.session_state.training_completed = False
if 'training_params' not in st.session_state:
    st.session_state.training_params = {}
if 'final_metrics' not in st.session_state:
    st.session_state.final_metrics = {}
# Set page configuration for a cleaner look
st.set_page_config(
    page_title="Taxi-v3 Q-Learning Dashboard",
    page_icon="🚕",
    layout="wide",
    initial_sidebar_state="expanded"
)
# Custom CSS to make the dashboard look cleaner
st.markdown("""
<style>
.main .block-container {
padding-top: 2rem;
padding-bottom: 2rem;
}
.stTabs [data-baseweb="tab-list"] {
gap: 10px;
}
.stTabs [data-baseweb="tab"] {
background-color: #f0f2f6;
border-radius: 4px 4px 0px 0px;
padding: 10px 20px;
font-weight: 600;
}
.stTabs [aria-selected="true"] {
background-color: #e6f0ff;
border-bottom: 2px solid #4e8df5;
}
.reportview-container .main .block-container {
max-width: 1200px;
}
div[data-testid="stSidebarNav"] li div a {
margin-left: 1rem;
padding: 1rem;
width: 300px;
border-radius: 0.5rem;
}
div[data-testid="stSidebarNav"] li div:focus-visible {
background-color: rgba(151, 166, 195, 0.15);
}
.stMetric {
background-color: #f0f2f6;
padding: 15px 20px;
border-radius: 6px;
margin-bottom: 10px;
}
.css-12w0qpk {
background-color: #f8f9fa;
}
</style>
""", unsafe_allow_html=True)
# Header
st.markdown("""
<div style="text-align: center; margin-bottom: 30px;">
<h1 style="color: #1e3a8a; margin-bottom:0;">🚕 Taxi-v3 Q-Learning Dashboard</h1>
<p style="color: #64748b; font-size: 1.2em;">Interactive Reinforcement Learning Visualization</p>
</div>
""", unsafe_allow_html=True)
# Create a two-column layout for the main dashboard
col1, col2 = st.columns([3, 2])
with col2:
st.markdown("### 🎮 Environment Preview")
# Fix: Create a proper environment preview by resetting first
preview_env = gym.make("Taxi-v3", render_mode="rgb_array")
preview_env.reset() # Reset the environment first
env_preview = preview_env.render()
st.image(env_preview, caption="Taxi-v3 Environment", use_column_width=True)
st.markdown("""
<div style="background-color: #f0f8ff; padding: 15px; border-radius: 10px; margin-top: 20px;">
<h4 style="margin-top: 0;">📝 About this Environment</h4>
<p>The Taxi-v3 task involves navigating a taxi to pick up a passenger and drop them off at a destination.</p>
<ul>
<li><b>Yellow</b>: taxi</li>
<li><b>Blue</b>: pick-up location</li>
<li><b>Purple</b>: drop-off location</li>
<li><b>Green</b>: passenger</li>
<li><b>Letters (R, G, Y, B)</b>: locations</li>
</ul>
</div>
""", unsafe_allow_html=True)
with col1:
st.markdown("### ⚙️ Training Parameters")
# Only show parameters if training hasn't completed yet
if not st.session_state.training_completed:
# Create a cleaner parameter input section
col_a, col_b = st.columns(2)
with col_a:
n_episodes = st.number_input("Training Episodes", min_value=1000, max_value=100000, value=25000, step=1000)
learning_rate = st.slider("Learning Rate (α)", 0.01, 1.0, 0.7, 0.01,
format="%.2f", help="Controls how much new information overrides old information")
gamma = st.slider("Discount Factor (γ)", 0.80, 0.99, 0.95, 0.01,
format="%.2f", help="Determines the importance of future rewards")
max_steps = st.slider("Max Steps per Episode", 50, 500, 99)
with col_b:
min_epsilon = st.slider("Min Exploration Rate (ε)", 0.01, 0.5, 0.05, 0.01,
format="%.2f", help="Minimum probability of random action")
max_epsilon = st.slider("Max Exploration Rate (ε)", 0.5, 1.0, 1.0, 0.01,
format="%.2f", help="Starting probability of random action")
decay_rate = st.slider("Epsilon Decay Rate", 0.0001, 0.01, 0.001, 0.0001,
format="%.4f", help="How quickly exploration decreases")
n_eval_episodes = st.slider("Evaluation Episodes", 10, 200, 100,
help="Number of episodes to evaluate performance")
# Additional parameters in a collapsed section
with st.expander("Advanced Settings"):
log_freq = st.slider("Q-table Update Frequency (every N episodes)", 1, 1000, 500)
eval_every = st.slider("Evaluation Frequency (% of training)", 5, 50, 10,
help="How often to evaluate agent performance")
video_length = st.slider("Evaluation Video Length (steps)", 10, 200, 50,
help="Maximum steps to show in visualization videos")
else:
# If training is completed, show the parameters that were used
st.info("Training completed with the following parameters:")
params = st.session_state.training_params
col_a, col_b = st.columns(2)
with col_a:
st.write(f"**Training Episodes**: {params['n_episodes']}")
st.write(f"**Learning Rate (α)**: {params['learning_rate']}")
st.write(f"**Discount Factor (γ)**: {params['gamma']}")
st.write(f"**Max Steps per Episode**: {params['max_steps']}")
with col_b:
st.write(f"**Min Exploration Rate (ε)**: {params['min_epsilon']}")
st.write(f"**Max Exploration Rate (ε)**: {params['max_epsilon']}")
st.write(f"**Epsilon Decay Rate**: {params['decay_rate']}")
st.write(f"**Evaluation Episodes**: {params['n_eval_episodes']}")
# Option to reset and train again
if st.button("Reset and Train Again", type="secondary"):
st.session_state.training_completed = False
st.session_state.trained_qtable = None
st.session_state.agent_videos = {}
st.session_state.training_params = {}
st.session_state.final_metrics = {}
st.experimental_rerun()
# Initialize Q-table
def initialize_q_table(state_space, action_space):
    return np.zeros((state_space, action_space))
# Policies
def greedy_policy(Qtable, state):
    # Exploit: pick the action with the highest Q-value for this state
    return np.argmax(Qtable[state, :])
def epsilon_greedy_policy(Qtable, state, epsilon, env):
    # Explore with probability epsilon, otherwise act greedily
    if np.random.uniform(0, 1) > epsilon:
        return greedy_policy(Qtable, state)
    else:
        return env.action_space.sample()
# Function to create animation of agent behavior
def create_agent_video(env, Q, max_steps=100, seed=None):
    frames = []
    state, info = env.reset(seed=seed)
    # Add the initial frame
    frames.append(env.render())
    for _ in range(max_steps):
        # Choose action based on greedy policy
        action = greedy_policy(Q, state)
        # Step in environment
        state, reward, terminated, truncated, _ = env.step(action)
        # Render the frame after taking action
        frames.append(env.render())
        # Break if episode is done
        if terminated or truncated:
            break
    return frames
# Evaluation function
def evaluate_agent(env, max_steps, n_eval_episodes, Q, seed=None):
    rewards = []
    steps = []
    success_count = 0
    for episode in range(n_eval_episodes):
        state, info = env.reset(seed=seed[episode] if seed else None)
        total_rewards_ep = 0
        num_steps = 0
        for step in range(max_steps):
            action = greedy_policy(Q, state)
            state, reward, terminated, truncated, _ = env.step(action)
            total_rewards_ep += reward
            num_steps += 1
            if terminated or truncated:
                if reward > 0:  # Successfully completed the task
                    success_count += 1
                break
        rewards.append(total_rewards_ep)
        steps.append(num_steps)
    success_rate = success_count / n_eval_episodes * 100
    return np.mean(rewards), np.std(rewards), np.mean(steps), success_rate
# Function to convert frames to HTML video
def frames_to_html_video(frames, fps=5):
    if not frames:
        return "<p>No frames available</p>"
    try:
        # Create a PIL image from each frame
        pil_images = [Image.fromarray(frame) for frame in frames]
        # Save as animated GIF to a BytesIO object
        buffer = BytesIO()
        pil_images[0].save(
            buffer,
            format='GIF',
            save_all=True,
            append_images=pil_images[1:],
            duration=1000 / fps,
            loop=0
        )
        buffer.seek(0)
        # Encode as base64
        encoded = base64.b64encode(buffer.read()).decode("utf-8")
        # Embed in HTML
        html = f'<img src="data:image/gif;base64,{encoded}" alt="agent behavior" style="width:100%">'
        return html
    except Exception as e:
        return f"<p>Error generating video: {str(e)}</p>"
# Training function
def train_agent(env, eval_env, params):
    # Unpack parameters
    n_episodes = params["n_episodes"]
    learning_rate = params["learning_rate"]
    gamma = params["gamma"]
    max_steps = params["max_steps"]
    min_epsilon = params["min_epsilon"]
    max_epsilon = params["max_epsilon"]
    decay_rate = params["decay_rate"]
    log_freq = params["log_freq"]
    eval_every = params["eval_every"]  # note: checkpoints below are fixed at 10% intervals, so this value is not used yet
    n_eval_episodes = params["n_eval_episodes"]
    video_length = params["video_length"]
    # Store parameters in session state
    st.session_state.training_params = params
    # Initialize Q-table
    Qtable = initialize_q_table(env.observation_space.n, env.action_space.n)
    # Training metrics
    reward_log = []
    steps_log = []
    qtable_snapshots = {}
    videos = {}
    epsilons = []
    # Calculate checkpoints (ensure at least one checkpoint at the beginning)
    num_checkpoints = 10  # Number of checkpoints to create
    checkpoint_episodes = [int(n_episodes * i / num_checkpoints) for i in range(1, num_checkpoints + 1)]
    checkpoint_episodes[0] = max(1, checkpoint_episodes[0])  # Ensure first checkpoint is at least at episode 1
    # Progress tracking
    progress_bar = st.progress(0)
    status_text = st.empty()
    # Dashboard components
    tab1, tab2, tab3 = st.tabs(["📊 Training Progress", "🎬 Agent Evolution", "📋 Q-Table Visualization"])
    with tab1:
        col_metrics1, col_metrics2, col_metrics3, col_metrics4 = st.columns(4)
        with col_metrics1:
            current_episode_metric = st.empty()
        with col_metrics2:
            avg_reward_metric = st.empty()
        with col_metrics3:
            avg_steps_metric = st.empty()
        with col_metrics4:
            success_rate_metric = st.empty()
        metrics_chart = st.empty()
        epsilon_chart = st.empty()
    with tab2:
        video_placeholder = st.empty()
    with tab3:
        qtable_visualization = st.empty()
    # Training loop
    start_time = time.time()
    for episode in range(n_episodes):
        # Calculate epsilon for this episode (exponential decay towards min_epsilon)
        epsilon = min_epsilon + (max_epsilon - min_epsilon) * np.exp(-decay_rate * episode)
        epsilons.append(epsilon)
        state, info = env.reset()
        total_reward = 0
        # Episode steps
        for step in range(max_steps):
            action = epsilon_greedy_policy(Qtable, state, epsilon, env)
            new_state, reward, terminated, truncated, _ = env.step(action)
            # Q-learning update: Q(s, a) += α * (r + γ * max_a' Q(s', a') - Q(s, a))
            Qtable[state][action] += learning_rate * (
                reward + gamma * np.max(Qtable[new_state, :]) - Qtable[state][action]
            )
            total_reward += reward
            if terminated or truncated:
                break
            state = new_state
        # Evaluation at checkpoints
        if episode in checkpoint_episodes or episode == n_episodes - 1:
            mean_reward, std_reward, mean_steps, success_rate = evaluate_agent(
                env, max_steps, n_eval_episodes, Qtable
            )
            reward_log.append((episode, mean_reward, std_reward))
            steps_log.append((episode, mean_steps))
            # Create and store video of agent behavior
            try:
                video_frames = create_agent_video(eval_env, Qtable, max_steps=video_length)
                videos[episode] = video_frames
            except Exception as e:
                st.warning(f"Could not create video for episode {episode}: {str(e)}")
                videos[episode] = []
            # Take Q-table snapshot
            qtable_snapshots[episode] = Qtable.copy()
            # Update metrics display
            current_episode_metric.metric("Episodes", f"{episode}/{n_episodes}",
                                          delta=f"{episode/n_episodes:.1%}")
            avg_reward_metric.metric("Avg. Reward", f"{mean_reward:.2f}",
                                     delta=f"{mean_reward - reward_log[-2][1]:.2f}" if len(reward_log) > 1 else None)
            avg_steps_metric.metric("Avg. Steps", f"{mean_steps:.1f}")
            success_rate_metric.metric("Success Rate", f"{success_rate:.1f}%")
            # Update progress charts
            if reward_log:
                # Prepare data for plots
                progress_df = pd.DataFrame(
                    reward_log, columns=["Episode", "Mean Reward", "Std Reward"]
                )
                steps_df = pd.DataFrame(steps_log, columns=["Episode", "Mean Steps"])
                # Create subplots
                fig = make_subplots(specs=[[{"secondary_y": True}]])
                # Add reward line
                fig.add_trace(
                    go.Scatter(
                        x=progress_df["Episode"],
                        y=progress_df["Mean Reward"],
                        mode="lines+markers",
                        name="Mean Reward",
                        line=dict(color="#1f77b4", width=3),
                        marker=dict(size=8)
                    )
                )
                # Add steps line on secondary axis
                fig.add_trace(
                    go.Scatter(
                        x=steps_df["Episode"],
                        y=steps_df["Mean Steps"],
                        mode="lines+markers",
                        name="Mean Steps",
                        line=dict(color="#ff7f0e", width=3, dash="dot"),
                        marker=dict(size=8)
                    ),
                    secondary_y=True
                )
                # Add confidence interval for reward
                fig.add_trace(
                    go.Scatter(
                        x=progress_df["Episode"].tolist() + progress_df["Episode"].tolist()[::-1],
                        y=(progress_df["Mean Reward"] + progress_df["Std Reward"]).tolist() +
                          (progress_df["Mean Reward"] - progress_df["Std Reward"]).tolist()[::-1],
                        fill="toself",
                        fillcolor="rgba(31, 119, 180, 0.2)",
                        line=dict(color="rgba(255,255,255,0)"),
                        hoverinfo="skip",
                        showlegend=False
                    )
                )
                # Update layout
                fig.update_layout(
                    title="Agent Performance Over Training",
                    xaxis_title="Training Episode",
                    margin=dict(l=20, r=20, t=40, b=20),
                    legend=dict(
                        orientation="h",
                        yanchor="bottom",
                        y=1.02,
                        xanchor="right",
                        x=1
                    ),
                    height=400
                )
                # Set y-axes titles
                fig.update_yaxes(title_text="Reward", secondary_y=False)
                fig.update_yaxes(title_text="Steps", secondary_y=True)
                metrics_chart.plotly_chart(fig, use_container_width=True)
                # Epsilon decay chart
                epsilon_df = pd.DataFrame({
                    "Episode": list(range(len(epsilons))),
                    "Epsilon": epsilons
                })
                epsilon_fig = px.line(
                    epsilon_df,
                    x="Episode",
                    y="Epsilon",
                    title="Exploration Rate (Epsilon) Decay"
                )
                epsilon_fig.update_layout(
                    xaxis_title="Training Episode",
                    yaxis_title="Epsilon Value",
                    height=250,
                    margin=dict(l=20, r=20, t=40, b=20)
                )
                epsilon_chart.plotly_chart(epsilon_fig, use_container_width=True)
            # Update Q-table visualization
            qtable_fig = px.imshow(
                Qtable,
                labels=dict(x="Actions", y="States", color="Q-Value"),
                x=['South', 'North', 'East', 'West', 'Pickup', 'Dropoff'],
                zmin=Qtable.min(),
                zmax=Qtable.max(),
                color_continuous_scale="Viridis"
            )
            qtable_fig.update_layout(
                title=f"Q-table at Episode {episode}",
                height=600,
                margin=dict(l=20, r=20, t=40, b=20)
            )
            qtable_visualization.plotly_chart(qtable_fig, use_container_width=True)
        # Q-table snapshot at regular intervals
        if episode % log_freq == 0:
            qtable_snapshots[episode] = Qtable.copy()
        # Update progress (throttled to every 100 episodes to keep the UI responsive)
        if episode % 100 == 0:
            elapsed = time.time() - start_time
            estimated = elapsed / (episode + 1) * (n_episodes - episode - 1) if episode > 0 else 0
            status_text.text(f"Training in progress... Time elapsed: {elapsed:.1f}s | Estimated time remaining: {estimated:.1f}s")
            progress_bar.progress((episode + 1) / n_episodes)
    # Training complete
    progress_bar.progress(1.0)
    status_text.success(f"✅ Training completed in {time.time() - start_time:.1f} seconds!")
    # Store results in session state for persistence
    st.session_state.trained_qtable = Qtable
    st.session_state.agent_videos = videos
    st.session_state.training_completed = True
    # Final evaluation
    final_mean, final_std, final_steps, final_success = evaluate_agent(
        env, max_steps, n_eval_episodes * 2, Qtable  # Double evaluation episodes for final eval
    )
    # Store final metrics
    st.session_state.final_metrics = {
        "mean_reward": final_mean,
        "std_reward": final_std,
        "mean_steps": final_steps,
        "success_rate": final_success,
        "q_min": Qtable.min(),
        "q_max": Qtable.max()
    }
    return Qtable, videos
# Environment setup
env = gym.make("Taxi-v3")
eval_env = gym.make("Taxi-v3", render_mode="rgb_array")
# Create training button if training hasn't completed
if not st.session_state.training_completed:
    train_col1, train_col2 = st.columns([3, 1])
    with train_col1:
        st.write("")  # For spacing
    with train_col2:
        start_training = st.button("🚀 Start Training", type="primary", use_container_width=True)
    # Start training when button is clicked
    if start_training:
        params = {
            "n_episodes": n_episodes,
            "learning_rate": learning_rate,
            "gamma": gamma,
            "max_steps": max_steps,
            "min_epsilon": min_epsilon,
            "max_epsilon": max_epsilon,
            "decay_rate": decay_rate,
            "log_freq": log_freq,
            "eval_every": eval_every,
            "n_eval_episodes": n_eval_episodes,
            "video_length": video_length if 'video_length' in locals() else 50
        }
        trained_qtable, agent_videos = train_agent(env, eval_env, params)
# If training is completed, show results
if st.session_state.training_completed:
    # Create tabs for different visualizations
    tab1, tab2, tab3 = st.tabs(["📊 Training Results", "🎬 Agent Evolution", "📋 Q-Table Visualization"])
    with tab1:
        # Summary metrics in nice boxes
        st.markdown("### 📊 Final Performance")
        metrics = st.session_state.final_metrics
        metric_cols = st.columns(4)
        with metric_cols[0]:
            st.metric("Final Average Reward", f"{metrics['mean_reward']:.2f}", delta=f"±{metrics['std_reward']:.2f}")
        with metric_cols[1]:
            st.metric("Average Steps to Complete", f"{metrics['mean_steps']:.1f}")
        with metric_cols[2]:
            st.metric("Success Rate", f"{metrics['success_rate']:.1f}%")
        with metric_cols[3]:
            st.metric("Q-values Range", f"{metrics['q_min']:.2f} to {metrics['q_max']:.2f}")
        # Download trained Q-table
        st.subheader("Export Model")
        # Serialize the Q-table to .npy bytes for download
        def get_table_download_link(array):
            buffer = BytesIO()
            np.save(buffer, array)
            b64 = base64.b64encode(buffer.getvalue()).decode()
            href = f'<a href="data:application/octet-stream;base64,{b64}" download="qtable.npy">Download Q-table (.npy)</a>'
            return href
        st.markdown(get_table_download_link(st.session_state.trained_qtable), unsafe_allow_html=True)
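        # Hedged usage hint: assumes the exported file is saved locally as "qtable.npy".
        # It shows how the downloaded array can be reloaded and queried for a greedy action.
        st.caption("After downloading, the Q-table can be reloaded and queried like this:")
        st.code(
            "import numpy as np\n"
            "qtable = np.load('qtable.npy')             # shape (500, 6) for Taxi-v3\n"
            "state = 42                                  # any encoded Taxi-v3 state\n"
            "best_action = int(qtable[state].argmax())   # greedy action for that state",
            language="python",
        )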
    with tab2:
        # Create video selection slider
        video_episodes = sorted(list(st.session_state.agent_videos.keys()))
        if video_episodes:
            selected_episode = st.select_slider(
                "Select checkpoint to view agent behavior:",
                options=video_episodes,
                format_func=lambda x: f"Episode {x} ({x/st.session_state.training_params['n_episodes']:.0%})"
            )
            # Display the selected video
            st.markdown(f"### Agent Behavior at Episode {selected_episode} ({selected_episode/st.session_state.training_params['n_episodes']:.0%})")
            video_html = frames_to_html_video(st.session_state.agent_videos[selected_episode])
            st.markdown(video_html, unsafe_allow_html=True)
        # Add explanation of agent behavior
        st.markdown("""
#### What Am I Looking At?
This animated visualization shows how the trained agent behaves at different stages of training.
You can observe:
- The **yellow square** represents the taxi
- The **letters (R, G, Y, B)** represent four fixed locations
- The **blue letter** represents the passenger pickup location
- The **purple letter** represents the passenger dropoff destination
- When the passenger is in the taxi, the taxi turns green

The agent makes decisions based on its learned Q-values. Early in training, movements may appear random as the agent explores.
Later in training, the agent should take more direct routes to complete the task efficiently.
""")
    with tab3:
        # Q-table visualization
        st.markdown("### Q-Table Visualization")
        st.info("This heatmap shows the learned Q-values that guide the agent's decision making.")
        # Generate a heatmap of the Q-table
        qtable_fig = px.imshow(
            st.session_state.trained_qtable,
            labels=dict(x="Actions", y="States", color="Q-Value"),
            x=['South', 'North', 'East', 'West', 'Pickup', 'Dropoff'],
            zmin=st.session_state.final_metrics["q_min"],
            zmax=st.session_state.final_metrics["q_max"],
            color_continuous_scale="Viridis"
        )
        qtable_fig.update_layout(
            title="Final Q-table",
            height=600,
            margin=dict(l=20, r=20, t=40, b=20)
        )
        st.plotly_chart(qtable_fig, use_container_width=True)
        # Add Q-table explanation
        st.markdown("""
#### Understanding the Q-Table
The Q-table is the heart of the Q-learning algorithm:
- Each **row** represents a different state (there are 500 possible states in Taxi-v3)
- Each **column** represents an action (South, North, East, West, Pickup, Dropoff)
- The **values** (colors) represent the expected future reward for taking that action in that state
- **Brighter colors** indicate higher expected rewards

The agent selects actions by choosing the highest value (brightest color) for its current state.
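In code, that choice is just `np.argmax(Qtable[state, :])`, which is what the `greedy_policy` helper in this app computes.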
""")
# Add educational resources at the bottom
with st.expander("📚 Learn More About Q-Learning"):
st.markdown("""
### Key Concepts in Q-Learning
* **Q-Value**: Represents the expected future reward for taking action A in state S (the rule that learns these values is shown below)
* **Exploration vs Exploitation**: Balancing between trying new actions and using known good actions
* **Learning Rate (α)**: Controls how much new information overrides old information
* **Discount Factor (γ)**: Determines the importance of future rewards
* **Epsilon-greedy Policy**: A strategy that balances exploration and exploitation
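
For reference, the update applied after every environment step in the training loop above is the standard tabular Q-learning rule:

    Q(s, a) ← Q(s, a) + α * [ r + γ * max_a' Q(s', a') - Q(s, a) ]
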
### Taxi-v3 Environment Details
The Taxi environment consists of a 5x5 grid world where a taxi needs to:
1. Navigate to the passenger's location
2. Pick up the passenger
3. Navigate to the destination
4. Drop off the passenger

**Actions**:
- Move South (0)
- Move North (1)
- Move East (2)
- Move West (3)
- Pickup passenger (4)
- Dropoff passenger (5)

**Rewards**:
- -1 per time step
- +20 for successful dropoff
- -10 for illegal pickup/dropoff actions
""")
# Footer
st.markdown("""
<div style="text-align: center; margin-top: 30px; padding-top: 20px; border-top: 1px solid #eee;">
<p style="color: #64748b;">Interactive Q-Learning Dashboard for Reinforcement Learning Education</p>
</div>
""", unsafe_allow_html=True)