🚕 Taxi-v3 Q-Learning Dashboard

import streamlit as st
import numpy as np
import gymnasium as gym
import matplotlib.pyplot as plt
import seaborn as sns
import pandas as pd
import plotly.express as px
import plotly.graph_objects as go
from plotly.subplots import make_subplots
import base64
from io import BytesIO
from PIL import Image
import time

# Use session state to persist data across reruns
if 'trained_qtable' not in st.session_state:
    st.session_state.trained_qtable = None
if 'agent_videos' not in st.session_state:
    st.session_state.agent_videos = {}
if 'training_completed' not in st.session_state:
    st.session_state.training_completed = False
if 'training_params' not in st.session_state:
    st.session_state.training_params = {}
if 'final_metrics' not in st.session_state:
    st.session_state.final_metrics = {}

# Set page configuration for a cleaner look
st.set_page_config(
    page_title="Taxi-v3 Q-Learning Dashboard",
    page_icon="🚕",
    layout="wide",
    initial_sidebar_state="expanded"
)

# Custom CSS to make the dashboard look cleaner
st.markdown("""
<style>
    .main .block-container {
        padding-top: 2rem;
        padding-bottom: 2rem;
    }
    .stTabs [data-baseweb="tab-list"] {
        gap: 10px;
    }
    .stTabs [data-baseweb="tab"] {
        background-color: #f0f2f6;
        border-radius: 4px 4px 0px 0px;
        padding: 10px 20px;
        font-weight: 600;
    }
    .stTabs [aria-selected="true"] {
        background-color: #e6f0ff;
        border-bottom: 2px solid #4e8df5;
    }
    .reportview-container .main .block-container {
        max-width: 1200px;
    }
    div[data-testid="stSidebarNav"] li div a {
        margin-left: 1rem;
        padding: 1rem;
        width: 300px;
        border-radius: 0.5rem;
    }
    div[data-testid="stSidebarNav"] li div::focus-visible {
        background-color: rgba(151, 166, 195, 0.15);
    }
    .stMetric {
        background-color: #f0f2f6;
        padding: 15px 20px;
        border-radius: 6px;
        margin-bottom: 10px;
    }
    .css-12w0qpk {
        background-color: #f8f9fa;
    }
</style>
""", unsafe_allow_html=True)

# Header
st.markdown("""
    <div style="text-align: center; margin-bottom: 30px;">
        <h1 style="color: #1e3a8a; margin-bottom:0;">🚕 Taxi-v3 Q-Learning Dashboard</h1>
        <p style="color: #64748b; font-size: 1.2em;">Interactive Reinforcement Learning Visualization</p>
    </div>
""", unsafe_allow_html=True)

# Create a two-column layout for the main dashboard
col1, col2 = st.columns([3, 2])

with col2:
    st.markdown("### 🎮 Environment Preview")
    
    # Fix: Create a proper environment preview by resetting first
    preview_env = gym.make("Taxi-v3", render_mode="rgb_array")
    preview_env.reset()  # Reset the environment first
    env_preview = preview_env.render()
    st.image(env_preview, caption="Taxi-v3 Environment", use_column_width=True)
    
    st.markdown("""
    <div style="background-color: #f0f8ff; padding: 15px; border-radius: 10px; margin-top: 20px;">
        <h4 style="margin-top: 0;">📝 About this Environment</h4>
        <p>The Taxi-v3 task involves navigating a taxi to pick up a passenger and drop them off at a destination.</p>
        <ul>
            <li><b>Yellow</b>: taxi</li>
            <li><b>Blue</b>: pick-up location</li>
            <li><b>Purple</b>: drop-off location</li>
            <li><b>Green</b>: passenger</li>
            <li><b>Letters (R, G, Y, B)</b>: locations</li>
        </ul>
    </div>
    """, unsafe_allow_html=True)

with col1:
    st.markdown("### ⚙️ Training Parameters")
    
    # Only show parameters if training hasn't completed yet
    if not st.session_state.training_completed:
        # Create a cleaner parameter input section
        col_a, col_b = st.columns(2)
        
        with col_a:
            n_episodes = st.number_input("Training Episodes", min_value=1000, max_value=100000, value=25000, step=1000)
            learning_rate = st.slider("Learning Rate (α)", 0.01, 1.0, 0.7, 0.01, 
                                    format="%.2f", help="Controls how much new information overrides old information")
            gamma = st.slider("Discount Factor (γ)", 0.80, 0.99, 0.95, 0.01, 
                            format="%.2f", help="Determines the importance of future rewards")
            max_steps = st.slider("Max Steps per Episode", 50, 500, 99)
        
        with col_b:
            min_epsilon = st.slider("Min Exploration Rate (ε)", 0.01, 0.5, 0.05, 0.01, 
                                  format="%.2f", help="Minimum probability of random action")
            max_epsilon = st.slider("Max Exploration Rate (ε)", 0.5, 1.0, 1.0, 0.01, 
                                  format="%.2f", help="Starting probability of random action")
            decay_rate = st.slider("Epsilon Decay Rate", 0.0001, 0.01, 0.001, 0.0001, 
                                 format="%.4f", help="How quickly exploration decreases")
            n_eval_episodes = st.slider("Evaluation Episodes", 10, 200, 100, 
                                       help="Number of episodes to evaluate performance")

        # Additional parameters in a collapsed section
        with st.expander("Advanced Settings"):
            log_freq = st.slider("Q-table Update Frequency (every N episodes)", 1, 1000, 500)
            eval_every = st.slider("Evaluation Frequency (% of training)", 5, 50, 10, 
                                  help="How often to evaluate agent performance")
            video_length = st.slider("Evaluation Video Length (steps)", 10, 200, 50, 
                                   help="Maximum steps to show in visualization videos")
    else:
        # If training is completed, show the parameters that were used
        st.info("Training completed with the following parameters:")
        params = st.session_state.training_params
        
        col_a, col_b = st.columns(2)
        with col_a:
            st.write(f"**Training Episodes**: {params['n_episodes']}")
            st.write(f"**Learning Rate (α)**: {params['learning_rate']}")
            st.write(f"**Discount Factor (γ)**: {params['gamma']}")
            st.write(f"**Max Steps per Episode**: {params['max_steps']}")
        
        with col_b:
            st.write(f"**Min Exploration Rate (ε)**: {params['min_epsilon']}")
            st.write(f"**Max Exploration Rate (ε)**: {params['max_epsilon']}")
            st.write(f"**Epsilon Decay Rate**: {params['decay_rate']}")
            st.write(f"**Evaluation Episodes**: {params['n_eval_episodes']}")
        
        # Option to reset and train again
        if st.button("Reset and Train Again", type="secondary"):
            st.session_state.training_completed = False
            st.session_state.trained_qtable = None
            st.session_state.agent_videos = {}
            st.session_state.training_params = {}
            st.session_state.final_metrics = {}
            st.experimental_rerun()

# Initialize Q-table
def initialize_q_table(state_space, action_space):
    return np.zeros((state_space, action_space))

# Policies
def greedy_policy(Qtable, state):
    return np.argmax(Qtable[state, :])

def epsilon_greedy_policy(Qtable, state, epsilon):
    if np.random.uniform(0, 1) > epsilon:
        return greedy_policy(Qtable, state)
    else:
        return env.action_space.sample()

# Function to create animation of agent behavior
def create_agent_video(env, Q, max_steps=100, seed=None):
    frames = []
    state, info = env.reset(seed=seed)
    
    # Add the initial frame
    frames.append(env.render())
    
    for _ in range(max_steps):
        # Choose action based on greedy policy
        action = greedy_policy(Q, state)
        
        # Step in environment
        state, reward, terminated, truncated, _ = env.step(action)
        
        # Render the frame after taking action
        frames.append(env.render())
        
        # Break if episode is done
        if terminated or truncated:
            break
    
    return frames

# Evaluation function
def evaluate_agent(env, max_steps, n_eval_episodes, Q, seed=None):
    rewards = []
    steps = []
    success_count = 0
    
    for episode in range(n_eval_episodes):
        state, info = env.reset(seed=seed[episode] if seed else None)
        total_rewards_ep = 0
        num_steps = 0
        
        for step in range(max_steps):
            action = greedy_policy(Q, state)
            state, reward, terminated, truncated, _ = env.step(action)
            total_rewards_ep += reward
            num_steps += 1
            
            if terminated or truncated:
                if reward > 0:  # Successfully completed the task
                    success_count += 1
                break
        
        rewards.append(total_rewards_ep)
        steps.append(num_steps)
    
    success_rate = success_count / n_eval_episodes * 100
    return np.mean(rewards), np.std(rewards), np.mean(steps), success_rate

# Function to convert frames to HTML video
def frames_to_html_video(frames, fps=5):
    if not frames:
        return "<p>No frames available</p>"

    try:
        # Create a PIL image from each frame
        pil_images = [Image.fromarray(frame) for frame in frames]
        
        # Save as animated GIF to a BytesIO object
        buffer = BytesIO()
        pil_images[0].save(
            buffer, 
            format='GIF', 
            save_all=True, 
            append_images=pil_images[1:], 
            duration=1000/fps, 
            loop=0
        )
        buffer.seek(0)
        
        # Encode as base64
        encoded = base64.b64encode(buffer.read()).decode("utf-8")
        
        # Embed in HTML
        html = f'<img src="data:image/gif;base64,{encoded}" alt="agent behavior" style="width:100%">'
        return html
    except Exception as e:
        return f"<p>Error generating video: {str(e)}</p>"

# Training function
def train_agent(env, eval_env, params):
    # Unpack parameters
    n_episodes = params["n_episodes"]
    learning_rate = params["learning_rate"]
    gamma = params["gamma"]
    max_steps = params["max_steps"]
    min_epsilon = params["min_epsilon"]
    max_epsilon = params["max_epsilon"]
    decay_rate = params["decay_rate"]
    log_freq = params["log_freq"]
    eval_every = params["eval_every"]
    n_eval_episodes = params["n_eval_episodes"]
    video_length = params["video_length"]
    
    # Store parameters in session state
    st.session_state.training_params = params
    
    # Initialize Q-table
    Qtable = initialize_q_table(env.observation_space.n, env.action_space.n)
    
    # Training metrics
    reward_log = []
    steps_log = []
    qtable_snapshots = {}
    videos = {}
    epsilons = []
    
    # Calculate checkpoints (ensure at least one checkpoint at the beginning)
    num_checkpoints = 10  # Number of checkpoints to create
    checkpoint_episodes = [int(n_episodes * i / num_checkpoints) for i in range(1, num_checkpoints + 1)]
    checkpoint_episodes[0] = max(1, checkpoint_episodes[0])  # Ensure first checkpoint is at least at episode 1
    
    # Progress tracking
    progress_bar = st.progress(0)
    status_text = st.empty()
    
    # Dashboard components
    tab1, tab2, tab3 = st.tabs(["📊 Training Progress", "🎬 Agent Evolution", "📋 Q-Table Visualization"])
    
    with tab1:
        col_metrics1, col_metrics2, col_metrics3, col_metrics4 = st.columns(4)
        with col_metrics1:
            current_episode_metric = st.empty()
        with col_metrics2:
            avg_reward_metric = st.empty()
        with col_metrics3:
            avg_steps_metric = st.empty()
        with col_metrics4:
            success_rate_metric = st.empty()
            
        metrics_chart = st.empty()
        epsilon_chart = st.empty()
    
    with tab2:
        video_placeholder = st.empty()
        
    with tab3:
        qtable_visualization = st.empty()
    
    # Training loop
    start_time = time.time()
    
    for episode in range(n_episodes):
        # Calculate epsilon for this episode
        epsilon = min_epsilon + (max_epsilon - min_epsilon) * np.exp(-decay_rate * episode)
        epsilons.append(epsilon)
        
        state, info = env.reset()
        total_reward = 0
        
        # Episode steps
        for step in range(max_steps):
            action = epsilon_greedy_policy(Qtable, state, epsilon)
            new_state, reward, terminated, truncated, _ = env.step(action)
            
            # Update Q-table
            Qtable[state][action] += learning_rate * (
                reward + gamma * np.max(Qtable[new_state, :]) - Qtable[state][action]
            )
            
            total_reward += reward
            if terminated or truncated:
                break
                
            state = new_state
        
        # Evaluation at checkpoints
        if episode in checkpoint_episodes or episode == n_episodes - 1:
            mean_reward, std_reward, mean_steps, success_rate = evaluate_agent(
                env, max_steps, n_eval_episodes, Qtable
            )
            reward_log.append((episode, mean_reward, std_reward))
            steps_log.append((episode, mean_steps))
            
            # Create and store video of agent behavior
            try:
                video_frames = create_agent_video(eval_env, Qtable, max_steps=video_length)
                videos[episode] = video_frames
            except Exception as e:
                st.warning(f"Could not create video for episode {episode}: {str(e)}")
                videos[episode] = []
            
            # Take Q-table snapshot
            qtable_snapshots[episode] = Qtable.copy()
            
            # Update metrics display
            current_episode_metric.metric("Episodes", f"{episode}/{n_episodes}", 
                                         delta=f"{episode/n_episodes:.1%}")
            avg_reward_metric.metric("Avg. Reward", f"{mean_reward:.2f}", 
                                    delta=f"{mean_reward - reward_log[-2][1]:.2f}" if len(reward_log) > 1 else None)
            avg_steps_metric.metric("Avg. Steps", f"{mean_steps:.1f}")
            success_rate_metric.metric("Success Rate", f"{success_rate:.1f}%")
            
            # Update progress charts
            if reward_log:
                # Prepare data for plots
                progress_df = pd.DataFrame(
                    reward_log, columns=["Episode", "Mean Reward", "Std Reward"]
                )
                steps_df = pd.DataFrame(steps_log, columns=["Episode", "Mean Steps"])
                
                # Create subplots
                fig = make_subplots(specs=[[{"secondary_y": True}]])
                
                # Add reward line
                fig.add_trace(
                    go.Scatter(
                        x=progress_df["Episode"],
                        y=progress_df["Mean Reward"],
                        mode="lines+markers",
                        name="Mean Reward",
                        line=dict(color="#1f77b4", width=3),
                        marker=dict(size=8)
                    )
                )
                
                # Add steps line on secondary axis
                fig.add_trace(
                    go.Scatter(
                        x=steps_df["Episode"],
                        y=steps_df["Mean Steps"],
                        mode="lines+markers",
                        name="Mean Steps",
                        line=dict(color="#ff7f0e", width=3, dash="dot"),
                        marker=dict(size=8)
                    ),
                    secondary_y=True
                )
                
                # Add confidence interval for reward
                fig.add_trace(
                    go.Scatter(
                        x=progress_df["Episode"].tolist() + progress_df["Episode"].tolist()[::-1],
                        y=(progress_df["Mean Reward"] + progress_df["Std Reward"]).tolist() + 
                           (progress_df["Mean Reward"] - progress_df["Std Reward"]).tolist()[::-1],
                        fill="toself",
                        fillcolor="rgba(31, 119, 180, 0.2)",
                        line=dict(color="rgba(255,255,255,0)"),
                        hoverinfo="skip",
                        showlegend=False
                    )
                )
                
                # Update layout
                fig.update_layout(
                    title="Agent Performance Over Training",
                    xaxis_title="Training Episode",
                    margin=dict(l=20, r=20, t=40, b=20),
                    legend=dict(
                        orientation="h",
                        yanchor="bottom",
                        y=1.02,
                        xanchor="right",
                        x=1
                    ),
                    height=400
                )
                
                # Set y-axes titles
                fig.update_yaxes(title_text="Reward", secondary_y=False)
                fig.update_yaxes(title_text="Steps", secondary_y=True)
                
                metrics_chart.plotly_chart(fig, use_container_width=True)
                
                # Epsilon decay chart
                epsilon_df = pd.DataFrame({
                    "Episode": list(range(len(epsilons))),
                    "Epsilon": epsilons
                })
                
                epsilon_fig = px.line(
                    epsilon_df, 
                    x="Episode", 
                    y="Epsilon",
                    title="Exploration Rate (Epsilon) Decay"
                )
                
                epsilon_fig.update_layout(
                    xaxis_title="Training Episode",
                    yaxis_title="Epsilon Value",
                    height=250,
                    margin=dict(l=20, r=20, t=40, b=20)
                )
                
                epsilon_chart.plotly_chart(epsilon_fig, use_container_width=True)
            
            # Update Q-table visualization
            qtable_fig = px.imshow(
                Qtable,
                labels=dict(x="Actions", y="States", color="Q-Value"),
                x=['South', 'North', 'East', 'West', 'Pickup', 'Dropoff'],
                zmin=Qtable.min(),
                zmax=Qtable.max(),
                color_continuous_scale="Viridis"
            )
            
            qtable_fig.update_layout(
                title=f"Q-table at Episode {episode}",
                height=600,
                margin=dict(l=20, r=20, t=40, b=20)
            )
            
            qtable_visualization.plotly_chart(qtable_fig, use_container_width=True)
        
        # Q-table snapshot at regular intervals
        if episode % log_freq == 0:
            qtable_snapshots[episode] = Qtable.copy()
        
        # Update progress
        if episode % 100 == 0:
            elapsed = time.time() - start_time
            estimated = elapsed / (episode + 1) * (n_episodes - episode - 1) if episode > 0 else 0
            status_text.text(f"Training in progress... Time elapsed: {elapsed:.1f}s | Estimated time remaining: {estimated:.1f}s")
            progress_bar.progress((episode + 1) / n_episodes)
    
    # Training complete
    progress_bar.progress(1.0)
    status_text.success(f"✅ Training completed in {time.time() - start_time:.1f} seconds!")
    
    # Store results in session state for persistence
    st.session_state.trained_qtable = Qtable
    st.session_state.agent_videos = videos
    st.session_state.training_completed = True
    
    # Final evaluation
    final_mean, final_std, final_steps, final_success = evaluate_agent(
        env, max_steps, n_eval_episodes * 2, Qtable  # Double evaluation episodes for final eval
    )
    
    # Store final metrics
    st.session_state.final_metrics = {
        "mean_reward": final_mean,
        "std_reward": final_std,
        "mean_steps": final_steps,
        "success_rate": final_success,
        "q_min": Qtable.min(),
        "q_max": Qtable.max()
    }
    
    return Qtable, videos

# Environment setup
env = gym.make("Taxi-v3")
eval_env = gym.make("Taxi-v3", render_mode="rgb_array")

# Create training button if training hasn't completed
if not st.session_state.training_completed:
    train_col1, train_col2 = st.columns([3, 1])
    with train_col1:
        st.write("")  # For spacing
    with train_col2:
        start_training = st.button("🚀 Start Training", type="primary", use_container_width=True)

    # Start training when button is clicked
    if start_training:
        params = {
            "n_episodes": n_episodes,
            "learning_rate": learning_rate,
            "gamma": gamma,
            "max_steps": max_steps,
            "min_epsilon": min_epsilon,
            "max_epsilon": max_epsilon,
            "decay_rate": decay_rate,
            "log_freq": log_freq,
            "eval_every": eval_every,
            "n_eval_episodes": n_eval_episodes,
            "video_length": video_length if 'video_length' in locals() else 50
        }
        
        trained_qtable, agent_videos = train_agent(env, eval_env, params)

# If training is completed, show results
if st.session_state.training_completed:
    # Create tabs for different visualizations
    tab1, tab2, tab3 = st.tabs(["📊 Training Results", "🎬 Agent Evolution", "📋 Q-Table Visualization"])
    
    with tab1:
        # Summary metrics in nice boxes
        st.markdown("### 📊 Final Performance")
        
        metrics = st.session_state.final_metrics
        metric_cols = st.columns(4)
        with metric_cols[0]:
            st.metric("Final Average Reward", f"{metrics['mean_reward']:.2f}", delta=f"±{metrics['std_reward']:.2f}")
        with metric_cols[1]:
            st.metric("Average Steps to Complete", f"{metrics['mean_steps']:.1f}")
        with metric_cols[2]:
            st.metric("Success Rate", f"{metrics['success_rate']:.1f}%")
        with metric_cols[3]:
            st.metric("Q-values Range", f"{metrics['q_min']:.2f} to {metrics['q_max']:.2f}")
        
        # Download trained Q-table
        st.subheader("Export Model")
        
        # Convert Q-table to bytes for download
        def get_table_download_link(array):
            csvfile = BytesIO()
            np.save(csvfile, array)
            b64 = base64.b64encode(csvfile.getvalue()).decode()
            href = f'<a href="data:application/octet-stream;base64,{b64}" download="qtable.npy">Download Q-table (.npy)</a>'
            return href
        
        st.markdown(get_table_download_link(st.session_state.trained_qtable), unsafe_allow_html=True)
    
    with tab2:
        # Create video selection slider
        video_episodes = sorted(list(st.session_state.agent_videos.keys()))
        
        if video_episodes:
            selected_episode = st.select_slider(
                "Select checkpoint to view agent behavior:",
                options=video_episodes,
                format_func=lambda x: f"Episode {x} ({x/st.session_state.training_params['n_episodes']:.0%})"
            )
            
            # Display the selected video
            st.markdown(f"### Agent Behavior at Episode {selected_episode} ({selected_episode/st.session_state.training_params['n_episodes']:.0%})")
            
            video_html = frames_to_html_video(st.session_state.agent_videos[selected_episode])
            st.markdown(video_html, unsafe_allow_html=True)
            
            # Add explanation of agent behavior
            st.markdown("""
            #### What Am I Looking At?
            
            This animated visualization shows how the trained agent behaves at different stages of training. 
            You can observe:
            
            - The **yellow square** represents the taxi
            - The **letters (R, G, Y, B)** represent four fixed locations
            - The **blue letter** represents the passenger pickup location
            - The **purple letter** represents the passenger dropoff destination
            - When the passenger is in the taxi, the taxi turns green
            
            The agent makes decisions based on its learned Q-values. Early in training, movements may appear random as the agent explores. 
            Later in training, the agent should take more direct routes to complete the task efficiently.
            """)
    
    with tab3:
        # Q-table visualization
        st.markdown("### Q-Table Visualization")
        st.info("This heatmap shows the learned Q-values that guide the agent's decision making.")
        
        # Generate a heatmap of the Q-table
        qtable_fig = px.imshow(
            st.session_state.trained_qtable,
            labels=dict(x="Actions", y="States", color="Q-Value"),
            x=['South', 'North', 'East', 'West', 'Pickup', 'Dropoff'],
            zmin=st.session_state.final_metrics["q_min"],
            zmax=st.session_state.final_metrics["q_max"],
            color_continuous_scale="Viridis"
        )
        
        qtable_fig.update_layout(
            title="Final Q-table",
            height=600,
            margin=dict(l=20, r=20, t=40, b=20)
        )
        
        st.plotly_chart(qtable_fig, use_container_width=True)
        
        # Add Q-table explanation
        st.markdown("""
        #### Understanding the Q-Table
        
        The Q-table is the heart of the Q-learning algorithm:
        
        - Each **row** represents a different state (there are 500 possible states in Taxi-v3)
        - Each **column** represents an action (South, North, East, West, Pickup, Dropoff)
        - The **values** (colors) represent the expected future reward for taking that action in that state
        - **Brighter colors** indicate higher expected rewards
        
        The agent selects actions by choosing the highest value (brightest color) for its current state.
        """)

# Add educational resources at the bottom
with st.expander("📚 Learn More About Q-Learning"):
    st.markdown("""
    ### Key Concepts in Q-Learning
    
    * **Q-Value**: Represents the expected future reward for taking action A in state S
    * **Exploration vs Exploitation**: Balancing between trying new actions and using known good actions
    * **Learning Rate (α)**: Controls how much new information overrides old information
    * **Discount Factor (γ)**: Determines the importance of future rewards
    * **Epsilon-greedy Policy**: A strategy that balances exploration and exploitation
    
    ### Taxi-v3 Environment Details
    
    The Taxi environment consists of a 5x5 grid world where a taxi needs to:
    1. Navigate to the passenger's location
    2. Pick up the passenger
    3. Navigate to the destination
    4. Drop off the passenger
    
    **Actions**:
    - Move South (0)
    - Move North (1)
    - Move East (2)
    - Move West (3)
    - Pickup passenger (4)
    - Dropoff passenger (5)
    
    **Rewards**:
    - -1 per time step
    - +20 for successful dropoff
    - -10 for illegal pickup/dropoff actions
    """)

# Footer
st.markdown("""
<div style="text-align: center; margin-top: 30px; padding-top: 20px; border-top: 1px solid #eee;">
    <p style="color: #64748b;">Interactive Q-Learning Dashboard for Reinforcement Learning Education</p>
</div>
""", unsafe_allow_html=True)