<!-- Spaces: arad1367 / PPO-Simulation · Running · File size: 44,047 Bytes -->

<!-- PPO Simulation By Pejman Ebrahimi -->
<!DOCTYPE html>
<html lang="en">
  <head>
    <meta charset="UTF-8" />
    <meta name="viewport" content="width=device-width, initial-scale=1.0" />
    <title>PPO Reinforcement Learning Simulation</title>
    <style>
      body {
        font-family: Arial, sans-serif;
        margin: 0;
        padding: 20px;
        line-height: 1.6;
        color: #333;
        background-color: #f8f9fa;
      }
      .container {
        max-width: 1000px;
        margin: 0 auto;
        background-color: white;
        padding: 20px;
        border-radius: 8px;
        box-shadow: 0 2px 10px rgba(0, 0, 0, 0.1);
      }
      h1,
      h2,
      h3 {
        color: #2c3e50;
      }
      h1 {
        text-align: center;
        margin-bottom: 30px;
        border-bottom: 2px solid #3498db;
        padding-bottom: 10px;
      }
      .grid-container {
        display: grid;
        grid-template-columns: repeat(10, 1fr);
        gap: 2px;
        margin: 20px 0;
      }
      .cell {
        width: 100%;
        aspect-ratio: 1;
        background-color: #ecf0f1;
        display: flex;
        align-items: center;
        justify-content: center;
        cursor: pointer;
        position: relative;
        transition: all 0.3s;
      }
      .agent {
        background-color: #3498db;
        border-radius: 50%;
        width: 80%;
        height: 80%;
        position: absolute;
      }
      .goal {
        background-color: #2ecc71;
        width: 80%;
        height: 80%;
        position: absolute;
      }
      .obstacle {
        background-color: #e74c3c;
        width: 80%;
        height: 80%;
        position: absolute;
      }
      .panel {
        background-color: #f5f7f9;
        padding: 15px;
        border-radius: 5px;
        margin-bottom: 20px;
        border: 1px solid #ddd;
      }
      .controls {
        display: flex;
        gap: 10px;
        flex-wrap: wrap;
        margin: 20px 0;
      }
      button {
        padding: 8px 15px;
        background-color: #3498db;
        color: white;
        border: none;
        border-radius: 4px;
        cursor: pointer;
        transition: background-color 0.3s;
      }
      button:hover {
        background-color: #2980b9;
      }
      button:disabled {
        background-color: #95a5a6;
        cursor: not-allowed;
      }
      .sliders {
        display: flex;
        flex-direction: column;
        gap: 10px;
        margin: 15px 0;
      }
      .slider-container {
        display: flex;
        align-items: center;
      }
      .slider-container label {
        flex: 1;
        min-width: 180px;
      }
      .slider-container input {
        flex: 2;
      }
      .slider-value {
        flex: 0 0 50px;
        text-align: right;
      }
      #log-container {
        max-height: 200px;
        overflow-y: auto;
        background-color: #2c3e50;
        color: #ecf0f1;
        padding: 10px;
        border-radius: 4px;
        margin-top: 20px;
        font-family: monospace;
      }
      .log-entry {
        margin: 5px 0;
      }
      .tab-container {
        margin-top: 20px;
      }
      .tab-buttons {
        display: flex;
        border-bottom: 1px solid #ddd;
      }
      .tab-button {
        padding: 10px 20px;
        background-color: #f1f1f1;
        border: none;
        cursor: pointer;
        transition: background-color 0.3s;
      }
      .tab-button.active {
        background-color: #3498db;
        color: white;
      }
      .tab-content {
        display: none;
        padding: 15px;
        border: 1px solid #ddd;
        border-top: none;
        animation: fadeIn 0.5s;
      }
      .tab-content.active {
        display: block;
      }
      #policy-display {
        width: 100%;
        height: 300px;
        overflow: auto;
        margin-top: 10px;
      }
      .policy-grid {
        display: grid;
        grid-template-columns: repeat(10, 1fr);
        gap: 2px;
      }
      .policy-cell {
        aspect-ratio: 1;
        border: 1px solid #ddd;
        padding: 2px;
        font-size: 10px;
        display: flex;
        flex-direction: column;
        align-items: center;
        justify-content: center;
      }
      .arrow {
        width: 0;
        height: 0;
        border-style: solid;
        margin: 2px;
      }
      .arrow-up {
        border-width: 0 4px 8px 4px;
        border-color: transparent transparent #3498db transparent;
      }
      .arrow-right {
        border-width: 4px 0 4px 8px;
        border-color: transparent transparent transparent #3498db;
      }
      .arrow-down {
        border-width: 8px 4px 0 4px;
        border-color: #3498db transparent transparent transparent;
      }
      .arrow-left {
        border-width: 4px 8px 4px 0;
        border-color: transparent #3498db transparent transparent;
      }
      .progress-container {
        margin-top: 10px;
        background-color: #f1f1f1;
        border-radius: 5px;
        height: 20px;
        position: relative;
      }
      .progress-bar {
        height: 100%;
        background-color: #3498db;
        border-radius: 5px;
        width: 0%;
        transition: width 0.3s;
      }
      .chart-container {
        height: 300px;
        margin: 15px 0;
      }
      @keyframes fadeIn {
        from {
          opacity: 0;
        }
        to {
          opacity: 1;
        }
      }
      .popup {
        display: none;
        position: fixed;
        top: 50%;
        left: 50%;
        transform: translate(-50%, -50%);
        background-color: white;
        padding: 20px;
        border-radius: 8px;
        box-shadow: 0 4px 20px rgba(0, 0, 0, 0.2);
        z-index: 1000;
        max-width: 80%;
        max-height: 80%;
        overflow-y: auto;
      }
      .popup-overlay {
        display: none;
        position: fixed;
        top: 0;
        left: 0;
        width: 100%;
        height: 100%;
        background-color: rgba(0, 0, 0, 0.5);
        z-index: 999;
      }
      .reward-display {
        font-weight: bold;
        font-size: 1.2em;
        text-align: center;
        margin: 10px 0;
      }
      .explanation {
        background-color: #e8f4fc;
        padding: 15px;
        border-radius: 5px;
        margin: 10px 0;
        border-left: 4px solid #3498db;
      }
      .highlight {
        background-color: #fffacd;
        padding: 2px 4px;
        border-radius: 3px;
      }
      .concept-box {
        border: 1px solid #ddd;
        margin: 15px 0;
        border-radius: 5px;
        overflow: hidden;
      }
      .concept-title {
        background-color: #3498db;
        color: white;
        padding: 10px;
        margin: 0;
      }
      .concept-content {
        padding: 15px;
      }
    </style>
  </head>
  <body>
    <div class="container">
      <h1>Proximal Policy Optimization (PPO) Simulation</h1>

      <div class="explanation">
        <p>
          This simulation demonstrates how an agent learns to navigate to a goal
          using <strong>Proximal Policy Optimization (PPO)</strong>. PPO is an
          on-policy reinforcement learning algorithm that uses a "clipping"
          mechanism to prevent large policy updates, making training more stable
          and efficient.
        </p>
      </div>

      <div class="tab-container">
        <div class="tab-buttons">
          <button class="tab-button active" onclick="openTab('simulation-tab')">
            Simulation
          </button>
          <button class="tab-button" onclick="openTab('concepts-tab')">
            PPO Concepts
          </button>
          <button class="tab-button" onclick="openTab('metrics-tab')">
            Training Metrics
          </button>
        </div>

        <div id="simulation-tab" class="tab-content active">
          <div class="panel">
            <h3>Environment</h3>
            <p>
              The agent (blue) must navigate to the goal (green) while avoiding
              obstacles (red).
            </p>
            <div class="grid-container" id="grid"></div>
            <div class="reward-display">
              Total Reward: <span id="reward-value">0</span>
            </div>
          </div>

          <div class="controls">
            <button id="start-btn" onclick="startTraining()">
              Start Training
            </button>
            <button id="reset-btn" onclick="resetEnvironment()">
              Reset Environment
            </button>
            <button id="step-btn" onclick="stepTraining()" disabled>
              Step Forward
            </button>
            <button id="place-obstacle-btn" onclick="toggleObstaclePlacement()">
              Place Obstacles
            </button>
            <button id="animation-speed-btn" onclick="toggleAnimationSpeed()">
              Animation Speed: Normal
            </button>
          </div>

          <div class="panel">
            <h3>PPO Parameters</h3>
            <div class="sliders">
              <div class="slider-container">
                <label for="clip-ratio">Clip Ratio (ε):</label>
                <input
                  type="range"
                  id="clip-ratio"
                  min="0.05"
                  max="0.5"
                  step="0.05"
                  value="0.2"
                  oninput="updateSliderValue('clip-ratio')"
                />
                <span class="slider-value" id="clip-ratio-value">0.2</span>
              </div>
              <div class="slider-container">
                <label for="learning-rate">Learning Rate:</label>
                <input
                  type="range"
                  id="learning-rate"
                  min="0.01"
                  max="1"
                  step="0.01"
                  value="0.1"
                  oninput="updateSliderValue('learning-rate')"
                />
                <span class="slider-value" id="learning-rate-value">0.1</span>
              </div>
              <div class="slider-container">
                <label for="epochs">PPO Epochs per Update:</label>
                <input
                  type="range"
                  id="epochs"
                  min="1"
                  max="10"
                  step="1"
                  value="4"
                  oninput="updateSliderValue('epochs')"
                />
                <span class="slider-value" id="epochs-value">4</span>
              </div>
            </div>
          </div>

          <div class="panel">
            <h3>Policy Visualization</h3>
            <p>
              This shows the current policy of the agent (arrows indicate
              preferred actions in each state).
            </p>
            <div id="policy-display">
              <div class="policy-grid" id="policy-grid"></div>
            </div>
          </div>

          <!-- Training log output. role="log" exposes it as a polite live
               region so newly appended entries are announced by assistive
               technology. -->
          <div id="log-container" role="log"></div>
        </div>

        <div id="concepts-tab" class="tab-content">
          <div class="concept-box">
            <h3 class="concept-title">What is PPO?</h3>
            <div class="concept-content">
              <p>
                Proximal Policy Optimization (PPO) is a policy gradient method
                for reinforcement learning developed by OpenAI in 2017. It has
                become one of the most popular RL algorithms due to its
                simplicity and effectiveness.
              </p>
              <p>PPO aims to balance two objectives:</p>
              <ul>
                <li>Improving the agent's policy to maximize rewards</li>
                <li>
                  Preventing large policy updates that could destabilize
                  training
                </li>
              </ul>
            </div>
          </div>

          <div class="concept-box">
            <h3 class="concept-title">Key Innovations in PPO</h3>
            <div class="concept-content">
              <p>
                The central innovation in PPO is the
                <strong>clipped surrogate objective function</strong>:
              </p>
              <p style="text-align: center">
                L<sup>CLIP</sup>(θ) = E[min(r<sub>t</sub>(θ)A<sub>t</sub>,
                clip(r<sub>t</sub>(θ), 1-ε, 1+ε)A<sub>t</sub>)]
              </p>
              <p>where:</p>
              <ul>
                <li>
                  <strong>r<sub>t</sub>(θ)</strong> is the ratio of
                  probabilities under new and old policies
                </li>
                <li>
                  <strong>A<sub>t</sub></strong> is the advantage estimate
                </li>
                <li>
                  <strong>ε</strong> is the clipping parameter (usually 0.1 or
                  0.2)
                </li>
              </ul>
              <p>
                The clipping mechanism ensures that the policy update stays
                within a "trust region" by limiting how much the new policy can
                deviate from the old one.
              </p>
            </div>
          </div>

          <div class="concept-box">
            <h3 class="concept-title">How PPO Works in This Simulation</h3>
            <div class="concept-content">
              <ol>
                <li>
                  The agent collects experience by interacting with the
                  environment using its current policy
                </li>
                <li>Advantages are computed for each state-action pair</li>
                <li>
                  The policy is updated using the clipped surrogate objective
                </li>
                <li>
                  Multiple optimization epochs are performed on the same batch
                  of data
                </li>
                <li>The process repeats with the new policy</li>
              </ol>
              <p>
                You can observe these steps in action in the simulation tab by
                watching the policy visualization and training metrics.
              </p>
            </div>
          </div>

          <div class="concept-box">
            <h3 class="concept-title">PPO vs. Other RL Algorithms</h3>
            <div class="concept-content">
              <p>PPO improves upon earlier algorithms in several ways:</p>
              <ul>
                <li>
                  <strong>vs. REINFORCE:</strong> More stable training due to
                  advantage estimation and clipping
                </li>
                <li>
                  <strong>vs. TRPO:</strong> Simpler implementation while
                  maintaining similar performance
                </li>
                <li>
                  <strong>vs. A2C/A3C:</strong> Better sample efficiency and
                  more stable policy updates
                </li>
                <li>
                  <strong>vs. Off-policy algorithms (DQN, DDPG):</strong> Less
                  sensitive to hyperparameters and often more stable
                </li>
              </ul>
            </div>
          </div>
        </div>

        <div id="metrics-tab" class="tab-content">
          <div class="panel">
            <h3>Training Progress</h3>
            <div class="progress-container">
              <div class="progress-bar" id="training-progress"></div>
            </div>
            <p id="episode-counter">Episodes: 0 / 100</p>
          </div>

          <div class="panel">
            <h3>Reward Over Time</h3>
            <div class="chart-container" id="reward-chart"></div>
          </div>

          <div class="panel">
            <h3>Policy Loss</h3>
            <div class="chart-container" id="policy-loss-chart"></div>
          </div>

          <div class="panel">
            <h3>Value Loss</h3>
            <div class="chart-container" id="value-loss-chart"></div>
          </div>
        </div>
      </div>
    </div>

    <!-- Popup: #popup-overlay is the dimmed backdrop and #popup the content
         box. Both start hidden (display: none in the stylesheet). The dialog
         role plus aria-labelledby give the popup an accessible name taken
         from its heading. -->
    <div class="popup-overlay" id="popup-overlay"></div>
    <div
      class="popup"
      id="popup"
      role="dialog"
      aria-labelledby="popup-title"
    >
      <h2 id="popup-title">Title</h2>
      <div id="popup-content">Content</div>
      <button type="button" onclick="closePopup()">Close</button>
    </div>

    <script>
      // Environment configuration
      // Side length of the square grid world; loops over the grid run 0..GRID_SIZE-1 on both axes.
      const GRID_SIZE = 10;
      let grid = []; // Cleared by initializeEnvironment()
      let agentPos = { x: 0, y: 0 }; // Current agent cell; reset to (0, 0) at the start of every episode
      let goalPos = { x: 9, y: 9 }; // Goal cell; reaching it ends the episode
      let obstacles = []; // List of { x, y } cells the agent cannot enter
      let placingObstacles = false; // While true, clicking a grid cell toggles an obstacle there

      // Agent and PPO parameters
      // Tabular policy: maps a "x,y" state key to { up, right, down, left } probabilities.
      let policyNetwork = {};
      // Tabular value function: maps a "x,y" state key to a numeric value estimate.
      let valueNetwork = {};
      // clipRatio, learningRate and ppoEpochs are re-read from their sliders by updatePPO().
      let clipRatio = 0.2;
      let learningRate = 0.1; // Default learning rate (0-1 range)
      let ppoEpochs = 4;
      let gamma = 0.99; // Discount factor
      let lambda = 0.95; // GAE parameter

      // Training state
      let isTraining = false; // True while the automatic training loop should keep running
      let episode = 0; // Number of completed episodes
      let maxEpisodes = 100; // Training stops once `episode` reaches this count
      let episodeSteps = 0; // Steps taken so far in the current episode
      let maxStepsPerEpisode = 100; // Increased max steps to allow more exploration
      let totalReward = 0; // Reward accumulated in the current episode
      let episodeRewards = []; // One total reward per finished episode
      let policyLosses = []; // One policy loss per PPO update
      let valueLosses = []; // One value loss per PPO update

      // Per-episode rollout data consumed by updatePPO()
      // Each entry is { state, action, reward, nextState, done }.
      let trajectories = [];
      // Deep copy of policyNetwork taken at the start of each episode; used as the
      // denominator of the PPO probability ratio.
      let oldPolicy = {};

      // Exploration parameters
      // Recomputed at the start of every episode: decays from 0.2 toward a floor of 0.05.
      let explorationRate = 0.2; // Probability of taking a random action (exploration)

      // Rebuild the grid world from scratch: forget all obstacles, recreate the
      // clickable cell elements, put the agent and goal back on their start
      // cells, then reset the policy/value tables and refresh their displays.
      function initializeEnvironment() {
        grid = [];
        obstacles = [];

        const board = document.getElementById("grid");
        board.innerHTML = "";

        // Cells are appended in row-major order (all x for y = 0, then y = 1, ...).
        const cellCount = GRID_SIZE * GRID_SIZE;
        for (let index = 0; index < cellCount; index++) {
          const tile = document.createElement("div");
          tile.classList.add("cell");
          tile.dataset.x = index % GRID_SIZE;
          tile.dataset.y = Math.floor(index / GRID_SIZE);
          tile.addEventListener("click", handleCellClick);
          board.appendChild(tile);
        }

        // Start positions: agent top-left, goal bottom-right.
        agentPos = { x: 0, y: 0 };
        goalPos = { x: 9, y: 9 };
        renderGrid();

        // Fresh learning state and matching visuals.
        initializeNetworks();
        renderPolicy();
        updateReward(0);
      }

      // Reset the tabular policy and value function.
      // The learning rate is re-read from its slider, then every grid cell gets
      // a uniform action distribution (0.25 per action) and a value of zero.
      function initializeNetworks() {
        policyNetwork = {};
        valueNetwork = {};

        const rateSlider = document.getElementById("learning-rate");
        learningRate = parseFloat(rateSlider.value);

        for (let row = 0; row < GRID_SIZE; row++) {
          for (let col = 0; col < GRID_SIZE; col++) {
            // States are keyed as "x,y".
            const key = `${col},${row}`;
            policyNetwork[key] = { up: 0.25, right: 0.25, down: 0.25, left: 0.25 };
            valueNetwork[key] = 0;
          }
        }
      }

      // Redraw the environment grid: empty every cell, then add a marker
      // element for the agent, the goal, and each obstacle.
      function renderGrid() {
        for (const cell of document.querySelectorAll(".cell")) {
          cell.innerHTML = "";
        }

        // Append a <div class="className"> to the cell at the given position.
        const placeMarker = (pos, className) => {
          const host = document.querySelector(
            `.cell[data-x="${pos.x}"][data-y="${pos.y}"]`
          );
          const marker = document.createElement("div");
          marker.classList.add(className);
          host.appendChild(marker);
        };

        placeMarker(agentPos, "agent");
        placeMarker(goalPos, "goal");
        for (const obstacle of obstacles) {
          placeMarker(obstacle, "obstacle");
        }
      }

      // Redraw the policy panel with one cell per grid state, in row-major order.
      // Obstacle cells are painted red and the goal cell green. Every other cell
      // gets an arrow for each action whose probability exceeds 0.2, and a
      // background whose opacity reflects the state's current value estimate.
      function renderPolicy() {
        const policyGrid = document.getElementById("policy-grid");
        policyGrid.innerHTML = "";

        for (let y = 0; y < GRID_SIZE; y++) {
          for (let x = 0; x < GRID_SIZE; x++) {
            const cell = document.createElement("div");
            cell.classList.add("policy-cell");

            // Obstacles have no policy to show
            if (isObstacle(x, y)) {
              cell.style.backgroundColor = "#e74c3c";
              policyGrid.appendChild(cell);
              continue;
            }

            // If it's the goal, mark it green
            if (x === goalPos.x && y === goalPos.y) {
              cell.style.backgroundColor = "#2ecc71";
              policyGrid.appendChild(cell);
              continue;
            }

            const stateKey = `${x},${y}`;

            // One arrow per action with a significant probability
            // (the initial uniform 0.25 therefore shows all four arrows).
            for (const [action, prob] of Object.entries(policyNetwork[stateKey])) {
              if (prob > 0.2) {
                const arrow = document.createElement("div");
                arrow.classList.add("arrow", `arrow-${action}`);
                arrow.style.opacity = Math.min(1, prob * 2); // Scale opacity with probability
                cell.appendChild(arrow);
              }
            }

            // Map the value estimate from an assumed [-10, 10] range onto an
            // alpha in [0, 1]; clamp explicitly so out-of-range values still
            // produce a well-formed colour.
            const value = valueNetwork[stateKey];
            const alpha = Math.max(0, Math.min(1, (value + 10) / 20));
            cell.style.backgroundColor = `rgba(236, 240, 241, ${alpha})`;

            policyGrid.appendChild(cell);
          }
        }
      }

      // Click handler for grid cells. While obstacle-placement mode is on, a
      // click toggles an obstacle on the clicked cell (never on the agent's or
      // the goal's cell) and redraws both grids; otherwise it does nothing.
      function handleCellClick(event) {
        const { x: rawX, y: rawY } = event.currentTarget.dataset;
        const x = parseInt(rawX);
        const y = parseInt(rawY);

        if (!placingObstacles) {
          return;
        }

        // The agent and goal cells must stay free.
        const onAgent = x === agentPos.x && y === agentPos.y;
        const onGoal = x === goalPos.x && y === goalPos.y;
        if (onAgent || onGoal) {
          return;
        }

        // Toggle: remove the obstacle if one is already here, else add one.
        const existing = obstacles.findIndex((o) => o.x === x && o.y === y);
        if (existing !== -1) {
          obstacles.splice(existing, 1);
        } else {
          obstacles.push({ x, y });
        }

        renderGrid();
        renderPolicy();
      }

      // Flip obstacle-placement mode and update the toggle button's label and
      // colour to match (red while placing, blue otherwise).
      function toggleObstaclePlacement() {
        placingObstacles = !placingObstacles;

        const toggleButton = document.getElementById("place-obstacle-btn");
        if (placingObstacles) {
          toggleButton.textContent = "Done Placing";
          toggleButton.style.backgroundColor = "#e74c3c";
        } else {
          toggleButton.textContent = "Place Obstacles";
          toggleButton.style.backgroundColor = "#3498db";
        }
      }

      // Return true when the cell (x, y) is currently listed as an obstacle.
      function isObstacle(x, y) {
        for (const blocked of obstacles) {
          if (blocked.x === x && blocked.y === y) {
            return true;
          }
        }
        return false;
      }

      // Return the whole simulation to its initial state: rebuild the
      // environment (which also clears obstacles and re-initializes the
      // policy/value tables), drop all training history, and put the controls
      // back into their "not training" state.
      function resetEnvironment() {
        initializeEnvironment();

        // Forget collected metrics and restart the episode count.
        episodeRewards = [];
        policyLosses = [];
        valueLosses = [];
        episode = 0;
        updateEpisodeCounter();
        updateReward(0);

        // Stop any running training loop and restore the idle button state.
        isTraining = false;
        const startButton = document.getElementById("start-btn");
        startButton.textContent = "Start Training";
        const stepButton = document.getElementById("step-btn");
        stepButton.disabled = true;

        // Charts are not cleared here; the original only left a placeholder
        // note that a real implementation would update them.

        logMessage("Environment reset. Ready for training!");
      }

      // Toggle automatic training (handler for the Start/Stop button).
      // - While training: request a stop; the loop exits after the episode in
      //   progress because it re-checks isTraining between episodes.
      // - While idle: start the training loop. If the previous run already
      //   reached maxEpisodes, the environment is reset first so a new run can
      //   begin (this also clears obstacles, as resetEnvironment() rebuilds the
      //   environment).
      function startTraining() {
        const startButton = document.getElementById("start-btn");
        const stepButton = document.getElementById("step-btn");

        if (isTraining) {
          // Stop training
          isTraining = false;
          startButton.textContent = "Start Training";
          stepButton.disabled = true;
          return;
        }

        // The reset must happen BEFORE the training state is switched on:
        // resetEnvironment() sets isTraining = false and restores the idle
        // button state, so resetting afterwards would cancel the run that was
        // just requested.
        if (episode >= maxEpisodes) {
          resetEnvironment();
        }

        // Start training
        isTraining = true;
        startButton.textContent = "Stop Training";
        stepButton.disabled = false;

        runTrainingLoop();
      }

      // Run exactly one episode (handler for the "Step Forward" button), then
      // refresh the training-progress display. Once maxEpisodes is reached it
      // only logs a hint to reset.
      async function stepTraining() {
        if (episode < maxEpisodes) {
          // runEpisode() is async and increments `episode` only when it
          // finishes; wait for it so the progress display reflects the episode
          // that just ran instead of the previous count.
          await runEpisode();
          updateTrainingProgress();
        } else {
          logMessage("Training complete! Reset to train again.");
        }
      }

      // Run episodes back to back until training is stopped or maxEpisodes is
      // reached, pausing 200 ms between episodes so progress stays visible.
      // When the episode limit is hit, log completion and return the Start
      // button to its idle label.
      async function runTrainingLoop() {
        const pause = (ms) => new Promise((resolve) => setTimeout(resolve, ms));

        while (isTraining && episode < maxEpisodes) {
          await runEpisode();
          updateTrainingProgress();
          await pause(200);
        }

        // Leaving the loop because of a manual stop needs no further action.
        if (episode >= maxEpisodes) {
          logMessage("Training complete!");
          isTraining = false;
          const startButton = document.getElementById("start-btn");
          startButton.textContent = "Start Training";
        }
      }

      // Play one full episode and learn from it:
      //   1. reset the agent and the per-episode counters,
      //   2. decay the exploration rate,
      //   3. snapshot the current policy for the PPO ratio,
      //   4. step until the episode ends or the step limit is reached,
      //   5. run a PPO update on the collected transitions,
      //   6. refresh the UI and log a summary line.
      // Resolves (with no value) about 10 ms after the episode is finished.
      async function runEpisode() {
        const pause = (ms) => new Promise((resolve) => setTimeout(resolve, ms));

        // Fresh episode state.
        agentPos = { x: 0, y: 0 };
        episodeSteps = 0;
        totalReward = 0;
        trajectories = [];

        // Exploration shrinks by 1% per completed episode, never below 0.05.
        explorationRate = Math.max(0.05, 0.2 * Math.pow(0.99, episode));

        renderGrid();
        updateReward(totalReward);

        // Deep copy, so later policy updates do not alter the snapshot.
        oldPolicy = JSON.parse(JSON.stringify(policyNetwork));

        let finished = false;
        while (!finished && episodeSteps < maxStepsPerEpisode) {
          finished = await executeStep();
          episodeSteps++;

          // Half of the configured animation delay between steps.
          await pause(animationSpeeds[animationSpeed] / 2);
        }

        episodeRewards.push(totalReward);

        // Only learn when at least one transition was collected.
        if (trajectories.length > 0) {
          const losses = updatePPO();
          policyLosses.push(losses[0]);
          valueLosses.push(losses[1]);
        }

        renderPolicy();
        episode++;
        updateEpisodeCounter();

        const rewardText = totalReward.toFixed(2);
        const explorationText = explorationRate.toFixed(2);
        logMessage(
          `Episode ${episode}: Reward=${rewardText}, Steps=${episodeSteps}, Exploration=${explorationText}`
        );

        return pause(10);
      }

      // Take one environment step: sample an action for the agent's current
      // state, move, compute the reward, redraw the grid, and record the
      // transition in `trajectories`.
      // Resolves to true when the episode should end (goal reached, or the
      // agent is standing on an obstacle cell), false otherwise.
      async function executeStep() {
        const stateKey = `${agentPos.x},${agentPos.y}`;
        const policy = policyNetwork[stateKey];

        // Choose action based on policy
        const action = sampleAction(policy);

        // Store old position
        const oldPos = { ...agentPos };

        // Move agent. moveAgent() returns false when the move was blocked by
        // an obstacle (it has then already put the agent back).
        const moved = moveAgent(action);

        // Calculate reward. The blocked/unblocked flag must be passed on:
        // without it calculateReward() sees `undefined`, treats every move as
        // blocked, and returns -1 for every non-goal step.
        const reward = calculateReward(oldPos, moved);
        totalReward += reward;
        updateReward(totalReward);

        // Check if episode is done
        const onObstacle = isObstacle(agentPos.x, agentPos.y);
        const done =
          (agentPos.x === goalPos.x && agentPos.y === goalPos.y) || onObstacle;

        // If agent is on an obstacle cell, move it back for visualization
        if (onObstacle) {
          agentPos = { ...oldPos };
        }

        // Render the grid
        renderGrid();

        // Store trajectory
        const newStateKey = `${agentPos.x},${agentPos.y}`;
        trajectories.push({
          state: stateKey,
          action,
          reward,
          nextState: newStateKey,
          done,
        });

        return done;
      }

      // Pick an action name for a state.
      // With probability `explorationRate` the choice is uniform over the
      // policy's actions; otherwise it is sampled from the policy's
      // probabilities by walking the cumulative distribution.
      // `policy` maps action names to probabilities.
      function sampleAction(policy) {
        const explore = Math.random() < explorationRate;
        const actions = Object.keys(policy);

        if (explore) {
          return actions[Math.floor(Math.random() * actions.length)];
        }

        const threshold = Math.random();
        let cumulative = 0;
        for (const candidate of actions) {
          cumulative += policy[candidate];
          if (threshold < cumulative) {
            return candidate;
          }
        }

        // Reached only when the probabilities sum to less than the drawn
        // threshold; fall back to the last action.
        return actions[actions.length - 1];
      }

      // Apply `action` ("up", "right", "down" or "left") to the agent, keeping
      // it inside the grid. If the resulting cell is an obstacle the agent is
      // put back where it started.
      // Returns false when the move was blocked by an obstacle, true otherwise
      // (including moves that were merely clamped at the grid edge).
      function moveAgent(action) {
        const startX = agentPos.x;
        const startY = agentPos.y;
        const lastIndex = GRID_SIZE - 1;

        // y grows downward: "up" decreases y, "down" increases it.
        if (action === "up") {
          agentPos.y = Math.max(0, startY - 1);
        } else if (action === "right") {
          agentPos.x = Math.min(lastIndex, startX + 1);
        } else if (action === "down") {
          agentPos.y = Math.min(lastIndex, startY + 1);
        } else if (action === "left") {
          agentPos.x = Math.max(0, startX - 1);
        }

        if (!isObstacle(agentPos.x, agentPos.y)) {
          return true;
        }

        // Blocked: undo the move.
        agentPos.x = startX;
        agentPos.y = startY;
        return false;
      }

      // Reward for the step that moved the agent from `oldPos` to the current
      // `agentPos`.
      //   oldPos             - { x, y } position before the step
      //   movementSuccessful - false when the move was blocked by an obstacle.
      //                        Defaults to true, so a caller that omits it is
      //                        not penalised as if every move had been blocked.
      // Returns:
      //   10                  when the agent is on the goal,
      //   -1                  when the move was blocked,
      //   -0.1 + 0.3          when the Manhattan distance to the goal shrank,
      //   -0.1 - 0.1          otherwise.
      function calculateReward(oldPos, movementSuccessful = true) {
        // Reward for reaching goal
        if (agentPos.x === goalPos.x && agentPos.y === goalPos.y) {
          return 10;
        }

        // Penalty for attempting to move into an obstacle (but not actually moving into it)
        if (!movementSuccessful) {
          return -1; // Reduced penalty to avoid too much negative learning
        }

        // Small penalty for each step to encourage efficiency
        const stepPenalty = -0.1;

        // Small reward for getting closer to goal (using Manhattan distance)
        const oldDistance =
          Math.abs(oldPos.x - goalPos.x) + Math.abs(oldPos.y - goalPos.y);
        const newDistance =
          Math.abs(agentPos.x - goalPos.x) + Math.abs(agentPos.y - goalPos.y);
        const proximityReward = oldDistance > newDistance ? 0.3 : -0.1; // Stronger reward for progress

        return stepPenalty + proximityReward;
      }

      /**
       * Performs one PPO update using the transitions currently stored in
       * `trajectories`.
       *
       * Pipeline:
       *   1. Re-read clip ratio, learning rate and epoch count from their sliders.
       *   2. Walk the trajectory backwards to compute discounted returns and
       *      GAE advantages (both reset at transitions flagged `done`).
       *   3. Normalize the advantages (subtract mean, divide by std).
       *   4. For `ppoEpochs` passes over the data, move `valueNetwork[state]`
       *      toward the return and adjust `policyNetwork[state][action]`,
       *      skipping the policy step whenever the clipped objective is the
       *      smaller (active) branch.
       *
       * Side effects: assigns the globals `clipRatio`, `learningRate` and
       * `ppoEpochs`, mutates `valueNetwork` and `policyNetwork` in place, and
       * logs a summary line every 5th episode.
       *
       * @returns {number[]} `[avgPolicyLoss, avgValueLoss]`, averaged over every
       *   (epoch, step) pair; `[0, 0]` when no update step was performed.
       */
      function updatePPO() {
        // Get parameters from sliders
        clipRatio = parseFloat(document.getElementById("clip-ratio").value);
        learningRate = parseFloat(
          document.getElementById("learning-rate").value
        );
        // Explicit radix so the slider string is always read as decimal.
        ppoEpochs = parseInt(document.getElementById("epochs").value, 10);

        const stepCount = trajectories.length;
        if (stepCount === 0) {
          // Nothing was collected: avoid 0/0 in the statistics below.
          return [0, 0];
        }

        // Compute returns (discounted sum of future rewards).
        // Filled by index instead of unshift() to keep the pass linear.
        const returns = new Array(stepCount);
        let discountedReturn = 0;
        for (let i = stepCount - 1; i >= 0; i--) {
          const transition = trajectories[i];
          discountedReturn =
            transition.reward +
            gamma * (transition.done ? 0 : discountedReturn);
          returns[i] = discountedReturn;
        }

        // Compute advantages using Generalized Advantage Estimation (GAE)
        const advantages = new Array(stepCount);
        let lastGaeAdvantage = 0;
        for (let i = stepCount - 1; i >= 0; i--) {
          const transition = trajectories[i];
          const currentValue = valueNetwork[transition.state];
          const nextValue = transition.done
            ? 0
            : valueNetwork[transition.nextState];

          // TD error
          const delta = transition.reward + gamma * nextValue - currentValue;

          // GAE recursion. A terminal transition must not inherit the
          // advantage accumulated from whatever follows it in the buffer,
          // mirroring how the returns loop above resets on `done`.
          lastGaeAdvantage =
            delta +
            gamma * lambda * (transition.done ? 0 : lastGaeAdvantage);
          advantages[i] = lastGaeAdvantage;
        }

        // Normalize advantages for more stable learning
        const meanAdvantage =
          advantages.reduce((sum, adv) => sum + adv, 0) / stepCount;
        const stdAdvantage =
          Math.sqrt(
            advantages.reduce(
              (sum, adv) => sum + Math.pow(adv - meanAdvantage, 2),
              0
            ) / stepCount
          ) || 1; // Avoid division by zero

        for (let i = 0; i < stepCount; i++) {
          advantages[i] =
            (advantages[i] - meanAdvantage) / (stdAdvantage + 1e-8);
        }

        // Rescales a probability table in place so its entries sum to 1.
        const normalizeInPlace = (probs) => {
          const total = Object.values(probs).reduce((sum, p) => sum + p, 0);
          for (const a in probs) {
            probs[a] /= total;
          }
        };

        // Store losses for metrics
        let totalPolicyLoss = 0;
        let totalValueLoss = 0;

        // Frozen copy of the policy as it was before this update; it is the
        // denominator of the PPO probability ratio for every epoch below.
        const oldPolicySnapshot = JSON.parse(JSON.stringify(policyNetwork));

        // Multiple epochs of optimization on the same data (key PPO feature)
        for (let epoch = 0; epoch < ppoEpochs; epoch++) {
          for (let i = 0; i < stepCount; i++) {
            const { state: stateKey, action } = trajectories[i];
            const actionProbs = policyNetwork[stateKey];

            // Probability ratio between the current and the pre-update policy
            const oldActionProb = oldPolicySnapshot[stateKey][action];
            const currentActionProb = actionProbs[action];
            const ratio = currentActionProb / Math.max(oldActionProb, 1e-8);

            const advantage = advantages[i];

            // Unclipped and clipped surrogate objectives
            const unclippedObjective = ratio * advantage;
            const clippedRatio = Math.max(
              Math.min(ratio, 1 + clipRatio),
              1 - clipRatio
            );
            const clippedObjective = clippedRatio * advantage;

            // PPO's clipped surrogate objective (core of PPO)
            const surrogateObjective = Math.min(
              unclippedObjective,
              clippedObjective
            );

            // The objective is maximized, so the reported loss is its negative
            totalPolicyLoss += -surrogateObjective;

            // Value loss (using returns as targets)
            const valueTarget = returns[i];
            const valuePrediction = valueNetwork[stateKey];
            totalValueLoss +=
              0.5 * Math.pow(valueTarget - valuePrediction, 2);

            // Move the value estimate toward its target
            valueNetwork[stateKey] +=
              learningRate * (valueTarget - valuePrediction);

            // When the clipped branch is the smaller one the step is
            // suppressed entirely; otherwise step along the advantage.
            const useClippedObjective = unclippedObjective > clippedObjective;
            const policyGradient =
              learningRate * advantage * (useClippedObjective ? 0 : 1);

            // Raise the taken action's probability for positive advantage,
            // lower it for negative; floor at 0.01 so the ratio stays finite.
            actionProbs[action] = Math.max(
              actionProbs[action] + policyGradient,
              0.01
            );
            normalizeInPlace(actionProbs);

            // Periodic exploration nudge: blend 5% of a uniform distribution
            // into this state's policy on every 5th transition.
            if (i % 5 === 0) {
              // 1 / (number of actions) replaces the literal 0.25, which is
              // the same value for a four-action policy.
              const uniformProb = 1 / Object.keys(actionProbs).length;
              for (const a in actionProbs) {
                actionProbs[a] = 0.95 * actionProbs[a] + 0.05 * uniformProb;
              }
              normalizeInPlace(actionProbs);
            }
          }
        }

        // Calculate average losses; guard against a zero or invalid epoch
        // count so the metrics never become NaN.
        const updateCount = stepCount * ppoEpochs;
        const avgPolicyLoss =
          updateCount > 0 ? totalPolicyLoss / updateCount : 0;
        const avgValueLoss =
          updateCount > 0 ? totalValueLoss / updateCount : 0;

        // Log progress periodically
        if (episode % 5 === 0) {
          logMessage(
            `Episode ${episode}: Average Policy Loss = ${avgPolicyLoss.toFixed(
              4
            )}, Value Loss = ${avgValueLoss.toFixed(4)}`
          );
        }

        return [avgPolicyLoss, avgValueLoss];
      }

      /**
       * Shows `reward`, formatted to two decimals, in the "reward-value" element.
       *
       * @param {number} reward - Value to display.
       */
      function updateReward(reward) {
        const rewardLabel = document.getElementById("reward-value");
        rewardLabel.textContent = reward.toFixed(2);
      }

      /**
       * Refreshes the "Episodes: x / y" label and sizes the progress bar to
       * the fraction of `maxEpisodes` that `episode` has reached.
       */
      function updateEpisodeCounter() {
        const counterLabel = document.getElementById("episode-counter");
        counterLabel.textContent = `Episodes: ${episode} / ${maxEpisodes}`;

        const progressBar = document.getElementById("training-progress");
        const percentDone = (episode / maxEpisodes) * 100;
        progressBar.style.width = `${percentDone}%`;
      }

      /**
       * Refreshes the training progress UI. Currently this only delegates to
       * `updateEpisodeCounter()`; chart updates are not implemented here.
       */
      function updateTrainingProgress() {
        // Placeholder: a fuller implementation would redraw charts here.

        // Episode label and progress bar
        updateEpisodeCounter();
      }

      /**
       * Copies a slider's current value into its "<id>-value" display element
       * and into the matching hyperparameter global.
       *
       * @param {string} id - Element id of the slider. The ids "clip-ratio",
       *   "learning-rate" and "epochs" additionally update `clipRatio`,
       *   `learningRate` and `ppoEpochs`; any other id only updates the label.
       */
      function updateSliderValue(id) {
        const slider = document.getElementById(id);
        const valueDisplay = document.getElementById(`${id}-value`);
        valueDisplay.textContent = slider.value;

        // Update corresponding variables
        switch (id) {
          case "clip-ratio":
            clipRatio = parseFloat(slider.value);
            break;
          case "learning-rate":
            learningRate = parseFloat(slider.value);
            break;
          case "epochs":
            // Explicit radix: always parse the slider string as decimal.
            ppoEpochs = parseInt(slider.value, 10);
            break;
          default:
            break;
        }
      }

      /**
       * Appends `message` as a new "log-entry" div to the "log-container"
       * element and scrolls the container to its bottom.
       *
       * @param {string} message - Text to show; assigned via `textContent`, so
       *   it is not interpreted as HTML.
       */
      function logMessage(message) {
        const container = document.getElementById("log-container");

        const entry = document.createElement("div");
        entry.className = "log-entry";
        entry.textContent = message;

        container.appendChild(entry);
        // Keep the newest entry in view
        container.scrollTop = container.scrollHeight;
      }

      /**
       * Activates the tab whose content element has the id `tabId`.
       *
       * Removes the "active" class from every ".tab-content" and ".tab-button"
       * element, then adds it to the target content element and to the first
       * tab button whose `onclick` attribute is exactly `openTab('<tabId>')`.
       *
       * @param {string} tabId - Id of the tab content element to show. If no
       *   such element exists the call is a no-op, so the currently visible
       *   tab stays active.
       */
      function openTab(tabId) {
        const targetContent = document.getElementById(tabId);
        if (!targetContent) {
          // Unknown tab: bail out before deactivating anything.
          return;
        }

        // Hide all tab contents
        Array.from(document.getElementsByClassName("tab-content")).forEach(
          (content) => content.classList.remove("active")
        );

        // Remove active class from tab buttons
        const tabButtons = Array.from(
          document.getElementsByClassName("tab-button")
        );
        tabButtons.forEach((button) => button.classList.remove("active"));

        // Show selected tab content
        targetContent.classList.add("active");

        // Mark the matching button as active. Comparing the attribute value
        // directly avoids building a CSS selector from `tabId`, which would
        // throw if the id contained a quote character.
        const expectedHandler = `openTab('${tabId}')`;
        const activeButton = tabButtons.find(
          (button) => button.getAttribute("onclick") === expectedHandler
        );
        if (activeButton) {
          activeButton.classList.add("active");
        }
      }

      /**
       * Fills the popup with a title and body, then reveals it together with
       * its overlay.
       *
       * @param {string} title - Plain text for the "popup-title" element.
       * @param {string} content - Markup for the "popup-content" element. It
       *   is assigned through `innerHTML`, so it is parsed as HTML; pass only
       *   trusted strings.
       */
      function showPopup(title, content) {
        const byId = (elementId) => document.getElementById(elementId);

        byId("popup-title").textContent = title;
        byId("popup-content").innerHTML = content;

        // Overlay first, then the popup itself
        ["popup-overlay", "popup"].forEach((elementId) => {
          byId(elementId).style.display = "block";
        });
      }

      /**
       * Hides the popup overlay and the popup by setting their `display`
       * style to "none".
       */
      function closePopup() {
        for (const elementId of ["popup-overlay", "popup"]) {
          document.getElementById(elementId).style.display = "none";
        }
      }

      // Initialize the environment when the page loads
      window.onload = () => {
        // Delay before the welcome popup appears, in milliseconds
        const welcomeDelayMs = 1000;
        const welcomeTitle = "Welcome to PPO Simulation";
        // Passed to showPopup as the popup body markup
        const welcomeBody = `
                    <p>This simulation demonstrates Proximal Policy Optimization (PPO), a reinforcement learning algorithm.</p>
                    <p>In this grid world:</p>
                    <ul>
                        <li>The agent (blue circle) must learn to navigate to the goal (green square)</li>
                        <li>You can place obstacles (red squares) by clicking the "Place Obstacles" button</li>
                        <li>The agent receives rewards for approaching the goal and penalties for hitting obstacles</li>
                        <li>PPO helps the agent learn efficiently by preventing large policy updates</li>
                    </ul>
                    <p>Try experimenting with different parameters to see how they affect learning!</p>
                `;

        // Set up the environment and tell the user how to start
        initializeEnvironment();
        logMessage('Environment initialized. Click "Start Training" to begin!');

        // Show the concept popup shortly after load
        setTimeout(() => {
          showPopup(welcomeTitle, welcomeBody);
        }, welcomeDelayMs);
      };
      // Animation speed control
      // Delay in milliseconds (used as a setTimeout duration) per speed name
      const animationSpeeds = {
        slow: 300,
        normal: 100,
        fast: 20,
      };
      // Name of the currently selected speed; used as a key into animationSpeeds
      let animationSpeed = "normal";

      /**
       * Advances `animationSpeed` to the next setting in the cycle
       * slow -> normal -> fast -> slow and updates the label of the
       * "animation-speed-btn" button. Any value other than "slow" or
       * "normal" is followed by "slow".
       */
      function toggleAnimationSpeed() {
        const speedBtn = document.getElementById("animation-speed-btn");

        let nextSpeed;
        let nextLabel;
        switch (animationSpeed) {
          case "slow":
            nextSpeed = "normal";
            nextLabel = "Animation Speed: Normal";
            break;
          case "normal":
            nextSpeed = "fast";
            nextLabel = "Animation Speed: Fast";
            break;
          default:
            nextSpeed = "slow";
            nextLabel = "Animation Speed: Slow";
        }

        animationSpeed = nextSpeed;
        speedBtn.textContent = nextLabel;
      }

      // Update animation speed in relevant functions
      /**
       * Runs episodes one after another while `isTraining` is set and
       * `episode` is below `maxEpisodes`, refreshing the progress UI and
       * pausing for the current animation delay after each episode.
       *
       * Once the episode limit is reached it logs completion, clears
       * `isTraining` and resets the start button label.
       *
       * @returns {Promise<void>} Resolves when the loop stops.
       */
      async function runTrainingLoop() {
        // The delay is looked up on every call so a speed change applies
        // from the next pause onward.
        const pauseBetweenEpisodes = () =>
          new Promise((wake) =>
            setTimeout(wake, animationSpeeds[animationSpeed])
          );

        while (isTraining && episode < maxEpisodes) {
          await runEpisode();
          updateTrainingProgress();
          await pauseBetweenEpisodes();
        }

        // Loop ended without reaching the limit: nothing more to do
        if (!(episode >= maxEpisodes)) {
          return;
        }

        logMessage("Training complete!");
        isTraining = false;
        document.getElementById("start-btn").textContent = "Start Training";
      }

      /**
       * Executes a single agent step: samples an action from the policy of
       * the current cell, moves the agent, scores the move, redraws the grid
       * and appends the transition to `trajectories`.
       *
       * Waits half of the current animation delay before resolving.
       *
       * @returns {Promise<boolean>} `true` when the agent is on the goal cell
       *   after the move.
       */
      async function executeStep() {
        // State keys have the form "x,y"
        const keyOf = (pos) => `${pos.x},${pos.y}`;

        const stateKey = keyOf(agentPos);

        // Sample an action from this state's policy
        const action = sampleAction(policyNetwork[stateKey]);

        // Remember where the agent was, then try to move it
        const previousPos = { ...agentPos };
        const moved = moveAgent(action);

        // Score the move and update the running total shown in the UI
        const reward = calculateReward(previousPos, moved);
        totalReward += reward;
        updateReward(totalReward);

        // The episode ends when the agent sits on the goal
        const done = agentPos.x === goalPos.x && agentPos.y === goalPos.y;

        renderGrid();

        // Record the transition for the next policy update
        trajectories.push({
          state: stateKey,
          action,
          reward,
          nextState: keyOf(agentPos),
          done,
        });

        // Per-step pause is half of the per-episode delay
        const stepDelay = animationSpeeds[animationSpeed] / 2;
        await new Promise((wake) => setTimeout(wake, stepDelay));

        return done;
      }
    </script>

    <footer
      style="
        text-align: center;
        margin-top: 30px;
        padding: 15px;
        background-color: #f8f9fa;
        border-top: 1px solid #ddd;
      "
    >
      &copy; 2025 Pejman Ebrahimi - All Rights Reserved
    </footer>
  </body>
</html>