<!-- PPO Simulation By Pejman Ebrahimi -->
<!DOCTYPE html>
<html lang="en">
<head>
<meta charset="UTF-8" />
<meta name="viewport" content="width=device-width, initial-scale=1.0" />
<title>PPO Reinforcement Learning Simulation</title>
<style>
body {
font-family: Arial, sans-serif;
margin: 0;
padding: 20px;
line-height: 1.6;
color: #333;
background-color: #f8f9fa;
}
.container {
max-width: 1000px;
margin: 0 auto;
background-color: white;
padding: 20px;
border-radius: 8px;
box-shadow: 0 2px 10px rgba(0, 0, 0, 0.1);
}
h1,
h2,
h3 {
color: #2c3e50;
}
h1 {
text-align: center;
margin-bottom: 30px;
border-bottom: 2px solid #3498db;
padding-bottom: 10px;
}
.grid-container {
display: grid;
grid-template-columns: repeat(10, 1fr);
gap: 2px;
margin: 20px 0;
}
.cell {
width: 100%;
aspect-ratio: 1;
background-color: #ecf0f1;
display: flex;
align-items: center;
justify-content: center;
cursor: pointer;
position: relative;
transition: all 0.3s;
}
.agent {
background-color: #3498db;
border-radius: 50%;
width: 80%;
height: 80%;
position: absolute;
}
.goal {
background-color: #2ecc71;
width: 80%;
height: 80%;
position: absolute;
}
.obstacle {
background-color: #e74c3c;
width: 80%;
height: 80%;
position: absolute;
}
.panel {
background-color: #f5f7f9;
padding: 15px;
border-radius: 5px;
margin-bottom: 20px;
border: 1px solid #ddd;
}
.controls {
display: flex;
gap: 10px;
flex-wrap: wrap;
margin: 20px 0;
}
button {
padding: 8px 15px;
background-color: #3498db;
color: white;
border: none;
border-radius: 4px;
cursor: pointer;
transition: background-color 0.3s;
}
button:hover {
background-color: #2980b9;
}
button:disabled {
background-color: #95a5a6;
cursor: not-allowed;
}
.sliders {
display: flex;
flex-direction: column;
gap: 10px;
margin: 15px 0;
}
.slider-container {
display: flex;
align-items: center;
}
.slider-container label {
flex: 1;
min-width: 180px;
}
.slider-container input {
flex: 2;
}
.slider-value {
flex: 0 0 50px;
text-align: right;
}
#log-container {
max-height: 200px;
overflow-y: auto;
background-color: #2c3e50;
color: #ecf0f1;
padding: 10px;
border-radius: 4px;
margin-top: 20px;
font-family: monospace;
}
.log-entry {
margin: 5px 0;
}
.tab-container {
margin-top: 20px;
}
.tab-buttons {
display: flex;
border-bottom: 1px solid #ddd;
}
.tab-button {
padding: 10px 20px;
background-color: #f1f1f1;
border: none;
cursor: pointer;
transition: background-color 0.3s;
}
.tab-button.active {
background-color: #3498db;
color: white;
}
.tab-content {
display: none;
padding: 15px;
border: 1px solid #ddd;
border-top: none;
animation: fadeIn 0.5s;
}
.tab-content.active {
display: block;
}
#policy-display {
width: 100%;
height: 300px;
overflow: auto;
margin-top: 10px;
}
.policy-grid {
display: grid;
grid-template-columns: repeat(10, 1fr);
gap: 2px;
}
.policy-cell {
aspect-ratio: 1;
border: 1px solid #ddd;
padding: 2px;
font-size: 10px;
display: flex;
flex-direction: column;
align-items: center;
justify-content: center;
}
.arrow {
width: 0;
height: 0;
border-style: solid;
margin: 2px;
}
.arrow-up {
border-width: 0 4px 8px 4px;
border-color: transparent transparent #3498db transparent;
}
.arrow-right {
border-width: 4px 0 4px 8px;
border-color: transparent transparent transparent #3498db;
}
.arrow-down {
border-width: 8px 4px 0 4px;
border-color: #3498db transparent transparent transparent;
}
.arrow-left {
border-width: 4px 8px 4px 0;
border-color: transparent #3498db transparent transparent;
}
.progress-container {
margin-top: 10px;
background-color: #f1f1f1;
border-radius: 5px;
height: 20px;
position: relative;
}
.progress-bar {
height: 100%;
background-color: #3498db;
border-radius: 5px;
width: 0%;
transition: width 0.3s;
}
.chart-container {
height: 300px;
margin: 15px 0;
}
@keyframes fadeIn {
from {
opacity: 0;
}
to {
opacity: 1;
}
}
.popup {
display: none;
position: fixed;
top: 50%;
left: 50%;
transform: translate(-50%, -50%);
background-color: white;
padding: 20px;
border-radius: 8px;
box-shadow: 0 4px 20px rgba(0, 0, 0, 0.2);
z-index: 1000;
max-width: 80%;
max-height: 80%;
overflow-y: auto;
}
.popup-overlay {
display: none;
position: fixed;
top: 0;
left: 0;
width: 100%;
height: 100%;
background-color: rgba(0, 0, 0, 0.5);
z-index: 999;
}
.reward-display {
font-weight: bold;
font-size: 1.2em;
text-align: center;
margin: 10px 0;
}
.explanation {
background-color: #e8f4fc;
padding: 15px;
border-radius: 5px;
margin: 10px 0;
border-left: 4px solid #3498db;
}
.highlight {
background-color: #fffacd;
padding: 2px 4px;
border-radius: 3px;
}
.concept-box {
border: 1px solid #ddd;
margin: 15px 0;
border-radius: 5px;
overflow: hidden;
}
.concept-title {
background-color: #3498db;
color: white;
padding: 10px;
margin: 0;
}
.concept-content {
padding: 15px;
}
</style>
</head>
<body>
<div class="container">
<h1>Proximal Policy Optimization (PPO) Simulation</h1>
<div class="explanation">
<p>
This simulation demonstrates how an agent learns to navigate to a goal
using <strong>Proximal Policy Optimization (PPO)</strong>. PPO is an
on-policy reinforcement learning algorithm that uses a "clipping"
mechanism to prevent large policy updates, making training more stable
and efficient.
</p>
</div>
<div class="tab-container">
<div class="tab-buttons">
<button class="tab-button active" onclick="openTab('simulation-tab')">
Simulation
</button>
<button class="tab-button" onclick="openTab('concepts-tab')">
PPO Concepts
</button>
<button class="tab-button" onclick="openTab('metrics-tab')">
Training Metrics
</button>
</div>
<div id="simulation-tab" class="tab-content active">
<div class="panel">
<h3>Environment</h3>
<p>
The agent (blue) must navigate to the goal (green) while avoiding
obstacles (red).
</p>
<div class="grid-container" id="grid"></div>
<div class="reward-display">
Total Reward: <span id="reward-value">0</span>
</div>
</div>
<div class="controls">
<button id="start-btn" onclick="startTraining()">
Start Training
</button>
<button id="reset-btn" onclick="resetEnvironment()">
Reset Environment
</button>
<button id="step-btn" onclick="stepTraining()" disabled>
Step Forward
</button>
<button id="place-obstacle-btn" onclick="toggleObstaclePlacement()">
Place Obstacles
</button>
<button id="animation-speed-btn" onclick="toggleAnimationSpeed()">
Animation Speed: Normal
</button>
</div>
<div class="panel">
<h3>PPO Parameters</h3>
<div class="sliders">
<div class="slider-container">
<label for="clip-ratio">Clip Ratio (ε):</label>
<input
type="range"
id="clip-ratio"
min="0.05"
max="0.5"
step="0.05"
value="0.2"
oninput="updateSliderValue('clip-ratio')"
/>
<span class="slider-value" id="clip-ratio-value">0.2</span>
</div>
<div class="slider-container">
<label for="learning-rate">Learning Rate:</label>
<input
type="range"
id="learning-rate"
min="0.01"
max="1"
step="0.01"
value="0.1"
oninput="updateSliderValue('learning-rate')"
/>
<span class="slider-value" id="learning-rate-value">0.1</span>
</div>
<div class="slider-container">
<label for="epochs">PPO Epochs per Update:</label>
<input
type="range"
id="epochs"
min="1"
max="10"
step="1"
value="4"
oninput="updateSliderValue('epochs')"
/>
<span class="slider-value" id="epochs-value">4</span>
</div>
</div>
</div>
<div class="panel">
<h3>Policy Visualization</h3>
<p>
This shows the current policy of the agent (arrows indicate
preferred actions in each state).
</p>
<div id="policy-display">
<div class="policy-grid" id="policy-grid"></div>
</div>
</div>
<div id="log-container"></div>
</div>
<div id="concepts-tab" class="tab-content">
<div class="concept-box">
<h3 class="concept-title">What is PPO?</h3>
<div class="concept-content">
<p>
Proximal Policy Optimization (PPO) is a policy gradient method
for reinforcement learning developed by OpenAI in 2017. It has
become one of the most popular RL algorithms due to its
simplicity and effectiveness.
</p>
<p>PPO aims to balance two objectives:</p>
<ul>
<li>Improving the agent's policy to maximize rewards</li>
<li>
Preventing large policy updates that could destabilize
training
</li>
</ul>
</div>
</div>
<div class="concept-box">
<h3 class="concept-title">Key Innovations in PPO</h3>
<div class="concept-content">
<p>
The central innovation in PPO is the
<strong>clipped surrogate objective function</strong>:
</p>
<p style="text-align: center">
L<sup>CLIP</sup>(θ) = E[min(r<sub>t</sub>(θ)A<sub>t</sub>,
clip(r<sub>t</sub>(θ), 1-ε, 1+ε)A<sub>t</sub>)]
</p>
<p>where:</p>
<ul>
<li>
<strong>r<sub>t</sub>(θ)</strong> is the ratio of
probabilities under new and old policies
</li>
<li>
<strong>A<sub>t</sub></strong> is the advantage estimate
</li>
<li>
<strong>ε</strong> is the clipping parameter (usually 0.1 or
0.2)
</li>
</ul>
<p>
The clipping mechanism ensures that the policy update stays
within a "trust region" by limiting how much the new policy can
deviate from the old one.
</p>
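<p>
For illustration, here is a minimal JavaScript sketch of the clipped
objective for a single transition (the variable names are illustrative;
the simulation's <code>updatePPO</code> function applies the same
pattern):
</p>
<pre style="background-color: #f5f7f9; padding: 10px; border-radius: 4px; overflow-x: auto;"><code>// Probability ratio between the new and old policy for the taken action
const ratio = newProb / oldProb;
// Clip the ratio to [1 - epsilon, 1 + epsilon]
const clippedRatio = Math.min(Math.max(ratio, 1 - epsilon), 1 + epsilon);
// Clipped surrogate objective: keep the more pessimistic of the two terms
const objective = Math.min(ratio * advantage, clippedRatio * advantage);</code></pre>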
</div>
</div>
<div class="concept-box">
<h3 class="concept-title">How PPO Works in This Simulation</h3>
<div class="concept-content">
<ol>
<li>
The agent collects experience by interacting with the
environment using its current policy
</li>
<li>Advantages are computed for each state-action pair</li>
<li>
The policy is updated using the clipped surrogate objective
</li>
<li>
Multiple optimization epochs are performed on the same batch
of data
</li>
<li>The process repeats with the new policy</li>
</ol>
<p>
You can observe these steps in action in the simulation tab by
watching the policy visualization and training metrics.
</p>
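<p>
The same loop, condensed into a short JavaScript sketch (the helper
names here are purely illustrative; in this page the logic lives in
<code>runEpisode</code> and <code>updatePPO</code>):
</p>
<pre style="background-color: #f5f7f9; padding: 10px; border-radius: 4px; overflow-x: auto;"><code>// One PPO iteration
const batch = collectTrajectories(policy);      // 1. run the current policy
const advantages = computeAdvantages(batch);    // 2. advantage estimates (GAE)
for (let epoch = 0; epoch &lt; ppoEpochs; epoch++) {
  updateWithClippedObjective(batch, advantages); // 3-4. reuse the same batch
}
// 5. repeat with the updated policy</code></pre>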
</div>
</div>
<div class="concept-box">
<h3 class="concept-title">PPO vs. Other RL Algorithms</h3>
<div class="concept-content">
<p>PPO improves upon earlier algorithms in several ways:</p>
<ul>
<li>
<strong>vs. REINFORCE:</strong> More stable training due to
advantage estimation and clipping
</li>
<li>
<strong>vs. TRPO:</strong> Simpler implementation while
maintaining similar performance
</li>
<li>
<strong>vs. A2C/A3C:</strong> Better sample efficiency and
more stable policy updates
</li>
<li>
<strong>vs. Off-policy algorithms (DQN, DDPG):</strong> Less
sensitive to hyperparameters and often more stable
</li>
</ul>
</div>
</div>
</div>
<div id="metrics-tab" class="tab-content">
<div class="panel">
<h3>Training Progress</h3>
<div class="progress-container">
<div class="progress-bar" id="training-progress"></div>
</div>
<p id="episode-counter">Episodes: 0 / 100</p>
</div>
<div class="panel">
<h3>Reward Over Time</h3>
<div class="chart-container" id="reward-chart"></div>
</div>
<div class="panel">
<h3>Policy Loss</h3>
<div class="chart-container" id="policy-loss-chart"></div>
</div>
<div class="panel">
<h3>Value Loss</h3>
<div class="chart-container" id="value-loss-chart"></div>
</div>
</div>
</div>
</div>
<div class="popup-overlay" id="popup-overlay"></div>
<div class="popup" id="popup">
<h2 id="popup-title">Title</h2>
<div id="popup-content">Content</div>
<button onclick="closePopup()">Close</button>
</div>
<script>
// Environment configuration
const GRID_SIZE = 10;
let grid = [];
let agentPos = { x: 0, y: 0 };
let goalPos = { x: 9, y: 9 };
let obstacles = [];
let placingObstacles = false;
// Agent and PPO parameters
let policyNetwork = {};
let valueNetwork = {};
let clipRatio = 0.2;
let learningRate = 0.1; // Default learning rate (0-1 range)
let ppoEpochs = 4;
let gamma = 0.99; // Discount factor
let lambda = 0.95; // GAE parameter
// Training state
let isTraining = false;
let episode = 0;
let maxEpisodes = 100;
let episodeSteps = 0;
let maxStepsPerEpisode = 100; // Increased max steps to allow more exploration
let totalReward = 0;
let episodeRewards = [];
let policyLosses = [];
let valueLosses = [];
// Tracking for visualization
let trajectories = [];
let oldPolicy = {};
// Exploration parameters
let explorationRate = 0.2; // Probability of taking a random action (exploration)
// Initialize the environment
function initializeEnvironment() {
grid = [];
obstacles = [];
// Create the grid UI
const gridContainer = document.getElementById("grid");
gridContainer.innerHTML = "";
for (let y = 0; y < GRID_SIZE; y++) {
for (let x = 0; x < GRID_SIZE; x++) {
const cell = document.createElement("div");
cell.classList.add("cell");
cell.dataset.x = x;
cell.dataset.y = y;
cell.addEventListener("click", handleCellClick);
gridContainer.appendChild(cell);
}
}
// Place agent and goal
agentPos = { x: 0, y: 0 };
goalPos = { x: 9, y: 9 };
renderGrid();
// Initialize policy and value networks
initializeNetworks();
renderPolicy();
updateReward(0);
}
// Initialize policy and value networks
function initializeNetworks() {
policyNetwork = {};
valueNetwork = {};
// Initialize learning rate
learningRate = parseFloat(
document.getElementById("learning-rate").value
);
// Initialize policy and value for each state (cell)
for (let y = 0; y < GRID_SIZE; y++) {
for (let x = 0; x < GRID_SIZE; x++) {
const stateKey = `${x},${y}`;
// Initialize policy with random probabilities
policyNetwork[stateKey] = {
up: 0.25,
right: 0.25,
down: 0.25,
left: 0.25,
};
// Initialize value to zero
valueNetwork[stateKey] = 0;
}
}
}
function renderGrid() {
// Clear all cells
const cells = document.querySelectorAll(".cell");
cells.forEach((cell) => {
cell.innerHTML = "";
});
// Place agent
const agentCell = document.querySelector(
`.cell[data-x="${agentPos.x}"][data-y="${agentPos.y}"]`
);
const agentElement = document.createElement("div");
agentElement.classList.add("agent");
agentCell.appendChild(agentElement);
// Place goal
const goalCell = document.querySelector(
`.cell[data-x="${goalPos.x}"][data-y="${goalPos.y}"]`
);
const goalElement = document.createElement("div");
goalElement.classList.add("goal");
goalCell.appendChild(goalElement);
// Place obstacles
obstacles.forEach((obstacle) => {
const obstacleCell = document.querySelector(
`.cell[data-x="${obstacle.x}"][data-y="${obstacle.y}"]`
);
const obstacleElement = document.createElement("div");
obstacleElement.classList.add("obstacle");
obstacleCell.appendChild(obstacleElement);
});
}
function renderPolicy() {
const policyGrid = document.getElementById("policy-grid");
policyGrid.innerHTML = "";
for (let y = 0; y < GRID_SIZE; y++) {
for (let x = 0; x < GRID_SIZE; x++) {
const cell = document.createElement("div");
cell.classList.add("policy-cell");
const stateKey = `${x},${y}`;
const policy = policyNetwork[stateKey];
// Skip rendering policy for obstacles
if (isObstacle(x, y)) {
cell.style.backgroundColor = "#e74c3c";
policyGrid.appendChild(cell);
continue;
}
// If it's the goal, mark it green
if (x === goalPos.x && y === goalPos.y) {
cell.style.backgroundColor = "#2ecc71";
policyGrid.appendChild(cell);
continue;
}
// Create arrows for each action probability
for (const [action, prob] of Object.entries(policy)) {
if (prob > 0.2) {
// Only show significant probabilities
const arrow = document.createElement("div");
arrow.classList.add("arrow", `arrow-${action}`);
arrow.style.opacity = Math.min(1, prob * 2); // Scale opacity with probability
cell.appendChild(arrow);
}
}
// Add state value indication using background color intensity
const value = valueNetwork[stateKey];
// Normalize to [0, 1], assuming values roughly between -10 and 10
const normalizedValue = Math.max(0, Math.min(1, (value + 10) / 20));
cell.style.backgroundColor = `rgba(236, 240, 241, ${normalizedValue})`;
policyGrid.appendChild(cell);
}
}
}
function handleCellClick(event) {
const x = parseInt(event.currentTarget.dataset.x);
const y = parseInt(event.currentTarget.dataset.y);
if (placingObstacles) {
// Don't allow obstacles on agent or goal
if (
(x === agentPos.x && y === agentPos.y) ||
(x === goalPos.x && y === goalPos.y)
) {
return;
}
const obstacleIndex = obstacles.findIndex(
(o) => o.x === x && o.y === y
);
if (obstacleIndex === -1) {
obstacles.push({ x, y });
} else {
obstacles.splice(obstacleIndex, 1);
}
renderGrid();
renderPolicy();
}
}
function toggleObstaclePlacement() {
placingObstacles = !placingObstacles;
const btn = document.getElementById("place-obstacle-btn");
btn.textContent = placingObstacles ? "Done Placing" : "Place Obstacles";
btn.style.backgroundColor = placingObstacles ? "#e74c3c" : "#3498db";
}
function isObstacle(x, y) {
return obstacles.some((o) => o.x === x && o.y === y);
}
function resetEnvironment() {
initializeEnvironment();
episodeRewards = [];
policyLosses = [];
valueLosses = [];
episode = 0;
updateEpisodeCounter();
updateReward(0);
// Reset training state
isTraining = false;
document.getElementById("start-btn").textContent = "Start Training";
document.getElementById("step-btn").disabled = true;
// Clear charts
// In a real implementation, you would update the charts here
logMessage("Environment reset. Ready for training!");
}
function startTraining() {
if (isTraining) {
// Stop training
isTraining = false;
document.getElementById("start-btn").textContent = "Start Training";
document.getElementById("step-btn").disabled = true;
} else {
// Start training
isTraining = true;
document.getElementById("start-btn").textContent = "Stop Training";
document.getElementById("step-btn").disabled = false;
// If we're at the end of training, reset first
if (episode >= maxEpisodes) {
resetEnvironment();
}
runTrainingLoop();
}
}
function stepTraining() {
if (episode < maxEpisodes) {
runEpisode();
updateTrainingProgress();
} else {
logMessage("Training complete! Reset to train again.");
}
}
async function runTrainingLoop() {
while (isTraining && episode < maxEpisodes) {
await runEpisode();
updateTrainingProgress();
// Pause between episodes using the selected animation speed
await new Promise((resolve) =>
setTimeout(resolve, animationSpeeds[animationSpeed])
);
}
if (episode >= maxEpisodes) {
logMessage("Training complete!");
isTraining = false;
document.getElementById("start-btn").textContent = "Start Training";
}
}
async function runEpisode() {
// Reset agent position and episodic variables
agentPos = { x: 0, y: 0 };
episodeSteps = 0;
totalReward = 0;
trajectories = [];
// Decay exploration rate over time (important for improving policy)
explorationRate = Math.max(0.05, 0.2 * Math.pow(0.99, episode));
renderGrid();
updateReward(totalReward);
// Save old policy for PPO ratio calculation
oldPolicy = JSON.parse(JSON.stringify(policyNetwork));
// Run episode until termination
let done = false;
while (!done && episodeSteps < maxStepsPerEpisode) {
done = await executeStep();
episodeSteps++;
// Small delay for visualization
await new Promise((resolve) =>
setTimeout(resolve, animationSpeeds[animationSpeed] / 2)
);
}
// Add episode reward to history
episodeRewards.push(totalReward);
// Run PPO update if we have enough steps
if (trajectories.length > 0) {
const [policyLoss, valueLoss] = updatePPO();
policyLosses.push(policyLoss);
valueLosses.push(valueLoss);
}
// Update UI
renderPolicy();
episode++;
updateEpisodeCounter();
logMessage(
`Episode ${episode}: Reward=${totalReward.toFixed(
2
)}, Steps=${episodeSteps}, Exploration=${explorationRate.toFixed(2)}`
);
return new Promise((resolve) => setTimeout(resolve, 10));
}
async function executeStep() {
const stateKey = `${agentPos.x},${agentPos.y}`;
const policy = policyNetwork[stateKey];
// Choose action based on policy
const action = sampleAction(policy);
// Store old position
const oldPos = { ...agentPos };
// Move agent (returns false if the move was blocked by an obstacle)
const movementSuccessful = moveAgent(action);
// Calculate reward
const reward = calculateReward(oldPos, movementSuccessful);
totalReward += reward;
updateReward(totalReward);
// Check if episode is done (moveAgent never leaves the agent on an obstacle)
const done = agentPos.x === goalPos.x && agentPos.y === goalPos.y;
// Render the grid
renderGrid();
// Store trajectory
const newStateKey = `${agentPos.x},${agentPos.y}`;
trajectories.push({
state: stateKey,
action,
reward,
nextState: newStateKey,
done,
});
// Use dynamic animation speed
await new Promise((resolve) =>
setTimeout(resolve, animationSpeeds[animationSpeed] / 2)
);
return done;
}
function sampleAction(policy) {
// Use exploration rate to decide whether to take random action or follow policy
if (Math.random() < explorationRate) {
// Take random action with exploration probability
const actions = Object.keys(policy);
const randomIndex = Math.floor(Math.random() * actions.length);
return actions[randomIndex];
}
// Otherwise sample from policy distribution
const actions = Object.keys(policy);
const probs = actions.map((a) => policy[a]);
const rand = Math.random();
let cumProb = 0;
for (let i = 0; i < actions.length; i++) {
cumProb += probs[i];
if (rand < cumProb) {
return actions[i];
}
}
return actions[actions.length - 1];
}
function moveAgent(action) {
// Save previous position
const prevPos = { ...agentPos };
// Attempt to move agent
switch (action) {
case "up":
agentPos.y = Math.max(0, agentPos.y - 1);
break;
case "right":
agentPos.x = Math.min(GRID_SIZE - 1, agentPos.x + 1);
break;
case "down":
agentPos.y = Math.min(GRID_SIZE - 1, agentPos.y + 1);
break;
case "left":
agentPos.x = Math.max(0, agentPos.x - 1);
break;
}
// Check if new position is an obstacle
if (isObstacle(agentPos.x, agentPos.y)) {
// Revert to previous position if it hit an obstacle
agentPos.x = prevPos.x;
agentPos.y = prevPos.y;
return false; // Indicate movement was blocked
}
return true; // Movement successful
}
function calculateReward(oldPos, movementSuccessful) {
// Reward for reaching goal
if (agentPos.x === goalPos.x && agentPos.y === goalPos.y) {
return 10;
}
// Penalty for attempting to move into an obstacle (the move is blocked and the agent stays put)
if (!movementSuccessful) {
return -1; // Mild penalty so collisions are discouraged without swamping the learning signal
}
// Small penalty for each step to encourage efficiency
let stepPenalty = -0.1;
// Small reward for getting closer to goal (using Manhattan distance)
const oldDistance =
Math.abs(oldPos.x - goalPos.x) + Math.abs(oldPos.y - goalPos.y);
const newDistance =
Math.abs(agentPos.x - goalPos.x) + Math.abs(agentPos.y - goalPos.y);
const proximityReward = oldDistance > newDistance ? 0.3 : -0.1; // Stronger reward for progress
return stepPenalty + proximityReward;
}
function updatePPO() {
// Get parameters from sliders
clipRatio = parseFloat(document.getElementById("clip-ratio").value);
learningRate = parseFloat(
document.getElementById("learning-rate").value
);
ppoEpochs = parseInt(document.getElementById("epochs").value);
// Compute returns and advantages
const returns = [];
const advantages = [];
// Compute returns (discounted sum of future rewards)
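// G_t = r_t + gamma * G_{t+1}, with G reset to 0 after a terminal transition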
let discountedReturn = 0;
for (let i = trajectories.length - 1; i >= 0; i--) {
const transition = trajectories[i];
discountedReturn =
transition.reward +
gamma * (transition.done ? 0 : discountedReturn);
returns.unshift(discountedReturn);
}
// Compute advantages using Generalized Advantage Estimation (GAE)
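// A_t = delta_t + gamma * lambda * A_{t+1}, where delta_t = r_t + gamma * V(s_{t+1}) - V(s_t)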
let lastGaeAdvantage = 0;
for (let i = trajectories.length - 1; i >= 0; i--) {
const transition = trajectories[i];
const stateKey = transition.state;
const nextStateKey = transition.nextState;
const currentValue = valueNetwork[stateKey];
const nextValue = transition.done ? 0 : valueNetwork[nextStateKey];
// TD error
const delta = transition.reward + gamma * nextValue - currentValue;
// GAE
lastGaeAdvantage = delta + gamma * lambda * lastGaeAdvantage;
advantages.unshift(lastGaeAdvantage);
}
// Normalize advantages for more stable learning
const meanAdvantage =
advantages.reduce((a, b) => a + b, 0) / advantages.length;
const stdAdvantage =
Math.sqrt(
advantages.reduce((a, b) => a + Math.pow(b - meanAdvantage, 2), 0) /
advantages.length
) || 1; // Avoid division by zero
for (let i = 0; i < advantages.length; i++) {
advantages[i] =
(advantages[i] - meanAdvantage) / (stdAdvantage + 1e-8);
}
// Store losses for metrics
let totalPolicyLoss = 0;
let totalValueLoss = 0;
// The probability ratio is computed against oldPolicy, which runEpisode
// saved before the rollout, so it stays fixed across the PPO epochs below
// Multiple epochs of optimization on the same data (key PPO feature)
for (let epoch = 0; epoch < ppoEpochs; epoch++) {
// Update policy and value networks for each step in the trajectory
for (let i = 0; i < trajectories.length; i++) {
const transition = trajectories[i];
const stateKey = transition.state;
const action = transition.action;
// Get old action probability
const oldActionProb = oldPolicy[stateKey][action];
// Get current action probability
const currentActionProb = policyNetwork[stateKey][action];
// Compute probability ratio (crucial for PPO)
const ratio = currentActionProb / Math.max(oldActionProb, 1e-8);
// Get advantage for this action
const advantage = advantages[i];
// Compute unclipped and clipped surrogate objectives
const unclippedObjective = ratio * advantage;
const clippedRatio = Math.max(
Math.min(ratio, 1 + clipRatio),
1 - clipRatio
);
const clippedObjective = clippedRatio * advantage;
// PPO's clipped surrogate objective (core of PPO)
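// L_CLIP = min( ratio * advantage, clip(ratio, 1 - epsilon, 1 + epsilon) * advantage )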
const surrogateObjective = Math.min(
unclippedObjective,
clippedObjective
);
// PPO maximizes the surrogate objective, so the reported loss is its negative
const policyLoss = -surrogateObjective;
totalPolicyLoss += policyLoss;
// Value loss (using returns as targets)
const valueTarget = returns[i];
const valuePrediction = valueNetwork[stateKey];
const valueLoss = 0.5 * Math.pow(valueTarget - valuePrediction, 2);
totalValueLoss += valueLoss;
// Update value network with gradient descent
valueNetwork[stateKey] +=
learningRate * (valueTarget - valuePrediction);
// Compute policy update based on whether we're using clipped or unclipped objective
const useClippedObjective = unclippedObjective > clippedObjective;
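// When the clipped term is the binding one, the gradient of the clipped
// objective with respect to the policy is zero, so this simplified tabular
// update skips the sample instead of pushing the probability further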
const policyGradient =
learningRate * advantage * (useClippedObjective ? 0 : 1);
// Apply policy gradient update
// Increase probability of the taken action if it was good (positive advantage)
// Decrease probability if it was bad (negative advantage)
let newProb = policyNetwork[stateKey][action] + policyGradient;
// Ensure probability stays positive (important for ratio calculation)
newProb = Math.max(newProb, 0.01);
policyNetwork[stateKey][action] = newProb;
// Normalize probabilities to ensure they sum to 1
const sumProb = Object.values(policyNetwork[stateKey]).reduce(
(a, b) => a + b,
0
);
for (const a in policyNetwork[stateKey]) {
policyNetwork[stateKey][a] /= sumProb;
}
// Nudge the policy toward uniform (a crude stand-in for PPO's entropy bonus)
// This helps the agent keep exploring and avoid premature convergence
if (i % 5 === 0) {
// Apply periodically to maintain some exploration
for (const a in policyNetwork[stateKey]) {
// Slightly nudge probabilities toward uniform
policyNetwork[stateKey][a] =
0.95 * policyNetwork[stateKey][a] + 0.05 * 0.25;
}
// Re-normalize
const sumProb = Object.values(policyNetwork[stateKey]).reduce(
(a, b) => a + b,
0
);
for (const a in policyNetwork[stateKey]) {
policyNetwork[stateKey][a] /= sumProb;
}
}
}
}
// Calculate average losses
const avgPolicyLoss =
totalPolicyLoss / (trajectories.length * ppoEpochs);
const avgValueLoss = totalValueLoss / (trajectories.length * ppoEpochs);
// Log progress periodically
if (episode % 5 === 0) {
logMessage(
`Episode ${episode}: Average Policy Loss = ${avgPolicyLoss.toFixed(
4
)}, Value Loss = ${avgValueLoss.toFixed(4)}`
);
}
return [avgPolicyLoss, avgValueLoss];
}
function updateReward(reward) {
document.getElementById("reward-value").textContent = reward.toFixed(2);
}
function updateEpisodeCounter() {
document.getElementById(
"episode-counter"
).textContent = `Episodes: ${episode} / ${maxEpisodes}`;
document.getElementById("training-progress").style.width = `${
(episode / maxEpisodes) * 100
}%`;
}
function updateTrainingProgress() {
// Update charts with the latest data
// In a real implementation, you would update charts here
// Show progress
updateEpisodeCounter();
}
function updateSliderValue(id) {
const slider = document.getElementById(id);
const valueDisplay = document.getElementById(`${id}-value`);
valueDisplay.textContent = slider.value;
// Update corresponding variables
if (id === "clip-ratio") clipRatio = parseFloat(slider.value);
if (id === "learning-rate") learningRate = parseFloat(slider.value);
if (id === "epochs") ppoEpochs = parseInt(slider.value);
}
function logMessage(message) {
const logContainer = document.getElementById("log-container");
const logEntry = document.createElement("div");
logEntry.classList.add("log-entry");
logEntry.textContent = message;
logContainer.appendChild(logEntry);
logContainer.scrollTop = logContainer.scrollHeight;
}
function openTab(tabId) {
// Hide all tab contents
const tabContents = document.getElementsByClassName("tab-content");
for (let i = 0; i < tabContents.length; i++) {
tabContents[i].classList.remove("active");
}
// Remove active class from tab buttons
const tabButtons = document.getElementsByClassName("tab-button");
for (let i = 0; i < tabButtons.length; i++) {
tabButtons[i].classList.remove("active");
}
// Show selected tab content and mark button as active
document.getElementById(tabId).classList.add("active");
const activeButton = document.querySelector(
`.tab-button[onclick="openTab('${tabId}')"]`
);
activeButton.classList.add("active");
}
function showPopup(title, content) {
document.getElementById("popup-title").textContent = title;
document.getElementById("popup-content").innerHTML = content;
document.getElementById("popup-overlay").style.display = "block";
document.getElementById("popup").style.display = "block";
}
function closePopup() {
document.getElementById("popup-overlay").style.display = "none";
document.getElementById("popup").style.display = "none";
}
// Initialize the environment when the page loads
window.onload = function () {
initializeEnvironment();
logMessage('Environment initialized. Click "Start Training" to begin!');
// Show concept popup with a delay
setTimeout(() => {
showPopup(
"Welcome to PPO Simulation",
`
<p>This simulation demonstrates Proximal Policy Optimization (PPO), a reinforcement learning algorithm.</p>
<p>In this grid world:</p>
<ul>
<li>The agent (blue circle) must learn to navigate to the goal (green square)</li>
<li>You can place obstacles (red squares) by clicking the "Place Obstacles" button</li>
<li>The agent receives rewards for approaching the goal and penalties for hitting obstacles</li>
<li>PPO helps the agent learn efficiently by preventing large policy updates</li>
</ul>
<p>Try experimenting with different parameters to see how they affect learning!</p>
`
);
}, 1000);
};
// Animation speed control
let animationSpeed = "normal";
const animationSpeeds = {
slow: 300,
normal: 100,
fast: 20,
};
function toggleAnimationSpeed() {
const speedBtn = document.getElementById("animation-speed-btn");
if (animationSpeed === "slow") {
animationSpeed = "normal";
speedBtn.textContent = "Animation Speed: Normal";
} else if (animationSpeed === "normal") {
animationSpeed = "fast";
speedBtn.textContent = "Animation Speed: Fast";
} else {
animationSpeed = "slow";
speedBtn.textContent = "Animation Speed: Slow";
}
}
</script>
<footer
style="
text-align: center;
margin-top: 30px;
padding: 15px;
background-color: #f8f9fa;
border-top: 1px solid #ddd;
"
>
&copy; 2025 Pejman Ebrahimi - All Rights Reserved
</footer>
</body>
</html>