diff --git a/Dockerfile b/Dockerfile
new file mode 100644
index 0000000000000000000000000000000000000000..3152023feaef0811cf55ea93ff7d32c71a004d03
--- /dev/null
+++ b/Dockerfile
@@ -0,0 +1,45 @@
+FROM nvidia/cuda:12.0.1-runtime-ubuntu22.04
+
+# Set environment variables
+ENV PYTHONDONTWRITEBYTECODE=1
+ENV PYTHONUNBUFFERED=1
+ENV DEBIAN_FRONTEND=noninteractive
+
+# Install system dependencies
+RUN apt-get update && apt-get install -y --no-install-recommends \
+ python3 \
+ python3-pip \
+ python3-dev \
+ ffmpeg \
+ libsm6 \
+ libxext6 \
+ libxrender-dev \
+ libglib2.0-0 \
+ git \
+ && apt-get clean \
+ && rm -rf /var/lib/apt/lists/*
+
+# Set working directory
+WORKDIR /app
+
+# Copy requirements first to leverage Docker caching
+COPY requirements.txt .
+
+# Install Python dependencies
+RUN pip3 install --no-cache-dir -r requirements.txt
+RUN pip3 install --no-cache-dir aiohttp
+
+# Install additional required packages
+RUN pip3 install --no-cache-dir torch torchvision torchaudio
+
+# Copy application code
+COPY . .
+
+# Create assets directory if it doesn't exist
+RUN mkdir -p /app/assets
+
+# Expose the port used by the server
+EXPOSE 8080
+
+# Set entry command
+CMD ["python3", "server.py", "--host", "0.0.0.0", "--port", "8080"]
\ No newline at end of file
diff --git a/LICENSE b/LICENSE
new file mode 100644
index 0000000000000000000000000000000000000000..08357e62af5bcd3f43c8028206d75238c4db1655
--- /dev/null
+++ b/LICENSE
@@ -0,0 +1,22 @@
+MIT License
+
+Copyright (c) 2024 Eloi Alonso
+Copyright (c) 2025 Enigma Labs AI
+
+Permission is hereby granted, free of charge, to any person obtaining a copy
+of this software and associated documentation files (the "Software"), to deal
+in the Software without restriction, including without limitation the rights
+to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+copies of the Software, and to permit persons to whom the Software is
+furnished to do so, subject to the following conditions:
+
+The above copyright notice and this permission notice shall be included in all
+copies or substantial portions of the Software.
+
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+SOFTWARE.
diff --git a/README.md b/README.md
index 4a1d97e427779dab9d5035c290b67dfaccc18b6d..212b895a1cb25e646708ed05f0bab4bf57d49284 100644
--- a/README.md
+++ b/README.md
@@ -1,10 +1,170 @@
---
-title: Tikslop Gaming Multiverse
-emoji: 🏃
-colorFrom: purple
-colorTo: purple
+title: Multiverse
+emoji: 🐟
+colorFrom: blue
+colorTo: blue
sdk: docker
-pinned: false
+app_file: server.py
+pinned: true
+short_description: AI Multiplayer World Model
+app_port: 8080
+disable_embedding: false
---
-Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference
+# Multiverse: The First AI Multiplayer World Model
+
+🌐 [Enigma-AI website](https://enigma-labs.io/) - 📚 [Technical Blog](https://enigma-labs.io/) - [🤗 Model on Huggingface](https://huggingface.co/Enigma-AI/multiverse) - [🤗 Datasets on Huggingface](https://huggingface.co/datasets/Enigma-AI/multiplayer-racing-low-res) - 𝕏 [Multiverse Tweet](https://x.com/j0nathanj/status/1920516649511244258)
+
+
+*Two human players driving cars in Multiverse*
+
+---
+
+## Installation
+```bash
+git clone https://github.com/EnigmaLabsAI/multiverse
+cd multiverse
+pip install -r requirements.txt
+```
+
+### Running the model
+
+```bash
+python src/play.py --compile
+```
+
+> Note: on Apple Silicon you must enable CPU fallback for the MPS backend: `PYTORCH_ENABLE_MPS_FALLBACK=1 python src/play.py`
+
+When running this command, you will be prompted with the controls. Press `enter` to start.
+
+
+Then the game will start:
+* To control the silver car at the top of the screen, use the arrow keys.
+* To control the blue car at the bottom, use the WASD keys.
+
+
+
+---
+
+
+## Training
+
+Multiverse comprises two models:
+* Denoiser - a world model that simulates the game
+* Upsampler - a model that takes frames from the denoiser and increases their resolution
+
+### Denoiser training
+
+#### 1. Download the dataset
+Download the Denoiser's training set from [🤗 Huggingface](https://huggingface.co/datasets/Enigma-AI/multiplayer-racing-low-res).
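+
+If you prefer to script the download, here is a minimal sketch using `huggingface_hub` (the target directory is an arbitrary choice, not a repo convention):
+
+```python
+# Sketch: fetch the low-res dataset repo with huggingface_hub.
+from huggingface_hub import snapshot_download
+
+snapshot_download(
+    repo_id="Enigma-AI/multiplayer-racing-low-res",
+    repo_type="dataset",
+    local_dir="data/low_res_raw",  # arbitrary local path
+)
+```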
+
+#### 2. Process data for training
+Run the command:
+```bash
+python src/process_denoiser_files.py
+```
+
+#### 3. Edit training configuration
+
+Edit [config/env/racing.yaml](config/env/racing.yaml) and set:
+- `path_data_low_res` to `/low_res`
+- `path_data_full_res` to `/full_res`
+
+Edit [config/trainer.yaml](config/trainer.yaml) to train the `denoiser`:
+```yaml
+train_model: denoiser
+```
+
+#### 4. Launch training run
+
+You can then launch a training run with `python src/main.py`.
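+
+Since the configs are composed with Hydra (see `config/trainer.yaml`), you can sanity-check the merged configuration before launching a long run. A minimal sketch, assuming it is run from the `src/` directory with Hydra ≥ 1.2:
+
+```python
+# Sketch: print the composed training config without starting a run.
+from hydra import compose, initialize
+
+with initialize(version_base=None, config_path="../config"):
+    cfg = compose(config_name="trainer", overrides=["train_model=denoiser"])
+    print(cfg.train_model, cfg.env.train.id)
+```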
+
+
+### Upsampler training
+
+#### 1. Download the dataset
+Download the Upsampler's training set from [🤗 Huggingface](https://huggingface.co/datasets/Enigma-AI/multiplayer-racing-full-res).
+
+#### 2. Process data for training
+Run the command:
+```bash
+python src/process_upsampler_files.py
+```
+
+#### 3. Edit training configuration
+
+Edit [config/env/racing.yaml](config/env/racing.yaml) and set:
+- `path_data_low_res` to `/low_res`
+- `path_data_full_res` to `/full_res`
+
+Edit [config/trainer.yaml](config/trainer.yaml) to train the `upsampler`:
+```yaml
+train_model: upsampler
+```
+
+#### 4. Launch training run
+
+You can then launch a training run with `python src/main.py`.
+
+
+---
+
+## Datasets
+
+1. We've collected over 4 hours of multiplayer (1v1) footage from Gran Turismo 4 at a resolution of 48x64 (per player): [🤗 Huggingface link](https://huggingface.co/datasets/Enigma-AI/multiplayer-racing-low-res).
+
+2. A sparse sampling of full-resolution, cropped frames is available for training the upsampler at a resolution of 350x530: [🤗 Huggingface link](https://huggingface.co/datasets/Enigma-AI/multiplayer-racing-full-res).
+
+The datasets contain a variety of situations: acceleration, braking, overtakes, crashes, and expert driving for both players.
+You can read about the data collection mechanism [here](https://enigma-labs.io/blog).
+
+Note: The full-resolution dataset is intended only for upsampler training and is not suitable for world model training.
+
+---
+
+## Outside resources
+
+- DIAMOND - https://github.com/eloialonso/diamond
+- AI-MarioKart64 - https://github.com/Dere-Wah/AI-MarioKart64
+
+---
+
+## Cloud Gaming Server
+
+This project now includes a WebSocket-based cloud gaming server that allows you to play the game through a web browser.
+
+### Using Docker (Recommended for GPU Servers)
+
+The easiest way to deploy the cloud gaming server on a machine with an NVIDIA GPU is using Docker:
+
+```bash
+# Build the Docker image
+docker build -t ai-game-multiverse .
+
+# Run the container with GPU support
+docker run --gpus all -p 8080:8080 ai-game-multiverse
+```
+
+Then access the web interface at http://yourserver:8080
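+
+If the stream looks like placeholder frames, first confirm the container can actually see the GPU; a quick check, run from inside the container:
+
+```python
+# Sketch: verify CUDA is visible to PyTorch inside the container.
+import torch
+
+print(torch.cuda.is_available(), torch.cuda.device_count())
+```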
+
+### Features
+
+- Web-based interface accessible from any modern browser
+- Real-time streaming of AI-generated game frames
+- Keyboard and mouse controls
+- Multiple scene selection
+- WebSocket communication for low-latency interaction
+
+### Usage
+
+1. Access the web interface at http://yourserver:8080
+2. Click "Connect" to establish a WebSocket connection
+3. Select a scene from the dropdown
+4. Click "Start Stream" to begin streaming frames
+5. Use WASD keys for movement, Space for jump, Shift for attack
+6. Mouse controls camera view (click on game area to capture mouse)
+
+Note: The server requires an NVIDIA GPU for optimal performance with the AI models. Without a suitable GPU, it will fall back to using simple placeholder frames.
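+
+You can also drive the same protocol from a script. A minimal sketch using `aiohttp` (the message shapes mirror `example/server.py`; the host is a placeholder):
+
+```python
+# Sketch: connect to /ws, start a stream, press "forward", and read frames.
+import asyncio
+import base64
+
+import aiohttp
+
+async def main():
+    async with aiohttp.ClientSession() as session:
+        async with session.ws_connect("http://yourserver:8080/ws") as ws:
+            await ws.send_json({"action": "start_stream", "requestId": "r1", "fps": 16})
+            await ws.send_json({"action": "keyboard_input", "requestId": "r2",
+                                "key": "forward", "pressed": True})
+            for _ in range(16):  # roughly one second of messages at 16 FPS
+                msg = await ws.receive_json()
+                if msg.get("action") == "frame":
+                    print(f"frame: {len(base64.b64decode(msg['frameData']))} bytes")
+
+asyncio.run(main())
+```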
\ No newline at end of file
diff --git a/config/agent/racing.yaml b/config/agent/racing.yaml
new file mode 100644
index 0000000000000000000000000000000000000000..470a4f2768357c5591005fb8e7b26ac2f8b0a14a
--- /dev/null
+++ b/config/agent/racing.yaml
@@ -0,0 +1,66 @@
+_target_: agent.AgentConfig
+
+denoiser:
+ _target_: models.diffusion.DenoiserConfig
+ sigma_data: 0.5
+ sigma_offset_noise: 0.1
+ noise_previous_obs: true
+ upsampling_factor: null
+ frame_sampling:
+ - count: 4
+ stride: 1
+ - count: 4
+ stride: 4
+ inner_model:
+ _target_: models.diffusion.InnerModelConfig
+ img_channels: 6
+ num_steps_conditioning: 8
+ cond_channels: 2048
+ depths:
+ - 2
+ - 2
+ - 2
+ - 2
+ channels:
+ - 128
+ - 256
+ - 512
+ - 1024
+ attn_depths:
+ - 0
+ - 0
+ - 1
+ - 1
+
+upsampler:
+ _target_: models.diffusion.DenoiserConfig
+ sigma_data: 0.5
+ sigma_offset_noise: 0.1
+ noise_previous_obs: false
+ upsampling_factor: 10
+ upsampling_frame_height: 350
+ upsampling_frame_width: 530
+ inner_model:
+ _target_: models.diffusion.InnerModelConfig
+ img_channels: 6
+ num_steps_conditioning: 0
+ cond_channels: 2048
+ depths:
+ - 2
+ - 2
+ - 2
+ - 2
+ channels:
+ - 64
+ - 64
+ - 128
+ - 256
+ attn_depths:
+ - 0
+ - 0
+ - 0
+ - 0
+
+rew_end_model: null
+
+actor_critic: null
diff --git a/config/env/racing.yaml b/config/env/racing.yaml
new file mode 100644
index 0000000000000000000000000000000000000000..79c035cfe6d02d3dec551bea230fcdcf5f75f842
--- /dev/null
+++ b/config/env/racing.yaml
@@ -0,0 +1,7 @@
+train:
+ id: racing
+ size: [700, 530]
+num_actions: 66
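+# Set these to the processed dataset directories before training (see README):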
+path_data_low_res: null
+path_data_full_res: null
+keymap: racing
diff --git a/config/trainer.yaml b/config/trainer.yaml
new file mode 100644
index 0000000000000000000000000000000000000000..4217362d937e42b6144e4a624bdd2c5e3a5a125f
--- /dev/null
+++ b/config/trainer.yaml
@@ -0,0 +1,113 @@
+defaults:
+ - _self_
+ - env: racing
+ - agent: racing
+ - world_model_env: fast
+
+hydra:
+ job:
+ chdir: True
+
+wandb:
+ mode: offline
+ project: null
+ entity: null
+ name: null
+ group: null
+ tags: null
+
+initialization:
+ path_to_ckpt: null
+ load_denoiser: True
+ load_rew_end_model: True
+ load_actor_critic: True
+
+common:
+ devices: all # int, list of int, cpu, or all
+ seed: null
+ resume: False # do not modify, set by scripts/resume.sh only.
+
+checkpointing:
+ save_agent_every: 5
+ num_to_keep: 11 # number of checkpoints to keep, use null to disable
+
+collection:
+ train:
+ num_envs: 1
+ epsilon: 0.01
+ num_steps_total: 100000
+ first_epoch:
+ min: 5000
+ max: 10000 # null: no maximum
+ threshold_rew: 10
+ steps_per_epoch: 100
+ test:
+ num_envs: 1
+ num_episodes: 4
+ epsilon: 0.0
+ num_final_episodes: 100
+
+static_dataset:
+ path: ${env.path_data_low_res}
+ ignore_sample_weights: True
+
+training:
+ should: True
+ num_final_epochs: 600
+ cache_in_ram: False
+ num_workers_data_loaders: 1
+ model_free: False # if True, turn off world_model training and RL in imagination
+ compile_wm: False
+
+evaluation:
+ should: True
+ every: 20
+
+train_model: denoiser
+
+denoiser:
+ training:
+ num_autoregressive_steps: 8
+ initial_num_consecutive_page_count: 1
+ num_consecutive_pages:
+ - epoch: 400
+ count: 10
+ - epoch: 500
+ count: 50
+ start_after_epochs: 0
+ steps_first_epoch: 10
+ steps_per_epoch: 20
+ sample_weights: null
+ batch_size: 30
+ grad_acc_steps: 2
+ lr_warmup_steps: 100
+ max_grad_norm: 10.0
+
+ optimizer:
+ lr: 1e-4
+ weight_decay: 1e-2
+ eps: 1e-8
+
+ sigma_distribution: # log normal distribution for sigma during training
+ _target_: models.diffusion.SigmaDistributionConfig
+ loc: -1.2
+ scale: 1.2
+ sigma_min: 2e-3
+ sigma_max: 20
+
+upsampler:
+ training:
+ num_autoregressive_steps: 1
+ initial_num_consecutive_page_count: 1
+ start_after_epochs: 0
+ steps_first_epoch: 20
+ steps_per_epoch: 20
+ sample_weights: null
+ batch_size: 4
+ grad_acc_steps: 2
+ lr_warmup_steps: 100
+ max_grad_norm: 10.0
+
+ optimizer: ${denoiser.optimizer}
+ sigma_distribution: ${denoiser.sigma_distribution}
+
diff --git a/config/world_model_env/fast.yaml b/config/world_model_env/fast.yaml
new file mode 100644
index 0000000000000000000000000000000000000000..08eadc86c1cbe70f1ab822844f1194c9319d8120
--- /dev/null
+++ b/config/world_model_env/fast.yaml
@@ -0,0 +1,27 @@
+_target_: envs.WorldModelEnvConfig
+horizon: 1000
+num_batches_to_preload: 256
+diffusion_sampler_next_obs:
+ _target_: models.diffusion.DiffusionSamplerConfig
+ num_steps_denoising: 1
+ sigma_min: 2e-3
+ sigma_max: 5.0
+ rho: 7
+ order: 1 # 1: Euler, 2: Heun
+ s_churn: 0.0 # Amount of stochasticity
+ s_tmin: 0.0
+ s_tmax: ${eval:'float("inf")'}
+ s_noise: 1.0
+ s_cond: 0.005
+diffusion_sampler_upsampling:
+ _target_: models.diffusion.DiffusionSamplerConfig
+ num_steps_denoising: 1
+ sigma_min: 1
+ sigma_max: 5.0
+ rho: 7
+ order: 2 # 1: Euler, 2: Heun
+ s_churn: 10.0 # Amount of stochasticity
+ s_tmin: 1
+ s_tmax: 5
+ s_noise: 0.9
+ s_cond: 0
\ No newline at end of file
diff --git a/example/Dockerfile b/example/Dockerfile
new file mode 100644
index 0000000000000000000000000000000000000000..28542909b9f5203a3d24ebe0234abc82965b4486
--- /dev/null
+++ b/example/Dockerfile
@@ -0,0 +1,59 @@
+FROM nvidia/cuda:12.4.0-devel-ubuntu22.04
+
+ARG DEBIAN_FRONTEND=noninteractive
+
+ENV PYTHONUNBUFFERED=1
+
+RUN apt-get update && apt-get install --no-install-recommends -y \
+ build-essential \
+ python3.11 \
+ python3-pip \
+ python3-dev \
+ git \
+ curl \
+ ffmpeg \
+ libglib2.0-0 \
+ libsm6 \
+ libxrender1 \
+ libxext6 \
+ ninja-build \
+ && apt-get clean && rm -rf /var/lib/apt/lists/*
+
+WORKDIR /code
+
+COPY ./requirements.txt /code/requirements.txt
+
+# Set up a new user named "user" with user ID 1000
+RUN useradd -m -u 1000 user
+# Switch to the "user" user
+USER user
+# Set home to the user's home directory
+ENV HOME=/home/user \
+ PATH=/home/user/.local/bin:$PATH
+
+# Set Python path and environment variables
+ENV PYTHONPATH=$HOME/app \
+ PYTHONUNBUFFERED=1 \
+ DATA_ROOT=/tmp/data
+
+RUN echo "Installing requirements.txt"
+RUN pip3 install --no-cache-dir --upgrade -r /code/requirements.txt
+
+# Install NVIDIA Apex with CUDA and C++ extensions
+RUN cd $HOME && \
+ git clone https://github.com/NVIDIA/apex && \
+ cd apex && \
+ NVCC_APPEND_FLAGS="--threads 4" pip install -v --disable-pip-version-check --no-cache-dir --no-build-isolation --global-option="--cpp_ext" --global-option="--cuda_ext" --global-option="--parallel" --global-option="8" ./
+
+WORKDIR $HOME/app
+
+# Copy all files and set proper ownership
+COPY --chown=user . $HOME/app
+
+# Expose the port that server.py uses (8080)
+EXPOSE 8080
+
+ENV PORT=8080
+
+# Run the HF space launcher script which sets up the correct paths
+CMD ["python3", "run_hf_space.py"]
\ No newline at end of file
diff --git a/example/client.js b/example/client.js
new file mode 100644
index 0000000000000000000000000000000000000000..36724958f7e64964cc73355a3d7908abb7a2abe0
--- /dev/null
+++ b/example/client.js
@@ -0,0 +1,603 @@
+// MatrixGame WebSocket Client
+
+// WebSocket connection
+let socket = null;
+let userId = null;
+let isStreaming = false;
+let lastFrameTime = 0;
+let frameCount = 0;
+let fpsUpdateInterval = null;
+
+// DOM Elements
+const connectBtn = document.getElementById('connect-btn');
+const startStreamBtn = document.getElementById('start-stream-btn');
+const stopStreamBtn = document.getElementById('stop-stream-btn');
+const sceneSelect = document.getElementById('scene-select');
+const gameCanvas = document.getElementById('game-canvas');
+const connectionLog = document.getElementById('connection-log');
+const mousePosition = document.getElementById('mouse-position');
+const fpsCounter = document.getElementById('fps-counter');
+const mouseTrackingArea = document.getElementById('mouse-tracking-area');
+
+// Pointer Lock API support check
+const pointerLockSupported = 'pointerLockElement' in document ||
+ 'mozPointerLockElement' in document ||
+ 'webkitPointerLockElement' in document;
+
+// Keyboard DOM elements
+const keyElements = {
+ 'w': document.getElementById('key-w'),
+ 'a': document.getElementById('key-a'),
+ 's': document.getElementById('key-s'),
+ 'd': document.getElementById('key-d'),
+ 'space': document.getElementById('key-space'),
+ 'shift': document.getElementById('key-shift')
+};
+
+// Key mapping to action names
+const keyToAction = {
+ 'w': 'forward',
+ 'arrowup': 'forward',
+ 'a': 'left',
+ 'arrowleft': 'left',
+ 's': 'back',
+ 'arrowdown': 'back',
+ 'd': 'right',
+ 'arrowright': 'right',
+ ' ': 'jump',
+ 'shift': 'attack'
+};
+
+// Key state tracking
+const keyState = {
+ 'forward': false,
+ 'back': false,
+ 'left': false,
+ 'right': false,
+ 'jump': false,
+ 'attack': false
+};
+
+// Mouse state
+const mouseState = {
+ x: 0,
+ y: 0,
+ captured: false
+};
+
+// Test server connectivity before establishing WebSocket
+async function testServerConnectivity() {
+ try {
+ // Get base path by extracting path from the script tag's src attribute
+ let basePath = '';
+ const scriptTags = document.getElementsByTagName('script');
+ for (const script of scriptTags) {
+ if (script.src.includes('client.js')) {
+ const url = new URL(script.src);
+ basePath = url.pathname.replace('/assets/client.js', '');
+ break;
+ }
+ }
+
+ // Try to fetch the debug endpoint to see if the server is accessible
+ const response = await fetch(`${window.location.protocol}//${window.location.host}${basePath}/api/debug`);
+ if (!response.ok) {
+ throw new Error(`Server returned ${response.status}`);
+ }
+
+ const debugInfo = await response.json();
+ logMessage(`Server connection test successful! Server time: ${new Date(debugInfo.server_time * 1000).toLocaleTimeString()}`);
+
+ // Log available routes from server
+ if (debugInfo.all_routes && debugInfo.all_routes.length > 0) {
+ logMessage(`Available routes: ${debugInfo.all_routes.join(', ')}`);
+ }
+
+ // Return the debug info for connection setup
+ return debugInfo;
+ } catch (error) {
+ logMessage(`Server connection test failed: ${error.message}`);
+ return null;
+ }
+}
+
+// Connect to WebSocket server
+async function connectWebSocket() {
+ // First test connectivity to the server
+ logMessage('Testing server connectivity...');
+ const debugInfo = await testServerConnectivity();
+
+ // Use secure WebSocket (wss://) if the page is loaded over HTTPS
+ const protocol = window.location.protocol === 'https:' ? 'wss:' : 'ws:';
+
+ // Get base path by extracting path from the script tag's src attribute
+ let basePath = '';
+ if (debugInfo && debugInfo.base_path) {
+ // Use base path from server if available
+ basePath = debugInfo.base_path;
+ logMessage(`Using server-provided base path: ${basePath}`);
+ } else {
+ const scriptTags = document.getElementsByTagName('script');
+ for (const script of scriptTags) {
+ if (script.src.includes('client.js')) {
+ const url = new URL(script.src);
+ basePath = url.pathname.replace('/assets/client.js', '');
+ break;
+ }
+ }
+ }
+
+ // Try both with and without base path for WebSocket connection
+ let serverUrl = `${protocol}//${window.location.hostname}${window.location.port ? ':' + window.location.port : ''}${basePath}/ws`;
+ logMessage(`Attempting to connect to WebSocket at ${serverUrl}...`);
+
+ // For Hugging Face Spaces, try the direct /ws path if the base path doesn't work
+ const fallbackUrl = `${protocol}//${window.location.hostname}${window.location.port ? ':' + window.location.port : ''}/ws`;
+
+ try {
+ socket = new WebSocket(serverUrl);
+ setupWebSocketHandlers();
+
+ // Set a timeout to try the fallback URL if the first one doesn't connect
+ setTimeout(() => {
+ if (socket.readyState !== WebSocket.OPEN && socket.readyState !== WebSocket.CONNECTING) {
+ logMessage(`Connection to ${serverUrl} failed. Trying fallback URL: ${fallbackUrl}`);
+ socket = new WebSocket(fallbackUrl);
+ setupWebSocketHandlers();
+ }
+ }, 3000);
+ } catch (error) {
+ logMessage(`Error connecting to WebSocket: ${error.message}`);
+ resetUI();
+ }
+}
+
+// Set up WebSocket event handlers
+function setupWebSocketHandlers() {
+ socket.onopen = () => {
+ logMessage('WebSocket connection established');
+ connectBtn.textContent = 'Disconnect';
+ startStreamBtn.disabled = false;
+ sceneSelect.disabled = false;
+ };
+
+ socket.onmessage = (event) => {
+ const message = JSON.parse(event.data);
+
+ switch (message.action) {
+ case 'welcome':
+ userId = message.userId;
+ logMessage(`Connected with user ID: ${userId}`);
+
+ // Update scene options if server provides them
+ if (message.scenes && Array.isArray(message.scenes)) {
+ sceneSelect.innerHTML = '';
+ message.scenes.forEach(scene => {
+ const option = document.createElement('option');
+ option.value = scene;
+ option.textContent = scene.charAt(0).toUpperCase() + scene.slice(1);
+ sceneSelect.appendChild(option);
+ });
+ }
+ break;
+
+ case 'frame':
+ // Process incoming frame
+ processFrame(message);
+ break;
+
+ case 'start_stream':
+ if (message.success) {
+ isStreaming = true;
+ startStreamBtn.disabled = true;
+ stopStreamBtn.disabled = false;
+ logMessage(`Streaming started: ${message.message}`);
+
+ // Start FPS counter
+ startFpsCounter();
+ } else {
+ logMessage(`Error starting stream: ${message.error}`);
+ }
+ break;
+
+ case 'stop_stream':
+ if (message.success) {
+ isStreaming = false;
+ startStreamBtn.disabled = false;
+ stopStreamBtn.disabled = true;
+ logMessage('Streaming stopped');
+
+ // Stop FPS counter
+ stopFpsCounter();
+ } else {
+ logMessage(`Error stopping stream: ${message.error}`);
+ }
+ break;
+
+ case 'pong':
+ // Server responded to ping
+ break;
+
+ case 'change_scene':
+ if (message.success) {
+ logMessage(`Scene changed to ${message.scene}`);
+ } else {
+ logMessage(`Error changing scene: ${message.error}`);
+ }
+ break;
+
+ default:
+ logMessage(`Received message: ${JSON.stringify(message)}`);
+ }
+ };
+
+ socket.onclose = (event) => {
+ logMessage(`WebSocket connection closed (code: ${event.code}, reason: ${event.reason || 'none given'})`);
+ resetUI();
+ };
+
+ socket.onerror = (error) => {
+ logMessage(`WebSocket error. This is often caused by CORS issues or the server being inaccessible.`);
+ console.error('WebSocket error:', error);
+ resetUI();
+ };
+}
+
+// Disconnect from WebSocket server
+function disconnectWebSocket() {
+ if (socket && socket.readyState === WebSocket.OPEN) {
+ // Stop streaming if active
+ if (isStreaming) {
+ sendStopStream();
+ }
+
+ // Close the socket
+ socket.close();
+ logMessage('Disconnected from server');
+ }
+}
+
+// Start streaming frames
+function sendStartStream() {
+ if (socket && socket.readyState === WebSocket.OPEN) {
+ socket.send(JSON.stringify({
+ action: 'start_stream',
+ requestId: generateRequestId(),
+ fps: 16 // Default FPS
+ }));
+ }
+}
+
+// Stop streaming frames
+function sendStopStream() {
+ if (socket && socket.readyState === WebSocket.OPEN) {
+ socket.send(JSON.stringify({
+ action: 'stop_stream',
+ requestId: generateRequestId()
+ }));
+ }
+}
+
+// Send keyboard input to server
+function sendKeyboardInput(key, pressed) {
+ if (socket && socket.readyState === WebSocket.OPEN) {
+ socket.send(JSON.stringify({
+ action: 'keyboard_input',
+ requestId: generateRequestId(),
+ key: key,
+ pressed: pressed
+ }));
+ }
+}
+
+// Send mouse input to server
+function sendMouseInput(x, y) {
+ if (socket && socket.readyState === WebSocket.OPEN && isStreaming) {
+ socket.send(JSON.stringify({
+ action: 'mouse_input',
+ requestId: generateRequestId(),
+ x: x,
+ y: y
+ }));
+ }
+}
+
+// Change scene
+function sendChangeScene(scene) {
+ if (socket && socket.readyState === WebSocket.OPEN) {
+ socket.send(JSON.stringify({
+ action: 'change_scene',
+ requestId: generateRequestId(),
+ scene: scene
+ }));
+ }
+}
+
+// Process incoming frame
+function processFrame(message) {
+ // Update FPS calculation
+ const now = performance.now();
+ if (lastFrameTime > 0) {
+ frameCount++;
+ }
+ lastFrameTime = now;
+
+ // Update the canvas with the new frame
+ if (message.frameData) {
+ gameCanvas.src = `data:image/jpeg;base64,${message.frameData}`;
+ }
+}
+
+// Generate a random request ID
+function generateRequestId() {
+ return Math.random().toString(36).substring(2, 15);
+}
+
+// Log message to the connection info panel
+function logMessage(message) {
+ const logEntry = document.createElement('div');
+ logEntry.className = 'log-entry';
+
+ const timestamp = new Date().toLocaleTimeString();
+ logEntry.textContent = `[${timestamp}] ${message}`;
+
+ connectionLog.appendChild(logEntry);
+ connectionLog.scrollTop = connectionLog.scrollHeight;
+
+ // Limit number of log entries
+ while (connectionLog.children.length > 100) {
+ connectionLog.removeChild(connectionLog.firstChild);
+ }
+}
+
+// Start FPS counter updates
+function startFpsCounter() {
+ frameCount = 0;
+ lastFrameTime = 0;
+
+ // Update FPS display every second
+ fpsUpdateInterval = setInterval(() => {
+ fpsCounter.textContent = `FPS: ${frameCount}`;
+ frameCount = 0;
+ }, 1000);
+}
+
+// Stop FPS counter updates
+function stopFpsCounter() {
+ if (fpsUpdateInterval) {
+ clearInterval(fpsUpdateInterval);
+ fpsUpdateInterval = null;
+ }
+ fpsCounter.textContent = 'FPS: 0';
+}
+
+// Reset UI to initial state
+function resetUI() {
+ connectBtn.textContent = 'Connect';
+ startStreamBtn.disabled = true;
+ stopStreamBtn.disabled = true;
+ sceneSelect.disabled = true;
+
+ // Reset key indicators
+ for (const key in keyElements) {
+ keyElements[key].classList.remove('active');
+ }
+
+ // Stop FPS counter
+ stopFpsCounter();
+
+ // Reset streaming state
+ isStreaming = false;
+}
+
+// Event Listeners
+connectBtn.addEventListener('click', () => {
+ if (socket && socket.readyState === WebSocket.OPEN) {
+ disconnectWebSocket();
+ } else {
+ connectWebSocket();
+ }
+});
+
+startStreamBtn.addEventListener('click', sendStartStream);
+stopStreamBtn.addEventListener('click', sendStopStream);
+
+sceneSelect.addEventListener('change', () => {
+ sendChangeScene(sceneSelect.value);
+});
+
+// Keyboard event listeners
+document.addEventListener('keydown', (event) => {
+ const key = event.key.toLowerCase();
+
+ // Map key to action
+ let action = keyToAction[key];
+ if (!action && key === ' ') {
+ action = keyToAction[' ']; // Handle spacebar
+ }
+
+ if (action && !keyState[action]) {
+ keyState[action] = true;
+
+ // Update visual indicator
+ const keyElement = keyElements[key] ||
+ (key === ' ' ? keyElements['space'] : null) ||
+ (key === 'shift' ? keyElements['shift'] : null);
+
+ if (keyElement) {
+ keyElement.classList.add('active');
+ }
+
+ // Send to server
+ sendKeyboardInput(action, true);
+ }
+
+ // Prevent default actions for game controls
+ if (Object.keys(keyToAction).includes(key) || key === ' ') {
+ event.preventDefault();
+ }
+});
+
+document.addEventListener('keyup', (event) => {
+ const key = event.key.toLowerCase();
+
+ // Map key to action
+ let action = keyToAction[key];
+ if (!action && key === ' ') {
+ action = keyToAction[' ']; // Handle spacebar
+ }
+
+ if (action && keyState[action]) {
+ keyState[action] = false;
+
+ // Update visual indicator
+ const keyElement = keyElements[key] ||
+ (key === ' ' ? keyElements['space'] : null) ||
+ (key === 'shift' ? keyElements['shift'] : null);
+
+ if (keyElement) {
+ keyElement.classList.remove('active');
+ }
+
+ // Send to server
+ sendKeyboardInput(action, false);
+ }
+});
+
+// Mouse capture functions
+function requestPointerLock() {
+ if (!mouseState.captured && pointerLockSupported) {
+ mouseTrackingArea.requestPointerLock = mouseTrackingArea.requestPointerLock ||
+ mouseTrackingArea.mozRequestPointerLock ||
+ mouseTrackingArea.webkitRequestPointerLock;
+ mouseTrackingArea.requestPointerLock();
+ logMessage('Mouse captured. Press ESC to release.');
+ }
+}
+
+function exitPointerLock() {
+ if (mouseState.captured) {
+ document.exitPointerLock = document.exitPointerLock ||
+ document.mozExitPointerLock ||
+ document.webkitExitPointerLock;
+ document.exitPointerLock();
+ logMessage('Mouse released.');
+ }
+}
+
+// Handle pointer lock change events
+document.addEventListener('pointerlockchange', pointerLockChangeHandler);
+document.addEventListener('mozpointerlockchange', pointerLockChangeHandler);
+document.addEventListener('webkitpointerlockchange', pointerLockChangeHandler);
+
+function pointerLockChangeHandler() {
+ if (document.pointerLockElement === mouseTrackingArea ||
+ document.mozPointerLockElement === mouseTrackingArea ||
+ document.webkitPointerLockElement === mouseTrackingArea) {
+ // Pointer is locked, enable mouse movement tracking
+ mouseState.captured = true;
+ document.addEventListener('mousemove', handleMouseMovement);
+ } else {
+ // Pointer is unlocked, disable mouse movement tracking
+ mouseState.captured = false;
+ document.removeEventListener('mousemove', handleMouseMovement);
+ // Reset mouse state
+ mouseState.x = 0;
+ mouseState.y = 0;
+ mousePosition.textContent = `Mouse: ${mouseState.x.toFixed(2)}, ${mouseState.y.toFixed(2)}`;
+ throttledSendMouseInput();
+ }
+}
+
+// Mouse tracking with pointer lock
+function handleMouseMovement(event) {
+ if (mouseState.captured) {
+ // Use movement for mouse look when captured
+ const sensitivity = 0.005; // Adjust sensitivity
+ mouseState.x += event.movementX * sensitivity;
+ mouseState.y -= event.movementY * sensitivity; // Invert Y for intuitive camera control
+
+ // Clamp values
+ mouseState.x = Math.max(-1, Math.min(1, mouseState.x));
+ mouseState.y = Math.max(-1, Math.min(1, mouseState.y));
+
+ // Update display
+ mousePosition.textContent = `Mouse: ${mouseState.x.toFixed(2)}, ${mouseState.y.toFixed(2)}`;
+
+ // Send to server (throttled)
+ throttledSendMouseInput();
+ }
+}
+
+// Mouse click to capture
+mouseTrackingArea.addEventListener('click', () => {
+ if (!mouseState.captured && isStreaming) {
+ requestPointerLock();
+ }
+});
+
+// Standard mouse tracking for when pointer is not locked
+mouseTrackingArea.addEventListener('mousemove', (event) => {
+ if (!mouseState.captured) {
+ // Calculate normalized coordinates relative to the center of the tracking area
+ const rect = mouseTrackingArea.getBoundingClientRect();
+ const centerX = rect.width / 2;
+ const centerY = rect.height / 2;
+
+ // Calculate relative position from center (-1 to 1)
+ const relX = (event.clientX - rect.left - centerX) / centerX;
+ const relY = (event.clientY - rect.top - centerY) / centerY;
+
+ // Scale down for smoother movement (similar to conditions.py)
+ const scaleFactor = 0.05;
+ mouseState.x = relX * scaleFactor;
+ mouseState.y = -relY * scaleFactor; // Invert Y for intuitive camera control
+
+ // Update display
+ mousePosition.textContent = `Mouse: ${mouseState.x.toFixed(2)}, ${mouseState.y.toFixed(2)}`;
+
+ // Send to server (throttled)
+ throttledSendMouseInput();
+ }
+});
+
+// Throttle mouse movement to avoid flooding the server
+const throttledSendMouseInput = (() => {
+ let lastSentTime = 0;
+ const interval = 50; // milliseconds
+
+ return () => {
+ const now = performance.now();
+ if (now - lastSentTime >= interval) {
+ sendMouseInput(mouseState.x, mouseState.y);
+ lastSentTime = now;
+ }
+ };
+})();
+
+// Toggle panel collapse/expand
+function togglePanel(panelId) {
+ const panel = document.getElementById(panelId);
+ const button = panel.querySelector('.toggle-button');
+
+ if (panel.classList.contains('collapsed')) {
+ // Expand the panel
+ panel.classList.remove('collapsed');
+ button.textContent = '−'; // Minus sign
+ } else {
+ // Collapse the panel
+ panel.classList.add('collapsed');
+ button.textContent = '+'; // Plus sign
+ }
+}
+
+// Initialize the UI
+resetUI();
+
+// Make panel headers clickable
+document.querySelectorAll('.panel-header').forEach(header => {
+ header.addEventListener('click', () => {
+ const panelId = header.parentElement.id;
+ togglePanel(panelId);
+ });
+});
\ No newline at end of file
diff --git a/example/engine.py b/example/engine.py
new file mode 100644
index 0000000000000000000000000000000000000000..8d5f724a95a78a72848435427171ca45a48a6008
--- /dev/null
+++ b/example/engine.py
@@ -0,0 +1,332 @@
+#!/usr/bin/env python3
+# -*- coding: utf-8 -*-
+
+"""
+MatrixGame Engine
+
+This module handles the core rendering and model inference for the MatrixGame project.
+"""
+
+import os
+import logging
+import argparse
+import time
+import torch
+import numpy as np
+from PIL import Image
+import cv2
+from einops import rearrange
+from diffusers.utils import load_image
+from diffusers.video_processor import VideoProcessor
+from typing import Dict, List, Tuple, Any, Optional, Union
+
+# MatrixGame specific imports
+from matrixgame.sample.pipeline_matrixgame import MatrixGameVideoPipeline
+from matrixgame.model_variants import get_dit
+from matrixgame.vae_variants import get_vae
+from matrixgame.encoder_variants import get_text_enc
+from matrixgame.model_variants.matrixgame_dit_src import MGVideoDiffusionTransformerI2V
+from matrixgame.sample.flow_matching_scheduler_matrixgame import FlowMatchDiscreteScheduler
+from teacache_forward import teacache_forward
+
+# Import utility functions
+from utils import (
+ visualize_controls,
+ frame_to_jpeg,
+ load_scene_frames,
+ logger
+)
+
+class MatrixGameEngine:
+ """
+ Core engine for MatrixGame model inference and frame generation.
+ """
+ def __init__(self, args: Optional[argparse.Namespace] = None):
+ """
+ Initialize the MatrixGame engine with configuration parameters.
+
+ Args:
+ args: Optional parsed command line arguments for model configuration
+ """
+ # Set default parameters if args not provided
+ self.frame_width = getattr(args, 'frame_width', 640)
+ self.frame_height = getattr(args, 'frame_height', 360)
+ self.fps = getattr(args, 'fps', 16)
+ self.inference_steps = getattr(args, 'inference_steps', 20)
+ self.guidance_scale = getattr(args, 'guidance_scale', 6.0)
+ self.num_pre_frames = getattr(args, 'num_pre_frames', 3)
+
+ # Initialize state
+ self.frame_count = 0
+ self.device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
+ self.weight_dtype = torch.bfloat16 if torch.cuda.is_available() else torch.float32
+
+ # Model paths from environment or args
+ self.vae_path = os.environ.get("VAE_PATH", "./models/matrixgame/vae/")
+ self.dit_path = os.environ.get("DIT_PATH", "./models/matrixgame/dit/")
+ self.textenc_path = os.environ.get("TEXTENC_PATH", "./models/matrixgame")
+
+ # Cache scene initial frames
+ self.scenes = {
+ 'forest': load_scene_frames('forest', self.frame_width, self.frame_height),
+ 'desert': load_scene_frames('desert', self.frame_width, self.frame_height),
+ 'beach': load_scene_frames('beach', self.frame_width, self.frame_height),
+ 'hills': load_scene_frames('hills', self.frame_width, self.frame_height),
+ 'river': load_scene_frames('river', self.frame_width, self.frame_height),
+ 'icy': load_scene_frames('icy', self.frame_width, self.frame_height),
+ 'mushroom': load_scene_frames('mushroom', self.frame_width, self.frame_height),
+ 'plain': load_scene_frames('plain', self.frame_width, self.frame_height)
+ }
+
+ # Cache initial images for model input
+ self.scene_initial_images = {}
+
+ # Initialize MatrixGame pipeline
+ self.model_loaded = False
+ if torch.cuda.is_available():
+ try:
+ self._init_models()
+ self.model_loaded = True
+ logger.info("MatrixGame models loaded successfully")
+ except Exception as e:
+ logger.error(f"Failed to initialize MatrixGame models: {str(e)}")
+ logger.info("Falling back to frame cycling mode")
+ else:
+ logger.warning("CUDA not available. Using frame cycling mode only.")
+
+ def _init_models(self):
+ """Initialize MatrixGame models (VAE, text encoder, transformer)"""
+ # Initialize flow matching scheduler
+ self.scheduler = FlowMatchDiscreteScheduler(
+ shift=15.0,
+ reverse=True,
+ solver="euler"
+ )
+
+ # Initialize VAE
+ try:
+ self.vae = get_vae("matrixgame", self.vae_path, self.weight_dtype)
+ self.vae.requires_grad_(False)
+ self.vae.eval()
+ self.vae.enable_tiling()
+ logger.info("VAE model loaded successfully")
+ except Exception as e:
+ logger.error(f"Error loading VAE model: {str(e)}")
+ raise
+
+ # Initialize DIT (Transformer)
+ try:
+ dit = MGVideoDiffusionTransformerI2V.from_pretrained(self.dit_path)
+ dit.requires_grad_(False)
+ dit.eval()
+ logger.info("DIT model loaded successfully")
+ except Exception as e:
+ logger.error(f"Error loading DIT model: {str(e)}")
+ raise
+
+ # Initialize text encoder
+ try:
+ self.text_enc = get_text_enc('matrixgame', self.textenc_path, weight_dtype=self.weight_dtype, i2v_type='refiner')
+ logger.info("Text encoder loaded successfully")
+ except Exception as e:
+ logger.error(f"Error loading text encoder: {str(e)}")
+ raise
+
+ # Initialize pipeline
+ try:
+ self.pipeline = MatrixGameVideoPipeline(
+ vae=self.vae.vae,
+ text_encoder=self.text_enc,
+ transformer=dit,
+ scheduler=self.scheduler,
+ ).to(self.weight_dtype).to(self.device)
+ logger.info("Pipeline initialized successfully")
+ except Exception as e:
+ logger.error(f"Error initializing pipeline: {str(e)}")
+ raise
+
+ # Configure teacache for the transformer
+ self.pipeline.transformer.__class__.enable_teacache = True
+ self.pipeline.transformer.__class__.cnt = 0
+ self.pipeline.transformer.__class__.num_steps = self.inference_steps
+ self.pipeline.transformer.__class__.accumulated_rel_l1_distance = 0
+ self.pipeline.transformer.__class__.rel_l1_thresh = 0.075
+ self.pipeline.transformer.__class__.previous_modulated_input = None
+ self.pipeline.transformer.__class__.previous_residual = None
+ self.pipeline.transformer.__class__.forward = teacache_forward
+
+ # Preprocess initial images for all scenes
+ for scene_name, frames in self.scenes.items():
+ if frames:
+ # Use first frame as initial image
+ self.scene_initial_images[scene_name] = self._preprocess_image(frames[0])
+
+ def _preprocess_image(self, image_array: np.ndarray) -> torch.Tensor:
+ """
+ Preprocess an image for the model.
+
+ Args:
+ image_array: Input image as numpy array
+
+ Returns:
+ torch.Tensor: Preprocessed image tensor
+ """
+ # Convert numpy array to PIL Image if needed
+ if isinstance(image_array, np.ndarray):
+ image = Image.fromarray(image_array)
+ else:
+ image = image_array
+
+ # Preprocess for VAE
+ vae_scale_factor = 2 ** (len(self.vae.config.block_out_channels) - 1) if hasattr(self, 'vae') else 8
+ video_processor = VideoProcessor(vae_scale_factor=vae_scale_factor)
+ initial_image = video_processor.preprocess(image, height=self.frame_height, width=self.frame_width)
+
+ # Add past frames for stability (use same frame repeated)
+ past_frames = initial_image.repeat(self.num_pre_frames, 1, 1, 1)
+ initial_image = torch.cat([initial_image, past_frames], dim=0)
+
+ return initial_image
+
+ def generate_frame(self, scene_name: str, keyboard_condition: Optional[List] = None,
+ mouse_condition: Optional[List] = None) -> bytes:
+ """
+ Generate the next frame based on current conditions using MatrixGame model.
+
+ Args:
+ scene_name: Name of the current scene
+ keyboard_condition: Keyboard input state
+ mouse_condition: Mouse input state
+
+ Returns:
+ bytes: JPEG bytes of the frame
+ """
+ # Check if model is loaded
+ if not self.model_loaded or not torch.cuda.is_available():
+ # Fall back to frame cycling for demo mode or if models failed to load
+ return self._fallback_frame(scene_name, keyboard_condition, mouse_condition)
+ else:
+ # Use MatrixGame model for frame generation
+ try:
+ # Get initial image for this scene
+ initial_image = self.scene_initial_images.get(scene_name)
+ if initial_image is None:
+ # Use forest as default if we don't have an initial image for this scene
+ initial_image = self.scene_initial_images.get('forest')
+ if initial_image is None:
+ # If we still don't have an initial image, fall back to frame cycling
+ logger.error(f"No initial image available for scene {scene_name}")
+ return self._fallback_frame(scene_name, keyboard_condition, mouse_condition)
+
+ # Prepare input tensors (move to device and format correctly)
+ if keyboard_condition is None:
+ keyboard_condition = [[0, 0, 0, 0, 0, 0]]
+ if mouse_condition is None:
+ mouse_condition = [[0, 0]]
+
+ # Convert conditions to tensors
+ keyboard_tensor = torch.tensor(keyboard_condition, dtype=torch.float32)
+ mouse_tensor = torch.tensor(mouse_condition, dtype=torch.float32)
+
+ # Move to device and convert to correct dtype
+ keyboard_tensor = keyboard_tensor.to(self.weight_dtype).to(self.device)
+ mouse_tensor = mouse_tensor.to(self.weight_dtype).to(self.device)
+
+ # Get the first frame from the scene for semantic conditioning
+ scene_frames = self.scenes.get(scene_name, self.scenes['forest'])
+ if not scene_frames:
+ return self._fallback_frame(scene_name, keyboard_condition, mouse_condition)
+
+                semantic_image = Image.fromarray(scene_frames[0])
+
+ # Generate a single frame with the model
+ # Use fewer inference steps for interactive frame generation
+ with torch.no_grad():
+ # Generate a short video (we'll just use the first frame)
+ # We're using a short length (3 frames) for real-time performance
+ video = self.pipeline(
+ height=self.frame_height,
+ width=self.frame_width,
+ video_length=3, # Generate a very short video for speed
+ mouse_condition=mouse_tensor,
+ keyboard_condition=keyboard_tensor,
+ initial_image=initial_image,
+ num_inference_steps=self.inference_steps,
+ guidance_scale=self.guidance_scale,
+ embedded_guidance_scale=None,
+ data_type="video",
+ vae_ver='884-16c-hy',
+ enable_tiling=True,
+ generator=torch.Generator(device=self.device).manual_seed(42),
+ i2v_type='refiner',
+ semantic_images=semantic_image
+ ).videos[0]
+
+ # Convert video tensor to numpy array (use first frame)
+ video_frame = video[0].permute(1, 2, 0).cpu().numpy()
+ video_frame = (video_frame * 255).astype(np.uint8)
+ frame = video_frame
+
+ # Increment frame counter
+ self.frame_count += 1
+
+ except Exception as e:
+ logger.error(f"Error generating frame with MatrixGame model: {str(e)}")
+ # Fall back to cycling demo frames if model generation fails
+ return self._fallback_frame(scene_name, keyboard_condition, mouse_condition)
+
+ # Add visualization of input controls
+ frame = visualize_controls(
+ frame, keyboard_condition, mouse_condition,
+ self.frame_width, self.frame_height
+ )
+
+ # Convert frame to JPEG
+ return frame_to_jpeg(frame, self.frame_height, self.frame_width)
+
+ def _fallback_frame(self, scene_name: str, keyboard_condition: Optional[List] = None,
+ mouse_condition: Optional[List] = None) -> bytes:
+ """
+ Generate a fallback frame when model generation fails.
+
+ Args:
+ scene_name: Name of the current scene
+ keyboard_condition: Keyboard input state
+ mouse_condition: Mouse input state
+
+ Returns:
+ bytes: JPEG bytes of the frame
+ """
+ scene_frames = self.scenes.get(scene_name, self.scenes['forest'])
+ frame_idx = self.frame_count % len(scene_frames)
+ frame = scene_frames[frame_idx].copy()
+ self.frame_count += 1
+
+ # Add fallback mode indicator
+ cv2.putText(frame, "Fallback mode",
+ (10, self.frame_height - 20),
+ cv2.FONT_HERSHEY_SIMPLEX, 0.5, (0, 0, 255), 1)
+
+ # Add visualization of input controls
+ frame = visualize_controls(
+ frame, keyboard_condition, mouse_condition,
+ self.frame_width, self.frame_height
+ )
+
+ # Convert frame to JPEG
+ return frame_to_jpeg(frame, self.frame_height, self.frame_width)
+
+ def get_valid_scenes(self) -> List[str]:
+ """
+ Get a list of valid scene names.
+
+ Returns:
+ List[str]: List of valid scene names
+ """
+ return list(self.scenes.keys())
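+
+
+if __name__ == "__main__":
+    # Smoke test (sketch): render a single frame from the default scene.
+    # Assumes the scene assets used by load_scene_frames are present; without
+    # a GPU (or if model loading fails) this exercises the fallback path.
+    engine = MatrixGameEngine()
+    jpeg_bytes = engine.generate_frame("forest")
+    print(f"Generated frame: {len(jpeg_bytes)} bytes")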
\ No newline at end of file
diff --git a/example/index.html b/example/index.html
new file mode 100644
index 0000000000000000000000000000000000000000..c215658cef7b396ba2435527ea1705a097180f5d
--- /dev/null
+++ b/example/index.html
@@ -0,0 +1,329 @@
+<!DOCTYPE html>
+<html lang="en">
+<head>
+    <meta charset="utf-8">
+    <meta name="viewport" content="width=device-width, initial-scale=1">
+    <title>MatrixGame Client</title>
+    <style>
+        /* Minimal styling; the .active and .collapsed classes are toggled by client.js */
+        body { font-family: sans-serif; background: #1e1e1e; color: #eee; margin: 20px; }
+        button, select { margin-right: 8px; }
+        #mouse-tracking-area { display: inline-block; position: relative; cursor: crosshair; }
+        #game-canvas { display: block; max-width: 100%; background: #000; }
+        .status-bar span { margin-right: 16px; }
+        .panel { border: 1px solid #444; margin-top: 12px; }
+        .panel-header { padding: 6px 10px; background: #333; cursor: pointer; }
+        .panel.collapsed .panel-content { display: none; }
+        .panel-content { padding: 10px; }
+        .key-indicator { display: inline-block; min-width: 2em; padding: 6px 10px; margin: 2px;
+                         border: 1px solid #666; text-align: center; }
+        .key-indicator.active { background: #4caf50; color: #000; }
+        #connection-log { max-height: 200px; overflow-y: auto; font-size: 12px; }
+    </style>
+</head>
+<body>
+    <h1>MatrixGame Client</h1>
+
+    <div class="controls">
+        <button id="connect-btn">Connect</button>
+        <button id="start-stream-btn" disabled>Start Stream</button>
+        <button id="stop-stream-btn" disabled>Stop Stream</button>
+        <select id="scene-select" disabled>
+            <option value="forest">Forest</option>
+        </select>
+    </div>
+
+    <div class="status-bar">
+        <span id="mouse-position">Mouse: 0.00, 0.00</span>
+        <span id="fps-counter">FPS: 0</span>
+    </div>
+
+    <div id="mouse-tracking-area">
+        <img id="game-canvas" alt="Game stream">
+    </div>
+
+    <div id="keyboard-panel" class="panel">
+        <div class="panel-header">Keyboard Controls <button class="toggle-button">−</button></div>
+        <div class="panel-content">
+            <div><span id="key-w" class="key-indicator">W</span></div>
+            <div>
+                <span id="key-a" class="key-indicator">A</span>
+                <span id="key-s" class="key-indicator">S</span>
+                <span id="key-d" class="key-indicator">D</span>
+            </div>
+            <div>
+                <span id="key-space" class="key-indicator">SPACE</span>
+                <span id="key-shift" class="key-indicator">SHIFT</span>
+            </div>
+            <p>W or ↑ = Forward, S or ↓ = Back, A or ← = Left, D or → = Right</p>
+            <p>Space = Jump, Shift = Attack</p>
+            <p>Click on game view to capture mouse (ESC to release)</p>
+            <p>Mouse = Look around</p>
+        </div>
+    </div>
+
+    <div id="log-panel" class="panel">
+        <div class="panel-header">Connection Log <button class="toggle-button">−</button></div>
+        <div class="panel-content">
+            <div id="connection-log">
+                <div class="log-entry">Waiting to connect...</div>
+            </div>
+        </div>
+    </div>
+
+    <!-- client.js derives its WebSocket base path from this script src -->
+    <script src="assets/client.js"></script>
+</body>
+</html>
\ No newline at end of file
diff --git a/example/requirements.txt b/example/requirements.txt
new file mode 100644
index 0000000000000000000000000000000000000000..f903e55a951ef2222ac45cd39ee12716e34c5142
--- /dev/null
+++ b/example/requirements.txt
@@ -0,0 +1,23 @@
+diffusers==0.32.2
+einops==0.8.1
+
+#flash_attn==2.7.4.post1
+flash-attn @ https://github.com/Dao-AILab/flash-attention/releases/download/v2.7.4.post1/flash_attn-2.7.4.post1+cu12torch2.4cxx11abiFALSE-cp310-cp310-linux_x86_64.whl
+
+ftfy==6.3.1
+imageio==2.34.0
+numpy==1.24.4
+opencv_python==4.9.0.80
+opencv_python_headless==4.9.0.80
+packaging==25.0
+peft==0.14.0
+Pillow==11.2.1
+regex==2024.11.6
+safetensors==0.5.3
+torch==2.5.1
+torchvision==0.20.1
+torchaudio==2.5.1
+transformers==4.47.1
+aiohttp==3.9.3
+jinja2==3.1.3
+python-multipart==0.0.6
\ No newline at end of file
diff --git a/example/server.py b/example/server.py
new file mode 100644
index 0000000000000000000000000000000000000000..67b1703c3761987a855ad09c05f482d339d35495
--- /dev/null
+++ b/example/server.py
@@ -0,0 +1,649 @@
+#!/usr/bin/env python3
+# -*- coding: utf-8 -*-
+
+"""
+MatrixGame WebSocket Gaming Server
+
+This script implements a WebSocket server for the MatrixGame project,
+allowing real-time streaming of game frames based on player inputs.
+"""
+
+import asyncio
+import json
+import logging
+import os
+import pathlib
+import time
+import uuid
+import base64
+import argparse
+from typing import Dict, List, Any, Optional
+from aiohttp import web, WSMsgType
+
+# Import the game engine
+from engine import MatrixGameEngine
+from utils import logger, parse_model_args, setup_gpu_environment
+
+class GameSession:
+ """
+ Represents a user's gaming session.
+ Each WebSocket connection gets its own session with separate queues.
+ """
+ def __init__(self, user_id: str, ws: web.WebSocketResponse, game_manager):
+ self.user_id = user_id
+ self.ws = ws
+ self.game_manager = game_manager
+
+ # Create action queue for this user session
+ self.action_queue = asyncio.Queue()
+
+ # Session creation time
+ self.created_at = time.time()
+ self.last_activity = time.time()
+
+ # Game state
+ self.current_scene = "forest" # Default scene
+ self.is_streaming = False
+ self.stream_task = None
+
+ # Current input state
+ self.keyboard_state = [0, 0, 0, 0, 0, 0] # forward, back, left, right, jump, attack
+ self.mouse_state = [0, 0] # x, y
+
+ self.background_tasks = []
+
+ async def start(self):
+ """Start all the queue processors for this session"""
+ self.background_tasks = [
+ asyncio.create_task(self._process_action_queue()),
+ ]
+ logger.info(f"Started game session for user {self.user_id}")
+
+ async def stop(self):
+ """Stop all background tasks for this session"""
+ # Stop streaming if active
+ if self.is_streaming and self.stream_task:
+ self.is_streaming = False
+ self.stream_task.cancel()
+ try:
+ await self.stream_task
+ except asyncio.CancelledError:
+ pass
+
+ # Cancel other background tasks
+ for task in self.background_tasks:
+ task.cancel()
+
+ try:
+ # Wait for tasks to complete cancellation
+ await asyncio.gather(*self.background_tasks, return_exceptions=True)
+ except asyncio.CancelledError:
+ pass
+
+ logger.info(f"Stopped game session for user {self.user_id}")
+
+ async def _process_action_queue(self):
+ """Process game actions from the queue"""
+ while True:
+ data = await self.action_queue.get()
+ try:
+ action_type = data.get('action')
+
+ if action_type == 'start_stream':
+ result = await self._handle_start_stream(data)
+ elif action_type == 'stop_stream':
+ result = await self._handle_stop_stream(data)
+ elif action_type == 'keyboard_input':
+ result = await self._handle_keyboard_input(data)
+ elif action_type == 'mouse_input':
+ result = await self._handle_mouse_input(data)
+ elif action_type == 'change_scene':
+ result = await self._handle_scene_change(data)
+ else:
+ result = {
+ 'action': action_type,
+ 'requestId': data.get('requestId'),
+ 'success': False,
+ 'error': f'Unknown action: {action_type}'
+ }
+
+ # Send response back to the client
+ await self.ws.send_json(result)
+
+ # Update last activity time
+ self.last_activity = time.time()
+
+ except Exception as e:
+ logger.error(f"Error processing action for user {self.user_id}: {str(e)}")
+ try:
+ await self.ws.send_json({
+ 'action': data.get('action'),
+ 'requestId': data.get('requestId', 'unknown'),
+ 'success': False,
+ 'error': f'Error processing action: {str(e)}'
+ })
+ except Exception as send_error:
+ logger.error(f"Error sending error response: {send_error}")
+ finally:
+ self.action_queue.task_done()
+
+ async def _handle_start_stream(self, data: Dict) -> Dict:
+ """Handle request to start streaming frames"""
+ if self.is_streaming:
+ return {
+ 'action': 'start_stream',
+ 'requestId': data.get('requestId'),
+ 'success': False,
+ 'error': 'Stream already active'
+ }
+
+ fps = data.get('fps', 16)
+ self.is_streaming = True
+ self.stream_task = asyncio.create_task(self._stream_frames(fps))
+
+ return {
+ 'action': 'start_stream',
+ 'requestId': data.get('requestId'),
+ 'success': True,
+ 'message': f'Streaming started at {fps} FPS'
+ }
+
+ async def _handle_stop_stream(self, data: Dict) -> Dict:
+ """Handle request to stop streaming frames"""
+ if not self.is_streaming:
+ return {
+ 'action': 'stop_stream',
+ 'requestId': data.get('requestId'),
+ 'success': False,
+ 'error': 'No active stream to stop'
+ }
+
+ self.is_streaming = False
+ if self.stream_task:
+ self.stream_task.cancel()
+ try:
+ await self.stream_task
+ except asyncio.CancelledError:
+ pass
+ self.stream_task = None
+
+ return {
+ 'action': 'stop_stream',
+ 'requestId': data.get('requestId'),
+ 'success': True,
+ 'message': 'Streaming stopped'
+ }
+
+ async def _handle_keyboard_input(self, data: Dict) -> Dict:
+ """Handle keyboard input from client"""
+ key = data.get('key', '')
+ pressed = data.get('pressed', False)
+
+ # Map key to keyboard state index
+ key_map = {
+ 'w': 0, 'forward': 0,
+ 's': 1, 'back': 1, 'backward': 1,
+ 'a': 2, 'left': 2,
+ 'd': 3, 'right': 3,
+ 'space': 4, 'jump': 4,
+ 'shift': 5, 'attack': 5, 'ctrl': 5
+ }
+
+ if key.lower() in key_map:
+ key_idx = key_map[key.lower()]
+ self.keyboard_state[key_idx] = 1 if pressed else 0
+
+ return {
+ 'action': 'keyboard_input',
+ 'requestId': data.get('requestId'),
+ 'success': True,
+ 'keyboardState': self.keyboard_state
+ }
+
+ async def _handle_mouse_input(self, data: Dict) -> Dict:
+ """Handle mouse movement/input from client"""
+ mouse_x = data.get('x', 0)
+ mouse_y = data.get('y', 0)
+
+        # Update mouse state (the client sends values already normalized to [-1, 1])
+        self.mouse_state = [float(mouse_x), float(mouse_y)]
+
+ return {
+ 'action': 'mouse_input',
+ 'requestId': data.get('requestId'),
+ 'success': True,
+ 'mouseState': self.mouse_state
+ }
+
+ async def _handle_scene_change(self, data: Dict) -> Dict:
+ """Handle scene change requests"""
+ scene_name = data.get('scene', 'forest')
+ valid_scenes = self.game_manager.valid_scenes
+
+ if scene_name not in valid_scenes:
+ return {
+ 'action': 'change_scene',
+ 'requestId': data.get('requestId'),
+ 'success': False,
+ 'error': f'Invalid scene: {scene_name}. Valid scenes are: {", ".join(valid_scenes)}'
+ }
+
+ self.current_scene = scene_name
+
+ return {
+ 'action': 'change_scene',
+ 'requestId': data.get('requestId'),
+ 'success': True,
+ 'scene': scene_name
+ }
+
+ async def _stream_frames(self, fps: int):
+ """Stream frames to the client at the specified FPS"""
+ frame_interval = 1.0 / fps # Time between frames in seconds
+
+ try:
+ while self.is_streaming:
+ start_time = time.time()
+
+ # Generate frame based on current keyboard and mouse state
+ keyboard_condition = [self.keyboard_state]
+ mouse_condition = [self.mouse_state]
+
+ # Use the engine to generate the next frame
+ frame_bytes = self.game_manager.engine.generate_frame(
+ self.current_scene, keyboard_condition, mouse_condition
+ )
+
+ # Encode as base64 for sending in JSON
+ frame_base64 = base64.b64encode(frame_bytes).decode('utf-8')
+
+ # Send frame to client
+ await self.ws.send_json({
+ 'action': 'frame',
+ 'frameData': frame_base64,
+ 'timestamp': time.time()
+ })
+
+ # Calculate sleep time to maintain FPS
+ elapsed = time.time() - start_time
+ sleep_time = max(0, frame_interval - elapsed)
+ await asyncio.sleep(sleep_time)
+
+ except asyncio.CancelledError:
+ logger.info(f"Frame streaming cancelled for user {self.user_id}")
+ except Exception as e:
+ logger.error(f"Error in frame streaming for user {self.user_id}: {str(e)}")
+ if self.ws.closed:
+ logger.info(f"WebSocket closed for user {self.user_id}")
+ return
+
+ # Notify client of error
+ try:
+ await self.ws.send_json({
+ 'action': 'frame_error',
+ 'error': f'Streaming error: {str(e)}'
+ })
+            except Exception:
+                pass
+
+ # Stop streaming
+ self.is_streaming = False
+
+class GameManager:
+ """
+ Manages all active gaming sessions and shared resources.
+ """
+ def __init__(self, args: argparse.Namespace):
+ self.sessions = {}
+ self.session_lock = asyncio.Lock()
+
+ # Initialize game engine
+ self.engine = MatrixGameEngine(args)
+
+ # Load valid scenes from engine
+ self.valid_scenes = self.engine.get_valid_scenes()
+
+ async def create_session(self, user_id: str, ws: web.WebSocketResponse) -> GameSession:
+ """Create a new game session"""
+ async with self.session_lock:
+ # Create a new session for this user
+ session = GameSession(user_id, ws, self)
+ await session.start()
+ self.sessions[user_id] = session
+ return session
+
+ async def delete_session(self, user_id: str) -> None:
+ """Delete a game session and clean up resources"""
+ async with self.session_lock:
+ if user_id in self.sessions:
+ session = self.sessions[user_id]
+ await session.stop()
+ del self.sessions[user_id]
+ logger.info(f"Deleted game session for user {user_id}")
+
+ def get_session(self, user_id: str) -> Optional[GameSession]:
+ """Get a game session if it exists"""
+ return self.sessions.get(user_id)
+
+ async def close_all_sessions(self) -> None:
+ """Close all active sessions (used during shutdown)"""
+ async with self.session_lock:
+ for user_id, session in list(self.sessions.items()):
+ await session.stop()
+ self.sessions.clear()
+ logger.info("Closed all active game sessions")
+
+ @property
+ def session_count(self) -> int:
+ """Get the number of active sessions"""
+ return len(self.sessions)
+
+ def get_session_stats(self) -> Dict:
+ """Get statistics about active sessions"""
+ stats = {
+ 'total_sessions': len(self.sessions),
+ 'active_scenes': {},
+ 'streaming_sessions': 0
+ }
+
+ # Count sessions by scene and streaming status
+ for session in self.sessions.values():
+ scene = session.current_scene
+ stats['active_scenes'][scene] = stats['active_scenes'].get(scene, 0) + 1
+ if session.is_streaming:
+ stats['streaming_sessions'] += 1
+
+ return stats
+
+# Create global game manager
+game_manager = None
+
+async def status_handler(request: web.Request) -> web.Response:
+ """Handler for API status endpoint"""
+ # Get session statistics
+ session_stats = game_manager.get_session_stats()
+
+ return web.json_response({
+ 'product': 'MatrixGame WebSocket Server',
+ 'version': '1.0.0',
+ 'active_sessions': session_stats,
+ 'available_scenes': game_manager.valid_scenes
+ })
+
+async def root_handler(request: web.Request) -> web.Response:
+ """Handler for serving the client at the root path"""
+    client_path = pathlib.Path(__file__).parent / 'index.html'
+
+ with open(client_path, 'r') as file:
+ html_content = file.read()
+
+ return web.Response(text=html_content, content_type='text/html')
+
+async def websocket_handler(request: web.Request) -> web.WebSocketResponse:
+ """Handle WebSocket connections with robust error handling"""
+ logger.info(f"WebSocket connection attempt - PATH: {request.path}, QUERY: {request.query_string}")
+
+ # Log request headers at debug level only (could contain sensitive information)
+ logger.debug(f"WebSocket request headers: {dict(request.headers)}")
+
+ # Prepare a WebSocket response with appropriate settings
+ ws = web.WebSocketResponse(
+ max_msg_size=1024*1024*10, # 10MB max message size
+ timeout=60.0,
+ heartbeat=30.0 # Add heartbeat to keep connection alive
+ )
+
+ # Check if WebSocket protocol is supported
+ if not ws.can_prepare(request):
+ logger.error("Cannot prepare WebSocket: WebSocket protocol not supported")
+ return web.Response(status=400, text="WebSocket protocol not supported")
+
+ try:
+ logger.info("Preparing WebSocket connection...")
+ await ws.prepare(request)
+
+ # Generate a unique user ID for this connection
+ user_id = str(uuid.uuid4())
+
+ # Get client IP address
+ peername = request.transport.get_extra_info('peername')
+ if peername is not None:
+ client_ip = peername[0]
+ else:
+ client_ip = request.headers.get('X-Forwarded-For', 'unknown').split(',')[0].strip()
+
+ # Log connection success
+ logger.info(f"Client {user_id} connecting from IP: {client_ip} - WebSocket connection established")
+
+ # Track whether the session has been created, so the error path only cleans up when needed
+ is_session_created = False
+
+ try:
+ # Store the user ID in the websocket for easy access
+ ws.user_id = user_id
+
+ # Create a new session for this user
+ logger.info(f"Creating game session for user {user_id}")
+ user_session = await game_manager.create_session(user_id, ws)
+ is_session_created = True
+ logger.info(f"Game session created for user {user_id}")
+ except Exception as session_error:
+ logger.error(f"Error creating game session: {str(session_error)}", exc_info=True)
+ if not ws.closed:
+ await ws.close(code=1011, message=f"Server error: {str(session_error)}".encode())
+ if is_session_created:
+ await game_manager.delete_session(user_id)
+ return ws
+ except Exception as e:
+ logger.error(f"Error establishing WebSocket connection: {str(e)}", exc_info=True)
+ if not ws.closed and ws.prepared:
+ await ws.close(code=1011, message=f"Server error: {str(e)}".encode())
+ return ws
+
+ # Send initial welcome message
+ try:
+ await ws.send_json({
+ 'action': 'welcome',
+ 'userId': user_id,
+ 'message': 'Welcome to the MatrixGame WebSocket server!',
+ 'scenes': game_manager.valid_scenes
+ })
+ logger.info(f"Sent welcome message to user {user_id}")
+ except Exception as welcome_error:
+ logger.error(f"Error sending welcome message: {str(welcome_error)}")
+ if not ws.closed:
+ await ws.close(code=1011, message=b"Failed to send welcome message")
+ await game_manager.delete_session(user_id)
+ return ws
+
+ try:
+ async for msg in ws:
+ if msg.type == WSMsgType.TEXT:
+ try:
+ data = json.loads(msg.data)
+ action = data.get('action')
+
+ logger.debug(f"Received {action} message from user {user_id}")
+
+ if action == 'ping':
+ # Respond to ping immediately
+ await ws.send_json({
+ 'action': 'pong',
+ 'requestId': data.get('requestId'),
+ 'timestamp': time.time()
+ })
+ else:
+ # Route game actions to the session's action queue
+ await user_session.action_queue.put(data)
+
+ except json.JSONDecodeError:
+ logger.error(f"Invalid JSON from user {user_id}: {msg.data}")
+ if not ws.closed:
+ await ws.send_json({
+ 'error': 'Invalid JSON message',
+ 'success': False
+ })
+ except Exception as e:
+ logger.error(f"Error processing WebSocket message for user {user_id}: {str(e)}")
+ if not ws.closed:
+ await ws.send_json({
+ 'action': data.get('action') if 'data' in locals() else 'unknown',
+ 'success': False,
+ 'error': f'Error processing message: {str(e)}'
+ })
+
+ elif msg.type == WSMsgType.ERROR:
+ logger.error(f"WebSocket error for user {user_id}: {ws.exception()}")
+ break
+
+ elif msg.type == WSMsgType.CLOSE:
+ logger.info(f"WebSocket close received for user {user_id} (code: {msg.data}, message: {msg.extra})")
+ break
+
+ elif msg.type == WSMsgType.CLOSING:
+ logger.info(f"WebSocket closing for user {user_id}")
+ break
+
+ elif msg.type == WSMsgType.CLOSED:
+ logger.info(f"WebSocket already closed for user {user_id}")
+ break
+
+ except Exception as ws_error:
+ logger.error(f"Unexpected WebSocket error for user {user_id}: {str(ws_error)}", exc_info=True)
+ finally:
+ # Cleanup session
+ try:
+ logger.info(f"Cleaning up session for user {user_id}")
+ await game_manager.delete_session(user_id)
+ logger.info(f"Connection closed for user {user_id}")
+ except Exception as cleanup_error:
+ logger.error(f"Error during session cleanup for user {user_id}: {str(cleanup_error)}")
+
+ return ws
+
+async def init_app(args, base_path="") -> web.Application:
+ """Initialize the web application"""
+ global game_manager
+
+ # Initialize game manager with command line args
+ game_manager = GameManager(args)
+
+ app = web.Application(
+ client_max_size=1024**2*10 # 10MB max size
+ )
+
+ # Add cleanup logic
+ async def cleanup(app):
+ logger.info("Shutting down server, closing all sessions...")
+ await game_manager.close_all_sessions()
+
+ app.on_shutdown.append(cleanup)
+
+ # Configure permissive CORS for all routes (cross-origin WebSocket and API clients)
+ @web.middleware
+ async def cors_middleware(request, handler):
+ if request.method == 'OPTIONS':
+ # Handle preflight requests
+ resp = web.Response()
+ resp.headers['Access-Control-Allow-Origin'] = '*'
+ resp.headers['Access-Control-Allow-Methods'] = 'GET, POST, OPTIONS'
+ resp.headers['Access-Control-Allow-Headers'] = 'Content-Type, X-Requested-With'
+ return resp
+
+ # Normal request, call the handler
+ resp = await handler(request)
+
+ # Add CORS headers to the response
+ resp.headers['Access-Control-Allow-Origin'] = '*'
+ resp.headers['Access-Control-Allow-Methods'] = 'GET, POST, OPTIONS'
+ resp.headers['Access-Control-Allow-Headers'] = 'Content-Type, X-Requested-With'
+ return resp
+
+ app.middlewares.append(cors_middleware)
+
+ # Add a debug endpoint to help diagnose WebSocket issues
+ async def debug_handler(request):
+ client_ip = request.remote
+ headers = dict(request.headers)
+ server_host = request.host
+
+ debug_info = {
+ "client_ip": client_ip,
+ "server_host": server_host,
+ "headers": headers,
+ "request_path": request.path,
+ "server_time": time.time(),
+ "base_path": base_path,
+ "websocket_route": f"{base_path}/ws",
+ "all_routes": [route.name for route in app.router.routes() if route.name],
+ "server_info": {
+ "active_sessions": game_manager.session_count,
+ "available_scenes": game_manager.valid_scenes
+ }
+ }
+
+ return web.json_response(debug_info)
+
+ # Set up routes with the base_path
+ # Add multiple WebSocket routes to ensure compatibility
+ logger.info(f"Setting up WebSocket route at {base_path}/ws")
+ app.router.add_get(f'{base_path}/ws', websocket_handler, name='ws_handler')
+
+ # Also add WebSocket route at the root for Hugging Face compatibility
+ if base_path:
+ logger.info(f"Adding additional WebSocket route at /ws")
+ app.router.add_get('/ws', websocket_handler, name='ws_root_handler')
+
+ # Add routes for API and debug endpoints
+ app.router.add_get(f'{base_path}/api/status', status_handler, name='status_handler')
+ app.router.add_get(f'{base_path}/api/debug', debug_handler, name='debug_handler')
+
+ # Serve the client at both the base path and root path for compatibility
+ app.router.add_get(f'{base_path}/', root_handler, name='root_handler')
+
+ # Always serve at the root path for Hugging Face Spaces compatibility
+ if base_path:
+ app.router.add_get('/', root_handler, name='root_handler_no_base')
+
+ # Set up static file serving for the client assets
+ app.router.add_static(f'{base_path}/assets', pathlib.Path(__file__).parent / 'client', name='static_handler')
+
+ # Add static file serving at root for compatibility
+ if base_path:
+ app.router.add_static('/assets', pathlib.Path(__file__).parent / 'client', name='static_handler_no_base')
+
+ return app
+
+def parse_args() -> argparse.Namespace:
+ """Parse server-specific command line arguments"""
+ parser = argparse.ArgumentParser(description="MatrixGame WebSocket Server")
+ parser.add_argument("--host", type=str, default="0.0.0.0", help="Host IP to bind to")
+ parser.add_argument("--port", type=int, default=8080, help="Port to listen on")
+ parser.add_argument("--path", type=str, default="", help="Base path for the server (for proxy setups)")
+
+ # Parse server args first
+ server_args, remaining_args = parser.parse_known_args()
+
+ # Parse model args and combine
+ model_args = parse_model_args()
+
+ # Combine all args
+ combined_args = argparse.Namespace(**vars(server_args), **vars(model_args))
+
+ return combined_args
+
+if __name__ == '__main__':
+ # Configure GPU environment
+ setup_gpu_environment()
+
+ # Parse command line arguments
+ args = parse_args()
+
+ # Initialize app
+ # asyncio.run avoids the deprecated get_event_loop() pattern (matches reference_example/api.py)
+ app = asyncio.run(init_app(args, base_path=args.path))
+
+ # Start server
+ logger.info(f"Starting MatrixGame WebSocket Server at {args.host}:{args.port}")
+ web.run_app(app, host=args.host, port=args.port)
\ No newline at end of file
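
For reviewers who want to poke at the protocol above, here is a minimal client sketch. It is an illustration, not part of the PR: it assumes a server running locally on the default host/port with an empty base path, and only exercises the welcome and ping/pong exchange handled inline by `websocket_handler` (all other actions are queued on the session).

```python
import asyncio

import aiohttp


async def main() -> None:
    async with aiohttp.ClientSession() as session:
        # The /ws route registered in init_app (assumes a local server, no base path).
        async with session.ws_connect("http://localhost:8080/ws") as ws:
            # The server pushes a 'welcome' message right after the session is created.
            welcome = await ws.receive_json()
            print("user:", welcome["userId"], "scenes:", welcome["scenes"])

            # 'ping' is answered inline with a 'pong' echoing our requestId.
            await ws.send_json({"action": "ping", "requestId": "example-1"})
            pong = await ws.receive_json()
            print("pong at", pong["timestamp"])


asyncio.run(main())
```
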
diff --git a/example/utils.py b/example/utils.py
new file mode 100644
index 0000000000000000000000000000000000000000..f2c35aa64e6d4f4cc1452b3c4789143ce2be6ef7
--- /dev/null
+++ b/example/utils.py
@@ -0,0 +1,202 @@
+#!/usr/bin/env python3
+# -*- coding: utf-8 -*-
+
+"""
+MatrixGame Utility Functions
+
+This module contains helper functions and utilities for the MatrixGame project.
+"""
+
+import os
+import logging
+import argparse
+import torch
+import numpy as np
+import cv2
+from PIL import Image
+from typing import Dict, List, Tuple, Any, Optional, Union
+
+# Configure logging
+logging.basicConfig(
+ level=logging.INFO,
+ format='%(asctime)s - %(name)s - %(levelname)s - %(message)s'
+)
+logger = logging.getLogger(__name__)
+
+def setup_gpu_environment():
+ """
+ Configure the GPU environment and log GPU information.
+
+ Returns:
+ bool: True if CUDA is available, False otherwise
+ """
+ # Set CUDA memory allocation environment variable for better performance
+ os.environ.setdefault("PYTORCH_CUDA_ALLOC_CONF", "expandable_segments:True")
+
+ # Check if CUDA is available and log information
+ if torch.cuda.is_available():
+ gpu_count = torch.cuda.device_count()
+ gpu_info = []
+
+ for i in range(gpu_count):
+ gpu_name = torch.cuda.get_device_name(i)
+ gpu_memory = torch.cuda.get_device_properties(i).total_memory / (1024**3) # Convert to GB
+ gpu_info.append(f"GPU {i}: {gpu_name} ({gpu_memory:.2f} GB)")
+
+ logger.info(f"CUDA is available. Found {gpu_count} GPU(s):")
+ for info in gpu_info:
+ logger.info(f" {info}")
+ return True
+ else:
+ logger.warning("CUDA is not available. Running in CPU-only mode.")
+ return False
+
+def parse_model_args() -> argparse.Namespace:
+ """
+ Parse command line arguments for model paths and configuration.
+
+ Returns:
+ argparse.Namespace: Parsed arguments
+ """
+ parser = argparse.ArgumentParser(description="MatrixGame Model Configuration")
+
+ # Model paths
+ parser.add_argument("--model_root", type=str, default="./models/matrixgame",
+ help="Root directory for model files")
+ parser.add_argument("--dit_path", type=str, default=None,
+ help="Path to DIT model. If not provided, will use MODEL_ROOT/dit/")
+ parser.add_argument("--vae_path", type=str, default=None,
+ help="Path to VAE model. If not provided, will use MODEL_ROOT/vae/")
+ parser.add_argument("--textenc_path", type=str, default=None,
+ help="Path to text encoder model. If not provided, will use MODEL_ROOT")
+
+ # Model settings
+ parser.add_argument("--inference_steps", type=int, default=20,
+ help="Number of inference steps for frame generation (lower is faster)")
+ parser.add_argument("--guidance_scale", type=float, default=6.0,
+ help="Guidance scale for generation")
+ parser.add_argument("--frame_width", type=int, default=640,
+ help="Width of the generated frames")
+ parser.add_argument("--frame_height", type=int, default=360,
+ help="Height of the generated frames")
+ parser.add_argument("--num_pre_frames", type=int, default=3,
+ help="Number of pre-frames for conditioning")
+ parser.add_argument("--fps", type=int, default=16,
+ help="Frames per second for video")
+
+ # Use parse_known_args so server-level flags (e.g. --host, --port) don't trip this parser
+ args, _ = parser.parse_known_args()
+
+ # Set environment variables for model paths if provided
+ if args.model_root:
+ os.environ.setdefault("MODEL_ROOT", args.model_root)
+ if args.dit_path:
+ os.environ.setdefault("DIT_PATH", args.dit_path)
+ else:
+ os.environ.setdefault("DIT_PATH", os.path.join(os.environ.get("MODEL_ROOT", "./models/matrixgame"), "dit/"))
+ if args.vae_path:
+ os.environ.setdefault("VAE_PATH", args.vae_path)
+ else:
+ os.environ.setdefault("VAE_PATH", os.path.join(os.environ.get("MODEL_ROOT", "./models/matrixgame"), "vae/"))
+ if args.textenc_path:
+ os.environ.setdefault("TEXTENC_PATH", args.textenc_path)
+ else:
+ os.environ.setdefault("TEXTENC_PATH", os.environ.get("MODEL_ROOT", "./models/matrixgame"))
+
+ return args
+
+def visualize_controls(frame: np.ndarray, keyboard_condition: List, mouse_condition: List,
+ frame_width: int, frame_height: int) -> np.ndarray:
+ """
+ Visualize keyboard and mouse controls on the frame.
+
+ Args:
+ frame: The video frame to visualize on
+ keyboard_condition: Keyboard state as a list
+ mouse_condition: Mouse state as a list
+ frame_width: Width of the frame
+ frame_height: Height of the frame
+
+ Returns:
+ np.ndarray: Frame with visualized controls
+ """
+ # Clone the frame to avoid modifying the original
+ frame = frame.copy()
+
+ # If we have keyboard/mouse conditions, visualize them on the frame
+ if keyboard_condition:
+ # Visualize keyboard inputs
+ keys = ["W", "S", "A", "D", "JUMP", "ATTACK"]
+ for i, key_pressed in enumerate(keyboard_condition[0][:len(keys)]):
+ color = (0, 255, 0) if key_pressed else (100, 100, 100)
+ cv2.putText(frame, keys[i], (20 + i*100, 30),
+ cv2.FONT_HERSHEY_SIMPLEX, 0.7, color, 2)
+
+ if mouse_condition:
+ # Visualize mouse movement
+ mouse_x, mouse_y = mouse_condition[0]
+ # Scale mouse values for visualization
+ offset_x = int(mouse_x * 100)
+ offset_y = int(mouse_y * 100)
+ center_x, center_y = frame_width // 2, frame_height // 2
+ cv2.circle(frame, (center_x + offset_x, center_y - offset_y), 10, (255, 0, 0), -1)
+ cv2.putText(frame, f"Mouse: {mouse_x:.2f}, {mouse_y:.2f}",
+ (frame_width - 250, 30), cv2.FONT_HERSHEY_SIMPLEX, 0.7, (255, 0, 0), 2)
+
+ return frame
+
+def frame_to_jpeg(frame: np.ndarray, frame_height: int, frame_width: int) -> bytes:
+ """
+ Convert a frame to JPEG bytes.
+
+ Args:
+ frame: The video frame to convert
+ frame_height: Height of the frame for fallback
+ frame_width: Width of the frame for fallback
+
+ Returns:
+ bytes: JPEG bytes of the frame
+ """
+ success, buffer = cv2.imencode('.jpg', frame)
+ if not success:
+ logger.error("Failed to encode frame as JPEG")
+ # Return a blank frame
+ blank = np.ones((frame_height, frame_width, 3), dtype=np.uint8) * 100
+ success, buffer = cv2.imencode('.jpg', blank)
+
+ return buffer.tobytes()
+
+def load_scene_frames(scene_name: str, frame_width: int, frame_height: int) -> List[np.ndarray]:
+ """
+ Load initial frames for a scene from asset directory.
+
+ Args:
+ scene_name: Name of the scene
+ frame_width: Width to resize frames to
+ frame_height: Height to resize frames to
+
+ Returns:
+ List[np.ndarray]: List of frames as numpy arrays
+ """
+ frames = []
+ scene_dir = f"./GameWorldScore/asset/init_image/{scene_name}"
+
+ if os.path.exists(scene_dir):
+ image_files = sorted([f for f in os.listdir(scene_dir) if f.endswith(('.png', '.jpg'))])
+ for img_file in image_files:
+ try:
+ img_path = os.path.join(scene_dir, img_file)
+ img = Image.open(img_path).convert("RGB")
+ img = img.resize((frame_width, frame_height))
+ frames.append(np.array(img))
+ except Exception as e:
+ logger.error(f"Error loading image {img_file}: {str(e)}")
+
+ # If no frames were loaded, create a default colored frame with text
+ if not frames:
+ frame = np.ones((frame_height, frame_width, 3), dtype=np.uint8) * 100
+ # Add scene name as text
+ cv2.putText(frame, f"Scene: {scene_name}", (50, 180),
+ cv2.FONT_HERSHEY_SIMPLEX, 1, (255, 255, 255), 2)
+ frames.append(frame)
+
+ return frames
\ No newline at end of file
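
A small usage sketch for these helpers (not part of the PR; it assumes the module is importable as `example.utils`, and the scene name is illustrative). It uses the single-batch condition layout expected by `visualize_controls`: six key flags and one (x, y) mouse pair.

```python
import numpy as np

from example.utils import frame_to_jpeg, load_scene_frames, visualize_controls

W, H = 640, 360

# "forest" is a hypothetical scene name; load_scene_frames falls back to a
# gray placeholder frame if the scene assets are missing.
frame = load_scene_frames("forest", W, H)[0]

# One batch entry each: [W, S, A, D, JUMP, ATTACK] flags and an (x, y) mouse offset.
keyboard = [[1, 0, 0, 1, 0, 0]]
mouse = [[0.25, -0.10]]

annotated = visualize_controls(frame, keyboard, mouse, W, H)
print(f"JPEG size: {len(frame_to_jpeg(annotated, H, W))} bytes")
```
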
diff --git a/game/spawn/1/act.npy b/game/spawn/1/act.npy
new file mode 100644
index 0000000000000000000000000000000000000000..2924767c1a504eb1e15ce0ac28d39624e61fe7d0
--- /dev/null
+++ b/game/spawn/1/act.npy
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:7caabf75f45d4c8bae5c0b66dc2b5a3cbf3ab7dbf89521d6ba539c4f30048d75
+size 10688
diff --git a/game/spawn/1/full_res.npy b/game/spawn/1/full_res.npy
new file mode 100644
index 0000000000000000000000000000000000000000..4e6c6c2ff258a8d56c070de131ed13d63968d5d7
--- /dev/null
+++ b/game/spawn/1/full_res.npy
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:c186e607a8cc4922e17ee66c1f37dc4858adfef220b6bf48fbcda9bf75ffde34
+size 22260128
diff --git a/game/spawn/1/low_res.npy b/game/spawn/1/low_res.npy
new file mode 100644
index 0000000000000000000000000000000000000000..e7b92ab83886e42ab0fc6d3c4146d34a846093a0
--- /dev/null
+++ b/game/spawn/1/low_res.npy
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:08e31f7ef447a1dffda3b51b4102ae4301f5804145b63c71acc5c278a294b1ee
+size 368768
diff --git a/game/spawn/1/next_act.npy b/game/spawn/1/next_act.npy
new file mode 100644
index 0000000000000000000000000000000000000000..7c6afd9bd0075b320e33f95e49cad10748b53dbe
--- /dev/null
+++ b/game/spawn/1/next_act.npy
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:d0f2d1c96337459ddd84b9c1a8dbad9eb7284cb813a2f0ebd9cb4d757dd294e1
+size 105728
diff --git a/game/spawn/2/act.npy b/game/spawn/2/act.npy
new file mode 100644
index 0000000000000000000000000000000000000000..4745e120d72bd68c4de9b05c7b31ae9b6ba17894
--- /dev/null
+++ b/game/spawn/2/act.npy
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:619cdda3de7f55e48a753b64542354387294c2688a6e014af85b094996b8a486
+size 10688
diff --git a/game/spawn/2/full_res.npy b/game/spawn/2/full_res.npy
new file mode 100644
index 0000000000000000000000000000000000000000..7d5357363ae4968b166615ce8bde52240a94fcde
--- /dev/null
+++ b/game/spawn/2/full_res.npy
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:911389a5957acd8d7b96fccb1917ee4a4b0c74f0b9420f61580b571965dd99ff
+size 22260128
diff --git a/game/spawn/2/low_res.npy b/game/spawn/2/low_res.npy
new file mode 100644
index 0000000000000000000000000000000000000000..dc811a357c4143af270640a78f2a1a0be290bf2c
--- /dev/null
+++ b/game/spawn/2/low_res.npy
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:d8351be59118b4c2237800a9119937d472564e63c70a8d1911bc2d52ac3a95a2
+size 368768
diff --git a/game/spawn/2/next_act.npy b/game/spawn/2/next_act.npy
new file mode 100644
index 0000000000000000000000000000000000000000..d1a89ff69cc6d2aabd893a9c67cd58045b4fe61f
--- /dev/null
+++ b/game/spawn/2/next_act.npy
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:ba1b85fd4548f805352306a04e24368718328bfd513cb95305d0f9284fe9719f
+size 105728
diff --git a/game/spawn/3/act.npy b/game/spawn/3/act.npy
new file mode 100644
index 0000000000000000000000000000000000000000..795e24fb86302d7e4f38ef66d3a3ae87bd513fc6
--- /dev/null
+++ b/game/spawn/3/act.npy
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:60ff3ff40a48105e33feae08979dd4d7d7570984e10c950aae9308c08841400a
+size 10688
diff --git a/game/spawn/3/full_res.npy b/game/spawn/3/full_res.npy
new file mode 100644
index 0000000000000000000000000000000000000000..71f10372b77761a15339fb5f1758a10e59dbf86f
--- /dev/null
+++ b/game/spawn/3/full_res.npy
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:f1f8a9a266e05f8d8a741dadb697bf3ed89c1a166dab443c1d81091c3cf1824b
+size 22260128
diff --git a/game/spawn/3/low_res.npy b/game/spawn/3/low_res.npy
new file mode 100644
index 0000000000000000000000000000000000000000..de23d6a00891c2f79014c4981f6e2c1b528d1d19
--- /dev/null
+++ b/game/spawn/3/low_res.npy
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:0dd822892eb5201e23d5298da69523a4d43d740e37d7647d30a288a5b440991e
+size 368768
diff --git a/game/spawn/3/next_act.npy b/game/spawn/3/next_act.npy
new file mode 100644
index 0000000000000000000000000000000000000000..17f461d04b5278bad039cee29e2c1966a20b5528
--- /dev/null
+++ b/game/spawn/3/next_act.npy
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:40f2a5d1381a93fc46e58f7ac0bf5df9f507c5c3b6489e85a0896deba8da9dca
+size 105728
diff --git a/index.html b/index.html
new file mode 100644
index 0000000000000000000000000000000000000000..90e4922b2881ce5d0f18ee2e4adad4ec10f623d3
--- /dev/null
+++ b/index.html
@@ -0,0 +1,928 @@
+<!-- index.html (928 lines): single-page client for the AI Game Multiverse server.
+     The markup did not survive extraction in this excerpt; the recoverable content is:
+     page title "AI Game Multiverse" with the tagline "Play procedurally generated games using AI";
+     live "Mouse: 0.00, 0.00" and "FPS: 0" readouts over the game view;
+     a "Keyboard Controls" panel (W or ↑ = Forward, S or ↓ = Back, A or ← = Left, D or → = Right,
+     Space = Jump, Shift = Attack; click on the game view to capture the mouse, ESC to release,
+     Mouse = Look around);
+     and a "Connection Log" panel that opens with
+     "Welcome to AI Game Multiverse. Click Connect to begin." -->
\ No newline at end of file
diff --git a/reference_example/Dockerfile b/reference_example/Dockerfile
new file mode 100644
index 0000000000000000000000000000000000000000..427f95c1602875b9f737a838afd29ae250e84831
--- /dev/null
+++ b/reference_example/Dockerfile
@@ -0,0 +1,52 @@
+FROM nvidia/cuda:12.4.0-devel-ubuntu22.04
+
+ARG DEBIAN_FRONTEND=noninteractive
+
+ENV PYTHONUNBUFFERED=1
+
+RUN apt-get update && apt-get install --no-install-recommends -y \
+ build-essential \
+ python3.11 \
+ python3-pip \
+ python3-dev \
+ git \
+ curl \
+ ffmpeg \
+ libglib2.0-0 \
+ libsm6 \
+ libxrender1 \
+ libxext6 \
+ && apt-get clean && rm -rf /var/lib/apt/lists/*
+
+WORKDIR /code
+
+COPY ./requirements.txt /code/requirements.txt
+
+# Set up a new user named "user" with user ID 1000
+RUN useradd -m -u 1000 user
+# Switch to the "user" user
+USER user
+# Set home to the user's home directory
+ENV HOME=/home/user \
+ PATH=/home/user/.local/bin:$PATH
+
+# Set home to the user's home directory
+ENV PYTHONPATH=$HOME/app \
+ PYTHONUNBUFFERED=1 \
+ DATA_ROOT=/tmp/data
+
+RUN echo "Installing requirements.txt"
+RUN pip3 install --no-cache-dir --upgrade -r /code/requirements.txt
+
+# yeah.. this is manual for now
+#RUN flutter build web
+
+WORKDIR $HOME/app
+
+COPY --chown=user . $HOME/app
+
+EXPOSE 8080
+
+ENV PORT=8080
+
+CMD python3 api.py
diff --git a/reference_example/api.py b/reference_example/api.py
new file mode 100644
index 0000000000000000000000000000000000000000..5be3e44d40ab1e88c37f36332d9349e9ddde884b
--- /dev/null
+++ b/reference_example/api.py
@@ -0,0 +1,297 @@
+import asyncio
+import json
+import logging
+import os
+import pathlib
+import time
+import uuid
+from aiohttp import web, WSMsgType
+from typing import Dict, Any
+
+from api_core import VideoGenerationAPI
+from api_session import SessionManager
+from api_metrics import MetricsTracker
+from api_config import *
+
+# Configure logging
+logging.basicConfig(
+ level=logging.INFO,
+ format='%(asctime)s - %(name)s - %(levelname)s - %(message)s'
+)
+logger = logging.getLogger(__name__)
+
+# Create global session and metrics managers
+session_manager = SessionManager()
+metrics_tracker = MetricsTracker()
+
+# Dictionary to track connected anonymous clients by IP address
+anon_connections = {}
+anon_connection_lock = asyncio.Lock()
+
+async def status_handler(request: web.Request) -> web.Response:
+ """Handler for API status endpoint"""
+ api = session_manager.shared_api
+
+ # Get current busy status of all endpoints
+ endpoint_statuses = []
+ for ep in api.endpoint_manager.endpoints:
+ endpoint_statuses.append({
+ 'id': ep.id,
+ 'url': ep.url,
+ 'busy': ep.busy,
+ 'last_used': ep.last_used,
+ 'error_count': ep.error_count,
+ 'error_until': ep.error_until
+ })
+
+ # Get session statistics
+ session_stats = session_manager.get_session_stats()
+
+ # Get metrics
+ api_metrics = metrics_tracker.get_metrics()
+
+ return web.json_response({
+ 'product': PRODUCT_NAME,
+ 'version': PRODUCT_VERSION,
+ 'maintenance_mode': MAINTENANCE_MODE,
+ 'available_endpoints': len(VIDEO_ROUND_ROBIN_ENDPOINT_URLS),
+ 'endpoint_status': endpoint_statuses,
+ 'active_endpoints': sum(1 for ep in endpoint_statuses if not ep['busy'] and ('error_until' not in ep or ep['error_until'] < time.time())),
+ 'active_sessions': session_stats,
+ 'metrics': api_metrics
+ })
+
+async def metrics_handler(request: web.Request) -> web.Response:
+ """Handler for detailed metrics endpoint (protected)"""
+ # Check for API key in header or query param
+ auth_header = request.headers.get('Authorization', '')
+ api_key = None
+
+ if auth_header.startswith('Bearer '):
+ api_key = auth_header[7:]
+ else:
+ api_key = request.query.get('key')
+
+ # Validate API key (using SECRET_TOKEN as the API key)
+ if not api_key or api_key != SECRET_TOKEN:
+ return web.json_response({
+ 'error': 'Unauthorized'
+ }, status=401)
+
+ # Get detailed metrics
+ detailed_metrics = metrics_tracker.get_detailed_metrics()
+
+ return web.json_response(detailed_metrics)
+
+async def websocket_handler(request: web.Request) -> web.WebSocketResponse:
+ # Check if maintenance mode is enabled
+ if MAINTENANCE_MODE:
+ # Return an error response indicating maintenance mode
+ return web.json_response({
+ 'error': 'Server is in maintenance mode',
+ 'maintenance': True
+ }, status=503) # 503 Service Unavailable
+
+ ws = web.WebSocketResponse(
+ max_msg_size=1024*1024*20, # 20MB max message size
+ timeout=30.0 # we want to keep things tight and short
+ )
+
+ await ws.prepare(request)
+
+ # Get the Hugging Face token from query parameters
+ hf_token = request.query.get('hf_token', '')
+
+ # Generate a unique user ID for this connection
+ user_id = str(uuid.uuid4())
+
+ # Validate the token and determine the user role
+ user_role = await session_manager.shared_api.validate_user_token(hf_token)
+ logger.info(f"User {user_id} connected with role: {user_role}")
+
+ # Get client IP address
+ peername = request.transport.get_extra_info('peername')
+ if peername is not None:
+ client_ip = peername[0]
+ else:
+ client_ip = request.headers.get('X-Forwarded-For', 'unknown').split(',')[0].strip()
+
+ logger.info(f"Client {user_id} connecting from IP: {client_ip} with role: {user_role}")
+
+ # Check for anonymous user connection limits
+ if user_role == 'anon':
+ async with anon_connection_lock:
+ # Track this connection
+ anon_connections[client_ip] = anon_connections.get(client_ip, 0) + 1
+ # Store the IP so we can clean up later
+ ws.client_ip = client_ip
+
+ # Log multiple connections from same IP but don't restrict them
+ if anon_connections[client_ip] > 1:
+ logger.info(f"Multiple anonymous connections from IP {client_ip}: {anon_connections[client_ip]} connections")
+
+ # Store the user role in the websocket for easy access
+ ws.user_role = user_role
+ ws.user_id = user_id
+
+ # Register with metrics
+ metrics_tracker.register_session(user_id, client_ip)
+
+ # Create a new session for this user
+ user_session = await session_manager.create_session(user_id, user_role, ws)
+
+ try:
+ async for msg in ws:
+ if msg.type == WSMsgType.TEXT:
+ try:
+ data = json.loads(msg.data)
+ action = data.get('action')
+
+ # Check for rate limiting
+ request_type = 'other'
+ if action in ['join_chat', 'leave_chat', 'chat_message']:
+ request_type = 'chat'
+ elif action in ['generate_video']:
+ request_type = 'video'
+ elif action == 'search':
+ request_type = 'search'
+ elif action == 'simulate':
+ request_type = 'simulation'
+
+ # Record the request for metrics
+ await metrics_tracker.record_request(user_id, client_ip, request_type, user_role)
+
+ # Check rate limits (except for admins)
+ if user_role != 'admin' and await metrics_tracker.is_rate_limited(user_id, request_type, user_role):
+ await ws.send_json({
+ 'action': action,
+ 'requestId': data.get('requestId'),
+ 'success': False,
+ 'error': f'Rate limit exceeded for {request_type} requests. Please try again later.'
+ })
+ continue
+
+ # Route requests to appropriate queues
+ if action in ['join_chat', 'leave_chat', 'chat_message']:
+ await user_session.chat_queue.put(data)
+ elif action in ['generate_video']:
+ await user_session.video_queue.put(data)
+ elif action == 'search':
+ await user_session.search_queue.put(data)
+ elif action == 'simulate':
+ await user_session.simulation_queue.put(data)
+ else:
+ await user_session.process_generic_request(data)
+
+ except Exception as e:
+ logger.error(f"Error processing WebSocket message for user {user_id}: {str(e)}")
+ await ws.send_json({
+ 'action': data.get('action') if 'data' in locals() else 'unknown',
+ 'success': False,
+ 'error': f'Error processing message: {str(e)}'
+ })
+
+ elif msg.type in (WSMsgType.ERROR, WSMsgType.CLOSE):
+ break
+
+ finally:
+ # Cleanup session
+ await session_manager.delete_session(user_id)
+
+ # Cleanup anonymous connection tracking
+ if getattr(ws, 'user_role', None) == 'anon' and hasattr(ws, 'client_ip'):
+ client_ip = ws.client_ip
+ async with anon_connection_lock:
+ if client_ip in anon_connections:
+ anon_connections[client_ip] = max(0, anon_connections[client_ip] - 1)
+ if anon_connections[client_ip] == 0:
+ del anon_connections[client_ip]
+ logger.info(f"Anonymous connection from {client_ip} closed. Remaining: {anon_connections.get(client_ip, 0)}")
+
+ # Unregister from metrics
+ metrics_tracker.unregister_session(user_id, client_ip)
+ logger.info(f"Connection closed for user {user_id}")
+
+ return ws
+
+async def init_app() -> web.Application:
+ app = web.Application(
+ client_max_size=1024**2*20 # 20MB max size
+ )
+
+ # Add cleanup logic
+ async def cleanup(app):
+ logger.info("Shutting down server, closing all sessions...")
+ await session_manager.close_all_sessions()
+
+ app.on_shutdown.append(cleanup)
+
+ # Add routes
+ app.router.add_get('/ws', websocket_handler)
+ app.router.add_get('/api/status', status_handler)
+ app.router.add_get('/api/metrics', metrics_handler)
+
+ # Set up static file serving
+ # Define the path to the public directory
+ public_path = pathlib.Path(__file__).parent / 'build' / 'web'
+ if not public_path.exists():
+ public_path.mkdir(parents=True, exist_ok=True)
+
+ # Set up static file serving with proper security considerations
+ async def static_file_handler(request):
+ # Get the path from the request (removing leading /)
+ path_parts = request.path.lstrip('/').split('/')
+
+ # Convert to safe path to prevent path traversal attacks
+ safe_path = public_path.joinpath(*path_parts)
+
+ # Make sure the path is within the public directory (prevent directory traversal)
+ try:
+ safe_path = safe_path.resolve()
+ if not str(safe_path).startswith(str(public_path.resolve())):
+ return web.HTTPForbidden(text="Access denied")
+ except (ValueError, FileNotFoundError):
+ return web.HTTPNotFound()
+
+ # If path is a directory, look for index.html
+ if safe_path.is_dir():
+ safe_path = safe_path / 'index.html'
+
+ # Check if the file exists
+ if not safe_path.exists() or not safe_path.is_file():
+ # If not found, serve index.html (for SPA routing)
+ safe_path = public_path / 'index.html'
+ if not safe_path.exists():
+ return web.HTTPNotFound()
+
+ # Determine content type based on file extension
+ content_type = 'text/plain'
+ ext = safe_path.suffix.lower()
+ if ext == '.html':
+ content_type = 'text/html'
+ elif ext == '.js':
+ content_type = 'application/javascript'
+ elif ext == '.css':
+ content_type = 'text/css'
+ elif ext in ('.jpg', '.jpeg'):
+ content_type = 'image/jpeg'
+ elif ext == '.png':
+ content_type = 'image/png'
+ elif ext == '.gif':
+ content_type = 'image/gif'
+ elif ext == '.svg':
+ content_type = 'image/svg+xml'
+ elif ext == '.json':
+ content_type = 'application/json'
+
+ # Return the file with appropriate headers
+ return web.FileResponse(safe_path, headers={'Content-Type': content_type})
+
+ # Add catch-all route for static files (lower priority than API routes)
+ app.router.add_get('/{path:.*}', static_file_handler)
+
+ return app
+
+if __name__ == '__main__':
+ app = asyncio.run(init_app())
+ web.run_app(app, host='0.0.0.0', port=8080)
\ No newline at end of file
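
As a quick smoke test for the reference server, one could poll the status endpoint like this (a sketch; the URL assumes a local instance, and the keys mirror what `status_handler` returns):

```python
import asyncio

import aiohttp


async def poll_status() -> None:
    async with aiohttp.ClientSession() as session:
        async with session.get("http://localhost:8080/api/status") as resp:
            status = await resp.json()
            # 'active_endpoints' counts endpoints that are neither busy nor in an error window.
            print(status["product"], status["version"],
                  "free endpoints:", status["active_endpoints"])


asyncio.run(poll_status())
```
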
diff --git a/reference_example/api_config.py b/reference_example/api_config.py
new file mode 100644
index 0000000000000000000000000000000000000000..b366493d35c59e610b47f8db2b84238e99b1771e
--- /dev/null
+++ b/reference_example/api_config.py
@@ -0,0 +1,184 @@
+import os
+
+PRODUCT_NAME = os.environ.get('PRODUCT_NAME', 'TikSlop')
+PRODUCT_VERSION = "2.0.0"
+
+# Use a model like Mistral 7B Instruct for a good balance of performance and accuracy
+TEXT_MODEL = os.environ.get('HF_TEXT_MODEL', '')
+
+# Environment variable to control maintenance mode
+MAINTENANCE_MODE = os.environ.get('MAINTENANCE_MODE', 'false').lower() in ('true', 'yes', '1', 't')
+
+# Environment variable to control how many nodes to use
+MAX_NODES = int(os.environ.get('MAX_NODES', '8'))
+
+ADMIN_ACCOUNTS = [
+ "jbilcke-hf"
+]
+
+RAW_VIDEO_ROUND_ROBIN_ENDPOINT_URLS = [
+ os.environ.get('VIDEO_ROUND_ROBIN_SERVER_1', ''),
+ os.environ.get('VIDEO_ROUND_ROBIN_SERVER_2', ''),
+ os.environ.get('VIDEO_ROUND_ROBIN_SERVER_3', ''),
+ os.environ.get('VIDEO_ROUND_ROBIN_SERVER_4', ''),
+ os.environ.get('VIDEO_ROUND_ROBIN_SERVER_5', ''),
+ os.environ.get('VIDEO_ROUND_ROBIN_SERVER_6', ''),
+ os.environ.get('VIDEO_ROUND_ROBIN_SERVER_7', ''),
+ os.environ.get('VIDEO_ROUND_ROBIN_SERVER_8', ''),
+]
+
+# Filter out empty strings from the endpoint list
+filtered_urls = [url for url in RAW_VIDEO_ROUND_ROBIN_ENDPOINT_URLS if url]
+
+# Limit the number of URLs based on MAX_NODES environment variable
+VIDEO_ROUND_ROBIN_ENDPOINT_URLS = filtered_urls[:MAX_NODES]
+
+HF_TOKEN = os.environ.get('HF_TOKEN')
+
+# use the same secret token as you used to secure your BASE_SPACE_NAME spaces
+SECRET_TOKEN = os.environ.get('SECRET_TOKEN')
+
+# alternative words we could use: "saturated, highlight, overexposed, highlighted, overlit, shaking, too bright, worst quality, inconsistent motion, blurry, jittery, distorted, cropped, watermarked, watermark, logo, subtitle, subtitles, lowres"
+NEGATIVE_PROMPT = "low quality, worst quality, deformed, distorted, disfigured, blurry, text, watermark"
+
+POSITIVE_PROMPT_SUFFIX = "high quality, cinematic, 4K, intricate details"
+
+GUIDANCE_SCALE = 1.0
+
+THUMBNAIL_FRAMES = 65
+
+# Anonymous users are people browsing TikSlop without being logged in.
+# This category suffers from regular abuse, so we enforce strict limitations.
+CONFIG_FOR_ANONYMOUS_USERS = {
+
+ # anons can only watch 2 minutes per video
+ "max_rendering_time_per_client_per_video_in_sec": 2 * 60,
+
+ "min_num_inference_steps": 2,
+ "default_num_inference_steps": 4,
+ "max_num_inference_steps": 4,
+
+ "min_num_frames": 9, # 8 + 1
+ "default_max_num_frames": 65, # 8*8 + 1
+ "max_num_frames": 65, # 8*8 + 1
+
+ "min_clip_duration_seconds": 1,
+ "default_clip_duration_seconds": 2,
+ "max_clip_duration_seconds": 2,
+
+ "min_clip_playback_speed": 0.7,
+ "default_clip_playback_speed": 0.7,
+ "max_clip_playback_speed": 0.7,
+
+ "min_clip_framerate": 8,
+ "default_clip_framerate": 16,
+ "max_clip_framerate": 16,
+
+ "min_clip_width": 544,
+ "default_clip_width": 640,
+ "max_clip_width": 640,
+
+ "min_clip_height": 320,
+ "default_clip_height": 352,
+ "max_clip_height": 352,
+}
+
+# Logged-in Hugging Face users get the standard, calibrated experience
+CONFIG_FOR_STANDARD_HF_USERS = {
+ "max_rendering_time_per_client_per_video_in_sec": 15 * 60,
+
+ "min_num_inference_steps": 2,
+ "default_num_inference_steps": 4,
+ "max_num_inference_steps": 4,
+
+ "min_num_frames": 9, # 8 + 1
+ "default_num_frames": 81, # 8*10 + 1
+ "max_num_frames": 81,
+
+ "min_clip_duration_seconds": 1,
+ "default_clip_duration_seconds": 3,
+ "max_clip_duration_seconds": 3,
+
+ "min_clip_playback_speed": 0.7,
+ "default_clip_playback_speed": 0.7,
+ "max_clip_playback_speed": 0.7,
+
+ "min_clip_framerate": 8,
+ "default_clip_framerate": 25,
+ "max_clip_framerate": 25,
+
+ "min_clip_width": 544,
+ "default_clip_width": 1152, # 928, # 1216, # 768, # 640,
+ "max_clip_width": 1152, # 928, # 1216, # 768, # 640,
+
+ "min_clip_height": 320,
+ "default_clip_height": 640, # 512, # 448, # 416,
+ "max_clip_height": 640, # 512, # 448, # 416,
+}
+
+# Hugging Face users with a Pro account enjoy an improved experience
+CONFIG_FOR_PRO_HF_USERS = {
+ "max_rendering_time_per_client_per_video_in_sec": 20 * 60,
+
+ "min_num_inference_steps": 2,
+ "default_num_inference_steps": 4,
+ "max_num_inference_steps": 4,
+
+ "min_num_frames": 9, # 8 + 1
+ "default_num_frames": 81, # 8*10 + 1
+ "max_num_frames": 81,
+
+ "min_clip_duration_seconds": 1,
+ "default_clip_duration_seconds": 3,
+ "max_clip_duration_seconds": 3,
+
+ "min_clip_playback_speed": 0.7,
+ "default_clip_playback_speed": 0.7,
+ "max_clip_playback_speed": 0.7,
+
+ "min_clip_framerate": 8,
+ "default_clip_framerate": 25,
+ "max_clip_framerate": 25,
+
+ "min_clip_width": 544,
+ "default_clip_width": 1152, # 928, # 1216, # 768, # 640,
+ "max_clip_width": 1152, # 928, # 1216, # 768, # 640,
+
+ "min_clip_height": 320,
+ "default_clip_height": 640, # 512, # 448, # 416,
+ "max_clip_height": 640, # 512, # 448, # 416,
+}
+
+CONFIG_FOR_ADMIN_HF_USERS = {
+ "max_rendering_time_per_client_per_video_in_sec": 60 * 60,
+
+ "min_num_inference_steps": 2,
+ "default_num_inference_steps": 4,
+ "max_num_inference_steps": 4,
+
+ "min_num_frames": 9, # 8 + 1
+ "default_num_frames": 81, # (8 * 10) + 1
+ "max_num_frames": 129, # (8 * 16) + 1
+
+ "min_clip_duration_seconds": 1,
+ "default_clip_duration_seconds": 2,
+ "max_clip_duration_seconds": 4,
+
+ "min_clip_playback_speed": 0.7,
+ "default_clip_playback_speed": 0.7,
+ "max_clip_playback_speed": 1.0,
+
+ "min_clip_framerate": 8,
+ "default_clip_framerate": 30,
+ "max_clip_framerate": 60,
+
+ "min_clip_width": 544,
+ "default_clip_width": 1152, # 928, # 1216, # 768, # 640,
+ "max_clip_width": 1152, # 928, # 1216, # 768, # 640,
+
+ "min_clip_height": 320,
+ "default_clip_height": 640, # 512, # 448, # 416,
+ "max_clip_height": 640, # 512, # 448, # 416,
+}
+
+# For now, admins share the Pro limits (this intentionally overrides the dict defined above)
+CONFIG_FOR_ADMIN_HF_USERS = CONFIG_FOR_PRO_HF_USERS
\ No newline at end of file
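
These four dicts line up with the `UserRole` values ('anon', 'normal', 'pro', 'admin') that `validate_user_token` returns in api_core.py. A hypothetical lookup helper, for illustration only (`get_config_for_role` does not exist in the module):

```python
from api_config import (
    CONFIG_FOR_ADMIN_HF_USERS,
    CONFIG_FOR_ANONYMOUS_USERS,
    CONFIG_FOR_PRO_HF_USERS,
    CONFIG_FOR_STANDARD_HF_USERS,
)

# Hypothetical helper: map a validated user role to its limits dict.
_ROLE_CONFIGS = {
    "anon": CONFIG_FOR_ANONYMOUS_USERS,
    "normal": CONFIG_FOR_STANDARD_HF_USERS,
    "pro": CONFIG_FOR_PRO_HF_USERS,
    "admin": CONFIG_FOR_ADMIN_HF_USERS,  # currently aliased to the Pro config
}


def get_config_for_role(role: str) -> dict:
    """Unknown roles fall back to the strictest (anonymous) limits."""
    return _ROLE_CONFIGS.get(role, CONFIG_FOR_ANONYMOUS_USERS)


print(get_config_for_role("pro")["max_clip_framerate"])  # -> 25
```
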
diff --git a/reference_example/api_core.py b/reference_example/api_core.py
new file mode 100644
index 0000000000000000000000000000000000000000..8066d154d3d041cc08b3f231ff9b0378b9b5424a
--- /dev/null
+++ b/reference_example/api_core.py
@@ -0,0 +1,1068 @@
+import logging
+import os
+import io
+import re
+import base64
+import uuid
+from typing import Dict, Any, Optional, List, Literal
+from dataclasses import dataclass
+from asyncio import Lock, Queue
+import asyncio
+import time
+import datetime
+from contextlib import asynccontextmanager
+from collections import defaultdict
+from aiohttp import web, ClientSession
+from huggingface_hub import InferenceClient, HfApi
+from gradio_client import Client
+import random
+import yaml
+import json
+
+from api_config import *
+
+# User role type
+UserRole = Literal['anon', 'normal', 'pro', 'admin']
+
+# Configure logging
+logging.basicConfig(
+ level=logging.INFO,
+ format='%(asctime)s - %(name)s - %(levelname)s - %(message)s'
+)
+logger = logging.getLogger(__name__)
+
+
+def generate_seed():
+ """Generate a random positive 32-bit integer seed."""
+ return random.randint(0, 2**32 - 1)
+
+def sanitize_yaml_response(response_text: str) -> str:
+ """
+ Sanitize and format AI response into valid YAML.
+ Returns properly formatted YAML string.
+ """
+
+ response_text = response_text.split("```")[0]
+
+ # Remove any markdown code block indicators and YAML document markers
+ clean_text = re.sub(r'```yaml|```|---|\.\.\.$', '', response_text.strip())
+
+ # Split into lines and process each line
+ lines = clean_text.split('\n')
+ sanitized_lines = []
+ current_field = None
+
+ for line in lines:
+ stripped = line.strip()
+ if not stripped:
+ continue
+
+ # Handle field starts
+ if stripped.startswith('title:') or stripped.startswith('description:'):
+ # Ensure proper YAML format with space after colon and proper quoting
+ field_name = stripped.split(':', 1)[0]
+ field_value = stripped.split(':', 1)[1].strip().strip('"\'')
+
+ # Quote the value if it contains special characters
+ if any(c in field_value for c in ':[]{},&*#?|-<>=!%@`'):
+ field_value = f'"{field_value}"'
+
+ sanitized_lines.append(f"{field_name}: {field_value}")
+ current_field = field_name
+
+ elif stripped.startswith('tags:'):
+ sanitized_lines.append('tags:')
+ current_field = 'tags'
+
+ elif stripped.startswith('-') and current_field == 'tags':
+ # Process tag values
+ tag = stripped[1:].strip().strip('"\'')
+ if tag:
+ # Clean and format tag
+ tag = re.sub(r'[^\x00-\x7F]+', '', tag) # Remove non-ASCII
+ tag = re.sub(r'[^a-zA-Z0-9\s-]', '', tag) # Keep only alphanumeric and hyphen
+ tag = tag.strip().lower().replace(' ', '-')
+ if tag:
+ sanitized_lines.append(f" - {tag}")
+
+ elif current_field in ['title', 'description']:
+ # Handle multi-line title/description continuation
+ value = stripped.strip('"\'')
+ if value:
+ # Append to previous line
+ prev = sanitized_lines[-1]
+ sanitized_lines[-1] = f"{prev} {value}"
+
+ # Ensure the YAML has all required fields
+ required_fields = {'title', 'description', 'tags'}
+ found_fields = {line.split(':')[0].strip() for line in sanitized_lines if ':' in line}
+
+ for field in required_fields - found_fields:
+ if field == 'tags':
+ sanitized_lines.extend(['tags:', ' - default'])
+ else:
+ sanitized_lines.append(f'{field}: "No {field} provided"')
+
+ return '\n'.join(sanitized_lines)
+
+@dataclass
+class Endpoint:
+ id: int
+ url: str
+ busy: bool = False
+ last_used: float = 0
+ error_count: int = 0
+ error_until: float = 0 # Timestamp until which this endpoint is considered in error state
+
+class EndpointManager:
+ def __init__(self):
+ self.endpoints: List[Endpoint] = []
+ self.lock = Lock()
+ self.initialize_endpoints()
+ self.last_used_index = -1 # Track the last used endpoint for round-robin
+
+ def initialize_endpoints(self):
+ """Initialize the list of endpoints"""
+ for i, url in enumerate(VIDEO_ROUND_ROBIN_ENDPOINT_URLS):
+ endpoint = Endpoint(id=i + 1, url=url)
+ self.endpoints.append(endpoint)
+
+ def _get_next_free_endpoint(self):
+ """Get the next available non-busy endpoint, or oldest endpoint if all are busy"""
+ current_time = time.time()
+
+ # First priority: Get any non-busy and non-error endpoint
+ free_endpoints = [
+ ep for ep in self.endpoints
+ if not ep.busy and current_time > ep.error_until
+ ]
+
+ if free_endpoints:
+ # Return the least recently used free endpoint
+ return min(free_endpoints, key=lambda ep: ep.last_used)
+
+ # Second priority: If all busy/error, use round-robin but skip error endpoints
+ tried_count = 0
+ next_index = self.last_used_index
+
+ while tried_count < len(self.endpoints):
+ next_index = (next_index + 1) % len(self.endpoints)
+ tried_count += 1
+
+ # If endpoint is not in error state, use it
+ if current_time > self.endpoints[next_index].error_until:
+ self.last_used_index = next_index
+ return self.endpoints[next_index]
+
+ # If all endpoints are in error state, use the one with earliest error expiry
+ self.last_used_index = next_index
+ return min(self.endpoints, key=lambda ep: ep.error_until)
+
+ @asynccontextmanager
+ async def get_endpoint(self, max_wait_time: int = 10):
+ """Get the next available endpoint using a context manager"""
+ start_time = time.time()
+ endpoint = None
+
+ try:
+ while True:
+ if time.time() - start_time > max_wait_time:
+ raise TimeoutError(f"Could not acquire an endpoint within {max_wait_time} seconds")
+
+ async with self.lock:
+ # Get the next available endpoint using our selection strategy
+ endpoint = self._get_next_free_endpoint()
+
+ # Mark it as busy
+ endpoint.busy = True
+ endpoint.last_used = time.time()
+ #logger.info(f"Using endpoint {endpoint.id} (busy: {endpoint.busy}, last used: {endpoint.last_used})")
+ break
+
+ yield endpoint
+
+ finally:
+ if endpoint:
+ async with self.lock:
+ endpoint.busy = False
+ endpoint.last_used = time.time()
+ # We don't need to put back into queue - our strategy now picks directly from the list
+
+class ChatRoom:
+ def __init__(self):
+ self.messages = []
+ self.connected_clients = set()
+ self.max_history = 100
+
+ def add_message(self, message):
+ self.messages.append(message)
+ if len(self.messages) > self.max_history:
+ self.messages.pop(0)
+
+ def get_recent_messages(self, limit=50):
+ return self.messages[-limit:]
+
+class VideoGenerationAPI:
+ def __init__(self):
+ self.inference_client = InferenceClient(token=HF_TOKEN)
+ self.hf_api = HfApi(token=HF_TOKEN)
+ self.endpoint_manager = EndpointManager()
+ self.active_requests: Dict[str, asyncio.Future] = {}
+ self.chat_rooms = defaultdict(ChatRoom)
+ self.video_events: Dict[str, List[Dict[str, Any]]] = defaultdict(list)
+ self.event_history_limit = 50
+ # Cache for user roles to avoid repeated API calls
+ self.user_role_cache: Dict[str, Dict[str, Any]] = {}
+ # Cache expiration time (10 minutes)
+ self.cache_expiration = 600
+
+
+ def _add_event(self, video_id: str, event: Dict[str, Any]):
+ """Add an event to the video's history and maintain the size limit"""
+ events = self.video_events[video_id]
+ events.append(event)
+ if len(events) > self.event_history_limit:
+ events.pop(0)
+
+ async def validate_user_token(self, token: str) -> UserRole:
+ """
+ Validates a Hugging Face token and determines the user's role.
+
+ Returns one of:
+ - 'anon': Anonymous user (no token or invalid token)
+ - 'normal': Standard Hugging Face user
+ - 'pro': Hugging Face Pro user
+ - 'admin': Admin user (username in ADMIN_ACCOUNTS)
+ """
+ # If no token is provided, the user is anonymous
+ if not token:
+ return 'anon'
+
+ # Check if we have a cached result for this token
+ current_time = time.time()
+ if token in self.user_role_cache:
+ cached_data = self.user_role_cache[token]
+ # If the cache is still valid
+ if current_time - cached_data['timestamp'] < self.cache_expiration:
+ logger.info(f"Using cached user role: {cached_data['role']}")
+ return cached_data['role']
+
+ # No valid cache, need to check the token with the HF API
+ try:
+ # Use HF API to validate the token and get user info
+ logger.info("Validating Hugging Face token...")
+
+ # Run in executor to avoid blocking the event loop
+ user_info = await asyncio.get_event_loop().run_in_executor(
+ None,
+ lambda: self.hf_api.whoami(token=token)
+ )
+
+ # Handle both object and dict response formats from whoami
+ username = user_info.get('name') if isinstance(user_info, dict) else getattr(user_info, 'name', None)
+ is_pro = user_info.get('is_pro') if isinstance(user_info, dict) else getattr(user_info, 'is_pro', False)
+
+ if not username:
+ logger.error(f"Could not determine username from user_info: {user_info}")
+ return 'anon'
+
+ logger.info(f"Token valid for user: {username}")
+
+ # Determine the user role based on the information
+ user_role: UserRole
+
+ # Check if the user is an admin
+ if username in ADMIN_ACCOUNTS:
+ user_role = 'admin'
+ # Check if the user has a pro account
+ elif is_pro:
+ user_role = 'pro'
+ else:
+ user_role = 'normal'
+
+ # Cache the result
+ self.user_role_cache[token] = {
+ 'role': user_role,
+ 'timestamp': current_time,
+ 'username': username
+ }
+
+ return user_role
+
+ except Exception as e:
+ logger.error(f"Failed to validate Hugging Face token: {str(e)}")
+ # If validation fails, the user is treated as anonymous
+ return 'anon'
+
+ async def download_video(self, url: str) -> bytes:
+ """Download video file from URL and return bytes"""
+ async with ClientSession() as session:
+ async with session.get(url) as response:
+ if response.status != 200:
+ raise Exception(f"Failed to download video: HTTP {response.status}")
+ return await response.read()
+
+ async def search_video(self, query: str, attempt_count: int = 0) -> Optional[dict]:
+ """Generate a single search result using HF text generation"""
+ # Maximum number of attempts to generate a description without placeholder tags
+ max_attempts = 2
+ current_attempt = attempt_count
+ # Use a random temperature between 0.68 and 0.72 to generate more diverse results
+ # and prevent duplicate results from successive calls with the same prompt
+ temperature = random.uniform(0.68, 0.72)
+
+ while current_attempt <= max_attempts:
+ prompt = f"""# Instruction
+Your response MUST be a YAML object containing a title and description, consistent with what we can find on a video sharing platform.
+Format your YAML response with only those fields: "title" (a short string) and "description" (string caption of the scene). Do not add any other field.
+In the description field, describe in a very synthetic way the visuals of the first shot (first scene), eg "