jbilcke-hf committed
Commit 4315c88 · verified · 1 Parent(s): d9ee793

Upload 5 files

Files changed (5):
  1. requirements.txt +20 -0
  2. run_inference.sh +22 -0
  3. server.py +586 -0
  4. teacache_forward.py +353 -0
  5. tools/visualize.py +190 -0
requirements.txt ADDED
@@ -0,0 +1,20 @@
+ diffusers==0.32.2
+ einops==0.8.1
+ flash_attn==2.7.4.post1
+ ftfy==6.3.1
+ imageio==2.34.0
+ numpy==1.24.4
+ opencv_python==4.9.0.80
+ opencv_python_headless==4.9.0.80
+ packaging==25.0
+ peft==0.14.0
+ Pillow==11.2.1
+ regex==2024.11.6
+ safetensors==0.5.3
+ torch==2.5.1
+ torchvision==0.20.1
+ torchaudio==2.5.1
+ transformers==4.47.1
+ aiohttp==3.9.3
+ jinja2==3.1.3
+ python-multipart==0.0.6
run_inference.sh ADDED
@@ -0,0 +1,22 @@
+ #!/usr/bin/env bash
+
+ # Set environment variable for CUDA memory allocation
+ export PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True
+ export MODEL_ROOT="models/matrixgame"  # Replace with the actual path to your model directory
+ export DIT_PATH="$MODEL_ROOT/dit/"
+ export TEXTENC_PATH="$MODEL_ROOT"
+ export VAE_PATH="$MODEL_ROOT/vae/"
+ export MOUSE_ICON_PATH="$MODEL_ROOT/assets/mouse.png"
+ export IMAGE_PATH="initial_image/"  # Replace with the actual path to your initial image
+ export OUTPUT_PATH="./test"
+ export INFERENCE_STEPS=50
+ # Execute the inference script with these parameters
+ python inference_bench.py \
+     --dit_path "$DIT_PATH" \
+     --textenc_path "$TEXTENC_PATH" \
+     --vae_path "$VAE_PATH" \
+     --mouse_icon_path "$MOUSE_ICON_PATH" \
+     --image_path "$IMAGE_PATH" \
+     --output_path "$OUTPUT_PATH" \
+     --inference_steps "$INFERENCE_STEPS" \
+     --bfloat16
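For reference, here is a minimal Python sketch of the same launch, assuming inference_bench.py sits in the working directory; the values mirror the exports above. PYTORCH_CUDA_ALLOC_CONF goes into the child environment because it must be set before the process initializes CUDA, which is also why the script exports it up front.

# Sketch: the same launch driven from Python; paths mirror run_inference.sh
# and should be adjusted to your checkout.
import os
import subprocess

MODEL_ROOT = "models/matrixgame"  # replace with your model directory

# The allocator config must be in the environment before CUDA starts.
env = dict(os.environ, PYTORCH_CUDA_ALLOC_CONF="expandable_segments:True")
subprocess.run(
    [
        "python", "inference_bench.py",
        "--dit_path", f"{MODEL_ROOT}/dit/",
        "--textenc_path", MODEL_ROOT,
        "--vae_path", f"{MODEL_ROOT}/vae/",
        "--mouse_icon_path", f"{MODEL_ROOT}/assets/mouse.png",
        "--image_path", "initial_image/",
        "--output_path", "./test",
        "--inference_steps", "50",
        "--bfloat16",
    ],
    env=env,
    check=True,
)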
server.py ADDED
@@ -0,0 +1,586 @@
+ #!/usr/bin/env python3
+ # -*- coding: utf-8 -*-
+
+ """
+ MatrixGame WebSocket Gaming Server
+
+ This script implements a WebSocket server for the MatrixGame project,
+ allowing real-time streaming of game frames based on player inputs.
+ """
+
+ import asyncio
+ import json
+ import logging
+ import os
+ import pathlib
+ import time
+ import uuid
+ import io
+ import base64
+ from typing import Dict, List, Any, Optional
+ import argparse
+ import torch
+ import numpy as np
+ from PIL import Image
+ import cv2
+ from aiohttp import web, WSMsgType
+ from condtions import Bench_actions_76
+
+ # Configure logging
+ logging.basicConfig(
+     level=logging.INFO,
+     format='%(asctime)s - %(name)s - %(levelname)s - %(message)s'
+ )
+ logger = logging.getLogger(__name__)
+
+ class FrameGenerator:
+     """
+     Simplified frame generator for the game.
+     In production, this would use the MatrixGame model.
+     """
+     def __init__(self):
+         self.frame_width = 640
+         self.frame_height = 360
+         self.fps = 16
+         self.frame_count = 0
+         self.scenes = {
+             'forest': self._load_scene_frames('forest'),
+             'desert': self._load_scene_frames('desert'),
+             'beach': self._load_scene_frames('beach'),
+             'hills': self._load_scene_frames('hills'),
+             'river': self._load_scene_frames('river'),
+             'icy': self._load_scene_frames('icy'),
+             'mushroom': self._load_scene_frames('mushroom'),
+             'plain': self._load_scene_frames('plain')
+         }
+
+     def _load_scene_frames(self, scene_name):
+         """Load initial frames for a scene from the asset directory"""
+         frames = []
+         scene_dir = f"./GameWorldScore/asset/init_image/{scene_name}"
+
+         if os.path.exists(scene_dir):
+             image_files = sorted([f for f in os.listdir(scene_dir) if f.endswith('.png') or f.endswith('.jpg')])
+             for img_file in image_files:
+                 try:
+                     img_path = os.path.join(scene_dir, img_file)
+                     img = Image.open(img_path).convert("RGB")
+                     img = img.resize((self.frame_width, self.frame_height))
+                     frames.append(np.array(img))
+                 except Exception as e:
+                     logger.error(f"Error loading image {img_file}: {str(e)}")
+
+         # If no frames were loaded, create a default colored frame with text
+         if not frames:
+             frame = np.ones((self.frame_height, self.frame_width, 3), dtype=np.uint8) * 100
+             # Add scene name as text
+             cv2.putText(frame, f"Scene: {scene_name}", (50, 180),
+                         cv2.FONT_HERSHEY_SIMPLEX, 1, (255, 255, 255), 2)
+             frames.append(frame)
+
+         return frames
+
+     def get_next_frame(self, scene_name, keyboard_condition=None, mouse_condition=None):
+         """
+         Generate the next frame based on current conditions.
+
+         Args:
+             scene_name: Name of the current scene
+             keyboard_condition: Keyboard input state
+             mouse_condition: Mouse input state
+
+         Returns:
+             JPEG bytes of the frame
+         """
+         scene_frames = self.scenes.get(scene_name, self.scenes['forest'])
+
+         # In a real implementation, this would use the MatrixGame model to generate frames
+         # based on the keyboard_condition and mouse_condition
+
+         # For the demo, just cycle through the pre-loaded frames
+         frame_idx = self.frame_count % len(scene_frames)
+         frame = scene_frames[frame_idx].copy()
+         self.frame_count += 1
+
+         # If we have keyboard/mouse conditions, visualize them on the frame
+         if keyboard_condition:
+             # Visualize keyboard inputs (simple example)
+             keys = ["W", "S", "A", "D", "JUMP", "ATTACK"]
+             for i, key_pressed in enumerate(keyboard_condition[0]):
+                 color = (0, 255, 0) if key_pressed else (100, 100, 100)
+                 cv2.putText(frame, keys[i], (20 + i*100, 30),
+                             cv2.FONT_HERSHEY_SIMPLEX, 0.7, color, 2)
+
+         if mouse_condition:
+             # Visualize mouse movement (simple example)
+             mouse_x, mouse_y = mouse_condition[0]
+             # Scale mouse values for visualization
+             offset_x = int(mouse_x * 100)
+             offset_y = int(mouse_y * 100)
+             center_x, center_y = self.frame_width // 2, self.frame_height // 2
+             cv2.circle(frame, (center_x + offset_x, center_y - offset_y), 10, (255, 0, 0), -1)
+             cv2.putText(frame, f"Mouse: {mouse_x:.2f}, {mouse_y:.2f}",
+                         (self.frame_width - 250, 30), cv2.FONT_HERSHEY_SIMPLEX, 0.7, (255, 0, 0), 2)
+
+         # Convert frame to JPEG
+         success, buffer = cv2.imencode('.jpg', frame)
+         if not success:
+             logger.error("Failed to encode frame as JPEG")
+             # Return a blank frame
+             blank = np.ones((self.frame_height, self.frame_width, 3), dtype=np.uint8) * 100
+             success, buffer = cv2.imencode('.jpg', blank)
+
+         return buffer.tobytes()
+
+ class GameSession:
+     """
+     Represents a user's gaming session.
+     Each WebSocket connection gets its own session with separate queues.
+     """
+     def __init__(self, user_id: str, ws: web.WebSocketResponse, game_manager):
+         self.user_id = user_id
+         self.ws = ws
+         self.game_manager = game_manager
+
+         # Create action queue for this user session
+         self.action_queue = asyncio.Queue()
+
+         # Session creation time
+         self.created_at = time.time()
+         self.last_activity = time.time()
+
+         # Game state
+         self.current_scene = "forest"  # Default scene
+         self.is_streaming = False
+         self.stream_task = None
+
+         # Current input state
+         self.keyboard_state = [0, 0, 0, 0, 0, 0]  # forward, back, left, right, jump, attack
+         self.mouse_state = [0, 0]  # x, y
+
+         self.background_tasks = []
+
+     async def start(self):
+         """Start all the queue processors for this session"""
+         self.background_tasks = [
+             asyncio.create_task(self._process_action_queue()),
+         ]
+         logger.info(f"Started game session for user {self.user_id}")
+
+     async def stop(self):
+         """Stop all background tasks for this session"""
+         # Stop streaming if active
+         if self.is_streaming and self.stream_task:
+             self.is_streaming = False
+             self.stream_task.cancel()
+             try:
+                 await self.stream_task
+             except asyncio.CancelledError:
+                 pass
+
+         # Cancel other background tasks
+         for task in self.background_tasks:
+             task.cancel()
+
+         try:
+             # Wait for tasks to complete cancellation
+             await asyncio.gather(*self.background_tasks, return_exceptions=True)
+         except asyncio.CancelledError:
+             pass
+
+         logger.info(f"Stopped game session for user {self.user_id}")
+
+     async def _process_action_queue(self):
+         """Process game actions from the queue"""
+         while True:
+             data = await self.action_queue.get()
+             try:
+                 action_type = data.get('action')
+
+                 if action_type == 'start_stream':
+                     result = await self._handle_start_stream(data)
+                 elif action_type == 'stop_stream':
+                     result = await self._handle_stop_stream(data)
+                 elif action_type == 'keyboard_input':
+                     result = await self._handle_keyboard_input(data)
+                 elif action_type == 'mouse_input':
+                     result = await self._handle_mouse_input(data)
+                 elif action_type == 'change_scene':
+                     result = await self._handle_scene_change(data)
+                 else:
+                     result = {
+                         'action': action_type,
+                         'requestId': data.get('requestId'),
+                         'success': False,
+                         'error': f'Unknown action: {action_type}'
+                     }
+
+                 # Send response back to the client
+                 await self.ws.send_json(result)
+
+                 # Update last activity time
+                 self.last_activity = time.time()
+
+             except Exception as e:
+                 logger.error(f"Error processing action for user {self.user_id}: {str(e)}")
+                 try:
+                     await self.ws.send_json({
+                         'action': data.get('action'),
+                         'requestId': data.get('requestId', 'unknown'),
+                         'success': False,
+                         'error': f'Error processing action: {str(e)}'
+                     })
+                 except Exception as send_error:
+                     logger.error(f"Error sending error response: {send_error}")
+             finally:
+                 self.action_queue.task_done()
+
+     async def _handle_start_stream(self, data: Dict) -> Dict:
+         """Handle request to start streaming frames"""
+         if self.is_streaming:
+             return {
+                 'action': 'start_stream',
+                 'requestId': data.get('requestId'),
+                 'success': False,
+                 'error': 'Stream already active'
+             }
+
+         fps = data.get('fps', 16)
+         self.is_streaming = True
+         self.stream_task = asyncio.create_task(self._stream_frames(fps))
+
+         return {
+             'action': 'start_stream',
+             'requestId': data.get('requestId'),
+             'success': True,
+             'message': f'Streaming started at {fps} FPS'
+         }
+
+     async def _handle_stop_stream(self, data: Dict) -> Dict:
+         """Handle request to stop streaming frames"""
+         if not self.is_streaming:
+             return {
+                 'action': 'stop_stream',
+                 'requestId': data.get('requestId'),
+                 'success': False,
+                 'error': 'No active stream to stop'
+             }
+
+         self.is_streaming = False
+         if self.stream_task:
+             self.stream_task.cancel()
+             try:
+                 await self.stream_task
+             except asyncio.CancelledError:
+                 pass
+             self.stream_task = None
+
+         return {
+             'action': 'stop_stream',
+             'requestId': data.get('requestId'),
+             'success': True,
+             'message': 'Streaming stopped'
+         }
+
+     async def _handle_keyboard_input(self, data: Dict) -> Dict:
+         """Handle keyboard input from client"""
+         key = data.get('key', '')
+         pressed = data.get('pressed', False)
+
+         # Map key to keyboard state index
+         key_map = {
+             'w': 0, 'forward': 0,
+             's': 1, 'back': 1, 'backward': 1,
+             'a': 2, 'left': 2,
+             'd': 3, 'right': 3,
+             'space': 4, 'jump': 4,
+             'shift': 5, 'attack': 5, 'ctrl': 5
+         }
+
+         if key.lower() in key_map:
+             key_idx = key_map[key.lower()]
+             self.keyboard_state[key_idx] = 1 if pressed else 0
+
+         return {
+             'action': 'keyboard_input',
+             'requestId': data.get('requestId'),
+             'success': True,
+             'keyboardState': self.keyboard_state
+         }
+
+     async def _handle_mouse_input(self, data: Dict) -> Dict:
+         """Handle mouse movement/input from client"""
+         mouse_x = data.get('x', 0)
+         mouse_y = data.get('y', 0)
+
+         # Update mouse state; values are expected to be normalized between -1 and 1
+         self.mouse_state = [float(mouse_x), float(mouse_y)]
+
+         return {
+             'action': 'mouse_input',
+             'requestId': data.get('requestId'),
+             'success': True,
+             'mouseState': self.mouse_state
+         }
+
+     async def _handle_scene_change(self, data: Dict) -> Dict:
+         """Handle scene change requests"""
+         scene_name = data.get('scene', 'forest')
+         valid_scenes = ['forest', 'desert', 'beach', 'hills', 'river', 'icy', 'mushroom', 'plain']
+
+         if scene_name not in valid_scenes:
+             return {
+                 'action': 'change_scene',
+                 'requestId': data.get('requestId'),
+                 'success': False,
+                 'error': f'Invalid scene: {scene_name}. Valid scenes are: {", ".join(valid_scenes)}'
+             }
+
+         self.current_scene = scene_name
+
+         return {
+             'action': 'change_scene',
+             'requestId': data.get('requestId'),
+             'success': True,
+             'scene': scene_name
+         }
+
+     async def _stream_frames(self, fps: int):
+         """Stream frames to the client at the specified FPS"""
+         frame_interval = 1.0 / fps  # Time between frames in seconds
+
+         try:
+             while self.is_streaming:
+                 start_time = time.time()
+
+                 # Generate frame based on current keyboard and mouse state
+                 keyboard_condition = [self.keyboard_state]
+                 mouse_condition = [self.mouse_state]
+
+                 frame_bytes = self.game_manager.frame_generator.get_next_frame(
+                     self.current_scene, keyboard_condition, mouse_condition
+                 )
+
+                 # Encode as base64 for sending in JSON
+                 frame_base64 = base64.b64encode(frame_bytes).decode('utf-8')
+
+                 # Send frame to client
+                 await self.ws.send_json({
+                     'action': 'frame',
+                     'frameData': frame_base64,
+                     'timestamp': time.time()
+                 })
+
+                 # Calculate sleep time to maintain FPS
+                 elapsed = time.time() - start_time
+                 sleep_time = max(0, frame_interval - elapsed)
+                 await asyncio.sleep(sleep_time)
+
+         except asyncio.CancelledError:
+             logger.info(f"Frame streaming cancelled for user {self.user_id}")
+         except Exception as e:
+             logger.error(f"Error in frame streaming for user {self.user_id}: {str(e)}")
+             if self.ws.closed:
+                 logger.info(f"WebSocket closed for user {self.user_id}")
+                 return
+
+             # Notify client of error
+             try:
+                 await self.ws.send_json({
+                     'action': 'frame_error',
+                     'error': f'Streaming error: {str(e)}'
+                 })
+             except Exception:
+                 pass
+
+             # Stop streaming
+             self.is_streaming = False
+
+ class GameManager:
+     """
+     Manages all active gaming sessions and shared resources.
+     """
+     def __init__(self):
+         self.sessions = {}
+         self.session_lock = asyncio.Lock()
+
+         # Initialize frame generator
+         self.frame_generator = FrameGenerator()
+
+         # Load valid scenes from FrameGenerator
+         self.valid_scenes = list(self.frame_generator.scenes.keys())
+
+     async def create_session(self, user_id: str, ws: web.WebSocketResponse) -> GameSession:
+         """Create a new game session"""
+         async with self.session_lock:
+             # Create a new session for this user
+             session = GameSession(user_id, ws, self)
+             await session.start()
+             self.sessions[user_id] = session
+             return session
+
+     async def delete_session(self, user_id: str) -> None:
+         """Delete a game session and clean up resources"""
+         async with self.session_lock:
+             if user_id in self.sessions:
+                 session = self.sessions[user_id]
+                 await session.stop()
+                 del self.sessions[user_id]
+                 logger.info(f"Deleted game session for user {user_id}")
+
+     def get_session(self, user_id: str) -> Optional[GameSession]:
+         """Get a game session if it exists"""
+         return self.sessions.get(user_id)
+
+     async def close_all_sessions(self) -> None:
+         """Close all active sessions (used during shutdown)"""
+         async with self.session_lock:
+             for user_id, session in list(self.sessions.items()):
+                 await session.stop()
+             self.sessions.clear()
+             logger.info("Closed all active game sessions")
+
+     @property
+     def session_count(self) -> int:
+         """Get the number of active sessions"""
+         return len(self.sessions)
+
+     def get_session_stats(self) -> Dict:
+         """Get statistics about active sessions"""
+         stats = {
+             'total_sessions': len(self.sessions),
+             'active_scenes': {},
+             'streaming_sessions': 0
+         }
+
+         # Count sessions by scene and streaming status
+         for session in self.sessions.values():
+             scene = session.current_scene
+             stats['active_scenes'][scene] = stats['active_scenes'].get(scene, 0) + 1
+             if session.is_streaming:
+                 stats['streaming_sessions'] += 1
+
+         return stats
+
+ # Create global game manager
+ game_manager = GameManager()
+
+ async def status_handler(request: web.Request) -> web.Response:
+     """Handler for API status endpoint"""
+     # Get session statistics
+     session_stats = game_manager.get_session_stats()
+
+     return web.json_response({
+         'product': 'MatrixGame WebSocket Server',
+         'version': '1.0.0',
+         'active_sessions': session_stats,
+         'available_scenes': game_manager.valid_scenes
+     })
+
+ async def websocket_handler(request: web.Request) -> web.WebSocketResponse:
+     ws = web.WebSocketResponse(
+         max_msg_size=1024*1024*10,  # 10MB max message size
+         timeout=60.0
+     )
+
+     await ws.prepare(request)
+
+     # Generate a unique user ID for this connection
+     user_id = str(uuid.uuid4())
+
+     # Get client IP address
+     peername = request.transport.get_extra_info('peername')
+     if peername is not None:
+         client_ip = peername[0]
+     else:
+         client_ip = request.headers.get('X-Forwarded-For', 'unknown').split(',')[0].strip()
+
+     logger.info(f"Client {user_id} connecting from IP: {client_ip}")
+
+     # Store the user ID on the websocket for easy access
+     ws.user_id = user_id
+
+     # Create a new session for this user
+     user_session = await game_manager.create_session(user_id, ws)
+
+     # Send initial welcome message
+     await ws.send_json({
+         'action': 'welcome',
+         'userId': user_id,
+         'message': 'Welcome to the MatrixGame WebSocket server!',
+         'scenes': game_manager.valid_scenes
+     })
+
+     try:
+         async for msg in ws:
+             if msg.type == WSMsgType.TEXT:
+                 try:
+                     data = json.loads(msg.data)
+                     action = data.get('action')
+
+                     if action == 'ping':
+                         # Respond to ping immediately
+                         await ws.send_json({
+                             'action': 'pong',
+                             'requestId': data.get('requestId'),
+                             'timestamp': time.time()
+                         })
+                     else:
+                         # Route game actions to the session's action queue
+                         await user_session.action_queue.put(data)
+
+                 except json.JSONDecodeError:
+                     logger.error(f"Invalid JSON from user {user_id}: {msg.data}")
+                     await ws.send_json({
+                         'error': 'Invalid JSON message',
+                         'success': False
+                     })
+                 except Exception as e:
+                     logger.error(f"Error processing WebSocket message for user {user_id}: {str(e)}")
+                     await ws.send_json({
+                         'action': data.get('action') if 'data' in locals() else 'unknown',
+                         'success': False,
+                         'error': f'Error processing message: {str(e)}'
+                     })
+
+             elif msg.type in (WSMsgType.ERROR, WSMsgType.CLOSE):
+                 break
+
+     finally:
+         # Clean up the session
+         await game_manager.delete_session(user_id)
+         logger.info(f"Connection closed for user {user_id}")
+
+     return ws
+
+ async def init_app() -> web.Application:
+     app = web.Application(
+         client_max_size=1024**2*10  # 10MB max size
+     )
+
+     # Add cleanup logic
+     async def cleanup(app):
+         logger.info("Shutting down server, closing all sessions...")
+         await game_manager.close_all_sessions()
+
+     app.on_shutdown.append(cleanup)
+
+     # Add routes
+     app.router.add_get('/ws', websocket_handler)
+     app.router.add_get('/api/status', status_handler)
+
+     # Set up static file serving for the client demo
+     app.router.add_static('/client', path=pathlib.Path(__file__).parent / 'client')
+
+     return app
+
+ def parse_args() -> argparse.Namespace:
+     parser = argparse.ArgumentParser(description="MatrixGame WebSocket Server")
+     parser.add_argument("--host", type=str, default="0.0.0.0", help="Host IP to bind to")
+     parser.add_argument("--port", type=int, default=8080, help="Port to listen on")
+     return parser.parse_args()
+
+ if __name__ == '__main__':
+     args = parse_args()
+     app = init_app()  # web.run_app accepts the coroutine returned by init_app
+     web.run_app(app, host=args.host, port=args.port)
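For reference, a minimal client sketch for the protocol implemented above, using the aiohttp version pinned in requirements.txt. The ws://localhost:8080/ws URL assumes the server's default --host and --port; the requestId values are arbitrary. The server answers each queued action with a JSON result and, once start_stream succeeds, streams 'frame' messages carrying base64-encoded JPEGs.

import asyncio
import json

import aiohttp


async def main():
    # Connect, read the 'welcome' message, start a stream, hold W,
    # consume a couple of seconds of frames, then stop.
    async with aiohttp.ClientSession() as session:
        async with session.ws_connect("ws://localhost:8080/ws") as ws:
            welcome = await ws.receive_json()
            print("connected as", welcome["userId"], "scenes:", welcome["scenes"])

            await ws.send_json({"action": "start_stream", "fps": 16, "requestId": "r1"})
            await ws.send_json({"action": "keyboard_input", "key": "w", "pressed": True, "requestId": "r2"})

            frames = 0
            async for msg in ws:
                if msg.type != aiohttp.WSMsgType.TEXT:
                    break
                data = json.loads(msg.data)
                if data.get("action") == "frame":
                    frames += 1  # data["frameData"] is a base64-encoded JPEG
                    if frames >= 32:
                        break
            await ws.send_json({"action": "stop_stream", "requestId": "r3"})

asyncio.run(main())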
teacache_forward.py ADDED
@@ -0,0 +1,354 @@
+ # teacache
+ import torch
+ import numpy as np
+ from typing import Optional, Union, Dict, Any
+
+ from matrixgame.model_variants.matrixgame_dit_src.modulate_layers import modulate
+ from matrixgame.model_variants.matrixgame_dit_src.attenion import attention, get_cu_seqlens
+ from diffusers.models.modeling_outputs import Transformer2DModelOutput
+ from diffusers.utils import is_torch_version  # used by the gradient-checkpointing branches below
+
+
+ def teacache_forward(
+     self,
+     hidden_states: torch.Tensor,
+     timestep: torch.Tensor,  # Should be in range(0, 1000).
+     encoder_hidden_states: torch.Tensor = None,
+     encoder_attention_mask: torch.Tensor = None,  # Currently unused.
+     guidance: torch.Tensor = None,  # Guidance for modulation, should be cfg_scale x 1000.
+     mouse_condition=None,
+     keyboard_condition=None,
+     return_dict: bool = True,
+ ) -> Union[torch.Tensor, Dict[str, torch.Tensor]]:
+     x = hidden_states
+     t = timestep
+     text_states, text_states_2 = encoder_hidden_states
+     text_mask, text_mask_2 = encoder_attention_mask
+     out = {}
+     img = x
+     txt = text_states
+     _, _, ot, oh, ow = x.shape
+     freqs_cos, freqs_sin = self.get_rotary_pos_embed(ot, oh, ow)
+     tt, th, tw = (
+         ot // self.patch_size[0],
+         oh // self.patch_size[1],
+         ow // self.patch_size[2],
+     )
+
+     # Prepare modulation vectors.
+     vec = self.time_in(t)
+     if self.i2v_condition_type == "token_replace":
+         token_replace_t = torch.zeros_like(t)
+         token_replace_vec = self.time_in(token_replace_t)
+         first_frame_token_num = th * tw
+     else:
+         token_replace_vec = None
+         first_frame_token_num = None
+     # text modulation
+     # vec_2 = self.vector_in(text_states_2)
+     # vec = vec + vec_2
+     # if self.i2v_condition_type == "token_replace":
+     #     token_replace_vec = token_replace_vec + vec_2
+
+     # guidance modulation
+     if self.guidance_embed:
+         if guidance is None:
+             raise ValueError(
+                 "Didn't get guidance strength for guidance distilled model."
+             )
+
+         # our timestep_embedding is merged into guidance_in (TimestepEmbedder)
+         vec = vec + self.guidance_in(guidance)
+
+     # Embed image and text.
+     img = self.img_in(img)
+     if self.text_projection == "linear":
+         txt = self.txt_in(txt)
+     elif self.text_projection == "single_refiner":
+         txt = self.txt_in(txt, t, text_mask if self.use_attention_mask else None)
+     else:
+         raise NotImplementedError(
+             f"Unsupported text_projection: {self.text_projection}"
+         )
+
+     txt_seq_len = txt.shape[1]
+     img_seq_len = img.shape[1]
+
+     # Compute cu_seqlens and max_seqlen for flash attention
+     cu_seqlens_q = get_cu_seqlens(text_mask, img_seq_len)
+     cu_seqlens_kv = cu_seqlens_q
+     max_seqlen_q = img_seq_len + txt_seq_len
+     max_seqlen_kv = max_seqlen_q
+
+     freqs_cis = (freqs_cos, freqs_sin) if freqs_cos is not None else None
+
+     # teacache: decide whether this step's full transformer pass can be skipped
+     if self.enable_teacache:
+         inp = img.clone()
+         vec_ = vec.clone()
+         txt_ = txt.clone()
+         (
+             img_mod1_shift,
+             img_mod1_scale,
+             img_mod1_gate,
+             img_mod2_shift,
+             img_mod2_scale,
+             img_mod2_gate,
+         ) = self.double_blocks[0].img_mod(vec_).chunk(6, dim=-1)
+         normed_inp = self.double_blocks[0].img_norm1(inp)
+         modulated_inp = modulate(
+             normed_inp, shift=img_mod1_shift, scale=img_mod1_scale
+         )
+         if self.cnt == 0 or self.cnt == self.num_steps - 1:
+             should_calc = True
+             self.accumulated_rel_l1_distance = 0
+         else:
+             coefficients = [7.33226126e+02, -4.01131952e+02, 6.75869174e+01, -3.14987800e+00, 9.61237896e-02]
+             # coefficients = [-296.53, 191.67, -39.037, 3.705, -0.0383]
+             rescale_func = np.poly1d(coefficients)
+             self.accumulated_rel_l1_distance += rescale_func(((modulated_inp - self.previous_modulated_input).abs().mean() / self.previous_modulated_input.abs().mean()).cpu().item())
+             if self.accumulated_rel_l1_distance < self.rel_l1_thresh:
+                 should_calc = False
+             else:
+                 should_calc = True
+                 self.accumulated_rel_l1_distance = 0
+         self.previous_modulated_input = modulated_inp
+         self.cnt += 1
+         if self.cnt == self.num_steps:
+             self.cnt = 0
+
+     if self.enable_teacache:
+         if not should_calc:
+             img += self.previous_residual
+         else:
+             ori_img = img.clone()
+             # --------------------- Pass through DiT blocks ------------------------
+             for _, block in enumerate(self.double_blocks):
+                 if torch.is_grad_enabled() and self.gradient_checkpointing:
+                     def create_custom_forward(module):
+                         def custom_forward(*inputs):
+                             return module(*inputs)
+
+                         return custom_forward
+                     ckpt_kwargs: Dict[str, Any] = {"use_reentrant": False} if is_torch_version(">=", "1.11.0") else {}
+                     image_kwargs: Dict[str, Any] = {"tt": hidden_states.shape[2] // self.patch_size[0],
+                                                     "th": hidden_states.shape[3] // self.patch_size[1],
+                                                     "tw": hidden_states.shape[4] // self.patch_size[2]}
+                     img, txt = torch.utils.checkpoint.checkpoint(
+                         create_custom_forward(block),
+                         img,
+                         txt,
+                         vec,
+                         cu_seqlens_q,
+                         cu_seqlens_kv,
+                         max_seqlen_q,
+                         max_seqlen_kv,
+                         freqs_cis,
+                         image_kwargs,
+                         mouse_condition,
+                         keyboard_condition,
+                         self.i2v_condition_type,
+                         token_replace_vec,
+                         first_frame_token_num,
+                         **ckpt_kwargs,
+                     )
+                 else:
+                     image_kwargs: Dict[str, Any] = {"tt": hidden_states.shape[2] // self.patch_size[0],
+                                                     "th": hidden_states.shape[3] // self.patch_size[1],
+                                                     "tw": hidden_states.shape[4] // self.patch_size[2]}
+                     double_block_args = [
+                         img,
+                         txt,
+                         vec,
+                         cu_seqlens_q,
+                         cu_seqlens_kv,
+                         max_seqlen_q,
+                         max_seqlen_kv,
+                         freqs_cis,
+                         image_kwargs,
+                         mouse_condition,
+                         keyboard_condition,
+                         self.i2v_condition_type,
+                         token_replace_vec,
+                         first_frame_token_num,
+                     ]
+
+                     img, txt = block(*double_block_args)
+
+             # Merge txt and img to pass through single stream blocks.
+             x = torch.cat((img, txt), 1)
+             if len(self.single_blocks) > 0:
+                 for _, block in enumerate(self.single_blocks):
+                     if torch.is_grad_enabled() and self.gradient_checkpointing:
+                         def create_custom_forward(module):
+                             def custom_forward(*inputs):
+                                 return module(*inputs)
+
+                             return custom_forward
+                         ckpt_kwargs: Dict[str, Any] = {"use_reentrant": False} if is_torch_version(">=", "1.11.0") else {}
+                         image_kwargs: Dict[str, Any] = {"tt": hidden_states.shape[2] // self.patch_size[0],
+                                                         "th": hidden_states.shape[3] // self.patch_size[1],
+                                                         "tw": hidden_states.shape[4] // self.patch_size[2]}
+                         x = torch.utils.checkpoint.checkpoint(
+                             create_custom_forward(block),
+                             x,
+                             vec,
+                             txt_seq_len,
+                             cu_seqlens_q,
+                             cu_seqlens_kv,
+                             max_seqlen_q,
+                             max_seqlen_kv,
+                             (freqs_cos, freqs_sin),
+                             image_kwargs,
+                             mouse_condition,
+                             keyboard_condition,
+                             self.i2v_condition_type,
+                             token_replace_vec,
+                             first_frame_token_num,
+                             **ckpt_kwargs,
+                         )
+                     else:
+                         image_kwargs: Dict[str, Any] = {"tt": hidden_states.shape[2] // self.patch_size[0],
+                                                         "th": hidden_states.shape[3] // self.patch_size[1],
+                                                         "tw": hidden_states.shape[4] // self.patch_size[2]}
+                         single_block_args = [
+                             x,
+                             vec,
+                             txt_seq_len,
+                             cu_seqlens_q,
+                             cu_seqlens_kv,
+                             max_seqlen_q,
+                             max_seqlen_kv,
+                             (freqs_cos, freqs_sin),
+                             image_kwargs,
+                             mouse_condition,
+                             keyboard_condition,
+                             self.i2v_condition_type,
+                             token_replace_vec,
+                             first_frame_token_num,
+                         ]
+
+                         x = block(*single_block_args)
+
+             img = x[:, :img_seq_len, ...]
+             self.previous_residual = img - ori_img
+     else:
+         # --------------------- Pass through DiT blocks ------------------------
+         for _, block in enumerate(self.double_blocks):
+             if torch.is_grad_enabled() and self.gradient_checkpointing:
+                 def create_custom_forward(module):
+                     def custom_forward(*inputs):
+                         return module(*inputs)
+
+                     return custom_forward
+                 ckpt_kwargs: Dict[str, Any] = {"use_reentrant": False} if is_torch_version(">=", "1.11.0") else {}
+                 image_kwargs: Dict[str, Any] = {"tt": hidden_states.shape[2] // self.patch_size[0],
+                                                 "th": hidden_states.shape[3] // self.patch_size[1],
+                                                 "tw": hidden_states.shape[4] // self.patch_size[2]}
+                 img, txt = torch.utils.checkpoint.checkpoint(
+                     create_custom_forward(block),
+                     img,
+                     txt,
+                     vec,
+                     cu_seqlens_q,
+                     cu_seqlens_kv,
+                     max_seqlen_q,
+                     max_seqlen_kv,
+                     freqs_cis,
+                     image_kwargs,
+                     mouse_condition,
+                     keyboard_condition,
+                     self.i2v_condition_type,
+                     token_replace_vec,
+                     first_frame_token_num,
+                     **ckpt_kwargs,
+                 )
+             else:
+                 image_kwargs: Dict[str, Any] = {"tt": hidden_states.shape[2] // self.patch_size[0],
+                                                 "th": hidden_states.shape[3] // self.patch_size[1],
+                                                 "tw": hidden_states.shape[4] // self.patch_size[2]}
+                 double_block_args = [
+                     img,
+                     txt,
+                     vec,
+                     cu_seqlens_q,
+                     cu_seqlens_kv,
+                     max_seqlen_q,
+                     max_seqlen_kv,
+                     freqs_cis,
+                     image_kwargs,
+                     mouse_condition,
+                     keyboard_condition,
+                     self.i2v_condition_type,
+                     token_replace_vec,
+                     first_frame_token_num,
+                 ]
+
+                 img, txt = block(*double_block_args)
+
+         # Merge txt and img to pass through single stream blocks.
+         x = torch.cat((img, txt), 1)
+         if len(self.single_blocks) > 0:
+             for _, block in enumerate(self.single_blocks):
+                 if torch.is_grad_enabled() and self.gradient_checkpointing:
+                     def create_custom_forward(module):
+                         def custom_forward(*inputs):
+                             return module(*inputs)
+
+                         return custom_forward
+                     ckpt_kwargs: Dict[str, Any] = {"use_reentrant": False} if is_torch_version(">=", "1.11.0") else {}
+                     image_kwargs: Dict[str, Any] = {"tt": hidden_states.shape[2] // self.patch_size[0],
+                                                     "th": hidden_states.shape[3] // self.patch_size[1],
+                                                     "tw": hidden_states.shape[4] // self.patch_size[2]}
+                     x = torch.utils.checkpoint.checkpoint(
+                         create_custom_forward(block),
+                         x,
+                         vec,
+                         txt_seq_len,
+                         cu_seqlens_q,
+                         cu_seqlens_kv,
+                         max_seqlen_q,
+                         max_seqlen_kv,
+                         (freqs_cos, freqs_sin),
+                         image_kwargs,
+                         mouse_condition,
+                         keyboard_condition,
+                         self.i2v_condition_type,
+                         token_replace_vec,
+                         first_frame_token_num,
+                         **ckpt_kwargs,
+                     )
+                 else:
+                     image_kwargs: Dict[str, Any] = {"tt": hidden_states.shape[2] // self.patch_size[0],
+                                                     "th": hidden_states.shape[3] // self.patch_size[1],
+                                                     "tw": hidden_states.shape[4] // self.patch_size[2]}
+                     single_block_args = [
+                         x,
+                         vec,
+                         txt_seq_len,
+                         cu_seqlens_q,
+                         cu_seqlens_kv,
+                         max_seqlen_q,
+                         max_seqlen_kv,
+                         (freqs_cos, freqs_sin),
+                         image_kwargs,
+                         mouse_condition,
+                         keyboard_condition,
+                         self.i2v_condition_type,
+                         token_replace_vec,
+                         first_frame_token_num,
+                     ]
+
+                     x = block(*single_block_args)
+
+         img = x[:, :img_seq_len, ...]
+
+     # ---------------------------- Final layer ------------------------------
+     img = self.final_layer(img, vec)  # (N, T, patch_size ** 2 * out_channels)
+
+     img = self.unpatchify(img, tt, th, tw)
+     if return_dict:
+         out["x"] = img
+         return out
+     return (img,)
+
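The caching decision above is interleaved with the DiT forward pass; isolated, it is easier to follow. The sketch below is an illustrative class, not part of the commit; names follow teacache_forward. It accumulates a polynomial-rescaled relative L1 change of the first block's modulated input and triggers a full recompute only when the accumulator crosses rel_l1_thresh; otherwise the forward pass reuses self.previous_residual, the img delta cached on the last full pass.

import numpy as np
import torch

# Rescaling polynomial for the raw relative L1 change, copied from above.
COEFFICIENTS = [7.33226126e+02, -4.01131952e+02, 6.75869174e+01, -3.14987800e+00, 9.61237896e-02]
RESCALE = np.poly1d(COEFFICIENTS)


class TeaCacheGate:
    """Stand-alone version of the should_calc bookkeeping in teacache_forward."""

    def __init__(self, num_steps: int, rel_l1_thresh: float):
        self.num_steps = num_steps
        self.rel_l1_thresh = rel_l1_thresh
        self.cnt = 0
        self.accumulated_rel_l1_distance = 0.0
        self.previous_modulated_input = None

    def should_calc(self, modulated_inp: torch.Tensor) -> bool:
        # The first and last denoising steps are always computed in full.
        if self.cnt == 0 or self.cnt == self.num_steps - 1:
            decide = True
            self.accumulated_rel_l1_distance = 0.0
        else:
            rel_l1 = ((modulated_inp - self.previous_modulated_input).abs().mean()
                      / self.previous_modulated_input.abs().mean()).item()
            self.accumulated_rel_l1_distance += RESCALE(rel_l1)
            decide = self.accumulated_rel_l1_distance >= self.rel_l1_thresh
            if decide:
                self.accumulated_rel_l1_distance = 0.0
        self.previous_modulated_input = modulated_inp
        self.cnt = (self.cnt + 1) % self.num_steps
        return decide

In teacache_forward the same counters live on the transformer module itself (self.cnt, self.num_steps, self.rel_l1_thresh), which whatever code enables TeaCache is expected to initialize.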
tools/visualize.py ADDED
@@ -0,0 +1,183 @@
+ import cv2
+ import numpy as np
+ import os
+ import subprocess
+ from diffusers.utils import export_to_video
+
+ def parse_config(config):
+     """
+     Generate key data and mouse data from an action configuration.
+     - config: the (keyboard, mouse) configuration from list_actions[i]
+     - Returns: key_data and mouse_data
+     """
+     key_data = {}
+     mouse_data = {}
+
+     # Parse the frame range of the Space key
+     space_frames = set()
+     key, mouse = config
+
+     for i in range(len(mouse)):
+
+         if len(key[i]) == 7:
+             w, s, a, d, space, attack, _ = key[i]
+         else:
+             w, s, a, d, space, attack = key[i]
+
+         mouse_y, mouse_x = mouse[i]
+         mouse_y = -1 * mouse_y
+
+         # Key states
+         key_data[i] = {
+             "W": bool(w),
+             "A": bool(a),
+             "S": bool(s),
+             "D": bool(d),
+             "Space": bool(space),
+             "Attack": bool(attack),
+         }
+         # Mouse position
+         if i == 0:
+             mouse_data[i] = (320, 176)  # default initial position
+         else:
+             global_scale_factor = 0.2
+             mouse_scale_x = 15 * global_scale_factor
+             mouse_scale_y = 15 * 4 * global_scale_factor
+             mouse_data[i] = (
+                 mouse_data[i-1][0] + mouse_x * mouse_scale_x,  # accumulated x coordinate
+                 mouse_data[i-1][1] + mouse_y * mouse_scale_y,  # accumulated y coordinate
+             )
+
+     return key_data, mouse_data
+
+
+ # Draw a rounded rectangle
+ def draw_rounded_rectangle(image, top_left, bottom_right, color, radius=10, alpha=0.5):
+     overlay = image.copy()
+     x1, y1 = top_left
+     x2, y2 = bottom_right
+
+     cv2.rectangle(overlay, (x1 + radius, y1), (x2 - radius, y2), color, -1)
+     cv2.rectangle(overlay, (x1, y1 + radius), (x2, y2 - radius), color, -1)
+
+     cv2.ellipse(overlay, (x1 + radius, y1 + radius), (radius, radius), 180, 0, 90, color, -1)
+     cv2.ellipse(overlay, (x2 - radius, y1 + radius), (radius, radius), 270, 0, 90, color, -1)
+     cv2.ellipse(overlay, (x1 + radius, y2 - radius), (radius, radius), 90, 0, 90, color, -1)
+     cv2.ellipse(overlay, (x2 - radius, y2 - radius), (radius, radius), 0, 0, 90, color, -1)
+
+     cv2.addWeighted(overlay, alpha, image, 1 - alpha, 0, image)
+
+ # Draw key indicators on a frame
+ def draw_keys_on_frame(frame, keys, key_size=(80, 50), spacing=20, bottom_margin=30):
+     h, w, _ = frame.shape
+     horizon_shift = 90
+     vertical_shift = -20
+     horizon_shift_all = 50
+     key_positions = {
+         "W": (w // 2 - key_size[0] // 2 - horizon_shift - horizon_shift_all + spacing * 2, h - bottom_margin - key_size[1] * 2 + vertical_shift - 20),
+         "A": (w // 2 - key_size[0] * 2 + 5 - horizon_shift - horizon_shift_all + spacing * 2, h - bottom_margin - key_size[1] + vertical_shift),
+         "S": (w // 2 - key_size[0] // 2 - horizon_shift - horizon_shift_all + spacing * 2, h - bottom_margin - key_size[1] + vertical_shift),
+         "D": (w // 2 + key_size[0] - 5 - horizon_shift - horizon_shift_all + spacing * 2, h - bottom_margin - key_size[1] + vertical_shift),
+         "Space": (w // 2 + key_size[0] * 2 + spacing * 4 - horizon_shift - horizon_shift_all, h - bottom_margin - key_size[1] + vertical_shift),
+         "Attack": (w // 2 + key_size[0] * 3 + spacing * 9 - horizon_shift - horizon_shift_all, h - bottom_margin - key_size[1] + vertical_shift),
+     }
+
+     for key, (x, y) in key_positions.items():
+         is_pressed = keys.get(key, False)
+         top_left = (x, y)
+         if key in ["Space", "Attack"]:
+             bottom_right = (x + key_size[0] + 40, y + key_size[1])
+         else:
+             bottom_right = (x + key_size[0], y + key_size[1])
+
+         color = (0, 255, 0) if is_pressed else (200, 200, 200)
+         alpha = 0.8 if is_pressed else 0.5
+
+         draw_rounded_rectangle(frame, top_left, bottom_right, color, radius=10, alpha=alpha)
+
+         text_size = cv2.getTextSize(key, cv2.FONT_HERSHEY_SIMPLEX, 0.8, 2)[0]
+         if key in ["Space", "Attack"]:
+             text_x = x + (key_size[0] + 40 - text_size[0]) // 2
+         else:
+             text_x = x + (key_size[0] - text_size[0]) // 2
+         text_y = y + (key_size[1] + text_size[1]) // 2
+         cv2.putText(frame, key, (text_x, text_y), cv2.FONT_HERSHEY_SIMPLEX, 0.8, (0, 0, 0), 2)
+
+ # Overlay the mouse icon on a frame
+ def overlay_icon(frame, icon, position, scale=1.0, rotation=0):
+     x, y = position
+     h, w, _ = icon.shape
+
+     # Scale the icon
+     scaled_width = int(w * scale)
+     scaled_height = int(h * scale)
+     icon_resized = cv2.resize(icon, (scaled_width, scaled_height), interpolation=cv2.INTER_AREA)
+
+     # Rotate the icon
+     center = (scaled_width // 2, scaled_height // 2)
+     rotation_matrix = cv2.getRotationMatrix2D(center, rotation, 1.0)
+     icon_rotated = cv2.warpAffine(icon_resized, rotation_matrix, (scaled_width, scaled_height), flags=cv2.INTER_LINEAR, borderMode=cv2.BORDER_CONSTANT, borderValue=(0, 0, 0, 0))
+
+     h, w, _ = icon_rotated.shape
+     frame_h, frame_w, _ = frame.shape
+
+     # Compute the drawing region, clipped to the frame bounds
+     top_left_x = max(0, int(x - w // 2))
+     top_left_y = max(0, int(y - h // 2))
+     bottom_right_x = min(frame_w, int(x + w // 2))
+     bottom_right_y = min(frame_h, int(y + h // 2))
+
+     icon_x_start = max(0, int(-x + w // 2))
+     icon_y_start = max(0, int(-y + h // 2))
+     icon_x_end = icon_x_start + (bottom_right_x - top_left_x)
+     icon_y_end = icon_y_start + (bottom_right_y - top_left_y)
+
+     # Extract the icon region and split alpha from RGB
+     icon_region = icon_rotated[icon_y_start:icon_y_end, icon_x_start:icon_x_end]
+     alpha = icon_region[:, :, 3] / 255.0
+     icon_rgb = icon_region[:, :, :3]
+
+     # Extract the corresponding frame region
+     frame_region = frame[top_left_y:bottom_right_y, top_left_x:bottom_right_x]
+
+     # Alpha-blend the icon over the frame
+     for c in range(3):
+         frame_region[:, :, c] = (1 - alpha) * frame_region[:, :, c] + alpha * icon_rgb[:, :, c]
+
+     # Write the blended region back into the frame
+     frame[top_left_y:bottom_right_y, top_left_x:bottom_right_x] = frame_region
+
+
+ # Process a video: overlay key and mouse visualizations on every frame, then export
+ def process_video(input_video, output_video, config, mouse_icon_path, mouse_scale=2.0, mouse_rotation=0, fps=16):
+     key_data, mouse_data = parse_config(config)
+     frame_width = input_video[0].shape[1]
+     frame_height = input_video[0].shape[0]
+     frame_count = len(input_video)
+
+     mouse_icon = cv2.imread(mouse_icon_path, cv2.IMREAD_UNCHANGED)
+     out_video = []
+     frame_idx = 0
+     for frame in input_video:
+         keys = key_data.get(frame_idx, {"W": False, "A": False, "S": False, "D": False, "Space": False, "Attack": False})
+         raw_mouse_pos = mouse_data.get(frame_idx, (frame_width // 2 // 2, frame_height // 2 // 2))  # fallback also uses the half-resolution center
+         mouse_position = (int(raw_mouse_pos[0] * 2), int(raw_mouse_pos[1] * 2))
+         draw_keys_on_frame(frame, keys, key_size=(75, 75), spacing=10, bottom_margin=20)
+         overlay_icon(frame, mouse_icon, mouse_position, scale=mouse_scale, rotation=mouse_rotation)
+         out_video.append(frame / 255)
+         frame_idx += 1
+         print(f"Processing frame {frame_idx}/{frame_count}", end="\r")
+     export_to_video(out_video, output_video, fps=fps)
+     print("\nProcessing complete!")
+
+ # Save a video without overlays
+ def save_video(input_video, output_video, fps=16):
+     frame_count = len(input_video)
+     out_video = []
+     frame_idx = 0
+     for frame in input_video:
+         out_video.append(frame / 255)
+         frame_idx += 1
+         print(f"Processing frame {frame_idx}/{frame_count}", end="\r")
+     export_to_video(out_video, output_video, fps=fps)
+     print("\nProcessing complete!")
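A hedged usage sketch for process_video: frames are uint8 RGB arrays, and config is the (keyboard, mouse) pair that parse_config unpacks. The icon path is an assumption and must point to an RGBA image, since overlay_icon reads an alpha channel.

import numpy as np

from tools.visualize import process_video

num_frames = 16
frames = [np.zeros((360, 640, 3), dtype=np.uint8) for _ in range(num_frames)]
keyboard = [[1, 0, 0, 0, 0, 0]] * num_frames   # hold W on every frame
mouse = [[0.0, 0.1]] * num_frames              # (y, x) deltas: slow pan right

process_video(
    input_video=frames,
    output_video="overlay.mp4",
    config=(keyboard, mouse),
    mouse_icon_path="assets/mouse.png",  # assumed RGBA icon
    mouse_scale=1.0,
    fps=16,
)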