jbilcke-hf (HF Staff) committed
Commit 01c0e76 · 0 Parent(s)

Initial commit with LFS-tracked binary files

This view is limited to 50 files because it contains too many changes.

Files changed (50):
  1. .claude/settings.local.json +19 -0
  2. .gitattributes +26 -0
  3. .gitignore +1 -0
  4. CLAUDE.md +137 -0
  5. LICENSE +77 -0
  6. Notice.txt +100 -0
  7. README.md +296 -0
  8. app.py +360 -0
  9. asset/method.png +3 -0
  10. asset/teaser.png +3 -0
  11. asset/village.png +3 -0
  12. docs_for_ai_coding_bots/.DS_Store +0 -0
  13. docs_for_ai_coding_bots/huggingface_hub/Downloading-model-from-hub.md +174 -0
  14. docs_for_ai_coding_bots/huggingface_hub/Using-the-cache-in-hf-hub-library.md +531 -0
  15. hymm_sp/__init__.py +0 -0
  16. hymm_sp/config.py +160 -0
  17. hymm_sp/constants.py +58 -0
  18. hymm_sp/data_kits/data_tools.py +115 -0
  19. hymm_sp/data_kits/video_dataset.py +259 -0
  20. hymm_sp/diffusion/__init__.py +30 -0
  21. hymm_sp/diffusion/pipelines/__init__.py +5 -0
  22. hymm_sp/diffusion/pipelines/pipeline_hunyuan_video_game.py +1152 -0
  23. hymm_sp/diffusion/schedulers/__init__.py +2 -0
  24. hymm_sp/diffusion/schedulers/scheduling_flow_match_discrete.py +240 -0
  25. hymm_sp/helpers.py +194 -0
  26. hymm_sp/inference.py +201 -0
  27. hymm_sp/modules/__init__.py +38 -0
  28. hymm_sp/modules/activation_layers.py +23 -0
  29. hymm_sp/modules/attn_layers.py +437 -0
  30. hymm_sp/modules/cameranet.py +248 -0
  31. hymm_sp/modules/embed_layers.py +146 -0
  32. hymm_sp/modules/fp8_optimization.py +246 -0
  33. hymm_sp/modules/mlp_layers.py +97 -0
  34. hymm_sp/modules/models.py +697 -0
  35. hymm_sp/modules/modulate_layers.py +76 -0
  36. hymm_sp/modules/norm_layers.py +77 -0
  37. hymm_sp/modules/parallel_states.py +381 -0
  38. hymm_sp/modules/posemb_layers.py +112 -0
  39. hymm_sp/modules/token_refiner.py +265 -0
  40. hymm_sp/sample_batch.py +298 -0
  41. hymm_sp/sample_inference.py +716 -0
  42. hymm_sp/text_encoder/__init__.py +310 -0
  43. hymm_sp/vae/__init__.py +79 -0
  44. hymm_sp/vae/autoencoder_kl_causal_3d.py +781 -0
  45. hymm_sp/vae/unet_causal_3d_blocks.py +900 -0
  46. hymm_sp/vae/vae.py +433 -0
  47. requirements.txt +60 -0
  48. scripts/run_sample_batch_4090.sh +35 -0
  49. scripts/run_sample_batch_distill.sh +24 -0
  50. scripts/run_sample_batch_sp.sh +24 -0
.claude/settings.local.json ADDED
@@ -0,0 +1,19 @@
+ {
+   "permissions": {
+     "allow": [
+       "Bash(git remote set-url:*)",
+       "Bash(git lfs track:*)",
+       "Bash(git add:*)",
+       "Bash(git commit:*)",
+       "Bash(git push:*)",
+       "Bash(git rm:*)",
+       "Bash(git lfs:*)",
+       "Bash(git gc:*)",
+       "Bash(GIT_TRACE=1 git push origin main -f)",
+       "Bash(git rev-list:*)",
+       "Bash(git checkout:*)"
+     ],
+     "deny": [],
+     "ask": []
+   }
+ }
.gitattributes ADDED
@@ -0,0 +1,26 @@
+ *.npy filter=lfs diff=lfs merge=lfs -text
+ *.h5 filter=lfs diff=lfs merge=lfs -text
+ *.tar filter=lfs diff=lfs merge=lfs -text
+ *.tgz filter=lfs diff=lfs merge=lfs -text
+ *.avi filter=lfs diff=lfs merge=lfs -text
+ *.pt filter=lfs diff=lfs merge=lfs -text
+ *.jpg filter=lfs diff=lfs merge=lfs -text
+ *.jpeg filter=lfs diff=lfs merge=lfs -text
+ *.gif filter=lfs diff=lfs merge=lfs -text
+ *.safetensors filter=lfs diff=lfs merge=lfs -text
+ *.ckpt filter=lfs diff=lfs merge=lfs -text
+ *.hdf5 filter=lfs diff=lfs merge=lfs -text
+ *.png filter=lfs diff=lfs merge=lfs -text
+ *.mp4 filter=lfs diff=lfs merge=lfs -text
+ *.mov filter=lfs diff=lfs merge=lfs -text
+ *.pth filter=lfs diff=lfs merge=lfs -text
+ *.bin filter=lfs diff=lfs merge=lfs -text
+ *.pkl filter=lfs diff=lfs merge=lfs -text
+ *.pickle filter=lfs diff=lfs merge=lfs -text
+ *.npz filter=lfs diff=lfs merge=lfs -text
+ *.bmp filter=lfs diff=lfs merge=lfs -text
+ *.zip filter=lfs diff=lfs merge=lfs -text
+ *.tar.gz filter=lfs diff=lfs merge=lfs -text
+ *.webm filter=lfs diff=lfs merge=lfs -text
+ asset/teaser.png filter=lfs diff=lfs merge=lfs -text
+ asset/*.png filter=lfs diff=lfs merge=lfs -text
.gitignore ADDED
@@ -0,0 +1 @@
+ .DS_Store
CLAUDE.md ADDED
@@ -0,0 +1,137 @@
+ # CLAUDE.md
+
+ This file provides guidance to Claude Code (claude.ai/code) when working with code in this repository.
+
+ ## Project Overview
+
+ Hunyuan-GameCraft is a high-dynamic interactive game video generation system that creates gameplay videos with controllable camera movements and actions. The system uses diffusion models and action-controlled generation to synthesize realistic game footage from reference images and keyboard/mouse input controls.
+
+ ## Key Commands
+
+ ### Installation
+ ```bash
+ # Create and activate conda environment
+ conda create -n HYGameCraft python==3.10
+ conda activate HYGameCraft
+
+ # Install PyTorch and dependencies
+ conda install pytorch==2.5.1 torchvision==0.20.0 torchaudio==2.5.1 pytorch-cuda=12.4 -c pytorch -c nvidia
+
+ # Install requirements
+ python -m pip install -r requirements.txt
+
+ # Install flash attention (optional, for acceleration)
+ python -m pip install ninja
+ python -m pip install git+https://github.com/Dao-AILab/[email protected]
+ ```
+
+ ### Download Models
+ ```bash
+ cd weights
+ huggingface-cli download tencent/Hunyuan-GameCraft-1.0 --local-dir ./
+ ```
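For reference, the same weights can be fetched from Python with `huggingface_hub` (a minimal sketch mirroring the CLI command above; `snapshot_download` is the standard Hub API, not a helper defined in this repo, and the target directory is assumed to match the `weights/` layout used elsewhere in this file):

```python
from huggingface_hub import snapshot_download

# Download the full Hunyuan-GameCraft-1.0 snapshot into the local weights directory,
# equivalent to: huggingface-cli download tencent/Hunyuan-GameCraft-1.0 --local-dir ./
snapshot_download(
    repo_id="tencent/Hunyuan-GameCraft-1.0",
    local_dir="weights",
)
```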
+
+ ### Run Inference
+
+ **Multi-GPU (8 GPUs) - Standard Model:**
+ ```bash
+ torchrun --nnodes=1 --nproc_per_node=8 --master_port 29605 hymm_sp/sample_batch.py \
+     --image-path "asset/village.png" \
+     --prompt "YOUR_PROMPT" \
+     --ckpt weights/gamecraft_models/mp_rank_00_model_states.pt \
+     --video-size 704 1216 \
+     --cfg-scale 2.0 \
+     --image-start \
+     --action-list w s d a \
+     --action-speed-list 0.2 0.2 0.2 0.2 \
+     --seed 250160 \
+     --infer-steps 50 \
+     --save-path './results/'
+ ```
+
+ **Single GPU with Low VRAM (24GB minimum):**
+ ```bash
+ export DISABLE_SP=1
+ export CPU_OFFLOAD=1
+ torchrun --nnodes=1 --nproc_per_node=1 --master_port 29605 hymm_sp/sample_batch.py \
+     --ckpt weights/gamecraft_models/mp_rank_00_model_states.pt \
+     --cpu-offload \
+     --use-fp8 \
+     [other parameters...]
+ ```
+
+ **Distilled Model (faster, 8 inference steps):**
+ ```bash
+ torchrun --nnodes=1 --nproc_per_node=8 --master_port 29605 hymm_sp/sample_batch.py \
+     --ckpt weights/gamecraft_models/mp_rank_00_model_states_distill.pt \
+     --cfg-scale 1.0 \
+     --infer-steps 8 \
+     --use-fp8 \
+     [other parameters...]
+ ```
+
+ ## Architecture Overview
+
+ ### Core Components
+
+ 1. **Main Entry Points**
+    - `hymm_sp/sample_batch.py`: Main script for batch video generation with distributed processing
+    - `hymm_sp/sample_inference.py`: Core inference logic and model sampling
+    - `hymm_sp/config.py`: Configuration parsing and argument handling
+
+ 2. **Model Architecture (`hymm_sp/modules/`)**
+    - `models.py`: Core diffusion model implementation
+    - `cameranet.py`: Camera control and action encoding for game interactions
+    - `token_refiner.py`: Text token refinement for prompt conditioning
+    - `parallel_states.py`: Distributed training/inference state management
+    - `fp8_optimization.py`: FP8 quantization for memory/speed optimization
+
+ 3. **VAE Module (`hymm_sp/vae/`)**
+    - `autoencoder_kl_causal_3d.py`: 3D causal VAE for video encoding/decoding
+    - Handles latent space conversion for video frames
+
+ 4. **Diffusion Pipeline (`hymm_sp/diffusion/`)**
+    - `pipeline_hunyuan_video_game.py`: Custom pipeline for game video generation
+    - `scheduling_flow_match_discrete.py`: Flow matching scheduler for denoising
+
+ 5. **Data Processing (`hymm_sp/data_kits/`)**
+    - `video_dataset.py`: Dataset handling for video inputs
+    - `data_tools.py`: Video saving and processing utilities
+
+ ### Key Features
+
+ - **Action Control**: Maps keyboard inputs (w/a/s/d) to continuous camera space for smooth transitions (see the illustrative sketch after this list)
+ - **Hybrid History Conditioning**: Extends video sequences autoregressively while preserving scene context
+ - **Model Distillation**: Accelerated inference model (8 steps vs 50 steps)
+ - **Memory Optimization**: FP8 quantization, CPU offloading, and SageAttention support
+ - **Distributed Processing**: Multi-GPU support with sequence parallelism
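A purely illustrative sketch of the action-control idea, assuming a simple translation-style encoding; the actual mapping from key presses to camera trajectories is implemented in `hymm_sp/modules/cameranet.py` and may differ substantially:

```python
# Hypothetical illustration only: map w/a/s/d keys to unit direction vectors in a
# continuous camera space, scaled by the per-action speed (0.0-3.0).
ACTION_DIRECTIONS = {
    "w": (0.0, 0.0, 1.0),   # forward
    "s": (0.0, 0.0, -1.0),  # backward
    "a": (-1.0, 0.0, 0.0),  # left
    "d": (1.0, 0.0, 0.0),   # right
}

def actions_to_camera_offsets(action_list, action_speed_list):
    """Turn discrete key presses into continuous per-action camera offsets."""
    offsets = []
    for key, speed in zip(action_list, action_speed_list):
        dx, dy, dz = ACTION_DIRECTIONS[key]
        offsets.append((dx * speed, dy * speed, dz * speed))
    return offsets

print(actions_to_camera_offsets(["w", "s", "d", "a"], [0.2, 0.2, 0.2, 0.2]))
```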
+
+ ### Important Parameters
+
+ - `--action-list`: Sequence of keyboard actions (w/a/s/d)
+ - `--action-speed-list`: Movement speed for each action (0.0-3.0)
+ - `--video-size`: Output resolution (height width)
+ - `--cfg-scale`: Classifier-free guidance scale (1.0 for distilled, 2.0 for standard)
+ - `--infer-steps`: Denoising steps (8 for distilled, 50 for standard)
+ - `--use-fp8`: Enable FP8 optimization for memory reduction
+ - `--cpu-offload`: Offload model to CPU for low VRAM scenarios
+
+ ### Model Weights Structure
+ ```
+ weights/
+ ├── gamecraft_models/
+ │   ├── mp_rank_00_model_states.pt           # Standard model
+ │   └── mp_rank_00_model_states_distill.pt   # Distilled model
+ └── stdmodels/
+     ├── vae_3d/                              # 3D VAE model
+     ├── llava-llama-3-8b-v1_1-transformers/  # Text encoder
+     └── openai_clip-vit-large-patch14/       # CLIP encoder
+ ```
+
+ ## Development Notes
+
+ - Environment variable `MODEL_BASE` should point to `weights/stdmodels`
+ - Use `export DISABLE_SP=1` and `export CPU_OFFLOAD=1` for single GPU inference
+ - Minimum GPU memory: 24GB (very slow); recommended: 80GB per GPU
+ - Action length determines video duration (1 action = 33 frames at 25 FPS)
+ - SageAttention can be installed for additional acceleration
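A small sketch of the frame math implied by the notes above (plain Python; the 33-frames-per-action and 25 FPS figures come from the notes, everything else is illustrative):

```python
# Estimate clip length from an action list: each action contributes 33 frames,
# and the output video plays back at 25 FPS.
FRAMES_PER_ACTION = 33
FPS = 25

def estimate_duration(action_list):
    total_frames = len(action_list) * FRAMES_PER_ACTION
    return total_frames, total_frames / FPS

frames, seconds = estimate_duration(["w", "s", "d", "a"])
print(f"{frames} frames ~= {seconds:.2f} s")  # 132 frames ~= 5.28 s
```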
LICENSE ADDED
@@ -0,0 +1,77 @@
+ TENCENT HUNYUAN COMMUNITY LICENSE AGREEMENT
+ Tencent Hunyuan-GameCraft Release Date: August 14, 2025
+ THIS LICENSE AGREEMENT DOES NOT APPLY IN THE EUROPEAN UNION, UNITED KINGDOM AND SOUTH KOREA AND IS EXPRESSLY LIMITED TO THE TERRITORY, AS DEFINED BELOW.
+ By clicking to agree or by using, reproducing, modifying, distributing, performing or displaying any portion or element of the Tencent Hunyuan Works, including via any Hosted Service, You will be deemed to have recognized and accepted the content of this Agreement, which is effective immediately.
+ 1. DEFINITIONS.
+ a. “Acceptable Use Policy” shall mean the policy made available by Tencent as set forth in the Exhibit A.
+ b. “Agreement” shall mean the terms and conditions for use, reproduction, distribution, modification, performance and displaying of Tencent Hunyuan Works or any portion or element thereof set forth herein.
+ c. “Documentation” shall mean the specifications, manuals and documentation for Tencent Hunyuan made publicly available by Tencent.
+ d. “Hosted Service” shall mean a hosted service offered via an application programming interface (API), web access, or any other electronic or remote means.
+ e. “Licensee,” “You” or “Your” shall mean a natural person or legal entity exercising the rights granted by this Agreement and/or using the Tencent Hunyuan Works for any purpose and in any field of use.
+ f. “Materials” shall mean, collectively, Tencent’s proprietary Tencent Hunyuan and Documentation (and any portion thereof) as made available by Tencent under this Agreement.
+ g. “Model Derivatives” shall mean all: (i) modifications to Tencent Hunyuan or any Model Derivative of Tencent Hunyuan; (ii) works based on Tencent Hunyuan or any Model Derivative of Tencent Hunyuan; or (iii) any other machine learning model which is created by transfer of patterns of the weights, parameters, operations, or Output of Tencent Hunyuan or any Model Derivative of Tencent Hunyuan, to that model in order to cause that model to perform similarly to Tencent Hunyuan or a Model Derivative of Tencent Hunyuan, including distillation methods, methods that use intermediate data representations, or methods based on the generation of synthetic data Outputs by Tencent Hunyuan or a Model Derivative of Tencent Hunyuan for training that model. For clarity, Outputs by themselves are not deemed Model Derivatives.
+ h. “Output” shall mean the information and/or content output of Tencent Hunyuan or a Model Derivative that results from operating or otherwise using Tencent Hunyuan or a Model Derivative, including via a Hosted Service.
+ i. “Tencent,” “We” or “Us” shall mean the applicable entity or entities in the Tencent corporate family that own(s) intellectual property or other rights embodied in or utilized by the Materials.
+ j. “Tencent Hunyuan” shall mean the large language models, text/image/video/audio/3D generation models, and multimodal large language models and their software and algorithms, including trained model weights, parameters (including optimizer states), machine-learning model code, inference-enabling code, training-enabling code, fine-tuning enabling code and other elements of the foregoing made publicly available by Us, including, without limitation to, Tencent Hunyuan-GameCraft released at [https://github.com/Tencent-Hunyuan/Hunyuan-GameCraft-1.0].
+ k. “Tencent Hunyuan Works” shall mean: (i) the Materials; (ii) Model Derivatives; and (iii) all derivative works thereof.
+ l. “Territory” shall mean the worldwide territory, excluding the territory of the European Union, United Kingdom and South Korea.
+ m. “Third Party” or “Third Parties” shall mean individuals or legal entities that are not under common control with Us or You.
+ n. “including” shall mean including but not limited to.
+ 2. GRANT OF RIGHTS.
+ We grant You, for the Territory only, a non-exclusive, non-transferable and royalty-free limited license under Tencent’s intellectual property or other rights owned by Us embodied in or utilized by the Materials to use, reproduce, distribute, create derivative works of (including Model Derivatives), and make modifications to the Materials, only in accordance with the terms of this Agreement and the Acceptable Use Policy, and You must not violate (or encourage or permit anyone else to violate) any term of this Agreement or the Acceptable Use Policy.
+ 3. DISTRIBUTION.
+ You may, subject to Your compliance with this Agreement, distribute or make available to Third Parties the Tencent Hunyuan Works, exclusively in the Territory, provided that You meet all of the following conditions:
+ a. You must provide all such Third Party recipients of the Tencent Hunyuan Works or products or services using them a copy of this Agreement;
+ b. You must cause any modified files to carry prominent notices stating that You changed the files;
+ c. You are encouraged to: (i) publish at least one technology introduction blogpost or one public statement expressing Your experience of using the Tencent Hunyuan Works; and (ii) mark the products or services developed by using the Tencent Hunyuan Works to indicate that the product/service is “Powered by Tencent Hunyuan”; and
+ d. All distributions to Third Parties (other than through a Hosted Service) must be accompanied by a “Notice” text file that contains the following notice: “Tencent Hunyuan is licensed under the Tencent Hunyuan Community License Agreement, Copyright © 2025 Tencent. All Rights Reserved. The trademark rights of “Tencent Hunyuan” are owned by Tencent or its affiliate.”
+ You may add Your own copyright statement to Your modifications and, except as set forth in this Section and in Section 5, may provide additional or different license terms and conditions for use, reproduction, or distribution of Your modifications, or for any such Model Derivatives as a whole, provided Your use, reproduction, modification, distribution, performance and display of the work otherwise complies with the terms and conditions of this Agreement (including as regards the Territory). If You receive Tencent Hunyuan Works from a Licensee as part of an integrated end user product, then this Section 3 of this Agreement will not apply to You.
+ 4. ADDITIONAL COMMERCIAL TERMS.
+ If, on the Tencent Hunyuan version release date, the monthly active users of all products or services made available by or for Licensee is greater than 100 million monthly active users in the preceding calendar month, You must request a license from Tencent, which Tencent may grant to You in its sole discretion, and You are not authorized to exercise any of the rights under this Agreement unless or until Tencent otherwise expressly grants You such rights.
+ 5. RULES OF USE.
+ a. Your use of the Tencent Hunyuan Works must comply with applicable laws and regulations (including trade compliance laws and regulations) and adhere to the Acceptable Use Policy for the Tencent Hunyuan Works, which is hereby incorporated by reference into this Agreement. You must include the use restrictions referenced in these Sections 5(a) and 5(b) as an enforceable provision in any agreement (e.g., license agreement, terms of use, etc.) governing the use and/or distribution of Tencent Hunyuan Works and You must provide notice to subsequent users to whom You distribute that Tencent Hunyuan Works are subject to the use restrictions in these Sections 5(a) and 5(b).
+ b. You must not use the Tencent Hunyuan Works or any Output or results of the Tencent Hunyuan Works to improve any other AI model (other than Tencent Hunyuan or Model Derivatives thereof).
+ c. You must not use, reproduce, modify, distribute, or display the Tencent Hunyuan Works, Output or results of the Tencent Hunyuan Works outside the Territory. Any such use outside the Territory is unlicensed and unauthorized under this Agreement.
+ 6. INTELLECTUAL PROPERTY.
+ a. Subject to Tencent’s ownership of Tencent Hunyuan Works made by or for Tencent and intellectual property rights therein, conditioned upon Your compliance with the terms and conditions of this Agreement, as between You and Tencent, You will be the owner of any derivative works and modifications of the Materials and any Model Derivatives that are made by or for You.
+ b. No trademark licenses are granted under this Agreement, and in connection with the Tencent Hunyuan Works, Licensee may not use any name or mark owned by or associated with Tencent or any of its affiliates, except as required for reasonable and customary use in describing and distributing the Tencent Hunyuan Works. Tencent hereby grants You a license to use “Tencent Hunyuan” (the “Mark”) in the Territory solely as required to comply with the provisions of Section 3(c), provided that You comply with any applicable laws related to trademark protection. All goodwill arising out of Your use of the Mark will inure to the benefit of Tencent.
+ c. If You commence a lawsuit or other proceedings (including a cross-claim or counterclaim in a lawsuit) against Us or any person or entity alleging that the Materials or any Output, or any portion of any of the foregoing, infringe any intellectual property or other right owned or licensable by You, then all licenses granted to You under this Agreement shall terminate as of the date such lawsuit or other proceeding is filed. You will defend, indemnify and hold harmless Us from and against any claim by any Third Party arising out of or related to Your or the Third Party’s use or distribution of the Tencent Hunyuan Works.
+ d. Tencent claims no rights in Outputs You generate. You and Your users are solely responsible for Outputs and their subsequent uses.
+ 7. DISCLAIMERS OF WARRANTY AND LIMITATIONS OF LIABILITY.
+ a. We are not obligated to support, update, provide training for, or develop any further version of the Tencent Hunyuan Works or to grant any license thereto.
+ b. UNLESS AND ONLY TO THE EXTENT REQUIRED BY APPLICABLE LAW, THE TENCENT HUNYUAN WORKS AND ANY OUTPUT AND RESULTS THEREFROM ARE PROVIDED “AS IS” WITHOUT ANY EXPRESS OR IMPLIED WARRANTIES OF ANY KIND INCLUDING ANY WARRANTIES OF TITLE, MERCHANTABILITY, NONINFRINGEMENT, COURSE OF DEALING, USAGE OF TRADE, OR FITNESS FOR A PARTICULAR PURPOSE. YOU ARE SOLELY RESPONSIBLE FOR DETERMINING THE APPROPRIATENESS OF USING, REPRODUCING, MODIFYING, PERFORMING, DISPLAYING OR DISTRIBUTING ANY OF THE TENCENT HUNYUAN WORKS OR OUTPUTS AND ASSUME ANY AND ALL RISKS ASSOCIATED WITH YOUR OR A THIRD PARTY’S USE OR DISTRIBUTION OF ANY OF THE TENCENT HUNYUAN WORKS OR OUTPUTS AND YOUR EXERCISE OF RIGHTS AND PERMISSIONS UNDER THIS AGREEMENT.
+ c. TO THE FULLEST EXTENT PERMITTED BY APPLICABLE LAW, IN NO EVENT SHALL TENCENT OR ITS AFFILIATES BE LIABLE UNDER ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, TORT, NEGLIGENCE, PRODUCTS LIABILITY, OR OTHERWISE, FOR ANY DAMAGES, INCLUDING ANY DIRECT, INDIRECT, SPECIAL, INCIDENTAL, EXEMPLARY, CONSEQUENTIAL OR PUNITIVE DAMAGES, OR LOST PROFITS OF ANY KIND ARISING FROM THIS AGREEMENT OR RELATED TO ANY OF THE TENCENT HUNYUAN WORKS OR OUTPUTS, EVEN IF TENCENT OR ITS AFFILIATES HAVE BEEN ADVISED OF THE POSSIBILITY OF ANY OF THE FOREGOING.
+ 8. SURVIVAL AND TERMINATION.
+ a. The term of this Agreement shall commence upon Your acceptance of this Agreement or access to the Materials and will continue in full force and effect until terminated in accordance with the terms and conditions herein.
+ b. We may terminate this Agreement if You breach any of the terms or conditions of this Agreement. Upon termination of this Agreement, You must promptly delete and cease use of the Tencent Hunyuan Works. Sections 6(a), 6(c), 7 and 9 shall survive the termination of this Agreement.
+ 9. GOVERNING LAW AND JURISDICTION.
+ a. This Agreement and any dispute arising out of or relating to it will be governed by the laws of the Hong Kong Special Administrative Region of the People’s Republic of China, without regard to conflict of law principles, and the UN Convention on Contracts for the International Sale of Goods does not apply to this Agreement.
+ b. Exclusive jurisdiction and venue for any dispute arising out of or relating to this Agreement will be a court of competent jurisdiction in the Hong Kong Special Administrative Region of the People’s Republic of China, and Tencent and Licensee consent to the exclusive jurisdiction of such court with respect to any such dispute.
+
+ EXHIBIT A
+ ACCEPTABLE USE POLICY
+
+ Tencent reserves the right to update this Acceptable Use Policy from time to time.
+ Last modified: November 5, 2024
+
+ Tencent endeavors to promote safe and fair use of its tools and features, including Tencent Hunyuan. You agree not to use Tencent Hunyuan or Model Derivatives:
+ 1. Outside the Territory;
+ 2. In any way that violates any applicable national, federal, state, local, international or any other law or regulation;
+ 3. To harm Yourself or others;
+ 4. To repurpose or distribute output from Tencent Hunyuan or any Model Derivatives to harm Yourself or others;
+ 5. To override or circumvent the safety guardrails and safeguards We have put in place;
+ 6. For the purpose of exploiting, harming or attempting to exploit or harm minors in any way;
+ 7. To generate or disseminate verifiably false information and/or content with the purpose of harming others or influencing elections;
+ 8. To generate or facilitate false online engagement, including fake reviews and other means of fake online engagement;
+ 9. To intentionally defame, disparage or otherwise harass others;
+ 10. To generate and/or disseminate malware (including ransomware) or any other content to be used for the purpose of harming electronic systems;
+ 11. To generate or disseminate personal identifiable information with the purpose of harming others;
+ 12. To generate or disseminate information (including images, code, posts, articles), and place the information in any public context (including –through the use of bot generated tweets), without expressly and conspicuously identifying that the information and/or content is machine generated;
+ 13. To impersonate another individual without consent, authorization, or legal right;
+ 14. To make high-stakes automated decisions in domains that affect an individual’s safety, rights or wellbeing (e.g., law enforcement, migration, medicine/health, management of critical infrastructure, safety components of products, essential services, credit, employment, housing, education, social scoring, or insurance);
+ 15. In a manner that violates or disrespects the social ethics and moral standards of other countries or regions;
+ 16. To perform, facilitate, threaten, incite, plan, promote or encourage violent extremism or terrorism;
+ 17. For any use intended to discriminate against or harm individuals or groups based on protected characteristics or categories, online or offline social behavior or known or predicted personal or personality characteristics;
+ 18. To intentionally exploit any of the vulnerabilities of a specific group of persons based on their age, social, physical or mental characteristics, in order to materially distort the behavior of a person pertaining to that group in a manner that causes or is likely to cause that person or another person physical or psychological harm;
+ 19. For military purposes;
+ 20. To engage in the unauthorized or unlicensed practice of any profession including, but not limited to, financial, legal, medical/health, or other professional practices.
Notice.txt ADDED
@@ -0,0 +1,100 @@
+ Usage and Legal Notices:
+
+ Tencent is pleased to support the open source community by making Tencent Hunyuan-GameCraft available.
+
+ Copyright (C) 2025 Tencent. All rights reserved. The below softwares in this distribution may have been modified by Tencent ("Tencent Modifications"). All Tencent Modifications are Copyright (C) Tencent.
+ Tencent Hunyuan-GameCraft is licensed under Tencent Hunyuan Community License Agreement, which can be found in this repository called "LICENSE", except for the third-party components listed below. Tencent Hunyuan-GameCraft does not impose any additional limitations beyond what is outlined in the respective licenses of these third-party components. Users must comply with all terms and conditions of original licenses of these third-party components and must ensure that the usage of the third party components adheres to all relevant laws and regulations.
+
+
+ Other dependencies and licenses:
+
+
+ Open Source Software Licensed under the TENCENT HUNYUAN COMMUNITY LICENSE AGREEMENT and Other Licenses of the Third-Party Components therein:
+ The below software in this distribution may have been modified by Tencent ("Tencent Modifications"). All Tencent Modifications are Copyright (C) 2025 Tencent.
+ --------------------------------------------------------------------
+ 1. HunyuanVideo
+ Copyright (C) 2024 THL A29 Limited, a Tencent company. All rights reserved.
+
+
+ Terms of the TENCENT HUNYUAN COMMUNITY LICENSE AGREEMENT:
+ --------------------------------------------------------------------
+ TENCENT HUNYUAN COMMUNITY LICENSE AGREEMENT
+ Tencent HunyuanVideo Release Date: December 3, 2024
+ THIS LICENSE AGREEMENT DOES NOT APPLY IN THE EUROPEAN UNION, UNITED KINGDOM AND SOUTH KOREA AND IS EXPRESSLY LIMITED TO THE TERRITORY, AS DEFINED BELOW.
+ By clicking to agree or by using, reproducing, modifying, distributing, performing or displaying any portion or element of the Tencent Hunyuan Works, including via any Hosted Service, You will be deemed to have recognized and accepted the content of this Agreement, which is effective immediately.
+ 1. DEFINITIONS.
+ a. “Acceptable Use Policy” shall mean the policy made available by Tencent as set forth in the Exhibit A.
+ b. “Agreement” shall mean the terms and conditions for use, reproduction, distribution, modification, performance and displaying of Tencent Hunyuan Works or any portion or element thereof set forth herein.
+ c. “Documentation” shall mean the specifications, manuals and documentation for Tencent Hunyuan made publicly available by Tencent.
+ d. “Hosted Service” shall mean a hosted service offered via an application programming interface (API), web access, or any other electronic or remote means.
+ e. “Licensee,” “You” or “Your” shall mean a natural person or legal entity exercising the rights granted by this Agreement and/or using the Tencent Hunyuan Works for any purpose and in any field of use.
+ f. “Materials” shall mean, collectively, Tencent’s proprietary Tencent Hunyuan and Documentation (and any portion thereof) as made available by Tencent under this Agreement.
+ g. “Model Derivatives” shall mean all: (i) modifications to Tencent Hunyuan or any Model Derivative of Tencent Hunyuan; (ii) works based on Tencent Hunyuan or any Model Derivative of Tencent Hunyuan; or (iii) any other machine learning model which is created by transfer of patterns of the weights, parameters, operations, or Output of Tencent Hunyuan or any Model Derivative of Tencent Hunyuan, to that model in order to cause that model to perform similarly to Tencent Hunyuan or a Model Derivative of Tencent Hunyuan, including distillation methods, methods that use intermediate data representations, or methods based on the generation of synthetic data Outputs by Tencent Hunyuan or a Model Derivative of Tencent Hunyuan for training that model. For clarity, Outputs by themselves are not deemed Model Derivatives.
+ h. “Output” shall mean the information and/or content output of Tencent Hunyuan or a Model Derivative that results from operating or otherwise using Tencent Hunyuan or a Model Derivative, including via a Hosted Service.
+ i. “Tencent,” “We” or “Us” shall mean THL A29 Limited.
+ j. “Tencent Hunyuan” shall mean the large language models, text/image/video/audio/3D generation models, and multimodal large language models and their software and algorithms, including trained model weights, parameters (including optimizer states), machine-learning model code, inference-enabling code, training-enabling code, fine-tuning enabling code and other elements of the foregoing made publicly available by Us, including, without limitation to, Tencent HunyuanVideo released at [https://github.com/Tencent/HunyuanVideo].
+ k. “Tencent Hunyuan Works” shall mean: (i) the Materials; (ii) Model Derivatives; and (iii) all derivative works thereof.
+ l. “Territory” shall mean the worldwide territory, excluding the territory of the European Union, United Kingdom and South Korea.
+ m. “Third Party” or “Third Parties” shall mean individuals or legal entities that are not under common control with Us or You.
+ n. “including” shall mean including but not limited to.
+ 2. GRANT OF RIGHTS.
+ We grant You, for the Territory only, a non-exclusive, non-transferable and royalty-free limited license under Tencent’s intellectual property or other rights owned by Us embodied in or utilized by the Materials to use, reproduce, distribute, create derivative works of (including Model Derivatives), and make modifications to the Materials, only in accordance with the terms of this Agreement and the Acceptable Use Policy, and You must not violate (or encourage or permit anyone else to violate) any term of this Agreement or the Acceptable Use Policy.
+ 3. DISTRIBUTION.
+ You may, subject to Your compliance with this Agreement, distribute or make available to Third Parties the Tencent Hunyuan Works, exclusively in the Territory, provided that You meet all of the following conditions:
+ a. You must provide all such Third Party recipients of the Tencent Hunyuan Works or products or services using them a copy of this Agreement;
+ b. You must cause any modified files to carry prominent notices stating that You changed the files;
+ c. You are encouraged to: (i) publish at least one technology introduction blogpost or one public statement expressing Your experience of using the Tencent Hunyuan Works; and (ii) mark the products or services developed by using the Tencent Hunyuan Works to indicate that the product/service is “Powered by Tencent Hunyuan”; and
+ d. All distributions to Third Parties (other than through a Hosted Service) must be accompanied by a “Notice” text file that contains the following notice: “Tencent Hunyuan is licensed under the Tencent Hunyuan Community License Agreement, Copyright © 2024 Tencent. All Rights Reserved. The trademark rights of “Tencent Hunyuan” are owned by Tencent or its affiliate.”
+ You may add Your own copyright statement to Your modifications and, except as set forth in this Section and in Section 5, may provide additional or different license terms and conditions for use, reproduction, or distribution of Your modifications, or for any such Model Derivatives as a whole, provided Your use, reproduction, modification, distribution, performance and display of the work otherwise complies with the terms and conditions of this Agreement (including as regards the Territory). If You receive Tencent Hunyuan Works from a Licensee as part of an integrated end user product, then this Section 3 of this Agreement will not apply to You.
+ 4. ADDITIONAL COMMERCIAL TERMS.
+ If, on the Tencent Hunyuan version release date, the monthly active users of all products or services made available by or for Licensee is greater than 100 million monthly active users in the preceding calendar month, You must request a license from Tencent, which Tencent may grant to You in its sole discretion, and You are not authorized to exercise any of the rights under this Agreement unless or until Tencent otherwise expressly grants You such rights.
+ 5. RULES OF USE.
+ a. Your use of the Tencent Hunyuan Works must comply with applicable laws and regulations (including trade compliance laws and regulations) and adhere to the Acceptable Use Policy for the Tencent Hunyuan Works, which is hereby incorporated by reference into this Agreement. You must include the use restrictions referenced in these Sections 5(a) and 5(b) as an enforceable provision in any agreement (e.g., license agreement, terms of use, etc.) governing the use and/or distribution of Tencent Hunyuan Works and You must provide notice to subsequent users to whom You distribute that Tencent Hunyuan Works are subject to the use restrictions in these Sections 5(a) and 5(b).
+ b. You must not use the Tencent Hunyuan Works or any Output or results of the Tencent Hunyuan Works to improve any other AI model (other than Tencent Hunyuan or Model Derivatives thereof).
+ c. You must not use, reproduce, modify, distribute, or display the Tencent Hunyuan Works, Output or results of the Tencent Hunyuan Works outside the Territory. Any such use outside the Territory is unlicensed and unauthorized under this Agreement.
+ 6. INTELLECTUAL PROPERTY.
+ a. Subject to Tencent’s ownership of Tencent Hunyuan Works made by or for Tencent and intellectual property rights therein, conditioned upon Your compliance with the terms and conditions of this Agreement, as between You and Tencent, You will be the owner of any derivative works and modifications of the Materials and any Model Derivatives that are made by or for You.
+ b. No trademark licenses are granted under this Agreement, and in connection with the Tencent Hunyuan Works, Licensee may not use any name or mark owned by or associated with Tencent or any of its affiliates, except as required for reasonable and customary use in describing and distributing the Tencent Hunyuan Works. Tencent hereby grants You a license to use “Tencent Hunyuan” (the “Mark”) in the Territory solely as required to comply with the provisions of Section 3(c), provided that You comply with any applicable laws related to trademark protection. All goodwill arising out of Your use of the Mark will inure to the benefit of Tencent.
+ c. If You commence a lawsuit or other proceedings (including a cross-claim or counterclaim in a lawsuit) against Us or any person or entity alleging that the Materials or any Output, or any portion of any of the foregoing, infringe any intellectual property or other right owned or licensable by You, then all licenses granted to You under this Agreement shall terminate as of the date such lawsuit or other proceeding is filed. You will defend, indemnify and hold harmless Us from and against any claim by any Third Party arising out of or related to Your or the Third Party’s use or distribution of the Tencent Hunyuan Works.
+ d. Tencent claims no rights in Outputs You generate. You and Your users are solely responsible for Outputs and their subsequent uses.
+ 7. DISCLAIMERS OF WARRANTY AND LIMITATIONS OF LIABILITY.
+ a. We are not obligated to support, update, provide training for, or develop any further version of the Tencent Hunyuan Works or to grant any license thereto.
+ b. UNLESS AND ONLY TO THE EXTENT REQUIRED BY APPLICABLE LAW, THE TENCENT HUNYUAN WORKS AND ANY OUTPUT AND RESULTS THEREFROM ARE PROVIDED “AS IS” WITHOUT ANY EXPRESS OR IMPLIED WARRANTIES OF ANY KIND INCLUDING ANY WARRANTIES OF TITLE, MERCHANTABILITY, NONINFRINGEMENT, COURSE OF DEALING, USAGE OF TRADE, OR FITNESS FOR A PARTICULAR PURPOSE. YOU ARE SOLELY RESPONSIBLE FOR DETERMINING THE APPROPRIATENESS OF USING, REPRODUCING, MODIFYING, PERFORMING, DISPLAYING OR DISTRIBUTING ANY OF THE TENCENT HUNYUAN WORKS OR OUTPUTS AND ASSUME ANY AND ALL RISKS ASSOCIATED WITH YOUR OR A THIRD PARTY’S USE OR DISTRIBUTION OF ANY OF THE TENCENT HUNYUAN WORKS OR OUTPUTS AND YOUR EXERCISE OF RIGHTS AND PERMISSIONS UNDER THIS AGREEMENT.
+ c. TO THE FULLEST EXTENT PERMITTED BY APPLICABLE LAW, IN NO EVENT SHALL TENCENT OR ITS AFFILIATES BE LIABLE UNDER ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, TORT, NEGLIGENCE, PRODUCTS LIABILITY, OR OTHERWISE, FOR ANY DAMAGES, INCLUDING ANY DIRECT, INDIRECT, SPECIAL, INCIDENTAL, EXEMPLARY, CONSEQUENTIAL OR PUNITIVE DAMAGES, OR LOST PROFITS OF ANY KIND ARISING FROM THIS AGREEMENT OR RELATED TO ANY OF THE TENCENT HUNYUAN WORKS OR OUTPUTS, EVEN IF TENCENT OR ITS AFFILIATES HAVE BEEN ADVISED OF THE POSSIBILITY OF ANY OF THE FOREGOING.
+ 8. SURVIVAL AND TERMINATION.
+ a. The term of this Agreement shall commence upon Your acceptance of this Agreement or access to the Materials and will continue in full force and effect until terminated in accordance with the terms and conditions herein.
+ b. We may terminate this Agreement if You breach any of the terms or conditions of this Agreement. Upon termination of this Agreement, You must promptly delete and cease use of the Tencent Hunyuan Works. Sections 6(a), 6(c), 7 and 9 shall survive the termination of this Agreement.
+ 9. GOVERNING LAW AND JURISDICTION.
+ a. This Agreement and any dispute arising out of or relating to it will be governed by the laws of the Hong Kong Special Administrative Region of the People’s Republic of China, without regard to conflict of law principles, and the UN Convention on Contracts for the International Sale of Goods does not apply to this Agreement.
+ b. Exclusive jurisdiction and venue for any dispute arising out of or relating to this Agreement will be a court of competent jurisdiction in the Hong Kong Special Administrative Region of the People’s Republic of China, and Tencent and Licensee consent to the exclusive jurisdiction of such court with respect to any such dispute.
+
+ EXHIBIT A
+ ACCEPTABLE USE POLICY
+
+ Tencent reserves the right to update this Acceptable Use Policy from time to time.
+ Last modified: November 5, 2024
+
+ Tencent endeavors to promote safe and fair use of its tools and features, including Tencent Hunyuan. You agree not to use Tencent Hunyuan or Model Derivatives:
+ 1. Outside the Territory;
+ 2. In any way that violates any applicable national, federal, state, local, international or any other law or regulation;
+ 3. To harm Yourself or others;
+ 4. To repurpose or distribute output from Tencent Hunyuan or any Model Derivatives to harm Yourself or others;
+ 5. To override or circumvent the safety guardrails and safeguards We have put in place;
+ 6. For the purpose of exploiting, harming or attempting to exploit or harm minors in any way;
+ 7. To generate or disseminate verifiably false information and/or content with the purpose of harming others or influencing elections;
+ 8. To generate or facilitate false online engagement, including fake reviews and other means of fake online engagement;
+ 9. To intentionally defame, disparage or otherwise harass others;
+ 10. To generate and/or disseminate malware (including ransomware) or any other content to be used for the purpose of harming electronic systems;
+ 11. To generate or disseminate personal identifiable information with the purpose of harming others;
+ 12. To generate or disseminate information (including images, code, posts, articles), and place the information in any public context (including –through the use of bot generated tweets), without expressly and conspicuously identifying that the information and/or content is machine generated;
+ 13. To impersonate another individual without consent, authorization, or legal right;
+ 14. To make high-stakes automated decisions in domains that affect an individual’s safety, rights or wellbeing (e.g., law enforcement, migration, medicine/health, management of critical infrastructure, safety components of products, essential services, credit, employment, housing, education, social scoring, or insurance);
+ 15. In a manner that violates or disrespects the social ethics and moral standards of other countries or regions;
+ 16. To perform, facilitate, threaten, incite, plan, promote or encourage violent extremism or terrorism;
+ 17. For any use intended to discriminate against or harm individuals or groups based on protected characteristics or categories, online or offline social behavior or known or predicted personal or personality characteristics;
+ 18. To intentionally exploit any of the vulnerabilities of a specific group of persons based on their age, social, physical or mental characteristics, in order to materially distort the behavior of a person pertaining to that group in a manner that causes or is likely to cause that person or another person physical or psychological harm;
+ 19. For military purposes;
+ 20. To engage in the unauthorized or unlicensed practice of any profession including, but not limited to, financial, legal, medical/health, or other professional practices.
+
+ For the license of other third party components, please refer to the following URL:
+ https://github.com/Tencent-Hunyuan/HunyuanVideo/blob/ff2dd59277b3177785d8279d4170968afa3b1d55/Notice
README.md ADDED
@@ -0,0 +1,296 @@
+ ---
+ title: Hunyuan-GameCraft
+ emoji: 🎮
+ colorFrom: blue
+ colorTo: purple
+ sdk: gradio
+ sdk_version: 5.42.0
+ app_file: app.py
+ pinned: true
+ license: mit
+ short_description: Interactive Game Video Generation
+ ---
+
+ <!-- ## **Hunyuan-GameCraft** -->
+
+ <!-- <p align="center">
+ <img src="assets/material/logo.png" height=100>
+ </p> -->
+
+ # **Hunyuan-GameCraft** 🎮
+
+ <div align="center">
+ <a href="https://github.com/Tencent-Hunyuan/Hunyuan-GameCraft-1.0"><img src="https://img.shields.io/static/v1?label=Code&message=Github&color=blue"></a> &ensp;
+ <a href="https://hunyuan-gamecraft.github.io/"><img src="https://img.shields.io/static/v1?label=Project%20Page&message=Web&color=green"></a> &ensp;
+ <a href="https://arxiv.org/abs/2506.17201"><img src="https://img.shields.io/badge/ArXiv-2506.17201-red"></a> &ensp;
+ <a href="https://huggingface.co/tencent/Hunyuan-GameCraft-1.0"><img src="https://img.shields.io/static/v1?label=Huggingface&message=Hunyuan-GameCraft-1.0&color=yellow"></a> &ensp;
+ </div>
+
+ ![image](asset/teaser.png)
+
+ > [**Hunyuan-GameCraft: High-dynamic Interactive Game Video Generation with Hybrid History Condition**](https://arxiv.org/abs/2506.17201) <br>
+
+
+
+ ## 🔥🔥🔥 News!!
+ * Aug 14, 2025: 👋 We release the inference code and model weights of Hunyuan-GameCraft. [Download](weights/README.md).
+
+
+ ## 📑 Open-source Plan
+
+ - Hunyuan-GameCraft
+   - [x] Inference
+   - [x] Checkpoints
+   - [ ] Gradio & Huggingface Demo
+
+ ## Contents
+ - [**Hunyuan-GameCraft** 🎮](#Hunyuan-GameCraft-)
+   - [🔥🔥🔥 News!!](#-news)
+   - [📑 Open-source Plan](#-open-source-plan)
+   - [Contents](#contents)
+   - [**Abstract**](#abstract)
+   - [**Overall Architecture**](#-overall-architecture)
+   - [📜 Requirements](#-requirements)
+   - [🛠️ Dependencies and Installation](#️-dependencies-and-installation)
+     - [Installation Guide for Linux](#installation-guide-for-linux)
+   - [🧱 Download Pretrained Models](#-download-pretrained-models)
+   - [🚀 Parallel Inference on Multiple GPUs](#-parallel-inference-on-multiple-gpus)
+   - [🔑 Single-gpu with Low-VRAM Inference](#-single-gpu-with-low-vram-inference)
+     - [Run with very low VRAM](#run-with-very-low-vram)
+     - [Run a Gradio Server](#run-a-gradio-server)
+   - [🔗 BibTeX](#-bibtex)
+   - [Acknowledgements](#acknowledgements)
+ ---
+
+ ## **Abstract**
+
+ Recent advances in diffusion-based and controllable video generation have enabled high-quality and temporally coherent video synthesis, laying the groundwork for immersive interactive gaming experiences. However, current methods face limitations in **dynamics**, **physical realism**, **long-term consistency**, and **efficiency**, which limit their ability to create diverse gameplay videos. To address these gaps, we introduce Hunyuan-GameCraft, a novel framework for high-dynamic interactive video generation in game environments. To achieve fine-grained action control, we unify standard keyboard and mouse inputs into a **shared camera representation space**, facilitating smooth interpolation between various camera and movement operations. Then we propose a **hybrid history-conditioned training strategy** that extends video sequences autoregressively while preserving game scene information. Additionally, to enhance inference efficiency and playability, we apply **model distillation** to reduce computational overhead while maintaining consistency across long temporal sequences, making it suitable for real-time deployment in complex interactive environments. The model is trained on a large-scale dataset comprising over one million gameplay recordings across over 100 AAA games, ensuring broad coverage and diversity, then fine-tuned on a carefully annotated synthetic dataset to enhance precision and control. The curated game scene data significantly improves the visual fidelity, realism and action controllability. Extensive experiments demonstrate that Hunyuan-GameCraft significantly outperforms existing models, advancing the realism and playability of interactive game video generation.
+
+ ## **Overall Architecture**
+
+ ![image](asset/method.png)
+
+ Given a reference image, the corresponding prompt, and a keyboard or mouse signal, we transform these inputs into a continuous camera space. We then design a lightweight action encoder to encode the input camera trajectory. The action and image features are added after the patchify step. For long video extension, we design a variable mask indicator, where 1 and 0 indicate history frames and predicted frames, respectively.
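As a hedged illustration of the mask indicator described in the paragraph above (a minimal sketch; the real conditioning logic lives in the diffusion pipeline and may operate on latent frames rather than raw frames):

```python
import torch

def make_history_mask(num_history_frames, num_total_frames):
    """Binary indicator per frame: 1 = history (conditioning) frame, 0 = frame to predict."""
    mask = torch.zeros(num_total_frames)
    mask[:num_history_frames] = 1.0
    return mask

# e.g. extend a 33-frame clip by another 33 predicted frames
print(make_history_mask(33, 66))
```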
+
+
+ ## 📜 Requirements
+
+ * An NVIDIA GPU with CUDA support is required.
+ * The model is tested on a machine with 8 × H20/H800 GPUs.
+ * **Minimum**: The minimum GPU memory required is 24GB, but generation is very slow.
+ * **Recommended**: We recommend using a GPU with 80GB of memory for better generation quality.
+ * Tested operating system: Linux
+
+
+ ## 🛠️ Dependencies and Installation
+
+ Begin by cloning the repository:
+ ```shell
+ git clone https://github.com/Tencent-Hunyuan/Hunyuan-GameCraft-1.0.git
+ cd Hunyuan-GameCraft-1.0
+ ```
+
+ ### Installation Guide for Linux
+
+ We recommend CUDA version 12.4 for manual installation.
+
+ Conda's installation instructions are available [here](https://docs.anaconda.com/free/miniconda/index.html).
+
+ ```shell
+ # 1. Create conda environment
+ conda create -n HYGameCraft python==3.10
+
+ # 2. Activate the environment
+ conda activate HYGameCraft
+
+ # 3. Install PyTorch and other dependencies using conda
+ conda install pytorch==2.5.1 torchvision==0.20.0 torchaudio==2.5.1 pytorch-cuda=12.4 -c pytorch -c nvidia
+
+ # 4. Install pip dependencies
+ python -m pip install -r requirements.txt
+ # 5. Install flash attention v2 for acceleration (requires CUDA 11.8 or above)
+ python -m pip install ninja
+ python -m pip install git+https://github.com/Dao-AILab/[email protected]
+ ```
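An optional post-install sanity check (a small sketch using only standard PyTorch calls; the version numbers it reports should roughly match the pins above, PyTorch 2.5.1 built against CUDA 12.4):

```python
import torch

# Confirm the environment roughly matches the pinned install.
print("torch:", torch.__version__)
print("cuda runtime:", torch.version.cuda)
print("cuda available:", torch.cuda.is_available())
if torch.cuda.is_available():
    print("device:", torch.cuda.get_device_name(0))
```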
+
+ Alternatively, you can use the HunyuanVideo Docker image. Use the following commands to pull and run it.
+
+ ```shell
+ # For CUDA 12.4 (updated to avoid a floating point exception)
+ docker pull hunyuanvideo/hunyuanvideo:cuda_12
+ docker run -itd --gpus all --init --net=host --uts=host --ipc=host --name hunyuanvideo --security-opt=seccomp=unconfined --ulimit=stack=67108864 --ulimit=memlock=-1 --privileged hunyuanvideo/hunyuanvideo:cuda_12
+ pip install diffusers==0.34.0 transformers==4.54.1
+
+ ```
+
+
+ ## 🧱 Download Pretrained Models
+
+ Details on downloading the pretrained models are given [here](weights/README.md).
+
+ ## 🚀 Parallel Inference on Multiple GPUs
+
+ For example, to generate a video using 8 GPUs, you can use the following command, where `--action-list w s d a` simulates keyboard input signals that drive the generated content. `--action-speed-list 0.2 0.2 0.2 0.2` specifies the displacement distance for each action and can be set to any value between 0 and 3.
+
+ You can use any combination and any length of the action list (one action per 33 frames, 25 FPS) to generate a long video; make sure `--action-speed-list` has the same length as `--action-list`. Note that inference time is linearly related to the number of actions:
+
+ ```bash
+ #!/bin/bash
+ JOBS_DIR=$(dirname $(dirname "$0"))
+ export PYTHONPATH=${JOBS_DIR}:$PYTHONPATH
+ export MODEL_BASE="weights/stdmodels"
+ checkpoint_path="weights/gamecraft_models/mp_rank_00_model_states.pt"
+
+ current_time=$(date "+%Y.%m.%d-%H.%M.%S")
+ modelname='Tencent_hunyuanGameCraft_720P'
+
+ torchrun --nnodes=1 --nproc_per_node=8 --master_port 29605 hymm_sp/sample_batch.py \
+     --image-path "asset/village.png" \
+     --prompt "A charming medieval village with cobblestone streets, thatched-roof houses, and vibrant flower gardens under a bright blue sky." \
+     --add-pos-prompt "Realistic, High-quality." \
+     --add-neg-prompt "overexposed, low quality, deformation, a poor composition, bad hands, bad teeth, bad eyes, bad limbs, distortion, blurring, text, subtitles, static, picture, black border." \
+     --ckpt ${checkpoint_path} \
+     --video-size 704 1216 \
+     --cfg-scale 2.0 \
+     --image-start \
+     --action-list w s d a \
+     --action-speed-list 0.2 0.2 0.2 0.2 \
+     --seed 250160 \
+     --infer-steps 50 \
+     --flow-shift-eval-video 5.0 \
+     --save-path './results/'

+ ```
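A hedged helper for the constraints described before the command above (lengths must match, speeds between 0 and 3, actions limited to w/a/s/d); this is not part of the repo's CLI, just a sketch of checks you might run before launching a job:

```python
VALID_ACTIONS = {"w", "a", "s", "d"}

def check_action_args(action_list, action_speed_list):
    """Validate an --action-list / --action-speed-list pair before launching."""
    assert len(action_list) == len(action_speed_list), "lists must have the same length"
    assert all(a in VALID_ACTIONS for a in action_list), "actions must be one of w/a/s/d"
    assert all(0.0 <= s <= 3.0 for s in action_speed_list), "speeds must be in [0, 3]"

check_action_args(["w", "s", "d", "a"], [0.2, 0.2, 0.2, 0.2])
```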
164
+
165
+
166
+ Additionally, we support FP8 optimization and [SageAttn](https://github.com/thu-ml/SageAttention). To enable FP8, simply add the `--use-fp8` to your command.
167
+ And install SageAttention with:
168
+ ```bash
169
+ git clone https://github.com/thu-ml/SageAttention.git
170
+ cd SageAttention
171
+ python setup.py install # or pip install -e .
172
+ ```
173
+
174
+ We also provide an accelerated model, you can use the following command:
175
+ ```bash
176
+ #!/bin/bash
177
+ JOBS_DIR=$(dirname $(dirname "$0"))
178
+ export PYTHONPATH=${JOBS_DIR}:$PYTHONPATH
179
+ export MODEL_BASE="weights/stdmodels"
180
+ checkpoint_path="weights/gamecraft_models/mp_rank_00_model_states_distill.pt"
181
+
182
+ current_time=$(date "+%Y.%m.%d-%H.%M.%S")
183
+ modelname='Tencent_hunyuanGameCraft_720P'
184
+
185
+ torchrun --nnodes=1 --nproc_per_node=8 --master_port 29605 hymm_sp/sample_batch.py \
186
+ --image-path "asset/village.png" \
187
+ --prompt "A charming medieval village with cobblestone streets, thatched-roof houses, and vibrant flower gardens under a bright blue sky." \
188
+ --add-neg-prompt "overexposed, low quality, deformation, a poor composition, bad hands, bad teeth, bad eyes, bad limbs, distortion, blurring, text, subtitles, static, picture, black border." \
189
+ --ckpt ${checkpoint_path} \
190
+ --video-size 704 1216 \
191
+ --cfg-scale 1.0 \
192
+ --image-start \
193
+ --action-list w s d a \
194
+ --action-speed-list 0.2 0.2 0.2 0.2 \
195
+ --seed 250160 \
196
+ --infer-steps 8 \
197
+ --use-fp8 \
198
+ --flow-shift-eval-video 5.0 \
199
+ --save-path './results_distill/'
200
+ ```
201
+
202
+
203
+ ## 🔑 Single-gpu with Low-VRAM Inference
204
+
205
+ For example, to generate a video with 1 GPU with Low-VRAM (minimum GPU memory required is 24GB for 704px1216p but very slow), you can use the following command:
206
+
207
+ ```bash
208
+ #!/bin/bash
209
+ JOBS_DIR=$(dirname $(dirname "$0"))
210
+ export PYTHONPATH=${JOBS_DIR}:$PYTHONPATH
211
+ export MODEL_BASE="weights/stdmodels"
212
+ checkpoint_path="weights/gamecraft_models/mp_rank_00_model_states.pt"
213
+
214
+ current_time=$(date "+%Y.%m.%d-%H.%M.%S")
215
+ modelname='Tencent_hunyuanGameCraft_720P'
216
+
217
+ # disable sp and cpu offload
218
+ export DISABLE_SP=1
219
+ export CPU_OFFLOAD=1
220
+
221
+ torchrun --nnodes=1 --nproc_per_node=1 --master_port 29605 hymm_sp/sample_batch.py \
222
+ --image-path "asset/village.png" \
223
+ --prompt "A charming medieval village with cobblestone streets, thatched-roof houses, and vibrant flower gardens under a bright blue sky." \
224
+ --add-neg-prompt "overexposed, low quality, deformation, a poor composition, bad hands, bad teeth, bad eyes, bad limbs, distortion, blurring, text, subtitles, static, picture, black border." \
225
+ --ckpt ${checkpoint_path} \
226
+ --video-size 704 1216 \
227
+ --cfg-scale 2.0 \
228
+ --image-start \
229
+ --action-list w a d s \
230
+ --action-speed-list 0.2 0.2 0.2 0.2 \
231
+ --seed 250160 \
232
+ --sample-n-frames 33 \
233
+ --infer-steps 50 \
234
+ --flow-shift-eval-video 5.0 \
235
+ --cpu-offload \
236
+ --use-fp8 \
237
+ --save-path './results_poor/'
238
+
239
+ ```
240
+
241
+ As for using the accelerated model, you can use the following command:
242
+
243
+ ```bash
244
+ #!/bin/bash
245
+ JOBS_DIR=$(dirname $(dirname "$0"))
246
+ export PYTHONPATH=${JOBS_DIR}:$PYTHONPATH
247
+ export MODEL_BASE="weights/stdmodels"
248
+ checkpoint_path="weights/gamecraft_models/mp_rank_00_model_states_distill.pt"
249
+
250
+ current_time=$(date "+%Y.%m.%d-%H.%M.%S")
251
+ modelname='Tencent_hunyuanGameCraft_720P'
252
+
253
+ # disable sp and cpu offload
254
+ export DISABLE_SP=1
255
+ export CPU_OFFLOAD=1
256
+
257
+ torchrun --nnodes=1 --nproc_per_node=1 --master_port 29605 hymm_sp/sample_batch.py \
258
+ --image-path "asset/village.png" \
259
+ --prompt "A charming medieval village with cobblestone streets, thatched-roof houses, and vibrant flower gardens under a bright blue sky." \
260
+ --add-neg-prompt "overexposed, low quality, deformation, a poor composition, bad hands, bad teeth, bad eyes, bad limbs, distortion, blurring, text, subtitles, static, picture, black border." \
261
+ --ckpt ${checkpoint_path} \
262
+ --video-size 704 1216 \
263
+ --cfg-scale 1.0 \
264
+ --image-start \
265
+ --action-list w a d s \
266
+ --action-speed-list 0.2 0.2 0.2 0.2 \
267
+ --seed 250160 \
268
+ --sample-n-frames 33 \
269
+ --infer-steps 8 \
270
+ --flow-shift-eval-video 5.0 \
271
+ --cpu-offload \
272
+ --use-fp8 \
273
+ --save-path './results_distill_poor/'
274
+ ```
275
+
276
+
277
+
278
+ ## 🔗 BibTeX
279
+
280
+ If you find [Hunyuan-GameCraft](https://arxiv.org/abs/2506.17201) useful for your research and applications, please cite using this BibTeX:
281
+
282
+ ```BibTeX
283
+ @misc{li2025hunyuangamecrafthighdynamicinteractivegame,
284
+ title={Hunyuan-GameCraft: High-dynamic Interactive Game Video Generation with Hybrid History Condition},
285
+ author={Jiaqi Li and Junshu Tang and Zhiyong Xu and Longhuang Wu and Yuan Zhou and Shuai Shao and Tianbao Yu and Zhiguo Cao and Qinglin Lu},
286
+ year={2025},
287
+ eprint={2506.17201},
288
+ archivePrefix={arXiv},
289
+ primaryClass={cs.CV},
290
+ url={https://arxiv.org/abs/2506.17201},
291
+ }
292
+ ```
293
+
294
+ ## Acknowledgements
295
+
296
+ We would like to thank the contributors to the [HunyuanVideo](https://github.com/Tencent/HunyuanVideo), [HunyuanVideo-Avatar](https://github.com/Tencent-Hunyuan/HunyuanVideo-Avatar), [SD3](https://huggingface.co/stabilityai/stable-diffusion-3-medium), [FLUX](https://github.com/black-forest-labs/flux), [Llama](https://github.com/meta-llama/llama), [LLaVA](https://github.com/haotian-liu/LLaVA), [Xtuner](https://github.com/InternLM/xtuner), [diffusers](https://github.com/huggingface/diffusers) and [HuggingFace](https://huggingface.co) repositories for their open research and exploration.
app.py ADDED
@@ -0,0 +1,360 @@
1
+ import os
2
+ import torch
3
+ import gradio as gr
4
+ import numpy as np
5
+ import random
6
+ from pathlib import Path
7
+ from PIL import Image
8
+ import torchvision.transforms as transforms
9
+ from loguru import logger
10
+ from huggingface_hub import hf_hub_download
11
+ import tempfile
12
+
13
+ from hymm_sp.sample_inference import HunyuanVideoSampler
14
+ from hymm_sp.data_kits.data_tools import save_videos_grid
15
+ from hymm_sp.config import parse_args
16
+ import argparse
17
+
18
+ os.environ["MODEL_BASE"] = "weights/stdmodels"
19
+ os.environ["DISABLE_SP"] = "1"
20
+ os.environ["CPU_OFFLOAD"] = "1"
21
+
22
+ device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
23
+
24
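+ # Transform that scales an image up until it fully covers the 704x1216 target, then center-crops to that size.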
+ class CropResize:
25
+ def __init__(self, size=(704, 1216)):
26
+ self.target_h, self.target_w = size
27
+
28
+ def __call__(self, img):
29
+ w, h = img.size
30
+ scale = max(
31
+ self.target_w / w,
32
+ self.target_h / h
33
+ )
34
+ new_size = (int(h * scale), int(w * scale))
35
+ resize_transform = transforms.Resize(
36
+ new_size,
37
+ interpolation=transforms.InterpolationMode.BILINEAR
38
+ )
39
+ resized_img = resize_transform(img)
40
+ crop_transform = transforms.CenterCrop((self.target_h, self.target_w))
41
+ return crop_transform(resized_img)
42
+
43
+ def create_args():
44
+ args = argparse.Namespace()
45
+ args.ckpt = "weights/gamecraft_models/mp_rank_00_model_states_distill.pt"
46
+ args.video_size = [704, 1216]
47
+ args.cfg_scale = 1.0
48
+ args.image_start = True
49
+ args.seed = None
50
+ args.infer_steps = 8
51
+ args.use_fp8 = True
52
+ args.flow_shift_eval_video = 5.0
53
+ args.sample_n_frames = 33
54
+ args.num_images = 1
55
+ args.use_linear_quadratic_schedule = False
56
+ args.linear_schedule_end = 0.25
57
+ args.use_deepcache = False
58
+ args.cpu_offload = True
59
+ args.use_sage = True
60
+ args.save_path = './results/'
61
+ args.save_path_suffix = ''
62
+ args.add_pos_prompt = "Realistic, High-quality."
63
+ args.add_neg_prompt = "overexposed, low quality, deformation, a poor composition, bad hands, bad teeth, bad eyes, bad limbs, distortion, blurring, text, subtitles, static, picture, black border."
64
+ return args
65
+
66
+ logger.info("Initializing Hunyuan-GameCraft model...")
67
+
68
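+ # Download the distilled GameCraft checkpoint from the Hub on first launch (cached under weights/).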
+ if not os.path.exists("weights/gamecraft_models/mp_rank_00_model_states_distill.pt"):
69
+ logger.info("Downloading model weights from Hugging Face...")
70
+ os.makedirs("weights/gamecraft_models", exist_ok=True)
71
+ hf_hub_download(
72
+ repo_id="tencent/Hunyuan-GameCraft-1.0",
73
+ filename="gamecraft_models/mp_rank_00_model_states_distill.pt",
74
+ local_dir="weights/",
75
+ local_dir_use_symlinks=False
76
+ )
77
+
78
+ args = create_args()
79
+ hunyuan_video_sampler = HunyuanVideoSampler.from_pretrained(
80
+ args.ckpt,
81
+ args=args,
82
+ device=torch.device("cpu")
83
+ )
84
+ args = hunyuan_video_sampler.args
85
+
86
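+ # With CPU offload enabled, stream transformer blocks to the GPU one group at a time to reduce peak VRAM.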
+ if args.cpu_offload:
87
+ from diffusers.hooks import apply_group_offloading
88
+ onload_device = torch.device("cuda")
89
+ apply_group_offloading(
90
+ hunyuan_video_sampler.pipeline.transformer,
91
+ onload_device=onload_device,
92
+ offload_type="block_level",
93
+ num_blocks_per_group=1
94
+ )
95
+ logger.info("Enabled CPU offloading for transformer blocks")
96
+
97
+ logger.info("Model loaded successfully!")
98
+
99
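+ # Generate a video from a single input image: run one denoising pass per (action, speed) pair,
+ # carrying the latents of each segment into the next, then concatenate the segments into one clip.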
+ def generate_video(
100
+ input_image,
101
+ prompt,
102
+ action_sequence,
103
+ action_speeds,
104
+ negative_prompt,
105
+ seed,
106
+ cfg_scale,
107
+ num_inference_steps,
108
+ progress=gr.Progress(track_tqdm=True)
109
+ ):
110
+ try:
111
+ progress(0, desc="Initializing...")
112
+
113
+ if input_image is None:
114
+ return None, "Please upload an image first!"
115
+
116
+ action_list = action_sequence.lower().replace(" ", "").split(",") if action_sequence else ["w"]
117
+ speed_list = [float(s.strip()) for s in action_speeds.split(",")] if action_speeds else [0.2]
118
+
119
+ if len(speed_list) != len(action_list):
120
+ if len(speed_list) == 1:
121
+ speed_list = speed_list * len(action_list)
122
+ else:
123
+ return None, f"Error: Number of speeds ({len(speed_list)}) must match number of actions ({len(action_list)})"
124
+
125
+ for action in action_list:
126
+ if action not in ['w', 'a', 's', 'd']:
127
+ return None, f"Error: Invalid action '{action}'. Use only w, a, s, d"
128
+
129
+ for speed in speed_list:
130
+ if not 0.0 <= speed <= 3.0:
131
+ return None, f"Error: Speed {speed} out of range. Use values between 0.0 and 3.0"
132
+
133
+ progress(0.1, desc="Processing image...")
134
+
135
+ with tempfile.NamedTemporaryFile(suffix='.png', delete=False) as tmp_file:
136
+ input_image.save(tmp_file.name)
137
+ image_path = tmp_file.name
138
+
139
+ closest_size = (704, 1216)
140
+ ref_image_transform = transforms.Compose([
141
+ CropResize(closest_size),
142
+ transforms.CenterCrop(closest_size),
143
+ transforms.ToTensor(),
144
+ transforms.Normalize([0.5], [0.5])
145
+ ])
146
+
147
+ raw_ref_image = Image.open(image_path).convert('RGB')
148
+ ref_image_pixel_values = ref_image_transform(raw_ref_image)
149
+ ref_image_pixel_values = ref_image_pixel_values.unsqueeze(0).unsqueeze(2).to(device)
150
+
151
+ progress(0.2, desc="Encoding image...")
152
+
153
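+ # Encode the reference image into VAE latents; when offloading, the VAE encoder is moved to the GPU only for this step.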
+ with torch.autocast(device_type="cuda", dtype=torch.float16, enabled=True):
154
+ if args.cpu_offload:
155
+ hunyuan_video_sampler.vae.quant_conv.to('cuda')
156
+ hunyuan_video_sampler.vae.encoder.to('cuda')
157
+
158
+ hunyuan_video_sampler.pipeline.vae.enable_tiling()
159
+
160
+ raw_last_latents = hunyuan_video_sampler.vae.encode(
161
+ ref_image_pixel_values
162
+ ).latent_dist.sample().to(dtype=torch.float16)
163
+ raw_last_latents.mul_(hunyuan_video_sampler.vae.config.scaling_factor)
164
+ raw_ref_latents = raw_last_latents.clone()
165
+
166
+ hunyuan_video_sampler.pipeline.vae.disable_tiling()
167
+ if args.cpu_offload:
168
+ hunyuan_video_sampler.vae.quant_conv.to('cpu')
169
+ hunyuan_video_sampler.vae.encoder.to('cpu')
170
+
171
+ ref_images = [raw_ref_image]
172
+ last_latents = raw_last_latents
173
+ ref_latents = raw_ref_latents
174
+
175
+ progress(0.3, desc="Starting video generation...")
176
+
177
+ if seed is None or seed == -1:
178
+ seed = random.randint(0, 1_000_000)
179
+
180
+ all_samples = []
181
+
182
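+ # One generation pass per action: the first pass is conditioned on the input image, later passes on the previous latents.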
+ for idx, (action_id, action_speed) in enumerate(zip(action_list, speed_list)):
183
+ is_image = (idx == 0)
184
+
185
+ progress(0.3 + (0.6 * idx / len(action_list)),
186
+ desc=f"Generating segment {idx+1}/{len(action_list)} (action: {action_id})")
187
+
188
+ outputs = hunyuan_video_sampler.predict(
189
+ prompt=prompt,
190
+ action_id=action_id,
191
+ action_speed=action_speed,
192
+ is_image=is_image,
193
+ size=(704, 1216),
194
+ seed=seed,
195
+ last_latents=last_latents,
196
+ ref_latents=ref_latents,
197
+ video_length=args.sample_n_frames,
198
+ guidance_scale=cfg_scale,
199
+ num_images_per_prompt=1,
200
+ negative_prompt=negative_prompt,
201
+ infer_steps=num_inference_steps,
202
+ flow_shift=args.flow_shift_eval_video,
203
+ use_linear_quadratic_schedule=args.use_linear_quadratic_schedule,
204
+ linear_schedule_end=args.linear_schedule_end,
205
+ use_deepcache=args.use_deepcache,
206
+ cpu_offload=args.cpu_offload,
207
+ ref_images=ref_images,
208
+ output_dir=None,
209
+ return_latents=True,
210
+ use_sage=args.use_sage,
211
+ )
212
+
213
+ ref_latents = outputs["ref_latents"]
214
+ last_latents = outputs["last_latents"]
215
+
216
+ sub_samples = outputs['samples'][0]
217
+ all_samples.append(sub_samples)
218
+
219
+ progress(0.9, desc="Finalizing video...")
220
+
221
+ # all_samples always contains at least one segment (action_list is never empty),
+ # so simply concatenate the generated segments along the temporal dimension.
+ out_cat = torch.cat(all_samples, dim=2)
225
+
226
+ with tempfile.NamedTemporaryFile(suffix='.mp4', delete=False) as tmp_video:
227
+ output_path = tmp_video.name
228
+
229
+ save_videos_grid(out_cat, output_path, n_rows=1, fps=25)
230
+
231
+ if os.path.exists(image_path):
232
+ os.remove(image_path)
233
+
234
+ progress(1.0, desc="Complete!")
235
+ return output_path, "Video generated successfully!"
236
+
237
+ except Exception as e:
238
+ logger.error(f"Error generating video: {e}")
239
+ return None, f"Error: {str(e)}"
240
+
241
+ with gr.Blocks(title="Hunyuan-GameCraft") as demo:
242
+ gr.Markdown("""
243
+ # 🎮 Hunyuan-GameCraft Video Generation
244
+
245
+ Generate interactive game-style videos from a single image using keyboard actions (W/A/S/D).
246
+ Using the **distilled model** for faster generation (8 inference steps).
247
+ """)
248
+
249
+ with gr.Row():
250
+ with gr.Column(scale=1):
251
+ input_image = gr.Image(
252
+ label="Input Image",
253
+ type="pil",
254
+ height=400
255
+ )
256
+
257
+ prompt = gr.Textbox(
258
+ label="Prompt",
259
+ placeholder="Describe the scene...",
260
+ value="A charming medieval village with cobblestone streets, thatched-roof houses, and vibrant flower gardens under a bright blue sky.",
261
+ lines=3
262
+ )
263
+
264
+ with gr.Accordion("Action Controls", open=True):
265
+ action_sequence = gr.Textbox(
266
+ label="Action Sequence (comma-separated)",
267
+ placeholder="w, a, s, d",
268
+ value="w, s, d, a",
269
+ info="Use w (forward), a (left), s (backward), d (right)"
270
+ )
271
+
272
+ action_speeds = gr.Textbox(
273
+ label="Action Speeds (comma-separated)",
274
+ placeholder="0.2, 0.2, 0.2, 0.2",
275
+ value="0.2, 0.2, 0.2, 0.2",
276
+ info="Speed for each action (0.0 to 3.0). Single value applies to all."
277
+ )
278
+
279
+ with gr.Accordion("Advanced Settings", open=False):
280
+ negative_prompt = gr.Textbox(
281
+ label="Negative Prompt",
282
+ value="overexposed, low quality, deformation, a poor composition, bad hands, bad teeth, bad eyes, bad limbs, distortion, blurring, text, subtitles, static, picture, black border.",
283
+ lines=2
284
+ )
285
+
286
+ seed = gr.Number(
287
+ label="Seed",
288
+ value=-1,
289
+ precision=0,
290
+ info="Set to -1 for random seed"
291
+ )
292
+
293
+ cfg_scale = gr.Slider(
294
+ label="CFG Scale",
295
+ minimum=0.5,
296
+ maximum=3.0,
297
+ value=1.0,
298
+ step=0.1,
299
+ info="Classifier-free guidance scale (1.0 for distilled model)"
300
+ )
301
+
302
+ num_inference_steps = gr.Slider(
303
+ label="Inference Steps",
304
+ minimum=4,
305
+ maximum=20,
306
+ value=8,
307
+ step=1,
308
+ info="Number of denoising steps (8 for distilled model)"
309
+ )
310
+
311
+ generate_btn = gr.Button("Generate Video", variant="primary")
312
+
313
+ with gr.Column(scale=1):
314
+ output_video = gr.Video(
315
+ label="Generated Video",
316
+ height=400
317
+ )
318
+ status_text = gr.Textbox(
319
+ label="Status",
320
+ interactive=False
321
+ )
322
+
323
+ gr.Markdown("""
324
+ ### Tips:
325
+ - Each action generates 33 frames (1.3 seconds at 25 FPS)
326
+ - The distilled model is optimized for speed with 8 inference steps
327
+ - Use FP8 optimization for better memory efficiency
328
+ - Minimum GPU memory: 24GB VRAM
329
+ """)
330
+
331
+ generate_btn.click(
332
+ fn=generate_video,
333
+ inputs=[
334
+ input_image,
335
+ prompt,
336
+ action_sequence,
337
+ action_speeds,
338
+ negative_prompt,
339
+ seed,
340
+ cfg_scale,
341
+ num_inference_steps
342
+ ],
343
+ outputs=[output_video, status_text]
344
+ )
345
+
346
+ gr.Examples(
347
+ examples=[
348
+ [
349
+ "asset/village.png",
350
+ "A charming medieval village with cobblestone streets, thatched-roof houses, and vibrant flower gardens under a bright blue sky.",
351
+ "w, a, d, s",
352
+ "0.2, 0.2, 0.2, 0.2"
353
+ ]
354
+ ],
355
+ inputs=[input_image, prompt, action_sequence, action_speeds],
356
+ label="Example"
357
+ )
358
+
359
+ if __name__ == "__main__":
360
+ demo.launch(share=True)
asset/method.png ADDED

Git LFS Details

  • SHA256: e9d0546830d54f90e96392614405472ab06eb700ad184e4a6689bd84ff436890
  • Pointer size: 132 Bytes
  • Size of remote file: 2.86 MB
asset/teaser.png ADDED

Git LFS Details

  • SHA256: 5272120a5f85af9ee44c5f9714d6d0d99ba186ef2a66181b4bbd1b718c399555
  • Pointer size: 133 Bytes
  • Size of remote file: 20.9 MB
asset/village.png ADDED

Git LFS Details

  • SHA256: 5a5e986bd3100537653cd82c280ad1b3a1f2b0edec7abfd6926fae440bb92cd1
  • Pointer size: 132 Bytes
  • Size of remote file: 2.61 MB
docs_for_ai_coding_bots/.DS_Store ADDED
Binary file (6.15 kB). View file
 
docs_for_ai_coding_bots/huggingface_hub/Downloading-model-from-hub.md ADDED
@@ -0,0 +1,174 @@
1
+ [](#download-files-from-the-hub)Download files from the Hub
2
+ ===========================================================
3
+
4
+ The `huggingface_hub` library provides functions to download files from the repositories stored on the Hub. You can use these functions independently or integrate them into your own library, making it more convenient for your users to interact with the Hub. This guide will show you how to:
5
+
6
+ * Download and cache a single file.
7
+ * Download and cache an entire repository.
8
+ * Download files to a local folder.
9
+
10
+ [](#download-a-single-file)Download a single file
11
+ -------------------------------------------------
12
+
13
+ The [hf\_hub\_download()](/docs/huggingface_hub/v0.32.2/en/package_reference/file_download#huggingface_hub.hf_hub_download) function is the main function for downloading files from the Hub. It downloads the remote file, caches it on disk (in a version-aware way), and returns its local file path.
14
+
15
+ The returned filepath is a pointer to the HF local cache. Therefore, it is important to not modify the file to avoid having a corrupted cache. If you are interested in getting to know more about how files are cached, please refer to our [caching guide](./manage-cache).
16
+
17
+ ### [](#from-latest-version)From latest version
18
+
19
+ Select the file to download using the `repo_id`, `repo_type` and `filename` parameters. By default, the file will be considered as being part of a `model` repo.
20
+
21
+ Copied
22
+
23
+ \>>> from huggingface\_hub import hf\_hub\_download
24
+ \>>> hf\_hub\_download(repo\_id="lysandre/arxiv-nlp", filename="config.json")
25
+ '/root/.cache/huggingface/hub/models--lysandre--arxiv-nlp/snapshots/894a9adde21d9a3e3843e6d5aeaaf01875c7fade/config.json'
26
+
27
+ \# Download from a dataset
28
+ \>>> hf\_hub\_download(repo\_id="google/fleurs", filename="fleurs.py", repo\_type="dataset")
29
+ '/root/.cache/huggingface/hub/datasets--google--fleurs/snapshots/199e4ae37915137c555b1765c01477c216287d34/fleurs.py'
30
+
31
+ ### [](#from-specific-version)From specific version
32
+
33
+ By default, the latest version from the `main` branch is downloaded. However, in some cases you want to download a file at a particular version (e.g. from a specific branch, a PR, a tag or a commit hash). To do so, use the `revision` parameter:
34
+
35
+ Copied
36
+
37
+ \# Download from the \`v1.0\` tag
38
+ \>>> hf\_hub\_download(repo\_id="lysandre/arxiv-nlp", filename="config.json", revision="v1.0")
39
+
40
+ \# Download from the \`test-branch\` branch
41
+ \>>> hf\_hub\_download(repo\_id="lysandre/arxiv-nlp", filename="config.json", revision="test-branch")
42
+
43
+ \# Download from Pull Request #3
44
+ \>>> hf\_hub\_download(repo\_id="lysandre/arxiv-nlp", filename="config.json", revision="refs/pr/3")
45
+
46
+ \# Download from a specific commit hash
47
+ \>>> hf\_hub\_download(repo\_id="lysandre/arxiv-nlp", filename="config.json", revision="877b84a8f93f2d619faa2a6e514a32beef88ab0a")
48
+
49
+ **Note:** When using the commit hash, it must be the full-length hash instead of a 7-character commit hash.
50
+
51
+ ### [](#construct-a-download-url)Construct a download URL
52
+
53
+ In case you want to construct the URL used to download a file from a repo, you can use [hf\_hub\_url()](/docs/huggingface_hub/v0.32.2/en/package_reference/file_download#huggingface_hub.hf_hub_url) which returns a URL. Note that it is used internally by [hf\_hub\_download()](/docs/huggingface_hub/v0.32.2/en/package_reference/file_download#huggingface_hub.hf_hub_download).
54
+
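+ For reference, here is a minimal sketch of how `hf_hub_url()` can be used (the `revision` value is illustrative):
+
+ ```python
+ from huggingface_hub import hf_hub_url
+
+ # Build the direct download URL for a file without downloading it
+ url = hf_hub_url(repo_id="lysandre/arxiv-nlp", filename="config.json", revision="main")
+ print(url)  # e.g. https://huggingface.co/lysandre/arxiv-nlp/resolve/main/config.json
+ ```
+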
55
+ [](#download-an-entire-repository)Download an entire repository
56
+ ---------------------------------------------------------------
57
+
58
+ [snapshot\_download()](/docs/huggingface_hub/v0.32.2/en/package_reference/file_download#huggingface_hub.snapshot_download) downloads an entire repository at a given revision. It internally uses [hf\_hub\_download()](/docs/huggingface_hub/v0.32.2/en/package_reference/file_download#huggingface_hub.hf_hub_download), which means all downloaded files are also cached on your local disk. Downloads are made concurrently to speed up the process.
59
+
60
+ To download a whole repository, just pass the `repo_id` and `repo_type`:
61
+
62
+ Copied
63
+
64
+ \>>> from huggingface\_hub import snapshot\_download
65
+ \>>> snapshot\_download(repo\_id="lysandre/arxiv-nlp")
66
+ '/home/lysandre/.cache/huggingface/hub/models--lysandre--arxiv-nlp/snapshots/894a9adde21d9a3e3843e6d5aeaaf01875c7fade'
67
+
68
+ \# Or from a dataset
69
+ \>>> snapshot\_download(repo\_id="google/fleurs", repo\_type="dataset")
70
+ '/home/lysandre/.cache/huggingface/hub/datasets--google--fleurs/snapshots/199e4ae37915137c555b1765c01477c216287d34'
71
+
72
+ [snapshot\_download()](/docs/huggingface_hub/v0.32.2/en/package_reference/file_download#huggingface_hub.snapshot_download) downloads the latest revision by default. If you want a specific repository revision, use the `revision` parameter:
73
+
74
+ Copied
75
+
76
+ \>>> from huggingface\_hub import snapshot\_download
77
+ \>>> snapshot\_download(repo\_id="lysandre/arxiv-nlp", revision="refs/pr/1")
78
+
79
+ ### [](#filter-files-to-download)Filter files to download
80
+
81
+ [snapshot\_download()](/docs/huggingface_hub/v0.32.2/en/package_reference/file_download#huggingface_hub.snapshot_download) provides an easy way to download a repository. However, you don’t always want to download the entire content of a repository. For example, you might want to prevent downloading all `.bin` files if you know you’ll only use the `.safetensors` weights. You can do that using `allow_patterns` and `ignore_patterns` parameters.
82
+
83
+ These parameters accept either a single pattern or a list of patterns. Patterns are Standard Wildcards (globbing patterns) as documented [here](https://tldp.org/LDP/GNU-Linux-Tools-Summary/html/x11655.htm). The pattern matching is based on [`fnmatch`](https://docs.python.org/3/library/fnmatch.html).
84
+
85
+ For example, you can use `allow_patterns` to only download JSON configuration files:
86
+
87
+ Copied
88
+
89
+ \>>> from huggingface\_hub import snapshot\_download
90
+ \>>> snapshot\_download(repo\_id="lysandre/arxiv-nlp", allow\_patterns="\*.json")
91
+
92
+ On the other hand, `ignore_patterns` can exclude certain files from being downloaded. The following example ignores the `.msgpack` and `.h5` file extensions:
93
+
94
+ Copied
95
+
96
+ \>>> from huggingface\_hub import snapshot\_download
97
+ \>>> snapshot\_download(repo\_id="lysandre/arxiv-nlp", ignore\_patterns=\["\*.msgpack", "\*.h5"\])
98
+
99
+ Finally, you can combine both to precisely filter your download. Here is an example to download all json and markdown files except `vocab.json`.
100
+
101
+ Copied
102
+
103
+ \>>> from huggingface\_hub import snapshot\_download
104
+ \>>> snapshot\_download(repo\_id="gpt2", allow\_patterns=\["\*.md", "\*.json"\], ignore\_patterns="vocab.json")
105
+
106
+ [](#download-files-to-a-local-folder)Download file(s) to a local folder
107
+ -----------------------------------------------------------------------
108
+
109
+ By default, we recommend using the [cache system](./manage-cache) to download files from the Hub. You can specify a custom cache location using the `cache_dir` parameter in [hf\_hub\_download()](/docs/huggingface_hub/v0.32.2/en/package_reference/file_download#huggingface_hub.hf_hub_download) and [snapshot\_download()](/docs/huggingface_hub/v0.32.2/en/package_reference/file_download#huggingface_hub.snapshot_download), or by setting the [`HF_HOME`](../package_reference/environment_variables#hf_home) environment variable.
110
+
111
+ However, if you need to download files to a specific folder, you can pass a `local_dir` parameter to the download function. This is useful to get a workflow closer to what the `git` command offers. The downloaded files will maintain their original file structure within the specified folder. For example, if `filename="data/train.csv"` and `local_dir="path/to/folder"`, the resulting filepath will be `"path/to/folder/data/train.csv"`.
112
+
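+ As a minimal sketch (the repo id is the same example used above; the target folder is a placeholder), this looks like:
+
+ ```python
+ from huggingface_hub import hf_hub_download, snapshot_download
+
+ # Download a single file into ./my-model/config.json
+ hf_hub_download(repo_id="lysandre/arxiv-nlp", filename="config.json", local_dir="my-model")
+
+ # Or mirror a whole repository into ./my-model
+ snapshot_download(repo_id="lysandre/arxiv-nlp", local_dir="my-model")
+ ```
+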
113
+ A `.cache/huggingface/` folder is created at the root of your local directory containing metadata about the downloaded files. This prevents re-downloading files if they’re already up-to-date. If the metadata has changed, then the new file version is downloaded. This makes the `local_dir` optimized for pulling only the latest changes.
114
+
115
+ After completing the download, you can safely remove the `.cache/huggingface/` folder if you no longer need it. However, be aware that re-running your script without this folder may result in longer recovery times, as metadata will be lost. Rest assured that your local data will remain intact and unaffected.
116
+
117
+ Don’t worry about the `.cache/huggingface/` folder when committing changes to the Hub! This folder is automatically ignored by both `git` and [upload\_folder()](/docs/huggingface_hub/v0.32.2/en/package_reference/hf_api#huggingface_hub.HfApi.upload_folder).
118
+
119
+ [](#download-from-the-cli)Download from the CLI
120
+ -----------------------------------------------
121
+
122
+ You can use the `huggingface-cli download` command from the terminal to directly download files from the Hub. Internally, it uses the same [hf\_hub\_download()](/docs/huggingface_hub/v0.32.2/en/package_reference/file_download#huggingface_hub.hf_hub_download) and [snapshot\_download()](/docs/huggingface_hub/v0.32.2/en/package_reference/file_download#huggingface_hub.snapshot_download) helpers described above and prints the returned path to the terminal.
123
+
124
+ Copied
125
+
126
+ \>>> huggingface-cli download gpt2 config.json
127
+ /home/wauplin/.cache/huggingface/hub/models--gpt2/snapshots/11c5a3d5811f50298f278a704980280950aedb10/config.json
128
+
129
+ You can download multiple files at once; this displays a progress bar and returns the snapshot path in which the files are located:
130
+
131
+ Copied
132
+
133
+ \>>> huggingface-cli download gpt2 config.json model.safetensors
134
+ Fetching 2 files: 100%|████████████████████████████████████████████| 2/2 \[00:00<00:00, 23831.27it/s\]
135
+ /home/wauplin/.cache/huggingface/hub/models--gpt2/snapshots/11c5a3d5811f50298f278a704980280950aedb10
136
+
137
+ For more details about the CLI download command, please refer to the [CLI guide](./cli#huggingface-cli-download).
138
+
139
+ [](#faster-downloads)Faster downloads
140
+ -------------------------------------
141
+
142
+ There are two options to speed up downloads. Both involve installing a Python package written in Rust.
143
+
144
+ * `hf_xet` is newer and uses the Xet storage backend for upload/download. It is available in production, but is in the process of being rolled out to all users, so join the [waitlist](https://huggingface.co/join/xet) to get onboarded soon!
145
+ * `hf_transfer` is a power-tool to download and upload to our LFS storage backend (note: this is less future-proof than Xet). It is thoroughly tested and has been in production for a long time, but it has some limitations.
146
+
147
+ ### [](#hfxet)hf\_xet
148
+
149
+ Take advantage of faster downloads through `hf_xet`, the Python binding to the [`xet-core`](https://github.com/huggingface/xet-core) library that enables chunk-based deduplication for faster downloads and uploads. `hf_xet` integrates seamlessly with `huggingface_hub`, but uses the Rust `xet-core` library and Xet storage instead of LFS.
150
+
151
+ `hf_xet` uses the Xet storage system, which breaks files down into immutable chunks, storing collections of these chunks (called blocks or xorbs) remotely and retrieving them to reassemble the file when requested. When downloading, after confirming the user is authorized to access the files, `hf_xet` will query the Xet content-addressable service (CAS) with the LFS SHA256 hash for this file to receive the reconstruction metadata (ranges within xorbs) to assemble these files, along with presigned URLs to download the xorbs directly. Then `hf_xet` will efficiently download the xorb ranges necessary and will write out the files on disk. `hf_xet` uses a local disk cache to only download chunks once, learn more in the [Chunk-based caching(Xet)](./manage-cache#chunk-based-caching-xet) section.
152
+
153
+ To enable it, specify the `hf_xet` package when installing `huggingface_hub`:
154
+
155
+ Copied
156
+
157
+ pip install -U "huggingface\_hub\[hf\_xet\]"
158
+
159
+ Note: `hf_xet` will only be utilized when the files being downloaded are being stored with Xet Storage.
160
+
161
+ All other `huggingface_hub` APIs will continue to work without any modification. To learn more about the benefits of Xet storage and `hf_xet`, refer to this [section](https://huggingface.co/docs/hub/storage-backends).
162
+
163
+ ### [](#hftransfer)hf\_transfer
164
+
165
+ If you are running on a machine with high bandwidth, you can increase your download speed with [`hf_transfer`](https://github.com/huggingface/hf_transfer), a Rust-based library developed to speed up file transfers with the Hub. To enable it:
166
+
167
+ 1. Specify the `hf_transfer` extra when installing `huggingface_hub` (e.g. `pip install huggingface_hub[hf_transfer]`).
168
+ 2. Set `HF_HUB_ENABLE_HF_TRANSFER=1` as an environment variable.
169
+
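+ As a minimal sketch of these two steps from Python (the repo id is only an example), assuming the `hf_transfer` extra is installed:
+
+ ```python
+ import os
+
+ # Must be set before huggingface_hub is imported
+ os.environ["HF_HUB_ENABLE_HF_TRANSFER"] = "1"
+
+ from huggingface_hub import hf_hub_download
+
+ hf_hub_download(repo_id="gpt2", filename="model.safetensors")
+ ```
+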
170
+ `hf_transfer` is a power user tool! It is tested and production-ready, but it lacks user-friendly features like advanced error handling or proxies. For more details, please take a look at this [section](https://huggingface.co/docs/huggingface_hub/hf_transfer).
171
+
172
+ [< \> Update on GitHub](https://github.com/huggingface/huggingface_hub/blob/main/docs/source/en/guides/download.md)
173
+
174
+ Command Line Interface (CLI)
docs_for_ai_coding_bots/huggingface_hub/Using-the-cache-in-hf-hub-library.md ADDED
@@ -0,0 +1,531 @@
1
+ [](#understand-caching)Understand caching
2
+ =========================================
3
+
4
+ `huggingface_hub` uses the local disk as two caches, which avoid re-downloading items. The first is a file-based cache, which caches individual files downloaded from the Hub and ensures that the same file is not downloaded again when a repo gets updated. The second is a chunk cache, where each chunk represents a byte range from a file and ensures that chunks shared across files are only downloaded once.
5
+
6
+ [](#file-based-caching)File-based caching
7
+ -----------------------------------------
8
+
9
+ The Hugging Face Hub cache-system is designed to be the central cache shared across libraries that depend on the Hub. It has been updated in v0.8.0 to prevent re-downloading the same files between revisions.
10
+
11
+ The caching system is designed as follows:
12
+
13
+ Copied
14
+
15
+ <CACHE\_DIR\>
16
+ ├─ <MODELS\>
17
+ ├─ <DATASETS\>
18
+ ├─ <SPACES\>
19
+
20
+ The default `<CACHE_DIR>` is `~/.cache/huggingface/hub`. However, it is customizable with the `cache_dir` argument on all methods, or by specifying either `HF_HOME` or `HF_HUB_CACHE` environment variable.
21
+
22
+ Models, datasets and spaces share a common root. Each of these repositories contains the repository type, the namespace (organization or username) if it exists and the repository name:
23
+
24
+ Copied
25
+
26
+ <CACHE\_DIR\>
27
+ ├─ models\--julien\-c\--EsperBERTo\-small
28
+ ├─ models\--lysandrejik\--arxiv\-nlp
29
+ ├─ models\--bert\-base\-cased
30
+ ├─ datasets\--glue
31
+ ├─ datasets\--huggingface\--DataMeasurementsFiles
32
+ ├─ spaces\--dalle\-mini\--dalle\-mini
33
+
34
+ It is within these folders that all files will now be downloaded from the Hub. Caching ensures that a file isn’t downloaded twice if it already exists and wasn’t updated; but if it was updated, and you’re asking for the latest file, then it will download the latest file (while keeping the previous file intact in case you need it again).
35
+
36
+ In order to achieve this, all folders contain the same skeleton:
37
+
38
+ Copied
39
+
40
+ <CACHE\_DIR>
41
+ ├─ datasets\--glue
42
+ │ ├─ refs
43
+ │ ├─ blobs
44
+ │ ├─ snapshots
45
+ ...
46
+
47
+ Each folder is designed to contain the following:
48
+
49
+ ### [](#refs)Refs
50
+
51
+ The `refs` folder contains files which indicate the latest revision of the given reference. For example, if we have previously fetched a file from the `main` branch of a repository, the `refs` folder will contain a file named `main`, which will itself contain the commit identifier of the current head.
52
+
53
+ If the latest commit of `main` has `aaaaaa` as identifier, then it will contain `aaaaaa`.
54
+
55
+ If that same branch gets updated with a new commit, that has `bbbbbb` as an identifier, then re-downloading a file from that reference will update the `refs/main` file to contain `bbbbbb`.
56
+
57
+ ### [](#blobs)Blobs
58
+
59
+ The `blobs` folder contains the actual files that we have downloaded. The name of each file is their hash.
60
+
61
+ ### [](#snapshots)Snapshots
62
+
63
+ The `snapshots` folder contains symlinks to the blobs mentioned above. It is itself made up of several folders: one per known revision!
64
+
65
+ In the explanation above, we had initially fetched a file from the `aaaaaa` revision, before fetching a file from the `bbbbbb` revision. In this situation, we would now have two folders in the `snapshots` folder: `aaaaaa` and `bbbbbb`.
66
+
67
+ In each of these folders, live symlinks that have the names of the files that we have downloaded. For example, if we had downloaded the `README.md` file at revision `aaaaaa`, we would have the following path:
68
+
69
+ Copied
70
+
71
+ <CACHE\_DIR>/<REPO\_NAME>/snapshots/aaaaaa/README.md
72
+
73
+ That `README.md` file is actually a symlink linking to the blob that has the hash of the file.
74
+
75
+ By creating the skeleton this way we open the mechanism to file sharing: if the same file was fetched in revision `bbbbbb`, it would have the same hash and the file would not need to be re-downloaded.
76
+
77
+ ### [](#noexist-advanced).no\_exist (advanced)
78
+
79
+ In addition to the `blobs`, `refs` and `snapshots` folders, you might also find a `.no_exist` folder in your cache. This folder keeps track of files that you’ve tried to download once but don’t exist on the Hub. Its structure is the same as the `snapshots` folder with 1 subfolder per known revision:
80
+
81
+ Copied
82
+
83
+ <CACHE\_DIR>/<REPO\_NAME>/.no\_exist/aaaaaa/config\_that\_does\_not\_exist.json
84
+
85
+ Unlike the `snapshots` folder, files are simple empty files (no symlinks). In this example, the file `"config_that_does_not_exist.json"` does not exist on the Hub for the revision `"aaaaaa"`. As it only stores empty files, this folder is negligible in terms of disk usage.
86
+
87
+ So now you might wonder, why is this information even relevant? In some cases, a framework tries to load optional files for a model. Saving the non-existence of optional files makes it faster to load a model as it saves 1 HTTP call per possible optional file. This is, for example, the case in `transformers`, where each tokenizer can support additional files. The first time you load the tokenizer on your machine, it will cache which optional files exist (and which don't) to make the loading time faster for the next initializations.
88
+
89
+ To test if a file is cached locally (without making any HTTP request), you can use the [try\_to\_load\_from\_cache()](/docs/huggingface_hub/v0.32.2/en/package_reference/cache#huggingface_hub.try_to_load_from_cache) helper. It will either return the filepath (if exists and cached), the object `_CACHED_NO_EXIST` (if non-existence is cached) or `None` (if we don’t know).
90
+
91
+ Copied
92
+
93
+ from huggingface\_hub import try\_to\_load\_from\_cache, \_CACHED\_NO\_EXIST
94
+
95
+ filepath = try\_to\_load\_from\_cache(repo\_id="bert-base-cased", filename="config.json") \# repo\_id and filename are required; the values here are only illustrative
96
+ if isinstance(filepath, str):
97
+ \# file exists and is cached
98
+ ...
99
+ elif filepath is \_CACHED\_NO\_EXIST:
100
+ \# non-existence of file is cached
101
+ ...
102
+ else:
103
+ \# file is not cached
104
+ ...
105
+
106
+ ### [](#in-practice)In practice
107
+
108
+ In practice, your cache should look like the following tree:
109
+
110
+ Copied
111
+
112
+ \[ 96\] .
113
+ └── \[ 160\] models--julien-c--EsperBERTo-small
114
+ ├── \[ 160\] blobs
115
+ │ ├── \[321M\] 403450e234d65943a7dcf7e05a771ce3c92faa84dd07db4ac20f592037a1e4bd
116
+ │ ├── \[ 398\] 7cb18dc9bafbfcf74629a4b760af1b160957a83e
117
+ │ └── \[1.4K\] d7edf6bd2a681fb0175f7735299831ee1b22b812
118
+ ├── \[ 96\] refs
119
+ │ └── \[ 40\] main
120
+ └── \[ 128\] snapshots
121
+ ├── \[ 128\] 2439f60ef33a0d46d85da5001d52aeda5b00ce9f
122
+ │ ├── \[ 52\] README.md -> ../../blobs/d7edf6bd2a681fb0175f7735299831ee1b22b812
123
+ │ └── \[ 76\] pytorch\_model.bin -> ../../blobs/403450e234d65943a7dcf7e05a771ce3c92faa84dd07db4ac20f592037a1e4bd
124
+ └── \[ 128\] bbc77c8132af1cc5cf678da3f1ddf2de43606d48
125
+ ├── \[ 52\] README.md -> ../../blobs/7cb18dc9bafbfcf74629a4b760af1b160957a83e
126
+ └── \[ 76\] pytorch\_model.bin -> ../../blobs/403450e234d65943a7dcf7e05a771ce3c92faa84dd07db4ac20f592037a1e4bd
127
+
128
+ ### [](#limitations)Limitations
129
+
130
+ In order to have an efficient cache-system, `huggingface-hub` uses symlinks. However, symlinks are not supported on all machines. This is a known limitation, especially on Windows. When this is the case, `huggingface_hub` does not use the `blobs/` directory but stores the files directly in the `snapshots/` directory instead. This workaround allows users to download and cache files from the Hub exactly the same way. Tools to inspect and delete the cache (see below) are also supported. However, the cache-system is less efficient as a single file might be downloaded several times if multiple revisions of the same repo are downloaded.
131
+
132
+ If you want to benefit from the symlink-based cache-system on a Windows machine, you either need to [activate Developer Mode](https://docs.microsoft.com/en-us/windows/apps/get-started/enable-your-device-for-development) or to run Python as an administrator.
133
+
134
+ When symlinks are not supported, a warning message is displayed to the user to alert them they are using a degraded version of the cache-system. This warning can be disabled by setting the `HF_HUB_DISABLE_SYMLINKS_WARNING` environment variable to true.
135
+
136
+ [](#chunk-based-caching-xet)Chunk-based caching (Xet)
137
+ -----------------------------------------------------
138
+
139
+ To provide more efficient file transfers, `hf_xet` adds a `xet` directory to the existing `huggingface_hub` cache, creating an additional caching layer to enable chunk-based deduplication. This cache holds chunks, which are immutable byte ranges from files (up to 64KB) that are created using content-defined chunking. For more information on the Xet Storage system, see this [section](https://huggingface.co/docs/hub/storage-backends).
140
+
141
+ The `xet` directory, located at `~/.cache/huggingface/xet` by default, contains two caches, used for uploads and downloads, with the following structure:
142
+
143
+ Copied
144
+
145
+ <CACHE\_DIR>
146
+ ├─ chunk\_cache
147
+ ├─ shard\_cache
148
+
149
+ The `xet` cache, like the rest of `hf_xet` is fully integrated with `huggingface_hub`. If you use the existing APIs for interacting with cached assets, there is no need to update your workflow. The `xet` cache is built as an optimization layer on top of the existing `hf_xet` chunk-based deduplication and `huggingface_hub` cache system.
150
+
151
+ The `chunk_cache` directory contains cached data chunks that are used to speed up downloads, while the `shard_cache` directory contains cached shards that are used on the upload path.
152
+
153
+ ### [](#chunkcache)chunk\_cache
154
+
155
+ This cache is used on the download path. The cache directory structure is based on a base-64 encoded hash from the content-addressed store (CAS) that backs each Xet-enabled repository. A CAS hash serves as the key to lookup the offsets of where the data is stored.
156
+
157
+ At the topmost level, the first two letters of the base 64 encoded CAS hash are used to create a subdirectory in the `chunk_cache` (keys that share these first two letters are grouped here). The inner levels are comprised of subdirectories with the full key as the directory name. At the base are the cache items which are ranges of blocks that contain the cached chunks.
158
+
159
+ Copied
160
+
161
+ <CACHE\_DIR>
162
+ ├─ xet
163
+ │ ├─ chunk\_cache
164
+ │ │ ├─ A1
165
+ │ │ │ ├─ A1GerURLUcISVivdseeoY1PnYifYkOaCCJ7V5Q9fjgxkZWZhdWx0
166
+ │ │ │ │ ├─ AAAAAAEAAAA5DQAAAAAAAIhRLjDI3SS5jYs4ysNKZiJy9XFI8CN7Ww0UyEA9KPD9
167
+ │ │ │ │ ├─ AQAAAAIAAABzngAAAAAAAPNqPjd5Zby5aBvabF7Z1itCx0ryMwoCnuQcDwq79jlB
168
+
169
+ When requesting a file, the first thing `hf_xet` does is communicate with Xet storage’s content addressed store (CAS) for reconstruction information. The reconstruction information contains information about the CAS keys required to download the file in its entirety.
170
+
171
+ Before executing the requests for the CAS keys, the `chunk_cache` is consulted. If a key in the cache matches a CAS key, then there is no reason to issue a request for that content. `hf_xet` uses the chunks stored in the directory instead.
172
+
173
+ As the `chunk_cache` is purely an optimization, not a guarantee, `hf_xet` utilizes a computationally efficient eviction policy. When the `chunk_cache` is full (see `Limits and Limitations` below), `hf_xet` implements a random eviction policy when selecting an eviction candidate. This significantly reduces the overhead of managing a robust caching system (e.g., LRU) while still providing most of the benefits of caching chunks.
174
+
175
+ ### [](#shardcache)shard\_cache
176
+
177
+ This cache is used when uploading content to the Hub. The directory is flat, comprising only of shard files, each using an ID for the shard name.
178
+
179
+ Copied
180
+
181
+ <CACHE\_DIR>
182
+ ├─ xet
183
+ │ ├─ shard\_cache
184
+ │ │ ├─ 1fe4ffd5cf0c3375f1ef9aec5016cf773ccc5ca294293d3f92d92771dacfc15d.mdb
185
+ │ │ ├─ 906ee184dc1cd0615164a89ed64e8147b3fdccd1163d80d794c66814b3b09992.mdb
186
+ │ │ ├─ ceeeb7ea4cf6c0a8d395a2cf9c08871211fbbd17b9b5dc1005811845307e6b8f.mdb
187
+ │ │ ├─ e8535155b1b11ebd894c908e91a1e14e3461dddd1392695ddc90ae54a548d8b2.mdb
188
+
189
+ The `shard_cache` contains shards that are:
190
+
191
+ * Locally generated and successfully uploaded to the CAS
192
+ * Downloaded from CAS as part of the global deduplication algorithm
193
+
194
+ Shards provide a mapping between files and chunks. During uploads, each file is chunked and the hash of the chunk is saved. Every shard in the cache is then consulted. If a shard contains a chunk hash that is present in the local file being uploaded, then that chunk can be discarded as it is already stored in CAS.
195
+
196
+ All shards have an expiration date of 3-4 weeks from when they are downloaded. Shards that are expired are not loaded during upload and are deleted one week after expiration.
197
+
198
+ ### [](#limits-and-limitations)Limits and Limitations
199
+
200
+ The `chunk_cache` is limited to 10GB in size while the `shard_cache` is technically without limits (in practice, the size and use of shards are such that limiting the cache is unnecessary).
201
+
202
+ By design, both caches are without high-level APIs. These caches are used primarily to facilitate the reconstruction (download) or upload of a file. To interact with the assets themselves, it’s recommended that you use the [`huggingface_hub` cache system APIs](https://huggingface.co/docs/huggingface_hub/guides/manage-cache).
203
+
204
+ If you need to reclaim the space utilized by either cache or need to debug any potential cache-related issues, simply remove the `xet` cache entirely by running `rm -rf <cache_dir>/xet`, where `<cache_dir>` is the location of your Hugging Face cache, typically `~/.cache/huggingface`.
205
+
206
+ Example of a full `xet` cache directory tree:
207
+
208
+ Copied
209
+
210
+ <CACHE\_DIR>
211
+ ├─ xet
212
+ │ ├─ chunk\_cache
213
+ │ │ ├─ L1
214
+ │ │ │ ├─ L1GerURLUcISVivdseeoY1PnYifYkOaCCJ7V5Q9fjgxkZWZhdWx0
215
+ │ │ │ │ ├─ AAAAAAEAAAA5DQAAAAAAAIhRLjDI3SS5jYs4ysNKZiJy9XFI8CN7Ww0UyEA9KPD9
216
+ │ │ │ │ ├─ AQAAAAIAAABzngAAAAAAAPNqPjd5Zby5aBvabF7Z1itCx0ryMwoCnuQcDwq79jlB
217
+ │ ├─ shard\_cache
218
+ │ │ ├─ 1fe4ffd5cf0c3375f1ef9aec5016cf773ccc5ca294293d3f92d92771dacfc15d.mdb
219
+ │ │ ├─ 906ee184dc1cd0615164a89ed64e8147b3fdccd1163d80d794c66814b3b09992.mdb
220
+ │ │ ├─ ceeeb7ea4cf6c0a8d395a2cf9c08871211fbbd17b9b5dc1005811845307e6b8f.mdb
221
+ │ │ ├─ e8535155b1b11ebd894c908e91a1e14e3461dddd1392695ddc90ae54a548d8b2.mdb
222
+
223
+ To learn more about Xet Storage, see this [section](https://huggingface.co/docs/hub/storage-backends).
224
+
225
+ [](#caching-assets)Caching assets
226
+ ---------------------------------
227
+
228
+ In addition to caching files from the Hub, downstream libraries often need to cache other files related to HF but not handled directly by `huggingface_hub` (for example: files downloaded from GitHub, preprocessed data, logs, …). In order to cache those files, called `assets`, one can use [cached\_assets\_path()](/docs/huggingface_hub/v0.32.2/en/package_reference/cache#huggingface_hub.cached_assets_path). This small helper generates paths in the HF cache in a unified way, based on the name of the library requesting it and optionally on a namespace and a subfolder name. The goal is to let every downstream library manage its assets in its own way (e.g. no rule on the structure) as long as they stay in the right assets folder. Those libraries can then leverage tools from `huggingface_hub` to manage the cache, in particular scanning and deleting parts of the assets from a CLI command.
229
+
230
+ Copied
231
+
232
+ from huggingface\_hub import cached\_assets\_path
233
+
234
+ assets\_path = cached\_assets\_path(library\_name="datasets", namespace="SQuAD", subfolder="download")
235
+ something\_path = assets\_path / "something.json" \# Do anything you like in your assets folder !
236
+
237
+ [cached\_assets\_path()](/docs/huggingface_hub/v0.32.2/en/package_reference/cache#huggingface_hub.cached_assets_path) is the recommended way to store assets but is not mandatory. If your library already uses its own cache, feel free to use it!
238
+
239
+ ### [](#assets-in-practice)Assets in practice
240
+
241
+ In practice, your assets cache should look like the following tree:
242
+
243
+ Copied
244
+
245
+ assets/
246
+ └── datasets/
247
+ │ ├── SQuAD/
248
+ │ │ ├── downloaded/
249
+ │ │ ├── extracted/
250
+ │ │ └── processed/
251
+ │ ├── Helsinki-NLP--tatoeba\_mt/
252
+ │ ├── downloaded/
253
+ │ ├── extracted/
254
+ │ └── processed/
255
+ └── transformers/
256
+ ├── default/
257
+ │ ├── something/
258
+ ├── bert-base-cased/
259
+ │ ├── default/
260
+ │ └── training/
261
+ hub/
262
+ └── models--julien-c--EsperBERTo-small/
263
+ ├── blobs/
264
+ │ ├── (...)
265
+ │ ├── (...)
266
+ ├── refs/
267
+ │ └── (...)
268
+ └── \[ 128\] snapshots/
269
+ ├── 2439f60ef33a0d46d85da5001d52aeda5b00ce9f/
270
+ │ ├── (...)
271
+ └── bbc77c8132af1cc5cf678da3f1ddf2de43606d48/
272
+ └── (...)
273
+
274
+ [](#manage-your-file-based-cache)Manage your file-based cache
275
+ -------------------------------------------------------------
276
+
277
+ ### [](#scan-your-cache)Scan your cache
278
+
279
+ At the moment, cached files are never deleted from your local directory: when you download a new revision of a branch, previous files are kept in case you need them again. Therefore it can be useful to scan your cache directory in order to know which repos and revisions are taking the most disk space. `huggingface_hub` provides a helper to do so that can be used via the `huggingface-cli` tool or in a Python script.
280
+
281
+ **Scan cache from the terminal**
282
+
283
+ The easiest way to scan your HF cache-system is to use the `scan-cache` command from `huggingface-cli` tool. This command scans the cache and prints a report with information like repo id, repo type, disk usage, refs and full local path.
284
+
285
+ The snippet below shows a scan report in a folder in which 4 models and 2 datasets are cached.
286
+
287
+ Copied
288
+
289
+ ➜ huggingface-cli scan-cache
290
+ REPO ID REPO TYPE SIZE ON DISK NB FILES LAST\_ACCESSED LAST\_MODIFIED REFS LOCAL PATH
291
+ --------------------------- --------- ------------ -------- ------------- ------------- ------------------- -------------------------------------------------------------------------
292
+ glue dataset 116.3K 15 4 days ago 4 days ago 2.4.0, main, 1.17.0 /home/wauplin/.cache/huggingface/hub/datasets--glue
293
+ google/fleurs dataset 64.9M 6 1 week ago 1 week ago refs/pr/1, main /home/wauplin/.cache/huggingface/hub/datasets--google--fleurs
294
+ Jean-Baptiste/camembert-ner model 441.0M 7 2 weeks ago 16 hours ago main /home/wauplin/.cache/huggingface/hub/models--Jean-Baptiste--camembert-ner
295
+ bert-base-cased model 1.9G 13 1 week ago 2 years ago /home/wauplin/.cache/huggingface/hub/models--bert-base-cased
296
+ t5-base model 10.1K 3 3 months ago 3 months ago main /home/wauplin/.cache/huggingface/hub/models--t5-base
297
+ t5-small model 970.7M 11 3 days ago 3 days ago refs/pr/1, main /home/wauplin/.cache/huggingface/hub/models--t5-small
298
+
299
+ Done in 0.0s. Scanned 6 repo(s) for a total of 3.4G.
300
+ Got 1 warning(s) while scanning. Use -vvv to print details.
301
+
302
+ To get a more detailed report, use the `--verbose` option. For each repo, you get a list of all revisions that have been downloaded. As explained above, the files that don’t change between 2 revisions are shared thanks to the symlinks. This means that the size of the repo on disk is expected to be less than the sum of the size of each of its revisions. For example, here `bert-base-cased` has 2 revisions of 1.4G and 1.5G but the total disk usage is only 1.9G.
303
+
304
+ Copied
305
+
306
+ ➜ huggingface-cli scan-cache -v
307
+ REPO ID REPO TYPE REVISION SIZE ON DISK NB FILES LAST\_MODIFIED REFS LOCAL PATH
308
+ --------------------------- --------- ---------------------------------------- ------------ -------- ------------- ----------- ----------------------------------------------------------------------------------------------------------------------------
309
+ glue dataset 9338f7b671827df886678df2bdd7cc7b4f36dffd 97.7K 14 4 days ago main, 2.4.0 /home/wauplin/.cache/huggingface/hub/datasets--glue/snapshots/9338f7b671827df886678df2bdd7cc7b4f36dffd
310
+ glue dataset f021ae41c879fcabcf823648ec685e3fead91fe7 97.8K 14 1 week ago 1.17.0 /home/wauplin/.cache/huggingface/hub/datasets--glue/snapshots/f021ae41c879fcabcf823648ec685e3fead91fe7
311
+ google/fleurs dataset 129b6e96cf1967cd5d2b9b6aec75ce6cce7c89e8 25.4K 3 2 weeks ago refs/pr/1 /home/wauplin/.cache/huggingface/hub/datasets--google--fleurs/snapshots/129b6e96cf1967cd5d2b9b6aec75ce6cce7c89e8
312
+ google/fleurs dataset 24f85a01eb955224ca3946e70050869c56446805 64.9M 4 1 week ago main /home/wauplin/.cache/huggingface/hub/datasets--google--fleurs/snapshots/24f85a01eb955224ca3946e70050869c56446805
313
+ Jean-Baptiste/camembert-ner model dbec8489a1c44ecad9da8a9185115bccabd799fe 441.0M 7 16 hours ago main /home/wauplin/.cache/huggingface/hub/models--Jean-Baptiste--camembert-ner/snapshots/dbec8489a1c44ecad9da8a9185115bccabd799fe
314
+ bert-base-cased model 378aa1bda6387fd00e824948ebe3488630ad8565 1.5G 9 2 years ago /home/wauplin/.cache/huggingface/hub/models--bert-base-cased/snapshots/378aa1bda6387fd00e824948ebe3488630ad8565
315
+ bert-base-cased model a8d257ba9925ef39f3036bfc338acf5283c512d9 1.4G 9 3 days ago main /home/wauplin/.cache/huggingface/hub/models--bert-base-cased/snapshots/a8d257ba9925ef39f3036bfc338acf5283c512d9
316
+ t5-base model 23aa4f41cb7c08d4b05c8f327b22bfa0eb8c7ad9 10.1K 3 1 week ago main /home/wauplin/.cache/huggingface/hub/models--t5-base/snapshots/23aa4f41cb7c08d4b05c8f327b22bfa0eb8c7ad9
317
+ t5-small model 98ffebbb27340ec1b1abd7c45da12c253ee1882a 726.2M 6 1 week ago refs/pr/1 /home/wauplin/.cache/huggingface/hub/models--t5-small/snapshots/98ffebbb27340ec1b1abd7c45da12c253ee1882a
318
+ t5-small model d0a119eedb3718e34c648e594394474cf95e0617 485.8M 6 4 weeks ago /home/wauplin/.cache/huggingface/hub/models--t5-small/snapshots/d0a119eedb3718e34c648e594394474cf95e0617
319
+ t5-small model d78aea13fa7ecd06c29e3e46195d6341255065d5 970.7M 9 1 week ago main /home/wauplin/.cache/huggingface/hub/models--t5-small/snapshots/d78aea13fa7ecd06c29e3e46195d6341255065d5
320
+
321
+ Done in 0.0s. Scanned 6 repo(s) for a total of 3.4G.
322
+ Got 1 warning(s) while scanning. Use -vvv to print details.
323
+
324
+ **Grep example**
325
+
326
+ Since the output is in tabular format, you can combine it with any `grep`\-like tools to filter the entries. Here is an example to filter only revisions from the “t5-small” model on a Unix-based machine.
327
+
328
+ Copied
329
+
330
+ ➜ eval "huggingface-cli scan-cache -v" | grep "t5-small"
331
+ t5-small model 98ffebbb27340ec1b1abd7c45da12c253ee1882a 726.2M 6 1 week ago refs/pr/1 /home/wauplin/.cache/huggingface/hub/models--t5-small/snapshots/98ffebbb27340ec1b1abd7c45da12c253ee1882a
332
+ t5-small model d0a119eedb3718e34c648e594394474cf95e0617 485.8M 6 4 weeks ago /home/wauplin/.cache/huggingface/hub/models--t5-small/snapshots/d0a119eedb3718e34c648e594394474cf95e0617
333
+ t5-small model d78aea13fa7ecd06c29e3e46195d6341255065d5 970.7M 9 1 week ago main /home/wauplin/.cache/huggingface/hub/models--t5-small/snapshots/d78aea13fa7ecd06c29e3e46195d6341255065d5
334
+
335
+ **Scan cache from Python**
336
+
337
+ For a more advanced usage, use [scan\_cache\_dir()](/docs/huggingface_hub/v0.32.2/en/package_reference/cache#huggingface_hub.scan_cache_dir) which is the python utility called by the CLI tool.
338
+
339
+ You can use it to get a detailed report structured around 4 dataclasses:
340
+
341
+ * [HFCacheInfo](/docs/huggingface_hub/v0.32.2/en/package_reference/cache#huggingface_hub.HFCacheInfo): complete report returned by [scan\_cache\_dir()](/docs/huggingface_hub/v0.32.2/en/package_reference/cache#huggingface_hub.scan_cache_dir)
342
+ * [CachedRepoInfo](/docs/huggingface_hub/v0.32.2/en/package_reference/cache#huggingface_hub.CachedRepoInfo): information about a cached repo
343
+ * [CachedRevisionInfo](/docs/huggingface_hub/v0.32.2/en/package_reference/cache#huggingface_hub.CachedRevisionInfo): information about a cached revision (e.g. “snapshot”) inside a repo
344
+ * [CachedFileInfo](/docs/huggingface_hub/v0.32.2/en/package_reference/cache#huggingface_hub.CachedFileInfo): information about a cached file in a snapshot
345
+
346
+ Here is a simple usage example. See reference for details.
347
+
348
+ Copied
349
+
350
+ \>>> from huggingface\_hub import scan\_cache\_dir
351
+
352
+ \>>> hf\_cache\_info = scan\_cache\_dir()
353
+ HFCacheInfo(
354
+ size\_on\_disk=3398085269,
355
+ repos=frozenset({
356
+ CachedRepoInfo(
357
+ repo\_id='t5-small',
358
+ repo\_type='model',
359
+ repo\_path=PosixPath(...),
360
+ size\_on\_disk=970726914,
361
+ nb\_files=11,
362
+ last\_accessed=1662971707.3567169,
363
+ last\_modified=1662971107.3567169,
364
+ revisions=frozenset({
365
+ CachedRevisionInfo(
366
+ commit\_hash='d78aea13fa7ecd06c29e3e46195d6341255065d5',
367
+ size\_on\_disk=970726339,
368
+ snapshot\_path=PosixPath(...),
369
+ \# No \`last\_accessed\` as blobs are shared among revisions
370
+ last\_modified=1662971107.3567169,
371
+ files=frozenset({
372
+ CachedFileInfo(
373
+ file\_name='config.json',
374
+ size\_on\_disk=1197
375
+ file\_path=PosixPath(...),
376
+ blob\_path=PosixPath(...),
377
+ blob\_last\_accessed=1662971707.3567169,
378
+ blob\_last\_modified=1662971107.3567169,
379
+ ),
380
+ CachedFileInfo(...),
381
+ ...
382
+ }),
383
+ ),
384
+ CachedRevisionInfo(...),
385
+ ...
386
+ }),
387
+ ),
388
+ CachedRepoInfo(...),
389
+ ...
390
+ }),
391
+ warnings=\[
392
+ CorruptedCacheException("Snapshots dir doesn't exist in cached repo: ..."),
393
+ CorruptedCacheException(...),
394
+ ...
395
+ \],
396
+ )
397
+
398
+ ### [](#clean-your-cache)Clean your cache
399
+
400
+ Scanning your cache is interesting, but what you usually want to do next is delete some portions to free up space on your drive. This is possible using the `delete-cache` CLI command. One can also programmatically use the [delete\_revisions()](/docs/huggingface_hub/v0.32.2/en/package_reference/cache#huggingface_hub.HFCacheInfo.delete_revisions) helper from the [HFCacheInfo](/docs/huggingface_hub/v0.32.2/en/package_reference/cache#huggingface_hub.HFCacheInfo) object returned when scanning the cache (see the sketch after the delete strategy below).
401
+
402
+ **Delete strategy**
403
+
404
+ To delete some cache, you need to pass a list of revisions to delete. The tool will define a strategy to free up space based on this list. It returns a [DeleteCacheStrategy](/docs/huggingface_hub/v0.32.2/en/package_reference/cache#huggingface_hub.DeleteCacheStrategy) object that describes which files and folders will be deleted and how much space is expected to be freed. Once you agree with the deletion, you must execute the strategy to make it effective. To avoid discrepancies, you cannot edit a strategy object manually.
405
+
406
+ The strategy to delete revisions is the following:
407
+
408
+ * the `snapshot` folder containing the revision symlinks is deleted.
409
+ * blobs files that are targeted only by revisions to be deleted are deleted as well.
410
+ * if a revision is linked to 1 or more `refs`, the references are deleted as well.
411
+ * if all revisions from a repo are deleted, the entire cached repository is deleted.
412
+
413
+ Revision hashes are unique across all repositories. This means you don’t need to provide any `repo_id` or `repo_type` when removing revisions.
414
+
415
+ If a revision is not found in the cache, it will be silently ignored. Besides, if a file or folder cannot be found while trying to delete it, a warning will be logged but no error is thrown. The deletion continues for other paths contained in the [DeleteCacheStrategy](/docs/huggingface_hub/v0.32.2/en/package_reference/cache#huggingface_hub.DeleteCacheStrategy) object.
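+ As an illustration, here is a minimal sketch that frees every revision not attached to any ref; the `refs` attribute is assumed from the [CachedRevisionInfo](/docs/huggingface_hub/v0.32.2/en/package_reference/cache#huggingface_hub.CachedRevisionInfo) reference:
+
+ >>> from huggingface_hub import scan_cache_dir
+ >>> report = scan_cache_dir()
+ >>> # `rev.refs` is assumed from the CachedRevisionInfo reference: an empty set means the revision is detached
+ >>> detached = [rev.commit_hash for repo in report.repos for rev in repo.revisions if not rev.refs]
+ >>> strategy = report.delete_revisions(*detached)
+ >>> print("Will free " + strategy.expected_freed_size_str)
+ >>> strategy.execute()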
416
+
417
+ **Clean cache from the terminal**
418
+
419
+ The easiest way to delete some revisions from your HF cache-system is to use the `delete-cache` command of the `huggingface-cli` tool. The command has two modes. By default, a TUI (Terminal User Interface) is displayed so you can select which revisions to delete. This TUI is currently in beta as it has not been tested on all platforms. If the TUI doesn’t work on your machine, you can disable it with the `--disable-tui` flag.
420
+
421
+ **Using the TUI**
422
+
423
+ This is the default mode. To use it, you first need to install extra dependencies by running the following command:
424
+
425
426
+
427
+ pip install huggingface_hub["cli"]
428
+
429
+ Then run the command:
430
+
431
432
+
433
+ huggingface-cli delete-cache
434
+
435
+ You should now see a list of revisions that you can select/deselect:
436
+
437
+ ![](https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/hub/delete-cache-tui.png)
438
+
439
+ Instructions:
440
+
441
+ * Press keyboard arrow keys `<up>` and `<down>` to move the cursor.
442
+ * Press `<space>` to toggle (select/unselect) an item.
443
+ * When a revision is selected, the first line is updated to show you how much space will be freed.
444
+ * Press `<enter>` to confirm your selection.
445
+ * If you want to cancel the operation and quit, you can select the first item (“None of the following”). If this item is selected, the delete process will be cancelled, no matter what other items are selected. Otherwise you can also press `<ctrl+c>` to quit the TUI.
446
+
447
+ Once you’ve selected the revisions you want to delete and pressed `<enter>`, a final confirmation message is displayed. Press `<enter>` again and the deletion becomes effective. If you want to cancel, enter `n`.
448
+
449
450
+
451
+ ✗ huggingface-cli delete-cache --dir ~/.cache/huggingface/hub
452
+ ? Select revisions to delete: 2 revision(s) selected.
453
+ ? 2 revisions selected counting for 3.1G. Confirm deletion ? Yes
454
+ Start deletion.
455
+ Done. Deleted 1 repo(s) and 0 revision(s) for a total of 3.1G.
456
+
457
+ **Without TUI**
458
+
459
+ As mentioned above, the TUI mode is currently in beta and is optional. It may not work on your machine, or you may simply not find it convenient.
460
+
461
+ Another approach is to use the `--disable-tui` flag. The process is very similar: you will still be asked to review the list of revisions to delete, but this manual step takes place not in the terminal but in a temporary file generated on the fly that you can edit by hand.
462
+
463
+ This file has all the instructions you need in its header. Open it in your favorite text editor. To deselect a revision, comment it out with a `#`; to keep it selected for deletion, leave it uncommented. Once the manual review is done, save the file, go back to your terminal and press `<enter>`. The tool will compute how much space would be freed with the updated list of revisions. You can continue editing the file or confirm with `"y"`.
464
+
465
466
+
467
+ huggingface-cli delete-cache --disable-tui
468
+
469
+ Example of command file:
470
+
471
472
+
473
+ # INSTRUCTIONS
474
+ # ------------
475
+ # This is a temporary file created by running `huggingface-cli delete-cache` with the
476
+ # `--disable-tui` option. It contains a set of revisions that can be deleted from your
477
+ # local cache directory.
478
+ #
479
+ # Please manually review the revisions you want to delete:
480
+ # - Revision hashes can be commented out with '#'.
481
+ # - Only non-commented revisions in this file will be deleted.
482
+ # - Revision hashes that are removed from this file are ignored as well.
483
+ # - If `CANCEL_DELETION` line is uncommented, the all cache deletion is cancelled and
484
+ # no changes will be applied.
485
+ #
486
+ # Once you've manually reviewed this file, please confirm deletion in the terminal. This
487
+ # file will be automatically removed once done.
488
+ # ------------
489
+
490
+ # KILL SWITCH
491
+ # ------------
492
+ # Un-comment following line to completely cancel the deletion process
493
+ # CANCEL_DELETION
494
+ # ------------
495
+
496
+ # REVISIONS
497
+ # ------------
498
+ # Dataset chrisjay/crowd-speech-africa (761.7M, used 5 days ago)
499
+ ebedcd8c55c90d39fd27126d29d8484566cd27ca # Refs: main # modified 5 days ago
500
+
501
+ # Dataset oscar (3.3M, used 4 days ago)
502
+ # 916f956518279c5e60c63902ebdf3ddf9fa9d629 # Refs: main # modified 4 days ago
503
+
504
+ # Dataset wikiann (804.1K, used 2 weeks ago)
505
+ 89d089624b6323d69dcd9e5eb2def0551887a73a # Refs: main # modified 2 weeks ago
506
+
507
+ # Dataset z-uo/male-LJSpeech-italian (5.5G, used 5 days ago)
508
+ # 9cfa5647b32c0a30d0adfca06bf198d82192a0d1 # Refs: main # modified 5 days ago
509
+
510
+ **Clean cache from Python**
511
+
512
+ For more flexibility, you can also use the [delete_revisions()](/docs/huggingface_hub/v0.32.2/en/package_reference/cache#huggingface_hub.HFCacheInfo.delete_revisions) method programmatically. Here is a simple example. See reference for details.
+
+ >>> from huggingface_hub import scan_cache_dir
+
+ >>> delete_strategy = scan_cache_dir().delete_revisions(
+ ...     "81fd1d6e7847c99f5862c9fb81387956d99ec7aa",
+ ...     "e2983b237dccf3ab4937c97fa717319a9ca1a96d",
+ ...     "6c0e6080953db56375760c0471a8c5f2929baf11",
+ ... )
+ >>> print("Will free " + delete_strategy.expected_freed_size_str)
+ Will free 8.6G
+
+ >>> delete_strategy.execute()
+ Cache deletion done. Saved 8.6G.
528
+
529
hymm_sp/__init__.py ADDED
File without changes
hymm_sp/config.py ADDED
@@ -0,0 +1,160 @@
1
+ import argparse
2
+ from hymm_sp.constants import *
3
+ import re
4
+ import collections.abc
5
+
6
+ def as_tuple(x):
7
+ if isinstance(x, collections.abc.Iterable) and not isinstance(x, str):
8
+ return tuple(x)
9
+ if x is None or isinstance(x, (int, float, str)):
10
+ return (x,)
11
+ else:
12
+ raise ValueError(f"Unknown type {type(x)}")
13
+
14
+ def parse_args(namespace=None):
15
+ parser = argparse.ArgumentParser(description="Hunyuan Multimodal training/inference script")
16
+ parser = add_extra_args(parser)
17
+ args = parser.parse_args(namespace=namespace)
18
+ args = sanity_check_args(args)
19
+ return args
20
+
21
+ def add_extra_args(parser: argparse.ArgumentParser):
22
+ parser = add_network_args(parser)
23
+ parser = add_extra_models_args(parser)
24
+ parser = add_denoise_schedule_args(parser)
25
+ parser = add_evaluation_args(parser)
26
+ parser = add_test_args(parser)
27
+ return parser
28
+
29
+ def add_test_args(parser: argparse.ArgumentParser):
30
+ group = parser.add_argument_group(title="Test")
31
+
32
+ group.add_argument("--image-start", action="store_true", help="Use one image from video for training")
33
+ group.add_argument("--use-csv-pose", action="store_true", help="Use one image from video for training")
34
+ group.add_argument("--add-button", action="store_true", help="Use one image from video for training")
35
+ group.add_argument("--action-list", type=str, nargs='+', default=None, help="CSV file for evaluation.")
36
+ group.add_argument("--action-speed-list", type=float, nargs='+', default=None, help="CSV file for evaluation.")
37
+ group.add_argument("--pose", type=str, default=None, help="CSV file for evaluation.")
38
+
39
+ return parser
40
+
41
+
42
+ def add_network_args(parser: argparse.ArgumentParser):
43
+ group = parser.add_argument_group(title="Network")
44
+ group.add_argument("--model", type=str, default="HYVideo-T/2",
45
+ help="Model architecture to use. It it also used to determine the experiment directory.")
46
+ group.add_argument("--latent-channels", type=str, default=None,
47
+ help="Number of latent channels of DiT. If None, it will be determined by `vae`. If provided, "
48
+ "it still needs to match the latent channels of the VAE model.")
49
+ group.add_argument("--rope-theta", type=int, default=256, help="Theta used in RoPE.")
50
+ return parser
51
+
52
+ def add_extra_models_args(parser: argparse.ArgumentParser):
53
+ group = parser.add_argument_group(title="Extra Models (VAE, Text Encoder, Tokenizer)")
54
+
55
+ # VAE
56
+ group.add_argument("--vae", type=str, default="884-16c-hy0801", help="Name of the VAE model.")
57
+ group.add_argument("--vae-precision", type=str, default="fp16",
58
+ help="Precision mode for the VAE model.")
59
+ group.add_argument("--vae-tiling", action="store_true", default=True, help="Enable tiling for the VAE model.")
60
+ group.add_argument("--text-encoder", type=str, default="llava-llama-3-8b", choices=list(TEXT_ENCODER_PATH),
61
+ help="Name of the text encoder model.")
62
+ group.add_argument("--text-encoder-precision", type=str, default="fp16", choices=PRECISIONS,
63
+ help="Precision mode for the text encoder model.")
64
+ group.add_argument("--text-states-dim", type=int, default=4096, help="Dimension of the text encoder hidden states.")
65
+ group.add_argument("--text-len", type=int, default=256, help="Maximum length of the text input.")
66
+ group.add_argument("--tokenizer", type=str, default="llava-llama-3-8b", choices=list(TOKENIZER_PATH),
67
+ help="Name of the tokenizer model.")
68
+ group.add_argument("--text-encoder-infer-mode", type=str, default="encoder", choices=["encoder", "decoder"],
69
+ help="Inference mode for the text encoder model. It should match the text encoder type. T5 and "
70
+ "CLIP can only work in 'encoder' mode, while Llava/GLM can work in both modes.")
71
+
72
+ group.add_argument("--prompt-template-video", type=str, default='li-dit-encode-video', choices=PROMPT_TEMPLATE,
73
+ help="Video prompt template for the decoder-only text encoder model.")
74
+ group.add_argument("--hidden-state-skip-layer", type=int, default=2,
75
+ help="Skip layer for hidden states.")
76
+ group.add_argument("--apply-final-norm", action="store_true",
77
+ help="Apply final normalization to the used text encoder hidden states.")
78
+
79
+ # - CLIP
80
+ group.add_argument("--text-encoder-2", type=str, default='clipL', choices=list(TEXT_ENCODER_PATH),
81
+ help="Name of the second text encoder model.")
82
+ group.add_argument("--text-encoder-precision-2", type=str, default="fp16", choices=PRECISIONS,
83
+ help="Precision mode for the second text encoder model.")
84
+ group.add_argument("--text-states-dim-2", type=int, default=768,
85
+ help="Dimension of the second text encoder hidden states.")
86
+ group.add_argument("--tokenizer-2", type=str, default='clipL', choices=list(TOKENIZER_PATH),
87
+ help="Name of the second tokenizer model.")
88
+ group.add_argument("--text-len-2", type=int, default=77, help="Maximum length of the second text input.")
89
+ group.set_defaults(use_attention_mask=True)
90
+ group.add_argument("--text-projection", type=str, default="single_refiner", choices=TEXT_PROJECTION,
91
+ help="A projection layer for bridging the text encoder hidden states and the diffusion model "
92
+ "conditions.")
93
+ return parser
94
+
95
+
96
+ def add_denoise_schedule_args(parser: argparse.ArgumentParser):
97
+ group = parser.add_argument_group(title="Denoise schedule")
98
+ group.add_argument("--flow-shift-eval-video", type=float, default=None, help="Shift factor for flow matching schedulers when using video data.")
99
+ group.add_argument("--flow-reverse", action="store_true", default=True, help="If reverse, learning/sampling from t=1 -> t=0.")
100
+ group.add_argument("--flow-solver", type=str, default="euler", help="Solver for flow matching.")
101
+ group.add_argument("--use-linear-quadratic-schedule", action="store_true", help="Use linear quadratic schedule for flow matching."
102
+ "Follow MovieGen (https://ai.meta.com/static-resource/movie-gen-research-paper)")
103
+ group.add_argument("--linear-schedule-end", type=int, default=25, help="End step for linear quadratic schedule for flow matching.")
104
+ return parser
105
+
106
+ def add_evaluation_args(parser: argparse.ArgumentParser):
107
+ group = parser.add_argument_group(title="Validation Loss Evaluation")
108
+ parser.add_argument("--precision", type=str, default="bf16", choices=PRECISIONS,
109
+ help="Precision mode. Options: fp32, fp16, bf16. Applied to the backbone model and optimizer.")
110
+ parser.add_argument("--reproduce", action="store_true",
111
+ help="Enable reproducibility by setting random seeds and deterministic algorithms.")
112
+ parser.add_argument("--ckpt", type=str, help="Path to the checkpoint to evaluate.")
113
+ parser.add_argument("--load-key", type=str, default="module", choices=["module", "ema"],
114
+ help="Key to load the model states. 'module' for the main model, 'ema' for the EMA model.")
115
+ parser.add_argument("--cpu-offload", action="store_true", help="Use CPU offload for the model load.")
116
+ group.add_argument( "--use-fp8", action="store_true", help="Enable use fp8 for inference acceleration.")
117
+ group.add_argument("--video-size", type=int, nargs='+', default=512,
118
+ help="Video size for training. If a single value is provided, it will be used for both width "
119
+ "and height. If two values are provided, they will be used for width and height "
120
+ "respectively.")
121
+ group.add_argument("--sample-n-frames", type=int, default=33,
122
+ help="How many frames to sample from a video. if using 3d vae, the number should be 4n+1")
123
+ group.add_argument("--infer-steps", type=int, default=100, help="Number of denoising steps for inference.")
124
+ group.add_argument("--val-disable-autocast", action="store_true",
125
+ help="Disable autocast for denoising loop and vae decoding in pipeline sampling.")
126
+ group.add_argument("--num-images", type=int, default=1, help="Number of images to generate for each prompt.")
127
+ group.add_argument("--seed", type=int, default=1024, help="Seed for evaluation.")
128
+ group.add_argument("--save-path-suffix", type=str, default="", help="Suffix for the directory of saved samples.")
129
+ group.add_argument("--prompt", type=str, default='', help="Main prompt")
130
+ group.add_argument("--pos-prompt", type=str, default='', help="Prompt for sampling during evaluation.")
131
+ group.add_argument("--neg-prompt", type=str, default='', help="Negative prompt for sampling during evaluation.")
132
+ group.add_argument("--add-pos-prompt", type=str, default='', help="Addition prompt for sampling during evaluation.")
133
+ group.add_argument("--add-neg-prompt", type=str, default='', help="Addition negative prompt for sampling during evaluation.")
134
+ group.add_argument("--pad-face-size", type=float, default=0.7, help="Pad bbox for face align.")
135
+ group.add_argument("--image-path", type=str, default="", help="")
136
+ group.add_argument("--save-path", type=str, default=None, help="Path to save the generated samples.")
137
+ group.add_argument("--input", type=str, default=None, help="test data.")
138
+ group.add_argument("--item-name", type=str, default=None, help="")
139
+ group.add_argument("--cfg-scale", type=float, default=7.5, help="Classifier free guidance scale.")
140
+ group.add_argument("--ip-cfg-scale", type=float, default=0, help="Classifier free guidance scale.")
141
+ group.add_argument("--use-deepcache", type=int, default=1)
142
+ group.add_argument("--use-sage", action="store_true", help="Use sage attention for speed up.")
143
+
144
+ return parser
145
+
146
+ def sanity_check_args(args):
147
+ # VAE channels
148
+ vae_pattern = r"\d{2,3}-\d{1,2}c-\w+"
149
+ if not re.match(vae_pattern, args.vae):
150
+ raise ValueError(
151
+ f"Invalid VAE model: {args.vae}. Must be in the format of '{vae_pattern}'."
152
+ )
153
+ vae_channels = int(args.vae.split("-")[1][:-1])
154
+ if args.latent_channels is None:
155
+ args.latent_channels = vae_channels
156
+ if vae_channels != args.latent_channels:
157
+ raise ValueError(
158
+ f"Latent channels ({args.latent_channels}) must match the VAE channels ({vae_channels})."
159
+ )
160
+ return args
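A minimal, hypothetical smoke test for the parser defined above: it builds the full argument parser, parses an empty command line so only the defaults apply, and runs the sanity check (assumes the `hymm_sp` package is importable and `torch` is installed).

```python
import argparse

from hymm_sp.config import add_extra_args, sanity_check_args

# Build the full parser (network, extra models, denoise schedule, evaluation, test groups).
parser = add_extra_args(argparse.ArgumentParser(description="Hunyuan config smoke test"))

# Parse an empty argv so only the defaults defined in config.py are used.
args = sanity_check_args(parser.parse_args([]))

# latent_channels is derived from the default VAE name "884-16c-hy0801" -> 16 channels.
print(args.model, args.vae, args.latent_channels, args.infer_steps)
```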
hymm_sp/constants.py ADDED
@@ -0,0 +1,58 @@
1
+ import os
2
+ import torch
3
+
4
+ __all__ = [
5
+ "PROMPT_TEMPLATE", "MODEL_BASE", "PRECISION_TO_TYPE",
6
+ "PRECISIONS", "VAE_PATH", "TEXT_ENCODER_PATH", "TOKENIZER_PATH",
7
+ "TEXT_PROJECTION",
8
+ ]
9
+
10
+ # =================== Constant Values =====================
11
+
12
+ PRECISION_TO_TYPE = {
13
+ 'fp32': torch.float32,
14
+ 'fp16': torch.float16,
15
+ 'bf16': torch.bfloat16,
16
+ }
17
+
18
+ PROMPT_TEMPLATE_ENCODE_VIDEO = (
19
+ "<|start_header_id|>system<|end_header_id|>\n\nDescribe the video by detailing the following aspects: "
20
+ "1. The main content and theme of the video."
21
+ "2. The color, shape, size, texture, quantity, text, and spatial relationships of the objects."
22
+ "3. Actions, events, behaviors temporal relationships, physical movement changes of the objects."
23
+ "4. background environment, light, style and atmosphere."
24
+ "5. camera angles, movements, and transitions used in the video:<|eot_id|>"
25
+ "<|start_header_id|>user<|end_header_id|>\n\n{}<|eot_id|>"
26
+ )
27
+
28
+ PROMPT_TEMPLATE = {
29
+ "li-dit-encode-video": {"template": PROMPT_TEMPLATE_ENCODE_VIDEO, "crop_start": 95},
30
+ }
31
+
32
+ # ======================= Model ======================
33
+ PRECISIONS = {"fp32", "fp16", "bf16"}
34
+
35
+ # =================== Model Path =====================
36
+ MODEL_BASE = os.getenv("MODEL_BASE")
37
+
38
+ # 3D VAE
39
+ VAE_PATH = {
40
+ "884-16c-hy0801": f"{MODEL_BASE}/vae_3d/hyvae",
41
+ }
42
+
43
+ # Text Encoder
44
+ TEXT_ENCODER_PATH = {
45
+ "clipL": f"{MODEL_BASE}/openai_clip-vit-large-patch14",
46
+ "llava-llama-3-8b": f"{MODEL_BASE}/llava-llama-3-8b-v1_1-transformers",
47
+ }
48
+
49
+ # Tokenizer
50
+ TOKENIZER_PATH = {
51
+ "clipL": f"{MODEL_BASE}/openai_clip-vit-large-patch14",
52
+ "llava-llama-3-8b": f"{MODEL_BASE}/llava-llama-3-8b-v1_1-transformers",
53
+ }
54
+
55
+ TEXT_PROJECTION = {
56
+ "linear", # Default, an nn.Linear() layer
57
+ "single_refiner", # Single TokenRefiner. Refer to LI-DiT
58
+ }
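Because `MODEL_BASE` is read once at import time, it has to be set in the environment before `hymm_sp.constants` (or any module importing it) is loaded; a minimal sketch with a hypothetical weights directory:

```python
import os

# Must be set before the first import of hymm_sp.constants, since the path
# dictionaries above are built at module import time.
os.environ.setdefault("MODEL_BASE", "/path/to/weights")  # hypothetical location

from hymm_sp.constants import PRECISION_TO_TYPE, TEXT_ENCODER_PATH, VAE_PATH

dtype = PRECISION_TO_TYPE["bf16"]             # torch.bfloat16
print(VAE_PATH["884-16c-hy0801"])             # /path/to/weights/vae_3d/hyvae
print(TEXT_ENCODER_PATH["llava-llama-3-8b"])  # /path/to/weights/llava-llama-3-8b-v1_1-transformers
```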
hymm_sp/data_kits/data_tools.py ADDED
@@ -0,0 +1,115 @@
1
+ import os
2
+ import cv2
3
+ import torch
4
+ import numpy as np
5
+ import imageio
6
+ import torchvision
7
+ from einops import rearrange
8
+
9
+
10
+ def save_videos_grid(videos: torch.Tensor, path: str, rescale=False, n_rows=6, fps=8, quality=8):
11
+ """
12
+ Saves a batch of videos as a grid animation in GIF or video format.
13
+
14
+ Args:
15
+ videos (torch.Tensor): Input video tensor with shape (batch, channels, time, height, width)
16
+ path (str): Output file path (e.g., "output/videos.gif")
17
+ rescale (bool): If True, rescales video values from [-1, 1] to [0, 1]
18
+ n_rows (int): Number of rows in the grid layout
19
+ fps (int): Frames per second for the output animation
20
+ quality (int): Quality parameter for imageio (1-10, higher = better quality)
21
+
22
+ Process:
23
+ 1. Rearranges tensor dimensions to (time, batch, channels, height, width)
24
+ 2. For each frame in time:
25
+ a. Creates a grid of videos using torchvision.utils.make_grid
26
+ b. Adjusts dimensions to (height, width, channels)
27
+ c. Rescales values if needed
28
+ d. Converts to 8-bit uint8 format (0-255)
29
+ 3. Saves frames as an animated GIF/video using imageio
30
+ """
31
+ # Rearrange dimensions to (time, batch, channels, height, width) for frame-wise processing
32
+ videos = rearrange(videos, "b c t h w -> t b c h w")
33
+ outputs = [] # Stores processed frames for animation
34
+
35
+ for frame in videos:
36
+ # Create a grid of videos with n_rows rows
37
+ grid = torchvision.utils.make_grid(frame, nrow=n_rows)
38
+
39
+ # Convert from (channels, height, width) to (height, width, channels)
40
+ grid = grid.transpose(0, 1).transpose(1, 2).squeeze(-1)
41
+
42
+ # Rescale from [-1, 1] to [0, 1] if needed (common in GAN outputs)
43
+ if rescale:
44
+ grid = (grid + 1.0) / 2.0
45
+
46
+ # Clamp values to valid range [0, 1] and convert to 8-bit uint8 (0-255)
47
+ grid = torch.clamp(grid, 0, 1)
48
+ grid_np = (grid * 255).numpy().astype(np.uint8)
49
+
50
+ outputs.append(grid_np)
51
+
52
+ # Create output directory if it doesn't exist
53
+ os.makedirs(os.path.dirname(path), exist_ok=True)
54
+
55
+ # Save frames as an animated GIF/video
56
+ imageio.mimsave(path, outputs, fps=fps, quality=quality)
57
+
58
+
59
+ def pad_image(crop_img, size, color=(255, 255, 255), resize_ratio=1):
60
+ """
61
+ Resizes and pads an image to fit a target size while preserving aspect ratio.
62
+
63
+ Args:
64
+ crop_img (np.ndarray): Input image (shape: [height, width, channels])
65
+ size (tuple): Target size in (width, height) format
66
+ color (tuple): RGB color for padding (default: white)
67
+ resize_ratio (float): Scaling factor for resizing before padding (0-1)
68
+
69
+ Returns:
70
+ np.ndarray: Padded image with shape (target_height, target_width, channels)
71
+
72
+ Process:
73
+ 1. Calculates scaling factors to fit image within target size
74
+ 2. Resizes image while preserving aspect ratio
75
+ 3. Adds padding to reach exact target size, centering the resized image
76
+ """
77
+ # Get input image dimensions
78
+ crop_h, crop_w = crop_img.shape[:2]
79
+ target_w, target_h = size # Target dimensions (width, height)
80
+
81
+ # Calculate scaling factors to fit image within target size
82
+ scale_h = target_h / crop_h # Scale needed to fit height
83
+ scale_w = target_w / crop_w # Scale needed to fit width
84
+
85
+ # Choose the smaller scale to avoid exceeding target dimensions
86
+ if scale_w > scale_h:
87
+ # Height is the limiting factor: resize based on height
88
+ resize_h = int(target_h * resize_ratio)
89
+ resize_w = int(crop_w / crop_h * resize_h) # Preserve aspect ratio
90
+ else:
91
+ # Width is the limiting factor: resize based on width
92
+ resize_w = int(target_w * resize_ratio)
93
+ resize_h = int(crop_h / crop_w * resize_w) # Preserve aspect ratio
94
+
95
+ # Resize the image using OpenCV
96
+ resized_img = cv2.resize(crop_img, (resize_w, resize_h))
97
+
98
+ # Calculate padding needed to reach target size (centered)
99
+ pad_left = (target_w - resize_w) // 2
100
+ pad_top = (target_h - resize_h) // 2
101
+ pad_right = target_w - resize_w - pad_left # Ensure total width matches target
102
+ pad_bottom = target_h - resize_h - pad_top # Ensure total height matches target
103
+
104
+ # Add padding with the specified color
105
+ padded_img = cv2.copyMakeBorder(
106
+ resized_img,
107
+ top=pad_top,
108
+ bottom=pad_bottom,
109
+ left=pad_left,
110
+ right=pad_right,
111
+ borderType=cv2.BORDER_CONSTANT,
112
+ value=color
113
+ )
114
+
115
+ return padded_img
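A minimal usage sketch for the two helpers above; shapes and the output path are made up, and writing `.mp4` assumes an imageio backend (e.g. imageio-ffmpeg) that accepts the `fps` and `quality` arguments.

```python
import numpy as np
import torch

from hymm_sp.data_kits.data_tools import pad_image, save_videos_grid

# Pad a 640x480 RGB frame onto a 1216x704 white canvas; `size` is (width, height).
frame = (np.random.rand(480, 640, 3) * 255).astype(np.uint8)
padded = pad_image(frame, (1216, 704))
print(padded.shape)  # (704, 1216, 3)

# Save a tiny random clip as a grid animation: (batch, channels, time, height, width), values in [0, 1].
# The tensor must live on CPU because save_videos_grid converts frames with .numpy().
clip = torch.rand(2, 3, 8, 64, 64)
save_videos_grid(clip, "output/demo.mp4", n_rows=2, fps=8)  # hypothetical output path
```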
hymm_sp/data_kits/video_dataset.py ADDED
@@ -0,0 +1,259 @@
1
+ import os
2
+ import cv2
3
+ import torch
4
+ import json
5
+ import numpy as np
6
+ import pandas as pd
7
+ from PIL import Image
8
+ import torchvision.transforms as transforms
9
+ from torch.utils.data import Dataset
10
+ import csv
+
+ # Needed by JsonDataset.read_image below
+ from hymm_sp.data_kits.data_tools import pad_image
11
+
12
+
13
+ def fix_nulls(s):
14
+ """
15
+ Helper generator to remove null characters from input lines.
16
+ Prevents parsing errors caused by invalid null bytes in CSV/JSON files.
17
+
18
+ Args:
19
+ s: Input iterable containing strings with potential null characters
20
+
21
+ Yields:
22
+ Strings with null characters replaced by spaces
23
+ """
24
+ for line in s:
25
+ yield line.replace('\0', ' ')
26
+
27
+ def get_closest_ratio(height: float, width: float, ratios: list, buckets: list):
28
+ """
29
+ Find the closest aspect ratio from predefined buckets
30
+
31
+ Args:
32
+ height: Image height
33
+ width: Image width
34
+ ratios: List of predefined aspect ratios to match against
35
+ buckets: List of size tuples corresponding to ratios
36
+
37
+ Returns:
38
+ Tuple containing:
39
+ - Closest matching size bucket
40
+ - Closest ratio value
41
+ """
42
+ aspect_ratio = float(height) / float(width)
43
+ closest_ratio_id = np.abs(ratios - aspect_ratio).argmin()
44
+ closest_ratio = min(ratios, key=lambda ratio: abs(float(ratio) - aspect_ratio))
45
+ return buckets[closest_ratio_id], float(closest_ratio)
46
+
47
+
48
+ def generate_crop_size_list(base_size=256, patch_size=16, max_ratio=4.0):
49
+ """
50
+ Generate valid crop sizes that maintain compatible dimensions with model patches
51
+
52
+ Args:
53
+ base_size: Base dimension for calculating patch count
54
+ patch_size: Size of model's input patches
55
+ max_ratio: Maximum allowed aspect ratio (height/width)
56
+
57
+ Returns:
58
+ List of (width, height) tuples representing valid crop sizes
59
+ """
60
+ # Calculate total number of patches from base size
61
+ num_patches = round((base_size / patch_size) ** 2)
62
+ assert max_ratio >= 1.0, "Maximum ratio must be at least 1.0"
63
+
64
+ crop_size_list = []
65
+ wp, hp = num_patches, 1 # Initialize with maximum width patches
66
+
67
+ # Generate valid patch combinations
68
+ while wp > 0:
69
+ # Only add sizes that maintain acceptable aspect ratio
70
+ if max(wp, hp) / min(wp, hp) <= max_ratio:
71
+ crop_size_list.append((wp * patch_size, hp * patch_size))
72
+
73
+ # Move to next valid patch configuration
74
+ if (hp + 1) * wp <= num_patches:
75
+ hp += 1
76
+ else:
77
+ wp -= 1
78
+ return crop_size_list
79
+
80
+
81
+ class VideoCSVDataset(Dataset):
82
+ """
83
+ Dataset class for loading video generation data from CSV files
84
+
85
+ Handles:
86
+ - CSV parsing with null character handling
87
+ - Loading prompt and metadata
88
+ - Supporting multiple task types (image-to-video, etc.)
89
+ """
90
+ def __init__(self, csv_path, col_name='prompt', task_type=''):
91
+ """
92
+ Args:
93
+ csv_path: Path to CSV file containing dataset metadata
94
+ col_name: Column name containing generation prompts
95
+ task_type: Type of task (e.g., "i2v" for image-to-video)
96
+ """
97
+ # Read CSV with null character handling
98
+ with open(csv_path, 'r', newline="\n", encoding='utf-8-sig') as csvfile:
99
+ self.dataset = list(csv.DictReader(fix_nulls(csvfile), delimiter=';'))
100
+
101
+ self.col_name = col_name
102
+ self.task_type = task_type
103
+
104
+ def __len__(self):
105
+ """Return total number of samples in dataset"""
106
+ return len(self.dataset)
107
+
108
+ def __getitem__(self, idx):
109
+ """
110
+ Get dataset item by index
111
+
112
+ Args:
113
+ idx: Index of sample to retrieve
114
+
115
+ Returns:
116
+ Dictionary containing:
117
+ - Prompt and metadata
118
+ - Paths to auxiliary files (npy, video, poses)
119
+ - Index for tracking outputs
120
+ """
121
+ example = {}
122
+ example["prompt"] = self.dataset[idx][self.col_name]
123
+ example['seed'] = int(self.dataset[idx]['seed'])
124
+ example['index'] = self.dataset[idx]['index']
125
+
126
+ # Add optional auxiliary paths if present in CSV
127
+ if "npy_path" in self.dataset[idx]:
128
+ example['npy_path'] = self.dataset[idx]['npy_path']
129
+ if "video_path" in self.dataset[idx]:
130
+ example['video_path'] = self.dataset[idx]['video_path']
131
+ if "monst3r_poses" in self.dataset[idx]:
132
+ example['monst3r_poses'] = self.dataset[idx]['monst3r_poses']
133
+
134
+ # Add image reference path for image-to-video tasks
135
+ if self.task_type == "i2v":
136
+ example['ref_image'] = self.dataset[idx]['ref_image_path']
137
+
138
+ return example
139
+
140
+
141
+ class JsonDataset(object):
142
+ """
143
+ Dataset class for loading data from JSON files and image sequences
144
+
145
+ Handles:
146
+ - Reading image data from multiple formats
147
+ - Preprocessing for model compatibility
148
+ - Generating conditional and unconditional inputs
149
+ """
150
+ def __init__(self, args):
151
+ """
152
+ Args:
153
+ args: Command-line arguments containing configuration
154
+ """
155
+ self.args = args
156
+ self.data_list = args.input
157
+ self.pad_color = (255, 255, 255) # White padding
158
+ self.llava_size = (336, 336) # Standard size for LLaVA model
159
+ self.ref_size = (args.video_size[1], args.video_size[0]) # Reference output size
160
+
161
+ # Get list of data paths from input list or single file
162
+ if self.data_list.endswith('.list'):
163
+ self.data_paths = [line.strip() for line in open(self.data_list, 'r')] if self.data_list else []
164
+ else:
165
+ self.data_paths = [self.data_list]
166
+
167
+ # Transformation pipeline for LLaVA model input
168
+ self.llava_transform = transforms.Compose(
169
+ [
170
+ transforms.Resize(self.llava_size, interpolation=transforms.InterpolationMode.BILINEAR),
171
+ transforms.ToTensor(),
172
+ transforms.Normalize(
173
+ (0.48145466, 0.4578275, 0.4082107),
174
+ (0.26862954, 0.26130258, 0.27577711)
175
+ ),
176
+ ]
177
+ )
178
+
179
+ def __len__(self):
180
+ """Return total number of data items"""
181
+ return len(self.data_paths)
182
+
183
+ def read_image(self, image_path):
184
+ """
185
+ Read image from path with fallback handling
186
+
187
+ Args:
188
+ image_path: Path to image file or dictionary containing path
189
+
190
+ Returns:
191
+ Tuple of (LLaVA-formatted image, reference-sized image)
192
+ """
193
+ # Extract path from dictionary if needed
194
+ if isinstance(image_path, dict):
195
+ image_path = image_path['seg_item_image_path']
196
+
197
+ try:
198
+ # Primary method: OpenCV for faster reading
199
+ face_image_masked = cv2.cvtColor(cv2.imread(image_path), cv2.COLOR_BGR2RGB)
200
+ except:
201
+ # Fallback: PIL for special formats
202
+ face_image_masked = Image.open(image_path).convert('RGB')
203
+
204
+ # Prepare images for different processing stages
205
+ cat_face_image = pad_image(face_image_masked.copy(), self.ref_size)
206
+ llava_face_image = pad_image(face_image_masked.copy(), self.llava_size)
207
+ return llava_face_image, cat_face_image
208
+
209
+ def __getitem__(self, idx):
210
+ """
211
+ Get preprocessed data item by index
212
+
213
+ Args:
214
+ idx: Index of item to retrieve
215
+
216
+ Returns:
217
+ Dictionary containing:
218
+ - Preprocessed tensors for model input
219
+ - Metadata (prompt, index, paths)
220
+ """
221
+ data_path = self.data_paths[idx]
222
+ data_name = os.path.basename(os.path.splitext(data_path)[0])
223
+
224
+ # Load data from JSON or use default parameters
225
+ if data_path.endswith('.json'):
226
+ data = json.load(open(data_path, 'r'))
227
+ llava_item_image, cat_item_image = self.read_image(data)
228
+ item_prompt = data['item_prompt']
229
+ seed = data['seed']
230
+ prompt = data['prompt']
231
+ negative_prompt = data.get('negative_prompt', '') # Default to empty string
232
+ else:
233
+ # Handle non-JSON data (direct image files)
234
+ llava_item_image, cat_item_image = self.read_image(data_path)
235
+ item_prompt = 'object'
236
+ seed = self.args.seed
237
+ prompt = self.args.pos_prompt
238
+ negative_prompt = self.args.neg_prompt
239
+
240
+ # Convert to tensors with appropriate transformations
241
+ llava_item_tensor = self.llava_transform(Image.fromarray(llava_item_image.astype(np.uint8)))
242
+ cat_item_tensor = torch.from_numpy(cat_item_image.copy()).permute((2, 0, 1)) / 255.0 # Normalize to [0,1]
243
+
244
+ # Create unconditional input (white background)
245
+ uncond_llava_item_image = np.ones_like(llava_item_image) * 255
246
+ uncond_llava_item_tensor = self.llava_transform(Image.fromarray(uncond_llava_item_image))
247
+
248
+ # Assemble final batch dictionary
249
+ return {
250
+ "pixel_value_llava": llava_item_tensor,
251
+ "uncond_pixel_value_llava": uncond_llava_item_tensor,
252
+ "pixel_value_ref": cat_item_tensor,
253
+ "prompt": prompt,
254
+ "negative_prompt": negative_prompt,
255
+ "seed": seed,
256
+ "name": item_prompt,
257
+ 'data_name': data_name,
258
+ 'index': [idx] # Index for output tracking
259
+ }
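A hedged sketch of how `VideoCSVDataset` might be driven; `samples.csv` is a hypothetical semicolon-delimited file with at least the `prompt`, `seed` and `index` columns that `__getitem__` reads.

```python
from torch.utils.data import DataLoader

from hymm_sp.data_kits.video_dataset import VideoCSVDataset

# "samples.csv" is a hypothetical file, e.g.:
# index;prompt;seed
# 0;A knight walks through a snowy village;1024
dataset = VideoCSVDataset("samples.csv", col_name="prompt")
loader = DataLoader(dataset, batch_size=1, shuffle=False)

for batch in loader:
    print(batch["index"], batch["seed"], batch["prompt"])
```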
hymm_sp/diffusion/__init__.py ADDED
@@ -0,0 +1,30 @@
1
+ from .pipelines import HunyuanVideoGamePipeline
2
+ from .schedulers import FlowMatchDiscreteScheduler
3
+
4
+ def load_diffusion_pipeline(args, rank, vae, text_encoder, text_encoder_2, model, scheduler=None,
5
+ device=None, progress_bar_config=None):
6
+ """ Load the denoising scheduler for inference. """
7
+ if scheduler is None:
8
+ scheduler = FlowMatchDiscreteScheduler(
9
+ shift=args.flow_shift_eval_video,
10
+ reverse=args.flow_reverse,
11
+ solver=args.flow_solver,
12
+ )
13
+ # Only enable progress bar for rank 0
14
+ progress_bar_config = progress_bar_config or {'leave': True, 'disable': rank != 0}
15
+
16
+ pipeline = HunyuanVideoGamePipeline(vae=vae,
17
+ text_encoder=text_encoder,
18
+ text_encoder_2=text_encoder_2,
19
+ transformer=model,
20
+ scheduler=scheduler,
21
+ # safety_checker=None,
22
+ # feature_extractor=None,
23
+ # requires_safety_checker=False,
24
+ progress_bar_config=progress_bar_config,
25
+ args=args,
26
+ )
27
+ if not args.cpu_offload:
28
+ pipeline = pipeline.to(device)
29
+
30
+ return pipeline
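A full `load_diffusion_pipeline` call needs the VAE, text encoders and transformer built elsewhere in this repo, so only the scheduler fallback is sketched here; the `shift` value is illustrative, not a recommended setting.

```python
from hymm_sp.diffusion import FlowMatchDiscreteScheduler

# Mirrors the fallback inside load_diffusion_pipeline when no scheduler is passed.
scheduler = FlowMatchDiscreteScheduler(
    shift=7.0,       # illustrative; normally taken from args.flow_shift_eval_video
    reverse=True,    # args.flow_reverse defaults to True in config.py
    solver="euler",  # args.flow_solver default
)
print(scheduler)
```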
hymm_sp/diffusion/pipelines/__init__.py ADDED
@@ -0,0 +1,5 @@
1
+ # Import the HunyuanVideoGamePipeline class from the current package
2
+ # This pipeline is specifically designed for handling video game content generation
3
+ # using the Hunyuan model architecture, providing specialized functionality
4
+ # for game-related video synthesis, character animation, and environment rendering.
5
+ from .pipeline_hunyuan_video_game import HunyuanVideoGamePipeline
hymm_sp/diffusion/pipelines/pipeline_hunyuan_video_game.py ADDED
@@ -0,0 +1,1152 @@
1
+ # Copyright 2024 The HuggingFace Team. All rights reserved.
2
+ #
3
+ # Licensed under the Apache License, Version 2.0 (the "License");
4
+ # you may not use this file except in compliance with the License.
5
+ # You may obtain a copy of the License at
6
+ #
7
+ # http://www.apache.org/licenses/LICENSE-2.0
8
+ #
9
+ # Unless required by applicable law or agreed to in writing, software
10
+ # distributed under the License is distributed on an "AS IS" BASIS,
11
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12
+ # See the License for the specific language governing permissions and
13
+ # limitations under the License.
14
+ # ==============================================================================
15
+ #
16
+ # Modified from diffusers==0.29.2
17
+ #
18
+ # ==============================================================================
19
+ import inspect
20
+ from typing import Any, Callable, Dict, List, Optional, Union, Tuple
21
+ import numpy as np
22
+ import torch
23
+ from packaging import version
24
+ from diffusers.utils import BaseOutput
25
+ from dataclasses import dataclass
26
+ from diffusers.callbacks import MultiPipelineCallbacks, PipelineCallback
27
+ from diffusers.configuration_utils import FrozenDict
28
+ from diffusers.image_processor import PipelineImageInput, VaeImageProcessor
29
+ from diffusers.loaders import LoraLoaderMixin, TextualInversionLoaderMixin
30
+ from diffusers.models import AutoencoderKL, ImageProjection
31
+ from diffusers.models.lora import adjust_lora_scale_text_encoder
32
+ from diffusers.schedulers import KarrasDiffusionSchedulers
33
+ from diffusers.utils import (
34
+ USE_PEFT_BACKEND,
35
+ deprecate,
36
+ logging,
37
+ replace_example_docstring,
38
+ scale_lora_layers,
39
+ unscale_lora_layers,
40
+ )
41
+ from diffusers.utils.torch_utils import randn_tensor
42
+ from diffusers.pipelines.pipeline_utils import DiffusionPipeline
43
+
44
+ from hymm_sp.constants import PRECISION_TO_TYPE
45
+ from hymm_sp.vae.autoencoder_kl_causal_3d import AutoencoderKLCausal3D
46
+ from hymm_sp.text_encoder import TextEncoder
47
+ from einops import rearrange
48
+ from ...modules import HYVideoDiffusionTransformer
49
+
50
+ logger = logging.get_logger(__name__) # pylint: disable=invalid-name
51
+
52
+ EXAMPLE_DOC_STRING = """"""
53
+
54
+
55
+ def rescale_noise_cfg(noise_cfg, noise_pred_text, guidance_rescale=0.0):
56
+ """
57
+ Rescale `noise_cfg` according to `guidance_rescale`. Based on findings of [Common Diffusion Noise Schedules and
58
+ Sample Steps are Flawed](https://arxiv.org/pdf/2305.08891.pdf). See Section 3.4
59
+ """
60
+ std_text = noise_pred_text.std(dim=list(range(1, noise_pred_text.ndim)), keepdim=True)
61
+ std_cfg = noise_cfg.std(dim=list(range(1, noise_cfg.ndim)), keepdim=True)
62
+ # rescale the results from guidance (fixes overexposure)
63
+ noise_pred_rescaled = noise_cfg * (std_text / std_cfg)
64
+ # mix with the original results from guidance by factor guidance_rescale to avoid "plain looking" images
65
+ noise_cfg = guidance_rescale * noise_pred_rescaled + (1 - guidance_rescale) * noise_cfg
66
+ return noise_cfg
67
+
68
+
69
+ def retrieve_timesteps(
70
+ scheduler,
71
+ num_inference_steps: Optional[int] = None,
72
+ device: Optional[Union[str, torch.device]] = None,
73
+ timesteps: Optional[List[int]] = None,
74
+ sigmas: Optional[List[float]] = None,
75
+ **kwargs,
76
+ ):
77
+ """
78
+ Calls the scheduler's `set_timesteps` method and retrieves timesteps from the scheduler after the call. Handles
79
+ custom timesteps. Any kwargs will be supplied to `scheduler.set_timesteps`.
80
+
81
+ Args:
82
+ scheduler (`SchedulerMixin`):
83
+ The scheduler to get timesteps from.
84
+ num_inference_steps (`int`):
85
+ The number of diffusion steps used when generating samples with a pre-trained model. If used, `timesteps`
86
+ must be `None`.
87
+ device (`str` or `torch.device`, *optional*):
88
+ The device to which the timesteps should be moved to. If `None`, the timesteps are not moved.
89
+ timesteps (`List[int]`, *optional*):
90
+ Custom timesteps used to override the timestep spacing strategy of the scheduler. If `timesteps` is passed,
91
+ `num_inference_steps` and `sigmas` must be `None`.
92
+ sigmas (`List[float]`, *optional*):
93
+ Custom sigmas used to override the timestep spacing strategy of the scheduler. If `sigmas` is passed,
94
+ `num_inference_steps` and `timesteps` must be `None`.
95
+
96
+ Returns:
97
+ `Tuple[torch.Tensor, int]`: A tuple where the first element is the timestep schedule from the scheduler and the
98
+ second element is the number of inference steps.
99
+ """
100
+ if timesteps is not None and sigmas is not None:
101
+ raise ValueError("Only one of `timesteps` or `sigmas` can be passed. Please choose one to set custom values")
102
+ if timesteps is not None:
103
+ accepts_timesteps = "timesteps" in set(inspect.signature(scheduler.set_timesteps).parameters.keys())
104
+ if not accepts_timesteps:
105
+ raise ValueError(
106
+ f"The current scheduler class {scheduler.__class__}'s `set_timesteps` does not support custom"
107
+ f" timestep schedules. Please check whether you are using the correct scheduler."
108
+ )
109
+ scheduler.set_timesteps(timesteps=timesteps, device=device, **kwargs)
110
+ timesteps = scheduler.timesteps
111
+ num_inference_steps = len(timesteps)
112
+ elif sigmas is not None:
113
+ accept_sigmas = "sigmas" in set(inspect.signature(scheduler.set_timesteps).parameters.keys())
114
+ if not accept_sigmas:
115
+ raise ValueError(
116
+ f"The current scheduler class {scheduler.__class__}'s `set_timesteps` does not support custom"
117
+ f" sigmas schedules. Please check whether you are using the correct scheduler."
118
+ )
119
+ scheduler.set_timesteps(sigmas=sigmas, device=device, **kwargs)
120
+ timesteps = scheduler.timesteps
121
+ num_inference_steps = len(timesteps)
122
+ else:
123
+ scheduler.set_timesteps(num_inference_steps, device=device, **kwargs)
124
+ timesteps = scheduler.timesteps
125
+ return timesteps, num_inference_steps
126
+
127
+ @dataclass
128
+ class HunyuanVideoPipelineOutput(BaseOutput):
129
+ videos: Union[torch.Tensor, np.ndarray]
130
+
131
+
132
+ class HunyuanVideoGamePipeline(DiffusionPipeline):
133
+ r"""
134
+ Pipeline for text-to-video generation using HunyuanVideo.
135
+
136
+ This model inherits from [`DiffusionPipeline`]. Check the superclass documentation for the generic methods
137
+ implemented for all pipelines (downloading, saving, running on a particular device, etc.).
138
+
139
+ Args:
140
+ vae ([`AutoencoderKL`]):
141
+ Variational Auto-Encoder (VAE) model to encode and decode images to and from latent representations.
142
+ text_encoder ([`TextEncoder`]):
143
+ Frozen text-encoder.
144
+ text_encoder_2 ([`TextEncoder`]):
145
+ Frozen text-encoder_2.
146
+ transformer ([`HYVideoDiffusionTransformer`]):
147
+ A `HYVideoDiffusionTransformer` to denoise the encoded video latents.
148
+ scheduler ([`SchedulerMixin`]):
149
+ A scheduler to be used in combination with `unet` to denoise the encoded image latents.
150
+ """
151
+
152
+ model_cpu_offload_seq = "text_encoder->text_encoder_2->transformer->vae"
153
+ _optional_components = ["text_encoder_2"]
154
+ _exclude_from_cpu_offload = ["transformer"]
155
+ _callback_tensor_inputs = ["latents", "prompt_embeds", "negative_prompt_embeds"]
156
+
157
+ def __init__(
158
+ self,
159
+ vae: AutoencoderKL,
160
+ text_encoder: TextEncoder,
161
+ transformer: HYVideoDiffusionTransformer,
162
+ scheduler: KarrasDiffusionSchedulers,
163
+ text_encoder_2: Optional[TextEncoder] = None,
164
+ progress_bar_config: Dict[str, Any] = None,
165
+ args=None,
166
+ ):
167
+ super().__init__()
168
+
169
+ # ==========================================================================================
170
+ if progress_bar_config is None:
171
+ progress_bar_config = {}
172
+ if not hasattr(self, '_progress_bar_config'):
173
+ self._progress_bar_config = {}
174
+ self._progress_bar_config.update(progress_bar_config)
175
+
176
+ self.args = args
177
+ # ==========================================================================================
178
+
179
+ if hasattr(scheduler.config, "steps_offset") and scheduler.config.steps_offset != 1:
180
+ deprecation_message = (
181
+ f"The configuration file of this scheduler: {scheduler} is outdated. `steps_offset`"
182
+ f" should be set to 1 instead of {scheduler.config.steps_offset}. Please make sure "
183
+ "to update the config accordingly as leaving `steps_offset` might led to incorrect results"
184
+ " in future versions. If you have downloaded this checkpoint from the Hugging Face Hub,"
185
+ " it would be very nice if you could open a Pull request for the `scheduler/scheduler_config.json`"
186
+ " file"
187
+ )
188
+ deprecate("steps_offset!=1", "1.0.0", deprecation_message, standard_warn=False)
189
+ new_config = dict(scheduler.config)
190
+ new_config["steps_offset"] = 1
191
+ scheduler._internal_dict = FrozenDict(new_config)
192
+
193
+ if hasattr(scheduler.config, "clip_sample") and scheduler.config.clip_sample is True:
194
+ deprecation_message = (
195
+ f"The configuration file of this scheduler: {scheduler} has not set the configuration `clip_sample`."
196
+ " `clip_sample` should be set to False in the configuration file. Please make sure to update the"
197
+ " config accordingly as not setting `clip_sample` in the config might lead to incorrect results in"
198
+ " future versions. If you have downloaded this checkpoint from the Hugging Face Hub, it would be very"
199
+ " nice if you could open a Pull request for the `scheduler/scheduler_config.json` file"
200
+ )
201
+ deprecate("clip_sample not set", "1.0.0", deprecation_message, standard_warn=False)
202
+ new_config = dict(scheduler.config)
203
+ new_config["clip_sample"] = False
204
+ scheduler._internal_dict = FrozenDict(new_config)
205
+
206
+ self.register_modules(
207
+ vae=vae,
208
+ text_encoder=text_encoder,
209
+ transformer=transformer,
210
+ scheduler=scheduler,
211
+ text_encoder_2=text_encoder_2
212
+ )
213
+ self.vae_scale_factor = 2 ** (len(self.vae.config.block_out_channels) - 1)
214
+ self.image_processor = VaeImageProcessor(vae_scale_factor=self.vae_scale_factor)
215
+
216
+ def encode_prompt(
217
+ self,
218
+ prompt,
219
+ device,
220
+ num_videos_per_prompt,
221
+ do_classifier_free_guidance,
222
+ negative_prompt=None,
223
+ prompt_embeds: Optional[torch.Tensor] = None,
224
+ attention_mask: Optional[torch.Tensor] = None,
225
+ negative_prompt_embeds: Optional[torch.Tensor] = None,
226
+ negative_attention_mask: Optional[torch.Tensor] = None,
227
+ lora_scale: Optional[float] = None,
228
+ clip_skip: Optional[int] = None,
229
+ text_encoder: Optional[TextEncoder] = None,
230
+ data_type: Optional[str] = "image",
231
+ ):
232
+ r"""
233
+ Encodes the prompt into text encoder hidden states.
234
+
235
+ Args:
236
+ prompt (`str` or `List[str]`, *optional*):
237
+ prompt to be encoded
238
+ device: (`torch.device`):
239
+ torch device
240
+ num_videos_per_prompt (`int`):
241
+ number of images that should be generated per prompt
242
+ do_classifier_free_guidance (`bool`):
243
+ whether to use classifier free guidance or not
244
+ negative_prompt (`str` or `List[str]`, *optional*):
245
+ The prompt or prompts not to guide the image generation. If not defined, one has to pass
246
+ `negative_prompt_embeds` instead. Ignored when not using guidance (i.e., ignored if `guidance_scale` is
247
+ less than `1`).
248
+ pixel_value_llava (`torch.Tensor`, *optional*):
249
+ The image tensor for llava.
250
+ uncond_pixel_value_llava (`torch.Tensor`, *optional*):
251
+ The image tensor for llava. Ignored when not using guidance (i.e., ignored if `guidance_scale` is
252
+ less than `1`).
253
+ prompt_embeds (`torch.Tensor`, *optional*):
254
+ Pre-generated text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt weighting. If not
255
+ provided, text embeddings will be generated from `prompt` input argument.
256
+ attention_mask (`torch.Tensor`, *optional*):
257
+ negative_prompt_embeds (`torch.Tensor`, *optional*):
258
+ Pre-generated negative text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt
259
+ weighting. If not provided, negative_prompt_embeds will be generated from `negative_prompt` input
260
+ argument.
261
+ negative_attention_mask (`torch.Tensor`, *optional*):
262
+ lora_scale (`float`, *optional*):
263
+ A LoRA scale that will be applied to all LoRA layers of the text encoder if LoRA layers are loaded.
264
+ clip_skip (`int`, *optional*):
265
+ Number of layers to be skipped from CLIP while computing the prompt embeddings. A value of 1 means that
266
+ the output of the pre-final layer will be used for computing the prompt embeddings.
267
+ text_encoder (TextEncoder, *optional*):
268
+ """
269
+ if text_encoder is None:
270
+ text_encoder = self.text_encoder
271
+
272
+ # set lora scale so that monkey patched LoRA
273
+ # function of text encoder can correctly access it
274
+ if lora_scale is not None and isinstance(self, LoraLoaderMixin):
275
+ self._lora_scale = lora_scale
276
+
277
+ # dynamically adjust the LoRA scale
278
+ if not USE_PEFT_BACKEND:
279
+ adjust_lora_scale_text_encoder(text_encoder.model, lora_scale)
280
+ else:
281
+ scale_lora_layers(text_encoder.model, lora_scale)
282
+
283
+ if prompt is not None and isinstance(prompt, str):
284
+ batch_size = 1
285
+ elif prompt is not None and isinstance(prompt, list):
286
+ batch_size = len(prompt)
287
+ else:
288
+ batch_size = prompt_embeds.shape[0]
289
+
290
+ if prompt_embeds is None:
291
+ # textual inversion: process multi-vector tokens if necessary
292
+ if isinstance(self, TextualInversionLoaderMixin):
293
+ prompt = self.maybe_convert_prompt(prompt, text_encoder.tokenizer)
294
+ text_inputs = text_encoder.text2tokens(prompt, data_type=data_type)
295
+
296
+ if clip_skip is None:
297
+ prompt_outputs = text_encoder.encode(text_inputs, data_type=data_type)
298
+ prompt_embeds = prompt_outputs.hidden_state
299
+ else:
300
+ prompt_outputs = text_encoder.encode(text_inputs, output_hidden_states=True, data_type=data_type)
301
+ # Access the `hidden_states` first, that contains a tuple of
302
+ # all the hidden states from the encoder layers. Then index into
303
+ # the tuple to access the hidden states from the desired layer.
304
+ prompt_embeds = prompt_outputs.hidden_states_list[-(clip_skip + 1)]
305
+ # We also need to apply the final LayerNorm here to not mess with the
306
+ # representations. The `last_hidden_states` that we typically use for
307
+ # obtaining the final prompt representations passes through the LayerNorm
308
+ # layer.
309
+ prompt_embeds = text_encoder.model.text_model.final_layer_norm(prompt_embeds)
310
+
311
+ attention_mask = prompt_outputs.attention_mask
312
+ if attention_mask is not None:
313
+ attention_mask = attention_mask.to(device)
314
+ bs_embed, seq_len = attention_mask.shape
315
+ attention_mask = attention_mask.repeat(1, num_videos_per_prompt)
316
+ attention_mask = attention_mask.view(bs_embed * num_videos_per_prompt, seq_len)
317
+
318
+
319
+ if text_encoder is not None:
320
+ prompt_embeds_dtype = text_encoder.dtype
321
+ elif self.transformer is not None:
322
+ prompt_embeds_dtype = self.transformer.dtype
323
+ else:
324
+ prompt_embeds_dtype = prompt_embeds.dtype
325
+
326
+ prompt_embeds = prompt_embeds.to(dtype=prompt_embeds_dtype, device=device)
327
+
328
+ if prompt_embeds.ndim == 2:
329
+ bs_embed, _ = prompt_embeds.shape
330
+ # duplicate text embeddings for each generation per prompt, using mps friendly method
331
+ prompt_embeds = prompt_embeds.repeat(1, num_videos_per_prompt)
332
+ prompt_embeds = prompt_embeds.view(bs_embed * num_videos_per_prompt, -1)
333
+ else:
334
+ bs_embed, seq_len, _ = prompt_embeds.shape
335
+ # duplicate text embeddings for each generation per prompt, using mps friendly method
336
+ prompt_embeds = prompt_embeds.repeat(1, num_videos_per_prompt, 1)
337
+ prompt_embeds = prompt_embeds.view(bs_embed * num_videos_per_prompt, seq_len, -1)
338
+
339
+ # get unconditional embeddings for classifier free guidance
340
+ if do_classifier_free_guidance and negative_prompt_embeds is None:
341
+ uncond_tokens: List[str]
342
+ if negative_prompt is None:
343
+ uncond_tokens = [""] * batch_size
344
+ elif prompt is not None and type(prompt) is not type(negative_prompt):
345
+ raise TypeError(
346
+ f"`negative_prompt` should be the same type to `prompt`, but got {type(negative_prompt)} !="
347
+ f" {type(prompt)}."
348
+ )
349
+ elif isinstance(negative_prompt, str):
350
+ uncond_tokens = [negative_prompt]
351
+ elif batch_size != len(negative_prompt):
352
+ raise ValueError(
353
+ f"`negative_prompt`: {negative_prompt} has batch size {len(negative_prompt)}, but `prompt`:"
354
+ f" {prompt} has batch size {batch_size}. Please make sure that passed `negative_prompt` matches"
355
+ " the batch size of `prompt`."
356
+ )
357
+ else:
358
+ uncond_tokens = negative_prompt
359
+
360
+ # textual inversion: process multi-vector tokens if necessary
361
+ if isinstance(self, TextualInversionLoaderMixin):
362
+ uncond_tokens = self.maybe_convert_prompt(uncond_tokens, text_encoder.tokenizer)
363
+ uncond_input = text_encoder.text2tokens(uncond_tokens, data_type=data_type)
364
+
365
+ negative_prompt_outputs = text_encoder.encode(uncond_input, data_type=data_type)
366
+ negative_prompt_embeds = negative_prompt_outputs.hidden_state
367
+
368
+ negative_attention_mask = negative_prompt_outputs.attention_mask
369
+ if negative_attention_mask is not None:
370
+ negative_attention_mask = negative_attention_mask.to(device)
371
+ _, seq_len = negative_attention_mask.shape
372
+ negative_attention_mask = negative_attention_mask.repeat(1, num_videos_per_prompt)
373
+ negative_attention_mask = negative_attention_mask.view(batch_size * num_videos_per_prompt, seq_len)
374
+
375
+ if do_classifier_free_guidance:
376
+ # duplicate unconditional embeddings for each generation per prompt, using mps friendly method
377
+ seq_len = negative_prompt_embeds.shape[1]
378
+
379
+ negative_prompt_embeds = negative_prompt_embeds.to(dtype=prompt_embeds_dtype, device=device)
380
+
381
+ if negative_prompt_embeds.ndim == 2:
382
+ negative_prompt_embeds = negative_prompt_embeds.repeat(1, num_videos_per_prompt)
383
+ negative_prompt_embeds = negative_prompt_embeds.view(batch_size * num_videos_per_prompt, -1)
384
+ else:
385
+ negative_prompt_embeds = negative_prompt_embeds.repeat(1, num_videos_per_prompt, 1)
386
+ negative_prompt_embeds = negative_prompt_embeds.view(batch_size * num_videos_per_prompt, seq_len, -1)
387
+
388
+ if text_encoder is not None:
389
+ if isinstance(self, LoraLoaderMixin) and USE_PEFT_BACKEND:
390
+ # Retrieve the original scale by scaling back the LoRA layers
391
+ unscale_lora_layers(text_encoder.model, lora_scale)
392
+
393
+ return prompt_embeds, negative_prompt_embeds, attention_mask, negative_attention_mask
394
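The repeat/view pattern above is how each prompt embedding is tiled `num_videos_per_prompt` times along the batch axis. A minimal standalone sketch with hypothetical shapes (2 prompts, 3 videos per prompt, sequence length 5, hidden size 8):

import torch

prompt_embeds = torch.randn(2, 5, 8)
num_videos_per_prompt = 3
bs_embed, seq_len, dim = prompt_embeds.shape
# repeat along the sequence axis, then fold the copies back into the batch axis
tiled = prompt_embeds.repeat(1, num_videos_per_prompt, 1)
tiled = tiled.view(bs_embed * num_videos_per_prompt, seq_len, dim)
print(tiled.shape)  # torch.Size([6, 5, 8])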
+
395
+ def decode_latents(self, latents, enable_tiling=True):
396
+ deprecation_message = \
397
+ "The decode_latents method is deprecated and will be removed \
398
+ in 1.0.0. Please use VaeImageProcessor.postprocess(...) instead"
399
+ deprecate("decode_latents", "1.0.0", deprecation_message, standard_warn=False)
400
+
401
+ latents = 1 / self.vae.config.scaling_factor * latents
402
+ if enable_tiling:
403
+ self.vae.enable_tiling()
404
+ image = self.vae.decode(latents, return_dict=False)[0]
405
+ self.vae.disable_tiling()
406
+ else:
407
+ image = self.vae.decode(latents, return_dict=False)[0]
408
+ image = (image / 2 + 0.5).clamp(0, 1)
409
+ # we always cast to float32 as this does not cause significant overhead and is compatible with bfloat16
410
+ if image.ndim==4: image = image.cpu().permute(0, 2, 3, 1).float()
411
+ else: image = image.cpu().float()
412
+ return image
413
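The post-decode normalization is a simple affine map from the VAE's [-1, 1] output range to [0, 1]; a one-line check of the `(x / 2 + 0.5).clamp(0, 1)` step used above:

import torch

decoded = torch.tensor([-1.0, -0.5, 0.0, 1.0])
print((decoded / 2 + 0.5).clamp(0, 1))  # tensor([0.0000, 0.2500, 0.5000, 1.0000])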
+
414
+ def prepare_extra_func_kwargs(self, func, kwargs):
415
+ # prepare extra kwargs for the scheduler step, since not all schedulers have the same signature
416
+ # eta (η) is only used with the DDIMScheduler, it will be ignored for other schedulers.
417
+ # eta corresponds to η in DDIM paper: https://arxiv.org/abs/2010.02502
418
+ # and should be between [0, 1]
419
+ extra_step_kwargs = {}
420
+
421
+ for k, v in kwargs.items():
422
+ accepts = k in set(inspect.signature(func).parameters.keys())
423
+ if accepts:
424
+ extra_step_kwargs[k] = v
425
+ return extra_step_kwargs
426
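`prepare_extra_func_kwargs` simply drops any keyword the target callable does not accept, which is why `eta` silently disappears for schedulers without a DDIM-style signature. An illustrative sketch using a hypothetical `fake_step` function:

import inspect

def fake_step(model_output, timestep, sample, generator=None):
    # Stand-in for a scheduler step that accepts `generator` but not `eta`.
    return sample

candidate_kwargs = {"generator": None, "eta": 0.0}
accepted = set(inspect.signature(fake_step).parameters.keys())
filtered = {k: v for k, v in candidate_kwargs.items() if k in accepted}
print(filtered)  # {'generator': None} -- 'eta' is dropped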
+
427
+ def check_inputs(
428
+ self,
429
+ prompt,
430
+ height,
431
+ width,
432
+ frame,
433
+ callback_steps,
434
+ negative_prompt=None,
435
+ prompt_embeds=None,
436
+ negative_prompt_embeds=None,
437
+ callback_on_step_end_tensor_inputs=None,
438
+ vae_ver='88-4c-sd'
439
+ ):
440
+ if height % 8 != 0 or width % 8 != 0:
441
+ raise ValueError(f"`height` and `width` have to be divisible by 8 but are {height} and {width}.")
442
+
443
+ # if frame is not None:
444
+ # if '884' in vae_ver:
445
+ # if frame!=1 and (frame-1)%4!=0:
446
+ # raise ValueError(f'`frame` has to be 1 or a multiple of 4 but is {frame}.')
447
+ # elif '888' in vae_ver:
448
+ # if frame!=1 and (frame-1)%8!=0:
449
+ # raise ValueError(f'`frame` has to be 1 or a multiple of 8 but is {frame}.')
450
+
451
+ if callback_steps is not None and (not isinstance(callback_steps, int) or callback_steps <= 0):
452
+ raise ValueError(
453
+ f"`callback_steps` has to be a positive integer but is {callback_steps} of type"
454
+ f" {type(callback_steps)}."
455
+ )
456
+ if callback_on_step_end_tensor_inputs is not None and not all(
457
+ k in self._callback_tensor_inputs for k in callback_on_step_end_tensor_inputs
458
+ ):
459
+ raise ValueError(
460
+ f"`callback_on_step_end_tensor_inputs` has to be in \
461
+ {self._callback_tensor_inputs}, but found \
462
+ {[k for k in callback_on_step_end_tensor_inputs if k not in self._callback_tensor_inputs]}"
463
+ )
464
+
465
+ if prompt is not None and prompt_embeds is not None:
466
+ raise ValueError(
467
+ f"Cannot forward both `prompt`: {prompt} and `prompt_embeds`: {prompt_embeds}. Please make sure to"
468
+ " only forward one of the two."
469
+ )
470
+ elif prompt is None and prompt_embeds is None:
471
+ raise ValueError(
472
+ "Provide either `prompt` or `prompt_embeds`. Cannot leave both `prompt` and `prompt_embeds` undefined."
473
+ )
474
+ elif prompt is not None and (not isinstance(prompt, str) and not isinstance(prompt, list)):
475
+ raise ValueError(f"`prompt` has to be of type `str` or `list` but is {type(prompt)}")
476
+
477
+ if negative_prompt is not None and negative_prompt_embeds is not None:
478
+ raise ValueError(
479
+ f"Cannot forward both `negative_prompt`: {negative_prompt} and `negative_prompt_embeds`:"
480
+ f" {negative_prompt_embeds}. Please make sure to only forward one of the two."
481
+ )
482
+
483
+
484
+ if prompt_embeds is not None and negative_prompt_embeds is not None:
485
+ if prompt_embeds.shape != negative_prompt_embeds.shape:
486
+ raise ValueError(
487
+ "`prompt_embeds` and `negative_prompt_embeds` must have the same shape when passed directly, but"
488
+ f" got: `prompt_embeds` {prompt_embeds.shape} != `negative_prompt_embeds`"
489
+ f" {negative_prompt_embeds.shape}."
490
+ )
491
+
492
+ def get_timesteps(self, num_inference_steps, strength, device):
493
+ # get the original timestep using init_timestep
494
+ init_timestep = min(int(num_inference_steps * strength), num_inference_steps)
495
+
496
+ t_start = max(num_inference_steps - init_timestep, 0)
497
+ timesteps = self.scheduler.timesteps[t_start * self.scheduler.order :]
498
+ if hasattr(self.scheduler, "set_begin_index"):
499
+ self.scheduler.set_begin_index(t_start * self.scheduler.order)
500
+
501
+ return timesteps.to(device), num_inference_steps - t_start
502
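The strength-based truncation above decides how far into the schedule denoising starts; a quick worked example with hypothetical values:

num_inference_steps, strength = 50, 0.6
init_timestep = min(int(num_inference_steps * strength), num_inference_steps)
t_start = max(num_inference_steps - init_timestep, 0)
print(t_start, num_inference_steps - t_start)  # 20 30 -- only the last 30 timesteps are run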
+
503
+ def prepare_latents(self, batch_size, num_channels_latents, num_inference_steps,
504
+ height, width, frame, dtype, device, timesteps,generator,
505
+ latents=None, gt_latents=None, denoise_strength=1.0,):
506
+ shape = (
507
+ batch_size,
508
+ num_channels_latents,
509
+ frame,
510
+ int(height) // self.vae_scale_factor,
511
+ int(width) // self.vae_scale_factor,
512
+ )
513
+ if isinstance(generator, list) and len(generator) != batch_size:
514
+ raise ValueError(
515
+ f"You have passed a list of generators of length {len(generator)}, but requested an effective batch"
516
+ f" size of {batch_size}. Make sure the batch size matches the length of the generators."
517
+ )
518
+ noise = randn_tensor(shape, generator=generator, device=device, dtype=dtype)
519
+ timesteps, num_inference_steps = self.get_timesteps(num_inference_steps, denoise_strength, device)
520
+
521
+ if gt_latents.shape[2] == 1:
522
+ gt_latents = gt_latents.repeat(1, 1, frame, 1, 1)
523
+
524
+ # TODO: correct
525
+ x0 = randn_tensor(shape, generator=generator, device=device, dtype=dtype)
526
+ # print("!!!!!!!!!!!!!! RANDOM NOISE !!!!!!!!!!!!!!!!!!")
527
+ # x0 = randn_tensor(shape, device=device, dtype=dtype)
528
+ x1 = gt_latents
529
+
530
+ t = torch.tensor([0.999]).to(device=device)
531
+ latents = x0 * t + x1 * (1 - t)
532
+ latents = torch.randn_like(x1)
533
+ # print("!!!randn_like", latents.shape)
534
+ latents = latents.to(dtype=dtype)
535
+
536
+ if latents is None:
537
+ latents = noise
538
+ original_latents = None
539
+ else:
540
+ latents = latents.to(device)
541
+
542
+ if hasattr(self.scheduler, "init_noise_sigma"):
543
+ latents = latents * self.scheduler.init_noise_sigma
544
+
545
+ return latents, timesteps
546
+
547
+ # Copied from diffusers.pipelines.latent_consistency_models.
548
+ # pipeline_latent_consistency_text2img.LatentConsistencyModelPipeline.get_guidance_scale_embedding
549
+ def get_guidance_scale_embedding(
550
+ self, w: torch.Tensor, embedding_dim: int = 512, dtype: torch.dtype = torch.float32
551
+ ) -> torch.Tensor:
552
+ """
553
+ See https://github.com/google-research/vdm/blob/dc27b98a554f65cdc654b800da5aa1846545d41b/model_vdm.py#L298
554
+
555
+ Args:
556
+ w (`torch.Tensor`):
557
+ Generate embedding vectors with a specified guidance scale to subsequently enrich timestep embeddings.
558
+ embedding_dim (`int`, *optional*, defaults to 512):
559
+ Dimension of the embeddings to generate.
560
+ dtype (`torch.dtype`, *optional*, defaults to `torch.float32`):
561
+ Data type of the generated embeddings.
562
+
563
+ Returns:
564
+ `torch.Tensor`: Embedding vectors with shape `(len(w), embedding_dim)`.
565
+ """
566
+ assert len(w.shape) == 1
567
+ w = w * 1000.0
568
+
569
+ half_dim = embedding_dim // 2
570
+ emb = torch.log(torch.tensor(10000.0)) / (half_dim - 1)
571
+ emb = torch.exp(torch.arange(half_dim, dtype=dtype) * -emb)
572
+ emb = w.to(dtype)[:, None] * emb[None, :]
573
+ emb = torch.cat([torch.sin(emb), torch.cos(emb)], dim=1)
574
+ if embedding_dim % 2 == 1: # zero pad
575
+ emb = torch.nn.functional.pad(emb, (0, 1))
576
+ assert emb.shape == (w.shape[0], embedding_dim)
577
+ return emb
578
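For reference, the embedding above reduces to a standard sinusoidal encoding of the scaled guidance weight. A condensed sketch (odd-dimension padding omitted, `embedding_dim` kept small for illustration):

import torch

def guidance_embedding(w: torch.Tensor, embedding_dim: int = 8) -> torch.Tensor:
    w = w * 1000.0
    half_dim = embedding_dim // 2
    freqs = torch.exp(torch.arange(half_dim) * -(torch.log(torch.tensor(10000.0)) / (half_dim - 1)))
    emb = w[:, None] * freqs[None, :]
    return torch.cat([torch.sin(emb), torch.cos(emb)], dim=1)

print(guidance_embedding(torch.tensor([7.5])).shape)  # torch.Size([1, 8])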
+
579
+ @property
580
+ def guidance_scale(self):
581
+ return self._guidance_scale
582
+
583
+ @property
584
+ def guidance_rescale(self):
585
+ return self._guidance_rescale
586
+
587
+ @property
588
+ def clip_skip(self):
589
+ return self._clip_skip
590
+
591
+ # here `guidance_scale` is defined analogously to the guidance weight `w` of equation (2)
592
+ # of the Imagen paper: https://arxiv.org/pdf/2205.11487.pdf . `guidance_scale = 1`
593
+ # corresponds to doing no classifier free guidance.
594
+ @property
595
+ def do_classifier_free_guidance(self):
596
+ # return self._guidance_scale > 1 and self.transformer.config.time_cond_proj_dim is None
597
+ return self._guidance_scale > 1
598
+
599
+ @property
600
+ def cross_attention_kwargs(self):
601
+ return self._cross_attention_kwargs
602
+
603
+ @property
604
+ def num_timesteps(self):
605
+ return self._num_timesteps
606
+
607
+ @property
608
+ def interrupt(self):
609
+ return self._interrupt
610
+
611
+ @torch.no_grad()
612
+ @replace_example_docstring(EXAMPLE_DOC_STRING)
613
+ def __call__(
614
+ self,
615
+ prompt: Union[str, List[str]],
616
+ cam_latents: torch.Tensor,
617
+ last_latents: torch.Tensor,
618
+ uncond_cam_latents: torch.Tensor,
619
+ gt_latents: torch.Tensor,
620
+ height: int,
621
+ width: int,
622
+ video_length: int, # frame is called video_len in hunyuan_multimodal/dev_video
623
+ data_type: str='video',
624
+ num_inference_steps: int = 50,
625
+ timesteps: List[int] = None,
626
+ sigmas: List[float] = None,
627
+ guidance_scale: float = 7.5,
628
+ negative_prompt: Optional[Union[str, List[str]]] = None,
629
+ ref_latents: Optional[torch.Tensor] = None,
630
+ uncond_ref_latents: Optional[torch.Tensor] = None,
631
+ ip_cfg_scale: float = 0.0,
632
+ use_deepcache: int = 1,
633
+ num_videos_per_prompt: Optional[int] = 1,
634
+ eta: float = 0.0,
635
+ generator: Optional[Union[torch.Generator, List[torch.Generator]]] = None,
636
+ latents: Optional[torch.Tensor] = None,
637
+ prompt_embeds: Optional[torch.Tensor] = None,
638
+ attention_mask: Optional[torch.Tensor] = None,
639
+ negative_prompt_embeds: Optional[torch.Tensor] = None,
640
+ negative_attention_mask: Optional[torch.Tensor] = None,
641
+ output_type: Optional[str] = "pil",
642
+ return_dict: bool = True,
643
+ cross_attention_kwargs: Optional[Dict[str, Any]] = None,
644
+ guidance_rescale: float = 0.0,
645
+ clip_skip: Optional[int] = None,
646
+ callback_on_step_end: Optional[
647
+ Union[Callable[[int, int, Dict], None], PipelineCallback, MultiPipelineCallbacks]
648
+ ] = None,
649
+ callback_on_step_end_tensor_inputs: List[str] = ["latents"],
650
+ freqs_cis: Tuple[torch.Tensor, torch.Tensor] = None,
651
+ vae_ver: str='88-4c-sd',
652
+ enable_tiling: bool=False,
653
+ n_tokens: Optional[int] = None,
654
+ video_val_flag: bool=False,
655
+ denoise_strength: float = 1.0,
656
+ mask = None,
657
+ cpu_offload: bool=False,
658
+ use_sage: bool=False,
659
+ **kwargs,
660
+ ):
661
+ r"""
662
+ The call function to the pipeline for generation.
663
+
664
+ Args:
665
+ prompt (`str` or `List[str]`):
666
+ The prompt or prompts to guide image generation. If not defined, you need to pass `prompt_embeds`.
667
+ height (`int`):
668
+ The height in pixels of the generated image.
669
+ width (`int`):
670
+ The width in pixels of the generated image.
671
+ video_length (`int`):
672
+ The number of frames in the generated video.
673
+ num_inference_steps (`int`, *optional*, defaults to 50):
674
+ The number of denoising steps. More denoising steps usually lead to a higher quality image at the
675
+ expense of slower inference.
676
+ timesteps (`List[int]`, *optional*):
677
+ Custom timesteps to use for the denoising process with schedulers which support a `timesteps` argument
678
+ in their `set_timesteps` method. If not defined, the default behavior when `num_inference_steps` is
679
+ passed will be used. Must be in descending order.
680
+ sigmas (`List[float]`, *optional*):
681
+ Custom sigmas to use for the denoising process with schedulers which support a `sigmas` argument in
682
+ their `set_timesteps` method. If not defined, the default behavior when `num_inference_steps` is passed
683
+ will be used.
684
+ guidance_scale (`float`, *optional*, defaults to 7.5):
685
+ A higher guidance scale value encourages the model to generate images closely linked to the text
686
+ `prompt` at the expense of lower image quality. Guidance scale is enabled when `guidance_scale > 1`.
687
+ negative_prompt (`str` or `List[str]`, *optional*):
688
+ The prompt or prompts to guide what to not include in image generation. If not defined, you need to
689
+ pass `negative_prompt_embeds` instead. Ignored when not using guidance (`guidance_scale < 1`).
690
+ ref_latents (`torch.Tensor`, *optional*):
691
+ The image tensor for time-concat.
692
+ uncond_ref_latents (`torch.Tensor`, *optional*):
693
+ The image tensor for time-concat. Ignored when not using guidance (i.e., ignored if `guidance_scale` is
694
+ less than `1`).
695
+ num_videos_per_prompt (`int`, *optional*, defaults to 1):
701
+ The number of images to generate per prompt.
702
+ eta (`float`, *optional*, defaults to 0.0):
703
+ Corresponds to parameter eta (η) from the [DDIM](https://arxiv.org/abs/2010.02502) paper. Only applies
704
+ to the [`~schedulers.DDIMScheduler`], and is ignored in other schedulers.
705
+ generator (`torch.Generator` or `List[torch.Generator]`, *optional*):
706
+ A [`torch.Generator`](https://pytorch.org/docs/stable/generated/torch.Generator.html) to make
707
+ generation deterministic.
708
+ latents (`torch.Tensor`, *optional*):
709
+ Pre-generated noisy latents sampled from a Gaussian distribution, to be used as inputs for image
710
+ generation. Can be used to tweak the same generation with different prompts. If not provided, a latents
711
+ tensor is generated by sampling using the supplied random `generator`.
712
+ prompt_embeds (`torch.Tensor`, *optional*):
713
+ Pre-generated text embeddings. Can be used to easily tweak text inputs (prompt weighting). If not
714
+ provided, text embeddings are generated from the `prompt` input argument.
715
+ negative_prompt_embeds (`torch.Tensor`, *optional*):
716
+ Pre-generated negative text embeddings. Can be used to easily tweak text inputs (prompt weighting). If
717
+ not provided, `negative_prompt_embeds` are generated from the `negative_prompt` input argument.
718
+ output_type (`str`, *optional*, defaults to `"pil"`):
719
+ The output format of the generated image. Choose between `PIL.Image` or `np.array`.
720
+ return_dict (`bool`, *optional*, defaults to `True`):
721
+ Whether or not to return a [`~pipelines.stable_diffusion.StableDiffusionPipelineOutput`] instead of a
722
+ plain tuple.
723
+ cross_attention_kwargs (`dict`, *optional*):
724
+ A kwargs dictionary that if specified is passed along to the [`AttentionProcessor`] as defined in
725
+ [`self.processor`]
726
+ (https://github.com/huggingface/diffusers/blob/main/src/diffusers/models/attention_processor.py).
727
+ guidance_rescale (`float`, *optional*, defaults to 0.0):
728
+ Guidance rescale factor from [Common Diffusion Noise Schedules and Sample Steps are
729
+ Flawed](https://arxiv.org/pdf/2305.08891.pdf). Guidance rescale factor should fix overexposure when
730
+ using zero terminal SNR.
731
+ clip_skip (`int`, *optional*):
732
+ Number of layers to be skipped from CLIP while computing the prompt embeddings. A value of 1 means that
733
+ the output of the pre-final layer will be used for computing the prompt embeddings.
734
+ callback_on_step_end (`Callable`, `PipelineCallback`, `MultiPipelineCallbacks`, *optional*):
735
+ A function or a subclass of `PipelineCallback` or `MultiPipelineCallbacks` that is called at the end of
736
+ each denoising step during the inference. with the following arguments: `callback_on_step_end(self:
737
+ DiffusionPipeline, step: int, timestep: int, callback_kwargs: Dict)`. `callback_kwargs` will include a
738
+ list of all tensors as specified by `callback_on_step_end_tensor_inputs`.
739
+ callback_on_step_end_tensor_inputs (`List`, *optional*):
740
+ The list of tensor inputs for the `callback_on_step_end` function. The tensors specified in the list
741
+ will be passed as `callback_kwargs` argument. You will only be able to include variables listed in the
742
+ `._callback_tensor_inputs` attribute of your pipeline class.
743
+
744
+ Examples:
745
+
746
+ Returns:
747
+ [`~HunyuanVideoPipelineOutput`] or `tuple`:
748
+ If `return_dict` is `True`, [`HunyuanVideoPipelineOutput`] is returned,
749
+ otherwise a list with the generated images is returned.
750
+ """
751
+ callback = kwargs.pop("callback", None)
752
+ callback_steps = kwargs.pop("callback_steps", None)
753
+ if callback is not None:
754
+ deprecate(
755
+ "callback",
756
+ "1.0.0",
757
+ "Passing `callback` as an input argument to \
758
+ `__call__` is deprecated, consider using `callback_on_step_end`",
759
+ )
760
+ if callback_steps is not None:
761
+ deprecate(
762
+ "callback_steps",
763
+ "1.0.0",
764
+ "Passing `callback_steps` as an input argument to \
765
+ `__call__` is deprecated, consider using `callback_on_step_end`",
766
+ )
767
+
768
+ if isinstance(callback_on_step_end, (PipelineCallback, MultiPipelineCallbacks)):
769
+ callback_on_step_end_tensor_inputs = callback_on_step_end.tensor_inputs
770
+
771
+ # 0. Default height and width to transformer
772
+ # height = height or self.transformer.config.sample_size * self.vae_scale_factor
773
+ # width = width or self.transformer.config.sample_size * self.vae_scale_factor
774
+ # to deal with lora scaling and other possible forward hooks
775
+
776
+ # 1. Check inputs. Raise error if not correct
777
+ self.check_inputs(
778
+ prompt,
779
+ height,
780
+ width,
781
+ video_length,
782
+ callback_steps,
783
+ negative_prompt,
784
+ prompt_embeds,
785
+ negative_prompt_embeds,
786
+ callback_on_step_end_tensor_inputs,
787
+ vae_ver=vae_ver
788
+ )
789
+
790
+ self._guidance_scale = guidance_scale
791
+ self._guidance_rescale = guidance_rescale
792
+ self._clip_skip = clip_skip
793
+ self._cross_attention_kwargs = cross_attention_kwargs
794
+ self._interrupt = False
795
+
796
+ # 2. Define call parameters
797
+ if prompt is not None and isinstance(prompt, str):
798
+ batch_size = 1
799
+ elif prompt is not None and isinstance(prompt, list):
800
+ batch_size = len(prompt)
801
+ else:
802
+ batch_size = prompt_embeds.shape[0]
803
+
804
+ # device = self._execution_device
805
+ device = torch.device("cuda")
806
+
807
+ # 3. Encode input prompt
808
+ lora_scale = (
809
+ self.cross_attention_kwargs.get("scale", None) if self.cross_attention_kwargs is not None else None
810
+ )
811
+
812
+ prompt_embeds, negative_prompt_embeds, prompt_mask, negative_prompt_mask = \
813
+ self.encode_prompt(
814
+ prompt,
815
+ device,
816
+ num_videos_per_prompt,
817
+ self.do_classifier_free_guidance,
818
+ negative_prompt,
819
+ prompt_embeds=prompt_embeds,
820
+ attention_mask=attention_mask,
821
+ negative_prompt_embeds=negative_prompt_embeds,
822
+ negative_attention_mask=negative_attention_mask,
823
+ lora_scale=lora_scale,
824
+ clip_skip=self.clip_skip,
825
+ data_type=data_type
826
+ )
827
+
828
+ if self.text_encoder_2 is not None:
829
+ prompt_embeds_2, negative_prompt_embeds_2, prompt_mask_2, negative_prompt_mask_2 = \
830
+ self.encode_prompt(
831
+ prompt,
832
+ device,
833
+ num_videos_per_prompt,
834
+ self.do_classifier_free_guidance,
835
+ negative_prompt,
836
+ prompt_embeds=None,
837
+ attention_mask=None,
838
+ negative_prompt_embeds=None,
839
+ negative_attention_mask=None,
840
+ lora_scale=lora_scale,
841
+ clip_skip=self.clip_skip,
842
+ text_encoder=self.text_encoder_2,
843
+ )
844
+ else:
845
+ prompt_embeds_2 = None
846
+ negative_prompt_embeds_2 = None
847
+ prompt_mask_2 = None
848
+ negative_prompt_mask_2 = None
849
+
850
+ # For classifier free guidance, we need to do two forward passes.
851
+ # Here we concatenate the unconditional and text embeddings into a single batch
852
+ # to avoid doing two forward passes
853
+ if self.do_classifier_free_guidance:
854
+ prompt_embeds = torch.cat([negative_prompt_embeds, prompt_embeds])
855
+ if prompt_mask is not None:
856
+ prompt_mask = torch.cat([negative_prompt_mask, prompt_mask])
857
+ if prompt_embeds_2 is not None:
858
+ prompt_embeds_2 = torch.cat([negative_prompt_embeds_2, prompt_embeds_2])
859
+ if prompt_mask_2 is not None:
860
+ prompt_mask_2 = torch.cat([negative_prompt_mask_2, prompt_mask_2])
861
+
862
+ if self.do_classifier_free_guidance:
863
+ if ref_latents is not None:
864
+ ref_latents = torch.cat([ref_latents, ref_latents], dim=0)
865
+ if prompt_mask[0].sum() > 575:
866
+ prompt_mask[0] = torch.cat(
867
+ [torch.ones((1, prompt_mask[0].sum() - 575)).to(prompt_mask),
868
+ torch.zeros((1, prompt_mask.shape[1] - prompt_mask[0].sum() + 575)).to(prompt_mask)], dim=1)
869
+
870
+ if ip_cfg_scale>0:
871
+ prompt_embeds = torch.cat([prompt_embeds, prompt_embeds[1:]])
872
+ prompt_embeds_2 = torch.cat([prompt_embeds_2, prompt_embeds_2[1:]])
873
+ prompt_mask = torch.cat([prompt_mask, prompt_mask[1:]], dim=0)
874
+ ref_latents = torch.cat([uncond_ref_latents, uncond_ref_latents, ref_latents[1:]], dim=0)
875
+
876
+ # 4. Prepare timesteps
877
+ extra_set_timesteps_kwargs = self.prepare_extra_func_kwargs(
878
+ self.scheduler.set_timesteps, {"n_tokens": n_tokens}
879
+ )
880
+ timesteps, num_inference_steps = retrieve_timesteps(
881
+ self.scheduler, num_inference_steps, device, timesteps, sigmas, **extra_set_timesteps_kwargs,
882
+ )
883
+
884
+
885
+ if '884' in vae_ver:
886
+ frame_length = (video_length - 2) // 4 + 2
887
+ elif '888' in vae_ver:
888
+ frame_length = (video_length - 1) // 8 + 1
889
+ else:
890
+ frame_length = video_length
891
+
892
+ # 5. Prepare latent variables
893
+ num_channels_latents = self.transformer.config.in_channels
894
+ latents, timesteps = self.prepare_latents(
895
+ batch_size * num_videos_per_prompt,
896
+ num_channels_latents,
897
+ num_inference_steps,
898
+ height,
899
+ width,
900
+ frame_length,
901
+ prompt_embeds.dtype,
902
+ device,
903
+ timesteps,
904
+ generator,
905
+ latents,
906
+ gt_latents,
907
+ denoise_strength,
908
+ )
909
+
910
+ gt_latents = gt_latents.repeat(1, 1, frame_length, 1, 1)
911
+ gt_latents_concat = gt_latents.clone()
912
+
913
+ if frame_length == 10:
914
+ gt_latents_concat[:,:,1:,:,:] = 0.0
915
+ mask_concat = torch.ones(gt_latents.shape[0],
916
+ 1,
917
+ gt_latents.shape[2],
918
+ gt_latents.shape[3],
919
+ gt_latents.shape[4]).to(device=gt_latents.device)
920
+ mask_concat[:, :, 1:,...] = 0.0
921
+ else:
922
+ gt_latents_concat[:,:,gt_latents_concat.shape[2]//2:,:,:] = 0.0
923
+ mask_zeros = torch.zeros(gt_latents.shape[0],
924
+ 1,
925
+ gt_latents.shape[2]//2,
926
+ gt_latents.shape[3],
927
+ gt_latents.shape[4])
928
+ mask_ones = torch.ones(gt_latents.shape[0],
929
+ 1,
930
+ gt_latents.shape[2]//2,
931
+ gt_latents.shape[3],
932
+ gt_latents.shape[4])
933
+ mask_concat = torch.cat([mask_ones, mask_zeros], dim=2).to(device=gt_latents.device)
934
+
935
+ # 6. Prepare extra step kwargs.
936
+ extra_step_kwargs = self.prepare_extra_func_kwargs(
937
+ self.scheduler.step, {"generator": generator, "eta": eta},
938
+ )
939
+
940
+ target_dtype = PRECISION_TO_TYPE[self.args.precision]
941
+ autocast_enabled = (target_dtype != torch.float32) and not self.args.val_disable_autocast
942
+ vae_dtype = PRECISION_TO_TYPE[self.args.vae_precision]
943
+ vae_autocast_enabled = (vae_dtype != torch.float32) and not self.args.val_disable_autocast
944
+
945
+ # 7. Denoising loop
946
+ num_warmup_steps = len(timesteps) - num_inference_steps * self.scheduler.order
947
+ self._num_timesteps = len(timesteps)
948
+
949
+ start_scale = ip_cfg_scale # 3.0
950
+ end_scale = 1.0
951
+ step_scale = (start_scale - end_scale) / (self._num_timesteps - 1 + 1e-3)
952
+ if cpu_offload: torch.cuda.empty_cache()
953
+ with self.progress_bar(total=num_inference_steps) as progress_bar:
954
+ for i, t in enumerate(timesteps):
955
+ if self.interrupt:
956
+ continue
957
+
958
+ if last_latents.shape[2] == 1:
959
+ latents[:,:,0,:,:] = last_latents[:,:,-1,:,:]
960
+ else:
961
+ latents[:,:,:latents.shape[2]//2,:,:] = last_latents
962
+ gt_latents_concat[:,:,:latents.shape[2]//2,:,:] = last_latents
963
+
964
+ # expand the latents if we are doing classifier free guidance
965
+ latents_concat = torch.concat([latents, gt_latents_concat, mask_concat], dim=1)
966
+ latent_model_input = torch.cat([latents_concat] * 2) \
967
+ if self.do_classifier_free_guidance else latents_concat
968
+
969
+ latent_model_input = self.scheduler.scale_model_input(latent_model_input, t)
970
+ t_expand = t.repeat(latent_model_input.shape[0])
971
+
972
+ guidance_expand = None
974
+
975
+ cam_latents_ = torch.cat([uncond_cam_latents, cam_latents], dim=0) \
976
+ if self.do_classifier_free_guidance else cam_latents
977
+
978
+ # predict the noise residual
979
+ with torch.autocast(device_type="cuda", dtype=target_dtype, enabled=autocast_enabled):
980
+ is_cache = False
981
+ if use_deepcache and num_inference_steps==50:
982
+
983
+ no_cache_steps = list(range(0, 10)) + list(range(10, 40, 2)) + list(range(40, 50))
984
+ if i in no_cache_steps:
985
+ is_cache = False
986
+ else:
987
+ is_cache = True
988
+ if latent_model_input.shape[-1]*latent_model_input.shape[-2]>64*112 and cpu_offload:
989
+ if i==0:
990
+ print(f'cpu_offload={cpu_offload} and {latent_model_input.shape[-2:]} is large; '
991
+ f'splitting the noise prediction into two forward passes')
992
+ noise_pred_uncond = self.transformer(latent_model_input[:1],
993
+ t_expand[:1],
994
+ text_states=prompt_embeds[:1],
995
+ text_mask=prompt_mask[:1],
996
+ text_states_2=prompt_embeds_2[:1],
997
+ freqs_cos=freqs_cis[0],
998
+ freqs_sin=freqs_cis[1],
999
+ guidance=guidance_expand,
1000
+ return_dict=True,
1001
+ is_cache=is_cache,
1002
+ cam_latents=cam_latents_[:1])['x']
1003
+ torch.cuda.empty_cache()
1004
+ noise_pred_text = self.transformer(latent_model_input[1:],
1005
+ t_expand[1:],
1006
+ text_states=prompt_embeds[1:],
1007
+ text_mask=prompt_mask[1:],
1008
+ text_states_2=prompt_embeds_2[1:],
1009
+ freqs_cos=freqs_cis[0],
1010
+ freqs_sin=freqs_cis[1],
1011
+ guidance=guidance_expand,
1012
+ return_dict=True,
1013
+ is_cache=is_cache,
1014
+ cam_latents=cam_latents_[1:])['x']
1015
+ noise_pred = torch.cat([noise_pred_uncond, noise_pred_text], dim=0)
1016
+ torch.cuda.empty_cache()
1017
+ else:
1018
+ noise_pred = self.transformer( # For an input image (1, 256, 256)
1019
+ latent_model_input, # [2, 16, 1, 32, 32] #
1020
+ t_expand, # [2]
1021
+ text_states=prompt_embeds, # [2, 256, 4096]
1022
+ text_mask=prompt_mask, # [2, 256]
1023
+ text_states_2=prompt_embeds_2, # [2, 768]
1024
+ freqs_cos=freqs_cis[0], # [seqlen, head_dim]
1025
+ freqs_sin=freqs_cis[1], # [seqlen, head_dim]
1026
+ guidance=guidance_expand,
1027
+ return_dict=True,
1028
+ is_cache=is_cache,
1029
+ cam_latents=cam_latents_,
1030
+ use_sage=use_sage,
1031
+ )['x']
1032
+
1033
+ # perform guidance
1034
+ if self.do_classifier_free_guidance and ip_cfg_scale < 0.1:
1035
+ noise_pred_uncond, noise_pred_text = noise_pred.chunk(2)
1036
+ noise_pred = noise_pred_uncond + self.guidance_scale * (noise_pred_text - noise_pred_uncond)
1037
+
1038
+ if ip_cfg_scale > 0:
1039
+ noise_pred_uncond, noise_pred_text, noise_pred_ip = noise_pred.chunk(3)
1040
+ noise_pred = noise_pred_uncond + self.guidance_scale * \
1041
+ (noise_pred_text - noise_pred_uncond) + start_scale * (noise_pred_ip-noise_pred_text)
1042
+ start_scale -= step_scale
1043
+ if i==0:
1044
+ print(f'i={i}, noise_pred shape={noise_pred.shape}')
1045
+
1046
+ if self.do_classifier_free_guidance and self.guidance_rescale > 0.0:
1047
+ # Based on 3.4. in https://arxiv.org/pdf/2305.08891.pdf
1048
+ noise_pred = rescale_noise_cfg(noise_pred,
1049
+ noise_pred_text,
1050
+ guidance_rescale=self.guidance_rescale)
1051
+
1052
+ # compute the previous noisy sample x_t -> x_t-1
1053
+ # latents = self.scheduler.step(noise_pred, t, latents, **extra_step_kwargs, return_dict=False)[0]
1054
+ if last_latents.shape[2] == 1:
1055
+ latents[:,:,1:,:,:] = self.scheduler.step(noise_pred[:,:,1:,:,:],
1056
+ t,
1057
+ latents[:,:,1:,:,:],
1058
+ **extra_step_kwargs,
1059
+ return_dict=False)[0]
1060
+ else:
1061
+ latents[:,:,noise_pred.shape[2]//2:,:,:] = self.scheduler.step(
1062
+ noise_pred[:,:,noise_pred.shape[2]//2:,:,:],
1063
+ t,
1064
+ latents[:,:,latents.shape[2]//2:,:,:],
1065
+ **extra_step_kwargs, return_dict=False)[0]
1066
+
1067
+
1068
+ if callback_on_step_end is not None:
1069
+ callback_kwargs = {}
1070
+ for k in callback_on_step_end_tensor_inputs:
1071
+ callback_kwargs[k] = locals()[k]
1072
+ callback_outputs = callback_on_step_end(self, i, t, callback_kwargs)
1073
+
1074
+ latents = callback_outputs.pop("latents", latents)
1075
+ prompt_embeds = callback_outputs.pop("prompt_embeds", prompt_embeds)
1076
+ negative_prompt_embeds = callback_outputs.pop("negative_prompt_embeds", negative_prompt_embeds)
1077
+
1078
+ # call the callback, if provided
1079
+ if i == len(timesteps) - 1 or ((i + 1) > num_warmup_steps and (i + 1) % self.scheduler.order == 0):
1080
+ if progress_bar is not None:
1081
+ progress_bar.update()
1082
+ if callback is not None and i % callback_steps == 0:
1083
+ step_idx = i // getattr(self.scheduler, "order", 1)
1084
+ callback(step_idx, t, latents)
1085
+
1086
+ if cpu_offload: torch.cuda.empty_cache()
1087
+ # if mask_latents is not None:
1088
+ # latents = mask_latents * latents + (1 - mask_latents) * original_latents
1089
+ if last_latents.shape[2] == 1:
1090
+ latents = latents[:,:,1:,:,:]
1091
+
1092
+ if not output_type == "latent":
1093
+ expand_temporal_dim = False
1094
+ if len(latents.shape) == 4:
1095
+ if isinstance(self.vae, AutoencoderKLCausal3D):
1096
+ latents = latents.unsqueeze(2)
1097
+ expand_temporal_dim = True
1098
+ elif len(latents.shape) == 5:
1099
+ pass
1100
+ else:
1101
+ raise ValueError(
1102
+ f"Only support latents with shape (b, c, h, w) or (b, c, f, h, w), but got {latents.shape}.")
1103
+
1104
+ if not last_latents.shape[2] == 1:
1105
+ last_latents = latents[:,:,latents.shape[2]//2:,:,:]
1106
+ else:
1107
+ last_latents = latents
1108
+ latent_decode = last_latents.clone()
1109
+ latent_decode = latent_decode / self.vae.config.scaling_factor
1110
+
1111
+ with torch.autocast(device_type="cuda", dtype=vae_dtype, enabled=vae_autocast_enabled):
1112
+ if enable_tiling:
1113
+ self.vae.enable_tiling()
1114
+ if cpu_offload:
1115
+ self.vae.post_quant_conv.to('cuda')
1116
+ self.vae.decoder.to('cuda')
1117
+ image = self.vae.decode(latent_decode, return_dict=False, generator=generator)[0]
1118
+ self.vae.disable_tiling()
1119
+ if cpu_offload:
1120
+ self.vae.post_quant_conv.to('cpu')
1121
+ self.vae.decoder.to('cpu')
1122
+ torch.cuda.empty_cache()
1123
+ else:
1124
+ image = self.vae.decode(latent_decode, return_dict=False, generator=generator)[0]
1125
+ # if image is None:
1126
+ # return (None, )
1127
+
1128
+ # if expand_temporal_dim or (not video_val_flag and image.shape[2] == 1):
1129
+ # image = image.squeeze(2)
1130
+
1131
+ if image is not None and (expand_temporal_dim or (not video_val_flag and image.shape[2] == 1)):
1132
+ image = image.squeeze(2)
1133
+
1134
+ if image is not None:
1135
+ image = (image / 2 + 0.5).clamp(0, 1)
1136
+ # we always cast to float32 as this does not cause significant overhead and is compatible with bfloat16
1137
+ image = image.cpu().float()
1138
+
1139
+ # Offload all models
1140
+ self.maybe_free_model_hooks()
1141
+
1142
+ if cpu_offload: torch.cuda.empty_cache()
1143
+ if not return_dict:
1144
+ return image
1145
+
1146
+ return_latents = kwargs.get("return_latents", False)
1147
+
1148
+ if return_latents:
1149
+ return HunyuanVideoPipelineOutput(videos=image), \
1150
+ latents, timesteps, last_latents, last_latents[:,:,-1:, ...]
1151
+
1152
+ return HunyuanVideoPipelineOutput(videos=image)
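As a reference for the denoising loop above, the classifier-free-guidance combination used when `ip_cfg_scale` is zero boils down to a single linear blend of the unconditional and text-conditioned predictions (shapes below are arbitrary placeholders):

import torch

guidance_scale = 7.5
noise_pred = torch.randn(2, 16, 8, 4, 4)  # [uncond, text] stacked along the batch axis
noise_pred_uncond, noise_pred_text = noise_pred.chunk(2)
guided = noise_pred_uncond + guidance_scale * (noise_pred_text - noise_pred_uncond)
print(guided.shape)  # torch.Size([1, 16, 8, 4, 4])

When `ip_cfg_scale > 0`, a third chunk is added and blended with a scale that is annealed from `ip_cfg_scale` down to 1 over the course of sampling, as the loop shows.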
hymm_sp/diffusion/schedulers/__init__.py ADDED
@@ -0,0 +1,2 @@
1
+ from .scheduling_flow_match_discrete import FlowMatchDiscreteScheduler
hymm_sp/diffusion/schedulers/scheduling_flow_match_discrete.py ADDED
@@ -0,0 +1,240 @@
1
+ # Copyright 2024 Stability AI, Katherine Crowson and The HuggingFace Team. All rights reserved.
2
+ #
3
+ # Licensed under the Apache License, Version 2.0 (the "License");
4
+ # you may not use this file except in compliance with the License.
5
+ # You may obtain a copy of the License at
6
+ #
7
+ # http://www.apache.org/licenses/LICENSE-2.0
8
+ #
9
+ # Unless required by applicable law or agreed to in writing, software
10
+ # distributed under the License is distributed on an "AS IS" BASIS,
11
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12
+ # See the License for the specific language governing permissions and
13
+ # limitations under the License.
14
+ # ==============================================================================
15
+ #
16
+ # Modified from diffusers==0.29.2
17
+ #
18
+ # ==============================================================================
19
+
20
+ from dataclasses import dataclass
21
+ from typing import Optional, Tuple, Union
22
+
23
+ import torch
24
+
25
+ from diffusers.configuration_utils import ConfigMixin, register_to_config
26
+ from diffusers.utils import BaseOutput, logging
27
+ from diffusers.schedulers.scheduling_utils import SchedulerMixin
28
+
29
+
30
+ logger = logging.get_logger(__name__) # pylint: disable=invalid-name
31
+
32
+
33
+ @dataclass
34
+ class FlowMatchDiscreteSchedulerOutput(BaseOutput):
35
+ """
36
+ Output class for the scheduler's `step` function output.
37
+
38
+ Args:
39
+ prev_sample (`torch.FloatTensor` of shape `(batch_size, num_channels, height, width)` for images):
40
+ Computed sample `(x_{t-1})` of previous timestep. `prev_sample` should be used as next model input in the
41
+ denoising loop.
42
+ """
43
+
44
+ prev_sample: torch.FloatTensor
45
+
46
+
47
+ class FlowMatchDiscreteScheduler(SchedulerMixin, ConfigMixin):
48
+ """
49
+ Euler scheduler for discrete flow matching.
50
+
51
+ This model inherits from [`SchedulerMixin`] and [`ConfigMixin`]. Check the superclass documentation for the generic
52
+ methods the library implements for all schedulers such as loading and saving.
53
+
54
+ Args:
55
+ num_train_timesteps (`int`, defaults to 1000):
56
+ The number of diffusion steps to train the model.
57
+ timestep_spacing (`str`, defaults to `"linspace"`):
58
+ The way the timesteps should be scaled. Refer to Table 2 of the [Common Diffusion Noise Schedules and
59
+ Sample Steps are Flawed](https://huggingface.co/papers/2305.08891) for more information.
60
+ shift (`float`, defaults to 1.0):
61
+ The shift value for the timestep schedule.
62
+ reverse (`bool`, defaults to `True`):
63
+ Whether to reverse the timestep schedule.
64
+ """
65
+
66
+ _compatibles = []
67
+ order = 1
68
+
69
+ @register_to_config
70
+ def __init__(
71
+ self,
72
+ num_train_timesteps: int = 1000,
73
+ shift: float = 1.0,
74
+ reverse: bool = True,
75
+ solver: str = "euler",
76
+ n_tokens: Optional[int] = None,
77
+ ):
78
+ sigmas = torch.linspace(1, 0, num_train_timesteps + 1)
79
+
80
+ if not reverse:
81
+ sigmas = sigmas.flip(0)
82
+
83
+ self.sigmas = sigmas
84
+ # the value fed to model
85
+ self.timesteps = (sigmas[:-1] * num_train_timesteps).to(dtype=torch.float32)
86
+
87
+ self._step_index = None
88
+ self._begin_index = None
89
+
90
+ self.supported_solver = ["euler"]
91
+ if solver not in self.supported_solver:
92
+ raise ValueError(f"Solver {solver} not supported. Supported solvers: {self.supported_solver}")
93
+
94
+ @property
95
+ def step_index(self):
96
+ """
97
+ The index counter for the current timestep. It increases by 1 after each scheduler step.
98
+ """
99
+ return self._step_index
100
+
101
+ @property
102
+ def begin_index(self):
103
+ """
104
+ The index for the first timestep. It should be set from pipeline with `set_begin_index` method.
105
+ """
106
+ return self._begin_index
107
+
108
+ # Copied from diffusers.schedulers.scheduling_dpmsolver_multistep.DPMSolverMultistepScheduler.set_begin_index
109
+ def set_begin_index(self, begin_index: int = 0):
110
+ """
111
+ Sets the begin index for the scheduler. This function should be run from pipeline before the inference.
112
+
113
+ Args:
114
+ begin_index (`int`):
115
+ The begin index for the scheduler.
116
+ """
117
+ self._begin_index = begin_index
118
+
119
+ def _sigma_to_t(self, sigma):
120
+ return sigma * self.config.num_train_timesteps
121
+
122
+ def set_timesteps(self, num_inference_steps: int, device: Union[str, torch.device] = None,
123
+ n_tokens: int = None):
124
+ """
125
+ Sets the discrete timesteps used for the diffusion chain (to be run before inference).
126
+
127
+ Args:
128
+ num_inference_steps (`int`):
129
+ The number of diffusion steps used when generating samples with a pre-trained model.
130
+ device (`str` or `torch.device`, *optional*):
131
+ The device to which the timesteps should be moved. If `None`, the timesteps are not moved.
132
+ n_tokens (`int`, *optional*):
133
+ Number of tokens in the input sequence.
134
+ """
135
+ self.num_inference_steps = num_inference_steps
136
+
137
+ sigmas = torch.linspace(1, 0, num_inference_steps + 1)
138
+ sigmas = self.sd3_time_shift(sigmas)
139
+
140
+ if not self.config.reverse:
141
+ sigmas = 1 - sigmas
142
+
143
+ self.sigmas = sigmas
144
+ self.timesteps = (sigmas[:-1] * self.config.num_train_timesteps).to(dtype=torch.float32, device=device)
145
+
146
+ # Reset step index
147
+ self._step_index = None
148
+
149
+ def index_for_timestep(self, timestep, schedule_timesteps=None):
150
+ if schedule_timesteps is None:
151
+ schedule_timesteps = self.timesteps
152
+
153
+ indices = (schedule_timesteps == timestep).nonzero()
154
+
155
+ # The sigma index that is taken for the **very** first `step`
156
+ # is always the second index (or the last index if there is only 1)
157
+ # This way we can ensure we don't accidentally skip a sigma in
158
+ # case we start in the middle of the denoising schedule (e.g. for image-to-image)
159
+ pos = 1 if len(indices) > 1 else 0
160
+
161
+ return indices[pos].item()
162
+
163
+ def _init_step_index(self, timestep):
164
+ if self.begin_index is None:
165
+ if isinstance(timestep, torch.Tensor):
166
+ timestep = timestep.to(self.timesteps.device)
167
+ self._step_index = self.index_for_timestep(timestep)
168
+ else:
169
+ self._step_index = self._begin_index
170
+
171
+ def scale_model_input(self, sample: torch.Tensor, timestep: Optional[int] = None) -> torch.Tensor:
172
+ return sample
173
+
174
+ def sd3_time_shift(self, t: torch.Tensor):
175
+ return (self.config.shift * t) / (1 + (self.config.shift - 1) * t)
176
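The shift is the SD3-style time warp shift * t / (1 + (shift - 1) * t); a shift greater than 1 keeps sigmas higher for longer, i.e. more steps are spent at high noise levels. A quick check with a hypothetical shift of 3:

import torch

shift = 3.0
t = torch.linspace(1, 0, 5)
print((shift * t) / (1 + (shift - 1) * t))  # tensor([1.0000, 0.9000, 0.7500, 0.5000, 0.0000])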
+
177
+ def step(
178
+ self,
179
+ model_output: torch.FloatTensor,
180
+ timestep: Union[float, torch.FloatTensor],
181
+ sample: torch.FloatTensor,
182
+ return_dict: bool = True,
183
+ ) -> Union[FlowMatchDiscreteSchedulerOutput, Tuple]:
184
+ """
185
+ Predict the sample from the previous timestep by reversing the SDE. This function propagates the diffusion
186
+ process from the learned model outputs (most often the predicted noise).
187
+
188
+ Args:
189
+ model_output (`torch.FloatTensor`):
190
+ The direct output from learned diffusion model.
191
+ timestep (`float`):
192
+ The current discrete timestep in the diffusion chain.
193
+ sample (`torch.FloatTensor`):
194
+ A current instance of a sample created by the diffusion process.
195
+ return_dict (`bool`):
196
+ Whether or not to return a [`FlowMatchDiscreteSchedulerOutput`] or a plain
197
+ tuple.
198
+
199
+ Returns:
200
+ [`FlowMatchDiscreteSchedulerOutput`] or `tuple`:
201
+ If return_dict is `True`, [`FlowMatchDiscreteSchedulerOutput`] is
202
+ returned, otherwise a tuple is returned where the first element is the sample tensor.
203
+ """
204
+
205
+ if (
206
+ isinstance(timestep, int)
207
+ or isinstance(timestep, torch.IntTensor)
208
+ or isinstance(timestep, torch.LongTensor)
209
+ ):
210
+ raise ValueError(
211
+ (
212
+ "Passing integer indices (e.g. from `enumerate(timesteps)`) as timesteps to"
213
+ " `FlowMatchDiscreteScheduler.step()` is not supported. Make sure to pass"
214
+ " one of the `scheduler.timesteps` as a timestep."
215
+ ),
216
+ )
217
+
218
+ if self.step_index is None:
219
+ self._init_step_index(timestep)
220
+
221
+ # Upcast to avoid precision issues when computing prev_sample
222
+ sample = sample.to(torch.float32)
223
+
224
+ dt = self.sigmas[self.step_index + 1] - self.sigmas[self.step_index]
225
+
226
+ if self.config.solver == "euler":
227
+ prev_sample = sample + model_output.float() * dt
228
+ else:
229
+ raise ValueError(f"Solver {self.config.solver} not supported. Supported solvers: {self.supported_solver}")
230
+
231
+ # upon completion increase step index by one
232
+ self._step_index += 1
233
+
234
+ if not return_dict:
235
+ return (prev_sample,)
236
+
237
+ return FlowMatchDiscreteSchedulerOutput(prev_sample=prev_sample)
238
+
239
+ def __len__(self):
240
+ return self.config.num_train_timesteps
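Taken together, `set_timesteps` and `step` implement a plain Euler integration of the flow-matching ODE: each update moves the sample along the predicted velocity by the sigma decrement. A minimal sketch of one update, with a random tensor standing in for the transformer output:

import torch

sigmas = torch.tensor([1.0, 0.75, 0.5, 0.25, 0.0])    # illustrative 4-step schedule (shift = 1)
step_index = 0
sample = torch.randn(1, 16, 8, 4, 4)
model_output = torch.randn_like(sample)                # stand-in for the model's velocity prediction
dt = sigmas[step_index + 1] - sigmas[step_index]       # negative: sigma decreases toward 0
prev_sample = sample + model_output * dt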
hymm_sp/helpers.py ADDED
@@ -0,0 +1,194 @@
1
+ import torch
2
+ from typing import Union, List
3
+ from hymm_sp.modules.posemb_layers import get_1d_rotary_pos_embed, get_meshgrid_nd
4
+
5
+ from itertools import repeat
6
+ import collections.abc
7
+
8
+
9
+ def _ntuple(n):
10
+ """
11
+ Creates a helper function to convert inputs to tuples of specified length.
12
+
13
+ Converts iterable inputs (excluding strings) to tuples of length n,
14
+ or repeats single values n times to form a tuple. Useful for handling
15
+ multi-dimensional parameters like sizes and strides.
16
+
17
+ Args:
18
+ n (int): Target length of the tuple
19
+
20
+ Returns:
21
+ function: Parser function that converts inputs to n-length tuples
22
+ """
23
+ def parse(x):
24
+ if isinstance(x, collections.abc.Iterable) and not isinstance(x, str):
25
+ x = tuple(x)
26
+ if len(x) == 1:
27
+ x = tuple(repeat(x[0], n))
28
+ return x
29
+ return tuple(repeat(x, n))
30
+ return parse
31
+
32
+
33
+ # Create common tuple conversion functions for 1-4 dimensions
34
+ to_1tuple = _ntuple(1)
35
+ to_2tuple = _ntuple(2)
36
+ to_3tuple = _ntuple(3)
37
+ to_4tuple = _ntuple(4)
38
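These helpers normalise scalars and single-element iterables to fixed-length tuples, which keeps patch-size and stride handling uniform:

from hymm_sp.helpers import to_2tuple, to_3tuple

print(to_2tuple(3))           # (3, 3)
print(to_2tuple([4]))         # (4, 4)   single-element iterables are broadcast
print(to_3tuple((2, 4, 4)))   # (2, 4, 4) correctly sized iterables pass through unchanged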
+
39
+
40
+ def get_rope_freq_from_size(
41
+ latents_size,
42
+ ndim,
43
+ target_ndim,
44
+ args,
45
+ rope_theta_rescale_factor: Union[float, List[float]] = 1.0,
46
+ rope_interpolation_factor: Union[float, List[float]] = 1.0,
47
+ concat_dict={}
48
+ ):
49
+ """
50
+ Calculates RoPE (Rotary Position Embedding) frequencies based on latent dimensions.
51
+
52
+ Converts latent space dimensions to rope-compatible sizes by accounting for
53
+ patch size, then generates the appropriate frequency embeddings for each dimension.
54
+
55
+ Args:
56
+ latents_size: Dimensions of the latent space tensor
57
+ ndim (int): Number of dimensions in the latent space
58
+ target_ndim (int): Target number of dimensions for the embeddings
59
+ args: Configuration arguments containing model parameters (patch_size, rope_theta, etc.)
60
+ rope_theta_rescale_factor: Rescaling factor(s) for theta parameter (per dimension)
61
+ rope_interpolation_factor: Interpolation factor(s) for position embeddings (per dimension)
62
+ concat_dict: Dictionary for special concatenation modes (e.g., time-based extensions)
63
+
64
+ Returns:
65
+ tuple: Cosine and sine frequency embeddings (freqs_cos, freqs_sin)
66
+ """
67
+ # Calculate rope sizes by dividing latent dimensions by patch size
68
+ if isinstance(args.patch_size, int):
69
+ # Validate all latent dimensions are divisible by patch size
70
+ assert all(s % args.patch_size == 0 for s in latents_size), \
71
+ f"Latent size (last {ndim} dimensions) must be divisible by patch size ({args.patch_size}), " \
72
+ f"but got {latents_size}."
73
+ rope_sizes = [s // args.patch_size for s in latents_size]
74
+ elif isinstance(args.patch_size, list):
75
+ # Validate with per-dimension patch sizes
76
+ assert all(s % args.patch_size[idx] == 0 for idx, s in enumerate(latents_size)), \
77
+ f"Latent size (last {ndim} dimensions) must be divisible by patch size ({args.patch_size}), " \
78
+ f"but got {latents_size}."
79
+ rope_sizes = [s // args.patch_size[idx] for idx, s in enumerate(latents_size)]
80
+
81
+ # Add singleton dimensions if needed to match target_ndim (typically for time axis)
82
+ if len(rope_sizes) != target_ndim:
83
+ rope_sizes = [1] * (target_ndim - len(rope_sizes)) + rope_sizes
84
+
85
+ # Calculate head dimension and validate rope dimensions
86
+ head_dim = args.hidden_size // args.num_heads
87
+ rope_dim_list = args.rope_dim_list
88
+
89
+ # Default: split head dimension equally across target dimensions
90
+ if rope_dim_list is None:
91
+ rope_dim_list = [head_dim // target_ndim for _ in range(target_ndim)]
92
+
93
+ # Ensure rope dimensions sum to head dimension
94
+ assert sum(rope_dim_list) == head_dim, \
95
+ "Sum of rope_dim_list must equal attention head dimension (hidden_size // num_heads)"
96
+
97
+ # Generate rotary position embeddings
98
+ freqs_cos, freqs_sin = get_nd_rotary_pos_embed_new(
99
+ rope_dim_list,
100
+ rope_sizes,
101
+ theta=args.rope_theta,
102
+ use_real=True,
103
+ theta_rescale_factor=rope_theta_rescale_factor,
104
+ interpolation_factor=rope_interpolation_factor,
105
+ concat_dict=concat_dict
106
+ )
107
+ return freqs_cos, freqs_sin
108
+
109
+
110
+ def get_nd_rotary_pos_embed_new(
111
+ rope_dim_list,
112
+ start,
113
+ *args,
114
+ theta=10000.,
115
+ use_real=False,
116
+ theta_rescale_factor: Union[float, List[float]] = 1.0,
117
+ interpolation_factor: Union[float, List[float]] = 1.0,
118
+ concat_dict={}
119
+ ):
120
+ """
121
+ Generates multi-dimensional Rotary Position Embeddings (RoPE).
122
+
123
+ Creates position embeddings for n-dimensional spaces by generating a meshgrid
124
+ of positions and applying 1D rotary embeddings to each dimension, then combining them.
125
+
126
+ Args:
127
+ rope_dim_list (list): List of embedding dimensions for each axis
128
+ start: Starting dimensions for generating the meshgrid
129
+ *args: Additional arguments for meshgrid generation
130
+ theta (float): Base theta parameter for RoPE frequency calculation
131
+ use_real (bool): If True, returns separate cosine and sine embeddings
132
+ theta_rescale_factor: Rescaling factor(s) for theta (per dimension)
133
+ interpolation_factor: Interpolation factor(s) for position scaling (per dimension)
134
+ concat_dict: Dictionary for special concatenation modes (e.g., time-based extensions)
135
+
136
+ Returns:
137
+ tuple or tensor: Cosine and sine embeddings if use_real=True, combined embedding otherwise
138
+ """
139
+ # Generate n-dimensional meshgrid of positions (shape: [dim, *sizes])
140
+ grid = get_meshgrid_nd(start, *args, dim=len(rope_dim_list))
141
+
142
+ # Handle special concatenation modes (e.g., adding time-based bias)
143
+ if concat_dict:
144
+ if concat_dict['mode'] == 'timecat':
145
+ # Add bias as first element in first dimension
146
+ bias = grid[:, :1].clone()
147
+ bias[0] = concat_dict['bias'] * torch.ones_like(bias[0])
148
+ grid = torch.cat([bias, grid], dim=1)
149
+ elif concat_dict['mode'] == 'timecat-w':
150
+ # Add biased first element with spatial offset
151
+ bias = grid[:, :1].clone()
152
+ bias[0] = concat_dict['bias'] * torch.ones_like(bias[0])
153
+ bias[2] += start[-1] # Spatial offset reference: OminiControl implementation
154
+ grid = torch.cat([bias, grid], dim=1)
155
+
156
+ # Normalize theta rescale factors to list format (per dimension)
157
+ if isinstance(theta_rescale_factor, (int, float)):
158
+ theta_rescale_factor = [theta_rescale_factor] * len(rope_dim_list)
159
+ elif isinstance(theta_rescale_factor, list) and len(theta_rescale_factor) == 1:
160
+ theta_rescale_factor = [theta_rescale_factor[0]] * len(rope_dim_list)
161
+ assert len(theta_rescale_factor) == len(rope_dim_list), \
162
+ "Length of theta_rescale_factor must match number of dimensions"
163
+
164
+ # Normalize interpolation factors to list format (per dimension)
165
+ if isinstance(interpolation_factor, (int, float)):
166
+ interpolation_factor = [interpolation_factor] * len(rope_dim_list)
167
+ elif isinstance(interpolation_factor, list) and len(interpolation_factor) == 1:
168
+ interpolation_factor = [interpolation_factor[0]] * len(rope_dim_list)
169
+ assert len(interpolation_factor) == len(rope_dim_list), \
170
+ "Length of interpolation_factor must match number of dimensions"
171
+
172
+ # Generate 1D rotary embeddings for each dimension and combine
173
+ embs = []
174
+ for i in range(len(rope_dim_list)):
175
+ # Flatten grid dimension and generate embeddings
176
+ emb = get_1d_rotary_pos_embed(
177
+ rope_dim_list[i],
178
+ grid[i].reshape(-1), # Flatten to 1D positions
179
+ theta,
180
+ use_real=use_real,
181
+ theta_rescale_factor=theta_rescale_factor[i],
182
+ interpolation_factor=interpolation_factor[i]
183
+ )
184
+ embs.append(emb)
185
+
186
+ # Combine embeddings from all dimensions
187
+ if use_real:
188
+ # Return separate cosine and sine components
189
+ cos = torch.cat([emb[0] for emb in embs], dim=1)
190
+ sin = torch.cat([emb[1] for emb in embs], dim=1)
191
+ return cos, sin
192
+ else:
193
+ # Return combined embedding
194
+ return torch.cat(embs, dim=1)
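One practical note on the per-axis split in `get_rope_freq_from_size`: the default of dividing the head dimension evenly across axes only works when it divides exactly, which is what the assertion there guards against. With hypothetical sizes:

hidden_size, num_heads, target_ndim = 3072, 24, 3
head_dim = hidden_size // num_heads                      # 128
rope_dim_list = [head_dim // target_ndim] * target_ndim  # [42, 42, 42]
print(sum(rope_dim_list) == head_dim)                    # False -> rope_dim_list must be set explicitly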
hymm_sp/inference.py ADDED
@@ -0,0 +1,201 @@
1
+ import torch
2
+ from pathlib import Path
3
+ from loguru import logger
4
+ from hymm_sp.constants import PROMPT_TEMPLATE, PRECISION_TO_TYPE
5
+ from hymm_sp.vae import load_vae
6
+ from hymm_sp.modules import load_model
7
+ from hymm_sp.text_encoder import TextEncoder
8
+ import torch.distributed
9
+ from hymm_sp.modules.parallel_states import (
10
+ initialize_sequence_parallel_state,
11
+ get_sequence_parallel_state,
12
+ nccl_info,
13
+ )
14
+ from hymm_sp.modules.fp8_optimization import convert_fp8_linear
15
+
16
+
17
+ class Inference(object):
18
+ def __init__(self,
19
+ args,
20
+ vae,
21
+ vae_kwargs,
22
+ text_encoder,
23
+ model,
24
+ text_encoder_2=None,
25
+ pipeline=None,
26
+ cpu_offload=False,
27
+ device=None,
28
+ logger=None):
29
+ self.vae = vae
30
+ self.vae_kwargs = vae_kwargs
31
+
32
+ self.text_encoder = text_encoder
33
+ self.text_encoder_2 = text_encoder_2
34
+
35
+ self.model = model
36
+ self.pipeline = pipeline
37
+ self.cpu_offload = cpu_offload
38
+
39
+ self.args = args
40
+ self.device = device if device is not None else "cuda" if torch.cuda.is_available() else "cpu"
41
+ if nccl_info.sp_size > 1:
42
+ self.device = torch.device(f"cuda:{torch.distributed.get_rank()}")
43
+
44
+ self.logger = logger
45
+
46
+ @classmethod
47
+ def from_pretrained(cls,
48
+ pretrained_model_path,
49
+ args,
50
+ device=None,
51
+ **kwargs):
52
+ """
53
+ Initialize the Inference pipeline.
54
+
55
+ Args:
56
+ pretrained_model_path (str or pathlib.Path): The model path,
57
+ including t2v, text encoder and vae checkpoints.
58
+ device (int): The device for inference. Default is 0.
59
+ logger (logging.Logger): The logger for the inference pipeline. Default is None.
60
+ """
61
+ # ========================================================================
62
+ logger.info(f"Got text-to-video model root path: {pretrained_model_path}")
63
+
64
+ # ======================== Get the args path =============================
65
+
66
+ # Set device and disable gradient
67
+ if device is None:
68
+ device = "cuda" if torch.cuda.is_available() else "cpu"
69
+ torch.set_grad_enabled(False)
70
+ logger.info("Building model...")
71
+ factor_kwargs = {'device': 'cpu' if args.cpu_offload else device, 'dtype': PRECISION_TO_TYPE[args.precision]}
72
+ in_channels = args.latent_channels
73
+ out_channels = args.latent_channels
74
+ print("="*25, f"build model", "="*25)
75
+ model = load_model(
76
+ args,
77
+ in_channels=in_channels,
78
+ out_channels=out_channels,
79
+ factor_kwargs=factor_kwargs
80
+ )
81
+ if args.cpu_offload:
82
+ print(f'='*20, f'load transformer to cpu')
83
+ model = model.to('cpu')
84
+ torch.cuda.empty_cache()
85
+ else:
86
+ model = model.to(device)
87
+ model = Inference.load_state_dict(args, model, pretrained_model_path)
88
+ model.eval()
89
+
90
+ if args.use_fp8:
91
+ convert_fp8_linear(model)
92
+
93
+ # ============================= Build extra models ========================
94
+ # VAE
95
+ print("="*25, f"load vae", "="*25)
96
+ vae, _, s_ratio, t_ratio = load_vae(args.vae,
97
+ args.vae_precision,
98
+ logger=logger,
99
+ device='cpu' if args.cpu_offload else device)
100
+ vae_kwargs = {'s_ratio': s_ratio, 't_ratio': t_ratio}
101
+
102
+ # Parallel VAE
103
+ device_vaes = []
104
+ device_vaes.append(vae)
105
+ if nccl_info.sp_size > 1 and nccl_info.rank_within_group == 0:
106
+ for i in range(1, nccl_info.sp_size):
107
+ cur_device = torch.device(f"cuda:{i}")
108
+ # print("!!!!!!!!!! Load vae for ", cur_device)
109
+ device_vae, _, _, _ = load_vae(args.vae,
110
+ args.vae_precision,
111
+ logger=logger,
112
+ device='cpu' if args.cpu_offload else cur_device)
113
+ device_vaes.append(device_vae)
114
+ vae.device_vaes = device_vaes
115
+
116
+ # Text encoder
117
+ if args.prompt_template_video is not None:
118
+ crop_start = PROMPT_TEMPLATE[args.prompt_template_video].get("crop_start", 0)
119
+ else:
120
+ crop_start = 0
121
+ max_length = args.text_len + crop_start
122
+
123
+ # prompt_template_video
124
+ prompt_template_video = PROMPT_TEMPLATE[args.prompt_template_video] \
125
+ if args.prompt_template_video is not None else None
126
+ print("="*25, f"load llava", "="*25)
127
+ text_encoder = TextEncoder(text_encoder_type = args.text_encoder,
128
+ max_length = max_length,
129
+ text_encoder_precision = args.text_encoder_precision,
130
+ tokenizer_type = args.tokenizer,
131
+ use_attention_mask = args.use_attention_mask,
132
+ prompt_template_video = prompt_template_video,
133
+ hidden_state_skip_layer = args.hidden_state_skip_layer,
134
+ apply_final_norm = args.apply_final_norm,
135
+ reproduce = args.reproduce,
136
+ logger = logger,
137
+ device = 'cpu' if args.cpu_offload else device ,
138
+ )
139
+ text_encoder_2 = None
140
+ if args.text_encoder_2 is not None:
141
+ text_encoder_2 = TextEncoder(text_encoder_type=args.text_encoder_2,
142
+ max_length=args.text_len_2,
143
+ text_encoder_precision=args.text_encoder_precision_2,
144
+ tokenizer_type=args.tokenizer_2,
145
+ use_attention_mask=args.use_attention_mask,
146
+ reproduce=args.reproduce,
147
+ logger=logger,
148
+ device='cpu' if args.cpu_offload else device ,
149
+ # if not args.use_cpu_offload else 'cpu'
150
+ )
151
+
152
+ return cls(args=args,
153
+ vae=vae,
154
+ vae_kwargs=vae_kwargs,
155
+ text_encoder=text_encoder,
156
+ model=model,
157
+ text_encoder_2=text_encoder_2,
158
+ device=device,
159
+ logger=logger)
160
+
161
+ @staticmethod
162
+ def load_state_dict(args, model, ckpt_path):
163
+ load_key = args.load_key
164
+ ckpt_path = Path(ckpt_path)
165
+ if ckpt_path.is_dir():
166
+ ckpt_path = next(ckpt_path.glob("*_model_states.pt"))
167
+ state_dict = torch.load(ckpt_path, map_location=lambda storage, loc: storage)
168
+ if load_key in state_dict:
169
+ state_dict = state_dict[load_key]
170
+ elif load_key == ".":
171
+ pass
172
+ else:
173
+ raise KeyError(f"Key '{load_key}' not found in the checkpoint. Existing keys: {list(state_dict.keys())}")
174
+ model.load_state_dict(state_dict, strict=False)
175
+ return model
176
+
177
+ def get_exp_dir_and_ckpt_id(self):
178
+ if self.ckpt is None:
179
+ raise ValueError("The checkpoint path is not provided.")
180
+
181
+ ckpt = Path(self.ckpt)
182
+ if ckpt.parents[1].name == "checkpoints":
183
+ # It should be a standard checkpoint path. We use the parent directory as the default save directory.
184
+ exp_dir = ckpt.parents[2]
185
+ else:
186
+ raise ValueError(f"We cannot infer the experiment directory from the checkpoint path: {ckpt}. "
187
+ f"It seems that the checkpoint path is not standard. Please explicitly provide the "
188
+ f"save path by --save-path.")
189
+ return exp_dir, ckpt.parent.name
190
+
191
+ @staticmethod
192
+ def parse_size(size):
193
+ if isinstance(size, int):
194
+ size = [size]
195
+ if not isinstance(size, (list, tuple)):
196
+ raise ValueError(f"Size must be an integer or (height, width), got {size}.")
197
+ if len(size) == 1:
198
+ size = [size[0], size[0]]
199
+ if len(size) != 2:
200
+ raise ValueError(f"Size must be an integer or (height, width), got {size}.")
201
+ return size
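Note that `get_exp_dir_and_ckpt_id` above assumes a fixed checkpoint layout of `<exp_dir>/checkpoints/<ckpt_id>/*_model_states.pt`. A minimal sketch of that convention, using a hypothetical path purely for illustration:

```python
from pathlib import Path

# Hypothetical checkpoint path, shown only to illustrate the expected layout.
ckpt = Path("experiments/run_01/checkpoints/step_5000/mp_rank_00_model_states.pt")

# Mirrors get_exp_dir_and_ckpt_id: the grandparent directory must be named
# "checkpoints", otherwise the caller has to pass --save-path explicitly.
assert ckpt.parents[1].name == "checkpoints"
exp_dir, ckpt_id = ckpt.parents[2], ckpt.parent.name
print(exp_dir, ckpt_id)  # experiments/run_01 step_5000
```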
hymm_sp/modules/__init__.py ADDED
@@ -0,0 +1,38 @@
1
+ from .models import HYVideoDiffusionTransformer, HUNYUAN_VIDEO_CONFIG
2
+ """
3
+ This module provides functionality to load the Hunyuan video diffusion transformer model,
4
+ which is used for video generation tasks with specific configurations and parameters.
5
+ """
6
+
7
+ def load_model(args, in_channels, out_channels, factor_kwargs):
8
+ """
9
+ Load and initialize the HYVideoDiffusionTransformer model with specified parameters.
10
+
11
+ Args:
12
+ args: Command-line arguments or configuration object containing model settings.
13
+ Must include 'model' attribute to select the appropriate configuration.
14
+ in_channels (int): Number of input channels for the model.
15
+ out_channels (int): Number of output channels the model should produce.
16
+ factor_kwargs (dict): Additional keyword arguments for factor adjustments
17
+ in the model architecture.
18
+
19
+ Returns:
20
+ HYVideoDiffusionTransformer: Initialized instance of the video diffusion transformer
21
+ model with the specified configuration.
22
+
23
+ Notes:
24
+ - Uses the HUNYUAN_VIDEO_CONFIG dictionary to retrieve model-specific configurations
25
+ based on the model name provided in args.
26
+ - Sets multitask_mask_training_type to "concat" as a default for this loading setup.
27
+ """
28
+ # Initialize the Hunyuan video diffusion transformer with combined configurations
29
+ # Merges base config from HUNYUAN_VIDEO_CONFIG and additional factor arguments
30
+ model = HYVideoDiffusionTransformer(
31
+ args,
32
+ in_channels=in_channels,
33
+ out_channels=out_channels,
34
+ multitask_mask_training_type="concat",
35
+ **HUNYUAN_VIDEO_CONFIG[args.model], # Unpack model-specific configuration
36
+ **factor_kwargs, # Unpack additional factor adjustments
37
+ )
38
+ return model
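A hedged usage sketch for `load_model`: the `args.model` value must name an entry in `HUNYUAN_VIDEO_CONFIG`, and in practice `args` is the full namespace built by `hymm_sp/config.py`, since `HYVideoDiffusionTransformer` reads further attributes from it. The `"HYVideo-T/2"` key, channel counts, and `factor_kwargs` below are placeholders, not verified values.

```python
from types import SimpleNamespace
import torch
from hymm_sp.modules import load_model

# Placeholder args; a real run passes the parsed CLI namespace instead.
args = SimpleNamespace(model="HYVideo-T/2")  # assumed HUNYUAN_VIDEO_CONFIG key
model = load_model(
    args,
    in_channels=16,   # latent channels, as passed from inference.py
    out_channels=16,
    factor_kwargs={"device": "cpu", "dtype": torch.bfloat16},
)
print(sum(p.numel() for p in model.parameters()))
```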
hymm_sp/modules/activation_layers.py ADDED
@@ -0,0 +1,23 @@
1
+ import torch.nn as nn
2
+
3
+
4
+ def get_activation_layer(act_type):
5
+ """get activation layer
6
+
7
+ Args:
8
+ act_type (str): the activation type
9
+
10
+ Returns:
11
+ torch.nn.functional: the activation layer
12
+ """
13
+ if act_type == "gelu":
14
+ return lambda: nn.GELU()
15
+ elif act_type == "gelu_tanh":
16
+ # Approximate `tanh` requires torch >= 1.13
17
+ return lambda: nn.GELU(approximate="tanh")
18
+ elif act_type == "relu":
19
+ return nn.ReLU
20
+ elif act_type == "silu":
21
+ return nn.SiLU
22
+ else:
23
+ raise ValueError(f"Unknown activation type: {act_type}")
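Note that `get_activation_layer` returns a layer factory rather than an instance, so callers construct the module themselves. A quick usage sketch:

```python
import torch
from hymm_sp.modules.activation_layers import get_activation_layer

act_layer = get_activation_layer("gelu_tanh")  # factory, not an instance
act = act_layer()                              # nn.GELU(approximate="tanh")
print(act(torch.randn(2, 4)).shape)            # torch.Size([2, 4])
```

Returning a factory keeps the signature uniform with `nn.ReLU`/`nn.SiLU` (which are returned as classes) and lets every block build its own fresh module.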
hymm_sp/modules/attn_layers.py ADDED
@@ -0,0 +1,437 @@
1
+ import importlib.metadata
2
+ import math
3
+ from typing import Tuple, Union
4
+
5
+ import torch
6
+ import torch.nn as nn
7
+ import torch.nn.functional as F
8
+ try:
9
+ from flash_attn import flash_attn_qkvpacked_func, flash_attn_kvpacked_func, flash_attn_varlen_kvpacked_func
10
+ from flash_attn.bert_padding import index_first_axis
11
+ except ImportError:
12
+ flash_attn_qkvpacked_func, flash_attn_kvpacked_func, flash_attn_varlen_kvpacked_func = None, None, None
13
+ index_first_axis = None
14
+ from packaging import version
15
+ from transformers.utils.import_utils import _is_package_available
16
+
17
+ from .norm_layers import get_norm_layer
18
+
19
+
20
+ def reshape_for_broadcast(freqs_cis: Union[torch.Tensor, Tuple[torch.Tensor]], x: torch.Tensor, head_first=False):
21
+ """
22
+ Reshape frequency tensor for broadcasting it with another tensor.
23
+
24
+ This function reshapes the frequency tensor to have the same shape as the target tensor 'x'
25
+ for the purpose of broadcasting the frequency tensor during element-wise operations.
26
+
27
+ Notes:
28
+ When using FlashMHAModified, head_first should be False.
29
+ When using Attention, head_first should be True.
30
+
31
+ Args:
32
+ freqs_cis (Union[torch.Tensor, Tuple[torch.Tensor]]): Frequency tensor to be reshaped.
33
+ x (torch.Tensor): Target tensor for broadcasting compatibility.
34
+ head_first (bool): head dimension first (except batch dim) or not.
35
+
36
+ Returns:
37
+ torch.Tensor: Reshaped frequency tensor.
38
+
39
+ Raises:
40
+ AssertionError: If the frequency tensor doesn't match the expected shape.
41
+ AssertionError: If the target tensor 'x' doesn't have the expected number of dimensions.
42
+ """
43
+ ndim = x.ndim
44
+ assert 0 <= 1 < ndim
45
+
46
+ if isinstance(freqs_cis, tuple):
47
+ # freqs_cis: (cos, sin) in real space
48
+ if head_first:
49
+ assert freqs_cis[0].shape == (x.shape[-2], x.shape[-1]), \
50
+ f'freqs_cis shape {freqs_cis[0].shape} does not match x shape {x.shape}'
51
+ shape = [d if i == ndim - 2 or i == ndim - 1 else 1 for i, d in enumerate(x.shape)]
52
+ else:
53
+ assert freqs_cis[0].shape == (x.shape[1], x.shape[-1]), \
54
+ f'freqs_cis shape {freqs_cis[0].shape} does not match x shape {x.shape}'
55
+ shape = [d if i == 1 or i == ndim - 1 else 1 for i, d in enumerate(x.shape)]
56
+ return freqs_cis[0].view(*shape), freqs_cis[1].view(*shape)
57
+ else:
58
+ # freqs_cis: values in complex space
59
+ if head_first:
60
+ assert freqs_cis.shape == (x.shape[-2], x.shape[-1]), \
61
+ f'freqs_cis shape {freqs_cis.shape} does not match x shape {x.shape}'
62
+ shape = [d if i == ndim - 2 or i == ndim - 1 else 1 for i, d in enumerate(x.shape)]
63
+ else:
64
+ assert freqs_cis.shape == (x.shape[1], x.shape[-1]), \
65
+ f'freqs_cis shape {freqs_cis.shape} does not match x shape {x.shape}'
66
+ shape = [d if i == 1 or i == ndim - 1 else 1 for i, d in enumerate(x.shape)]
67
+ return freqs_cis.view(*shape)
68
+
69
+
70
+ def rotate_half(x):
71
+ x_real, x_imag = x.float().reshape(*x.shape[:-1], -1, 2).unbind(-1) # [B, S, H, D//2]
72
+ return torch.stack([-x_imag, x_real], dim=-1).flatten(3)
73
+
74
+
75
+ def apply_rotary_emb(
76
+ xq: torch.Tensor,
77
+ xk: torch.Tensor,
78
+ freqs_cis: Union[torch.Tensor, Tuple[torch.Tensor, torch.Tensor]],
79
+ head_first: bool = False,
80
+ ) -> Tuple[torch.Tensor, torch.Tensor]:
81
+ """
82
+ Apply rotary embeddings to input tensors using the given frequency tensor.
83
+
84
+ This function applies rotary embeddings to the given query 'xq' and key 'xk' tensors using the provided
85
+ frequency tensor 'freqs_cis'. The input tensors are reshaped as complex numbers, and the frequency tensor
86
+ is reshaped for broadcasting compatibility. The resulting tensors contain rotary embeddings and are
87
+ returned as real tensors.
88
+
89
+ Args:
90
+ xq (torch.Tensor): Query tensor to apply rotary embeddings. [B, S, H, D]
91
+ xk (torch.Tensor): Key tensor to apply rotary embeddings. [B, S, H, D]
92
+ freqs_cis (torch.Tensor or tuple): Precomputed frequency tensor for complex exponential.
93
+ head_first (bool): head dimension first (except batch dim) or not.
94
+
95
+ Returns:
96
+ Tuple[torch.Tensor, torch.Tensor]: Tuple of modified query tensor and key tensor with rotary embeddings.
97
+
98
+ """
99
+ xk_out = None
100
+ if isinstance(freqs_cis, tuple):
101
+ cos, sin = reshape_for_broadcast(freqs_cis, xq, head_first) # [S, D]
102
+ cos, sin = cos.to(xq.device), sin.to(xq.device)
103
+ # real * cos - imag * sin
104
+ # imag * cos + real * sin
105
+ xq_out = (xq.float() * cos + rotate_half(xq.float()) * sin).type_as(xq)
106
+ xk_out = (xk.float() * cos + rotate_half(xk.float()) * sin).type_as(xk)
107
+ else:
108
+ # view_as_complex will pack [..., D/2, 2](real) to [..., D/2](complex)
109
+ xq_ = torch.view_as_complex(xq.float().reshape(*xq.shape[:-1], -1, 2)) # [B, S, H, D//2]
110
+ freqs_cis = reshape_for_broadcast(freqs_cis, xq_, head_first).to(xq.device) # [S, D//2] --> [1, S, 1, D//2]
111
+ # (real, imag) * (cos, sin) = (real * cos - imag * sin, imag * cos + real * sin)
112
+ # view_as_real will expand [..., D/2](complex) to [..., D/2, 2](real)
113
+ xq_out = torch.view_as_real(xq_ * freqs_cis).flatten(3).type_as(xq)
114
+ xk_ = torch.view_as_complex(xk.float().reshape(*xk.shape[:-1], -1, 2)) # [B, S, H, D//2]
115
+ xk_out = torch.view_as_real(xk_ * freqs_cis).flatten(3).type_as(xk)
116
+
117
+ return xq_out, xk_out
118
+
119
+
120
+ class BasicAttentionLayer(nn.Module):
121
+ def __init__(self, attn_mode='flash', deterministic=False):
122
+ super().__init__()
123
+ self.attn_mode = attn_mode
124
+ self.deterministic = deterministic
125
+
126
+ def set_attn_mode(self, new_mode):
127
+ self.attn_mode = new_mode
128
+
129
+ def enable_deterministic(self):
130
+ self.deterministic = True
131
+
132
+ def disable_deterministic(self):
133
+ self.deterministic = False
134
+
135
+
136
+ MEMORY_LAYOUT = {
137
+ "self_flash": (
138
+ lambda x: x,
139
+ lambda x: x,
140
+ ),
141
+ "cross_flash": (
142
+ lambda x: x,
143
+ lambda x: x,
144
+ ),
145
+ "torch": (
146
+ lambda x: x.transpose(1, 2),
147
+ lambda x: x.transpose(1, 2),
148
+ ),
149
+ "vanilla": (
150
+ lambda x: x.transpose(1, 2),
151
+ lambda x: x.transpose(1, 2),
152
+ ),
153
+ }
154
+
155
+
156
+ # Copied from https://github.com/huggingface/transformers/blob/
157
+ # b873234cb649a24865021f0d598627ce2b24d34a/src/transformers/modeling_flash_attention_utils.py#L33C1-L57C6
158
+ def _get_unpad_data(attention_mask: torch.Tensor) -> Tuple[torch.Tensor, torch.Tensor, int]:
159
+ """
160
+ Retrieves indexing data required to repad unpadded (ragged) tensors.
161
+
162
+ Arguments:
163
+ attention_mask (`torch.Tensor`):
164
+ Boolean or int tensor of shape (batch_size, sequence_length), 1 means
165
+ valid and 0 means not valid.
166
+
167
+ Return:
168
+ indices (`torch.Tensor`):
169
+ The indices of non-masked tokens from the flattened input sequence.
170
+ cu_seqlens (`torch.Tensor`):
171
+ The cumulative sequence lengths, used to index into ragged (unpadded)
172
+ tensors. `cu_seqlens` shape is (batch_size + 1,).
173
+ max_seqlen_in_batch (`int`):
174
+ Maximum sequence length in batch.
175
+ """
176
+ seqlens_in_batch = attention_mask.sum(dim=-1, dtype=torch.int32)
177
+ indices = torch.nonzero(attention_mask.flatten(), as_tuple=False).flatten()
178
+ max_seqlen_in_batch = seqlens_in_batch.max().item()
179
+ cu_seqlens = F.pad(torch.cumsum(seqlens_in_batch, dim=0, dtype=torch.int32), (1, 0))
180
+ return (
181
+ indices,
182
+ cu_seqlens,
183
+ max_seqlen_in_batch,
184
+ )
185
+
186
+
187
+ # Copied from https://github.com/huggingface/transformers/blob/
188
+ # b873234cb649a24865021f0d598627ce2b24d34a/src/transformers/utils/import_utils.py#L822
189
+ def is_flash_attn_greater_or_equal(library_version: str):
190
+ if not _is_package_available("flash_attn"):
191
+ return False
192
+
193
+ return version.parse(importlib.metadata.version("flash_attn")) >= version.parse(library_version)
194
+
195
+
196
+ def get_kv_seqlens_with_mask(attn_mask, k, v):
197
+ indices_k, cu_seqlens_k, max_seqlen_k = _get_unpad_data(attn_mask)
198
+ b, s1, a, d = k.shape
199
+ k = index_first_axis(k.reshape(b * s1, a, d), indices_k)
200
+ v = index_first_axis(v.reshape(b * s1, a, d), indices_k)
201
+ kv = torch.stack([k, v], dim=1)
202
+ return cu_seqlens_k, max_seqlen_k, kv
203
+
204
+
205
+ def get_q_seqlens(q):
206
+ bs, s, a, d = q.shape
207
+ cu_seqlens_q = torch.arange(0, (bs + 1) * s, step=s, dtype=torch.int32, device=q.device)
208
+ q = q.reshape(bs * s, a, d)
209
+ return cu_seqlens_q, s, q
210
+
211
+
212
+ def attention(q, k, v, mode, drop_rate=0, attn_mask=None, causal=False, deterministic=False,
213
+ cu_seqlens=None, max_seqlen=None, cu_seqlens_k=None, max_seqlen_k=None):
214
+ """
215
+ Perform QKV self attention.
216
+
217
+ Args:
218
+ q (torch.Tensor): Query tensor with shape [b, s, a, d], where a is the number of heads.
219
+ k (torch.Tensor): Key tensor with shape [b, s1, a, d]
220
+ v (torch.Tensor): Value tensor with shape [b, s1, a, d]
221
+ mode (str): Attention mode. Choose from 'self_flash', 'cross_flash', 'torch', and 'vanilla'.
222
+ drop_rate (float): Dropout rate in attention map. (default: 0)
223
+ attn_mask (torch.Tensor): Attention mask with shape [b, s1] (cross_attn), or [b, a, s, s1] (torch or vanilla).
224
+ (default: None)
225
+ causal (bool): Whether to use causal attention. (default: False)
226
+ deterministic (bool): Whether to use deterministic attention. (default: False)
227
+ cu_seqlens (torch.Tensor): dtype torch.int32. The cumulative sequence lengths of the sequences in the batch,
228
+ used to index into q.
229
+ max_seqlen (int): The maximum sequence length in the batch of q.
230
+ cu_seqlens_k (torch.Tensor): dtype torch.int32. The cumulative sequence lengths of the sequences in the batch,
231
+ used to index into kv.
232
+ max_seqlen_k (int): The maximum sequence length in the batch of k and v.
233
+
234
+ Returns:
235
+ torch.Tensor: Output tensor after self attention with shape [b, s, ad]
236
+ """
237
+ pre_attn_layout, post_attn_layout = MEMORY_LAYOUT[mode]
238
+ q = pre_attn_layout(q)
239
+ k = pre_attn_layout(k)
240
+ v = pre_attn_layout(v)
241
+
242
+ if mode == 'torch':
243
+ if attn_mask is not None and attn_mask.dtype != torch.bool:
244
+ attn_mask = attn_mask.to(q.dtype)
245
+ x = F.scaled_dot_product_attention(q, k, v, attn_mask=attn_mask, dropout_p=drop_rate, is_causal=causal)
246
+
247
+ elif mode == 'vanilla':
248
+ scale_factor = 1 / math.sqrt(q.size(-1))
249
+
250
+ b, a, s, _ = q.shape
251
+ s1 = k.size(2)
252
+ attn_bias = torch.zeros(b, a, s, s1, dtype=q.dtype, device=q.device)
253
+ if causal:
254
+ # Only applied to self attention
255
+ assert attn_mask is None, "Causal mask and attn_mask cannot be used together"
256
+ temp_mask = torch.ones(b, a, s, s, dtype=torch.bool, device=q.device).tril(diagonal=0)
257
+ attn_bias.masked_fill_(temp_mask.logical_not(), float("-inf"))
258
+ attn_bias.to(q.dtype)
259
+
260
+ if attn_mask is not None:
261
+ if attn_mask.dtype == torch.bool:
262
+ attn_bias.masked_fill_(attn_mask.logical_not(), float("-inf"))
263
+ else:
264
+ attn_bias += attn_mask
265
+
266
+ attn = (q @ k.transpose(-2, -1)) * scale_factor
267
+ attn += attn_bias
268
+ attn = attn.softmax(dim=-1)
269
+ attn = torch.dropout(attn, p=drop_rate, train=True)
270
+ x = attn @ v
271
+ else:
272
+ raise NotImplementedError(f'Unsupported attention mode: {mode}')
273
+
274
+ x = post_attn_layout(x)
275
+ b, s, a, d = x.shape
276
+ out = x.reshape(b, s, -1)
277
+ return out
278
+
279
+
280
+ class SelfAttentionLayer(BasicAttentionLayer):
281
+ def __init__(self,
282
+ dim,
283
+ num_heads,
284
+ qkv_bias=True,
285
+ qk_norm=True,
286
+ attn_drop=0,
287
+ proj_drop=0,
288
+ dtype=None,
289
+ device=None,
290
+ norm_type='layer',
291
+ attn_mode='self_flash',
292
+ deterministic=False,
293
+ ) -> None:
294
+ factory_kwargs = {'device': device, 'dtype': dtype}
295
+ super().__init__(attn_mode, deterministic)
296
+ self.dim = dim
297
+ self.num_heads = num_heads
298
+ assert self.dim % num_heads == 0, "dim must be divisible by num_heads"
299
+ self.head_dim = self.dim // num_heads
300
+ self.attn_drop = attn_drop
301
+
302
+ # This assertion is aligned with flash attention
303
+ assert (
304
+ self.head_dim % 8 == 0 and self.head_dim <= 128
305
+ ), "Only support head_dim <= 128 and divisible by 8"
306
+
307
+ self.Wqkv = nn.Linear(dim, dim * 3, bias=qkv_bias, **factory_kwargs)
308
+
309
+ norm_layer = get_norm_layer(norm_type)
310
+ self.q_norm = (
311
+ norm_layer(self.head_dim, elementwise_affine=True, eps=1e-6, **factory_kwargs)
312
+ if qk_norm
313
+ else nn.Identity()
314
+ )
315
+ self.k_norm = (
316
+ norm_layer(self.head_dim, elementwise_affine=True, eps=1e-6, **factory_kwargs)
317
+ if qk_norm
318
+ else nn.Identity()
319
+ )
320
+
321
+ self.out_proj = nn.Linear(dim, dim, bias=qkv_bias, **factory_kwargs)
322
+ self.proj_drop = nn.Dropout(proj_drop)
323
+
324
+ def forward(self, x, freqs_cis=None, attn_mask=None):
325
+ """
326
+ Args:
327
+ x (torch.Tensor): (batch, seq_len, hidden_dim) (where hidden_dim = num heads * head dim)
328
+ freqs_cis (torch.Tensor, optional): (batch, hidden_dim // 2), RoPE for image
329
+ attn_mask (torch.Tensor, optional): (batch, seq_len, seq_len), mask for attention
330
+ """
331
+ b, s, d = x.shape
332
+
333
+ # Apply QKV projection
334
+ qkv = self.Wqkv(x)
335
+ qkv = qkv.view(b, s, 3, self.num_heads, self.head_dim) # [b, s, 3, a, d]
336
+ q, k, v = qkv.unbind(dim=2) # [b, s, a, d]
337
+
338
+ # Apply QK-Norm if needed
339
+ q = self.q_norm(q)
340
+ k = self.k_norm(k)
341
+
342
+ # Apply RoPE if needed
343
+ if freqs_cis is not None:
344
+ qq, kk = apply_rotary_emb(q, k, freqs_cis)
345
+ assert qq.shape == q.shape and kk.shape == k.shape, \
346
+ f'qq: {qq.shape}, q: {q.shape}, kk: {kk.shape}, k: {k.shape}'
347
+ q, k = qq, kk
348
+
349
+ # Apply self attention
350
+ context = attention(q, k, v,
351
+ drop_rate=self.attn_drop if self.training else 0,
352
+ attn_mask=attn_mask,
353
+ mode=self.attn_mode,
354
+ deterministic=self.deterministic,
355
+ )
356
+ out = self.out_proj(context)
357
+ out = self.proj_drop(out)
358
+
359
+ return out
360
+
361
+
362
+ class CrossAttentionLayer(BasicAttentionLayer):
363
+ def __init__(self,
364
+ qdim,
365
+ kdim,
366
+ num_heads,
367
+ qkv_bias=True,
368
+ qk_norm=True,
369
+ attn_drop=0,
370
+ proj_drop=0,
371
+ dtype=None,
372
+ device=None,
373
+ norm_type='layer',
374
+ attn_mode='cross_flash',
375
+ deterministic=False,
376
+ ):
377
+ factory_kwargs = {'device': device, 'dtype': dtype}
378
+ super().__init__(attn_mode, deterministic)
379
+ self.qdim = qdim
380
+ self.kdim = kdim
381
+ self.num_heads = num_heads
382
+ assert self.qdim % num_heads == 0, "qdim must be divisible by num_heads"
383
+ self.head_dim = self.qdim // num_heads
384
+ self.attn_drop = attn_drop
385
+
386
+ # This assertion is aligned with flash attention
387
+ assert (
388
+ self.head_dim % 8 == 0 and self.head_dim <= 128
389
+ ), "Only support head_dim <= 128 and divisible by 8"
390
+
391
+ self.q_proj = nn.Linear(qdim, qdim, bias=qkv_bias, **factory_kwargs)
392
+ self.kv_proj = nn.Linear(kdim, 2 * qdim, bias=qkv_bias, **factory_kwargs)
393
+
394
+ norm_layer = get_norm_layer(norm_type)
395
+ self.q_norm = (
396
+ norm_layer(self.head_dim, elementwise_affine=True, eps=1e-6, **factory_kwargs)
397
+ if qk_norm
398
+ else nn.Identity()
399
+ )
400
+ self.k_norm = (
401
+ norm_layer(self.head_dim, elementwise_affine=True, eps=1e-6, **factory_kwargs)
402
+ if qk_norm
403
+ else nn.Identity()
404
+ )
405
+
406
+ self.out_proj = nn.Linear(qdim, qdim, bias=qkv_bias, **factory_kwargs)
407
+ self.proj_drop = nn.Dropout(proj_drop)
408
+
409
+ def forward(self, x, y, attn_mask=None):
410
+ """
411
+ Args:
412
+ x (torch.Tensor): (batch, seq_len, hidden_dim) (where hidden_dim = num heads * head dim)
413
+ y (torch.Tensor): (batch, seq_len1, hidden_dim1)
414
+ attn_mask (torch.Tensor): (batch, seq_len1), mask for attention
415
+ """
416
+ b, s, d = x.shape
417
+ _, s1, d1 = y.shape
418
+
419
+ q = self.q_proj(x).view(b, s, self.num_heads, self.head_dim)
420
+ kv = self.kv_proj(y).view(b, s1, 2, self.num_heads, self.head_dim)
421
+ k, v = kv.unbind(dim=2)
422
+
423
+ # Apply QK-Norm if needed
424
+ q = self.q_norm(q)
425
+ k = self.k_norm(k)
426
+
427
+ # Apply cross attention
428
+ context = attention(q, k, v,
429
+ attn_mask=attn_mask,
430
+ drop_rate=self.attn_drop if self.training else 0,
431
+ mode=self.attn_mode,
432
+ deterministic=self.deterministic,
433
+ )
434
+ out = self.out_proj(context)
435
+ out = self.proj_drop(out)
436
+
437
+ return out
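A small self-contained check of the pure-PyTorch `'torch'` attention path defined above (no flash-attn required); shapes follow the `[batch, seq, heads, head_dim]` convention used throughout the file:

```python
import torch
from hymm_sp.modules.attn_layers import attention

b, s, heads, head_dim = 1, 16, 4, 32
q = torch.randn(b, s, heads, head_dim)
k = torch.randn(b, s, heads, head_dim)
v = torch.randn(b, s, heads, head_dim)

# 'torch' mode transposes to [b, heads, s, head_dim], runs
# scaled_dot_product_attention, then flattens heads back into the channel dim.
out = attention(q, k, v, mode="torch")
print(out.shape)  # torch.Size([1, 16, 128])
```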
hymm_sp/modules/cameranet.py ADDED
@@ -0,0 +1,248 @@
1
+
2
+ import einops
3
+ import torch.nn.functional as F
4
+ import collections.abc
5
+ import numpy as np
6
+ import torch
7
+ import torch.nn as nn
8
+ import torch.nn.init as init
9
+
10
+ from pathlib import Path
11
+ from einops import rearrange
12
+ from typing import Any, Dict, Optional, Tuple, Union
13
+ from diffusers.models.modeling_utils import ModelMixin
14
+ from itertools import repeat
15
+ from .embed_layers import PatchEmbed
16
+
17
+
18
+ def _ntuple(n):
19
+ """
20
+ Creates a helper function to convert inputs to tuples of specified length.
21
+
22
+ Functionality:
23
+ - Converts iterable inputs (excluding strings) to tuples, ensuring length n
24
+ - Repeats single values n times to form a tuple
25
+ Useful for handling multi-dimensional parameters like kernel sizes and strides.
26
+
27
+ Args:
28
+ n (int): Target length of the tuple
29
+
30
+ Returns:
31
+ function: A parser function that converts inputs to n-length tuples
32
+ """
33
+ def parse(x):
34
+ if isinstance(x, collections.abc.Iterable) and not isinstance(x, str):
35
+ x = tuple(x)
36
+ if len(x) == 1:
37
+ x = tuple(repeat(x[0], n))
38
+ return x
39
+ return tuple(repeat(x, n))
40
+ return parse
41
+
42
+
43
+ # Create common tuple conversion functions
44
+ to_1tuple = _ntuple(1)
45
+ to_2tuple = _ntuple(2)
46
+ to_3tuple = _ntuple(3)
47
+ to_4tuple = _ntuple(4)
48
+
49
+
50
+ class CameraNet(ModelMixin):
51
+ """
52
+ Camera state encoding network that processes camera parameters into feature embeddings.
53
+
54
+ This network converts camera state information into suitable feature representations
55
+ for video generation models through downsampling, convolutional encoding, and
56
+ temporal dimension compression. Supports loading from pretrained weights.
57
+ """
58
+ def __init__(
59
+ self,
60
+ in_channels,
61
+ downscale_coef,
62
+ out_channels,
63
+ patch_size,
64
+ hidden_size,
65
+ ):
66
+ super().__init__()
67
+ # Calculate initial channels: PixelUnshuffle moves spatial info to channel dimension
68
+ # resulting in channels = in_channels * (downscale_coef^2)
69
+ start_channels = in_channels * (downscale_coef ** 2)
70
+ input_channels = [start_channels, start_channels // 2, start_channels // 4]
71
+ self.input_channels = input_channels
72
+ self.unshuffle = nn.PixelUnshuffle(downscale_coef)
73
+
74
+ self.encode_first = nn.Sequential(
75
+ nn.Conv2d(input_channels[0], input_channels[1], kernel_size=1, stride=1, padding=0),
76
+ nn.GroupNorm(2, input_channels[1]),
77
+ nn.ReLU(),
78
+ )
79
+ self._initialize_weights(self.encode_first)
80
+ self.encode_second = nn.Sequential(
81
+ nn.Conv2d(input_channels[1], input_channels[2], kernel_size=1, stride=1, padding=0),
82
+ nn.GroupNorm(2, input_channels[2]),
83
+ nn.ReLU(),
84
+ )
85
+ self._initialize_weights(self.encode_second)
86
+
87
+ self.final_proj = nn.Conv2d(input_channels[2], out_channels, kernel_size=1)
88
+ self.zeros_init_linear(self.final_proj)
89
+
90
+ self.scale = nn.Parameter(torch.ones(1))
91
+
92
+ self.camera_in = PatchEmbed(patch_size=patch_size, in_chans=out_channels, embed_dim=hidden_size)
93
+
94
+
95
+ def zeros_init_linear(self, linear: nn.Module):
96
+ """
97
+ Zero-initializes weights and biases of linear or convolutional layers.
98
+
99
+ Args:
100
+ linear (nn.Module): Linear or convolutional layer to initialize
101
+ """
102
+ if isinstance(linear, (nn.Linear, nn.Conv2d)):
103
+ if hasattr(linear, "weight"):
104
+ nn.init.zeros_(linear.weight)
105
+ if hasattr(linear, "bias"):
106
+ nn.init.zeros_(linear.bias)
107
+
108
+ def _initialize_weights(self, block):
109
+ """
110
+ Initializes convolutional layer weights using He initialization,
111
+ with biases initialized to zero.
112
+
113
+ Args:
114
+ block (nn.Sequential): Sequential block containing convolutional layers
115
+ """
116
+ for m in block:
117
+ if isinstance(m, nn.Conv2d):
118
+ n = m.kernel_size[0] * m.kernel_size[1] * m.in_channels
119
+ init.normal_(m.weight, mean=0.0, std=np.sqrt(2.0 / n))
120
+ if m.bias is not None:
121
+ init.zeros_(m.bias)
122
+
123
+
124
+ def compress_time(self, x, num_frames):
125
+ """
126
+ Temporal dimension compression: reduces number of frames using average pooling
127
+ while preserving key temporal information.
128
+
129
+ Handling logic:
130
+ - Special frame counts (66 or 34): split into two segments, keep first frame of each
131
+ segment then pool remaining frames
132
+ - Odd frame counts: keep first frame, pool remaining frames
133
+ - Even frame counts: directly pool all frames
134
+
135
+ Args:
136
+ x (torch.Tensor): Input tensor with shape (b*f, c, h, w)
137
+ num_frames (int): Number of frames in temporal dimension
138
+
139
+ Returns:
140
+ torch.Tensor: Temporally compressed tensor with shape (b*f', c, h, w) where f' < f
141
+ """
142
+ # Reshape: (b*f, c, h, w) -> (b, f, c, h, w)
143
+ x = rearrange(x, '(b f) c h w -> b f c h w', f=num_frames)
144
+ batch_size, frames, channels, height, width = x.shape
145
+ x = rearrange(x, 'b f c h w -> (b h w) c f')
146
+
147
+ # print(x.shape)
148
+ # raise Exception
149
+ # Handle special frame counts (66 or 34)
150
+ if x.shape[-1] == 66 or x.shape[-1] == 34:
151
+ x_len = x.shape[-1]
152
+ # Process first segment: keep first frame, pool remaining
153
+ x_clip1 = x[...,:x_len//2]
154
+ x_clip1_first, x_clip1_rest = x_clip1[..., 0].unsqueeze(-1), x_clip1[..., 1:]
155
+ x_clip1_rest = F.avg_pool1d(x_clip1_rest, kernel_size=2, stride=2)
156
+
157
+ # Process second segment: keep first frame, pool remaining
158
+ x_clip2 = x[...,x_len//2:x_len]
159
+ x_clip2_first, x_clip2_rest = x_clip2[..., 0].unsqueeze(-1), x_clip2[..., 1:]
160
+ x_clip2_rest = F.avg_pool1d(x_clip2_rest, kernel_size=2, stride=2)
161
+
162
+ # Concatenate results from both segments
163
+ x = torch.cat([x_clip1_first, x_clip1_rest, x_clip2_first, x_clip2_rest], dim=-1)
164
+
165
+ elif x.shape[-1] % 2 == 1:
166
+ x_first, x_rest = x[..., 0], x[..., 1:]
167
+ if x_rest.shape[-1] > 0:
168
+ x_rest = F.avg_pool1d(x_rest, kernel_size=2, stride=2)
169
+
170
+ x = torch.cat([x_first[..., None], x_rest], dim=-1)
171
+ else:
172
+ x = F.avg_pool1d(x, kernel_size=2, stride=2)
173
+ x = rearrange(x, '(b h w) c f -> (b f) c h w', b=batch_size, h=height, w=width)
174
+ return x
175
+
176
+ def forward(
177
+ self,
178
+ camera_states: torch.Tensor,
179
+ ):
180
+ """
181
+ Forward pass: encodes camera states into feature embeddings.
182
+
183
+ Args:
184
+ camera_states (torch.Tensor): Camera state tensor with dimensions
185
+ (batch, frames, channels, height, width)
186
+
187
+ Returns:
188
+ torch.Tensor: Encoded feature embeddings after patch embedding and scaling
189
+ """
190
+ # import pdb;pdb.set_trace()
191
+ batch_size, num_frames, channels, height, width = camera_states.shape
192
+ camera_states = rearrange(camera_states, 'b f c h w -> (b f) c h w')
193
+ camera_states = self.unshuffle(camera_states)
194
+ camera_states = self.encode_first(camera_states)
195
+ camera_states = self.compress_time(camera_states, num_frames=num_frames)
196
+ num_frames = camera_states.shape[0] // batch_size
197
+ camera_states = self.encode_second(camera_states)
198
+ camera_states = self.compress_time(camera_states, num_frames=num_frames)
199
+ # camera_states = rearrange(camera_states, '(b f) c h w -> b f c h w', b=batch_size)
200
+ camera_states = self.final_proj(camera_states)
201
+ camera_states = rearrange(camera_states, "(b f) c h w -> b c f h w", b=batch_size)
202
+ camera_states = self.camera_in(camera_states)
203
+ return camera_states * self.scale
204
+
205
+ @classmethod
206
+ def from_pretrained(cls, pretrained_model_path):
207
+ """
208
+ Loads model from pretrained weight file.
209
+
210
+ Args:
211
+ pretrained_model_path (str): Path to pretrained weight file
212
+
213
+ Returns:
214
+ CameraNet: Model instance with loaded pretrained weights
215
+ """
216
+ if not Path(pretrained_model_path).exists():
217
+ raise FileNotFoundError(f"There is no model file at {pretrained_model_path}")
218
+ print(f"Loading CameraNet's pretrained weights from {pretrained_model_path}.")
219
+
220
+ state_dict = torch.load(pretrained_model_path, map_location="cpu")
221
+ model = CameraNet(in_channels=6, downscale_coef=8, out_channels=16, patch_size=[1, 2, 2], hidden_size=3072) # patch_size/hidden_size assumed to mirror the self-test below
222
+ model.load_state_dict(state_dict, strict=True)
223
+ return model
224
+
225
+
226
+ if __name__ == "__main__":
227
+ # Test model initialization and forward pass
228
+ model = CameraNet(
229
+ in_channels=6,
230
+ downscale_coef=8,
231
+ out_channels=16,
232
+ patch_size=[1,2,2],
233
+ hidden_size=3072
234
+ )
235
+ print("Model structure:")
236
+ print(model)
237
+
238
+ # Generate test input (batch 1, 33 frames, 6 channels, 704x1280 resolution)
239
+ num_frames = 33
240
+ input_tensor = torch.randn(1, num_frames, 6, 704, 1280)
241
+
242
+ # Forward pass
243
+ output_tensor = model(input_tensor)
244
+
245
+ # Print results
246
+ print(f"Output shape: {output_tensor.shape}") # Expected: torch.Size([1, ...])
247
+ print("Output tensor example:")
248
+ print(output_tensor)
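For intuition, `compress_time`'s odd-frame branch keeps the first frame and average-pools the remaining frames in pairs; a tiny sketch of that behaviour on a 5-frame signal:

```python
import torch
import torch.nn.functional as F

frames = torch.arange(5, dtype=torch.float32).view(1, 1, 5)  # (b*h*w, c, f)
first, rest = frames[..., 0], frames[..., 1:]
pooled = F.avg_pool1d(rest, kernel_size=2, stride=2)
out = torch.cat([first[..., None], pooled], dim=-1)
print(out)  # tensor([[[0.0000, 1.5000, 3.5000]]]) -> 5 frames compressed to 3
```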
hymm_sp/modules/embed_layers.py ADDED
@@ -0,0 +1,146 @@
1
+ import math
2
+ import torch
3
+ import torch.nn as nn
4
+ from hymm_sp.helpers import to_2tuple
5
+
6
+
7
+ class PatchEmbed(nn.Module):
8
+ """ 2D Image to Patch Embedding
9
+
10
+ Image to Patch Embedding using Conv2d
11
+
12
+ A convolution based approach to patchifying a 2D image w/ embedding projection.
13
+
14
+ Based on the impl in https://github.com/google-research/vision_transformer
15
+
16
+ Hacked together by / Copyright 2020 Ross Wightman
17
+
18
+ Remove the _assert function in forward function to be compatible with multi-resolution images.
19
+ """
20
+ def __init__(
21
+ self,
22
+ patch_size=16,
23
+ in_chans=3,
24
+ embed_dim=768,
25
+ multitask_mask_training_type=None,
26
+ norm_layer=None,
27
+ flatten=True,
28
+ bias=True,
29
+ dtype=None,
30
+ device=None
31
+ ):
32
+ factory_kwargs = {'dtype': dtype, 'device': device}
33
+ super().__init__()
34
+ patch_size = to_2tuple(patch_size)
35
+ self.patch_size = patch_size
36
+ self.flatten = flatten
37
+
38
+ if multitask_mask_training_type == "concat":
39
+ orig_in_chans = in_chans
40
+ in_chans = in_chans * 2 + 1
41
+
42
+ self.proj = nn.Conv3d(in_chans, embed_dim, kernel_size=patch_size, stride=patch_size, bias=bias,
43
+ **factory_kwargs)
44
+ if multitask_mask_training_type == "concat":
45
+ nn.init.xavier_uniform_(\
46
+ self.proj.weight[:, :orig_in_chans].view(self.proj.weight[:, :orig_in_chans].size(0), -1))
47
+ nn.init.zeros_(self.proj.weight[:, orig_in_chans:].view(self.proj.weight[:, orig_in_chans:].size(0), -1))
48
+ else:
49
+ nn.init.xavier_uniform_(self.proj.weight.view(self.proj.weight.size(0), -1))
50
+
51
+
52
+ if bias:
53
+ nn.init.zeros_(self.proj.bias)
54
+
55
+ self.norm = norm_layer(embed_dim) if norm_layer else nn.Identity()
56
+
57
+ def forward(self, x):
58
+ x = self.proj(x)
59
+ if self.flatten:
60
+ x = x.flatten(2).transpose(1, 2) # BCHW -> BNC
61
+ x = self.norm(x)
62
+ return x
63
+
64
+
65
+ class TextProjection(nn.Module):
66
+ """
67
+ Projects text embeddings. Also handles dropout for classifier-free guidance.
68
+
69
+ Adapted from https://github.com/PixArt-alpha/PixArt-alpha/blob/master/diffusion/model/nets/PixArt_blocks.py
70
+ """
71
+
72
+ def __init__(self, in_channels, hidden_size, act_layer, dtype=None, device=None):
73
+ factory_kwargs = {'dtype': dtype, 'device': device}
74
+ super().__init__()
75
+ self.linear_1 = nn.Linear(in_features=in_channels, out_features=hidden_size, bias=True, **factory_kwargs)
76
+ self.act_1 = act_layer()
77
+ self.linear_2 = nn.Linear(in_features=hidden_size, out_features=hidden_size, bias=True, **factory_kwargs)
78
+
79
+ def forward(self, caption):
80
+ hidden_states = self.linear_1(caption)
81
+ hidden_states = self.act_1(hidden_states)
82
+ hidden_states = self.linear_2(hidden_states)
83
+ return hidden_states
84
+
85
+
86
+ def timestep_embedding(t, dim, max_period=10000):
87
+ """
88
+ Create sinusoidal timestep embeddings.
89
+
90
+ Args:
91
+ t (torch.Tensor): a 1-D Tensor of N indices, one per batch element. These may be fractional.
92
+ dim (int): the dimension of the output.
93
+ max_period (int): controls the minimum frequency of the embeddings.
94
+
95
+ Returns:
96
+ embedding (torch.Tensor): An (N, D) Tensor of positional embeddings.
97
+
98
+ .. ref_link: https://github.com/openai/glide-text2im/blob/main/glide_text2im/nn.py
99
+ """
100
+ half = dim // 2
101
+ freqs = torch.exp(
102
+ -math.log(max_period)
103
+ * torch.arange(start=0, end=half, dtype=torch.float32)
104
+ / half
105
+ ).to(device=t.device)
106
+ args = t[:, None].float() * freqs[None]
107
+ embedding = torch.cat([torch.cos(args), torch.sin(args)], dim=-1)
108
+ if dim % 2:
109
+ embedding = torch.cat(
110
+ [embedding, torch.zeros_like(embedding[:, :1])], dim=-1
111
+ )
112
+ return embedding
113
+
114
+
115
+ class TimestepEmbedder(nn.Module):
116
+ """
117
+ Embeds scalar timesteps into vector representations.
118
+ """
119
+ def __init__(self,
120
+ hidden_size,
121
+ act_layer,
122
+ frequency_embedding_size=256,
123
+ max_period=10000,
124
+ out_size=None,
125
+ dtype=None,
126
+ device=None
127
+ ):
128
+ factory_kwargs = {'dtype': dtype, 'device': device}
129
+ super().__init__()
130
+ self.frequency_embedding_size = frequency_embedding_size
131
+ self.max_period = max_period
132
+ if out_size is None:
133
+ out_size = hidden_size
134
+
135
+ self.mlp = nn.Sequential(
136
+ nn.Linear(frequency_embedding_size, hidden_size, bias=True, **factory_kwargs),
137
+ act_layer(),
138
+ nn.Linear(hidden_size, out_size, bias=True, **factory_kwargs),
139
+ )
140
+ nn.init.normal_(self.mlp[0].weight, std=0.02)
141
+ nn.init.normal_(self.mlp[2].weight, std=0.02)
142
+
143
+ def forward(self, t):
144
+ t_freq = timestep_embedding(t, self.frequency_embedding_size, self.max_period).type(self.mlp[0].weight.dtype)
145
+ t_emb = self.mlp(t_freq)
146
+ return t_emb
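A quick sketch of `timestep_embedding`: it returns `[cos | sin]` features of the (possibly fractional) timestep at geometrically spaced frequencies, so `t = 0` yields all ones in the cosine half:

```python
import torch
from hymm_sp.modules.embed_layers import timestep_embedding

t = torch.tensor([0.0, 250.0, 999.0])
emb = timestep_embedding(t, dim=256)
print(emb.shape)   # torch.Size([3, 256])
print(emb[0, :4])  # tensor([1., 1., 1., 1.]) -- cosine half of t=0
```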
hymm_sp/modules/fp8_optimization.py ADDED
@@ -0,0 +1,246 @@
1
+ # modified from https://github.com/neuralmagic/AutoFP8/blob/main/auto_fp8/quantize.py
2
+ import gc
3
+ from typing import Tuple
4
+ import copy
5
+ import torch
6
+ import tqdm
7
+ import triton
8
+ import triton.language as tl
9
+
10
+
11
+
12
+ def cleanup_memory():
13
+ gc.collect()
14
+ torch.cuda.empty_cache()
15
+
16
+
17
+ def per_tensor_quantize(tensor: torch.Tensor) -> Tuple[torch.Tensor, float]:
18
+ """Quantize a tensor using per-tensor static scaling factor.
19
+ Args:
20
+ tensor: The input tensor.
21
+ """
22
+ finfo = torch.finfo(torch.float8_e4m3fn)
23
+ # Calculate the scale as dtype max divided by absmax.
24
+ # Since .abs() creates a new tensor, we use aminmax to get
25
+ # the min and max first and then calculate the absmax.
26
+ if tensor.numel() == 0:
27
+ # Deal with empty tensors (triggered by empty MoE experts)
28
+ min_val, max_val = (
29
+ torch.tensor(-16.0, dtype=tensor.dtype),
30
+ torch.tensor(16.0, dtype=tensor.dtype),
31
+ )
32
+ else:
33
+ min_val, max_val = tensor.aminmax()
34
+ amax = torch.maximum(min_val.abs(), max_val.abs())
35
+ scale = finfo.max / amax.clamp(min=1e-12)
36
+ # scale and clamp the tensor to bring it to
37
+ # the representative range of float8 data type
38
+ # (as default cast is unsaturated)
39
+ qweight = (tensor * scale).clamp(min=finfo.min, max=finfo.max)
40
+ # Return both float8 data and the inverse scale (as float),
41
+ # as both required as inputs to torch._scaled_mm
42
+ qweight = qweight.to(torch.float8_e4m3fn)
43
+ scale = scale.float().reciprocal()
44
+ return qweight, scale
45
+
46
+
47
+ fp8_gemm_configs = [
48
+ triton.Config({'BLOCK_SIZE_M': block_m,
49
+ 'BLOCK_SIZE_N': block_n,
50
+ 'BLOCK_SIZE_K': 128}, num_stages=num_stages, num_warps=8)
51
+ for block_m in [16, 32, 64] for block_n in [32, 64, 128] for num_stages in [3, 4, 5, 6]
52
+ ]
53
+ @triton.autotune(configs=fp8_gemm_configs, key=['N', 'K'])
54
+ @triton.jit
55
+ def fp8_gemm_kernel(a_ptr, b_ptr, c_ptr,
56
+ a_scale, b_scale, # changed to single per-tensor scale values
57
+ M, N: tl.constexpr, K: tl.constexpr,
58
+ BLOCK_SIZE_M: tl.constexpr,
59
+ BLOCK_SIZE_N: tl.constexpr,
60
+ BLOCK_SIZE_K: tl.constexpr):
61
+ """
62
+ Performs a matrix multiplication operation on FP8 matrices with scaling factors.
63
+ """
64
+ pid_m = tl.program_id(axis=0)
65
+ pid_n = tl.program_id(axis=1)
66
+
67
+ offs_m = (pid_m * BLOCK_SIZE_M + tl.arange(0, BLOCK_SIZE_M)) % M
68
+ offs_n = (pid_n * BLOCK_SIZE_N + tl.arange(0, BLOCK_SIZE_N)) % N
69
+ offs_k = tl.arange(0, BLOCK_SIZE_K)
70
+
71
+ a_ptrs = a_ptr + offs_m[:, None] * K + offs_k[None, :]
72
+ b_ptrs = b_ptr + offs_n[None, :] * K + offs_k[:, None]
73
+
74
+ accumulator = tl.zeros((BLOCK_SIZE_M, BLOCK_SIZE_N), dtype=tl.float32)
75
+
76
+ for i in range(0, K, BLOCK_SIZE_K):
77
+ a = tl.load(a_ptrs, mask=offs_k[None, :] < K - i, other=0.0)
78
+ b = tl.load(b_ptrs, mask=offs_k[:, None] < K - i, other=0.0)
79
+
80
+ accumulator += tl.dot(a, b) * a_scale * b_scale
81
+
82
+ a_ptrs += BLOCK_SIZE_K
83
+ b_ptrs += BLOCK_SIZE_K
84
+
85
+ c = accumulator.to(c_ptr.dtype.element_ty)
86
+ offs_m = pid_m * BLOCK_SIZE_M + tl.arange(0, BLOCK_SIZE_M)
87
+ offs_n = pid_n * BLOCK_SIZE_N + tl.arange(0, BLOCK_SIZE_N)
88
+ c_ptrs = c_ptr + offs_m[:, None] * N + offs_n[None, :]
89
+ mask = (offs_m[:, None] < M) & (offs_n[None, :] < N)
90
+ tl.store(c_ptrs, c, mask=mask)
91
+
92
+
93
+ def triton_fp8_gemm(a: torch.Tensor,
94
+ b: torch.Tensor,
95
+ a_scale: float,
96
+ b_scale: float,
97
+ out_dtype=torch.bfloat16,
98
+ bias=None) -> torch.Tensor:
99
+ """
100
+ Perform a matrix multiplication using FP8 precision with per-tensor quantization.
101
+ """
102
+ assert a.is_contiguous() and b.is_contiguous()
103
+
104
+ K = a.size(-1)
105
+ M = a.numel() // K
106
+ N = b.size(0)
107
+ c = torch.empty((M, N), dtype=out_dtype, device=a.device)
108
+ grid = lambda META: (triton.cdiv(M, META['BLOCK_SIZE_M']), triton.cdiv(N, META['BLOCK_SIZE_N']))
109
+ if isinstance(a_scale, torch.Tensor):
110
+ a_scale = a_scale.item()
111
+ if isinstance(b_scale, torch.Tensor):
112
+ b_scale = b_scale.item()
113
+ # import pdb; pdb.set_trace()
114
+ fp8_gemm_kernel[grid](a, b, c, a_scale, b_scale, M, N, K)
115
+ if bias is not None:
116
+
117
+ c += bias
118
+
119
+ return c
120
+
121
+
122
+ def fp8_gemm(A, A_scale, B, B_scale, bias, out_dtype, native_fp8_support=False):
123
+ """
124
+ Optimized FP8 GEMM implementation, supports both native FP8 and Triton paths,
125
+ and automatically handles 3D input and bias.
126
+ """
127
+ if A.numel() == 0:
128
+ # Handle empty tensor (e.g., when MoE expert is empty)
129
+ return torch.empty(size=(0, B.shape[0]), dtype=out_dtype, device=A.device)
130
+
131
+ # Check if reshape is needed (support for 3D input)
132
+ need_reshape = (A.dim() == 3)
133
+ batch_size = A.shape[0] if need_reshape else None
134
+ A_input = A.reshape(-1, A.shape[-1]).contiguous() if need_reshape else A
135
+
136
+ if native_fp8_support:
137
+ # Native FP8 support
138
+ output = torch._scaled_mm(
139
+ A_input,
140
+ B.t(),
141
+ out_dtype=out_dtype,
142
+ scale_a=torch.tensor(A_scale) if not isinstance(A_scale, torch.Tensor) else A_scale,
143
+ scale_b=torch.tensor(B_scale) if not isinstance(B_scale, torch.Tensor) else B_scale,
144
+ bias=bias.to(out_dtype) if bias is not None else None,
145
+ )
146
+ else:
147
+ # Triton implementation
148
+ output = triton_fp8_gemm(
149
+ A_input,
150
+ B.contiguous(),
151
+ out_dtype=out_dtype,
152
+ a_scale=A_scale,
153
+ b_scale=B_scale,
154
+ bias=None,
155
+ )
156
+ if bias is not None:
157
+ output += bias
158
+
159
+ if need_reshape:
160
+ # Restore original batch dimension
161
+ output = output.reshape(batch_size, -1, output.shape[-1])
162
+
163
+ return output
164
+
165
+
166
+ # Class responsible for quantizing weights
167
+ class FP8DynamicLinear(torch.nn.Module):
168
+ def __init__(
169
+ self,
170
+ weight: torch.Tensor,
171
+ weight_scale: torch.Tensor,
172
+ bias: torch.nn.Parameter,
173
+ native_fp8_support: bool = False,
174
+ name: str = ""
175
+ ):
176
+ super().__init__()
177
+ self.weight = torch.nn.Parameter(weight, requires_grad=False)
178
+ self.weight_scale = torch.nn.Parameter(weight_scale, requires_grad=False)
179
+ self.bias = bias
180
+ self.native_fp8_support = native_fp8_support
181
+ self.name = name
182
+ # @torch.compile
183
+ def forward(self, x):
184
+ if x.dtype != torch.float16 and x.dtype != torch.bfloat16:
185
+ # print(f"Warning: {self.name}'s input is not quantized to float16 or bfloat16")
186
+ # print(f"input dtype: {x.dtype}")
187
+ x = x.to(torch.bfloat16)
188
+ qinput, x_scale = per_tensor_quantize(x)
189
+ # print("--------------")
190
+ # print("layer_name:", self.name)
191
+ # print("A_input.shape:", qinput.shape)
192
+ # print("B.shape:", self.weight.shape)
193
+ # print("--------------")
194
+ output = fp8_gemm(
195
+ A=qinput,
196
+ A_scale=x_scale,
197
+ B=self.weight,
198
+ B_scale=self.weight_scale,
199
+ bias=self.bias,
200
+ out_dtype=x.dtype,
201
+ native_fp8_support=self.native_fp8_support,
202
+ )
203
+ return output
204
+
205
+
206
+ def replace_module(model: torch.nn.Module, name: str, new_module: torch.nn.Module):
207
+ if "." in name:
208
+ parent_name = name.rsplit(".", 1)[0]
209
+ child_name = name[len(parent_name) + 1 :]
210
+ parent = model.get_submodule(parent_name)
211
+ else:
212
+ parent_name = ""
213
+ parent = model
214
+ child_name = name
215
+ setattr(parent, child_name, new_module)
216
+
217
+
218
+ def convert_fp8_linear(model: torch.nn.Module):
219
+ # native_fp8_support = (
220
+ # torch.cuda.is_available() and torch.cuda.get_device_capability() >= (9, 0)
221
+ # )
222
+ native_fp8_support = False
223
+ named_modules = list(model.named_modules())
224
+ for name, linear in tqdm.tqdm(named_modules, desc="Quantizing weights"):
225
+ if not isinstance(linear, torch.nn.Linear):
226
+ continue
227
+ if "mod" in name:
228
+ print(f"Warning: {name} is a mod module, skipping")
229
+ continue
230
+ if "block" not in name:
231
+ print(f"Warning: {name} is not in a block module, skipping")
232
+ continue
233
+ quant_weight, weight_scale = per_tensor_quantize(linear.weight)
234
+ bias = copy.deepcopy(linear.bias) if linear.bias is not None else None
235
+ quant_linear = FP8DynamicLinear(
236
+ weight=quant_weight,
237
+ weight_scale=weight_scale,
238
+ bias=bias,
239
+ native_fp8_support=native_fp8_support,
240
+ name = name
241
+ )
242
+ replace_module(model, name, quant_linear)
243
+ del linear.weight
244
+ del linear.bias
245
+ del linear
246
+ cleanup_memory()
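A hedged round-trip sketch for `per_tensor_quantize` (note that the returned scale is the *inverse* scale, so dequantization is a single multiply). This assumes `triton` is importable, since the module imports it at load time:

```python
import torch
from hymm_sp.modules.fp8_optimization import per_tensor_quantize

w = torch.randn(128, 64, dtype=torch.bfloat16)
qweight, inv_scale = per_tensor_quantize(w)

w_hat = qweight.to(torch.float32) * inv_scale   # dequantize
print(qweight.dtype)                            # torch.float8_e4m3fn
print((w.float() - w_hat).abs().max().item())   # small 8-bit quantization error
```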
hymm_sp/modules/mlp_layers.py ADDED
@@ -0,0 +1,97 @@
1
+ # Modified from timm library:
2
+ # https://github.com/huggingface/pytorch-image-models/blob/648aaa41233ba83eb38faf5ba9d415d574823241/timm/layers/mlp.py#L13
3
+
4
+ from functools import partial
5
+
6
+ import torch
7
+ import torch.nn as nn
8
+
9
+ from .modulate_layers import modulate
10
+ from hymm_sp.helpers import to_2tuple
11
+
12
+
13
+ class MLP(nn.Module):
14
+ """ MLP as used in Vision Transformer, MLP-Mixer and related networks
15
+ """
16
+ def __init__(self,
17
+ in_channels,
18
+ hidden_channels=None,
19
+ out_features=None,
20
+ act_layer=nn.GELU,
21
+ norm_layer=None,
22
+ bias=True,
23
+ drop=0.,
24
+ use_conv=False,
25
+ device=None,
26
+ dtype=None
27
+ ):
28
+ factory_kwargs = {'device': device, 'dtype': dtype}
29
+ super().__init__()
30
+ out_features = out_features or in_channels
31
+ hidden_channels = hidden_channels or in_channels
32
+ bias = to_2tuple(bias)
33
+ drop_probs = to_2tuple(drop)
34
+ linear_layer = partial(nn.Conv2d, kernel_size=1) if use_conv else nn.Linear
35
+
36
+ self.fc1 = linear_layer(in_channels, hidden_channels, bias=bias[0], **factory_kwargs)
37
+ self.act = act_layer()
38
+ self.drop1 = nn.Dropout(drop_probs[0])
39
+ self.norm = norm_layer(hidden_channels, **factory_kwargs) if norm_layer is not None else nn.Identity()
40
+ self.fc2 = linear_layer(hidden_channels, out_features, bias=bias[1], **factory_kwargs)
41
+ self.drop2 = nn.Dropout(drop_probs[1])
42
+
43
+ def forward(self, x):
44
+ x = self.fc1(x)
45
+ x = self.act(x)
46
+ x = self.drop1(x)
47
+ x = self.norm(x)
48
+ x = self.fc2(x)
49
+ x = self.drop2(x)
50
+ return x
51
+
52
+
53
+ class MLPEmbedder(nn.Module):
54
+ """copied from https://github.com/black-forest-labs/flux/blob/main/src/flux/modules/layers.py"""
55
+ def __init__(self, in_dim: int, hidden_dim: int, device=None, dtype=None):
56
+ factory_kwargs = {'device': device, 'dtype': dtype}
57
+ super().__init__()
58
+ self.in_layer = nn.Linear(in_dim, hidden_dim, bias=True, **factory_kwargs)
59
+ self.silu = nn.SiLU()
60
+ self.out_layer = nn.Linear(hidden_dim, hidden_dim, bias=True, **factory_kwargs)
61
+
62
+ def forward(self, x: torch.Tensor) -> torch.Tensor:
63
+ return self.out_layer(self.silu(self.in_layer(x)))
64
+
65
+
66
+ class FinalLayer(nn.Module):
67
+ """The final layer of DiT."""
68
+
69
+ def __init__(self, hidden_size, patch_size, out_channels, act_layer, device=None, dtype=None):
70
+ factory_kwargs = {'device': device, 'dtype': dtype}
71
+ super().__init__()
72
+
73
+ # Just use LayerNorm for the final layer
74
+ self.norm_final = nn.LayerNorm(hidden_size, elementwise_affine=False, eps=1e-6, **factory_kwargs)
75
+ if isinstance(patch_size, int):
76
+ self.linear = nn.Linear(hidden_size, patch_size * patch_size * out_channels, bias=True, **factory_kwargs)
77
+ else:
78
+ self.linear = nn.Linear(hidden_size,
79
+ patch_size[0] * patch_size[1] * patch_size[2] * out_channels,
80
+ bias=True)
81
+ nn.init.zeros_(self.linear.weight)
82
+ nn.init.zeros_(self.linear.bias)
83
+
84
+ # Here we don't distinguish between the modulate types. Just use the simple one.
85
+ self.adaLN_modulation = nn.Sequential(
86
+ act_layer(),
87
+ nn.Linear(hidden_size, 2 * hidden_size, bias=True, **factory_kwargs)
88
+ )
89
+ # Zero-initialize the modulation
90
+ nn.init.zeros_(self.adaLN_modulation[1].weight)
91
+ nn.init.zeros_(self.adaLN_modulation[1].bias)
92
+
93
+ def forward(self, x, c):
94
+ shift, scale = self.adaLN_modulation(c).chunk(2, dim=1)
95
+ x = modulate(self.norm_final(x), shift=shift, scale=scale)
96
+ x = self.linear(x)
97
+ return x
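Usage sketch for `MLPEmbedder`, the small SiLU MLP used to project conditioning vectors to the transformer width (the dimensions here are illustrative only):

```python
import torch
from hymm_sp.modules.mlp_layers import MLPEmbedder

embedder = MLPEmbedder(in_dim=256, hidden_dim=3072)
vec = torch.randn(2, 256)
print(embedder(vec).shape)  # torch.Size([2, 3072])
```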
hymm_sp/modules/models.py ADDED
@@ -0,0 +1,697 @@
1
+ from typing import List, Tuple, Optional, Union, Dict
2
+ from einops import rearrange
3
+
4
+ import torch, os
5
+ import torch.nn as nn
6
+ import torch.nn.functional as F
7
+ from diffusers.models import ModelMixin
8
+ from diffusers.configuration_utils import ConfigMixin, register_to_config
9
+ from flash_attn.flash_attn_interface import flash_attn_varlen_func
10
+
11
+ from .activation_layers import get_activation_layer
12
+ from .norm_layers import get_norm_layer
13
+ from .embed_layers import TimestepEmbedder, PatchEmbed, TextProjection
14
+ from .attn_layers import apply_rotary_emb
15
+ from .mlp_layers import MLP, MLPEmbedder, FinalLayer
16
+ from .modulate_layers import ModulateDiT, modulate, apply_gate
17
+ from .token_refiner import SingleTokenRefiner
18
+ from .cameranet import CameraNet
19
+
20
+ from .parallel_states import (
21
+ nccl_info,
22
+ get_cu_seqlens,
23
+ get_sequence_parallel_state,
24
+ parallel_attention,
25
+ all_gather,
26
+ )
27
+
28
+ CPU_OFFLOAD = int(os.environ.get("CPU_OFFLOAD", 0))
29
+ DISABLE_SP = int(os.environ.get("DISABLE_SP", 0))
30
+ print(f'models: cpu_offload={CPU_OFFLOAD}, DISABLE_SP={DISABLE_SP}')
31
+
32
+ class DoubleStreamBlock(nn.Module):
33
+ def __init__(
34
+ self,
35
+ hidden_size: int,
36
+ num_heads: int,
37
+ mlp_width_ratio: float,
38
+ mlp_act_type: str = 'gelu_tanh',
39
+ qk_norm: bool = True,
40
+ qk_norm_type: str = 'rms',
41
+ qkv_bias: bool = False,
42
+ dtype: Optional[torch.dtype] = None,
43
+ device: Optional[torch.device] = None,
44
+ ):
45
+ factory_kwargs = {'device': device, 'dtype': dtype}
46
+ super().__init__()
47
+
48
+ self.deterministic = False
49
+ self.num_heads = num_heads
50
+ head_dim = hidden_size // num_heads
51
+ mlp_hidden_dim = int(hidden_size * mlp_width_ratio)
52
+
53
+ self.img_mod = ModulateDiT(hidden_size, factor=6, act_layer=get_activation_layer("silu"), **factory_kwargs)
54
+ self.img_norm1 = nn.LayerNorm(hidden_size, elementwise_affine=False, eps=1e-6, **factory_kwargs)
55
+
56
+ self.img_attn_qkv = nn.Linear(hidden_size, hidden_size * 3, bias=qkv_bias, **factory_kwargs)
57
+ qk_norm_layer = get_norm_layer(qk_norm_type)
58
+ self.img_attn_q_norm = (
59
+ qk_norm_layer(head_dim, elementwise_affine=True, eps=1e-6, **factory_kwargs)
60
+ if qk_norm
61
+ else nn.Identity()
62
+ )
63
+ self.img_attn_k_norm = (
64
+ qk_norm_layer(head_dim, elementwise_affine=True, eps=1e-6, **factory_kwargs)
65
+ if qk_norm
66
+ else nn.Identity()
67
+ )
68
+ self.img_attn_proj = nn.Linear(hidden_size, hidden_size, bias=qkv_bias, **factory_kwargs)
69
+
70
+ self.img_norm2 = nn.LayerNorm(hidden_size, elementwise_affine=False, eps=1e-6, **factory_kwargs)
71
+ self.img_mlp = MLP(
72
+ hidden_size,
73
+ mlp_hidden_dim,
74
+ act_layer=get_activation_layer(mlp_act_type),
75
+ bias=True,
76
+ **factory_kwargs
77
+ )
78
+
79
+ self.txt_mod = ModulateDiT(hidden_size, factor=6, act_layer=get_activation_layer("silu"), **factory_kwargs)
80
+ self.txt_norm1 = nn.LayerNorm(hidden_size, elementwise_affine=False, eps=1e-6, **factory_kwargs)
81
+
82
+ self.txt_attn_qkv = nn.Linear(hidden_size, hidden_size * 3, bias=qkv_bias, **factory_kwargs)
83
+ qk_norm_layer = get_norm_layer(qk_norm_type)
84
+ self.txt_attn_q_norm = (
85
+ qk_norm_layer(head_dim, elementwise_affine=True, eps=1e-6, **factory_kwargs)
86
+ if qk_norm
87
+ else nn.Identity()
88
+ )
89
+ self.txt_attn_k_norm = (
90
+ qk_norm_layer(head_dim, elementwise_affine=True, eps=1e-6, **factory_kwargs)
91
+ if qk_norm
92
+ else nn.Identity()
93
+ )
94
+ self.txt_attn_proj = nn.Linear(hidden_size, hidden_size, bias=qkv_bias, **factory_kwargs)
95
+
96
+ self.txt_norm2 = nn.LayerNorm(hidden_size, elementwise_affine=False, eps=1e-6, **factory_kwargs)
97
+ self.txt_mlp = MLP(
98
+ hidden_size,
99
+ mlp_hidden_dim,
100
+ act_layer=get_activation_layer(mlp_act_type),
101
+ bias=True,
102
+ **factory_kwargs
103
+ )
104
+
105
+ def enable_deterministic(self):
106
+ self.deterministic = True
107
+
108
+ def disable_deterministic(self):
109
+ self.deterministic = False
110
+
111
+ def forward(
112
+ self,
113
+ img: torch.Tensor,
114
+ txt: torch.Tensor,
115
+ vec: torch.Tensor,
116
+ cu_seqlens_q: Optional[torch.Tensor] = None,
117
+ cu_seqlens_kv: Optional[torch.Tensor] = None,
118
+ max_seqlen_q: Optional[int] = None,
119
+ max_seqlen_kv: Optional[int] = None,
120
+ freqs_cis: tuple = None,
121
+ use_sage: bool = True,
122
+ ) -> Tuple[torch.Tensor, torch.Tensor]:
123
+ img_mod1_shift, img_mod1_scale, img_mod1_gate, img_mod2_shift, img_mod2_scale, img_mod2_gate = (
124
+ self.img_mod(vec).chunk(6, dim=-1)
125
+ )
126
+ txt_mod1_shift, txt_mod1_scale, txt_mod1_gate, txt_mod2_shift, txt_mod2_scale, txt_mod2_gate = (
127
+ self.txt_mod(vec).chunk(6, dim=-1)
128
+ )
129
+ if CPU_OFFLOAD: torch.cuda.empty_cache()
130
+
131
+ # Prepare image for attention.
132
+ img_modulated = self.img_norm1(img)
133
+ img_modulated = modulate(img_modulated, shift=img_mod1_shift, scale=img_mod1_scale)
134
+ img_qkv = self.img_attn_qkv(img_modulated)
135
+ if CPU_OFFLOAD: torch.cuda.empty_cache()
136
+ img_q, img_k, img_v = rearrange(img_qkv, "B L (K H D) -> K B L H D", K=3, H=self.num_heads)
137
+ # Apply QK-Norm if needed
138
+ img_q = self.img_attn_q_norm(img_q).to(img_v)
139
+ img_k = self.img_attn_k_norm(img_k).to(img_v)
140
+ if CPU_OFFLOAD: torch.cuda.empty_cache()
141
+
142
+ # Apply RoPE if needed.
143
+ if freqs_cis is not None:
144
+ img_qq, img_kk = apply_rotary_emb(img_q, img_k, freqs_cis, head_first=False)
145
+ assert img_qq.shape == img_q.shape and img_kk.shape == img_k.shape, \
146
+ f'img_qq: {img_qq.shape}, img_q: {img_q.shape}, img_kk: {img_kk.shape}, img_k: {img_k.shape}'
147
+ img_q, img_k = img_qq, img_kk
148
+
149
+ # Prepare txt for attention.
150
+ txt_modulated = self.txt_norm1(txt)
151
+ txt_modulated = modulate(txt_modulated, shift=txt_mod1_shift, scale=txt_mod1_scale)
152
+ if CPU_OFFLOAD: torch.cuda.empty_cache()
153
+ txt_qkv = self.txt_attn_qkv(txt_modulated)
154
+ txt_q, txt_k, txt_v = rearrange(txt_qkv, "B L (K H D) -> K B L H D", K=3, H=self.num_heads)
155
+ # Apply QK-Norm if needed.
156
+ txt_q = self.txt_attn_q_norm(txt_q).to(txt_v)
157
+ txt_k = self.txt_attn_k_norm(txt_k).to(txt_v)
158
+ if CPU_OFFLOAD: torch.cuda.empty_cache()
159
+
160
+ # Run actual attention.
161
+ q = torch.cat((img_q, txt_q), dim=1)
162
+ k = torch.cat((img_k, txt_k), dim=1)
163
+ v = torch.cat((img_v, txt_v), dim=1)
164
+
165
+ # Compute attention.
166
+ if DISABLE_SP:
167
+ assert cu_seqlens_q.shape[0] == 2 * img.shape[0] + 1
168
+
169
+ q, k, v = [
170
+ x.view(x.shape[0] * x.shape[1], *x.shape[2:])
171
+ for x in [q, k, v]
172
+ ]
173
+ attn = flash_attn_varlen_func(
174
+ q,
175
+ k,
176
+ v,
177
+ cu_seqlens_q,
178
+ cu_seqlens_kv,
179
+ max_seqlen_q,
180
+ max_seqlen_kv,
181
+ )
182
+ attn = attn.view(img_k.shape[0], max_seqlen_q, -1).contiguous()
183
+ else:
184
+ attn, _ = parallel_attention(
185
+ (img_q, txt_q),
186
+ (img_k, txt_k),
187
+ (img_v, txt_v),
188
+ img_q_len=img_q.shape[1],
189
+ img_kv_len=img_k.shape[1],
190
+ cu_seqlens_q=cu_seqlens_q,
191
+ cu_seqlens_kv=cu_seqlens_kv,
192
+ max_seqlen_q=max_seqlen_q,
193
+ max_seqlen_kv=max_seqlen_kv,
194
+ use_sage=use_sage,
195
+ )
196
+ img_attn, txt_attn = attn[:, :img.shape[1]], attn[:, img.shape[1]:]
197
+
198
+ if CPU_OFFLOAD: torch.cuda.empty_cache()
199
+
200
+ # Calculate the img blocks.
201
+ img = img + apply_gate(self.img_attn_proj(img_attn), gate=img_mod1_gate)
202
+ img = img + apply_gate(self.img_mlp(modulate(
203
+ self.img_norm2(img),
204
+ shift=img_mod2_shift,
205
+ scale=img_mod2_scale)), gate=img_mod2_gate)
206
+ if CPU_OFFLOAD: torch.cuda.empty_cache()
207
+ # Calculate the txt blocks.
208
+ txt = txt + apply_gate(self.txt_attn_proj(txt_attn), gate=txt_mod1_gate)
209
+ txt = txt + apply_gate(self.txt_mlp(modulate(self.txt_norm2(txt),
210
+ shift=txt_mod2_shift,
211
+ scale=txt_mod2_scale)), gate=txt_mod2_gate)
212
+ if CPU_OFFLOAD: torch.cuda.empty_cache()
213
+ return img, txt
214
+
215
+
216
+ class SingleStreamBlock(nn.Module):
217
+ """
218
+ A DiT block with parallel linear layers as described in
219
+ https://arxiv.org/abs/2302.05442 and adapted modulation interface.
220
+ """
221
+
222
+ def __init__(
223
+ self,
224
+ hidden_size: int,
225
+ num_heads: int,
226
+ mlp_width_ratio: float = 4.0,
227
+ mlp_act_type: str = 'gelu_tanh',
228
+ qk_norm: bool = True,
229
+ qk_norm_type: str = 'rms',
230
+ qk_scale: float = None,
231
+ dtype: Optional[torch.dtype] = None,
232
+ device: Optional[torch.device] = None,
233
+ ):
234
+ factory_kwargs = {'device': device, 'dtype': dtype}
235
+ super().__init__()
236
+
237
+ self.deterministic = False
238
+ self.hidden_size = hidden_size
239
+ self.num_heads = num_heads
240
+ head_dim = hidden_size // num_heads
241
+ mlp_hidden_dim = int(hidden_size * mlp_width_ratio)
242
+ self.mlp_hidden_dim = mlp_hidden_dim
243
+ self.scale = qk_scale or head_dim**-0.5
244
+
245
+ # qkv and mlp_in
246
+ self.linear1 = nn.Linear(hidden_size, hidden_size * 3 + mlp_hidden_dim, **factory_kwargs)
247
+ # proj and mlp_out
248
+ self.linear2 = nn.Linear(hidden_size + mlp_hidden_dim, hidden_size, **factory_kwargs)
249
+
250
+ qk_norm_layer = get_norm_layer(qk_norm_type)
251
+ self.q_norm = (
252
+ qk_norm_layer(head_dim, elementwise_affine=True, eps=1e-6, **factory_kwargs)
253
+ if qk_norm
254
+ else nn.Identity()
255
+ )
256
+ self.k_norm = (
257
+ qk_norm_layer(head_dim, elementwise_affine=True, eps=1e-6, **factory_kwargs)
258
+ if qk_norm
259
+ else nn.Identity()
260
+ )
261
+
262
+ self.pre_norm = nn.LayerNorm(hidden_size, elementwise_affine=False, eps=1e-6, **factory_kwargs)
263
+
264
+ self.mlp_act = get_activation_layer(mlp_act_type)()
265
+ self.modulation = ModulateDiT(hidden_size, factor=3, act_layer=get_activation_layer("silu"), **factory_kwargs)
266
+
267
+ def enable_deterministic(self):
268
+ self.deterministic = True
269
+
270
+ def disable_deterministic(self):
271
+ self.deterministic = False
272
+
273
+ def forward(
274
+ self,
275
+ x: torch.Tensor,
276
+ vec: torch.Tensor,
277
+ txt_len: int,
278
+ cu_seqlens_q: Optional[torch.Tensor] = None,
279
+ cu_seqlens_kv: Optional[torch.Tensor] = None,
280
+ max_seqlen_q: Optional[int] = None,
281
+ max_seqlen_kv: Optional[int] = None,
282
+ freqs_cis: Tuple[torch.Tensor, torch.Tensor] = None,
283
+ use_sage: bool = True,
284
+ ) -> torch.Tensor:
285
+ mod_shift, mod_scale, mod_gate = (
286
+ self.modulation(vec).chunk(3, dim=-1)
287
+ )
288
+ x_mod = modulate(self.pre_norm(x), shift=mod_shift, scale=mod_scale)
289
+ if CPU_OFFLOAD: torch.cuda.empty_cache()
290
+ qkv, mlp = torch.split(self.linear1(x_mod), [3 * self.hidden_size, self.mlp_hidden_dim], dim=-1)
291
+
292
+ q, k, v = rearrange(qkv, "B L (K H D) -> K B L H D", K=3, H=self.num_heads)
293
+ if CPU_OFFLOAD: torch.cuda.empty_cache()
294
+
295
+ # Apply QK-Norm if needed.
296
+ q = self.q_norm(q).to(v)
297
+ k = self.k_norm(k).to(v)
298
+ if CPU_OFFLOAD: torch.cuda.empty_cache()
299
+
300
+ # Apply RoPE if needed.
301
+ if freqs_cis is not None:
302
+ img_q, txt_q = q[:, :-txt_len, :, :], q[:, -txt_len:, :, :]
303
+ img_k, txt_k = k[:, :-txt_len, :, :], k[:, -txt_len:, :, :]
304
+ img_qq, img_kk = apply_rotary_emb(img_q, img_k, freqs_cis, head_first=False)
305
+ assert img_qq.shape == img_q.shape and img_kk.shape == img_k.shape, \
306
+ f'img_qq: {img_qq.shape}, img_q: {img_q.shape}, img_kk: {img_kk.shape}, img_k: {img_k.shape}'
307
+ img_q, img_k = img_qq, img_kk
308
+ q = torch.cat((img_q, txt_q), dim=1)
309
+ k = torch.cat((img_k, txt_k), dim=1)
310
+
311
+ if CPU_OFFLOAD: torch.cuda.empty_cache()
312
+
313
+ # Compute attention.
314
+ if DISABLE_SP:
315
+ assert cu_seqlens_q.shape[0] == 2 * x.shape[0] + 1, \
316
+ f"cu_seqlens_q.shape:{cu_seqlens_q.shape}, x.shape[0]:{x.shape[0]}"
317
+ # [b, s+l, a, d] -> [s+l, b, a, d]
318
+ q, k, v = [
319
+ x.view(x.shape[0] * x.shape[1], *x.shape[2:])
320
+ for x in [q, k, v]
321
+ ]
322
+
323
+ attn = flash_attn_varlen_func(
324
+ q,
325
+ k,
326
+ v,
327
+ cu_seqlens_q,
328
+ cu_seqlens_kv,
329
+ max_seqlen_q,
330
+ max_seqlen_kv,
331
+ )
332
+ attn = attn.view(x.shape[0], max_seqlen_q, -1).contiguous()
333
+ else:
334
+ img_v, txt_v = v[:, :-txt_len, :, :], v[:, -txt_len:, :, :]
335
+ attn, _ = parallel_attention(
336
+ (img_q, txt_q),
337
+ (img_k, txt_k),
338
+ (img_v, txt_v),
339
+ img_q_len=img_q.shape[1],
340
+ img_kv_len=img_k.shape[1],
341
+ cu_seqlens_q=cu_seqlens_q,
342
+ cu_seqlens_kv=cu_seqlens_kv,
343
+ max_seqlen_q=max_seqlen_q,
344
+ max_seqlen_kv=max_seqlen_kv,
345
+ use_sage=use_sage,
346
+ )
347
+ if CPU_OFFLOAD:
348
+ torch.cuda.empty_cache()
349
+ tmp = torch.cat((attn, self.mlp_act(mlp)), 2)
350
+ torch.cuda.empty_cache()
351
+ output = self.linear2(tmp)
352
+ else:
353
+ output = self.linear2(torch.cat((attn, self.mlp_act(mlp)), 2))
354
+ return x + apply_gate(output, gate=mod_gate)
355
+
356
+
357
+ class HYVideoDiffusionTransformer(ModelMixin, ConfigMixin):
358
+ """
359
+ HunyuanVideo Transformer backbone
360
+
361
+ Inherited from ModelMixin and ConfigMixin for compatibility with diffusers' sampler StableDiffusionPipeline.
362
+
363
+ Reference:
364
+ [1] Flux.1: https://github.com/black-forest-labs/flux
365
+ [2] MMDiT: http://arxiv.org/abs/2403.03206,
366
+ https://github.com/huggingface/diffusers/blob/main/src/diffusers/pipelines/stable_diffusion_3/pipeline_stable_diffusion_3.py
367
+
368
+ """
369
+ @register_to_config
370
+ def __init__(
371
+ self,
372
+ args,
373
+ patch_size: list = [1,2,2],
374
+ in_channels: int = 4, # Should be VAE.config.latent_channels.
375
+ out_channels: int = None,
376
+ hidden_size: int = 3072,
377
+ mlp_width_ratio: float = 4.0,
378
+ mlp_act_type: str = 'gelu_tanh',
379
+ num_heads: int = 24,
380
+ depth_double_blocks: int = 19,
381
+ depth_single_blocks: int = 38,
382
+ rope_dim_list: List[int] = [16, 56, 56],
383
+ qkv_bias: bool = True,
384
+ qk_norm: bool = True,
385
+ qk_norm_type: str = 'rms',
386
+ guidance_embed: bool = False, # For modulation.
387
+ dtype: Optional[torch.dtype] = None,
388
+ device: Optional[torch.device] = None,
389
+ multitask_mask_training_type: Optional[str] = None,
390
+ camera_in_channels: int = 6,
391
+ camera_down_coef: int = 8,
392
+ ):
393
+ factory_kwargs = {'device': device, 'dtype': dtype}
394
+ super().__init__()
395
+
396
+ # Text projection. Default to linear projection.
397
+ # Alternative: TokenRefiner. See more details (LI-DiT): http://arxiv.org/abs/2406.11831
398
+ self.text_projection = args.text_projection
399
+ self.text_states_dim = args.text_states_dim
400
+ self.use_attention_mask = args.use_attention_mask
401
+ self.text_states_dim_2 = args.text_states_dim_2
402
+
403
+ # Only the configurations above are taken from args.
404
+ self.patch_size = patch_size
405
+ self.in_channels = in_channels
406
+ self.out_channels = in_channels if out_channels is None else out_channels
407
+ self.unpatchify_channels = self.out_channels
408
+ self.guidance_embed = guidance_embed
409
+ self.rope_dim_list = rope_dim_list
410
+ self.multitask_mask_training_type = multitask_mask_training_type
411
+
412
+ if hidden_size % num_heads != 0:
413
+ raise ValueError(
414
+ f"Hidden size {hidden_size} must be divisible by num_heads {num_heads}"
415
+ )
416
+ pe_dim = hidden_size // num_heads
417
+ if sum(rope_dim_list) != pe_dim:
418
+ raise ValueError(f"Got {rope_dim_list} but expected positional dim {pe_dim}")
419
+ self.hidden_size = hidden_size
420
+ self.num_heads = num_heads
421
+
422
+ # image projection
423
+ self.img_in = PatchEmbed(
424
+ self.patch_size, self.in_channels, self.hidden_size, self.multitask_mask_training_type, **factory_kwargs
425
+ )
426
+
427
+ # text projection
428
+ if self.text_projection == "linear":
429
+ self.txt_in = TextProjection(
430
+ self.text_states_dim,
431
+ self.hidden_size,
432
+ get_activation_layer("silu"),
433
+ **factory_kwargs
434
+ )
435
+ elif self.text_projection == "single_refiner":
436
+ self.txt_in = SingleTokenRefiner(
437
+ self.text_states_dim, hidden_size, num_heads, depth=2, **factory_kwargs
438
+ )
439
+ else:
440
+ raise NotImplementedError(f"Unsupported text_projection: {self.text_projection}")
441
+
442
+ # time modulation
443
+ self.time_in = TimestepEmbedder(
444
+ self.hidden_size, get_activation_layer("silu"), **factory_kwargs
445
+ )
446
+
447
+ # text modulation
448
+ self.vector_in = MLPEmbedder(
449
+ self.text_states_dim_2, self.hidden_size, **factory_kwargs
450
+ )
451
+
452
+ # guidance modulation
453
+ self.guidance_in = TimestepEmbedder(
454
+ self.hidden_size, get_activation_layer("silu"), **factory_kwargs
455
+ ) if guidance_embed else None
456
+
457
+ # double blocks
458
+ self.double_blocks = nn.ModuleList(
459
+ [
460
+ DoubleStreamBlock(
461
+ self.hidden_size,
462
+ self.num_heads,
463
+ mlp_width_ratio=mlp_width_ratio,
464
+ mlp_act_type=mlp_act_type,
465
+ qk_norm=qk_norm,
466
+ qk_norm_type=qk_norm_type,
467
+ qkv_bias=qkv_bias,
468
+ **factory_kwargs
469
+ )
470
+ for _ in range(depth_double_blocks)
471
+ ]
472
+ )
473
+
474
+ # single blocks
475
+ self.single_blocks = nn.ModuleList(
476
+ [
477
+ SingleStreamBlock(
478
+ self.hidden_size,
479
+ self.num_heads,
480
+ mlp_width_ratio=mlp_width_ratio,
481
+ mlp_act_type=mlp_act_type,
482
+ qk_norm=qk_norm,
483
+ qk_norm_type=qk_norm_type,
484
+ **factory_kwargs
485
+ )
486
+ for _ in range(depth_single_blocks)
487
+ ]
488
+ )
489
+
490
+ self.final_layer = FinalLayer(
491
+ self.hidden_size,
492
+ self.patch_size,
493
+ self.out_channels,
494
+ get_activation_layer("silu"),
495
+ **factory_kwargs
496
+ )
497
+
498
+ self.camera_net = CameraNet(in_channels=camera_in_channels,
499
+ out_channels=in_channels,
500
+ downscale_coef=camera_down_coef,
501
+ patch_size=self.patch_size,
502
+ hidden_size=self.hidden_size,
503
+ )
504
+
505
+ def enable_deterministic(self):
506
+ for block in self.double_blocks:
507
+ block.enable_deterministic()
508
+ for block in self.single_blocks:
509
+ block.enable_deterministic()
510
+
511
+ def disable_deterministic(self):
512
+ for block in self.double_blocks:
513
+ block.disable_deterministic()
514
+ for block in self.single_blocks:
515
+ block.disable_deterministic()
516
+
517
+ def forward(
518
+ self,
519
+ x: torch.Tensor,
520
+ t: torch.Tensor, # Should be in range(0, 1000).
521
+ text_states: torch.Tensor = None,
522
+ text_mask: torch.Tensor = None, # Now we don't use it.
523
+ text_states_2: Optional[torch.Tensor] = None, # Text embedding for modulation.
524
+ freqs_cos: Optional[torch.Tensor] = None,
525
+ freqs_sin: Optional[torch.Tensor] = None,
526
+ guidance: torch.Tensor = None, # Guidance for modulation, should be cfg_scale x 1000.
527
+ return_dict: bool = True,
528
+ is_cache: bool = False,
529
+ cam_latents = None,
530
+ use_sage: bool = False,
531
+ ) -> Union[torch.Tensor, Dict[str, torch.Tensor]]:
532
+ out = {}
533
+ img = x
534
+ txt = text_states
535
+ _, _, ot, oh, ow = x.shape
536
+ tt, th, tw = ot // self.patch_size[0], oh // self.patch_size[1], ow // self.patch_size[2]
537
+
538
+ # Prepare modulation vectors.
539
+ vec = self.time_in(t)
540
+
541
+ # text modulation
542
+ vec = vec + self.vector_in(text_states_2)
543
+
544
+ # guidance modulation
545
+ if self.guidance_embed:
546
+ if guidance is None:
547
+ raise ValueError("Didn't get guidance strength for guidance distilled model.")
548
+ else:
549
+ # our timestep_embedding is merged into guidance_in(TimestepEmbedder)
550
+ vec = vec + self.guidance_in(guidance)
551
+
552
+ if CPU_OFFLOAD: torch.cuda.empty_cache()
553
+
554
+ camera_condition = cam_latents
555
+ assert camera_condition is not None, "plucker_embedding is not provided"
556
+ latent_len = img.shape[2]
557
+
558
+
559
+ # Embed image and text.
560
+ img = self.img_in(img)
561
+ # ref_latents = self.img_in(ref_latents) # off in latent concat
562
+ if self.text_projection == "linear":
563
+ txt = self.txt_in(txt)
564
+ elif self.text_projection == "single_refiner":
565
+ txt = self.txt_in(txt, t, text_mask if self.use_attention_mask else None)
566
+ else:
567
+ raise NotImplementedError(f"Unsupported text_projection: {self.text_projection}")
568
+
569
+ if camera_condition is not None:
570
+
571
+ if latent_len == 18:
572
+ camera_latents = torch.cat([self.camera_net(torch.zeros_like(camera_condition)), \
573
+ self.camera_net(camera_condition)], dim=1)
574
+ elif latent_len == 9:
575
+ camera_latents = self.camera_net(camera_condition)
576
+ elif latent_len == 10:
577
+ camera_latents = torch.cat([self.camera_net(torch.zeros_like(camera_condition)[:,0:4,:,:,:]), \
578
+ self.camera_net(camera_condition)], dim=1)
579
+ img = img + camera_latents
580
+
581
+ if CPU_OFFLOAD: torch.cuda.empty_cache()
582
+ # ref_length = ref_latents.shape[-2]
583
+ # img = torch.cat([ref_latents, img], dim=-2) # t c
584
+ txt_seq_len = txt.shape[1]
585
+ img_seq_len = img.shape[1]
586
+ # Compute 'self-attention mask'.
587
+
588
+ cu_seqlens_q = get_cu_seqlens(text_mask, img_seq_len)
589
+ cu_seqlens_kv = cu_seqlens_q
590
+ max_seqlen_q = img_seq_len + txt_seq_len
591
+ max_seqlen_kv = max_seqlen_q
592
+
593
+ if get_sequence_parallel_state():
594
+ sp_size = nccl_info.sp_size
595
+ sp_rank = nccl_info.rank_within_group
596
+ assert img.shape[1] % sp_size == 0, f"Cannot split video sequence into ulysses SP ({sp_size}) parts evenly"
597
+ img = torch.chunk(img, sp_size, dim=1)[sp_rank]
598
+ freqs_cos = torch.chunk(freqs_cos, sp_size, dim=0)[sp_rank]
599
+ freqs_sin = torch.chunk(freqs_sin, sp_size, dim=0)[sp_rank]
600
+
601
+ if CPU_OFFLOAD: torch.cuda.empty_cache()
602
+ freqs_cis = (freqs_cos, freqs_sin) if freqs_cos is not None else None
603
+ # --------------------- Pass through DiT blocks ------------------------
604
+ if not is_cache:
605
+ for layer_num, block in enumerate(self.double_blocks):
606
+ double_block_args = [img, txt, vec, cu_seqlens_q, cu_seqlens_kv, \
607
+ max_seqlen_q, max_seqlen_kv, freqs_cis, use_sage]
608
+ img, txt = block(*double_block_args)
609
+ if CPU_OFFLOAD: torch.cuda.empty_cache()
610
+
611
+ # Merge txt and img to pass through single stream blocks.
612
+ x = torch.cat((img, txt), 1)
613
+ # Compatible with MMDiT.
614
+ if len(self.single_blocks) > 0:
615
+ for layer_num, block in enumerate(self.single_blocks):
616
+ if layer_num == (len(self.single_blocks) - 1):
617
+ self.cache_out = x
618
+ single_block_args = [x, vec, txt_seq_len, cu_seqlens_q, \
619
+ cu_seqlens_kv, max_seqlen_q, max_seqlen_kv, (freqs_cos, freqs_sin), use_sage]
620
+ x = block(*single_block_args)
621
+ if CPU_OFFLOAD: torch.cuda.empty_cache()
622
+ else:
623
+ x = self.cache_out
624
+ if len(self.single_blocks) > 0:
625
+ for layer_num, block in enumerate(self.single_blocks):
626
+ if layer_num < (len(self.single_blocks) - 1):
627
+ continue
628
+ single_block_args = [x, vec, txt_seq_len, cu_seqlens_q, \
629
+ cu_seqlens_kv, max_seqlen_q, max_seqlen_kv, (freqs_cos, freqs_sin), use_sage]
630
+ x = block(*single_block_args)
631
+ if CPU_OFFLOAD: torch.cuda.empty_cache()
632
+
633
+ img = x[:, :-txt_seq_len, ...]
634
+
635
+ if get_sequence_parallel_state():
636
+ img = all_gather(img, dim=1)
637
+
638
+ # img = img[:, ref_length:]
639
+ # ---------------------------- Final layer ------------------------------
640
+ img = self.final_layer(img, vec) # (N, T, patch_size ** 2 * out_channels)
641
+ img = self.unpatchify(img, tt, th, tw)
642
+
643
+ if return_dict:
644
+ out['x'] = img
645
+ return out
646
+ return img
647
+
648
+ def unpatchify(self, x, t, h, w):
649
+ """
650
+ x: (N, T, patch_size**2 * C)
651
+ imgs: (N, H, W, C)
652
+ """
653
+ c = self.unpatchify_channels
654
+ pt, ph, pw = self.patch_size
655
+ assert t * h * w == x.shape[1]
656
+
657
+ x = x.reshape(shape=(x.shape[0], t, h, w, c, pt, ph, pw))
658
+ x = torch.einsum('nthwcopq->nctohpwq', x)
659
+ imgs = x.reshape(shape=(x.shape[0], c, t * pt, h * ph, w * pw))
660
+
661
+ return imgs
662
+
663
+ def params_count(self):
664
+ counts = {
665
+ "double": sum([
666
+ sum(p.numel() for p in block.img_attn_qkv.parameters()) +
667
+ sum(p.numel() for p in block.img_attn_proj.parameters()) +
668
+ sum(p.numel() for p in block.img_mlp.parameters()) +
669
+ sum(p.numel() for p in block.txt_attn_qkv.parameters()) +
670
+ sum(p.numel() for p in block.txt_attn_proj.parameters()) +
671
+ sum(p.numel() for p in block.txt_mlp.parameters())
672
+ for block in self.double_blocks
673
+ ]),
674
+ "single": sum([
675
+ sum(p.numel() for p in block.linear1.parameters()) +
676
+ sum(p.numel() for p in block.linear2.parameters())
677
+ for block in self.single_blocks
678
+ ]),
679
+ "total": sum(p.numel() for p in self.parameters()),
680
+ }
681
+ counts["attn+mlp"] = counts["double"] + counts["single"]
682
+ return counts
683
+
684
+ #################################################################################
685
+ # HunyuanVideo Configs #
686
+ #################################################################################
687
+
688
+ HUNYUAN_VIDEO_CONFIG = { # Attn+MLP / Total
689
+ 'HYVideo-T/2': { # 9.0B / 12.5B
690
+ 'depth_double_blocks': 20,
691
+ 'depth_single_blocks': 40,
692
+ 'rope_dim_list': [16, 56, 56],
693
+ 'hidden_size': 3072,
694
+ 'num_heads': 24,
695
+ 'mlp_width_ratio': 4,
696
+ },
697
+ }
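
The `unpatchify` method above folds patch tokens back into a latent video tensor with a single einsum. A minimal standalone shape check of that operation, using assumed toy sizes rather than the model's real dimensions:

import torch

N, c = 2, 4                    # batch size, latent channels (assumed)
t, h, w = 3, 8, 8              # patch-grid sizes (assumed)
pt, ph, pw = 1, 2, 2           # patch_size, as in the default [1, 2, 2]

x = torch.randn(N, t * h * w, c * pt * ph * pw)   # (N, T, patch_size**2 * C)
x = x.reshape(N, t, h, w, c, pt, ph, pw)
x = torch.einsum("nthwcopq->nctohpwq", x)
imgs = x.reshape(N, c, t * pt, h * ph, w * pw)
print(imgs.shape)              # torch.Size([2, 4, 3, 16, 16])
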
hymm_sp/modules/modulate_layers.py ADDED
@@ -0,0 +1,76 @@
1
+ from typing import Callable
2
+
3
+ import torch
4
+ import torch.nn as nn
5
+
6
+
7
+ class ModulateDiT(nn.Module):
8
+ """Modulation layer for DiT."""
9
+ def __init__(
10
+ self,
11
+ hidden_size: int,
12
+ factor: int,
13
+ act_layer: Callable,
14
+ dtype=None,
15
+ device=None,
16
+ ):
17
+ factory_kwargs = {"dtype": dtype, "device": device}
18
+ super().__init__()
19
+ self.act = act_layer()
20
+ self.linear = nn.Linear(
21
+ hidden_size, factor * hidden_size, bias=True, **factory_kwargs
22
+ )
23
+ # Zero-initialize the modulation
24
+ nn.init.zeros_(self.linear.weight)
25
+ nn.init.zeros_(self.linear.bias)
26
+
27
+ def forward(self, x: torch.Tensor) -> torch.Tensor:
28
+ return self.linear(self.act(x))
29
+
30
+
31
+ def modulate(x, shift=None, scale=None):
32
+ """modulate by shift and scale
33
+
34
+ Args:
35
+ x (torch.Tensor): input tensor.
36
+ shift (torch.Tensor, optional): shift tensor. Defaults to None.
37
+ scale (torch.Tensor, optional): scale tensor. Defaults to None.
38
+
39
+ Returns:
40
+ torch.Tensor: the output tensor after modulate.
41
+ """
42
+ if scale is None and shift is None:
43
+ return x
44
+ elif shift is None:
45
+ return x * (1 + scale.unsqueeze(1))
46
+ elif scale is None:
47
+ return x + shift.unsqueeze(1)
48
+ else:
49
+ return x * (1 + scale.unsqueeze(1)) + shift.unsqueeze(1)
50
+
51
+
52
+ def apply_gate(x, gate=None, tanh=False):
53
+ """AI is creating summary for apply_gate
54
+
55
+ Args:
56
+ x (torch.Tensor): input tensor.
57
+ gate (torch.Tensor, optional): gate tensor. Defaults to None.
58
+ tanh (bool, optional): whether to use tanh function. Defaults to False.
59
+
60
+ Returns:
61
+ torch.Tensor: the output tensor after apply gate.
62
+ """
63
+ if gate is None:
64
+ return x
65
+ if tanh:
66
+ return x * gate.unsqueeze(1).tanh()
67
+ else:
68
+ return x * gate.unsqueeze(1)
69
+
70
+
71
+ def ckpt_wrapper(module):
72
+ def ckpt_forward(*inputs):
73
+ outputs = module(*inputs)
74
+ return outputs
75
+
76
+ return ckpt_forward
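
The `modulate` and `apply_gate` helpers above broadcast per-sample `(B, C)` shift/scale/gate vectors over a `(B, L, C)` token sequence. A minimal sketch of that broadcasting with toy tensors (all shapes here are assumptions, not the model's real sizes):

import torch

B, L, C = 2, 16, 8
x = torch.randn(B, L, C)      # token sequence
shift = torch.randn(B, C)     # per-sample shift, e.g. produced by a ModulateDiT layer
scale = torch.randn(B, C)     # per-sample scale
gate = torch.randn(B, C)      # per-sample gate

y = x * (1 + scale.unsqueeze(1)) + shift.unsqueeze(1)   # modulate(x, shift=shift, scale=scale)
out = y * gate.unsqueeze(1)                             # apply_gate(y, gate=gate)
assert out.shape == (B, L, C)
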
hymm_sp/modules/norm_layers.py ADDED
@@ -0,0 +1,77 @@
1
+ import torch
2
+ import torch.nn as nn
3
+
4
+
5
+ class RMSNorm(nn.Module):
6
+ def __init__(
7
+ self,
8
+ dim: int,
9
+ elementwise_affine=True,
10
+ eps: float = 1e-6,
11
+ device=None,
12
+ dtype=None,
13
+ ):
14
+ """
15
+ Initialize the RMSNorm normalization layer.
16
+
17
+ Args:
18
+ dim (int): The dimension of the input tensor.
19
+ eps (float, optional): A small value added to the denominator for numerical stability. Default is 1e-6.
20
+
21
+ Attributes:
22
+ eps (float): A small value added to the denominator for numerical stability.
23
+ weight (nn.Parameter): Learnable scaling parameter.
24
+
25
+ """
26
+ factory_kwargs = {"device": device, "dtype": dtype}
27
+ super().__init__()
28
+ self.eps = eps
29
+ if elementwise_affine:
30
+ self.weight = nn.Parameter(torch.ones(dim, **factory_kwargs))
31
+
32
+ def _norm(self, x):
33
+ """
34
+ Apply the RMSNorm normalization to the input tensor.
35
+
36
+ Args:
37
+ x (torch.Tensor): The input tensor.
38
+
39
+ Returns:
40
+ torch.Tensor: The normalized tensor.
41
+
42
+ """
43
+ return x * torch.rsqrt(x.pow(2).mean(-1, keepdim=True) + self.eps)
44
+
45
+ def forward(self, x):
46
+ """
47
+ Forward pass through the RMSNorm layer.
48
+
49
+ Args:
50
+ x (torch.Tensor): The input tensor.
51
+
52
+ Returns:
53
+ torch.Tensor: The output tensor after applying RMSNorm.
54
+
55
+ """
56
+ output = self._norm(x.float()).type_as(x)
57
+ if hasattr(self, "weight"):
58
+ output = output * self.weight
59
+ return output
60
+
61
+
62
+ def get_norm_layer(norm_layer):
63
+ """
64
+ Get the normalization layer.
65
+
66
+ Args:
67
+ norm_layer (str): The type of normalization layer.
68
+
69
+ Returns:
70
+ norm_layer (nn.Module): The normalization layer.
71
+ """
72
+ if norm_layer == "layer":
73
+ return nn.LayerNorm
74
+ elif norm_layer == "rms":
75
+ return RMSNorm
76
+ else:
77
+ raise NotImplementedError(f"Norm layer {norm_layer} is not implemented")
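
A short usage sketch of `RMSNorm` as a per-head QK-Norm layer. The class is re-declared inline (without the `elementwise_affine` and factory options) so the snippet runs standalone, and the `(B, L, num_heads, head_dim)` layout with `head_dim=128` is an assumption:

import torch
import torch.nn as nn

class RMSNorm(nn.Module):
    def __init__(self, dim, eps=1e-6):
        super().__init__()
        self.eps = eps
        self.weight = nn.Parameter(torch.ones(dim))

    def forward(self, x):
        out = x.float() * torch.rsqrt(x.float().pow(2).mean(-1, keepdim=True) + self.eps)
        return out.type_as(x) * self.weight

norm = RMSNorm(128)
q = torch.randn(2, 16, 24, 128)   # (B, L, num_heads, head_dim)
print(norm(q).shape)              # torch.Size([2, 16, 24, 128]), normalized over head_dim
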
hymm_sp/modules/parallel_states.py ADDED
@@ -0,0 +1,381 @@
1
+ import os
2
+ import torch
3
+ import datetime
4
+ import torch.distributed as dist
5
+ from typing import Any, Tuple
6
+ from torch import Tensor
7
+ from flash_attn.flash_attn_interface import flash_attn_varlen_func
8
+
9
+
10
+ class COMM_INFO:
11
+ def __init__(self):
12
+ self.group = None
13
+ self.sp_size = 1
14
+ self.global_rank = 0
15
+ self.rank_within_group = 0
16
+ self.group_id = 0
17
+
18
+
19
+ nccl_info = COMM_INFO()
20
+ _SEQUENCE_PARALLEL_STATE = False
21
+
22
+
23
+ def get_cu_seqlens(text_mask, img_len):
24
+ """Calculate cu_seqlens_q, cu_seqlens_kv using text_mask and img_len
25
+
26
+ Args:
27
+ text_mask (torch.Tensor): the mask of text
28
+ img_len (int): the length of image
29
+
30
+ Returns:
31
+ torch.Tensor: the calculated cu_seqlens for flash attention
32
+ """
33
+ batch_size = text_mask.shape[0]
34
+ text_len = text_mask.sum(dim=1)
35
+ max_len = text_mask.shape[1] + img_len
36
+
37
+ cu_seqlens = torch.zeros([2 * batch_size + 1], dtype=torch.int32, device="cuda")
38
+
39
+ for i in range(batch_size):
40
+ s = text_len[i] + img_len
41
+ s1 = i * max_len + s
42
+ s2 = (i + 1) * max_len
43
+ cu_seqlens[2 * i + 1] = s1
44
+ cu_seqlens[2 * i + 2] = s2
45
+
46
+ return cu_seqlens
47
+
48
+ def initialize_sequence_parallel_state(sequence_parallel_size):
49
+ global _SEQUENCE_PARALLEL_STATE
50
+ if sequence_parallel_size > 1:
51
+ _SEQUENCE_PARALLEL_STATE = True
52
+ initialize_sequence_parallel_group(sequence_parallel_size)
53
+ else:
54
+ nccl_info.sp_size = 1
55
+ nccl_info.global_rank = int(os.getenv("RANK", "0"))
56
+ nccl_info.rank_within_group = 0
57
+ nccl_info.group_id = int(os.getenv("RANK", "0"))
58
+
59
+ def get_sequence_parallel_state():
60
+ return _SEQUENCE_PARALLEL_STATE
61
+
62
+ def initialize_sequence_parallel_group(sequence_parallel_size):
63
+ """Initialize the sequence parallel group."""
64
+ rank = int(os.getenv("RANK", "0"))
65
+ world_size = int(os.getenv("WORLD_SIZE", "1"))
66
+ assert (
67
+ world_size % sequence_parallel_size == 0
68
+ ), "world_size must be divisible by sequence_parallel_size, \
69
+ but got world_size: {}, sequence_parallel_size: {}".format(
70
+ world_size, sequence_parallel_size)
71
+ nccl_info.sp_size = sequence_parallel_size
72
+ nccl_info.global_rank = rank
73
+ num_sequence_parallel_groups: int = world_size // sequence_parallel_size
74
+ for i in range(num_sequence_parallel_groups):
75
+ ranks = range(i * sequence_parallel_size, (i + 1) * sequence_parallel_size)
76
+ group = dist.new_group(ranks)
77
+ if rank in ranks:
78
+ nccl_info.group = group
79
+ nccl_info.rank_within_group = rank - i * sequence_parallel_size
80
+ nccl_info.group_id = i
81
+
82
+ def initialize_distributed(seed):
83
+ local_rank = int(os.getenv("RANK", 0))
84
+ world_size = int(os.getenv("WORLD_SIZE", 1))
85
+ torch.cuda.set_device(local_rank)
86
+ dist.init_process_group(backend="nccl",
87
+ init_method="env://",
88
+ timeout=datetime.timedelta(seconds=2**31-1),
89
+ world_size=world_size,
90
+ rank=local_rank)
91
+ torch.manual_seed(seed)
92
+ torch.cuda.manual_seed_all(seed)
93
+ initialize_sequence_parallel_state(world_size)
94
+
95
+ def _all_to_all_4D(input: torch.Tensor, scatter_idx: int = 2, gather_idx: int = 1, group=None) -> torch.Tensor:
96
+ """
97
+ all-to-all for QKV
98
+
99
+ Args:
100
+ input (torch.tensor): a tensor sharded along dim scatter dim
101
+ scatter_idx (int): dimension to scatter along. Defaults to 2.
102
+ gather_idx (int): dimension to gather along. Defaults to 1.
103
+ group : torch process group
104
+
105
+ Returns:
106
+ torch.tensor: resharded tensor (bs, seqlen/P, hc, hs)
107
+ """
108
+ assert (input.dim() == 4), f"input must be 4D tensor, got {input.dim()} and shape {input.shape}"
109
+
110
+ seq_world_size = dist.get_world_size(group)
111
+ if scatter_idx == 2 and gather_idx == 1:
112
+ # input (torch.tensor): a tensor sharded along dim 1 (bs, seqlen/P, hc, hs) output: (bs, seqlen, hc/P, hs)
113
+ bs, shard_seqlen, hc, hs = input.shape
114
+ seqlen = shard_seqlen * seq_world_size
115
+ shard_hc = hc // seq_world_size
116
+
117
+ # transpose groups of heads with the seq-len parallel dimension, so that we can scatter them!
118
+ # (bs, seqlen/P, hc, hs) -reshape->
119
+ # (bs, seq_len/P, P, hc/P, hs) -transpose(0,2)->
120
+ # (P, seq_len/P, bs, hc/P, hs)
121
+ input_t = (input.reshape(bs, shard_seqlen, seq_world_size, shard_hc, hs).transpose(0, 2).contiguous())
122
+
123
+ output = torch.empty_like(input_t)
124
+ # https://pytorch.org/docs/stable/distributed.html#torch.distributed.all_to_all_single
125
+ # (P, seq_len/P, bs, hc/P, hs) scatter seqlen -all2all-> (P, seq_len/P, bs, hc/P, hs) scatter head
126
+ if seq_world_size > 1:
127
+ dist.all_to_all_single(output, input_t, group=group)
128
+ torch.cuda.synchronize()
129
+ else:
130
+ output = input_t
131
+ # if scattering the seq-dim, transpose the heads back to the original dimension
132
+ output = output.reshape(seqlen, bs, shard_hc, hs)
133
+
134
+ # (seq_len, bs, hc/P, hs) -reshape-> (bs, seq_len, hc/P, hs)
135
+ output = output.transpose(0, 1).contiguous().reshape(bs, seqlen, shard_hc, hs)
136
+
137
+ return output
138
+
139
+ elif scatter_idx == 1 and gather_idx == 2:
140
+ # input (torch.tensor): a tensor sharded along dim 1 (bs, seqlen, hc/P, hs) output: (bs, seqlen/P, hc, hs)
141
+ bs, seqlen, shard_hc, hs = input.shape
142
+ hc = shard_hc * seq_world_size
143
+ shard_seqlen = seqlen // seq_world_size
144
+ seq_world_size = dist.get_world_size(group)
145
+
146
+ # transpose groups of heads with the seq-len parallel dimension, so that we can scatter them!
147
+ # (bs, seqlen, hc/P, hs) -reshape-> (bs, P, seq_len/P, hc/P, hs) -transpose(0, 3)->
148
+ # (hc/P, P, seqlen/P, bs, hs) -transpose(0, 1) -> (P, hc/P, seqlen/P, bs, hs)
149
+ input_t = (input.reshape(bs, seq_world_size, shard_seqlen, shard_hc,
150
+ hs).transpose(0,
151
+ 3).transpose(0,
152
+ 1).contiguous().reshape(seq_world_size, shard_hc,
153
+ shard_seqlen, bs, hs))
154
+
155
+ output = torch.empty_like(input_t)
156
+ # https://pytorch.org/docs/stable/distributed.html#torch.distributed.all_to_all_single
157
+ # (P, bs x hc/P, seqlen/P, hs) scatter seqlen -all2all-> (P, bs x seq_len/P, hc/P, hs) scatter head
158
+ if seq_world_size > 1:
159
+ dist.all_to_all_single(output, input_t, group=group)
160
+ torch.cuda.synchronize()
161
+ else:
162
+ output = input_t
163
+
164
+ # if scattering the seq-dim, transpose the heads back to the original dimension
165
+ output = output.reshape(hc, shard_seqlen, bs, hs)
166
+
167
+ # (hc, seqlen/N, bs, hs) -tranpose(0,2)-> (bs, seqlen/N, hc, hs)
168
+ output = output.transpose(0, 2).contiguous().reshape(bs, shard_seqlen, hc, hs)
169
+
170
+ return output
171
+ else:
172
+ raise RuntimeError("scatter_idx must be 1 or 2 and gather_idx must be 1 or 2")
173
+
174
+
175
+ class SeqAllToAll4D(torch.autograd.Function):
176
+ @staticmethod
177
+ def forward(
178
+ ctx: Any,
179
+ group: dist.ProcessGroup,
180
+ input: Tensor,
181
+ scatter_idx: int,
182
+ gather_idx: int,
183
+ ) -> Tensor:
184
+ ctx.group = group
185
+ ctx.scatter_idx = scatter_idx
186
+ ctx.gather_idx = gather_idx
187
+
188
+ return _all_to_all_4D(input, scatter_idx, gather_idx, group=group)
189
+
190
+ @staticmethod
191
+ def backward(ctx: Any, *grad_output: Tensor) -> Tuple[None, Tensor, None, None]:
192
+ return (
193
+ None,
194
+ SeqAllToAll4D.apply(ctx.group, *grad_output, ctx.gather_idx, ctx.scatter_idx),
195
+ None,
196
+ None,
197
+ )
198
+
199
+
200
+ def all_to_all_4D(
201
+ input_: torch.Tensor,
202
+ scatter_dim: int = 2,
203
+ gather_dim: int = 1,
204
+ ):
205
+ return SeqAllToAll4D.apply(nccl_info.group, input_, scatter_dim, gather_dim)
206
+
207
+
208
+ def _all_to_all(
209
+ input_: torch.Tensor,
210
+ world_size: int,
211
+ group: dist.ProcessGroup,
212
+ scatter_dim: int,
213
+ gather_dim: int,
214
+ ):
215
+ input_list = [t.contiguous() for t in torch.tensor_split(input_, world_size, scatter_dim)]
216
+ output_list = [torch.empty_like(input_list[0]) for _ in range(world_size)]
217
+ dist.all_to_all(output_list, input_list, group=group)
218
+ return torch.cat(output_list, dim=gather_dim).contiguous()
219
+
220
+
221
+ class _AllToAll(torch.autograd.Function):
222
+ """All-to-all communication.
223
+
224
+ Args:
225
+ input_: input matrix
226
+ process_group: communication group
227
+ scatter_dim: scatter dimension
228
+ gather_dim: gather dimension
229
+ """
230
+
231
+ @staticmethod
232
+ def forward(ctx, input_, process_group, scatter_dim, gather_dim):
233
+ ctx.process_group = process_group
234
+ ctx.scatter_dim = scatter_dim
235
+ ctx.gather_dim = gather_dim
236
+ ctx.world_size = dist.get_world_size(process_group)
237
+ output = _all_to_all(input_, ctx.world_size, process_group, scatter_dim, gather_dim)
238
+ return output
239
+
240
+ @staticmethod
241
+ def backward(ctx, grad_output):
242
+ grad_output = _all_to_all(
243
+ grad_output,
244
+ ctx.world_size,
245
+ ctx.process_group,
246
+ ctx.gather_dim,
247
+ ctx.scatter_dim,
248
+ )
249
+ return (
250
+ grad_output,
251
+ None,
252
+ None,
253
+ None,
254
+ )
255
+
256
+ def all_to_all(
257
+ input_: torch.Tensor,
258
+ scatter_dim: int = 2,
259
+ gather_dim: int = 1,
260
+ ):
261
+ return _AllToAll.apply(input_, nccl_info.group, scatter_dim, gather_dim)
262
+
263
+
264
+ class _AllGather(torch.autograd.Function):
265
+ """All-gather communication with autograd support.
266
+
267
+ Args:
268
+ input_: input tensor
269
+ dim: dimension along which to concatenate
270
+ """
271
+
272
+ @staticmethod
273
+ def forward(ctx, input_, dim):
274
+ ctx.dim = dim
275
+ world_size = nccl_info.sp_size
276
+ group = nccl_info.group
277
+ input_size = list(input_.size())
278
+
279
+ ctx.input_size = input_size[dim]
280
+
281
+ tensor_list = [torch.empty_like(input_) for _ in range(world_size)]
282
+ input_ = input_.contiguous()
283
+ dist.all_gather(tensor_list, input_, group=group)
284
+
285
+ output = torch.cat(tensor_list, dim=dim)
286
+ return output
287
+
288
+ @staticmethod
289
+ def backward(ctx, grad_output):
290
+ world_size = nccl_info.sp_size
291
+ rank = nccl_info.rank_within_group
292
+ dim = ctx.dim
293
+ input_size = ctx.input_size
294
+
295
+ sizes = [input_size] * world_size
296
+
297
+ grad_input_list = torch.split(grad_output, sizes, dim=dim)
298
+ grad_input = grad_input_list[rank]
299
+
300
+ return grad_input, None
301
+
302
+
303
+ def all_gather(input_: torch.Tensor, dim: int = 1):
304
+ """Performs an all-gather operation on the input tensor along the specified dimension.
305
+
306
+ Args:
307
+ input_ (torch.Tensor): Input tensor of shape [B, H, S, D].
308
+ dim (int, optional): Dimension along which to concatenate. Defaults to 1.
309
+
310
+ Returns:
311
+ torch.Tensor: Output tensor after all-gather operation, concatenated along 'dim'.
312
+ """
313
+ return _AllGather.apply(input_, dim)
314
+
315
+ def parallel_attention(q, k, v,
316
+ img_q_len, img_kv_len, cu_seqlens_q, cu_seqlens_kv,
317
+ max_seqlen_q, max_seqlen_kv, use_sage):
318
+ """
319
+ img_q_len,img_kv_len: 32256
320
+ text_mask: 2x256
321
+ query: [2, 32256, 24, 128])
322
+ encoder_query: [2, 256, 24, 128]
323
+ """
324
+ query, encoder_query = q
325
+ key, encoder_key = k
326
+ value, encoder_value = v
327
+ rank = torch.distributed.get_rank()
328
+ if get_sequence_parallel_state():
329
+ query = all_to_all_4D(query, scatter_dim=2, gather_dim=1) # [2, 32256, 24, 128]
330
+ key = all_to_all_4D(key, scatter_dim=2, gather_dim=1)
331
+ value = all_to_all_4D(value, scatter_dim=2, gather_dim=1)
332
+ def shrink_head(encoder_state, dim):
333
+ local_heads = encoder_state.shape[dim] // nccl_info.sp_size
334
+ return encoder_state.narrow(dim, nccl_info.rank_within_group * local_heads, local_heads)
335
+ encoder_query = shrink_head(encoder_query, dim=2)
336
+ encoder_key = shrink_head(encoder_key, dim=2)
337
+ encoder_value = shrink_head(encoder_value, dim=2)
338
+
339
+ sequence_length = query.size(1) # 32256
340
+ encoder_sequence_length = encoder_query.size(1) # 256
341
+
342
+ query = torch.cat([query, encoder_query], dim=1)
343
+ key = torch.cat([key, encoder_key], dim=1)
344
+ value = torch.cat([value, encoder_value], dim=1)
345
+ bsz = query.shape[0]
346
+ head = query.shape[-2]
347
+ head_dim = query.shape[-1]
348
+ if use_sage:
349
+ from sageattention import sageattn
350
+ hidden_states = sageattn(query, key, value, tensor_layout="NHD")
351
+ else:
352
+ query, key, value = [
353
+ x.view(x.shape[0] * x.shape[1], *x.shape[2:])
354
+ for x in [query, key, value]
355
+ ]
356
+ hidden_states = flash_attn_varlen_func(
357
+ query,
358
+ key,
359
+ value,
360
+ cu_seqlens_q,
361
+ cu_seqlens_kv,
362
+ max_seqlen_q,
363
+ max_seqlen_kv,
364
+ )
365
+
366
+ # Reshape the flattened attention output back to (B, S, H, D)
367
+ hidden_states = hidden_states.view(bsz, max_seqlen_q, head, head_dim).contiguous()
368
+
369
+ hidden_states, encoder_hidden_states = hidden_states.split_with_sizes((sequence_length, encoder_sequence_length),
370
+ dim=1)
371
+ if get_sequence_parallel_state():
372
+ hidden_states = all_to_all_4D(hidden_states, scatter_dim=1, gather_dim=2)
373
+ encoder_hidden_states = all_gather(encoder_hidden_states, dim=2).contiguous()
374
+ hidden_states = hidden_states.to(query.dtype)
375
+ encoder_hidden_states = encoder_hidden_states.to(query.dtype)
376
+
377
+ attn = torch.cat([hidden_states, encoder_hidden_states], dim=1)
378
+
379
+ b, s, _, _= attn.shape
380
+ attn = attn.reshape(b, s, -1)
381
+ return attn, None
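
`get_cu_seqlens` above packs, for every sample, a (valid tokens, padded tail) boundary pair into the cumulative-length vector expected by `flash_attn_varlen_func`. A CPU-only illustration of that layout with a toy text mask (the real function allocates the tensor on CUDA; the sizes here are assumptions):

import torch

img_len = 6
text_mask = torch.tensor([[1, 1, 1, 0],    # sample 0: 3 valid text tokens
                          [1, 1, 0, 0]])   # sample 1: 2 valid text tokens
batch, max_len = text_mask.shape[0], text_mask.shape[1] + img_len

cu_seqlens = torch.zeros(2 * batch + 1, dtype=torch.int32)
for i in range(batch):
    s = int(text_mask[i].sum()) + img_len          # image tokens + valid text tokens
    cu_seqlens[2 * i + 1] = i * max_len + s        # end of the valid segment
    cu_seqlens[2 * i + 2] = (i + 1) * max_len      # end of the padded segment
print(cu_seqlens.tolist())                         # [0, 9, 10, 18, 20]
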
hymm_sp/modules/posemb_layers.py ADDED
@@ -0,0 +1,112 @@
1
+ import torch
2
+ from typing import Union, Tuple, List
3
+
4
+
5
+ def _to_tuple(x, dim=2):
6
+ if isinstance(x, int):
7
+ return (x,) * dim
8
+ elif len(x) == dim:
9
+ return x
10
+ else:
11
+ raise ValueError(f"Expected length {dim} or int, but got {x}")
12
+
13
+
14
+ def get_meshgrid_nd(start, *args, dim=2):
15
+ """
16
+ Get n-D meshgrid with start, stop and num.
17
+
18
+ Args:
19
+ start (int or tuple): If len(args) == 0, start is num; If len(args) == 1, start is start, args[0] is stop,
20
+ step is 1; If len(args) == 2, start is start, args[0] is stop, args[1] is num. For n-dim, start/stop/num
21
+ should be int or n-tuple. If n-tuple is provided, the meshgrid will be stacked following the dim order in
22
+ n-tuples.
23
+ *args: See above.
24
+ dim (int): Dimension of the meshgrid. Defaults to 2.
25
+
26
+ Returns:
27
+ grid (torch.Tensor): [dim, ...]
28
+ """
29
+ if len(args) == 0:
30
+ # start is grid_size
31
+ num = _to_tuple(start, dim=dim)
32
+ start = (0,) * dim
33
+ stop = num
34
+ elif len(args) == 1:
35
+ # start is start, args[0] is stop, step is 1
36
+ start = _to_tuple(start, dim=dim)
37
+ stop = _to_tuple(args[0], dim=dim)
38
+ num = [stop[i] - start[i] for i in range(dim)]
39
+ elif len(args) == 2:
40
+ # start is start, args[0] is stop, args[1] is num
41
+ start = _to_tuple(start, dim=dim) # Left-Top eg: 12,0
42
+ stop = _to_tuple(args[0], dim=dim) # Right-Bottom eg: 20,32
43
+ num = _to_tuple(args[1], dim=dim) # Target Size eg: 32,124
44
+ else:
45
+ raise ValueError(f"len(args) should be 0, 1 or 2, but got {len(args)}")
46
+
47
+ # PyTorch implement of np.linspace(start[i], stop[i], num[i], endpoint=False)
48
+ axis_grid = []
49
+ for i in range(dim):
50
+ a, b, n = start[i], stop[i], num[i]
51
+ g = torch.linspace(a, b, n + 1, dtype=torch.float32)[:n]
52
+ axis_grid.append(g)
53
+ grid = torch.meshgrid(*axis_grid, indexing="ij") # dim x [W, H, D]
54
+ grid = torch.stack(grid, dim=0) # [dim, W, H, D]
55
+
56
+ return grid
57
+
58
+
59
+ #################################################################################
60
+ # Rotary Positional Embedding Functions #
61
+ #################################################################################
62
+ # https://github.com/meta-llama/llama/blob/be327c427cc5e89cc1d3ab3d3fec4484df771245/llama/model.py#L80
63
+
64
+
65
+ def get_1d_rotary_pos_embed(dim: int,
66
+ pos: Union[torch.FloatTensor, int],
67
+ theta: float = 10000.0,
68
+ use_real: bool = False,
69
+ theta_rescale_factor: float = 1.0,
70
+ interpolation_factor: float = 1.0,
71
+ ) -> Union[torch.Tensor, Tuple[torch.Tensor, torch.Tensor]]:
72
+ """
73
+ Precompute the frequency tensor for complex exponential (cis) with given dimensions.
74
+ (Note: `cis` means `cos + i * sin`, where i is the imaginary unit.)
75
+
76
+ This function calculates a frequency tensor with complex exponential using the given dimension 'dim'
77
+ and the end index 'end'. The 'theta' parameter scales the frequencies.
78
+ The returned tensor contains complex values in complex64 data type.
79
+
80
+ Args:
81
+ dim (int): Dimension of the frequency tensor.
82
+ pos (int or torch.FloatTensor): Position indices for the frequency tensor. [S] or scalar
83
+ theta (float, optional): Scaling factor for frequency computation. Defaults to 10000.0.
84
+ use_real (bool, optional): If True, return real part and imaginary part separately.
85
+ Otherwise, return complex numbers.
86
+ theta_rescale_factor (float, optional): Rescale factor for theta. Defaults to 1.0.
87
+
88
+ Returns:
89
+ freqs_cis: Precomputed frequency tensor with complex exponential. [S, D/2]
90
+ freqs_cos, freqs_sin: Precomputed frequency tensor with real and imaginary parts separately. [S, D]
91
+ """
92
+ if isinstance(pos, int):
93
+ pos = torch.arange(pos).float()
94
+
95
+ # proposed by reddit user bloc97, to rescale rotary embeddings to longer sequence length without fine-tuning
96
+ # has some connection to NTK literature
97
+ if theta_rescale_factor != 1.0:
98
+ theta *= theta_rescale_factor ** (dim / (dim - 2))
99
+
100
+ freqs = 1.0 / (
101
+ theta ** (torch.arange(0, dim, 2)[: (dim // 2)].float() / dim)
102
+ ) # [D/2]
103
+ freqs = torch.outer(pos * interpolation_factor, freqs) # [S, D/2]
104
+ if use_real:
105
+ freqs_cos = freqs.cos().repeat_interleave(2, dim=1) # [S, D]
106
+ freqs_sin = freqs.sin().repeat_interleave(2, dim=1) # [S, D]
107
+ return freqs_cos, freqs_sin
108
+ else:
109
+ freqs_cis = torch.polar(
110
+ torch.ones_like(freqs), freqs
111
+ ) # complex64 # [S, D/2]
112
+ return freqs_cis
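
A standalone sketch of the `use_real=True` branch of `get_1d_rotary_pos_embed` above; `dim=16` and an 8-position axis are assumed toy values:

import torch

dim, seq_len, theta = 16, 8, 10000.0
pos = torch.arange(seq_len).float()
freqs = 1.0 / (theta ** (torch.arange(0, dim, 2)[: dim // 2].float() / dim))   # [D/2]
freqs = torch.outer(pos, freqs)                                                # [S, D/2]
freqs_cos = freqs.cos().repeat_interleave(2, dim=1)                            # [S, D]
freqs_sin = freqs.sin().repeat_interleave(2, dim=1)                            # [S, D]
print(freqs_cos.shape, freqs_sin.shape)   # torch.Size([8, 16]) torch.Size([8, 16])
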
hymm_sp/modules/token_refiner.py ADDED
@@ -0,0 +1,265 @@
1
+ from typing import Optional
2
+
3
+ from einops import rearrange
4
+ import torch
5
+ import torch.nn as nn
6
+
7
+ from .activation_layers import get_activation_layer
8
+ from .attn_layers import attention
9
+ from .norm_layers import get_norm_layer
10
+ from .embed_layers import TimestepEmbedder, TextProjection
11
+ from .attn_layers import attention
12
+ from .mlp_layers import MLP
13
+ from .modulate_layers import apply_gate
14
+
15
+
16
+ class IndividualTokenRefinerBlock(nn.Module):
17
+ """
18
+ Transformer block for refining individual tokens with adaptive layer normalization.
19
+
20
+ Combines self-attention and feed-forward network (FFN) layers with modulation
21
+ based on conditional inputs (timestep and context embeddings). Supports query-key
22
+ normalization for improved attention stability.
23
+ """
24
+ def __init__(
25
+ self,
26
+ hidden_size,
27
+ num_heads,
28
+ mlp_ratio: float = 4.0,
29
+ mlp_drop_rate: float = 0.0,
30
+ act_type: str = "silu",
31
+ qk_norm: bool = False,
32
+ qk_norm_type: str = "layer",
33
+ qkv_bias: bool = True,
34
+ dtype: Optional[torch.dtype] = None,
35
+ device: Optional[torch.device] = None,
36
+ ):
37
+ factory_kwargs = {'device': device, 'dtype': dtype}
38
+ super().__init__()
39
+ self.num_heads = num_heads
40
+ head_dim = hidden_size // num_heads
41
+ mlp_hidden_dim = int(hidden_size * mlp_ratio)
42
+
43
+ self.norm1 = nn.LayerNorm(hidden_size, elementwise_affine=True, eps=1e-6, **factory_kwargs)
44
+ self.self_attn_qkv = nn.Linear(hidden_size, hidden_size * 3, bias=qkv_bias, **factory_kwargs)
45
+ qk_norm_layer = get_norm_layer(qk_norm_type)
46
+ self.self_attn_q_norm = (
47
+ qk_norm_layer(head_dim, elementwise_affine=True, eps=1e-6, **factory_kwargs)
48
+ if qk_norm
49
+ else nn.Identity()
50
+ )
51
+ self.self_attn_k_norm = (
52
+ qk_norm_layer(head_dim, elementwise_affine=True, eps=1e-6, **factory_kwargs)
53
+ if qk_norm
54
+ else nn.Identity()
55
+ )
56
+ self.self_attn_proj = nn.Linear(hidden_size, hidden_size, bias=qkv_bias, **factory_kwargs)
57
+
58
+ self.norm2 = nn.LayerNorm(hidden_size, elementwise_affine=True, eps=1e-6, **factory_kwargs)
59
+ act_layer = get_activation_layer(act_type)
60
+ self.mlp = MLP(
61
+ in_channels=hidden_size,
62
+ hidden_channels=mlp_hidden_dim,
63
+ act_layer=act_layer,
64
+ drop=mlp_drop_rate,
65
+ **factory_kwargs,
66
+ )
67
+
68
+ self.adaLN_modulation = nn.Sequential(
69
+ act_layer(),
70
+ nn.Linear(hidden_size, 2 * hidden_size, bias=True, **factory_kwargs)
71
+ )
72
+ # Zero-initialize the modulation
73
+ nn.init.zeros_(self.adaLN_modulation[1].weight)
74
+ nn.init.zeros_(self.adaLN_modulation[1].bias)
75
+
76
+ def forward(
77
+ self,
78
+ x: torch.Tensor,
79
+ c: torch.Tensor, # timestep_aware_representations + context_aware_representations
80
+ attn_mask: torch.Tensor = None,
81
+ ):
82
+ """
83
+ Forward pass of the transformer block.
84
+
85
+ Args:
86
+ x: Input token embeddings (batch_size, seq_len, hidden_size)
87
+ c: Conditional embeddings (batch_size, hidden_size)
88
+ attn_mask: Attention mask (batch_size, 1, seq_len, seq_len)
89
+
90
+ Returns:
91
+ Updated token embeddings after self-attention and FFN
92
+ """
93
+ gate_msa, gate_mlp = self.adaLN_modulation(c).chunk(2, dim=1)
94
+
95
+ norm_x = self.norm1(x)
96
+ qkv = self.self_attn_qkv(norm_x)
97
+ q, k, v = rearrange(qkv, "B L (K H D) -> K B L H D", K=3, H=self.num_heads)
98
+ # Apply QK-Norm if needed
99
+ q = self.self_attn_q_norm(q).to(v)
100
+ k = self.self_attn_k_norm(k).to(v)
101
+
102
+ # Self-Attention
103
+ attn = attention(q, k, v, mode="torch", attn_mask=attn_mask)
104
+
105
+ x = x + apply_gate(self.self_attn_proj(attn), gate_msa)
106
+
107
+ # FFN Layer
108
+ x = x + apply_gate(self.mlp(self.norm2(x)), gate_mlp)
109
+
110
+ return x
111
+
112
+
113
+ class IndividualTokenRefiner(nn.Module):
114
+ """
115
+ Stack of IndividualTokenRefinerBlocks for sequential token refinement.
116
+
117
+ Processes token sequences through multiple transformer blocks with
118
+ attention masking support for handling variable-length sequences.
119
+ """
120
+ def __init__(
121
+ self,
122
+ hidden_size,
123
+ num_heads,
124
+ depth,
125
+ mlp_ratio: float = 4.0,
126
+ mlp_drop_rate: float = 0.0,
127
+ act_type: str = "silu",
128
+ qk_norm: bool = False,
129
+ qk_norm_type: str = "layer",
130
+ qkv_bias: bool = True,
131
+ dtype: Optional[torch.dtype] = None,
132
+ device: Optional[torch.device] = None,
133
+ ):
134
+ factory_kwargs = {'device': device, 'dtype': dtype}
135
+ super().__init__()
136
+ self.blocks = nn.ModuleList([
137
+ IndividualTokenRefinerBlock(
138
+ hidden_size=hidden_size,
139
+ num_heads=num_heads,
140
+ mlp_ratio=mlp_ratio,
141
+ mlp_drop_rate=mlp_drop_rate,
142
+ act_type=act_type,
143
+ qk_norm=qk_norm,
144
+ qk_norm_type=qk_norm_type,
145
+ qkv_bias=qkv_bias,
146
+ **factory_kwargs,
147
+ ) for _ in range(depth)
148
+ ])
149
+
150
+ def forward(
151
+ self,
152
+ x: torch.Tensor,
153
+ c: torch.LongTensor,
154
+ mask: Optional[torch.Tensor] = None,
155
+ ):
156
+ """
157
+ Forward pass through the stack of transformer blocks.
158
+
159
+ Args:
160
+ x: Input token embeddings (batch_size, seq_len, hidden_size)
161
+ c: Conditional embeddings (batch_size, hidden_size)
162
+ mask: Sequence mask indicating valid tokens (batch_size, seq_len)
163
+
164
+ Returns:
165
+ Refined token embeddings after all blocks
166
+ """
167
+ self_attn_mask = None
168
+ if mask is not None:
169
+ batch_size = mask.shape[0]
170
+ seq_len = mask.shape[1]
171
+ mask = mask.to(x.device)
172
+ # batch_size x 1 x seq_len x seq_len
173
+ self_attn_mask_1 = mask.view(batch_size, 1, 1, seq_len).repeat(1, 1, seq_len, 1)
174
+ # batch_size x 1 x seq_len x seq_len
175
+ self_attn_mask_2 = self_attn_mask_1.transpose(2, 3)
176
+ # batch_size x 1 x seq_len x seq_len, 1 for broadcasting of num_heads
177
+ self_attn_mask = (self_attn_mask_1 & self_attn_mask_2).bool()
178
+ # avoids self-attention weight being NaN for padding tokens
179
+ self_attn_mask[:, :, :, 0] = True
180
+
181
+ for block in self.blocks:
182
+ x = block(x, c, self_attn_mask)
183
+ return x
184
+
185
+
186
+ class SingleTokenRefiner(nn.Module):
187
+ """
188
+ Complete token refinement module with input embedding and conditional modulation.
189
+
190
+ Integrates timestep embedding, context projection, and a stack of transformer
191
+ blocks to refine token sequences based on both input data and conditional inputs.
192
+ """
193
+ def __init__(
194
+ self,
195
+ in_channels,
196
+ hidden_size,
197
+ num_heads,
198
+ depth,
199
+ mlp_ratio: float = 4.0,
200
+ mlp_drop_rate: float = 0.0,
201
+ act_type: str = "silu",
202
+ qk_norm: bool = False,
203
+ qk_norm_type: str = "layer",
204
+ qkv_bias: bool = True,
205
+ dtype: Optional[torch.dtype] = None,
206
+ device: Optional[torch.device] = None,
207
+ ):
208
+ factory_kwargs = {'device': device, 'dtype': dtype}
209
+ super().__init__()
210
+
211
+ self.input_embedder = nn.Linear(in_channels, hidden_size, bias=True, **factory_kwargs)
212
+
213
+ act_layer = get_activation_layer(act_type)
214
+ # Build timestep embedding layer
215
+ self.t_embedder = TimestepEmbedder(hidden_size, act_layer, **factory_kwargs)
216
+ # Build context embedding layer
217
+ self.c_embedder = TextProjection(in_channels, hidden_size, act_layer, **factory_kwargs)
218
+
219
+ self.individual_token_refiner = IndividualTokenRefiner(
220
+ hidden_size=hidden_size,
221
+ num_heads=num_heads,
222
+ depth=depth,
223
+ mlp_ratio=mlp_ratio,
224
+ mlp_drop_rate=mlp_drop_rate,
225
+ act_type=act_type,
226
+ qk_norm=qk_norm,
227
+ qk_norm_type=qk_norm_type,
228
+ qkv_bias=qkv_bias,
229
+ **factory_kwargs
230
+ )
231
+
232
+ def forward(
233
+ self,
234
+ x: torch.Tensor,
235
+ t: torch.LongTensor,
236
+ mask: Optional[torch.LongTensor] = None,
237
+ ):
238
+ """
239
+ Forward pass of the complete token refiner.
240
+
241
+ Args:
242
+ x: Input features (batch_size, seq_len, in_channels)
243
+ t: Timestep indices (batch_size,)
244
+ mask: Sequence mask for variable-length inputs (batch_size, seq_len)
245
+
246
+ Returns:
247
+ Refined token embeddings (batch_size, seq_len, hidden_size)
248
+ """
249
+ timestep_aware_representations = self.t_embedder(t)
250
+
251
+ if mask is None:
252
+ context_aware_representations = x.mean(dim=1)
253
+ else:
254
+ mask_float = mask.float().unsqueeze(-1) # [b, s1, 1]
255
+ context_aware_representations = (
256
+ (x * mask_float).sum(dim=1) / mask_float.sum(dim=1)
257
+ )
258
+ context_aware_representations = self.c_embedder(context_aware_representations)
259
+ c = timestep_aware_representations + context_aware_representations
260
+
261
+ x = self.input_embedder(x)
262
+
263
+ x = self.individual_token_refiner(x, c, mask)
264
+
265
+ return x
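
`IndividualTokenRefiner` builds a padding-aware self-attention mask before running its blocks. A toy reproduction of that mask construction (a single sample with five tokens, three of them valid, is an assumed example):

import torch

mask = torch.tensor([[1, 1, 1, 0, 0]], dtype=torch.bool)   # (batch, seq_len)
b, s = mask.shape
m1 = mask.view(b, 1, 1, s).repeat(1, 1, s, 1)   # which keys are valid, per query
m2 = m1.transpose(2, 3)                         # which queries are valid, per key
self_attn_mask = m1 & m2                        # attend only between valid tokens
self_attn_mask[:, :, :, 0] = True               # keep one key so padded rows are not all False
print(self_attn_mask[0, 0].int())
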
hymm_sp/sample_batch.py ADDED
@@ -0,0 +1,298 @@
1
+ import os
2
+ from pathlib import Path
3
+ from loguru import logger
4
+ import torch
5
+ import numpy as np
6
+ import torch.distributed
7
+ import random
8
+ import torchvision.transforms as transforms
9
+ from PIL import Image
10
+ import cv2
11
+
12
+ from torch.utils.data.distributed import DistributedSampler
13
+ from torch.utils.data import DataLoader
14
+ from hymm_sp.config import parse_args
15
+ from hymm_sp.sample_inference import HunyuanVideoSampler
16
+ from hymm_sp.data_kits.video_dataset import VideoCSVDataset
17
+ from hymm_sp.data_kits.data_tools import save_videos_grid
18
+ from hymm_sp.modules.parallel_states import (
19
+ initialize_distributed,
20
+ nccl_info,
21
+ )
22
+
23
+ class CropResize:
24
+ """
25
+ Custom transform to resize and crop images to a target size while preserving aspect ratio.
26
+
27
+ Resizes the image to ensure it covers the target dimensions, then center-crops to the exact size.
28
+ Useful for preparing consistent input dimensions for video generation models.
29
+ """
30
+ def __init__(self, size=(704, 1216)):
31
+ """
32
+ Args:
33
+ size (tuple): Target dimensions (height, width) for the output image
34
+ """
35
+ self.target_h, self.target_w = size
36
+
37
+ def __call__(self, img):
38
+ """
39
+ Apply the transform to an image.
40
+
41
+ Args:
42
+ img (PIL.Image): Input image to transform
43
+
44
+ Returns:
45
+ PIL.Image: Resized and cropped image with target dimensions
46
+ """
47
+ # Get original image dimensions
48
+ w, h = img.size
49
+
50
+ # Calculate scaling factor to ensure image covers target size
51
+ scale = max(
52
+ self.target_w / w, # Scale needed to cover target width
53
+ self.target_h / h # Scale needed to cover target height
54
+ )
55
+
56
+ # Resize image while preserving aspect ratio
57
+ new_size = (int(h * scale), int(w * scale))
58
+ resize_transform = transforms.Resize(
59
+ new_size,
60
+ interpolation=transforms.InterpolationMode.BILINEAR
61
+ )
62
+ resized_img = resize_transform(img)
63
+
64
+ # Center-crop to exact target dimensions
65
+ crop_transform = transforms.CenterCrop((self.target_h, self.target_w))
66
+ return crop_transform(resized_img)
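
# A quick check of the CropResize transform above on a synthetic PIL image; this snippet is
# an illustrative sketch, and the 1920x1080 source size and the Compose pipeline are assumptions.
from PIL import Image
import torchvision.transforms as transforms

probe = Image.new("RGB", (1920, 1080))
probe_transform = transforms.Compose([
    CropResize((704, 1216)),            # class defined above
    transforms.ToTensor(),
])
print(probe_transform(probe).shape)     # torch.Size([3, 704, 1216])
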
67
+
68
+
69
+ def main():
70
+ """
71
+ Main function for video generation using the Hunyuan multimodal model.
72
+
73
+ Handles argument parsing, distributed setup, model loading, data preparation,
74
+ and video generation with action-controlled transitions. Supports both image-to-video
75
+ and video-to-video generation tasks.
76
+ """
77
+ # Parse command-line arguments and configuration
78
+ args = parse_args()
79
+ models_root_path = Path(args.ckpt)
80
+ action_list = args.action_list
81
+ action_speed_list = args.action_speed_list
82
+ negative_prompt = args.add_neg_prompt
83
+
84
+ # Initialize distributed training/evaluation environment
85
+ logger.info("*" * 20)
86
+ initialize_distributed(args.seed)
87
+
88
+ # Validate model checkpoint path exists
89
+ if not models_root_path.exists():
90
+ raise ValueError(f"Model checkpoint path does not exist: {models_root_path}")
91
+ logger.info("+" * 20)
92
+
93
+ # Set up output directory
94
+ save_path = args.save_path if args.save_path_suffix == "" else f'{args.save_path}_{args.save_path_suffix}'
95
+ os.makedirs(save_path, exist_ok=True)
96
+ logger.info(f"Generated videos will be saved to: {save_path}")
97
+
98
+ # Initialize device configuration for distributed processing
99
+ rank = 0
100
+ device = torch.device("cuda")
101
+ if nccl_info.sp_size > 1:
102
+ # Use specific GPU based on process rank in distributed setup
103
+ device = torch.device(f"cuda:{torch.distributed.get_rank()}")
104
+ rank = torch.distributed.get_rank()
105
+
106
+ # Load the Hunyuan video sampler model from checkpoint
107
+ logger.info(f"Loading model from checkpoint: {args.ckpt}")
108
+ hunyuan_video_sampler = HunyuanVideoSampler.from_pretrained(
109
+ args.ckpt,
110
+ args=args,
111
+ device=device if not args.cpu_offload else torch.device("cpu")
112
+ )
113
+ # Update args with model-specific configurations from the checkpoint
114
+ args = hunyuan_video_sampler.args
115
+
116
+ # Enable CPU offloading if specified to reduce GPU memory usage
117
+ if args.cpu_offload:
118
+ from diffusers.hooks import apply_group_offloading
119
+ onload_device = torch.device("cuda")
120
+ apply_group_offloading(
121
+ hunyuan_video_sampler.pipeline.transformer,
122
+ onload_device=onload_device,
123
+ offload_type="block_level",
124
+ num_blocks_per_group=1
125
+ )
126
+ logger.info("Enabled CPU offloading for transformer blocks")
127
+
128
+ # Prepare the prompt and reference image inputs for generation
129
+
130
+ prompt = args.prompt
131
+ image_paths = [args.image_path]
132
+ logger.info(f"Prompt: {prompt}, Image Path {args.image_path}")
133
+ # Use the provided seed, or fall back to a random one
134
+ seed = args.seed if args.seed else random.randint(0, 1_000_000)
135
+
136
+ # Define image transformation pipeline for input reference images
137
+ closest_size = (704, 1216)
138
+ ref_image_transform = transforms.Compose([
139
+ CropResize(closest_size),
140
+ transforms.CenterCrop(closest_size),
141
+ transforms.ToTensor(),
142
+ transforms.Normalize([0.5], [0.5]) # Normalize to [-1, 1] range
143
+ ])
144
+
145
+ # Handle image-based generation (start from a single image)
146
+ if args.image_start:
147
+ # Load and preprocess reference images
148
+ raw_ref_images = [Image.open(image_path).convert('RGB') for image_path in image_paths]
149
+
150
+ # Apply transformations and prepare tensor for model input
151
+ ref_images_pixel_values = [ref_image_transform(ref_image) for ref_image in raw_ref_images]
152
+ ref_images_pixel_values = torch.cat(ref_images_pixel_values).unsqueeze(0).unsqueeze(2).to(device)
153
+
154
+ # Encode reference images to latent space using VAE
155
+ with torch.autocast(device_type="cuda", dtype=torch.float16, enabled=True):
156
+ if args.cpu_offload:
157
+ # Move VAE components to GPU temporarily for encoding
158
+ hunyuan_video_sampler.vae.quant_conv.to('cuda')
159
+ hunyuan_video_sampler.vae.encoder.to('cuda')
160
+
161
+ # Enable tiling for VAE to handle large images efficiently
162
+ hunyuan_video_sampler.pipeline.vae.enable_tiling()
163
+
164
+ # Encode image to latents and scale by VAE's scaling factor
165
+ raw_last_latents = hunyuan_video_sampler.vae.encode(
166
+ ref_images_pixel_values
167
+ ).latent_dist.sample().to(dtype=torch.float16) # Shape: (B, C, F, H, W)
168
+ raw_last_latents.mul_(hunyuan_video_sampler.vae.config.scaling_factor)
169
+ raw_ref_latents = raw_last_latents.clone()
170
+
171
+ # Clean up
172
+ hunyuan_video_sampler.pipeline.vae.disable_tiling()
173
+ if args.cpu_offload:
174
+ # Move VAE components back to CPU after encoding
175
+ hunyuan_video_sampler.vae.quant_conv.to('cpu')
176
+ hunyuan_video_sampler.vae.encoder.to('cpu')
177
+
178
+
179
+ # Handle video-based generation (start from an existing video)
180
+ else:
181
+ from decord import VideoReader # Lazy import for video handling
182
+
183
+ # Validate video file exists
184
+ video_path = args.video_path
185
+ if not os.path.exists(video_path):
186
+ raise RuntimeError(f"Video file not found: {video_path}")
187
+
188
+ # Load reference images from the provided image paths
189
+ raw_ref_images = [Image.open(image_path).convert('RGB') for image_path in image_paths]
190
+
191
+ # Load video and extract frames
192
+ ref_video = VideoReader(video_path)
193
+ ref_frames_length = len(ref_video)
194
+ logger.info(f"Loaded reference video with {ref_frames_length} frames")
195
+
196
+ # Preprocess video frames
197
+ transformed_images = []
198
+ for index in range(ref_frames_length):
199
+ # Convert video frame to PIL image and apply transformations
200
+ video_image = ref_video[index].numpy()
201
+ transformed_image = ref_image_transform(Image.fromarray(video_image))
202
+ transformed_images.append(transformed_image)
203
+
204
+ # Prepare tensor for model input
205
+ transformed_images = torch.stack(transformed_images, dim=1).unsqueeze(0).to(
206
+ device=hunyuan_video_sampler.device,
207
+ dtype=torch.float16
208
+ )
209
+
210
+ # Encode video frames to latent space using VAE
211
+ with torch.autocast(device_type="cuda", dtype=torch.float16, enabled=True):
212
+ if args.cpu_offload:
213
+ hunyuan_video_sampler.vae.quant_conv.to('cuda')
214
+ hunyuan_video_sampler.vae.encoder.to('cuda')
215
+
216
+ hunyuan_video_sampler.pipeline.vae.enable_tiling()
217
+
218
+ # Encode last 33 frames of video (model-specific requirement)
219
+ raw_last_latents = hunyuan_video_sampler.vae.encode(
220
+ transformed_images[:, :, -33:, ...]
221
+ ).latent_dist.sample().to(dtype=torch.float16)
222
+ raw_last_latents.mul_(hunyuan_video_sampler.vae.config.scaling_factor)
223
+
224
+ # Encode a single reference frame from the video
225
+ raw_ref_latents = hunyuan_video_sampler.vae.encode(
226
+ transformed_images[:, :, -33:-32, ...]
227
+ ).latent_dist.sample().to(dtype=torch.float16)
228
+ raw_ref_latents.mul_(hunyuan_video_sampler.vae.config.scaling_factor)
229
+
230
+ # Clean up
231
+ hunyuan_video_sampler.pipeline.vae.disable_tiling()
232
+ if args.cpu_offload:
233
+ hunyuan_video_sampler.vae.quant_conv.to('cpu')
234
+ hunyuan_video_sampler.vae.encoder.to('cpu')
235
+
236
+ # Store references for generation loop
237
+ ref_images = raw_ref_images
238
+ last_latents = raw_last_latents
239
+ ref_latents = raw_ref_latents
240
+
241
+ # Generate video segments for each action in the action list
242
+ for idx, action_id in enumerate(action_list):
243
+ # Determine if this is the first action and using image start
244
+ is_image = (idx == 0 and args.image_start)
245
+
246
+ logger.info(f"Generating segment {idx+1}/{len(action_list)} with action ID: {action_id}")
247
+ # Generate video segment with the current action
248
+ outputs = hunyuan_video_sampler.predict(
249
+ prompt=prompt,
250
+ action_id=action_id,
251
+ action_speed=action_speed_list[idx],
252
+ is_image=is_image,
253
+ size=(704, 1216),
254
+ seed=seed,
255
+ last_latents=last_latents, # Previous frame latents for continuity
256
+ ref_latents=ref_latents, # Reference latents for style consistency
257
+ video_length=args.sample_n_frames,
258
+ guidance_scale=args.cfg_scale,
259
+ num_images_per_prompt=args.num_images,
260
+ negative_prompt=negative_prompt,
261
+ infer_steps=args.infer_steps,
262
+ flow_shift=args.flow_shift_eval_video,
263
+ use_linear_quadratic_schedule=args.use_linear_quadratic_schedule,
264
+ linear_schedule_end=args.linear_schedule_end,
265
+ use_deepcache=args.use_deepcache,
266
+ cpu_offload=args.cpu_offload,
267
+ ref_images=ref_images,
268
+ output_dir=save_path,
269
+ return_latents=True,
270
+ use_sage=args.use_sage,
271
+ )
272
+
273
+ # Update latents for next iteration (maintain temporal consistency)
274
+ ref_latents = outputs["ref_latents"]
275
+ last_latents = outputs["last_latents"]
276
+
277
+ # Save generated video segments if this is the main process (rank 0)
278
+ if rank == 0:
279
+ sub_samples = outputs['samples'][0]
280
+
281
+ # Initialize or concatenate video segments
282
+ if idx == 0:
283
+ if args.image_start:
284
+ out_cat = sub_samples
285
+ else:
286
+ # Combine original video frames with generated frames
287
+ out_cat = torch.cat([(transformed_images.detach().cpu() + 1) / 2.0, sub_samples], dim=2)
288
+ else:
289
+ # Append new segment to existing video
290
+ out_cat = torch.cat([out_cat, sub_samples], dim=2)
291
+
292
+ # Save final combined video
293
+ save_path_mp4 = f"{save_path}/{os.path.basename(args.image_path).split('.')[0]}.mp4"
294
+ save_videos_grid(out_cat, save_path_mp4, n_rows=1, fps=24)
295
+ logger.info(f"Saved generated video to: {save_path_mp4}")
296
+
297
+ if __name__ == "__main__":
298
+ main()
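A toy illustration of how the loop in main() above grows the output video along the frame axis with torch.cat(dim=2); the shapes here are shrunk for readability, while real tensors are (B, C, F, H, W) at full resolution:

import torch

segments = [torch.rand(1, 3, 33, 44, 76) for _ in range(3)]
out_cat = segments[0]
for seg in segments[1:]:
    out_cat = torch.cat([out_cat, seg], dim=2)
print(out_cat.shape)  # torch.Size([1, 3, 99, 44, 76])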
hymm_sp/sample_inference.py ADDED
@@ -0,0 +1,716 @@
1
+ import math
2
+ import time
3
+ import torch
4
+ import random
5
+ from loguru import logger
6
+ import numpy as np
7
+ import matplotlib as mpl
8
+ import matplotlib.pyplot as plt
9
+ from matplotlib.patches import Patch
10
+ from mpl_toolkits.mplot3d.art3d import Poly3DCollection
11
+
12
+ from hymm_sp.diffusion import load_diffusion_pipeline
13
+ from hymm_sp.helpers import get_nd_rotary_pos_embed_new
14
+ from hymm_sp.inference import Inference
15
+ from hymm_sp.diffusion.schedulers import FlowMatchDiscreteScheduler
16
+ from packaging import version as pver
17
+
18
+ ACTION_DICT = {"w": "forward", "a": "left", "d": "right", "s": "backward"}
19
+
20
+ def custom_meshgrid(*args):
21
+ # ref: https://pytorch.org/docs/stable/generated/torch.meshgrid.html?highlight=meshgrid#torch.meshgrid
22
+ if pver.parse(torch.__version__) < pver.parse('1.10'):
23
+ return torch.meshgrid(*args)
24
+ else:
25
+ return torch.meshgrid(*args, indexing='ij')
26
+
27
+ def get_relative_pose(cam_params):
28
+ abs_w2cs = [cam_param.w2c_mat for cam_param in cam_params]
29
+ abs_c2ws = [cam_param.c2w_mat for cam_param in cam_params]
30
+ source_cam_c2w = abs_c2ws[0]
31
+ cam_to_origin = 0
32
+ target_cam_c2w = np.array([
33
+ [1, 0, 0, 0],
34
+ [0, 1, 0, -cam_to_origin],
35
+ [0, 0, 1, 0],
36
+ [0, 0, 0, 1]
37
+ ])
38
+ abs2rel = target_cam_c2w @ abs_w2cs[0]
39
+ ret_poses = [target_cam_c2w, ] + [abs2rel @ abs_c2w for abs_c2w in abs_c2ws[1:]]
40
+ for pose in ret_poses:
41
+ pose[:3, -1:] *= 10
42
+ ret_poses = np.array(ret_poses, dtype=np.float32)
43
+ return ret_poses
44
+
45
+ def ray_condition(K, c2w, H, W, device, flip_flag=None):
46
+ # c2w: B, V, 4, 4
47
+ # K: B, V, 4
48
+
49
+ B, V = K.shape[:2]
50
+
51
+ j, i = custom_meshgrid(
52
+ torch.linspace(0, H - 1, H, device=device, dtype=c2w.dtype),
53
+ torch.linspace(0, W - 1, W, device=device, dtype=c2w.dtype),
54
+ )
55
+ i = i.reshape([1, 1, H * W]).expand([B, V, H * W]) + 0.5 # [B, V, HxW]
56
+ j = j.reshape([1, 1, H * W]).expand([B, V, H * W]) + 0.5 # [B, V, HxW]
57
+
58
+ n_flip = torch.sum(flip_flag).item() if flip_flag is not None else 0
59
+ if n_flip > 0:
60
+ j_flip, i_flip = custom_meshgrid(
61
+ torch.linspace(0, H - 1, H, device=device, dtype=c2w.dtype),
62
+ torch.linspace(W - 1, 0, W, device=device, dtype=c2w.dtype)
63
+ )
64
+ i_flip = i_flip.reshape([1, 1, H * W]).expand(B, 1, H * W) + 0.5
65
+ j_flip = j_flip.reshape([1, 1, H * W]).expand(B, 1, H * W) + 0.5
66
+ i[:, flip_flag, ...] = i_flip
67
+ j[:, flip_flag, ...] = j_flip
68
+
69
+ fx, fy, cx, cy = K.chunk(4, dim=-1) # B,V, 1
70
+
71
+ zs = torch.ones_like(i) # [B, V, HxW]
72
+ xs = (i - cx) / fx * zs
73
+ ys = (j - cy) / fy * zs
74
+ zs = zs.expand_as(ys)
75
+
76
+ directions = torch.stack((xs, ys, zs), dim=-1) # B, V, HW, 3
77
+ directions = directions / directions.norm(dim=-1, keepdim=True) # B, V, HW, 3
78
+
79
+ rays_d = directions @ c2w[..., :3, :3].transpose(-1, -2) # B, V, HW, 3
80
+ rays_o = c2w[..., :3, 3] # B, V, 3
81
+ rays_o = rays_o[:, :, None].expand_as(rays_d) # B, V, HW, 3
82
+ # c2w @ directions
83
+ rays_dxo = torch.cross(rays_o, rays_d) # B, V, HW, 3
84
+ plucker = torch.cat([rays_dxo, rays_d], dim=-1)
85
+ plucker = plucker.reshape(B, c2w.shape[1], H, W, 6) # B, V, H, W, 6
86
+ # plucker = plucker.permute(0, 1, 4, 2, 3)
87
+ return plucker
88
+
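A quick shape check for ray_condition with an identity camera and a tiny 4x6 grid (assumes the functions above are importable; the pixel-unit intrinsics values are arbitrary):

import torch

K = torch.tensor([[[2.0, 2.0, 3.0, 2.0]]])      # [B=1, V=1, 4] -> fx, fy, cx, cy
c2w = torch.eye(4).reshape(1, 1, 4, 4)          # [B=1, V=1, 4, 4]
plucker = ray_condition(K, c2w, H=4, W=6, device='cpu')
print(plucker.shape)  # torch.Size([1, 1, 4, 6, 6]) -> B, V, H, W, 6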
89
+ def get_c2w(w2cs, transform_matrix, relative_c2w):
90
+ if relative_c2w:
91
+ target_cam_c2w = np.array([
92
+ [1, 0, 0, 0],
93
+ [0, 1, 0, 0],
94
+ [0, 0, 1, 0],
95
+ [0, 0, 0, 1]
96
+ ])
97
+ abs2rel = target_cam_c2w @ w2cs[0]
98
+ ret_poses = [target_cam_c2w, ] + [abs2rel @ np.linalg.inv(w2c) for w2c in w2cs[1:]]
99
+ for pose in ret_poses:
100
+ pose[:3, -1:] *= 2
101
+ # ret_poses = [poses[:, :3]*2 for poses in ret_poses]
102
+ # ret_poses[:, :, :3] *= 2
103
+ else:
104
+ ret_poses = [np.linalg.inv(w2c) for w2c in w2cs]
105
+ ret_poses = [transform_matrix @ x for x in ret_poses]
106
+ return np.array(ret_poses, dtype=np.float32)
107
+
108
+ def generate_motion_segment(current_pose,
109
+ motion_type: str,
110
+ value: float,
111
+ duration: int = 30):
112
+ """
113
+ Parameters:
114
+ motion_type: ('forward', 'backward', 'left', 'right',
115
+ 'left_rot', 'right_rot', 'up_rot', 'down_rot')
116
+ value: Translation(m) or Rotation(degree)
117
+ duration: frames
118
+
119
+ Return:
120
+ positions: [np.array(x,y,z), ...]
121
+ rotations: [np.array(pitch,yaw,roll), ...]
122
+ """
123
+ positions = []
124
+ rotations = []
125
+
126
+ if motion_type in ['forward', 'backward']:
127
+ yaw_rad = np.radians(current_pose['rotation'][1])
128
+ pitch_rad = np.radians(current_pose['rotation'][0])
129
+
130
+ forward_vec = np.array([
131
+ -math.sin(yaw_rad) * math.cos(pitch_rad),
132
+ math.sin(pitch_rad),
133
+ -math.cos(yaw_rad) * math.cos(pitch_rad)
134
+ ])
135
+
136
+ direction = 1 if motion_type == 'forward' else -1
137
+ total_move = forward_vec * value * direction
138
+ step = total_move / duration
139
+
140
+ for i in range(1, duration+1):
141
+ new_pos = current_pose['position'] + step * i
142
+ positions.append(new_pos.copy())
143
+ rotations.append(current_pose['rotation'].copy())
144
+
145
+ current_pose['position'] = positions[-1]
146
+
147
+ elif motion_type in ['left', 'right']:
148
+ yaw_rad = np.radians(current_pose['rotation'][1])
149
+ right_vec = np.array([math.cos(yaw_rad), 0, -math.sin(yaw_rad)])
150
+
151
+ direction = -1 if motion_type == 'right' else 1
152
+ total_move = right_vec * value * direction
153
+ step = total_move / duration
154
+
155
+ for i in range(1, duration+1):
156
+ new_pos = current_pose['position'] + step * i
157
+ positions.append(new_pos.copy())
158
+ rotations.append(current_pose['rotation'].copy())
159
+
160
+ current_pose['position'] = positions[-1]
161
+
162
+ elif motion_type.endswith('rot'):
163
+ axis = motion_type.split('_')[0]
164
+ total_rotation = np.zeros(3)
165
+
166
+ if axis == 'left':
167
+ total_rotation[0] = value
168
+ elif axis == 'right':
169
+ total_rotation[0] = -value
170
+ elif axis == 'up':
171
+ total_rotation[2] = -value
172
+ elif axis == 'down':
173
+ total_rotation[2] = value
174
+
175
+ step = total_rotation / duration
176
+
177
+ for i in range(1, duration+1):
178
+ positions.append(current_pose['position'].copy())
179
+ new_rot = current_pose['rotation'] + step * i
180
+ rotations.append(new_rot.copy())
181
+
182
+ current_pose['rotation'] = rotations[-1]
183
+
184
+ return positions, rotations, current_pose
185
+
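A small sanity check of generate_motion_segment (assumes the function above is importable): with zero yaw and pitch, a 'forward' segment moves the camera along -z in equal per-frame steps while the rotation stays fixed.

import numpy as np

pose = {'position': np.zeros(3), 'rotation': np.zeros(3)}
positions, rotations, pose = generate_motion_segment(pose, 'forward', value=0.2, duration=4)
print(len(positions))   # 4
print(positions[-1])    # approximately [0, 0, -0.2]
print(rotations[-1])    # [0. 0. 0.]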
186
+ def euler_to_quaternion(angles):
187
+ pitch, yaw, roll = np.radians(angles)
188
+
189
+ cy = math.cos(yaw * 0.5)
190
+ sy = math.sin(yaw * 0.5)
191
+ cp = math.cos(pitch * 0.5)
192
+ sp = math.sin(pitch * 0.5)
193
+ cr = math.cos(roll * 0.5)
194
+ sr = math.sin(roll * 0.5)
195
+
196
+ qw = cy * cp * cr + sy * sp * sr
197
+ qx = cy * cp * sr - sy * sp * cr
198
+ qy = sy * cp * sr + cy * sp * cr
199
+ qz = sy * cp * cr - cy * sp * sr
200
+
201
+ return [qw, qx, qy, qz]
202
+
203
+ def quaternion_to_rotation_matrix(q):
204
+ qw, qx, qy, qz = q
205
+ return np.array([
206
+ [1 - 2*(qy**2 + qz**2), 2*(qx*qy - qw*qz), 2*(qx*qz + qw*qy)],
207
+ [2*(qx*qy + qw*qz), 1 - 2*(qx**2 + qz**2), 2*(qy*qz - qw*qx)],
208
+ [2*(qx*qz - qw*qy), 2*(qy*qz + qw*qx), 1 - 2*(qx**2 + qy**2)]
209
+ ])
210
+
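Sanity check of the Euler-to-quaternion helpers above (assumes they are importable): zero angles should give the identity quaternion and the identity rotation matrix.

import numpy as np

q = euler_to_quaternion(np.array([0.0, 0.0, 0.0]))
R = quaternion_to_rotation_matrix(q)
assert np.allclose(q, [1.0, 0.0, 0.0, 0.0])
assert np.allclose(R, np.eye(3))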
211
+ def ActionToPoseFromID(action_id, value=0.2, duration=33):
212
+
213
+ all_positions = []
214
+ all_rotations = []
215
+ current_pose = {
216
+ 'position': np.array([0.0, 0.0, 0.0]), # XYZ
217
+ 'rotation': np.array([0.0, 0.0, 0.0]) # (pitch, yaw, roll)
218
+ }
219
+ intrinsic = [0.50505, 0.8979, 0.5, 0.5]
220
+ motion_type = ACTION_DICT[action_id]
221
+ positions, rotations, current_pose = generate_motion_segment(current_pose, motion_type, value, duration)
222
+ all_positions.extend(positions)
223
+ all_rotations.extend(rotations)
224
+
225
+ pose_list = []
226
+
227
+ row = [0] + intrinsic + [0, 0] + [1.0, 0.0, 0.0, 0.0, 0.0, 1.0, 0.0, 0.0, 0.0, 0.0, 1.0, 0.0]
228
+ first_row = " ".join(map(str, row))
229
+ pose_list.append(first_row)
230
+ for i, (pos, rot) in enumerate(zip(all_positions, all_rotations)):
231
+ quat = euler_to_quaternion(rot)
232
+ R = quaternion_to_rotation_matrix(quat)
233
+ extrinsic = np.hstack([R, pos.reshape(3, 1)])
234
+
235
+ row = [i] + intrinsic + [0, 0] + extrinsic.flatten().tolist()
236
+ pose_list.append(" ".join(map(str, row)))
237
+
238
+ return pose_list
239
+
240
+ class Camera(object):
241
+ def __init__(self, entry):
242
+ fx, fy, cx, cy = entry[1:5]
243
+ self.fx = fx
244
+ self.fy = fy
245
+ self.cx = cx
246
+ self.cy = cy
247
+ w2c_mat = np.array(entry[7:]).reshape(3, 4)
248
+ w2c_mat_4x4 = np.eye(4)
249
+ w2c_mat_4x4[:3, :] = w2c_mat
250
+ self.w2c_mat = w2c_mat_4x4
251
+ self.c2w_mat = np.linalg.inv(w2c_mat_4x4)
252
+
253
+ class CameraPoseVisualizer:
254
+ def __init__(self, xlim, ylim, zlim):
255
+ self.fig = plt.figure(figsize=(7, 7))
256
+ self.ax = self.fig.add_subplot(projection='3d')
257
+ # self.ax.view_init(elev=25, azim=-90)
258
+ self.plotly_data = None # plotly data traces
259
+ self.ax.set_aspect("auto")
260
+ self.ax.set_xlim(xlim)
261
+ self.ax.set_ylim(ylim)
262
+ self.ax.set_zlim(zlim)
263
+ self.ax.set_xlabel('x')
264
+ self.ax.set_ylabel('y')
265
+ self.ax.set_zlabel('z')
266
+ print('initialize camera pose visualizer')
267
+
268
+ def extrinsic2pyramid(self, extrinsic, color_map='red', hw_ratio=9/16, base_xval=1, zval=3):
269
+ vertex_std = np.array([[0, 0, 0, 1],
270
+ [base_xval, -base_xval * hw_ratio, zval, 1],
271
+ [base_xval, base_xval * hw_ratio, zval, 1],
272
+ [-base_xval, base_xval * hw_ratio, zval, 1],
273
+ [-base_xval, -base_xval * hw_ratio, zval, 1]])
274
+ vertex_transformed = vertex_std @ extrinsic.T
275
+ meshes = [[vertex_transformed[0, :-1], vertex_transformed[1][:-1], vertex_transformed[2, :-1]],
276
+ [vertex_transformed[0, :-1], vertex_transformed[2, :-1], vertex_transformed[3, :-1]],
277
+ [vertex_transformed[0, :-1], vertex_transformed[3, :-1], vertex_transformed[4, :-1]],
278
+ [vertex_transformed[0, :-1], vertex_transformed[4, :-1], vertex_transformed[1, :-1]],
279
+ [vertex_transformed[1, :-1], vertex_transformed[2, :-1], vertex_transformed[3, :-1],
280
+ vertex_transformed[4, :-1]]]
281
+
282
+ color = color_map if isinstance(color_map, str) else plt.cm.rainbow(color_map)
283
+
284
+ self.ax.add_collection3d(
285
+ Poly3DCollection(meshes, facecolors=color, linewidths=0.3, edgecolors=color, alpha=0.35))
286
+
287
+ def customize_legend(self, list_label):
288
+ list_handle = []
289
+ for idx, label in enumerate(list_label):
290
+ color = plt.cm.rainbow(idx / len(list_label))
291
+ patch = Patch(color=color, label=label)
292
+ list_handle.append(patch)
293
+ plt.legend(loc='right', bbox_to_anchor=(1.8, 0.5), handles=list_handle)
294
+
295
+ def colorbar(self, max_frame_length):
296
+ cmap = mpl.cm.rainbow
297
+ norm = mpl.colors.Normalize(vmin=0, vmax=max_frame_length)
298
+ self.fig.colorbar(mpl.cm.ScalarMappable(norm=norm, cmap=cmap),
299
+ ax=self.ax, orientation='vertical', label='Frame Number')
300
+
301
+ def show(self, file_name):
302
+ plt.title('Extrinsic Parameters')
303
+ # plt.show()
304
+ plt.savefig(file_name)
305
+
306
+
307
+ def align_to(value, alignment):
308
+ return int(math.ceil(value / alignment) * alignment)
309
+
310
+
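align_to is how predict() below rounds the requested height/width up to a multiple of 16, for example:

assert align_to(704, 16) == 704   # already aligned
assert align_to(705, 16) == 720   # rounded up to the next multiple of 16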
311
+ def GetPoseEmbedsFromPoses(poses, h, w, target_length, flip=False, start_index=None):
312
+
313
+ poses = [pose.split(' ') for pose in poses]
314
+
315
+ start_idx = start_index
316
+ sample_id = [start_idx + i for i in range(target_length)]
317
+
318
+ poses = [poses[i] for i in sample_id]
319
+
320
+ frame = len(poses)
321
+ w2cs = [np.asarray([float(p) for p in pose[7:]]).reshape(3, 4) for pose in poses]
322
+ transform_matrix = np.asarray([[1, 0, 0, 0], [0, 0, 1, 0], [0, -1, 0, 0], [0, 0, 0, 1]]).reshape(4, 4)
323
+ last_row = np.zeros((1, 4))
324
+ last_row[0, -1] = 1.0
325
+ w2cs = [np.concatenate((w2c, last_row), axis=0) for w2c in w2cs]
326
+ c2ws = get_c2w(w2cs, transform_matrix, relative_c2w=True)
327
+
328
+ cam_params = [[float(x) for x in pose] for pose in poses]
329
+ assert len(cam_params) == target_length
330
+ cam_params = [Camera(cam_param) for cam_param in cam_params]
331
+
332
+ monst3r_w = cam_params[0].cx * 2
333
+ monst3r_h = cam_params[0].cy * 2
334
+ ratio_w, ratio_h = w/monst3r_w, h/monst3r_h
335
+ intrinsics = np.asarray([[cam_param.fx * ratio_w,
336
+ cam_param.fy * ratio_h,
337
+ cam_param.cx * ratio_w,
338
+ cam_param.cy * ratio_h]
339
+ for cam_param in cam_params], dtype=np.float32)
340
+ intrinsics = torch.as_tensor(intrinsics)[None] # [1, n_frame, 4]
341
+ relative_pose = True
342
+ if relative_pose:
343
+ c2w_poses = get_relative_pose(cam_params)
344
+ else:
345
+ c2w_poses = np.array([cam_param.c2w_mat for cam_param in cam_params], dtype=np.float32)
346
+ c2w = torch.as_tensor(c2w_poses)[None] # [1, n_frame, 4, 4]
347
+ uncond_c2w = torch.zeros_like(c2w)
348
+ uncond_c2w[:, :] = torch.eye(4, device=c2w.device)
349
+ flip_flag = torch.zeros(target_length, dtype=torch.bool, device=c2w.device)
350
+ plucker_embedding = ray_condition(intrinsics, c2w, h, w, device='cpu',
351
+ flip_flag=flip_flag)[0].permute(0, 3, 1, 2).contiguous()
352
+ uncond_plucker_embedding = ray_condition(intrinsics, uncond_c2w, h, w, device='cpu',
353
+ flip_flag=flip_flag)[0].permute(0, 3, 1, 2).contiguous()
354
+
355
+ return plucker_embedding, uncond_plucker_embedding, poses
356
+
357
+ def GetPoseEmbedsFromTxt(pose_dir, h, w, target_length, flip=False, start_index=None, step=1):
358
+ # get camera pose
359
+ with open(pose_dir, 'r') as f:
360
+ poses = f.readlines()
361
+ poses = [pose.strip().split(' ') for pose in poses[1:]]
362
+ start_idx = start_index
363
+ sample_id = [start_idx + i*step for i in range(target_length)]
364
+ poses = [poses[i] for i in sample_id]
365
+
366
+ cam_params = [[float(x) for x in pose] for pose in poses]
367
+ assert len(cam_params) == target_length
368
+ cam_params = [Camera(cam_param) for cam_param in cam_params]
369
+
370
+ monst3r_w = cam_params[0].cx * 2
371
+ monst3r_h = cam_params[0].cy * 2
372
+ ratio_w, ratio_h = w/monst3r_w, h/monst3r_h
373
+ intrinsics = np.asarray([[cam_param.fx * ratio_w,
374
+ cam_param.fy * ratio_h,
375
+ cam_param.cx * ratio_w,
376
+ cam_param.cy * ratio_h]
377
+ for cam_param in cam_params], dtype=np.float32)
378
+ intrinsics = torch.as_tensor(intrinsics)[None] # [1, n_frame, 4]
379
+ relative_pose = True
380
+ if relative_pose:
381
+ c2w_poses = get_relative_pose(cam_params)
382
+ else:
383
+ c2w_poses = np.array([cam_param.c2w_mat for cam_param in cam_params], dtype=np.float32)
384
+ c2w = torch.as_tensor(c2w_poses)[None] # [1, n_frame, 4, 4]
385
+ uncond_c2w = torch.zeros_like(c2w)
386
+ uncond_c2w[:, :] = torch.eye(4, device=c2w.device)
387
+ if flip:
388
+ flip_flag = torch.ones(target_length, dtype=torch.bool, device=c2w.device)
389
+ else:
390
+ flip_flag = torch.zeros(target_length, dtype=torch.bool, device=c2w.device)
391
+ plucker_embedding = ray_condition(intrinsics, c2w, h, w, device='cpu',
392
+ flip_flag=flip_flag)[0].permute(0, 3, 1, 2).contiguous()
393
+ uncond_plucker_embedding = ray_condition(intrinsics, uncond_c2w, h, w, device='cpu',
394
+ flip_flag=flip_flag)[0].permute(0, 3, 1, 2).contiguous()
395
+
396
+ return plucker_embedding, uncond_plucker_embedding, poses
397
+
398
+
399
+ class HunyuanVideoSampler(Inference):
400
+ def __init__(self, args, vae, vae_kwargs, text_encoder, model, text_encoder_2=None, pipeline=None,
401
+ device=0, logger=None):
402
+ super().__init__(args, vae, vae_kwargs, text_encoder, model, text_encoder_2=text_encoder_2,
403
+ pipeline=pipeline, device=device, logger=logger)
404
+
405
+ self.args = args
406
+ self.pipeline = load_diffusion_pipeline(
407
+ args, 0, self.vae, self.text_encoder, self.text_encoder_2, self.model,
408
+ device=self.device)
409
+ print('Loaded Hunyuan model successfully.')
410
+
411
+ def get_rotary_pos_embed(self, video_length, height, width, concat_dict={}):
412
+ target_ndim = 3
413
+ ndim = 5 - 2
414
+ if '884' in self.args.vae:
415
+ latents_size = [(video_length-1)//4+1 , height//8, width//8]
416
+ else:
417
+ latents_size = [video_length , height//8, width//8]
418
+
419
+ if isinstance(self.model.patch_size, int):
420
+ assert all(s % self.model.patch_size == 0 for s in latents_size), \
421
+ f"Latent size(last {ndim} dimensions) should be divisible by patch size({self.model.patch_size}), " \
422
+ f"but got {latents_size}."
423
+ rope_sizes = [s // self.model.patch_size for s in latents_size]
424
+ elif isinstance(self.model.patch_size, list):
425
+ assert all(s % self.model.patch_size[idx] == 0 for idx, s in enumerate(latents_size)), \
426
+ f"Latent size(last {ndim} dimensions) should be divisible by patch size({self.model.patch_size}), " \
427
+ f"but got {latents_size}."
428
+ rope_sizes = [s // self.model.patch_size[idx] for idx, s in enumerate(latents_size)]
429
+
430
+ if len(rope_sizes) != target_ndim:
431
+ rope_sizes = [1] * (target_ndim - len(rope_sizes)) + rope_sizes # time axis
432
+ head_dim = self.model.hidden_size // self.model.num_heads
433
+ rope_dim_list = self.model.rope_dim_list
434
+ if rope_dim_list is None:
435
+ rope_dim_list = [head_dim // target_ndim for _ in range(target_ndim)]
436
+ assert sum(rope_dim_list) == head_dim, "sum(rope_dim_list) must equal the head_dim of the attention layer"
437
+ freqs_cos, freqs_sin = get_nd_rotary_pos_embed_new(rope_dim_list,
438
+ rope_sizes,
439
+ theta=self.args.rope_theta,
440
+ use_real=True,
441
+ theta_rescale_factor=1,
442
+ concat_dict=concat_dict)
443
+ return freqs_cos, freqs_sin
444
+
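The latent-size arithmetic above for an '884' VAE (temporal compression 4, spatial compression 8), evaluated for the 33-frame, 704x1216 setting used elsewhere in this repo:

video_length, height, width = 33, 704, 1216
latents_size = [(video_length - 1) // 4 + 1, height // 8, width // 8]
print(latents_size)  # [9, 88, 152]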
445
+ @torch.no_grad()
446
+ def predict(self,
447
+ prompt,
448
+ is_image=True,
449
+ size=(720, 1280),
450
+ video_length=129,
451
+ seed=None,
452
+ negative_prompt=None,
453
+ infer_steps=50,
454
+ guidance_scale=6.0,
455
+ flow_shift=5.0,
456
+ batch_size=1,
457
+ num_videos_per_prompt=1,
458
+ verbose=1,
459
+ output_type="pil",
460
+ **kwargs):
461
+ """
462
+ Generate a video segment from the given text prompt and action/camera controls.
463
+
464
+ Args:
465
+ prompt (str or List[str]): The input text.
466
+ kwargs:
467
+ size (tuple): The (height, width) of the output video. Default is (720, 1280).
468
+ video_length (int): The number of frames in the output video. Default is 129.
469
+ seed (int or List[int]): The random seed for the generation. Default is a random integer.
470
+ negative_prompt (str or List[str]): The negative text prompt. Default is an empty string.
471
+ infer_steps (int): The number of inference steps. Default is 50.
472
+ guidance_scale (float): The guidance scale for the generation. Default is 6.0.
473
+ num_videos_per_prompt (int): The number of videos per prompt. Default is 1.
474
+ verbose (int): 0 for no logging, 1 for full logging, 2 for reduced logging. Default is 1.
475
+ output_type (str): The output type of the image, can be one of `pil`, `np`, `pt`, `latent`.
476
+ Default is 'pil'.
477
+ """
478
+
479
+ out_dict = dict()
480
+
481
+ # ---------------------------------
482
+ # Prompt
483
+ # ---------------------------------
484
+ prompt_embeds = kwargs.get("prompt_embeds", None)
485
+ attention_mask = kwargs.get("attention_mask", None)
486
+ negative_prompt_embeds = kwargs.get("negative_prompt_embeds", None)
487
+ negative_attention_mask = kwargs.get("negative_attention_mask", None)
488
+ ref_latents = kwargs.get("ref_latents", None)
489
+ uncond_ref_latents = kwargs.get("uncond_ref_latents", None)
490
+ return_latents = kwargs.get("return_latents", False)
491
+ negative_prompt = kwargs.get("negative_prompt", None)
492
+
493
+ action_id = kwargs.get("action_id", None)
494
+ action_speed = kwargs.get("action_speed", None)
495
+ start_index = kwargs.get("start_index", None)
496
+ last_latents = kwargs.get("last_latents", None)
497
+ ref_latents = kwargs.get("ref_latents", None)
498
+ input_pose = kwargs.get("input_pose", None)
499
+ step = kwargs.get("step", 1)
500
+ use_sage = kwargs.get("use_sage", False)
501
+
502
+ size = self.parse_size(size)
503
+ target_height = align_to(size[0], 16)
504
+ target_width = align_to(size[1], 16)
505
+ # target_video_length = video_length
506
+
507
+ if input_pose is not None:
508
+ pose_embeds, uncond_pose_embeds, poses = GetPoseEmbedsFromTxt(input_pose,
509
+ target_height,
510
+ target_width,
511
+ 33,
512
+ kwargs.get("flip", False),
513
+ start_index,
514
+ step)
515
+ else:
516
+ pose = ActionToPoseFromID(action_id, value=action_speed)
517
+ pose_embeds, uncond_pose_embeds, poses = GetPoseEmbedsFromPoses(pose,
518
+ target_height,
519
+ target_width,
520
+ 33,
521
+ kwargs.get("flip", False),
522
+ 0)
523
+
524
+ if is_image:
525
+ target_length = 34
526
+ else:
527
+ target_length = 66
528
+
529
+ out_dict['frame'] = target_length
530
+ # print("pose embeds: ", pose_embeds.shape, uncond_pose_embeds.shape)
531
+
532
+ pose_embeds = pose_embeds.unsqueeze(0).to(torch.bfloat16).to('cuda')
533
+ uncond_pose_embeds = uncond_pose_embeds.unsqueeze(0).to(torch.bfloat16).to('cuda')
534
+
535
+
536
+
537
+ cpu_offload = kwargs.get("cpu_offload", 0)
538
+ use_deepcache = kwargs.get("use_deepcache", 1)
539
+ denoise_strength = kwargs.get("denoise_strength", 1.0)
540
+ init_latents = kwargs.get("init_latents", None)
541
+ mask = kwargs.get("mask", None)
542
+ if prompt is None:
543
+ # prompt_embeds, attention_mask, negative_prompt_embeds and negative_attention_mask should not be None
544
+ # pipeline will help to check this
545
+ prompt = None
546
+ negative_prompt = None
547
+ batch_size = prompt_embeds.shape[0]
548
+ assert prompt_embeds is not None
549
+ else:
550
+ # prompt_embeds, attention_mask, negative_prompt_embeds and negative_attention_mask should be None
551
+ # pipeline will help to check this
552
+ if isinstance(prompt, str):
553
+ batch_size = 1
554
+ prompt = [prompt]
555
+ elif isinstance(prompt, (list, tuple)):
556
+ batch_size = len(prompt)
557
+ else:
558
+ raise ValueError(f"Prompt must be a string or a list of strings, got {prompt}.")
559
+
560
+ if negative_prompt is None:
561
+ negative_prompt = [""] * batch_size
562
+ if isinstance(negative_prompt, str):
563
+ negative_prompt = [negative_prompt] * batch_size
564
+
565
+ # ---------------------------------
566
+ # Other arguments
567
+ # ---------------------------------
568
+ scheduler = FlowMatchDiscreteScheduler(shift=flow_shift,
569
+ reverse=self.args.flow_reverse,
570
+ solver=self.args.flow_solver,
571
+ )
572
+ self.pipeline.scheduler = scheduler
573
+
574
+ # ---------------------------------
575
+ # Random seed
576
+ # ---------------------------------
577
+
578
+ if isinstance(seed, torch.Tensor):
579
+ seed = seed.tolist()
580
+ if seed is None:
581
+ seeds = [random.randint(0, 1_000_000) for _ in range(batch_size * num_videos_per_prompt)]
582
+ elif isinstance(seed, int):
583
+ seeds = [seed + i for _ in range(batch_size) for i in range(num_videos_per_prompt)]
584
+ elif isinstance(seed, (list, tuple)):
585
+ if len(seed) == batch_size:
586
+ seeds = [int(seed[i]) + j for i in range(batch_size) for j in range(num_videos_per_prompt)]
587
+ elif len(seed) == batch_size * num_videos_per_prompt:
588
+ seeds = [int(s) for s in seed]
589
+ else:
590
+ raise ValueError(
591
+ f"Length of seed must be equal to number of prompt(batch_size) or "
592
+ f"batch_size * num_videos_per_prompt ({batch_size} * {num_videos_per_prompt}), got {seed}."
593
+ )
594
+ else:
595
+ raise ValueError(f"Seed must be an integer, a list of integers, or None, got {seed}.")
596
+ generator = [torch.Generator(self.device).manual_seed(seed) for seed in seeds]
597
+
598
+ # ---------------------------------
599
+ # Image/Video size and frame
600
+ # ---------------------------------
601
+
602
+
603
+ out_dict['size'] = (target_height, target_width)
604
+ out_dict['video_length'] = target_length
605
+ out_dict['seeds'] = seeds
606
+ out_dict['negative_prompt'] = negative_prompt
607
+ # ---------------------------------
608
+ # Build RoPE
609
+ # ---------------------------------
610
+
611
+ concat_dict = {'mode': 'timecat', 'bias': -1}
612
+ if is_image:
613
+ freqs_cos, freqs_sin = self.get_rotary_pos_embed(37, target_height, target_width)
614
+ else:
615
+ freqs_cos, freqs_sin = self.get_rotary_pos_embed(69, target_height, target_width)
616
+
617
+ n_tokens = freqs_cos.shape[0]
618
+
619
+ # ---------------------------------
620
+ # Inference
621
+ # ---------------------------------
622
+ output_dir = kwargs.get("output_dir", None)
623
+
624
+ if verbose == 1:
625
+ debug_str = f"""
626
+ size: {out_dict['size']}
627
+ video_length: {target_length}
628
+ prompt: {prompt}
629
+ neg_prompt: {negative_prompt}
630
+ seed: {seed}
631
+ infer_steps: {infer_steps}
632
+ denoise_strength: {denoise_strength}
633
+ use_deepcache: {use_deepcache}
634
+ use_sage: {use_sage}
635
+ cpu_offload: {cpu_offload}
636
+ num_videos_per_prompt: {num_videos_per_prompt}
637
+ guidance_scale: {guidance_scale}
638
+ n_tokens: {n_tokens}
639
+ flow_shift: {flow_shift}
640
+ output: {output_dir}"""
641
+ self.logger.info(debug_str)
642
+
643
+ start_time = time.time()
644
+ samples = self.pipeline(prompt=prompt,
645
+ last_latents=last_latents,
646
+ cam_latents=pose_embeds,
647
+ uncond_cam_latents=uncond_pose_embeds,
648
+ height=target_height,
649
+ width=target_width,
650
+ video_length=target_length,
651
+ gt_latents = ref_latents,
652
+ num_inference_steps=infer_steps,
653
+ guidance_scale=guidance_scale,
654
+ negative_prompt=negative_prompt,
655
+ num_videos_per_prompt=num_videos_per_prompt,
656
+ generator=generator,
657
+ prompt_embeds=prompt_embeds,
658
+ ref_latents=ref_latents,
659
+ latents=init_latents,
660
+ denoise_strength=denoise_strength,
661
+ mask=mask,
662
+ uncond_ref_latents=uncond_ref_latents,
663
+ ip_cfg_scale=self.args.ip_cfg_scale,
664
+ use_deepcache=use_deepcache,
665
+ attention_mask=attention_mask,
666
+ negative_prompt_embeds=negative_prompt_embeds,
667
+ negative_attention_mask=negative_attention_mask,
668
+ output_type=output_type,
669
+ freqs_cis=(freqs_cos, freqs_sin),
670
+ n_tokens=n_tokens,
671
+ data_type='video' if target_length > 1 else 'image',
672
+ is_progress_bar=True,
673
+ vae_ver=self.args.vae,
674
+ enable_tiling=self.args.vae_tiling,
675
+ cpu_offload=cpu_offload,
676
+ return_latents=return_latents,
677
+ use_sage=use_sage,
678
+ )
679
+ if samples is None:
680
+ return None
681
+ out_dict['samples'] = []
682
+ out_dict["prompts"] = prompt
683
+ out_dict['pose'] = poses
684
+
685
+ if return_latents:
686
+ print("return_latents | TRUE")
687
+ latents, timesteps, last_latents, ref_latents = samples[1], samples[2], samples[3], samples[4]
688
+ # samples = samples[0][0]
689
+ if samples[0] is not None and len(samples[0]) > 0:
690
+ samples = samples[0][0]
691
+ else:
692
+ samples = None
693
+ out_dict["denoised_lantents"] = latents
694
+ out_dict["timesteps"] = timesteps
695
+ out_dict["ref_latents"] = ref_latents
696
+ out_dict["last_latents"] = last_latents
697
+
698
+ else:
699
+ samples = samples[0]
700
+
701
+ if samples is not None:
702
+ for i, sample in enumerate(samples):
703
+ sample = samples[i].unsqueeze(0)
704
+ sub_samples = []
705
+ sub_samples.append(sample)
706
+ sample_num = len(sub_samples)
707
+ sub_samples = torch.concat(sub_samples)
708
+ # only save in tp rank 0
709
+ out_dict['samples'].append(sub_samples)
710
+
711
+ # visualize pose
712
+
713
+ gen_time = time.time() - start_time
714
+ logger.info(f"Success, time: {gen_time}")
715
+ return out_dict
716
+
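The seed handling in predict() expands an integer seed into batch_size * num_videos_per_prompt deterministic generators; a minimal sketch of that logic:

import torch

seed, batch_size, num_videos_per_prompt = 42, 2, 2
seeds = [seed + i for _ in range(batch_size) for i in range(num_videos_per_prompt)]
generators = [torch.Generator("cpu").manual_seed(s) for s in seeds]
print(seeds)  # [42, 43, 42, 43]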
hymm_sp/text_encoder/__init__.py ADDED
@@ -0,0 +1,310 @@
1
+ from dataclasses import dataclass
2
+ from typing import Optional, Tuple
3
+ from copy import deepcopy
4
+
5
+ import torch, os
6
+ import torch.nn as nn
7
+ from transformers import (
8
+ CLIPTextModel, CLIPTokenizer, LlavaForConditionalGeneration,LlamaModel,
9
+ LlamaTokenizerFast
10
+ )
11
+ from transformers.utils import ModelOutput
12
+ from ..constants import TEXT_ENCODER_PATH, TOKENIZER_PATH, PRECISION_TO_TYPE
13
+
14
+ CPU_OFFLOAD = int(os.environ.get("CPU_OFFLOAD", 0))
15
+ print(f'text_encoder: cpu_offload={CPU_OFFLOAD}')
16
+
17
+ def use_default(value, default):
18
+ return value if value is not None else default
19
+
20
+ def load_text_encoder(text_encoder_type,
21
+ text_encoder_precision=None,
22
+ text_encoder_path=None,
23
+ logger=None,
24
+ device=None
25
+ ):
26
+ if text_encoder_path is None:
27
+ text_encoder_path = TEXT_ENCODER_PATH[text_encoder_type]
28
+ if logger is not None:
29
+ logger.info(f"Loading text encoder model ({text_encoder_type}) from: {text_encoder_path}")
30
+
31
+ if text_encoder_type == "clipL":
32
+ text_encoder = CLIPTextModel.from_pretrained(text_encoder_path)
33
+ text_encoder.final_layer_norm = text_encoder.text_model.final_layer_norm
34
+ elif text_encoder_type == "llava-llama-3-8b":
35
+ text_encoder = LlavaForConditionalGeneration.from_pretrained(text_encoder_path, low_cpu_mem_usage=True)
36
+ import transformers
37
+ transformers_version = transformers.__version__
38
+ if tuple(int(v) for v in transformers_version.split(".")[:2]) >= (4, 53):
39
+ text_encoder.final_layer_norm = text_encoder.language_model.norm
40
+ else:
41
+ text_encoder.final_layer_norm = text_encoder.language_model.model.norm
42
+
43
+ else:
44
+ raise ValueError(f"Unsupported text encoder type: {text_encoder_type}")
45
+
46
+ if text_encoder_precision is not None:
47
+ text_encoder = text_encoder.to(dtype=PRECISION_TO_TYPE[text_encoder_precision])
48
+
49
+ text_encoder.requires_grad_(False)
50
+
51
+ if logger is not None:
52
+ logger.info(f"Text encoder to dtype: {text_encoder.dtype}")
53
+
54
+ if device is not None:
55
+ text_encoder = text_encoder.to(device)
56
+
57
+ return text_encoder, text_encoder_path
58
+
59
+ def load_tokenizer(tokenizer_type,
60
+ tokenizer_path=None,
61
+ padding_side="right",
62
+ logger=None
63
+ ):
64
+ if tokenizer_path is None:
65
+ tokenizer_path = TOKENIZER_PATH[tokenizer_type]
66
+ if logger is not None:
67
+ logger.info(f"Loading tokenizer ({tokenizer_type}) from: {tokenizer_path}")
68
+
69
+ if tokenizer_type == "clipL":
70
+ tokenizer = CLIPTokenizer.from_pretrained(tokenizer_path, max_length=77)
71
+ elif tokenizer_type == "llava-llama-3-8b":
72
+ tokenizer = LlamaTokenizerFast.from_pretrained(tokenizer_path, padding_side=padding_side)
73
+ else:
74
+ raise ValueError(f"Unsupported tokenizer type: {tokenizer_type}")
75
+
76
+ return tokenizer, tokenizer_path
77
+
78
+
79
+ @dataclass
80
+ class TextEncoderModelOutput(ModelOutput):
81
+ """
82
+ Base class for model's outputs that also contains a pooling of the last hidden states.
83
+
84
+ Args:
85
+ hidden_state (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_size)`):
86
+ Sequence of hidden-states at the output of the last layer of the model.
87
+ attention_mask (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
88
+ Mask to avoid performing attention on padding token indices. Mask values selected in ``[0, 1]``:
89
+ hidden_states_list (`tuple(torch.FloatTensor)`, *optional*,
90
+ returned when `output_hidden_states=True` is passed):
91
+ Tuple of `torch.FloatTensor` (one for the output of the embeddings, if the model has an embedding layer, +
92
+ one for the output of each layer) of shape `(batch_size, sequence_length, hidden_size)`.
93
+ Hidden-states of the model at the output of each layer plus the optional initial embedding outputs.
94
+ text_outputs (`list`, *optional*, returned when `return_texts=True` is passed):
95
+ List of decoded texts.
96
+ """
97
+
98
+ hidden_state: torch.FloatTensor = None
99
+ attention_mask: Optional[torch.LongTensor] = None
100
+ hidden_states_list: Optional[Tuple[torch.FloatTensor, ...]] = None
101
+ text_outputs: Optional[list] = None
102
+
103
+
104
+ class TextEncoder(nn.Module):
105
+ def __init__(self,
106
+ text_encoder_type: str,
107
+ max_length: int,
108
+ text_encoder_precision: Optional[str] = None,
109
+ text_encoder_path: Optional[str] = None,
110
+ tokenizer_type: Optional[str] = None,
111
+ tokenizer_path: Optional[str] = None,
112
+ output_key: Optional[str] = None,
113
+ use_attention_mask: bool = True,
114
+ input_max_length: Optional[int] = None,
115
+ prompt_template_video: Optional[dict] = None,
116
+ hidden_state_skip_layer: Optional[int] = None,
117
+ apply_final_norm: bool = False,
118
+ reproduce: bool = False,
119
+ logger=None,
120
+ device=None,
121
+ ):
122
+ super().__init__()
123
+ self.text_encoder_type = text_encoder_type
124
+ self.max_length = max_length
125
+ self.precision = text_encoder_precision
126
+ self.model_path = text_encoder_path
127
+ self.tokenizer_type = tokenizer_type if tokenizer_type is not None else text_encoder_type
128
+ self.tokenizer_path = tokenizer_path if tokenizer_path is not None else text_encoder_path
129
+ self.use_attention_mask = use_attention_mask
130
+ if prompt_template_video is not None:
131
+ assert use_attention_mask is True, "Attention mask is True required when training videos."
132
+ self.input_max_length = input_max_length if input_max_length is not None else max_length
133
+ self.prompt_template_video = prompt_template_video
134
+ self.hidden_state_skip_layer = hidden_state_skip_layer
135
+ self.apply_final_norm = apply_final_norm
136
+ self.reproduce = reproduce
137
+ self.logger = logger
138
+
139
+ self.use_video_template = self.prompt_template_video is not None
140
+ if self.use_video_template:
141
+ if self.prompt_template_video is not None:
142
+ assert isinstance(self.prompt_template_video, dict) and "template" in self.prompt_template_video, (
143
+ f"`prompt_template_video` must be a dictionary with a key 'template', \
144
+ got {self.prompt_template_video}"
145
+ )
146
+ assert '{}' in str(self.prompt_template_video["template"]), (
147
+ "`prompt_template_video['template']` must contain a placeholder `{}` for the input text, "
148
+ f"got {self.prompt_template_video['template']}"
149
+ )
150
+
151
+ if "clip" in text_encoder_type:
152
+ self.output_key = output_key or "pooler_output"
153
+ elif "llama" in text_encoder_type:
154
+ self.output_key = output_key or "last_hidden_state"
155
+ else:
156
+ raise ValueError(f"Unsupported text encoder type: {text_encoder_type}")
157
+
158
+ self.model, self.model_path = load_text_encoder(
159
+ text_encoder_type=self.text_encoder_type,
160
+ text_encoder_precision=self.precision,
161
+ text_encoder_path=self.model_path,
162
+ logger=self.logger,
163
+ device=device
164
+ )
165
+ self.dtype = self.model.dtype
166
+ self.device = self.model.device
167
+
168
+ self.tokenizer, self.tokenizer_path = load_tokenizer(
169
+ tokenizer_type=self.tokenizer_type,
170
+ tokenizer_path=self.tokenizer_path,
171
+ padding_side="right",
172
+ logger=self.logger
173
+ )
174
+
175
+ def __repr__(self):
176
+ return f"{self.text_encoder_type} ({self.precision} - {self.model_path})"
177
+
178
+ @staticmethod
179
+ def apply_text_to_template(text, template):
180
+ """
181
+ Apply text to template.
182
+
183
+ Args:
184
+ text (str): Input text.
185
+ template (str or list): Template string or list of chat conversation.
186
+ prevent_empty_text (bool): If Ture, we will prevent the user text from being empty
187
+ by adding a space. Defaults to True.
188
+ """
189
+ if isinstance(template, str):
190
+ # Will send string to tokenizer. Used for llm
191
+ return template.format(text)
192
+ else:
193
+ raise TypeError(f"Unsupported template type: {type(template)}")
194
+
195
+ def text2tokens(self, text, data_type='video', name='person'):
196
+ """
197
+ Tokenize the input text.
198
+
199
+ Args:
200
+ text (str or list): Input text.
201
+ """
202
+ tokenize_input_type = 'str'
203
+ if self.use_video_template:
204
+ if data_type == 'video':
205
+ prompt_template = self.prompt_template_video["template"]
206
+ else:
207
+ raise ValueError(f"Unsupported data type: {data_type}")
208
+ if isinstance(text, (list, tuple)):
209
+ text = [self.apply_text_to_template(one_text, prompt_template) for one_text in text]
210
+ if isinstance(text[0], list):
211
+ tokenize_input_type = 'list'
212
+ elif isinstance(text, str):
213
+ text = self.apply_text_to_template(text, prompt_template)
214
+ if isinstance(text, list):
215
+ tokenize_input_type = 'list'
216
+ else:
217
+ raise TypeError(f"Unsupported text type: {type(text)}")
218
+
219
+ kwargs = dict(truncation=True, max_length=self.max_length, padding="max_length", return_tensors="pt")
220
+ if self.text_encoder_type == "llava-llama-3-8b":
221
+ if isinstance(text, list):
222
+ for i in range(len(text)):
223
+ text[i] = text[i] + '\nThe %s looks like<image>' % name
224
+ elif isinstance(text, str):
225
+ text = text + '\nThe %s looks like<image>' % name
226
+ else:
227
+ raise NotImplementedError
228
+
229
+ if tokenize_input_type == 'str':
230
+ return self.tokenizer(text,
231
+ return_length=False,
232
+ return_overflowing_tokens=False,
233
+ return_attention_mask=True,
234
+ **kwargs, )
235
+ elif tokenize_input_type == 'list':
236
+ return self.tokenizer.apply_chat_template(text,
237
+ add_generation_prompt=True,
238
+ tokenize=True,
239
+ return_dict=True,
240
+ **kwargs, )
241
+ else:
242
+ raise ValueError(f"Unsupported tokenize_input_type: {tokenize_input_type}")
243
+
244
+ def encode(self, batch_encoding, use_attention_mask=None, output_hidden_states=False, do_sample=None,
245
+ hidden_state_skip_layer=None, return_texts=False, data_type='image'):
246
+ """
247
+ Args:
248
+ batch_encoding (dict): Batch encoding from tokenizer.
249
+ use_attention_mask (bool): Whether to use attention mask. If None, use self.use_attention_mask.
250
+ Defaults to None.
251
+ output_hidden_states (bool): Whether to output hidden states. If False, return the value of
252
+ self.output_key. If True, return the entire output. If set self.hidden_state_skip_layer,
253
+ output_hidden_states will be set True. Defaults to False.
254
+ do_sample (bool): Whether to sample from the model. Used for Decoder-Only LLMs. Defaults to None.
255
+ When self.produce is False, do_sample is set to True by default.
256
+ hidden_state_skip_layer (int): Number of hidden states to hidden_state_skip_layer. 0 means the last layer.
257
+ If None, self.output_key will be used. Defaults to None.
258
+ return_texts (bool): Whether to return the decoded texts. Defaults to False.
259
+ """
260
+ use_attention_mask = use_default(use_attention_mask, self.use_attention_mask)
261
+ hidden_state_skip_layer = use_default(hidden_state_skip_layer, self.hidden_state_skip_layer)
262
+ do_sample = use_default(do_sample, not self.reproduce)
263
+ if CPU_OFFLOAD:
264
+ self.model.to('cuda')
265
+ print(f'encode prompt: move text_encoder to cuda')
266
+
267
+ attention_mask = batch_encoding["attention_mask"].to(self.model.device) if use_attention_mask else None
268
+ if 'pixel_value_llava' in batch_encoding:
269
+ outputs = self.model(
270
+ input_ids=batch_encoding["input_ids"].to(self.model.device),
271
+ attention_mask=attention_mask,
272
+ pixel_values=batch_encoding["pixel_value_llava"].to(self.model.device),
273
+ output_hidden_states=output_hidden_states or hidden_state_skip_layer is not None)
274
+ else:
275
+ outputs = self.model(
276
+ input_ids=batch_encoding["input_ids"].to(self.model.device),
277
+ attention_mask=attention_mask,
278
+ output_hidden_states=output_hidden_states or hidden_state_skip_layer is not None,)
279
+ if hidden_state_skip_layer is not None:
280
+ last_hidden_state = outputs.hidden_states[-(hidden_state_skip_layer + 1)]
281
+ # Real last hidden state already has layer norm applied. So here we only apply it
282
+ # for intermediate layers.
283
+ if hidden_state_skip_layer > 0 and self.apply_final_norm:
284
+ last_hidden_state = self.model.final_layer_norm(last_hidden_state)
285
+ else:
286
+ last_hidden_state = outputs[self.output_key]
287
+
288
+ # Remove hidden states of instruction tokens, only keep prompt tokens.
289
+ if self.use_video_template:
290
+ if data_type == 'video':
291
+ crop_start = self.prompt_template_video.get("crop_start", -1)
292
+ else:
293
+ raise ValueError(f"Unsupported data type: {data_type}")
294
+ if crop_start > 0:
295
+ last_hidden_state = last_hidden_state[:, crop_start:]
296
+ attention_mask = attention_mask[:, crop_start:] if use_attention_mask else None
297
+ if CPU_OFFLOAD:
298
+ self.model.to('cpu')
299
+ torch.cuda.empty_cache()
300
+ print(f'encode prompt successful: move text_encoder to cpu')
301
+ if output_hidden_states:
302
+ return TextEncoderModelOutput(last_hidden_state, attention_mask, outputs.hidden_states)
303
+ return TextEncoderModelOutput(last_hidden_state, attention_mask)
304
+
305
+ def forward(self, text, use_attention_mask=None, output_hidden_states=False, do_sample=False,
306
+ hidden_state_skip_layer=None, return_texts=False):
307
+ batch_encoding = self.text2tokens(text)
308
+ return self.encode(batch_encoding, use_attention_mask=use_attention_mask,
309
+ output_hidden_states=output_hidden_states, do_sample=do_sample,
310
+ hidden_state_skip_layer=hidden_state_skip_layer, return_texts=return_texts)
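How the video prompt template and crop_start interact in TextEncoder (the template string and crop_start value below are illustrative placeholders, not the shipped configuration):

prompt_template_video = {"template": "Describe this video in detail: {}", "crop_start": 8}
user_text = "a small village at sunset"
full_prompt = prompt_template_video["template"].format(user_text)
print(full_prompt)
# After tokenization, encode() drops the first crop_start hidden states so only the
# user prompt tokens, not the instruction prefix, condition the model:
#   last_hidden_state = last_hidden_state[:, crop_start:]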
hymm_sp/vae/__init__.py ADDED
@@ -0,0 +1,79 @@
1
+ import torch
2
+ from pathlib import Path
3
+ from .autoencoder_kl_causal_3d import AutoencoderKLCausal3D
4
+ from ..constants import VAE_PATH, PRECISION_TO_TYPE
5
+
6
+ def load_vae(vae_type,
7
+ vae_precision=None,
8
+ sample_size=None,
9
+ vae_path=None,
10
+ logger=None,
11
+ device=None
12
+ ):
13
+ """
14
+ Load and configure a Variational Autoencoder (VAE) model.
15
+
16
+ This function handles loading 3D causal VAE models, including configuration,
17
+ weight loading, precision setting, and device placement. It ensures the model
18
+ is properly initialized for inference.
19
+
20
+ Parameters:
21
+ vae_type (str): Type identifier for the VAE, must follow '???-*' format for 3D VAEs
22
+ vae_precision (str, optional): Desired precision type (e.g., 'fp16', 'fp32').
23
+ Uses model's default if not specified.
24
+ sample_size (tuple, optional): Input sample dimensions to override config defaults
25
+ vae_path (str, optional): Path to VAE model files. Uses predefined path from
26
+ VAE_PATH constant if not specified.
27
+ logger (logging.Logger, optional): Logger instance for progress/debug messages
28
+ device (torch.device, optional): Target device to place the model (e.g., 'cuda' or 'cpu')
29
+
30
+ Returns:
31
+ tuple: Contains:
32
+ - vae (AutoencoderKLCausal3D): Loaded and configured VAE model
33
+ - vae_path (str): Actual path used to load the VAE
34
+ - spatial_compression_ratio (int): Spatial dimension compression factor
35
+ - time_compression_ratio (int): Temporal dimension compression factor
36
+
37
+ Raises:
38
+ ValueError: If vae_type does not follow the required 3D VAE format '???-*'
39
+ """
40
+ if vae_path is None:
41
+ vae_path = VAE_PATH[vae_type]
42
+ vae_compress_spec, _, _ = vae_type.split("-")
43
+ length = len(vae_compress_spec)
44
+ # Process 3D VAE (valid format with 3-character compression spec)
45
+ if length == 3:
46
+ if logger is not None:
47
+ logger.info(f"Loading 3D VAE model ({vae_type}) from: {vae_path}")
48
+ config = AutoencoderKLCausal3D.load_config(vae_path)
49
+ if sample_size:
50
+ vae = AutoencoderKLCausal3D.from_config(config, sample_size=sample_size)
51
+ else:
52
+ vae = AutoencoderKLCausal3D.from_config(config)
53
+ ckpt = torch.load(Path(vae_path) / "pytorch_model.pt", map_location=vae.device)
54
+ if "state_dict" in ckpt:
55
+ ckpt = ckpt["state_dict"]
56
+ vae_ckpt = {k.replace("vae.", ""): v for k, v in ckpt.items() if k.startswith("vae.")}
57
+ vae.load_state_dict(vae_ckpt)
58
+
59
+ spatial_compression_ratio = vae.config.spatial_compression_ratio
60
+ time_compression_ratio = vae.config.time_compression_ratio
61
+ else:
62
+ raise ValueError(f"Invalid VAE model: {vae_type}. Must be 3D VAE in the format of '???-*'.")
63
+
64
+ if vae_precision is not None:
65
+ vae = vae.to(dtype=PRECISION_TO_TYPE[vae_precision])
66
+
67
+ vae.requires_grad_(False)
68
+
69
+ if logger is not None:
70
+ logger.info(f"VAE to dtype: {vae.dtype}")
71
+
72
+ if device is not None:
73
+ vae = vae.to(device)
74
+
75
+ # Ensure model is in evaluation mode (disables dropout/batch norm training behavior)
76
+ # Note: Even with dropout rate 0, eval mode is recommended for consistent inference
77
+ vae.eval()
78
+
79
+ return vae, vae_path, spatial_compression_ratio, time_compression_ratio
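A hedged usage sketch of load_vae (the '884-16c-hy' identifier and local path are placeholders; real values come from hymm_sp.constants and the downloaded checkpoint layout):

vae, vae_path, spatial_ratio, time_ratio = load_vae(
    "884-16c-hy",
    vae_precision="fp16",
    vae_path="./ckpts/vae",
    device="cuda",
)
# For an '884' VAE (time x4, space x8), a (T, H, W) clip encodes to latents of roughly
# ((T - 1) // time_ratio + 1, H // spatial_ratio, W // spatial_ratio).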
hymm_sp/vae/autoencoder_kl_causal_3d.py ADDED
@@ -0,0 +1,781 @@
1
+ import os
2
+ import math
3
+ from typing import Dict, Optional, Tuple, Union
4
+ from dataclasses import dataclass
5
+ from torch import distributed as dist
6
+ import loguru
7
+ import torch
8
+ import torch.nn as nn
9
+ import torch.distributed
10
+
12
+
13
+ from diffusers.configuration_utils import ConfigMixin, register_to_config
14
+ try:
15
+ # This import path exists in the modified diffusers build packaged with the mirror.
16
+ from diffusers.loaders import FromOriginalVAEMixin
17
+ except ImportError:
18
+ # Use this to be compatible with the original diffusers.
19
+ from diffusers.loaders.single_file_model import FromOriginalModelMixin as FromOriginalVAEMixin
20
+ from diffusers.utils.accelerate_utils import apply_forward_hook
21
+ from diffusers.models.attention_processor import (
22
+ ADDED_KV_ATTENTION_PROCESSORS,
23
+ CROSS_ATTENTION_PROCESSORS,
24
+ Attention,
25
+ AttentionProcessor,
26
+ AttnAddedKVProcessor,
27
+ AttnProcessor,
28
+ )
29
+ from diffusers.models.modeling_outputs import AutoencoderKLOutput
30
+ from diffusers.models.modeling_utils import ModelMixin
31
+ from .vae import DecoderCausal3D, BaseOutput, DecoderOutput, DiagonalGaussianDistribution, EncoderCausal3D
32
+
33
+ import threading
34
+ from hymm_sp.modules.parallel_states import (
35
+ initialize_sequence_parallel_state,
36
+ get_sequence_parallel_state,
37
+ nccl_info,
38
+ )
39
+
40
+ def cur_rank():
41
+ return nccl_info.rank_within_group
42
+
43
+ def cur_world_size():
44
+ return nccl_info.sp_size
45
+
46
+ """
47
+ use trt need install polygraphy and onnx-graphsurgeon
48
+ python3 -m pip install --upgrade polygraphy>=0.47.0 onnx-graphsurgeon --extra-index-url https://pypi.ngc.nvidia.com
49
+ """
50
+ try:
51
+ from polygraphy.backend.trt import ( TrtRunner, EngineFromBytes)
52
+ from polygraphy.backend.common import BytesFromPath
53
+ except:
54
+ print("TrtRunner or EngineFromBytes is not available, you can not use trt engine")
55
+
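(A common way to make this optional dependency explicit further down the file is a module-level availability flag; the snippet below is only a sketch of that pattern, with a made-up flag name, and is not part of the commit.)

try:
    from polygraphy.backend.trt import TrtRunner, EngineFromBytes
    from polygraphy.backend.common import BytesFromPath
    TRT_AVAILABLE = True   # hypothetical flag, not defined anywhere in this file
except ImportError:
    TRT_AVAILABLE = False  # callers could check this before load_trt_decoder()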
56
+ @dataclass
57
+ class DecoderOutput2(BaseOutput):
58
+ sample: torch.FloatTensor
59
+ posterior: Optional[DiagonalGaussianDistribution] = None
60
+
61
+
62
+ MODEL_OUTPUT_PATH = os.environ.get('MODEL_OUTPUT_PATH')
63
+ MODEL_BASE = os.environ.get('MODEL_BASE')
64
+
65
+ CPU_OFFLOAD = int(os.environ.get("CPU_OFFLOAD", 0))
66
+ DISABLE_SP = int(os.environ.get("DISABLE_SP", 0))
67
+
68
+ class AutoencoderKLCausal3D(ModelMixin, ConfigMixin, FromOriginalVAEMixin):
69
+ r"""
70
+ A VAE model with KL loss for encoding images into latents and decoding latent representations into images.
71
+
72
+ This model inherits from [`ModelMixin`]. Check the superclass documentation for its generic methods implemented
73
+ for all models (such as downloading or saving).
74
+
75
+ Parameters:
76
+ in_channels (int, *optional*, defaults to 3): Number of channels in the input image.
77
+ out_channels (int, *optional*, defaults to 3): Number of channels in the output.
78
+ down_block_types (`Tuple[str]`, *optional*, defaults to `("DownEncoderBlock2D",)`):
79
+ Tuple of downsample block types.
80
+ up_block_types (`Tuple[str]`, *optional*, defaults to `("UpDecoderBlock2D",)`):
81
+ Tuple of upsample block types.
82
+ block_out_channels (`Tuple[int]`, *optional*, defaults to `(64,)`):
83
+ Tuple of block output channels.
84
+ act_fn (`str`, *optional*, defaults to `"silu"`): The activation function to use.
85
+ latent_channels (`int`, *optional*, defaults to 4): Number of channels in the latent space.
86
+ sample_size (`int`, *optional*, defaults to `32`): Sample input size.
87
+ scaling_factor (`float`, *optional*, defaults to 0.18215):
88
+ The component-wise standard deviation of the trained latent space computed using the first batch of the
89
+ training set. This is used to scale the latent space to have unit variance when training the diffusion
90
+ model. The latents are scaled with the formula `z = z * scaling_factor` before being passed to the
91
+ diffusion model. When decoding, the latents are scaled back to the original scale with the formula: `z = 1
92
+ / scaling_factor * z`. For more details, refer to sections 4.3.2 and D.1 of the [High-Resolution Image
93
+ Synthesis with Latent Diffusion Models](https://arxiv.org/abs/2112.10752) paper.
94
+ force_upcast (`bool`, *optional*, default to `True`):
95
+ If enabled it will force the VAE to run in float32 for high image resolution pipelines, such as SD-XL. VAE
96
+ can be fine-tuned / trained to a lower range without losing too much precision, in which case
97
+ `force_upcast` can be set to `False` - see: https://huggingface.co/madebyollin/sdxl-vae-fp16-fix
98
+ """
99
+
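A small worked example of the scaling_factor convention described in the docstring above; the tensor names are illustrative, `vae` and `posterior` are assumed to come from an instance of this class, and 0.18215 is the class default:

scaling_factor = 0.18215
z = posterior.sample() * scaling_factor        # latents scaled toward unit variance for the diffusion model
x_rec = vae.decode(z / scaling_factor).sample  # undo the scaling before decoding back to pixels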
100
+ _supports_gradient_checkpointing = True
101
+
102
+ @register_to_config
103
+ def __init__(
104
+ self,
105
+ in_channels: int = 3,
106
+ out_channels: int = 3,
107
+ down_block_types: Tuple[str] = ("DownEncoderBlockCausal3D",),
108
+ up_block_types: Tuple[str] = ("UpDecoderBlockCausal3D",),
109
+ block_out_channels: Tuple[int] = (64,),
110
+ layers_per_block: int = 1,
111
+ act_fn: str = "silu",
112
+ latent_channels: int = 4,
113
+ norm_num_groups: int = 32,
114
+ sample_size: int = 32,
115
+ sample_tsize: int = 64,
116
+ scaling_factor: float = 0.18215,
117
+ force_upcast: float = True,
118
+ spatial_compression_ratio: int = 8,
119
+ time_compression_ratio: int = 4,
120
+ disable_causal_conv: bool = False,
121
+ mid_block_add_attention: bool = True,
122
+ mid_block_causal_attn: bool = False,
123
+ use_trt_engine: bool = False,
124
+ nccl_gather: bool = True,
125
+ engine_path: str = f"{MODEL_BASE}/HYVAE_decoder+conv_256x256xT_fp16_H20.engine",
126
+ ):
127
+ super().__init__()
128
+
129
+ self.disable_causal_conv = disable_causal_conv
130
+ self.time_compression_ratio = time_compression_ratio
131
+
132
+ self.encoder = EncoderCausal3D(
133
+ in_channels=in_channels,
134
+ out_channels=latent_channels,
135
+ down_block_types=down_block_types,
136
+ block_out_channels=block_out_channels,
137
+ layers_per_block=layers_per_block,
138
+ act_fn=act_fn,
139
+ norm_num_groups=norm_num_groups,
140
+ double_z=True,
141
+ time_compression_ratio=time_compression_ratio,
142
+ spatial_compression_ratio=spatial_compression_ratio,
143
+ disable_causal=disable_causal_conv,
144
+ mid_block_add_attention=mid_block_add_attention,
145
+ mid_block_causal_attn=mid_block_causal_attn,
146
+ )
147
+
148
+ self.decoder = DecoderCausal3D(
149
+ in_channels=latent_channels,
150
+ out_channels=out_channels,
151
+ up_block_types=up_block_types,
152
+ block_out_channels=block_out_channels,
153
+ layers_per_block=layers_per_block,
154
+ norm_num_groups=norm_num_groups,
155
+ act_fn=act_fn,
156
+ time_compression_ratio=time_compression_ratio,
157
+ spatial_compression_ratio=spatial_compression_ratio,
158
+ disable_causal=disable_causal_conv,
159
+ mid_block_add_attention=mid_block_add_attention,
160
+ mid_block_causal_attn=mid_block_causal_attn,
161
+ )
162
+
163
+ self.quant_conv = nn.Conv3d(2 * latent_channels, 2 * latent_channels, kernel_size=1)
164
+ self.post_quant_conv = nn.Conv3d(latent_channels, latent_channels, kernel_size=1)
165
+
166
+ self.use_slicing = False
167
+ self.use_spatial_tiling = False
168
+ self.use_temporal_tiling = False
169
+
170
+
171
+ # only relevant if vae tiling is enabled
172
+ self.tile_sample_min_tsize = sample_tsize
173
+ self.tile_latent_min_tsize = sample_tsize // time_compression_ratio
174
+
175
+ self.tile_sample_min_size = self.config.sample_size
176
+ sample_size = (
177
+ self.config.sample_size[0]
178
+ if isinstance(self.config.sample_size, (list, tuple))
179
+ else self.config.sample_size
180
+ )
181
+ self.tile_latent_min_size = int(sample_size / (2 ** (len(self.config.block_out_channels) - 1)))
182
+ self.tile_overlap_factor = 0.25
183
+
184
+ # ============= parallelism related code ===================
185
+ world_size = cur_world_size()
186
+ self.parallel_decode = False if CPU_OFFLOAD else get_sequence_parallel_state()
187
+ print("WORLD SIZE: ", world_size)
188
+
189
+
190
+ def _set_gradient_checkpointing(self, module, value=False):
191
+ if isinstance(module, (EncoderCausal3D, DecoderCausal3D)):
192
+ module.gradient_checkpointing = value
193
+
194
+ def enable_temporal_tiling(self, use_tiling: bool = True):
195
+ self.use_temporal_tiling = use_tiling
196
+
197
+ def disable_temporal_tiling(self):
198
+ self.enable_temporal_tiling(False)
199
+
200
+ def enable_spatial_tiling(self, use_tiling: bool = True):
201
+ self.use_spatial_tiling = use_tiling
202
+
203
+ def disable_spatial_tiling(self):
204
+ self.enable_spatial_tiling(False)
205
+
206
+ def enable_tiling(self, use_tiling: bool = True):
207
+ r"""
208
+ Enable tiled VAE decoding. When this option is enabled, the VAE will split the input tensor into tiles to
209
+ compute decoding and encoding in several steps. This is useful for saving a large amount of memory and to allow
210
+ processing larger images.
211
+ """
212
+ self.enable_spatial_tiling(use_tiling)
213
+ self.enable_temporal_tiling(use_tiling)
214
+
215
+ def disable_tiling(self):
216
+ r"""
217
+ Disable tiled VAE decoding. If `enable_tiling` was previously enabled, this method will go back to computing
218
+ decoding in one step.
219
+ """
220
+ self.disable_spatial_tiling()
221
+ self.disable_temporal_tiling()
222
+
223
+ def enable_slicing(self):
224
+ r"""
225
+ Enable sliced VAE decoding. When this option is enabled, the VAE will split the input tensor in slices to
226
+ compute decoding in several steps. This is useful to save some memory and allow larger batch sizes.
227
+ """
228
+ self.use_slicing = True
229
+
230
+ def disable_slicing(self):
231
+ r"""
232
+ Disable sliced VAE decoding. If `enable_slicing` was previously enabled, this method will go back to computing
233
+ decoding in one step.
234
+ """
235
+ self.use_slicing = False
236
+
237
+
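A minimal usage sketch for the tiling and slicing switches defined above, assuming `vae` is an instance of this class and `latents` is a batched latent tensor; names and shapes are illustrative:

import torch

vae.enable_tiling()     # turns on both spatial and temporal tiling
vae.enable_slicing()    # decode batch elements one at a time
with torch.no_grad():
    video = vae.decode(latents).sample
vae.disable_tiling()
vae.disable_slicing()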
238
+ def load_trt_decoder(self):
239
+ self.use_trt_decoder = True
240
+ self.engine = EngineFromBytes(BytesFromPath(self.engine_path))
241
+
242
+ self.trt_decoder_runner = TrtRunner(self.engine)
243
+ self.activate_trt_decoder()
244
+
245
+ def disable_trt_decoder(self):
246
+ self.use_trt_decoder = False
247
+ del self.engine
248
+
249
+ def activate_trt_decoder(self):
250
+ self.trt_decoder_runner.activate()
251
+
252
+ def deactivate_trt_decoder(self):
253
+ self.trt_decoder_runner.deactivate()
254
+
255
+ @property
256
+ # Copied from diffusers.models.unet_2d_condition.UNet2DConditionModel.attn_processors
257
+ def attn_processors(self) -> Dict[str, AttentionProcessor]:
258
+ r"""
259
+ Returns:
260
+ `dict` of attention processors: A dictionary containing all attention processors used in the model with
261
+ indexed by its weight name.
262
+ """
263
+ # set recursively
264
+ processors = {}
265
+
266
+ def fn_recursive_add_processors(name: str, module: torch.nn.Module, processors: Dict[str, AttentionProcessor]):
267
+ if hasattr(module, "get_processor"):
268
+ processors[f"{name}.processor"] = module.get_processor(return_deprecated_lora=True)
269
+
270
+ for sub_name, child in module.named_children():
271
+ fn_recursive_add_processors(f"{name}.{sub_name}", child, processors)
272
+
273
+ return processors
274
+
275
+ for name, module in self.named_children():
276
+ fn_recursive_add_processors(name, module, processors)
277
+
278
+ return processors
279
+
280
+ # Copied from diffusers.models.unet_2d_condition.UNet2DConditionModel.set_attn_processor
281
+ def set_attn_processor(
282
+ self, processor: Union[AttentionProcessor, Dict[str, AttentionProcessor]], _remove_lora=False
283
+ ):
284
+ r"""
285
+ Sets the attention processor to use to compute attention.
286
+
287
+ Parameters:
288
+ processor (`dict` of `AttentionProcessor` or only `AttentionProcessor`):
289
+ The instantiated processor class or a dictionary of processor classes that will be set as the processor
290
+ for **all** `Attention` layers.
291
+
292
+ If `processor` is a dict, the key needs to define the path to the corresponding cross attention
293
+ processor. This is strongly recommended when setting trainable attention processors.
294
+
295
+ """
296
+ count = len(self.attn_processors.keys())
297
+
298
+ if isinstance(processor, dict) and len(processor) != count:
299
+ raise ValueError(
300
+ f"A dict of processors was passed, but the number of processors {len(processor)} does not match the"
301
+ f" number of attention layers: {count}. Please make sure to pass {count} processor classes."
302
+ )
303
+
304
+ def fn_recursive_attn_processor(name: str, module: torch.nn.Module, processor):
305
+ if hasattr(module, "set_processor"):
306
+ if not isinstance(processor, dict):
307
+ module.set_processor(processor, _remove_lora=_remove_lora)
308
+ else:
309
+ module.set_processor(processor.pop(f"{name}.processor"), _remove_lora=_remove_lora)
310
+
311
+ for sub_name, child in module.named_children():
312
+ fn_recursive_attn_processor(f"{name}.{sub_name}", child, processor)
313
+
314
+ for name, module in self.named_children():
315
+ fn_recursive_attn_processor(name, module, processor)
316
+
317
+ # Copied from diffusers.models.unet_2d_condition.UNet2DConditionModel.set_default_attn_processor
318
+ def set_default_attn_processor(self):
319
+ """
320
+ Disables custom attention processors and sets the default attention implementation.
321
+ """
322
+ if all(proc.__class__ in ADDED_KV_ATTENTION_PROCESSORS for proc in self.attn_processors.values()):
323
+ processor = AttnAddedKVProcessor()
324
+ elif all(proc.__class__ in CROSS_ATTENTION_PROCESSORS for proc in self.attn_processors.values()):
325
+ processor = AttnProcessor()
326
+ else:
327
+ raise ValueError(
328
+ f"Cannot call `set_default_attn_processor` \
329
+ when attention processors are of type {next(iter(self.attn_processors.values()))}"
330
+ )
331
+
332
+ self.set_attn_processor(processor, _remove_lora=True)
333
+
334
+ @apply_forward_hook
335
+ def encode(
336
+ self, x: torch.FloatTensor, return_dict: bool = True
337
+ ) -> Union[AutoencoderKLOutput, Tuple[DiagonalGaussianDistribution]]:
338
+ """
339
+ Encode a batch of images into latents.
340
+
341
+ Args:
342
+ x (`torch.FloatTensor`): Input batch of images.
343
+ return_dict (`bool`, *optional*, defaults to `True`):
344
+ Whether to return a [`~models.autoencoder_kl.AutoencoderKLOutput`] instead of a plain tuple.
345
+
346
+ Returns:
347
+ The latent representations of the encoded images. If `return_dict` is True, a
348
+ [`~models.autoencoder_kl.AutoencoderKLOutput`] is returned, otherwise a plain `tuple` is returned.
349
+ """
350
+ assert len(x.shape) == 5, "The input tensor should have 5 dimensions"
351
+
352
+ if self.use_temporal_tiling and x.shape[2] > self.tile_sample_min_tsize:
353
+ return self.temporal_tiled_encode(x, return_dict=return_dict)
354
+
355
+ if self.use_spatial_tiling and \
356
+ (x.shape[-1] > self.tile_sample_min_size or x.shape[-2] > self.tile_sample_min_size):
357
+ return self.spatial_tiled_encode(x, return_dict=return_dict)
358
+
359
+ if self.use_slicing and x.shape[0] > 1:
360
+ encoded_slices = [self.encoder(x_slice) for x_slice in x.split(1)]
361
+ h = torch.cat(encoded_slices)
362
+ else:
363
+ h = self.encoder(x)
364
+
365
+ moments = self.quant_conv(h)
366
+ posterior = DiagonalGaussianDistribution(moments)
367
+
368
+ if not return_dict:
369
+ return (posterior,)
370
+
371
+ return AutoencoderKLOutput(latent_dist=posterior)
372
+
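For illustration, encoding a short clip through this method, assuming the 5D (B, C, T, H, W) layout the assert above requires; the shapes are arbitrary and the latent size is only approximate:

import torch

x = torch.randn(1, 3, 17, 256, 256)      # B, C, T, H, W
posterior = vae.encode(x).latent_dist    # DiagonalGaussianDistribution
z = posterior.sample()                   # or posterior.mode() for a deterministic latent
# with time compression 4 and spatial compression 8, z is roughly (1, latent_channels, 5, 32, 32)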
373
+ def _decode(self, z: torch.FloatTensor, return_dict: bool = True) -> Union[DecoderOutput, torch.FloatTensor]:
374
+ assert len(z.shape) == 5, "The input tensor should have 5 dimensions"
375
+
376
+ if self.use_temporal_tiling and z.shape[2] > self.tile_latent_min_tsize:
377
+ return self.temporal_tiled_decode(z, return_dict=return_dict)
378
+
379
+ if self.use_spatial_tiling and (z.shape[-1] > self.tile_latent_min_size or \
380
+ z.shape[-2] > self.tile_latent_min_size):
381
+ return self.spatial_tiled_decode(z, return_dict=return_dict)
382
+
383
+ if self.use_trt_decoder:
384
+ # For unknown reason, `copy_outputs_to_host` must be set to True
385
+ dec = self.trt_decoder_runner.infer({"input": z.to(RECOMMENDED_DTYPE).contiguous()}, \
386
+ copy_outputs_to_host=True)["output"].to(device=z.device, dtype=z.dtype)
387
+ else:
388
+ z = self.post_quant_conv(z)
389
+ dec = self.decoder(z)
390
+
391
+ if not return_dict:
392
+ return (dec,)
393
+
394
+ return DecoderOutput(sample=dec)
395
+
396
+ @apply_forward_hook
397
+ def decode(
398
+ self, z: torch.FloatTensor, return_dict: bool = True, generator=None
399
+ ) -> Union[DecoderOutput, torch.FloatTensor]:
400
+ """
401
+ Decode a batch of images.
402
+
403
+ Args:
404
+ z (`torch.FloatTensor`): Input batch of latent vectors.
405
+ return_dict (`bool`, *optional*, defaults to `True`):
406
+ Whether to return a [`~models.vae.DecoderOutput`] instead of a plain tuple.
407
+
408
+ Returns:
409
+ [`~models.vae.DecoderOutput`] or `tuple`:
410
+ If return_dict is True, a [`~models.vae.DecoderOutput`] is returned, otherwise a plain `tuple` is
411
+ returned.
412
+
413
+ """
414
+
415
+ if self.use_slicing and z.shape[0] > 1:
416
+ decoded_slices = [self._decode(z_slice).sample for z_slice in z.split(1)]
417
+ decoded = torch.cat(decoded_slices)
418
+ else:
419
+ decoded = self._decode(z).sample
420
+
421
+ if not return_dict:
422
+ return (decoded,)
423
+
424
+ return DecoderOutput(sample=decoded)
425
+
426
+ def blend_v(self, a: torch.Tensor, b: torch.Tensor, blend_extent: int) -> torch.Tensor:
427
+ blend_extent = min(a.shape[-2], b.shape[-2], blend_extent)
428
+ if blend_extent == 0:
429
+ return b
430
+
431
+ a_region = a[..., -blend_extent:, :]
432
+ b_region = b[..., :blend_extent, :]
433
+
434
+ weights = torch.arange(blend_extent, device=a.device, dtype=a.dtype) / blend_extent
435
+ weights = weights.view(1, 1, 1, blend_extent, 1)
436
+
437
+ blended = a_region * (1 - weights) + b_region * weights
438
+
439
+ b[..., :blend_extent, :] = blended
440
+ return b
441
+
442
+ def blend_h(self, a: torch.Tensor, b: torch.Tensor, blend_extent: int) -> torch.Tensor:
443
+ blend_extent = min(a.shape[-1], b.shape[-1], blend_extent)
444
+ if blend_extent == 0:
445
+ return b
446
+
447
+ a_region = a[..., -blend_extent:]
448
+ b_region = b[..., :blend_extent]
449
+
450
+ weights = torch.arange(blend_extent, device=a.device, dtype=a.dtype) / blend_extent
451
+ weights = weights.view(1, 1, 1, 1, blend_extent)
452
+
453
+ blended = a_region * (1 - weights) + b_region * weights
454
+
455
+ b[..., :blend_extent] = blended
456
+ return b
457
+ def blend_t(self, a: torch.Tensor, b: torch.Tensor, blend_extent: int) -> torch.Tensor:
458
+ blend_extent = min(a.shape[-3], b.shape[-3], blend_extent)
459
+ if blend_extent == 0:
460
+ return b
461
+
462
+ a_region = a[..., -blend_extent:, :, :]
463
+ b_region = b[..., :blend_extent, :, :]
464
+
465
+ weights = torch.arange(blend_extent, device=a.device, dtype=a.dtype) / blend_extent
466
+ weights = weights.view(1, 1, blend_extent, 1, 1)
467
+
468
+ blended = a_region * (1 - weights) + b_region * weights
469
+
470
+ b[..., :blend_extent, :, :] = blended
471
+ return b
472
+
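The three blend helpers above apply the same linear cross-fade over the overlap region, just along different axes (height, width, time). A tiny numeric sketch of the weighting, not taken from the commit:

import torch

blend_extent = 4
w = torch.arange(blend_extent) / blend_extent   # tensor([0.00, 0.25, 0.50, 0.75])
# blended overlap = a * (1 - w) + b * w, so the seam fades from tile `a` into tile `b`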
473
+ def spatial_tiled_encode(self,
474
+ x: torch.FloatTensor,
475
+ return_dict: bool = True,
476
+ return_moments: bool = False) -> AutoencoderKLOutput:
477
+ r"""Encode a batch of images using a tiled encoder.
478
+
479
+ When this option is enabled, the VAE will split the input tensor into tiles to compute encoding in several
480
+ steps. This is useful to keep memory use constant regardless of image size. The end result of tiled encoding is
481
+ different from non-tiled encoding because each tile uses a different encoder. To avoid tiling artifacts, the
482
+ tiles overlap and are blended together to form a smooth output. You may still see tile-sized changes in the
483
+ output, but they should be much less noticeable.
484
+
485
+ Args:
486
+ x (`torch.FloatTensor`): Input batch of images.
487
+ return_dict (`bool`, *optional*, defaults to `True`):
488
+ Whether or not to return a [`~models.autoencoder_kl.AutoencoderKLOutput`] instead of a plain tuple.
489
+
490
+ Returns:
491
+ [`~models.autoencoder_kl.AutoencoderKLOutput`] or `tuple`:
492
+ If return_dict is True, a [`~models.autoencoder_kl.AutoencoderKLOutput`] is returned, otherwise a plain
493
+ `tuple` is returned.
494
+ """
495
+ overlap_size = int(self.tile_sample_min_size * (1 - self.tile_overlap_factor))
496
+ blend_extent = int(self.tile_latent_min_size * self.tile_overlap_factor)
497
+ row_limit = self.tile_latent_min_size - blend_extent
498
+
499
+ # Split video into tiles and encode them separately.
500
+ rows = []
501
+ for i in range(0, x.shape[-2], overlap_size):
502
+ row = []
503
+ for j in range(0, x.shape[-1], overlap_size):
504
+ tile = x[:, :, :, i : i + self.tile_sample_min_size, j : j + self.tile_sample_min_size]
505
+ tile = self.encoder(tile)
506
+ tile = self.quant_conv(tile)
507
+ row.append(tile)
508
+ rows.append(row)
509
+ result_rows = []
510
+ for i, row in enumerate(rows):
511
+ result_row = []
512
+ for j, tile in enumerate(row):
513
+ # blend the above tile and the left tile
514
+ # to the current tile and add the current tile to the result row
515
+ if i > 0:
516
+ tile = self.blend_v(rows[i - 1][j], tile, blend_extent)
517
+ if j > 0:
518
+ tile = self.blend_h(row[j - 1], tile, blend_extent)
519
+ result_row.append(tile[:, :, :, :row_limit, :row_limit])
520
+ result_rows.append(torch.cat(result_row, dim=-1))
521
+
522
+ moments = torch.cat(result_rows, dim=-2)
523
+ if return_moments:
524
+ return moments
525
+
526
+ posterior = DiagonalGaussianDistribution(moments)
527
+ if not return_dict:
528
+ return (posterior,)
529
+
530
+ return AutoencoderKLOutput(latent_dist=posterior)
531
+
532
+
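To make the tile geometry above concrete, here is the arithmetic under the assumption of sample_size 256, four block_out_channels (a spatial factor of 8) and the default tile_overlap_factor of 0.25; the numbers are derived from the formulas in the code, not hard-coded anywhere:

tile_sample_min_size = 256
tile_latent_min_size = 256 // 8            # 32
overlap_size = int(256 * (1 - 0.25))       # 192-pixel stride between input tiles
blend_extent = int(32 * 0.25)              # 8 latent rows/cols cross-faded at each seam
row_limit = 32 - 8                         # 24 latent rows/cols kept per tile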
533
+ def spatial_tiled_decode(self,
534
+ z: torch.FloatTensor,
535
+ return_dict: bool = True) -> Union[DecoderOutput, torch.FloatTensor]:
536
+ r"""
537
+ Decode a batch of images using a tiled decoder.
538
+
539
+ Args:
540
+ z (`torch.FloatTensor`): Input batch of latent vectors.
541
+ return_dict (`bool`, *optional*, defaults to `True`):
542
+ Whether or not to return a [`~models.vae.DecoderOutput`] instead of a plain tuple.
543
+
544
+ Returns:
545
+ [`~models.vae.DecoderOutput`] or `tuple`:
546
+ If return_dict is True, a [`~models.vae.DecoderOutput`] is returned, otherwise a plain `tuple` is
547
+ returned.
548
+ """
549
+ overlap_size = int(self.tile_latent_min_size * (1 - self.tile_overlap_factor))
550
+ blend_extent = int(self.tile_sample_min_size * self.tile_overlap_factor)
551
+ row_limit = self.tile_sample_min_size - blend_extent
552
+
553
+ # Split z into overlapping tiles and decode them separately.
554
+ # The tiles have an overlap to avoid seams between tiles.
555
+ rank = cur_rank()
556
+ rows = []
557
+ if self.parallel_decode and rank == 0:
558
+ rank = cur_rank()
559
+ #torch.cuda.set_device(rank) # set device for trt_runner
560
+ world_size = cur_world_size()
561
+
562
+
563
+ cur_device_id = 0
564
+ device_tasks = []
565
+ for i in range(world_size):
566
+ device_tasks.append([])
567
+ for i in range(0, z.shape[-2], overlap_size):
568
+ row = []
569
+ for j in range(0, z.shape[-1], overlap_size):
570
+ tile = z[:, :, :, i : i + self.tile_latent_min_size, j : j + self.tile_latent_min_size]
571
+ row.append(None)
572
+ device_tasks[cur_device_id].append((i // overlap_size, \
573
+ j // overlap_size, \
574
+ tile.to("cuda:" + str(cur_device_id))))
575
+ #device_tasks[cur_device_id].append((i // overlap_size, j // overlap_size, tile))
576
+ cur_device_id = (cur_device_id + 1) % world_size
577
+ rows.append(row)
578
+
579
+ def thread_run(decoder, device_id, inputs, outputs):
580
+ for input in inputs:
581
+ cur_vae = self.device_vaes[device_id]
582
+ ret = cur_vae.decoder(cur_vae.post_quant_conv(input[2]))
583
+ outputs[input[0]][input[1]] = ret
584
+ return
585
+
586
+ threads = []
587
+ for i in range(world_size):
588
+ cur_thread = threading.Thread(target=thread_run,
589
+ args=(self, i, device_tasks[i], rows),
590
+ name="DecoderThread-" + str(i))
591
+ threads.append(cur_thread)
592
+ cur_thread.start()
593
+
594
+ for cur_thread in threads:
595
+ cur_thread.join()
596
+
597
+ for i in range(len(rows)):
598
+ for j in range(len(rows[i])):
599
+ rows[i][j] = rows[i][j].to("cuda:0")
600
+
601
+ else:
602
+ for i in range(0, z.shape[-2], overlap_size):
603
+ row = []
604
+ for j in range(0, z.shape[-1], overlap_size):
605
+ tile = z[:, :, :, i: i + self.tile_latent_min_size, j: j + self.tile_latent_min_size]
606
+ tile = self.post_quant_conv(tile)
607
+ decoded = self.decoder(tile)
608
+ row.append(decoded)
609
+ rows.append(row)
610
+
611
+ result_rows = []
612
+ for i, row in enumerate(rows):
613
+ result_row = []
614
+ for j, tile in enumerate(row):
615
+ # blend the above tile and the left tile
616
+ # to the current tile and add the current tile to the result row
617
+ if i > 0:
618
+ tile = self.blend_v(rows[i - 1][j], tile, blend_extent)
619
+ if j > 0:
620
+ tile = self.blend_h(row[j - 1], tile, blend_extent)
621
+ result_row.append(tile[:, :, :, :row_limit, :row_limit])
622
+ result_rows.append(torch.cat(result_row, dim=-1))
623
+
624
+ if self.parallel_decode and rank != 0:
625
+ if not return_dict:
626
+ return (None,)
627
+ return DecoderOutput(sample=None)
628
+
629
+ dec = torch.cat(result_rows, dim=-2)
630
+ if not return_dict:
631
+ return (dec,)
632
+
633
+ return DecoderOutput(sample=dec)
634
+
635
+ def temporal_tiled_encode(self, x: torch.FloatTensor, return_dict: bool = True) -> AutoencoderKLOutput:
636
+ assert not self.disable_causal_conv, "Temporal tiling is only compatible with causal convolutions."
637
+
638
+ B, C, T, H, W = x.shape
639
+ overlap_size = int(self.tile_sample_min_tsize * (1 - self.tile_overlap_factor))
640
+ blend_extent = int(self.tile_latent_min_tsize * self.tile_overlap_factor)
641
+ t_limit = self.tile_latent_min_tsize - blend_extent
642
+
643
+ # Split the video into tiles and encode them separately.
644
+ row = []
645
+ for i in range(0, T, overlap_size):
646
+ tile = x[:, :, i : i + self.tile_sample_min_tsize + 1, :, :]
647
+ if self.use_spatial_tiling and \
648
+ (tile.shape[-1] > self.tile_sample_min_size or tile.shape[-2] > self.tile_sample_min_size):
649
+ tile = self.spatial_tiled_encode(tile, return_moments=True)
650
+ else:
651
+ tile = self.encoder(tile)
652
+ tile = self.quant_conv(tile)
653
+ if i > 0:
654
+ tile = tile[:, :, 1:, :, :]
655
+ row.append(tile)
656
+ result_row = []
657
+ for i, tile in enumerate(row):
658
+ if i > 0:
659
+ tile = self.blend_t(row[i - 1], tile, blend_extent)
660
+ result_row.append(tile[:, :, :t_limit, :, :])
661
+ else:
662
+ result_row.append(tile[:, :, :t_limit+1, :, :])
663
+
664
+ moments = torch.cat(result_row, dim=2)
665
+ posterior = DiagonalGaussianDistribution(moments)
666
+
667
+ if not return_dict:
668
+ return (posterior,)
669
+
670
+ return AutoencoderKLOutput(latent_dist=posterior)
671
+
672
+ def temporal_tiled_decode(self,
673
+ z: torch.FloatTensor,
674
+ return_dict: bool = True) -> Union[DecoderOutput, torch.FloatTensor]:
675
+ # Split z into overlapping tiles and decode them separately.
676
+ assert not self.disable_causal_conv, "Temporal tiling is only supported with causal convolutions."
677
+
678
+ B, C, T, H, W = z.shape
679
+ overlap_size = int(self.tile_latent_min_tsize * (1 - self.tile_overlap_factor))
680
+ blend_extent = int(self.tile_sample_min_tsize * self.tile_overlap_factor)
681
+ t_limit = self.tile_sample_min_tsize - blend_extent
682
+ rank = 0 if CPU_OFFLOAD or DISABLE_SP else cur_rank()
683
+ row = []
684
+ for i in range(0, T, overlap_size):
685
+ tile = z[:, :, i : i + self.tile_latent_min_tsize + 1, :, :]
686
+ if self.use_spatial_tiling and \
687
+ (tile.shape[-1] > self.tile_latent_min_size or tile.shape[-2] > self.tile_latent_min_size):
688
+ decoded = self.spatial_tiled_decode(tile, return_dict=True).sample
689
+ else:
690
+ tile = self.post_quant_conv(tile)
691
+ decoded = self.decoder(tile)
692
+ if i > 0 and (not self.parallel_decode or rank == 0):
693
+ decoded = decoded[:, :, 1:, :, :]
694
+ row.append(decoded)
695
+ if not CPU_OFFLOAD and not DISABLE_SP and self.parallel_decode and rank != 0:
696
+ return DecoderOutput(sample=None)
697
+ result_row = []
698
+ for i, tile in enumerate(row):
699
+ if i > 0:
700
+ tile = self.blend_t(row[i - 1], tile, blend_extent)
701
+ result_row.append(tile[:, :, :t_limit, :, :])
702
+ else:
703
+ result_row.append(tile[:, :, :t_limit+1, :, :])
704
+
705
+ dec = torch.cat(result_row, dim=2)
706
+ if not return_dict:
707
+ return (dec,)
708
+
709
+ return DecoderOutput(sample=dec)
710
+
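For the temporal constants above, assuming the defaults sample_tsize=64, time_compression_ratio=4 and tile_overlap_factor=0.25, the decode loop works out as follows (derived from the formulas in the code):

tile_latent_min_tsize = 64 // 4      # 16 latent frames per tile (plus one leading causal frame)
overlap_size = int(16 * (1 - 0.25))  # stride of 12 latent frames between tiles
blend_extent = int(64 * 0.25)        # 16 decoded frames cross-faded at each seam
t_limit = 64 - 16                    # 48 decoded frames kept per tile (49 for the first tile)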
711
+ def forward(
712
+ self,
713
+ sample: torch.FloatTensor,
714
+ sample_posterior: bool = False,
715
+ return_dict: bool = True,
716
+ return_posterior: bool = False,
717
+ generator: Optional[torch.Generator] = None,
718
+ ) -> Union[DecoderOutput2, torch.FloatTensor]:
719
+ r"""
720
+ Args:
721
+ sample (`torch.FloatTensor`): Input sample.
722
+ sample_posterior (`bool`, *optional*, defaults to `False`):
723
+ Whether to sample from the posterior.
724
+ return_dict (`bool`, *optional*, defaults to `True`):
725
+ Whether or not to return a [`DecoderOutput`] instead of a plain tuple.
726
+ """
727
+ x = sample
728
+ posterior = self.encode(x).latent_dist
729
+ if sample_posterior:
730
+ z = posterior.sample(generator=generator)
731
+ else:
732
+ z = posterior.mode()
733
+ dec = self.decode(z).sample
734
+
735
+ if not return_dict:
736
+ if return_posterior:
737
+ return (dec, posterior)
738
+ else:
739
+ return (dec,)
740
+ if return_posterior:
741
+ return DecoderOutput2(sample=dec, posterior=posterior)
742
+ else:
743
+ return DecoderOutput2(sample=dec)
744
+
745
+ # Copied from diffusers.models.unet_2d_condition.UNet2DConditionModel.fuse_qkv_projections
746
+ def fuse_qkv_projections(self):
747
+ """
748
+ Enables fused QKV projections. For self-attention modules, all projection matrices (i.e., query,
749
+ key, value) are fused. For cross-attention modules, key and value projection matrices are fused.
750
+
751
+ <Tip warning={true}>
752
+
753
+ This API is 🧪 experimental.
754
+
755
+ </Tip>
756
+ """
757
+ self.original_attn_processors = None
758
+
759
+ for _, attn_processor in self.attn_processors.items():
760
+ if "Added" in str(attn_processor.__class__.__name__):
761
+ raise ValueError("`fuse_qkv_projections()` is not supported for models having added KV projections.")
762
+
763
+ self.original_attn_processors = self.attn_processors
764
+
765
+ for module in self.modules():
766
+ if isinstance(module, Attention):
767
+ module.fuse_projections(fuse=True)
768
+
769
+ # Copied from diffusers.models.unet_2d_condition.UNet2DConditionModel.unfuse_qkv_projections
770
+ def unfuse_qkv_projections(self):
771
+ """Disables the fused QKV projection if enabled.
772
+
773
+ <Tip warning={true}>
774
+
775
+ This API is 🧪 experimental.
776
+
777
+ </Tip>
778
+
779
+ """
780
+ if self.original_attn_processors is not None:
781
+ self.set_attn_processor(self.original_attn_processors)
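End to end, the forward pass above is encode → (sample or mode) → decode. A short usage sketch, assuming `vae` is an instance of this class on the right device and dtype and `video` is a (B, C, T, H, W) tensor:

import torch

with torch.no_grad():
    out = vae(sample=video, sample_posterior=True, return_posterior=True)
reconstruction = out.sample    # same (B, C, T, H, W) layout as the input
posterior = out.posterior      # DiagonalGaussianDistribution, e.g. for a KL term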
hymm_sp/vae/unet_causal_3d_blocks.py ADDED
@@ -0,0 +1,900 @@
1
+ # Copyright 2023 The HuggingFace Team. All rights reserved.
2
+ #
3
+ # Licensed under the Apache License, Version 2.0 (the "License");
4
+ # you may not use this file except in compliance with the License.
5
+ # You may obtain a copy of the License at
6
+ #
7
+ # http://www.apache.org/licenses/LICENSE-2.0
8
+ #
9
+ # Unless required by applicable law or agreed to in writing, software
10
+ # distributed under the License is distributed on an "AS IS" BASIS,
11
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12
+ # See the License for the specific language governing permissions and
13
+ # limitations under the License.
14
+ from typing import Any, Dict, Optional, Tuple, Union
15
+
16
+ import numpy as np
17
+ import torch
18
+ import torch.nn.functional as F
19
+ from torch import nn
20
+ from einops import rearrange
21
+
22
+ from diffusers.utils import is_torch_version, logging
23
+ from diffusers.models.activations import get_activation
24
+ from diffusers.models.attention_processor import SpatialNorm
25
+ from diffusers.models.attention_processor import Attention
26
+ from diffusers.models.normalization import AdaGroupNorm
27
+ from diffusers.models.normalization import RMSNorm
28
+
29
+
30
+ logger = logging.get_logger(__name__) # pylint: disable=invalid-name
31
+
32
+
33
+ def prepare_causal_attention_mask(n_frame: int, n_hw: int, dtype, device, batch_size: int = None):
34
+ seq_len = n_frame * n_hw
35
+ mask = torch.full((seq_len, seq_len), float("-inf"), dtype=dtype, device=device)
36
+ for i in range(seq_len):
37
+ i_frame = i // n_hw
38
+ mask[i, : (i_frame + 1) * n_hw] = 0
39
+ if batch_size is not None:
40
+ mask = mask.unsqueeze(0).expand(batch_size, -1, -1)
41
+ return mask
42
+
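A tiny worked example of the mask built above, for n_frame=2 and n_hw=2 (four tokens in total): tokens of frame 0 can only attend to frame 0, while frame-1 tokens see both frames; -inf marks blocked positions. The import path is assumed from the file layout:

import torch
from hymm_sp.vae.unet_causal_3d_blocks import prepare_causal_attention_mask

mask = prepare_causal_attention_mask(n_frame=2, n_hw=2, dtype=torch.float32, device="cpu")
# tensor([[0., 0., -inf, -inf],
#         [0., 0., -inf, -inf],
#         [0., 0.,   0.,   0.],
#         [0., 0.,   0.,   0.]])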
43
+
44
+ class CausalConv3d(nn.Module):
45
+ def __init__(
46
+ self,
47
+ chan_in,
48
+ chan_out,
49
+ kernel_size: Union[int, Tuple[int, int, int]],
50
+ stride: Union[int, Tuple[int, int, int]] = 1,
51
+ dilation: Union[int, Tuple[int, int, int]] = 1,
52
+ pad_mode = 'replicate',
53
+ disable_causal=False,
54
+ **kwargs
55
+ ):
56
+ super().__init__()
57
+
58
+ self.pad_mode = pad_mode
59
+ if disable_causal:
60
+ padding = (kernel_size // 2, kernel_size // 2, kernel_size // 2,
61
+ kernel_size // 2, kernel_size // 2, kernel_size // 2)
62
+ else:
63
+ padding = (kernel_size // 2, kernel_size // 2, kernel_size // 2,
64
+ kernel_size // 2, kernel_size - 1, 0) # W, H, T
65
+ self.time_causal_padding = padding
66
+
67
+ self.conv = nn.Conv3d(chan_in, chan_out, kernel_size, stride = stride, dilation = dilation, **kwargs)
68
+
69
+ def forward(self, x):
70
+ x = F.pad(x, self.time_causal_padding, mode=self.pad_mode)
71
+ return self.conv(x)
72
+
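The padding tuple above follows F.pad's last-dimension-first convention for 5D input, i.e. (W_left, W_right, H_left, H_right, T_front, T_back); in the causal case all kernel_size - 1 temporal pad frames go in front, so the convolution never looks at future frames. A quick shape check, assuming an integer kernel size (the import path is assumed from the file layout):

import torch
from hymm_sp.vae.unet_causal_3d_blocks import CausalConv3d

conv = CausalConv3d(chan_in=8, chan_out=8, kernel_size=3)
y = conv(torch.randn(1, 8, 5, 32, 32))   # B, C, T, H, W
print(y.shape)                           # torch.Size([1, 8, 5, 32, 32]); T and the spatial dims are preserved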
73
+ class CausalAvgPool3d(nn.Module):
74
+ def __init__(
75
+ self,
76
+ kernel_size: Union[int, Tuple[int, int, int]],
77
+ stride: Union[int, Tuple[int, int, int]],
78
+ pad_mode = 'replicate',
79
+ disable_causal=False,
80
+ **kwargs
81
+ ):
82
+ super().__init__()
83
+
84
+ self.pad_mode = pad_mode
85
+ if disable_causal:
86
+ padding = (0, 0, 0, 0, 0, 0)
87
+ else:
88
+ padding = (0, 0, 0, 0, stride - 1, 0) # W, H, T
89
+ self.time_causal_padding = padding
90
+
91
+ self.conv = nn.AvgPool3d(kernel_size, stride=stride, ceil_mode=True, **kwargs)
92
+ self.pad_mode = pad_mode
93
+
94
+ def forward(self, x):
95
+ x = F.pad(x, self.time_causal_padding, mode=self.pad_mode)
96
+ return self.conv(x)
97
+
98
+ class UpsampleCausal3D(nn.Module):
99
+ """A 3D upsampling layer with an optional convolution.
100
+
101
+ Parameters:
102
+ channels (`int`):
103
+ number of channels in the inputs and outputs.
104
+ use_conv (`bool`, default `False`):
105
+ option to use a convolution.
106
+ use_conv_transpose (`bool`, default `False`):
107
+ option to use a convolution transpose.
108
+ out_channels (`int`, optional):
109
+ number of output channels. Defaults to `channels`.
110
+ name (`str`, default `conv`):
111
+ name of the upsampling 3D layer.
112
+ """
113
+
114
+ def __init__(
115
+ self,
116
+ channels: int,
117
+ use_conv: bool = False,
118
+ use_conv_transpose: bool = False,
119
+ out_channels: Optional[int] = None,
120
+ name: str = "conv",
121
+ kernel_size: Optional[int] = None,
122
+ padding=1,
123
+ norm_type=None,
124
+ eps=None,
125
+ elementwise_affine=None,
126
+ bias=True,
127
+ interpolate=True,
128
+ upsample_factor=(2, 2, 2),
129
+ disable_causal=False,
130
+ ):
131
+ super().__init__()
132
+ self.channels = channels
133
+ self.out_channels = out_channels or channels
134
+ self.use_conv = use_conv
135
+ self.use_conv_transpose = use_conv_transpose
136
+ self.name = name
137
+ self.interpolate = interpolate
138
+ self.upsample_factor = upsample_factor
139
+ self.disable_causal = disable_causal
140
+
141
+ if norm_type == "ln_norm":
142
+ self.norm = nn.LayerNorm(channels, eps, elementwise_affine)
143
+ elif norm_type == "rms_norm":
144
+ self.norm = RMSNorm(channels, eps, elementwise_affine)
145
+ elif norm_type is None:
146
+ self.norm = None
147
+ else:
148
+ raise ValueError(f"unknown norm_type: {norm_type}")
149
+
150
+ conv = None
151
+ if use_conv_transpose:
152
+ assert False, "Not Implement yet"
153
+ if kernel_size is None:
154
+ kernel_size = 4
155
+ conv = nn.ConvTranspose2d(
156
+ channels, self.out_channels, kernel_size=kernel_size, stride=2, padding=padding, bias=bias
157
+ )
158
+ elif use_conv:
159
+ if kernel_size is None:
160
+ kernel_size = 3
161
+ conv = CausalConv3d(self.channels, self.out_channels,
162
+ kernel_size=kernel_size, bias=bias, disable_causal=disable_causal)
163
+
164
+ if name == "conv":
165
+ self.conv = conv
166
+ else:
167
+ self.Conv2d_0 = conv
168
+
169
+ def forward(
170
+ self,
171
+ hidden_states: torch.FloatTensor,
172
+ output_size: Optional[int] = None,
173
+ scale: float = 1.0,
174
+ ) -> torch.FloatTensor:
175
+ assert hidden_states.shape[1] == self.channels
176
+
177
+ if self.norm is not None:
178
+ assert False, "Not Implement yet"
179
+ hidden_states = self.norm(hidden_states.permute(0, 2, 3, 1)).permute(0, 3, 1, 2)
180
+
181
+ if self.use_conv_transpose:
182
+ return self.conv(hidden_states)
183
+
184
+ # Cast to float32, as the 'upsample_nearest2d_out_frame' op does not support bfloat16
185
+ # https://github.com/pytorch/pytorch/issues/86679
186
+ dtype = hidden_states.dtype
187
+ if dtype == torch.bfloat16:
188
+ hidden_states = hidden_states.to(torch.float32)
189
+
190
+ # upsample_nearest_nhwc fails with large batch sizes.
191
+ # see https://github.com/huggingface/diffusers/issues/984
192
+ if hidden_states.shape[0] >= 64:
193
+ hidden_states = hidden_states.contiguous()
194
+
195
+ # if `output_size` is passed we force the interpolation output
196
+ # size and do not make use of `scale_factor=2`
197
+ if self.interpolate:
198
+ B, C, T, H, W = hidden_states.shape
199
+ if not self.disable_causal:
200
+ first_h, other_h = hidden_states.split((1, T-1), dim=2)
201
+ if output_size is None:
202
+ if T > 1:
203
+ other_h = F.interpolate(other_h, scale_factor=self.upsample_factor, mode="nearest")
204
+
205
+ first_h = first_h.squeeze(2)
206
+ first_h = F.interpolate(first_h, scale_factor=self.upsample_factor[1:], mode="nearest")
207
+ first_h = first_h.unsqueeze(2)
208
+ else:
209
+ assert False, "Not Implement yet"
210
+ other_h = F.interpolate(other_h, size=output_size, mode="nearest")
211
+
212
+ if T > 1:
213
+ hidden_states = torch.cat((first_h, other_h), dim=2)
214
+ else:
215
+ hidden_states = first_h
216
+ else:
217
+ hidden_states = F.interpolate(hidden_states, scale_factor=self.upsample_factor, mode="nearest")
218
+
219
+ if dtype == torch.bfloat16:
220
+ hidden_states = hidden_states.to(dtype)
221
+
222
+ if self.use_conv:
223
+ if self.name == "conv":
224
+ hidden_states = self.conv(hidden_states)
225
+ else:
226
+ hidden_states = self.Conv2d_0(hidden_states)
227
+
228
+ return hidden_states
229
+
230
+ class DownsampleCausal3D(nn.Module):
231
+ """A 3D downsampling layer with an optional convolution.
232
+
233
+ Parameters:
234
+ channels (`int`):
235
+ number of channels in the inputs and outputs.
236
+ use_conv (`bool`, default `False`):
237
+ option to use a convolution.
238
+ out_channels (`int`, optional):
239
+ number of output channels. Defaults to `channels`.
240
+ padding (`int`, default `1`):
241
+ padding for the convolution.
242
+ name (`str`, default `conv`):
243
+ name of the downsampling 3D layer.
244
+ """
245
+
246
+ def __init__(
247
+ self,
248
+ channels: int,
249
+ use_conv: bool = False,
250
+ out_channels: Optional[int] = None,
251
+ padding: int = 1,
252
+ name: str = "conv",
253
+ kernel_size=3,
254
+ norm_type=None,
255
+ eps=None,
256
+ elementwise_affine=None,
257
+ bias=True,
258
+ stride=2,
259
+ disable_causal=False,
260
+ ):
261
+ super().__init__()
262
+ self.channels = channels
263
+ self.out_channels = out_channels or channels
264
+ self.use_conv = use_conv
265
+ self.padding = padding
266
+ stride = stride
267
+ self.name = name
268
+
269
+ if norm_type == "ln_norm":
270
+ self.norm = nn.LayerNorm(channels, eps, elementwise_affine)
271
+ elif norm_type == "rms_norm":
272
+ self.norm = RMSNorm(channels, eps, elementwise_affine)
273
+ elif norm_type is None:
274
+ self.norm = None
275
+ else:
276
+ raise ValueError(f"unknown norm_type: {norm_type}")
277
+
278
+ if use_conv:
279
+ conv = CausalConv3d(
280
+ self.channels, self.out_channels, kernel_size=kernel_size, stride=stride,
281
+ disable_causal=disable_causal, bias=bias
282
+ )
283
+ else:
284
+ raise NotImplementedError
285
+ if name == "conv":
286
+ self.Conv2d_0 = conv
287
+ self.conv = conv
288
+ elif name == "Conv2d_0":
289
+ self.conv = conv
290
+ else:
291
+ self.conv = conv
292
+
293
+ def forward(self, hidden_states: torch.FloatTensor, scale: float = 1.0) -> torch.FloatTensor:
294
+ assert hidden_states.shape[1] == self.channels
295
+
296
+ if self.norm is not None:
297
+ hidden_states = self.norm(hidden_states.permute(0, 2, 3, 1)).permute(0, 3, 1, 2)
298
+
299
+ assert hidden_states.shape[1] == self.channels
300
+
301
+ hidden_states = self.conv(hidden_states)
302
+
303
+ return hidden_states
304
+
305
+ class ResnetBlockCausal3D(nn.Module):
306
+ r"""
307
+ A Resnet block.
308
+
309
+ Parameters:
310
+ in_channels (`int`): The number of channels in the input.
311
+ out_channels (`int`, *optional*, default to be `None`):
312
+ The number of output channels for the first conv2d layer. If None, same as `in_channels`.
313
+ dropout (`float`, *optional*, defaults to `0.0`): The dropout probability to use.
314
+ temb_channels (`int`, *optional*, default to `512`): the number of channels in timestep embedding.
315
+ groups (`int`, *optional*, default to `32`): The number of groups to use for the first normalization layer.
316
+ groups_out (`int`, *optional*, default to None):
317
+ The number of groups to use for the second normalization layer. if set to None, same as `groups`.
318
+ eps (`float`, *optional*, defaults to `1e-6`): The epsilon to use for the normalization.
319
+ non_linearity (`str`, *optional*, default to `"swish"`): the activation function to use.
320
+ time_embedding_norm (`str`, *optional*, default to `"default"` ): Time scale shift config.
321
+ By default, apply timestep embedding conditioning with a simple shift mechanism. Choose "scale_shift" or
322
+ "ada_group" for a stronger conditioning with scale and shift.
323
+ kernel (`torch.FloatTensor`, optional, default to None): FIR filter, see
324
+ [`~models.resnet.FirUpsample2D`] and [`~models.resnet.FirDownsample2D`].
325
+ output_scale_factor (`float`, *optional*, default to be `1.0`): the scale factor to use for the output.
326
+ use_in_shortcut (`bool`, *optional*, default to `True`):
327
+ If `True`, add a 1x1 nn.conv2d layer for skip-connection.
328
+ up (`bool`, *optional*, default to `False`): If `True`, add an upsample layer.
329
+ down (`bool`, *optional*, default to `False`): If `True`, add a downsample layer.
330
+ conv_shortcut_bias (`bool`, *optional*, default to `True`): If `True`, adds a learnable bias to the
331
+ `conv_shortcut` output.
332
+ conv_3d_out_channels (`int`, *optional*, default to `None`): the number of channels in the output.
333
+ If None, same as `out_channels`.
334
+ """
335
+
336
+ def __init__(
337
+ self,
338
+ *,
339
+ in_channels: int,
340
+ out_channels: Optional[int] = None,
341
+ conv_shortcut: bool = False,
342
+ dropout: float = 0.0,
343
+ temb_channels: int = 512,
344
+ groups: int = 32,
345
+ groups_out: Optional[int] = None,
346
+ pre_norm: bool = True,
347
+ eps: float = 1e-6,
348
+ non_linearity: str = "swish",
349
+ skip_time_act: bool = False,
350
+ time_embedding_norm: str = "default", # default, scale_shift, ada_group, spatial
351
+ kernel: Optional[torch.FloatTensor] = None,
352
+ output_scale_factor: float = 1.0,
353
+ use_in_shortcut: Optional[bool] = None,
354
+ up: bool = False,
355
+ down: bool = False,
356
+ conv_shortcut_bias: bool = True,
357
+ conv_3d_out_channels: Optional[int] = None,
358
+ disable_causal: bool = False,
359
+ ):
360
+ super().__init__()
361
+ self.pre_norm = pre_norm
362
+ self.pre_norm = True
363
+ self.in_channels = in_channels
364
+ out_channels = in_channels if out_channels is None else out_channels
365
+ self.out_channels = out_channels
366
+ self.use_conv_shortcut = conv_shortcut
367
+ self.up = up
368
+ self.down = down
369
+ self.output_scale_factor = output_scale_factor
370
+ self.time_embedding_norm = time_embedding_norm
371
+ self.skip_time_act = skip_time_act
372
+
373
+ linear_cls = nn.Linear
374
+
375
+ if groups_out is None:
376
+ groups_out = groups
377
+
378
+ if self.time_embedding_norm == "ada_group":
379
+ self.norm1 = AdaGroupNorm(temb_channels, in_channels, groups, eps=eps)
380
+ elif self.time_embedding_norm == "spatial":
381
+ self.norm1 = SpatialNorm(in_channels, temb_channels)
382
+ else:
383
+ self.norm1 = torch.nn.GroupNorm(num_groups=groups, num_channels=in_channels, eps=eps, affine=True)
384
+
385
+ self.conv1 = CausalConv3d(in_channels, out_channels, kernel_size=3, stride=1, disable_causal=disable_causal)
386
+
387
+ if temb_channels is not None:
388
+ if self.time_embedding_norm == "default":
389
+ self.time_emb_proj = linear_cls(temb_channels, out_channels)
390
+ elif self.time_embedding_norm == "scale_shift":
391
+ self.time_emb_proj = linear_cls(temb_channels, 2 * out_channels)
392
+ elif self.time_embedding_norm == "ada_group" or self.time_embedding_norm == "spatial":
393
+ self.time_emb_proj = None
394
+ else:
395
+ raise ValueError(f"unknown time_embedding_norm : {self.time_embedding_norm} ")
396
+ else:
397
+ self.time_emb_proj = None
398
+
399
+ if self.time_embedding_norm == "ada_group":
400
+ self.norm2 = AdaGroupNorm(temb_channels, out_channels, groups_out, eps=eps)
401
+ elif self.time_embedding_norm == "spatial":
402
+ self.norm2 = SpatialNorm(out_channels, temb_channels)
403
+ else:
404
+ self.norm2 = torch.nn.GroupNorm(num_groups=groups_out, num_channels=out_channels, eps=eps, affine=True)
405
+
406
+ self.dropout = torch.nn.Dropout(dropout)
407
+ conv_3d_out_channels = conv_3d_out_channels or out_channels
408
+ self.conv2 = CausalConv3d(out_channels, conv_3d_out_channels,
409
+ kernel_size=3, stride=1, disable_causal=disable_causal)
410
+
411
+ self.nonlinearity = get_activation(non_linearity)
412
+
413
+ self.upsample = self.downsample = None
414
+ if self.up:
415
+ self.upsample = UpsampleCausal3D(in_channels, use_conv=False, disable_causal=disable_causal)
416
+ elif self.down:
417
+ self.downsample = DownsampleCausal3D(in_channels, use_conv=False,
418
+ disable_causal=disable_causal, name="op")
419
+
420
+ self.use_in_shortcut = self.in_channels != conv_3d_out_channels \
421
+ if use_in_shortcut is None else use_in_shortcut
422
+
423
+ self.conv_shortcut = None
424
+ if self.use_in_shortcut:
425
+ self.conv_shortcut = CausalConv3d(
426
+ in_channels,
427
+ conv_3d_out_channels,
428
+ kernel_size=1,
429
+ stride=1,
430
+ disable_causal=disable_causal,
431
+ bias=conv_shortcut_bias,
432
+ )
433
+
434
+ def forward(
435
+ self,
436
+ input_tensor: torch.FloatTensor,
437
+ temb: torch.FloatTensor,
438
+ scale: float = 1.0,
439
+ ) -> torch.FloatTensor:
440
+ hidden_states = input_tensor
441
+
442
+ if self.time_embedding_norm == "ada_group" or self.time_embedding_norm == "spatial":
443
+ hidden_states = self.norm1(hidden_states, temb)
444
+ else:
445
+ hidden_states = self.norm1(hidden_states)
446
+
447
+ hidden_states = self.nonlinearity(hidden_states)
448
+
449
+ if self.upsample is not None:
450
+ # upsample_nearest_nhwc fails with large batch sizes.
451
+ # see https://github.com/huggingface/diffusers/issues/984
452
+ if hidden_states.shape[0] >= 64:
453
+ input_tensor = input_tensor.contiguous()
454
+ hidden_states = hidden_states.contiguous()
455
+ input_tensor = (
456
+ self.upsample(input_tensor, scale=scale)
457
+ )
458
+ hidden_states = (
459
+ self.upsample(hidden_states, scale=scale)
460
+ )
461
+ elif self.downsample is not None:
462
+ input_tensor = (
463
+ self.downsample(input_tensor, scale=scale)
464
+ )
465
+ hidden_states = (
466
+ self.downsample(hidden_states, scale=scale)
467
+ )
468
+
469
+ hidden_states = self.conv1(hidden_states)
470
+
471
+ if self.time_emb_proj is not None:
472
+ if not self.skip_time_act:
473
+ temb = self.nonlinearity(temb)
474
+ temb = (
475
+ self.time_emb_proj(temb, scale)[:, :, None, None]
476
+ )
477
+
478
+ if temb is not None and self.time_embedding_norm == "default":
479
+ hidden_states = hidden_states + temb
480
+
481
+ if self.time_embedding_norm == "ada_group" or self.time_embedding_norm == "spatial":
482
+ hidden_states = self.norm2(hidden_states, temb)
483
+ else:
484
+ hidden_states = self.norm2(hidden_states)
485
+
486
+ if temb is not None and self.time_embedding_norm == "scale_shift":
487
+ scale, shift = torch.chunk(temb, 2, dim=1)
488
+ hidden_states = hidden_states * (1 + scale) + shift
489
+
490
+ hidden_states = self.nonlinearity(hidden_states)
491
+
492
+ hidden_states = self.dropout(hidden_states)
493
+ hidden_states = self.conv2(hidden_states)
494
+
495
+ if self.conv_shortcut is not None:
496
+ input_tensor = (
497
+ self.conv_shortcut(input_tensor)
498
+ )
499
+
500
+ output_tensor = (input_tensor + hidden_states) / self.output_scale_factor
501
+
502
+ return output_tensor
503
+
504
+ def get_down_block3d(
505
+ down_block_type: str,
506
+ num_layers: int,
507
+ in_channels: int,
508
+ out_channels: int,
509
+ temb_channels: int,
510
+ add_downsample: bool,
511
+ downsample_stride: int,
512
+ resnet_eps: float,
513
+ resnet_act_fn: str,
514
+ transformer_layers_per_block: int = 1,
515
+ num_attention_heads: Optional[int] = None,
516
+ resnet_groups: Optional[int] = None,
517
+ cross_attention_dim: Optional[int] = None,
518
+ downsample_padding: Optional[int] = None,
519
+ dual_cross_attention: bool = False,
520
+ use_linear_projection: bool = False,
521
+ only_cross_attention: bool = False,
522
+ upcast_attention: bool = False,
523
+ resnet_time_scale_shift: str = "default",
524
+ attention_type: str = "default",
525
+ resnet_skip_time_act: bool = False,
526
+ resnet_out_scale_factor: float = 1.0,
527
+ cross_attention_norm: Optional[str] = None,
528
+ attention_head_dim: Optional[int] = None,
529
+ downsample_type: Optional[str] = None,
530
+ dropout: float = 0.0,
531
+ disable_causal: bool = False,
532
+ ):
533
+ # If attn head dim is not defined, we default it to the number of heads
534
+ if attention_head_dim is None:
535
+ logger.warning(
536
+ f"It is recommended to provide `attention_head_dim` when calling \
537
+ `get_down_block`. Defaulting `attention_head_dim` to {num_attention_heads}."
538
+ )
539
+ attention_head_dim = num_attention_heads
540
+
541
+ down_block_type = down_block_type[7:] if down_block_type.startswith("UNetRes") else down_block_type
542
+ if down_block_type == "DownEncoderBlockCausal3D":
543
+ return DownEncoderBlockCausal3D(
544
+ num_layers=num_layers,
545
+ in_channels=in_channels,
546
+ out_channels=out_channels,
547
+ dropout=dropout,
548
+ add_downsample=add_downsample,
549
+ downsample_stride=downsample_stride,
550
+ resnet_eps=resnet_eps,
551
+ resnet_act_fn=resnet_act_fn,
552
+ resnet_groups=resnet_groups,
553
+ downsample_padding=downsample_padding,
554
+ resnet_time_scale_shift=resnet_time_scale_shift,
555
+ disable_causal=disable_causal,
556
+ )
557
+ raise ValueError(f"{down_block_type} does not exist.")
558
+
559
+ def get_up_block3d(
560
+ up_block_type: str,
561
+ num_layers: int,
562
+ in_channels: int,
563
+ out_channels: int,
564
+ prev_output_channel: int,
565
+ temb_channels: int,
566
+ add_upsample: bool,
567
+ upsample_scale_factor: Tuple,
568
+ resnet_eps: float,
569
+ resnet_act_fn: str,
570
+ resolution_idx: Optional[int] = None,
571
+ transformer_layers_per_block: int = 1,
572
+ num_attention_heads: Optional[int] = None,
573
+ resnet_groups: Optional[int] = None,
574
+ cross_attention_dim: Optional[int] = None,
575
+ dual_cross_attention: bool = False,
576
+ use_linear_projection: bool = False,
577
+ only_cross_attention: bool = False,
578
+ upcast_attention: bool = False,
579
+ resnet_time_scale_shift: str = "default",
580
+ attention_type: str = "default",
581
+ resnet_skip_time_act: bool = False,
582
+ resnet_out_scale_factor: float = 1.0,
583
+ cross_attention_norm: Optional[str] = None,
584
+ attention_head_dim: Optional[int] = None,
585
+ upsample_type: Optional[str] = None,
586
+ dropout: float = 0.0,
587
+ disable_causal: bool = False,
588
+ ) -> nn.Module:
589
+ # If attn head dim is not defined, we default it to the number of heads
590
+ if attention_head_dim is None:
591
+ logger.warning(
592
+ f"It is recommended to provide `attention_head_dim` when calling `get_up_block`. \
593
+ Defaulting `attention_head_dim` to {num_attention_heads}."
594
+ )
595
+ attention_head_dim = num_attention_heads
596
+
597
+ up_block_type = up_block_type[7:] if up_block_type.startswith("UNetRes") else up_block_type
598
+ if up_block_type == "UpDecoderBlockCausal3D":
599
+ return UpDecoderBlockCausal3D(
600
+ num_layers=num_layers,
601
+ in_channels=in_channels,
602
+ out_channels=out_channels,
603
+ resolution_idx=resolution_idx,
604
+ dropout=dropout,
605
+ add_upsample=add_upsample,
606
+ upsample_scale_factor=upsample_scale_factor,
607
+ resnet_eps=resnet_eps,
608
+ resnet_act_fn=resnet_act_fn,
609
+ resnet_groups=resnet_groups,
610
+ resnet_time_scale_shift=resnet_time_scale_shift,
611
+ temb_channels=temb_channels,
612
+ disable_causal=disable_causal,
613
+ )
614
+ raise ValueError(f"{up_block_type} does not exist.")
615
+
616
+
617
+ class UNetMidBlockCausal3D(nn.Module):
618
+ """
619
+ A 3D UNet mid-block [`UNetMidBlockCausal3D`] with multiple residual blocks and optional attention blocks.
620
+
621
+ Args:
622
+ in_channels (`int`): The number of input channels.
623
+ temb_channels (`int`): The number of temporal embedding channels.
624
+ dropout (`float`, *optional*, defaults to 0.0): The dropout rate.
625
+ num_layers (`int`, *optional*, defaults to 1): The number of residual blocks.
626
+ resnet_eps (`float`, *optional*, 1e-6 ): The epsilon value for the resnet blocks.
627
+ resnet_time_scale_shift (`str`, *optional*, defaults to `default`):
628
+ The type of normalization to apply to the time embeddings. This can help to improve the performance of the
629
+ model on tasks with long-range temporal dependencies.
630
+ resnet_act_fn (`str`, *optional*, defaults to `swish`): The activation function for the resnet blocks.
631
+ resnet_groups (`int`, *optional*, defaults to 32):
632
+ The number of groups to use in the group normalization layers of the resnet blocks.
633
+ attn_groups (`Optional[int]`, *optional*, defaults to None): The number of groups for the attention blocks.
634
+ resnet_pre_norm (`bool`, *optional*, defaults to `True`):
635
+ Whether to use pre-normalization for the resnet blocks.
636
+ add_attention (`bool`, *optional*, defaults to `True`): Whether to add attention blocks.
637
+ attention_head_dim (`int`, *optional*, defaults to 1):
638
+ Dimension of a single attention head. The number of attention heads is determined based on this value and
639
+ the number of input channels.
640
+ output_scale_factor (`float`, *optional*, defaults to 1.0): The output scale factor.
641
+
642
+ Returns:
643
+ `torch.FloatTensor`: The output of the last residual block, which is a tensor of shape `(batch_size,
644
+ in_channels, num_frames, height, width)`.
645
+
646
+ """
647
+
648
+ def __init__(
649
+ self,
650
+ in_channels: int,
651
+ temb_channels: int,
652
+ dropout: float = 0.0,
653
+ num_layers: int = 1,
654
+ resnet_eps: float = 1e-6,
655
+ resnet_time_scale_shift: str = "default", # default, spatial
656
+ resnet_act_fn: str = "swish",
657
+ resnet_groups: int = 32,
658
+ attn_groups: Optional[int] = None,
659
+ resnet_pre_norm: bool = True,
660
+ add_attention: bool = True,
661
+ attention_head_dim: int = 1,
662
+ output_scale_factor: float = 1.0,
663
+ disable_causal: bool = False,
664
+ causal_attention: bool = False,
665
+ ):
666
+ super().__init__()
667
+ resnet_groups = resnet_groups if resnet_groups is not None else min(in_channels // 4, 32)
668
+ self.add_attention = add_attention
669
+ self.causal_attention = causal_attention
670
+
671
+ if attn_groups is None:
672
+ attn_groups = resnet_groups if resnet_time_scale_shift == "default" else None
673
+
674
+ # there is always at least one resnet
675
+ resnets = [
676
+ ResnetBlockCausal3D(
677
+ in_channels=in_channels,
678
+ out_channels=in_channels,
679
+ temb_channels=temb_channels,
680
+ eps=resnet_eps,
681
+ groups=resnet_groups,
682
+ dropout=dropout,
683
+ time_embedding_norm=resnet_time_scale_shift,
684
+ non_linearity=resnet_act_fn,
685
+ output_scale_factor=output_scale_factor,
686
+ pre_norm=resnet_pre_norm,
687
+ disable_causal=disable_causal,
688
+ )
689
+ ]
690
+ attentions = []
691
+
692
+ if attention_head_dim is None:
693
+ logger.warning(
694
+ f"It is not recommended to pass `attention_head_dim=None`. \
695
+ Defaulting `attention_head_dim` to `in_channels`: {in_channels}."
696
+ )
697
+ attention_head_dim = in_channels
698
+
699
+ for _ in range(num_layers):
700
+ if self.add_attention:
701
+ #assert False, "Not implemented yet"
702
+ attentions.append(
703
+ Attention(
704
+ in_channels,
705
+ heads=in_channels // attention_head_dim,
706
+ dim_head=attention_head_dim,
707
+ rescale_output_factor=output_scale_factor,
708
+ eps=resnet_eps,
709
+ norm_num_groups=attn_groups,
710
+ spatial_norm_dim=temb_channels if resnet_time_scale_shift == "spatial" else None,
711
+ residual_connection=True,
712
+ bias=True,
713
+ upcast_softmax=True,
714
+ _from_deprecated_attn_block=True,
715
+ )
716
+ )
717
+ else:
718
+ attentions.append(None)
719
+
720
+ resnets.append(
721
+ ResnetBlockCausal3D(
722
+ in_channels=in_channels,
723
+ out_channels=in_channels,
724
+ temb_channels=temb_channels,
725
+ eps=resnet_eps,
726
+ groups=resnet_groups,
727
+ dropout=dropout,
728
+ time_embedding_norm=resnet_time_scale_shift,
729
+ non_linearity=resnet_act_fn,
730
+ output_scale_factor=output_scale_factor,
731
+ pre_norm=resnet_pre_norm,
732
+ disable_causal=disable_causal,
733
+ )
734
+ )
735
+
736
+ self.attentions = nn.ModuleList(attentions)
737
+ self.resnets = nn.ModuleList(resnets)
738
+
739
+ def forward(self,
740
+ hidden_states: torch.FloatTensor,
741
+ temb: Optional[torch.FloatTensor] = None) -> torch.FloatTensor:
742
+ hidden_states = self.resnets[0](hidden_states, temb)
743
+ for attn, resnet in zip(self.attentions, self.resnets[1:]):
744
+ if attn is not None:
745
+ B, C, T, H, W = hidden_states.shape
746
+ hidden_states = rearrange(hidden_states, "b c f h w -> b (f h w) c")
747
+ if self.causal_attention:
748
+ attention_mask = prepare_causal_attention_mask(T, H * W,
749
+ hidden_states.dtype,
750
+ hidden_states.device,
751
+ batch_size=B)
752
+ else:
753
+ attention_mask = None
754
+ hidden_states = attn(hidden_states, temb=temb, attention_mask=attention_mask)
755
+ hidden_states = rearrange(hidden_states, "b (f h w) c -> b c f h w", f=T, h=H, w=W)
756
+ hidden_states = resnet(hidden_states, temb)
757
+
758
+ return hidden_states
759
+
760
+
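The forward pass above flattens the (frame, height, width) axes into one token sequence and, when `causal_attention` is set, passes the attention layer an additive mask so that no token attends to a later frame. `prepare_causal_attention_mask` itself is defined elsewhere in the repo and may differ in detail; a minimal sketch of the idea, assuming an additive float mask over the flattened (f h w) token order:

import torch

# Hypothetical stand-in for prepare_causal_attention_mask: tokens in frame t may
# attend to every token in frames <= t; blocked positions get a large negative bias.
def make_frame_causal_mask(num_frames, tokens_per_frame, dtype, device, batch_size=1):
    seq_len = num_frames * tokens_per_frame
    # frame index of each flattened (f h w) token
    frame_idx = torch.arange(num_frames, device=device).repeat_interleave(tokens_per_frame)
    # allowed[i, j] is True when key token j lies in a frame no later than query token i's frame
    allowed = frame_idx.unsqueeze(0) <= frame_idx.unsqueeze(1)
    mask = torch.zeros(seq_len, seq_len, dtype=dtype, device=device)
    mask = mask.masked_fill(~allowed, torch.finfo(dtype).min)
    return mask.unsqueeze(0).expand(batch_size, -1, -1)  # (B, seq, seq) additive mask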
761
+ class DownEncoderBlockCausal3D(nn.Module):
762
+ def __init__(
763
+ self,
764
+ in_channels: int,
765
+ out_channels: int,
766
+ dropout: float = 0.0,
767
+ num_layers: int = 1,
768
+ resnet_eps: float = 1e-6,
769
+ resnet_time_scale_shift: str = "default",
770
+ resnet_act_fn: str = "swish",
771
+ resnet_groups: int = 32,
772
+ resnet_pre_norm: bool = True,
773
+ output_scale_factor: float = 1.0,
774
+ add_downsample: bool = True,
775
+ downsample_stride: int = 2,
776
+ downsample_padding: int = 1,
777
+ disable_causal: bool = False,
778
+ ):
779
+ super().__init__()
780
+ resnets = []
781
+
782
+ for i in range(num_layers):
783
+ in_channels = in_channels if i == 0 else out_channels
784
+ resnets.append(
785
+ ResnetBlockCausal3D(
786
+ in_channels=in_channels,
787
+ out_channels=out_channels,
788
+ temb_channels=None,
789
+ eps=resnet_eps,
790
+ groups=resnet_groups,
791
+ dropout=dropout,
792
+ time_embedding_norm=resnet_time_scale_shift,
793
+ non_linearity=resnet_act_fn,
794
+ output_scale_factor=output_scale_factor,
795
+ pre_norm=resnet_pre_norm,
796
+ disable_causal=disable_causal,
797
+ )
798
+ )
799
+
800
+ self.resnets = nn.ModuleList(resnets)
801
+
802
+ if add_downsample:
803
+ self.downsamplers = nn.ModuleList(
804
+ [
805
+ DownsampleCausal3D(
806
+ out_channels,
807
+ use_conv=True,
808
+ out_channels=out_channels,
809
+ padding=downsample_padding,
810
+ name="op",
811
+ stride=downsample_stride,
812
+ disable_causal=disable_causal,
813
+ )
814
+ ]
815
+ )
816
+ else:
817
+ self.downsamplers = None
818
+
819
+ def forward(self, hidden_states: torch.FloatTensor, scale: float = 1.0) -> torch.FloatTensor:
820
+ for resnet in self.resnets:
821
+ hidden_states = resnet(hidden_states, temb=None, scale=scale)
822
+
823
+ if self.downsamplers is not None:
824
+ for downsampler in self.downsamplers:
825
+ hidden_states = downsampler(hidden_states, scale)
826
+
827
+ return hidden_states
828
+
829
+
830
+ class UpDecoderBlockCausal3D(nn.Module):
831
+ def __init__(
832
+ self,
833
+ in_channels: int,
834
+ out_channels: int,
835
+ resolution_idx: Optional[int] = None,
836
+ dropout: float = 0.0,
837
+ num_layers: int = 1,
838
+ resnet_eps: float = 1e-6,
839
+ resnet_time_scale_shift: str = "default", # default, spatial
840
+ resnet_act_fn: str = "swish",
841
+ resnet_groups: int = 32,
842
+ resnet_pre_norm: bool = True,
843
+ output_scale_factor: float = 1.0,
844
+ add_upsample: bool = True,
845
+ upsample_scale_factor = (2, 2, 2),
846
+ temb_channels: Optional[int] = None,
847
+ disable_causal: bool = False,
848
+ ):
849
+ super().__init__()
850
+ resnets = []
851
+
852
+ for i in range(num_layers):
853
+ input_channels = in_channels if i == 0 else out_channels
854
+
855
+ resnets.append(
856
+ ResnetBlockCausal3D(
857
+ in_channels=input_channels,
858
+ out_channels=out_channels,
859
+ temb_channels=temb_channels,
860
+ eps=resnet_eps,
861
+ groups=resnet_groups,
862
+ dropout=dropout,
863
+ time_embedding_norm=resnet_time_scale_shift,
864
+ non_linearity=resnet_act_fn,
865
+ output_scale_factor=output_scale_factor,
866
+ pre_norm=resnet_pre_norm,
867
+ disable_causal=disable_causal,
868
+ )
869
+ )
870
+
871
+ self.resnets = nn.ModuleList(resnets)
872
+
873
+ if add_upsample:
874
+ self.upsamplers = nn.ModuleList(
875
+ [
876
+ UpsampleCausal3D(
877
+ out_channels,
878
+ use_conv=True,
879
+ out_channels=out_channels,
880
+ upsample_factor=upsample_scale_factor,
881
+ disable_causal=disable_causal
882
+ )
883
+ ]
884
+ )
885
+ else:
886
+ self.upsamplers = None
887
+
888
+ self.resolution_idx = resolution_idx
889
+
890
+ def forward(
891
+ self, hidden_states: torch.FloatTensor, temb: Optional[torch.FloatTensor] = None, scale: float = 1.0
892
+ ) -> torch.FloatTensor:
893
+ for resnet in self.resnets:
894
+ hidden_states = resnet(hidden_states, temb=temb, scale=scale)
895
+
896
+ if self.upsamplers is not None:
897
+ for upsampler in self.upsamplers:
898
+ hidden_states = upsampler(hidden_states)
899
+
900
+ return hidden_states
hymm_sp/vae/vae.py ADDED
@@ -0,0 +1,433 @@
1
+ from dataclasses import dataclass
2
+ from typing import Optional, Tuple
3
+
4
+ import numpy as np
5
+ import torch
6
+ import torch.nn as nn
7
+
8
+ from diffusers.utils import BaseOutput, is_torch_version
9
+ from diffusers.utils.torch_utils import randn_tensor
10
+ from diffusers.models.attention_processor import SpatialNorm
11
+ from .unet_causal_3d_blocks import (
12
+ CausalConv3d,
13
+ UNetMidBlockCausal3D,
14
+ get_down_block3d,
15
+ get_up_block3d,
16
+ )
17
+
18
+ @dataclass
19
+ class DecoderOutput(BaseOutput):
20
+ r"""
21
+ Output of decoding method.
22
+
23
+ Args:
24
+ sample (`torch.FloatTensor` of shape `(batch_size, num_channels, num_frames, height, width)`):
25
+ The decoded output sample from the last layer of the model.
26
+ """
27
+
28
+ sample: torch.FloatTensor
29
+
30
+
31
+ class EncoderCausal3D(nn.Module):
32
+ r"""
33
+ The `EncoderCausal3D` layer of a variational autoencoder that encodes its input into a latent representation.
34
+
35
+ Args:
36
+ in_channels (`int`, *optional*, defaults to 3):
37
+ The number of input channels.
38
+ out_channels (`int`, *optional*, defaults to 3):
39
+ The number of output channels.
40
+ down_block_types (`Tuple[str, ...]`, *optional*, defaults to `("DownEncoderBlockCausal3D",)`):
41
+ The types of down blocks to use. See `~diffusers.models.unet_2d_blocks.get_down_block` for available
42
+ options.
43
+ block_out_channels (`Tuple[int, ...]`, *optional*, defaults to `(64,)`):
44
+ The number of output channels for each block.
45
+ layers_per_block (`int`, *optional*, defaults to 2):
46
+ The number of layers per block.
47
+ norm_num_groups (`int`, *optional*, defaults to 32):
48
+ The number of groups for normalization.
49
+ act_fn (`str`, *optional*, defaults to `"silu"`):
50
+ The activation function to use. See `~diffusers.models.activations.get_activation` for available options.
51
+ double_z (`bool`, *optional*, defaults to `True`):
52
+ Whether to double the number of output channels for the last block.
53
+ """
54
+
55
+ def __init__(
56
+ self,
57
+ in_channels: int = 3,
58
+ out_channels: int = 3,
59
+ down_block_types: Tuple[str, ...] = ("DownEncoderBlockCausal3D",),
60
+ block_out_channels: Tuple[int, ...] = (64,),
61
+ layers_per_block: int = 2,
62
+ norm_num_groups: int = 32,
63
+ act_fn: str = "silu",
64
+ double_z: bool = True,
65
+ mid_block_add_attention=True,
66
+ time_compression_ratio: int = 4,
67
+ spatial_compression_ratio: int = 8,
68
+ disable_causal: bool = False,
69
+ mid_block_causal_attn: bool = False,
70
+ ):
71
+ super().__init__()
72
+ self.layers_per_block = layers_per_block
73
+
74
+ self.conv_in = CausalConv3d(in_channels, block_out_channels[0],
75
+ kernel_size=3, stride=1, disable_causal=disable_causal)
76
+ self.mid_block = None
77
+ self.down_blocks = nn.ModuleList([])
78
+
79
+ # down
80
+ output_channel = block_out_channels[0]
81
+ for i, down_block_type in enumerate(down_block_types):
82
+ input_channel = output_channel
83
+ output_channel = block_out_channels[i]
84
+ is_final_block = i == len(block_out_channels) - 1
85
+ num_spatial_downsample_layers = int(np.log2(spatial_compression_ratio))
86
+ num_time_downsample_layers = int(np.log2(time_compression_ratio))
87
+
88
+ if time_compression_ratio == 4:
89
+ add_spatial_downsample = bool(i < num_spatial_downsample_layers)
90
+ add_time_downsample = bool(i >= (len(block_out_channels) - 1 - \
91
+ num_time_downsample_layers) and not is_final_block)
92
+ elif time_compression_ratio == 8:
93
+ add_spatial_downsample = bool(i < num_spatial_downsample_layers)
94
+ add_time_downsample = bool(i < num_time_downsample_layers)
95
+ else:
96
+ raise ValueError(f"Unsupported time_compression_ratio: {time_compression_ratio}")
97
+
98
+ downsample_stride_HW = (2, 2) if add_spatial_downsample else (1, 1)
99
+ downsample_stride_T = (2, ) if add_time_downsample else (1, )
100
+ downsample_stride = tuple(downsample_stride_T + downsample_stride_HW)
101
+ down_block = get_down_block3d(
102
+ down_block_type,
103
+ num_layers=self.layers_per_block,
104
+ in_channels=input_channel,
105
+ out_channels=output_channel,
106
+ add_downsample=bool(add_spatial_downsample or add_time_downsample),
107
+ downsample_stride=downsample_stride,
108
+ resnet_eps=1e-6,
109
+ downsample_padding=0,
110
+ resnet_act_fn=act_fn,
111
+ resnet_groups=norm_num_groups,
112
+ attention_head_dim=output_channel,
113
+ temb_channels=None,
114
+ disable_causal=disable_causal,
115
+ )
116
+ self.down_blocks.append(down_block)
117
+
118
+ # mid
119
+ self.mid_block = UNetMidBlockCausal3D(
120
+ in_channels=block_out_channels[-1],
121
+ resnet_eps=1e-6,
122
+ resnet_act_fn=act_fn,
123
+ output_scale_factor=1,
124
+ resnet_time_scale_shift="default",
125
+ attention_head_dim=block_out_channels[-1],
126
+ resnet_groups=norm_num_groups,
127
+ temb_channels=None,
128
+ add_attention=mid_block_add_attention,
129
+ disable_causal=disable_causal,
130
+ causal_attention=mid_block_causal_attn,
131
+ )
132
+
133
+ # out
134
+ self.conv_norm_out = nn.GroupNorm(num_channels=block_out_channels[-1], num_groups=norm_num_groups, eps=1e-6)
135
+ self.conv_act = nn.SiLU()
136
+
137
+ conv_out_channels = 2 * out_channels if double_z else out_channels
138
+ self.conv_out = CausalConv3d(block_out_channels[-1], conv_out_channels,
139
+ kernel_size=3, disable_causal=disable_causal)
140
+
141
+ self.gradient_checkpointing = False
142
+
143
+ def forward(self, sample: torch.FloatTensor) -> torch.FloatTensor:
144
+ r"""The forward method of the `EncoderCausal3D` class."""
145
+ assert len(sample.shape) == 5, "The input tensor should have 5 dimensions"
146
+
147
+ sample = self.conv_in(sample)
148
+
149
+ if self.training and self.gradient_checkpointing:
150
+
151
+ def create_custom_forward(module):
152
+ def custom_forward(*inputs):
153
+ return module(*inputs)
154
+
155
+ return custom_forward
156
+
157
+ # down
158
+ if is_torch_version(">=", "1.11.0"):
159
+ for down_block in self.down_blocks:
160
+ sample = torch.utils.checkpoint.checkpoint(
161
+ create_custom_forward(down_block), sample, use_reentrant=False
162
+ )
163
+ # middle
164
+ sample = torch.utils.checkpoint.checkpoint(
165
+ create_custom_forward(self.mid_block), sample, use_reentrant=False
166
+ )
167
+ else:
168
+ for down_block in self.down_blocks:
169
+ sample = torch.utils.checkpoint.checkpoint(create_custom_forward(down_block), sample)
170
+ # middle
171
+ sample = torch.utils.checkpoint.checkpoint(create_custom_forward(self.mid_block), sample)
172
+
173
+ else:
174
+ # down
175
+ for down_block in self.down_blocks:
176
+ sample = down_block(sample)
177
+
178
+ # middle
179
+ sample = self.mid_block(sample)
180
+
181
+ # post-process
182
+ sample = self.conv_norm_out(sample)
183
+ sample = self.conv_act(sample)
184
+ sample = self.conv_out(sample)
185
+
186
+ return sample
187
+
188
+
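The loop above picks, per encoder block, whether to halve the spatial and/or temporal resolution so that the accumulated strides reach `spatial_compression_ratio` and `time_compression_ratio`. A small worked example of that schedule, assuming the illustrative config `block_out_channels=(128, 256, 512, 512)` with `time_compression_ratio=4` and `spatial_compression_ratio=8` (the channel counts are an assumption, not taken from this diff):

import numpy as np

block_out_channels = (128, 256, 512, 512)
time_compression_ratio, spatial_compression_ratio = 4, 8
num_spatial = int(np.log2(spatial_compression_ratio))  # 3 spatial halvings
num_time = int(np.log2(time_compression_ratio))        # 2 temporal halvings

for i in range(len(block_out_channels)):
    is_final = i == len(block_out_channels) - 1
    add_spatial = i < num_spatial
    add_time = i >= (len(block_out_channels) - 1 - num_time) and not is_final
    stride = ((2,) if add_time else (1,)) + ((2, 2) if add_spatial else (1, 1))
    print(f"block {i}: downsample_stride={stride}")
# block 0: (1, 2, 2), blocks 1-2: (2, 2, 2), block 3: (1, 1, 1)
# -> 2*2*2 = 8x spatial and 2*2 = 4x temporal compression, with the final block left untouched.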
189
+ class DecoderCausal3D(nn.Module):
190
+ r"""
191
+ The `DecoderCausal3D` layer of a variational autoencoder that decodes its
192
+ latent representation into an output sample.
193
+
194
+ Args:
195
+ in_channels (`int`, *optional*, defaults to 3):
196
+ The number of input channels.
197
+ out_channels (`int`, *optional*, defaults to 3):
198
+ The number of output channels.
199
+ up_block_types (`Tuple[str, ...]`, *optional*, defaults to `("UpDecoderBlockCausal3D",)`):
200
+ The types of up blocks to use. See `~diffusers.models.unet_2d_blocks.get_up_block` for available options.
201
+ block_out_channels (`Tuple[int, ...]`, *optional*, defaults to `(64,)`):
202
+ The number of output channels for each block.
203
+ layers_per_block (`int`, *optional*, defaults to 2):
204
+ The number of layers per block.
205
+ norm_num_groups (`int`, *optional*, defaults to 32):
206
+ The number of groups for normalization.
207
+ act_fn (`str`, *optional*, defaults to `"silu"`):
208
+ The activation function to use. See `~diffusers.models.activations.get_activation` for available options.
209
+ norm_type (`str`, *optional*, defaults to `"group"`):
210
+ The normalization type to use. Can be either `"group"` or `"spatial"`.
211
+ """
212
+
213
+ def __init__(
214
+ self,
215
+ in_channels: int = 3,
216
+ out_channels: int = 3,
217
+ up_block_types: Tuple[str, ...] = ("UpDecoderBlockCausal3D",),
218
+ block_out_channels: Tuple[int, ...] = (64,),
219
+ layers_per_block: int = 2,
220
+ norm_num_groups: int = 32,
221
+ act_fn: str = "silu",
222
+ norm_type: str = "group", # group, spatial
223
+ mid_block_add_attention=True,
224
+ time_compression_ratio: int = 4,
225
+ spatial_compression_ratio: int = 8,
226
+ disable_causal: bool = False,
227
+ mid_block_causal_attn: bool = False,
228
+ ):
229
+ super().__init__()
230
+ self.layers_per_block = layers_per_block
231
+
232
+ self.conv_in = CausalConv3d(in_channels, block_out_channels[-1], kernel_size=3,
233
+ stride=1, disable_causal=disable_causal)
234
+ self.mid_block = None
235
+ self.up_blocks = nn.ModuleList([])
236
+
237
+ temb_channels = in_channels if norm_type == "spatial" else None
238
+
239
+ # mid
240
+ self.mid_block = UNetMidBlockCausal3D(
241
+ in_channels=block_out_channels[-1],
242
+ resnet_eps=1e-6,
243
+ resnet_act_fn=act_fn,
244
+ output_scale_factor=1,
245
+ resnet_time_scale_shift="default" if norm_type == "group" else norm_type,
246
+ attention_head_dim=block_out_channels[-1],
247
+ resnet_groups=norm_num_groups,
248
+ temb_channels=temb_channels,
249
+ add_attention=mid_block_add_attention,
250
+ disable_causal=disable_causal,
251
+ causal_attention=mid_block_causal_attn,
252
+ )
253
+
254
+ # up
255
+ reversed_block_out_channels = list(reversed(block_out_channels))
256
+ output_channel = reversed_block_out_channels[0]
257
+ for i, up_block_type in enumerate(up_block_types):
258
+ prev_output_channel = output_channel
259
+ output_channel = reversed_block_out_channels[i]
260
+ is_final_block = i == len(block_out_channels) - 1
261
+ num_spatial_upsample_layers = int(np.log2(spatial_compression_ratio))
262
+ num_time_upsample_layers = int(np.log2(time_compression_ratio))
263
+
264
+ if time_compression_ratio == 4:
265
+ add_spatial_upsample = bool(i < num_spatial_upsample_layers)
266
+ add_time_upsample = bool(i >= len(block_out_channels) - 1 - \
267
+ num_time_upsample_layers and not is_final_block)
268
+ elif time_compression_ratio == 8:
269
+ add_spatial_upsample = bool(i >= len(block_out_channels) - num_spatial_upsample_layers)
270
+ add_time_upsample = bool(i >= len(block_out_channels) - num_time_upsample_layers)
271
+ else:
272
+ raise ValueError(f"Unsupported time_compression_ratio: {time_compression_ratio}")
273
+
274
+ upsample_scale_factor_HW = (2, 2) if add_spatial_upsample else (1, 1)
275
+ upsample_scale_factor_T = (2, ) if add_time_upsample else (1, )
276
+ upsample_scale_factor = tuple(upsample_scale_factor_T + upsample_scale_factor_HW)
277
+ up_block = get_up_block3d(
278
+ up_block_type,
279
+ num_layers=self.layers_per_block + 1,
280
+ in_channels=prev_output_channel,
281
+ out_channels=output_channel,
282
+ prev_output_channel=None,
283
+ add_upsample=bool(add_spatial_upsample or add_time_upsample),
284
+ upsample_scale_factor=upsample_scale_factor,
285
+ resnet_eps=1e-6,
286
+ resnet_act_fn=act_fn,
287
+ resnet_groups=norm_num_groups,
288
+ attention_head_dim=output_channel,
289
+ temb_channels=temb_channels,
290
+ resnet_time_scale_shift=norm_type,
291
+ disable_causal=disable_causal,
292
+ )
293
+ self.up_blocks.append(up_block)
294
+ prev_output_channel = output_channel
295
+
296
+ # out
297
+ if norm_type == "spatial":
298
+ self.conv_norm_out = SpatialNorm(block_out_channels[0], temb_channels)
299
+ else:
300
+ self.conv_norm_out = nn.GroupNorm(num_channels=block_out_channels[0], num_groups=norm_num_groups, eps=1e-6)
301
+ self.conv_act = nn.SiLU()
302
+ self.conv_out = CausalConv3d(block_out_channels[0], out_channels, kernel_size=3, disable_causal=disable_causal)
303
+
304
+ self.gradient_checkpointing = False
305
+
306
+ def forward(
307
+ self,
308
+ sample: torch.FloatTensor,
309
+ latent_embeds: Optional[torch.FloatTensor] = None,
310
+ ) -> torch.FloatTensor:
311
+ r"""The forward method of the `DecoderCausal3D` class."""
312
+ assert len(sample.shape) == 5, "The input tensor should have 5 dimensions"
313
+
314
+ sample = self.conv_in(sample)
315
+
316
+ upscale_dtype = next(iter(self.up_blocks.parameters())).dtype
317
+ if self.training and self.gradient_checkpointing:
318
+
319
+ def create_custom_forward(module):
320
+ def custom_forward(*inputs):
321
+ return module(*inputs)
322
+
323
+ return custom_forward
324
+
325
+ if is_torch_version(">=", "1.11.0"):
326
+ # middle
327
+ sample = torch.utils.checkpoint.checkpoint(
328
+ create_custom_forward(self.mid_block),
329
+ sample,
330
+ latent_embeds,
331
+ use_reentrant=False,
332
+ )
333
+ sample = sample.to(upscale_dtype)
334
+
335
+ # up
336
+ for up_block in self.up_blocks:
337
+ sample = torch.utils.checkpoint.checkpoint(
338
+ create_custom_forward(up_block),
339
+ sample,
340
+ latent_embeds,
341
+ use_reentrant=False,
342
+ )
343
+ else:
344
+ # middle
345
+ sample = torch.utils.checkpoint.checkpoint(
346
+ create_custom_forward(self.mid_block), sample, latent_embeds
347
+ )
348
+ sample = sample.to(upscale_dtype)
349
+
350
+ # up
351
+ for up_block in self.up_blocks:
352
+ sample = torch.utils.checkpoint.checkpoint(create_custom_forward(up_block), sample, latent_embeds)
353
+ else:
354
+ # middle
355
+ sample = self.mid_block(sample, latent_embeds)
356
+ sample = sample.to(upscale_dtype)
357
+
358
+ # up
359
+ for up_block in self.up_blocks:
360
+ sample = up_block(sample, latent_embeds)
361
+
362
+ # post-process
363
+ if latent_embeds is None:
364
+ sample = self.conv_norm_out(sample)
365
+ else:
366
+ sample = self.conv_norm_out(sample, latent_embeds)
367
+ sample = self.conv_act(sample)
368
+ sample = self.conv_out(sample)
369
+
370
+ return sample
371
+
372
+
373
+ class DiagonalGaussianDistribution(object):
374
+ def __init__(self, parameters: torch.Tensor, deterministic: bool = False):
375
+ if parameters.ndim == 3:
376
+ dim = 2 # (B, L, C)
377
+ elif parameters.ndim == 5 or parameters.ndim == 4:
378
+ dim = 1 # (B, C, T, H ,W) / (B, C, H, W)
379
+ else:
380
+ raise NotImplementedError
381
+ self.parameters = parameters
382
+ self.mean, self.logvar = torch.chunk(parameters, 2, dim=dim)
383
+ self.logvar = torch.clamp(self.logvar, -30.0, 20.0)
384
+ self.deterministic = deterministic
385
+ self.std = torch.exp(0.5 * self.logvar)
386
+ self.var = torch.exp(self.logvar)
387
+ if self.deterministic:
388
+ self.var = self.std = torch.zeros_like(
389
+ self.mean, device=self.parameters.device, dtype=self.parameters.dtype
390
+ )
391
+
392
+ def sample(self, generator: Optional[torch.Generator] = None) -> torch.FloatTensor:
393
+ # make sure sample is on the same device as the parameters and has same dtype
394
+ sample = randn_tensor(
395
+ self.mean.shape,
396
+ generator=generator,
397
+ device=self.parameters.device,
398
+ dtype=self.parameters.dtype,
399
+ )
400
+ x = self.mean + self.std * sample
401
+ return x
402
+
403
+ def kl(self, other: "DiagonalGaussianDistribution" = None) -> torch.Tensor:
404
+ if self.deterministic:
405
+ return torch.Tensor([0.0])
406
+ else:
407
+ reduce_dim = list(range(1, self.mean.ndim))
408
+ if other is None:
409
+ return 0.5 * torch.sum(
410
+ torch.pow(self.mean, 2) + self.var - 1.0 - self.logvar,
411
+ dim=reduce_dim,
412
+ )
413
+ else:
414
+ return 0.5 * torch.sum(
415
+ torch.pow(self.mean - other.mean, 2) / other.var
416
+ + self.var / other.var
417
+ - 1.0
418
+ - self.logvar
419
+ + other.logvar,
420
+ dim=reduce_dim,
421
+ )
422
+
423
+ def nll(self, sample: torch.Tensor, dims: Tuple[int, ...] = [1, 2, 3]) -> torch.Tensor:
424
+ if self.deterministic:
425
+ return torch.Tensor([0.0])
426
+ logtwopi = np.log(2.0 * np.pi)
427
+ return 0.5 * torch.sum(
428
+ logtwopi + self.logvar + torch.pow(sample - self.mean, 2) / self.var,
429
+ dim=dims,
430
+ )
431
+
432
+ def mode(self) -> torch.Tensor:
433
+ return self.mean
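`DiagonalGaussianDistribution` is the usual VAE reparameterization: the encoder's `double_z` output is split into mean and log-variance along the channel axis, `sample()` draws `mean + std * eps`, `mode()` returns the mean, and `kl()` gives the per-sample KL against a unit Gaussian. A minimal usage sketch (the import path and tensor shapes are assumptions for illustration):

import torch
from hymm_sp.vae.vae import DiagonalGaussianDistribution  # assumed module path

# moments as an encoder with double_z=True would produce: 2*C channels on dim=1
moments = torch.randn(2, 2 * 16, 5, 32, 32)  # (B, 2C, T, H, W), illustrative shape
posterior = DiagonalGaussianDistribution(moments)

z = posterior.sample()    # (B, C, T, H, W) reparameterized draw
z_det = posterior.mode()  # deterministic latent (the mean)
kl = posterior.kl()       # shape (B,): KL divergence against N(0, I)
print(z.shape, z_det.shape, kl.shape)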
requirements.txt ADDED
@@ -0,0 +1,60 @@
1
+ accelerate==1.9.0
2
+ av==15.0.0
3
+ certifi==2025.8.3
4
+ charset-normalizer==3.4.2
5
+ contourpy==1.3.3
6
+ cycler==0.12.1
7
+ decord==0.6.0
8
+ diffusers==0.34.0
9
+ einops==0.8.1
10
+ filelock==3.13.1
11
+ fonttools==4.59.0
12
+ fsspec==2024.6.1
13
+ hf-xet==1.1.5
14
+ huggingface-hub==0.34.3
15
+ idna==3.10
16
+ imageio==2.37.0
17
+ imageio-ffmpeg==0.6.0
18
+ importlib_metadata==8.7.0
19
+ Jinja2==3.1.4
20
+ kiwisolver==1.4.8
21
+ loguru==0.7.3
22
+ MarkupSafe==2.1.5
23
+ matplotlib==3.10.5
24
+ mpmath==1.3.0
25
+ networkx==3.3
26
+ ninja==1.11.1.4
27
+ numpy==2.1.2
28
+ nvidia-ml-py==12.575.51
29
+ nvidia-nccl-cu12==2.21.5
30
+ nvidia-nvjitlink-cu12==12.4.127
31
+ nvidia-nvtx-cu12==12.4.127
32
+ nvitop==1.5.2
33
+ opencv-python-headless==4.12.0.88
34
+ packaging==25.0
35
+ pandas==2.3.1
36
+ pillow==11.0.0
37
+ protobuf==6.31.1
38
+ psutil==7.0.0
39
+ pyparsing==3.2.3
40
+ python-dateutil==2.9.0.post0
41
+ pytz==2025.2
42
+ PyYAML==6.0.2
43
+ regex==2025.7.34
44
+ requests==2.32.4
45
+ safetensors==0.5.3
46
+ sentencepiece==0.2.0
47
+ setuptools==78.1.1
48
+ six==1.17.0
49
+ sympy==1.13.1
50
+ tokenizers==0.21.4
51
+ tqdm==4.67.1
52
+ transformers==4.54.1
53
+ triton==3.1.0
54
+ typing_extensions==4.12.2
55
+ tzdata==2025.2
56
+ urllib3==2.5.0
57
+ wheel==0.45.1
58
+ zipp==3.23.0
59
+ gradio==5.42.0
60
+ sageattention==1.0.6
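The pins above are exact versions; a small optional sanity check that a few key pins resolved as expected (the helper below is a convenience, not part of the repo, and only spot-checks packages named in the list):

from importlib.metadata import PackageNotFoundError, version

pins = {"diffusers": "0.34.0", "transformers": "4.54.1", "accelerate": "1.9.0", "gradio": "5.42.0"}
for name, expected in pins.items():
    try:
        installed = version(name)
        status = "OK" if installed == expected else f"MISMATCH (installed {installed})"
    except PackageNotFoundError:
        status = "MISSING"
    print(f"{name}=={expected}: {status}")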
scripts/run_sample_batch_4090.sh ADDED
@@ -0,0 +1,35 @@
1
+ #!/bin/bash
2
+ JOBS_DIR=$(dirname $(dirname "$0"))
3
+ export PYTHONPATH=${JOBS_DIR}:$PYTHONPATH
4
+ export MODEL_BASE="/path/to/models"
5
+ checkpoint_path="/path/to/ckpts"
6
+
7
+ current_time=$(date "+%Y.%m.%d-%H.%M.%S")
8
+ modelname='Tencent_hunyuanGameCraft_720P'
9
+
10
+ # disable sp and enable cpu offload
11
+ export DISABLE_SP=1
12
+ export CPU_OFFLOAD=1
13
+ export NUM_GPU=1
14
+
15
+ # # enable both sp and cpu offload
16
+ # export DISABLE_SP=0
17
+ # export CPU_OFFLOAD=1
18
+ # export NUM_GPU=8
19
+
20
+ torchrun --nnodes=1 --nproc_per_node=${NUM_GPU} --master_port 29605 hymm_sp/sample_batch.py \
21
+ --image-path "asset/village.png" \
22
+ --prompt "A charming medieval village with cobblestone streets, thatched-roof houses, and vibrant flower gardens under a bright blue sky." \
23
+ --add-neg-prompt "overexposed, low quality, deformation, a poor composition, bad hands, bad teeth, bad eyes, bad limbs, distortion, blurring, text, subtitles, static, picture, black border." \
24
+ --ckpt ${checkpoint_path} \
25
+ --video-size 704 1216 \
26
+ --cfg-scale 2.0 \
27
+ --image-start \
28
+ --action-list w s d a \
29
+ --action-speed-list 0.2 0.2 0.2 0.2 \
30
+ --seed 250160 \
31
+ --infer-steps 50 \
32
+ --flow-shift-eval-video 5.0 \
33
+ --cpu-offload \
34
+ --use-fp8 \
35
+ --save-path './results/'
scripts/run_sample_batch_distill.sh ADDED
@@ -0,0 +1,24 @@
1
+ #!/bin/bash
2
+ JOBS_DIR=$(dirname $(dirname "$0"))
3
+ export PYTHONPATH=${JOBS_DIR}:$PYTHONPATH
4
+ export MODEL_BASE="weights/stdmodels"
5
+ checkpoint_path="weights/gamecraft_models/mp_rank_00_model_states_distill.pt"
6
+
7
+ current_time=$(date "+%Y.%m.%d-%H.%M.%S")
8
+ modelname='Tencent_hunyuanGameCraft_720P'
9
+
10
+ torchrun --nnodes=1 --nproc_per_node=8 --master_port 29605 hymm_sp/sample_batch.py \
11
+ --image-path "asset/village.png" \
12
+ --prompt "A charming medieval village with cobblestone streets, thatched-roof houses, and vibrant flower gardens under a bright blue sky." \
13
+ --add-neg-prompt "overexposed, low quality, deformation, a poor composition, bad hands, bad teeth, bad eyes, bad limbs, distortion, blurring, text, subtitles, static, picture, black border." \
14
+ --ckpt ${checkpoint_path} \
15
+ --video-size 704 1216 \
16
+ --cfg-scale 1.0 \
17
+ --image-start \
18
+ --action-list w s d a \
19
+ --action-speed-list 0.2 0.2 0.2 0.2 \
20
+ --seed 250160 \
21
+ --infer-steps 8 \
22
+ --use-fp8 \
23
+ --flow-shift-eval-video 5.0 \
24
+ --save-path './results_distill/'
scripts/run_sample_batch_sp.sh ADDED
@@ -0,0 +1,24 @@
1
+ #!/bin/bash
2
+ JOBS_DIR=$(dirname $(dirname "$0"))
3
+ export PYTHONPATH=${JOBS_DIR}:$PYTHONPATH
4
+ export MODEL_BASE="weights/stdmodels"
5
+ checkpoint_path="weights/gamecraft_models/mp_rank_00_model_states.pt"
6
+
7
+ current_time=$(date "+%Y.%m.%d-%H.%M.%S")
8
+ modelname='Tencent_hunyuanGameCraft_720P'
9
+
10
+ torchrun --nnodes=1 --nproc_per_node=8 --master_port 29605 hymm_sp/sample_batch.py \
11
+ --image-path "asset/village.png" \
12
+ --prompt "A charming medieval village with cobblestone streets, thatched-roof houses, and vibrant flower gardens under a bright blue sky." \
13
+ --add-pos-prompt "Realistic, High-quality." \
14
+ --add-neg-prompt "overexposed, low quality, deformation, a poor composition, bad hands, bad teeth, bad eyes, bad limbs, distortion, blurring, text, subtitles, static, picture, black border." \
15
+ --ckpt ${checkpoint_path} \
16
+ --video-size 704 1216 \
17
+ --cfg-scale 2.0 \
18
+ --image-start \
19
+ --action-list w s d a \
20
+ --action-speed-list 0.2 0.2 0.2 0.2 \
21
+ --seed 250160 \
22
+ --infer-steps 50 \
23
+ --flow-shift-eval-video 5.0 \
24
+ --save-path './results/'
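The three scripts differ mainly in parallelism and sampling budget: the 4090 script runs a single GPU with CPU offload and FP8 weights at 50 steps and cfg-scale 2.0, the distill script keeps 8 GPUs but drops to 8 steps at cfg-scale 1.0 with the distilled checkpoint, and the SP script is the full 8-GPU, 50-step run. For completeness, a hedged Python wrapper around the single-GPU invocation (paths and flag values are copied from the 4090 script, the negative prompt is omitted for brevity, and everything will need adapting to a real setup):

import os
import subprocess

env = dict(os.environ,
           MODEL_BASE="/path/to/models",  # placeholder path, as in the script
           DISABLE_SP="1", CPU_OFFLOAD="1")

cmd = [
    "torchrun", "--nnodes=1", "--nproc_per_node=1", "--master_port", "29605",
    "hymm_sp/sample_batch.py",
    "--image-path", "asset/village.png",
    "--prompt", "A charming medieval village with cobblestone streets, "
                "thatched-roof houses, and vibrant flower gardens under a bright blue sky.",
    "--ckpt", "/path/to/ckpts",            # placeholder checkpoint path, as in the script
    "--video-size", "704", "1216",
    "--cfg-scale", "2.0",
    "--image-start",
    "--action-list", "w", "s", "d", "a",
    "--action-speed-list", "0.2", "0.2", "0.2", "0.2",
    "--seed", "250160",
    "--infer-steps", "50",
    "--flow-shift-eval-video", "5.0",
    "--cpu-offload", "--use-fp8",
    "--save-path", "./results/",
]
subprocess.run(cmd, env=env, check=True)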