Spaces:

pandaphd
/

generative_photography

Runtime error

App Files Files Community

pandaphd committed on Mar 4

Commit

1ae4e5b

1 Parent(s): e2515d4

Removed <file> from Git LFS tracking

Browse files

Files changed (26) hide show

.gitattributes +36 -3
README.md +14 -3
app.py +152 -3
configs/inference_genphoto/adv3_256_384_genphoto_relora_bokehK.yaml +66 -3
configs/inference_genphoto/adv3_256_384_genphoto_relora_color_temperature.yaml +66 -3
configs/inference_genphoto/adv3_256_384_genphoto_relora_focal_length.yaml +65 -3
configs/inference_genphoto/adv3_256_384_genphoto_relora_shutter_speed.yaml +66 -3
environment.yaml +27 -3
genphoto/data/dataset.py +950 -3
genphoto/models/attention.py +136 -3
genphoto/models/attention_processor.py +412 -3
genphoto/models/camera_adaptor.py +246 -3
genphoto/models/ccl_embedding.py +64 -3
genphoto/models/motion_module.py +389 -3
genphoto/models/resnet.py +440 -3
genphoto/models/unet.py +1300 -3
genphoto/models/unet_blocks.py +818 -3
genphoto/pipelines/pipeline_animation.py +719 -3
genphoto/utils/convert_from_ckpt.py +556 -3
genphoto/utils/convert_lora_safetensor_to_diffusers.py +154 -3
genphoto/utils/util.py +148 -3
inference_bokehK.py +216 -3
inference_color_temperature.py +338 -3
inference_focal_length.py +335 -3
inference_shutter_speed.py +322 -3
requirements.txt +19 -3

.gitattributes CHANGED Viewed

@@ -1,3 +1,36 @@
-version https://git-lfs.github.com/spec/v1
-oid sha256:f978134ea372378fb27d2c9aaeb7db0a8d814207997bdad9ed8f368783d0a857
-size 1593

+*.7z filter=lfs diff=lfs merge=lfs -text
+*.arrow filter=lfs diff=lfs merge=lfs -text
+*.bin filter=lfs diff=lfs merge=lfs -text
+*.bz2 filter=lfs diff=lfs merge=lfs -text
+*.ckpt filter=lfs diff=lfs merge=lfs -text
+*.ftz filter=lfs diff=lfs merge=lfs -text
+*.gz filter=lfs diff=lfs merge=lfs -text
+*.h5 filter=lfs diff=lfs merge=lfs -text
+*.joblib filter=lfs diff=lfs merge=lfs -text
+*.lfs.* filter=lfs diff=lfs merge=lfs -text
+*.mlmodel filter=lfs diff=lfs merge=lfs -text
+*.model filter=lfs diff=lfs merge=lfs -text
+*.msgpack filter=lfs diff=lfs merge=lfs -text
+*.npy filter=lfs diff=lfs merge=lfs -text
+*.npz filter=lfs diff=lfs merge=lfs -text
+*.onnx filter=lfs diff=lfs merge=lfs -text
+*.ot filter=lfs diff=lfs merge=lfs -text
+*.parquet filter=lfs diff=lfs merge=lfs -text
+*.pb filter=lfs diff=lfs merge=lfs -text
+*.pickle filter=lfs diff=lfs merge=lfs -text
+*.pkl filter=lfs diff=lfs merge=lfs -text
+*.pt filter=lfs diff=lfs merge=lfs -text
+*.pth filter=lfs diff=lfs merge=lfs -text
+*.rar filter=lfs diff=lfs merge=lfs -text
+*.safetensors filter=lfs diff=lfs merge=lfs -text
+saved_model/**/* filter=lfs diff=lfs merge=lfs -text
+*.tar.* filter=lfs diff=lfs merge=lfs -text
+*.tar filter=lfs diff=lfs merge=lfs -text
+*.tflite filter=lfs diff=lfs merge=lfs -text
+*.tgz filter=lfs diff=lfs merge=lfs -text
+*.wasm filter=lfs diff=lfs merge=lfs -text
+*.xz filter=lfs diff=lfs merge=lfs -text
+*.zip filter=lfs diff=lfs merge=lfs -text
+*.zst filter=lfs diff=lfs merge=lfs -text
+*tfevents* filter=lfs diff=lfs merge=lfs -text
+* !text !filter !merge !diff

README.md CHANGED Viewed

@@ -1,3 +1,14 @@
-version https://git-lfs.github.com/spec/v1
-oid sha256:9c83176a74a800ceebc4069a48b824b4c1a7b2f06d02ff5959e63eebc2a8d222
-size 331

+---
+title: Generative Photography
+emoji: 📈
+colorFrom: blue
+colorTo: blue
+sdk: gradio
+sdk_version: 5.20.0
+app_file: app.py
+pinned: false
+license: cc-by-nc-nd-4.0
+short_description: Demo for Generative Photography
+---
+Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference

app.py CHANGED Viewed

@@ -1,3 +1,152 @@
-version https://git-lfs.github.com/spec/v1
-oid sha256:14e58bab9ed2b6eac8619e2b9c3c3ff03bf4689406c28de8eb49237f6f25c23b
-size 8306

+import gradio as gr
+import json
+import torch
+from inference_bokehK import load_models as load_bokeh_models, run_inference as run_bokeh_inference, OmegaConf
+from inference_focal_length import load_models as load_focal_models, run_inference as run_focal_inference
+from inference_shutter_speed import load_models as load_shutter_models, run_inference as run_shutter_inference
+from inference_color_temperature import load_models as load_color_models, run_inference as run_color_inference
+# Seed the global torch RNG once at import time, before any pipeline is built.
+torch.manual_seed(42)
+# Load one config and build one pipeline per camera effect at module import.
+# Each load_models() call is unpacked into a (pipeline, device) pair that the
+# matching generate_* callback below reuses for every request.
+bokeh_cfg = OmegaConf.load("configs/inference_genphoto/adv3_256_384_genphoto_relora_bokehK.yaml")
+bokeh_pipeline, bokeh_device = load_bokeh_models(bokeh_cfg)
+focal_cfg = OmegaConf.load("configs/inference_genphoto/adv3_256_384_genphoto_relora_focal_length.yaml")
+focal_pipeline, focal_device = load_focal_models(focal_cfg)
+shutter_cfg = OmegaConf.load("configs/inference_genphoto/adv3_256_384_genphoto_relora_shutter_speed.yaml")
+shutter_pipeline, shutter_device = load_shutter_models(shutter_cfg)
+color_cfg = OmegaConf.load("configs/inference_genphoto/adv3_256_384_genphoto_relora_color_temperature.yaml")
+color_pipeline, color_device = load_color_models(color_cfg)
+def generate_bokeh_video(base_scene, bokehK_list):
+    try:
+        torch.manual_seed(42)
+        if len(json.loads(bokehK_list)) != 5:
+            raise ValueError("Exactly 5 Bokeh K values required")
+        return run_bokeh_inference(
+            pipeline=bokeh_pipeline, tokenizer=bokeh_pipeline.tokenizer,
+            text_encoder=bokeh_pipeline.text_encoder, base_scene=base_scene,
+            bokehK_list=bokehK_list, device=bokeh_device
+        )
+    except Exception as e:
+        return f"Error: {str(e)}"
+def generate_focal_video(base_scene, focal_length_list):
+    try:
+        torch.manual_seed(42)
+        if len(json.loads(focal_length_list)) != 5:
+            raise ValueError("Exactly 5 focal length values required")
+        return run_focal_inference(
+            pipeline=focal_pipeline, tokenizer=focal_pipeline.tokenizer,
+            text_encoder=focal_pipeline.text_encoder, base_scene=base_scene,
+            focal_length_list=focal_length_list, device=focal_device
+        )
+    except Exception as e:
+        return f"Error: {str(e)}"
+def generate_shutter_video(base_scene, shutter_speed_list):
+    try:
+        torch.manual_seed(42)
+        if len(json.loads(shutter_speed_list)) != 5:
+            raise ValueError("Exactly 5 shutter speed values required")
+        return run_shutter_inference(
+            pipeline=shutter_pipeline, tokenizer=shutter_pipeline.tokenizer,
+            text_encoder=shutter_pipeline.text_encoder, base_scene=base_scene,
+            shutter_speed_list=shutter_speed_list, device=shutter_device
+        )
+    except Exception as e:
+        return f"Error: {str(e)}"
+def generate_color_video(base_scene, color_temperature_list):
+    try:
+        torch.manual_seed(42)
+        if len(json.loads(color_temperature_list)) != 5:
+            raise ValueError("Exactly 5 color temperature values required")
+        return run_color_inference(
+            pipeline=color_pipeline, tokenizer=color_pipeline.tokenizer,
+            text_encoder=color_pipeline.text_encoder, base_scene=base_scene,
+            color_temperature_list=color_temperature_list, device=color_device
+        )
+    except Exception as e:
+        return f"Error: {str(e)}"
+# Example rows for gr.Examples: each row is [scene description, JSON list of five values],
+# matching the two textbox inputs of the corresponding tab.
+bokeh_examples = [
+    ["A variety of potted plants are displayed on a window sill, with some of them placed in yellow and white cups. The plants are arranged in different sizes and shapes, creating a visually appealing display.", "[18.0, 14.0, 10.0, 6.0, 2.0]"],
+    ["A colorful backpack with a floral pattern is sitting on a table next to a computer monitor.", "[2.3, 5.8, 10.2, 14.8, 24.9]"]
+]
+focal_examples = [
+    ["A small office cubicle with a desk.", "[25.1, 36.1, 47.1, 58.1, 69.1]"],
+    ["A large white couch in a living room.", "[55.0, 46.0, 37.0, 28.0, 25.0]"]
+]
+shutter_examples = [
+    ["A brown and orange leather handbag.", "[0.11, 0.22, 0.33, 0.44, 0.55]"],
+    ["A variety of potted plants.", "[0.2, 0.49, 0.69, 0.75, 0.89]"]
+]
+color_examples = [
+    ["A blue sky with mountains.", "[5455.0, 5155.0, 5555.0, 6555.0, 7555.0]"],
+    ["A red couch in front of a window.", "[3500.0, 5500.0, 6500.0, 7500.0, 8500.0]"]
+]
+# UI layout: one tab per camera effect. Every tab has the same shape: a scene textbox and a
+# value-list textbox on the left, a video output on the right, and both the example rows and
+# the button invoke the tab's generate_* callback with those two inputs.
+with gr.Blocks(title="Generative Photography") as demo:
+    gr.Markdown("# **Generative Photography: Scene-Consistent Camera Control for Realistic Text-to-Image Synthesis** ")
+    with gr.Tabs():
+        # Bokeh blur tab
+        with gr.Tab("BokehK Effect"):
+            gr.Markdown("### Generate Frames with Bokeh Blur Effect")
+            with gr.Row():
+                with gr.Column():
+                    scene_input_bokeh = gr.Textbox(label="Scene Description", placeholder="Describe the scene you want to generate...")
+                    bokeh_input = gr.Textbox(label="Bokeh Blur Values", placeholder="Enter 5 comma-separated values from 1-30, e.g., [2.44, 8.3, 10.1, 17.2, 24.0]")
+                    submit_bokeh = gr.Button("Generate Video")
+                with gr.Column():
+                    video_output_bokeh = gr.Video(label="Generated Video")
+            gr.Examples(bokeh_examples, [scene_input_bokeh, bokeh_input], [video_output_bokeh], generate_bokeh_video)
+            submit_bokeh.click(generate_bokeh_video, [scene_input_bokeh, bokeh_input], [video_output_bokeh])
+        # Focal length tab
+        with gr.Tab("Focal Length Effect"):
+            gr.Markdown("### Generate Frames with Focal Length Effect")
+            with gr.Row():
+                with gr.Column():
+                    scene_input_focal = gr.Textbox(label="Scene Description", placeholder="Describe the scene you want to generate...")
+                    focal_input = gr.Textbox(label="Focal Length Values", placeholder="Enter 5 comma-separated values from 24-70, e.g., [25.1, 30.2, 33.3, 40.8, 54.0]")
+                    submit_focal = gr.Button("Generate Video")
+                with gr.Column():
+                    video_output_focal = gr.Video(label="Generated Video")
+            gr.Examples(focal_examples, [scene_input_focal, focal_input], [video_output_focal], generate_focal_video)
+            submit_focal.click(generate_focal_video, [scene_input_focal, focal_input], [video_output_focal])
+        # Shutter speed tab
+        with gr.Tab("Shutter Speed Effect"):
+            gr.Markdown("### Generate Frames with Shutter Speed Effect")
+            with gr.Row():
+                with gr.Column():
+                    scene_input_shutter = gr.Textbox(label="Scene Description", placeholder="Describe the scene you want to generate...")
+                    shutter_input = gr.Textbox(label="Shutter Speed Values", placeholder="Enter 5 comma-separated values from 0.1-1.0, e.g., [0.15, 0.32, 0.53, 0.62, 0.82]")
+                    submit_shutter = gr.Button("Generate Video")
+                with gr.Column():
+                    video_output_shutter = gr.Video(label="Generated Video")
+            gr.Examples(shutter_examples, [scene_input_shutter, shutter_input], [video_output_shutter], generate_shutter_video)
+            submit_shutter.click(generate_shutter_video, [scene_input_shutter, shutter_input], [video_output_shutter])
+        # Color temperature tab
+        with gr.Tab("Color Temperature Effect"):
+            gr.Markdown("### Generate Frames with Color Temperature Effect")
+            with gr.Row():
+                with gr.Column():
+                    scene_input_color = gr.Textbox(label="Scene Description", placeholder="Describe the scene you want to generate...")
+                    color_input = gr.Textbox(label="Color Temperature Values", placeholder="Enter 5 comma-separated values from 2000-10000, e.g., [3001.3, 4000.2, 4400.34, 5488.23, 8888.82]")
+                    submit_color = gr.Button("Generate Video")
+                with gr.Column():
+                    video_output_color = gr.Video(label="Generated Video")
+            gr.Examples(color_examples, [scene_input_color, color_input], [video_output_color], generate_color_video)
+            submit_color.click(generate_color_video, [scene_input_color, color_input], [video_output_color])
+# Start the Gradio server only when this file is executed as a script.
+# NOTE(review): share=True asks Gradio for a public share link; presumably unnecessary
+# when the app is hosted on Hugging Face Spaces — confirm against the Gradio docs.
+if __name__ == "__main__":
+    demo.launch(share=True)

configs/inference_genphoto/adv3_256_384_genphoto_relora_bokehK.yaml CHANGED Viewed

@@ -1,3 +1,66 @@
-version https://git-lfs.github.com/spec/v1
-oid sha256:a74bacc98940eb895b1ac635f5e8b4fabb811d98c8a067ece44c0ac4ff460842
-size 1823

+output_dir: "inference_output/genphoto_bokehK"
+pretrained_model_repo: "pandaphd/generative_photography"
+pretrained_model_path: "stable-diffusion-v1-5"
+unet_subfolder: "unet_merged"
+camera_adaptor_ckpt: "weights/checkpoint-bokehK.ckpt"
+lora_ckpt: "weights/RealEstate10K_LoRA.ckpt"
+motion_module_ckpt: "weights/v3_sd15_mm.ckpt"
+lora_rank: 2
+lora_scale: 1.0
+motion_lora_rank: 0
+motion_lora_scale: 1.0
+unet_additional_kwargs:
+  use_motion_module              : true
+  motion_module_resolutions      : [ 1,2,4,8 ]
+  unet_use_cross_frame_attention : false
+  unet_use_temporal_attention    : false
+  motion_module_mid_block: false
+  motion_module_type: Vanilla
+  motion_module_kwargs:
+    num_attention_heads                : 8
+    num_transformer_block              : 1
+    attention_block_types              : [ "Temporal_Self", "Temporal_Self" ]
+    temporal_position_encoding         : true
+    temporal_position_encoding_max_len : 32
+    temporal_attention_dim_div         : 1
+    zero_initialize                    : false
+camera_encoder_kwargs:
+  downscale_factor: 8
+  channels: [320, 640, 1280, 1280]
+  nums_rb: 2
+  cin: 384
+  ksize: 1
+  sk: true
+  use_conv: false
+  compression_factor: 1
+  temporal_attention_nhead: 8
+  attention_block_types: ["Temporal_Self", ]
+  temporal_position_encoding: true
+  temporal_position_encoding_max_len: 16
+attention_processor_kwargs:
+  add_spatial: false
+  spatial_attn_names: 'attn1'
+  add_temporal: true
+  temporal_attn_names: '0'
+  camera_feature_dimensions: [320, 640, 1280, 1280]
+  query_condition: true
+  key_value_condition: true
+  scale: 1.0
+noise_scheduler_kwargs:
+  num_train_timesteps: 1000
+  beta_start:          0.00085
+  beta_end:            0.012
+  beta_schedule:       "linear"
+  steps_offset:        1
+  clip_sample:         false
+num_workers: 8
+global_seed: 42

configs/inference_genphoto/adv3_256_384_genphoto_relora_color_temperature.yaml CHANGED Viewed

@@ -1,3 +1,66 @@
-version https://git-lfs.github.com/spec/v1
-oid sha256:d6f6e2911a8e440f4796db8ae67b919659067b859bacd7575953da6c2b8bfb2d
-size 1845

+output_dir: "inference_output/genphoto_color_temperature"
+pretrained_model_repo: "pandaphd/generative_photography"
+pretrained_model_path: "stable-diffusion-v1-5"
+unet_subfolder: "unet_merged"
+camera_adaptor_ckpt: "weights/checkpoint-color_temperature.ckpt"
+lora_ckpt: "weights/RealEstate10K_LoRA.ckpt"
+motion_module_ckpt: "weights/v3_sd15_mm.ckpt"
+lora_rank: 2
+lora_scale: 1.0
+motion_lora_rank: 0
+motion_lora_scale: 1.0
+unet_additional_kwargs:
+  use_motion_module              : true
+  motion_module_resolutions      : [ 1,2,4,8 ]
+  unet_use_cross_frame_attention : false
+  unet_use_temporal_attention    : false
+  motion_module_mid_block: false
+  motion_module_type: Vanilla
+  motion_module_kwargs:
+    num_attention_heads                : 8
+    num_transformer_block              : 1
+    attention_block_types              : [ "Temporal_Self", "Temporal_Self" ]
+    temporal_position_encoding         : true
+    temporal_position_encoding_max_len : 32
+    temporal_attention_dim_div         : 1
+    zero_initialize                    : false
+camera_encoder_kwargs:
+  downscale_factor: 8
+  channels: [320, 640, 1280, 1280]
+  nums_rb: 2
+  cin: 384
+  ksize: 1
+  sk: true
+  use_conv: false
+  compression_factor: 1
+  temporal_attention_nhead: 8
+  attention_block_types: ["Temporal_Self", ]
+  temporal_position_encoding: true
+  temporal_position_encoding_max_len: 16
+attention_processor_kwargs:
+  add_spatial: false
+  spatial_attn_names: 'attn1'
+  add_temporal: true
+  temporal_attn_names: '0'
+  camera_feature_dimensions: [320, 640, 1280, 1280]
+  query_condition: true
+  key_value_condition: true
+  scale: 1.0
+noise_scheduler_kwargs:
+  num_train_timesteps: 1000
+  beta_start:          0.00085
+  beta_end:            0.012
+  beta_schedule:       "linear"
+  steps_offset:        1
+  clip_sample:         false
+num_workers: 8
+global_seed: 42

configs/inference_genphoto/adv3_256_384_genphoto_relora_focal_length.yaml CHANGED Viewed

@@ -1,3 +1,65 @@
-version https://git-lfs.github.com/spec/v1
-oid sha256:1c8c9059792e1ca206c44edd1cb29765c5ddb1f54551a1b1fc7010bf292420a8
-size 1834

+output_dir: "inference_output/genphoto_focal_length"
+pretrained_model_repo: "pandaphd/generative_photography"
+pretrained_model_path: "stable-diffusion-v1-5"
+unet_subfolder: "unet_merged"
+camera_adaptor_ckpt: "weights/checkpoint-focal_length.ckpt"
+lora_ckpt: "weights/RealEstate10K_LoRA.ckpt"
+motion_module_ckpt: "weights/v3_sd15_mm.ckpt"
+lora_rank: 2
+lora_scale: 1.0
+motion_lora_rank: 0
+motion_lora_scale: 1.0
+unet_additional_kwargs:
+  use_motion_module              : true
+  motion_module_resolutions      : [ 1,2,4,8 ]
+  unet_use_cross_frame_attention : false
+  unet_use_temporal_attention    : false
+  motion_module_mid_block: false
+  motion_module_type: Vanilla
+  motion_module_kwargs:
+    num_attention_heads                : 8
+    num_transformer_block              : 1
+    attention_block_types              : [ "Temporal_Self", "Temporal_Self" ]
+    temporal_position_encoding         : true
+    temporal_position_encoding_max_len : 32
+    temporal_attention_dim_div         : 1
+    zero_initialize                    : false
+camera_encoder_kwargs:
+  downscale_factor: 8
+  channels: [320, 640, 1280, 1280]
+  nums_rb: 2
+  cin: 384
+  ksize: 1
+  sk: true
+  use_conv: false
+  compression_factor: 1
+  temporal_attention_nhead: 8
+  attention_block_types: ["Temporal_Self", ]
+  temporal_position_encoding: true
+  temporal_position_encoding_max_len: 16
+attention_processor_kwargs:
+  add_spatial: false
+  spatial_attn_names: 'attn1'
+  add_temporal: true
+  temporal_attn_names: '0'
+  camera_feature_dimensions: [320, 640, 1280, 1280]
+  query_condition: true
+  key_value_condition: true
+  scale: 1.0
+noise_scheduler_kwargs:
+  num_train_timesteps: 1000
+  beta_start:          0.00085
+  beta_end:            0.012
+  beta_schedule:       "linear"
+  steps_offset:        1
+  clip_sample:         false
+num_workers: 8
+global_seed: 42

configs/inference_genphoto/adv3_256_384_genphoto_relora_shutter_speed.yaml CHANGED Viewed

@@ -1,3 +1,66 @@
-version https://git-lfs.github.com/spec/v1
-oid sha256:36c7d618e29249ce9086f5424f9a718b0faac002edd19ee0fd0335b85fdc8b7f
-size 1837

+output_dir: "inference_output/genphoto_shutter_speed"
+pretrained_model_repo: "pandaphd/generative_photography"
+pretrained_model_path: "stable-diffusion-v1-5"
+unet_subfolder: "unet_merged"
+camera_adaptor_ckpt: "weights/checkpoint-shutter_speed.ckpt"
+lora_ckpt: "weights/RealEstate10K_LoRA.ckpt"
+motion_module_ckpt: "weights/v3_sd15_mm.ckpt"
+lora_rank: 2
+lora_scale: 1.0
+motion_lora_rank: 0
+motion_lora_scale: 1.0
+unet_additional_kwargs:
+  use_motion_module              : true
+  motion_module_resolutions      : [ 1,2,4,8 ]
+  unet_use_cross_frame_attention : false
+  unet_use_temporal_attention    : false
+  motion_module_mid_block: false
+  motion_module_type: Vanilla
+  motion_module_kwargs:
+    num_attention_heads                : 8
+    num_transformer_block              : 1
+    attention_block_types              : [ "Temporal_Self", "Temporal_Self" ]
+    temporal_position_encoding         : true
+    temporal_position_encoding_max_len : 32
+    temporal_attention_dim_div         : 1
+    zero_initialize                    : false
+camera_encoder_kwargs:
+  downscale_factor: 8
+  channels: [320, 640, 1280, 1280]
+  nums_rb: 2
+  cin: 384
+  ksize: 1
+  sk: true
+  use_conv: false
+  compression_factor: 1
+  temporal_attention_nhead: 8
+  attention_block_types: ["Temporal_Self", ]
+  temporal_position_encoding: true
+  temporal_position_encoding_max_len: 16
+attention_processor_kwargs:
+  add_spatial: false
+  spatial_attn_names: 'attn1'
+  add_temporal: true
+  temporal_attn_names: '0'
+  camera_feature_dimensions: [320, 640, 1280, 1280]
+  query_condition: true
+  key_value_condition: true
+  scale: 1.0
+noise_scheduler_kwargs:
+  num_train_timesteps: 1000
+  beta_start:          0.00085
+  beta_end:            0.012
+  beta_schedule:       "linear"
+  steps_offset:        1
+  clip_sample:         false
+num_workers: 8
+global_seed: 42

environment.yaml CHANGED Viewed

@@ -1,3 +1,27 @@
-version https://git-lfs.github.com/spec/v1
-oid sha256:a55fe5d623a3450e046bd7d0d095676d9d2ca62d36d19cfda8e9307007634970
-size 435

+name: genphoto
+channels:
+  - pytorch
+  - nvidia
+dependencies:
+  - python=3.10
+  - pytorch=2.1.1
+  - torchvision=0.16.1
+  - torchaudio=2.1.1
+  - pytorch-cuda=12.1
+  - pip
+  - pip:
+    - diffusers==0.24.0
+    - xformers==0.0.23
+    - imageio==2.36.0
+    - imageio[ffmpeg]
+    - opencv-python
+    - transformers
+    - gdown
+    - einops
+    - decord
+    - omegaconf
+    - safetensors
+    - gradio
+    - wandb
+    - triton
+    - termcolor

genphoto/data/dataset.py CHANGED Viewed

@@ -1,3 +1,950 @@
-version https://git-lfs.github.com/spec/v1
-oid sha256:4c11d5ea01a3dd35a0987915a62ffb2c4c967ff4c81d2c9f0fe876f2daa93aad
-size 38885

+import os
+import random
+import json
+import torch
+import math
+import torch.nn as nn
+import torchvision.transforms as transforms
+import torch.nn.functional as F
+import numpy as np
+from torch.utils.data.dataset import Dataset
+from packaging import version as pver
+import cv2
+from PIL import Image
+from einops import rearrange
+from transformers import pipeline, CLIPTextModel, CLIPTokenizer
+import sys
+sys.path.append('/home/yuan418/data/project/Generative_Photography/genphoto/data/BokehMe/')
+from classical_renderer.scatter import ModuleRenderScatter
+#### for shutter speed ####
+def create_shutter_speed_embedding(shutter_speed_values, target_height, target_width, base_exposure=0.5):
+    """
+    Create an shutter_speed embedding tensor using a constant fwc value.
+    Args:
+    - shutter_speed_values: Tensor of shape [f, 1] containing shutter_speed values for each frame.
+    - H: Height of the image.
+    - W: Width of the image.
+    - base_exposure: A base exposure value to normalize brightness (defaults to 0.18 as a common base exposure level).
+    Returns:
+    - shutter_speed_embedding: Tensor of shape [f, 1, H, W] where each pixel is scaled based on the shutter_speed values.
+    """
+    f = shutter_speed_values.shape[0]
+    # Set a constant full well capacity (fwc)
+    fwc = 32000  # Constant value for full well capacity
+    # Calculate scale based on EV and sensor full well capacity (fwc)
+    scales = (shutter_speed_values / base_exposure) * (fwc / (fwc + 0.0001))
+    # Reshape and expand to match image dimensions
+    scales = scales.unsqueeze(2).unsqueeze(3).expand(f, 3, target_height, target_width)
+    # Use scales to create the final shutter_speed embedding
+    shutter_speed_embedding = scales      # Shape [f, 3, H, W]
+    return shutter_speed_embedding
+def sensor_image_simulation_numpy(avg_PPP, photon_flux, fwc, Nbits, gain=1):
+    min_val = 0
+    max_val = 2 ** Nbits - 1
+    theta = photon_flux * (avg_PPP / (np.mean(photon_flux) + 0.0001))
+    theta = np.clip(theta, 0, fwc)
+    theta = np.round(theta * gain * max_val / fwc)
+    theta = np.clip(theta, min_val, max_val)
+    theta = theta.astype(np.float32)
+    return theta
+class CameraShutterSpeed(Dataset):
+    def __init__(
+            self,
+            root_path,
+            annotation_json,
+            sample_n_frames=5,
+            sample_size=[256, 384],
+            is_Train=True,
+    ):
+        self.root_path = root_path
+        self.sample_n_frames = sample_n_frames
+        self.dataset = json.load(open(os.path.join(root_path, annotation_json), 'r'))
+        self.length = len(self.dataset)
+        self.is_Train = is_Train
+        sample_size = tuple(sample_size) if not isinstance(sample_size, int) else (sample_size, sample_size)
+        self.sample_size = sample_size
+        pixel_transforms = [transforms.Resize(sample_size),
+                                transforms.Normalize(mean=[0.5, 0.5, 0.5], std=[0.5, 0.5, 0.5], inplace=True)]
+        self.pixel_transforms = pixel_transforms
+        self.tokenizer = CLIPTokenizer.from_pretrained("/home/yuan418/data/project/stable-diffusion-v1-5/", subfolder="tokenizer")
+        self.text_encoder = CLIPTextModel.from_pretrained("/home/yuan418/data/project/stable-diffusion-v1-5/", subfolder="text_encoder")
+    def load_image_reader(self, idx):
+        image_dict = self.dataset[idx]
+        image_path = os.path.join(self.root_path, image_dict['base_image_path'])
+        image_reader = cv2.imread(image_path)
+        image_reader = cv2.cvtColor(image_reader, cv2.COLOR_BGR2RGB)
+        image_caption = image_dict['caption']
+        if self.is_Train:
+            mean = 0.48
+            std_dev = 0.25
+            shutter_speed_values = [random.gauss(mean, std_dev) for _ in range(self.sample_n_frames)]
+            shutter_speed_values = [max(0.1, min(1.0, ev)) for ev in shutter_speed_values]
+            print('train shutter_speed values', shutter_speed_values)
+        else:
+            shutter_speed_list_str = image_dict['shutter_speed_list']
+            shutter_speed_values = json.loads(shutter_speed_list_str)
+            print('validation shutter_speed_values', shutter_speed_values)
+        shutter_speed_values = torch.tensor(shutter_speed_values).unsqueeze(1)
+        return image_path, image_reader, image_caption, shutter_speed_values
+    def get_batch(self, idx):
+        image_path, image_reader, image_caption, shutter_speed_values = self.load_image_reader(idx)
+        total_frames = len(shutter_speed_values)
+        if total_frames < 3:
+            raise ValueError("less than 3 frames")
+        # Generate prompts for each shutter speed value and append shutter speed information to caption
+        prompts = []
+        for ss in shutter_speed_values:
+            prompt = f"<exposure: {ss.item()}>"
+            prompts.append(prompt)
+        # Tokenize prompts and encode to get embeddings
+        with torch.no_grad():
+            prompt_ids = self.tokenizer(
+                prompts, max_length=self.tokenizer.model_max_length, padding="max_length", truncation=True, return_tensors="pt"
+            ).input_ids
+            # print('tokenizer model_max_length', self.tokenizer.model_max_length)
+            encoder_hidden_states = self.text_encoder(input_ids=prompt_ids).last_hidden_state  # Shape: (f, sequence_length, hidden_size)
+        # print('encoder_hidden_states shape', encoder_hidden_states.shape)
+        # Calculate differences between consecutive embeddings (ignoring sequence_length)
+        differences = []
+        for i in range(1, encoder_hidden_states.size(0)):
+            diff = encoder_hidden_states[i] - encoder_hidden_states[i - 1]
+            diff = diff.unsqueeze(0)
+            differences.append(diff)
+        # Add the difference between the last and the first embedding
+        final_diff = encoder_hidden_states[-1] - encoder_hidden_states[0]
+        final_diff = final_diff.unsqueeze(0)
+        differences.append(final_diff)
+        # Concatenate differences along the batch dimension (f-1)
+        concatenated_differences = torch.cat(differences, dim=0)
+        # print('concatenated_differences shape', concatenated_differences.shape) # f 77 768
+        frame = concatenated_differences.size(0)
+        concatenated_differences = torch.cat(differences, dim=0)
+        # Current shape: (f, 77, 768) Pad the second dimension (77) to 128
+        pad_length = 128 - concatenated_differences.size(1)
+        if pad_length > 0:
+        # Pad along the second dimension (77 -> 128), pad only on the right side
+            concatenated_differences_padded = F.pad(concatenated_differences, (0, 0, 0, pad_length))
+        ## ccl = constrative camera learning
+        ccl_embedding = concatenated_differences_padded.reshape(frame, self.sample_size[0], self.sample_size[1])
+        ccl_embedding = ccl_embedding.unsqueeze(1)
+        ccl_embedding = ccl_embedding.expand(-1, 3, -1, -1)
+        # Now handle the sensor image simulation
+        fwc = random.uniform(19000, 64000)
+        pixel_values = []
+        for ee in shutter_speed_values:
+            avg_PPP = (0.6 * ee.item() + 0.1) * fwc
+            img_sim = sensor_image_simulation_numpy(avg_PPP, image_reader, fwc, Nbits=8, gain=1)
+            pixel_values.append(img_sim)
+        pixel_values = np.stack(pixel_values, axis=0)
+        pixel_values = torch.from_numpy(pixel_values).permute(0, 3, 1, 2).contiguous() / 255.
+        # Create shutter_speed embedding and concatenate it with CCL embedding
+        shutter_speed_embedding = create_shutter_speed_embedding(shutter_speed_values, self.sample_size[0], self.sample_size[1])
+        camera_embedding = torch.cat((shutter_speed_embedding, ccl_embedding), dim=1)
+        # print('camera_embedding shape', camera_embedding.shape)
+        return pixel_values, image_caption, camera_embedding, shutter_speed_values
+    def __len__(self):
+        return self.length
+    def __getitem__(self, idx):
+        while True:
+            try:
+                video, video_caption, camera_embedding, shutter_speed_values = self.get_batch(idx)
+                break
+            except Exception as e:
+                idx = random.randint(0, self.length - 1)
+        for transform in self.pixel_transforms:
+            video = transform(video)
+        sample = dict(pixel_values=video, text=video_caption, camera_embedding=camera_embedding, shutter_speed_values=shutter_speed_values)
+        return sample
+#### for focal length ####
+def crop_focal_length(image_path, base_focal_length, target_focal_length, target_height, target_width, sensor_height=24.0, sensor_width=36.0):
+    img = Image.open(image_path)
+    width, height = img.size
+    # Calculate base and target FOV
+    base_x_fov = 2.0 * math.atan(sensor_width * 0.5 / base_focal_length)
+    base_y_fov = 2.0 * math.atan(sensor_height * 0.5 / base_focal_length)
+    target_x_fov = 2.0 * math.atan(sensor_width * 0.5 / target_focal_length)
+    target_y_fov = 2.0 * math.atan(sensor_height * 0.5 / target_focal_length)
+    # Calculate crop ratio, use the smaller ratio to maintain aspect ratio
+    crop_ratio = min(target_x_fov / base_x_fov, target_y_fov / base_y_fov)
+    crop_width = int(round(crop_ratio * width))
+    crop_height = int(round(crop_ratio * height))
+    # Ensure crop dimensions are within valid bounds
+    crop_width = max(1, min(width, crop_width))
+    crop_height = max(1, min(height, crop_height))
+    # Crop coordinates
+    left = int((width - crop_width) / 2)
+    top = int((height - crop_height) / 2)
+    right = int((width + crop_width) / 2)
+    bottom = int((height + crop_height) / 2)
+    # Crop the image
+    zoomed_img = img.crop((left, top, right, bottom))
+    # Resize the cropped image to target resolution
+    resized_img = zoomed_img.resize((target_width, target_height), Image.Resampling.LANCZOS)
+    # Convert the PIL image to a numpy array
+    resized_img_np = np.array(resized_img).astype(np.float32)
+    return resized_img_np
+def create_focal_length_embedding(focal_length_values, base_focal_length, target_height, target_width, sensor_height=24.0, sensor_width=36.0):
+    device = 'cpu'
+    focal_length_values = focal_length_values.to(device)
+    f = focal_length_values.shape[0]  # Number of frames
+    # Convert constants to tensors to perform operations with focal_length_values
+    sensor_width = torch.tensor(sensor_width, device=device)
+    sensor_height = torch.tensor(sensor_height, device=device)
+    base_focal_length = torch.tensor(base_focal_length, device=device)
+    # Calculate the FOV for the base focal length (min_focal_length)
+    base_fov_x = 2.0 * torch.atan(sensor_width * 0.5 / base_focal_length)
+    base_fov_y = 2.0 * torch.atan(sensor_height * 0.5 / base_focal_length)
+    # Calculate the FOV for each focal length in focal_length_values
+    target_fov_x = 2.0 * torch.atan(sensor_width * 0.5 / focal_length_values)
+    target_fov_y = 2.0 * torch.atan(sensor_height * 0.5 / focal_length_values)
+    # Calculate crop ratio: how much of the image is cropped at the current focal length
+    crop_ratio_xs = target_fov_x / base_fov_x  # Crop ratio for horizontal axis
+    crop_ratio_ys = target_fov_y / base_fov_y  # Crop ratio for vertical axis
+    # Get the center of the image
+    center_h, center_w = target_height // 2, target_width // 2
+    # Initialize a mask tensor with zeros on CPU
+    focal_length_embedding = torch.zeros((f, 3, target_height, target_width), dtype=torch.float32)  # Shape [f, 3, H, W]
+    # Fill the center region with 1 based on the calculated crop dimensions
+    for i in range(f):
+        # Crop dimensions calculated using rounded float values
+        crop_h = torch.round(crop_ratio_ys[i] * target_height).int().item()  # Rounded cropped height for the current frame
+        crop_w = torch.round(crop_ratio_xs[i] * target_width).int().item()  # Rounded cropped width for the current frame
+        # Ensure the cropped dimensions are within valid bounds
+        crop_h = max(1, min(target_height, crop_h))
+        crop_w = max(1, min(target_width, crop_w))
+        # Set the center region of the focal_length embedding to 1 for the current frame
+        focal_length_embedding[i, :,
+        center_h - crop_h // 2: center_h + crop_h // 2,
+        center_w - crop_w // 2: center_w + crop_w // 2] = 1.0
+    return focal_length_embedding
+class CameraFocalLength(Dataset):
+    def __init__(
+            self,
+            root_path,
+            annotation_json,
+            sample_n_frames=5,
+            sample_size=[256, 384],
+            is_Train=True,
+    ):
+        self.root_path = root_path
+        self.sample_n_frames = sample_n_frames
+        self.dataset = json.load(open(os.path.join(root_path, annotation_json), 'r'))
+        self.length = len(self.dataset)
+        sample_size = tuple(sample_size) if not isinstance(sample_size, int) else (sample_size, sample_size)
+        self.sample_size = sample_size
+        pixel_transforms = [transforms.Resize(sample_size),
+                            transforms.Normalize(mean=[0.5, 0.5, 0.5], std=[0.5, 0.5, 0.5], inplace=True)]
+        self.pixel_transforms = pixel_transforms
+        self.is_Train = is_Train
+        self.tokenizer = CLIPTokenizer.from_pretrained("/home/yuan418/data/project/stable-diffusion-v1-5/", subfolder="tokenizer")
+        self.text_encoder = CLIPTextModel.from_pretrained("/home/yuan418/data/project/stable-diffusion-v1-5/", subfolder="text_encoder")
+    def load_image_reader(self, idx):
+        image_dict = self.dataset[idx]
+        image_path = os.path.join(self.root_path, image_dict['base_image_path'])
+        image_reader = cv2.imread(image_path)
+        image_caption = image_dict['caption']
+        if self.is_Train:
+            focal_length_values = [random.uniform(24.0, 70.0) for _ in range(self.sample_n_frames)]
+            print('train focal_length_values', focal_length_values)
+        else:
+            focal_length_list_str = image_dict['focal_length_list']
+            focal_length_values = json.loads(focal_length_list_str)
+            print('validation focal_length_values', focal_length_values)
+        focal_length_values = torch.tensor(focal_length_values).unsqueeze(1)
+        return image_path, image_reader, image_caption, focal_length_values
+    def get_batch(self, idx):
+        image_path, image_reader, image_caption, focal_length_values = self.load_image_reader(idx)
+        total_frames = len(focal_length_values)
+        if total_frames < 3:
+            raise ValueError("less than 3 frames")
+        # Generate prompts for each fl value and append fl information to caption
+        prompts = []
+        for fl in focal_length_values:
+            prompt = f"<focal length: {fl.item()}>"
+            prompts.append(prompt)
+        # Tokenize prompts and encode to get embeddings
+        with torch.no_grad():
+            prompt_ids = self.tokenizer(
+                prompts, max_length=self.tokenizer.model_max_length, padding="max_length", truncation=True, return_tensors="pt"
+            ).input_ids
+            encoder_hidden_states = self.text_encoder(input_ids=prompt_ids).last_hidden_state  # Shape: (f, sequence_length, hidden_size)
+        # print('encoder_hidden_states shape', encoder_hidden_states.shape)
+        # Calculate differences between consecutive embeddings (ignoring sequence_length)
+        differences = []
+        for i in range(1, encoder_hidden_states.size(0)):
+            diff = encoder_hidden_states[i] - encoder_hidden_states[i - 1]
+            diff = diff.unsqueeze(0)
+            differences.append(diff)
+        # Add the difference between the last and the first embedding
+        final_diff = encoder_hidden_states[-1] - encoder_hidden_states[0]
+        final_diff = final_diff.unsqueeze(0)
+        differences.append(final_diff)
+        # Concatenate differences along the batch dimension (f-1)
+        concatenated_differences = torch.cat(differences, dim=0)
+        # print('concatenated_differences shape', concatenated_differences.shape) # f 77 768
+        frame = concatenated_differences.size(0)
+        # Concatenate differences along the batch dimension (f)
+        concatenated_differences = torch.cat(differences, dim=0)
+        # Current shape: (f, 77, 768), Pad the second dimension (77) to 128
+        pad_length = 128 - concatenated_differences.size(1)
+        if pad_length > 0:
+        # Pad along the second dimension (77 -> 128), pad only on the right side
+            concatenated_differences_padded = F.pad(concatenated_differences, (0, 0, 0, pad_length))
+        ## CCL = constrative camera learning
+        ccl_embedding = concatenated_differences_padded.reshape(frame, self.sample_size[0], self.sample_size[1])
+        ccl_embedding = ccl_embedding.unsqueeze(1)
+        ccl_embedding = ccl_embedding.expand(-1, 3, -1, -1)
+        # print('ccl_embedding shape', ccl_embedding.shape)
+        pixel_values = []
+        for ff in focal_length_values:
+            img_sim = crop_focal_length(image_path=image_path, base_focal_length=24.0, target_focal_length=ff, target_height=self.sample_size[0], target_width=self.sample_size[1], sensor_height=24.0, sensor_width=36.0)
+            pixel_values.append(img_sim)
+            # save_path = os.path.join(self.root_path, f"simulated_img_focal_length_{fl.item():.2f}.png")
+            # cv2.imwrite(save_path, img_sim)
+            # print(f"Saved image: {save_path}")
+        pixel_values = np.stack(pixel_values, axis=0)
+        pixel_values = torch.from_numpy(pixel_values).permute(0, 3, 1, 2).contiguous() / 255.
+        focal_length_embedding = create_focal_length_embedding(focal_length_values, base_focal_length=24.0, target_height=self.sample_size[0], target_width=self.sample_size[1])
+        # print('focal_length_embedding shape', focal_length_embedding.shape)
+        camera_embedding = torch.cat((focal_length_embedding, ccl_embedding), dim=1)
+        # print('camera_embedding shape', camera_embedding.shape)
+        return pixel_values, image_caption, camera_embedding, focal_length_values
+    def __len__(self):
+        return self.length
+    def __getitem__(self, idx):
+        while True:
+            try:
+                video, video_caption, camera_embedding, focal_length_values = self.get_batch(idx)
+                break
+            except Exception as e:
+                idx = random.randint(0, self.length - 1)
+        for transform in self.pixel_transforms:
+            video = transform(video)
+        sample = dict(pixel_values=video, text=video_caption, camera_embedding=camera_embedding, focal_length_values=focal_length_values)
+        return sample
+#### for color temperature  ####
+def kelvin_to_rgb(kelvin):
+    temp = kelvin / 100.0
+    if temp <= 66:
+        red = 255
+        green = 99.4708025861 * np.log(temp) - 161.1195681661 if temp > 0 else 0
+        if temp <= 19:
+            blue = 0
+        else:
+            blue = 138.5177312231 * np.log(temp - 10) - 305.0447927307
+    elif 66<temp<=88:
+        red = 0.5 * (255 + 329.698727446 * ((temp - 60) ** -0.19332047592))
+        green = 0.5 * (288.1221695283 * ((temp - 60) ** -0.1155148492) + (99.4708025861 * np.log(temp) - 161.1195681661 if temp > 0 else 0))
+        blue = 0.5 * (138.5177312231 * np.log(temp - 10) - 305.0447927307 + 255)
+    else:
+        red = 329.698727446 * ((temp - 60) ** -0.19332047592)
+        green = 288.1221695283 * ((temp - 60) ** -0.1155148492)
+        blue = 255
+    return np.array([red, green, blue], dtype=np.float32) / 255.0
+def create_color_temperature_embedding(color_temperature_values, target_height, target_width, min_color_temperature=2000, max_color_temperature=10000):
+    """
+    Create an color_temperature embedding tensor based on color temperature.
+    Args:
+    - color_temperature_values: Tensor of shape [f, 1] containing color_temperature values for each frame.
+    - target_height: Height of the image.
+    - target_width: Width of the image.
+    - min_color_temperature: Minimum color_temperature value for normalization.
+    - max_color_temperature: Maximum color_temperature value for normalization.
+    Returns:
+    - color_temperature_embedding: Tensor of shape [f, 3, target_height, target_width] for RGB channel scaling.
+    """
+    f = color_temperature_values.shape[0]
+    rgb_factors = []
+    # Compute RGB factors based on kelvin_to_rgb function
+    for ct in color_temperature_values.squeeze():
+        kelvin = min_color_temperature + (ct * (max_color_temperature - min_color_temperature))  # Map normalized color_temperature to actual Kelvin
+        rgb = kelvin_to_rgb(kelvin)
+        rgb_factors.append(rgb)
+    # Convert to tensor and expand to target dimensions
+    rgb_factors = torch.tensor(rgb_factors).float()  # [f, 3]
+    rgb_factors = rgb_factors.unsqueeze(2).unsqueeze(3)  # [f, 3, 1, 1]
+    color_temperature_embedding = rgb_factors.expand(f, 3, target_height, target_width)  # [f, 3, target_height, target_width]
+    return color_temperature_embedding
+def kelvin_to_rgb_smooth(kelvin):
+    temp = kelvin / 100.0
+    if temp <= 66:
+        red = 255
+        green = 99.4708025861 * np.log(temp) - 161.1195681661 if temp > 0 else 0
+        if temp <= 19:
+            blue = 0
+        else:
+            blue = 138.5177312231 * np.log(temp - 10) - 305.0447927307
+    elif 66<temp<=88:
+        red = 0.5 * (255 + 329.698727446 * ((temp - 60) ** -0.19332047592))
+        green = 0.5 * (288.1221695283 * ((temp - 60) ** -0.1155148492) + (99.4708025861 * np.log(temp) - 161.1195681661 if temp > 0 else 0))
+        blue = 0.5 * (138.5177312231 * np.log(temp - 10) - 305.0447927307 + 255)
+    else:
+        red = 329.698727446 * ((temp - 60) ** -0.19332047592)
+        green = 288.1221695283 * ((temp - 60) ** -0.1155148492)
+        blue = 255
+    red = np.clip(red, 0, 255)
+    green = np.clip(green, 0, 255)
+    blue = np.clip(blue, 0, 255)
+    balance_rgb = np.array([red, green, blue], dtype=np.float32)
+    return balance_rgb
+def interpolate_white_balance(image, kelvin):
+    balance_rgb = kelvin_to_rgb_smooth(kelvin.item())
+    image = image.astype(np.float32)
+    r, g, b = cv2.split(image)
+    r = r * (balance_rgb[0] / 255.0)
+    g = g * (balance_rgb[1] / 255.0)
+    b = b * (balance_rgb[2] / 255.0)
+    balanced_image = cv2.merge([r,g,b])
+    balanced_image = np.clip(balanced_image, 0, 255).astype(np.uint8)
+    return balanced_image
+class CameraColorTemperature(Dataset):
+    def __init__(
+            self,
+            root_path,
+            annotation_json,
+            sample_n_frames=5,
+            sample_size=[256, 384],
+            is_Train=True,
+    ):
+        self.root_path = root_path
+        self.sample_n_frames = sample_n_frames
+        self.dataset = json.load(open(os.path.join(root_path, annotation_json), 'r'))
+        self.length = len(self.dataset)
+        self.is_Train = is_Train
+        sample_size = tuple(sample_size) if not isinstance(sample_size, int) else (sample_size, sample_size)
+        self.sample_size = sample_size
+        pixel_transforms = [transforms.Resize(sample_size),
+                                transforms.Normalize(mean=[0.5, 0.5, 0.5], std=[0.5, 0.5, 0.5], inplace=True)]
+        self.pixel_transforms = pixel_transforms
+        self.tokenizer = CLIPTokenizer.from_pretrained("/home/yuan418/data/project/stable-diffusion-v1-5/", subfolder="tokenizer")
+        self.text_encoder = CLIPTextModel.from_pretrained("/home/yuan418/data/project/stable-diffusion-v1-5/", subfolder="text_encoder")
+    def load_image_reader(self, idx):
+        image_dict = self.dataset[idx]
+        image_path = os.path.join(self.root_path, image_dict['base_image_path'])
+        image_reader = cv2.imread(image_path)
+        image_reader = cv2.cvtColor(image_reader, cv2.COLOR_BGR2RGB)
+        image_caption = image_dict['caption']
+        if self.is_Train:
+            color_temperature_values = [random.uniform(2000.0, 10000.0) for _ in range(self.sample_n_frames)]
+            print('train color_temperature values', color_temperature_values)
+        else:
+            color_temperature_list_str = image_dict['color_temperature_list']
+            color_temperature_values = json.loads(color_temperature_list_str)
+            print('validation color_temperature_values', color_temperature_values)
+        color_temperature_values = torch.tensor(color_temperature_values).unsqueeze(1)
+        return image_path, image_reader, image_caption, color_temperature_values
+    def get_batch(self, idx):
+        image_path, image_reader, image_caption, color_temperature_values = self.load_image_reader(idx)
+        total_frames = len(color_temperature_values)
+        if total_frames < 3:
+            raise ValueError("less than 3 frames")
+        # Generate prompts for each color_temperature value and append color_temperature information to caption
+        prompts = []
+        for cc in color_temperature_values:
+            prompt = f"<color temperature: {cc.item()}>"
+            prompts.append(prompt)
+        # Tokenize prompts and encode to get embeddings
+        with torch.no_grad():
+            prompt_ids = self.tokenizer(
+                prompts, max_length=self.tokenizer.model_max_length, padding="max_length", truncation=True, return_tensors="pt"
+            ).input_ids
+            # print('tokenizer model_max_length', self.tokenizer.model_max_length)
+            encoder_hidden_states = self.text_encoder(input_ids=prompt_ids).last_hidden_state  # Shape: (f, sequence_length, hidden_size)
+        # print('encoder_hidden_states shape', encoder_hidden_states.shape)
+        # Calculate differences between consecutive embeddings (ignoring sequence_length)
+        differences = []
+        for i in range(1, encoder_hidden_states.size(0)):
+            diff = encoder_hidden_states[i] - encoder_hidden_states[i - 1]
+            diff = diff.unsqueeze(0)
+            differences.append(diff)
+        # Add the difference between the last and the first embedding
+        final_diff = encoder_hidden_states[-1] - encoder_hidden_states[0]
+        final_diff = final_diff.unsqueeze(0)
+        differences.append(final_diff)
+        # Concatenate differences along the batch dimension (f-1)
+        concatenated_differences = torch.cat(differences, dim=0)
+        # print('concatenated_differences shape', concatenated_differences.shape) # f 77 768
+        frame = concatenated_differences.size(0)
+        concatenated_differences = torch.cat(differences, dim=0)
+        # Current shape: (f, 77, 768), Pad the second dimension (77) to 128
+        pad_length = 128 - concatenated_differences.size(1)
+        if pad_length > 0:
+        # Pad along the second dimension (77 -> 128), pad only on the right side
+            concatenated_differences_padded = F.pad(concatenated_differences, (0, 0, 0, pad_length))
+        ccl_embedding = concatenated_differences_padded.reshape(frame, self.sample_size[0], self.sample_size[1])
+        ccl_embedding = ccl_embedding.unsqueeze(1)
+        ccl_embedding = ccl_embedding.expand(-1, 3, -1, -1)
+        # print('ccl_embedding shape', ccl_embedding.shape)
+        # Now handle the sensor image simulation
+        pixel_values = []
+        for aw in color_temperature_values:
+            img_sim = interpolate_white_balance(image_reader, aw)
+            pixel_values.append(img_sim)
+        pixel_values = np.stack(pixel_values, axis=0)
+        pixel_values = torch.from_numpy(pixel_values).permute(0, 3, 1, 2).contiguous() / 255.
+        # Create color_temperature embedding and concatenate it with CCL embedding
+        color_temperature_embedding = create_color_temperature_embedding(color_temperature_values, self.sample_size[0], self.sample_size[1])
+        # print('color_temperature_embedding shape', color_temperature_embedding.shape)
+        camera_embedding = torch.cat((color_temperature_embedding, ccl_embedding), dim=1)
+        # print('camera_embedding shape', camera_embedding.shape)
+        return pixel_values, image_caption, camera_embedding, color_temperature_values
+    def __len__(self):
+        return self.length
+    def __getitem__(self, idx):
+        while True:
+            try:
+                video, video_caption, camera_embedding, color_temperature_values = self.get_batch(idx)
+                break
+            except Exception as e:
+                idx = random.randint(0, self.length - 1)
+        for transform in self.pixel_transforms:
+            video = transform(video)
+        sample = dict(pixel_values=video, text=video_caption, camera_embedding=camera_embedding, color_temperature_values=color_temperature_values)
+        return sample
+#### for bokeh (K is the blur parameter) ####
+def create_bokehK_embedding(bokehK_values, target_height, target_width):
+    """
+    Creates a Bokeh embedding based on the given K values. The larger the K value,
+    the more the image is blurred.
+    Args:
+        bokehK_values (torch.Tensor): Tensor of K values for bokeh effect.
+        target_height (int): Desired height of the output embedding.
+        target_width (int): Desired width of the output embedding.
+        base_K (float): Base K value to control the minimum blur level.
+    Returns:
+        torch.Tensor: Bokeh embedding tensor. [f 3 h w]
+    """
+    f = bokehK_values.shape[0]
+    bokehK_embedding = torch.zeros((f, 3, target_height, target_width), dtype=bokehK_values.dtype)
+    for i in range(f):
+        K_value = bokehK_values[i].item()
+        kernel_size = max(K_value, 1)
+        sigma = K_value / 3.0
+        ax = np.linspace(-(kernel_size / 2), kernel_size / 2, int(np.ceil(kernel_size)))
+        xx, yy = np.meshgrid(ax, ax)
+        kernel = np.exp(-(xx ** 2 + yy ** 2) / (2 * sigma ** 2))
+        kernel /= np.sum(kernel)
+        scale = kernel[int(np.ceil(kernel_size) / 2), int(np.ceil(kernel_size) / 2)]
+        bokehK_embedding[i] = scale
+    return bokehK_embedding
+def bokehK_simulation(image_path, depth_map_path, K, disp_focus, gamma=2.2):
+    ## depth map image can be inferenced online using following code ##
+    #  model_dir = "/home/modules/"
+    #  pipe = pipeline(
+    #            task="depth-estimation",
+    #           model="depth-anything/Depth-Anything-V2-Small-hf",
+    #          cache_dir=model_dir,
+    #            device=0
+    #         )
+    # image_raw = Image.open(image_path)
+    # disp = pipe(image_raw)["depth"]
+    # base_name = os.path.basename(image_path)
+    # file_name, ext = os.path.splitext(base_name)
+    # disp_file_name = f"{file_name}_disp.png"
+    # disp.save(disp_file_name)
+    # disp = np.array(disp)
+    # disp = disp.astype(np.float32)
+    # disp /= 255.0
+    disp = np.float32(cv2.imread(depth_map_path, cv2.IMREAD_GRAYSCALE))
+    disp /= 255.0
+    disp = (disp - disp.min()) / (disp.max() - disp.min())
+    min_disp = np.min(disp)
+    max_disp = np.max(disp)
+    device = torch.device('cuda')
+    # Initialize renderer
+    classical_renderer = ModuleRenderScatter().to(device)
+    # Load image and disparity
+    image = cv2.imread(image_path).astype(np.float32) / 255.0
+    image = cv2.cvtColor(image, cv2.COLOR_BGR2RGB)
+    # Calculate defocus
+    defocus = K * (disp - disp_focus) / 10.0
+    # Convert to tensors and move to GPU if available
+    image = torch.from_numpy(image).permute(2, 0, 1).unsqueeze(0).to(device)
+    defocus = defocus.unsqueeze(0).unsqueeze(0).to(device)
+    bokeh_classical, defocus_dilate = classical_renderer(image**gamma, defocus*10.0)
+    bokeh_pred = bokeh_classical ** (1/gamma)
+    bokeh_pred = bokeh_pred.squeeze(0)
+    bokeh_pred = bokeh_pred.permute(1, 2, 0)  # remove batch dim and change channle order
+    bokeh_pred = (bokeh_pred * 255).cpu().numpy()
+    bokeh_pred = np.round(bokeh_pred)
+    bokeh_pred = bokeh_pred.astype(np.float32)
+    return bokeh_pred
+class CameraBokehK(Dataset):
+    def __init__(
+            self,
+            root_path,
+            annotation_json,
+            sample_n_frames=5,
+            sample_size=[256, 384],
+            is_Train=True,
+    ):
+        self.root_path = root_path
+        self.sample_n_frames = sample_n_frames
+        self.dataset = json.load(open(os.path.join(root_path, annotation_json), 'r'))
+        self.length = len(self.dataset)
+        self.is_Train = is_Train
+        sample_size = tuple(sample_size) if not isinstance(sample_size, int) else (sample_size, sample_size)
+        self.sample_size = sample_size
+        pixel_transforms = [transforms.Resize(sample_size),
+                                transforms.Normalize(mean=[0.5, 0.5, 0.5], std=[0.5, 0.5, 0.5], inplace=True)]
+        self.pixel_transforms = pixel_transforms
+        self.tokenizer = CLIPTokenizer.from_pretrained("/home/yuan418/data/project/stable-diffusion-v1-5/", subfolder="tokenizer")
+        self.text_encoder = CLIPTextModel.from_pretrained("/home/yuan418/data/project/stable-diffusion-v1-5/", subfolder="text_encoder")
+    def load_image_reader(self, idx):
+        image_dict = self.dataset[idx]
+        image_path = os.path.join(self.root_path, image_dict['base_image_path'])
+        depth_map_path = os.path.join(self.root_path, image_dict['depth_map_path'])
+        image_caption = image_dict['caption']
+        if self.is_Train:
+            bokehK_values = [random.uniform(1.0, 30.0) for _ in range(self.sample_n_frames)]
+            print('train bokehK values', bokehK_values)
+        else:
+            bokehK_list_str = image_dict['bokehK_list']
+            bokehK_values = json.loads(bokehK_list_str)
+            print('validation bokehK_values', bokehK_values)
+        bokehK_values = torch.tensor(bokehK_values).unsqueeze(1)
+        return image_path, depth_map_path, image_caption, bokehK_values
+    def get_batch(self, idx):
+        image_path, depth_map_path, image_caption, bokehK_values = self.load_image_reader(idx)
+        total_frames = len(bokehK_values)
+        if total_frames < 3:
+            raise ValueError("less than 3 frames")
+        # Generate prompts for each bokehK value and append bokehK information to caption
+        prompts = []
+        for bb in bokehK_values:
+            prompt = f"<bokeh kernel size: {bb.item()}>"
+            prompts.append(prompt)
+        # Tokenize prompts and encode to get embeddings
+        with torch.no_grad():
+            prompt_ids = self.tokenizer(
+                prompts, max_length=self.tokenizer.model_max_length, padding="max_length", truncation=True, return_tensors="pt"
+            ).input_ids
+        # print('tokenizer model_max_length', self.tokenizer.model_max_length)
+            encoder_hidden_states = self.text_encoder(input_ids=prompt_ids).last_hidden_state  # Shape: (f, sequence_length, hidden_size)
+        # print('encoder_hidden_states shape', encoder_hidden_states.shape)
+        # Calculate differences between consecutive embeddings (ignoring sequence_length)
+        differences = []
+        for i in range(1, encoder_hidden_states.size(0)):
+            diff = encoder_hidden_states[i] - encoder_hidden_states[i - 1]
+            diff = diff.unsqueeze(0)
+            differences.append(diff)
+        # Add the difference between the last and the first embedding
+        final_diff = encoder_hidden_states[-1] - encoder_hidden_states[0]
+        final_diff = final_diff.unsqueeze(0)
+        differences.append(final_diff)
+        # Concatenate differences along the batch dimension (f-1)
+        concatenated_differences = torch.cat(differences, dim=0)
+        # print('concatenated_differences shape', concatenated_differences.shape) # f 77 768
+        frame = concatenated_differences.size(0)
+        # Concatenate differences along the batch dimension (f)
+        concatenated_differences = torch.cat(differences, dim=0)
+        # Current shape: (f, 77, 768), Pad the second dimension (77) to 128
+        pad_length = 128 - concatenated_differences.size(1)
+        if pad_length > 0:
+        # Pad along the second dimension (77 -> 128), pad only on the right side
+            concatenated_differences_padded = F.pad(concatenated_differences, (0, 0, 0, pad_length))
+        ## ccl = contrastive camera learning ##
+        ccl_embedding = concatenated_differences_padded.reshape(frame, self.sample_size[0], self.sample_size[1])
+        ccl_embedding = ccl_embedding.unsqueeze(1)
+        ccl_embedding = ccl_embedding.expand(-1, 3, -1, -1)
+        # print('ccl_embedding shape', ccl_embedding.shape)
+        pixel_values = []
+        for bk in bokehK_values:
+            img_sim = bokehK_simulation(image_path, depth_map_path, bk, disp_focus=0.96, gamma=2.2)
+            # save_path = os.path.join(self.root_path, f"simulated_img_bokeh_{bk.item():.2f}.png")
+            # cv2.imwrite(save_path, img_sim)
+            # print(f"Saved image: {save_path}")
+            pixel_values.append(img_sim)
+        pixel_values = np.stack(pixel_values, axis=0)
+        pixel_values = torch.from_numpy(pixel_values).permute(0, 3, 1, 2).contiguous() / 255.
+        # Create bokehK embedding and concatenate it with CCL embedding
+        bokehK_embedding = create_bokehK_embedding(bokehK_values, self.sample_size[0], self.sample_size[1])
+        camera_embedding = torch.cat((bokehK_embedding, ccl_embedding), dim=1)
+        # print('camera_embedding shape', camera_embedding.shape)
+        return pixel_values, image_caption, camera_embedding, bokehK_values
+    def __len__(self):
+        return self.length
+    def __getitem__(self, idx):
+        while True:
+            try:
+                video, video_caption, camera_embedding, bokehK_values = self.get_batch(idx)
+                break
+            except Exception as e:
+                idx = random.randint(0, self.length - 1)
+        for transform in self.pixel_transforms:
+            video = transform(video)
+        sample = dict(pixel_values=video, text=video_caption, camera_embedding=camera_embedding, bokehK_values=bokehK_values)
+        return sample
+def test_camera_bokehK_dataset():
+    root_path = '/home/yuan418/data/project/camera_dataset/camera_bokehK/'
+    annotation_json = 'annotations/inference.json'
+    print('------------------')
+    dataset = CameraBokehK(
+       root_path=root_path,
+       annotation_json=annotation_json,
+       sample_n_frames=4,
+       sample_size=[256, 384],
+       is_Train=False,
+     )
+    # choose one sample for testing
+    idx = 1
+    sample = dataset[idx]
+    pixel_values = sample['pixel_values']
+    text = sample['text']
+    camera_embedding = sample['camera_embedding']
+    print(f"Pixel values shape: {pixel_values.shape}")
+    print(f"Text: {text}")
+    print(f"camera embedding shape: {camera_embedding.shape}")
+# Run the bokeh dataset smoke test only when this module is executed as a script.
+if __name__ == "__main__":
+    test_camera_bokehK_dataset()

genphoto/models/attention.py CHANGED Viewed

@@ -1,3 +1,136 @@
-version https://git-lfs.github.com/spec/v1
-oid sha256:82630247828d56f38b979a4a7b9bc12290ada3a1ce5be1d6153d07dbe4baaaa0
-size 5313

+# Adapted from https://github.com/huggingface/diffusers/blob/main/src/diffusers/models/attention.py
+from dataclasses import dataclass
+from typing import Optional
+import torch
+from torch import nn
+from diffusers.configuration_utils import ConfigMixin, register_to_config
+from diffusers.models.modeling_utils import ModelMixin
+from diffusers.utils import BaseOutput
+from diffusers.models.attention import BasicTransformerBlock
+from einops import rearrange, repeat
+@dataclass
+class Transformer3DModelOutput(BaseOutput):
+    """Return type of Transformer3DModel.forward when return_dict is True."""
+    # Output hidden states, laid out as (batch, channels, frames, height, width)
+    sample: torch.FloatTensor
+class Transformer3DModel(ModelMixin, ConfigMixin):
+    @register_to_config
+    def __init__(
+            self,
+            num_attention_heads: int = 16,
+            attention_head_dim: int = 88,
+            in_channels: Optional[int] = None,
+            num_layers: int = 1,
+            dropout: float = 0.0,
+            norm_num_groups: int = 32,
+            cross_attention_dim: Optional[int] = None,
+            attention_bias: bool = False,
+            activation_fn: str = "geglu",
+            num_embeds_ada_norm: Optional[int] = None,
+            use_linear_projection: bool = False,
+            only_cross_attention: bool = False,
+            upcast_attention: bool = False,
+            norm_type: str = "layer_norm",
+            norm_elementwise_affine: bool = True,
+    ):
+        super().__init__()
+        self.use_linear_projection = use_linear_projection
+        self.num_attention_heads = num_attention_heads
+        self.attention_head_dim = attention_head_dim
+        inner_dim = num_attention_heads * attention_head_dim
+        # Define input layers
+        self.in_channels = in_channels
+        self.norm = torch.nn.GroupNorm(num_groups=norm_num_groups, num_channels=in_channels, eps=1e-6, affine=True)
+        if use_linear_projection:
+            self.proj_in = nn.Linear(in_channels, inner_dim)
+        else:
+            self.proj_in = nn.Conv2d(in_channels, inner_dim, kernel_size=1, stride=1, padding=0)
+        # Define transformers blocks
+        self.transformer_blocks = nn.ModuleList(
+            [
+                BasicTransformerBlock(
+                    inner_dim,
+                    num_attention_heads,
+                    attention_head_dim,
+                    dropout=dropout,
+                    cross_attention_dim=cross_attention_dim,
+                    activation_fn=activation_fn,
+                    num_embeds_ada_norm=num_embeds_ada_norm,
+                    attention_bias=attention_bias,
+                    only_cross_attention=only_cross_attention,
+                    upcast_attention=upcast_attention,
+                    norm_type=norm_type,
+                    norm_elementwise_affine=norm_elementwise_affine,
+                )
+                for d in range(num_layers)
+            ]
+        )
+        # 4. Define output layers
+        if use_linear_projection:
+            self.proj_out = nn.Linear(in_channels, inner_dim)
+        else:
+            self.proj_out = nn.Conv2d(inner_dim, in_channels, kernel_size=1, stride=1, padding=0)
+    def forward(self, hidden_states, encoder_hidden_states=None, timestep=None, return_dict: bool = True):
+        # Input
+        assert hidden_states.dim() == 5, f"Expected hidden_states to have ndim=5, but got ndim={hidden_states.dim()}."
+        batch_size, _, video_length = hidden_states.shape[:3]
+        hidden_states = rearrange(hidden_states, "b c f h w -> (b f) c h w")
+        if encoder_hidden_states.shape[0] == batch_size:
+            encoder_hidden_states = repeat(encoder_hidden_states, 'b n c -> (b f) n c', f=video_length)
+        elif encoder_hidden_states.shape[0] == batch_size * video_length:
+            pass
+        else:
+            raise ValueError
+        batch, channel, height, weight = hidden_states.shape
+        residual = hidden_states
+        hidden_states = self.norm(hidden_states)
+        if not self.use_linear_projection:
+            hidden_states = self.proj_in(hidden_states)
+            inner_dim = hidden_states.shape[1]
+            hidden_states = hidden_states.permute(0, 2, 3, 1).reshape(batch, height * weight, inner_dim)
+        else:
+            inner_dim = hidden_states.shape[1]
+            hidden_states = hidden_states.permute(0, 2, 3, 1).reshape(batch, height * weight, inner_dim)
+            hidden_states = self.proj_in(hidden_states)
+        # Blocks
+        for block in self.transformer_blocks:
+            hidden_states = block(
+                hidden_states,
+                encoder_hidden_states=encoder_hidden_states,
+                timestep=timestep,
+            )
+        # Output
+        if not self.use_linear_projection:
+            hidden_states = (
+                hidden_states.reshape(batch, height, weight, inner_dim).permute(0, 3, 1, 2).contiguous()
+            )
+            hidden_states = self.proj_out(hidden_states)
+        else:
+            hidden_states = self.proj_out(hidden_states)
+            hidden_states = (
+                hidden_states.reshape(batch, height, weight, inner_dim).permute(0, 3, 1, 2).contiguous()
+            )
+        output = hidden_states + residual
+        output = rearrange(output, "(b f) c h w -> b c f h w", f=video_length)
+        if not return_dict:
+            return (output,)
+        return Transformer3DModelOutput(sample=output)

genphoto/models/attention_processor.py CHANGED Viewed

@@ -1,3 +1,412 @@
-version https://git-lfs.github.com/spec/v1
-oid sha256:1fc36c35808aed64eb238e3dba643b51961992388dd76d945dec36760ab87557
-size 16681

+import torch
+import torch.nn as nn
+import torch.nn.init as init
+import logging
+from diffusers.models.lora import LoRALinearLayer
+from diffusers.models.attention import Attention
+from diffusers.utils import USE_PEFT_BACKEND
+from typing import Optional
+from einops import rearrange
+logger = logging.getLogger(__name__)
+class AttnProcessor:
+    r"""
+    Default processor for performing attention-related computations.
+    The processor holds no state of its own: every projection, norm and reshaping helper
+    it uses is read from the `attn` module passed to `__call__`.
+    """
+    def __call__(
+            self,
+            attn: Attention,
+            hidden_states: torch.FloatTensor,
+            encoder_hidden_states: Optional[torch.FloatTensor] = None,
+            attention_mask: Optional[torch.FloatTensor] = None,
+            temb: Optional[torch.FloatTensor] = None,
+            scale: float = 1.0,
+            camera_feature=None
+    ) -> torch.Tensor:
+        # Keys/values are projected from `encoder_hidden_states` when it is given,
+        # otherwise from `hidden_states` itself.
+        # `camera_feature` is accepted but never read by this processor.
+        residual = hidden_states
+        # `scale` is forwarded positionally to the projection layers only when the PEFT backend is off.
+        args = () if USE_PEFT_BACKEND else (scale,)
+        if attn.spatial_norm is not None:
+            hidden_states = attn.spatial_norm(hidden_states, temb)
+        input_ndim = hidden_states.ndim
+        if input_ndim == 4:
+            # Flatten the two trailing spatial axes into one token axis: (B, C, H, W) -> (B, H*W, C).
+            batch_size, channel, height, width = hidden_states.shape
+            hidden_states = hidden_states.view(batch_size, channel, height * width).transpose(1, 2)
+        # Batch size and sequence length are taken from whichever tensor supplies the keys.
+        batch_size, sequence_length, _ = (
+            hidden_states.shape if encoder_hidden_states is None else encoder_hidden_states.shape
+        )
+        attention_mask = attn.prepare_attention_mask(attention_mask, sequence_length, batch_size)
+        if attn.group_norm is not None:
+            # The norm is applied with channels on axis 1, hence the transpose round-trip.
+            hidden_states = attn.group_norm(hidden_states.transpose(1, 2)).transpose(1, 2)
+        query = attn.to_q(hidden_states, *args)
+        if encoder_hidden_states is None:
+            encoder_hidden_states = hidden_states
+        elif attn.norm_cross:
+            encoder_hidden_states = attn.norm_encoder_hidden_states(encoder_hidden_states)
+        key = attn.to_k(encoder_hidden_states, *args)
+        value = attn.to_v(encoder_hidden_states, *args)
+        query = attn.head_to_batch_dim(query)
+        key = attn.head_to_batch_dim(key)
+        value = attn.head_to_batch_dim(value)
+        attention_probs = attn.get_attention_scores(query, key, attention_mask)
+        hidden_states = torch.bmm(attention_probs, value)
+        hidden_states = attn.batch_to_head_dim(hidden_states)
+        # linear proj
+        hidden_states = attn.to_out[0](hidden_states, *args)
+        # dropout
+        hidden_states = attn.to_out[1](hidden_states)
+        if input_ndim == 4:
+            # Undo the flattening done on entry so the output has the input's 4-D layout.
+            hidden_states = hidden_states.transpose(-1, -2).reshape(batch_size, channel, height, width)
+        if attn.residual_connection:
+            hidden_states = hidden_states + residual
+        hidden_states = hidden_states / attn.rescale_output_factor
+        return hidden_states
+class LoRAAttnProcessor(nn.Module):
+    r"""
+    Attention processor that adds a LoRA term to each of the q, k, v and output projections.
+    Every projection is computed as `attn.<proj>(x) + lora_scale * self.<proj>_lora(x)`, where the
+    LoRA layers are `LoRALinearLayer` instances owned by this processor.
+    """
+    def __init__(
+            self,
+            hidden_size=None,
+            cross_attention_dim=None,
+            rank=4,
+            network_alpha=None,
+            lora_scale=1.0,
+    ):
+        super().__init__()
+        self.rank = rank
+        self.lora_scale = lora_scale
+        # Key/value LoRA layers take `cross_attention_dim` inputs when it is set, else `hidden_size`.
+        self.to_q_lora = LoRALinearLayer(hidden_size, hidden_size, rank, network_alpha)
+        self.to_k_lora = LoRALinearLayer(cross_attention_dim or hidden_size, hidden_size, rank, network_alpha)
+        self.to_v_lora = LoRALinearLayer(cross_attention_dim or hidden_size, hidden_size, rank, network_alpha)
+        self.to_out_lora = LoRALinearLayer(hidden_size, hidden_size, rank, network_alpha)
+    def __call__(
+            self,
+            attn,
+            hidden_states,
+            encoder_hidden_states=None,
+            attention_mask=None,
+            temb=None,
+            camera_feature=None,
+            scale=None
+    ):
+        # `camera_feature` is accepted but never read by this processor.
+        # A call-time `scale` overrides the LoRA scale stored at construction; None keeps the stored one.
+        lora_scale = self.lora_scale if scale is None else scale
+        residual = hidden_states
+        if attn.spatial_norm is not None:
+            hidden_states = attn.spatial_norm(hidden_states, temb)
+        input_ndim = hidden_states.ndim
+        if input_ndim == 4:
+            # Flatten the two trailing spatial axes into one token axis: (B, C, H, W) -> (B, H*W, C).
+            batch_size, channel, height, width = hidden_states.shape
+            hidden_states = hidden_states.view(batch_size, channel, height * width).transpose(1, 2)
+        # Batch size and sequence length are taken from whichever tensor supplies the keys.
+        batch_size, sequence_length, _ = (
+            hidden_states.shape if encoder_hidden_states is None else encoder_hidden_states.shape
+        )
+        attention_mask = attn.prepare_attention_mask(attention_mask, sequence_length, batch_size)
+        if attn.group_norm is not None:
+            hidden_states = attn.group_norm(hidden_states.transpose(1, 2)).transpose(1, 2)
+        query = attn.to_q(hidden_states) + lora_scale * self.to_q_lora(hidden_states)
+        if encoder_hidden_states is None:
+            encoder_hidden_states = hidden_states
+        elif attn.norm_cross:
+            encoder_hidden_states = attn.norm_encoder_hidden_states(encoder_hidden_states)
+        key = attn.to_k(encoder_hidden_states) + lora_scale * self.to_k_lora(encoder_hidden_states)
+        value = attn.to_v(encoder_hidden_states) + lora_scale * self.to_v_lora(encoder_hidden_states)
+        query = attn.head_to_batch_dim(query)
+        key = attn.head_to_batch_dim(key)
+        value = attn.head_to_batch_dim(value)
+        attention_probs = attn.get_attention_scores(query, key, attention_mask)
+        hidden_states = torch.bmm(attention_probs, value)
+        hidden_states = attn.batch_to_head_dim(hidden_states)
+        # linear proj
+        hidden_states = attn.to_out[0](hidden_states) + lora_scale * self.to_out_lora(hidden_states)
+        # dropout
+        hidden_states = attn.to_out[1](hidden_states)
+        if input_ndim == 4:
+            # Undo the flattening done on entry so the output has the input's 4-D layout.
+            hidden_states = hidden_states.transpose(-1, -2).reshape(batch_size, channel, height, width)
+        if attn.residual_connection:
+            hidden_states = hidden_states + residual
+        hidden_states = hidden_states / attn.rescale_output_factor
+        return hidden_states
+class CameraAdaptorAttnProcessor(nn.Module):
+    """
+    Attention processor that mixes a camera feature into the query and/or key-value inputs.
+    The camera feature is added to the attention input, passed through a zero-initialised
+    linear "merge" layer, multiplied by a scale and added back to the input before the q/k/v
+    projections of the `attn` module run. Exactly one merge layer is created, selected by the
+    two flags: `qkv_merge` (both set), `q_merge` (`query_condition` only), otherwise `kv_merge`.
+    """
+    def __init__(self,
+                 hidden_size,  # dimension of hidden state
+                 camera_feature_dim=None,  # dimension of the camera feature
+                 cross_attention_dim=None,  # dimension of the text embedding
+                 query_condition=False,
+                 key_value_condition=False,
+                 scale=1.0):
+        super().__init__()
+        self.hidden_size = hidden_size
+        self.camera_feature_dim = camera_feature_dim
+        self.cross_attention_dim = cross_attention_dim
+        self.scale = scale
+        self.query_condition = query_condition
+        self.key_value_condition = key_value_condition
+        # The camera feature is added element-wise to the hidden states, so the widths must match.
+        assert hidden_size == camera_feature_dim
+        if self.query_condition and self.key_value_condition:
+            merge_name = 'qkv_merge'
+        elif self.query_condition:
+            merge_name = 'q_merge'
+        else:
+            merge_name = 'kv_merge'
+        # Zero weight and bias make the merge branch output exactly zero at initialisation.
+        merge = nn.Linear(hidden_size, hidden_size)
+        init.zeros_(merge.weight)
+        init.zeros_(merge.bias)
+        setattr(self, merge_name, merge)
+    @staticmethod
+    def _to_tokens(tensor):
+        # Bring a 5-D (b c f h w) or 4-D (b c h w) tensor to token layout (batch, tokens, channels); 3-D passes through.
+        if tensor.ndim == 5:
+            return rearrange(tensor, 'b c f h w -> (b f) (h w) c')
+        if tensor.ndim == 4:
+            return rearrange(tensor, 'b c h w -> b (h w) c')
+        assert tensor.ndim == 3
+        return tensor
+    def forward(self,
+                attn,
+                hidden_states,
+                camera_feature,
+                encoder_hidden_states=None,
+                attention_mask=None,
+                temb=None,
+                scale=None,):
+        """
+        Run attention with the camera feature merged in.
+        :param attn: attention module supplying the projections, norms and head-reshaping helpers.
+        :param hidden_states: 3-D, 4-D or 5-D tensor; 4-D/5-D inputs are flattened to token layout.
+        :param camera_feature: camera feature, required; flattened the same way as `hidden_states`.
+        :param encoder_hidden_states: key/value source; must be None when both condition flags are set.
+        :param scale: multiplier for the merge branch; a falsy value (None or 0) falls back to `self.scale`.
+        :return: attention output in token layout (it is not reshaped back to 4-D/5-D).
+        """
+        assert camera_feature is not None
+        camera_embedding_scale = (scale or self.scale)
+        residual = hidden_states
+        if attn.spatial_norm is not None:
+            hidden_states = attn.spatial_norm(hidden_states, temb)
+        # Fix: the original tested `hidden_states.dim == 5`, which compares a bound method with an
+        # int and is always False, so 5-D inputs failed the 3-D assert instead of being flattened.
+        hidden_states = self._to_tokens(hidden_states)
+        if self.query_condition and self.key_value_condition:
+            assert encoder_hidden_states is None
+        if encoder_hidden_states is None:
+            encoder_hidden_states = hidden_states
+        encoder_hidden_states = self._to_tokens(encoder_hidden_states)
+        camera_feature = self._to_tokens(camera_feature)
+        batch_size, ehs_sequence_length, _ = encoder_hidden_states.shape
+        attention_mask = attn.prepare_attention_mask(attention_mask, ehs_sequence_length, batch_size)
+        if attn.group_norm is not None:
+            hidden_states = attn.group_norm(hidden_states.transpose(1, 2)).transpose(1, 2)
+        if attn.norm_cross:
+            encoder_hidden_states = attn.norm_encoder_hidden_states(encoder_hidden_states)
+        if self.query_condition and self.key_value_condition:  # only self attention
+            query_hidden_state = self.qkv_merge(hidden_states + camera_feature) * camera_embedding_scale + hidden_states
+            key_value_hidden_state = query_hidden_state
+        elif self.query_condition:
+            query_hidden_state = self.q_merge(hidden_states + camera_feature) * camera_embedding_scale + hidden_states
+            key_value_hidden_state = encoder_hidden_states
+        else:
+            key_value_hidden_state = self.kv_merge(encoder_hidden_states + camera_feature) * camera_embedding_scale + encoder_hidden_states
+            query_hidden_state = hidden_states
+        # original attention
+        query = attn.to_q(query_hidden_state)
+        key = attn.to_k(key_value_hidden_state)
+        value = attn.to_v(key_value_hidden_state)
+        query = attn.head_to_batch_dim(query)
+        key = attn.head_to_batch_dim(key)
+        value = attn.head_to_batch_dim(value)
+        attention_probs = attn.get_attention_scores(query, key, attention_mask)
+        hidden_states = torch.bmm(attention_probs, value)
+        hidden_states = attn.batch_to_head_dim(hidden_states)
+        # linear proj
+        hidden_states = attn.to_out[0](hidden_states)
+        # dropout
+        hidden_states = attn.to_out[1](hidden_states)
+        if attn.residual_connection:
+            hidden_states = hidden_states + residual
+        hidden_states = hidden_states / attn.rescale_output_factor
+        return hidden_states
+class LORACameraAdaptorAttnProcessor(nn.Module):
+    """
+    Camera-conditioned attention processor with LoRA terms on the q, k, v and output projections.
+    The camera feature is added to the attention input, passed through a zero-initialised linear
+    "merge" layer, multiplied by `self.scale` and added back to the input. Exactly one merge layer
+    is created, selected by the two flags: `qkv_merge` (both set), `q_merge` (`query_condition`
+    only), otherwise `kv_merge`. Each projection then adds `lora_scale * <proj>_lora(x)`.
+    """
+    def __init__(self,
+                 hidden_size,  # dimension of hidden state
+                 camera_feature_dim=None,  # dimension of the camera feature
+                 cross_attention_dim=None,  # dimension of the text embedding
+                 query_condition=False,
+                 key_value_condition=False,
+                 scale=1.0,
+                 # lora keywords
+                 rank=4,
+                 network_alpha=None,
+                 lora_scale=1.0):
+        super().__init__()
+        self.hidden_size = hidden_size
+        self.camera_feature_dim = camera_feature_dim
+        self.cross_attention_dim = cross_attention_dim
+        self.scale = scale
+        self.query_condition = query_condition
+        self.key_value_condition = key_value_condition
+        # The camera feature is added element-wise to the hidden states, so the widths must match.
+        assert hidden_size == camera_feature_dim
+        if self.query_condition and self.key_value_condition:
+            merge_name = 'qkv_merge'
+        elif self.query_condition:
+            merge_name = 'q_merge'
+        else:
+            merge_name = 'kv_merge'
+        # Zero weight and bias make the merge branch output exactly zero at initialisation.
+        merge = nn.Linear(hidden_size, hidden_size)
+        init.zeros_(merge.weight)
+        init.zeros_(merge.bias)
+        setattr(self, merge_name, merge)
+        # lora
+        self.rank = rank
+        self.lora_scale = lora_scale
+        self.to_q_lora = LoRALinearLayer(hidden_size, hidden_size, rank, network_alpha)
+        self.to_k_lora = LoRALinearLayer(cross_attention_dim or hidden_size, hidden_size, rank, network_alpha)
+        self.to_v_lora = LoRALinearLayer(cross_attention_dim or hidden_size, hidden_size, rank, network_alpha)
+        self.to_out_lora = LoRALinearLayer(hidden_size, hidden_size, rank, network_alpha)
+    @staticmethod
+    def _to_tokens(tensor):
+        # Bring a 5-D (b c f h w) or 4-D (b c h w) tensor to token layout (batch, tokens, channels); 3-D passes through.
+        if tensor.ndim == 5:
+            return rearrange(tensor, 'b c f h w -> (b f) (h w) c')
+        if tensor.ndim == 4:
+            return rearrange(tensor, 'b c h w -> b (h w) c')
+        assert tensor.ndim == 3
+        return tensor
+    def __call__(self,
+                 attn,
+                 hidden_states,
+                 encoder_hidden_states=None,
+                 attention_mask=None,
+                 temb=None,
+                 scale=1.0,
+                 camera_feature=None,
+                 ):
+        """
+        Run attention with the camera feature merged in and LoRA terms added to every projection.
+        :param attn: attention module supplying the projections, norms and head-reshaping helpers.
+        :param hidden_states: 3-D, 4-D or 5-D tensor; 4-D/5-D inputs are flattened to token layout.
+        :param encoder_hidden_states: key/value source; must be None when both condition flags are set.
+        :param scale: LoRA multiplier for this call. Because the default is 1.0, the stored
+            `self.lora_scale` is only used when the caller passes `scale=None` explicitly.
+        :param camera_feature: camera feature, required despite the None default.
+        :return: attention output in token layout (it is not reshaped back to 4-D/5-D).
+        """
+        assert camera_feature is not None
+        lora_scale = self.lora_scale if scale is None else scale
+        residual = hidden_states
+        if attn.spatial_norm is not None:
+            hidden_states = attn.spatial_norm(hidden_states, temb)
+        # Fix: the original tested `hidden_states.dim == 5`, which compares a bound method with an
+        # int and is always False, so 5-D inputs failed the 3-D assert instead of being flattened.
+        hidden_states = self._to_tokens(hidden_states)
+        if self.query_condition and self.key_value_condition:
+            assert encoder_hidden_states is None
+        if encoder_hidden_states is None:
+            encoder_hidden_states = hidden_states
+        encoder_hidden_states = self._to_tokens(encoder_hidden_states)
+        camera_feature = self._to_tokens(camera_feature)
+        batch_size, ehs_sequence_length, _ = encoder_hidden_states.shape
+        attention_mask = attn.prepare_attention_mask(attention_mask, ehs_sequence_length, batch_size)
+        if attn.group_norm is not None:
+            hidden_states = attn.group_norm(hidden_states.transpose(1, 2)).transpose(1, 2)
+        if attn.norm_cross:
+            encoder_hidden_states = attn.norm_encoder_hidden_states(encoder_hidden_states)
+        # The merge branch is scaled by the constructor-time `self.scale`, not by the call-time `scale`.
+        if self.query_condition and self.key_value_condition:  # only self attention
+            query_hidden_state = self.qkv_merge(hidden_states + camera_feature) * self.scale + hidden_states
+            key_value_hidden_state = query_hidden_state
+        elif self.query_condition:
+            query_hidden_state = self.q_merge(hidden_states + camera_feature) * self.scale + hidden_states
+            key_value_hidden_state = encoder_hidden_states
+        else:
+            key_value_hidden_state = self.kv_merge(encoder_hidden_states + camera_feature) * self.scale + encoder_hidden_states
+            query_hidden_state = hidden_states
+        # original attention
+        query = attn.to_q(query_hidden_state) + lora_scale * self.to_q_lora(query_hidden_state)
+        key = attn.to_k(key_value_hidden_state) + lora_scale * self.to_k_lora(key_value_hidden_state)
+        value = attn.to_v(key_value_hidden_state) + lora_scale * self.to_v_lora(key_value_hidden_state)
+        query = attn.head_to_batch_dim(query)
+        key = attn.head_to_batch_dim(key)
+        value = attn.head_to_batch_dim(value)
+        attention_probs = attn.get_attention_scores(query, key, attention_mask)
+        hidden_states = torch.bmm(attention_probs, value)
+        hidden_states = attn.batch_to_head_dim(hidden_states)
+        # linear proj
+        hidden_states = attn.to_out[0](hidden_states) + lora_scale * self.to_out_lora(hidden_states)
+        # dropout
+        hidden_states = attn.to_out[1](hidden_states)
+        if attn.residual_connection:
+            hidden_states = hidden_states + residual
+        hidden_states = hidden_states / attn.rescale_output_factor
+        return hidden_states

genphoto/models/camera_adaptor.py CHANGED Viewed

@@ -1,3 +1,246 @@
-version https://git-lfs.github.com/spec/v1
-oid sha256:b98af7dc452f718e7b74536412d017231a15d69933a224cd1cb9557fe5853ba5
-size 9775

+import math
+import torch
+import torch.nn as nn
+from einops import rearrange
+from genphoto.models.motion_module import TemporalTransformerBlock
+def get_parameter_dtype(parameter: torch.nn.Module):
+    """
+    Return the dtype of the first tensor found on `parameter`.
+    Lookup order: first registered parameter, then first registered buffer, then the first plain
+    tensor attribute stored in a (sub)module's `__dict__`. Returns None when no tensor is found.
+    """
+    params = tuple(parameter.parameters())
+    if len(params) > 0:
+        return params[0].dtype
+    buffers = tuple(parameter.buffers())
+    if len(buffers) > 0:
+        return buffers[0].dtype
+    # Fallback (originally for torch.nn.DataParallel compatibility in PyTorch 1.5). In the original
+    # this sat in an `except StopIteration` that nothing in the `try` could raise, and its nested
+    # function was annotated with names (`List`, `Tuple`, `Tensor`) that are never imported.
+    def find_tensor_attributes(module):
+        return [(k, v) for k, v in module.__dict__.items() if torch.is_tensor(v)]
+    gen = parameter._named_members(get_members_fn=find_tensor_attributes)
+    first_tuple = next(gen, None)
+    if first_tuple is None:
+        return None
+    return first_tuple[1].dtype
+def conv_nd(dims, *args, **kwargs):
+    """
+    Build an `nn.Conv1d`, `nn.Conv2d` or `nn.Conv3d` for `dims` equal to 1, 2 or 3.
+    Remaining arguments are forwarded to the constructor; any other `dims` raises ValueError.
+    """
+    for rank, conv_cls in ((1, nn.Conv1d), (2, nn.Conv2d), (3, nn.Conv3d)):
+        if dims == rank:
+            return conv_cls(*args, **kwargs)
+    raise ValueError(f"unsupported dimensions: {dims}")
+def avg_pool_nd(dims, *args, **kwargs):
+    """
+    Build an `nn.AvgPool1d`, `nn.AvgPool2d` or `nn.AvgPool3d` for `dims` equal to 1, 2 or 3.
+    Remaining arguments are forwarded to the constructor; any other `dims` raises ValueError.
+    """
+    for rank, pool_cls in ((1, nn.AvgPool1d), (2, nn.AvgPool2d), (3, nn.AvgPool3d)):
+        if dims == rank:
+            return pool_cls(*args, **kwargs)
+    raise ValueError(f"unsupported dimensions: {dims}")
+class CameraAdaptor(nn.Module):
+    """
+    Wrapper that encodes a camera embedding and passes the resulting features to a UNet.
+    `forward` returns the `.sample` attribute of the UNet output.
+    """
+    def __init__(self, unet, camera_encoder):
+        super().__init__()
+        self.unet = unet
+        self.camera_encoder = camera_encoder
+    def forward(self, noisy_latents, timesteps, encoder_hidden_states, camera_embedding):
+        # The camera embedding must be 5-D, laid out as b c f h w.
+        assert camera_embedding.ndim == 5
+        batch = camera_embedding.shape[0]
+        # The encoder yields a sequence of maps with batch and frame axes merged (bf c h w).
+        encoded = self.camera_encoder(camera_embedding)
+        per_level_features = []
+        for feature_map in encoded:
+            # Split the merged axis back into separate batch and frame axes.
+            per_level_features.append(rearrange(feature_map, '(b f) c h w -> b c f h w', b=batch))
+        unet_output = self.unet(
+            noisy_latents,
+            timesteps,
+            encoder_hidden_states,
+            camera_embedding_features=per_level_features,
+        )
+        return unet_output.sample
+class Downsample(nn.Module):
+    """
+    Downsampling layer that is either a strided convolution or an average pool.
+    :param channels: number of input channels; `forward` asserts the input matches it.
+    :param use_conv: if true, use a kernel-3 strided convolution; otherwise average pooling.
+    :param dims: 1, 2 or 3. For 3 the stride is (1, 2, 2), so the first of the three
+                 trailing axes is not downsampled.
+    :param out_channels: output channels, defaulting to `channels`; must equal `channels`
+                 when pooling is used.
+    :param padding: padding passed to the convolution (unused for pooling).
+    """
+    def __init__(self, channels, use_conv, dims=2, out_channels=None, padding=1):
+        super().__init__()
+        self.channels = channels
+        self.out_channels = out_channels or channels
+        self.use_conv = use_conv
+        self.dims = dims
+        if dims == 3:
+            step = (1, 2, 2)
+        else:
+            step = 2
+        if not use_conv:
+            # Pooling cannot change the channel count.
+            assert self.channels == self.out_channels
+            self.op = avg_pool_nd(dims, kernel_size=step, stride=step)
+        else:
+            self.op = conv_nd(dims, self.channels, self.out_channels, 3, stride=step, padding=padding)
+    def forward(self, x):
+        assert x.shape[1] == self.channels
+        downsampled = self.op(x)
+        return downsampled
+class ResnetBlock(nn.Module):
+    """
+    Two-convolution residual block with optional downsampling and input projection.
+    :param in_c: input channels.
+    :param out_c: output channels.
+    :param down: when true, a `Downsample(in_c, use_conv=use_conv)` runs first.
+    :param ksize: kernel size of the input projection, the second convolution and the skip convolution.
+    :param sk: when false, an input projection and a convolutional skip path are both created;
+               when true, the skip path is the identity and the input projection exists only
+               if `in_c != out_c`.
+    :param use_conv: forwarded to `Downsample`.
+    """
+    def __init__(self, in_c, out_c, down, ksize=3, sk=False, use_conv=True):
+        super().__init__()
+        pad = ksize // 2
+        needs_projection = in_c != out_c or sk == False
+        self.in_conv = nn.Conv2d(in_c, out_c, ksize, 1, pad) if needs_projection else None
+        self.block1 = nn.Conv2d(out_c, out_c, 3, 1, 1)
+        self.act = nn.ReLU()
+        self.block2 = nn.Conv2d(out_c, out_c, ksize, 1, pad)
+        # NOTE(review): the skip conv takes `in_c` channels but is applied after `in_conv`,
+        # i.e. to a tensor with `out_c` channels, so sk=False only works when in_c == out_c.
+        self.skep = nn.Conv2d(in_c, out_c, ksize, 1, pad) if sk == False else None
+        self.down = down
+        if self.down == True:
+            self.down_opt = Downsample(in_c, use_conv=use_conv)
+    def forward(self, x):
+        if self.down == True:
+            x = self.down_opt(x)
+        if self.in_conv is not None:  # edit
+            x = self.in_conv(x)
+        branch = self.block2(self.act(self.block1(x)))
+        shortcut = x if self.skep is None else self.skep(x)
+        return branch + shortcut
+class PositionalEncoding(nn.Module):
+    """
+    Sinusoidal positional encoding stored as a buffer `pe` of shape (1, max_len, d_model, 1, 1).
+    `forward` adds the first `x.size(1)` positions to `x` and applies dropout.
+    """
+    def __init__(
+            self,
+            d_model,
+            dropout=0.,
+            max_len=32,
+    ):
+        super().__init__()
+        self.dropout = nn.Dropout(p=dropout)
+        positions = torch.arange(max_len).unsqueeze(1)
+        frequencies = torch.exp(torch.arange(0, d_model, 2) * (-math.log(10000.0) / d_model))
+        angles = positions * frequencies
+        table = torch.zeros(1, max_len, d_model)
+        # Even channels carry the sine, odd channels the cosine.
+        table[0, :, 0::2, ...] = torch.sin(angles)
+        table[0, :, 1::2, ...] = torch.cos(angles)
+        # Two trailing singleton axes so the table broadcasts over the last two axes of the input.
+        table = table.unsqueeze(-1).unsqueeze(-1)
+        self.register_buffer('pe', table)
+    def forward(self, x):
+        seq_len = x.size(1)
+        encoded = x + self.pe[:, :seq_len, ...]
+        return self.dropout(encoded)
+class CameraCameraEncoder(nn.Module):
+    """
+    Multi-scale encoder for a camera embedding video (input laid out as b c f h w).
+    The input is pixel-unshuffled, projected by a 3x3 convolution to `channels[0]`, and then
+    passed through `len(channels)` stages. Each stage holds `nums_rb` pairs of a `ResnetBlock`
+    followed by a `TemporalTransformerBlock` applied along the frame axis; every stage after
+    the first downsamples in its first block. `forward` returns one feature map per stage.
+    :param downscale_factor: factor given to `nn.PixelUnshuffle`.
+    :param channels: output width of each stage.
+    :param nums_rb: number of residual/attention pairs per stage.
+    :param cin: input channels of the first convolution (i.e. channels after the unshuffle).
+    :param compression_factor: inner blocks of a stage use `int(channels[i] / compression_factor)` channels.
+    The remaining arguments are forwarded to `ResnetBlock` / `TemporalTransformerBlock`.
+    """
+    def __init__(self,
+                 downscale_factor,
+                 channels=[320, 640, 1280, 1280],
+                 nums_rb=3,
+                 cin=64,
+                 ksize=3,
+                 sk=False,
+                 use_conv=True,
+                 compression_factor=1,
+                 temporal_attention_nhead=8,
+                 attention_block_types=("Temporal_Self", ),
+                 temporal_position_encoding=False,
+                 temporal_position_encoding_max_len=8,
+                 rescale_output_factor=1.0):
+        super().__init__()
+        self.unshuffle = nn.PixelUnshuffle(downscale_factor)
+        self.channels = channels
+        self.nums_rb = nums_rb
+        self.encoder_down_conv_blocks = nn.ModuleList()
+        self.encoder_down_attention_blocks = nn.ModuleList()
+        for i in range(len(channels)):
+            # Inner width of this stage. Always an int: the original left it as a float in the
+            # last-block branch, which is not a valid channel count for nn.Conv2d.
+            compressed_dim = int(channels[i] / compression_factor)
+            conv_layers = nn.ModuleList()
+            temporal_attention_layers = nn.ModuleList()
+            for j in range(nums_rb):
+                if j == 0:
+                    # First block of a stage: every stage but the first downsamples and
+                    # takes the previous stage's width as input.
+                    down = i != 0
+                    in_dim = channels[i - 1] if down else channels[0]
+                    out_dim = compressed_dim
+                elif j == nums_rb - 1:
+                    # Last block of a stage expands back to the full stage width.
+                    down = False
+                    in_dim = compressed_dim
+                    out_dim = channels[i]
+                else:
+                    down = False
+                    in_dim = compressed_dim
+                    out_dim = compressed_dim
+                conv_layer = ResnetBlock(in_dim, out_dim, down=down, ksize=ksize, sk=sk, use_conv=use_conv)
+                temporal_attention_layer = TemporalTransformerBlock(dim=out_dim,
+                                                                    num_attention_heads=temporal_attention_nhead,
+                                                                    attention_head_dim=int(out_dim / temporal_attention_nhead),
+                                                                    attention_block_types=attention_block_types,
+                                                                    dropout=0.0,
+                                                                    cross_attention_dim=None,
+                                                                    temporal_position_encoding=temporal_position_encoding,
+                                                                    temporal_position_encoding_max_len=temporal_position_encoding_max_len,
+                                                                    rescale_output_factor=rescale_output_factor)
+                conv_layers.append(conv_layer)
+                temporal_attention_layers.append(temporal_attention_layer)
+            self.encoder_down_conv_blocks.append(conv_layers)
+            self.encoder_down_attention_blocks.append(temporal_attention_layers)
+        self.encoder_conv_in = nn.Conv2d(cin, channels[0], 3, 1, 1)
+    @property
+    def dtype(self) -> torch.dtype:
+        """
+        `torch.dtype`: The dtype of the module (assuming that all the module parameters have the same dtype).
+        """
+        return get_parameter_dtype(self)
+    def forward(self, x):
+        """Encode `x` (b c f h w) and return a list with one (b f) c h w feature map per stage."""
+        # unshuffle
+        bs = x.shape[0]
+        x = rearrange(x, "b c f h w -> (b f) c h w")
+        x = self.unshuffle(x)
+        # extract features
+        features = []
+        x = self.encoder_conv_in(x)
+        for res_block, attention_block in zip(self.encoder_down_conv_blocks, self.encoder_down_attention_blocks):
+            for res_layer, attention_layer in zip(res_block, attention_block):
+                x = res_layer(x)
+                h, w = x.shape[-2:]
+                # Move spatial positions into the batch axis so attention runs along frames only.
+                x = rearrange(x, '(b f) c h w -> (b h w) f c', b=bs)
+                x = attention_layer(x)
+                x = rearrange(x, '(b h w) f c -> (b f) c h w', h=h, w=w)
+            features.append(x)
+        return features

genphoto/models/ccl_embedding.py CHANGED Viewed

@@ -1,3 +1,64 @@
-version https://git-lfs.github.com/spec/v1
-oid sha256:599f8f75460a5b776dc213e624d4e7fc6080c8311d14ffe572501e46512141bf
-size 2564

+import torch
+from transformers import DistilBertTokenizer, DistilBertModel
+from torch.nn.functional import cosine_similarity
+class FastLightweightTextEncoder:
+    """
+    Thin wrapper around a DistilBERT tokenizer and model for embedding batches of prompts.
+    The model is put in eval mode and moved to CUDA at construction, so a CUDA device is required.
+    """
+    def __init__(self, model_name='distilbert-base-uncased', cache_dir='/path/to/your/cache'):
+        # NOTE(review): `cache_dir` is accepted but not forwarded to `from_pretrained`;
+        # it is kept only so existing callers that pass it keep working.
+        self.tokenizer = DistilBertTokenizer.from_pretrained(model_name)
+        self.text_encoder = DistilBertModel.from_pretrained(model_name).eval().cuda()
+    def encode_texts(self, prompts):
+        """Return L2-normalised last-hidden-state embeddings for `prompts`, computed without gradients."""
+        # Batch processing the prompts to get their embeddings
+        inputs = self.tokenizer(prompts, return_tensors="pt", padding=True, truncation=True)
+        input_ids = inputs['input_ids'].cuda()
+        attention_mask = inputs['attention_mask'].cuda()
+        with torch.no_grad():
+            embeddings = self.text_encoder(input_ids=input_ids, attention_mask=attention_mask).last_hidden_state
+        # Normalize embeddings to get consistent vector representations
+        embeddings = torch.nn.functional.normalize(embeddings, p=2, dim=-1)
+        return embeddings
+    def calculate_differences(self, embeddings):
+        """
+        Return the differences between consecutive embeddings along the first axis.
+        For an input of shape (f, ...) the result has shape (f-1, ...), with entry `i` equal to
+        `embeddings[i + 1] - embeddings[i]`. Fewer than two entries give an empty result.
+        """
+        # Fix: the original loop printed `differences.shape` on a Python list, which raised
+        # AttributeError on the first iteration; slicing also handles the f < 2 case.
+        return embeddings[1:] - embeddings[:-1]
+# Example usage
+if __name__ == '__main__':
+    # Five prompts that differ only in the stated focal length.
+    prompts = [f"A smiling dog. Focal length: {focal_mm}mm." for focal_mm in (24, 25, 26, 30, 36)]
+    # Build the encoder (loads the DistilBERT tokenizer and model).
+    text_encoder = FastLightweightTextEncoder(cache_dir='/home/yuan418/lab/users/Yu/modules/')
+    # Embed all prompts in one batch, then dump the result.
+    embeddings = text_encoder.encode_texts(prompts)
+    for printable in (('a',), ('embeddings', embeddings), ('embeddings shape', embeddings.shape)):
+        print(*printable)
+    # Differences between consecutive prompt embeddings.
+    concatenated_diffs = text_encoder.calculate_differences(embeddings)
+    print("Concatenated differences shape:", concatenated_diffs.shape)

genphoto/models/motion_module.py CHANGED Viewed

@@ -1,3 +1,389 @@
-version https://git-lfs.github.com/spec/v1
-oid sha256:5a87e7341d4c8f441adbba3acf43b289589ed0825af8197262425ec35c708d32
-size 15717

+from dataclasses import dataclass
+from typing import Callable, Optional
+import torch
+import torch.nn.functional as F
+from torch import nn
+from diffusers.utils import BaseOutput
+from diffusers.models.attention_processor import Attention
+from diffusers.models.attention import FeedForward
+from typing import Dict, Any
+from genphoto.models.resnet import InflatedGroupNorm
+from genphoto.models.attention_processor import CameraAdaptorAttnProcessor
+from einops import rearrange
+import math
+def zero_module(module):
+    # Zero out the parameters of a module and return it.
+    for p in module.parameters():
+        p.detach().zero_()
+    return module
+@dataclass
+class TemporalTransformer3DModelOutput(BaseOutput):
+    """Output container with a single ``sample`` tensor field."""
+    # Tensor carried by this output object.
+    sample: torch.FloatTensor
+def get_motion_module(
+        in_channels,
+        motion_module_type: str,
+        motion_module_kwargs: dict
+):
+    if motion_module_type == "Vanilla":
+        return VanillaTemporalModule(in_channels=in_channels, **motion_module_kwargs)
+    else:
+        raise ValueError
+class VanillaTemporalModule(nn.Module):
+    def __init__(
+            self,
+            in_channels,
+            num_attention_heads=8,
+            num_transformer_block=2,
+            attention_block_types=("Temporal_Self",),
+            temporal_position_encoding=True,
+            temporal_position_encoding_max_len=32,
+            temporal_attention_dim_div=1,
+            cross_attention_dim=320,
+            zero_initialize=True,
+            encoder_hidden_states_query=(False, False),
+            attention_activation_scale=1.0,
+            attention_processor_kwargs: Dict = {},
+            causal_temporal_attention=False,
+            causal_temporal_attention_mask_type="",
+            rescale_output_factor=1.0
+    ):
+        super().__init__()
+        self.temporal_transformer = TemporalTransformer3DModel(
+            in_channels=in_channels,
+            num_attention_heads=num_attention_heads,
+            attention_head_dim=in_channels // num_attention_heads // temporal_attention_dim_div,
+            num_layers=num_transformer_block,
+            attention_block_types=attention_block_types,
+            cross_attention_dim=cross_attention_dim,
+            temporal_position_encoding=temporal_position_encoding,
+            temporal_position_encoding_max_len=temporal_position_encoding_max_len,
+            encoder_hidden_states_query=encoder_hidden_states_query,
+            attention_activation_scale=attention_activation_scale,
+            attention_processor_kwargs=attention_processor_kwargs,
+            causal_temporal_attention=causal_temporal_attention,
+            causal_temporal_attention_mask_type=causal_temporal_attention_mask_type,
+            rescale_output_factor=rescale_output_factor
+        )
+        if zero_initialize:
+            self.temporal_transformer.proj_out = zero_module(self.temporal_transformer.proj_out)
+    def forward(self, hidden_states, temb=None, encoder_hidden_states=None, attention_mask=None,
+                cross_attention_kwargs: Dict[str, Any] = {}):
+        hidden_states = self.temporal_transformer(hidden_states, encoder_hidden_states, attention_mask, cross_attention_kwargs=cross_attention_kwargs)
+        output = hidden_states
+        return output
+class TemporalTransformer3DModel(nn.Module):
+    def __init__(
+            self,
+            in_channels,
+            num_attention_heads,
+            attention_head_dim,
+            num_layers,
+            attention_block_types=("Temporal_Self", "Temporal_Self",),
+            dropout=0.0,
+            norm_num_groups=32,
+            cross_attention_dim=320,
+            activation_fn="geglu",
+            attention_bias=False,
+            upcast_attention=False,
+            temporal_position_encoding=False,
+            temporal_position_encoding_max_len=32,
+            encoder_hidden_states_query=(False, False),
+            attention_activation_scale=1.0,
+            attention_processor_kwargs: Dict = {},
+            causal_temporal_attention=None,
+            causal_temporal_attention_mask_type="",
+            rescale_output_factor=1.0
+    ):
+        super().__init__()
+        assert causal_temporal_attention is not None
+        self.causal_temporal_attention = causal_temporal_attention
+        assert (not causal_temporal_attention) or (causal_temporal_attention_mask_type != "")
+        self.causal_temporal_attention_mask_type = causal_temporal_attention_mask_type
+        self.causal_temporal_attention_mask = None
+        inner_dim = num_attention_heads * attention_head_dim
+        self.norm = InflatedGroupNorm(num_groups=norm_num_groups, num_channels=in_channels, eps=1e-6, affine=True)
+        self.proj_in = nn.Linear(in_channels, inner_dim)
+        self.transformer_blocks = nn.ModuleList(
+            [
+                TemporalTransformerBlock(
+                    dim=inner_dim,
+                    num_attention_heads=num_attention_heads,
+                    attention_head_dim=attention_head_dim,
+                    attention_block_types=attention_block_types,
+                    dropout=dropout,
+                    norm_num_groups=norm_num_groups,
+                    cross_attention_dim=cross_attention_dim,
+                    activation_fn=activation_fn,
+                    attention_bias=attention_bias,
+                    upcast_attention=upcast_attention,
+                    temporal_position_encoding=temporal_position_encoding,
+                    temporal_position_encoding_max_len=temporal_position_encoding_max_len,
+                    encoder_hidden_states_query=encoder_hidden_states_query,
+                    attention_activation_scale=attention_activation_scale,
+                    attention_processor_kwargs=attention_processor_kwargs,
+                    rescale_output_factor=rescale_output_factor,
+                )
+                for d in range(num_layers)
+            ]
+        )
+        self.proj_out = nn.Linear(inner_dim, in_channels)
+    def get_causal_temporal_attention_mask(self, hidden_states):
+        batch_size, sequence_length, dim = hidden_states.shape
+        if self.causal_temporal_attention_mask is None or self.causal_temporal_attention_mask.shape != (
+        batch_size, sequence_length, sequence_length):
+            if self.causal_temporal_attention_mask_type == "causal":
+                # 1. vanilla causal mask
+                mask = torch.tril(torch.ones(sequence_length, sequence_length))
+            elif self.causal_temporal_attention_mask_type == "2-seq":
+                # 2. 2-seq
+                mask = torch.zeros(sequence_length, sequence_length)
+                mask[:sequence_length // 2, :sequence_length // 2] = 1
+                mask[-sequence_length // 2:, -sequence_length // 2:] = 1
+            elif self.causal_temporal_attention_mask_type == "0-prev":
+                # attn to the previous frame
+                indices = torch.arange(sequence_length)
+                indices_prev = indices - 1
+                indices_prev[0] = 0
+                mask = torch.zeros(sequence_length, sequence_length)
+                mask[:, 0] = 1.
+                mask[indices, indices_prev] = 1.
+            elif self.causal_temporal_attention_mask_type == "0":
+                # only attn to first frame
+                mask = torch.zeros(sequence_length, sequence_length)
+                mask[:, 0] = 1
+            elif self.causal_temporal_attention_mask_type == "wo-self":
+                indices = torch.arange(sequence_length)
+                mask = torch.ones(sequence_length, sequence_length)
+                mask[indices, indices] = 0
+            elif self.causal_temporal_attention_mask_type == "circle":
+                indices = torch.arange(sequence_length)
+                indices_prev = indices - 1
+                indices_prev[0] = 0
+                mask = torch.eye(sequence_length)
+                mask[indices, indices_prev] = 1
+                mask[0, -1] = 1
+            else:
+                raise ValueError
+            # generate attention mask fron binary values
+            mask = mask.masked_fill(mask == 0, float('-inf')).masked_fill(mask == 1, float(0.0))
+            mask = mask.unsqueeze(0)
+            mask = mask.repeat(batch_size, 1, 1)
+            self.causal_temporal_attention_mask = mask.to(hidden_states.device)
+        return self.causal_temporal_attention_mask
+    def forward(self, hidden_states, encoder_hidden_states=None, attention_mask=None,
+                cross_attention_kwargs: Dict[str, Any] = {},):
+        residual = hidden_states
+        assert hidden_states.dim() == 5, f"Expected hidden_states to have ndim=5, but got ndim={hidden_states.dim()}."
+        height, width = hidden_states.shape[-2:]
+        hidden_states = self.norm(hidden_states)
+        hidden_states = rearrange(hidden_states, "b c f h w -> (b h w) f c")
+        hidden_states = self.proj_in(hidden_states)
+        attention_mask = self.get_causal_temporal_attention_mask(
+            hidden_states) if self.causal_temporal_attention else attention_mask
+        # Transformer Blocks
+        for block in self.transformer_blocks:
+            hidden_states = block(hidden_states, encoder_hidden_states=encoder_hidden_states,
+                                  attention_mask=attention_mask, cross_attention_kwargs=cross_attention_kwargs)
+        hidden_states = self.proj_out(hidden_states)
+        hidden_states = rearrange(hidden_states, "(b h w) f c -> b c f h w", h=height, w=width)
+        output = hidden_states + residual
+        return output
+class TemporalTransformerBlock(nn.Module):
+    def __init__(
+            self,
+            dim,
+            num_attention_heads,
+            attention_head_dim,
+            attention_block_types=("Temporal_Self", "Temporal_Self",),
+            dropout=0.0,
+            norm_num_groups=32,
+            cross_attention_dim=768,
+            activation_fn="geglu",
+            attention_bias=False,
+            upcast_attention=False,
+            temporal_position_encoding=False,
+            temporal_position_encoding_max_len=32,
+            encoder_hidden_states_query=(False, False),
+            attention_activation_scale=1.0,
+            attention_processor_kwargs: Dict = {},
+            rescale_output_factor=1.0
+    ):
+        super().__init__()
+        attention_blocks = []
+        norms = []
+        self.attention_block_types = attention_block_types
+        for block_idx, block_name in enumerate(attention_block_types):
+            attention_blocks.append(
+                TemporalSelfAttention(
+                    attention_mode=block_name,
+                    cross_attention_dim=cross_attention_dim if block_name in ['Temporal_Cross', 'Temporal_Camera_Adaptor'] else None,
+                    query_dim=dim,
+                    heads=num_attention_heads,
+                    dim_head=attention_head_dim,
+                    dropout=dropout,
+                    bias=attention_bias,
+                    upcast_attention=upcast_attention,
+                    temporal_position_encoding=temporal_position_encoding,
+                    temporal_position_encoding_max_len=temporal_position_encoding_max_len,
+                    rescale_output_factor=rescale_output_factor,
+                )
+            )
+            norms.append(nn.LayerNorm(dim))
+        self.attention_blocks = nn.ModuleList(attention_blocks)
+        self.norms = nn.ModuleList(norms)
+        self.ff = FeedForward(dim, dropout=dropout, activation_fn=activation_fn)
+        self.ff_norm = nn.LayerNorm(dim)
+    def forward(self, hidden_states, encoder_hidden_states=None, attention_mask=None, cross_attention_kwargs: Dict[str, Any] = {}):
+        for attention_block, norm, attention_block_type in zip(self.attention_blocks, self.norms, self.attention_block_types):
+            norm_hidden_states = norm(hidden_states)
+            hidden_states = attention_block(
+                norm_hidden_states,
+                encoder_hidden_states=norm_hidden_states if attention_block_type == 'Temporal_Self' else encoder_hidden_states,
+                attention_mask=attention_mask,
+                **cross_attention_kwargs
+            ) + hidden_states
+        hidden_states = self.ff(self.ff_norm(hidden_states)) + hidden_states
+        output = hidden_states
+        return output
+class PositionalEncoding(nn.Module):
+    def __init__(
+            self,
+            d_model,
+            dropout=0.,
+            max_len=32,
+    ):
+        super().__init__()
+        self.dropout = nn.Dropout(p=dropout)
+        position = torch.arange(max_len).unsqueeze(1)
+        div_term = torch.exp(torch.arange(0, d_model, 2) * (-math.log(10000.0) / d_model))
+        pe = torch.zeros(1, max_len, d_model)
+        pe[0, :, 0::2] = torch.sin(position * div_term)
+        pe[0, :, 1::2] = torch.cos(position * div_term)
+        self.register_buffer('pe', pe)
+    def forward(self, x):
+        x = x + self.pe[:, :x.size(1)]
+        return self.dropout(x)
+class TemporalSelfAttention(Attention):
+    def __init__(
+            self,
+            attention_mode=None,
+            temporal_position_encoding=False,
+            temporal_position_encoding_max_len=32,
+            rescale_output_factor=1.0,
+            *args, **kwargs
+    ):
+        super().__init__(*args, **kwargs)
+        assert attention_mode == "Temporal_Self"
+        self.pos_encoder = PositionalEncoding(
+            kwargs["query_dim"],
+            max_len=temporal_position_encoding_max_len
+        ) if temporal_position_encoding else None
+        self.rescale_output_factor = rescale_output_factor
+    def set_use_memory_efficient_attention_xformers(
+            self, use_memory_efficient_attention_xformers: bool, attention_op: Optional[Callable] = None
+    ):
+        # disable motion module efficient xformers to avoid bad results, don't know why
+        # TODO: fix this bug
+        pass
+    def forward(self, hidden_states, encoder_hidden_states=None, attention_mask=None, **cross_attention_kwargs):
+        # The `Attention` class can call different attention processors / attention functions
+        # here we simply pass along all tensors to the selected processor class
+        # For standard processors that are defined here, `**cross_attention_kwargs` is empty
+        # add position encoding
+        if self.pos_encoder is not None:
+            hidden_states = self.pos_encoder(hidden_states)
+        if "camera_feature" in cross_attention_kwargs:
+            camera_feature = cross_attention_kwargs["camera_feature"]
+            if camera_feature.ndim == 5:
+                camera_feature = rearrange(camera_feature, "b c f h w -> (b h w) f c")
+            else:
+                assert camera_feature.ndim == 3
+            cross_attention_kwargs["camera_feature"] = camera_feature
+        if isinstance(self.processor, CameraAdaptorAttnProcessor):
+            return self.processor(
+                self,
+                hidden_states,
+                cross_attention_kwargs.pop('camera_feature'),
+                encoder_hidden_states=None,
+                attention_mask=attention_mask,
+                **cross_attention_kwargs,
+            )
+        elif hasattr(self.processor, "__call__"):
+            return self.processor.__call__(
+                    self,
+                    hidden_states,
+                    encoder_hidden_states=None,
+                    attention_mask=attention_mask,
+                    **cross_attention_kwargs,
+                )
+        else:
+            return self.processor(
+                self,
+                hidden_states,
+                encoder_hidden_states=None,
+                attention_mask=attention_mask,
+                **cross_attention_kwargs,
+            )

genphoto/models/resnet.py CHANGED Viewed

@@ -1,3 +1,440 @@
-version https://git-lfs.github.com/spec/v1
-oid sha256:17d68816bfa42b445e7b3c9f6da088e08024a99b838bb1ca74a327e6a9116d50
-size 17833

+# Adapted from https://github.com/huggingface/diffusers/blob/main/src/diffusers/models/resnet.py
+from einops import rearrange, repeat
+from functools import partial
+from typing import Optional
+import torch
+import torch.nn as nn
+import torch.nn.functional as F
+from diffusers.models.activations import get_activation
+from diffusers.models.normalization import AdaGroupNorm
+from diffusers.models.attention_processor import SpatialNorm
+class InflatedConv3d(nn.Conv2d):
+    def forward(self, x):
+        video_length = x.shape[2]
+        x = rearrange(x, "b c f h w -> (b f) c h w")
+        x = super().forward(x)
+        x = rearrange(x, "(b f) c h w -> b c f h w", f=video_length)
+        return x
+class InflatedGroupNorm(nn.GroupNorm):
+    def forward(self, x):
+        # return super().forward(x)
+        video_length = x.shape[2]
+        x = rearrange(x, "b c f h w -> (b f) c h w")
+        x = super().forward(x)
+        x = rearrange(x, "(b f) c h w -> b c f h w", f=video_length)
+        return x
+def zero_module(module):
+    # Zero out the parameters of a module and return it.
+    for p in module.parameters():
+        p.detach().zero_()
+    return module
+class FusionBlock2D(nn.Module):
+    r"""
+    A Resnet block.
+    Parameters:
+        in_channels (`int`): The number of channels in the input.
+        out_channels (`int`, *optional*, default to be `None`):
+            The number of output channels for the first conv2d layer. If None, same as `in_channels`.
+        dropout (`float`, *optional*, defaults to `0.0`): The dropout probability to use.
+        temb_channels (`int`, *optional*, default to `512`): the number of channels in timestep embedding.
+        groups (`int`, *optional*, default to `32`): The number of groups to use for the first normalization layer.
+        groups_out (`int`, *optional*, default to None):
+            The number of groups to use for the second normalization layer. if set to None, same as `groups`.
+        eps (`float`, *optional*, defaults to `1e-6`): The epsilon to use for the normalization.
+        non_linearity (`str`, *optional*, default to `"swish"`): the activation function to use.
+        time_embedding_norm (`str`, *optional*, default to `"default"` ): Time scale shift config.
+            By default, apply timestep embedding conditioning with a simple shift mechanism. Choose "scale_shift" or
+            "ada_group" for a stronger conditioning with scale and shift.
+        kernel (`torch.FloatTensor`, optional, default to None): FIR filter, see
+            [`~models.resnet.FirUpsample2D`] and [`~models.resnet.FirDownsample2D`].
+        output_scale_factor (`float`, *optional*, default to be `1.0`): the scale factor to use for the output.
+        use_in_shortcut (`bool`, *optional*, default to `True`):
+            If `True`, add a 1x1 nn.conv2d layer for skip-connection.
+        up (`bool`, *optional*, default to `False`): If `True`, add an upsample layer.
+        down (`bool`, *optional*, default to `False`): If `True`, add a downsample layer.
+        conv_shortcut_bias (`bool`, *optional*, default to `True`):  If `True`, adds a learnable bias to the
+            `conv_shortcut` output.
+        conv_2d_out_channels (`int`, *optional*, default to `None`): the number of channels in the output.
+            If None, same as `out_channels`.
+    """
+    def __init__(
+            self,
+            *,
+            in_channels,
+            out_channels=None,
+            conv_shortcut=False,
+            dropout=0.0,
+            temb_channels=512,
+            groups=32,
+            groups_out=None,
+            pre_norm=True,
+            eps=1e-6,
+            non_linearity="swish",
+            skip_time_act=False,
+            time_embedding_norm="default",  # default, scale_shift, ada_group, spatial
+            kernel=None,
+            output_scale_factor=1.0,
+            use_in_shortcut=None,
+            up=False,
+            down=False,
+            conv_shortcut_bias: bool = True,
+            conv_2d_out_channels: Optional[int] = None,
+            zero_init=True,
+    ):
+        super().__init__()
+        self.pre_norm = pre_norm
+        self.pre_norm = True
+        in_channels = in_channels * 2
+        self.in_channels = in_channels
+        out_channels = in_channels * 3 if out_channels is None else out_channels * 3
+        self.out_channels = out_channels
+        self.use_conv_shortcut = conv_shortcut
+        self.up = up
+        self.down = down
+        self.output_scale_factor = output_scale_factor
+        self.time_embedding_norm = time_embedding_norm
+        self.skip_time_act = skip_time_act
+        if groups_out is None:
+            groups_out = groups
+        if self.time_embedding_norm == "ada_group":
+            self.norm1 = AdaGroupNorm(temb_channels, in_channels, groups, eps=eps)
+        elif self.time_embedding_norm == "spatial":
+            self.norm1 = SpatialNorm(in_channels, temb_channels)
+        else:
+            self.norm1 = torch.nn.GroupNorm(num_groups=groups, num_channels=in_channels, eps=eps, affine=True)
+        self.conv1 = torch.nn.Conv2d(in_channels, out_channels, kernel_size=1, stride=1, padding=0)
+        if temb_channels is not None:
+            if self.time_embedding_norm == "default":
+                self.time_emb_proj = torch.nn.Linear(temb_channels, out_channels)
+            elif self.time_embedding_norm == "scale_shift":
+                self.time_emb_proj = torch.nn.Linear(temb_channels, 2 * out_channels)
+            elif self.time_embedding_norm == "ada_group" or self.time_embedding_norm == "spatial":
+                self.time_emb_proj = None
+            else:
+                raise ValueError(f"unknown time_embedding_norm : {self.time_embedding_norm} ")
+        else:
+            self.time_emb_proj = None
+        if self.time_embedding_norm == "ada_group":
+            self.norm2 = AdaGroupNorm(temb_channels, out_channels, groups_out, eps=eps)
+        elif self.time_embedding_norm == "spatial":
+            self.norm2 = SpatialNorm(out_channels, temb_channels)
+        else:
+            self.norm2 = torch.nn.GroupNorm(num_groups=groups_out, num_channels=out_channels, eps=eps, affine=True)
+        self.dropout = torch.nn.Dropout(dropout)
+        conv_2d_out_channels = conv_2d_out_channels or out_channels
+        self.conv2 = torch.nn.Conv2d(out_channels, conv_2d_out_channels, kernel_size=1, stride=1, padding=0)
+        self.nonlinearity = get_activation(non_linearity)
+        self.upsample = self.downsample = None
+        if self.up:
+            if kernel == "fir":
+                fir_kernel = (1, 3, 3, 1)
+                self.upsample = lambda x: upsample_2d(x, kernel=fir_kernel)
+            elif kernel == "sde_vp":
+                self.upsample = partial(F.interpolate, scale_factor=2.0, mode="nearest")
+            else:
+                self.upsample = Upsample2D(in_channels, use_conv=False)
+        elif self.down:
+            if kernel == "fir":
+                fir_kernel = (1, 3, 3, 1)
+                self.downsample = lambda x: downsample_2d(x, kernel=fir_kernel)
+            elif kernel == "sde_vp":
+                self.downsample = partial(F.avg_pool2d, kernel_size=2, stride=2)
+            else:
+                self.downsample = Downsample2D(in_channels, use_conv=False, padding=1, name="op")
+        self.use_in_shortcut = self.in_channels != conv_2d_out_channels if use_in_shortcut is None else use_in_shortcut
+        self.conv_shortcut = None
+        if self.use_in_shortcut:
+            self.conv_shortcut = torch.nn.Conv2d(
+                in_channels, conv_2d_out_channels, kernel_size=1, stride=1, padding=0, bias=conv_shortcut_bias
+            )
+        conv_out = torch.nn.Conv2d(
+            conv_2d_out_channels, conv_2d_out_channels, kernel_size=1, stride=1, padding=0,
+        )
+        self.conv_out = zero_module(conv_out) if zero_init else conv_out
+    def forward(self, init_hidden_state, post_hidden_states, temb):
+        # init_hidden_state:  b c   1   h w
+        # post_hidden_states: b c (f-1) h w
+        video_length = post_hidden_states.shape[2]
+        repeated_init_hidden_state = repeat(init_hidden_state, "b c f h w -> b c (n f) h w", n=video_length)
+        hidden_states = torch.cat([repeated_init_hidden_state, post_hidden_states], dim=1)
+        hidden_states = rearrange(hidden_states, "b c f h w -> (b f) c h w")
+        input_tensor = hidden_states
+        if temb.shape[0] != input_tensor.shape[0]:
+            temb = repeat(temb, "b c -> (b n) c", n=input_tensor.shape[0] // temb.shape[0])
+        assert temb.shape[0] == input_tensor.shape[0], f"{temb.shape}, {input_tensor.shape}"
+        if self.time_embedding_norm == "ada_group" or self.time_embedding_norm == "spatial":
+            hidden_states = self.norm1(hidden_states, temb)
+        else:
+            hidden_states = self.norm1(hidden_states)
+        hidden_states = self.nonlinearity(hidden_states)
+        if self.upsample is not None:
+            # upsample_nearest_nhwc fails with large batch sizes. see https://github.com/huggingface/diffusers/issues/984
+            if hidden_states.shape[0] >= 64:
+                input_tensor = input_tensor.contiguous()
+                hidden_states = hidden_states.contiguous()
+            input_tensor = self.upsample(input_tensor)
+            hidden_states = self.upsample(hidden_states)
+        elif self.downsample is not None:
+            input_tensor = self.downsample(input_tensor)
+            hidden_states = self.downsample(hidden_states)
+        hidden_states = self.conv1(hidden_states)
+        if self.time_emb_proj is not None:
+            if not self.skip_time_act:
+                temb = self.nonlinearity(temb)
+            temb = self.time_emb_proj(temb)[:, :, None, None]
+        if temb is not None and self.time_embedding_norm == "default":
+            hidden_states = hidden_states + temb
+        if self.time_embedding_norm == "ada_group" or self.time_embedding_norm == "spatial":
+            hidden_states = self.norm2(hidden_states, temb)
+        else:
+            hidden_states = self.norm2(hidden_states)
+        if temb is not None and self.time_embedding_norm == "scale_shift":
+            scale, shift = torch.chunk(temb, 2, dim=1)
+            hidden_states = hidden_states * (1 + scale) + shift
+        hidden_states = self.nonlinearity(hidden_states)
+        hidden_states = self.dropout(hidden_states)
+        hidden_states = self.conv2(hidden_states)
+        if self.conv_shortcut is not None:
+            input_tensor = self.conv_shortcut(input_tensor)
+        output_tensor = (input_tensor + hidden_states) / self.output_scale_factor
+        output_tensor = self.conv_out(output_tensor)
+        output_tensor = rearrange(output_tensor, "(b f) c h w -> b c f h w", f=video_length)
+        scale_1, scale_2, shift = output_tensor.chunk(3, dim=1)
+        # output_tensor = (1 + scale_1) * repeated_init_hidden_state + scale_2 * post_hidden_states + shift
+        output_tensor = scale_1 * repeated_init_hidden_state + (1 + scale_2) * post_hidden_states + shift
+        return output_tensor
+class Upsample3D(nn.Module):
+    def __init__(self, channels, use_conv=False, use_conv_transpose=False, out_channels=None, name="conv"):
+        super().__init__()
+        self.channels = channels
+        self.out_channels = out_channels or channels
+        self.use_conv = use_conv
+        self.use_conv_transpose = use_conv_transpose
+        self.name = name
+        conv = None
+        if use_conv_transpose:
+            raise NotImplementedError
+        elif use_conv:
+            self.conv = InflatedConv3d(self.channels, self.out_channels, 3, padding=1)
+    def forward(self, hidden_states, output_size=None):
+        assert hidden_states.shape[1] == self.channels
+        if self.use_conv_transpose:
+            raise NotImplementedError
+        # Cast to float32 to as 'upsample_nearest2d_out_frame' op does not support bfloat16
+        dtype = hidden_states.dtype
+        if dtype == torch.bfloat16:
+            hidden_states = hidden_states.to(torch.float32)
+        # upsample_nearest_nhwc fails with large batch sizes. see https://github.com/huggingface/diffusers/issues/984
+        if hidden_states.shape[0] >= 64:
+            hidden_states = hidden_states.contiguous()
+        # if `output_size` is passed we force the interpolation output
+        # size and do not make use of `scale_factor=2`
+        if output_size is None:
+            hidden_states = F.interpolate(hidden_states, scale_factor=[1.0, 2.0, 2.0], mode="nearest")
+        else:
+            hidden_states = F.interpolate(hidden_states, size=output_size, mode="nearest")
+        # If the input is bfloat16, we cast back to bfloat16
+        if dtype == torch.bfloat16:
+            hidden_states = hidden_states.to(dtype)
+        # if self.use_conv:
+        #     if self.name == "conv":
+        #         hidden_states = self.conv(hidden_states)
+        #     else:
+        #         hidden_states = self.Conv2d_0(hidden_states)
+        hidden_states = self.conv(hidden_states)
+        return hidden_states
+class Downsample3D(nn.Module):
+    def __init__(self, channels, use_conv=False, out_channels=None, padding=1, name="conv"):
+        super().__init__()
+        self.channels = channels
+        self.out_channels = out_channels or channels
+        self.use_conv = use_conv
+        self.padding = padding
+        stride = 2
+        self.name = name
+        if use_conv:
+            self.conv = InflatedConv3d(self.channels, self.out_channels, 3, stride=stride, padding=padding)
+        else:
+            raise NotImplementedError
+    def forward(self, hidden_states):
+        assert hidden_states.shape[1] == self.channels
+        if self.use_conv and self.padding == 0:
+            raise NotImplementedError
+        assert hidden_states.shape[1] == self.channels
+        hidden_states = self.conv(hidden_states)
+        return hidden_states
+class ResnetBlock3D(nn.Module):
+    def __init__(
+            self,
+            *,
+            in_channels,
+            out_channels=None,
+            conv_shortcut=False,
+            dropout=0.0,
+            temb_channels=512,
+            groups=32,
+            groups_out=None,
+            pre_norm=True,
+            eps=1e-6,
+            non_linearity="swish",
+            time_embedding_norm="default",
+            output_scale_factor=1.0,
+            use_in_shortcut=None,
+    ):
+        super().__init__()
+        self.pre_norm = pre_norm
+        self.pre_norm = True
+        self.in_channels = in_channels
+        out_channels = in_channels if out_channels is None else out_channels
+        self.out_channels = out_channels
+        self.use_conv_shortcut = conv_shortcut
+        self.time_embedding_norm = time_embedding_norm
+        self.output_scale_factor = output_scale_factor
+        if groups_out is None:
+            groups_out = groups
+        self.norm1 = torch.nn.GroupNorm(num_groups=groups, num_channels=in_channels, eps=eps, affine=True)
+        self.conv1 = InflatedConv3d(in_channels, out_channels, kernel_size=3, stride=1, padding=1)
+        if temb_channels is not None:
+            if self.time_embedding_norm == "default":
+                time_emb_proj_out_channels = out_channels
+            elif self.time_embedding_norm == "scale_shift":
+                time_emb_proj_out_channels = out_channels * 2
+            else:
+                raise ValueError(f"unknown time_embedding_norm : {self.time_embedding_norm} ")
+            self.time_emb_proj = torch.nn.Linear(temb_channels, time_emb_proj_out_channels)
+        else:
+            self.time_emb_proj = None
+        self.norm2 = torch.nn.GroupNorm(num_groups=groups_out, num_channels=out_channels, eps=eps, affine=True)
+        self.dropout = torch.nn.Dropout(dropout)
+        self.conv2 = InflatedConv3d(out_channels, out_channels, kernel_size=3, stride=1, padding=1)
+        if non_linearity == "swish":
+            self.nonlinearity = lambda x: F.silu(x)
+        elif non_linearity == "mish":
+            self.nonlinearity = Mish()
+        elif non_linearity == "silu":
+            self.nonlinearity = nn.SiLU()
+        self.use_in_shortcut = self.in_channels != self.out_channels if use_in_shortcut is None else use_in_shortcut
+        self.conv_shortcut = None
+        if self.use_in_shortcut:
+            self.conv_shortcut = InflatedConv3d(in_channels, out_channels, kernel_size=1, stride=1, padding=0)
+    def forward(self, input_tensor, temb):
+        # input: b c f h w
+        hidden_states = input_tensor
+        video_length = hidden_states.shape[2]
+        emb = repeat(emb, "b c -> (b f) c", f=video_length)
+        hidden_states = self.norm1(hidden_states)
+        hidden_states = self.nonlinearity(hidden_states)
+        hidden_states = self.conv1(hidden_states)
+        if temb is not None:
+            temb = self.time_emb_proj(self.nonlinearity(temb))[:, :, None, None, None]
+        if temb is not None and self.time_embedding_norm == "default":
+            hidden_states = hidden_states + temb
+        hidden_states = self.norm2(hidden_states)
+        if temb is not None and self.time_embedding_norm == "scale_shift":
+            scale, shift = torch.chunk(temb, 2, dim=1)
+            hidden_states = hidden_states * (1 + scale) + shift
+        hidden_states = self.nonlinearity(hidden_states)
+        hidden_states = self.dropout(hidden_states)
+        hidden_states = self.conv2(hidden_states)
+        if self.conv_shortcut is not None:
+            input_tensor = self.conv_shortcut(input_tensor)
+        output_tensor = (input_tensor + hidden_states) / self.output_scale_factor
+        return output_tensor
+class Mish(torch.nn.Module):
+    def forward(self, hidden_states):
+        return hidden_states * torch.tanh(torch.nn.functional.softplus(hidden_states))

genphoto/models/unet.py CHANGED Viewed

@@ -1,3 +1,1300 @@
-version https://git-lfs.github.com/spec/v1
-oid sha256:66cd2f1e572a9d63f9ff6e1dc5bbacadd02916fc60cef9505761b6470c51f08e
-size 61839

+# Adapted from https://github.com/huggingface/diffusers/blob/main/src/diffusers/models/unet_2d_condition.py
+import os
+import json
+import safetensors
+import logging
+import torch
+import torch.nn as nn
+import torch.utils.checkpoint
+from einops import repeat, rearrange
+from dataclasses import dataclass
+from typing import List, Optional, Tuple, Union, Dict, Any
+from diffusers.configuration_utils import ConfigMixin, register_to_config
+from diffusers.models.attention_processor import AttentionProcessor
+from diffusers.models.modeling_utils import ModelMixin
+from diffusers.utils import BaseOutput, logging
+from diffusers.models.embeddings import TimestepEmbedding, Timesteps
+from diffusers.models.attention_processor import LoRAAttnProcessor
+from diffusers.loaders import AttnProcsLayers, UNet2DConditionLoadersMixin
+from genphoto.models.unet_blocks import (
+    CrossAttnDownBlock3D,
+    CrossAttnUpBlock3D,
+    DownBlock3D,
+    UNetMidBlock3DCrossAttn,
+    UpBlock3D,
+    get_down_block,
+    get_up_block,
+)
+from genphoto.models.attention_processor import (
+    LORACameraAdaptorAttnProcessor,
+    CameraAdaptorAttnProcessor
+)
+from genphoto.models.attention_processor import LoRAAttnProcessor as CustomizedLoRAAttnProcessor
+from genphoto.models.attention_processor import AttnProcessor as CustomizedAttnProcessor
+from genphoto.models.resnet import (
+    InflatedConv3d,
+    FusionBlock2D
+)
+@dataclass
+class UNet3DConditionOutput(BaseOutput):
+    """Output container named as the return type of ``UNet3DConditionModel.forward``."""
+    # Tensor produced by the UNet forward pass.
+    sample: torch.FloatTensor
+class UNet3DConditionModel(ModelMixin, ConfigMixin, UNet2DConditionLoadersMixin):
+    _supports_gradient_checkpointing = True
+    @register_to_config
+    def __init__(
+            self,
+            sample_size: Optional[int] = None,
+            in_channels: int = 4,
+            out_channels: int = 4,
+            center_input_sample: bool = False,
+            flip_sin_to_cos: bool = True,
+            freq_shift: int = 0,
+            down_block_types: Tuple[str] = (
+                    "CrossAttnDownBlock3D",
+                    "CrossAttnDownBlock3D",
+                    "CrossAttnDownBlock3D",
+                    "DownBlock3D",
+            ),
+            mid_block_type: str = "UNetMidBlock3DCrossAttn",
+            up_block_types: Tuple[str] = (
+                    "UpBlock3D",
+                    "CrossAttnUpBlock3D",
+                    "CrossAttnUpBlock3D",
+                    "CrossAttnUpBlock3D",
+            ),
+            only_cross_attention: Union[bool, Tuple[bool]] = False,
+            block_out_channels: Tuple[int] = (320, 640, 1280, 1280),
+            layers_per_block: int = 2,
+            downsample_padding: int = 1,
+            mid_block_scale_factor: float = 1,
+            act_fn: str = "silu",
+            norm_num_groups: int = 32,
+            norm_eps: float = 1e-5,
+            cross_attention_dim: int = 1280,
+            attention_head_dim: Union[int, Tuple[int]] = 8,
+            dual_cross_attention: bool = False,
+            use_linear_projection: bool = False,
+            class_embed_type: Optional[str] = None,
+            addition_embed_type: Optional[str] = None,
+            num_class_embeds: Optional[int] = None,
+            upcast_attention: bool = False,
+            resnet_time_scale_shift: str = "default",
+            # Additional
+            use_motion_module=False,
+            motion_module_resolutions=(1, 2, 4, 8),
+            motion_module_mid_block=False,
+            motion_module_type=None,
+            # NOTE(review): mutable default; the same dict object is shared by every
+            # call that omits this argument -- confirm no block mutates it.
+            motion_module_kwargs={},
+            # whether fuse first frame's feature
+            fuse_first_frame: bool = False,
+    ):
+        """Assemble the 3D conditional UNet.
+
+        Builds, in order: the input convolution, the timestep projection and
+        embedding, an optional class embedding, the down blocks (each paired
+        with a list of fusers), the mid block and its fuser, the up blocks, and
+        the output norm / activation / convolution.
+
+        Args:
+            only_cross_attention: A single bool is expanded to one entry per
+                down block.
+            attention_head_dim: A single int is expanded to one entry per down
+                block.
+            use_motion_module: Master switch for motion modules; a down/up block
+                gets one only if its ``res`` value is also listed in
+                ``motion_module_resolutions``.
+            motion_module_mid_block: Together with ``use_motion_module``,
+                enables the motion module in the mid block.
+            fuse_first_frame: When True, ``FusionBlock2D`` modules are created
+                for ``down_fusers`` and ``mid_fuser``; otherwise ``None``
+                placeholders are stored in the same positions.
+
+        Raises:
+            ValueError: If ``mid_block_type`` is not "UNetMidBlock3DCrossAttn".
+        """
+        super().__init__()
+        self.logger = logging.get_logger(__name__)
+        self.sample_size = sample_size
+        time_embed_dim = block_out_channels[0] * 4
+        # input
+        self.conv_in = InflatedConv3d(in_channels, block_out_channels[0], kernel_size=3, padding=(1, 1))
+        # time
+        self.time_proj = Timesteps(block_out_channels[0], flip_sin_to_cos, freq_shift)
+        timestep_input_dim = block_out_channels[0]
+        self.time_embedding = TimestepEmbedding(timestep_input_dim, time_embed_dim)
+        # class embedding
+        if class_embed_type is None and num_class_embeds is not None:
+            self.class_embedding = nn.Embedding(num_class_embeds, time_embed_dim)
+        elif class_embed_type == "timestep":
+            self.class_embedding = TimestepEmbedding(timestep_input_dim, time_embed_dim)
+        elif class_embed_type == "identity":
+            self.class_embedding = nn.Identity(time_embed_dim, time_embed_dim)
+        else:
+            self.class_embedding = None
+        self.down_blocks = nn.ModuleList([])
+        self.mid_block = None
+        self.up_blocks = nn.ModuleList([])
+        self.down_fusers = nn.ModuleList([])
+        self.mid_fuser = None
+        # down_fusers[0] is a single fuser (or None) sized for the conv_in output;
+        # the entries appended in the down loop below are per-block lists.
+        self.down_fusers.append(
+            FusionBlock2D(
+                in_channels=block_out_channels[0],
+                out_channels=block_out_channels[0],
+                temb_channels=time_embed_dim,
+                eps=norm_eps,
+                groups=norm_num_groups,
+                time_embedding_norm=resnet_time_scale_shift,
+                non_linearity=act_fn,
+            ) if fuse_first_frame else None
+        )
+        # Expand scalar settings to one value per down block.
+        if isinstance(only_cross_attention, bool):
+            only_cross_attention = [only_cross_attention] * len(down_block_types)
+        if isinstance(attention_head_dim, int):
+            attention_head_dim = (attention_head_dim,) * len(down_block_types)
+        # down
+        output_channel = block_out_channels[0]
+        for i, down_block_type in enumerate(down_block_types):
+            # Value matched against motion_module_resolutions; doubles per block.
+            res = 2 ** i
+            input_channel = output_channel
+            output_channel = block_out_channels[i]
+            is_final_block = i == len(block_out_channels) - 1
+            down_block = get_down_block(
+                down_block_type,
+                num_layers=layers_per_block,
+                in_channels=input_channel,
+                out_channels=output_channel,
+                temb_channels=time_embed_dim,
+                add_downsample=not is_final_block,
+                resnet_eps=norm_eps,
+                resnet_act_fn=act_fn,
+                resnet_groups=norm_num_groups,
+                cross_attention_dim=cross_attention_dim,
+                attn_num_head_channels=attention_head_dim[i],
+                downsample_padding=downsample_padding,
+                dual_cross_attention=dual_cross_attention,
+                use_linear_projection=use_linear_projection,
+                only_cross_attention=only_cross_attention[i],
+                upcast_attention=upcast_attention,
+                resnet_time_scale_shift=resnet_time_scale_shift,
+                use_motion_module=use_motion_module and (res in motion_module_resolutions),
+                motion_module_type=motion_module_type,
+                motion_module_kwargs=motion_module_kwargs,
+            )
+            # One fuser slot per layer, plus one more for every block except the
+            # last (the blocks that are built with add_downsample=True).
+            down_fuser = nn.ModuleList(
+                [
+                    FusionBlock2D(
+                        in_channels=output_channel,
+                        out_channels=output_channel,
+                        temb_channels=time_embed_dim,
+                        eps=norm_eps,
+                        groups=norm_num_groups,
+                        time_embedding_norm=resnet_time_scale_shift,
+                        non_linearity=act_fn,
+                    ) if fuse_first_frame else None for _ in
+                    range(layers_per_block if is_final_block else layers_per_block + 1)
+                ]
+            )
+            self.down_blocks.append(down_block)
+            self.down_fusers.append(down_fuser)
+        # mid
+        if mid_block_type == "UNetMidBlock3DCrossAttn":
+            self.mid_block = UNetMidBlock3DCrossAttn(
+                in_channels=block_out_channels[-1],
+                temb_channels=time_embed_dim,
+                resnet_eps=norm_eps,
+                resnet_act_fn=act_fn,
+                output_scale_factor=mid_block_scale_factor,
+                resnet_time_scale_shift=resnet_time_scale_shift,
+                cross_attention_dim=cross_attention_dim,
+                attn_num_head_channels=attention_head_dim[-1],
+                resnet_groups=norm_num_groups,
+                dual_cross_attention=dual_cross_attention,
+                use_linear_projection=use_linear_projection,
+                upcast_attention=upcast_attention,
+                use_motion_module=use_motion_module and motion_module_mid_block,
+                motion_module_type=motion_module_type,
+                motion_module_kwargs=motion_module_kwargs,
+            )
+        else:
+            raise ValueError(f"unknown mid_block_type : {mid_block_type}")
+        self.mid_fuser = FusionBlock2D(
+            in_channels=block_out_channels[-1],
+            out_channels=block_out_channels[-1],
+            temb_channels=time_embed_dim,
+            eps=norm_eps,
+            groups=norm_num_groups,
+            time_embedding_norm=resnet_time_scale_shift,
+            non_linearity=act_fn,
+        ) if fuse_first_frame else None
+        # count how many layers upsample the videos
+        self.num_upsamplers = 0
+        # up
+        reversed_block_out_channels = list(reversed(block_out_channels))
+        reversed_attention_head_dim = list(reversed(attention_head_dim))
+        only_cross_attention = list(reversed(only_cross_attention))
+        output_channel = reversed_block_out_channels[0]
+        for i, up_block_type in enumerate(up_block_types):
+            # NOTE(review): the literal 3 mirrors the down path's 2 ** i only when
+            # there are exactly four blocks -- confirm other depths are unsupported.
+            res = 2 ** (3 - i)
+            is_final_block = i == len(block_out_channels) - 1
+            prev_output_channel = output_channel
+            output_channel = reversed_block_out_channels[i]
+            input_channel = reversed_block_out_channels[min(i + 1, len(block_out_channels) - 1)]
+            # add upsample block for all BUT final layer
+            if not is_final_block:
+                add_upsample = True
+                self.num_upsamplers += 1
+            else:
+                add_upsample = False
+            up_block = get_up_block(
+                up_block_type,
+                num_layers=layers_per_block + 1,
+                in_channels=input_channel,
+                out_channels=output_channel,
+                prev_output_channel=prev_output_channel,
+                temb_channels=time_embed_dim,
+                add_upsample=add_upsample,
+                resnet_eps=norm_eps,
+                resnet_act_fn=act_fn,
+                resnet_groups=norm_num_groups,
+                cross_attention_dim=cross_attention_dim,
+                attn_num_head_channels=reversed_attention_head_dim[i],
+                dual_cross_attention=dual_cross_attention,
+                use_linear_projection=use_linear_projection,
+                only_cross_attention=only_cross_attention[i],
+                upcast_attention=upcast_attention,
+                resnet_time_scale_shift=resnet_time_scale_shift,
+                use_motion_module=use_motion_module and (res in motion_module_resolutions),
+                motion_module_type=motion_module_type,
+                motion_module_kwargs=motion_module_kwargs,
+            )
+            self.up_blocks.append(up_block)
+            # Redundant with the assignment at the top of the loop body; kept as-is.
+            prev_output_channel = output_channel
+        # out
+        self.conv_norm_out = nn.GroupNorm(num_channels=block_out_channels[0], num_groups=norm_num_groups, eps=norm_eps)
+        self.conv_act = nn.SiLU()
+        self.conv_out = InflatedConv3d(block_out_channels[0], out_channels, kernel_size=3, padding=1)
+    def set_image_layer_lora(self, image_layer_lora_rank: int = 128):
+        lora_attn_procs = {}
+        for name in self.attn_processors.keys():
+            self.logger.info(f"(add lora) {name}")
+            cross_attention_dim = None if name.endswith("attn1.processor") else self.config.cross_attention_dim
+            if name.startswith("mid_block"):
+                hidden_size = self.config.block_out_channels[-1]
+            elif name.startswith("up_blocks"):
+                block_id = int(name[len("up_blocks.")])
+                hidden_size = list(reversed(self.config.block_out_channels))[block_id]
+            elif name.startswith("down_blocks"):
+                block_id = int(name[len("down_blocks.")])
+                hidden_size = self.config.block_out_channels[block_id]
+            lora_attn_procs[name] = LoRAAttnProcessor(
+                hidden_size=hidden_size,
+                cross_attention_dim=cross_attention_dim,
+                rank=image_layer_lora_rank if image_layer_lora_rank > 16 else hidden_size // image_layer_lora_rank,
+            )
+        self.set_attn_processor(lora_attn_procs)
+        lora_layers = AttnProcsLayers(self.attn_processors)
+        self.logger.info(f"(lora parameters): {sum(p.numel() for p in lora_layers.parameters()) / 1e6:.3f} M")
+        del lora_layers
+    def set_image_layer_lora_scale(self, lora_scale: float = 1.0):
+        for block in self.down_blocks: setattr(block, "lora_scale", lora_scale)
+        for block in self.up_blocks:   setattr(block, "lora_scale", lora_scale)
+        setattr(self.mid_block, "lora_scale", lora_scale)
+    def set_motion_module_lora_scale(self, lora_scale: float = 1.0):
+        for block in self.down_blocks: setattr(block, "motion_lora_scale", lora_scale)
+        for block in self.up_blocks: setattr(block, "motion_lora_scale", lora_scale)
+        setattr(self.mid_block, "motion_lora_scale", lora_scale)
+    @property
+    def attn_processors(self) -> Dict[str, AttentionProcessor]:
+        r"""
+        Returns:
+            `dict` of attention processors: A dictionary containing all attention processors used in the model with
+            indexed by its weight name.
+        """
+        # set recursively
+        processors = {}
+        def fn_recursive_add_processors(name: str, module: torch.nn.Module, processors: Dict[str, AttentionProcessor]):
+            # filter out processors in motion module
+            if hasattr(module, "set_processor"):
+                if not "motion_modules." in name:
+                    processors[f"{name}.processor"] = module.processor
+            for sub_name, child in module.named_children():
+                fn_recursive_add_processors(f"{name}.{sub_name}", child, processors)
+            return processors
+        for name, module in self.named_children():
+            fn_recursive_add_processors(name, module, processors)
+        return processors
+    def set_attn_processor(self, processor: Union[AttentionProcessor, Dict[str, AttentionProcessor]]):
+        r"""
+        Sets the attention processor to use to compute attention.
+        Parameters:
+            processor (`dict` of `AttentionProcessor` or only `AttentionProcessor`):
+                The instantiated processor class or a dictionary of processor classes that will be set as the processor
+                for **all** `Attention` layers.
+                If `processor` is a dict, the key needs to define the path to the corresponding cross attention
+                processor. This is strongly recommended when setting trainable attention processors.
+        """
+        count = len(self.attn_processors.keys())
+        if isinstance(processor, dict) and len(processor) != count:
+            raise ValueError(
+                f"A dict of processors was passed, but the number of processors {len(processor)} does not match the"
+                f" number of attention layers: {count}. Please make sure to pass {count} processor classes."
+            )
+        def fn_recursive_attn_processor(name: str, module: torch.nn.Module, processor):
+            if hasattr(module, "set_processor"):
+                if not "motion_modules." in name:
+                    if not isinstance(processor, dict):
+                        module.set_processor(processor)
+                    else:
+                        module.set_processor(processor.pop(f"{name}.processor"))
+            for sub_name, child in module.named_children():
+                fn_recursive_attn_processor(f"{name}.{sub_name}", child, processor)
+        for name, module in self.named_children():
+            fn_recursive_attn_processor(name, module, processor)
+    def set_motion_module_lora_layers(self, motion_module_lora_rank: int = 32):
+        lora_attn_procs = {}
+        for name in self.mm_attn_processors.keys():
+            self.logger.info(f"(add lora) {name}")
+            cross_attention_dim = None
+            if name.startswith("mid_block"):
+                hidden_size = self.config.block_out_channels[-1]
+            elif name.startswith("up_blocks"):
+                block_id = int(name[len("up_blocks.")])
+                hidden_size = list(reversed(self.config.block_out_channels))[block_id]
+            elif name.startswith("down_blocks"):
+                block_id = int(name[len("down_blocks.")])
+                hidden_size = self.config.block_out_channels[block_id]
+            lora_attn_procs[name] = LoRAAttnProcessor(
+                hidden_size=hidden_size,
+                cross_attention_dim=cross_attention_dim,
+                rank=motion_module_lora_rank if motion_module_lora_rank > 16 else hidden_size // motion_module_lora_rank,
+            )
+        self.set_mm_attn_processor(lora_attn_procs)
+        lora_layers = AttnProcsLayers(self.mm_attn_processors)
+        return lora_layers
+    @property
+    def mm_attn_processors(self) -> Dict[str, AttentionProcessor]:
+        r"""
+        Returns:
+            `dict` of attention processors: A dictionary containing all attention processors used in the model with
+            indexed by its weight name.
+        """
+        # set recursively
+        processors = {}
+        def fn_recursive_add_processors(name: str, module: torch.nn.Module,
+                                        processors: Dict[str, AttentionProcessor]):
+            # filter out processors in motion module
+            if hasattr(module, "set_processor"):
+                if "motion_modules." in name:
+                    processors[f"{name}.processor"] = module.processor
+            for sub_name, child in module.named_children():
+                fn_recursive_add_processors(f"{name}.{sub_name}", child, processors)
+            return processors
+        for name, module in self.named_children():
+            fn_recursive_add_processors(name, module, processors)
+        return processors
+    def set_mm_attn_processor(self, processor: Union[AttentionProcessor, Dict[str, AttentionProcessor]]):
+        r"""
+        Sets the attention processor to use to compute attention.
+        Parameters:
+            processor (`dict` of `AttentionProcessor` or only `AttentionProcessor`):
+                The instantiated processor class or a dictionary of processor classes that will be set as the processor
+                for **all** `Attention` layers.
+                If `processor` is a dict, the key needs to define the path to the corresponding cross attention
+                processor. This is strongly recommended when setting trainable attention processors.
+        """
+        count = len(self.mm_attn_processors.keys())
+        if isinstance(processor, dict) and len(processor) != count:
+            raise ValueError(
+                f"A dict of processors was passed, but the number of processors {len(processor)} does not match the"
+                f" number of attention layers: {count}. Please make sure to pass {count} processor classes."
+            )
+        def fn_recursive_attn_processor(name: str, module: torch.nn.Module, processor):
+            if hasattr(module, "set_processor"):
+                if "motion_modules." in name:
+                    if not isinstance(processor, dict):
+                        module.set_processor(processor)
+                    else:
+                        module.set_processor(processor.pop(f"{name}.processor"))
+            for sub_name, child in module.named_children():
+                fn_recursive_attn_processor(f"{name}.{sub_name}", child, processor)
+        for name, module in self.named_children():
+            fn_recursive_attn_processor(name, module, processor)
+    def set_attention_slice(self, slice_size):
+        r"""
+        Enable sliced attention computation.
+        When this option is enabled, the attention module will split the input tensor in slices, to compute attention
+        in several steps. This is useful to save some memory in exchange for a small speed decrease.
+        Args:
+            slice_size (`str` or `int` or `list(int)`, *optional*, defaults to `"auto"`):
+                When `"auto"`, halves the input to the attention heads, so attention will be computed in two steps. If
+                `"max"`, maxium amount of memory will be saved by running only one slice at a time. If a number is
+                provided, uses as many slices as `attention_head_dim // slice_size`. In this case, `attention_head_dim`
+                must be a multiple of `slice_size`.
+        """
+        sliceable_head_dims = []
+        def fn_recursive_retrieve_slicable_dims(module: torch.nn.Module):
+            if hasattr(module, "set_attention_slice"):
+                sliceable_head_dims.append(module.sliceable_head_dim)
+            for child in module.children():
+                fn_recursive_retrieve_slicable_dims(child)
+        # retrieve number of attention layers
+        for module in self.children():
+            fn_recursive_retrieve_slicable_dims(module)
+        num_slicable_layers = len(sliceable_head_dims)
+        if slice_size == "auto":
+            # half the attention head size is usually a good trade-off between
+            # speed and memory
+            slice_size = [dim // 2 for dim in sliceable_head_dims]
+        elif slice_size == "max":
+            # make smallest slice possible
+            slice_size = num_slicable_layers * [1]
+        slice_size = num_slicable_layers * [slice_size] if not isinstance(slice_size, list) else slice_size
+        if len(slice_size) != len(sliceable_head_dims):
+            raise ValueError(
+                f"You have provided {len(slice_size)}, but {self.config} has {len(sliceable_head_dims)} different"
+                f" attention layers. Make sure to match `len(slice_size)` to be {len(sliceable_head_dims)}."
+            )
+        for i in range(len(slice_size)):
+            size = slice_size[i]
+            dim = sliceable_head_dims[i]
+            if size is not None and size > dim:
+                raise ValueError(f"size {size} has to be smaller or equal to {dim}.")
+        # Recursively walk through all the children.
+        # Any children which exposes the set_attention_slice method
+        # gets the message
+        def fn_recursive_set_attention_slice(module: torch.nn.Module, slice_size: List[int]):
+            if hasattr(module, "set_attention_slice"):
+                module.set_attention_slice(slice_size.pop())
+            for child in module.children():
+                fn_recursive_set_attention_slice(child, slice_size)
+        reversed_slice_size = list(reversed(slice_size))
+        for module in self.children():
+            fn_recursive_set_attention_slice(module, reversed_slice_size)
+    def _set_gradient_checkpointing(self, module, value=False):
+        if isinstance(module, (CrossAttnDownBlock3D, DownBlock3D, CrossAttnUpBlock3D, UpBlock3D)):
+            module.gradient_checkpointing = value
+    def forward(
+            self,
+            sample: torch.FloatTensor,
+            timestep: Union[torch.Tensor, float, int],
+            encoder_hidden_states: Union[torch.Tensor, List[torch.Tensor]],
+            class_labels: Optional[torch.Tensor] = None,
+            attention_mask: Optional[torch.Tensor] = None,
+            cross_attention_kwargs: Optional[Dict[str, Any]] = None,
+            return_dict: bool = True,
+            # support controlnet
+            down_block_additional_residuals: Optional[Tuple[torch.Tensor]] = None,
+            mid_block_additional_residual: Optional[torch.Tensor] = None,
+            # other features
+            motion_module_alphas: Union[tuple, float] = 1.0,
+            debug: bool = False,
+    ) -> Union[UNet3DConditionOutput, Tuple]:
+        activations = {}
+        r"""
+        Args:
+            sample (`torch.FloatTensor`): (batch, channel, height, width) noisy inputs tensor
+            timestep (`torch.FloatTensor` or `float` or `int`): (batch) timesteps
+            encoder_hidden_states (`torch.FloatTensor`): (batch, sequence_length, feature_dim) encoder hidden states
+            return_dict (`bool`, *optional*, defaults to `True`):
+                Whether or not to return a [`models.unet_2d_condition.UNet2DConditionOutput`] instead of a plain tuple.
+        Returns:
+            [`~models.unet_2d_condition.UNet2DConditionOutput`] or `tuple`:
+            [`~models.unet_2d_condition.UNet2DConditionOutput`] if `return_dict` is True, otherwise a `tuple`. When
+            returning a tuple, the first element is the sample tensor.
+        """
+        # By default samples have to be AT least a multiple of the overall upsampling factor.
+        # The overall upsampling factor is equal to 2 ** (# num of upsampling layears).
+        # However, the upsampling interpolation output size can be forced to fit any upsampling size
+        # on the fly if necessary.
+        default_overall_up_factor = 2 ** self.num_upsamplers
+        # upsample size should be forwarded when sample is not a multiple of `default_overall_up_factor`
+        forward_upsample_size = False
+        upsample_size = None
+        if any(s % default_overall_up_factor != 0 for s in sample.shape[-2:]):
+            self.logger.info("Forward upsample size to force interpolation output size.")
+            forward_upsample_size = True
+        # prepare attention_mask
+        if attention_mask is not None:
+            attention_mask = (1 - attention_mask.to(sample.dtype)) * -10000.0
+            attention_mask = attention_mask.unsqueeze(1)
+        # center input if necessary1
+        if self.config.center_input_sample:
+            sample = 2 * sample - 1.0
+        # time
+        timesteps = timestep
+        if not torch.is_tensor(timesteps):
+            # This would be a good case for the `match` statement (Python 3.10+)
+            is_mps = sample.device.type == "mps"
+            if isinstance(timestep, float):
+                dtype = torch.float32 if is_mps else torch.float64
+            else:
+                dtype = torch.int32 if is_mps else torch.int64
+            timesteps = torch.tensor([timesteps], dtype=dtype, device=sample.device)
+        elif len(timesteps.shape) == 0:
+            timesteps = timesteps[None].to(sample.device)
+        # broadcast to batch dimension in a way that's compatible with ONNX/Core ML
+        timesteps = timesteps.expand(sample.shape[0])
+        t_emb = self.time_proj(timesteps)
+        # timesteps does not contain any weights and will always return f32 tensors
+        # but time_embedding might actually be running in fp16. so we need to cast here.
+        # there might be better ways to encapsulate this.
+        t_emb = t_emb.to(dtype=self.dtype)
+        emb = self.time_embedding(t_emb)
+        if self.class_embedding is not None:
+            if class_labels is None:
+                raise ValueError("class_labels should be provided when num_class_embeds > 0")
+            if self.config.class_embed_type == "timestep":
+                class_labels = self.time_proj(class_labels)
+            class_emb = self.class_embedding(class_labels).to(dtype=self.dtype)
+            emb = emb + class_emb
+        # extend encoder_hidden_states
+        video_length = sample.shape[2]
+        encoder_hidden_states = repeat(encoder_hidden_states, "b n c -> (b f) n c", f=video_length)
+        # emb_single = emb
+        # emb = repeat(emb, "b c -> (b f) c", f=video_length)
+        # pre-process
+        sample = self.conv_in(sample)
+        activations["conv_in_out"] = sample
+        # to be fused
+        if self.down_fusers[0] != None:
+            # scale, shift   = self.down_fusers[0](sample[:,:,0].contiguous(), emb_single).unsqueeze(2).chunk(2, dim=1)
+            # sample[:,:,1:] = (1 + scale) * sample[:,:,1:].contiguous() + shift
+            fused_sample = self.down_fusers[0](
+                init_hidden_state=sample[:, :, :1].contiguous(),
+                post_hidden_states=sample[:, :, 1:].contiguous(),
+                temb=emb_single,
+            )
+            sample = torch.cat([sample[:, :, :1], fused_sample], dim=2)
+        activations["conv_in_fuse_out"] = sample
+        # down
+        down_block_res_samples = (sample,)
+        # motion module alpha
+        if isinstance(motion_module_alphas, float):
+            motion_module_alphas = (motion_module_alphas,) * 5
+        for downsample_block, down_fuser, motion_module_alpha in zip(self.down_blocks, self.down_fusers[1:],
+                                                                     motion_module_alphas[:-1]):
+            sample, res_samples = downsample_block(
+                hidden_states=sample,
+                temb=emb,
+                encoder_hidden_states=encoder_hidden_states,
+                attention_mask=attention_mask,
+                motion_module_alpha=motion_module_alpha,
+                cross_attention_kwargs=cross_attention_kwargs
+            )
+            # to be fused
+            for sample_idx, fuser in enumerate(down_fuser):
+                if fuser != None:
+                    fused_sample = fuser(
+                        init_hidden_state=res_samples[sample_idx][:, :, :1].contiguous(),
+                        post_hidden_states=res_samples[sample_idx][:, :, 1:].contiguous(),
+                        temb=emb_single,
+                    )
+                    res_samples = list(res_samples)
+                    res_samples[sample_idx] = torch.cat([res_samples[sample_idx][:, :, :1], fused_sample], dim=2)
+                    res_samples = tuple(res_samples)
+            down_block_res_samples += res_samples
+        # support controlnet
+        if down_block_additional_residuals is not None:
+            new_down_block_res_samples = ()
+            for down_block_res_sample, down_block_additional_residual in zip(
+                    down_block_res_samples, down_block_additional_residuals
+            ):
+                if len(down_block_additional_residual.shape) == 4:
+                    # b c h w
+                    # if input single condition, apply it to all frames
+                    down_block_additional_residual = down_block_additional_residual.unsqueeze(2)
+                    # boardcast will solve the problem
+                    # down_block_additional_residual = repeat(down_block_additional_residual, "b c f h w -> b c (f n) h w", n=video_length)
+                down_block_res_sample = down_block_res_sample + down_block_additional_residual
+                new_down_block_res_samples += (down_block_res_sample,)
+            down_block_res_samples = new_down_block_res_samples
+        # mid
+        sample = self.mid_block(
+            sample, emb, encoder_hidden_states=encoder_hidden_states, attention_mask=attention_mask,
+            motion_module_alpha=motion_module_alphas[-1], cross_attention_kwargs=cross_attention_kwargs
+        )
+        # mid block fuser
+        if self.mid_fuser != None:
+            fused_sample = self.mid_fuser(
+                init_hidden_state=sample[:, :, :1],
+                post_hidden_states=sample[:, :, 1:],
+                temb=emb_single,
+            )
+            sample = torch.cat([sample[:, :, :1], fused_sample], dim=2)
+        # support controlnet
+        if mid_block_additional_residual is not None:
+            if len(mid_block_additional_residual.shape) == 4:
+                mid_block_additional_residual = mid_block_additional_residual.unsqueeze(2)
+                # broadcasting will solve this problem
+                # mid_block_additional_residual = repeat(mid_block_additional_residual, "b c f h w -> b c (f n) h w", n=video_length)
+            sample = sample + mid_block_additional_residual
+        # up
+        for i, (upsample_block, motion_module_alpha) in enumerate(zip(self.up_blocks, motion_module_alphas[:-1][::-1])):
+            is_final_block = i == len(self.up_blocks) - 1
+            res_samples = down_block_res_samples[-len(upsample_block.resnets):]
+            down_block_res_samples = down_block_res_samples[: -len(upsample_block.resnets)]
+            # if we have not reached the final block and need to forward the
+            # upsample size, we do it here
+            if not is_final_block and forward_upsample_size:
+                upsample_size = down_block_res_samples[-1].shape[2:]
+            if hasattr(upsample_block, "has_cross_attention") and upsample_block.has_cross_attention:
+                sample = upsample_block(
+                    hidden_states=sample,
+                    temb=emb,
+                    res_hidden_states_tuple=res_samples,
+                    encoder_hidden_states=encoder_hidden_states,
+                    upsample_size=upsample_size,
+                    attention_mask=attention_mask,
+                    motion_module_alpha=motion_module_alpha,
+                    cross_attention_kwargs=cross_attention_kwargs
+                )
+            else:
+                sample = upsample_block(
+                    hidden_states=sample, temb=emb, res_hidden_states_tuple=res_samples, upsample_size=upsample_size,
+                    encoder_hidden_states=encoder_hidden_states, motion_module_alpha=motion_module_alpha,
+                    cross_attention_kwargs=cross_attention_kwargs
+                )
+        activations["upblocks_out"] = sample
+        # post-process
+        # frame-wise normalization
+        sample = rearrange(sample, "b c f h w -> (b f) c h w")
+        sample = self.conv_norm_out(sample)
+        sample = rearrange(sample, "(b f) c h w -> b c f h w", f=video_length)
+        sample = self.conv_act(sample)
+        sample = self.conv_out(sample)
+        if (not return_dict):
+            return (sample,)
+        elif debug:
+            return UNet3DConditionOutput(sample=sample), activations
+        else:
+            return UNet3DConditionOutput(sample=sample)
+    @classmethod
+    def from_pretrained_2d(cls, pretrained_model_path, subfolder=None, unet_additional_kwargs=None, logger=None):
+        if logger is not None:
+            logger.info(f"Loading unet's pretrained weights from {pretrained_model_path} ...")
+        if subfolder is not None:
+            pretrained_model_path = os.path.join(pretrained_model_path, subfolder)
+        config_file = os.path.join(pretrained_model_path, 'config.json')
+        if not os.path.isfile(config_file):
+            raise RuntimeError(f"{config_file} does not exist")
+        with open(config_file, "r") as f:
+            config = json.load(f)
+        config["_class_name"] = cls.__name__
+        config["down_block_types"] = [
+            "CrossAttnDownBlock3D",
+            "CrossAttnDownBlock3D",
+            "CrossAttnDownBlock3D",
+            "DownBlock3D"
+        ]
+        config["up_block_types"] = [
+            "UpBlock3D",
+            "CrossAttnUpBlock3D",
+            "CrossAttnUpBlock3D",
+            "CrossAttnUpBlock3D"
+        ]
+        from diffusers.utils import SAFETENSORS_WEIGHTS_NAME
+        model, unused_kwargs = cls.from_config(config, return_unused_kwargs=True, **unet_additional_kwargs)
+        if logger is not None:
+            logger.info(f"please check unused kwargs in 'unet_additional_kwargs' config:")
+        for k, v in unused_kwargs.items():
+            if logger is not None:
+                logger.info(f"{k:50s}: {repr(v)}")
+        model_file = os.path.join(pretrained_model_path, SAFETENSORS_WEIGHTS_NAME)
+        if not os.path.isfile(model_file):
+            raise RuntimeError(f"{model_file} does not exist")
+        state_dict = safetensors.torch.load_file(model_file, device="cpu")
+        missing, unexpected = model.load_state_dict(state_dict, strict=False)
+        if logger is not None:
+            logger.info(f"Missing keys: {len(missing)}; Unexpected keys: {len(unexpected)};")
+        assert len(unexpected) == 0
+        params = [p.numel() if "motion_modules." in n else 0 for n, p in model.named_parameters()]
+        if logger is not None:
+            logger.info(f"Motion module parameters: {sum(params) / 1e6} M")
+        return model
+class UNet3DConditionModelCameraCond(UNet3DConditionModel):
+    _supports_gradient_checkpointing = True
+    @classmethod
+    def extract_init_dict(cls, config_dict, **kwargs):
+        # Skip keys that were not present in the original config, so default __init__ values were used
+        used_defaults = config_dict.get("_use_default_values", [])
+        config_dict = {k: v for k, v in config_dict.items() if k not in used_defaults and k != "_use_default_values"}
+        # 0. Copy origin config dict
+        original_dict = dict(config_dict.items())
+        # 1. Retrieve expected config attributes from __init__ signature
+        expected_keys = cls._get_init_keys(cls)
+        expected_keys.remove("self")
+        super_expected_keys = cls._get_init_keys(UNet3DConditionModel)
+        super_expected_keys.remove("self")
+        # remove general kwargs if present in dict
+        if "kwargs" in expected_keys:
+            expected_keys.remove("kwargs")
+        if "kwargs" in super_expected_keys:
+            super_expected_keys.remove("kwargs")
+        # remove flax internal keys
+        if hasattr(cls, "_flax_internal_args"):
+            for arg in cls._flax_internal_args:
+                expected_keys.remove(arg)
+        expected_keys = expected_keys.union(super_expected_keys)
+        # remove private attributes
+        config_dict = {k: v for k, v in config_dict.items() if not k.startswith("_")}
+        # 3. Create keyword arguments that will be passed to __init__ from expected keyword arguments
+        init_dict = {}
+        for key in expected_keys:
+            # if config param is passed to kwarg and is present in config dict
+            # it should overwrite existing config dict key
+            if key in kwargs and key in config_dict:
+                config_dict[key] = kwargs.pop(key)
+            if key in kwargs:
+                # overwrite key
+                init_dict[key] = kwargs.pop(key)
+            elif key in config_dict:
+                # use value from config dict
+                init_dict[key] = config_dict.pop(key)
+        # 4. Give nice warning if unexpected values have been passed
+        if len(config_dict) > 0:
+            print(
+                f"The config attributes {config_dict} were passed to {cls.__name__}, "
+                "but are not expected and will be ignored. Please verify your "
+                f"{cls.config_name} configuration file."
+            )
+        # 6. Define unused keyword arguments
+        unused_kwargs = {**config_dict, **kwargs}
+        # 7. Define "hidden" config parameters that were saved for compatible classes
+        hidden_config_dict = {k: v for k, v in original_dict.items() if k not in init_dict}
+        return init_dict, unused_kwargs, hidden_config_dict
+    def __init__(self,
+                 decoder_add_cameracond=True,
+                 **kwargs):
+        super(UNet3DConditionModelCameraCond, self).__init__(**kwargs)
+        self.decoder_add_cameracond = decoder_add_cameracond
+    def set_all_attn_processor(self,
+                               add_spatial=False,
+                               spatial_attn_names='attn1',
+                               add_temporal=False,
+                               add_spatial_lora=True,
+                               add_motion_lora=False,
+                               temporal_attn_names='0',
+                               camera_feature_dimensions=[320, 640, 1280, 1280],
+                               lora_kwargs={},
+                               motion_lora_kwargs={},
+                               **attention_processor_kwargs):
+        lora_rank = lora_kwargs.pop('lora_rank')
+        motion_lora_rank = motion_lora_kwargs.pop('lora_rank')
+        spatial_attn_procs = {}
+        if add_spatial:
+            set_processor_names = spatial_attn_names.split(',')
+            for name in self.attn_processors.keys():
+                attention_name = name.split('.')[-2]
+                cross_attention_dim = None if attention_name == 'attn1' else self.config.cross_attention_dim
+                if name.startswith("mid_block"):
+                    hidden_size = self.config.block_out_channels[-1]
+                    block_id = -1
+                    add_camera_adaptor = attention_name in set_processor_names
+                    camera_feature_dim = camera_feature_dimensions[block_id] if add_camera_adaptor else None
+                elif name.startswith("up_blocks"):
+                    block_id = int(name[len("up_blocks.")])
+                    hidden_size = list(reversed(self.config.block_out_channels))[block_id]
+                    add_camera_adaptor = attention_name in set_processor_names
+                    camera_feature_dim = list(reversed(camera_feature_dimensions))[block_id] if add_camera_adaptor else None
+                else:
+                    assert name.startswith("down_blocks")
+                    block_id = int(name[len("down_blocks.")])
+                    hidden_size = self.config.block_out_channels[block_id]
+                    add_camera_adaptor = attention_name in set_processor_names
+                    camera_feature_dim = camera_feature_dimensions[block_id] if add_camera_adaptor else None
+                if add_camera_adaptor and add_spatial_lora:
+                    spatial_attn_procs[name] = LORACameraAdaptorAttnProcessor(hidden_size=hidden_size,
+                                                                            camera_feature_dim=camera_feature_dim,
+                                                                            cross_attention_dim=cross_attention_dim,
+                                                                            rank=lora_rank if lora_rank > 16 else hidden_size // lora_rank,
+                                                                            **attention_processor_kwargs,
+                                                                            **lora_kwargs)
+                elif add_camera_adaptor:
+                    spatial_attn_procs[name] = CameraAdaptorAttnProcessor(hidden_size=hidden_size,
+                                                                        camera_feature_dim=camera_feature_dim,
+                                                                        cross_attention_dim=cross_attention_dim,
+                                                                        **attention_processor_kwargs)
+                elif add_spatial_lora:
+                    spatial_attn_procs[name] = CustomizedLoRAAttnProcessor(hidden_size=hidden_size,
+                                                                           cross_attention_dim=cross_attention_dim,
+                                                                           rank=lora_rank if lora_rank > 16 else hidden_size // lora_rank)
+                else:
+                    spatial_attn_procs[name] = CustomizedAttnProcessor()
+        elif (not add_spatial) and add_spatial_lora:
+            for name in self.attn_processors.keys():
+                cross_attention_dim = None if name.endswith("attn1.processor") else self.config.cross_attention_dim
+                if name.startswith("mid_block"):
+                    hidden_size = self.config.block_out_channels[-1]
+                elif name.startswith("up_blocks"):
+                    block_id = int(name[len("up_blocks.")])
+                    hidden_size = list(reversed(self.config.block_out_channels))[block_id]
+                elif name.startswith("down_blocks"):
+                    block_id = int(name[len("down_blocks.")])
+                    hidden_size = self.config.block_out_channels[block_id]
+                spatial_attn_procs[name] = CustomizedLoRAAttnProcessor(
+                    hidden_size=hidden_size,
+                    cross_attention_dim=cross_attention_dim,
+                    rank=lora_rank if lora_rank > 16 else hidden_size // lora_rank,
+                )
+        else:
+            for name in self.attn_processors.keys():
+                spatial_attn_procs[name] = CustomizedAttnProcessor()
+        self.set_attn_processor(spatial_attn_procs)
+        mm_attn_procs = {}
+        if add_temporal:
+            set_processor_names = temporal_attn_names.split(',')
+            cross_attention_dim = None
+            for name in self.mm_attn_processors.keys():
+                attention_name = name.split('.')[-2]
+                if name.startswith("mid_block"):
+                    hidden_size = self.config.block_out_channels[-1]
+                    block_id = -1
+                    add_camera_adaptor = attention_name in set_processor_names
+                    camera_feature_dim = camera_feature_dimensions[block_id] if add_camera_adaptor else None
+                elif name.startswith("up_blocks"):
+                    block_id = int(name[len("up_blocks.")])
+                    hidden_size = list(reversed(self.config.block_out_channels))[block_id]
+                    add_camera_adaptor = (attention_name in set_processor_names) and self.decoder_add_cameracond
+                    camera_feature_dim = list(reversed(camera_feature_dimensions))[block_id] if add_camera_adaptor else None
+                elif name.startswith("down_blocks"):
+                    block_id = int(name[len("down_blocks.")])
+                    hidden_size = self.config.block_out_channels[block_id]
+                    add_camera_adaptor = attention_name in set_processor_names
+                    camera_feature_dim = camera_feature_dimensions[block_id] if add_camera_adaptor else None
+                if add_camera_adaptor and add_motion_lora:
+                    mm_attn_procs[name] = LORACameraAdaptorAttnProcessor(hidden_size=hidden_size,
+                                                                       camera_feature_dim=camera_feature_dim,
+                                                                       cross_attention_dim=cross_attention_dim,
+                                                                       rank=motion_lora_rank if motion_lora_rank > 16 else hidden_size // motion_lora_rank,
+                                                                       **attention_processor_kwargs,
+                                                                       **motion_lora_kwargs)
+                elif add_camera_adaptor:
+                    mm_attn_procs[name] = CameraAdaptorAttnProcessor(hidden_size=hidden_size,
+                                                                   camera_feature_dim=camera_feature_dim,
+                                                                   cross_attention_dim=cross_attention_dim,
+                                                                   **attention_processor_kwargs)
+                elif add_motion_lora:
+                    mm_attn_procs[name] = CustomizedLoRAAttnProcessor(hidden_size=hidden_size,
+                                                                      cross_attention_dim=cross_attention_dim,
+                                                                      rank=motion_lora_rank if motion_lora_rank > 16 else hidden_size // motion_lora_rank)
+                else:
+                    mm_attn_procs[name] = CustomizedAttnProcessor()
+        elif (not add_temporal) and add_motion_lora:
+            for name in self.mm_attn_processors.keys():
+                cross_attention_dim = None
+                if name.startswith("mid_block"):
+                    hidden_size = self.config.block_out_channels[-1]
+                elif name.startswith("up_blocks"):
+                    block_id = int(name[len("up_blocks.")])
+                    hidden_size = list(reversed(self.config.block_out_channels))[block_id]
+                elif name.startswith("down_blocks"):
+                    block_id = int(name[len("down_blocks.")])
+                    hidden_size = self.config.block_out_channels[block_id]
+                mm_attn_procs[name] = CustomizedLoRAAttnProcessor(
+                    hidden_size=hidden_size,
+                    cross_attention_dim=cross_attention_dim,
+                    rank=motion_lora_rank if motion_lora_rank > 16 else hidden_size // motion_lora_rank,
+                )
+        else:
+            for name in self.mm_attn_processors.keys():
+                mm_attn_procs[name] = CustomizedAttnProcessor()
+        self.set_mm_attn_processor(mm_attn_procs)
+    def forward(
+            self,
+            sample: torch.FloatTensor,
+            timestep: Union[torch.Tensor, float, int],
+            encoder_hidden_states: Union[torch.Tensor, List[torch.Tensor]],
+            class_labels: Optional[torch.Tensor] = None,
+            attention_mask: Optional[torch.Tensor] = None,
+            cross_attention_kwargs: Optional[Dict[str, Any]] = None,
+            camera_embedding_features: List[torch.Tensor] = None,
+            return_dict: bool = True,
+            # support controlnet
+            down_block_additional_residuals: Optional[Tuple[torch.Tensor]] = None,
+            mid_block_additional_residual: Optional[torch.Tensor] = None,
+            # other features
+            motion_module_alphas: Union[tuple, float] = 1.0,
+            debug: bool = False,
+    ) -> Union[UNet3DConditionOutput, Tuple]:
+        activations = {}
+        default_overall_up_factor = 2 ** self.num_upsamplers
+        # upsample size should be forwarded when sample is not a multiple of `default_overall_up_factor`
+        forward_upsample_size = False
+        upsample_size = None
+        if any(s % default_overall_up_factor != 0 for s in sample.shape[-2:]):
+            self.logger.info("Forward upsample size to force interpolation output size.")
+            forward_upsample_size = True
+        # prepare attention_mask
+        if attention_mask is not None:
+            attention_mask = (1 - attention_mask.to(sample.dtype)) * -10000.0
+            attention_mask = attention_mask.unsqueeze(1)
+        # center input if necessary1
+        if self.config.center_input_sample:
+            sample = 2 * sample - 1.0
+        # time
+        timesteps = timestep
+        if not torch.is_tensor(timesteps):
+            # This would be a good case for the `match` statement (Python 3.10+)
+            is_mps = sample.device.type == "mps"
+            if isinstance(timestep, float):
+                dtype = torch.float32 if is_mps else torch.float64
+            else:
+                dtype = torch.int32 if is_mps else torch.int64
+            timesteps = torch.tensor([timesteps], dtype=dtype, device=sample.device)
+        elif len(timesteps.shape) == 0:
+            timesteps = timesteps[None].to(sample.device)
+        # broadcast to batch dimension in a way that's compatible with ONNX/Core ML
+        timesteps = timesteps.expand(sample.shape[0])
+        t_emb = self.time_proj(timesteps)
+        # timesteps does not contain any weights and will always return f32 tensors
+        # but time_embedding might actually be running in fp16. so we need to cast here.
+        # there might be better ways to encapsulate this.
+        t_emb = t_emb.to(dtype=self.dtype)
+        emb = self.time_embedding(t_emb)
+        if self.class_embedding is not None:
+            if class_labels is None:
+                raise ValueError("class_labels should be provided when num_class_embeds > 0")
+            if self.config.class_embed_type == "timestep":
+                class_labels = self.time_proj(class_labels)
+            class_emb = self.class_embedding(class_labels).to(dtype=self.dtype)
+            emb = emb + class_emb
+        # extend encoder_hidden_states
+        video_length = sample.shape[2]
+        encoder_hidden_states = repeat(encoder_hidden_states, "b n c -> (b f) n c", f=video_length)
+        # pre-process
+        sample = self.conv_in(sample)           # b c f h w
+        activations["conv_in_out"] = sample
+        # to be fused
+        if self.down_fusers[0] != None:
+            # scale, shift   = self.down_fusers[0](sample[:,:,0].contiguous(), emb_single).unsqueeze(2).chunk(2, dim=1)
+            # sample[:,:,1:] = (1 + scale) * sample[:,:,1:].contiguous() + shift
+            fused_sample = self.down_fusers[0](
+                init_hidden_state=sample[:, :, :1].contiguous(),
+                post_hidden_states=sample[:, :, 1:].contiguous(),
+                temb=emb_single,
+            )
+            sample = torch.cat([sample[:, :, :1], fused_sample], dim=2)
+        activations["conv_in_fuse_out"] = sample
+        # down
+        down_block_res_samples = (sample,)
+        # motion module alpha
+        if isinstance(motion_module_alphas, float):
+            motion_module_alphas = (motion_module_alphas,) * 5
+        for downsample_block, camera_embedding_feature, down_fuser, motion_module_alpha in zip(self.down_blocks,
+                                                                                             camera_embedding_features,
+                                                                                             self.down_fusers[1:],
+                                                                                             motion_module_alphas[:-1]):
+            if hasattr(downsample_block, "has_cross_attention") and downsample_block.has_cross_attention:
+                sample, res_samples = downsample_block(
+                    hidden_states=sample,
+                    temb=emb,
+                    encoder_hidden_states=encoder_hidden_states,
+                    attention_mask=attention_mask,
+                    motion_module_alpha=motion_module_alpha,
+                    cross_attention_kwargs=cross_attention_kwargs.update({"camera_feature": camera_embedding_feature})
+                    if cross_attention_kwargs is not None else {"camera_feature": camera_embedding_feature},
+                    motion_cross_attention_kwargs={"camera_feature": camera_embedding_feature}
+                )
+            else:
+                sample, res_samples = downsample_block(
+                    hidden_states=sample,
+                    temb=emb,
+                    motion_module_alpha=motion_module_alpha,
+                    cross_attention_kwargs=cross_attention_kwargs.update({"camera_feature": camera_embedding_feature})
+                    if cross_attention_kwargs is not None else {"camera_feature": camera_embedding_feature},
+                    motion_cross_attention_kwargs={"camera_feature": camera_embedding_feature}
+                )
+            # to be fused
+            for sample_idx, fuser in enumerate(down_fuser):
+                if fuser != None:
+                    fused_sample = fuser(
+                        init_hidden_state=res_samples[sample_idx][:, :, :1].contiguous(),
+                        post_hidden_states=res_samples[sample_idx][:, :, 1:].contiguous(),
+                        temb=emb_single,
+                    )
+                    res_samples = list(res_samples)
+                    res_samples[sample_idx] = torch.cat([res_samples[sample_idx][:, :, :1], fused_sample], dim=2)
+                    res_samples = tuple(res_samples)
+            down_block_res_samples += res_samples
+        # support controlnet
+        if down_block_additional_residuals is not None:
+            new_down_block_res_samples = ()
+            for down_block_res_sample, down_block_additional_residual in zip(
+                    down_block_res_samples, down_block_additional_residuals
+            ):
+                if len(down_block_additional_residual.shape) == 4:
+                    # b c h w
+                    # if input single condition, apply it to all frames
+                    down_block_additional_residual = down_block_additional_residual.unsqueeze(2)
+                    # boardcast will solve the problem
+                    # down_block_additional_residual = repeat(down_block_additional_residual, "b c f h w -> b c (f n) h w", n=video_length)
+                down_block_res_sample = down_block_res_sample + down_block_additional_residual
+                new_down_block_res_samples += (down_block_res_sample,)
+            down_block_res_samples = new_down_block_res_samples
+        # mid
+        sample = self.mid_block(
+            sample,
+            emb,
+            encoder_hidden_states=encoder_hidden_states,
+            attention_mask=attention_mask,
+            motion_module_alpha=motion_module_alphas[-1],
+            cross_attention_kwargs=cross_attention_kwargs.update({"camera_feature": camera_embedding_features[-1]})
+            if cross_attention_kwargs is not None else {"camera_feature": camera_embedding_features[-1]},
+            motion_cross_attention_kwargs={"camera_feature": camera_embedding_features[-1]}
+        )
+        # mid block fuser
+        if self.mid_fuser != None:
+            fused_sample = self.mid_fuser(
+                init_hidden_state=sample[:, :, :1],
+                post_hidden_states=sample[:, :, 1:],
+                temb=emb_single,
+            )
+            sample = torch.cat([sample[:, :, :1], fused_sample], dim=2)
+        # support controlnet
+        if mid_block_additional_residual is not None:
+            if len(mid_block_additional_residual.shape) == 4:
+                mid_block_additional_residual = mid_block_additional_residual.unsqueeze(2)
+                # boardcast will solve this problemq
+                # mid_block_additional_residual = repeat(mid_block_additional_residual, "b c f h w -> b c (f n) h w", n=video_length)
+            sample = sample + mid_block_additional_residual
+        # up
+        for i, (upsample_block, motion_module_alpha) in enumerate(zip(self.up_blocks, motion_module_alphas[:-1][::-1])):
+            is_final_block = i == len(self.up_blocks) - 1
+            camera_embedding_feature = camera_embedding_features[-(i+1)] if self.decoder_add_cameracond else None
+            res_samples = down_block_res_samples[-len(upsample_block.resnets):]
+            down_block_res_samples = down_block_res_samples[: -len(upsample_block.resnets)]
+            # if we have not reached the final block and need to forward the
+            # upsample size, we do it here
+            if not is_final_block and forward_upsample_size:
+                upsample_size = down_block_res_samples[-1].shape[2:]
+            if self.decoder_add_cameracond:
+                if hasattr(upsample_block, "has_cross_attention") and upsample_block.has_cross_attention:
+                    sample = upsample_block(
+                        hidden_states=sample,
+                        temb=emb,
+                        res_hidden_states_tuple=res_samples,
+                        encoder_hidden_states=encoder_hidden_states,
+                        upsample_size=upsample_size,
+                        attention_mask=attention_mask,
+                        motion_module_alpha=motion_module_alpha,
+                        cross_attention_kwargs=cross_attention_kwargs.update({"camera_feature":camera_embedding_feature})
+                        if cross_attention_kwargs is not None else {"camera_feature": camera_embedding_feature},
+                        motion_cross_attention_kwargs={"camera_feature": camera_embedding_feature}
+                    )
+                else:
+                    sample = upsample_block(
+                        hidden_states=sample,
+                        temb=emb,
+                        res_hidden_states_tuple=res_samples,
+                        upsample_size=upsample_size,
+                        motion_module_alpha=motion_module_alpha,
+                        cross_attention_kwargs=cross_attention_kwargs.update({"camera_feature": camera_embedding_feature})
+                        if cross_attention_kwargs is not None else {"camera_feature": camera_embedding_feature},
+                        motion_cross_attention_kwargs={"camera_feature": camera_embedding_feature}
+                    )
+            else:
+                if hasattr(upsample_block, "has_cross_attention") and upsample_block.has_cross_attention:
+                    sample = upsample_block(
+                        hidden_states=sample,
+                        temb=emb,
+                        res_hidden_states_tuple=res_samples,
+                        encoder_hidden_states=encoder_hidden_states,
+                        upsample_size=upsample_size,
+                        attention_mask=attention_mask,
+                        motion_module_alpha=motion_module_alpha,
+                        cross_attention_kwargs=cross_attention_kwargs,
+                    )
+                else:
+                    sample = upsample_block(
+                        hidden_states=sample,
+                        temb=emb,
+                        res_hidden_states_tuple=res_samples,
+                        upsample_size=upsample_size,
+                        motion_module_alpha=motion_module_alpha,
+                        cross_attention_kwargs=cross_attention_kwargs
+                    )
+        activations["upblocks_out"] = sample
+        # post-process
+        # frame-wise normalization
+        sample = rearrange(sample, "b c f h w -> (b f) c h w")
+        sample = self.conv_norm_out(sample)
+        sample = rearrange(sample, "(b f) c h w -> b c f h w", f=video_length)
+        sample = self.conv_act(sample)
+        sample = self.conv_out(sample)
+        if (not return_dict):
+            return (sample,)
+        elif debug:
+            return UNet3DConditionOutput(sample=sample), activations
+        else:
+            return UNet3DConditionOutput(sample=sample)

genphoto/models/unet_blocks.py CHANGED Viewed

@@ -1,3 +1,818 @@
-version https://git-lfs.github.com/spec/v1
-oid sha256:767e2392b19861d964d37159b591b9d489abc9a30332fb1a337694d7f3a94f28
-size 34808

+# Adapted from https://github.com/huggingface/diffusers/blob/main/src/diffusers/models/unet_2d_blocks.py
+import torch
+from torch import nn
+from einops import rearrange, repeat
+from diffusers.models.resnet import Downsample2D, Upsample2D, ResnetBlock2D
+from diffusers.models.transformer_2d import Transformer2DModel
+from genphoto.models.motion_module import get_motion_module
+def get_down_block(
+        down_block_type,
+        num_layers,
+        in_channels,
+        out_channels,
+        temb_channels,
+        add_downsample,
+        resnet_eps,
+        resnet_act_fn,
+        attn_num_head_channels,
+        resnet_groups=None,
+        cross_attention_dim=None,
+        downsample_padding=None,
+        dual_cross_attention=False,
+        use_linear_projection=False,
+        only_cross_attention=False,
+        upcast_attention=False,
+        resnet_time_scale_shift="default",
+        use_motion_module=None,
+        motion_module_type=None,
+        motion_module_kwargs=None,
+):
+    down_block_type = down_block_type[7:] if down_block_type.startswith("UNetRes") else down_block_type
+    if down_block_type == "DownBlock3D":
+        return DownBlock3D(
+            num_layers=num_layers,
+            in_channels=in_channels,
+            out_channels=out_channels,
+            temb_channels=temb_channels,
+            add_downsample=add_downsample,
+            resnet_eps=resnet_eps,
+            resnet_act_fn=resnet_act_fn,
+            resnet_groups=resnet_groups,
+            downsample_padding=downsample_padding,
+            resnet_time_scale_shift=resnet_time_scale_shift,
+            use_motion_module=use_motion_module,
+            motion_module_type=motion_module_type,
+            motion_module_kwargs=motion_module_kwargs,
+        )
+    elif down_block_type == "CrossAttnDownBlock3D":
+        if cross_attention_dim is None:
+            raise ValueError("cross_attention_dim must be specified for CrossAttnDownBlock3D")
+        return CrossAttnDownBlock3D(
+            num_layers=num_layers,
+            in_channels=in_channels,
+            out_channels=out_channels,
+            temb_channels=temb_channels,
+            add_downsample=add_downsample,
+            resnet_eps=resnet_eps,
+            resnet_act_fn=resnet_act_fn,
+            resnet_groups=resnet_groups,
+            downsample_padding=downsample_padding,
+            cross_attention_dim=cross_attention_dim,
+            attn_num_head_channels=attn_num_head_channels,
+            dual_cross_attention=dual_cross_attention,
+            use_linear_projection=use_linear_projection,
+            only_cross_attention=only_cross_attention,
+            upcast_attention=upcast_attention,
+            resnet_time_scale_shift=resnet_time_scale_shift,
+            use_motion_module=use_motion_module,
+            motion_module_type=motion_module_type,
+            motion_module_kwargs=motion_module_kwargs,
+        )
+    raise ValueError(f"{down_block_type} does not exist.")
+def get_up_block(
+        up_block_type,
+        num_layers,
+        in_channels,
+        out_channels,
+        prev_output_channel,
+        temb_channels,
+        add_upsample,
+        resnet_eps,
+        resnet_act_fn,
+        attn_num_head_channels,
+        resnet_groups=None,
+        cross_attention_dim=None,
+        dual_cross_attention=False,
+        use_linear_projection=False,
+        only_cross_attention=False,
+        upcast_attention=False,
+        resnet_time_scale_shift="default",
+        use_motion_module=None,
+        motion_module_type=None,
+        motion_module_kwargs=None,
+):
+    up_block_type = up_block_type[7:] if up_block_type.startswith("UNetRes") else up_block_type
+    if up_block_type == "UpBlock3D":
+        return UpBlock3D(
+            num_layers=num_layers,
+            in_channels=in_channels,
+            out_channels=out_channels,
+            prev_output_channel=prev_output_channel,
+            temb_channels=temb_channels,
+            add_upsample=add_upsample,
+            resnet_eps=resnet_eps,
+            resnet_act_fn=resnet_act_fn,
+            resnet_groups=resnet_groups,
+            resnet_time_scale_shift=resnet_time_scale_shift,
+            use_motion_module=use_motion_module,
+            motion_module_type=motion_module_type,
+            motion_module_kwargs=motion_module_kwargs,
+        )
+    elif up_block_type == "CrossAttnUpBlock3D":
+        if cross_attention_dim is None:
+            raise ValueError("cross_attention_dim must be specified for CrossAttnUpBlock3D")
+        return CrossAttnUpBlock3D(
+            num_layers=num_layers,
+            in_channels=in_channels,
+            out_channels=out_channels,
+            prev_output_channel=prev_output_channel,
+            temb_channels=temb_channels,
+            add_upsample=add_upsample,
+            resnet_eps=resnet_eps,
+            resnet_act_fn=resnet_act_fn,
+            resnet_groups=resnet_groups,
+            cross_attention_dim=cross_attention_dim,
+            attn_num_head_channels=attn_num_head_channels,
+            dual_cross_attention=dual_cross_attention,
+            use_linear_projection=use_linear_projection,
+            only_cross_attention=only_cross_attention,
+            upcast_attention=upcast_attention,
+            resnet_time_scale_shift=resnet_time_scale_shift,
+            use_motion_module=use_motion_module,
+            motion_module_type=motion_module_type,
+            motion_module_kwargs=motion_module_kwargs,
+        )
+    raise ValueError(f"{up_block_type} does not exist.")
+class UNetMidBlock3DCrossAttn(nn.Module):
+    def __init__(
+            self,
+            in_channels: int,
+            temb_channels: int,
+            dropout: float = 0.0,
+            num_layers: int = 1,
+            resnet_eps: float = 1e-6,
+            resnet_time_scale_shift: str = "default",
+            resnet_act_fn: str = "swish",
+            resnet_groups: int = 32,
+            resnet_pre_norm: bool = True,
+            attn_num_head_channels=1,
+            output_scale_factor=1.0,
+            cross_attention_dim=1280,
+            dual_cross_attention=False,
+            use_linear_projection=False,
+            upcast_attention=False,
+            use_motion_module=None,
+            motion_module_type=None,
+            motion_module_kwargs=None,
+    ):
+        super().__init__()
+        self.has_cross_attention = True
+        self.attn_num_head_channels = attn_num_head_channels
+        resnet_groups = resnet_groups if resnet_groups is not None else min(in_channels // 4, 32)
+        # there is always at least one resnet
+        resnets = [
+            ResnetBlock2D(
+                in_channels=in_channels,
+                out_channels=in_channels,
+                temb_channels=temb_channels,
+                eps=resnet_eps,
+                groups=resnet_groups,
+                dropout=dropout,
+                time_embedding_norm=resnet_time_scale_shift,
+                non_linearity=resnet_act_fn,
+                output_scale_factor=output_scale_factor,
+                pre_norm=resnet_pre_norm,
+            )
+        ]
+        attentions = []
+        motion_modules = []
+        for _ in range(num_layers):
+            if dual_cross_attention: raise NotImplementedError
+            attentions.append(
+                Transformer2DModel(
+                    attn_num_head_channels,
+                    in_channels // attn_num_head_channels,
+                    in_channels=in_channels,
+                    num_layers=1,
+                    cross_attention_dim=cross_attention_dim,
+                    norm_num_groups=resnet_groups,
+                    use_linear_projection=use_linear_projection,
+                    upcast_attention=upcast_attention,
+                )
+            )
+            motion_modules.append(
+                get_motion_module(
+                    in_channels=in_channels,
+                    motion_module_type=motion_module_type,
+                    motion_module_kwargs=motion_module_kwargs,
+                ) if use_motion_module else None
+            )
+            resnets.append(
+                ResnetBlock2D(
+                    in_channels=in_channels,
+                    out_channels=in_channels,
+                    temb_channels=temb_channels,
+                    eps=resnet_eps,
+                    groups=resnet_groups,
+                    dropout=dropout,
+                    time_embedding_norm=resnet_time_scale_shift,
+                    non_linearity=resnet_act_fn,
+                    output_scale_factor=output_scale_factor,
+                    pre_norm=resnet_pre_norm,
+                )
+            )
+        self.attentions = nn.ModuleList(attentions)
+        self.resnets = nn.ModuleList(resnets)
+        self.motion_modules = nn.ModuleList(motion_modules) if use_motion_module else motion_modules
+    def forward(self, hidden_states, temb=None, encoder_hidden_states=None, attention_mask=None,
+                motion_module_alpha=1., cross_attention_kwargs=None, motion_cross_attention_kwargs=None):
+        video_length = hidden_states.shape[2]
+        temb_repeated = repeat(temb, "b c -> (b f) c", f=video_length)
+        hidden_states = rearrange(hidden_states, "b c f h w -> (b f) c h w")
+        hidden_states = self.resnets[0](hidden_states, temb_repeated)
+        hidden_states = rearrange(hidden_states, "(b f) c h w -> b c f h w", f=video_length)
+        lora_scale = getattr(self, "lora_scale", None)
+        if lora_scale != None:
+            cross_attention_kwargs = {"scale": lora_scale}
+        motion_lora_scale = getattr(self, "motion_lora_scale", None)
+        if motion_lora_scale != None:
+            if motion_cross_attention_kwargs is None:
+                motion_cross_attention_kwargs = {"scale": motion_lora_scale}
+            else:
+                motion_cross_attention_kwargs.update({"scale": motion_lora_scale})
+        for attn, resnet, motion_module in zip(self.attentions, self.resnets[1:], self.motion_modules):
+            hidden_states = rearrange(hidden_states, "b c f h w -> (b f) c h w")
+            hidden_states = attn(hidden_states, encoder_hidden_states=encoder_hidden_states,
+                                 cross_attention_kwargs=cross_attention_kwargs).sample
+            hidden_states = rearrange(hidden_states, "(b f) c h w -> b c f h w", f=video_length)
+            # motion module
+            if motion_module is not None:
+                # hidden_states = motion_module_alpha * motion_module(hidden_states, temb=temb, encoder_hidden_states=encoder_hidden_states) + hidden_states
+                hidden_states = motion_module(hidden_states, temb=temb, encoder_hidden_states=encoder_hidden_states, cross_attention_kwargs=motion_cross_attention_kwargs)
+            hidden_states = rearrange(hidden_states, "b c f h w -> (b f) c h w")
+            hidden_states = resnet(hidden_states, temb_repeated)
+            hidden_states = rearrange(hidden_states, "(b f) c h w -> b c f h w", f=video_length)
+        return hidden_states
+class CrossAttnDownBlock3D(nn.Module):
+    def __init__(
+            self,
+            in_channels: int,
+            out_channels: int,
+            temb_channels: int,
+            dropout: float = 0.0,
+            num_layers: int = 1,
+            resnet_eps: float = 1e-6,
+            resnet_time_scale_shift: str = "default",
+            resnet_act_fn: str = "swish",
+            resnet_groups: int = 32,
+            resnet_pre_norm: bool = True,
+            attn_num_head_channels=1,
+            cross_attention_dim=1280,
+            output_scale_factor=1.0,
+            downsample_padding=1,
+            add_downsample=True,
+            dual_cross_attention=False,
+            use_linear_projection=False,
+            only_cross_attention=False,
+            upcast_attention=False,
+            use_motion_module=None,
+            motion_module_type=None,
+            motion_module_kwargs=None,
+    ):
+        super().__init__()
+        resnets = []
+        attentions = []
+        motion_modules = []
+        self.has_cross_attention = True
+        self.attn_num_head_channels = attn_num_head_channels
+        for i in range(num_layers):
+            in_channels = in_channels if i == 0 else out_channels
+            resnets.append(
+                ResnetBlock2D(
+                    in_channels=in_channels,
+                    out_channels=out_channels,
+                    temb_channels=temb_channels,
+                    eps=resnet_eps,
+                    groups=resnet_groups,
+                    dropout=dropout,
+                    time_embedding_norm=resnet_time_scale_shift,
+                    non_linearity=resnet_act_fn,
+                    output_scale_factor=output_scale_factor,
+                    pre_norm=resnet_pre_norm,
+                )
+            )
+            if dual_cross_attention:
+                raise NotImplementedError
+            attentions.append(
+                Transformer2DModel(
+                    attn_num_head_channels,
+                    out_channels // attn_num_head_channels,
+                    in_channels=out_channels,
+                    num_layers=1,
+                    cross_attention_dim=cross_attention_dim,
+                    norm_num_groups=resnet_groups,
+                    use_linear_projection=use_linear_projection,
+                    only_cross_attention=only_cross_attention,
+                    upcast_attention=upcast_attention,
+                )
+            )
+            motion_modules.append(
+                get_motion_module(
+                    in_channels=out_channels,
+                    motion_module_type=motion_module_type,
+                    motion_module_kwargs=motion_module_kwargs,
+                ) if use_motion_module else None
+            )
+        self.attentions = nn.ModuleList(attentions)
+        self.resnets = nn.ModuleList(resnets)
+        self.motion_modules = nn.ModuleList(motion_modules) if use_motion_module else motion_modules
+        if add_downsample:
+            self.downsamplers = nn.ModuleList(
+                [
+                    Downsample2D(
+                        out_channels, use_conv=True, out_channels=out_channels, padding=downsample_padding, name="op"
+                    )
+                ]
+            )
+        else:
+            self.downsamplers = None
+        self.gradient_checkpointing = False
+    def forward(self, hidden_states, temb=None, encoder_hidden_states=None, attention_mask=None,
+                motion_module_alpha=1., cross_attention_kwargs={}, motion_cross_attention_kwargs={}):
+        video_length = hidden_states.shape[2]
+        temb_repeated = repeat(temb, "b c -> (b f) c", f=video_length)
+        output_states = ()
+        lora_scale = getattr(self, "lora_scale", None)
+        if lora_scale != None:
+            cross_attention_kwargs["scale"] = lora_scale
+        motion_lora_scale = getattr(self, "motion_lora_scale", None)
+        if motion_lora_scale != None:
+            if motion_cross_attention_kwargs is None:
+                motion_cross_attention_kwargs = {"scale": motion_lora_scale}
+            else:
+                motion_cross_attention_kwargs.update({"scale": motion_lora_scale})
+        for resnet, attn, motion_module in zip(self.resnets, self.attentions, self.motion_modules):
+            if self.training and self.gradient_checkpointing:
+                raise NotImplementedError
+                def create_custom_forward(module, return_dict=None):
+                    def custom_forward(*inputs):
+                        if return_dict is not None:
+                            return module(*inputs, return_dict=return_dict)
+                        else:
+                            return module(*inputs)
+                    return custom_forward
+                hidden_states = torch.utils.checkpoint.checkpoint(create_custom_forward(resnet), hidden_states, temb)
+                hidden_states = torch.utils.checkpoint.checkpoint(
+                    create_custom_forward(attn, return_dict=False),
+                    hidden_states,
+                    encoder_hidden_states,
+                )[0]
+                if motion_module is not None:
+                    hidden_states = torch.utils.checkpoint.checkpoint(create_custom_forward(motion_module),
+                                                                      hidden_states.requires_grad_(), temb,
+                                                                      encoder_hidden_states)
+            else:
+                hidden_states = rearrange(hidden_states, "b c f h w -> (b f) c h w")
+                hidden_states = resnet(hidden_states, temb_repeated)
+                hidden_states = rearrange(hidden_states, "(b f) c h w -> b c f h w", f=video_length)
+                hidden_states = rearrange(hidden_states, "b c f h w -> (b f) c h w")
+                hidden_states = attn(hidden_states, encoder_hidden_states=encoder_hidden_states,
+                                     cross_attention_kwargs=cross_attention_kwargs).sample
+                hidden_states = rearrange(hidden_states, "(b f) c h w -> b c f h w", f=video_length)
+                # motion module
+                if motion_module is not None:
+                    # hidden_states = motion_module_alpha * motion_module(hidden_states, temb=temb, encoder_hidden_states=encoder_hidden_states) + hidden_states
+                    hidden_states = motion_module(hidden_states, temb=temb, encoder_hidden_states=encoder_hidden_states, cross_attention_kwargs=motion_cross_attention_kwargs)
+            output_states += (hidden_states,)
+        if self.downsamplers is not None:
+            for downsampler in self.downsamplers:
+                hidden_states = rearrange(hidden_states, "b c f h w -> (b f) c h w")
+                hidden_states = downsampler(hidden_states)
+                hidden_states = rearrange(hidden_states, "(b f) c h w -> b c f h w", f=video_length)
+            output_states += (hidden_states,)
+        return hidden_states, output_states
+class DownBlock3D(nn.Module):
+    def __init__(
+            self,
+            in_channels: int,
+            out_channels: int,
+            temb_channels: int,
+            dropout: float = 0.0,
+            num_layers: int = 1,
+            resnet_eps: float = 1e-6,
+            resnet_time_scale_shift: str = "default",
+            resnet_act_fn: str = "swish",
+            resnet_groups: int = 32,
+            resnet_pre_norm: bool = True,
+            output_scale_factor=1.0,
+            add_downsample=True,
+            downsample_padding=1,
+            use_motion_module=None,
+            motion_module_type=None,
+            motion_module_kwargs=None,
+    ):
+        super().__init__()
+        resnets = []
+        motion_modules = []
+        for i in range(num_layers):
+            in_channels = in_channels if i == 0 else out_channels
+            resnets.append(
+                ResnetBlock2D(
+                    in_channels=in_channels,
+                    out_channels=out_channels,
+                    temb_channels=temb_channels,
+                    eps=resnet_eps,
+                    groups=resnet_groups,
+                    dropout=dropout,
+                    time_embedding_norm=resnet_time_scale_shift,
+                    non_linearity=resnet_act_fn,
+                    output_scale_factor=output_scale_factor,
+                    pre_norm=resnet_pre_norm,
+                )
+            )
+            motion_modules.append(
+                get_motion_module(
+                    in_channels=out_channels,
+                    motion_module_type=motion_module_type,
+                    motion_module_kwargs=motion_module_kwargs,
+                ) if use_motion_module else None
+            )
+        self.resnets = nn.ModuleList(resnets)
+        self.motion_modules = nn.ModuleList(motion_modules) if use_motion_module else motion_modules
+        if add_downsample:
+            self.downsamplers = nn.ModuleList(
+                [
+                    Downsample2D(
+                        out_channels, use_conv=True, out_channels=out_channels, padding=downsample_padding, name="op"
+                    )
+                ]
+            )
+        else:
+            self.downsamplers = None
+        self.gradient_checkpointing = False
+    def forward(self, hidden_states, temb=None, encoder_hidden_states=None, motion_module_alpha=1.,
+                motion_cross_attention_kwargs={}, **kwargs):
+        video_length = hidden_states.shape[2]
+        temb_repeated = repeat(temb, "b c -> (b f) c", f=video_length)
+        output_states = ()
+        motion_lora_scale = getattr(self, "motion_lora_scale", None)
+        if motion_lora_scale != None:
+            if motion_cross_attention_kwargs is None:
+                motion_cross_attention_kwargs = {"scale": motion_lora_scale}
+            else:
+                motion_cross_attention_kwargs.update({"scale": motion_lora_scale})
+        for resnet, motion_module in zip(self.resnets, self.motion_modules):
+            if self.training and self.gradient_checkpointing:
+                raise NotImplementedError
+                def create_custom_forward(module):
+                    def custom_forward(*inputs):
+                        return module(*inputs)
+                    return custom_forward
+                hidden_states = torch.utils.checkpoint.checkpoint(create_custom_forward(resnet), hidden_states, temb)
+                if motion_module is not None:
+                    hidden_states = torch.utils.checkpoint.checkpoint(create_custom_forward(motion_module),
+                                                                      hidden_states.requires_grad_(), temb,
+                                                                      encoder_hidden_states)
+            else:
+                hidden_states = rearrange(hidden_states, "b c f h w -> (b f) c h w")
+                hidden_states = resnet(hidden_states, temb_repeated)
+                hidden_states = rearrange(hidden_states, "(b f) c h w -> b c f h w", f=video_length)
+                # motion module
+                if motion_module is not None:
+                    hidden_states = motion_module(hidden_states, temb=temb, encoder_hidden_states=encoder_hidden_states, cross_attention_kwargs=motion_cross_attention_kwargs)
+            output_states += (hidden_states,)
+        if self.downsamplers is not None:
+            for downsampler in self.downsamplers:
+                hidden_states = rearrange(hidden_states, "b c f h w -> (b f) c h w")
+                hidden_states = downsampler(hidden_states)
+                hidden_states = rearrange(hidden_states, "(b f) c h w -> b c f h w", f=video_length)
+            output_states += (hidden_states,)
+        return hidden_states, output_states
+class CrossAttnUpBlock3D(nn.Module):
+    def __init__(
+            self,
+            in_channels: int,
+            out_channels: int,
+            prev_output_channel: int,
+            temb_channels: int,
+            dropout: float = 0.0,
+            num_layers: int = 1,
+            resnet_eps: float = 1e-6,
+            resnet_time_scale_shift: str = "default",
+            resnet_act_fn: str = "swish",
+            resnet_groups: int = 32,
+            resnet_pre_norm: bool = True,
+            attn_num_head_channels=1,
+            cross_attention_dim=1280,
+            output_scale_factor=1.0,
+            add_upsample=True,
+            dual_cross_attention=False,
+            use_linear_projection=False,
+            only_cross_attention=False,
+            upcast_attention=False,
+            use_motion_module=None,
+            motion_module_type=None,
+            motion_module_kwargs=None,
+    ):
+        super().__init__()
+        resnets = []
+        attentions = []
+        motion_modules = []
+        self.has_cross_attention = True
+        self.attn_num_head_channels = attn_num_head_channels
+        for i in range(num_layers):
+            res_skip_channels = in_channels if (i == num_layers - 1) else out_channels
+            resnet_in_channels = prev_output_channel if i == 0 else out_channels
+            resnets.append(
+                ResnetBlock2D(
+                    in_channels=resnet_in_channels + res_skip_channels,
+                    out_channels=out_channels,
+                    temb_channels=temb_channels,
+                    eps=resnet_eps,
+                    groups=resnet_groups,
+                    dropout=dropout,
+                    time_embedding_norm=resnet_time_scale_shift,
+                    non_linearity=resnet_act_fn,
+                    output_scale_factor=output_scale_factor,
+                    pre_norm=resnet_pre_norm,
+                )
+            )
+            if dual_cross_attention:
+                raise NotImplementedError
+            attentions.append(
+                Transformer2DModel(
+                    attn_num_head_channels,
+                    out_channels // attn_num_head_channels,
+                    in_channels=out_channels,
+                    num_layers=1,
+                    cross_attention_dim=cross_attention_dim,
+                    norm_num_groups=resnet_groups,
+                    use_linear_projection=use_linear_projection,
+                    only_cross_attention=only_cross_attention,
+                    upcast_attention=upcast_attention,
+                )
+            )
+            motion_modules.append(
+                get_motion_module(
+                    in_channels=out_channels,
+                    motion_module_type=motion_module_type,
+                    motion_module_kwargs=motion_module_kwargs,
+                ) if use_motion_module else None
+            )
+        self.attentions = nn.ModuleList(attentions)
+        self.resnets = nn.ModuleList(resnets)
+        self.motion_modules = nn.ModuleList(motion_modules) if use_motion_module else motion_modules
+        if add_upsample:
+            self.upsamplers = nn.ModuleList([Upsample2D(out_channels, use_conv=True, out_channels=out_channels)])
+        else:
+            self.upsamplers = None
+        self.gradient_checkpointing = False
+    def forward(
+            self,
+            hidden_states,
+            res_hidden_states_tuple,
+            temb=None,
+            encoder_hidden_states=None,
+            upsample_size=None,
+            attention_mask=None,
+            motion_module_alpha=1.,
+            cross_attention_kwargs=None,
+            motion_cross_attention_kwargs={}
+    ):
+        video_length = hidden_states.shape[2]
+        temb_repeated = repeat(temb, "b c -> (b f) c", f=video_length)
+        lora_scale = getattr(self, "lora_scale", None)
+        if lora_scale != None:
+            cross_attention_kwargs = {"scale": lora_scale}
+        motion_lora_scale = getattr(self, "motion_lora_scale", None)
+        if motion_lora_scale != None:
+            if motion_cross_attention_kwargs is None:
+                motion_cross_attention_kwargs = {"scale": motion_lora_scale}
+            else:
+                motion_cross_attention_kwargs.update({"scale": motion_lora_scale})
+        for resnet, attn, motion_module in zip(self.resnets, self.attentions, self.motion_modules):
+            # pop res hidden states
+            res_hidden_states = res_hidden_states_tuple[-1]
+            res_hidden_states_tuple = res_hidden_states_tuple[:-1]
+            hidden_states = torch.cat([hidden_states, res_hidden_states], dim=1)
+            if self.training and self.gradient_checkpointing:
+                raise NotImplementedError
+                def create_custom_forward(module, return_dict=None):
+                    def custom_forward(*inputs):
+                        if return_dict is not None:
+                            return module(*inputs, return_dict=return_dict)
+                        else:
+                            return module(*inputs)
+                    return custom_forward
+                hidden_states = torch.utils.checkpoint.checkpoint(create_custom_forward(resnet), hidden_states, temb)
+                hidden_states = torch.utils.checkpoint.checkpoint(
+                    create_custom_forward(attn, return_dict=False),
+                    hidden_states,
+                    encoder_hidden_states,
+                )[0]
+                if motion_module is not None:
+                    hidden_states = torch.utils.checkpoint.checkpoint(create_custom_forward(motion_module),
+                                                                      hidden_states.requires_grad_(), temb,
+                                                                      encoder_hidden_states)
+            else:
+                hidden_states = rearrange(hidden_states, "b c f h w -> (b f) c h w")
+                hidden_states = resnet(hidden_states, temb_repeated)
+                hidden_states = rearrange(hidden_states, "(b f) c h w -> b c f h w", f=video_length)
+                hidden_states = rearrange(hidden_states, "b c f h w -> (b f) c h w")
+                hidden_states = attn(hidden_states, encoder_hidden_states=encoder_hidden_states,
+                                     cross_attention_kwargs=cross_attention_kwargs).sample
+                hidden_states = rearrange(hidden_states, "(b f) c h w -> b c f h w", f=video_length)
+                # motion module
+                if motion_module is not None:
+                    # hidden_states = motion_module_alpha * motion_module(hidden_states, temb=temb, encoder_hidden_states=encoder_hidden_states) + hidden_states
+                    hidden_states = motion_module(hidden_states, temb=temb, encoder_hidden_states=encoder_hidden_states, cross_attention_kwargs=motion_cross_attention_kwargs)
+        if self.upsamplers is not None:
+            for upsampler in self.upsamplers:
+                hidden_states = rearrange(hidden_states, "b c f h w -> (b f) c h w")
+                hidden_states = upsampler(hidden_states, upsample_size)
+                hidden_states = rearrange(hidden_states, "(b f) c h w -> b c f h w", f=video_length)
+        return hidden_states
+class UpBlock3D(nn.Module):
+    def __init__(
+            self,
+            in_channels: int,
+            prev_output_channel: int,
+            out_channels: int,
+            temb_channels: int,
+            dropout: float = 0.0,
+            num_layers: int = 1,
+            resnet_eps: float = 1e-6,
+            resnet_time_scale_shift: str = "default",
+            resnet_act_fn: str = "swish",
+            resnet_groups: int = 32,
+            resnet_pre_norm: bool = True,
+            output_scale_factor=1.0,
+            add_upsample=True,
+            use_motion_module=None,
+            motion_module_type=None,
+            motion_module_kwargs=None,
+    ):
+        super().__init__()
+        resnets = []
+        motion_modules = []
+        for i in range(num_layers):
+            res_skip_channels = in_channels if (i == num_layers - 1) else out_channels
+            resnet_in_channels = prev_output_channel if i == 0 else out_channels
+            resnets.append(
+                ResnetBlock2D(
+                    in_channels=resnet_in_channels + res_skip_channels,
+                    out_channels=out_channels,
+                    temb_channels=temb_channels,
+                    eps=resnet_eps,
+                    groups=resnet_groups,
+                    dropout=dropout,
+                    time_embedding_norm=resnet_time_scale_shift,
+                    non_linearity=resnet_act_fn,
+                    output_scale_factor=output_scale_factor,
+                    pre_norm=resnet_pre_norm,
+                )
+            )
+            motion_modules.append(
+                get_motion_module(
+                    in_channels=out_channels,
+                    motion_module_type=motion_module_type,
+                    motion_module_kwargs=motion_module_kwargs,
+                ) if use_motion_module else None
+            )
+        self.resnets = nn.ModuleList(resnets)
+        self.motion_modules = nn.ModuleList(motion_modules) if use_motion_module else motion_modules
+        if add_upsample:
+            self.upsamplers = nn.ModuleList([Upsample2D(out_channels, use_conv=True, out_channels=out_channels)])
+        else:
+            self.upsamplers = None
+        self.gradient_checkpointing = False
+    def forward(self, hidden_states, res_hidden_states_tuple, temb=None, upsample_size=None, encoder_hidden_states=None,
+                motion_module_alpha=1., motion_cross_attention_kwargs={}, **kwargs):
+        video_length = hidden_states.shape[2]
+        temb_repeated = repeat(temb, "b c -> (b f) c", f=video_length)
+        motion_lora_scale = getattr(self, "motion_lora_scale", None)
+        if motion_lora_scale != None:
+            if motion_cross_attention_kwargs is None:
+                motion_cross_attention_kwargs = {"scale": motion_lora_scale}
+            else:
+                motion_cross_attention_kwargs.update({"scale": motion_lora_scale})
+        for resnet, motion_module in zip(self.resnets, self.motion_modules):
+            # pop res hidden states
+            res_hidden_states = res_hidden_states_tuple[-1]
+            res_hidden_states_tuple = res_hidden_states_tuple[:-1]
+            hidden_states = torch.cat([hidden_states, res_hidden_states], dim=1)
+            if self.training and self.gradient_checkpointing:
+                raise NotImplementedError
+                def create_custom_forward(module):
+                    def custom_forward(*inputs):
+                        return module(*inputs)
+                    return custom_forward
+                hidden_states = torch.utils.checkpoint.checkpoint(create_custom_forward(resnet), hidden_states, temb)
+                if motion_module is not None:
+                    hidden_states = torch.utils.checkpoint.checkpoint(create_custom_forward(motion_module),
+                                                                      hidden_states.requires_grad_(), temb,
+                                                                      encoder_hidden_states)
+            else:
+                hidden_states = rearrange(hidden_states, "b c f h w -> (b f) c h w")
+                hidden_states = resnet(hidden_states, temb_repeated)
+                hidden_states = rearrange(hidden_states, "(b f) c h w -> b c f h w", f=video_length)
+                # motion module
+                if motion_module is not None:
+                    hidden_states = motion_module(hidden_states, temb=temb, encoder_hidden_states=encoder_hidden_states, cross_attention_kwargs=motion_cross_attention_kwargs)
+        if self.upsamplers is not None:
+            for upsampler in self.upsamplers:
+                hidden_states = rearrange(hidden_states, "b c f h w -> (b f) c h w")
+                hidden_states = upsampler(hidden_states, upsample_size)
+                hidden_states = rearrange(hidden_states, "(b f) c h w -> b c f h w", f=video_length)
+        return hidden_states

genphoto/pipelines/pipeline_animation.py CHANGED Viewed

@@ -1,3 +1,719 @@
-version https://git-lfs.github.com/spec/v1
-oid sha256:453fc7220c98fbe0fa70b19aade5b4403e470c09efed70147f2fcf35dd782d5b
-size 34090

+# Adapted from https://github.com/showlab/Tune-A-Video/blob/main/tuneavideo/pipelines/pipeline_tuneavideo.py
+import inspect
+import torch
+import numpy as np
+from typing import Callable, List, Optional, Union
+from dataclasses import dataclass
+from diffusers.utils import is_accelerate_available
+from packaging import version
+from einops import rearrange
+from transformers import CLIPTextModel, CLIPTokenizer
+from diffusers.configuration_utils import FrozenDict
+from diffusers.models import AutoencoderKL
+from diffusers.pipelines.pipeline_utils import DiffusionPipeline
+from diffusers.schedulers import (
+    DDIMScheduler,
+    DPMSolverMultistepScheduler,
+    EulerAncestralDiscreteScheduler,
+    EulerDiscreteScheduler,
+    LMSDiscreteScheduler,
+    PNDMScheduler,
+)
+from diffusers.loaders import LoraLoaderMixin
+from diffusers.utils import deprecate, logging, BaseOutput
+from genphoto.models.camera_adaptor import CameraCameraEncoder
+from genphoto.models.unet import UNet3DConditionModel
+logger = logging.get_logger(__name__)  # pylint: disable=invalid-name
+@dataclass
+class AnimationPipelineOutput(BaseOutput):
+    """Return container for the pipelines defined in this module."""
+    # Decoded result: the numpy array produced by `decode_latents`, or a torch
+    # tensor when the pipeline is called with `output_type="tensor"`.
+    videos: Union[torch.Tensor, np.ndarray]
+class AnimationPipeline(DiffusionPipeline, LoraLoaderMixin):
+    _optional_components = []
+    def __init__(
+        self,
+        vae: AutoencoderKL,
+        text_encoder: CLIPTextModel,
+        tokenizer: CLIPTokenizer,
+        unet: UNet3DConditionModel,
+        scheduler: Union[
+            DDIMScheduler,
+            PNDMScheduler,
+            LMSDiscreteScheduler,
+            EulerDiscreteScheduler,
+            EulerAncestralDiscreteScheduler,
+            DPMSolverMultistepScheduler,
+        ],
+    ):
+        """Register the pipeline components, patching outdated configs first.
+
+        A scheduler config with `steps_offset != 1` or `clip_sample is True`, and a
+        UNet config from diffusers < 0.9.0 with `sample_size < 64`, each emit a
+        deprecation warning and are overwritten in place (`_internal_dict`) before
+        the modules are registered.
+        """
+        super().__init__()
+        # Force `steps_offset` to 1 on schedulers whose config carries another value.
+        if hasattr(scheduler.config, "steps_offset") and scheduler.config.steps_offset != 1:
+            deprecation_message = (
+                f"The configuration file of this scheduler: {scheduler} is outdated. `steps_offset`"
+                f" should be set to 1 instead of {scheduler.config.steps_offset}. Please make sure "
+                "to update the config accordingly as leaving `steps_offset` might led to incorrect results"
+                " in future versions. If you have downloaded this checkpoint from the Hugging Face Hub,"
+                " it would be very nice if you could open a Pull request for the `scheduler/scheduler_config.json`"
+                " file"
+            )
+            deprecate("steps_offset!=1", "1.0.0", deprecation_message, standard_warn=False)
+            new_config = dict(scheduler.config)
+            new_config["steps_offset"] = 1
+            scheduler._internal_dict = FrozenDict(new_config)
+        # Force `clip_sample` to False on schedulers that have it enabled.
+        if hasattr(scheduler.config, "clip_sample") and scheduler.config.clip_sample is True:
+            deprecation_message = (
+                f"The configuration file of this scheduler: {scheduler} has not set the configuration `clip_sample`."
+                " `clip_sample` should be set to False in the configuration file. Please make sure to update the"
+                " config accordingly as not setting `clip_sample` in the config might lead to incorrect results in"
+                " future versions. If you have downloaded this checkpoint from the Hugging Face Hub, it would be very"
+                " nice if you could open a Pull request for the `scheduler/scheduler_config.json` file"
+            )
+            deprecate("clip_sample not set", "1.0.0", deprecation_message, standard_warn=False)
+            new_config = dict(scheduler.config)
+            new_config["clip_sample"] = False
+            scheduler._internal_dict = FrozenDict(new_config)
+        # Only the base version is compared, so dev/rc suffixes of the recorded
+        # diffusers version are ignored.
+        is_unet_version_less_0_9_0 = hasattr(unet.config, "_diffusers_version") and version.parse(
+            version.parse(unet.config._diffusers_version).base_version
+        ) < version.parse("0.9.0.dev0")
+        is_unet_sample_size_less_64 = hasattr(unet.config, "sample_size") and unet.config.sample_size < 64
+        # Old UNet configs with a small `sample_size` are rewritten to 64.
+        if is_unet_version_less_0_9_0 and is_unet_sample_size_less_64:
+            deprecation_message = (
+                "The configuration file of the unet has set the default `sample_size` to smaller than"
+                " 64 which seems highly unlikely. If your checkpoint is a fine-tuned version of any of the"
+                " following: \n- CompVis/stable-diffusion-v1-4 \n- CompVis/stable-diffusion-v1-3 \n-"
+                " CompVis/stable-diffusion-v1-2 \n- CompVis/stable-diffusion-v1-1 \n- runwayml/stable-diffusion-v1-5"
+                " \n- runwayml/stable-diffusion-inpainting \n you should change 'sample_size' to 64 in the"
+                " configuration file. Please make sure to update the config accordingly as leaving `sample_size=32`"
+                " in the config might lead to incorrect results in future versions. If you have downloaded this"
+                " checkpoint from the Hugging Face Hub, it would be very nice if you could open a Pull request for"
+                " the `unet/config.json` file"
+            )
+            deprecate("sample_size<64", "1.0.0", deprecation_message, standard_warn=False)
+            new_config = dict(unet.config)
+            new_config["sample_size"] = 64
+            unet._internal_dict = FrozenDict(new_config)
+        self.register_modules(
+            vae=vae,
+            text_encoder=text_encoder,
+            tokenizer=tokenizer,
+            unet=unet,
+            scheduler=scheduler,
+        )
+        # Ratio between pixel and latent resolution: one factor of 2 for every VAE
+        # block after the first.
+        self.vae_scale_factor = 2 ** (len(self.vae.config.block_out_channels) - 1)
+    def enable_vae_slicing(self):
+        """Turn on sliced decoding by forwarding to `self.vae.enable_slicing()`."""
+        self.vae.enable_slicing()
+    def disable_vae_slicing(self):
+        """Turn off sliced decoding by forwarding to `self.vae.disable_slicing()`."""
+        self.vae.disable_slicing()
+    def enable_sequential_cpu_offload(self, gpu_id=0):
+        """Hand the UNet, text encoder and VAE to accelerate's `cpu_offload`.
+
+        Args:
+            gpu_id: Index of the CUDA device passed to `cpu_offload`.
+
+        Raises:
+            ImportError: If `accelerate` is not available.
+        """
+        if not is_accelerate_available():
+            raise ImportError("Please install accelerate via `pip install accelerate`")
+        from accelerate import cpu_offload
+        target = torch.device(f"cuda:{gpu_id}")
+        components = (self.unet, self.text_encoder, self.vae)
+        for component in components:
+            # Components that are not set are simply skipped.
+            if component is None:
+                continue
+            cpu_offload(component, target)
+    @property
+    def _execution_device(self):
+        """Return the device the pipeline's models are executed on.
+
+        `self.device` is returned unless the pipeline sits on the `meta` device and
+        the UNet carries an `_hf_hook`; in that case the first submodule hook with a
+        non-None `execution_device` decides, falling back to `self.device`.
+        """
+        on_meta_with_hook = self.device == torch.device("meta") and hasattr(self.unet, "_hf_hook")
+        if not on_meta_with_hook:
+            return self.device
+        for submodule in self.unet.modules():
+            hook = getattr(submodule, "_hf_hook", None)
+            exec_device = getattr(hook, "execution_device", None)
+            if exec_device is not None:
+                return torch.device(exec_device)
+        return self.device
+    def _encode_prompt(self, prompt, device, num_videos_per_prompt, do_classifier_free_guidance, negative_prompt):
+        """Turn prompt(s) into text-encoder hidden states.
+
+        Args:
+            prompt: A string or list of strings to encode.
+            device: Device the token ids (and attention mask, if used) are moved to.
+            num_videos_per_prompt: How many times each prompt embedding is repeated.
+            do_classifier_free_guidance: When true, unconditional embeddings are
+                computed too and placed in front of the conditional ones.
+            negative_prompt: Text for the unconditional branch; `None` means empty
+                strings. Must have the same type as `prompt` (and the same length
+                when it is a list).
+
+        Returns:
+            The conditional embeddings, or `[unconditional, conditional]` concatenated
+            along the batch axis when guidance is enabled.
+
+        Raises:
+            TypeError: If `negative_prompt` and `prompt` differ in type.
+            ValueError: If a list `negative_prompt` does not match the prompt count.
+        """
+        encoder_cfg = self.text_encoder.config
+        wants_mask = hasattr(encoder_cfg, "use_attention_mask") and encoder_cfg.use_attention_mask
+        def mask_for(tokenized):
+            # The attention mask is only forwarded when the encoder config asks for it.
+            if wants_mask:
+                return tokenized.attention_mask.to(device)
+            return None
+        num_prompts = len(prompt) if isinstance(prompt, list) else 1
+        tokenized = self.tokenizer(
+            prompt,
+            padding="max_length",
+            max_length=self.tokenizer.model_max_length,
+            truncation=True,
+            return_tensors="pt",
+        )
+        token_ids = tokenized.input_ids
+        # Tokenize again without truncation so that dropped text can be reported.
+        full_ids = self.tokenizer(prompt, padding="longest", return_tensors="pt").input_ids
+        was_truncated = full_ids.shape[-1] >= token_ids.shape[-1] and not torch.equal(token_ids, full_ids)
+        if was_truncated:
+            dropped_text = self.tokenizer.batch_decode(full_ids[:, self.tokenizer.model_max_length - 1 : -1])
+            logger.warning(
+                "The following part of your input was truncated because CLIP can only handle sequences up to"
+                f" {self.tokenizer.model_max_length} tokens: {dropped_text}"
+            )
+        cond_mask = mask_for(tokenized)
+        cond = self.text_encoder(token_ids.to(device), attention_mask=cond_mask)[0]
+        # Repeat along the sequence axis and reshape (mps friendly) so every prompt
+        # appears `num_videos_per_prompt` times in the batch axis.
+        n_embed, n_tokens, _ = cond.shape
+        cond = cond.repeat(1, num_videos_per_prompt, 1)
+        cond = cond.view(n_embed * num_videos_per_prompt, n_tokens, -1)
+        if not do_classifier_free_guidance:
+            return cond
+        # Pick the texts for the unconditional branch.
+        uncond_tokens: List[str]
+        if negative_prompt is None:
+            uncond_tokens = [""] * num_prompts
+        elif type(prompt) is not type(negative_prompt):
+            raise TypeError(
+                f"`negative_prompt` should be the same type to `prompt`, but got {type(negative_prompt)} !="
+                f" {type(prompt)}."
+            )
+        elif isinstance(negative_prompt, str):
+            uncond_tokens = [negative_prompt]
+        elif num_prompts != len(negative_prompt):
+            raise ValueError(
+                f"`negative_prompt`: {negative_prompt} has batch size {len(negative_prompt)}, but `prompt`:"
+                f" {prompt} has batch size {num_prompts}. Please make sure that passed `negative_prompt` matches"
+                " the batch size of `prompt`."
+            )
+        else:
+            uncond_tokens = negative_prompt
+        uncond_tokenized = self.tokenizer(
+            uncond_tokens,
+            padding="max_length",
+            max_length=token_ids.shape[-1],
+            truncation=True,
+            return_tensors="pt",
+        )
+        uncond_mask = mask_for(uncond_tokenized)
+        uncond = self.text_encoder(uncond_tokenized.input_ids.to(device), attention_mask=uncond_mask)[0]
+        uncond_len = uncond.shape[1]
+        uncond = uncond.repeat(1, num_videos_per_prompt, 1)
+        uncond = uncond.view(num_prompts * num_videos_per_prompt, uncond_len, -1)
+        # One concatenated batch lets the UNet serve both guidance branches in a
+        # single forward pass.
+        return torch.cat([uncond, cond])
+    def decode_latents(self, latents):
+        """Decode `b c f h w` latents into a numpy video with values in [0, 1].
+
+        The latents are divided by 0.18215, flattened to individual frames and
+        pushed through `self.vae.decode` one frame at a time (rather than in a
+        single batched call); the frames are then regrouped into videos.
+        """
+        num_frames = latents.shape[2]
+        scaled = 1 / 0.18215 * latents
+        flat_frames = rearrange(scaled, "b c f h w -> (b f) c h w")
+        decoded = [
+            self.vae.decode(flat_frames[idx:idx + 1]).sample
+            for idx in range(flat_frames.shape[0])
+        ]
+        video = rearrange(torch.cat(decoded), "(b f) c h w -> b c f h w", f=num_frames)
+        # Map the decoder output from [-1, 1] to [0, 1].
+        video = (video / 2 + 0.5).clamp(0, 1)
+        # Always cast to float32: the overhead is negligible and it is compatible with bfloat16.
+        return video.cpu().float().numpy()
+    def prepare_extra_step_kwargs(self, generator, eta):
+        """Collect the optional keyword arguments that `self.scheduler.step` accepts.
+
+        Schedulers differ in their `step` signature, so `eta` and `generator` are
+        only included when the signature names them. `eta` corresponds to η in the
+        DDIM paper (https://arxiv.org/abs/2010.02502) and should lie in [0, 1].
+
+        Returns:
+            A dict containing `eta` and/or `generator`, or an empty dict.
+        """
+        step_params = inspect.signature(self.scheduler.step).parameters
+        extra_step_kwargs = {}
+        for name, value in (("eta", eta), ("generator", generator)):
+            if name in step_params:
+                extra_step_kwargs[name] = value
+        return extra_step_kwargs
+    def check_inputs(self, prompt, height, width, callback_steps):
+        """Validate the user-facing call arguments.
+
+        Raises:
+            ValueError: If `prompt` is neither `str` nor `list`, if `height` or
+                `width` is not a multiple of 8, or if `callback_steps` is not a
+                positive integer (this includes `None`).
+        """
+        if not isinstance(prompt, (str, list)):
+            raise ValueError(f"`prompt` has to be of type `str` or `list` but is {type(prompt)}")
+        if any(side % 8 != 0 for side in (height, width)):
+            raise ValueError(f"`height` and `width` have to be divisible by 8 but are {height} and {width}.")
+        callback_steps_ok = isinstance(callback_steps, int) and callback_steps > 0
+        if not callback_steps_ok:
+            raise ValueError(
+                f"`callback_steps` has to be a positive integer but is {callback_steps} of type"
+                f" {type(callback_steps)}."
+            )
+    def prepare_latents(self, batch_size, num_channels_latents, video_length, height, width, dtype, device, generator, latents=None):
+        """Create or validate the initial latents and scale them for the scheduler.
+
+        Args:
+            batch_size: Number of samples (videos) to draw.
+            num_channels_latents: Channel count of the latent tensor.
+            video_length: Number of frames.
+            height: Pixel height; divided by `self.vae_scale_factor` for the latents.
+            width: Pixel width; divided by `self.vae_scale_factor` for the latents.
+            dtype: Dtype of freshly sampled noise.
+            device: Target `torch.device`; noise is sampled on CPU when it is `mps`.
+            generator: A `torch.Generator`, a list with one generator per sample, or None.
+            latents: Optional pre-made latents of the expected shape.
+
+        Returns:
+            Latents of shape `(batch_size, channels, frames, h, w)` multiplied by
+            `self.scheduler.init_noise_sigma`.
+
+        Raises:
+            ValueError: If the generator list length differs from `batch_size`, or
+                if supplied `latents` do not have the expected shape.
+        """
+        shape = (batch_size, num_channels_latents, video_length, height // self.vae_scale_factor, width // self.vae_scale_factor)
+        if isinstance(generator, list) and len(generator) != batch_size:
+            raise ValueError(
+                f"You have passed a list of generators of length {len(generator)}, but requested an effective batch"
+                f" size of {batch_size}. Make sure the batch size matches the length of the generators."
+            )
+        if latents is None:
+            rand_device = "cpu" if device.type == "mps" else device
+            if isinstance(generator, list):
+                # One generator per sample: draw every sample with batch dimension 1 so
+                # that concatenation yields `batch_size` samples. Drawing the full batched
+                # shape per generator would produce `batch_size ** 2` samples.
+                per_sample_shape = (1,) + shape[1:]
+                latents = [
+                    torch.randn(per_sample_shape, generator=sample_generator, device=rand_device, dtype=dtype)
+                    for sample_generator in generator
+                ]
+                latents = torch.cat(latents, dim=0).to(device)
+            else:
+                latents = torch.randn(shape, generator=generator, device=rand_device, dtype=dtype).to(device)
+        else:
+            if latents.shape != shape:
+                raise ValueError(f"Unexpected latents shape, got {latents.shape}, expected {shape}")
+            latents = latents.to(device)
+        # scale the initial noise by the standard deviation required by the scheduler
+        latents = latents * self.scheduler.init_noise_sigma
+        return latents
+    @torch.no_grad()
+    def __call__(
+        self,
+        prompt: Union[str, List[str]],
+        video_length: Optional[int],
+        height: Optional[int] = None,
+        width: Optional[int] = None,
+        num_inference_steps: int = 50,
+        guidance_scale: float = 7.5,
+        negative_prompt: Optional[Union[str, List[str]]] = None,
+        num_videos_per_prompt: Optional[int] = 1,
+        eta: float = 0.0,
+        generator: Optional[Union[torch.Generator, List[torch.Generator]]] = None,
+        latents: Optional[torch.FloatTensor] = None,
+        output_type: Optional[str] = "tensor",
+        return_dict: bool = True,
+        callback: Optional[Callable[[int, int, torch.FloatTensor], None]] = None,
+        callback_steps: Optional[int] = 1,
+        multidiff_total_steps: int = 1,
+        multidiff_overlaps: int = 12,
+        **kwargs,
+    ):
+        """Generate a video from text, denoising the frame axis in overlapping windows.
+
+        The latents hold `multidiff_total_steps * (video_length - multidiff_overlaps)
+        + multidiff_overlaps` frames. At every timestep the UNet is run once per
+        window of `video_length` frames, and predictions on frames covered by
+        several windows are averaged before the scheduler step.
+
+        Args:
+            prompt: Prompt string or list of prompts; a string is repeated to the batch size.
+            video_length: Number of frames in one UNet window.
+            height: Output height; defaults to `unet.config.sample_size * vae_scale_factor`.
+            width: Output width; same default as `height`.
+            num_inference_steps: Passed to `scheduler.set_timesteps`.
+            guidance_scale: Classifier-free guidance is enabled when this is > 1.0.
+            negative_prompt: Optional text for the unconditional branch.
+            num_videos_per_prompt: Videos generated per prompt.
+            eta: Forwarded to `scheduler.step` when its signature accepts it.
+            generator: Used to sample the initial latents and forwarded to
+                `scheduler.step` when accepted.
+            latents: Optional initial latents; they set the batch size unless
+                `prompt` is a list.
+            output_type: `"tensor"` converts the decoded numpy video to a torch tensor.
+            return_dict: When False the video itself is returned instead of the output object.
+            callback: Called as `callback(i, t, latents)` on progress updates.
+            callback_steps: The callback only fires when `i % callback_steps == 0`.
+            multidiff_total_steps: Number of windows along the frame axis.
+            multidiff_overlaps: Frames shared by consecutive windows.
+            **kwargs: Accepted but not used in this method.
+
+        Returns:
+            `AnimationPipelineOutput` holding the video, or the bare video when
+            `return_dict` is False.
+        """
+        # Default height and width to unet
+        height = height or self.unet.config.sample_size * self.vae_scale_factor
+        width = width or self.unet.config.sample_size * self.vae_scale_factor
+        # Check inputs. Raise error if not correct
+        self.check_inputs(prompt, height, width, callback_steps)
+        # Define call parameters
+        # batch_size = 1 if isinstance(prompt, str) else len(prompt)
+        # Batch size precedence: list prompt > provided latents > 1.
+        batch_size = 1
+        if latents is not None:
+            batch_size = latents.shape[0]
+        if isinstance(prompt, list):
+            batch_size = len(prompt)
+        device = self._execution_device
+        # here `guidance_scale` is defined analog to the guidance weight `w` of equation (2)
+        # of the Imagen paper: https://arxiv.org/pdf/2205.11487.pdf . `guidance_scale = 1`
+        # corresponds to doing no classifier free guidance.
+        do_classifier_free_guidance = guidance_scale > 1.0
+        # Encode input prompt
+        prompt = prompt if isinstance(prompt, list) else [prompt] * batch_size
+        if negative_prompt is not None:
+            negative_prompt = negative_prompt if isinstance(negative_prompt, list) else [negative_prompt] * batch_size
+        text_embeddings = self._encode_prompt(
+            prompt, device, num_videos_per_prompt, do_classifier_free_guidance, negative_prompt
+        )
+        # Prepare timesteps
+        self.scheduler.set_timesteps(num_inference_steps, device=device)
+        timesteps = self.scheduler.timesteps
+        # Prepare latent variables
+        # `single_model_length` keeps the per-window frame count; `video_length` becomes
+        # the total number of frames spanned by all overlapping windows.
+        single_model_length = video_length
+        video_length = multidiff_total_steps * (video_length - multidiff_overlaps) + multidiff_overlaps
+        num_channels_latents = self.unet.in_channels
+        latents = self.prepare_latents(
+            batch_size * num_videos_per_prompt,
+            num_channels_latents,
+            video_length,
+            height,
+            width,
+            text_embeddings.dtype,
+            device,
+            generator,
+            latents,
+        )
+        latents_dtype = latents.dtype
+        # Prepare extra step kwargs.
+        extra_step_kwargs = self.prepare_extra_step_kwargs(generator, eta)
+        # Denoising loop
+        num_warmup_steps = len(timesteps) - num_inference_steps * self.scheduler.order
+        with self.progress_bar(total=num_inference_steps) as progress_bar:
+            for i, t in enumerate(timesteps):
+                # `noise_pred_full` accumulates the averaged prediction for all frames;
+                # `mask_full` counts how many windows cover each frame.
+                noise_pred_full = torch.zeros_like(latents).to(latents.device)
+                mask_full = torch.zeros_like(latents).to(latents.device)
+                noise_preds = []
+                for multidiff_step in range(multidiff_total_steps):
+                    # Consecutive windows start `single_model_length - multidiff_overlaps` frames apart.
+                    start_idx = multidiff_step * (single_model_length - multidiff_overlaps)
+                    latent_partial = latents[:, :, start_idx: start_idx + single_model_length].contiguous()
+                    mask_full[:, :, start_idx: start_idx + single_model_length] += 1
+                    # expand the latents if we are doing classifier free guidance
+                    latent_model_input = torch.cat([latent_partial] * 2) if do_classifier_free_guidance else latent_partial
+                    latent_model_input = self.scheduler.scale_model_input(latent_model_input, t)
+                    # predict the noise residual
+                    noise_pred = self.unet(latent_model_input, t, encoder_hidden_states=text_embeddings).sample.to(dtype=latents_dtype)
+                    # perform guidance
+                    if do_classifier_free_guidance:
+                        noise_pred_uncond, noise_pred_text = noise_pred.chunk(2)
+                        noise_pred = noise_pred_uncond + guidance_scale * (noise_pred_text - noise_pred_uncond)
+                    noise_preds.append(noise_pred)
+                # Second pass, after `mask_full` is complete: dividing each window's prediction
+                # by the coverage count and summing averages the overlapping frames.
+                for pred_idx, noise_pred in enumerate(noise_preds):
+                    start_idx = pred_idx * (single_model_length - multidiff_overlaps)
+                    noise_pred_full[:, :, start_idx: start_idx + single_model_length] += noise_pred / mask_full[:, :, start_idx: start_idx + single_model_length]
+                # compute the previous noisy sample x_t -> x_t-1
+                latents = self.scheduler.step(noise_pred_full, t, latents, **extra_step_kwargs).prev_sample
+                # call the callback, if provided
+                if i == len(timesteps) - 1 or ((i + 1) > num_warmup_steps and (i + 1) % self.scheduler.order == 0):
+                    progress_bar.update()
+                    if callback is not None and i % callback_steps == 0:
+                        callback(i, t, latents)
+        # Post-processing
+        video = self.decode_latents(latents)
+        # Convert to tensor
+        if output_type == "tensor":
+            video = torch.from_numpy(video)
+        if not return_dict:
+            return video
+        return AnimationPipelineOutput(videos=video)
+class GenPhotoPipeline(AnimationPipeline):
+    _optional_components = []
+    def __init__(self,
+                 vae: AutoencoderKL,
+                 text_encoder: CLIPTextModel,
+                 tokenizer: CLIPTokenizer,
+                 unet: UNet3DConditionModel,
+                 scheduler: Union[
+                     DDIMScheduler,
+                     PNDMScheduler,
+                     LMSDiscreteScheduler,
+                     EulerDiscreteScheduler,
+                     EulerAncestralDiscreteScheduler,
+                     DPMSolverMultistepScheduler],
+                 camera_encoder: CameraCameraEncoder):
+        """Set up the base animation pipeline and additionally register `camera_encoder`.
+
+        The first five components are handled by `AnimationPipeline.__init__`;
+        `camera_encoder` is the module `__call__` uses to encode `camera_embedding`.
+        """
+        super().__init__(vae, text_encoder, tokenizer, unet, scheduler)
+        self.register_modules(
+            camera_encoder=camera_encoder
+        )
+    def decode_latents(self, latents):
+        """Decode `b c f h w` latents into a numpy video with values in [0, 1].
+
+        This override used to be a line-for-line copy of
+        `AnimationPipeline.decode_latents`; it now delegates to the parent so the
+        decoding logic lives in one place.
+
+        Args:
+            latents: Latent tensor laid out as `b c f h w`.
+
+        Returns:
+            The decoded video as a float32 numpy array laid out as `b c f h w`.
+        """
+        return super().decode_latents(latents)
+    def _encode_prompt(self, prompt, device, num_videos_per_prompt, do_classifier_free_guidance, negative_prompt):
+        """Encode prompt(s) into text-encoder hidden states.
+
+        This override used to be a line-for-line copy of
+        `AnimationPipeline._encode_prompt`; it now delegates to the parent so both
+        pipelines share a single implementation.
+
+        Args:
+            prompt: A string or list of strings to encode.
+            device: Device the token ids are moved to before encoding.
+            num_videos_per_prompt: How many times each prompt embedding is repeated.
+            do_classifier_free_guidance: When true, unconditional embeddings are
+                prepended to the conditional ones along the batch axis.
+            negative_prompt: Text for the unconditional branch, or None for empty strings.
+
+        Returns:
+            The text embeddings produced by the parent implementation.
+
+        Raises:
+            TypeError: If `negative_prompt` and `prompt` differ in type.
+            ValueError: If a list `negative_prompt` does not match the prompt count.
+        """
+        return super()._encode_prompt(
+            prompt, device, num_videos_per_prompt, do_classifier_free_guidance, negative_prompt
+        )
+    @torch.no_grad()
+    def __call__(
+        self,
+        prompt: Union[str, List[str]],
+        camera_embedding: torch.FloatTensor,
+        video_length: Optional[int],
+        height: Optional[int] = None,
+        width: Optional[int] = None,
+        num_inference_steps: int = 50,
+        guidance_scale: float = 7.5,
+        negative_prompt: Optional[Union[str, List[str]]] = None,
+        num_videos_per_prompt: Optional[int] = 1,
+        eta: float = 0.0,
+        generator: Optional[Union[torch.Generator, List[torch.Generator]]] = None,
+        latents: Optional[torch.FloatTensor] = None,
+        output_type: Optional[str] = "tensor",
+        return_dict: bool = True,
+        callback: Optional[Callable[[int, int, torch.FloatTensor], None]] = None,
+        callback_steps: Optional[int] = 1,
+        multidiff_total_steps: int = 1,
+        multidiff_overlaps: int = 12,
+        **kwargs,
+    ):
+        """Generate a video from text and a camera embedding, using overlapping frame windows.
+
+        Works like `AnimationPipeline.__call__`, except that `camera_embedding` is
+        encoded by `self.camera_encoder` and the resulting features are handed to
+        the UNet as `camera_embedding_features`, and that the execution device is
+        taken from `camera_embedding`.
+
+        Args:
+            prompt: Prompt string or list of prompts; a string is repeated to the batch size.
+            camera_embedding: A 5-D tensor, or a list of 5-D tensors. A list is
+                indexed by window (`multidiff_step`); a single tensor's features are
+                sliced along the frame axis for every window.
+            video_length: Number of frames in one UNet window.
+            height: Output height; defaults to `unet.config.sample_size * vae_scale_factor`.
+            width: Output width; same default as `height`.
+            num_inference_steps: Passed to `scheduler.set_timesteps`.
+            guidance_scale: Classifier-free guidance is enabled when this is > 1.0.
+            negative_prompt: Optional text for the unconditional branch.
+            num_videos_per_prompt: Videos generated per prompt.
+            eta: Forwarded to `scheduler.step` when its signature accepts it.
+            generator: Used to sample the initial latents and forwarded to
+                `scheduler.step` when accepted.
+            latents: Optional initial latents; they set the batch size unless
+                `prompt` is a list.
+            output_type: `"tensor"` converts the decoded numpy video to a torch tensor.
+            return_dict: When False the video itself is returned instead of the output object.
+            callback: Called as `callback(i, t, latents)` on progress updates.
+            callback_steps: The callback only fires when `i % callback_steps == 0`.
+            multidiff_total_steps: Number of windows along the frame axis.
+            multidiff_overlaps: Frames shared by consecutive windows.
+            **kwargs: Accepted but not used in this method.
+
+        Returns:
+            `AnimationPipelineOutput` holding the video, or the bare video when
+            `return_dict` is False.
+        """
+        # Default height and width to unet
+        height = height or self.unet.config.sample_size * self.vae_scale_factor
+        width = width or self.unet.config.sample_size * self.vae_scale_factor
+        # Check inputs. Raise error if not correct
+        self.check_inputs(prompt, height, width, callback_steps)
+        # Define call parameters
+        # batch_size = 1 if isinstance(prompt, str) else len(prompt)
+        # Batch size precedence: list prompt > provided latents > 1.
+        batch_size = 1
+        if latents is not None:
+            batch_size = latents.shape[0]
+        if isinstance(prompt, list):
+            batch_size = len(prompt)
+        # Unlike the parent pipeline, the device comes from the camera embedding.
+        device = camera_embedding[0].device if isinstance(camera_embedding, list) else camera_embedding.device
+        # here `guidance_scale` is defined analog to the guidance weight `w` of equation (2)
+        # of the Imagen paper: https://arxiv.org/pdf/2205.11487.pdf . `guidance_scale = 1`
+        # corresponds to doing no classifier free guidance.
+        do_classifier_free_guidance = guidance_scale > 1.0
+        # Encode input prompt
+        prompt = prompt if isinstance(prompt, list) else [prompt] * batch_size
+        if negative_prompt is not None:
+            negative_prompt = negative_prompt if isinstance(negative_prompt, list) else [negative_prompt] * batch_size
+        text_embeddings = self._encode_prompt(
+            prompt, device, num_videos_per_prompt, do_classifier_free_guidance, negative_prompt
+        )           # [2bf, l, c]
+        # Prepare timesteps
+        self.scheduler.set_timesteps(num_inference_steps, device=device)
+        timesteps = self.scheduler.timesteps
+        # Prepare latent variables
+        # `single_model_length` keeps the per-window frame count; `video_length` becomes
+        # the total number of frames spanned by all overlapping windows.
+        single_model_length = video_length
+        video_length = multidiff_total_steps * (video_length - multidiff_overlaps) + multidiff_overlaps
+        num_channels_latents = self.unet.in_channels
+        latents = self.prepare_latents(
+            batch_size * num_videos_per_prompt,
+            num_channels_latents,
+            video_length,
+            height,
+            width,
+            text_embeddings.dtype,
+            device,
+            generator,
+            latents,
+        )                   # b c f h w
+        latents_dtype = latents.dtype
+        # Prepare extra step kwargs.
+        extra_step_kwargs = self.prepare_extra_step_kwargs(generator, eta)
+        # Encode the camera embedding(s). The encoder returns a list of feature maps with
+        # batch and frames merged in the leading axis; each is reshaped to `b c f h w`.
+        if isinstance(camera_embedding, list):
+            assert all([x.ndim == 5 for x in camera_embedding])
+            bs = camera_embedding[0].shape[0]
+            camera_embedding_features = []
+            for pe in camera_embedding:
+                camera_embedding_feature = self.camera_encoder(pe)
+                camera_embedding_feature = [rearrange(x, '(b f) c h w -> b c f h w', b=bs) for x in camera_embedding_feature]
+                camera_embedding_features.append(camera_embedding_feature)
+        else:
+            bs = camera_embedding.shape[0]
+            assert camera_embedding.ndim == 5
+            camera_embedding_features = self.camera_encoder(camera_embedding)       # bf, c, h, w
+            camera_embedding_features = [rearrange(x, '(b f) c h w -> b c f h w', b=bs)
+                                       for x in camera_embedding_features]
+        # Denoising loop
+        num_warmup_steps = len(timesteps) - num_inference_steps * self.scheduler.order
+        # With classifier-free guidance the latents are doubled along the batch axis, so
+        # the camera features are doubled the same way (list-of-lists and flat list cases).
+        if isinstance(camera_embedding_features[0], list):
+            camera_embedding_features = [[torch.cat([x, x], dim=0) for x in camera_embedding_feature]
+                                       for camera_embedding_feature in camera_embedding_features] \
+                if do_classifier_free_guidance else camera_embedding_features
+        else:
+            camera_embedding_features = [torch.cat([x, x], dim=0) for x in camera_embedding_features] \
+                if do_classifier_free_guidance else camera_embedding_features  # [2b c f h w]
+        with self.progress_bar(total=num_inference_steps) as progress_bar:
+            for i, t in enumerate(timesteps):
+                # `noise_pred_full` accumulates the averaged prediction for all frames;
+                # `mask_full` counts how many windows cover each frame.
+                noise_pred_full = torch.zeros_like(latents).to(latents.device)
+                mask_full = torch.zeros_like(latents).to(latents.device)
+                noise_preds = []
+                for multidiff_step in range(multidiff_total_steps):
+                    # Consecutive windows start `single_model_length - multidiff_overlaps` frames apart.
+                    start_idx = multidiff_step * (single_model_length - multidiff_overlaps)
+                    latent_partial = latents[:, :, start_idx: start_idx + single_model_length].contiguous()
+                    mask_full[:, :, start_idx: start_idx + single_model_length] += 1
+                    # A list input supplies one feature set per window; a single tensor's
+                    # features are cut to the window's frame range.
+                    if isinstance(camera_embedding, list):
+                        camera_embedding_features_input = camera_embedding_features[multidiff_step]
+                    else:
+                        camera_embedding_features_input = [x[:, :, start_idx: start_idx + single_model_length]
+                                                         for x in camera_embedding_features]
+                    # expand the latents if we are doing classifier free guidance
+                    latent_model_input = torch.cat([latent_partial] * 2) if do_classifier_free_guidance else latent_partial   # [2b c f h w]
+                    latent_model_input = self.scheduler.scale_model_input(latent_model_input, t)
+                    # predict the noise residual
+                    noise_pred = self.unet(latent_model_input, t, encoder_hidden_states=text_embeddings,
+                                           camera_embedding_features=camera_embedding_features_input).sample.to(dtype=latents_dtype)
+                    # perform guidance
+                    if do_classifier_free_guidance:
+                        noise_pred_uncond, noise_pred_text = noise_pred.chunk(2)
+                        noise_pred = noise_pred_uncond + guidance_scale * (noise_pred_text - noise_pred_uncond)
+                    noise_preds.append(noise_pred)
+                # Second pass, after `mask_full` is complete: dividing each window's prediction
+                # by the coverage count and summing averages the overlapping frames.
+                for pred_idx, noise_pred in enumerate(noise_preds):
+                    start_idx = pred_idx * (single_model_length - multidiff_overlaps)
+                    noise_pred_full[:, :, start_idx: start_idx + single_model_length] += noise_pred / mask_full[:, :, start_idx: start_idx + single_model_length]
+                # compute the previous noisy sample x_t -> x_t-1  b c f h w
+                latents = self.scheduler.step(noise_pred_full, t, latents, **extra_step_kwargs).prev_sample
+                # call the callback, if provided
+                if i == len(timesteps) - 1 or ((i + 1) > num_warmup_steps and (i + 1) % self.scheduler.order == 0):
+                    progress_bar.update()
+                    if callback is not None and i % callback_steps == 0:
+                        callback(i, t, latents)
+        # Post-processing
+        video = self.decode_latents(latents)
+        # Convert to tensor
+        if output_type == "tensor":
+            video = torch.from_numpy(video)
+        if not return_dict:
+            return video
+        return AnimationPipelineOutput(videos=video)

genphoto/utils/convert_from_ckpt.py CHANGED Viewed

@@ -1,3 +1,556 @@
-version https://git-lfs.github.com/spec/v1
-oid sha256:3ca60e78e034ed48ea1b7d48c09d2707940b1e25b749ee68bb6b601a96270435
-size 25125

+# coding=utf-8
+# Copyright 2023 The HuggingFace Inc. team.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+""" Conversion script for the Stable Diffusion checkpoints."""
+import re
+from transformers import CLIPTextModel
+def shave_segments(path, n_shave_prefix_segments=1):
+    """
+    Drop dot-separated segments from one end of `path`.
+    A non-negative count removes that many leading segments; a negative count removes
+    that many trailing segments. The remaining segments are re-joined with ".".
+    """
+    segments = path.split(".")
+    if n_shave_prefix_segments < 0:
+        kept = segments[:n_shave_prefix_segments]
+    else:
+        kept = segments[n_shave_prefix_segments:]
+    return ".".join(kept)
+def renew_resnet_paths(old_list, n_shave_prefix_segments=0):
+    """
+    Build the old -> new key mapping for resnet weights (local renaming only).
+    Each key has the fragment renames below applied in order and is then passed through
+    `shave_segments`. Returns a list of {"old": ..., "new": ...} dicts, one per input key.
+    """
+    # (old fragment, new fragment) pairs, applied in this order.
+    renames = (
+        ("in_layers.0", "norm1"),
+        ("in_layers.2", "conv1"),
+        ("out_layers.0", "norm2"),
+        ("out_layers.3", "conv2"),
+        ("emb_layers.1", "time_emb_proj"),
+        ("skip_connection", "conv_shortcut"),
+    )
+    mapping = []
+    for source_key in old_list:
+        renamed = source_key
+        for old_fragment, new_fragment in renames:
+            renamed = renamed.replace(old_fragment, new_fragment)
+        renamed = shave_segments(renamed, n_shave_prefix_segments=n_shave_prefix_segments)
+        mapping.append({"old": source_key, "new": renamed})
+    return mapping
+def renew_vae_resnet_paths(old_list, n_shave_prefix_segments=0):
+    """
+    Build the old -> new key mapping for VAE resnet weights (local renaming only).
+    The only fragment rename is "nin_shortcut" -> "conv_shortcut"; the result is then
+    passed through `shave_segments`. Returns a list of {"old": ..., "new": ...} dicts.
+    """
+    return [
+        {
+            "old": source_key,
+            "new": shave_segments(
+                source_key.replace("nin_shortcut", "conv_shortcut"),
+                n_shave_prefix_segments=n_shave_prefix_segments,
+            ),
+        }
+        for source_key in old_list
+    ]
+def renew_attention_paths(old_list, n_shave_prefix_segments=0):
+    """
+    Build the old -> new key mapping for attention weights.
+    No local renaming is applied here: every key maps to itself.
+    `n_shave_prefix_segments` is accepted but not used by this function.
+    Returns a list of {"old": ..., "new": ...} dicts, one per input key.
+    """
+    return [{"old": source_key, "new": source_key} for source_key in old_list]
+def renew_vae_attention_paths(old_list, n_shave_prefix_segments=0):
+    """
+    Build the old -> new key mapping for VAE attention weights (local renaming only).
+    Each key has the fragment renames below applied in order and is then passed through
+    `shave_segments`. Returns a list of {"old": ..., "new": ...} dicts, one per input key.
+    """
+    # (old fragment, new fragment) pairs, applied in this order.
+    renames = (
+        ("norm.weight", "group_norm.weight"),
+        ("norm.bias", "group_norm.bias"),
+        ("q.weight", "query.weight"),
+        ("q.bias", "query.bias"),
+        ("k.weight", "key.weight"),
+        ("k.bias", "key.bias"),
+        ("v.weight", "value.weight"),
+        ("v.bias", "value.bias"),
+        ("proj_out.weight", "proj_attn.weight"),
+        ("proj_out.bias", "proj_attn.bias"),
+    )
+    mapping = []
+    for source_key in old_list:
+        renamed = source_key
+        for old_fragment, new_fragment in renames:
+            renamed = renamed.replace(old_fragment, new_fragment)
+        renamed = shave_segments(renamed, n_shave_prefix_segments=n_shave_prefix_segments)
+        mapping.append({"old": source_key, "new": renamed})
+    return mapping
+def assign_to_checkpoint(
+    paths, checkpoint, old_checkpoint, attention_paths_to_split=None, additional_replacements=None, config=None
+):
+    """
+    Final conversion step: copy locally renamed weights into `checkpoint` under their global names.
+    Args:
+        paths: list of {"old": source key, "new": locally renamed key} dicts.
+        checkpoint: destination dict, filled in place.
+        old_checkpoint: source dict the tensors are read from.
+        attention_paths_to_split: optional mapping from a fused attention key to a dict with
+            "query", "key" and "value" destination keys; each fused tensor is split in three.
+        additional_replacements: optional list of {"old": ..., "new": ...} substring
+            replacements applied after the built-in "middle_block" renames.
+        config: mapping; "num_head_channels" is read only when attention paths are split.
+    """
+    assert isinstance(paths, list), "Paths should be a list of dicts containing 'old' and 'new' keys."
+    split_map = attention_paths_to_split
+    # Split each fused attention tensor into its query / key / value parts.
+    if split_map is not None:
+        for fused_key, targets in split_map.items():
+            fused = old_checkpoint[fused_key]
+            channels = fused.shape[0] // 3
+            if len(fused.shape) == 3:
+                target_shape = (-1, channels)
+            else:
+                target_shape = (-1)
+            num_heads = fused.shape[0] // config["num_head_channels"] // 3
+            fused = fused.reshape((num_heads, 3 * channels // num_heads) + fused.shape[1:])
+            query, key, value = fused.split(channels // num_heads, dim=1)
+            for part_name, part in (("query", query), ("key", key), ("value", value)):
+                checkpoint[targets[part_name]] = part.reshape(target_shape)
+    # Global renames applied to every remaining path.
+    global_renames = (
+        ("middle_block.0", "mid_block.resnets.0"),
+        ("middle_block.1", "mid_block.attentions.0"),
+        ("middle_block.2", "mid_block.resnets.1"),
+    )
+    for entry in paths:
+        target = entry["new"]
+        # Keys handled by the attention split above are already assigned.
+        if split_map is not None and target in split_map:
+            continue
+        for old_fragment, new_fragment in global_renames:
+            target = target.replace(old_fragment, new_fragment)
+        if additional_replacements is not None:
+            for rule in additional_replacements:
+                target = target.replace(rule["old"], rule["new"])
+        source = old_checkpoint[entry["old"]]
+        # proj_attn.weight has to be converted from conv 1D to linear
+        checkpoint[target] = source[:, :, 0] if "proj_attn.weight" in target else source
+def conv_attn_to_linear(checkpoint):
+    """
+    Reduce attention weights with more than two dimensions to 2-D, in place.
+    Keys whose last two dot-separated segments are "query.weight", "key.weight" or
+    "value.weight" are indexed with [:, :, 0, 0]; other keys containing "proj_attn.weight"
+    are indexed with [:, :, 0]. Entries that already have ndim <= 2 are left untouched.
+    """
+    qkv_suffixes = ("query.weight", "key.weight", "value.weight")
+    for name in list(checkpoint):
+        suffix = ".".join(name.rsplit(".", 2)[-2:])
+        if suffix in qkv_suffixes:
+            index = (slice(None), slice(None), 0, 0)
+        elif "proj_attn.weight" in name:
+            index = (slice(None), slice(None), 0)
+        else:
+            continue
+        if checkpoint[name].ndim > 2:
+            checkpoint[name] = checkpoint[name][index]
+def convert_ldm_unet_checkpoint(checkpoint, config, path=None, extract_ema=False, controlnet=False):
+    """
+    Takes a state dict and a config, and returns a converted checkpoint.
+    Args:
+        checkpoint: source state dict. The UNet entries that are extracted are popped from
+            it, so the caller's dict is mutated.
+        config: mapping read for "class_embed_type" and "layers_per_block"; it is also
+            forwarded to `assign_to_checkpoint`.
+        path: only used in the informational message printed on the EMA branch.
+        extract_ema: when True and more than 100 keys start with "model_ema", the EMA
+            copies of the UNet weights are extracted instead of the regular ones.
+        controlnet: when True, the UNet prefix is "control_model." instead of
+            "model.diffusion_model.", the output norm/conv keys are skipped, and the
+            "input_hint_block", "zero_convs" and "middle_block_out" entries are ported too.
+    Returns:
+        A new dict holding the weights under their renamed keys.
+    Raises:
+        NotImplementedError: if config["class_embed_type"] is not None, "timestep" or "projection".
+    """
+    # extract state_dict for UNet
+    unet_state_dict = {}
+    keys = list(checkpoint.keys())
+    if controlnet:
+        unet_key = "control_model."
+    else:
+        unet_key = "model.diffusion_model."
+    # more than 100 keys have to start with `model_ema` for the checkpoint to be treated as holding EMA weights
+    if sum(k.startswith("model_ema") for k in keys) > 100 and extract_ema:
+        print(f"Checkpoint {path} has both EMA and non-EMA weights.")
+        print(
+            "In this conversion only the EMA weights are extracted. If you want to instead extract the non-EMA"
+            " weights (useful to continue fine-tuning), please make sure to remove the `--extract_ema` flag."
+        )
+        for key in keys:
+            if key.startswith("model.diffusion_model"):
+                # EMA key = "model_ema." + the key's segments after the first, concatenated without dots
+                flat_ema_key = "model_ema." + "".join(key.split(".")[1:])
+                unet_state_dict[key.replace(unet_key, "")] = checkpoint.pop(flat_ema_key)
+    else:
+        if sum(k.startswith("model_ema") for k in keys) > 100:
+            print(
+                "In this conversion only the non-EMA weights are extracted. If you want to instead extract the EMA"
+                " weights (usually better for inference), please make sure to add the `--extract_ema` flag."
+            )
+        for key in keys:
+            if key.startswith(unet_key):
+                unet_state_dict[key.replace(unet_key, "")] = checkpoint.pop(key)
+    new_checkpoint = {}
+    # Fixed one-to-one renames: time embedding, optional class embedding, input/output convolutions.
+    new_checkpoint["time_embedding.linear_1.weight"] = unet_state_dict["time_embed.0.weight"]
+    new_checkpoint["time_embedding.linear_1.bias"] = unet_state_dict["time_embed.0.bias"]
+    new_checkpoint["time_embedding.linear_2.weight"] = unet_state_dict["time_embed.2.weight"]
+    new_checkpoint["time_embedding.linear_2.bias"] = unet_state_dict["time_embed.2.bias"]
+    if config["class_embed_type"] is None:
+        # No parameters to port
+        ...
+    elif config["class_embed_type"] == "timestep" or config["class_embed_type"] == "projection":
+        new_checkpoint["class_embedding.linear_1.weight"] = unet_state_dict["label_emb.0.0.weight"]
+        new_checkpoint["class_embedding.linear_1.bias"] = unet_state_dict["label_emb.0.0.bias"]
+        new_checkpoint["class_embedding.linear_2.weight"] = unet_state_dict["label_emb.0.2.weight"]
+        new_checkpoint["class_embedding.linear_2.bias"] = unet_state_dict["label_emb.0.2.bias"]
+    else:
+        raise NotImplementedError(f"Not implemented `class_embed_type`: {config['class_embed_type']}")
+    new_checkpoint["conv_in.weight"] = unet_state_dict["input_blocks.0.0.weight"]
+    new_checkpoint["conv_in.bias"] = unet_state_dict["input_blocks.0.0.bias"]
+    if not controlnet:
+        new_checkpoint["conv_norm_out.weight"] = unet_state_dict["out.0.weight"]
+        new_checkpoint["conv_norm_out.bias"] = unet_state_dict["out.0.bias"]
+        new_checkpoint["conv_out.weight"] = unet_state_dict["out.2.weight"]
+        new_checkpoint["conv_out.bias"] = unet_state_dict["out.2.bias"]
+    # NOTE(review): the three bucket dicts below use a substring test (`in`), so the bucket for
+    # layer 1 also receives the keys of layers 10, 11, ...; the more specific
+    # f"...{i}.0" / f"...{i}.1" filters applied later narrow them down again.
+    # Retrieves the keys for the input blocks only
+    num_input_blocks = len({".".join(layer.split(".")[:2]) for layer in unet_state_dict if "input_blocks" in layer})
+    input_blocks = {
+        layer_id: [key for key in unet_state_dict if f"input_blocks.{layer_id}" in key]
+        for layer_id in range(num_input_blocks)
+    }
+    # Retrieves the keys for the middle blocks only
+    num_middle_blocks = len({".".join(layer.split(".")[:2]) for layer in unet_state_dict if "middle_block" in layer})
+    middle_blocks = {
+        layer_id: [key for key in unet_state_dict if f"middle_block.{layer_id}" in key]
+        for layer_id in range(num_middle_blocks)
+    }
+    # Retrieves the keys for the output blocks only
+    num_output_blocks = len({".".join(layer.split(".")[:2]) for layer in unet_state_dict if "output_blocks" in layer})
+    output_blocks = {
+        layer_id: [key for key in unet_state_dict if f"output_blocks.{layer_id}" in key]
+        for layer_id in range(num_output_blocks)
+    }
+    # Input blocks 1..N-1 -> down blocks (input block 0 was ported as conv_in above).
+    for i in range(1, num_input_blocks):
+        # Every down block covers layers_per_block + 1 consecutive input blocks.
+        block_id = (i - 1) // (config["layers_per_block"] + 1)
+        layer_in_block_id = (i - 1) % (config["layers_per_block"] + 1)
+        resnets = [
+            key for key in input_blocks[i] if f"input_blocks.{i}.0" in key and f"input_blocks.{i}.0.op" not in key
+        ]
+        attentions = [key for key in input_blocks[i] if f"input_blocks.{i}.1" in key]
+        # An "op" sub-layer at position 0 is ported as the block's downsampler convolution.
+        if f"input_blocks.{i}.0.op.weight" in unet_state_dict:
+            new_checkpoint[f"down_blocks.{block_id}.downsamplers.0.conv.weight"] = unet_state_dict.pop(
+                f"input_blocks.{i}.0.op.weight"
+            )
+            new_checkpoint[f"down_blocks.{block_id}.downsamplers.0.conv.bias"] = unet_state_dict.pop(
+                f"input_blocks.{i}.0.op.bias"
+            )
+        paths = renew_resnet_paths(resnets)
+        meta_path = {"old": f"input_blocks.{i}.0", "new": f"down_blocks.{block_id}.resnets.{layer_in_block_id}"}
+        assign_to_checkpoint(
+            paths, new_checkpoint, unet_state_dict, additional_replacements=[meta_path], config=config
+        )
+        if len(attentions):
+            paths = renew_attention_paths(attentions)
+            meta_path = {"old": f"input_blocks.{i}.1", "new": f"down_blocks.{block_id}.attentions.{layer_in_block_id}"}
+            assign_to_checkpoint(
+                paths, new_checkpoint, unet_state_dict, additional_replacements=[meta_path], config=config
+            )
+    # Middle block: sub-blocks 0 / 1 / 2 are treated as resnet / attention / resnet.
+    resnet_0 = middle_blocks[0]
+    attentions = middle_blocks[1]
+    resnet_1 = middle_blocks[2]
+    # No extra replacements are passed for the resnets: assign_to_checkpoint itself renames "middle_block.N".
+    resnet_0_paths = renew_resnet_paths(resnet_0)
+    assign_to_checkpoint(resnet_0_paths, new_checkpoint, unet_state_dict, config=config)
+    resnet_1_paths = renew_resnet_paths(resnet_1)
+    assign_to_checkpoint(resnet_1_paths, new_checkpoint, unet_state_dict, config=config)
+    attentions_paths = renew_attention_paths(attentions)
+    meta_path = {"old": "middle_block.1", "new": "mid_block.attentions.0"}
+    assign_to_checkpoint(
+        attentions_paths, new_checkpoint, unet_state_dict, additional_replacements=[meta_path], config=config
+    )
+    # Output blocks -> up blocks.
+    for i in range(num_output_blocks):
+        block_id = i // (config["layers_per_block"] + 1)
+        layer_in_block_id = i % (config["layers_per_block"] + 1)
+        # Strip the leading "output_blocks.<i>" so each name starts with the sub-layer index.
+        output_block_layers = [shave_segments(name, 2) for name in output_blocks[i]]
+        # Group the remaining parameter names by sub-layer index.
+        output_block_list = {}
+        for layer in output_block_layers:
+            layer_id, layer_name = layer.split(".")[0], shave_segments(layer, 1)
+            if layer_id in output_block_list:
+                output_block_list[layer_id].append(layer_name)
+            else:
+                output_block_list[layer_id] = [layer_name]
+        if len(output_block_list) > 1:
+            resnets = [key for key in output_blocks[i] if f"output_blocks.{i}.0" in key]
+            attentions = [key for key in output_blocks[i] if f"output_blocks.{i}.1" in key]
+            # NOTE(review): resnet_0_paths is assigned here but not read in this branch.
+            resnet_0_paths = renew_resnet_paths(resnets)
+            paths = renew_resnet_paths(resnets)
+            meta_path = {"old": f"output_blocks.{i}.0", "new": f"up_blocks.{block_id}.resnets.{layer_in_block_id}"}
+            assign_to_checkpoint(
+                paths, new_checkpoint, unet_state_dict, additional_replacements=[meta_path], config=config
+            )
+            # Sort so the comparison with ["conv.bias", "conv.weight"] does not depend on key order.
+            output_block_list = {k: sorted(v) for k, v in output_block_list.items()}
+            # A sub-layer holding only conv.bias / conv.weight is ported as the upsampler convolution.
+            if ["conv.bias", "conv.weight"] in output_block_list.values():
+                index = list(output_block_list.values()).index(["conv.bias", "conv.weight"])
+                new_checkpoint[f"up_blocks.{block_id}.upsamplers.0.conv.weight"] = unet_state_dict[
+                    f"output_blocks.{i}.{index}.conv.weight"
+                ]
+                new_checkpoint[f"up_blocks.{block_id}.upsamplers.0.conv.bias"] = unet_state_dict[
+                    f"output_blocks.{i}.{index}.conv.bias"
+                ]
+                # Clear attentions as they have been attributed above.
+                if len(attentions) == 2:
+                    attentions = []
+            if len(attentions):
+                paths = renew_attention_paths(attentions)
+                meta_path = {
+                    "old": f"output_blocks.{i}.1",
+                    "new": f"up_blocks.{block_id}.attentions.{layer_in_block_id}",
+                }
+                assign_to_checkpoint(
+                    paths, new_checkpoint, unet_state_dict, additional_replacements=[meta_path], config=config
+                )
+        else:
+            # Single sub-layer: every parameter is copied as a resnet weight of the up block.
+            resnet_0_paths = renew_resnet_paths(output_block_layers, n_shave_prefix_segments=1)
+            for path in resnet_0_paths:
+                old_path = ".".join(["output_blocks", str(i), path["old"]])
+                new_path = ".".join(["up_blocks", str(block_id), "resnets", str(layer_in_block_id), path["new"]])
+                new_checkpoint[new_path] = unet_state_dict[old_path]
+    if controlnet:
+        # conditioning embedding
+        orig_index = 0
+        new_checkpoint["controlnet_cond_embedding.conv_in.weight"] = unet_state_dict.pop(
+            f"input_hint_block.{orig_index}.weight"
+        )
+        new_checkpoint["controlnet_cond_embedding.conv_in.bias"] = unet_state_dict.pop(
+            f"input_hint_block.{orig_index}.bias"
+        )
+        # The source index advances by 2 for each ported layer.
+        orig_index += 2
+        diffusers_index = 0
+        # Six intermediate conditioning blocks are ported (target indices 0..5).
+        while diffusers_index < 6:
+            new_checkpoint[f"controlnet_cond_embedding.blocks.{diffusers_index}.weight"] = unet_state_dict.pop(
+                f"input_hint_block.{orig_index}.weight"
+            )
+            new_checkpoint[f"controlnet_cond_embedding.blocks.{diffusers_index}.bias"] = unet_state_dict.pop(
+                f"input_hint_block.{orig_index}.bias"
+            )
+            diffusers_index += 1
+            orig_index += 2
+        new_checkpoint["controlnet_cond_embedding.conv_out.weight"] = unet_state_dict.pop(
+            f"input_hint_block.{orig_index}.weight"
+        )
+        new_checkpoint["controlnet_cond_embedding.conv_out.bias"] = unet_state_dict.pop(
+            f"input_hint_block.{orig_index}.bias"
+        )
+        # down blocks
+        for i in range(num_input_blocks):
+            new_checkpoint[f"controlnet_down_blocks.{i}.weight"] = unet_state_dict.pop(f"zero_convs.{i}.0.weight")
+            new_checkpoint[f"controlnet_down_blocks.{i}.bias"] = unet_state_dict.pop(f"zero_convs.{i}.0.bias")
+        # mid block
+        new_checkpoint["controlnet_mid_block.weight"] = unet_state_dict.pop("middle_block_out.0.weight")
+        new_checkpoint["controlnet_mid_block.bias"] = unet_state_dict.pop("middle_block_out.0.bias")
+    return new_checkpoint
+def convert_ldm_vae_checkpoint(checkpoint, config):
+    """
+    Convert the VAE part of a state dict to the renamed key layout and return it as a new dict.
+    Args:
+        checkpoint: source state dict. It is only read (via `.get`), not modified.
+        config: not read here; forwarded to `assign_to_checkpoint`.
+    Returns:
+        A new dict with encoder, decoder and (post_)quant_conv weights under renamed keys.
+    """
+    # extract state dict for VAE
+    vae_state_dict = {}
+    keys = list(checkpoint.keys())
+    # When no key carries the "first_stage_model." prefix the prefix is "", so every key is taken as-is.
+    vae_key = "first_stage_model." if any(k.startswith("first_stage_model.") for k in keys) else ""
+    for key in keys:
+        if key.startswith(vae_key):
+            vae_state_dict[key.replace(vae_key, "")] = checkpoint.get(key)
+    new_checkpoint = {}
+    # Fixed one-to-one copies; only "norm_out" is renamed (to "conv_norm_out").
+    new_checkpoint["encoder.conv_in.weight"] = vae_state_dict["encoder.conv_in.weight"]
+    new_checkpoint["encoder.conv_in.bias"] = vae_state_dict["encoder.conv_in.bias"]
+    new_checkpoint["encoder.conv_out.weight"] = vae_state_dict["encoder.conv_out.weight"]
+    new_checkpoint["encoder.conv_out.bias"] = vae_state_dict["encoder.conv_out.bias"]
+    new_checkpoint["encoder.conv_norm_out.weight"] = vae_state_dict["encoder.norm_out.weight"]
+    new_checkpoint["encoder.conv_norm_out.bias"] = vae_state_dict["encoder.norm_out.bias"]
+    new_checkpoint["decoder.conv_in.weight"] = vae_state_dict["decoder.conv_in.weight"]
+    new_checkpoint["decoder.conv_in.bias"] = vae_state_dict["decoder.conv_in.bias"]
+    new_checkpoint["decoder.conv_out.weight"] = vae_state_dict["decoder.conv_out.weight"]
+    new_checkpoint["decoder.conv_out.bias"] = vae_state_dict["decoder.conv_out.bias"]
+    new_checkpoint["decoder.conv_norm_out.weight"] = vae_state_dict["decoder.norm_out.weight"]
+    new_checkpoint["decoder.conv_norm_out.bias"] = vae_state_dict["decoder.norm_out.bias"]
+    new_checkpoint["quant_conv.weight"] = vae_state_dict["quant_conv.weight"]
+    new_checkpoint["quant_conv.bias"] = vae_state_dict["quant_conv.bias"]
+    new_checkpoint["post_quant_conv.weight"] = vae_state_dict["post_quant_conv.weight"]
+    new_checkpoint["post_quant_conv.bias"] = vae_state_dict["post_quant_conv.bias"]
+    # Retrieves the keys for the encoder down blocks only
+    # (the count is the number of distinct three-segment prefixes, e.g. "encoder.down.<n>")
+    num_down_blocks = len({".".join(layer.split(".")[:3]) for layer in vae_state_dict if "encoder.down" in layer})
+    down_blocks = {
+        layer_id: [key for key in vae_state_dict if f"down.{layer_id}" in key] for layer_id in range(num_down_blocks)
+    }
+    # Retrieves the keys for the decoder up blocks only
+    num_up_blocks = len({".".join(layer.split(".")[:3]) for layer in vae_state_dict if "decoder.up" in layer})
+    up_blocks = {
+        layer_id: [key for key in vae_state_dict if f"up.{layer_id}" in key] for layer_id in range(num_up_blocks)
+    }
+    # Encoder down blocks: resnets plus an optional downsampler convolution.
+    for i in range(num_down_blocks):
+        resnets = [key for key in down_blocks[i] if f"down.{i}" in key and f"down.{i}.downsample" not in key]
+        if f"encoder.down.{i}.downsample.conv.weight" in vae_state_dict:
+            new_checkpoint[f"encoder.down_blocks.{i}.downsamplers.0.conv.weight"] = vae_state_dict.pop(
+                f"encoder.down.{i}.downsample.conv.weight"
+            )
+            new_checkpoint[f"encoder.down_blocks.{i}.downsamplers.0.conv.bias"] = vae_state_dict.pop(
+                f"encoder.down.{i}.downsample.conv.bias"
+            )
+        paths = renew_vae_resnet_paths(resnets)
+        meta_path = {"old": f"down.{i}.block", "new": f"down_blocks.{i}.resnets"}
+        assign_to_checkpoint(paths, new_checkpoint, vae_state_dict, additional_replacements=[meta_path], config=config)
+    # Encoder mid block: "block_1" / "block_2" become mid_block.resnets.0 / .1.
+    mid_resnets = [key for key in vae_state_dict if "encoder.mid.block" in key]
+    num_mid_res_blocks = 2
+    for i in range(1, num_mid_res_blocks + 1):
+        resnets = [key for key in mid_resnets if f"encoder.mid.block_{i}" in key]
+        paths = renew_vae_resnet_paths(resnets)
+        meta_path = {"old": f"mid.block_{i}", "new": f"mid_block.resnets.{i - 1}"}
+        assign_to_checkpoint(paths, new_checkpoint, vae_state_dict, additional_replacements=[meta_path], config=config)
+    mid_attentions = [key for key in vae_state_dict if "encoder.mid.attn" in key]
+    paths = renew_vae_attention_paths(mid_attentions)
+    meta_path = {"old": "mid.attn_1", "new": "mid_block.attentions.0"}
+    assign_to_checkpoint(paths, new_checkpoint, vae_state_dict, additional_replacements=[meta_path], config=config)
+    # Reduce the attention weights ported so far to 2-D where they have extra dimensions.
+    conv_attn_to_linear(new_checkpoint)
+    # Decoder up blocks: target index i takes its weights from source block num_up_blocks - 1 - i (reversed order).
+    for i in range(num_up_blocks):
+        block_id = num_up_blocks - 1 - i
+        resnets = [
+            key for key in up_blocks[block_id] if f"up.{block_id}" in key and f"up.{block_id}.upsample" not in key
+        ]
+        if f"decoder.up.{block_id}.upsample.conv.weight" in vae_state_dict:
+            new_checkpoint[f"decoder.up_blocks.{i}.upsamplers.0.conv.weight"] = vae_state_dict[
+                f"decoder.up.{block_id}.upsample.conv.weight"
+            ]
+            new_checkpoint[f"decoder.up_blocks.{i}.upsamplers.0.conv.bias"] = vae_state_dict[
+                f"decoder.up.{block_id}.upsample.conv.bias"
+            ]
+        paths = renew_vae_resnet_paths(resnets)
+        meta_path = {"old": f"up.{block_id}.block", "new": f"up_blocks.{i}.resnets"}
+        assign_to_checkpoint(paths, new_checkpoint, vae_state_dict, additional_replacements=[meta_path], config=config)
+    # Decoder mid block, handled the same way as the encoder mid block.
+    mid_resnets = [key for key in vae_state_dict if "decoder.mid.block" in key]
+    num_mid_res_blocks = 2
+    for i in range(1, num_mid_res_blocks + 1):
+        resnets = [key for key in mid_resnets if f"decoder.mid.block_{i}" in key]
+        paths = renew_vae_resnet_paths(resnets)
+        meta_path = {"old": f"mid.block_{i}", "new": f"mid_block.resnets.{i - 1}"}
+        assign_to_checkpoint(paths, new_checkpoint, vae_state_dict, additional_replacements=[meta_path], config=config)
+    mid_attentions = [key for key in vae_state_dict if "decoder.mid.attn" in key]
+    paths = renew_vae_attention_paths(mid_attentions)
+    meta_path = {"old": "mid.attn_1", "new": "mid_block.attentions.0"}
+    assign_to_checkpoint(paths, new_checkpoint, vae_state_dict, additional_replacements=[meta_path], config=config)
+    # Second pass covers the decoder attention weights added since the first call.
+    conv_attn_to_linear(new_checkpoint)
+    return new_checkpoint
+def convert_ldm_clip_checkpoint(checkpoint):
+    """
+    Load the text-encoder weights found in `checkpoint` into a CLIPTextModel and return it.
+    The model is first created with `CLIPTextModel.from_pretrained("openai/clip-vit-large-patch14")`.
+    Every key starting with "cond_stage_model.transformer" is then copied, with the leading
+    "cond_stage_model.transformer." characters cut off, into the state dict that is loaded.
+    """
+    text_model = CLIPTextModel.from_pretrained("openai/clip-vit-large-patch14")
+    strip_len = len("cond_stage_model.transformer.")
+    text_model_dict = {
+        name[strip_len:]: checkpoint[name]
+        for name in list(checkpoint.keys())
+        if name.startswith("cond_stage_model.transformer")
+    }
+    text_model.load_state_dict(text_model_dict)
+    return text_model
+# (source key, target key) pairs that are renamed one-to-one.
+textenc_conversion_lst = [
+    ("cond_stage_model.model.positional_embedding", "text_model.embeddings.position_embedding.weight"),
+    ("cond_stage_model.model.token_embedding.weight", "text_model.embeddings.token_embedding.weight"),
+    ("cond_stage_model.model.ln_final.weight", "text_model.final_layer_norm.weight"),
+    ("cond_stage_model.model.ln_final.bias", "text_model.final_layer_norm.bias"),
+]
+textenc_conversion_map = dict(textenc_conversion_lst)
+# Substring renames for transformer keys.
+textenc_transformer_conversion_lst = [
+    # (stable-diffusion, HF Diffusers)
+    ("resblocks.", "text_model.encoder.layers."),
+    ("ln_1", "layer_norm1"),
+    ("ln_2", "layer_norm2"),
+    (".c_fc.", ".fc1."),
+    (".c_proj.", ".fc2."),
+    (".attn", ".self_attn"),
+    ("ln_final.", "transformer.text_model.final_layer_norm."),
+    ("token_embedding.weight", "transformer.text_model.embeddings.token_embedding.weight"),
+    ("positional_embedding", "transformer.text_model.embeddings.position_embedding.weight"),
+]
+# Regex-escaped source fragment -> replacement, plus one alternation pattern matching any fragment.
+protected = {re.escape(source): target for source, target in textenc_transformer_conversion_lst}
+textenc_pattern = re.compile("|".join(protected))

genphoto/utils/convert_lora_safetensor_to_diffusers.py CHANGED Viewed

@@ -1,3 +1,154 @@
-version https://git-lfs.github.com/spec/v1
-oid sha256:0c9162744237b045715cfe587c2be0117a49f538a99c1a853a2bf4c2d3695b69
-size 5981

+# coding=utf-8
+# Copyright 2023, Haofan Wang, Qixun Wang, All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+""" Conversion script for the LoRA's safetensors checkpoints. """
+import argparse
+import torch
+from safetensors.torch import load_file
+from diffusers import StableDiffusionPipeline
+import pdb
+def convert_motion_lora_ckpt_to_diffusers(pipeline, state_dict, alpha=1.0):
+    """
+    Merge LoRA down/up weight pairs from `state_dict` into `pipeline.unet`, in place.
+    For every key that does not contain "up.", the partner key is obtained by replacing
+    ".down." with ".up.", the target layer is located by walking attribute names derived from
+    the key, and `alpha * (up @ down)` is added to that layer's weight.
+    Returns the same `pipeline` object.
+    """
+    # Fragment rewrites that turn a LoRA key into the module path of the layer it modifies.
+    key_rewrites = (
+        ("processor.", ""),
+        ("_lora", ""),
+        ("down.", ""),
+        ("up.", ""),
+        ("to_out.", "to_out.0."),
+    )
+    # directly update weight in diffusers model
+    for down_name in state_dict:
+        # only the "down" half of each pair drives the update
+        if "up." in down_name:
+            continue
+        up_name = down_name.replace(".down.", ".up.")
+        module_path = down_name
+        for old_fragment, new_fragment in key_rewrites:
+            module_path = module_path.replace(old_fragment, new_fragment)
+        # Walk every segment except the last one to reach the layer.
+        target_layer = pipeline.unet
+        for attr_name in module_path.split(".")[:-1]:
+            target_layer = target_layer.__getattr__(attr_name)
+        down_weight = state_dict[down_name]
+        up_weight = state_dict[up_name]
+        delta = alpha * torch.mm(up_weight, down_weight)
+        target_layer.weight.data += delta.to(target_layer.weight.data.device)
+    return pipeline
+def convert_lora(pipeline, state_dict, LORA_PREFIX_UNET="lora_unet", LORA_PREFIX_TEXT_ENCODER="lora_te", alpha=0.6):
+    """
+    Merge LoRA up/down weight pairs from `state_dict` into `pipeline`, in place.
+    Keys containing "text" are resolved against `pipeline.text_encoder`, all other keys against
+    `pipeline.unet`. Keys containing ".alpha" and keys already handled as part of a pair are
+    skipped. For each pair, `alpha * (up @ down)` (computed in float32) is added to the target
+    layer's weight; 4-D weights have their last two dimensions squeezed for the product and
+    restored afterwards.
+    Returns the same `pipeline` object.
+    """
+    # The caller is expected to have loaded both inputs, e.g.:
+    # load base model
+    # pipeline = StableDiffusionPipeline.from_pretrained(base_model_path, torch_dtype=torch.float32)
+    # load LoRA weight from .safetensors
+    # state_dict = load_file(checkpoint_path)
+    # Keys already merged, so each up/down pair is applied only once.
+    visited = []
+    # directly update weight in diffusers model
+    for key in state_dict:
+        # it is suggested to print out the key, it usually will be something like below
+        # "lora_te_text_model_encoder_layers_0_self_attn_k_proj.lora_down.weight"
+        # as we have set the alpha beforehand, so just skip
+        if ".alpha" in key or key in visited:
+            continue
+        # Take the part before the first ".", drop the prefix, and split the rest on "_".
+        if "text" in key:
+            layer_infos = key.split(".")[0].split(LORA_PREFIX_TEXT_ENCODER + "_")[-1].split("_")
+            curr_layer = pipeline.text_encoder
+        else:
+            layer_infos = key.split(".")[0].split(LORA_PREFIX_UNET + "_")[-1].split("_")
+            curr_layer = pipeline.unet
+        # find the target layer
+        # Attribute names may themselves contain "_", so when a lookup fails the next segment is
+        # appended to the candidate name and the lookup is retried.
+        temp_name = layer_infos.pop(0)
+        # The condition is always true (len() is never negative); the loop ends via `break`.
+        # NOTE(review): if no candidate name ever resolves, `pop(0)` on the emptied list inside
+        # the handler raises IndexError, which propagates to the caller.
+        while len(layer_infos) > -1:
+            try:
+                curr_layer = curr_layer.__getattr__(temp_name)
+                if len(layer_infos) > 0:
+                    temp_name = layer_infos.pop(0)
+                elif len(layer_infos) == 0:
+                    break
+            except Exception:
+                if len(temp_name) > 0:
+                    temp_name += "_" + layer_infos.pop(0)
+                else:
+                    temp_name = layer_infos.pop(0)
+        # pair_keys is always ordered [up key, down key], whichever of the two `key` is.
+        pair_keys = []
+        if "lora_down" in key:
+            pair_keys.append(key.replace("lora_down", "lora_up"))
+            pair_keys.append(key)
+        else:
+            pair_keys.append(key)
+            pair_keys.append(key.replace("lora_up", "lora_down"))
+        # update weight
+        if len(state_dict[pair_keys[0]].shape) == 4:
+            # 4-D weights: drop dims 3 and 2 for the matrix product, then add them back.
+            weight_up = state_dict[pair_keys[0]].squeeze(3).squeeze(2).to(torch.float32)
+            weight_down = state_dict[pair_keys[1]].squeeze(3).squeeze(2).to(torch.float32)
+            curr_layer.weight.data += alpha * torch.mm(weight_up, weight_down).unsqueeze(2).unsqueeze(3).to(curr_layer.weight.data.device)
+        else:
+            weight_up = state_dict[pair_keys[0]].to(torch.float32)
+            weight_down = state_dict[pair_keys[1]].to(torch.float32)
+            curr_layer.weight.data += alpha * torch.mm(weight_up, weight_down).to(curr_layer.weight.data.device)
+        # update visited list
+        for item in pair_keys:
+            visited.append(item)
+    return pipeline
+if __name__ == "__main__":
+    # Command-line entry point: merge a LoRA .safetensors checkpoint into a diffusers
+    # Stable Diffusion pipeline and save the merged pipeline to --dump_path.
+    parser = argparse.ArgumentParser()
+    parser.add_argument(
+        "--base_model_path", default=None, type=str, required=True, help="Path to the base model in diffusers format."
+    )
+    parser.add_argument(
+        "--checkpoint_path", default=None, type=str, required=True, help="Path to the checkpoint to convert."
+    )
+    parser.add_argument("--dump_path", default=None, type=str, required=True, help="Path to the output model.")
+    parser.add_argument(
+        "--lora_prefix_unet", default="lora_unet", type=str, help="The prefix of UNet weight in safetensors"
+    )
+    parser.add_argument(
+        "--lora_prefix_text_encoder",
+        default="lora_te",
+        type=str,
+        help="The prefix of text encoder weight in safetensors",
+    )
+    parser.add_argument("--alpha", default=0.75, type=float, help="The merging ratio in W = W0 + alpha * deltaW")
+    parser.add_argument(
+        "--to_safetensors", action="store_true", help="Whether to store pipeline in safetensors format or not."
+    )
+    parser.add_argument("--device", type=str, help="Device to use (e.g. cpu, cuda:0, cuda:1, etc.)")
+    args = parser.parse_args()
+    # `convert_lora` works on an already-loaded pipeline and LoRA state dict, so both are
+    # loaded here before merging (the previous call targeted a `convert` function that is
+    # not defined in this module and failed with NameError).
+    pipeline = StableDiffusionPipeline.from_pretrained(args.base_model_path, torch_dtype=torch.float32)
+    state_dict = load_file(args.checkpoint_path)
+    pipe = convert_lora(
+        pipeline,
+        state_dict,
+        LORA_PREFIX_UNET=args.lora_prefix_unet,
+        LORA_PREFIX_TEXT_ENCODER=args.lora_prefix_text_encoder,
+        alpha=args.alpha,
+    )
+    pipe = pipe.to(args.device)
+    pipe.save_pretrained(args.dump_path, safe_serialization=args.to_safetensors)

genphoto/utils/util.py CHANGED Viewed

@@ -1,3 +1,148 @@
-version https://git-lfs.github.com/spec/v1
-oid sha256:cb53dbb7da4c905c1a68d9f74d5ac1e01ea13e82a2506117bc3d3436109bb1b4
-size 4875

+import os
+import functools
+import logging
+import sys
+import imageio
+import atexit
+import importlib
+import torch
+import torchvision
+import numpy as np
+from termcolor import colored
+from einops import rearrange
+def instantiate_from_config(config, **additional_kwargs):
+    """Instantiate the object described by ``config``.
+
+    ``config["target"]`` is a dotted path resolved through ``get_obj_from_str``;
+    the result is called with ``additional_kwargs`` updated by
+    ``config["kwargs"]`` (entries from the config win on key clashes).
+    The sentinel values ``'__is_first_stage__'`` and ``"__is_unconditional__"``
+    produce ``None``.
+
+    Raises:
+        KeyError: ``config`` has no ``target`` entry and is not a sentinel.
+    """
+    if "target" in config:
+        additional_kwargs.update(config.get("kwargs", dict()))
+        factory = get_obj_from_str(config["target"])
+        return factory(**additional_kwargs)
+    if config == '__is_first_stage__' or config == "__is_unconditional__":
+        return None
+    raise KeyError("Expected key `target` to instantiate.")
+def get_obj_from_str(string, reload=False):
+    """Resolve a dotted path such as ``"package.module.Name"`` to that attribute.
+
+    The text after the last dot is the attribute name; everything before it is
+    imported as a module. With ``reload=True`` the module is reloaded before
+    the attribute is looked up.
+    """
+    module_path, attr_name = string.rsplit(".", 1)
+    if reload:
+        importlib.reload(importlib.import_module(module_path))
+    module_obj = importlib.import_module(module_path, package=None)
+    return getattr(module_obj, attr_name)
+def save_videos_grid(videos: torch.Tensor, path: str, rescale=False, n_rows=6, fps=8):
+    """Tile a batch of videos into one grid image per frame and save them to ``path``.
+
+    Args:
+        videos: tensor laid out as ``b c t h w`` (see the rearrange pattern below).
+        path: output file; its parent directory is created when it does not exist.
+        rescale: when true, frames are mapped from ``[-1, 1]`` to ``[0, 1]`` first.
+        n_rows: forwarded to ``torchvision.utils.make_grid`` as ``nrow``.
+        fps: forwarded to ``imageio.mimsave``.
+    """
+    videos = rearrange(videos, "b c t h w -> t b c h w")
+    outputs = []
+    for x in videos:
+        x = torchvision.utils.make_grid(x, nrow=n_rows)
+        # Move the channel axis last: (c, h, w) -> (h, w, c).
+        x = x.transpose(0, 1).transpose(1, 2).squeeze(-1)
+        if rescale:
+            x = (x + 1.0) / 2.0  # -1,1 -> 0,1
+        # NOTE(review): values outside [0, 1] wrap around in the uint8 cast;
+        # callers are presumably expected to pass an already-normalised tensor.
+        x = (x * 255).numpy().astype(np.uint8)
+        outputs.append(x)
+    # A bare file name has an empty dirname, and os.makedirs("") raises, so the
+    # directory is only created when the path actually names one.
+    out_dir = os.path.dirname(path)
+    if out_dir:
+        os.makedirs(out_dir, exist_ok=True)
+    imageio.mimsave(path, outputs, fps=fps)
+# Logger utils are copied from detectron2
+class _ColorfulFormatter(logging.Formatter):
+    """Formatter that shortens the logger name and prefixes warnings and errors.
+
+    Requires the keyword ``root_name``; ``abbrev_name`` is optional. Occurrences
+    of ``root_name + "."`` in a record's name are replaced by the abbreviation
+    (followed by a dot when the abbreviation is non-empty).
+    """
+    def __init__(self, *args, **kwargs):
+        root = kwargs.pop("root_name")
+        abbrev = kwargs.pop("abbrev_name", "")
+        self._root_name = root + "."
+        self._abbrev_name = abbrev + "." if len(abbrev) else abbrev
+        super().__init__(*args, **kwargs)
+    def formatMessage(self, record):
+        """Format ``record``; WARNING, ERROR and CRITICAL get a coloured tag in front."""
+        record.name = record.name.replace(self._root_name, self._abbrev_name)
+        text = super().formatMessage(record)
+        level = record.levelno
+        if level == logging.WARNING:
+            tag = colored("WARNING", "red", attrs=["blink"])
+        elif level in (logging.ERROR, logging.CRITICAL):
+            tag = colored("ERROR", "red", attrs=["blink", "underline"])
+        else:
+            # Every other level is emitted without a tag.
+            return text
+        return tag + " " + text
+# Memoise the opened file object per file name, so that separate calls to
+# `setup_logger` naming the same file share one stream and can write to it safely.
+@functools.lru_cache(maxsize=None)
+def _cached_log_stream(filename):
+    """Open ``filename`` in append mode and return the file object (cached per name)."""
+    # use 1K buffer if writing to cloud storage
+    buffer_size = -1
+    if "://" in filename:
+        buffer_size = 1024
+    stream = open(filename, "a", buffering=buffer_size)
+    # The stream is never closed explicitly; closing is deferred to interpreter exit.
+    atexit.register(stream.close)
+    return stream
+@functools.lru_cache()
+def setup_logger(output, distributed_rank, color=True, name='AnimateDiff', abbrev_name=None):
+    """Create and configure the logger called ``name``.
+
+    The call is memoised, so repeating it with the same arguments returns the
+    same logger without attaching the handlers a second time.
+
+    Args:
+        output: ``None`` for no file logging, a path ending in ``.txt``/``.log``
+            used as the log file, or any other path, treated as a directory
+            that receives ``log.txt``.
+        distributed_rank: rank 0 also logs to stdout; ranks above 0 append
+            ``.rank<N>`` to the log file name.
+        color: use the colourised formatter for the stdout handler.
+        name: logger name, also the root name stripped by the colour formatter.
+        abbrev_name: replacement for ``name`` in coloured output; defaults to ``'AD'``.
+
+    Returns:
+        The configured ``logging.Logger``.
+    """
+    logger = logging.getLogger(name)
+    logger.setLevel(logging.DEBUG)
+    logger.propagate = False
+    if abbrev_name is None:
+        abbrev_name = 'AD'
+    plain_formatter = logging.Formatter(
+        "[%(asctime)s] %(name)s:%(lineno)d %(levelname)s: %(message)s", datefmt="%m/%d %H:%M:%S"
+    )
+    # stdout logging: master only
+    if distributed_rank == 0:
+        ch = logging.StreamHandler(stream=sys.stdout)
+        ch.setLevel(logging.DEBUG)
+        if color:
+            formatter = _ColorfulFormatter(
+                colored("[%(asctime)s %(name)s:%(lineno)d]: ", "green") + "%(message)s",
+                datefmt="%m/%d %H:%M:%S",
+                root_name=name,
+                abbrev_name=str(abbrev_name),
+            )
+        else:
+            formatter = plain_formatter
+        ch.setFormatter(formatter)
+        logger.addHandler(ch)
+    # file logging: all workers
+    if output is not None:
+        if output.endswith((".txt", ".log")):
+            filename = output
+        else:
+            filename = os.path.join(output, "log.txt")
+        if distributed_rank > 0:
+            filename = filename + ".rank{}".format(distributed_rank)
+        # A bare file name such as "run.log" has an empty dirname, and
+        # os.makedirs("") raises; only create the directory when there is one.
+        log_dir = os.path.dirname(filename)
+        if log_dir:
+            os.makedirs(log_dir, exist_ok=True)
+        fh = logging.StreamHandler(_cached_log_stream(filename))
+        fh.setLevel(logging.DEBUG)
+        fh.setFormatter(plain_formatter)
+        logger.addHandler(fh)
+    return logger
+def format_time(elapsed_time):
+    """Render a duration in seconds as text, e.g. ``"1 hours 1 minutes 1.50 seconds"``.
+
+    Units whose value is not positive are left out. When no unit is positive
+    (for example ``elapsed_time == 0``) the result is ``"0.00 seconds"`` rather
+    than an empty string.
+    """
+    # Time thresholds
+    minute = 60
+    hour = 60 * minute
+    day = 24 * hour
+    days, remainder = divmod(elapsed_time, day)
+    hours, remainder = divmod(remainder, hour)
+    minutes, seconds = divmod(remainder, minute)
+    parts = []
+    if days > 0:
+        parts.append(f"{int(days)} days")
+    if hours > 0:
+        parts.append(f"{int(hours)} hours")
+    if minutes > 0:
+        parts.append(f"{int(minutes)} minutes")
+    if seconds > 0:
+        parts.append(f"{seconds:.2f} seconds")
+    if not parts:
+        # Nothing to report: avoid handing callers an empty string.
+        return "0.00 seconds"
+    return " ".join(parts)

inference_bokehK.py CHANGED Viewed

@@ -1,3 +1,216 @@
-version https://git-lfs.github.com/spec/v1
-oid sha256:d71226af9e998b6f458bf837712b9ebcffab037ac09c29ea48742ba4d832b257
-size 8968

+import tempfile
+import imageio
+import os
+import torch
+import logging
+import argparse
+import json
+import numpy as np
+import torch.nn.functional as F
+from pathlib import Path
+from omegaconf import OmegaConf
+from torch.utils.data import Dataset
+from transformers import CLIPTextModel, CLIPTokenizer
+from diffusers import AutoencoderKL, DDIMScheduler
+from einops import rearrange
+from genphoto.pipelines.pipeline_animation import GenPhotoPipeline
+from genphoto.models.unet import UNet3DConditionModelCameraCond
+from genphoto.models.camera_adaptor import CameraCameraEncoder, CameraAdaptor
+from genphoto.utils.util import save_videos_grid
+logging.basicConfig(level=logging.INFO)
+logger = logging.getLogger(__name__)
+from huggingface_hub import hf_hub_download
+def create_bokehK_embedding(bokehK_values, target_height, target_width):
+    """Build a constant-valued conditioning map per frame from its bokeh K value.
+
+    For each frame a normalised Gaussian kernel is sampled (side
+    ``ceil(max(K, 1))``, ``sigma = K / 3``) and its centre weight is broadcast
+    over a ``3 x target_height x target_width`` map.
+
+    Args:
+        bokehK_values: tensor whose first dimension indexes frames; each entry
+            must hold a single value (it is read with ``.item()``).
+        target_height: height of the output maps.
+        target_width: width of the output maps.
+
+    Returns:
+        Tensor of shape ``(f, 3, target_height, target_width)`` with the dtype
+        of ``bokehK_values``.
+    """
+    f = bokehK_values.shape[0]
+    bokehK_embedding = torch.zeros((f, 3, target_height, target_width), dtype=bokehK_values.dtype)
+    for i in range(f):
+        K_value = bokehK_values[i].item()
+        kernel_size = max(K_value, 1)
+        num_taps = int(np.ceil(kernel_size))
+        if num_taps == 1:
+            # A one-sample kernel normalises to exactly 1 for any sigma. Taking
+            # this shortcut gives the same value as before for ordinary K <= 1
+            # and avoids the 0/0 -> NaN that K == 0 (sigma == 0) or a very
+            # small K (exp underflow) used to produce.
+            scale = 1.0
+        else:
+            sigma = K_value / 3.0
+            ax = np.linspace(-(kernel_size / 2), kernel_size / 2, num_taps)
+            xx, yy = np.meshgrid(ax, ax)
+            kernel = np.exp(-(xx ** 2 + yy ** 2) / (2 * sigma ** 2))
+            kernel /= np.sum(kernel)
+            center = int(num_taps / 2)
+            scale = kernel[center, center]
+        bokehK_embedding[i] = scale
+    return bokehK_embedding
+class Camera_Embedding(Dataset):
+    """Builds the camera conditioning tensor for a five-value bokeh sweep.
+
+    ``load`` concatenates, along the channel axis, a physical-prior map from
+    ``create_bokehK_embedding`` with a map derived from differences between the
+    text-encoder states of per-value prompts.
+    """
+    def __init__(self, bokehK_values, tokenizer, text_encoder, device, sample_size=[256, 384]):
+        self.bokehK_values = bokehK_values.to(device)
+        self.tokenizer = tokenizer
+        self.text_encoder = text_encoder
+        self.device = device
+        # [height, width] of the maps produced by load(); never mutated here.
+        self.sample_size = sample_size
+    def load(self):
+        """Return the camera embedding; channel axis holds 3 prior + 3 text-difference maps.
+
+        Raises:
+            ValueError: unless exactly five bokeh K values were supplied.
+        """
+        if len(self.bokehK_values) != 5:
+            raise ValueError("Expected 5 bokehK values")
+        prompts = [f"<bokeh kernel size: {bb.item()}>" for bb in self.bokehK_values]
+        with torch.no_grad():
+            prompt_ids = self.tokenizer(
+                prompts, max_length=self.tokenizer.model_max_length, padding="max_length", truncation=True, return_tensors="pt"
+            ).input_ids.to(self.device)
+            encoder_hidden_states = self.text_encoder(input_ids=prompt_ids).last_hidden_state
+        # Differences between consecutive prompts, followed by last-minus-first,
+        # so there is one entry per input value.
+        differences = [
+            (encoder_hidden_states[i] - encoder_hidden_states[i - 1]).unsqueeze(0)
+            for i in range(1, encoder_hidden_states.size(0))
+        ]
+        differences.append((encoder_hidden_states[-1] - encoder_hidden_states[0]).unsqueeze(0))
+        concatenated_differences = torch.cat(differences, dim=0)
+        # Pad the token axis up to 128. Start from the unpadded tensor so the
+        # name is always bound, including when no padding is needed (it used to
+        # be undefined in that case).
+        pad_length = 128 - concatenated_differences.size(1)
+        concatenated_differences_padded = concatenated_differences
+        if pad_length > 0:
+            concatenated_differences_padded = F.pad(concatenated_differences, (0, 0, 0, pad_length))
+        # The reshape needs 128 * hidden_size == height * width
+        # (presumably hidden_size 768 with the default 256 x 384 - TODO confirm).
+        ccl_embedding = concatenated_differences_padded.reshape(
+            concatenated_differences_padded.size(0), self.sample_size[0], self.sample_size[1]
+        ).unsqueeze(1).expand(-1, 3, -1, -1).to(self.device)
+        bokehK_embedding = create_bokehK_embedding(self.bokehK_values, self.sample_size[0], self.sample_size[1]).to(self.device)
+        camera_embedding = torch.cat((bokehK_embedding, ccl_embedding), dim=1)
+        return camera_embedding
+def load_models(cfg):
+    """Assemble the GenPhoto pipeline for bokeh control and return ``(pipeline, device)``.
+
+    Weights are downloaded from the ``pandaphd/generative_photography`` hub
+    repository. ``cfg`` supplies the scheduler, UNet, camera-encoder and
+    attention-processor settings. Its ``lora_ckpt``, ``motion_module_ckpt``
+    and ``camera_adaptor_ckpt`` entries are only compared with ``None`` here,
+    to decide whether the corresponding downloaded checkpoint is applied.
+    """
+    device = "cuda" if torch.cuda.is_available() else "cpu"
+    # NOTE(review): the first download names a path ending in "/", i.e. a
+    # directory, while hf_hub_download is documented as fetching a single file -
+    # confirm this resolves to something from_pretrained(..., subfolder=...)
+    # can read. All four downloads run regardless of the cfg.*_ckpt flags.
+    pretrained_model_path = hf_hub_download("pandaphd/generative_photography", "stable-diffusion-v1-5/")
+    lora_ckpt_path = hf_hub_download("pandaphd/generative_photography", "weights/RealEstate10K_LoRA.ckpt")
+    motion_module_ckpt_path = hf_hub_download("pandaphd/generative_photography", "weights/v3_sd15_mm.ckpt")
+    camera_adaptor_ckpt_path = hf_hub_download("pandaphd/generative_photography", "weights/checkpoint-bokehK.ckpt")
+    noise_scheduler = DDIMScheduler(**OmegaConf.to_container(cfg.noise_scheduler_kwargs))
+    # Every sub-model is used for inference only, so gradients are disabled.
+    vae = AutoencoderKL.from_pretrained(pretrained_model_path, subfolder="vae").to(device)
+    vae.requires_grad_(False)
+    tokenizer = CLIPTokenizer.from_pretrained(pretrained_model_path, subfolder="tokenizer")
+    text_encoder = CLIPTextModel.from_pretrained(pretrained_model_path, subfolder="text_encoder").to(device)
+    text_encoder.requires_grad_(False)
+    unet = UNet3DConditionModelCameraCond.from_pretrained_2d(
+        pretrained_model_path,
+        subfolder=cfg.unet_subfolder,
+        unet_additional_kwargs=cfg.unet_additional_kwargs
+    ).to(device)
+    unet.requires_grad_(False)
+    camera_encoder = CameraCameraEncoder(**cfg.camera_encoder_kwargs).to(device)
+    camera_encoder.requires_grad_(False)
+    camera_adaptor = CameraAdaptor(unet, camera_encoder)
+    camera_adaptor.requires_grad_(False)
+    camera_adaptor.to(device)
+    # The attention processors are installed before any checkpoint is loaded;
+    # presumably the LoRA / adaptor weights target parameters created here.
+    unet.set_all_attn_processor(
+        add_spatial_lora=cfg.lora_ckpt is not None,
+        add_motion_lora=cfg.motion_lora_rank > 0,
+        lora_kwargs={"lora_rank": cfg.lora_rank, "lora_scale": cfg.lora_scale},
+        motion_lora_kwargs={"lora_rank": cfg.motion_lora_rank, "lora_scale": cfg.motion_lora_scale},
+        **cfg.attention_processor_kwargs
+    )
+    # load_state_dict(strict=False) returns (missing_keys, unexpected_keys).
+    # For the UNet loads below only the unexpected list is checked.
+    # NOTE(review): these asserts are skipped under `python -O`.
+    if cfg.lora_ckpt is not None:
+        lora_checkpoints = torch.load(lora_ckpt_path, map_location=unet.device)
+        # The checkpoint may wrap its weights under a 'lora_state_dict' key.
+        if 'lora_state_dict' in lora_checkpoints.keys():
+            lora_checkpoints = lora_checkpoints['lora_state_dict']
+        _, lora_u = unet.load_state_dict(lora_checkpoints, strict=False)
+        assert len(lora_u) == 0
+    if cfg.motion_module_ckpt is not None:
+        mm_checkpoints = torch.load(motion_module_ckpt_path, map_location=unet.device)
+        _, mm_u = unet.load_state_dict(mm_checkpoints, strict=False)
+        assert len(mm_u) == 0
+    if cfg.camera_adaptor_ckpt is not None:
+        camera_adaptor_checkpoint = torch.load(camera_adaptor_ckpt_path, map_location=device)
+        camera_encoder_state_dict = camera_adaptor_checkpoint['camera_encoder_state_dict']
+        attention_processor_state_dict = camera_adaptor_checkpoint['attention_processor_state_dict']
+        # The camera encoder must match the checkpoint exactly (no missing, no unexpected keys).
+        camera_enc_m, camera_enc_u = camera_adaptor.camera_encoder.load_state_dict(camera_encoder_state_dict, strict=False)
+        assert len(camera_enc_m) == 0 and len(camera_enc_u) == 0
+        _, attention_processor_u = camera_adaptor.unet.load_state_dict(attention_processor_state_dict, strict=False)
+        assert len(attention_processor_u) == 0
+    pipeline = GenPhotoPipeline(
+        vae=vae,
+        text_encoder=text_encoder,
+        tokenizer=tokenizer,
+        unet=unet,
+        scheduler=noise_scheduler,
+        camera_encoder=camera_encoder
+    ).to(device)
+    pipeline.enable_vae_slicing()
+    return pipeline, device
+def run_inference(pipeline, tokenizer, text_encoder, base_scene, bokehK_list, device, video_length=5, height=256, width=384):
+    """Generate one clip for ``base_scene`` across the given bokeh K values.
+
+    Args:
+        pipeline: callable pipeline whose result exposes ``.videos``.
+        tokenizer, text_encoder: passed to ``Camera_Embedding``.
+        base_scene: text prompt describing the scene.
+        bokehK_list: JSON array of bokeh K values as a string, e.g. ``"[2.44, 8.3]"``.
+        device: device on which the camera embedding is built.
+        video_length, height, width: forwarded to the pipeline.
+
+    Returns:
+        Path of the temporary ``.mp4`` file that was written; the caller owns it.
+    """
+    bokehK_values = json.loads(bokehK_list)
+    bokehK_values = torch.tensor(bokehK_values).unsqueeze(1)
+    camera_embedding = Camera_Embedding(bokehK_values, tokenizer, text_encoder, device).load()
+    camera_embedding = rearrange(camera_embedding.unsqueeze(0), "b f c h w -> b c f h w")
+    with torch.no_grad():
+        sample = pipeline(
+            prompt=base_scene,
+            camera_embedding=camera_embedding,
+            video_length=video_length,
+            height=height,
+            width=width,
+            num_inference_steps=25,
+            guidance_scale=8.0
+        ).videos[0].cpu()
+    # delete=False keeps the reserved file on disk. Previously the temp object
+    # was dropped at once, which deleted the file and left only a name that
+    # another process could claim before the video was written.
+    with tempfile.NamedTemporaryFile(suffix='.mp4', delete=False) as tmp_file:
+        temporal_video_path = tmp_file.name
+    save_videos_grid(sample[None], temporal_video_path, rescale=False)
+    return temporal_video_path
+def main(config_path, base_scene, bokehK_list):
+    """Seed torch, load the models named by the YAML config and render one bokeh clip."""
+    torch.manual_seed(42)
+    cfg = OmegaConf.load(config_path)
+    logger.info("Loading models...")
+    pipe, run_device = load_models(cfg)
+    logger.info("Starting inference...")
+    video_path = run_inference(
+        pipe,
+        pipe.tokenizer,
+        pipe.text_encoder,
+        base_scene,
+        bokehK_list,
+        run_device,
+    )
+    logger.info(f"Video saved to {video_path}")
+if __name__ == "__main__":
+    # Command-line entry point for the bokeh demo.
+    parser = argparse.ArgumentParser()
+    parser.add_argument("--config", type=str, required=True, help="Path to YAML configuration file")
+    parser.add_argument("--base_scene", type=str, required=True, help="Scene description")
+    # The value is parsed with json.loads in run_inference, so it has to be a
+    # JSON array rather than a bare comma-separated list.
+    parser.add_argument("--bokehK_list", type=str, required=True, help='Bokeh K values as a JSON list, e.g. "[2.44, 8.3, 10.1, 17.2, 24.0]"')
+    args = parser.parse_args()
+    main(args.config, args.base_scene, args.bokehK_list)
+## example
+## python inference_bokehK.py --config configs/inference_genphoto/adv3_256_384_genphoto_relora_bokehK.yaml --base_scene "A young boy wearing an orange jacket is standing on a crosswalk, waiting to cross the street." --bokehK_list "[2.44, 8.3, 10.1, 17.2, 24.0]"

inference_color_temperature.py CHANGED Viewed

@@ -1,3 +1,338 @@
-version https://git-lfs.github.com/spec/v1
-oid sha256:6ed5fe8385e56e837fdb7c8ca21973136a42f4c3b09c6223c800dcc60955d61d
-size 14631

+import tempfile
+import imageio
+import os
+import torch
+import logging
+import argparse
+import json
+import numpy as np
+import torch.nn.functional as F
+from pathlib import Path
+from omegaconf import OmegaConf
+from torch.utils.data import Dataset
+from transformers import CLIPTextModel, CLIPTokenizer
+from diffusers import AutoencoderKL, DDIMScheduler
+from einops import rearrange
+from genphoto.pipelines.pipeline_animation import GenPhotoPipeline
+from genphoto.models.unet import UNet3DConditionModelCameraCond
+from genphoto.models.camera_adaptor import CameraCameraEncoder, CameraAdaptor
+from genphoto.utils.util import save_videos_grid
+logging.basicConfig(level=logging.INFO)
+logger = logging.getLogger(__name__)
+from huggingface_hub import hf_hub_download
+def kelvin_to_rgb(kelvin):
+    """Approximate the RGB colour of a colour temperature given in Kelvin.
+
+    Accepts a number or a single-element tensor. Returns a float32 array
+    ``[red, green, blue]`` divided by 255; the components are not clamped.
+    Three curve segments are used: up to 6600 K, a blended band up to 8800 K,
+    and everything above.
+    """
+    if torch.is_tensor(kelvin):
+        kelvin = kelvin.cpu().item()
+    temp = kelvin / 100.0
+    if temp <= 66:
+        red = 255
+        green = 99.4708025861 * np.log(temp) - 161.1195681661 if temp > 0 else 0
+        blue = 0 if temp <= 19 else 138.5177312231 * np.log(temp - 10) - 305.0447927307
+    elif temp <= 88:
+        # Transition band: each channel is the mean of the warm-side and cool-side formulas.
+        red = 0.5 * (255 + 329.698727446 * ((temp - 60) ** -0.19332047592))
+        green = 0.5 * (288.1221695283 * ((temp - 60) ** -0.1155148492) +
+                       (99.4708025861 * np.log(temp) - 161.1195681661 if temp > 0 else 0))
+        blue = 0.5 * (138.5177312231 * np.log(temp - 10) - 305.0447927307 + 255)
+    else:
+        red = 329.698727446 * ((temp - 60) ** -0.19332047592)
+        green = 288.1221695283 * ((temp - 60) ** -0.1155148492)
+        blue = 255
+    channels = np.array([red, green, blue], dtype=np.float32)
+    return channels / 255.0
+def create_color_temperature_embedding(color_temperature_values, target_height, target_width, min_color_temperature=2000, max_color_temperature=10000):
+    """Build a constant RGB map per frame from its colour-temperature value.
+
+    Each value ``v`` is mapped to ``min + v * (max - min)`` Kelvin, converted
+    with ``kelvin_to_rgb`` and broadcast over the spatial size.
+    NOTE(review): that mapping reads like it expects values normalised to
+    [0, 1], while the usage example in this file passes raw Kelvin figures -
+    confirm which one matches training.
+
+    Returns:
+        Float tensor of shape ``(f, 3, target_height, target_width)``.
+    """
+    f = color_temperature_values.shape[0]
+    rgb_factors = []
+    # reshape(-1) rather than squeeze(): squeeze() turns a single-frame input
+    # into a 0-d tensor, which cannot be iterated.
+    for color_temperature in color_temperature_values.reshape(-1):
+        kelvin = min_color_temperature + (color_temperature * (max_color_temperature - min_color_temperature))  # Map normalized color_temperature to actual Kelvin
+        rgb = kelvin_to_rgb(kelvin)
+        rgb_factors.append(rgb)
+    # Stack into one ndarray first; building a tensor from a list of ndarrays
+    # is slow and triggers a torch warning.
+    rgb_factors = torch.as_tensor(np.array(rgb_factors, dtype=np.float32)).float()  # [f, 3]
+    rgb_factors = rgb_factors.unsqueeze(2).unsqueeze(3)  # [f, 3, 1, 1]
+    color_temperature_embedding = rgb_factors.expand(f, 3, target_height, target_width)  # [f, 3, target_height, target_width]
+    return color_temperature_embedding
+class Camera_Embedding(Dataset):
+    """Builds the camera conditioning tensor for a five-value colour-temperature sweep.
+
+    ``load`` concatenates, along the channel axis, a physical-prior map from
+    ``create_color_temperature_embedding`` with a map derived from differences
+    between the text-encoder states of per-value prompts.
+    """
+    def __init__(self, color_temperature_values, tokenizer, text_encoder, device, sample_size=[256, 384]):
+        self.color_temperature_values = color_temperature_values.to(device)
+        self.tokenizer = tokenizer
+        self.text_encoder = text_encoder
+        self.device = device
+        # [height, width] of the maps produced by load(); never mutated here.
+        self.sample_size = sample_size
+    def load(self):
+        """Return the camera embedding; channel axis holds 3 prior + 3 text-difference maps.
+
+        Raises:
+            ValueError: unless exactly five colour-temperature values were supplied.
+        """
+        if len(self.color_temperature_values) != 5:
+            raise ValueError("Expected 5 color_temperature values")
+        # One prompt per value, carrying the colour temperature in the text.
+        prompts = [f"<color temperature: {ct.item()}>" for ct in self.color_temperature_values]
+        # Tokenize prompts and encode to get embeddings
+        with torch.no_grad():
+            prompt_ids = self.tokenizer(
+                prompts, max_length=self.tokenizer.model_max_length, padding="max_length", truncation=True, return_tensors="pt"
+            ).input_ids.to(self.device)
+            encoder_hidden_states = self.text_encoder(input_ids=prompt_ids).last_hidden_state  # Shape: (f, sequence_length, hidden_size)
+        # Differences between consecutive prompts, followed by last-minus-first,
+        # so there is one entry per input value.
+        differences = [
+            (encoder_hidden_states[i] - encoder_hidden_states[i - 1]).unsqueeze(0)
+            for i in range(1, encoder_hidden_states.size(0))
+        ]
+        differences.append((encoder_hidden_states[-1] - encoder_hidden_states[0]).unsqueeze(0))
+        concatenated_differences = torch.cat(differences, dim=0)
+        frame = concatenated_differences.size(0)
+        # Pad the token axis up to 128. Start from the unpadded tensor so the
+        # name is always bound, including when no padding is needed (it used to
+        # be undefined in that case).
+        pad_length = 128 - concatenated_differences.size(1)
+        concatenated_differences_padded = concatenated_differences
+        if pad_length > 0:
+            concatenated_differences_padded = F.pad(concatenated_differences, (0, 0, 0, pad_length))
+        # The reshape needs 128 * hidden_size == height * width
+        # (presumably hidden_size 768 with the default 256 x 384 - TODO confirm).
+        ccl_embedding = concatenated_differences_padded.reshape(frame, self.sample_size[0], self.sample_size[1])
+        ccl_embedding = ccl_embedding.unsqueeze(1)
+        ccl_embedding = ccl_embedding.expand(-1, 3, -1, -1)
+        ccl_embedding = ccl_embedding.to(self.device)
+        color_temperature_embedding = create_color_temperature_embedding(self.color_temperature_values, self.sample_size[0], self.sample_size[1]).to(self.device)
+        camera_embedding = torch.cat((color_temperature_embedding, ccl_embedding), dim=1)
+        return camera_embedding
+#
+# def load_models(cfg):
+#
+#     device = "cuda" if torch.cuda.is_available() else "cpu"
+#
+#     noise_scheduler = DDIMScheduler(**OmegaConf.to_container(cfg.noise_scheduler_kwargs))
+#     vae = AutoencoderKL.from_pretrained(cfg.pretrained_model_path, subfolder="vae").to(device)
+#     vae.requires_grad_(False)
+#     tokenizer = CLIPTokenizer.from_pretrained(cfg.pretrained_model_path, subfolder="tokenizer")
+#     text_encoder = CLIPTextModel.from_pretrained(cfg.pretrained_model_path, subfolder="text_encoder").to(device)
+#     text_encoder.requires_grad_(False)
+#     unet = UNet3DConditionModelCameraCond.from_pretrained_2d(
+#         cfg.pretrained_model_path,
+#         subfolder=cfg.unet_subfolder,
+#         unet_additional_kwargs=cfg.unet_additional_kwargs
+#     ).to(device)
+#     unet.requires_grad_(False)
+#
+#     camera_encoder = CameraCameraEncoder(**cfg.camera_encoder_kwargs).to(device)
+#     camera_encoder.requires_grad_(False)
+#     camera_adaptor = CameraAdaptor(unet, camera_encoder)
+#     camera_adaptor.requires_grad_(False)
+#     camera_adaptor.to(device)
+#
+#     logger.info("Setting the attention processors")
+#     unet.set_all_attn_processor(
+#         add_spatial_lora=cfg.lora_ckpt is not None,
+#         add_motion_lora=cfg.motion_lora_rank > 0,
+#         lora_kwargs={"lora_rank": cfg.lora_rank, "lora_scale": cfg.lora_scale},
+#         motion_lora_kwargs={"lora_rank": cfg.motion_lora_rank, "lora_scale": cfg.motion_lora_scale},
+#         **cfg.attention_processor_kwargs
+#     )
+#
+#     if cfg.lora_ckpt is not None:
+#         print(f"Loading the lora checkpoint from {cfg.lora_ckpt}")
+#         lora_checkpoints = torch.load(cfg.lora_ckpt, map_location=unet.device)
+#         if 'lora_state_dict' in lora_checkpoints.keys():
+#             lora_checkpoints = lora_checkpoints['lora_state_dict']
+#         _, lora_u = unet.load_state_dict(lora_checkpoints, strict=False)
+#         assert len(lora_u) == 0
+#         print(f'Loading done')
+#
+#     if cfg.motion_module_ckpt is not None:
+#         print(f"Loading the motion module checkpoint from {cfg.motion_module_ckpt}")
+#         mm_checkpoints = torch.load(cfg.motion_module_ckpt, map_location=unet.device)
+#         _, mm_u = unet.load_state_dict(mm_checkpoints, strict=False)
+#         assert len(mm_u) == 0
+#         print("Loading done")
+#
+#
+#     if cfg.camera_adaptor_ckpt is not None:
+#         logger.info(f"Loading camera adaptor from {cfg.camera_adaptor_ckpt}")
+#         camera_adaptor_checkpoint = torch.load(cfg.camera_adaptor_ckpt, map_location=device)
+#         camera_encoder_state_dict = camera_adaptor_checkpoint['camera_encoder_state_dict']
+#         attention_processor_state_dict = camera_adaptor_checkpoint['attention_processor_state_dict']
+#         camera_enc_m, camera_enc_u = camera_adaptor.camera_encoder.load_state_dict(camera_encoder_state_dict, strict=False)
+#
+#         assert len(camera_enc_m) == 0 and len(camera_enc_u) == 0
+#         _, attention_processor_u = camera_adaptor.unet.load_state_dict(attention_processor_state_dict, strict=False)
+#         assert len(attention_processor_u) == 0
+#
+#         logger.info("Camera Adaptor loading done")
+#     else:
+#         logger.info("No Camera Adaptor checkpoint used")
+#
+#     pipeline = GenPhotoPipeline(
+#         vae=vae,
+#         text_encoder=text_encoder,
+#         tokenizer=tokenizer,
+#         unet=unet,
+#         scheduler=noise_scheduler,
+#         camera_encoder=camera_encoder
+#     ).to(device)
+#
+#     pipeline.enable_vae_slicing()
+#
+#     return pipeline, device
+def load_models(cfg):
+    """Assemble the GenPhoto pipeline for colour-temperature control and return ``(pipeline, device)``.
+
+    Weights are downloaded from the ``pandaphd/generative_photography`` hub
+    repository. ``cfg`` supplies the scheduler, UNet, camera-encoder and
+    attention-processor settings. Its ``lora_ckpt``, ``motion_module_ckpt``
+    and ``camera_adaptor_ckpt`` entries are only compared with ``None`` here,
+    to decide whether the corresponding downloaded checkpoint is applied.
+    """
+    device = "cuda" if torch.cuda.is_available() else "cpu"
+    # NOTE(review): the first download names a path ending in "/", i.e. a
+    # directory, while hf_hub_download is documented as fetching a single file -
+    # confirm this resolves to something from_pretrained(..., subfolder=...)
+    # can read. All four downloads run regardless of the cfg.*_ckpt flags.
+    pretrained_model_path = hf_hub_download("pandaphd/generative_photography", "stable-diffusion-v1-5/")
+    lora_ckpt_path = hf_hub_download("pandaphd/generative_photography", "weights/RealEstate10K_LoRA.ckpt")
+    motion_module_ckpt_path = hf_hub_download("pandaphd/generative_photography", "weights/v3_sd15_mm.ckpt")
+    camera_adaptor_ckpt_path = hf_hub_download("pandaphd/generative_photography", "weights/checkpoint-color_temperature.ckpt")
+    noise_scheduler = DDIMScheduler(**OmegaConf.to_container(cfg.noise_scheduler_kwargs))
+    # Every sub-model is used for inference only, so gradients are disabled.
+    vae = AutoencoderKL.from_pretrained(pretrained_model_path, subfolder="vae").to(device)
+    vae.requires_grad_(False)
+    tokenizer = CLIPTokenizer.from_pretrained(pretrained_model_path, subfolder="tokenizer")
+    text_encoder = CLIPTextModel.from_pretrained(pretrained_model_path, subfolder="text_encoder").to(device)
+    text_encoder.requires_grad_(False)
+    unet = UNet3DConditionModelCameraCond.from_pretrained_2d(
+        pretrained_model_path,
+        subfolder=cfg.unet_subfolder,
+        unet_additional_kwargs=cfg.unet_additional_kwargs
+    ).to(device)
+    unet.requires_grad_(False)
+    camera_encoder = CameraCameraEncoder(**cfg.camera_encoder_kwargs).to(device)
+    camera_encoder.requires_grad_(False)
+    camera_adaptor = CameraAdaptor(unet, camera_encoder)
+    camera_adaptor.requires_grad_(False)
+    camera_adaptor.to(device)
+    # The attention processors are installed before any checkpoint is loaded;
+    # presumably the LoRA / adaptor weights target parameters created here.
+    unet.set_all_attn_processor(
+        add_spatial_lora=cfg.lora_ckpt is not None,
+        add_motion_lora=cfg.motion_lora_rank > 0,
+        lora_kwargs={"lora_rank": cfg.lora_rank, "lora_scale": cfg.lora_scale},
+        motion_lora_kwargs={"lora_rank": cfg.motion_lora_rank, "lora_scale": cfg.motion_lora_scale},
+        **cfg.attention_processor_kwargs
+    )
+    # load_state_dict(strict=False) returns (missing_keys, unexpected_keys).
+    # For the UNet loads below only the unexpected list is checked.
+    # NOTE(review): these asserts are skipped under `python -O`.
+    if cfg.lora_ckpt is not None:
+        lora_checkpoints = torch.load(lora_ckpt_path, map_location=unet.device)
+        # The checkpoint may wrap its weights under a 'lora_state_dict' key.
+        if 'lora_state_dict' in lora_checkpoints.keys():
+            lora_checkpoints = lora_checkpoints['lora_state_dict']
+        _, lora_u = unet.load_state_dict(lora_checkpoints, strict=False)
+        assert len(lora_u) == 0
+    if cfg.motion_module_ckpt is not None:
+        mm_checkpoints = torch.load(motion_module_ckpt_path, map_location=unet.device)
+        _, mm_u = unet.load_state_dict(mm_checkpoints, strict=False)
+        assert len(mm_u) == 0
+    if cfg.camera_adaptor_ckpt is not None:
+        camera_adaptor_checkpoint = torch.load(camera_adaptor_ckpt_path, map_location=device)
+        camera_encoder_state_dict = camera_adaptor_checkpoint['camera_encoder_state_dict']
+        attention_processor_state_dict = camera_adaptor_checkpoint['attention_processor_state_dict']
+        # The camera encoder must match the checkpoint exactly (no missing, no unexpected keys).
+        camera_enc_m, camera_enc_u = camera_adaptor.camera_encoder.load_state_dict(camera_encoder_state_dict, strict=False)
+        assert len(camera_enc_m) == 0 and len(camera_enc_u) == 0
+        _, attention_processor_u = camera_adaptor.unet.load_state_dict(attention_processor_state_dict, strict=False)
+        assert len(attention_processor_u) == 0
+    pipeline = GenPhotoPipeline(
+        vae=vae,
+        text_encoder=text_encoder,
+        tokenizer=tokenizer,
+        unet=unet,
+        scheduler=noise_scheduler,
+        camera_encoder=camera_encoder
+    ).to(device)
+    pipeline.enable_vae_slicing()
+    return pipeline, device
+def run_inference(pipeline, tokenizer, text_encoder, base_scene, color_temperature_list, device, video_length=5, height=256, width=384):
+    """Generate one clip for ``base_scene`` across the given colour temperatures.
+
+    Args:
+        pipeline: callable pipeline whose result exposes ``.videos``.
+        tokenizer, text_encoder: passed to ``Camera_Embedding``.
+        base_scene: text prompt describing the scene.
+        color_temperature_list: JSON array of values as a string.
+        device: device on which the camera embedding is built.
+        video_length, height, width: forwarded to the pipeline.
+
+    Returns:
+        Path of the temporary ``.mp4`` file that was written; the caller owns it.
+    """
+    color_temperature_values = json.loads(color_temperature_list)
+    color_temperature_values = torch.tensor(color_temperature_values).unsqueeze(1)
+    # Ensure camera_embedding is on the correct device
+    camera_embedding = Camera_Embedding(color_temperature_values, tokenizer, text_encoder, device).load()
+    camera_embedding = rearrange(camera_embedding.unsqueeze(0), "b f c h w -> b c f h w")
+    with torch.no_grad():
+        sample = pipeline(
+            prompt=base_scene,
+            camera_embedding=camera_embedding,
+            video_length=video_length,
+            height=height,
+            width=width,
+            num_inference_steps=25,
+            guidance_scale=8.0
+        ).videos[0].cpu()
+    # delete=False keeps the reserved file on disk. Previously the temp object
+    # was dropped at once, which deleted the file and left only a name that
+    # another process could claim before the video was written.
+    with tempfile.NamedTemporaryFile(suffix='.mp4', delete=False) as tmp_file:
+        temporal_video_path = tmp_file.name
+    save_videos_grid(sample[None], temporal_video_path, rescale=False)
+    return temporal_video_path
+def main(config_path, base_scene, color_temperature_list):
+    """Seed torch, load the models named by the YAML config and render one colour-temperature clip."""
+    torch.manual_seed(42)
+    cfg = OmegaConf.load(config_path)
+    logger.info("Loading models...")
+    pipe, run_device = load_models(cfg)
+    logger.info("Starting inference...")
+    video_path = run_inference(
+        pipe,
+        pipe.tokenizer,
+        pipe.text_encoder,
+        base_scene,
+        color_temperature_list,
+        run_device,
+    )
+    logger.info(f"Video saved to {video_path}")
+if __name__ == "__main__":
+    # Command-line entry point for the colour-temperature demo.
+    parser = argparse.ArgumentParser()
+    parser.add_argument("--config", type=str, required=True, help="Path to YAML configuration file")
+    # The caption is passed through unchanged as the pipeline prompt; it is not parsed as JSON.
+    parser.add_argument("--base_scene", type=str, required=True, help="invariant scene caption (plain text)")
+    # This value is parsed with json.loads in run_inference.
+    parser.add_argument("--color_temperature_list", type=str, required=True, help="color_temperature values as a JSON list string")
+    args = parser.parse_args()
+    main(args.config, args.base_scene, args.color_temperature_list)
+    # usage example
+    # python inference_color_temperature.py --config configs/inference_genphoto/adv3_256_384_genphoto_relora_color_temperature.yaml --base_scene "A beautiful blue sky with a mountain range in the background." --color_temperature_list "[2455.0, 4155.0, 5555.0, 6555.0, 5855.0]"

inference_focal_length.py CHANGED Viewed

@@ -1,3 +1,335 @@
-version https://git-lfs.github.com/spec/v1
-oid sha256:c41bc79a24be2dce1457e285e6fcd5cb3396b677bae30ae010e3f23ae993817c
-size 15177

+import tempfile
+import imageio
+import os
+import torch
+import logging
+import argparse
+import json
+import numpy as np
+import torch.nn.functional as F
+from pathlib import Path
+from omegaconf import OmegaConf
+from torch.utils.data import Dataset
+from transformers import CLIPTextModel, CLIPTokenizer
+from diffusers import AutoencoderKL, DDIMScheduler
+from einops import rearrange
+from genphoto.pipelines.pipeline_animation import GenPhotoPipeline
+from genphoto.models.unet import UNet3DConditionModelCameraCond
+from genphoto.models.camera_adaptor import CameraCameraEncoder, CameraAdaptor
+from genphoto.utils.util import save_videos_grid
+logging.basicConfig(level=logging.INFO)
+logger = logging.getLogger(__name__)
+from huggingface_hub import hf_hub_download
+def create_focal_length_embedding(focal_length_values, target_height, target_width, base_focal_length=24.0, sensor_height=24.0, sensor_width=36.0):
+    device = 'cpu'
+    focal_length_values = focal_length_values.to(device)
+    f = focal_length_values.shape[0]  # Number of frames
+    # Convert constants to tensors to perform operations with focal_length_values
+    sensor_width = torch.tensor(sensor_width, device=device)
+    sensor_height = torch.tensor(sensor_height, device=device)
+    base_focal_length = torch.tensor(base_focal_length, device=device)
+    # Calculate the FOV for the base focal length (min_focal_length)
+    base_fov_x = 2.0 * torch.atan(sensor_width * 0.5 / base_focal_length)
+    base_fov_y = 2.0 * torch.atan(sensor_height * 0.5 / base_focal_length)
+    # Calculate the FOV for each focal length in focal_length_values
+    target_fov_x = 2.0 * torch.atan(sensor_width * 0.5 / focal_length_values)
+    target_fov_y = 2.0 * torch.atan(sensor_height * 0.5 / focal_length_values)
+    # Calculate crop ratio: how much of the image is cropped at the current focal length
+    crop_ratio_xs = target_fov_x / base_fov_x  # Crop ratio for horizontal axis
+    crop_ratio_ys = target_fov_y / base_fov_y  # Crop ratio for vertical axis
+    # Get the center of the image
+    center_h, center_w = target_height // 2, target_width // 2
+    # Initialize a mask tensor with zeros on CPU
+    focal_length_embedding = torch.zeros((f, 3, target_height, target_width), dtype=torch.float32)  # Shape [f, 3, H, W]
+    # Fill the center region with 1 based on the calculated crop dimensions
+    for i in range(f):
+        # Crop dimensions calculated using rounded float values
+        crop_h = torch.round(crop_ratio_ys[i] * target_height).int().item()  # Rounded cropped height for the current frame
+       # print('crop_h', crop_h)
+        crop_w = torch.round(crop_ratio_xs[i] * target_width).int().item()  # Rounded cropped width for the current frame
+        # Ensure the cropped dimensions are within valid bounds
+        crop_h = max(1, min(target_height, crop_h))
+        crop_w = max(1, min(target_width, crop_w))
+        # Set the center region of the focal_length embedding to 1 for the current frame
+        focal_length_embedding[i, :,
+        center_h - crop_h // 2: center_h + crop_h // 2,
+        center_w - crop_w // 2: center_w + crop_w // 2] = 1.0
+    return focal_length_embedding
+class Camera_Embedding(Dataset):
+    def __init__(self, focal_length_values, tokenizer, text_encoder, device, sample_size=[256, 384]):
+        self.focal_length_values = focal_length_values.to(device)
+        self.tokenizer = tokenizer
+        self.text_encoder = text_encoder
+        self.device = device
+        self.sample_size = sample_size
+    def load(self):
+        if len(self.focal_length_values) != 5:
+            raise ValueError("Expected 5 focal_length values")
+        # Generate prompts for each focal length value and append focal_length information to caption
+        prompts = []
+        for fl in self.focal_length_values:
+            prompt = f"<focal length: {fl.item()}>"
+            prompts.append(prompt)
+        # Tokenize prompts and encode to get embeddings
+        with torch.no_grad():
+            prompt_ids = self.tokenizer(
+                prompts, max_length=self.tokenizer.model_max_length, padding="max_length", truncation=True, return_tensors="pt"
+            ).input_ids.to(self.device)
+            encoder_hidden_states = self.text_encoder(input_ids=prompt_ids).last_hidden_state  # Shape: (f, sequence_length, hidden_size)
+        # Calculate differences between consecutive embeddings (ignoring sequence_length)
+        differences = []
+        for i in range(1, encoder_hidden_states.size(0)):
+            diff = encoder_hidden_states[i] - encoder_hidden_states[i - 1]
+            diff = diff.unsqueeze(0)
+            differences.append(diff)
+        # Add the difference between the last and the first embedding
+        final_diff = encoder_hidden_states[-1] - encoder_hidden_states[0]
+        final_diff = final_diff.unsqueeze(0)
+        differences.append(final_diff)
+        # Concatenate differences along the batch dimension (f-1)
+        concatenated_differences = torch.cat(differences, dim=0)
+        frame = concatenated_differences.size(0)
+        concatenated_differences = torch.cat(differences, dim=0)
+        pad_length = 128 - concatenated_differences.size(1)
+        if pad_length > 0:
+        # Pad along the second dimension (77 -> 128), pad only on the right side
+            concatenated_differences_padded = F.pad(concatenated_differences, (0, 0, 0, pad_length))
+        ccl_embedding = concatenated_differences_padded.reshape(frame, self.sample_size[0], self.sample_size[1])
+        ccl_embedding = ccl_embedding.unsqueeze(1)
+        ccl_embedding = ccl_embedding.expand(-1, 3, -1, -1)
+        ccl_embedding = ccl_embedding.to(self.device)
+        focal_length_embedding = create_focal_length_embedding(self.focal_length_values, self.sample_size[0], self.sample_size[1]).to(self.device)
+        camera_embedding = torch.cat((focal_length_embedding, ccl_embedding), dim=1)
+        return camera_embedding
+#
+# def load_models(cfg):
+#
+#     device = "cuda" if torch.cuda.is_available() else "cpu"
+#
+#     noise_scheduler = DDIMScheduler(**OmegaConf.to_container(cfg.noise_scheduler_kwargs))
+#     vae = AutoencoderKL.from_pretrained(cfg.pretrained_model_path, subfolder="vae").to(device)
+#     vae.requires_grad_(False)
+#     tokenizer = CLIPTokenizer.from_pretrained(cfg.pretrained_model_path, subfolder="tokenizer")
+#     text_encoder = CLIPTextModel.from_pretrained(cfg.pretrained_model_path, subfolder="text_encoder").to(device)
+#     text_encoder.requires_grad_(False)
+#     unet = UNet3DConditionModelCameraCond.from_pretrained_2d(
+#         cfg.pretrained_model_path,
+#         subfolder=cfg.unet_subfolder,
+#         unet_additional_kwargs=cfg.unet_additional_kwargs
+#     ).to(device)
+#     unet.requires_grad_(False)
+#
+#     camera_encoder = CameraCameraEncoder(**cfg.camera_encoder_kwargs).to(device)
+#     camera_encoder.requires_grad_(False)
+#     camera_adaptor = CameraAdaptor(unet, camera_encoder)
+#     camera_adaptor.requires_grad_(False)
+#     camera_adaptor.to(device)
+#
+#     logger.info("Setting the attention processors")
+#     unet.set_all_attn_processor(
+#         add_spatial_lora=cfg.lora_ckpt is not None,
+#         add_motion_lora=cfg.motion_lora_rank > 0,
+#         lora_kwargs={"lora_rank": cfg.lora_rank, "lora_scale": cfg.lora_scale},
+#         motion_lora_kwargs={"lora_rank": cfg.motion_lora_rank, "lora_scale": cfg.motion_lora_scale},
+#         **cfg.attention_processor_kwargs
+#     )
+#
+#     if cfg.lora_ckpt is not None:
+#         print(f"Loading the lora checkpoint from {cfg.lora_ckpt}")
+#         lora_checkpoints = torch.load(cfg.lora_ckpt, map_location=unet.device)
+#         if 'lora_state_dict' in lora_checkpoints.keys():
+#             lora_checkpoints = lora_checkpoints['lora_state_dict']
+#         _, lora_u = unet.load_state_dict(lora_checkpoints, strict=False)
+#         assert len(lora_u) == 0
+#         print(f'Loading done')
+#
+#     if cfg.motion_module_ckpt is not None:
+#         print(f"Loading the motion module checkpoint from {cfg.motion_module_ckpt}")
+#         mm_checkpoints = torch.load(cfg.motion_module_ckpt, map_location=unet.device)
+#         _, mm_u = unet.load_state_dict(mm_checkpoints, strict=False)
+#         assert len(mm_u) == 0
+#         print("Loading done")
+#
+#     if cfg.camera_adaptor_ckpt is not None:
+#         logger.info(f"Loading camera adaptor from {cfg.camera_adaptor_ckpt}")
+#         camera_adaptor_checkpoint = torch.load(cfg.camera_adaptor_ckpt, map_location=device)
+#         camera_encoder_state_dict = camera_adaptor_checkpoint['camera_encoder_state_dict']
+#         attention_processor_state_dict = camera_adaptor_checkpoint['attention_processor_state_dict']
+#         camera_enc_m, camera_enc_u = camera_adaptor.camera_encoder.load_state_dict(camera_encoder_state_dict, strict=False)
+#
+#         assert len(camera_enc_m) == 0 and len(camera_enc_u) == 0
+#         _, attention_processor_u = camera_adaptor.unet.load_state_dict(attention_processor_state_dict, strict=False)
+#         assert len(attention_processor_u) == 0
+#
+#         logger.info("Camera Adaptor loading done")
+#     else:
+#         logger.info("No Camera Adaptor checkpoint used")
+#
+#     pipeline = GenPhotoPipeline(
+#         vae=vae,
+#         text_encoder=text_encoder,
+#         tokenizer=tokenizer,
+#         unet=unet,
+#         scheduler=noise_scheduler,
+#         camera_encoder=camera_encoder
+#     ).to(device)
+#     pipeline.enable_vae_slicing()
+#
+#     return pipeline, device
+def load_models(cfg):
+    device = "cuda" if torch.cuda.is_available() else "cpu"
+    pretrained_model_path = hf_hub_download("pandaphd/generative_photography", "stable-diffusion-v1-5/")
+    lora_ckpt_path = hf_hub_download("pandaphd/generative_photography", "weights/RealEstate10K_LoRA.ckpt")
+    motion_module_ckpt_path = hf_hub_download("pandaphd/generative_photography", "weights/v3_sd15_mm.ckpt")
+    camera_adaptor_ckpt_path = hf_hub_download("pandaphd/generative_photography", "weights/checkpoint-focal_length.ckpt")
+    noise_scheduler = DDIMScheduler(**OmegaConf.to_container(cfg.noise_scheduler_kwargs))
+    vae = AutoencoderKL.from_pretrained(pretrained_model_path, subfolder="vae").to(device)
+    vae.requires_grad_(False)
+    tokenizer = CLIPTokenizer.from_pretrained(pretrained_model_path, subfolder="tokenizer")
+    text_encoder = CLIPTextModel.from_pretrained(pretrained_model_path, subfolder="text_encoder").to(device)
+    text_encoder.requires_grad_(False)
+    unet = UNet3DConditionModelCameraCond.from_pretrained_2d(
+        pretrained_model_path,
+        subfolder=cfg.unet_subfolder,
+        unet_additional_kwargs=cfg.unet_additional_kwargs
+    ).to(device)
+    unet.requires_grad_(False)
+    camera_encoder = CameraCameraEncoder(**cfg.camera_encoder_kwargs).to(device)
+    camera_encoder.requires_grad_(False)
+    camera_adaptor = CameraAdaptor(unet, camera_encoder)
+    camera_adaptor.requires_grad_(False)
+    camera_adaptor.to(device)
+    unet.set_all_attn_processor(
+        add_spatial_lora=cfg.lora_ckpt is not None,
+        add_motion_lora=cfg.motion_lora_rank > 0,
+        lora_kwargs={"lora_rank": cfg.lora_rank, "lora_scale": cfg.lora_scale},
+        motion_lora_kwargs={"lora_rank": cfg.motion_lora_rank, "lora_scale": cfg.motion_lora_scale},
+        **cfg.attention_processor_kwargs
+    )
+    if cfg.lora_ckpt is not None:
+        lora_checkpoints = torch.load(lora_ckpt_path, map_location=unet.device)
+        if 'lora_state_dict' in lora_checkpoints.keys():
+            lora_checkpoints = lora_checkpoints['lora_state_dict']
+        _, lora_u = unet.load_state_dict(lora_checkpoints, strict=False)
+        assert len(lora_u) == 0
+    if cfg.motion_module_ckpt is not None:
+        mm_checkpoints = torch.load(motion_module_ckpt_path, map_location=unet.device)
+        _, mm_u = unet.load_state_dict(mm_checkpoints, strict=False)
+        assert len(mm_u) == 0
+    if cfg.camera_adaptor_ckpt is not None:
+        camera_adaptor_checkpoint = torch.load(camera_adaptor_ckpt_path, map_location=device)
+        camera_encoder_state_dict = camera_adaptor_checkpoint['camera_encoder_state_dict']
+        attention_processor_state_dict = camera_adaptor_checkpoint['attention_processor_state_dict']
+        camera_enc_m, camera_enc_u = camera_adaptor.camera_encoder.load_state_dict(camera_encoder_state_dict, strict=False)
+        assert len(camera_enc_m) == 0 and len(camera_enc_u) == 0
+        _, attention_processor_u = camera_adaptor.unet.load_state_dict(attention_processor_state_dict, strict=False)
+        assert len(attention_processor_u) == 0
+    pipeline = GenPhotoPipeline(
+        vae=vae,
+        text_encoder=text_encoder,
+        tokenizer=tokenizer,
+        unet=unet,
+        scheduler=noise_scheduler,
+        camera_encoder=camera_encoder
+    ).to(device)
+    pipeline.enable_vae_slicing()
+    return pipeline, device
+def run_inference(pipeline, tokenizer, text_encoder, base_scene, focal_length_list, device, video_length=5, height=256, width=384):
+    focal_length_values = json.loads(focal_length_list)
+    focal_length_values = torch.tensor(focal_length_values).unsqueeze(1)
+    # Ensure camera_embedding is on the correct device
+    camera_embedding = Camera_Embedding(focal_length_values, tokenizer, text_encoder, device).load()
+    camera_embedding = rearrange(camera_embedding.unsqueeze(0), "b f c h w -> b c f h w")
+    with torch.no_grad():
+        sample = pipeline(
+            prompt=base_scene,
+            camera_embedding=camera_embedding,
+            video_length=video_length,
+            height=height,
+            width=width,
+            num_inference_steps=25,
+            guidance_scale=8.0
+        ).videos[0].cpu()
+    temporal_video_path = tempfile.NamedTemporaryFile(suffix='.mp4').name
+    save_videos_grid(sample[None], temporal_video_path, rescale=False)
+    return temporal_video_path
+def main(config_path, base_scene, focal_length_list):
+    torch.manual_seed(42)
+    cfg = OmegaConf.load(config_path)
+    logger.info("Loading models...")
+    pipeline, device = load_models(cfg)
+    logger.info("Starting inference...")
+    video_path = run_inference(pipeline, pipeline.tokenizer, pipeline.text_encoder, base_scene, focal_length_list, device)
+    logger.info(f"Video saved to {video_path}")
+if __name__ == "__main__":
+    parser = argparse.ArgumentParser()
+    parser.add_argument("--config", type=str, required=True, help="Path to YAML configuration file")
+    parser.add_argument("--base_scene", type=str, required=True, help="invariant scene caption as JSON string")
+    parser.add_argument("--focal_length_list", type=str, required=True, help="focal_length values as JSON string")
+    args = parser.parse_args()
+    main(args.config, args.base_scene, args.focal_length_list)
+    # usage example
+    # python inference_focal_length.py --config configs/inference_genphoto/adv3_256_384_genphoto_relora_focal_length.yaml --base_scene "A cozy living room with a large, comfy sofa and a coffee table." --focal_length_list "[25.0, 35.0, 45.0, 55.0, 65.0]"

inference_shutter_speed.py CHANGED Viewed

@@ -1,3 +1,322 @@
-version https://git-lfs.github.com/spec/v1
-oid sha256:12eb2507454a07a5e565233b738991782d191e932470176783be93773fb0f209
-size 13888

+import tempfile
+import imageio
+import os
+import torch
+import logging
+import argparse
+import json
+import numpy as np
+import torch.nn.functional as F
+from pathlib import Path
+from omegaconf import OmegaConf
+from torch.utils.data import Dataset
+from transformers import CLIPTextModel, CLIPTokenizer
+from diffusers import AutoencoderKL, DDIMScheduler
+from einops import rearrange
+from genphoto.pipelines.pipeline_animation import GenPhotoPipeline
+from genphoto.models.unet import UNet3DConditionModelCameraCond
+from genphoto.models.camera_adaptor import CameraCameraEncoder, CameraAdaptor
+from genphoto.utils.util import save_videos_grid
+logging.basicConfig(level=logging.INFO)
+logger = logging.getLogger(__name__)
+from huggingface_hub import hf_hub_download
+def create_shutter_speed_embedding(shutter_speed_values, target_height, target_width, base_exposure=0.5):
+    """
+    Create a shutter_speed (Exposure Value or shutter speed) embedding tensor using a constant fwc value.
+    Args:
+    - shutter_speed_values: Tensor of shape [f, 1] containing shutter_speed values for each frame.
+    - H: Height of the image.
+    - W: Width of the image.
+    - base_exposure: A base exposure value to normalize brightness (defaults to 0.18 as a common base exposure level).
+    Returns:
+    - shutter_speed_embedding: Tensor of shape [f, 1, H, W] where each pixel is scaled based on the shutter_speed values.
+    """
+    f = shutter_speed_values.shape[0]
+    # Set a constant full well capacity (fwc)
+    fwc = 32000  # Constant value for full well capacity
+    # Calculate scale based on EV and sensor full well capacity (fwc)
+    scales = (shutter_speed_values / base_exposure) * (fwc / (fwc + 0.0001))
+    # Reshape and expand to match image dimensions
+    scales = scales.unsqueeze(2).unsqueeze(3).expand(f, 3, target_height, target_width)
+    # Use scales to create the final shutter_speed embedding
+    shutter_speed_embedding = scales      # Shape [f, 3, H, W]
+    return shutter_speed_embedding
+class Camera_Embedding(Dataset):
+    def __init__(self, shutter_speed_values, tokenizer, text_encoder, device, sample_size=[256, 384]):
+        self.shutter_speed_values = shutter_speed_values.to(device)
+        self.tokenizer = tokenizer
+        self.text_encoder = text_encoder
+        self.device = device
+        self.sample_size = sample_size
+    def load(self):
+        if len(self.shutter_speed_values) != 5:
+            raise ValueError("Expected 5 shutter_speed values")
+        # Generate prompts for each shutter_speed value and append shutter_speed information to caption
+        prompts = []
+        for ss in self.shutter_speed_values:
+            prompt = f"<exposure: {ss.item()}>"
+            prompts.append(prompt)
+        # Tokenize prompts and encode to get embeddings
+        with torch.no_grad():
+            prompt_ids = self.tokenizer(
+                prompts, max_length=self.tokenizer.model_max_length, padding="max_length", truncation=True, return_tensors="pt"
+            ).input_ids.to(self.device)
+            encoder_hidden_states = self.text_encoder(input_ids=prompt_ids).last_hidden_state  # Shape: (f, sequence_length, hidden_size)
+        # Calculate differences between consecutive embeddings (ignoring sequence_length)
+        differences = []
+        for i in range(1, encoder_hidden_states.size(0)):
+            diff = encoder_hidden_states[i] - encoder_hidden_states[i - 1]
+            diff = diff.unsqueeze(0)
+            differences.append(diff)
+        # Add the difference between the last and the first embedding
+        final_diff = encoder_hidden_states[-1] - encoder_hidden_states[0]
+        final_diff = final_diff.unsqueeze(0)
+        differences.append(final_diff)
+        # Concatenate differences along the batch dimension (f-1)
+        concatenated_differences = torch.cat(differences, dim=0)
+        frame = concatenated_differences.size(0)
+        concatenated_differences = torch.cat(differences, dim=0)
+        pad_length = 128 - concatenated_differences.size(1)
+        print('pad_length', pad_length)
+        if pad_length > 0:
+            concatenated_differences_padded = F.pad(concatenated_differences, (0, 0, 0, pad_length))
+        ccl_embedding = concatenated_differences_padded.reshape(frame, self.sample_size[0], self.sample_size[1])
+        ccl_embedding = ccl_embedding.unsqueeze(1)
+        ccl_embedding = ccl_embedding.expand(-1, 3, -1, -1)
+        ccl_embedding = ccl_embedding.to(self.device)
+        shutter_speed_embedding = create_shutter_speed_embedding(self.shutter_speed_values, self.sample_size[0], self.sample_size[1]).to(self.device)
+        camera_embedding = torch.cat((shutter_speed_embedding, ccl_embedding), dim=1)
+        return camera_embedding
+# def load_models(cfg):
+#
+#     device = "cuda" if torch.cuda.is_available() else "cpu"
+#
+#     noise_scheduler = DDIMScheduler(**OmegaConf.to_container(cfg.noise_scheduler_kwargs))
+#     vae = AutoencoderKL.from_pretrained(cfg.pretrained_model_path, subfolder="vae").to(device)
+#     vae.requires_grad_(False)
+#     tokenizer = CLIPTokenizer.from_pretrained(cfg.pretrained_model_path, subfolder="tokenizer")
+#     text_encoder = CLIPTextModel.from_pretrained(cfg.pretrained_model_path, subfolder="text_encoder").to(device)
+#     text_encoder.requires_grad_(False)
+#
+#     unet = UNet3DConditionModelCameraCond.from_pretrained_2d(
+#         cfg.pretrained_model_path,
+#         subfolder=cfg.unet_subfolder,
+#         unet_additional_kwargs=cfg.unet_additional_kwargs
+#     ).to(device)
+#     unet.requires_grad_(False)
+#
+#
+#     camera_encoder = CameraCameraEncoder(**cfg.camera_encoder_kwargs).to(device)
+#     camera_encoder.requires_grad_(False)
+#     camera_adaptor = CameraAdaptor(unet, camera_encoder)
+#     camera_adaptor.requires_grad_(False)
+#     camera_adaptor.to(device)
+#
+#     logger.info("Setting the attention processors")
+#     unet.set_all_attn_processor(
+#         add_spatial_lora=cfg.lora_ckpt is not None,
+#         add_motion_lora=cfg.motion_lora_rank > 0,
+#         lora_kwargs={"lora_rank": cfg.lora_rank, "lora_scale": cfg.lora_scale},
+#         motion_lora_kwargs={"lora_rank": cfg.motion_lora_rank, "lora_scale": cfg.motion_lora_scale},
+#         **cfg.attention_processor_kwargs
+#     )
+#
+#     if cfg.lora_ckpt is not None:
+#         print(f"Loading the lora checkpoint from {cfg.lora_ckpt}")
+#         lora_checkpoints = torch.load(cfg.lora_ckpt, map_location=unet.device)
+#         if 'lora_state_dict' in lora_checkpoints.keys():
+#             lora_checkpoints = lora_checkpoints['lora_state_dict']
+#         _, lora_u = unet.load_state_dict(lora_checkpoints, strict=False)
+#         assert len(lora_u) == 0
+#         print(f'Loading done')
+#
+#     if cfg.motion_module_ckpt is not None:
+#         print(f"Loading the motion module checkpoint from {cfg.motion_module_ckpt}")
+#         mm_checkpoints = torch.load(cfg.motion_module_ckpt, map_location=unet.device)
+#         _, mm_u = unet.load_state_dict(mm_checkpoints, strict=False)
+#         assert len(mm_u) == 0
+#         print("Loading done")
+#
+#
+#     if cfg.camera_adaptor_ckpt is not None:
+#         logger.info(f"Loading camera adaptor from {cfg.camera_adaptor_ckpt}")
+#         camera_adaptor_checkpoint = torch.load(cfg.camera_adaptor_ckpt, map_location=device)
+#
+#         camera_encoder_state_dict = camera_adaptor_checkpoint['camera_encoder_state_dict']
+#         attention_processor_state_dict = camera_adaptor_checkpoint['attention_processor_state_dict']
+#
+#         camera_enc_m, camera_enc_u = camera_adaptor.camera_encoder.load_state_dict(camera_encoder_state_dict, strict=False)
+#
+#         assert len(camera_enc_m) == 0 and len(camera_enc_u) == 0
+#         _, attention_processor_u = camera_adaptor.unet.load_state_dict(attention_processor_state_dict, strict=False)
+#         assert len(attention_processor_u) == 0
+#
+#         logger.info("Camera Adaptor loading done")
+#     else:
+#         logger.info("No Camera Adaptor checkpoint used")
+#
+#     pipeline = GenPhotoPipeline(
+#         vae=vae,
+#         text_encoder=text_encoder,
+#         tokenizer=tokenizer,
+#         unet=unet,
+#         scheduler=noise_scheduler,
+#         camera_encoder=camera_encoder
+#     ).to(device)
+#     pipeline.enable_vae_slicing()
+#
+#     return pipeline, device
+def load_models(cfg):
+    device = "cuda" if torch.cuda.is_available() else "cpu"
+    pretrained_model_path = hf_hub_download("pandaphd/generative_photography", "stable-diffusion-v1-5/")
+    lora_ckpt_path = hf_hub_download("pandaphd/generative_photography", "weights/RealEstate10K_LoRA.ckpt")
+    motion_module_ckpt_path = hf_hub_download("pandaphd/generative_photography", "weights/v3_sd15_mm.ckpt")
+    camera_adaptor_ckpt_path = hf_hub_download("pandaphd/generative_photography", "weights/checkpoint-shutter_speed.ckpt")
+    noise_scheduler = DDIMScheduler(**OmegaConf.to_container(cfg.noise_scheduler_kwargs))
+    vae = AutoencoderKL.from_pretrained(pretrained_model_path, subfolder="vae").to(device)
+    vae.requires_grad_(False)
+    tokenizer = CLIPTokenizer.from_pretrained(pretrained_model_path, subfolder="tokenizer")
+    text_encoder = CLIPTextModel.from_pretrained(pretrained_model_path, subfolder="text_encoder").to(device)
+    text_encoder.requires_grad_(False)
+    unet = UNet3DConditionModelCameraCond.from_pretrained_2d(
+        pretrained_model_path,
+        subfolder=cfg.unet_subfolder,
+        unet_additional_kwargs=cfg.unet_additional_kwargs
+    ).to(device)
+    unet.requires_grad_(False)
+    camera_encoder = CameraCameraEncoder(**cfg.camera_encoder_kwargs).to(device)
+    camera_encoder.requires_grad_(False)
+    camera_adaptor = CameraAdaptor(unet, camera_encoder)
+    camera_adaptor.requires_grad_(False)
+    camera_adaptor.to(device)
+    unet.set_all_attn_processor(
+        add_spatial_lora=cfg.lora_ckpt is not None,
+        add_motion_lora=cfg.motion_lora_rank > 0,
+        lora_kwargs={"lora_rank": cfg.lora_rank, "lora_scale": cfg.lora_scale},
+        motion_lora_kwargs={"lora_rank": cfg.motion_lora_rank, "lora_scale": cfg.motion_lora_scale},
+        **cfg.attention_processor_kwargs
+    )
+    if cfg.lora_ckpt is not None:
+        lora_checkpoints = torch.load(lora_ckpt_path, map_location=unet.device)
+        if 'lora_state_dict' in lora_checkpoints.keys():
+            lora_checkpoints = lora_checkpoints['lora_state_dict']
+        _, lora_u = unet.load_state_dict(lora_checkpoints, strict=False)
+        assert len(lora_u) == 0
+    if cfg.motion_module_ckpt is not None:
+        mm_checkpoints = torch.load(motion_module_ckpt_path, map_location=unet.device)
+        _, mm_u = unet.load_state_dict(mm_checkpoints, strict=False)
+        assert len(mm_u) == 0
+    if cfg.camera_adaptor_ckpt is not None:
+        camera_adaptor_checkpoint = torch.load(camera_adaptor_ckpt_path, map_location=device)
+        camera_encoder_state_dict = camera_adaptor_checkpoint['camera_encoder_state_dict']
+        attention_processor_state_dict = camera_adaptor_checkpoint['attention_processor_state_dict']
+        camera_enc_m, camera_enc_u = camera_adaptor.camera_encoder.load_state_dict(camera_encoder_state_dict, strict=False)
+        assert len(camera_enc_m) == 0 and len(camera_enc_u) == 0
+        _, attention_processor_u = camera_adaptor.unet.load_state_dict(attention_processor_state_dict, strict=False)
+        assert len(attention_processor_u) == 0
+    pipeline = GenPhotoPipeline(
+        vae=vae,
+        text_encoder=text_encoder,
+        tokenizer=tokenizer,
+        unet=unet,
+        scheduler=noise_scheduler,
+        camera_encoder=camera_encoder
+    ).to(device)
+    pipeline.enable_vae_slicing()
+    return pipeline, device
+def run_inference(pipeline, tokenizer, text_encoder, base_scene, shutter_speed_list, device, video_length=5, height=256, width=384):
+    shutter_speed_values = json.loads(shutter_speed_list)
+    shutter_speed_values = torch.tensor(shutter_speed_values).unsqueeze(1)
+    # Ensure camera_embedding is on the correct device
+    camera_embedding = Camera_Embedding(shutter_speed_values, tokenizer, text_encoder, device).load()
+    camera_embedding = rearrange(camera_embedding.unsqueeze(0), "b f c h w -> b c f h w")
+    with torch.no_grad():
+        sample = pipeline(
+            prompt=base_scene,
+            camera_embedding=camera_embedding,
+            video_length=video_length,
+            height=height,
+            width=width,
+            num_inference_steps=25,
+            guidance_scale=8.0
+        ).videos[0].cpu()
+    temporal_video_path = tempfile.NamedTemporaryFile(suffix='.mp4').name
+    save_videos_grid(sample[None], temporal_video_path, rescale=False)
+    return temporal_video_path
+def main(config_path, base_scene, shutter_speed_list):
+    torch.manual_seed(42)
+    cfg = OmegaConf.load(config_path)
+    logger.info("Loading models...")
+    pipeline, device = load_models(cfg)
+    logger.info("Starting inference...")
+    video_path =  run_inference(pipeline, pipeline.tokenizer, pipeline.text_encoder, base_scene, shutter_speed_list, device)
+    logger.info(f"Video saved to {video_path}")
+if __name__ == "__main__":
+    parser = argparse.ArgumentParser()
+    parser.add_argument("--config", type=str, required=True, help="Path to YAML configuration file")
+    parser.add_argument("--base_scene", type=str, required=True, help="invariant scene caption as JSON string")
+    parser.add_argument("--shutter_speed_list", type=str, required=True, help="shutter_speed values as JSON string")
+    args = parser.parse_args()
+    main(args.config, args.base_scene, args.shutter_speed_list)
+    # usage example
+    # python inference_shutter_speed.py --config configs/inference_genphoto/adv3_256_384_genphoto_relora_shutter_speed.yaml --base_scene "A modern bathroom with a mirror and soft lighting." --shutter_speed_list "[0.1, 0.3, 0.52, 0.7, 0.8]"

requirements.txt CHANGED Viewed

@@ -1,3 +1,19 @@
-version https://git-lfs.github.com/spec/v1
-oid sha256:1766bd0739223e95b2fde76b862d853da41c15b0d97273e7e90f4cd4a4d77a60
-size 290

+--extra-index-url https://download.pytorch.org/whl/cu121
+torch==2.1.1
+torchvision==0.16.1
+torchaudio==2.1.1
+diffusers==0.24.0
+imageio==2.36.0
+imageio-ffmpeg
+transformers
+accelerate
+opencv-python
+gdown
+einops
+decord
+omegaconf
+safetensors
+gradio
+wandb
+triton
+huggingface_hub