aleafy committed
Commit 0a63786 · 0 Parent(s)

Start fresh

This view is limited to 50 files because it contains too many changes.
Files changed (50)
  1. .gitattributes +35 -0
  2. .gitignore +5 -0
  3. README.md +141 -0
  4. __pycache__/db_examples.cpython-310.pyc +0 -0
  5. __pycache__/demo_utils1.cpython-310.pyc +0 -0
  6. app.py +365 -0
  7. configs/instruct_v2v.yaml +149 -0
  8. configs/instruct_v2v_ic.yaml +130 -0
  9. configs/instruct_v2v_ic_gradio.yaml +81 -0
  10. configs/instruct_v2v_ic_inference.yaml +79 -0
  11. configs/instruct_v2v_ic_inference_hdr.yaml +80 -0
  12. configs/instruct_v2v_ic_inference_text.yaml +79 -0
  13. configs/instruct_v2v_ic_pexels.yaml +133 -0
  14. configs/instruct_v2v_ic_pexels_hdr.yaml +147 -0
  15. configs/instruct_v2v_ic_pexels_text.yaml +137 -0
  16. configs/instruct_v2v_ic_pexels_text_hdr.yaml +137 -0
  17. configs/instruct_v2v_ic_test.yaml +132 -0
  18. configs/instruct_v2v_inference.yaml +98 -0
  19. configs/instruct_v2v_ori.yaml +147 -0
  20. configs/test_textmodel.yaml +7 -0
  21. configs/test_vae.yaml +21 -0
  22. configs/test_vae_ori.yaml +28 -0
  23. configs/tmp_ic.yaml +130 -0
  24. db_examples.py +133 -0
  25. demo/clean_bg_extracted/10/cropped_video.mp4 +0 -0
  26. demo/clean_bg_extracted/10/frames/0000.png +0 -0
  27. demo/clean_bg_extracted/14/cropped_video.mp4 +0 -0
  28. demo/clean_bg_extracted/14/frames/0000.png +0 -0
  29. demo/clean_bg_extracted/22/cropped_video.mp4 +0 -0
  30. demo/clean_bg_extracted/22/frames/0000.png +0 -0
  31. demo/clean_bg_extracted/23/cropped_video.mp4 +0 -0
  32. demo/clean_bg_extracted/23/frames/0000.png +0 -0
  33. demo/clean_bg_extracted/27/cropped_video.mp4 +0 -0
  34. demo/clean_bg_extracted/27/frames/0000.png +0 -0
  35. demo/clean_bg_extracted/33/cropped_video.mp4 +0 -0
  36. demo/clean_bg_extracted/33/frames/0000.png +0 -0
  37. demo/clean_bg_extracted/39/cropped_video.mp4 +0 -0
  38. demo/clean_bg_extracted/39/frames/0000.png +0 -0
  39. demo/clean_bg_extracted/47/frames/0000.png +0 -0
  40. demo/clean_bg_extracted/55/cropped_video.mp4 +0 -0
  41. demo/clean_bg_extracted/55/frames/0000.png +0 -0
  42. demo/clean_bg_extracted/57/frames/0000.png +0 -0
  43. demo/clean_bg_extracted/58/frames/0000.png +0 -0
  44. demo/clean_bg_extracted/59/cropped_video.mp4 +0 -0
  45. demo/clean_bg_extracted/59/frames/0000.png +0 -0
  46. demo/clean_bg_extracted/62/frames/0000.png +0 -0
  47. demo/clean_bg_extracted/8/cropped_video.mp4 +0 -0
  48. demo/clean_bg_extracted/8/frames/0000.png +0 -0
  49. demo/clean_bg_extracted/9/cropped_video.mp4 +0 -0
  50. demo/clean_bg_extracted/9/frames/0000.png +0 -0
.gitattributes ADDED
@@ -0,0 +1,35 @@
1
+ *.7z filter=lfs diff=lfs merge=lfs -text
2
+ *.arrow filter=lfs diff=lfs merge=lfs -text
3
+ *.bin filter=lfs diff=lfs merge=lfs -text
4
+ *.bz2 filter=lfs diff=lfs merge=lfs -text
5
+ *.ckpt filter=lfs diff=lfs merge=lfs -text
6
+ *.ftz filter=lfs diff=lfs merge=lfs -text
7
+ *.gz filter=lfs diff=lfs merge=lfs -text
8
+ *.h5 filter=lfs diff=lfs merge=lfs -text
9
+ *.joblib filter=lfs diff=lfs merge=lfs -text
10
+ *.lfs.* filter=lfs diff=lfs merge=lfs -text
11
+ *.mlmodel filter=lfs diff=lfs merge=lfs -text
12
+ *.model filter=lfs diff=lfs merge=lfs -text
13
+ *.msgpack filter=lfs diff=lfs merge=lfs -text
14
+ *.npy filter=lfs diff=lfs merge=lfs -text
15
+ *.npz filter=lfs diff=lfs merge=lfs -text
16
+ *.onnx filter=lfs diff=lfs merge=lfs -text
17
+ *.ot filter=lfs diff=lfs merge=lfs -text
18
+ *.parquet filter=lfs diff=lfs merge=lfs -text
19
+ *.pb filter=lfs diff=lfs merge=lfs -text
20
+ *.pickle filter=lfs diff=lfs merge=lfs -text
21
+ *.pkl filter=lfs diff=lfs merge=lfs -text
22
+ *.pt filter=lfs diff=lfs merge=lfs -text
23
+ *.pth filter=lfs diff=lfs merge=lfs -text
24
+ *.rar filter=lfs diff=lfs merge=lfs -text
25
+ *.safetensors filter=lfs diff=lfs merge=lfs -text
26
+ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
27
+ *.tar.* filter=lfs diff=lfs merge=lfs -text
28
+ *.tar filter=lfs diff=lfs merge=lfs -text
29
+ *.tflite filter=lfs diff=lfs merge=lfs -text
30
+ *.tgz filter=lfs diff=lfs merge=lfs -text
31
+ *.wasm filter=lfs diff=lfs merge=lfs -text
32
+ *.xz filter=lfs diff=lfs merge=lfs -text
33
+ *.zip filter=lfs diff=lfs merge=lfs -text
34
+ *.zst filter=lfs diff=lfs merge=lfs -text
35
+ *tfevents* filter=lfs diff=lfs merge=lfs -text
.gitignore ADDED
@@ -0,0 +1,5 @@
1
+ app1.py
2
+ app2.py
3
+ demo_utils1.py
4
+ tmp
5
+ models
README.md ADDED
@@ -0,0 +1,141 @@
1
+ ---
2
+ title: "RelightVid"
3
+ emoji: "💡"
4
+ colorFrom: "blue"
5
+ colorTo: "green"
6
+ sdk: "gradio" # 你的项目使用的 SDK (gradio / streamlit / docker)
7
+ app_file: "app.py" # main application file
8
+ ---
9
+
10
+
11
+ <!-- # <img src="assets/icon.png" style="vertical-align: -14px;" :height="50px" width="50px"> RelightVid -->
12
+ # RelightVid
13
+
14
+ **[RelightVid: Temporal-Consistent Diffusion Model for Video Relighting](https://arxiv.org/abs/2501.16330)**
15
+ </br>
16
+ [Ye Fang](https://github.com/Aleafy)\*,
17
+ [Zeyi Sun](https://github.com/SunzeY)\*,
18
+ [Shangzhan Zhang](https://zhanghe3z.github.io/),
19
+ [Tong Wu](https://wutong16.github.io/),
20
+ [Yinghao Xu](https://justimyhxu.github.io/),
21
+ [Pan Zhang](https://panzhang0212.github.io/),
22
+ [Jiaqi Wang](https://myownskyw7.github.io/),
23
+ [Gordon Wetzstein](https://web.stanford.edu/~gordonwz/),
24
+ [Dahua Lin](http://dahua.site/)
25
+
26
+ <p style="font-size: 0.6em; margin-top: -1em">*Equal Contribution</p>
27
+ <p align="center">
28
+ <a href="https://arxiv.org/abs/2501.16330"><img src="https://img.shields.io/badge/arXiv-Paper-<color>"></a>
29
+ <a href="https://sunzey.github.io/Make-it-Real"><img src="https://img.shields.io/badge/Project-Website-red"></a>
30
+ <a href="https://www.youtube.com/watch?v=_j-t8592GCM"><img src="https://img.shields.io/static/v1?label=Demo&message=Video&color=orange"></a>
31
+ <a href="" target='_blank'>
32
+ <img src="https://visitor-badge.laobi.icu/badge?page_id=Aleafy.RelightVid&left_color=gray&right_color=blue">
33
+ </a>
34
+ </p>
35
+
36
+
37
+ ![Demo](./assets/demo.gif)
38
+
39
+
40
+ ## 📜 News
41
+ 🚀 [2024/6/8] We release our [inference pipeline of Make-it-Real](#⚡-quick-start), including material matching and generation of albedo-only 3D objects.
42
+
43
+ 🚀 [2024/6/8] [Material library annotations](#📦-data-preparation) generated by GPT-4V and [data engine](#⚡-quick-start) are released!
44
+
45
+ 🚀 [2024/4/26] The [paper](https://arxiv.org/abs/2404.16829) and [project page](https://sunzey.github.io/Make-it-Real) are released!
46
+
47
+ ## 💡 Highlights
48
+ - 🔥 We first demonstrate that **GPT-4V** can effectively **recognize and describe materials**, allowing our model to precisely identify and align materials with the corresponding components of 3D objects.
49
+ - 🔥 We construct a **Material Library** containing thousands of materials with highly
50
+ detailed descriptions, ready for MLLMs to look up and assign.
51
+ - 🔥 **An effective pipeline** for texture segmentation, material identification and matching, enabling the high-quality application of materials to
52
+ 3D assets.
53
+
54
+ ## 👨‍💻 Todo
55
+ - [ ] Evaluation for Existing and Model-Generated Assets (both code & test assets)
56
+ - [ ] More Interactive Demos (huggingface, jupyter)
57
+ - [x] Make-it-Real Pipeline Inference Code
58
+ - [x] Highly detailed Material Library annotations (generated by GPT-4V)
59
+ - [x] Paper and Web Demos
60
+
61
+ ## 💾 Installation
62
+
63
+ ```bash
64
+ git clone https://github.com/Aleafy/RelightVid.git
65
+ cd RelightVid
66
+
67
+ conda create -n relitv python=3.10
68
+ conda activate relitv
69
+
70
+ pip install torch==2.1.2 torchvision==0.16.2 --index-url https://download.pytorch.org/whl/cu118
71
+ pip install -r requirements.txt
72
+ ```
73
+
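A quick sanity check after installation (a minimal sketch; it only verifies that the CUDA 11.8 build of PyTorch is visible to Python):

```python
import torch

print(torch.__version__)           # expect something like 2.1.2+cu118
print(torch.cuda.is_available())   # should print True on a CUDA-capable machine
```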
74
+
75
+
76
+
77
+
78
+ ## 📦 Data Preparation
79
+ 1. **Annotations**: located in the `data/material_lib/annotations` [folder](data/material_lib/annotations); they include:
80
+ - Highly-detailed descriptions by GPT-4V: offering thorough descriptions of the material’s visual characteristics and rich semantic information.
81
+ - Category-tree: Divided into a hierarchical structure with coarse and fine granularity, it includes over 80 subcategories.
82
+ 2. **PBR Maps**: You can download the complete PBR data collection at [Huggingface](https://huggingface.co/datasets/gvecchio/MatSynth/tree/main), or download the data used in our project at [OpenXLab](https://openxlab.org.cn/datasets/YeFang/MatSynth/tree/main) (Recommended). (If you have any questions, please refer to [issue#5](https://github.com/Aleafy/Make_it_Real/issues/5))
83
+ 3. **Material Images (optional)**: You can download the material images file [here](https://drive.google.com/file/d/1ob7CV6JiaqFyjuCzlmSnBuNRkzt2qMSG/view?usp=sharing) to check and visualize the material appearance.
84
+
85
+ <pre>
86
+ Make_it_Real
87
+ └── data
88
+ └── material_lib
89
+ ├── annotations
90
+ ├── mat_images
91
+ └── pbr_maps
92
+ └── train
93
+ ├── Ceremic
94
+ ├── Concrete
95
+ ├── ...
96
+ └── Wood
97
+ </pre>
98
+
99
+
100
+
101
+ ## ⚡ Quick Start
102
+ #### Inference
103
+ ```bash
104
+ python main.py --obj_dir <object_dir> --exp_name <unique_exp_name> --api_key <your_own_gpt4_api_key>
105
+ ```
106
+ - To ensure proper network connectivity for GPT-4V, add proxy environment settings in [main.py](https://github.com/Aleafy/Make_it_Real/blob/feb3563d57fbe18abbff8d4abfb48f71cc8f967b/main.py#L18) (optional; see the sketch after this list). Also, please verify the reachability of your [API host](https://github.com/Aleafy/Make_it_Real/blob/feb3563d57fbe18abbff8d4abfb48f71cc8f967b/utils/gpt4_query.py#L68).
107
+ - Result visualization (Blender engine) is located in the `output/refine_output` dir. You can compare the result with that in `output/ori_output`.
108
+
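A minimal sketch of the proxy settings mentioned above, as they might be added near the top of main.py; the proxy address is a placeholder, not part of the repository:

```python
import os

# Placeholder proxy endpoint; replace with your own if a proxy is required.
os.environ["http_proxy"] = "http://127.0.0.1:7890"
os.environ["https_proxy"] = "http://127.0.0.1:7890"
```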
109
+ #### Annotation Engine
110
+
111
+ ```bash
112
+ cd scripts/gpt_anno
113
+ python gpt4_query_mat.py
114
+ ```
115
+ `Note`: Besides functioning as the annotation engine, this script ([gpt4_query_mat.py](https://github.com/Aleafy/Make_it_Real/blob/main/scripts/gpt_anno/gpt4_query_mat.py)) can also be used to quickly test the GPT-4V connection.
116
+
117
+ <!-- [annotation code](scripts/gpt_anno) -->
118
+ <!-- #### Evalutation -->
119
+
120
+
121
+
122
+ ## ❤️ Acknowledgments
123
+ - [MatSynth](https://huggingface.co/datasets/gvecchio/MatSynth/tree/main): a Physically Based Rendering (PBR) materials dataset, which offers extensive high-resolution tileable PBR maps to look up.
124
+ - [TEXTure](https://github.com/TEXTurePaper/TEXTurePaper): Wonderful text-guided texture generation model, and the codebase we built upon.
125
+ - [SoM](https://som-gpt4v.github.io/): Draws visual cues on images to facilitate better GPT-4V queries.
126
+ - [Material Palette](https://github.com/astra-vision/MaterialPalette): Excellent exploration of material extraction and generation; offers good insights and a comparable setting.
127
+
128
+ ## ✒️ Citation
129
+ If you find our work helpful for your research, please consider giving a star ⭐ and citation 📝
130
+ ```bibtex
131
+ @misc{fang2024makeitreal,
132
+ title={Make-it-Real: Unleashing Large Multimodal Model for Painting 3D Objects with Realistic Materials},
133
+ author={Ye Fang and Zeyi Sun and Tong Wu and Jiaqi Wang and Ziwei Liu and Gordon Wetzstein and Dahua Lin},
134
+ year={2024},
135
+ eprint={2404.16829},
136
+ archivePrefix={arXiv},
137
+ primaryClass={cs.CV}
138
+ }
139
+ ```
140
+
141
+
__pycache__/db_examples.cpython-310.pyc ADDED
Binary file (2.22 kB).
 
__pycache__/demo_utils1.cpython-310.pyc ADDED
Binary file (278 Bytes).
 
app.py ADDED
@@ -0,0 +1,365 @@
1
+ import os
2
+ import gradio as gr
3
+ import numpy as np
4
+ from enum import Enum
5
+ import db_examples
6
+ import cv2
7
+
8
+ # from demo_utils1 import *
9
+
10
+ from misc_utils.train_utils import unit_test_create_model
11
+ from misc_utils.image_utils import save_tensor_to_gif, save_tensor_to_images
12
+ import os
13
+ from PIL import Image
14
+ import torch
15
+ import torchvision
16
+ from torchvision import transforms
17
+ from einops import rearrange
18
+ import imageio
19
+ import time
20
+
21
+ from torchvision.transforms import functional as F
22
+
23
+ import os
24
+
25
+ # Inference setup
+ from pl_trainer.inference.inference import InferenceIP2PVideo
+ from tqdm import tqdm
+
+ # Download the model weights
30
+ os.makedirs('models', exist_ok=True)
31
+ filename = "models/iclight_sd15_fbc.safetensors"
32
+
33
+ # if not os.path.exists(filename):
34
+ # original_path = os.getcwd()
35
+ # base_path = './models'
36
+ # os.makedirs(base_path, exist_ok=True)
37
+
38
+ # # Token written directly in the code (note the security risk)
39
+ # GIT_TOKEN = "955b8ea91095840b76fe38b90a088c200d4c813c"
40
+ # repo_url = f"https://YeFang:{GIT_TOKEN}@code.openxlab.org.cn/YeFang/RIV_models.git"
41
+
42
+ # try:
43
+ # if os.system(f'git clone {repo_url} {base_path}') != 0:
44
+ # raise RuntimeError("Git 克隆失败")
45
+ # os.chdir(base_path)
46
+ # if os.system('git lfs pull') != 0:
47
+ # raise RuntimeError("Git LFS 拉取失败")
48
+ # finally:
49
+ # os.chdir(original_path)
50
+
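# A minimal alternative sketch for fetching the IC-Light weights with huggingface_hub
# instead of the git-clone block above; the repo id "lllyasviel/ic-light" is an
# assumption and is not part of this commit:
# from huggingface_hub import hf_hub_download
# if not os.path.exists(filename):
#     hf_hub_download(repo_id="lllyasviel/ic-light",
#                     filename="iclight_sd15_fbc.safetensors",
#                     local_dir="models")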
51
+ def tensor_to_pil_image(x):
52
+ """
53
+ Convert a 4D PyTorch tensor into a PIL image.
+ """
+ x = x.float() # make sure the tensor is float
+ grid_img = torchvision.utils.make_grid(x, nrow=4).permute(1, 2, 0).detach().cpu().numpy()
+ grid_img = (grid_img * 255).clip(0, 255).astype("uint8") # convert the [0, 1] range to [0, 255]
58
+ return Image.fromarray(grid_img)
59
+
60
+ def frame_to_batch(x):
61
+ """
62
+ Fold the frame dimension into the batch dimension.
63
+ """
64
+ return rearrange(x, 'b f c h w -> (b f) c h w')
65
+
66
+ def clip_image(x, min=0., max=1.):
67
+ """
68
+ Clamp an image tensor to the given minimum and maximum values.
69
+ """
70
+ return torch.clamp(x, min=min, max=max)
71
+
72
+ def unnormalize(x):
73
+ """
74
+ Map a tensor from the [-1, 1] range to [0, 1].
75
+ """
76
+ return (x + 1) / 2
77
+
78
+
79
+ # Read image files
80
+ def read_images_from_directory(directory, num_frames=16):
81
+ images = []
82
+ for i in range(num_frames):
83
+ img_path = os.path.join(directory, f'{i:04d}.png')
84
+ img = imageio.imread(img_path)
85
+ images.append(torch.tensor(img).permute(2, 0, 1)) # Convert to Tensor (C, H, W)
86
+ return images
87
+
88
+ def load_and_process_images(folder_path):
89
+ """
90
+ Read all images in a folder, convert them to tensors in the [-1, 1] range, and return a 4D tensor.
91
+ """
92
+ processed_images = []
93
+ transform = transforms.Compose([
94
+ transforms.ToTensor(),
95
+ transforms.Lambda(lambda x: x * 2 - 1) # map [0, 1] to [-1, 1]
96
+ ])
97
+ for filename in sorted(os.listdir(folder_path)):
98
+ if filename.endswith(".png"):
99
+ img_path = os.path.join(folder_path, filename)
100
+ image = Image.open(img_path).convert("RGB")
101
+ processed_image = transform(image)
102
+ processed_images.append(processed_image)
103
+ return torch.stack(processed_images) # return a 4D tensor
104
+
105
+ def load_and_process_video(video_path, num_frames=16, crop_size=512):
106
+ """
107
+ Read the first num_frames frames of a video file, convert each frame to a tensor in the [-1, 1] range,
+ center-crop it to crop_size x crop_size, and return a 4D tensor.
109
+ """
110
+ processed_frames = []
111
+ transform = transforms.Compose([
112
+ transforms.CenterCrop(crop_size), # center crop
+ transforms.ToTensor(),
+ transforms.Lambda(lambda x: x * 2 - 1) # map [0, 1] to [-1, 1]
115
+ ])
116
+
117
+ # Read the video with OpenCV
+ cap = cv2.VideoCapture(video_path)
+
+ if not cap.isOpened():
+ raise ValueError(f"Cannot open video file: {video_path}")
122
+
123
+ frame_count = 0
124
+
125
+ while frame_count < num_frames:
126
+ ret, frame = cap.read()
127
+ if not ret:
128
+ break # the video is exhausted or has fewer frames than requested
+
+ # Convert to RGB
131
+ frame = cv2.cvtColor(frame, cv2.COLOR_BGR2RGB)
132
+ image = Image.fromarray(frame)
133
+
134
+ # Apply the transform
135
+ processed_frame = transform(image)
136
+ processed_frames.append(processed_frame)
137
+
138
+ frame_count += 1
139
+
140
+ cap.release() # release the video resource
141
+
142
+ if len(processed_frames) < num_frames:
143
+ raise ValueError(f"视频帧不足 {num_frames} 帧,仅找到 {len(processed_frames)} 帧。")
144
+
145
+ return torch.stack(processed_frames) # return a 4D tensor (frames, channels, height, width)
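# Example usage (illustrative; assumes the clip has at least 16 frames):
#   bg = load_and_process_video('demo/clean_bg_extracted/8/cropped_video.mp4')
#   bg.shape  # -> torch.Size([16, 3, 512, 512]), values in [-1, 1]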
146
+
147
+
148
+ def clear_cache(output_path):
149
+ if os.path.exists(output_path):
150
+ os.remove(output_path)
151
+ return None
152
+
153
+
154
+ #! Load the model
+ # Config path and model construction
+ config_path = 'configs/instruct_v2v_ic_gradio.yaml'
+ diffusion_model = unit_test_create_model(config_path).cuda()
+
+ # Load the model checkpoint
160
+ ckpt_path = 'models/pytorch_model.bin' #! change
161
+ ckpt = torch.load(ckpt_path, map_location='cpu')
162
+ diffusion_model.load_state_dict(ckpt, strict=False)
163
+
164
+ # import pdb; pdb.set_trace()
165
+
166
+ # Global temporary directory for generated videos (dummy_process below writes into it)
+ new_tmp_dir = "./demo/gradio_bg"
+ os.makedirs(new_tmp_dir, exist_ok=True)
169
+
170
+ # import pdb; pdb.set_trace()
171
+
172
+ def save_video_from_frames(image_pred, save_pth, fps=8):
173
+ """
174
+ Save the frames in image_pred as a video file.
+
+ Args:
+ - image_pred: Tensor of shape (1, 16, 3, 512, 512)
+ - save_pth: path where the video is saved, e.g. "output_video.mp4"
+ - fps: frame rate of the video
180
+ """
181
+ # Video parameters
+ num_frames = image_pred.shape[1]
+ frame_height, frame_width = 512, 512 # target size
+ fourcc = cv2.VideoWriter_fourcc(*'mp4v') # mp4 codec
185
+
186
+ # Create the VideoWriter
187
+ out = cv2.VideoWriter(save_pth, fourcc, fps, (frame_width, frame_height))
188
+
189
+ for i in range(num_frames):
190
+ # Un-normalize and convert to the 0-255 range
191
+ pred_frame = clip_image(unnormalize(image_pred[0][i].unsqueeze(0))) * 255
192
+ pred_frame_resized = pred_frame.squeeze(0).detach().cpu() # (3, 512, 512)
193
+ pred_frame_resized = pred_frame_resized.permute(1, 2, 0).numpy().astype("uint8") # (512, 512, 3)
194
+
195
+ # Resize to the target frame size
+ pred_frame_resized = cv2.resize(pred_frame_resized, (frame_width, frame_height))
+
+ # Convert RGB to BGR (OpenCV uses BGR)
199
+ pred_frame_bgr = cv2.cvtColor(pred_frame_resized, cv2.COLOR_RGB2BGR)
200
+
201
+ # Write the frame to the video
+ out.write(pred_frame_bgr)
+
+ # Release the VideoWriter
+ out.release()
+ print(f"Video saved to {save_pth}")
207
+
208
+
209
+ # Relighting pipeline: encode the foreground/background videos, run the diffusion model, and save the result
210
+ def dummy_process(input_fg, input_bg):
211
+ # import pdb; pdb.set_trace()
212
+ fg_tensor = load_and_process_video(input_fg).cuda().unsqueeze(0)
213
+ bg_tensor = load_and_process_video(input_bg).cuda().unsqueeze(0) # (1, 16, 4, 64, 64)
214
+
215
+ cond_fg_tensor = diffusion_model.encode_image_to_latent(fg_tensor) # (1, 16, 4, 64, 64)
216
+ cond_bg_tensor = diffusion_model.encode_image_to_latent(bg_tensor)
217
+ cond_tensor = torch.cat((cond_fg_tensor, cond_bg_tensor), dim=2)
218
+
219
+ # Initialize the latent
220
+ init_latent = torch.randn_like(cond_fg_tensor)
221
+
222
+ inf_pipe = InferenceIP2PVideo(
223
+ diffusion_model.unet,
224
+ scheduler='ddpm',
225
+ num_ddim_steps=20
226
+ )
227
+
228
+ EDIT_PROMPT = 'change the background'
229
+ VIDEO_CFG = 1.2
230
+ TEXT_CFG = 7.5
231
+ text_cond = diffusion_model.encode_text([EDIT_PROMPT]) # (1, 77, 768)
232
+ text_uncond = diffusion_model.encode_text([''])
233
+ latent_pred = inf_pipe(
234
+ latent=init_latent,
235
+ text_cond=text_cond,
236
+ text_uncond=text_uncond,
237
+ img_cond=cond_tensor,
238
+ text_cfg=TEXT_CFG,
239
+ img_cfg=VIDEO_CFG,
240
+ )['latent']
241
+
242
+ image_pred = diffusion_model.decode_latent_to_image(latent_pred) # (1,16,3,512,512)
243
+ output_path = os.path.join(new_tmp_dir, f"output_{int(time.time())}.mp4")
244
+ # clear_cache(output_path)
245
+
246
+ save_video_from_frames(image_pred, output_path)
247
+ # import pdb; pdb.set_trace()
248
+ # fps = 8
249
+ # frames = []
250
+ # for i in range(16):
251
+ # pred_frame = clip_image(unnormalize(image_pred[0][i].unsqueeze(0))) * 255
252
+ # pred_frame_resized = pred_frame.squeeze(0).detach().cpu() #(3,512,512)
253
+ # pred_frame_resized = pred_frame_resized.permute(1, 2, 0).detach().cpu().numpy().astype("uint8") #(512,512,3) np
254
+ # Image.fromarray(pred_frame_resized).save(save_pth)
255
+
256
+ # # Generate a simple black video as an example
257
+ # output_path = os.path.join(new_tmp_dir, "output.mp4")
258
+ # fourcc = cv2.VideoWriter_fourcc(*'mp4v')
259
+ # out = cv2.VideoWriter(output_path, fourcc, 20.0, (512, 512))
260
+
261
+ # for _ in range(60): # generate a 3-second video (20 fps)
262
+ # frame = np.zeros((512, 512, 3), dtype=np.uint8)
263
+ # out.write(frame)
264
+ # out.release()
265
+
266
+ return output_path
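# Minimal sketch of calling the pipeline outside Gradio; the foreground path is
# illustrative (not shipped in this view), the background path comes from the demo assets:
#   out = dummy_process('path/to/foreground.mp4',
#                       'demo/clean_bg_extracted/8/cropped_video.mp4')
#   print(out)  # path of the relighted .mp4 written under new_tmp_dir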
267
+
268
+ # Enum for background selection
269
+ class BGSource(Enum):
270
+ UPLOAD = "Use Background Video"
271
+ UPLOAD_FLIP = "Use Flipped Background Video"
272
+ LEFT = "Left Light"
273
+ RIGHT = "Right Light"
274
+ TOP = "Top Light"
275
+ BOTTOM = "Bottom Light"
276
+ GREY = "Ambient"
277
+
278
+ # Quick prompt examples
279
+ quick_prompts = [
280
+ 'beautiful woman',
281
+ 'handsome man',
282
+ 'beautiful woman, cinematic lighting',
283
+ 'handsome man, cinematic lighting',
284
+ 'beautiful woman, natural lighting',
285
+ 'handsome man, natural lighting',
286
+ 'beautiful woman, neo punk lighting, cyberpunk',
287
+ 'handsome man, neo punk lighting, cyberpunk',
288
+ ]
289
+ quick_prompts = [[x] for x in quick_prompts]
290
+
291
+ # Gradio UI layout
292
+ block = gr.Blocks().queue()
293
+ with block:
294
+ with gr.Row():
295
+ gr.Markdown("## IC-Light (Relighting with Foreground and Background Video Condition)")
296
+
297
+ with gr.Row():
298
+ with gr.Column():
299
+ with gr.Row():
300
+ input_fg = gr.Video(label="Foreground Video", height=370, width=370, visible=True)
301
+ input_bg = gr.Video(label="Background Video", height=370, width=370, visible=True)
302
+
303
+ prompt = gr.Textbox(label="Prompt")
304
+ bg_source = gr.Radio(choices=[e.value for e in BGSource],
305
+ value=BGSource.UPLOAD.value,
306
+ label="Background Source", type='value')
307
+
308
+ example_prompts = gr.Dataset(samples=quick_prompts, label='Prompt Quick List', components=[prompt])
309
+ bg_gallery = gr.Gallery(height=450, object_fit='contain', label='Background Quick List', value=db_examples.bg_samples, columns=5, allow_preview=False)
310
+ relight_button = gr.Button(value="Relight")
311
+
312
+ with gr.Group():
313
+ with gr.Row():
314
+ num_samples = gr.Slider(label="Videos", minimum=1, maximum=12, value=1, step=1)
315
+ seed = gr.Number(label="Seed", value=12345, precision=0)
316
+ with gr.Row():
317
+ video_width = gr.Slider(label="Video Width", minimum=256, maximum=1024, value=512, step=64)
318
+ video_height = gr.Slider(label="Video Height", minimum=256, maximum=1024, value=640, step=64)
319
+
320
+ with gr.Accordion("Advanced options", open=False):
321
+ steps = gr.Slider(label="Steps", minimum=1, maximum=100, value=20, step=1)
322
+ cfg = gr.Slider(label="CFG Scale", minimum=1.0, maximum=32.0, value=7.0, step=0.01)
323
+ highres_scale = gr.Slider(label="Highres Scale", minimum=1.0, maximum=3.0, value=1.5, step=0.01)
324
+ highres_denoise = gr.Slider(label="Highres Denoise", minimum=0.1, maximum=0.9, value=0.5, step=0.01)
325
+ a_prompt = gr.Textbox(label="Added Prompt", value='best quality')
326
+ n_prompt = gr.Textbox(label="Negative Prompt", value='lowres, bad anatomy, bad hands, cropped, worst quality')
327
+ normal_button = gr.Button(value="Compute Normal (4x Slower)")
328
+
329
+ with gr.Column():
330
+ result_video = gr.Video(label='Output Video', height=600, width=600, visible=True)
331
+
332
+ # Input list
333
+ # ips = [input_fg, input_bg, prompt, video_width, video_height, num_samples, seed, steps, a_prompt, n_prompt, cfg, highres_scale, highres_denoise, bg_source]
334
+ ips = [input_fg, input_bg]
335
+
336
+ # Bind the buttons to the processing function
337
+ # relight_button.click(fn=lambda: None, inputs=[], outputs=[result_video])
338
+
339
+ relight_button.click(fn=dummy_process, inputs=ips, outputs=[result_video])
340
+
341
+ normal_button.click(fn=dummy_process, inputs=ips, outputs=[result_video])
342
+
343
+ # Background gallery selection
344
+ def bg_gallery_selected(gal, evt: gr.SelectData):
345
+ # import pdb; pdb.set_trace()
346
+ # img_path = gal[evt.index][0]
347
+ img_path = db_examples.bg_samples[evt.index]
348
+ video_path = img_path.replace('frames/0000.png', 'cropped_video.mp4')
349
+ return video_path
350
+
351
+ bg_gallery.select(bg_gallery_selected, inputs=bg_gallery, outputs=input_bg)
352
+
353
+ # Examples
354
+ # dummy_video_for_outputs = gr.Video(visible=False, label='Result')
355
+ gr.Examples(
356
+ fn=lambda *args: args[-1],
357
+ examples=db_examples.background_conditioned_examples,
358
+ inputs=[input_fg, input_bg, prompt, bg_source, video_width, video_height, seed, result_video],
359
+ outputs=[result_video],
360
+ run_on_click=True, examples_per_page=1024
361
+ )
362
+
363
+ # Launch the Gradio app
364
+ # block.launch(server_name='0.0.0.0', server_port=10002, share=True)
365
+ block.launch(share=True)
configs/instruct_v2v.yaml ADDED
@@ -0,0 +1,149 @@
1
+ expt_dir: experiments
2
+ expt_name: instruct_v2v_ic
3
+ trainer_args:
4
+ max_epochs: 10
5
+ accelerator: "gpu"
6
+ devices: [0]
7
+ limit_train_batches: 2048
8
+ limit_val_batches: 5 #! limits how many validation batches run per epoch
9
+ # strategy: "ddp"
10
+ strategy: "deepspeed_stage_2"
11
+ accumulate_grad_batches: 128 #! double-check this value
12
+ check_val_every_n_epoch: 1 #! check whether this value affects the logging
13
+ diffusion:
14
+ target: pl_trainer.instruct_p2p_video.InstructP2PVideoTrainerTemporal
15
+ params:
16
+ beta_schedule_args:
17
+ beta_schedule: scaled_linear
18
+ num_train_timesteps: 1000
19
+ beta_start: 0.00085
20
+ beta_end: 0.012
21
+ clip_sample: false
22
+ thresholding: false
23
+ prediction_type: epsilon
24
+ loss_fn: l2
25
+ optim_args:
26
+ lr: 1e-5
27
+ unet_init_weights: #! note: training could also start from an iv2v checkpoint
28
+ - pretrained_models/instruct_pix2pix/diffusion_pytorch_model.bin # the SD weights loaded here are InstructPix2Pix's
29
+ - pretrained_models/Motion_Module/mm_sd_v15.ckpt # motion module, presumably the AnimateDiff weights
30
+ vae_init_weights: pretrained_models/instruct_pix2pix/vqvae.ckpt
31
+ text_model_init_weights: pretrained_models/instruct_pix2pix/text.ckpt
32
+ scale_factor: 0.18215
33
+ guidance_scale: 5 # not used
34
+ ddim_sampling_steps: 20
35
+ text_cfg: 7.5
36
+ img_cfg: 1.2
37
+ cond_image_dropout: 0.1
38
+ prompt_type: edit_prompt
39
+ unet:
40
+ target: modules.video_unet_temporal.unet.UNet3DConditionModel
41
+ params:
42
+ in_channels: 4 #! 8-> 4
43
+ out_channels: 4
44
+ act_fn: silu
45
+ attention_head_dim: 8
46
+ block_out_channels:
47
+ - 320
48
+ - 640
49
+ - 1280
50
+ - 1280
51
+ cross_attention_dim: 768
52
+ down_block_types:
53
+ - CrossAttnDownBlock3D
54
+ - CrossAttnDownBlock3D
55
+ - CrossAttnDownBlock3D
56
+ - DownBlock3D
57
+ up_block_types:
58
+ - UpBlock3D
59
+ - CrossAttnUpBlock3D
60
+ - CrossAttnUpBlock3D
61
+ - CrossAttnUpBlock3D
62
+ downsample_padding: 1
63
+ layers_per_block: 2
64
+ mid_block_scale_factor: 1
65
+ norm_eps: 1e-05
66
+ norm_num_groups: 32
67
+ sample_size: 64
68
+ use_motion_module: true
69
+ motion_module_resolutions:
70
+ - 1
71
+ - 2
72
+ - 4
73
+ - 8
74
+ motion_module_mid_block: false
75
+ motion_module_decoder_only: false
76
+ motion_module_type: Vanilla
77
+ motion_module_kwargs:
78
+ num_attention_heads: 8
79
+ num_transformer_block: 1
80
+ attention_block_types:
81
+ - Temporal_Self
82
+ - Temporal_Self
83
+ temporal_position_encoding: true
84
+ temporal_position_encoding_max_len: 32
85
+ temporal_attention_dim_div: 1
86
+ vae:
87
+ target: modules.kl_autoencoder.autoencoder.AutoencoderKL
88
+ params:
89
+ embed_dim: 4
90
+ ddconfig:
91
+ double_z: true
92
+ z_channels: 4
93
+ resolution: 256
94
+ in_channels: 3
95
+ out_ch: 3
96
+ ch: 128
97
+ ch_mult:
98
+ - 1
99
+ - 2
100
+ - 4
101
+ - 4
102
+ num_res_blocks: 2
103
+ attn_resolutions: []
104
+ dropout: 0.0
105
+ lossconfig:
106
+ target: torch.nn.Identity
107
+ text_model:
108
+ target: modules.openclip.modules.FrozenCLIPEmbedder
109
+ params:
110
+ freeze: true
111
+ data:
112
+ batch_size: 1
113
+ val_batch_size: 1
114
+ train:
115
+ target: dataset.videoP2P.VideoPromptToPromptMotionAug
116
+ params: # remember to update the training paths and the loading code, e.g. how to handle parameters when there is no meta.yaml
117
+ root_dirs:
118
+ - /home/fy/Code/instruct-video-to-video/data_train/Girl
119
+ num_frames: 16
120
+ zoom_ratio: 0.2
121
+ max_zoom: 1.25
122
+ translation_ratio: 0.7
123
+ translation_range: [0, 0.2]
124
+ is_train: True
125
+ val:
126
+ target: dataset.videoP2P.VideoPromptToPromptMotionAug
127
+ params:
128
+ root_dirs:
129
+ - /home/fy/Code/instruct-video-to-video/data_train/Girl
130
+ num_frames: 16
131
+ zoom_ratio: 0.2
132
+ max_zoom: 1.25
133
+ translation_ratio: 0.7
134
+ translation_range: [0, 0.2]
135
+ callbacks:
136
+ - target: pytorch_lightning.callbacks.ModelCheckpoint
137
+ params:
138
+ dirpath: "${expt_dir}/${expt_name}"
139
+ filename: "{epoch:04d}"
140
+ monitor: epoch
141
+ mode: max
142
+ save_top_k: 5
143
+ save_last: true
144
+ - target: callbacks.instruct_p2p_video.InstructP2PLogger
145
+ params:
146
+ max_num_images: 1
147
+ # accumulate_grad_batches: 128
148
+ require_wandb: true
149
+ - target: pytorch_lightning.callbacks.DeviceStatsMonitor
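For reference, app.py consumes configs like this one through `unit_test_create_model(config_path)`. A minimal sketch of inspecting a config by hand, assuming OmegaConf-style parsing (suggested by the `${expt_dir}/${expt_name}` interpolation; the project's actual loader in `misc_utils.train_utils` is not shown in this view):

```python
from omegaconf import OmegaConf

cfg = OmegaConf.load("configs/instruct_v2v.yaml")
print(cfg.diffusion.target)  # pl_trainer.instruct_p2p_video.InstructP2PVideoTrainerTemporal
resolved = OmegaConf.to_container(cfg, resolve=True)
print(resolved["callbacks"][0]["params"]["dirpath"])  # experiments/instruct_v2v_ic
```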
configs/instruct_v2v_ic.yaml ADDED
@@ -0,0 +1,130 @@
1
+ expt_dir: experiments
2
+ expt_name: instruct_v2v_ic
3
+ trainer_args:
4
+ max_epochs: 10
5
+ accelerator: "gpu"
6
+ devices: [0]
7
+ limit_train_batches: 2048
8
+ limit_val_batches: 5 #! 这边限制了每个epoch只跑多少个batch的validation
9
+ # strategy: "ddp"
10
+ strategy: "deepspeed_stage_2"
11
+ accumulate_grad_batches: 128 #! 注意一下这个值
12
+ check_val_every_n_epoch: 1 #! check一下这个值是不是和记录有关。。。
13
+ diffusion:
14
+ target: pl_trainer.instruct_p2p_video.InstructP2PVideoTrainerTemporal
15
+ params:
16
+ beta_schedule_args:
17
+ beta_schedule: scaled_linear
18
+ num_train_timesteps: 1000
19
+ beta_start: 0.00085
20
+ beta_end: 0.012
21
+ clip_sample: false
22
+ thresholding: false
23
+ prediction_type: epsilon
24
+ loss_fn: l2
25
+ optim_args:
26
+ lr: 1e-5
27
+ unet_init_weights: #! 注意一下, 完全可以从iv2v的ckpt开始train
28
+ - unet/diffusion_pytorch_model.safetensors # iclight, unet, sf tensor
29
+ - pretrained_models/Motion_Module/mm_sd_v15.ckpt # motion module, 推测加载的是animatediff的
30
+ - pretrained_models/iclight/iclight_sd15_fbc.safetensors # iclight lora weights
31
+ base_path: /home/fy/.cache/huggingface/hub/models--stablediffusionapi--realistic-vision-v51/snapshots/19e3643d7d963c156d01537188ec08f0b79a514a
32
+ # vae_init_weights: pretrained_models/instruct_pix2pix/vqvae.ckpt
33
+ # text_model_init_weights: pretrained_models/instruct_pix2pix/text.ckpt #! 这两个可以直接设为None, 从from_pretrained中加载
34
+ scale_factor: 0.18215
35
+ guidance_scale: 5 # not used
36
+ ddim_sampling_steps: 20
37
+ text_cfg: 7.5
38
+ img_cfg: 1.2
39
+ cond_image_dropout: 0.1
40
+ prompt_type: edit_prompt
41
+ unet:
42
+ target: modules.video_unet_temporal.unet.UNet3DConditionModel
43
+ params:
44
+ in_channels: 4 #! change:8->12 iclight 改为12 注意一下...
45
+ out_channels: 4
46
+ act_fn: silu
47
+ attention_head_dim: 8
48
+ block_out_channels:
49
+ - 320
50
+ - 640
51
+ - 1280
52
+ - 1280
53
+ cross_attention_dim: 768
54
+ down_block_types:
55
+ - CrossAttnDownBlock3D
56
+ - CrossAttnDownBlock3D
57
+ - CrossAttnDownBlock3D
58
+ - DownBlock3D
59
+ up_block_types:
60
+ - UpBlock3D
61
+ - CrossAttnUpBlock3D
62
+ - CrossAttnUpBlock3D
63
+ - CrossAttnUpBlock3D
64
+ downsample_padding: 1
65
+ layers_per_block: 2
66
+ mid_block_scale_factor: 1
67
+ norm_eps: 1e-05
68
+ norm_num_groups: 32
69
+ sample_size: 64
70
+ use_motion_module: false #!!! 这边test iclight的时候可以不用motion module 即False
71
+ motion_module_resolutions:
72
+ - 1
73
+ - 2
74
+ - 4
75
+ - 8
76
+ motion_module_mid_block: false
77
+ motion_module_decoder_only: false
78
+ motion_module_type: Vanilla
79
+ motion_module_kwargs:
80
+ num_attention_heads: 8
81
+ num_transformer_block: 1
82
+ attention_block_types:
83
+ - Temporal_Self
84
+ - Temporal_Self
85
+ temporal_position_encoding: true
86
+ temporal_position_encoding_max_len: 32
87
+ temporal_attention_dim_div: 1
88
+ text_model:
89
+ target: modules.openclip.modules.FrozenCLIPEmbedder
90
+ params:
91
+ freeze: true
92
+ data:
93
+ batch_size: 1
94
+ val_batch_size: 1
95
+ train:
96
+ target: dataset.videoP2P.VideoPromptToPromptMotionAug
97
+ params: #注意修改一下training的路径,和相关加载的代码, 比如说没有meta.yaml这些参数怎么搞
98
+ root_dirs:
99
+ - /home/fy/Code/instruct-video-to-video/data_train/Girl
100
+ num_frames: 16
101
+ zoom_ratio: 0.2
102
+ max_zoom: 1.25
103
+ translation_ratio: 0.7
104
+ translation_range: [0, 0.2]
105
+ is_train: True
106
+ val:
107
+ target: dataset.videoP2P.VideoPromptToPromptMotionAug
108
+ params:
109
+ root_dirs:
110
+ - /home/fy/Code/instruct-video-to-video/data_train/Girl
111
+ num_frames: 16
112
+ zoom_ratio: 0.2
113
+ max_zoom: 1.25
114
+ translation_ratio: 0.7
115
+ translation_range: [0, 0.2]
116
+ callbacks:
117
+ - target: pytorch_lightning.callbacks.ModelCheckpoint
118
+ params:
119
+ dirpath: "${expt_dir}/${expt_name}"
120
+ filename: "{epoch:04d}"
121
+ monitor: epoch
122
+ mode: max
123
+ save_top_k: 5
124
+ save_last: true
125
+ - target: callbacks.instruct_p2p_video.InstructP2PLogger
126
+ params:
127
+ max_num_images: 1
128
+ # accumulate_grad_batches: 128
129
+ require_wandb: true
130
+ - target: pytorch_lightning.callbacks.DeviceStatsMonitor
configs/instruct_v2v_ic_gradio.yaml ADDED
@@ -0,0 +1,81 @@
1
+ diffusion:
2
+ target: pl_trainer.instruct_p2p_video.InstructP2PVideoTrainerTemporal
3
+ params:
4
+ beta_schedule_args:
5
+ beta_schedule: scaled_linear
6
+ num_train_timesteps: 1000
7
+ beta_start: 0.00085
8
+ beta_end: 0.012
9
+ clip_sample: false
10
+ thresholding: false
11
+ prediction_type: epsilon
12
+ loss_fn: l2
13
+ optim_args:
14
+ lr: 1e-5
15
+ # base_path: models/realistic_v51
16
+ base_path: stablediffusionapi/realistic-vision-v51
17
+ # unet_init_weights: #! note: training could also start from an iv2v checkpoint
18
+ # - diffusion_pytorch_model.safetensors # iclight, unet, sf tensor
19
+ # - relvid_mm_sd15_fbc.pth
20
+ # - iclight_sd15_fbc.safetensors # iclight lora weights
21
+ # base_path: stablediffusionapi/realistic-vision-v51
22
+ # vae_init_weights: pretrained_models/instruct_pix2pix/vqvae.ckpt
23
+ # text_model_init_weights: pretrained_models/instruct_pix2pix/text.ckpt #! both of these can be set to None and loaded via from_pretrained
24
+ scale_factor: 0.18215
25
+ guidance_scale: 5 # not used
26
+ ddim_sampling_steps: 20
27
+ text_cfg: 7.5
28
+ img_cfg: 1.2
29
+ cond_image_dropout: 0.1
30
+ prompt_type: edit_prompt
31
+ unet:
32
+ target: modules.video_unet_temporal.unet.UNet3DConditionModel
33
+ params:
34
+ in_channels: 4 #! change: 8->12; for IC-Light this becomes 12, double-check
35
+ out_channels: 4
36
+ act_fn: silu
37
+ attention_head_dim: 8
38
+ block_out_channels:
39
+ - 320
40
+ - 640
41
+ - 1280
42
+ - 1280
43
+ cross_attention_dim: 768
44
+ down_block_types:
45
+ - CrossAttnDownBlock3D
46
+ - CrossAttnDownBlock3D
47
+ - CrossAttnDownBlock3D
48
+ - DownBlock3D
49
+ up_block_types:
50
+ - UpBlock3D
51
+ - CrossAttnUpBlock3D
52
+ - CrossAttnUpBlock3D
53
+ - CrossAttnUpBlock3D
54
+ downsample_padding: 1
55
+ layers_per_block: 2
56
+ mid_block_scale_factor: 1
57
+ norm_eps: 1e-05
58
+ norm_num_groups: 32
59
+ sample_size: 64
60
+ use_motion_module: true #!!! when only testing IC-Light, the motion module can be disabled (set to false)
61
+ motion_module_resolutions:
62
+ - 1
63
+ - 2
64
+ - 4
65
+ - 8
66
+ motion_module_mid_block: false
67
+ motion_module_decoder_only: false
68
+ motion_module_type: Vanilla
69
+ motion_module_kwargs:
70
+ num_attention_heads: 8
71
+ num_transformer_block: 1
72
+ attention_block_types:
73
+ - Temporal_Self
74
+ - Temporal_Self
75
+ temporal_position_encoding: true
76
+ temporal_position_encoding_max_len: 32
77
+ temporal_attention_dim_div: 1
78
+ text_model:
79
+ target: modules.openclip.modules.FrozenCLIPEmbedder
80
+ params:
81
+ freeze: true
configs/instruct_v2v_ic_inference.yaml ADDED
@@ -0,0 +1,79 @@
1
+ diffusion:
2
+ target: pl_trainer.instruct_p2p_video.InstructP2PVideoTrainerTemporal
3
+ params:
4
+ beta_schedule_args:
5
+ beta_schedule: scaled_linear
6
+ num_train_timesteps: 1000
7
+ beta_start: 0.00085
8
+ beta_end: 0.012
9
+ clip_sample: false
10
+ thresholding: false
11
+ prediction_type: epsilon
12
+ loss_fn: l2
13
+ optim_args:
14
+ lr: 1e-5
15
+ unet_init_weights: #! 注意一下, 完全可以从iv2v的ckpt开始train
16
+ - unet/diffusion_pytorch_model.safetensors # iclight, unet, sf tensor
17
+ - pretrained_models/Motion_Module/mm_sd_v15.ckpt # motion module, 推测加载的是animatediff的
18
+ - pretrained_models/iclight/iclight_sd15_fbc.safetensors # iclight lora weights
19
+ base_path: /mnt/petrelfs/fangye/.cache/huggingface/hub/models--stablediffusionapi--realistic-vision-v51/snapshots/19e3643d7d963c156d01537188ec08f0b79a514a
20
+ # vae_init_weights: pretrained_models/instruct_pix2pix/vqvae.ckpt
21
+ # text_model_init_weights: pretrained_models/instruct_pix2pix/text.ckpt #! 这两个可以直接设为None, 从from_pretrained中加载
22
+ scale_factor: 0.18215
23
+ guidance_scale: 5 # not used
24
+ ddim_sampling_steps: 20
25
+ text_cfg: 7.5
26
+ img_cfg: 1.2
27
+ cond_image_dropout: 0.1
28
+ prompt_type: edit_prompt
29
+ unet:
30
+ target: modules.video_unet_temporal.unet.UNet3DConditionModel
31
+ params:
32
+ in_channels: 4 #! change:8->12 iclight 改为12 注意一下...
33
+ out_channels: 4
34
+ act_fn: silu
35
+ attention_head_dim: 8
36
+ block_out_channels:
37
+ - 320
38
+ - 640
39
+ - 1280
40
+ - 1280
41
+ cross_attention_dim: 768
42
+ down_block_types:
43
+ - CrossAttnDownBlock3D
44
+ - CrossAttnDownBlock3D
45
+ - CrossAttnDownBlock3D
46
+ - DownBlock3D
47
+ up_block_types:
48
+ - UpBlock3D
49
+ - CrossAttnUpBlock3D
50
+ - CrossAttnUpBlock3D
51
+ - CrossAttnUpBlock3D
52
+ downsample_padding: 1
53
+ layers_per_block: 2
54
+ mid_block_scale_factor: 1
55
+ norm_eps: 1e-05
56
+ norm_num_groups: 32
57
+ sample_size: 64
58
+ use_motion_module: true #!!! 这边test iclight的时候可以不用motion module 即False
59
+ motion_module_resolutions:
60
+ - 1
61
+ - 2
62
+ - 4
63
+ - 8
64
+ motion_module_mid_block: false
65
+ motion_module_decoder_only: false
66
+ motion_module_type: Vanilla
67
+ motion_module_kwargs:
68
+ num_attention_heads: 8
69
+ num_transformer_block: 1
70
+ attention_block_types:
71
+ - Temporal_Self
72
+ - Temporal_Self
73
+ temporal_position_encoding: true
74
+ temporal_position_encoding_max_len: 32
75
+ temporal_attention_dim_div: 1
76
+ text_model:
77
+ target: modules.openclip.modules.FrozenCLIPEmbedder
78
+ params:
79
+ freeze: true
configs/instruct_v2v_ic_inference_hdr.yaml ADDED
@@ -0,0 +1,80 @@
1
+ diffusion:
2
+ target: pl_trainer.instruct_p2p_video.InstructP2PVideoTrainerTemporalText
3
+ params:
4
+ beta_schedule_args:
5
+ beta_schedule: scaled_linear
6
+ num_train_timesteps: 1000
7
+ beta_start: 0.00085
8
+ beta_end: 0.012
9
+ clip_sample: false
10
+ thresholding: false
11
+ prediction_type: epsilon
12
+ loss_fn: l2
13
+ optim_args:
14
+ lr: 1e-5
15
+ unet_init_weights: #! 注意一下, 完全可以从iv2v的ckpt开始train
16
+ - unet/diffusion_pytorch_model.safetensors # iclight, unet, sf tensor
17
+ - pretrained_models/Motion_Module/mm_sd_v15.ckpt # motion module, 推测加载的是animatediff的
18
+ - pretrained_models/iclight/iclight_sd15_fc.safetensors # iclight lora weights
19
+ base_path: /mnt/petrelfs/fangye/.cache/huggingface/hub/models--stablediffusionapi--realistic-vision-v51/snapshots/19e3643d7d963c156d01537188ec08f0b79a514a
20
+ # vae_init_weights: pretrained_models/instruct_pix2pix/vqvae.ckpt
21
+ # text_model_init_weights: pretrained_models/instruct_pix2pix/text.ckpt #! 这两个可以直接设为None, 从from_pretrained中加载
22
+ scale_factor: 0.18215
23
+ guidance_scale: 5 # not used
24
+ ddim_sampling_steps: 20
25
+ text_cfg: 7.5
26
+ img_cfg: 1.2
27
+ cond_image_dropout: 0.1
28
+ prompt_type: edit_prompt
29
+ hdr_train: True
30
+ unet:
31
+ target: modules.video_unet_temporal.unet.UNet3DConditionModel
32
+ params:
33
+ in_channels: 4 #! change:8->12 iclight 改为12 注意一下...
34
+ out_channels: 4
35
+ act_fn: silu
36
+ attention_head_dim: 8
37
+ block_out_channels:
38
+ - 320
39
+ - 640
40
+ - 1280
41
+ - 1280
42
+ cross_attention_dim: 768
43
+ down_block_types:
44
+ - CrossAttnDownBlock3D
45
+ - CrossAttnDownBlock3D
46
+ - CrossAttnDownBlock3D
47
+ - DownBlock3D
48
+ up_block_types:
49
+ - UpBlock3D
50
+ - CrossAttnUpBlock3D
51
+ - CrossAttnUpBlock3D
52
+ - CrossAttnUpBlock3D
53
+ downsample_padding: 1
54
+ layers_per_block: 2
55
+ mid_block_scale_factor: 1
56
+ norm_eps: 1e-05
57
+ norm_num_groups: 32
58
+ sample_size: 64
59
+ use_motion_module: true #!!! 这边test iclight的时候可以不用motion module 即False
60
+ motion_module_resolutions:
61
+ - 1
62
+ - 2
63
+ - 4
64
+ - 8
65
+ motion_module_mid_block: false
66
+ motion_module_decoder_only: false
67
+ motion_module_type: Vanilla
68
+ motion_module_kwargs:
69
+ num_attention_heads: 8
70
+ num_transformer_block: 1
71
+ attention_block_types:
72
+ - Temporal_Self
73
+ - Temporal_Self
74
+ temporal_position_encoding: true
75
+ temporal_position_encoding_max_len: 32
76
+ temporal_attention_dim_div: 1
77
+ text_model:
78
+ target: modules.openclip.modules.FrozenCLIPEmbedder
79
+ params:
80
+ freeze: true
configs/instruct_v2v_ic_inference_text.yaml ADDED
@@ -0,0 +1,79 @@
1
+ diffusion:
2
+ target: pl_trainer.instruct_p2p_video.InstructP2PVideoTrainerTemporalText
3
+ params:
4
+ beta_schedule_args:
5
+ beta_schedule: scaled_linear
6
+ num_train_timesteps: 1000
7
+ beta_start: 0.00085
8
+ beta_end: 0.012
9
+ clip_sample: false
10
+ thresholding: false
11
+ prediction_type: epsilon
12
+ loss_fn: l2
13
+ optim_args:
14
+ lr: 1e-5
15
+ unet_init_weights: #! 注意一下, 完全可以从iv2v的ckpt开始train
16
+ - unet/diffusion_pytorch_model.safetensors # iclight, unet, sf tensor
17
+ - pretrained_models/Motion_Module/mm_sd_v15.ckpt # motion module, 推测加载的是animatediff的
18
+ - pretrained_models/iclight/iclight_sd15_fc.safetensors # iclight lora weights
19
+ base_path: /mnt/petrelfs/fangye/.cache/huggingface/hub/models--stablediffusionapi--realistic-vision-v51/snapshots/19e3643d7d963c156d01537188ec08f0b79a514a
20
+ # vae_init_weights: pretrained_models/instruct_pix2pix/vqvae.ckpt
21
+ # text_model_init_weights: pretrained_models/instruct_pix2pix/text.ckpt #! 这两个可以直接设为None, 从from_pretrained中加载
22
+ scale_factor: 0.18215
23
+ guidance_scale: 5 # not used
24
+ ddim_sampling_steps: 20
25
+ text_cfg: 7.5
26
+ img_cfg: 1.2
27
+ cond_image_dropout: 0.1
28
+ prompt_type: edit_prompt
29
+ unet:
30
+ target: modules.video_unet_temporal.unet.UNet3DConditionModel
31
+ params:
32
+ in_channels: 4 #! change:8->12 iclight 改为12 注意一下...
33
+ out_channels: 4
34
+ act_fn: silu
35
+ attention_head_dim: 8
36
+ block_out_channels:
37
+ - 320
38
+ - 640
39
+ - 1280
40
+ - 1280
41
+ cross_attention_dim: 768
42
+ down_block_types:
43
+ - CrossAttnDownBlock3D
44
+ - CrossAttnDownBlock3D
45
+ - CrossAttnDownBlock3D
46
+ - DownBlock3D
47
+ up_block_types:
48
+ - UpBlock3D
49
+ - CrossAttnUpBlock3D
50
+ - CrossAttnUpBlock3D
51
+ - CrossAttnUpBlock3D
52
+ downsample_padding: 1
53
+ layers_per_block: 2
54
+ mid_block_scale_factor: 1
55
+ norm_eps: 1e-05
56
+ norm_num_groups: 32
57
+ sample_size: 64
58
+ use_motion_module: true #!!! 这边test iclight的时候可以不用motion module 即False
59
+ motion_module_resolutions:
60
+ - 1
61
+ - 2
62
+ - 4
63
+ - 8
64
+ motion_module_mid_block: false
65
+ motion_module_decoder_only: false
66
+ motion_module_type: Vanilla
67
+ motion_module_kwargs:
68
+ num_attention_heads: 8
69
+ num_transformer_block: 1
70
+ attention_block_types:
71
+ - Temporal_Self
72
+ - Temporal_Self
73
+ temporal_position_encoding: true
74
+ temporal_position_encoding_max_len: 32
75
+ temporal_attention_dim_div: 1
76
+ text_model:
77
+ target: modules.openclip.modules.FrozenCLIPEmbedder
78
+ params:
79
+ freeze: true
configs/instruct_v2v_ic_pexels.yaml ADDED
@@ -0,0 +1,133 @@
1
+ expt_dir: experiments
2
+ expt_name: instruct_v2v_ic_pexels_text_bgdrop_0.3_trystepckpt
3
+ trainer_args:
4
+ max_epochs: 1
5
+ accelerator: "gpu"
6
+ devices: [0] #! change to get more cards
7
+ limit_train_batches: 2048
8
+ limit_val_batches: 1 #! 这边限制了每个epoch只跑多少个batch的validation
9
+ # strategy: "ddp"
10
+ strategy: "deepspeed_stage_2"
11
+ # autotune_only_on_rank_zero: true # 确保只有一个进程执行调优表操作
12
+ accumulate_grad_batches: 1 #! 注意一下这个值
13
+ check_val_every_n_epoch: 1 #! check一下这个值是不是和记录有关。。。
14
+ # precision: 16 # 启用半精度 (FP16)
15
+ diffusion:
16
+ target: pl_trainer.instruct_p2p_video.InstructP2PVideoTrainerTemporal
17
+ params:
18
+ beta_schedule_args:
19
+ beta_schedule: scaled_linear
20
+ num_train_timesteps: 1000
21
+ beta_start: 0.00085
22
+ beta_end: 0.012
23
+ clip_sample: false
24
+ thresholding: false
25
+ prediction_type: epsilon
26
+ loss_fn: l2
27
+ optim_args:
28
+ lr: 1e-5
29
+ unet_init_weights: #! 注意一下, 完全可以从iv2v的ckpt开始train
30
+ - unet/diffusion_pytorch_model.safetensors # iclight, unet, sf tensor
31
+ - pretrained_models/Motion_Module/mm_sd_v15.ckpt # motion module, 推测加载的是animatediff的
32
+ - pretrained_models/iclight/iclight_sd15_fbc.safetensors # iclight lora weights
33
+ base_path: /mnt/petrelfs/fangye/.cache/huggingface/hub/models--stablediffusionapi--realistic-vision-v51/snapshots/19e3643d7d963c156d01537188ec08f0b79a514a
34
+ # vae_init_weights: pretrained_models/instruct_pix2pix/vqvae.ckpt
35
+ # text_model_init_weights: pretrained_models/instruct_pix2pix/text.ckpt #! 这两个可以直接设为None, 从from_pretrained中加载
36
+ scale_factor: 0.18215
37
+ guidance_scale: 5 # not used
38
+ ddim_sampling_steps: 20
39
+ text_cfg: 7.5
40
+ img_cfg: 1.2
41
+ cond_image_dropout: 0.1
42
+ cond_text_dropout: 0.1
43
+ prompt_type: edit_prompt
44
+ unet:
45
+ target: modules.video_unet_temporal.unet.UNet3DConditionModel
46
+ params:
47
+ in_channels: 4 #! change:8->12 iclight 改为12 注意一下...
48
+ out_channels: 4
49
+ act_fn: silu
50
+ attention_head_dim: 8
51
+ block_out_channels:
52
+ - 320
53
+ - 640
54
+ - 1280
55
+ - 1280
56
+ cross_attention_dim: 768
57
+ down_block_types:
58
+ - CrossAttnDownBlock3D
59
+ - CrossAttnDownBlock3D
60
+ - CrossAttnDownBlock3D
61
+ - DownBlock3D
62
+ up_block_types:
63
+ - UpBlock3D
64
+ - CrossAttnUpBlock3D
65
+ - CrossAttnUpBlock3D
66
+ - CrossAttnUpBlock3D
67
+ downsample_padding: 1
68
+ layers_per_block: 2
69
+ mid_block_scale_factor: 1
70
+ norm_eps: 1e-05
71
+ norm_num_groups: 32
72
+ sample_size: 64
73
+ use_motion_module: true #!!! 这边test iclight的时候可以不用motion module 即False
74
+ motion_module_resolutions:
75
+ - 1
76
+ - 2
77
+ - 4
78
+ - 8
79
+ motion_module_mid_block: false
80
+ motion_module_decoder_only: false
81
+ motion_module_type: Vanilla
82
+ motion_module_kwargs:
83
+ num_attention_heads: 8
84
+ num_transformer_block: 1
85
+ attention_block_types:
86
+ - Temporal_Self
87
+ - Temporal_Self
88
+ temporal_position_encoding: true
89
+ temporal_position_encoding_max_len: 32
90
+ temporal_attention_dim_div: 1
91
+ text_model:
92
+ target: modules.openclip.modules.FrozenCLIPEmbedder
93
+ params:
94
+ freeze: true
95
+ data:
96
+ batch_size: 1
97
+ val_batch_size: 1
98
+ train:
99
+ target: dataset.videoP2P.VideoPromptToPromptMotionAugPexels
100
+ params: # 注意修改一下training的路径,和相关加载的代码, 比如说没有meta.yaml这些参数怎么搞
101
+ root_dirs:
102
+ - /mnt/petrelfs/fangye/test/instruct-video-to-video_1019/data_train_pexels/rmbg_data
103
+ num_frames: 16
104
+ zoom_ratio: 0.2
105
+ max_zoom: 1.25
106
+ translation_ratio: 0.7
107
+ translation_range: [0, 0.2]
108
+ is_train: True
109
+ val:
110
+ target: dataset.videoP2P.VideoPromptToPromptMotionAugPexels
111
+ params:
112
+ root_dirs:
113
+ - /mnt/petrelfs/fangye/test/instruct-video-to-video_1019/data_train_pexels/rmbg_data
114
+ num_frames: 16
115
+ zoom_ratio: 0.2
116
+ max_zoom: 1.25
117
+ translation_ratio: 0.7
118
+ translation_range: [0, 0.2]
119
+ callbacks:
120
+ - target: pytorch_lightning.callbacks.ModelCheckpoint
121
+ params:
122
+ dirpath: "${expt_dir}/${expt_name}"
123
+ filename: "{step:06d}"
124
+ every_n_train_steps: 10
125
+ save_last: True
126
+ # every_n_train_steps: 10
127
+ - target: callbacks.instruct_p2p_video.InstructP2PLogger
128
+ params:
129
+ max_num_images: 1
130
+ expt_name: instruct_v2v_ic_pexels_text_bgdrop_0.3_trystepckpt
131
+ # accumulate_grad_batches: 128
132
+ require_wandb: true
133
+ - target: pytorch_lightning.callbacks.DeviceStatsMonitor
configs/instruct_v2v_ic_pexels_hdr.yaml ADDED
@@ -0,0 +1,147 @@
1
+ expt_dir: experiments
2
+ expt_name: instruct_v2v_ic_pexels_text_hdr_test_lr0.5_aug_lossc_fix_bs1 #! 注意传入log里面, 不要每次修改
3
+ trainer_args:
4
+ max_epochs: 10
5
+ accelerator: "gpu"
6
+ devices: [0,1,2,3,4,5,6,7] #! change to get more cards
7
+ limit_train_batches: 2048
8
+ limit_val_batches: 3 #! 这边限制了每个epoch只跑多少个batch的validation
9
+ # strategy: "ddp"
10
+ strategy: "deepspeed_stage_2"
11
+ # autotune_only_on_rank_zero: true # 确保只有一个进程执行调优表操作
12
+ accumulate_grad_batches: 128 #! 注意一下这个值 256->128
13
+ check_val_every_n_epoch: 1 #! check一下这个值是不是和记录有关。。。
14
+ # precision: 16 # 启用半精度 (FP16)
15
+ diffusion:
16
+ target: pl_trainer.instruct_p2p_video.InstructP2PVideoTrainerTemporalText
17
+ params:
18
+ beta_schedule_args:
19
+ beta_schedule: scaled_linear
20
+ num_train_timesteps: 1000
21
+ beta_start: 0.00085
22
+ beta_end: 0.012
23
+ clip_sample: false
24
+ thresholding: false
25
+ prediction_type: epsilon
26
+ loss_fn: l2
27
+ optim_args:
28
+ lr: 1e-5 #! 原来是1e-5
29
+ unet_init_weights: #! 注意一下, 完全可以从iv2v的ckpt开始train
30
+ - unet/diffusion_pytorch_model.safetensors # iclight, unet, sf tensor
31
+ - pretrained_models/Motion_Module/mm_sd_v15.ckpt # motion module, 推测加载的是animatediff的
32
+ - pretrained_models/iclight/iclight_sd15_fc.safetensors # iclight lora weights
33
+ base_path: /mnt/petrelfs/fangye/.cache/huggingface/hub/models--stablediffusionapi--realistic-vision-v51/snapshots/19e3643d7d963c156d01537188ec08f0b79a514a
34
+ # vae_init_weights: pretrained_models/instruct_pix2pix/vqvae.ckpt
35
+ # text_model_init_weights: pretrained_models/instruct_pix2pix/text.ckpt #! 这两个可以直接设为None, 从from_pretrained中加载
36
+ scale_factor: 0.18215
37
+ guidance_scale: 5 # not used
38
+ ddim_sampling_steps: 20
39
+ text_cfg: 7.5
40
+ img_cfg: 1.2
41
+ hdr_cfg: 7.5
42
+ cond_image_dropout: 0.1
43
+ cond_text_dropout: 0.1
44
+ cond_hdr_dropout: 0.1
45
+ ic_condition: fg
46
+ hdr_train: True
47
+ prompt_type: edit_prompt
48
+ unet:
49
+ target: modules.video_unet_temporal.unet.UNet3DConditionModel
50
+ params:
51
+ in_channels: 4 #! change:8->12 iclight 改为12 注意一下...
52
+ out_channels: 4
53
+ act_fn: silu
54
+ attention_head_dim: 8
55
+ block_out_channels:
56
+ - 320
57
+ - 640
58
+ - 1280
59
+ - 1280
60
+ cross_attention_dim: 768
61
+ down_block_types:
62
+ - CrossAttnDownBlock3D
63
+ - CrossAttnDownBlock3D
64
+ - CrossAttnDownBlock3D
65
+ - DownBlock3D
66
+ up_block_types:
67
+ - UpBlock3D
68
+ - CrossAttnUpBlock3D
69
+ - CrossAttnUpBlock3D
70
+ - CrossAttnUpBlock3D
71
+ downsample_padding: 1
72
+ layers_per_block: 2
73
+ mid_block_scale_factor: 1
74
+ norm_eps: 1e-05
75
+ norm_num_groups: 32
76
+ sample_size: 64
77
+ use_motion_module: true #!!! 这边test iclight的时候可以不用motion module 即False
78
+ motion_module_resolutions:
79
+ - 1
80
+ - 2
81
+ - 4
82
+ - 8
83
+ motion_module_mid_block: false
84
+ motion_module_decoder_only: false
85
+ motion_module_type: Vanilla
86
+ motion_module_kwargs:
87
+ num_attention_heads: 8
88
+ num_transformer_block: 1
89
+ attention_block_types:
90
+ - Temporal_Self
91
+ - Temporal_Self
92
+ temporal_position_encoding: true
93
+ temporal_position_encoding_max_len: 32
94
+ temporal_attention_dim_div: 1
95
+ text_model:
96
+ target: modules.openclip.modules.FrozenCLIPEmbedder
97
+ params:
98
+ freeze: true
99
+ data:
100
+ batch_size: 1
101
+ val_batch_size: 1
102
+ train:
103
+ target: dataset.videoP2P.VideoPromptToPromptMotionAugPexelsHDR
104
+ params: # 注意修改一下training的路径,和相关加载的代码, 比如说没有meta.yaml这些参数怎么搞
105
+ root_dirs: #! 注意root_dirs已经更改
106
+ # - /mnt/petrelfs/fangye/test/instruct-video-to-video_1019/data_train_pexels/rmbg_data
107
+ - /mnt/hwfile/mllm/sunzeyi/iclight_video/rendered_data_rgb_fixlast
108
+ hdr_dir: /mnt/hwfile/mllm/sunzeyi/iclight_video/haven_hdr_rgb
109
+ num_frames: 16
110
+ zoom_ratio: 0.2
111
+ max_zoom: 1.25
112
+ translation_ratio: 0.7
113
+ translation_range: [0, 0.2]
114
+ is_train: True
115
+ ic_condition: fg
116
+ val:
117
+ target: dataset.videoP2P.VideoPromptToPromptMotionAugPexelsHDR
118
+ params:
119
+ root_dirs:
120
+ # - /mnt/petrelfs/fangye/test/instruct-video-to-video_1019/data_train_pexels/rmbg_data
121
+ - /mnt/hwfile/mllm/sunzeyi/iclight_video/rendered_data_rgb_fixlast
122
+ hdr_dir: /mnt/hwfile/mllm/sunzeyi/iclight_video/haven_hdr_rgb
123
+ num_frames: 16
124
+ zoom_ratio: 0.2
125
+ max_zoom: 1.25
126
+ translation_ratio: 0.7
127
+ translation_range: [0, 0.2]
128
+ ic_condition: fg
129
+ callbacks:
130
+ - target: pytorch_lightning.callbacks.ModelCheckpoint
131
+ params:
132
+ dirpath: "${expt_dir}/${expt_name}"
133
+ # filename: "{epoch:04d}"
134
+ filename: "{step:06d}"
135
+ every_n_train_steps: 1
136
+ save_last: false
137
+ # monitor: epoch
138
+ # mode: max
139
+ # save_top_k: 3
140
+ # save_last: false
141
+ - target: callbacks.instruct_p2p_video.InstructP2PLogger
142
+ params:
143
+ max_num_images: 1
144
+ expt_name: instruct_v2v_ic_pexels_text_hdr_test_lr0.5_aug_lossc_fix_bs1
145
+ # accumulate_grad_batches: 128
146
+ require_wandb: true
147
+ - target: pytorch_lightning.callbacks.DeviceStatsMonitor
configs/instruct_v2v_ic_pexels_text.yaml ADDED
@@ -0,0 +1,137 @@
1
+ expt_dir: experiments
2
+ expt_name: instruct_v2v_ic_pexels_text_fg #! 注意传入log里面, 不要每次修改
3
+ trainer_args:
4
+ max_epochs: 5
5
+ accelerator: "gpu"
6
+ devices: [0] #! change to get more cards
7
+ limit_train_batches: 2048
8
+ limit_val_batches: 1 #! 这边限制了每个epoch只跑多少个batch的validation
9
+ # strategy: "ddp"
10
+ strategy: "deepspeed_stage_2"
11
+ # autotune_only_on_rank_zero: true # 确保只有一个进程执行调优表操作
12
+ accumulate_grad_batches: 256 #! 注意一下这个值
13
+ check_val_every_n_epoch: 1 #! check一下这个值是不是和记录有关。。。
14
+ # precision: 16 # 启用半精度 (FP16)
15
+ diffusion:
16
+ target: pl_trainer.instruct_p2p_video.InstructP2PVideoTrainerTemporalText
17
+ params:
18
+ beta_schedule_args:
19
+ beta_schedule: scaled_linear
20
+ num_train_timesteps: 1000
21
+ beta_start: 0.00085
22
+ beta_end: 0.012
23
+ clip_sample: false
24
+ thresholding: false
25
+ prediction_type: epsilon
26
+ loss_fn: l2
27
+ optim_args:
28
+ lr: 1e-5
29
+ unet_init_weights: #! Note: training can just as well start from an iv2v checkpoint
30
+ - unet/diffusion_pytorch_model.safetensors # IC-Light UNet, safetensors
31
+ - pretrained_models/Motion_Module/mm_sd_v15.ckpt # motion module, presumably the AnimateDiff weights
32
+ - pretrained_models/iclight/iclight_sd15_fc.safetensors # iclight lora weights
33
+ base_path: /mnt/petrelfs/fangye/.cache/huggingface/hub/models--stablediffusionapi--realistic-vision-v51/snapshots/19e3643d7d963c156d01537188ec08f0b79a514a
34
+ # vae_init_weights: pretrained_models/instruct_pix2pix/vqvae.ckpt
35
+ # text_model_init_weights: pretrained_models/instruct_pix2pix/text.ckpt #! these two can simply be set to None and loaded via from_pretrained
36
+ scale_factor: 0.18215
37
+ guidance_scale: 5 # not used
38
+ ddim_sampling_steps: 20
39
+ text_cfg: 7.5
40
+ img_cfg: 1.2
41
+ cond_image_dropout: 0.1
42
+ cond_text_dropout: 0.075
43
+ ic_condition: fg
44
+ prompt_type: edit_prompt
45
+ unet:
46
+ target: modules.video_unet_temporal.unet.UNet3DConditionModel
47
+ params:
48
+ in_channels: 4 #! change: 8->12 (IC-Light uses 12); double-check this...
49
+ out_channels: 4
50
+ act_fn: silu
51
+ attention_head_dim: 8
52
+ block_out_channels:
53
+ - 320
54
+ - 640
55
+ - 1280
56
+ - 1280
57
+ cross_attention_dim: 768
58
+ down_block_types:
59
+ - CrossAttnDownBlock3D
60
+ - CrossAttnDownBlock3D
61
+ - CrossAttnDownBlock3D
62
+ - DownBlock3D
63
+ up_block_types:
64
+ - UpBlock3D
65
+ - CrossAttnUpBlock3D
66
+ - CrossAttnUpBlock3D
67
+ - CrossAttnUpBlock3D
68
+ downsample_padding: 1
69
+ layers_per_block: 2
70
+ mid_block_scale_factor: 1
71
+ norm_eps: 1e-05
72
+ norm_num_groups: 32
73
+ sample_size: 64
74
+ use_motion_module: true #!!! when only testing IC-Light, the motion module can be disabled (set to false)
75
+ motion_module_resolutions:
76
+ - 1
77
+ - 2
78
+ - 4
79
+ - 8
80
+ motion_module_mid_block: false
81
+ motion_module_decoder_only: false
82
+ motion_module_type: Vanilla
83
+ motion_module_kwargs:
84
+ num_attention_heads: 8
85
+ num_transformer_block: 1
86
+ attention_block_types:
87
+ - Temporal_Self
88
+ - Temporal_Self
89
+ temporal_position_encoding: true
90
+ temporal_position_encoding_max_len: 32
91
+ temporal_attention_dim_div: 1
92
+ text_model:
93
+ target: modules.openclip.modules.FrozenCLIPEmbedder
94
+ params:
95
+ freeze: true
96
+ data:
97
+ batch_size: 1
98
+ val_batch_size: 1
99
+ train:
100
+ target: dataset.videoP2P.VideoPromptToPromptMotionAugPexels
101
+ params: # Note: update the training data paths and the related loading code, e.g. how to handle missing files such as meta.yaml
102
+ root_dirs:
103
+ - /mnt/petrelfs/fangye/test/instruct-video-to-video_1019/data_train_pexels/rmbg_data
104
+ num_frames: 16
105
+ zoom_ratio: 0.2
106
+ max_zoom: 1.25
107
+ translation_ratio: 0.7
108
+ translation_range: [0, 0.2]
109
+ is_train: True
110
+ ic_condition: fg
111
+ val:
112
+ target: dataset.videoP2P.VideoPromptToPromptMotionAugPexels
113
+ params:
114
+ root_dirs:
115
+ - /mnt/petrelfs/fangye/test/instruct-video-to-video_1019/data_train_pexels/rmbg_data
116
+ num_frames: 16
117
+ zoom_ratio: 0.2
118
+ max_zoom: 1.25
119
+ translation_ratio: 0.7
120
+ translation_range: [0, 0.2]
121
+ ic_condition: fg
122
+ callbacks:
123
+ - target: pytorch_lightning.callbacks.ModelCheckpoint
124
+ params:
125
+ dirpath: "${expt_dir}/${expt_name}"
126
+ filename: "{epoch:04d}"
127
+ monitor: epoch
128
+ mode: max
129
+ save_top_k: 5
130
+ save_last: false
131
+ - target: callbacks.instruct_p2p_video.InstructP2PLogger
132
+ params:
133
+ max_num_images: 1
134
+ expt_name: instruct_v2v_ic_pexels_text_fg
135
+ # accumulate_grad_batches: 128
136
+ require_wandb: true
137
+ - target: pytorch_lightning.callbacks.DeviceStatsMonitor
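All of these configs share the same target/params layout. The following is a minimal, generic sketch of how such a node can be instantiated; the repo's own loader in the training entry point may differ in details.

import importlib
import yaml

def instantiate(node):
    # Import node["target"] and construct it with node["params"].
    module_name, cls_name = node["target"].rsplit(".", 1)
    cls = getattr(importlib.import_module(module_name), cls_name)
    return cls(**node.get("params", {}))

with open("configs/instruct_v2v_ic_pexels_text.yaml") as f:
    cfg = yaml.safe_load(f)

text_model = instantiate(cfg["text_model"])  # -> FrozenCLIPEmbedder(freeze=True)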
configs/instruct_v2v_ic_pexels_text_hdr.yaml ADDED
@@ -0,0 +1,137 @@
1
+ expt_dir: experiments
2
+ expt_name: instruct_v2v_ic_pexels_text_fg #! Note: this name goes into the logs; do not change it every run
3
+ trainer_args:
4
+ max_epochs: 5
5
+ accelerator: "gpu"
6
+ devices: [0] #! add more device indices to use more GPUs
7
+ limit_train_batches: 2048
8
+ limit_val_batches: 1 #! limits how many validation batches run per epoch
9
+ # strategy: "ddp"
10
+ strategy: "deepspeed_stage_2"
11
+ # autotune_only_on_rank_zero: true # make sure only one process runs the tuning-table step
12
+ accumulate_grad_batches: 256 #! pay attention to this value
13
+ check_val_every_n_epoch: 1 #! check whether this value affects logging...
14
+ # precision: 16 # enable half precision (FP16)
15
+ diffusion:
16
+ target: pl_trainer.instruct_p2p_video.InstructP2PVideoTrainerTemporalText
17
+ params:
18
+ beta_schedule_args:
19
+ beta_schedule: scaled_linear
20
+ num_train_timesteps: 1000
21
+ beta_start: 0.00085
22
+ beta_end: 0.012
23
+ clip_sample: false
24
+ thresholding: false
25
+ prediction_type: epsilon
26
+ loss_fn: l2
27
+ optim_args:
28
+ lr: 1e-5
29
+ unet_init_weights: #! Note: training can just as well start from an iv2v checkpoint
30
+ - unet/diffusion_pytorch_model.safetensors # IC-Light UNet, safetensors
31
+ - pretrained_models/Motion_Module/mm_sd_v15.ckpt # motion module, presumably the AnimateDiff weights
32
+ - pretrained_models/iclight/iclight_sd15_fc.safetensors # iclight lora weights
33
+ base_path: /mnt/petrelfs/fangye/.cache/huggingface/hub/models--stablediffusionapi--realistic-vision-v51/snapshots/19e3643d7d963c156d01537188ec08f0b79a514a
34
+ # vae_init_weights: pretrained_models/instruct_pix2pix/vqvae.ckpt
35
+ # text_model_init_weights: pretrained_models/instruct_pix2pix/text.ckpt #! these two can simply be set to None and loaded via from_pretrained
36
+ scale_factor: 0.18215
37
+ guidance_scale: 5 # not used
38
+ ddim_sampling_steps: 20
39
+ text_cfg: 7.5
40
+ img_cfg: 1.2
41
+ cond_image_dropout: 0.1
42
+ cond_text_dropout: 0.075
43
+ ic_condition: fg
44
+ prompt_type: edit_prompt
45
+ unet:
46
+ target: modules.video_unet_temporal.unet.UNet3DConditionModel
47
+ params:
48
+ in_channels: 4 #! change: 8->12 (IC-Light uses 12); double-check this...
49
+ out_channels: 4
50
+ act_fn: silu
51
+ attention_head_dim: 8
52
+ block_out_channels:
53
+ - 320
54
+ - 640
55
+ - 1280
56
+ - 1280
57
+ cross_attention_dim: 768
58
+ down_block_types:
59
+ - CrossAttnDownBlock3D
60
+ - CrossAttnDownBlock3D
61
+ - CrossAttnDownBlock3D
62
+ - DownBlock3D
63
+ up_block_types:
64
+ - UpBlock3D
65
+ - CrossAttnUpBlock3D
66
+ - CrossAttnUpBlock3D
67
+ - CrossAttnUpBlock3D
68
+ downsample_padding: 1
69
+ layers_per_block: 2
70
+ mid_block_scale_factor: 1
71
+ norm_eps: 1e-05
72
+ norm_num_groups: 32
73
+ sample_size: 64
74
+ use_motion_module: true #!!! when only testing IC-Light, the motion module can be disabled (set to false)
75
+ motion_module_resolutions:
76
+ - 1
77
+ - 2
78
+ - 4
79
+ - 8
80
+ motion_module_mid_block: false
81
+ motion_module_decoder_only: false
82
+ motion_module_type: Vanilla
83
+ motion_module_kwargs:
84
+ num_attention_heads: 8
85
+ num_transformer_block: 1
86
+ attention_block_types:
87
+ - Temporal_Self
88
+ - Temporal_Self
89
+ temporal_position_encoding: true
90
+ temporal_position_encoding_max_len: 32
91
+ temporal_attention_dim_div: 1
92
+ text_model:
93
+ target: modules.openclip.modules.FrozenCLIPEmbedder
94
+ params:
95
+ freeze: true
96
+ data:
97
+ batch_size: 1
98
+ val_batch_size: 1
99
+ train:
100
+ target: dataset.videoP2P.VideoPromptToPromptMotionAugPexels
101
+ params: # Note: update the training data paths and the related loading code, e.g. how to handle missing files such as meta.yaml
102
+ root_dirs:
103
+ - /mnt/petrelfs/fangye/test/instruct-video-to-video_1019/data_train_pexels/rmbg_data
104
+ num_frames: 16
105
+ zoom_ratio: 0.2
106
+ max_zoom: 1.25
107
+ translation_ratio: 0.7
108
+ translation_range: [0, 0.2]
109
+ is_train: True
110
+ ic_condition: fg
111
+ val:
112
+ target: dataset.videoP2P.VideoPromptToPromptMotionAugPexels
113
+ params:
114
+ root_dirs:
115
+ - /mnt/petrelfs/fangye/test/instruct-video-to-video_1019/data_train_pexels/rmbg_data
116
+ num_frames: 16
117
+ zoom_ratio: 0.2
118
+ max_zoom: 1.25
119
+ translation_ratio: 0.7
120
+ translation_range: [0, 0.2]
121
+ ic_condition: fg
122
+ callbacks:
123
+ - target: pytorch_lightning.callbacks.ModelCheckpoint
124
+ params:
125
+ dirpath: "${expt_dir}/${expt_name}"
126
+ filename: "{epoch:04d}"
127
+ monitor: epoch
128
+ mode: max
129
+ save_top_k: 5
130
+ save_last: false
131
+ - target: callbacks.instruct_p2p_video.InstructP2PLogger
132
+ params:
133
+ max_num_images: 1
134
+ expt_name: instruct_v2v_ic_pexels_text_fg
135
+ # accumulate_grad_batches: 128
136
+ require_wandb: true
137
+ - target: pytorch_lightning.callbacks.DeviceStatsMonitor
configs/instruct_v2v_ic_test.yaml ADDED
@@ -0,0 +1,132 @@
1
+ expt_dir: experiments
2
+ expt_name: instruct_v2v_ic
3
+ trainer_args:
4
+ max_epochs: 5
5
+ accelerator: "gpu"
6
+ devices: [0,1,2,3]
7
+ limit_train_batches: 2048
8
+ limit_val_batches: 1 #! limits how many validation batches run per epoch
9
+ # strategy: "ddp"
10
+ strategy: "deepspeed_stage_2"
11
+ # autotune_only_on_rank_zero: true # make sure only one process runs the tuning-table step
12
+ accumulate_grad_batches: 32 #! pay attention to this value
13
+ check_val_every_n_epoch: 1 #! check whether this value affects logging...
14
+ # precision: 16 # enable half precision (FP16)
15
+ diffusion:
16
+ target: pl_trainer.instruct_p2p_video.InstructP2PVideoTrainerTemporal
17
+ params:
18
+ beta_schedule_args:
19
+ beta_schedule: scaled_linear
20
+ num_train_timesteps: 1000
21
+ beta_start: 0.00085
22
+ beta_end: 0.012
23
+ clip_sample: false
24
+ thresholding: false
25
+ prediction_type: epsilon
26
+ loss_fn: l2
27
+ optim_args:
28
+ lr: 1e-5
29
+ unet_init_weights: #! Note: training can just as well start from an iv2v checkpoint
30
+ - unet/diffusion_pytorch_model.safetensors # IC-Light UNet, safetensors
31
+ - pretrained_models/Motion_Module/mm_sd_v15.ckpt # motion module, presumably the AnimateDiff weights
32
+ - pretrained_models/iclight/iclight_sd15_fbc.safetensors # iclight lora weights
33
+ base_path: /mnt/petrelfs/fangye/.cache/huggingface/hub/models--stablediffusionapi--realistic-vision-v51/snapshots/19e3643d7d963c156d01537188ec08f0b79a514a
34
+ # vae_init_weights: pretrained_models/instruct_pix2pix/vqvae.ckpt
35
+ # text_model_init_weights: pretrained_models/instruct_pix2pix/text.ckpt #! these two can simply be set to None and loaded via from_pretrained
36
+ scale_factor: 0.18215
37
+ guidance_scale: 5 # not used
38
+ ddim_sampling_steps: 20
39
+ text_cfg: 7.5
40
+ img_cfg: 1.2
41
+ cond_image_dropout: 0.1
42
+ prompt_type: edit_prompt
43
+ unet:
44
+ target: modules.video_unet_temporal.unet.UNet3DConditionModel
45
+ params:
46
+ in_channels: 4 #! change: 8->12 (IC-Light uses 12); double-check this...
47
+ out_channels: 4
48
+ act_fn: silu
49
+ attention_head_dim: 8
50
+ block_out_channels:
51
+ - 320
52
+ - 640
53
+ - 1280
54
+ - 1280
55
+ cross_attention_dim: 768
56
+ down_block_types:
57
+ - CrossAttnDownBlock3D
58
+ - CrossAttnDownBlock3D
59
+ - CrossAttnDownBlock3D
60
+ - DownBlock3D
61
+ up_block_types:
62
+ - UpBlock3D
63
+ - CrossAttnUpBlock3D
64
+ - CrossAttnUpBlock3D
65
+ - CrossAttnUpBlock3D
66
+ downsample_padding: 1
67
+ layers_per_block: 2
68
+ mid_block_scale_factor: 1
69
+ norm_eps: 1e-05
70
+ norm_num_groups: 32
71
+ sample_size: 64
72
+ use_motion_module: true #!!! when only testing IC-Light, the motion module can be disabled (set to false)
73
+ motion_module_resolutions:
74
+ - 1
75
+ - 2
76
+ - 4
77
+ - 8
78
+ motion_module_mid_block: false
79
+ motion_module_decoder_only: false
80
+ motion_module_type: Vanilla
81
+ motion_module_kwargs:
82
+ num_attention_heads: 8
83
+ num_transformer_block: 1
84
+ attention_block_types:
85
+ - Temporal_Self
86
+ - Temporal_Self
87
+ temporal_position_encoding: true
88
+ temporal_position_encoding_max_len: 32
89
+ temporal_attention_dim_div: 1
90
+ text_model:
91
+ target: modules.openclip.modules.FrozenCLIPEmbedder
92
+ params:
93
+ freeze: true
94
+ data:
95
+ batch_size: 1
96
+ val_batch_size: 1
97
+ train:
98
+ target: dataset.videoP2P.VideoPromptToPromptMotionAug
99
+ params: # Note: update the training data paths and the related loading code, e.g. how to handle missing files such as meta.yaml
100
+ root_dirs:
101
+ - /mnt/petrelfs/fangye/test/instruct-video-to-video_1019/data_train_v2
102
+ num_frames: 16
103
+ zoom_ratio: 0.2
104
+ max_zoom: 1.25
105
+ translation_ratio: 0.7
106
+ translation_range: [0, 0.2]
107
+ is_train: True
108
+ val:
109
+ target: dataset.videoP2P.VideoPromptToPromptMotionAug
110
+ params:
111
+ root_dirs:
112
+ - data_train
113
+ num_frames: 16
114
+ zoom_ratio: 0.2
115
+ max_zoom: 1.25
116
+ translation_ratio: 0.7
117
+ translation_range: [0, 0.2]
118
+ callbacks:
119
+ - target: pytorch_lightning.callbacks.ModelCheckpoint
120
+ params:
121
+ dirpath: "${expt_dir}/${expt_name}"
122
+ filename: "{epoch:04d}"
123
+ monitor: epoch
124
+ mode: max
125
+ save_top_k: 5
126
+ save_last: true
127
+ - target: callbacks.instruct_p2p_video.InstructP2PLogger
128
+ params:
129
+ max_num_images: 1
130
+ # accumulate_grad_batches: 128
131
+ require_wandb: true
132
+ - target: pytorch_lightning.callbacks.DeviceStatsMonitor
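The beta_schedule_args shared by these configs describe a scaled_linear schedule. A small sketch of that schedule under the diffusers convention (the trainer builds it internally, so this is for reference only):

import torch

def scaled_linear_betas(beta_start=0.00085, beta_end=0.012, num_train_timesteps=1000):
    # "scaled_linear" is linear in sqrt(beta), then squared.
    betas = torch.linspace(beta_start**0.5, beta_end**0.5, num_train_timesteps) ** 2
    alphas_cumprod = torch.cumprod(1.0 - betas, dim=0)  # used when noising x_0 into x_t
    return betas, alphas_cumprod

betas, alphas_cumprod = scaled_linear_betas()
print(betas[0].item(), betas[-1].item())  # ~0.00085 ... ~0.012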
configs/instruct_v2v_inference.yaml ADDED
@@ -0,0 +1,98 @@
1
+ diffusion:
2
+ target: pl_trainer.instruct_p2p_video.InstructP2PVideoTrainerTemporal
3
+ params:
4
+ beta_schedule_args:
5
+ beta_schedule: scaled_linear
6
+ num_train_timesteps: 1000
7
+ beta_start: 0.00085
8
+ beta_end: 0.012
9
+ clip_sample: false
10
+ thresholding: false
11
+ prediction_type: epsilon
12
+ loss_fn: l2
13
+ unet_init_weights:
14
+ - pretrained_models/instruct_pix2pix/diffusion_pytorch_model.bin
15
+ - pretrained_models/Motion_Module/mm_sd_v15.ckpt
16
+ vae_init_weights: pretrained_models/instruct_pix2pix/vqvae.ckpt
17
+ text_model_init_weights: pretrained_models/instruct_pix2pix/text.ckpt
18
+ optim_args:
19
+ lr: 1e-5
20
+ scale_factor: 0.18215
21
+ guidance_scale: 5 # not used
22
+ ddim_sampling_steps: 20
23
+ text_cfg: 7.5
24
+ img_cfg: 1.2
25
+ cond_image_dropout: 0.1
26
+ prompt_type: edit_prompt
27
+ unet:
28
+ target: modules.video_unet_temporal.unet.UNet3DConditionModel
29
+ params:
30
+ in_channels: 8
31
+ out_channels: 4
32
+ act_fn: silu
33
+ attention_head_dim: 8
34
+ block_out_channels:
35
+ - 320
36
+ - 640
37
+ - 1280
38
+ - 1280
39
+ cross_attention_dim: 768
40
+ down_block_types:
41
+ - CrossAttnDownBlock3D
42
+ - CrossAttnDownBlock3D
43
+ - CrossAttnDownBlock3D
44
+ - DownBlock3D
45
+ up_block_types:
46
+ - UpBlock3D
47
+ - CrossAttnUpBlock3D
48
+ - CrossAttnUpBlock3D
49
+ - CrossAttnUpBlock3D
50
+ downsample_padding: 1
51
+ layers_per_block: 2
52
+ mid_block_scale_factor: 1
53
+ norm_eps: 1e-05
54
+ norm_num_groups: 32
55
+ sample_size: 64
56
+ use_motion_module: true
57
+ motion_module_resolutions:
58
+ - 1
59
+ - 2
60
+ - 4
61
+ - 8
62
+ motion_module_mid_block: false
63
+ motion_module_decoder_only: false
64
+ motion_module_type: Vanilla
65
+ motion_module_kwargs:
66
+ num_attention_heads: 8
67
+ num_transformer_block: 1
68
+ attention_block_types:
69
+ - Temporal_Self
70
+ - Temporal_Self
71
+ temporal_position_encoding: true
72
+ temporal_position_encoding_max_len: 32
73
+ temporal_attention_dim_div: 1
74
+ vae:
75
+ target: modules.kl_autoencoder.autoencoder.AutoencoderKL
76
+ params:
77
+ embed_dim: 4
78
+ ddconfig:
79
+ double_z: true
80
+ z_channels: 4
81
+ resolution: 256
82
+ in_channels: 3
83
+ out_ch: 3
84
+ ch: 128
85
+ ch_mult:
86
+ - 1
87
+ - 2
88
+ - 4
89
+ - 4
90
+ num_res_blocks: 2
91
+ attn_resolutions: []
92
+ dropout: 0.0
93
+ lossconfig:
94
+ target: torch.nn.Identity
95
+ text_model:
96
+ target: modules.openclip.modules.FrozenCLIPEmbedder
97
+ params:
98
+ freeze: true
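The text_cfg and img_cfg entries above suggest InstructPix2Pix-style two-way classifier-free guidance at sampling time. A sketch of that combination (the trainer's actual sampling code may organize this differently):

def combine_guidance(eps_uncond, eps_img, eps_full, text_cfg=7.5, img_cfg=1.2):
    # eps_uncond: prediction with neither image nor text condition
    # eps_img:    prediction with the conditioning image only
    # eps_full:   prediction with both image and text conditions
    return (eps_uncond
            + img_cfg * (eps_img - eps_uncond)
            + text_cfg * (eps_full - eps_img))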
configs/instruct_v2v_ori.yaml ADDED
@@ -0,0 +1,147 @@
1
+ expt_dir: experiments
2
+ expt_name: instruct_v2v
3
+ trainer_args:
4
+ max_epochs: 10
5
+ accelerator: "gpu"
6
+ devices: [0]
7
+ limit_train_batches: 2048
8
+ limit_val_batches: 1
9
+ # strategy: "ddp"
10
+ strategy: "deepspeed_stage_2"
11
+ accumulate_grad_batches: 256
12
+ check_val_every_n_epoch: 5
13
+ diffusion:
14
+ target: pl_trainer.instruct_p2p_video.InstructP2PVideoTrainerTemporal
15
+ params:
16
+ beta_schedule_args:
17
+ beta_schedule: scaled_linear
18
+ num_train_timesteps: 1000
19
+ beta_start: 0.00085
20
+ beta_end: 0.012
21
+ clip_sample: false
22
+ thresholding: false
23
+ prediction_type: epsilon
24
+ loss_fn: l2
25
+ optim_args:
26
+ lr: 1e-5
27
+ unet_init_weights:
28
+ - pretrained_models/instruct_pix2pix/diffusion_pytorch_model.bin
29
+ - pretrained_models/Motion_Module/mm_sd_v15.ckpt
30
+ vae_init_weights: pretrained_models/instruct_pix2pix/vqvae.ckpt
31
+ text_model_init_weights: pretrained_models/instruct_pix2pix/text.ckpt
32
+ scale_factor: 0.18215
33
+ guidance_scale: 5 # not used
34
+ ddim_sampling_steps: 20
35
+ text_cfg: 7.5
36
+ img_cfg: 1.2
37
+ cond_image_dropout: 0.1
38
+ prompt_type: edit_prompt
39
+ unet:
40
+ target: modules.video_unet_temporal.unet.UNet3DConditionModel
41
+ params:
42
+ in_channels: 8
43
+ out_channels: 4
44
+ act_fn: silu
45
+ attention_head_dim: 8
46
+ block_out_channels:
47
+ - 320
48
+ - 640
49
+ - 1280
50
+ - 1280
51
+ cross_attention_dim: 768
52
+ down_block_types:
53
+ - CrossAttnDownBlock3D
54
+ - CrossAttnDownBlock3D
55
+ - CrossAttnDownBlock3D
56
+ - DownBlock3D
57
+ up_block_types:
58
+ - UpBlock3D
59
+ - CrossAttnUpBlock3D
60
+ - CrossAttnUpBlock3D
61
+ - CrossAttnUpBlock3D
62
+ downsample_padding: 1
63
+ layers_per_block: 2
64
+ mid_block_scale_factor: 1
65
+ norm_eps: 1e-05
66
+ norm_num_groups: 32
67
+ sample_size: 64
68
+ use_motion_module: true
69
+ motion_module_resolutions:
70
+ - 1
71
+ - 2
72
+ - 4
73
+ - 8
74
+ motion_module_mid_block: false
75
+ motion_module_decoder_only: false
76
+ motion_module_type: Vanilla
77
+ motion_module_kwargs:
78
+ num_attention_heads: 8
79
+ num_transformer_block: 1
80
+ attention_block_types:
81
+ - Temporal_Self
82
+ - Temporal_Self
83
+ temporal_position_encoding: true
84
+ temporal_position_encoding_max_len: 32
85
+ temporal_attention_dim_div: 1
86
+ vae:
87
+ target: modules.kl_autoencoder.autoencoder.AutoencoderKL
88
+ params:
89
+ embed_dim: 4
90
+ ddconfig:
91
+ double_z: true
92
+ z_channels: 4
93
+ resolution: 256
94
+ in_channels: 3
95
+ out_ch: 3
96
+ ch: 128
97
+ ch_mult:
98
+ - 1
99
+ - 2
100
+ - 4
101
+ - 4
102
+ num_res_blocks: 2
103
+ attn_resolutions: []
104
+ dropout: 0.0
105
+ lossconfig:
106
+ target: torch.nn.Identity
107
+ text_model:
108
+ target: modules.openclip.modules.FrozenCLIPEmbedder
109
+ params:
110
+ freeze: true
111
+ data:
112
+ batch_size: 1
113
+ val_batch_size: 1
114
+ train:
115
+ target: dataset.videoP2P.VideoPromptToPromptMotionAug
116
+ params:
117
+ root_dirs:
118
+ - video_ptp/raw_generated
119
+ - video_ptp/raw_generated_webvid
120
+ num_frames: 16
121
+ zoom_ratio: 0.2
122
+ max_zoom: 1.25
123
+ translation_ratio: 0.7
124
+ translation_range: [0, 0.2]
125
+ val:
126
+ target: dataset.videoP2P.VideoPromptToPromptMotionAug
127
+ params:
128
+ root_dirs:
129
+ - video_ptp/raw_generated
130
+ num_frames: 16
131
+ zoom_ratio: 0.2
132
+ max_zoom: 1.25
133
+ translation_ratio: 0.7
134
+ translation_range: [0, 0.2]
135
+ callbacks:
136
+ - target: pytorch_lightning.callbacks.ModelCheckpoint
137
+ params:
138
+ dirpath: "${expt_dir}/${expt_name}"
139
+ filename: "{epoch:04d}"
140
+ monitor: epoch
141
+ mode: max
142
+ save_top_k: 5
143
+ save_last: true
144
+ - target: callbacks.instruct_p2p_video.InstructP2PLogger
145
+ params:
146
+ max_num_images: 1
147
+ require_wandb: true
configs/test_textmodel.yaml ADDED
@@ -0,0 +1,7 @@
1
+ diffusion:
2
+ params:
3
+ base_path: /home/fy/.cache/huggingface/hub/models--stablediffusionapi--realistic-vision-v51/snapshots/19e3643d7d963c156d01537188ec08f0b79a514a
4
+ text_model:
5
+ target: modules.openclip.modules.FrozenCLIPEmbedder
6
+ params:
7
+ freeze: true
configs/test_vae.yaml ADDED
@@ -0,0 +1,21 @@
1
+ vae:
2
+ target: modules.kl_autoencoder.autoencoder.AutoencoderKL
3
+ params:
4
+ embed_dim: 4
5
+ ddconfig:
6
+ double_z: true
7
+ z_channels: 4
8
+ resolution: 256 # 256 for now
9
+ in_channels: 3
10
+ out_ch: 3
11
+ ch: 128
12
+ ch_mult:
13
+ - 1
14
+ - 2
15
+ - 4
16
+ - 4
17
+ num_res_blocks: 2
18
+ attn_resolutions: []
19
+ dropout: 0.0
20
+ lossconfig:
21
+ target: torch.nn.Identity
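The scale_factor of 0.18215 used by the diffusion configs pairs with this AutoencoderKL. A rough sketch of latent encoding/decoding, assuming an LDM-style interface where encode() returns a Gaussian posterior; the real method names live in modules.kl_autoencoder:

import torch

@torch.no_grad()
def encode_frames(vae, frames, scale_factor=0.18215):
    # frames: (B*T, 3, H, W) in [-1, 1] -> latents: (B*T, 4, H/8, W/8)
    posterior = vae.encode(frames)          # assumed to return a distribution
    return posterior.sample() * scale_factor

@torch.no_grad()
def decode_latents(vae, latents, scale_factor=0.18215):
    return vae.decode(latents / scale_factor)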
configs/test_vae_ori.yaml ADDED
@@ -0,0 +1,28 @@
1
+ diffusion:
2
+ params:
3
+ base_path: /home/fy/.cache/huggingface/hub/models--stablediffusionapi--realistic-vision-v51/snapshots/19e3643d7d963c156d01537188ec08f0b79a514a
4
+ text_model:
5
+ target: modules.openclip.modules.FrozenCLIPEmbedder
6
+ params:
7
+ freeze: true
8
+ vae:
9
+ target: modules.kl_autoencoder.autoencoder.AutoencoderKL
10
+ params:
11
+ embed_dim: 4
12
+ ddconfig:
13
+ double_z: true
14
+ z_channels: 4
15
+ resolution: 256 # 256 for now
16
+ in_channels: 3
17
+ out_ch: 3
18
+ ch: 128
19
+ ch_mult:
20
+ - 1
21
+ - 2
22
+ - 4
23
+ - 4
24
+ num_res_blocks: 2
25
+ attn_resolutions: []
26
+ dropout: 0.0
27
+ lossconfig:
28
+ target: torch.nn.Identity
configs/tmp_ic.yaml ADDED
@@ -0,0 +1,130 @@
1
+ expt_dir: experiments
2
+ expt_name: instruct_v2v_ic
3
+ trainer_args:
4
+ max_epochs: 10
5
+ accelerator: "gpu"
6
+ devices: [0]
7
+ limit_train_batches: 2048
8
+ limit_val_batches: 5 #! limits how many validation batches run per epoch
9
+ # strategy: "ddp"
10
+ strategy: "deepspeed_stage_2"
11
+ accumulate_grad_batches: 128 #! pay attention to this value
12
+ check_val_every_n_epoch: 1 #! check whether this value affects logging...
13
+ diffusion:
14
+ target: pl_trainer.instruct_p2p_video.InstructP2PVideoTrainerTemporal
15
+ params:
16
+ beta_schedule_args:
17
+ beta_schedule: scaled_linear
18
+ num_train_timesteps: 1000
19
+ beta_start: 0.00085
20
+ beta_end: 0.012
21
+ clip_sample: false
22
+ thresholding: false
23
+ prediction_type: epsilon
24
+ loss_fn: l2
25
+ optim_args:
26
+ lr: 1e-5
27
+ unet_init_weights: #! Note: training can just as well start from an iv2v checkpoint
28
+ - unet/diffusion_pytorch_model.safetensors # IC-Light UNet, safetensors
29
+ - pretrained_models/Motion_Module/mm_sd_v15.ckpt # motion module, presumably the AnimateDiff weights
30
+ - /mnt/petrelfs/fangye/IC-Light/models/iclight_sd15_fbc.safetensors # iclight lora weights
31
+ base_path: /mnt/petrelfs/fangye/.cache/huggingface/hub/models--stablediffusionapi--realistic-vision-v51/snapshots/19e3643d7d963c156d01537188ec08f0b79a514a
32
+ # vae_init_weights: pretrained_models/instruct_pix2pix/vqvae.ckpt
33
+ # text_model_init_weights: pretrained_models/instruct_pix2pix/text.ckpt #! these two can simply be set to None and loaded via from_pretrained
34
+ scale_factor: 0.18215
35
+ guidance_scale: 5 # not used
36
+ ddim_sampling_steps: 20
37
+ text_cfg: 7.5
38
+ img_cfg: 1.2
39
+ cond_image_dropout: 0.1
40
+ prompt_type: edit_prompt
41
+ unet:
42
+ target: modules.video_unet_temporal.unet.UNet3DConditionModel
43
+ params:
44
+ in_channels: 4 #! change: 8->12 (IC-Light uses 12); double-check this...
45
+ out_channels: 4
46
+ act_fn: silu
47
+ attention_head_dim: 8
48
+ block_out_channels:
49
+ - 320
50
+ - 640
51
+ - 1280
52
+ - 1280
53
+ cross_attention_dim: 768
54
+ down_block_types:
55
+ - CrossAttnDownBlock3D
56
+ - CrossAttnDownBlock3D
57
+ - CrossAttnDownBlock3D
58
+ - DownBlock3D
59
+ up_block_types:
60
+ - UpBlock3D
61
+ - CrossAttnUpBlock3D
62
+ - CrossAttnUpBlock3D
63
+ - CrossAttnUpBlock3D
64
+ downsample_padding: 1
65
+ layers_per_block: 2
66
+ mid_block_scale_factor: 1
67
+ norm_eps: 1e-05
68
+ norm_num_groups: 32
69
+ sample_size: 64
70
+ use_motion_module: true #! when only testing IC-Light, the motion module can be disabled (set to false)
71
+ motion_module_resolutions:
72
+ - 1
73
+ - 2
74
+ - 4
75
+ - 8
76
+ motion_module_mid_block: false
77
+ motion_module_decoder_only: false
78
+ motion_module_type: Vanilla
79
+ motion_module_kwargs:
80
+ num_attention_heads: 8
81
+ num_transformer_block: 1
82
+ attention_block_types:
83
+ - Temporal_Self
84
+ - Temporal_Self
85
+ temporal_position_encoding: true
86
+ temporal_position_encoding_max_len: 32
87
+ temporal_attention_dim_div: 1
88
+ text_model:
89
+ target: modules.openclip.modules.FrozenCLIPEmbedder
90
+ params:
91
+ freeze: true
92
+ data:
93
+ batch_size: 1
94
+ val_batch_size: 1
95
+ train:
96
+ target: dataset.videoP2P.VideoPromptToPromptMotionAug
97
+ params: # Note: update the training data paths and the related loading code, e.g. how to handle missing files such as meta.yaml
98
+ root_dirs:
99
+ - /home/fy/Code/instruct-video-to-video/data_train/Girl
100
+ num_frames: 16
101
+ zoom_ratio: 0.2
102
+ max_zoom: 1.25
103
+ translation_ratio: 0.7
104
+ translation_range: [0, 0.2]
105
+ is_train: True
106
+ val:
107
+ target: dataset.videoP2P.VideoPromptToPromptMotionAug
108
+ params:
109
+ root_dirs:
110
+ - /home/fy/Code/instruct-video-to-video/data_train/Girl
111
+ num_frames: 16
112
+ zoom_ratio: 0.2
113
+ max_zoom: 1.25
114
+ translation_ratio: 0.7
115
+ translation_range: [0, 0.2]
116
+ callbacks:
117
+ - target: pytorch_lightning.callbacks.ModelCheckpoint
118
+ params:
119
+ dirpath: "${expt_dir}/${expt_name}"
120
+ filename: "{epoch:04d}"
121
+ monitor: epoch
122
+ mode: max
123
+ save_top_k: 5
124
+ save_last: true
125
+ - target: callbacks.instruct_p2p_video.InstructP2PLogger
126
+ params:
127
+ max_num_images: 1
128
+ # accumulate_grad_batches: 128
129
+ require_wandb: true
130
+ - target: pytorch_lightning.callbacks.DeviceStatsMonitor
db_examples.py ADDED
@@ -0,0 +1,133 @@
1
+
2
+ bg_samples = [
3
+ 'demo/clean_bg_extracted/22/frames/0000.png',
4
+ 'demo/clean_bg_extracted/23/frames/0000.png',
5
+ 'demo/clean_bg_extracted/27/frames/0000.png',
6
+ 'demo/clean_bg_extracted/33/frames/0000.png',
7
+ 'demo/clean_bg_extracted/47/frames/0000.png',
8
+ 'demo/clean_bg_extracted/39/frames/0000.png',
9
+ 'demo/clean_bg_extracted/59/frames/0000.png',
10
+ 'demo/clean_bg_extracted/55/frames/0000.png',
11
+ 'demo/clean_bg_extracted/58/frames/0000.png',
12
+ 'demo/clean_bg_extracted/57/frames/0000.png', #42
13
+ 'demo/clean_bg_extracted/8/frames/0000.png',
14
+ 'demo/clean_bg_extracted/9/frames/0000.png',
15
+ 'demo/clean_bg_extracted/10/frames/0000.png',
16
+ 'demo/clean_bg_extracted/14/frames/0000.png',
17
+ 'demo/clean_bg_extracted/62/frames/0000.png'
18
+ ] # prepare roughly 15 background videos
19
+
20
+
21
+ background_conditioned_examples = [
22
+ [
23
+ "demo/clean_fg_extracted/14/cropped_video.mp4",
24
+ "demo/clean_bg_extracted/22/cropped_video.mp4",
25
+ "beautiful woman, cinematic lighting",
26
+ "Use Background Video",
27
+ 512,
28
+ 768,
29
+ 12345,
30
+ "static_fg_sync_bg_visualization_fy/14_22_100fps.mp4",
31
+ ],
32
+ [
33
+ "demo/clean_fg_extracted/14/cropped_video.mp4",
34
+ "demo/clean_bg_extracted/55/cropped_video.mp4",
35
+ "beautiful woman, cinematic lighting",
36
+ "Use Background Video",
37
+ 512,
38
+ 768,
39
+ 12345,
40
+ "static_fg_sync_bg_visualization_fy/14_55_100fps.mp4",
41
+ ],
42
+ [
43
+ "demo/clean_fg_extracted/15/cropped_video.mp4",
44
+ "demo/clean_bg_extracted/27/cropped_video.mp4",
45
+ "beautiful woman, cinematic lighting",
46
+ "Use Background Video",
47
+ 512,
48
+ 768,
49
+ 12345,
50
+ "static_fg_sync_bg_visualization_fy/15_27_100fps.mp4",
51
+ ],
52
+ [
53
+ "demo/clean_fg_extracted/18/cropped_video.mp4",
54
+ "demo/clean_bg_extracted/23/cropped_video.mp4",
55
+ "beautiful woman, cinematic lighting",
56
+ "Use Background Video",
57
+ 512,
58
+ 768,
59
+ 12345,
60
+ "static_fg_sync_bg_visualization_fy/18_23_100fps.mp4",
61
+ ],
62
+ # [
63
+ # "demo/clean_fg_extracted/18/cropped_video.mp4",
64
+ # "demo/clean_bg_extracted/33/cropped_video.mp4",
65
+ # "beautiful woman, cinematic lighting",
66
+ # "Use Background Video",
67
+ # 512,
68
+ # 768,
69
+ # 12345,
70
+ # "static_fg_sync_bg_visualization_fy/18_33_100fps.mp4",
71
+ # ],
72
+ [
73
+ "demo/clean_fg_extracted/22/cropped_video.mp4",
74
+ "demo/clean_bg_extracted/39/cropped_video.mp4",
75
+ "beautiful woman, cinematic lighting",
76
+ "Use Background Video",
77
+ 512,
78
+ 768,
79
+ 12345,
80
+ "static_fg_sync_bg_visualization_fy/22_39_100fps.mp4",
81
+ ],
82
+ # [
83
+ # "demo/clean_fg_extracted/22/cropped_video.mp4",
84
+ # "demo/clean_bg_extracted/59/cropped_video.mp4",
85
+ # "beautiful woman, cinematic lighting",
86
+ # "Use Background Video",
87
+ # 512,
88
+ # 768,
89
+ # 12345,
90
+ # "static_fg_sync_bg_visualization_fy/22_59_100fps.mp4",
91
+ # ],
92
+ [
93
+ "demo/clean_fg_extracted/9/cropped_video.mp4",
94
+ "demo/clean_bg_extracted/8/cropped_video.mp4",
95
+ "beautiful woman, cinematic lighting",
96
+ "Use Background Video",
97
+ 512,
98
+ 768,
99
+ 12345,
100
+ "static_fg_sync_bg_visualization_fy/9_8_100fps.mp4",
101
+ ],
102
+ [
103
+ "demo/clean_fg_extracted/9/cropped_video.mp4",
104
+ "demo/clean_bg_extracted/9/cropped_video.mp4",
105
+ "beautiful woman, cinematic lighting",
106
+ "Use Background Video",
107
+ 512,
108
+ 768,
109
+ 12345,
110
+ "static_fg_sync_bg_visualization_fy/9_9_100fps.mp4",
111
+ ],
112
+ [
113
+ "demo/clean_fg_extracted/9/cropped_video.mp4",
114
+ "demo/clean_bg_extracted/10/cropped_video.mp4",
115
+ "beautiful woman, cinematic lighting",
116
+ "Use Background Video",
117
+ 512,
118
+ 768,
119
+ 12345,
120
+ "static_fg_sync_bg_visualization_fy/9_10_100fps.mp4",
121
+ ],
122
+ # [
123
+ # "demo/clean_fg_extracted/9/cropped_video.mp4",
124
+ # "demo/clean_bg_extracted/14/cropped_video.mp4",
125
+ # "beautiful woman, cinematic lighting",
126
+ # "Use Background Video",
127
+ # 512,
128
+ # 768,
129
+ # 12345,
130
+ # "static_fg_sync_bg_visualization_fy/9_14_100fps.mp4",
131
+ # ],
132
+
133
+ ]
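A hypothetical sketch of how background_conditioned_examples could be wired into the Gradio demo in app.py; the component names below are placeholders, not necessarily the app's real ones:

import gradio as gr
import db_examples

with gr.Blocks() as demo:
    fg_video = gr.Video(label="Foreground video")
    bg_video = gr.Video(label="Background video")
    prompt = gr.Textbox(label="Prompt")
    bg_source = gr.Radio(["Use Background Video"], label="Background source")
    width = gr.Number(label="Width")
    height = gr.Number(label="Height")
    seed = gr.Number(label="Seed")
    result = gr.Video(label="Result")
    # Each row: [fg video, bg video, prompt, mode, width, height, seed, result video]
    gr.Examples(
        examples=db_examples.background_conditioned_examples,
        inputs=[fg_video, bg_video, prompt, bg_source, width, height, seed, result],
    )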
demo/clean_bg_extracted/10/cropped_video.mp4 ADDED
Binary file (220 kB).
 
demo/clean_bg_extracted/10/frames/0000.png ADDED
demo/clean_bg_extracted/14/cropped_video.mp4 ADDED
Binary file (138 kB).
 
demo/clean_bg_extracted/14/frames/0000.png ADDED
demo/clean_bg_extracted/22/cropped_video.mp4 ADDED
Binary file (187 kB).
 
demo/clean_bg_extracted/22/frames/0000.png ADDED
demo/clean_bg_extracted/23/cropped_video.mp4 ADDED
Binary file (70.2 kB).
 
demo/clean_bg_extracted/23/frames/0000.png ADDED
demo/clean_bg_extracted/27/cropped_video.mp4 ADDED
Binary file (123 kB).
 
demo/clean_bg_extracted/27/frames/0000.png ADDED
demo/clean_bg_extracted/33/cropped_video.mp4 ADDED
Binary file (136 kB).
 
demo/clean_bg_extracted/33/frames/0000.png ADDED
demo/clean_bg_extracted/39/cropped_video.mp4 ADDED
Binary file (118 kB).
 
demo/clean_bg_extracted/39/frames/0000.png ADDED
demo/clean_bg_extracted/47/frames/0000.png ADDED
demo/clean_bg_extracted/55/cropped_video.mp4 ADDED
Binary file (78.4 kB).
 
demo/clean_bg_extracted/55/frames/0000.png ADDED
demo/clean_bg_extracted/57/frames/0000.png ADDED
demo/clean_bg_extracted/58/frames/0000.png ADDED
demo/clean_bg_extracted/59/cropped_video.mp4 ADDED
Binary file (227 kB).
 
demo/clean_bg_extracted/59/frames/0000.png ADDED
demo/clean_bg_extracted/62/frames/0000.png ADDED
demo/clean_bg_extracted/8/cropped_video.mp4 ADDED
Binary file (52.5 kB).
 
demo/clean_bg_extracted/8/frames/0000.png ADDED
demo/clean_bg_extracted/9/cropped_video.mp4 ADDED
Binary file (165 kB).
 
demo/clean_bg_extracted/9/frames/0000.png ADDED