paulpanwang commited on
Commit
476e0f0
·
verified ·
1 Parent(s): f00d8af

Upload folder using huggingface_hub

Browse files
This view is limited to 50 files because it contains too many changes.   See raw diff
Files changed (50) hide show
  1. .gitattributes +100 -0
  2. .gitignore +11 -0
  3. LICENSE +21 -0
  4. README.md +302 -9
  5. app.py +165 -0
  6. assets/_demo/1.gif +3 -0
  7. assets/_demo/10.gif +3 -0
  8. assets/_demo/2.gif +3 -0
  9. assets/_demo/3.gif +3 -0
  10. assets/_demo/4.gif +3 -0
  11. assets/_demo/5.gif +3 -0
  12. assets/_demo/6.gif +3 -0
  13. assets/_demo/7.gif +3 -0
  14. assets/_demo/8.gif +3 -0
  15. assets/_demo/9.gif +3 -0
  16. assets/_demo/a_frog/pas.gif +3 -0
  17. assets/_demo/a_frog/sd15.gif +3 -0
  18. assets/_demo/a_frog/sd35m.gif +3 -0
  19. assets/_demo/a_frog_elevest/pas.gif +3 -0
  20. assets/_demo/a_frog_elevest/sd15.gif +3 -0
  21. assets/_demo/a_frog_elevest/sd35m.gif +3 -0
  22. assets/_demo/a_frog_empty/pas.gif +3 -0
  23. assets/_demo/a_frog_empty/sd15.gif +3 -0
  24. assets/_demo/a_frog_empty/sd35m.gif +3 -0
  25. assets/_demo/a_toy_robot/pas.gif +3 -0
  26. assets/_demo/a_toy_robot/sd15.gif +3 -0
  27. assets/_demo/a_toy_robot/sd35m.gif +3 -0
  28. assets/_demo/controlnet/book.gif +3 -0
  29. assets/_demo/controlnet/cookie.gif +3 -0
  30. assets/_demo/controlnet/iron_robot.gif +3 -0
  31. assets/_demo/controlnet/panda.gif +3 -0
  32. assets/_demo/controlnet/plush_dog_toy.gif +3 -0
  33. assets/_demo/controlnet/teddy_bear.gif +3 -0
  34. assets/_demo/overview.png +3 -0
  35. assets/crm/3D/345/215/241/351/200/232/347/213/227.webp +0 -0
  36. assets/crm/astronaut.webp +3 -0
  37. assets/crm/bulldog.webp +0 -0
  38. assets/crm/ghost-eating-burger.webp +0 -0
  39. assets/crm/kunkun.webp +0 -0
  40. assets/crm//344/270/207/345/234/243/345/215/227/347/223/234.webp +0 -0
  41. assets/crm//344/272/272/347/211/251/351/252/221/351/251/254.webp +0 -0
  42. assets/crm//345/210/235/351/237/263/346/234/252/346/235/245/347/216/251/345/201/266.webp +0 -0
  43. assets/crm//345/215/241/351/200/232/346/201/220/351/276/231.webp +0 -0
  44. assets/crm//345/215/241/351/200/232/346/211/213/346/236/252/346/210/252/345/233/276.webp +0 -0
  45. assets/crm//345/215/241/351/200/232/347/214/253.webp +0 -0
  46. assets/crm//345/215/241/351/200/232/350/230/221/350/217/207/345/245/227/350/243/205.webp +0 -0
  47. assets/crm//345/217/257/347/210/261/347/216/204/347/255/226.webp +0 -0
  48. assets/crm/大头泡泡马特.webp +3 -0
  49. assets/crm//345/275/251/350/211/262/350/230/221/350/217/207.webp +0 -0
  50. assets/crm//345/275/251/350/211/262/350/230/221/350/217/2072.webp +0 -0
.gitattributes CHANGED
@@ -33,3 +33,103 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
33
  *.zip filter=lfs diff=lfs merge=lfs -text
34
  *.zst filter=lfs diff=lfs merge=lfs -text
35
  *tfevents* filter=lfs diff=lfs merge=lfs -text
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
33
  *.zip filter=lfs diff=lfs merge=lfs -text
34
  *.zst filter=lfs diff=lfs merge=lfs -text
35
  *tfevents* filter=lfs diff=lfs merge=lfs -text
36
+ assets/_demo/1.gif filter=lfs diff=lfs merge=lfs -text
37
+ assets/_demo/10.gif filter=lfs diff=lfs merge=lfs -text
38
+ assets/_demo/2.gif filter=lfs diff=lfs merge=lfs -text
39
+ assets/_demo/3.gif filter=lfs diff=lfs merge=lfs -text
40
+ assets/_demo/4.gif filter=lfs diff=lfs merge=lfs -text
41
+ assets/_demo/5.gif filter=lfs diff=lfs merge=lfs -text
42
+ assets/_demo/6.gif filter=lfs diff=lfs merge=lfs -text
43
+ assets/_demo/7.gif filter=lfs diff=lfs merge=lfs -text
44
+ assets/_demo/8.gif filter=lfs diff=lfs merge=lfs -text
45
+ assets/_demo/9.gif filter=lfs diff=lfs merge=lfs -text
46
+ assets/_demo/a_frog/pas.gif filter=lfs diff=lfs merge=lfs -text
47
+ assets/_demo/a_frog/sd15.gif filter=lfs diff=lfs merge=lfs -text
48
+ assets/_demo/a_frog/sd35m.gif filter=lfs diff=lfs merge=lfs -text
49
+ assets/_demo/a_frog_elevest/pas.gif filter=lfs diff=lfs merge=lfs -text
50
+ assets/_demo/a_frog_elevest/sd15.gif filter=lfs diff=lfs merge=lfs -text
51
+ assets/_demo/a_frog_elevest/sd35m.gif filter=lfs diff=lfs merge=lfs -text
52
+ assets/_demo/a_frog_empty/pas.gif filter=lfs diff=lfs merge=lfs -text
53
+ assets/_demo/a_frog_empty/sd15.gif filter=lfs diff=lfs merge=lfs -text
54
+ assets/_demo/a_frog_empty/sd35m.gif filter=lfs diff=lfs merge=lfs -text
55
+ assets/_demo/a_toy_robot/pas.gif filter=lfs diff=lfs merge=lfs -text
56
+ assets/_demo/a_toy_robot/sd15.gif filter=lfs diff=lfs merge=lfs -text
57
+ assets/_demo/a_toy_robot/sd35m.gif filter=lfs diff=lfs merge=lfs -text
58
+ assets/_demo/controlnet/book.gif filter=lfs diff=lfs merge=lfs -text
59
+ assets/_demo/controlnet/cookie.gif filter=lfs diff=lfs merge=lfs -text
60
+ assets/_demo/controlnet/iron_robot.gif filter=lfs diff=lfs merge=lfs -text
61
+ assets/_demo/controlnet/panda.gif filter=lfs diff=lfs merge=lfs -text
62
+ assets/_demo/controlnet/plush_dog_toy.gif filter=lfs diff=lfs merge=lfs -text
63
+ assets/_demo/controlnet/teddy_bear.gif filter=lfs diff=lfs merge=lfs -text
64
+ assets/_demo/overview.png filter=lfs diff=lfs merge=lfs -text
65
+ assets/crm/astronaut.webp filter=lfs diff=lfs merge=lfs -text
66
+ assets/crm/大头泡泡马特.webp filter=lfs diff=lfs merge=lfs -text
67
+ assets/crm/武器-剑.webp filter=lfs diff=lfs merge=lfs -text
68
+ assets/crm/毛线衣.webp filter=lfs diff=lfs merge=lfs -text
69
+ assets/crm/翅膀道具.webp filter=lfs diff=lfs merge=lfs -text
70
+ assets/diffsplat/1_wukong_avatar.png filter=lfs diff=lfs merge=lfs -text
71
+ assets/diffsplat/2_wukong_sculpture.png filter=lfs diff=lfs merge=lfs -text
72
+ assets/diffsplat/3_wukong_toy.png filter=lfs diff=lfs merge=lfs -text
73
+ assets/diffsplat/4_mask.png filter=lfs diff=lfs merge=lfs -text
74
+ assets/diffsplat/5_bajie.png filter=lfs diff=lfs merge=lfs -text
75
+ assets/diffsplat/6_armor.png filter=lfs diff=lfs merge=lfs -text
76
+ assets/grm/17_dalle3_rockingchair1.png filter=lfs diff=lfs merge=lfs -text
77
+ assets/grm/19_dalle3_stump1.png filter=lfs diff=lfs merge=lfs -text
78
+ assets/grm/astronaut.webp filter=lfs diff=lfs merge=lfs -text
79
+ assets/grm/coat.webp filter=lfs diff=lfs merge=lfs -text
80
+ assets/grm/david.png filter=lfs diff=lfs merge=lfs -text
81
+ assets/grm/dreamcraft3d_00.png filter=lfs diff=lfs merge=lfs -text
82
+ assets/grm/dreamcraft3d_01.png filter=lfs diff=lfs merge=lfs -text
83
+ assets/grm/dreamcraft3d_02.png filter=lfs diff=lfs merge=lfs -text
84
+ assets/grm/frog.png filter=lfs diff=lfs merge=lfs -text
85
+ assets/grm/girl1_padded.png filter=lfs diff=lfs merge=lfs -text
86
+ assets/grm/girl2_copy.png filter=lfs diff=lfs merge=lfs -text
87
+ assets/grm/image.png filter=lfs diff=lfs merge=lfs -text
88
+ assets/grm/ironman_helmet.png filter=lfs diff=lfs merge=lfs -text
89
+ assets/grm/panda.png filter=lfs diff=lfs merge=lfs -text
90
+ assets/grm/sculpture_0.png filter=lfs diff=lfs merge=lfs -text
91
+ assets/grm/turtle.png filter=lfs diff=lfs merge=lfs -text
92
+ assets/grm/unicorn.png filter=lfs diff=lfs merge=lfs -text
93
+ assets/grm/zebra.png filter=lfs diff=lfs merge=lfs -text
94
+ assets/instantmesh/blue_cat.png filter=lfs diff=lfs merge=lfs -text
95
+ assets/instantmesh/bubble_mart_blue.png filter=lfs diff=lfs merge=lfs -text
96
+ assets/instantmesh/bulldog.png filter=lfs diff=lfs merge=lfs -text
97
+ assets/instantmesh/cartoon_dinosaur.png filter=lfs diff=lfs merge=lfs -text
98
+ assets/instantmesh/cartoon_panda.png filter=lfs diff=lfs merge=lfs -text
99
+ assets/instantmesh/chair_armed.png filter=lfs diff=lfs merge=lfs -text
100
+ assets/instantmesh/chair_watermelon.png filter=lfs diff=lfs merge=lfs -text
101
+ assets/instantmesh/cute_horse.jpg filter=lfs diff=lfs merge=lfs -text
102
+ assets/instantmesh/pikachu.png filter=lfs diff=lfs merge=lfs -text
103
+ assets/instantmesh/sea_turtle.png filter=lfs diff=lfs merge=lfs -text
104
+ assets/instantmesh/sword.png filter=lfs diff=lfs merge=lfs -text
105
+ extensions/RaDe-GS/SIBR_viewers/docs/img/capreal/caprealinputsonly.png filter=lfs diff=lfs merge=lfs -text
106
+ extensions/RaDe-GS/SIBR_viewers/docs/img/capreal/caprealmesh.png filter=lfs diff=lfs merge=lfs -text
107
+ extensions/RaDe-GS/SIBR_viewers/docs/img/capreal/caprealpointcloud.png filter=lfs diff=lfs merge=lfs -text
108
+ extensions/RaDe-GS/SIBR_viewers/docs/img/capreal/meshlab.png filter=lfs diff=lfs merge=lfs -text
109
+ extensions/RaDe-GS/SIBR_viewers/docs/img/diagramas/class/sibr_classes_v2.png filter=lfs diff=lfs merge=lfs -text
110
+ extensions/RaDe-GS/SIBR_viewers/docs/img/diagramas/sequence/AssetStreamer/seq_assetStreamer.pdf filter=lfs diff=lfs merge=lfs -text
111
+ extensions/RaDe-GS/SIBR_viewers/docs/img/diagramas/sequence/Renderers/seq_insideOut.pdf filter=lfs diff=lfs merge=lfs -text
112
+ extensions/RaDe-GS/SIBR_viewers/docs/img/diagramas/sequence/Renderers/seq_ulr_stream.pdf filter=lfs diff=lfs merge=lfs -text
113
+ extensions/RaDe-GS/SIBR_viewers/docs/img/diagramas/sequence/Renderers/seq_ulr_v3_landscape.pdf filter=lfs diff=lfs merge=lfs -text
114
+ extensions/RaDe-GS/SIBR_viewers/docs/img/diagramas/sequence/Unity/seq_unity_ulr_rendering.pdf filter=lfs diff=lfs merge=lfs -text
115
+ extensions/RaDe-GS/SIBR_viewers/docs/img/diagramas/sequence/Unity/seq_unity_ulr_texture_upload.pdf filter=lfs diff=lfs merge=lfs -text
116
+ extensions/RaDe-GS/SIBR_viewers/docs/img/ibr_common_cmake.png filter=lfs diff=lfs merge=lfs -text
117
+ extensions/RaDe-GS/SIBR_viewers/docs/img/jesnault_git_cheat_sheet.png filter=lfs diff=lfs merge=lfs -text
118
+ extensions/RaDe-GS/SIBR_viewers/docs/img/multimeshmanager.png filter=lfs diff=lfs merge=lfs -text
119
+ extensions/RaDe-GS/SIBR_viewers/docs/img/multiviewmanager.png filter=lfs diff=lfs merge=lfs -text
120
+ extensions/RaDe-GS/SIBR_viewers/docs/img/ulr_screenshot.png filter=lfs diff=lfs merge=lfs -text
121
+ extensions/RaDe-GS/assets/teaser.png filter=lfs diff=lfs merge=lfs -text
122
+ extensions/RaDe-GS/paper.pdf filter=lfs diff=lfs merge=lfs -text
123
+ extensions/RaDe-GS/submodules/diff-gaussian-rasterization/dist/diff_gaussian_rasterization-0.0.0-cp310-cp310-linux_x86_64.whl filter=lfs diff=lfs merge=lfs -text
124
+ extensions/RaDe-GS/submodules/diff-gaussian-rasterization/third_party/glm/doc/manual/frontpage1.png filter=lfs diff=lfs merge=lfs -text
125
+ extensions/RaDe-GS/submodules/diff-gaussian-rasterization/third_party/glm/doc/manual/frontpage2.png filter=lfs diff=lfs merge=lfs -text
126
+ extensions/RaDe-GS/submodules/diff-gaussian-rasterization/third_party/glm/doc/manual.pdf filter=lfs diff=lfs merge=lfs -text
127
+ extensions/RaDe-GS/submodules/frpc_linux_amd64_v0.2 filter=lfs diff=lfs merge=lfs -text
128
+ gradio_cached_examples/22/3D[[:space:]]Gaussians[[:space:]]ply[[:space:]]format/33a17050938079dd150f/_a[[:space:]]toy[[:space:]]robot..._013020.ply filter=lfs diff=lfs merge=lfs -text
129
+ gradio_cached_examples/22/3D[[:space:]]Gaussians[[:space:]]ply[[:space:]]format/9084f12000e4d0861282/_a[[:space:]]cute[[:space:]]panda..._013020.ply filter=lfs diff=lfs merge=lfs -text
130
+ gradio_cached_examples/22/3D[[:space:]]Gaussians[[:space:]]ply[[:space:]]format/927242333ba684b13bb2/_a[[:space:]]book..._013020.ply filter=lfs diff=lfs merge=lfs -text
131
+ gradio_cached_examples/23/3D[[:space:]]Gaussians[[:space:]]ply[[:space:]]format/1174ce83af0bcabb215a/_a[[:space:]]book..._013020.ply filter=lfs diff=lfs merge=lfs -text
132
+ gradio_cached_examples/23/3D[[:space:]]Gaussians[[:space:]]ply[[:space:]]format/743ad8490995e372c5f3/_a[[:space:]]cute[[:space:]]panda..._013020.ply filter=lfs diff=lfs merge=lfs -text
133
+ gradio_cached_examples/23/3D[[:space:]]Gaussians[[:space:]]ply[[:space:]]format/e611dc2e9fbf1acd1cfc/_a[[:space:]]toy[[:space:]]robot..._013020.ply filter=lfs diff=lfs merge=lfs -text
134
+ tmp/input_image.png filter=lfs diff=lfs merge=lfs -text
135
+ wheel/diff_gaussian_rasterization-0.0.0-cp310-cp310-linux_x86_64.whl filter=lfs diff=lfs merge=lfs -text
.gitignore ADDED
@@ -0,0 +1,11 @@
 
 
 
 
 
 
 
 
 
 
 
 
1
+ __pycache__/
2
+ build/
3
+ *.egg-info/
4
+ out/
5
+ temp/
6
+ log/
7
+ download/*.json
8
+ download/*.csv
9
+ temp*.*
10
+ *.tar*
11
+ *.vsix
LICENSE ADDED
@@ -0,0 +1,21 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ MIT License
2
+
3
+ Copyright (c) 2025 Chenguo Lin
4
+
5
+ Permission is hereby granted, free of charge, to any person obtaining a copy
6
+ of this software and associated documentation files (the "Software"), to deal
7
+ in the Software without restriction, including without limitation the rights
8
+ to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
9
+ copies of the Software, and to permit persons to whom the Software is
10
+ furnished to do so, subject to the following conditions:
11
+
12
+ The above copyright notice and this permission notice shall be included in all
13
+ copies or substantial portions of the Software.
14
+
15
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16
+ IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17
+ FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18
+ AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19
+ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20
+ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21
+ SOFTWARE.
README.md CHANGED
@@ -1,13 +1,306 @@
1
  ---
2
- title: Diffsplat
3
- emoji: ⚡
4
- colorFrom: yellow
5
- colorTo: gray
6
- sdk: gradio
7
- sdk_version: 5.15.0
8
  app_file: app.py
9
- pinned: false
10
- license: mit
11
  ---
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
12
 
13
- Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference
 
 
 
 
 
 
 
 
 
 
1
  ---
2
+ title: DiffSplat
 
 
 
 
 
3
  app_file: app.py
4
+ sdk: gradio
5
+ sdk_version: 4.44.1
6
  ---
7
+ # [ICLR 2025] DiffSplat
8
+
9
+ <h4 align="center">
10
+
11
+ DiffSplat: Repurposing Image Diffusion Models for Scalable Gaussian Splat Generation
12
+
13
+ [Chenguo Lin](https://chenguolin.github.io), [Panwang Pan](https://paulpanwang.github.io), [Bangbang Yang](https://ybbbbt.com), [Zeming Li](https://www.zemingli.com), [Yadong Mu](http://www.muyadong.com)
14
+
15
+ [![arXiv](https://img.shields.io/badge/arXiv-2501.16764-b31b1b.svg?logo=arXiv)](https://arxiv.org/abs/2501.16764)
16
+ [![Project page](https://img.shields.io/badge/Project-Page-brightgreen)](https://chenguolin.github.io/projects/DiffSplat)
17
+ [![Model](https://img.shields.io/badge/HF-Model-yellow)](https://huggingface.co/chenguolin/DiffSplat)
18
+
19
+ <p>
20
+ <img width="144" src="./assets/_demo/1.gif">
21
+ <img width="144" src="./assets/_demo/2.gif">
22
+ <img width="144" src="./assets/_demo/3.gif">
23
+ <img width="144" src="./assets/_demo/4.gif">
24
+ <img width="144" src="./assets/_demo/5.gif">
25
+ </p>
26
+ <p>
27
+ <img width="144" src="./assets/_demo/6.gif">
28
+ <img width="144" src="./assets/_demo/7.gif">
29
+ <img width="144" src="./assets/_demo/8.gif">
30
+ <img width="144" src="./assets/_demo/9.gif">
31
+ <img width="144" src="./assets/_demo/10.gif">
32
+ </p>
33
+ <p>
34
+ <img width="730" src="./assets/_demo/overview.png">
35
+ </p>
36
+
37
+ </h4>
38
+
39
+ This repository contains the official implementation of the paper: [DiffSplat: Repurposing Image Diffusion Models for Scalable Gaussian Splat Generation](https://arxiv.org/abs/2501.16764), which is accepted to ICLR 2025.
40
+ DiffSplat is a generative framework to synthesize 3D Gaussian Splats from text prompts & single-view images in 1~2 seconds. It is fine-tuned directly from a pretrained text-to-image diffusion model.
41
+
42
+ Feel free to contact me ([email protected]) or open an issue if you have any questions or suggestions.
43
+
44
+
45
+ ## 📢 News
46
+
47
+ - **2025-02-02**: Inference instructions (text-conditioned & image-conditioned & controlnet) are provided.
48
+ - **2025-01-29**: The source code and pretrained models are released. Happy 🐍 Chinese New Year 🎆!
49
+ - **2025-01-22**: DiffSplat is accepted to ICLR 2025.
50
+
51
+
52
+ ## 📋 TODO
53
+
54
+ - [x] Provide detailed instructions for inference.
55
+ - [ ] Provide detailed instructions for training.
56
+ - [ ] Implement a Gradio demo.
57
+
58
+
59
+ ## 🔧 Installation
60
+
61
+ You may need to modify the specific version of `torch` in `settings/setup.sh` according to your CUDA version.
62
+ There are no restrictions on the `torch` version; feel free to use your preferred one.
63
+ ```bash
64
+ git clone https://github.com/chenguolin/DiffSplat.git
65
+ cd DiffSplat
66
+ bash settings/setup.sh
67
+ ```
68
+
69
+
70
+ ## 📊 Dataset
71
+
72
+ - We use [G-Objaverse](https://github.com/modelscope/richdreamer/tree/main/dataset/gobjaverse) with about 265K 3D objects and 10.6M rendered images (265K x 40 views, including RGB, normal and depth maps) for `GSRecon` and `GSVAE` training. [Its subset](https://github.com/ashawkey/objaverse_filter) with about 83K 3D objects provided by [LGM](https://me.kiui.moe/lgm) is used for `DiffSplat` training. Their text descriptions are provided by the latest version of [Cap3D](https://huggingface.co/datasets/tiange/Cap3D) (i.e., refined by [DiffuRank](https://arxiv.org/abs/2404.07984)).
73
+ - We find the filtering is crucial for the generation quality of `DiffSplat`, and a larger dataset is beneficial for the performance of `GSRecon` and `GSVAE`.
74
+ - We store the dataset in an internal HDFS cluster in this project. Thus, the training code can NOT be directly run on your local machine. Please implement your own dataloading logic referring to our provided dataset & dataloader code.
75
+
76
+
77
+ ## 🚀 Usage
78
+
79
+ ### 📷 Camera Conventions
80
+
81
+ The camera and world coordinate systems in this project are both defined in the `OpenGL` convention, i.e., X: right, Y: up, Z: backward. The camera is located at `(0, 0, 1.4)` in the world coordinate system, and the camera looks at the origin `(0, 0, 0)`.
82
+ Please refer to [kiuikit camera doc](https://kit.kiui.moe/camera) for visualizations of the camera and world coordinate systems.
83
+
84
+ ### 🤗 Pretrained Models
85
+
86
+ All pretrained models are available at [HuggingFace🤗](https://huggingface.co/chenguolin/DiffSplat).
87
+
88
+ | **Model Name** | **Fine-tuned From** | **#Param.** | **Link** | **Note** |
89
+ |-------------------------------|---------------------|-------------|----------|----------|
90
+ | **GSRecon** | From scratch | 42M | [gsrecon_gobj265k_cnp_even4](https://huggingface.co/chenguolin/DiffSplat/tree/main/gsrecon_gobj265k_cnp_even4) | Feed-forward reconstruct per-pixel 3DGS from (RGB, normal, point) maps |
91
+ | **GSVAE (SD)** | [SD1.5 VAE](https://huggingface.co/stable-diffusion-v1-5/stable-diffusion-v1-5) | 84M | [gsvae_gobj265k_sd](https://huggingface.co/chenguolin/DiffSplat/tree/main/gsvae_gobj265k_sd) | |
92
+ | **GSVAE (SDXL)** | [SDXL fp16 VAE](https://huggingface.co/madebyollin/sdxl-vae-fp16-fix) | 84M | [gsvae_gobj265k_sdxl_fp16](https://huggingface.co/chenguolin/DiffSplat/tree/main/gsvae_gobj265k_sdxl_fp16) | fp16-fixed SDXL VAE is more robust |
93
+ | **GSVAE (SD3)** | [SD3 VAE](https://huggingface.co/stabilityai/stable-diffusion-3-medium) | 84M | [gsvae_gobj265k_sd3](https://huggingface.co/chenguolin/DiffSplat/tree/main/gsvae_gobj265k_sd3) | |
94
+ | **DiffSplat (SD1.5)** | [SD1.5](https://huggingface.co/stable-diffusion-v1-5/stable-diffusion-v1-5) | 0.86B | Text-cond: [gsdiff_gobj83k_sd15__render](https://huggingface.co/chenguolin/DiffSplat/tree/main/gsdiff_gobj83k_sd15__render)<br> Image-cond: [gsdiff_gobj83k_sd15_image__render](https://huggingface.co/chenguolin/DiffSplat/tree/main/gsdiff_gobj83k_sd15_image__render) | Best efficiency |
95
+ | **DiffSplat (PixArt-Sigma)** | [PixArt-Sigma](https://huggingface.co/PixArt-alpha/PixArt-Sigma-XL-2-512-MS) | 0.61B | Text-cond: [gsdiff_gobj83k_pas_fp16__render](https://huggingface.co/chenguolin/DiffSplat/tree/main/gsdiff_gobj83k_pas_fp16__render)<br> Image-cond: [gsdiff_gobj83k_pas_fp16_image__render](https://huggingface.co/chenguolin/DiffSplat/tree/main/gsdiff_gobj83k_pas_fp16_image__render) | Best Trade-off |
96
+ | **DiffSplat (SD3.5m)** | [SD3.5 medium](https://huggingface.co/stabilityai/stable-diffusion-3.5-medium) | 2.24B | Text-cond: [gsdiff_gobj83k_sd35m__render](https://huggingface.co/chenguolin/DiffSplat/tree/main/gsdiff_gobj83k_sd35m__render)<br> Image-cond: [gsdiff_gobj83k_sd35m_image__render](https://huggingface.co/chenguolin/DiffSplat/tree/main/gsdiff_gobj83k_sd35m_image__render) | Best performance |
97
+ | **DiffSplat ControlNet (SD1.5)** | From scratch | 361M | Depth: [gsdiff_gobj83k_sd15__render__depth](https://huggingface.co/chenguolin/DiffSplat/tree/main/gsdiff_gobj83k_sd15__render__depth)<br> Normal: [gsdiff_gobj83k_sd15__render__normal](https://huggingface.co/chenguolin/DiffSplat/tree/main/gsdiff_gobj83k_sd15__render__normal)<br> Canny: [gsdiff_gobj83k_sd15__render__canny](https://huggingface.co/chenguolin/DiffSplat/tree/main/gsdiff_gobj83k_sd15__render__canny) | |
98
+ | **(Optional) ElevEst** | [dinov2_vitb14_reg](https://github.com/facebookresearch/dinov2) | 86M | [elevest_gobj265k_b_C25](https://huggingface.co/chenguolin/DiffSplat/tree/main/elevest_gobj265k_b_C25) | (Optional) Single-view image elevation estimation |
99
+
100
+
101
+ ### ⚡ Inference
102
+
103
+ #### 0. Download Pretrained Models
104
+
105
+ Note that:
106
+ - Pretrained weights will be downloaded from HuggingFace and stored in `./out`.
107
+ - Other pretrained models (such as CLIP, T5, image VAE, etc.) will be downloaded automatically and stored in your HuggingFace cache directory.
108
+ - If you face problems in visiting HuggingFace Hub, you can try to set the environment variable `export HF_ENDPOINT=https://hf-mirror.com`.
109
+
110
+ ```bash
111
+ python3 download_ckpt.py --model_type [MODEL_TYPE] [--image_cond]
112
+
113
+ # `MODEL_TYPE`: choose from "sd15", "pas", "sd35m", "depth", "normal", "canny", "elevest".
114
+ # `--image_cond`: add this flag for downloading image-conditioned models
115
+ ```
116
+
117
+ For example, to download the `text-cond SD1.5-based DiffSplat`:
118
+ ```bash
119
+ python3 download_ckpt.py --model_type sd15
120
+ ```
121
+ To download the `image-cond PixArt-Sigma-based DiffSplat`:
122
+ ```bash
123
+ python3 download_ckpt.py --model_type pas --image_cond
124
+ ```
125
+
126
+ #### 1. Text-conditioned 3D Object Generation
127
+
128
+ Note that:
129
+ - Model differences may not be significant for simple text prompts. We recommend using `DiffSplat (SD1.5)` for better efficiency, `DiffSplat (SD3.5m)` for better performance, and `DiffSplat (PixArt-Sigma)` for a better trade-off.
130
+ - By default, `export HF_HOME=~/.cache/huggingface`, `export TORCH_HOME=~/.cache/torch`. You can change these paths in `scripts/infer.sh`. SD3-related models require a HuggingFace token for downloading, which is expected to be stored in `HF_HOME`.
131
+ - Outputs will be stored in `./out/<MODEL_NAME>/inference`.
132
+ - Prompt is specified by `--prompt` (e.g., `a_toy_robot`). Please separate words with `_`; underscores will be replaced by spaces in the code automatically.
133
+ - If `"gif"` is in `--output_video_type`, the output will be a `.gif` file. Otherwise, it will be a `.mp4` file. If `"fancy"` is in `--output_video_type`, the output video will be in a fancy style in which the 3DGS scales gradually increase while rotating.
134
+ - `--seed` is used for random seed setting. `--gpu_id` is used for specifying the GPU device.
135
+ - Use `--half_precision` for `BF16` half-precision inference. It will reduce the memory usage but may slightly affect the quality.
136
+
137
+ ```bash
138
+ # DiffSplat (SD1.5)
139
+ bash scripts/infer.sh src/infer_gsdiff_sd.py configs/gsdiff_sd15.yaml gsdiff_gobj83k_sd15__render \
140
+ --prompt a_toy_robot --output_video_type gif \
141
+ --gpu_id 0 --seed 0 [--half_precision]
142
+
143
+ # DiffSplat (PixArt-Sigma)
144
+ bash scripts/infer.sh src/infer_gsdiff_pas.py configs/gsdiff_pas.yaml gsdiff_gobj83k_pas_fp16__render \
145
+ --prompt a_toy_robot --output_video_type gif \
146
+ --gpu_id 0 --seed 0 [--half_precision]
147
+
148
+ # DiffSplat (SD3.5m)
149
+ bash scripts/infer.sh src/infer_gsdiff_sd3.py configs/gsdiff_sd35m_80g.yaml gsdiff_gobj83k_sd35m__render \
150
+ --prompt a_toy_robot --output_video_type gif \
151
+ --gpu_id 0 --seed 0 [--half_precision]
152
+ ```
153
+
154
+ You will get:
155
+ | DiffSplat (SD1.5) | DiffSplat (PixArt-Sigma) | DiffSplat (SD3.5m) |
156
+ |-------------------------|-------------------------------|-------------------------|
157
+ | ![sd15_text](./assets/_demo/a_toy_robot/sd15.gif) | ![pas_text](./assets/_demo/a_toy_robot/pas.gif) | ![sd35m_text](./assets/_demo/a_toy_robot/sd35m.gif) |
158
+
159
+
160
+ **More Advanced Arguments**:
161
+ - `--prompt_file`: instead of using `--prompt`, `--prompt_file` will read prompts from a `.txt` file line by line.
162
+ - Diffusion configurations:
163
+ - `--scheduler_type`: choose from `ddim`, `dpmsolver++`, `sde-dpmsolver++`, etc.
164
+ - `--num_inference_timesteps`: the number of diffusion steps.
165
+ - `--guidance_scale`: classifier-free guidance (CFG) scale; `1.0` means no CFG.
166
+ - `--eta`: specified for the `DDIM` scheduler; the weight of the noise added in diffusion steps.
167
+ - [Instant3D](https://instant-3d.github.io) tricks:
168
+ - `--init_std`, `--init_noise_strength`, `--init_bg`: initial noise settings, cf. [Instant3D Sec. 3.1](https://arxiv.org/pdf/2311.06214); NOT used by default, as we found it's not that helpful in our case.
169
+ - Others:
170
+ - `--elevation`: elevation for viewing and rendering; not necessary for text-conditioned generation; set to `10` by default (from xz-plane (`0`) to +y axis (`90`)).
171
+ - `--negative_prompt`: empty prompt (`""`) by default; used with CFG for better visual quality (e.g., more vibrant colors), but we found it causes lower metric values (such as [ImageReward](https://github.com/THUDM/ImageReward)).
172
+ - `--save_ply`: save the generated 3DGS as a `.ply` file; used with `--opacity_threshold_ply` to filter out low-opacity splats for much smaller `.ply` file size.
173
+ - `--eval_text_cond`: evaluate text-conditioned generation automatically.
174
+ - ...
175
+
176
+ Please refer to [infer_gsdiff_sd.py](./src/infer_gsdiff_sd.py), [infer_gsdiff_pas.py](./src/infer_gsdiff_pas.py), and [infer_gsdiff_sd3.py](./src/infer_gsdiff_sd3.py) for more argument details.
177
+
178
+ #### 2. Image-conditioned 3D Object Generation
179
+
180
+ Note that:
181
+ - Most of the arguments are the same as text-conditioned generation. Our method supports **text and image as conditions simultaneously**.
182
+ - Elevation is necessary for image-conditioned generation. You can specify the elevation angle by `--elevation` for viewing and rendering (from xz-plane (`0`) to +y axis (`90`)) or estimate it from the input image by `--use_elevest` (download the pretrained `ElevEst` model by `python3 download_ckpt.py --model_type elevest`) first. But we found that the **estimated elevation is not always accurate**, so it's better to set it manually.
183
+ - Text prompt is **optional** for image-conditioned generation. If you want to use text prompt, you can specify it by `--prompt` (e.g., `a_frog`), otherwise, empty prompt (`""`) will be used. Note that **DiffSplat (SD3.5m)** is sensitive to text prompts, and it may generate bad results without a proper prompt.
184
+ - Remember to set a smaller `--guidance_scale` for image-conditioned generation, as the default value is set for text-conditioned generation. `2.0` is recommended for most cases.
185
+ - `--triangle_cfg_scaling` is a trick that sets larger CFG values for far-away views from the input image, and smaller CFG values for close-up views, cf. [SV3D Sec. 3](https://arxiv.org/pdf/2403.12008).
186
+ - `--rembg_and_center` will remove the background and center the object in the image. It can be used with `--rembg_model_name` (by default `u2net`) and `--border_ratio` (by default `0.2`).
187
+ - Image-conditioned generation is more sensitive to arguments, and you may need to tune them for better results.
188
+
189
+ ```bash
190
+ # DiffSplat (SD1.5)
191
+ bash scripts/infer.sh src/infer_gsdiff_sd.py configs/gsdiff_sd15.yaml gsdiff_gobj83k_sd15_image__render \
192
+ --rembg_and_center --triangle_cfg_scaling --output_video_type gif --guidance_scale 2 \
193
+ --image_path assets/grm/frog.png --elevation 20 --prompt a_frog
194
+
195
+ # DiffSplat (PixArt-Sigma)
196
+ bash scripts/infer.sh src/infer_gsdiff_pas.py configs/gsdiff_pas.yaml gsdiff_gobj83k_pas_fp16_image__render \
197
+ --rembg_and_center --triangle_cfg_scaling --output_video_type gif --guidance_scale 2 \
198
+ --image_path assets/grm/frog.png --elevation 20 --prompt a_frog
199
+
200
+ # DiffSplat (SD3.5m)
201
+ bash scripts/infer.sh src/infer_gsdiff_sd3.py configs/gsdiff_sd35m_80g.yaml gsdiff_gobj83k_sd35m_image__render \
202
+ --rembg_and_center --triangle_cfg_scaling --output_video_type gif --guidance_scale 2 \
203
+ --image_path assets/grm/frog.png --elevation 20 --prompt a_frog
204
+ ```
205
+
206
+ You will get:
207
+ | Arguments | DiffSplat (SD1.5) | DiffSplat (PixArt-Sigma) | DiffSplat (SD3.5m) |
208
+ |---------|-------------------------|-------------------------------|-------------------------|
209
+ | `--elevation 20 --prompt a_frog` | ![sd15_image](./assets/_demo/a_frog/sd15.gif) | ![pas_image](./assets/_demo/a_frog/pas.gif) | ![sd35m_image](./assets/_demo/a_frog/sd35m.gif) |
210
+ | `--use_elevest --prompt a_frog` (estimated elevation: -0.78 deg) | ![sd15_image](./assets/_demo/a_frog_elevest/sd15.gif) | ![pas_image](./assets/_demo/a_frog_elevest/pas.gif) | ![sd35m_image](./assets/_demo/a_frog_elevest/sd35m.gif) |
211
+ | `--elevation 20` (prompt is `""`) | ![sd15_image](./assets/_demo/a_frog_empty/sd15.gif) | ![pas_image](./assets/_demo/a_frog_empty/pas.gif) | ![sd35m_image](./assets/_demo/a_frog_empty/sd35m.gif) |
212
+
213
+ **More Advanced Arguments**:
214
+ - `--image_dir`: instead of using `--image_path`, `--image_dir` will read images from a directory.
215
+
216
+ Please refer to [infer_gsdiff_sd.py](./src/infer_gsdiff_sd.py), [infer_gsdiff_pas.py](./src/infer_gsdiff_pas.py), and [infer_gsdiff_sd3.py](./src/infer_gsdiff_sd3.py) for more argument details.
217
+
218
+ #### 3. ControlNet for 3D Object Generation
219
+
220
+ Note that:
221
+ - After downloading pretrained **DiffSplat (SD1.5)**, you should download the controlnet weights by `python3 download_ckpt.py --model_type [depth | normal | canny]`.
222
+ - For **depth-controlnet**, values in depth maps are normalized to `[0, 1]` and larger values (white) mean closer to the camera (smaller depth). Please refer to [GObjaverse Dataset](./src/data/gobjaverse_parquet_dataset.py) for more details.
223
+ - For **normal-controlnet**, input camera is normalized to locate at `(0, 0, 1.4)` and look at `(0, 0, 0)`, thus the input normal maps are transformed accordingly. Please refer to [GObjaverse Dataset](./src/data/gobjaverse_parquet_dataset.py) for more details.
224
+ - For **canny-controlnet**, canny edges are extracted from the input RGB images automatically by `cv2.Canny`. Please refer to [GObjaverse Dataset](./src/data/gobjaverse_parquet_dataset.py) for more details.
225
+
226
+ ```bash
227
+ # ControlNet (depth)
228
+ bash scripts/infer.sh src/infer_gsdiff_sd.py configs/gsdiff_sd15.yaml gsdiff_gobj83k_sd15__render \
229
+ --load_pretrained_controlnet gsdiff_gobj83k_sd15__render__depth \
230
+ --output_video_type gif --image_path assets/diffsplat/controlnet/toy_depth.png \
231
+ --prompt teddy_bear --elevation 10
232
+
233
+ # ControlNet (normal)
234
+ bash scripts/infer.sh src/infer_gsdiff_sd.py configs/gsdiff_sd15.yaml gsdiff_gobj83k_sd15__render \
235
+ --load_pretrained_controlnet gsdiff_gobj83k_sd15__render__normal \
236
+ --output_video_type gif --image_path assets/diffsplat/controlnet/robot_normal.png \
237
+ --prompt iron_robot --elevation 10
238
+
239
+ # ControlNet (canny)
240
+ bash scripts/infer.sh src/infer_gsdiff_sd.py configs/gsdiff_sd15.yaml gsdiff_gobj83k_sd15__render \
241
+ --load_pretrained_controlnet gsdiff_gobj83k_sd15__render__canny \
242
+ --output_video_type gif --image_path assets/diffsplat/controlnet/cookie_canny.png \
243
+ --prompt book --elevation 10
244
+ ```
245
+
246
+ You will get:
247
+ | Original Image | Input Control | `--prompt teddy_bear` | `--prompt panda` |
248
+ |----------------|---------------|-----------------------|--------------------|
249
+ | ![depth_image](./assets/diffsplat/controlnet/toy_image.png) | ![depth](./assets/diffsplat/controlnet/toy_depth.png) | ![controlnet_1](assets/_demo/controlnet/teddy_bear.gif) | ![controlnet_2](assets/_demo/controlnet/panda.gif) |
250
+
251
+ | Original Image | Input Control | `--prompt iron_robot` | `--prompt plush_dog_toy` |
252
+ |----------------|---------------|-----------------------|--------------------|
253
+ | ![normal_image](./assets/diffsplat/controlnet/robot_image.png) | ![normal](./assets/diffsplat/controlnet/robot_normal.png) | ![controlnet_1](assets/_demo/controlnet/iron_robot.gif) | ![controlnet_2](assets/_demo/controlnet/plush_dog_toy.gif) |
254
+
255
+ | Original Image | Input Control | `--prompt book` | `--prompt cookie` |
256
+ |----------------|---------------|-----------------|---------------------|
257
+ | ![canny_image](./assets/diffsplat/controlnet/cookie_image.png) | ![canny](./assets/diffsplat/controlnet/cookie_canny.png) | ![controlnet_1](assets/_demo/controlnet/book.gif) | ![controlnet_2](assets/_demo/controlnet/cookie.gif) |
258
+
259
+ **More Advanced Arguments**:
260
+ - `--guess_mode`: ControlNet encoder tries to recognize the content of the input image even if you remove all prompts, cf. [the original ControlNet repo](https://github.com/lllyasviel/ControlNet#guess-mode--non-prompt-mode) and [HF ControlNet](https://huggingface.co/docs/diffusers/using-diffusers/controlnet#guess-mode).
261
+ - `--controlnet_scale`: determines how much weight to assign to the conditioning inputs; outputs of the ControlNet are multiplied by `controlnet_scale` before they are added to the residual in the original UNet.
262
+
263
+ Please refer to [infer_gsdiff_sd.py](./src/infer_gsdiff_sd.py) for more argument details.
264
+
265
+
266
+ ### 🦾 Training
267
+
268
+ #### 1. GSRecon
269
+
270
+ Please refer to [train_gsrecon.py](./src/train_gsrecon.py).
271
+
272
+ Instructions for `GSRecon` training will be provided soon.
273
+
274
+ #### 2. GSVAE
275
+
276
+ Please refer to [train_gsvae.py](./src/train_gsvae.py).
277
+
278
+ Instructions for `GSVAE` training will be provided soon.
279
+
280
+ #### 3. DiffSplat
281
+
282
+ Please refer to [train_gsdiff_sd.py](./src/train_gsdiff_sd.py), [train_gsdiff_pas.py](./src/train_gsdiff_pas.py), and [train_gsdiff_sd3.py](./src/train_gsdiff_sd3.py).
283
+
284
+ Instructions for `DiffSplat` training will be provided soon.
285
+
286
+ #### 4. ControlNet
287
+
288
+ Please refer to [train_gsdiff_sd_controlnet.py](./src/train_gsdiff_sd_controlnet.py).
289
+
290
+ Instructions for `ControlNet` training and inference will be provided soon.
291
+
292
+
293
+ ## 😊 Acknowledgement
294
+ We would like to thank the authors of [LGM](https://me.kiui.moe/lgm), [GRM](https://justimyhxu.github.io/projects/grm), and [Wonder3D](https://www.xxlong.site/Wonder3D) for their great work and for generously providing source code, which inspired our work and helped us a lot in the implementation.
295
+
296
 
297
+ ## 📚 Citation
298
+ If you find our work helpful, please consider citing:
299
+ ```bibtex
300
+ @inproceedings{lin2025diffsplat,
301
+ title={DiffSplat: Repurposing Image Diffusion Models for Scalable 3D Gaussian Splat Generation},
302
+ author={Lin, Chenguo and Pan, Panwang and Yang, Bangbang and Li, Zeming and Mu, Yadong},
303
+ booktitle={International Conference on Learning Representations (ICLR)},
304
+ year={2025}
305
+ }
306
+ ```
app.py ADDED
@@ -0,0 +1,165 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import os
2
+ import shlex
3
+ import subprocess
4
+ import imageio
5
+ import numpy as np
6
+
7
+ import gradio as gr
8
+ import spaces
9
+ import sys
10
+ from loguru import logger
11
+ current_path = os.path.dirname(os.path.abspath(__file__))
12
+
13
+ MAX_SEED = np.iinfo(np.int32).max
14
+ TMP_DIR = os.path.join(current_path, 'out')
15
+ os.makedirs(TMP_DIR, exist_ok=True)
16
+ TAG = "gsdiff_gobj83k_sd15__render"
17
+
18
+ # download checkpoints
19
+ # subprocess.run(shlex.split("python3 download_ckpt.py --model_type pas")) # for txt condition
20
+ # subprocess.run(shlex.split("python3 download_ckpt.py --model_type pas --image_cond")) # for img condition
21
+
22
+ # img_commands = "PYTHONPATH=./ bash scripts/infer.sh src/infer_gsdiff_pas.py configs/gsdiff_pas.yaml {} \
23
+ # --rembg_and_center --triangle_cfg_scaling --save_ply --output_video_type mp4 --guidance_scale {} \
24
+ # --image_path {} --elevation {} --prompt {} --seed {}"
25
+
26
+ # txt_commands = "PYTHONPATH=./ bash scripts/infer.sh src/infer_gsdiff_pas.py configs/gsdiff_pas.yaml \
27
+ # --rembg_and_center --triangle_cfg_scaling --save_ply --output_video_type mp4 --guidance_scale {} \
28
+ # --image_path {} --elevation {} --prompt {} --seed {}"
29
+
30
+ # SD1.5
31
+ subprocess.run(shlex.split("python3 download_ckpt.py --model_type sd15")) # for txt condition
32
+ # subprocess.run(shlex.split("python3 download_ckpt.py --model_type sd15 --image_cond")) # for img condition
33
+ img_commands = "PYTHONPATH=./ bash scripts/infer.sh src/infer_gsdiff_sd.py configs/gsdiff_sd15.yaml \
34
+ --rembg_and_center --triangle_cfg_scaling --save_ply --output_video_type mp4 --guidance_scale {} \
35
+ --image_path {} --elevation {} --prompt {} --seed {}"
36
+
37
+ txt_commands = "PYTHONPATH=./ bash scripts/infer.sh src/infer_gsdiff_sd.py configs/gsdiff_sd15.yaml {} \
38
+ --rembg_and_center --save_ply --output_video_type mp4 --guidance_scale {} \
39
+ --elevation {} --prompt {} --seed {}"
40
+
41
+
42
+
43
+ # process function
44
+ @spaces.GPU
45
+ def process(input_image, prompt='a_high_quality_3D_asset', prompt_neg='ugly, blurry, pixelated obscure, unnatural colors, poor lighting, dull, unclear, cropped, lowres, low quality, artifacts, duplicate', input_elevation=20, guidance_scale=2., input_seed=0):
46
+
47
+ if input_image is not None:
48
+ image_path = os.path.join(TMP_DIR, "input_image.png")
49
+ image_name = image_path.split('/')[-1].split('.')[0]
50
+ input_image.save(image_path)
51
+ full_command = img_commands.format(TAG, guidance_scale, image_path, input_elevation, prompt, input_seed)
52
+ else:
53
+ full_command = txt_commands.format(TAG, guidance_scale, input_elevation, prompt, input_seed)
54
+ image_name = ""
55
+
56
+ os.system(full_command)
57
+
58
+ # save video and ply files
59
+ ckpt_dir = os.path.join(TMP_DIR, TAG, "checkpoints")
60
+ infer_from_iter = int(sorted(os.listdir(ckpt_dir))[-1])
61
+ MAX_NAME_LEN = 20 # TODO: make `20` configurable
62
+ prompt = prompt.replace("_", " ")
63
+ prompt_name = prompt[:MAX_NAME_LEN] + "..." if prompt[:MAX_NAME_LEN] != "" else prompt
64
+ name = f"[{image_name}]_[{prompt_name}]_{infer_from_iter:06d}"
65
+ output_video_path = os.path.join(TMP_DIR, TAG, "inference", name + ".mp4")
66
+ output_ply_path = os.path.join(TMP_DIR, TAG, "inference", name + ".ply")
67
+ output_img_path = os.path.join(TMP_DIR, TAG, "inference", name + "_gs.png")
68
+
69
+ logger.info(full_command, output_video_path, output_ply_path)
70
+
71
+ output_image = imageio.imread(output_img_path)
72
+ return output_image, output_video_path, output_ply_path
73
+
74
+
75
+ # gradio UI
76
+ _TITLE = '''DiffSplat: Repurposing Image Diffusion Models for Scalable Gaussian Splat Generation'''
77
+
78
+ _DESCRIPTION = '''
79
+ ### If you find our work helpful, please consider citing our paper 📚 or giving the repo a star 🌟
80
+ <div>
81
+ <a style="display:inline-block; margin-left: .5em" href="https://chenguolin.github.io/projects/DiffSplat"><img src='https://img.shields.io/badge/Project-Page-brightgreen'/></a>
82
+ <a style="display:inline-block; margin-left: .5em" href="https://arxiv.org/abs/2501.16764"><img src='https://img.shields.io/badge/arXiv-2501.16764-b31b1b.svg?logo=arXiv'/></a>
83
+ <a style="display:inline-block; margin-left: .5em" href="https://github.com/chenguolin/DiffSplat"><img src='https://img.shields.io/github/stars/chenguolin/DiffSplat?style=social'/></a>
84
+ <a style="display:inline-block; margin-left: .5em" href="https://huggingface.co/chenguolin/DiffSplat"><img src='https://img.shields.io/badge/HF-Model-yellow'/></a>
85
+ </div>
86
+
87
+ * Input can be only text, only image, or both image and text.
88
+ * If you find the generated 3D asset satisfactory, click "Extract GLB" to extract the GLB file and download it.
89
+ * Upload an image and click "Generate" to create a 3D asset. If the image has an alpha channel, it will be used as the mask. Otherwise, we use `rembg` to remove the background.
90
+ '''
91
+
92
+ block = gr.Blocks(title=_TITLE).queue()
93
+ with block:
94
+ with gr.Row():
95
+ with gr.Column(scale=1):
96
+ gr.Markdown('# ' + _TITLE)
97
+ gr.Markdown(_DESCRIPTION)
98
+
99
+ with gr.Row(variant='panel'):
100
+ with gr.Column(scale=1):
101
+ # input image
102
+ input_image = gr.Image(label="image", type='pil')
103
+
104
+ # input prompt
105
+ input_text = gr.Textbox(label="prompt",value="a_high_quality_3D_asset")
106
+
107
+ # negative prompt
108
+ input_neg_text = gr.Textbox(label="negative prompt", value="")
109
+
110
+ # guidance_scale
111
+ guidance_scale = gr.Slider(label="guidance scale", minimum=1., maximum=7.5, step=0.5, value=2.0)
112
+
113
+ # elevation
114
+ input_elevation = gr.Slider(label="elevation", minimum=-90, maximum=90, step=1, value=20)
115
+ # # inference steps
116
+ # input_num_steps = gr.Slider(label="inference steps", minimum=1, maximum=100, step=1, value=30)
117
+ # random seed
118
+ input_seed = gr.Slider(label="random seed", minimum=0, maximum=100000, step=1, value=0)
119
+ # gen button
120
+ button_gen = gr.Button("Generate")
121
+
122
+
123
+ with gr.Column(scale=1):
124
+ with gr.Tab("Video"):
125
+ # final video results
126
+ output_video = gr.Video(label="video")
127
+ # ply file
128
+ output_file = gr.File(label="3D Gaussians (ply format)")
129
+ with gr.Tab("Splatter Images"):
130
+ output_image = gr.Image(interactive=False, show_label=False)
131
+
132
+
133
+ button_gen.click(process, inputs=[input_image, input_text, input_neg_text, input_elevation, guidance_scale, input_seed], outputs=[output_image, output_video, output_file])
134
+
135
+ gr.Examples(
136
+ examples=[
137
+ f'assets/diffsplat/{image}'
138
+ for image in os.listdir("assets/diffsplat") if image.endswith('.png')
139
+ ],
140
+ inputs=[input_image],
141
+ outputs=[output_image, output_video, output_file],
142
+ fn=lambda x: process(input_image=x),
143
+ # cache_examples=True,
144
+ run_on_click=True,
145
+ label='Image-to-3D Examples'
146
+ )
147
+
148
+ gr.Examples(
149
+ examples=[
150
+ "a_toy_robot",
151
+ "a_cute_panda",
152
+ "a_book"
153
+ ],
154
+ inputs=[input_text],
155
+ outputs=[output_image, output_video, output_file],
156
+ fn=lambda x: process(input_image=None, prompt=x),
157
+ # cache_examples=True,
158
+ run_on_click=True,
159
+ label='Text-to-3D Examples'
160
+ )
161
+
162
+
163
+ # Launch the Gradio app
164
+ if __name__ == "__main__":
165
+ block.launch(share=True)
assets/_demo/1.gif ADDED

Git LFS Details

  • SHA256: e5026f63c1c69ba4d31c34cc82ccef0ac75c24b947893376a4f227ff04fc00dc
  • Pointer size: 132 Bytes
  • Size of remote file: 1.87 MB
assets/_demo/10.gif ADDED

Git LFS Details

  • SHA256: e6d6191e04b32f0b74a299bb3cf35735fa329bec309165056128405e5a1e55ef
  • Pointer size: 132 Bytes
  • Size of remote file: 1.76 MB
assets/_demo/2.gif ADDED

Git LFS Details

  • SHA256: f63f122c00335401c849c73e0bb71c12785d1754f506acb32c5fe324404cefd7
  • Pointer size: 132 Bytes
  • Size of remote file: 4.61 MB
assets/_demo/3.gif ADDED

Git LFS Details

  • SHA256: acf87ce7deebc7693fd6c72e25fc22deb3642b0cf5ff732a68bd330f3b5472af
  • Pointer size: 132 Bytes
  • Size of remote file: 4.96 MB
assets/_demo/4.gif ADDED

Git LFS Details

  • SHA256: 2f88df29b6f2e06006df95d61a4be87d4c3774da455526b255dc7aacd4ec7ef6
  • Pointer size: 132 Bytes
  • Size of remote file: 3.22 MB
assets/_demo/5.gif ADDED

Git LFS Details

  • SHA256: 006f05b05d3630b7ebfe0cf067de203d55fa778472d7d038cbf87363e40c70a9
  • Pointer size: 132 Bytes
  • Size of remote file: 4.12 MB
assets/_demo/6.gif ADDED

Git LFS Details

  • SHA256: c16ba28c12d40ea83d0cc705af89659c2e671bc93c254c49583d1d0fbca2ea56
  • Pointer size: 132 Bytes
  • Size of remote file: 1.97 MB
assets/_demo/7.gif ADDED

Git LFS Details

  • SHA256: 79a27742114d96e0e767d8e722e70ea5f83b06e8fc44b3e782c2b7ce2f64a4bb
  • Pointer size: 132 Bytes
  • Size of remote file: 1.81 MB
assets/_demo/8.gif ADDED

Git LFS Details

  • SHA256: 8a99d281c8bc17a8c4655585cf4dbc1da51e3490229e55294582d35f26fba7a1
  • Pointer size: 132 Bytes
  • Size of remote file: 2.64 MB
assets/_demo/9.gif ADDED

Git LFS Details

  • SHA256: aff03087a11381171ce700dffb0ffc194ff009ba36a486de01de8f2bb04f3069
  • Pointer size: 132 Bytes
  • Size of remote file: 3.31 MB
assets/_demo/a_frog/pas.gif ADDED

Git LFS Details

  • SHA256: 078c2374d80b8e3ecfdb0e54b024237708e06b6035ef750e16d05c39b97f4998
  • Pointer size: 132 Bytes
  • Size of remote file: 3.57 MB
assets/_demo/a_frog/sd15.gif ADDED

Git LFS Details

  • SHA256: cf8db71ca8962c9337fceabed510081ee5be63029c5194ec936e0809ef9c0814
  • Pointer size: 132 Bytes
  • Size of remote file: 3.53 MB
assets/_demo/a_frog/sd35m.gif ADDED

Git LFS Details

  • SHA256: ccbd8ced948c77606826e9baad190c82a97312cad3f73d875213c74480bba167
  • Pointer size: 132 Bytes
  • Size of remote file: 4.06 MB
assets/_demo/a_frog_elevest/pas.gif ADDED

Git LFS Details

  • SHA256: 2ba7e65d04231c4a686c2545925eb185e662633f99d8e62f46e4787f00a84cde
  • Pointer size: 132 Bytes
  • Size of remote file: 3.55 MB
assets/_demo/a_frog_elevest/sd15.gif ADDED

Git LFS Details

  • SHA256: a6054d11ee74288eee98bde90db223a40c79fa158f64b7cc6f27dac37d74dfbf
  • Pointer size: 132 Bytes
  • Size of remote file: 3.38 MB
assets/_demo/a_frog_elevest/sd35m.gif ADDED

Git LFS Details

  • SHA256: a167109dfd1c1cd46f14648b3845d23b8d7f49e0cdaf089f78b356ff3300b34f
  • Pointer size: 132 Bytes
  • Size of remote file: 3.73 MB
assets/_demo/a_frog_empty/pas.gif ADDED

Git LFS Details

  • SHA256: 18443bb88dcab653c41b2dcddfa8423d735fac1adf664cc374d45ef5ed9e5d53
  • Pointer size: 132 Bytes
  • Size of remote file: 3.49 MB
assets/_demo/a_frog_empty/sd15.gif ADDED

Git LFS Details

  • SHA256: b0b550b17d9dc85af20324ad7bb310a37599f3d6502fc9969fda1d29897dacf9
  • Pointer size: 132 Bytes
  • Size of remote file: 3.43 MB
assets/_demo/a_frog_empty/sd35m.gif ADDED

Git LFS Details

  • SHA256: c8ad473274b2cd9b2c7d4f7368a0bed22685ac8ca823e0984c7ca993c2b6898c
  • Pointer size: 132 Bytes
  • Size of remote file: 3.55 MB
assets/_demo/a_toy_robot/pas.gif ADDED

Git LFS Details

  • SHA256: 68d9b2d11a8129c5e1567664ab40d48da8123e283638bb2b5813d3461284b247
  • Pointer size: 132 Bytes
  • Size of remote file: 2.02 MB
assets/_demo/a_toy_robot/sd15.gif ADDED

Git LFS Details

  • SHA256: c213115602ac16f60cb3e877d5bfb57111ff1b7cfc2f47f3c188a096a421a658
  • Pointer size: 132 Bytes
  • Size of remote file: 2.14 MB
assets/_demo/a_toy_robot/sd35m.gif ADDED

Git LFS Details

  • SHA256: 850dab4e311c96d23db7c4c2fe9b15598c695512a29b5643e7dcad220845bd9f
  • Pointer size: 132 Bytes
  • Size of remote file: 2.11 MB
assets/_demo/controlnet/book.gif ADDED

Git LFS Details

  • SHA256: 8d138dcba04ea3079a21b083416146a292a2b0f300bf3403aeb4c43918e29272
  • Pointer size: 132 Bytes
  • Size of remote file: 2.23 MB
assets/_demo/controlnet/cookie.gif ADDED

Git LFS Details

  • SHA256: 0d842886ed5e13d2d03ded8d877cc6c25fbb09681fbcc1884e240a09e8cdbfd4
  • Pointer size: 132 Bytes
  • Size of remote file: 2.01 MB
assets/_demo/controlnet/iron_robot.gif ADDED

Git LFS Details

  • SHA256: c7d16871a73b989c0632dd722ab7134738ebf4d42e8953cec0dcc3eddc27d9c2
  • Pointer size: 132 Bytes
  • Size of remote file: 2.76 MB
assets/_demo/controlnet/panda.gif ADDED

Git LFS Details

  • SHA256: b7e53a7407d05617033fd9b7b90ff1e75c9e17983d60abd00499f48c0bc67ec4
  • Pointer size: 132 Bytes
  • Size of remote file: 2.77 MB
assets/_demo/controlnet/plush_dog_toy.gif ADDED

Git LFS Details

  • SHA256: 6c1e7731f8400beacf065db130f721789593f4c67a57473f5c5a1c935fa5c00a
  • Pointer size: 132 Bytes
  • Size of remote file: 3.18 MB
assets/_demo/controlnet/teddy_bear.gif ADDED

Git LFS Details

  • SHA256: 32941570f92e60addec09091c0cac24ab28f85ceb2d732aa8f8e0bcb4a774493
  • Pointer size: 132 Bytes
  • Size of remote file: 3.12 MB
assets/_demo/overview.png ADDED

Git LFS Details

  • SHA256: f09b1aadf070bed1c9e4ca503b09c2ee0d6790951b4718f97ff2a24d4c3281dd
  • Pointer size: 131 Bytes
  • Size of remote file: 236 kB
assets/crm/3D/345/215/241/351/200/232/347/213/227.webp ADDED
assets/crm/astronaut.webp ADDED

Git LFS Details

  • SHA256: c3e9157b31885c0fbccdba965f8ad1e27b51c41fbdb609ee8fc5728e6f4e4c3d
  • Pointer size: 131 Bytes
  • Size of remote file: 101 kB
assets/crm/bulldog.webp ADDED
assets/crm/ghost-eating-burger.webp ADDED
assets/crm/kunkun.webp ADDED
assets/crm//344/270/207/345/234/243/345/215/227/347/223/234.webp ADDED
assets/crm//344/272/272/347/211/251/351/252/221/351/251/254.webp ADDED
assets/crm//345/210/235/351/237/263/346/234/252/346/235/245/347/216/251/345/201/266.webp ADDED
assets/crm//345/215/241/351/200/232/346/201/220/351/276/231.webp ADDED
assets/crm//345/215/241/351/200/232/346/211/213/346/236/252/346/210/252/345/233/276.webp ADDED
assets/crm//345/215/241/351/200/232/347/214/253.webp ADDED
assets/crm//345/215/241/351/200/232/350/230/221/350/217/207/345/245/227/350/243/205.webp ADDED
assets/crm//345/217/257/347/210/261/347/216/204/347/255/226.webp ADDED
assets/crm/大头泡泡马特.webp ADDED

Git LFS Details

  • SHA256: 7135922c38ef118eace0abe2a7fbf78574ebebcf973aeb8cf67ed6cfdde1fcdd
  • Pointer size: 131 Bytes
  • Size of remote file: 354 kB
assets/crm//345/275/251/350/211/262/350/230/221/350/217/207.webp ADDED
assets/crm//345/275/251/350/211/262/350/230/221/350/217/2072.webp ADDED