camenduru commited on
Commit
cd712fb
·
1 Parent(s): 53656cc

thanks to cerspense ❤

Browse files
README.md ADDED
@@ -0,0 +1,18 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ ---
2
+ pipeline_tag: video-to-video
3
+ license: cc-by-nc-4.0
4
+ ---
5
+ ![model example](https://i.imgur.com/ze1DGOJ.png)
6
+ [example outputs](https://www.youtube.com/watch?v=HO3APT_0UA4) (courtesy of [dotsimulate](https://www.instagram.com/dotsimulate/))
7
+
8
+ # zeroscope_v2 1111 models
9
+ A collection of watermark-free Modelscope-based video models capable of generating high quality video at [448x256](https://huggingface.co/cerspense/zeroscope_v2_dark_30x448x256), [576x320](https://huggingface.co/cerspense/zeroscope_v2_576w) and [1024 x 576](https://huggingface.co/cerspense/zeroscope_v2_XL). These models were trained from the [original weights](https://huggingface.co/damo-vilab/modelscope-damo-text-to-video-synthesis) with offset noise using 9,923 clips and 29,769 tagged frames.<br />
10
+ This collection makes it easy to switch between models with the new dropdown menu in the 1111 extension.
11
+
12
+ ### Using it with the 1111 text2video extension
13
+ Simply download the contents of this repo to 'stable-diffusion-webui\models\text2video'
14
+ Or, manually download the model folders you want, along with VQGAN_autoencoder.pth.
15
+
16
+ Thanks to [dotsimulate](https://www.instagram.com/dotsimulate/) for the config files.
17
+
18
+ Thanks to [camenduru](https://github.com/camenduru), [kabachuha](https://github.com/kabachuha), [ExponentialML](https://github.com/ExponentialML), [VANYA](https://twitter.com/veryVANYA), [polyware](https://twitter.com/polyware_ai), [tin2tin](https://github.com/tin2tin)<br />
VQGAN_autoencoder.pth ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:930e9865584beae2405d29bc06a05db3bb6a5b34eedd40a7db29b9156ed7d098
3
+ size 2607657443
zs2_448w/configuration.json ADDED
@@ -0,0 +1,34 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ { "framework": "pytorch",
2
+ "task": "text-to-video-synthesis",
3
+ "model": {
4
+ "type": "latent-text-to-video-synthesis",
5
+ "model_args": {
6
+ "ckpt_clip": "open_clip_pytorch_model.bin",
7
+ "ckpt_unet": "text2video_pytorch_model.pth",
8
+ "ckpt_autoencoder": "../VQGAN_autoencoder.pth",
9
+ "max_frames": 16,
10
+ "tiny_gpu": 1
11
+ },
12
+ "model_cfg": {
13
+ "unet_in_dim": 4,
14
+ "unet_dim": 320,
15
+ "unet_y_dim": 768,
16
+ "unet_context_dim": 1024,
17
+ "unet_out_dim": 4,
18
+ "unet_dim_mult": [1, 2, 4, 4],
19
+ "unet_num_heads": 8,
20
+ "unet_head_dim": 64,
21
+ "unet_res_blocks": 2,
22
+ "unet_attn_scales": [1, 0.5, 0.25],
23
+ "unet_dropout": 0.1,
24
+ "temporal_attention": "True",
25
+ "num_timesteps": 1000,
26
+ "mean_type": "eps",
27
+ "var_type": "fixed_small",
28
+ "loss_type": "mse"
29
+ }
30
+ },
31
+ "pipeline": {
32
+ "type": "latent-text-to-video-synthesis"
33
+ }
34
+ }
zs2_448w/open_clip_pytorch_model.bin ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:a4c4972695e1c8005e929fe9d8e857f2dbd65967900c195eabe5223567a57794
3
+ size 1972454741
zs2_448w/text2video_pytorch_model.pth ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:1f8784148193f58209c9954f324173cf5f6b04d3cada762d0b98b140d5be2bda
3
+ size 2822976729
zs2_576w/configuration.json ADDED
@@ -0,0 +1,34 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ { "framework": "pytorch",
2
+ "task": "text-to-video-synthesis",
3
+ "model": {
4
+ "type": "latent-text-to-video-synthesis",
5
+ "model_args": {
6
+ "ckpt_clip": "open_clip_pytorch_model.bin",
7
+ "ckpt_unet": "text2video_pytorch_model.pth",
8
+ "ckpt_autoencoder": "../VQGAN_autoencoder.pth",
9
+ "max_frames": 16,
10
+ "tiny_gpu": 1
11
+ },
12
+ "model_cfg": {
13
+ "unet_in_dim": 4,
14
+ "unet_dim": 320,
15
+ "unet_y_dim": 768,
16
+ "unet_context_dim": 1024,
17
+ "unet_out_dim": 4,
18
+ "unet_dim_mult": [1, 2, 4, 4],
19
+ "unet_num_heads": 8,
20
+ "unet_head_dim": 64,
21
+ "unet_res_blocks": 2,
22
+ "unet_attn_scales": [1, 0.5, 0.25],
23
+ "unet_dropout": 0.1,
24
+ "temporal_attention": "True",
25
+ "num_timesteps": 1000,
26
+ "mean_type": "eps",
27
+ "var_type": "fixed_small",
28
+ "loss_type": "mse"
29
+ }
30
+ },
31
+ "pipeline": {
32
+ "type": "latent-text-to-video-synthesis"
33
+ }
34
+ }
zs2_576w/open_clip_pytorch_model.bin ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:7737c257bec4a587785ae6b9bf52cc0c16f041ef776df6bb60928615059a2878
3
+ size 1972448549
zs2_576w/text2video_pytorch_model.pth ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:e6040b6383685912fa5d3aeb1e84d6efe1d11f4de773c23f3f2a5e97c12ab6b7
3
+ size 2822972283
zs2_XL/configuration.json ADDED
@@ -0,0 +1,34 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ { "framework": "pytorch",
2
+ "task": "text-to-video-synthesis",
3
+ "model": {
4
+ "type": "latent-text-to-video-synthesis",
5
+ "model_args": {
6
+ "ckpt_clip": "open_clip_pytorch_model.bin",
7
+ "ckpt_unet": "text2video_pytorch_model.pth",
8
+ "ckpt_autoencoder": "../VQGAN_autoencoder.pth",
9
+ "max_frames": 16,
10
+ "tiny_gpu": 1
11
+ },
12
+ "model_cfg": {
13
+ "unet_in_dim": 4,
14
+ "unet_dim": 320,
15
+ "unet_y_dim": 768,
16
+ "unet_context_dim": 1024,
17
+ "unet_out_dim": 4,
18
+ "unet_dim_mult": [1, 2, 4, 4],
19
+ "unet_num_heads": 8,
20
+ "unet_head_dim": 64,
21
+ "unet_res_blocks": 2,
22
+ "unet_attn_scales": [1, 0.5, 0.25],
23
+ "unet_dropout": 0.1,
24
+ "temporal_attention": "True",
25
+ "num_timesteps": 1000,
26
+ "mean_type": "eps",
27
+ "var_type": "fixed_small",
28
+ "loss_type": "mse"
29
+ }
30
+ },
31
+ "pipeline": {
32
+ "type": "latent-text-to-video-synthesis"
33
+ }
34
+ }
zs2_XL/open_clip_pytorch_model.bin ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:b25d2b2605ea43e0447eb84b8b08ba027855569f74391ecc9a3abf283f045441
3
+ size 1972448549
zs2_XL/text2video_pytorch_model.pth ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:18dd886130ca1d7228900ac703e88f96f358f040cd56f5392f1d8d7b174ec750
3
+ size 2822972283