Commit afdec82
Parent(s): 30a86e6
Upload 5 files

Files changed:
- app.py +4 -4
- convert_original_stable_diffusion_to_diffusers.py +156 -0
- convert_stable_diffusion_checkpoint_to_onnx.py +265 -0
- v1-inference.yaml +70 -0
- v1-inpainting-inference.yaml +70 -0
app.py CHANGED
@@ -146,7 +146,7 @@ def generate_click(
     global current_pipe
     global scheduler
     global pipe
-
+
     # reset scheduler and pipeline if model is different
     if model_name != model_drop:
         model_name = model_drop
@@ -323,7 +323,7 @@ if __name__ == "__main__":
     # create gradio block
     title = "Stable Diffusion " + str(version.parse(_df_version))
     possibilities = ['TEXT2IMG', 'IMG2IMG', 'Inpainting']
-
+
     with gr.Blocks(title=title) as app:
         with gr.Row():
             with gr.Column(scale=1, min_width=600):
@@ -373,5 +373,5 @@ if __name__ == "__main__":
 
     image_out.style(grid=2)
 
-    app.queue(concurrency_count=1, api_open=
-    app.launch(inbrowser=True, server_name="0.0.0.0" if args.local else "127.0.0.1", show_api=
+    app.queue(concurrency_count=1, api_open=True)
+    app.launch(inbrowser=True, server_name="0.0.0.0" if args.local else "127.0.0.1", show_api=True, quiet=True, share=args.public)  # open to local network: server_name="0.0.0.0"
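Note: the updated launch call reads args.local and args.public, which are parsed elsewhere in app.py and not shown in this diff. A minimal sketch of how such flags could be defined (the exact names of the parser, defaults, and help text here are assumptions):

# Hypothetical sketch only: argparse flags matching the names used in app.launch() above.
# The real definitions live elsewhere in app.py and are not part of this diff.
import argparse

parser = argparse.ArgumentParser(description="Stable Diffusion web UI")
parser.add_argument("--local", action="store_true",
                    help="Bind to 0.0.0.0 so the UI is reachable from the local network")
parser.add_argument("--public", action="store_true",
                    help="Create a public Gradio share link (passed to app.launch(share=...))")
args = parser.parse_args()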
convert_original_stable_diffusion_to_diffusers.py ADDED
@@ -0,0 +1,156 @@
# coding=utf-8
# Copyright 2023 The HuggingFace Inc. team.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
""" Conversion script for the LDM checkpoints. """

import argparse

import torch

from diffusers.pipelines.stable_diffusion.convert_from_ckpt import download_from_original_stable_diffusion_ckpt


if __name__ == "__main__":
    parser = argparse.ArgumentParser()

    parser.add_argument(
        "--checkpoint_path", default=None, type=str, required=True, help="Path to the checkpoint to convert."
    )
    # !wget https://raw.githubusercontent.com/CompVis/stable-diffusion/main/configs/stable-diffusion/v1-inference.yaml
    parser.add_argument(
        "--original_config_file",
        default=None,
        type=str,
        help="The YAML config file corresponding to the original architecture.",
    )
    parser.add_argument(
        "--num_in_channels",
        default=None,
        type=int,
        help="The number of input channels. If `None` number of input channels will be automatically inferred.",
    )
    parser.add_argument(
        "--scheduler_type",
        default="pndm",
        type=str,
        help="Type of scheduler to use. Should be one of ['pndm', 'lms', 'ddim', 'euler', 'euler-ancestral', 'dpm']",
    )
    parser.add_argument(
        "--pipeline_type",
        default=None,
        type=str,
        help=(
            "The pipeline type. One of 'FrozenOpenCLIPEmbedder', 'FrozenCLIPEmbedder', 'PaintByExample'"
            ". If `None` pipeline will be automatically inferred."
        ),
    )
    parser.add_argument(
        "--image_size",
        default=None,
        type=int,
        help=(
            "The image size that the model was trained on. Use 512 for Stable Diffusion v1.X and Stable Diffusion v2"
            " Base. Use 768 for Stable Diffusion v2."
        ),
    )
    parser.add_argument(
        "--prediction_type",
        default=None,
        type=str,
        help=(
            "The prediction type that the model was trained on. Use 'epsilon' for Stable Diffusion v1.X and Stable"
            " Diffusion v2 Base. Use 'v_prediction' for Stable Diffusion v2."
        ),
    )
    parser.add_argument(
        "--extract_ema",
        action="store_true",
        help=(
            "Only relevant for checkpoints that have both EMA and non-EMA weights. Whether to extract the EMA weights"
            " or not. Defaults to `False`. Add `--extract_ema` to extract the EMA weights. EMA weights usually yield"
            " higher quality images for inference. Non-EMA weights are usually better to continue fine-tuning."
        ),
    )
    parser.add_argument(
        "--upcast_attention",
        action="store_true",
        help=(
            "Whether the attention computation should always be upcasted. This is necessary when running stable"
            " diffusion 2.1."
        ),
    )
    parser.add_argument(
        "--from_safetensors",
        action="store_true",
        help="If `--checkpoint_path` is in `safetensors` format, load checkpoint with safetensors instead of PyTorch.",
    )
    parser.add_argument(
        "--to_safetensors",
        action="store_true",
        help="Whether to store pipeline in safetensors format or not.",
    )
    parser.add_argument("--dump_path", default=None, type=str, required=True, help="Path to the output model.")
    parser.add_argument("--device", type=str, help="Device to use (e.g. cpu, cuda:0, cuda:1, etc.)")
    parser.add_argument(
        "--stable_unclip",
        type=str,
        default=None,
        required=False,
        help="Set if this is a stable unCLIP model. One of 'txt2img' or 'img2img'.",
    )
    parser.add_argument(
        "--stable_unclip_prior",
        type=str,
        default=None,
        required=False,
        help="Set if this is a stable unCLIP txt2img model. Selects which prior to use. If `--stable_unclip` is set to `txt2img`, the karlo prior (https://huggingface.co/kakaobrain/karlo-v1-alpha/tree/main/prior) is selected by default.",
    )
    parser.add_argument(
        "--clip_stats_path",
        type=str,
        help="Path to the clip stats file. Only required if the stable unclip model's config specifies `model.params.noise_aug_config.params.clip_stats_path`.",
        required=False,
    )
    parser.add_argument(
        "--controlnet", action="store_true", default=None, help="Set flag if this is a controlnet checkpoint."
    )
    parser.add_argument("--half", action="store_true", help="Save weights in half precision.")
    args = parser.parse_args()

    pipe = download_from_original_stable_diffusion_ckpt(
        checkpoint_path=args.checkpoint_path,
        original_config_file=args.original_config_file,
        image_size=args.image_size,
        prediction_type=args.prediction_type,
        model_type=args.pipeline_type,
        extract_ema=args.extract_ema,
        scheduler_type=args.scheduler_type,
        num_in_channels=args.num_in_channels,
        upcast_attention=args.upcast_attention,
        from_safetensors=args.from_safetensors,
        device=args.device,
        stable_unclip=args.stable_unclip,
        stable_unclip_prior=args.stable_unclip_prior,
        clip_stats_path=args.clip_stats_path,
        controlnet=args.controlnet,
    )

    if args.half:
        pipe.to(torch_dtype=torch.float16)

    if args.controlnet:
        # only save the controlnet model
        pipe.controlnet.save_pretrained(args.dump_path, safe_serialization=args.to_safetensors)
    else:
        pipe.save_pretrained(args.dump_path, safe_serialization=args.to_safetensors)
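Note: once this script has written a diffusers folder to --dump_path, it can be loaded like any other diffusers pipeline. A minimal sketch, assuming a placeholder output directory named "converted-model" and a CUDA GPU:

# Minimal sketch: smoke-test a converted checkpoint.
# "converted-model" is a placeholder for whatever was passed as --dump_path.
import torch
from diffusers import StableDiffusionPipeline

# assumes a CUDA GPU; drop torch_dtype and .to("cuda") to run on CPU
pipe = StableDiffusionPipeline.from_pretrained("converted-model", torch_dtype=torch.float16)
pipe = pipe.to("cuda")

image = pipe("a photo of an astronaut riding a horse").images[0]
image.save("test.png")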
convert_stable_diffusion_checkpoint_to_onnx.py ADDED
@@ -0,0 +1,265 @@
# Copyright 2023 The HuggingFace Team. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

import argparse
import os
import shutil
from pathlib import Path

import onnx
import torch
from packaging import version
from torch.onnx import export

from diffusers import OnnxRuntimeModel, OnnxStableDiffusionPipeline, StableDiffusionPipeline


is_torch_less_than_1_11 = version.parse(version.parse(torch.__version__).base_version) < version.parse("1.11")


def onnx_export(
    model,
    model_args: tuple,
    output_path: Path,
    ordered_input_names,
    output_names,
    dynamic_axes,
    opset,
    use_external_data_format=False,
):
    output_path.parent.mkdir(parents=True, exist_ok=True)
    # PyTorch deprecated the `enable_onnx_checker` and `use_external_data_format` arguments in v1.11,
    # so we check the torch version for backwards compatibility
    if is_torch_less_than_1_11:
        export(
            model,
            model_args,
            f=output_path.as_posix(),
            input_names=ordered_input_names,
            output_names=output_names,
            dynamic_axes=dynamic_axes,
            do_constant_folding=True,
            use_external_data_format=use_external_data_format,
            enable_onnx_checker=True,
            opset_version=opset,
        )
    else:
        export(
            model,
            model_args,
            f=output_path.as_posix(),
            input_names=ordered_input_names,
            output_names=output_names,
            dynamic_axes=dynamic_axes,
            do_constant_folding=True,
            opset_version=opset,
        )


@torch.no_grad()
def convert_models(model_path: str, output_path: str, opset: int, fp16: bool = False):
    dtype = torch.float16 if fp16 else torch.float32
    if fp16 and torch.cuda.is_available():
        device = "cuda"
    elif fp16 and not torch.cuda.is_available():
        raise ValueError("`float16` model export is only supported on GPUs with CUDA")
    else:
        device = "cpu"
    pipeline = StableDiffusionPipeline.from_pretrained(model_path, torch_dtype=dtype).to(device)
    output_path = Path(output_path)

    # TEXT ENCODER
    num_tokens = pipeline.text_encoder.config.max_position_embeddings
    text_hidden_size = pipeline.text_encoder.config.hidden_size
    text_input = pipeline.tokenizer(
        "A sample prompt",
        padding="max_length",
        max_length=pipeline.tokenizer.model_max_length,
        truncation=True,
        return_tensors="pt",
    )
    onnx_export(
        pipeline.text_encoder,
        # casting to torch.int32 until the CLIP fix is released: https://github.com/huggingface/transformers/pull/18515/files
        model_args=(text_input.input_ids.to(device=device, dtype=torch.int32)),
        output_path=output_path / "text_encoder" / "model.onnx",
        ordered_input_names=["input_ids"],
        output_names=["last_hidden_state", "pooler_output"],
        dynamic_axes={
            "input_ids": {0: "batch", 1: "sequence"},
        },
        opset=opset,
    )
    del pipeline.text_encoder

    # UNET
    unet_in_channels = pipeline.unet.config.in_channels
    unet_sample_size = pipeline.unet.config.sample_size
    unet_path = output_path / "unet" / "model.onnx"
    onnx_export(
        pipeline.unet,
        model_args=(
            torch.randn(2, unet_in_channels, unet_sample_size, unet_sample_size).to(device=device, dtype=dtype),
            torch.randn(2).to(device=device, dtype=dtype),
            torch.randn(2, num_tokens, text_hidden_size).to(device=device, dtype=dtype),
            False,
        ),
        output_path=unet_path,
        ordered_input_names=["sample", "timestep", "encoder_hidden_states", "return_dict"],
        output_names=["out_sample"],  # has to be different from "sample" for correct tracing
        dynamic_axes={
            "sample": {0: "batch", 1: "channels", 2: "height", 3: "width"},
            "timestep": {0: "batch"},
            "encoder_hidden_states": {0: "batch", 1: "sequence"},
        },
        opset=opset,
        use_external_data_format=True,  # UNet is > 2GB, so the weights need to be split
    )
    unet_model_path = str(unet_path.absolute().as_posix())
    unet_dir = os.path.dirname(unet_model_path)
    unet = onnx.load(unet_model_path)
    # clean up existing tensor files
    shutil.rmtree(unet_dir)
    os.mkdir(unet_dir)
    # collate external tensor files into one
    onnx.save_model(
        unet,
        unet_model_path,
        save_as_external_data=True,
        all_tensors_to_one_file=True,
        location="weights.pb",
        convert_attribute=False,
    )
    del pipeline.unet

    # VAE ENCODER
    vae_encoder = pipeline.vae
    vae_in_channels = vae_encoder.config.in_channels
    vae_sample_size = vae_encoder.config.sample_size
    # need to get the raw tensor output (sample) from the encoder
    vae_encoder.forward = lambda sample, return_dict: vae_encoder.encode(sample, return_dict)[0].sample()
    onnx_export(
        vae_encoder,
        model_args=(
            torch.randn(1, vae_in_channels, vae_sample_size, vae_sample_size).to(device=device, dtype=dtype),
            False,
        ),
        output_path=output_path / "vae_encoder" / "model.onnx",
        ordered_input_names=["sample", "return_dict"],
        output_names=["latent_sample"],
        dynamic_axes={
            "sample": {0: "batch", 1: "channels", 2: "height", 3: "width"},
        },
        opset=opset,
    )

    # VAE DECODER
    vae_decoder = pipeline.vae
    vae_latent_channels = vae_decoder.config.latent_channels
    vae_out_channels = vae_decoder.config.out_channels
    # forward only through the decoder part
    vae_decoder.forward = vae_encoder.decode
    onnx_export(
        vae_decoder,
        model_args=(
            torch.randn(1, vae_latent_channels, unet_sample_size, unet_sample_size).to(device=device, dtype=dtype),
            False,
        ),
        output_path=output_path / "vae_decoder" / "model.onnx",
        ordered_input_names=["latent_sample", "return_dict"],
        output_names=["sample"],
        dynamic_axes={
            "latent_sample": {0: "batch", 1: "channels", 2: "height", 3: "width"},
        },
        opset=opset,
    )
    del pipeline.vae

    # SAFETY CHECKER
    if pipeline.safety_checker is not None:
        safety_checker = pipeline.safety_checker
        clip_num_channels = safety_checker.config.vision_config.num_channels
        clip_image_size = safety_checker.config.vision_config.image_size
        safety_checker.forward = safety_checker.forward_onnx
        onnx_export(
            pipeline.safety_checker,
            model_args=(
                torch.randn(
                    1,
                    clip_num_channels,
                    clip_image_size,
                    clip_image_size,
                ).to(device=device, dtype=dtype),
                torch.randn(1, vae_sample_size, vae_sample_size, vae_out_channels).to(device=device, dtype=dtype),
            ),
            output_path=output_path / "safety_checker" / "model.onnx",
            ordered_input_names=["clip_input", "images"],
            output_names=["out_images", "has_nsfw_concepts"],
            dynamic_axes={
                "clip_input": {0: "batch", 1: "channels", 2: "height", 3: "width"},
                "images": {0: "batch", 1: "height", 2: "width", 3: "channels"},
            },
            opset=opset,
        )
        del pipeline.safety_checker
        safety_checker = OnnxRuntimeModel.from_pretrained(output_path / "safety_checker")
        feature_extractor = pipeline.feature_extractor
    else:
        safety_checker = None
        feature_extractor = None

    onnx_pipeline = OnnxStableDiffusionPipeline(
        vae_encoder=OnnxRuntimeModel.from_pretrained(output_path / "vae_encoder"),
        vae_decoder=OnnxRuntimeModel.from_pretrained(output_path / "vae_decoder"),
        text_encoder=OnnxRuntimeModel.from_pretrained(output_path / "text_encoder"),
        tokenizer=pipeline.tokenizer,
        unet=OnnxRuntimeModel.from_pretrained(output_path / "unet"),
        scheduler=pipeline.scheduler,
        safety_checker=safety_checker,
        feature_extractor=feature_extractor,
        requires_safety_checker=safety_checker is not None,
    )

    onnx_pipeline.save_pretrained(output_path)
    print("ONNX pipeline saved to", output_path)

    del pipeline
    del onnx_pipeline
    _ = OnnxStableDiffusionPipeline.from_pretrained(output_path, provider="CPUExecutionProvider")
    print("ONNX pipeline is loadable")


if __name__ == "__main__":
    parser = argparse.ArgumentParser()

    parser.add_argument(
        "--model_path",
        type=str,
        required=True,
        help="Path to the `diffusers` checkpoint to convert (either a local directory or on the Hub).",
    )

    parser.add_argument("--output_path", type=str, required=True, help="Path to the output model.")

    parser.add_argument(
        "--opset",
        default=14,
        type=int,
        help="The version of the ONNX operator set to use.",
    )
    parser.add_argument("--fp16", action="store_true", default=False, help="Export the models in `float16` mode")

    args = parser.parse_args()

    convert_models(args.model_path, args.output_path, args.opset, args.fp16)
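Note: the exported folder can be run with onnxruntime through OnnxStableDiffusionPipeline, just as the script's own load check does. A minimal sketch, assuming a placeholder output directory named "onnx-model" and the CPU execution provider (GPU or DirectML builds of onnxruntime would use a different provider):

# Minimal sketch: run the exported ONNX pipeline.
# "onnx-model" is a placeholder for whatever was passed as --output_path.
from diffusers import OnnxStableDiffusionPipeline

pipe = OnnxStableDiffusionPipeline.from_pretrained("onnx-model", provider="CPUExecutionProvider")
image = pipe("a photo of an astronaut riding a horse", num_inference_steps=25).images[0]
image.save("test_onnx.png")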
v1-inference.yaml ADDED
@@ -0,0 +1,70 @@
model:
  base_learning_rate: 1.0e-04
  target: ldm.models.diffusion.ddpm.LatentDiffusion
  params:
    linear_start: 0.00085
    linear_end: 0.0120
    num_timesteps_cond: 1
    log_every_t: 200
    timesteps: 1000
    first_stage_key: "jpg"
    cond_stage_key: "txt"
    image_size: 64
    channels: 4
    cond_stage_trainable: false   # Note: different from the one we trained before
    conditioning_key: crossattn
    monitor: val/loss_simple_ema
    scale_factor: 0.18215
    use_ema: False

    scheduler_config: # 10000 warmup steps
      target: ldm.lr_scheduler.LambdaLinearScheduler
      params:
        warm_up_steps: [ 10000 ]
        cycle_lengths: [ 10000000000000 ] # incredibly large number to prevent corner cases
        f_start: [ 1.e-6 ]
        f_max: [ 1. ]
        f_min: [ 1. ]

    unet_config:
      target: ldm.modules.diffusionmodules.openaimodel.UNetModel
      params:
        image_size: 32 # unused
        in_channels: 4
        out_channels: 4
        model_channels: 320
        attention_resolutions: [ 4, 2, 1 ]
        num_res_blocks: 2
        channel_mult: [ 1, 2, 4, 4 ]
        num_heads: 8
        use_spatial_transformer: True
        transformer_depth: 1
        context_dim: 768
        use_checkpoint: True
        legacy: False

    first_stage_config:
      target: ldm.models.autoencoder.AutoencoderKL
      params:
        embed_dim: 4
        monitor: val/rec_loss
        ddconfig:
          double_z: true
          z_channels: 4
          resolution: 256
          in_channels: 3
          out_ch: 3
          ch: 128
          ch_mult:
          - 1
          - 2
          - 4
          - 4
          num_res_blocks: 2
          attn_resolutions: []
          dropout: 0.0
        lossconfig:
          target: torch.nn.Identity

    cond_stage_config:
      target: ldm.modules.encoders.modules.FrozenCLIPEmbedder
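Note: this is the original-architecture config that --original_config_file in convert_original_stable_diffusion_to_diffusers.py expects for v1.x checkpoints (the wget comment in that script points at the same file). A minimal sketch of using it programmatically, with a placeholder checkpoint filename:

# Minimal sketch: convert a v1.x checkpoint using this config.
# The checkpoint filename and output directory are placeholders.
from diffusers.pipelines.stable_diffusion.convert_from_ckpt import download_from_original_stable_diffusion_ckpt

pipe = download_from_original_stable_diffusion_ckpt(
    checkpoint_path="v1-5-pruned-emaonly.ckpt",     # placeholder checkpoint
    original_config_file="v1-inference.yaml",       # the config added above
)
pipe.save_pretrained("converted-model")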
v1-inpainting-inference.yaml ADDED
@@ -0,0 +1,70 @@
model:
  base_learning_rate: 7.5e-05
  target: ldm.models.diffusion.ddpm.LatentInpaintDiffusion
  params:
    linear_start: 0.00085
    linear_end: 0.0120
    num_timesteps_cond: 1
    log_every_t: 200
    timesteps: 1000
    first_stage_key: "jpg"
    cond_stage_key: "txt"
    image_size: 64
    channels: 4
    cond_stage_trainable: false   # Note: different from the one we trained before
    conditioning_key: hybrid   # important
    monitor: val/loss_simple_ema
    scale_factor: 0.18215
    finetune_keys: null

    scheduler_config: # 10000 warmup steps
      target: ldm.lr_scheduler.LambdaLinearScheduler
      params:
        warm_up_steps: [ 2500 ] # NOTE for resuming. use 10000 if starting from scratch
        cycle_lengths: [ 10000000000000 ] # incredibly large number to prevent corner cases
        f_start: [ 1.e-6 ]
        f_max: [ 1. ]
        f_min: [ 1. ]

    unet_config:
      target: ldm.modules.diffusionmodules.openaimodel.UNetModel
      params:
        image_size: 32 # unused
        in_channels: 9  # 4 data + 4 downscaled image + 1 mask
        out_channels: 4
        model_channels: 320
        attention_resolutions: [ 4, 2, 1 ]
        num_res_blocks: 2
        channel_mult: [ 1, 2, 4, 4 ]
        num_heads: 8
        use_spatial_transformer: True
        transformer_depth: 1
        context_dim: 768
        use_checkpoint: True
        legacy: False

    first_stage_config:
      target: ldm.models.autoencoder.AutoencoderKL
      params:
        embed_dim: 4
        monitor: val/rec_loss
        ddconfig:
          double_z: true
          z_channels: 4
          resolution: 256
          in_channels: 3
          out_ch: 3
          ch: 128
          ch_mult:
          - 1
          - 2
          - 4
          - 4
          num_res_blocks: 2
          attn_resolutions: []
          dropout: 0.0
        lossconfig:
          target: torch.nn.Identity

    cond_stage_config:
      target: ldm.modules.encoders.modules.FrozenCLIPEmbedder
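Note: this variant describes the 9-channel inpainting UNet (4 latent + 4 masked-image + 1 mask channels, conditioning_key: hybrid), so it pairs with inpainting checkpoints rather than the standard txt2img ones. A minimal sketch, with a placeholder checkpoint filename and num_in_channels=9 passed explicitly (the conversion script exposes the same value via --num_in_channels):

# Minimal sketch: convert an inpainting checkpoint using this config.
# The checkpoint filename and output directory are placeholders.
from diffusers.pipelines.stable_diffusion.convert_from_ckpt import download_from_original_stable_diffusion_ckpt

pipe = download_from_original_stable_diffusion_ckpt(
    checkpoint_path="sd-v1-5-inpainting.ckpt",            # placeholder checkpoint
    original_config_file="v1-inpainting-inference.yaml",  # the config added above
    num_in_channels=9,                                    # mirrors unet_config.params.in_channels
)
pipe.save_pretrained("converted-inpainting-model")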