yibolu committed · Commit 8881820 · Parent(s): ec9c23f

update lyrasd2
Browse files
- .gitattributes +5 -16
- .gitignore +4 -2
- CHANGELOG.md +0 -4
- LISENCE +0 -494
- README.md +104 -50
- control_bird_canny.png +0 -0
- controlnet_img2img_demo.py +62 -0
- controlnet_txt2img_demo.py +63 -0
- demo.py +0 -12
- img2img_demo.py +47 -0
- lyraSD/__init__.py +0 -1
- lyraSD/inference.py +0 -85
- lyraSD/muse_trt/__init__.py +0 -10
- lyraSD/muse_trt/models.py +0 -149
- lyraSD/muse_trt/sd_img2img.py +0 -368
- lyraSD/muse_trt/sd_text2img.py +0 -292
- lyraSD/muse_trt/super.py +0 -64
- lyraSD/muse_trt/utilities.py +0 -538
- lyrasd_model/__init__.py +5 -0
- lyrasd_model/lora_util.py +54 -0
- lyrasd_model/lyrasd_controlnet_img2img_pipeline.py +637 -0
- lyrasd_model/lyrasd_controlnet_txt2img_pipeline.py +547 -0
- lyrasd_model/lyrasd_img2img_pipeline.py +554 -0
- lyraSD/muse_trt/libnvinfer_plugin.so → lyrasd_model/lyrasd_lib/libth_lyrasd_cu11_sm80.so +2 -2
- sd1.4-engine/superx4-512-512.plan → lyrasd_model/lyrasd_lib/libth_lyrasd_cu11_sm86.so +2 -2
- sd1.4-engine/clip.plan → lyrasd_model/lyrasd_lib/libth_lyrasd_cu12_sm80.so +2 -2
- sd1.4-engine/vae-decoder.plan → lyrasd_model/lyrasd_lib/libth_lyrasd_cu12_sm86.so +2 -2
- lyrasd_model/lyrasd_lib/placeholder.txt +0 -0
- lyrasd_model/lyrasd_txt2img_pipeline.py +458 -0
- models/README.md +12 -0
- output/img2img_demo.jpg +0 -0
- output/img2img_input.jpg +0 -0
- output/text2img_demo.jpg +0 -0
- outputs/res_controlnet_img2img_0.png +0 -0
- outputs/res_controlnet_txt2img_0.png +0 -0
- outputs/res_img2img_0.png +0 -0
- outputs/res_txt2img_0.png +0 -0
- outputs/res_txt2img_lora_0.png +0 -0
- requirements.txt +2 -0
- sd1.4-engine/feature_extractor/preprocessor_config.json +0 -28
- sd1.4-engine/scheduler/scheduler_config.json +0 -14
- sd1.4-engine/text_encoder/config.json +0 -25
- sd1.4-engine/tokenizer/merges.txt +0 -0
- sd1.4-engine/tokenizer/special_tokens_map.json +0 -24
- sd1.4-engine/tokenizer/tokenizer_config.json +0 -34
- sd1.4-engine/tokenizer/vocab.json +0 -0
- sd1.4-engine/unet_fp16.plan +0 -3
- sd1.4-engine/vae/config.json +0 -31
- sd1.4-engine/vae/diffusion_pytorch_model.bin +0 -3
- txt2img_demo.py +44 -0
.gitattributes
CHANGED
@@ -25,6 +25,7 @@
 *.safetensors filter=lfs diff=lfs merge=lfs -text
 saved_model/**/* filter=lfs diff=lfs merge=lfs -text
 *.tar.* filter=lfs diff=lfs merge=lfs -text
+*.tar filter=lfs diff=lfs merge=lfs -text
 *.tflite filter=lfs diff=lfs merge=lfs -text
 *.tgz filter=lfs diff=lfs merge=lfs -text
 *.wasm filter=lfs diff=lfs merge=lfs -text
@@ -32,19 +33,7 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
 *.zip filter=lfs diff=lfs merge=lfs -text
 *.zst filter=lfs diff=lfs merge=lfs -text
 *tfevents* filter=lfs diff=lfs merge=lfs -text
-
-
-
-
-sd1.5-engine/vae-decoder.plan filter=lfs diff=lfs merge=lfs -text
-sd1.5-engine/vae-encoder.plan filter=lfs diff=lfs merge=lfs -text
-sd1.5-engine/scheduler filter=lfs diff=lfs merge=lfs -text
-sd1.5-engine/superx4-512-512.plan filter=lfs diff=lfs merge=lfs -text
-sd1.5-engine/text_encoder filter=lfs diff=lfs merge=lfs -text
-sd1.5-engine/tokenizer filter=lfs diff=lfs merge=lfs -text
-sd1.5-engine/vae filter=lfs diff=lfs merge=lfs -text
-sd1.5-engine/feature_extractor filter=lfs diff=lfs merge=lfs -text
-sd1.4-engine/clip.plan filter=lfs diff=lfs merge=lfs -text
-sd1.4-engine/superx4-512-512.plan filter=lfs diff=lfs merge=lfs -text
-sd1.4-engine/unet_fp16.plan filter=lfs diff=lfs merge=lfs -text
-sd1.4-engine/vae-decoder.plan filter=lfs diff=lfs merge=lfs -text
+lyrasd_model/lyrasd_lib/libth_lyrasd_cu11_sm80.so filter=lfs diff=lfs merge=lfs -text
+lyrasd_model/lyrasd_lib/libth_lyrasd_cu11_sm86.so filter=lfs diff=lfs merge=lfs -text
+lyrasd_model/lyrasd_lib/libth_lyrasd_cu12_sm80.so filter=lfs diff=lfs merge=lfs -text
+lyrasd_model/lyrasd_lib/libth_lyrasd_cu12_sm86.so filter=lfs diff=lfs merge=lfs -text
.gitignore
CHANGED
@@ -1,3 +1,5 @@
-
+.idea
 *.pyc
-
+.vscode
+__pycache__
+models/lyrasd*
CHANGELOG.md
DELETED
@@ -1,4 +0,0 @@
-## V1.0
-
-- Add accelerated Stable Diffusion pretrained model v1.4 (from: https://huggingface.co/CompVis/stable-diffusion-v1-4)
-- Add accelerated Real-ESRGAN(4x) (from https://github.com/xinntao/Real-ESRGAN)
LISENCE
DELETED
@@ -1,494 +0,0 @@
CreativeML Open RAIL-M License

Copyright (c) 2023 Tencent Music Entertainment

Terms of the CreativeML Open RAIL-M:
--------------------------------------------------------------------
Copyright (c) 2022 Robin Rombach and Patrick Esser and contributors

CreativeML Open RAIL-M, dated August 22, 2022

Section I: PREAMBLE - open and responsible downstream use of the accompanying model; derivative versions must carry, at minimum, the same use-based restrictions.
Section II: INTELLECTUAL PROPERTY RIGHTS - grant of copyright license (2) and grant of patent license (3) covering the Model, Derivatives of the Model, and the Complementary Material.
Section III: CONDITIONS OF USAGE, DISTRIBUTION AND REDISTRIBUTION - distribution and redistribution (4), use-based restrictions (5), the Output You Generate (6).
Section IV: OTHER PROVISIONS - updates and runtime restrictions (7), trademarks (8), disclaimer of warranty (9), limitation of liability (10), accepting warranty or additional liability (11), severability (12).
Attachment A: Use Restrictions - the standard RAIL-M list of prohibited uses (unlawful use, harm to minors, disinformation, harmful personal data, defamation and harassment, fully automated adverse decision making, discrimination, exploitation of vulnerable groups, medical advice, and related uses).

Other dependencies and licenses:

Open Source Software Licensed under the CreativeML Open RAIL-M License:
--------------------------------------------------------------------
1. stable-diffusion
   Files: https://huggingface.co/CompVis/stable-diffusion-v1-4
   License: CreativeML Open RAIL-M
   For details: https://huggingface.co/spaces/CompVis/stable-diffusion-license

A copy of the MIT License is included in this file.

Open Source Software Licensed under the Apache License Version 2.0:
--------------------------------------------------------------------
1. huggingface/diffusers: https://github.com/huggingface/diffusers (license: https://github.com/huggingface/diffusers/blob/main/LICENSE)
2. huggingface/transformers: Copyright 2018- The Hugging Face team. All rights reserved.
3. NVIDIA/TensorRT: Copyright 2021 NVIDIA Corporation
4. TensorRT/tools/Polygraphy: Copyright 2020 NVIDIA Corporation
(followed by the full text of the Apache License, Version 2.0)

Open Source Software Licensed under the Modified BSD License:
--------------------------------------------------------------------
1. NumPy: Copyright (c) 2005-2023, NumPy Developers. All rights reserved.
2. PyTorch: Copyright (c) 2016- Facebook, Inc (Adam Paszke) and the other PyTorch, Caffe2, and Caffe contributors (Idiap Research Institute, Deepmind Technologies, NEC Laboratories America, NYU, Google, Yangqing Jia, Kakao Brain, Cruise LLC, and others).
(followed by the full text of the Modified BSD License)

Open Source Software Licensed under the BSD 3-Clause License:
--------------------------------------------------------------------
1. scipy: Copyright (c) 2001-2002 Enthought, Inc. 2003-2023, SciPy Developers.
(followed by the full text of the BSD 3-Clause License)

Open Source Software Licensed under the Python Software Foundation License Version 2:
--------------------------------------------------------------------------
1. Python/cpython: Copyright © 2001-2023 Python Software Foundation. All rights reserved.
(followed by the history of the software and the full text of the Python Software Foundation License Version 2)
README.md
CHANGED
@@ -4,19 +4,23 @@ language:
 - en
 tags:
 - art
--
+- Stable Diffusion
 ---
-## Model Card for
+## Model Card for lyraSD2
 
-
+lyraSD2 is currently the fastest available **Stable Diffusion** implementation that matches the outputs of **diffusers** exactly, with an inference cost of only **0.52 seconds** for a 512x512 image, up to **80% faster** than the original version.
 
 Among its main features are:
 
--
--
--
--
--
+- 4 commonly used pipelines
+  - Text2Img
+  - Img2Img
+  - ControlNetText2Img
+  - ControlNetImg2Img
+- 100% likeness to diffusers output
+- ControlNet Hot Swap: can hot-swap ControlNet model weights within 0.4s (0s if cached)
+- Lora Hot Swap: can hot-swap a Lora within 0.5s (0.1s if cached)
+- Device requirements: Nvidia Ampere architecture (A100, A10) or compatible
 
 ## Speed
 
@@ -25,75 +29,125 @@ Among its main features are:
|
|
25 |
- device: Nvidia A100 40G
|
26 |
- img size: 512x512
|
27 |
- percision:fp16
|
28 |
-
- steps:
|
29 |
-
-
|
30 |
-
|
31 |
-
### text2img
|
32 |
-
|model|time cost(ms)|memory(MB)|
|
33 |
-
|:-:|:-:|:-:|
|
34 |
-
|Pytorch SD|~5000ms|~10240|
|
35 |
-
|lyraSD|~435ms|~4026|
|
36 |
-
|
37 |
-
### superResolution(SR)
|
38 |
-
|model|time cost(ms)|memory(MB)|
|
39 |
-
|:-:|:-:|:-:|
|
40 |
-
|Pytorch SR|~720ms|~6650|
|
41 |
-
|lyraSD|~26ms|~1600|
|
42 |
-
|
43 |
-
|
44 |
|
|
|
|
|
|
|
|
|
|
|
45 |
|
|
|
|
|
|
|
|
|
|
|
46 |
|
47 |
## Model Sources
|
48 |
|
49 |
-
- **
|
|
|
|
|
50 |
|
51 |
-
## Uses
|
52 |
|
53 |
```python
|
54 |
-
|
55 |
-
|
56 |
-
|
57 |
-
|
58 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
59 |
|
60 |
-
from PIL import Image
|
61 |
-
i2imodel = LyraSD("img2img", "./sd1.4-engine")
|
62 |
-
demo_img = Image.open("output/text2img_demo.jpg")
|
63 |
-
i2imodel.inference(prompt="A fantasy landscape, trending on artstation", image=demo_img)
|
64 |
|
65 |
```
|
66 |
## Demo output
|
67 |
|
68 |
-
###
|
69 |
-
|
|
|
70 |
|
71 |
-
|
|
|
72 |
|
73 |
-
|
74 |
|
75 |
-
|
|
|
76 |
|
|
|
|
|
77 |
|
|
|
78 |
|
79 |
-
|
|
|
80 |
|
81 |
-
|
82 |
-
|
83 |
-
|
84 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
85 |
```
|
86 |
|
87 |
## Citation
|
88 |
``` bibtex
|
89 |
-
@Misc{
|
90 |
-
author = {Kangjian Wu, Zhengtao Wang, Bin Wu},
|
91 |
-
title = {
|
92 |
-
howpublished = {\url{https://huggingface.co/TMElyralab/
|
93 |
year = {2023}
|
94 |
}
|
95 |
```
|
96 |
|
97 |
## Report bug
|
98 |
-
- start a discussion to report any bugs!--> https://huggingface.co/TMElyralab/
|
99 |
- report bug with a `[bug]` mark in the title.
|
|
|
4 |
- en
|
5 |
tags:
|
6 |
- art
|
7 |
+
- Stable Diffusion
|
8 |
---
|
9 |
+
## Model Card for lyraSD2
|
10 |
|
11 |
+
lyraSD2 is currently the **fastest Stable Diffusion model** that can 100% align the outputs of **diffusers** available, boasting an inference cost of only **0.52 seconds** for a 512x512 image, accelerating the process up to **80% faster** than the original version.
|
12 |
|
13 |
Among its main features are:
|
14 |
|
15 |
+
- 4 Commonly used Pipelines
|
16 |
+
- - Text2Img
|
17 |
+
- - Img2Img
|
18 |
+
- - ControlNetText2Img
|
19 |
+
- - ControlNetImg2Img
|
20 |
+
- 100% likeness to diffusers output
|
21 |
+
- ControlNet Hot Swap: Can hot swap a ControlNet model weights within 0.4s (0s if cached)
|
22 |
+
- Lora How Swap: Can hot swap a Lora within 0.5s (0.1s if cached)
|
23 |
+
- device requirements: Nvidia Ampere architecture (A100, A10) or compatable
|
24 |
|
25 |
## Speed
|
26 |
|
|
|
29 |
- device: Nvidia A100 40G
|
30 |
- img size: 512x512
|
31 |
- percision:fp16
|
32 |
+
- steps: 20
|
33 |
+
- sampler: EulerA
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
34 |
|
35 |
+
### Text2Img
|
36 |
+
|model|time cost(ms)|
|
37 |
+
|:-:|:-:|
|
38 |
+
|torch2.0.1 + diffusers|~667ms|
|
39 |
+
|lyraSD|~528ms|
|
40 |
|
41 |
+
### ControlNet-Text2Img
|
42 |
+
|model|time cost(ms)|
|
43 |
+
|:-:|:-:|
|
44 |
+
|torch2.0.1 + diffusers|~930ms|
|
45 |
+
|lyraSD2|~745ms|
|
46 |
|
47 |
## Model Sources
|
48 |
|
49 |
+
- **Checkpoint:** https://civitai.com/models/7371/rev-animated
|
50 |
+
- **ControlNet:** https://huggingface.co/lllyasviel/sd-controlnet-canny
|
51 |
+
- **Lora:** https://civitai.com/models/18323?modelVersionId=46846
|
52 |
|
53 |
+
## Text2Img Uses
|
54 |
|
55 |
```python
|
56 |
+
import torch
|
57 |
+
import time
|
58 |
+
|
59 |
+
from lyrasd_model import LyraSdTxt2ImgPipeline
|
60 |
+
|
61 |
+
# 存放模型文件的路径,应该包含一下结构:
|
62 |
+
# 1. clip 模型
|
63 |
+
# 2. 转换好的优化后的 unet 模型,放入其中的 unet_bins 文件夹
|
64 |
+
# 3. vae 模型
|
65 |
+
# 4. scheduler 配置
|
66 |
+
|
67 |
+
# LyraSD 的 C++ 编译动态链接库,其中包含 C++ CUDA 计算的细节
|
68 |
+
lib_path = "./lyrasd_model/lyrasd_lib/libth_lyrasd_cu11_sm80.so"
|
69 |
+
model_path = "./models/lyrasd_rev_animated"
|
70 |
+
lora_path = "./models/lyrasd_xiaorenshu_lora"
|
71 |
+
|
72 |
+
# 构建 Txt2Img 的 Pipeline
|
73 |
+
model = LyraSdTxt2ImgPipeline(model_path, lib_path)
|
74 |
+
|
75 |
+
# load lora
|
76 |
+
# 参数分别为 lora 存放位置,名字,lora 强度,lora模型精度
|
77 |
+
model.load_lora(lora_path, "xiaorenshu", 0.4, "fp32")
|
78 |
+
|
79 |
+
# 准备应用的输入和超参数
|
80 |
+
prompt = "a cat, cute, cartoon, concise, traditional, chinese painting, Tang and Song Dynasties, masterpiece, 4k, 8k, UHD, best quality"
|
81 |
+
negative_prompt = "(((horrible))), (((scary))), (((naked))), (((large breasts))), high saturation, colorful, human:2, body:2, low quality, bad quality, lowres, out of frame, duplicate, watermark, signature, text, frames, cut, cropped, malformed limbs, extra limbs, (((missing arms))), (((missing legs)))"
|
82 |
+
height, width = 512, 512
|
83 |
+
steps = 30
|
84 |
+
guidance_scale = 7
|
85 |
+
generator = torch.Generator().manual_seed(123)
|
86 |
+
num_images = 1
|
87 |
+
|
88 |
+
start = time.perf_counter()
|
89 |
+
# 推理生成
|
90 |
+
images = model(prompt, height, width, steps,
|
91 |
+
guidance_scale, negative_prompt, num_images,
|
92 |
+
generator=generator)
|
93 |
+
print("image gen cost: ",time.perf_counter() - start)
|
94 |
+
# 存储生成的图片
|
95 |
+
for i, image in enumerate(images):
|
96 |
+
image.save(f"outputs/res_txt2img_lora_{i}.png")
|
97 |
+
|
98 |
+
# unload lora,参数为 lora 的名字,是否清除 lora 缓存
|
99 |
+
# model.unload_lora("xiaorenshu", True)
|
100 |
|
|
|
|
|
|
|
|
|
101 |
|
102 |
```
|
103 |
## Demo output
|
104 |
|
105 |
+
### Text2Img
|
106 |
+
#### Text2Img without Lora
|
107 |
+

|
108 |
|
109 |
+
#### Text2Img with Lora
|
110 |
+

|
111 |
|
112 |
+
### Img2Img
|
113 |
|
114 |
+
#### Img2Img input
|
115 |
+
<img src="https://chuangxin-research-1258344705.cos.ap-guangzhou.myqcloud.com/share/files/seaside_town.png?q-sign-algorithm=sha1&q-ak=AKIDBF6i7GCtKWS8ZkgOtACzX3MQDl37xYty&q-sign-time=1692601590;1865401590&q-key-time=1692601590;1865401590&q-header-list=&q-url-param-list=&q-signature=ca04ca92d990d94813029c0d9ef29537e5f4637c" alt="img2img input" width="512"/>
|
116 |
|
117 |
+
#### Img2Img output
|
118 |
+

|
119 |
|
120 |
+
### ControlNet Text2Img
|
121 |
|
122 |
+
#### Control Image
|
123 |
+

|
124 |
|
125 |
+
#### ControlNet Text2Img Output
|
126 |
+

|
127 |
+
|
128 |
+
## Docker Environment Recommendation

- For CUDA 11.x: we recommend `nvcr.io/nvidia/pytorch:22.12-py3`
- For CUDA 12.0: we recommend `nvcr.io/nvidia/pytorch:23.02-py3`

```bash
docker pull nvcr.io/nvidia/pytorch:23.02-py3
docker run --rm -it --gpus all -v ./:/lyraSD2 nvcr.io/nvidia/pytorch:23.02-py3

pip install -r requirements.txt
python txt2img_demo.py
```
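Once inside the container, a quick sanity check (a sketch, not part of the repo) helps confirm that the image's CUDA build and the visible GPU line up with the `cu11`/`cu12` and `sm80`/`sm86` suffixes of the bundled libraries:

```python
import torch

print("torch:", torch.__version__)
print("CUDA runtime:", torch.version.cuda)          # should start with 11 or 12
print("GPU:", torch.cuda.get_device_name(0))
# (8, 0) corresponds to sm80, (8, 6) to sm86
print("compute capability:", torch.cuda.get_device_capability(0))
```
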

## Citation
```bibtex
@Misc{lyraSD2_2023,
  author = {Kangjian Wu, Zhengtao Wang, Yibo Lu, Haoxiong Su, Bin Wu},
  title = {lyraSD2: Accelerating Stable Diffusion with best flexibility},
  howpublished = {\url{https://huggingface.co/TMElyralab/lyraSD2}},
  year = {2023}
}
```

## Report bug
- Start a discussion to report any bugs: https://huggingface.co/TMElyralab/lyraSD2/discussions
- Report bugs with a `[bug]` mark in the title.
|
control_bird_canny.png
ADDED
controlnet_img2img_demo.py
ADDED
@@ -0,0 +1,62 @@
from io import BytesIO

import requests
import torch
from PIL import Image

from lyrasd_model import LyraSdControlnetImg2ImgPipeline

# Path to the model directory, which should contain:
# 1. the CLIP model
# 2. the converted and optimized UNet model
# 3. the VAE model
# 4. the scheduler config

# LyraSD's compiled C++ shared library, which contains the C++/CUDA compute kernels
lib_path = "./lyrasd_model/lyrasd_lib/libth_lyrasd_cu12_sm86.so"
model_path = "./models/lyrasd_rev_animated"
canny_controlnet_path = "./models/lyrasd_canny"

# Build the Img2Img pipeline
model = LyraSdControlnetImg2ImgPipeline(model_path, lib_path)

# Load a ControlNet model (up to 3 can be loaded)
model.load_controlnet_model("canny", canny_controlnet_path, "fp32")

control_img = Image.open("control_bird_canny.png")

# Prepare the inputs and hyperparameters
prompt = "a bird"
negative_prompt = "NSFW"
height, width = 512, 512
steps = 20
guidance_scale = 7.5
generator = torch.Generator().manual_seed(123)
num_images = 1

# Up to 3 ControlNets can be loaded at once for a multi-ControlNet effect; the lengths of these arguments must be aligned
# The outer length of the ControlNet image list must match controlnet_scale and controlnet_names, while the inner list length must match the batch size
# Entries at the same index correspond to each other
controlnet_images = [[control_img]]
controlnet_scale = [0.5]
controlnet_names = ['canny']

# Fetch an image from COS as the init image
init_image_url = "https://chuangxin-research-1258344705.cos.ap-guangzhou.myqcloud.com/share/files/seaside_town.png?q-sign-algorithm=sha1&q-ak=AKIDBF6i7GCtKWS8ZkgOtACzX3MQDl37xYty&q-sign-time=1692601590;1865401590&q-key-time=1692601590;1865401590&q-header-list=&q-url-param-list=&q-signature=ca04ca92d990d94813029c0d9ef29537e5f4637c"
init_image = BytesIO(requests.get(init_image_url).content)
init_image = Image.open(init_image).convert('RGB')
init_image = init_image.resize((width, height), Image.Resampling.LANCZOS)
guess_mode = False
strength = 0.8

# Run inference
images = model(prompt, init_image, strength, height, width, steps,
               guidance_scale, negative_prompt, num_images,
               generator=generator, controlnet_images=controlnet_images,
               controlnet_scale=controlnet_scale, controlnet_names=controlnet_names,
               guess_mode=guess_mode
               )

# Save the generated images
for i, image in enumerate(images):
    image.save(f"outputs/res_controlnet_img2img_{i}.png")
controlnet_txt2img_demo.py
ADDED
@@ -0,0 +1,63 @@
import torch
import time
from PIL import Image

from lyrasd_model import LyraSdControlnetTxt2ImgPipeline

# Path to the model directory, which should contain:
# 1. the CLIP model
# 2. the converted and optimized UNet model
# 3. the converted and optimized ControlNet model
# 4. the VAE model
# 5. the scheduler config

# LyraSD's compiled C++ shared library, which contains the C++/CUDA compute kernels
lib_path = "./lyrasd_model/lyrasd_lib/libth_lyrasd_cu12_sm86.so"
model_path = "./models/lyrasd_rev_animated"
canny_controlnet_path = "./models/lyrasd_canny"
# Build the Txt2Img pipeline
pipe = LyraSdControlnetTxt2ImgPipeline(model_path, lib_path)

# Load a ControlNet model (up to 3 can be loaded)
start = time.perf_counter()
pipe.load_controlnet_model("canny", canny_controlnet_path, "fp32")
print(f"controlnet load cost: {time.perf_counter() - start}")
# get_loaded_controlnet returns the list of currently loaded ControlNets
print(pipe.get_loaded_controlnet())

# unload_controlnet_model can be used to unload a ControlNet
# pipe.unload_controlnet_model("canny")

control_img = Image.open("control_bird_canny.png")

# Prepare the inputs and hyperparameters
prompt = "a blue bird"
negative_prompt = "NSFW"
height, width = 512, 512
steps = 20
guidance_scale = 7.5
generator = torch.Generator().manual_seed(123)
num_images = 1
guess_mode = False

# Up to 3 ControlNets can be loaded at once for a multi-ControlNet effect; the lengths of these arguments must be aligned
# The outer length of the ControlNet image list must match controlnet_scale and controlnet_names, while the inner list length must match the batch size
# Entries at the same index correspond to each other
controlnet_images = [[control_img]]
controlnet_scale = [0.5]
controlnet_names = ['canny']

# Run inference; the results are returned as PIL.Image objects

start = time.perf_counter()
images = pipe(prompt, height, width, steps,
              guidance_scale, negative_prompt, num_images,
              generator=generator, controlnet_images=controlnet_images,
              controlnet_scale=controlnet_scale, controlnet_names=controlnet_names,
              guess_mode=guess_mode
              )
print("cur cost: ", time.perf_counter() - start)

# Save the generated images
for i, image in enumerate(images):
    image.save(f"./outputs/res_controlnet_txt2img_{i}.png")
demo.py
DELETED
@@ -1,12 +0,0 @@
from lyraSD import LyraSD

t2imodel = LyraSD("text2img", "./sd1.4-engine")
t2imodel.inference(prompt="A fantasy landscape, trending on artstation", use_super=True)


from PIL import Image
i2imodel = LyraSD("img2img", "./sd1.4-engine")
demo_img = Image.open("output/img2img_input.jpg")
i2imodel.inference(prompt="A fantasy landscape, trending on artstation",
                   image=demo_img, use_super=True)
img2img_demo.py
ADDED
@@ -0,0 +1,47 @@
from io import BytesIO

import requests
import torch
from PIL import Image

from lyrasd_model import LyraSDImg2ImgPipeline

# Path to the model directory, which should contain:
# 1. the CLIP model
# 2. the converted and optimized UNet model
# 3. the VAE model
# 4. the scheduler config

# LyraSD's compiled C++ shared library, which contains the C++/CUDA compute kernels
lib_path = "./lyrasd_model/lyrasd_lib/libth_lyrasd_cu12_sm86.so"
model_path = "./models/lyrasd_rev_animated"

# Build the Img2Img pipeline
model = LyraSDImg2ImgPipeline(model_path, lib_path)

# Prepare the inputs and hyperparameters
prompt = "a cat, cartoon style"
negative_prompt = "NSFW"
height, width = 512, 512
steps = 20
guidance_scale = 7.5
generator = torch.Generator().manual_seed(123)
num_images = 1

# Fetch an image from COS as the init image
init_image_url = "https://chuangxin-research-1258344705.cos.ap-guangzhou.myqcloud.com/share/files/seaside_town.png?q-sign-algorithm=sha1&q-ak=AKIDBF6i7GCtKWS8ZkgOtACzX3MQDl37xYty&q-sign-time=1692601590;1865401590&q-key-time=1692601590;1865401590&q-header-list=&q-url-param-list=&q-signature=ca04ca92d990d94813029c0d9ef29537e5f4637c"
init_image = BytesIO(requests.get(init_image_url).content)
init_image = Image.open(init_image).convert('RGB')
init_image = init_image.resize((width, height), Image.Resampling.LANCZOS)

strength = 0.8

# Run inference
images = model(prompt, init_image, strength, steps,
               guidance_scale, negative_prompt, num_images,
               generator=generator
               )

# Save the generated images
for i, image in enumerate(images):
    image.save(f"./outputs/res_img2img_{i}.png")
lyraSD/__init__.py
DELETED
@@ -1 +0,0 @@
from .inference import LyraSD
lyraSD/inference.py
DELETED
@@ -1,85 +0,0 @@
import os
from PIL import Image
from .muse_trt import TRTStableDiffusionText2ImgPipeline
from .muse_trt import TRTStableDiffusionImg2ImgPipeline
import numpy as np


class LyraSD(object):
    def __init__(self, sd_mode, engine_dir, o_height=512, o_width=512, device="cuda:0"):
        self.sd_mode = sd_mode
        self.device = device
        self.o_height = o_height
        self.o_width = o_width
        if self.sd_mode == "text2img":
            self.pipeline = TRTStableDiffusionText2ImgPipeline(
                engine_dir=engine_dir,
                o_height=o_height,
                o_width=o_width,
                device=device
            )
        elif self.sd_mode == "img2img":
            self.pipeline = TRTStableDiffusionImg2ImgPipeline(
                engine_dir=engine_dir,
                o_height=o_height,
                o_width=o_width,
                device=device
            )
        else:
            raise ValueError("Invalid sd_mode: {}".format(self.sd_mode))

    def inference(self, prompt,
                  image=None,
                  save_dir="./output",
                  save_basename="sd-",
                  negative_prompts='',
                  strength=0.8,
                  height=None,
                  width=None,
                  num_images_per_prompt=1,
                  num_inference_steps=50,
                  guidance_scale=7.5,
                  use_super=False,
                  ):

        if self.sd_mode == "text2img" and prompt is None:
            raise ValueError("prompt must be set on text2img mode")

        if self.sd_mode == "img2img" and image is None:
            raise ValueError("image must be set on img2img mode")

        save_basename += f"{self.sd_mode}"
        if height is None:
            height = self.o_height
        if width is None:
            width = self.o_width

        if self.sd_mode == "text2img":
            result_image = self.pipeline(prompt=prompt, negative_prompt=negative_prompts,
                                         num_inference_steps=num_inference_steps,
                                         num_images_per_prompt=num_images_per_prompt,
                                         guidance_scale=guidance_scale,
                                         height=height,
                                         width=width,
                                         use_super=use_super)
        elif self.sd_mode == "img2img":
            result_image = self.pipeline(prompt=prompt,
                                         image=image,
                                         negative_prompt=negative_prompts,
                                         strength=strength,
                                         num_inference_steps=num_inference_steps,
                                         num_images_per_prompt=num_images_per_prompt,
                                         guidance_scale=guidance_scale,
                                         height=height,
                                         width=width,
                                         use_super=use_super)

        for i in range(result_image.shape[0]):
            result_image = Image.fromarray(result_image[0])
            result_image = result_image.resize((512, 512))
            result_image.save(os.path.join(save_dir, save_basename + "-{}.jpg".format(i)))

        return result_image
lyraSD/muse_trt/__init__.py
DELETED
@@ -1,10 +0,0 @@
import ctypes
import os

current_workdir = os.path.dirname(__file__)

ctypes.cdll.LoadLibrary(os.path.join(current_workdir, "libnvinfer_plugin.so"))

from .sd_img2img import TRTStableDiffusionImg2ImgPipeline
from .sd_text2img import TRTStableDiffusionText2ImgPipeline
from .super import SuperX4TRTInfer
lyraSD/muse_trt/models.py
DELETED
@@ -1,149 +0,0 @@
r"""models components"""
from collections import OrderedDict
from copy import deepcopy
from typing import Any, Dict, Optional, Union

import numpy as np
import torch
from cuda import cudart
from diffusers import ControlNetModel
from diffusers.models import AutoencoderKL, UNet2DConditionModel
from torch import nn
from torch.nn import functional as F
from transformers import CLIPTextModel


class BaseModel():
    def __init__(
        self,
        local_model_path=None,
        hf_token=None,
        text_maxlen=77,
        embedding_dim=768,
        fp16=False,
        device='cuda',
        verbose=True,
        max_batch_size=16
    ):
        self.fp16 = fp16
        self.device = device
        self.verbose = verbose
        self.hf_token = hf_token
        self.local_model_path = local_model_path

        # Defaults
        self.text_maxlen = text_maxlen
        self.embedding_dim = embedding_dim
        self.min_batch = 1
        self.max_batch = max_batch_size
        self.min_latent_shape = 256 // 8  # min image resolution: 256x256
        self.max_latent_shape = 1024 // 8  # max image resolution: 1024x1024

    def get_model(self):
        pass

    def get_shape_dict(self, batch_size, image_height, image_width):
        return None

    def check_dims(self, batch_size, image_height, image_width):
        assert batch_size >= self.min_batch and batch_size <= self.max_batch
        assert image_height % 8 == 0 or image_width % 8 == 0
        latent_height = image_height // 8
        latent_width = image_width // 8
        assert latent_height >= self.min_latent_shape and latent_height <= self.max_latent_shape
        assert latent_width >= self.min_latent_shape and latent_width <= self.max_latent_shape
        return (latent_height, latent_width)


class CLIP(BaseModel):
    def get_model(self):
        if self.hf_token is None and self.local_model_path is not None:
            clip_model = CLIPTextModel.from_pretrained(
                self.local_model_path, subfolder="text_encoder").to(self.device)
        else:
            clip_model = CLIPTextModel.from_pretrained(
                "openai/clip-vit-large-patch14").to(self.device)
        return clip_model

    def get_shape_dict(self, batch_size, image_height, image_width):
        self.check_dims(batch_size, image_height, image_width)
        return {
            'input_ids': (batch_size, self.text_maxlen),
            'text_embeddings': (batch_size, self.text_maxlen, self.embedding_dim)
        }


class UNet(BaseModel):
    def get_model(self):
        model_opts = {'revision': 'fp16',
                      'torch_dtype': torch.float16} if self.fp16 else {}
        print(model_opts)
        if self.hf_token is None and self.local_model_path is not None:
            unet_model = UNet2DConditionModel.from_pretrained(
                self.local_model_path, subfolder="unet",
                **model_opts
            ).to(self.device)
        else:
            unet_model = UNet2DConditionModel.from_pretrained(
                "CompVis/stable-diffusion-v1-4",
                subfolder="unet",
                use_auth_token=self.hf_token,
                **model_opts).to(self.device)
        return unet_model

    def get_shape_dict(self, batch_size, image_height, image_width):
        latent_height, latent_width = self.check_dims(
            batch_size, image_height, image_width)
        return {
            'sample': (2*batch_size, 4, latent_height, latent_width),
            'encoder_hidden_states': (2*batch_size, self.text_maxlen, self.embedding_dim),
            'latent': (2*batch_size, 4, latent_height, latent_width)
        }


class VAEEncoderModule(nn.Module):
    def __init__(self, local_model_path, device) -> None:
        super().__init__()
        self.vae = AutoencoderKL.from_pretrained(
            local_model_path, subfolder="vae"
        ).to(device)

    def forward(self, x):
        h = self.vae.encoder(x)
        moments = self.vae.quant_conv(h)
        return moments


class VAEEncoder(BaseModel):
    def get_model(self):
        vae_encoder = VAEEncoderModule(self.local_model_path, self.device)
        return vae_encoder

    def get_shape_dict(self, batch_size, image_height, image_width):
        image_height, image_width = self.check_dims(
            batch_size, image_height, image_width)
        return {
            'images': (batch_size, 3, image_height, image_width),
            'latent': (batch_size, 8, image_height//8, image_width//8),
        }


class VAEDecoder(BaseModel):
    def get_model(self):
        if self.hf_token is None and self.local_model_path is not None:
            vae = AutoencoderKL.from_pretrained(
                self.local_model_path, subfolder="vae"
            ).to(self.device)
        else:
            vae = AutoencoderKL.from_pretrained(
                "CompVis/stable-diffusion-v1-4",
                subfolder="vae",
                use_auth_token=self.hf_token).to(self.device)
        vae.forward = vae.decode
        return vae

    def get_shape_dict(self, batch_size, image_height, image_width):
        latent_height, latent_width = self.check_dims(
            batch_size, image_height, image_width)
        return {
            'latent': (batch_size, 4, latent_height, latent_width),
            'images': (batch_size, 3, image_height, image_width)
        }
lyraSD/muse_trt/sd_img2img.py
DELETED
@@ -1,368 +0,0 @@
|
|
1 |
-
r"""
|
2 |
-
StableDiffusion Img2Img Pipeline by TensorRT.
|
3 |
-
It has included SuperResolutionX4 TensorRT Engine.
|
4 |
-
|
5 |
-
Inspired by: https://github.com/huggingface/diffusers/blob/main/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_img2img.py
|
6 |
-
https://developer.nvidia.com/tensorrt
|
7 |
-
"""
|
8 |
-
|
9 |
-
import inspect
|
10 |
-
import os
|
11 |
-
from typing import List, Optional, Union
|
12 |
-
|
13 |
-
import numpy as np
|
14 |
-
import PIL.Image
|
15 |
-
import tensorrt as trt
|
16 |
-
import torch
|
17 |
-
import time
|
18 |
-
from diffusers import AutoencoderKL
|
19 |
-
from diffusers.schedulers import DPMSolverMultistepScheduler
|
20 |
-
from diffusers.models.vae import DiagonalGaussianDistribution
|
21 |
-
from diffusers.utils import PIL_INTERPOLATION, randn_tensor
|
22 |
-
from polygraphy import cuda
|
23 |
-
from transformers import CLIPTokenizer
|
24 |
-
|
25 |
-
from .models import CLIP, UNet, VAEDecoder, VAEEncoder
|
26 |
-
from .super import SuperX4TRTInfer
|
27 |
-
from .utilities import TRT_LOGGER, Engine
|
28 |
-
|
29 |
-
|
30 |
-
def preprocess(image):
|
31 |
-
if isinstance(image, torch.Tensor):
|
32 |
-
return image
|
33 |
-
elif isinstance(image, PIL.Image.Image):
|
34 |
-
image = [image]
|
35 |
-
|
36 |
-
if isinstance(image[0], PIL.Image.Image):
|
37 |
-
w, h = image[0].size
|
38 |
-
w, h = map(lambda x: x - x % 8, (w, h)) # resize to integer multiple of 8
|
39 |
-
|
40 |
-
image = [np.array(i.resize((w, h), resample=PIL_INTERPOLATION["lanczos"]))[None, :] for i in image]
|
41 |
-
image = np.concatenate(image, axis=0)
|
42 |
-
image = np.array(image).astype(np.float32) / 255.0
|
43 |
-
image = image.transpose(0, 3, 1, 2)
|
44 |
-
image = 2.0 * image - 1.0
|
45 |
-
image = torch.from_numpy(image)
|
46 |
-
elif isinstance(image[0], torch.Tensor):
|
47 |
-
image = torch.cat(image, dim=0)
|
48 |
-
return image
|
49 |
-
|
50 |
-
|
51 |
-
class TRTStableDiffusionImg2ImgPipeline:
|
52 |
-
def __init__(self, engine_dir: str, o_height: int = 1300, o_width: int = 750, device: str = 'cuda:0'):
|
53 |
-
self.device = torch.device(device)
|
54 |
-
super().__init__()
|
55 |
-
self.vae = AutoencoderKL.from_pretrained(
|
56 |
-
os.path.join(engine_dir, 'vae'),
|
57 |
-
torch_dtype=torch.float16
|
58 |
-
).to(self.device)
|
59 |
-
|
60 |
-
self.tokenizer = CLIPTokenizer.from_pretrained(
|
61 |
-
os.path.join(engine_dir, 'tokenizer')
|
62 |
-
)
|
63 |
-
self.scheduler = DPMSolverMultistepScheduler.from_pretrained(
|
64 |
-
os.path.join(engine_dir, 'scheduler')
|
65 |
-
)
|
66 |
-
|
67 |
-
self.vae_scale_factor = 2 ** (len(self.vae.config.block_out_channels) - 1)
|
68 |
-
self.trt_torch_models_cls = {
|
69 |
-
'clip': CLIP(),
|
70 |
-
'unet_fp16': UNet(),
|
71 |
-
#'vae-encoder': VAEEncoder(),
|
72 |
-
'vae-decoder': VAEDecoder()
|
73 |
-
}
|
74 |
-
|
75 |
-
self.engine = {}
|
76 |
-
# Build engines
|
77 |
-
for model_name, _ in self.trt_torch_models_cls.items():
|
78 |
-
engine = Engine(model_name, engine_dir)
|
79 |
-
self.engine[model_name] = engine
|
80 |
-
# Separate iteration to activate engines
|
81 |
-
for model_name, _ in self.trt_torch_models_cls.items():
|
82 |
-
self.engine[model_name].activate()
|
83 |
-
self.stream = cuda.Stream()
|
84 |
-
|
85 |
-
self.super = SuperX4TRTInfer(
|
86 |
-
engine_dir,
|
87 |
-
model_name='superx4-512-512.plan',
|
88 |
-
fp16=True,
|
89 |
-
)
|
90 |
-
|
91 |
-
def runEngine(self, model_name, feed_dict):
|
92 |
-
engine = self.engine[model_name]
|
93 |
-
return engine.infer(feed_dict, self.stream)
|
94 |
-
|
95 |
-
def _torch_decode_latents(self, latents):
|
96 |
-
latents = 1 / self.vae.config.scaling_factor * latents
|
97 |
-
image = self.vae.decode(latents).sample
|
98 |
-
image = (image / 2 + 0.5).clamp(0, 1)
|
99 |
-
# we always cast to float32 as this does not cause significant overhead and is compatible with bfloat16
|
100 |
-
image = image.cpu().permute(0, 2, 3, 1).float().numpy()
|
101 |
-
image = (image * 255).round()
|
102 |
-
return image
|
103 |
-
|
104 |
-
def _trt_decode_latents(self, latents):
|
105 |
-
latents = 1 / self.vae.config.scaling_factor * latents
|
106 |
-
sample_inp = cuda.DeviceView(
|
107 |
-
ptr=latents.data_ptr(), shape=latents.shape, dtype=np.float32)
|
108 |
-
image = self.runEngine('vae-decoder', {"latent": sample_inp})['images']
|
109 |
-
image = (image / 2 + 0.5).clamp(0, 1)
|
110 |
-
# we always cast to float32 as this does not cause significant overhead and is compatible with bfloat16
|
111 |
-
image = image.cpu().permute(0, 2, 3, 1).float().numpy()
|
112 |
-
image = (image * 255).round()
|
113 |
-
|
114 |
-
return image
|
115 |
-
|
116 |
-
def prepare_extra_step_kwargs(self, generator, eta):
|
117 |
-
# prepare extra kwargs for the scheduler step, since not all schedulers have the same signature
|
118 |
-
# eta (η) is only used with the DDIMScheduler, it will be ignored for other schedulers.
|
119 |
-
# eta corresponds to η in DDIM paper: https://arxiv.org/abs/2010.02502
|
120 |
-
# and should be between [0, 1]
|
121 |
-
|
122 |
-
accepts_eta = "eta" in set(inspect.signature(self.scheduler.step).parameters.keys())
|
123 |
-
extra_step_kwargs = {}
|
124 |
-
if accepts_eta:
|
125 |
-
extra_step_kwargs["eta"] = eta
|
126 |
-
|
127 |
-
# check if the scheduler accepts generator
|
128 |
-
accepts_generator = "generator" in set(inspect.signature(self.scheduler.step).parameters.keys())
|
129 |
-
if accepts_generator:
|
130 |
-
extra_step_kwargs["generator"] = generator
|
131 |
-
return extra_step_kwargs
|
132 |
-
|
133 |
-
def get_timesteps(self, num_inference_steps, strength, device):
|
134 |
-
# get the original timestep using init_timestep
|
135 |
-
init_timestep = min(int(num_inference_steps * strength), num_inference_steps)
|
136 |
-
|
137 |
-
t_start = max(num_inference_steps - init_timestep, 0)
|
138 |
-
timesteps = self.scheduler.timesteps[t_start:]
|
139 |
-
|
140 |
-
return timesteps, num_inference_steps - t_start
|
141 |
-
|
142 |
-
def prepare_latents(self, image, timestep, batch_size, num_images_per_prompt, dtype, device, generator=None):
|
143 |
-
if not isinstance(image, (torch.Tensor, PIL.Image.Image, list)):
|
144 |
-
raise ValueError(
|
145 |
-
f"`image` has to be of type `torch.Tensor`, `PIL.Image.Image` or list but is {type(image)}"
|
146 |
-
)
|
147 |
-
|
148 |
-
image = image.to(device=device, dtype=dtype)
|
149 |
-
|
150 |
-
batch_size = batch_size * num_images_per_prompt
|
151 |
-
if isinstance(generator, list) and len(generator) != batch_size:
|
152 |
-
raise ValueError(
|
153 |
-
f"You have passed a list of generators of length {len(generator)}, but requested an effective batch"
|
154 |
-
f" size of {batch_size}. Make sure the batch size matches the length of the generators."
|
155 |
-
)
|
156 |
-
|
157 |
-
if isinstance(generator, list):
|
158 |
-
init_latents = [
|
159 |
-
self.vae.encode(image[i: i + 1]).latent_dist.sample(generator[i]) for i in range(batch_size)
|
160 |
-
]
|
161 |
-
init_latents = torch.cat(init_latents, dim=0)
|
162 |
-
else:
|
163 |
-
init_latents = self.vae.encode(image).latent_dist.sample(generator)
|
164 |
-
|
165 |
-
init_latents = self.vae.config.scaling_factor * init_latents
|
166 |
-
|
167 |
-
if batch_size > init_latents.shape[0] and batch_size % init_latents.shape[0] == 0:
|
168 |
-
raise ValueError(
|
169 |
-
f"Cannot duplicate `image` of batch size {init_latents.shape[0]} to {batch_size} text prompts."
|
170 |
-
)
|
171 |
-
else:
|
172 |
-
init_latents = torch.cat([init_latents], dim=0)
|
173 |
-
|
174 |
-
shape = init_latents.shape
|
175 |
-
noise = randn_tensor(shape, generator=generator, device=device, dtype=dtype)
|
176 |
-
|
177 |
-
# get latents
|
178 |
-
init_latents = self.scheduler.add_noise(init_latents, noise, timestep)
|
179 |
-
latents = init_latents
|
180 |
-
|
181 |
-
return latents
|
182 |
-
|
183 |
-
def _default_height_width(self, height, width, image):
|
184 |
-
if isinstance(image, list):
|
185 |
-
image = image[0]
|
186 |
-
|
187 |
-
if height is None:
|
188 |
-
if isinstance(image, PIL.Image.Image):
|
189 |
-
height = image.height
|
190 |
-
elif isinstance(image, torch.Tensor):
|
191 |
-
height = image.shape[3]
|
192 |
-
|
193 |
-
height = (height // 8) * 8 # round down to nearest multiple of 8
|
194 |
-
|
195 |
-
if width is None:
|
196 |
-
if isinstance(image, PIL.Image.Image):
|
197 |
-
width = image.width
|
198 |
-
elif isinstance(image, torch.Tensor):
|
199 |
-
width = image.shape[2]
|
200 |
-
|
201 |
-
width = (width // 8) * 8 # round down to nearest multiple of 8
|
202 |
-
|
203 |
-
return height, width
|
204 |
-
|
205 |
-
def _trt_encode_prompt(self, prompt, negative_prompt, num_images_per_prompt,):
|
206 |
-
# Tokenize input
|
207 |
-
text_input_ids = self.tokenizer(
|
208 |
-
prompt,
|
209 |
-
padding="max_length",
|
210 |
-
max_length=self.tokenizer.model_max_length,
|
211 |
-
return_tensors="pt",
|
212 |
-
).input_ids.type(torch.int32).to(self.device)
|
213 |
-
|
214 |
-
# CLIP text encoder
|
215 |
-
text_input_ids_inp = cuda.DeviceView(
|
216 |
-
ptr=text_input_ids.data_ptr(), shape=text_input_ids.shape, dtype=np.int32
|
217 |
-
)
|
218 |
-
text_embeddings = self.runEngine('clip', {"input_ids": text_input_ids_inp})['text_embeddings']
|
219 |
-
|
220 |
-
# Duplicate text embeddings for each generation per prompt
|
221 |
-
bs_embed, seq_len, _ = text_embeddings.shape
|
222 |
-
text_embeddings = text_embeddings.repeat(1, num_images_per_prompt, 1)
|
223 |
-
text_embeddings = text_embeddings.view(bs_embed * num_images_per_prompt, seq_len, -1)
|
224 |
-
|
225 |
-
max_length = text_input_ids.shape[-1]
|
226 |
-
uncond_input_ids = self.tokenizer(
|
227 |
-
negative_prompt,
|
228 |
-
padding="max_length",
|
229 |
-
max_length=max_length,
|
230 |
-
truncation=True,
|
231 |
-
return_tensors="pt",
|
232 |
-
).input_ids.type(torch.int32).to(self.device)
|
233 |
-
uncond_input_ids_inp = cuda.DeviceView(
|
234 |
-
ptr=uncond_input_ids.data_ptr(), shape=uncond_input_ids.shape, dtype=np.int32)
|
235 |
-
uncond_embeddings = self.runEngine('clip', {"input_ids": uncond_input_ids_inp})['text_embeddings']
|
236 |
-
|
237 |
-
# Duplicate unconditional embeddings for each generation per prompt
|
238 |
-
seq_len = uncond_embeddings.shape[1]
|
239 |
-
uncond_embeddings = uncond_embeddings.repeat(1, num_images_per_prompt, 1)
|
240 |
-
uncond_embeddings = uncond_embeddings.view(bs_embed * num_images_per_prompt, seq_len, -1)
|
241 |
-
|
242 |
-
# Concatenate the unconditional and text embeddings into a single batch to avoid doing two forward passes for classifier free guidance
|
243 |
-
text_embeddings = torch.cat([uncond_embeddings, text_embeddings])
|
244 |
-
text_embeddings = text_embeddings.to(dtype=torch.float16)
|
245 |
-
|
246 |
-
return text_embeddings
|
247 |
-
|
248 |
-
@torch.no_grad()
|
249 |
-
def __call__(
|
250 |
-
self,
|
251 |
-
prompt: Union[str, List[str]] = None,
|
252 |
-
image: Union[torch.Tensor, PIL.Image.Image] = None,
|
253 |
-
strength: float = 0.8,
|
254 |
-
height: Optional[int] = None,
|
255 |
-
width: Optional[int] = None,
|
256 |
-
num_inference_steps: int = 50,
|
257 |
-
guidance_scale: float = 7.5,
|
258 |
-
negative_prompt: Optional[Union[str, List[str]]] = None,
|
259 |
-
num_images_per_prompt: Optional[int] = 1,
|
260 |
-
eta: float = 0.0,
|
261 |
-
generator: Optional[Union[torch.Generator, List[torch.Generator]]] = None,
|
262 |
-
latents: Optional[torch.FloatTensor] = None,
|
263 |
-
prompt_embeds: Optional[torch.FloatTensor] = None,
|
264 |
-
use_super: bool = True,
|
265 |
-
):
|
266 |
-
# 1. Default height and width to unet
|
267 |
-
height, width = self._default_height_width(height, width, image)
|
268 |
-
|
269 |
-
# 2. Define call parameters and Allocate the cuda buffers for TRT Engine bindings.
|
270 |
-
if prompt is not None and isinstance(prompt, str):
|
271 |
-
batch_size = 1
|
272 |
-
elif prompt is not None and isinstance(prompt, list):
|
273 |
-
batch_size = len(prompt)
|
274 |
-
else:
|
275 |
-
batch_size = prompt_embeds.shape[0]
|
276 |
-
|
277 |
-
# Allocate buffers for TensorRT engine bindings
|
278 |
-
for model_name, obj in self.trt_torch_models_cls.items():
|
279 |
-
self.engine[model_name].allocate_buffers(
|
280 |
-
shape_dict=obj.get_shape_dict(batch_size, height, width),
|
281 |
-
device=self.device
|
282 |
-
)
|
283 |
-
|
284 |
-
do_classifier_free_guidance = guidance_scale > 1.0
|
285 |
-
|
286 |
-
with trt.Runtime(TRT_LOGGER) as runtime:
|
287 |
-
torch.cuda.synchronize()
|
288 |
-
|
289 |
-
# 3. Encode input prompt. TRT Clip model.
|
290 |
-
prompt_embeds = self._trt_encode_prompt(
|
291 |
-
prompt, negative_prompt, num_images_per_prompt
|
292 |
-
)
|
293 |
-
|
294 |
-
# 4. Prepare mask, image, and controlnet_conditioning_image
|
295 |
-
image = preprocess(image)
|
296 |
-
|
297 |
-
# 5. Prepare timesteps.
|
298 |
-
self.scheduler.set_timesteps(num_inference_steps, device=self.device)
|
299 |
-
timesteps, num_inference_steps = self.get_timesteps(num_inference_steps, strength, self.device)
|
300 |
-
latent_timestep = timesteps[:1].repeat(batch_size * num_images_per_prompt)
|
301 |
-
|
302 |
-
# 6. Prepare latent variables. It will use VAE-Enoder(currently the encoder is torch model, not trt)
|
303 |
-
latents = self.prepare_latents(
|
304 |
-
image,
|
305 |
-
latent_timestep,
|
306 |
-
batch_size,
|
307 |
-
num_images_per_prompt,
|
308 |
-
prompt_embeds.dtype,
|
309 |
-
self.device,
|
310 |
-
generator,
|
311 |
-
)
|
312 |
-
|
313 |
-
# 7. Prepare extra step kwargs and Set lantens/controlnet_conditioning_image/prompt_embeds to special dtype.
|
314 |
-
# The dytpe must be equal to the following to ensure that the NAN can not be issued in trt engine.
|
315 |
-
extra_step_kwargs = self.prepare_extra_step_kwargs(generator, eta)
|
316 |
-
latents = latents.to(dtype=torch.float32)
|
317 |
-
prompt_embeds = prompt_embeds.to(dtype=torch.float16)
|
318 |
-
|
319 |
-
# 8. Denoising loop
|
320 |
-
for i, t in enumerate(timesteps):
|
321 |
-
# expand the latents if we are doing classifier free guidance
|
322 |
-
latent_model_input = torch.cat([latents] * 2) if do_classifier_free_guidance else latents
|
323 |
-
|
324 |
-
latent_model_input = self.scheduler.scale_model_input(latent_model_input, t)
|
325 |
-
|
326 |
-
# predict the noise residual
|
327 |
-
|
328 |
-
dtype = np.float16
|
329 |
-
if t.dtype != torch.float32:
|
330 |
-
timestep_float = t.float()
|
331 |
-
else:
|
332 |
-
timestep_float = t
|
333 |
-
|
334 |
-
sample_inp = cuda.DeviceView(
|
335 |
-
ptr=latent_model_input.data_ptr(), shape=latent_model_input.shape, dtype=np.float32
|
336 |
-
)
|
337 |
-
timestep_inp = cuda.DeviceView(
|
338 |
-
ptr=timestep_float.data_ptr(), shape=timestep_float.shape, dtype=np.float32
|
339 |
-
)
|
340 |
-
embeddings_inp = cuda.DeviceView(
|
341 |
-
ptr=prompt_embeds.data_ptr(), shape=prompt_embeds.shape, dtype=dtype
|
342 |
-
)
|
343 |
-
|
344 |
-
noise_pred = self.engine['unet_fp16'].infer(
|
345 |
-
{"sample": sample_inp, "timestep": timestep_inp, "encoder_hidden_states": embeddings_inp},
|
346 |
-
self.stream)['latent']
|
347 |
-
# perform guidance
|
348 |
-
if do_classifier_free_guidance:
|
349 |
-
noise_pred_uncond, noise_pred_text = noise_pred.chunk(2)
|
350 |
-
noise_pred = noise_pred_uncond + guidance_scale * (noise_pred_text - noise_pred_uncond)
|
351 |
-
|
352 |
-
# compute the previous noisy sample x_t -> x_t-1
|
353 |
-
latents = self.scheduler.step(noise_pred, t, latents, **extra_step_kwargs).prev_sample
|
354 |
-
|
355 |
-
# 9. Use VAE-Decoder to decode the latents
|
356 |
-
image = self._trt_decode_latents(latents)
|
357 |
-
|
358 |
-
# 10. SuperX4 Resolution, Optional.
|
359 |
-
if use_super:
|
360 |
-
image = np.ascontiguousarray(np.transpose(image, (0, 3, 1, 2))).astype(np.float16)
|
361 |
-
#image = self.super.infer(np.transpose(image.astype(np.float16), (0, 3, 1, 2)))
|
362 |
-
image = self.super.infer(image)
|
363 |
-
image = np.uint8(np.transpose(image, (0, 2, 3, 1)))
|
364 |
-
else:
|
365 |
-
image = np.uint8(image)
|
366 |
-
|
367 |
-
return image
|
368 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
lyraSD/muse_trt/sd_text2img.py
DELETED
@@ -1,292 +0,0 @@
|
|
1 |
-
r"""
|
2 |
-
StableDiffusion Text2Img Pipeline by TensorRT.
|
3 |
-
It has included SuperResolutionX4 TensorRT Engine.
|
4 |
-
|
5 |
-
Inspired by: https://github.com/huggingface/diffusers/blob/main/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion.py
|
6 |
-
https://developer.nvidia.com/tensorrt
|
7 |
-
"""
|
8 |
-
|
9 |
-
import inspect
|
10 |
-
import os
|
11 |
-
from typing import List, Optional, Union
|
12 |
-
|
13 |
-
import numpy as np
|
14 |
-
import tensorrt as trt
|
15 |
-
import torch
|
16 |
-
from diffusers import AutoencoderKL
|
17 |
-
from diffusers.schedulers import DPMSolverMultistepScheduler
|
18 |
-
from diffusers.utils import PIL_INTERPOLATION, randn_tensor
|
19 |
-
from polygraphy import cuda
|
20 |
-
from transformers import CLIPTokenizer
|
21 |
-
|
22 |
-
from .models import CLIP, UNet, VAEDecoder
|
23 |
-
from .super import SuperX4TRTInfer
|
24 |
-
from .utilities import TRT_LOGGER, Engine
|
25 |
-
|
26 |
-
|
27 |
-
class TRTStableDiffusionText2ImgPipeline:
|
28 |
-
def __init__(self, engine_dir: str, o_height: int = 512, o_width: int = 512, device: str = 'cuda:0'):
|
29 |
-
self.device = torch.device(device)
|
30 |
-
super().__init__()
|
31 |
-
self.vae = AutoencoderKL.from_pretrained(
|
32 |
-
os.path.join(engine_dir, 'vae'),
|
33 |
-
torch_dtype=torch.float16
|
34 |
-
).to(self.device)
|
35 |
-
|
36 |
-
self.tokenizer = CLIPTokenizer.from_pretrained(
|
37 |
-
os.path.join(engine_dir, 'tokenizer')
|
38 |
-
)
|
39 |
-
self.scheduler = DPMSolverMultistepScheduler.from_pretrained(
|
40 |
-
os.path.join(engine_dir, 'scheduler')
|
41 |
-
)
|
42 |
-
|
43 |
-
self.vae_scale_factor = 2 ** (len(self.vae.config.block_out_channels) - 1)
|
44 |
-
self.trt_torch_models_cls = {
|
45 |
-
'clip': CLIP(),
|
46 |
-
'unet_fp16': UNet(),
|
47 |
-
'vae-decoder': VAEDecoder()
|
48 |
-
}
|
49 |
-
|
50 |
-
self.engine = {}
|
51 |
-
# Build engines
|
52 |
-
for model_name, _ in self.trt_torch_models_cls.items():
|
53 |
-
engine = Engine(model_name, engine_dir)
|
54 |
-
self.engine[model_name] = engine
|
55 |
-
# Separate iteration to activate engines
|
56 |
-
for model_name, _ in self.trt_torch_models_cls.items():
|
57 |
-
self.engine[model_name].activate()
|
58 |
-
self.stream = cuda.Stream()
|
59 |
-
|
60 |
-
self.super = SuperX4TRTInfer(
|
61 |
-
engine_dir=engine_dir,
|
62 |
-
model_name='superx4-512-512.plan',
|
63 |
-
fp16=True
|
64 |
-
)
|
65 |
-
|
66 |
-
def runEngine(self, model_name, feed_dict):
|
67 |
-
engine = self.engine[model_name]
|
68 |
-
return engine.infer(feed_dict, self.stream)
|
69 |
-
|
70 |
-
def _torch_decode_latents(self, latents):
|
71 |
-
latents = 1 / self.vae.config.scaling_factor * latents
|
72 |
-
image = self.vae.decode(latents).sample
|
73 |
-
image = (image / 2 + 0.5).clamp(0, 1)
|
74 |
-
# we always cast to float32 as this does not cause significant overhead and is compatible with bfloat16
|
75 |
-
image = image.cpu().permute(0, 2, 3, 1).float().numpy()
|
76 |
-
image = (image * 255).round()
|
77 |
-
return image
|
78 |
-
|
79 |
-
def _trt_decode_latents(self, latents):
|
80 |
-
latents = 1 / self.vae.config.scaling_factor * latents
|
81 |
-
sample_inp = cuda.DeviceView(
|
82 |
-
ptr=latents.data_ptr(), shape=latents.shape, dtype=np.float32)
|
83 |
-
image = self.runEngine('vae-decoder', {"latent": sample_inp})['images']
|
84 |
-
image = (image / 2 + 0.5).clamp(0, 1)
|
85 |
-
# we always cast to float32 as this does not cause significant overhead and is compatible with bfloat16
|
86 |
-
image = image.cpu().permute(0, 2, 3, 1).float().numpy()
|
87 |
-
image = (image * 255).round()
|
88 |
-
|
89 |
-
return image
|
90 |
-
|
91 |
-
def prepare_extra_step_kwargs(self, generator, eta):
|
92 |
-
# prepare extra kwargs for the scheduler step, since not all schedulers have the same signature
|
93 |
-
# eta (η) is only used with the DDIMScheduler, it will be ignored for other schedulers.
|
94 |
-
# eta corresponds to η in DDIM paper: https://arxiv.org/abs/2010.02502
|
95 |
-
# and should be between [0, 1]
|
96 |
-
|
97 |
-
accepts_eta = "eta" in set(inspect.signature(self.scheduler.step).parameters.keys())
|
98 |
-
extra_step_kwargs = {}
|
99 |
-
if accepts_eta:
|
100 |
-
extra_step_kwargs["eta"] = eta
|
101 |
-
|
102 |
-
# check if the scheduler accepts generator
|
103 |
-
accepts_generator = "generator" in set(inspect.signature(self.scheduler.step).parameters.keys())
|
104 |
-
if accepts_generator:
|
105 |
-
extra_step_kwargs["generator"] = generator
|
106 |
-
return extra_step_kwargs
|
107 |
-
|
108 |
-
def get_timesteps(self, num_inference_steps, strength, device):
|
109 |
-
# get the original timestep using init_timestep
|
110 |
-
init_timestep = min(int(num_inference_steps * strength), num_inference_steps)
|
111 |
-
|
112 |
-
t_start = max(num_inference_steps - init_timestep, 0)
|
113 |
-
timesteps = self.scheduler.timesteps[t_start:]
|
114 |
-
|
115 |
-
return timesteps, num_inference_steps - t_start
|
116 |
-
|
117 |
-
def prepare_latents(self, batch_size, num_channels_latents, height, width, dtype, device, generator, latents=None):
|
118 |
-
shape = (batch_size, num_channels_latents, height // self.vae_scale_factor, width // self.vae_scale_factor)
|
119 |
-
if isinstance(generator, list) and len(generator) != batch_size:
|
120 |
-
raise ValueError(
|
121 |
-
f"You have passed a list of generators of length {len(generator)}, but requested an effective batch"
|
122 |
-
f" size of {batch_size}. Make sure the batch size matches the length of the generators."
|
123 |
-
)
|
124 |
-
|
125 |
-
if latents is None:
|
126 |
-
latents = randn_tensor(shape, generator=generator, device=device, dtype=dtype)
|
127 |
-
else:
|
128 |
-
latents = latents.to(device)
|
129 |
-
|
130 |
-
# scale the initial noise by the standard deviation required by the scheduler
|
131 |
-
latents = latents * self.scheduler.init_noise_sigma
|
132 |
-
return latents
|
133 |
-
|
134 |
-
def _trt_encode_prompt(self, prompt, negative_prompt, num_images_per_prompt,):
|
135 |
-
# Tokenize input
|
136 |
-
text_input_ids = self.tokenizer(
|
137 |
-
prompt,
|
138 |
-
padding="max_length",
|
139 |
-
max_length=self.tokenizer.model_max_length,
|
140 |
-
return_tensors="pt",
|
141 |
-
).input_ids.type(torch.int32).to(self.device)
|
142 |
-
|
143 |
-
# CLIP text encoder
|
144 |
-
text_input_ids_inp = cuda.DeviceView(
|
145 |
-
ptr=text_input_ids.data_ptr(), shape=text_input_ids.shape, dtype=np.int32
|
146 |
-
)
|
147 |
-
text_embeddings = self.runEngine('clip', {"input_ids": text_input_ids_inp})['text_embeddings']
|
148 |
-
|
149 |
-
# Duplicate text embeddings for each generation per prompt
|
150 |
-
bs_embed, seq_len, _ = text_embeddings.shape
|
151 |
-
text_embeddings = text_embeddings.repeat(1, num_images_per_prompt, 1)
|
152 |
-
text_embeddings = text_embeddings.view(bs_embed * num_images_per_prompt, seq_len, -1)
|
153 |
-
|
154 |
-
max_length = text_input_ids.shape[-1]
|
155 |
-
uncond_input_ids = self.tokenizer(
|
156 |
-
negative_prompt,
|
157 |
-
padding="max_length",
|
158 |
-
max_length=max_length,
|
159 |
-
truncation=True,
|
160 |
-
return_tensors="pt",
|
161 |
-
).input_ids.type(torch.int32).to(self.device)
|
162 |
-
uncond_input_ids_inp = cuda.DeviceView(
|
163 |
-
ptr=uncond_input_ids.data_ptr(), shape=uncond_input_ids.shape, dtype=np.int32)
|
164 |
-
uncond_embeddings = self.runEngine('clip', {"input_ids": uncond_input_ids_inp})['text_embeddings']
|
165 |
-
|
166 |
-
# Duplicate unconditional embeddings for each generation per prompt
|
167 |
-
seq_len = uncond_embeddings.shape[1]
|
168 |
-
uncond_embeddings = uncond_embeddings.repeat(1, num_images_per_prompt, 1)
|
169 |
-
uncond_embeddings = uncond_embeddings.view(bs_embed * num_images_per_prompt, seq_len, -1)
|
170 |
-
|
171 |
-
# Concatenate the unconditional and text embeddings into a single batch to avoid doing two forward passes for classifier free guidance
|
172 |
-
text_embeddings = torch.cat([uncond_embeddings, text_embeddings])
|
173 |
-
text_embeddings = text_embeddings.to(dtype=torch.float16)
|
174 |
-
|
175 |
-
return text_embeddings
|
176 |
-
|
177 |
-
@torch.no_grad()
|
178 |
-
def __call__(
|
179 |
-
self,
|
180 |
-
prompt: Union[str, List[str]] = None,
|
181 |
-
height: Optional[int] = None,
|
182 |
-
width: Optional[int] = None,
|
183 |
-
num_inference_steps: int = 50,
|
184 |
-
guidance_scale: float = 7.5,
|
185 |
-
negative_prompt: Optional[Union[str, List[str]]] = None,
|
186 |
-
num_images_per_prompt: Optional[int] = 1,
|
187 |
-
eta: float = 0.0,
|
188 |
-
generator: Optional[Union[torch.Generator, List[torch.Generator]]] = None,
|
189 |
-
latents: Optional[torch.FloatTensor] = None,
|
190 |
-
prompt_embeds: Optional[torch.FloatTensor] = None,
|
191 |
-
use_super: bool = True,
|
192 |
-
):
|
193 |
-
# 1. Default height and width to unet
|
194 |
-
assert height is not None, "height can not be None"
|
195 |
-
assert width is not None, "width can not be None"
|
196 |
-
|
197 |
-
# 2. Define call parameters and Allocate the cuda buffers for TRT Engine bindings.
|
198 |
-
if prompt is not None and isinstance(prompt, str):
|
199 |
-
batch_size = 1
|
200 |
-
elif prompt is not None and isinstance(prompt, list):
|
201 |
-
batch_size = len(prompt)
|
202 |
-
else:
|
203 |
-
batch_size = prompt_embeds.shape[0]
|
204 |
-
|
205 |
-
# Allocate buffers for TensorRT engine bindings
|
206 |
-
for model_name, obj in self.trt_torch_models_cls.items():
|
207 |
-
self.engine[model_name].allocate_buffers(
|
208 |
-
shape_dict=obj.get_shape_dict(batch_size, height, width),
|
209 |
-
device=self.device
|
210 |
-
)
|
211 |
-
|
212 |
-
do_classifier_free_guidance = guidance_scale > 1.0
|
213 |
-
|
214 |
-
with trt.Runtime(TRT_LOGGER) as runtime:
|
215 |
-
torch.cuda.synchronize()
|
216 |
-
|
217 |
-
# 3. Encode input prompt. TRT Clip model.
|
218 |
-
prompt_embeds = self._trt_encode_prompt(
|
219 |
-
prompt, negative_prompt, num_images_per_prompt
|
220 |
-
)
|
221 |
-
|
222 |
-
# 4. Prepare timesteps.
|
223 |
-
self.scheduler.set_timesteps(num_inference_steps, device=self.device)
|
224 |
-
timesteps = self.scheduler.timesteps
|
225 |
-
|
226 |
-
# 5. Prepare latent variables. It will use VAE-Enoder(currently the encoder is torch model, not trt)
|
227 |
-
num_channels_latents = 4
|
228 |
-
latents = self.prepare_latents(
|
229 |
-
batch_size*num_images_per_prompt,
|
230 |
-
num_channels_latents,
|
231 |
-
height,
|
232 |
-
width,
|
233 |
-
prompt_embeds.dtype,
|
234 |
-
self.device,
|
235 |
-
generator,
|
236 |
-
latents
|
237 |
-
)
|
238 |
-
|
239 |
-
# 6. Prepare extra step kwargs and Set lantens/controlnet_conditioning_image/prompt_embeds to special dtype.
|
240 |
-
# The dytpe must be equal to the following to ensure that the NAN can not be issued in trt engine.
|
241 |
-
extra_step_kwargs = self.prepare_extra_step_kwargs(generator, eta)
|
242 |
-
latents = latents.to(dtype=torch.float32)
|
243 |
-
prompt_embeds = prompt_embeds.to(dtype=torch.float16)
|
244 |
-
|
245 |
-
# 7. Denoising loop
|
246 |
-
for i, t in enumerate(timesteps):
|
247 |
-
# expand the latents if we are doing classifier free guidance
|
248 |
-
latent_model_input = torch.cat([latents] * 2) if do_classifier_free_guidance else latents
|
249 |
-
|
250 |
-
latent_model_input = self.scheduler.scale_model_input(latent_model_input, t)
|
251 |
-
|
252 |
-
# predict the noise residual
|
253 |
-
|
254 |
-
dtype = np.float16
|
255 |
-
if t.dtype != torch.float32:
|
256 |
-
timestep_float = t.float()
|
257 |
-
else:
|
258 |
-
timestep_float = t
|
259 |
-
|
260 |
-
sample_inp = cuda.DeviceView(
|
261 |
-
ptr=latent_model_input.data_ptr(), shape=latent_model_input.shape, dtype=np.float32
|
262 |
-
)
|
263 |
-
timestep_inp = cuda.DeviceView(
|
264 |
-
ptr=timestep_float.data_ptr(), shape=timestep_float.shape, dtype=np.float32
|
265 |
-
)
|
266 |
-
embeddings_inp = cuda.DeviceView(
|
267 |
-
ptr=prompt_embeds.data_ptr(), shape=prompt_embeds.shape, dtype=dtype
|
268 |
-
)
|
269 |
-
|
270 |
-
noise_pred = self.engine['unet_fp16'].infer(
|
271 |
-
{"sample": sample_inp, "timestep": timestep_inp, "encoder_hidden_states": embeddings_inp},
|
272 |
-
self.stream)['latent']
|
273 |
-
# perform guidance
|
274 |
-
if do_classifier_free_guidance:
|
275 |
-
noise_pred_uncond, noise_pred_text = noise_pred.chunk(2)
|
276 |
-
noise_pred = noise_pred_uncond + guidance_scale * (noise_pred_text - noise_pred_uncond)
|
277 |
-
|
278 |
-
# compute the previous noisy sample x_t -> x_t-1
|
279 |
-
latents = self.scheduler.step(noise_pred, t, latents, **extra_step_kwargs).prev_sample
|
280 |
-
|
281 |
-
# 8. Use VAE-Decoder to decode the latents
|
282 |
-
image = self._trt_decode_latents(latents)
|
283 |
-
|
284 |
-
# 9. SuperX4 Resolution, Optional.
|
285 |
-
if use_super:
|
286 |
-
image = np.ascontiguousarray(np.transpose(image, (0, 3, 1, 2))).astype(np.float16)
|
287 |
-
#image = self.super.infer(np.transpose(image.astype(np.float16), (0, 3, 1, 2)))
|
288 |
-
image = self.super.infer(image)
|
289 |
-
image = np.uint8(np.transpose(image, (0, 2, 3, 1)))
|
290 |
-
else:
|
291 |
-
image = np.uint8(image)
|
292 |
-
return image
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
lyraSD/muse_trt/super.py
DELETED
@@ -1,64 +0,0 @@
r"""use tensorrt engine to infer, a useful pipeline"""

import os

import numpy as np
from polygraphy import cuda
from polygraphy.backend.common import bytes_from_path
from polygraphy.backend.trt import engine_from_bytes


class SuperX4TRTInfer:
    def __init__(self, engine_dir,
                 model_name='superx4.plan',
                 o_height=None,
                 o_width=None,
                 fp16=True,
                 ) -> None:
        engine_path = os.path.join(engine_dir, model_name)
        self.engine = engine_from_bytes(bytes_from_path(engine_path))
        self.context = self.engine.create_execution_context()

        self.o_height = o_height
        self.o_width = o_width
        self.fp = fp16
        self.dtype = np.float16 if fp16 else np.float32

        self.stream = cuda.Stream()

    def infer(self, x):
        batch_size, channel, height, width = x.shape
        if self.o_height is None or self.o_width is None:
            o_height = height*4
            o_width = width*4
        else:
            o_height = self.o_height
            o_width = self.o_width

        h_output = np.empty([batch_size, channel, o_height, o_width], dtype=self.dtype)

        # allocate device memory
        d_input = cuda.wrapper().malloc(1 * x.nbytes)
        d_output = cuda.wrapper().malloc(1*h_output.nbytes)

        bindings = [int(d_input), int(d_output)]

        # transfer input data to device
        cuda.wrapper().memcpy(d_input, x.ctypes.data, x.nbytes, cuda.MemcpyKind.HostToDevice, self.stream.ptr)

        # execute model
        noerror = self.context.execute_async_v2(bindings, self.stream.ptr)
        if not noerror:
            raise ValueError(f"ERROR: inference failed.")

        # transfer predictions back
        cuda.wrapper().memcpy(h_output.ctypes.data, d_output, h_output.nbytes, cuda.MemcpyKind.DeviceToHost, self.stream.ptr)
        cuda.wrapper().free(d_input)
        cuda.wrapper().free(d_output)

        return h_output

    def teardown(self):
        del self.engine
        self.stream.free()
        del self.stream
lyraSD/muse_trt/utilities.py
DELETED
@@ -1,538 +0,0 @@
-r"""utils components"""
-
-from collections import OrderedDict
-from copy import copy
-import numpy as np
-import os
-import math
-from PIL import Image
-from polygraphy.backend.common import bytes_from_path
-from polygraphy.backend.trt import CreateConfig, Profile
-from polygraphy.backend.trt import engine_from_bytes, engine_from_network, network_from_onnx_path, save_engine
-from polygraphy.backend.trt import util as trt_util
-from polygraphy import cuda
-import random
-from scipy import integrate
-import tensorrt as trt
-import torch
-
-TRT_LOGGER = trt.Logger(trt.Logger.WARNING)
-
-
-class Engine():
-    def __init__(
-        self,
-        model_name,
-        engine_dir,
-        memory_pool_size=None
-    ):
-        self.engine_path = os.path.join(engine_dir, model_name+'.plan')
-        self.engine = None
-        self.context = None
-        self.buffers = OrderedDict()
-        self.tensors = OrderedDict()
-        self.memory_pool_size = memory_pool_size
-
-    def __del__(self):
-        [buf.free() for buf in self.buffers.values() if isinstance(buf, cuda.DeviceArray)]
-        del self.engine
-        del self.context
-        del self.buffers
-        del self.tensors
-
-    def build(self, onnx_path, fp16, input_profile=None, enable_preview=False):
-        print(f"Building TensorRT engine for {onnx_path}: {self.engine_path}")
-        p = Profile()
-        if input_profile:
-            for name, dims in input_profile.items():
-                assert len(dims) == 3
-                p.add(name, min=dims[0], opt=dims[1], max=dims[2])
-
-        preview_features = []
-        if enable_preview:
-            trt_version = [int(i) for i in trt.__version__.split(".")]
-            # FASTER_DYNAMIC_SHAPES_0805 should only be used for TRT 8.5.1 or above.
-            if trt_version[0] > 8 or \
-                (trt_version[0] == 8 and (trt_version[1] > 5 or (trt_version[1] == 5 and trt_version[2] >= 1))):
-                preview_features = [trt.PreviewFeature.FASTER_DYNAMIC_SHAPES_0805]
-
-        if self.memory_pool_size is not None:
-            memory_pool_limits = {trt.MemoryPoolType.WORKSPACE: (self.memory_pool_size*(2 ** 30))}
-            print(memory_pool_limits)
-        else:
-            memory_pool_limits = None
-        engine = engine_from_network(
-            network_from_onnx_path(onnx_path),
-            config=CreateConfig(
-                fp16=fp16, profiles=[p], preview_features=preview_features, memory_pool_limits=memory_pool_limits
-            )
-        )
-        save_engine(engine, path=self.engine_path)
-
-    def activate(self):
-        print(f"Loading TensorRT engine: {self.engine_path}")
-        self.engine = engine_from_bytes(bytes_from_path(self.engine_path))
-        self.context = self.engine.create_execution_context()
-
-    def allocate_buffers(self, shape_dict=None, device='cuda'):
-        for idx in range(trt_util.get_bindings_per_profile(self.engine)):
-            binding = self.engine[idx]
-            if shape_dict and binding in shape_dict:
-                shape = shape_dict[binding]
-            else:
-                shape = self.engine.get_binding_shape(binding)
-            dtype = trt_util.np_dtype_from_trt(self.engine.get_binding_dtype(binding))
-            if self.engine.binding_is_input(binding):
-                self.context.set_binding_shape(idx, shape)
-            # Workaround to convert np dtype to torch
-            np_type_tensor = np.empty(shape=[], dtype=dtype)
-            torch_type_tensor = torch.from_numpy(np_type_tensor)
-            tensor = torch.empty(tuple(shape), dtype=torch_type_tensor.dtype).to(device=device)
-            self.tensors[binding] = tensor
-            self.buffers[binding] = cuda.DeviceView(ptr=tensor.data_ptr(), shape=shape, dtype=dtype)
-
-    def infer(self, feed_dict, stream):
-        start_binding, end_binding = trt_util.get_active_profile_bindings(self.context)
-        # shallow copy of ordered dict
-        device_buffers = copy(self.buffers)
-        for name, buf in feed_dict.items():
-            assert isinstance(buf, cuda.DeviceView)
-            device_buffers[name] = buf
-        bindings = [0] * start_binding + [buf.ptr for buf in device_buffers.values()]
-        noerror = self.context.execute_async_v2(bindings=bindings, stream_handle=stream.ptr)
-        if not noerror:
-            raise ValueError(f"ERROR: inference failed.")
-
-        return self.tensors
-
-
-class LMSDiscreteScheduler():
-    def __init__(
-        self,
-        device='cuda',
-        beta_start=0.00085,
-        beta_end=0.012,
-        num_train_timesteps=1000,
-    ):
-        self.num_train_timesteps = num_train_timesteps
-        self.order = 4
-
-        self.beta_start = beta_start
-        self.beta_end = beta_end
-        betas = (torch.linspace(beta_start**0.5, beta_end**0.5, self.num_train_timesteps, dtype=torch.float32) ** 2)
-        alphas = 1.0 - betas
-        self.alphas_cumprod = torch.cumprod(alphas, dim=0)
-
-        sigmas = np.array(((1 - self.alphas_cumprod) / self.alphas_cumprod) ** 0.5)
-        sigmas = np.concatenate([sigmas[::-1], [0.0]]).astype(np.float32)
-        self.sigmas = torch.from_numpy(sigmas)
-
-        # standard deviation of the initial noise distribution
-        self.init_noise_sigma = self.sigmas.max()
-
-        self.device = device
-
-    def set_timesteps(self, steps):
-        self.num_inference_steps = steps
-
-        timesteps = np.linspace(0, self.num_train_timesteps - 1, steps, dtype=float)[::-1].copy()
-        sigmas = np.array(((1 - self.alphas_cumprod) / self.alphas_cumprod) ** 0.5)
-        sigmas = np.interp(timesteps, np.arange(0, len(sigmas)), sigmas)
-        sigmas = np.concatenate([sigmas, [0.0]]).astype(np.float32)
-        self.sigmas = torch.from_numpy(sigmas).to(device=self.device)
-
-        # Move all timesteps to correct device beforehand
-        self.timesteps = torch.from_numpy(timesteps).to(device=self.device).float()
-        self.derivatives = []
-
-    def scale_model_input(self, sample: torch.FloatTensor, idx, *args, **kwargs) -> torch.FloatTensor:
-        return sample * self.latent_scales[idx]
-
-    def configure(self):
-        order = self.order
-        self.lms_coeffs = []
-        self.latent_scales = [1./((sigma**2 + 1) ** 0.5) for sigma in self.sigmas]
-
-        def get_lms_coefficient(order, t, current_order):
-            """
-            Compute a linear multistep coefficient.
-            """
-            def lms_derivative(tau):
-                prod = 1.0
-                for k in range(order):
-                    if current_order == k:
-                        continue
-                    prod *= (tau - self.sigmas[t - k]) / (self.sigmas[t - current_order] - self.sigmas[t - k])
-                return prod
-            integrated_coeff = integrate.quad(lms_derivative, self.sigmas[t], self.sigmas[t + 1], epsrel=1e-4)[0]
-            return integrated_coeff
-
-        for step_index in range(self.num_inference_steps):
-            order = min(step_index + 1, order)
-            self.lms_coeffs.append([get_lms_coefficient(order, step_index, curr_order) for curr_order in range(order)])
-
-    def step(self, output, latents, idx, timestep):
-        # compute the previous noisy sample x_t -> x_t-1
-        # 1. compute predicted original sample (x_0) from sigma-scaled predicted noise
-        sigma = self.sigmas[idx]
-        pred_original_sample = latents - sigma * output
-        # 2. Convert to an ODE derivative
-        derivative = (latents - pred_original_sample) / sigma
-        self.derivatives.append(derivative)
-        if len(self.derivatives) > self.order:
-            self.derivatives.pop(0)
-        # 3. Compute previous sample based on the derivatives path
-        prev_sample = latents + sum(
-            coeff * derivative for coeff, derivative in zip(self.lms_coeffs[idx], reversed(self.derivatives))
-        )
-
-        return prev_sample
-
-
-class DPMScheduler():
-    def __init__(
-        self,
-        beta_start=0.00085,
-        beta_end=0.012,
-        num_train_timesteps=1000,
-        solver_order=2,
-        predict_epsilon=True,
-        thresholding=False,
-        dynamic_thresholding_ratio=0.995,
-        sample_max_value=1.0,
-        algorithm_type="dpmsolver++",
-        solver_type="midpoint",
-        lower_order_final=True,
-        device='cuda',
-    ):
-        # this schedule is very specific to the latent diffusion model.
-        self.betas = (
-            torch.linspace(beta_start**0.5, beta_end**0.5, num_train_timesteps, dtype=torch.float32) ** 2
-        )
-
-        self.device = device
-        self.alphas = 1.0 - self.betas
-        self.alphas_cumprod = torch.cumprod(self.alphas, dim=0)
-        # Currently we only support VP-type noise schedule
-        self.alpha_t = torch.sqrt(self.alphas_cumprod)
-        self.sigma_t = torch.sqrt(1 - self.alphas_cumprod)
-        self.lambda_t = torch.log(self.alpha_t) - torch.log(self.sigma_t)
-
-        # standard deviation of the initial noise distribution
-        self.init_noise_sigma = 1.0
-
-        self.algorithm_type = algorithm_type
-        self.predict_epsilon = predict_epsilon
-        self.thresholding = thresholding
-        self.dynamic_thresholding_ratio = dynamic_thresholding_ratio
-        self.sample_max_value = sample_max_value
-        self.lower_order_final = lower_order_final
-
-        # settings for DPM-Solver
-        if algorithm_type not in ["dpmsolver", "dpmsolver++"]:
-            raise NotImplementedError(f"{algorithm_type} does is not implemented for {self.__class__}")
-        if solver_type not in ["midpoint", "heun"]:
-            raise NotImplementedError(f"{solver_type} does is not implemented for {self.__class__}")
-
-        # setable values
-        self.num_inference_steps = None
-        self.solver_order = solver_order
-        self.num_train_timesteps = num_train_timesteps
-        self.solver_type = solver_type
-
-        self.first_order_first_coef = []
-        self.first_order_second_coef = []
-
-        self.second_order_first_coef = []
-        self.second_order_second_coef = []
-        self.second_order_third_coef = []
-
-        self.third_order_first_coef = []
-        self.third_order_second_coef = []
-        self.third_order_third_coef = []
-        self.third_order_fourth_coef = []
-
-    def scale_model_input(self, sample: torch.FloatTensor, *args, **kwargs) -> torch.FloatTensor:
-        return sample
-
-    def configure(self):
-        lower_order_nums = 0
-        for step_index in range(self.num_inference_steps):
-            step_idx = step_index
-            timestep = self.timesteps[step_idx]
-
-            prev_timestep = 0 if step_idx == len(self.timesteps) - 1 else self.timesteps[step_idx + 1]
-
-            self.dpm_solver_first_order_coefs_precompute(timestep, prev_timestep)
-
-            timestep_list = [self.timesteps[step_index - 1], timestep]
-            self.multistep_dpm_solver_second_order_coefs_precompute(timestep_list, prev_timestep)
-
-            timestep_list = [self.timesteps[step_index - 2], self.timesteps[step_index - 1], timestep]
-            self.multistep_dpm_solver_third_order_coefs_precompute(timestep_list, prev_timestep)
-
-            if lower_order_nums < self.solver_order:
-                lower_order_nums += 1
-
-    def dpm_solver_first_order_coefs_precompute(self, timestep, prev_timestep):
-        lambda_t, lambda_s = self.lambda_t[prev_timestep], self.lambda_t[timestep]
-        alpha_t, alpha_s = self.alpha_t[prev_timestep], self.alpha_t[timestep]
-        sigma_t, sigma_s = self.sigma_t[prev_timestep], self.sigma_t[timestep]
-        h = lambda_t - lambda_s
-        if self.algorithm_type == "dpmsolver++":
-            self.first_order_first_coef.append(sigma_t / sigma_s)
-            self.first_order_second_coef.append(alpha_t * (torch.exp(-h) - 1.0))
-        elif self.algorithm_type == "dpmsolver":
-            self.first_order_first_coef.append(alpha_t / alpha_s)
-            self.first_order_second_coef.append(sigma_t * (torch.exp(h) - 1.0))
-
-    def multistep_dpm_solver_second_order_coefs_precompute(self, timestep_list, prev_timestep):
-        t, s0, s1 = prev_timestep, timestep_list[-1], timestep_list[-2]
-        lambda_t, lambda_s0, lambda_s1 = self.lambda_t[t], self.lambda_t[s0], self.lambda_t[s1]
-        alpha_t, alpha_s0 = self.alpha_t[t], self.alpha_t[s0]
-        sigma_t, sigma_s0 = self.sigma_t[t], self.sigma_t[s0]
-        h = lambda_t - lambda_s0
-        if self.algorithm_type == "dpmsolver++":
-            # See https://arxiv.org/abs/2211.01095 for detailed derivations
-            if self.solver_type == "midpoint":
-                self.second_order_first_coef.append(sigma_t / sigma_s0)
-                self.second_order_second_coef.append((alpha_t * (torch.exp(-h) - 1.0)))
-                self.second_order_third_coef.append(0.5 * (alpha_t * (torch.exp(-h) - 1.0)))
-            elif self.solver_type == "heun":
-                self.second_order_first_coef.append(sigma_t / sigma_s0)
-                self.second_order_second_coef.append((alpha_t * (torch.exp(-h) - 1.0)))
-                self.second_order_third_coef.append(alpha_t * ((torch.exp(-h) - 1.0) / h + 1.0))
-        elif self.algorithm_type == "dpmsolver":
-            # See https://arxiv.org/abs/2206.00927 for detailed derivations
-            if self.solver_type == "midpoint":
-                self.second_order_first_coef.append(alpha_t / alpha_s0)
-                self.second_order_second_coef.append((sigma_t * (torch.exp(h) - 1.0)))
-                self.second_order_third_coef.append(0.5 * (sigma_t * (torch.exp(h) - 1.0)))
-            elif self.solver_type == "heun":
-                self.second_order_first_coef.append(alpha_t / alpha_s0)
-                self.second_order_second_coef.append((sigma_t * (torch.exp(h) - 1.0)))
-                self.second_order_third_coef.append((sigma_t * ((torch.exp(h) - 1.0) / h - 1.0)))
-
-    def multistep_dpm_solver_third_order_coefs_precompute(self, timestep_list, prev_timestep):
-        t, s0 = prev_timestep, timestep_list[-1]
-        lambda_t, lambda_s0 = (
-            self.lambda_t[t],
-            self.lambda_t[s0]
-        )
-        alpha_t, alpha_s0 = self.alpha_t[t], self.alpha_t[s0]
-        sigma_t, sigma_s0 = self.sigma_t[t], self.sigma_t[s0]
-        h = lambda_t - lambda_s0
-        if self.algorithm_type == "dpmsolver++":
-            self.third_order_first_coef.append(sigma_t / sigma_s0)
-            self.third_order_second_coef.append(alpha_t * (torch.exp(-h) - 1.0))
-            self.third_order_third_coef.append(alpha_t * ((torch.exp(-h) - 1.0) / h + 1.0))
-            self.third_order_fourth_coef.append(alpha_t * ((torch.exp(-h) - 1.0 + h) / h**2 - 0.5))
-        elif self.algorithm_type == "dpmsolver":
-            self.third_order_first_coef.append(alpha_t / alpha_s0)
-            self.third_order_second_coef.append(sigma_t * (torch.exp(h) - 1.0))
-            self.third_order_third_coef.append(sigma_t * ((torch.exp(h) - 1.0) / h - 1.0))
-            self.third_order_fourth_coef.append(sigma_t * ((torch.exp(h) - 1.0 - h) / h**2 - 0.5))
-
-    def set_timesteps(self, num_inference_steps):
-        self.num_inference_steps = num_inference_steps
-        timesteps = (
-            np.linspace(0, self.num_train_timesteps - 1, num_inference_steps + 1)
-            .round()[::-1][:-1]
-            .copy()
-            .astype(np.int32)
-        )
-        self.timesteps = torch.from_numpy(timesteps).to(self.device)
-        self.model_outputs = [
-            None,
-        ] * self.solver_order
-        self.lower_order_nums = 0
-
-    def convert_model_output(
-        self, model_output, timestep, sample
-    ):
-        # DPM-Solver++ needs to solve an integral of the data prediction model.
-        if self.algorithm_type == "dpmsolver++":
-            if self.predict_epsilon:
-                alpha_t, sigma_t = self.alpha_t[timestep], self.sigma_t[timestep]
-                x0_pred = (sample - sigma_t * model_output) / alpha_t
-            else:
-                x0_pred = model_output
-            if self.thresholding:
-                # Dynamic thresholding in https://arxiv.org/abs/2205.11487
-                dynamic_max_val = torch.quantile(
-                    torch.abs(x0_pred).reshape((x0_pred.shape[0], -1)), self.dynamic_thresholding_ratio, dim=1
-                )
-                dynamic_max_val = torch.maximum(
-                    dynamic_max_val,
-                    self.sample_max_value * torch.ones_like(dynamic_max_val).to(dynamic_max_val.device),
-                )[(...,) + (None,) * (x0_pred.ndim - 1)]
-                x0_pred = torch.clamp(x0_pred, -dynamic_max_val, dynamic_max_val) / dynamic_max_val
-            return x0_pred
-        # DPM-Solver needs to solve an integral of the noise prediction model.
-        elif self.algorithm_type == "dpmsolver":
-            if self.predict_epsilon:
-                return model_output
-            else:
-                alpha_t, sigma_t = self.alpha_t[timestep], self.sigma_t[timestep]
-                epsilon = (sample - alpha_t * model_output) / sigma_t
-                return epsilon
-
-    def dpm_solver_first_order_update(
-        self,
-        idx,
-        model_output,
-        sample
-    ):
-        first_coef = self.first_order_first_coef[idx]
-        second_coef = self.first_order_second_coef[idx]
-
-        if self.algorithm_type == "dpmsolver++":
-            x_t = first_coef * sample - second_coef * model_output
-        elif self.algorithm_type == "dpmsolver":
-            x_t = first_coef * sample - second_coef * model_output
-        return x_t
-
-    def multistep_dpm_solver_second_order_update(
-        self,
-        idx,
-        model_output_list,
-        timestep_list,
-        prev_timestep,
-        sample
-    ):
-        t, s0, s1 = prev_timestep, timestep_list[-1], timestep_list[-2]
-        m0, m1 = model_output_list[-1], model_output_list[-2]
-        lambda_t, lambda_s0, lambda_s1 = self.lambda_t[t], self.lambda_t[s0], self.lambda_t[s1]
-        h, h_0 = lambda_t - lambda_s0, lambda_s0 - lambda_s1
-        r0 = h_0 / h
-        D0, D1 = m0, (1.0 / r0) * (m0 - m1)
-
-        first_coef = self.second_order_first_coef[idx]
-        second_coef = self.second_order_second_coef[idx]
-        third_coef = self.second_order_third_coef[idx]
-
-        if self.algorithm_type == "dpmsolver++":
-            # See https://arxiv.org/abs/2211.01095 for detailed derivations
-            if self.solver_type == "midpoint":
-                x_t = (
-                    first_coef * sample
-                    - second_coef * D0
-                    - third_coef * D1
-                )
-            elif self.solver_type == "heun":
-                x_t = (
-                    first_coef * sample
-                    - second_coef * D0
-                    + third_coef * D1
-                )
-        elif self.algorithm_type == "dpmsolver":
-            # See https://arxiv.org/abs/2206.00927 for detailed derivations
-            if self.solver_type == "midpoint":
-                x_t = (
-                    first_coef * sample
-                    - second_coef * D0
-                    - third_coef * D1
-                )
-            elif self.solver_type == "heun":
-                x_t = (
-                    first_coef * sample
-                    - second_coef * D0
-                    - third_coef * D1
-                )
-        return x_t
-
-    def multistep_dpm_solver_third_order_update(
-        self,
-        idx,
-        model_output_list,
-        timestep_list,
-        prev_timestep,
-        sample
-    ):
-        t, s0, s1, s2 = prev_timestep, timestep_list[-1], timestep_list[-2], timestep_list[-3]
-        m0, m1, m2 = model_output_list[-1], model_output_list[-2], model_output_list[-3]
-        lambda_t, lambda_s0, lambda_s1, lambda_s2 = (
-            self.lambda_t[t],
-            self.lambda_t[s0],
-            self.lambda_t[s1],
-            self.lambda_t[s2],
-        )
-        h, h_0, h_1 = lambda_t - lambda_s0, lambda_s0 - lambda_s1, lambda_s1 - lambda_s2
-        r0, r1 = h_0 / h, h_1 / h
-        D0 = m0
-        D1_0, D1_1 = (1.0 / r0) * (m0 - m1), (1.0 / r1) * (m1 - m2)
-        D1 = D1_0 + (r0 / (r0 + r1)) * (D1_0 - D1_1)
-        D2 = (1.0 / (r0 + r1)) * (D1_0 - D1_1)
-
-        first_coef = self.third_order_first_coef[idx]
-        second_coef = self.third_order_second_coef[idx]
-        third_coef = self.third_order_third_coef[idx]
-        fourth_coef = self.third_order_fourth_coef[idx]
-
-        if self.algorithm_type == "dpmsolver++":
-            # See https://arxiv.org/abs/2206.00927 for detailed derivations
-            x_t = (
-                first_coef * sample
-                - second_coef * D0
-                + third_coef * D1
-                - fourth_coef * D2
-            )
-        elif self.algorithm_type == "dpmsolver":
-            # See https://arxiv.org/abs/2206.00927 for detailed derivations
-            x_t = (
-                first_coef * sample
-                - second_coef * D0
-                - third_coef * D1
-                - fourth_coef * D2
-            )
-        return x_t
-
-    def step(self, output, latents, step_index, timestep):
-        if self.num_inference_steps is None:
-            raise ValueError(
-                "Number of inference steps is 'None', you need to run 'set_timesteps' after creating the scheduler"
-            )
-
-        prev_timestep = 0 if step_index == len(self.timesteps) - 1 else self.timesteps[step_index + 1]
-        lower_order_final = (
-            (step_index == len(self.timesteps) - 1) and self.lower_order_final and len(self.timesteps) < 15
-        )
-        lower_order_second = (
-            (step_index == len(self.timesteps) - 2) and self.lower_order_final and len(self.timesteps) < 15
-        )
-
-        output = self.convert_model_output(output, timestep, latents)
-        for i in range(self.solver_order - 1):
-            self.model_outputs[i] = self.model_outputs[i + 1]
-        self.model_outputs[-1] = output
-
-        if self.solver_order == 1 or self.lower_order_nums < 1 or lower_order_final:
-            prev_sample = self.dpm_solver_first_order_update(step_index, output, latents)
-        elif self.solver_order == 2 or self.lower_order_nums < 2 or lower_order_second:
-            timestep_list = [self.timesteps[step_index - 1], timestep]
-            prev_sample = self.multistep_dpm_solver_second_order_update(
-                step_index, self.model_outputs, timestep_list, prev_timestep, latents
-            )
-        else:
-            timestep_list = [self.timesteps[step_index - 2], self.timesteps[step_index - 1], timestep]
-            prev_sample = self.multistep_dpm_solver_third_order_update(
-                step_index, self.model_outputs, timestep_list, prev_timestep, latents
-            )
-
-        if self.lower_order_nums < self.solver_order:
-            self.lower_order_nums += 1
-
-        return prev_sample
-
-
-def save_image(images, image_path_dir, image_name_prefix):
-    """
-    Save the generated images to png files.
-    """
-    images = ((images + 1) * 255 / 2).clamp(0, 255).detach().permute(0, 2, 3, 1).round().type(torch.uint8).cpu().numpy()
-    for i in range(images.shape[0]):
-        image_path = os.path.join(image_path_dir, image_name_prefix+str(i+1)+'-'+str(random.randint(1000, 9999))+'.png')
-        print(f"Saving image {i+1} / {images.shape[0]} to: {image_path}")
-        Image.fromarray(images[i]).save(image_path)
-
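For context, a minimal sketch of how the removed Engine class was typically driven (build once, then activate, allocate buffers and run inference through polygraphy's CUDA wrappers). The engine name, directory and the "sample" binding name below are illustrative assumptions, not part of this commit.

import numpy as np
import torch
from polygraphy import cuda

# hypothetical engine produced earlier by Engine.build(onnx_path, fp16=True)
engine = Engine("unet_fp16", "./engines")
engine.activate()                                                      # deserialize the .plan file
engine.allocate_buffers(shape_dict={"sample": (2, 4, 64, 64)}, device="cuda")

latents = torch.randn(2, 4, 64, 64, dtype=torch.float32, device="cuda")
stream = cuda.Stream()
feed = {"sample": cuda.DeviceView(ptr=latents.data_ptr(), shape=(2, 4, 64, 64), dtype=np.float32)}
outputs = engine.infer(feed, stream)                                   # OrderedDict of preallocated output tensors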
lyrasd_model/__init__.py
ADDED
@@ -0,0 +1,5 @@
+from . import lyrasd_img2img_pipeline, lyrasd_txt2img_pipeline, lyrasd_controlnet_txt2img_pipeline, lyrasd_controlnet_img2img_pipeline
+from .lyrasd_txt2img_pipeline import LyraSdTxt2ImgPipeline
+from .lyrasd_img2img_pipeline import LyraSDImg2ImgPipeline
+from .lyrasd_controlnet_txt2img_pipeline import LyraSdControlnetTxt2ImgPipeline
+from .lyrasd_controlnet_img2img_pipeline import LyraSdControlnetImg2ImgPipeline
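With this package __init__, the pipeline classes can be imported from the package root, for example:

from lyrasd_model import LyraSdTxt2ImgPipeline, LyraSdControlnetImg2ImgPipeline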
lyrasd_model/lora_util.py
ADDED
@@ -0,0 +1,54 @@
+import os
+import torch
+from transformers import CLIPImageProcessor, CLIPTextModel, CLIPTokenizer
+import numpy as np
+
+def add_text_lora_layer(clip_model, lora_model_path="Misaka.safetensors", alpha=1.0, lora_file_format="fp32", device="cuda:0"):
+    if lora_file_format == "fp32":
+        model_dtype = np.float32
+    elif lora_file_format == "fp16":
+        model_dtype = np.float16
+    else:
+        raise Exception(f"unsupported model dtype: {lora_file_format}")
+    all_files = os.scandir(lora_model_path)
+    unload_dict = []
+    # directly update weight in diffusers model
+    for file in all_files:
+
+        if 'text' in file.name:
+            layer_infos = file.name.split('.')[0].split('text_model_')[-1].split('_')
+            curr_layer = clip_model.text_model
+        else:
+            continue
+
+        # find the target layer
+        temp_name = layer_infos.pop(0)
+        while len(layer_infos) > -1:
+            try:
+                curr_layer = curr_layer.__getattr__(temp_name)
+                if len(layer_infos) > 0:
+                    temp_name = layer_infos.pop(0)
+                # if temp_name == "self":
+                #     temp_name += "_" + layer_infos.pop(0)
+                # elif temp_name != "mlp" and len(layer_infos) == 1:
+                #     temp_name += "_" + layer_infos.pop(0)
+                elif len(layer_infos) == 0:
+                    break
+            except Exception:
+                if len(temp_name) > 0:
+                    temp_name += '_'+layer_infos.pop(0)
+                else:
+                    temp_name = layer_infos.pop(0)
+        data = torch.from_numpy(np.fromfile(file.path, dtype=model_dtype)).to(clip_model.dtype).to(clip_model.device).reshape(curr_layer.weight.data.shape)
+        if len(curr_layer.weight.data) == 4:
+            adding_weight = alpha * data.permute(0,3,1,2)
+        else:
+            adding_weight = alpha * data
+        curr_layer.weight.data += adding_weight
+
+        curr_layer_unload_data = {
+            "layer": curr_layer,
+            "added_weight": adding_weight
+        }
+        unload_dict.append(curr_layer_unload_data)
+    return unload_dict
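A short sketch of how add_text_lora_layer is used: it merges per-layer LoRA weight files from a directory into the CLIP text encoder in place and returns an unload list, so the merge can later be undone by subtracting exactly what was added (this is what the pipelines' unload_lora does). The model and LoRA paths below are illustrative assumptions.

import torch
from transformers import CLIPTextModel

# hypothetical diffusers-style model directory and a directory of per-layer LoRA weight files
text_encoder = CLIPTextModel.from_pretrained("path/to/model", subfolder="text_encoder").to(torch.float16).to("cuda")
unload_dict = add_text_lora_layer(text_encoder, "path/to/lora_bins/", alpha=0.8, lora_file_format="fp32")

# ... run inference with the merged weights ...

# undo the merge by subtracting the exact weights that were added
for entry in unload_dict:
    entry["layer"].weight.data -= entry["added_weight"]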
lyrasd_model/lyrasd_controlnet_img2img_pipeline.py
ADDED
@@ -0,0 +1,637 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
import torch
|
2 |
+
from typing import Any, Callable, Dict, List, Optional, Union
|
3 |
+
from diffusers.schedulers import KarrasDiffusionSchedulers
|
4 |
+
from diffusers.loaders import TextualInversionLoaderMixin
|
5 |
+
from diffusers.models import AutoencoderKL
|
6 |
+
from diffusers.utils import randn_tensor, logging
|
7 |
+
from diffusers.schedulers import EulerAncestralDiscreteScheduler, EulerDiscreteScheduler, DPMSolverMultistepScheduler
|
8 |
+
from diffusers.utils import PIL_INTERPOLATION
|
9 |
+
from transformers import CLIPImageProcessor, CLIPTextModel, CLIPTokenizer
|
10 |
+
import os
|
11 |
+
import numpy as np
|
12 |
+
import warnings
|
13 |
+
from .lora_util import add_text_lora_layer
|
14 |
+
import gc
|
15 |
+
|
16 |
+
from PIL import Image
|
17 |
+
import PIL
|
18 |
+
|
19 |
+
import inspect
|
20 |
+
|
21 |
+
import time
|
22 |
+
|
23 |
+
logger = logging.get_logger(__name__) # pylint: disable=invalid-name
|
24 |
+
|
25 |
+
def numpy_to_pil(images):
|
26 |
+
"""
|
27 |
+
Convert a numpy image or a batch of images to a PIL image.
|
28 |
+
"""
|
29 |
+
if images.ndim == 3:
|
30 |
+
images = images[None, ...]
|
31 |
+
images = (images * 255).round().astype("uint8")
|
32 |
+
if images.shape[-1] == 1:
|
33 |
+
# special case for grayscale (single channel) images
|
34 |
+
pil_images = [Image.fromarray(image.squeeze(), mode="L") for image in images]
|
35 |
+
else:
|
36 |
+
pil_images = [Image.fromarray(image) for image in images]
|
37 |
+
|
38 |
+
return pil_images
|
39 |
+
|
40 |
+
|
41 |
+
def preprocess(image):
|
42 |
+
warnings.warn(
|
43 |
+
"The preprocess method is deprecated and will be removed in a future version. Please"
|
44 |
+
" use VaeImageProcessor.preprocess instead",
|
45 |
+
FutureWarning,
|
46 |
+
)
|
47 |
+
if isinstance(image, torch.Tensor):
|
48 |
+
return image
|
49 |
+
elif isinstance(image, PIL.Image.Image):
|
50 |
+
image = [image]
|
51 |
+
|
52 |
+
if isinstance(image[0], PIL.Image.Image):
|
53 |
+
w, h = image[0].size
|
54 |
+
w, h = (x - x % 8 for x in (w, h)) # resize to integer multiple of 8
|
55 |
+
|
56 |
+
image = [np.array(i.resize((w, h), resample=PIL_INTERPOLATION["lanczos"]))[None, :] for i in image]
|
57 |
+
image = np.concatenate(image, axis=0)
|
58 |
+
image = np.array(image).astype(np.float32) / 255.0
|
59 |
+
image = image.transpose(0, 3, 1, 2)
|
60 |
+
image = 2.0 * image - 1.0
|
61 |
+
image = torch.from_numpy(image)
|
62 |
+
elif isinstance(image[0], torch.Tensor):
|
63 |
+
image = torch.cat(image, dim=0)
|
64 |
+
return image
|
65 |
+
|
66 |
+
class LyraSdControlnetImg2ImgPipeline(TextualInversionLoaderMixin):
|
67 |
+
def __init__(self, model_path, lib_so_path, model_dtype='fp32', device=torch.device("cuda"), dtype=torch.float16) -> None:
|
68 |
+
self.device = device
|
69 |
+
self.dtype = dtype
|
70 |
+
|
71 |
+
torch.classes.load_library(lib_so_path)
|
72 |
+
|
73 |
+
self.vae = AutoencoderKL.from_pretrained(model_path, subfolder="vae").to(dtype).to(device)
|
74 |
+
self.tokenizer = CLIPTokenizer.from_pretrained(model_path, subfolder="tokenizer")
|
75 |
+
self.text_encoder = CLIPTextModel.from_pretrained(model_path, subfolder="text_encoder").to(dtype).to(device)
|
76 |
+
|
77 |
+
self.unet_in_channels = 4
|
78 |
+
self.vae_scale_factor = 2 ** (len(self.vae.config.block_out_channels) - 1)
|
79 |
+
self.vae.enable_tiling()
|
80 |
+
self.unet = torch.classes.lyrasd.Unet2dConditionalModelOp(
|
81 |
+
3, # max num of controlnets
|
82 |
+
"fp16" # inference dtype (can only use fp16 for now)
|
83 |
+
)
|
84 |
+
|
85 |
+
unet_path = os.path.join(model_path, "unet_bins/")
|
86 |
+
|
87 |
+
self.reload_unet_model(unet_path, model_dtype)
|
88 |
+
|
89 |
+
self.scheduler = EulerAncestralDiscreteScheduler.from_pretrained(model_path, subfolder="scheduler")
|
90 |
+
|
91 |
+
def load_controlnet_model(self, model_name, controlnet_path, model_dtype="fp32"):
|
92 |
+
if len(controlnet_path) > 0 and controlnet_path[-1] != "/":
|
93 |
+
controlnet_path = controlnet_path + "/"
|
94 |
+
self.unet.load_controlnet_model(model_name, controlnet_path, model_dtype)
|
95 |
+
|
96 |
+
def unload_controlnet_model(self, model_name):
|
97 |
+
self.unet.unload_controlnet_model(model_name, True)
|
98 |
+
|
99 |
+
def get_loaded_controlnet(self):
|
100 |
+
return self.unet.get_loaded_controlnet()
|
101 |
+
|
102 |
+
def reload_unet_model(self, unet_path, unet_file_format='fp32'):
|
103 |
+
if len(unet_path) > 0 and unet_path[-1] != "/":
|
104 |
+
unet_path = unet_path + "/"
|
105 |
+
return self.unet.reload_unet_model(unet_path, unet_file_format)
|
106 |
+
|
107 |
+
def load_lora(self, lora_model_path, lora_name, lora_strength, lora_file_format='fp32'):
|
108 |
+
if len(lora_model_path) > 0 and lora_model_path[-1] != "/":
|
109 |
+
lora_model_path = lora_model_path + "/"
|
110 |
+
lora = add_text_lora_layer(self.text_encoder, lora_model_path, lora_strength, lora_file_format)
|
111 |
+
self.loaded_lora[lora_name] = lora
|
112 |
+
self.unet.load_lora(lora_model_path, lora_name, lora_strength, lora_file_format)
|
113 |
+
|
114 |
+
def unload_lora(self, lora_name, clean_cache=False):
|
115 |
+
for layer_data in self.loaded_lora[lora_name]:
|
116 |
+
layer = layer_data['layer']
|
117 |
+
added_weight = layer_data['added_weight']
|
118 |
+
layer.weight.data -= added_weight
|
119 |
+
self.unet.unload_lora(lora_name, clean_cache)
|
120 |
+
del self.loaded_lora[lora_name]
|
121 |
+
gc.collect()
|
122 |
+
torch.cuda.empty_cache()
|
123 |
+
|
124 |
+
def clean_lora_cache(self):
|
125 |
+
self.unet.clean_lora_cache()
|
126 |
+
|
127 |
+
def get_loaded_lora(self):
|
128 |
+
return self.unet.get_loaded_lora()
|
129 |
+
|
130 |
+
def _encode_prompt(
|
131 |
+
self,
|
132 |
+
prompt,
|
133 |
+
device,
|
134 |
+
num_images_per_prompt,
|
135 |
+
do_classifier_free_guidance,
|
136 |
+
negative_prompt=None,
|
137 |
+
prompt_embeds: Optional[torch.FloatTensor] = None,
|
138 |
+
negative_prompt_embeds: Optional[torch.FloatTensor] = None,
|
139 |
+
):
|
140 |
+
r"""
|
141 |
+
Encodes the prompt into text encoder hidden states.
|
142 |
+
|
143 |
+
Args:
|
144 |
+
prompt (`str` or `List[str]`, *optional*):
|
145 |
+
prompt to be encoded
|
146 |
+
device: (`torch.device`):
|
147 |
+
torch device
|
148 |
+
num_images_per_prompt (`int`):
|
149 |
+
number of images that should be generated per prompt
|
150 |
+
do_classifier_free_guidance (`bool`):
|
151 |
+
whether to use classifier free guidance or not
|
152 |
+
negative_prompt (`str` or `List[str]`, *optional*):
|
153 |
+
The prompt or prompts not to guide the image generation. If not defined, one has to pass
|
154 |
+
`negative_prompt_embeds` instead. Ignored when not using guidance (i.e., ignored if `guidance_scale` is
|
155 |
+
less than `1`).
|
156 |
+
prompt_embeds (`torch.FloatTensor`, *optional*):
|
157 |
+
Pre-generated text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt weighting. If not
|
158 |
+
provided, text embeddings will be generated from `prompt` input argument.
|
159 |
+
negative_prompt_embeds (`torch.FloatTensor`, *optional*):
|
160 |
+
Pre-generated negative text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt
|
161 |
+
weighting. If not provided, negative_prompt_embeds will be generated from `negative_prompt` input
|
162 |
+
argument.
|
163 |
+
"""
|
164 |
+
if prompt is not None and isinstance(prompt, str):
|
165 |
+
batch_size = 1
|
166 |
+
elif prompt is not None and isinstance(prompt, list):
|
167 |
+
batch_size = len(prompt)
|
168 |
+
else:
|
169 |
+
batch_size = prompt_embeds.shape[0]
|
170 |
+
|
171 |
+
if prompt_embeds is None:
|
172 |
+
# textual inversion: procecss multi-vector tokens if necessary
|
173 |
+
if isinstance(self, TextualInversionLoaderMixin):
|
174 |
+
prompt = self.maybe_convert_prompt(prompt, self.tokenizer)
|
175 |
+
|
176 |
+
text_inputs = self.tokenizer(
|
177 |
+
prompt,
|
178 |
+
padding="max_length",
|
179 |
+
max_length=self.tokenizer.model_max_length,
|
180 |
+
truncation=True,
|
181 |
+
return_tensors="pt",
|
182 |
+
)
|
183 |
+
text_input_ids = text_inputs.input_ids
|
184 |
+
untruncated_ids = self.tokenizer(prompt, padding="longest", return_tensors="pt").input_ids
|
185 |
+
|
186 |
+
if untruncated_ids.shape[-1] >= text_input_ids.shape[-1] and not torch.equal(
|
187 |
+
text_input_ids, untruncated_ids
|
188 |
+
):
|
189 |
+
removed_text = self.tokenizer.batch_decode(
|
190 |
+
untruncated_ids[:, self.tokenizer.model_max_length - 1 : -1]
|
191 |
+
)
|
192 |
+
logger.warning(
|
193 |
+
"The following part of your input was truncated because CLIP can only handle sequences up to"
|
194 |
+
f" {self.tokenizer.model_max_length} tokens: {removed_text}"
|
195 |
+
)
|
196 |
+
|
197 |
+
if hasattr(self.text_encoder.config, "use_attention_mask") and self.text_encoder.config.use_attention_mask:
|
198 |
+
attention_mask = text_inputs.attention_mask.to(device)
|
199 |
+
else:
|
200 |
+
attention_mask = None
|
201 |
+
|
202 |
+
prompt_embeds = self.text_encoder(
|
203 |
+
text_input_ids.to(device),
|
204 |
+
attention_mask=attention_mask,
|
205 |
+
)
|
206 |
+
prompt_embeds = prompt_embeds[0]
|
207 |
+
|
208 |
+
prompt_embeds = prompt_embeds.to(dtype=self.text_encoder.dtype, device=device)
|
209 |
+
|
210 |
+
bs_embed, seq_len, _ = prompt_embeds.shape
|
211 |
+
# duplicate text embeddings for each generation per prompt, using mps friendly method
|
212 |
+
prompt_embeds = prompt_embeds.repeat(1, num_images_per_prompt, 1)
|
213 |
+
prompt_embeds = prompt_embeds.view(bs_embed * num_images_per_prompt, seq_len, -1)
|
214 |
+
|
215 |
+
# get unconditional embeddings for classifier free guidance
|
216 |
+
if do_classifier_free_guidance and negative_prompt_embeds is None:
|
217 |
+
uncond_tokens: List[str]
|
218 |
+
if negative_prompt is None:
|
219 |
+
uncond_tokens = [""] * batch_size
|
220 |
+
elif type(prompt) is not type(negative_prompt):
|
221 |
+
raise TypeError(
|
222 |
+
f"`negative_prompt` should be the same type to `prompt`, but got {type(negative_prompt)} !="
|
223 |
+
f" {type(prompt)}."
|
224 |
+
)
|
225 |
+
elif isinstance(negative_prompt, str):
|
226 |
+
uncond_tokens = [negative_prompt]
|
227 |
+
elif batch_size != len(negative_prompt):
|
228 |
+
raise ValueError(
|
229 |
+
f"`negative_prompt`: {negative_prompt} has batch size {len(negative_prompt)}, but `prompt`:"
|
230 |
+
f" {prompt} has batch size {batch_size}. Please make sure that passed `negative_prompt` matches"
|
231 |
+
" the batch size of `prompt`."
|
232 |
+
)
|
233 |
+
else:
|
234 |
+
uncond_tokens = negative_prompt
|
235 |
+
|
236 |
+
# textual inversion: procecss multi-vector tokens if necessary
|
237 |
+
if isinstance(self, TextualInversionLoaderMixin):
|
238 |
+
uncond_tokens = self.maybe_convert_prompt(uncond_tokens, self.tokenizer)
|
239 |
+
|
240 |
+
max_length = prompt_embeds.shape[1]
|
241 |
+
uncond_input = self.tokenizer(
|
242 |
+
uncond_tokens,
|
243 |
+
padding="max_length",
|
244 |
+
max_length=max_length,
|
245 |
+
truncation=True,
|
246 |
+
return_tensors="pt",
|
247 |
+
)
|
248 |
+
|
249 |
+
if hasattr(self.text_encoder.config, "use_attention_mask") and self.text_encoder.config.use_attention_mask:
|
250 |
+
attention_mask = uncond_input.attention_mask.to(device)
|
251 |
+
else:
|
252 |
+
attention_mask = None
|
253 |
+
|
254 |
+
negative_prompt_embeds = self.text_encoder(
|
255 |
+
uncond_input.input_ids.to(device),
|
256 |
+
attention_mask=attention_mask,
|
257 |
+
)
|
258 |
+
negative_prompt_embeds = negative_prompt_embeds[0]
|
259 |
+
|
260 |
+
if do_classifier_free_guidance:
|
261 |
+
# duplicate unconditional embeddings for each generation per prompt, using mps friendly method
|
262 |
+
seq_len = negative_prompt_embeds.shape[1]
|
263 |
+
|
264 |
+
negative_prompt_embeds = negative_prompt_embeds.to(dtype=self.text_encoder.dtype, device=device)
|
265 |
+
|
266 |
+
negative_prompt_embeds = negative_prompt_embeds.repeat(1, num_images_per_prompt, 1)
|
267 |
+
negative_prompt_embeds = negative_prompt_embeds.view(batch_size * num_images_per_prompt, seq_len, -1)
|
268 |
+
|
269 |
+
# For classifier free guidance, we need to do two forward passes.
|
270 |
+
# Here we concatenate the unconditional and text embeddings into a single batch
|
271 |
+
# to avoid doing two forward passes
|
272 |
+
prompt_embeds = torch.cat([negative_prompt_embeds, prompt_embeds])
|
273 |
+
|
274 |
+
return prompt_embeds
|
275 |
+
|
276 |
+
|
277 |
+
def decode_latents(self, latents):
|
278 |
+
latents = 1 / self.vae.config.scaling_factor * latents
|
279 |
+
image = self.vae.decode(latents).sample
|
280 |
+
image = (image / 2 + 0.5).clamp(0, 1)
|
281 |
+
# we always cast to float32 as this does not cause significant overhead and is compatible with bfloat16
|
282 |
+
image = image.cpu().permute(0, 2, 3, 1).float().numpy()
|
283 |
+
return image
|
284 |
+
|
285 |
+
def check_inputs(
|
286 |
+
self,
|
287 |
+
prompt,
|
288 |
+
height,
|
289 |
+
width,
|
290 |
+
negative_prompt=None,
|
291 |
+
prompt_embeds=None,
|
292 |
+
negative_prompt_embeds=None,
|
293 |
+
):
|
294 |
+
if height % 64 != 0 or width % 64 != 0: # 初版暂时只支持 64 的倍数的 height 和 width
|
295 |
+
raise ValueError(f"`height` and `width` have to be divisible by 64 but are {height} and {width}.")
|
296 |
+
|
297 |
+
if prompt is not None and prompt_embeds is not None:
|
298 |
+
raise ValueError(
|
299 |
+
f"Cannot forward both `prompt`: {prompt} and `prompt_embeds`: {prompt_embeds}. Please make sure to"
|
300 |
+
" only forward one of the two."
|
301 |
+
)
|
302 |
+
elif prompt is None and prompt_embeds is None:
|
303 |
+
raise ValueError(
|
304 |
+
"Provide either `prompt` or `prompt_embeds`. Cannot leave both `prompt` and `prompt_embeds` undefined."
|
305 |
+
)
|
306 |
+
elif prompt is not None and (not isinstance(prompt, str) and not isinstance(prompt, list)):
|
307 |
+
raise ValueError(f"`prompt` has to be of type `str` or `list` but is {type(prompt)}")
|
308 |
+
|
309 |
+
if negative_prompt is not None and negative_prompt_embeds is not None:
|
310 |
+
raise ValueError(
|
311 |
+
f"Cannot forward both `negative_prompt`: {negative_prompt} and `negative_prompt_embeds`:"
|
312 |
+
f" {negative_prompt_embeds}. Please make sure to only forward one of the two."
|
313 |
+
)
|
314 |
+
|
315 |
+
if prompt_embeds is not None and negative_prompt_embeds is not None:
|
316 |
+
if prompt_embeds.shape != negative_prompt_embeds.shape:
|
317 |
+
raise ValueError(
|
318 |
+
"`prompt_embeds` and `negative_prompt_embeds` must have the same shape when passed directly, but"
|
319 |
+
f" got: `prompt_embeds` {prompt_embeds.shape} != `negative_prompt_embeds`"
|
320 |
+
f" {negative_prompt_embeds.shape}."
|
321 |
+
)
|
322 |
+
|
323 |
+
def prepare_latents(self, image, timestep, batch_size, num_images_per_prompt, dtype, device, generator=None):
|
324 |
+
if not isinstance(image, (torch.Tensor, PIL.Image.Image, list)):
|
325 |
+
raise ValueError(
|
326 |
+
f"`image` has to be of type `torch.Tensor`, `PIL.Image.Image` or list but is {type(image)}"
|
327 |
+
)
|
328 |
+
|
329 |
+
image = image.to(device=device, dtype=dtype)
|
330 |
+
|
331 |
+
batch_size = batch_size * num_images_per_prompt
|
332 |
+
|
333 |
+
if image.shape[1] == 4:
|
334 |
+
init_latents = image
|
335 |
+
|
336 |
+
else:
|
337 |
+
if isinstance(generator, list) and len(generator) != batch_size:
|
338 |
+
raise ValueError(
|
339 |
+
f"You have passed a list of generators of length {len(generator)}, but requested an effective batch"
|
340 |
+
f" size of {batch_size}. Make sure the batch size matches the length of the generators."
|
341 |
+
)
|
342 |
+
|
343 |
+
elif isinstance(generator, list):
|
344 |
+
init_latents = [
|
345 |
+
self.vae.encode(image[i: i + 1]).latent_dist.sample(generator[i]) for i in range(batch_size)
|
346 |
+
]
|
347 |
+
init_latents = torch.cat(init_latents, dim=0)
|
348 |
+
else:
|
349 |
+
init_latents = self.vae.encode(image).latent_dist.sample(generator)
|
350 |
+
|
351 |
+
init_latents = self.vae.config.scaling_factor * init_latents
|
352 |
+
|
353 |
+
if batch_size > init_latents.shape[0] and batch_size % init_latents.shape[0] == 0:
|
354 |
+
# expand init_latents for batch_size
|
355 |
+
deprecation_message = (
|
356 |
+
f"You have passed {batch_size} text prompts (`prompt`), but only {init_latents.shape[0]} initial"
|
357 |
+
" images (`image`). Initial images are now duplicating to match the number of text prompts. Note"
|
358 |
+
" that this behavior is deprecated and will be removed in a version 1.0.0. Please make sure to update"
|
359 |
+
" your script to pass as many initial images as text prompts to suppress this warning."
|
360 |
+
)
|
361 |
+
deprecate("len(prompt) != len(image)", "1.0.0", deprecation_message, standard_warn=False)
|
362 |
+
additional_image_per_prompt = batch_size // init_latents.shape[0]
|
363 |
+
init_latents = torch.cat([init_latents] * additional_image_per_prompt, dim=0)
|
364 |
+
elif batch_size > init_latents.shape[0] and batch_size % init_latents.shape[0] != 0:
|
365 |
+
raise ValueError(
|
366 |
+
f"Cannot duplicate `image` of batch size {init_latents.shape[0]} to {batch_size} text prompts."
|
367 |
+
)
|
368 |
+
else:
|
369 |
+
init_latents = torch.cat([init_latents], dim=0)
|
370 |
+
|
371 |
+
shape = init_latents.shape
|
372 |
+
noise = randn_tensor(shape, generator=generator, device=device, dtype=dtype)
|
373 |
+
|
374 |
+
# get latents
|
375 |
+
init_latents = self.scheduler.add_noise(init_latents, noise, timestep)
|
376 |
+
latents = init_latents
|
377 |
+
|
378 |
+
return latents
|
379 |
+
|
380 |
+
def prepare_image(
|
381 |
+
self,
|
382 |
+
image,
|
383 |
+
width,
|
384 |
+
height,
|
385 |
+
batch_size,
|
386 |
+
num_images_per_prompt,
|
387 |
+
device,
|
388 |
+
dtype,
|
389 |
+
do_classifier_free_guidance=False,
|
390 |
+
guess_mode=False,
|
391 |
+
):
|
392 |
+
if not isinstance(image, torch.Tensor):
|
393 |
+
if isinstance(image, PIL.Image.Image):
|
394 |
+
image = [image]
|
395 |
+
|
396 |
+
if isinstance(image[0], PIL.Image.Image):
|
397 |
+
images = []
|
398 |
+
|
399 |
+
for image_ in image:
|
400 |
+
image_ = image_.convert("RGB")
|
401 |
+
image_ = image_.resize((width, height), resample=PIL_INTERPOLATION["lanczos"])
|
402 |
+
image_ = np.array(image_)
|
403 |
+
image_ = image_[None, :]
|
404 |
+
images.append(image_)
|
405 |
+
|
406 |
+
image = images
|
407 |
+
|
408 |
+
image = np.concatenate(image, axis=0)
|
409 |
+
image = np.array(image).astype(np.float32) / 255.0
|
410 |
+
image = torch.from_numpy(image)
|
411 |
+
elif isinstance(image[0], torch.Tensor):
|
412 |
+
image = torch.cat(image, dim=0)
|
413 |
+
|
414 |
+
image_batch_size = image.shape[0]
|
415 |
+
|
416 |
+
if image_batch_size == 1:
|
417 |
+
repeat_by = batch_size
|
418 |
+
else:
|
419 |
+
# image batch size is the same as prompt batch size
|
420 |
+
repeat_by = num_images_per_prompt
|
421 |
+
|
422 |
+
image = image.repeat_interleave(repeat_by, dim=0)
|
423 |
+
|
424 |
+
image = image.to(device=device, dtype=dtype)
|
425 |
+
|
426 |
+
if do_classifier_free_guidance and not guess_mode:
|
427 |
+
image = torch.cat([image] * 2)
|
428 |
+
|
429 |
+
return image
|
430 |
+
|
431 |
+
def prepare_extra_step_kwargs(self, generator, eta):
|
432 |
+
# prepare extra kwargs for the scheduler step, since not all schedulers have the same signature
|
433 |
+
# eta (η) is only used with the DDIMScheduler, it will be ignored for other schedulers.
|
434 |
+
# eta corresponds to η in DDIM paper: https://arxiv.org/abs/2010.02502
|
435 |
+
# and should be between [0, 1]
|
436 |
+
|
437 |
+
accepts_eta = "eta" in set(inspect.signature(self.scheduler.step).parameters.keys())
|
438 |
+
extra_step_kwargs = {}
|
439 |
+
if accepts_eta:
|
440 |
+
extra_step_kwargs["eta"] = eta
|
441 |
+
|
442 |
+
# check if the scheduler accepts generator
|
443 |
+
accepts_generator = "generator" in set(inspect.signature(self.scheduler.step).parameters.keys())
|
444 |
+
if accepts_generator:
|
445 |
+
extra_step_kwargs["generator"] = generator
|
446 |
+
return extra_step_kwargs
|
447 |
+
|
448 |
+
def get_timesteps(self, num_inference_steps, strength, device):
|
449 |
+
# get the original timestep using init_timestep
|
450 |
+
init_timestep = min(int(num_inference_steps * strength), num_inference_steps)
|
451 |
+
|
452 |
+
t_start = max(num_inference_steps - init_timestep, 0)
|
453 |
+
timesteps = self.scheduler.timesteps[t_start * self.scheduler.order:]
|
454 |
+
|
455 |
+
return timesteps, num_inference_steps - t_start
|
456 |
+
|
457 |
+
|
458 |
+
@torch.no_grad()
|
459 |
+
def __call__(
|
460 |
+
self,
|
461 |
+
prompt: Union[str, List[str]] = None,
|
462 |
+
image: Union[
|
463 |
+
torch.FloatTensor,
|
464 |
+
PIL.Image.Image,
|
465 |
+
np.ndarray,
|
466 |
+
List[torch.FloatTensor],
|
467 |
+
List[PIL.Image.Image],
|
468 |
+
List[np.ndarray],
|
469 |
+
] = None,
|
470 |
+
strength: float = 0.8,
|
471 |
+
height: Optional[int] = None,
|
472 |
+
width: Optional[int] = None,
|
473 |
+
num_inference_steps: int = 50,
|
474 |
+
guidance_scale: float = 7.5,
|
475 |
+
negative_prompt: Optional[Union[str, List[str]]] = None,
|
476 |
+
num_images_per_prompt: Optional[int] = 1,
|
477 |
+
controlnet_images: Optional[List[PIL.Image.Image]] = None,
|
478 |
+
controlnet_scale: Optional[List[float]] = None,
|
479 |
+
controlnet_names: Optional[List[str]] = None,
|
480 |
+
guess_mode = False,
|
481 |
+
eta: float = 0.0,
|
482 |
+
generator: Optional[Union[torch.Generator, List[torch.Generator]]] = None,
|
483 |
+
latents: Optional[torch.FloatTensor] = None,
|
484 |
+
prompt_embeds: Optional[torch.FloatTensor] = None,
|
485 |
+
negative_prompt_embeds: Optional[torch.FloatTensor] = None,
|
486 |
+
):
|
487 |
+
r"""
|
488 |
+
Function invoked when calling the pipeline for generation.
|
489 |
+
|
490 |
+
Args:
|
491 |
+
prompt (`str` or `List[str]`, *optional*):
|
492 |
+
The prompt or prompts to guide the image generation. If not defined, one has to pass `prompt_embeds`.
|
493 |
+
instead.
|
494 |
+
height (`int`, *optional*, defaults to self.unet.config.sample_size * self.vae_scale_factor):
|
495 |
+
The height in pixels of the generated image.
|
496 |
+
width (`int`, *optional*, defaults to self.unet.config.sample_size * self.vae_scale_factor):
|
497 |
+
The width in pixels of the generated image.
|
498 |
+
num_inference_steps (`int`, *optional*, defaults to 50):
|
499 |
+
The number of denoising steps. More denoising steps usually lead to a higher quality image at the
|
500 |
+
expense of slower inference.
|
501 |
+
guidance_scale (`float`, *optional*, defaults to 7.5):
|
502 |
+
Guidance scale as defined in [Classifier-Free Diffusion Guidance](https://arxiv.org/abs/2207.12598).
|
503 |
+
`guidance_scale` is defined as `w` of equation 2. of [Imagen
|
504 |
+
Paper](https://arxiv.org/pdf/2205.11487.pdf). Guidance scale is enabled by setting `guidance_scale >
|
505 |
+
1`. Higher guidance scale encourages to generate images that are closely linked to the text `prompt`,
|
506 |
+
usually at the expense of lower image quality.
|
507 |
+
negative_prompt (`str` or `List[str]`, *optional*):
|
508 |
+
The prompt or prompts not to guide the image generation. If not defined, one has to pass
|
509 |
+
`negative_prompt_embeds` instead. Ignored when not using guidance (i.e., ignored if `guidance_scale` is
|
510 |
+
less than `1`).
|
511 |
+
num_images_per_prompt (`int`, *optional*, defaults to 1):
|
512 |
+
The number of images to generate per prompt.
|
513 |
+
eta (`float`, *optional*, defaults to 0.0):
|
514 |
+
Corresponds to parameter eta (η) in the DDIM paper: https://arxiv.org/abs/2010.02502. Only applies to
|
515 |
+
[`schedulers.DDIMScheduler`], will be ignored for others.
|
516 |
+
generator (`torch.Generator` or `List[torch.Generator]`, *optional*):
|
517 |
+
One or a list of [torch generator(s)](https://pytorch.org/docs/stable/generated/torch.Generator.html)
|
518 |
+
to make generation deterministic.
|
519 |
+
latents (`torch.FloatTensor`, *optional*):
|
520 |
+
Pre-generated noisy latents, sampled from a Gaussian distribution, to be used as inputs for image
|
521 |
+
generation. Can be used to tweak the same generation with different prompts. If not provided, a latents
|
522 |
+
tensor will ge generated by sampling using the supplied random `generator`.
|
523 |
+
prompt_embeds (`torch.FloatTensor`, *optional*):
|
524 |
+
Pre-generated text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt weighting. If not
|
525 |
+
provided, text embeddings will be generated from `prompt` input argument.
|
526 |
+
negative_prompt_embeds (`torch.FloatTensor`, *optional*):
|
527 |
+
Pre-generated negative text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt
|
528 |
+
weighting. If not provided, negative_prompt_embeds will be generated from `negative_prompt` input
|
529 |
+
argument.
|
530 |
+
|
531 |
+
"""
|
532 |
+
# 1. Check inputs. Raise error if not correct
|
533 |
+
self.check_inputs(
|
534 |
+
prompt, height, width, negative_prompt, prompt_embeds, negative_prompt_embeds
|
535 |
+
)
|
536 |
+
|
537 |
+
# 2. Define call parameters
|
538 |
+
if prompt is not None and isinstance(prompt, str):
|
539 |
+
batch_size = 1
|
540 |
+
elif prompt is not None and isinstance(prompt, list):
|
541 |
+
batch_size = len(prompt)
|
542 |
+
else:
|
543 |
+
batch_size = prompt_embeds.shape[0]
|
544 |
+
|
545 |
+
device = self.device
|
546 |
+
|
547 |
+
# here `guidance_scale` is defined analog to the guidance weight `w` of equation (2)
|
548 |
+
# of the Imagen paper: https://arxiv.org/pdf/2205.11487.pdf . `guidance_scale = 1`
|
549 |
+
# corresponds to doing no classifier free guidance.
|
550 |
+
do_classifier_free_guidance = guidance_scale > 1.0
|
551 |
+
|
552 |
+
|
553 |
+
# 3. Encode input prompt
|
554 |
+
start = time.perf_counter()
|
555 |
+
prompt_embeds = self._encode_prompt(
|
556 |
+
prompt,
|
557 |
+
device,
|
558 |
+
num_images_per_prompt,
|
559 |
+
do_classifier_free_guidance,
|
560 |
+
negative_prompt,
|
561 |
+
prompt_embeds=prompt_embeds,
|
562 |
+
negative_prompt_embeds=negative_prompt_embeds,
|
563 |
+
)
|
564 |
+
control_images = []
|
565 |
+
|
566 |
+
# 4 prepare controlnet images
|
567 |
+
for image_ in controlnet_images:
|
568 |
+
image_ = self.prepare_image(
|
569 |
+
image=image_,
|
570 |
+
width=width,
|
571 |
+
height=height,
|
572 |
+
batch_size=batch_size * num_images_per_prompt,
|
573 |
+
num_images_per_prompt=num_images_per_prompt,
|
574 |
+
device=device,
|
575 |
+
dtype=prompt_embeds.dtype,
|
576 |
+
do_classifier_free_guidance=do_classifier_free_guidance
|
577 |
+
)
|
578 |
+
|
579 |
+
control_images.append(image_)
|
580 |
+
|
581 |
+
control_scales = []
|
582 |
+
|
583 |
+
scales = [1.0, ] * 13
|
584 |
+
if guess_mode:
|
585 |
+
scales = torch.logspace(-1, 0, 13).tolist()
|
586 |
+
|
587 |
+
for scale in controlnet_scale:
|
588 |
+
scales_ = [d * scale for d in scales]
|
589 |
+
control_scales.append(scales_)
|
590 |
+
|
591 |
+
image = preprocess(image)
|
592 |
+
|
593 |
+
# 5. set timesteps
|
594 |
+
self.scheduler.set_timesteps(num_inference_steps, device=device)
|
595 |
+
timesteps, num_inference_steps = self.get_timesteps(num_inference_steps, strength, device)
|
596 |
+
latent_timestep = timesteps[:1].repeat(batch_size * num_images_per_prompt)
|
597 |
+
|
598 |
+
# 6. Prepare latent variables
|
599 |
+
latents = self.prepare_latents(
|
600 |
+
image, latent_timestep, batch_size, num_images_per_prompt, prompt_embeds.dtype, device, generator
|
601 |
+
)
|
602 |
+
|
603 |
+
# 7. Prepare extra step kwargs. TODO: Logic should ideally just be moved out of the pipeline
|
604 |
+
extra_step_kwargs = self.prepare_extra_step_kwargs(generator, eta)
|
605 |
+
|
606 |
+
# 8. Denoising loop
|
607 |
+
num_warmup_steps = len(timesteps) - num_inference_steps * self.scheduler.order
|
608 |
+
|
609 |
+
start_unet = time.perf_counter()
|
610 |
+
for i, t in enumerate(timesteps):
|
611 |
+
# expand the latents if we are doing classifier free guidance
|
612 |
+
latent_model_input = torch.cat([latents] * 2) if do_classifier_free_guidance else latents
|
613 |
+
latent_model_input = self.scheduler.scale_model_input(latent_model_input, t)
|
614 |
+
latent_model_input = latent_model_input.permute(0, 2, 3, 1).contiguous()
|
615 |
+
|
616 |
+
# 后边三个 None 是给到controlnet 的参数,暂时给到 None 当 placeholder
|
617 |
+
noise_pred = self.unet.forward(latent_model_input, prompt_embeds, t, controlnet_names, control_images, control_scales, guess_mode)
|
618 |
+
|
619 |
+
noise_pred = noise_pred.permute(0, 3, 1, 2)
|
620 |
+
# perform guidance
|
621 |
+
|
622 |
+
if do_classifier_free_guidance:
|
623 |
+
noise_pred_uncond, noise_pred_text = noise_pred.chunk(2)
|
624 |
+
noise_pred = noise_pred_uncond + guidance_scale * (noise_pred_text - noise_pred_uncond)
|
625 |
+
|
626 |
+
# compute the previous noisy sample x_t -> x_t-1
|
627 |
+
latents = self.scheduler.step(noise_pred, t, latents, **extra_step_kwargs).prev_sample
|
628 |
+
|
629 |
+
torch.cuda.synchronize()
|
630 |
+
|
631 |
+
start = time.perf_counter()
|
632 |
+
image = self.decode_latents(latents)
|
633 |
+
torch.cuda.synchronize()
|
634 |
+
image = numpy_to_pil(image)
|
635 |
+
|
636 |
+
return image
|
637 |
+
# return None
|
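The `control_scales` computation in the loop above follows the usual SD-1.x ControlNet layout of 13 per-block conditioning scales (12 down-block residuals plus the mid-block residual): a flat 1.0 in normal mode, or a logarithmic ramp from 0.1 to 1.0 in guess mode, each multiplied by the per-ControlNet `controlnet_scale` entry. A standalone sketch of that computation, with hypothetical inputs, for reference:

import torch

def make_control_scales(controlnet_scale, guess_mode=False):
    # 13 scales per ControlNet: 12 down-block residuals + 1 mid-block residual
    base = torch.logspace(-1, 0, 13).tolist() if guess_mode else [1.0] * 13
    return [[b * s for b in base] for s in controlnet_scale]

print(make_control_scales([0.5]))                    # thirteen 0.5 entries
print(make_control_scales([0.8], guess_mode=True))   # 0.08 ... 0.8 ramp
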
lyrasd_model/lyrasd_controlnet_txt2img_pipeline.py
ADDED
@@ -0,0 +1,547 @@
|
1 |
+
import torch
|
2 |
+
from typing import Any, Callable, Dict, List, Optional, Union
|
3 |
+
from diffusers.schedulers import KarrasDiffusionSchedulers
|
4 |
+
from diffusers.loaders import TextualInversionLoaderMixin
|
5 |
+
from diffusers.models import AutoencoderKL
|
6 |
+
from diffusers.utils import randn_tensor, logging
|
7 |
+
from diffusers.schedulers import EulerAncestralDiscreteScheduler, EulerDiscreteScheduler, DPMSolverMultistepScheduler
|
8 |
+
from diffusers.utils import PIL_INTERPOLATION
|
9 |
+
from transformers import CLIPImageProcessor, CLIPTextModel, CLIPTokenizer
|
10 |
+
import os
|
11 |
+
import numpy as np
|
12 |
+
from .lora_util import add_text_lora_layer
|
13 |
+
import gc
|
14 |
+
from PIL import Image
|
15 |
+
import PIL
|
16 |
+
|
17 |
+
import inspect
|
18 |
+
|
19 |
+
import time
|
20 |
+
|
21 |
+
logger = logging.get_logger(__name__) # pylint: disable=invalid-name
|
22 |
+
|
23 |
+
def numpy_to_pil(images):
|
24 |
+
"""
|
25 |
+
Convert a numpy image or a batch of images to a PIL image.
|
26 |
+
"""
|
27 |
+
if images.ndim == 3:
|
28 |
+
images = images[None, ...]
|
29 |
+
images = (images * 255).round().astype("uint8")
|
30 |
+
if images.shape[-1] == 1:
|
31 |
+
# special case for grayscale (single channel) images
|
32 |
+
pil_images = [Image.fromarray(image.squeeze(), mode="L") for image in images]
|
33 |
+
else:
|
34 |
+
pil_images = [Image.fromarray(image) for image in images]
|
35 |
+
|
36 |
+
return pil_images
|
37 |
+
|
38 |
+
|
39 |
+
class LyraSdControlnetTxt2ImgPipeline(TextualInversionLoaderMixin):
|
40 |
+
def __init__(self, model_path, lib_so_path, model_dtype='fp32', device=torch.device("cuda"), dtype=torch.float16) -> None:
|
41 |
+
self.device = device
|
42 |
+
self.dtype = dtype
|
43 |
+
|
44 |
+
torch.classes.load_library(lib_so_path)
|
45 |
+
|
46 |
+
self.vae = AutoencoderKL.from_pretrained(model_path, subfolder="vae").to(dtype).to(device)
|
47 |
+
self.tokenizer = CLIPTokenizer.from_pretrained(model_path, subfolder="tokenizer")
|
48 |
+
self.text_encoder = CLIPTextModel.from_pretrained(model_path, subfolder="text_encoder").to(dtype).to(device)
|
49 |
+
self.unet_in_channels = 4
|
50 |
+
self.vae_scale_factor = 2 ** (len(self.vae.config.block_out_channels) - 1)
|
51 |
+
self.vae.enable_tiling()
|
52 |
+
self.unet = torch.classes.lyrasd.Unet2dConditionalModelOp(
|
53 |
+
3, # max num of controlnets
|
54 |
+
"fp16" # inference dtype (can only use fp16 for now)
|
55 |
+
)
|
56 |
+
|
57 |
+
unet_path = os.path.join(model_path, "unet_bins/")
|
58 |
+
self.reload_unet_model(unet_path, model_dtype)
|
59 |
+
|
60 |
+
self.scheduler = EulerAncestralDiscreteScheduler.from_pretrained(model_path, subfolder="scheduler")
|
61 |
+
|
62 |
+
def load_controlnet_model(self, model_name, controlnet_path, model_dtype="fp32"):
|
63 |
+
if len(controlnet_path) > 0 and controlnet_path[-1] != "/":
|
64 |
+
controlnet_path = controlnet_path + "/"
|
65 |
+
self.unet.load_controlnet_model(model_name, controlnet_path, model_dtype)
|
66 |
+
|
67 |
+
def unload_controlnet_model(self, model_name):
|
68 |
+
self.unet.unload_controlnet_model(model_name, True)
|
69 |
+
|
70 |
+
def get_loaded_controlnet(self):
|
71 |
+
return self.unet.get_loaded_controlnet()
|
72 |
+
|
73 |
+
def reload_unet_model(self, unet_path, unet_file_format='fp32'):
|
74 |
+
if len(unet_path) > 0 and unet_path[-1] != "/":
|
75 |
+
unet_path = unet_path + "/"
|
76 |
+
return self.unet.reload_unet_model(unet_path, unet_file_format)
|
77 |
+
|
78 |
+
def load_lora(self, lora_model_path, lora_name, lora_strength, lora_file_format='fp32'):
|
79 |
+
if len(lora_model_path) > 0 and lora_model_path[-1] != "/":
|
80 |
+
lora_model_path = lora_model_path + "/"
|
81 |
+
lora = add_text_lora_layer(self.text_encoder, lora_model_path, lora_strength, lora_file_format)
|
82 |
+
self.loaded_lora[lora_name] = lora
|
83 |
+
self.unet.load_lora(lora_model_path, lora_name, lora_strength, lora_file_format)
|
84 |
+
|
85 |
+
def unload_lora(self, lora_name, clean_cache=False):
|
86 |
+
for layer_data in self.loaded_lora[lora_name]:
|
87 |
+
layer = layer_data['layer']
|
88 |
+
added_weight = layer_data['added_weight']
|
89 |
+
layer.weight.data -= added_weight
|
90 |
+
self.unet.unload_lora(lora_name, clean_cache)
|
91 |
+
del self.loaded_lora[lora_name]
|
92 |
+
gc.collect()
|
93 |
+
torch.cuda.empty_cache()
|
94 |
+
|
95 |
+
def clean_lora_cache(self):
|
96 |
+
self.unet.clean_lora_cache()
|
97 |
+
|
98 |
+
def get_loaded_lora(self):
|
99 |
+
return self.unet.get_loaded_lora()
|
100 |
+
|
101 |
+
def _encode_prompt(
|
102 |
+
self,
|
103 |
+
prompt,
|
104 |
+
device,
|
105 |
+
num_images_per_prompt,
|
106 |
+
do_classifier_free_guidance,
|
107 |
+
negative_prompt=None,
|
108 |
+
prompt_embeds: Optional[torch.FloatTensor] = None,
|
109 |
+
negative_prompt_embeds: Optional[torch.FloatTensor] = None,
|
110 |
+
):
|
111 |
+
r"""
|
112 |
+
Encodes the prompt into text encoder hidden states.
|
113 |
+
|
114 |
+
Args:
|
115 |
+
prompt (`str` or `List[str]`, *optional*):
|
116 |
+
prompt to be encoded
|
117 |
+
device: (`torch.device`):
|
118 |
+
torch device
|
119 |
+
num_images_per_prompt (`int`):
|
120 |
+
number of images that should be generated per prompt
|
121 |
+
do_classifier_free_guidance (`bool`):
|
122 |
+
whether to use classifier free guidance or not
|
123 |
+
negative_prompt (`str` or `List[str]`, *optional*):
|
124 |
+
The prompt or prompts not to guide the image generation. If not defined, one has to pass
|
125 |
+
`negative_prompt_embeds` instead. Ignored when not using guidance (i.e., ignored if `guidance_scale` is
|
126 |
+
less than `1`).
|
127 |
+
prompt_embeds (`torch.FloatTensor`, *optional*):
|
128 |
+
Pre-generated text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt weighting. If not
|
129 |
+
provided, text embeddings will be generated from `prompt` input argument.
|
130 |
+
negative_prompt_embeds (`torch.FloatTensor`, *optional*):
|
131 |
+
Pre-generated negative text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt
|
132 |
+
weighting. If not provided, negative_prompt_embeds will be generated from `negative_prompt` input
|
133 |
+
argument.
|
134 |
+
"""
|
135 |
+
if prompt is not None and isinstance(prompt, str):
|
136 |
+
batch_size = 1
|
137 |
+
elif prompt is not None and isinstance(prompt, list):
|
138 |
+
batch_size = len(prompt)
|
139 |
+
else:
|
140 |
+
batch_size = prompt_embeds.shape[0]
|
141 |
+
|
142 |
+
if prompt_embeds is None:
|
143 |
+
# textual inversion: process multi-vector tokens if necessary
|
144 |
+
if isinstance(self, TextualInversionLoaderMixin):
|
145 |
+
prompt = self.maybe_convert_prompt(prompt, self.tokenizer)
|
146 |
+
|
147 |
+
text_inputs = self.tokenizer(
|
148 |
+
prompt,
|
149 |
+
padding="max_length",
|
150 |
+
max_length=self.tokenizer.model_max_length,
|
151 |
+
truncation=True,
|
152 |
+
return_tensors="pt",
|
153 |
+
)
|
154 |
+
text_input_ids = text_inputs.input_ids
|
155 |
+
untruncated_ids = self.tokenizer(prompt, padding="longest", return_tensors="pt").input_ids
|
156 |
+
|
157 |
+
if untruncated_ids.shape[-1] >= text_input_ids.shape[-1] and not torch.equal(
|
158 |
+
text_input_ids, untruncated_ids
|
159 |
+
):
|
160 |
+
removed_text = self.tokenizer.batch_decode(
|
161 |
+
untruncated_ids[:, self.tokenizer.model_max_length - 1 : -1]
|
162 |
+
)
|
163 |
+
logger.warning(
|
164 |
+
"The following part of your input was truncated because CLIP can only handle sequences up to"
|
165 |
+
f" {self.tokenizer.model_max_length} tokens: {removed_text}"
|
166 |
+
)
|
167 |
+
|
168 |
+
if hasattr(self.text_encoder.config, "use_attention_mask") and self.text_encoder.config.use_attention_mask:
|
169 |
+
attention_mask = text_inputs.attention_mask.to(device)
|
170 |
+
else:
|
171 |
+
attention_mask = None
|
172 |
+
|
173 |
+
prompt_embeds = self.text_encoder(
|
174 |
+
text_input_ids.to(device),
|
175 |
+
attention_mask=attention_mask,
|
176 |
+
)
|
177 |
+
prompt_embeds = prompt_embeds[0]
|
178 |
+
|
179 |
+
prompt_embeds = prompt_embeds.to(dtype=self.text_encoder.dtype, device=device)
|
180 |
+
|
181 |
+
bs_embed, seq_len, _ = prompt_embeds.shape
|
182 |
+
# duplicate text embeddings for each generation per prompt, using mps friendly method
|
183 |
+
prompt_embeds = prompt_embeds.repeat(1, num_images_per_prompt, 1)
|
184 |
+
prompt_embeds = prompt_embeds.view(bs_embed * num_images_per_prompt, seq_len, -1)
|
185 |
+
|
186 |
+
# get unconditional embeddings for classifier free guidance
|
187 |
+
if do_classifier_free_guidance and negative_prompt_embeds is None:
|
188 |
+
uncond_tokens: List[str]
|
189 |
+
if negative_prompt is None:
|
190 |
+
uncond_tokens = [""] * batch_size
|
191 |
+
elif type(prompt) is not type(negative_prompt):
|
192 |
+
raise TypeError(
|
193 |
+
f"`negative_prompt` should be the same type to `prompt`, but got {type(negative_prompt)} !="
|
194 |
+
f" {type(prompt)}."
|
195 |
+
)
|
196 |
+
elif isinstance(negative_prompt, str):
|
197 |
+
uncond_tokens = [negative_prompt]
|
198 |
+
elif batch_size != len(negative_prompt):
|
199 |
+
raise ValueError(
|
200 |
+
f"`negative_prompt`: {negative_prompt} has batch size {len(negative_prompt)}, but `prompt`:"
|
201 |
+
f" {prompt} has batch size {batch_size}. Please make sure that passed `negative_prompt` matches"
|
202 |
+
" the batch size of `prompt`."
|
203 |
+
)
|
204 |
+
else:
|
205 |
+
uncond_tokens = negative_prompt
|
206 |
+
|
207 |
+
# textual inversion: process multi-vector tokens if necessary
|
208 |
+
if isinstance(self, TextualInversionLoaderMixin):
|
209 |
+
uncond_tokens = self.maybe_convert_prompt(uncond_tokens, self.tokenizer)
|
210 |
+
|
211 |
+
max_length = prompt_embeds.shape[1]
|
212 |
+
uncond_input = self.tokenizer(
|
213 |
+
uncond_tokens,
|
214 |
+
padding="max_length",
|
215 |
+
max_length=max_length,
|
216 |
+
truncation=True,
|
217 |
+
return_tensors="pt",
|
218 |
+
)
|
219 |
+
|
220 |
+
if hasattr(self.text_encoder.config, "use_attention_mask") and self.text_encoder.config.use_attention_mask:
|
221 |
+
attention_mask = uncond_input.attention_mask.to(device)
|
222 |
+
else:
|
223 |
+
attention_mask = None
|
224 |
+
|
225 |
+
negative_prompt_embeds = self.text_encoder(
|
226 |
+
uncond_input.input_ids.to(device),
|
227 |
+
attention_mask=attention_mask,
|
228 |
+
)
|
229 |
+
negative_prompt_embeds = negative_prompt_embeds[0]
|
230 |
+
|
231 |
+
if do_classifier_free_guidance:
|
232 |
+
# duplicate unconditional embeddings for each generation per prompt, using mps friendly method
|
233 |
+
seq_len = negative_prompt_embeds.shape[1]
|
234 |
+
|
235 |
+
negative_prompt_embeds = negative_prompt_embeds.to(dtype=self.text_encoder.dtype, device=device)
|
236 |
+
|
237 |
+
negative_prompt_embeds = negative_prompt_embeds.repeat(1, num_images_per_prompt, 1)
|
238 |
+
negative_prompt_embeds = negative_prompt_embeds.view(batch_size * num_images_per_prompt, seq_len, -1)
|
239 |
+
|
240 |
+
# For classifier free guidance, we need to do two forward passes.
|
241 |
+
# Here we concatenate the unconditional and text embeddings into a single batch
|
242 |
+
# to avoid doing two forward passes
|
243 |
+
prompt_embeds = torch.cat([negative_prompt_embeds, prompt_embeds])
|
244 |
+
|
245 |
+
return prompt_embeds
|
246 |
+
|
247 |
+
|
248 |
+
def decode_latents(self, latents):
|
249 |
+
latents = 1 / self.vae.config.scaling_factor * latents
|
250 |
+
image = self.vae.decode(latents).sample
|
251 |
+
image = (image / 2 + 0.5).clamp(0, 1)
|
252 |
+
# we always cast to float32 as this does not cause significant overhead and is compatible with bfloat16
|
253 |
+
image = image.cpu().permute(0, 2, 3, 1).float().numpy()
|
254 |
+
return image
|
255 |
+
|
256 |
+
def check_inputs(
|
257 |
+
self,
|
258 |
+
prompt,
|
259 |
+
height,
|
260 |
+
width,
|
261 |
+
negative_prompt=None,
|
262 |
+
prompt_embeds=None,
|
263 |
+
negative_prompt_embeds=None,
|
264 |
+
):
|
265 |
+
if height % 64 != 0 or width % 64 != 0:  # the initial version only supports height and width that are multiples of 64
|
266 |
+
raise ValueError(f"`height` and `width` have to be divisible by 64 but are {height} and {width}.")
|
267 |
+
|
268 |
+
if prompt is not None and prompt_embeds is not None:
|
269 |
+
raise ValueError(
|
270 |
+
f"Cannot forward both `prompt`: {prompt} and `prompt_embeds`: {prompt_embeds}. Please make sure to"
|
271 |
+
" only forward one of the two."
|
272 |
+
)
|
273 |
+
elif prompt is None and prompt_embeds is None:
|
274 |
+
raise ValueError(
|
275 |
+
"Provide either `prompt` or `prompt_embeds`. Cannot leave both `prompt` and `prompt_embeds` undefined."
|
276 |
+
)
|
277 |
+
elif prompt is not None and (not isinstance(prompt, str) and not isinstance(prompt, list)):
|
278 |
+
raise ValueError(f"`prompt` has to be of type `str` or `list` but is {type(prompt)}")
|
279 |
+
|
280 |
+
if negative_prompt is not None and negative_prompt_embeds is not None:
|
281 |
+
raise ValueError(
|
282 |
+
f"Cannot forward both `negative_prompt`: {negative_prompt} and `negative_prompt_embeds`:"
|
283 |
+
f" {negative_prompt_embeds}. Please make sure to only forward one of the two."
|
284 |
+
)
|
285 |
+
|
286 |
+
if prompt_embeds is not None and negative_prompt_embeds is not None:
|
287 |
+
if prompt_embeds.shape != negative_prompt_embeds.shape:
|
288 |
+
raise ValueError(
|
289 |
+
"`prompt_embeds` and `negative_prompt_embeds` must have the same shape when passed directly, but"
|
290 |
+
f" got: `prompt_embeds` {prompt_embeds.shape} != `negative_prompt_embeds`"
|
291 |
+
f" {negative_prompt_embeds.shape}."
|
292 |
+
)
|
293 |
+
|
294 |
+
def prepare_latents(self, batch_size, num_channels_latents, height, width, dtype, device, generator, latents=None):
|
295 |
+
shape = (batch_size, num_channels_latents, height // self.vae_scale_factor, width // self.vae_scale_factor)
|
296 |
+
if isinstance(generator, list) and len(generator) != batch_size:
|
297 |
+
raise ValueError(
|
298 |
+
f"You have passed a list of generators of length {len(generator)}, but requested an effective batch"
|
299 |
+
f" size of {batch_size}. Make sure the batch size matches the length of the generators."
|
300 |
+
)
|
301 |
+
|
302 |
+
if latents is None:
|
303 |
+
latents = randn_tensor(shape, generator=generator, device=device, dtype=dtype)
|
304 |
+
else:
|
305 |
+
latents = latents.to(device)
|
306 |
+
|
307 |
+
# scale the initial noise by the standard deviation required by the scheduler
|
308 |
+
latents = latents * self.scheduler.init_noise_sigma
|
309 |
+
return latents
|
310 |
+
|
311 |
+
def prepare_image(
|
312 |
+
self,
|
313 |
+
image,
|
314 |
+
width,
|
315 |
+
height,
|
316 |
+
batch_size,
|
317 |
+
num_images_per_prompt,
|
318 |
+
device,
|
319 |
+
dtype,
|
320 |
+
do_classifier_free_guidance=False,
|
321 |
+
guess_mode=False,
|
322 |
+
):
|
323 |
+
if not isinstance(image, torch.Tensor):
|
324 |
+
if isinstance(image, PIL.Image.Image):
|
325 |
+
image = [image]
|
326 |
+
|
327 |
+
if isinstance(image[0], PIL.Image.Image):
|
328 |
+
images = []
|
329 |
+
|
330 |
+
for image_ in image:
|
331 |
+
image_ = image_.convert("RGB")
|
332 |
+
image_ = image_.resize((width, height), resample=PIL_INTERPOLATION["lanczos"])
|
333 |
+
image_ = np.array(image_)
|
334 |
+
image_ = image_[None, :]
|
335 |
+
images.append(image_)
|
336 |
+
|
337 |
+
image = images
|
338 |
+
|
339 |
+
image = np.concatenate(image, axis=0)
|
340 |
+
image = np.array(image).astype(np.float32) / 255.0
|
341 |
+
image = torch.from_numpy(image)
|
342 |
+
elif isinstance(image[0], torch.Tensor):
|
343 |
+
image = torch.cat(image, dim=0)
|
344 |
+
|
345 |
+
image_batch_size = image.shape[0]
|
346 |
+
|
347 |
+
if image_batch_size == 1:
|
348 |
+
repeat_by = batch_size
|
349 |
+
else:
|
350 |
+
# image batch size is the same as prompt batch size
|
351 |
+
repeat_by = num_images_per_prompt
|
352 |
+
|
353 |
+
image = image.repeat_interleave(repeat_by, dim=0)
|
354 |
+
|
355 |
+
image = image.to(device=device, dtype=dtype)
|
356 |
+
|
357 |
+
if do_classifier_free_guidance and not guess_mode:
|
358 |
+
image = torch.cat([image] * 2)
|
359 |
+
|
360 |
+
return image
|
361 |
+
|
362 |
+
def prepare_extra_step_kwargs(self, generator, eta):
|
363 |
+
# prepare extra kwargs for the scheduler step, since not all schedulers have the same signature
|
364 |
+
# eta (η) is only used with the DDIMScheduler, it will be ignored for other schedulers.
|
365 |
+
# eta corresponds to η in DDIM paper: https://arxiv.org/abs/2010.02502
|
366 |
+
# and should be between [0, 1]
|
367 |
+
|
368 |
+
accepts_eta = "eta" in set(inspect.signature(self.scheduler.step).parameters.keys())
|
369 |
+
extra_step_kwargs = {}
|
370 |
+
if accepts_eta:
|
371 |
+
extra_step_kwargs["eta"] = eta
|
372 |
+
|
373 |
+
# check if the scheduler accepts generator
|
374 |
+
accepts_generator = "generator" in set(inspect.signature(self.scheduler.step).parameters.keys())
|
375 |
+
if accepts_generator:
|
376 |
+
extra_step_kwargs["generator"] = generator
|
377 |
+
return extra_step_kwargs
|
378 |
+
|
379 |
+
@torch.no_grad()
|
380 |
+
def __call__(
|
381 |
+
self,
|
382 |
+
prompt: Union[str, List[str]] = None,
|
383 |
+
height: Optional[int] = None,
|
384 |
+
width: Optional[int] = None,
|
385 |
+
num_inference_steps: int = 50,
|
386 |
+
guidance_scale: float = 7.5,
|
387 |
+
negative_prompt: Optional[Union[str, List[str]]] = None,
|
388 |
+
num_images_per_prompt: Optional[int] = 1,
|
389 |
+
controlnet_images: Optional[List[PIL.Image.Image]] = None,
|
390 |
+
controlnet_scale: Optional[List[float]] = None,
|
391 |
+
controlnet_names: Optional[List[str]] = None,
|
392 |
+
guess_mode = False,
|
393 |
+
eta: float = 0.0,
|
394 |
+
generator: Optional[Union[torch.Generator, List[torch.Generator]]] = None,
|
395 |
+
latents: Optional[torch.FloatTensor] = None,
|
396 |
+
prompt_embeds: Optional[torch.FloatTensor] = None,
|
397 |
+
negative_prompt_embeds: Optional[torch.FloatTensor] = None,
|
398 |
+
):
|
399 |
+
r"""
|
400 |
+
Function invoked when calling the pipeline for generation.
|
401 |
+
|
402 |
+
Args:
|
403 |
+
prompt (`str` or `List[str]`, *optional*):
|
404 |
+
The prompt or prompts to guide the image generation. If not defined, one has to pass `prompt_embeds`
|
405 |
+
instead.
|
406 |
+
height (`int`, *optional*, defaults to self.unet.config.sample_size * self.vae_scale_factor):
|
407 |
+
The height in pixels of the generated image.
|
408 |
+
width (`int`, *optional*, defaults to self.unet.config.sample_size * self.vae_scale_factor):
|
409 |
+
The width in pixels of the generated image.
|
410 |
+
num_inference_steps (`int`, *optional*, defaults to 50):
|
411 |
+
The number of denoising steps. More denoising steps usually lead to a higher quality image at the
|
412 |
+
expense of slower inference.
|
413 |
+
guidance_scale (`float`, *optional*, defaults to 7.5):
|
414 |
+
Guidance scale as defined in [Classifier-Free Diffusion Guidance](https://arxiv.org/abs/2207.12598).
|
415 |
+
`guidance_scale` is defined as `w` of equation 2. of [Imagen
|
416 |
+
Paper](https://arxiv.org/pdf/2205.11487.pdf). Guidance scale is enabled by setting `guidance_scale >
|
417 |
+
1`. A higher guidance scale encourages the model to generate images that are closely linked to the text `prompt`,
|
418 |
+
usually at the expense of lower image quality.
|
419 |
+
negative_prompt (`str` or `List[str]`, *optional*):
|
420 |
+
The prompt or prompts not to guide the image generation. If not defined, one has to pass
|
421 |
+
`negative_prompt_embeds` instead. Ignored when not using guidance (i.e., ignored if `guidance_scale` is
|
422 |
+
less than `1`).
|
423 |
+
num_images_per_prompt (`int`, *optional*, defaults to 1):
|
424 |
+
The number of images to generate per prompt.
|
425 |
+
eta (`float`, *optional*, defaults to 0.0):
|
426 |
+
Corresponds to parameter eta (η) in the DDIM paper: https://arxiv.org/abs/2010.02502. Only applies to
|
427 |
+
[`schedulers.DDIMScheduler`], will be ignored for others.
|
428 |
+
generator (`torch.Generator` or `List[torch.Generator]`, *optional*):
|
429 |
+
One or a list of [torch generator(s)](https://pytorch.org/docs/stable/generated/torch.Generator.html)
|
430 |
+
to make generation deterministic.
|
431 |
+
latents (`torch.FloatTensor`, *optional*):
|
432 |
+
Pre-generated noisy latents, sampled from a Gaussian distribution, to be used as inputs for image
|
433 |
+
generation. Can be used to tweak the same generation with different prompts. If not provided, a latents
|
434 |
+
tensor will be generated by sampling using the supplied random `generator`.
|
435 |
+
prompt_embeds (`torch.FloatTensor`, *optional*):
|
436 |
+
Pre-generated text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt weighting. If not
|
437 |
+
provided, text embeddings will be generated from `prompt` input argument.
|
438 |
+
negative_prompt_embeds (`torch.FloatTensor`, *optional*):
|
439 |
+
Pre-generated negative text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt
|
440 |
+
weighting. If not provided, negative_prompt_embeds will be generated from `negative_prompt` input
|
441 |
+
argument.
|
442 |
+
|
443 |
+
"""
|
444 |
+
# 1. Check inputs. Raise error if not correct
|
445 |
+
self.check_inputs(
|
446 |
+
prompt, height, width, negative_prompt, prompt_embeds, negative_prompt_embeds
|
447 |
+
)
|
448 |
+
|
449 |
+
# 2. Define call parameters
|
450 |
+
if prompt is not None and isinstance(prompt, str):
|
451 |
+
batch_size = 1
|
452 |
+
elif prompt is not None and isinstance(prompt, list):
|
453 |
+
batch_size = len(prompt)
|
454 |
+
else:
|
455 |
+
batch_size = prompt_embeds.shape[0]
|
456 |
+
|
457 |
+
device = self.device
|
458 |
+
|
459 |
+
# here `guidance_scale` is defined analog to the guidance weight `w` of equation (2)
|
460 |
+
# of the Imagen paper: https://arxiv.org/pdf/2205.11487.pdf . `guidance_scale = 1`
|
461 |
+
# corresponds to doing no classifier free guidance.
|
462 |
+
do_classifier_free_guidance = guidance_scale > 1.0
|
463 |
+
|
464 |
+
# 3. Encode input prompt
|
465 |
+
prompt_embeds = self._encode_prompt(
|
466 |
+
prompt,
|
467 |
+
device,
|
468 |
+
num_images_per_prompt,
|
469 |
+
do_classifier_free_guidance,
|
470 |
+
negative_prompt,
|
471 |
+
prompt_embeds=prompt_embeds,
|
472 |
+
negative_prompt_embeds=negative_prompt_embeds,
|
473 |
+
)
|
474 |
+
control_images = []
|
475 |
+
|
476 |
+
for image_ in controlnet_images:
|
477 |
+
image_ = self.prepare_image(
|
478 |
+
image=image_,
|
479 |
+
width=width,
|
480 |
+
height=height,
|
481 |
+
batch_size=batch_size * num_images_per_prompt,
|
482 |
+
num_images_per_prompt=num_images_per_prompt,
|
483 |
+
device=device,
|
484 |
+
dtype=prompt_embeds.dtype,
|
485 |
+
do_classifier_free_guidance=do_classifier_free_guidance
|
486 |
+
)
|
487 |
+
|
488 |
+
control_images.append(image_)
|
489 |
+
|
490 |
+
control_scales = []
|
491 |
+
|
492 |
+
scales = [1.0, ] * 13
|
493 |
+
if guess_mode:
|
494 |
+
scales = torch.logspace(-1, 0, 13).tolist()
|
495 |
+
|
496 |
+
for scale in controlnet_scale:
|
497 |
+
scales_ = [d * scale for d in scales]
|
498 |
+
control_scales.append(scales_)
|
499 |
+
|
500 |
+
# 4. Prepare timesteps
|
501 |
+
self.scheduler.set_timesteps(num_inference_steps, device=device)
|
502 |
+
timesteps = self.scheduler.timesteps
|
503 |
+
|
504 |
+
# 5. Prepare latent variables
|
505 |
+
num_channels_latents = self.unet_in_channels
|
506 |
+
latents = self.prepare_latents(
|
507 |
+
batch_size * num_images_per_prompt,
|
508 |
+
num_channels_latents,
|
509 |
+
height,
|
510 |
+
width,
|
511 |
+
prompt_embeds.dtype,
|
512 |
+
device,
|
513 |
+
generator,
|
514 |
+
latents,
|
515 |
+
)
|
516 |
+
|
517 |
+
# 6. Prepare extra step kwargs. TODO: Logic should ideally just be moved out of the pipeline
|
518 |
+
extra_step_kwargs = self.prepare_extra_step_kwargs(generator, eta)
|
519 |
+
|
520 |
+
# 7. Denoising loop
|
521 |
+
num_warmup_steps = len(timesteps) - num_inference_steps * self.scheduler.order
|
522 |
+
|
523 |
+
start_unet = time.perf_counter()
|
524 |
+
for i, t in enumerate(timesteps):
|
525 |
+
# expand the latents if we are doing classifier free guidance
|
526 |
+
latent_model_input = torch.cat([latents] * 2) if do_classifier_free_guidance else latents
|
527 |
+
latent_model_input = self.scheduler.scale_model_input(latent_model_input, t)
|
528 |
+
latent_model_input = latent_model_input.permute(0, 2, 3, 1).contiguous()
|
529 |
+
|
530 |
+
# the trailing arguments (controlnet_names, control_images, control_scales, guess_mode) are the ControlNet inputs
|
531 |
+
noise_pred = self.unet.forward(latent_model_input, prompt_embeds, t, controlnet_names, control_images, control_scales, guess_mode)
|
532 |
+
|
533 |
+
noise_pred = noise_pred.permute(0, 3, 1, 2)
|
534 |
+
# perform guidance
|
535 |
+
|
536 |
+
if do_classifier_free_guidance:
|
537 |
+
noise_pred_uncond, noise_pred_text = noise_pred.chunk(2)
|
538 |
+
noise_pred = noise_pred_uncond + guidance_scale * (noise_pred_text - noise_pred_uncond)
|
539 |
+
|
540 |
+
# compute the previous noisy sample x_t -> x_t-1
|
541 |
+
latents = self.scheduler.step(noise_pred, t, latents, **extra_step_kwargs).prev_sample
|
542 |
+
|
543 |
+
image = self.decode_latents(latents)
|
544 |
+
image = numpy_to_pil(image)
|
545 |
+
|
546 |
+
return image
|
547 |
+
# return None
|
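For orientation, a minimal usage sketch of the `LyraSdControlnetTxt2ImgPipeline` added above. All paths and filenames are placeholders, the import assumes the package layout implied by the file paths in this commit, and the model directory is assumed to contain the usual diffusers subfolders plus `unet_bins/` with converted fp16 weights:

from PIL import Image
from lyrasd_model.lyrasd_controlnet_txt2img_pipeline import LyraSdControlnetTxt2ImgPipeline

model_path = "./models/my-sd15-model"                                # placeholder
lib_so_path = "./lyrasd_model/lyrasd_lib/libth_lyrasd_cu11_sm80.so"  # pick the .so matching your CUDA / GPU arch
canny_dir = "./models/canny_bins"                                    # placeholder: converted ControlNet weights

pipe = LyraSdControlnetTxt2ImgPipeline(model_path, lib_so_path, model_dtype="fp16")
pipe.load_controlnet_model("canny", canny_dir, "fp16")

control = Image.open("./canny_map.png").convert("RGB")
images = pipe(
    prompt="a bird standing on a branch, best quality",
    height=512, width=512,        # must be passed explicitly and be multiples of 64
    num_inference_steps=30,
    guidance_scale=7.5,
    negative_prompt="blurry, low quality",
    controlnet_images=[control],
    controlnet_scale=[0.5],
    controlnet_names=["canny"],
)
images[0].save("./res_controlnet_txt2img.png")
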
lyrasd_model/lyrasd_img2img_pipeline.py
ADDED
@@ -0,0 +1,554 @@
|
1 |
+
import inspect
|
2 |
+
import logging
|
3 |
+
import os
|
4 |
+
import warnings
|
5 |
+
from typing import Callable, List, Optional, Union
|
6 |
+
|
7 |
+
import numpy as np
|
8 |
+
import PIL
|
9 |
+
import torch
|
10 |
+
from diffusers.loaders import TextualInversionLoaderMixin
|
11 |
+
from diffusers.models import AutoencoderKL
|
12 |
+
from diffusers.schedulers import EulerAncestralDiscreteScheduler
|
13 |
+
from diffusers.utils import PIL_INTERPOLATION, deprecate, logging, randn_tensor
|
14 |
+
from PIL import Image
|
15 |
+
from transformers import CLIPTextModel, CLIPTokenizer
|
16 |
+
from .lora_util import add_text_lora_layer
|
17 |
+
import gc
|
18 |
+
|
19 |
+
logger = logging.get_logger(__name__) # pylint: disable=invalid-name
|
20 |
+
|
21 |
+
|
22 |
+
def numpy_to_pil(images):
|
23 |
+
"""
|
24 |
+
Convert a numpy image or a batch of images to a PIL image.
|
25 |
+
"""
|
26 |
+
if images.ndim == 3:
|
27 |
+
images = images[None, ...]
|
28 |
+
images = (images * 255).round().astype("uint8")
|
29 |
+
if images.shape[-1] == 1:
|
30 |
+
# special case for grayscale (single channel) images
|
31 |
+
pil_images = [Image.fromarray(image.squeeze(), mode="L") for image in images]
|
32 |
+
else:
|
33 |
+
pil_images = [Image.fromarray(image) for image in images]
|
34 |
+
|
35 |
+
return pil_images
|
36 |
+
|
37 |
+
|
38 |
+
def preprocess(image):
|
39 |
+
warnings.warn(
|
40 |
+
"The preprocess method is deprecated and will be removed in a future version. Please"
|
41 |
+
" use VaeImageProcessor.preprocess instead",
|
42 |
+
FutureWarning,
|
43 |
+
)
|
44 |
+
if isinstance(image, torch.Tensor):
|
45 |
+
return image
|
46 |
+
elif isinstance(image, PIL.Image.Image):
|
47 |
+
image = [image]
|
48 |
+
|
49 |
+
if isinstance(image[0], PIL.Image.Image):
|
50 |
+
w, h = image[0].size
|
51 |
+
w, h = (x - x % 8 for x in (w, h)) # resize to integer multiple of 8
|
52 |
+
|
53 |
+
image = [np.array(i.resize((w, h), resample=PIL_INTERPOLATION["lanczos"]))[None, :] for i in image]
|
54 |
+
image = np.concatenate(image, axis=0)
|
55 |
+
image = np.array(image).astype(np.float32) / 255.0
|
56 |
+
image = image.transpose(0, 3, 1, 2)
|
57 |
+
image = 2.0 * image - 1.0
|
58 |
+
image = torch.from_numpy(image)
|
59 |
+
elif isinstance(image[0], torch.Tensor):
|
60 |
+
image = torch.cat(image, dim=0)
|
61 |
+
return image
|
62 |
+
|
63 |
+
|
64 |
+
class LyraSDImg2ImgPipeline(TextualInversionLoaderMixin):
|
65 |
+
def __init__(self, model_path, lib_so_path, model_dtype='fp32', device=torch.device("cuda"), dtype=torch.float16) -> None:
|
66 |
+
self.device = device
|
67 |
+
self.dtype = dtype
|
68 |
+
|
69 |
+
torch.classes.load_library(lib_so_path)
|
70 |
+
|
71 |
+
self.vae = AutoencoderKL.from_pretrained(model_path, subfolder="vae").to(dtype).to(device)
|
72 |
+
self.tokenizer = CLIPTokenizer.from_pretrained(model_path, subfolder="tokenizer")
|
73 |
+
self.text_encoder = CLIPTextModel.from_pretrained(model_path, subfolder="text_encoder").to(dtype).to(device)
|
74 |
+
unet_path = os.path.join(model_path, "unet_bins/")
|
75 |
+
|
76 |
+
self.unet_in_channels = 4
|
77 |
+
self.vae_scale_factor = 2 ** (len(self.vae.config.block_out_channels) - 1)
|
78 |
+
self.vae.enable_tiling()
|
79 |
+
self.unet = torch.classes.lyrasd.Unet2dConditionalModelOp(
|
80 |
+
3, # max num of controlnets
|
81 |
+
"fp16" # inference dtype (can only use fp16 for now)
|
82 |
+
)
|
83 |
+
|
84 |
+
self.reload_unet_model(unet_path, model_dtype)
|
85 |
+
|
86 |
+
self.loaded_lora = {}  # registry of text-encoder LoRA layers added by load_lora (needed by unload_lora)
self.scheduler = EulerAncestralDiscreteScheduler.from_pretrained(model_path, subfolder="scheduler")
|
87 |
+
|
88 |
+
def reload_unet_model(self, unet_path, unet_file_format='fp32'):
|
89 |
+
if len(unet_path) > 0 and unet_path[-1] != "/":
|
90 |
+
unet_path = unet_path + "/"
|
91 |
+
return self.unet.reload_unet_model(unet_path, unet_file_format)
|
92 |
+
|
93 |
+
def load_lora(self, lora_model_path, lora_name, lora_strength, lora_file_format='fp32'):
|
94 |
+
if len(lora_model_path) > 0 and lora_model_path[-1] != "/":
|
95 |
+
lora_model_path = lora_model_path + "/"
|
96 |
+
lora = add_text_lora_layer(self.text_encoder, lora_model_path, lora_strength, lora_file_format)
|
97 |
+
self.loaded_lora[lora_name] = lora
|
98 |
+
self.unet.load_lora(lora_model_path, lora_name, lora_strength, lora_file_format)
|
99 |
+
|
100 |
+
def unload_lora(self, lora_name, clean_cache=False):
|
101 |
+
for layer_data in self.loaded_lora[lora_name]:
|
102 |
+
layer = layer_data['layer']
|
103 |
+
added_weight = layer_data['added_weight']
|
104 |
+
layer.weight.data -= added_weight
|
105 |
+
self.unet.unload_lora(lora_name, clean_cache)
|
106 |
+
del self.loaded_lora[lora_name]
|
107 |
+
gc.collect()
|
108 |
+
torch.cuda.empty_cache()
|
109 |
+
|
110 |
+
def clean_lora_cache(self):
|
111 |
+
self.unet.clean_lora_cache()
|
112 |
+
|
113 |
+
def get_loaded_lora(self):
|
114 |
+
return self.unet.get_loaded_lora()
|
115 |
+
|
116 |
+
|
117 |
+
# Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline._encode_prompt
|
118 |
+
def _encode_prompt(
|
119 |
+
self,
|
120 |
+
prompt,
|
121 |
+
device,
|
122 |
+
num_images_per_prompt,
|
123 |
+
do_classifier_free_guidance,
|
124 |
+
negative_prompt=None,
|
125 |
+
prompt_embeds: Optional[torch.FloatTensor] = None,
|
126 |
+
negative_prompt_embeds: Optional[torch.FloatTensor] = None,
|
127 |
+
):
|
128 |
+
r"""
|
129 |
+
Encodes the prompt into text encoder hidden states.
|
130 |
+
|
131 |
+
Args:
|
132 |
+
prompt (`str` or `List[str]`, *optional*):
|
133 |
+
prompt to be encoded
|
134 |
+
device: (`torch.device`):
|
135 |
+
torch device
|
136 |
+
num_images_per_prompt (`int`):
|
137 |
+
number of images that should be generated per prompt
|
138 |
+
do_classifier_free_guidance (`bool`):
|
139 |
+
whether to use classifier free guidance or not
|
140 |
+
negative_prompt (`str` or `List[str]`, *optional*):
|
141 |
+
The prompt or prompts not to guide the image generation. If not defined, one has to pass
|
142 |
+
`negative_prompt_embeds` instead. Ignored when not using guidance (i.e., ignored if `guidance_scale` is
|
143 |
+
less than `1`).
|
144 |
+
prompt_embeds (`torch.FloatTensor`, *optional*):
|
145 |
+
Pre-generated text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt weighting. If not
|
146 |
+
provided, text embeddings will be generated from `prompt` input argument.
|
147 |
+
negative_prompt_embeds (`torch.FloatTensor`, *optional*):
|
148 |
+
Pre-generated negative text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt
|
149 |
+
weighting. If not provided, negative_prompt_embeds will be generated from `negative_prompt` input
|
150 |
+
argument.
|
151 |
+
"""
|
152 |
+
|
153 |
+
if prompt is not None and isinstance(prompt, str):
|
154 |
+
batch_size = 1
|
155 |
+
elif prompt is not None and isinstance(prompt, list):
|
156 |
+
batch_size = len(prompt)
|
157 |
+
else:
|
158 |
+
batch_size = prompt_embeds.shape[0]
|
159 |
+
|
160 |
+
if prompt_embeds is None:
|
161 |
+
# textual inversion: process multi-vector tokens if necessary
|
162 |
+
if isinstance(self, TextualInversionLoaderMixin):
|
163 |
+
prompt = self.maybe_convert_prompt(prompt, self.tokenizer)
|
164 |
+
|
165 |
+
text_inputs = self.tokenizer(
|
166 |
+
prompt,
|
167 |
+
padding="max_length",
|
168 |
+
max_length=self.tokenizer.model_max_length,
|
169 |
+
truncation=True,
|
170 |
+
return_tensors="pt",
|
171 |
+
)
|
172 |
+
text_input_ids = text_inputs.input_ids
|
173 |
+
untruncated_ids = self.tokenizer(prompt, padding="longest", return_tensors="pt").input_ids
|
174 |
+
|
175 |
+
if untruncated_ids.shape[-1] >= text_input_ids.shape[-1] and not torch.equal(
|
176 |
+
text_input_ids, untruncated_ids
|
177 |
+
):
|
178 |
+
removed_text = self.tokenizer.batch_decode(
|
179 |
+
untruncated_ids[:, self.tokenizer.model_max_length - 1: -1]
|
180 |
+
)
|
181 |
+
logger.warning(
|
182 |
+
"The following part of your input was truncated because CLIP can only handle sequences up to"
|
183 |
+
f" {self.tokenizer.model_max_length} tokens: {removed_text}"
|
184 |
+
)
|
185 |
+
|
186 |
+
if hasattr(self.text_encoder.config, "use_attention_mask") and self.text_encoder.config.use_attention_mask:
|
187 |
+
attention_mask = text_inputs.attention_mask.to(device)
|
188 |
+
else:
|
189 |
+
attention_mask = None
|
190 |
+
|
191 |
+
prompt_embeds = self.text_encoder(
|
192 |
+
text_input_ids.to(device),
|
193 |
+
attention_mask=attention_mask,
|
194 |
+
)
|
195 |
+
prompt_embeds = prompt_embeds[0]
|
196 |
+
|
197 |
+
if self.text_encoder is not None:
|
198 |
+
prompt_embeds_dtype = self.text_encoder.dtype
|
199 |
+
elif self.unet is not None:
|
200 |
+
prompt_embeds_dtype = self.unet.dtype
|
201 |
+
else:
|
202 |
+
prompt_embeds_dtype = prompt_embeds.dtype
|
203 |
+
|
204 |
+
prompt_embeds = prompt_embeds.to(dtype=prompt_embeds_dtype, device=device)
|
205 |
+
|
206 |
+
bs_embed, seq_len, _ = prompt_embeds.shape
|
207 |
+
# duplicate text embeddings for each generation per prompt, using mps friendly method
|
208 |
+
prompt_embeds = prompt_embeds.repeat(1, num_images_per_prompt, 1)
|
209 |
+
prompt_embeds = prompt_embeds.view(bs_embed * num_images_per_prompt, seq_len, -1)
|
210 |
+
|
211 |
+
# get unconditional embeddings for classifier free guidance
|
212 |
+
if do_classifier_free_guidance and negative_prompt_embeds is None:
|
213 |
+
uncond_tokens: List[str]
|
214 |
+
if negative_prompt is None:
|
215 |
+
uncond_tokens = [""] * batch_size
|
216 |
+
elif prompt is not None and type(prompt) is not type(negative_prompt):
|
217 |
+
raise TypeError(
|
218 |
+
f"`negative_prompt` should be the same type to `prompt`, but got {type(negative_prompt)} !="
|
219 |
+
f" {type(prompt)}."
|
220 |
+
)
|
221 |
+
elif isinstance(negative_prompt, str):
|
222 |
+
uncond_tokens = [negative_prompt]
|
223 |
+
elif batch_size != len(negative_prompt):
|
224 |
+
raise ValueError(
|
225 |
+
f"`negative_prompt`: {negative_prompt} has batch size {len(negative_prompt)}, but `prompt`:"
|
226 |
+
f" {prompt} has batch size {batch_size}. Please make sure that passed `negative_prompt` matches"
|
227 |
+
" the batch size of `prompt`."
|
228 |
+
)
|
229 |
+
else:
|
230 |
+
uncond_tokens = negative_prompt
|
231 |
+
|
232 |
+
# textual inversion: procecss multi-vector tokens if necessary
|
233 |
+
if isinstance(self, TextualInversionLoaderMixin):
|
234 |
+
uncond_tokens = self.maybe_convert_prompt(uncond_tokens, self.tokenizer)
|
235 |
+
|
236 |
+
max_length = prompt_embeds.shape[1]
|
237 |
+
uncond_input = self.tokenizer(
|
238 |
+
uncond_tokens,
|
239 |
+
padding="max_length",
|
240 |
+
max_length=max_length,
|
241 |
+
truncation=True,
|
242 |
+
return_tensors="pt",
|
243 |
+
)
|
244 |
+
|
245 |
+
if hasattr(self.text_encoder.config, "use_attention_mask") and self.text_encoder.config.use_attention_mask:
|
246 |
+
attention_mask = uncond_input.attention_mask.to(device)
|
247 |
+
else:
|
248 |
+
attention_mask = None
|
249 |
+
|
250 |
+
negative_prompt_embeds = self.text_encoder(
|
251 |
+
uncond_input.input_ids.to(device),
|
252 |
+
attention_mask=attention_mask,
|
253 |
+
)
|
254 |
+
negative_prompt_embeds = negative_prompt_embeds[0]
|
255 |
+
|
256 |
+
if do_classifier_free_guidance:
|
257 |
+
# duplicate unconditional embeddings for each generation per prompt, using mps friendly method
|
258 |
+
seq_len = negative_prompt_embeds.shape[1]
|
259 |
+
|
260 |
+
negative_prompt_embeds = negative_prompt_embeds.to(dtype=prompt_embeds_dtype, device=device)
|
261 |
+
|
262 |
+
negative_prompt_embeds = negative_prompt_embeds.repeat(1, num_images_per_prompt, 1)
|
263 |
+
negative_prompt_embeds = negative_prompt_embeds.view(batch_size * num_images_per_prompt, seq_len, -1)
|
264 |
+
|
265 |
+
# For classifier free guidance, we need to do two forward passes.
|
266 |
+
# Here we concatenate the unconditional and text embeddings into a single batch
|
267 |
+
# to avoid doing two forward passes
|
268 |
+
prompt_embeds = torch.cat([negative_prompt_embeds, prompt_embeds])
|
269 |
+
|
270 |
+
return prompt_embeds
|
271 |
+
|
272 |
+
# Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline.decode_latents
|
273 |
+
|
274 |
+
def decode_latents(self, latents):
|
275 |
+
latents = 1 / self.vae.config.scaling_factor * latents
|
276 |
+
image = self.vae.decode(latents).sample
|
277 |
+
image = (image / 2 + 0.5).clamp(0, 1)
|
278 |
+
# we always cast to float32 as this does not cause significant overhead and is compatible with bfloat16
|
279 |
+
image = image.cpu().permute(0, 2, 3, 1).float().numpy()
|
280 |
+
return image
|
281 |
+
|
282 |
+
# Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline.prepare_extra_step_kwargs
|
283 |
+
def prepare_extra_step_kwargs(self, generator, eta):
|
284 |
+
# prepare extra kwargs for the scheduler step, since not all schedulers have the same signature
|
285 |
+
# eta (η) is only used with the DDIMScheduler, it will be ignored for other schedulers.
|
286 |
+
# eta corresponds to η in DDIM paper: https://arxiv.org/abs/2010.02502
|
287 |
+
# and should be between [0, 1]
|
288 |
+
|
289 |
+
accepts_eta = "eta" in set(inspect.signature(self.scheduler.step).parameters.keys())
|
290 |
+
extra_step_kwargs = {}
|
291 |
+
if accepts_eta:
|
292 |
+
extra_step_kwargs["eta"] = eta
|
293 |
+
|
294 |
+
# check if the scheduler accepts generator
|
295 |
+
accepts_generator = "generator" in set(inspect.signature(self.scheduler.step).parameters.keys())
|
296 |
+
if accepts_generator:
|
297 |
+
extra_step_kwargs["generator"] = generator
|
298 |
+
return extra_step_kwargs
|
299 |
+
|
300 |
+
def check_inputs(
|
301 |
+
self, prompt, strength, callback_steps, negative_prompt=None, prompt_embeds=None, negative_prompt_embeds=None
|
302 |
+
):
|
303 |
+
if strength < 0 or strength > 1:
|
304 |
+
raise ValueError(f"The value of strength should in [0.0, 1.0] but is {strength}")
|
305 |
+
|
306 |
+
if (callback_steps is None) or (
|
307 |
+
callback_steps is not None and (not isinstance(callback_steps, int) or callback_steps <= 0)
|
308 |
+
):
|
309 |
+
raise ValueError(
|
310 |
+
f"`callback_steps` has to be a positive integer but is {callback_steps} of type"
|
311 |
+
f" {type(callback_steps)}."
|
312 |
+
)
|
313 |
+
|
314 |
+
if prompt is not None and prompt_embeds is not None:
|
315 |
+
raise ValueError(
|
316 |
+
f"Cannot forward both `prompt`: {prompt} and `prompt_embeds`: {prompt_embeds}. Please make sure to"
|
317 |
+
" only forward one of the two."
|
318 |
+
)
|
319 |
+
elif prompt is None and prompt_embeds is None:
|
320 |
+
raise ValueError(
|
321 |
+
"Provide either `prompt` or `prompt_embeds`. Cannot leave both `prompt` and `prompt_embeds` undefined."
|
322 |
+
)
|
323 |
+
elif prompt is not None and (not isinstance(prompt, str) and not isinstance(prompt, list)):
|
324 |
+
raise ValueError(f"`prompt` has to be of type `str` or `list` but is {type(prompt)}")
|
325 |
+
|
326 |
+
if negative_prompt is not None and negative_prompt_embeds is not None:
|
327 |
+
raise ValueError(
|
328 |
+
f"Cannot forward both `negative_prompt`: {negative_prompt} and `negative_prompt_embeds`:"
|
329 |
+
f" {negative_prompt_embeds}. Please make sure to only forward one of the two."
|
330 |
+
)
|
331 |
+
|
332 |
+
if prompt_embeds is not None and negative_prompt_embeds is not None:
|
333 |
+
if prompt_embeds.shape != negative_prompt_embeds.shape:
|
334 |
+
raise ValueError(
|
335 |
+
"`prompt_embeds` and `negative_prompt_embeds` must have the same shape when passed directly, but"
|
336 |
+
f" got: `prompt_embeds` {prompt_embeds.shape} != `negative_prompt_embeds`"
|
337 |
+
f" {negative_prompt_embeds.shape}."
|
338 |
+
)
|
339 |
+
|
340 |
+
def get_timesteps(self, num_inference_steps, strength, device):
|
341 |
+
# get the original timestep using init_timestep
|
342 |
+
init_timestep = min(int(num_inference_steps * strength), num_inference_steps)
|
343 |
+
|
344 |
+
t_start = max(num_inference_steps - init_timestep, 0)
|
345 |
+
timesteps = self.scheduler.timesteps[t_start * self.scheduler.order:]
|
346 |
+
|
347 |
+
return timesteps, num_inference_steps - t_start
|
348 |
+
|
349 |
+
def prepare_latents(self, image, timestep, batch_size, num_images_per_prompt, dtype, device, generator=None):
|
350 |
+
if not isinstance(image, (torch.Tensor, PIL.Image.Image, list)):
|
351 |
+
raise ValueError(
|
352 |
+
f"`image` has to be of type `torch.Tensor`, `PIL.Image.Image` or list but is {type(image)}"
|
353 |
+
)
|
354 |
+
|
355 |
+
image = image.to(device=device, dtype=dtype)
|
356 |
+
|
357 |
+
batch_size = batch_size * num_images_per_prompt
|
358 |
+
|
359 |
+
if image.shape[1] == 4:
|
360 |
+
init_latents = image
|
361 |
+
|
362 |
+
else:
|
363 |
+
if isinstance(generator, list) and len(generator) != batch_size:
|
364 |
+
raise ValueError(
|
365 |
+
f"You have passed a list of generators of length {len(generator)}, but requested an effective batch"
|
366 |
+
f" size of {batch_size}. Make sure the batch size matches the length of the generators."
|
367 |
+
)
|
368 |
+
|
369 |
+
elif isinstance(generator, list):
|
370 |
+
init_latents = [
|
371 |
+
self.vae.encode(image[i: i + 1]).latent_dist.sample(generator[i]) for i in range(batch_size)
|
372 |
+
]
|
373 |
+
init_latents = torch.cat(init_latents, dim=0)
|
374 |
+
else:
|
375 |
+
init_latents = self.vae.encode(image).latent_dist.sample(generator)
|
376 |
+
|
377 |
+
init_latents = self.vae.config.scaling_factor * init_latents
|
378 |
+
|
379 |
+
if batch_size > init_latents.shape[0] and batch_size % init_latents.shape[0] == 0:
|
380 |
+
# expand init_latents for batch_size
|
381 |
+
deprecation_message = (
|
382 |
+
f"You have passed {batch_size} text prompts (`prompt`), but only {init_latents.shape[0]} initial"
|
383 |
+
" images (`image`). Initial images are now duplicating to match the number of text prompts. Note"
|
384 |
+
" that this behavior is deprecated and will be removed in a version 1.0.0. Please make sure to update"
|
385 |
+
" your script to pass as many initial images as text prompts to suppress this warning."
|
386 |
+
)
|
387 |
+
deprecate("len(prompt) != len(image)", "1.0.0", deprecation_message, standard_warn=False)
|
388 |
+
additional_image_per_prompt = batch_size // init_latents.shape[0]
|
389 |
+
init_latents = torch.cat([init_latents] * additional_image_per_prompt, dim=0)
|
390 |
+
elif batch_size > init_latents.shape[0] and batch_size % init_latents.shape[0] != 0:
|
391 |
+
raise ValueError(
|
392 |
+
f"Cannot duplicate `image` of batch size {init_latents.shape[0]} to {batch_size} text prompts."
|
393 |
+
)
|
394 |
+
else:
|
395 |
+
init_latents = torch.cat([init_latents], dim=0)
|
396 |
+
|
397 |
+
shape = init_latents.shape
|
398 |
+
noise = randn_tensor(shape, generator=generator, device=device, dtype=dtype)
|
399 |
+
|
400 |
+
# get latents
|
401 |
+
init_latents = self.scheduler.add_noise(init_latents, noise, timestep)
|
402 |
+
latents = init_latents
|
403 |
+
|
404 |
+
return latents
|
405 |
+
|
406 |
+
@torch.no_grad()
|
407 |
+
def __call__(
|
408 |
+
self,
|
409 |
+
prompt: Union[str, List[str]] = None,
|
410 |
+
image: Union[
|
411 |
+
torch.FloatTensor,
|
412 |
+
PIL.Image.Image,
|
413 |
+
np.ndarray,
|
414 |
+
List[torch.FloatTensor],
|
415 |
+
List[PIL.Image.Image],
|
416 |
+
List[np.ndarray],
|
417 |
+
] = None,
|
418 |
+
strength: float = 0.8,
|
419 |
+
num_inference_steps: Optional[int] = 50,
|
420 |
+
guidance_scale: Optional[float] = 7.5,
|
421 |
+
negative_prompt: Optional[Union[str, List[str]]] = None,
|
422 |
+
num_images_per_prompt: Optional[int] = 1,
|
423 |
+
eta: Optional[float] = 0.0,
|
424 |
+
generator: Optional[Union[torch.Generator, List[torch.Generator]]] = None,
|
425 |
+
prompt_embeds: Optional[torch.FloatTensor] = None,
|
426 |
+
negative_prompt_embeds: Optional[torch.FloatTensor] = None,
|
427 |
+
callback: Optional[Callable[[int, int, torch.FloatTensor], None]] = None,
|
428 |
+
callback_steps: int = 1,
|
429 |
+
):
|
430 |
+
r"""
|
431 |
+
The call function to the pipeline for generation.
|
432 |
+
|
433 |
+
Args:
|
434 |
+
prompt (`str` or `List[str]`, *optional*):
|
435 |
+
The prompt or prompts to guide image generation. If not defined, you need to pass `prompt_embeds`.
|
436 |
+
image (`torch.FloatTensor`, `PIL.Image.Image`, `np.ndarray`, `List[torch.FloatTensor]`, `List[PIL.Image.Image]`, or `List[np.ndarray]`):
|
437 |
+
`Image` or tensor representing an image batch to be used as the starting point. Can also accept image
|
438 |
+
latents as `image`, but if passing latents directly it is not encoded again.
|
439 |
+
strength (`float`, *optional*, defaults to 0.8):
|
440 |
+
Indicates extent to transform the reference `image`. Must be between 0 and 1. `image` is used as a
|
441 |
+
starting point and more noise is added the higher the `strength`. The number of denoising steps depends
|
442 |
+
on the amount of noise initially added. When `strength` is 1, added noise is maximum and the denoising
|
443 |
+
process runs for the full number of iterations specified in `num_inference_steps`. A value of 1
|
444 |
+
essentially ignores `image`.
|
445 |
+
num_inference_steps (`int`, *optional*, defaults to 50):
|
446 |
+
The number of denoising steps. More denoising steps usually lead to a higher quality image at the
|
447 |
+
expense of slower inference. This parameter is modulated by `strength`.
|
448 |
+
guidance_scale (`float`, *optional*, defaults to 7.5):
|
449 |
+
A higher guidance scale value encourages the model to generate images closely linked to the text
|
450 |
+
`prompt` at the expense of lower image quality. Guidance scale is enabled when `guidance_scale > 1`.
|
451 |
+
negative_prompt (`str` or `List[str]`, *optional*):
|
452 |
+
The prompt or prompts to guide what to not include in image generation. If not defined, you need to
|
453 |
+
pass `negative_prompt_embeds` instead. Ignored when not using guidance (`guidance_scale < 1`).
|
454 |
+
num_images_per_prompt (`int`, *optional*, defaults to 1):
|
455 |
+
The number of images to generate per prompt.
|
456 |
+
eta (`float`, *optional*, defaults to 0.0):
|
457 |
+
Corresponds to parameter eta (η) from the [DDIM](https://arxiv.org/abs/2010.02502) paper. Only applies
|
458 |
+
to the [`~schedulers.DDIMScheduler`], and is ignored in other schedulers.
|
459 |
+
generator (`torch.Generator` or `List[torch.Generator]`, *optional*):
|
460 |
+
A [`torch.Generator`](https://pytorch.org/docs/stable/generated/torch.Generator.html) to make
|
461 |
+
generation deterministic.
|
462 |
+
prompt_embeds (`torch.FloatTensor`, *optional*):
|
463 |
+
Pre-generated text embeddings. Can be used to easily tweak text inputs (prompt weighting). If not
|
464 |
+
provided, text embeddings are generated from the `prompt` input argument.
|
465 |
+
negative_prompt_embeds (`torch.FloatTensor`, *optional*):
|
466 |
+
Pre-generated negative text embeddings. Can be used to easily tweak text inputs (prompt weighting). If
|
467 |
+
not provided, `negative_prompt_embeds` are generated from the `negative_prompt` input argument.
|
468 |
+
callback (`Callable`, *optional*):
|
469 |
+
A function that is called every `callback_steps` steps during inference. The function is called with the
|
470 |
+
following arguments: `callback(step: int, timestep: int, latents: torch.FloatTensor)`.
|
471 |
+
callback_steps (`int`, *optional*, defaults to 1):
|
472 |
+
The frequency at which the `callback` function is called. If not specified, the callback is called at
|
473 |
+
every step.
|
474 |
+
|
475 |
+
Examples:
|
476 |
+
|
477 |
+
Returns:
|
478 |
+
[`~pipelines.stable_diffusion.StableDiffusionPipelineOutput`] or `tuple`:
|
479 |
+
If `return_dict` is `True`, [`~pipelines.stable_diffusion.StableDiffusionPipelineOutput`] is returned,
|
480 |
+
otherwise a `tuple` is returned where the first element is a list with the generated images and the
|
481 |
+
second element is a list of `bool`s indicating whether the corresponding generated image contains
|
482 |
+
"not-safe-for-work" (nsfw) content.
|
483 |
+
"""
|
484 |
+
# 1. Check inputs. Raise error if not correct
|
485 |
+
self.check_inputs(prompt, strength, callback_steps, negative_prompt, prompt_embeds, negative_prompt_embeds)
|
486 |
+
|
487 |
+
# 2. Define call parameters
|
488 |
+
if prompt is not None and isinstance(prompt, str):
|
489 |
+
batch_size = 1
|
490 |
+
elif prompt is not None and isinstance(prompt, list):
|
491 |
+
batch_size = len(prompt)
|
492 |
+
else:
|
493 |
+
batch_size = prompt_embeds.shape[0]
|
494 |
+
|
495 |
+
device = self.device
|
496 |
+
# here `guidance_scale` is defined analog to the guidance weight `w` of equation (2)
|
497 |
+
# of the Imagen paper: https://arxiv.org/pdf/2205.11487.pdf . `guidance_scale = 1`
|
498 |
+
# corresponds to doing no classifier free guidance.
|
499 |
+
do_classifier_free_guidance = guidance_scale > 1.0
|
500 |
+
|
501 |
+
# 3. Encode input prompt
|
502 |
+
prompt_embeds = self._encode_prompt(
|
503 |
+
prompt,
|
504 |
+
device,
|
505 |
+
num_images_per_prompt,
|
506 |
+
do_classifier_free_guidance,
|
507 |
+
negative_prompt,
|
508 |
+
prompt_embeds=prompt_embeds,
|
509 |
+
negative_prompt_embeds=negative_prompt_embeds,
|
510 |
+
)
|
511 |
+
|
512 |
+
# 4. Preprocess image
|
513 |
+
image = preprocess(image)
|
514 |
+
|
515 |
+
# 5. set timesteps
|
516 |
+
self.scheduler.set_timesteps(num_inference_steps, device=device)
|
517 |
+
timesteps, num_inference_steps = self.get_timesteps(num_inference_steps, strength, device)
|
518 |
+
latent_timestep = timesteps[:1].repeat(batch_size * num_images_per_prompt)
|
519 |
+
|
520 |
+
# 6. Prepare latent variables
|
521 |
+
latents = self.prepare_latents(
|
522 |
+
image, latent_timestep, batch_size, num_images_per_prompt, prompt_embeds.dtype, device, generator
|
523 |
+
)
|
524 |
+
|
525 |
+
# 7. Prepare extra step kwargs. TODO: Logic should ideally just be moved out of the pipeline
|
526 |
+
extra_step_kwargs = self.prepare_extra_step_kwargs(generator, eta)
|
527 |
+
|
528 |
+
# 8. Denoising loop
|
529 |
+
num_warmup_steps = len(timesteps) - num_inference_steps * self.scheduler.order
|
530 |
+
|
531 |
+
for i, t in enumerate(timesteps):
|
532 |
+
# expand the latents if we are doing classifier free guidance
|
533 |
+
latent_model_input = torch.cat([latents] * 2) if do_classifier_free_guidance else latents
|
534 |
+
latent_model_input = self.scheduler.scale_model_input(latent_model_input, t)
|
535 |
+
latent_model_input = latent_model_input.permute(0, 2, 3, 1).contiguous()
|
536 |
+
|
537 |
+
# predict the noise residual
|
538 |
+
# the last four None arguments are the ControlNet inputs; None is passed as a placeholder for now
|
539 |
+
noise_pred = self.unet.forward(latent_model_input, prompt_embeds, t, None, None, None, None)
|
540 |
+
|
541 |
+
noise_pred = noise_pred.permute(0, 3, 1, 2)
|
542 |
+
|
543 |
+
# perform guidance
|
544 |
+
if do_classifier_free_guidance:
|
545 |
+
noise_pred_uncond, noise_pred_text = noise_pred.chunk(2)
|
546 |
+
noise_pred = noise_pred_uncond + guidance_scale * (noise_pred_text - noise_pred_uncond)
|
547 |
+
|
548 |
+
# compute the previous noisy sample x_t -> x_t-1
|
549 |
+
latents = self.scheduler.step(noise_pred, t, latents, **extra_step_kwargs).prev_sample
|
550 |
+
|
551 |
+
image = self.decode_latents(latents)
|
552 |
+
image = numpy_to_pil(image)
|
553 |
+
|
554 |
+
return image
|
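For quick reference, below is a minimal, hedged usage sketch for the img2img pipeline added above. It assumes the class is exported from `lyrasd_model` as `LyraSdImg2ImgPipeline` (mirroring `LyraSdTxt2ImgPipeline` later in this commit) and that the model and library paths follow the demo scripts; the keyword names `image`, `strength`, `num_inference_steps`, `guidance_scale` and `generator` are taken from the pipeline code above, everything else is illustrative.

```python
# Hedged sketch, not the canonical API: class name and paths are assumptions
# based on the txt2img pipeline and demo scripts in this commit.
import torch
from PIL import Image
from lyrasd_model import LyraSdImg2ImgPipeline  # assumed export name

lib_path = "./lyrasd_model/lyrasd_lib/libth_lyrasd_cu12_sm86.so"  # pick the .so matching your CUDA / SM
model_path = "./models/lyrasd_rev_animated"

pipe = LyraSdImg2ImgPipeline(model_path, lib_path)

init_image = Image.open("outputs/res_txt2img_0.png").convert("RGB")
images = pipe(
    prompt="a cat, masterpiece, best quality",
    image=init_image,                 # starting image to be noised and re-denoised
    strength=0.6,                     # how far back in the schedule to start
    num_inference_steps=30,
    guidance_scale=7.5,
    generator=torch.Generator().manual_seed(123),
)
images[0].save("outputs/res_img2img_sketch_0.png")  # illustrative output path
```

As in the txt2img pipeline, the call returns a plain list of PIL images rather than a `StableDiffusionPipelineOutput`.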
lyraSD/muse_trt/libnvinfer_plugin.so → lyrasd_model/lyrasd_lib/libth_lyrasd_cu11_sm80.so
RENAMED
@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:
-size
+oid sha256:0689ed5d3b55f5033a8869d5f23ce900793aa0ab7fdc4a3e3c0a0f3a243c83da
+size 65441456
|
sd1.4-engine/superx4-512-512.plan → lyrasd_model/lyrasd_lib/libth_lyrasd_cu11_sm86.so
RENAMED
@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:
-size
+oid sha256:b8e27e715fa3a17ce25bf23b772e0dd355d0780c1bd93cfeeb12ef45b0ba2444
+size 65389176
|
sd1.4-engine/clip.plan → lyrasd_model/lyrasd_lib/libth_lyrasd_cu12_sm80.so
RENAMED
@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:
-size
+oid sha256:c2eaa9067ad8eb1d20872afa71ed9497f62d930819704d15e5e8bf559623eca7
+size 65498752
|
sd1.4-engine/vae-decoder.plan → lyrasd_model/lyrasd_lib/libth_lyrasd_cu12_sm86.so
RENAMED
@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:
-size
+oid sha256:7d0c909ff2498934c6d1ed8f46af6cdc7812872177c0a4e7ca0ee99bf88fcb65
+size 65519232
|
lyrasd_model/lyrasd_lib/placeholder.txt
ADDED
File without changes
|
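The four shared libraries above are built per CUDA major version (cu11/cu12) and GPU architecture (sm80/sm86). A small, hedged helper like the following can pick the matching binary at runtime; it only covers the four combinations shipped in this commit and assumes a CUDA build of PyTorch.

```python
# Hedged sketch: choose the prebuilt lyrasd library that matches the local
# CUDA major version and GPU compute capability (sm80 or sm86 only).
import torch

def pick_lyrasd_lib(lib_dir="./lyrasd_model/lyrasd_lib"):
    cuda_major = int(torch.version.cuda.split(".")[0])      # e.g. 11 or 12
    sm_major, sm_minor = torch.cuda.get_device_capability()  # e.g. (8, 6)
    return f"{lib_dir}/libth_lyrasd_cu{cuda_major}_sm{sm_major}{sm_minor}.so"

lib_path = pick_lyrasd_lib()
print(lib_path)  # expect one of the cu11/cu12 x sm80/sm86 files listed above
```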
lyrasd_model/lyrasd_txt2img_pipeline.py
ADDED
@@ -0,0 +1,458 @@
+import inspect
+import os
+import time
+from typing import Any, Callable, Dict, List, Optional, Union
+
+import torch
+from diffusers.loaders import TextualInversionLoaderMixin
+from diffusers.models import AutoencoderKL
+from diffusers.schedulers import (DPMSolverMultistepScheduler,
+                                  EulerAncestralDiscreteScheduler,
+                                  EulerDiscreteScheduler,
+                                  KarrasDiffusionSchedulers)
+from diffusers.utils import logging, randn_tensor
+from PIL import Image
+from transformers import CLIPImageProcessor, CLIPTextModel, CLIPTokenizer
+import gc
+import numpy as np
+
+from .lora_util import add_text_lora_layer
+
+logger = logging.get_logger(__name__)  # pylint: disable=invalid-name
+
+
+def numpy_to_pil(images):
+    """
+    Convert a numpy image or a batch of images to a PIL image.
+    """
+    if images.ndim == 3:
+        images = images[None, ...]
+    images = (images * 255).round().astype("uint8")
+    if images.shape[-1] == 1:
+        # special case for grayscale (single channel) images
+        pil_images = [Image.fromarray(image.squeeze(), mode="L") for image in images]
+    else:
+        pil_images = [Image.fromarray(image) for image in images]
+
+    return pil_images
+
+
+class LyraSdTxt2ImgPipeline(TextualInversionLoaderMixin):
+    def __init__(self, model_path, lib_so_path, model_dtype="fp32", device=torch.device("cuda"), dtype=torch.float16) -> None:
+        self.device = device
+        self.dtype = dtype
+
+        torch.classes.load_library(lib_so_path)
+
+        self.vae = AutoencoderKL.from_pretrained(model_path, subfolder="vae").to(dtype).to(device)
+        self.tokenizer = CLIPTokenizer.from_pretrained(model_path, subfolder="tokenizer")
+        self.text_encoder = CLIPTextModel.from_pretrained(model_path, subfolder="text_encoder").to(dtype).to(device)
+        unet_path = os.path.join(model_path, "unet_bins/")
+
+        self.unet_in_channels = 4
+        self.vae_scale_factor = 2 ** (len(self.vae.config.block_out_channels) - 1)
+        self.vae.enable_tiling()
+        self.unet = torch.classes.lyrasd.Unet2dConditionalModelOp(
+            3,       # max num of controlnets
+            "fp16"   # inference dtype (can only use fp16 for now)
+        )
+
+        unet_path = os.path.join(model_path, "unet_bins/")
+
+        self.reload_unet_model(unet_path, model_dtype)
+
+        self.scheduler = EulerAncestralDiscreteScheduler.from_pretrained(model_path, subfolder="scheduler")
+
+        self.loaded_lora = {}
+
+    def reload_unet_model(self, unet_path, unet_file_format='fp32'):
+        if len(unet_path) > 0 and unet_path[-1] != "/":
+            unet_path = unet_path + "/"
+        return self.unet.reload_unet_model(unet_path, unet_file_format)
+
+    def load_lora(self, lora_model_path, lora_name, lora_strength, lora_file_format='fp32'):
+        if len(lora_model_path) > 0 and lora_model_path[-1] != "/":
+            lora_model_path = lora_model_path + "/"
+        lora = add_text_lora_layer(self.text_encoder, lora_model_path, lora_strength, lora_file_format)
+        self.loaded_lora[lora_name] = lora
+        self.unet.load_lora(lora_model_path, lora_name, lora_strength, lora_file_format)
+
+    def unload_lora(self, lora_name, clean_cache=False):
+        for layer_data in self.loaded_lora[lora_name]:
+            layer = layer_data['layer']
+            added_weight = layer_data['added_weight']
+            layer.weight.data -= added_weight
+        self.unet.unload_lora(lora_name, clean_cache)
+        del self.loaded_lora[lora_name]
+        gc.collect()
+        torch.cuda.empty_cache()
+
+    def clean_lora_cache(self):
+        self.unet.clean_lora_cache()
+
+    def get_loaded_lora(self):
+        return self.unet.get_loaded_lora()
+
+    def _encode_prompt(
+        self,
+        prompt,
+        device,
+        num_images_per_prompt,
+        do_classifier_free_guidance,
+        negative_prompt=None,
+        prompt_embeds: Optional[torch.FloatTensor] = None,
+        negative_prompt_embeds: Optional[torch.FloatTensor] = None,
+    ):
+        r"""
+        Encodes the prompt into text encoder hidden states.
+
+        Args:
+            prompt (`str` or `List[str]`, *optional*):
+                prompt to be encoded
+            device: (`torch.device`):
+                torch device
+            num_images_per_prompt (`int`):
+                number of images that should be generated per prompt
+            do_classifier_free_guidance (`bool`):
+                whether to use classifier free guidance or not
+            negative_prompt (`str` or `List[str]`, *optional*):
+                The prompt or prompts not to guide the image generation. If not defined, one has to pass
+                `negative_prompt_embeds` instead. Ignored when not using guidance (i.e., ignored if `guidance_scale` is
+                less than `1`).
+            prompt_embeds (`torch.FloatTensor`, *optional*):
+                Pre-generated text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt weighting. If not
+                provided, text embeddings will be generated from `prompt` input argument.
+            negative_prompt_embeds (`torch.FloatTensor`, *optional*):
+                Pre-generated negative text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt
+                weighting. If not provided, negative_prompt_embeds will be generated from `negative_prompt` input
+                argument.
+        """
+        if prompt is not None and isinstance(prompt, str):
+            batch_size = 1
+        elif prompt is not None and isinstance(prompt, list):
+            batch_size = len(prompt)
+        else:
+            batch_size = prompt_embeds.shape[0]
+
+        if prompt_embeds is None:
+            # textual inversion: process multi-vector tokens if necessary
+            if isinstance(self, TextualInversionLoaderMixin):
+                prompt = self.maybe_convert_prompt(prompt, self.tokenizer)
+
+            text_inputs = self.tokenizer(
+                prompt,
+                padding="max_length",
+                max_length=self.tokenizer.model_max_length,
+                truncation=True,
+                return_tensors="pt",
+            )
+            text_input_ids = text_inputs.input_ids
+            untruncated_ids = self.tokenizer(prompt, padding="longest", return_tensors="pt").input_ids
+
+            if untruncated_ids.shape[-1] >= text_input_ids.shape[-1] and not torch.equal(
+                text_input_ids, untruncated_ids
+            ):
+                removed_text = self.tokenizer.batch_decode(
+                    untruncated_ids[:, self.tokenizer.model_max_length - 1: -1]
+                )
+                logger.warning(
+                    "The following part of your input was truncated because CLIP can only handle sequences up to"
+                    f" {self.tokenizer.model_max_length} tokens: {removed_text}"
+                )
+
+            if hasattr(self.text_encoder.config, "use_attention_mask") and self.text_encoder.config.use_attention_mask:
+                attention_mask = text_inputs.attention_mask.to(device)
+            else:
+                attention_mask = None
+
+            prompt_embeds = self.text_encoder(
+                text_input_ids.to(device),
+                attention_mask=attention_mask,
+            )
+            prompt_embeds = prompt_embeds[0]
+
+        prompt_embeds = prompt_embeds.to(dtype=self.text_encoder.dtype, device=device)
+
+        bs_embed, seq_len, _ = prompt_embeds.shape
+        # duplicate text embeddings for each generation per prompt, using mps friendly method
+        prompt_embeds = prompt_embeds.repeat(1, num_images_per_prompt, 1)
+        prompt_embeds = prompt_embeds.view(bs_embed * num_images_per_prompt, seq_len, -1)
+
+        # get unconditional embeddings for classifier free guidance
+        if do_classifier_free_guidance and negative_prompt_embeds is None:
+            uncond_tokens: List[str]
+            if negative_prompt is None:
+                uncond_tokens = [""] * batch_size
+            elif type(prompt) is not type(negative_prompt):
+                raise TypeError(
+                    f"`negative_prompt` should be the same type to `prompt`, but got {type(negative_prompt)} !="
+                    f" {type(prompt)}."
+                )
+            elif isinstance(negative_prompt, str):
+                uncond_tokens = [negative_prompt]
+            elif batch_size != len(negative_prompt):
+                raise ValueError(
+                    f"`negative_prompt`: {negative_prompt} has batch size {len(negative_prompt)}, but `prompt`:"
+                    f" {prompt} has batch size {batch_size}. Please make sure that passed `negative_prompt` matches"
+                    " the batch size of `prompt`."
+                )
+            else:
+                uncond_tokens = negative_prompt
+
+            # textual inversion: process multi-vector tokens if necessary
+            if isinstance(self, TextualInversionLoaderMixin):
+                uncond_tokens = self.maybe_convert_prompt(uncond_tokens, self.tokenizer)
+
+            max_length = prompt_embeds.shape[1]
+            uncond_input = self.tokenizer(
+                uncond_tokens,
+                padding="max_length",
+                max_length=max_length,
+                truncation=True,
+                return_tensors="pt",
+            )
+
+            if hasattr(self.text_encoder.config, "use_attention_mask") and self.text_encoder.config.use_attention_mask:
+                attention_mask = uncond_input.attention_mask.to(device)
+            else:
+                attention_mask = None
+
+            negative_prompt_embeds = self.text_encoder(
+                uncond_input.input_ids.to(device),
+                attention_mask=attention_mask,
+            )
+            negative_prompt_embeds = negative_prompt_embeds[0]
+
+        if do_classifier_free_guidance:
+            # duplicate unconditional embeddings for each generation per prompt, using mps friendly method
+            seq_len = negative_prompt_embeds.shape[1]
+
+            negative_prompt_embeds = negative_prompt_embeds.to(dtype=self.text_encoder.dtype, device=device)
+
+            negative_prompt_embeds = negative_prompt_embeds.repeat(1, num_images_per_prompt, 1)
+            negative_prompt_embeds = negative_prompt_embeds.view(batch_size * num_images_per_prompt, seq_len, -1)
+
+            # For classifier free guidance, we need to do two forward passes.
+            # Here we concatenate the unconditional and text embeddings into a single batch
+            # to avoid doing two forward passes
+            prompt_embeds = torch.cat([negative_prompt_embeds, prompt_embeds])
+
+        return prompt_embeds
+
+    def decode_latents(self, latents):
+        latents = 1 / self.vae.config.scaling_factor * latents
+        image = self.vae.decode(latents).sample
+        image = (image / 2 + 0.5).clamp(0, 1)
+        # we always cast to float32 as this does not cause significant overhead and is compatible with bfloat16
+        image = image.cpu().permute(0, 2, 3, 1).float().numpy()
+        return image
+
+    def check_inputs(
+        self,
+        prompt,
+        height,
+        width,
+        negative_prompt=None,
+        prompt_embeds=None,
+        negative_prompt_embeds=None,
+    ):
+        if height % 64 != 0 or width % 64 != 0:  # the first version only supports height and width that are multiples of 64
+            raise ValueError(f"`height` and `width` have to be divisible by 64 but are {height} and {width}.")
+
+        if prompt is not None and prompt_embeds is not None:
+            raise ValueError(
+                f"Cannot forward both `prompt`: {prompt} and `prompt_embeds`: {prompt_embeds}. Please make sure to"
+                " only forward one of the two."
+            )
+        elif prompt is None and prompt_embeds is None:
+            raise ValueError(
+                "Provide either `prompt` or `prompt_embeds`. Cannot leave both `prompt` and `prompt_embeds` undefined."
+            )
+        elif prompt is not None and (not isinstance(prompt, str) and not isinstance(prompt, list)):
+            raise ValueError(f"`prompt` has to be of type `str` or `list` but is {type(prompt)}")
+
+        if negative_prompt is not None and negative_prompt_embeds is not None:
+            raise ValueError(
+                f"Cannot forward both `negative_prompt`: {negative_prompt} and `negative_prompt_embeds`:"
+                f" {negative_prompt_embeds}. Please make sure to only forward one of the two."
+            )
+
+        if prompt_embeds is not None and negative_prompt_embeds is not None:
+            if prompt_embeds.shape != negative_prompt_embeds.shape:
+                raise ValueError(
+                    "`prompt_embeds` and `negative_prompt_embeds` must have the same shape when passed directly, but"
+                    f" got: `prompt_embeds` {prompt_embeds.shape} != `negative_prompt_embeds`"
+                    f" {negative_prompt_embeds.shape}."
+                )
+
+    def prepare_latents(self, batch_size, num_channels_latents, height, width, dtype, device, generator, latents=None):
+        shape = (batch_size, num_channels_latents, height // self.vae_scale_factor, width // self.vae_scale_factor)
+        if isinstance(generator, list) and len(generator) != batch_size:
+            raise ValueError(
+                f"You have passed a list of generators of length {len(generator)}, but requested an effective batch"
+                f" size of {batch_size}. Make sure the batch size matches the length of the generators."
+            )
+
+        if latents is None:
+            latents = randn_tensor(shape, generator=generator, device=device, dtype=dtype)
+        else:
+            latents = latents.to(device)
+
+        # scale the initial noise by the standard deviation required by the scheduler
+        latents = latents * self.scheduler.init_noise_sigma
+        return latents
+
+    def prepare_extra_step_kwargs(self, generator, eta):
+        # prepare extra kwargs for the scheduler step, since not all schedulers have the same signature
+        # eta (η) is only used with the DDIMScheduler, it will be ignored for other schedulers.
+        # eta corresponds to η in DDIM paper: https://arxiv.org/abs/2010.02502
+        # and should be between [0, 1]
+
+        accepts_eta = "eta" in set(inspect.signature(self.scheduler.step).parameters.keys())
+        extra_step_kwargs = {}
+        if accepts_eta:
+            extra_step_kwargs["eta"] = eta
+
+        # check if the scheduler accepts generator
+        accepts_generator = "generator" in set(inspect.signature(self.scheduler.step).parameters.keys())
+        if accepts_generator:
+            extra_step_kwargs["generator"] = generator
+        return extra_step_kwargs
+
+    @torch.no_grad()
+    def __call__(
+        self,
+        prompt: Union[str, List[str]] = None,
+        height: Optional[int] = None,
+        width: Optional[int] = None,
+        num_inference_steps: int = 50,
+        guidance_scale: float = 7.5,
+        negative_prompt: Optional[Union[str, List[str]]] = None,
+        num_images_per_prompt: Optional[int] = 1,
+        eta: float = 0.0,
+        generator: Optional[Union[torch.Generator, List[torch.Generator]]] = None,
+        latents: Optional[torch.FloatTensor] = None,
+        prompt_embeds: Optional[torch.FloatTensor] = None,
+        negative_prompt_embeds: Optional[torch.FloatTensor] = None,
+    ):
+        r"""
+        Function invoked when calling the pipeline for generation.
+
+        Args:
+            prompt (`str` or `List[str]`, *optional*):
+                The prompt or prompts to guide the image generation. If not defined, one has to pass `prompt_embeds`
+                instead.
+            height (`int`, *optional*, defaults to self.unet.config.sample_size * self.vae_scale_factor):
+                The height in pixels of the generated image.
+            width (`int`, *optional*, defaults to self.unet.config.sample_size * self.vae_scale_factor):
+                The width in pixels of the generated image.
+            num_inference_steps (`int`, *optional*, defaults to 50):
+                The number of denoising steps. More denoising steps usually lead to a higher quality image at the
+                expense of slower inference.
+            guidance_scale (`float`, *optional*, defaults to 7.5):
+                Guidance scale as defined in [Classifier-Free Diffusion Guidance](https://arxiv.org/abs/2207.12598).
+                `guidance_scale` is defined as `w` of equation 2. of [Imagen
+                Paper](https://arxiv.org/pdf/2205.11487.pdf). Guidance scale is enabled by setting `guidance_scale >
+                1`. Higher guidance scale encourages to generate images that are closely linked to the text `prompt`,
+                usually at the expense of lower image quality.
+            negative_prompt (`str` or `List[str]`, *optional*):
+                The prompt or prompts not to guide the image generation. If not defined, one has to pass
+                `negative_prompt_embeds` instead. Ignored when not using guidance (i.e., ignored if `guidance_scale` is
+                less than `1`).
+            num_images_per_prompt (`int`, *optional*, defaults to 1):
+                The number of images to generate per prompt.
+            eta (`float`, *optional*, defaults to 0.0):
+                Corresponds to parameter eta (η) in the DDIM paper: https://arxiv.org/abs/2010.02502. Only applies to
+                [`schedulers.DDIMScheduler`], will be ignored for others.
+            generator (`torch.Generator` or `List[torch.Generator]`, *optional*):
+                One or a list of [torch generator(s)](https://pytorch.org/docs/stable/generated/torch.Generator.html)
+                to make generation deterministic.
+            latents (`torch.FloatTensor`, *optional*):
+                Pre-generated noisy latents, sampled from a Gaussian distribution, to be used as inputs for image
+                generation. Can be used to tweak the same generation with different prompts. If not provided, a latents
+                tensor will be generated by sampling using the supplied random `generator`.
+            prompt_embeds (`torch.FloatTensor`, *optional*):
+                Pre-generated text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt weighting. If not
+                provided, text embeddings will be generated from `prompt` input argument.
+            negative_prompt_embeds (`torch.FloatTensor`, *optional*):
+                Pre-generated negative text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt
+                weighting. If not provided, negative_prompt_embeds will be generated from `negative_prompt` input
+                argument.
+
+        """
+        # 1. Check inputs. Raise error if not correct
+        self.check_inputs(
+            prompt, height, width, negative_prompt, prompt_embeds, negative_prompt_embeds
+        )
+
+        # 2. Define call parameters
+        if prompt is not None and isinstance(prompt, str):
+            batch_size = 1
+        elif prompt is not None and isinstance(prompt, list):
+            batch_size = len(prompt)
+        else:
+            batch_size = prompt_embeds.shape[0]
+
+        device = self.device
+        # here `guidance_scale` is defined analogous to the guidance weight `w` of equation (2)
+        # of the Imagen paper: https://arxiv.org/pdf/2205.11487.pdf . `guidance_scale = 1`
+        # corresponds to doing no classifier free guidance.
+        do_classifier_free_guidance = guidance_scale > 1.0
+
+        # 3. Encode input prompt
+        prompt_embeds = self._encode_prompt(
+            prompt,
+            device,
+            num_images_per_prompt,
+            do_classifier_free_guidance,
+            negative_prompt,
+            prompt_embeds=prompt_embeds,
+            negative_prompt_embeds=negative_prompt_embeds,
+        )
+
+        # 4. Prepare timesteps
+        self.scheduler.set_timesteps(num_inference_steps, device=device)
+        timesteps = self.scheduler.timesteps
+
+        # 5. Prepare latent variables
+        num_channels_latents = self.unet_in_channels
+        latents = self.prepare_latents(
+            batch_size * num_images_per_prompt,
+            num_channels_latents,
+            height,
+            width,
+            prompt_embeds.dtype,
+            device,
+            generator,
+            latents,
+        )
+
+        # 6. Prepare extra step kwargs. TODO: Logic should ideally just be moved out of the pipeline
+        extra_step_kwargs = self.prepare_extra_step_kwargs(generator, eta)
+
+        # 7. Denoising loop
+        num_warmup_steps = len(timesteps) - num_inference_steps * self.scheduler.order
+
+        for i, t in enumerate(timesteps):
+            # expand the latents if we are doing classifier free guidance
+            latent_model_input = torch.cat([latents] * 2) if do_classifier_free_guidance else latents
+            latent_model_input = self.scheduler.scale_model_input(latent_model_input, t)
+            latent_model_input = latent_model_input.permute(0, 2, 3, 1).contiguous()
+
+            # the last 4 None arguments are ControlNet inputs; they are passed as None placeholders for now
+            noise_pred = self.unet.forward(latent_model_input, prompt_embeds, t, None, None, None, None)
+
+            noise_pred = noise_pred.permute(0, 3, 1, 2)
+
+            # perform guidance
+            if do_classifier_free_guidance:
+                noise_pred_uncond, noise_pred_text = noise_pred.chunk(2)
+                noise_pred = noise_pred_uncond + guidance_scale * (noise_pred_text - noise_pred_uncond)
+
+            # compute the previous noisy sample x_t -> x_t-1
+            latents = self.scheduler.step(noise_pred, t, latents, **extra_step_kwargs).prev_sample
+
+        image = self.decode_latents(latents)
+        image = numpy_to_pil(image)
+
+        return image
|
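A hedged sketch of the LoRA lifecycle exposed by the pipeline above (load, inspect, generate, unload) is shown below. Paths and the `xiaorenshu` LoRA name follow the demo scripts in this commit; substitute your own converted model and LoRA directories.

```python
# Hedged sketch of LyraSdTxt2ImgPipeline's LoRA management; paths are the
# demo defaults from this commit and are assumptions for your setup.
import torch
from lyrasd_model import LyraSdTxt2ImgPipeline

pipe = LyraSdTxt2ImgPipeline("./models/lyrasd_rev_animated",
                             "./lyrasd_model/lyrasd_lib/libth_lyrasd_cu12_sm86.so")

# Attach a LoRA: (directory, name, strength, weight file precision)
pipe.load_lora("./models/lyrasd_xiaorenshu_lora", "xiaorenshu", 0.4, "fp32")
print(pipe.get_loaded_lora())  # LoRAs currently loaded into the unet op

images = pipe("a cat, chinese painting, best quality",
              512, 512, 30, 7.5, "low quality", 1,
              generator=torch.Generator().manual_seed(123))
images[0].save("outputs/res_txt2img_lora_sketch_0.png")  # illustrative output path

# Detach the LoRA by name; clean_cache=True also drops its cached weights,
# and clean_lora_cache() can be called later to free anything still cached.
pipe.unload_lora("xiaorenshu", clean_cache=True)
pipe.clean_lora_cache()
```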
models/README.md
ADDED
@@ -0,0 +1,12 @@
+# Models
+### This is where you should download the checkpoints and unpack them
+
+```bash
+wget -O lyrasd_rev_animated.tar.gz "https://chuangxin-research-1258344705.cos.ap-guangzhou.myqcloud.com/share/files/lyrasd/lyrasd_rev_animated.tar.gz?q-sign-algorithm=sha1&q-ak=AKIDBF6i7GCtKWS8ZkgOtACzX3MQDl37xYty&q-sign-time=1694078210;1866878210&q-key-time=1694078210;1866878210&q-header-list=&q-url-param-list=&q-signature=6046546135631dee9e8be7d8e061a77e8790e675"
+wget -O lyrasd_canny.tar.gz "https://chuangxin-research-1258344705.cos.ap-guangzhou.myqcloud.com/share/files/lyrasd/lyrasd_canny.tar.gz?q-sign-algorithm=sha1&q-ak=AKIDBF6i7GCtKWS8ZkgOtACzX3MQDl37xYty&q-sign-time=1694078194;1866878194&q-key-time=1694078194;1866878194&q-header-list=&q-url-param-list=&q-signature=efb713ee650a0ee3c954fb3a0e148c37ef13cd3b"
+wget -O lyrasd_xiaorenshu_lora.tar.gz "https://chuangxin-research-1258344705.cos.ap-guangzhou.myqcloud.com/share/files/lyrasd/lyrasd_xiaorenshu_lora.tar.gz?q-sign-algorithm=sha1&q-ak=AKIDBF6i7GCtKWS8ZkgOtACzX3MQDl37xYty&q-sign-time=1694078234;1866878234&q-key-time=1694078234;1866878234&q-header-list=&q-url-param-list=&q-signature=fb9a577a54ea6dedd9be696e40b96b71a1b23b5d"
+
+tar -xvf lyrasd_rev_animated.tar.gz
+tar -xvf lyrasd_canny.tar.gz
+tar -xvf lyrasd_xiaorenshu_lora.tar.gz
+```
|
output/img2img_demo.jpg
DELETED
Binary file (22 kB)
|
|
output/img2img_input.jpg
DELETED
Binary file (97.3 kB)
|
|
output/text2img_demo.jpg
DELETED
Binary file (42.2 kB)
|
|
outputs/res_controlnet_img2img_0.png
ADDED
|
outputs/res_controlnet_txt2img_0.png
ADDED
|
outputs/res_img2img_0.png
ADDED
|
outputs/res_txt2img_0.png
ADDED
|
outputs/res_txt2img_lora_0.png
ADDED
|
requirements.txt
ADDED
@@ -0,0 +1,2 @@
+diffusers
+transformers
|
sd1.4-engine/feature_extractor/preprocessor_config.json
DELETED
@@ -1,28 +0,0 @@
-{
-  "crop_size": {
-    "height": 224,
-    "width": 224
-  },
-  "do_center_crop": true,
-  "do_convert_rgb": true,
-  "do_normalize": true,
-  "do_rescale": true,
-  "do_resize": true,
-  "feature_extractor_type": "CLIPFeatureExtractor",
-  "image_mean": [
-    0.48145466,
-    0.4578275,
-    0.40821073
-  ],
-  "image_processor_type": "CLIPImageProcessor",
-  "image_std": [
-    0.26862954,
-    0.26130258,
-    0.27577711
-  ],
-  "resample": 3,
-  "rescale_factor": 0.00392156862745098,
-  "size": {
-    "shortest_edge": 224
-  }
-}
|
sd1.4-engine/scheduler/scheduler_config.json
DELETED
@@ -1,14 +0,0 @@
-{
-  "_class_name": "PNDMScheduler",
-  "_diffusers_version": "0.14.0",
-  "beta_end": 0.012,
-  "beta_schedule": "scaled_linear",
-  "beta_start": 0.00085,
-  "clip_sample": false,
-  "num_train_timesteps": 1000,
-  "prediction_type": "epsilon",
-  "set_alpha_to_one": false,
-  "skip_prk_steps": true,
-  "steps_offset": 1,
-  "trained_betas": null
-}
|
sd1.4-engine/text_encoder/config.json
DELETED
@@ -1,25 +0,0 @@
-{
-  "_name_or_path": "openai/clip-vit-large-patch14",
-  "architectures": [
-    "CLIPTextModel"
-  ],
-  "attention_dropout": 0.0,
-  "bos_token_id": 0,
-  "dropout": 0.0,
-  "eos_token_id": 2,
-  "hidden_act": "quick_gelu",
-  "hidden_size": 768,
-  "initializer_factor": 1.0,
-  "initializer_range": 0.02,
-  "intermediate_size": 3072,
-  "layer_norm_eps": 1e-05,
-  "max_position_embeddings": 77,
-  "model_type": "clip_text_model",
-  "num_attention_heads": 12,
-  "num_hidden_layers": 12,
-  "pad_token_id": 1,
-  "projection_dim": 768,
-  "torch_dtype": "float32",
-  "transformers_version": "4.25.1",
-  "vocab_size": 49408
-}
|
sd1.4-engine/tokenizer/merges.txt
DELETED
The diff for this file is too large to render.
See raw diff
|
|
sd1.4-engine/tokenizer/special_tokens_map.json
DELETED
@@ -1,24 +0,0 @@
-{
-  "bos_token": {
-    "content": "<|startoftext|>",
-    "lstrip": false,
-    "normalized": true,
-    "rstrip": false,
-    "single_word": false
-  },
-  "eos_token": {
-    "content": "<|endoftext|>",
-    "lstrip": false,
-    "normalized": true,
-    "rstrip": false,
-    "single_word": false
-  },
-  "pad_token": "<|endoftext|>",
-  "unk_token": {
-    "content": "<|endoftext|>",
-    "lstrip": false,
-    "normalized": true,
-    "rstrip": false,
-    "single_word": false
-  }
-}
|
sd1.4-engine/tokenizer/tokenizer_config.json
DELETED
@@ -1,34 +0,0 @@
-{
-  "add_prefix_space": false,
-  "bos_token": {
-    "__type": "AddedToken",
-    "content": "<|startoftext|>",
-    "lstrip": false,
-    "normalized": true,
-    "rstrip": false,
-    "single_word": false
-  },
-  "do_lower_case": true,
-  "eos_token": {
-    "__type": "AddedToken",
-    "content": "<|endoftext|>",
-    "lstrip": false,
-    "normalized": true,
-    "rstrip": false,
-    "single_word": false
-  },
-  "errors": "replace",
-  "model_max_length": 77,
-  "name_or_path": "openai/clip-vit-large-patch14",
-  "pad_token": "<|endoftext|>",
-  "special_tokens_map_file": "./special_tokens_map.json",
-  "tokenizer_class": "CLIPTokenizer",
-  "unk_token": {
-    "__type": "AddedToken",
-    "content": "<|endoftext|>",
-    "lstrip": false,
-    "normalized": true,
-    "rstrip": false,
-    "single_word": false
-  }
-}
|
sd1.4-engine/tokenizer/vocab.json
DELETED
The diff for this file is too large to render.
See raw diff
|
|
sd1.4-engine/unet_fp16.plan
DELETED
@@ -1,3 +0,0 @@
-version https://git-lfs.github.com/spec/v1
-oid sha256:0373e858a69bb44fe0f21c4990de3ae18d415b4d99ca44e4809ea48fc3482e5a
-size 1725864976
|
sd1.4-engine/vae/config.json
DELETED
@@ -1,31 +0,0 @@
-{
-  "_class_name": "AutoencoderKL",
-  "_diffusers_version": "0.14.0",
-  "_name_or_path": "stabilityai/sd-vae-ft-mse",
-  "act_fn": "silu",
-  "block_out_channels": [
-    128,
-    256,
-    512,
-    512
-  ],
-  "down_block_types": [
-    "DownEncoderBlock2D",
-    "DownEncoderBlock2D",
-    "DownEncoderBlock2D",
-    "DownEncoderBlock2D"
-  ],
-  "in_channels": 3,
-  "latent_channels": 4,
-  "layers_per_block": 2,
-  "norm_num_groups": 32,
-  "out_channels": 3,
-  "sample_size": 256,
-  "scaling_factor": 0.18215,
-  "up_block_types": [
-    "UpDecoderBlock2D",
-    "UpDecoderBlock2D",
-    "UpDecoderBlock2D",
-    "UpDecoderBlock2D"
-  ]
-}
|
sd1.4-engine/vae/diffusion_pytorch_model.bin
DELETED
@@ -1,3 +0,0 @@
-version https://git-lfs.github.com/spec/v1
-oid sha256:a4302e1efa25f3a47ceb7536bc335715ad9d1f203e90c2d25507600d74006e89
-size 334715313
|
txt2img_demo.py
ADDED
@@ -0,0 +1,44 @@
+import torch
+import time
+
+from lyrasd_model import LyraSdTxt2ImgPipeline
+
+# Path to the model files; it should contain the following structure:
+# 1. the CLIP model
+# 2. the converted, optimized unet model, placed in the unet_bins folder
+# 3. the VAE model
+# 4. the scheduler config
+
+# LyraSD's compiled C++ shared library, which contains the C++/CUDA compute details
+lib_path = "./lyrasd_model/lyrasd_lib/libth_lyrasd_cu12_sm86.so"
+model_path = "./models/lyrasd_rev_animated"
+lora_path = "./models/lyrasd_xiaorenshu_lora"
+
+# Build the Txt2Img pipeline
+model = LyraSdTxt2ImgPipeline(model_path, lib_path)
+
+# load lora
+# arguments: LoRA directory, name, LoRA strength, LoRA weight precision
+model.load_lora(lora_path, "xiaorenshu", 0.4, "fp32")
+
+# Prepare the inputs and hyperparameters
+prompt = "a cat, cute, cartoon, concise, traditional, chinese painting, Tang and Song Dynasties, masterpiece, 4k, 8k, UHD, best quality"
+negative_prompt = "(((horrible))), (((scary))), (((naked))), (((large breasts))), high saturation, colorful, human:2, body:2, low quality, bad quality, lowres, out of frame, duplicate, watermark, signature, text, frames, cut, cropped, malformed limbs, extra limbs, (((missing arms))), (((missing legs)))"
+height, width = 512, 512
+steps = 30
+guidance_scale = 7
+generator = torch.Generator().manual_seed(123)
+num_images = 1
+
+start = time.perf_counter()
+# Run inference
+images = model(prompt, height, width, steps,
+               guidance_scale, negative_prompt, num_images,
+               generator=generator)
+print("image gen cost: ", time.perf_counter() - start)
+# Save the generated images
+for i, image in enumerate(images):
+    image.save(f"outputs/res_txt2img_lora_{i}.png")
+
+# unload lora; arguments: the LoRA name and whether to clear the LoRA cache
+# model.unload_lora("xiaorenshu", True)
|