Spaces:
Running
on
Zero
Running
on
Zero
AlekseyCalvin
committed on
Update pipeline.py
Browse files- pipeline.py +102 -12
pipeline.py
CHANGED
@@ -68,17 +68,108 @@ def prepare_timesteps(
|
|
68 |
# FLUX pipeline function
|
69 |
class FluxWithCFGPipeline(DiffusionPipeline, FluxLoraLoaderMixin, FromSingleFileMixin):
|
70 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
71 |
def __init__(
|
72 |
self,
|
73 |
-
|
74 |
-
|
75 |
-
|
76 |
-
|
77 |
-
|
78 |
-
tokenizer_2
|
79 |
-
|
80 |
-
text_encoder_2 = T5EncoderModel,
|
81 |
-
text_encoder_3 = None,
|
82 |
):
|
83 |
super().__init__()
|
84 |
|
@@ -86,21 +177,20 @@ class FluxWithCFGPipeline(DiffusionPipeline, FluxLoraLoaderMixin, FromSingleFile
|
|
86 |
vae=vae,
|
87 |
text_encoder=text_encoder,
|
88 |
text_encoder_2=text_encoder_2,
|
89 |
-
text_encoder_3=text_encoder_3,
|
90 |
tokenizer=tokenizer,
|
91 |
tokenizer_2=tokenizer_2,
|
92 |
-
tokenizer_3=tokenizer_3,
|
93 |
transformer=transformer,
|
94 |
scheduler=scheduler,
|
95 |
)
|
96 |
self.vae_scale_factor = (
|
97 |
-
2 ** (len(self.vae.config.block_out_channels)
|
98 |
)
|
99 |
self.image_processor = VaeImageProcessor(vae_scale_factor=self.vae_scale_factor)
|
100 |
self.tokenizer_max_length = (
|
101 |
self.tokenizer.model_max_length if hasattr(self, "tokenizer") and self.tokenizer is not None else 77
|
102 |
)
|
103 |
self.default_sample_size = 64
|
|
|
104 |
def __call__(
|
105 |
self,
|
106 |
prompt: Union[str, List[str]] = None,
|
|
|
68 |
# FLUX pipeline function
|
69 |
class FluxWithCFGPipeline(DiffusionPipeline, FluxLoraLoaderMixin, FromSingleFileMixin):
|
70 |
|
71 |
+
r"""
|
72 |
+
The Flux pipeline for text-to-image generation.
|
73 |
+
|
74 |
+
Reference: https://blackforestlabs.ai/announcing-black-forest-labs/
|
75 |
+
|
76 |
+
Args:
|
77 |
+
transformer ([`FluxTransformer2DModel`]):
|
78 |
+
Conditional Transformer (MMDiT) architecture to denoise the encoded image latents.
|
79 |
+
scheduler ([`FlowMatchEulerDiscreteScheduler`]):
|
80 |
+
A scheduler to be used in combination with `transformer` to denoise the encoded image latents.
|
81 |
+
vae ([`AutoencoderKL`]):
|
82 |
+
Variational Auto-Encoder (VAE) Model to encode and decode images to and from latent representations.
|
83 |
+
text_encoder ([`CLIPTextModel`]):
|
84 |
+
[CLIP](https://huggingface.co/docs/transformers/model_doc/clip#transformers.CLIPTextModel), specifically
|
85 |
+
the [clip-vit-large-patch14](https://huggingface.co/openai/clip-vit-large-patch14) variant.
|
86 |
+
text_encoder_2 ([`T5EncoderModel`]):
|
87 |
+
[T5](https://huggingface.co/docs/transformers/en/model_doc/t5#transformers.T5EncoderModel), specifically
|
88 |
+
the [google/t5-v1_1-xxl](https://huggingface.co/google/t5-v1_1-xxl) variant.
|
89 |
+
tokenizer (`CLIPTokenizer`):
|
90 |
+
Tokenizer of class
|
91 |
+
[CLIPTokenizer](https://huggingface.co/docs/transformers/en/model_doc/clip#transformers.CLIPTokenizer).
|
92 |
+
tokenizer_2 (`T5TokenizerFast`):
|
93 |
+
Second Tokenizer of class
|
94 |
+
[T5TokenizerFast](https://huggingface.co/docs/transformers/en/model_doc/t5#transformers.T5TokenizerFast).
|
95 |
+
"""
|
96 |
+
|
97 |
+
# Class-level pipeline settings. Each attribute is assigned exactly once, one
# statement per line; the previous text fused two assignments on a single
# physical line, which is a SyntaxError.
# NOTE(review): these names are presumably read by the DiffusionPipeline base
# class (offload ordering, optional components, callback tensors) -- confirm
# against the installed diffusers version.
model_cpu_offload_seq = "text_encoder->text_encoder_2->transformer->vae"
_optional_components = []
_callback_tensor_inputs = ["latents", "prompt_embeds"]
|
102 |
+
|
103 |
+
def __init__(
    self,
    scheduler: FlowMatchEulerDiscreteScheduler,
    vae: AutoencoderKL,
    text_encoder: CLIPTextModel,
    tokenizer: CLIPTokenizer,
    text_encoder_2: T5EncoderModel,
    tokenizer_2: T5TokenizerFast,
    transformer: FluxTransformer2DModel,
):
    """Register the pipeline components and derive image/tokenizer settings.

    Args:
        scheduler: Scheduler used together with `transformer` to denoise the
            encoded image latents.
        vae: Auto-encoder used to encode and decode images to and from
            latent representations.
        text_encoder: CLIP text encoder.
        tokenizer: Tokenizer paired with `text_encoder`.
        text_encoder_2: T5 text encoder.
        tokenizer_2: Tokenizer paired with `text_encoder_2`.
        transformer: Conditional transformer that denoises the encoded
            image latents.
    """
    super().__init__()

    # Keyword order is kept identical to the original call.
    components = {
        "vae": vae,
        "text_encoder": text_encoder,
        "text_encoder_2": text_encoder_2,
        "tokenizer": tokenizer,
        "tokenizer_2": tokenizer_2,
        "transformer": transformer,
        "scheduler": scheduler,
    }
    self.register_modules(**components)

    # Scale factor is 2 ** (number of VAE block output channels); fall back
    # to 16 when no VAE is attached.
    if hasattr(self, "vae") and self.vae is not None:
        self.vae_scale_factor = 2 ** len(self.vae.config.block_out_channels)
    else:
        self.vae_scale_factor = 16
    self.image_processor = VaeImageProcessor(vae_scale_factor=self.vae_scale_factor)

    # Use the tokenizer's own maximum length when a tokenizer is attached,
    # otherwise fall back to 77.
    if hasattr(self, "tokenizer") and self.tokenizer is not None:
        self.tokenizer_max_length = self.tokenizer.model_max_length
    else:
        self.tokenizer_max_length = 77

    self.default_sample_size = 64
|
132 |
+
r"""
|
133 |
+
The Flux pipeline for text-to-image generation.
|
134 |
+
|
135 |
+
Reference: https://blackforestlabs.ai/announcing-black-forest-labs/
|
136 |
+
|
137 |
+
Args:
|
138 |
+
transformer ([`FluxTransformer2DModel`]):
|
139 |
+
Conditional Transformer (MMDiT) architecture to denoise the encoded image latents.
|
140 |
+
scheduler ([`FlowMatchEulerDiscreteScheduler`]):
|
141 |
+
A scheduler to be used in combination with `transformer` to denoise the encoded image latents.
|
142 |
+
vae ([`AutoencoderKL`]):
|
143 |
+
Variational Auto-Encoder (VAE) Model to encode and decode images to and from latent representations.
|
144 |
+
text_encoder ([`CLIPTextModel`]):
|
145 |
+
[CLIP](https://huggingface.co/docs/transformers/model_doc/clip#transformers.CLIPTextModel), specifically
|
146 |
+
the [clip-vit-large-patch14](https://huggingface.co/openai/clip-vit-large-patch14) variant.
|
147 |
+
text_encoder_2 ([`T5EncoderModel`]):
|
148 |
+
[T5](https://huggingface.co/docs/transformers/en/model_doc/t5#transformers.T5EncoderModel), specifically
|
149 |
+
the [google/t5-v1_1-xxl](https://huggingface.co/google/t5-v1_1-xxl) variant.
|
150 |
+
tokenizer (`CLIPTokenizer`):
|
151 |
+
Tokenizer of class
|
152 |
+
[CLIPTokenizer](https://huggingface.co/docs/transformers/en/model_doc/clip#transformers.CLIPTokenizer).
|
153 |
+
tokenizer_2 (`T5TokenizerFast`):
|
154 |
+
Second Tokenizer of class
|
155 |
+
[T5TokenizerFast](https://huggingface.co/docs/transformers/en/model_doc/t5#transformers.T5TokenizerFast).
|
156 |
+
"""
|
157 |
+
|
158 |
+
# Class-level pipeline settings. Each attribute is assigned exactly once, one
# statement per line; the previous text fused two assignments on a single
# physical line, which is a SyntaxError.
# NOTE(review): these names are presumably read by the DiffusionPipeline base
# class (offload ordering, optional components, callback tensors) -- confirm
# against the installed diffusers version.
model_cpu_offload_seq = "text_encoder->text_encoder_2->transformer->vae"
_optional_components = []
_callback_tensor_inputs = ["latents", "prompt_embeds"]
|
163 |
+
|
164 |
def __init__(
|
165 |
self,
|
166 |
+
scheduler: FlowMatchEulerDiscreteScheduler,
|
167 |
+
vae: AutoencoderKL,
|
168 |
+
text_encoder: CLIPTextModel,
|
169 |
+
tokenizer: CLIPTokenizer,
|
170 |
+
text_encoder_2: T5EncoderModel,
|
171 |
+
tokenizer_2: T5TokenizerFast,
|
172 |
+
transformer: FluxTransformer2DModel,
|
|
|
|
|
173 |
):
|
174 |
super().__init__()
|
175 |
|
|
|
177 |
vae=vae,
|
178 |
text_encoder=text_encoder,
|
179 |
text_encoder_2=text_encoder_2,
|
|
|
180 |
tokenizer=tokenizer,
|
181 |
tokenizer_2=tokenizer_2,
|
|
|
182 |
transformer=transformer,
|
183 |
scheduler=scheduler,
|
184 |
)
|
185 |
self.vae_scale_factor = (
|
186 |
+
2 ** (len(self.vae.config.block_out_channels)) if hasattr(self, "vae") and self.vae is not None else 16
|
187 |
)
|
188 |
self.image_processor = VaeImageProcessor(vae_scale_factor=self.vae_scale_factor)
|
189 |
self.tokenizer_max_length = (
|
190 |
self.tokenizer.model_max_length if hasattr(self, "tokenizer") and self.tokenizer is not None else 77
|
191 |
)
|
192 |
self.default_sample_size = 64
|
193 |
+
|
194 |
def __call__(
|
195 |
self,
|
196 |
prompt: Union[str, List[str]] = None,
|