jbilcke-hf HF Staff committed on
Commit 3cc1e25 · verified · 1 Parent(s): 3180dc0

Upload 430 files

This view is limited to 50 files because it contains too many changes. See raw diff.
Files changed (50)
  1. .gitattributes +3 -0
  2. FAQ.md +10 -0
  3. LICENSE +21 -0
  4. README.md +473 -8
  5. assets/VAE_test1.jpg +3 -0
  6. assets/glif.svg +40 -0
  7. assets/lora_ease_ui.png +3 -0
  8. build_and_push_docker +29 -0
  9. build_and_push_docker_dev +21 -0
  10. config/examples/extract.example.yml +75 -0
  11. config/examples/generate.example.yaml +60 -0
  12. config/examples/mod_lora_scale.yaml +48 -0
  13. config/examples/modal/modal_train_lora_flux_24gb.yaml +96 -0
  14. config/examples/modal/modal_train_lora_flux_schnell_24gb.yaml +98 -0
  15. config/examples/train_flex_redux.yaml +112 -0
  16. config/examples/train_full_fine_tune_flex.yaml +107 -0
  17. config/examples/train_full_fine_tune_lumina.yaml +99 -0
  18. config/examples/train_lora_chroma_24gb.yaml +104 -0
  19. config/examples/train_lora_flex2_24gb.yaml +165 -0
  20. config/examples/train_lora_flex_24gb.yaml +101 -0
  21. config/examples/train_lora_flux_24gb.yaml +96 -0
  22. config/examples/train_lora_flux_kontext_24gb.yaml +106 -0
  23. config/examples/train_lora_flux_schnell_24gb.yaml +98 -0
  24. config/examples/train_lora_hidream_48.yaml +112 -0
  25. config/examples/train_lora_lumina.yaml +96 -0
  26. config/examples/train_lora_omnigen2_24gb.yaml +94 -0
  27. config/examples/train_lora_sd35_large_24gb.yaml +97 -0
  28. config/examples/train_lora_wan21_14b_24gb.yaml +101 -0
  29. config/examples/train_lora_wan21_1b_24gb.yaml +90 -0
  30. config/examples/train_slider.example.yml +230 -0
  31. docker-compose.yml +25 -0
  32. docker/Dockerfile +83 -0
  33. docker/start.sh +70 -0
  34. extensions/example/ExampleMergeModels.py +129 -0
  35. extensions/example/__init__.py +25 -0
  36. extensions/example/config/config.example.yaml +48 -0
  37. extensions_built_in/.DS_Store +0 -0
  38. extensions_built_in/advanced_generator/Img2ImgGenerator.py +256 -0
  39. extensions_built_in/advanced_generator/PureLoraGenerator.py +102 -0
  40. extensions_built_in/advanced_generator/ReferenceGenerator.py +212 -0
  41. extensions_built_in/advanced_generator/__init__.py +59 -0
  42. extensions_built_in/advanced_generator/config/train.example.yaml +91 -0
  43. extensions_built_in/concept_replacer/ConceptReplacer.py +151 -0
  44. extensions_built_in/concept_replacer/__init__.py +26 -0
  45. extensions_built_in/concept_replacer/config/train.example.yaml +91 -0
  46. extensions_built_in/dataset_tools/DatasetTools.py +20 -0
  47. extensions_built_in/dataset_tools/SuperTagger.py +196 -0
  48. extensions_built_in/dataset_tools/SyncFromCollection.py +131 -0
  49. extensions_built_in/dataset_tools/__init__.py +43 -0
  50. extensions_built_in/dataset_tools/tools/caption.py +53 -0
.gitattributes CHANGED
@@ -33,3 +33,6 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
  *.zip filter=lfs diff=lfs merge=lfs -text
  *.zst filter=lfs diff=lfs merge=lfs -text
  *tfevents* filter=lfs diff=lfs merge=lfs -text
+ assets/lora_ease_ui.png filter=lfs diff=lfs merge=lfs -text
+ assets/VAE_test1.jpg filter=lfs diff=lfs merge=lfs -text
+ toolkit/timestep_weighing/flex_timestep_weights_plot.png filter=lfs diff=lfs merge=lfs -text
FAQ.md ADDED
@@ -0,0 +1,10 @@
# FAQ

WIP. Will continue to add things as they are needed.

## FLUX.1 Training

#### How much VRAM is required to train a lora on FLUX.1?

24GB minimum is required.
LICENSE ADDED
@@ -0,0 +1,21 @@
MIT License

Copyright (c) 2024 Ostris, LLC

Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to deal
in the Software without restriction, including without limitation the rights
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
copies of the Software, and to permit persons to whom the Software is
furnished to do so, subject to the following conditions:

The above copyright notice and this permission notice shall be included in all
copies or substantial portions of the Software.

THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
SOFTWARE.
README.md CHANGED
@@ -1,11 +1,476 @@
---
- title: Ai Toolkit
- emoji: 🚀
- colorFrom: red
- colorTo: gray
- sdk: docker
- pinned: false
- short_description: Ostris AI Toolkit running as a HF space
---

- Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference

# AI Toolkit by Ostris

AI Toolkit is an all-in-one training suite for diffusion models. I try to support all the latest image and video models on consumer-grade hardware. It can be run as a GUI or CLI, and it is designed to be easy to use while still having every feature imaginable.

## Support My Work

If you enjoy my projects or use them commercially, please consider sponsoring me. Every bit helps! 💖

[Sponsor on GitHub](https://github.com/orgs/ostris) | [Support on Patreon](https://www.patreon.com/ostris) | [Donate on PayPal](https://www.paypal.com/donate/?hosted_button_id=9GEFUKC8T9R9W)

### Current Sponsors

All of these people / organizations are the ones who selflessly make this project possible. Thank you!!

_Last updated: 2025-08-08 17:01 UTC_

+ <p align="center">
18
+ <a href="https://x.com/NuxZoe" target="_blank" rel="noopener noreferrer"><img src="https://pbs.twimg.com/profile_images/1919488160125616128/QAZXTMEj_400x400.png" alt="a16z" width="200" height="200" style="border-radius:8px;margin:5px;display: inline-block;"></a>
19
+ <a href="https://github.com/replicate" target="_blank" rel="noopener noreferrer"><img src="https://avatars.githubusercontent.com/u/60410876?v=4" alt="Replicate" width="200" height="200" style="border-radius:8px;margin:5px;display: inline-block;"></a>
20
+ <a href="https://github.com/huggingface" target="_blank" rel="noopener noreferrer"><img src="https://avatars.githubusercontent.com/u/25720743?v=4" alt="Hugging Face" width="200" height="200" style="border-radius:8px;margin:5px;display: inline-block;"></a>
21
+ <a href="https://github.com/josephrocca" target="_blank" rel="noopener noreferrer"><img src="https://avatars.githubusercontent.com/u/1167575?u=92d92921b4cb5c8c7e225663fed53c4b41897736&v=4" alt="josephrocca" width="200" height="200" style="border-radius:8px;margin:5px;display: inline-block;"></a>
22
+ <img src="https://c10.patreonusercontent.com/4/patreon-media/p/user/162524101/81a72689c3754ac5b9e38612ce5ce914/eyJ3IjoyMDB9/1.png?token-hash=JHRjAxd2XxV1aXIUijj-l65pfTnLoefYSvgNPAsw2lI%3D" alt="Prasanth Veerina" width="200" height="200" style="border-radius:8px;margin:5px;display: inline-block;">
23
+ <a href="https://github.com/weights-ai" target="_blank" rel="noopener noreferrer"><img src="https://avatars.githubusercontent.com/u/185568492?v=4" alt="Weights" width="200" height="200" style="border-radius:8px;margin:5px;display: inline-block;"></a>
24
+ </p>
25
+ <hr style="width:100%;border:none;height:2px;background:#ddd;margin:30px 0;">
26
+ <p align="center">
27
+ <img src="https://c8.patreon.com/4/200/93304/J" alt="Joseph Rocca" width="150" height="150" style="border-radius:8px;margin:5px;display: inline-block;">
28
+ <img src="https://c10.patreonusercontent.com/4/patreon-media/p/user/161471720/dd330b4036d44a5985ed5985c12a5def/eyJ3IjoyMDB9/1.jpeg?token-hash=k1f4Vv7TevzYa9tqlzAjsogYmkZs8nrXQohPCDGJGkc%3D" alt="Vladimir Sotnikov" width="150" height="150" style="border-radius:8px;margin:5px;display: inline-block;">
29
+ <img src="https://c8.patreon.com/4/200/33158543/C" alt="clement Delangue" width="150" height="150" style="border-radius:8px;margin:5px;display: inline-block;">
30
+ <img src="https://c10.patreonusercontent.com/4/patreon-media/p/user/8654302/b0f5ebedc62a47c4b56222693e1254e9/eyJ3IjoyMDB9/2.jpeg?token-hash=suI7_QjKUgWpdPuJPaIkElkTrXfItHlL8ZHLPT-w_d4%3D" alt="Misch Strotz" width="150" height="150" style="border-radius:8px;margin:5px;display: inline-block;">
31
+ <img src="https://c10.patreonusercontent.com/4/patreon-media/p/user/120239481/49b1ce70d3d24704b8ec34de24ec8f55/eyJ3IjoyMDB9/1.jpeg?token-hash=o0y1JqSXqtGvVXnxb06HMXjQXs6OII9yMMx5WyyUqT4%3D" alt="nitish PNR" width="150" height="150" style="border-radius:8px;margin:5px;display: inline-block;">
32
+ </p>
33
+ <hr style="width:100%;border:none;height:2px;background:#ddd;margin:30px 0;">
34
+ <p align="center">
35
+ <img src="https://c10.patreonusercontent.com/4/patreon-media/p/user/2298192/1228b69bd7d7481baf3103315183250d/eyJ3IjoyMDB9/1.jpg?token-hash=opN1e4r4Nnvqbtr8R9HI8eyf9m5F50CiHDOdHzb4UcA%3D" alt="Mohamed Oumoumad" width="100" height="100" style="border-radius:8px;margin:5px;display: inline-block;">
36
+ <img src="https://c8.patreon.com/4/200/548524/S" alt="Steve Hanff" width="100" height="100" style="border-radius:8px;margin:5px;display: inline-block;">
37
+ <img src="https://c10.patreonusercontent.com/4/patreon-media/p/user/152118848/3b15a43d71714552b5ed1c9f84e66adf/eyJ3IjoyMDB9/1.png?token-hash=MKf3sWHz0MFPm_OAFjdsNvxoBfN5B5l54mn1ORdlRy8%3D" alt="Kristjan Retter" width="100" height="100" style="border-radius:8px;margin:5px;display: inline-block;">
38
+ <img src="https://c8.patreon.com/4/200/83319230/M" alt="Miguel Lara" width="100" height="100" style="border-radius:8px;margin:5px;display: inline-block;">
39
+ <img src="https://c8.patreon.com/4/200/8449560/P" alt="Patron" width="100" height="100" style="border-radius:8px;margin:5px;display: inline-block;">
40
+ <a href="https://x.com/NuxZoe" target="_blank" rel="noopener noreferrer"><img src="https://pbs.twimg.com/profile_images/1916482710069014528/RDLnPRSg_400x400.jpg" alt="tungsten" width="100" height="100" style="border-radius:8px;margin:5px;display: inline-block;"></a>
41
+ <img src="https://c10.patreonusercontent.com/4/patreon-media/p/user/169502989/220069e79ce745b29237e94c22a729df/eyJ3IjoyMDB9/1.png?token-hash=E8E2JOqx66k2zMtYUw8Gy57dw-gVqA6OPpdCmWFFSFw%3D" alt="Timothy Bielec" width="100" height="100" style="border-radius:8px;margin:5px;display: inline-block;">
42
+ <img src="https://c10.patreonusercontent.com/4/patreon-media/p/user/34200989/58ae95ebda0640c8b7a91b4fa31357aa/eyJ3IjoyMDB9/1.jpeg?token-hash=4mVDM1kCYGauYa33zLG14_g0oj9_UjDK_-Qp4zk42GE%3D" alt="Noah Miller" width="100" height="100" style="border-radius:8px;margin:5px;display: inline-block;">
43
+ <img src="https://c10.patreonusercontent.com/4/patreon-media/p/user/27288932/6c35d2d961ee4e14a7a368c990791315/eyJ3IjoyMDB9/1.jpeg?token-hash=TGIto_PGEG2NEKNyqwzEnRStOkhrjb3QlMhHA3raKJY%3D" alt="David Garrido" width="100" height="100" style="border-radius:8px;margin:5px;display: inline-block;">
44
+ <a href="https://x.com/RalFingerLP" target="_blank" rel="noopener noreferrer"><img src="https://pbs.twimg.com/profile_images/919595465041162241/ZU7X3T5k_400x400.jpg" alt="RalFinger" width="100" height="100" style="border-radius:8px;margin:5px;display: inline-block;"></a>
45
+ </p>
46
+ <hr style="width:100%;border:none;height:2px;background:#ddd;margin:30px 0;">
47
+ <p align="center">
48
+ <a href="http://www.ir-ltd.net" target="_blank" rel="noopener noreferrer"><img src="https://pbs.twimg.com/profile_images/1602579392198283264/6Tm2GYus_400x400.jpg" alt="IR-Entertainment Ltd" width="60" height="60" style="border-radius:8px;margin:5px;display: inline-block;"></a>
49
+ <img src="https://c10.patreonusercontent.com/4/patreon-media/p/user/9547341/bb35d9a222fd460e862e960ba3eacbaf/eyJ3IjoyMDB9/1.jpeg?token-hash=Q2XGDvkCbiONeWNxBCTeTMOcuwTjOaJ8Z-CAf5xq3Hs%3D" alt="Travis Harrington" width="60" height="60" style="border-radius:8px;margin:5px;display: inline-block;">
50
+ <img src="https://c10.patreonusercontent.com/4/patreon-media/p/user/98811435/3a3632d1795b4c2b9f8f0270f2f6a650/eyJ3IjoyMDB9/1.jpeg?token-hash=657rzuJ0bZavMRZW3XZ-xQGqm3Vk6FkMZgFJVMCOPdk%3D" alt="EmmanuelMr18" width="60" height="60" style="border-radius:8px;margin:5px;display: inline-block;">
51
+ <img src="https://c10.patreonusercontent.com/4/patreon-media/p/user/81275465/1e4148fe9c47452b838949d02dd9a70f/eyJ3IjoyMDB9/1.jpeg?token-hash=YAX1ucxybpCIujUCXfdwzUQkttIn3c7pfi59uaFPSwM%3D" alt="Aaron Amortegui" width="60" height="60" style="border-radius:8px;margin:5px;display: inline-block;">
52
+ <img src="https://c10.patreonusercontent.com/4/patreon-media/p/user/155963250/6f8fd7075c3b4247bfeb054ba49172d6/eyJ3IjoyMDB9/1.png?token-hash=z81EHmdU2cqSrwa9vJmZTV3h0LG-z9Qakhxq34FrYT4%3D" alt="Un Defined" width="60" height="60" style="border-radius:8px;margin:5px;display: inline-block;">
53
+ <img src="https://c10.patreonusercontent.com/4/patreon-media/p/user/45562978/0de33cf52ec642ae8a2f612cddec4ca6/eyJ3IjoyMDB9/1.jpeg?token-hash=aD4debMD5ZQjqTII6s4zYSgVK2-bdQt9p3eipi0bENs%3D" alt="Jack English" width="60" height="60" style="border-radius:8px;margin:5px;display: inline-block;">
54
+ <img src="https://c8.patreon.com/4/200/27791680/J" alt="Jean-Tristan Marin" width="60" height="60" style="border-radius:8px;margin:5px;display: inline-block;">
55
+ <img src="https://c10.patreonusercontent.com/4/patreon-media/p/user/570742/4ceb33453a5a4745b430a216aba9280f/eyJ3IjoyMDB9/1.jpg?token-hash=nPcJ2zj3sloND9jvbnbYnob2vMXRnXdRuujthqDLWlU%3D" alt="Al H" width="60" height="60" style="border-radius:8px;margin:5px;display: inline-block;">
56
+ <img src="https://c10.patreonusercontent.com/4/patreon-media/p/user/82763/f99cc484361d4b9d94fe4f0814ada303/eyJ3IjoyMDB9/1.jpeg?token-hash=A3JWlBNL0b24FFWb-FCRDAyhs-OAxg-zrhfBXP_axuU%3D" alt="Doron Adler" width="60" height="60" style="border-radius:8px;margin:5px;display: inline-block;">
57
+ <img src="https://c10.patreonusercontent.com/4/patreon-media/p/user/103077711/bb215761cc004e80bd9cec7d4bcd636d/eyJ3IjoyMDB9/2.jpeg?token-hash=3U8kdZSUpnmeYIDVK4zK9TTXFpnAud_zOwBRXx18018%3D" alt="John Dopamine" width="60" height="60" style="border-radius:8px;margin:5px;display: inline-block;">
58
+ <img src="https://c10.patreonusercontent.com/4/patreon-media/p/user/99036356/7ae9c4d80e604e739b68cca12ee2ed01/eyJ3IjoyMDB9/3.png?token-hash=ZhsBMoTOZjJ-Y6h5NOmU5MT-vDb2fjK46JDlpEehkVQ%3D" alt="Noctre" width="60" height="60" style="border-radius:8px;margin:5px;display: inline-block;">
59
+ <img src="https://c10.patreonusercontent.com/4/patreon-media/p/user/141098579/1a9f0a1249d447a7a0df718a57343912/eyJ3IjoyMDB9/2.png?token-hash=_n-AQmPgY0FP9zCGTIEsr5ka4Y7YuaMkt3qL26ZqGg8%3D" alt="The Local Lab" width="60" height="60" style="border-radius:8px;margin:5px;display: inline-block;">
60
+ <img src="https://c10.patreonusercontent.com/4/patreon-media/p/user/93348210/5c650f32a0bc481d80900d2674528777/eyJ3IjoyMDB9/1.jpeg?token-hash=0jiknRw3jXqYWW6En8bNfuHgVDj4LI_rL7lSS4-_xlo%3D" alt="Armin Behjati" width="60" height="60" style="border-radius:8px;margin:5px;display: inline-block;">
61
+ <img src="https://c10.patreonusercontent.com/4/patreon-media/p/user/134129880/680c7e14cd1a4d1a9face921fb010f88/eyJ3IjoyMDB9/1.png?token-hash=5fqqHE6DCTbt7gDQL7VRcWkV71jF7FvWcLhpYl5aMXA%3D" alt="Bharat Prabhakar" width="60" height="60" style="border-radius:8px;margin:5px;display: inline-block;">
62
+ <img src="https://c8.patreon.com/4/200/70218846/C" alt="Cosmosis" width="60" height="60" style="border-radius:8px;margin:5px;display: inline-block;">
63
+ <img src="https://c10.patreonusercontent.com/4/patreon-media/p/user/30931983/54ab4e4ceab946e79a6418d205f9ed51/eyJ3IjoyMDB9/1.png?token-hash=j2phDrgd6IWuqKqNIDbq9fR2B3fMF-GUCQSdETS1w5Y%3D" alt="HestoySeghuro ." width="60" height="60" style="border-radius:8px;margin:5px;display: inline-block;">
64
+ <img src="https://c8.patreon.com/4/200/4105384/J" alt="Jack Blakely" width="60" height="60" style="border-radius:8px;margin:5px;display: inline-block;">
65
+ <img src="https://c8.patreon.com/4/200/4541423/S" alt="Sören " width="60" height="60" style="border-radius:8px;margin:5px;display: inline-block;">
66
+ <a href="https://www.youtube.com/@happyme7055" target="_blank" rel="noopener noreferrer"><img src="https://yt3.googleusercontent.com/ytc/AIdro_mFqhIRk99SoEWY2gvSvVp6u1SkCGMkRqYQ1OlBBeoOVp8=s160-c-k-c0x00ffffff-no-rj" alt="Marcus Rass" width="60" height="60" style="border-radius:8px;margin:5px;display: inline-block;"></a>
67
+ <img src="https://c8.patreon.com/4/200/53077895/M" alt="Marc" width="60" height="60" style="border-radius:8px;margin:5px;display: inline-block;">
68
+ <img src="https://c10.patreonusercontent.com/4/patreon-media/p/user/157407541/bb9d80cffdab4334ad78366060561520/eyJ3IjoyMDB9/2.png?token-hash=WYz-U_9zabhHstOT5UIa5jBaoFwrwwqyWxWEzIR2m_c%3D" alt="Tokio Studio srl IT10640050968" width="60" height="60" style="border-radius:8px;margin:5px;display: inline-block;">
69
+ <img src="https://c10.patreonusercontent.com/4/patreon-media/p/user/44568304/a9d83a0e786b41b4bdada150f7c9271c/eyJ3IjoyMDB9/1.jpeg?token-hash=FtxnwrSrknQUQKvDRv2rqPceX2EF23eLq4pNQYM_fmw%3D" alt="Albert Bukoski" width="60" height="60" style="border-radius:8px;margin:5px;display: inline-block;">
70
+ <img src="https://c8.patreon.com/4/200/5048649/B" alt="Ben Ward" width="60" height="60" style="border-radius:8px;margin:5px;display: inline-block;">
71
+ <img src="https://c10.patreonusercontent.com/4/patreon-media/p/user/111904990/08b1cf65be6a4de091c9b73b693b3468/eyJ3IjoyMDB9/1.png?token-hash=_Odz6RD3CxtubEHbUxYujcjw6zAajbo3w8TRz249VBA%3D" alt="Brian Smith" width="60" height="60" style="border-radius:8px;margin:5px;display: inline-block;">
72
+ <img src="https://c8.patreon.com/4/200/494309/J" alt="Julian Tsependa" width="60" height="60" style="border-radius:8px;margin:5px;display: inline-block;">
73
+ <img src="https://c8.patreon.com/4/200/5602036/K" alt="Kelevra" width="60" height="60" style="border-radius:8px;margin:5px;display: inline-block;">
74
+ <img src="https://c10.patreonusercontent.com/4/patreon-media/p/user/159203973/36c817f941ac4fa18103a4b8c0cb9cae/eyJ3IjoyMDB9/1.png?token-hash=zkt72HW3EoiIEAn3LSk9gJPBsXfuTVcc4rRBS3CeR8w%3D" alt="Marko jak" width="60" height="60" style="border-radius:8px;margin:5px;display: inline-block;">
75
+ <img src="https://c8.patreon.com/4/200/24653779/R" alt="RayHell" width="60" height="60" style="border-radius:8px;margin:5px;display: inline-block;">
76
+ <img src="https://c10.patreonusercontent.com/4/patreon-media/p/user/76566911/6485eaf5ec6249a7b524ee0b979372f0/eyJ3IjoyMDB9/1.jpeg?token-hash=mwCSkTelDBaengG32NkN0lVl5mRjB-cwo6-a47wnOsU%3D" alt="the biitz" width="60" height="60" style="border-radius:8px;margin:5px;display: inline-block;">
77
+ <img src="https://c10.patreonusercontent.com/4/patreon-media/p/user/32633822/1ab5612efe80417cbebfe91e871fc052/eyJ3IjoyMDB9/1.png?token-hash=pOS_IU3b3RL5-iL96A3Xqoj2bQ-dDo4RUkBylcMED_s%3D" alt="Zack Abrams" width="60" height="60" style="border-radius:8px;margin:5px;display: inline-block;">
78
+ <img src="https://c10.patreonusercontent.com/4/patreon-media/p/user/97985240/3d1d0e6905d045aba713e8132cab4a30/eyJ3IjoyMDB9/1.png?token-hash=fRavvbO_yqWKA_OsJb5DzjfKZ1Yt-TG-ihMoeVBvlcM%3D" alt="עומר מכלוף" width="60" height="60" style="border-radius:8px;margin:5px;display: inline-block;">
79
+ <a href="https://github.com/julien-blanchon" target="_blank" rel="noopener noreferrer"><img src="https://avatars.githubusercontent.com/u/11278197?v=4" alt="Blanchon" width="60" height="60" style="border-radius:8px;margin:5px;display: inline-block;"></a>
80
+ <img src="https://c10.patreonusercontent.com/4/patreon-media/p/user/11198131/e696d9647feb4318bcf16243c2425805/eyJ3IjoyMDB9/1.jpeg?token-hash=c2c2p1SaiX86iXAigvGRvzm4jDHvIFCg298A49nIfUM%3D" alt="Nicholas Agranoff" width="60" height="60" style="border-radius:8px;margin:5px;display: inline-block;">
81
+ <img src="https://c10.patreonusercontent.com/4/patreon-media/p/user/785333/bdb9ede5765d42e5a2021a86eebf0d8f/eyJ3IjoyMDB9/2.jpg?token-hash=l_rajMhxTm6wFFPn7YdoKBxeUqhdRXKdy6_8SGCuNsE%3D" alt="Sapjes " width="60" height="60" style="border-radius:8px;margin:5px;display: inline-block;">
82
+ <img src="https://c8.patreon.com/4/200/2446176/S" alt="Scott VanKirk" width="60" height="60" style="border-radius:8px;margin:5px;display: inline-block;">
83
+ <img src="https://c8.patreon.com/4/200/83034/W" alt="william tatum" width="60" height="60" style="border-radius:8px;margin:5px;display: inline-block;">
84
+ <img src="https://c10.patreonusercontent.com/4/patreon-media/p/user/138787189/2b5662dcb638466282ac758e3ac651b4/eyJ3IjoyMDB9/1.png?token-hash=zwj7MScO18vhDxhKt6s5q4gdeNJM3xCLuhSt8zlqlZs%3D" alt="Антон Антонио" width="60" height="60" style="border-radius:8px;margin:5px;display: inline-block;">
85
+ <img src="https://c8.patreon.com/4/200/30530914/T" alt="Techer " width="60" height="60" style="border-radius:8px;margin:5px;display: inline-block;">
86
+ <img src="https://c10.patreonusercontent.com/4/patreon-media/p/user/25209707/36ae876d662d4d85aaf162b6d67d31e7/eyJ3IjoyMDB9/1.png?token-hash=Zows_A6uqlY5jClhfr4Y3QfMnDKVkS3mbxNHUDkVejo%3D" alt="fjioq8" width="60" height="60" style="border-radius:8px;margin:5px;display: inline-block;">
87
+ <img src="https://c10.patreonusercontent.com/4/patreon-media/p/user/46680573/ee3d99c04a674dd5a8e1ecfb926db6a2/eyJ3IjoyMDB9/1.jpeg?token-hash=cgD4EXyfZMPnXIrcqWQ5jGqzRUfqjPafb9yWfZUPB4Q%3D" alt="Neil Murray" width="60" height="60" style="border-radius:8px;margin:5px;display: inline-block;">
88
+ <img src="https://ostris.com/wp-content/uploads/2025/08/supporter_default.jpg" alt="Joakim Sällström" width="60" height="60" style="border-radius:8px;margin:5px;display: inline-block;">
89
+ <img src="https://c8.patreon.com/4/200/63510241/A" alt="Andrew Park" width="60" height="60" style="border-radius:8px;margin:5px;display: inline-block;">
90
+ <a href="https://github.com/Spikhalskiy" target="_blank" rel="noopener noreferrer"><img src="https://avatars.githubusercontent.com/u/532108?u=2464983638afea8caf4cd9f0e4a7bc3e6a63bb0a&v=4" alt="Dmitry Spikhalsky" width="60" height="60" style="border-radius:8px;margin:5px;display: inline-block;"></a>
91
+ <img src="https://c8.patreon.com/4/200/88567307/E" alt="el Chavo" width="60" height="60" style="border-radius:8px;margin:5px;display: inline-block;">
92
+ <img src="https://c10.patreonusercontent.com/4/patreon-media/p/user/117569999/55f75c57f95343e58402529cec852b26/eyJ3IjoyMDB9/1.jpeg?token-hash=squblHZH4-eMs3gI46Uqu1oTOK9sQ-0gcsFdZcB9xQg%3D" alt="James Thompson" width="60" height="60" style="border-radius:8px;margin:5px;display: inline-block;">
93
+ <img src="https://c10.patreonusercontent.com/4/patreon-media/p/user/66157709/6fe70df085e24464995a1a9293a53760/eyJ3IjoyMDB9/1.jpeg?token-hash=eqe0wvg6JfbRUGMKpL_x3YPI5Ppf18aUUJe2EzADU-g%3D" alt="Joey Santana" width="60" height="60" style="border-radius:8px;margin:5px;display: inline-block;">
94
+ <img src="https://ostris.com/wp-content/uploads/2025/08/supporter_default.jpg" alt="Heikki Rinkinen" width="60" height="60" style="border-radius:8px;margin:5px;display: inline-block;">
95
+ <img src="https://c8.patreon.com/4/200/6175608/B" alt="Bobbie " width="60" height="60" style="border-radius:8px;margin:5px;display: inline-block;">
96
+ <a href="https://github.com/Slartibart23" target="_blank" rel="noopener noreferrer"><img src="https://avatars.githubusercontent.com/u/133593860?u=31217adb2522fb295805824ffa7e14e8f0fca6fa&v=4" alt="Slarti" width="60" height="60" style="border-radius:8px;margin:5px;display: inline-block;"></a>
97
+ <img src="https://ostris.com/wp-content/uploads/2025/08/supporter_default.jpg" alt="Tommy Falkowski" width="60" height="60" style="border-radius:8px;margin:5px;display: inline-block;">
98
+ <img src="https://c10.patreonusercontent.com/4/patreon-media/p/user/28533016/e8f6044ccfa7483f87eeaa01c894a773/eyJ3IjoyMDB9/2.png?token-hash=ak-h3JWB50hyenCavcs32AAPw6nNhmH2nBFKpdk5hvM%3D" alt="William Tatum" width="60" height="60" style="border-radius:8px;margin:5px;display: inline-block;">
99
+ <img src="https://ostris.com/wp-content/uploads/2025/08/supporter_default.jpg" alt="Karol Stępień" width="60" height="60" style="border-radius:8px;margin:5px;display: inline-block;">
100
+ <img src="https://c10.patreonusercontent.com/4/patreon-media/p/user/156564939/17dbfd45c59d4cf29853d710cb0c5d6f/eyJ3IjoyMDB9/1.png?token-hash=e6wXA_S8cgJeEDI9eJK934eB0TiM8mxJm9zW_VH0gDU%3D" alt="Hans Untch" width="60" height="60" style="border-radius:8px;margin:5px;display: inline-block;">
101
+ <img src="https://c8.patreon.com/4/200/59408413/B" alt="ByteC" width="60" height="60" style="border-radius:8px;margin:5px;display: inline-block;">
102
+ <img src="https://c10.patreonusercontent.com/4/patreon-media/p/user/3712451/432e22a355494ec0a1ea1927ff8d452e/eyJ3IjoyMDB9/7.jpeg?token-hash=OpQ9SAfVQ4Un9dSYlGTHuApZo5GlJ797Mo0DtVtMOSc%3D" alt="David Shorey" width="60" height="60" style="border-radius:8px;margin:5px;display: inline-block;">
103
+ <img src="https://c10.patreonusercontent.com/4/patreon-media/p/user/53634141/c1441f6c605344bbaef885d4272977bb/eyJ3IjoyMDB9/1.JPG?token-hash=Aizd6AxQhY3n6TBE5AwCVeSwEBbjALxQmu6xqc08qBo%3D" alt="Jana Spacelight" width="60" height="60" style="border-radius:8px;margin:5px;display: inline-block;">
104
+ <img src="https://c8.patreon.com/4/200/11180426/J" alt="jarrett towe" width="60" height="60" style="border-radius:8px;margin:5px;display: inline-block;">
105
+ <img src="https://c8.patreon.com/4/200/21828017/J" alt="Jim" width="60" height="60" style="border-radius:8px;margin:5px;display: inline-block;">
106
+ <img src="https://c10.patreonusercontent.com/4/patreon-media/p/user/63232055/2300b4ab370341b5b476902c9b8218ee/eyJ3IjoyMDB9/1.png?token-hash=R9Nb4O0aLBRwxT1cGHUMThlvf6A2MD5SO88lpZBdH7M%3D" alt="Marek P" width="60" height="60" style="border-radius:8px;margin:5px;display: inline-block;">
107
+ <img src="https://c8.patreon.com/4/200/9944625/P" alt="Pomoe " width="60" height="60" style="border-radius:8px;margin:5px;display: inline-block;">
108
+ <img src="https://c10.patreonusercontent.com/4/patreon-media/p/user/25047900/423e4cb73aba457f8f9c6e5582eddaeb/eyJ3IjoyMDB9/1.jpeg?token-hash=81RvQXBbT66usxqtyWum9Ul4oBn3qHK1cM71IvthC-U%3D" alt="Ruairi Robinson" width="60" height="60" style="border-radius:8px;margin:5px;display: inline-block;">
109
+ <img src="https://c10.patreonusercontent.com/4/patreon-media/p/user/178476551/0b9e83efcd234df5a6bea30d59e6c1cd/eyJ3IjoyMDB9/1.png?token-hash=3XoYMrMxk-K6GelM22mE-FwkjFulX9hpIL7QI3wO2jI%3D" alt="Timmy" width="60" height="60" style="border-radius:8px;margin:5px;display: inline-block;">
110
+ <img src="https://c8.patreon.com/4/200/10876902/T" alt="Tyssel" width="60" height="60" style="border-radius:8px;margin:5px;display: inline-block;">
111
+ <img src="https://ostris.com/wp-content/uploads/2025/08/supporter_default.jpg" alt="Juan Franco" width="60" height="60" style="border-radius:8px;margin:5px;display: inline-block;">
112
+ </p>

---

## Installation

Requirements:
- python >3.10
- Nvidia GPU with enough VRAM to do what you need
- python venv
- git

Linux:
```bash
git clone https://github.com/ostris/ai-toolkit.git
cd ai-toolkit
python3 -m venv venv
source venv/bin/activate
# install torch first
pip3 install --no-cache-dir torch==2.7.0 torchvision==0.22.0 torchaudio==2.7.0 --index-url https://download.pytorch.org/whl/cu126
pip3 install -r requirements.txt
```

Windows:

If you are having issues on Windows, I recommend using the easy install script at [https://github.com/Tavris1/AI-Toolkit-Easy-Install](https://github.com/Tavris1/AI-Toolkit-Easy-Install)

```bash
git clone https://github.com/ostris/ai-toolkit.git
cd ai-toolkit
python -m venv venv
.\venv\Scripts\activate
pip install --no-cache-dir torch==2.7.0 torchvision==0.22.0 torchaudio==2.7.0 --index-url https://download.pytorch.org/whl/cu126
pip install -r requirements.txt
```


# AI Toolkit UI

<img src="https://ostris.com/wp-content/uploads/2025/02/toolkit-ui.jpg" alt="AI Toolkit UI" width="100%">

The AI Toolkit UI is a web interface for the AI Toolkit. It lets you easily start, stop, and monitor jobs, and train models with a few clicks. You can also set a token for the UI to prevent unauthorized access, so it is mostly safe to run on an exposed server.

## Running the UI

Requirements:
- Node.js > 18

The UI does not need to be kept running for the jobs to run. It is only needed to start/stop/monitor jobs. The commands below
will install / update the UI and its dependencies and start the UI.

```bash
cd ui
npm run build_and_start
```

You can now access the UI at `http://localhost:8675` or `http://<your-ip>:8675` if you are running it on a server.

## Securing the UI

If you are hosting the UI on a cloud provider or any network that is not secure, I highly recommend securing it with an auth token.
You can do this by setting the environment variable `AI_TOOLKIT_AUTH` to a super secure password. This token will be required to access
the UI. You can set it when starting the UI like so:

```bash
# Linux
AI_TOOLKIT_AUTH=super_secure_password npm run build_and_start

# Windows
set AI_TOOLKIT_AUTH=super_secure_password && npm run build_and_start

# Windows PowerShell
$env:AI_TOOLKIT_AUTH="super_secure_password"; npm run build_and_start
```


## FLUX.1 Training

### Tutorial

To get started quickly, check out [@araminta_k](https://x.com/araminta_k)'s tutorial on [Finetuning Flux Dev on a 3090](https://www.youtube.com/watch?v=HzGW_Kyermg) with 24GB VRAM.


### Requirements
You currently need a GPU with **at least 24GB of VRAM** to train FLUX.1. If you are using it as the GPU that drives
your monitors, you probably need to set the flag `low_vram: true` in the config file under `model:`. This will quantize
the model on the CPU and should allow training with monitors attached. Users have gotten it to work on Windows with WSL,
but there are some reports of a bug when running on Windows natively.
I have only tested on Linux for now. This is still extremely experimental,
and a lot of quantizing and tricks had to happen to get it to fit on 24GB at all.

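As a reference, here is a minimal sketch of what that `model` section can look like, using the keys from the example configs in this commit (FLUX.1-dev is just the example base model; swap in whatever you are training):

```yaml
model:
  name_or_path: "black-forest-labs/FLUX.1-dev"
  is_flux: true
  quantize: true   # run 8bit mixed precision
  low_vram: true   # quantize on the CPU; slower, but works with monitors attached
```
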
### FLUX.1-dev

FLUX.1-dev has a non-commercial license, which means anything you train will inherit the
non-commercial license. It is also a gated model, so you need to accept the license on HF before using it.
Otherwise, this will fail. Here are the required steps to set up access:

1. Sign into HF and accept the model access here [black-forest-labs/FLUX.1-dev](https://huggingface.co/black-forest-labs/FLUX.1-dev)
2. Make a file named `.env` in the root of this folder
3. [Get a READ key from huggingface](https://huggingface.co/settings/tokens/new?) and add it to the `.env` file like so `HF_TOKEN=your_key_here`

### FLUX.1-schnell

FLUX.1-schnell is Apache 2.0. Anything trained on it can be licensed however you want, and it does not require an HF_TOKEN to train.
However, it does require a special adapter to train with it, [ostris/FLUX.1-schnell-training-adapter](https://huggingface.co/ostris/FLUX.1-schnell-training-adapter).
It is also highly experimental. For best overall quality, training on FLUX.1-dev is recommended.

To use it, you just need to add the assistant to the `model` section of your config file like so:

```yaml
model:
  name_or_path: "black-forest-labs/FLUX.1-schnell"
  assistant_lora_path: "ostris/FLUX.1-schnell-training-adapter"
  is_flux: true
  quantize: true
```

You also need to adjust your sample steps, since schnell does not require as many:

```yaml
sample:
  guidance_scale: 1 # schnell does not do guidance
  sample_steps: 4 # 1 - 4 works well
```

### Training
1. Copy the example config file located at `config/examples/train_lora_flux_24gb.yaml` (`config/examples/train_lora_flux_schnell_24gb.yaml` for schnell) to the `config` folder and rename it to `whatever_you_want.yml`
2. Edit the file following the comments in the file
3. Run the file like so `python run.py config/whatever_you_want.yml`

A folder with the name from the config file will be created inside the training folder when you start. It will have all
checkpoints and images in it. You can stop the training at any time using ctrl+c, and when you resume, it will pick back up
from the last checkpoint.

IMPORTANT: If you press ctrl+c while it is saving, it will likely corrupt that checkpoint. So wait until it is done saving.

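The checkpoints mentioned above are controlled by the `save` section of your config; a minimal sketch using the values from the example configs in this commit:

```yaml
save:
  dtype: float16            # precision to save
  save_every: 250           # write a checkpoint every this many steps
  max_step_saves_to_keep: 4 # how many intermittent saves to keep
```
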
### Need help?

Please do not open a bug report unless it is a bug in the code. You are welcome to [Join my Discord](https://discord.gg/VXmU2f5WEU)
and ask for help there. However, please refrain from PMing me directly with general questions or support requests. Ask in the Discord
and I will answer when I can.

## Gradio UI

To get started training locally with a custom UI, once you have followed the steps above and `ai-toolkit` is installed:

```bash
cd ai-toolkit # in case you are not yet in the ai-toolkit folder
huggingface-cli login # provide a `write` token to publish your LoRA at the end
python flux_train_ui.py
```

You will instantiate a UI that will let you upload your images, caption them, train, and publish your LoRA
![image](assets/lora_ease_ui.png)


## Training in RunPod
Example RunPod template: **runpod/pytorch:2.2.0-py3.10-cuda12.1.1-devel-ubuntu22.04**
> You need a minimum of 24GB VRAM; pick a GPU by your preference.

#### Example config ($0.5/hr):
- 1x A40 (48 GB VRAM)
- 19 vCPU 100 GB RAM

#### Custom overrides (you need some storage to clone FLUX.1, store datasets, store trained models and samples):
- ~120 GB Disk
- ~120 GB Pod Volume
- Start Jupyter Notebook

### 1. Setup
```
git clone https://github.com/ostris/ai-toolkit.git
cd ai-toolkit
git submodule update --init --recursive
python -m venv venv
source venv/bin/activate
pip install torch
pip install -r requirements.txt
pip install --upgrade accelerate transformers diffusers huggingface_hub # optional, run it if you run into issues
```
### 2. Upload your dataset
- Create a new folder in the root, name it `dataset` or whatever you like.
- Drag and drop your .jpg, .jpeg, or .png images and .txt files inside the newly created dataset folder.

### 3. Log in to Hugging Face with an Access Token
- Get a READ token from [here](https://huggingface.co/settings/tokens) and request access to the Flux.1-dev model from [here](https://huggingface.co/black-forest-labs/FLUX.1-dev).
- Run ```huggingface-cli login``` and paste your token.

### 4. Training
- Copy an example config file located at ```config/examples``` to the config folder and rename it to ```whatever_you_want.yml```.
- Edit the config following the comments in the file.
- Change ```folder_path: "/path/to/images/folder"``` to your dataset path like ```folder_path: "/workspace/ai-toolkit/your-dataset"```.
- Run the file: ```python run.py config/whatever_you_want.yml```.

### Screenshot from RunPod
<img width="1728" alt="RunPod Training Screenshot" src="https://github.com/user-attachments/assets/53a1b8ef-92fa-4481-81a7-bde45a14a7b5">

## Training in Modal

### 1. Setup
#### ai-toolkit:
```
git clone https://github.com/ostris/ai-toolkit.git
cd ai-toolkit
git submodule update --init --recursive
python -m venv venv
source venv/bin/activate
pip install torch
pip install -r requirements.txt
pip install --upgrade accelerate transformers diffusers huggingface_hub # optional, run it if you run into issues
```
#### Modal:
- Run `pip install modal` to install the modal Python package.
- Run `modal setup` to authenticate (if this doesn't work, try `python -m modal setup`).

#### Hugging Face:
- Get a READ token from [here](https://huggingface.co/settings/tokens) and request access to the Flux.1-dev model from [here](https://huggingface.co/black-forest-labs/FLUX.1-dev).
- Run `huggingface-cli login` and paste your token.

### 2. Upload your dataset
- Drag and drop your dataset folder containing the .jpg, .jpeg, or .png images and .txt files into `ai-toolkit`.

### 3. Configs
- Copy an example config file located at ```config/examples/modal``` to the `config` folder and rename it to ```whatever_you_want.yml```.
- Edit the config following the comments in the file, **<ins>be careful and follow the example `/root/ai-toolkit` paths</ins>** (see the path sketch below).

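For reference, this is roughly how those paths look in `config/examples/modal/modal_train_lora_flux_24gb.yaml` from this commit (the dataset folder name is a placeholder):

```yaml
config:
  process:
    - type: 'sd_trainer'
      training_folder: "/root/ai-toolkit/modal_output"  # must match MOUNT_DIR from run_modal.py
      datasets:
        - folder_path: "/root/ai-toolkit/your-dataset"  # dataset lives inside ai-toolkit; /root is for Modal
```
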
### 4. Edit run_modal.py
- Set your entire local `ai-toolkit` path at `code_mount = modal.Mount.from_local_dir` like:

```
code_mount = modal.Mount.from_local_dir("/Users/username/ai-toolkit", remote_path="/root/ai-toolkit")
```
- Choose a `GPU` and `Timeout` in `@app.function` _(default is A100 40GB and 2 hour timeout)_.

### 5. Training
- Run the config file in your terminal: `modal run run_modal.py --config-file-list-str=/root/ai-toolkit/config/whatever_you_want.yml`.
- You can monitor your training in your local terminal, or on [modal.com](https://modal.com/).
- Models, samples and the optimizer will be stored in `Storage > flux-lora-models`.

### 6. Saving the model
- Check the contents of the volume by running `modal volume ls flux-lora-models`.
- Download the content by running `modal volume get flux-lora-models your-model-name`.
- Example: `modal volume get flux-lora-models my_first_flux_lora_v1`.

### Screenshot from Modal

<img width="1728" alt="Modal Training Screenshot" src="https://github.com/user-attachments/assets/7497eb38-0090-49d6-8ad9-9c8ea7b5388b">

---

## Dataset Preparation

Datasets generally need to be a folder containing images and associated text files. Currently, the only supported
formats are jpg, jpeg, and png. Webp currently has issues. The text files should be named the same as the images
but with a `.txt` extension, for example `image2.jpg` and `image2.txt`. The text file should contain only the caption.
You can add the word `[trigger]` in the caption file, and if you have `trigger_word` in your config, it will be automatically
replaced. A minimal dataset config is sketched below.

Images are never upscaled, but they are downscaled and placed in buckets for batching. **You do not need to crop/resize your images**.
The loader will automatically resize them and can handle varying aspect ratios.

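For reference, here is a minimal sketch of how a dataset and trigger word are wired into a training config, using the keys from the example configs in this commit (the folder path and trigger word are placeholders):

```yaml
trigger_word: "p3r5on"             # optional; replaces [trigger] in captions
datasets:
  - folder_path: "/path/to/images/folder"
    caption_ext: "txt"             # image2.jpg pairs with image2.txt
    caption_dropout_rate: 0.05     # drop the caption 5% of the time
    resolution: [ 512, 768, 1024 ] # images are bucketed into these resolutions
```
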
## Training Specific Layers

To train specific layers with LoRA, you can use the `only_if_contains` network kwargs. For instance, if you want to train only the 2 layers
used by The Last Ben, [mentioned in this post](https://x.com/__TheBen/status/1829554120270987740), you can adjust your
network kwargs like so:

```yaml
network:
  type: "lora"
  linear: 128
  linear_alpha: 128
  network_kwargs:
    only_if_contains:
      - "transformer.single_transformer_blocks.7.proj_out"
      - "transformer.single_transformer_blocks.20.proj_out"
```

The naming conventions of the layers are in diffusers format, so checking the state dict of a model will reveal
the suffix of the name of the layers you want to train. You can also use this method to only train specific groups of weights.
For instance, to only train the `single_transformer` for FLUX.1, you can use the following:

```yaml
network:
  type: "lora"
  linear: 128
  linear_alpha: 128
  network_kwargs:
    only_if_contains:
      - "transformer.single_transformer_blocks."
```

You can also exclude layers by their names by using the `ignore_if_contains` network kwarg. So to exclude all the single transformer blocks:

```yaml
network:
  type: "lora"
  linear: 128
  linear_alpha: 128
  network_kwargs:
    ignore_if_contains:
      - "transformer.single_transformer_blocks."
```

`ignore_if_contains` takes priority over `only_if_contains`. So if a weight is covered by both,
it will be ignored.

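To make that priority rule concrete, here is a hypothetical sketch that covers a layer with both keys; because `ignore_if_contains` wins, block 7's `proj_out` would not be trained:

```yaml
network:
  type: "lora"
  linear: 128
  linear_alpha: 128
  network_kwargs:
    only_if_contains:
      - "transformer.single_transformer_blocks."           # would normally include block 7
    ignore_if_contains:
      - "transformer.single_transformer_blocks.7.proj_out" # excluded anyway; ignore takes priority
```
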
## LoKr Training

To learn more about LoKr, read [KohakuBlueleaf/LyCORIS](https://github.com/KohakuBlueleaf/LyCORIS/blob/main/docs/Guidelines.md). To train a LoKr model, you can adjust the network type in the config file like so:

```yaml
network:
  type: "lokr"
  lokr_full_rank: true
  lokr_factor: 8
```

Everything else should work the same, including layer targeting (see the sketch below).

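For example, a sketch (not an official example config) that combines LoKr with the layer targeting from the previous section:

```yaml
network:
  type: "lokr"
  lokr_full_rank: true
  lokr_factor: 8
  network_kwargs:
    only_if_contains:
      - "transformer.single_transformer_blocks."
```
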
## Updates

Only larger updates are listed here. There are usually smaller daily updates that are omitted.

### Jul 17, 2025
- Make it easy to add control images to the samples in the UI

### Jul 11, 2025
- Added better video config settings to the UI for video models.
- Added Wan I2V training to the UI

### June 29, 2025
- Fixed issue where Kontext forced sizes on sampling

### June 26, 2025
- Added support for FLUX.1 Kontext training
- Added support for instruction dataset training

### June 25, 2025
- Added support for OmniGen2 training

### June 17, 2025
- Performance optimizations for batch preparation
- Added some docs via a popup for items in the simple UI explaining what settings do. Still a WIP

### June 16, 2025
- Hide control images in the UI when viewing datasets
- WIP on mean flow loss

### June 12, 2025
- Fixed issue that resulted in blank captions in the dataloader

### June 10, 2025
- Decided to keep track of updates in the readme
- Added support for SDXL in the UI
- Added support for SD 1.5 in the UI
- Fixed UI Wan 2.1 14b name bug
- Added support for conv training in the UI for models that support it

assets/VAE_test1.jpg ADDED

Git LFS Details

  • SHA256: 879fcb537d039408d7aada297b7397420132684f0106edacc1205fb5cc839476
  • Pointer size: 132 Bytes
  • Size of remote file: 1.51 MB
assets/glif.svg ADDED
assets/lora_ease_ui.png ADDED

Git LFS Details

  • SHA256: f647b9fe90cc96db2aa84d1cb25a73b60ffcc5394822f99e9dac27d373f89d79
  • Pointer size: 131 Bytes
  • Size of remote file: 349 kB
build_and_push_docker ADDED
@@ -0,0 +1,29 @@
#!/usr/bin/env bash

# Extract version from version.py
if [ -f "version.py" ]; then
    VERSION=$(python3 -c "from version import VERSION; print(VERSION)")
    echo "Building version: $VERSION"
else
    echo "Error: version.py not found. Please create a version.py file with VERSION defined."
    exit 1
fi

echo "Docker builds from the repo, not this dir. Make sure changes are pushed to the repo."
echo "Building version: $VERSION and latest"
# wait 2 seconds
sleep 2

# Build the image with cache busting
docker build --build-arg CACHEBUST=$(date +%s) -t aitoolkit:$VERSION -f docker/Dockerfile .

# Tag with version and latest
docker tag aitoolkit:$VERSION ostris/aitoolkit:$VERSION
docker tag aitoolkit:$VERSION ostris/aitoolkit:latest

# Push both tags
echo "Pushing images to Docker Hub..."
docker push ostris/aitoolkit:$VERSION
docker push ostris/aitoolkit:latest

echo "Successfully built and pushed ostris/aitoolkit:$VERSION and ostris/aitoolkit:latest"
build_and_push_docker_dev ADDED
@@ -0,0 +1,21 @@
#!/usr/bin/env bash

VERSION=dev
GIT_COMMIT=dev

echo "Docker builds from the repo, not this dir. Make sure changes are pushed to the repo."
echo "Building version: $VERSION"
# wait 2 seconds
sleep 2

# Build the image with cache busting
docker build --build-arg CACHEBUST=$(date +%s) -t aitoolkit:$VERSION -f docker/Dockerfile .

# Tag with the dev version
docker tag aitoolkit:$VERSION ostris/aitoolkit:$VERSION

# Push the tag
echo "Pushing image to Docker Hub..."
docker push ostris/aitoolkit:$VERSION

echo "Successfully built and pushed ostris/aitoolkit:$VERSION"
config/examples/extract.example.yml ADDED
@@ -0,0 +1,75 @@
---
# this is in yaml format. You can use json if you prefer
# I like both but yaml is easier to read and write
# plus it has comments which is nice for documentation
job: extract # tells the runner what to do
config:
  # the name will be used to create a folder in the output folder
  # it will also replace any [name] token in the rest of this config
  name: name_of_your_model
  # can be a hugging face model, a .ckpt, or a .safetensors
  base_model: "/path/to/base/model.safetensors"
  # can be a hugging face model, a .ckpt, or a .safetensors
  extract_model: "/path/to/model/to/extract/trained.safetensors"
  # we will create a folder here with the name above. This will create /path/to/output/folder/name_of_your_model
  output_folder: "/path/to/output/folder"
  is_v2: false
  dtype: fp16 # saved dtype
  device: cpu # cpu, cuda:0, etc

  # processes can be chained like this to run multiple in a row
  # they must all use the same models above, but this is great for testing different
  # sizes and types of extractions. It is much faster as we already have the models loaded
  process:
    # process 1
    - type: locon # locon or lora (locon is lycoris)
      filename: "[name]_64_32.safetensors" # will be put in output folder
      dtype: fp16
      mode: fixed
      linear: 64
      conv: 32

    # process 2
    - type: locon
      output_path: "/absolute/path/for/this/output.safetensors" # can be absolute
      mode: ratio
      linear: 0.2
      conv: 0.2

    # process 3
    - type: locon
      filename: "[name]_ratio_02.safetensors"
      mode: quantile
      linear: 0.5
      conv: 0.5

    # process 4
    - type: lora # traditional lora extraction (lierla) with linear layers only
      filename: "[name]_4.safetensors"
      mode: fixed # fixed, ratio, quantile supported for lora as well
      linear: 4 # lora dim or rank
      # no conv for lora

    # process 5
    - type: lora
      filename: "[name]_q05.safetensors"
      mode: quantile
      linear: 0.5

# you can put any information you want here, and it will be saved in the model
# the below is an example. I recommend doing trigger words at a minimum
# in the metadata. The software will include this plus some other information
meta:
  name: "[name]" # [name] gets replaced with the name above
  description: A short description of your model
  trigger_words:
    - put
    - trigger
    - words
    - here
  version: '0.1'
  creator:
    name: Your Name
    website: https://yourwebsite.com
  any: All meta data above is arbitrary, it can be whatever you want.
config/examples/generate.example.yaml ADDED
@@ -0,0 +1,60 @@
---

job: generate # tells the runner what to do
config:
  name: "generate" # this is not really used anywhere currently but required by runner
  process:
    # process 1
    - type: to_folder # process images to a folder
      output_folder: "output/gen"
      device: cuda:0 # cpu, cuda:0, etc
      generate:
        # these are your defaults; you can override most of them with flags
        sampler: "ddpm" # ignored for now, will add later though ddpm is used regardless for now
        width: 1024
        height: 1024
        neg: "cartoon, fake, drawing, illustration, cgi, animated, anime"
        seed: -1 # -1 is random
        guidance_scale: 7
        sample_steps: 20
        ext: ".png" # .png, .jpg, .jpeg, .webp

        # here are the flags you can use for prompts. Always start with
        # your prompt first then add these flags after. You can use as many as you
        # like, e.g.
        # photo of a baseball --n painting, ugly --w 1024 --h 1024 --seed 42 --cfg 7 --steps 20
        # we will try to support all sd-scripts flags where we can

        # FROM SD-SCRIPTS
        # --n Treat everything until the next option as a negative prompt.
        # --w Specify the width of the generated image.
        # --h Specify the height of the generated image.
        # --d Specify the seed for the generated image.
        # --l Specify the CFG scale for the generated image.
        # --s Specify the number of steps during generation.

        # OURS and some QOL additions
        # --p2 Prompt for the second text encoder (SDXL only)
        # --n2 Negative prompt for the second text encoder (SDXL only)
        # --gr Specify the guidance rescale for the generated image (SDXL only)
        # --seed Specify the seed for the generated image, same as --d
        # --cfg Specify the CFG scale for the generated image, same as --l
        # --steps Specify the number of steps during generation, same as --s

        prompt_file: false # if true a txt file will be created next to images with prompt strings used
        # prompts can also be a path to a text file with one prompt per line
        # prompts: "/path/to/prompts.txt"
        prompts:
          - "photo of batman"
          - "photo of superman"
          - "photo of spiderman"
          - "photo of a superhero --n batman superman spiderman"

      model:
        # huggingface name, relative path from the project, or absolute path to .safetensors or .ckpt
        # name_or_path: "runwayml/stable-diffusion-v1-5"
        name_or_path: "/mnt/Models/stable-diffusion/models/stable-diffusion/Ostris/Ostris_Real_v1.safetensors"
        is_v2: false # for v2 models
        is_v_pred: false # for v-prediction models (most v2 models)
        is_xl: false # for SDXL models
        dtype: bf16
config/examples/mod_lora_scale.yaml ADDED
@@ -0,0 +1,48 @@
---
job: mod
config:
  name: name_of_your_model_v1
  process:
    - type: rescale_lora
      # path to your current lora model
      input_path: "/path/to/lora/lora.safetensors"
      # output path for your new lora model, can be the same as input_path to replace
      output_path: "/path/to/lora/output_lora_v1.safetensors"
      # replaces meta with the meta below (plus minimum meta fields)
      # if false, we will leave the meta alone except for updating hashes (sd-script hashes)
      replace_meta: true
      # how to adjust, we can scale the up_down weights or the alpha
      # up_down is the default and probably the best, they will both net the same outputs
      # would only affect rare NaN cases and maybe merging with old merge tools
      scale_target: 'up_down'
      # precision to save, fp16 is the default and standard
      save_dtype: fp16
      # current_weight is the ideal weight you use as a multiplier when using the lora
      # IE in automatic1111 <lora:my_lora:6.0> the 6.0 is the current_weight
      # you can do negatives here too if you want to flip the lora
      current_weight: 6.0
      # target_weight is the ideal weight you want to use as a multiplier when using the lora
      # instead of the one above. IE in automatic1111 instead of using <lora:my_lora:6.0>
      # we want to use <lora:my_lora:1.0> so 1.0 is the target_weight
      target_weight: 1.0

      # base model for the lora
      # this is just used to add meta so automatic1111 knows which model it is for
      # assume v1.5 if these are not set
      is_xl: false
      is_v2: false
meta:
  # this is only used if you set replace_meta to true above
  name: "[name]" # [name] gets replaced with the name above
  description: A short description of your lora
  trigger_words:
    - put
    - trigger
    - words
    - here
  version: '0.1'
  creator:
    name: Your Name
    website: https://yourwebsite.com
  any: All meta data above is arbitrary, it can be whatever you want.
config/examples/modal/modal_train_lora_flux_24gb.yaml ADDED
@@ -0,0 +1,96 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ ---
2
+ job: extension
3
+ config:
4
+ # this name will be the folder and filename name
5
+ name: "my_first_flux_lora_v1"
6
+ process:
7
+ - type: 'sd_trainer'
8
+ # root folder to save training sessions/samples/weights
9
+ training_folder: "/root/ai-toolkit/modal_output" # must match MOUNT_DIR from run_modal.py
10
+ # uncomment to see performance stats in the terminal every N steps
11
+ # performance_log_every: 1000
12
+ device: cuda:0
13
+ # if a trigger word is specified, it will be added to captions of training data if it does not already exist
14
+ # alternatively, in your captions you can add [trigger] and it will be replaced with the trigger word
15
+ # trigger_word: "p3r5on"
16
+ network:
17
+ type: "lora"
18
+ linear: 16
19
+ linear_alpha: 16
20
+ save:
21
+ dtype: float16 # precision to save
22
+ save_every: 250 # save every this many steps
23
+ max_step_saves_to_keep: 4 # how many intermittent saves to keep
24
+ datasets:
25
+ # datasets are a folder of images. captions need to be txt files with the same name as the image
26
+ # for instance image2.jpg and image2.txt. Only jpg, jpeg, and png are supported currently
27
+ # images will automatically be resized and bucketed into the resolution specified
28
+ # on windows, escape back slashes with another backslash so
29
+ # "C:\\path\\to\\images\\folder"
30
+ # your dataset must be placed in /ai-toolkit and /root is for modal to find the dir:
31
+ - folder_path: "/root/ai-toolkit/your-dataset"
32
+ caption_ext: "txt"
33
+ caption_dropout_rate: 0.05 # will drop out the caption 5% of time
34
+ shuffle_tokens: false # shuffle caption order, split by commas
35
+ cache_latents_to_disk: true # leave this true unless you know what you're doing
36
+ resolution: [ 512, 768, 1024 ] # flux enjoys multiple resolutions
37
+ train:
38
+ batch_size: 1
39
+ steps: 2000 # total number of steps to train 500 - 4000 is a good range
40
+ gradient_accumulation_steps: 1
41
+ train_unet: true
42
+ train_text_encoder: false # probably won't work with flux
43
+ gradient_checkpointing: true # need this on unless you have a ton of vram
44
+ noise_scheduler: "flowmatch" # for training only
45
+ optimizer: "adamw8bit"
46
+ lr: 1e-4
47
+ # uncomment this to skip the pre training sample
48
+ # skip_first_sample: true
49
+ # uncomment to completely disable sampling
50
+ # disable_sampling: true
51
+ # uncomment to use new bell curved weighting. Experimental but may produce better results
52
+ # linear_timesteps: true
53
+
54
+ # ema will smooth out learning, but could slow it down. Recommended to leave on.
55
+ ema_config:
56
+ use_ema: true
57
+ ema_decay: 0.99
58
+
59
+ # will probably need this if gpu supports it for flux, other dtypes may not work correctly
60
+ dtype: bf16
61
+ model:
62
+ # huggingface model name or path
63
+ # if you get an error, or get stuck while downloading,
64
+ # check https://github.com/ostris/ai-toolkit/issues/84, download the model locally and
65
+ # place it like "/root/ai-toolkit/FLUX.1-dev"
66
+ name_or_path: "black-forest-labs/FLUX.1-dev"
67
+ is_flux: true
68
+ quantize: true # run 8bit mixed precision
69
+ # low_vram: true # uncomment this if the GPU is connected to your monitors. It will use less vram to quantize, but is slower.
70
+ sample:
71
+ sampler: "flowmatch" # must match train.noise_scheduler
72
+ sample_every: 250 # sample every this many steps
73
+ width: 1024
74
+ height: 1024
75
+ prompts:
76
+ # you can add [trigger] to the prompts here and it will be replaced with the trigger word
77
+ # - "[trigger] holding a sign that says 'I LOVE PROMPTS!'"
78
+ - "woman with red hair, playing chess at the park, bomb going off in the background"
79
+ - "a woman holding a coffee cup, in a beanie, sitting at a cafe"
80
+ - "a horse is a DJ at a night club, fish eye lens, smoke machine, lazer lights, holding a martini"
81
+ - "a man showing off his cool new t shirt at the beach, a shark is jumping out of the water in the background"
82
+ - "a bear building a log cabin in the snow covered mountains"
83
+ - "woman playing the guitar, on stage, singing a song, laser lights, punk rocker"
84
+ - "hipster man with a beard, building a chair, in a wood shop"
85
+ - "photo of a man, white background, medium shot, modeling clothing, studio lighting, white backdrop"
86
+ - "a man holding a sign that says, 'this is a sign'"
87
+ - "a bulldog, in a post apocalyptic world, with a shotgun, in a leather jacket, in a desert, with a motorcycle"
88
+ neg: "" # not used on flux
89
+ seed: 42
90
+ walk_seed: true
91
+ guidance_scale: 4
92
+ sample_steps: 20
93
+ # you can add any additional meta info here. [name] is replaced with config name at top
94
+ meta:
95
+ name: "[name]"
96
+ version: '1.0'
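Every example in this commit uses the dataset layout described in the comments above: a flat folder of jpg/jpeg/png images, each paired with a .txt caption of the same name. A small sanity-check sketch for that layout; the helper below is illustrative, not part of the toolkit.

```python
from pathlib import Path

IMAGE_EXTS = {".jpg", ".jpeg", ".png"}  # the only formats the comments list as supported

def check_dataset(folder: str) -> None:
    """Report images that are missing a same-named .txt caption file."""
    images = [p for p in Path(folder).iterdir() if p.suffix.lower() in IMAGE_EXTS]
    missing = [p.name for p in images if not p.with_suffix(".txt").exists()]
    print(f"{len(images)} images, {len(missing)} missing captions")
    for name in missing:
        print("  no caption for:", name)

check_dataset("/root/ai-toolkit/your-dataset")
```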
config/examples/modal/modal_train_lora_flux_schnell_24gb.yaml ADDED
@@ -0,0 +1,98 @@
1
+ ---
2
+ job: extension
3
+ config:
4
+ # this name will be the folder and filename name
5
+ name: "my_first_flux_lora_v1"
6
+ process:
7
+ - type: 'sd_trainer'
8
+ # root folder to save training sessions/samples/weights
9
+ training_folder: "/root/ai-toolkit/modal_output" # must match MOUNT_DIR from run_modal.py
10
+ # uncomment to see performance stats in the terminal every N steps
11
+ # performance_log_every: 1000
12
+ device: cuda:0
13
+ # if a trigger word is specified, it will be added to captions of training data if it does not already exist
14
+ # alternatively, in your captions you can add [trigger] and it will be replaced with the trigger word
15
+ # trigger_word: "p3r5on"
16
+ network:
17
+ type: "lora"
18
+ linear: 16
19
+ linear_alpha: 16
20
+ save:
21
+ dtype: float16 # precision to save
22
+ save_every: 250 # save every this many steps
23
+ max_step_saves_to_keep: 4 # how many intermittent saves to keep
24
+ datasets:
25
+ # datasets are a folder of images. captions need to be txt files with the same name as the image
26
+ # for instance image2.jpg and image2.txt. Only jpg, jpeg, and png are supported currently
27
+ # images will automatically be resized and bucketed into the resolution specified
28
+ # on windows, escape back slashes with another backslash so
29
+ # "C:\\path\\to\\images\\folder"
30
+ # your dataset must be placed in /ai-toolkit; the /root prefix is where modal mounts it so it can find the dir:
31
+ - folder_path: "/root/ai-toolkit/your-dataset"
32
+ caption_ext: "txt"
33
+ caption_dropout_rate: 0.05 # will drop out the caption 5% of time
34
+ shuffle_tokens: false # shuffle caption order, split by commas
35
+ cache_latents_to_disk: true # leave this true unless you know what you're doing
36
+ resolution: [ 512, 768, 1024 ] # flux enjoys multiple resolutions
37
+ train:
38
+ batch_size: 1
39
+ steps: 2000 # total number of steps to train 500 - 4000 is a good range
40
+ gradient_accumulation_steps: 1
41
+ train_unet: true
42
+ train_text_encoder: false # probably won't work with flux
43
+ gradient_checkpointing: true # need this on unless you have a ton of vram
44
+ noise_scheduler: "flowmatch" # for training only
45
+ optimizer: "adamw8bit"
46
+ lr: 1e-4
47
+ # uncomment this to skip the pre training sample
48
+ # skip_first_sample: true
49
+ # uncomment to completely disable sampling
50
+ # disable_sampling: true
51
+ # uncomment to use new bell curved weighting. Experimental but may produce better results
52
+ # linear_timesteps: true
53
+
54
+ # ema will smooth out learning, but could slow it down. Recommended to leave on.
55
+ ema_config:
56
+ use_ema: true
57
+ ema_decay: 0.99
58
+
59
+ # will probably need this if gpu supports it for flux, other dtypes may not work correctly
60
+ dtype: bf16
61
+ model:
62
+ # huggingface model name or path
63
+ # if you get an error, or get stuck while downloading,
64
+ # check https://github.com/ostris/ai-toolkit/issues/84, download the models locally and
65
+ # place them like "/root/ai-toolkit/FLUX.1-schnell" and "/root/ai-toolkit/FLUX.1-schnell-training-adapter"
66
+ name_or_path: "black-forest-labs/FLUX.1-schnell"
67
+ assistant_lora_path: "ostris/FLUX.1-schnell-training-adapter" # Required for flux schnell training
68
+ is_flux: true
69
+ quantize: true # run 8bit mixed precision
70
+ # low_vram is painfully slow to fuse in the adapter; avoid it unless absolutely necessary
71
+ # low_vram: true # uncomment this if the GPU is connected to your monitors. It will use less vram to quantize, but is slower.
72
+ sample:
73
+ sampler: "flowmatch" # must match train.noise_scheduler
74
+ sample_every: 250 # sample every this many steps
75
+ width: 1024
76
+ height: 1024
77
+ prompts:
78
+ # you can add [trigger] to the prompts here and it will be replaced with the trigger word
79
+ # - "[trigger] holding a sign that says 'I LOVE PROMPTS!'"
80
+ - "woman with red hair, playing chess at the park, bomb going off in the background"
81
+ - "a woman holding a coffee cup, in a beanie, sitting at a cafe"
82
+ - "a horse is a DJ at a night club, fish eye lens, smoke machine, lazer lights, holding a martini"
83
+ - "a man showing off his cool new t shirt at the beach, a shark is jumping out of the water in the background"
84
+ - "a bear building a log cabin in the snow covered mountains"
85
+ - "woman playing the guitar, on stage, singing a song, laser lights, punk rocker"
86
+ - "hipster man with a beard, building a chair, in a wood shop"
87
+ - "photo of a man, white background, medium shot, modeling clothing, studio lighting, white backdrop"
88
+ - "a man holding a sign that says, 'this is a sign'"
89
+ - "a bulldog, in a post apocalyptic world, with a shotgun, in a leather jacket, in a desert, with a motorcycle"
90
+ neg: "" # not used on flux
91
+ seed: 42
92
+ walk_seed: true
93
+ guidance_scale: 1 # schnell does not do guidance
94
+ sample_steps: 4 # 1 - 4 works well
95
+ # you can add any additional meta info here. [name] is replaced with config name at top
96
+ meta:
97
+ name: "[name]"
98
+ version: '1.0'
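Both Modal configs describe the same caption handling: an optional trigger_word is added to captions that do not already contain it, a literal [trigger] token is replaced with it, and caption_dropout_rate blanks the caption a fraction of the time. A rough sketch of those rules as described; where the trigger word lands when appended is an assumption, and the trainer's own code may differ.

```python
import random

def prepare_caption(caption: str, trigger_word: str | None = None, dropout_rate: float = 0.05) -> str:
    if random.random() < dropout_rate:
        return ""                                    # caption dropped 5% of the time by default
    if trigger_word:
        if "[trigger]" in caption:
            caption = caption.replace("[trigger]", trigger_word)
        elif trigger_word not in caption:
            caption = f"{trigger_word}, {caption}"   # assumed placement when appending
    return caption

print(prepare_caption("[trigger] sitting at a cafe in a beanie", trigger_word="p3r5on"))
```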
config/examples/train_flex_redux.yaml ADDED
@@ -0,0 +1,112 @@
1
+ ---
2
+ job: extension
3
+ config:
4
+ # this name will be the folder and filename name
5
+ name: "my_first_flex_redux_finetune_v1"
6
+ process:
7
+ - type: 'sd_trainer'
8
+ # root folder to save training sessions/samples/weights
9
+ training_folder: "output"
10
+ # uncomment to see performance stats in the terminal every N steps
11
+ # performance_log_every: 1000
12
+ device: cuda:0
13
+ adapter:
14
+ type: "redux"
15
+ # you can finetune an existing adapter or start from scratch. Set to null to start from scratch
16
+ name_or_path: '/local/path/to/redux_adapter_to_finetune.safetensors'
17
+ # name_or_path: null
18
+ # image_encoder_path: 'google/siglip-so400m-patch14-384' # Flux.1 redux adapter
19
+ image_encoder_path: 'google/siglip2-so400m-patch16-512' # Flex.1 512 redux adapter
20
+ # image_encoder_arch: 'siglip' # for Flux.1
21
+ image_encoder_arch: 'siglip2'
22
+ # You need a control input for each sample. It is best to use square images for both
23
+ test_img_path:
24
+ - "/path/to/x_01.jpg"
25
+ - "/path/to/x_02.jpg"
26
+ - "/path/to/x_03.jpg"
27
+ - "/path/to/x_04.jpg"
28
+ - "/path/to/x_05.jpg"
29
+ - "/path/to/x_06.jpg"
30
+ - "/path/to/x_07.jpg"
31
+ - "/path/to/x_08.jpg"
32
+ - "/path/to/x_09.jpg"
33
+ - "/path/to/x_10.jpg"
34
+ clip_layer: 'last_hidden_state'
35
+ train: true
36
+ save:
37
+ dtype: bf16 # precision to save
38
+ save_every: 250 # save every this many steps
39
+ max_step_saves_to_keep: 4
40
+ datasets:
41
+ # datasets are a folder of images. captions need to be txt files with the same name as the image
42
+ # for instance image2.jpg and image2.txt. Only jpg, jpeg, and png are supported currently
43
+ # images will automatically be resized and bucketed into the resolution specified
44
+ # on windows, escape back slashes with another backslash so
45
+ # "C:\\path\\to\\images\\folder"
46
+ - folder_path: "/path/to/images/folder"
47
+ # clip_image_path is the directory containing your control images. They must have the same filename as their train image (the extension does not matter)
48
+ # for normal redux, we are just recreating the same image, so you can use the same folder path above
49
+ clip_image_path: "/path/to/control/images/folder"
50
+ caption_ext: "txt"
51
+ caption_dropout_rate: 0.05 # will drop out the caption 5% of time
52
+ resolution: [ 512, 768, 1024 ] # flex enjoys multiple resolutions
53
+ train:
54
+ # this is what I used for the 24GB card, but feel free to adjust
55
+ # total batch size is 6 here
56
+ batch_size: 3
57
+ gradient_accumulation: 2
58
+
59
+ # captions are not needed for this training; we cache a blank prompt and rely on the vision encoder
60
+ unload_text_encoder: true
61
+
62
+ loss_type: "mse"
63
+ train_unet: true
64
+ train_text_encoder: false
65
+ steps: 4000000 # I set this very high and stop when I like the results
66
+ content_or_style: balanced # content, style, balanced
67
+ gradient_checkpointing: true
68
+ noise_scheduler: "flowmatch" # or "ddpm", "lms", "euler_a"
69
+ timestep_type: "flux_shift"
70
+ optimizer: "adamw8bit"
71
+ lr: 1e-4
72
+
73
+ # this is for Flex.1, comment this out for FLUX.1-dev
74
+ bypass_guidance_embedding: true
75
+
76
+ dtype: bf16
77
+ ema_config:
78
+ use_ema: true
79
+ ema_decay: 0.99
80
+ model:
81
+ name_or_path: "ostris/Flex.1-alpha"
82
+ is_flux: true
83
+ quantize: true
84
+ text_encoder_bits: 8
85
+ sample:
86
+ sampler: "flowmatch" # must match train.noise_scheduler
87
+ sample_every: 250 # sample every this many steps
88
+ width: 1024
89
+ height: 1024
90
+ # I leave half blank to test prompted and unprompted
91
+ prompts:
92
+ - "woman with red hair, playing chess at the park, bomb going off in the background"
93
+ - "a woman holding a coffee cup, in a beanie, sitting at a cafe"
94
+ - "a horse is a DJ at a night club, fish eye lens, smoke machine, lazer lights, holding a martini"
95
+ - "a man showing off his cool new t shirt at the beach, a shark is jumping out of the water in the background"
96
+ - "a bear building a log cabin in the snow covered mountains"
97
+ - ""
98
+ - ""
99
+ - ""
100
+ - ""
101
+ - ""
102
+ neg: ""
103
+ seed: 42
104
+ walk_seed: true
105
+ guidance_scale: 4
106
+ sample_steps: 25
107
+ network_multiplier: 1.0
108
+
109
+ # you can add any additional meta info here. [name] is replaced with config name at top
110
+ meta:
111
+ name: "[name]"
112
+ version: '1.0'
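For the redux config above, every training image needs a control image in clip_image_path with the same filename (the extension may differ). A short sketch that builds that pairing by filename stem, purely for pre-flight checking and not taken from the toolkit:

```python
from pathlib import Path

IMAGE_EXTS = {".jpg", ".jpeg", ".png"}

def pair_control_images(train_dir: str, clip_image_dir: str) -> dict[str, str]:
    """Map each training image to the control image sharing its filename stem."""
    controls = {p.stem: p for p in Path(clip_image_dir).iterdir() if p.is_file()}
    pairs = {}
    for img in Path(train_dir).iterdir():
        if img.suffix.lower() not in IMAGE_EXTS:
            continue
        if img.stem in controls:
            pairs[str(img)] = str(controls[img.stem])
        else:
            print("no control image for", img.name)
    return pairs
```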
config/examples/train_full_fine_tune_flex.yaml ADDED
@@ -0,0 +1,107 @@
1
+ ---
2
+ # This configuration requires 48GB of VRAM or more to operate
3
+ job: extension
4
+ config:
5
+ # this name will be the folder and filename name
6
+ name: "my_first_flex_finetune_v1"
7
+ process:
8
+ - type: 'sd_trainer'
9
+ # root folder to save training sessions/samples/weights
10
+ training_folder: "output"
11
+ # uncomment to see performance stats in the terminal every N steps
12
+ # performance_log_every: 1000
13
+ device: cuda:0
14
+ # if a trigger word is specified, it will be added to captions of training data if it does not already exist
15
+ # alternatively, in your captions you can add [trigger] and it will be replaced with the trigger word
16
+ # trigger_word: "p3r5on"
17
+ save:
18
+ dtype: bf16 # precision to save
19
+ save_every: 250 # save every this many steps
20
+ max_step_saves_to_keep: 2 # how many intermittent saves to keep
21
+ save_format: 'diffusers' # 'diffusers'
22
+ datasets:
23
+ # datasets are a folder of images. captions need to be txt files with the same name as the image
24
+ # for instance image2.jpg and image2.txt. Only jpg, jpeg, and png are supported currently
25
+ # images will automatically be resized and bucketed into the resolution specified
26
+ # on windows, escape back slashes with another backslash so
27
+ # "C:\\path\\to\\images\\folder"
28
+ - folder_path: "/path/to/images/folder"
29
+ caption_ext: "txt"
30
+ caption_dropout_rate: 0.05 # will drop out the caption 5% of time
31
+ shuffle_tokens: false # shuffle caption order, split by commas
32
+ # cache_latents_to_disk: true # leave this true unless you know what you're doing
33
+ resolution: [ 512, 768, 1024 ] # flex enjoys multiple resolutions
34
+ train:
35
+ batch_size: 1
36
+ # IMPORTANT! For Flex, you must bypass the guidance embedder during training
37
+ bypass_guidance_embedding: true
38
+
39
+ # can be 'sigmoid', 'linear', or 'lognorm_blend'
40
+ timestep_type: 'sigmoid'
41
+
42
+ steps: 2000 # total number of steps to train 500 - 4000 is a good range
43
+ gradient_accumulation: 1
44
+ train_unet: true
45
+ train_text_encoder: false # probably won't work with flex
46
+ gradient_checkpointing: true # need this on unless you have a ton of vram
47
+ noise_scheduler: "flowmatch" # for training only
48
+ optimizer: "adafactor"
49
+ lr: 3e-5
50
+
51
+ # Parameter swapping can reduce vram requirements. Set the factor from 1.0 to 0.0.
52
+ # 0.1 means 10% of parameters are active at each step. Only works with adafactor
53
+
54
+ # do_paramiter_swapping: true
55
+ # paramiter_swapping_factor: 0.9
56
+
57
+ # uncomment this to skip the pre training sample
58
+ # skip_first_sample: true
59
+ # uncomment to completely disable sampling
60
+ # disable_sampling: true
61
+
62
+ # ema will smooth out learning, but could slow it down. Recommended to leave on if you have the vram
63
+ ema_config:
64
+ use_ema: true
65
+ ema_decay: 0.99
66
+
67
+ # will probably need this if gpu supports it for flex, other dtypes may not work correctly
68
+ dtype: bf16
69
+ model:
70
+ # huggingface model name or path
71
+ name_or_path: "ostris/Flex.1-alpha"
72
+ is_flux: true # flex is flux architecture
73
+ # full finetuning quantized models is a crapshoot and results in subpar outputs
74
+ # quantize: true
75
+ # you can quantize just the T5 text encoder here to save vram
76
+ quantize_te: true
77
+ # only train the transformer blocks
78
+ only_if_contains:
79
+ - "transformer.transformer_blocks."
80
+ - "transformer.single_transformer_blocks."
81
+ sample:
82
+ sampler: "flowmatch" # must match train.noise_scheduler
83
+ sample_every: 250 # sample every this many steps
84
+ width: 1024
85
+ height: 1024
86
+ prompts:
87
+ # you can add [trigger] to the prompts here and it will be replaced with the trigger word
88
+ # - "[trigger] holding a sign that says 'I LOVE PROMPTS!'"
89
+ - "woman with red hair, playing chess at the park, bomb going off in the background"
90
+ - "a woman holding a coffee cup, in a beanie, sitting at a cafe"
91
+ - "a horse is a DJ at a night club, fish eye lens, smoke machine, lazer lights, holding a martini"
92
+ - "a man showing off his cool new t shirt at the beach, a shark is jumping out of the water in the background"
93
+ - "a bear building a log cabin in the snow covered mountains"
94
+ - "woman playing the guitar, on stage, singing a song, laser lights, punk rocker"
95
+ - "hipster man with a beard, building a chair, in a wood shop"
96
+ - "photo of a man, white background, medium shot, modeling clothing, studio lighting, white backdrop"
97
+ - "a man holding a sign that says, 'this is a sign'"
98
+ - "a bulldog, in a post apocalyptic world, with a shotgun, in a leather jacket, in a desert, with a motorcycle"
99
+ neg: "" # not used on flex
100
+ seed: 42
101
+ walk_seed: true
102
+ guidance_scale: 4
103
+ sample_steps: 25
104
+ # you can add any additional meta info here. [name] is replaced with config name at top
105
+ meta:
106
+ name: "[name]"
107
+ version: '1.0'
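The commented-out do_paramiter_swapping option above trains only a fraction of the parameters on each step to cut optimizer memory. The toy sketch below illustrates that idea by toggling requires_grad on a random ~10% of parameter tensors per step; the toolkit's adafactor-specific implementation will differ.

```python
import random
import torch

def swap_active_parameters(model: torch.nn.Module, active_fraction: float = 0.1) -> None:
    """Mark roughly active_fraction of the parameter tensors as trainable for the next step."""
    params = list(model.parameters())
    k = max(1, int(len(params) * active_fraction))
    active = set(random.sample(range(len(params)), k))
    for i, p in enumerate(params):
        p.requires_grad_(i in active)

# call once per step, before the forward pass:
# swap_active_parameters(transformer, active_fraction=0.1)
```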
config/examples/train_full_fine_tune_lumina.yaml ADDED
@@ -0,0 +1,99 @@
1
+ ---
2
+ # This configuration requires 24GB of VRAM or more to operate
3
+ job: extension
4
+ config:
5
+ # this name will be the folder and filename name
6
+ name: "my_first_lumina_finetune_v1"
7
+ process:
8
+ - type: 'sd_trainer'
9
+ # root folder to save training sessions/samples/weights
10
+ training_folder: "output"
11
+ # uncomment to see performance stats in the terminal every N steps
12
+ # performance_log_every: 1000
13
+ device: cuda:0
14
+ # if a trigger word is specified, it will be added to captions of training data if it does not already exist
15
+ # alternatively, in your captions you can add [trigger] and it will be replaced with the trigger word
16
+ # trigger_word: "p3r5on"
17
+ save:
18
+ dtype: bf16 # precision to save
19
+ save_every: 250 # save every this many steps
20
+ max_step_saves_to_keep: 2 # how many intermittent saves to keep
21
+ save_format: 'diffusers' # 'diffusers'
22
+ datasets:
23
+ # datasets are a folder of images. captions need to be txt files with the same name as the image
24
+ # for instance image2.jpg and image2.txt. Only jpg, jpeg, and png are supported currently
25
+ # images will automatically be resized and bucketed into the resolution specified
26
+ # on windows, escape back slashes with another backslash so
27
+ # "C:\\path\\to\\images\\folder"
28
+ - folder_path: "/path/to/images/folder"
29
+ caption_ext: "txt"
30
+ caption_dropout_rate: 0.05 # will drop out the caption 5% of time
31
+ shuffle_tokens: false # shuffle caption order, split by commas
32
+ # cache_latents_to_disk: true # leave this true unless you know what you're doing
33
+ resolution: [ 512, 768, 1024 ] # lumina2 enjoys multiple resolutions
34
+ train:
35
+ batch_size: 1
36
+
37
+ # can be 'sigmoid', 'linear', or 'lumina2_shift'
38
+ timestep_type: 'lumina2_shift'
39
+
40
+ steps: 2000 # total number of steps to train 500 - 4000 is a good range
41
+ gradient_accumulation: 1
42
+ train_unet: true
43
+ train_text_encoder: false # probably won't work with lumina2
44
+ gradient_checkpointing: true # need this on unless you have a ton of vram
45
+ noise_scheduler: "flowmatch" # for training only
46
+ optimizer: "adafactor"
47
+ lr: 3e-5
48
+
49
+ # Parameter swapping can reduce vram requirements. Set the factor from 1.0 to 0.0.
50
+ # 0.1 means 10% of parameters are active at each step. Only works with adafactor
51
+
52
+ # do_paramiter_swapping: true
53
+ # paramiter_swapping_factor: 0.9
54
+
55
+ # uncomment this to skip the pre training sample
56
+ # skip_first_sample: true
57
+ # uncomment to completely disable sampling
58
+ # disable_sampling: true
59
+
60
+ # ema will smooth out learning, but could slow it down. Recommended to leave on if you have the vram
61
+ # ema_config:
62
+ # use_ema: true
63
+ # ema_decay: 0.99
64
+
65
+ # will probably need this if gpu supports it for lumina2, other dtypes may not work correctly
66
+ dtype: bf16
67
+ model:
68
+ # huggingface model name or path
69
+ name_or_path: "Alpha-VLLM/Lumina-Image-2.0"
70
+ is_lumina2: true # lumina2 architecture
71
+ # you can quantize just the Gemma2 text encoder here to save vram
72
+ quantize_te: true
73
+ sample:
74
+ sampler: "flowmatch" # must match train.noise_scheduler
75
+ sample_every: 250 # sample every this many steps
76
+ width: 1024
77
+ height: 1024
78
+ prompts:
79
+ # you can add [trigger] to the prompts here and it will be replaced with the trigger word
80
+ # - "[trigger] holding a sign that says 'I LOVE PROMPTS!'"
81
+ - "woman with red hair, playing chess at the park, bomb going off in the background"
82
+ - "a woman holding a coffee cup, in a beanie, sitting at a cafe"
83
+ - "a horse is a DJ at a night club, fish eye lens, smoke machine, lazer lights, holding a martini"
84
+ - "a man showing off his cool new t shirt at the beach, a shark is jumping out of the water in the background"
85
+ - "a bear building a log cabin in the snow covered mountains"
86
+ - "woman playing the guitar, on stage, singing a song, laser lights, punk rocker"
87
+ - "hipster man with a beard, building a chair, in a wood shop"
88
+ - "photo of a cat that is half black and half orange tabby, split down the middle. The cat has on a blue tophat. They are holding a martini glass with a pink ball of yarn in it with green knitting needles sticking out, in one paw. In the other paw, they are holding a DVD case for a movie titled, \"This is a test\" that has a golden robot on it. In the background is a busy night club with a giant mushroom man dancing with a bear."
89
+ - "a man holding a sign that says, 'this is a sign'"
90
+ - "a bulldog, in a post apocalyptic world, with a shotgun, in a leather jacket, in a desert, with a motorcycle"
91
+ neg: ""
92
+ seed: 42
93
+ walk_seed: true
94
+ guidance_scale: 4.0
95
+ sample_steps: 25
96
+ # you can add any additional meta info here. [name] is replaced with config name at top
97
+ meta:
98
+ name: "[name]"
99
+ version: '1.0'
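Several configs in this commit expose timestep_type ('linear', 'sigmoid', and model-specific shifts such as 'lumina2_shift' or 'flux_shift'). As a rough illustration only: linear draws timesteps uniformly, sigmoid squashes a normal draw toward the middle, and a shift remaps the draw toward noisier timesteps. The shift formula below is the common flow-matching shift and is an assumption about what these options do, not taken from the toolkit.

```python
import torch

def sample_timesteps(batch: int, kind: str = "sigmoid", shift: float = 3.0) -> torch.Tensor:
    if kind == "linear":
        t = torch.rand(batch)                      # uniform in (0, 1)
    elif kind == "sigmoid":
        t = torch.sigmoid(torch.randn(batch))      # bunched around t = 0.5
    elif kind == "shift":
        t = torch.rand(batch)
        t = shift * t / (1.0 + (shift - 1.0) * t)  # pushes mass toward the noisy end
    else:
        raise ValueError(f"unknown timestep_type: {kind}")
    return t

print(sample_timesteps(4, "shift"))
```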
config/examples/train_lora_chroma_24gb.yaml ADDED
@@ -0,0 +1,104 @@
1
+ ---
2
+ job: extension
3
+ config:
4
+ # this name will be the folder and filename name
5
+ name: "my_first_chroma_lora_v1"
6
+ process:
7
+ - type: 'sd_trainer'
8
+ # root folder to save training sessions/samples/weights
9
+ training_folder: "output"
10
+ # uncomment to see performance stats in the terminal every N steps
11
+ # performance_log_every: 1000
12
+ device: cuda:0
13
+ # if a trigger word is specified, it will be added to captions of training data if it does not already exist
14
+ # alternatively, in your captions you can add [trigger] and it will be replaced with the trigger word
15
+ # trigger_word: "p3r5on"
16
+ network:
17
+ type: "lora"
18
+ linear: 16
19
+ linear_alpha: 16
20
+ save:
21
+ dtype: float16 # precision to save
22
+ save_every: 250 # save every this many steps
23
+ max_step_saves_to_keep: 4 # how many intermittent saves to keep
24
+ push_to_hub: false #change this to True to push your trained model to Hugging Face.
25
+ # You can either set up a HF_TOKEN env variable or you'll be prompted to log-in
26
+ # hf_repo_id: your-username/your-model-slug
27
+ # hf_private: true #whether the repo is private or public
28
+ datasets:
29
+ # datasets are a folder of images. captions need to be txt files with the same name as the image
30
+ # for instance image2.jpg and image2.txt. Only jpg, jpeg, and png are supported currently
31
+ # images will automatically be resized and bucketed into the resolution specified
32
+ # on windows, escape back slashes with another backslash so
33
+ # "C:\\path\\to\\images\\folder"
34
+ - folder_path: "/path/to/images/folder"
35
+ caption_ext: "txt"
36
+ caption_dropout_rate: 0.05 # will drop out the caption 5% of time
37
+ shuffle_tokens: false # shuffle caption order, split by commas
38
+ cache_latents_to_disk: true # leave this true unless you know what you're doing
39
+ resolution: [ 512, 768, 1024 ] # chroma enjoys multiple resolutions
40
+ train:
41
+ batch_size: 1
42
+ steps: 2000 # total number of steps to train 500 - 4000 is a good range
43
+ gradient_accumulation: 1
44
+ train_unet: true
45
+ train_text_encoder: false # probably won't work with chroma
46
+ gradient_checkpointing: true # need this on unless you have a ton of vram
47
+ noise_scheduler: "flowmatch" # for training only
48
+ optimizer: "adamw8bit"
49
+ lr: 1e-4
50
+ # uncomment this to skip the pre training sample
51
+ # skip_first_sample: true
52
+ # uncomment to completely disable sampling
53
+ # disable_sampling: true
54
+ # uncomment to use new bell curved weighting. Experimental but may produce better results
55
+ # linear_timesteps: true
56
+
57
+ # ema will smooth out learning, but could slow it down. Recommended to leave on.
58
+ ema_config:
59
+ use_ema: true
60
+ ema_decay: 0.99
61
+
62
+ # will probably need this if gpu supports it for chroma, other dtypes may not work correctly
63
+ dtype: bf16
64
+ model:
65
+ # Download whichever model you prefer from the Chroma repo
66
+ # https://huggingface.co/lodestones/Chroma/tree/main
67
+ # point to it here.
68
+ # name_or_path: "/path/to/chroma/chroma-unlocked-vVERSION.safetensors"
69
+
70
+ # using lodestones/Chroma will automatically use the latest version
71
+ name_or_path: "lodestones/Chroma"
72
+
73
+ # # You can also select a version of Chroma like so
74
+ # name_or_path: "lodestones/Chroma/v28"
75
+
76
+ arch: "chroma"
77
+ quantize: true # run 8bit mixed precision
78
+ sample:
79
+ sampler: "flowmatch" # must match train.noise_scheduler
80
+ sample_every: 250 # sample every this many steps
81
+ width: 1024
82
+ height: 1024
83
+ prompts:
84
+ # you can add [trigger] to the prompts here and it will be replaced with the trigger word
85
+ # - "[trigger] holding a sign that says 'I LOVE PROMPTS!'"
86
+ - "woman with red hair, playing chess at the park, bomb going off in the background"
87
+ - "a woman holding a coffee cup, in a beanie, sitting at a cafe"
88
+ - "a horse is a DJ at a night club, fish eye lens, smoke machine, lazer lights, holding a martini"
89
+ - "a man showing off his cool new t shirt at the beach, a shark is jumping out of the water in the background"
90
+ - "a bear building a log cabin in the snow covered mountains"
91
+ - "woman playing the guitar, on stage, singing a song, laser lights, punk rocker"
92
+ - "hipster man with a beard, building a chair, in a wood shop"
93
+ - "photo of a man, white background, medium shot, modeling clothing, studio lighting, white backdrop"
94
+ - "a man holding a sign that says, 'this is a sign'"
95
+ - "a bulldog, in a post apocalyptic world, with a shotgun, in a leather jacket, in a desert, with a motorcycle"
96
+ neg: "" # negative prompt, optional
97
+ seed: 42
98
+ walk_seed: true
99
+ guidance_scale: 4
100
+ sample_steps: 25
101
+ # you can add any additional meta info here. [name] is replaced with config name at top
102
+ meta:
103
+ name: "[name]"
104
+ version: '1.0'
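The Chroma config above accepts either the lodestones/Chroma repo id or a local .safetensors path. If you want the local-file route, a short huggingface_hub sketch; the filename is a guess at the repo's naming and should be checked against the file listing at https://huggingface.co/lodestones/Chroma/tree/main.

```python
from huggingface_hub import hf_hub_download

# downloads one checkpoint into the local HF cache and returns its path;
# the exact filename is illustrative -- check the repo before using it
local_path = hf_hub_download(
    repo_id="lodestones/Chroma",
    filename="chroma-unlocked-v28.safetensors",
)
print(local_path)  # use this path as model.name_or_path in the yaml above
```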
config/examples/train_lora_flex2_24gb.yaml ADDED
@@ -0,0 +1,165 @@
1
+ # Note, Flex2 is a highly experimental WIP model. Finetuning a model with built in controls and inpainting has not
2
+ # been done before, so you will be experimenting with me on how to do it. This is my recommended setup, but this is highly
3
+ # subject to change as we learn more about how Flex2 works.
4
+
5
+ ---
6
+ job: extension
7
+ config:
8
+ # this name will be the folder and filename name
9
+ name: "my_first_flex2_lora_v1"
10
+ process:
11
+ - type: 'sd_trainer'
12
+ # root folder to save training sessions/samples/weights
13
+ training_folder: "output"
14
+ # uncomment to see performance stats in the terminal every N steps
15
+ # performance_log_every: 1000
16
+ device: cuda:0
17
+ # if a trigger word is specified, it will be added to captions of training data if it does not already exist
18
+ # alternatively, in your captions you can add [trigger] and it will be replaced with the trigger word
19
+ # trigger_word: "p3r5on"
20
+ network:
21
+ type: "lora"
22
+ linear: 32
23
+ linear_alpha: 32
24
+ save:
25
+ dtype: float16 # precision to save
26
+ save_every: 250 # save every this many steps
27
+ max_step_saves_to_keep: 4 # how many intermittent saves to keep
28
+ push_to_hub: false #change this to True to push your trained model to Hugging Face.
29
+ # You can either set up a HF_TOKEN env variable or you'll be prompted to log-in
30
+ # hf_repo_id: your-username/your-model-slug
31
+ # hf_private: true #whether the repo is private or public
32
+ datasets:
33
+ # datasets are a folder of images. captions need to be txt files with the same name as the image
34
+ # for instance image2.jpg and image2.txt. Only jpg, jpeg, and png are supported currently
35
+ # images will automatically be resized and bucketed into the resolution specified
36
+ # on windows, escape back slashes with another backslash so
37
+ # "C:\\path\\to\\images\\folder"
38
+ - folder_path: "/path/to/images/folder"
39
+ # Flex2 is trained with controls and inpainting. If you want the model to truly understand how the
40
+ # controls function with your dataset, it is a good idea to keep doing controls during training.
41
+ # this will automatically generate the controls for you before training. The current script is not
42
+ # fully optimized so this could be rather slow for large datasets, but it caches them to disk so it
43
+ # only needs to be done once. If you want to skip this step, you can set the controls to [] and it will be skipped
44
+ controls:
45
+ - "depth"
46
+ - "line"
47
+ - "pose"
48
+ - "inpaint"
49
+
50
+ # you can make custom inpainting images as well. These images must be webp or png format with an alpha.
51
+ # just erase the part of the image you want to inpaint and save it as a webp or png. Again, erase your
52
+ # training target (the person, if you are training a person). The automatic controls above with inpaint will
53
+ # just run a background remover mask and erase the foreground, which works well for subjects.
54
+
55
+ # inpaint_path: "/my/inpaint/images"
56
+
57
+ # you can also specify existing control image pairs. It can handle multiple groups and will randomly
58
+ # select one for each step.
59
+
60
+ # control_path:
61
+ # - "/my/custom/control/images"
62
+ # - "/my/custom/control/images2"
63
+
64
+ caption_ext: "txt"
65
+ caption_dropout_rate: 0.05 # will drop out the caption 5% of time
66
+ resolution: [ 512, 768, 1024 ] # flex2 enjoys multiple resolutions
67
+ train:
68
+ batch_size: 1
69
+ # IMPORTANT! For Flex2, you must bypass the guidance embedder during training
70
+ bypass_guidance_embedding: true
71
+
72
+ steps: 3000 # total number of steps to train 500 - 4000 is a good range
73
+ gradient_accumulation: 1
74
+ train_unet: true
75
+ train_text_encoder: false # probably won't work with flex2
76
+ gradient_checkpointing: true # need this on unless you have a ton of vram
77
+ noise_scheduler: "flowmatch" # for training only
78
+ # shift works well for training fast and learning composition and style.
79
+ # for just subject, you may want to change this to sigmoid
80
+ timestep_type: 'shift' # 'linear', 'sigmoid', 'shift'
81
+ optimizer: "adamw8bit"
82
+ lr: 1e-4
83
+
84
+ optimizer_params:
85
+ weight_decay: 1e-5
86
+ # uncomment this to skip the pre training sample
87
+ # skip_first_sample: true
88
+ # uncomment to completely disable sampling
89
+ # disable_sampling: true
90
+ # uncomment to use new bell curved weighting. Experimental but may produce better results
91
+ # linear_timesteps: true
92
+
93
+ # ema will smooth out learning, but could slow it down. Defaults off
94
+ ema_config:
95
+ use_ema: false
96
+ ema_decay: 0.99
97
+
98
+ # will probably need this if gpu supports it for flex, other dtypes may not work correctly
99
+ dtype: bf16
100
+ model:
101
+ # huggingface model name or path
102
+ name_or_path: "ostris/Flex.2-preview"
103
+ arch: "flex2"
104
+ quantize: true # run 8bit mixed precision
105
+ quantize_te: true
106
+
107
+ # you can pass special training info for controls to the model here
108
+ # percentages are decimal based so 0.0 is 0% and 1.0 is 100% of the time.
109
+ model_kwargs:
110
+ # inverts the inpainting mask, good to learn outpainting as well, recommended 0.0 for characters
111
+ invert_inpaint_mask_chance: 0.5
112
+ # this will do a normal t2i training step without inpaint when dropped out. Recommended if you want
113
+ # your lora to be able to run inference with and without inpainting.
114
+ inpaint_dropout: 0.5
115
+ # randomly drops out the control image. Dropout recommended if you want it to work without controls as well.
116
+ control_dropout: 0.5
117
+ # does a random inpaint blob. Usually a good idea to keep. Without it, the model will learn to always 100%
118
+ # fill the inpaint area with your subject. This is not always a good thing.
119
+ inpaint_random_chance: 0.5
120
+ # generates random inpaint blobs if you did not provide an inpaint image for your dataset. Inpaint breaks down fast
121
+ # if you are not training with it. Controls are a little more robust and can be left out,
122
+ # but when in doubt, always leave this on
123
+ do_random_inpainting: false
124
+ # does random blurring of the inpaint mask. Helps prevent weird edge artifacts for real world inpainting. Leave on.
125
+ random_blur_mask: true
126
+ # applies a small amount of random dilation and restriction to the inpaint mask. Helps with edge artifacts.
127
+ # Leave on.
128
+ random_dialate_mask: true
129
+ sample:
130
+ sampler: "flowmatch" # must match train.noise_scheduler
131
+ sample_every: 250 # sample every this many steps
132
+ width: 1024
133
+ height: 1024
134
+ prompts:
135
+ # you can add [trigger] to the prompts here and it will be replaced with the trigger word
136
+ # - "[trigger] holding a sign that says 'I LOVE PROMPTS!'"
137
+
138
+ # you can use a single inpaint or single control image on your samples.
139
+ # for controls, the ctrl_idx is 1, the images can be any name and image format.
140
+ # use either a pose/line/depth image or whatever you are training with. An example is
141
+ # - "photo of [trigger] --ctrl_idx 1 --ctrl_img /path/to/control/image.jpg"
142
+
143
+ # for an inpainting image, it must be png/webp. Erase the part of the image you want to inpaint
144
+ # IMPORTANT! the inpaint images must be ctrl_idx 0 and have .inpaint.{ext} in the name for this to work right.
145
+ # - "photo of [trigger] --ctrl_idx 0 --ctrl_img /path/to/inpaint/image.inpaint.png"
146
+
147
+ - "woman with red hair, playing chess at the park, bomb going off in the background"
148
+ - "a woman holding a coffee cup, in a beanie, sitting at a cafe"
149
+ - "a horse is a DJ at a night club, fish eye lens, smoke machine, lazer lights, holding a martini"
150
+ - "a man showing off his cool new t shirt at the beach, a shark is jumping out of the water in the background"
151
+ - "a bear building a log cabin in the snow covered mountains"
152
+ - "woman playing the guitar, on stage, singing a song, laser lights, punk rocker"
153
+ - "hipster man with a beard, building a chair, in a wood shop"
154
+ - "photo of a man, white background, medium shot, modeling clothing, studio lighting, white backdrop"
155
+ - "a man holding a sign that says, 'this is a sign'"
156
+ - "a bulldog, in a post apocalyptic world, with a shotgun, in a leather jacket, in a desert, with a motorcycle"
157
+ neg: "" # not used on flex2
158
+ seed: 42
159
+ walk_seed: true
160
+ guidance_scale: 4
161
+ sample_steps: 25
162
+ # you can add any additional meta info here. [name] is replaced with config name at top
163
+ meta:
164
+ name: "[name]"
165
+ version: '1.0'
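The Flex2 comments above say a custom inpaint image is just the training image saved as PNG/WebP with the training target erased from the alpha channel. A small Pillow sketch that does exactly that, given a white-on-black mask of the region to erase; the mask file is an assumption of this example, not something the toolkit requires.

```python
from PIL import Image, ImageChops

def make_inpaint_image(image_path: str, mask_path: str, out_path: str) -> None:
    """Erase the masked (white) region from the image's alpha and save it with transparency."""
    img = Image.open(image_path).convert("RGBA")
    mask = Image.open(mask_path).convert("L").resize(img.size)
    alpha = img.getchannel("A")
    img.putalpha(ImageChops.subtract(alpha, mask))  # white mask pixels become fully transparent
    img.save(out_path)

make_inpaint_image("photo_01.jpg", "photo_01_mask.png", "photo_01.inpaint.png")
```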
config/examples/train_lora_flex_24gb.yaml ADDED
@@ -0,0 +1,101 @@
1
+ ---
2
+ job: extension
3
+ config:
4
+ # this name will be the folder and filename name
5
+ name: "my_first_flex_lora_v1"
6
+ process:
7
+ - type: 'sd_trainer'
8
+ # root folder to save training sessions/samples/weights
9
+ training_folder: "output"
10
+ # uncomment to see performance stats in the terminal every N steps
11
+ # performance_log_every: 1000
12
+ device: cuda:0
13
+ # if a trigger word is specified, it will be added to captions of training data if it does not already exist
14
+ # alternatively, in your captions you can add [trigger] and it will be replaced with the trigger word
15
+ # trigger_word: "p3r5on"
16
+ network:
17
+ type: "lora"
18
+ linear: 16
19
+ linear_alpha: 16
20
+ save:
21
+ dtype: float16 # precision to save
22
+ save_every: 250 # save every this many steps
23
+ max_step_saves_to_keep: 4 # how many intermittent saves to keep
24
+ push_to_hub: false #change this to True to push your trained model to Hugging Face.
25
+ # You can either set up a HF_TOKEN env variable or you'll be prompted to log-in
26
+ # hf_repo_id: your-username/your-model-slug
27
+ # hf_private: true #whether the repo is private or public
28
+ datasets:
29
+ # datasets are a folder of images. captions need to be txt files with the same name as the image
30
+ # for instance image2.jpg and image2.txt. Only jpg, jpeg, and png are supported currently
31
+ # images will automatically be resized and bucketed into the resolution specified
32
+ # on windows, escape back slashes with another backslash so
33
+ # "C:\\path\\to\\images\\folder"
34
+ - folder_path: "/path/to/images/folder"
35
+ caption_ext: "txt"
36
+ caption_dropout_rate: 0.05 # will drop out the caption 5% of time
37
+ shuffle_tokens: false # shuffle caption order, split by commas
38
+ cache_latents_to_disk: true # leave this true unless you know what you're doing
39
+ resolution: [ 512, 768, 1024 ] # flex enjoys multiple resolutions
40
+ train:
41
+ batch_size: 1
42
+ # IMPORTANT! For Flex, you must bypass the guidance embedder during training
43
+ bypass_guidance_embedding: true
44
+
45
+ steps: 2000 # total number of steps to train 500 - 4000 is a good range
46
+ gradient_accumulation: 1
47
+ train_unet: true
48
+ train_text_encoder: false # probably won't work with flex
49
+ gradient_checkpointing: true # need this on unless you have a ton of vram
50
+ noise_scheduler: "flowmatch" # for training only
51
+ optimizer: "adamw8bit"
52
+ lr: 1e-4
53
+ # uncomment this to skip the pre training sample
54
+ # skip_first_sample: true
55
+ # uncomment to completely disable sampling
56
+ # disable_sampling: true
57
+ # uncomment to use new bell curved weighting. Experimental but may produce better results
58
+ # linear_timesteps: true
59
+
60
+ # ema will smooth out learning, but could slow it down. Recommended to leave on.
61
+ ema_config:
62
+ use_ema: true
63
+ ema_decay: 0.99
64
+
65
+ # will probably need this if gpu supports it for flex, other dtypes may not work correctly
66
+ dtype: bf16
67
+ model:
68
+ # huggingface model name or path
69
+ name_or_path: "ostris/Flex.1-alpha"
70
+ is_flux: true
71
+ quantize: true # run 8bit mixed precision
72
+ quantize_kwargs:
73
+ exclude:
74
+ - "*time_text_embed*" # exclude the time text embedder from quantization
75
+ sample:
76
+ sampler: "flowmatch" # must match train.noise_scheduler
77
+ sample_every: 250 # sample every this many steps
78
+ width: 1024
79
+ height: 1024
80
+ prompts:
81
+ # you can add [trigger] to the prompts here and it will be replaced with the trigger word
82
+ # - "[trigger] holding a sign that says 'I LOVE PROMPTS!'"
83
+ - "woman with red hair, playing chess at the park, bomb going off in the background"
84
+ - "a woman holding a coffee cup, in a beanie, sitting at a cafe"
85
+ - "a horse is a DJ at a night club, fish eye lens, smoke machine, lazer lights, holding a martini"
86
+ - "a man showing off his cool new t shirt at the beach, a shark is jumping out of the water in the background"
87
+ - "a bear building a log cabin in the snow covered mountains"
88
+ - "woman playing the guitar, on stage, singing a song, laser lights, punk rocker"
89
+ - "hipster man with a beard, building a chair, in a wood shop"
90
+ - "photo of a man, white background, medium shot, modeling clothing, studio lighting, white backdrop"
91
+ - "a man holding a sign that says, 'this is a sign'"
92
+ - "a bulldog, in a post apocalyptic world, with a shotgun, in a leather jacket, in a desert, with a motorcycle"
93
+ neg: "" # not used on flex
94
+ seed: 42
95
+ walk_seed: true
96
+ guidance_scale: 4
97
+ sample_steps: 25
98
+ # you can add any additional meta info here. [name] is replaced with config name at top
99
+ meta:
100
+ name: "[name]"
101
+ version: '1.0'
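The quantize_kwargs.exclude list above uses glob-style patterns such as "*time_text_embed*" to keep matching modules out of 8-bit quantization. A tiny fnmatch sketch of how such patterns select module names; how the toolkit's quantizer applies them internally may differ.

```python
from fnmatch import fnmatch

exclude_patterns = ["*time_text_embed*"]

def should_quantize(module_name: str) -> bool:
    return not any(fnmatch(module_name, pattern) for pattern in exclude_patterns)

for name in [
    "transformer.time_text_embed.timestep_embedder.linear_1",
    "transformer.transformer_blocks.0.attn.to_q",
]:
    print(name, "->", "quantize" if should_quantize(name) else "skip")
```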
config/examples/train_lora_flux_24gb.yaml ADDED
@@ -0,0 +1,96 @@
1
+ ---
2
+ job: extension
3
+ config:
4
+ # this name will be the folder and filename name
5
+ name: "my_first_flux_lora_v1"
6
+ process:
7
+ - type: 'sd_trainer'
8
+ # root folder to save training sessions/samples/weights
9
+ training_folder: "output"
10
+ # uncomment to see performance stats in the terminal every N steps
11
+ # performance_log_every: 1000
12
+ device: cuda:0
13
+ # if a trigger word is specified, it will be added to captions of training data if it does not already exist
14
+ # alternatively, in your captions you can add [trigger] and it will be replaced with the trigger word
15
+ # trigger_word: "p3r5on"
16
+ network:
17
+ type: "lora"
18
+ linear: 16
19
+ linear_alpha: 16
20
+ save:
21
+ dtype: float16 # precision to save
22
+ save_every: 250 # save every this many steps
23
+ max_step_saves_to_keep: 4 # how many intermittent saves to keep
24
+ push_to_hub: false #change this to True to push your trained model to Hugging Face.
25
+ # You can either set up a HF_TOKEN env variable or you'll be prompted to log-in
26
+ # hf_repo_id: your-username/your-model-slug
27
+ # hf_private: true #whether the repo is private or public
28
+ datasets:
29
+ # datasets are a folder of images. captions need to be txt files with the same name as the image
30
+ # for instance image2.jpg and image2.txt. Only jpg, jpeg, and png are supported currently
31
+ # images will automatically be resized and bucketed into the resolution specified
32
+ # on windows, escape back slashes with another backslash so
33
+ # "C:\\path\\to\\images\\folder"
34
+ - folder_path: "/path/to/images/folder"
35
+ caption_ext: "txt"
36
+ caption_dropout_rate: 0.05 # will drop out the caption 5% of time
37
+ shuffle_tokens: false # shuffle caption order, split by commas
38
+ cache_latents_to_disk: true # leave this true unless you know what you're doing
39
+ resolution: [ 512, 768, 1024 ] # flux enjoys multiple resolutions
40
+ train:
41
+ batch_size: 1
42
+ steps: 2000 # total number of steps to train 500 - 4000 is a good range
43
+ gradient_accumulation_steps: 1
44
+ train_unet: true
45
+ train_text_encoder: false # probably won't work with flux
46
+ gradient_checkpointing: true # need this on unless you have a ton of vram
47
+ noise_scheduler: "flowmatch" # for training only
48
+ optimizer: "adamw8bit"
49
+ lr: 1e-4
50
+ # uncomment this to skip the pre training sample
51
+ # skip_first_sample: true
52
+ # uncomment to completely disable sampling
53
+ # disable_sampling: true
54
+ # uncomment to use new bell curved weighting. Experimental but may produce better results
55
+ # linear_timesteps: true
56
+
57
+ # ema will smooth out learning, but could slow it down. Recommended to leave on.
58
+ ema_config:
59
+ use_ema: true
60
+ ema_decay: 0.99
61
+
62
+ # will probably need this if gpu supports it for flux, other dtypes may not work correctly
63
+ dtype: bf16
64
+ model:
65
+ # huggingface model name or path
66
+ name_or_path: "black-forest-labs/FLUX.1-dev"
67
+ is_flux: true
68
+ quantize: true # run 8bit mixed precision
69
+ # low_vram: true # uncomment this if the GPU is connected to your monitors. It will use less vram to quantize, but is slower.
70
+ sample:
71
+ sampler: "flowmatch" # must match train.noise_scheduler
72
+ sample_every: 250 # sample every this many steps
73
+ width: 1024
74
+ height: 1024
75
+ prompts:
76
+ # you can add [trigger] to the prompts here and it will be replaced with the trigger word
77
+ # - "[trigger] holding a sign that says 'I LOVE PROMPTS!'"
78
+ - "woman with red hair, playing chess at the park, bomb going off in the background"
79
+ - "a woman holding a coffee cup, in a beanie, sitting at a cafe"
80
+ - "a horse is a DJ at a night club, fish eye lens, smoke machine, lazer lights, holding a martini"
81
+ - "a man showing off his cool new t shirt at the beach, a shark is jumping out of the water in the background"
82
+ - "a bear building a log cabin in the snow covered mountains"
83
+ - "woman playing the guitar, on stage, singing a song, laser lights, punk rocker"
84
+ - "hipster man with a beard, building a chair, in a wood shop"
85
+ - "photo of a man, white background, medium shot, modeling clothing, studio lighting, white backdrop"
86
+ - "a man holding a sign that says, 'this is a sign'"
87
+ - "a bulldog, in a post apocalyptic world, with a shotgun, in a leather jacket, in a desert, with a motorcycle"
88
+ neg: "" # not used on flux
89
+ seed: 42
90
+ walk_seed: true
91
+ guidance_scale: 4
92
+ sample_steps: 20
93
+ # you can add any additional meta info here. [name] is replaced with config name at top
94
+ meta:
95
+ name: "[name]"
96
+ version: '1.0'
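All of these examples share the same nested layout (a top-level config with a single process entry holding the train / model / sample sections), which makes it easy to stamp out variants programmatically. A hedged PyYAML sketch; the output path is arbitrary, and the variant is launched the same way as the original example config.

```python
import yaml

with open("config/examples/train_lora_flux_24gb.yaml") as f:
    cfg = yaml.safe_load(f)

proc = cfg["config"]["process"][0]
cfg["config"]["name"] = "my_first_flux_lora_v2"
proc["train"]["lr"] = 5e-5          # try a gentler learning rate
proc["train"]["steps"] = 3000

with open("config/my_first_flux_lora_v2.yaml", "w") as f:
    yaml.safe_dump(cfg, f, sort_keys=False)
```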
config/examples/train_lora_flux_kontext_24gb.yaml ADDED
@@ -0,0 +1,106 @@
1
+ ---
2
+ job: extension
3
+ config:
4
+ # this name will be the folder and filename name
5
+ name: "my_first_flux_kontext_lora_v1"
6
+ process:
7
+ - type: 'sd_trainer'
8
+ # root folder to save training sessions/samples/weights
9
+ training_folder: "output"
10
+ # uncomment to see performance stats in the terminal every N steps
11
+ # performance_log_every: 1000
12
+ device: cuda:0
13
+ # if a trigger word is specified, it will be added to captions of training data if it does not already exist
14
+ # alternatively, in your captions you can add [trigger] and it will be replaced with the trigger word
15
+ # trigger_word: "p3r5on"
16
+ network:
17
+ type: "lora"
18
+ linear: 16
19
+ linear_alpha: 16
20
+ save:
21
+ dtype: float16 # precision to save
22
+ save_every: 250 # save every this many steps
23
+ max_step_saves_to_keep: 4 # how many intermittent saves to keep
24
+ push_to_hub: false #change this to True to push your trained model to Hugging Face.
25
+ # You can either set up a HF_TOKEN env variable or you'll be prompted to log-in
26
+ # hf_repo_id: your-username/your-model-slug
27
+ # hf_private: true #whether the repo is private or public
28
+ datasets:
29
+ # datasets are a folder of images. captions need to be txt files with the same name as the image
30
+ # for instance image2.jpg and image2.txt. Only jpg, jpeg, and png are supported currently
31
+ # images will automatically be resized and bucketed into the resolution specified
32
+ # on windows, escape back slashes with another backslash so
33
+ # "C:\\path\\to\\images\\folder"
34
+ - folder_path: "/path/to/images/folder"
35
+ # control_path is the folder of input images for kontext when using a paired dataset. These are the source images you want to change.
36
+ # You can comment this out and only use normal images if you don't have a paired dataset.
37
+ # Control images need to match the filenames on the folder path but in
38
+ # a different folder. These do not need captions.
39
+ control_path: "/path/to/control/folder"
40
+ caption_ext: "txt"
41
+ caption_dropout_rate: 0.05 # will drop out the caption 5% of time
42
+ shuffle_tokens: false # shuffle caption order, split by commas
43
+ cache_latents_to_disk: true # leave this true unless you know what you're doing
44
+ # Kontext runs input images at 2x the latent size. It may OOM at 1024 resolution with 24GB vram.
45
+ resolution: [ 512, 768 ] # flux enjoys multiple resolutions
46
+ # resolution: [ 512, 768, 1024 ]
47
+ train:
48
+ batch_size: 1
49
+ steps: 3000 # total number of steps to train 500 - 4000 is a good range
50
+ gradient_accumulation_steps: 1
51
+ train_unet: true
52
+ train_text_encoder: false # probably won't work with flux
53
+ gradient_checkpointing: true # need this on unless you have a ton of vram
54
+ noise_scheduler: "flowmatch" # for training only
55
+ optimizer: "adamw8bit"
56
+ lr: 1e-4
57
+ timestep_type: "weighted" # sigmoid, linear, or weighted.
58
+ # uncomment this to skip the pre training sample
59
+ # skip_first_sample: true
60
+ # uncomment to completely disable sampling
61
+ # disable_sampling: true
62
+
63
+ # ema will smooth out learning, but could slow it down.
64
+
65
+ # ema_config:
66
+ # use_ema: true
67
+ # ema_decay: 0.99
68
+
69
+ # will probably need this if gpu supports it for flux, other dtypes may not work correctly
70
+ dtype: bf16
71
+ model:
72
+ # huggingface model name or path. This model is gated.
73
+ # visit https://huggingface.co/black-forest-labs/FLUX.1-Kontext-dev to accept the terms and conditions
74
+ # and then you can use this model.
75
+ name_or_path: "black-forest-labs/FLUX.1-Kontext-dev"
76
+ arch: "flux_kontext"
77
+ quantize: true # run 8bit mixed precision
78
+ # low_vram: true # uncomment this if the GPU is connected to your monitors. It will use less vram to quantize, but is slower.
79
+ sample:
80
+ sampler: "flowmatch" # must match train.noise_scheduler
81
+ sample_every: 250 # sample every this many steps
82
+ width: 1024
83
+ height: 1024
84
+ prompts:
85
+ # you can add [trigger] to the prompts here and it will be replaced with the trigger word
86
+ # the --ctrl_img path is the one loaded to apply the kontext editing to
87
+ # - "[trigger] holding a sign that says 'I LOVE PROMPTS!'"
88
+ - "make the person smile --ctrl_img /path/to/control/folder/person1.jpg"
89
+ - "give the person an afro --ctrl_img /path/to/control/folder/person1.jpg"
90
+ - "turn this image into a cartoon --ctrl_img /path/to/control/folder/person1.jpg"
91
+ - "put this person in an action film --ctrl_img /path/to/control/folder/person1.jpg"
92
+ - "make this person a rapper in a rap music video --ctrl_img /path/to/control/folder/person1.jpg"
93
+ - "make the person smile --ctrl_img /path/to/control/folder/person1.jpg"
94
+ - "give the person an afro --ctrl_img /path/to/control/folder/person1.jpg"
95
+ - "turn this image into a cartoon --ctrl_img /path/to/control/folder/person1.jpg"
96
+ - "put this person in an action film --ctrl_img /path/to/control/folder/person1.jpg"
97
+ - "make this person a rapper in a rap music video --ctrl_img /path/to/control/folder/person1.jpg"
98
+ neg: "" # not used on flux
99
+ seed: 42
100
+ walk_seed: true
101
+ guidance_scale: 4
102
+ sample_steps: 20
103
+ # you can add any additional meta info here. [name] is replaced with config name at top
104
+ meta:
105
+ name: "[name]"
106
+ version: '1.0'
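The Kontext sample prompts above carry their control image inline via a --ctrl_img flag. A small sketch for splitting that flag back out of a prompt string, e.g. to verify the referenced paths exist before training; the toolkit's own prompt parsing may differ.

```python
import shlex

def split_prompt_flags(prompt: str) -> tuple[str, dict[str, str]]:
    """Return the plain prompt text and any trailing --key value pairs."""
    tokens = shlex.split(prompt)
    text, flags = [], {}
    i = 0
    while i < len(tokens):
        if tokens[i].startswith("--") and i + 1 < len(tokens):
            flags[tokens[i][2:]] = tokens[i + 1]
            i += 2
        else:
            text.append(tokens[i])
            i += 1
    return " ".join(text), flags

print(split_prompt_flags("make the person smile --ctrl_img /path/to/control/folder/person1.jpg"))
```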
config/examples/train_lora_flux_schnell_24gb.yaml ADDED
@@ -0,0 +1,98 @@
1
+ ---
2
+ job: extension
3
+ config:
4
+ # this name will be the folder and filename name
5
+ name: "my_first_flux_lora_v1"
6
+ process:
7
+ - type: 'sd_trainer'
8
+ # root folder to save training sessions/samples/weights
9
+ training_folder: "output"
10
+ # uncomment to see performance stats in the terminal every N steps
11
+ # performance_log_every: 1000
12
+ device: cuda:0
13
+ # if a trigger word is specified, it will be added to captions of training data if it does not already exist
14
+ # alternatively, in your captions you can add [trigger] and it will be replaced with the trigger word
15
+ # trigger_word: "p3r5on"
16
+ network:
17
+ type: "lora"
18
+ linear: 16
19
+ linear_alpha: 16
20
+ save:
21
+ dtype: float16 # precision to save
22
+ save_every: 250 # save every this many steps
23
+ max_step_saves_to_keep: 4 # how many intermittent saves to keep
24
+ push_to_hub: false #change this to True to push your trained model to Hugging Face.
25
+ # You can either set up a HF_TOKEN env variable or you'll be prompted to log-in
26
+ # hf_repo_id: your-username/your-model-slug
27
+ # hf_private: true #whether the repo is private or public
28
+ datasets:
29
+ # datasets are a folder of images. captions need to be txt files with the same name as the image
30
+ # for instance image2.jpg and image2.txt. Only jpg, jpeg, and png are supported currently
31
+ # images will automatically be resized and bucketed into the resolution specified
32
+ # on windows, escape back slashes with another backslash so
33
+ # "C:\\path\\to\\images\\folder"
34
+ - folder_path: "/path/to/images/folder"
35
+ caption_ext: "txt"
36
+ caption_dropout_rate: 0.05 # will drop out the caption 5% of time
37
+ shuffle_tokens: false # shuffle caption order, split by commas
38
+ cache_latents_to_disk: true # leave this true unless you know what you're doing
39
+ resolution: [ 512, 768, 1024 ] # flux enjoys multiple resolutions
40
+ train:
41
+ batch_size: 1
42
+ steps: 2000 # total number of steps to train 500 - 4000 is a good range
43
+ gradient_accumulation_steps: 1
44
+ train_unet: true
45
+ train_text_encoder: false # probably won't work with flux
46
+ gradient_checkpointing: true # need this on unless you have a ton of vram
47
+ noise_scheduler: "flowmatch" # for training only
48
+ optimizer: "adamw8bit"
49
+ lr: 1e-4
50
+ # uncomment this to skip the pre training sample
51
+ # skip_first_sample: true
52
+ # uncomment to completely disable sampling
53
+ # disable_sampling: true
54
+ # uncomment to use new bell curved weighting. Experimental but may produce better results
55
+ # linear_timesteps: true
56
+
57
+ # ema will smooth out learning, but could slow it down. Recommended to leave on.
58
+ ema_config:
59
+ use_ema: true
60
+ ema_decay: 0.99
61
+
62
+ # will probably need this if gpu supports it for flux, other dtypes may not work correctly
63
+ dtype: bf16
64
+ model:
65
+ # huggingface model name or path
66
+ name_or_path: "black-forest-labs/FLUX.1-schnell"
67
+ assistant_lora_path: "ostris/FLUX.1-schnell-training-adapter" # Required for flux schnell training
68
+ is_flux: true
69
+ quantize: true # run 8bit mixed precision
70
+ # low_vram is painfully slow to fuse in the adapter; avoid it unless absolutely necessary
71
+ # low_vram: true # uncomment this if the GPU is connected to your monitors. It will use less vram to quantize, but is slower.
72
+ sample:
73
+ sampler: "flowmatch" # must match train.noise_scheduler
74
+ sample_every: 250 # sample every this many steps
75
+ width: 1024
76
+ height: 1024
77
+ prompts:
78
+ # you can add [trigger] to the prompts here and it will be replaced with the trigger word
79
+ # - "[trigger] holding a sign that says 'I LOVE PROMPTS!'"\
80
+ - "woman with red hair, playing chess at the park, bomb going off in the background"
81
+ - "a woman holding a coffee cup, in a beanie, sitting at a cafe"
82
+ - "a horse is a DJ at a night club, fish eye lens, smoke machine, lazer lights, holding a martini"
83
+ - "a man showing off his cool new t shirt at the beach, a shark is jumping out of the water in the background"
84
+ - "a bear building a log cabin in the snow covered mountains"
85
+ - "woman playing the guitar, on stage, singing a song, laser lights, punk rocker"
86
+ - "hipster man with a beard, building a chair, in a wood shop"
87
+ - "photo of a man, white background, medium shot, modeling clothing, studio lighting, white backdrop"
88
+ - "a man holding a sign that says, 'this is a sign'"
89
+ - "a bulldog, in a post apocalyptic world, with a shotgun, in a leather jacket, in a desert, with a motorcycle"
90
+ neg: "" # not used on flux
91
+ seed: 42
92
+ walk_seed: true
93
+ guidance_scale: 1 # schnell does not do guidance
94
+ sample_steps: 4 # 1 - 4 works well
95
+ # you can add any additional meta info here. [name] is replaced with config name at top
96
+ meta:
97
+ name: "[name]"
98
+ version: '1.0'
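Note: a minimal sketch of running the schnell example above, assuming the repository's run.py entry point and that you copy the example before editing it (file names here are illustrative):

    # copy the example, point folder_path at your dataset, then launch
    cp config/examples/train_lora_flux_schnell_24gb.yaml config/my_flux_schnell_lora.yaml
    python run.py config/my_flux_schnell_lora.yaml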
config/examples/train_lora_hidream_48.yaml ADDED
@@ -0,0 +1,112 @@
1
+ # HiDream training is still highly experimental. The settings here will take ~35.2GB of vram to train.
2
+ # It is not possible to train on a single 24GB card yet, but I am working on it. If you have more VRAM
3
+ # I highly recommend first disabling quantization on the model itself if you can. You can leave the TEs quantized.
4
+ # HiDream has a mixture of experts that may need special training considerations that I have not
+ # implemented properly. The current implementation seems to work well for LoRA training, but
6
+ # may not be effective for longer training runs. The implementation could change in future updates
7
+ # so your results may vary when this happens.
8
+
9
+ ---
10
+ job: extension
11
+ config:
12
+ # this name will be the folder and filename name
13
+ name: "my_first_hidream_lora_v1"
14
+ process:
15
+ - type: 'sd_trainer'
16
+ # root folder to save training sessions/samples/weights
17
+ training_folder: "output"
18
+ # uncomment to see performance stats in the terminal every N steps
19
+ # performance_log_every: 1000
20
+ device: cuda:0
21
+ # if a trigger word is specified, it will be added to captions of training data if it does not already exist
22
+ # alternatively, in your captions you can add [trigger] and it will be replaced with the trigger word
23
+ # trigger_word: "p3r5on"
24
+ network:
25
+ type: "lora"
26
+ linear: 32
27
+ linear_alpha: 32
28
+ network_kwargs:
29
+ # it is probably best to ignore the mixture of experts since only 2 are active each block. It works if you activate it, but I wouldn't.
30
+ # proper training of it is not fully implemented
31
+ ignore_if_contains:
32
+ - "ff_i.experts"
33
+ - "ff_i.gate"
34
+ save:
35
+ dtype: bfloat16 # precision to save
36
+ save_every: 250 # save every this many steps
37
+ max_step_saves_to_keep: 4 # how many intermittent saves to keep
38
+ datasets:
39
+ # datasets are a folder of images. captions need to be txt files with the same name as the image
40
+ # for instance image2.jpg and image2.txt. Only jpg, jpeg, and png are supported currently
41
+ # images will automatically be resized and bucketed into the resolution specified
42
+ # on windows, escape back slashes with another backslash so
43
+ # "C:\\path\\to\\images\\folder"
44
+ - folder_path: "/path/to/images/folder"
45
+ caption_ext: "txt"
46
+ caption_dropout_rate: 0.05 # will drop out the caption 5% of time
47
+ resolution: [ 512, 768, 1024 ] # hidream enjoys multiple resolutions
48
+ train:
49
+ batch_size: 1
50
+ steps: 3000 # total number of steps to train 500 - 4000 is a good range
51
+ gradient_accumulation_steps: 1
52
+ train_unet: true
53
+ train_text_encoder: false # wont work with hidream
54
+ gradient_checkpointing: true # need this on unless you have a ton of vram
55
+ noise_scheduler: "flowmatch" # for training only
56
+ timestep_type: shift # sigmoid, shift, linear
57
+ optimizer: "adamw8bit"
58
+ lr: 2e-4
59
+ # uncomment this to skip the pre training sample
60
+ # skip_first_sample: true
61
+ # uncomment to completely disable sampling
62
+ # disable_sampling: true
63
+ # uncomment to use new bell curved weighting. Experimental but may produce better results
64
+ # linear_timesteps: true
65
+
66
+ # ema will smooth out learning, but could slow it down. Defaults off
67
+ ema_config:
68
+ use_ema: false
69
+ ema_decay: 0.99
70
+
71
+ # will probably need this if gpu supports it for hidream, other dtypes may not work correctly
72
+ dtype: bf16
73
+ model:
74
+ # the transformer will get grabbed from this hf repo
75
+ # warning ONLY train on Full. The dev and fast models are distilled and will break
76
+ name_or_path: "HiDream-ai/HiDream-I1-Full"
77
+ # the extras will be grabbed from this hf repo. (text encoder, vae)
78
+ extras_name_or_path: "HiDream-ai/HiDream-I1-Full"
79
+ arch: "hidream"
80
+ # both need to be quantized to train on 48GB currently
81
+ quantize: true
82
+ quantize_te: true
83
+ model_kwargs:
84
+ # llama is a gated model. It defaults to the unsloth version, but you can set the llama path here
85
+ llama_model_path: "unsloth/Meta-Llama-3.1-8B-Instruct"
86
+ sample:
87
+ sampler: "flowmatch" # must match train.noise_scheduler
88
+ sample_every: 250 # sample every this many steps
89
+ width: 1024
90
+ height: 1024
91
+ prompts:
92
+ # you can add [trigger] to the prompts here and it will be replaced with the trigger word
93
+ # - "[trigger] holding a sign that says 'I LOVE PROMPTS!'"\
94
+ - "woman with red hair, playing chess at the park, bomb going off in the background"
95
+ - "a woman holding a coffee cup, in a beanie, sitting at a cafe"
96
+ - "a horse is a DJ at a night club, fish eye lens, smoke machine, lazer lights, holding a martini"
97
+ - "a man showing off his cool new t shirt at the beach, a shark is jumping out of the water in the background"
98
+ - "a bear building a log cabin in the snow covered mountains"
99
+ - "woman playing the guitar, on stage, singing a song, laser lights, punk rocker"
100
+ - "hipster man with a beard, building a chair, in a wood shop"
101
+ - "photo of a man, white background, medium shot, modeling clothing, studio lighting, white backdrop"
102
+ - "a man holding a sign that says, 'this is a sign'"
103
+ - "a bulldog, in a post apocalyptic world, with a shotgun, in a leather jacket, in a desert, with a motorcycle"
104
+ neg: ""
105
+ seed: 42
106
+ walk_seed: true
107
+ guidance_scale: 4
108
+ sample_steps: 25
109
+ # you can add any additional meta info here. [name] is replaced with config name at top
110
+ meta:
111
+ name: "[name]"
112
+ version: '1.0'
config/examples/train_lora_lumina.yaml ADDED
@@ -0,0 +1,96 @@
1
+ ---
2
+ # This configuration requires 20GB of VRAM or more to operate
3
+ job: extension
4
+ config:
5
+ # this name will be the folder and filename name
6
+ name: "my_first_lumina_lora_v1"
7
+ process:
8
+ - type: 'sd_trainer'
9
+ # root folder to save training sessions/samples/weights
10
+ training_folder: "output"
11
+ # uncomment to see performance stats in the terminal every N steps
12
+ # performance_log_every: 1000
13
+ device: cuda:0
14
+ # if a trigger word is specified, it will be added to captions of training data if it does not already exist
15
+ # alternatively, in your captions you can add [trigger] and it will be replaced with the trigger word
16
+ # trigger_word: "p3r5on"
17
+ network:
18
+ type: "lora"
19
+ linear: 16
20
+ linear_alpha: 16
21
+ save:
22
+ dtype: bf16 # precision to save
23
+ save_every: 250 # save every this many steps
24
+ max_step_saves_to_keep: 2 # how many intermittent saves to keep
25
+ save_format: 'diffusers' # 'diffusers'
26
+ datasets:
27
+ # datasets are a folder of images. captions need to be txt files with the same name as the image
28
+ # for instance image2.jpg and image2.txt. Only jpg, jpeg, and png are supported currently
29
+ # images will automatically be resized and bucketed into the resolution specified
30
+ # on windows, escape back slashes with another backslash so
31
+ # "C:\\path\\to\\images\\folder"
32
+ - folder_path: "/path/to/images/folder"
33
+ caption_ext: "txt"
34
+ caption_dropout_rate: 0.05 # will drop out the caption 5% of time
35
+ shuffle_tokens: false # shuffle caption order, split by commas
36
+ # cache_latents_to_disk: true # leave this true unless you know what you're doing
37
+ resolution: [ 512, 768, 1024 ] # lumina2 enjoys multiple resolutions
38
+ train:
39
+ batch_size: 1
40
+
41
+ # can be 'sigmoid', 'linear', or 'lumina2_shift'
42
+ timestep_type: 'lumina2_shift'
43
+
44
+ steps: 2000 # total number of steps to train 500 - 4000 is a good range
45
+ gradient_accumulation: 1
46
+ train_unet: true
47
+ train_text_encoder: false # probably won't work with lumina2
48
+ gradient_checkpointing: true # need this on unless you have a ton of vram
49
+ noise_scheduler: "flowmatch" # for training only
50
+ optimizer: "adamw8bit"
51
+ lr: 1e-4
52
+ # uncomment this to skip the pre training sample
53
+ # skip_first_sample: true
54
+ # uncomment to completely disable sampling
55
+ # disable_sampling: true
56
+
57
+ # ema will smooth out learning, but could slow it down. Recommended to leave on if you have the vram
58
+ ema_config:
59
+ use_ema: true
60
+ ema_decay: 0.99
61
+
62
+ # will probably need this if gpu supports it for lumina2, other dtypes may not work correctly
63
+ dtype: bf16
64
+ model:
65
+ # huggingface model name or path
66
+ name_or_path: "Alpha-VLLM/Lumina-Image-2.0"
67
+ is_lumina2: true # lumina2 architecture
68
+ # you can quantize just the Gemma2 text encoder here to save vram
69
+ quantize_te: true
70
+ sample:
71
+ sampler: "flowmatch" # must match train.noise_scheduler
72
+ sample_every: 250 # sample every this many steps
73
+ width: 1024
74
+ height: 1024
75
+ prompts:
76
+ # you can add [trigger] to the prompts here and it will be replaced with the trigger word
77
+ # - "[trigger] holding a sign that says 'I LOVE PROMPTS!'"\
78
+ - "woman with red hair, playing chess at the park, bomb going off in the background"
79
+ - "a woman holding a coffee cup, in a beanie, sitting at a cafe"
80
+ - "a horse is a DJ at a night club, fish eye lens, smoke machine, lazer lights, holding a martini"
81
+ - "a man showing off his cool new t shirt at the beach, a shark is jumping out of the water in the background"
82
+ - "a bear building a log cabin in the snow covered mountains"
83
+ - "woman playing the guitar, on stage, singing a song, laser lights, punk rocker"
84
+ - "hipster man with a beard, building a chair, in a wood shop"
85
+ - "photo of a cat that is half black and half orange tabby, split down the middle. The cat has on a blue tophat. They are holding a martini glass with a pink ball of yarn in it with green knitting needles sticking out, in one paw. In the other paw, they are holding a DVD case for a movie titled, \"This is a test\" that has a golden robot on it. In the background is a busy night club with a giant mushroom man dancing with a bear."
86
+ - "a man holding a sign that says, 'this is a sign'"
87
+ - "a bulldog, in a post apocalyptic world, with a shotgun, in a leather jacket, in a desert, with a motorcycle"
88
+ neg: ""
89
+ seed: 42
90
+ walk_seed: true
91
+ guidance_scale: 4.0
92
+ sample_steps: 25
93
+ # you can add any additional meta info here. [name] is replaced with config name at top
94
+ meta:
95
+ name: "[name]"
96
+ version: '1.0'
config/examples/train_lora_omnigen2_24gb.yaml ADDED
@@ -0,0 +1,94 @@
1
+ ---
2
+ job: extension
3
+ config:
4
+ # this name will be the folder and filename name
5
+ name: "my_first_omnigen2_lora_v1"
6
+ process:
7
+ - type: 'sd_trainer'
8
+ # root folder to save training sessions/samples/weights
9
+ training_folder: "output"
10
+ # uncomment to see performance stats in the terminal every N steps
11
+ # performance_log_every: 1000
12
+ device: cuda:0
13
+ # if a trigger word is specified, it will be added to captions of training data if it does not already exist
14
+ # alternatively, in your captions you can add [trigger] and it will be replaced with the trigger word
15
+ # trigger_word: "p3r5on"
16
+ network:
17
+ type: "lora"
18
+ linear: 16
19
+ linear_alpha: 16
20
+ save:
21
+ dtype: float16 # precision to save
22
+ save_every: 250 # save every this many steps
23
+ max_step_saves_to_keep: 4 # how many intermittent saves to keep
24
+ push_to_hub: false #change this to True to push your trained model to Hugging Face.
25
+ # You can either set up a HF_TOKEN env variable or you'll be prompted to log-in
26
+ # hf_repo_id: your-username/your-model-slug
27
+ # hf_private: true #whether the repo is private or public
28
+ datasets:
29
+ # datasets are a folder of images. captions need to be txt files with the same name as the image
30
+ # for instance image2.jpg and image2.txt. Only jpg, jpeg, and png are supported currently
31
+ # images will automatically be resized and bucketed into the resolution specified
32
+ # on windows, escape back slashes with another backslash so
33
+ # "C:\\path\\to\\images\\folder"
34
+ - folder_path: "/path/to/images/folder"
35
+ caption_ext: "txt"
36
+ caption_dropout_rate: 0.05 # will drop out the caption 5% of time
37
+ shuffle_tokens: false # shuffle caption order, split by commas
38
+ cache_latents_to_disk: true # leave this true unless you know what you're doing
39
+ resolution: [ 512, 768, 1024 ] # omnigen2 should work with multiple resolutions
40
+ train:
41
+ batch_size: 1
42
+ steps: 3000 # total number of steps to train 500 - 4000 is a good range
43
+ gradient_accumulation: 1
44
+ train_unet: true
45
+ train_text_encoder: false # probably won't work with omnigen2
46
+ gradient_checkpointing: true # need this on unless you have a ton of vram
47
+ noise_scheduler: "flowmatch" # for training only
48
+ optimizer: "adamw8bit"
49
+ lr: 1e-4
50
+ timestep_type: 'sigmoid' # sigmoid, linear, shift
51
+ # uncomment this to skip the pre training sample
52
+ # skip_first_sample: true
53
+ # uncomment to completely disable sampling
54
+ # disable_sampling: true
55
+
56
+ # ema will smooth out learning, but could slow it down.
57
+ # ema_config:
58
+ # use_ema: true
59
+ # ema_decay: 0.99
60
+
61
+ # will probably need this if gpu supports it for omnigen2, other dtypes may not work correctly
62
+ dtype: bf16
63
+ model:
64
+ name_or_path: "OmniGen2/OmniGen2"
65
+ arch: "omnigen2"
66
+ quantize_te: true # quantize only the TE (text encoder)
67
+ # quantize: true # quantize transformer
68
+ sample:
69
+ sampler: "flowmatch" # must match train.noise_scheduler
70
+ sample_every: 250 # sample every this many steps
71
+ width: 1024
72
+ height: 1024
73
+ prompts:
74
+ # you can add [trigger] to the prompts here and it will be replaced with the trigger word
75
+ # - "[trigger] holding a sign that says 'I LOVE PROMPTS!'"\
76
+ - "woman with red hair, playing chess at the park, bomb going off in the background"
77
+ - "a woman holding a coffee cup, in a beanie, sitting at a cafe"
78
+ - "a horse is a DJ at a night club, fish eye lens, smoke machine, lazer lights, holding a martini"
79
+ - "a man showing off his cool new t shirt at the beach, a shark is jumping out of the water in the background"
80
+ - "a bear building a log cabin in the snow covered mountains"
81
+ - "woman playing the guitar, on stage, singing a song, laser lights, punk rocker"
82
+ - "hipster man with a beard, building a chair, in a wood shop"
83
+ - "photo of a man, white background, medium shot, modeling clothing, studio lighting, white backdrop"
84
+ - "a man holding a sign that says, 'this is a sign'"
85
+ - "a bulldog, in a post apocalyptic world, with a shotgun, in a leather jacket, in a desert, with a motorcycle"
86
+ neg: "" # negative prompt, optional
87
+ seed: 42
88
+ walk_seed: true
89
+ guidance_scale: 4
90
+ sample_steps: 25
91
+ # you can add any additional meta info here. [name] is replaced with config name at top
92
+ meta:
93
+ name: "[name]"
94
+ version: '1.0'
config/examples/train_lora_sd35_large_24gb.yaml ADDED
@@ -0,0 +1,97 @@
1
+ ---
2
+ # NOTE!! THIS IS CURRENTLY EXPERIMENTAL AND UNDER DEVELOPMENT. SOME THINGS WILL CHANGE
3
+ job: extension
4
+ config:
5
+ # this name will be the folder and filename name
6
+ name: "my_first_sd3l_lora_v1"
7
+ process:
8
+ - type: 'sd_trainer'
9
+ # root folder to save training sessions/samples/weights
10
+ training_folder: "output"
11
+ # uncomment to see performance stats in the terminal every N steps
12
+ # performance_log_every: 1000
13
+ device: cuda:0
14
+ # if a trigger word is specified, it will be added to captions of training data if it does not already exist
15
+ # alternatively, in your captions you can add [trigger] and it will be replaced with the trigger word
16
+ # trigger_word: "p3r5on"
17
+ network:
18
+ type: "lora"
19
+ linear: 16
20
+ linear_alpha: 16
21
+ save:
22
+ dtype: float16 # precision to save
23
+ save_every: 250 # save every this many steps
24
+ max_step_saves_to_keep: 4 # how many intermittent saves to keep
25
+ push_to_hub: false #change this to True to push your trained model to Hugging Face.
26
+ # You can either set up a HF_TOKEN env variable or you'll be prompted to log-in
27
+ # hf_repo_id: your-username/your-model-slug
28
+ # hf_private: true #whether the repo is private or public
29
+ datasets:
30
+ # datasets are a folder of images. captions need to be txt files with the same name as the image
31
+ # for instance image2.jpg and image2.txt. Only jpg, jpeg, and png are supported currently
32
+ # images will automatically be resized and bucketed into the resolution specified
33
+ # on windows, escape back slashes with another backslash so
34
+ # "C:\\path\\to\\images\\folder"
35
+ - folder_path: "/path/to/images/folder"
36
+ caption_ext: "txt"
37
+ caption_dropout_rate: 0.05 # will drop out the caption 5% of time
38
+ shuffle_tokens: false # shuffle caption order, split by commas
39
+ cache_latents_to_disk: true # leave this true unless you know what you're doing
40
+ resolution: [ 1024 ]
41
+ train:
42
+ batch_size: 1
43
+ steps: 2000 # total number of steps to train 500 - 4000 is a good range
44
+ gradient_accumulation_steps: 1
45
+ train_unet: true
46
+ train_text_encoder: false # May not fully work with SD3 yet
47
+ gradient_checkpointing: true # need this on unless you have a ton of vram
48
+ noise_scheduler: "flowmatch"
49
+ timestep_type: "linear" # linear or sigmoid
50
+ optimizer: "adamw8bit"
51
+ lr: 1e-4
52
+ # uncomment this to skip the pre training sample
53
+ # skip_first_sample: true
54
+ # uncomment to completely disable sampling
55
+ # disable_sampling: true
56
+ # uncomment to use new bell curved weighting. Experimental but may produce better results
57
+ # linear_timesteps: true
58
+
59
+ # ema will smooth out learning, but could slow it down. Recommended to leave on.
60
+ ema_config:
61
+ use_ema: true
62
+ ema_decay: 0.99
63
+
64
+ # will probably need this if gpu supports it for sd3, other dtypes may not work correctly
65
+ dtype: bf16
66
+ model:
67
+ # huggingface model name or path
68
+ name_or_path: "stabilityai/stable-diffusion-3.5-large"
69
+ is_v3: true
70
+ quantize: true # run 8bit mixed precision
71
+ sample:
72
+ sampler: "flowmatch" # must match train.noise_scheduler
73
+ sample_every: 250 # sample every this many steps
74
+ width: 1024
75
+ height: 1024
76
+ prompts:
77
+ # you can add [trigger] to the prompts here and it will be replaced with the trigger word
78
+ # - "[trigger] holding a sign that says 'I LOVE PROMPTS!'"\
79
+ - "woman with red hair, playing chess at the park, bomb going off in the background"
80
+ - "a woman holding a coffee cup, in a beanie, sitting at a cafe"
81
+ - "a horse is a DJ at a night club, fish eye lens, smoke machine, lazer lights, holding a martini"
82
+ - "a man showing off his cool new t shirt at the beach, a shark is jumping out of the water in the background"
83
+ - "a bear building a log cabin in the snow covered mountains"
84
+ - "woman playing the guitar, on stage, singing a song, laser lights, punk rocker"
85
+ - "hipster man with a beard, building a chair, in a wood shop"
86
+ - "photo of a man, white background, medium shot, modeling clothing, studio lighting, white backdrop"
87
+ - "a man holding a sign that says, 'this is a sign'"
88
+ - "a bulldog, in a post apocalyptic world, with a shotgun, in a leather jacket, in a desert, with a motorcycle"
89
+ neg: ""
90
+ seed: 42
91
+ walk_seed: true
92
+ guidance_scale: 4
93
+ sample_steps: 25
94
+ # you can add any additional meta info here. [name] is replaced with config name at top
95
+ meta:
96
+ name: "[name]"
97
+ version: '1.0'
config/examples/train_lora_wan21_14b_24gb.yaml ADDED
@@ -0,0 +1,101 @@
1
+ # IMPORTANT: The Wan2.1 14B model is huge. This config should work on 24GB GPUs. It cannot
2
+ # support keeping the text encoder on GPU while training with 24GB, so it is only good
3
+ # for training on a single prompt, for example a person with a trigger word.
4
+ # To train on captions, you need more vram for now.
5
+ ---
6
+ job: extension
7
+ config:
8
+ # this name will be the folder and filename name
9
+ name: "my_first_wan21_14b_lora_v1"
10
+ process:
11
+ - type: 'sd_trainer'
12
+ # root folder to save training sessions/samples/weights
13
+ training_folder: "output"
14
+ # uncomment to see performance stats in the terminal every N steps
15
+ # performance_log_every: 1000
16
+ device: cuda:0
17
+ # if a trigger word is specified, it will be added to captions of training data if it does not already exist
18
+ # alternatively, in your captions you can add [trigger] and it will be replaced with the trigger word
19
+ # this is probably needed for 24GB cards when offloading TE to CPU
20
+ trigger_word: "p3r5on"
21
+ network:
22
+ type: "lora"
23
+ linear: 32
24
+ linear_alpha: 32
25
+ save:
26
+ dtype: float16 # precision to save
27
+ save_every: 250 # save every this many steps
28
+ max_step_saves_to_keep: 4 # how many intermittent saves to keep
29
+ push_to_hub: false #change this to True to push your trained model to Hugging Face.
30
+ # You can either set up a HF_TOKEN env variable or you'll be prompted to log-in
31
+ # hf_repo_id: your-username/your-model-slug
32
+ # hf_private: true #whether the repo is private or public
33
+ datasets:
34
+ # datasets are a folder of images. captions need to be txt files with the same name as the image
35
+ # for instance image2.jpg and image2.txt. Only jpg, jpeg, and png are supported currently
36
+ # images will automatically be resized and bucketed into the resolution specified
37
+ # on windows, escape back slashes with another backslash so
38
+ # "C:\\path\\to\\images\\folder"
39
+ # AI-Toolkit does not currently support video datasets, we will train on 1 frame at a time
40
+ # it works well for characters, but not as well for "actions"
41
+ - folder_path: "/path/to/images/folder"
42
+ caption_ext: "txt"
43
+ caption_dropout_rate: 0.05 # will drop out the caption 5% of time
44
+ shuffle_tokens: false # shuffle caption order, split by commas
45
+ cache_latents_to_disk: true # leave this true unless you know what you're doing
46
+ resolution: [ 632 ] # will be around 480p
47
+ train:
48
+ batch_size: 1
49
+ steps: 2000 # total number of steps to train 500 - 4000 is a good range
50
+ gradient_accumulation: 1
51
+ train_unet: true
52
+ train_text_encoder: false # probably won't work with wan
53
+ gradient_checkpointing: true # need this on unless you have a ton of vram
54
+ noise_scheduler: "flowmatch" # for training only
55
+ timestep_type: 'sigmoid'
56
+ optimizer: "adamw8bit"
57
+ lr: 1e-4
58
+ optimizer_params:
59
+ weight_decay: 1e-4
60
+ # uncomment this to skip the pre training sample
61
+ # skip_first_sample: true
62
+ # uncomment to completely disable sampling
63
+ # disable_sampling: true
64
+ # ema will smooth out learning, but could slow it down. Recommended to leave on.
65
+ ema_config:
66
+ use_ema: true
67
+ ema_decay: 0.99
68
+ dtype: bf16
69
+ # required for 24GB cards
70
+ # this will encode your trigger word and use those embeddings for every image in the dataset
71
+ unload_text_encoder: true
72
+ model:
73
+ # huggingface model name or path
74
+ name_or_path: "Wan-AI/Wan2.1-T2V-14B-Diffusers"
75
+ arch: 'wan21'
76
+ # these settings will save as much vram as possible
77
+ quantize: true
78
+ quantize_te: true
79
+ low_vram: true
80
+ sample:
81
+ sampler: "flowmatch"
82
+ sample_every: 250 # sample every this many steps
83
+ width: 832
84
+ height: 480
85
+ num_frames: 40
86
+ fps: 15
87
+ # samples take a long time. so use them sparingly
88
+ # samples will be animated webp files, if you don't see them animated, open in a browser.
89
+ prompts:
90
+ # you can add [trigger] to the prompts here and it will be replaced with the trigger word
91
+ # - "[trigger] holding a sign that says 'I LOVE PROMPTS!'"\
92
+ - "woman playing the guitar, on stage, singing a song, laser lights, punk rocker"
93
+ neg: ""
94
+ seed: 42
95
+ walk_seed: true
96
+ guidance_scale: 5
97
+ sample_steps: 30
98
+ # you can add any additional meta info here. [name] is replaced with config name at top
99
+ meta:
100
+ name: "[name]"
101
+ version: '1.0'
config/examples/train_lora_wan21_1b_24gb.yaml ADDED
@@ -0,0 +1,90 @@
1
+ ---
2
+ job: extension
3
+ config:
4
+ # this name will be the folder and filename name
5
+ name: "my_first_wan21_1b_lora_v1"
6
+ process:
7
+ - type: 'sd_trainer'
8
+ # root folder to save training sessions/samples/weights
9
+ training_folder: "output"
10
+ # uncomment to see performance stats in the terminal every N steps
11
+ # performance_log_every: 1000
12
+ device: cuda:0
13
+ # if a trigger word is specified, it will be added to captions of training data if it does not already exist
14
+ # alternatively, in your captions you can add [trigger] and it will be replaced with the trigger word
15
+ # trigger_word: "p3r5on"
16
+ network:
17
+ type: "lora"
18
+ linear: 32
19
+ linear_alpha: 32
20
+ save:
21
+ dtype: float16 # precision to save
22
+ save_every: 250 # save every this many steps
23
+ max_step_saves_to_keep: 4 # how many intermittent saves to keep
24
+ push_to_hub: false #change this to True to push your trained model to Hugging Face.
25
+ # You can either set up a HF_TOKEN env variable or you'll be prompted to log-in
26
+ # hf_repo_id: your-username/your-model-slug
27
+ # hf_private: true #whether the repo is private or public
28
+ datasets:
29
+ # datasets are a folder of images. captions need to be txt files with the same name as the image
30
+ # for instance image2.jpg and image2.txt. Only jpg, jpeg, and png are supported currently
31
+ # images will automatically be resized and bucketed into the resolution specified
32
+ # on windows, escape back slashes with another backslash so
33
+ # "C:\\path\\to\\images\\folder"
34
+ # AI-Toolkit does not currently support video datasets, we will train on 1 frame at a time
35
+ # it works well for characters, but not as well for "actions"
36
+ - folder_path: "/path/to/images/folder"
37
+ caption_ext: "txt"
38
+ caption_dropout_rate: 0.05 # will drop out the caption 5% of time
39
+ shuffle_tokens: false # shuffle caption order, split by commas
40
+ cache_latents_to_disk: true # leave this true unless you know what you're doing
41
+ resolution: [ 632 ] # will be around 480p
42
+ train:
43
+ batch_size: 1
44
+ steps: 2000 # total number of steps to train 500 - 4000 is a good range
45
+ gradient_accumulation: 1
46
+ train_unet: true
47
+ train_text_encoder: false # probably won't work with wan
48
+ gradient_checkpointing: true # need this on unless you have a ton of vram
49
+ noise_scheduler: "flowmatch" # for training only
50
+ timestep_type: 'sigmoid'
51
+ optimizer: "adamw8bit"
52
+ lr: 1e-4
53
+ optimizer_params:
54
+ weight_decay: 1e-4
55
+ # uncomment this to skip the pre training sample
56
+ # skip_first_sample: true
57
+ # uncomment to completely disable sampling
58
+ # disable_sampling: true
59
+ # ema will smooth out learning, but could slow it down. Recommended to leave on.
60
+ ema_config:
61
+ use_ema: true
62
+ ema_decay: 0.99
63
+ dtype: bf16
64
+ model:
65
+ # huggingface model name or path
66
+ name_or_path: "Wan-AI/Wan2.1-T2V-1.3B-Diffusers"
67
+ arch: 'wan21'
68
+ quantize_te: true # saves vram
69
+ sample:
70
+ sampler: "flowmatch"
71
+ sample_every: 250 # sample every this many steps
72
+ width: 832
73
+ height: 480
74
+ num_frames: 40
75
+ fps: 15
76
+ # samples take a long time. so use them sparingly
77
+ # samples will be animated webp files, if you don't see them animated, open in a browser.
78
+ prompts:
79
+ # you can add [trigger] to the prompts here and it will be replaced with the trigger word
80
+ # - "[trigger] holding a sign that says 'I LOVE PROMPTS!'"\
81
+ - "woman playing the guitar, on stage, singing a song, laser lights, punk rocker"
82
+ neg: ""
83
+ seed: 42
84
+ walk_seed: true
85
+ guidance_scale: 5
86
+ sample_steps: 30
87
+ # you can add any additional meta info here. [name] is replaced with config name at top
88
+ meta:
89
+ name: "[name]"
90
+ version: '1.0'
config/examples/train_slider.example.yml ADDED
@@ -0,0 +1,230 @@
1
+ ---
2
+ # This is in yaml format. You can use json if you prefer
3
+ # I like both but yaml is easier to write
4
+ # Plus it has comments which is nice for documentation
5
+ # This is the config I use on my sliders, It is solid and tested
6
+ job: train
7
+ config:
8
+ # the name will be used to create a folder in the output folder
9
+ # it will also replace any [name] token in the rest of this config
10
+ name: detail_slider_v1
11
+ # folder will be created with name above in folder below
12
+ # it can be relative to the project root or absolute
13
+ training_folder: "output/LoRA"
14
+ device: cuda:0 # cpu, cuda:0, etc
15
+ # for tensorboard logging, we will make a subfolder for this job
16
+ log_dir: "output/.tensorboard"
17
+ # you can stack processes for other jobs, It is not tested with sliders though
18
+ # just use one for now
19
+ process:
20
+ - type: slider # tells runner to run the slider process
21
+ # network is the LoRA network for a slider, I recommend to leave this be
22
+ network:
23
+ # network type lierla is traditional LoRA that works everywhere, only linear layers
24
+ type: "lierla"
25
+ # rank / dim of the network. Bigger is not always better. Especially for sliders. 8 is good
26
+ linear: 8
27
+ linear_alpha: 4 # Do about half of rank
28
+ # training config
29
+ train:
30
+ # this is also used in sampling. Stick with ddpm unless you know what you are doing
31
+ noise_scheduler: "ddpm" # "ddpm", "lms", or "euler_a"
32
+ # how many steps to train. More is not always better. I rarely go over 1000
33
+ steps: 500
34
+ # I have had good results with 4e-4 to 1e-4 at 500 steps
35
+ lr: 2e-4
36
+ # enables gradient checkpoint, saves vram, leave it on
37
+ gradient_checkpointing: true
38
+ # train the unet. I recommend leaving this true
39
+ train_unet: true
40
+ # train the text encoder. I don't recommend this unless you have a special use case
41
+ # for sliders we are adjusting representation of the concept (unet),
42
+ # not the description of it (text encoder)
43
+ train_text_encoder: false
44
+ # same as from sd-scripts, not fully tested but should speed up training
45
+ min_snr_gamma: 5.0
46
+ # just leave unless you know what you are doing
47
+ # also supports "dadaptation" but set lr to 1 if you use that,
48
+ # but it learns too fast and I don't recommend it
49
+ optimizer: "adamw"
50
+ # only constant for now
51
+ lr_scheduler: "constant"
52
+ # we randomly denoise a random num of steps from 1 to this number
53
+ # while training. Just leave it
54
+ max_denoising_steps: 40
55
+ # works great at 1. I do 1 even with my 4090.
56
+ # higher may not work right with newer single batch stacking code anyway
57
+ batch_size: 1
58
+ # bf16 works best if your GPU supports it (modern)
59
+ dtype: bf16 # fp32, bf16, fp16
60
+ # if you have it, use it. It is faster and better
61
+ # torch 2.0 doesn't need xformers anymore, only use it if you are on a lower version
62
+ # xformers: true
63
+ # I don't recommend using unless you are trying to make a darker lora. Then do 0.1 MAX
64
+ # although, the way we train sliders is comparative, so it probably won't work anyway
65
+ noise_offset: 0.0
66
+ # noise_offset: 0.0357 # SDXL was trained with offset of 0.0357. So use that when training on SDXL
67
+
68
+ # the model to train the LoRA network on
69
+ model:
70
+ # huggingface name, path relative to the project root, or absolute path to .safetensors or .ckpt
71
+ name_or_path: "runwayml/stable-diffusion-v1-5"
72
+ is_v2: false # for v2 models
73
+ is_v_pred: false # for v-prediction models (most v2 models)
74
+ # has some issues with the dual text encoder and the way we train sliders
75
+ # it works, but weights probably need to be higher to see the effect.
76
+ is_xl: false # for SDXL models
77
+
78
+ # saving config
79
+ save:
80
+ dtype: float16 # precision to save. I recommend float16
81
+ save_every: 50 # save every this many steps
82
+ # this will remove step counts more than this number
83
+ # allows you to save more often in case of a crash without filling up your drive
84
+ max_step_saves_to_keep: 2
85
+
86
+ # sampling config
87
+ sample:
88
+ # must match train.noise_scheduler, this is not used here
89
+ # but may be in future and in other processes
90
+ sampler: "ddpm"
91
+ # sample every this many steps
92
+ sample_every: 20
93
+ # image size
94
+ width: 512
95
+ height: 512
96
+ # prompts to use for sampling. Do as many as you want, but it slows down training
97
+ # pick ones that will best represent the concept you are trying to adjust
98
+ # allows some flags after the prompt
99
+ # --m [number] # network multiplier. LoRA weight. -3 for the negative slide, 3 for the positive
100
+ # slide are good tests. will inherit sample.network_multiplier if not set
101
+ # --n [string] # negative prompt, will inherit sample.neg if not set
102
+ # Only 75 tokens allowed currently
103
+ # I like to do a wide positive and negative spread so I can see a good range and stop
104
+ # early if the network is breaking down
105
+ prompts:
106
+ - "a woman in a coffee shop, black hat, blonde hair, blue jacket --m -5"
107
+ - "a woman in a coffee shop, black hat, blonde hair, blue jacket --m -3"
108
+ - "a woman in a coffee shop, black hat, blonde hair, blue jacket --m 3"
109
+ - "a woman in a coffee shop, black hat, blonde hair, blue jacket --m 5"
110
+ - "a golden retriever sitting on a leather couch, --m -5"
111
+ - "a golden retriever sitting on a leather couch --m -3"
112
+ - "a golden retriever sitting on a leather couch --m 3"
113
+ - "a golden retriever sitting on a leather couch --m 5"
114
+ - "a man with a beard and red flannel shirt, wearing vr goggles, walking into traffic --m -5"
115
+ - "a man with a beard and red flannel shirt, wearing vr goggles, walking into traffic --m -3"
116
+ - "a man with a beard and red flannel shirt, wearing vr goggles, walking into traffic --m 3"
117
+ - "a man with a beard and red flannel shirt, wearing vr goggles, walking into traffic --m 5"
118
+ # negative prompt used on all prompts above as default if they don't have one
119
+ neg: "cartoon, fake, drawing, illustration, cgi, animated, anime, monochrome"
120
+ # seed for sampling. 42 is the answer for everything
121
+ seed: 42
122
+ # walks the seed so s1 is 42, s2 is 43, s3 is 44, etc
123
+ # will start over on next sample_every so s1 is always seed
124
+ # works well if you use same prompt but want different results
125
+ walk_seed: false
126
+ # cfg scale (4 to 10 is good)
127
+ guidance_scale: 7
128
+ # sampler steps (20 to 30 is good)
129
+ sample_steps: 20
130
+ # default network multiplier for all prompts
131
+ # since we are training a slider, I recommend overriding this with --m [number]
132
+ # in the prompts above to get both sides of the slider
133
+ network_multiplier: 1.0
134
+
135
+ # logging information
136
+ logging:
137
+ log_every: 10 # log every this many steps
138
+ use_wandb: false # not supported yet
139
+ verbose: false # probably don't need this unless you are debugging
140
+
141
+ # slider training config, best for last
142
+ slider:
143
+ # resolutions to train on. [ width, height ]. This is less important for sliders
144
+ # as we are not teaching the model anything it doesn't already know
145
+ # but must be a size it understands [ 512, 512 ] for sd_v1.5 and [ 768, 768 ] for sd_v2.1
146
+ # and [ 1024, 1024 ] for sd_xl
147
+ # you can do as many as you want here
148
+ resolutions:
149
+ - [ 512, 512 ]
150
+ # - [ 512, 768 ]
151
+ # - [ 768, 768 ]
152
+ # slider training uses 4 combined steps for a single round. This will do it in one gradient
153
+ # step. It is highly optimized and shouldn't take anymore vram than doing without it,
154
+ # since we break down batches for gradient accumulation now. so just leave it on.
155
+ batch_full_slide: true
156
+ # These are the concepts to train on. You can do as many as you want here,
157
+ # but they can conflict with or outweigh each other. Other than experimenting, I recommend
158
+ # just doing one for good results
159
+ targets:
160
+ # target_class is the base concept we are adjusting the representation of
161
+ # for example, if we are adjusting the representation of a person, we would use "person"
162
+ # if we are adjusting the representation of a cat, we would use "cat" It is not
163
+ # a keyword necessarily but what the model understands the concept to represent.
164
+ # "person" will affect men, women, children, etc but will not affect cats, dogs, etc
165
+ # it is the models base general understanding of the concept and everything it represents
166
+ # you can leave it blank to affect everything. In this example, we are adjusting
167
+ # detail, so we will leave it blank to affect everything
168
+ - target_class: ""
169
+ # positive is the prompt for the positive side of the slider.
170
+ # It is the concept that will be excited and amplified in the model when we slide the slider
171
+ # to the positive side and forgotten / inverted when we slide
172
+ # the slider to the negative side. It is generally best to include the target_class in
173
+ # the prompt. You want it to be the extreme of what you want to train on. For example,
174
+ # if you want to train on fat people, you would use "an extremely fat, morbidly obese person"
175
+ # as the prompt. Not just "fat person"
176
+ # max 75 tokens for now
177
+ positive: "high detail, 8k, intricate, detailed, high resolution, high res, high quality"
178
+ # negative is the prompt for the negative side of the slider and works the same as positive
179
+ # it does not necessarily work the same as a negative prompt when generating images
180
+ # these need to be polar opposites.
181
+ # max 76 tokens for now
182
+ negative: "blurry, boring, fuzzy, low detail, low resolution, low res, low quality"
183
+ # the loss for this target is multiplied by this number.
184
+ # if you are doing more than one target it may be good to set less important ones
185
+ # to a lower number like 0.1 so they don't outweigh the primary target
186
+ weight: 1.0
187
+ # shuffle the prompts split by the comma. We will run every combination randomly
188
+ # this will make the LoRA more robust. You probably want this on unless prompt order
189
+ # is important for some reason
190
+ shuffle: true
191
+
192
+
193
+ # anchors are prompts that we will try to hold on to while training the slider
194
+ # these are NOT necessary and can prevent the slider from converging if not done right
195
+ # leave them off if you are having issues, but they can help lock the network
196
+ # on certain concepts to help prevent catastrophic forgetting
197
+ # you want these to generate an image that is not your target_class, but close to it
198
+ # is fine as long as it does not directly overlap it.
199
+ # For example, if you are training on a person smiling,
200
+ # you could use "a person with a face mask" as an anchor. It is a person, the image is the same
201
+ # regardless if they are smiling or not, however, the closer the concept is to the target_class
202
+ # the less the multiplier needs to be. Keep multipliers less than 1.0 for anchors usually
203
+ # for close concepts, you want to be closer to 0.1 or 0.2
204
+ # these will slow down training. I am leaving them off for the demo
205
+
206
+ # anchors:
207
+ # - prompt: "a woman"
208
+ # neg_prompt: "animal"
209
+ # # the multiplier applied to the LoRA when this is run.
210
+ # # higher will give it more weight but also help keep the lora from collapsing
211
+ # multiplier: 1.0
212
+ # - prompt: "a man"
213
+ # neg_prompt: "animal"
214
+ # multiplier: 1.0
215
+ # - prompt: "a person"
216
+ # neg_prompt: "animal"
217
+ # multiplier: 1.0
218
+
219
+ # You can put any information you want here, and it will be saved in the model.
220
+ # The below is an example, but you can put your grocery list in it if you want.
221
+ # It is saved in the model so be aware of that. The software will include this
222
+ # plus some other information for you automatically
223
+ meta:
224
+ # [name] gets replaced with the name above
225
+ name: "[name]"
226
+ # version: '1.0'
227
+ # creator:
228
+ # name: Your Name
229
+ # email: [email protected]
230
+ # website: https://your.website
docker-compose.yml ADDED
@@ -0,0 +1,25 @@
1
+ version: "3.8"
2
+
3
+ services:
4
+ ai-toolkit:
5
+ image: ostris/aitoolkit:latest
6
+ restart: unless-stopped
7
+ ports:
8
+ - "8675:8675"
9
+ volumes:
10
+ - ~/.cache/huggingface/hub:/root/.cache/huggingface/hub
11
+ - ./aitk_db.db:/app/ai-toolkit/aitk_db.db
12
+ - ./datasets:/app/ai-toolkit/datasets
13
+ - ./output:/app/ai-toolkit/output
14
+ - ./config:/app/ai-toolkit/config
15
+ environment:
16
+ - AI_TOOLKIT_AUTH=${AI_TOOLKIT_AUTH:-password}
17
+ - NODE_ENV=production
18
+ - TZ=UTC
19
+ deploy:
20
+ resources:
21
+ reservations:
22
+ devices:
23
+ - driver: nvidia
24
+ count: all
25
+ capabilities: [gpu]
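Note: a minimal sketch of bringing this stack up, assuming Docker with the NVIDIA container toolkit installed and this docker-compose.yml in the current directory; the AI_TOOLKIT_AUTH variable and the 8675 port mapping come from the file above:

    # set the UI password and start in the background
    AI_TOOLKIT_AUTH=choose-a-strong-password docker compose up -d
    # the UI should then be reachable at http://localhost:8675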
docker/Dockerfile ADDED
@@ -0,0 +1,83 @@
1
+ FROM nvidia/cuda:12.8.1-devel-ubuntu22.04
2
+
3
+ LABEL authors="jaret"
4
+
5
+ # Set noninteractive to avoid timezone prompts
6
+ ENV DEBIAN_FRONTEND=noninteractive
7
+
8
+ # ref https://en.wikipedia.org/wiki/CUDA
9
+ ENV TORCH_CUDA_ARCH_LIST="8.0 8.6 8.9 9.0 10.0 12.0"
10
+
11
+ # Install dependencies
12
+ RUN apt-get update && apt-get install --no-install-recommends -y \
13
+ git \
14
+ curl \
15
+ build-essential \
16
+ cmake \
17
+ wget \
18
+ python3.10 \
19
+ python3-pip \
20
+ python3-dev \
21
+ python3-setuptools \
22
+ python3-wheel \
23
+ python3-venv \
24
+ ffmpeg \
25
+ tmux \
26
+ htop \
27
+ nvtop \
28
+ python3-opencv \
29
+ openssh-client \
30
+ openssh-server \
31
+ openssl \
32
+ rsync \
33
+ unzip \
34
+ && apt-get clean \
35
+ && rm -rf /var/lib/apt/lists/*
36
+
37
+ # Install nodejs
38
+ WORKDIR /tmp
39
+ RUN curl -sL https://deb.nodesource.com/setup_23.x -o nodesource_setup.sh && \
40
+ bash nodesource_setup.sh && \
41
+ apt-get update && \
42
+ apt-get install -y nodejs && \
43
+ apt-get clean && \
44
+ rm -rf /var/lib/apt/lists/*
45
+
46
+ WORKDIR /app
47
+
48
+ # Set aliases for python and pip
49
+ RUN ln -s /usr/bin/python3 /usr/bin/python
50
+
51
+ # install pytorch before cache bust to avoid redownloading pytorch
52
+ RUN pip install --pre --no-cache-dir torch torchvision torchaudio --index-url https://download.pytorch.org/whl/nightly/cu128
53
+
54
+ # Fix cache busting by moving CACHEBUST to right before git clone
55
+ ARG CACHEBUST=1234
56
+ ARG GIT_COMMIT=main
57
+ RUN echo "Cache bust: ${CACHEBUST}" && \
58
+ git clone https://github.com/ostris/ai-toolkit.git && \
59
+ cd ai-toolkit && \
60
+ git checkout ${GIT_COMMIT}
61
+
62
+ WORKDIR /app/ai-toolkit
63
+
64
+ # Install Python dependencies
65
+ RUN pip install --no-cache-dir -r requirements.txt && \
66
+ pip install --pre --no-cache-dir torch torchvision torchaudio --index-url https://download.pytorch.org/whl/nightly/cu128 --force && \
67
+ pip install setuptools==69.5.1 --no-cache-dir
68
+
69
+ # Build UI
70
+ WORKDIR /app/ai-toolkit/ui
71
+ RUN npm install && \
72
+ npm run build && \
73
+ npm run update_db
74
+
75
+ # Expose port (assuming the application runs on port 3000)
76
+ EXPOSE 8675
77
+
78
+ WORKDIR /
79
+
80
+ COPY docker/start.sh /start.sh
81
+ RUN chmod +x /start.sh
82
+
83
+ CMD ["/start.sh"]
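Note: a hedged example of building this image from the repository root; the CACHEBUST and GIT_COMMIT build args above control when the git clone layer is invalidated and which commit gets checked out (the tag is illustrative):

    docker build -f docker/Dockerfile \
      --build-arg CACHEBUST=$(date +%s) \
      --build-arg GIT_COMMIT=main \
      -t ostris/aitoolkit:latest .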
docker/start.sh ADDED
@@ -0,0 +1,70 @@
1
+ #!/bin/bash
2
+ set -e # Exit the script if any statement returns a non-true return value
3
+
4
+ # ref https://github.com/runpod/containers/blob/main/container-template/start.sh
5
+
6
+ # ---------------------------------------------------------------------------- #
7
+ # Function Definitions #
8
+ # ---------------------------------------------------------------------------- #
9
+
10
+
11
+ # Setup ssh
12
+ setup_ssh() {
13
+ if [[ $PUBLIC_KEY ]]; then
14
+ echo "Setting up SSH..."
15
+ mkdir -p ~/.ssh
16
+ echo "$PUBLIC_KEY" >> ~/.ssh/authorized_keys
17
+ chmod 700 -R ~/.ssh
18
+
19
+ if [ ! -f /etc/ssh/ssh_host_rsa_key ]; then
20
+ ssh-keygen -t rsa -f /etc/ssh/ssh_host_rsa_key -q -N ''
21
+ echo "RSA key fingerprint:"
22
+ ssh-keygen -lf /etc/ssh/ssh_host_rsa_key.pub
23
+ fi
24
+
25
+ if [ ! -f /etc/ssh/ssh_host_dsa_key ]; then
26
+ ssh-keygen -t dsa -f /etc/ssh/ssh_host_dsa_key -q -N ''
27
+ echo "DSA key fingerprint:"
28
+ ssh-keygen -lf /etc/ssh/ssh_host_dsa_key.pub
29
+ fi
30
+
31
+ if [ ! -f /etc/ssh/ssh_host_ecdsa_key ]; then
32
+ ssh-keygen -t ecdsa -f /etc/ssh/ssh_host_ecdsa_key -q -N ''
33
+ echo "ECDSA key fingerprint:"
34
+ ssh-keygen -lf /etc/ssh/ssh_host_ecdsa_key.pub
35
+ fi
36
+
37
+ if [ ! -f /etc/ssh/ssh_host_ed25519_key ]; then
38
+ ssh-keygen -t ed25519 -f /etc/ssh/ssh_host_ed25519_key -q -N ''
39
+ echo "ED25519 key fingerprint:"
40
+ ssh-keygen -lf /etc/ssh/ssh_host_ed25519_key.pub
41
+ fi
42
+
43
+ service ssh start
44
+
45
+ echo "SSH host keys:"
46
+ for key in /etc/ssh/*.pub; do
47
+ echo "Key: $key"
48
+ ssh-keygen -lf $key
49
+ done
50
+ fi
51
+ }
52
+
53
+ # Export env vars
54
+ export_env_vars() {
55
+ echo "Exporting environment variables..."
56
+ printenv | grep -E '^RUNPOD_|^PATH=|^_=' | awk -F = '{ print "export " $1 "=\"" $2 "\"" }' >> /etc/rp_environment
57
+ echo 'source /etc/rp_environment' >> ~/.bashrc
58
+ }
59
+
60
+ # ---------------------------------------------------------------------------- #
61
+ # Main Program #
62
+ # ---------------------------------------------------------------------------- #
63
+
64
+
65
+ echo "Pod Started"
66
+
67
+ setup_ssh
68
+ export_env_vars
69
+ echo "Starting AI Toolkit UI..."
70
+ cd /app/ai-toolkit/ui && npm run start
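Note: the SSH setup above only runs when a PUBLIC_KEY environment variable is present, so a sketch of launching the container with SSH access enabled might look like this (the port mapping and image tag are assumptions):

    docker run -d --gpus all \
      -p 8675:8675 -p 2222:22 \
      -e PUBLIC_KEY="$(cat ~/.ssh/id_ed25519.pub)" \
      ostris/aitoolkit:latest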
extensions/example/ExampleMergeModels.py ADDED
@@ -0,0 +1,129 @@
1
+ import torch
2
+ import gc
3
+ from collections import OrderedDict
4
+ from typing import TYPE_CHECKING
5
+ from jobs.process import BaseExtensionProcess
6
+ from toolkit.config_modules import ModelConfig
7
+ from toolkit.stable_diffusion_model import StableDiffusion
8
+ from toolkit.train_tools import get_torch_dtype
9
+ from tqdm import tqdm
10
+
11
+ # Type check imports. Prevents circular imports
12
+ if TYPE_CHECKING:
13
+ from jobs import ExtensionJob
14
+
15
+
16
+ # extend standard config classes to add weight
17
+ class ModelInputConfig(ModelConfig):
18
+ def __init__(self, **kwargs):
19
+ super().__init__(**kwargs)
20
+ self.weight = kwargs.get('weight', 1.0)
21
+ # overwrite default dtype unless user specifies otherwise
22
+ # float 32 will give up better precision on the merging functions
23
+ self.dtype: str = kwargs.get('dtype', 'float32')
24
+
25
+
26
+ def flush():
27
+ torch.cuda.empty_cache()
28
+ gc.collect()
29
+
30
+
31
+ # this is our main class process
32
+ class ExampleMergeModels(BaseExtensionProcess):
33
+ def __init__(
34
+ self,
35
+ process_id: int,
36
+ job: 'ExtensionJob',
37
+ config: OrderedDict
38
+ ):
39
+ super().__init__(process_id, job, config)
40
+ # this is the setup process, do not do process intensive stuff here, just variable setup and
41
+ # checking requirements. This is called before the run() function
42
+ # no loading models or anything like that, it is just for setting up the process
43
+ # all of your process intensive stuff should be done in the run() function
44
+ # config will have everything from the process item in the config file
45
+
46
+ # convenience methods exist on BaseProcess to get config values
47
+ # if required is set to true and the value is not found it will throw an error
48
+ # you can pass a default value to get_conf() as well if it was not in the config file
49
+ # as well as a type to cast the value to
50
+ self.save_path = self.get_conf('save_path', required=True)
51
+ self.save_dtype = self.get_conf('save_dtype', default='float16', as_type=get_torch_dtype)
52
+ self.device = self.get_conf('device', default='cpu', as_type=torch.device)
53
+
54
+ # build models to merge list
55
+ models_to_merge = self.get_conf('models_to_merge', required=True, as_type=list)
56
+ # build list of ModelInputConfig objects. I find it is a good idea to make a class for each config
57
+ # this way you can add methods to it and it is easier to read and code. There are a lot of
58
+ # inbuilt config classes located in toolkit.config_modules as well
59
+ self.models_to_merge = [ModelInputConfig(**model) for model in models_to_merge]
60
+ # setup is complete. Don't load anything else here, just setup variables and stuff
61
+
62
+ # this is the entire run process be sure to call super().run() first
63
+ def run(self):
64
+ # always call first
65
+ super().run()
66
+ print(f"Running process: {self.__class__.__name__}")
67
+
68
+ # let's adjust our weights first to normalize them so the total is 1.0
69
+ total_weight = sum([model.weight for model in self.models_to_merge])
70
+ weight_adjust = 1.0 / total_weight
71
+ for model in self.models_to_merge:
72
+ model.weight *= weight_adjust
73
+
74
+ output_model: StableDiffusion = None
75
+ # let's do the merge, it is a good idea to use tqdm to show progress
76
+ for model_config in tqdm(self.models_to_merge, desc="Merging models"):
77
+ # setup model class with our helper class
78
+ sd_model = StableDiffusion(
79
+ device=self.device,
80
+ model_config=model_config,
81
+ dtype="float32"
82
+ )
83
+ # load the model
84
+ sd_model.load_model()
85
+
86
+ # adjust the weight of the text encoder
87
+ if isinstance(sd_model.text_encoder, list):
88
+ # sdxl model
89
+ for text_encoder in sd_model.text_encoder:
90
+ for key, value in text_encoder.state_dict().items():
91
+ value *= model_config.weight
92
+ else:
93
+ # normal model
94
+ for key, value in sd_model.text_encoder.state_dict().items():
95
+ value *= model_config.weight
96
+ # adjust the weights of the unet
97
+ for key, value in sd_model.unet.state_dict().items():
98
+ value *= model_config.weight
99
+
100
+ if output_model is None:
101
+ # use this one as the base
102
+ output_model = sd_model
103
+ else:
104
+ # merge the models
105
+ # text encoder
106
+ if isinstance(output_model.text_encoder, list):
107
+ # sdxl model
108
+ for i, text_encoder in enumerate(output_model.text_encoder):
109
+ for key, value in text_encoder.state_dict().items():
110
+ value += sd_model.text_encoder[i].state_dict()[key]
111
+ else:
112
+ # normal model
113
+ for key, value in output_model.text_encoder.state_dict().items():
114
+ value += sd_model.text_encoder.state_dict()[key]
115
+ # unet
116
+ for key, value in output_model.unet.state_dict().items():
117
+ value += sd_model.unet.state_dict()[key]
118
+
119
+ # remove the model to free memory
120
+ del sd_model
121
+ flush()
122
+
123
+ # merge loop is done, let's save the model
124
+ print(f"Saving merged model to {self.save_path}")
125
+ output_model.save(self.save_path, meta=self.meta, save_dtype=self.save_dtype)
126
+ print(f"Saved merged model to {self.save_path}")
127
+ # do cleanup here
128
+ del output_model
129
+ flush()
extensions/example/__init__.py ADDED
@@ -0,0 +1,25 @@
1
+ # This is an example extension for custom training. It is great for experimenting with new ideas.
2
+ from toolkit.extension import Extension
3
+
4
+
5
+ # We make a subclass of Extension
6
+ class ExampleMergeExtension(Extension):
7
+ # uid must be unique, it is how the extension is identified
8
+ uid = "example_merge_extension"
9
+
10
+ # name is the name of the extension for printing
11
+ name = "Example Merge Extension"
12
+
13
+ # This is where your process class is loaded
14
+ # keep your imports in here so they don't slow down the rest of the program
15
+ @classmethod
16
+ def get_process(cls):
17
+ # import your process class here so it is only loaded when needed and return it
18
+ from .ExampleMergeModels import ExampleMergeModels
19
+ return ExampleMergeModels
20
+
21
+
22
+ AI_TOOLKIT_EXTENSIONS = [
23
+ # you can put a list of extensions here
24
+ ExampleMergeExtension
25
+ ]
extensions/example/config/config.example.yaml ADDED
@@ -0,0 +1,48 @@
+ ---
+ # Always include at least one example config file to show how to use your extension.
+ # use plenty of comments so users know how to use it and what everything does
+ 
+ # all extensions will use this job name
+ job: extension
+ config:
+   name: 'my_awesome_merge'
+   process:
+     # Put your example processes here. This will be passed
+     # to your extension process in the config argument.
+     # the type MUST match your extension uid
+     - type: "example_merge_extension"
+       # save path for the merged model
+       save_path: "output/merge/[name].safetensors"
+       # save type
+       dtype: fp16
+       # device to run it on
+       device: cuda:0
+       # input models can only be SD1.x and SD2.x models for this example (currently)
+       models_to_merge:
+         # weights are relative, total weights will be normalized
+         # for example. If you have 2 models with weight 1.0, they will
+         # both be weighted 0.5. If you have 1 model with weight 1.0 and
+         # another with weight 2.0, the first will be weighted 1/3 and the
+         # second will be weighted 2/3
+         - name_or_path: "input/model1.safetensors"
+           weight: 1.0
+         - name_or_path: "input/model2.safetensors"
+           weight: 1.0
+         - name_or_path: "input/model3.safetensors"
+           weight: 0.3
+         - name_or_path: "input/model4.safetensors"
+           weight: 1.0
+ 
+ 
+ # you can put any information you want here, and it will be saved in the model
+ # the below is an example. I recommend doing trigger words at a minimum
+ # in the metadata. The software will include this plus some other information
+ meta:
+   name: "[name]" # [name] gets replaced with the name above
+   description: A short description of your model
+   version: '0.1'
+   creator:
+     name: Your Name
+     email: [email protected]
+     website: https://yourwebsite.com
+   any: All meta data above is arbitrary, it can be whatever you want.
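Note: the relative-weight behavior described in the models_to_merge comments is plain normalization; a quick sketch of the arithmetic for the example weights above (illustrative only):

    weights = [1.0, 1.0, 0.3, 1.0]             # as listed in the config
    total = sum(weights)                        # 3.3
    normalized = [w / total for w in weights]   # ~[0.303, 0.303, 0.091, 0.303], sums to 1.0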
extensions_built_in/.DS_Store ADDED
Binary file (8.2 kB).
extensions_built_in/advanced_generator/Img2ImgGenerator.py ADDED
@@ -0,0 +1,256 @@
1
+ import math
2
+ import os
3
+ import random
4
+ from collections import OrderedDict
5
+ from typing import List
6
+
7
+ import numpy as np
8
+ from PIL import Image
9
+ from diffusers import T2IAdapter
10
+ from diffusers.utils.torch_utils import randn_tensor
11
+ from torch.utils.data import DataLoader
12
+ from diffusers import StableDiffusionXLImg2ImgPipeline, PixArtSigmaPipeline
13
+ from tqdm import tqdm
14
+
15
+ from toolkit.config_modules import ModelConfig, GenerateImageConfig, preprocess_dataset_raw_config, DatasetConfig
16
+ from toolkit.data_transfer_object.data_loader import FileItemDTO, DataLoaderBatchDTO
17
+ from toolkit.sampler import get_sampler
18
+ from toolkit.stable_diffusion_model import StableDiffusion
19
+ import gc
20
+ import torch
21
+ from jobs.process import BaseExtensionProcess
22
+ from toolkit.data_loader import get_dataloader_from_datasets
23
+ from toolkit.train_tools import get_torch_dtype
24
+ from controlnet_aux.midas import MidasDetector
25
+ from diffusers.utils import load_image
26
+ from torchvision.transforms import ToTensor
27
+
28
+
29
+ def flush():
30
+ torch.cuda.empty_cache()
31
+ gc.collect()
32
+
33
+
34
+
35
+
36
+
37
+ class GenerateConfig:
38
+
39
+ def __init__(self, **kwargs):
40
+ self.prompts: List[str]
41
+ self.sampler = kwargs.get('sampler', 'ddpm')
42
+ self.neg = kwargs.get('neg', '')
43
+ self.seed = kwargs.get('seed', -1)
44
+ self.walk_seed = kwargs.get('walk_seed', False)
45
+ self.guidance_scale = kwargs.get('guidance_scale', 7)
46
+ self.sample_steps = kwargs.get('sample_steps', 20)
47
+ self.guidance_rescale = kwargs.get('guidance_rescale', 0.0)
48
+ self.ext = kwargs.get('ext', 'png')
49
+ self.denoise_strength = kwargs.get('denoise_strength', 0.5)
50
+ self.trigger_word = kwargs.get('trigger_word', None)
51
+
52
+
53
+ class Img2ImgGenerator(BaseExtensionProcess):
54
+
55
+ def __init__(self, process_id: int, job, config: OrderedDict):
56
+ super().__init__(process_id, job, config)
57
+ self.output_folder = self.get_conf('output_folder', required=True)
58
+ self.copy_inputs_to = self.get_conf('copy_inputs_to', None)
59
+ self.device = self.get_conf('device', 'cuda')
60
+ self.model_config = ModelConfig(**self.get_conf('model', required=True))
61
+ self.generate_config = GenerateConfig(**self.get_conf('generate', required=True))
62
+ self.is_latents_cached = True
63
+ raw_datasets = self.get_conf('datasets', None)
64
+ if raw_datasets is not None and len(raw_datasets) > 0:
65
+ raw_datasets = preprocess_dataset_raw_config(raw_datasets)
66
+ self.datasets = None
67
+ self.datasets_reg = None
68
+ self.dtype = self.get_conf('dtype', 'float16')
69
+ self.torch_dtype = get_torch_dtype(self.dtype)
70
+ self.params = []
71
+ if raw_datasets is not None and len(raw_datasets) > 0:
72
+ for raw_dataset in raw_datasets:
73
+ dataset = DatasetConfig(**raw_dataset)
74
+ is_caching = dataset.cache_latents or dataset.cache_latents_to_disk
75
+ if not is_caching:
76
+ self.is_latents_cached = False
77
+ if dataset.is_reg:
78
+ if self.datasets_reg is None:
79
+ self.datasets_reg = []
80
+ self.datasets_reg.append(dataset)
81
+ else:
82
+ if self.datasets is None:
83
+ self.datasets = []
84
+ self.datasets.append(dataset)
85
+
86
+ self.progress_bar = None
87
+ self.sd = StableDiffusion(
88
+ device=self.device,
89
+ model_config=self.model_config,
90
+ dtype=self.dtype,
91
+ )
92
+ print(f"Using device {self.device}")
93
+ self.data_loader: DataLoader = None
94
+ self.adapter: T2IAdapter = None
95
+
96
+ def to_pil(self, img):
97
+ # image comes in -1 to 1. convert to a PIL RGB image
98
+ img = (img + 1) / 2
99
+ img = img.clamp(0, 1)
100
+ img = img[0].permute(1, 2, 0).cpu().numpy()
101
+ img = (img * 255).astype(np.uint8)
102
+ image = Image.fromarray(img)
103
+ return image
104
+
105
+ def run(self):
106
+ with torch.no_grad():
107
+ super().run()
108
+ print("Loading model...")
109
+ self.sd.load_model()
110
+ device = torch.device(self.device)
111
+
112
+ if self.model_config.is_xl:
113
+ pipe = StableDiffusionXLImg2ImgPipeline(
114
+ vae=self.sd.vae,
115
+ unet=self.sd.unet,
116
+ text_encoder=self.sd.text_encoder[0],
117
+ text_encoder_2=self.sd.text_encoder[1],
118
+ tokenizer=self.sd.tokenizer[0],
119
+ tokenizer_2=self.sd.tokenizer[1],
120
+ scheduler=get_sampler(self.generate_config.sampler),
121
+ ).to(device, dtype=self.torch_dtype)
122
+ elif self.model_config.is_pixart:
123
+ pipe = self.sd.pipeline.to(device, dtype=self.torch_dtype)
124
+ else:
125
+ raise NotImplementedError("Only SDXL and PixArt models are supported")
126
+ pipe.set_progress_bar_config(disable=True)
127
+
128
+ # pipe.unet = torch.compile(pipe.unet, mode="reduce-overhead", fullgraph=True)
129
+ # midas_depth = torch.compile(midas_depth, mode="reduce-overhead", fullgraph=True)
130
+
131
+ self.data_loader = get_dataloader_from_datasets(self.datasets, 1, self.sd)
132
+
133
+ num_batches = len(self.data_loader)
134
+ pbar = tqdm(total=num_batches, desc="Generating images")
135
+ seed = self.generate_config.seed
136
+ # load images from datasets, use tqdm
137
+ for i, batch in enumerate(self.data_loader):
138
+ batch: DataLoaderBatchDTO = batch
139
+
140
+ gen_seed = seed if seed > 0 else random.randint(0, 2 ** 32 - 1)
141
+ generator = torch.manual_seed(gen_seed)
142
+
143
+ file_item: FileItemDTO = batch.file_items[0]
144
+ img_path = file_item.path
145
+ img_filename = os.path.basename(img_path)
146
+ img_filename_no_ext = os.path.splitext(img_filename)[0]
147
+ img_filename = img_filename_no_ext + '.' + self.generate_config.ext
148
+ output_path = os.path.join(self.output_folder, img_filename)
149
+ output_caption_path = os.path.join(self.output_folder, img_filename_no_ext + '.txt')
150
+
151
+ if self.copy_inputs_to is not None:
152
+ output_inputs_path = os.path.join(self.copy_inputs_to, img_filename)
153
+ output_inputs_caption_path = os.path.join(self.copy_inputs_to, img_filename_no_ext + '.txt')
154
+ else:
155
+ output_inputs_path = None
156
+ output_inputs_caption_path = None
157
+
158
+ caption = batch.get_caption_list()[0]
159
+ if self.generate_config.trigger_word is not None:
160
+ caption = caption.replace('[trigger]', self.generate_config.trigger_word)
161
+
162
+ img: torch.Tensor = batch.tensor.clone()
163
+ image = self.to_pil(img)
164
+
165
+ # image.save(output_depth_path)
166
+ if self.model_config.is_pixart:
167
+ pipe: PixArtSigmaPipeline = pipe
168
+
169
+ # Encode the full image once
170
+ encoded_image = pipe.vae.encode(
171
+ pipe.image_processor.preprocess(image).to(device=pipe.device, dtype=pipe.dtype))
172
+ if hasattr(encoded_image, "latent_dist"):
173
+ latents = encoded_image.latent_dist.sample(generator)
174
+ elif hasattr(encoded_image, "latents"):
175
+ latents = encoded_image.latents
176
+ else:
177
+ raise AttributeError("Could not access latents of provided encoder_output")
178
+ latents = pipe.vae.config.scaling_factor * latents
179
+
180
+ # latents = self.sd.encode_images(img)
181
+
182
+ # self.sd.noise_scheduler.set_timesteps(self.generate_config.sample_steps)
183
+ # start_step = math.floor(self.generate_config.sample_steps * self.generate_config.denoise_strength)
184
+ # timestep = self.sd.noise_scheduler.timesteps[start_step].unsqueeze(0)
185
+ # timestep = timestep.to(device, dtype=torch.int32)
186
+ # latent = latent.to(device, dtype=self.torch_dtype)
187
+ # noise = torch.randn_like(latent, device=device, dtype=self.torch_dtype)
188
+ # latent = self.sd.add_noise(latent, noise, timestep)
189
+ # timesteps_to_use = self.sd.noise_scheduler.timesteps[start_step + 1:]
190
+ batch_size = 1
191
+ num_images_per_prompt = 1
192
+
193
+ shape = (batch_size, pipe.transformer.config.in_channels, image.height // pipe.vae_scale_factor,
194
+ image.width // pipe.vae_scale_factor)
195
+ noise = randn_tensor(shape, generator=generator, device=pipe.device, dtype=pipe.dtype)
196
+
197
+ # noise = torch.randn_like(latents, device=device, dtype=self.torch_dtype)
198
+ num_inference_steps = self.generate_config.sample_steps
199
+ strength = self.generate_config.denoise_strength
200
+ # Get timesteps
201
+ init_timestep = min(int(num_inference_steps * strength), num_inference_steps)
202
+ t_start = max(num_inference_steps - init_timestep, 0)
203
+ pipe.scheduler.set_timesteps(num_inference_steps, device="cpu")
204
+ timesteps = pipe.scheduler.timesteps[t_start:]
205
+ timestep = timesteps[:1].repeat(batch_size * num_images_per_prompt)
206
+ latents = pipe.scheduler.add_noise(latents, noise, timestep)
207
+
208
+ gen_images = pipe.__call__(
209
+ prompt=caption,
210
+ negative_prompt=self.generate_config.neg,
211
+ latents=latents,
212
+ timesteps=timesteps,
213
+ width=image.width,
214
+ height=image.height,
215
+ num_inference_steps=num_inference_steps,
216
+ num_images_per_prompt=num_images_per_prompt,
217
+ guidance_scale=self.generate_config.guidance_scale,
218
+ # strength=self.generate_config.denoise_strength,
219
+ use_resolution_binning=False,
220
+ output_type="np"
221
+ ).images[0]
222
+ gen_images = (gen_images * 255).clip(0, 255).astype(np.uint8)
223
+ gen_images = Image.fromarray(gen_images)
224
+ else:
225
+ pipe: StableDiffusionXLImg2ImgPipeline = pipe
226
+
227
+ gen_images = pipe.__call__(
228
+ prompt=caption,
229
+ negative_prompt=self.generate_config.neg,
230
+ image=image,
231
+ num_inference_steps=self.generate_config.sample_steps,
232
+ guidance_scale=self.generate_config.guidance_scale,
233
+ strength=self.generate_config.denoise_strength,
234
+ ).images[0]
235
+ os.makedirs(os.path.dirname(output_path), exist_ok=True)
236
+ gen_images.save(output_path)
237
+
238
+ # save caption
239
+ with open(output_caption_path, 'w') as f:
240
+ f.write(caption)
241
+
242
+ if output_inputs_path is not None:
243
+ os.makedirs(os.path.dirname(output_inputs_path), exist_ok=True)
244
+ image.save(output_inputs_path)
245
+ with open(output_inputs_caption_path, 'w') as f:
246
+ f.write(caption)
247
+
248
+ pbar.update(1)
249
+ batch.cleanup()
250
+
251
+ pbar.close()
252
+ print("Done generating images")
253
+ # cleanup
254
+ del self.sd
255
+ gc.collect()
256
+ torch.cuda.empty_cache()
extensions_built_in/advanced_generator/PureLoraGenerator.py ADDED
@@ -0,0 +1,102 @@
1
+ import os
2
+ from collections import OrderedDict
3
+
4
+ from toolkit.config_modules import ModelConfig, GenerateImageConfig, SampleConfig, LoRMConfig
5
+ from toolkit.lorm import ExtractMode, convert_diffusers_unet_to_lorm
6
+ from toolkit.sd_device_states_presets import get_train_sd_device_state_preset
7
+ from toolkit.stable_diffusion_model import StableDiffusion
8
+ import gc
9
+ import torch
10
+ from jobs.process import BaseExtensionProcess
11
+ from toolkit.train_tools import get_torch_dtype
12
+
13
+
14
+ def flush():
15
+ torch.cuda.empty_cache()
16
+ gc.collect()
17
+
18
+
19
+ class PureLoraGenerator(BaseExtensionProcess):
20
+
21
+ def __init__(self, process_id: int, job, config: OrderedDict):
22
+ super().__init__(process_id, job, config)
23
+ self.output_folder = self.get_conf('output_folder', required=True)
24
+ self.device = self.get_conf('device', 'cuda')
25
+ self.device_torch = torch.device(self.device)
26
+ self.model_config = ModelConfig(**self.get_conf('model', required=True))
27
+ self.generate_config = SampleConfig(**self.get_conf('sample', required=True))
28
+ self.dtype = self.get_conf('dtype', 'float16')
29
+ self.torch_dtype = get_torch_dtype(self.dtype)
30
+ lorm_config = self.get_conf('lorm', None)
31
+ self.lorm_config = LoRMConfig(**lorm_config) if lorm_config is not None else None
32
+
33
+ self.device_state_preset = get_train_sd_device_state_preset(
34
+ device=torch.device(self.device),
35
+ )
36
+
37
+ self.progress_bar = None
38
+ self.sd = StableDiffusion(
39
+ device=self.device,
40
+ model_config=self.model_config,
41
+ dtype=self.dtype,
42
+ )
43
+
44
+ def run(self):
45
+ super().run()
46
+ print("Loading model...")
47
+ with torch.no_grad():
48
+ self.sd.load_model()
49
+ self.sd.unet.eval()
50
+ self.sd.unet.to(self.device_torch)
51
+ if isinstance(self.sd.text_encoder, list):
52
+ for te in self.sd.text_encoder:
53
+ te.eval()
54
+ te.to(self.device_torch)
55
+ else:
56
+ self.sd.text_encoder.eval()
57
+ self.sd.to(self.device_torch)
58
+
59
+ print(f"Converting to LoRM UNet")
60
+ # replace the unet with LoRMUnet
61
+ convert_diffusers_unet_to_lorm(
62
+ self.sd.unet,
63
+ config=self.lorm_config,
64
+ )
65
+
66
+ sample_folder = os.path.join(self.output_folder)
67
+ gen_img_config_list = []
68
+
69
+ sample_config = self.generate_config
70
+ start_seed = sample_config.seed
71
+ current_seed = start_seed
72
+ for i in range(len(sample_config.prompts)):
73
+ if sample_config.walk_seed:
74
+ current_seed = start_seed + i
75
+
76
+ filename = f"[time]_[count].{self.generate_config.ext}"
77
+ output_path = os.path.join(sample_folder, filename)
78
+ prompt = sample_config.prompts[i]
79
+ extra_args = {}
80
+ gen_img_config_list.append(GenerateImageConfig(
81
+ prompt=prompt, # it will autoparse the prompt
82
+ width=sample_config.width,
83
+ height=sample_config.height,
84
+ negative_prompt=sample_config.neg,
85
+ seed=current_seed,
86
+ guidance_scale=sample_config.guidance_scale,
87
+ guidance_rescale=sample_config.guidance_rescale,
88
+ num_inference_steps=sample_config.sample_steps,
89
+ network_multiplier=sample_config.network_multiplier,
90
+ output_path=output_path,
91
+ output_ext=sample_config.ext,
92
+ adapter_conditioning_scale=sample_config.adapter_conditioning_scale,
93
+ **extra_args
94
+ ))
95
+
96
+ # send to be generated
97
+ self.sd.generate_images(gen_img_config_list, sampler=sample_config.sampler)
98
+ print("Done generating images")
99
+ # cleanup
100
+ del self.sd
101
+ gc.collect()
102
+ torch.cuda.empty_cache()
extensions_built_in/advanced_generator/ReferenceGenerator.py ADDED
@@ -0,0 +1,212 @@
1
+ import os
2
+ import random
3
+ from collections import OrderedDict
4
+ from typing import List
5
+
6
+ import numpy as np
7
+ from PIL import Image
8
+ from diffusers import T2IAdapter
9
+ from torch.utils.data import DataLoader
10
+ from diffusers import StableDiffusionXLAdapterPipeline, StableDiffusionAdapterPipeline
11
+ from tqdm import tqdm
12
+
13
+ from toolkit.config_modules import ModelConfig, GenerateImageConfig, preprocess_dataset_raw_config, DatasetConfig
14
+ from toolkit.data_transfer_object.data_loader import FileItemDTO, DataLoaderBatchDTO
15
+ from toolkit.sampler import get_sampler
16
+ from toolkit.stable_diffusion_model import StableDiffusion
17
+ import gc
18
+ import torch
19
+ from jobs.process import BaseExtensionProcess
20
+ from toolkit.data_loader import get_dataloader_from_datasets
21
+ from toolkit.train_tools import get_torch_dtype
22
+ from controlnet_aux.midas import MidasDetector
23
+ from diffusers.utils import load_image
24
+
25
+
26
+ def flush():
27
+ torch.cuda.empty_cache()
28
+ gc.collect()
29
+
30
+
31
+ class GenerateConfig:
32
+
33
+ def __init__(self, **kwargs):
34
+ self.prompts: List[str]
35
+ self.sampler = kwargs.get('sampler', 'ddpm')
36
+ self.neg = kwargs.get('neg', '')
37
+ self.seed = kwargs.get('seed', -1)
38
+ self.walk_seed = kwargs.get('walk_seed', False)
39
+ self.t2i_adapter_path = kwargs.get('t2i_adapter_path', None)
40
+ self.guidance_scale = kwargs.get('guidance_scale', 7)
41
+ self.sample_steps = kwargs.get('sample_steps', 20)
42
+ self.prompt_2 = kwargs.get('prompt_2', None)
43
+ self.neg_2 = kwargs.get('neg_2', None)
44
+ self.prompts = kwargs.get('prompts', None)
45
+ self.guidance_rescale = kwargs.get('guidance_rescale', 0.0)
46
+ self.ext = kwargs.get('ext', 'png')
47
+ self.adapter_conditioning_scale = kwargs.get('adapter_conditioning_scale', 1.0)
48
+ if kwargs.get('shuffle', False):
49
+ # shuffle the prompts
50
+ random.shuffle(self.prompts)
51
+
52
+
53
+ class ReferenceGenerator(BaseExtensionProcess):
54
+
55
+ def __init__(self, process_id: int, job, config: OrderedDict):
56
+ super().__init__(process_id, job, config)
57
+ self.output_folder = self.get_conf('output_folder', required=True)
58
+ self.device = self.get_conf('device', 'cuda')
59
+ self.model_config = ModelConfig(**self.get_conf('model', required=True))
60
+ self.generate_config = GenerateConfig(**self.get_conf('generate', required=True))
61
+ self.is_latents_cached = True
62
+ raw_datasets = self.get_conf('datasets', None)
63
+ if raw_datasets is not None and len(raw_datasets) > 0:
64
+ raw_datasets = preprocess_dataset_raw_config(raw_datasets)
65
+ self.datasets = None
66
+ self.datasets_reg = None
67
+ self.dtype = self.get_conf('dtype', 'float16')
68
+ self.torch_dtype = get_torch_dtype(self.dtype)
69
+ self.params = []
70
+ if raw_datasets is not None and len(raw_datasets) > 0:
71
+ for raw_dataset in raw_datasets:
72
+ dataset = DatasetConfig(**raw_dataset)
73
+ is_caching = dataset.cache_latents or dataset.cache_latents_to_disk
74
+ if not is_caching:
75
+ self.is_latents_cached = False
76
+ if dataset.is_reg:
77
+ if self.datasets_reg is None:
78
+ self.datasets_reg = []
79
+ self.datasets_reg.append(dataset)
80
+ else:
81
+ if self.datasets is None:
82
+ self.datasets = []
83
+ self.datasets.append(dataset)
84
+
85
+ self.progress_bar = None
86
+ self.sd = StableDiffusion(
87
+ device=self.device,
88
+ model_config=self.model_config,
89
+ dtype=self.dtype,
90
+ )
91
+ print(f"Using device {self.device}")
92
+ self.data_loader: DataLoader = None
93
+ self.adapter: T2IAdapter = None
94
+
95
+ def run(self):
96
+ super().run()
97
+ print("Loading model...")
98
+ self.sd.load_model()
99
+ device = torch.device(self.device)
100
+
101
+ if self.generate_config.t2i_adapter_path is not None:
102
+ self.adapter = T2IAdapter.from_pretrained(
103
+ self.generate_config.t2i_adapter_path,
104
+ torch_dtype=self.torch_dtype,
105
+ variant="fp16"
106
+ ).to(device)
107
+
108
+ midas_depth = MidasDetector.from_pretrained(
109
+ "valhalla/t2iadapter-aux-models", filename="dpt_large_384.pt", model_type="dpt_large"
110
+ ).to(device)
111
+
112
+ if self.model_config.is_xl:
113
+ pipe = StableDiffusionXLAdapterPipeline(
114
+ vae=self.sd.vae,
115
+ unet=self.sd.unet,
116
+ text_encoder=self.sd.text_encoder[0],
117
+ text_encoder_2=self.sd.text_encoder[1],
118
+ tokenizer=self.sd.tokenizer[0],
119
+ tokenizer_2=self.sd.tokenizer[1],
120
+ scheduler=get_sampler(self.generate_config.sampler),
121
+ adapter=self.adapter,
122
+ ).to(device, dtype=self.torch_dtype)
123
+ else:
124
+ pipe = StableDiffusionAdapterPipeline(
125
+ vae=self.sd.vae,
126
+ unet=self.sd.unet,
127
+ text_encoder=self.sd.text_encoder,
128
+ tokenizer=self.sd.tokenizer,
129
+ scheduler=get_sampler(self.generate_config.sampler),
130
+ safety_checker=None,
131
+ feature_extractor=None,
132
+ requires_safety_checker=False,
133
+ adapter=self.adapter,
134
+ ).to(device, dtype=self.torch_dtype)
135
+ pipe.set_progress_bar_config(disable=True)
136
+
137
+ pipe.unet = torch.compile(pipe.unet, mode="reduce-overhead", fullgraph=True)
138
+ # midas_depth = torch.compile(midas_depth, mode="reduce-overhead", fullgraph=True)
139
+
140
+ self.data_loader = get_dataloader_from_datasets(self.datasets, 1, self.sd)
141
+
142
+ num_batches = len(self.data_loader)
143
+ pbar = tqdm(total=num_batches, desc="Generating images")
144
+ seed = self.generate_config.seed
145
+ # load images from datasets, use tqdm
146
+ for i, batch in enumerate(self.data_loader):
147
+ batch: DataLoaderBatchDTO = batch
148
+
149
+ file_item: FileItemDTO = batch.file_items[0]
150
+ img_path = file_item.path
151
+ img_filename = os.path.basename(img_path)
152
+ img_filename_no_ext = os.path.splitext(img_filename)[0]
153
+ output_path = os.path.join(self.output_folder, img_filename)
154
+ output_caption_path = os.path.join(self.output_folder, img_filename_no_ext + '.txt')
155
+ output_depth_path = os.path.join(self.output_folder, img_filename_no_ext + '.depth.png')
156
+
157
+ caption = batch.get_caption_list()[0]
158
+
159
+ img: torch.Tensor = batch.tensor.clone()
160
+ # image comes in -1 to 1. convert to a PIL RGB image
161
+ img = (img + 1) / 2
162
+ img = img.clamp(0, 1)
163
+ img = img[0].permute(1, 2, 0).cpu().numpy()
164
+ img = (img * 255).astype(np.uint8)
165
+ image = Image.fromarray(img)
166
+
167
+ width, height = image.size
168
+ min_res = min(width, height)
169
+
170
+ if self.generate_config.walk_seed:
171
+ seed = seed + 1
172
+
173
+ if self.generate_config.seed == -1:
174
+ # random
175
+ seed = random.randint(0, 1000000)
176
+
177
+ torch.manual_seed(seed)
178
+ torch.cuda.manual_seed(seed)
179
+
180
+ # generate depth map
181
+ image = midas_depth(
182
+ image,
183
+ detect_resolution=min_res, # do 512 ?
184
+ image_resolution=min_res
185
+ )
186
+
187
+ # image.save(output_depth_path)
188
+
189
+ gen_images = pipe(
190
+ prompt=caption,
191
+ negative_prompt=self.generate_config.neg,
192
+ image=image,
193
+ num_inference_steps=self.generate_config.sample_steps,
194
+ adapter_conditioning_scale=self.generate_config.adapter_conditioning_scale,
195
+ guidance_scale=self.generate_config.guidance_scale,
196
+ ).images[0]
197
+ os.makedirs(os.path.dirname(output_path), exist_ok=True)
198
+ gen_images.save(output_path)
199
+
200
+ # save caption
201
+ with open(output_caption_path, 'w') as f:
202
+ f.write(caption)
203
+
204
+ pbar.update(1)
205
+ batch.cleanup()
206
+
207
+ pbar.close()
208
+ print("Done generating images")
209
+ # cleanup
210
+ del self.sd
211
+ gc.collect()
212
+ torch.cuda.empty_cache()
extensions_built_in/advanced_generator/__init__.py ADDED
@@ -0,0 +1,59 @@
+ # This is an example extension for custom training. It is great for experimenting with new ideas.
+ from toolkit.extension import Extension
+ 
+ 
+ # This is for generic training (LoRA, Dreambooth, FineTuning)
+ class AdvancedReferenceGeneratorExtension(Extension):
+     # uid must be unique, it is how the extension is identified
+     uid = "reference_generator"
+ 
+     # name is the name of the extension for printing
+     name = "Reference Generator"
+ 
+     # This is where your process class is loaded
+     # keep your imports in here so they don't slow down the rest of the program
+     @classmethod
+     def get_process(cls):
+         # import your process class here so it is only loaded when needed and return it
+         from .ReferenceGenerator import ReferenceGenerator
+         return ReferenceGenerator
+ 
+ 
+ # This is for generic training (LoRA, Dreambooth, FineTuning)
+ class PureLoraGenerator(Extension):
+     # uid must be unique, it is how the extension is identified
+     uid = "pure_lora_generator"
+ 
+     # name is the name of the extension for printing
+     name = "Pure LoRA Generator"
+ 
+     # This is where your process class is loaded
+     # keep your imports in here so they don't slow down the rest of the program
+     @classmethod
+     def get_process(cls):
+         # import your process class here so it is only loaded when needed and return it
+         from .PureLoraGenerator import PureLoraGenerator
+         return PureLoraGenerator
+ 
+ 
+ # This is for generic training (LoRA, Dreambooth, FineTuning)
+ class Img2ImgGeneratorExtension(Extension):
+     # uid must be unique, it is how the extension is identified
+     uid = "batch_img2img"
+ 
+     # name is the name of the extension for printing
+     name = "Img2ImgGeneratorExtension"
+ 
+     # This is where your process class is loaded
+     # keep your imports in here so they don't slow down the rest of the program
+     @classmethod
+     def get_process(cls):
+         # import your process class here so it is only loaded when needed and return it
+         from .Img2ImgGenerator import Img2ImgGenerator
+         return Img2ImgGenerator
+ 
+ 
+ AI_TOOLKIT_EXTENSIONS = [
+     # you can put a list of extensions here
+     AdvancedReferenceGeneratorExtension, PureLoraGenerator, Img2ImgGeneratorExtension
+ ]
extensions_built_in/advanced_generator/config/train.example.yaml ADDED
@@ -0,0 +1,91 @@
1
+ ---
2
+ job: extension
3
+ config:
4
+ name: test_v1
5
+ process:
6
+ - type: 'textual_inversion_trainer'
7
+ training_folder: "out/TI"
8
+ device: cuda:0
9
+ # for tensorboard logging
10
+ log_dir: "out/.tensorboard"
11
+ embedding:
12
+ trigger: "your_trigger_here"
13
+ tokens: 12
14
+ init_words: "man with short brown hair"
15
+ save_format: "safetensors" # 'safetensors' or 'pt'
16
+ save:
17
+ dtype: float16 # precision to save
18
+ save_every: 100 # save every this many steps
19
+ max_step_saves_to_keep: 5 # only affects step counts
20
+ datasets:
21
+ - folder_path: "/path/to/dataset"
22
+ caption_ext: "txt"
23
+ default_caption: "[trigger]"
24
+ buckets: true
25
+ resolution: 512
26
+ train:
27
+ noise_scheduler: "ddpm" # "ddpm", "lms", or "euler_a"
28
+ steps: 3000
29
+ weight_jitter: 0.0
30
+ lr: 5e-5
31
+ train_unet: false
32
+ gradient_checkpointing: true
33
+ train_text_encoder: false
34
+ optimizer: "adamw"
35
+ # optimizer: "prodigy"
36
+ optimizer_params:
37
+ weight_decay: 1e-2
38
+ lr_scheduler: "constant"
39
+ max_denoising_steps: 1000
40
+ batch_size: 4
41
+ dtype: bf16
42
+ xformers: true
43
+ min_snr_gamma: 5.0
44
+ # skip_first_sample: true
45
+ noise_offset: 0.0 # not needed for this
46
+ model:
47
+ # objective reality v2
48
+ name_or_path: "https://civitai.com/models/128453?modelVersionId=142465"
49
+ is_v2: false # for v2 models
50
+ is_xl: false # for SDXL models
51
+ is_v_pred: false # for v-prediction models (most v2 models)
52
+ sample:
53
+ sampler: "ddpm" # must match train.noise_scheduler
54
+ sample_every: 100 # sample every this many steps
55
+ width: 512
56
+ height: 512
57
+ prompts:
58
+ - "photo of [trigger] laughing"
59
+ - "photo of [trigger] smiling"
60
+ - "[trigger] close up"
61
+ - "dark scene [trigger] frozen"
62
+ - "[trigger] nighttime"
63
+ - "a painting of [trigger]"
64
+ - "a drawing of [trigger]"
65
+ - "a cartoon of [trigger]"
66
+ - "[trigger] pixar style"
67
+ - "[trigger] costume"
68
+ neg: ""
69
+ seed: 42
70
+ walk_seed: false
71
+ guidance_scale: 7
72
+ sample_steps: 20
73
+ network_multiplier: 1.0
74
+
75
+ logging:
76
+ log_every: 10 # log every this many steps
77
+ use_wandb: false # not supported yet
78
+ verbose: false
79
+
80
+ # You can put any information you want here, and it will be saved in the model.
81
+ # The below is an example, but you can put your grocery list in it if you want.
82
+ # It is saved in the model so be aware of that. The software will include this
83
+ # plus some other information for you automatically
84
+ meta:
85
+ # [name] gets replaced with the name above
86
+ name: "[name]"
87
+ # version: '1.0'
88
+ # creator:
89
+ # name: Your Name
90
+ # email: [email protected]
91
+ # website: https://your.website
extensions_built_in/concept_replacer/ConceptReplacer.py ADDED
@@ -0,0 +1,151 @@
1
+ import random
2
+ from collections import OrderedDict
3
+ from torch.utils.data import DataLoader
4
+ from toolkit.prompt_utils import concat_prompt_embeds, split_prompt_embeds
5
+ from toolkit.stable_diffusion_model import StableDiffusion, BlankNetwork
6
+ from toolkit.train_tools import get_torch_dtype, apply_snr_weight
7
+ import gc
8
+ import torch
9
+ from jobs.process import BaseSDTrainProcess
10
+
11
+
12
+ def flush():
13
+ torch.cuda.empty_cache()
14
+ gc.collect()
15
+
16
+
17
+ class ConceptReplacementConfig:
18
+ def __init__(self, **kwargs):
19
+ self.concept: str = kwargs.get('concept', '')
20
+ self.replacement: str = kwargs.get('replacement', '')
21
+
22
+
23
+ class ConceptReplacer(BaseSDTrainProcess):
24
+
25
+ def __init__(self, process_id: int, job, config: OrderedDict, **kwargs):
26
+ super().__init__(process_id, job, config, **kwargs)
27
+ replacement_list = self.config.get('replacements', [])
28
+ self.replacement_list = [ConceptReplacementConfig(**x) for x in replacement_list]
29
+
30
+ def before_model_load(self):
31
+ pass
32
+
33
+ def hook_before_train_loop(self):
34
+ self.sd.vae.eval()
35
+ self.sd.vae.to(self.device_torch)
36
+
37
+ # textual inversion
38
+ if self.embedding is not None:
39
+ # set text encoder to train. Not sure if this is necessary but diffusers example did it
40
+ self.sd.text_encoder.train()
41
+
42
+ def hook_train_loop(self, batch):
43
+ with torch.no_grad():
44
+ dtype = get_torch_dtype(self.train_config.dtype)
45
+ noisy_latents, noise, timesteps, conditioned_prompts, imgs = self.process_general_training_batch(batch)
46
+ network_weight_list = batch.get_network_weight_list()
47
+
48
+ # have a blank network so we can wrap it in a context and set multipliers without checking every time
49
+ if self.network is not None:
50
+ network = self.network
51
+ else:
52
+ network = BlankNetwork()
53
+
54
+ batch_replacement_list = []
55
+ # get a random replacement for each prompt
56
+ for prompt in conditioned_prompts:
57
+ replacement = random.choice(self.replacement_list)
58
+ batch_replacement_list.append(replacement)
59
+
60
+ # build out prompts
61
+ concept_prompts = []
62
+ replacement_prompts = []
63
+ for idx, replacement in enumerate(batch_replacement_list):
64
+ prompt = conditioned_prompts[idx]
65
+
66
+ # insert shuffled concept at beginning and end of prompt
67
+ shuffled_concept = [x.strip() for x in replacement.concept.split(',')]
68
+ random.shuffle(shuffled_concept)
69
+ shuffled_concept = ', '.join(shuffled_concept)
70
+ concept_prompts.append(f"{shuffled_concept}, {prompt}, {shuffled_concept}")
71
+
72
+ # insert replacement at beginning and end of prompt
73
+ shuffled_replacement = [x.strip() for x in replacement.replacement.split(',')]
74
+ random.shuffle(shuffled_replacement)
75
+ shuffled_replacement = ', '.join(shuffled_replacement)
76
+ replacement_prompts.append(f"{shuffled_replacement}, {prompt}, {shuffled_replacement}")
77
+
78
+ # predict the replacement without network
79
+ conditional_embeds = self.sd.encode_prompt(replacement_prompts).to(self.device_torch, dtype=dtype)
80
+
81
+ replacement_pred = self.sd.predict_noise(
82
+ latents=noisy_latents.to(self.device_torch, dtype=dtype),
83
+ conditional_embeddings=conditional_embeds.to(self.device_torch, dtype=dtype),
84
+ timestep=timesteps,
85
+ guidance_scale=1.0,
86
+ )
87
+
88
+ del conditional_embeds
89
+ replacement_pred = replacement_pred.detach()
90
+
91
+ self.optimizer.zero_grad()
92
+ flush()
93
+
94
+ # text encoding
95
+ grad_on_text_encoder = False
96
+ if self.train_config.train_text_encoder:
97
+ grad_on_text_encoder = True
98
+
99
+ if self.embedding:
100
+ grad_on_text_encoder = True
101
+
102
+ # set the weights
103
+ network.multiplier = network_weight_list
104
+
105
+ # activate network if it exists
106
+ with network:
107
+ with torch.set_grad_enabled(grad_on_text_encoder):
108
+ # embed the prompts
109
+ conditional_embeds = self.sd.encode_prompt(concept_prompts).to(self.device_torch, dtype=dtype)
110
+ if not grad_on_text_encoder:
111
+ # detach the embeddings
112
+ conditional_embeds = conditional_embeds.detach()
113
+ self.optimizer.zero_grad()
114
+ flush()
115
+
116
+ noise_pred = self.sd.predict_noise(
117
+ latents=noisy_latents.to(self.device_torch, dtype=dtype),
118
+ conditional_embeddings=conditional_embeds.to(self.device_torch, dtype=dtype),
119
+ timestep=timesteps,
120
+ guidance_scale=1.0,
121
+ )
122
+
123
+ loss = torch.nn.functional.mse_loss(noise_pred.float(), replacement_pred.float(), reduction="none")
124
+ loss = loss.mean([1, 2, 3])
125
+
126
+ if self.train_config.min_snr_gamma is not None and self.train_config.min_snr_gamma > 0.000001:
127
+ # add min_snr_gamma
128
+ loss = apply_snr_weight(loss, timesteps, self.sd.noise_scheduler, self.train_config.min_snr_gamma)
129
+
130
+ loss = loss.mean()
131
+
132
+ # back propagate loss to free ram
133
+ loss.backward()
134
+ flush()
135
+
136
+ # apply gradients
137
+ self.optimizer.step()
138
+ self.optimizer.zero_grad()
139
+ self.lr_scheduler.step()
140
+
141
+ if self.embedding is not None:
142
+ # Let's make sure we don't update any embedding weights besides the newly added token
143
+ self.embedding.restore_embeddings()
144
+
145
+ loss_dict = OrderedDict(
146
+ {'loss': loss.item()}
147
+ )
148
+ # reset network multiplier
149
+ network.multiplier = 1.0
150
+
151
+ return loss_dict
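Note: hook_train_loop above is a distillation-style objective: the frozen model's prediction for the replacement prompts becomes the regression target for the network's prediction on the concept prompts. Stripped to its core, the loss is roughly the following (schematic sketch, not the process's actual code):

    import torch
    import torch.nn.functional as F

    def concept_replacement_loss(pred_on_concept: torch.Tensor, pred_on_replacement: torch.Tensor) -> torch.Tensor:
        # target comes from the frozen/base model and must not receive gradients
        target = pred_on_replacement.detach()
        # per-sample MSE over channel/height/width, then mean over the batch
        per_sample = F.mse_loss(pred_on_concept.float(), target.float(), reduction="none").mean(dim=[1, 2, 3])
        return per_sample.mean()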
extensions_built_in/concept_replacer/__init__.py ADDED
@@ -0,0 +1,26 @@
+ # This is an example extension for custom training. It is great for experimenting with new ideas.
+ from toolkit.extension import Extension
+ 
+ 
+ # This is for generic training (LoRA, Dreambooth, FineTuning)
+ class ConceptReplacerExtension(Extension):
+     # uid must be unique, it is how the extension is identified
+     uid = "concept_replacer"
+ 
+     # name is the name of the extension for printing
+     name = "Concept Replacer"
+ 
+     # This is where your process class is loaded
+     # keep your imports in here so they don't slow down the rest of the program
+     @classmethod
+     def get_process(cls):
+         # import your process class here so it is only loaded when needed and return it
+         from .ConceptReplacer import ConceptReplacer
+         return ConceptReplacer
+ 
+ 
+ 
+ AI_TOOLKIT_EXTENSIONS = [
+     # you can put a list of extensions here
+     ConceptReplacerExtension,
+ ]
extensions_built_in/concept_replacer/config/train.example.yaml ADDED
@@ -0,0 +1,91 @@
1
+ ---
2
+ job: extension
3
+ config:
4
+ name: test_v1
5
+ process:
6
+ - type: 'textual_inversion_trainer'
7
+ training_folder: "out/TI"
8
+ device: cuda:0
9
+ # for tensorboard logging
10
+ log_dir: "out/.tensorboard"
11
+ embedding:
12
+ trigger: "your_trigger_here"
13
+ tokens: 12
14
+ init_words: "man with short brown hair"
15
+ save_format: "safetensors" # 'safetensors' or 'pt'
16
+ save:
17
+ dtype: float16 # precision to save
18
+ save_every: 100 # save every this many steps
19
+ max_step_saves_to_keep: 5 # only affects step counts
20
+ datasets:
21
+ - folder_path: "/path/to/dataset"
22
+ caption_ext: "txt"
23
+ default_caption: "[trigger]"
24
+ buckets: true
25
+ resolution: 512
26
+ train:
27
+ noise_scheduler: "ddpm" # "ddpm", "lms", or "euler_a"
28
+ steps: 3000
29
+ weight_jitter: 0.0
30
+ lr: 5e-5
31
+ train_unet: false
32
+ gradient_checkpointing: true
33
+ train_text_encoder: false
34
+ optimizer: "adamw"
35
+ # optimizer: "prodigy"
36
+ optimizer_params:
37
+ weight_decay: 1e-2
38
+ lr_scheduler: "constant"
39
+ max_denoising_steps: 1000
40
+ batch_size: 4
41
+ dtype: bf16
42
+ xformers: true
43
+ min_snr_gamma: 5.0
44
+ # skip_first_sample: true
45
+ noise_offset: 0.0 # not needed for this
46
+ model:
47
+ # objective reality v2
48
+ name_or_path: "https://civitai.com/models/128453?modelVersionId=142465"
49
+ is_v2: false # for v2 models
50
+ is_xl: false # for SDXL models
51
+ is_v_pred: false # for v-prediction models (most v2 models)
52
+ sample:
53
+ sampler: "ddpm" # must match train.noise_scheduler
54
+ sample_every: 100 # sample every this many steps
55
+ width: 512
56
+ height: 512
57
+ prompts:
58
+ - "photo of [trigger] laughing"
59
+ - "photo of [trigger] smiling"
60
+ - "[trigger] close up"
61
+ - "dark scene [trigger] frozen"
62
+ - "[trigger] nighttime"
63
+ - "a painting of [trigger]"
64
+ - "a drawing of [trigger]"
65
+ - "a cartoon of [trigger]"
66
+ - "[trigger] pixar style"
67
+ - "[trigger] costume"
68
+ neg: ""
69
+ seed: 42
70
+ walk_seed: false
71
+ guidance_scale: 7
72
+ sample_steps: 20
73
+ network_multiplier: 1.0
74
+
75
+ logging:
76
+ log_every: 10 # log every this many steps
77
+ use_wandb: false # not supported yet
78
+ verbose: false
79
+
80
+ # You can put any information you want here, and it will be saved in the model.
81
+ # The below is an example, but you can put your grocery list in it if you want.
82
+ # It is saved in the model so be aware of that. The software will include this
83
+ # plus some other information for you automatically
84
+ meta:
85
+ # [name] gets replaced with the name above
86
+ name: "[name]"
87
+ # version: '1.0'
88
+ # creator:
89
+ # name: Your Name
90
+ # email: [email protected]
91
+ # website: https://your.website
extensions_built_in/dataset_tools/DatasetTools.py ADDED
@@ -0,0 +1,20 @@
+ from collections import OrderedDict
+ import gc
+ import torch
+ from jobs.process import BaseExtensionProcess
+ 
+ 
+ def flush():
+     torch.cuda.empty_cache()
+     gc.collect()
+ 
+ 
+ class DatasetTools(BaseExtensionProcess):
+ 
+     def __init__(self, process_id: int, job, config: OrderedDict):
+         super().__init__(process_id, job, config)
+ 
+     def run(self):
+         super().run()
+ 
+         raise NotImplementedError("This extension is not yet implemented")
extensions_built_in/dataset_tools/SuperTagger.py ADDED
@@ -0,0 +1,196 @@
1
+ import copy
2
+ import json
3
+ import os
4
+ from collections import OrderedDict
5
+ import gc
6
+ import traceback
7
+ import torch
8
+ from PIL import Image, ImageOps
9
+ from tqdm import tqdm
10
+
11
+ from .tools.dataset_tools_config_modules import RAW_DIR, TRAIN_DIR, Step, ImgInfo
12
+ from .tools.fuyu_utils import FuyuImageProcessor
13
+ from .tools.image_tools import load_image, ImageProcessor, resize_to_max
14
+ from .tools.llava_utils import LLaVAImageProcessor
15
+ from .tools.caption import default_long_prompt, default_short_prompt, default_replacements
16
+ from jobs.process import BaseExtensionProcess
17
+ from .tools.sync_tools import get_img_paths
18
+
19
+ img_ext = ['.jpg', '.jpeg', '.png', '.webp']
20
+
21
+
22
+ def flush():
23
+ torch.cuda.empty_cache()
24
+ gc.collect()
25
+
26
+
27
+ VERSION = 2
28
+
29
+
30
+ class SuperTagger(BaseExtensionProcess):
31
+
32
+ def __init__(self, process_id: int, job, config: OrderedDict):
33
+ super().__init__(process_id, job, config)
34
+ parent_dir = config.get('parent_dir', None)
35
+ self.dataset_paths: list[str] = config.get('dataset_paths', [])
36
+ self.device = config.get('device', 'cuda')
37
+ self.steps: list[Step] = config.get('steps', [])
38
+ self.caption_method = config.get('caption_method', 'llava:default')
39
+ self.caption_prompt = config.get('caption_prompt', default_long_prompt)
40
+ self.caption_short_prompt = config.get('caption_short_prompt', default_short_prompt)
41
+ self.force_reprocess_img = config.get('force_reprocess_img', False)
42
+ self.caption_replacements = config.get('caption_replacements', default_replacements)
43
+ self.caption_short_replacements = config.get('caption_short_replacements', default_replacements)
44
+ self.master_dataset_dict = OrderedDict()
45
+ self.dataset_master_config_file = config.get('dataset_master_config_file', None)
46
+ if parent_dir is not None and len(self.dataset_paths) == 0:
47
+ # find all folders in the parent_dir
48
+ self.dataset_paths = [
49
+ os.path.join(parent_dir, folder)
50
+ for folder in os.listdir(parent_dir)
51
+ if os.path.isdir(os.path.join(parent_dir, folder))
52
+ ]
53
+ else:
54
+ # make sure they exist
55
+ for dataset_path in self.dataset_paths:
56
+ if not os.path.exists(dataset_path):
57
+ raise ValueError(f"Dataset path does not exist: {dataset_path}")
58
+
59
+ print(f"Found {len(self.dataset_paths)} dataset paths")
60
+
61
+ self.image_processor: ImageProcessor = self.get_image_processor()
62
+
63
+ def get_image_processor(self):
64
+ if self.caption_method.startswith('llava'):
65
+ return LLaVAImageProcessor(device=self.device)
66
+ elif self.caption_method.startswith('fuyu'):
67
+ return FuyuImageProcessor(device=self.device)
68
+ else:
69
+ raise ValueError(f"Unknown caption method: {self.caption_method}")
70
+
71
+ def process_image(self, img_path: str):
72
+ root_img_dir = os.path.dirname(os.path.dirname(img_path))
73
+ filename = os.path.basename(img_path)
74
+ filename_no_ext = os.path.splitext(filename)[0]
75
+ train_dir = os.path.join(root_img_dir, TRAIN_DIR)
76
+ train_img_path = os.path.join(train_dir, filename)
77
+ json_path = os.path.join(train_dir, f"{filename_no_ext}.json")
78
+
79
+ # check if json exists, if it does load it as image info
80
+ if os.path.exists(json_path):
81
+ with open(json_path, 'r') as f:
82
+ img_info = ImgInfo(**json.load(f))
83
+ else:
84
+ img_info = ImgInfo()
85
+
86
+ # always send steps first in case other processes need them
87
+ img_info.add_steps(copy.deepcopy(self.steps))
88
+ img_info.set_version(VERSION)
89
+ img_info.set_caption_method(self.caption_method)
90
+
91
+ image: Image = None
92
+ caption_image: Image = None
93
+
94
+ did_update_image = False
95
+
96
+ # trigger reprocess of steps
97
+ if self.force_reprocess_img:
98
+ img_info.trigger_image_reprocess()
99
+
100
+ # set the image as updated if it does not exist on disk
101
+ if not os.path.exists(train_img_path):
102
+ did_update_image = True
103
+ image = load_image(img_path)
104
+ if img_info.force_image_process:
105
+ did_update_image = True
106
+ image = load_image(img_path)
107
+
108
+ # go through the needed steps
109
+ for step in copy.deepcopy(img_info.state.steps_to_complete):
110
+ if step == 'caption':
111
+ # load image
112
+ if image is None:
113
+ image = load_image(img_path)
114
+ if caption_image is None:
115
+ caption_image = resize_to_max(image, 1024, 1024)
116
+
117
+ if not self.image_processor.is_loaded:
118
+ print('Loading Model. Takes a while, especially the first time')
119
+ self.image_processor.load_model()
120
+
121
+ img_info.caption = self.image_processor.generate_caption(
122
+ image=caption_image,
123
+ prompt=self.caption_prompt,
124
+ replacements=self.caption_replacements
125
+ )
126
+ img_info.mark_step_complete(step)
127
+ elif step == 'caption_short':
128
+ # load image
129
+ if image is None:
130
+ image = load_image(img_path)
131
+
132
+ if caption_image is None:
133
+ caption_image = resize_to_max(image, 1024, 1024)
134
+
135
+ if not self.image_processor.is_loaded:
136
+ print('Loading Model. Takes a while, especially the first time')
137
+ self.image_processor.load_model()
138
+ img_info.caption_short = self.image_processor.generate_caption(
139
+ image=caption_image,
140
+ prompt=self.caption_short_prompt,
141
+ replacements=self.caption_short_replacements
142
+ )
143
+ img_info.mark_step_complete(step)
144
+ elif step == 'contrast_stretch':
145
+ # load image
146
+ if image is None:
147
+ image = load_image(img_path)
148
+ image = ImageOps.autocontrast(image, cutoff=(0.1, 0), preserve_tone=True)
149
+ did_update_image = True
150
+ img_info.mark_step_complete(step)
151
+ else:
152
+ raise ValueError(f"Unknown step: {step}")
153
+
154
+ os.makedirs(os.path.dirname(train_img_path), exist_ok=True)
155
+ if did_update_image:
156
+ image.save(train_img_path)
157
+
158
+ if img_info.is_dirty:
159
+ with open(json_path, 'w') as f:
160
+ json.dump(img_info.to_dict(), f, indent=4)
161
+
162
+ if self.dataset_master_config_file:
163
+ # add to master dict
164
+ self.master_dataset_dict[train_img_path] = img_info.to_dict()
165
+
166
+ def run(self):
167
+ super().run()
168
+ imgs_to_process = []
169
+ # find all images
170
+ for dataset_path in self.dataset_paths:
171
+ raw_dir = os.path.join(dataset_path, RAW_DIR)
172
+ raw_image_paths = get_img_paths(raw_dir)
173
+ for raw_image_path in raw_image_paths:
174
+ imgs_to_process.append(raw_image_path)
175
+
176
+ if len(imgs_to_process) == 0:
177
+ print(f"No images to process")
178
+ else:
179
+ print(f"Found {len(imgs_to_process)} to process")
180
+
181
+ for img_path in tqdm(imgs_to_process, desc="Processing images"):
182
+ try:
183
+ self.process_image(img_path)
184
+ except Exception:
185
+ # print full stack trace
186
+ print(traceback.format_exc())
187
+ continue
188
+ # self.process_image(img_path)
189
+
190
+ if self.dataset_master_config_file is not None:
191
+ # save it as json
192
+ with open(self.dataset_master_config_file, 'w') as f:
193
+ json.dump(self.master_dataset_dict, f, indent=4)
194
+
195
+ del self.image_processor
196
+ flush()
extensions_built_in/dataset_tools/SyncFromCollection.py ADDED
@@ -0,0 +1,131 @@
1
+ import os
2
+ import shutil
3
+ from collections import OrderedDict
4
+ import gc
5
+ from typing import List
6
+
7
+ import torch
8
+ from tqdm import tqdm
9
+
10
+ from .tools.dataset_tools_config_modules import DatasetSyncCollectionConfig, RAW_DIR, NEW_DIR
11
+ from .tools.sync_tools import get_unsplash_images, get_pexels_images, get_local_image_file_names, download_image, \
12
+ get_img_paths
13
+ from jobs.process import BaseExtensionProcess
14
+
15
+
16
+ def flush():
17
+ torch.cuda.empty_cache()
18
+ gc.collect()
19
+
20
+
21
+ class SyncFromCollection(BaseExtensionProcess):
22
+
23
+ def __init__(self, process_id: int, job, config: OrderedDict):
24
+ super().__init__(process_id, job, config)
25
+
26
+ self.min_width = config.get('min_width', 1024)
27
+ self.min_height = config.get('min_height', 1024)
28
+
29
+ # add our min_width and min_height to each dataset config if they don't exist
30
+ for dataset_config in config.get('dataset_sync', []):
31
+ if 'min_width' not in dataset_config:
32
+ dataset_config['min_width'] = self.min_width
33
+ if 'min_height' not in dataset_config:
34
+ dataset_config['min_height'] = self.min_height
35
+
36
+ self.dataset_configs: List[DatasetSyncCollectionConfig] = [
37
+ DatasetSyncCollectionConfig(**dataset_config)
38
+ for dataset_config in config.get('dataset_sync', [])
39
+ ]
40
+ print(f"Found {len(self.dataset_configs)} dataset configs")
41
+
42
+ def move_new_images(self, root_dir: str):
43
+ raw_dir = os.path.join(root_dir, RAW_DIR)
44
+ new_dir = os.path.join(root_dir, NEW_DIR)
45
+ new_images = get_img_paths(new_dir)
46
+
47
+ for img_path in new_images:
48
+ # move to raw
49
+ new_path = os.path.join(raw_dir, os.path.basename(img_path))
50
+ shutil.move(img_path, new_path)
51
+
52
+ # remove new dir
53
+ shutil.rmtree(new_dir)
54
+
55
+ def sync_dataset(self, config: DatasetSyncCollectionConfig):
56
+ if config.host == 'unsplash':
57
+ get_images = get_unsplash_images
58
+ elif config.host == 'pexels':
59
+ get_images = get_pexels_images
60
+ else:
61
+ raise ValueError(f"Unknown host: {config.host}")
62
+
63
+ results = {
64
+ 'num_downloaded': 0,
65
+ 'num_skipped': 0,
66
+ 'bad': 0,
67
+ 'total': 0,
68
+ }
69
+
70
+ photos = get_images(config)
71
+ raw_dir = os.path.join(config.directory, RAW_DIR)
72
+ new_dir = os.path.join(config.directory, NEW_DIR)
73
+ raw_images = get_local_image_file_names(raw_dir)
74
+ new_images = get_local_image_file_names(new_dir)
75
+
76
+ for photo in tqdm(photos, desc=f"{config.host}-{config.collection_id}"):
77
+ try:
78
+ if photo.filename not in raw_images and photo.filename not in new_images:
79
+ download_image(photo, new_dir, min_width=self.min_width, min_height=self.min_height)
80
+ results['num_downloaded'] += 1
81
+ else:
82
+ results['num_skipped'] += 1
83
+ except Exception as e:
84
+ print(f" - BAD({photo.id}): {e}")
85
+ results['bad'] += 1
86
+ continue
87
+ results['total'] += 1
88
+
89
+ return results
90
+
91
+ def print_results(self, results):
92
+ print(
93
+ f" - new:{results['num_downloaded']}, old:{results['num_skipped']}, bad:{results['bad']} total:{results['total']}")
94
+
95
+ def run(self):
96
+ super().run()
97
+ print(f"Syncing {len(self.dataset_configs)} datasets")
98
+ all_results = None
99
+ failed_datasets = []
100
+ for dataset_config in tqdm(self.dataset_configs, desc="Syncing datasets", leave=True):
101
+ try:
102
+ results = self.sync_dataset(dataset_config)
103
+ if all_results is None:
104
+ all_results = {**results}
105
+ else:
106
+ for key, value in results.items():
107
+ all_results[key] += value
108
+
109
+ self.print_results(results)
110
+ except Exception as e:
111
+ print(f" - FAILED: {e}")
112
+ if 'response' in e.__dict__:
113
+ error = f"{e.response.status_code}: {e.response.text}"
114
+ print(f" - {error}")
115
+ failed_datasets.append({'dataset': dataset_config, 'error': error})
116
+ else:
117
+ failed_datasets.append({'dataset': dataset_config, 'error': str(e)})
118
+ continue
119
+
120
+ print("Moving new images to raw")
121
+ for dataset_config in self.dataset_configs:
122
+ self.move_new_images(dataset_config.directory)
123
+
124
+ print("Done syncing datasets")
125
+ self.print_results(all_results)
126
+
127
+ if len(failed_datasets) > 0:
128
+ print(f"Failed to sync {len(failed_datasets)} datasets")
129
+ for failed in failed_datasets:
130
+ print(f" - {failed['dataset'].host}-{failed['dataset'].collection_id}")
131
+ print(f" - ERR: {failed['error']}")
extensions_built_in/dataset_tools/__init__.py ADDED
+ from toolkit.extension import Extension
+ 
+ 
+ class DatasetToolsExtension(Extension):
+     uid = "dataset_tools"
+ 
+     # name is the name of the extension for printing
+     name = "Dataset Tools"
+ 
+     # This is where your process class is loaded
+     # keep your imports in here so they don't slow down the rest of the program
+     @classmethod
+     def get_process(cls):
+         # import your process class here so it is only loaded when needed and return it
+         from .DatasetTools import DatasetTools
+         return DatasetTools
+ 
+ 
+ class SyncFromCollectionExtension(Extension):
+     uid = "sync_from_collection"
+     name = "Sync from Collection"
+ 
+     @classmethod
+     def get_process(cls):
+         # import your process class here so it is only loaded when needed and return it
+         from .SyncFromCollection import SyncFromCollection
+         return SyncFromCollection
+ 
+ 
+ class SuperTaggerExtension(Extension):
+     uid = "super_tagger"
+     name = "Super Tagger"
+ 
+     @classmethod
+     def get_process(cls):
+         # import your process class here so it is only loaded when needed and return it
+         from .SuperTagger import SuperTagger
+         return SuperTagger
+ 
+ 
+ AI_TOOLKIT_EXTENSIONS = [
+     SyncFromCollectionExtension, DatasetToolsExtension, SuperTaggerExtension
+ ]
extensions_built_in/dataset_tools/tools/caption.py ADDED
@@ -0,0 +1,53 @@
+ 
+ caption_manipulation_steps = ['caption', 'caption_short']
+ 
+ default_long_prompt = 'caption this image. describe every single thing in the image in detail. Do not include any unnecessary words in your description for the sake of good grammar. I want many short statements that serve the single purpose of giving the most thorough description of items as possible in the smallest, comma separated way possible. be sure to describe people\'s moods, clothing, the environment, lighting, colors, and everything.'
+ default_short_prompt = 'caption this image in less than ten words'
+ 
+ default_replacements = [
+     ("the image features", ""),
+     ("the image shows", ""),
+     ("the image depicts", ""),
+     ("the image is", ""),
+     ("in this image", ""),
+     ("in the image", ""),
+ ]
+ 
+ 
+ def clean_caption(cap, replacements=None):
+     if replacements is None:
+         replacements = default_replacements
+ 
+     # remove any newlines
+     cap = cap.replace("\n", ", ")
+     cap = cap.replace("\r", ", ")
+     cap = cap.replace(".", ",")
+     cap = cap.replace("\"", "")
+ 
+     # remove unicode characters
+     cap = cap.encode('ascii', 'ignore').decode('ascii')
+ 
+     # make lowercase
+     cap = cap.lower()
+     # remove any extra spaces
+     cap = " ".join(cap.split())
+ 
+     for replacement in replacements:
+         if replacement[0].startswith('*'):
+             # we are removing all text if it starts with this and the rest matches
+             search_text = replacement[0][1:]
+             if cap.startswith(search_text):
+                 cap = ""
+         else:
+             cap = cap.replace(replacement[0].lower(), replacement[1].lower())
+ 
+     cap_list = cap.split(",")
+     # trim whitespace
+     cap_list = [c.strip() for c in cap_list]
+     # remove empty strings
+     cap_list = [c for c in cap_list if c != ""]
+     # remove duplicates
+     cap_list = list(dict.fromkeys(cap_list))
+     # join back together
+     cap = ", ".join(cap_list)
+     return cap
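Note: a quick usage example for clean_caption, traced by hand through the logic above:

    raw = "The image features a man with short brown hair. He is smiling."
    print(clean_caption(raw))
    # -> a man with short brown hair, he is smiling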