SahilCarterr committed on
Commit 703a7c0 · verified · 1 Parent(s): fd92c88

Upload 27 files
.gitattributes CHANGED
@@ -33,3 +33,6 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
  *.zip filter=lfs diff=lfs merge=lfs -text
  *.zst filter=lfs diff=lfs merge=lfs -text
  *tfevents* filter=lfs diff=lfs merge=lfs -text
+ ControlNetInpaint/output/baseline_grid.png filter=lfs diff=lfs merge=lfs -text
+ ControlNetInpaint/output/canny_cheeseburger_grid.png filter=lfs diff=lfs merge=lfs -text
+ ControlNetInpaint/output/canny_cheeseburger.png filter=lfs diff=lfs merge=lfs -text
ControlNetInpaint/.gitignore ADDED
@@ -0,0 +1,129 @@
+ # Byte-compiled / optimized / DLL files
+ __pycache__/
+ *.py[cod]
+ *$py.class
+
+ # C extensions
+ *.so
+
+ # Distribution / packaging
+ .Python
+ build/
+ develop-eggs/
+ dist/
+ downloads/
+ eggs/
+ .eggs/
+ lib/
+ lib64/
+ parts/
+ sdist/
+ var/
+ wheels/
+ pip-wheel-metadata/
+ share/python-wheels/
+ *.egg-info/
+ .installed.cfg
+ *.egg
+ MANIFEST
+
+ # PyInstaller
+ # Usually these files are written by a python script from a template
+ # before PyInstaller builds the exe, so as to inject date/other infos into it.
+ *.manifest
+ *.spec
+
+ # Installer logs
+ pip-log.txt
+ pip-delete-this-directory.txt
+
+ # Unit test / coverage reports
+ htmlcov/
+ .tox/
+ .nox/
+ .coverage
+ .coverage.*
+ .cache
+ nosetests.xml
+ coverage.xml
+ *.cover
+ *.py,cover
+ .hypothesis/
+ .pytest_cache/
+
+ # Translations
+ *.mo
+ *.pot
+
+ # Django stuff:
+ *.log
+ local_settings.py
+ db.sqlite3
+ db.sqlite3-journal
+
+ # Flask stuff:
+ instance/
+ .webassets-cache
+
+ # Scrapy stuff:
+ .scrapy
+
+ # Sphinx documentation
+ docs/_build/
+
+ # PyBuilder
+ target/
+
+ # Jupyter Notebook
+ .ipynb_checkpoints
+
+ # IPython
+ profile_default/
+ ipython_config.py
+
+ # pyenv
+ .python-version
+
+ # pipenv
+ # According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control.
+ # However, in case of collaboration, if having platform-specific dependencies or dependencies
+ # having no cross-platform support, pipenv may install dependencies that don't work, or not
+ # install all needed dependencies.
+ #Pipfile.lock
+
+ # PEP 582; used by e.g. github.com/David-OConnor/pyflow
+ __pypackages__/
+
+ # Celery stuff
+ celerybeat-schedule
+ celerybeat.pid
+
+ # SageMath parsed files
+ *.sage.py
+
+ # Environments
+ .env
+ .venv
+ env/
+ venv/
+ ENV/
+ env.bak/
+ venv.bak/
+
+ # Spyder project settings
+ .spyderproject
+ .spyproject
+
+ # Rope project settings
+ .ropeproject
+
+ # mkdocs documentation
+ /site
+
+ # mypy
+ .mypy_cache/
+ .dmypy.json
+ dmypy.json
+
+ # Pyre type checker
+ .pyre/
ControlNetInpaint/ControlNet-with-Inpaint-Demo-colab.ipynb ADDED
The diff for this file is too large to render. See raw diff
 
ControlNetInpaint/ControlNet-with-Inpaint-Demo.ipynb ADDED
@@ -0,0 +1,1130 @@
+ {
+ "cells": [
+ {
+ "cell_type": "markdown",
+ "id": "43976805",
+ "metadata": {},
+ "source": [
+ "# Inpainting with ControlNet\n",
+ "This notebook contains examples of using a new `StableDiffusionControlNetInpaintPipeline`.\n",
+ "\n",
+ "The two main parameters you can play with are the strength of text guidance and image guidance:\n",
+ "* Text guidance (`guidance_scale`) is set to `7.5` by default, and usually this value works quite well.\n",
+ "* Image guidance (`controlnet_conditioning_scale`) is set to `0.4` by default. This value is a good starting point, but can be lowered if there is a big misalignment between the text prompt and the control image (meaning that it is very hard to \"imagine\" an output image that both satisfies the text prompt and aligns with the control image).\n",
+ "\n",
+ "The naming of these parameters follows the other pipelines `StableDiffusionInpaintPipeline` and `StableDiffusionControlNetPipeline`, and the same convention has been preserved for consistency."
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "33c2f672",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "from diffusers import StableDiffusionInpaintPipeline, ControlNetModel, UniPCMultistepScheduler\n",
+ "from src.pipeline_stable_diffusion_controlnet_inpaint import *\n",
+ "from diffusers.utils import load_image\n",
+ "\n",
+ "import cv2\n",
+ "from PIL import Image\n",
+ "import numpy as np\n",
+ "import torch\n",
+ "from matplotlib import pyplot as plt"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "cb869cff",
+ "metadata": {},
+ "source": [
+ "### Baseline: Stable Diffusion 1.5 Inpainting\n",
+ "The StableDiffusion1.5 Inpainting model is used as the core for ControlNet inpainting. For reference, you can also run the same examples on this core model alone:"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "f011126d",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "pipe_sd = StableDiffusionInpaintPipeline.from_pretrained(\n",
+ " \"runwayml/stable-diffusion-inpainting\",\n",
+ " revision=\"fp16\",\n",
+ " torch_dtype=torch.float16,\n",
+ ")\n",
+ "# speed up diffusion process with faster scheduler and memory optimization\n",
+ "pipe_sd.scheduler = UniPCMultistepScheduler.from_config(pipe_sd.scheduler.config)\n",
+ "# remove following line if xformers is not installed\n",
+ "pipe_sd.enable_xformers_memory_efficient_attention()\n",
+ "\n",
+ "pipe_sd.to('cuda')"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "a4d89ea7",
+ "metadata": {},
+ "source": [
+ "### Task\n",
+ "Let's start by turning this dog into a red panda using various types of guidance!\n",
+ "\n",
+ "All we need is an `image`, a `mask`, and a `text_prompt` of **\"a red panda sitting on a bench\"**"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "517add62",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "# download an image\n",
+ "image = load_image(\n",
+ " \"https://raw.githubusercontent.com/CompVis/latent-diffusion/main/data/inpainting_examples/overture-creations-5sI6fQgYIuo.png\"\n",
+ ")\n",
+ "image = np.array(image)\n",
+ "mask_image = load_image(\n",
+ " \"https://raw.githubusercontent.com/CompVis/latent-diffusion/main/data/inpainting_examples/overture-creations-5sI6fQgYIuo_mask.png\"\n",
+ ")\n",
+ "mask_image = np.array(mask_image)\n",
+ "\n",
+ "text_prompt=\"a red panda sitting on a bench\"\n",
+ "\n",
+ "plt.figure(figsize=(12,4))\n",
+ "\n",
+ "plt.subplot(1,2,1)\n",
+ "plt.imshow(image)\n",
+ "plt.axis('off')\n",
+ "plt.title('Input')\n",
+ "plt.subplot(1,2,2)\n",
+ "plt.imshow((255-np.array(image))*(255-np.array(mask_image)))\n",
+ "plt.axis('off')\n",
+ "plt.title('Masked')"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "489f2543",
+ "metadata": {},
+ "source": [
+ "## Canny Edge"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "906b2654",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "# get canny image\n",
+ "canny_image = cv2.Canny(image, 100, 200)\n",
+ "canny_image = canny_image[:, :, None]\n",
+ "canny_image = np.concatenate([canny_image, canny_image, canny_image], axis=2)\n",
+ "\n",
+ "image=Image.fromarray(image)\n",
+ "mask_image=Image.fromarray(mask_image)\n",
+ "canny_image = Image.fromarray(canny_image)\n",
+ "\n",
+ "canny_image"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "41d35b98",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "# load control net and stable diffusion v1-5\n",
+ "controlnet = ControlNetModel.from_pretrained(\"lllyasviel/sd-controlnet-canny\", torch_dtype=torch.float16)\n",
+ "pipe = StableDiffusionControlNetInpaintPipeline.from_pretrained(\n",
+ " \"runwayml/stable-diffusion-inpainting\", controlnet=controlnet, torch_dtype=torch.float16\n",
+ " )\n",
+ "\n",
+ "# speed up diffusion process with faster scheduler and memory optimization\n",
+ "pipe.scheduler = UniPCMultistepScheduler.from_config(pipe.scheduler.config)\n",
+ "# remove following line if xformers is not installed\n",
+ "pipe.enable_xformers_memory_efficient_attention()"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "d6146702",
+ "metadata": {},
+ "source": [
+ "### Scaling image control...\n",
+ "In this example, the `canny_image` input is actually quite hard to satisfy with our text prompt due to a lot of local noise. In this special case, we adjust `controlnet_conditioning_scale` to `0.5` to make this guidance more subtle.\n",
+ "\n",
+ "In all other examples, the default value of `controlnet_conditioning_scale` = `1.0` works rather well!"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "e5069621",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "pipe.to('cuda')\n",
+ "\n",
+ "# generate image\n",
+ "generator = torch.manual_seed(0)\n",
+ "new_image = pipe(\n",
+ " text_prompt,\n",
+ " num_inference_steps=20,\n",
+ " generator=generator,\n",
+ " image=image,\n",
+ " control_image=canny_image,\n",
+ " controlnet_conditioning_scale = 0.5,\n",
+ " mask_image=mask_image\n",
+ ").images[0]\n",
+ "\n",
+ "new_image.save('output/canny_result.png')"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "2f9c6ff6",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "plt.figure(figsize=(12,4))\n",
+ "\n",
+ "plt.subplot(1,4,1)\n",
+ "plt.imshow(image)\n",
+ "plt.axis('off')\n",
+ "plt.title('Input')\n",
+ "plt.subplot(1,4,2)\n",
+ "plt.imshow((255-np.array(image))*(255-np.array(mask_image)))\n",
+ "plt.axis('off')\n",
+ "plt.title('Masked')\n",
+ "plt.subplot(1,4,3)\n",
+ "plt.imshow(canny_image)\n",
+ "plt.axis('off')\n",
+ "plt.title('Condition')\n",
+ "plt.subplot(1,4,4)\n",
+ "plt.imshow(new_image)\n",
+ "plt.title('Output')\n",
+ "plt.axis('off')\n",
+ "\n",
+ "plt.savefig('output/canny_grid.png',\n",
+ " dpi=200,\n",
+ " bbox_inches='tight',\n",
+ " pad_inches=0.0\n",
+ " )"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "87de0502",
+ "metadata": {},
+ "source": [
+ "### Comparison: vanilla inpainting from StableDiffusion1.5"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "f2ef71fe",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "# generate image\n",
+ "generator = torch.manual_seed(0)\n",
+ "new_image = pipe_sd(\n",
+ " text_prompt,\n",
+ " num_inference_steps=20,\n",
+ " generator=generator,\n",
+ " image=image,\n",
+ " mask_image=mask_image\n",
+ ").images[0]\n",
+ "\n",
+ "new_image.save('output/baseline_result.png')"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "e09513c8",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "plt.figure(figsize=(12,4))\n",
+ "\n",
+ "plt.subplot(1,3,1)\n",
+ "plt.imshow(image)\n",
+ "plt.axis('off')\n",
+ "plt.title('Input')\n",
+ "plt.subplot(1,3,2)\n",
+ "plt.imshow((255-np.array(image))*(255-np.array(mask_image)))\n",
+ "plt.axis('off')\n",
+ "plt.title('Masked')\n",
+ "plt.subplot(1,3,3)\n",
+ "plt.imshow(new_image)\n",
+ "plt.title('Output')\n",
+ "plt.axis('off')\n",
+ "\n",
+ "plt.savefig('output/baseline_grid.png',\n",
+ " dpi=200,\n",
+ " bbox_inches='tight',\n",
+ " pad_inches=0.0\n",
+ " )"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "0569600e",
+ "metadata": {},
+ "source": [
+ "## Challenging Examples 🐕➡️🍔\n",
+ "Let's see how tuning the `controlnet_conditioning_scale` works out for a more challenging example of turning the dog into a cheeseburger!\n",
+ "\n",
+ "In this case, we **demand a large semantic leap** and that requires a more subtle guide from the control image!"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "69a352a4",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "difficult_text_prompt=\"a big cheeseburger sitting on a bench\""
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "0803c982",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "# First - StableDiffusion1.5 baseline (no ControlNet)\n",
+ "\n",
+ "# generate image\n",
+ "generator = torch.manual_seed(0)\n",
+ "new_image = pipe_sd(\n",
+ " difficult_text_prompt,\n",
+ " num_inference_steps=20,\n",
+ " generator=generator,\n",
+ " image=image,\n",
+ " mask_image=mask_image\n",
+ ").images[0]\n",
+ "\n",
+ "sd_output=new_image\n",
+ "sd_output"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "319b867e",
+ "metadata": {},
+ "outputs": [],
+ "source": []
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "89dbb557",
+ "metadata": {},
+ "outputs": [],
+ "source": []
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "d6d74fdd",
+ "metadata": {},
+ "outputs": [],
+ "source": []
+ },
+ {
+ "cell_type": "markdown",
+ "id": "bdaa2483",
+ "metadata": {},
+ "source": [
+ "## HED"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "1c5f1ead",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "from controlnet_aux import HEDdetector\n",
+ "\n",
+ "hed = HEDdetector.from_pretrained('lllyasviel/ControlNet')\n",
+ "\n",
+ "hed_image = hed(image)"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "192a9881",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "controlnet = ControlNetModel.from_pretrained(\n",
+ " \"fusing/stable-diffusion-v1-5-controlnet-hed\", torch_dtype=torch.float16\n",
+ ")\n",
+ "pipe = StableDiffusionControlNetInpaintPipeline.from_pretrained(\n",
+ " \"runwayml/stable-diffusion-inpainting\", controlnet=controlnet, torch_dtype=torch.float16\n",
+ " )\n",
+ "\n",
+ "pipe.scheduler = UniPCMultistepScheduler.from_config(pipe.scheduler.config)\n",
+ "\n",
+ "# Remove if you do not have xformers installed\n",
+ "# see https://huggingface.co/docs/diffusers/v0.13.0/en/optimization/xformers#installing-xformers\n",
+ "# for installation instructions\n",
+ "pipe.enable_xformers_memory_efficient_attention()"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "aa054f4e",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "pipe.to('cuda')\n",
+ "\n",
+ "# generate image\n",
+ "generator = torch.manual_seed(0)\n",
+ "new_image = pipe(\n",
+ " text_prompt,\n",
+ " num_inference_steps=20,\n",
+ " generator=generator,\n",
+ " image=image,\n",
+ " control_image=hed_image,\n",
+ " mask_image=mask_image\n",
+ ").images[0]\n",
+ "\n",
+ "new_image.save('output/hed_result.png')"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "cc33ddfa",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "plt.figure(figsize=(12,4))\n",
+ "\n",
+ "plt.subplot(1,4,1)\n",
+ "plt.imshow(image)\n",
+ "plt.axis('off')\n",
+ "plt.title('Input')\n",
+ "plt.subplot(1,4,2)\n",
+ "plt.imshow((255-np.array(image))*(255-np.array(mask_image)))\n",
+ "plt.axis('off')\n",
+ "plt.title('Masked')\n",
+ "plt.subplot(1,4,3)\n",
+ "plt.imshow(hed_image)\n",
+ "plt.axis('off')\n",
+ "plt.title('Condition')\n",
+ "plt.subplot(1,4,4)\n",
+ "plt.imshow(new_image)\n",
+ "plt.title('Output')\n",
+ "plt.axis('off')\n",
+ "\n",
+ "plt.savefig('output/hed_grid.png',\n",
+ " dpi=200,\n",
+ " bbox_inches='tight',\n",
+ " pad_inches=0.0\n",
+ " )"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "1be22a64",
+ "metadata": {},
+ "source": [
+ "### Scribble"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "e4b376bb",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "from controlnet_aux import HEDdetector\n",
+ "\n",
+ "hed = HEDdetector.from_pretrained('lllyasviel/ControlNet')\n",
+ "\n",
+ "scribble_image = hed(image,scribble=True)"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "b0c63b8f",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "controlnet = ControlNetModel.from_pretrained(\n",
+ " \"fusing/stable-diffusion-v1-5-controlnet-scribble\", torch_dtype=torch.float16\n",
+ ")\n",
+ "pipe = StableDiffusionControlNetInpaintPipeline.from_pretrained(\n",
+ " \"runwayml/stable-diffusion-inpainting\", controlnet=controlnet, torch_dtype=torch.float16\n",
+ " )\n",
+ "\n",
+ "pipe.scheduler = UniPCMultistepScheduler.from_config(pipe.scheduler.config)\n",
+ "\n",
+ "# Remove if you do not have xformers installed\n",
+ "# see https://huggingface.co/docs/diffusers/v0.13.0/en/optimization/xformers#installing-xformers\n",
+ "# for installation instructions\n",
+ "pipe.enable_xformers_memory_efficient_attention()\n",
+ "\n",
+ "#pipe.enable_model_cpu_offload()"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "f30189e0",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "pipe.to('cuda')\n",
+ "\n",
+ "# generate image\n",
+ "generator = torch.manual_seed(0)\n",
+ "new_image = pipe(\n",
+ " text_prompt,\n",
+ " num_inference_steps=20,\n",
+ " generator=generator,\n",
+ " image=image,\n",
+ " control_image=scribble_image,\n",
+ " mask_image=mask_image\n",
+ ").images[0]\n",
+ "\n",
+ "new_image.save('output/scribble_result.png')"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "8de59fe6",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "plt.figure(figsize=(12,4))\n",
+ "\n",
+ "plt.subplot(1,4,1)\n",
+ "plt.imshow(image)\n",
+ "plt.axis('off')\n",
+ "plt.title('Input')\n",
+ "plt.subplot(1,4,2)\n",
+ "plt.imshow((255-np.array(image))*(255-np.array(mask_image)))\n",
+ "plt.axis('off')\n",
+ "plt.title('Masked')\n",
+ "plt.subplot(1,4,3)\n",
+ "plt.imshow(scribble_image)\n",
+ "plt.axis('off')\n",
+ "plt.title('Condition')\n",
+ "plt.subplot(1,4,4)\n",
+ "plt.imshow(new_image)\n",
+ "plt.title('Output')\n",
+ "plt.axis('off')\n",
+ "\n",
+ "plt.savefig('output/scribble_grid.png',\n",
+ " dpi=200,\n",
+ " bbox_inches='tight',\n",
+ " pad_inches=0.0\n",
+ " )"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "e30c6ce2",
+ "metadata": {},
+ "source": [
+ "### Depth"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "f681c4d6",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "from transformers import pipeline\n",
+ "\n",
+ "depth_estimator = pipeline('depth-estimation')\n",
+ "\n",
+ "depth_image = depth_estimator(image)['depth']\n",
+ "depth_image = np.array(depth_image)\n",
+ "depth_image = depth_image[:, :, None]\n",
+ "depth_image = np.concatenate(3*[depth_image], axis=2)\n",
+ "depth_image = Image.fromarray(depth_image)"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "3f8fdcf5",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "controlnet = ControlNetModel.from_pretrained(\n",
+ " \"fusing/stable-diffusion-v1-5-controlnet-depth\", torch_dtype=torch.float16\n",
+ ")\n",
+ "pipe = StableDiffusionControlNetInpaintPipeline.from_pretrained(\n",
+ " \"runwayml/stable-diffusion-inpainting\", controlnet=controlnet, torch_dtype=torch.float16\n",
+ " )\n",
+ "\n",
+ "pipe.scheduler = UniPCMultistepScheduler.from_config(pipe.scheduler.config)\n",
+ "\n",
+ "# Remove if you do not have xformers installed\n",
+ "# see https://huggingface.co/docs/diffusers/v0.13.0/en/optimization/xformers#installing-xformers\n",
+ "# for installation instructions\n",
+ "pipe.enable_xformers_memory_efficient_attention()"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "58ab718d",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "pipe.to('cuda')\n",
+ "\n",
+ "# generate image\n",
+ "generator = torch.manual_seed(0)\n",
+ "new_image = pipe(\n",
+ " text_prompt,\n",
+ " num_inference_steps=20,\n",
+ " generator=generator,\n",
+ " image=image,\n",
+ " control_image=depth_image,\n",
+ " mask_image=mask_image\n",
+ ").images[0]\n",
+ "\n",
+ "new_image.save('output/depth_result.png')"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "82ac435e",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "plt.figure(figsize=(12,4))\n",
+ "\n",
+ "plt.subplot(1,4,1)\n",
+ "plt.imshow(image)\n",
+ "plt.axis('off')\n",
+ "plt.title('Input')\n",
+ "plt.subplot(1,4,2)\n",
+ "plt.imshow((255-np.array(image))*(255-np.array(mask_image)))\n",
+ "plt.axis('off')\n",
+ "plt.title('Masked')\n",
+ "plt.subplot(1,4,3)\n",
+ "plt.imshow(depth_image)\n",
+ "plt.axis('off')\n",
+ "plt.title('Condition')\n",
+ "plt.subplot(1,4,4)\n",
+ "plt.imshow(new_image)\n",
+ "plt.title('Output')\n",
+ "plt.axis('off')\n",
+ "\n",
+ "plt.savefig('output/depth_grid.png',\n",
+ " dpi=200,\n",
+ " bbox_inches='tight',\n",
+ " pad_inches=0.0\n",
+ " )"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "93db13cb",
+ "metadata": {},
+ "source": [
+ "### Normal Map"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "08ffd6da",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "import cv2\n",
+ "\n",
+ "depth_estimator = pipeline(\"depth-estimation\", model =\"Intel/dpt-hybrid-midas\" )\n",
+ "\n",
+ "normal_image = depth_estimator(image)['predicted_depth'][0]\n",
+ "\n",
+ "normal_image = normal_image.numpy()\n",
+ "\n",
+ "image_depth = normal_image.copy()\n",
+ "image_depth -= np.min(image_depth)\n",
+ "image_depth /= np.max(image_depth)\n",
+ "\n",
+ "bg_threhold = 0.4\n",
+ "\n",
+ "x = cv2.Sobel(normal_image, cv2.CV_32F, 1, 0, ksize=3)\n",
+ "x[image_depth < bg_threhold] = 0\n",
+ "\n",
+ "y = cv2.Sobel(normal_image, cv2.CV_32F, 0, 1, ksize=3)\n",
+ "y[image_depth < bg_threhold] = 0\n",
+ "\n",
+ "z = np.ones_like(x) * np.pi * 2.0\n",
+ "\n",
+ "normal_image = np.stack([x, y, z], axis=2)\n",
+ "normal_image /= np.sum(normal_image ** 2.0, axis=2, keepdims=True) ** 0.5\n",
+ "normal_image = (normal_image * 127.5 + 127.5).clip(0, 255).astype(np.uint8)\n",
+ "normal_image = Image.fromarray(normal_image).resize((512,512))"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "c41bd52b",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "controlnet = ControlNetModel.from_pretrained(\n",
+ " \"fusing/stable-diffusion-v1-5-controlnet-normal\", torch_dtype=torch.float16\n",
+ ")\n",
+ "pipe = StableDiffusionControlNetInpaintPipeline.from_pretrained(\n",
+ " \"runwayml/stable-diffusion-inpainting\", controlnet=controlnet, torch_dtype=torch.float16\n",
+ " )\n",
+ "\n",
+ "pipe.scheduler = UniPCMultistepScheduler.from_config(pipe.scheduler.config)\n",
+ "\n",
+ "# Remove if you do not have xformers installed\n",
+ "# see https://huggingface.co/docs/diffusers/v0.13.0/en/optimization/xformers#installing-xformers\n",
+ "# for installation instructions\n",
+ "pipe.enable_xformers_memory_efficient_attention()"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "c8b5a39e",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "pipe.to('cuda')\n",
+ "\n",
+ "# generate image\n",
+ "generator = torch.manual_seed(0)\n",
+ "new_image = pipe(\n",
+ " text_prompt,\n",
+ " num_inference_steps=20,\n",
+ " generator=generator,\n",
+ " image=image,\n",
+ " control_image=normal_image,\n",
+ " mask_image=mask_image\n",
+ ").images[0]\n",
+ "\n",
+ "new_image.save('output/normal_result.png')"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "2737d23f",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "plt.figure(figsize=(12,4))\n",
+ "\n",
+ "plt.subplot(1,4,1)\n",
+ "plt.imshow(image)\n",
+ "plt.axis('off')\n",
+ "plt.title('Input')\n",
+ "plt.subplot(1,4,2)\n",
+ "plt.imshow((255-np.array(image))*(255-np.array(mask_image)))\n",
+ "plt.axis('off')\n",
+ "plt.title('Masked')\n",
+ "plt.subplot(1,4,3)\n",
+ "plt.imshow(normal_image)\n",
+ "plt.axis('off')\n",
+ "plt.title('Condition')\n",
+ "plt.subplot(1,4,4)\n",
+ "plt.imshow(new_image)\n",
+ "plt.title('Output')\n",
+ "plt.axis('off')\n",
+ "\n",
+ "plt.savefig('output/normal_grid.png',\n",
+ " dpi=200,\n",
+ " bbox_inches='tight',\n",
+ " pad_inches=0.0\n",
+ " )"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "04683be6",
+ "metadata": {},
+ "source": [
+ "### More control input types\n",
+ "For these control input types we will use a different image, since an image of the dog on the bench is not appropriate in these cases!\n",
+ "\n",
+ "Let's start with a room photo..."
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "2d5c7d55",
+ "metadata": {},
+ "source": [
+ "### M-LSD"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "9d2e3a7b",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "from controlnet_aux import MLSDdetector\n",
+ "\n",
+ "mlsd = MLSDdetector.from_pretrained('lllyasviel/ControlNet')\n",
+ "\n",
+ "room_image = load_image(\"https://huggingface.co/lllyasviel/sd-controlnet-mlsd/resolve/main/images/room.png\")\n",
+ "\n",
+ "mlsd_image = mlsd(room_image).resize(room_image.size)"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "45629903",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "room_mask=np.zeros_like(np.array(room_image))\n",
+ "room_mask[120:420,220:,:]=255\n",
+ "room_mask=Image.fromarray(room_mask)\n",
+ "\n",
+ "\n",
+ "room_mask=room_mask.resize((512,512))\n",
+ "mlsd_image=mlsd_image.resize((512,512))\n",
+ "room_image=room_image.resize((512,512))"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "e491ab22",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "controlnet = ControlNetModel.from_pretrained(\n",
+ " \"fusing/stable-diffusion-v1-5-controlnet-mlsd\", torch_dtype=torch.float16\n",
+ ")\n",
+ "pipe = StableDiffusionControlNetInpaintPipeline.from_pretrained(\n",
+ " \"runwayml/stable-diffusion-inpainting\", controlnet=controlnet, torch_dtype=torch.float16\n",
+ " )\n",
+ "\n",
+ "pipe.scheduler = UniPCMultistepScheduler.from_config(pipe.scheduler.config)\n",
+ "\n",
+ "# Remove if you do not have xformers installed\n",
+ "# see https://huggingface.co/docs/diffusers/v0.13.0/en/optimization/xformers#installing-xformers\n",
+ "# for installation instructions\n",
+ "pipe.enable_xformers_memory_efficient_attention()"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "b414f354",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "pipe.to('cuda')\n",
+ "\n",
+ "# generate image\n",
+ "generator = torch.manual_seed(0)\n",
+ "new_image = pipe(\n",
+ " \"an image of a room with a city skyline view\",\n",
+ " num_inference_steps=20,\n",
+ " generator=generator,\n",
+ " image=room_image,\n",
+ " control_image=mlsd_image,\n",
+ " mask_image=room_mask\n",
+ ").images[0]\n",
+ "\n",
+ "new_image.save('output/mlsd_result.png')"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "326145e1",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "plt.figure(figsize=(12,4))\n",
+ "\n",
+ "plt.subplot(1,4,1)\n",
+ "plt.imshow(room_image)\n",
+ "plt.axis('off')\n",
+ "plt.title('Input')\n",
+ "plt.subplot(1,4,2)\n",
+ "plt.imshow((255-np.array(room_image))*(255-np.array(room_mask)))\n",
+ "plt.axis('off')\n",
+ "plt.title('Masked')\n",
+ "plt.subplot(1,4,3)\n",
+ "plt.imshow(mlsd_image)\n",
+ "plt.axis('off')\n",
+ "plt.title('Condition')\n",
+ "plt.subplot(1,4,4)\n",
+ "plt.imshow(new_image)\n",
+ "plt.title('Output')\n",
+ "plt.axis('off')\n",
+ "\n",
+ "plt.savefig('output/mlsd_grid.png',\n",
+ " dpi=200,\n",
+ " bbox_inches='tight',\n",
+ " pad_inches=0.0\n",
+ " )"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "1f68f30b",
+ "metadata": {},
+ "source": [
+ "### OpenPose"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "bbf9b00b",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "controlnet = ControlNetModel.from_pretrained(\n",
+ " \"fusing/stable-diffusion-v1-5-controlnet-openpose\", torch_dtype=torch.float16\n",
+ ")\n",
+ "pipe = StableDiffusionControlNetInpaintPipeline.from_pretrained(\n",
+ " \"runwayml/stable-diffusion-inpainting\", controlnet=controlnet, torch_dtype=torch.float16\n",
924
+ " )\n",
925
+ "\n",
926
+ "pipe.scheduler = UniPCMultistepScheduler.from_config(pipe.scheduler.config)\n",
927
+ "\n",
928
+ "# Remove if you do not have xformers installed\n",
929
+ "# see https://huggingface.co/docs/diffusers/v0.13.0/en/optimization/xformers#installing-xformers\n",
930
+ "# for installation instructions\n",
931
+ "pipe.enable_xformers_memory_efficient_attention()"
932
+ ]
933
+ },
934
+ {
935
+ "cell_type": "code",
936
+ "execution_count": null,
937
+ "id": "e819d17c",
938
+ "metadata": {},
939
+ "outputs": [],
940
+ "source": [
941
+ "from controlnet_aux import OpenposeDetector\n",
942
+ "\n",
943
+ "openpose = OpenposeDetector.from_pretrained('lllyasviel/ControlNet')\n",
944
+ "\n",
945
+ "pose_real_image = load_image(\"https://huggingface.co/lllyasviel/sd-controlnet-openpose/resolve/main/images/pose.png\")\n",
946
+ "\n",
947
+ "pose_image = openpose(pose_real_image)\n",
948
+ "pose_real_image=pose_real_image.resize(pose_image.size)\n",
949
+ "\n",
950
+ "pose_mask=np.zeros_like(np.array(pose_image))\n",
951
+ "pose_mask[250:700,:,:]=255\n",
952
+ "pose_mask=Image.fromarray(pose_mask)"
953
+ ]
954
+ },
955
+ {
956
+ "cell_type": "code",
957
+ "execution_count": null,
958
+ "id": "2b6faf93",
959
+ "metadata": {},
960
+ "outputs": [],
961
+ "source": [
962
+ "pipe.to('cuda')\n",
963
+ "\n",
964
+ "# generate image\n",
965
+ "generator = torch.manual_seed(0)\n",
966
+ "new_image = pipe(\n",
967
+ " \"a man in a knight armor\",\n",
968
+ " num_inference_steps=20,\n",
969
+ " generator=generator,\n",
970
+ " image=pose_real_image,\n",
971
+ " control_image=pose_image,\n",
972
+ " mask_image=pose_mask\n",
973
+ ").images[0]\n",
974
+ "\n",
975
+ "new_image.save('output/openpose_result.png')"
976
+ ]
977
+ },
978
+ {
979
+ "cell_type": "code",
980
+ "execution_count": null,
981
+ "id": "a665a931",
982
+ "metadata": {},
983
+ "outputs": [],
984
+ "source": [
985
+ "plt.figure(figsize=(12,4))\n",
986
+ "\n",
987
+ "plt.subplot(1,4,1)\n",
988
+ "plt.imshow(pose_real_image)\n",
989
+ "plt.axis('off')\n",
990
+ "plt.title('Input')\n",
991
+ "plt.subplot(1,4,2)\n",
992
+ "plt.imshow((255-np.array(pose_real_image))*(255-np.array(pose_mask)))\n",
993
+ "plt.axis('off')\n",
994
+ "plt.title('Masked')\n",
995
+ "plt.subplot(1,4,3)\n",
996
+ "plt.imshow(pose_image)\n",
997
+ "plt.axis('off')\n",
998
+ "plt.title('Condition')\n",
999
+ "plt.subplot(1,4,4)\n",
1000
+ "plt.imshow(new_image)\n",
1001
+ "plt.title('Output')\n",
1002
+ "plt.axis('off')\n",
1003
+ "\n",
1004
+ "\n",
1005
+ "plt.savefig('output/openpose_grid.png',\n",
1006
+ " dpi=200,\n",
1007
+ " bbox_inches='tight',\n",
1008
+ " pad_inches=0.0\n",
1009
+ " )"
1010
+ ]
1011
+ },
1012
+ {
1013
+ "cell_type": "markdown",
1014
+ "id": "b982380d",
1015
+ "metadata": {},
1016
+ "source": [
1017
+ "### Segmentation Mask"
1018
+ ]
1019
+ },
1020
+ {
1021
+ "cell_type": "code",
1022
+ "execution_count": null,
1023
+ "id": "f667b04a",
1024
+ "metadata": {},
1025
+ "outputs": [],
1026
+ "source": [
1027
+ "controlnet = ControlNetModel.from_pretrained(\n",
1028
+ " \"fusing/stable-diffusion-v1-5-controlnet-seg\", torch_dtype=torch.float16\n",
1029
+ ")\n",
1030
+ "pipe = StableDiffusionControlNetInpaintPipeline.from_pretrained(\n",
1031
+ " \"runwayml/stable-diffusion-inpainting\", controlnet=controlnet, torch_dtype=torch.float16\n",
1032
+ " )\n",
1033
+ "\n",
1034
+ "pipe.scheduler = UniPCMultistepScheduler.from_config(pipe.scheduler.config)\n",
1035
+ "\n",
1036
+ "# Remove if you do not have xformers installed\n",
1037
+ "# see https://huggingface.co/docs/diffusers/v0.13.0/en/optimization/xformers#installing-xformers\n",
1038
+ "# for installation instructions\n",
1039
+ "pipe.enable_xformers_memory_efficient_attention()"
1040
+ ]
1041
+ },
1042
+ {
1043
+ "cell_type": "code",
1044
+ "execution_count": null,
1045
+ "id": "cb27c72b",
1046
+ "metadata": {},
1047
+ "outputs": [],
1048
+ "source": [
1049
+ "house_real_image=load_image(\"https://huggingface.co/lllyasviel/sd-controlnet-seg/resolve/main/images/house.png\")\n",
1050
+ "seg_image=load_image(\"https://huggingface.co/lllyasviel/sd-controlnet-seg/resolve/main/images/house_seg.png\")\n",
1051
+ "\n",
1052
+ "house_mask=np.zeros((*seg_image.size,3),dtype='uint8')\n",
1053
+ "house_mask[50:400,-350:,:]=255\n",
1054
+ "house_mask=Image.fromarray(house_mask)"
1055
+ ]
1056
+ },
1057
+ {
1058
+ "cell_type": "code",
1059
+ "execution_count": null,
1060
+ "id": "1f81e50b",
1061
+ "metadata": {},
1062
+ "outputs": [],
1063
+ "source": [
1064
+ "pipe.to('cuda')\n",
1065
+ "\n",
1066
+ "# generate image\n",
1067
+ "generator = torch.manual_seed(0)\n",
1068
+ "new_image = pipe(\n",
1069
+ " \"a pink eerie scary house\",\n",
1070
+ " num_inference_steps=20,\n",
1071
+ " generator=generator,\n",
1072
+ " image=house_real_image,\n",
1073
+ " control_image=seg_image,\n",
1074
+ " mask_image=house_mask\n",
1075
+ ").images[0]\n",
1076
+ "\n",
1077
+ "new_image.save('output/seg_result.png')"
1078
+ ]
1079
+ },
1080
+ {
1081
+ "cell_type": "code",
1082
+ "execution_count": null,
1083
+ "id": "37c0d695",
1084
+ "metadata": {},
1085
+ "outputs": [],
1086
+ "source": [
1087
+ "plt.figure(figsize=(12,4))\n",
1088
+ "\n",
1089
+ "plt.subplot(1,4,1)\n",
1090
+ "plt.imshow(house_real_image)\n",
1091
+ "plt.axis('off')\n",
1092
+ "plt.title('Input')\n",
1093
+ "plt.subplot(1,4,2)\n",
1094
+ "plt.imshow((255-np.array(house_real_image))*(255-np.array(house_mask)))\n",
1095
+ "plt.axis('off')\n",
1096
+ "plt.title('Masked')\n",
1097
+ "plt.subplot(1,4,3)\n",
1098
+ "plt.imshow(seg_image)\n",
1099
+ "plt.axis('off')\n",
1100
+ "plt.title('Condition')\n",
1101
+ "plt.subplot(1,4,4)\n",
1102
+ "plt.imshow(new_image)\n",
1103
+ "plt.title('Output')\n",
1104
+ "plt.axis('off')\n",
1105
+ "\n",
1106
+ "plt.savefig('output/seg_grid.png',\n",
1107
+ " dpi=200,\n",
1108
+ " bbox_inches='tight',\n",
1109
+ " pad_inches=0.0\n",
1110
+ " )"
1111
+ ]
1112
+ },
1113
+ {
1114
+ "cell_type": "code",
1115
+ "execution_count": null,
1116
+ "id": "7b8346f7",
1117
+ "metadata": {},
1118
+ "outputs": [],
1119
+ "source": []
1120
+ }
1121
+ ],
1122
+ "metadata": {
1123
+ "language_info": {
1124
+ "name": "python",
1125
+ "pygments_lexer": "ipython3"
1126
+ }
1127
+ },
1128
+ "nbformat": 4,
1129
+ "nbformat_minor": 5
1130
+ }
ControlNetInpaint/LICENSE ADDED
@@ -0,0 +1,21 @@
+ MIT License
+
+ Copyright (c) 2023 mikonvergence
+
+ Permission is hereby granted, free of charge, to any person obtaining a copy
+ of this software and associated documentation files (the "Software"), to deal
+ in the Software without restriction, including without limitation the rights
+ to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ copies of the Software, and to permit persons to whom the Software is
+ furnished to do so, subject to the following conditions:
+
+ The above copyright notice and this permission notice shall be included in all
+ copies or substantial portions of the Software.
+
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ SOFTWARE.
ControlNetInpaint/README.md ADDED
@@ -0,0 +1,111 @@
+ # :recycle: ControlNetInpaint
+ [![Open In Collab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/mikonvergence/ControlNetInpaint/blob/main/ControlNet-with-Inpaint-Demo-colab.ipynb)
+
+ [ControlNet](https://github.com/lllyasviel/ControlNet) has proven to be a great tool for guiding StableDiffusion models with image-based hints! But what about **changing only a part of the image** based on that hint?
+
+ :crystal_ball: The initial set of ControlNet models was not trained to work with the StableDiffusion inpainting backbone, but it turns out that the results can be pretty good!
+
+ In this repository, you will find a basic example notebook that shows how this can work. **The key trick is to use the right value of the parameter** `controlnet_conditioning_scale`: while a value of `1.0` often works well, it is sometimes beneficial to bring it down a bit when the controlling image does not fit the selected text prompt very well.
+
+ ## Demos on 🤗 HuggingFace Using ControlNetInpaint
+ ### :pencil2: Mask and Sketch
+ Check out the [HuggingFace Space](https://huggingface.co/spaces/mikonvergence/mask-and-sketch), which allows you to scribble and describe how you want to recreate a part of an image:
+ [<img width="1518" alt="Screenshot 2023-04-16 at 11 56 29" src="https://user-images.githubusercontent.com/13435425/232302552-123744ba-4953-4972-9df8-ab19ee7b599b.png">](https://huggingface.co/spaces/mikonvergence/mask-and-sketch)
+
+ ### :performing_arts: theaTRON
+ Check out the [HuggingFace Space](https://huggingface.co/spaces/mikonvergence/theaTRON) that reimagines scenes with human subjects using a text prompt:
+ [<img width="1518" alt="theaTRON tool examples" src="https://huggingface.co/spaces/mikonvergence/theaTRON/resolve/main/data/image-only.png">](https://huggingface.co/spaces/mikonvergence/theaTRON)
+
+ ## Code Usage
+ > This code is currently compatible with `diffusers==0.14.0`. An upgrade to the latest version can be expected in the near future (currently, `0.15.0` contains some breaking changes that should ideally be fixed on the side of the diffusers interface).
+
+ Here's an example of how this new pipeline (`StableDiffusionControlNetInpaintPipeline`) is used with the core backbone of `"runwayml/stable-diffusion-inpainting"`:
+ ```python
+ # load control net and stable diffusion v1-5
+ controlnet = ControlNetModel.from_pretrained("lllyasviel/sd-controlnet-canny", torch_dtype=torch.float16)
+ pipe = StableDiffusionControlNetInpaintPipeline.from_pretrained(
+     "runwayml/stable-diffusion-inpainting", controlnet=controlnet, torch_dtype=torch.float16
+ )
+
+ # speed up diffusion process with faster scheduler and memory optimization
+ pipe.scheduler = UniPCMultistepScheduler.from_config(pipe.scheduler.config)
+ # remove following line if xformers is not installed
+ pipe.enable_xformers_memory_efficient_attention()
+
+ pipe.to('cuda')
+
+ # generate image
+ generator = torch.manual_seed(0)
+ new_image = pipe(
+     text_prompt,
+     num_inference_steps=20,
+     generator=generator,
+     image=image,
+     control_image=canny_image,
+     mask_image=mask_image
+ ).images[0]
+ ```
+ (A full example of how to get the images and run the pipeline is available in the notebook!)
+
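As a minimal illustration of the mask convention used by the pipeline (white pixels mark the region to repaint, black pixels are preserved), a binary mask can be built with plain NumPy. The sizes and rectangle coordinates below are just an example, not values required by the pipeline:

```python
import numpy as np

# Hypothetical 512x512 RGB mask: the rectangle [120:420, 220:] is
# marked for inpainting (255 = repaint, 0 = keep).
height, width = 512, 512
mask = np.zeros((height, width, 3), dtype=np.uint8)
mask[120:420, 220:, :] = 255

# Fraction of pixels that will be repainted
white_fraction = float((mask[..., 0] == 255).mean())
print(round(white_fraction, 3))  # → 0.334
```

The resulting array can be wrapped with `PIL.Image.fromarray` and passed as `mask_image`.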
+ ## Results
+ All results below have been generated using the `ControlNet-with-Inpaint-Demo.ipynb` notebook.
+
+ Let's start with turning a dog into a red panda!
+ ### Canny Edge
+ **Prompt**: *"a red panda sitting on a bench"*
+
+ ![Canny Result](output/canny_grid.png)
+
+ ### HED
+ **Prompt**: *"a red panda sitting on a bench"*
+
+ ![HED Result](output/hed_grid.png)
+
+ ### Scribble
+ **Prompt**: *"a red panda sitting on a bench"*
+
+ ![Scribble Result](output/scribble_grid.png)
+
+ ### Depth
+ **Prompt**: *"a red panda sitting on a bench"*
+
+ ![Depth Result](output/depth_grid.png)
+
+ ### Normal
+ **Prompt**: *"a red panda sitting on a bench"*
+
+ ![Normal Result](output/normal_grid.png)
+
+ For the remaining modalities, the panda example doesn't make much sense, so we use different images and prompts to illustrate each capability!
+
+ ### M-LSD
+ **Prompt**: *"an image of a room with a city skyline view"*
+
+ ![MLSD Result](output/mlsd_grid.png)
+
+ ### OpenPose
+ **Prompt**: *"a man in a knight armor"*
+
+ ![OpenPose Result](output/openpose_grid.png)
+
+ ### Segmentation Mask
+ **Prompt**: *"a pink eerie scary house"*
+
+ ![Segmentation Result](output/seg_grid.png)
+
+ ## Challenging Example 🐕➡️🍔
+ Let's see how tuning `controlnet_conditioning_scale` works out for a more challenging example of turning the dog into a cheeseburger!
+
+ In this case, we **demand a large semantic leap**, and that requires a more subtle guide from the control image!
+
+ ![Cheeseburger Result](output/canny_cheeseburger_grid.png)
+
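One way to explore this trade-off is to sweep `controlnet_conditioning_scale` over a few values with a fixed seed. The sketch below assumes `pipe`, `image`, `canny_image` and `mask_image` are set up as in the Code Usage section; the actual `pipe(...)` call (and its output path) is hypothetical and left commented out:

```python
# Sweep a few guidance strengths: 1.0 keeps the dog's edges almost
# intact, while lower values give the model more freedom to invent
# new structure inside the mask.
scales = [1.0, 0.7, 0.4]

for scale in scales:
    out_path = f"output/cheeseburger_scale_{scale}.png"  # hypothetical path
    print(scale, out_path)
    # generator = torch.manual_seed(0)  # same seed for a fair comparison
    # pipe(
    #     "a delicious cheeseburger",
    #     num_inference_steps=20,
    #     generator=generator,
    #     image=image,
    #     control_image=canny_image,
    #     mask_image=mask_image,
    #     controlnet_conditioning_scale=scale,
    # ).images[0].save(out_path)
```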
+ ### :fast_forward: DiffusionFastForward: learn diffusion from the ground up! 🎻
+ If you want to learn more about the process of denoising diffusion for images, check out the **open-source course** [DiffusionFastForward](https://github.com/mikonvergence/DiffusionFastForward), with colab notebooks where networks are trained from scratch on high-resolution data! :beginner:
+
+ [![Logo](https://user-images.githubusercontent.com/13435425/222425743-213279f9-d0a1-413c-a16a-2c88b512f827.png)](https://github.com/mikonvergence/DiffusionFastForward)
+
+ ### Acknowledgement
+ There is a related excellent repository, [ControlNet-for-Any-Basemodel](https://github.com/haofanwang/ControlNet-for-Diffusers), which, among many other things, also shows similar examples of using ControlNet for inpainting. However, that pipeline is defined quite differently and, most importantly, does not allow for controlling `controlnet_conditioning_scale` as an input argument.
+
+ There are other differences, such as the fact that this implementation only needs to instantiate one pipeline (as opposed to two in the other repository), but **the key motivation for publishing this repository is to provide a space solely focused on the application of ControlNet for inpainting.**
ControlNetInpaint/output/baseline_grid.png ADDED
Git LFS Details
  • SHA256: 11ebb2475096f74e87cae92bf87379d3917d1391be0054bd04410c69b2f05658
  • Pointer size: 132 Bytes
  • Size of remote file: 1.03 MB
ControlNetInpaint/output/baseline_result.png ADDED
ControlNetInpaint/output/canny_cheeseburger.png ADDED
Git LFS Details
  • SHA256: ad0894cf88bd7f24abfc0509570638aa349d85700673a260493c8184b2c7073a
  • Pointer size: 132 Bytes
  • Size of remote file: 1.31 MB
ControlNetInpaint/output/canny_cheeseburger_grid.png ADDED
Git LFS Details
  • SHA256: 70b7be97ade344bf207512592d44c0c9ed4e3607ce1e69e7f9969bbe2fdeedbe
  • Pointer size: 132 Bytes
  • Size of remote file: 3.65 MB
ControlNetInpaint/output/canny_grid.png ADDED
ControlNetInpaint/output/canny_result.png ADDED
ControlNetInpaint/output/depth_grid.png ADDED
ControlNetInpaint/output/depth_result.png ADDED
ControlNetInpaint/output/hed_grid.png ADDED
ControlNetInpaint/output/hed_result.png ADDED
ControlNetInpaint/output/mlsd_grid.png ADDED
ControlNetInpaint/output/mlsd_result.png ADDED
ControlNetInpaint/output/normal_grid.png ADDED
ControlNetInpaint/output/normal_result.png ADDED
ControlNetInpaint/output/openpose_grid.png ADDED
ControlNetInpaint/output/openpose_result.png ADDED
ControlNetInpaint/output/scribble_grid.png ADDED
ControlNetInpaint/output/scribble_result.png ADDED
ControlNetInpaint/output/seg_grid.png ADDED
ControlNetInpaint/output/seg_result.png ADDED
ControlNetInpaint/setup.py ADDED
@@ -0,0 +1,25 @@
+ from setuptools import setup
+
+ setup(
+     name='controlnetinpaint',
+     version='0.1',
+     description='ControlNet Inpainting with StableDiffusion',
+     url='https://github.com/mikonvergence/ControlNetInpaint',
+     author='Mikolaj Czerkawski',
+     author_email="[email protected]",
+     package_dir={"controlnetinpaint": "src"},
+     install_requires=[
+         "torch>=1.10.0",
+         "torchvision",
+         "numpy",
+         "tqdm",
+         "pillow",
+         "diffusers==0.14.0",
+         "xformers",
+         "transformers",
+         "scipy",
+         "ftfy",
+         "accelerate",
+         "controlnet_aux"
+     ],
+ )
ControlNetInpaint/src/pipeline_stable_diffusion_controlnet_inpaint.py ADDED
@@ -0,0 +1,521 @@
+ # Copyright 2023 The HuggingFace Team. All rights reserved.
+ #
+ # Licensed under the Apache License, Version 2.0 (the "License");
+ # you may not use this file except in compliance with the License.
+ # You may obtain a copy of the License at
+ #
+ #     http://www.apache.org/licenses/LICENSE-2.0
+ #
+ # Unless required by applicable law or agreed to in writing, software
+ # distributed under the License is distributed on an "AS IS" BASIS,
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ # See the License for the specific language governing permissions and
+ # limitations under the License.
+
+ import torch
+ import PIL.Image
+ import numpy as np
+
+ from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion_controlnet import *
+
+ EXAMPLE_DOC_STRING = """
+     Examples:
+         ```py
+         >>> # !pip install opencv-python transformers accelerate
+         >>> from diffusers import StableDiffusionControlNetInpaintPipeline, ControlNetModel, UniPCMultistepScheduler
+         >>> from diffusers.utils import load_image
+         >>> import numpy as np
+         >>> import torch
+
+         >>> import cv2
+         >>> from PIL import Image
+         >>> # download an image
+         >>> image = load_image(
+         ...     "https://raw.githubusercontent.com/CompVis/latent-diffusion/main/data/inpainting_examples/overture-creations-5sI6fQgYIuo.png"
+         ... )
+         >>> image = np.array(image)
+         >>> mask_image = load_image(
+         ...     "https://raw.githubusercontent.com/CompVis/latent-diffusion/main/data/inpainting_examples/overture-creations-5sI6fQgYIuo_mask.png"
+         ... )
+         >>> mask_image = np.array(mask_image)
+         >>> # get canny image
+         >>> canny_image = cv2.Canny(image, 100, 200)
+         >>> canny_image = canny_image[:, :, None]
+         >>> canny_image = np.concatenate([canny_image, canny_image, canny_image], axis=2)
+         >>> canny_image = Image.fromarray(canny_image)
+
+         >>> # load control net and stable diffusion v1-5
+         >>> controlnet = ControlNetModel.from_pretrained("lllyasviel/sd-controlnet-canny", torch_dtype=torch.float16)
+         >>> pipe = StableDiffusionControlNetInpaintPipeline.from_pretrained(
+         ...     "runwayml/stable-diffusion-inpainting", controlnet=controlnet, torch_dtype=torch.float16
+         ... )
+
+         >>> # speed up diffusion process with faster scheduler and memory optimization
+         >>> pipe.scheduler = UniPCMultistepScheduler.from_config(pipe.scheduler.config)
+         >>> # remove following line if xformers is not installed
+         >>> pipe.enable_xformers_memory_efficient_attention()
+
+         >>> pipe.enable_model_cpu_offload()
+
+         >>> # generate image
+         >>> generator = torch.manual_seed(0)
+         >>> image = pipe(
+         ...     "futuristic-looking doggo",
+         ...     num_inference_steps=20,
+         ...     generator=generator,
+         ...     image=image,
+         ...     control_image=canny_image,
+         ...     mask_image=mask_image
+         ... ).images[0]
+         ```
+ """
+
+
+ def prepare_mask_and_masked_image(image, mask):
+     """
+     Prepares a pair (image, mask) to be consumed by the Stable Diffusion pipeline. This means that those inputs will be
+     converted to ``torch.Tensor`` with shapes ``batch x channels x height x width`` where ``channels`` is ``3`` for the
+     ``image`` and ``1`` for the ``mask``.
+     The ``image`` will be converted to ``torch.float32`` and normalized to be in ``[-1, 1]``. The ``mask`` will be
+     binarized (``mask > 0.5``) and cast to ``torch.float32`` too.
+     Args:
+         image (Union[np.array, PIL.Image, torch.Tensor]): The image to inpaint.
+             It can be a ``PIL.Image``, or a ``height x width x 3`` ``np.array`` or a ``channels x height x width``
+             ``torch.Tensor`` or a ``batch x channels x height x width`` ``torch.Tensor``.
+         mask (_type_): The mask to apply to the image, i.e. regions to inpaint.
+             It can be a ``PIL.Image``, or a ``height x width`` ``np.array`` or a ``1 x height x width``
+             ``torch.Tensor`` or a ``batch x 1 x height x width`` ``torch.Tensor``.
+     Raises:
+         ValueError: ``torch.Tensor`` images should be in the ``[-1, 1]`` range. ValueError: ``torch.Tensor`` mask
+             should be in the ``[0, 1]`` range. ValueError: ``mask`` and ``image`` should have the same spatial dimensions.
+         TypeError: ``mask`` is a ``torch.Tensor`` but ``image`` is not
+             (or the other way around).
+     Returns:
+         tuple[torch.Tensor]: The pair (mask, masked_image) as ``torch.Tensor`` with 4
+         dimensions: ``batch x channels x height x width``.
+     """
+     if isinstance(image, torch.Tensor):
+         if not isinstance(mask, torch.Tensor):
+             raise TypeError(f"`image` is a torch.Tensor but `mask` (type: {type(mask)} is not")
+
+         # Batch single image
+         if image.ndim == 3:
+             assert image.shape[0] == 3, "Image outside a batch should be of shape (3, H, W)"
+             image = image.unsqueeze(0)
+
+         # Batch and add channel dim for single mask
+         if mask.ndim == 2:
+             mask = mask.unsqueeze(0).unsqueeze(0)
+
+         # Batch single mask or add channel dim
+         if mask.ndim == 3:
+             # Single batched mask, no channel dim or single mask not batched but channel dim
+             if mask.shape[0] == 1:
+                 mask = mask.unsqueeze(0)
+
+             # Batched masks no channel dim
+             else:
+                 mask = mask.unsqueeze(1)
+
+         assert image.ndim == 4 and mask.ndim == 4, "Image and Mask must have 4 dimensions"
+         assert image.shape[-2:] == mask.shape[-2:], "Image and Mask must have the same spatial dimensions"
+         assert image.shape[0] == mask.shape[0], "Image and Mask must have the same batch size"
+
+         # Check image is in [-1, 1]
+         if image.min() < -1 or image.max() > 1:
+             raise ValueError("Image should be in [-1, 1] range")
+
+         # Check mask is in [0, 1]
+         if mask.min() < 0 or mask.max() > 1:
+             raise ValueError("Mask should be in [0, 1] range")
+
+         # Binarize mask
+         mask[mask < 0.5] = 0
+         mask[mask >= 0.5] = 1
+
+         # Image as float32
+         image = image.to(dtype=torch.float32)
+     elif isinstance(mask, torch.Tensor):
+         raise TypeError(f"`mask` is a torch.Tensor but `image` (type: {type(image)} is not")
+     else:
+         # preprocess image
+         if isinstance(image, (PIL.Image.Image, np.ndarray)):
+             image = [image]
+
+         if isinstance(image, list) and isinstance(image[0], PIL.Image.Image):
+             image = [np.array(i.convert("RGB"))[None, :] for i in image]
+             image = np.concatenate(image, axis=0)
+         elif isinstance(image, list) and isinstance(image[0], np.ndarray):
+             image = np.concatenate([i[None, :] for i in image], axis=0)
+
+         image = image.transpose(0, 3, 1, 2)
+         image = torch.from_numpy(image).to(dtype=torch.float32) / 127.5 - 1.0
+
+         # preprocess mask
+         if isinstance(mask, (PIL.Image.Image, np.ndarray)):
+             mask = [mask]
+
+         if isinstance(mask, list) and isinstance(mask[0], PIL.Image.Image):
+             mask = np.concatenate([np.array(m.convert("L"))[None, None, :] for m in mask], axis=0)
+             mask = mask.astype(np.float32) / 255.0
+         elif isinstance(mask, list) and isinstance(mask[0], np.ndarray):
+             mask = np.concatenate([m[None, None, :] for m in mask], axis=0)
+
+         mask[mask < 0.5] = 0
+         mask[mask >= 0.5] = 1
+         mask = torch.from_numpy(mask)
+
+     masked_image = image * (mask < 0.5)
+
+     return mask, masked_image
+
+ class StableDiffusionControlNetInpaintPipeline(StableDiffusionControlNetPipeline):
+     r"""
+     Pipeline for text-guided image inpainting using Stable Diffusion with ControlNet guidance.
+
+     This model inherits from [`StableDiffusionControlNetPipeline`]. Check the superclass documentation for the generic methods the
+     library implements for all the pipelines (such as downloading or saving, running on a particular device, etc.)
+
+     Args:
+         vae ([`AutoencoderKL`]):
+             Variational Auto-Encoder (VAE) Model to encode and decode images to and from latent representations.
+         text_encoder ([`CLIPTextModel`]):
+             Frozen text-encoder. Stable Diffusion uses the text portion of
+             [CLIP](https://huggingface.co/docs/transformers/model_doc/clip#transformers.CLIPTextModel), specifically
+             the [clip-vit-large-patch14](https://huggingface.co/openai/clip-vit-large-patch14) variant.
+         tokenizer (`CLIPTokenizer`):
+             Tokenizer of class
+             [CLIPTokenizer](https://huggingface.co/docs/transformers/v4.21.0/en/model_doc/clip#transformers.CLIPTokenizer).
+         unet ([`UNet2DConditionModel`]): Conditional U-Net architecture to denoise the encoded image latents.
+         controlnet ([`ControlNetModel`]):
+             Provides additional conditioning to the unet during the denoising process.
+         scheduler ([`SchedulerMixin`]):
+             A scheduler to be used in combination with `unet` to denoise the encoded image latents. Can be one of
+             [`DDIMScheduler`], [`LMSDiscreteScheduler`], or [`PNDMScheduler`].
+         safety_checker ([`StableDiffusionSafetyChecker`]):
+             Classification module that estimates whether generated images could be considered offensive or harmful.
+             Please refer to the [model card](https://huggingface.co/runwayml/stable-diffusion-v1-5) for details.
+         feature_extractor ([`CLIPFeatureExtractor`]):
+             Model that extracts features from generated images to be used as inputs for the `safety_checker`.
+     """
+
+     def prepare_mask_latents(
+         self, mask, masked_image, batch_size, height, width, dtype, device, generator, do_classifier_free_guidance
+     ):
+         # resize the mask to latents shape as we concatenate the mask to the latents
+         # we do that before converting to dtype to avoid breaking in case we're using cpu_offload
+         # and half precision
+         mask = torch.nn.functional.interpolate(
+             mask, size=(height // self.vae_scale_factor, width // self.vae_scale_factor)
+         )
+         mask = mask.to(device=device, dtype=dtype)
+
+         masked_image = masked_image.to(device=device, dtype=dtype)
+
+         # encode the mask image into latent space so we can concatenate it to the latents
+         if isinstance(generator, list):
+             masked_image_latents = [
+                 self.vae.encode(masked_image[i : i + 1]).latent_dist.sample(generator=generator[i])
+                 for i in range(batch_size)
+             ]
+             masked_image_latents = torch.cat(masked_image_latents, dim=0)
+         else:
+             masked_image_latents = self.vae.encode(masked_image).latent_dist.sample(generator=generator)
+         masked_image_latents = self.vae.config.scaling_factor * masked_image_latents
+
+         # duplicate mask and masked_image_latents for each generation per prompt, using mps friendly method
+         if mask.shape[0] < batch_size:
+             if not batch_size % mask.shape[0] == 0:
+                 raise ValueError(
+                     "The passed mask and the required batch size don't match. Masks are supposed to be duplicated to"
+                     f" a total batch size of {batch_size}, but {mask.shape[0]} masks were passed. Make sure the number"
+                     " of masks that you pass is divisible by the total requested batch size."
+                 )
+             mask = mask.repeat(batch_size // mask.shape[0], 1, 1, 1)
+         if masked_image_latents.shape[0] < batch_size:
+             if not batch_size % masked_image_latents.shape[0] == 0:
+                 raise ValueError(
+                     "The passed images and the required batch size don't match. Images are supposed to be duplicated"
+                     f" to a total batch size of {batch_size}, but {masked_image_latents.shape[0]} images were passed."
+                     " Make sure the number of images that you pass is divisible by the total requested batch size."
+                 )
+             masked_image_latents = masked_image_latents.repeat(batch_size // masked_image_latents.shape[0], 1, 1, 1)
+
+         mask = torch.cat([mask] * 2) if do_classifier_free_guidance else mask
+         masked_image_latents = (
+             torch.cat([masked_image_latents] * 2) if do_classifier_free_guidance else masked_image_latents
+         )
+
+         # aligning device to prevent device errors when concatenating it with the latent model input
+         masked_image_latents = masked_image_latents.to(device=device, dtype=dtype)
+         return mask, masked_image_latents
+
+     @torch.no_grad()
+     @replace_example_docstring(EXAMPLE_DOC_STRING)
+     def __call__(
+         self,
+         prompt: Union[str, List[str]] = None,
+         image: Union[torch.FloatTensor, PIL.Image.Image] = None,
+         control_image: Union[torch.FloatTensor, PIL.Image.Image, List[torch.FloatTensor], List[PIL.Image.Image]] = None,
+         mask_image: Union[torch.FloatTensor, PIL.Image.Image] = None,
+         height: Optional[int] = None,
+         width: Optional[int] = None,
+         num_inference_steps: int = 50,
+         guidance_scale: float = 7.5,
+         negative_prompt: Optional[Union[str, List[str]]] = None,
+         num_images_per_prompt: Optional[int] = 1,
+         eta: float = 0.0,
+         generator: Optional[Union[torch.Generator, List[torch.Generator]]] = None,
+         latents: Optional[torch.FloatTensor] = None,
+         prompt_embeds: Optional[torch.FloatTensor] = None,
+         negative_prompt_embeds: Optional[torch.FloatTensor] = None,
+         output_type: Optional[str] = "pil",
+         return_dict: bool = True,
+         callback: Optional[Callable[[int, int, torch.FloatTensor], None]] = None,
+         callback_steps: int = 1,
+         cross_attention_kwargs: Optional[Dict[str, Any]] = None,
+         controlnet_conditioning_scale: float = 1.0,
+     ):
+         r"""
+         Function invoked when calling the pipeline for generation.
+         Args:
+             prompt (`str` or `List[str]`, *optional*):
+                 The prompt or prompts to guide the image generation. If not defined, one has to pass `prompt_embeds`
+                 instead.
+             image (`PIL.Image.Image`):
+                 `Image`, or tensor representing an image batch which will be inpainted, *i.e.* parts of the image will
+                 be masked out with `mask_image` and repainted according to `prompt`.
+             control_image (`torch.FloatTensor`, `PIL.Image.Image`, `List[torch.FloatTensor]` or `List[PIL.Image.Image]`):
+                 The ControlNet input condition. ControlNet uses this input condition to generate guidance to Unet. If
+                 the type is specified as `torch.FloatTensor`, it is passed to ControlNet as is. `PIL.Image.Image` can
+                 also be accepted as an image. The control image is automatically resized to fit the output image.
+             mask_image (`PIL.Image.Image`):
+                 `Image`, or tensor representing an image batch, to mask `image`. White pixels in the mask will be
+                 repainted, while black pixels will be preserved. If `mask_image` is a PIL image, it will be converted
200
+ """
201
+
202
+ def prepare_mask_latents(
203
+ self, mask, masked_image, batch_size, height, width, dtype, device, generator, do_classifier_free_guidance
204
+ ):
205
+ # resize the mask to latents shape as we concatenate the mask to the latents
206
+ # we do that before converting to dtype to avoid breaking in case we're using cpu_offload
207
+ # and half precision
208
+ mask = torch.nn.functional.interpolate(
209
+ mask, size=(height // self.vae_scale_factor, width // self.vae_scale_factor)
210
+ )
211
+ mask = mask.to(device=device, dtype=dtype)
212
+
213
+ masked_image = masked_image.to(device=device, dtype=dtype)
214
+
215
+ # encode the mask image into latents space so we can concatenate it to the latents
216
+ if isinstance(generator, list):
217
+ masked_image_latents = [
218
+ self.vae.encode(masked_image[i : i + 1]).latent_dist.sample(generator=generator[i])
219
+ for i in range(batch_size)
220
+ ]
221
+ masked_image_latents = torch.cat(masked_image_latents, dim=0)
222
+ else:
223
+ masked_image_latents = self.vae.encode(masked_image).latent_dist.sample(generator=generator)
224
+ masked_image_latents = self.vae.config.scaling_factor * masked_image_latents
225
+
226
+ # duplicate mask and masked_image_latents for each generation per prompt, using mps friendly method
227
+ if mask.shape[0] < batch_size:
228
+ if not batch_size % mask.shape[0] == 0:
229
+ raise ValueError(
230
+ "The passed mask and the required batch size don't match. Masks are supposed to be duplicated to"
231
+ f" a total batch size of {batch_size}, but {mask.shape[0]} masks were passed. Make sure the number"
232
+ " of masks that you pass is divisible by the total requested batch size."
233
+ )
234
+ mask = mask.repeat(batch_size // mask.shape[0], 1, 1, 1)
235
+ if masked_image_latents.shape[0] < batch_size:
236
+ if not batch_size % masked_image_latents.shape[0] == 0:
237
+ raise ValueError(
238
+ "The passed images and the required batch size don't match. Images are supposed to be duplicated"
239
+ f" to a total batch size of {batch_size}, but {masked_image_latents.shape[0]} images were passed."
240
+ " Make sure the number of images that you pass is divisible by the total requested batch size."
241
+ )
242
+ masked_image_latents = masked_image_latents.repeat(batch_size // masked_image_latents.shape[0], 1, 1, 1)
243
+
244
+ mask = torch.cat([mask] * 2) if do_classifier_free_guidance else mask
245
+ masked_image_latents = (
246
+ torch.cat([masked_image_latents] * 2) if do_classifier_free_guidance else masked_image_latents
247
+ )
248
+
249
+ # aligning device to prevent device errors when concating it with the latent model input
250
+ masked_image_latents = masked_image_latents.to(device=device, dtype=dtype)
251
+ return mask, masked_image_latents
+
+     @torch.no_grad()
+     @replace_example_docstring(EXAMPLE_DOC_STRING)
+     def __call__(
+         self,
+         prompt: Union[str, List[str]] = None,
+         image: Union[torch.FloatTensor, PIL.Image.Image] = None,
+         control_image: Union[torch.FloatTensor, PIL.Image.Image, List[torch.FloatTensor], List[PIL.Image.Image]] = None,
+         mask_image: Union[torch.FloatTensor, PIL.Image.Image] = None,
+         height: Optional[int] = None,
+         width: Optional[int] = None,
+         num_inference_steps: int = 50,
+         guidance_scale: float = 7.5,
+         negative_prompt: Optional[Union[str, List[str]]] = None,
+         num_images_per_prompt: Optional[int] = 1,
+         eta: float = 0.0,
+         generator: Optional[Union[torch.Generator, List[torch.Generator]]] = None,
+         latents: Optional[torch.FloatTensor] = None,
+         prompt_embeds: Optional[torch.FloatTensor] = None,
+         negative_prompt_embeds: Optional[torch.FloatTensor] = None,
+         output_type: Optional[str] = "pil",
+         return_dict: bool = True,
+         callback: Optional[Callable[[int, int, torch.FloatTensor], None]] = None,
+         callback_steps: int = 1,
+         cross_attention_kwargs: Optional[Dict[str, Any]] = None,
+         controlnet_conditioning_scale: float = 1.0,
+     ):
+         r"""
+         Function invoked when calling the pipeline for generation.
+
+         Args:
+             prompt (`str` or `List[str]`, *optional*):
+                 The prompt or prompts to guide the image generation. If not defined, one has to pass `prompt_embeds`
+                 instead.
+             image (`PIL.Image.Image`):
+                 `Image`, or tensor representing an image batch which will be inpainted, *i.e.* parts of the image will
+                 be masked out with `mask_image` and repainted according to `prompt`.
+             control_image (`torch.FloatTensor`, `PIL.Image.Image`, `List[torch.FloatTensor]` or `List[PIL.Image.Image]`):
+                 The ControlNet input condition. ControlNet uses this input condition to generate guidance for the
+                 UNet. If the type is specified as `torch.FloatTensor`, it is passed to ControlNet as is.
+                 `PIL.Image.Image` can also be accepted as an image. The control image is automatically resized to fit
+                 the output image.
+             mask_image (`PIL.Image.Image`):
+                 `Image`, or tensor representing an image batch, to mask `image`. White pixels in the mask will be
+                 repainted, while black pixels will be preserved. If `mask_image` is a PIL image, it will be converted
+                 to a single channel (luminance) before use. If it's a tensor, it should contain one color channel (L)
+                 instead of 3, so the expected shape would be `(B, H, W, 1)`.
+             height (`int`, *optional*, defaults to `self.unet.config.sample_size * self.vae_scale_factor`):
+                 The height in pixels of the generated image.
+             width (`int`, *optional*, defaults to `self.unet.config.sample_size * self.vae_scale_factor`):
+                 The width in pixels of the generated image.
+             num_inference_steps (`int`, *optional*, defaults to 50):
+                 The number of denoising steps. More denoising steps usually lead to a higher quality image at the
+                 expense of slower inference.
+             guidance_scale (`float`, *optional*, defaults to 7.5):
+                 Guidance scale as defined in [Classifier-Free Diffusion Guidance](https://arxiv.org/abs/2207.12598).
+                 `guidance_scale` is defined as `w` of equation 2. of [Imagen
+                 Paper](https://arxiv.org/pdf/2205.11487.pdf). Guidance scale is enabled by setting `guidance_scale >
+                 1`. A higher guidance scale encourages generating images that are closely linked to the text `prompt`,
+                 usually at the expense of lower image quality.
+             negative_prompt (`str` or `List[str]`, *optional*):
+                 The prompt or prompts not to guide the image generation. If not defined, one has to pass
+                 `negative_prompt_embeds` instead. Ignored when not using guidance (i.e., ignored if `guidance_scale`
+                 is less than `1`).
+             num_images_per_prompt (`int`, *optional*, defaults to 1):
+                 The number of images to generate per prompt.
+             eta (`float`, *optional*, defaults to 0.0):
+                 Corresponds to parameter eta (η) in the DDIM paper: https://arxiv.org/abs/2010.02502. Only applies to
+                 [`schedulers.DDIMScheduler`], will be ignored for others.
+             generator (`torch.Generator` or `List[torch.Generator]`, *optional*):
+                 One or a list of [torch generator(s)](https://pytorch.org/docs/stable/generated/torch.Generator.html)
+                 to make generation deterministic.
+             latents (`torch.FloatTensor`, *optional*):
+                 Pre-generated noisy latents, sampled from a Gaussian distribution, to be used as inputs for image
+                 generation. Can be used to tweak the same generation with different prompts. If not provided, a latents
+                 tensor will be generated by sampling using the supplied random `generator`.
+             prompt_embeds (`torch.FloatTensor`, *optional*):
+                 Pre-generated text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt weighting. If not
+                 provided, text embeddings will be generated from the `prompt` input argument.
+             negative_prompt_embeds (`torch.FloatTensor`, *optional*):
+                 Pre-generated negative text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt
+                 weighting. If not provided, `negative_prompt_embeds` will be generated from the `negative_prompt` input
+                 argument.
+             output_type (`str`, *optional*, defaults to `"pil"`):
+                 The output format of the generated image. Choose between
+                 [PIL](https://pillow.readthedocs.io/en/stable/): `PIL.Image.Image` or `np.array`.
+             return_dict (`bool`, *optional*, defaults to `True`):
+                 Whether or not to return a [`~pipelines.stable_diffusion.StableDiffusionPipelineOutput`] instead of a
+                 plain tuple.
+             callback (`Callable`, *optional*):
+                 A function that will be called every `callback_steps` steps during inference. The function will be
+                 called with the following arguments: `callback(step: int, timestep: int, latents: torch.FloatTensor)`.
+             callback_steps (`int`, *optional*, defaults to 1):
+                 The frequency at which the `callback` function will be called. If not specified, the callback will be
+                 called at every step.
+             cross_attention_kwargs (`dict`, *optional*):
+                 A kwargs dictionary that, if specified, is passed along to the `AttnProcessor` as defined under
+                 `self.processor` in
+                 [diffusers.cross_attention](https://github.com/huggingface/diffusers/blob/main/src/diffusers/models/cross_attention.py).
+             controlnet_conditioning_scale (`float`, *optional*, defaults to 1.0):
+                 The outputs of the ControlNet are multiplied by `controlnet_conditioning_scale` before they are added
+                 to the residual in the original UNet.
+
+         Examples:
+
+         Returns:
+             [`~pipelines.stable_diffusion.StableDiffusionPipelineOutput`] or `tuple`:
+             [`~pipelines.stable_diffusion.StableDiffusionPipelineOutput`] if `return_dict` is True, otherwise a
+             `tuple`. When returning a tuple, the first element is a list with the generated images, and the second
+             element is a list of `bool`s denoting whether the corresponding generated image likely represents
+             "not-safe-for-work" (nsfw) content, according to the `safety_checker`.
+         """
+         # 0. Default height and width to unet
+         height, width = self._default_height_width(height, width, control_image)
+
+         # 1. Check inputs. Raise error if not correct
+         self.check_inputs(
+             prompt, control_image, height, width, callback_steps, negative_prompt, prompt_embeds, negative_prompt_embeds
+         )
+
+         # 2. Define call parameters
+         if prompt is not None and isinstance(prompt, str):
+             batch_size = 1
+         elif prompt is not None and isinstance(prompt, list):
+             batch_size = len(prompt)
+         else:
+             batch_size = prompt_embeds.shape[0]
+
+         device = self._execution_device
+         # here `guidance_scale` is defined analogously to the guidance weight `w` of equation (2)
+         # of the Imagen paper: https://arxiv.org/pdf/2205.11487.pdf . `guidance_scale = 1`
+         # corresponds to doing no classifier free guidance.
+         do_classifier_free_guidance = guidance_scale > 1.0
+
+         # 3. Encode input prompt
+         prompt_embeds = self._encode_prompt(
+             prompt,
+             device,
+             num_images_per_prompt,
+             do_classifier_free_guidance,
+             negative_prompt,
+             prompt_embeds=prompt_embeds,
+             negative_prompt_embeds=negative_prompt_embeds,
+         )
+
+         # 4. Prepare image
+         control_image = self.prepare_image(
+             control_image,
+             width,
+             height,
+             batch_size * num_images_per_prompt,
+             num_images_per_prompt,
+             device,
+             self.controlnet.dtype,
+         )
+
+         if do_classifier_free_guidance:
+             control_image = torch.cat([control_image] * 2)
+
+         # 5. Prepare timesteps
+         self.scheduler.set_timesteps(num_inference_steps, device=device)
+         timesteps = self.scheduler.timesteps
+
+         # 6. Prepare latent variables
+         num_channels_latents = self.controlnet.config.in_channels
+         latents = self.prepare_latents(
+             batch_size * num_images_per_prompt,
+             num_channels_latents,
+             height,
+             width,
+             prompt_embeds.dtype,
+             device,
+             generator,
+             latents,
+         )
+
+         # EXTRA: prepare mask latents
+         mask, masked_image = prepare_mask_and_masked_image(image, mask_image)
+         mask, masked_image_latents = self.prepare_mask_latents(
+             mask,
+             masked_image,
+             batch_size * num_images_per_prompt,
+             height,
+             width,
+             prompt_embeds.dtype,
+             device,
+             generator,
+             do_classifier_free_guidance,
+         )
+
+         # 7. Prepare extra step kwargs. TODO: Logic should ideally just be moved out of the pipeline
+         extra_step_kwargs = self.prepare_extra_step_kwargs(generator, eta)
+
+         # 8. Denoising loop
+         num_warmup_steps = len(timesteps) - num_inference_steps * self.scheduler.order
+         with self.progress_bar(total=num_inference_steps) as progress_bar:
+             for i, t in enumerate(timesteps):
+                 # expand the latents if we are doing classifier free guidance
+                 latent_model_input = torch.cat([latents] * 2) if do_classifier_free_guidance else latents
+                 latent_model_input = self.scheduler.scale_model_input(latent_model_input, t)
+
+                 down_block_res_samples, mid_block_res_sample = self.controlnet(
+                     latent_model_input,
+                     t,
+                     encoder_hidden_states=prompt_embeds,
+                     controlnet_cond=control_image,
+                     return_dict=False,
+                 )
+
+                 down_block_res_samples = [
+                     down_block_res_sample * controlnet_conditioning_scale
+                     for down_block_res_sample in down_block_res_samples
+                 ]
+                 mid_block_res_sample *= controlnet_conditioning_scale
+
+                 # predict the noise residual
+                 latent_model_input = torch.cat([latent_model_input, mask, masked_image_latents], dim=1)
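The `torch.cat` above stacks three tensors along the channel dimension: 4 latent channels, 1 downsampled mask channel, and 4 masked-image latent channels, producing the 9-channel input that an inpainting UNet expects. A torch-free sketch of that channel arithmetic (`cat_channels` is illustrative only, mimicking `torch.cat(..., dim=1)` on nested lists):

```python
def cat_channels(*tensors):
    # Mimic torch.cat(tensors, dim=1) on lists shaped [batch][channels]:
    # concatenate the channel lists of each sample in order.
    batch = len(tensors[0])
    assert all(len(t) == batch for t in tensors), "batch sizes must match"
    return [[c for t in tensors for c in t[b]] for b in range(batch)]

latents = [["z0", "z1", "z2", "z3"]] * 2   # 4 VAE latent channels, batch of 2
mask = [["m0"]] * 2                        # 1 downsampled mask channel
masked = [["w0", "w1", "w2", "w3"]] * 2    # 4 masked-image latent channels
combined = cat_channels(latents, mask, masked)
print(len(combined[0]))  # 9
```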
+                 noise_pred = self.unet(
+                     latent_model_input,
+                     t,
+                     encoder_hidden_states=prompt_embeds,
+                     cross_attention_kwargs=cross_attention_kwargs,
+                     down_block_additional_residuals=down_block_res_samples,
+                     mid_block_additional_residual=mid_block_res_sample,
+                 ).sample
+
+                 # perform guidance
+                 if do_classifier_free_guidance:
+                     noise_pred_uncond, noise_pred_text = noise_pred.chunk(2)
+                     noise_pred = noise_pred_uncond + guidance_scale * (noise_pred_text - noise_pred_uncond)
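The combination above is the standard classifier-free guidance formula, `uncond + w * (text - uncond)`. Per element it behaves like this small sketch (`cfg` is illustrative, not a pipeline method):

```python
def cfg(uncond, text, scale):
    # Classifier-free guidance: move from the unconditional prediction
    # toward (and, for scale > 1, past) the text-conditioned one.
    return uncond + scale * (text - uncond)

assert abs(cfg(0.2, 0.6, 1.0) - 0.6) < 1e-9   # scale 1 reduces to the conditional prediction
assert abs(cfg(0.2, 0.6, 7.5) - 3.2) < 1e-9   # 0.2 + 7.5 * 0.4
```

Since the pipeline only takes this branch when `guidance_scale > 1.0` (`do_classifier_free_guidance`), the `scale == 1` case never actually runs here.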
+
+                 # compute the previous noisy sample x_t -> x_t-1
+                 latents = self.scheduler.step(noise_pred, t, latents, **extra_step_kwargs).prev_sample
+
+                 # call the callback, if provided
+                 if i == len(timesteps) - 1 or ((i + 1) > num_warmup_steps and (i + 1) % self.scheduler.order == 0):
+                     progress_bar.update()
+                     if callback is not None and i % callback_steps == 0:
+                         callback(i, t, latents)
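The update condition above fires once per effective inference step. A small self-check of that arithmetic, under the assumption that `len(timesteps) == num_inference_steps * scheduler.order` (true for first-order schedulers such as DDIM, where `order == 1`; the `order == 2` case is illustrative only):

```python
def progress_updates(num_inference_steps, order):
    # Replays the progress-bar condition from the denoising loop, assuming
    # len(timesteps) == num_inference_steps * order (an assumption here).
    timesteps = num_inference_steps * order
    num_warmup_steps = timesteps - num_inference_steps * order  # 0 under this assumption
    updates = 0
    for i in range(timesteps):
        if i == timesteps - 1 or ((i + 1) > num_warmup_steps and (i + 1) % order == 0):
            updates += 1
    return updates

print(progress_updates(50, 1), progress_updates(50, 2))  # 50 50
```

Either way the bar advances `num_inference_steps` times, matching `progress_bar(total=num_inference_steps)`.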
+
+         # If we do sequential model offloading, let's offload unet and controlnet
+         # manually for max memory savings
+         if hasattr(self, "final_offload_hook") and self.final_offload_hook is not None:
+             self.unet.to("cpu")
+             self.controlnet.to("cpu")
+             torch.cuda.empty_cache()
+
+         if output_type == "latent":
+             image = latents
+             has_nsfw_concept = None
+         elif output_type == "pil":
+             # 8. Post-processing
+             image = self.decode_latents(latents)
+
+             # 9. Run safety checker
+             image, has_nsfw_concept = self.run_safety_checker(image, device, prompt_embeds.dtype)
+
+             # 10. Convert to PIL
+             image = self.numpy_to_pil(image)
+         else:
+             # 8. Post-processing
+             image = self.decode_latents(latents)
+
+             # 9. Run safety checker
+             image, has_nsfw_concept = self.run_safety_checker(image, device, prompt_embeds.dtype)
+
+         # Offload last model to CPU
+         if hasattr(self, "final_offload_hook") and self.final_offload_hook is not None:
+             self.final_offload_hook.offload()
+
+         if not return_dict:
+             return (image, has_nsfw_concept)
+
+         return StableDiffusionPipelineOutput(images=image, nsfw_content_detected=has_nsfw_concept)