Spaces:

TempoFunk
/

makeavid-sd-jax

Runtime error

App Files Files Community

lopho committed on May 10, 2023

Commit

07b5d00

1 Parent(s): c122e26

saner defaults, more input sanitization, shorter queue

Browse files

Files changed (6) hide show

README.md +0 -2
app.py +62 -58
example.webp +0 -3
examples/example_04_furry_moster/params.json +1 -1
examples/example_06_sophie/params.json +1 -1
makeavid_sd/inference.py +5 -5

README.md CHANGED Viewed

@@ -19,5 +19,3 @@ models:
 tags:
 - jax-diffusers-event
 ---
-Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference

 tags:
 - jax-diffusers-event
 ---

app.py CHANGED Viewed

@@ -33,7 +33,7 @@ if _model.failed != False:
 _examples = []
 _expath = 'examples'
-for x in os.listdir(_expath):
     with open(os.path.join(_expath, x, 'params.json'), 'r') as f:
         ex = json.load(f)
     ex['image_input'] = None
@@ -56,22 +56,23 @@ def generate(
         cfg = 15.0,
         cfg_image = 9.0,
         seed = 0,
-        fps = 24,
         num_frames = 24,
         height = 512,
         width = 512,
-        scheduler_type = 'DPM',
-        output_format = 'webp'
 ) -> str:
-    num_frames = int(num_frames)
-    inference_steps = int(inference_steps)
-    height = int(height)
-    width = int(width)
     height = (height // 64) * 64
     width = (width // 64) * 64
     cfg = max(cfg, 1.0)
     cfg_image = max(cfg_image, 1.0)
-    seed = int(seed)
     if seed < 0:
         seed = -seed
     if hint_image is not None:
@@ -79,11 +80,12 @@ def generate(
             hint_image = hint_image.convert('RGB')
         if hint_image.size != (width, height):
             hint_image = ImageOps.fit(hint_image, (width, height), method = Image.Resampling.LANCZOS)
     if scheduler_type not in SCHEDULERS:
-        scheduler_type = 'DPM'
     output_format = output_format.lower()
     if output_format not in _output_formats:
-        output_format = 'webp'
     mask_image = None
     images = _model.generate(
             prompt = [prompt] * _model.device_count,
@@ -100,26 +102,24 @@ def generate(
             scheduler_type = scheduler_type
     )
     _seen_compilations.add((hint_image is None, inference_steps, height, width, num_frames))
-    buffer = BytesIO()
-    images[1].save(
-            buffer,
-            format = output_format,
-            save_all = True,
-            append_images = images[2:],
-            loop = 0,
-            duration = round(1000 / fps),
-            allow_mixed = True
-    )
-    data = f'data:image/{output_format};base64,' + base64.b64encode(buffer.getvalue()).decode()
-    buffer.close()
-    buffer = BytesIO()
-    images[-1].save(buffer, format ='png')
-    last_data = f'data:image/png;base64,' + base64.b64encode(buffer.getvalue()).decode()
-    buffer.close()
-    buffer = BytesIO()
-    images[0].save(buffer, format ='png')
-    first_data = f'data:image/png;base64,' + base64.b64encode(buffer.getvalue()).decode()
-    buffer.close()
     return data, last_data, first_data
 def check_if_compiled(hint_image, inference_steps, height, width, num_frames, scheduler_type, message):
@@ -140,11 +140,11 @@ with gr.Blocks(title = 'Make-A-Video Stable Diffusion JAX', analytics_enabled =
             intro1 = gr.Markdown("""
                         # Make-A-Video Stable Diffusion JAX
-                        We have extended a pretrained LDM inpainting image generation model with temporal convolutions and attention.
-                        By taking advantage of the extra 5 input channels of the inpaint model, we guide the video generation with a hint image.
                         In this demo the hint image can be given by the user, otherwise it is generated by an generative image model.
-                        The temporal layers are a port of [Make-A-Video PyTorch](https://github.com/lucidrains/make-a-video-pytorch) to FLAX.
                         The convolution is pseudo 3D and seperately convolves accross the spatial dimension in 2D and over the temporal dimension in 1D.
                         Temporal attention is purely self attention and also separately attends to time.
@@ -160,7 +160,7 @@ with gr.Blocks(title = 'Make-A-Video Stable Diffusion JAX', analytics_enabled =
                         **Please be patient. The model might have to compile with current parameters.**
                         This can take up to 5 minutes on the first run, and 2-3 minutes on later runs.
-                        The compilation will be cached and consecutive runs with the same parameters
                         will be much faster.
                         Changes to the following parameters require the model to compile
@@ -170,7 +170,9 @@ with gr.Blocks(title = 'Make-A-Video Stable Diffusion JAX', analytics_enabled =
                         - Input image vs. no input image
                         - Noise scheduler type
-                        If you encounter any issues, please report them here: [Space discussions](https://huggingface.co/spaces/TempoFunk/makeavid-sd-jax/discussions)
             """)
     with gr.Row(variant = variant):
@@ -221,14 +223,14 @@ with gr.Blocks(title = 'Make-A-Video Stable Diffusion JAX', analytics_enabled =
             inference_steps_input = gr.Slider(
                     label = 'Steps',
                     minimum = 2,
-                    maximum = 100,
                     value = 20,
                     step = 1,
                     interactive = True
             )
             num_frames_input = gr.Slider(
                     label = 'Number of frames to generate',
-                    minimum = 1,
                     maximum = 24,
                     step = 1,
                     value = 24,
@@ -236,7 +238,7 @@ with gr.Blocks(title = 'Make-A-Video Stable Diffusion JAX', analytics_enabled =
             )
             width_input = gr.Slider(
                     label = 'Width',
-                    minimum = 64,
                     maximum = 576,
                     step = 64,
                     value = 512,
@@ -244,7 +246,7 @@ with gr.Blocks(title = 'Make-A-Video Stable Diffusion JAX', analytics_enabled =
             )
             height_input = gr.Slider(
                     label = 'Height',
-                    minimum = 64,
                     maximum = 576,
                     step = 64,
                     value = 512,
@@ -253,7 +255,7 @@ with gr.Blocks(title = 'Make-A-Video Stable Diffusion JAX', analytics_enabled =
             scheduler_input = gr.Dropdown(
                     label = 'Noise scheduler',
                     choices = list(SCHEDULERS.keys()),
-                    value = 'DPM',
                     interactive = True
             )
             with gr.Row():
@@ -279,32 +281,33 @@ with gr.Blocks(title = 'Make-A-Video Stable Diffusion JAX', analytics_enabled =
                     value = 'example.gif',
                     interactive = False
             )
-            tips = gr.Markdown('🤫 *Secret tip*: take the last frame as input for the next generation.')
             with gr.Row():
                 last_frame_output = gr.Image(
                         label = 'Last frame',
                         interactive = False
                 )
                 first_frame_output = gr.Image(
-                        label = 'First frame',
                         interactive = False
                 )
     examples_lst = []
     for x in _examples:
         examples_lst.append([
-            x['image_output'],
-            x['prompt'],
-            x['neg_prompt'],
-            x['image_input'],
-            x['cfg'],
-            x['cfg_image'],
-            x['seed'],
-            x['fps'],
-            x['num_frames'],
-            x['height'],
-            x['width'],
-            x['scheduler'],
-            x['format']
         ])
     examples = gr.Examples(
             examples = examples_lst,
@@ -317,10 +320,11 @@ with gr.Blocks(title = 'Make-A-Video Stable Diffusion JAX', analytics_enabled =
                     cfg_image_input,
                     seed_input,
                     fps_input,
                     num_frames_input,
                     height_input,
                     width_input,
-                    scheduler_input,
                     output_format
             ],
             postprocess = False
@@ -355,6 +359,6 @@ with gr.Blocks(title = 'Make-A-Video Stable Diffusion JAX', analytics_enabled =
     )
     #cancel_button.click(fn = lambda: None, cancels = ev)
-demo.queue(concurrency_count = 1, max_size = 10)
 demo.launch()

 _examples = []
 _expath = 'examples'
+for x in sorted(os.listdir(_expath)):
     with open(os.path.join(_expath, x, 'params.json'), 'r') as f:
         ex = json.load(f)
     ex['image_input'] = None
         cfg = 15.0,
         cfg_image = 9.0,
         seed = 0,
+        fps = 12,
         num_frames = 24,
         height = 512,
         width = 512,
+        scheduler_type = 'dpm',
+        output_format = 'gif'
 ) -> str:
+    num_frames = min(24, max(2, int(num_frames)))
+    inference_steps = min(60, max(2, int(inference_steps)))
+    height = min(576, max(256, int(height)))
+    width = min(576, max(256, int(width)))
     height = (height // 64) * 64
     width = (width // 64) * 64
     cfg = max(cfg, 1.0)
     cfg_image = max(cfg_image, 1.0)
+    fps = min(1000, max(1, int(fps)))
+    seed = min(2**32-2, int(seed))
     if seed < 0:
         seed = -seed
     if hint_image is not None:
             hint_image = hint_image.convert('RGB')
         if hint_image.size != (width, height):
             hint_image = ImageOps.fit(hint_image, (width, height), method = Image.Resampling.LANCZOS)
+    scheduler_type = scheduler_type.lower()
     if scheduler_type not in SCHEDULERS:
+        scheduler_type = 'dpm'
     output_format = output_format.lower()
     if output_format not in _output_formats:
+        output_format = 'gif'
     mask_image = None
     images = _model.generate(
             prompt = [prompt] * _model.device_count,
             scheduler_type = scheduler_type
     )
     _seen_compilations.add((hint_image is None, inference_steps, height, width, num_frames))
+    with BytesIO() as buffer:
+        images[1].save(
+                buffer,
+                format = output_format,
+                save_all = True,
+                append_images = images[2:],
+                loop = 0,
+                duration = round(1000 / fps),
+                allow_mixed = True,
+                optimize = True
+        )
+        data = f'data:image/{output_format};base64,' + base64.b64encode(buffer.getvalue()).decode()
+    with BytesIO() as buffer:
+        images[-1].save(buffer, format = 'png', optimize = True)
+        last_data = f'data:image/png;base64,' + base64.b64encode(buffer.getvalue()).decode()
+    with BytesIO() as buffer:
+        images[0].save(buffer, format ='png', optimize = True)
+        first_data = f'data:image/png;base64,' + base64.b64encode(buffer.getvalue()).decode()
     return data, last_data, first_data
 def check_if_compiled(hint_image, inference_steps, height, width, num_frames, scheduler_type, message):
             intro1 = gr.Markdown("""
                         # Make-A-Video Stable Diffusion JAX
+                        We have extended a pretrained latent-diffusion inpainting image generation model with **temporal convolutions and attention**.
+                        We guide the video generation with a hint image by taking advantage of the extra 5 input channels of the inpainting model.
                         In this demo the hint image can be given by the user, otherwise it is generated by an generative image model.
+                        The temporal layers are a port of [Make-A-Video PyTorch](https://github.com/lucidrains/make-a-video-pytorch) to [JAX](https://github.com/google/jax) utilizing [FLAX](https://github.com/google/flax).
                         The convolution is pseudo 3D and seperately convolves accross the spatial dimension in 2D and over the temporal dimension in 1D.
                         Temporal attention is purely self attention and also separately attends to time.
                         **Please be patient. The model might have to compile with current parameters.**
                         This can take up to 5 minutes on the first run, and 2-3 minutes on later runs.
+                        The compilation will be cached and later runs with the same parameters
                         will be much faster.
                         Changes to the following parameters require the model to compile
                         - Input image vs. no input image
                         - Noise scheduler type
+                        If you encounter any issues, please report them here: [Space discussions](https://huggingface.co/spaces/TempoFunk/makeavid-sd-jax/discussions) (or DM [@lopho](https://twitter.com/lopho))
+                        <small>Leave a ❤️ like if you like. Consider it a dopamine donation at no cost.</small>
             """)
     with gr.Row(variant = variant):
             inference_steps_input = gr.Slider(
                     label = 'Steps',
                     minimum = 2,
+                    maximum = 60,
                     value = 20,
                     step = 1,
                     interactive = True
             )
             num_frames_input = gr.Slider(
                     label = 'Number of frames to generate',
+                    minimum = 2,
                     maximum = 24,
                     step = 1,
                     value = 24,
             )
             width_input = gr.Slider(
                     label = 'Width',
+                    minimum = 256,
                     maximum = 576,
                     step = 64,
                     value = 512,
             )
             height_input = gr.Slider(
                     label = 'Height',
+                    minimum = 256,
                     maximum = 576,
                     step = 64,
                     value = 512,
             scheduler_input = gr.Dropdown(
                     label = 'Noise scheduler',
                     choices = list(SCHEDULERS.keys()),
+                    value = 'dpm',
                     interactive = True
             )
             with gr.Row():
                     value = 'example.gif',
                     interactive = False
             )
+            tips = gr.Markdown('🤫 *Secret tip*: try using the last frame as input for the next generation.')
             with gr.Row():
                 last_frame_output = gr.Image(
                         label = 'Last frame',
                         interactive = False
                 )
                 first_frame_output = gr.Image(
+                        label = 'Initial frame',
                         interactive = False
                 )
     examples_lst = []
     for x in _examples:
         examples_lst.append([
+                x['image_output'],
+                x['prompt'],
+                x['neg_prompt'],
+                x['image_input'],
+                x['cfg'],
+                x['cfg_image'],
+                x['seed'],
+                x['fps'],
+                x['steps'],
+                x['scheduler'],
+                x['num_frames'],
+                x['height'],
+                x['width'],
+                x['format']
         ])
     examples = gr.Examples(
             examples = examples_lst,
                     cfg_image_input,
                     seed_input,
                     fps_input,
+                    inference_steps_input,
+                    scheduler_input,
                     num_frames_input,
                     height_input,
                     width_input,
                     output_format
             ],
             postprocess = False
     )
     #cancel_button.click(fn = lambda: None, cancels = ev)
+demo.queue(concurrency_count = 1, max_size = 8)
 demo.launch()

example.webp DELETED Viewed

Git LFS Details

SHA256: ffd7cb93989a8e311395799f6d6e566e698ad7654f9f5a471196d8c781f46c1f
Pointer size: 132 Bytes
Size of remote file: 1.45 MB

examples/example_04_furry_moster/params.json CHANGED Viewed

@@ -8,7 +8,7 @@
     "width": 512,
     "height": 512,
     "scheduler": "dpm",
-    "fps": 20,
     "format": "gif",
     "num_frames": 24
 }

     "width": 512,
     "height": 512,
     "scheduler": "dpm",
+    "fps": 12,
     "format": "gif",
     "num_frames": 24
 }

examples/example_06_sophie/params.json CHANGED Viewed

@@ -3,7 +3,7 @@
     "neg_prompt": "",
     "cfg": 15,
     "cfg_image": 9,
-    "seed": 1,
     "steps": 20,
     "width": 512,
     "height": 512,

     "neg_prompt": "",
     "cfg": 15,
     "cfg_image": 9,
+    "seed": 0,
     "steps": 20,
     "width": 512,
     "height": 512,

makeavid_sd/inference.py CHANGED Viewed

@@ -45,8 +45,8 @@ SchedulerStateType = Union[
 ]
 SCHEDULERS: Dict[str, SchedulerType] = {
-        'DPM': FlaxDPMSolverMultistepScheduler, # husbando
-        'DDIM': FlaxDDIMScheduler,
         #'PLMS': FlaxPNDMScheduler, # its not correctly implemented in diffusers, output is bad, but at least it "works"
         #'LMS': FlaxLMSDiscreteScheduler, # borked
         #    image_latents, image_scheduler_state = scheduler.step(
@@ -224,8 +224,8 @@ class InferenceUNetPseudo3D:
         return tokens, neg_tokens, hint, mask
     def generate(self,
-            prompt: Union[str, List[str]],
-            inference_steps: int,
             hint_image: Union[Image.Image, List[Image.Image], None] = None,
             mask_image: Union[Image.Image, List[Image.Image], None] = None,
             neg_prompt: Union[str, List[str]] = '',
@@ -235,7 +235,7 @@ class InferenceUNetPseudo3D:
             width: int = 512,
             height: int = 512,
             seed: int = 0,
-            scheduler_type: str = 'DDIM'
     ) -> List[List[Image.Image]]:
         assert inference_steps > 0, f'number of inference steps must be > 0 but is {inference_steps}'
         assert num_frames > 0, f'number of frames must be > 0 but is {num_frames}'

 ]
 SCHEDULERS: Dict[str, SchedulerType] = {
+        'dpm': FlaxDPMSolverMultistepScheduler, # husbando
+        'ddim': FlaxDDIMScheduler,
         #'PLMS': FlaxPNDMScheduler, # its not correctly implemented in diffusers, output is bad, but at least it "works"
         #'LMS': FlaxLMSDiscreteScheduler, # borked
         #    image_latents, image_scheduler_state = scheduler.step(
         return tokens, neg_tokens, hint, mask
     def generate(self,
+            prompt: Union[str, List[str]] = '',
+            inference_steps: int = 20,
             hint_image: Union[Image.Image, List[Image.Image], None] = None,
             mask_image: Union[Image.Image, List[Image.Image], None] = None,
             neg_prompt: Union[str, List[str]] = '',
             width: int = 512,
             height: int = 512,
             seed: int = 0,
+            scheduler_type: str = 'dpm'
     ) -> List[List[Image.Image]]:
         assert inference_steps > 0, f'number of inference steps must be > 0 but is {inference_steps}'
         assert num_frames > 0, f'number of frames must be > 0 but is {num_frames}'