RuntimeError: The size of tensor a (7) must match the size of tensor b (16) at non-singleton dimension 1

#1 by DsnTgr - opened

I get the following error when running the HunyuanVideo image-to-video pipeline:

RuntimeError                              Traceback (most recent call last)
Cell In[1], line 22
     17 prompt = "A man with short gray hair plays a red electric guitar."
     18 image = load_image(
     19     "https://hf-mirror.com/datasets/huggingface/documentation-images/resolve/main/diffusers/guitar-man.png"
     20 )
---> 22 output = pipe(image=image, prompt=prompt).frames[0]
     23 export_to_video(output, "HunyuanVideo-I2V-diffusers.mp4", fps=15)

File ~/miniconda3/envs/ai-train/lib/python3.10/site-packages/torch/utils/_contextlib.py:116, in context_decorator.<locals>.decorate_context(*args, **kwargs)
    113 @functools.wraps(func)
    114 def decorate_context(*args, **kwargs):
    115     with ctx_factory():
--> 116         return func(*args, **kwargs)

File ~/miniconda3/envs/ai-train/lib/python3.10/site-packages/diffusers/pipelines/hunyuan_video/pipeline_hunyuan_video_image2video.py:733, in HunyuanVideoImageToVideoPipeline.__call__(self, image, prompt, prompt_2, negative_prompt, negative_prompt_2, height, width, num_frames, num_inference_steps, sigmas, true_cfg_scale, guidance_scale, num_videos_per_prompt, generator, latents, prompt_embeds, pooled_prompt_embeds, prompt_attention_mask, negative_prompt_embeds, negative_pooled_prompt_embeds, negative_prompt_attention_mask, output_type, return_dict, attention_kwargs, callback_on_step_end, callback_on_step_end_tensor_inputs, prompt_template, max_sequence_length)
    731 image_tensor = self.video_processor.preprocess(image, height, width).to(device, vae_dtype)
    732 num_channels_latents = (self.transformer.config.in_channels - 1) // 2
--> 733 latents, image_latents = self.prepare_latents(
    734     image_tensor,
    735     batch_size * num_videos_per_prompt,
    736     num_channels_latents,
    737     height,
    738     width,
    739     num_frames,
    740     torch.float32,
    741     device,
    742     generator,
    743     latents,
    744 )
    745 image_latents[:, :, 1:] = 0
    746 mask = image_latents.new_ones(image_latents.shape[0], 1, *image_latents.shape[2:])

File ~/miniconda3/envs/ai-train/lib/python3.10/site-packages/diffusers/pipelines/hunyuan_video/pipeline_hunyuan_video_image2video.py:514, in HunyuanVideoImageToVideoPipeline.prepare_latents(self, image, batch_size, num_channels_latents, height, width, num_frames, dtype, device, generator, latents)
    511     latents = latents.to(device=device, dtype=dtype)
    513 t = torch.tensor([0.999]).to(device=device)
--> 514 latents = latents * t + image_latents * (1 - t)
    516 return latents, image_latents

RuntimeError: The size of tensor a (7) must match the size of tensor b (16) at non-singleton dimension 1
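
For reference, a minimal script along these lines reproduces the call shown in the cell. The checkpoint ID and dtype choices here are assumptions, since the lines above 17 are not visible in the traceback:

import torch
from diffusers import HunyuanVideoImageToVideoPipeline, HunyuanVideoTransformer3DModel
from diffusers.utils import export_to_video, load_image

# Assumed checkpoint; the traceback does not show which weights were loaded.
model_id = "hunyuanvideo-community/HunyuanVideo-I2V"

transformer = HunyuanVideoTransformer3DModel.from_pretrained(
    model_id, subfolder="transformer", torch_dtype=torch.bfloat16
)
pipe = HunyuanVideoImageToVideoPipeline.from_pretrained(
    model_id, transformer=transformer, torch_dtype=torch.float16
)
pipe.vae.enable_tiling()  # lower peak memory during VAE decode
pipe.to("cuda")

prompt = "A man with short gray hair plays a red electric guitar."
image = load_image(
    "https://hf-mirror.com/datasets/huggingface/documentation-images/resolve/main/diffusers/guitar-man.png"
)

output = pipe(image=image, prompt=prompt).frames[0]
export_to_video(output, "HunyuanVideo-I2V-diffusers.mp4", fps=15)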
Reply from a HunyuanVideo Community org member:

Have you installed diffusers from this branch? https://github.com/huggingface/diffusers/pull/11066
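
If not, that fix needs a source install, e.g. pip install git+https://github.com/huggingface/diffusers.git@refs/pull/11066/head (or from main once the PR is merged). That would also fit the numbers in the error: the pipeline computes num_channels_latents = (self.transformer.config.in_channels - 1) // 2, so a transformer config that still carries the text-to-video in_channels of 16 yields (16 - 1) // 2 = 7 latent channels, while the VAE-encoded image_latents keep 16 channels, which is exactly the 7 vs 16 mismatch at dimension 1 in the traceback.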
