import nodes
import node_helpers
import torch
import comfy.model_management
import comfy.utils
class CLIPTextEncodeHunyuanDiT:
    @classmethod
    def INPUT_TYPES(s):
        return {"required": {
            "clip": ("CLIP", ),
            "bert": ("STRING", {"multiline": True, "dynamicPrompts": True}),
            "mt5xl": ("STRING", {"multiline": True, "dynamicPrompts": True}),
            }}
    RETURN_TYPES = ("CONDITIONING",)
    FUNCTION = "encode"

    CATEGORY = "advanced/conditioning"

    def encode(self, clip, bert, mt5xl):
        # HunyuanDiT uses two text encoders: tokenize the BERT prompt first,
        # then merge the mT5-XL tokens into the same token dict before encoding.
        tokens = clip.tokenize(bert)
        tokens["mt5xl"] = clip.tokenize(mt5xl)["mt5xl"]
        return (clip.encode_from_tokens_scheduled(tokens), )

class EmptyHunyuanLatentVideo:
    @classmethod
    def INPUT_TYPES(s):
        return {"required": { "width": ("INT", {"default": 848, "min": 16, "max": nodes.MAX_RESOLUTION, "step": 16}),
                              "height": ("INT", {"default": 480, "min": 16, "max": nodes.MAX_RESOLUTION, "step": 16}),
                              "length": ("INT", {"default": 25, "min": 1, "max": nodes.MAX_RESOLUTION, "step": 4}),
                              "batch_size": ("INT", {"default": 1, "min": 1, "max": 4096})}}
    RETURN_TYPES = ("LATENT",)
    FUNCTION = "generate"

    CATEGORY = "latent/video"

    def generate(self, width, height, length, batch_size=1):
        # The video latent has 16 channels, is downsampled 4x in time
        # (plus the first frame) and 8x spatially.
        latent = torch.zeros([batch_size, 16, ((length - 1) // 4) + 1, height // 8, width // 8], device=comfy.model_management.intermediate_device())
        return ({"samples":latent}, )
PROMPT_TEMPLATE_ENCODE_VIDEO_I2V = (
    "<|start_header_id|>system<|end_header_id|>\n\n<image>\nDescribe the video by detailing the following aspects according to the reference image: "
    "1. The main content and theme of the video."
    "2. The color, shape, size, texture, quantity, text, and spatial relationships of the objects."
    "3. Actions, events, behaviors temporal relationships, physical movement changes of the objects."
    "4. background environment, light, style and atmosphere."
    "5. camera angles, movements, and transitions used in the video:<|eot_id|>\n\n"
    "<|start_header_id|>user<|end_header_id|>\n\n{}<|eot_id|>"
    "<|start_header_id|>assistant<|end_header_id|>\n\n"
)
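
# The "{}" in the user turn above is the slot for the prompt text when this template is
# passed as llama_template to clip.tokenize() in the node below.
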
class TextEncodeHunyuanVideo_ImageToVideo:
    @classmethod
    def INPUT_TYPES(s):
        return {"required": {
            "clip": ("CLIP", ),
            "clip_vision_output": ("CLIP_VISION_OUTPUT", ),
            "prompt": ("STRING", {"multiline": True, "dynamicPrompts": True}),
            "image_interleave": ("INT", {"default": 2, "min": 1, "max": 512, "tooltip": "How much the image influences things vs the text prompt. Higher number means more influence from the text prompt."}),
            }}
    RETURN_TYPES = ("CONDITIONING",)
    FUNCTION = "encode"

    CATEGORY = "advanced/conditioning"

    def encode(self, clip, clip_vision_output, prompt, image_interleave):
        tokens = clip.tokenize(prompt, llama_template=PROMPT_TEMPLATE_ENCODE_VIDEO_I2V, image_embeds=clip_vision_output.mm_projected, image_interleave=image_interleave)
        return (clip.encode_from_tokens_scheduled(tokens), )

class HunyuanImageToVideo:
    @classmethod
    def INPUT_TYPES(s):
        return {"required": {"positive": ("CONDITIONING", ),
                             "vae": ("VAE", ),
                             "width": ("INT", {"default": 848, "min": 16, "max": nodes.MAX_RESOLUTION, "step": 16}),
                             "height": ("INT", {"default": 480, "min": 16, "max": nodes.MAX_RESOLUTION, "step": 16}),
                             "length": ("INT", {"default": 53, "min": 1, "max": nodes.MAX_RESOLUTION, "step": 4}),
                             "batch_size": ("INT", {"default": 1, "min": 1, "max": 4096}),
                             "guidance_type": (["v1 (concat)", "v2 (replace)", "custom"], )
                             },
                "optional": {"start_image": ("IMAGE", ),
                             }}

    RETURN_TYPES = ("CONDITIONING", "LATENT")
    RETURN_NAMES = ("positive", "latent")
    FUNCTION = "encode"

    CATEGORY = "conditioning/video_models"

    def encode(self, positive, vae, width, height, length, batch_size, guidance_type, start_image=None):
        latent = torch.zeros([batch_size, 16, ((length - 1) // 4) + 1, height // 8, width // 8], device=comfy.model_management.intermediate_device())
        out_latent = {}

        if start_image is not None:
            start_image = comfy.utils.common_upscale(start_image[:length, :, :, :3].movedim(-1, 1), width, height, "bilinear", "center").movedim(1, -1)

            concat_latent_image = vae.encode(start_image)
            # mask: 1.0 for latent frames to generate, 0.0 over the frames covered by the start image.
            mask = torch.ones((1, 1, latent.shape[2], concat_latent_image.shape[-2], concat_latent_image.shape[-1]), device=start_image.device, dtype=start_image.dtype)
            mask[:, :, :((start_image.shape[0] - 1) // 4) + 1] = 0.0

            if guidance_type == "v1 (concat)":
                # v1: pass the encoded start image and its mask as concat conditioning.
                cond = {"concat_latent_image": concat_latent_image, "concat_mask": mask}
            elif guidance_type == "v2 (replace)":
                # v2: copy the start image latents into the beginning of the latent and
                # protect them from denoising via the noise mask.
                cond = {'guiding_frame_index': 0}
                latent[:, :, :concat_latent_image.shape[2]] = concat_latent_image
                out_latent["noise_mask"] = mask
            elif guidance_type == "custom":
                cond = {"ref_latent": concat_latent_image}

            positive = node_helpers.conditioning_set_values(positive, cond)

        out_latent["samples"] = latent
        return (positive, out_latent)

NODE_CLASS_MAPPINGS = {
    "CLIPTextEncodeHunyuanDiT": CLIPTextEncodeHunyuanDiT,
    "TextEncodeHunyuanVideo_ImageToVideo": TextEncodeHunyuanVideo_ImageToVideo,
    "EmptyHunyuanLatentVideo": EmptyHunyuanLatentVideo,
    "HunyuanImageToVideo": HunyuanImageToVideo,
}
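
# Rough usage sketch outside the ComfyUI graph (assumes `clip`, `vae`, `clip_vision_output`
# and an `image` tensor were already produced by the usual loader/encoder nodes; these
# variable names are placeholders, not part of this module):
#
#   positive, = TextEncodeHunyuanVideo_ImageToVideo().encode(
#       clip, clip_vision_output, "a cat walking across a desk", image_interleave=2)
#   positive, latent = HunyuanImageToVideo().encode(
#       positive, vae, 848, 480, 53, 1, "v2 (replace)", start_image=image)
#
# `positive` then feeds a sampler's conditioning input and `latent` is the latent to denoise.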