Spaces:
Running
Running
| import argparse | |
| def _get_model_args(parser: argparse.ArgumentParser) -> None: | |
| parser.add_argument( | |
| "--pretrained_model_name_or_path", | |
| type=str, | |
| default=None, | |
| required=True, | |
| help="Path to pretrained model or model identifier from huggingface.co/models.", | |
| ) | |
| parser.add_argument( | |
| "--revision", | |
| type=str, | |
| default=None, | |
| required=False, | |
| help="Revision of pretrained model identifier from huggingface.co/models.", | |
| ) | |
| parser.add_argument( | |
| "--variant", | |
| type=str, | |
| default=None, | |
| help="Variant of the model files of the pretrained model identifier from huggingface.co/models, 'e.g.' fp16", | |
| ) | |
| parser.add_argument( | |
| "--cache_dir", | |
| type=str, | |
| default=None, | |
| help="The directory where the downloaded models and datasets will be stored.", | |
| ) | |
| def _get_dataset_args(parser: argparse.ArgumentParser) -> None: | |
| parser.add_argument( | |
| "--data_root", | |
| type=str, | |
| default=None, | |
| help=("A folder containing the training data."), | |
| ) | |
| parser.add_argument( | |
| "--dataset_file", | |
| type=str, | |
| default=None, | |
| help=("Path to a CSV file if loading prompts/video paths using this format."), | |
| ) | |
| parser.add_argument( | |
| "--video_column", | |
| type=str, | |
| default="video", | |
| help="The column of the dataset containing videos. Or, the name of the file in `--data_root` folder containing the line-separated path to video data.", | |
| ) | |
| parser.add_argument( | |
| "--caption_column", | |
| type=str, | |
| default="text", | |
| help="The column of the dataset containing the instance prompt for each video. Or, the name of the file in `--data_root` folder containing the line-separated instance prompts.", | |
| ) | |
| parser.add_argument( | |
| "--id_token", | |
| type=str, | |
| default=None, | |
| help="Identifier token appended to the start of each prompt if provided.", | |
| ) | |
| parser.add_argument( | |
| "--height_buckets", | |
| nargs="+", | |
| type=int, | |
| default=[256, 320, 384, 480, 512, 576, 720, 768, 960, 1024, 1280, 1536], | |
| ) | |
| parser.add_argument( | |
| "--width_buckets", | |
| nargs="+", | |
| type=int, | |
| default=[256, 320, 384, 480, 512, 576, 720, 768, 960, 1024, 1280, 1536], | |
| ) | |
| parser.add_argument( | |
| "--frame_buckets", | |
| nargs="+", | |
| type=int, | |
| default=[49], | |
| help="CogVideoX1.5 need to guarantee that ((num_frames - 1) // self.vae_scale_factor_temporal + 1) % patch_size_t == 0, such as 53" | |
| ) | |
| parser.add_argument( | |
| "--load_tensors", | |
| action="store_true", | |
| help="Whether to use a pre-encoded tensor dataset of latents and prompt embeddings instead of videos and text prompts. The expected format is that saved by running the `prepare_dataset.py` script.", | |
| ) | |
| parser.add_argument( | |
| "--random_flip", | |
| type=float, | |
| default=None, | |
| help="If random horizontal flip augmentation is to be used, this should be the flip probability.", | |
| ) | |
| parser.add_argument( | |
| "--dataloader_num_workers", | |
| type=int, | |
| default=0, | |
| help="Number of subprocesses to use for data loading. 0 means that the data will be loaded in the main process.", | |
| ) | |
| parser.add_argument( | |
| "--pin_memory", | |
| action="store_true", | |
| help="Whether or not to use the pinned memory setting in pytorch dataloader.", | |
| ) | |
| def _get_validation_args(parser: argparse.ArgumentParser) -> None: | |
| parser.add_argument( | |
| "--validation_prompt", | |
| type=str, | |
| default=None, | |
| help="One or more prompt(s) that is used during validation to verify that the model is learning. Multiple validation prompts should be separated by the '--validation_prompt_seperator' string.", | |
| ) | |
| parser.add_argument( | |
| "--validation_images", | |
| type=str, | |
| default=None, | |
| help="One or more image path(s)/URLs that is used during validation to verify that the model is learning. Multiple validation paths should be separated by the '--validation_prompt_seperator' string. These should correspond to the order of the validation prompts.", | |
| ) | |
| parser.add_argument( | |
| "--validation_prompt_separator", | |
| type=str, | |
| default=":::", | |
| help="String that separates multiple validation prompts", | |
| ) | |
| parser.add_argument( | |
| "--num_validation_videos", | |
| type=int, | |
| default=1, | |
| help="Number of videos that should be generated during validation per `validation_prompt`.", | |
| ) | |
| parser.add_argument( | |
| "--validation_epochs", | |
| type=int, | |
| default=None, | |
| help="Run validation every X training epochs. Validation consists of running the validation prompt `args.num_validation_videos` times.", | |
| ) | |
| parser.add_argument( | |
| "--validation_steps", | |
| type=int, | |
| default=None, | |
| help="Run validation every X training steps. Validation consists of running the validation prompt `args.num_validation_videos` times.", | |
| ) | |
| parser.add_argument( | |
| "--guidance_scale", | |
| type=float, | |
| default=6, | |
| help="The guidance scale to use while sampling validation videos.", | |
| ) | |
| parser.add_argument( | |
| "--use_dynamic_cfg", | |
| action="store_true", | |
| default=False, | |
| help="Whether or not to use the default cosine dynamic guidance schedule when sampling validation videos.", | |
| ) | |
| parser.add_argument( | |
| "--enable_model_cpu_offload", | |
| action="store_true", | |
| default=False, | |
| help="Whether or not to enable model-wise CPU offloading when performing validation/testing to save memory.", | |
| ) | |
| def _get_training_args(parser: argparse.ArgumentParser) -> None: | |
| parser.add_argument("--seed", type=int, default=None, help="A seed for reproducible training.") | |
| parser.add_argument("--rank", type=int, default=64, help="The rank for LoRA matrices.") | |
| parser.add_argument( | |
| "--lora_alpha", | |
| type=int, | |
| default=64, | |
| help="The lora_alpha to compute scaling factor (lora_alpha / rank) for LoRA matrices.", | |
| ) | |
| parser.add_argument( | |
| "--mixed_precision", | |
| type=str, | |
| default=None, | |
| choices=["no", "fp16", "bf16"], | |
| help=( | |
| "Whether to use mixed precision. Choose between fp16 and bf16 (bfloat16). Bf16 requires PyTorch >= 1.10.and an Nvidia Ampere GPU. " | |
| "Default to the value of accelerate config of the current system or the flag passed with the `accelerate.launch` command. Use this " | |
| "argument to override the accelerate config." | |
| ), | |
| ) | |
| parser.add_argument( | |
| "--output_dir", | |
| type=str, | |
| default="cogvideox-sft", | |
| help="The output directory where the model predictions and checkpoints will be written.", | |
| ) | |
| parser.add_argument( | |
| "--height", | |
| type=int, | |
| default=480, | |
| help="All input videos are resized to this height.", | |
| ) | |
| parser.add_argument( | |
| "--width", | |
| type=int, | |
| default=720, | |
| help="All input videos are resized to this width.", | |
| ) | |
| parser.add_argument( | |
| "--video_reshape_mode", | |
| type=str, | |
| default=None, | |
| help="All input videos are reshaped to this mode. Choose between ['center', 'random', 'none']", | |
| ) | |
| parser.add_argument("--fps", type=int, default=8, help="All input videos will be used at this FPS.") | |
| parser.add_argument( | |
| "--max_num_frames", | |
| type=int, | |
| default=49, | |
| help="All input videos will be truncated to these many frames.", | |
| ) | |
| parser.add_argument( | |
| "--skip_frames_start", | |
| type=int, | |
| default=0, | |
| help="Number of frames to skip from the beginning of each input video. Useful if training data contains intro sequences.", | |
| ) | |
| parser.add_argument( | |
| "--skip_frames_end", | |
| type=int, | |
| default=0, | |
| help="Number of frames to skip from the end of each input video. Useful if training data contains outro sequences.", | |
| ) | |
| parser.add_argument( | |
| "--train_batch_size", | |
| type=int, | |
| default=4, | |
| help="Batch size (per device) for the training dataloader.", | |
| ) | |
| parser.add_argument("--num_train_epochs", type=int, default=1) | |
| parser.add_argument( | |
| "--max_train_steps", | |
| type=int, | |
| default=None, | |
| help="Total number of training steps to perform. If provided, overrides `--num_train_epochs`.", | |
| ) | |
| parser.add_argument( | |
| "--checkpointing_steps", | |
| type=int, | |
| default=500, | |
| help=( | |
| "Save a checkpoint of the training state every X updates. These checkpoints can be used both as final" | |
| " checkpoints in case they are better than the last checkpoint, and are also suitable for resuming" | |
| " training using `--resume_from_checkpoint`." | |
| ), | |
| ) | |
| parser.add_argument( | |
| "--checkpoints_total_limit", | |
| type=int, | |
| default=None, | |
| help=("Max number of checkpoints to store."), | |
| ) | |
| parser.add_argument( | |
| "--resume_from_checkpoint", | |
| type=str, | |
| default=None, | |
| help=( | |
| "Whether training should be resumed from a previous checkpoint. Use a path saved by" | |
| ' `--checkpointing_steps`, or `"latest"` to automatically select the last available checkpoint.' | |
| ), | |
| ) | |
| parser.add_argument( | |
| "--gradient_accumulation_steps", | |
| type=int, | |
| default=1, | |
| help="Number of updates steps to accumulate before performing a backward/update pass.", | |
| ) | |
| parser.add_argument( | |
| "--gradient_checkpointing", | |
| action="store_true", | |
| help="Whether or not to use gradient checkpointing to save memory at the expense of slower backward pass.", | |
| ) | |
| parser.add_argument( | |
| "--learning_rate", | |
| type=float, | |
| default=1e-4, | |
| help="Initial learning rate (after the potential warmup period) to use.", | |
| ) | |
| parser.add_argument( | |
| "--scale_lr", | |
| action="store_true", | |
| default=False, | |
| help="Scale the learning rate by the number of GPUs, gradient accumulation steps, and batch size.", | |
| ) | |
| parser.add_argument( | |
| "--lr_scheduler", | |
| type=str, | |
| default="constant", | |
| help=( | |
| 'The scheduler type to use. Choose between ["linear", "cosine", "cosine_with_restarts", "polynomial",' | |
| ' "constant", "constant_with_warmup"]' | |
| ), | |
| ) | |
| parser.add_argument( | |
| "--lr_warmup_steps", | |
| type=int, | |
| default=500, | |
| help="Number of steps for the warmup in the lr scheduler.", | |
| ) | |
| parser.add_argument( | |
| "--lr_num_cycles", | |
| type=int, | |
| default=1, | |
| help="Number of hard resets of the lr in cosine_with_restarts scheduler.", | |
| ) | |
| parser.add_argument( | |
| "--lr_power", | |
| type=float, | |
| default=1.0, | |
| help="Power factor of the polynomial scheduler.", | |
| ) | |
| parser.add_argument( | |
| "--enable_slicing", | |
| action="store_true", | |
| default=False, | |
| help="Whether or not to use VAE slicing for saving memory.", | |
| ) | |
| parser.add_argument( | |
| "--enable_tiling", | |
| action="store_true", | |
| default=False, | |
| help="Whether or not to use VAE tiling for saving memory.", | |
| ) | |
| parser.add_argument( | |
| "--noised_image_dropout", | |
| type=float, | |
| default=0.05, | |
| help="Image condition dropout probability when finetuning image-to-video.", | |
| ) | |
| parser.add_argument( | |
| "--ignore_learned_positional_embeddings", | |
| action="store_true", | |
| default=False, | |
| help=( | |
| "Whether to ignore the learned positional embeddings when training CogVideoX Image-to-Video. This setting " | |
| "should be used when performing multi-resolution training, because CogVideoX-I2V does not support it " | |
| "otherwise. Please read the comments in https://github.com/a-r-r-o-w/cogvideox-factory/issues/26 to understand why." | |
| ), | |
| ) | |
| def _get_optimizer_args(parser: argparse.ArgumentParser) -> None: | |
| parser.add_argument( | |
| "--optimizer", | |
| type=lambda s: s.lower(), | |
| default="adam", | |
| choices=["adam", "adamw", "prodigy", "came"], | |
| help=("The optimizer type to use."), | |
| ) | |
| parser.add_argument( | |
| "--use_8bit", | |
| action="store_true", | |
| help="Whether or not to use 8-bit optimizers from `bitsandbytes` or `bitsandbytes`.", | |
| ) | |
| parser.add_argument( | |
| "--use_4bit", | |
| action="store_true", | |
| help="Whether or not to use 4-bit optimizers from `torchao`.", | |
| ) | |
| parser.add_argument( | |
| "--use_torchao", action="store_true", help="Whether or not to use the `torchao` backend for optimizers." | |
| ) | |
| parser.add_argument( | |
| "--beta1", | |
| type=float, | |
| default=0.9, | |
| help="The beta1 parameter for the Adam and Prodigy optimizers.", | |
| ) | |
| parser.add_argument( | |
| "--beta2", | |
| type=float, | |
| default=0.95, | |
| help="The beta2 parameter for the Adam and Prodigy optimizers.", | |
| ) | |
| parser.add_argument( | |
| "--beta3", | |
| type=float, | |
| default=None, | |
| help="Coefficients for computing the Prodigy optimizer's stepsize using running averages. If set to None, uses the value of square root of beta2.", | |
| ) | |
| parser.add_argument( | |
| "--prodigy_decouple", | |
| action="store_true", | |
| help="Use AdamW style decoupled weight decay.", | |
| ) | |
| parser.add_argument( | |
| "--weight_decay", | |
| type=float, | |
| default=1e-04, | |
| help="Weight decay to use for optimizer.", | |
| ) | |
| parser.add_argument( | |
| "--epsilon", | |
| type=float, | |
| default=1e-8, | |
| help="Epsilon value for the Adam optimizer and Prodigy optimizers.", | |
| ) | |
| parser.add_argument("--max_grad_norm", default=1.0, type=float, help="Max gradient norm.") | |
| parser.add_argument( | |
| "--prodigy_use_bias_correction", | |
| action="store_true", | |
| help="Turn on Adam's bias correction.", | |
| ) | |
| parser.add_argument( | |
| "--prodigy_safeguard_warmup", | |
| action="store_true", | |
| help="Remove lr from the denominator of D estimate to avoid issues during warm-up stage.", | |
| ) | |
| parser.add_argument( | |
| "--use_cpu_offload_optimizer", | |
| action="store_true", | |
| help="Whether or not to use the CPUOffloadOptimizer from TorchAO to perform optimization step and maintain parameters on the CPU.", | |
| ) | |
| parser.add_argument( | |
| "--offload_gradients", | |
| action="store_true", | |
| help="Whether or not to offload the gradients to CPU when using the CPUOffloadOptimizer from TorchAO.", | |
| ) | |
| def _get_configuration_args(parser: argparse.ArgumentParser) -> None: | |
| parser.add_argument("--tracker_name", type=str, default=None, help="Project tracker name") | |
| parser.add_argument( | |
| "--push_to_hub", | |
| action="store_true", | |
| help="Whether or not to push the model to the Hub.", | |
| ) | |
| parser.add_argument( | |
| "--hub_token", | |
| type=str, | |
| default=None, | |
| help="The token to use to push to the Model Hub.", | |
| ) | |
| parser.add_argument( | |
| "--hub_model_id", | |
| type=str, | |
| default=None, | |
| help="The name of the repository to keep in sync with the local `output_dir`.", | |
| ) | |
| parser.add_argument( | |
| "--logging_dir", | |
| type=str, | |
| default="logs", | |
| help="Directory where logs are stored.", | |
| ) | |
| parser.add_argument( | |
| "--allow_tf32", | |
| action="store_true", | |
| help=( | |
| "Whether or not to allow TF32 on Ampere GPUs. Can be used to speed up training. For more information, see" | |
| " https://pytorch.org/docs/stable/notes/cuda.html#tensorfloat-32-tf32-on-ampere-devices" | |
| ), | |
| ) | |
| parser.add_argument( | |
| "--nccl_timeout", | |
| type=int, | |
| default=600, | |
| help="Maximum timeout duration before which allgather, or related, operations fail in multi-GPU/multi-node training settings.", | |
| ) | |
| parser.add_argument( | |
| "--report_to", | |
| type=str, | |
| default=None, | |
| help=( | |
| 'The integration to report the results and logs to. Supported platforms are `"tensorboard"`' | |
| ' (default), `"wandb"` and `"comet_ml"`. Use `"all"` to report to all integrations.' | |
| ), | |
| ) | |
def get_args():
    """Build the full CogVideoX training CLI parser and parse `sys.argv`.

    Each `_get_*_args` helper registers one group of flags; registration
    order determines the order shown in `--help`.
    """
    parser = argparse.ArgumentParser(description="Simple example of a training script for CogVideoX.")
    registrars = (
        _get_model_args,
        _get_dataset_args,
        _get_training_args,
        _get_validation_args,
        _get_optimizer_args,
        _get_configuration_args,
    )
    for register in registrars:
        register(parser)
    return parser.parse_args()