import os

from PIL import Image
from huggingface_hub import snapshot_download

from videogen_hub import MODEL_PATH


class ConsistI2V:
    def __init__(self, device="cuda"):
        # Argparse-style namespace expected by the ConsistI2V animate script.
        class Args:
            def __init__(self):
                self.inference_config = "configs/inference/inference.yaml"
                self.prompt = None
                self.n_prompt = ""
                self.seed = "random"
                self.path_to_first_frame = None
                self.prompt_config = "configs/prompts/default.yaml"
                self.format = "mp4"
                self.save_model = False
                self.optional_args = []

        self.args = Args()

        # The inference config must point at the directory the model weights are
        # downloaded into below; backslashes are escaped so the path survives
        # interpolation into the YAML string on Windows.
        model_path = os.path.join(MODEL_PATH, "ConsistI2V").replace("\\", "\\\\")
        yaml_config = f"""
output_dir: "samples/inference"
output_name: "i2v"
pretrained_model_path: "{model_path}"
unet_path: null
unet_ckpt_prefix: "module."
pipeline_pretrained_path: null

sampling_kwargs:
  height: 256
  width: 256
  n_frames: 16
  steps: 50
  ddim_eta: 0.0
  guidance_scale_txt: 7.5
  guidance_scale_img: 1.0
  guidance_rescale: 0.0
  num_videos_per_prompt: 1
  frame_stride: 3

unet_additional_kwargs:
  variant: null
  n_temp_heads: 8
  augment_temporal_attention: true
  temp_pos_embedding: "rotary"  # "rotary" or "sinusoidal"
  first_frame_condition_mode: "concat"
  use_frame_stride_condition: true
  noise_sampling_method: "pyoco_mixed"  # "vanilla" or "pyoco_mixed" or "pyoco_progressive"
  noise_alpha: 1.0

noise_scheduler_kwargs:
  beta_start: 0.00085
  beta_end: 0.012
  beta_schedule: "linear"
  steps_offset: 1
  clip_sample: false
  rescale_betas_zero_snr: false  # true if using zero terminal snr
  timestep_spacing: "leading"  # "trailing" if using zero terminal snr
  prediction_type: "epsilon"  # "v_prediction" if using zero terminal snr

frameinit_kwargs:
  enable: true
  camera_motion: null
  noise_level: 850
  filter_params:
    method: 'gaussian'
    d_s: 0.25
    d_t: 0.25
"""
        from omegaconf import OmegaConf

        self.config = OmegaConf.create(yaml_config)

        # Download the pretrained ConsistI2V weights into the directory
        # referenced by the config above.
        snapshot_download("TIGER-Lab/ConsistI2V", local_dir=model_path)

        from videogen_hub.pipelines.consisti2v.scripts.animate import main

        self.pipeline = main

    def infer_one_video(
        self,
        input_image: Image.Image,
        prompt: str = None,
        size: list = [320, 512],
        seconds: int = 2,
        fps: int = 8,
        seed: int = 42,
    ):
        """
        Generates a single video from a text prompt and a first-frame image.
        The output is a tensor representing the video.

        Args:
            input_image (PIL.Image.Image): The input image used as the first frame of the video.
            prompt (str, optional): The text prompt that guides the video generation. If not specified,
                generation relies solely on the input image. Defaults to None.
            size (list, optional): The resolution of the output video as [height, width].
                Defaults to [320, 512].
            seconds (int, optional): The duration of the video in seconds. Defaults to 2.
            fps (int, optional): The number of frames per second in the generated video.
                Defaults to 8.
            seed (int, optional): A seed value for random number generation, ensuring
                reproducibility of the generation process. Defaults to 42.

        Returns:
            torch.Tensor: The generated video, structured as (time, channel, height, width).
        """
        self.args.prompt = prompt
        self.args.path_to_first_frame = input_image
        self.args.seed = str(seed)

        self.config.sampling_kwargs.height = size[0]
        self.config.sampling_kwargs.width = size[1]
        self.config.sampling_kwargs.n_frames = seconds * fps

        return self.pipeline(self.args, self.config)
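

# Minimal usage sketch (not part of the library API): constructs the wrapper and
# renders a short clip. The image path and prompt below are hypothetical
# placeholders; the weight download and inference only run when this module is
# executed directly.
if __name__ == "__main__":
    first_frame = Image.open("first_frame.png").convert("RGB")  # hypothetical input image
    model = ConsistI2V()
    video = model.infer_one_video(
        input_image=first_frame,
        prompt="a sailboat drifting across a calm lake at sunset",
        size=[320, 512],
        seconds=2,
        fps=8,
        seed=42,
    )
    # The pipeline returns a video tensor shaped (time, channel, height, width).
    print(video.shape)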