from typing import Dict, List, Any
import torch
from torch import autocast
from huggingface_hub import hf_hub_download
from diffusers import DiffusionPipeline
import base64
from io import BytesIO
from cog_sdxl.dataset_and_utils import TokenEmbeddingsHandler


# Pick the GPU when CUDA is usable, otherwise fall back to the CPU.
device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")
print("device ~>", device)


class EndpointHandler:
    """Endpoint handler that turns a text prompt into an SDXL emoji image.

    Construction loads the ``stabilityai/stable-diffusion-xl-base-1.0``
    pipeline, applies the ``SvenN/sdxl-emoji`` LoRA weights and loads that
    repository's ``embeddings.pti`` file through ``TokenEmbeddingsHandler``.
    Calling the instance runs the pipeline on a prompt and returns the first
    generated image as a base64-encoded JPEG string.
    """

    def __init__(self, path=""):
        """Build the pipeline and attach the LoRA weights and embeddings.

        Args:
            path: Only printed here; every model artifact is resolved from a
                hard-coded Hub repository id rather than from this path.
        """
        print("path ~>", path)

        # Half precision is requested only on CUDA; on CPU the dtype is left
        # to the library default.
        self.pipe = DiffusionPipeline.from_pretrained(
            "stabilityai/stable-diffusion-xl-base-1.0",
            torch_dtype=torch.float16 if device.type == "cuda" else None,
            variant="fp16",
        ).to(device)

        self.pipe.load_lora_weights("SvenN/sdxl-emoji", weight_name="lora.safetensors")

        # SDXL pipelines expose two text encoders and two tokenizers; both
        # pairs are handed to the embeddings handler.
        text_encoders = [self.pipe.text_encoder, self.pipe.text_encoder_2]
        tokenizers = [self.pipe.tokenizer, self.pipe.tokenizer_2]

        embedding_path = hf_hub_download(
            repo_id="SvenN/sdxl-emoji", filename="embeddings.pti", repo_type="model"
        )
        # NOTE(review): presumably this registers the <s0>/<s1> trigger tokens
        # used in __call__ on the encoders/tokenizers -- confirm against
        # cog_sdxl.dataset_and_utils.
        embhandler = TokenEmbeddingsHandler(text_encoders, tokenizers)
        embhandler.load_embeddings(embedding_path)

    def __call__(self, data: Any) -> Dict[str, str]:
        """Generate one image for the request and return it base64-encoded.

        Args:
            data (:obj:):
                The request payload. When it is a dict, the prompt is read
                from its ``"inputs"`` key; if that key is absent the payload
                itself is formatted into the prompt. A non-dict payload
                (e.g. a plain string) is used as the prompt directly. The
                payload is never modified.
        Return:
            A :obj:`dict`: ``{"image": <str>}`` where the value is the first
            generated image, saved as JPEG and base64-encoded as text.
        """
        # Use a non-mutating lookup so the caller's payload keeps its
        # "inputs" entry after the call.
        if isinstance(data, dict):
            inputs = data.get("inputs", data)
        else:
            inputs = data

        # Automatically add trigger tokens to the beginning of the prompt
        full_prompt = f"A <s0><s1> {inputs}"
        images = self.pipe(
            full_prompt,
            # Forwarded to the pipeline unchanged; see the diffusers LoRA
            # docs for the meaning of "scale".
            cross_attention_kwargs={"scale": 0.8},
        ).images
        image = images[0]

        # encode image as base 64
        buffered = BytesIO()
        image.save(buffered, format="JPEG")
        img_str = base64.b64encode(buffered.getvalue())

        # postprocess the prediction
        return {"image": img_str.decode()}


if __name__ == "__main__":
    # Manual smoke test: build the handler, show it, then run one sample
    # request and print the resulting payload.
    sample_request = {"inputs": "emoji of a tiger face, white background"}
    endpoint = EndpointHandler()
    print(endpoint)
    print(endpoint(sample_request))