import time
from typing import Any

import torch
from transformers import AutoModelForCausalLM, AutoTokenizer


def extract_assistant_response_simple(response_text):
    # Keep only the text between the assistant header and the end-of-turn token.
    parts = response_text.split("<|start_header_id|>assistant<|end_header_id|>")[
        1
    ].split("<|eot_id|>")[0]
    return parts.strip()


class EndpointHandler:
    def __init__(self, path=""):
        self.tokenizer = AutoTokenizer.from_pretrained(path)
        self.model = AutoModelForCausalLM.from_pretrained(
            path,
            torch_dtype="auto",
        ).to("cuda")  # move weights to the GPU so they match the CUDA inputs below

    def __call__(self, data: Any):
        start = time.perf_counter()
        text = data.pop("inputs", data)

        messages = [
            {"role": "system", "content": "You are a helpful assistant."},
            {"role": "user", "content": text},
        ]
        inputs = self.tokenizer.apply_chat_template(
            messages,
            tokenize=True,
            add_generation_prompt=True,  # Must add for generation
            return_tensors="pt",
        ).to("cuda")
        print(f"inputs={inputs}")

        # Force the Flash Attention scaled-dot-product kernel for generation.
        with torch.backends.cuda.sdp_kernel(
            enable_flash=True, enable_math=False, enable_mem_efficient=False
        ):
            outputs = self.model.generate(
                input_ids=inputs,
                max_new_tokens=64,
                use_cache=False,
                do_sample=True,  # temperature / min_p only take effect when sampling
                temperature=1.5,
                min_p=0.1,
            )

        # Count only the newly generated tokens, not the prompt.
        response_length = len(outputs[0]) - inputs.shape[-1]
        response = extract_assistant_response_simple(self.tokenizer.decode(outputs[0]))

        end = time.perf_counter()
        elapsed = end - start
        return {
            "response": response,
            "response_token_length": response_length,
            "elapsed": elapsed,
        }
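
# A minimal local smoke-test sketch, not part of the Inference Endpoints contract:
# it assumes a CUDA GPU is available and that "path/to/model" is replaced with a real
# Llama-style chat checkpoint (the path here is a placeholder, not from the original).
# In production, Inference Endpoints instantiates EndpointHandler and calls it with a
# {"inputs": ...} payload on your behalf.
if __name__ == "__main__":
    handler = EndpointHandler(path="path/to/model")  # hypothetical local checkpoint path
    result = handler({"inputs": "What is the capital of France?"})
    print(result["response"])
    print(result["response_token_length"], "new tokens in", result["elapsed"], "seconds")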