import time
from typing import Any

import torch
from transformers import AutoModelForCausalLM, AutoTokenizer


def extract_assistant_response_simple(response_text):
    # Keep only the text between the assistant header and the end-of-turn token.
    parts = response_text.split("<|start_header_id|>assistant<|end_header_id|>")[
        1
    ].split("<|eot_id|>")[0]
    return parts.strip()


class EndpointHandler:
    def __init__(self, path=""):
        self.tokenizer = AutoTokenizer.from_pretrained(path)
        self.model = AutoModelForCausalLM.from_pretrained(
            path,
            torch_dtype="auto",
        ).to("cuda")  # move weights to the GPU so they match the CUDA inputs below

    def __call__(self, data: Any):
        start = time.perf_counter()
        text = data.pop("inputs", data)

        messages = [
            {"role": "system", "content": "You are a helpful assistant."},
            {"role": "user", "content": text},
        ]
        inputs = self.tokenizer.apply_chat_template(
            messages,
            tokenize=True,
            add_generation_prompt=True,  # Must add for generation
            return_tensors="pt",
        ).to("cuda")
        print(f"inputs={inputs}")

        # Force the Flash Attention scaled-dot-product kernel for generation.
        with torch.backends.cuda.sdp_kernel(
            enable_flash=True, enable_math=False, enable_mem_efficient=False
        ):
            outputs = self.model.generate(
                input_ids=inputs,
                max_new_tokens=64,
                use_cache=False,
                do_sample=True,  # temperature / min_p only take effect when sampling
                temperature=1.5,
                min_p=0.1,
            )

        # Count only the newly generated tokens, not the prompt.
        response_length = len(outputs[0]) - inputs.shape[-1]
        response = extract_assistant_response_simple(self.tokenizer.decode(outputs[0]))

        end = time.perf_counter()
        elapsed = end - start
        return {
            "response": response,
            "response_token_length": response_length,
            "elapsed": elapsed,
        }
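
# A minimal local smoke-test sketch, not part of the Inference Endpoints contract:
# it assumes a CUDA GPU is available and that "path/to/model" is replaced with a real
# Llama-style chat checkpoint (the path here is a placeholder, not from the original).
# In production, Inference Endpoints instantiates EndpointHandler and calls it with a
# {"inputs": ...} payload on your behalf.
if __name__ == "__main__":
    handler = EndpointHandler(path="path/to/model")  # hypothetical local checkpoint path
    result = handler({"inputs": "What is the capital of France?"})
    print(result["response"])
    print(result["response_token_length"], "new tokens in", result["elapsed"], "seconds")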