from transformers import LlamaForCausalLM, LlamaTokenizer, GenerationConfig
from peft import PeftModel


class EndpointHandler():
    def __init__(self):
        self.tokenizer = LlamaTokenizer.from_pretrained(
            "decapoda-research/llama-7b-hf")
        self.tokenizer.pad_token_id = 0

        # Load the base model in 8-bit (requires bitsandbytes) and let
        # accelerate place it on the available device(s).
        self.model = LlamaForCausalLM.from_pretrained(
            "decapoda-research/llama-7b-hf",
            load_in_8bit=True,
            device_map="auto",
        )

        # Attach the LoRA adapter fine-tuned on the Alpaca-formatted dataset.
        self.model = PeftModel.from_pretrained(
            self.model,
            "Nelsonlin0321/alpaca-lora-7b-tuned-on-hk-csv-fqa_causal_lm")

        self.eval_generation_config = GenerationConfig(
            temperature=0.1,
            top_p=0.75,
            num_beams=4)

    @staticmethod
    def generate_prompt_eval(instruction):
        template = f"""Below is an instruction that describes a task. Write a response that appropriately completes the request.

### Instruction:
{instruction}

### Response:"""
        return template

    def __call__(self, instruction: str) -> str:
        prompt = self.generate_prompt_eval(instruction)
        inputs = self.tokenizer(prompt, return_tensors="pt")
        input_ids = inputs["input_ids"].cuda()

        generation_output = self.model.generate(
            input_ids=input_ids,
            generation_config=self.eval_generation_config,
            return_dict_in_generate=True,
            output_scores=True,
            max_new_tokens=256,
        )

        # Decode the first returned sequence and keep only the text that
        # follows the "### Response:" marker of the prompt template.
        sequence = generation_output.sequences[0]
        output = self.tokenizer.decode(sequence)
        return output.split("### Response:")[1].strip()
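

# Minimal usage sketch (not part of the original handler): instantiating the
# class downloads the base weights and the LoRA adapter, so this assumes a
# CUDA-capable GPU with enough memory for the 8-bit 7B model.
if __name__ == "__main__":
    handler = EndpointHandler()
    # Hypothetical instruction; replace with a question from your own FAQ domain.
    answer = handler("How do I apply for a refund?")
    print(answer)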