import torch
from transformers import AutoModelForCausalLM, AutoTokenizer

MODEL_NAME = "kasterkeqi/Lumo-8B-Fork-Sol-Copilot"


class EndpointHandler:
    def __init__(self, model_dir=None):
        """Initialize the model and tokenizer."""
        self.device = "cuda" if torch.cuda.is_available() else "cpu"
        print(f"Loading model on {self.device}...")

        self.tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)
        self.model = AutoModelForCausalLM.from_pretrained(
            MODEL_NAME,
            torch_dtype=torch.float16 if torch.cuda.is_available() else torch.float32,
        ).to(self.device)

        print("Model loaded successfully.")

    def __call__(self, inputs):
        """Handle inference requests."""
        prompt = inputs.get("inputs", "")
        if not prompt:
            return {"error": "No input provided"}

        # Tokenize input
        input_tokens = self.tokenizer(prompt, return_tensors="pt").to(self.device)

        # Generate output
        with torch.no_grad():
            output_tokens = self.model.generate(**input_tokens, max_length=200)

        # Decode output
        response = self.tokenizer.decode(output_tokens[0], skip_special_tokens=True)

        return {"response": response}
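
# --- Usage sketch (not part of the original handler; added for illustration) ---
# A minimal local smoke test, assuming the handler is called the way a
# custom inference endpoint would call it: construct it once, then pass a
# payload dict whose "inputs" key holds the prompt string. The prompt text
# below is purely illustrative.
if __name__ == "__main__":
    handler = EndpointHandler()
    payload = {"inputs": "Explain how a Solana program derives a PDA."}
    result = handler(payload)
    print(result.get("response", result))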