import torch
from transformers import AutoModelForCausalLM, AutoTokenizer

MODEL_NAME = "lumolabs-ai/Lumo-8B-Instruct"


class ModelHandler:
    def __init__(self):
        """Load the model and tokenizer"""
        self.device = "cuda" if torch.cuda.is_available() else "cpu"
        print(f"Loading model on {self.device}...")
        self.tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)
        self.model = AutoModelForCausalLM.from_pretrained(MODEL_NAME).to(self.device)
        print("Model loaded successfully.")

    def __call__(self, inputs):
        """Handle inference requests"""
        text = inputs.get("inputs", "")
        if not text:
            return {"error": "No input provided"}

        # Tokenize input and move tensors to the model's device
        input_tokens = self.tokenizer(text, return_tensors="pt").to(self.device)

        # Generate output without tracking gradients
        with torch.no_grad():
            output_tokens = self.model.generate(**input_tokens, max_length=200)

        # Decode output
        response = self.tokenizer.decode(output_tokens[0], skip_special_tokens=True)
        return {"response": response}