import os
from typing import Any, Dict

import clip
import torch

from utils import MLP, normalized

# Select the GPU when one is available; otherwise run on the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")


class EndpointHandler:
    """Inference handler that scores an image with CLIP plus an MLP head.

    The handler encodes the image with the CLIP ``ViT-L/14`` model, passes
    the embedding through ``normalized`` (imported from ``utils``), and feeds
    the result to an ``MLP`` head whose weights are loaded from
    ``sac+logos+ava1-l14-linearMSE.pth``.
    """

    def __init__(self, path: str = "") -> None:
        """Load the MLP head and the CLIP model onto ``device``.

        Args:
            path: Directory containing ``sac+logos+ava1-l14-linearMSE.pth``.
        """
        # 768 is the input size handed to the MLP head; presumably the
        # ViT-L/14 image-embedding width -- confirm against utils.MLP.
        classifier = MLP(768)
        checkpoint = os.path.join(path, "sac+logos+ava1-l14-linearMSE.pth")
        # NOTE(review): torch.load unpickles the file; only point `path` at
        # a trusted checkpoint.
        state = torch.load(checkpoint, map_location=device)
        classifier.load_state_dict(state)
        classifier.to(device)
        classifier.eval()

        clip_model, preprocess = clip.load("ViT-L/14", device=device)

        # Everything __call__ needs is kept in one dict.
        self.model_dict = {
            "classifier": classifier,
            "clip_model": clip_model,
            "clip_preprocess": preprocess,
            "device": device,
        }

    def __call__(self, data: Dict[str, Any]) -> Dict[str, float]:
        """Score one image.

        Args:
            data: Request payload. The image is taken from ``data["inputs"]``
                (the key is removed from ``data``); when that key is absent,
                ``data`` itself is handed to the CLIP preprocessing
                transform. The image is presumably a ``PIL.Image`` -- that is
                what the original docstring stated.

        Returns:
            A dict of the form ``{"aesthetic score": <float>}``.
        """
        target = self.model_dict["device"]

        # Extract the image from the request payload.
        image = data.pop("inputs", data)
        batch = self.model_dict["clip_preprocess"](image).unsqueeze(0).to(target)

        # Pure inference: keep both forward passes under no_grad so that no
        # autograd graph is built for the CLIP encoder or the MLP head.
        with torch.no_grad():
            features = self.model_dict["clip_model"].encode_image(batch)

            # `normalized` operates on a NumPy array, so round-trip through
            # host memory (.cpu() is a no-op when already on the CPU), then
            # move back to the target device as float32 for the MLP head.
            embedding = normalized(features.cpu().numpy())
            im_emb = torch.from_numpy(embedding).to(
                device=target, dtype=torch.float32
            )

            prediction = self.model_dict["classifier"](im_emb)

        return {"aesthetic score": prediction.item()}