| from PIL import Image | |
| import requests | |
| from transformers import CLIPProcessor, CLIPModel | |
# Checkpoint used for zero-shot classification.
# NOTE(review): the checkpoint name suggests a CLIP variant fine-tuned on
# RSICD (remote-sensing imagery) -- confirm it is the intended model for
# food photos.
_CLIP_CHECKPOINT = "flax-community/clip-rsicd-v2"

# Candidate classes; each is turned into the prompt "a photo of a <label>".
_FOOD_LABELS = ("food", "not food")

# Holds the (model, processor) pair once loaded so repeated calls do not
# reload the checkpoint.
_clip_cache = {}


def _load_clip():
    """Return the cached ``(model, processor)`` pair, loading it on first use."""
    if "pair" not in _clip_cache:
        model = CLIPModel.from_pretrained(_CLIP_CHECKPOINT)
        processor = CLIPProcessor.from_pretrained(_CLIP_CHECKPOINT)
        # Store only after both loads succeeded so a failure never leaves a
        # half-populated cache behind.
        _clip_cache["pair"] = (model, processor)
    return _clip_cache["pair"]


def food_not_food(input_image) -> str:
    """Classify an image as ``"food"`` or ``"not food"`` with zero-shot CLIP.

    The image is scored against the text prompts "a photo of a food" and
    "a photo of a not food"; the label of the best-scoring prompt is returned.

    Args:
        input_image: Image passed straight to ``CLIPProcessor`` as ``images``.
            Presumably a ``PIL.Image.Image`` (the file imports PIL) -- verify
            against callers. If a batch is passed, only the first image's
            label is returned.

    Returns:
        Either ``"food"`` or ``"not food"``.
    """
    # Local import: the module's import block does not bring in torch, but
    # the PyTorch CLIP model used here already requires it.
    import torch

    model, processor = _load_clip()
    prompts = [f"a photo of a {label}" for label in _FOOD_LABELS]
    inputs = processor(
        text=prompts,
        images=input_image,
        return_tensors="pt",
        padding=True,
    )

    # Inference only: skip building the autograd graph.
    with torch.no_grad():
        outputs = model(**inputs)

    # Softmax is monotonic, so the argmax of the raw logits selects the same
    # label as the argmax of the probabilities.
    best_index = int(outputs.logits_per_image.argmax(dim=1)[0])
    return _FOOD_LABELS[best_index]