"""Streamlit demo: classify an uploaded image as "hot dog" or "not hot dog".

Run with ``streamlit run <this file>``. The user uploads an image, which is
shown in the left column; the classifier's label probabilities are shown in
the right column.
"""

# NOTE: `requests`, `torch`, `AutoProcessor` and `LlavaForConditionalGeneration`
# are only referenced by the commented-out experiments kept in this file.
import requests
import torch
import streamlit as st
from transformers import pipeline, AutoProcessor, LlavaForConditionalGeneration
from PIL import Image

# Hugging Face Hub id of the image-classification checkpoint used by the app.
MODEL_ID = "julien-c/hotdog-not-hotdog"


@st.cache_resource
def load_classifier():
    """Return the image-classification pipeline, building it only once.

    Streamlit re-executes the whole script on every user interaction, so the
    pipeline is wrapped in ``st.cache_resource`` to avoid re-creating the
    model on each rerun.  The result is bound to its own name instead of
    rebinding ``pipeline``, which would shadow the imported factory function.
    """
    return pipeline(task="image-classification", model=MODEL_ID)


# processor = Blip2Processor.from_pretrained("Salesforce/blip2-opt-2.7b")
# model = Blip2ForConditionalGeneration.from_pretrained("Salesforce/blip2-opt-2.7b")

st.title("Hot Dog? Or Not?")

file_name = st.file_uploader("Upload a hot dog candidate image")

if file_name is not None:
    try:
        image = Image.open(file_name)
        # Image.open is lazy; force decoding here so that a corrupt or
        # truncated upload is reported below rather than crashing later.
        image.load()
    except OSError:
        # PIL.UnidentifiedImageError is a subclass of OSError, so this also
        # covers uploads that are not images at all.
        image = None
        st.error("The uploaded file could not be read as an image.")

    if image is not None:
        col1, col2 = st.columns(2)

        col1.image(image, use_column_width=True)
        predictions = load_classifier()(image)

        col2.header("Probabilities")
        for p in predictions:
            # Each prediction is a dict with a 'label' and a 'score' in [0, 1];
            # the score is displayed as a percentage with one decimal place.
            col2.subheader(f"{p['label']}: {round(p['score'] * 100, 1)}%")

# --- Disabled experiment: BLIP-2 visual question answering -------------------
# img_url = 'https://storage.googleapis.com/sfr-vision-language-research/BLIP/demo.jpg'
# raw_image = Image.open(requests.get(img_url, stream=True).raw).convert('RGB')
#
# question = "how many dogs are in the picture?"
# inputs = processor(raw_image, question, return_tensors="pt")
#
# out = model.generate(**inputs)
# print(processor.decode(out[0], skip_special_tokens=True).strip())
#
# --- Disabled experiment: LLaVA 1.5 image description ------------------------
# model_id = "llava-hf/llava-1.5-7b-hf"
#
# prompt = "USER: \nWhat are these?\nASSISTANT:"
# image_file = "http://images.cocodataset.org/val2017/000000039769.jpg"
#
# model = LlavaForConditionalGeneration.from_pretrained(
#     model_id,
#     torch_dtype=torch.float16,
#     low_cpu_mem_usage=True,
# ).to(0)
#
# processor = AutoProcessor.from_pretrained(model_id)
#
#
# raw_image = Image.open(requests.get(image_file, stream=True).raw)
# inputs = processor(prompt, raw_image, return_tensors='pt').to(0, torch.float16)
#
# output = model.generate(**inputs, max_new_tokens=200, do_sample=False)
# print(processor.decode(output[0][2:], skip_special_tokens=True))