import requests | |
import torch | |
import streamlit as st | |
from transformers import pipeline, AutoProcessor, LlavaForConditionalGeneration | |
from PIL import Image | |
@st.cache_resource
def load_classifier():
    """Build the hot-dog image-classification pipeline once per server process.

    Streamlit re-executes this whole script on every widget interaction.
    Caching the pipeline as a resource avoids re-instantiating the model on
    each rerun.
    """
    return pipeline(task="image-classification", model="julien-c/hotdog-not-hotdog")


# Bound to a distinct name so the imported `pipeline` factory is not shadowed
# by its own return value.
classifier = load_classifier()
# processor = Blip2Processor.from_pretrained("Salesforce/blip2-opt-2.7b")
# model = Blip2ForConditionalGeneration.from_pretrained("Salesforce/blip2-opt-2.7b")

st.title("Hot Dog? Or Not?")
file_name = st.file_uploader("Upload a hot dog candidate image")

if file_name is not None:
    try:
        image = Image.open(file_name)
    except OSError:
        # The uploader accepts any file type; PIL's UnidentifiedImageError is
        # an OSError subclass, so a non-image upload lands here instead of
        # surfacing as a raw traceback.
        st.error("The uploaded file could not be read as an image.")
    else:
        # Left column: the uploaded picture. Right column: the model's scores.
        col1, col2 = st.columns(2)
        col1.image(image, use_column_width=True)
        predictions = classifier(image)
        col2.header("Probabilities")
        for p in predictions:
            # Each prediction is read via its 'label' and 'score' keys; the
            # score is scaled by 100 and shown as a percentage with one decimal.
            col2.subheader(f"{p['label']}: {round(p['score'] * 100, 1)}%")
# img_url = 'https://storage.googleapis.com/sfr-vision-language-research/BLIP/demo.jpg' | |
# raw_image = Image.open(requests.get(img_url, stream=True).raw).convert('RGB') | |
# | |
# question = "how many dogs are in the picture?" | |
# inputs = processor(raw_image, question, return_tensors="pt") | |
# | |
# out = model.generate(**inputs) | |
# print(processor.decode(out[0], skip_special_tokens=True).strip()) | |
# | |
# model_id = "llava-hf/llava-1.5-7b-hf" | |
# | |
# prompt = "USER: <image>\nWhat are these?\nASSISTANT:" | |
# image_file = "http://images.cocodataset.org/val2017/000000039769.jpg" | |
# | |
# model = LlavaForConditionalGeneration.from_pretrained( | |
# model_id, | |
# torch_dtype=torch.float16, | |
# low_cpu_mem_usage=True, | |
# ).to(0) | |
# | |
# processor = AutoProcessor.from_pretrained(model_id) | |
# | |
# | |
# raw_image = Image.open(requests.get(image_file, stream=True).raw) | |
# inputs = processor(prompt, raw_image, return_tensors='pt').to(0, torch.float16) | |
# | |
# output = model.generate(**inputs, max_new_tokens=200, do_sample=False) | |
# print(processor.decode(output[0][2:], skip_special_tokens=True)) |