import requests
import torch
import streamlit as st
from transformers import pipeline, AutoProcessor, LlavaForConditionalGeneration
from PIL import Image

@st.cache_resource
def _load_classifier():
    """Build the hot-dog image-classification pipeline once per process.

    Streamlit re-runs this script from the top on every user interaction;
    caching the pipeline as a resource keeps the model from being reloaded
    on each rerun.
    """
    return pipeline(task="image-classification", model="julien-c/hotdog-not-hotdog")


# Bound to a distinct name so the imported ``pipeline`` factory is not shadowed.
classifier = _load_classifier()
# processor = Blip2Processor.from_pretrained("Salesforce/blip2-opt-2.7b")
# model = Blip2ForConditionalGeneration.from_pretrained("Salesforce/blip2-opt-2.7b")

st.title("Hot Dog? Or Not?")
uploaded_file = st.file_uploader("Upload a hot dog candidate image")

if uploaded_file is not None:
    try:
        image = Image.open(uploaded_file)
        # Image.open is lazy; decode now so unreadable files fail here
        # rather than later inside the display or classification call.
        image.load()
    except OSError:
        # The uploader accepts any file type, so non-image or corrupt
        # uploads are reported to the user instead of crashing the app.
        st.error("Could not read the uploaded file as an image.")
    else:
        col1, col2 = st.columns(2)

        # Left column: the uploaded picture.
        col1.image(image, use_column_width=True)

        # Right column: one line per predicted label with its score in percent.
        predictions = classifier(image)
        col2.header("Probabilities")
        for prediction in predictions:
            label = prediction["label"]
            percent = round(prediction["score"] * 100, 1)
            col2.subheader(f"{label}: {percent}%")

# img_url = 'https://storage.googleapis.com/sfr-vision-language-research/BLIP/demo.jpg'
# raw_image = Image.open(requests.get(img_url, stream=True).raw).convert('RGB')
#
# question = "how many dogs are in the picture?"
# inputs = processor(raw_image, question, return_tensors="pt")
#
# out = model.generate(**inputs)
# print(processor.decode(out[0], skip_special_tokens=True).strip())

#
# model_id = "llava-hf/llava-1.5-7b-hf"
#
# prompt = "USER: <image>\nWhat are these?\nASSISTANT:"
# image_file = "http://images.cocodataset.org/val2017/000000039769.jpg"
#
# model = LlavaForConditionalGeneration.from_pretrained(
#     model_id,
#     torch_dtype=torch.float16,
#     low_cpu_mem_usage=True,
# ).to(0)
#
# processor = AutoProcessor.from_pretrained(model_id)
#
#
# raw_image = Image.open(requests.get(image_file, stream=True).raw)
# inputs = processor(prompt, raw_image, return_tensors='pt').to(0, torch.float16)
#
# output = model.generate(**inputs, max_new_tokens=200, do_sample=False)
# print(processor.decode(output[0][2:], skip_special_tokens=True))