"""Streamlit app: Visual Question Answering with Salesforce's BLIP model.

Upload an image, type a question about it, and the BLIP VQA model
generates a short natural-language answer.
"""

import streamlit as st
import torch
from PIL import Image
from transformers import BlipForQuestionAnswering, BlipProcessor


@st.cache_resource
def load_model():
    """Load the BLIP VQA processor and model exactly once.

    Streamlit re-executes this script on every widget interaction;
    ``st.cache_resource`` keeps the (large) model in memory across reruns
    instead of re-downloading/re-loading it each time.

    Returns:
        tuple[BlipProcessor, BlipForQuestionAnswering]: processor and model.
    """
    processor = BlipProcessor.from_pretrained("Salesforce/blip-vqa-base")
    model = BlipForQuestionAnswering.from_pretrained("Salesforce/blip-vqa-base")
    model.eval()  # inference only — disable dropout etc.
    return processor, model


# Load the pre-trained model and processor (cached across reruns).
processor, model = load_model()


def answer_question(image, question):
    """Answer a free-form question about an image using BLIP.

    Args:
        image: A PIL ``Image`` (RGB) to be questioned.
        question: The question text, e.g. "What color is the car?".

    Returns:
        str: The model's decoded answer with special tokens stripped.
    """
    inputs = processor(images=image, text=question, return_tensors="pt")
    with torch.no_grad():  # no gradients needed for inference
        outputs = model.generate(**inputs)
    answer = processor.decode(outputs[0], skip_special_tokens=True)
    return answer


# ---------------------------------------------------------------- UI ----
st.title("VQA App using BLIP")
st.write("Upload an image and ask a question about it.")

# Instructions Section
st.header("How to Use the App")
st.markdown(
    """
    1. **Upload an image** by clicking the file uploader below.
    2. **Wait for the image to load.**
    3. **Type a question** about the image in the input box.
    4. **Press Enter** and the AI will generate an answer.
    """
)

uploaded_file = st.file_uploader("Choose an image...", type=["jpg", "png", "jpeg"])

if uploaded_file is not None:
    # Convert to RGB so grayscale/RGBA uploads are handled uniformly.
    image = Image.open(uploaded_file).convert("RGB")
    # NOTE(review): `use_column_width` is deprecated in recent Streamlit
    # releases in favor of `use_container_width` — confirm the pinned
    # Streamlit version before switching.
    st.image(image, caption="Uploaded Image", use_column_width=True)

    question = st.text_input("Ask a question about the image:")

    if question:
        answer = answer_question(image, question)
        st.write(f"**Answer:** {answer}")