"""Streamlit app: Visual Question Answering with Salesforce's BLIP model.

Upload an image, type a question about it, and the BLIP VQA model
generates a short natural-language answer.
"""

import streamlit as st
import torch
from PIL import Image
from transformers import BlipForQuestionAnswering, BlipProcessor


@st.cache_resource
def load_model():
    """Load the BLIP VQA processor and model exactly once.

    Streamlit re-executes this script on every widget interaction;
    ``st.cache_resource`` keeps the (large) model in memory across reruns
    instead of re-downloading/re-loading it each time.

    Returns:
        tuple[BlipProcessor, BlipForQuestionAnswering]: processor and model.
    """
    processor = BlipProcessor.from_pretrained("Salesforce/blip-vqa-base")
    model = BlipForQuestionAnswering.from_pretrained("Salesforce/blip-vqa-base")
    model.eval()  # inference only — disable dropout etc.
    return processor, model


# Load the pre-trained model and processor (cached across reruns).
processor, model = load_model()


def answer_question(image, question):
    """Answer a free-form question about an image using BLIP.

    Args:
        image: A PIL ``Image`` (RGB) to be questioned.
        question: The question text, e.g. "What color is the car?".

    Returns:
        str: The model's decoded answer with special tokens stripped.
    """
    inputs = processor(images=image, text=question, return_tensors="pt")
    with torch.no_grad():  # no gradients needed for inference
        outputs = model.generate(**inputs)
    answer = processor.decode(outputs[0], skip_special_tokens=True)
    return answer


# ---------------------------------------------------------------- UI ----
st.title("VQA App using BLIP")
st.write("Upload an image and ask a question about it.")

# Instructions Section
st.header("How to Use the App")
st.markdown(
    """
    1. **Upload an image** by clicking the file uploader below.
    2. **Wait for the image to load.**
    3. **Type a question** about the image in the input box.
    4. **Press Enter** and the AI will generate an answer.
    """
)

uploaded_file = st.file_uploader("Choose an image...", type=["jpg", "png", "jpeg"])

if uploaded_file is not None:
    # Convert to RGB so grayscale/RGBA uploads are handled uniformly.
    image = Image.open(uploaded_file).convert("RGB")
    # NOTE(review): `use_column_width` is deprecated in recent Streamlit
    # releases in favor of `use_container_width` — confirm the pinned
    # Streamlit version before switching.
    st.image(image, caption="Uploaded Image", use_column_width=True)

    question = st.text_input("Ask a question about the image:")

    if question:
        answer = answer_question(image, question)
        st.write(f"**Answer:** {answer}")