# !pip install torch
# import torch

import streamlit as st
from PIL import Image
# from transformers import GPT2TokenizerFast, ViTImageProcessor, VisionEncoderDecoderModel,RobertaTokenizerFast, VisionEncoderDecoderModel
#from transformers import BlipProcessor, BlipForConditionalGeneration


# Load model directly
from transformers import AutoTokenizer, AutoModel

# tokenizer = AutoTokenizer.from_pretrained("sourabhbargi11/Caption_generator_model")
# model = AutoModel.from_pretrained("sourabhbargi11/Caption_generator_model")


def set_page_config():
    """Apply the page title, icon and wide layout to the Streamlit page."""
    page_options = {
        'page_title': 'Caption an Cartoon Image',
        'page_icon': ':camera:',
        'layout': 'wide',
    }
    st.set_page_config(**page_options)

def initialize_model():
    """Load the captioning model, its tokenizer and its image processor.

    All three artifacts are fetched with ``from_pretrained`` from the same
    Hugging Face repository, and the model is moved to the CPU.

    Returns:
        A ``(image_processor, model, tokenizer, device)`` tuple, where
        ``device`` is the string ``'cpu'``.
    """
    # ViTImageProcessor is not among the names imported at the top of the
    # file, so import it here to keep this function self-contained.
    from transformers import ViTImageProcessor

    repo_id = "sourabhbargi11/Caption_generator_model"
    device = 'cpu'

    # NOTE(review): AutoModel picks the concrete class from the checkpoint
    # config; callers rely on the result exposing ``generate`` -- confirm the
    # checkpoint resolves to a generation-capable class.
    model = AutoModel.from_pretrained(repo_id).to(device)
    tokenizer = AutoTokenizer.from_pretrained(repo_id)
    image_processor = ViTImageProcessor.from_pretrained(repo_id)
    return image_processor, model, tokenizer, device

def upload_image():
    """Show the sidebar file uploader and return its result."""
    accepted_types = ["jpg", "jpeg", "png"]
    prompt = "Upload an image (we aren't storing anything)"
    uploaded = st.sidebar.file_uploader(prompt, type=accepted_types)
    return uploaded

def image_preprocess(image):
    """Resize ``image`` to 224x224 and make sure it is in RGB mode.

    Args:
        image: A PIL-style image exposing ``resize``, ``mode`` and
            ``convert``.

    Returns:
        The resized image, converted to ``"RGB"`` when it was in any other
        mode.
    """
    image = image.resize((224, 224))
    # Uploaded PNGs may be RGBA or palette based, not only grayscale, so
    # normalise every non-RGB mode instead of special-casing "L".
    if image.mode != "RGB":
        image = image.convert("RGB")
    return image

def generate_caption(processor, model, device, image, tokenizer=None,
                     max_new_tokens=20):
    """Generate a text caption for ``image``.

    Args:
        processor: Callable image processor, invoked as
            ``processor(image, return_tensors='pt')``; its result is moved to
            ``device`` with ``.to(device)`` and unpacked into ``generate``.
        model: Object exposing ``generate(**inputs, max_new_tokens=...)``.
        device: Device identifier the model inputs are moved to.
        image: The image to caption.
        tokenizer: Object whose ``decode`` turns the generated ids into text.
            When omitted, ``processor.decode`` is used instead, which only
            works for processors that provide a ``decode`` method.
        max_new_tokens: Upper bound on generated tokens, forwarded to
            ``model.generate``.

    Returns:
        The decoded caption with special tokens skipped.
    """
    # Use the processor handed in by the caller; no module-level
    # ``image_processor`` exists in this file.
    inputs = processor(image, return_tensors='pt').to(device)
    out = model.generate(**inputs, max_new_tokens=max_new_tokens)
    decoder = processor if tokenizer is None else tokenizer
    return decoder.decode(out[0], skip_special_tokens=True)


def main():
    """Render the page: upload an image, preview it, caption it on demand."""
    set_page_config()
    st.header("Caption an Image :camera:")

    uploaded_image = upload_image()
    if uploaded_image is None:
        return

    try:
        # Pillow decodes lazily, so the resize inside image_preprocess can
        # also fail on a damaged file; guard both steps together.
        image = Image.open(uploaded_image)
        image = image_preprocess(image)
    except OSError:
        st.error("Could not read the uploaded file as an image.")
        return

    st.image(image, caption='Your image')

    with st.sidebar:
        st.divider()
        if st.sidebar.button('Generate Caption'):
            with st.spinner('Generating caption...'):
                # Cache the loaded artifacts across reruns so the model is
                # not re-loaded on every button click.
                load_model = st.cache_resource(show_spinner=False)(
                    initialize_model
                )
                image_processor, model, tokenizer, device = load_model()

                # The image processor cannot decode token ids, so hand the
                # tokenizer over explicitly.
                caption = generate_caption(
                    image_processor, model, device, image, tokenizer=tokenizer
                )
                st.header("Caption:")
                st.markdown(f'**{caption}**')


# Run the app when this file is executed as a script.
if __name__ == "__main__":
    main()


# Footer note; this call sits outside the guard above, so it runs whenever
# the module is executed or imported.
st.markdown(
    """
    ---
   You are looking at partial finetuned model , please JUDGE ME!!!  """
)