Spaces:

sourabhbargi11
/

caption_generate

Sleeping

App Files Files Community

caption_generate / app.py

sourabhbargi11

Update app.py

27a4a72 verified 12 months ago

raw

history blame

3.45 kB

	# !pip install torch
	# import torch
	from transformers import AutoTokenizer, AutoModel ,AutoConfig
	import torch
	from transformers import ViTImageProcessor, VisionEncoderDecoderModel,RobertaTokenizerFast
	import PIL


	# Pick the compute device: CUDA when PyTorch reports a usable GPU, CPU otherwise.
	# NOTE(review): none of the functions visible in this file read `device2`;
	# initialize_model() hard-codes 'cpu' instead.
	device2 = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")
	# Load the model



	import streamlit as st
	from PIL import Image
	# from transformers import GPT2TokenizerFast, ViTImageProcessor, VisionEncoderDecoderModel,RobertaTokenizerFast, VisionEncoderDecoderModel
	#from transformers import BlipProcessor, BlipForConditionalGeneration


	# Load model directly
	from transformers import AutoTokenizer, AutoModel

	# tokenizer = AutoTokenizer.from_pretrained("sourabhbargi11/Caption_generator_model")
	# model = AutoModel.from_pretrained("sourabhbargi11/Caption_generator_model")



	def set_page_config():
	    """Apply the page-level Streamlit settings: title, icon and wide layout."""
	    page_settings = {
	        'page_title': 'Caption an Cartoon Image',
	        'page_icon': ':camera:',
	        'layout': 'wide',
	    }
	    st.set_page_config(**page_settings)

	@st.cache_resource(show_spinner=False)
	def initialize_model():
	    """Load the image processor, captioning model and tokenizer on the CPU.

	    The loader is wrapped in ``st.cache_resource`` so the checkpoints are
	    loaded once and reused across Streamlit reruns, instead of being
	    re-created every time the "Generate Caption" button is pressed.
	    The built-in cache spinner is disabled because the caller already shows
	    its own spinner while this runs.

	    Returns:
	        tuple: ``(image_processor, model, tokenizer, device)`` where
	        ``device`` is the string ``'cpu'`` the model was moved to.
	    """
	    device = 'cpu'
	    model_id = "sourabhbargi11/Caption_generator_model"

	    config = AutoConfig.from_pretrained(model_id)
	    model = VisionEncoderDecoderModel.from_pretrained(model_id, config=config).to(device)
	    tokenizer = RobertaTokenizerFast.from_pretrained("roberta-base")
	    # NOTE(review): `device=` is forwarded as an extra keyword argument;
	    # whether the processor actually uses it is not visible from this file.
	    image_processor = ViTImageProcessor.from_pretrained(
	        "google/vit-base-patch16-224",
	        device=device,
	    )
	    return image_processor, model, tokenizer, device

	def upload_image():
	    """Render the sidebar file uploader and return whatever it currently holds."""
	    prompt = "Upload an image (we aren't storing anything)"
	    accepted_types = ["jpg", "jpeg", "png"]
	    return st.sidebar.file_uploader(prompt, type=accepted_types)

	def image_preprocess(image):
	    """Return *image* as a 224x224 RGB image.

	    Any mode other than RGB (grayscale "L", "RGBA" PNGs with an alpha
	    channel, palette "P", "CMYK" JPEGs, ...) is converted to RGB so the
	    result always has exactly three channels. Previously only "L" was
	    converted, which let 4-channel or palette uploads through unchanged.

	    Args:
	        image: An object with a ``mode`` attribute and ``convert`` /
	            ``resize`` methods (a PIL image in this app).

	    Returns:
	        The converted and resized image; the input is not modified.
	    """
	    # Convert before resizing so palette images are resampled on real colours.
	    if image.mode != "RGB":
	        image = image.convert("RGB")
	    return image.resize((224, 224))

	def generate_caption(processor, model, device, image, tokenizer=None):
	    """Generate a text caption for *image* with beam search.

	    Args:
	        processor: Callable image processor; called as
	            ``processor(image, return_tensors='pt')`` and expected to return
	            an object exposing ``.to(device)`` and ``.pixel_values``.
	        model: Model exposing ``eval()`` and ``generate(pixel_values=...)``.
	        device: Device the processed inputs are moved to.
	        image: The (already preprocessed) image to caption.
	        tokenizer: Tokenizer used to decode the generated ids. When omitted,
	            the ``roberta-base`` fast tokenizer is loaded, which is the same
	            tokenizer ``initialize_model`` returns.

	    Returns:
	        str: The decoded caption with special tokens stripped.
	    """
	    if tokenizer is None:
	        # Fallback for callers that still use the original 4-argument form.
	        tokenizer = RobertaTokenizerFast.from_pretrained("roberta-base")

	    # Use the `processor` argument; the original body referenced names that
	    # only existed as locals of main().
	    inputs = processor(image, return_tensors='pt').to(device)
	    model.eval()

	    with torch.no_grad():
	        output = model.generate(
	            # generate() needs the pixel tensor, not the whole processor output.
	            pixel_values=inputs.pixel_values,
	            # NOTE(review): 1000 may exceed what the decoder supports — confirm.
	            max_length=1000,
	            num_beams=4,  # beam width for beam-search decoding
	            early_stopping=True,  # stop once all beams have finished
	        )

	    return tokenizer.decode(output[0], skip_special_tokens=True)


	def main():
	    """Build the page: upload an image, preview it, and caption it on demand."""
	    set_page_config()
	    st.header("Caption an Image :camera:")

	    uploaded_image = upload_image()
	    if uploaded_image is None:
	        return

	    image = image_preprocess(Image.open(uploaded_image))
	    st.image(image, caption='Your image')

	    # NOTE(review): the original indentation was lost; the caption is rendered
	    # inside the sidebar's button branch here — confirm the intended placement.
	    with st.sidebar:
	        st.divider()
	        if st.sidebar.button('Generate Caption'):
	            with st.spinner('Generating caption...'):
	                image_processor, model, tokenizer, device = initialize_model()
	                caption = generate_caption(
	                    image_processor, model, device, image, tokenizer=tokenizer
	                )

	            st.header("Caption:")
	            st.markdown(f'{caption}')


	# Script entry point: build the page when this file is executed directly.
	if __name__ == "__main__":
	    main()


	# Earlier wording of the footer, kept commented out:
	# st.markdown("""
	# ---
	# You are looking at partial tuned model , please JUDGE ME!!! (I am Funny , Sensible , Creative )""")

	# Footer text; this runs at module level, after the entry-point guard above.
	_footer_markdown = """
	---
	You are looking at a partially tuned model. Judge me! (I am Funny and Creative) 😄🎨"""
	st.markdown(_footer_markdown)