#!/usr/bin/env python
# coding: utf-8

# In[1]:


get_ipython().system('pip install ftfy regex tqdm')
get_ipython().system('pip install git+https://github.com/openai/CLIP.git')
# Local Windows wheel for sentencepiece (cp311); adjust the filename for your platform.
get_ipython().system('pip install sentencepiece-0.1.98-cp311-cp311-win_amd64.whl')


# In[5]:


get_ipython().system('pip install transformers')


# In[6]:


from transformers import VisionEncoderDecoderModel, ViTFeatureExtractor, AutoTokenizer

# ViT-GPT2 image-captioning model (ViTFeatureExtractor is the older name for ViTImageProcessor).
feature_extractor = ViTFeatureExtractor.from_pretrained("nlpconnect/vit-gpt2-image-captioning")
tokenizer = AutoTokenizer.from_pretrained("nlpconnect/vit-gpt2-image-captioning")
model = VisionEncoderDecoderModel.from_pretrained("nlpconnect/vit-gpt2-image-captioning")


# ## Import the necessary libraries and load the CLIP model:

# In[7]:


from PIL import Image
import clip
import torch

device = "cuda" if torch.cuda.is_available() else "cpu"
clip_model, preprocess = clip.load("ViT-B/32", device=device)


# ## Generate candidate captions and select the best one with CLIP:

# In[8]:


image = Image.open("data/download.jpeg")
pixel_values = feature_extractor(images=image, return_tensors="pt").pixel_values
output_ids = model.generate(pixel_values, max_length=50, num_beams=4, early_stopping=True)
captions = tokenizer.batch_decode(output_ids, skip_special_tokens=True)


# In[9]:


# Encode the image and each candidate caption, then rank captions by image-text similarity.
image = preprocess(image).unsqueeze(0).to(device)
with torch.no_grad():
    image_features = clip_model.encode_image(image)

text_inputs = torch.cat([clip.tokenize(caption) for caption in captions]).to(device)
with torch.no_grad():
    text_features = clip_model.encode_text(text_inputs)

similarity_scores = image_features @ text_features.T
best_caption_idx = similarity_scores.argmax().item()
product_description = captions[best_caption_idx]
print(product_description)


# # Using SigLIP

# In[11]:


get_ipython().system('pip install sentencepiece')
get_ipython().system('pip install protobuf')


# In[12]:


from transformers import AutoProcessor, AutoModel, VisionEncoderDecoderModel, ViTFeatureExtractor, AutoTokenizer
import torch
from PIL import Image

# Load SigLIP under its own names so it is not overwritten by the captioning model below.
siglip_model = AutoModel.from_pretrained("google/siglip-base-patch16-224")
siglip_processor = AutoProcessor.from_pretrained("google/siglip-base-patch16-224")

raw_image = Image.open("data/avito4.jpeg")

# Caption the image with the same ViT-GPT2 model as in the CLIP section.
feature_extractor = ViTFeatureExtractor.from_pretrained("nlpconnect/vit-gpt2-image-captioning")
tokenizer = AutoTokenizer.from_pretrained("nlpconnect/vit-gpt2-image-captioning")
model = VisionEncoderDecoderModel.from_pretrained("nlpconnect/vit-gpt2-image-captioning")

pixel_values = feature_extractor(images=raw_image, return_tensors="pt").pixel_values
output_ids = model.generate(pixel_values, max_length=100, num_beams=5, early_stopping=True)
captions = tokenizer.batch_decode(output_ids, skip_special_tokens=True)

# Note: the re-ranking below still reuses the CLIP model loaded earlier; a SigLIP-based
# re-ranking sketch follows at the end of this section.
image = preprocess(raw_image).unsqueeze(0).to(device)
with torch.no_grad():
    image_features = clip_model.encode_image(image)

text_inputs = torch.cat([clip.tokenize(caption) for caption in captions]).to(device)
with torch.no_grad():
    text_features = clip_model.encode_text(text_inputs)

similarity_scores = image_features @ text_features.T
best_caption_idx = similarity_scores.argmax().item()
product_description = captions[best_caption_idx]
print(product_description)

# Sample outputs (caption => source image):
# a vase sitting on a shelf in a store => thuya
# a wooden bench sitting on top of a wooden floor => avito
# two old fashioned vases sitting next to each other => avito2
# three wooden vases sitting on top of a wooden floor => avito3
# an old fashioned clock sitting on top of a table => avito4
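# The following cell is a minimal sketch of re-ranking the generated captions with
# SigLIP itself rather than CLIP, since the SigLIP checkpoint loaded above is otherwise
# unused. It assumes the standard `transformers` SiglipModel/AutoProcessor API, in which
# a forward pass over text-image pairs returns one logit per caption in `logits_per_image`.

# In[ ]:


# Score every candidate caption against the image with SigLIP (assumed API, see note above).
with torch.no_grad():
    siglip_inputs = siglip_processor(
        text=captions, images=raw_image, padding="max_length", return_tensors="pt"
    )
    siglip_outputs = siglip_model(**siglip_inputs)

# logits_per_image has shape (1, num_captions); a higher logit means a better image-text match.
siglip_scores = siglip_outputs.logits_per_image
best_caption_idx = siglip_scores.argmax().item()
print(captions[best_caption_idx])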

# In[ ]:


# # Implementing LLaVA
# https://colab.research.google.com/drive/1veefV17NcD1S4ou4nF8ABkfm8-TgU0Dr#scrollTo=XN2vJCPZk1UY

# In[ ]:
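# The linked notebook is not reproduced here; below is a minimal, hedged sketch of how a
# LLaVA model could be prompted for a product description via `transformers`. The checkpoint
# name `llava-hf/llava-1.5-7b-hf`, the USER/ASSISTANT prompt template, and the generation
# settings are assumptions, not taken from the notebook above.

import torch
from PIL import Image
from transformers import AutoProcessor, LlavaForConditionalGeneration

# Assumed checkpoint; requires a transformers version with LLaVA support and enough
# GPU memory for a 7B model in float16.
llava_id = "llava-hf/llava-1.5-7b-hf"
llava_processor = AutoProcessor.from_pretrained(llava_id)
llava_model = LlavaForConditionalGeneration.from_pretrained(
    llava_id, torch_dtype=torch.float16, device_map="auto"
)

raw_image = Image.open("data/avito4.jpeg")
prompt = "USER: <image>\nWrite a short product description for this item. ASSISTANT:"

inputs = llava_processor(images=raw_image, text=prompt, return_tensors="pt").to(
    llava_model.device, torch.float16
)
output_ids = llava_model.generate(**inputs, max_new_tokens=100, do_sample=False)
print(llava_processor.decode(output_ids[0], skip_special_tokens=True))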