#!/usr/bin/env python
# coding: utf-8

# In[1]:


get_ipython().system('pip install ftfy regex tqdm')
get_ipython().system('pip install git+https://github.com/openai/CLIP.git')
# Local Windows wheel for sentencepiece (cp311); adjust the filename for your platform.
get_ipython().system('pip install sentencepiece-0.1.98-cp311-cp311-win_amd64.whl')


# In[5]:


get_ipython().system('pip install transformers')


# In[6]:


from transformers import VisionEncoderDecoderModel, ViTFeatureExtractor, AutoTokenizer

# ViT-GPT2 image-captioning model (ViTFeatureExtractor is the older name for ViTImageProcessor).
feature_extractor = ViTFeatureExtractor.from_pretrained("nlpconnect/vit-gpt2-image-captioning")
tokenizer = AutoTokenizer.from_pretrained("nlpconnect/vit-gpt2-image-captioning")
model = VisionEncoderDecoderModel.from_pretrained("nlpconnect/vit-gpt2-image-captioning")


# ## Import the necessary libraries and load the CLIP model:

# In[7]:


from PIL import Image
import clip
import torch

device = "cuda" if torch.cuda.is_available() else "cpu"
clip_model, preprocess = clip.load("ViT-B/32", device=device)


# ## Generate candidate captions and select the best one with CLIP:

# In[8]:


image = Image.open("data/download.jpeg")
pixel_values = feature_extractor(images=image, return_tensors="pt").pixel_values
output_ids = model.generate(pixel_values, max_length=50, num_beams=4, early_stopping=True)
captions = tokenizer.batch_decode(output_ids, skip_special_tokens=True)


# In[9]:


# Encode the image and each candidate caption, then rank captions by image-text similarity.
image = preprocess(image).unsqueeze(0).to(device)
with torch.no_grad():
    image_features = clip_model.encode_image(image)

text_inputs = torch.cat([clip.tokenize(caption) for caption in captions]).to(device)
with torch.no_grad():
    text_features = clip_model.encode_text(text_inputs)

similarity_scores = image_features @ text_features.T
best_caption_idx = similarity_scores.argmax().item()
product_description = captions[best_caption_idx]
print(product_description)


# # Using SigLIP

# In[11]:


get_ipython().system('pip install sentencepiece')
get_ipython().system('pip install protobuf')


# In[12]:


from transformers import AutoProcessor, AutoModel, VisionEncoderDecoderModel, ViTFeatureExtractor, AutoTokenizer
import torch
from PIL import Image

# Load SigLIP under its own names so it is not overwritten by the captioning model below.
siglip_model = AutoModel.from_pretrained("google/siglip-base-patch16-224")
siglip_processor = AutoProcessor.from_pretrained("google/siglip-base-patch16-224")

raw_image = Image.open("data/avito4.jpeg")

# Caption the image with the same ViT-GPT2 model as in the CLIP section.
feature_extractor = ViTFeatureExtractor.from_pretrained("nlpconnect/vit-gpt2-image-captioning")
tokenizer = AutoTokenizer.from_pretrained("nlpconnect/vit-gpt2-image-captioning")
model = VisionEncoderDecoderModel.from_pretrained("nlpconnect/vit-gpt2-image-captioning")

pixel_values = feature_extractor(images=raw_image, return_tensors="pt").pixel_values
output_ids = model.generate(pixel_values, max_length=100, num_beams=5, early_stopping=True)
captions = tokenizer.batch_decode(output_ids, skip_special_tokens=True)

# Note: the re-ranking below still reuses the CLIP model loaded earlier; a SigLIP-based
# re-ranking sketch follows at the end of this section.
image = preprocess(raw_image).unsqueeze(0).to(device)
with torch.no_grad():
    image_features = clip_model.encode_image(image)

text_inputs = torch.cat([clip.tokenize(caption) for caption in captions]).to(device)
with torch.no_grad():
    text_features = clip_model.encode_text(text_inputs)

similarity_scores = image_features @ text_features.T
best_caption_idx = similarity_scores.argmax().item()
product_description = captions[best_caption_idx]
print(product_description)

# Sample outputs (caption => source image):
# a vase sitting on a shelf in a store => thuya
# a wooden bench sitting on top of a wooden floor => avito
# two old fashioned vases sitting next to each other => avito2
# three wooden vases sitting on top of a wooden floor => avito3
# an old fashioned clock sitting on top of a table => avito4
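# The following cell is a minimal sketch of re-ranking the generated captions with
# SigLIP itself rather than CLIP, since the SigLIP checkpoint loaded above is otherwise
# unused. It assumes the standard `transformers` SiglipModel/AutoProcessor API, in which
# a forward pass over text-image pairs returns one logit per caption in `logits_per_image`.

# In[ ]:


# Score every candidate caption against the image with SigLIP (assumed API, see note above).
with torch.no_grad():
    siglip_inputs = siglip_processor(
        text=captions, images=raw_image, padding="max_length", return_tensors="pt"
    )
    siglip_outputs = siglip_model(**siglip_inputs)

# logits_per_image has shape (1, num_captions); a higher logit means a better image-text match.
siglip_scores = siglip_outputs.logits_per_image
best_caption_idx = siglip_scores.argmax().item()
print(captions[best_caption_idx])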

# In[ ]:


# # Implementing LLaVA
# https://colab.research.google.com/drive/1veefV17NcD1S4ou4nF8ABkfm8-TgU0Dr#scrollTo=XN2vJCPZk1UY

# In[ ]:
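# The linked notebook is not reproduced here; below is a minimal, hedged sketch of how a
# LLaVA model could be prompted for a product description via `transformers`. The checkpoint
# name `llava-hf/llava-1.5-7b-hf`, the USER/ASSISTANT prompt template, and the generation
# settings are assumptions, not taken from the notebook above.

import torch
from PIL import Image
from transformers import AutoProcessor, LlavaForConditionalGeneration

# Assumed checkpoint; requires a transformers version with LLaVA support and enough
# GPU memory for a 7B model in float16.
llava_id = "llava-hf/llava-1.5-7b-hf"
llava_processor = AutoProcessor.from_pretrained(llava_id)
llava_model = LlavaForConditionalGeneration.from_pretrained(
    llava_id, torch_dtype=torch.float16, device_map="auto"
)

raw_image = Image.open("data/avito4.jpeg")
prompt = "USER: <image>\nWrite a short product description for this item. ASSISTANT:"

inputs = llava_processor(images=raw_image, text=prompt, return_tensors="pt").to(
    llava_model.device, torch.float16
)
output_ids = llava_model.generate(**inputs, max_new_tokens=100, do_sample=False)
print(llava_processor.decode(output_ids[0], skip_special_tokens=True))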