#!/usr/bin/env python
# coding: utf-8
# In[1]:
# CLIP dependencies, then CLIP itself from the OpenAI repo
get_ipython().system('pip install ftfy regex tqdm')
get_ipython().system('pip install git+https://github.com/openai/CLIP.git')
# Local sentencepiece wheel (Windows / Python 3.11); on other platforms a plain `pip install sentencepiece` works
get_ipython().system('pip install sentencepiece-0.1.98-cp311-cp311-win_amd64.whl')
# In[5]:
# prompt: install transformers
get_ipython().system('pip install transformers')
# In[6]:
from transformers import VisionEncoderDecoderModel, ViTFeatureExtractor, AutoTokenizer

# Load the ViT-GPT2 image-captioning model together with its image preprocessor and tokenizer
feature_extractor = ViTFeatureExtractor.from_pretrained("nlpconnect/vit-gpt2-image-captioning")
tokenizer = AutoTokenizer.from_pretrained("nlpconnect/vit-gpt2-image-captioning")
model = VisionEncoderDecoderModel.from_pretrained("nlpconnect/vit-gpt2-image-captioning")
# ## Import the necessary libraries and load the CLIP model:
# In[7]:
from PIL import Image
import clip
import torch
device = "cuda" if torch.cuda.is_available() else "cpu"
clip_model, preprocess = clip.load("ViT-B/32", device=device)
# ## Define a function to generate product descriptions:
# In[8]:
image = Image.open("data/download.jpeg")
pixel_values = feature_extractor(images=image, return_tensors="pt").pixel_values
output_ids = model.generate(pixel_values, max_length=50, num_beams=4, early_stopping=True)
captions = tokenizer.batch_decode(output_ids, skip_special_tokens=True)
# In[9]:
# Encode the image and the candidate captions with CLIP, then keep the best-matching caption
image_input = preprocess(image).unsqueeze(0).to(device)
text_inputs = torch.cat([clip.tokenize(caption) for caption in captions]).to(device)
with torch.no_grad():
    image_features = clip_model.encode_image(image_input)
    text_features = clip_model.encode_text(text_inputs)
# Normalize so the dot product below is a cosine similarity
image_features = image_features / image_features.norm(dim=-1, keepdim=True)
text_features = text_features / text_features.norm(dim=-1, keepdim=True)
similarity_scores = image_features @ text_features.T
best_caption_idx = similarity_scores.argmax().item()
product_description = captions[best_caption_idx]
print(product_description)
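
# In[ ]:
# The markdown cell above promises a function, but the steps are run inline. A minimal sketch
# (hypothetical helper name, reusing the captioner and CLIP objects loaded above) that wraps the
# same caption-then-rerank flow so it can be called for any product photo:
def generate_product_description(image_path, num_candidates=4):
    """Caption an image with ViT-GPT2, then return the caption CLIP scores highest."""
    pil_image = Image.open(image_path)
    pixel_values = feature_extractor(images=pil_image, return_tensors="pt").pixel_values
    output_ids = model.generate(pixel_values, max_length=50, num_beams=num_candidates,
                                num_return_sequences=num_candidates, early_stopping=True)
    candidates = tokenizer.batch_decode(output_ids, skip_special_tokens=True)
    image_input = preprocess(pil_image).unsqueeze(0).to(device)
    # truncate=True guards against captions longer than CLIP's 77-token context
    text_inputs = torch.cat([clip.tokenize(c, truncate=True) for c in candidates]).to(device)
    with torch.no_grad():
        img_feat = clip_model.encode_image(image_input)
        txt_feat = clip_model.encode_text(text_inputs)
    img_feat = img_feat / img_feat.norm(dim=-1, keepdim=True)
    txt_feat = txt_feat / txt_feat.norm(dim=-1, keepdim=True)
    return candidates[(img_feat @ txt_feat.T).argmax().item()]

# Example: print(generate_product_description("data/download.jpeg"))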
# # Using SigLIP
# In[11]:
get_ipython().system('pip install sentencepiece')
get_ipython().system('pip install protobuf')
# In[12]:
from transformers import AutoProcessor, AutoModel, VisionEncoderDecoderModel, ViTFeatureExtractor, AutoTokenizer
import torch
from PIL import Image

# Load SigLIP for image-text scoring (distinct names so the captioning model below is not overwritten)
siglip_model = AutoModel.from_pretrained("google/siglip-base-patch16-224")
siglip_processor = AutoProcessor.from_pretrained("google/siglip-base-patch16-224")

image = Image.open("data/avito4.jpeg")

# Generate candidate captions with the same ViT-GPT2 captioner as above
feature_extractor = ViTFeatureExtractor.from_pretrained("nlpconnect/vit-gpt2-image-captioning")
tokenizer = AutoTokenizer.from_pretrained("nlpconnect/vit-gpt2-image-captioning")
caption_model = VisionEncoderDecoderModel.from_pretrained("nlpconnect/vit-gpt2-image-captioning")
pixel_values = feature_extractor(images=image, return_tensors="pt").pixel_values
output_ids = caption_model.generate(pixel_values, max_length=100, num_beams=5, num_return_sequences=5, early_stopping=True)
captions = tokenizer.batch_decode(output_ids, skip_special_tokens=True)

# Score every candidate caption against the image with SigLIP and keep the best match
inputs = siglip_processor(text=captions, images=image, padding="max_length", return_tensors="pt")
with torch.no_grad():
    outputs = siglip_model(**inputs)
similarity_scores = outputs.logits_per_image  # shape (1, num_captions); torch.sigmoid() turns these into match probabilities
best_caption_idx = similarity_scores.argmax().item()
product_description = captions[best_caption_idx]
print(product_description)
# Captions produced for each test image:
# a vase sitting on a shelf in a store => thuya
# a wooden bench sitting on top of a wooden floor => avito
# two old fashioned vases sitting next to each other => avito2
# three wooden vases sitting on top of a wooden floor => avito3
# an old fashioned clock sitting on top of a table => avito4
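
# In[ ]:
# The notes above look like they were collected by re-running the cell once per file. A small,
# hypothetical loop (it assumes the listed files exist as data/<name>.jpeg) that prints the same
# caption => image mapping in one pass:
for name in ["thuya", "avito", "avito2", "avito3", "avito4"]:
    img = Image.open(f"data/{name}.jpeg")
    pv = feature_extractor(images=img, return_tensors="pt").pixel_values
    ids = caption_model.generate(pv, max_length=100, num_beams=5, early_stopping=True)
    print(tokenizer.batch_decode(ids, skip_special_tokens=True)[0], "=>", name)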
# In[ ]:
# # Implementing LLaVA
# https://colab.research.google.com/drive/1veefV17NcD1S4ou4nF8ABkfm8-TgU0Dr#scrollTo=XN2vJCPZk1UY
# In[ ]:
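# A minimal sketch (separate from the notebook linked above) of generating a product description
# with LLaVA through transformers; it assumes the "llava-hf/llava-1.5-7b-hf" checkpoint and a CUDA
# GPU with enough memory:
import torch
from PIL import Image
from transformers import AutoProcessor, LlavaForConditionalGeneration

llava_processor = AutoProcessor.from_pretrained("llava-hf/llava-1.5-7b-hf")
llava_model = LlavaForConditionalGeneration.from_pretrained(
    "llava-hf/llava-1.5-7b-hf", torch_dtype=torch.float16
).to("cuda")

prompt = "USER: <image>\nWrite a short product description for this item. ASSISTANT:"
llava_inputs = llava_processor(
    images=Image.open("data/avito4.jpeg"), text=prompt, return_tensors="pt"
).to("cuda", torch.float16)
with torch.no_grad():
    generated_ids = llava_model.generate(**llava_inputs, max_new_tokens=100, do_sample=False)
print(llava_processor.decode(generated_ids[0], skip_special_tokens=True))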