#!/usr/bin/env python
# coding: utf-8

# In[1]:


get_ipython().system('pip install ftfy regex tqdm')
get_ipython().system('pip install git+https://github.com/openai/CLIP.git')
# local Windows / Python 3.11 wheel; on other platforms `pip install sentencepiece` works instead
get_ipython().system('pip install sentencepiece-0.1.98-cp311-cp311-win_amd64.whl')



# In[5]:


# prompt: install transformers

get_ipython().system('pip install transformers')


# In[6]:


from transformers import VisionEncoderDecoderModel, ViTFeatureExtractor, AutoTokenizer


# Load the ViT-GPT2 image-captioning model together with its feature extractor and tokenizer.
# (ViTFeatureExtractor is deprecated in recent transformers releases; ViTImageProcessor is the drop-in replacement.)
feature_extractor = ViTFeatureExtractor.from_pretrained("nlpconnect/vit-gpt2-image-captioning")
tokenizer = AutoTokenizer.from_pretrained("nlpconnect/vit-gpt2-image-captioning")
model = VisionEncoderDecoderModel.from_pretrained("nlpconnect/vit-gpt2-image-captioning")


# ## Import the necessary libraries and load the CLIP model:

# In[7]:


from PIL import Image
import clip
import torch

# Load OpenAI's CLIP (ViT-B/32); it will score how well each candidate caption matches the image.
device = "cuda" if torch.cuda.is_available() else "cpu"
clip_model, preprocess = clip.load("ViT-B/32", device=device)


# ## Generate candidate captions and pick the best one with CLIP:

# In[8]:


image = Image.open("data/download.jpeg")
pixel_values = feature_extractor(images=image, return_tensors="pt").pixel_values
output_ids = model.generate(pixel_values, max_length=50, num_beams=4, early_stopping=True)
captions = tokenizer.batch_decode(output_ids, skip_special_tokens=True)


# In[9]:


# Embed the image and every candidate caption with CLIP, then keep the caption whose
# embedding is most similar to the image embedding.
image_input = preprocess(image).unsqueeze(0).to(device)
with torch.no_grad():
    image_features = clip_model.encode_image(image_input)

text_inputs = torch.cat([clip.tokenize(caption) for caption in captions]).to(device)
with torch.no_grad():
    text_features = clip_model.encode_text(text_inputs)

similarity_scores = image_features @ text_features.T
best_caption_idx = similarity_scores.argmax().item()
product_description = captions[best_caption_idx]
print(product_description)
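

# ## Wrap the pipeline in a helper function (sketch):

# A minimal sketch that folds the two cells above into one function, reusing the models
# already loaded; the function name `generate_product_description` and its defaults are
# illustrative choices, not fixed by the pipeline.

# In[ ]:


def generate_product_description(image_path, num_candidates=4, max_length=50):
    """Caption an image with ViT-GPT2 and return the caption CLIP ranks highest."""
    pil_image = Image.open(image_path)

    # Propose several candidate captions with beam search.
    pixel_values = feature_extractor(images=pil_image, return_tensors="pt").pixel_values
    output_ids = model.generate(
        pixel_values,
        max_length=max_length,
        num_beams=num_candidates,
        num_return_sequences=num_candidates,
        early_stopping=True,
    )
    candidates = tokenizer.batch_decode(output_ids, skip_special_tokens=True)

    # Rank the candidates by CLIP image-text similarity and return the winner.
    clip_image = preprocess(pil_image).unsqueeze(0).to(device)
    text_inputs = torch.cat([clip.tokenize(c) for c in candidates]).to(device)
    with torch.no_grad():
        image_features = clip_model.encode_image(clip_image)
        text_features = clip_model.encode_text(text_inputs)

    scores = image_features @ text_features.T
    return candidates[scores.argmax().item()]


print(generate_product_description("data/download.jpeg"))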


# # Using SigLIP

# In[11]:


# sentencepiece and protobuf are needed by SigLIP's tokenizer
get_ipython().system('pip install sentencepiece')
get_ipython().system('pip install protobuf')


# In[12]:


from transformers import AutoProcessor, AutoModel, VisionEncoderDecoderModel, ViTFeatureExtractor, AutoTokenizer
import torch
from PIL import Image


# Load SigLIP, which replaces CLIP as the caption re-ranker in this section.
siglip_model = AutoModel.from_pretrained("google/siglip-base-patch16-224")
siglip_processor = AutoProcessor.from_pretrained("google/siglip-base-patch16-224")


image = Image.open("data/avito4.jpeg")


# Same ViT-GPT2 captioner as before; it proposes the candidate captions.
feature_extractor = ViTFeatureExtractor.from_pretrained("nlpconnect/vit-gpt2-image-captioning")
tokenizer = AutoTokenizer.from_pretrained("nlpconnect/vit-gpt2-image-captioning")
caption_model = VisionEncoderDecoderModel.from_pretrained("nlpconnect/vit-gpt2-image-captioning")

pixel_values = feature_extractor(images=image, return_tensors="pt").pixel_values
output_ids = caption_model.generate(pixel_values, max_length=100, num_beams=5, num_return_sequences=5, early_stopping=True)
captions = tokenizer.batch_decode(output_ids, skip_special_tokens=True)

# Score every (image, caption) pair with SigLIP and keep the best-matching caption.
inputs = siglip_processor(text=captions, images=image, padding="max_length", return_tensors="pt")
with torch.no_grad():
    outputs = siglip_model(**inputs)

similarity_scores = outputs.logits_per_image  # shape: (1, num_captions)
best_caption_idx = similarity_scores.argmax().item()
product_description = captions[best_caption_idx]
print(product_description)

# Sample outputs observed for different input images:
#   a vase sitting on a shelf in a store                => thuya
#   a wooden bench sitting on top of a wooden floor     => avito
#   two old fashioned vases sitting next to each other  => avito2
#   three wooden vases sitting on top of a wooden floor => avito3
#   an old fashioned clock sitting on top of a table    => avito4
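

# Because SigLIP is trained with a pairwise sigmoid loss (unlike CLIP's softmax-based
# contrastive loss), each image-caption logit can be read as an independent match
# probability. A small follow-up sketch using the variables from the cell above:

# In[ ]:


match_probs = torch.sigmoid(similarity_scores)
for caption, prob in zip(captions, match_probs[0].tolist()):
    print(f"{prob:.3f}  {caption}")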



# In[ ]:





# # Implementing LLaVA

# https://colab.research.google.com/drive/1veefV17NcD1S4ou4nF8ABkfm8-TgU0Dr#scrollTo=XN2vJCPZk1UY

# In[ ]:
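

# A minimal sketch of generating a description with LLaVA through the transformers
# library, assuming the llava-hf/llava-1.5-7b-hf checkpoint, a recent transformers
# release with LLaVA support, a CUDA GPU, and the accelerate package; the prompt and
# image path are illustrative (the linked Colab may use a different setup).
from transformers import AutoProcessor, LlavaForConditionalGeneration
import torch
from PIL import Image

llava_id = "llava-hf/llava-1.5-7b-hf"
llava_processor = AutoProcessor.from_pretrained(llava_id)
llava_model = LlavaForConditionalGeneration.from_pretrained(
    llava_id, torch_dtype=torch.float16, device_map="auto"
)

# LLaVA-1.5 expects the "USER: <image>\n... ASSISTANT:" prompt format.
prompt = "USER: <image>\nWrite a short product description for this item. ASSISTANT:"
llava_image = Image.open("data/avito4.jpeg")

inputs = llava_processor(text=prompt, images=llava_image, return_tensors="pt").to(llava_model.device, torch.float16)
output_ids = llava_model.generate(**inputs, max_new_tokens=100)
print(llava_processor.decode(output_ids[0], skip_special_tokens=True))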