#!/usr/bin/env python
# coding: utf-8

# # Set up the environment

# In[2]:


get_ipython().system('pip install --upgrade -q accelerate bitsandbytes')


# In[ ]:


# Optional: install a development branch of Transformers with LLaVA improvements.
# Note: each system() call runs in its own shell, so a `cd` would not persist;
# installing directly from the cloned ./transformers folder works.
get_ipython().system('rm -rf transformers')
get_ipython().system('git clone -b llava_improvements https://github.com/NielsRogge/transformers.git')
get_ipython().system('pip install -q ./transformers')


# In[ ]:


# Alternatively, install the latest Transformers from the main branch.
get_ipython().system('pip install -q git+https://github.com/huggingface/transformers.git')


# ## Load model and processor

# In[ ]:


from transformers import AutoProcessor, LlavaForConditionalGeneration
from transformers import BitsAndBytesConfig
import torch

# Load the weights in 4-bit precision (via bitsandbytes) and run the matrix
# multiplications in float16, so the 7B model fits comfortably on a single GPU.
quantization_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_compute_dtype=torch.float16
)


model_id = "llava-hf/llava-1.5-7b-hf"

processor = AutoProcessor.from_pretrained(model_id)
model = LlavaForConditionalGeneration.from_pretrained(model_id, quantization_config=quantization_config, device_map="auto")
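

# As an optional sanity check, the cell below reports the model's memory footprint; with 4-bit weights a 7B model should occupy only a few GB (the exact number depends on your setup).

# In[ ]:


# Approximate memory footprint of the quantized model, in GB.
print(f"Model memory footprint: {model.get_memory_footprint() / 1e9:.2f} GB")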


# ## Prepare image and text for the model

# In[ ]:


from PIL import Image

image1 = Image.open('data/clock.jpeg')
display(image1)


# In the prompt, you refer to an image with the special \<image> token. The USER and ASSISTANT markers indicate which text comes from the human and which from the model. The format looks as follows:
# 
# ```bash
# USER: <image>\n<prompt>\nASSISTANT:
# ```

# In other words, you always need to end your prompt with `ASSISTANT:`. The processor accepts a list of prompts, so batched generation (i.e. generating completions for several prompts at once) works the same way; below the list holds a single prompt, and a two-prompt batch is sketched after the next cell.

# In[ ]:


caption = 'an old fashioned clock sitting on top of a table'

user_input = "This is an intricately crafted old-fashioned clock created by a skilled Moroccan artisan back in 1988 from Chefchaoune.. it reminds me of my mother."

prompts = [
            f"USER: <image>\nBased on the caption '{caption}' and the following user input: '{user_input}', generate a detailed product name and description for this Moroccan artisanal item; the description should be minimal yet it gives the essence of the product and convinces people to buy or express their interest in it.\nASSISTANT:"
          # f"""
          # USER: <image>\nBased on the image caption '{caption}' and the following background information: '{user_input}', generate an attention-grabbing yet concise product name and description for this authentic Moroccan artisanal item. The description should:
          # Highlight the key features and unique selling points that make this product exceptional and desirable.
          # Convey the cultural significance, craftsmanship, and rich heritage behind the item's creation.
          # Use evocative language that resonates with potential buyers and piques their interest in owning this one-of-a-kind piece.
          # Be concise, direct, and persuasive, leaving the reader eager to learn more or acquire the product.

          # Your response should follow this format:
          # Product Name: [Compelling and relevant product name]
          # Product Description: [Concise yet captivating description addressing the points above]
          # ASSISTANT:"""

]

inputs = processor(text=prompts, images=[image1], padding=True, return_tensors="pt").to("cuda")
for k,v in inputs.items():
  print(k,v.shape)
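

# As mentioned above, the same code handles a real batch. A minimal sketch, reusing `image1` for both prompts purely for illustration (the processor pairs each `<image>` token with one entry of `images`):

# In[ ]:


# Hypothetical two-prompt batch; both prompts reuse the same image for simplicity.
batch_prompts = [
    "USER: <image>\nDescribe this item in one sentence.\nASSISTANT:",
    "USER: <image>\nWho might be interested in buying this item, and why?\nASSISTANT:",
]
batch_inputs = processor(text=batch_prompts, images=[image1, image1], padding=True, return_tensors="pt").to("cuda")
for k, v in batch_inputs.items():
  print(k, v.shape)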


# ## Autoregressively generate completion
# 
# Finally, we let the model predict the next tokens given the image + prompt. All of the [generation parameters](https://huggingface.co/docs/transformers/v4.35.2/en/main_classes/text_generation#transformers.GenerationMixin.generate) can be adjusted; by default, greedy decoding is used. A sampling variant is sketched after the cell below.

# In[ ]:


output = model.generate(**inputs, max_new_tokens=200)
generated_text = processor.batch_decode(output, skip_special_tokens=True)
for text in generated_text:
  print(text.split("ASSISTANT:")[-1])
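

# As an illustration of adjusting the generation parameters, the cell below reruns generation with sampling instead of greedy decoding; the parameter values are illustrative, not tuned.

# In[ ]:


# Sampling-based decoding with standard `generate()` arguments.
sampled_output = model.generate(
    **inputs,
    max_new_tokens=200,
    do_sample=True,    # sample from the distribution instead of taking the argmax
    temperature=0.7,   # soften the distribution
    top_p=0.9,         # nucleus sampling
)
for text in processor.batch_decode(sampled_output, skip_special_tokens=True):
  print(text.split("ASSISTANT:")[-1])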


# ## Pipeline API
# 
# Alternatively, you can leverage the [pipeline](https://huggingface.co/docs/transformers/main_classes/pipelines) API, which abstracts all of the logic above away from the user. We again pass the quantization config to make sure inference runs in 4-bit.

# In[ ]:


from transformers import pipeline

pipe = pipeline("image-to-text", model=model_id, model_kwargs={"quantization_config": quantization_config})


# In[ ]:


max_new_tokens = 200
prompt = "USER: <image>\nWhat are the things I should be cautious about when I visit this place?\nASSISTANT:"

outputs = pipe(image1, prompt=prompt, generate_kwargs={"max_new_tokens": max_new_tokens})


# In[ ]:


print(outputs[0]["generated_text"])
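

# The pipeline typically returns the full conversation, prompt included, so the assistant's reply can be isolated the same way as before (a small sketch):

# In[ ]:


# Keep only the text after the "ASSISTANT:" marker.
print(outputs[0]["generated_text"].split("ASSISTANT:")[-1].strip())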

