#!/usr/bin/env python
# coding: utf-8
# # Set-up environment
# In[2]:
get_ipython().system('pip install --upgrade -q accelerate bitsandbytes')
# In[ ]:
# Each `!` command runs in its own shell, so a separate `cd` would not persist between commands;
# we clone the branch and install it directly from the current working directory.
get_ipython().system('rm -rf transformers')
get_ipython().system('git clone -b llava_improvements https://github.com/NielsRogge/transformers.git')
get_ipython().system('pip install -q ./transformers')
# In[ ]:
# Alternative: install transformers from the main branch instead (this would override the branch installed above, so run only one of the two install cells).
get_ipython().system('pip install git+https://github.com/huggingface/transformers.git')
# ## Load model and processor
# In[ ]:
from transformers import AutoProcessor, LlavaForConditionalGeneration
from transformers import BitsAndBytesConfig
import torch
quantization_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_compute_dtype=torch.float16,
)
model_id = "llava-hf/llava-1.5-7b-hf"
processor = AutoProcessor.from_pretrained(model_id)
model = LlavaForConditionalGeneration.from_pretrained(model_id, quantization_config=quantization_config, device_map="auto")
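# As an optional sanity check (a minimal sketch; `get_memory_footprint` and `hf_device_map` are standard attributes of quantized, auto-dispatched `transformers` models, but the exact numbers depend on your environment), we can verify that the 4-bit weights fit on the GPU:
# In[ ]:
# Approximate memory taken up by the quantized weights, in GB
print(f"Model footprint: {model.get_memory_footprint() / 1e9:.2f} GB")
# Device placement chosen by device_map="auto"
print(model.hf_device_map)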
# ## Prepare image and text for the model
# In[ ]:
import requests
from PIL import Image
image1 = Image.open('data/clock.jpeg')
display(image1)
# In the prompt, you can refer to images using the special `<image>` token. To indicate which text comes from the human and which from the model, use `USER` and `ASSISTANT` respectively. The format looks as follows:
#
# ```bash
# USER: <image>\n<prompt>\nASSISTANT:
# ```
# In other words, you always need to end your prompt with `ASSISTANT:`. Here we pass the prompts as a list, so the same code also supports batched generation (i.e. generating on several prompts at once).
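# When generating many prompts programmatically, it can help to wrap this format in a small helper. The `build_prompt` function below is purely an illustrative sketch, not part of the LLaVA or `transformers` API:
# In[ ]:
def build_prompt(user_text: str) -> str:
    """Wrap a user question in the LLaVA-1.5 chat format with a single image placeholder."""
    return f"USER: <image>\n{user_text}\nASSISTANT:"

print(build_prompt("Describe this image in one sentence."))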
# In[ ]:
caption = 'an old fashioned clock sitting on top of a table'
user_input = "This is an intricately crafted old-fashioned clock created by a skilled Moroccan artisan back in 1988 from Chefchaoune.. it reminds me of my mother."
prompts = [
    f"USER: <image>\nBased on the caption '{caption}' and the following user input: '{user_input}', generate a detailed product name and description for this Moroccan artisanal item; the description should be minimal yet it gives the essence of the product and convinces people to buy or express their interest in it.\nASSISTANT:"
# f"""
# USER: <image>\nBased on the image caption '{caption}' and the following background information: '{user_input}', generate an attention-grabbing yet concise product name and description for this authentic Moroccan artisanal item. The description should:
# Highlight the key features and unique selling points that make this product exceptional and desirable.
# Convey the cultural significance, craftsmanship, and rich heritage behind the item's creation.
# Use evocative language that resonates with potential buyers and piques their interest in owning this one-of-a-kind piece.
# Be concise, direct, and persuasive, leaving the reader eager to learn more or acquire the product.
# Your response should follow this format:
# Product Name: [Compelling and relevant product name]
# Product Description: [Concise yet captivating description addressing the points above]
# ASSISTANT:"""
]
inputs = processor(text=prompts, images=[image1], padding=True, return_tensors="pt").to("cuda")
for k, v in inputs.items():
    print(k, v.shape)
# ## Autoregressively generate completion
#
# Finally, we simply let the model predict the next tokens given the images + prompt. Of course one can adjust all the [generation parameters](https://huggingface.co/docs/transformers/v4.35.2/en/main_classes/text_generation#transformers.GenerationMixin.generate). By default, greedy decoding is used.
# In[ ]:
output = model.generate(**inputs, max_new_tokens=200)
generated_text = processor.batch_decode(output, skip_special_tokens=True)
for text in generated_text:
    print(text.split("ASSISTANT:")[-1])
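# As a variation on the greedy decoding above, the sketch below enables sampling via standard `generate` arguments. The values for `temperature` and `top_p` are illustrative defaults, not tuned recommendations:
# In[ ]:
# Sampled generation: non-deterministic, often more varied phrasing
sampled_output = model.generate(
    **inputs,
    max_new_tokens=200,
    do_sample=True,
    temperature=0.7,
    top_p=0.9,
)
for text in processor.batch_decode(sampled_output, skip_special_tokens=True):
    print(text.split("ASSISTANT:")[-1])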
# ## Pipeline API
#
# Alternatively, you can leverage the [pipeline](https://huggingface.co/docs/transformers/main_classes/pipelines) API which abstracts all of the logic above away for the user. We also provide the quantization config to make sure we leverage 4-bit inference.
# In[ ]:
from transformers import pipeline
pipe = pipeline("image-to-text", model=model_id, model_kwargs={"quantization_config": quantization_config})
# In[ ]:
max_new_tokens = 200
prompt = "USER: <image>\nWhat are the things I should be cautious about when I visit this place?\nASSISTANT:"
outputs = pipe(image1, prompt=prompt, generate_kwargs={"max_new_tokens": max_new_tokens})
# In[ ]:
print(outputs[0]["generated_text"])
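# The pipeline returns the full generated text, including the prompt. To keep only the model's answer, we can split on the `ASSISTANT:` marker, just as with the manual decoding above (a small convenience sketch, assuming the default list-of-dicts output with a "generated_text" key):
# In[ ]:
answer = outputs[0]["generated_text"].split("ASSISTANT:")[-1].strip()
print(answer)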
# In[ ]: