from datetime import datetime
from typing import Any, Dict, List

import logging
import requests
import torch
from PIL import Image
from transformers import Blip2Processor, Blip2ForConditionalGeneration

logging.basicConfig(format='%(levelname)s:%(message)s', level=logging.DEBUG)
class EndpointHandler:
    def __init__(self, path: str = ""):
        # Load the BLIP-2 processor and model. load_in_8bit requires the
        # `bitsandbytes` package, and device_map="auto" (via `accelerate`)
        # places the weights, so no explicit .to(device) call is needed.
        self.processor = Blip2Processor.from_pretrained(path)
        self.model = Blip2ForConditionalGeneration.from_pretrained(
            path, load_in_8bit=True, device_map="auto"
        )
        self.device = "cuda" if torch.cuda.is_available() else "cpu"
        logging.info('Model loaded on device: %s', self.device)
        # Default generation parameters; individual requests may override
        # them (see __call__). Tokenizer-specific entries (eos/pad token ids)
        # are left to the model's generation config.
        self.generate_kwargs = {
            'max_new_tokens': 512,
            'temperature': 0.0001,
            'top_p': 1.0,
            'top_k': 0,
            'use_cache': True,
            'do_sample': True,
            'repetition_penalty': 1.1,
        }
    def __call__(self, data: Dict[str, Any]) -> List[Dict[str, Any]]:
        """
        data args:
            an optional "batch_size" key (:obj:`int`, defaults to 1) plus
            generation-parameter overrides such as "max_new_tokens",
            "temperature", "top_p", "top_k", "do_sample",
            and "repetition_penalty"
        Return:
            A :obj:`list` of :obj:`dict`: will be serialized and returned
        """
        ## Model Parameters: let the request override the defaults.
        for key in ('max_new_tokens', 'temperature', 'top_p', 'top_k',
                    'do_sample', 'repetition_penalty'):
            if key in data:
                self.generate_kwargs[key] = data[key]
        ## Prepare the inputs
        batch_size = data.pop("batch_size", 1)
        # A fixed demo image is fetched for every request; the payload's
        # "inputs" field is not used by this handler.
        img_url = 'https://storage.googleapis.com/sfr-vision-language-research/BLIP/demo.jpg'
        now = datetime.now()
        raw_image = Image.open(requests.get(img_url, stream=True).raw).convert('RGB')
        # Replicate the image batch_size times and move the pixel values to
        # the target device in float16 to match the 8-bit model's compute dtype.
        inputs = self.processor([raw_image] * batch_size, return_tensors="pt").to(self.device, torch.float16)
        out = self.model.generate(**inputs, **self.generate_kwargs)
        # Decode the whole batch: one caption string per input image.
        generated_text = self.processor.batch_decode(out, skip_special_tokens=True)
        current = datetime.now()
        # Return the captions along with the wall-clock generation time.
        return [{"gen_text": generated_text, "time_elapsed": str(current - now)}]
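

# Minimal local smoke test: a sketch, not part of the Inference Endpoints
# contract. It assumes a BLIP-2 checkpoint such as "Salesforce/blip2-opt-2.7b"
# can be downloaded and that a CUDA GPU is available for the 8-bit weights.
if __name__ == "__main__":
    handler = EndpointHandler(path="Salesforce/blip2-opt-2.7b")
    result = handler({"batch_size": 2, "max_new_tokens": 32})
    print(result)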