import logging
from datetime import datetime
from typing import Any, Dict, List

import requests
import torch
from PIL import Image
from transformers import Blip2Processor, Blip2ForConditionalGeneration

logging.basicConfig(format='%(levelname)s:%(message)s', level=logging.DEBUG)


class EndpointHandler():

    def __init__(self, path=""):
        # Load the BLIP-2 processor and model from the repository path. The
        # model is loaded in 8-bit (requires the bitsandbytes package), and
        # device_map="auto" lets accelerate place the weights across available
        # devices, so no explicit .to(device) call is needed afterwards.
        self.processor = Blip2Processor.from_pretrained(path)
        self.model = Blip2ForConditionalGeneration.from_pretrained(
            path, load_in_8bit=True, device_map="auto"
        )

        self.device = "cuda" if torch.cuda.is_available() else "cpu"
        logging.info('Model loaded on device: %s', self.device)
    
    def __call__(self, data: Dict[str, Any]) -> List[Dict[str, Any]]:
        """
        data args:
            inputs (:obj:`str` | `PIL.Image` | `np.array`)
            kwargs
        Return:
            A :obj:`list` | `dict`: will be serialized and returned
        """

        # Prepare the inputs. "batch_size" controls how many copies of the demo
        # image are captioned in one batch; default to 1 if the payload omits it.
        # (The original default of data.pop("batch_size", data) returned the whole
        # dict when the key was missing, which would crash the list replication below.)
        batch_size = data.pop("batch_size", 1)
        
        # Fetch a fixed demo image; this handler benchmarks captioning latency
        # for a given batch size rather than reading an image from the payload.
        img_url = 'https://storage.googleapis.com/sfr-vision-language-research/BLIP/demo.jpg'

        now = datetime.now()

        raw_image = Image.open(requests.get(img_url, stream=True).raw).convert('RGB')

        # For visual question answering, a question string can be passed to the
        # processor alongside the image, e.g.:
        #   inputs = self.processor(raw_image, "how many dogs are in the picture?", return_tensors="pt").to("cuda")

        # Replicate the image batch_size times and run unconditional captioning.
        # The 8-bit model requires a CUDA device, so the inputs go to "cuda" too.
        inputs = self.processor([raw_image] * batch_size, return_tensors="pt").to("cuda", torch.float16)

        out = self.model.generate(**inputs)

        # Decode every sequence in the batch, not just the first one.
        generated_text = self.processor.batch_decode(out, skip_special_tokens=True)

        current = datetime.now()

        return [{"gen_text": generated_text, "time_elapsed": str(current - now)}]
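

# A minimal local smoke test: a sketch, not part of the Inference Endpoints
# contract. Endpoints construct EndpointHandler(path) once at startup and call
# it with the deserialized request body. The checkpoint name below is an
# assumption; any BLIP-2 repo such as "Salesforce/blip2-opt-2.7b" should work
# on a CUDA machine with bitsandbytes and accelerate installed.
if __name__ == "__main__":
    handler = EndpointHandler(path="Salesforce/blip2-opt-2.7b")  # hypothetical checkpoint
    result = handler({"batch_size": 2})
    print(result)  # e.g. [{"gen_text": [...], "time_elapsed": "0:00:01.234567"}]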