File size: 3,122 Bytes
7c76e12
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
import gc
import json

import torch
from ts.torch_handler.base_handler import BaseHandler
from transformers import AutoModelForCausalLM, AutoTokenizer

import logging

# Module-level logger, named after this module via ``__name__``.
logger = logging.getLogger(__name__)


class TextGenerationHandlerForString(BaseHandler):
    """TorchServe handler that generates text with a causal language model.

    Each request payload is a JSON object with the fields:

    * ``text``        -- prompt string (required),
    * ``length``      -- forwarded to ``generate`` as ``max_length`` (optional),
    * ``num_samples`` -- forwarded to ``generate`` as ``num_return_sequences``
      (optional).

    The response for a request is a JSON-encoded list of decoded strings.
    """

    def __init__(self):
        super(TextGenerationHandlerForString, self).__init__()
        self.model = None
        self.tokenizer = None
        self.device = None
        # Default keyword arguments for ``generate``; filled by ``load_model``.
        self.task_config = None
        self.initialized = False

    def load_model(self, model_dir):
        """Load model and tokenizer from ``model_dir`` and move the model to ``self.device``.

        ``self.device`` must be set before calling this method. On a CUDA
        device a float32 checkpoint is converted to half precision.

        Args:
            model_dir: Directory passed to ``from_pretrained`` for both the
                model and the tokenizer.
        """
        load_kwargs = {"torch_dtype": "auto"}
        on_cuda = self.device.type == "cuda"
        if on_cuda:
            load_kwargs["low_cpu_mem_usage"] = True
        self.model = AutoModelForCausalLM.from_pretrained(model_dir, **load_kwargs)
        if on_cuda and self.model.dtype == torch.float32:
            self.model = self.model.half()
        self.tokenizer = AutoTokenizer.from_pretrained(model_dir)

        # ``task_specific_params`` may be missing, None, or lack the
        # "text-generation" entry; fall back to no defaults in every case.
        # Copy the dict so per-request overrides never leak into the model config.
        task_params = getattr(self.model.config, "task_specific_params", None) or {}
        self.task_config = dict(task_params.get("text-generation") or {})

        # TODO: Need to compare performance
        self.model.to(self.device, non_blocking=True)

    def initialize(self, ctx):
        """Select the device, load the model and mark the handler as ready.

        Args:
            ctx: Context object exposing ``manifest`` and ``system_properties``
                (the latter is read for ``model_dir`` and ``gpu_id``).
        """
        self.manifest = ctx.manifest
        properties = ctx.system_properties
        model_dir = properties.get("model_dir")
        gpu_id = properties.get("gpu_id")
        # Only build a CUDA device when a GPU id was actually assigned;
        # "cuda:None" is not a valid device string.
        if torch.cuda.is_available() and gpu_id is not None:
            self.device = torch.device("cuda:" + str(gpu_id))
        else:
            self.device = torch.device("cpu")
        self.load_model(model_dir)
        self.model.eval()
        self.initialized = True
        logger.info("Loaded model from %s on device %s", model_dir, self.device)

    @staticmethod
    def _extract_body(request):
        """Return the request payload as a dict, decoding raw JSON if needed."""
        body = request.get("body")
        if body is None:
            body = request.get("data")
        if isinstance(body, (bytes, bytearray)):
            body = body.decode("utf-8")
        if isinstance(body, str):
            body = json.loads(body)
        if not isinstance(body, dict):
            raise ValueError(
                "Request payload must be a JSON object, got "
                + type(body).__name__
            )
        return body

    def preprocess(self, requests):
        """Extract the generation inputs from ``requests``.

        Args:
            requests: List of request dicts whose payload is stored under
                ``"body"`` (or ``"data"``).

        Returns:
            A dict with keys ``input_text``, ``num_samples`` and ``length``
            taken from the last request in ``requests``, or an empty dict when
            ``requests`` is empty. ``handle`` calls this with one request at a
            time.

        Raises:
            ValueError: If a payload is not a JSON object or has no ``text``.
        """
        input_batch = {}
        for request in requests:
            body = self._extract_body(request)
            text = body.get("text")
            if text is None:
                raise ValueError('Request payload must contain a "text" field')
            input_batch = {
                "input_text": text,
                "num_samples": body.get("num_samples"),
                "length": body.get("length"),
            }
        return input_batch

    def inference(self, input_batch):
        """Run ``generate`` for one preprocessed request.

        Args:
            input_batch: Dict as returned by ``preprocess``.

        Returns:
            The tensor of generated token ids returned by ``model.generate``.
        """
        input_text = input_batch["input_text"]
        length = input_batch.get("length")
        num_samples = input_batch.get("num_samples")

        # Start from the model's task defaults and override only the values
        # the request actually supplied; the shared defaults stay untouched.
        generate_kwargs = dict(self.task_config)
        if length is not None:
            generate_kwargs["max_length"] = length
        if num_samples is not None:
            generate_kwargs["num_return_sequences"] = num_samples

        input_ids = self.tokenizer.encode(input_text, return_tensors="pt").to(
            self.device
        )
        inference_output = self.model.generate(input_ids, **generate_kwargs)
        if self.device.type == "cuda":
            # Return cached blocks to the driver between requests.
            torch.cuda.empty_cache()
        return inference_output

    def postprocess(self, inference_output):
        """Decode generated token ids into a single JSON response.

        Args:
            inference_output: Tensor of token ids produced by ``inference``.

        Returns:
            A one-element list holding the JSON-encoded list of decoded
            strings (special tokens skipped, non-ASCII characters kept as-is).
        """
        output = self.tokenizer.batch_decode(
            inference_output.tolist(), skip_special_tokens=True
        )
        return [json.dumps(output, ensure_ascii=False)]

    def handle(self, data, context):
        """Entry point: run preprocess, inference and postprocess per request.

        Every request in ``data`` is processed independently so that the
        returned list contains exactly one response per request, in order.

        Args:
            data: List of request dicts.
            context: Serving context; stored on ``self.context``.

        Returns:
            List of JSON strings, one per request.
        """
        self.context = context
        if not data:
            return []
        responses = []
        for request in data:
            model_input = self.preprocess([request])
            generated = self.inference(model_input)
            responses.extend(self.postprocess(generated))
        return responses