dtorres-zAgile committed
Commit 0be3778 · 1 Parent(s): b28323d

Upload 25 files
__init__.py ADDED
File without changes
__script_info__.json ADDED
@@ -0,0 +1,8 @@
+ {
+   "FileName": "__script_info__.json",
+   "Framework": "huggingface",
+   "ModelId": "none",
+   "Scope": "inference",
+   "Task": "textgeneration1",
+   "Version": "1.1.2"
+ }
all_results.json ADDED
@@ -0,0 +1,14 @@
+ {
+   "epoch": 100.0,
+   "eval_loss": 4.5546875,
+   "eval_runtime": 2.8087,
+   "eval_samples": 11,
+   "eval_samples_per_second": 3.916,
+   "eval_steps_per_second": 0.712,
+   "perplexity": 95.0770390179677,
+   "train_loss": 0.16138767729202907,
+   "train_runtime": 5518.2957,
+   "train_samples": 44,
+   "train_samples_per_second": 0.797,
+   "train_steps_per_second": 0.054
+ }
bloom_176b.py ADDED
@@ -0,0 +1,146 @@
+ import gc
+ import logging
+ import os
+ import traceback
+ from typing import Dict
+ from typing import List
+ from typing import Optional
+ from typing import Union
+
+ import deepspeed
+ import torch
+ from constants import constants
+ from djl_python.inputs import Input
+ from djl_python.outputs import Output
+ from hp_validation import _update_num_beams
+ from hp_validation import _validate_payload
+ from input import process_input
+ from transformers import AutoConfig
+ from transformers import AutoModelForCausalLM
+ from transformers import AutoTokenizer
+ from transformers import PreTrainedModel
+ from transformers import PreTrainedTokenizer
+
+
+ TORCH_DTYPE_FROM_STR_MAPPING = {constants.INT8: torch.int8, constants.FP16: torch.float16}
+
+
+ class Service(object):
+     """Define Service class for text generation.
+
+     This class aims to have customized model loading and inference methods.
+     """
+
+     def __init__(self):
+         """Initialize model and tokenizer"""
+         self.model: PreTrainedModel = None
+         self.tokenizer: PreTrainedTokenizer = None
+
+     def load_model(self, properties: dict) -> None:
+         """Load the Bloom or BloomZ 176B model and tokenizer from disk into memory as instance attributes.
+
+         Args:
+             properties (dict): a python dictionary of model parameters.
+         """
+         tensor_parallel = properties[constants.TENSOR_PARALLEL_DEGREE]
+         deepspeed.init_distributed(constants.NCCL)
+
+         model_location = properties[constants.MODEL_DIR]
+         if constants.MODEL_ID in properties:
+             model_location = properties[constants.MODEL_ID]
+         logging.info(f"Loading model from disk at '{model_location}'.")
+
+         curr_pid = os.getpid()
+         logging.info(f"Tensor_parallel={tensor_parallel}::curr_pid={curr_pid}::")
+
+         tokenizer = AutoTokenizer.from_pretrained(model_location)
+         config = AutoConfig.from_pretrained(model_location)
+         # Construct model with fake meta tensors, later will be replaced during ds-inference ckpt load
+         with deepspeed.OnDevice(dtype=torch.float16, device=constants.META):
+             model = AutoModelForCausalLM.from_config(config, torch_dtype=torch.float16)
+         model = model.eval()
+         torch.cuda.empty_cache()
+
+         # Deepspeed-Inference Loading
+         repo_root = model_location
+         # tp presharded repos come with their own checkpoints config file
+         checkpoints_json = os.path.join(repo_root, constants.DS_INFERENCE_CONFIG_FILE)
+         model = deepspeed.init_inference(
+             model,
+             tensor_parallel={
+                 constants.TP_SIZE: tensor_parallel,
+                 constants.ENABLED: True,
+                 constants.MPU: None,
+                 constants.TP_GROUP: None,
+             },
+             base_dir=repo_root,
+             dtype=TORCH_DTYPE_FROM_STR_MAPPING[properties.get("dtype")],
+             max_tokens=constants.MAX_TOKEN_DS_INIT,
+             checkpoint=checkpoints_json,
+             replace_with_kernel_inject=True,
+         )
+         torch.cuda.empty_cache()
+         deepspeed.runtime.utils.see_memory_usage(constants.POST_DS_INFERENCE_INIT, force=True)
+         model = model.module
+         self.model = model
+         self.tokenizer = tokenizer
+
+     def inference(self, inputs) -> Union[List[Dict[str, List]], List[List[Dict[str, List]]]]:
+         """Conduct inference based on inputs.
+
+         Args:
+             inputs (djl_python.inputs.Input): input containing payload and content type.
+         Returns:
+             results (Union[List[Dict[str, List]], List[List[Dict[str, List]]]]): if the input is a single
+             string, the output is a list of dictionaries, where the number of dictionaries corresponds to the
+             number of return sequences; if the input contains more than one string (i.e., batch inference), the
+             output is a list of lists, where each inner list contains one or more dictionaries.
+         """
+         try:
+             input_data, model_kwargs = process_input(inputs=inputs, text_input_for_bloom_model=True)
+         except Exception as e:
+             logging.exception(f"Failed to do inference: {e}; {traceback.format_exc()}")
+             results = Output().error((str(e)))
+             return results
+
+         input_tokens = self.tokenizer.batch_encode_plus(
+             input_data, return_tensors=constants.RETURN_TENSOR_TYPE, padding=True
+         )
+         for t in input_tokens:
+             if torch.is_tensor(input_tokens[t]):
+                 input_tokens[t] = input_tokens[t].to(torch.cuda.current_device())
+         outputs = self.model.generate(**input_tokens, **model_kwargs)
+         outputs: List[str] = self.tokenizer.batch_decode(outputs, skip_special_tokens=True)
+
+         if len(outputs) > 1:
+             results: List[List[Dict[str, List]]] = []
+             num_return_seq = model_kwargs.get(constants.NUM_RETURN_SEQUENCES, 1)
+             for i in range(0, len(outputs), num_return_seq):
+                 res_tmp: List[dict] = []
+                 for j in range(i, i + num_return_seq):
+                     res_tmp.append({constants.GENERATED_TEXT: outputs[j]})
+                 results.append(res_tmp)
+         else:
+             results = [{constants.GENERATED_TEXT: outputs}]
+         return results
+
+
+ _service = Service()
+
+
+ def handle(inputs: Input) -> Optional[Output]:
+     """Define handler method for Bloom 176B model.
+
+     Args:
+         inputs (djl_python.inputs.Input): input containing payload and content type.
+     Returns:
+         outputs (djl_python.inputs.Output): model prediction output.
+     """
+
+     if inputs.is_empty():
+         # Model server makes an empty call to warmup the model on startup
+         _service.load_model(inputs.get_properties())
+         return None
+
+     results = _service.inference(inputs)
+     return Output().add(results)
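
For reference, a sketch of the two response shapes the `inference` method above can return (the values are illustrative, and the `generated_text` key is assumed to be the value of `constants.GENERATED_TEXT`):

# Single prompt: one list entry whose value is the list of decoded sequences.
single_prompt_response = [{"generated_text": ["<prompt and continuation>"]}]

# Batch of two prompts with num_return_sequences=2: one inner list per prompt,
# one dictionary per returned sequence.
batch_response = [
    [{"generated_text": "prompt 1, sequence 1"}, {"generated_text": "prompt 1, sequence 2"}],
    [{"generated_text": "prompt 2, sequence 1"}, {"generated_text": "prompt 2, sequence 2"}],
]
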
config.json ADDED
@@ -0,0 +1,25 @@
+ {
+   "_name_or_path": "/tmp",
+   "architectures": [
+     "GPTNeoXForCausalLM"
+   ],
+   "bos_token_id": 0,
+   "eos_token_id": 0,
+   "hidden_act": "gelu",
+   "hidden_size": 2560,
+   "initializer_range": 0.02,
+   "intermediate_size": 10240,
+   "layer_norm_eps": 1e-05,
+   "max_position_embeddings": 2048,
+   "model_type": "gpt_neox",
+   "num_attention_heads": 32,
+   "num_hidden_layers": 32,
+   "rotary_emb_base": 10000,
+   "rotary_pct": 1.0,
+   "tie_word_embeddings": false,
+   "torch_dtype": "float16",
+   "transformers_version": "4.28.1",
+   "use_cache": false,
+   "use_parallel_residual": false,
+   "vocab_size": 50277
+ }
eval_results.json ADDED
@@ -0,0 +1,9 @@
+ {
+   "epoch": 100.0,
+   "eval_loss": 4.5546875,
+   "eval_runtime": 2.8087,
+   "eval_samples": 11,
+   "eval_samples_per_second": 3.916,
+   "eval_steps_per_second": 0.712,
+   "perplexity": 95.0770390179677
+ }
extra_requirements.txt ADDED
@@ -0,0 +1 @@
+ huggingface/textgeneration1/flash_attn/v1.0.0/flash_attn-1.0.5-cp39-cp39-linux_x86_64.whl
generation_config.json ADDED
@@ -0,0 +1,6 @@
+ {
+   "_from_model_config": true,
+   "bos_token_id": 0,
+   "eos_token_id": 0,
+   "transformers_version": "4.28.1"
+ }
hp_validation.py ADDED
@@ -0,0 +1,162 @@
+ import logging
+ from typing import Any
+ from typing import Dict
+ from typing import Union
+
+
+ logging.basicConfig(level=logging.INFO)
+
+
+ # Possible model parameters
+ TEXT_INPUTS = "text_inputs"
+ MAX_LENGTH = "max_length"
+ NUM_RETURN_SEQUENCES = "num_return_sequences"
+ NUM_BEAMS = "num_beams"
+ TOP_P = "top_p"
+ EARLY_STOPPING = "early_stopping"
+ DO_SAMPLE = "do_sample"
+ NO_REPEAT_NGRAM_SIZE = "no_repeat_ngram_size"
+ TOP_K = "top_k"
+ TEMPERATURE = "temperature"
+ SEED = "seed"
+ MIN_LENGTH = "min_length"
+ MIN_NEW_TOKENS = "min_new_tokens"
+ MAX_NEW_TOKENS = "max_new_tokens"
+ LENGTH_PENALTY = "length_penalty"
+ MAX_TIME = "max_time"
+ RETURN_FULL_TEXT = "return_full_text"
+ STOPPING_CRITERIA = "stopping_criteria"
+
+ ALL_PARAM_NAMES = [
+     TEXT_INPUTS,
+     MAX_LENGTH,
+     NUM_RETURN_SEQUENCES,
+     NUM_BEAMS,
+     TOP_P,
+     EARLY_STOPPING,
+     DO_SAMPLE,
+     NO_REPEAT_NGRAM_SIZE,
+     TOP_K,
+     TEMPERATURE,
+     SEED,
+     MIN_LENGTH,
+     MAX_NEW_TOKENS,
+     MIN_NEW_TOKENS,
+     LENGTH_PENALTY,
+     MAX_TIME,
+     RETURN_FULL_TEXT,
+     STOPPING_CRITERIA,
+ ]
+
+
+ # Model parameter ranges
+ LENGTH_MIN = 1
+ NUM_RETURN_SEQUENCE_MIN = 1
+ NUM_BEAMS_MIN = 1
+ TOP_P_MIN = 0
+ TOP_P_MAX = 1
+ NO_REPEAT_NGRAM_SIZE_MIN = 1
+ TOP_K_MIN = 0
+ TEMPERATURE_MIN = 0
+ NEW_TOKENS_MIN = 0
+
+
+ def is_list_of_strings(parameter: Any) -> bool:
+     """Return True if the parameter is a list of strings."""
+     if parameter and isinstance(parameter, list):
+         return all(isinstance(elem, str) for elem in parameter)
+     else:
+         return False
+
+
+ def _validate_payload(payload: Dict[str, Any]) -> None:
+     """Validate the parameters in the input payload.
+
+     Checks if max_length, num_return_sequences, num_beams, top_p and temperature are in bounds.
+     Checks if do_sample is boolean.
+     Checks if max_length, num_return_sequences, num_beams and seed are integers.
+
+     Args:
+         payload: a decoded input payload (dictionary of input parameter and values)
+
+     Raises: ValueError if any of the checks fails.
+     """
+     # For all parameters used in the text generation task, please see
+     # https://huggingface.co/docs/transformers/main_classes/text_generation#transformers.GenerationConfig
+     for param_name in payload:
+         if param_name not in ALL_PARAM_NAMES:
+             raise ValueError(f"Input payload contains an invalid key '{param_name}'. Valid keys are {ALL_PARAM_NAMES}.")
+
+     if TEXT_INPUTS not in payload:
+         raise ValueError(f"Input payload must contain {TEXT_INPUTS} key.")
+
+     for param_name in [MAX_LENGTH, NUM_RETURN_SEQUENCES, NUM_BEAMS, SEED]:
+         if param_name in payload:
+             if type(payload[param_name]) != int:
+                 raise ValueError(f"{param_name} must be an integer, got {payload[param_name]}.")
+
+     if MAX_LENGTH in payload:
+         if payload[MAX_LENGTH] < LENGTH_MIN:
+             raise ValueError(f"{MAX_LENGTH} must be at least {LENGTH_MIN}, got {payload[MAX_LENGTH]}.")
+
+     if MIN_LENGTH in payload:
+         if payload[MIN_LENGTH] < LENGTH_MIN:
+             raise ValueError(f"{MIN_LENGTH} must be at least {LENGTH_MIN}, got {payload[MIN_LENGTH]}.")
+
+     if MAX_NEW_TOKENS in payload:
+         if payload[MAX_NEW_TOKENS] < NEW_TOKENS_MIN:
+             raise ValueError(f"{MAX_NEW_TOKENS} must be at least {NEW_TOKENS_MIN}, got {payload[MAX_NEW_TOKENS]}.")
+
+     if MIN_NEW_TOKENS in payload:
+         if payload[MIN_NEW_TOKENS] < NEW_TOKENS_MIN:
+             raise ValueError(f"{MIN_NEW_TOKENS} must be at least {NEW_TOKENS_MIN}, got {payload[MIN_NEW_TOKENS]}.")
+
+     if NUM_RETURN_SEQUENCES in payload:
+         if payload[NUM_RETURN_SEQUENCES] < NUM_RETURN_SEQUENCE_MIN:
+             raise ValueError(
+                 f"{NUM_RETURN_SEQUENCES} must be at least {NUM_RETURN_SEQUENCE_MIN}, "
+                 f"got {payload[NUM_RETURN_SEQUENCES]}."
+             )
+
+     if NUM_BEAMS in payload:
+         if payload[NUM_BEAMS] < NUM_BEAMS_MIN:
+             raise ValueError(f"{NUM_BEAMS} must be at least {NUM_BEAMS_MIN}, got {payload[NUM_BEAMS]}.")
+
+     if NUM_RETURN_SEQUENCES in payload and NUM_BEAMS in payload:
+         if payload[NUM_RETURN_SEQUENCES] > payload[NUM_BEAMS]:
+             raise ValueError(
+                 f"{NUM_BEAMS} must be at least {NUM_RETURN_SEQUENCES}. Instead got "
+                 f"{NUM_BEAMS}={payload[NUM_BEAMS]} and {NUM_RETURN_SEQUENCES}="
+                 f"{payload[NUM_RETURN_SEQUENCES]}."
+             )
+
+     if TOP_P in payload:
+         if payload[TOP_P] < TOP_P_MIN or payload[TOP_P] > TOP_P_MAX:
+             raise ValueError(f"{TOP_P} must be in range [{TOP_P_MIN},{TOP_P_MAX}], got " f"{payload[TOP_P]}")
+
+     if TEMPERATURE in payload:
+         if payload[TEMPERATURE] < TEMPERATURE_MIN:
+             raise ValueError(
+                 f"{TEMPERATURE} must be a float with value at least {TEMPERATURE_MIN}, got " f"{payload[TEMPERATURE]}."
+             )
+
+     if DO_SAMPLE in payload:
+         if type(payload[DO_SAMPLE]) != bool:
+             raise ValueError(f"{DO_SAMPLE} must be a boolean, got {payload[DO_SAMPLE]}.")
+
+     if STOPPING_CRITERIA in payload and not is_list_of_strings(payload[STOPPING_CRITERIA]):
+         raise ValueError(f"{STOPPING_CRITERIA} must be a list of strings, got {payload[STOPPING_CRITERIA]}")
+
+
+ def _update_num_beams(payload: Dict[str, Union[str, float, int]]) -> Dict[str, Union[str, float, int]]:
+     """Add num_beams to the payload if it is missing and num_return_sequences is present.
+
+     Args:
+         payload (Dict): dictionary of input text and parameters
+     Returns:
+         payload (Dict): payload with number of beams updated
+     """
+
+     if NUM_RETURN_SEQUENCES in payload and NUM_BEAMS not in payload:
+         payload[NUM_BEAMS] = payload[NUM_RETURN_SEQUENCES]
+     return payload
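
A minimal sketch of how a request payload flows through the two helpers above (assuming this module is importable as `hp_validation`; the parameter values are illustrative only):

from hp_validation import _update_num_beams, _validate_payload

payload = {
    "text_inputs": "Write a short poem about the ocean.",
    "max_new_tokens": 64,
    "num_return_sequences": 2,
    "do_sample": True,
    "top_p": 0.9,
}

_validate_payload(payload)            # raises ValueError if a key is unknown or a value is out of bounds
payload = _update_num_beams(payload)  # copies num_return_sequences into num_beams when num_beams is absent
print(payload["num_beams"])           # 2
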
inference.py ADDED
@@ -0,0 +1,62 @@
+ import logging
+ from typing import Any
+ from typing import Dict
+ from typing import Optional
+ from typing import Union
+
+ from djl_python.deepspeed import DeepSpeedService
+ from djl_python.inputs import Input
+ from djl_python.outputs import Output
+ from hp_validation import _update_num_beams
+ from hp_validation import _validate_payload
+ from input import process_input
+ from transformers import set_seed
+
+
+ class DeepSpeedServiceTextGeneration(DeepSpeedService):
+     """Define subclass DeepSpeedServiceTextGeneration.
+
+     This class aims to have a customized inference function.
+     For the definition of DeepSpeedService, see
+     https://github.com/deepjavalibrary/djl-serving/blob/master/engines/python/setup/djl_python/deepspeed.py
+     """
+
+     def inference(self, inputs: Input) -> Output:
+         """Define customized inference method to have hyperparameter validation for the text generation task.
+
+         Args:
+             inputs (djl_python.inputs.Input): input containing payload and content type.
+         Returns:
+             outputs (djl_python.inputs.Output): model prediction output.
+         """
+
+         try:
+             input_data, model_kwargs = process_input(inputs=inputs, text_input_for_bloom_model=False)
+             result = self.pipeline(input_data, **model_kwargs)
+             outputs = Output()
+             outputs.add(result)
+         except Exception as e:
+             logging.exception("Failed to do inference")
+             outputs = Output().error((str(e)))
+         return outputs
+
+
+ _service = DeepSpeedServiceTextGeneration()
+
+
+ def handle(inputs: Input) -> Optional[Output]:
+     """Define handler method for the text generation service.
+
+     Args:
+         inputs (djl_python.inputs.Input): input containing payload and content type.
+     Returns:
+         outputs (djl_python.inputs.Output): model prediction output.
+     """
+
+     if not _service.initialized:
+         _service.initialize(inputs.get_properties())
+
+     if inputs.is_empty():
+         return None
+
+     return _service.inference(inputs)
inference_helper.py ADDED
@@ -0,0 +1,53 @@
+ from typing import Dict
+ from typing import List
+ from typing import Union
+
+ import torch
+ from constants import constants
+ from djl_python import Input
+ from djl_python import Output
+ from transformers import AutoModelForCausalLM
+ from transformers import AutoTokenizer
+
+
+ def inference_helper_model_tokenizer(
+     input_data: List, model: AutoModelForCausalLM, tokenizer: AutoTokenizer, content_type: str, model_kwargs: Dict
+ ) -> Union[List[Dict[str, List]], List[List[Dict[str, List]]]]:
+     """Conduct inference based on inputs.
+
+     Args:
+         input_data (list): a python list of strings.
+         model (AutoModelForCausalLM): model for doing the inference
+         tokenizer (AutoTokenizer): tokenizer for the inference
+         content_type (str): request content type
+         model_kwargs (dict): a python dictionary of parameters.
+
+     Returns:
+         results (Union[List[Dict[str, List]], List[List[Dict[str, List]]]]): if the input list has length one, the
+         output is a list of dictionaries, where the number of dictionaries corresponds to the number of return
+         sequences; if the input list has length greater than one (i.e., batch inference), the output is a list of
+         lists, where each inner list contains one or more dictionaries.
+     """
+
+     input_tokens = tokenizer.batch_encode_plus(input_data, return_tensors=constants.RETURN_TENSOR_TYPE, padding=True)
+     for t in input_tokens:
+         if torch.is_tensor(input_tokens[t]):
+             input_tokens[t] = input_tokens[t].to(torch.cuda.current_device())
+     outputs = model.generate(**input_tokens, **model_kwargs)
+     outputs: List[str] = tokenizer.batch_decode(outputs, skip_special_tokens=True)
+
+     if len(outputs) > 1:
+         results: List[List[Dict[str, List]]] = []
+         num_return_seq = model_kwargs.get(constants.NUM_RETURN_SEQUENCES, 1)
+         for i in range(0, len(outputs), num_return_seq):
+             res_tmp: List[dict] = []
+             for j in range(i, min(i + num_return_seq, len(outputs))):
+                 res_tmp.append({constants.GENERATED_TEXT: outputs[j]})
+             results.append(res_tmp)
+     else:
+         if content_type == constants.APPLICATION_X_TEXT:
+             results = [{constants.GENERATED_TEXT: outputs}]
+         else:
+             results = [[{constants.GENERATED_TEXT: outputs}]]
+
+     return results
inference_mpt.py ADDED
@@ -0,0 +1,104 @@
+ import os
+ from pathlib import Path
+ from typing import Any
+ from typing import Dict
+
+ import torch
+ from constants import constants
+ from djl_python.inputs import Input
+ from djl_python.outputs import Output
+ from sagemaker_jumpstart_huggingface_script_utilities.djl_python.dtypes import get_torch_dtype_from_str
+ from sagemaker_jumpstart_huggingface_script_utilities.djl_python.handle import create_handle
+ from sagemaker_jumpstart_huggingface_script_utilities.djl_python.inference.textgeneration import format_djl_output
+ from sagemaker_jumpstart_huggingface_script_utilities.djl_python.inference.textgeneration import generate_text
+ from sagemaker_jumpstart_huggingface_script_utilities.djl_python.inference.textgeneration import (
+     model_output_to_batch_output,
+ )
+ from sagemaker_jumpstart_huggingface_script_utilities.djl_python.inference.textgeneration import process_input
+ from sagemaker_jumpstart_huggingface_script_utilities.payload.stopping_criteria import (
+     add_stopping_criteria_to_model_kwargs,
+ )
+ from sagemaker_jumpstart_script_utilities.subprocess import run_with_error_handling
+ from transformers import AutoConfig
+ from transformers import AutoModelForCausalLM
+ from transformers import AutoTokenizer
+
+
+ FLASH_ATTENTION_WHEEL_FILENAME = "flash_attn-1.0.5-cp39-cp39-linux_x86_64.whl"
+ MAXIMUM_INPUT_SEQUENCE_LENGTH = "MAXIMUM_INPUT_SEQUENCE_LENGTH"
+ DTYPE = "dtype"
+ MAX_SEQ_LEN = "max_seq_len"
+ ATTN_IMPL = "attn_impl"
+ TRITON = "triton"
+ LIB = "lib"
+ EXTRA_DEPENDENCIES = "extra_dependencies"
+ EXTRA_DEPENDENCIES_PATH = Path(__file__).parent / LIB / EXTRA_DEPENDENCIES
+
+ # As of the time of this script creation, the maintainers of the flash-attn package do not distribute a bdist. To
+ # avoid an approximately 10 minute source build, a pre-built wheel is hosted as an extra dependency.
+ run_with_error_handling(["pip", "install", str(EXTRA_DEPENDENCIES_PATH / FLASH_ATTENTION_WHEEL_FILENAME)])
+
+
+ class MptPythonServiceTextGeneration:
+     """A service object for the MPT model family using the DJL Python engine.
+
+     This set of MPT models is not compatible with the T4 GPUs on the G4 instance family because some Triton kernels
+     are not compatible with architectures older than Ampere
+     ([reference](https://github.com/microsoft/DeepSpeed-MII/issues/170#issuecomment-1526277566)).
+
+     This set of MPT models uses the [FlashAttention](https://arxiv.org/pdf/2205.14135.pdf) mechanism with an
+     implementation that requires the Python environment to include the following libraries:
+     - [flash-attn](https://github.com/HazyResearch/flash-attention)
+     - [triton](https://github.com/openai/triton)
+     - [einops](https://github.com/arogozhnikov/einops)
+     """
+
+     def __init__(self) -> None:
+         """Set initialization flag to False; the model is initialized upon invocation of the initialize method."""
+         self.initialized = False
+
+     def initialize(self, properties: Dict[str, Any]) -> None:
+         """Initialize the MPT model and tokenizer.
+
+         This model contains custom Python scripts that are not yet included in the transformers library. Therefore,
+         the non-standard `trust_remote_code=True` must be set when loading the config and the model. The referenced
+         remote code exists within the model artifact, which has been manually inspected for malicious code,
+         downloaded with commit revision hash checking, and prepackaged within the script tarball. Therefore, the
+         "remote" code is hosted on S3 via the SageMaker JumpStart deployment pipeline. This script will not perform
+         requests to the public internet. Due to these steps and the fact that SageMaker JumpStart deploys models with
+         network isolation enabled by default, these scripts are deemed safe to enable the `trust_remote_code=True` flag.
+
+         Additionally, the accelerate library currently does not support the MPT model type.
+         """
+         model_dir = properties[constants.MODEL_DIR]
+
+         # View docstring for details on this `trust_remote_code=True` flag.
+         config = AutoConfig.from_pretrained(model_dir, trust_remote_code=True)
+         config.attn_config[ATTN_IMPL] = TRITON
+         maximum_sequence_length = os.environ.get(MAXIMUM_INPUT_SEQUENCE_LENGTH)
+         if maximum_sequence_length is not None:
+             config.update({MAX_SEQ_LEN: int(maximum_sequence_length)})
+
+         torch_dtype = get_torch_dtype_from_str(properties.get(DTYPE))
+
+         self.model = AutoModelForCausalLM.from_pretrained(
+             model_dir, config=config, torch_dtype=torch_dtype, trust_remote_code=True
+         )
+         self.model.to(device="cuda:0")
+
+         self.tokenizer = AutoTokenizer.from_pretrained(model_dir)
+         self.tokenizer.pad_token = self.tokenizer.eos_token
+
+         self.initialized = True
+
+     @format_djl_output
+     def inference(self, inputs: Input) -> Output:
+         """Perform inference for the text generation task."""
+         input_data, model_kwargs = process_input(inputs, input_data_as_list=False)
+         model_kwargs = add_stopping_criteria_to_model_kwargs(model_kwargs, self.tokenizer)
+         model_output = generate_text(self.model, self.tokenizer, input_data, model_kwargs)
+         return model_output_to_batch_output(model_output, model_kwargs)
+
+
+ _service = MptPythonServiceTextGeneration()
+ handle = create_handle(_service)
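
A minimal sketch of the two configuration knobs `initialize` applies before the model is built: the Triton attention implementation and the optional sequence-length override (the model directory below is a hypothetical local MPT artifact that ships its custom modeling code):

import os

from transformers import AutoConfig

model_dir = "/opt/ml/model"  # hypothetical path to an MPT artifact containing its remote-code files

config = AutoConfig.from_pretrained(model_dir, trust_remote_code=True)
config.attn_config["attn_impl"] = "triton"  # requires flash-attn, triton, and einops at runtime

max_len = os.environ.get("MAXIMUM_INPUT_SEQUENCE_LENGTH")
if max_len is not None:
    config.update({"max_seq_len": int(max_len)})  # cap the input length before model construction
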
input.py ADDED
@@ -0,0 +1,76 @@
+ import logging
+ from typing import Dict
+ from typing import List
+ from typing import Union
+
+ from constants import constants
+ from djl_python.inputs import Input
+ from hp_validation import _update_num_beams
+ from hp_validation import _validate_payload
+ from transformers import set_seed
+
+
+ def format_input_for_task(input_values: Union[str, List[str]]) -> List[str]:
+     """Format an input string into a list for the text generation task.
+
+     Args:
+         input_values: either a text string or a list of text strings.
+     Returns:
+         input_values (list): list of text strings.
+     """
+     if not isinstance(input_values, list):
+         input_values = [input_values]
+     return input_values
+
+
+ def process_input(inputs: Input, text_input_for_bloom_model: bool = True) -> Union[List, Dict]:
+     """Process input based on content type.
+
+     Parse the input based on a Content-Type of application/json or application/x-text.
+
+     Args:
+         inputs (djl_python.inputs.Input): input containing payload and content type.
+         text_input_for_bloom_model: whether the text string needs special handling for the bloom model
+     Returns:
+         input_data (list): a python list of strings.
+         model_kwargs (dict): a python dictionary of parameters.
+     """
+     content_type = inputs.get_property("Content-Type")
+     model_kwargs = {}
+     if content_type == constants.APPLICATION_JSON:
+         try:
+             json_input = inputs.get_as_json()
+
+             _validate_payload(json_input)
+             json_input = _update_num_beams(json_input)
+             if constants.SEED in json_input:
+                 set_seed(json_input[constants.SEED])
+                 del json_input[constants.SEED]
+
+             if isinstance(json_input, dict):
+                 input_data = format_input_for_task(json_input.pop(constants.TEXT_INPUTS))
+                 model_kwargs = json_input
+             else:
+                 input_data = json_input
+         except Exception:
+             logging.exception(
+                 f"Failed to parse input payload. For content_type={constants.APPLICATION_JSON}, input "
+                 f"payload must be a json encoded dictionary with keys {constants.ALL_PARAM_NAMES}."
+             )
+             raise
+     elif content_type == constants.APPLICATION_X_TEXT:
+         try:
+             if text_input_for_bloom_model:
+                 input_data = format_input_for_task(inputs.get_as_string())
+             else:
+                 input_data = inputs.get_as_string()
+         except Exception:
+             logging.exception(
+                 f"Failed to parse input payload. For content_type={constants.APPLICATION_X_TEXT}, input "
+                 f"payload must be a string encoded in utf-8 format."
+             )
+             raise
+     else:
+         raise ValueError('{{"error": "unsupported content type {}"}}'.format(content_type or "unknown"))
+
+     return input_data, model_kwargs
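
For reference, a sketch of the two request shapes `process_input` accepts and what it would return for each (the values are illustrative; the bodies below are what a client would send, not calls into djl_python):

# Content-Type: application/json -> text_inputs is split out, everything else becomes model_kwargs.
json_request = {
    "text_inputs": ["Tell me a story.", "Summarize this paragraph."],
    "max_new_tokens": 32,
    "temperature": 0.7,
}
# expected: input_data == ["Tell me a story.", "Summarize this paragraph."]
#           model_kwargs == {"max_new_tokens": 32, "temperature": 0.7}

# Content-Type: application/x-text -> the raw UTF-8 string is wrapped in a single-element list
# when text_input_for_bloom_model=True, and model_kwargs stays empty.
text_request = "Tell me a story."
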
pytorch_model.bin ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:63818b91e6ae42613ead2d886f433c31f8996fd5c33c0b37e158a181bab3bb23
+ size 5684519513
red_pajama.py ADDED
@@ -0,0 +1,131 @@
+ import logging
+ import traceback
+ from typing import Any
+ from typing import Dict
+ from typing import List
+ from typing import Optional
+ from typing import Union
+
+ import torch
+ from constants import constants
+ from djl_python import Input
+ from djl_python import Output
+ from inference_helper import inference_helper_model_tokenizer
+ from input import process_input
+ from transformers import AutoModelForCausalLM
+ from transformers import AutoTokenizer
+ from transformers import PreTrainedModel
+ from transformers import PreTrainedTokenizer
+ from transformers import PreTrainedTokenizerBase
+ from transformers import StoppingCriteria
+ from transformers import StoppingCriteriaList
+
+
+ class StopWordsCriteria(StoppingCriteria):
+     """A text generation stopping criteria when the output sequence contains any specified stop words."""
+
+     def __init__(self, tokenizer: PreTrainedTokenizerBase, stop_words: List[str]) -> None:
+         """Initialize stopping criteria."""
+         self._tokenizer = tokenizer
+         self._stop_words = stop_words
+         self._partial_result = ""
+
+     def __call__(self, input_ids: torch.LongTensor, scores: torch.FloatTensor) -> bool:
+         """Return True if any stop word is present in the generated sequence."""
+         text = self._tokenizer.decode(input_ids[0, -1])
+         self._partial_result += text
+         for stop_word in self._stop_words:
+             if stop_word in self._partial_result:
+                 return True
+         return False
+
+
+ def create_stopping_criteria_list(tokenizer: PreTrainedTokenizerBase, stop_words: List[str]) -> StoppingCriteriaList:
+     """Create a StoppingCriteriaList to be used as generation payload input."""
+     stop_criteria = StopWordsCriteria(tokenizer, stop_words)
+     return StoppingCriteriaList([stop_criteria])
+
+
+ class Service(object):
+     """Define Service class for text generation.
+
+     This class aims to have customized model loading and inference methods.
+     """
+
+     def __init__(self):
+         """Initialize model and tokenizer"""
+         self.model: Optional[PreTrainedModel] = None
+         self.tokenizer: Optional[PreTrainedTokenizer] = None
+
+     def load_model(self, properties: dict) -> None:
+         """Load model and tokenizer from disk into memory as instance attributes.
+
+         Args:
+             properties (dict): a python dictionary of model parameters.
+         """
+
+         model_location = properties["model_dir"]
+         logging.info(f"Loading model from {model_location}")
+
+         tokenizer = AutoTokenizer.from_pretrained(model_location)
+         tokenizer.pad_token = tokenizer.eos_token
+         model = AutoModelForCausalLM.from_pretrained(model_location, torch_dtype=torch.float16, device_map="auto")
+         model.requires_grad_(False)
+         model.eval()
+
+         self.model = model
+         self.tokenizer = tokenizer
+
+     def modify_model_kwargs(self, model_kwargs: Dict[str, Any]) -> None:
+         """Provide script-based modification of model kwargs.
+
+         By default, this method injects stopping criteria into text generation. Since payload parameters are
+         serialized when invoking the endpoint, this method initializes any non-serializable kwargs for text generation.
+         """
+         if constants.STOPPING_CRITERIA in model_kwargs:
+             stopping_criteria = model_kwargs.pop(constants.STOPPING_CRITERIA)
+             stopping_criteria_list = create_stopping_criteria_list(self.tokenizer, stopping_criteria)
+             model_kwargs[constants.STOPPING_CRITERIA] = stopping_criteria_list
+
+     def inference(self, inputs: Input) -> Union[List[Dict[str, List]], List[List[Dict[str, List]]]]:
+         """Conduct inference based on inputs.
+
+         Args:
+             inputs (djl_python.inputs.Input): input containing payload and content type.
+         Returns:
+             results (Union[List[Dict[str, List]], List[List[Dict[str, List]]]]): if the input list has length one,
+             the output is a list of dictionaries, where the number of dictionaries corresponds to the number of
+             return sequences; if the input list has length greater than one (i.e., batch inference), the output is
+             a list of lists, where each inner list contains one or more dictionaries.
+         """
+         try:
+             input_data, model_kwargs = process_input(inputs=inputs, text_input_for_bloom_model=True)
+             self.modify_model_kwargs(model_kwargs)
+             content_type = inputs.get_property("Content-Type")
+
+             return inference_helper_model_tokenizer(input_data, self.model, self.tokenizer, content_type, model_kwargs)
+         except Exception as e:
+             logging.exception(f"Failed to do inference: {e}; {traceback.format_exc()}")
+             results = Output().error((str(e)))
+             return results
+
+
+ _service = Service()
+
+
+ def handle(inputs: Input) -> Optional[Output]:
+     """Define handler method for model.
+
+     Args:
+         inputs (djl_python.inputs.Input): input containing payload and content type.
+     Returns:
+         outputs (djl_python.inputs.Output): model prediction output.
+     """
+
+     if inputs.is_empty():
+         # Model server makes an empty call to warmup the model on startup
+         _service.load_model(inputs.get_properties())
+         return None
+
+     results = _service.inference(inputs)
+     return Output().add(results)
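
A minimal sketch of the stop-word mechanism defined above, exercised outside the serving stack (assumes the container's local modules such as constants and djl_python are importable so that red_pajama can be imported; the tokenizer and stop words are illustrative):

from transformers import AutoTokenizer

from red_pajama import create_stopping_criteria_list

tokenizer = AutoTokenizer.from_pretrained("gpt2")  # any tokenizer works for this sketch
stopping = create_stopping_criteria_list(tokenizer, ["\n\n", "<human>:"])

# Passing stopping_criteria=stopping to model.generate(...) halts generation as soon as the
# decoded continuation contains any of the stop words.
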
requirements.txt ADDED
@@ -0,0 +1,5 @@
+ /opt/ml/model/lib/einops/einops-0.6.1-py3-none-any.whl
+ /opt/ml/model/lib/ninja/ninja-1.11.1-py2.py3-none-manylinux_2_12_x86_64.manylinux2010_x86_64.whl
+ /opt/ml/model/lib/triton/triton-2.0.0.dev20221202-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.whl
+ /opt/ml/model/lib/sagemaker_jumpstart_huggingface_script_utilities/sagemaker_jumpstart_huggingface_script_utilities-1.0.4-py2.py3-none-any.whl
+ /opt/ml/model/lib/sagemaker_jumpstart_script_utilities/sagemaker_jumpstart_script_utilities-1.1.7-py2.py3-none-any.whl
script_requirements.txt ADDED
@@ -0,0 +1,8 @@
+ sagemaker_jumpstart_huggingface_script_utilities==1.0.4
+ sagemaker_jumpstart_script_utilities==1.1.7
+ einops==0.6.1 \
+     --hash=sha256:99149e46cc808956b174932fe563d920db4d6e5dadb8c6ecdaa7483b7ef7cfc3
+ triton==2.0.0.dev20221202 \
+     --hash=sha256:42b3c00ffdc6311ee60318df72a5bddccc9e45951a5837c19d19ef93c72e8186
+ ninja==1.11.1 \
+     --hash=sha256:817e2aee2a4d28a708a67bcfba1817ae502c32c6d8ef80e50d63b0f23adf3a08
serving.properties ADDED
@@ -0,0 +1,2 @@
+ engine=Python
+ option.entryPoint=red_pajama.py
special_tokens_map.json ADDED
@@ -0,0 +1,5 @@
+ {
+   "bos_token": "<|endoftext|>",
+   "eos_token": "<|endoftext|>",
+   "unk_token": "<|endoftext|>"
+ }
tokenizer.json ADDED
The diff for this file is too large to render. See raw diff
 
tokenizer_config.json ADDED
@@ -0,0 +1,9 @@
+ {
+   "add_prefix_space": false,
+   "bos_token": "<|endoftext|>",
+   "clean_up_tokenization_spaces": true,
+   "eos_token": "<|endoftext|>",
+   "model_max_length": 2048,
+   "tokenizer_class": "GPTNeoXTokenizer",
+   "unk_token": "<|endoftext|>"
+ }
train_results.json ADDED
@@ -0,0 +1,8 @@
+ {
+   "epoch": 100.0,
+   "train_loss": 0.16138767729202907,
+   "train_runtime": 5518.2957,
+   "train_samples": 44,
+   "train_samples_per_second": 0.797,
+   "train_steps_per_second": 0.054
+ }
trainer_state.json ADDED
@@ -0,0 +1,325 @@
+ {
+   "best_metric": null,
+   "best_model_checkpoint": null,
+   "epoch": 100.0,
+   "global_step": 300,
+   "is_hyper_param_search": false,
+   "is_local_process_zero": true,
+   "is_world_process_zero": true,
+   "log_history": [
+     {
+       "epoch": 3.33,
+       "learning_rate": 4.061954955173073e-06,
+       "loss": 2.0922,
+       "step": 10
+     },
+     {
+       "epoch": 6.67,
+       "learning_rate": 5.28472523771611e-06,
+       "loss": 1.5474,
+       "step": 20
+     },
+     {
+       "epoch": 6.67,
+       "eval_loss": 2.08203125,
+       "eval_runtime": 2.8721,
+       "eval_samples_per_second": 3.83,
+       "eval_steps_per_second": 0.696,
+       "step": 20
+     },
+     {
+       "epoch": 10.0,
+       "learning_rate": 6e-06,
+       "loss": 0.8877,
+       "step": 30
+     },
+     {
+       "epoch": 13.33,
+       "learning_rate": 6e-06,
+       "loss": 0.2622,
+       "step": 40
+     },
+     {
+       "epoch": 13.33,
+       "eval_loss": 2.9765625,
+       "eval_runtime": 2.8096,
+       "eval_samples_per_second": 3.915,
+       "eval_steps_per_second": 0.712,
+       "step": 40
+     },
+     {
+       "epoch": 16.67,
+       "learning_rate": 6e-06,
+       "loss": 0.0353,
+       "step": 50
+     },
+     {
+       "epoch": 20.0,
+       "learning_rate": 6e-06,
+       "loss": 0.0066,
+       "step": 60
+     },
+     {
+       "epoch": 20.0,
+       "eval_loss": 3.96484375,
+       "eval_runtime": 2.8073,
+       "eval_samples_per_second": 3.918,
+       "eval_steps_per_second": 0.712,
+       "step": 60
+     },
+     {
+       "epoch": 23.33,
+       "learning_rate": 6e-06,
+       "loss": 0.0022,
+       "step": 70
+     },
+     {
+       "epoch": 26.67,
+       "learning_rate": 6e-06,
+       "loss": 0.001,
+       "step": 80
+     },
+     {
+       "epoch": 26.67,
+       "eval_loss": 4.140625,
+       "eval_runtime": 2.8092,
+       "eval_samples_per_second": 3.916,
+       "eval_steps_per_second": 0.712,
+       "step": 80
+     },
+     {
+       "epoch": 30.0,
+       "learning_rate": 6e-06,
+       "loss": 0.0009,
+       "step": 90
+     },
+     {
+       "epoch": 33.33,
+       "learning_rate": 6e-06,
+       "loss": 0.0006,
+       "step": 100
+     },
+     {
+       "epoch": 33.33,
+       "eval_loss": 4.22265625,
+       "eval_runtime": 2.8076,
+       "eval_samples_per_second": 3.918,
+       "eval_steps_per_second": 0.712,
+       "step": 100
+     },
+     {
+       "epoch": 36.67,
+       "learning_rate": 6e-06,
+       "loss": 0.0005,
+       "step": 110
+     },
+     {
+       "epoch": 40.0,
+       "learning_rate": 6e-06,
+       "loss": 0.0004,
+       "step": 120
+     },
+     {
+       "epoch": 40.0,
+       "eval_loss": 4.2890625,
+       "eval_runtime": 2.8093,
+       "eval_samples_per_second": 3.916,
+       "eval_steps_per_second": 0.712,
+       "step": 120
+     },
+     {
+       "epoch": 43.33,
+       "learning_rate": 6e-06,
+       "loss": 0.0003,
+       "step": 130
+     },
+     {
+       "epoch": 46.67,
+       "learning_rate": 6e-06,
+       "loss": 0.0003,
+       "step": 140
+     },
+     {
+       "epoch": 46.67,
+       "eval_loss": 4.3515625,
+       "eval_runtime": 2.8091,
+       "eval_samples_per_second": 3.916,
+       "eval_steps_per_second": 0.712,
+       "step": 140
+     },
+     {
+       "epoch": 50.0,
+       "learning_rate": 6e-06,
+       "loss": 0.0003,
+       "step": 150
+     },
+     {
+       "epoch": 53.33,
+       "learning_rate": 6e-06,
+       "loss": 0.0003,
+       "step": 160
+     },
+     {
+       "epoch": 53.33,
+       "eval_loss": 4.390625,
+       "eval_runtime": 2.8088,
+       "eval_samples_per_second": 3.916,
+       "eval_steps_per_second": 0.712,
+       "step": 160
+     },
+     {
+       "epoch": 56.67,
+       "learning_rate": 6e-06,
+       "loss": 0.0003,
+       "step": 170
+     },
+     {
+       "epoch": 60.0,
+       "learning_rate": 6e-06,
+       "loss": 0.0003,
+       "step": 180
+     },
+     {
+       "epoch": 60.0,
+       "eval_loss": 4.42578125,
+       "eval_runtime": 2.808,
+       "eval_samples_per_second": 3.917,
+       "eval_steps_per_second": 0.712,
+       "step": 180
+     },
+     {
+       "epoch": 63.33,
+       "learning_rate": 6e-06,
+       "loss": 0.0002,
+       "step": 190
+     },
+     {
+       "epoch": 66.67,
+       "learning_rate": 6e-06,
+       "loss": 0.0002,
+       "step": 200
+     },
+     {
+       "epoch": 66.67,
+       "eval_loss": 4.453125,
+       "eval_runtime": 2.8089,
+       "eval_samples_per_second": 3.916,
+       "eval_steps_per_second": 0.712,
+       "step": 200
+     },
+     {
+       "epoch": 70.0,
+       "learning_rate": 6e-06,
+       "loss": 0.0003,
+       "step": 210
+     },
+     {
+       "epoch": 73.33,
+       "learning_rate": 6e-06,
+       "loss": 0.0003,
+       "step": 220
+     },
+     {
+       "epoch": 73.33,
+       "eval_loss": 4.47265625,
+       "eval_runtime": 2.8074,
+       "eval_samples_per_second": 3.918,
+       "eval_steps_per_second": 0.712,
+       "step": 220
+     },
+     {
+       "epoch": 76.67,
+       "learning_rate": 6e-06,
+       "loss": 0.0002,
+       "step": 230
+     },
+     {
+       "epoch": 80.0,
+       "learning_rate": 6e-06,
+       "loss": 0.0002,
+       "step": 240
+     },
+     {
+       "epoch": 80.0,
+       "eval_loss": 4.49609375,
+       "eval_runtime": 2.8069,
+       "eval_samples_per_second": 3.919,
+       "eval_steps_per_second": 0.713,
+       "step": 240
+     },
+     {
+       "epoch": 83.33,
+       "learning_rate": 6e-06,
+       "loss": 0.0002,
+       "step": 250
+     },
+     {
+       "epoch": 86.67,
+       "learning_rate": 6e-06,
+       "loss": 0.0002,
+       "step": 260
+     },
+     {
+       "epoch": 86.67,
+       "eval_loss": 4.51953125,
+       "eval_runtime": 2.8067,
+       "eval_samples_per_second": 3.919,
+       "eval_steps_per_second": 0.713,
+       "step": 260
+     },
+     {
+       "epoch": 90.0,
+       "learning_rate": 6e-06,
+       "loss": 0.0002,
+       "step": 270
+     },
+     {
+       "epoch": 93.33,
+       "learning_rate": 6e-06,
+       "loss": 0.0002,
+       "step": 280
+     },
+     {
+       "epoch": 93.33,
+       "eval_loss": 4.5390625,
+       "eval_runtime": 2.8074,
+       "eval_samples_per_second": 3.918,
+       "eval_steps_per_second": 0.712,
+       "step": 280
+     },
+     {
+       "epoch": 96.67,
+       "learning_rate": 6e-06,
+       "loss": 0.0002,
+       "step": 290
+     },
+     {
+       "epoch": 100.0,
+       "learning_rate": 6e-06,
+       "loss": 0.0003,
+       "step": 300
+     },
+     {
+       "epoch": 100.0,
+       "eval_loss": 4.5546875,
+       "eval_runtime": 2.8081,
+       "eval_samples_per_second": 3.917,
+       "eval_steps_per_second": 0.712,
+       "step": 300
+     },
+     {
+       "epoch": 100.0,
+       "step": 300,
+       "total_flos": 28875518115840.0,
+       "train_loss": 0.16138767729202907,
+       "train_runtime": 5518.2957,
+       "train_samples_per_second": 0.797,
+       "train_steps_per_second": 0.054
+     }
+   ],
+   "max_steps": 300,
+   "num_train_epochs": 100,
+   "total_flos": 28875518115840.0,
+   "trial_name": null,
+   "trial_params": null
+ }
training_args.bin ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:1e437e5065d5ecc3a58d100c2db2c4ab60408c0f8bf7a8837cff0a1ff7ef0598
+ size 4923
version ADDED
@@ -0,0 +1 @@
+ 1.1.2