emozilla committed on
Commit
dd36363
·
0 Parent(s):

initial commit

Browse files
.gitattributes ADDED
@@ -0,0 +1,35 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ *.7z filter=lfs diff=lfs merge=lfs -text
2
+ *.arrow filter=lfs diff=lfs merge=lfs -text
3
+ *.bin filter=lfs diff=lfs merge=lfs -text
4
+ *.bz2 filter=lfs diff=lfs merge=lfs -text
5
+ *.ckpt filter=lfs diff=lfs merge=lfs -text
6
+ *.ftz filter=lfs diff=lfs merge=lfs -text
7
+ *.gz filter=lfs diff=lfs merge=lfs -text
8
+ *.h5 filter=lfs diff=lfs merge=lfs -text
9
+ *.joblib filter=lfs diff=lfs merge=lfs -text
10
+ *.lfs.* filter=lfs diff=lfs merge=lfs -text
11
+ *.mlmodel filter=lfs diff=lfs merge=lfs -text
12
+ *.model filter=lfs diff=lfs merge=lfs -text
13
+ *.msgpack filter=lfs diff=lfs merge=lfs -text
14
+ *.npy filter=lfs diff=lfs merge=lfs -text
15
+ *.npz filter=lfs diff=lfs merge=lfs -text
16
+ *.onnx filter=lfs diff=lfs merge=lfs -text
17
+ *.ot filter=lfs diff=lfs merge=lfs -text
18
+ *.parquet filter=lfs diff=lfs merge=lfs -text
19
+ *.pb filter=lfs diff=lfs merge=lfs -text
20
+ *.pickle filter=lfs diff=lfs merge=lfs -text
21
+ *.pkl filter=lfs diff=lfs merge=lfs -text
22
+ *.pt filter=lfs diff=lfs merge=lfs -text
23
+ *.pth filter=lfs diff=lfs merge=lfs -text
24
+ *.rar filter=lfs diff=lfs merge=lfs -text
25
+ *.safetensors filter=lfs diff=lfs merge=lfs -text
26
+ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
27
+ *.tar.* filter=lfs diff=lfs merge=lfs -text
28
+ *.tar filter=lfs diff=lfs merge=lfs -text
29
+ *.tflite filter=lfs diff=lfs merge=lfs -text
30
+ *.tgz filter=lfs diff=lfs merge=lfs -text
31
+ *.wasm filter=lfs diff=lfs merge=lfs -text
32
+ *.xz filter=lfs diff=lfs merge=lfs -text
33
+ *.zip filter=lfs diff=lfs merge=lfs -text
34
+ *.zst filter=lfs diff=lfs merge=lfs -text
35
+ *tfevents* filter=lfs diff=lfs merge=lfs -text
README.md ADDED
@@ -0,0 +1,52 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ You can find the Llama-2 usage policy here: https://ai.meta.com/llama/use-policy/
2
+
3
+ Llama 2 Community License Agreement
4
+
5
+ Llama 2 Version Release Date: July 18, 2023
6
+
7
+ “Agreement” means the terms and conditions for use, reproduction, distribution and modification of the Llama Materials set forth herein.
8
+
9
+ “Documentation” means the specifications, manuals and documentation accompanying Llama 2 distributed by Meta at ai.meta.com/resources/models-and-libraries/llama-downloads/.
10
+
11
+ “Licensee” or “you” means you, or your employer or any other person or entity (if you are entering into this Agreement on such person or entity’s behalf), of the age required under applicable laws, rules or regulations to provide legal consent and that has legal authority to bind your employer or such other person or entity if you are entering in this Agreement on their behalf.
12
+
13
+ “Llama 2” means the foundational large language models and software and algorithms, including machine-learning model code, trained model weights, inference-enabling code, training-enabling code, fine-tuning enabling code and other elements of the foregoing distributed by Meta at ai.meta.com/resources/models-and-libraries/llama-downloads/.
14
+
15
+ “Llama Materials” means, collectively, Meta’s proprietary Llama 2 and Documentation (and any portion thereof) made available under this Agreement.
16
+
17
+ “Meta” or “we” means Meta Platforms Ireland Limited (if you are located in or, if you are an entity, your principal place of business is in the EEA or Switzerland) and Meta Platforms, Inc. (if you are located outside of the EEA or Switzerland).
18
+
19
+ By clicking “I Accept” below or by using or distributing any portion or element of the Llama Materials, you agree to be bound by this Agreement.
20
+
21
+ 1. License Rights and Redistribution.
22
+ a. Grant of Rights. You are granted a non-exclusive, worldwide, non-transferable and royalty-free limited license under Meta’s intellectual property or other rights owned by Meta embodied in the Llama Materials to use, reproduce, distribute, copy, create derivative works of, and make modifications to the Llama Materials.
23
+
24
+ b. Redistribution and Use.
25
+
26
+ i. If you distribute or make the Llama Materials, or any derivative works thereof, available to a third party, you shall provide a copy of this Agreement to such third party.
27
+
28
+ ii. If you receive Llama Materials, or any derivative works thereof, from a Licensee as part of an integrated end user product, then Section 2 of this Agreement will not apply to you.
29
+
30
+ iii. You must retain in all copies of the Llama Materials that you distribute the following attribution notice within a “Notice” text file distributed as a part of such copies: “Llama 2 is licensed under the LLAMA 2 Community License, Copyright © Meta Platforms, Inc. All Rights Reserved.”
31
+
32
+ iv. Your use of the Llama Materials must comply with applicable laws and regulations (including trade compliance laws and regulations) and adhere to the Acceptable Use Policy for the Llama Materials (available at https://ai.meta.com/llama/use-policy), which is hereby incorporated by reference into this Agreement.
33
+
34
+ v. You will not use the Llama Materials or any output or results of the Llama Materials to improve any other large language model (excluding Llama 2 or derivative works thereof).
35
+
36
+ 2. Additional Commercial Terms. If, on the Llama 2 version release date, the monthly active users of the products or services made available by or for Licensee, or Licensee’s affiliates, is greater than 700 million monthly active users in the preceding calendar month, you must request a license from Meta, which Meta may grant to you in its sole discretion, and you are not authorized to exercise any of the rights under this Agreement unless or until Meta otherwise expressly grants you such rights.
37
+
38
+ 3. Disclaimer of Warranty. UNLESS REQUIRED BY APPLICABLE LAW, THE LLAMA MATERIALS AND ANY OUTPUT AND RESULTS THEREFROM ARE PROVIDED ON AN “AS IS” BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED, INCLUDING, WITHOUT LIMITATION, ANY WARRANTIES OF TITLE, NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE. YOU ARE SOLELY RESPONSIBLE FOR DETERMINING THE APPROPRIATENESS OF USING OR REDISTRIBUTING THE LLAMA MATERIALS AND ASSUME ANY RISKS ASSOCIATED WITH YOUR USE OF THE LLAMA MATERIALS AND ANY OUTPUT AND RESULTS.
39
+
40
+ 4. Limitation of Liability. IN NO EVENT WILL META OR ITS AFFILIATES BE LIABLE UNDER ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, TORT, NEGLIGENCE, PRODUCTS LIABILITY, OR OTHERWISE, ARISING OUT OF THIS AGREEMENT, FOR ANY LOST PROFITS OR ANY INDIRECT, SPECIAL, CONSEQUENTIAL, INCIDENTAL, EXEMPLARY OR PUNITIVE DAMAGES, EVEN IF META OR ITS AFFILIATES HAVE BEEN ADVISED OF THE POSSIBILITY OF ANY OF THE FOREGOING.
41
+
42
+ 5. Intellectual Property.
43
+
44
+ a. No trademark licenses are granted under this Agreement, and in connection with the Llama Materials, neither Meta nor Licensee may use any name or mark owned by or associated with the other or any of its affiliates, except as required for reasonable and customary use in describing and redistributing the Llama Materials.
45
+
46
+ b. Subject to Meta’s ownership of Llama Materials and derivatives made by or for Meta, with respect to any derivative works and modifications of the Llama Materials that are made by you, as between you and Meta, you are and will be the owner of such derivative works and modifications.
47
+
48
+ c. If you institute litigation or other proceedings against Meta or any entity (including a cross-claim or counterclaim in a lawsuit) alleging that the Llama Materials or Llama 2 outputs or results, or any portion of any of the foregoing, constitutes infringement of intellectual property or other rights owned or licensable by you, then any licenses granted to you under this Agreement shall terminate as of the date such litigation or claim is filed or instituted. You will indemnify and hold harmless Meta from and against any claim by any third party arising out of or related to your use or distribution of the Llama Materials.
49
+
50
+ 6. Term and Termination. The term of this Agreement will commence upon your acceptance of this Agreement or access to the Llama Materials and will continue in full force and effect until terminated in accordance with the terms and conditions herein. Meta may terminate this Agreement if you are in breach of any term or condition of this Agreement. Upon termination of this Agreement, you shall delete and cease use of the Llama Materials. Sections 3, 4 and 7 shall survive the termination of this Agreement.
51
+
52
+ 7. Governing Law and Jurisdiction. This Agreement will be governed and construed under the laws of the State of California without regard to choice of law principles, and the UN Convention on Contracts for the International Sale of Goods does not apply to this Agreement. The courts of California shall have exclusive jurisdiction of any dispute arising out of this Agreement.
added_tokens.json ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ {
2
+ "<pad>": 32000
3
+ }
config.json ADDED
@@ -0,0 +1,32 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "_name_or_path": "NousResearch/Llama-2-13b-hf",
3
+ "architectures": [
4
+ "LlamaForCausalLM"
5
+ ],
6
+ "auto_map": {
7
+ "AutoModelForCausalLM": "modeling_flash_llama.LlamaForCausalLM"
8
+ },
9
+ "bos_token_id": 1,
10
+ "eos_token_id": 2,
11
+ "hidden_act": "silu",
12
+ "hidden_size": 5120,
13
+ "initializer_range": 0.02,
14
+ "intermediate_size": 13824,
15
+ "max_position_embeddings": 8192,
16
+ "model_type": "llama",
17
+ "num_attention_heads": 40,
18
+ "num_hidden_layers": 40,
19
+ "num_key_value_heads": 40,
20
+ "pad_token_id": 0,
21
+ "pretraining_tp": 2,
22
+ "rms_norm_eps": 1e-05,
23
+ "rope_scaling": {
24
+ "type": "linear",
25
+ "factor": 2.0
26
+ },
27
+ "tie_word_embeddings": false,
28
+ "torch_dtype": "bfloat16",
29
+ "transformers_version": "4.32.0.dev0",
30
+ "use_cache": true,
31
+ "vocab_size": 32000
32
+ }
generation_config.json ADDED
@@ -0,0 +1,9 @@
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "_from_model_config": true,
3
+ "bos_token_id": 1,
4
+ "eos_token_id": 2,
5
+ "pad_token_id": 0,
6
+ "temperature": 0.9,
7
+ "top_p": 0.6,
8
+ "transformers_version": "4.32.0.dev0"
9
+ }
modeling_flash_llama.py ADDED
@@ -0,0 +1,1011 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # coding=utf-8
2
+ # Copyright 2022 EleutherAI and the HuggingFace Inc. team. All rights reserved.
3
+ #
4
+ # This code is based on EleutherAI's GPT-NeoX library and the GPT-NeoX
5
+ # and OPT implementations in this library. It has been modified from its
6
+ # original forms to accommodate minor architectural differences compared
7
+ # to GPT-NeoX and OPT used by the Meta AI team that trained the model.
8
+ #
9
+ # Licensed under the Apache License, Version 2.0 (the "License");
10
+ # you may not use this file except in compliance with the License.
11
+ # You may obtain a copy of the License at
12
+ #
13
+ # http://www.apache.org/licenses/LICENSE-2.0
14
+ #
15
+ # Unless required by applicable law or agreed to in writing, software
16
+ # distributed under the License is distributed on an "AS IS" BASIS,
17
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
18
+ # See the License for the specific language governing permissions and
19
+ # limitations under the License.
20
+ """ PyTorch LLaMA model."""
21
+ import math
22
+ from typing import List, Optional, Tuple, Union
23
+
24
+ import torch
25
+ import torch.nn.functional as F
26
+ import torch.utils.checkpoint
27
+ from torch import nn
28
+ from torch.nn import BCEWithLogitsLoss, CrossEntropyLoss, MSELoss
29
+
30
+ from transformers.activations import ACT2FN
31
+ from transformers.modeling_outputs import BaseModelOutputWithPast, CausalLMOutputWithPast, SequenceClassifierOutputWithPast
32
+ from transformers.modeling_utils import PreTrainedModel
33
+ from transformers.utils import add_start_docstrings, add_start_docstrings_to_model_forward, logging, replace_return_docstrings
34
+ from transformers.models.llama.configuration_llama import LlamaConfig
35
+
36
+
37
+ try:
38
+ from flash_attn.flash_attn_interface import (
39
+ flash_attn_func,
40
+ flash_attn_kvpacked_func,
41
+ flash_attn_qkvpacked_func,
42
+ flash_attn_varlen_kvpacked_func,
43
+ )
44
+ from flash_attn.bert_padding import unpad_input, pad_input
45
+ flash_attn_v2_installed = True
46
+ print('>>>> Flash Attention installed')
47
+ except ImportError:
48
+ flash_attn_v2_installed = False
49
+ raise ImportError('Please install Flash Attention: `pip install flash-attn --no-build-isolation`')
50
+
51
+ try:
52
+ from flash_attn.layers.rotary import apply_rotary_emb_func
53
+ flash_rope_installed = True
54
+ print('>>>> Flash RoPE installed')
55
+ except ImportError:
56
+ flash_rope_installed = False
57
+ raise ImportError('Please install RoPE kernels: `pip install git+https://github.com/HazyResearch/flash-attention.git#subdirectory=csrc/rotary`')
58
+
59
+
60
+ logger = logging.get_logger(__name__)
61
+
62
+ _CONFIG_FOR_DOC = "LlamaConfig"
63
+
64
+
65
+ # @torch.jit.script
66
def rmsnorm_func(hidden_states, weight, variance_epsilon):
    """Apply RMS normalization along the last dimension.

    The input is upcast to float32, multiplied by the reciprocal square root
    of its mean square (plus ``variance_epsilon``) over the last axis, scaled
    by ``weight`` and finally cast back to the dtype it arrived in.

    Args:
        hidden_states: tensor to normalize.
        weight: scale multiplied onto the normalized values.
        variance_epsilon: value added to the mean square before ``rsqrt``.

    Returns:
        The normalized, scaled tensor in the dtype of ``hidden_states``.
    """
    original_dtype = hidden_states.dtype
    # Do the statistics in fp32 regardless of the incoming precision.
    upcast = hidden_states.to(torch.float32)
    mean_square = upcast.pow(2).mean(-1, keepdim=True)
    inv_rms = torch.rsqrt(mean_square + variance_epsilon)
    normalized = upcast * inv_rms
    return (weight * normalized).to(original_dtype)
72
+
73
+
74
class LlamaRMSNorm(nn.Module):
    """RMS normalization layer with a learnable per-feature scale."""

    def __init__(self, hidden_size, eps=1e-6):
        """
        LlamaRMSNorm is equivalent to T5LayerNorm

        Args:
            hidden_size: length of the learnable scale vector.
            eps: value handed to `rmsnorm_func` as `variance_epsilon`.
        """
        super().__init__()
        initial_scale = torch.ones(hidden_size)
        self.weight = nn.Parameter(initial_scale)
        # Epsilon lives in a non-persistent buffer: it follows the module
        # across .to(...) calls but is left out of the state dict.
        eps_tensor = torch.tensor(eps)
        self.register_buffer("variance_epsilon", eps_tensor, persistent=False)

    def forward(self, hidden_states):
        """Normalize `hidden_states` with this layer's weight and epsilon."""
        return rmsnorm_func(hidden_states, self.weight, self.variance_epsilon)
89
+
90
+
91
class FlashRotaryEmbedding(torch.nn.Module):
    """
    The rotary position embeddings from RoFormer_ (Su et al.).
    A crucial insight from the method is that the query and keys are
    transformed by rotation matrices which depend on the relative positions.

    Other implementations are available in the Rotary Transformer repo_ and in
    GPT-NeoX_, GPT-NeoX was an inspiration

    .. _RoFormer: https://arxiv.org/abs/2104.09864
    .. _repo: https://github.com/ZhuiyiTechnology/roformer
    .. _GPT-NeoX: https://github.com/EleutherAI/gpt-neox

    If scale_base is not None, this implements XPos (Sun et al., https://arxiv.org/abs/2212.10554).
    A recommended value for scale_base is 512: https://github.com/HazyResearch/flash-attention/issues/96
    Reference: https://github.com/sunyt32/torchscale/blob/main/torchscale/component/xpos_relative_position.py

    Note: the XPos cos/sin tables are computed when scale_base is given, but `forward`
    does not apply them and raises `NotImplementedError` in that configuration.
    """

    def __init__(self, dim: int, base=10000.0, interleaved=False, scale_base=None,
                 scaling_factor=1.0, pos_idx_in_fp32=True, device=None):
        """
        dim: rotary dimension; the inverse-frequency table has dim / 2 entries.
        base: base of the geometric progression used for the inverse frequencies.
        interleaved: if True, rotate pairs of even and odd dimensions (GPT-J style) instead
            of 1st half and 2nd half (GPT-NeoX style).
        scale_base: if not None, the XPos scale buffer is created (see class docstring).
        pos_idx_in_fp32: if True, the position indices [0.0, ..., seqlen - 1] are in fp32,
            otherwise they might be in lower precision.
            This option was added because previously (before 2023-07-02), when we construct
            the position indices, we use the dtype of self.inv_freq. In most cases this would
            be fp32, but if the model is trained in pure bf16 (not mixed precision), then
            self.inv_freq would be bf16, and the position indices are also in bf16.
            Because of the limited precision of bf16 (e.g. 1995.0 is rounded to 2000.0), the
            embeddings for some positions will coincide.
            To maintain compatibility with models previously trained in pure bf16,
            we add this option.
        scaling_factor: RotaryEmbedding extended with linear scaling; position indices are
            divided by this value before the angles are computed.
        device: device on which the buffers are created.
        """
        super().__init__()
        self.dim = dim
        self.base = float(base)
        self.pos_idx_in_fp32 = pos_idx_in_fp32
        # Generate and save the inverse frequency buffer (non trainable)
        inv_freq = self._compute_inv_freq(device)
        self.register_buffer("inv_freq", inv_freq, persistent=False)
        self.interleaved = interleaved
        self.scale_base = scale_base
        self.scaling_factor = scaling_factor
        scale = ((torch.arange(0, dim, 2, device=device, dtype=torch.float32) + 0.4 * dim)
                 / (1.4 * dim) if scale_base is not None else None)
        self.register_buffer("scale", scale)

        # Lazily built cos/sin tables; the *_k_* variants are only filled for XPos.
        self._seq_len_cached = 0
        self._cos_cached = None
        self._sin_cached = None
        self._cos_k_cached = None
        self._sin_k_cached = None

    def _compute_inv_freq(self, device=None):
        """Return 1 / base ** (i / dim) for i = 0, 2, ..., dim - 2 as an fp32 tensor."""
        return 1 / (self.base ** (torch.arange(0, self.dim, 2, device=device,
                                               dtype=torch.float32) / self.dim))

    def _update_cos_sin_cache(self, seqlen, device=None, dtype=None):
        """Rebuild the cached cos/sin tables when they cannot serve the request.

        seqlen: number of positions the tables must cover.
        device / dtype: device and dtype the cached tables must have.
        """
        # Reset the tables if nothing has been cached yet,
        # if the sequence length has changed,
        # if we're on a new device (possibly due to tracing for instance),
        # or if we're switching from inference mode to training.
        # The explicit None check keeps a first call with seqlen == 0 from
        # dereferencing the still-unset cache.
        if (self._cos_cached is None or seqlen > self._seq_len_cached
                or self._cos_cached.device != device
                or self._cos_cached.dtype != dtype
                or (self.training and self._cos_cached.is_inference())):
            self._seq_len_cached = seqlen
            # We want fp32 here, not self.inv_freq.dtype, since the model could be loaded in bf16
            # And the output of arange can be quite large, so bf16 would lose a lot of precision.
            # However, for compatibility reason, we add an option to use the dtype of self.inv_freq.
            if self.pos_idx_in_fp32:
                t = torch.arange(seqlen, device=device, dtype=torch.float32)
                t /= self.scaling_factor
                # We want fp32 here as well since inv_freq will be multiplied with t, and the output
                # will be large. Having it in bf16 will lose a lot of precision and cause the
                # cos & sin output to change significantly.
                # We want to recompute self.inv_freq if it was not loaded in fp32
                if self.inv_freq.dtype != torch.float32:
                    inv_freq = self.inv_freq.to(torch.float32)
                else:
                    inv_freq = self.inv_freq
            else:
                t = torch.arange(seqlen, device=device, dtype=self.inv_freq.dtype)
                t /= self.scaling_factor
                inv_freq = self.inv_freq
            # Don't do einsum, it converts fp32 to fp16 under AMP
            # freqs = torch.einsum("i,j->ij", t, self.inv_freq)
            freqs = torch.outer(t, inv_freq)
            if self.scale is None:
                self._cos_cached = torch.cos(freqs).to(dtype)
                self._sin_cached = torch.sin(freqs).to(dtype)
            else:
                power = ((torch.arange(seqlen, dtype=self.scale.dtype, device=self.scale.device)
                          - seqlen // 2) / self.scale_base)
                scale = self.scale.to(device=power.device) ** power.unsqueeze(-1)
                # We want the multiplication by scale to happen in fp32
                self._cos_cached = (torch.cos(freqs) * scale).to(dtype)
                self._sin_cached = (torch.sin(freqs) * scale).to(dtype)
                self._cos_k_cached = (torch.cos(freqs) / scale).to(dtype)
                self._sin_k_cached = (torch.sin(freqs) / scale).to(dtype)

    def forward(self, q: torch.Tensor, k: torch.Tensor, seqlen_offset: int = 0) -> Tuple[torch.Tensor, torch.Tensor]:
        """
        q: (batch, seqlen, nheads, headdim)
        k: (batch, seqlen, nheads, headdim)
        seqlen_offset: can be used in generation where the qkv being passed in is only the last
            token in the batch.

        Returns the rotated (q, k) pair; the kernel is invoked with inplace=True.
        Raises NotImplementedError when the module was built with scale_base (XPos).
        """
        self._update_cos_sin_cache(q.shape[1] + seqlen_offset, device=q.device, dtype=q.dtype)
        if self.scale is not None:
            # An explicit exception instead of `assert False`: asserts are stripped under
            # `python -O`, which would make this branch silently return None.
            raise NotImplementedError(
                "FlashRotaryEmbedding.forward does not support XPos (scale_base is not None)"
            )
        cos = self._cos_cached[seqlen_offset:]
        sin = self._sin_cached[seqlen_offset:]
        rotated_q = apply_rotary_emb_func(q, cos, sin, self.interleaved, True)  # inplace=True
        rotated_k = apply_rotary_emb_func(k, cos, sin, self.interleaved, True)  # inplace=True
        return rotated_q, rotated_k
211
+
212
class LlamaMLP(nn.Module):
    """Gated feed-forward block: down_proj(act_fn(gate_proj(x)) * up_proj(x))."""

    def __init__(self, config):
        """Create the three bias-free projections and look up the activation.

        Reads `hidden_size`, `intermediate_size` and `hidden_act` from `config`;
        `config.pretraining_tp` is read later, in `forward`.
        """
        super().__init__()
        self.config = config
        self.hidden_size = config.hidden_size
        self.intermediate_size = config.intermediate_size
        model_dim, inner_dim = self.hidden_size, self.intermediate_size
        self.gate_proj = nn.Linear(model_dim, inner_dim, bias=False)
        self.up_proj = nn.Linear(model_dim, inner_dim, bias=False)
        self.down_proj = nn.Linear(inner_dim, model_dim, bias=False)
        self.act_fn = ACT2FN[config.hidden_act]

    def forward(self, x):
        """Run the gated MLP on `x`.

        When `config.pretraining_tp > 1` the projection weights are split into
        that many shards and the result is assembled from per-shard linears;
        otherwise the three linear layers are applied directly.
        """
        num_shards = self.config.pretraining_tp
        if not num_shards > 1:
            # Dense path: one matmul per projection.
            return self.down_proj(self.act_fn(self.gate_proj(x)) * self.up_proj(x))

        shard_width = self.intermediate_size // num_shards
        gate_shards = self.gate_proj.weight.split(shard_width, dim=0)
        up_shards = self.up_proj.weight.split(shard_width, dim=0)
        down_shards = self.down_proj.weight.split(shard_width, dim=1)

        gate_out = torch.cat(
            [F.linear(x, gate_shards[idx]) for idx in range(num_shards)], dim=-1
        )
        up_out = torch.cat(
            [F.linear(x, up_shards[idx]) for idx in range(num_shards)], dim=-1
        )

        # Split the gated activations along dim 2 so each piece meets its
        # matching column shard of down_proj; the partial outputs are summed.
        gated_pieces = (self.act_fn(gate_out) * up_out).split(shard_width, dim=2)
        partial_outputs = [
            F.linear(gated_pieces[idx], down_shards[idx]) for idx in range(num_shards)
        ]
        return sum(partial_outputs)
244
+
245
@torch.jit.script
def repeat_kv(hidden_states: torch.Tensor, n_rep: int) -> torch.Tensor:
    """
    Repeat every key/value head `n_rep` times along the head axis of a 5-D tensor.

    The input is unpacked as (batch, seqlen, <dim 2>, num_key_value_heads, head_dim) and,
    for n_rep > 1, returned as (batch, seqlen, 2, num_key_value_heads * n_rep, head_dim),
    with the copies of one head adjacent to each other. Dimension 2 is expanded to the
    hard-coded size 2. When n_rep == 1 the input tensor itself is returned.
    """
    bsz, seq_len, _, kv_heads, dim = hidden_states.shape
    if n_rep != 1:
        # Add a repeat axis right after the head axis, broadcast it, then fold it
        # into the head axis.
        tiled = hidden_states.unsqueeze(4).expand(bsz, seq_len, 2, kv_heads, n_rep, dim)
        return tiled.reshape(bsz, seq_len, 2, kv_heads * n_rep, dim)
    return hidden_states
256
+
257
+
258
class LlamaAttention(nn.Module):
    """Multi-headed attention from 'Attention Is All You Need' paper.

    Queries and keys are rotated with `FlashRotaryEmbedding`, keys and values are packed
    into a single tensor, and attention is computed with the flash-attn kv-packed kernels.
    """

    def __init__(self, config: LlamaConfig):
        """Build the projections, the softmax normalisation buffer and the rotary embedding.

        Raises:
            ValueError: if `hidden_size` is not divisible by `num_attention_heads`, or if
                `config.rope_scaling` requests a type other than 'linear'.
        """
        super().__init__()
        self.config = config
        self.hidden_size = config.hidden_size
        self.num_heads = config.num_attention_heads
        self.head_dim = self.hidden_size // self.num_heads
        self.num_key_value_heads = config.num_key_value_heads
        self.num_key_value_groups = self.num_heads // self.num_key_value_heads
        self.max_position_embeddings = config.max_position_embeddings

        if (self.head_dim * self.num_heads) != self.hidden_size:
            raise ValueError(
                f"hidden_size must be divisible by num_heads (got `hidden_size`: {self.hidden_size}"
                f" and `num_heads`: {self.num_heads})."
            )
        self.q_proj = nn.Linear(self.hidden_size, self.num_heads * self.head_dim, bias=False)
        self.k_proj = nn.Linear(self.hidden_size, self.num_key_value_heads * self.head_dim, bias=False)
        self.v_proj = nn.Linear(self.hidden_size, self.num_key_value_heads * self.head_dim, bias=False)
        self.o_proj = nn.Linear(self.num_heads * self.head_dim, self.hidden_size, bias=False)

        # sqrt(head_dim); its reciprocal is passed to the kernels as softmax_scale.
        self.register_buffer(
            "norm_factor",
            torch.sqrt(torch.tensor(self.head_dim, dtype=torch.float32)).to(torch.get_default_dtype()),
            persistent=False,
        )

        if self.config.rope_scaling is None:
            scaling_factor = 1
        else:
            scaling_type = self.config.rope_scaling["type"]
            scaling_factor = self.config.rope_scaling["factor"]
            # Explicit check instead of `assert`: under `python -O` an assert is removed and
            # an unsupported scaling type would silently be treated as linear.
            if scaling_type != 'linear':
                raise ValueError(
                    f"Only 'linear' rope_scaling is supported (got `type`: {scaling_type})."
                )

        self.rotary_emb = FlashRotaryEmbedding(
            self.head_dim, base=10000, interleaved=False, scaling_factor=scaling_factor,
        )

    def _shape(self, tensor: torch.Tensor, seq_len: int, bsz: int):
        """View `tensor` as (bsz, seq_len, num_heads, head_dim) and move heads before seq_len."""
        return tensor.view(bsz, seq_len, self.num_heads, self.head_dim).transpose(1, 2).contiguous()

    def forward(
        self,
        hidden_states: torch.Tensor,
        attention_mask: Optional[torch.Tensor] = None,
        position_ids: Optional[torch.LongTensor] = None,
        past_key_value: Optional[Tuple[torch.Tensor]] = None,
        output_attentions: bool = False,
        use_cache: bool = False,
        is_padded_inputs: Optional[bool] = False,
    ) -> Tuple[torch.Tensor, Optional[torch.Tensor], Optional[Tuple[torch.Tensor]]]:
        """Compute self-attention over `hidden_states`.

        Args:
            hidden_states: tensor of size (bsz, q_len, h_size).
            attention_mask: required when `is_padded_inputs` is true; it is handed to
                `unpad_input` together with the packed keys/values.
            position_ids: accepted for interface compatibility; not used in this method.
            past_key_value: optional pair `(past_kv, past_len)` — a cache tensor indexed
                along dim 1 by position, and the number of positions already filled.
            output_attentions: forwarded to the kernels as `return_attn_probs`.
            use_cache: when true, the updated `(past_kv, past_len + q_len)` pair is returned.
            is_padded_inputs: selects the variable-length (unpadded) kernel path.

        Returns:
            `(attn_output, attn_weights, past_key_value)`; `attn_weights` is None unless
            `output_attentions` is set, `past_key_value` is None unless `use_cache` is set.
        """
        bsz, q_len, h_size = hidden_states.size()

        has_layer_past = past_key_value is not None

        if has_layer_past:
            past_kv = past_key_value[0]
            past_len = past_key_value[1]
        else:
            past_len = 0

        if self.config.pretraining_tp > 1:
            # Sharded projections: split each weight into pretraining_tp row blocks and
            # concatenate the per-block outputs.
            key_value_slicing = (self.num_key_value_heads * self.head_dim) // self.config.pretraining_tp
            query_slices = self.q_proj.weight.split(
                (self.num_heads * self.head_dim) // self.config.pretraining_tp, dim=0
            )
            key_slices = self.k_proj.weight.split(key_value_slicing, dim=0)
            value_slices = self.v_proj.weight.split(key_value_slicing, dim=0)

            q = [F.linear(hidden_states, query_slices[i]) for i in range(self.config.pretraining_tp)]
            q = torch.cat(q, dim=-1)

            k = [F.linear(hidden_states, key_slices[i]) for i in range(self.config.pretraining_tp)]
            k = torch.cat(k, dim=-1)

            v = [F.linear(hidden_states, value_slices[i]) for i in range(self.config.pretraining_tp)]
            v = torch.cat(v, dim=-1)

        else:
            q = self.q_proj(hidden_states)
            k = self.k_proj(hidden_states)
            v = self.v_proj(hidden_states)

        q = q.view(bsz, q_len, self.num_heads, self.head_dim)
        k = k.view(bsz, q_len, self.num_key_value_heads, self.head_dim)
        v = v.view(bsz, q_len, self.num_key_value_heads, self.head_dim)

        # past_len offsets the rotary tables so the new tokens get their absolute positions.
        q, k = self.rotary_emb(q, k, past_len)

        # Pack keys and values as (bsz, q_len, 2, heads, head_dim) and repeat the kv heads.
        kv = torch.stack([k, v], 2)
        kv = repeat_kv(kv, self.num_key_value_groups)

        # Cache QKV values
        if has_layer_past:
            new_len = past_len+q.size(1)
            if new_len > past_kv.size(1):
                # Grow by at least 256 positions, and by more when the incoming chunk needs
                # it; a fixed 256 made the slice assignment below fail for larger chunks.
                grow_by = max(256, new_len - past_kv.size(1))
                past_kv = torch.cat([past_kv, torch.empty(bsz, grow_by, 2, kv.size(3), kv.size(4), dtype=kv.dtype, device=kv.device)], 1)
            past_kv[:, past_len:new_len] = kv
            kv = past_kv[:, :new_len]
        else:
            past_kv = kv

        past_key_value = (past_kv, past_len+q.size(1)) if use_cache else None

        # NOTE(review): causal masking is switched off whenever a cache is present, so a
        # multi-token chunk fed on top of a cache attends non-causally — confirm callers
        # only pass one new token at a time in that case.
        if is_padded_inputs:

            # varlen, ignore padding tokens, efficient for large batch with many paddings

            assert attention_mask is not None

            unpadded_kv, indices_k, cu_seqlens_k, max_seqlen_k = unpad_input(kv, attention_mask)
            unpadded_q, indices_q, cu_seqlens_q, max_seqlen_q = unpad_input(q, attention_mask[:, -q.size(1):])
            attn_outputs = flash_attn_varlen_kvpacked_func(
                unpadded_q, unpadded_kv, cu_seqlens_q, cu_seqlens_k,
                max_seqlen_q, max_seqlen_k,
                dropout_p=0.0, softmax_scale=1.0/self.norm_factor,
                causal=(not has_layer_past), return_attn_probs=output_attentions
            )

            attn_output = attn_outputs[0] if output_attentions else attn_outputs
            attn_output = pad_input(
                attn_output, indices_q, bsz, q_len
            ).reshape(bsz, q_len, h_size)
            attn_weights = attn_outputs[2] if output_attentions else None

        else:

            # no padding tokens, more efficient

            attn_outputs = flash_attn_kvpacked_func(
                q, kv, dropout_p=0.0, softmax_scale=1.0/self.norm_factor, causal=(not has_layer_past), return_attn_probs=output_attentions)

            attn_output = attn_outputs[0] if output_attentions else attn_outputs
            attn_output = attn_output.reshape(bsz, q_len, h_size)
            attn_weights = attn_outputs[2] if output_attentions else None

        if self.config.pretraining_tp > 1:
            attn_output = attn_output.split(self.hidden_size // self.config.pretraining_tp, dim=2)
            o_proj_slices = self.o_proj.weight.split(self.hidden_size // self.config.pretraining_tp, dim=1)
            attn_output = sum([F.linear(attn_output[i], o_proj_slices[i]) for i in range(self.config.pretraining_tp)])
        else:
            attn_output = self.o_proj(attn_output)

        if not output_attentions:
            attn_weights = None

        return attn_output, attn_weights, past_key_value
407
+
408
+
409
class LlamaDecoderLayer(nn.Module):
    """One decoder block: pre-norm self-attention and pre-norm MLP, each with a residual add."""

    def __init__(self, config: LlamaConfig):
        """Create the attention module, the MLP and the two RMS norms from `config`."""
        super().__init__()
        self.hidden_size = config.hidden_size
        self.self_attn = LlamaAttention(config=config)
        self.mlp = LlamaMLP(config)
        norm_eps = config.rms_norm_eps
        self.input_layernorm = LlamaRMSNorm(config.hidden_size, eps=norm_eps)
        self.post_attention_layernorm = LlamaRMSNorm(config.hidden_size, eps=norm_eps)

    def forward(
        self,
        hidden_states: torch.Tensor,
        attention_mask: Optional[torch.Tensor] = None,
        position_ids: Optional[torch.LongTensor] = None,
        past_key_value: Optional[Tuple[torch.Tensor]] = None,
        is_padded_inputs: Optional[bool] = False,
        output_attentions: Optional[bool] = False,
        use_cache: Optional[bool] = False,
    ) -> Tuple[torch.FloatTensor, Optional[Tuple[torch.FloatTensor, torch.FloatTensor]]]:
        """
        Args:
            hidden_states (`torch.FloatTensor`): input to the layer of shape `(batch, seq_len, embed_dim)`
            attention_mask (`torch.Tensor`, *optional*): passed unchanged to the attention module.
            position_ids (`torch.LongTensor`, *optional*): passed unchanged to the attention module.
            past_key_value (*optional*): cached state from a previous step, passed unchanged to the
                attention module.
            is_padded_inputs (`bool`, *optional*): passed unchanged to the attention module.
            output_attentions (`bool`, *optional*):
                If set, the attention weights returned by the attention module are appended to the output.
            use_cache (`bool`, *optional*):
                If set, the cache entry returned by the attention module is appended to the output.

        Returns:
            A tuple starting with the layer output, followed by the attention weights
            (if `output_attentions`) and then the cache entry (if `use_cache`).
        """
        # Attention sub-block: normalise, attend, add the skip connection.
        normed_input = self.input_layernorm(hidden_states)
        attn_result, attn_probs, cache_entry = self.self_attn(
            hidden_states=normed_input,
            attention_mask=attention_mask,
            position_ids=position_ids,
            past_key_value=past_key_value,
            output_attentions=output_attentions,
            use_cache=use_cache,
            is_padded_inputs=is_padded_inputs,
        )
        after_attention = hidden_states + attn_result

        # Feed-forward sub-block: normalise, project, add the skip connection.
        mlp_result = self.mlp(self.post_attention_layernorm(after_attention))
        layer_output = after_attention + mlp_result

        optional_outputs = []
        if output_attentions:
            optional_outputs.append(attn_probs)
        if use_cache:
            optional_outputs.append(cache_entry)

        return (layer_output, *optional_outputs)
473
+
474
+
475
# Shared introduction prepended to the class docstrings of the Llama model classes via `add_start_docstrings`.
LLAMA_START_DOCSTRING = r"""
    This model inherits from [`PreTrainedModel`]. Check the superclass documentation for the generic methods the
    library implements for all its models (such as downloading or saving, resizing the input embeddings, pruning heads
    etc.)

    This model is also a PyTorch [torch.nn.Module](https://pytorch.org/docs/stable/nn.html#torch.nn.Module) subclass.
    Use it as a regular PyTorch Module and refer to the PyTorch documentation for all matters related to general usage
    and behavior.

    Parameters:
        config ([`LlamaConfig`]):
            Model configuration class with all the parameters of the model. Initializing with a config file does not
            load the weights associated with the model, only the configuration. Check out the
            [`~PreTrainedModel.from_pretrained`] method to load the model weights.
"""
490
+
491
+
492
@add_start_docstrings(
    "The bare LLaMA Model outputting raw hidden-states without any specific head on top.",
    LLAMA_START_DOCSTRING,
)
class LlamaPreTrainedModel(PreTrainedModel):
    # Class-level settings read by the `PreTrainedModel` machinery.
    config_class = LlamaConfig
    base_model_prefix = "model"
    supports_gradient_checkpointing = True
    _no_split_modules = ["LlamaDecoderLayer"]
    _skip_keys_device_placement = "past_key_values"

    def _init_weights(self, module):
        """Initialise `nn.Linear` / `nn.Embedding` weights from N(0, `config.initializer_range`)."""
        init_std = self.config.initializer_range
        if isinstance(module, nn.Linear):
            module.weight.data.normal_(mean=0.0, std=init_std)
            bias = module.bias
            if bias is not None:
                bias.data.zero_()
            return
        if isinstance(module, nn.Embedding):
            module.weight.data.normal_(mean=0.0, std=init_std)
            pad_row = module.padding_idx
            if pad_row is not None:
                # Zero the embedding row that belongs to the padding index.
                module.weight.data[pad_row].zero_()

    def _set_gradient_checkpointing(self, module, value=False):
        """Set `gradient_checkpointing` on `module` when it is a `LlamaModel`; other modules are left untouched."""
        if not isinstance(module, LlamaModel):
            return
        module.gradient_checkpointing = value
517
+
518
+
519
# Shared argument documentation attached to the `forward` methods via `add_start_docstrings_to_model_forward`.
LLAMA_INPUTS_DOCSTRING = r"""
    Args:
        input_ids (`torch.LongTensor` of shape `(batch_size, sequence_length)`):
            Indices of input sequence tokens in the vocabulary. Padding will be ignored by default should you provide
            it.

            Indices can be obtained using [`AutoTokenizer`]. See [`PreTrainedTokenizer.encode`] and
            [`PreTrainedTokenizer.__call__`] for details.

            If `past_key_values` is used, optionally only the last `input_ids` have to be input (see
            `past_key_values`).

            [What are input IDs?](../glossary#input-ids)
        attention_mask (`torch.Tensor` of shape `(batch_size, sequence_length)`, *optional*):
            Mask to avoid performing attention on padding token indices. Mask values selected in `[0, 1]`:

            - 1 for tokens that are **not masked**,
            - 0 for tokens that are **masked**.

            [What are attention masks?](../glossary#attention-mask)
        position_ids (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
            Indices of positions of each input sequence token.

            Note: `LlamaModel.forward` in this file resets `position_ids` to `None` before running the decoder
            layers, so values supplied here are currently not used.

            [What are position IDs?](../glossary#position-ids)
        past_key_values (`tuple(tuple(torch.FloatTensor))`, *optional*, returned when `use_cache=True` is passed or when `config.use_cache=True`):
            Tuple of `tuple(torch.FloatTensor)` of length `config.num_hidden_layers`, with each tuple having 2 tensors
            of shape `(batch_size, num_heads, sequence_length, embed_size_per_head)`.

            Contains pre-computed hidden-states (key and values in the self-attention blocks) that can be used (see
            `past_key_values` input) to speed up sequential decoding.

            If `past_key_values` are used, the user can optionally input only the last `input_ids` (those that don't
            have their past key value states given to this model) of shape `(batch_size, 1)` instead of all
            `input_ids` of shape `(batch_size, sequence_length)`.
        inputs_embeds (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_size)`, *optional*):
            Optionally, instead of passing `input_ids` you can choose to directly pass an embedded representation. This
            is useful if you want more control over how to convert `input_ids` indices into associated vectors than the
            model's internal embedding lookup matrix.
        use_cache (`bool`, *optional*):
            If set to `True`, `past_key_values` key value states are returned and can be used to speed up decoding (see
            `past_key_values`).
        output_attentions (`bool`, *optional*):
            Whether or not to return the attentions tensors of all attention layers. See `attentions` under returned
            tensors for more detail.
        output_hidden_states (`bool`, *optional*):
            Whether or not to return the hidden states of all layers. See `hidden_states` under returned tensors for
            more detail.
        return_dict (`bool`, *optional*):
            Whether or not to return a [`~utils.ModelOutput`] instead of a plain tuple.
        is_padded_inputs (`bool`, *optional*):
            Accepted by `LlamaModel.forward` and `LlamaForCausalLM.forward`. Flag telling the decoder layers whether
            `attention_mask` marks at least one position as masked; it is forwarded unchanged to every decoder layer.
            `LlamaForCausalLM` derives it from `attention_mask`.
"""
581
+
582
+
583
@add_start_docstrings(
    "The bare LLaMA Model outputting raw hidden-states without any specific head on top.",
    LLAMA_START_DOCSTRING,
)
class LlamaModel(LlamaPreTrainedModel):
    """
    Token embedding, followed by *config.num_hidden_layers* [`LlamaDecoderLayer`] blocks and a final RMS norm.

    Args:
        config: LlamaConfig
    """

    def __init__(self, config: LlamaConfig):
        super().__init__(config)
        self.padding_idx = config.pad_token_id
        self.vocab_size = config.vocab_size

        self.embed_tokens = nn.Embedding(config.vocab_size, config.hidden_size, self.padding_idx)
        self.layers = nn.ModuleList([LlamaDecoderLayer(config) for _ in range(config.num_hidden_layers)])
        self.norm = LlamaRMSNorm(config.hidden_size, eps=config.rms_norm_eps)

        self.gradient_checkpointing = False
        # Initialize weights and apply final processing
        self.post_init()

    def get_input_embeddings(self):
        """Return the token embedding module."""
        return self.embed_tokens

    def set_input_embeddings(self, value):
        """Replace the token embedding module with `value`."""
        self.embed_tokens = value

    @add_start_docstrings_to_model_forward(LLAMA_INPUTS_DOCSTRING)
    def forward(
        self,
        input_ids: torch.LongTensor = None,
        attention_mask: Optional[torch.Tensor] = None,
        position_ids: Optional[torch.LongTensor] = None,
        past_key_values: Optional[List[torch.FloatTensor]] = None,
        inputs_embeds: Optional[torch.FloatTensor] = None,
        use_cache: Optional[bool] = None,
        output_attentions: Optional[bool] = None,
        output_hidden_states: Optional[bool] = None,
        return_dict: Optional[bool] = None,
        is_padded_inputs: Optional[bool] = False,
    ) -> Union[Tuple, BaseModelOutputWithPast]:
        # Unset flags fall back to the values stored on the config.
        cfg = self.config
        if output_attentions is None:
            output_attentions = cfg.output_attentions
        if output_hidden_states is None:
            output_hidden_states = cfg.output_hidden_states
        if use_cache is None:
            use_cache = cfg.use_cache
        if return_dict is None:
            return_dict = cfg.use_return_dict

        # Exactly one of `input_ids` / `inputs_embeds` must be given.
        if input_ids is not None and inputs_embeds is not None:
            raise ValueError("You cannot specify both decoder_input_ids and decoder_inputs_embeds at the same time")
        if input_ids is None and inputs_embeds is None:
            raise ValueError("You have to specify either decoder_input_ids or decoder_inputs_embeds")
        if input_ids is not None:
            batch_size, seq_length = input_ids.shape
        else:
            batch_size, seq_length, _ = inputs_embeds.shape

        # The length bookkeeping below is not consumed later in this method; it is kept so that malformed
        # inputs or caches fail at the same point as before.
        past_key_values_length = 0 if past_key_values is None else past_key_values[0][0].shape[2]
        seq_length_with_past = seq_length + past_key_values_length

        # Caller-supplied position ids are discarded: every layer receives `position_ids=None`.
        position_ids = None

        if inputs_embeds is None:
            inputs_embeds = self.embed_tokens(input_ids)
        hidden_states = inputs_embeds

        checkpointing = self.gradient_checkpointing and self.training
        if checkpointing and use_cache:
            logger.warning_once(
                "`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`..."
            )
            use_cache = False

        def create_custom_forward(module):
            # Wraps a layer so the checkpoint API can call it positionally; the two trailing arguments
            # land on the layer's `output_attentions` and `use_cache` parameters.
            def custom_forward(*inputs):
                # None for past_key_value
                return module(*inputs, output_attentions, None)

            return custom_forward

        # decoder layers
        hidden_trace = [] if output_hidden_states else None
        attn_trace = [] if output_attentions else None
        cache_trace = [] if use_cache else None
        # Index of the key/value entry inside a layer's output tuple.
        cache_slot = 2 if output_attentions else 1

        for idx, decoder_layer in enumerate(self.layers):
            if output_hidden_states:
                hidden_trace.append(hidden_states)

            past_key_value = None if past_key_values is None else past_key_values[idx]

            if checkpointing:
                # Positional order: hidden_states, attention_mask, position_ids, past_key_value, is_padded_inputs.
                layer_outputs = torch.utils.checkpoint.checkpoint(
                    create_custom_forward(decoder_layer),
                    hidden_states,
                    attention_mask,
                    position_ids,
                    None,
                    is_padded_inputs,
                )
            else:
                layer_outputs = decoder_layer(
                    hidden_states,
                    attention_mask=attention_mask,
                    position_ids=position_ids,
                    past_key_value=past_key_value,
                    output_attentions=output_attentions,
                    use_cache=use_cache,
                    is_padded_inputs=is_padded_inputs,
                )

            hidden_states = layer_outputs[0]

            if use_cache:
                cache_trace.append(layer_outputs[cache_slot])

            if output_attentions:
                attn_trace.append(layer_outputs[1])

        hidden_states = self.norm(hidden_states)

        # add hidden states from the last decoder layer
        if output_hidden_states:
            hidden_trace.append(hidden_states)

        all_hidden_states = tuple(hidden_trace) if output_hidden_states else None
        all_self_attns = tuple(attn_trace) if output_attentions else None
        next_cache = tuple(cache_trace) if use_cache else None

        if return_dict:
            return BaseModelOutputWithPast(
                last_hidden_state=hidden_states,
                past_key_values=next_cache,
                hidden_states=all_hidden_states,
                attentions=all_self_attns,
            )
        # Tuple output keeps only the entries that were actually produced.
        return tuple(v for v in [hidden_states, next_cache, all_hidden_states, all_self_attns] if v is not None)
729
+
730
+
731
class LlamaForCausalLM(LlamaPreTrainedModel):
    """`LlamaModel` with a bias-free linear head that maps hidden states to vocabulary logits."""

    _tied_weights_keys = ["lm_head.weight"]

    def __init__(self, config):
        super().__init__(config)
        self.model = LlamaModel(config)
        self.vocab_size = config.vocab_size
        self.lm_head = nn.Linear(config.hidden_size, config.vocab_size, bias=False)

        # Initialize weights and apply final processing
        self.post_init()

    def get_input_embeddings(self):
        """Return the decoder's token embedding module."""
        return self.model.embed_tokens

    def set_input_embeddings(self, value):
        """Replace the decoder's token embedding module with `value`."""
        self.model.embed_tokens = value

    def get_output_embeddings(self):
        """Return the language-modeling head."""
        return self.lm_head

    def set_output_embeddings(self, new_embeddings):
        """Replace the language-modeling head with `new_embeddings`."""
        self.lm_head = new_embeddings

    def set_decoder(self, decoder):
        """Replace the wrapped decoder model."""
        self.model = decoder

    def get_decoder(self):
        """Return the wrapped decoder model."""
        return self.model

    @staticmethod
    def _mask_has_padding(attention_mask) -> bool:
        """Return True when `attention_mask` is given and contains at least one zero/False entry."""
        return (attention_mask is not None) and (not attention_mask.all().item())

    @add_start_docstrings_to_model_forward(LLAMA_INPUTS_DOCSTRING)
    @replace_return_docstrings(output_type=CausalLMOutputWithPast, config_class=_CONFIG_FOR_DOC)
    def forward(
        self,
        input_ids: torch.LongTensor = None,
        attention_mask: Optional[torch.Tensor] = None,
        position_ids: Optional[torch.LongTensor] = None,
        past_key_values: Optional[List[torch.FloatTensor]] = None,
        inputs_embeds: Optional[torch.FloatTensor] = None,
        labels: Optional[torch.LongTensor] = None,
        use_cache: Optional[bool] = None,
        output_attentions: Optional[bool] = None,
        output_hidden_states: Optional[bool] = None,
        return_dict: Optional[bool] = None,
        is_padded_inputs: Optional[bool] = None,
    ) -> Union[Tuple, CausalLMOutputWithPast]:
        r"""
        Args:
            labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
                Labels for computing the causal language modeling loss. Indices should either be in `[0, ...,
                config.vocab_size - 1]` or -100 (see `input_ids` docstring). Tokens with indices set to `-100` are
                ignored (masked), the loss is only computed for the tokens with labels in `[0, ...,
                config.vocab_size - 1]`.
            is_padded_inputs (`bool`, *optional*):
                Whether `attention_mask` contains masked positions. When left as `None` it is derived from
                `attention_mask`; an explicit value is passed through to the decoder unchanged.

        Returns:

        Example:

        ```python
        >>> from transformers import AutoTokenizer, LlamaForCausalLM

        >>> model = LlamaForCausalLM.from_pretrained(PATH_TO_CONVERTED_WEIGHTS)
        >>> tokenizer = AutoTokenizer.from_pretrained(PATH_TO_CONVERTED_TOKENIZER)

        >>> prompt = "Hey, are you conscious? Can you talk to me?"
        >>> inputs = tokenizer(prompt, return_tensors="pt")

        >>> # Generate
        >>> generate_ids = model.generate(inputs.input_ids, max_length=30)
        >>> tokenizer.batch_decode(generate_ids, skip_special_tokens=True, clean_up_tokenization_spaces=False)[0]
        "Hey, are you conscious? Can you talk to me?\nI'm not conscious, but I can talk to you."
        ```"""

        output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
        output_hidden_states = (
            output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
        )
        return_dict = return_dict if return_dict is not None else self.config.use_return_dict

        # Honour an explicit flag (e.g. the one `prepare_inputs_for_generation` already computed) instead of
        # silently overwriting it; only derive it from the mask when the caller left it unset.
        if is_padded_inputs is None:
            is_padded_inputs = self._mask_has_padding(attention_mask)

        # decoder outputs consists of (dec_features, layer_state, dec_hidden, dec_attn)
        outputs = self.model(
            input_ids=input_ids,
            attention_mask=attention_mask,
            position_ids=position_ids,
            past_key_values=past_key_values,
            inputs_embeds=inputs_embeds,
            use_cache=use_cache,
            output_attentions=output_attentions,
            output_hidden_states=output_hidden_states,
            return_dict=return_dict,
            is_padded_inputs=is_padded_inputs,
        )

        hidden_states = outputs[0]
        if self.config.pretraining_tp > 1:
            # Project with `pretraining_tp` row-slices of the head weight and concatenate along the vocab axis.
            lm_head_slices = self.lm_head.weight.split(self.vocab_size // self.config.pretraining_tp, dim=0)
            logits = [F.linear(hidden_states, lm_head_slices[i]) for i in range(self.config.pretraining_tp)]
            logits = torch.cat(logits, dim=-1)
        else:
            logits = self.lm_head(hidden_states)
        logits = logits.float()

        loss = None
        if labels is not None:
            # Shift so that tokens < n predict n
            shift_logits = logits[..., :-1, :].contiguous()
            shift_labels = labels[..., 1:].contiguous()
            # Flatten the tokens
            loss_fct = CrossEntropyLoss()
            shift_logits = shift_logits.view(-1, self.config.vocab_size)
            shift_labels = shift_labels.view(-1)
            # Enable model parallelism
            shift_labels = shift_labels.to(shift_logits.device)
            loss = loss_fct(shift_logits, shift_labels)

        if not return_dict:
            output = (logits,) + outputs[1:]
            return (loss,) + output if loss is not None else output

        return CausalLMOutputWithPast(
            loss=loss,
            logits=logits,
            past_key_values=outputs.past_key_values,
            hidden_states=outputs.hidden_states,
            attentions=outputs.attentions,
        )

    def prepare_inputs_for_generation(
        self, input_ids, past_key_values=None, attention_mask=None, inputs_embeds=None, **kwargs
    ):
        """
        Build the keyword arguments for one `forward` call during generation.

        When a cache is present only the last token of `input_ids` is kept. `inputs_embeds` is used only when no
        cache has been passed (i.e. on the first step); afterwards `input_ids` is used.
        """
        if past_key_values:
            input_ids = input_ids[:, -1:]

        position_ids = kwargs.get("position_ids", None)

        # if `inputs_embeds` are passed, we only want to use them in the 1st generation step
        if inputs_embeds is not None and past_key_values is None:
            model_inputs = {"inputs_embeds": inputs_embeds}
        else:
            model_inputs = {"input_ids": input_ids}

        model_inputs.update(
            {
                "position_ids": position_ids,
                "past_key_values": past_key_values,
                "use_cache": kwargs.get("use_cache"),
                "attention_mask": attention_mask,
                "is_padded_inputs": self._mask_has_padding(attention_mask),
            }
        )
        return model_inputs

    @staticmethod
    def _reorder_cache(past_key_values, beam_idx):
        """Re-index every cached tensor along dim 0 with `beam_idx`, layer by layer."""
        reordered_past = ()
        for layer_past in past_key_values:
            reordered_past += (
                tuple(past_state.index_select(0, beam_idx.to(past_state.device)) for past_state in layer_past),
            )
        return reordered_past
892
+
893
+
894
@add_start_docstrings(
    """
    The LLaMa Model transformer with a sequence classification head on top (linear layer).

    [`LlamaForSequenceClassification`] uses the last token in order to do the classification, as other causal models
    (e.g. GPT-2) do.

    Since it does classification on the last token, it requires to know the position of the last token. If a
    `pad_token_id` is defined in the configuration, it finds the last token that is not a padding token in each row. If
    no `pad_token_id` is defined, it simply takes the last value in each row of the batch. Since it cannot guess the
    padding tokens when `inputs_embeds` are passed instead of `input_ids`, it does the same (take the last value in
    each row of the batch).
    """,
    LLAMA_START_DOCSTRING,
)
class LlamaForSequenceClassification(LlamaPreTrainedModel):
    def __init__(self, config):
        super().__init__(config)
        self.num_labels = config.num_labels
        self.model = LlamaModel(config)
        self.score = nn.Linear(config.hidden_size, self.num_labels, bias=False)

        # Initialize weights and apply final processing
        self.post_init()

    def get_input_embeddings(self):
        """Return the decoder's token embedding module."""
        return self.model.embed_tokens

    def set_input_embeddings(self, value):
        """Replace the decoder's token embedding module with `value`."""
        self.model.embed_tokens = value

    @add_start_docstrings_to_model_forward(LLAMA_INPUTS_DOCSTRING)
    def forward(
        self,
        input_ids: torch.LongTensor = None,
        attention_mask: Optional[torch.Tensor] = None,
        position_ids: Optional[torch.LongTensor] = None,
        past_key_values: Optional[List[torch.FloatTensor]] = None,
        inputs_embeds: Optional[torch.FloatTensor] = None,
        labels: Optional[torch.LongTensor] = None,
        use_cache: Optional[bool] = None,
        output_attentions: Optional[bool] = None,
        output_hidden_states: Optional[bool] = None,
        return_dict: Optional[bool] = None,
    ) -> Union[Tuple, SequenceClassifierOutputWithPast]:
        r"""
        labels (`torch.LongTensor` of shape `(batch_size,)`, *optional*):
            Labels for computing the sequence classification/regression loss. Indices should be in `[0, ...,
            config.num_labels - 1]`. If `config.num_labels == 1` a regression loss is computed (Mean-Square loss), If
            `config.num_labels > 1` a classification loss is computed (Cross-Entropy).
        """
        return_dict = return_dict if return_dict is not None else self.config.use_return_dict

        # Same derivation as `LlamaForCausalLM.forward`: tell the decoder whether the mask hides any position.
        # Without this the decoder always ran with its default `is_padded_inputs=False`, even for padded batches.
        is_padded_inputs = (attention_mask is not None) and (not attention_mask.all().item())

        transformer_outputs = self.model(
            input_ids,
            attention_mask=attention_mask,
            position_ids=position_ids,
            past_key_values=past_key_values,
            inputs_embeds=inputs_embeds,
            use_cache=use_cache,
            output_attentions=output_attentions,
            output_hidden_states=output_hidden_states,
            return_dict=return_dict,
            is_padded_inputs=is_padded_inputs,
        )
        hidden_states = transformer_outputs[0]
        logits = self.score(hidden_states)

        if input_ids is not None:
            batch_size = input_ids.shape[0]
        else:
            batch_size = inputs_embeds.shape[0]

        if self.config.pad_token_id is None and batch_size != 1:
            raise ValueError("Cannot handle batch sizes > 1 if no padding token is defined.")
        if self.config.pad_token_id is None:
            sequence_lengths = -1
        else:
            if input_ids is not None:
                # Index of the pooled token per row = (number of non-pad tokens) - 1.
                # NOTE(review): this equals the last real token only when padding is on the right and the pad id
                # never occurs inside the content — confirm against the tokenizer setup.
                sequence_lengths = (torch.ne(input_ids, self.config.pad_token_id).sum(-1) - 1).to(logits.device)
            else:
                # Padding cannot be detected from embeddings; fall back to the last position.
                sequence_lengths = -1

        pooled_logits = logits[torch.arange(batch_size, device=logits.device), sequence_lengths]

        loss = None
        if labels is not None:
            labels = labels.to(logits.device)
            if self.config.problem_type is None:
                # Infer the problem type once from the label count/dtype and store it on the config.
                if self.num_labels == 1:
                    self.config.problem_type = "regression"
                elif self.num_labels > 1 and (labels.dtype == torch.long or labels.dtype == torch.int):
                    self.config.problem_type = "single_label_classification"
                else:
                    self.config.problem_type = "multi_label_classification"

            if self.config.problem_type == "regression":
                loss_fct = MSELoss()
                if self.num_labels == 1:
                    loss = loss_fct(pooled_logits.squeeze(), labels.squeeze())
                else:
                    loss = loss_fct(pooled_logits, labels)
            elif self.config.problem_type == "single_label_classification":
                loss_fct = CrossEntropyLoss()
                loss = loss_fct(pooled_logits.view(-1, self.num_labels), labels.view(-1))
            elif self.config.problem_type == "multi_label_classification":
                loss_fct = BCEWithLogitsLoss()
                loss = loss_fct(pooled_logits, labels)
        if not return_dict:
            output = (pooled_logits,) + transformer_outputs[1:]
            return ((loss,) + output) if loss is not None else output

        return SequenceClassifierOutputWithPast(
            loss=loss,
            logits=pooled_logits,
            past_key_values=transformer_outputs.past_key_values,
            hidden_states=transformer_outputs.hidden_states,
            attentions=transformer_outputs.attentions,
        )
pytorch_model.bin ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:364f5798ebae82d53615fc02bf2b30b3ca3910a8cbd801a166b0effda5225a41
3
+ size 26031797876
special_tokens_map.json ADDED
@@ -0,0 +1,24 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "bos_token": {
3
+ "content": "<s>",
4
+ "lstrip": false,
5
+ "normalized": true,
6
+ "rstrip": false,
7
+ "single_word": false
8
+ },
9
+ "eos_token": {
10
+ "content": "</s>",
11
+ "lstrip": false,
12
+ "normalized": true,
13
+ "rstrip": false,
14
+ "single_word": false
15
+ },
16
+ "pad_token": "<unk>",
17
+ "unk_token": {
18
+ "content": "<unk>",
19
+ "lstrip": false,
20
+ "normalized": true,
21
+ "rstrip": false,
22
+ "single_word": false
23
+ }
24
+ }
tokenizer.model ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:9e556afd44213b6bd1be2b850ebbbd98f5481437a8021afaf58ee7fb1818d347
3
+ size 499723
tokenizer_config.json ADDED
@@ -0,0 +1,34 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "add_bos_token": true,
3
+ "add_eos_token": false,
4
+ "bos_token": {
5
+ "__type": "AddedToken",
6
+ "content": "<s>",
7
+ "lstrip": false,
8
+ "normalized": true,
9
+ "rstrip": false,
10
+ "single_word": false
11
+ },
12
+ "clean_up_tokenization_spaces": false,
13
+ "eos_token": {
14
+ "__type": "AddedToken",
15
+ "content": "</s>",
16
+ "lstrip": false,
17
+ "normalized": true,
18
+ "rstrip": false,
19
+ "single_word": false
20
+ },
21
+ "legacy": false,
22
+ "model_max_length": 8192,
23
+ "pad_token": null,
24
+ "sp_model_kwargs": {},
25
+ "tokenizer_class": "LlamaTokenizer",
26
+ "unk_token": {
27
+ "__type": "AddedToken",
28
+ "content": "<unk>",
29
+ "lstrip": false,
30
+ "normalized": true,
31
+ "rstrip": false,
32
+ "single_word": false
33
+ }
34
+ }