kaiokendev committed
Commit · 897d35b
1 Parent(s): 3f87968

7b model

Browse files
- README.md +39 -0
- adapter_config.json +19 -0
- adapter_model.bin +3 -0
- llama_rope_scaled_monkey_patch.py +64 -0
README.md
CHANGED
@@ -1,3 +1,42 @@
---
license: mit
---

### SuperHOT Prototype 2 w/ 8K Context

This is a second prototype of SuperHOT, this time 7B with 8K context and no RLHF, using the same technique described in [the github blog](https://kaiokendev.github.io/til#extending-context-to-8k).

#### Looking for Merged & Quantized Models?
Make some please :)

#### Using the monkey-patch?
You will **NEED** to **apply the monkeypatch** or, if you are already using the monkeypatch, **change the scaling factor to 0.25 and the maximum sequence length to 8192**.

The monkeypatch is only necessary if you are using a front-end/back-end that does not already support scaling, and that front-end/back-end is Python-based (i.e. Huggingface Transformers). To apply the patch, copy `llama_rope_scaled_monkey_patch.py` into your working directory and call the exported function `replace_llama_rope_with_scaled_rope` at the very start of your Python program, before any model is loaded. It will modify the Transformers library's implementation of RoPE to properly apply the scaling factor.
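
For example, a minimal sketch (the base model path, adapter path, and the PEFT loading step are illustrative assumptions, not instructions from this repo):

```python
# Apply the RoPE scaling patch BEFORE any LLaMA model is constructed,
# so that Transformers picks up the scaled rotary embedding class.
from llama_rope_scaled_monkey_patch import replace_llama_rope_with_scaled_rope

replace_llama_rope_with_scaled_rope()

from transformers import LlamaForCausalLM, LlamaTokenizer
from peft import PeftModel

base_model_path = "path/to/llama-7b"        # placeholder: your local base model
adapter_path = "path/to/this-lora-adapter"  # placeholder: this repo's adapter files

tokenizer = LlamaTokenizer.from_pretrained(base_model_path)
model = LlamaForCausalLM.from_pretrained(base_model_path)
model = PeftModel.from_pretrained(model, adapter_path)
```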

#### Using Oobabooga with Exllama?
Switch your loader to `exllama` or `exllama_hf` and add the arguments `max_seq_len 8192` and `compress_pos_emb 4`. **While the model may work well with `compress_pos_emb 2`, it was trained on 4, so that is what I advocate for you to use.**

Example in the command-line:
- `python server.py --max_seq_len 8192 --compress_pos_emb 4 --loader exllama_hf`

In the UI, you will see the loader option in the `Models` tab. Once you select either `exllama` or `exllama_hf`, the `max_seq_len` and `compress_pos_emb` settings will appear.

#### Training Details
I trained the LoRA with the following configuration (restated as a code sketch after the list):
- 1200 samples (~400 samples over 2048 sequence length)
- learning rate of 3e-4
- 3 epochs
- The exported modules are:
  - q_proj
  - k_proj
  - v_proj
  - o_proj
  - no bias
- Rank = 4
- Alpha = 8
- no dropout
- weight decay of 0.1
- AdamW beta1 of 0.9, beta2 of 0.99, epsilon of 1e-5
- Trained on 4-bit base model
- Cutoff length: 4096
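
For reference, here is that configuration restated with `peft`/`transformers` objects. This is only a hedged sketch: the actual training script is not part of this commit, the dataset handling, 4-bit base-model loading, and trainer loop are omitted, and `output_dir` is a placeholder.

```python
from peft import LoraConfig
from transformers import TrainingArguments

lora_config = LoraConfig(
    r=4,              # Rank = 4
    lora_alpha=8,     # Alpha = 8
    lora_dropout=0.0, # no dropout
    bias="none",      # no bias
    target_modules=["q_proj", "k_proj", "v_proj", "o_proj"],
    task_type="CAUSAL_LM",
)

training_args = TrainingArguments(
    output_dir="superhot-7b-8k-lora",  # placeholder output directory
    learning_rate=3e-4,
    num_train_epochs=3,
    weight_decay=0.1,
    adam_beta1=0.9,
    adam_beta2=0.99,
    adam_epsilon=1e-5,
)
```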
adapter_config.json
ADDED
@@ -0,0 +1,19 @@
{
    "base_model_name_or_path": "",
    "bias": "none",
    "fan_in_fan_out": false,
    "inference_mode": true,
    "init_lora_weights": true,
    "lora_alpha": 8,
    "lora_dropout": 0,
    "modules_to_save": null,
    "peft_type": "LORA",
    "r": 4,
    "target_modules": [
        "q_proj",
        "k_proj",
        "v_proj",
        "o_proj"
    ],
    "task_type": "CAUSAL_LM"
}
adapter_model.bin
ADDED
@@ -0,0 +1,3 @@
version https://git-lfs.github.com/spec/v1
oid sha256:34e672a6931364b04ce9bea789616f54511dd341af44d071204f7576d003315e
size 16869642
llama_rope_scaled_monkey_patch.py
ADDED
@@ -0,0 +1,64 @@
import torch
import transformers
import transformers.models.llama.modeling_llama
from einops import rearrange
import random


class ScaledRotaryEmbedding(torch.nn.Module):
    def __init__(self, dim, max_position_embeddings=2048, base=10000, device=None):
        super().__init__()
        inv_freq = 1.0 / (base ** (torch.arange(0, dim, 2).float().to(device) / dim))
        self.register_buffer("inv_freq", inv_freq)

        # Hard-code the extended context length, overriding whatever Transformers passes in.
        max_position_embeddings = 8192

        # Build here to make `torch.jit.trace` work.
        self.max_seq_len_cached = max_position_embeddings
        t = torch.arange(
            self.max_seq_len_cached,
            device=self.inv_freq.device,
            dtype=self.inv_freq.dtype,
        )

        # Compress positions by 0.25 so 8192 tokens span the original 2048-position range.
        self.scale = 1 / 4
        t *= self.scale

        freqs = torch.einsum("i,j->ij", t, self.inv_freq)
        # Different from paper, but it uses a different permutation in order to obtain the same calculation
        emb = torch.cat((freqs, freqs), dim=-1)
        self.register_buffer(
            "cos_cached", emb.cos()[None, None, :, :], persistent=False
        )
        self.register_buffer(
            "sin_cached", emb.sin()[None, None, :, :], persistent=False
        )

    def forward(self, x, seq_len=None):
        # x: [bs, num_attention_heads, seq_len, head_size]
        # This `if` block is unlikely to be run after we build sin/cos in `__init__`. Keep the logic here just in case.
        if seq_len > self.max_seq_len_cached:
            self.max_seq_len_cached = seq_len
            t = torch.arange(
                self.max_seq_len_cached, device=x.device, dtype=self.inv_freq.dtype
            )
            t *= self.scale
            freqs = torch.einsum("i,j->ij", t, self.inv_freq)
            # Different from paper, but it uses a different permutation in order to obtain the same calculation
            emb = torch.cat((freqs, freqs), dim=-1).to(x.device)
            self.register_buffer(
                "cos_cached", emb.cos()[None, None, :, :], persistent=False
            )
            self.register_buffer(
                "sin_cached", emb.sin()[None, None, :, :], persistent=False
            )
        return (
            self.cos_cached[:, :, :seq_len, ...].to(dtype=x.dtype),
            self.sin_cached[:, :, :seq_len, ...].to(dtype=x.dtype),
        )


def replace_llama_rope_with_scaled_rope():
    # Swap the library's rotary embedding class for the scaled version above.
    transformers.models.llama.modeling_llama.LlamaRotaryEmbedding = (
        ScaledRotaryEmbedding
    )
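
A quick sanity check of the swap (a sketch; it assumes `llama_rope_scaled_monkey_patch.py` sits in your working directory and that Transformers is installed):

```python
import transformers
from llama_rope_scaled_monkey_patch import (
    ScaledRotaryEmbedding,
    replace_llama_rope_with_scaled_rope,
)

replace_llama_rope_with_scaled_rope()

# Any LLaMA model constructed after this point builds its rotary embeddings
# from the scaled class, giving the 0.25 position compression at 8192 context.
assert (
    transformers.models.llama.modeling_llama.LlamaRotaryEmbedding
    is ScaledRotaryEmbedding
)
```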