File size: 4,436 Bytes
42c3bab
 
0f9e3cc
42c3bab
0f9e3cc
42c3bab
 
0f9e3cc
42c3bab
0f9e3cc
42c3bab
0f9e3cc
2dbb94f
0f9e3cc
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1f2ceb7
0f9e3cc
1f2ceb7
0f9e3cc
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
47875c3
0f9e3cc
47875c3
0f9e3cc
 
 
 
c26b20b
 
 
 
0f9e3cc
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1e5df7d
1cdd0e3
2dbb94f
0f9e3cc
 
 
 
aa4a704
0f9e3cc
 
 
 
 
 
 
 
f970cca
0f9e3cc
 
 
 
 
 
 
315ce9c
0f9e3cc
 
 
 
 
 
 
f6687d9
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
import gc

import numpy as np
import requests as rq
import torch

from transformers import AutoTokenizer, LlamaConfig, LlamaForCausalLM, PreTrainedTokenizerFast, TrainingArguments
from datasets import load_dataset

from tokenizers import ByteLevelBPETokenizer
import trl

# Load the training split, then keep only its first 100,000 rows.
dataset = load_dataset("nroggendorff/openhermes", split="train")
dataset = dataset.select(range(100_000))

def get_training_corpus():
    """Yield the ``"text"`` column of the global ``dataset`` in slices of up to 1000 rows."""
    batch_size = 1000
    total_rows = len(dataset)
    start = 0
    while start < total_rows:
        # Slice first, then pick the column, so each step yields one batch.
        yield dataset[start : start + batch_size]["text"]
        start += batch_size

# --- Tokenizer training -----------------------------------------------------
# Train a byte-level BPE tokenizer on the batches produced by
# get_training_corpus(); the chat markers are part of the trained vocabulary.
training_corpus = get_training_corpus()

tokenizer = ByteLevelBPETokenizer()

tokenizer.train_from_iterator(
    training_corpus,
    vocab_size=3200,
    min_frequency=2,
    special_tokens=["<s>", "<pad>", "</s>", "<unk>", "<mask>", "<|user|>", "<|bot|>", "<|end|>"]
)

# Round-trip through a JSON file so the trained tokenizer can be wrapped in
# the transformers fast-tokenizer class.
tokenizer.save("/tmp/custom_tokenizer.json")

tokenizer = PreTrainedTokenizerFast(tokenizer_file="/tmp/custom_tokenizer.json")

tokenizer.bos_token = "<s>"
tokenizer.eos_token = "</s>"
tokenizer.unk_token = "<unk>"
tokenizer.pad_token = "<pad>"
tokenizer.mask_token = "<mask>"

# Register the chat markers as additional special tokens (one call both sets
# the attribute and registers the tokens with the tokenizer).
tokenizer.add_special_tokens({
    "additional_special_tokens": ["<|user|>", "<|bot|>", "<|end|>"]
})

# Jinja chat template: BOS, then strictly alternating user/assistant turns,
# each wrapped as "<|user|>\n...<|end|>\n" or "<|bot|>\n...<|end|>\n", then EOS.
# Any other role, or a turn out of order, raises inside the template.
chat_template = "{{bos_token}}{% for message in messages %}{% if (message['role'] == 'user') != (loop.index0 % 2 == 0) %}{{ raise_exception('Conversation roles must alternate user/assistant/user/assistant/...') }}{% endif %}{% if message['role'] == 'user' %}{{ '<|user|>\n' + message['content'] + '<|end|>\n' }}{% elif message['role'] == 'assistant' %}{{ '<|bot|>\n' + message['content'] + '<|end|>\n' }}{% else %}{{ raise_exception('Only user and assistant roles are supported!') }}{% endif %}{% endfor %}{{ eos_token }}"

tokenizer.chat_template = chat_template

# Persist the configured tokenizer and reload it from disk; everything below
# uses the reloaded instance.
tokenizer.save_pretrained("/tmp/llama-tokenizer")

tokenizer = AutoTokenizer.from_pretrained("/tmp/llama-tokenizer")

# Render a short sample conversation to check the template output by eye.
print(tokenizer.apply_chat_template(
    [
        {"role": "user", "content": "Why is the sky blue?"},
        {"role": "assistant", "content": "Due to rayleigh scattering."},
        {"role": "user", "content": "That's cool."},
        {"role": "assistant", "content": "Yeah, I agree."},
    ],
    tokenize=False,
))

# --- Model ------------------------------------------------------------------
config = LlamaConfig(
    # len(tokenizer) counts added tokens as well as the base vocabulary, so
    # every id the tokenizer can emit has an embedding row.
    vocab_size=len(tokenizer),
    hidden_size=512,
    intermediate_size=1024,
    num_hidden_layers=8,
    num_attention_heads=8,
    max_position_embeddings=512,
    rms_norm_eps=1e-6,
    initializer_range=0.02,
    use_cache=True,
    pad_token_id=tokenizer.pad_token_id,
    bos_token_id=tokenizer.bos_token_id,
    eos_token_id=tokenizer.eos_token_id,
    tie_word_embeddings=False,
)

# Build the model from the config alone (no pretrained checkpoint is loaded).
model = LlamaForCausalLM(config)

def format_prompts(examples):
    """Re-render each raw conversation string through the tokenizer's chat template.

    Args:
        examples: A batch mapping whose ``'text'`` entry is an iterable of
            strings in which turns are separated by ``<|end|>``.

    Returns:
        A dict with a single ``'text'`` key holding one templated string per
        input string, in the same order.
    """

    def _to_messages(raw):
        # Segments are consumed two at a time: even positions become the user
        # turn, odd positions the assistant turn. An unpaired trailing segment
        # is ignored.
        segments = raw.split('<|end|>')
        messages = []
        for user_part, bot_part in zip(segments[0::2], segments[1::2]):
            messages.append({"role": "user", "content": user_part.replace("<|user|>", "")})
            messages.append({"role": "assistant", "content": bot_part.replace("<|bot|>", "")})
        return messages

    return {
        'text': [
            tokenizer.apply_chat_template(_to_messages(raw), tokenize=False)
            for raw in examples['text']
        ]
    }

# Rewrite every row's "text" column through the chat template.
dataset = dataset.map(format_prompts, batched=True)

# Print one formatted sample as a sanity check. Index the row first so only
# that row is read, rather than pulling the whole "text" column to pick one item.
print(dataset[2]['text'])

args = TrainingArguments(
    output_dir="mayo",
    num_train_epochs=1,
    per_device_train_batch_size=64,
    #gradient_accumulation_steps=8,
    learning_rate=1e-5,
    save_steps=100000,
    fp16=True,
    optim="sgd",
    # NOTE(review): confirm this setting has any effect together with
    # optim="sgd"; it may only apply to layer-targeted optimizers.
    optim_target_modules=["attn", "mlp"]
)

trainer = trl.SFTTrainer(
    model=model,
    tokenizer=tokenizer,
    args=args,
    train_dataset=dataset,
    dataset_text_field='text',
    max_seq_length=512
)

# Select CUDA device 0 and release unreferenced memory before training starts.
torch.cuda.set_device(0)

gc.collect()
torch.cuda.empty_cache()

trainer.train()

# Push the model and tokenizer held by the trainer to the Hub repository.
#trainer.push_to_hub()
trained_model = trainer.model
trained_tokenizer = trainer.tokenizer

repo_id = "makeshift-mayo"
trained_model.push_to_hub(repo_id)
trained_tokenizer.push_to_hub(repo_id)

# NOTE(review): the script always ends by raising, so the process exits with
# an error status even on success - presumably deliberate (e.g. to stop the
# hosting runtime); confirm before changing.
raise RuntimeError("The script was finished.")