import gc

import torch
from datasets import load_dataset
from tokenizers import ByteLevelBPETokenizer
from transformers import (
    AutoTokenizer,
    LlamaConfig,
    LlamaForCausalLM,
    PreTrainedTokenizerFast,
    TrainingArguments,
)
import trl
dataset = load_dataset("nroggendorff/openhermes", split="train").select(range(10_000))
def get_training_corpus():
    # Stream the corpus in batches of 1,000 texts so the tokenizer trainer
    # never needs the whole dataset in memory at once.
    for i in range(0, len(dataset), 1000):
        yield dataset[i : i + 1000]["text"]

training_corpus = get_training_corpus()
tokenizer = ByteLevelBPETokenizer()
tokenizer.train_from_iterator(
    training_corpus,
    vocab_size=3200,
    min_frequency=2,
    special_tokens=["<s>", "<pad>", "</s>", "<unk>", "<mask>", "<|user|>", "<|bot|>", "<|end|>"],
)
tokenizer.save("/tmp/custom_tokenizer.json")
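# Optional sanity check: each special token should map to a single id in the
# freshly trained vocab.
# print(tokenizer.token_to_id("<|user|>"), tokenizer.token_to_id("<|end|>"))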
tokenizer = PreTrainedTokenizerFast(tokenizer_file="/tmp/custom_tokenizer.json")
tokenizer.bos_token = "<s>"
tokenizer.eos_token = "</s>"
tokenizer.unk_token = "<unk>"
tokenizer.pad_token = "<pad>"
tokenizer.mask_token = "<mask>"
chat_template = (
    "{{bos_token}}"
    "{% for message in messages %}"
    "{% if (message['role'] == 'user') != (loop.index0 % 2 == 0) %}"
    "{{ raise_exception('Conversation roles must alternate user/assistant/user/assistant/...') }}"
    "{% endif %}"
    "{% if message['role'] == 'user' %}"
    "{{ '<|user|>\n' + message['content'] + '<|end|>\n' }}"
    "{% elif message['role'] == 'assistant' %}"
    "{{ '<|bot|>\n' + message['content'] + '<|end|>\n' }}"
    "{% else %}"
    "{{ raise_exception('Only user and assistant roles are supported!') }}"
    "{% endif %}"
    "{% endfor %}"
    "{{ eos_token }}"
)
tokenizer.chat_template = chat_template
tokenizer.add_special_tokens({
    "additional_special_tokens": ["<|user|>", "<|bot|>", "<|end|>"]
})
# Convenience attributes for later reference; these are not standard
# Hugging Face tokenizer fields.
tokenizer.user_token_id = tokenizer.convert_tokens_to_ids("<|user|>")
tokenizer.assistant_token_id = tokenizer.convert_tokens_to_ids("<|bot|>")
tokenizer.save_pretrained("/tmp/llama-tokenizer")
tokenizer = AutoTokenizer.from_pretrained("/tmp/llama-tokenizer")
print(tokenizer.apply_chat_template(
    [
        {"role": "user", "content": "Why is the sky blue?"},
        {"role": "assistant", "content": "Due to Rayleigh scattering."},
        {"role": "user", "content": "That's cool."},
        {"role": "assistant", "content": "Yeah, I agree."},
    ],
    tokenize=False,
))
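# Given the template above, this should print:
#   <s><|user|>
#   Why is the sky blue?<|end|>
#   <|bot|>
#   Due to Rayleigh scattering.<|end|>
#   <|user|>
#   That's cool.<|end|>
#   <|bot|>
#   Yeah, I agree.<|end|>
#   </s>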
config = LlamaConfig(
    vocab_size=len(tokenizer),  # len() counts added special tokens, unlike .vocab_size
    hidden_size=512,
    intermediate_size=1024,
    num_hidden_layers=8,
    num_attention_heads=8,
    max_position_embeddings=512,
    rms_norm_eps=1e-6,
    initializer_range=0.02,
    use_cache=True,
    pad_token_id=tokenizer.pad_token_id,
    bos_token_id=tokenizer.bos_token_id,
    eos_token_id=tokenizer.eos_token_id,
    tie_word_embeddings=False,
)
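# Rough size, assuming the config above: ~1.6M embedding + ~1.6M LM head
# (untied), plus ~2.6M per transformer layer x 8 layers, so about 24M
# parameters in total.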
model = LlamaForCausalLM(config)
def format_prompts(examples):
    # Re-render each raw conversation through the tokenizer's chat template.
    texts = []
    for text in examples["text"]:
        conversation = []
        parts = text.split("<|end|>")
        # Turns alternate user/bot, so consume the parts two at a time.
        for i in range(0, len(parts) - 1, 2):
            # strip() keeps the re-templated text from picking up extra newlines
            prompt = parts[i].replace("<|user|>", "").strip()
            response = parts[i + 1].replace("<|bot|>", "").strip()
            conversation.append({"role": "user", "content": prompt})
            conversation.append({"role": "assistant", "content": response})
        texts.append(tokenizer.apply_chat_template(conversation, tokenize=False))
    return {"text": texts}
dataset = dataset.map(format_prompts, batched=True)
print(dataset['text'][2])
args = TrainingArguments(
    output_dir="mayo",
    num_train_epochs=1,
    per_device_train_batch_size=64,
    gradient_accumulation_steps=4,
    learning_rate=1e-5,
    save_steps=100_000,
    fp16=True,
    optim="sgd",  # note: optim_target_modules is only valid with GaLore optimizers
)
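# With 10,000 examples and an effective batch of 64 x 4 = 256 (assuming a
# single GPU), one epoch comes to roughly 39 optimizer steps.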
trainer = trl.SFTTrainer(
    model=model,
    tokenizer=tokenizer,
    args=args,
    train_dataset=dataset,
    dataset_text_field="text",
    max_seq_length=512,
)
torch.cuda.set_device(0)
# Free any stray references and cached CUDA blocks before training starts.
gc.collect()
torch.cuda.empty_cache()
trainer.train()
#trainer.push_to_hub()
trained_model = trainer.model
trained_tokenizer = trainer.tokenizer
repo_id = "makeshift-mayo"
trained_model.push_to_hub(repo_id)
trained_tokenizer.push_to_hub(repo_id)
# Intentional crash once everything is uploaded; this is what makes the
# Space report "Runtime error" after a successful run.
raise RuntimeError("The script was finished.")
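# Usage sketch (never executed; the raise above ends the run). Loading the
# pushed checkpoint should look like this, where "your-username" is a
# placeholder for the namespace the repo was pushed to:
#
#   from transformers import AutoModelForCausalLM, AutoTokenizer
#   tok = AutoTokenizer.from_pretrained("your-username/makeshift-mayo")
#   lm = AutoModelForCausalLM.from_pretrained("your-username/makeshift-mayo")
#   prompt = tok.apply_chat_template(
#       [{"role": "user", "content": "Why is the sky blue?"}],
#       tokenize=False,
#   )
#   out = lm.generate(**tok(prompt, return_tensors="pt"), max_new_tokens=64)
#   print(tok.decode(out[0]))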