from datasets import load_dataset |
# Load the train/validation CSV files into one dataset object keyed by split.
Falcon = load_dataset(
    'csv',
    data_files=dict(
        train='FalconData_train.csv',
        validation='FalconData_validation.csv',
    ),
)
print('Dataset Loaded!')
"""Then take a look at an example:"""
# Notebook leftovers: as bare expressions in a script these only index the
# first row of each split; nothing is displayed.
Falcon['train'][0]
Falcon['validation'][0]
"""The next step is to load a DistilGPT2 tokenizer to process the `text` subfield:""" |
from transformers import AutoTokenizer, GPT2TokenizerFast |
# Flatten any nested columns into top-level ones, then index the first
# training row (a no-op outside a notebook).
Falcon = Falcon.flatten()
Falcon["train"][0]

# Tokenizer comes from the distilgpt2 checkpoint; the end-of-sequence token
# is reused as the padding token.
tokenizer = AutoTokenizer.from_pretrained("distilgpt2")
tokenizer.pad_token = tokenizer.eos_token
def preprocess_function(examples):
    """Tokenize the ``Text`` column of a batch.

    Args:
        examples: Batch mapping with a ``"Text"`` key holding one entry per
            row. An entry may be a plain string (used as-is), a sequence of
            strings (joined with single spaces), or ``None`` (treated as
            empty text).

    Returns:
        Whatever the module-level ``tokenizer`` returns for the list of texts.
    """
    texts = []
    for entry in examples["Text"]:
        if entry is None:
            # Missing cell: tokenize as empty text instead of failing in join().
            texts.append("")
        elif isinstance(entry, str):
            # Joining a str would insert a space between every character.
            texts.append(entry)
        else:
            texts.append(" ".join(entry))
    return tokenizer(texts)
# Tokenize every split in batches using 4 worker processes. The columns
# listed for the train split are removed from the mapped output.
tokenized_Falcon = Falcon.map(
    function=preprocess_function,
    remove_columns=Falcon["train"].column_names,
    num_proc=4,
    batched=True,
)

# Chunk length read by group_texts: the tokenizer's reported maximum length.
block_size = tokenizer.model_max_length
def group_texts(examples, chunk_size=None):
    """Concatenate a tokenized batch and re-split it into fixed-size chunks.

    Args:
        examples: Mapping from column name to a list of per-row lists
            (e.g. token ids). Must contain an ``"input_ids"`` column.
        chunk_size: Length of each output chunk. When ``None`` (the default,
            and what ``Dataset.map`` uses) the module-level ``block_size``
            is read.

    Returns:
        A dict with the same columns, each a list of chunks, plus a
        ``"labels"`` column that is a shallow copy of the ``"input_ids"``
        chunk list. Trailing items that do not fill a whole chunk are
        dropped; if the whole batch is shorter than one chunk it is returned
        as a single short chunk.
    """
    # Local import keeps the function self-contained.
    from itertools import chain

    size = block_size if chunk_size is None else chunk_size

    # chain.from_iterable flattens in linear time, whereas sum(rows, [])
    # re-copies the growing accumulator for every row.
    concatenated = {
        k: list(chain.from_iterable(examples[k])) for k in examples.keys()
    }
    # All columns are cut using the length of the first column.
    total_length = len(concatenated[list(examples.keys())[0]])
    if total_length >= size:
        # Round down to a whole number of chunks.
        total_length = (total_length // size) * size
    result = {
        k: [t[i : i + size] for i in range(0, total_length, size)]
        for k, t in concatenated.items()
    }
    result["labels"] = result["input_ids"].copy()
    return result
"""Apply the `group_texts` function over the entire dataset:""" |
# Re-chunk every tokenized split with group_texts, batched, on 4 processes.
lm_dataset = tokenized_Falcon.map(
    function=group_texts,
    num_proc=4,
    batched=True,
)
from transformers import DataCollatorForLanguageModeling
# mlm=False: collate for causal language modeling, not masked-token modeling.
data_collator = DataCollatorForLanguageModeling(
    mlm=False,
    tokenizer=tokenizer,
)
from transformers import AutoModelForCausalLM, TrainingArguments, Trainer |
import torch |
# Load the pretrained causal-LM weights in bfloat16.
# NOTE(review): the tokenizer is loaded from "distilgpt2" while the weights
# come from "rwh/tinytoo" -- confirm the two share a vocabulary.
model = AutoModelForCausalLM.from_pretrained(
    "rwh/tinytoo",
    torch_dtype=torch.bfloat16,
)
print('Model Loaded!')
# Explicitly place the model on the CUDA device.
model.to('cuda')
# Directory used both for checkpoints and for the final saved model.
OutputDir = "ReadyModel3"

training_args = TrainingArguments(
    # --- output / checkpointing ---
    output_dir=OutputDir,
    overwrite_output_dir=True,
    save_strategy="steps",
    save_total_limit=2,
    save_safetensors=True,
    push_to_hub=False,
    # --- evaluation ---
    # Evaluation and saving both run on a "steps" schedule.
    evaluation_strategy="steps",
    per_device_eval_batch_size=8,
    load_best_model_at_end=True,
    # --- optimisation ---
    num_train_epochs=5,
    per_device_train_batch_size=8,
    learning_rate=1e-5,
    weight_decay=0.001,
    lr_scheduler_type='linear',
    bf16=True,
)
# Wire the model, hyper-parameters, chunked splits and collator together.
trainer = Trainer(
    args=training_args,
    model=model,
    data_collator=data_collator,
    eval_dataset=lm_dataset["validation"],
    train_dataset=lm_dataset["train"],
)
# Run the fine-tuning loop, then write the resulting model to OutputDir.
print('Started Training!')
trainer.train()

trainer.save_model(output_dir=OutputDir)
print('Saved Model Path:', OutputDir)
import math |
# Evaluate with the trainer's eval dataset and report perplexity, computed
# as exp(eval_loss).
eval_results = trainer.evaluate()
try:
    perplexity = math.exp(eval_results['eval_loss'])
except OverflowError:
    # math.exp raises for very large losses; report infinity rather than
    # crashing after training has already completed.
    perplexity = float("inf")
print(f"Perplexity: {perplexity:.2f}")