# Spaces: Sleeping (hosting-page status text captured with the source; not part of the program)
import os
import re

import torch
import transformers
from transformers import TextDataset, DataCollatorForLanguageModeling
from transformers import GPT2Tokenizer, GPT2LMHeadModel
from transformers import Trainer, TrainingArguments
# Print whether PyTorch reports a usable CUDA device, before any training
# work starts.
print(torch.cuda.is_available())
def load_dataset(file_path, tokenizer, block_size=128):
    """Build a ``TextDataset`` over the text file at ``file_path``.

    Args:
        file_path: Path of the text file handed to ``TextDataset``.
        tokenizer: Tokenizer handed to ``TextDataset``.
        block_size: Block size handed to ``TextDataset`` (defaults to 128).

    Returns:
        The constructed ``TextDataset`` instance.
    """
    return TextDataset(
        file_path=file_path,
        tokenizer=tokenizer,
        block_size=block_size,
    )
def load_data_collator(tokenizer, mlm=False):
    """Create the language-modeling data collator for ``tokenizer``.

    Args:
        tokenizer: Tokenizer handed to ``DataCollatorForLanguageModeling``.
        mlm: Forwarded unchanged as the collator's ``mlm`` flag
            (defaults to ``False``).

    Returns:
        The constructed ``DataCollatorForLanguageModeling`` instance.
    """
    return DataCollatorForLanguageModeling(tokenizer=tokenizer, mlm=mlm)
def train(train_file_path, model_name, output_dir, overwrite_output_dir,
          per_device_train_batch_size, num_train_epochs, save_steps,
          resume_from_checkpoint):
    """Fine-tune the ``malteos/gpt2-uk`` causal language model on a text file.

    The tokenizer and the freshly loaded base model are written to
    ``output_dir`` before training starts; after training the trainer saves
    the resulting model via ``trainer.save_model()``.

    Args:
        train_file_path: Path of the training text file.
        model_name: Currently unused; see the review note in the body.
        output_dir: Directory for the saved tokenizer, model and the
            trainer's output.
        overwrite_output_dir: Forwarded to ``TrainingArguments``.
        per_device_train_batch_size: Forwarded to ``TrainingArguments``.
        num_train_epochs: Forwarded to ``TrainingArguments``.
        save_steps: Forwarded to ``TrainingArguments`` as the checkpoint
            interval. Previously this argument was accepted but dropped.
        resume_from_checkpoint: Forwarded to ``Trainer.train``.
    """
    from transformers import AutoTokenizer, AutoModelForCausalLM

    # NOTE(review): ``model_name`` is part of the signature but the base
    # checkpoint has always been hard-coded here. It is kept that way so
    # existing callers (which pass a local directory) keep loading the same
    # weights; confirm with the author before switching to ``model_name``.
    base_checkpoint = "malteos/gpt2-uk"

    tokenizer = AutoTokenizer.from_pretrained(base_checkpoint)
    train_dataset = load_dataset(train_file_path, tokenizer)
    data_collator = load_data_collator(tokenizer)
    tokenizer.save_pretrained(output_dir)

    model = AutoModelForCausalLM.from_pretrained(base_checkpoint)
    model.save_pretrained(output_dir)

    training_args = TrainingArguments(
        output_dir=output_dir,
        overwrite_output_dir=overwrite_output_dir,
        per_device_train_batch_size=per_device_train_batch_size,
        num_train_epochs=num_train_epochs,
        # Fix: honor the caller's checkpoint interval instead of silently
        # falling back to the library default.
        save_steps=save_steps,
    )

    trainer = Trainer(
        model=model,
        args=training_args,
        data_collator=data_collator,
        train_dataset=train_dataset,
    )

    trainer.train(resume_from_checkpoint=resume_from_checkpoint)
    trainer.save_model()
# --- Paths -------------------------------------------------------------
train_directory = 'H:/Finetunning/q_and_a'
train_file_path = 'H:/Finetunning/journal.txt'
output_dir = 'H:/Finetunning/custom_full_text'
model_name = train_directory

# --- Training settings -------------------------------------------------
overwrite_output_dir = False
per_device_train_batch_size = 8
num_train_epochs = 51
save_steps = 50000

print("Починаємо навчання...")

train(
    train_file_path=train_file_path,
    model_name=model_name,
    output_dir=output_dir,
    overwrite_output_dir=overwrite_output_dir,
    per_device_train_batch_size=per_device_train_batch_size,
    num_train_epochs=num_train_epochs,
    save_steps=save_steps,
    # False for the very first run; True to continue from a previously
    # saved stopping point.
    resume_from_checkpoint=True,
)