# Text Generation using GPT (Using Huggingface)

## Project Setup

In [1]:
!pip install -q transformers

[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m7.0/7.0 MB[0m [31m33.6 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m7.8/7.8 MB[0m [31m25.6 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m224.5/224.5 kB[0m [31m7.2 MB/s[0m eta [36m0:00:00[0m
[?25h

In [2]:
import torch
import shutil
from torch.utils.data import Dataset, random_split
from transformers import Trainer, TrainingArguments, GPTNeoForCausalLM, GPT2Tokenizer


from google.colab import drive


## Data Preparation

In [3]:
# Load data into colab
!wget https://raw.githubusercontent.com/brunoklein99/deep-learning-notes/master/shakespeare.txt

--2023-04-30 17:26:12--  https://raw.githubusercontent.com/brunoklein99/deep-learning-notes/master/shakespeare.txt
Resolving raw.githubusercontent.com (raw.githubusercontent.com)... 185.199.108.133, 185.199.109.133, 185.199.110.133, ...
Connecting to raw.githubusercontent.com (raw.githubusercontent.com)|185.199.108.133|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 94275 (92K) [text/plain]
Saving to: ‘shakespeare.txt’


2023-04-30 17:26:12 (4.90 MB/s) - ‘shakespeare.txt’ saved [94275/94275]



In [4]:
# Connects colab to google drive
drive.mount('/content/drive')

Mounted at /content/drive


In [5]:
shutil.copy("/content/shakespeare.txt","/content/drive/MyDrive/Colab Notebooks")


'/content/drive/MyDrive/Colab Notebooks/shakespeare.txt'

In [6]:
# Read the text file and returns list of lines in text
def read_file(file_path):
    with open(file_path) as f:
        lines = [line for line in f]
        # lines.remove("")
    return lines


In [7]:
file_path = "/content/drive/MyDrive/Colab Notebooks/shakespeare.txt"

texts = read_file(file_path)
sonnets = []
sonnet = []
for text in texts:
  if len(text)>1:
    sonnet.append(text)
  else:
    sonnets.append(''.join(sonnet))
    sonnet = []

# Remove unnecessary texts


In [8]:
# Prepare sonnets
datas = sonnets[2:-1]
print(len(datas))
for data in datas:
  if len(data)<1:
    datas.remove(data)
print(len(datas))

289
212


In [9]:
# Custome dataset class to load dataset
class ShakespeareDataset(Dataset):
    def __init__(self, txt_list, tokenizer, max_length):
        self.input_ids = []
        self.attn_masks = []
        self.labels = []
        for txt in txt_list:
            # Encode the descriptions using the GPT-Neo tokenizer
            encodings_dict = tokenizer('<|startoftext|>' 
                                        + txt +    
                                        '<|endoftext|>',
                                        truncation=True,
                                        max_length=max_length, 
                                            padding="max_length")
            input_ids = torch.tensor(encodings_dict['input_ids'])    
            self.input_ids.append(input_ids)
            mask = torch.tensor(encodings_dict['attention_mask'])
            self.attn_masks.append(mask)
    
    def __len__(self):
        return len(self.input_ids)
    
    def __getitem__(self, idx):
        return self.input_ids[idx], self.attn_masks[idx]

## Initialize tokenizer, model

In [10]:
# Set the random seed to a fixed value to get reproducible results 
torch.manual_seed(42)

# Download the pre-trained GPT-Neo model's tokenizer
# Add the custom tokens denoting the beginning and the end 
# of the sequence and a special token for padding
tokenizer = GPT2Tokenizer.from_pretrained("EleutherAI/gpt-neo-125M",    
                            bos_token='<|startoftext|>',
                            eos_token='<|endoftext|>',
                            pad_token='<|pad|>')

# Download the pre-trained GPT-Neo model and transfer it to the GPU
model = GPTNeoForCausalLM.from_pretrained("EleutherAI/gpt-neo-125M").cuda()

# Resize the token embeddings because we've just added 3 new tokens 
model.resize_token_embeddings(len(tokenizer))

Downloading (…)olve/main/vocab.json:   0%|          | 0.00/899k [00:00<?, ?B/s]

Downloading (…)olve/main/merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

Downloading (…)cial_tokens_map.json:   0%|          | 0.00/357 [00:00<?, ?B/s]

Downloading (…)okenizer_config.json:   0%|          | 0.00/560 [00:00<?, ?B/s]

Downloading (…)lve/main/config.json:   0%|          | 0.00/1.01k [00:00<?, ?B/s]

Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


Downloading pytorch_model.bin:   0%|          | 0.00/526M [00:00<?, ?B/s]

Embedding(50259, 768)

## Train/Test Split data

In [11]:
max_length = max([len(tokenizer.encode(sonnet)) for sonnet in datas])

# Load dataset
dataset = ShakespeareDataset(sonnets, tokenizer, max_length)

# Split data into train/val
train_size = int(0.9 * len(dataset))

train_data, val_data = random_split(dataset, [train_size, len(dataset) - train_size])

max_length

351

In [None]:
tokenizer.batch_decode(val_data[0])

['<|startoftext|> <|endoftext|> <|pad|> <|pad|> <|pad|> <|pad|> <|pad|> <|pad|> <|pad|> <|pad|> <|pad|> <|pad|> <|pad|> <|pad|> <|pad|> <|pad|> <|pad|> <|pad|> <|pad|> <|pad|> <|pad|> <|pad|> <|pad|> <|pad|> <|pad|> <|pad|> <|pad|> <|pad|> <|pad|> <|pad|> <|pad|> <|pad|> <|pad|> <|pad|> <|pad|> <|pad|> <|pad|> <|pad|> <|pad|> <|pad|> <|pad|> <|pad|> <|pad|> <|pad|> <|pad|> <|pad|> <|pad|> <|pad|> <|pad|> <|pad|> <|pad|> <|pad|> <|pad|> <|pad|> <|pad|> <|pad|> <|pad|> <|pad|> <|pad|> <|pad|> <|pad|> <|pad|> <|pad|> <|pad|> <|pad|> <|pad|> <|pad|> <|pad|> <|pad|> <|pad|> <|pad|> <|pad|> <|pad|> <|pad|> <|pad|> <|pad|> <|pad|> <|pad|> <|pad|> <|pad|> <|pad|> <|pad|> <|pad|> <|pad|> <|pad|> <|pad|> <|pad|> <|pad|> <|pad|> <|pad|> <|pad|> <|pad|> <|pad|> <|pad|> <|pad|> <|pad|> <|pad|> <|pad|> <|pad|> <|pad|> <|pad|> <|pad|> <|pad|> <|pad|> <|pad|> <|pad|> <|pad|> <|pad|> <|pad|> <|pad|> <|pad|> <|pad|> <|pad|> <|pad|> <|pad|> <|pad|> <|pad|> <|pad|> <|pad|> <|pad|> <|pad|> <|pad|> <|pad|> 

## Train Model

In [12]:
# Here I will pass the output directory where 
# the model predictions and checkpoints will be stored, 
# batch sizes for the training and validation steps, 
# and warmup_steps to gradually increase the learning rate
learning_rates = [5e-5, 3e-5, 1e-5]

for learning_rate in learning_rates:

    training_args = TrainingArguments(output_dir=f'./results_{learning_rate}',
                                      num_train_epochs=5,
                                      logging_steps=1000,
                                      save_steps=1000,
                                      evaluation_strategy='steps',
                                      eval_steps=1000,                               
                                      per_device_train_batch_size=2,
                                      per_device_eval_batch_size=2,
                                      warmup_steps=100,
                                      learning_rate=learning_rate,
                                      weight_decay=0.01,  
                                      logging_dir=f'./logs_{learning_rate}')

    trainer = Trainer(model=model, args=training_args,  
                      train_dataset=train_data,
                      eval_dataset=val_data, 
                      # This custom collate function is necessary 
                      # to built batches of data
                      data_collator=lambda data: {'input_ids': torch.stack([f[0] for f in data]),       
                  'attention_mask': torch.stack([f[1] for f in data]),
                  'labels': torch.stack([f[0] for f in data])})

    # Start training process!
    print(f"Training result for learning rate: {learning_rate}")
    trainer.train()
    print("\n\n")

Training result for learning rate: 5e-05




Step,Training Loss,Validation Loss





Training result for learning rate: 3e-05


Step,Training Loss,Validation Loss





Training result for learning rate: 1e-05


Step,Training Loss,Validation Loss







BAsed on the results above, it looks like model trained with learning rate = 5e-5 is more promising than others.

In [13]:
training_args = TrainingArguments(output_dir=f'./results',
                                      num_train_epochs=5,
                                      logging_steps=1000,
                                      save_steps=5000,
                                      evaluation_strategy='steps',
                                      eval_steps=1000,                               
                                      per_device_train_batch_size=2,
                                      per_device_eval_batch_size=2,
                                      warmup_steps=100,
                                      learning_rate=5e-5,
                                      weight_decay=0.01,  
                                      logging_dir=f'./logs')

trainer = Trainer(model=model, args=training_args,  
                  train_dataset=train_data,
                  eval_dataset=val_data, 
                  # This custom collate function is necessary 
                  # to built batches of data
                  data_collator=lambda data: {'input_ids': torch.stack([f[0] for f in data]),       
              'attention_mask': torch.stack([f[1] for f in data]),
              'labels': torch.stack([f[0] for f in data])})

# Start training process!
trainer.train()


Step,Training Loss,Validation Loss


TrainOutput(global_step=655, training_loss=0.12344741238892533, metrics={'train_runtime': 161.029, 'train_samples_per_second': 8.135, 'train_steps_per_second': 4.068, 'total_flos': 234581319198720.0, 'train_loss': 0.12344741238892533, 'epoch': 5.0})

In [14]:
# Save model in the specified file path
trainer.save_model("/content/drive/MyDrive/Colab Notebooks/")

In [25]:
tokenizer.save_pretrained("/content/drive/MyDrive/Colab Notebooks/")

('/content/drive/MyDrive/Colab Notebooks/tokenizer_config.json',
 '/content/drive/MyDrive/Colab Notebooks/special_tokens_map.json',
 '/content/drive/MyDrive/Colab Notebooks/vocab.json',
 '/content/drive/MyDrive/Colab Notebooks/merges.txt',
 '/content/drive/MyDrive/Colab Notebooks/added_tokens.json')

## Checking Model Output

In [16]:
generated = tokenizer("<|startoftext|>", return_tensors="pt").input_ids.cuda()
sample_outputs = model.generate(generated, do_sample=True, top_k=50,
                                # bos_token='<|startoftext|>',
                                # eos_token='<|endoftext|>', pad_token='<|pad|>',
                                max_length=300, top_p=0.95, temperature=1.9, num_return_sequences=20)
for i, sample_output in enumerate(sample_outputs):
    print("{}: {}".format(i, tokenizer.decode(sample_output, skip_special_tokens=True)))


The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


0: 
1: 
2: Like as that which unsadiq oracles are made of that whichis, we do
Dose form our own composition, and nature lends
 substance to errors, to renew calls, to renew
Profound Errors, to renew our injunctions: so
dear lovewFallacies, may we not beueless; but
wretched are our health; do not exceed this,
 nor our stores other than well grounded be:
Do not loseдThumbnailImage](........:<?>]-------------------------------- '
3: In the second case where we found new gems,
we looked deep sorrow in death's purple,
 suffered such penance in all their woes,
 filled the water deep with glad surprised,
 and together we had made such a marriage of love
 and welcome that we are said to have eternity;
and we are yet condemned where are our old highouncesrest
 and our new highron�the ocean hermed: we are comfort elsewhere,
 because we are not alone '. (!

4:     Supports the vassuing of slavery,
Not theticket-agresseth out alchemy,  
Nor slandered his beauty within,
Made the proud object that

In [19]:
import locale
locale.getpreferredencoding = lambda: "UTF-8"

In [20]:
! transformers-cli env

Instructions for updating:
Use `tf.config.list_physical_devices('GPU')` instead.
2023-04-30 18:06:25.329066: W tensorflow/core/common_runtime/gpu/gpu_bfc_allocator.cc:47] Overriding orig_value setting because the TF_FORCE_GPU_ALLOW_GROWTH environment variable is set. Original config value was 0.

Copy-and-paste the text below in your GitHub issue and FILL OUT the two last points.

- `transformers` version: 4.28.1
- Platform: Linux-5.10.147+-x86_64-with-glibc2.31
- Python version: 3.10.11
- Huggingface_hub version: 0.14.1
- Safetensors version: not installed
- PyTorch version (GPU?): 2.0.0+cu118 (True)
- Tensorflow version (GPU?): 2.12.0 (True)
- Flax version (CPU?/GPU?/TPU?): 0.6.9 (gpu)
- Jax version: 0.4.8
- JaxLib version: 0.4.7
- Using GPU in script?: <fill in>
- Using distributed or parallel set-up in script?: <fill in>



## Upload model to huggingface

In [21]:
from huggingface_hub import notebook_login

notebook_login()

VBox(children=(HTML(value='<center> <img\nsrc=https://huggingface.co/front/assets/huggingface_logo-noborder.sv…

In [22]:
from huggingface_hub import HfApi

api = HfApi()

In [23]:
# Create your repo first to upload the model
api.create_repo(repo_id="gpt2-sonnet-generators")

RepoUrl('https://huggingface.co/sgul12/gpt2-sonnet-generators', endpoint='https://huggingface.co', repo_type='model', repo_id='sgul12/gpt2-sonnet-generators')

In [None]:
# Upload your model to huggingface. You can clone the repo anytime to use the model.
import os

model_pth = "/content/drive/MyDrive/Colab Notebooks"

files = os.listdir(model_pth)
for fi in files:
    print(os.path.join(model_pth, fi))

    api.upload_file(
        path_or_fileobj=os.path.join(model_pth, fi),
        path_in_repo=fi,
        repo_id="sgul12/gpt2-sonnet-generators",
        repo_type="model",
    )

/content/drive/MyDrive/Colab Notebooks/Current AI_Camp_Updated_CV_Notebook.ipynb
