import pandas as pd
import torch
from transformers import GPT2Tokenizer

# GPT-2 tokenizer that prepends a BOS token to every encoded sequence.
# GPT-2 has no dedicated padding token, so the EOS token is reused for padding.
tokenizer = GPT2Tokenizer.from_pretrained('gpt2', add_bos_token=True)
tokenizer.pad_token = tokenizer.eos_token

# Held-out evaluation data: source text in column 'X', target text in column 'y'.
test_df = pd.read_csv('./test_data.csv')
test_df = test_df.reset_index(drop=True)


class TextDataset(torch.utils.data.Dataset):
    """Minimal dataset over parallel arrays of source (X) and target (y) strings."""

    def __init__(self, X, y):
        self.X = X
        self.y = y

    def __len__(self):
        return len(self.X)

    def __getitem__(self, idx):
        return self.X[idx], self.y[idx]


def collate_fn(batch):
    """Tokenize a batch of (source, target) string pairs, padding each side to its longest example."""
    X = [pair[0] for pair in batch]
    y = [pair[1] for pair in batch]

    # Pad each side to the length of its longest tokenized example in this batch.
    max_len_x = max(len(tokenizer.tokenize(text)) for text in X)
    max_len_y = max(len(tokenizer.tokenize(text)) for text in y)

    inputs = [tokenizer(text, max_length=max_len_x, padding='max_length', truncation=True,
                        return_tensors='pt', return_attention_mask=True) for text in X]
    targets = [tokenizer(text, max_length=max_len_y, padding='max_length', truncation=True,
                         return_tensors='pt', return_attention_mask=True) for text in y]

    input_ids = [enc['input_ids'] for enc in inputs]
    input_mask = [enc['attention_mask'] for enc in inputs]
    target_ids = [enc['input_ids'] for enc in targets]
    target_mask = [enc['attention_mask'] for enc in targets]

    return (torch.vstack(input_ids), torch.vstack(input_mask),
            torch.vstack(target_ids), torch.vstack(target_mask))
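
# Sketch of what collate_fn produces (illustrative strings, not from test_data.csv):
# each returned tensor has shape (batch_size, padded_length_of_that_side), e.g.
#   ids, mask, tgt_ids, tgt_mask = collate_fn([('hello world', 'target one'),
#                                              ('a longer source line', 'target two')])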


# Evaluation loader over the test set; shuffle=False keeps batch order aligned with test_df,
# so row i of test_df is the i-th item in the first batch.
val_ds = TextDataset(test_df['X'].values, test_df['y'].values)
valloader = torch.utils.data.DataLoader(val_ds, batch_size=5000, shuffle=False, collate_fn=collate_fn)


def get_sample(i, device='cpu'):
    """Return (input_ids, input_mask, target_ids, target_mask) for the i-th validation example."""
    # Note: this re-tokenizes the first batch on every call; caching the batch
    # would be faster for repeated sampling.
    val_batch = next(iter(valloader))
    return (val_batch[0][i].unsqueeze(dim=0).to(device),
            val_batch[1][i].unsqueeze(dim=0).type(torch.float32).to(device),
            val_batch[2][i].to(device),
            val_batch[3][i].to(device))
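

# Example usage (a minimal sketch; the model consuming these tensors lives elsewhere):
if __name__ == '__main__':
    # Fetch the first validation sample on CPU and inspect the tensor shapes.
    input_ids, input_mask, target_ids, target_mask = get_sample(0)
    print(input_ids.shape, input_mask.shape, target_ids.shape, target_mask.shape)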