import pandas as pd
import torch
# import numpy as np
# import torch
from transformers import GPT2Tokenizer

# device = 'cuda' if torch.cuda.is_available() else 'cpu'
# train_size = 50000 
# test_size = 2500
# val_size = 2500

tokenizer = GPT2Tokenizer.from_pretrained('gpt2', add_bos_token=True)
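# GPT-2 ships without a pad token, so reuse the EOS token for padding.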
tokenizer.pad_token = tokenizer.eos_token

# df = pd.read_csv('./prepro_data.csv')

# train_df = df[:train_size]
# test_df = df[train_size:train_size + test_size + val_size]

# test_df.to_csv('test_data.csv')
# print('Test df saved...')

test_df = pd.read_csv('./test_data.csv')
test_df = test_df.reset_index(drop=True)
# print(test_df.index)

# Holds the raw text pairs; tokenization is deferred to the collate function.
class TextDataset(torch.utils.data.Dataset):
    def __init__(self, X, y):
        self.X = X
        self.y = y

    def __len__(self):
        return len(self.X)

    # def __getitem__(self, idx):
    #     return self.X[idx]['input_ids'], self.X[idx]['attention_mask'], self.y[idx]['input_ids'], self.y[idx]['attention_mask']

    def __getitem__(self, idx):
        return self.X[idx], self.y[idx]

def collate_fn(batch):
    X = [i[0] for i in batch]
    y = [i[1] for i in batch]

    # Pad each side of the batch to its longest sequence; lengths are measured on the
    # full encoding so the BOS token added by add_bos_token=True is counted and not truncated away.
    maxlen_X = max(len(tokenizer(i)['input_ids']) for i in X)
    maxlen_y = max(len(tokenizer(i)['input_ids']) for i in y)

    # print(f'maxlen_X: {maxlen_X} | maxlen_y: {maxlen_y}')

    inputs = [tokenizer(i, max_length=maxlen_X, padding='max_length', truncation=True, return_tensors='pt', return_attention_mask=True) for i in X]
    targets = [tokenizer(i, max_length=maxlen_y, padding='max_length', truncation=True, return_tensors='pt', return_attention_mask=True) for i in y]

    input_ids, input_mask = [], []
    for i in inputs:
        input_ids.append(i['input_ids'])
        input_mask.append(i['attention_mask'])
    target_ids, target_mask = [], []
    for i in targets:
        target_ids.append(i['input_ids'])
        target_mask.append(i['attention_mask'])

    return (torch.vstack(input_ids), torch.vstack(input_mask), torch.vstack(target_ids), torch.vstack(target_mask))
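
# For reference: collate_fn maps a batch of (X, y) string pairs to four stacked
# tensors with shapes (batch, maxlen_X), (batch, maxlen_X), (batch, maxlen_y), (batch, maxlen_y).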

val_ds = TextDataset(test_df['X'].values, test_df['y'].values)
# batch_size covers the whole test split, so the loader yields a single batch.
valloader = torch.utils.data.DataLoader(val_ds, batch_size=5000, shuffle=False, collate_fn=collate_fn)

# print(test_df.head())

def get_sample(i, device='cpu'):
    # Returns (input_ids, input_mask, target_ids, target_mask) for test sample i,
    # with the inputs batched along dim 0 and the attention mask cast to float32.
    # X,y = test_df['X'][idx], test_df['y'][idx]
    # tok_X = tokenizer(X, return_tensors='pt', return_attention_mask=True)
    # tok_y = tokenizer(y, return_tensors='pt', return_attention_mask=True)
    # return X,y, tok_X, tok_y
    # return X,y
    # Note: this re-collates (and re-tokenizes) the entire test split on every call.
    val_batch = next(iter(valloader))
    return (val_batch[0][i].unsqueeze(dim=0).to(device),
            val_batch[1][i].unsqueeze(dim=0).type(torch.float32).to(device),
            val_batch[2][i].to(device),
            val_batch[3][i].to(device))


# input_ids, input_mask, target_ids, target_mask = get_sample(1)
# print(f'input_ids: {input_ids} \n target_ids: {target_ids}')
# print(type(input_ids))
# print(input_ids)
# print(input_ids.shape, target_ids.shape)
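

# --- Hedged usage sketch (assumptions flagged below) ---
# One way a sample could be fed to a GPT-2 model for generation. The plain 'gpt2'
# checkpoint and GPT2LMHeadModel are assumptions; substitute the fine-tuned model
# this project actually evaluates. Guarded so it only runs when executed directly.
if __name__ == '__main__':
    from transformers import GPT2LMHeadModel

    model = GPT2LMHeadModel.from_pretrained('gpt2').eval()
    input_ids, input_mask, target_ids, target_mask = get_sample(0)
    with torch.no_grad():
        generated = model.generate(
            input_ids,
            attention_mask=input_mask,
            max_new_tokens=50,
            pad_token_id=tokenizer.eos_token_id,
        )
    print('Prompt   :', tokenizer.decode(input_ids[0], skip_special_tokens=True))
    print('Generated:', tokenizer.decode(generated[0][input_ids.shape[1]:], skip_special_tokens=True))
    print('Target   :', tokenizer.decode(target_ids, skip_special_tokens=True))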