# Importing the required libraries.

!pip install datasets huggingface_hub
import torch
from datasets import load_dataset
from transformers import AutoTokenizer, AutoModelForSequenceClassification
from torch.optim import AdamW  # AdamW was removed from transformers; use torch's implementation
from torch.utils.data import DataLoader
from sklearn.metrics import accuracy_score, f1_score
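
# Optional: seeding torch's RNG makes runs broadly repeatable (CUDA kernels can
# still introduce small nondeterminism); the seed value is an arbitrary choice.
torch.manual_seed(42)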

# Importing the datasets and defining a function for preprocessing.

sst2 = load_dataset("stanfordnlp/sst2")
imdb = load_dataset("stanfordnlp/imdb")

def data_preprocessor(data, tokenizer, text_column):
    # Tokenize the given text column, truncating/padding every example to 128 tokens.
    return data.map(lambda x: tokenizer(x[text_column], truncation=True, padding='max_length', max_length=128), batched=True)
  
# Loading the BERT and DistilBERT tokenizers to tokenize the datasets.

token_B = AutoTokenizer.from_pretrained("bert-base-uncased")
token_DB = AutoTokenizer.from_pretrained("distilbert-base-uncased")

train_sst2 = data_preprocessor(sst2['train'], token_B, 'sentence')
test_sst2 = data_preprocessor(sst2['validation'], token_B, 'sentence')
test_imdb = data_preprocessor(imdb['test'], token_B, 'text')
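
# Two notes on the choices above: SST-2's public test split is unlabeled, so the
# validation split serves as the held-out test set here, and distilbert-base-uncased
# shares bert-base-uncased's vocabulary, so the BERT-tokenized splits can be fed
# to both models.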

# Formatting the datasets so the model-input columns are returned as PyTorch tensors.

train_sst2.set_format(type='torch', columns=['input_ids', 'attention_mask', 'label'])
test_sst2.set_format(type='torch', columns=['input_ids', 'attention_mask', 'label'])
test_imdb.set_format(type='torch', columns=['input_ids', 'attention_mask', 'label'])
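
# Sanity check: each formatted example now exposes fixed-length PyTorch tensors.
assert train_sst2[0]['input_ids'].shape[0] == 128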

# Creating a data loader for each of the splits; only the training data is shuffled.

def data_loader(data, batch=32, shuffle=False):
    return DataLoader(data, batch_size=batch, shuffle=shuffle)

train_DL = data_loader(train_sst2, shuffle=True)
test_DL = data_loader(test_sst2)
test_DL_imdb = data_loader(test_imdb)
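
# Peek at one training batch to confirm the shapes the models expect:
# input_ids is (batch_size, seq_len) and label is (batch_size,).
sample = next(iter(train_DL))
print(sample['input_ids'].shape, sample['label'].shape)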

# Loading the BERT and DistilBERT models, each with a fresh two-class classification head.

model_B = AutoModelForSequenceClassification.from_pretrained("bert-base-uncased", num_labels=2)
model_DB = AutoModelForSequenceClassification.from_pretrained("distilbert-base-uncased", num_labels=2)

# Building a function to train and optimize the model.

def model_trainer(model, data, epochs=1):
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    model.to(device)
    optimizer = AdamW(model.parameters(), lr=5e-5)
    model.train()
    for epoch in range(epochs):
        loss_total = 0
        for batch in data:
            optimizer.zero_grad()
            input_ids = batch['input_ids'].to(device)
            attention_mask = batch['attention_mask'].to(device)
            labels = batch['label'].to(device)
            forward_info = model(input_ids, attention_mask=attention_mask, labels=labels)
            forward_loss = forward_info.loss
            loss_total += forward_loss.item()
            forward_loss.backward()
            optimizer.step()
        print(f"Epoch {epoch + 1} - Loss = {loss_total / len(data)}")
    return model
  
# Building a function to evaluate the model.

def model_evaluator(model, data):
    model.eval()
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    model.to(device)
    preds, trues = [], []
    with torch.no_grad():
        for batch in data:
            input_ids = batch['input_ids'].to(device)
            attention_mask = batch['attention_mask'].to(device)
            labels = batch['label'].to(device)
            forward_info = model(input_ids, attention_mask=attention_mask)
            logits = forward_info.logits
            preds.extend(torch.argmax(logits, dim=1).cpu().numpy())
            trues.extend(labels.cpu().numpy())
    score_acc = accuracy_score(trues, preds)
    score_f1 = f1_score(trues, preds)
    return score_acc, score_f1
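
# sklearn's f1_score defaults to binary F1 on the positive class (label 1),
# which matches this two-label setup.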
  
# Training the BERT and DistilBERT models on the SST-2 training split.

model_B = model_trainer(model_B, train_DL)
model_DB = model_trainer(model_DB, train_DL)
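
# Both models are fine-tuned with identical settings (one epoch, lr 5e-5,
# batch size 32) on the same batches, so the comparison isolates the architectures.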

# Evaluating the models on the SST-2 validation split.

bert_acc_sst2, bert_f1_sst2 = model_evaluator(model_B, test_DL)
dist_acc_sst2, dist_f1_sst2 = model_evaluator(model_DB, test_DL)

print(f"BERT Accuracy: {bert_acc_sst2:.4f}, BERT F1: {bert_f1_sst2:.4f}")
print(f"DistilBERT Accuracy: {dist_acc_sst2:.4f}, DistilBERT F1: {dist_f1_sst2:.4f}")

# Evaluating the models on the IMDB test set.

bert_acc_imdb, bert_f1_imdb = model_evaluator(model_B, test_DL_imdb)
dist_acc_imdb, dist_f1_imdb = model_evaluator(model_DB, test_DL_imdb)

print(f"BERT Accuracy: {bert_acc_imdb:.4f}, BERT F1: {bert_f1_imdb:.4f}")
print(f"DistilBERT Accuracy: {dist_acc_imdb:.4f}, DistilBERT F1: {dist_f1_imdb:.4f}")

# Defining a function to classify a single sentence (0 = negative, 1 = positive).

def classify(sentence: str, model, tokenizer):
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    model.to(device)
    model.eval()
    inputs = tokenizer(sentence, return_tensors="pt", truncation=True, padding=True, max_length=128).to(device)
    with torch.no_grad():
        forward_info = model(**inputs)
    pred = forward_info.logits.argmax(dim=1).item()
    return pred
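
# Optional, hypothetical helper (not used below): map the numeric prediction
# to a readable SST-2 label.
def classify_label(sentence: str, model, tokenizer):
    return {0: "negative", 1: "positive"}[classify(sentence, model, tokenizer)]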
    
# Testing sentence 1.

sentence1 = "for all its highfalutin title and corkscrew narrative , the movie turns out to be not much more than a shaggy human tale ."
print(f"BERT Prediction: {classify(sentence1, model_B, token_B)}")
print(f"DistilBERT Prediction: {classify(sentence1, model_DB, token_DB)}")

# Testing sentence 2.

sentence2 = "its underlying mythology is a hodgepodge of inconsistencies that pose the question : since when did dumb entertainment have to be this dumb ?"
print(f"BERT Prediction: {classify(sentence2, model_B, token_B)}")
print(f"DistilBERT Prediction: {classify(sentence2, model_DB, token_DB)}")

# Testing sentence 3.

sentence3 = ("the actors do n’t inhabit their roles– they ’re trapped by them , "
             "forced to change behavior in bizarre unjustified fashion and spout dialog that consists mostly of platitudes .")
print(f"BERT Prediction: {classify(sentence3, model_B, token_B)}")
print(f"DistilBERT Prediction: {classify(sentence3, model_DB, token_DB)}")

# Testing sentence 4.

sentence4 = "an absorbing trip into the minds and motivations of people under stress as well as a keen , unsentimental look at variations on the theme of motherhood ."
print(f"BERT Prediction: {classify(sentence4, model_B, token_B)}")
print(f"DistilBERT Prediction: {classify(sentence4, model_DB, token_DB)}")

# Testing sentence 5.

sentence5 = "one of those rare , exhilarating cinematic delights that gets even better in hindsight , as you mull over its every nuance in your mind ."
print(f"BERT Prediction: {classify(sentence5, model_B, token_B)}")
print(f"DistilBERT Prediction: {classify(sentence5, model_DB, token_DB)}")