# Installing and importing the required libraries.
!pip install datasets huggingface_hub transformers scikit-learn
import torch
from datasets import load_dataset
from transformers import AutoTokenizer, AutoModelForSequenceClassification
from torch.optim import AdamW  # use torch's AdamW; the transformers implementation is deprecated
from torch.utils.data import DataLoader
from sklearn.metrics import accuracy_score, f1_score
# Importing the datasets and defining a function for preprocessing.
sst2 = load_dataset("stanfordnlp/sst2")
imdb = load_dataset("stanfordnlp/imdb")
def data_preprocessor_sst2(data, token):
    return data.map(lambda x: token(x['sentence'], truncation=True, padding='max_length', max_length=128), batched=True)

def data_preprocessor_imdb(data, token):
    return data.map(lambda x: token(x['text'], truncation=True, padding='max_length', max_length=128), batched=True)
# Loading the BERT and DistilBERT tokenizers to tokenize the datasets.
token_B = AutoTokenizer.from_pretrained("bert-base-uncased")
token_DB = AutoTokenizer.from_pretrained("distilbert-base-uncased")
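# Note: BERT and DistilBERT share the same WordPiece vocabulary, so a single BERT-tokenized
# copy of each dataset can serve both models. The SST-2 test split is unlabeled, which is
# why the validation split serves as the evaluation set below.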
train_sst2 = data_preprocessor_sst2(sst2['train'], token_B)
test_sst2 = data_preprocessor_sst2(sst2['validation'], token_B)
test_imdb = data_preprocessor_imdb(imdb['test'], token_B)
# Formatting the datasets to fit the model and transformers.
train_sst2.set_format(type='torch', columns=['input_ids', 'attention_mask', 'label'])
test_sst2.set_format(type='torch', columns=['input_ids', 'attention_mask', 'label'])
test_imdb.set_format(type='torch', columns=['input_ids', 'attention_mask', 'label'])
# Creating a data loader for each of the splits.
def data_loader(data, batch=32, shuffle=False):
    return DataLoader(data, batch_size=batch, shuffle=shuffle)

train_DL = data_loader(train_sst2, shuffle=True)  # shuffle only the training split
test_DL = data_loader(test_sst2)
test_DL_imdb = data_loader(test_imdb)
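# Optional sanity check (not essential to the pipeline): confirm one training batch has
# the expected (batch_size, max_length) tensor shapes before committing to a full run.
sample_batch = next(iter(train_DL))
print(sample_batch['input_ids'].shape, sample_batch['attention_mask'].shape, sample_batch['label'].shape)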
# Loading the BERT and DistilBERT models.
model_B = AutoModelForSequenceClassification.from_pretrained("bert-base-uncased", num_labels=2)
model_DB = AutoModelForSequenceClassification.from_pretrained("distilbert-base-uncased", num_labels=2)
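# Optional: compare parameter counts, since DistilBERT's selling point is a model roughly
# 40% smaller than BERT-base while retaining most of its accuracy.
print(f"BERT parameters:       {sum(p.numel() for p in model_B.parameters()):,}")
print(f"DistilBERT parameters: {sum(p.numel() for p in model_DB.parameters()):,}")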
# Building a function to train and optimize the model.
def model_trainer(model, data, epochs=1):
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    model.to(device)
    optimizer = AdamW(model.parameters(), lr=5e-5)
    model.train()
    for epoch in range(epochs):
        loss_total = 0
        for batch in data:
            optimizer.zero_grad()
            input_ids, attention_mask, labels = (batch['input_ids'].to(device), batch['attention_mask'].to(device), batch['label'].to(device))
            forward_info = model(input_ids, attention_mask=attention_mask, labels=labels)
            forward_loss = forward_info.loss
            loss_total += forward_loss.item()
            forward_loss.backward()
            optimizer.step()
        print(f"Epoch {epoch + 1} - Loss = {loss_total / len(data)}")
    return model
# Building a function to evaluate the model.
def model_evaluator(model, data):
    model.eval()
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    model.to(device)
    preds, trues = [], []
    with torch.no_grad():
        for batch in data:
            input_ids, attention_mask, labels = (batch['input_ids'].to(device), batch['attention_mask'].to(device), batch['label'].to(device))
            forward_info = model(input_ids, attention_mask=attention_mask)
            logits = forward_info.logits
            preds.extend(torch.argmax(logits, dim=1).cpu().numpy())
            trues.extend(labels.cpu().numpy())
    score_acc = accuracy_score(trues, preds)
    score_f1 = f1_score(trues, preds)
    return score_acc, score_f1
# Training the BERT and DistilBERT models.
model_B = model_trainer(model_B, train_DL)
model_DB = model_trainer(model_DB, train_DL)
# Evaluating the models on the SST-2 validation set.
bert_acc_sst2, bert_f1_sst2 = model_evaluator(model_B, test_DL)
dist_acc_sst2, dist_f1_sst2 = model_evaluator(model_DB, test_DL)
print(f"BERT SST-2 Accuracy: {bert_acc_sst2}, F1: {bert_f1_sst2}")
print(f"DistilBERT SST-2 Accuracy: {dist_acc_sst2}, F1: {dist_f1_sst2}")
# Evaluating the models on the IMDB test set (both were trained on SST-2, so this measures domain transfer).
bert_acc_imdb, bert_f1_imdb = model_evaluator(model_B, test_DL_imdb)
dist_acc_imdb, dist_f1_imdb = model_evaluator(model_DB, test_DL_imdb)
print(f"BERT IMDB Accuracy: {bert_acc_imdb}, F1: {bert_f1_imdb}")
print(f"DistilBERT IMDB Accuracy: {dist_acc_imdb}, F1: {dist_f1_imdb}")
# Defining a function to classify a single sentence, returning the predicted class index.
def classify(sentence: str, model, token):
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    model.to(device)
    model.eval()
    inputs = token(sentence, return_tensors="pt", truncation=True, padding=True, max_length=128).to(device)
    with torch.no_grad():
        forward_info = model(**inputs)
    pred = forward_info.logits.argmax(dim=1).item()
    return pred
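# classify() returns a raw class index; by the SST-2 label convention, 0 = negative and
# 1 = positive. A small optional mapping for readable output (the example sentence below
# is illustrative, not from the dataset):
label_names = {0: "negative", 1: "positive"}
print(label_names[classify("a genuinely moving film", model_B, token_B)])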
# Testing sentence 1.
sentence1 = "for all its highfalutin title and corkscrew narrative , the movie turns out to be not much more than a shaggy human tale ."
print(f"BERT Prediction: {classify(sentence1, model_B, token_B)}")
print(f"DistilBERT Prediction: {classify(sentence1, model_DB, token_DB)}")
# Testing sentence 2.
sentence2 = "its underlying mythology is a hodgepodge of inconsistencies that pose the question : since when did dumb entertainment have to be this dumb ?"
print(f"BERT Prediction: {classify(sentence2, model_B, token_B)}")
print(f"DistilBERT Prediction: {classify(sentence2, model_DB, token_DB)}")
# Testing sentence 3.
sentence3 = '''
the actors do n’t inhabit their roles– they ’re trapped by them ,
forced to change behavior in bizarre unjustified fashion and spout dialog that consists mostly of platitudes .
'''
print(f"BERT Prediction: {classify(sentence3, model_B, token_B)}")
print(f"DistilBERT Prediction: {classify(sentence3, model_DB, token_DB)}")
# Testing sentence 4.
sentence4 = "an absorbing trip into the minds and motivations of people under stress as well as a keen , unsentimental look at variations on the theme of motherhood ."
print(f"BERT Prediction: {classify(sentence4, model_B, token_B)}")
print(f"DistilBERT Prediction: {classify(sentence4, model_DB, token_DB)}")
# Testing sentence 5.
sentence5 = "one of those rare , exhilarating cinematic delights that gets even better in hindsight , as you mull over its every nuance in your mind ."
print(f"BERT Prediction: {classify(sentence5, model_B, token_B)}")
print(f"DistilBERT Prediction: {classify(sentence5, model_DB, token_DB)}") |