# Question Answering
The following notebook contains different question answering models. We will start by introducing a representation for the dataset and corresponding DataLoader and then evaluate different models.

In [50]:
from pathlib import Path
from typing import Optional 

import numpy as np
import pandas as pd
import torch
from torch import nn
from torch.optim import AdamW, RMSprop
from tqdm.auto import tqdm
from transformers import DistilBertModel, DistilBertForMaskedLM, DistilBertConfig, \
 DistilBertTokenizerFast, AutoTokenizer, BertModel, BertForMaskedLM, BertTokenizerFast, BertConfig

from qa_model import QuestionDistilBERT, SimpleQuestionDistilBERT, ReuseQuestionDistilBERT, Dataset, test_model
from util import eval_test_set, count_parameters

## Data
Processing the data correctly is partly based on the Huggingface Tutorial (https://huggingface.co/course/chapter7/7?fw=pt)

In [51]:
# Tokenizer for the `distilbert-base-uncased` checkpoint that every model in this notebook builds on.
tokenizer = AutoTokenizer.from_pretrained('distilbert-base-uncased')

In [52]:
 
# Collect, recursively, every .txt file of each training corpus as a plain string path.
squad_paths = list(map(str, Path('data/training_squad/').glob('**/*.txt')))
nat_paths = list(map(str, Path('data/natural_questions_train/').glob('**/*.txt')))
hotpotqa_paths = list(map(str, Path('data/hotpotqa_training/').glob('**/*.txt')))

## POC Model
* Works very well:
 * Dropout 0.1 is too small (overfitting after first epoch) - changed to 0.15
 * Difference between AdamW and RMSprop minimal
 
### Results:
Dropout = 0.15
* Mean EM: 0.5374
* Mean F-1: 0.6826317532406944

Dropout = 0.2 (overfitting relatively similar to the first run, but the dropout seems to be too high)
* Mean EM: 0.5044
* Mean F-1: 0.6437359169276439

In [54]:
# Training data for the POC model: SQuAD + HotpotQA (Natural Questions left out).
dataset = Dataset(
    squad_paths=squad_paths,
    natural_question_paths=None,
    hotpotqa_paths=hotpotqa_paths,
    tokenizer=tokenizer,
)
loader = torch.utils.data.DataLoader(dataset, batch_size=8)

# Held-out data: only the SQuAD test files.
test_dataset = Dataset(
    squad_paths=list(map(str, Path('data/test_squad/').glob('**/*.txt'))),
    natural_question_paths=None,
    hotpotqa_paths=None,
    tokenizer=tokenizer,
)
test_loader = torch.utils.data.DataLoader(test_dataset, batch_size=4)

In [55]:
# Configuration of the pretrained checkpoint.
config = DistilBertConfig.from_pretrained("distilbert-base-uncased")
# Load the masked-LM checkpoint and keep only its DistilBERT encoder as the backbone.
model = DistilBertForMaskedLM.from_pretrained("distilbert-base-uncased")
mod = model.distilbert

In [56]:
# Prefer the GPU when one is available, otherwise fall back to the CPU.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
# Wrap the pretrained backbone in the simple QA model and move it to the chosen device.
model = SimpleQuestionDistilBERT(mod)
model.to(device)

SimpleQuestionDistilBERT(
 (distilbert): DistilBertModel(
 (embeddings): Embeddings(
 (word_embeddings): Embedding(30522, 768, padding_idx=0)
 (position_embeddings): Embedding(512, 768)
 (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
 (dropout): Dropout(p=0.1, inplace=False)
 )
 (transformer): Transformer(
 (layer): ModuleList(
 (0): TransformerBlock(
 (attention): MultiHeadSelfAttention(
 (dropout): Dropout(p=0.1, inplace=False)
 (q_lin): Linear(in_features=768, out_features=768, bias=True)
 (k_lin): Linear(in_features=768, out_features=768, bias=True)
 (v_lin): Linear(in_features=768, out_features=768, bias=True)
 (out_lin): Linear(in_features=768, out_features=768, bias=True)
 )
 (sa_layer_norm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
 (ffn): FFN(
 (dropout): Dropout(p=0.1, inplace=False)
 (lin1): Linear(in_features=768, out_features=3072, bias=True)
 (lin2): Linear(in_features=3072, out_features=768, bias=True)
 (activation): GELUActivation()
 )
 

In [57]:
# Show the per-parameter size table and the total parameter count (see cell output).
count_parameters(model)

+---------------------------------------------------------+------------+
| Modules | Parameters |
+---------------------------------------------------------+------------+
| distilbert.embeddings.word_embeddings.weight | 23440896 |
| distilbert.embeddings.position_embeddings.weight | 393216 |
| distilbert.embeddings.LayerNorm.weight | 768 |
| distilbert.embeddings.LayerNorm.bias | 768 |
| distilbert.transformer.layer.0.attention.q_lin.weight | 589824 |
| distilbert.transformer.layer.0.attention.q_lin.bias | 768 |
| distilbert.transformer.layer.0.attention.k_lin.weight | 589824 |
| distilbert.transformer.layer.0.attention.k_lin.bias | 768 |
| distilbert.transformer.layer.0.attention.v_lin.weight | 589824 |
| distilbert.transformer.layer.0.attention.v_lin.bias | 768 |
| distilbert.transformer.layer.0.attention.out_lin.weight | 589824 |
| distilbert.transformer.layer.0.attention.out_lin.bias | 768 |
| distilbert.transformer.layer.0.sa_layer_norm.weight | 768 |
| distilbert.transformer.laye

66364418

### Testing the model

In [58]:
# Build a deliberately small dataset (first two SQuAD files only) for a quick sanity check.
batch_size = 8
test_ds = Dataset(squad_paths=squad_paths[:2], natural_question_paths=None, hotpotqa_paths=None, tokenizer=tokenizer)
# Fix: wrap the small `test_ds` built above. The original passed the full training
# `dataset`, which left `test_ds` unused and defeated the purpose of this cell.
test_ds_loader = torch.utils.data.DataLoader(test_ds, batch_size=batch_size)
optim = RMSprop(model.parameters(), lr=1e-4)

In [59]:
# Sanity-check model + optimizer on `test_ds_loader`; the cell output shows "Passed" on success.
# The exact checks are defined in qa_model.test_model.
test_model(model, optim, test_ds_loader, device)

Passed


### Model Training

In [60]:
# Prefer the GPU when one is available, otherwise fall back to the CPU.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
# Fresh QA wrapper for the real training run.
# NOTE(review): `mod` is the same backbone object that was passed to the sanity check
# above; if test_model updates weights, this run does not start from the pristine
# checkpoint - confirm.
model = SimpleQuestionDistilBERT(mod)
model.to(device)

SimpleQuestionDistilBERT(
 (distilbert): DistilBertModel(
 (embeddings): Embeddings(
 (word_embeddings): Embedding(30522, 768, padding_idx=0)
 (position_embeddings): Embedding(512, 768)
 (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
 (dropout): Dropout(p=0.1, inplace=False)
 )
 (transformer): Transformer(
 (layer): ModuleList(
 (0): TransformerBlock(
 (attention): MultiHeadSelfAttention(
 (dropout): Dropout(p=0.1, inplace=False)
 (q_lin): Linear(in_features=768, out_features=768, bias=True)
 (k_lin): Linear(in_features=768, out_features=768, bias=True)
 (v_lin): Linear(in_features=768, out_features=768, bias=True)
 (out_lin): Linear(in_features=768, out_features=768, bias=True)
 )
 (sa_layer_norm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
 (ffn): FFN(
 (dropout): Dropout(p=0.1, inplace=False)
 (lin1): Linear(in_features=768, out_features=3072, bias=True)
 (lin2): Linear(in_features=3072, out_features=768, bias=True)
 (activation): GELUActivation()
 )
 

In [61]:
# Put the model in training mode and create the optimizer for the fine-tuning run.
model.train()
optim = RMSprop(model.parameters(), lr=1e-4)

In [None]:
# Fine-tune the model for a fixed number of epochs. After every epoch the mean
# training loss is printed and the model is scored on the held-out test loader.
epochs = 5

for epoch in range(epochs):
    # ---- training pass ----
    loop = tqdm(loader, leave=True)
    model.train()
    mean_training_error = []
    for batch in loop:
        optim.zero_grad()

        input_ids = batch['input_ids'].to(device)
        attention_mask = batch['attention_mask'].to(device)
        start = batch['start_positions'].to(device)
        end = batch['end_positions'].to(device)

        # The model returns the loss itself when gold start/end positions are supplied.
        outputs = model(input_ids, attention_mask=attention_mask, start_positions=start, end_positions=end)
        loss = outputs['loss']
        loss.backward()
        # Gradient clipping was tried here and left disabled:
        # torch.nn.utils.clip_grad_norm_(model.parameters(), max_norm=1)
        optim.step()

        loss_value = loss.item()  # read the scalar once per batch
        mean_training_error.append(loss_value)
        loop.set_description(f'Epoch {epoch}')
        loop.set_postfix(loss=loss_value)
    print("Mean Training Error", np.mean(mean_training_error))

    # ---- evaluation pass ----
    loop = tqdm(test_loader, leave=True)
    model.eval()
    mean_test_error = []
    # No gradients are needed for evaluation; disabling autograd avoids building
    # graphs and saves memory without changing the loss values.
    with torch.no_grad():
        for batch in loop:
            input_ids = batch['input_ids'].to(device)
            attention_mask = batch['attention_mask'].to(device)
            start = batch['start_positions'].to(device)
            end = batch['end_positions'].to(device)

            outputs = model(input_ids, attention_mask=attention_mask, start_positions=start, end_positions=end)
            loss_value = outputs['loss'].item()

            mean_test_error.append(loss_value)
            loop.set_description(f'Epoch {epoch} Testset')
            loop.set_postfix(loss=loss_value)
    print("Mean Test Error", np.mean(mean_test_error))

In [19]:
# Persist only the weights (state_dict) of the fine-tuned model.
torch.save(model.state_dict(), "simple_distilbert_qa.model")

In [20]:
# Rebuild the QA wrapper and restore the saved weights.
# The wrapper is moved to `device` first so all of its layers live where the evaluation
# batches will be, and `map_location` lets a checkpoint written on a GPU be restored on
# a CPU-only machine as well.
model = SimpleQuestionDistilBERT(mod).to(device)
model.load_state_dict(torch.load("simple_distilbert_qa.model", map_location=device))



In [18]:
# Score the model on the SQuAD test loader; the output reports mean exact match (EM) and mean F1.
eval_test_set(model, tokenizer, test_loader, device)

100%|██████████| 2500/2500 [02:09<00:00, 19.37it/s]

Mean EM: 0.5374
Mean F-1: 0.6826317532406944





## Freeze baseline and train new head
This was my initial idea, to freeze the layers and add a completely new head, which we train from scratch. I tried a lot of different configurations, but nothing really worked, I usually stayed at a CrossEntropyLoss of about 3 the whole time. Below, you can see the different heads I have tried.

Furthermore, I experimented with different data, because I thought there might not be enough data overall. I would conclude that this didn't work because (1) Transformers are very data-hungry and I probably still used too little data (one epoch took about 1h though, so it wasn't possible to use even more), and (2) the new layers are trained from scratch, which means they contain absolutely no prior structure about the problem and task. I do not think that this way of training leads to better results or lower energy use overall, because it would be too resource-intensive.

The following setup is partly based on the HuggingFace implementation of the question answering model (https://github.com/huggingface/transformers/blob/v4.23.1/src/transformers/models/distilbert/modeling_distilbert.py#L805)

In [62]:
# Load the pretrained masked-LM DistilBERT checkpoint.
model = DistilBertForMaskedLM.from_pretrained("distilbert-base-uncased")

In [63]:
# Configuration of the same pretrained checkpoint.
config = DistilBertConfig.from_pretrained("distilbert-base-uncased")

In [64]:
# Keep only the DistilBERT encoder; the masked-LM head of the checkpoint is not needed for QA.
mod = model.distilbert

In [65]:
# Prefer the GPU when one is available, otherwise fall back to the CPU.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
# QA model that places a new head on top of the pretrained backbone.
model = QuestionDistilBERT(mod)
model.to(device)

QuestionDistilBERT(
 (distilbert): DistilBertModel(
 (embeddings): Embeddings(
 (word_embeddings): Embedding(30522, 768, padding_idx=0)
 (position_embeddings): Embedding(512, 768)
 (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
 (dropout): Dropout(p=0.1, inplace=False)
 )
 (transformer): Transformer(
 (layer): ModuleList(
 (0): TransformerBlock(
 (attention): MultiHeadSelfAttention(
 (dropout): Dropout(p=0.1, inplace=False)
 (q_lin): Linear(in_features=768, out_features=768, bias=True)
 (k_lin): Linear(in_features=768, out_features=768, bias=True)
 (v_lin): Linear(in_features=768, out_features=768, bias=True)
 (out_lin): Linear(in_features=768, out_features=768, bias=True)
 )
 (sa_layer_norm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
 (ffn): FFN(
 (dropout): Dropout(p=0.1, inplace=False)
 (lin1): Linear(in_features=768, out_features=3072, bias=True)
 (lin2): Linear(in_features=3072, out_features=768, bias=True)
 (activation): GELUActivation()
 )
 (outpu

In [66]:
# Show the parameter table and total count (see cell output).
# NOTE(review): only `te.*` entries are listed, so this presumably counts trainable
# parameters only - confirm in util.count_parameters.
count_parameters(model)

+---------------------------------------+------------+
| Modules | Parameters |
+---------------------------------------+------------+
| te.layers.0.self_attn.in_proj_weight | 1769472 |
| te.layers.0.self_attn.in_proj_bias | 2304 |
| te.layers.0.self_attn.out_proj.weight | 589824 |
| te.layers.0.self_attn.out_proj.bias | 768 |
| te.layers.0.linear1.weight | 1572864 |
| te.layers.0.linear1.bias | 2048 |
| te.layers.0.linear2.weight | 1572864 |
| te.layers.0.linear2.bias | 768 |
| te.layers.0.norm1.weight | 768 |
| te.layers.0.norm1.bias | 768 |
| te.layers.0.norm2.weight | 768 |
| te.layers.0.norm2.bias | 768 |
| te.layers.1.self_attn.in_proj_weight | 1769472 |
| te.layers.1.self_attn.in_proj_bias | 2304 |
| te.layers.1.self_attn.out_proj.weight | 589824 |
| te.layers.1.self_attn.out_proj.bias | 768 |
| te.layers.1.linear1.weight | 1572864 |
| te.layers.1.linear1.bias | 2048 |
| te.layers.1.linear2.weight | 1572864 |
| te.layers.1.linear2.bias | 768 |
| te.layers.1.norm1.weight | 768 |


17108290

### Testing the model
This is the same procedure as in `distilbert.ipynb`. 

In [67]:
# Build a deliberately small dataset (first two SQuAD files only) for a quick sanity check.
batch_size = 8
test_ds = Dataset(squad_paths=squad_paths[:2], natural_question_paths=None, hotpotqa_paths=None, tokenizer=tokenizer)
# Fix: wrap the small `test_ds` built above. The original passed the full training
# `dataset`, which left `test_ds` unused and defeated the purpose of this cell.
test_ds_loader = torch.utils.data.DataLoader(test_ds, batch_size=batch_size)
optim = torch.optim.Adam(model.parameters())

In [68]:
# Sanity-check model + optimizer on `test_ds_loader`; the cell output shows "Passed" on success.
test_model(model, optim, test_ds_loader, device)

Passed


### Training the model
* Parameter Tuning:
 * Learning Rate: I experimented with several values, 1e-4 seemed to work best for me. 1e-3 was very unstable and 1e-5 was too small.
 * Gradient Clipping: I experimented with this, but the difference was only minimal

Data:
* I first used only the SQuAD dataset, but generalisation is a problem
 * The dataset is relatively small and we often have entries with the same context but different questions
 * I believe, the diversity is not big enough to train a fully functional model
* Hence, I included the Natural Questions dataset too
 * It is however a lot more messy - I elaborated a bit more on this in `load_data.ipynb`
* Also the hotpotqa data was used

Tested with: 
* 3 Linear Layers
 * Training Error high - needed more layers
 * Already expected - this was mostly a Proof of Concept
* 1 TransformerEncoder with 4 attention heads + 1 Linear Layer:
 * Training Error was high, still too simple
* 1 TransformerEncoder with 8 heads + 1 Linear Layer:
 * Training Error gets lower, however stagnates at some point
 * Probably still too simple, it doesn't generalise either
* 2 TransformerEncoder with 8 and 4 heads + 1 Linear Layer:
 * Loss gets down but doesn't go further after some time


In [None]:
# Training data for the frozen-backbone experiment: SQuAD + Natural Questions + HotpotQA.
dataset = Dataset(
    squad_paths=squad_paths,
    natural_question_paths=nat_paths,
    hotpotqa_paths=hotpotqa_paths,
    tokenizer=tokenizer,
)
loader = torch.utils.data.DataLoader(dataset, batch_size=8)

# Held-out data: only the SQuAD test files.
test_dataset = Dataset(
    squad_paths=list(map(str, Path('data/test_squad/').glob('**/*.txt'))),
    natural_question_paths=None,
    hotpotqa_paths=None,
    tokenizer=tokenizer,
)
test_loader = torch.utils.data.DataLoader(test_dataset, batch_size=4)

In [26]:
# Fresh model for the real training run. It has to live on `device` because the
# training loop moves every batch there; the original cell omitted the move that the
# other model-creation cells perform.
model = QuestionDistilBERT(mod).to(device)

In [41]:
# NOTE(review): AdamW and RMSprop are already imported in the first cell; this re-import is redundant.
from torch.optim import AdamW, RMSprop

model.train()
# RMSprop with lr=1e-4, the learning rate the notes above report as working best.
optim = RMSprop(model.parameters(), lr=1e-4)

In [42]:
# TensorBoard writer used by the training loop below to log the per-epoch losses.
from torch.utils.tensorboard import SummaryWriter
writer = SummaryWriter()

In [None]:
# Train the new head for a fixed number of epochs. After every epoch the mean training
# and test losses are printed and logged to TensorBoard under "Loss/train" / "Loss/test".
epochs = 20

for epoch in range(epochs):
    # ---- training pass ----
    loop = tqdm(loader, leave=True)
    model.train()
    mean_training_error = []
    for batch in loop:
        optim.zero_grad()

        input_ids = batch['input_ids'].to(device)
        attention_mask = batch['attention_mask'].to(device)
        start = batch['start_positions'].to(device)
        end = batch['end_positions'].to(device)

        # The model returns the loss itself when gold start/end positions are supplied.
        outputs = model(input_ids, attention_mask=attention_mask, start_positions=start, end_positions=end)
        loss = outputs['loss']
        loss.backward()
        optim.step()

        loss_value = loss.item()  # read the scalar once per batch
        mean_training_error.append(loss_value)
        loop.set_description(f'Epoch {epoch}')
        loop.set_postfix(loss=loss_value)
    train_mean = np.mean(mean_training_error)  # computed once, used for print and log
    print("Mean Training Error", train_mean)
    writer.add_scalar("Loss/train", train_mean, epoch)

    # ---- evaluation pass ----
    loop = tqdm(test_loader, leave=True)
    model.eval()
    mean_test_error = []
    # No gradients are needed for evaluation; disabling autograd avoids building
    # graphs and saves memory without changing the loss values.
    with torch.no_grad():
        for batch in loop:
            input_ids = batch['input_ids'].to(device)
            attention_mask = batch['attention_mask'].to(device)
            start = batch['start_positions'].to(device)
            end = batch['end_positions'].to(device)

            outputs = model(input_ids, attention_mask=attention_mask, start_positions=start, end_positions=end)
            loss_value = outputs['loss'].item()

            mean_test_error.append(loss_value)
            loop.set_description(f'Epoch {epoch} Testset')
            loop.set_postfix(loss=loss_value)
    test_mean = np.mean(mean_test_error)
    print("Mean Test Error", test_mean)
    writer.add_scalar("Loss/test", test_mean, epoch)

In [238]:
# Close the TensorBoard writer now that training has finished.
writer.close()

In [33]:
# Persist only the weights (state_dict) of the trained model.
torch.save(model.state_dict(), "distilbert_qa.model")

In [34]:
# Rebuild the QA wrapper and restore the saved weights.
# The wrapper is moved to `device` first so all of its layers live where the evaluation
# batches will be, and `map_location` lets a checkpoint written on a GPU be restored on
# a CPU-only machine as well.
model = QuestionDistilBERT(mod).to(device)
model.load_state_dict(torch.load("distilbert_qa.model", map_location=device))



In [35]:
# Score the model on the SQuAD test loader; the output reports mean exact match (EM) and mean F1.
eval_test_set(model, tokenizer, test_loader, device)

100%|██████████| 2500/2500 [02:57<00:00, 14.09it/s]

Mean EM: 0.0479
Mean F-1: 0.08989175857485086





## Reuse Layer
This was inspired by how well the original model with just one classification head worked. I felt like the main problem with the previous model was the lack of structure which was already in the layers, combined with the massive amount of resources needed for a Transformer.

Hence, I tried cloning the last (and then the last two) layers of the DistilBERT model, putting a classifier on top and using this as the head. The base DistilBERT model is completely frozen. This worked extremely well, even though we only fine-tune about 21% of the parameters we trained before (14 million as opposed to 66 million!). Below you can see the results.

### Last DistilBERT layer

Dropout 0.1 and RMSprop 1e-4:
* Mean EM: 0.3888
* Mean F-1: 0.5122932744694068

Dropout 0.25: very early stagnating
* Mean EM: 0.3552
* Mean F-1: 0.4711235721312687

Dropout 0.15: seems to work well - training and test error stagnate around 1.7 and 1.8 but good generalisation (need to add more layers)
* Mean EM: 0.4119
* Mean F-1: 0.5296387232893214

### Last DistilBERT layer + more Dense layers
Dropout 0.15 + 4 dense layers ((768-512)-(512-256)-(256-128)-(128-2)) & ReLU: doesn't work too well - stagnates at around 2.4

### Last two DistilBERT layers
Dropout 0.1 but last 2 DistilBERT layers: works very well, but early overfitting - maybe use more data
* Mean EM: 0.458
* Mean F-1: 0.6003368353673634

Dropout 0.1 - last 2 distilbert layers: all data
* Mean EM: 0.484
* Mean F-1: 0.6344960035215299

Dropout 0.15 - **BEST**
* Mean EM: 0.5178
* Mean F-1: 0.6671140689626448

Dropout 0.2 - doesn't work too well
* Mean EM: 0.4353
* Mean F-1: 0.5776847879304647


In [69]:
# Training data for the layer-reuse model: SQuAD + HotpotQA (Natural Questions left out).
dataset = Dataset(
    squad_paths=squad_paths,
    natural_question_paths=None,
    hotpotqa_paths=hotpotqa_paths,
    tokenizer=tokenizer,
)
loader = torch.utils.data.DataLoader(dataset, batch_size=8)

# Held-out data: only the SQuAD test files.
test_dataset = Dataset(
    squad_paths=list(map(str, Path('data/test_squad/').glob('**/*.txt'))),
    natural_question_paths=None,
    hotpotqa_paths=None,
    tokenizer=tokenizer,
)
test_loader = torch.utils.data.DataLoader(test_dataset, batch_size=4)

In [70]:
# Configuration of the pretrained checkpoint.
config = DistilBertConfig.from_pretrained("distilbert-base-uncased")
# Reload the masked-LM checkpoint and keep only its DistilBERT encoder as the backbone.
model = DistilBertForMaskedLM.from_pretrained("distilbert-base-uncased")
mod = model.distilbert

# Prefer the GPU when one is available, otherwise fall back to the CPU.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
# QA model whose head reuses DistilBERT layers (see the `te` ModuleList in the output).
model = ReuseQuestionDistilBERT(mod)
model.to(device)

ReuseQuestionDistilBERT(
 (te): ModuleList(
 (0): TransformerBlock(
 (attention): MultiHeadSelfAttention(
 (dropout): Dropout(p=0.1, inplace=False)
 (q_lin): Linear(in_features=768, out_features=768, bias=True)
 (k_lin): Linear(in_features=768, out_features=768, bias=True)
 (v_lin): Linear(in_features=768, out_features=768, bias=True)
 (out_lin): Linear(in_features=768, out_features=768, bias=True)
 )
 (sa_layer_norm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
 (ffn): FFN(
 (dropout): Dropout(p=0.1, inplace=False)
 (lin1): Linear(in_features=768, out_features=3072, bias=True)
 (lin2): Linear(in_features=3072, out_features=768, bias=True)
 (activation): GELUActivation()
 )
 (output_layer_norm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
 )
 (1): TransformerBlock(
 (attention): MultiHeadSelfAttention(
 (dropout): Dropout(p=0.1, inplace=False)
 (q_lin): Linear(in_features=768, out_features=768, bias=True)
 (k_lin): Linear(in_features=768, out_features=768, bias=True

In [71]:
# Show the parameter table and total count (see cell output).
# NOTE(review): only `te.*` entries are listed, so this presumably counts trainable
# parameters only - confirm in util.count_parameters.
count_parameters(model)

+-------------------------------+------------+
| Modules | Parameters |
+-------------------------------+------------+
| te.0.attention.q_lin.weight | 589824 |
| te.0.attention.q_lin.bias | 768 |
| te.0.attention.k_lin.weight | 589824 |
| te.0.attention.k_lin.bias | 768 |
| te.0.attention.v_lin.weight | 589824 |
| te.0.attention.v_lin.bias | 768 |
| te.0.attention.out_lin.weight | 589824 |
| te.0.attention.out_lin.bias | 768 |
| te.0.sa_layer_norm.weight | 768 |
| te.0.sa_layer_norm.bias | 768 |
| te.0.ffn.lin1.weight | 2359296 |
| te.0.ffn.lin1.bias | 3072 |
| te.0.ffn.lin2.weight | 2359296 |
| te.0.ffn.lin2.bias | 768 |
| te.0.output_layer_norm.weight | 768 |
| te.0.output_layer_norm.bias | 768 |
| te.1.attention.q_lin.weight | 589824 |
| te.1.attention.q_lin.bias | 768 |
| te.1.attention.k_lin.weight | 589824 |
| te.1.attention.k_lin.bias | 768 |
| te.1.attention.v_lin.weight | 589824 |
| te.1.attention.v_lin.bias | 768 |
| te.1.attention.out_lin.weight | 589824 |
| te.1.attention.o

14177282

### Testing the Model

In [72]:
# Build a deliberately small dataset (first two SQuAD files only) for a quick sanity check.
batch_size = 8
test_ds = Dataset(squad_paths=squad_paths[:2], natural_question_paths=None, hotpotqa_paths=None, tokenizer=tokenizer)
# Fix: wrap the small `test_ds` built above. The original passed the full training
# `dataset`, which left `test_ds` unused and defeated the purpose of this cell.
test_ds_loader = torch.utils.data.DataLoader(test_ds, batch_size=batch_size)
optim = torch.optim.Adam(model.parameters())

In [73]:
# Sanity-check model + optimizer on `test_ds_loader`; the cell output shows "Passed" on success.
test_model(model, optim, test_ds_loader, device)

Passed


### Model Training

In [24]:
# NOTE(review): AdamW and RMSprop are already imported in the first cell; this re-import is redundant.
from torch.optim import AdamW, RMSprop

# Put the model in training mode and create the AdamW optimizer (lr=1e-4) for this run.
model.train()
optim = AdamW(model.parameters(), lr=1e-4)

In [None]:
# Train the layer-reuse model for a fixed number of epochs. After every epoch the mean
# training and test losses are printed and a checkpoint named after the epoch is saved.
epochs = 16

for epoch in range(epochs):
    # ---- training pass ----
    loop = tqdm(loader, leave=True)
    model.train()
    mean_training_error = []
    for batch in loop:
        optim.zero_grad()

        input_ids = batch['input_ids'].to(device)
        attention_mask = batch['attention_mask'].to(device)
        start = batch['start_positions'].to(device)
        end = batch['end_positions'].to(device)

        # The model returns the loss itself when gold start/end positions are supplied.
        outputs = model(input_ids, attention_mask=attention_mask, start_positions=start, end_positions=end)
        loss = outputs['loss']
        loss.backward()
        # Gradient clipping was tried here and left disabled:
        # torch.nn.utils.clip_grad_norm_(model.parameters(), max_norm=1)
        optim.step()

        loss_value = loss.item()  # read the scalar once per batch
        mean_training_error.append(loss_value)
        loop.set_description(f'Epoch {epoch}')
        loop.set_postfix(loss=loss_value)
    print("Mean Training Error", np.mean(mean_training_error))

    # ---- evaluation pass ----
    loop = tqdm(test_loader, leave=True)
    model.eval()
    mean_test_error = []
    # No gradients are needed for evaluation; disabling autograd avoids building
    # graphs and saves memory without changing the loss values.
    with torch.no_grad():
        for batch in loop:
            input_ids = batch['input_ids'].to(device)
            attention_mask = batch['attention_mask'].to(device)
            start = batch['start_positions'].to(device)
            end = batch['end_positions'].to(device)

            outputs = model(input_ids, attention_mask=attention_mask, start_positions=start, end_positions=end)
            loss_value = outputs['loss'].item()

            mean_test_error.append(loss_value)
            loop.set_description(f'Epoch {epoch} Testset')
            loop.set_postfix(loss=loss_value)
    print("Mean Test Error", np.mean(mean_test_error))

    # One checkpoint per epoch, so the best epoch can be restored later.
    torch.save(model.state_dict(), "distilbert_reuse_{}".format(epoch))

In [48]:
# Persist only the weights (state_dict) of the trained model.
torch.save(model.state_dict(), "distilbert_reuse.model")

In [49]:
# Rebuild the layer-reuse model and restore the saved weights.
# The wrapper is moved to `device` first so all of its layers live where the evaluation
# batches will be, and `map_location` lets a checkpoint written on a GPU be restored on
# a CPU-only machine as well.
m = ReuseQuestionDistilBERT(mod).to(device)
m.load_state_dict(torch.load("distilbert_reuse.model", map_location=device))
model = m

In [47]:
# Score the model on the SQuAD test loader; the output reports mean exact match (EM) and mean F1.
eval_test_set(model, tokenizer, test_loader, device)

100%|██████████| 2500/2500 [02:51<00:00, 14.59it/s]

Mean EM: 0.5178
Mean F-1: 0.6671140689626448



