removed head

Files changed:
- app.py +15 -22
- src/attention.py +0 -24
- src/bert.py +0 -20
- src/classifier_model.py +1 -25
- src/dataset.py +0 -229
- src/pretrainer.py +10 -427
- src/seq_model.py +1 -37
- src/transformer.py +0 -9
- src/vocab.py +0 -10
app.py
CHANGED
@@ -101,24 +101,22 @@ import shutil
 import matplotlib.pyplot as plt
 from sklearn.metrics import roc_curve, auc
 # Define the function to process the input file and model selection
-
+
 def process_file(file,label,info, model_name):
-
-def process_file(file,label, model_name):
->>>>>>> bffd3381ccb717f802fe651d4111ec0a268e3896
+
     with open(file.name, 'r') as f:
         content = f.read()
     saved_test_dataset = "train.txt"
     saved_test_label = "train_label.txt"
-
+
     saved_train_info="train_info.txt"
-
-
+
+
 
     # Save the uploaded file content to a specified location
     shutil.copyfile(file.name, saved_test_dataset)
     shutil.copyfile(label.name, saved_test_label)
-
+
     shutil.copyfile(info.name, saved_train_info)
     # For demonstration purposes, we'll just return the content with the selected model name
     # if(model_name=="highGRschool10"):

@@ -142,7 +140,7 @@ def process_file(file,label, model_name):
         "-e",str(1),
         "-b",str(5)
         ], shell=True)
-
+
     # For demonstration purposes, we'll just return the content with the selected model name
     if(model_name=="FS"):
         checkpoint="ratio_proportion_change3/output/FS/bert_fine_tuned.model.ep32"

@@ -159,7 +157,7 @@ def process_file(file,label, model_name):
     subprocess.run(["python", "src/test_saved_model.py",
                     "--finetuned_bert_checkpoint",checkpoint
                     ])
-
+
     result = {}
     with open("result.txt", 'r') as file:
         for line in file:

@@ -194,11 +192,9 @@ def process_file(file,label, model_name):
     return text_output,plot_path
 
 # List of models for the dropdown menu
-
+
 models = ["highGRschool10", "lowGRschoolAll", "fullTest"]
-
-models = ["FS", "IS", "CORRECTNESS","EFFECTIVENESS"]
->>>>>>> bffd3381ccb717f802fe651d4111ec0a268e3896
+
 
 # Create the Gradio interface
 with gr.Blocks(css="""

@@ -388,25 +384,22 @@ tbody.svelte-18wv37q>tr.svelte-18wv37q:nth-child(odd) {
     with gr.Row():
         file_input = gr.File(label="Upload a test file", file_types=['.txt'], elem_classes="file-box")
         label_input = gr.File(label="Upload test labels", file_types=['.txt'], elem_classes="file-box")
-
+
         info_input = gr.File(label="Upload test info", file_types=['.txt'], elem_classes="file-box")
 
     model_dropdown = gr.Dropdown(choices=models, label="Select Finetune Task", elem_classes="dropdown-menu")
-
+
 
-
->>>>>>> bffd3381ccb717f802fe651d4111ec0a268e3896
+
 
     with gr.Row():
         output_text = gr.Textbox(label="Output Text")
         output_image = gr.Image(label="Output Plot")
 
     btn = gr.Button("Submit")
-
+
     btn.click(fn=process_file, inputs=[file_input,label_input,info_input, model_dropdown], outputs=[output_text,output_image])
-
-    btn.click(fn=process_file, inputs=[file_input,label_input, model_dropdown], outputs=[output_text,output_image])
-
+
 
 # Launch the app
 demo.launch()
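For reference, the following is a minimal, runnable sketch of the interface that the kept (HEAD) side of app.py wires up: one process_file(file, label, info, model_name) callback fed by three file uploads and a task dropdown, returning text plus a plot. Component names mirror the diff; the callback body is a stub, not the Space's real inference code, and the CSS/elem_classes styling is omitted.

import gradio as gr

def process_file(file, label, info, model_name):
    # The real app copies the uploads to train.txt / train_label.txt / train_info.txt
    # and shells out to src/test_saved_model.py; here we only echo the selection.
    return f"Selected task: {model_name}", None

models = ["highGRschool10", "lowGRschoolAll", "fullTest"]

with gr.Blocks() as demo:
    with gr.Row():
        file_input = gr.File(label="Upload a test file", file_types=[".txt"])
        label_input = gr.File(label="Upload test labels", file_types=[".txt"])
        info_input = gr.File(label="Upload test info", file_types=[".txt"])
    model_dropdown = gr.Dropdown(choices=models, label="Select Finetune Task")
    with gr.Row():
        output_text = gr.Textbox(label="Output Text")
        output_image = gr.Image(label="Output Plot")
    btn = gr.Button("Submit")
    # Four inputs map onto the four parameters of process_file, in order.
    btn.click(fn=process_file,
              inputs=[file_input, label_input, info_input, model_dropdown],
              outputs=[output_text, output_image])

if __name__ == "__main__":
    demo.launch()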
src/attention.py
CHANGED
@@ -3,19 +3,11 @@ import torch.nn.functional as F
 import torch
 
 import math
-<<<<<<< HEAD
 import pickle
 
 class Attention(nn.Module):
     """
     Compute Scaled Dot Product Attention
-=======
-
-
-class Attention(nn.Module):
-    """
-    Compute 'Scaled Dot Product Attention
->>>>>>> bffd3381ccb717f802fe651d4111ec0a268e3896
     """
 
     def __init__(self):

@@ -53,10 +45,6 @@ class MultiHeadedAttention(nn.Module):
         self.linear_layers = nn.ModuleList([nn.Linear(d_model, d_model) for _ in range(3)])
         self.output_linear = nn.Linear(d_model, d_model)
         self.attention = Attention()
-<<<<<<< HEAD
-=======
-
->>>>>>> bffd3381ccb717f802fe651d4111ec0a268e3896
         self.dropout = nn.Dropout(p=dropout)
 
     def forward(self, query, key, value, mask=None):

@@ -70,21 +58,9 @@ class MultiHeadedAttention(nn.Module):
         query, key, value = [l(x).view(nbatches, -1, self.h, self.d_k).transpose(1, 2)
                              for l, x in zip(self.linear_layers, (query, key, value))]
         # 2) Apply attention on all the projected vectors in batch.
-<<<<<<< HEAD
         x, p_attn = self.attention(query, key, value, mask=mask, dropout=self.dropout)
 
         # 3) "Concat" using a view and apply a final linear.
         x = x.transpose(1, 2).contiguous().view(nbatches, -1, self.h * self.d_k)
 
         return self.output_linear(x), p_attn
-=======
-        x, attn = self.attention(query, key, value, mask=mask, dropout=self.dropout)
-        # torch.Size([64, 8, 100, 100])
-        # print("Attention", attn.shape)
-
-        # 3) "Concat" using a view and apply a final linear.
-        x = x.transpose(1, 2).contiguous().view(nbatches, -1, self.h * self.d_k)
-
-        return self.output_linear(x)
-
->>>>>>> bffd3381ccb717f802fe651d4111ec0a268e3896
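As context for the kept HEAD side of attention.py, the sketch below shows a standard scaled dot-product attention that returns both the attended values and the attention weights (the p_attn that MultiHeadedAttention.forward now propagates). Shapes and the dropout hook are assumptions based on the surrounding code, not a copy of the repo's Attention.forward.

import math
import torch
import torch.nn.functional as F

def scaled_dot_product_attention(query, key, value, mask=None, dropout=None):
    # query/key/value: (batch, heads, seq_len, d_k)
    scores = torch.matmul(query, key.transpose(-2, -1)) / math.sqrt(query.size(-1))
    if mask is not None:
        # mask out padded positions before the softmax
        scores = scores.masked_fill(mask == 0, -1e9)
    p_attn = F.softmax(scores, dim=-1)
    if dropout is not None:
        p_attn = dropout(p_attn)
    # return both the attended values and the attention weights, as the HEAD side does
    return torch.matmul(p_attn, value), p_attn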
src/bert.py
CHANGED
@@ -1,14 +1,8 @@
 import torch.nn as nn
-<<<<<<< HEAD
 import torch
 
 from .transformer import TransformerBlock
 from .embedding import BERTEmbedding
-=======
-
-from transformer import TransformerBlock
-from embedding import BERTEmbedding
->>>>>>> bffd3381ccb717f802fe651d4111ec0a268e3896
 
 class BERT(nn.Module):
     """

@@ -38,15 +32,11 @@ class BERT(nn.Module):
         # multi-layers transformer blocks, deep network
         self.transformer_blocks = nn.ModuleList(
             [TransformerBlock(hidden, attn_heads, hidden * 4, dropout) for _ in range(n_layers)])
-<<<<<<< HEAD
         # self.attention_values = []
-=======
->>>>>>> bffd3381ccb717f802fe651d4111ec0a268e3896
 
     def forward(self, x, segment_info):
         # attention masking for padded token
         # torch.ByteTensor([batch_size, 1, seq_len, seq_len)
-<<<<<<< HEAD
 
         device = x.device
 

@@ -68,15 +58,5 @@ class BERT(nn.Module):
         for transformer in self.transformer_blocks:
             x = transformer.forward(x, mask)
             # self.attention_values.append(transformer.p_attn)
-=======
-        mask = (x > 0).unsqueeze(1).repeat(1, x.size(1), 1).unsqueeze(1)
-        # print("bert mask: ", mask)
-        # embedding the indexed sequence to sequence of vectors
-        x = self.embedding(x, segment_info)
-
-        # running over multiple transformer blocks
-        for transformer in self.transformer_blocks:
-            x = transformer.forward(x, mask)
->>>>>>> bffd3381ccb717f802fe651d4111ec0a268e3896
 
         return x
src/classifier_model.py
CHANGED
@@ -1,28 +1,17 @@
-<<<<<<< HEAD
 import torch
 import torch.nn as nn
 
 from .bert import BERT
-=======
-import torch.nn as nn
-
-from bert import BERT
->>>>>>> bffd3381ccb717f802fe651d4111ec0a268e3896
 
 
 class BERTForClassification(nn.Module):
     """
-<<<<<<< HEAD
     Fine-tune Task Classifier Model
-=======
-    Progress Classifier Model
->>>>>>> bffd3381ccb717f802fe651d4111ec0a268e3896
     """
 
     def __init__(self, bert: BERT, vocab_size, n_labels):
         """
         :param bert: BERT model which should be trained
-<<<<<<< HEAD
         :param vocab_size: total vocab size
         :param n_labels: number of labels for the task
         """

@@ -59,17 +48,4 @@ class BERTForClassificationWithFeats(nn.Module):
         # x = self.linear1(x)
         # x = self.RELU(x)
         # return self.linear2(x)
-        return self.linear(x)
-=======
-        :param vocab_size: total vocab size for masked_lm
-        """
-
-        super().__init__()
-        self.bert = bert
-        self.linear = nn.Linear(self.bert.hidden, n_labels)
-        # self.softmax = nn.LogSoftmax(dim=-1)
-
-    def forward(self, x, segment_label):
-        x = self.bert(x, segment_label)
-        return x, self.linear(x[:, 0])
->>>>>>> bffd3381ccb717f802fe651d4111ec0a268e3896
+        return self.linear(x)
src/dataset.py
CHANGED
@@ -4,28 +4,17 @@ import pandas as pd
 import numpy as np
 import tqdm
 import random
-<<<<<<< HEAD
 from .vocab import Vocab
 import pickle
 import copy
 # from sklearn.preprocessing import OneHotEncoder
-=======
-from vocab import Vocab
-import pickle
-import copy
-from sklearn.preprocessing import OneHotEncoder
->>>>>>> bffd3381ccb717f802fe651d4111ec0a268e3896
 
 class PretrainerDataset(Dataset):
     """
     Class name: PretrainDataset
 
     """
-<<<<<<< HEAD
     def __init__(self, dataset_path, vocab, seq_len=30, max_mask=0.15):
-=======
-    def __init__(self, dataset_path, vocab, seq_len=30, select_next_seq= False):
->>>>>>> bffd3381ccb717f802fe651d4111ec0a268e3896
         self.dataset_path = dataset_path
         self.vocab = vocab # Vocab object
 

@@ -46,7 +35,6 @@ class PretrainerDataset(Dataset):
                     self.index_documents[i] = []
                 else:
                     self.index_documents[i].append(index)
-<<<<<<< HEAD
                     self.lines.append(line.split("\t"))
                     len_line = len(line.split("\t"))
                     seq_len_list.append(len_line)

@@ -61,22 +49,6 @@ class PretrainerDataset(Dataset):
         print("Sequence length set at: ", self.seq_len)
         self.max_mask = max_mask
         print("% of input tokens selected for masking : ",self.max_mask)
-=======
-                    self.lines.append(line.split())
-                    len_line = len(line.split())
-                    seq_len_list.append(len_line)
-                index+=1
-        reader.close()
-        print("Sequence Stats: ", len(seq_len_list), min(seq_len_list), max(seq_len_list), sum(seq_len_list)/len(seq_len_list))
-        print("Unique Sequences: ", len({tuple(ll) for ll in self.lines}))
-        self.index_documents = {k:v for k,v in self.index_documents.items() if v}
-        self.seq_len = seq_len
-        self.max_mask_per_seq = 0.15
-        self.select_next_seq = select_next_seq
-        print("Sequence length set at ", self.seq_len)
-        print("select_next_seq: ", self.select_next_seq)
-        print(len(self.index_documents))
->>>>>>> bffd3381ccb717f802fe651d4111ec0a268e3896
 
 
     def __len__(self):

@@ -84,7 +56,6 @@ class PretrainerDataset(Dataset):
 
     def __getitem__(self, item):
         token_a = self.lines[item]
-<<<<<<< HEAD
         # sa_masked = None
         # sa_masked_label = None
         # token_b = None

@@ -130,44 +101,6 @@ class PretrainerDataset(Dataset):
 
         # print(item, len(s1), len(s1_label), len(segment_label))
         # print(f"{item}.")
-=======
-        token_b = None
-        is_same_student = None
-        sa_masked = None
-        sa_masked_label = None
-        sb_masked = None
-        sb_masked_label = None
-
-        if self.select_next_seq:
-            is_same_student, token_b = self.get_token_b(item)
-            is_same_student = 1 if is_same_student else 0
-            token_a1, token_b1 = self.truncate_to_max_seq(token_a, token_b)
-            sa_masked, sa_masked_label = self.random_mask_seq(token_a1)
-            sb_masked, sb_masked_label = self.random_mask_seq(token_b1)
-        else:
-            token_a = token_a[:self.seq_len-2]
-            sa_masked, sa_masked_label = self.random_mask_seq(token_a)
-
-        s1 = ([self.vocab.vocab['[CLS]']] + sa_masked + [self.vocab.vocab['[SEP]']])
-        s1_label = ([self.vocab.vocab['[PAD]']] + sa_masked_label + [self.vocab.vocab['[PAD]']])
-        segment_label = [1 for _ in range(len(s1))]
-
-        if self.select_next_seq:
-            s1 = s1 + sb_masked + [self.vocab.vocab['[SEP]']]
-            s1_label = s1_label + sb_masked_label + [self.vocab.vocab['[PAD]']]
-            segment_label = segment_label + [2 for _ in range(len(sb_masked)+1)]
-
-        padding = [self.vocab.vocab['[PAD]'] for _ in range(self.seq_len - len(s1))]
-        s1.extend(padding), s1_label.extend(padding), segment_label.extend(padding)
-
-        output = {'bert_input': s1,
-                  'bert_label': s1_label,
-                  'segment_label': segment_label}
-
-        if self.select_next_seq:
-            output['is_same_student'] = is_same_student
-        # print(item, len(s1), len(s1_label), len(segment_label))
->>>>>>> bffd3381ccb717f802fe651d4111ec0a268e3896
         return {key: torch.tensor(value) for key, value in output.items()}
 
     def random_mask_seq(self, tokens):

@@ -176,7 +109,6 @@ class PretrainerDataset(Dataset):
         Output: masked token seq, output label
         """
 
-<<<<<<< HEAD
         masked_pos = []
         output_labels = []
         output_tokens = copy.deepcopy(tokens)

@@ -197,22 +129,11 @@ class PretrainerDataset(Dataset):
             # else:
             prob = random.random()
             if prob < self.max_mask:
-=======
-        # masked_pos_label = {}
-        output_labels = []
-        output_tokens = copy.deepcopy(tokens)
-
-        # while(len(label_tokens) < self.max_mask_per_seq*len(tokens)):
-        for i, token in enumerate(tokens):
-            prob = random.random()
-            if prob < 0.15:
->>>>>>> bffd3381ccb717f802fe651d4111ec0a268e3896
                 # chooses 15% of token positions at random
                 # prob /= 0.15
                 prob = random.random()
                 if prob < 0.8: #[MASK] token 80% of the time
                     output_tokens[i] = self.vocab.vocab['[MASK]']
-<<<<<<< HEAD
                     masked_pos.append(1)
                 elif prob < 0.9: # a random token 10% of the time
                     # print(".......0.8-0.9......")

@@ -226,14 +147,6 @@ class PretrainerDataset(Dataset):
                     # print(".......unchanged......")
                     output_tokens[i] = self.vocab.vocab.get(token, self.vocab.vocab['[UNK]'])
                     masked_pos.append(0)
-=======
-                elif prob < 0.9: # a random token 10% of the time
-                    # print(".......0.8-0.9......")
-                    output_tokens[i] = random.randint(1, len(self.vocab.vocab)-1)
-                else: # the unchanged i-th token 10% of the time
-                    # print(".......unchanged......")
-                    output_tokens[i] = self.vocab.vocab.get(token, self.vocab.vocab['[UNK]'])
->>>>>>> bffd3381ccb717f802fe651d4111ec0a268e3896
                 # True Label
                 output_labels.append(self.vocab.vocab.get(token, self.vocab.vocab['[UNK]']))
                 # masked_pos_label[i] = self.vocab.vocab.get(token, self.vocab.vocab['[UNK]'])

@@ -242,16 +155,12 @@ class PretrainerDataset(Dataset):
                 output_tokens[i] = self.vocab.vocab.get(token, self.vocab.vocab['[UNK]'])
                 # Padded label
                 output_labels.append(self.vocab.vocab['[PAD]'])
-<<<<<<< HEAD
                 masked_pos.append(0)
-=======
->>>>>>> bffd3381ccb717f802fe651d4111ec0a268e3896
         # label_position = []
         # label_tokens = []
         # for k, v in masked_pos_label.items():
         #     label_position.append(k)
         #     label_tokens.append(v)
-<<<<<<< HEAD
         return output_tokens, output_labels, masked_pos
 
     # def get_token_b(self, item):

@@ -288,43 +197,6 @@ class PretrainerDataset(Dataset):
     #         sb.pop()
     #     return sa, sb
 
-=======
-        return output_tokens, output_labels
-
-    def get_token_b(self, item):
-        document_id = [k for k,v in self.index_documents.items() if item in v][0]
-        random_document_id = document_id
-
-        if random.random() < 0.5:
-            document_ids = [k for k in self.index_documents.keys() if k != document_id]
-            random_document_id = random.choice(document_ids)
-
-        same_student = (random_document_id == document_id)
-
-        nex_seq_list = self.index_documents.get(random_document_id)
-
-        if same_student:
-            if len(nex_seq_list) != 1:
-                nex_seq_list = [v for v in nex_seq_list if v !=item]
-
-        next_seq = random.choice(nex_seq_list)
-        tokens = self.lines[next_seq]
-        # print(f"item = {item}, tokens: {tokens}")
-        # print(f"item={item}, next={next_seq}, same_student = {same_student}, {document_id} == {random_document_id}, b. {tokens}")
-        return same_student, tokens
-
-    def truncate_to_max_seq(self, s1, s2):
-        sa = copy.deepcopy(s1)
-        sb = copy.deepcopy(s1)
-        total_allowed_seq = self.seq_len - 3
-
-        while((len(sa)+len(sb)) > total_allowed_seq):
-            if random.random() < 0.5:
-                sa.pop()
-            else:
-                sb.pop()
-        return sa, sb
->>>>>>> bffd3381ccb717f802fe651d4111ec0a268e3896
 
 class TokenizerDataset(Dataset):
     """

@@ -332,24 +204,15 @@ class TokenizerDataset(Dataset):
     Tokenize the data in the dataset
 
     """
-<<<<<<< HEAD
     def __init__(self, dataset_path, label_path, vocab, seq_len=30):
         self.dataset_path = dataset_path
         self.label_path = label_path
         self.vocab = vocab # Vocab object
         # self.encoder = OneHotEncoder(sparse=False)
-=======
-    def __init__(self, dataset_path, label_path, vocab, seq_len=30, train=True):
-        self.dataset_path = dataset_path
-        self.label_path = label_path
-        self.vocab = vocab # Vocab object
-        self.encoder = OneHotEncoder(sparse_output=False)
->>>>>>> bffd3381ccb717f802fe651d4111ec0a268e3896
 
         # Related to input dataset file
         self.lines = []
         self.labels = []
-<<<<<<< HEAD
         self.feats = []
         if self.label_path:
             self.label_file = open(self.label_path, "r")

@@ -414,97 +277,21 @@ class TokenizerDataset(Dataset):
             # self.labels = self.encoder.transform(np.array(self.labels).reshape(-1,1))
 
         self.file = open(self.dataset_path, "r")
-=======
-        self.labels = []
-
-        self.label_file = open(self.label_path, "r")
-        for line in self.label_file:
-            if line:
-                line = line.strip()
-                if not line:
-                    continue
-                self.labels.append(float(line))
-        self.label_file.close()
-        labeler = np.unique(self.labels)
-        self.encoder.fit(labeler.reshape(-1,1))
-        self.labels = self.encoder.transform(np.array(self.labels).reshape(-1,1))
-        # print(f"labels: {self.labels}")
-
-        # info_file_name = self.dataset_path.split('.')
-        # info_file_name = info_file_name[0]+"_info."+info_file_name[1]
-        # progress = []
-        # with open(info_file_name, "r") as f:
-        #     for line in f:
-        #         if line:
-        #             line = line.strip()
-        #             if not line:
-        #                 continue
-        #             line = line.split(",")[0]
-        #             pstat = 1 if line == "GRADUATED" else 0
-        #             progress.append(pstat)
-        # f.close()
-
-        # indices_of_grad = np.where(np.array(progress) == 1)[0]
-        # indices_of_prom = np.where(np.array(progress) == 0)[0]
-
-        # indices_of_zeros = np.where(np.array(labels) == 0)[0]
-        # indices_of_ones = np.where(np.array(labels) == 1)[0]
-
-        # number_of_items = min(len(indices_of_zeros), len(indices_of_ones))
-        # # number_of_items = min(len(indices_of_grad), len(indices_of_prom))
-        # print(number_of_items)
-
-        # indices_of_zeros = indices_of_zeros[:number_of_items]
-        # indices_of_ones = indices_of_ones[:number_of_items]
-        # print(indices_of_zeros)
-        # print(indices_of_ones)
-
-        # indices_of_grad = indices_of_grad[:number_of_items]
-        # indices_of_prom = indices_of_prom[:number_of_items]
-        # print(indices_of_grad)
-        # print(indices_of_prom)
-
-        self.file = open(self.dataset_path, "r")
-        # index = 0
->>>>>>> bffd3381ccb717f802fe651d4111ec0a268e3896
         for line in self.file:
             if line:
                 line = line.strip()
                 if line:
                     self.lines.append(line)
-<<<<<<< HEAD
-=======
-                    # if train:
-                    #     if index in indices_of_zeros:
-                    #     # if index in indices_of_prom:
-                    #         self.lines.append(line)
-                    #         self.labels.append(0)
-                    #     if index in indices_of_ones:
-                    #     # if index in indices_of_grad:
-                    #         self.lines.append(line)
-                    #         self.labels.append(1)
-                    # else:
-                    #     self.lines.append(line)
-                    #     self.labels.append(labels[index])
-                    #     self.labels.append(progress[index])
-                    # index += 1
->>>>>>> bffd3381ccb717f802fe651d4111ec0a268e3896
         self.file.close()
 
         self.len = len(self.lines)
         self.seq_len = seq_len
-<<<<<<< HEAD
         print("Sequence length set at ", self.seq_len, len(self.lines), len(self.labels) if self.label_path else 0)
-=======
-
-        print("Sequence length set at ", self.seq_len, len(self.lines), len(self.labels))
->>>>>>> bffd3381ccb717f802fe651d4111ec0a268e3896
 
     def __len__(self):
         return self.len
 
     def __getitem__(self, item):
-<<<<<<< HEAD
         org_line = self.lines[item].split("\t")
         dup_line = []
         opt = False

@@ -527,23 +314,10 @@ class TokenizerDataset(Dataset):
         output = {'input': s1,
                   'label': s1_label,
                   'feat': s1_feat,
-=======
-
-        s1 = self.vocab.to_seq(self.lines[item], self.seq_len) # This is like tokenizer and adds [CLS] and [SEP].
-        s1_label = self.labels[item]
-        segment_label = [1 for _ in range(len(s1))]
-
-        padding = [self.vocab.vocab['[PAD]'] for _ in range(self.seq_len - len(s1))]
-        s1.extend(padding), segment_label.extend(padding)
-
-        output = {'bert_input': s1,
-                  'progress_status': s1_label,
->>>>>>> bffd3381ccb717f802fe651d4111ec0a268e3896
                   'segment_label': segment_label}
         return {key: torch.tensor(value) for key, value in output.items()}
 
 
-<<<<<<< HEAD
 class TokenizerDatasetForCalibration(Dataset):
     """
     Class name: TokenizerDataset

@@ -661,9 +435,6 @@ class TokenizerDatasetForCalibration(Dataset):
 
 
 # if __name__ == "__main__":
-=======
-# if __name__ == "__main__":
->>>>>>> bffd3381ccb717f802fe651d4111ec0a268e3896
 # # import pickle
 # # k = pickle.load(open("dataset/CL4999_1920/unique_steps_list.pkl","rb"))
 # # print(k)
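The masking logic kept in PretrainerDataset.random_mask_seq follows the standard BERT-style rule: each token is selected with probability max_mask (0.15 here); a selected token becomes [MASK] 80% of the time, a random vocab id 10% of the time, and stays unchanged 10% of the time, with the true token id kept as the MLM label. The sketch below is a self-contained illustration of that rule under assumed inputs (a plain dict standing in for the repo's Vocab class); it is not the repo's exact method.

import random

def random_mask_seq(tokens, vocab, max_mask=0.15):
    output_tokens, output_labels, masked_pos = [], [], []
    for token in tokens:
        token_id = vocab.get(token, vocab["[UNK]"])
        if random.random() < max_mask:
            prob = random.random()
            if prob < 0.8:                       # replace with [MASK]
                output_tokens.append(vocab["[MASK]"])
            elif prob < 0.9:                     # replace with a random token id
                output_tokens.append(random.randint(1, len(vocab) - 1))
            else:                                # keep the original token
                output_tokens.append(token_id)
            output_labels.append(token_id)       # true id is the MLM target
            masked_pos.append(1 if prob < 0.8 else 0)
        else:
            output_tokens.append(token_id)
            output_labels.append(vocab["[PAD]"])  # PAD label is ignored by the loss
            masked_pos.append(0)
    return output_tokens, output_labels, masked_pos

# Example (hypothetical vocab):
# random_mask_seq(["s1", "s2"], {"[PAD]": 0, "[UNK]": 1, "[MASK]": 2, "s1": 3, "s2": 4})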
src/pretrainer.py
CHANGED
|
@@ -1,6 +1,5 @@
|
|
| 1 |
import torch
|
| 2 |
import torch.nn as nn
|
| 3 |
-
<<<<<<< HEAD
|
| 4 |
# from torch.nn import functional as F
|
| 5 |
from torch.optim import Adam
|
| 6 |
from torch.utils.data import DataLoader
|
|
@@ -36,75 +35,6 @@ class BERTTrainer:
|
|
| 36 |
train_dataloader: DataLoader, val_dataloader: DataLoader = None, test_dataloader: DataLoader = None,
|
| 37 |
lr: float = 1e-4, betas=(0.9, 0.999), weight_decay: float = 0.01, warmup_steps=5000,
|
| 38 |
with_cuda: bool = True, cuda_devices=None, log_freq: int = 10, log_folder_path: str = None):
|
| 39 |
-
=======
|
| 40 |
-
from torch.nn import functional as F
|
| 41 |
-
from torch.optim import Adam, SGD
|
| 42 |
-
from torch.utils.data import DataLoader
|
| 43 |
-
import pickle
|
| 44 |
-
|
| 45 |
-
from bert import BERT
|
| 46 |
-
from seq_model import BERTSM
|
| 47 |
-
from classifier_model import BERTForClassification
|
| 48 |
-
from optim_schedule import ScheduledOptim
|
| 49 |
-
|
| 50 |
-
import tqdm
|
| 51 |
-
import sys
|
| 52 |
-
|
| 53 |
-
import numpy as np
|
| 54 |
-
import visualization
|
| 55 |
-
|
| 56 |
-
from sklearn.metrics import precision_score, recall_score, f1_score
|
| 57 |
-
|
| 58 |
-
class ECE(nn.Module):
|
| 59 |
-
|
| 60 |
-
def __init__(self, n_bins=15):
|
| 61 |
-
"""
|
| 62 |
-
n_bins (int): number of confidence interval bins
|
| 63 |
-
"""
|
| 64 |
-
super(ECE, self).__init__()
|
| 65 |
-
bin_boundaries = torch.linspace(0, 1, n_bins + 1)
|
| 66 |
-
self.bin_lowers = bin_boundaries[:-1]
|
| 67 |
-
self.bin_uppers = bin_boundaries[1:]
|
| 68 |
-
|
| 69 |
-
def forward(self, logits, labels):
|
| 70 |
-
softmaxes = F.softmax(logits, dim=1)
|
| 71 |
-
confidences, predictions = torch.max(softmaxes, 1)
|
| 72 |
-
labels = torch.argmax(labels,1)
|
| 73 |
-
accuracies = predictions.eq(labels)
|
| 74 |
-
|
| 75 |
-
ece = torch.zeros(1, device=logits.device)
|
| 76 |
-
for bin_lower, bin_upper in zip(self.bin_lowers, self.bin_uppers):
|
| 77 |
-
# Calculated |confidence - accuracy| in each bin
|
| 78 |
-
in_bin = confidences.gt(bin_lower.item()) * confidences.le(bin_upper.item())
|
| 79 |
-
prop_in_bin = in_bin.float().mean()
|
| 80 |
-
if prop_in_bin.item() > 0:
|
| 81 |
-
accuracy_in_bin = accuracies[in_bin].float().mean()
|
| 82 |
-
avg_confidence_in_bin = confidences[in_bin].mean()
|
| 83 |
-
ece += torch.abs(avg_confidence_in_bin - accuracy_in_bin) * prop_in_bin
|
| 84 |
-
|
| 85 |
-
return ece
|
| 86 |
-
|
| 87 |
-
def accurate_nb(preds, labels):
|
| 88 |
-
pred_flat = np.argmax(preds, axis=1).flatten()
|
| 89 |
-
labels_flat = np.argmax(labels, axis=1).flatten()
|
| 90 |
-
labels_flat = labels.flatten()
|
| 91 |
-
return np.sum(pred_flat == labels_flat)
|
| 92 |
-
|
| 93 |
-
class BERTTrainer:
|
| 94 |
-
"""
|
| 95 |
-
# Sequence..
|
| 96 |
-
|
| 97 |
-
BERTTrainer make the pretrained BERT model with two LM training method.
|
| 98 |
-
|
| 99 |
-
1. Masked Language Model : 3.3.1 Task #1: Masked LM
|
| 100 |
-
"""
|
| 101 |
-
|
| 102 |
-
def __init__(self, bert: BERT, vocab_size: int,
|
| 103 |
-
train_dataloader: DataLoader, test_dataloader: DataLoader = None,
|
| 104 |
-
lr: float = 1e-4, betas=(0.9, 0.999), weight_decay: float = 0.01, warmup_steps=10000,
|
| 105 |
-
with_cuda: bool = True, cuda_devices=None, log_freq: int = 10, same_student_prediction = False,
|
| 106 |
-
workspace_name=None):
|
| 107 |
-
>>>>>>> bffd3381ccb717f802fe651d4111ec0a268e3896
|
| 108 |
"""
|
| 109 |
:param bert: BERT model which you want to train
|
| 110 |
:param vocab_size: total word vocab size
|
|
@@ -117,7 +47,6 @@ class BERTTrainer:
|
|
| 117 |
:param log_freq: logging frequency of the batch iteration
|
| 118 |
"""
|
| 119 |
|
| 120 |
-
<<<<<<< HEAD
|
| 121 |
cuda_condition = torch.cuda.is_available() and with_cuda
|
| 122 |
self.device = torch.device("cuda:0" if cuda_condition else "cpu")
|
| 123 |
print(cuda_condition, " Device used = ", self.device)
|
|
@@ -127,33 +56,16 @@ class BERTTrainer:
|
|
| 127 |
# This BERT model will be saved
|
| 128 |
self.bert = bert.to(self.device)
|
| 129 |
# Initialize the BERT Sequence Model, with BERT model
|
| 130 |
-
=======
|
| 131 |
-
# Setup cuda device for BERT training, argument -c, --cuda should be true
|
| 132 |
-
cuda_condition = torch.cuda.is_available() and with_cuda
|
| 133 |
-
self.device = torch.device("cuda:0" if cuda_condition else "cpu")
|
| 134 |
-
print("Device used = ", self.device)
|
| 135 |
-
|
| 136 |
-
# This BERT model will be saved every epoch
|
| 137 |
-
self.bert = bert
|
| 138 |
-
# Initialize the BERT Language Model, with BERT model
|
| 139 |
-
>>>>>>> bffd3381ccb717f802fe651d4111ec0a268e3896
|
| 140 |
self.model = BERTSM(bert, vocab_size).to(self.device)
|
| 141 |
|
| 142 |
# Distributed GPU training if CUDA can detect more than 1 GPU
|
| 143 |
if with_cuda and torch.cuda.device_count() > 1:
|
| 144 |
print("Using %d GPUS for BERT" % torch.cuda.device_count())
|
| 145 |
-
<<<<<<< HEAD
|
| 146 |
self.model = nn.DataParallel(self.model, device_ids=available_gpus)
|
| 147 |
|
| 148 |
# Setting the train, validation and test data loader
|
| 149 |
self.train_data = train_dataloader
|
| 150 |
self.val_data = val_dataloader
|
| 151 |
-
=======
|
| 152 |
-
self.model = nn.DataParallel(self.model, device_ids=cuda_devices)
|
| 153 |
-
|
| 154 |
-
# Setting the train and test data loader
|
| 155 |
-
self.train_data = train_dataloader
|
| 156 |
-
>>>>>>> bffd3381ccb717f802fe651d4111ec0a268e3896
|
| 157 |
self.test_data = test_dataloader
|
| 158 |
|
| 159 |
# Setting the Adam optimizer with hyper-param
|
|
@@ -164,7 +76,6 @@ class BERTTrainer:
|
|
| 164 |
self.criterion = nn.NLLLoss(ignore_index=0)
|
| 165 |
|
| 166 |
self.log_freq = log_freq
|
| 167 |
-
<<<<<<< HEAD
|
| 168 |
self.log_folder_path = log_folder_path
|
| 169 |
# self.workspace_name = workspace_name
|
| 170 |
self.save_model = False
|
|
@@ -175,18 +86,11 @@ class BERTTrainer:
|
|
| 175 |
f.close()
|
| 176 |
self.start_time = time.time()
|
| 177 |
|
| 178 |
-
=======
|
| 179 |
-
self.same_student_prediction = same_student_prediction
|
| 180 |
-
self.workspace_name = workspace_name
|
| 181 |
-
self.save_model = False
|
| 182 |
-
self.avg_loss = 10000
|
| 183 |
-
>>>>>>> bffd3381ccb717f802fe651d4111ec0a268e3896
|
| 184 |
print("Total Parameters:", sum([p.nelement() for p in self.model.parameters()]))
|
| 185 |
|
| 186 |
def train(self, epoch):
|
| 187 |
self.iteration(epoch, self.train_data)
|
| 188 |
|
| 189 |
-
<<<<<<< HEAD
|
| 190 |
def val(self, epoch):
|
| 191 |
if epoch == 0:
|
| 192 |
self.avg_loss = 10000
|
|
@@ -196,12 +100,6 @@ class BERTTrainer:
|
|
| 196 |
self.iteration(epoch, self.test_data, phase="test")
|
| 197 |
|
| 198 |
def iteration(self, epoch, data_loader, phase="train"):
|
| 199 |
-
=======
|
| 200 |
-
def test(self, epoch):
|
| 201 |
-
self.iteration(epoch, self.test_data, train=False)
|
| 202 |
-
|
| 203 |
-
def iteration(self, epoch, data_loader, train=True):
|
| 204 |
-
>>>>>>> bffd3381ccb717f802fe651d4111ec0a268e3896
|
| 205 |
"""
|
| 206 |
loop over the data_loader for training or testing
|
| 207 |
if on train status, backward operation is activated
|
|
@@ -212,7 +110,6 @@ class BERTTrainer:
|
|
| 212 |
:param train: boolean value of is train or test
|
| 213 |
:return: None
|
| 214 |
"""
|
| 215 |
-
<<<<<<< HEAD
|
| 216 |
|
| 217 |
# self.log_file = f"{self.workspace_name}/logs/{self.code}/log_{phase}_pretrained.txt"
|
| 218 |
# bert_hidden_representations = [] can be used
|
|
@@ -235,39 +132,10 @@ class BERTTrainer:
|
|
| 235 |
else:
|
| 236 |
self.model.eval()
|
| 237 |
with open(self.log_folder_path+f"/log_{phase}_pretrained.txt", 'a') as f:
|
| 238 |
-
=======
|
| 239 |
-
str_code = "train" if train else "test"
|
| 240 |
-
code = "masked_prediction" if self.same_student_prediction else "masked"
|
| 241 |
-
|
| 242 |
-
self.log_file = f"{self.workspace_name}/logs/{code}/log_{str_code}_pretrained.txt"
|
| 243 |
-
bert_hidden_representations = []
|
| 244 |
-
if epoch == 0:
|
| 245 |
-
f = open(self.log_file, 'w')
|
| 246 |
-
f.close()
|
| 247 |
-
if not train:
|
| 248 |
-
self.avg_loss = 10000
|
| 249 |
-
# Setting the tqdm progress bar
|
| 250 |
-
data_iter = tqdm.tqdm(enumerate(data_loader),
|
| 251 |
-
desc="EP_%s:%d" % (str_code, epoch),
|
| 252 |
-
total=len(data_loader),
|
| 253 |
-
bar_format="{l_bar}{r_bar}")
|
| 254 |
-
|
| 255 |
-
avg_loss_mask = 0.0
|
| 256 |
-
total_correct_mask = 0
|
| 257 |
-
total_element_mask = 0
|
| 258 |
-
|
| 259 |
-
avg_loss_pred = 0.0
|
| 260 |
-
total_correct_pred = 0
|
| 261 |
-
total_element_pred = 0
|
| 262 |
-
|
| 263 |
-
avg_loss = 0.0
|
| 264 |
-
with open(self.log_file, 'a') as f:
|
| 265 |
-
>>>>>>> bffd3381ccb717f802fe651d4111ec0a268e3896
|
| 266 |
sys.stdout = f
|
| 267 |
for i, data in data_iter:
|
| 268 |
# 0. batch_data will be sent into the device(GPU or cpu)
|
| 269 |
data = {key: value.to(self.device) for key, value in data.items()}
|
| 270 |
-
<<<<<<< HEAD
|
| 271 |
|
| 272 |
# 1. forward masked_sm model
|
| 273 |
# mask_sm_output is log-probabilities output
|
|
@@ -280,38 +148,10 @@ class BERTTrainer:
|
|
| 280 |
|
| 281 |
# 3. backward and optimization only in train
|
| 282 |
if phase == "train":
|
| 283 |
-
=======
|
| 284 |
-
|
| 285 |
-
# 1. forward the next_sentence_prediction and masked_lm model
|
| 286 |
-
# next_sent_output, mask_lm_output = self.model.forward(data["bert_input"], data["segment_label"])
|
| 287 |
-
if self.same_student_prediction:
|
| 288 |
-
bert_hidden_rep, mask_lm_output, same_student_output = self.model.forward(data["bert_input"], data["segment_label"], self.same_student_prediction)
|
| 289 |
-
else:
|
| 290 |
-
bert_hidden_rep, mask_lm_output = self.model.forward(data["bert_input"], data["segment_label"], self.same_student_prediction)
|
| 291 |
-
|
| 292 |
-
embeddings = [h for h in bert_hidden_rep.cpu().detach().numpy()]
|
| 293 |
-
bert_hidden_representations.extend(embeddings)
|
| 294 |
-
|
| 295 |
-
|
| 296 |
-
# 2-2. NLLLoss of predicting masked token word
|
| 297 |
-
mask_loss = self.criterion(mask_lm_output.transpose(1, 2), data["bert_label"])
|
| 298 |
-
|
| 299 |
-
# 2-3. Adding next_loss and mask_loss : 3.4 Pre-training Procedure
|
| 300 |
-
if self.same_student_prediction:
|
| 301 |
-
# 2-1. NLL(negative log likelihood) loss of is_next classification result
|
| 302 |
-
same_student_loss = self.criterion(same_student_output, data["is_same_student"])
|
| 303 |
-
loss = same_student_loss + mask_loss
|
| 304 |
-
else:
|
| 305 |
-
loss = mask_loss
|
| 306 |
-
|
| 307 |
-
# 3. backward and optimization only in train
|
| 308 |
-
if train:
|
| 309 |
-
>>>>>>> bffd3381ccb717f802fe651d4111ec0a268e3896
|
| 310 |
self.optim_schedule.zero_grad()
|
| 311 |
loss.backward()
|
| 312 |
self.optim_schedule.step_and_update_lr()
|
| 313 |
|
| 314 |
-
<<<<<<< HEAD
|
| 315 |
# tokens with highest log-probabilities creates a predicted sequence
|
| 316 |
pred_tokens = torch.argmax(mask_sm_output, dim=-1)
|
| 317 |
mask_correct = (data["bert_label"] == pred_tokens) & data["masked_pos"]
|
|
@@ -348,69 +188,6 @@ class BERTTrainer:
|
|
| 348 |
if self.avg_loss > (avg_loss / len(data_iter)):
|
| 349 |
self.save_model = True
|
| 350 |
self.avg_loss = (avg_loss / len(data_iter))
|
| 351 |
-
=======
|
| 352 |
-
|
| 353 |
-
non_zero_mask = (data["bert_label"] != 0).float()
|
| 354 |
-
predictions = torch.argmax(mask_lm_output, dim=-1)
|
| 355 |
-
predicted_masked = predictions*non_zero_mask
|
| 356 |
-
mask_correct = ((data["bert_label"] == predicted_masked)*non_zero_mask).sum().item()
|
| 357 |
-
|
| 358 |
-
avg_loss_mask += loss.item()
|
| 359 |
-
total_correct_mask += mask_correct
|
| 360 |
-
total_element_mask += non_zero_mask.sum().item()
|
| 361 |
-
|
| 362 |
-
post_fix = {
|
| 363 |
-
"epoch": epoch,
|
| 364 |
-
"iter": i,
|
| 365 |
-
"avg_loss": avg_loss_mask / (i + 1),
|
| 366 |
-
"avg_acc_mask": total_correct_mask / total_element_mask * 100,
|
| 367 |
-
"loss": loss.item()
|
| 368 |
-
}
|
| 369 |
-
|
| 370 |
-
# next sentence prediction accuracy
|
| 371 |
-
if self.same_student_prediction:
|
| 372 |
-
correct = same_student_output.argmax(dim=-1).eq(data["is_same_student"]).sum().item()
|
| 373 |
-
avg_loss_pred += loss.item()
|
| 374 |
-
total_correct_pred += correct
|
| 375 |
-
total_element_pred += data["is_same_student"].nelement()
|
| 376 |
-
# correct = next_sent_output.argmax(dim=-1).eq(data["is_next"]).sum().item()
|
| 377 |
-
post_fix["avg_loss"] = avg_loss_pred / (i + 1)
|
| 378 |
-
post_fix["avg_acc_pred"] = total_correct_pred / total_element_pred * 100
|
| 379 |
-
post_fix["loss"] = loss.item()
|
| 380 |
-
|
| 381 |
-
avg_loss +=loss.item()
|
| 382 |
-
|
| 383 |
-
if i % self.log_freq == 0:
|
| 384 |
-
data_iter.write(str(post_fix))
|
| 385 |
-
# if not train and epoch > 20 :
|
| 386 |
-
# pickle.dump(mask_lm_output.cpu().detach().numpy(), open(f"logs/mask/mask_out_e{epoch}_{i}.pkl","wb"))
|
| 387 |
-
# pickle.dump(data["bert_label"].cpu().detach().numpy(), open(f"logs/mask/label_e{epoch}_{i}.pkl","wb"))
|
| 388 |
-
|
| 389 |
-
final_msg = {
|
| 390 |
-
"epoch": f"EP{epoch}_{str_code}",
|
| 391 |
-
"avg_loss": avg_loss / len(data_iter),
|
| 392 |
-
"total_masked_acc": total_correct_mask * 100.0 / total_element_mask
|
| 393 |
-
}
|
| 394 |
-
if self.same_student_prediction:
|
| 395 |
-
final_msg["total_prediction_acc"] = total_correct_pred * 100.0 / total_element_pred
|
| 396 |
-
|
| 397 |
-
print(final_msg)
|
| 398 |
-
# print("EP%d_%s, avg_loss=" % (epoch, str_code), avg_loss / len(data_iter), "total_masked_acc=", total_correct_mask * 100.0 / total_element_mask, "total_prediction_acc=", total_correct_pred * 100.0 / total_element_pred)
|
| 399 |
-
# else:
|
| 400 |
-
# print("EP%d_%s, avg_loss=" % (epoch, str_code), avg_loss / len(data_iter), "total_masked_acc=", total_correct_mask * 100.0 / total_element_mask)
|
| 401 |
-
# print("EP%d_%s, " % (epoch, str_code))
|
| 402 |
-
|
| 403 |
-
f.close()
|
| 404 |
-
sys.stdout = sys.__stdout__
|
| 405 |
-
self.save_model = False
|
| 406 |
-
if self.avg_loss > (avg_loss / len(data_iter)):
|
| 407 |
-
self.save_model = True
|
| 408 |
-
self.avg_loss = (avg_loss / len(data_iter))
|
| 409 |
-
|
| 410 |
-
# pickle.dump(bert_hidden_representations, open(f"embeddings/{code}/{str_code}_embeddings_{epoch}.pkl","wb"))
|
| 411 |
-
|
| 412 |
-
|
| 413 |
-
>>>>>>> bffd3381ccb717f802fe651d4111ec0a268e3896
|
| 414 |
|
| 415 |
def save(self, epoch, file_path="output/bert_trained.model"):
|
| 416 |
"""
|
|
@@ -432,12 +209,8 @@ class BERTFineTuneTrainer:
|
|
| 432 |
def __init__(self, bert: BERT, vocab_size: int,
|
| 433 |
train_dataloader: DataLoader, test_dataloader: DataLoader = None,
|
| 434 |
lr: float = 1e-4, betas=(0.9, 0.999), weight_decay: float = 0.01, warmup_steps=10000,
|
| 435 |
-
<<<<<<< HEAD
|
| 436 |
with_cuda: bool = True, cuda_devices=None, log_freq: int = 10, workspace_name=None,
|
| 437 |
num_labels=2, log_folder_path: str = None):
|
| 438 |
-
=======
|
| 439 |
-
with_cuda: bool = True, cuda_devices=None, log_freq: int = 10, workspace_name=None, num_labels=2):
|
| 440 |
-
>>>>>>> bffd3381ccb717f802fe651d4111ec0a268e3896
|
| 441 |
"""
|
| 442 |
:param bert: BERT model which you want to train
|
| 443 |
:param vocab_size: total word vocab size
|
|
@@ -453,7 +226,6 @@ class BERTFineTuneTrainer:
|
|
| 453 |
# Setup cuda device for BERT training, argument -c, --cuda should be true
|
| 454 |
cuda_condition = torch.cuda.is_available() and with_cuda
|
| 455 |
self.device = torch.device("cuda:0" if cuda_condition else "cpu")
|
| 456 |
-
<<<<<<< HEAD
|
| 457 |
print(cuda_condition, " Device used = ", self.device)
|
| 458 |
|
| 459 |
available_gpus = list(range(torch.cuda.device_count()))
|
|
@@ -462,6 +234,16 @@ class BERTFineTuneTrainer:
|
|
| 462 |
self.bert = bert
|
| 463 |
for param in self.bert.parameters():
|
| 464 |
param.requires_grad = False
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 465 |
# Initialize the BERT Language Model, with BERT model
|
| 466 |
# self.model = BERTForClassification(self.bert, vocab_size, num_labels).to(self.device)
|
| 467 |
# self.model = BERTForClassificationWithFeats(self.bert, num_labels, 8).to(self.device)
|
|
@@ -748,48 +530,11 @@ class BERTFineTuneTrainer1:
|
|
| 748 |
for fi in ['train', 'test']: #'val',
|
| 749 |
f = open(self.log_folder_path+f"/log_{fi}_finetuned.txt", 'w')
|
| 750 |
f.close()
|
| 751 |
-
=======
|
| 752 |
-
print("Device used = ", self.device)
|
| 753 |
-
|
| 754 |
-
# This BERT model will be saved every epoch
|
| 755 |
-
self.bert = bert
|
| 756 |
-
# for param in self.bert.parameters():
|
| 757 |
-
# param.requires_grad = False
|
| 758 |
-
# Initialize the BERT Language Model, with BERT model
|
| 759 |
-
self.model = BERTForClassification(self.bert, vocab_size, num_labels).to(self.device)
|
| 760 |
-
|
| 761 |
-
# Distributed GPU training if CUDA can detect more than 1 GPU
|
| 762 |
-
if with_cuda and torch.cuda.device_count() > 1:
|
| 763 |
-
print("Using %d GPUS for BERT" % torch.cuda.device_count())
|
| 764 |
-
self.model = nn.DataParallel(self.model, device_ids=cuda_devices)
|
| 765 |
-
|
| 766 |
-
# Setting the train and test data loader
|
| 767 |
-
self.train_data = train_dataloader
|
| 768 |
-
self.test_data = test_dataloader
|
| 769 |
-
|
| 770 |
-
self.optim = Adam(self.model.parameters(), lr=lr, weight_decay=weight_decay, eps=1e-9)
|
| 771 |
-
# self.scheduler = torch.optim.lr_scheduler.ReduceLROnPlateau(optimizer, 'min', patience=2, factor=0.1)
|
| 772 |
-
|
| 773 |
-
if num_labels == 1:
|
| 774 |
-
self.criterion = nn.MSELoss()
|
| 775 |
-
elif num_labels == 2:
|
| 776 |
-
self.criterion = nn.CrossEntropyLoss()
|
| 777 |
-
elif num_labels > 2:
|
| 778 |
-
self.criterion = nn.BCEWithLogitsLoss()
|
| 779 |
-
|
| 780 |
-
self.ece_criterion = ECE().to(self.device)
|
| 781 |
-
|
| 782 |
-
self.log_freq = log_freq
|
| 783 |
-
self.workspace_name = workspace_name
|
| 784 |
-
self.save_model = False
|
| 785 |
-
self.avg_loss = 10000
|
| 786 |
-
>>>>>>> bffd3381ccb717f802fe651d4111ec0a268e3896
|
| 787 |
print("Total Parameters:", sum([p.nelement() for p in self.model.parameters()]))
|
| 788 |
|
| 789 |
def train(self, epoch):
|
| 790 |
self.iteration(epoch, self.train_data)
|
| 791 |
|
| 792 |
-
<<<<<<< HEAD
|
| 793 |
# def val(self, epoch):
|
| 794 |
# self.iteration(epoch, self.val_data, phase="val")
|
| 795 |
|
|
@@ -799,12 +544,6 @@ class BERTFineTuneTrainer1:
|
|
| 799 |
self.iteration(epoch, self.test_data, phase="test")
|
| 800 |
|
| 801 |
def iteration(self, epoch, data_loader, phase="train"):
|
| 802 |
-
=======
|
| 803 |
-
def test(self, epoch):
|
| 804 |
-
self.iteration(epoch, self.test_data, train=False)
|
| 805 |
-
|
| 806 |
-
def iteration(self, epoch, data_loader, train=True):
|
| 807 |
-
>>>>>>> bffd3381ccb717f802fe651d4111ec0a268e3896
|
| 808 |
"""
|
| 809 |
loop over the data_loader for training or testing
|
| 810 |
if on train status, backward operation is activated
|
|
@@ -815,26 +554,10 @@ class BERTFineTuneTrainer1:
|
|
| 815 |
:param train: boolean value of is train or test
|
| 816 |
:return: None
|
| 817 |
"""
|
| 818 |
-
<<<<<<< HEAD
|
| 819 |
|
| 820 |
# Setting the tqdm progress bar
|
| 821 |
data_iter = tqdm.tqdm(enumerate(data_loader),
|
| 822 |
desc="EP_%s:%d" % (phase, epoch),
|
| 823 |
-
=======
|
| 824 |
-
str_code = "train" if train else "test"
|
| 825 |
-
|
| 826 |
-
self.log_file = f"{self.workspace_name}/logs/masked/log_{str_code}_FS_finetuned.txt"
|
| 827 |
-
|
| 828 |
-
if epoch == 0:
|
| 829 |
-
f = open(self.log_file, 'w')
|
| 830 |
-
f.close()
|
| 831 |
-
if not train:
|
| 832 |
-
self.avg_loss = 10000
|
| 833 |
-
|
| 834 |
-
# Setting the tqdm progress bar
|
| 835 |
-
data_iter = tqdm.tqdm(enumerate(data_loader),
|
| 836 |
-
desc="EP_%s:%d" % (str_code, epoch),
|
| 837 |
-
>>>>>>> bffd3381ccb717f802fe651d4111ec0a268e3896
|
| 838 |
total=len(data_loader),
|
| 839 |
bar_format="{l_bar}{r_bar}")
|
| 840 |
|
|
@@ -843,7 +566,6 @@ class BERTFineTuneTrainer1:
|
|
| 843 |
total_element = 0
|
| 844 |
plabels = []
|
| 845 |
tlabels = []
|
| 846 |
-
<<<<<<< HEAD
|
| 847 |
probabs = []
|
| 848 |
|
| 849 |
if phase == "train":
|
|
@@ -864,43 +586,10 @@ class BERTFineTuneTrainer1:
            logits = self.model.forward(data["input"], data["segment_label"])#, data["feat"])

            loss = self.criterion(logits, data["label"])
-=======
-        eval_accurate_nb = 0
-        nb_eval_examples = 0
-        logits_list = []
-        labels_list = []
-
-        if train:
-            self.model.train()
-        else:
-            self.model.eval()
-
-        with open(self.log_file, 'a') as f:
-            sys.stdout = f
-
-            for i, data in data_iter:
-                # 0. batch_data will be sent into the device(GPU or cpu)
-                data = {key: value.to(self.device) for key, value in data.items()}
-                if train:
-                    h_rep, logits = self.model.forward(data["bert_input"], data["segment_label"])
-                else:
-                    with torch.no_grad():
-                        h_rep, logits = self.model.forward(data["bert_input"], data["segment_label"])
-                # print(logits, logits.shape)
-                logits_list.append(logits.cpu())
-                labels_list.append(data["progress_status"].cpu())
-                # print(">>>>>>>>>>>>", progress_output)
-                # print(f"{epoch}---nelement--- {data['progress_status'].nelement()}")
-                # print(data["progress_status"].shape, logits.shape)
-                progress_loss = self.criterion(logits, data["progress_status"])
-                loss = progress_loss
-
->>>>>>> bffd3381ccb717f802fe651d4111ec0a268e3896
            if torch.cuda.device_count() > 1:
                loss = loss.mean()

            # 3. backward and optimization only in train
-<<<<<<< HEAD
            if phase == "train":
                self.optim_schedule.zero_grad()
                loss.backward()
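Both sides of this conflict implement the usual phase-dependent loop: the kept code switches on a phase string, the removed code on a train flag with torch.no_grad() for evaluation. A self-contained sketch of that structure, with the model, criterion and dataloader as placeholders rather than this repo's classes:

import torch
import torch.nn as nn

def run_epoch(model: nn.Module, data_loader, criterion, optimizer, device, phase="train"):
    # Switch dropout/batch-norm behaviour and gradient tracking by phase.
    model.train() if phase == "train" else model.eval()
    total_loss = 0.0
    with torch.set_grad_enabled(phase == "train"):
        for inputs, labels in data_loader:
            inputs, labels = inputs.to(device), labels.to(device)
            logits = model(inputs)
            loss = criterion(logits, labels)
            if phase == "train":
                optimizer.zero_grad()
                loss.backward()
                optimizer.step()
            total_loss += loss.item()
    return total_loss / max(len(data_loader), 1)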
@@ -969,108 +658,10 @@ class BERTFineTuneTrainer1:
            sys.stdout = sys.__stdout__

        if phase == "test":
-=======
-                if train:
-                    self.optim.zero_grad()
-                    loss.backward()
-                    torch.nn.utils.clip_grad_norm_(self.model.parameters(), 1.0)
-                    self.optim.step()
-
-                # progress prediction accuracy
-                # correct = progress_output.argmax(dim=-1).eq(data["progress_status"]).sum().item()
-                probs = nn.LogSoftmax(dim=-1)(logits)
-                predicted_labels = torch.argmax(probs, dim=-1)
-                true_labels = torch.argmax(data["progress_status"], dim=-1)
-                plabels.extend(predicted_labels.cpu().numpy())
-                tlabels.extend(true_labels.cpu().numpy())
-
-                # print(">>>>>>>>>>>>>>", predicted_labels, true_labels)
-                # Compare predicted labels to true labels and calculate accuracy
-                correct = (predicted_labels == true_labels).sum().item()
-                avg_loss += loss.item()
-                total_correct += correct
-                total_element += true_labels.nelement()
-
-                if train:
-                    post_fix = {
-                        "epoch": epoch,
-                        "iter": i,
-                        "avg_loss": avg_loss / (i + 1),
-                        "avg_acc": total_correct / total_element * 100,
-                        "loss": loss.item()
-                    }
-                else:
-                    logits = logits.detach().cpu().numpy()
-                    label_ids = data["progress_status"].to('cpu').numpy()
-                    tmp_eval_nb = accurate_nb(logits, label_ids)
-
-                    eval_accurate_nb += tmp_eval_nb
-                    nb_eval_examples += label_ids.shape[0]
-
-                    total_element += data["progress_status"].nelement()
-                    # avg_loss += loss.item()
-
-                    post_fix = {
-                        "epoch": epoch,
-                        "iter": i,
-                        "avg_loss": avg_loss / (i + 1),
-                        "avg_acc": tmp_eval_nb / total_element * 100,
-                        "loss": loss.item()
-                    }
-
-                if i % self.log_freq == 0:
-                    data_iter.write(str(post_fix))
-
-            # precisions = precision_score(plabels, tlabels, average="weighted")
-            # recalls = recall_score(plabels, tlabels, average="weighted")
-            f1_scores = f1_score(plabels, tlabels, average="weighted")
-            if train:
-                final_msg = {
-                    "epoch": f"EP{epoch}_{str_code}",
-                    "avg_loss": avg_loss / len(data_iter),
-                    "total_acc": total_correct * 100.0 / total_element,
-                    # "precisions": precisions,
-                    # "recalls": recalls,
-                    "f1_scores": f1_scores
-                }
-            else:
-                eval_accuracy = eval_accurate_nb/nb_eval_examples
-
-                logits_ece = torch.cat(logits_list)
-                labels_ece = torch.cat(labels_list)
-                ece = self.ece_criterion(logits_ece, labels_ece).item()
-                final_msg = {
-                    "epoch": f"EP{epoch}_{str_code}",
-                    "eval_accuracy": eval_accuracy,
-                    "ece": ece,
-                    "avg_loss": avg_loss / len(data_iter),
-                    # "precisions": precisions,
-                    # "recalls": recalls,
-                    "f1_scores": f1_scores
-                }
-                if self.save_model:
-                    conf_hist = visualization.ConfidenceHistogram()
-                    plt_test = conf_hist.plot(np.array(logits_ece), np.array(labels_ece), title= f"Confidence Histogram {epoch}")
-                    plt_test.savefig(f"{self.workspace_name}/plots/confidence_histogram/FS/conf_histogram_test_{epoch}.png",bbox_inches='tight')
-                    plt_test.close()
-
-                    rel_diagram = visualization.ReliabilityDiagram()
-                    plt_test_2 = rel_diagram.plot(np.array(logits_ece), np.array(labels_ece),title=f"Reliability Diagram {epoch}")
-                    plt_test_2.savefig(f"{self.workspace_name}/plots/confidence_histogram/FS/rel_diagram_test_{epoch}.png",bbox_inches='tight')
-                    plt_test_2.close()
-            print(final_msg)
-
-            # print("EP%d_%s, avg_loss=" % (epoch, str_code), avg_loss / len(data_iter), "total_acc=", total_correct * 100.0 / total_element)
-            f.close()
-        sys.stdout = sys.__stdout__
-        if train:
->>>>>>> bffd3381ccb717f802fe651d4111ec0a268e3896
            self.save_model = False
            if self.avg_loss > (avg_loss / len(data_iter)):
                self.save_model = True
                self.avg_loss = (avg_loss / len(data_iter))
-<<<<<<< HEAD

    def iteration_1(self, epoch_idx, data):
        try:
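The removed branch reported accuracy, weighted F1 and an expected calibration error through the project's ECE and visualization modules, which are not part of this diff. As a rough reference only, a self-contained ECE over softmax confidences (assuming integer class labels) can be computed like this:

import torch

def expected_calibration_error(logits: torch.Tensor, labels: torch.Tensor, n_bins: int = 10) -> float:
    # Bin predictions by confidence, then compare per-bin accuracy with mean confidence.
    probs = torch.softmax(logits, dim=-1)
    confidences, predictions = probs.max(dim=-1)
    accuracies = predictions.eq(labels).float()
    ece = torch.zeros(1)
    bin_edges = torch.linspace(0, 1, n_bins + 1)
    for lo, hi in zip(bin_edges[:-1], bin_edges[1:]):
        in_bin = (confidences > lo) & (confidences <= hi)
        if in_bin.any():
            ece += in_bin.float().mean() * (accuracies[in_bin].mean() - confidences[in_bin].mean()).abs()
    return ece.item()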
@@ -1094,11 +685,6 @@ class BERTFineTuneTrainer1:
            print(f"Error during iteration: {e}")
            raise

-=======
-
-        # plt_test.show()
-        # print("EP%d_%s, " % (epoch, str_code))
->>>>>>> bffd3381ccb717f802fe651d4111ec0a268e3896

    def save(self, epoch, file_path="output/bert_fine_tuned_trained.model"):
        """
@@ -1113,7 +699,6 @@ class BERTFineTuneTrainer1:
        self.model.to(self.device)
        print("EP:%d Model Saved on:" % epoch, output_path)
        return output_path
-<<<<<<< HEAD


class BERTAttention:
@@ -1221,5 +806,3 @@ class BERTAttention:



-=======
->>>>>>> bffd3381ccb717f802fe651d4111ec0a268e3896
        self.bert = bert
        for param in self.bert.parameters():
            param.requires_grad = False
+
+        # for name, param in self.bert.named_parameters():
+        #     if '.attention.linear_layers.0' in name or \
+        #        '.attention.linear_layers.1' in name or \
+        #        '.attention.linear_layers.2' in name:
+        # #     if 'transformer_blocks.' in name:# or \
+        # #        'transformer_blocks.3.' in name:
+        # #     if '2.attention.linear_layers.' in name or \
+        # #        '3.attention.linear_layers.' in name:
+        #         param.requires_grad = True
        # Initialize the BERT Language Model, with BERT model
        # self.model = BERTForClassification(self.bert, vocab_size, num_labels).to(self.device)
        # self.model = BERTForClassificationWithFeats(self.bert, num_labels, 8).to(self.device)
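The surviving fine-tune constructor freezes the pretrained encoder and leaves the commented block above as a hook for selectively unfreezing attention layers. A generic sketch of that freezing pattern, with the encoder as a placeholder module rather than this repo's BERT class:

import torch.nn as nn

def freeze_encoder(encoder: nn.Module, unfreeze_substring: str = None) -> None:
    # Freeze everything, then optionally re-enable parameters whose name
    # contains a given substring (e.g. a specific attention sub-layer).
    for name, param in encoder.named_parameters():
        param.requires_grad = False
        if unfreeze_substring is not None and unfreeze_substring in name:
            param.requires_grad = True

Only parameters that still require gradients would then be handed to the optimizer, for example via filter(lambda p: p.requires_grad, model.parameters()).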
src/seq_model.py
CHANGED
@@ -1,10 +1,6 @@
import torch.nn as nn

-<<<<<<< HEAD
from .bert import BERT
-=======
-from bert import BERT
->>>>>>> bffd3381ccb717f802fe651d4111ec0a268e3896


class BERTSM(nn.Module):
@@ -22,23 +18,10 @@ class BERTSM(nn.Module):
        super().__init__()
        self.bert = bert
        self.mask_lm = MaskedSequenceModel(self.bert.hidden, vocab_size)
-<<<<<<< HEAD

    def forward(self, x, segment_label):
        x = self.bert(x, segment_label)
        return self.mask_lm(x), x[:, 0]
-=======
-        self.same_student = SameStudentPrediction(self.bert.hidden)
-
-    def forward(self, x, segment_label, pred=False):
-        x = self.bert(x, segment_label)
-        # torch.Size([32, 200, 512])
-        # print("???????????? ",x.shape)
-        if pred:
-            return x[:, 0], self.mask_lm(x), self.same_student(x)
-        else:
-            return x[:, 0], self.mask_lm(x)
->>>>>>> bffd3381ccb717f802fe651d4111ec0a268e3896


class MaskedSequenceModel(nn.Module):
@@ -57,23 +40,4 @@ class MaskedSequenceModel(nn.Module):
        self.softmax = nn.LogSoftmax(dim=-1)

    def forward(self, x):
-<<<<<<< HEAD
-        return self.softmax(self.linear(x))
-=======
-        return self.softmax(self.linear(x))
+        return self.softmax(self.linear(x))
-
-
-class SameStudentPrediction(nn.Module):
-
-    def __init__(self, hidden):
-        """
-        :param hidden: BERT model output size
-        """
-        super().__init__()
-        self.linear = nn.Linear(hidden, 2)
-        self.softmax = nn.LogSoftmax(dim=-1)
-
-    def forward(self, x):
-        return self.softmax(self.linear(x[:, 0]))
-
->>>>>>> bffd3381ccb717f802fe651d4111ec0a268e3896
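After the merge, BERTSM.forward returns the log-probabilities from MaskedSequenceModel plus the representation at position 0, so the masked-token loss pairs naturally with NLLLoss (the pretrainer's criterion is nn.NLLLoss(ignore_index=0)). A rough usage sketch with made-up shapes standing in for the model output:

import torch
import torch.nn as nn

vocab_size, hidden, seq_len, batch = 1000, 64, 20, 8

log_probs = torch.randn(batch, seq_len, vocab_size).log_softmax(dim=-1)  # stand-in for mask_lm(x)
targets = torch.randint(0, vocab_size, (batch, seq_len))                 # masked-token labels, 0 = padding

criterion = nn.NLLLoss(ignore_index=0)                # ignore_index=0 mirrors the padding convention
loss = criterion(log_probs.transpose(1, 2), targets)  # NLLLoss expects (batch, classes, seq)
print(loss.item())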
src/transformer.py
CHANGED
@@ -1,12 +1,7 @@
import torch.nn as nn

-<<<<<<< HEAD
from .attention import MultiHeadedAttention
from .transformer_component import SublayerConnection, PositionwiseFeedForward
-=======
-from attention import MultiHeadedAttention
-from transformer_component import SublayerConnection, PositionwiseFeedForward
->>>>>>> bffd3381ccb717f802fe651d4111ec0a268e3896

class TransformerBlock(nn.Module):
    """
@@ -30,12 +25,8 @@ class TransformerBlock(nn.Module):
        self.dropout = nn.Dropout(p=dropout)

    def forward(self, x, mask):
-<<<<<<< HEAD
        attn_output, p_attn = self.attention.forward(x, x, x, mask=mask)
        self.p_attn = p_attn.cpu().detach().numpy()
        x = self.input_sublayer(x, lambda _x: attn_output)
-=======
-        x = self.input_sublayer(x, lambda _x: self.attention.forward(_x, _x, _x, mask=mask))
->>>>>>> bffd3381ccb717f802fe651d4111ec0a268e3896
        x = self.output_sublayer(x, self.feed_forward)
        return self.dropout(x)
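The kept forward unpacks the attention weights and stores them on the block as self.p_attn so they can be read out later (the BERTAttention analysis in pretrainer.py depends on this). An illustrative stand-alone version of that caching pattern, using torch.nn.MultiheadAttention instead of this repo's MultiHeadedAttention:

import torch
import torch.nn as nn

class AttentionProbe(nn.Module):
    # Wraps an attention layer and keeps the latest attention map for inspection.
    def __init__(self, hidden: int, heads: int):
        super().__init__()
        self.attention = nn.MultiheadAttention(hidden, heads, batch_first=True)
        self.p_attn = None

    def forward(self, x: torch.Tensor) -> torch.Tensor:
        out, weights = self.attention(x, x, x, need_weights=True)
        self.p_attn = weights.detach().cpu().numpy()  # cached like self.p_attn in TransformerBlock
        return out

probe = AttentionProbe(hidden=32, heads=4)
y = probe(torch.randn(2, 10, 32))
print(probe.p_attn.shape)  # (2, 10, 10): head-averaged attention weights per batch element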
src/vocab.py
CHANGED
@@ -1,22 +1,16 @@
import collections
import tqdm
-<<<<<<< HEAD
import os
from pathlib import Path

head_directory = Path(__file__).resolve().parent.parent
# print(head_directory)
os.chdir(head_directory)
-=======
->>>>>>> bffd3381ccb717f802fe651d4111ec0a268e3896

class Vocab(object):
    """
    Special tokens predefined in the vocab file are:
-<<<<<<< HEAD
    -[PAD]
-=======
->>>>>>> bffd3381ccb717f802fe651d4111ec0a268e3896
    -[UNK]
    -[MASK]
    -[CLS]
@@ -48,11 +42,7 @@ class Vocab(object):
        words = [self.invocab[index] if index < len(self.invocab)
                 else "[%d]" % index for index in seq ]

-<<<<<<< HEAD
        return words #" ".join(words)
-=======
-        return " ".join(words)
->>>>>>> bffd3381ccb717f802fe651d4111ec0a268e3896


# if __init__ == "__main__":
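The kept decode path hands back the token list instead of a joined string, leaving joining to the caller. A small sketch of that decoding step; the vocabulary entries below are made up apart from the special tokens listed in the docstring:

def decode(seq, invocab):
    # Map indices back to tokens; out-of-range indices fall back to a bracketed id,
    # mirroring the "[%d]" % index fallback in Vocab.
    return [invocab[i] if i < len(invocab) else "[%d]" % i for i in seq]

invocab = ["[PAD]", "[UNK]", "[MASK]", "[CLS]", "step_a", "step_b"]  # hypothetical vocab
tokens = decode([3, 4, 5, 99], invocab)
print(tokens)            # ['[CLS]', 'step_a', 'step_b', '[99]']
print(" ".join(tokens))  # the removed branch returned this joined form directly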