import random

import spacy
from spacy.language import Language
from spacy.scorer import Scorer
from spacy.training import Example
from spacy.util import minibatch, compounding
from tqdm import tqdm

from src.model_utils import load_model, make_training_doc


def train_transformer(config: dict, train_data: list, components: list, n_iter: int,
                      batch_size=compounding(4.0, 32.0, 1.001), entities: list = None,
                      eval_data: list = None) -> tuple:
    """
    Fine-tune a transformer model or resume training from a fine-tuned model.

    Parameters:
        config: dict, configuration parameters; config['dir'] is the path of a
            fine-tuned pipeline to resume from, or None to train from scratch
        train_data: list, containing the training data
        components: list, pipeline components to be trained
        n_iter: int, number of iterations to train
        batch_size: batch size (or compounding schedule) used for training
        entities: list, entity labels to be trained on for NER
        eval_data: list, containing the evaluation data

    Returns:
        nlp: the trained spaCy transformer pipeline
        all_losses: list of the losses at every iteration
    """
    if config['dir'] is not None:
        # resume training from an existing fine-tuned pipeline
        nlp = spacy.load(config['dir'])
        optimizer = nlp.resume_training()
    else:
        nlp = spacy.blank("en")  # empty English pipeline
        nlp.add_pipe("transformer", config=config['config'])
        for component in components:
            task = nlp.add_pipe(component)
            # add entity labels if the component is NER
            if component == 'ner' and entities is not None:
                for label in entities:
                    task.add_label(label)
        nlp.initialize()  # required before training a freshly built pipeline
        optimizer = nlp.create_optimizer()
    # convert data into training Examples
    train_data_doc = make_training_doc(nlp, train_data)
    all_losses = []
    for itn in tqdm(range(1, n_iter + 1)):
        print(f"Starting iteration {itn}")
        # shuffle the Examples in place so each epoch sees a different order
        # (assumes make_training_doc returns a list)
        random.shuffle(train_data_doc)
        losses = {}
        batches = minibatch(train_data_doc, size=batch_size)
        for batch in batches:
            nlp.update(batch, sgd=optimizer, drop=0.2, losses=losses)
        scores = eval_spacy(nlp, eval_data) if eval_data else eval_spacy(nlp, train_data)
        print("epoch: {} Losses: {} Recall: {} Precision: {} F1: {}".format(
            itn, losses, scores['ents_r'], scores['ents_p'], scores['ents_f']))
        all_losses.append([losses[component] for component in components])
    return nlp, all_losses
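
# Example usage: a minimal sketch of fine-tuning a blank transformer NER
# pipeline with train_transformer. The "roberta-base" model name, the
# transformer config shape below, and the sample data are illustrative
# assumptions, not values taken from this repository.
#
#   config = {
#       "dir": None,  # or the path of a fine-tuned pipeline to resume from
#       "config": {"model": {"@architectures": "spacy-transformers.TransformerModel.v3",
#                            "name": "roberta-base"}},
#   }
#   train_data = [("Apple is buying a U.K. startup.", {"entities": [(0, 5, "ORG")]})]
#   nlp, losses = train_transformer(config, train_data, components=["ner"],
#                                   n_iter=10, entities=["ORG"])
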
def train_spacy(model: str, train_data: list, components: list, n_iter: int,
                batch_size=compounding(4.0, 32.0, 1.001), entities: list = None,
                eval_data: list = None) -> tuple:
    """
    Fine-tune a spaCy model or resume training from a fine-tuned model.

    Parameters:
        model: str, name of the spaCy model to load
        train_data: list, containing the training data
        components: list, pipeline components to be trained
        n_iter: int, number of iterations to train
        batch_size: batch size (or compounding schedule) used for training
        entities: list, entity labels to be trained on for NER
        eval_data: list, containing the evaluation data

    Returns:
        nlp: the trained spaCy model
        all_losses: list of the losses at every iteration
    """
    # get model and optimizer
    if model is None:
        raise ValueError("model must name an existing or blank spaCy model")
    nlp, optimizer = load_model(model)  # load an existing spaCy model or a blank one
    # convert data into training Examples
    train_data_doc = make_training_doc(nlp, train_data)
    # create the built-in pipeline components and add them to the pipeline;
    # nlp.add_pipe works for built-ins that are registered with spaCy
    for component in components:
        if component not in nlp.pipe_names:
            pipe = nlp.add_pipe(component, last=True)
        else:
            pipe = nlp.get_pipe(component)
        # add labels if the component is NER
        if component == 'ner' and entities is not None:
            for ent in entities:
                pipe.add_label(ent)
    if 'ner' in nlp.pipe_names:
        print(f'Entities in the model are: {nlp.get_pipe("ner").labels}')
    # get names of the other pipes, to disable them during training
    other_pipes = [pipe for pipe in nlp.pipe_names if pipe not in components]
    all_losses = []
    with nlp.select_pipes(disable=other_pipes):  # only train the selected components
        for itn in tqdm(range(1, n_iter + 1)):
            print(f"Starting iteration {itn}")
            # shuffle the Examples in place so each epoch sees a different order
            # (assumes make_training_doc returns a list)
            random.shuffle(train_data_doc)
            losses = {}
            batches = minibatch(train_data_doc, size=batch_size)
            for batch in batches:
                nlp.update(list(batch),
                           losses=losses,
                           drop=0.1,
                           sgd=optimizer)
            scores = eval_spacy(nlp, eval_data) if eval_data else eval_spacy(nlp, train_data)
            print("epoch: {} Losses: {} Recall: {} Precision: {} F1: {}".format(
                itn, losses, scores['ents_r'], scores['ents_p'], scores['ents_f']))
            all_losses.append([losses[component] for component in components])
    return nlp, all_losses
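
# Example usage: a minimal sketch of fine-tuning an installed spaCy pipeline
# with train_spacy. The "en_core_web_sm" model name and the sample data are
# illustrative assumptions; any name accepted by load_model should work.
#
#   train_data = [("Apple is buying a U.K. startup.", {"entities": [(0, 5, "ORG")]})]
#   nlp, losses = train_spacy("en_core_web_sm", train_data, components=["ner"],
#                             n_iter=10, entities=["ORG"])
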
def eval_spacy(model: Language, data: list) -> dict:
    """
    Perform evaluation and scoring.

    Parameters:
        model: either a spaCy model or a spaCy transformer pipeline
        data: list, containing the evaluation data to score against

    Returns:
        scores: dict with the scores of the model
    """
    scorer = Scorer()
    examples = []
    try:
        # accept data in spaCy's (text, annotations) training format
        for input_, annot in data:
            doc = model.make_doc(input_)
            example = Example.from_dict(doc, annot)
            example.predicted = model(str(example.text))
            examples.append(example)
        return scorer.score(examples)
    except TypeError:
        # accept the alternative format: dicts whose values are the text
        # and its entity annotations
        for row in data:
            input_, annot = row.values()
            doc = model.make_doc(input_)
            example = Example.from_dict(doc, {'entities': annot})
            example.predicted = model(str(example.text))
            examples.append(example)
        return scorer.score(examples)
    except Exception as e:
        print(e)
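
# Example usage: a minimal sketch of scoring a pipeline with eval_spacy. The
# sample data is an illustrative assumption; either (text, annotations) tuples
# in spaCy's training format or dicts whose two values are the text and its
# entity annotations are accepted.
#
#   eval_data = [("Apple is buying a U.K. startup.", {"entities": [(0, 5, "ORG")]})]
#   scores = eval_spacy(nlp, eval_data)
#   print(scores["ents_p"], scores["ents_r"], scores["ents_f"])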