# ner_pg/src/trainers.py
import random

import spacy
from spacy.scorer import Scorer
from spacy.training import Example
from spacy.util import minibatch, compounding
from tqdm import tqdm

from src.model_utils import *  # provides make_training_doc and load_model


def train_transformer(config: dict, train_data: list, components: list, n_iter: int,
                      batch_size=None, entities: list = None, eval_data: list = None) -> tuple:
"""
Finetune a transformer model or resume training from a fine-tuned model.
Parameters:
config: dict, configuration parameters
train_data: list, contain training data
components: list, list of components to be trained
iter: int, number of iterations to train
batch_size: int, batch size to be used for training
entities: list of entities to be trained on for NER
eval_data: list, containing evaluation data
Returns:
nlp : spacy transformer
losses: list of the losses at every iteration
"""
    if config['dir'] is not None:
        # resume training from a previously fine-tuned model
        nlp = spacy.load(config['dir'])
        optimizer = nlp.resume_training()
    else:
        nlp = spacy.blank("en")  # empty English pipeline
        nlp.add_pipe("transformer", config=config['config'])
        for component in components:
            nlp.add_pipe(component)
        # add entity labels before initializing, so the NER head knows about them
        if ('ner' in components) and (entities is not None):
            ner = nlp.get_pipe("ner")
            for label in entities:
                ner.add_label(label)
        nlp.initialize()  # required before training a freshly built pipeline
        optimizer = nlp.create_optimizer()
    if batch_size is None:
        batch_size = compounding(4.0, 32.0, 1.001)
    # convert data into training examples
    train_data_doc = make_training_doc(nlp, train_data)
    all_losses = []
    for itn in tqdm(range(1, n_iter + 1)):
        print("Starting iteration " + str(itn))
        # shuffle the converted examples, since these are what minibatch consumes
        random.shuffle(train_data_doc)
        losses = {}
        batches = minibatch(train_data_doc, size=batch_size)
        for batch in batches:
            nlp.update(batch, sgd=optimizer, drop=0.2, losses=losses)
        # score on the eval data if provided, otherwise on the training data
        scores = eval_spacy(nlp, eval_data) if eval_data else eval_spacy(nlp, train_data)
        print(f"epoch: {itn} Losses: {losses} Recall: {scores['ents_r']} "
              f"Precision: {scores['ents_p']} F1: {scores['ents_f']}")
        all_losses.append([losses[component] for component in components])
    return nlp, all_losses
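
# Example usage (a minimal sketch; the "roberta-base" transformer config below is
# an assumption about what config['config'] holds, and train_data is shown in the
# (text, {"entities": [...]}) format that make_training_doc is expected to accept):
#
#   config = {
#       "dir": None,  # or a path to a fine-tuned model to resume from
#       "config": {"model": {"name": "roberta-base"}},
#   }
#   train_data = [("Apple is buying a U.K. startup.",
#                  {"entities": [(0, 5, "ORG")]})]
#   nlp, all_losses = train_transformer(config, train_data, components=["ner"],
#                                       n_iter=10, entities=["ORG"])
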
def train_spacy(model: str, train_data: list, components: list, n_iter: int,
                batch_size=None, entities: list = None, eval_data: list = None) -> tuple:
"""
Finetune a spacy model or resume training from a fine-tuned model.
Parameters:
model: str, name of spacy model
train_data: list, contain training data
components: list, list of components to be trained
iter: int, number of iterations to train
batch_size: int, batch size to be used for training
entities: list of entities to be trained on for NER
eval_data: list, containing evaluation data
Returns:
nlp : spacy model
losses: list of the losses at every iteration
"""
    # get model and optimizer; load_model handles both existing and blank models
    nlp, optimizer = load_model(model)
    if batch_size is None:
        batch_size = compounding(4.0, 32.0, 1.001)
    # convert data into training examples
    train_data_doc = make_training_doc(nlp, train_data)
    # add the built-in pipeline components to the pipeline if they are missing;
    # in spaCy v3, add_pipe takes the registered component name and returns the pipe
    for component in components:
        if component not in nlp.pipe_names:
            ner = nlp.add_pipe(component, last=True)
        else:
            ner = nlp.get_pipe(component)
        # add labels if the component is NER
        if (component == 'ner') and (entities is not None):
            for ent in entities:
                ner.add_label(ent)
    if 'ner' in nlp.pipe_names:
        print(f'Entities in the model are: {nlp.get_pipe("ner").labels}')
    # get names of the other pipes, to disable them during training
    other_pipes = [pipe for pipe in nlp.pipe_names if pipe not in components]
    all_losses = []
    with nlp.select_pipes(disable=other_pipes):  # only train the listed components
        for itn in tqdm(range(1, n_iter + 1)):
            print("Starting iteration " + str(itn))
            # shuffle the converted examples, since these are what minibatch consumes
            random.shuffle(train_data_doc)
            losses = {}
            batches = minibatch(train_data_doc, size=batch_size)
            for batch in batches:
                nlp.update(list(batch),
                           losses=losses,
                           drop=0.1,
                           sgd=optimizer)
            # score on the eval data if provided, otherwise on the training data
            scores = eval_spacy(nlp, eval_data) if eval_data else eval_spacy(nlp, train_data)
            print(f"epoch: {itn} Losses: {losses} Recall: {scores['ents_r']} "
                  f"Precision: {scores['ents_p']} F1: {scores['ents_f']}")
            all_losses.append([losses[component] for component in components])
    return nlp, all_losses
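
# Example usage (a minimal sketch; assumes load_model resolves a model name such
# as "en_core_web_sm" to a pipeline plus optimizer, and that train_data uses the
# (text, {"entities": [...]}) format; the output path is hypothetical):
#
#   train_data = [("Apple is buying a U.K. startup.",
#                  {"entities": [(0, 5, "ORG")]})]
#   nlp, all_losses = train_spacy("en_core_web_sm", train_data, components=["ner"],
#                                 n_iter=10, entities=["ORG"])
#   nlp.to_disk("models/ner_finetuned")
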
def eval_spacy(model, data) -> dict:
    """
    Perform evaluation and scoring.
    Parameters:
        model: either a spaCy model or a spaCy transformer pipeline
        data: list, evaluation data on which scoring is done
    Returns:
        scores: dict with the scores of the model
    """
    scorer = Scorer()
    examples = []
    try:
        # accept spaCy-format data: (text, annotations) tuples
        for input_, annot in data:
            doc = model.make_doc(input_)
            example = Example.from_dict(doc, annot)
            example.predicted = model(str(example.text))
            examples.append(example)
        return scorer.score(examples)
    except TypeError:
        # accept the alternative format: dicts whose values are (text, entities)
        for row in data:
            input_, annot = row.values()
            doc = model.make_doc(input_)
            example = Example.from_dict(doc, {'entities': annot})
            example.predicted = model(str(example.text))
            examples.append(example)
        return scorer.score(examples)
    except Exception as e:
        print(e)
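
# Example usage (a minimal sketch; the text and entity span are illustrative):
#
#   nlp = spacy.load("en_core_web_sm")
#   eval_data = [("Apple is buying a U.K. startup.",
#                 {"entities": [(0, 5, "ORG")]})]
#   scores = eval_spacy(nlp, eval_data)
#   print(scores["ents_p"], scores["ents_r"], scores["ents_f"])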