# ner_pg/src/trainers.py
import random

import spacy
from spacy.scorer import Scorer
from spacy.training import Example
from spacy.util import minibatch, compounding
from tqdm import tqdm

from src.model_utils import *  # provides make_training_doc and load_model


def train_transformer(config: dict, train_data: list, components: list, n_iter: int,
                      batch_size=None, entities: list = None, eval_data: list = None) -> tuple:
"""
Finetune a transformer model or resume training from a fine-tuned model.
Parameters:
config: dict, configuration parameters
train_data: list, contain training data
components: list, list of components to be trained
iter: int, number of iterations to train
batch_size: int, batch size to be used for training
entities: list of entities to be trained on for NER
eval_data: list, containing evaluation data
Returns:
nlp : spacy transformer
losses: list of the losses at every iteration
"""
    if config['dir'] is not None:
        # resume training from a previously fine-tuned model
        nlp = spacy.load(config['dir'])
        optimizer = nlp.resume_training()
    else:
        nlp = spacy.blank("en")  # empty English pipeline
        nlp.add_pipe("transformer", config=config['config'])
        for component in components:
            nlp.add_pipe(component)
        # add entity labels before initializing, so the NER head knows about them
        if ('ner' in components) and (entities is not None):
            ner = nlp.get_pipe("ner")
            for label in entities:
                ner.add_label(label)
        nlp.initialize()  # required before training a freshly built pipeline
        optimizer = nlp.create_optimizer()
    if batch_size is None:
        batch_size = compounding(4.0, 32.0, 1.001)
    # convert data into training examples
    train_data_doc = make_training_doc(nlp, train_data)
    all_losses = []
    for itn in tqdm(range(1, n_iter + 1)):
        print("Starting iteration " + str(itn))
        # shuffle the converted examples, since these are what minibatch consumes
        random.shuffle(train_data_doc)
        losses = {}
        batches = minibatch(train_data_doc, size=batch_size)
        for batch in batches:
            nlp.update(batch, sgd=optimizer, drop=0.2, losses=losses)
        # score on the eval data if provided, otherwise on the training data
        scores = eval_spacy(nlp, eval_data) if eval_data else eval_spacy(nlp, train_data)
        print(f"epoch: {itn} Losses: {losses} Recall: {scores['ents_r']} "
              f"Precision: {scores['ents_p']} F1: {scores['ents_f']}")
        all_losses.append([losses[component] for component in components])
    return nlp, all_losses
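
# Example usage (a minimal sketch; the "roberta-base" transformer config below is
# an assumption about what config['config'] holds, and train_data is shown in the
# (text, {"entities": [...]}) format that make_training_doc is expected to accept):
#
#   config = {
#       "dir": None,  # or a path to a fine-tuned model to resume from
#       "config": {"model": {"name": "roberta-base"}},
#   }
#   train_data = [("Apple is buying a U.K. startup.",
#                  {"entities": [(0, 5, "ORG")]})]
#   nlp, all_losses = train_transformer(config, train_data, components=["ner"],
#                                       n_iter=10, entities=["ORG"])
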
def train_spacy(model: str, train_data: list, components: list, n_iter: int,
                batch_size=None, entities: list = None, eval_data: list = None) -> tuple:
"""
Finetune a spacy model or resume training from a fine-tuned model.
Parameters:
model: str, name of spacy model
train_data: list, contain training data
components: list, list of components to be trained
iter: int, number of iterations to train
batch_size: int, batch size to be used for training
entities: list of entities to be trained on for NER
eval_data: list, containing evaluation data
Returns:
nlp : spacy model
losses: list of the losses at every iteration
"""
    # get model and optimizer; load_model handles both existing and blank models
    nlp, optimizer = load_model(model)
    if batch_size is None:
        batch_size = compounding(4.0, 32.0, 1.001)
    # convert data into training examples
    train_data_doc = make_training_doc(nlp, train_data)
    # add the built-in pipeline components to the pipeline if they are missing;
    # in spaCy v3, add_pipe takes the registered component name and returns the pipe
    for component in components:
        if component not in nlp.pipe_names:
            ner = nlp.add_pipe(component, last=True)
        else:
            ner = nlp.get_pipe(component)
        # add labels if the component is NER
        if (component == 'ner') and (entities is not None):
            for ent in entities:
                ner.add_label(ent)
    if 'ner' in nlp.pipe_names:
        print(f'Entities in the model are: {nlp.get_pipe("ner").labels}')
    # get names of the other pipes, to disable them during training
    other_pipes = [pipe for pipe in nlp.pipe_names if pipe not in components]
    all_losses = []
    with nlp.select_pipes(disable=other_pipes):  # only train the listed components
        for itn in tqdm(range(1, n_iter + 1)):
            print("Starting iteration " + str(itn))
            # shuffle the converted examples, since these are what minibatch consumes
            random.shuffle(train_data_doc)
            losses = {}
            batches = minibatch(train_data_doc, size=batch_size)
            for batch in batches:
                nlp.update(list(batch),
                           losses=losses,
                           drop=0.1,
                           sgd=optimizer)
            # score on the eval data if provided, otherwise on the training data
            scores = eval_spacy(nlp, eval_data) if eval_data else eval_spacy(nlp, train_data)
            print(f"epoch: {itn} Losses: {losses} Recall: {scores['ents_r']} "
                  f"Precision: {scores['ents_p']} F1: {scores['ents_f']}")
            all_losses.append([losses[component] for component in components])
    return nlp, all_losses
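
# Example usage (a minimal sketch; assumes load_model resolves a model name such
# as "en_core_web_sm" to a pipeline plus optimizer, and that train_data uses the
# (text, {"entities": [...]}) format; the output path is hypothetical):
#
#   train_data = [("Apple is buying a U.K. startup.",
#                  {"entities": [(0, 5, "ORG")]})]
#   nlp, all_losses = train_spacy("en_core_web_sm", train_data, components=["ner"],
#                                 n_iter=10, entities=["ORG"])
#   nlp.to_disk("models/ner_finetuned")
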
def eval_spacy(model, data) -> dict:
    """
    Perform evaluation and scoring.
    Parameters:
        model: either a spaCy model or a spaCy transformer pipeline
        data: list, evaluation data on which scoring is done
    Returns:
        scores: dict with the scores of the model
    """
    scorer = Scorer()
    examples = []
    try:
        # accept spaCy-format data: (text, annotations) tuples
        for input_, annot in data:
            doc = model.make_doc(input_)
            example = Example.from_dict(doc, annot)
            example.predicted = model(str(example.text))
            examples.append(example)
        return scorer.score(examples)
    except TypeError:
        # accept the alternative format: dicts whose values are (text, entities)
        for row in data:
            input_, annot = row.values()
            doc = model.make_doc(input_)
            example = Example.from_dict(doc, {'entities': annot})
            example.predicted = model(str(example.text))
            examples.append(example)
        return scorer.score(examples)
    except Exception as e:
        print(e)
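
# Example usage (a minimal sketch; the text and entity span are illustrative):
#
#   nlp = spacy.load("en_core_web_sm")
#   eval_data = [("Apple is buying a U.K. startup.",
#                 {"entities": [(0, 5, "ORG")]})]
#   scores = eval_spacy(nlp, eval_data)
#   print(scores["ents_p"], scores["ents_r"], scores["ents_f"])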