|
import random

import spacy
from spacy.scorer import Scorer
from spacy.training import Example  # used by eval_spacy; was previously supplied only via the wildcard import below
from spacy.util import minibatch, compounding
from tqdm import tqdm

from src.model_utils import *
|
|
|
def train_transformer(config: dict, train_data: list, components: list, iter: int,
                      batch_size=None, entities: list = None, eval_data: list = None) -> tuple:
    """
    Finetune a transformer model or resume training from a fine-tuned model.

    Parameters:
        config: dict, configuration parameters; reads ``config['dir']`` (path to a
            saved model to resume from, or None to start from a blank pipeline)
            and ``config['config']`` (config dict for the transformer pipe)
        train_data: list, training data (consumed by ``make_training_doc``)
        components: list, names of pipeline components to add and train
        iter: int, number of iterations to train
            (name kept for backward compatibility, though it shadows the builtin)
        batch_size: batch-size schedule passed to ``minibatch``; defaults to a
            fresh ``compounding(4.0, 32.0, 1.001)`` schedule on every call
        entities: list of entity labels to register on the NER component
        eval_data: list, evaluation data; when absent, scores are computed on
            ``train_data`` instead

    Returns:
        nlp: the trained spaCy pipeline
        all_losses: list of per-component losses recorded at every iteration
    """
    if batch_size is None:
        # Build the schedule inside the call. A generator placed in the
        # signature default is created once at definition time and shared
        # (and exhausted) across calls -- the classic mutable-default bug.
        batch_size = compounding(4.0, 32.0, 1.001)

    if config['dir'] is not None:
        # Resume training from a previously fine-tuned model on disk.
        nlp = spacy.load(config['dir'])
        optimizer = nlp.resume_training()
    else:
        # Start from a blank English pipeline with a transformer backbone.
        nlp = spacy.blank("en")
        nlp.add_pipe("transformer", config=config['config'])
        for component in components:
            nlp.add_pipe(component)

        if ('ner' in components) and (entities is not None):
            # Register labels on the NER pipe itself. The original grabbed
            # the last component added by the loop above, which was only
            # correct when 'ner' happened to come last in `components`.
            ner = nlp.get_pipe('ner')
            for label in entities:
                ner.add_label(label)

        nlp.initialize()
        optimizer = nlp.create_optimizer()

    train_data_doc = make_training_doc(nlp, train_data)

    all_losses = []
    for itn in tqdm(range(1, iter + 1)):
        print("Starting iteration " + str(itn))
        # Shuffle the converted examples actually fed to minibatch; the
        # original shuffled the raw `train_data`, which had no effect on
        # training order since batches were drawn from `train_data_doc`.
        random.shuffle(train_data_doc)
        losses = {}

        batches = minibatch(train_data_doc, size=batch_size)
        for batch in batches:
            nlp.update(batch, sgd=optimizer, drop=0.2, losses=losses)

        # Score on held-out data when available, otherwise on the training set.
        scores = eval_spacy(nlp, eval_data) if eval_data else eval_spacy(nlp, train_data)
        print("epoch: {} Losses: {} Recall: {} Precision: {} F1: {}".
              format(itn, str(losses), scores['ents_r'], scores['ents_p'], scores['ents_f']))

        all_losses.append([losses[component] for component in components])

    return nlp, all_losses
|
|
|
|
|
|
|
def train_spacy(model, train_data: list, components: list, iter: int,
                batch_size=None, entities: list = None, eval_data: list = None) -> tuple:
    """
    Finetune a spacy model or resume training from a fine-tuned model.

    Parameters:
        model: str, name or path of the spaCy model to load (must not be None)
        train_data: list, training data (consumed by ``make_training_doc``)
        components: list, names of pipeline components to train
        iter: int, number of iterations to train
            (name kept for backward compatibility, though it shadows the builtin)
        batch_size: batch-size schedule passed to ``minibatch``; defaults to a
            fresh ``compounding(4.0, 32.0, 1.001)`` schedule on every call
        entities: list of entity labels to register on the NER component
        eval_data: list, evaluation data; when absent, scores are computed on
            ``train_data`` instead

    Returns:
        nlp: the trained spaCy pipeline
        all_losses: list of per-component losses recorded at every iteration

    Raises:
        ValueError: if ``model`` is None (the original silently fell through
            and crashed later with an undefined ``nlp``)
    """
    if batch_size is None:
        # Fresh schedule per call; a generator default in the signature is
        # evaluated once and shared/exhausted across calls.
        batch_size = compounding(4.0, 32.0, 1.001)

    if model is None:
        raise ValueError("train_spacy requires a model name or path to load")

    nlp, optimizer = load_model(model)

    train_data_doc = make_training_doc(nlp, train_data)

    for component in components:
        if component not in nlp.pipe_names:
            # spaCy v3: add_pipe returns the component instance that actually
            # lives in the pipeline. The original called create_pipe and then
            # add_pipe separately, so labels were added to an orphan component
            # that was never part of the pipeline.
            pipe = nlp.add_pipe(component, last=True)
        else:
            pipe = nlp.get_pipe(component)

        if (component == 'ner') and (entities is not None):
            for ent in entities:
                pipe.add_label(ent)
            # Print only when a NER pipe is guaranteed to exist; the original
            # printed for every component and raised if 'ner' was absent.
            print(f'Entities in the model are: {nlp.get_pipe("ner").labels}')

    # Freeze every pipe we are not training so their weights stay intact.
    other_pipes = [pipe_name for pipe_name in nlp.pipe_names if pipe_name not in components]
    all_losses = []
    with nlp.disable_pipes(*other_pipes):
        for itn in tqdm(range(1, iter + 1)):
            print("Starting iteration " + str(itn))
            # Shuffle the converted examples actually fed to minibatch; the
            # original shuffled the raw `train_data`, which had no effect.
            random.shuffle(train_data_doc)
            losses = {}
            batches = minibatch(train_data_doc, size=batch_size)
            for batch in batches:
                nlp.update(list(batch),
                           losses=losses,
                           drop=0.1,
                           sgd=optimizer)
            # Score on held-out data when available, otherwise the train set.
            scores = eval_spacy(nlp, eval_data) if eval_data else eval_spacy(nlp, train_data)
            print("epoch: {} Losses: {} Recall: {} Precision: {} F1: {}".
                  format(itn, str(losses), scores['ents_r'], scores['ents_p'], scores['ents_f']))

            all_losses.append([losses[component] for component in components])

    return nlp, all_losses
|
|
|
def eval_spacy(model, data):
    """
    Function to perform evaluation and scoring.

    Parameters:
        model: either a spacy model or spacy transformer pipeline
        data: evaluation data; either an iterable of ``(text, annotations)``
            tuples, or an iterable of dicts whose two values are the text and
            the entity annotations (detected via the TypeError fallback)

    Returns:
        score: dict with scores of the model, or None if scoring failed
            (the error is printed, preserving the original best-effort contract)
    """
    scorer = Scorer()
    examples = []
    try:
        # Primary path: rows are (text, annotations) tuples.
        for input_, annot in data:
            doc = model.make_doc(input_)
            example = Example.from_dict(doc, annot)
            example.predicted = model(str(example.text))
            examples.append(example)
        return scorer.score(examples)
    except TypeError:
        # Fallback: rows are dicts. Start the conversion over from scratch --
        # the original did not reset `examples`, so any rows converted before
        # the TypeError fired were scored twice.
        examples = []
        for row in data:
            input_, annot = row.values()
            doc = model.make_doc(input_)
            example = Example.from_dict(doc, {'entities': annot})
            example.predicted = model(str(example.text))
            examples.append(example)
        return scorer.score(examples)
    except Exception as e:
        # Best-effort: report and return None explicitly rather than falling
        # off the end. NOTE(review): callers subscript the result, so a None
        # here still surfaces downstream -- consider re-raising instead.
        print(e)
        return None
|
|
|
|
|
|