|
import json |
|
from pathlib import Path |
|
|
|
import spacy |
|
from spacy.training import Example |
|
|
|
def make_training_doc(nlp: spacy, data: list):
    """
    Turn raw annotated samples into training examples.

    Each item of ``data`` is unpacked as a ``(text, annotations)`` pair.
    The text is tokenized with ``nlp.make_doc`` and paired with its
    annotations through ``Example.from_dict``.

    parameters:
        nlp: model providing ``make_doc``
        data: iterable of ``(text, annotations)`` pairs

    returns:
        training_data: list with one ``Example`` per input pair, in the
        same order as ``data``
    """
    return [
        Example.from_dict(nlp.make_doc(raw_text), gold)
        for raw_text, gold in data
    ]
|
|
|
|
|
def load_model(model: str = None):
    """
    Load an existing pipeline, or create a blank English one.

    parameters:
        model: str or None, name or path passed to ``spacy.load``. When
            None, a blank 'en' pipeline is created instead.

    returns:
        nlp: spacy model object
        optimizer: the optimizer to be used in training. It comes from
            ``nlp.resume_training()`` for a loaded model and from
            ``nlp.initialize()`` for a blank one.
    """
    if model is not None:
        nlp = spacy.load(model)
        print(f"Loaded model '{model}'")
        optimizer = nlp.resume_training()
    else:
        nlp = spacy.blank('en')
        print("Created blank 'en' model")
        # spaCy v3 deprecates Language.begin_training() in favour of
        # Language.initialize(); both return the optimizer, but the old
        # name emits a DeprecationWarning.
        optimizer = nlp.initialize()

    return nlp, optimizer
|
|
|
|
|
def save_model(model: "spacy.language.Language", output_dir: "str | Path | None"):
    """
    Save the model to output_dir.

    Nothing is written when ``output_dir`` is None. Otherwise the
    directory, including any missing parent directories, is created
    before ``model.to_disk`` is called.

    parameters:
        model: spacy model (any object exposing ``to_disk(path)``)
        output_dir: destination directory, or None to skip saving

    returns:
        None
    """
    if output_dir is None:
        return

    target = Path(output_dir)
    # parents=True lets nested destinations such as "models/run1" work on a
    # fresh checkout; exist_ok=True avoids the check-then-create race of
    # calling exists() followed by mkdir().
    target.mkdir(parents=True, exist_ok=True)
    model.to_disk(target)
    print("Saved model to", target)
|
|
|
|
|
def _read_json(path):
    """Parse and return the JSON document stored at ``path`` (read as UTF-8)."""
    with open(path, encoding='utf-8') as f:
        return json.load(f)


def load_data(args):
    """
    Load training data, evaluation data as well as entities dictionary.

    The entities are looked up under ``args['ent_key']`` in the training
    file first. If the key is missing there, the same key is read from the
    file at ``args['ent_dir']``.

    parameters:
        args: dict, configuration from the config file. The keys read are
            'train_dir', 'ent_dir', 'eval_dir' (file paths; the latter two
            may be None) and 'ent_key'.

    returns:
        train_dict, entities_dict, eval_dict
        (eval_dict is None when ``args['eval_dir']`` is None)

    raises:
        AssertionError: if 'train_dir' is None, or if no entities are found
            in the training file and 'ent_dir' is None.
        KeyError: if 'ent_key' is absent from the file at 'ent_dir'.
    """
    # Raised explicitly rather than via ``assert`` so that the check is not
    # stripped under ``python -O``; the exception type is kept for callers.
    train_dir = args['train_dir']
    if train_dir is None:
        raise AssertionError('indicate path for training directory')

    train_dict = _read_json(train_dir)
    print('Loaded Training Data')

    try:
        entities_dict = train_dict[args['ent_key']]
    except KeyError:
        entities_dict = None
        print('No classes for entities found in data loaded. Proceed to check in ent_dir')
    else:
        print('Loaded Entities from Training Data')

    ent_dir = args['ent_dir']
    if entities_dict is None:
        if ent_dir is None:
            raise AssertionError('No entities found from training_dir & ent_dir')
        entities_dict = _read_json(ent_dir)[args['ent_key']]
        print('Loaded Entities from ent_dir')

    eval_dir = args['eval_dir']
    if eval_dir is None:
        return train_dict, entities_dict, None

    eval_dict = _read_json(eval_dir)
    print('Loaded Evaluating Data')
    return train_dict, entities_dict, eval_dict