import random

import spacy
from spacy.language import Language
from spacy.scorer import Scorer
from spacy.training import Example
from spacy.util import minibatch, compounding
from tqdm import tqdm

from src.model_utils import *  # provides make_training_doc and load_model

def train_transformer(config: dict, train_data: list, components: list, iter: int,
                      batch_size=None, entities: list = None, eval_data: list = None) -> tuple:
    """
    Finetune a transformer model or resume training from a fine-tuned model.

    Parameters:
        config: dict, configuration parameters
        train_data: list, contain training data  
        components: list, list of components to be trained
        iter: int, number of iterations to train
        batch_size: int, batch size to be used for training
        entities: list of entities to be trained on for NER
        eval_data: list, containing evaluation data
    
    Returns:
        nlp : spacy transformer
        losses: list  of  the losses at every iteration


    """
    # build a fresh batch-size schedule per call (a shared generator default
    # would keep its state across calls)
    if batch_size is None:
        batch_size = compounding(4.0, 32.0, 1.001)

    if config['dir'] is not None:
        nlp = spacy.load(config['dir'])
        optimizer = nlp.resume_training()
    else:
        nlp = spacy.blank("en")  # empty English pipeline
        nlp.add_pipe("transformer", config=config['config'])
        for component in components:
            task = nlp.add_pipe(component)
            # add entity labels only to the NER component
            if (component == 'ner') and (entities is not None):
                for label in entities:
                    task.add_label(label)

        nlp.initialize()  # required before training a freshly built pipeline
        optimizer = nlp.create_optimizer()

    # convert data into training Examples (materialized as a list so it can be
    # shuffled and re-batched every iteration)
    train_data_doc = list(make_training_doc(nlp, train_data))
    
    all_losses = []
    for itn in tqdm(range(1, iter + 1)):
        print("Starting iteration " + str(itn))
        # shuffle the Examples that are actually batched, not the raw data
        random.shuffle(train_data_doc)
        losses = {}
        batches = minibatch(train_data_doc, size=batch_size)
        for batch in batches:
            nlp.update(batch, sgd=optimizer, drop=0.2, losses=losses)

        scores = eval_spacy(nlp, eval_data) if eval_data else eval_spacy(nlp, train_data)
        print("epoch: {} Losses: {} Recall: {} Precision: {} F1: {}".format(
            itn, str(losses), scores['ents_r'], scores['ents_p'], scores['ents_f']))

        all_losses.append([losses[component] for component in components])
    
    return nlp, all_losses
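
# Example usage (a sketch, not part of the module): the config keys below
# follow what train_transformer reads ('dir' and 'config'); the transformer
# model name and TRAIN_DATA are hypothetical placeholders.
#
# config = {
#     "dir": None,  # or a path to a previously fine-tuned pipeline to resume
#     "config": {"model": {"name": "roberta-base"}},
# }
# TRAIN_DATA = [("Uber raised $1 billion", {"entities": [(0, 4, "ORG")]})]
# nlp, losses = train_transformer(config, TRAIN_DATA, components=["ner"],
#                                 iter=10, entities=["ORG"])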



def train_spacy(model: str, train_data: list, components: list, iter: int,
                batch_size=None, entities: list = None, eval_data: list = None) -> tuple:
    """
    Finetune a spacy model or resume training from a fine-tuned model.

    Parameters:
        model: str, name of spacy model
        train_data: list, contain training data  
        components: list, list of components to be trained
        iter: int, number of iterations to train
        batch_size: int, batch size to be used for training
        entities: list of entities to be trained on for NER
        eval_data: list, containing evaluation data
    
    Returns:
        nlp : spacy model
        losses: list  of  the losses at every iteration

    """

    if batch_size is None:
        batch_size = compounding(4.0, 32.0, 1.001)

    # get model and optimizer; load_model (from src.model_utils) handles both
    # existing spaCy models and blank models
    nlp, optimizer = load_model(model)

    # convert data into training Examples (materialized so it can be shuffled)
    train_data_doc = list(make_training_doc(nlp, train_data))

    # create the built-in pipeline components and add them to the pipeline;
    # nlp.add_pipe works for built-ins that are registered with spaCy
    for component in components:
        if component not in nlp.pipe_names:
            pipe = nlp.add_pipe(component, last=True)
        else:
            pipe = nlp.get_pipe(component)

        # add labels if the component is NER
        if (component == 'ner') and (entities is not None):
            for ent in entities:
                pipe.add_label(ent)

    if 'ner' in nlp.pipe_names:
        print(f'Entities in the model are: {nlp.get_pipe("ner").labels}')
    
    # get names of other pipes to disable them during training
    other_pipes = [pipe for pipe in nlp.pipe_names if pipe not in components]
    all_losses = []
    with nlp.select_pipes(disable=other_pipes):  # only train the listed components
        for itn in tqdm(range(1, iter + 1)):
            print("Starting iteration " + str(itn))
            # shuffle the Examples that are actually batched, not the raw data
            random.shuffle(train_data_doc)
            losses = {}
            batches = minibatch(train_data_doc, size=batch_size)
            for batch in batches:
                nlp.update(list(batch),
                           losses=losses,
                           drop=0.1,
                           sgd=optimizer)
            scores = eval_spacy(nlp, eval_data) if eval_data else eval_spacy(nlp, train_data)
            print("epoch: {} Losses: {} Recall: {} Precision: {} F1: {}".format(
                itn, str(losses), scores['ents_r'], scores['ents_p'], scores['ents_f']))

            all_losses.append([losses[component] for component in components])

    return nlp, all_losses
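
# Example usage (a sketch; "en_core_web_sm" and TRAIN_DATA are illustrative,
# and load_model from src.model_utils is assumed to accept a model name):
#
# TRAIN_DATA = [("Apple is looking at buying a U.K. startup",
#                {"entities": [(0, 5, "ORG")]})]
# nlp, losses = train_spacy("en_core_web_sm", TRAIN_DATA, components=["ner"],
#                           iter=20, entities=["ORG"])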

def eval_spacy(model: Language, data: list) -> dict:
    """
    Perform evaluation and scoring.

    Parameters:
        model: either a spaCy model or a spaCy transformer pipeline
        data: evaluation data to score the model on

    Returns:
        scores: dict with the scores of the model
    """
    scorer = Scorer()
    examples = []
    try:
        # accept spaCy-format data: (text, annotations) tuples
        for input_, annot in data:
            doc = model.make_doc(input_)
            example = Example.from_dict(doc, annot)
            example.predicted = model(str(example.text))
            examples.append(example)
        return scorer.score(examples)
    except TypeError:
        # accept alternative format: dicts whose two values are the text and
        # the entity annotations, in that order
        examples = []
        for row in data:
            input_, annot = row.values()
            doc = model.make_doc(input_)
            example = Example.from_dict(doc, {'entities': annot})
            example.predicted = model(str(example.text))
            examples.append(example)
        return scorer.score(examples)
    except Exception as e:
        print(e)
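
# Example usage (a sketch showing both accepted data layouts; the entity
# offsets are illustrative):
#
# nlp = spacy.load("en_core_web_sm")
# tuple_data = [("Apple is a company", {"entities": [(0, 5, "ORG")]})]
# dict_data = [{"text": "Apple is a company", "entities": [(0, 5, "ORG")]}]
# scores = eval_spacy(nlp, tuple_data)
# print(scores["ents_p"], scores["ents_r"], scores["ents_f"])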