Spaces:

Kaelan
/

ner_pg

Sleeping

File size: 2,649 Bytes

a197a13

import spacy
from negspacy.negation import Negex
from spacy.matcher import PhraseMatcher
from spacy.tokens import Span

def negation(model: "spacy.language.Language", entities: list) -> "spacy.language.Language":
    """
    Take in the current model pipeline and add in Negation model.
    Add in entities to the negation model

    The 'parser' component is removed (if present) and a 'sentencizer' is
    added in its place, then the 'negex' component is appended. Components
    that are already in the pipeline are not added a second time.

    Parameters:
        model: spacy model (modified in place)
        entities: entity labels the negation component should consider.
            May be a list/tuple of labels, a single label string, or an
            already-built negex config dict (e.g. {"ent_types": [...]}).
            None adds negex with its default configuration.

    Returns:
        model: spacy model with Negation added to the pipeline
    """
    if 'parser' in model.pipe_names:
        model.remove_pipe('parser')
    if 'sentencizer' not in model.pipe_names:
        model.add_pipe('sentencizer')
    if 'negex' not in model.pipe_names:
        # add_pipe() only accepts a dict for `config`; negex reads the
        # entity labels from its "ent_types" key, so wrap bare labels.
        if entities is None:
            config = {}
        elif isinstance(entities, dict):
            config = dict(entities)
        elif isinstance(entities, str):
            config = {'ent_types': [entities]}
        else:
            config = {'ent_types': list(entities)}
        model.add_pipe('negex', config=config)

    return model

def infer_negation(neg_model: "spacy.language.Language", model: "spacy.language.Language", text: str, pred_doc: "spacy.tokens.Doc"):
    """
    To match results from the negation model with the results from the model.
    Replace the entity type of the spans or tokens in the predictions doc
    that should be negated with entity type "NEG".

    A span in pred_doc is relabelled only when both its token text and its
    exact token position (start and end) agree with an entity that the
    negation model flagged. Existing entities in pred_doc that overlap a
    relabelled span are dropped in favour of the "NEG" span.

    Parameters:
        neg_model: spacy negation model
        model: spacy model
        text: text sample
        pred_doc: prediction of the text sample from model (modified in place)

    Returns:
        pred_doc: spacy doc with all entities that should be negated replaced with the "NEG" entity type

    """
    doc = neg_model(text)
    results = {'ent': [], 'start': [], 'end': []}
    for ent in doc.ents:
        if ent._.negex:
            results['ent'].append(ent.text)
            results['start'].append(ent.start)
            results['end'].append(ent.end)
    print('Negation: ', results)

    # Nothing was negated: the predictions stay exactly as they are.
    if not results['ent']:
        return pred_doc

    patterns = [model.make_doc(ent_text) for ent_text in results['ent']]
    matcher = PhraseMatcher(model.vocab)
    matcher.add('NEG', patterns)

    # Token positions of the negated entities. Requiring the full
    # (start, end) pair, not only the start, prevents a shorter pattern that
    # begins on the same token from producing an overlapping span, which
    # would make the doc.ents assignment below fail.
    pending = set(zip(results['start'], results['end']))

    new_entities = []
    entities = list(pred_doc.ents)

    # match all the tokens or span of text detected to be negated with the prediction doc.
    for match_id, start, end in matcher(pred_doc):
        if (start, end) not in pending:
            continue
        # Each negated position is consumed once so a repeated match cannot
        # add the same span twice.
        pending.discard((start, end))
        new_entities.append(Span(pred_doc, start, end, label=match_id))
        # Drop predicted entities that overlap the negated span.
        entities = [
            e for e in entities if e.end <= start or e.start >= end
        ]
    pred_doc.ents = tuple(entities) + tuple(new_entities)

    return pred_doc