File size: 5,038 Bytes
91eaff6
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
#!/usr/bin/env python
# -*- coding: utf-8 -*-

"""
These classes represent graph elements.

Consider this "flavor" of graph representation to be a superset of
`openCypher` _labeled property graphs_ (LPG) with additional support
for probabilistic graphs.

Imposing a discipline of IRIs for node names and edge relations
helps guarantee that a view of the graph can be exported to RDF
for data quality checks, transitive closure, semantic inference,
and so on.

see copyright/license https://huggingface.co/spaces/DerwenAI/textgraphs/blob/main/README.md
"""

from dataclasses import dataclass, field
import typing

import spacy  # pylint: disable=E0401

from .util import EnumBase


######################################################################
## class definitions

@dataclass  # defaults apply: mutable instances, no ordering methods
class KGSearchHit:  # pylint: disable=R0902
    """
One result returned by a search against a _knowledge graph_.

Instances are plain mutable records; equality compares all fields.
    """
    iri: str  # IRI identifying the matched entry
    label: str  # label of the matched entry
    descrip: str  # description text of the matched entry
    aliases: typing.List[str]  # alternative names for the matched entry
    prob: float  # score for this hit -- NOTE(review): presumably a probability, confirm range


@dataclass  # defaults apply: mutable instances, no ordering methods
class LinkedEntity:  # pylint: disable=R0902
    """
The outcome of linking one entity to an IRI.

All fields except `count` are required, in the order declared below.
    """
    span: typing.Optional[spacy.tokens.span.Span]  # source `spaCy` span, when there is one
    iri: str  # IRI that the entity was linked to
    length: int  # NOTE(review): unit (tokens vs. characters) is not visible here -- confirm
    rel: str  # relation name for the link
    prob: float  # score for the link -- NOTE(review): presumably a probability, confirm
    token_id: int  # NOTE(review): presumably an index of the related token -- confirm
    kg_ent: typing.Optional[KGSearchHit]  # knowledge graph search hit, when there is one
    count: int = 1  # starts at one


@dataclass  # defaults apply: mutable instances, no ordering methods
class NounChunk:  # pylint: disable=R0902
    """
One noun chunk, which serves as a candidate for an extracted phrase.

All fields except `start` are required, in the order declared below.
    """
    span: spacy.tokens.span.Span  # the `spaCy` span for this chunk
    text: str  # text of the chunk
    length: int  # NOTE(review): unit (tokens vs. characters) is not visible here -- confirm
    lemma_key: str  # NOTE(review): presumably a key built from the lemmas -- confirm
    unseen: bool  # NOTE(review): meaning is set by callers outside this file -- confirm
    sent_id: int  # NOTE(review): presumably the index of the enclosing sentence -- confirm
    start: int = 0  # defaults to zero


class NodeEnum (EnumBase):
    """
The categories which a node may belong to.
    """
    DEP = 0  # dependency from a `spaCy` parse
    LEM = 1  # token in lemmatized form
    ENT = 2  # named entity
    CHU = 3  # noun chunk
    IRI = 4  # IRI of a linked entity

    @property
    def decoder (self) -> typing.List[ str ]:
        """
Short string codes for the members, listed in the same order
as the integer values declared above.

    returns:
list of string codes, one per member
        """
        codes: typing.List[ str ] = [ "dep", "lem", "ent", "chu", "iri" ]
        return codes


@dataclass(order=False, frozen=False)
class Node:  # pylint: disable=R0902
    """
A data class representing one node, i.e., an extracted phrase.
    """
    node_id: int  # integer identifier for this node
    key: str  # used as the display name for `NodeEnum.LEM` nodes
    text: str  # default display name; its length sizes the `get_pos()` span
    pos: str  # part-of-speech tag; `"VERB"` zeroes out `get_stacked_count()`
    kind: NodeEnum  # node category
    # source `spaCy` object, either a `Span` or a single `Token`
    span: typing.Optional[ typing.Union[ spacy.tokens.span.Span, spacy.tokens.token.Token ]] = None
    # NOTE(review): presumably the positions where this node occurs -- confirm against callers
    loc: typing.List[ typing.List[ int ] ] = field(default_factory = list)
    label: typing.Optional[ str ] = None  # display name for `NodeEnum.IRI` nodes
    length: int = 1
    sub_obj: bool = False
    count: int = 0  # reported by `get_stacked_count()` unless redacted
    neighbors: int = 0
    weight: float = 0.0
    # linked entities; the first one supplies the IRI in `get_linked_label()`
    entity: typing.List[ LinkedEntity ] = field(default_factory = list)
    annotated: bool = False


    def get_linked_label (
        self
        ) -> typing.Optional[ str ]:
        """
When this node has a linked entity, return the IRI of the first one.
Otherwise return its `label` value, which may be `None`.

    returns:
a label for the linked entity
        """
        if self.entity:
            return self.entity[0].iri

        return self.label


    def get_name (
        self
        ) -> str:
        """
Return a brief name for the graphical depiction of this Node:
the `label` for an IRI node, the `key` for a lemma node, and
the `text` for every other kind.

    returns:
brief label to be used in a graph
        """
        if self.kind == NodeEnum.IRI:
            # `label` is optional, so this can be `None` if it was never set
            return self.label  # type: ignore

        if self.kind == NodeEnum.LEM:
            return self.key

        return self.text


    def get_stacked_count (
        self
        ) -> int:
        """
Return a modified count, to redact verbs and linked entities from
the stack-rank partitions.

    returns:
count, used for re-ranking extracted entities; zero for verbs and IRI nodes
        """
        if self.pos == "VERB" or self.kind == NodeEnum.IRI:
            return 0

        return self.count


    def get_pos (
        self
        ) -> typing.Tuple[ int, int ]:
        """
Generate a position span for `OpenNRE`.

The span begins at the character offset of `span` and ends at
that offset plus the length of `text`, minus one.

    returns:
a position span needed for `OpenNRE` relation extraction

    raises:
`AttributeError` when `span` is `None`
        """
        span = self.span

        # a `Token` exposes its character offset as `idx`, whereas a
        # `Span` exposes it as `start_char` and has no `idx` attribute
        start: typing.Optional[ int ] = getattr(span, "idx", None)

        if start is None:
            # also the path taken for a missing span, which raises
            # `AttributeError` just as the direct `idx` access did
            start = span.start_char  # type: ignore

        return ( start, start + len(self.text) - 1, )


class RelEnum (EnumBase):
    """
The categories which an edge relation may belong to.
    """
    DEP = 0  # dependency from a `spaCy` parse
    CHU = 1  # noun chunk from `spaCy`
    INF = 2  # relation inferred by `REBEL` or `OpenNRE`
    SYN = 3  # synonym inferred by `sense2vec`
    IRI = 4  # entity linked through `DBPedia` or `Wikidata`

    @property
    def decoder (self) -> typing.List[ str ]:
        """
Short string codes for the members, listed in the same order
as the integer values declared above.

    returns:
list of string codes, one per member
        """
        codes: typing.List[ str ] = [ "dep", "chu", "inf", "syn", "iri" ]
        return codes


@dataclass  # defaults apply: mutable instances, no ordering methods
class Edge:
    """
A connection between two nodes of the graph.

All fields except `count` are required, in the order declared below.
    """
    src_node: int  # NOTE(review): presumably the `node_id` of the source node -- confirm
    dst_node: int  # NOTE(review): presumably the `node_id` of the destination node -- confirm
    kind: RelEnum  # category of the relation
    rel: str  # name of the relation
    prob: float  # score for the edge -- NOTE(review): presumably a probability, confirm
    count: int = 1  # starts at one