| from typing import Any, Dict, List, Optional | |
| from .operator import StreamInstanceOperator | |
class IobExtractor(StreamInstanceOperator):
    """Extract entity spans from token sequences tagged with the Inside-Outside-Beginning (IOB) convention.

    Each instance carries parallel ``labels`` and ``tokens`` lists plus the raw
    ``text`` the tokens were taken from. Tokens are located in ``text`` from left
    to right, and every run of a begin label followed by zero or more inside
    labels becomes one span annotated with character offsets, the covered text,
    and the entity type.

    Attributes:
        labels (List[str]): Entity type names, e.g., ["Person", "Organization", "Location"].
        begin_labels (List[str]): Tags that open an entity, e.g., ["B-PER", "B-ORG", "B-LOC"].
            Must be index-aligned with ``labels``: the position of a begin tag in
            this list selects the entity type name.
        inside_labels (List[str]): Tags that continue an entity, e.g., ["I-PER", "I-ORG", "I-LOC"].
        outside_label (str): The tag for tokens outside of any entity, typically "O".
            ``process`` does not consult it directly: any tag that is neither a
            begin nor an inside tag is treated as outside.

    Example of instantiation and usage:

    ```python
    operator = IobExtractor(
        labels=["Person", "Organization", "Location"],
        begin_labels=["B-PER", "B-ORG", "B-LOC"],
        inside_labels=["I-PER", "I-ORG", "I-LOC"],
        outside_label="O",
    )

    instance = {
        "labels": ["B-PER", "I-PER", "O", "O", "B-ORG"],
        "tokens": ["John", "Doe", "works", "at", "OpenAI"],
        "text": "John Doe works at OpenAI",
    }
    processed_instance = operator.process(instance)
    print(processed_instance["spans"])
    # Output: [{'start': 0, 'label': 'Person', 'end': 8, 'text': 'John Doe'},
    #          {'start': 18, 'label': 'Organization', 'end': 24, 'text': 'OpenAI'}]
    ```

    For more details on the IOB tagging convention, see: https://en.wikipedia.org/wiki/Inside-outside-beginning_(tagging)
    """

    labels: List[str]
    begin_labels: List[str]
    inside_labels: List[str]
    outside_label: str

    def process(
        self, instance: Dict[str, Any], stream_name: Optional[str] = None
    ) -> Dict[str, Any]:
        """Add a ``"spans"`` list describing the entities found in ``instance``.

        Args:
            instance: Mapping with the keys ``"labels"`` (IOB tag per token),
                ``"tokens"`` (the tokens, in text order) and ``"text"`` (the
                string the tokens occur in).
            stream_name: Unused by this method.

        Returns:
            The same ``instance`` object, mutated in place, with
            ``instance["spans"]`` set to a list of dicts holding ``"start"``,
            ``"label"``, ``"end"`` and ``"text"`` for each entity.

        Raises:
            ValueError: If a token cannot be found in ``text`` at or after the
                end of the previous token.
            KeyError: If ``instance`` lacks one of the required keys.
        """
        labels = instance["labels"]
        tokens = instance["tokens"]
        text = instance["text"]

        spans = []
        # The span that the next inside label may extend; None when the
        # previous token was outside any entity (or no token was seen yet).
        open_span = None
        current_pos = 0

        # NOTE(review): zip() stops at the shorter list, so surplus labels or
        # tokens are silently ignored - confirm whether callers rely on that.
        for label, token in zip(labels, tokens):
            # Search only forward so repeated tokens map to successive occurrences.
            token_pos = text.find(token, current_pos)
            if token_pos == -1:
                raise ValueError(
                    f"Token '{token}' not found in text '{text}' starting from position {current_pos}"
                )
            end_pos = token_pos + len(token)

            if label in self.begin_labels:
                open_span = {
                    "start": token_pos,
                    "label": self.labels[self.begin_labels.index(label)],
                    "end": end_pos,
                }
                spans.append(open_span)
            elif label in self.inside_labels and open_span is not None:
                # Continue the entity that is currently open.
                open_span["end"] = end_pos
            else:
                # Outside token, unknown tag, or an inside tag with no open
                # entity: close the current span so a later inside tag cannot
                # stretch it across non-entity tokens.
                open_span = None

            current_pos = end_pos

        # Offsets are final only after the loop, so slice the text now.
        for span in spans:
            span["text"] = text[span["start"] : span["end"]]

        instance["spans"] = spans
        return instance