File size: 2,437 Bytes
35b22df
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
"""JSON Reader."""

import json
import re
from typing import Dict, Generator, List, Optional

from gpt_index.readers.base import BaseReader
from gpt_index.readers.schema.base import Document


def _depth_first_yield(
    json_data: Dict, levels_back: int, path: List[str]
) -> Generator[str, None, None]:
    """Do depth first yield of all of the leaf nodes of a JSON.

    Combines keys in the JSON tree using spaces.

    If levels_back is set to 0, prints all levels.

    """
    if isinstance(json_data, dict):
        for key, value in json_data.items():
            new_path = path[:]
            new_path.append(key)
            yield from _depth_first_yield(value, levels_back, new_path)
    elif isinstance(json_data, list):
        for _, value in enumerate(json_data):
            yield from _depth_first_yield(value, levels_back, path)
    else:
        new_path = path[-levels_back:]
        new_path.append(str(json_data))
        yield " ".join(new_path)


class JSONReader(BaseReader):
    """JSON reader.

    Reads JSON documents with options to help suss out relationships between nodes.

    Args:
        levels_back (int): the number of levels to go back in the JSON tree, 0
        if you want all levels. If levels_back is None, then we just format the
        JSON and make each line an embedding

    """

    def __init__(self, levels_back: Optional[int] = None) -> None:
        """Initialize with arguments."""
        super().__init__()
        self.levels_back = levels_back

    def load_data(self, input_file: str) -> List[Document]:
        """Load data from the input file."""
        with open(input_file, "r") as f:
            data = json.load(f)
            if self.levels_back is None:
                # If levels_back isn't set, we just format and make each
                # line an embedding
                json_output = json.dumps(data, indent=0)
                lines = json_output.split("\n")
                useful_lines = [
                    line for line in lines if not re.match(r"^[{}\[\],]*$", line)
                ]
                return [Document("\n".join(useful_lines))]
            elif self.levels_back is not None:
                # If levels_back is set, we make the embeddings contain the labels
                # from further up the JSON tree
                lines = [*_depth_first_yield(data, self.levels_back, [])]
                return [Document("\n".join(lines))]