Spaces:
Runtime error
Runtime error
"""JSON Reader.""" | |
import json | |
import re | |
from typing import Dict, Generator, List, Optional | |
from gpt_index.readers.base import BaseReader | |
from gpt_index.readers.schema.base import Document | |
def _depth_first_yield( | |
json_data: Dict, levels_back: int, path: List[str] | |
) -> Generator[str, None, None]: | |
"""Do depth first yield of all of the leaf nodes of a JSON. | |
Combines keys in the JSON tree using spaces. | |
If levels_back is set to 0, prints all levels. | |
""" | |
if isinstance(json_data, dict): | |
for key, value in json_data.items(): | |
new_path = path[:] | |
new_path.append(key) | |
yield from _depth_first_yield(value, levels_back, new_path) | |
elif isinstance(json_data, list): | |
for _, value in enumerate(json_data): | |
yield from _depth_first_yield(value, levels_back, path) | |
else: | |
new_path = path[-levels_back:] | |
new_path.append(str(json_data)) | |
yield " ".join(new_path) | |
class JSONReader(BaseReader): | |
"""JSON reader. | |
Reads JSON documents with options to help suss out relationships between nodes. | |
Args: | |
levels_back (int): the number of levels to go back in the JSON tree, 0 | |
if you want all levels. If levels_back is None, then we just format the | |
JSON and make each line an embedding | |
""" | |
def __init__(self, levels_back: Optional[int] = None) -> None: | |
"""Initialize with arguments.""" | |
super().__init__() | |
self.levels_back = levels_back | |
def load_data(self, input_file: str) -> List[Document]: | |
"""Load data from the input file.""" | |
with open(input_file, "r") as f: | |
data = json.load(f) | |
if self.levels_back is None: | |
# If levels_back isn't set, we just format and make each | |
# line an embedding | |
json_output = json.dumps(data, indent=0) | |
lines = json_output.split("\n") | |
useful_lines = [ | |
line for line in lines if not re.match(r"^[{}\[\],]*$", line) | |
] | |
return [Document("\n".join(useful_lines))] | |
elif self.levels_back is not None: | |
# If levels_back is set, we make the embeddings contain the labels | |
# from further up the JSON tree | |
lines = [*_depth_first_yield(data, self.levels_back, [])] | |
return [Document("\n".join(lines))] | |