Spaces:
Runtime error
Runtime error
"""JSON Reader.""" | |
import json | |
import re | |
from typing import Any, Generator, List, Optional | |
from gpt_index.readers.base import BaseReader | |
from gpt_index.readers.schema.base import Document | |
def _depth_first_yield( | |
json_data: Any, | |
levels_back: int, | |
collapse_length: Optional[int], | |
path: List[str], | |
) -> Generator[str, None, None]: | |
"""Do depth first yield of all of the leaf nodes of a JSON. | |
Combines keys in the JSON tree using spaces. | |
If levels_back is set to 0, prints all levels. | |
If collapse_length is not None and the json_data is <= that number | |
of characters, then we collapse it into one line. | |
""" | |
if isinstance(json_data, dict) or isinstance(json_data, list): | |
# only try to collapse if we're not at a leaf node | |
json_str = json.dumps(json_data) | |
if collapse_length is not None and len(json_str) <= collapse_length: | |
new_path = path[-levels_back:] | |
new_path.append(json_str) | |
yield " ".join(new_path) | |
return | |
elif isinstance(json_data, dict): | |
for key, value in json_data.items(): | |
new_path = path[:] | |
new_path.append(key) | |
yield from _depth_first_yield( | |
value, levels_back, collapse_length, new_path | |
) | |
elif isinstance(json_data, list): | |
for _, value in enumerate(json_data): | |
yield from _depth_first_yield(value, levels_back, collapse_length, path) | |
else: | |
new_path = path[-levels_back:] | |
new_path.append(str(json_data)) | |
yield " ".join(new_path) | |
class JSONReader(BaseReader):
    """JSON reader.

    Reads JSON documents with options to help suss out relationships between
    nodes.

    Args:
        levels_back (int): the number of levels to go back in the JSON tree, 0
            if you want all levels. If levels_back is None, then we just
            format the JSON and make each line an embedding.
        collapse_length (int): the maximum number of characters a JSON
            fragment may have to be collapsed onto one line of the output
            (levels_back needs to be not None).
            ex: if collapse_length = 10, and the input is
            {"a": [1, 2, 3], "b": {"hello": "world", "foo": "bar"}}
            then a would be collapsed into one line, while b would not.
            Recommend starting around 100 and then adjusting from there.
    """

    def __init__(
        self, levels_back: Optional[int] = None, collapse_length: Optional[int] = None
    ) -> None:
        """Initialize with arguments."""
        super().__init__()
        self.levels_back = levels_back
        self.collapse_length = collapse_length

    def load_data(self, input_file: str) -> List[Document]:
        """Load data from the input file.

        Args:
            input_file: Path of the JSON file to read.

        Returns:
            A list holding a single Document whose text is the extracted
            lines joined with newlines.
        """
        # JSON text is UTF-8; do not depend on the platform default encoding.
        with open(input_file, "r", encoding="utf-8") as f:
            data = json.load(f)

        if self.levels_back is None:
            # If levels_back isn't set, we just format and make each
            # line an embedding. Lines made up only of brackets, braces and
            # commas (including empty lines) carry no content and are dropped.
            json_output = json.dumps(data, indent=0)
            useful_lines = [
                line
                for line in json_output.split("\n")
                if not re.match(r"^[{}\[\],]*$", line)
            ]
            return [Document("\n".join(useful_lines))]

        # If levels_back is set, we make the embeddings contain the labels
        # from further up the JSON tree.
        lines = list(
            _depth_first_yield(data, self.levels_back, self.collapse_length, [])
        )
        return [Document("\n".join(lines))]