Spaces:
Runtime error
Runtime error
File size: 3,704 Bytes
b699122 |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 |
"""JSON Reader."""
import json
import re
from typing import Any, Generator, List, Optional
from gpt_index.readers.base import BaseReader
from gpt_index.readers.schema.base import Document
def _depth_first_yield(
    json_data: Any,
    levels_back: int,
    collapse_length: Optional[int],
    path: List[str],
) -> Generator[str, None, None]:
    """Yield one text line per leaf of a parsed JSON value, depth first.

    Every yielded line consists of the trailing ``levels_back`` dictionary
    keys that lead to the leaf, followed by the leaf itself, joined with
    single spaces. Only dictionary keys extend the path; list elements are
    visited with the path of the list that contains them.

    Args:
        json_data: The value to walk. Dicts and lists are descended into;
            anything else is treated as a leaf and rendered with ``str()``
            (so ``None`` is emitted as ``"None"``).
        levels_back: How many trailing path keys to keep in each line.
            ``0`` keeps the whole path, because ``path[-0:]`` is the full
            list.
        collapse_length: If not ``None``, any dict or list whose
            ``json.dumps`` form is at most this many characters is emitted
            as a single line instead of being descended into.
        path: Keys leading from the root down to ``json_data``. It is never
            mutated; extended copies are passed to recursive calls.

    Yields:
        The space-joined line for each leaf or collapsed container.
    """
    if isinstance(json_data, (dict, list)):
        # Only containers are candidates for collapsing. Serialise solely
        # when a limit is set: the string is unused otherwise, and dumping
        # every subtree would be wasted work repeated at each nesting level.
        if collapse_length is not None:
            json_str = json.dumps(json_data)
            if len(json_str) <= collapse_length:
                yield " ".join([*path[-levels_back:], json_str])
                return
        if isinstance(json_data, dict):
            for key, value in json_data.items():
                yield from _depth_first_yield(
                    value, levels_back, collapse_length, [*path, key]
                )
        else:
            # Lists contribute no path component of their own.
            for value in json_data:
                yield from _depth_first_yield(
                    value, levels_back, collapse_length, path
                )
    else:
        yield " ".join([*path[-levels_back:], str(json_data)])
class JSONReader(BaseReader):
    """JSON reader.

    Reads a JSON file and returns its content as text lines wrapped in a
    single ``Document``, with options to help suss out relationships
    between nodes.

    Args:
        levels_back (Optional[int]): The number of levels to go back in the
            JSON tree when labelling each leaf, 0 if you want all levels.
            If levels_back is None, the JSON is simply pretty-printed and
            every line that is not pure punctuation is kept.
        collapse_length (Optional[int]): The maximum number of characters a
            JSON fragment may have to be collapsed onto one output line
            (only used when levels_back is not None).
            ex: if collapse_length = 10, and the input is
            {"a": [1, 2, 3], "b": {"hello": "world", "foo": "bar"}}
            then a would be collapsed into one line, while b would not.
            Recommend starting around 100 and then adjusting from there.
    """

    def __init__(
        self, levels_back: Optional[int] = None, collapse_length: Optional[int] = None
    ) -> None:
        """Initialize with arguments."""
        super().__init__()
        self.levels_back = levels_back
        self.collapse_length = collapse_length

    def load_data(self, input_file: str) -> List[Document]:
        """Load data from the input file.

        Args:
            input_file: Path of the JSON file to read.

        Returns:
            A one-element list holding a ``Document`` built from the
            newline-joined output lines.
        """
        # JSON text is UTF-8 by specification; decoding explicitly avoids
        # depending on the platform's locale encoding.
        with open(input_file, "r", encoding="utf-8") as f:
            data = json.load(f)

        if self.levels_back is None:
            # If levels_back isn't set, we just format and make each
            # line an embedding. indent=0 puts every element on its own
            # line without leading whitespace.
            json_output = json.dumps(data, indent=0)
            lines = json_output.split("\n")
            # Drop lines made up only of braces, brackets and commas
            # (including empty lines); they carry no content.
            useful_lines = [
                line for line in lines if not re.match(r"^[{}\[\],]*$", line)
            ]
            return [Document("\n".join(useful_lines))]

        # levels_back is set: make the embeddings contain the labels
        # from further up the JSON tree.
        lines = list(
            _depth_first_yield(data, self.levels_back, self.collapse_length, [])
        )
        return [Document("\n".join(lines))]
|