Spaces:
Runtime error
Runtime error
"""Tool for migrating Index built with V1 data structs to V2.""" | |
import dataclasses | |
import json | |
from typing import Dict, List, Optional, Tuple, Type | |
from gpt_index.constants import DOCSTORE_KEY, INDEX_STRUCT_KEY | |
from gpt_index.data_structs.table import SQLStructTable | |
try: | |
import fire | |
except ImportError: | |
print("Please run `pip install fire`") | |
from gpt_index.data_structs.data_structs import ( | |
KG, | |
ChromaIndexDict, | |
EmptyIndex, | |
FaissIndexDict, | |
IndexDict, | |
IndexGraph, | |
IndexList, | |
IndexStruct, | |
KeywordTable, | |
Node, | |
OpensearchIndexDict, | |
PineconeIndexDict, | |
QdrantIndexDict, | |
SimpleIndexDict, | |
WeaviateIndexDict, | |
) | |
from gpt_index.data_structs.data_structs_v2 import KG as V2KG | |
from gpt_index.data_structs.data_structs_v2 import ChromaIndexDict as V2ChromaIndexDict | |
from gpt_index.data_structs.data_structs_v2 import FaissIndexDict as V2FaissIndexDict | |
from gpt_index.data_structs.data_structs_v2 import IndexDict as V2IndexDict | |
from gpt_index.data_structs.data_structs_v2 import IndexGraph as V2IndexGraph | |
from gpt_index.data_structs.data_structs_v2 import IndexList as V2IndexList | |
from gpt_index.data_structs.data_structs_v2 import KeywordTable as V2KeywordTable | |
from gpt_index.data_structs.data_structs_v2 import ( | |
OpensearchIndexDict as V2OpensearchIndexDict, | |
) | |
from gpt_index.data_structs.data_structs_v2 import ( | |
PineconeIndexDict as V2PineconeIndexDict, | |
) | |
from gpt_index.data_structs.data_structs_v2 import QdrantIndexDict as V2QdrantIndexDict | |
from gpt_index.data_structs.data_structs_v2 import SimpleIndexDict as V2SimpleIndexDict | |
from gpt_index.data_structs.data_structs_v2 import V2IndexStruct | |
from gpt_index.data_structs.data_structs_v2 import ( | |
WeaviateIndexDict as V2WeaviateIndexDict, | |
) | |
from gpt_index.data_structs.node_v2 import DocumentRelationship | |
from gpt_index.data_structs.node_v2 import ImageNode as V2ImageNode | |
from gpt_index.data_structs.node_v2 import Node as V2Node | |
from gpt_index.data_structs.struct_type import IndexStructType | |
from gpt_index.old_docstore import V1DocumentStore | |
from gpt_index.docstore import DocumentStore as V2DocumentStore | |
from gpt_index.tools.file_utils import add_prefix_suffix_to_file_path | |
# Maps each IndexStructType tag to the V1 data-struct class used to
# deserialize it. Passed as ``type_to_struct`` when loading a V1 docstore and
# used directly to pick the class for a separately-stored V1 index struct.
# NOTE(review): OpensearchIndexDict is imported and handled in
# index_dict_to_v2 but has no entry here — confirm whether that is intended.
INDEX_STRUCT_TYPE_TO_V1_INDEX_STRUCT_CLASS: Dict[
    IndexStructType, Type[IndexStruct]
] = {
    # Tree, list and keyword-table structs.
    IndexStructType.TREE: IndexGraph,
    IndexStructType.LIST: IndexList,
    IndexStructType.KEYWORD_TABLE: KeywordTable,
    # IndexDict-style structs.
    IndexStructType.SIMPLE_DICT: SimpleIndexDict,
    IndexStructType.DICT: FaissIndexDict,
    IndexStructType.WEAVIATE: WeaviateIndexDict,
    IndexStructType.PINECONE: PineconeIndexDict,
    IndexStructType.QDRANT: QdrantIndexDict,
    IndexStructType.CHROMA: ChromaIndexDict,
    IndexStructType.VECTOR_STORE: IndexDict,
    # Remaining struct kinds.
    IndexStructType.SQL: SQLStructTable,
    IndexStructType.KG: KG,
    IndexStructType.EMPTY: EmptyIndex,
    IndexStructType.NODE: Node,
}

# Top-level keys read from a V1 save file.
V1_INDEX_STRUCT_KEY = "index_struct"  # index struct stored inline
V1_INDEX_STRUCT_ID_KEY = "index_struct_id"  # id of the struct inside the docstore
V1_DOC_STORE_KEY = "docstore"  # serialized V1 docstore
def node_to_v2(node: Node) -> V2Node:
    """Convert a V1 ``Node`` into the equivalent V2 node.

    A non-``None`` ``ref_doc_id`` is carried over as a
    ``DocumentRelationship.SOURCE`` relationship; otherwise the relationships
    mapping is empty.

    Args:
        node: The V1 node to convert.

    Returns:
        A ``V2ImageNode`` when ``node.image`` is set, else a ``V2Node``.
        Text, ids, embedding, hash, extra info and node info are copied over.
    """
    source_doc_id = node.ref_doc_id
    relationships = (
        {} if source_doc_id is None else {DocumentRelationship.SOURCE: source_doc_id}
    )

    # Fields common to both V2 node flavours.
    shared_fields = dict(
        text=node.text,
        doc_id=node.doc_id,
        embedding=node.embedding,
        doc_hash=node.doc_hash,
        extra_info=node.extra_info,
        node_info=node.node_info,
        relationships=relationships,
    )

    if node.image is not None:
        return V2ImageNode(image=node.image, **shared_fields)
    return V2Node(**shared_fields)
def index_graph_to_v2(struct: IndexGraph) -> Tuple[V2IndexGraph, List[V2Node]]:
    """Convert a V1 ``IndexGraph`` into a ``V2IndexGraph`` and its V2 nodes.

    The V2 struct stores node ids (as returned by ``get_doc_id()``) in place
    of the node objects, and expresses parent/child links as a mapping from
    node id to the ids of its children.

    Args:
        struct: The V1 tree index struct.

    Returns:
        A ``(struct_v2, nodes_v2)`` tuple, where ``nodes_v2`` holds every node
        in ``struct.all_nodes`` converted with ``node_to_v2``.
    """
    all_nodes_v2 = {
        index: node.get_doc_id() for index, node in struct.all_nodes.items()
    }
    # Fix: roots must come from ``struct.root_nodes``. Iterating ``all_nodes``
    # here marked every node in the tree as a root.
    root_nodes_v2 = {
        index: node.get_doc_id() for index, node in struct.root_nodes.items()
    }

    # Resolve each node's child indices to child node ids via ``all_nodes``.
    node_id_to_children_ids_v2 = {
        node.get_doc_id(): [
            struct.all_nodes[child_index].get_doc_id()
            for child_index in node.child_indices
        ]
        for node in struct.all_nodes.values()
    }

    struct_v2 = V2IndexGraph(
        all_nodes=all_nodes_v2,
        root_nodes=root_nodes_v2,
        node_id_to_children_ids=node_id_to_children_ids_v2,
    )
    nodes_v2 = [node_to_v2(node) for node in struct.all_nodes.values()]
    return struct_v2, nodes_v2
def index_list_to_v2(struct: IndexList) -> Tuple[V2IndexList, List[V2Node]]:
    """Convert a V1 ``IndexList`` into a ``V2IndexList`` and its V2 nodes.

    Args:
        struct: The V1 list index struct.

    Returns:
        A ``(struct_v2, nodes_v2)`` tuple. ``struct_v2.nodes`` holds the node
        ids in their original order; ``nodes_v2`` holds the converted nodes.
    """
    v1_nodes = struct.nodes
    ordered_ids = [v1_node.get_doc_id() for v1_node in v1_nodes]
    struct_v2 = V2IndexList(nodes=ordered_ids)
    nodes_v2 = list(map(node_to_v2, v1_nodes))
    return struct_v2, nodes_v2
def keyword_table_to_v2(struct: KeywordTable) -> Tuple[V2KeywordTable, List[V2Node]]:
    """Convert a V1 ``KeywordTable`` into a ``V2KeywordTable`` and its V2 nodes.

    Each keyword's chunk indices are resolved through ``struct.text_chunks``
    and replaced by the set of the corresponding node ids.

    Args:
        struct: The V1 keyword-table index struct.

    Returns:
        A ``(struct_v2, nodes_v2)`` tuple, where ``nodes_v2`` holds every
        entry of ``struct.text_chunks`` converted with ``node_to_v2``.
    """
    chunks = struct.text_chunks
    table_v2 = {}
    for keyword, chunk_indices in struct.table.items():
        table_v2[keyword] = {chunks[index].get_doc_id() for index in chunk_indices}

    struct_v2 = V2KeywordTable(table=table_v2)
    nodes_v2 = list(map(node_to_v2, chunks.values()))
    return struct_v2, nodes_v2
def index_dict_to_v2(struct: IndexDict) -> Tuple[V2IndexDict, List[V2Node]]:
    """Convert a V1 ``IndexDict`` (or subclass) into its V2 form and V2 nodes.

    The V2 struct maps vector ids to node ids, and groups vector ids under
    the ``ref_doc_id`` of the node they belong to. ``embeddings_dict`` is
    passed through as-is. If ``struct`` is one of the known V1 subclasses,
    the result is rebuilt as the matching V2 subclass.

    Args:
        struct: The V1 vector index struct.

    Returns:
        A ``(struct_v2, nodes_v2)`` tuple.

    Raises:
        KeyError: If an ``id_map`` value is missing from ``nodes_dict``, or a
            node in ``nodes_dict`` is not reachable through ``id_map``.
    """
    v1_nodes = struct.nodes_dict

    # vector id -> node id, resolved through the V1 integer-id map.
    nodes_dict_v2 = {}
    for vector_id, int_id in struct.id_map.items():
        nodes_dict_v2[vector_id] = v1_nodes[int_id].get_doc_id()

    # Inverse lookup: node id -> vector id.
    node_id_to_vector_id = {}
    for vector_id, node_id in nodes_dict_v2.items():
        node_id_to_vector_id[node_id] = vector_id

    doc_id_dict_v2: Dict[str, List[str]] = {}
    for v1_node in v1_nodes.values():
        # Looked up before the ref_doc_id check, so a node absent from
        # id_map raises KeyError even when it has no source document.
        owning_vector_id = node_id_to_vector_id[v1_node.get_doc_id()]
        source_doc_id = v1_node.ref_doc_id
        if source_doc_id is None:
            continue
        doc_id_dict_v2.setdefault(source_doc_id, []).append(owning_vector_id)

    struct_v2 = V2IndexDict(
        nodes_dict=nodes_dict_v2,
        doc_id_dict=doc_id_dict_v2,
        embeddings_dict=struct.embeddings_dict,
    )
    nodes_v2 = [node_to_v2(v1_node) for v1_node in v1_nodes.values()]

    # V1 subclass -> V2 subclass. Every matching pair is applied in order
    # (no early exit), re-wrapping the fields of the current V2 struct.
    subclass_pairs = (
        (SimpleIndexDict, V2SimpleIndexDict),
        (FaissIndexDict, V2FaissIndexDict),
        (PineconeIndexDict, V2PineconeIndexDict),
        (WeaviateIndexDict, V2WeaviateIndexDict),
        (QdrantIndexDict, V2QdrantIndexDict),
        (ChromaIndexDict, V2ChromaIndexDict),
        (OpensearchIndexDict, V2OpensearchIndexDict),
    )
    for v1_cls, v2_cls in subclass_pairs:
        if isinstance(struct, v1_cls):
            struct_v2 = v2_cls(**dataclasses.asdict(struct_v2))

    return struct_v2, nodes_v2
def kg_to_v2(struct: KG) -> Tuple[V2KG, List[V2Node]]:
    """Convert a V1 ``KG`` struct into a ``V2KG`` and its V2 nodes.

    ``table``, ``rel_map`` and ``embedding_dict`` are passed to the V2 struct
    unchanged.

    Args:
        struct: The V1 knowledge-graph index struct.

    Returns:
        A ``(struct_v2, nodes_v2)`` tuple, where ``nodes_v2`` holds every
        entry of ``struct.text_chunks`` converted with ``node_to_v2``.
    """
    passthrough_fields = {
        "table": struct.table,
        "rel_map": struct.rel_map,
        "embedding_dict": struct.embedding_dict,
    }
    struct_v2 = V2KG(**passthrough_fields)
    nodes_v2 = list(map(node_to_v2, struct.text_chunks.values()))
    return struct_v2, nodes_v2
def convert_to_v2_index_struct_and_docstore(
    index_struct: IndexStruct, docstore: V1DocumentStore
) -> Tuple[V2IndexStruct, V2DocumentStore]:
    """Convert a V1 index struct into a V2 struct plus a fresh V2 docstore.

    The first converter whose V1 class matches ``index_struct`` is used. The
    converted nodes are then added to a new ``V2DocumentStore`` with
    ``allow_update=False``.

    Args:
        index_struct: The V1 index struct to migrate.
        docstore: The V1 docstore. Accepted for interface symmetry; this
            function does not read it.

    Returns:
        A ``(struct_v2, docstore_v2)`` tuple.

    Raises:
        NotImplementedError: If no converter exists for the struct's type.
    """
    # Checked in this order; the first isinstance match wins.
    converters = (
        (IndexGraph, index_graph_to_v2),
        (IndexList, index_list_to_v2),
        (IndexDict, index_dict_to_v2),
        (KG, kg_to_v2),
        (KeywordTable, keyword_table_to_v2),
    )

    struct_v2: V2IndexStruct
    for v1_cls, convert in converters:
        if isinstance(index_struct, v1_cls):
            struct_v2, nodes_v2 = convert(index_struct)
            break
    else:
        raise NotImplementedError(f"Cannot migrate {type(index_struct)} yet.")

    docstore_v2 = V2DocumentStore()
    docstore_v2.add_documents(nodes_v2, allow_update=False)
    return struct_v2, docstore_v2
def load_v1_index_struct_in_docstore(
    file_dict: dict,
) -> Tuple[IndexStruct, V1DocumentStore]:
    """Load a V1 save whose index struct is stored inside the docstore.

    Args:
        file_dict: Parsed V1 save file; must contain the index-struct id and
            the serialized docstore.

    Returns:
        An ``(index_struct, docstore)`` tuple, where the struct is the
        docstore document with the recorded id.

    Raises:
        KeyError: If a required key is missing from ``file_dict``.
        AssertionError: If the looked-up document is not an ``IndexStruct``.
    """
    struct_id = file_dict[V1_INDEX_STRUCT_ID_KEY]
    serialized_docstore = file_dict[V1_DOC_STORE_KEY]
    docstore = V1DocumentStore.load_from_dict(
        serialized_docstore,
        type_to_struct=INDEX_STRUCT_TYPE_TO_V1_INDEX_STRUCT_CLASS,  # type: ignore
    )
    loaded_struct = docstore.get_document(struct_id)
    assert isinstance(loaded_struct, IndexStruct)
    return loaded_struct, docstore
def load_v1_index_struct_separate(
    file_dict: dict, index_struct_type: IndexStructType
) -> Tuple[IndexStruct, V1DocumentStore]:
    """Load a V1 save whose index struct is stored next to the docstore.

    Args:
        file_dict: Parsed V1 save file; must contain the serialized index
            struct and the serialized docstore.
        index_struct_type: Selects the V1 class used to deserialize the
            index struct.

    Returns:
        An ``(index_struct, docstore)`` tuple.

    Raises:
        KeyError: If ``index_struct_type`` has no registered V1 class or a
            required key is missing from ``file_dict``.
    """
    struct_cls = INDEX_STRUCT_TYPE_TO_V1_INDEX_STRUCT_CLASS[index_struct_type]
    serialized_struct = file_dict[V1_INDEX_STRUCT_KEY]
    index_struct = struct_cls.from_dict(serialized_struct)

    serialized_docstore = file_dict[V1_DOC_STORE_KEY]
    docstore = V1DocumentStore.load_from_dict(
        serialized_docstore,
        type_to_struct=INDEX_STRUCT_TYPE_TO_V1_INDEX_STRUCT_CLASS,  # type: ignore
    )
    return index_struct, docstore
def load_v1(
    file_dict: dict, index_struct_type: Optional[IndexStructType] = None
) -> Tuple[IndexStruct, V1DocumentStore]:
    """Load a V1 index struct and docstore from a parsed V1 save file.

    Two layouts are recognised, checked in this order: the index struct is
    stored inline, or only its id is stored and the struct lives in the
    docstore.

    Args:
        file_dict: Parsed V1 save file.
        index_struct_type: Required for the inline layout; unused otherwise.

    Returns:
        An ``(index_struct, docstore)`` tuple.

    Raises:
        AssertionError: If the inline layout is found but
            ``index_struct_type`` is ``None``.
        ValueError: If neither layout's key is present.
    """
    if V1_INDEX_STRUCT_KEY in file_dict:
        assert index_struct_type is not None, "Must specify index_struct_type to load."
        return load_v1_index_struct_separate(file_dict, index_struct_type)

    if V1_INDEX_STRUCT_ID_KEY in file_dict:
        return load_v1_index_struct_in_docstore(file_dict)

    raise ValueError("index_struct or index_struct_id must be provided.")
def save_v2(index_struct: V2IndexStruct, docstore: V2DocumentStore) -> dict:
    """Serialize a V2 index struct and docstore into one save dictionary.

    Args:
        index_struct: The V2 index struct, serialized via ``to_dict()``.
        docstore: The V2 docstore, serialized via ``serialize_to_dict()``.

    Returns:
        A dict holding the two payloads under ``INDEX_STRUCT_KEY`` and
        ``DOCSTORE_KEY``.
    """
    serialized_struct = index_struct.to_dict()
    serialized_docstore = docstore.serialize_to_dict()
    return {
        INDEX_STRUCT_KEY: serialized_struct,
        DOCSTORE_KEY: serialized_docstore,
    }
def convert_to_v2_dict(
    v1_dict: dict, index_struct_type: Optional[IndexStructType] = None
) -> dict:
    """Convert a parsed V1 save dictionary into a V2 save dictionary.

    Loads the V1 struct and docstore, converts both to V2, and serializes
    the result.

    Args:
        v1_dict: Parsed V1 save file.
        index_struct_type: Forwarded to ``load_v1``.

    Returns:
        The V2 save dictionary produced by ``save_v2``.
    """
    v1_struct, v1_docstore = load_v1(v1_dict, index_struct_type)
    converted = convert_to_v2_index_struct_and_docstore(v1_struct, v1_docstore)
    return save_v2(*converted)
def convert_to_v2_file(
    v1_path: str,
    index_struct_type: Optional[IndexStructType] = None,
    v2_path: Optional[str] = None,
    encoding: str = "ascii",
) -> None:
    """Convert a V1 JSON save file into a V2 JSON save file.

    Args:
        v1_path: Path of the V1 JSON file to read.
        index_struct_type: Forwarded to ``convert_to_v2_dict``.
        v2_path: Output path. When falsy, it is derived from ``v1_path`` by
            adding a ``_v2`` suffix.
        encoding: Text encoding used when writing the V2 file.
    """
    # NOTE: the input is read with the platform default encoding; the
    # ``encoding`` argument applies to the output file only.
    with open(v1_path, "r") as v1_file:
        v1_dict = json.load(v1_file)
    print(f"Successfully loaded V1 JSON file from: {v1_path}")

    # Serialize fully before opening the output so that a failed conversion
    # never creates or truncates the destination file.
    v2_str = json.dumps(convert_to_v2_dict(v1_dict, index_struct_type))

    if not v2_path:
        v2_path = add_prefix_suffix_to_file_path(v1_path, suffix="_v2")
    with open(v2_path, "wt", encoding=encoding) as v2_file:
        v2_file.write(v2_str)
    print(f"Successfully created V2 JSON file at: {v2_path}")
if __name__ == "__main__":
    # The module-level import guard only prints when ``fire`` is missing, so
    # the call below would fail with a NameError. Import it again here and
    # stop with the install hint instead.
    try:
        import fire
    except ImportError as err:
        raise SystemExit("Please run `pip install fire`") from err

    # Expose convert_to_v2_file's parameters as command-line arguments.
    fire.Fire(convert_to_v2_file)