"""Tool for migrating Index built with V1 data structs to V2."""

import dataclasses
import json
from typing import Dict, List, Optional, Tuple, Type

from gpt_index.constants import DOCSTORE_KEY, INDEX_STRUCT_KEY
from gpt_index.data_structs.table import SQLStructTable

try:
    import fire
except ImportError:
    # Keep the module importable as a library without `fire`; only the
    # command-line entry point at the bottom of the file needs it.
    fire = None  # type: ignore
    print("Please run `pip install fire`")

from gpt_index.data_structs.data_structs import (
    KG,
    ChromaIndexDict,
    EmptyIndex,
    FaissIndexDict,
    IndexDict,
    IndexGraph,
    IndexList,
    IndexStruct,
    KeywordTable,
    Node,
    OpensearchIndexDict,
    PineconeIndexDict,
    QdrantIndexDict,
    SimpleIndexDict,
    WeaviateIndexDict,
)
from gpt_index.data_structs.data_structs_v2 import KG as V2KG
from gpt_index.data_structs.data_structs_v2 import ChromaIndexDict as V2ChromaIndexDict
from gpt_index.data_structs.data_structs_v2 import FaissIndexDict as V2FaissIndexDict
from gpt_index.data_structs.data_structs_v2 import IndexDict as V2IndexDict
from gpt_index.data_structs.data_structs_v2 import IndexGraph as V2IndexGraph
from gpt_index.data_structs.data_structs_v2 import IndexList as V2IndexList
from gpt_index.data_structs.data_structs_v2 import KeywordTable as V2KeywordTable
from gpt_index.data_structs.data_structs_v2 import (
    OpensearchIndexDict as V2OpensearchIndexDict,
)
from gpt_index.data_structs.data_structs_v2 import (
    PineconeIndexDict as V2PineconeIndexDict,
)
from gpt_index.data_structs.data_structs_v2 import QdrantIndexDict as V2QdrantIndexDict
from gpt_index.data_structs.data_structs_v2 import SimpleIndexDict as V2SimpleIndexDict
from gpt_index.data_structs.data_structs_v2 import V2IndexStruct
from gpt_index.data_structs.data_structs_v2 import (
    WeaviateIndexDict as V2WeaviateIndexDict,
)
from gpt_index.data_structs.node_v2 import DocumentRelationship
from gpt_index.data_structs.node_v2 import ImageNode as V2ImageNode
from gpt_index.data_structs.node_v2 import Node as V2Node
from gpt_index.data_structs.struct_type import IndexStructType
from gpt_index.old_docstore import V1DocumentStore
from gpt_index.docstore import DocumentStore as V2DocumentStore
from gpt_index.tools.file_utils import add_prefix_suffix_to_file_path

# Maps each index struct type to the V1 class used to deserialize it.
INDEX_STRUCT_TYPE_TO_V1_INDEX_STRUCT_CLASS: Dict[IndexStructType, Type[IndexStruct]] = {
    IndexStructType.TREE: IndexGraph,
    IndexStructType.LIST: IndexList,
    IndexStructType.KEYWORD_TABLE: KeywordTable,
    IndexStructType.SIMPLE_DICT: SimpleIndexDict,
    IndexStructType.DICT: FaissIndexDict,
    IndexStructType.WEAVIATE: WeaviateIndexDict,
    IndexStructType.PINECONE: PineconeIndexDict,
    IndexStructType.QDRANT: QdrantIndexDict,
    IndexStructType.CHROMA: ChromaIndexDict,
    IndexStructType.VECTOR_STORE: IndexDict,
    IndexStructType.SQL: SQLStructTable,
    IndexStructType.KG: KG,
    IndexStructType.EMPTY: EmptyIndex,
    IndexStructType.NODE: Node,
}

# Top-level keys read from a V1 JSON file.
V1_INDEX_STRUCT_KEY = "index_struct"
V1_INDEX_STRUCT_ID_KEY = "index_struct_id"
V1_DOC_STORE_KEY = "docstore"


def node_to_v2(node: Node) -> V2Node:
    """Convert a single V1 node into its V2 counterpart.

    Args:
        node: The V1 node to convert.

    Returns:
        A ``V2ImageNode`` when ``node.image`` is set, otherwise a ``V2Node``.
        When ``node.ref_doc_id`` is set it is stored as the
        ``DocumentRelationship.SOURCE`` relationship; otherwise the
        relationships mapping is empty.
    """
    if node.ref_doc_id is not None:
        relationships = {
            DocumentRelationship.SOURCE: node.ref_doc_id,
        }
    else:
        relationships = {}

    if node.image is None:
        return V2Node(
            text=node.text,
            doc_id=node.doc_id,
            embedding=node.embedding,
            doc_hash=node.doc_hash,
            extra_info=node.extra_info,
            node_info=node.node_info,
            relationships=relationships,
        )
    return V2ImageNode(
        text=node.text,
        doc_id=node.doc_id,
        embedding=node.embedding,
        doc_hash=node.doc_hash,
        extra_info=node.extra_info,
        node_info=node.node_info,
        image=node.image,
        relationships=relationships,
    )


def index_graph_to_v2(struct: IndexGraph) -> Tuple[V2IndexGraph, List[V2Node]]:
    """Convert a V1 tree index struct to V2.

    Args:
        struct: The V1 ``IndexGraph``.

    Returns:
        A tuple of the V2 graph (which references nodes by doc id instead of
        embedding them) and the list of converted V2 nodes.
    """
    all_nodes_v2 = {
        index: node.get_doc_id() for index, node in struct.all_nodes.items()
    }
    # Only the actual root nodes belong here; iterating `all_nodes` would
    # mark every node in the tree as a root.
    root_nodes_v2 = {
        index: node.get_doc_id() for index, node in struct.root_nodes.items()
    }

    # Translate each node's integer child indices into child doc ids.
    node_id_to_children_ids_v2 = {}
    for node in struct.all_nodes.values():
        node_id_to_children_ids_v2[node.get_doc_id()] = [
            struct.all_nodes[child_index].get_doc_id()
            for child_index in node.child_indices
        ]

    struct_v2 = V2IndexGraph(
        all_nodes=all_nodes_v2,
        root_nodes=root_nodes_v2,
        node_id_to_children_ids=node_id_to_children_ids_v2,
    )
    nodes_v2 = [node_to_v2(node) for node in struct.all_nodes.values()]
    return struct_v2, nodes_v2


def index_list_to_v2(struct: IndexList) -> Tuple[V2IndexList, List[V2Node]]:
    """Convert a V1 list index struct to V2.

    Args:
        struct: The V1 ``IndexList``.

    Returns:
        A tuple of the V2 list struct (holding node doc ids in the original
        order) and the list of converted V2 nodes.
    """
    struct_v2 = V2IndexList(nodes=[node.get_doc_id() for node in struct.nodes])
    nodes_v2 = [node_to_v2(node) for node in struct.nodes]
    return struct_v2, nodes_v2


def keyword_table_to_v2(struct: KeywordTable) -> Tuple[V2KeywordTable, List[V2Node]]:
    """Convert a V1 keyword table index struct to V2.

    Args:
        struct: The V1 ``KeywordTable``.

    Returns:
        A tuple of the V2 keyword table (keyword -> set of node doc ids) and
        the list of converted V2 nodes.
    """
    # V1 maps keywords to keys of `text_chunks`; V2 maps them to doc ids.
    table_v2 = {
        keyword: {struct.text_chunks[index].get_doc_id() for index in indices}
        for keyword, indices in struct.table.items()
    }
    struct_v2 = V2KeywordTable(table=table_v2)
    nodes_v2 = [node_to_v2(node) for node in struct.text_chunks.values()]
    return struct_v2, nodes_v2


def index_dict_to_v2(struct: IndexDict) -> Tuple[V2IndexDict, List[V2Node]]:
    """Convert a V1 vector-store index struct to V2.

    Args:
        struct: The V1 ``IndexDict`` or one of its vendor-specific subclasses.

    Returns:
        A tuple of the V2 struct (of the matching vendor-specific V2 class
        when ``struct`` is a vendor-specific V1 class) and the list of
        converted V2 nodes.

    Raises:
        KeyError: If a node in ``struct.nodes_dict`` has no entry reachable
            through ``struct.id_map``.
    """
    # vector id -> node doc id (V1 went through an intermediate integer id).
    nodes_dict_v2 = {
        vector_id: struct.nodes_dict[int_id].get_doc_id()
        for vector_id, int_id in struct.id_map.items()
    }
    node_id_to_vector_id = {
        node_id: vector_id for vector_id, node_id in nodes_dict_v2.items()
    }

    # source document id -> vector ids of the nodes derived from it.
    doc_id_dict_v2: Dict[str, List[str]] = {}
    for node in struct.nodes_dict.values():
        vector_id = node_id_to_vector_id[node.get_doc_id()]
        if node.ref_doc_id is not None:
            doc_id_dict_v2.setdefault(node.ref_doc_id, []).append(vector_id)

    struct_v2 = V2IndexDict(
        nodes_dict=nodes_dict_v2,
        doc_id_dict=doc_id_dict_v2,
        embeddings_dict=struct.embeddings_dict,
    )
    nodes_v2 = [node_to_v2(node) for node in struct.nodes_dict.values()]

    # Promote the generic V2 struct to the vendor-specific V2 class. Every
    # pair is checked in order (no early exit), so a later match wins.
    v1_to_v2_classes = (
        (SimpleIndexDict, V2SimpleIndexDict),
        (FaissIndexDict, V2FaissIndexDict),
        (PineconeIndexDict, V2PineconeIndexDict),
        (WeaviateIndexDict, V2WeaviateIndexDict),
        (QdrantIndexDict, V2QdrantIndexDict),
        (ChromaIndexDict, V2ChromaIndexDict),
        (OpensearchIndexDict, V2OpensearchIndexDict),
    )
    for v1_cls, v2_cls in v1_to_v2_classes:
        if isinstance(struct, v1_cls):
            struct_v2 = v2_cls(**dataclasses.asdict(struct_v2))
    return struct_v2, nodes_v2


def kg_to_v2(struct: KG) -> Tuple[V2KG, List[V2Node]]:
    """Convert a V1 knowledge-graph index struct to V2.

    Args:
        struct: The V1 ``KG``.

    Returns:
        A tuple of the V2 KG (``table``, ``rel_map`` and ``embedding_dict``
        are carried over unchanged) and the list of converted V2 nodes.
    """
    struct_v2 = V2KG(
        table=struct.table,
        rel_map=struct.rel_map,
        embedding_dict=struct.embedding_dict,
    )
    nodes_v2 = [node_to_v2(node) for node in struct.text_chunks.values()]
    return struct_v2, nodes_v2


def convert_to_v2_index_struct_and_docstore(
    index_struct: IndexStruct, docstore: V1DocumentStore
) -> Tuple[V2IndexStruct, V2DocumentStore]:
    """Convert a V1 index struct into a V2 index struct and V2 docstore.

    Args:
        index_struct: The V1 index struct to migrate.
        docstore: The V1 docstore. It is accepted for interface symmetry but
            not read here; the V2 docstore is built only from the nodes held
            by ``index_struct``.

    Returns:
        A tuple of the V2 index struct and a new V2 docstore containing the
        converted nodes.

    Raises:
        NotImplementedError: If ``index_struct`` is not a tree, list,
            vector-store, knowledge-graph or keyword-table struct.
    """
    struct_v2: V2IndexStruct
    if isinstance(index_struct, IndexGraph):
        struct_v2, nodes_v2 = index_graph_to_v2(index_struct)
    elif isinstance(index_struct, IndexList):
        struct_v2, nodes_v2 = index_list_to_v2(index_struct)
    elif isinstance(index_struct, IndexDict):
        struct_v2, nodes_v2 = index_dict_to_v2(index_struct)
    elif isinstance(index_struct, KG):
        struct_v2, nodes_v2 = kg_to_v2(index_struct)
    elif isinstance(index_struct, KeywordTable):
        struct_v2, nodes_v2 = keyword_table_to_v2(index_struct)
    else:
        raise NotImplementedError(f"Cannot migrate {type(index_struct)} yet.")

    docstore_v2 = V2DocumentStore()
    docstore_v2.add_documents(nodes_v2, allow_update=False)
    return struct_v2, docstore_v2


def load_v1_index_struct_in_docstore(
    file_dict: dict,
) -> Tuple[IndexStruct, V1DocumentStore]:
    """Load a V1 index whose struct is stored inside the docstore.

    Args:
        file_dict: Parsed V1 JSON containing the ``"index_struct_id"`` and
            ``"docstore"`` keys.

    Returns:
        A tuple of the V1 index struct (looked up in the docstore by id) and
        the V1 docstore.
    """
    index_struct_id = file_dict[V1_INDEX_STRUCT_ID_KEY]
    docstore = V1DocumentStore.load_from_dict(
        file_dict[V1_DOC_STORE_KEY],
        type_to_struct=INDEX_STRUCT_TYPE_TO_V1_INDEX_STRUCT_CLASS,  # type: ignore
    )
    index_struct = docstore.get_document(index_struct_id)
    assert isinstance(index_struct, IndexStruct)
    return index_struct, docstore


def load_v1_index_struct_separate(
    file_dict: dict, index_struct_type: IndexStructType
) -> Tuple[IndexStruct, V1DocumentStore]:
    """Load a V1 index whose struct is stored next to the docstore.

    Args:
        file_dict: Parsed V1 JSON containing the ``"index_struct"`` and
            ``"docstore"`` keys.
        index_struct_type: Selects the V1 class used to deserialize the
            index struct.

    Returns:
        A tuple of the V1 index struct and the V1 docstore.
    """
    index_struct_cls = INDEX_STRUCT_TYPE_TO_V1_INDEX_STRUCT_CLASS[index_struct_type]
    index_struct = index_struct_cls.from_dict(file_dict[V1_INDEX_STRUCT_KEY])
    docstore = V1DocumentStore.load_from_dict(
        file_dict[V1_DOC_STORE_KEY],
        type_to_struct=INDEX_STRUCT_TYPE_TO_V1_INDEX_STRUCT_CLASS,  # type: ignore
    )
    return index_struct, docstore


def load_v1(
    file_dict: dict, index_struct_type: Optional[IndexStructType] = None
) -> Tuple[IndexStruct, V1DocumentStore]:
    """Load a V1 index struct and docstore from a parsed V1 JSON dict.

    Both V1 layouts are supported: the struct stored under ``"index_struct"``
    (checked first), or stored in the docstore and referenced through
    ``"index_struct_id"``.

    Args:
        file_dict: Parsed V1 JSON.
        index_struct_type: Required when the struct is stored under
            ``"index_struct"``; unused otherwise.

    Returns:
        A tuple of the V1 index struct and the V1 docstore.

    Raises:
        ValueError: If neither ``"index_struct"`` nor ``"index_struct_id"``
            is present in ``file_dict``.
    """
    if V1_INDEX_STRUCT_KEY in file_dict:
        assert index_struct_type is not None, "Must specify index_struct_type to load."
        index_struct, docstore = load_v1_index_struct_separate(
            file_dict, index_struct_type
        )
    elif V1_INDEX_STRUCT_ID_KEY in file_dict:
        index_struct, docstore = load_v1_index_struct_in_docstore(file_dict)
    else:
        raise ValueError("index_struct or index_struct_id must be provided.")
    return index_struct, docstore


def save_v2(index_struct: V2IndexStruct, docstore: V2DocumentStore) -> dict:
    """Serialize a V2 index struct and docstore into one dict.

    Args:
        index_struct: The V2 index struct.
        docstore: The V2 docstore.

    Returns:
        A dict holding the serialized struct under ``INDEX_STRUCT_KEY`` and
        the serialized docstore under ``DOCSTORE_KEY``.
    """
    return {
        INDEX_STRUCT_KEY: index_struct.to_dict(),
        DOCSTORE_KEY: docstore.serialize_to_dict(),
    }


def convert_to_v2_dict(
    v1_dict: dict, index_struct_type: Optional[IndexStructType] = None
) -> dict:
    """Convert a parsed V1 JSON dict into the V2 dict layout.

    Args:
        v1_dict: Parsed V1 JSON.
        index_struct_type: Forwarded to ``load_v1``; required when the V1
            struct is stored under ``"index_struct"``.

    Returns:
        The V2 representation as produced by ``save_v2``.
    """
    index_struct, docstore = load_v1(v1_dict, index_struct_type)
    index_struct_v2, docstore_v2 = convert_to_v2_index_struct_and_docstore(
        index_struct, docstore
    )
    return save_v2(index_struct_v2, docstore_v2)


def convert_to_v2_file(
    v1_path: str,
    index_struct_type: Optional[IndexStructType] = None,
    v2_path: Optional[str] = None,
    encoding: str = "ascii",
) -> None:
    """Convert a V1 JSON index file into a V2 JSON index file.

    Args:
        v1_path: Path of the V1 JSON file to read.
        index_struct_type: Forwarded to ``convert_to_v2_dict``; required when
            the V1 struct is stored under ``"index_struct"``.
        v2_path: Output path. When omitted (or empty), it is derived from
            ``v1_path`` via ``add_prefix_suffix_to_file_path`` with the
            ``"_v2"`` suffix.
        encoding: Text encoding used when writing the V2 file.
    """
    with open(v1_path, "r") as f:
        file_str = f.read()
    v1_dict = json.loads(file_str)
    print(f"Successfully loaded V1 JSON file from: {v1_path}")

    v2_dict = convert_to_v2_dict(v1_dict, index_struct_type)
    v2_str = json.dumps(v2_dict)

    v2_path = v2_path or add_prefix_suffix_to_file_path(v1_path, suffix="_v2")
    with open(v2_path, "wt", encoding=encoding) as f:
        f.write(v2_str)
    print(f"Successfully created V2 JSON file at: {v2_path}")


if __name__ == "__main__":
    if fire is None:
        # The install hint was already printed at import time; stop cleanly
        # instead of failing on an unusable `fire` name.
        raise SystemExit("The `fire` package is required to run this tool.")
    fire.Fire(convert_to_v2_file)