nobita-gpt / gpt_index /tools /migrate_v1_to_v2.py
binhnase04854's picture
first deploy
b699122
raw
history blame
11.3 kB
"""Tool for migrating Index built with V1 data structs to V2."""
import dataclasses
import json
from typing import Dict, List, Optional, Tuple, Type
from gpt_index.constants import DOCSTORE_KEY, INDEX_STRUCT_KEY
from gpt_index.data_structs.table import SQLStructTable
try:
import fire
except ImportError:
print("Please run `pip install fire`")
from gpt_index.data_structs.data_structs import (
KG,
ChromaIndexDict,
EmptyIndex,
FaissIndexDict,
IndexDict,
IndexGraph,
IndexList,
IndexStruct,
KeywordTable,
Node,
OpensearchIndexDict,
PineconeIndexDict,
QdrantIndexDict,
SimpleIndexDict,
WeaviateIndexDict,
)
from gpt_index.data_structs.data_structs_v2 import KG as V2KG
from gpt_index.data_structs.data_structs_v2 import ChromaIndexDict as V2ChromaIndexDict
from gpt_index.data_structs.data_structs_v2 import FaissIndexDict as V2FaissIndexDict
from gpt_index.data_structs.data_structs_v2 import IndexDict as V2IndexDict
from gpt_index.data_structs.data_structs_v2 import IndexGraph as V2IndexGraph
from gpt_index.data_structs.data_structs_v2 import IndexList as V2IndexList
from gpt_index.data_structs.data_structs_v2 import KeywordTable as V2KeywordTable
from gpt_index.data_structs.data_structs_v2 import (
OpensearchIndexDict as V2OpensearchIndexDict,
)
from gpt_index.data_structs.data_structs_v2 import (
PineconeIndexDict as V2PineconeIndexDict,
)
from gpt_index.data_structs.data_structs_v2 import QdrantIndexDict as V2QdrantIndexDict
from gpt_index.data_structs.data_structs_v2 import SimpleIndexDict as V2SimpleIndexDict
from gpt_index.data_structs.data_structs_v2 import V2IndexStruct
from gpt_index.data_structs.data_structs_v2 import (
WeaviateIndexDict as V2WeaviateIndexDict,
)
from gpt_index.data_structs.node_v2 import DocumentRelationship
from gpt_index.data_structs.node_v2 import ImageNode as V2ImageNode
from gpt_index.data_structs.node_v2 import Node as V2Node
from gpt_index.data_structs.struct_type import IndexStructType
from gpt_index.old_docstore import V1DocumentStore
from gpt_index.docstore import DocumentStore as V2DocumentStore
from gpt_index.tools.file_utils import add_prefix_suffix_to_file_path
# Lookup from index struct type tag to the V1 index struct class used for it.
# It is handed to V1DocumentStore.load_from_dict as `type_to_struct`, and is
# also used to pick the class when the index struct is stored under its own
# key in the V1 file (see load_v1_index_struct_separate).
# NOTE(review): OpensearchIndexDict is imported above but has no entry here —
# confirm whether that omission is intentional.
INDEX_STRUCT_TYPE_TO_V1_INDEX_STRUCT_CLASS: Dict[IndexStructType, Type[IndexStruct]] = {
    IndexStructType.TREE: IndexGraph,
    IndexStructType.LIST: IndexList,
    IndexStructType.KEYWORD_TABLE: KeywordTable,
    IndexStructType.SIMPLE_DICT: SimpleIndexDict,
    IndexStructType.DICT: FaissIndexDict,
    IndexStructType.WEAVIATE: WeaviateIndexDict,
    IndexStructType.PINECONE: PineconeIndexDict,
    IndexStructType.QDRANT: QdrantIndexDict,
    IndexStructType.CHROMA: ChromaIndexDict,
    IndexStructType.VECTOR_STORE: IndexDict,
    IndexStructType.SQL: SQLStructTable,
    IndexStructType.KG: KG,
    IndexStructType.EMPTY: EmptyIndex,
    IndexStructType.NODE: Node,
}

# Top-level keys looked up in a V1 save file.
# Holds the serialized index struct itself (older "separate" layout).
V1_INDEX_STRUCT_KEY = "index_struct"
# Holds the id under which the index struct is stored inside the docstore.
V1_INDEX_STRUCT_ID_KEY = "index_struct_id"
# Holds the serialized V1 document store.
V1_DOC_STORE_KEY = "docstore"
def node_to_v2(node: Node) -> V2Node:
    """Build the V2 counterpart of a single V1 node.

    The V1 ``ref_doc_id`` (when set) is re-expressed as a
    ``DocumentRelationship.SOURCE`` relationship; all other fields are
    copied across unchanged.

    Args:
        node: The V1 node to convert.

    Returns:
        A ``V2ImageNode`` when the V1 node carries an image, otherwise a
        plain ``V2Node``.
    """
    source_links = (
        {}
        if node.ref_doc_id is None
        else {DocumentRelationship.SOURCE: node.ref_doc_id}
    )
    # Fields common to both the text-only and the image variant.
    shared_fields = dict(
        text=node.text,
        doc_id=node.doc_id,
        embedding=node.embedding,
        doc_hash=node.doc_hash,
        extra_info=node.extra_info,
        node_info=node.node_info,
        relationships=source_links,
    )
    if node.image is not None:
        return V2ImageNode(image=node.image, **shared_fields)
    return V2Node(**shared_fields)
def index_graph_to_v2(struct: IndexGraph) -> Tuple[V2IndexGraph, List[V2Node]]:
    """Convert a V1 tree index struct into its V2 form plus the V2 nodes.

    V1 stores node objects inside the struct; V2 stores only node ids in the
    struct and keeps the node objects separately, so both are returned.

    Args:
        struct: The V1 ``IndexGraph`` to migrate.

    Returns:
        A tuple ``(struct_v2, nodes_v2)`` where ``struct_v2`` references nodes
        by id and ``nodes_v2`` holds one converted node per entry of
        ``struct.all_nodes``.
    """
    all_nodes_v2 = {
        index: node.get_doc_id() for index, node in struct.all_nodes.items()
    }
    # Roots must be taken from struct.root_nodes. Building this mapping from
    # all_nodes (as was done before) marks every node of the tree as a root.
    root_nodes_v2 = {
        index: node.get_doc_id() for index, node in struct.root_nodes.items()
    }
    # V1 links parents to children by integer index into all_nodes; V2 links
    # them by node id, so each child index is resolved to its id here.
    node_id_to_children_ids_v2: Dict[str, List[str]] = {
        node.get_doc_id(): [
            struct.all_nodes[child_index].get_doc_id()
            for child_index in node.child_indices
        ]
        for node in struct.all_nodes.values()
    }
    struct_v2 = V2IndexGraph(
        all_nodes=all_nodes_v2,
        root_nodes=root_nodes_v2,
        node_id_to_children_ids=node_id_to_children_ids_v2,
    )
    nodes_v2 = [node_to_v2(node) for node in struct.all_nodes.values()]
    return struct_v2, nodes_v2
def index_list_to_v2(struct: IndexList) -> Tuple[V2IndexList, List[V2Node]]:
    """Convert a V1 list index struct into its V2 form plus the V2 nodes.

    Args:
        struct: The V1 ``IndexList`` to migrate.

    Returns:
        A tuple of the V2 struct (holding node ids in the original order) and
        the list of converted nodes.
    """
    ordered_ids = [v1_node.get_doc_id() for v1_node in struct.nodes]
    migrated_struct = V2IndexList(nodes=ordered_ids)
    migrated_nodes = list(map(node_to_v2, struct.nodes))
    return migrated_struct, migrated_nodes
def keyword_table_to_v2(struct: KeywordTable) -> Tuple[V2KeywordTable, List[V2Node]]:
    """Convert a V1 keyword table into its V2 form plus the V2 nodes.

    Each keyword's collection of chunk indices is replaced by the set of ids
    of the corresponding text chunks.

    Args:
        struct: The V1 ``KeywordTable`` to migrate.

    Returns:
        A tuple of the V2 keyword table and the converted text-chunk nodes.
    """
    keyword_to_node_ids = {}
    for keyword, chunk_indices in struct.table.items():
        keyword_to_node_ids[keyword] = {
            struct.text_chunks[chunk_index].get_doc_id()
            for chunk_index in chunk_indices
        }
    migrated_struct = V2KeywordTable(table=keyword_to_node_ids)
    migrated_nodes = [node_to_v2(chunk) for chunk in struct.text_chunks.values()]
    return migrated_struct, migrated_nodes
def index_dict_to_v2(struct: IndexDict) -> Tuple[V2IndexDict, List[V2Node]]:
    """Convert a V1 vector-store index struct into its V2 form plus nodes.

    Args:
        struct: The V1 ``IndexDict`` (or one of its vendor-specific
            subclasses) to migrate.

    Returns:
        A tuple of the V2 struct and the converted nodes. When ``struct`` is
        an instance of a vendor-specific V1 class, the V2 struct is rebuilt
        as the matching vendor-specific V2 class.

    Raises:
        KeyError: If a node in ``struct.nodes_dict`` has no vector id
            reachable through ``struct.id_map``.
    """
    # vector id -> node id, resolved through the V1 integer id indirection.
    vector_to_node_id = {}
    for vec_id, int_id in struct.id_map.items():
        vector_to_node_id[vec_id] = struct.nodes_dict[int_id].get_doc_id()
    node_to_vector_id = {
        node_id: vec_id for vec_id, node_id in vector_to_node_id.items()
    }

    # source document id -> vector ids of the nodes derived from it.
    ref_doc_to_vector_ids: Dict[str, List[str]] = {}
    for v1_node in struct.nodes_dict.values():
        # The lookup runs for every node, including those without a source
        # document, so an unmapped node always raises.
        vec_id = node_to_vector_id[v1_node.get_doc_id()]
        if v1_node.ref_doc_id is None:
            continue
        ref_doc_to_vector_ids.setdefault(v1_node.ref_doc_id, []).append(vec_id)

    struct_v2 = V2IndexDict(
        nodes_dict=vector_to_node_id,
        doc_id_dict=ref_doc_to_vector_ids,
        embeddings_dict=struct.embeddings_dict,
    )
    nodes_v2 = [node_to_v2(v1_node) for v1_node in struct.nodes_dict.values()]

    # Every matching pair is applied in order (no early exit), so a struct
    # matching several V1 classes ends up as the last matching V2 class.
    vendor_classes = (
        (SimpleIndexDict, V2SimpleIndexDict),
        (FaissIndexDict, V2FaissIndexDict),
        (PineconeIndexDict, V2PineconeIndexDict),
        (WeaviateIndexDict, V2WeaviateIndexDict),
        (QdrantIndexDict, V2QdrantIndexDict),
        (ChromaIndexDict, V2ChromaIndexDict),
        (OpensearchIndexDict, V2OpensearchIndexDict),
    )
    for v1_cls, v2_cls in vendor_classes:
        if isinstance(struct, v1_cls):
            struct_v2 = v2_cls(**dataclasses.asdict(struct_v2))
    return struct_v2, nodes_v2
def kg_to_v2(struct: KG) -> Tuple[V2KG, List[V2Node]]:
    """Convert a V1 knowledge-graph struct into its V2 form plus nodes.

    The ``table``, ``rel_map`` and ``embedding_dict`` attributes are passed
    through as-is; only the text chunks are converted to V2 nodes.

    Args:
        struct: The V1 ``KG`` to migrate.

    Returns:
        A tuple of the V2 struct and the converted text-chunk nodes.
    """
    carried_over = {
        "table": struct.table,
        "rel_map": struct.rel_map,
        "embedding_dict": struct.embedding_dict,
    }
    migrated_struct = V2KG(**carried_over)
    migrated_nodes = list(map(node_to_v2, struct.text_chunks.values()))
    return migrated_struct, migrated_nodes
def convert_to_v2_index_struct_and_docstore(
    index_struct: IndexStruct, docstore: V1DocumentStore
) -> Tuple[V2IndexStruct, V2DocumentStore]:
    """Migrate a V1 index struct to V2 and build a V2 docstore for its nodes.

    Args:
        index_struct: The V1 index struct to migrate.
        docstore: The V1 document store loaded alongside the struct. It is
            accepted for interface symmetry but is not read by this function.

    Returns:
        A tuple of the V2 index struct and a fresh V2 document store holding
        the converted nodes.

    Raises:
        NotImplementedError: If ``index_struct`` is not one of the supported
            V1 struct classes.
    """
    # Checked in this order; the first matching class decides the converter.
    converters = (
        (IndexGraph, index_graph_to_v2),
        (IndexList, index_list_to_v2),
        (IndexDict, index_dict_to_v2),
        (KG, kg_to_v2),
        (KeywordTable, keyword_table_to_v2),
    )
    struct_v2: V2IndexStruct
    for v1_cls, convert in converters:
        if isinstance(index_struct, v1_cls):
            struct_v2, nodes_v2 = convert(index_struct)
            break
    else:
        raise NotImplementedError(f"Cannot migrate {type(index_struct)} yet.")

    docstore_v2 = V2DocumentStore()
    docstore_v2.add_documents(nodes_v2, allow_update=False)
    return struct_v2, docstore_v2
def load_v1_index_struct_in_docstore(
    file_dict: dict,
) -> Tuple[IndexStruct, V1DocumentStore]:
    """Load a V1 save where the index struct lives inside the docstore.

    Args:
        file_dict: Parsed V1 save file; must contain the index struct id and
            the serialized docstore.

    Returns:
        A tuple of the V1 index struct fetched from the docstore and the
        docstore itself.
    """
    serialized_docstore = file_dict[V1_DOC_STORE_KEY]
    struct_id = file_dict[V1_INDEX_STRUCT_ID_KEY]
    docstore = V1DocumentStore.load_from_dict(
        serialized_docstore,
        type_to_struct=INDEX_STRUCT_TYPE_TO_V1_INDEX_STRUCT_CLASS,  # type: ignore
    )
    index_struct = docstore.get_document(struct_id)
    # Narrows the type for the checker: the stored entry is expected to be an
    # index struct rather than a plain document.
    assert isinstance(index_struct, IndexStruct)
    return index_struct, docstore
def load_v1_index_struct_separate(
    file_dict: dict, index_struct_type: IndexStructType
) -> Tuple[IndexStruct, V1DocumentStore]:
    """Load a V1 save where the index struct is stored beside the docstore.

    Args:
        file_dict: Parsed V1 save file; must contain the serialized index
            struct and the serialized docstore under their own keys.
        index_struct_type: Selects which V1 struct class deserializes the
            index struct.

    Returns:
        A tuple of the deserialized V1 index struct and V1 docstore.
    """
    struct_cls = INDEX_STRUCT_TYPE_TO_V1_INDEX_STRUCT_CLASS[index_struct_type]
    serialized_struct = file_dict[V1_INDEX_STRUCT_KEY]
    index_struct = struct_cls.from_dict(serialized_struct)
    serialized_docstore = file_dict[V1_DOC_STORE_KEY]
    docstore = V1DocumentStore.load_from_dict(
        serialized_docstore,
        type_to_struct=INDEX_STRUCT_TYPE_TO_V1_INDEX_STRUCT_CLASS,  # type: ignore
    )
    return index_struct, docstore
def load_v1(
    file_dict: dict, index_struct_type: Optional[IndexStructType] = None
) -> Tuple[IndexStruct, V1DocumentStore]:
    """Load the V1 index struct and docstore from a parsed V1 save file.

    Two layouts are recognised: the index struct stored under its own key
    (which needs ``index_struct_type`` to pick the class), or only its id
    stored, with the struct itself inside the docstore.

    Args:
        file_dict: Parsed V1 save file.
        index_struct_type: Struct type tag; required for the first layout.

    Returns:
        A tuple of the V1 index struct and the V1 docstore.

    Raises:
        ValueError: If neither layout key is present in ``file_dict``.
    """
    if V1_INDEX_STRUCT_KEY in file_dict:
        assert index_struct_type is not None, "Must specify index_struct_type to load."
        return load_v1_index_struct_separate(file_dict, index_struct_type)
    if V1_INDEX_STRUCT_ID_KEY in file_dict:
        return load_v1_index_struct_in_docstore(file_dict)
    raise ValueError("index_struct or index_struct_id must be provided.")
def save_v2(index_struct: V2IndexStruct, docstore: V2DocumentStore) -> dict:
    """Serialize a V2 index struct and docstore into one save dictionary.

    Args:
        index_struct: The V2 index struct to serialize.
        docstore: The V2 document store to serialize.

    Returns:
        A dict with the serialized struct under ``INDEX_STRUCT_KEY`` and the
        serialized docstore under ``DOCSTORE_KEY``.
    """
    payload: dict = {}
    payload[INDEX_STRUCT_KEY] = index_struct.to_dict()
    payload[DOCSTORE_KEY] = docstore.serialize_to_dict()
    return payload
def convert_to_v2_dict(
    v1_dict: dict, index_struct_type: Optional[IndexStructType] = None
) -> dict:
    """Convert a parsed V1 save dictionary into a V2 save dictionary.

    Args:
        v1_dict: Parsed V1 save file.
        index_struct_type: Struct type tag, forwarded to ``load_v1``.

    Returns:
        The V2 save dictionary produced by ``save_v2``.
    """
    v1_struct, v1_docstore = load_v1(v1_dict, index_struct_type)
    v2_struct, v2_docstore = convert_to_v2_index_struct_and_docstore(
        v1_struct, v1_docstore
    )
    return save_v2(v2_struct, v2_docstore)
def convert_to_v2_file(
    v1_path: str,
    index_struct_type: Optional[IndexStructType] = None,
    v2_path: Optional[str] = None,
    encoding: str = "ascii",
) -> None:
    """Read a V1 JSON save file, migrate it, and write the V2 JSON file.

    Args:
        v1_path: Path of the V1 JSON file to read.
        index_struct_type: Struct type tag, forwarded to the conversion.
        v2_path: Output path. When empty or omitted, the output path is
            derived from ``v1_path`` with a ``_v2`` suffix.
        encoding: Text encoding used when writing the V2 file.
    """
    with open(v1_path, "r") as source:
        v1_dict = json.load(source)
    print(f"Successfully loaded V1 JSON file from: {v1_path}")

    # Serialize fully before opening the output, so a failed conversion does
    # not leave a partially written file behind.
    v2_str = json.dumps(convert_to_v2_dict(v1_dict, index_struct_type))

    if not v2_path:
        v2_path = add_prefix_suffix_to_file_path(v1_path, suffix="_v2")
    with open(v2_path, "wt", encoding=encoding) as target:
        target.write(v2_str)
    print(f"Successfully created V2 JSON file at: {v2_path}")
# Script entry point: expose convert_to_v2_file as a command-line interface
# through python-fire.
# NOTE(review): the guarded `import fire` at the top of the file only prints a
# hint on ImportError, so if fire is missing this line fails with NameError.
if __name__ == "__main__":
    fire.Fire(convert_to_v2_file)