Spaces:

binhnase04854
/

nobita-gpt

Runtime error

App Files Files Community

nobita-gpt / gpt_index /tools /migrate_v1_to_v2.py

binhnase04854

first deploy

b699122 almost 2 years ago

raw

history blame

11.3 kB

	"""Tool for migrating Index built with V1 data structs to V2."""
	import dataclasses
	import json
	from typing import Dict, List, Optional, Tuple, Type

	from gpt_index.constants import DOCSTORE_KEY, INDEX_STRUCT_KEY
	from gpt_index.data_structs.table import SQLStructTable

	try:
	import fire
	except ImportError:
	print("Please run `pip install fire`")

	from gpt_index.data_structs.data_structs import (
	KG,
	ChromaIndexDict,
	EmptyIndex,
	FaissIndexDict,
	IndexDict,
	IndexGraph,
	IndexList,
	IndexStruct,
	KeywordTable,
	Node,
	OpensearchIndexDict,
	PineconeIndexDict,
	QdrantIndexDict,
	SimpleIndexDict,
	WeaviateIndexDict,
	)
	from gpt_index.data_structs.data_structs_v2 import KG as V2KG
	from gpt_index.data_structs.data_structs_v2 import ChromaIndexDict as V2ChromaIndexDict
	from gpt_index.data_structs.data_structs_v2 import FaissIndexDict as V2FaissIndexDict
	from gpt_index.data_structs.data_structs_v2 import IndexDict as V2IndexDict
	from gpt_index.data_structs.data_structs_v2 import IndexGraph as V2IndexGraph
	from gpt_index.data_structs.data_structs_v2 import IndexList as V2IndexList
	from gpt_index.data_structs.data_structs_v2 import KeywordTable as V2KeywordTable
	from gpt_index.data_structs.data_structs_v2 import (
	OpensearchIndexDict as V2OpensearchIndexDict,
	)
	from gpt_index.data_structs.data_structs_v2 import (
	PineconeIndexDict as V2PineconeIndexDict,
	)
	from gpt_index.data_structs.data_structs_v2 import QdrantIndexDict as V2QdrantIndexDict
	from gpt_index.data_structs.data_structs_v2 import SimpleIndexDict as V2SimpleIndexDict
	from gpt_index.data_structs.data_structs_v2 import V2IndexStruct
	from gpt_index.data_structs.data_structs_v2 import (
	WeaviateIndexDict as V2WeaviateIndexDict,
	)
	from gpt_index.data_structs.node_v2 import DocumentRelationship
	from gpt_index.data_structs.node_v2 import ImageNode as V2ImageNode
	from gpt_index.data_structs.node_v2 import Node as V2Node
	from gpt_index.data_structs.struct_type import IndexStructType
	from gpt_index.old_docstore import V1DocumentStore
	from gpt_index.docstore import DocumentStore as V2DocumentStore
	from gpt_index.tools.file_utils import add_prefix_suffix_to_file_path

	# Lookup table from an IndexStructType member to the V1 index struct class.
	# It is used by load_v1_index_struct_separate to pick the class whose
	# from_dict() parses the stored struct, and it is passed as type_to_struct
	# when the V1 docstore is loaded.
	INDEX_STRUCT_TYPE_TO_V1_INDEX_STRUCT_CLASS: Dict[IndexStructType, Type[IndexStruct]] = {
	IndexStructType.TREE: IndexGraph,
	IndexStructType.LIST: IndexList,
	IndexStructType.KEYWORD_TABLE: KeywordTable,
	IndexStructType.SIMPLE_DICT: SimpleIndexDict,
	IndexStructType.DICT: FaissIndexDict,
	IndexStructType.WEAVIATE: WeaviateIndexDict,
	IndexStructType.PINECONE: PineconeIndexDict,
	IndexStructType.QDRANT: QdrantIndexDict,
	IndexStructType.CHROMA: ChromaIndexDict,
	IndexStructType.VECTOR_STORE: IndexDict,
	IndexStructType.SQL: SQLStructTable,
	IndexStructType.KG: KG,
	IndexStructType.EMPTY: EmptyIndex,
	IndexStructType.NODE: Node,
	}

	# Top-level keys read from a V1 save-file dict (see load_v1):
	# - V1_INDEX_STRUCT_KEY: the index struct stored inline in the file.
	# - V1_INDEX_STRUCT_ID_KEY: id of an index struct stored inside the docstore.
	# - V1_DOC_STORE_KEY: the serialized V1 docstore.
	V1_INDEX_STRUCT_KEY = "index_struct"
	V1_INDEX_STRUCT_ID_KEY = "index_struct_id"
	V1_DOC_STORE_KEY = "docstore"


	def node_to_v2(node: Node) -> V2Node:
	    """Convert a V1 ``Node`` into its V2 counterpart.

	    When the V1 node has a ``ref_doc_id`` it is recorded as the
	    ``DocumentRelationship.SOURCE`` relationship of the new node; otherwise
	    the relationships mapping is empty.

	    Args:
	        node: The V1 node to convert.

	    Returns:
	        A ``V2ImageNode`` when ``node.image`` is set, else a ``V2Node``.
	        Text, doc id, embedding, doc hash, extra info and node info are
	        copied over unchanged.
	    """
	    source_links = (
	        {}
	        if node.ref_doc_id is None
	        else {DocumentRelationship.SOURCE: node.ref_doc_id}
	    )

	    # Keyword arguments common to both V2 node classes.
	    shared_fields = dict(
	        text=node.text,
	        doc_id=node.doc_id,
	        embedding=node.embedding,
	        doc_hash=node.doc_hash,
	        extra_info=node.extra_info,
	        node_info=node.node_info,
	        relationships=source_links,
	    )

	    if node.image is not None:
	        return V2ImageNode(image=node.image, **shared_fields)
	    return V2Node(**shared_fields)


	def index_graph_to_v2(struct: IndexGraph) -> Tuple[V2IndexGraph, List[V2Node]]:
	    """Convert a V1 ``IndexGraph`` into a ``V2IndexGraph`` plus its V2 nodes.

	    The V2 struct stores node doc ids instead of node objects, and replaces
	    each node's integer ``child_indices`` with a mapping from the node's doc
	    id to the doc ids of its children.

	    Args:
	        struct: The V1 tree index struct.

	    Returns:
	        A tuple ``(struct_v2, nodes_v2)`` where ``nodes_v2`` holds the V2
	        conversion of every node in ``struct.all_nodes``.
	    """
	    all_nodes_v2 = {
	        index: node.get_doc_id() for index, node in struct.all_nodes.items()
	    }
	    # Roots must be taken from ``root_nodes``; building this mapping from
	    # ``all_nodes`` would mark every node of the tree as a root.
	    root_nodes_v2 = {
	        index: node.get_doc_id() for index, node in struct.root_nodes.items()
	    }
	    # Translate integer child indices into the children's doc ids.
	    node_id_to_children_ids_v2 = {
	        node.get_doc_id(): [
	            struct.all_nodes[child_index].get_doc_id()
	            for child_index in node.child_indices
	        ]
	        for node in struct.all_nodes.values()
	    }
	    struct_v2 = V2IndexGraph(
	        all_nodes=all_nodes_v2,
	        root_nodes=root_nodes_v2,
	        node_id_to_children_ids=node_id_to_children_ids_v2,
	    )

	    nodes_v2 = [node_to_v2(node) for node in struct.all_nodes.values()]
	    return struct_v2, nodes_v2


	def index_list_to_v2(struct: IndexList) -> Tuple[V2IndexList, List[V2Node]]:
	    """Convert a V1 ``IndexList`` into a ``V2IndexList`` plus its V2 nodes.

	    The V2 struct keeps only the doc id of each node, in the same order as
	    ``struct.nodes``; the converted nodes are returned alongside it.
	    """
	    ordered_ids = [item.get_doc_id() for item in struct.nodes]
	    struct_v2 = V2IndexList(nodes=ordered_ids)
	    converted = list(map(node_to_v2, struct.nodes))
	    return struct_v2, converted


	def keyword_table_to_v2(struct: KeywordTable) -> Tuple[V2KeywordTable, List[V2Node]]:
	    """Convert a V1 ``KeywordTable`` into a ``V2KeywordTable`` plus V2 nodes.

	    Each keyword's collection of ``text_chunks`` indices is replaced by the
	    set of doc ids of the chunks those indices refer to.

	    Returns:
	        A tuple ``(struct_v2, nodes_v2)`` where ``nodes_v2`` is the V2
	        conversion of every value in ``struct.text_chunks``.
	    """
	    chunks = struct.text_chunks
	    table_v2 = {}
	    for keyword, chunk_indices in struct.table.items():
	        table_v2[keyword] = {chunks[idx].get_doc_id() for idx in chunk_indices}

	    struct_v2 = V2KeywordTable(table=table_v2)
	    converted = [node_to_v2(chunk) for chunk in chunks.values()]
	    return struct_v2, converted


	def index_dict_to_v2(struct: IndexDict) -> Tuple[V2IndexDict, List[V2Node]]:
	    """Convert a V1 ``IndexDict`` (or subclass) into its V2 equivalent.

	    The V2 struct is built from three mappings:

	    * ``nodes_dict``: vector id -> node doc id, derived from ``struct.id_map``
	      and ``struct.nodes_dict``.
	    * ``doc_id_dict``: ``ref_doc_id`` -> list of vector ids of the nodes that
	      reference that document (nodes without ``ref_doc_id`` are skipped).
	    * ``embeddings_dict``: passed through unchanged.

	    If ``struct`` is an instance of one of the vector-store specific V1
	    classes, the result is re-created as the matching V2 class.

	    Returns:
	        A tuple ``(struct_v2, nodes_v2)`` where ``nodes_v2`` is the V2
	        conversion of every value in ``struct.nodes_dict``.
	    """
	    v1_nodes = struct.nodes_dict

	    nodes_dict_v2 = {}
	    for vector_id, int_id in struct.id_map.items():
	        nodes_dict_v2[vector_id] = v1_nodes[int_id].get_doc_id()

	    # Reverse lookup: node doc id -> vector id.
	    vector_id_by_node_id = {
	        doc_id: vec_id for vec_id, doc_id in nodes_dict_v2.items()
	    }

	    doc_id_dict_v2: Dict[str, List[str]] = {}
	    for v1_node in v1_nodes.values():
	        # The lookup runs for every node, so a node that is absent from
	        # id_map raises KeyError even when it has no ref_doc_id.
	        vec_id = vector_id_by_node_id[v1_node.get_doc_id()]
	        if v1_node.ref_doc_id is None:
	            continue
	        doc_id_dict_v2.setdefault(v1_node.ref_doc_id, []).append(vec_id)

	    struct_v2 = V2IndexDict(
	        nodes_dict=nodes_dict_v2,
	        doc_id_dict=doc_id_dict_v2,
	        embeddings_dict=struct.embeddings_dict,
	    )
	    nodes_v2 = list(map(node_to_v2, v1_nodes.values()))

	    # Checked in this fixed order; every matching entry re-wraps the current
	    # result in the corresponding V2 class.
	    v1_to_v2_classes = (
	        (SimpleIndexDict, V2SimpleIndexDict),
	        (FaissIndexDict, V2FaissIndexDict),
	        (PineconeIndexDict, V2PineconeIndexDict),
	        (WeaviateIndexDict, V2WeaviateIndexDict),
	        (QdrantIndexDict, V2QdrantIndexDict),
	        (ChromaIndexDict, V2ChromaIndexDict),
	        (OpensearchIndexDict, V2OpensearchIndexDict),
	    )
	    for v1_cls, v2_cls in v1_to_v2_classes:
	        if isinstance(struct, v1_cls):
	            struct_v2 = v2_cls(**dataclasses.asdict(struct_v2))
	    return struct_v2, nodes_v2


	def kg_to_v2(struct: KG) -> Tuple[V2KG, List[V2Node]]:
	    """Convert a V1 ``KG`` struct into a ``V2KG`` plus its V2 nodes.

	    ``table``, ``rel_map`` and ``embedding_dict`` are handed to ``V2KG``
	    unchanged; every value of ``struct.text_chunks`` is converted with
	    ``node_to_v2`` and returned alongside the new struct.
	    """
	    carried_over = {
	        "table": struct.table,
	        "rel_map": struct.rel_map,
	        "embedding_dict": struct.embedding_dict,
	    }
	    struct_v2 = V2KG(**carried_over)
	    converted = list(map(node_to_v2, struct.text_chunks.values()))
	    return struct_v2, converted


	def convert_to_v2_index_struct_and_docstore(
	    index_struct: IndexStruct, docstore: V1DocumentStore
	) -> Tuple[V2IndexStruct, V2DocumentStore]:
	    """Convert a V1 index struct into a V2 struct and a fresh V2 docstore.

	    The converter is selected by the first matching ``isinstance`` check, in
	    the order: IndexGraph, IndexList, IndexDict, KG, KeywordTable.

	    Args:
	        index_struct: The V1 index struct to migrate.
	        docstore: The V1 docstore. It is not read by this function; the V2
	            docstore is populated only from the nodes held by
	            ``index_struct``.

	    Returns:
	        A tuple ``(struct_v2, docstore_v2)``.

	    Raises:
	        NotImplementedError: If ``index_struct`` is none of the supported
	            V1 struct types.
	    """
	    # Order matters: the first class that matches wins.
	    converters = (
	        (IndexGraph, index_graph_to_v2),
	        (IndexList, index_list_to_v2),
	        (IndexDict, index_dict_to_v2),
	        (KG, kg_to_v2),
	        (KeywordTable, keyword_table_to_v2),
	    )

	    struct_v2: V2IndexStruct
	    for v1_cls, to_v2 in converters:
	        if isinstance(index_struct, v1_cls):
	            struct_v2, nodes_v2 = to_v2(index_struct)
	            break
	    else:
	        raise NotImplementedError(f"Cannot migrate {type(index_struct)} yet.")

	    docstore_v2 = V2DocumentStore()
	    docstore_v2.add_documents(nodes_v2, allow_update=False)
	    return struct_v2, docstore_v2


	def load_v1_index_struct_in_docstore(
	    file_dict: dict,
	) -> Tuple[IndexStruct, V1DocumentStore]:
	    """Load a V1 index whose struct is stored inside the docstore.

	    Args:
	        file_dict: Parsed V1 save file. Must contain the ``index_struct_id``
	            and ``docstore`` keys.

	    Returns:
	        A tuple ``(index_struct, docstore)`` where ``index_struct`` is the
	        document fetched from the docstore under the stored struct id.
	    """
	    struct_id = file_dict[V1_INDEX_STRUCT_ID_KEY]
	    serialized_docstore = file_dict[V1_DOC_STORE_KEY]
	    v1_docstore = V1DocumentStore.load_from_dict(
	        serialized_docstore,
	        type_to_struct=INDEX_STRUCT_TYPE_TO_V1_INDEX_STRUCT_CLASS,  # type: ignore
	    )

	    loaded_struct = v1_docstore.get_document(struct_id)
	    # Sanity check: the stored id must refer to an index struct.
	    assert isinstance(loaded_struct, IndexStruct)
	    return loaded_struct, v1_docstore


	def load_v1_index_struct_separate(
	    file_dict: dict, index_struct_type: IndexStructType
	) -> Tuple[IndexStruct, V1DocumentStore]:
	    """Load a V1 index whose struct is stored next to the docstore.

	    Args:
	        file_dict: Parsed V1 save file. Must contain the ``index_struct``
	            and ``docstore`` keys.
	        index_struct_type: Selects which V1 struct class parses the stored
	            ``index_struct`` entry.

	    Returns:
	        A tuple ``(index_struct, docstore)``.
	    """
	    v1_struct_cls = INDEX_STRUCT_TYPE_TO_V1_INDEX_STRUCT_CLASS[index_struct_type]
	    serialized_struct = file_dict[V1_INDEX_STRUCT_KEY]
	    loaded_struct = v1_struct_cls.from_dict(serialized_struct)

	    serialized_docstore = file_dict[V1_DOC_STORE_KEY]
	    v1_docstore = V1DocumentStore.load_from_dict(
	        serialized_docstore,
	        type_to_struct=INDEX_STRUCT_TYPE_TO_V1_INDEX_STRUCT_CLASS,  # type: ignore
	    )
	    return loaded_struct, v1_docstore


	def load_v1(
	    file_dict: dict, index_struct_type: Optional[IndexStructType] = None
	) -> Tuple[IndexStruct, V1DocumentStore]:
	    """Load the V1 index struct and docstore from a parsed V1 save file.

	    Two layouts are supported, checked in this order:

	    * ``index_struct`` key present: the struct is stored inline and
	      ``index_struct_type`` must be given to pick the class that parses it.
	    * ``index_struct_id`` key present: the struct lives inside the docstore.

	    Raises:
	        AssertionError: If the inline layout is used without
	            ``index_struct_type``.
	        ValueError: If neither key is present in ``file_dict``.
	    """
	    if V1_INDEX_STRUCT_KEY in file_dict:
	        assert index_struct_type is not None, "Must specify index_struct_type to load."
	        return load_v1_index_struct_separate(file_dict, index_struct_type)

	    if V1_INDEX_STRUCT_ID_KEY in file_dict:
	        return load_v1_index_struct_in_docstore(file_dict)

	    raise ValueError("index_struct or index_struct_id must be provided.")


	def save_v2(index_struct: V2IndexStruct, docstore: V2DocumentStore) -> dict:
	    """Bundle a V2 index struct and docstore into one serializable dict.

	    The struct's ``to_dict()`` output is stored under ``INDEX_STRUCT_KEY``
	    and the docstore's ``serialize_to_dict()`` output under ``DOCSTORE_KEY``.
	    """
	    payload: dict = {}
	    payload[INDEX_STRUCT_KEY] = index_struct.to_dict()
	    payload[DOCSTORE_KEY] = docstore.serialize_to_dict()
	    return payload


	def convert_to_v2_dict(
	    v1_dict: dict, index_struct_type: Optional[IndexStructType] = None
	) -> dict:
	    """Convert a parsed V1 save file into the V2 save-file dict.

	    Runs the three migration stages in sequence: load the V1 objects,
	    convert them to V2, then serialize the V2 objects.

	    Args:
	        v1_dict: Parsed V1 save file.
	        index_struct_type: Forwarded to ``load_v1``.

	    Returns:
	        The dict produced by ``save_v2``.
	    """
	    v1_struct, v1_docstore = load_v1(v1_dict, index_struct_type)
	    v2_struct, v2_docstore = convert_to_v2_index_struct_and_docstore(
	        v1_struct, v1_docstore
	    )
	    return save_v2(v2_struct, v2_docstore)


	def convert_to_v2_file(
	    v1_path: str,
	    index_struct_type: Optional[IndexStructType] = None,
	    v2_path: Optional[str] = None,
	    encoding: str = "ascii",
	) -> None:
	    """Read a V1 JSON save file, migrate it, and write the V2 JSON file.

	    Args:
	        v1_path: Path of the V1 JSON file to read. It is opened without an
	            explicit encoding.
	        index_struct_type: Forwarded to ``convert_to_v2_dict``.
	        v2_path: Destination path. When omitted or empty, it is derived from
	            ``v1_path`` by ``add_prefix_suffix_to_file_path`` with suffix
	            ``"_v2"``.
	        encoding: Text encoding used only for writing the V2 file.
	    """
	    with open(v1_path, "r") as v1_file:
	        v1_dict = json.load(v1_file)
	        print(f"Successfully loaded V1 JSON file from: {v1_path}")

	    v2_dict = convert_to_v2_dict(v1_dict, index_struct_type)

	    # Serialize before opening the destination so that a serialization
	    # failure does not leave a truncated output file behind.
	    serialized = json.dumps(v2_dict)
	    if not v2_path:
	        v2_path = add_prefix_suffix_to_file_path(v1_path, suffix="_v2")
	    with open(v2_path, "wt", encoding=encoding) as v2_file:
	        v2_file.write(serialized)
	        print(f"Successfully created V2 JSON file at: {v2_path}")


	# Script entry point: hand convert_to_v2_file to fire.Fire.
	# NOTE(review): the guarded `import fire` near the top of this file only
	# prints an install hint on ImportError, so without the package this call
	# would fail with NameError - confirm whether that should exit early instead.
	if __name__ == "__main__":
	fire.Fire(convert_to_v2_file)