Spaces:

binhnase04854
/

nobita-gpt

Runtime error

App Files Files Community

nobita-gpt / gpt_index /data_structs /data_structs_v2.py

binhnase04854

first deploy

b699122 almost 2 years ago

raw

history blame contribute delete

12.5 kB

	"""Data structures v2.

	Nodes are decoupled from the indices.

	"""

	import uuid
	from abc import abstractmethod
	from dataclasses import dataclass, field
	from typing import Dict, List, Optional, Sequence, Set, Tuple

	from dataclasses_json import DataClassJsonMixin
	from pydantic import Json

	from gpt_index.constants import DATA_KEY, TYPE_KEY
	from gpt_index.data_structs.node_v2 import Node
	from gpt_index.data_structs.struct_type import IndexStructType


	@dataclass
	class V2IndexStruct(DataClassJsonMixin):
	    """Common base for every index data struct.

	    Attributes:
	        index_id: Identifier of this struct; defaults to a freshly generated
	            UUID4 rendered as a string.
	        summary: Optional text summary of the index; ``None`` until set.
	    """

	    index_id: str = field(default_factory=lambda: str(uuid.uuid4()))
	    summary: Optional[str] = None

	    def get_summary(self) -> str:
	        """Return the text summary.

	        Raises:
	            ValueError: If ``summary`` has not been set.
	        """
	        if self.summary is not None:
	            return self.summary
	        raise ValueError("summary field of the index_struct not set.")

	    @classmethod
	    @abstractmethod
	    def get_type(cls) -> IndexStructType:
	        """Return the struct-type tag; concrete subclasses override this."""

	    def to_dict(self, encode_json: bool = False) -> Dict[str, Json]:
	        """Serialize the struct into a two-key envelope.

	        Args:
	            encode_json: Forwarded unchanged to the mixin's ``to_dict``.

	        Returns:
	            A dict holding the struct type under ``TYPE_KEY`` and the
	            mixin-produced field dict under ``DATA_KEY``.
	        """
	        struct_type = self.get_type()
	        payload = super().to_dict(encode_json)
	        return {TYPE_KEY: struct_type, DATA_KEY: payload}


	@dataclass
	class IndexGraph(V2IndexStruct):
	    """A graph representing the tree-structured index.

	    Only doc ids are stored here; the Node objects themselves live elsewhere.

	    Attributes:
	        all_nodes: Maps each node's integer index in the tree to its doc id.
	        root_nodes: Maps integer index to doc id for the root-level nodes.
	        node_id_to_children_ids: Maps a node's doc id to the doc ids of its
	            children.
	    """

	    # mapping from index in tree to Node doc id.
	    all_nodes: Dict[int, str] = field(default_factory=dict)
	    root_nodes: Dict[int, str] = field(default_factory=dict)
	    node_id_to_children_ids: Dict[str, List[str]] = field(default_factory=dict)

	    @property
	    def node_id_to_index(self) -> Dict[str, int]:
	        """Map from node id to index (rebuilt from ``all_nodes`` on each access)."""
	        return {node_id: index for index, node_id in self.all_nodes.items()}

	    @property
	    def size(self) -> int:
	        """Get the size of the graph (number of entries in ``all_nodes``)."""
	        return len(self.all_nodes)

	    def get_index(self, node: Node) -> int:
	        """Get index of node.

	        Raises:
	            KeyError: If the node's doc id is not present in ``all_nodes``.
	        """
	        return self.node_id_to_index[node.get_doc_id()]

	    def insert(
	        self,
	        node: Node,
	        index: Optional[int] = None,
	        children_nodes: Optional[Sequence[Node]] = None,
	    ) -> None:
	        """Insert node.

	        Args:
	            node: Node to register in ``all_nodes``.
	            index: Position to store the node at. When ``None`` the current
	                graph size is used.
	            children_nodes: Children to record for ``node``; ``None`` is
	                treated as no children. Any previously recorded children of
	                ``node`` are replaced.
	        """
	        # Compare against None explicitly: `index or self.size` would silently
	        # discard a caller-supplied index of 0.
	        if index is None:
	            index = self.size
	        node_id = node.get_doc_id()

	        self.all_nodes[index] = node_id
	        self.node_id_to_children_ids[node_id] = [
	            child.get_doc_id() for child in (children_nodes or [])
	        ]

	    def get_children(self, parent_node: Optional[Node]) -> Dict[int, str]:
	        """Get children nodes.

	        Args:
	            parent_node: Node whose children are wanted, or ``None`` for the
	                root level.

	        Returns:
	            ``root_nodes`` itself when ``parent_node`` is ``None``; otherwise a
	            new dict mapping each child's index to its doc id.

	        Raises:
	            KeyError: If the parent has no entry in ``node_id_to_children_ids``
	                or a child id is missing from ``all_nodes``.
	        """
	        if parent_node is None:
	            return self.root_nodes

	        children_ids = self.node_id_to_children_ids[parent_node.get_doc_id()]
	        # Build the reverse map once rather than once per child.
	        id_to_index = self.node_id_to_index
	        return {id_to_index[child_id]: child_id for child_id in children_ids}

	    def insert_under_parent(
	        self, node: Node, parent_node: Optional[Node], new_index: Optional[int] = None
	    ) -> None:
	        """Insert under parent node.

	        Args:
	            node: Node to insert.
	            parent_node: Parent to attach ``node`` to; ``None`` registers the
	                node as a root node instead.
	            new_index: Position to store the node at. When ``None`` the current
	                graph size is used.
	        """
	        # Same explicit None check as in insert(): an index of 0 is valid.
	        if new_index is None:
	            new_index = self.size
	        node_id = node.get_doc_id()

	        if parent_node is None:
	            self.root_nodes[new_index] = node_id
	        else:
	            # Use get_doc_id() consistently for both the lookup and the insert.
	            parent_id = parent_node.get_doc_id()
	            self.node_id_to_children_ids.setdefault(parent_id, []).append(node_id)

	        self.all_nodes[new_index] = node_id

	    @classmethod
	    def get_type(cls) -> IndexStructType:
	        """Get type."""
	        return IndexStructType.TREE


	@dataclass
	class KeywordTable(V2IndexStruct):
	    """A table of keywords mapping keywords to text chunks.

	    Attributes:
	        table: Maps each keyword to the set of node doc ids registered
	            under it.
	    """

	    table: Dict[str, Set[str]] = field(default_factory=dict)

	    def add_node(self, keywords: List[str], node: Node) -> None:
	        """Register the node's doc id under every given keyword.

	        Args:
	            keywords: Keywords to file the node under; new keywords get a
	                fresh entry.
	            node: Node whose doc id is stored.
	        """
	        for keyword in keywords:
	            # get_doc_id() stays inside the loop so an empty keyword list
	            # never calls it, as before.
	            self.table.setdefault(keyword, set()).add(node.get_doc_id())

	    @property
	    def node_ids(self) -> Set[str]:
	        """Get all node ids (empty set when the table has no keywords)."""
	        # Calling the unbound `set.union(*values)` with zero values raises
	        # TypeError; starting from an empty set handles the empty table.
	        return set().union(*self.table.values())

	    @property
	    def keywords(self) -> Set[str]:
	        """Get all keywords in the table."""
	        return set(self.table.keys())

	    @property
	    def size(self) -> int:
	        """Get the size of the table (number of distinct keywords)."""
	        return len(self.table)

	    @classmethod
	    def get_type(cls) -> IndexStructType:
	        """Get type."""
	        return IndexStructType.KEYWORD_TABLE


	@dataclass
	class IndexList(V2IndexStruct):
	    """Ordered list of node doc ids.

	    Attributes:
	        nodes: Doc ids in the order the nodes were added.
	    """

	    nodes: List[str] = field(default_factory=list)

	    def add_node(self, node: Node) -> None:
	        """Append the node's doc id to the end of ``nodes``."""
	        # No child bookkeeping here: insertion order is the only structure.
	        doc_id = node.get_doc_id()
	        self.nodes.append(doc_id)

	    @classmethod
	    def get_type(cls) -> IndexStructType:
	        """Return the struct-type tag for list indices."""
	        return IndexStructType.LIST


	@dataclass
	class IndexDict(V2IndexStruct):
	    """A simple dictionary of documents.

	    Attributes:
	        nodes_dict: Maps vector store id to node doc id.
	        doc_id_dict: Maps a reference doc id to the vector store ids of the
	            nodes added for it.
	        embeddings_dict: Embedding storage used by the simple vector index.
	    """

	    # mapping from vector store id to node id
	    nodes_dict: Dict[str, str] = field(default_factory=dict)
	    # mapping from ref doc_id to the list of vector store ids added for it
	    doc_id_dict: Dict[str, List[str]] = field(default_factory=dict)

	    # TODO: temporary hack to store embeddings for simple vector index
	    # this should be empty for all other indices
	    embeddings_dict: Dict[str, List[float]] = field(default_factory=dict)

	    def add_node(
	        self,
	        node: Node,
	        text_id: Optional[str] = None,
	    ) -> str:
	        """Register a node and return the vector store id it was stored under.

	        Args:
	            node: Node to register.
	            text_id: Vector store id to use; when ``None`` the node's own doc
	                id is used.

	        Returns:
	            The vector store id under which the node was recorded.
	        """
	        node_id = node.get_doc_id()
	        vector_id = text_id if text_id is not None else node_id
	        self.nodes_dict[vector_id] = node_id

	        # Track which vector ids belong to the node's source document so
	        # delete() can remove them together.
	        ref_doc_id = node.ref_doc_id
	        if ref_doc_id is not None:
	            self.doc_id_dict.setdefault(ref_doc_id, []).append(vector_id)

	        return vector_id

	    def delete(self, doc_id: str) -> None:
	        """Delete every node registered under the given reference doc id.

	        Unknown doc ids are ignored.

	        Args:
	            doc_id: Reference doc id (a key of ``doc_id_dict``).
	        """
	        vector_ids = self.doc_id_dict.pop(doc_id, None)
	        if vector_ids is None:
	            return
	        for vector_id in vector_ids:
	            # pop() instead of del: the same vector id can be listed more than
	            # once (add_node called twice with one id), and `del` would raise
	            # KeyError on the second hit, leaving the struct half-deleted.
	            self.nodes_dict.pop(vector_id, None)

	    @classmethod
	    def get_type(cls) -> IndexStructType:
	        """Get type."""
	        return IndexStructType.VECTOR_STORE


	@dataclass
	class KG(V2IndexStruct):
	    """A knowledge-graph struct: keyword table plus relationship map.

	    Relationships are unidirectional (stored on the subject only).

	    Attributes:
	        table: Maps each keyword to the set of node doc ids registered
	            under it.
	        rel_map: Maps a subject to a list of ``(object, relationship)`` pairs.
	        embedding_dict: Maps a triplet string to its embedding.
	    """

	    # Unidirectional

	    table: Dict[str, Set[str]] = field(default_factory=dict)
	    rel_map: Dict[str, List[Tuple[str, str]]] = field(default_factory=dict)
	    embedding_dict: Dict[str, List[float]] = field(default_factory=dict)

	    @property
	    def node_ids(self) -> Set[str]:
	        """Get all node ids (empty set when the table has no keywords)."""
	        # Calling the unbound `set.union(*values)` with zero values raises
	        # TypeError; starting from an empty set handles the empty table.
	        return set().union(*self.table.values())

	    def add_to_embedding_dict(self, triplet_str: str, embedding: List[float]) -> None:
	        """Add embedding to dict, replacing any previous entry for the key."""
	        self.embedding_dict[triplet_str] = embedding

	    def upsert_triplet(self, triplet: Tuple[str, str, str]) -> None:
	        """Upsert a knowledge triplet to the graph.

	        Args:
	            triplet: ``(subject, relationship, object)``. It is appended to the
	                subject's entry in ``rel_map`` as ``(object, relationship)``.
	        """
	        subj, relationship, obj = triplet
	        self.rel_map.setdefault(subj, []).append((obj, relationship))

	    def add_node(self, keywords: List[str], node: Node) -> None:
	        """Register the node's doc id under every given keyword."""
	        node_id = node.get_doc_id()
	        for keyword in keywords:
	            self.table.setdefault(keyword, set()).add(node_id)

	    def get_rel_map_texts(self, keyword: str) -> List[str]:
	        """Get the relationships of a keyword as strings.

	        Returns:
	            One ``str((keyword, relationship, object))`` per stored
	            relationship; an empty list when the keyword has none.
	        """
	        return [
	            str((keyword, rel, obj)) for obj, rel in self.rel_map.get(keyword, [])
	        ]

	    def get_rel_map_tuples(self, keyword: str) -> List[Tuple[str, str]]:
	        """Get the ``(object, relationship)`` pairs stored for a keyword.

	        Returns:
	            The list held in ``rel_map`` (not a copy), or a new empty list
	            when the keyword has no entry.
	        """
	        return self.rel_map.get(keyword, [])

	    def get_node_ids(self, keyword: str, depth: int = 1) -> List[str]:
	        """Get the node ids for a keyword and its direct relationship objects.

	        Args:
	            keyword: Keyword to look up.
	            depth: Traversal depth; only values up to 1 are supported.

	        Returns:
	            Node ids filed under the keyword followed by those filed under
	            each object it is related to (duplicates are not removed); an
	            empty list when the keyword is not in ``table``.

	        Raises:
	            ValueError: If ``depth`` is greater than 1.
	        """
	        if depth > 1:
	            raise ValueError("Depth > 1 not supported yet.")
	        if keyword not in self.table:
	            return []

	        keywords = [keyword]
	        # some keywords may correspond to a leaf node, may not be in rel_map
	        keywords.extend(child for child, _ in self.rel_map.get(keyword, []))

	        node_ids: List[str] = []
	        # Distinct loop name so the `keyword` parameter is not shadowed.
	        for related_keyword in keywords:
	            node_ids.extend(self.table.get(related_keyword, set()))
	        # TODO: Traverse (with depth > 1)
	        return node_ids

	    @classmethod
	    def get_type(cls) -> IndexStructType:
	        """Get type."""
	        return IndexStructType.KG


	# TODO: remove once we centralize UX around vector index


	@dataclass
	class SimpleIndexDict(IndexDict):
	    """``IndexDict`` variant tagged for the simple vector index."""

	    @classmethod
	    def get_type(cls) -> IndexStructType:
	        """Return the ``SIMPLE_DICT`` struct-type tag."""
	        struct_type = IndexStructType.SIMPLE_DICT
	        return struct_type


	@dataclass
	class FaissIndexDict(IndexDict):
	    """``IndexDict`` variant tagged for the Faiss vector index."""

	    @classmethod
	    def get_type(cls) -> IndexStructType:
	        """Return the ``DICT`` struct-type tag (the tag used for Faiss)."""
	        struct_type = IndexStructType.DICT
	        return struct_type


	@dataclass
	class WeaviateIndexDict(IndexDict):
	    """``IndexDict`` variant tagged for the Weaviate vector index."""

	    @classmethod
	    def get_type(cls) -> IndexStructType:
	        """Return the ``WEAVIATE`` struct-type tag."""
	        struct_type = IndexStructType.WEAVIATE
	        return struct_type


	@dataclass
	class PineconeIndexDict(IndexDict):
	    """``IndexDict`` variant tagged for the Pinecone vector index."""

	    @classmethod
	    def get_type(cls) -> IndexStructType:
	        """Return the ``PINECONE`` struct-type tag."""
	        struct_type = IndexStructType.PINECONE
	        return struct_type


	@dataclass
	class QdrantIndexDict(IndexDict):
	    """``IndexDict`` variant tagged for the Qdrant vector index."""

	    @classmethod
	    def get_type(cls) -> IndexStructType:
	        """Return the ``QDRANT`` struct-type tag."""
	        struct_type = IndexStructType.QDRANT
	        return struct_type


	@dataclass
	class MilvusIndexDict(IndexDict):
	    """``IndexDict`` variant tagged for the Milvus vector index."""

	    @classmethod
	    def get_type(cls) -> IndexStructType:
	        """Return the ``MILVUS`` struct-type tag."""
	        struct_type = IndexStructType.MILVUS
	        return struct_type


	@dataclass
	class ChromaIndexDict(IndexDict):
	    """``IndexDict`` variant tagged for the Chroma vector index."""

	    @classmethod
	    def get_type(cls) -> IndexStructType:
	        """Return the ``CHROMA`` struct-type tag."""
	        struct_type = IndexStructType.CHROMA
	        return struct_type


	@dataclass
	class OpensearchIndexDict(IndexDict):
	    """``IndexDict`` variant tagged for the Opensearch vector index."""

	    @classmethod
	    def get_type(cls) -> IndexStructType:
	        """Return the ``OPENSEARCH`` struct-type tag."""
	        struct_type = IndexStructType.OPENSEARCH
	        return struct_type


	# Decorated with @dataclass like every other index struct in this module;
	# the class adds no fields, so the generated methods match the inherited ones.
	@dataclass
	class ChatGPTRetrievalPluginIndexDict(IndexDict):
	    """Index dict for ChatGPT Retrieval Plugin."""

	    @classmethod
	    def get_type(cls) -> IndexStructType:
	        """Return the ``CHATGPT_RETRIEVAL_PLUGIN`` struct-type tag."""
	        return IndexStructType.CHATGPT_RETRIEVAL_PLUGIN


	@dataclass
	class EmptyIndex(IndexDict):
	    """``IndexDict`` variant tagged as the empty index."""

	    @classmethod
	    def get_type(cls) -> IndexStructType:
	        """Return the ``EMPTY`` struct-type tag."""
	        struct_type = IndexStructType.EMPTY
	        return struct_type


	@dataclass
	class CompositeIndex(V2IndexStruct):
	    """Container struct holding several index structs keyed by id.

	    Attributes:
	        all_index_structs: Maps an id to the index struct stored under it.
	        root_id: Optional id designating the root struct; ``None`` if unset.
	    """

	    all_index_structs: Dict[str, V2IndexStruct] = field(default_factory=dict)
	    root_id: Optional[str] = None

	    @classmethod
	    def get_type(cls) -> IndexStructType:
	        """Return the ``COMPOSITE`` struct-type tag."""
	        return IndexStructType.COMPOSITE

	    def to_dict(self, encode_json: bool = False) -> Dict[str, Json]:
	        """Serialize this struct and every contained struct.

	        Args:
	            encode_json: Passed through to each contained struct's ``to_dict``.

	        Returns:
	            A dict with the struct type under ``TYPE_KEY`` and, under
	            ``DATA_KEY``, a dict holding the serialized ``all_index_structs``
	            and the ``root_id``.
	        """
	        # Each child serializes itself, so its own type/data envelope is kept.
	        serialized_structs = {}
	        for struct_id, index_struct in self.all_index_structs.items():
	            serialized_structs[struct_id] = index_struct.to_dict(
	                encode_json=encode_json
	            )

	        payload = {
	            "all_index_structs": serialized_structs,
	            "root_id": self.root_id,
	        }
	        return {TYPE_KEY: self.get_type(), DATA_KEY: payload}