Spaces:
Runtime error
Runtime error
"""Base schema for data structures.""" | |
from abc import abstractmethod | |
from dataclasses import dataclass | |
from typing import Any, Dict, List, Optional | |
from dataclasses_json import DataClassJsonMixin | |
from gpt_index.utils import get_new_id | |
@dataclass
class BaseDocument(DataClassJsonMixin):
    """Base document.

    Generic abstract interface that captures both index structs
    as well as documents.

    Subclasses are expected to implement ``get_type``. It is declared
    abstract; enforcement at instantiation time depends on the metaclass
    inherited from ``DataClassJsonMixin`` (presumably ABC-based -- verify).
    """

    # TODO: consolidate fields from Document/IndexStruct into base class
    # Text content; ``get_text`` raises while this is None.
    text: Optional[str] = None
    # Identifier; filled in by ``__post_init__`` when not supplied.
    doc_id: Optional[str] = None
    # Embedding vector; ``get_embedding`` raises while this is None.
    embedding: Optional[List[float]] = None

    # extra fields
    # Free-form key/value pairs, rendered by ``extra_info_str``.
    extra_info: Optional[Dict[str, Any]] = None

    def __post_init__(self) -> None:
        """Assign a doc_id after dataclass initialisation if none was given."""
        # assign doc_id if not set
        if self.doc_id is None:
            # An empty set is passed to get_new_id -- presumably the set of
            # ids to avoid; confirm against gpt_index.utils.
            self.doc_id = get_new_id(set())

    @classmethod
    @abstractmethod
    def get_type(cls) -> str:
        """Get Document type.

        Abstract: concrete subclasses must override this and return a string.
        """

    @classmethod
    def get_types(cls) -> List[str]:
        """Get Document types as a single-element list.

        Returns:
            A list containing only the value of ``cls.get_type()``.
        """
        # TODO: remove this method
        # a hack to preserve backwards compatibility for vector indices
        return [cls.get_type()]

    def get_text(self) -> str:
        """Get text.

        Raises:
            ValueError: if the ``text`` field is None.
        """
        if self.text is None:
            raise ValueError("text field not set.")
        return self.text

    def get_doc_id(self) -> str:
        """Get doc_id.

        Raises:
            ValueError: if ``doc_id`` is None.
        """
        if self.doc_id is None:
            raise ValueError("doc_id not set.")
        return self.doc_id

    def is_doc_id_none(self) -> bool:
        """Check if doc_id is None."""
        return self.doc_id is None

    def get_embedding(self) -> List[float]:
        """Get embedding.

        Errors if embedding is None.

        Raises:
            ValueError: if ``embedding`` is None.
        """
        if self.embedding is None:
            raise ValueError("embedding not set.")
        return self.embedding

    @property
    def extra_info_str(self) -> Optional[str]:
        """Extra info string.

        Returns:
            None when ``extra_info`` is None; otherwise one ``key: value``
            line per entry, joined with newlines (an empty dict yields "").
        """
        if self.extra_info is None:
            return None
        return "\n".join(f"{k}: {v!s}" for k, v in self.extra_info.items())