Spaces:
Runtime error
Runtime error
"""Load Documents from a set of persistent Steamship Files.""" | |
from typing import List, Optional | |
from gpt_index.readers.base import BaseReader | |
from gpt_index.readers.schema.base import Document | |
class SteamshipFileReader(BaseReader): | |
"""Reads persistent Steamship Files and converts them to Documents. | |
Args: | |
api_key: Steamship API key. Defaults to STEAMSHIP_API_KEY value if not provided. | |
Note: | |
Requires install of `steamship` package and an active Steamship API Key. | |
To get a Steamship API Key, visit: https://steamship.com/account/api. | |
Once you have an API Key, expose it via an environment variable named | |
`STEAMSHIP_API_KEY` or pass it as an init argument (`api_key`). | |
""" | |
def __init__(self, api_key: Optional[str] = None) -> None: | |
"""Initialize the Reader.""" | |
try: | |
import steamship # noqa: F401 | |
self.api_key = api_key | |
except ImportError: | |
raise ImportError( | |
"`steamship` must be installed to use the SteamshipFileReader.\n" | |
"Please run `pip install --upgrade steamship." | |
) | |
def load_data( | |
self, | |
workspace: str, | |
query: Optional[str] = None, | |
file_handles: Optional[List[str]] = None, | |
collapse_blocks: bool = True, | |
join_str: str = "\n\n", | |
) -> List[Document]: | |
"""Load data from persistent Steamship Files into Documents. | |
Args: | |
workspace: the handle for a Steamship workspace | |
(see: https://docs.steamship.com/workspaces/index.html) | |
query: a Steamship tag query for retrieving files | |
(ex: 'filetag and value("import-id")="import-001"') | |
file_handles: a list of Steamship File handles | |
(ex: `smooth-valley-9kbdr`) | |
collapse_blocks: whether to merge individual File Blocks into a | |
single Document, or separate them. | |
join_str: when collapse_blocks is True, this is how the block texts | |
will be concatenated. | |
Note: | |
The collection of Files from both `query` and `file_handles` will be | |
combined. There is no (current) support for deconflicting the collections | |
(meaning that if a file appears both in the result set of the query and | |
as a handle in file_handles, it will be loaded twice). | |
""" | |
from steamship import File, Steamship | |
client = Steamship(workspace=workspace, api_key=self.api_key) | |
files = [] | |
if query: | |
files_from_query = File.query(client=client, tag_filter_query=query).files | |
files.extend(files_from_query) | |
if file_handles: | |
files.extend([File.get(client=client, handle=h) for h in file_handles]) | |
docs = [] | |
for file in files: | |
extra_info = {"source": file.handle} | |
for tag in file.tags: | |
extra_info[tag.kind] = tag.value | |
if collapse_blocks: | |
text = join_str.join([b.text for b in file.blocks]) | |
docs.append( | |
Document(text=text, doc_id=file.handle, extra_info=extra_info) | |
) | |
else: | |
docs.extend( | |
[ | |
Document(text=b.text, doc_id=file.handle, extra_info=extra_info) | |
for b in file.blocks | |
] | |
) | |
return docs | |