SOAPAssistV00

Runtime error

File size: 7,399 Bytes

35b22df

"""Download loader from the Loader Hub."""

import json
import os
import subprocess
import sys
from importlib import util
from pathlib import Path
from typing import List, Optional, Tuple, Type

import pkg_resources
import requests
from pkg_resources import DistributionNotFound

from gpt_index.readers.base import BaseReader

# Root of the raw-file view of the loader-hub repository (main branch).
LLAMA_HUB_CONTENTS_URL = "https://raw.githubusercontent.com/emptycrown/loader-hub/main"
# Sub-path of the repository that holds `library.json` and the loader folders.
LOADER_HUB_PATH = "/loader_hub"
# Default base URL; file paths such as `/library.json` are appended to it.
LOADER_HUB_URL = f"{LLAMA_HUB_CONTENTS_URL}{LOADER_HUB_PATH}"


def _get_file_content(
    loader_hub_url: str, path: str, timeout: float = 30.0
) -> Tuple[str, int]:
    """Fetch a file from the loader hub over HTTP.

    Args:
        loader_hub_url: Base URL of the loader hub. `path` is appended to it
            verbatim.
        path: Path of the file relative to the base URL, including the leading
            slash (e.g. `/library.json`).
        timeout: Seconds to wait for the server before giving up. Without a
            timeout, `requests` can block forever on a stalled connection.

    Returns:
        A tuple of the response body as text and the HTTP status code. The
        status code is not checked here; callers decide how to treat
        non-200 responses.

    Raises:
        requests.exceptions.RequestException: If the request cannot be
            completed (connection error, timeout, ...).

    """
    resp = requests.get(loader_hub_url + path, timeout=timeout)
    return resp.text, resp.status_code


def get_exports(raw_content: str) -> List:
    """Read content of a Python file and returns a list of exported class names.

    For example:
    ```python
    from .a import A
    from .b import B

    __all__ = ["A", "B"]
    ```
    will return `["A", "B"]`.

    Only single-line `__all__` assignments are understood. If several lines
    start with `__all__`, the last one that contains an `=` wins.

    Args:
        - raw_content: The content of a Python file as a string.

    Returns:
        A list of exported class names. Empty when there is no `__all__`
        assignment or when the assigned list is empty.

    """
    exports: List[str] = []
    for line in raw_content.splitlines():
        line = line.strip()
        if not line.startswith("__all__"):
            continue
        _, separator, value = line.partition("=")
        if not separator:
            # No assignment on this line (e.g. a bare annotation); skip it
            # instead of failing on a missing right-hand side.
            continue
        # Drop the surrounding list/tuple brackets, then the quotes of each name.
        candidates = value.strip().strip("[]()").split(",")
        names = (candidate.strip().strip("'\"") for candidate in candidates)
        # An empty list or a trailing comma produces empty fragments; they are
        # not export names and must not leak into generated import lines.
        exports = [name for name in names if name]
    return exports


def rewrite_exports(exports: List[str]) -> None:
    """Write the `__all__` variable to the `__init__.py` file in the modules dir.

    Removes every line that starts with `__all__` and appends a new line with
    the updated `__all__` variable. All other lines are kept, including their
    indentation; only trailing whitespace is dropped.

    Args:
        - exports: A list of exported class names. Duplicates are removed and
          the names are written in sorted order so the output is deterministic.

    """
    dirpath = Path(__file__).parent / "llamahub_modules"
    init_path = dirpath / "__init__.py"
    with open(init_path, "r") as f:
        lines = f.readlines()

    kept_lines = [
        line.rstrip() for line in lines if not line.strip().startswith("__all__")
    ]
    kept_lines.append(f"__all__ = {sorted(set(exports))}")

    with open(init_path, "w") as f:
        # Text mode already translates "\n" to the platform line ending;
        # writing os.linesep here would produce "\r\r\n" on Windows.
        f.write("\n".join(kept_lines) + "\n")


def download_loader(
    loader_class: str,
    loader_hub_url: str = LOADER_HUB_URL,
    refresh_cache: Optional[bool] = False,
    use_gpt_index_import: bool = False,
) -> Type[BaseReader]:
    """Download a single loader from the Loader Hub.

    The loader files are stored in a `llamahub_modules` directory next to this
    file. Any `requirements.txt` shipped with the loader is installed with pip
    when a requirement is missing, and the loader's `base.py` is then executed
    to obtain the class.

    NOTE: this installs packages and executes code downloaded from
    `loader_hub_url`; only point it at a hub you trust.

    Args:
        loader_class: The name of the loader class you want to download,
            such as `SimpleWebPageReader`.
        loader_hub_url: Base URL that `library.json` and the loader files are
            fetched from.
        refresh_cache: If true, the local cache will be skipped and the
            loader will be fetched directly from the remote repo.
        use_gpt_index_import: If true, the loader files will use
            gpt_index as the base dependency. By default (False),
            the loader files use llama_index as the base dependency.
            NOTE: this is a temporary workaround while we fully migrate all usages
            to llama_index.

    Returns:
        A Loader.

    Raises:
        ValueError: If the loader class is not listed in the library, if a
            required file cannot be fetched (non-200 response), or if the
            downloaded `base.py` cannot be located for import.
        subprocess.CalledProcessError: If installing the requirements fails.
    """
    dirpath = Path(__file__).parent / "llamahub_modules"
    # Make sure the local modules directory exists and is a package.
    os.makedirs(dirpath, exist_ok=True)
    modules_init_path = dirpath / "__init__.py"
    if not modules_init_path.exists():
        modules_init_path.touch()

    library_path = f"{dirpath}/library.json"
    loader_id = None  # e.g. `web/simple_web`
    extra_files = []  # e.g. `web/simple_web/utils.py`

    # Check cache first
    if not refresh_cache and os.path.exists(library_path):
        try:
            with open(library_path, encoding="utf-8") as f:
                library = json.load(f)
        except ValueError:
            # Corrupt or undecodable cache (JSONDecodeError and
            # UnicodeDecodeError are both ValueErrors): ignore it and
            # fall through to a fresh download.
            library = {}
        if loader_class in library:
            loader_id = library[loader_class]["id"]
            extra_files = library[loader_class].get("extra_files", [])

    # Fetch up-to-date library from remote repo if loader_id not found
    if loader_id is None:
        library_raw_content, status_code = _get_file_content(
            loader_hub_url, "/library.json"
        )
        if status_code != 200:
            raise ValueError(
                f"Could not fetch library.json from {loader_hub_url} "
                f"(HTTP status {status_code})."
            )
        library = json.loads(library_raw_content)
        if loader_class not in library:
            raise ValueError("Loader class name not found in library")

        loader_id = library[loader_class]["id"]
        extra_files = library[loader_class].get("extra_files", [])
        # Update cache
        with open(library_path, "w", encoding="utf-8") as f:
            f.write(library_raw_content)

    # Load the module
    loader_path = f"{dirpath}/{loader_id}"
    base_path = f"{loader_path}/base.py"
    requirements_path = f"{loader_path}/requirements.txt"

    # base.py is written last, so its presence marks a complete download.
    # An interrupted earlier attempt is therefore retried, not reused.
    if refresh_cache or not os.path.exists(base_path):
        basepy_raw_content, status_code = _get_file_content(
            loader_hub_url, f"/{loader_id}/base.py"
        )
        if status_code != 200:
            raise ValueError(
                f"Could not fetch /{loader_id}/base.py from {loader_hub_url} "
                f"(HTTP status {status_code})."
            )
        if use_gpt_index_import:
            basepy_raw_content = basepy_raw_content.replace(
                "import llama_index", "import gpt_index"
            )
            basepy_raw_content = basepy_raw_content.replace(
                "from llama_index", "from gpt_index"
            )

        os.makedirs(loader_path, exist_ok=True)

        # Get content of extra files if there are any
        # and write them under the loader directory
        for extra_file in extra_files:
            extra_file_raw_content, status_code = _get_file_content(
                loader_hub_url, f"/{loader_id}/{extra_file}"
            )
            if status_code != 200:
                raise ValueError(
                    f"Could not fetch /{loader_id}/{extra_file} from "
                    f"{loader_hub_url} (HTTP status {status_code})."
                )
            # If the extra file is an __init__.py file, we need to
            # add the exports to the __init__.py file in the modules directory
            if extra_file == "__init__.py":
                loader_exports = [
                    name for name in get_exports(extra_file_raw_content) if name
                ]
                if loader_exports:
                    # Read the current content BEFORE touching the file so the
                    # existing imports and exports are preserved.
                    with open(modules_init_path) as f:
                        init_content = f.read()
                    existing_exports = [
                        name for name in get_exports(init_content) if name
                    ]
                    # Loader ids use `/` as separator (e.g. `web/simple_web`),
                    # which is not valid in an import statement.
                    module_name = loader_id.replace("/", ".")
                    import_line = (
                        f"from .{module_name} import {', '.join(loader_exports)}"
                    )
                    # Append (never overwrite) and avoid duplicating the line
                    # when the same loader is downloaded again.
                    if import_line not in init_content.splitlines():
                        with open(modules_init_path, "a") as f:
                            if init_content and not init_content.endswith("\n"):
                                f.write("\n")
                            f.write(import_line + "\n")
                    rewrite_exports(existing_exports + loader_exports)
            extra_path = f"{loader_path}/{extra_file}"
            # Extra files may live in sub-directories of the loader.
            os.makedirs(os.path.dirname(extra_path), exist_ok=True)
            with open(extra_path, "w", encoding="utf-8") as f:
                f.write(extra_file_raw_content)

        with open(base_path, "w", encoding="utf-8") as f:
            f.write(basepy_raw_content)

    if not os.path.exists(requirements_path):
        # requirements.txt is optional: only keep it when the hub has one.
        response_txt, status_code = _get_file_content(
            loader_hub_url, f"/{loader_id}/requirements.txt"
        )
        if status_code == 200:
            with open(requirements_path, "w", encoding="utf-8") as f:
                f.write(response_txt)

    # Install dependencies if there are any and not already installed
    if os.path.exists(requirements_path):
        try:
            # parse_requirements is lazy, so consume it while the file is open.
            with open(requirements_path, encoding="utf-8") as f:
                requirements = [
                    str(requirement)
                    for requirement in pkg_resources.parse_requirements(f)
                ]
            pkg_resources.require(requirements)
        except DistributionNotFound:
            subprocess.check_call(
                [sys.executable, "-m", "pip", "install", "-r", requirements_path]
            )
    spec = util.spec_from_file_location("custom_loader", location=base_path)
    if spec is None:
        raise ValueError(f"Could not find file: {loader_path}/base.py.")
    module = util.module_from_spec(spec)
    spec.loader.exec_module(module)  # type: ignore

    return getattr(module, loader_class)