File size: 7,399 Bytes
35b22df
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
"""Download loader from the Loader Hub."""

import json
import os
import subprocess
import sys
from importlib import util
from pathlib import Path
from typing import List, Optional, Tuple, Type

import pkg_resources
import requests
from pkg_resources import DistributionNotFound

from gpt_index.readers.base import BaseReader

# Base URL for raw file contents on the `main` branch of the loader-hub repo.
LLAMA_HUB_CONTENTS_URL = "https://raw.githubusercontent.com/emptycrown/loader-hub/main"
# Sub-path appended to the contents URL to reach the loader files.
LOADER_HUB_PATH = "/loader_hub"
# Full hub URL; used as the default `loader_hub_url` of `download_loader`.
LOADER_HUB_URL = LLAMA_HUB_CONTENTS_URL + LOADER_HUB_PATH


def _get_file_content(
    loader_hub_url: str, path: str, timeout: float = 30.0
) -> Tuple[str, int]:
    """Fetch a file from the loader hub with an HTTP GET request.

    The request URL is the plain concatenation `loader_hub_url + path`, so
    `path` is expected to start with a `/`.

    Args:
        loader_hub_url: Base URL of the loader hub.
        path: Path of the file, appended verbatim to `loader_hub_url`.
        timeout: Seconds to wait for the server before giving up. Without
            a timeout a stalled connection would block the caller forever.

    Returns:
        A tuple of the response body as text and the HTTP status code. The
        status code is NOT checked here; callers decide how to treat
        non-200 responses.

    """
    resp = requests.get(loader_hub_url + path, timeout=timeout)
    return resp.text, resp.status_code


def get_exports(raw_content: str) -> List:
    """Read content of a Python file and return a list of exported class names.

    For example:
    ```python
    from .a import A
    from .b import B

    __all__ = ["A", "B"]
    ```
    will return `["A", "B"]`.

    Only single-line `__all__` assignments are understood. If several lines
    start with `__all__`, the last one that contains an `=` wins.

    Args:
        - raw_content: The content of a Python file as a string.

    Returns:
        A list of exported class names. Empty when no `__all__` assignment is
        found or when the assigned list is empty.

    """
    exports: List[str] = []
    for line in raw_content.splitlines():
        line = line.strip()
        if not line.startswith("__all__"):
            continue
        # Lines such as `__all__.append("X")` have no assignment to parse.
        _, separator, value = line.partition("=")
        if not separator:
            continue
        # Drop a trailing comment so it does not leak into the last name.
        value = value.split("#")[0]
        names = value.strip().strip("[").strip("]").split(",")
        cleaned = (name.strip().strip("'").strip('"') for name in names)
        # An empty list or a trailing comma produces empty fragments; skip them.
        exports = [name for name in cleaned if name]
    return exports


def rewrite_exports(exports: List[str]) -> None:
    """Write the `__all__` variable to the `__init__.py` file in the modules dir.

    Removes every line that starts with `__all__` (ignoring leading
    whitespace) and appends a new line with the updated `__all__` variable.
    All other lines are kept, including their indentation; only trailing
    whitespace is dropped.

    Args:
        - exports: A list of exported class names. Duplicates are removed and
            the names are written in sorted order so the output is stable.

    """
    init_path = Path(__file__).parent / "llamahub_modules" / "__init__.py"
    with open(init_path, "r") as f:
        lines = f.readlines()

    kept_lines = [
        line.rstrip() for line in lines if not line.strip().startswith("__all__")
    ]
    unique_exports = sorted(set(exports))

    with open(init_path, "w") as f:
        # Use "\n", not os.linesep: text mode already translates newlines, so
        # os.linesep would produce "\r\r\n" on Windows.
        for line in kept_lines:
            f.write(line + "\n")
        f.write(f"__all__ = {unique_exports}\n")


def _fetch_loader_file(loader_hub_url: str, path: str) -> str:
    """Return the text of a required hub file, raising if the fetch failed."""
    content, status_code = _get_file_content(loader_hub_url, path)
    if status_code != 200:
        # Never cache an HTTP error page as if it were the requested file.
        raise ValueError(
            f"Could not fetch {loader_hub_url}{path} (HTTP status {status_code})."
        )
    return content


def _register_loader_exports(
    dirpath: Path, loader_id: str, loader_exports: List[str]
) -> None:
    """Re-export a loader's public names from the modules dir `__init__.py`."""
    loader_exports = [name for name in loader_exports if name]
    if not loader_exports:
        return

    init_path = dirpath / "__init__.py"
    with open(init_path, "r") as f:
        init_content = f.read()
    existing_exports = [name for name in get_exports(init_content) if name]

    # Loader ids are slash-separated paths (e.g. `web/simple_web`); a relative
    # import needs the dotted module form.
    module_name = loader_id.strip("/").replace("/", ".")
    import_line = f"from .{module_name} import {', '.join(loader_exports)}"
    existing_lines = [line.strip() for line in init_content.splitlines()]
    if import_line not in existing_lines:
        # Append rather than write at offset 0, which would overwrite the
        # beginning of the existing file content.
        with open(init_path, "a") as f:
            if init_content and not init_content.endswith("\n"):
                f.write("\n")
            f.write(import_line + "\n")

    rewrite_exports(existing_exports + loader_exports)


def download_loader(
    loader_class: str,
    loader_hub_url: str = LOADER_HUB_URL,
    refresh_cache: Optional[bool] = False,
    use_gpt_index_import: bool = False,
) -> Type[BaseReader]:
    """Download a single loader from the Loader Hub.

    The loader files are cached in a `llamahub_modules` directory next to this
    file. If the loader ships a `requirements.txt` whose requirements are not
    all installed, they are installed with `pip` in the current interpreter.

    NOTE: this function executes code fetched from `loader_hub_url` and may
    pip-install packages named by it, and the loader id taken from the remote
    `library.json` is used to build local file paths. Only point
    `loader_hub_url` at a source you trust.

    Args:
        loader_class: The name of the loader class you want to download,
            such as `SimpleWebPageReader`.
        loader_hub_url: Base URL of the hub that serves `library.json` and
            the loader files.
        refresh_cache: If true, the local cache will be skipped and the
            loader will be fetched directly from the remote repo.
        use_gpt_index_import: If true, the loader files will use
            gpt_index as the base dependency. By default (False),
            the loader files use llama_index as the base dependency.
            NOTE: this is a temporary workaround while we fully migrate all usages
            to llama_index.

    Returns:
        A Loader.

    Raises:
        ValueError: If the loader class is not in the library, if a required
            file cannot be fetched from the hub, or if the loader module
            cannot be located on disk.
    """
    dirpath = Path(__file__).parent / "llamahub_modules"
    os.makedirs(dirpath, exist_ok=True)
    init_path = dirpath / "__init__.py"
    if not init_path.exists():
        # Create an empty __init__.py file if it does not exist yet
        init_path.touch()

    library_path = f"{dirpath}/library.json"
    loader_id = None  # e.g. `web/simple_web`
    extra_files: List[str] = []  # e.g. `web/simple_web/utils.py`

    # Check cache first
    if not refresh_cache and os.path.exists(library_path):
        with open(library_path) as f:
            library = json.load(f)
        if loader_class in library:
            loader_id = library[loader_class]["id"]
            extra_files = library[loader_class].get("extra_files", [])

    # Fetch up-to-date library from remote repo if loader_id not found
    if loader_id is None:
        library_raw_content = _fetch_loader_file(loader_hub_url, "/library.json")
        library = json.loads(library_raw_content)
        if loader_class not in library:
            raise ValueError("Loader class name not found in library")

        loader_id = library[loader_class]["id"]
        extra_files = library[loader_class].get("extra_files", [])
        # Update cache
        with open(library_path, "w") as f:
            f.write(library_raw_content)

    if loader_id is None:
        raise ValueError("Loader class name not found in library")

    loader_path = f"{dirpath}/{loader_id}"
    base_path = f"{loader_path}/base.py"
    requirements_path = f"{loader_path}/requirements.txt"

    # Key the cache on base.py rather than on the directory, so a previously
    # interrupted download is retried instead of leaving a broken loader.
    if refresh_cache or not os.path.exists(base_path):
        # Download everything before touching the disk so that a failed fetch
        # does not leave a half-written loader behind.
        basepy_raw_content = _fetch_loader_file(
            loader_hub_url, f"/{loader_id}/base.py"
        )
        if use_gpt_index_import:
            basepy_raw_content = basepy_raw_content.replace(
                "import llama_index", "import gpt_index"
            )
            basepy_raw_content = basepy_raw_content.replace(
                "from llama_index", "from gpt_index"
            )
        extra_contents = [
            (
                extra_file,
                _fetch_loader_file(loader_hub_url, f"/{loader_id}/{extra_file}"),
            )
            for extra_file in extra_files
        ]

        os.makedirs(loader_path, exist_ok=True)
        with open(base_path, "w") as f:
            f.write(basepy_raw_content)

        # Write the extra files under the loader directory
        for extra_file, extra_file_raw_content in extra_contents:
            # If the extra file is an __init__.py file, we need to
            # add the exports to the __init__.py file in the modules directory
            if extra_file == "__init__.py":
                _register_loader_exports(
                    dirpath, loader_id, get_exports(extra_file_raw_content)
                )
            with open(f"{loader_path}/{extra_file}", "w") as f:
                f.write(extra_file_raw_content)

    if not os.path.exists(requirements_path):
        # A loader without dependencies has no requirements.txt, so a
        # non-200 response here is expected and simply skipped.
        response_txt, status_code = _get_file_content(
            loader_hub_url, f"/{loader_id}/requirements.txt"
        )
        if status_code == 200:
            with open(requirements_path, "w") as f:
                f.write(response_txt)

    # Install dependencies if there are any and not already installed
    if os.path.exists(requirements_path):
        try:
            # parse_requirements is lazy, so consume it while the file is open.
            with open(requirements_path) as f:
                requirements = [str(r) for r in pkg_resources.parse_requirements(f)]
            pkg_resources.require(requirements)
        except DistributionNotFound:
            subprocess.check_call(
                [sys.executable, "-m", "pip", "install", "-r", requirements_path]
            )

    spec = util.spec_from_file_location("custom_loader", location=base_path)
    if spec is None or spec.loader is None:
        raise ValueError(f"Could not find file: {loader_path}/base.py.")
    module = util.module_from_spec(spec)
    spec.loader.exec_module(module)

    return getattr(module, loader_class)