"""Download loader from the Loader Hub."""
import json
import os
import subprocess
import sys
from importlib import util
from pathlib import Path
from typing import List, Optional, Tuple, Type

import pkg_resources
import requests
from pkg_resources import DistributionNotFound

from gpt_index.readers.base import BaseReader

LLAMA_HUB_CONTENTS_URL = "https://raw.githubusercontent.com/emptycrown/loader-hub/main"
LOADER_HUB_PATH = "/loader_hub"
LOADER_HUB_URL = LLAMA_HUB_CONTENTS_URL + LOADER_HUB_PATH


def _get_file_content(loader_hub_url: str, path: str) -> Tuple[str, int]:
    """Get the content of a file from the GitHub REST API."""
    resp = requests.get(loader_hub_url + path)
    return resp.text, resp.status_code


def get_exports(raw_content: str) -> List:
    """Read the content of a Python file and return a list of exported class names.

    For example:
    ```python
    from .a import A
    from .b import B
    __all__ = ["A", "B"]
    ```
    will return `["A", "B"]`.

    Args:
        - raw_content: The content of a Python file as a string.

    Returns:
        A list of exported class names.
    """
    exports = []
    for line in raw_content.splitlines():
        line = line.strip()
        if line.startswith("__all__"):
            exports = line.split("=")[1].strip().strip("[").strip("]").split(",")
            exports = [export.strip().strip("'").strip('"') for export in exports]
    return exports
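# Illustrative sanity check of the parsing above (mirrors the docstring
# example; shown as a comment only, nothing here runs on import):
#
#     >>> get_exports('from .a import A\nfrom .b import B\n__all__ = ["A", "B"]')
#     ['A', 'B']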
def rewrite_exports(exports: List[str]) -> None:
    """Write the `__all__` variable to the `__init__.py` file in the modules dir.

    Removes the line that contains `__all__` and appends a new line with the updated
    `__all__` variable.

    Args:
        - exports: A list of exported class names.
    """
    dirpath = Path(__file__).parent / "llamahub_modules"
    init_path = f"{dirpath}/__init__.py"
    with open(init_path, "r") as f:
        lines = f.readlines()
    with open(init_path, "w") as f:
        for line in lines:
            line = line.strip()
            if line.startswith("__all__"):
                continue
            f.write(line + os.linesep)
        f.write(f"__all__ = {list(set(exports))}" + os.linesep)
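# Sketch of the intended effect (the file contents below are hypothetical):
# if `llamahub_modules/__init__.py` currently reads
#
#     from .web.simple_web import SimpleWebPageReader
#     __all__ = ['SimpleWebPageReader']
#
# then `rewrite_exports(["SimpleWebPageReader", "BeautifulSoupWebReader"])`
# drops the old `__all__` line, keeps the import lines, and appends a fresh
# `__all__` containing the deduplicated export list.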
def download_loader(
    loader_class: str,
    loader_hub_url: str = LOADER_HUB_URL,
    refresh_cache: Optional[bool] = False,
    use_gpt_index_import: bool = False,
) -> Type[BaseReader]:
    """Download a single loader from the Loader Hub.

    Args:
        loader_class: The name of the loader class you want to download,
            such as `SimpleWebPageReader`.
        refresh_cache: If true, the local cache will be skipped and the
            loader will be fetched directly from the remote repo.
        use_gpt_index_import: If true, the loader files will use
            gpt_index as the base dependency. By default (False),
            the loader files use llama_index as the base dependency.
            NOTE: this is a temporary workaround while we fully migrate all usages
            to llama_index.

    Returns:
        A Loader.
    """
    dirpath = Path(__file__).parent / "llamahub_modules"
    if not os.path.exists(dirpath):
        # Create a new directory because it does not exist
        os.makedirs(dirpath)
    if not os.path.exists(f"{dirpath}/__init__.py"):
        # Create an empty __init__.py file if it does not exist yet
        with open(f"{dirpath}/__init__.py", "w") as f:
            pass

    library_path = f"{dirpath}/library.json"
    loader_id = None  # e.g. `web/simple_web`
    extra_files = []  # e.g. `web/simple_web/utils.py`

    # Check cache first
    if not refresh_cache and os.path.exists(library_path):
        with open(library_path) as f:
            library = json.load(f)
        if loader_class in library:
            loader_id = library[loader_class]["id"]
            extra_files = library[loader_class].get("extra_files", [])

    # Fetch up-to-date library from remote repo if loader_id not found
    if loader_id is None:
        library_raw_content, _ = _get_file_content(loader_hub_url, "/library.json")
        library = json.loads(library_raw_content)
        if loader_class not in library:
            raise ValueError("Loader class name not found in library")
        loader_id = library[loader_class]["id"]
        extra_files = library[loader_class].get("extra_files", [])
        # Update cache
        with open(library_path, "w") as f:
            f.write(library_raw_content)

    if loader_id is None:
        raise ValueError("Loader class name not found in library")
    # Load the module
    loader_path = f"{dirpath}/{loader_id}"
    requirements_path = f"{loader_path}/requirements.txt"
    if refresh_cache or not os.path.exists(loader_path):
        os.makedirs(loader_path, exist_ok=True)
        basepy_raw_content, _ = _get_file_content(
            loader_hub_url, f"/{loader_id}/base.py"
        )
        if use_gpt_index_import:
            basepy_raw_content = basepy_raw_content.replace(
                "import llama_index", "import gpt_index"
            )
            basepy_raw_content = basepy_raw_content.replace(
                "from llama_index", "from gpt_index"
            )
        with open(f"{loader_path}/base.py", "w") as f:
            f.write(basepy_raw_content)

        # Get content of extra files if there are any
        # and write them under the loader directory
        for extra_file in extra_files:
            extra_file_raw_content, _ = _get_file_content(
                loader_hub_url, f"/{loader_id}/{extra_file}"
            )
            # If the extra file is an __init__.py file, we need to
            # add the exports to the __init__.py file in the modules directory
            if extra_file == "__init__.py":
                loader_exports = get_exports(extra_file_raw_content)
                existing_exports = []
                with open(dirpath / "__init__.py", "r+") as f:
                    # Read the existing exports first so the new import line is
                    # appended rather than overwriting the start of the file.
                    existing_exports = get_exports(f.read())
                    f.write(
                        f"from .{loader_id} import {', '.join(loader_exports)}\n"
                    )
                rewrite_exports(existing_exports + loader_exports)
            with open(f"{loader_path}/{extra_file}", "w") as f:
                f.write(extra_file_raw_content)

    if not os.path.exists(requirements_path):
        # NOTE: need to check the status code
        response_txt, status_code = _get_file_content(
            loader_hub_url, f"/{loader_id}/requirements.txt"
        )
        if status_code == 200:
            with open(requirements_path, "w") as f:
                f.write(response_txt)

    # Install dependencies if there are any and not already installed
    if os.path.exists(requirements_path):
        try:
            requirements = pkg_resources.parse_requirements(
                Path(requirements_path).open()
            )
            pkg_resources.require([str(r) for r in requirements])
        except DistributionNotFound:
            subprocess.check_call(
                [sys.executable, "-m", "pip", "install", "-r", requirements_path]
            )

    spec = util.spec_from_file_location(
        "custom_loader", location=f"{loader_path}/base.py"
    )
    if spec is None:
        raise ValueError(f"Could not find file: {loader_path}/base.py.")
    module = util.module_from_spec(spec)
    spec.loader.exec_module(module)  # type: ignore
    return getattr(module, loader_class)
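# Example usage: a minimal, hedged sketch that only runs when this file is
# executed directly, never on import. It assumes `SimpleWebPageReader` is one
# of the loader classes published in the hub's library.json and exposes a
# `load_data(urls=...)` method; adjust to the loaders actually available.
if __name__ == "__main__":
    reader_cls = download_loader("SimpleWebPageReader")
    reader = reader_cls()
    documents = reader.load_data(urls=["https://example.com"])
    print(f"Loaded {len(documents)} document(s)")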