Spaces:
Runtime error
Runtime error
File size: 7,399 Bytes
35b22df |
|
"""Download loader from the Loader Hub."""
import json
import os
import subprocess
import sys
from importlib import util
from pathlib import Path
from typing import List, Optional, Tuple, Type
import pkg_resources
import requests
from pkg_resources import DistributionNotFound
from gpt_index.readers.base import BaseReader
# Base URL under which the raw contents of the loader-hub repository are served.
LLAMA_HUB_CONTENTS_URL = "https://raw.githubusercontent.com/emptycrown/loader-hub/main"
# Path, relative to the base URL above, that is appended to form the hub URL.
LOADER_HUB_PATH = "/loader_hub"
# Default hub URL; used as the default `loader_hub_url` of `download_loader`.
LOADER_HUB_URL = LLAMA_HUB_CONTENTS_URL + LOADER_HUB_PATH
def _get_file_content(
    loader_hub_url: str, path: str, timeout: float = 30.0
) -> Tuple[str, int]:
    """Fetch a file from the loader hub over HTTP.

    The request URL is simply `loader_hub_url + path`.

    Args:
        loader_hub_url: Base URL of the loader hub.
        path: Path of the file relative to the hub URL, starting with `/`.
        timeout: Seconds to wait for the server before giving up. Without a
            timeout `requests` waits indefinitely on a stalled connection.

    Returns:
        A `(text, status_code)` tuple with the response body and the HTTP
        status code. Callers are responsible for checking the status code.
    """
    resp = requests.get(loader_hub_url + path, timeout=timeout)
    return resp.text, resp.status_code
def get_exports(raw_content: str) -> List:
    """Read content of a Python file and return a list of exported class names.

    For example:
    ```python
    from .a import A
    from .b import B
    __all__ = ["A", "B"]
    ```
    will return `["A", "B"]`.

    Only single-line `__all__ = [...]` assignments are understood. If several
    such lines are present, the last one wins.

    Args:
        - raw_content: The content of a Python file as a string.

    Returns:
        A list of exported class names. Empty when no `__all__` assignment is
        found or when the list is empty.
    """
    exports = []
    for line in raw_content.splitlines():
        line = line.strip()
        if not line.startswith("__all__"):
            continue
        parts = line.split("=")
        if len(parts) < 2:
            # e.g. `__all__.append("X")`: mentions __all__ but assigns nothing
            continue
        names = parts[1].strip().strip("[").strip("]").split(",")
        names = [name.strip().strip("'").strip('"') for name in names]
        # Drop the empty entries produced by `[]` or a trailing comma
        exports = [name for name in names if name]
    return exports
def rewrite_exports(exports: List[str]) -> None:
    """Write the `__all__` variable to the `__init__.py` file in the modules dir.

    Removes every line that starts with `__all__` and appends a new line with
    the updated `__all__` variable. All other lines are kept, including their
    leading indentation.

    Args:
        - exports: A list of exported class names. Duplicates are removed and
          the names are written in sorted order so the output is stable.
    """
    dirpath = Path(__file__).parent / "llamahub_modules"
    init_path = f"{dirpath}/__init__.py"
    with open(init_path, "r") as f:
        lines = f.readlines()
    # Only trailing whitespace is trimmed: stripping the left side would
    # destroy the indentation of any nested statement in the file.
    kept_lines = [
        line.rstrip() for line in lines if not line.strip().startswith("__all__")
    ]
    with open(init_path, "w") as f:
        # The file is opened in text mode, which already translates "\n" to the
        # platform line ending; writing os.linesep here would yield "\r\r\n"
        # on Windows.
        for line in kept_lines:
            f.write(line + "\n")
        f.write(f"__all__ = {sorted(set(exports))}\n")
def download_loader(
    loader_class: str,
    loader_hub_url: str = LOADER_HUB_URL,
    refresh_cache: Optional[bool] = False,
    use_gpt_index_import: bool = False,
) -> Type[BaseReader]:
    """Download a single loader from the Loader Hub.

    The loader files are cached under the `llamahub_modules` directory next to
    this file, together with a copy of the hub's `library.json`. Requirements
    listed by the loader are installed with pip if they are not yet satisfied.

    Args:
        loader_class: The name of the loader class you want to download,
            such as `SimpleWebPageReader`.
        loader_hub_url: Base URL of the loader hub to download from.
        refresh_cache: If true, the local cache will be skipped and the
            loader will be fetched directly from the remote repo.
        use_gpt_index_import: If true, the loader files will use
            gpt_index as the base dependency. By default (False),
            the loader files use llama_index as the base dependency.
            NOTE: this is a temporary workaround while we fully migrate all usages
            to llama_index.

    Returns:
        A Loader.

    Raises:
        ValueError: If the loader class is not listed in the library, if a
            required file cannot be downloaded, or if the downloaded module
            cannot be loaded.
    """
    dirpath = Path(__file__).parent / "llamahub_modules"
    # Create the local modules package on first use
    os.makedirs(dirpath, exist_ok=True)
    init_path = dirpath / "__init__.py"
    if not os.path.exists(init_path):
        # Create an empty __init__.py file if it does not exist yet
        with open(init_path, "w"):
            pass

    library_path = f"{dirpath}/library.json"
    loader_id = None  # e.g. `web/simple_web`
    extra_files = []  # e.g. `web/simple_web/utils.py`

    # Check cache first
    if not refresh_cache and os.path.exists(library_path):
        with open(library_path) as f:
            library = json.load(f)
        if loader_class in library:
            loader_id = library[loader_class]["id"]
            extra_files = library[loader_class].get("extra_files", [])

    # Fetch up-to-date library from remote repo if loader_id not found
    if loader_id is None:
        library_raw_content, status_code = _get_file_content(
            loader_hub_url, "/library.json"
        )
        if status_code != 200:
            raise ValueError(
                f"Could not download library.json (HTTP status {status_code})."
            )
        library = json.loads(library_raw_content)
        if loader_class not in library:
            raise ValueError("Loader class name not found in library")
        loader_id = library[loader_class]["id"]
        extra_files = library[loader_class].get("extra_files", [])
        # Update cache
        with open(library_path, "w") as f:
            f.write(library_raw_content)

    # Load the module
    loader_path = f"{dirpath}/{loader_id}"
    requirements_path = f"{loader_path}/requirements.txt"

    if refresh_cache or not os.path.exists(loader_path):
        # Download everything before touching the disk: the existence of
        # `loader_path` is what marks the loader as cached, so a failed
        # download must not leave a partial directory behind.
        downloaded = {}
        for file_name in ["base.py", *extra_files]:
            content, status_code = _get_file_content(
                loader_hub_url, f"/{loader_id}/{file_name}"
            )
            if status_code != 200:
                raise ValueError(
                    f"Could not download {loader_id}/{file_name} "
                    f"(HTTP status {status_code})."
                )
            downloaded[file_name] = content

        if use_gpt_index_import:
            downloaded["base.py"] = (
                downloaded["base.py"]
                .replace("import llama_index", "import gpt_index")
                .replace("from llama_index", "from gpt_index")
            )

        os.makedirs(loader_path, exist_ok=True)
        for file_name, content in downloaded.items():
            # If the extra file is an __init__.py file, we need to
            # add the exports to the __init__.py file in the modules directory
            if file_name == "__init__.py":
                _register_loader_exports(init_path, loader_id, content)
            target_path = f"{loader_path}/{file_name}"
            # Extra files may live in a sub-directory of the loader
            os.makedirs(os.path.dirname(target_path), exist_ok=True)
            with open(target_path, "w") as f:
                f.write(content)

    if not os.path.exists(requirements_path):
        # A loader is not required to ship a requirements.txt, so a non-200
        # response here simply means there is nothing to install.
        response_txt, status_code = _get_file_content(
            loader_hub_url, f"/{loader_id}/requirements.txt"
        )
        if status_code == 200:
            with open(requirements_path, "w") as f:
                f.write(response_txt)

    # Install dependencies if there are any and not already installed
    if os.path.exists(requirements_path):
        try:
            # parse_requirements is lazy, so consume it while the file is open
            with open(requirements_path) as f:
                requirements = [str(r) for r in pkg_resources.parse_requirements(f)]
            pkg_resources.require(requirements)
        except DistributionNotFound:
            subprocess.check_call(
                [sys.executable, "-m", "pip", "install", "-r", requirements_path]
            )

    spec = util.spec_from_file_location(
        "custom_loader", location=f"{loader_path}/base.py"
    )
    if spec is None or spec.loader is None:
        raise ValueError(f"Could not find file: {loader_path}/base.py.")
    module = util.module_from_spec(spec)
    spec.loader.exec_module(module)
    return getattr(module, loader_class)


def _register_loader_exports(init_path, loader_id, loader_init_content):
    """Re-export a loader's public names from the modules `__init__.py`."""
    # Ignore empty names so that an empty `__all__` never produces a
    # dangling `from ... import` statement.
    loader_exports = [name for name in get_exports(loader_init_content) if name]
    if not loader_exports:
        return

    # Read the current content *before* writing, otherwise the existing
    # exports cannot be recovered.
    with open(init_path) as f:
        init_content = f.read()
    existing_exports = [name for name in get_exports(init_content) if name]

    # Loader ids are slash-separated paths; an import needs a dotted path
    module_name = loader_id.replace("/", ".")
    import_line = f"from .{module_name} import {', '.join(loader_exports)}"
    already_present = any(
        line.strip() == import_line for line in init_content.splitlines()
    )
    if not already_present:
        # Append instead of overwriting the start of the file
        with open(init_path, "a") as f:
            if init_content and not init_content.endswith("\n"):
                f.write("\n")
            f.write(import_line + "\n")

    rewrite_exports(existing_exports + loader_exports)
|