"""Download loader from the Loader Hub.""" import json import os import subprocess import sys from importlib import util from pathlib import Path from typing import List, Optional, Tuple, Type import pkg_resources import requests from pkg_resources import DistributionNotFound from gpt_index.readers.base import BaseReader LLAMA_HUB_CONTENTS_URL = "https://raw.githubusercontent.com/emptycrown/loader-hub/main" LOADER_HUB_PATH = "/loader_hub" LOADER_HUB_URL = LLAMA_HUB_CONTENTS_URL + LOADER_HUB_PATH def _get_file_content(loader_hub_url: str, path: str) -> Tuple[str, int]: """Get the content of a file from the GitHub REST API.""" resp = requests.get(loader_hub_url + path) return resp.text, resp.status_code def get_exports(raw_content: str) -> List: """Read content of a Python file and returns a list of exported class names. For example: ```python from .a import A from .b import B __all__ = ["A", "B"] ``` will return `["A", "B"]`. Args: - raw_content: The content of a Python file as a string. Returns: A list of exported class names. """ exports = [] for line in raw_content.splitlines(): line = line.strip() if line.startswith("__all__"): exports = line.split("=")[1].strip().strip("[").strip("]").split(",") exports = [export.strip().strip("'").strip('"') for export in exports] return exports def rewrite_exports(exports: List[str]) -> None: """Write the `__all__` variable to the `__init__.py` file in the modules dir. Removes the line that contains `__all__` and appends a new line with the updated `__all__` variable. Args: - exports: A list of exported class names. """ dirpath = Path(__file__).parent / "llamahub_modules" init_path = f"{dirpath}/__init__.py" with open(init_path, "r") as f: lines = f.readlines() with open(init_path, "w") as f: for line in lines: line = line.strip() if line.startswith("__all__"): continue f.write(line + os.linesep) f.write(f"__all__ = {list(set(exports))}" + os.linesep) def download_loader( loader_class: str, loader_hub_url: str = LOADER_HUB_URL, refresh_cache: Optional[bool] = False, use_gpt_index_import: bool = False, ) -> Type[BaseReader]: """Download a single loader from the Loader Hub. Args: loader_class: The name of the loader class you want to download, such as `SimpleWebPageReader`. refresh_cache: If true, the local cache will be skipped and the loader will be fetched directly from the remote repo. use_gpt_index_import: If true, the loader files will use gpt_index as the base dependency. By default (False), the loader files use llama_index as the base dependency. NOTE: this is a temporary workaround while we fully migrate all usages to llama_index. Returns: A Loader. """ dirpath = Path(__file__).parent / "llamahub_modules" if not os.path.exists(dirpath): # Create a new directory because it does not exist os.makedirs(dirpath) if not os.path.exists(f"{dirpath}/__init__.py"): # Create an empty __init__.py file if it does not exist yet with open(f"{dirpath}/__init__.py", "w") as f: pass library_path = f"{dirpath}/library.json" loader_id = None # e.g. `web/simple_web` extra_files = [] # e.g. 
def download_loader(
    loader_class: str,
    loader_hub_url: str = LOADER_HUB_URL,
    refresh_cache: Optional[bool] = False,
    use_gpt_index_import: bool = False,
) -> Type[BaseReader]:
    """Download a single loader from the Loader Hub.

    Args:
        loader_class: The name of the loader class you want to download,
            such as `SimpleWebPageReader`.
        refresh_cache: If true, the local cache will be skipped and the
            loader will be fetched directly from the remote repo.
        use_gpt_index_import: If true, the loader files will use
            gpt_index as the base dependency. By default (False),
            the loader files use llama_index as the base dependency.
            NOTE: this is a temporary workaround while we fully migrate
            all usages to llama_index.

    Returns:
        The loader class.

    """
    dirpath = Path(__file__).parent / "llamahub_modules"
    if not os.path.exists(dirpath):
        # Create a new directory because it does not exist
        os.makedirs(dirpath)
    if not os.path.exists(f"{dirpath}/__init__.py"):
        # Create an empty __init__.py file if it does not exist yet
        with open(f"{dirpath}/__init__.py", "w") as f:
            pass

    library_path = f"{dirpath}/library.json"
    loader_id = None  # e.g. `web/simple_web`
    extra_files = []  # e.g. `utils.py`, relative to the loader directory

    # Check cache first
    if not refresh_cache and os.path.exists(library_path):
        with open(library_path) as f:
            library = json.load(f)
        if loader_class in library:
            loader_id = library[loader_class]["id"]
            extra_files = library[loader_class].get("extra_files", [])

    # Fetch up-to-date library from remote repo if loader_id not found
    if loader_id is None:
        library_raw_content, _ = _get_file_content(loader_hub_url, "/library.json")
        library = json.loads(library_raw_content)
        if loader_class not in library:
            raise ValueError("Loader class name not found in library")

        loader_id = library[loader_class]["id"]
        extra_files = library[loader_class].get("extra_files", [])
        # Update cache
        with open(library_path, "w") as f:
            f.write(library_raw_content)

    if loader_id is None:
        raise ValueError("Loader class name not found in library")

    # Load the module
    loader_path = f"{dirpath}/{loader_id}"
    requirements_path = f"{loader_path}/requirements.txt"

    if refresh_cache or not os.path.exists(loader_path):
        os.makedirs(loader_path, exist_ok=True)
        basepy_raw_content, _ = _get_file_content(
            loader_hub_url, f"/{loader_id}/base.py"
        )
        if use_gpt_index_import:
            basepy_raw_content = basepy_raw_content.replace(
                "import llama_index", "import gpt_index"
            )
            basepy_raw_content = basepy_raw_content.replace(
                "from llama_index", "from gpt_index"
            )

        with open(f"{loader_path}/base.py", "w") as f:
            f.write(basepy_raw_content)

        # Get content of extra files if there are any
        # and write them under the loader directory
        for extra_file in extra_files:
            extra_file_raw_content, _ = _get_file_content(
                loader_hub_url, f"/{loader_id}/{extra_file}"
            )
            # If the extra file is an __init__.py file, we need to
            # add the exports to the __init__.py file in the modules directory
            if extra_file == "__init__.py":
                loader_exports = get_exports(extra_file_raw_content)
                with open(dirpath / "__init__.py", "r+") as f:
                    # Read the existing exports before writing; writing first
                    # would clobber the start of the file and the subsequent
                    # read would miss the earlier export lines.
                    existing_exports = get_exports(f.read())
                    f.write(
                        f"from .{loader_id} import {', '.join(loader_exports)}"
                        + os.linesep
                    )
                rewrite_exports(existing_exports + loader_exports)
            with open(f"{loader_path}/{extra_file}", "w") as f:
                f.write(extra_file_raw_content)

    if not os.path.exists(requirements_path):
        # NOTE: need to check the status code
        response_txt, status_code = _get_file_content(
            loader_hub_url, f"/{loader_id}/requirements.txt"
        )
        if status_code == 200:
            with open(requirements_path, "w") as f:
                f.write(response_txt)

    # Install dependencies if there are any and not already installed
    if os.path.exists(requirements_path):
        try:
            with Path(requirements_path).open() as f:
                requirements = list(pkg_resources.parse_requirements(f))
            pkg_resources.require([str(r) for r in requirements])
        except DistributionNotFound:
            subprocess.check_call(
                [sys.executable, "-m", "pip", "install", "-r", requirements_path]
            )

    # Import the downloaded `base.py` as a module and return the requested
    # loader class from it.
    spec = util.spec_from_file_location(
        "custom_loader", location=f"{loader_path}/base.py"
    )
    if spec is None:
        raise ValueError(f"Could not find file: {loader_path}/base.py.")
    module = util.module_from_spec(spec)
    spec.loader.exec_module(module)  # type: ignore
    return getattr(module, loader_class)
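

# Minimal usage sketch. Assumes network access to the Loader Hub and that
# `SimpleWebPageReader` (the example class named in the docstring above)
# exists in the remote library; it makes no claim about any particular
# loader's constructor arguments.
if __name__ == "__main__":
    reader_cls = download_loader("SimpleWebPageReader")
    print(f"Downloaded loader class: {reader_cls.__name__}")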