"""Obsidian reader class. Pass in the path to an Obsidian vault and it will parse all markdown files into a List of Documents, with each Document containing text from under an Obsidian header. """ import os from pathlib import Path from typing import Any, List from langchain.docstore.document import Document as LCDocument from gpt_index.readers.base import BaseReader from gpt_index.readers.file.markdown_parser import MarkdownParser from gpt_index.readers.schema.base import Document class ObsidianReader(BaseReader): """Utilities for loading data from an Obsidian Vault. Args: input_dir (str): Path to the vault. """ def __init__(self, input_dir: str): """Init params.""" self.input_dir = Path(input_dir) def load_data(self, *args: Any, **load_kwargs: Any) -> List[Document]: """Load data from the input directory.""" docs: List[str] = [] for dirpath, dirnames, filenames in os.walk(self.input_dir): dirnames[:] = [d for d in dirnames if not d.startswith(".")] for filename in filenames: if filename.endswith(".md"): filepath = os.path.join(dirpath, filename) content = MarkdownParser().parse_file(Path(filepath)) docs.extend(content) return [Document(d) for d in docs] def load_langchain_documents(self, **load_kwargs: Any) -> List[LCDocument]: """Load data in LangChain document format.""" docs = self.load_data(**load_kwargs) return [d.to_langchain_format() for d in docs]