Spaces:
Running
Running
import os | |
from enum import Enum | |
from typing import Any, Dict, List, Optional | |
from langchain.callbacks.manager import CallbackManagerForRetrieverRun | |
from langchain.schema import Document | |
from langchain.schema.retriever import BaseRetriever | |
class SearchAPIRetriever(BaseRetriever):
    """Retriever that serves pre-fetched search results as documents.

    No searching or ranking happens here: every entry of ``pages`` is
    converted to a ``Document`` and returned, in order, whatever the query is.
    """

    # Search results to expose. Each entry is a dict that is read with the
    # keys "raw_content" (document text), "title" and "url"; every key is
    # optional.
    pages: List[Dict] = []

    def _get_relevant_documents(
        self, query: str, *, run_manager: CallbackManagerForRetrieverRun
    ) -> List[Document]:
        """Convert every stored page into a ``Document``.

        Args:
            query: Not used by this implementation.
            run_manager: Not used by this implementation.

        Returns:
            One ``Document`` per entry of ``self.pages``, in the same order.
            ``page_content`` is the page's "raw_content" (empty string when
            the key is missing or its value is None); ``metadata`` carries
            "title" and "source" (taken from the page's "url").
        """
        docs: List[Document] = []
        for page in self.pages:
            content = page.get("raw_content")
            docs.append(
                Document(
                    # ``.get(key, "")`` only covers a missing key. A key that
                    # is present with a None value must also be mapped to "",
                    # because ``page_content`` is expected to be a string.
                    page_content="" if content is None else content,
                    metadata={
                        "title": page.get("title", ""),
                        "source": page.get("url", ""),
                    },
                )
            )
        return docs
class SectionRetriever(BaseRetriever):
    """Retriever that serves already-written sections as documents.

    Intended for retrieving sections while avoiding redundant subtopics. This
    class does no filtering of its own: every entry of ``sections`` is
    converted to a ``Document`` and returned, in order, whatever the query is.
    """

    # Sections to expose. Example:
    # [
    #     {
    #         "section_title": "Example Title",
    #         "written_content": "Example content"
    #     },
    #     ...
    # ]
    sections: List[Dict] = []

    def _get_relevant_documents(
        self, query: str, *, run_manager: CallbackManagerForRetrieverRun
    ) -> List[Document]:
        """Convert every stored section into a ``Document``.

        Args:
            query: Not used by this implementation.
            run_manager: Not used by this implementation.

        Returns:
            One ``Document`` per entry of ``self.sections``, in the same
            order. ``page_content`` is the section's "written_content" (empty
            string when the key is missing or its value is None);
            ``metadata`` carries "section_title".
        """
        docs: List[Document] = []
        for section in self.sections:
            content = section.get("written_content")
            docs.append(
                Document(
                    # ``.get(key, "")`` only covers a missing key. A key that
                    # is present with a None value must also be mapped to "",
                    # because ``page_content`` is expected to be a string.
                    page_content="" if content is None else content,
                    metadata={
                        "section_title": section.get("section_title", ""),
                    },
                )
            )
        return docs