Spaces:
Sleeping
Sleeping
| import os | |
| import sys | |
| from pathlib import Path | |
| from typing import Dict, Any, List | |
| from loguru import logger | |
| from omegaconf import OmegaConf | |
| from sqlalchemy import create_engine | |
| from sqlalchemy.orm import sessionmaker | |
def get_meme_corpus(db, crud) -> List[str]:
    """
    Collect the text of every meme stored in the database.

    Args:
        db: Database session, passed straight through to the CRUD layer.
        crud: CRUD operations module; must expose ``get_all_memes(db)``.

    Returns:
        List[str]: The ``text`` attribute of each returned meme, in the
        order the CRUD layer produced them.
    """
    texts = [record.text for record in crud.get_all_memes(db)]
    logger.info(f"Retrieved {len(texts)} memes from the database")
    return texts
def build_semantic_index(
    corpus: List[str], config: Dict[str, Any], SemanticIndexer
):
    """
    Create the semantic index for the corpus and persist it to disk.

    Args:
        corpus (List[str]): Meme texts to index.
        config (Dict[str, Any]): Configuration mapping. The keys read are
            ``semantic_search.model``, ``semantic_search.document_prefix``
            and ``index_folders.semantic``.
        SemanticIndexer: Indexer class. It is called as
            ``SemanticIndexer(corpus, model=..., prefix=...)`` and the
            resulting instance must provide ``create_index(folder)``.
    """
    search_settings = config['semantic_search']
    indexer = SemanticIndexer(
        corpus,
        model=search_settings['model'],
        prefix=search_settings['document_prefix'],
    )

    # The output folder is looked up only after the indexer exists and is
    # created (including parents) before the indexer writes into it.
    target_folder = config['index_folders']['semantic']
    os.makedirs(target_folder, exist_ok=True)
    indexer.create_index(target_folder)
    logger.info(f"Semantic index created and saved in {target_folder}")
def main():
    """
    Run the full index build: configure logging, load ``config.yaml``,
    open a database session, read the meme corpus and build the semantic
    index from it.

    The database session is always closed, even when reading the corpus
    or building the index raises.
    """
    # Project-local imports are deferred to call time: the script entry
    # point inserts the project root into sys.path before calling main().
    from src.db import crud
    from src.indexing.semantic_indexer import SemanticIndexer

    logger.add("logs/build_semantic_index.log", rotation="10 MB")

    # Load configuration and convert it to plain Python containers
    config = OmegaConf.to_container(OmegaConf.load('config.yaml'))

    engine = create_engine(config['database']['url'])
    session_factory = sessionmaker(
        autocommit=False, autoflush=False, bind=engine
    )
    db = session_factory()
    try:
        meme_texts = get_meme_corpus(db, crud)
        build_semantic_index(meme_texts, config, SemanticIndexer)
    finally:
        db.close()

    # Reached only when the build above finished without raising.
    logger.info("Semantic index building completed")
if __name__ == "__main__":
    # Make the project root (the directory one level above this script's
    # own folder) importable, so the ``src`` imports inside main() resolve.
    script_path = Path(__file__).resolve()
    sys.path.insert(0, str(script_path.parents[1]))
    main()