File size: 2,170 Bytes
7e1f5f6
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
import os
import sys
from pathlib import Path
from typing import Dict, Any, List

from loguru import logger
from omegaconf import OmegaConf
from sqlalchemy import create_engine
from sqlalchemy.orm import sessionmaker


def get_meme_corpus(db, crud) -> List[str]:
    """
    Collect the text of every meme stored in the database.

    Args:
        db: Database session passed through to the CRUD layer.
        crud: Object or module exposing ``get_all_memes(db)``.

    Returns:
        List[str]: The ``text`` attribute of each meme, in the order
        ``get_all_memes`` returns them.
    """
    texts = [entry.text for entry in crud.get_all_memes(db)]
    logger.info(f"Retrieved {len(texts)} memes from the database")
    return texts


def build_semantic_index(
        corpus: List[str], config: Dict[str, Any], SemanticIndexer):
    """
    Create a semantic index over the corpus and write it to disk.

    Args:
        corpus (List[str]): Texts to index.
        config (Dict[str, Any]): Configuration dictionary. Reads
            ``semantic_search.model``, ``semantic_search.document_prefix``
            and ``index_folders.semantic``.
        SemanticIndexer: Callable invoked as
            ``SemanticIndexer(corpus, model=..., prefix=...)``; the object
            it returns must provide ``create_index(folder)``.
    """
    search_settings = config['semantic_search']
    indexer = SemanticIndexer(
        corpus,
        model=search_settings['model'],
        prefix=search_settings['document_prefix'],
    )

    # Make sure the destination exists before the indexer writes into it.
    output_dir = config['index_folders']['semantic']
    os.makedirs(output_dir, exist_ok=True)
    indexer.create_index(output_dir)
    logger.info(f"Semantic index created and saved in {output_dir}")


def main():
    """
    Build the semantic index from the memes stored in the database.

    Loads ``config.yaml`` (resolved relative to the current working
    directory), opens a SQLAlchemy session on ``database.url``, reads the
    meme corpus and passes it to ``build_semantic_index``. The session is
    closed and the engine's connection pool is disposed even when corpus
    retrieval or indexing raises.
    """
    # Imported inside the function: the ``__main__`` guard adds the project
    # root to ``sys.path`` just before calling ``main()``, so the ``src``
    # package is only importable from this point on when run as a script.
    from src.db import crud
    from src.indexing.semantic_indexer import SemanticIndexer

    logger.add("logs/build_semantic_index.log", rotation="10 MB")

    # Load configuration and convert it to plain Python containers.
    config = OmegaConf.load('config.yaml')
    config = OmegaConf.to_container(config)

    engine = create_engine(config['database']['url'])
    try:
        SessionLocal = sessionmaker(
            autocommit=False, autoflush=False, bind=engine)
        db = SessionLocal()
        try:
            corpus = get_meme_corpus(db, crud)
            build_semantic_index(corpus, config, SemanticIndexer)
        finally:
            db.close()
    finally:
        # Closing the session only returns its connection to the pool;
        # dispose the engine so pooled connections are actually released.
        engine.dispose()

    logger.info("Semantic index building completed")


if __name__ == "__main__":
    # Put the project root (the directory one level above this script's
    # folder) at the front of sys.path so the imports inside main()
    # can be resolved when the file is executed directly.
    script_path = Path(__file__).resolve()
    project_root = script_path.parents[1]
    sys.path.insert(0, str(project_root))
    main()