File size: 2,282 Bytes
7e1f5f6
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
import os
import sys
from pathlib import Path
from typing import Dict, Any, List

from loguru import logger
from omegaconf import OmegaConf
from sqlalchemy import create_engine
from sqlalchemy.orm import sessionmaker


def get_meme_corpus(db, crud) -> List[str]:
    """
    Collect the text of every meme returned by the CRUD layer.

    Args:
        db: Database session, passed through unchanged to
            ``crud.get_all_memes``.
        crud: Object exposing ``get_all_memes(db)``, which yields records
            that each carry a ``text`` attribute.

    Returns:
        List[str]: The ``text`` of each record, in the order the records
        were returned.
    """
    corpus = []
    # Order is preserved as returned by the CRUD layer.
    for record in crud.get_all_memes(db):
        corpus.append(record.text)

    logger.info(f"Retrieved {len(corpus)} memes from the database")
    return corpus


def build_bm25_index(corpus: List[str],
                     config: Dict[str, Any],
                     mystem_tokenizer,
                     BM25Indexer):
    """
    Build the BM25 index for the corpus and save it to disk.

    The output folder is read from the configuration before the indexer is
    constructed, so a configuration without the expected keys fails
    immediately instead of after the indexer has already been built.

    Args:
        corpus (List[str]): List of meme texts.
        config (Dict[str, Any]): Configuration mapping. The output folder is
            read from ``config['index_folders']['bm25']``.
        mystem_tokenizer: Object whose ``tokenize`` method is handed to the
            indexer as its tokenizer callable.
        BM25Indexer: Indexer class, called as
            ``BM25Indexer(corpus, tokenize)``; the resulting object must
            provide ``create_index(folder)``.

    Raises:
        KeyError: If ``config`` has no ``index_folders`` / ``bm25`` entry.
    """
    # Resolve the destination first: a bad config should fail before any
    # indexing work (which may be costly) is started.
    bm25_index_folder = config['index_folders']['bm25']

    indexer = BM25Indexer(corpus, mystem_tokenizer.tokenize)

    # The target folder (including missing parents) must exist before the
    # indexer is asked to write into it.
    os.makedirs(bm25_index_folder, exist_ok=True)
    indexer.create_index(bm25_index_folder)
    logger.info(f"BM25S index created and saved in {bm25_index_folder}")


def main():
    """
    Run the BM25S index build from start to finish.

    Steps, in order: register the log file sink, load ``config.yaml`` into
    a plain container, open a database session on the configured URL,
    read the meme corpus, build and save the index, and finally close the
    session whether or not the build succeeded.
    """
    # Project modules are imported at call time rather than at module
    # import time; the script entry point extends sys.path before calling
    # main().
    from src.db import crud
    from src.preprocessing.mystem_tokenizer import MystemTokenizer
    from src.indexing.bm25_indexer import BM25Indexer

    logger.add("logs/build_bm25s_index.log", rotation="10 MB")

    # Load the YAML configuration and convert it to a plain container
    config = OmegaConf.to_container(OmegaConf.load('config.yaml'))

    session_factory = sessionmaker(
        autocommit=False,
        autoflush=False,
        bind=create_engine(config['database']['url']),
    )
    session = session_factory()

    try:
        texts = get_meme_corpus(session, crud)
        build_bm25_index(texts, config, MystemTokenizer(), BM25Indexer)
    finally:
        # Always release the session, even if the build raised.
        session.close()

    logger.info("BM25S index building completed")


if __name__ == "__main__":
    # Put the project root (the parent of the directory containing this
    # script) at the front of sys.path so the `src.*` imports inside
    # main() can be resolved.
    project_root = Path(__file__).resolve().parent.parent
    sys.path.insert(0, str(project_root))
    main()