# textmeme_search / scripts /build_bm25_index.py
# Futyn-Maker
# Deploy the app
# 7e1f5f6
import os
import sys
from pathlib import Path
from typing import Dict, Any, List
from loguru import logger
from omegaconf import OmegaConf
from sqlalchemy import create_engine
from sqlalchemy.orm import sessionmaker
def get_meme_corpus(db, crud) -> List[str]:
    """
    Collect the text of every meme returned by the CRUD layer.

    Args:
        db: Database session, passed straight through to ``crud``.
        crud: Object exposing ``get_all_memes(db)``; every item it returns
            must have a ``text`` attribute.

    Returns:
        List[str]: The ``text`` of each meme, in the order
            ``crud.get_all_memes`` returned them.
    """
    corpus = [record.text for record in crud.get_all_memes(db)]
    logger.info(f"Retrieved {len(corpus)} memes from the database")
    return corpus
def build_bm25_index(corpus: List[str],
                     config: Dict[str, Any],
                     mystem_tokenizer,
                     BM25Indexer):
    """
    Create the BM25 index for ``corpus`` in the configured folder.

    Args:
        corpus (List[str]): Meme texts to index.
        config (Dict[str, Any]): Configuration mapping; the target folder is
            read from ``config['index_folders']['bm25']``.
        mystem_tokenizer: Object whose bound ``tokenize`` method is handed to
            the indexer.
        BM25Indexer: Class (or factory) called as
            ``BM25Indexer(corpus, tokenize)``; the result must provide
            ``create_index(folder)``.
    """
    tokenize = mystem_tokenizer.tokenize
    bm25 = BM25Indexer(corpus, tokenize)

    bm25_index_folder = config['index_folders']['bm25']
    # Make sure the target folder (and any missing parents) exists before
    # the indexer is pointed at it.
    os.makedirs(bm25_index_folder, exist_ok=True)
    bm25.create_index(bm25_index_folder)

    logger.info(f"BM25S index created and saved in {bm25_index_folder}")
def main():
    """
    Build the BM25 index from the memes stored in the database.

    Loads ``config.yaml`` and adds the ``logs/build_bm25s_index.log`` sink
    (both paths are relative to the current working directory), opens a
    session on ``config['database']['url']``, fetches the meme corpus and
    passes it to ``build_bm25_index``. The session is closed even when one
    of these steps raises; the completion message is logged only on success.
    """
    # Project modules are imported at call time; presumably because the
    # script entry point extends sys.path right before calling main().
    from src.db import crud
    from src.preprocessing.mystem_tokenizer import MystemTokenizer
    from src.indexing.bm25_indexer import BM25Indexer

    logger.add("logs/build_bm25s_index.log", rotation="10 MB")

    # Load configuration and convert it to plain containers
    config = OmegaConf.to_container(OmegaConf.load('config.yaml'))

    session_factory = sessionmaker(
        autocommit=False,
        autoflush=False,
        bind=create_engine(config['database']['url']),
    )
    db = session_factory()
    try:
        # Arguments are evaluated left to right: the corpus is fetched
        # before the tokenizer is instantiated.
        build_bm25_index(
            get_meme_corpus(db, crud),
            config,
            MystemTokenizer(),
            BM25Indexer,
        )
    finally:
        db.close()

    logger.info("BM25S index building completed")
if __name__ == "__main__":
    # Put the parent of this script's directory at the front of sys.path so
    # the `src.*` imports inside main() resolve when run as a script.
    script_dir = Path(__file__).resolve().parent
    sys.path.insert(0, str(script_dir.parent))
    main()