Spaces:
Sleeping
Sleeping
File size: 2,170 Bytes
7e1f5f6 |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 |
import os
import sys
from pathlib import Path
from typing import Dict, Any, List
from loguru import logger
from omegaconf import OmegaConf
from sqlalchemy import create_engine
from sqlalchemy.orm import sessionmaker
def get_meme_corpus(db, crud) -> List[str]:
    """
    Collect the text of every meme returned by the CRUD layer.

    Args:
        db: Database session, passed straight through to
            ``crud.get_all_memes``.
        crud: Object or module exposing ``get_all_memes(db)``; each item it
            returns must have a ``text`` attribute.

    Returns:
        List[str]: The ``text`` of each meme, in the order returned.
    """
    texts = [record.text for record in crud.get_all_memes(db)]
    logger.info(f"Retrieved {len(texts)} memes from the database")
    return texts
def build_semantic_index(
        corpus: List[str], config: Dict[str, Any], SemanticIndexer):
    """
    Create the semantic index for a corpus in the configured folder.

    Args:
        corpus (List[str]): Meme texts to index.
        config (Dict[str, Any]): Configuration; the keys read are
            ``semantic_search.model``, ``semantic_search.document_prefix``
            and ``index_folders.semantic``.
        SemanticIndexer: Indexer class, instantiated as
            ``SemanticIndexer(corpus, model=..., prefix=...)`` and then
            asked to ``create_index(folder)``.
    """
    search_cfg = config['semantic_search']
    semantic_indexer = SemanticIndexer(
        corpus,
        model=search_cfg['model'],
        prefix=search_cfg['document_prefix'],
    )

    # The output folder is created (if missing) before the indexer is asked
    # to create its index there.
    target_dir = config['index_folders']['semantic']
    os.makedirs(target_dir, exist_ok=True)
    semantic_indexer.create_index(target_dir)

    logger.info(f"Semantic index created and saved in {target_dir}")
def main():
    """
    Build the semantic index for all memes stored in the database.

    Loads 'config.yaml', opens a database session for
    ``config['database']['url']``, gathers the meme corpus and passes it to
    ``build_semantic_index``. The session is closed even if a step raises.
    Both 'config.yaml' and the log file path are relative, so they resolve
    against the current working directory.
    """
    # Project-local imports are deferred until call time: the script entry
    # point puts the project root on sys.path just before calling main().
    from src.db import crud
    from src.indexing.semantic_indexer import SemanticIndexer

    logger.add("logs/build_semantic_index.log", rotation="10 MB")

    # Load the YAML configuration and convert it to plain Python containers.
    settings = OmegaConf.to_container(OmegaConf.load('config.yaml'))

    db_engine = create_engine(settings['database']['url'])
    session_factory = sessionmaker(
        autocommit=False, autoflush=False, bind=db_engine)
    session = session_factory()
    try:
        build_semantic_index(
            get_meme_corpus(session, crud), settings, SemanticIndexer)
    finally:
        session.close()

    logger.info("Semantic index building completed")
if __name__ == "__main__":
    # Put the directory one level above this script's folder at the front of
    # sys.path so the `src.*` imports inside main() can be resolved.
    sys.path.insert(0, str(Path(__file__).resolve().parents[1]))
    main()
|