import json import os import sys from pathlib import Path from typing import Dict, Any, List from loguru import logger from omegaconf import OmegaConf from sqlalchemy import create_engine from sqlalchemy.orm import sessionmaker def process_json_files( raw_data_path: str) -> tuple[List[Dict[str, str]], List[Dict[str, Any]]]: """ Process all JSON files in the raw data folder. Args: raw_data_path (str): Path to the folder containing JSON files. Returns: tuple: Lists of public and meme data to be added to the database. """ publics_to_add: List[Dict[str, str]] = [] memes_to_add: List[Dict[str, Any]] = [] for filename in os.listdir(raw_data_path): if filename.endswith('.json'): public_vk = filename[:-5] # Remove .json extension file_path = os.path.join(raw_data_path, filename) with open(file_path, 'r', encoding='utf-8') as file: data = json.load(file) publics_to_add.append({ "public_vk": public_vk, "public_name": data['name'] }) for post in data['posts']: memes_to_add.append({ "public_vk": public_vk, "text": post['text'], "image_url": post['image_url'] }) logger.info( f"Processed file: {filename}, found {len(data['posts'])} memes") return publics_to_add, memes_to_add def main(): from src.db.models import Base from src.db import crud logger.add("logs/make_db.log", rotation="10 MB") # Load configuration config = OmegaConf.load('config.yaml') config = OmegaConf.to_container(config) engine = create_engine(config['database']['url']) # Drop all existing tables and create new ones Base.metadata.drop_all(bind=engine) Base.metadata.create_all(bind=engine) SessionLocal = sessionmaker(autocommit=False, autoflush=False, bind=engine) db = SessionLocal() raw_data_path = config['data_folders']['raw_data'] publics_to_add, memes_to_add = process_json_files(raw_data_path) # Add all publics to the database added_publics = crud.add_publics(db, publics_to_add) # Create a mapping of public_vk to public_id public_vk_to_id = {public.public_vk: public.id for public in added_publics} # Update memes with correct public_id for meme in memes_to_add: meme['public_id'] = public_vk_to_id[meme.pop('public_vk')] # Add all memes to the database crud.add_memes(db, memes_to_add) logger.info( f"Added {len(added_publics)} publics and {len(memes_to_add)} memes to the database") db.close() logger.info("Database population completed") if __name__ == "__main__": # Set up project root path project_root = Path(__file__).resolve().parents[1] sys.path.insert(0, str(project_root)) main()