Spaces:
Sleeping
Sleeping
File size: 2,948 Bytes
7e1f5f6 |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 |
import json
import os
import sys
from pathlib import Path
from typing import Dict, Any, List
from loguru import logger
from omegaconf import OmegaConf
from sqlalchemy import create_engine
from sqlalchemy.orm import sessionmaker
def process_json_files(
        raw_data_path: str) -> tuple[List[Dict[str, str]], List[Dict[str, Any]]]:
    """
    Process all JSON files in the raw data folder.

    Only regular files located directly inside ``raw_data_path`` whose name
    ends with ``.json`` are read. Files are visited in sorted name order so
    that the returned lists are reproducible across runs and platforms.

    Args:
        raw_data_path (str): Path to the folder containing JSON files.

    Returns:
        tuple: Lists of public and meme data to be added to the database.
            The first list holds one ``{"public_vk", "public_name"}`` dict
            per file (``public_vk`` is the file name without ``.json``);
            the second holds one ``{"public_vk", "text", "image_url"}`` dict
            per entry of the file's ``"posts"`` list.

    Raises:
        OSError: If ``raw_data_path`` cannot be listed or a file cannot be
            opened.
        json.JSONDecodeError: If a ``.json`` file does not contain valid JSON.
        KeyError: If a file lacks ``"name"``/``"posts"`` or a post lacks
            ``"text"``/``"image_url"``.
    """
    publics_to_add: List[Dict[str, str]] = []
    memes_to_add: List[Dict[str, Any]] = []

    # os.listdir() yields entries in arbitrary order; sort for determinism.
    for filename in sorted(os.listdir(raw_data_path)):
        if not filename.endswith('.json'):
            continue

        file_path = os.path.join(raw_data_path, filename)
        if not os.path.isfile(file_path):
            # A directory that merely happens to be named "*.json".
            continue

        public_vk = filename.removesuffix('.json')
        with open(file_path, 'r', encoding='utf-8') as file:
            data = json.load(file)

        publics_to_add.append({
            "public_vk": public_vk,
            "public_name": data['name'],
        })

        posts = data['posts']
        memes_to_add.extend(
            {
                "public_vk": public_vk,
                "text": post['text'],
                "image_url": post['image_url'],
            }
            for post in posts
        )

        logger.info(
            f"Processed file: {filename}, found {len(posts)} memes")

    return publics_to_add, memes_to_add
def main():
    """
    Rebuild the database from the raw JSON files.

    Reads ``config.yaml``, drops and recreates every table registered on
    ``Base.metadata``, then inserts the publics and memes returned by
    ``process_json_files``. The session is always closed, even when parsing
    or inserting fails.

    Raises:
        KeyError: If the configuration lacks ``database.url`` or
            ``data_folders.raw_data``.
        Exception: Anything raised while reading the raw files or by the
            ``crud`` helpers propagates after the session has been closed.
    """
    # Imported lazily: presumably `src` only becomes importable once the
    # __main__ guard has put the project root on sys.path -- confirm before
    # hoisting these to module level.
    from src.db.models import Base
    from src.db import crud

    logger.add("logs/make_db.log", rotation="10 MB")

    # Load configuration
    config = OmegaConf.load('config.yaml')
    config = OmegaConf.to_container(config)

    engine = create_engine(config['database']['url'])

    # Drop all existing tables and create new ones
    Base.metadata.drop_all(bind=engine)
    Base.metadata.create_all(bind=engine)

    SessionLocal = sessionmaker(autocommit=False, autoflush=False, bind=engine)
    db = SessionLocal()
    try:
        raw_data_path = config['data_folders']['raw_data']
        publics_to_add, memes_to_add = process_json_files(raw_data_path)

        # Add all publics to the database
        added_publics = crud.add_publics(db, publics_to_add)

        # Create a mapping of public_vk to public_id
        public_vk_to_id = {
            public.public_vk: public.id for public in added_publics}

        # Memes carry the public's string key; swap it for the database id.
        for meme in memes_to_add:
            meme['public_id'] = public_vk_to_id[meme.pop('public_vk')]

        # Add all memes to the database
        crud.add_memes(db, memes_to_add)

        logger.info(
            f"Added {len(added_publics)} publics and {len(memes_to_add)} memes to the database")
    finally:
        # Release the connection even if loading or inserting raised.
        db.close()

    logger.info("Database population completed")
if __name__ == "__main__":
    # Put the directory one level above this file's folder at the front of
    # sys.path before running the script.
    sys.path.insert(0, str(Path(__file__).resolve().parents[1]))
    main()
|