File size: 2,448 Bytes
7e1f5f6
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
import json
import os
import sys
from pathlib import Path
from typing import Dict, Any

from loguru import logger
from omegaconf import OmegaConf


def _write_json_atomic(json_file_path: str, payload: Dict[str, Any]) -> None:
    """Dump ``payload`` as JSON to ``json_file_path`` via a temp file + rename."""
    tmp_path = f"{json_file_path}.tmp"
    try:
        with open(tmp_path, 'w', encoding='utf-8') as file:
            json.dump(payload, file, ensure_ascii=False, indent=2)
        # Swap the finished file in; the target is never seen half-written.
        os.replace(tmp_path, json_file_path)
    except BaseException:
        # Best-effort cleanup of the partial temp file; the original error
        # is what matters, so a failed removal is deliberately ignored.
        try:
            os.remove(tmp_path)
        except OSError:
            pass
        raise


def process_public(parser, public_id: str, config: Dict[str, Any]) -> None:
    """
    Process a single public page, updating or creating its JSON file.

    The page is fetched through ``parser.get_memes(public_id)``. If
    ``<raw_data>/<public_id>.json`` already exists, fetched posts whose
    ``id`` is not yet stored are prepended to its ``posts`` list; otherwise
    the fetched data is written as a new file. The file is replaced only
    after the new content has been fully serialized, so a failure while
    writing cannot destroy previously collected posts.

    Args:
        parser: VK meme parser instance; must provide ``get_memes(public_id)``
            returning a dict with a ``posts`` list of dicts that carry ``id``.
        public_id (str): ID or short name of the public page.
        config (Dict[str, Any]): Configuration dictionary; the output folder
            is read from ``config['data_folders']['raw_data']``.

    Raises:
        KeyError: If the configuration, the fetched data or the stored file
            lacks one of the expected keys.
        json.JSONDecodeError: If the existing JSON file cannot be parsed.
    """
    raw_data_path = config['data_folders']['raw_data']
    json_file_path = os.path.join(raw_data_path, f"{public_id}.json")

    logger.info(f"Processing public: {public_id}")

    memes_data = parser.get_memes(public_id)

    # EAFP: open directly instead of checking for existence first, so there
    # is no window between the check and the read.
    try:
        with open(json_file_path, 'r', encoding='utf-8') as file:
            existing_data = json.load(file)
    except FileNotFoundError:
        # Create new JSON file
        _write_json_atomic(json_file_path, memes_data)
        logger.info(
            f"Created new JSON file for {public_id} with {len(memes_data['posts'])} posts")
        return

    # Update existing JSON file: only ids are needed for the membership test.
    known_ids = {post['id'] for post in existing_data['posts']}
    new_posts = [post for post in memes_data['posts']
                 if post['id'] not in known_ids]

    # Add new posts to the beginning of the list
    existing_data['posts'] = new_posts + existing_data['posts']

    _write_json_atomic(json_file_path, existing_data)

    logger.info(f"Updated {len(new_posts)} new posts for {public_id}")


def main():
    """
    Collect memes for every public page listed in ``config.yaml``.

    Adds a rotating log file sink, loads the configuration, creates all
    configured data folders and then processes each page in turn.
    """
    # Kept inside the function so it runs after the ``__main__`` guard has
    # put the project root on ``sys.path``.
    from src.parsing.vk_meme_parser import VKMemeParser

    logger.add("logs/data_collector.log", rotation="10 MB")

    # Read the YAML settings and turn them into plain Python containers
    settings = OmegaConf.to_container(OmegaConf.load('config.yaml'))
    vk_settings = settings['vk_parser']

    meme_parser = VKMemeParser(vk_settings['api_token'])

    # Make sure every configured data directory exists before writing
    for directory in settings['data_folders'].values():
        os.makedirs(directory, exist_ok=True)

    for page in vk_settings['meme_pages']:
        process_public(meme_parser, page, settings)

    logger.info("Data collection process completed")


if __name__ == "__main__":
    # Put the directory one level above this script's folder at the front of
    # sys.path so that main() can import the ``src`` package.
    script_path = Path(__file__).resolve()
    project_root = script_path.parents[1]
    sys.path.insert(0, os.fspath(project_root))
    main()