Spaces:
Sleeping
Sleeping
File size: 2,448 Bytes
7e1f5f6 |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 |
import json
import os
import sys
from pathlib import Path
from typing import Dict, Any
from loguru import logger
from omegaconf import OmegaConf
def _write_json_atomic(target_path: str, payload: Dict[str, Any]) -> None:
    """Write payload as JSON to target_path without exposing a partial file."""
    tmp_path = f"{target_path}.tmp"
    try:
        with open(tmp_path, 'w', encoding='utf-8') as file:
            json.dump(payload, file, ensure_ascii=False, indent=2)
        # Swap the finished file in; the previous contents stay intact until
        # this point, so a failed dump cannot truncate already-collected data.
        os.replace(tmp_path, target_path)
    finally:
        # A successful os.replace has already consumed the temp file; anything
        # still present here is a leftover from a failed write.
        if os.path.exists(tmp_path):
            os.remove(tmp_path)


def process_public(parser, public_id: str, config: Dict[str, Any]) -> None:
    """
    Process a single public page, updating or creating its JSON file.

    The page's posts are fetched with ``parser.get_memes(public_id)`` and
    stored in ``<raw_data>/<public_id>.json``. If that file already exists,
    only posts whose ``id`` is not yet stored are added, and they are placed
    in front of the previously stored posts. Otherwise the fetched data is
    written as a new file.

    Args:
        parser: VK meme parser instance; must provide ``get_memes(public_id)``
            returning a dict with a ``'posts'`` list of dicts that each
            carry an ``'id'`` key.
        public_id (str): ID or short name of the public page.
        config (Dict[str, Any]): Configuration dictionary; the output folder
            is read from ``config['data_folders']['raw_data']``.
    """
    raw_data_path = config['data_folders']['raw_data']
    json_file_path = os.path.join(raw_data_path, f"{public_id}.json")

    logger.info(f"Processing public: {public_id}")
    memes_data = parser.get_memes(public_id)

    if os.path.exists(json_file_path):
        # Update existing JSON file
        with open(json_file_path, 'r', encoding='utf-8') as file:
            existing_data = json.load(file)

        # Only membership is needed, so a set of ids is enough.
        known_ids = {post['id'] for post in existing_data['posts']}
        new_posts = [post for post in memes_data['posts']
                     if post['id'] not in known_ids]

        # Add new posts to the beginning of the list
        existing_data['posts'] = new_posts + existing_data['posts']
        _write_json_atomic(json_file_path, existing_data)
        logger.info(f"Updated {len(new_posts)} new posts for {public_id}")
    else:
        # Create new JSON file
        _write_json_atomic(json_file_path, memes_data)
        logger.info(
            f"Created new JSON file for {public_id} with {len(memes_data['posts'])} posts")
def main():
    """
    Run the data collection for every configured public page.

    Loads ``config.yaml``, creates each folder listed under ``data_folders``,
    builds a ``VKMemeParser`` from ``vk_parser.api_token`` and calls
    ``process_public`` for every entry of ``vk_parser.meme_pages``.
    """
    # Imported inside the function: the script entry point extends sys.path
    # before calling main(), which is what makes the `src` package resolvable.
    from src.parsing.vk_meme_parser import VKMemeParser

    logger.add("logs/data_collector.log", rotation="10 MB")

    # Read the YAML configuration and convert it to plain Python containers
    settings = OmegaConf.to_container(OmegaConf.load('config.yaml'))
    vk_settings = settings['vk_parser']

    meme_parser = VKMemeParser(vk_settings['api_token'])

    # Make sure every configured data folder exists before anything is written
    for folder_path in settings['data_folders'].values():
        os.makedirs(folder_path, exist_ok=True)

    for page_id in vk_settings['meme_pages']:
        process_public(meme_parser, page_id, settings)

    logger.info("Data collection process completed")
if __name__ == "__main__":
    # Put the project root (the parent of this file's directory) at the front
    # of sys.path so the imports performed inside main() can be resolved.
    sys.path.insert(0, str(Path(__file__).resolve().parents[1]))
    main()
|