tfrere's picture
first commit
0821095
raw
history blame
10.6 kB
"""
Utilities for file management.
"""
import json
import os
import datetime
import shutil
import time
import random
import tempfile
import logging
from filelock import FileLock
# Module-level logger used by the helpers in this file to report warnings and errors.
logger = logging.getLogger("leaderboard-parser")
def save_results(results, file_path):
    """
    Save results to a JSON file, overwriting any existing content.

    The data is written with a 2-space indent. Non-ASCII characters are
    escaped by json.dump (default ensure_ascii), so the output is plain ASCII.

    Args:
        results: The results to save (must be JSON-serializable)
        file_path: The path to the file

    Raises:
        OSError: If the file cannot be opened for writing
        TypeError: If results contains a value json cannot serialize
    """
    # Pin the encoding instead of relying on the platform default, so the file
    # is always readable by load_and_validate_results, which opens it as UTF-8.
    with open(file_path, "w", encoding="utf-8") as f:
        json.dump(results, f, indent=2)
def create_category_slug(category_name):
    """
    Build the slug for a category name.

    The name is lowercased and every space or underscore is turned into a
    hyphen, so the resulting slug never contains an underscore.

    Args:
        category_name: The category name

    Returns:
        The category slug, or "" when the name is empty or None
    """
    if category_name:
        # Both separators collapse to the hyphen in a single pass
        to_hyphen = str.maketrans(" _", "--")
        return category_name.lower().translate(to_hyphen)
    return ""
def create_combined_id(category, uid):
    """
    Build the combined identifier for a leaderboard.

    The category is first turned into its slug with create_category_slug,
    then joined to the UID with a single underscore.

    Args:
        category: The category name
        uid: The UID of the leaderboard

    Returns:
        The combined identifier in the format category_slug_uid
    """
    slug = create_category_slug(category)
    return "{}_{}".format(slug, uid)
def validate_leaderboard_result(result):
    """
    Check a leaderboard result and repair its identifiers in place.

    The following is enforced:
    1. 'original_uid' and 'category' must be present (otherwise None is returned)
    2. 'category' is replaced by its slug when it is not already normalized
    3. 'uid' is set to the combined id built from the category slug and
       'original_uid' when it is missing or different

    Every correction is logged as a warning; a failed validation is logged
    as an error.

    Args:
        result: The leaderboard result to validate (dict, modified in place)

    Returns:
        The same dict after correction, or None if validation is impossible
    """
    if not isinstance(result, dict):
        logger.error("Validation error: the result is not a dictionary")
        return None

    # Both fields are needed to rebuild the combined uid
    for required in ("original_uid", "category"):
        if required not in result:
            logger.error(f"Validation error: {required} missing from result")
            return None

    raw_category = result["category"]
    slug = create_category_slug(raw_category)
    if slug != raw_category:
        logger.warning(f"Category not normalized: '{raw_category}' -> '{slug}'")
        result["category"] = slug

    # The uid the entry is supposed to carry
    expected_uid = create_combined_id(slug, result["original_uid"])

    if "uid" not in result:
        logger.warning(f"uid missing, adding calculated uid: {expected_uid}")
    else:
        stored_uid = result["uid"]
        if stored_uid == expected_uid:
            # Nothing to fix
            return result
        logger.warning(f"uid inconsistent: '{stored_uid}' does not match '{expected_uid}', correction applied")

    result["uid"] = expected_uid
    return result
def load_and_validate_results(file_path):
    """
    Load results from a JSON file without strict validation.

    Two layouts are accepted:
    - a JSON array of result objects;
    - a JSON object with a "leaderboards" mapping of uid -> result object,
      which is flattened to a list where each copy receives its key as "uid".

    Entries that are not JSON objects are skipped with a warning instead of
    making the whole load fail: callers such as update_leaderboard_result
    rewrite the file from the returned list, so returning [] because of a
    single bad entry would wipe every valid one.

    The list is sorted by (category, original_uid). If those values cannot be
    compared directly (e.g. a null next to a string), they are compared as
    strings instead.

    Args:
        file_path: Path to the results file

    Returns:
        List of results (dicts), or empty list in case of error
    """
    try:
        # Load results from the file
        try:
            with open(file_path, "r", encoding="utf-8") as f:
                results_data = json.load(f)
        except (FileNotFoundError, json.JSONDecodeError) as e:
            logger.warning(f"Unable to load file {file_path}: {str(e)}")
            return []

        # Convert from dict with "leaderboards" to array if necessary
        if isinstance(results_data, dict) and "leaderboards" in results_data:
            leaderboards = results_data["leaderboards"]
            if not isinstance(leaderboards, dict):
                logger.warning(f"Invalid data format in {file_path}, initializing empty list")
                return []
            array_results = []
            for uid, item in leaderboards.items():
                if not isinstance(item, dict):
                    logger.warning(f"Skipping malformed entry for uid '{uid}' in {file_path}")
                    continue
                item_copy = item.copy()
                item_copy["uid"] = uid
                array_results.append(item_copy)
            results_data = array_results

        # Ensure results_data is a list
        if not isinstance(results_data, list):
            logger.warning(f"Invalid data format in {file_path}, initializing empty list")
            return []

        # Keep only dict entries; anything else cannot be sorted or updated by uid
        valid_results = [r for r in results_data if isinstance(r, dict)]
        skipped = len(results_data) - len(valid_results)
        if skipped:
            logger.warning(f"Skipped {skipped} malformed entries in {file_path}")
        results_data = valid_results

        # Sort results; fall back to string comparison when the raw values
        # are of mixed types and cannot be ordered against each other
        try:
            results_data.sort(key=lambda x: (x.get("category", ""), x.get("original_uid", "")))
        except TypeError:
            results_data.sort(
                key=lambda x: (
                    str(x.get("category") or ""),
                    str(x.get("original_uid") or ""),
                )
            )

        logger.info(f"Load successful: {len(results_data)} results")
        return results_data
    except Exception as e:
        logger.error(f"Error loading results: {str(e)}")
        return []
def update_leaderboard_result(leaderboard_result, file_path, max_wait_seconds=30):
    """
    Updates a leaderboard result in the specified file.

    If an entry with the same uid already exists, it is updated (its keys are
    merged with the new ones). Otherwise, a new entry is added. The whole file
    is rewritten, sorted by (category, original_uid), while holding a file
    lock on "<file_path>.lock".

    Args:
        leaderboard_result: The leaderboard result to update (must contain a uid)
        file_path: Path to the results file
        max_wait_seconds: Maximum wait time for file lock (in seconds)

    Returns:
        Updated results list or None in case of error
    """
    if not leaderboard_result or "uid" not in leaderboard_result:
        logger.error("Unable to update: invalid or missing leaderboard result or uid")
        return None

    # Create parent directory if necessary. A bare file name has an empty
    # dirname, and os.makedirs("") raises, so fall back to the current directory.
    target_dir = os.path.dirname(file_path) or "."
    os.makedirs(target_dir, exist_ok=True)

    # Use a lock to avoid concurrent writes
    lock_path = f"{file_path}.lock"
    lock = FileLock(lock_path, timeout=max_wait_seconds)

    try:
        with lock:
            # Load existing results
            current_results = load_and_validate_results(file_path)

            # Index by uid for easy update.
            # NOTE: existing entries without a "uid" are not carried over.
            results_by_uid = {r["uid"]: r for r in current_results if "uid" in r}

            # Update or add result
            uid = leaderboard_result["uid"]
            if uid in results_by_uid:
                # Update existing result
                results_by_uid[uid].update(leaderboard_result)
                logger.info(f"Result updated for uid: {uid}")
            else:
                # Add new result
                results_by_uid[uid] = leaderboard_result
                logger.info(f"New result added for uid: {uid}")

            # Convert to list for writing
            updated_results = list(results_by_uid.values())

            # Sort results
            updated_results.sort(key=lambda x: (x.get("category", ""), x.get("original_uid", "")))

            # Write to a temporary file in the same directory, then swap it in:
            # readers never observe a half-written results file.
            fd, temp_path = tempfile.mkstemp(dir=target_dir)
            try:
                with os.fdopen(fd, 'w', encoding='utf-8') as f:
                    json.dump(updated_results, f, indent=2, ensure_ascii=False)

                # os.replace overwrites the destination atomically on every
                # platform (the file is closed at this point).
                os.replace(temp_path, file_path)
                logger.info(f"File updated successfully: {file_path}")
                return updated_results
            except Exception:
                # Clean up in case of error, then let the outer handler log it
                if os.path.exists(temp_path):
                    os.unlink(temp_path)
                raise
    except Exception as e:
        logger.error(f"Error updating file {file_path}: {str(e)}")
        return None
def split_combined_id(combined_id):
    """
    Break a combined identifier (category_uid) back into its two parts.

    Only the first underscore "_" acts as the separator; any later
    underscore stays in the uid part.

    Args:
        combined_id: The combined identifier (category_uid)

    Returns:
        A tuple (category, uid); (None, combined_id) if there is no
        underscore; (None, None) if the identifier is empty or None
    """
    if not combined_id:
        return None, None

    category, separator, uid = combined_id.partition("_")
    if separator:
        return category, uid

    # No underscore: the whole value is a uid without category
    return None, combined_id
def format_datetime(dt_str):
    """
    Render a date/time value in the human readable form "DD/MM/YYYY à HH:MM:SS".

    A datetime object is formatted directly. A string is parsed with
    datetime.fromisoformat first, then with a list of explicit patterns
    (with or without fractional seconds and timezone, date only, ...).

    Args:
        dt_str: The datetime string (or datetime object) to format

    Returns:
        A formatted datetime string, or the input unchanged when it cannot
        be parsed or formatted
    """
    # Patterns tried, in order, when fromisoformat rejects the string
    fallback_patterns = (
        "%Y-%m-%dT%H:%M:%S.%f%z",
        "%Y-%m-%dT%H:%M:%S.%f",
        "%Y-%m-%dT%H:%M:%S%z",
        "%Y-%m-%dT%H:%M:%S",
        "%Y-%m-%d %H:%M:%S",
        "%Y-%m-%d",
    )

    try:
        if isinstance(dt_str, datetime.datetime):
            # Already a datetime object, nothing to parse
            parsed = dt_str
        else:
            try:
                parsed = datetime.datetime.fromisoformat(dt_str)
            except ValueError:
                parsed = None
                for pattern in fallback_patterns:
                    try:
                        parsed = datetime.datetime.strptime(dt_str, pattern)
                    except ValueError:
                        continue
                    break
                if parsed is None:
                    # No pattern matched: hand the input back untouched
                    return dt_str

        return parsed.strftime("%d/%m/%Y à %H:%M:%S")
    except (ValueError, TypeError) as e:
        print(f"Error formatting date {dt_str}: {e}")
        return dt_str
def clean_output_files(results_file):
    """
    Reset the results file to an empty JSON list, keeping a backup.

    When the file already exists it is first copied (with its metadata) to
    "<results_file>.backup"; the results file is then overwritten with "[]".
    Progress is reported on standard output.

    Args:
        results_file: The results file to clean
    """
    already_there = os.path.exists(results_file)
    if already_there:
        # Keep the previous content next to the original
        backup_file = "{}.backup".format(results_file)
        shutil.copy2(results_file, backup_file)
        print(f"Backup of {results_file} created in {backup_file}")

    # Start over with an empty list of results
    empty_payload = json.dumps([], indent=2)
    with open(results_file, "w") as out:
        out.write(empty_payload)

    print(f"File {results_file} cleaned")