Spaces:
Build error
Build error
File size: 10,627 Bytes
0821095 |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 |
"""
Utilities for file management.
"""
import json
import os
import datetime
import shutil
import time
import random
import tempfile
import logging
from filelock import FileLock
# Module-level logger used by the helpers in this file. It is registered under
# the fixed name "leaderboard-parser" rather than under the module's __name__.
logger = logging.getLogger("leaderboard-parser")
def save_results(results, file_path):
    """
    Serialize results as JSON and write them to a file.

    The target file is opened in write mode, so any previous content is
    replaced. Output is indented with two spaces.

    Args:
        results: The JSON-serializable object to persist
        file_path: Destination path of the JSON file
    """
    with open(file_path, "w") as output_file:
        json.dump(obj=results, fp=output_file, indent=2)
def create_category_slug(category_name):
    """
    Build the slug for a category name.

    The name is lowercased, then every space and every underscore is turned
    into a hyphen, so the resulting slug never contains an underscore.

    Args:
        category_name: The category name

    Returns:
        The category slug, or an empty string when the name is empty or None
    """
    if not category_name:
        return ""

    slug = category_name.lower()
    # Each separator is mapped to a hyphen one-for-one (runs are not collapsed)
    for separator in (" ", "_"):
        slug = slug.replace(separator, "-")
    return slug
def create_combined_id(category, uid):
    """
    Build the combined identifier for a leaderboard.

    The category is first passed through create_category_slug, then joined
    to the UID with a single underscore.

    Args:
        category: The category name
        uid: The UID of the leaderboard

    Returns:
        The combined identifier in the format category_slug_uid
    """
    return f"{create_category_slug(category)}_{uid}"
def validate_leaderboard_result(result):
    """
    Check a leaderboard result for identifier consistency and repair it in place.

    The following rules are enforced:
        1. 'original_uid' must be present
        2. 'category' must be present; it is replaced by its slug when it is
           not already normalized
        3. 'uid' must equal the combination of the normalized category and
           'original_uid'; it is added when missing and overwritten when wrong

    Args:
        result: The leaderboard result to validate (dict); it is modified in place

    Returns:
        The same dict after correction, or None if it is not a dict or a
        required field is missing
    """
    if not isinstance(result, dict):
        logger.error("Validation error: the result is not a dictionary")
        return None

    # Without these two fields the combined uid cannot be rebuilt
    for required_field in ("original_uid", "category"):
        if required_field not in result:
            logger.error(f"Validation error: {required_field} missing from result")
            return None

    raw_category = result["category"]
    slug = create_category_slug(raw_category)
    if slug != raw_category:
        logger.warning(f"Category not normalized: '{raw_category}' -> '{slug}'")
        result["category"] = slug

    # The uid is always derived from the normalized category and original_uid
    expected_uid = create_combined_id(slug, result["original_uid"])

    if "uid" not in result:
        logger.warning(f"uid missing, adding calculated uid: {expected_uid}")
    elif result["uid"] != expected_uid:
        logger.warning(f"uid inconsistent: '{result['uid']}' does not match '{expected_uid}', correction applied")
    else:
        # Already consistent: nothing to rewrite
        return result

    result["uid"] = expected_uid
    return result
def load_and_validate_results(file_path):
    """
    Read the results file and return its entries as a sorted list.

    No per-entry validation is performed. Two on-disk layouts are accepted:
    a plain JSON array, or an object with a "leaderboards" mapping of
    uid -> entry, which is flattened into a list (each entry is copied and
    receives its key as "uid").

    Args:
        file_path: Path to the results file

    Returns:
        List of results sorted by (category, original_uid), or an empty list
        when the file is missing, is not valid JSON, has an unexpected
        top-level type, or any other error occurs
    """

    def _sort_key(entry):
        return (entry.get("category", ""), entry.get("original_uid", ""))

    try:
        # A missing or malformed file is an expected situation: warn only
        try:
            with open(file_path, "r", encoding="utf-8") as handle:
                payload = json.load(handle)
        except (FileNotFoundError, json.JSONDecodeError) as load_error:
            logger.warning(f"Unable to load file {file_path}: {load_error!s}")
            return []

        # Flatten the {"leaderboards": {uid: entry}} layout into a list
        if isinstance(payload, dict) and "leaderboards" in payload:
            flattened = []
            for uid, item in payload["leaderboards"].items():
                entry = item.copy()
                entry["uid"] = uid
                flattened.append(entry)
            payload = flattened

        if not isinstance(payload, list):
            logger.warning(f"Invalid data format in {file_path}, initializing empty list")
            return []

        payload.sort(key=_sort_key)
        logger.info(f"Load successful: {len(payload)} results")
        return payload
    except Exception as error:
        # Anything unexpected (bad entry type, unsortable keys, I/O error, ...)
        logger.error(f"Error loading results: {error!s}")
        return []
def update_leaderboard_result(leaderboard_result, file_path, max_wait_seconds=30):
    """
    Insert or update one leaderboard result in the results file.

    If an entry with the same uid already exists, its fields are updated
    with those of leaderboard_result; otherwise the result is appended.
    The whole file is then rewritten, sorted by (category, original_uid).

    The read-modify-write cycle runs under a file lock ("<file_path>.lock"),
    and the new content is written to a temporary file in the same directory
    before replacing the target, so readers never see a half-written file.

    Args:
        leaderboard_result: The leaderboard result to store (must contain a uid)
        file_path: Path to the results file; a bare filename refers to the
            current working directory
        max_wait_seconds: Maximum wait time for the file lock (in seconds)

    Returns:
        The updated results list, or None if the input is invalid or an
        error occurs while locking, loading or writing
    """
    if not leaderboard_result or "uid" not in leaderboard_result:
        logger.error("Unable to update: invalid or missing leaderboard result or uid")
        return None

    # os.path.dirname() returns "" for a bare filename and os.makedirs("")
    # raises FileNotFoundError, so fall back to the current directory.
    parent_dir = os.path.dirname(file_path) or "."
    os.makedirs(parent_dir, exist_ok=True)

    # Use a lock to avoid concurrent writes
    lock_path = f"{file_path}.lock"
    lock = FileLock(lock_path, timeout=max_wait_seconds)

    try:
        with lock:
            # NOTE(review): load_and_validate_results returns [] when the
            # existing file cannot be read, in which case the file is rewritten
            # with only this entry. Confirm this is acceptable.
            current_results = load_and_validate_results(file_path)

            # Index by uid for easy update (entries without a uid are dropped)
            results_by_uid = {r["uid"]: r for r in current_results if "uid" in r}

            uid = leaderboard_result["uid"]
            if uid in results_by_uid:
                results_by_uid[uid].update(leaderboard_result)
                logger.info(f"Result updated for uid: {uid}")
            else:
                results_by_uid[uid] = leaderboard_result
                logger.info(f"New result added for uid: {uid}")

            updated_results = sorted(
                results_by_uid.values(),
                key=lambda x: (x.get("category", ""), x.get("original_uid", "")),
            )

            # Write to a temporary file in the same directory, then swap it in.
            # NOTE(review): mkstemp creates the file with owner-only
            # permissions, which the results file inherits after the swap.
            fd, temp_path = tempfile.mkstemp(dir=parent_dir)
            try:
                with os.fdopen(fd, "w", encoding="utf-8") as f:
                    json.dump(updated_results, f, indent=2, ensure_ascii=False)

                # os.replace overwrites the destination in a single step on
                # every platform, unlike shutil.move which may copy + delete.
                os.replace(temp_path, file_path)
                logger.info(f"File updated successfully: {file_path}")
                return updated_results
            except Exception:
                # Do not leave the temporary file behind on failure
                if os.path.exists(temp_path):
                    os.unlink(temp_path)
                raise
    except Exception as e:
        logger.error(f"Error updating file {file_path}: {str(e)}")
        return None
def split_combined_id(combined_id):
    """
    Break a combined identifier (category_uid) back into its two parts.

    Only the first underscore acts as the separator; any further underscores
    stay in the uid part.

    Args:
        combined_id: The combined identifier (category_uid)

    Returns:
        A tuple (category, uid); (None, combined_id) when there is no
        underscore; (None, None) when combined_id is empty or None
    """
    if not combined_id:
        return None, None

    category, separator, uid = combined_id.partition("_")
    if not separator:
        # No underscore at all: treat the whole value as a uid without category
        return None, combined_id
    return category, uid
def format_datetime(dt_str):
    """
    Format a datetime (or datetime string) as "DD/MM/YYYY à HH:MM:SS".

    Strings are parsed with datetime.fromisoformat first, then with a list
    of common ISO-like strptime patterns (with or without fractional
    seconds and UTC offset, plus date-only).

    Args:
        dt_str: The datetime string to format, or a datetime object

    Returns:
        The formatted datetime string, or the input unchanged when it cannot
        be parsed or formatted
    """
    try:
        if isinstance(dt_str, datetime.datetime):
            # Already a datetime object: nothing to parse
            dt = dt_str
        else:
            try:
                dt = datetime.datetime.fromisoformat(dt_str)
            except ValueError:
                # fromisoformat rejected the value: try explicit patterns
                fallback_formats = (
                    "%Y-%m-%dT%H:%M:%S.%f%z",
                    "%Y-%m-%dT%H:%M:%S.%f",
                    "%Y-%m-%dT%H:%M:%S%z",
                    "%Y-%m-%dT%H:%M:%S",
                    "%Y-%m-%d %H:%M:%S",
                    "%Y-%m-%d",
                )
                for fmt in fallback_formats:
                    try:
                        dt = datetime.datetime.strptime(dt_str, fmt)
                        break
                    except ValueError:
                        continue
                else:
                    # No pattern matched: hand the input back untouched
                    return dt_str

        return dt.strftime("%d/%m/%Y à %H:%M:%S")
    except (ValueError, TypeError) as e:
        # Report through the module logger, like the rest of this file,
        # instead of printing to stdout.
        logger.warning(f"Error formatting date {dt_str}: {e}")
        return dt_str
def clean_output_files(results_file):
    """
    Reset the results file to an empty JSON list, keeping a backup copy.

    When the results file already exists it is first copied to
    "<results_file>.backup" (metadata preserved via shutil.copy2). The
    results file is then (re)created containing an empty list.

    NOTE(review): the backup path is fixed, so each call overwrites the
    previous backup. Calling this twice in a row leaves a backup of the
    already-emptied file.

    Args:
        results_file: The results file to clean
    """
    # If results file exists, make a backup
    if os.path.exists(results_file):
        backup_file = f"{results_file}.backup"
        shutil.copy2(results_file, backup_file)
        print(f"Backup of {results_file} created in {backup_file}")

    # Create an empty results file (UTF-8, like the other writers in this file)
    with open(results_file, "w", encoding="utf-8") as f:
        json.dump([], f, indent=2)
    print(f"File {results_file} cleaned")