Spaces:
Build error
Build error
File size: 6,539 Bytes
0821095 b0bf659 0821095 |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 |
"""
Leaderboard processing.
"""
import datetime
import os
from src.agents.parser_agent import process_leaderboard
from src.file_utils import create_category_slug, split_combined_id
def normalize_category(category_name):
    """Return the canonical slug form of *category_name*.

    Spaces and underscores become hyphens and the result is lowercased,
    exactly as produced by :func:`create_category_slug` in
    ``src/file_utils.py``, to which this function delegates.

    Args:
        category_name: The raw category name to normalize.

    Returns:
        The normalized (slugified) category name.
    """
    # Delegate so every caller shares a single slugging implementation.
    return create_category_slug(category_name)
def _extract_parsed_at(result):
    """Return the parsing timestamp for *result* as an ISO-8601 string.

    Prefers the ``parsed_at`` value supplied by ``process_leaderboard``;
    falls back to the current local time when it is absent.
    """
    if result and "parsed_at" in result:
        return result["parsed_at"]
    # Fallback to current time if not provided by process_leaderboard
    return datetime.datetime.now().isoformat()


def _has_valid_results(result):
    """Check whether *result* carries a usable parsed leaderboard.

    Valid means: ``result["results"]`` is a dict with a non-empty
    ``top_models`` list whose entries each have at least a truthy ``rank``
    and ``name``, plus a non-empty ``evaluation_criteria`` value.

    Args:
        result: The raw response dict from ``process_leaderboard`` (may be
            ``None``).

    Returns:
        ``True`` when the result can be approved, ``False`` otherwise.
    """
    if not (result and result.get("results")):
        print("Missing or empty results in agent response")
        return False
    results = result["results"]
    if not isinstance(results, dict):
        print(f"Invalid results format: {type(results).__name__}, expected dict")
        return False
    # Guard with `or []` so a present-but-None "top_models" cannot raise.
    top_models = results.get("top_models") or []
    if not top_models:
        return False
    # Each model must have at least rank and name.
    if any(not m.get("rank") or not m.get("name") for m in top_models):
        return False
    return bool(results.get("evaluation_criteria"))


def process_single_leaderboard(uid, host, model, index, all_results, additional_rules=None, category=None):
    """
    Process a single leaderboard and update the results.

    Fetches/parses the leaderboard with up to ``LEADERBOARD_MAX_RETRIES``
    attempts (env var, default 3), validates the parsed payload, then either
    replaces an existing entry with the same combined UID in *all_results*
    or appends a new one. *all_results* is mutated in place and also
    returned.

    Args:
        uid: The UID of the leaderboard to process
        host: The URL of the leaderboard
        model: The model to use
        index: The index of the leaderboard
        all_results: The list of all results
        additional_rules: Additional specific rules for this leaderboard
        category: The category of the leaderboard (for combined identifier)

    Returns:
        The updated list of results
    """
    print(f"\n\nProcessing leaderboard: {uid} - {host}")
    if additional_rules:
        print(f"Additional rules for this leaderboard: {additional_rules}")

    if category:
        normalized_category = normalize_category(category)
        print(f"Category: {category} (normalized: {normalized_category})")
    else:
        normalized_category = None

    # Get the maximum number of retries from environment variables.
    max_retries = int(os.getenv("LEADERBOARD_MAX_RETRIES", "3"))
    print(f"Maximum number of retries configured: {max_retries}")

    # BUGFIX: initialize `result` so a non-positive retry count can never
    # leave it unbound below (previously a NameError).
    result = None
    for attempt in range(1, max_retries + 1):
        if attempt > 1:
            print(f"Retry attempt {attempt}/{max_retries} for leaderboard {uid} - {host}")
        # Process the leaderboard.
        result = process_leaderboard(host, model, index, uid, additional_rules)
        # Stop on success or once the last allowed attempt has run.
        if result.get("parsing_status") == "success" or attempt >= max_retries:
            break
        if result.get("parsing_status") == "error":
            print(f"Error during attempt {attempt}: {result.get('parsing_message', 'Unknown error')}")

    parsed_at = _extract_parsed_at(result)

    # Create combined ID if category is provided.  The category is already
    # normalized (slugified); the underscore "_" is the ONLY separator
    # between the category and the UID.
    result_uid = f"{normalized_category}_{uid}" if normalized_category else uid

    # Base result object; status flips to "approved" only after validation.
    leaderboard_result = {
        "uid": result_uid,
        "original_uid": uid,
        "category": normalized_category,
        "host": host,
        "parsing_status": "rejected",  # Default to rejected
        "parsed_at": parsed_at,
    }

    if _has_valid_results(result):
        leaderboard_result["parsing_status"] = "approved"
        leaderboard_result["evaluation_criteria"] = result["results"]["evaluation_criteria"]
        # Keep only the fields downstream consumers need from each model.
        leaderboard_result["top_models"] = [
            {
                "rank": model_info.get("rank"),
                "name": model_info.get("name"),
                "url": model_info.get("url", None),
            }
            for model_info in result["results"]["top_models"]
        ]
    else:
        print(f"Leaderboard rejected: {uid} - Incomplete or invalid information")

    # Replace an existing result that has the same combined UID, if any.
    for i, existing_result in enumerate(all_results):
        if existing_result["uid"] == result_uid:
            all_results[i] = leaderboard_result
            print(f"Result updated for UID: {result_uid}")
            return all_results

    # ADDITIONAL CHECK: Make sure there's no confusion with other categories
    # for the same original_uid.
    for existing_result in all_results:
        if existing_result["original_uid"] == uid and existing_result["category"] != normalized_category:
            print(f"WARNING: A result already exists for original_uid {uid} but with a different category:")
            print(f" - Existing category: {existing_result['category']}, UID: {existing_result['uid']}")
            print(f" - New category: {normalized_category}, UID: {result_uid}")
            # We continue anyway: the same leaderboard legitimately appears
            # in different categories.

    # If we get here, this is a new result.
    all_results.append(leaderboard_result)
    print(f"New result added for UID: {result_uid}")
    return all_results