submodule + versioning

Changed files:
- .gitignore (+6 / -0)
- .gitmodules (+3 / -0)
- app.py (+80 / -53)
- guard-bench-submodule (+1 / -0)
- src/display/css_html_js.py (+18 / -0)
- src/display/utils.py (+21 / -14)
- src/leaderboard/processor.py (+65 / -32)
- src/populate.py (+97 / -26)
- src/submission/submit.py (+38 / -20)
.gitignore CHANGED

@@ -43,3 +43,9 @@ eval-queue/
 eval-results/
 eval-queue-bk/
 eval-results-bk/
+
+# Data files
+data/
+
+# Versioned leaderboard files
+data/leaderboard_v*.json
.gitmodules ADDED

@@ -0,0 +1,3 @@
+[submodule "guard-bench-submodule"]
+    path = guard-bench-submodule
+    url = https://github.com/whitecircle-ai/guard-bench.git
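Note: a fresh checkout of this commit needs the submodule initialized (`git submodule update --init guard-bench-submodule`); until then the guard-bench-submodule directory is just an empty gitlink at the pinned commit below.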
app.py CHANGED

@@ -51,10 +51,14 @@ logger = logging.getLogger(__name__)
 # Ensure data directory exists
 os.makedirs(DATA_PATH, exist_ok=True)
 
+# Available benchmark versions
+BENCHMARK_VERSIONS = ["v0"]
+CURRENT_VERSION = "v0"
+
 # Initialize leaderboard data
 try:
     logger.info("Initializing leaderboard data...")
-    LEADERBOARD_DF = get_leaderboard_df()
+    LEADERBOARD_DF = get_leaderboard_df(version=CURRENT_VERSION)
     logger.info(f"Loaded leaderboard with {len(LEADERBOARD_DF)} entries")
 except Exception as e:
     logger.error(f"Error loading leaderboard data: {e}")

@@ -70,7 +74,7 @@ def init_leaderboard(dataframe): (whitespace-only change on a blank line)

@@ -79,7 +83,7 @@ def init_leaderboard(dataframe):
         cant_deselect=[getattr(GUARDBENCH_COLUMN, col).name for col in NEVER_HIDDEN_COLS],
         label="Select Columns to Display:",
     ),
-    search_columns=[GUARDBENCH_COLUMN.
+    search_columns=[GUARDBENCH_COLUMN.model_name.name],
     hide_columns=[getattr(GUARDBENCH_COLUMN, col).name for col in HIDDEN_COLS],
     filter_columns=[
         ColumnFilter(GUARDBENCH_COLUMN.model_type.name, type="checkboxgroup", label="Model types"),

@@ -95,23 +99,24 @@ def submit_results(
     precision: str,
     weight_type: str,
     model_type: str,
-    submission_file: tempfile._TemporaryFileWrapper
+    submission_file: tempfile._TemporaryFileWrapper,
+    version: str
 ):
     """
     Handle submission of results with model metadata.
     """
     if submission_file is None:
         return styled_error("No submission file provided")
 
     if not model_name:
         return styled_error("Model name is required")
 
     if not model_type:
         return styled_error("Please select a model type")
 
     file_path = submission_file.name
     logger.info(f"Received submission for model {model_name}: {file_path}")
 
     # Add metadata to the submission
     metadata = {
         "model_name": model_name,

@@ -119,35 +124,46 @@ def submit_results(
         "revision": revision if revision else "main",
         "precision": precision,
         "weight_type": weight_type,
-        "model_type": model_type
+        "model_type": model_type,
+        "version": version
     }
 
     # Process the submission
-    result = process_submission(file_path, metadata)
+    result = process_submission(file_path, metadata, version=version)
 
     # Refresh the leaderboard data
     global LEADERBOARD_DF
     try:
-        logger.info("Refreshing leaderboard data after submission...")
-        LEADERBOARD_DF = get_leaderboard_df()
+        logger.info(f"Refreshing leaderboard data after submission for version {version}...")
+        LEADERBOARD_DF = get_leaderboard_df(version=version)
         logger.info("Refreshed leaderboard data after submission")
     except Exception as e:
         logger.error(f"Error refreshing leaderboard data: {e}")
 
     return result
 
 
-def refresh_data():
+def refresh_data(version=CURRENT_VERSION):
     """
     Refresh the leaderboard data from HuggingFace.
     """
     global LEADERBOARD_DF
     try:
-        logger.info("Performing scheduled refresh of leaderboard data...")
-        LEADERBOARD_DF = get_leaderboard_df()
+        logger.info(f"Performing scheduled refresh of leaderboard data for version {version}...")
+        LEADERBOARD_DF = get_leaderboard_df(version=version)
         logger.info("Scheduled refresh of leaderboard data completed")
     except Exception as e:
         logger.error(f"Error in scheduled refresh: {e}")
+    return LEADERBOARD_DF
+
+
+def update_leaderboards(version):
+    """
+    Update all leaderboard components with data for the selected version.
+    """
+    new_df = get_leaderboard_df(version=version)
+    category_dfs = [get_category_leaderboard_df(category, version=version) for category in CATEGORIES]
+    return [init_leaderboard(new_df)] + [init_leaderboard(df) for df in category_dfs]
 
 
 # Create Gradio app

@@ -155,43 +171,54 @@ demo = gr.Blocks(css=custom_css)
 
 with demo:
     gr.HTML(TITLE)
-
-
+
+    with gr.Row():
+        with gr.Column(scale=3):
+            gr.Markdown(INTRODUCTION_TEXT, elem_classes="markdown-text")
+        with gr.Column(scale=1):
+            version_selector = gr.Dropdown(
+                choices=BENCHMARK_VERSIONS,
+                label="Benchmark Version",
+                value=CURRENT_VERSION,
+                interactive=True,
+                elem_classes="version-selector"
+            )
+
     with gr.Tabs(elem_classes="tab-buttons") as tabs:
         with gr.TabItem("🏅 Leaderboard", elem_id="guardbench-leaderboard-tab", id=0):
             refresh_button = gr.Button("Refresh Leaderboard")
 
             # Create tabs for each category
             with gr.Tabs(elem_classes="category-tabs") as category_tabs:
                 # First tab for average metrics across all categories
                 with gr.TabItem("📊 Overall Performance", elem_id="overall-tab"):
                     leaderboard = init_leaderboard(LEADERBOARD_DF)
 
                 # Create a tab for each category
                 for category in CATEGORIES:
                     with gr.TabItem(f"{category}", elem_id=f"category-{category.lower().replace(' ', '-')}-tab"):
-                        category_df = get_category_leaderboard_df(category)
+                        category_df = get_category_leaderboard_df(category, version=CURRENT_VERSION)
                         category_leaderboard = init_leaderboard(category_df)
 
             # Refresh button functionality
             refresh_button.click(
                 fn=lambda: [
-                    init_leaderboard(get_leaderboard_df()),
-                    *[init_leaderboard(get_category_leaderboard_df(category)) for category in CATEGORIES]
+                    init_leaderboard(get_leaderboard_df(version=version_selector.value)),
+                    *[init_leaderboard(get_category_leaderboard_df(category, version=version_selector.value)) for category in CATEGORIES]
                 ],
                 inputs=[],
                 outputs=[leaderboard] + [category_tabs.children[i].children[0] for i in range(1, len(CATEGORIES) + 1)]
             )
 
         with gr.TabItem("📝 About", elem_id="guardbench-about-tab", id=1):
             gr.Markdown(LLM_BENCHMARKS_TEXT, elem_classes="markdown-text")
 
         with gr.TabItem("🚀 Submit", elem_id="guardbench-submit-tab", id=2):
             gr.Markdown(EVALUATION_QUEUE_TEXT, elem_classes="markdown-text")
 
             with gr.Row():
                 gr.Markdown("# ✉️✨ Submit your results here!", elem_classes="markdown-text")
 
             with gr.Row():
                 with gr.Column():
                     model_name_textbox = gr.Textbox(label="Model name")

@@ -203,33 +230,33 @@ with demo:
                         value=None,
                         interactive=True,
                     )
 
                 with gr.Column():
                     precision = gr.Dropdown(
-                        choices=[i.
+                        choices=[i.name for i in Precision if i != Precision.Unknown],
                         label="Precision",
                         multiselect=False,
                         value="float16",
                         interactive=True,
                     )
                     weight_type = gr.Dropdown(
-                        choices=[i.
+                        choices=[i.name for i in WeightType],
                         label="Weights type",
                         multiselect=False,
                         value="Original",
                         interactive=True,
                     )
                     base_model_name_textbox = gr.Textbox(label="Base model (for delta or adapter weights)")
 
             with gr.Row():
                 file_input = gr.File(
                     label="Upload JSONL Results File",
                     file_types=[".jsonl"]
                 )
 
             submit_button = gr.Button("Submit Results")
             result_output = gr.Markdown()
 
             submit_button.click(
                 fn=submit_results,
                 inputs=[

@@ -239,11 +266,19 @@ with demo:
                     precision,
                     weight_type,
                     model_type,
-                    file_input
+                    file_input,
+                    version_selector
                 ],
                 outputs=result_output
             )
 
+            # Version selector functionality
+            version_selector.change(
+                fn=update_leaderboards,
+                inputs=[version_selector],
+                outputs=[leaderboard] + [category_tabs.children[i].children[0] for i in range(1, len(CATEGORIES) + 1)]
+            )
+
         with gr.Row():
             with gr.Accordion("📙 Citation", open=False):
                 citation_button = gr.Textbox(

@@ -253,29 +288,21 @@ with demo:
                     elem_id="citation-button",
                     show_copy_button=True,
                 )
 
         with gr.Accordion("ℹ️ Dataset Information", open=False):
             dataset_info = gr.Markdown(f"""
             ## Dataset Information
 
             Results are stored in the HuggingFace dataset: [{RESULTS_DATASET_ID}](https://huggingface.co/datasets/{RESULTS_DATASET_ID})
 
             Last updated: {pd.Timestamp.now().strftime("%Y-%m-%d %H:%M:%S UTC")}
             """)
 
-# Set up scheduler to refresh data periodically
 scheduler = BackgroundScheduler()
-scheduler.add_job(refresh_data, 'interval', minutes=30)
+scheduler.add_job(lambda: refresh_data(version=CURRENT_VERSION), 'interval', minutes=30)
 scheduler.start()
 
 # Launch the app
 if __name__ == "__main__":
-
-
-        logger.warning("Admin username or password not set. Running without authentication.")
-        auth = None
-    else:
-        auth = (ADMIN_USERNAME, ADMIN_PASSWORD)
-
-    # Launch the app
-    demo.launch(server_name="0.0.0.0", server_port=7860, auth=auth)
+
+    demo.launch(server_name="0.0.0.0", server_port=7860, share=True)
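Note on the refresh wiring above: the `refresh_button.click` lambda reads `version_selector.value`, and in Gradio that attribute holds the component's initial default rather than the user's current selection; live values normally arrive through `inputs`, as the new `version_selector.change` handler already does. A minimal rewiring sketch under that assumption (`refresh_all` is a hypothetical name; everything else reuses this commit's own functions):

    # Sketch: route the live dropdown value through `inputs` so a manual
    # refresh honors the version currently selected in the UI.
    def refresh_all(version: str):
        # update_leaderboards() (added in this commit) rebuilds the overall
        # leaderboard plus one leaderboard per category for that version.
        return update_leaderboards(version)

    refresh_button.click(
        fn=refresh_all,
        inputs=[version_selector],  # current selection is passed in as `version`
        outputs=[leaderboard] + [category_tabs.children[i].children[0]
                                 for i in range(1, len(CATEGORIES) + 1)],
    )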
guard-bench-submodule ADDED

@@ -0,0 +1 @@
+Subproject commit 0a9f48bcedd0ccb6b5cf59ff7ed1186e32a5dc17
src/display/css_html_js.py CHANGED

@@ -43,4 +43,22 @@ custom_css = """
     text-decoration: underline;
     color: #1976D2;
 }
+
+.version-selector {
+    margin-top: 10px;
+    padding: 5px;
+    border: 1px solid #e0e0e0;
+    border-radius: 5px;
+    background-color: #f9f9f9;
+}
+
+.version-selector label {
+    font-weight: bold;
+    color: #2196F3;
+}
+
+.version-selector select {
+    border-color: #2196F3;
+    border-radius: 4px;
+}
 """
src/display/utils.py CHANGED

@@ -36,12 +36,19 @@ class Precision(Enum):
     int8 = auto()
     int4 = auto()
 
+    def __str__(self):
+        """String representation of the precision type."""
+        return self.name
+
 
 class WeightType(Enum):
     """Model weight types."""
     Original = auto()
     Delta = auto()
     Adapter = auto()
+    def __str__(self):
+        """String representation of the weight type."""
+        return self.name
 
 
 @dataclass

@@ -58,19 +65,19 @@ class ColumnInfo:
 @dataclass
 class GuardBenchColumn:
     """Columns for the GuardBench leaderboard."""
-
+    model_name: ColumnInfo = field(default_factory=lambda: ColumnInfo(
         name="model_name",
         display_name="Model",
         never_hidden=True,
         displayed_by_default=True
     ))
 
     model_type: ColumnInfo = field(default_factory=lambda: ColumnInfo(
         name="model_type",
         display_name="Type",
         displayed_by_default=True
     ))
 
     # Metrics for all categories
     default_prompts_f1: ColumnInfo = field(default_factory=lambda: ColumnInfo(
         name="default_prompts_f1",

@@ -78,28 +85,28 @@ class GuardBenchColumn: (whitespace-only changes on the blank lines between the jailbreaked_prompts_f1, default_answers_f1, jailbreaked_answers_f1, and average_f1 field definitions)

@@ -108,21 +115,21 @@ class GuardBenchColumn: (whitespace-only changes on the blank lines between the average_recall, average_precision, and submission_date field definitions)

@@ -136,13 +143,13 @@ GUARDBENCH_COLUMN = GuardBenchColumn() (whitespace-only changes on the DISPLAY_COLS, METRIC_COLS, HIDDEN_COLS, and NEVER_HIDDEN_COLS definitions)
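The `__str__` overrides added to `Precision` and `WeightType` make enum members print as their bare names, which is what the submission dropdowns and serialized metadata expect. A standalone sketch of the effect (reimplemented here for illustration, not imported from the repo):

    from enum import Enum, auto

    class Precision(Enum):
        float16 = auto()
        int8 = auto()

        def __str__(self):
            return self.name

    print(Precision.float16)    # prints "float16" instead of "Precision.float16"
    print(str(Precision.int8))  # "int8"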
src/leaderboard/processor.py CHANGED

@@ -16,11 +16,21 @@ def load_leaderboard_data(file_path: str) -> Dict:
     Load the leaderboard data from a JSON file.
     """
     if not os.path.exists(file_path):
-
-
+        version = "v0"
+        if "_v" in file_path:
+            version = file_path.split("_")[-1].split(".")[0]
+        return {"entries": [], "last_updated": datetime.now().isoformat(), "version": version}
+
     with open(file_path, 'r') as f:
         data = json.load(f)
+
+    # Ensure version field exists
+    if "version" not in data:
+        version = "v0"
+        if "_v" in file_path:
+            version = file_path.split("_")[-1].split(".")[0]
+        data["version"] = version
+
     return data

@@ -30,10 +40,17 @@ def save_leaderboard_data(data: Dict, file_path: str) -> None:
     """
     # Ensure the directory exists
     os.makedirs(os.path.dirname(file_path), exist_ok=True)
 
     # Update the last_updated timestamp
     data["last_updated"] = datetime.now().isoformat()
 
+    # Ensure version is set
+    if "version" not in data:
+        version = "v0"
+        if "_v" in file_path:
+            version = file_path.split("_")[-1].split(".")[0]
+        data["version"] = version
+
     with open(file_path, 'w') as f:
         json.dump(data, f, indent=2)

@@ -43,26 +60,32 @@ def process_submission(submission_data: List[Dict]) -> List[Dict]:
     Process submission data and convert it to leaderboard entries.
     """
     entries = []
 
     for item in submission_data:
         # Create a new entry for the leaderboard
         entry = {
             "model_name": item.get("model_name", "Unknown Model"),
             "per_category_metrics": {},
             "avg_metrics": {},
-            "submission_date": datetime.now().isoformat()
+            "submission_date": datetime.now().isoformat(),
+            "version": item.get("version", "v0")
         }
 
+        # Copy model metadata
+        for key in ["model_type", "base_model", "revision", "precision", "weight_type"]:
+            if key in item:
+                entry[key] = item[key]
+
         # Process per-category metrics
         if "per_category_metrics" in item:
             entry["per_category_metrics"] = item["per_category_metrics"]
 
         # Process average metrics
         if "avg_metrics" in item:
             entry["avg_metrics"] = item["avg_metrics"]
 
         entries.append(entry)
 
     return entries

@@ -71,17 +94,23 @@ def leaderboard_to_dataframe(leaderboard_data: Dict) -> pd.DataFrame:
     Convert leaderboard data to a pandas DataFrame for display.
     """
     rows = []
 
     for entry in leaderboard_data.get("entries", []):
         model_name = entry.get("model_name", "Unknown Model")
 
         # Extract average metrics for main display
         row = {
             "model_name": model_name,
             "model_type": entry.get("model_type", "Unknown"),
-            "submission_date": entry.get("submission_date", "")
+            "submission_date": entry.get("submission_date", ""),
+            "version": entry.get("version", "v0")
         }
 
+        # Add additional metadata fields if present
+        for key in ["base_model", "revision", "precision", "weight_type"]:
+            if key in entry:
+                row[key] = entry[key]
+
         # Add average metrics
         avg_metrics = entry.get("avg_metrics", {})
         for test_type in TEST_TYPES:

@@ -90,12 +119,12 @@, @@ -103,7 +132,7 @@, @@ -111,7 +140,7 @@, @@ -121,14 +150,14 @@ def leaderboard_to_dataframe(leaderboard_data: Dict) -> pd.DataFrame: (whitespace-only changes on blank lines within the metric-averaging block)

@@ -136,25 +165,29 @@ def add_entries_to_leaderboard(leaderboard_data: Dict, new_entries: List[Dict])
     """
     Add new entries to the leaderboard, replacing any with the same model name.
     """
-    # Create a mapping of existing entries by model name
-    existing_entries = {
-
+    # Create a mapping of existing entries by model name and version
+    existing_entries = {
+        (entry["model_name"], entry.get("version", "v0")): i
+        for i, entry in enumerate(leaderboard_data.get("entries", []))
+    }
+
     # Process each new entry
     for new_entry in new_entries:
         model_name = new_entry.get("model_name")
-
-
+        version = new_entry.get("version", "v0")
+
+        if (model_name, version) in existing_entries:
             # Replace existing entry
-            leaderboard_data["entries"][existing_entries[model_name]] = new_entry
+            leaderboard_data["entries"][existing_entries[(model_name, version)]] = new_entry
         else:
             # Add new entry
             if "entries" not in leaderboard_data:
                 leaderboard_data["entries"] = []
             leaderboard_data["entries"].append(new_entry)
 
     # Update the last_updated timestamp
     leaderboard_data["last_updated"] = datetime.now().isoformat()
 
     return leaderboard_data

@@ -171,10 +204,10 @@ def process_jsonl_submission(file_path: str) -> Tuple[List[Dict], str]: (whitespace-only changes on blank lines around the empty-file check)
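The version fallback in `load_leaderboard_data` / `save_leaderboard_data` infers the version token from the file name by splitting on the last underscore. A small sketch of what that expression yields (`infer_version` is a hypothetical helper; its two-line body is copied from the hunks above):

    def infer_version(file_path: str) -> str:
        # Same fallback as in the diff: token after the last "_", extension stripped.
        version = "v0"
        if "_v" in file_path:
            version = file_path.split("_")[-1].split(".")[0]
        return version

    print(infer_version("data/leaderboard_v0.json"))  # v0
    print(infer_version("data/leaderboard.json"))     # v0 (no "_v" marker, default)
    print(infer_version("data/leaderboard_v1.json"))  # v1

One caveat worth knowing: the split keys on the last underscore, so a name like `data/leaderboard_v1_final.json` would come back as `final`, not `v1`.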
src/populate.py CHANGED

@@ -17,15 +17,29 @@ from src.envs import RESULTS_DATASET_ID, TOKEN, LEADERBOARD_FILE, CACHE_PATH
 from src.leaderboard.processor import leaderboard_to_dataframe, load_leaderboard_data, save_leaderboard_data, process_jsonl_submission, add_entries_to_leaderboard
 
 
-def 
+def get_versioned_leaderboard_file(version="v0"):
+    """
+    Get the versioned leaderboard file path.
+    """
+    base_name, ext = os.path.splitext(LEADERBOARD_FILE)
+    return f"{base_name}_{version}{ext}"
+
+
+def download_leaderboard_data(version="v0") -> bool:
     """
     Download the latest leaderboard data from HuggingFace.
+
+    Args:
+        version: The dataset version to download
     """
     try:
         # Create a temporary directory to download the submissions
-        temp_dir = os.path.join(CACHE_PATH, "
+        temp_dir = os.path.join(CACHE_PATH, f"temp_submissions_{version}")
         os.makedirs(temp_dir, exist_ok=True)
 
+        # Get the versioned leaderboard file
+        leaderboard_file = get_versioned_leaderboard_file(version)
+
         # Download the entire repository
         try:
             snapshot_path = snapshot_download(

@@ -43,25 +57,43 @@ def download_leaderboard_data() -> bool:
 
             # Look for submission files in the submissions directory
             submissions_dir = os.path.join(snapshot_path, "submissions")
+            version_submissions_dir = os.path.join(snapshot_path, f"submissions_{version}")
+
+            # Check both standard and versioned submission directories
             if os.path.exists(submissions_dir):
                 submission_files.extend(glob(os.path.join(submissions_dir, "*.jsonl")))
 
-
-
+            if os.path.exists(version_submissions_dir):
+                submission_files.extend(glob(os.path.join(version_submissions_dir, "*.jsonl")))
+
+            # Also look for any versioned JSONL files in the root
+            submission_files.extend(glob(os.path.join(snapshot_path, f"*_{version}.jsonl")))
+
+            # If we're looking for v0 and no versioned files found, use generic ones
+            if version == "v0" and not submission_files:
+                submission_files.extend(glob(os.path.join(snapshot_path, "*.jsonl")))
 
             # Process each submission file
             for file_path in submission_files:
                 entries, _ = process_jsonl_submission(file_path)
-
+
+                # Filter entries to those that match the version or don't have version specified
+                filtered_entries = [
+                    entry for entry in entries
+                    if entry.get("version", "v0") == version or "version" not in entry
+                ]
+
+                all_entries.extend(filtered_entries)
 
             # Create leaderboard data structure
             leaderboard_data = {
                 "entries": all_entries,
-                "last_updated": pd.Timestamp.now().isoformat()
+                "last_updated": pd.Timestamp.now().isoformat(),
+                "version": version
             }
 
             # Save to local file
-            save_leaderboard_data(leaderboard_data, 
+            save_leaderboard_data(leaderboard_data, leaderboard_file)
 
             return True
         except Exception as e:

@@ -72,7 +104,14 @@ def download_leaderboard_data() -> bool:
             api = HfApi(token=TOKEN)
             files = api.list_repo_files(repo_id=RESULTS_DATASET_ID, repo_type="dataset")
 
-
+            # Look for versioned and regular files
+            submission_files = [
+                f for f in files
+                if (f.endswith(f'_{version}.jsonl') or
+                    f.startswith(f'submissions_{version}/') or
+                    (version == "v0" and f.endswith('.jsonl')))
+            ]
+
             all_entries = []
 
             for file_path in submission_files:

@@ -84,49 +123,70 @@ def download_leaderboard_data() -> bool:
                         token=TOKEN
                     )
                     entries, _ = process_jsonl_submission(local_path)
-
+
+                    # Filter entries to those that match the version or don't have version specified
+                    filtered_entries = [
+                        entry for entry in entries
+                        if entry.get("version", "v0") == version or "version" not in entry
+                    ]
+
+                    all_entries.extend(filtered_entries)
                 except Exception as file_error:
                     print(f"Error downloading file {file_path}: {file_error}")
 
             # Create leaderboard data structure
             leaderboard_data = {
                 "entries": all_entries,
-                "last_updated": pd.Timestamp.now().isoformat()
+                "last_updated": pd.Timestamp.now().isoformat(),
+                "version": version
             }
 
             # Save to local file
-            save_leaderboard_data(leaderboard_data, 
+            save_leaderboard_data(leaderboard_data, leaderboard_file)
 
             return True
         except Exception as list_error:
             print(f"Error listing repository files: {list_error}")
 
             # If we can't download anything, create an empty leaderboard
-            if not os.path.exists(
-                empty_data = {
-
+            if not os.path.exists(leaderboard_file):
+                empty_data = {
+                    "entries": [],
+                    "last_updated": pd.Timestamp.now().isoformat(),
+                    "version": version
+                }
+                save_leaderboard_data(empty_data, leaderboard_file)
 
             return False
     except Exception as e:
         print(f"Error downloading leaderboard data: {e}")
 
         # Ensure we have at least an empty leaderboard file
-
-
-
+        leaderboard_file = get_versioned_leaderboard_file(version)
+        if not os.path.exists(leaderboard_file):
+            empty_data = {
+                "entries": [],
+                "last_updated": pd.Timestamp.now().isoformat(),
+                "version": version
+            }
+            save_leaderboard_data(empty_data, leaderboard_file)
 
         return False
 
 
-def get_leaderboard_df() -> pd.DataFrame:
+def get_leaderboard_df(version="v0") -> pd.DataFrame:
     """
     Get the leaderboard data as a DataFrame.
+
+    Args:
+        version: The dataset version to retrieve
     """
     # Try to download the latest data
-    download_leaderboard_data()
+    download_leaderboard_data(version=version)
 
     # Load from local file
-
+    leaderboard_file = get_versioned_leaderboard_file(version)
+    leaderboard_data = load_leaderboard_data(leaderboard_file)
 
     # Convert to DataFrame
     df = leaderboard_to_dataframe(leaderboard_data)

@@ -134,18 +194,20 @@ def get_leaderboard_df() -> pd.DataFrame:
     return df
 
 
-def get_category_leaderboard_df(category: str) -> pd.DataFrame:
+def get_category_leaderboard_df(category: str, version="v0") -> pd.DataFrame:
     """
     Get the leaderboard data filtered by a specific category.
 
     Args:
         category: The category to filter by (e.g., "Criminal, Violent, and Terrorist Activity")
+        version: The dataset version to retrieve
 
     Returns:
         DataFrame with metrics for the specified category
     """
     # Load the leaderboard data
-
+    leaderboard_file = get_versioned_leaderboard_file(version)
+    leaderboard_data = load_leaderboard_data(leaderboard_file)
 
     # Filter entries to only include those with data for the specified category
     filtered_entries = []

@@ -158,6 +220,7 @@ def get_category_leaderboard_df(category: str) -> pd.DataFrame:
                 "model_name": entry.get("model_name", "Unknown Model"),
                 "model_type": entry.get("model_type", "Unknown"),
                 "submission_date": entry.get("submission_date", ""),
+                "version": entry.get("version", version),
             }
 
             # Extract metrics for this category

@@ -189,7 +252,8 @@ def get_category_leaderboard_df(category: str) -> pd.DataFrame:
     # Create a new leaderboard data structure with the filtered entries
     filtered_leaderboard = {
         "entries": filtered_entries,
-        "last_updated": leaderboard_data.get("last_updated", pd.Timestamp.now().isoformat())
+        "last_updated": leaderboard_data.get("last_updated", pd.Timestamp.now().isoformat()),
+        "version": version
     }
 
     # Convert to DataFrame

@@ -198,14 +262,21 @@ def get_category_leaderboard_df(category: str) -> pd.DataFrame:
     return df
 
 
-def get_detailed_model_data(model_name: str) -> Dict:
+def get_detailed_model_data(model_name: str, version="v0") -> Dict:
     """
     Get detailed data for a specific model.
+
+    Args:
+        model_name: The name of the model to get data for
+        version: The dataset version to retrieve
     """
-
+    leaderboard_file = get_versioned_leaderboard_file(version)
+    leaderboard_data = load_leaderboard_data(leaderboard_file)
 
     for entry in leaderboard_data.get("entries", []):
-
+        # Check both the model name and version
+        entry_version = entry.get("version", "v0")
+        if entry.get("model_name") == model_name and (entry_version == version or entry_version is None):
             return entry
 
     return {}
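`get_versioned_leaderboard_file` splices the version into the configured leaderboard path, which is exactly what the new `data/leaderboard_v*.json` ignore rule in `.gitignore` matches. A sketch of the naming, assuming `LEADERBOARD_FILE = "data/leaderboard.json"` (the real value lives in `src/envs` and is not shown in this diff):

    import os

    LEADERBOARD_FILE = "data/leaderboard.json"  # assumed for illustration

    def get_versioned_leaderboard_file(version="v0"):
        # Split off the extension and splice the version in before it.
        base_name, ext = os.path.splitext(LEADERBOARD_FILE)
        return f"{base_name}_{version}{ext}"

    print(get_versioned_leaderboard_file())      # data/leaderboard_v0.json
    print(get_versioned_leaderboard_file("v1"))  # data/leaderboard_v1.json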
src/submission/submit.py CHANGED

@@ -25,33 +25,40 @@ def validate_submission(file_path: str) -> Tuple[bool, str]:
         entries, message = process_jsonl_submission(file_path)
         if not entries:
             return False, message
 
         # Additional validation could be added here
 
         return True, "Submission is valid"
     except Exception as e:
         return False, f"Error validating submission: {e}"
 
 
-def submit_to_hub(file_path: str, metadata: Dict, dataset_id: str, token: str) -> Tuple[bool, str]:
+def submit_to_hub(file_path: str, metadata: Dict, dataset_id: str, token: str, version="v0") -> Tuple[bool, str]:
     """
     Submit results to a HuggingFace dataset repository as individual files.
+
+    Args:
+        file_path: Path to the submission file
+        metadata: Metadata to include with the submission
+        dataset_id: The dataset repository ID
+        token: HuggingFace API token
+        version: The version of the benchmark used (e.g., "v0", "v1")
     """
     try:
         # Process the submission file to validate
         entries, message = process_jsonl_submission(file_path)
         if not entries:
             return False, message
 
         # Generate a unique submission ID
         model_name = metadata.get("model_name", "unknown")
         model_name_safe = model_name.replace("/", "_").replace(" ", "_")
         timestamp = datetime.now().strftime('%Y%m%d_%H%M%S')
         submission_id = f"{model_name_safe}_{timestamp}"
 
         # Create an API instance
         api = HfApi(token=token)
 
         # Create a temporary file with metadata added
         with tempfile.NamedTemporaryFile(mode='w', suffix='.jsonl', delete=False) as temp_file:
             # Add metadata to each entry

@@ -59,47 +66,58 @@ def submit_to_hub(file_path: str, metadata: Dict, dataset_id: str, token: str) -
                 # If the entry already has a model_name, don't override it
                 if "model_name" not in entry:
                     entry["model_name"] = metadata.get("model_name")
 
                 # Add other metadata if not present
                 for key, value in metadata.items():
                     if key != "model_name" and key not in entry:
                         entry[key] = value
 
+                # Ensure version is set
+                entry["version"] = version
+
                 # Write to temp file
                 temp_file.write(json.dumps(entry) + "\n")
 
         temp_path = temp_file.name
 
-        # Upload the file
-        submission_path = f"submissions/{submission_id}.jsonl"
+        # Upload the file to the version-specific directory
+        submission_path = f"submissions_{version}/{submission_id}_{version}.jsonl" if version != "v0" else f"submissions/{submission_id}.jsonl"
         api.upload_file(
             path_or_fileobj=temp_path,
             path_in_repo=submission_path,
             repo_id=dataset_id,
             repo_type="dataset",
-            commit_message=f"Add submission for {model_name}"
+            commit_message=f"Add submission for {model_name} (version {version})"
         )
 
         # Clean up the temporary file
        os.unlink(temp_path)
 
-        return True, f"Successfully uploaded submission for {model_name} to {dataset_id}"
+        return True, f"Successfully uploaded submission for {model_name} to {dataset_id} (version {version})"
     except Exception as e:
         return False, f"Error submitting to dataset: {e}"
 
 
-def process_submission(file_path: str, metadata: Dict) -> str:
+def process_submission(file_path: str, metadata: Dict, version="v0") -> str:
     """
     Process a submission to the GuardBench leaderboard.
+
+    Args:
+        file_path: Path to the submission file
+        metadata: Metadata to include with the submission
+        version: The version of the benchmark used (e.g., "v0", "v1")
     """
     # Validate submission file
     is_valid, validation_message = validate_submission(file_path)
     if not is_valid:
         return styled_error(validation_message)
 
+    # Add version to metadata
+    metadata["version"] = version
+
     # Submit to HuggingFace dataset repository
-    success, message = submit_to_hub(file_path, metadata, RESULTS_DATASET_ID, TOKEN)
+    success, message = submit_to_hub(file_path, metadata, RESULTS_DATASET_ID, TOKEN, version=version)
     if not success:
         return styled_error(message)
 
     return styled_message(f"Submission successful! {message}")
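Taken together, a submission now carries its version in three places: the per-entry `version` field, the upload path, and the commit message. A usage sketch of the new entry point (file path and metadata values are hypothetical):

    # Hypothetical call site for this commit's process_submission().
    metadata = {
        "model_name": "org/my-guard-model",
        "revision": "main",
        "precision": "float16",
        "weight_type": "Original",
        "model_type": "Open Source",  # illustrative value
    }
    html = process_submission("results.jsonl", metadata, version="v1")
    # version="v1" uploads to submissions_v1/<model>_<timestamp>_v1.jsonl;
    # version="v0" keeps the legacy submissions/<model>_<timestamp>.jsonl path.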