Spaces:
				
			
			
	
			
			
					
		Running
		
	
	
	
			
			
	
	
	
	
		
		
					
		Running
		
	
		Amber Tanaka
		
	committed on
		
		
					Bug Bash Fixes (#79)
Browse files
- leaderboard_transformer.py +19 -31
- main_page.py +3 -2
- submission.py +3 -2
- ui_components.py +1 -1
    	
        leaderboard_transformer.py
    CHANGED
    
    | @@ -87,7 +87,7 @@ ORDER_MAP = { | |
| 87 | 
             
            }
         | 
| 88 |  | 
| 89 |  | 
| 90 | 
            -
            def _safe_round(value, digits= | 
| 91 | 
             
                """Rounds a number if it's a valid float/int, otherwise returns it as is."""
         | 
| 92 | 
             
                return round(value, digits) if isinstance(value, (float, int)) and pd.notna(value) else value
         | 
| 93 |  | 
| @@ -278,13 +278,7 @@ class DataTransformer: | |
| 278 | 
             
                    if primary_metric == "Overall":
         | 
| 279 | 
             
                        def calculate_attempted(row):
         | 
| 280 | 
             
                            main_categories = ['Literature Understanding', 'Code & Execution', 'Data Analysis', 'End-to-End Discovery']
         | 
| 281 | 
            -
                            count = sum(1 for category in main_categories if  | 
| 282 | 
            -
             | 
| 283 | 
            -
                            # Return the formatted string with the correct emoji
         | 
| 284 | 
            -
                            if count == 4:
         | 
| 285 | 
            -
                                return f"4/4"
         | 
| 286 | 
            -
                            if count == 0:
         | 
| 287 | 
            -
                                return f"0/4"
         | 
| 288 | 
             
                            return f"{count}/4"
         | 
| 289 |  | 
| 290 | 
             
                        # Apply the function row-wise to create the new column
         | 
| @@ -295,13 +289,8 @@ class DataTransformer: | |
| 295 | 
             
                        total_benchmarks = len(group_metrics)
         | 
| 296 | 
             
                        def calculate_benchmarks_attempted(row):
         | 
| 297 | 
             
                            # Count how many benchmarks in this category have COST data reported
         | 
| 298 | 
            -
                            count = sum(1 for benchmark in group_metrics if pd.notna(row.get(f"{benchmark}  | 
| 299 | 
            -
                             | 
| 300 | 
            -
                                return f"{count}/{total_benchmarks} "
         | 
| 301 | 
            -
                            elif count == 0:
         | 
| 302 | 
            -
                                return f"{count}/{total_benchmarks} "
         | 
| 303 | 
            -
                            else:
         | 
| 304 | 
            -
                                return f"{count}/{total_benchmarks}"
         | 
| 305 | 
             
                        # Insert the new column, for example, after "Date"
         | 
| 306 | 
             
                        df_view.insert((cols - 2), "Benchmarks Attempted", df_view.apply(calculate_benchmarks_attempted, axis=1))
         | 
| 307 |  | 
| @@ -459,7 +448,7 @@ def _plot_scatter_plotly( | |
| 459 | 
             
                    h_pad = "   "
         | 
| 460 | 
             
                    parts = ["<br>"]
         | 
| 461 | 
             
                    parts.append(f"{h_pad}<b>{row[agent_col]}</b>{h_pad}<br>")
         | 
| 462 | 
            -
                    parts.append(f"{h_pad}Score: <b>{row[y_col]:. | 
| 463 | 
             
                    parts.append(f"{h_pad}{x_axis_label}: <b>${row[x_col]:.2f}</b>{h_pad}<br>")
         | 
| 464 | 
             
                    parts.append(f"{h_pad}Openness: <b>{row['Openness']}</b>{h_pad}<br>")
         | 
| 465 | 
             
                    parts.append(f"{h_pad}Tooling: <b>{row['Agent Tooling']}</b>{h_pad}")
         | 
| @@ -531,15 +520,14 @@ def _plot_scatter_plotly( | |
| 531 | 
             
                    )
         | 
| 532 |  | 
| 533 | 
             
                    # ---Adjust x-axis range to make room for the new points ---
         | 
| 534 | 
            -
                    xaxis_config['range'] = [0, (max_reported_cost + (max_reported_cost / 4))]
         | 
| 535 |  | 
| 536 | 
            -
                logo_data_uri = svg_to_data_uri("assets/just-icon.svg")
         | 
| 537 |  | 
| 538 | 
             
                fig.update_layout(
         | 
| 539 | 
             
                    template="plotly_white",
         | 
| 540 | 
             
                    title=f"AstaBench {name} Leaderboard",
         | 
| 541 | 
             
                    xaxis=xaxis_config, # Use the updated config
         | 
| 542 | 
            -
                    yaxis=dict(title="Average (mean) score",  | 
| 543 | 
             
                    legend=dict(
         | 
| 544 | 
             
                        bgcolor='#FAF2E9',
         | 
| 545 | 
             
                    ),
         | 
| @@ -551,17 +539,17 @@ def _plot_scatter_plotly( | |
| 551 | 
             
                        font_color="#d3dedc",
         | 
| 552 | 
             
                    ),
         | 
| 553 | 
             
                )
         | 
| 554 | 
            -
                fig.add_layout_image(
         | 
| 555 | 
            -
             | 
| 556 | 
            -
             | 
| 557 | 
            -
             | 
| 558 | 
            -
             | 
| 559 | 
            -
             | 
| 560 | 
            -
             | 
| 561 | 
            -
             | 
| 562 | 
            -
             | 
| 563 | 
            -
             | 
| 564 | 
            -
                )
         | 
| 565 |  | 
| 566 | 
             
                return fig
         | 
| 567 |  | 
| @@ -621,7 +609,7 @@ def format_score_column(df: pd.DataFrame, score_col_name: str) -> pd.DataFrame: | |
| 621 |  | 
| 622 | 
             
                    # For all other numbers, format them for consistency.
         | 
| 623 | 
             
                    if isinstance(score_value, (int, float)):
         | 
| 624 | 
            -
                        return f"{score_value:. | 
| 625 |  | 
| 626 | 
             
                    # Fallback for any unexpected non-numeric data
         | 
| 627 | 
             
                    return score_value
         | 
|  | |
| 87 | 
             
            }
         | 
| 88 |  | 
| 89 |  | 
| 90 | 
            +
            def _safe_round(value, digits=3):
         | 
| 91 | 
             
                """Rounds a number if it's a valid float/int, otherwise returns it as is."""
         | 
| 92 | 
             
                return round(value, digits) if isinstance(value, (float, int)) and pd.notna(value) else value
         | 
| 93 |  | 
|  | |
| 278 | 
             
                    if primary_metric == "Overall":
         | 
| 279 | 
             
                        def calculate_attempted(row):
         | 
| 280 | 
             
                            main_categories = ['Literature Understanding', 'Code & Execution', 'Data Analysis', 'End-to-End Discovery']
         | 
| 281 | 
            +
                            count = sum(1 for category in main_categories if row.get(f"{category} Score") != 0.0)
         | 
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
| 282 | 
             
                            return f"{count}/4"
         | 
| 283 |  | 
| 284 | 
             
                        # Apply the function row-wise to create the new column
         | 
|  | |
| 289 | 
             
                        total_benchmarks = len(group_metrics)
         | 
| 290 | 
             
                        def calculate_benchmarks_attempted(row):
         | 
| 291 | 
             
                            # Count how many benchmarks in this category have COST data reported
         | 
| 292 | 
            +
                            count = sum(1 for benchmark in group_metrics if pd.notna(row.get(f"{benchmark} Score")))
         | 
| 293 | 
            +
                            return f"{count}/{total_benchmarks}"
         | 
|  | |
|  | |
|  | |
|  | |
|  | |
| 294 | 
             
                        # Insert the new column, for example, after "Date"
         | 
| 295 | 
             
                        df_view.insert((cols - 2), "Benchmarks Attempted", df_view.apply(calculate_benchmarks_attempted, axis=1))
         | 
| 296 |  | 
|  | |
| 448 | 
             
                    h_pad = "   "
         | 
| 449 | 
             
                    parts = ["<br>"]
         | 
| 450 | 
             
                    parts.append(f"{h_pad}<b>{row[agent_col]}</b>{h_pad}<br>")
         | 
| 451 | 
            +
                    parts.append(f"{h_pad}Score: <b>{row[y_col]:.3f}</b>{h_pad}<br>")
         | 
| 452 | 
             
                    parts.append(f"{h_pad}{x_axis_label}: <b>${row[x_col]:.2f}</b>{h_pad}<br>")
         | 
| 453 | 
             
                    parts.append(f"{h_pad}Openness: <b>{row['Openness']}</b>{h_pad}<br>")
         | 
| 454 | 
             
                    parts.append(f"{h_pad}Tooling: <b>{row['Agent Tooling']}</b>{h_pad}")
         | 
|  | |
| 520 | 
             
                    )
         | 
| 521 |  | 
| 522 | 
             
                    # ---Adjust x-axis range to make room for the new points ---
         | 
| 523 | 
            +
                    xaxis_config['range'] = [-0.2, (max_reported_cost + (max_reported_cost / 4))]
         | 
| 524 |  | 
|  | |
| 525 |  | 
| 526 | 
             
                fig.update_layout(
         | 
| 527 | 
             
                    template="plotly_white",
         | 
| 528 | 
             
                    title=f"AstaBench {name} Leaderboard",
         | 
| 529 | 
             
                    xaxis=xaxis_config, # Use the updated config
         | 
| 530 | 
            +
                    yaxis=dict(title="Average (mean) score", range=[-0.2, None]),
         | 
| 531 | 
             
                    legend=dict(
         | 
| 532 | 
             
                        bgcolor='#FAF2E9',
         | 
| 533 | 
             
                    ),
         | 
|  | |
| 539 | 
             
                        font_color="#d3dedc",
         | 
| 540 | 
             
                    ),
         | 
| 541 | 
             
                )
         | 
| 542 | 
            +
                # fig.add_layout_image(
         | 
| 543 | 
            +
                #     dict(
         | 
| 544 | 
            +
                #         source=logo_data_uri,
         | 
| 545 | 
            +
                #         xref="x domain", yref="y domain",
         | 
| 546 | 
            +
                #         x=1.1, y=1.1,
         | 
| 547 | 
            +
                #         sizex=0.2, sizey=0.2,
         | 
| 548 | 
            +
                #         xanchor="left",
         | 
| 549 | 
            +
                #         yanchor="bottom",
         | 
| 550 | 
            +
                #         layer="above",
         | 
| 551 | 
            +
                #     ),
         | 
| 552 | 
            +
                # )
         | 
| 553 |  | 
| 554 | 
             
                return fig
         | 
| 555 |  | 
|  | |
| 609 |  | 
| 610 | 
             
                    # For all other numbers, format them for consistency.
         | 
| 611 | 
             
                    if isinstance(score_value, (int, float)):
         | 
| 612 | 
            +
                        return f"{score_value:.3f}"
         | 
| 613 |  | 
| 614 | 
             
                    # Fallback for any unexpected non-numeric data
         | 
| 615 | 
             
                    return score_value
         | 
    	
        main_page.py
    CHANGED
    
    | @@ -57,8 +57,9 @@ def build_page(): | |
| 57 | 
             
                        else:
         | 
| 58 | 
             
                            gr.Markdown("No data available for validation split.")
         | 
| 59 |  | 
| 60 | 
            -
                 | 
| 61 | 
            -
             | 
|  | |
| 62 |  | 
| 63 |  | 
| 64 | 
             
                # JavaScript to show the TEST nav, hide the VALIDATION nav, AND fix the plots.
         | 
|  | |
| 57 | 
             
                        else:
         | 
| 58 | 
             
                            gr.Markdown("No data available for validation split.")
         | 
| 59 |  | 
| 60 | 
            +
                # hiding this for now till we have the real paper data
         | 
| 61 | 
            +
                # with gr.Accordion("📙 Citation", open=False):
         | 
| 62 | 
            +
                #     gr.Textbox(value=CITATION_BUTTON_TEXT, label=CITATION_BUTTON_LABEL, elem_id="citation-button-main", interactive=False)
         | 
| 63 |  | 
| 64 |  | 
| 65 | 
             
                # JavaScript to show the TEST nav, hide the VALIDATION nav, AND fix the plots.
         | 
    	
        submission.py
    CHANGED
    
    | @@ -428,5 +428,6 @@ def build_page(): | |
| 428 | 
             
                    ],
         | 
| 429 | 
             
                    [error_message, error_modal, success_modal, loading_modal],
         | 
| 430 | 
             
                )
         | 
| 431 | 
            -
                 | 
| 432 | 
            -
             | 
|  | 
|  | |
| 428 | 
             
                    ],
         | 
| 429 | 
             
                    [error_message, error_modal, success_modal, loading_modal],
         | 
| 430 | 
             
                )
         | 
| 431 | 
            +
                # hiding this for now till we have the real paper data
         | 
| 432 | 
            +
                # with gr.Accordion("📙 Citation", open=False):
         | 
| 433 | 
            +
                #     gr.Textbox(value=CITATION_BUTTON_TEXT, label=CITATION_BUTTON_LABEL, elem_id="citation-button-main", interactive=False)
         | 
    	
        ui_components.py
    CHANGED
    
    | @@ -596,7 +596,7 @@ def create_leaderboard_display( | |
| 596 | 
             
                    with gr.Column(scale=3):
         | 
| 597 | 
             
                        plot_component = gr.Plot(
         | 
| 598 | 
             
                            value=scatter_plot,
         | 
| 599 | 
            -
                            show_label=False
         | 
| 600 | 
             
                        )
         | 
| 601 | 
             
                    with gr.Column(scale=1):
         | 
| 602 | 
             
                        gr.HTML(value=plot_legend_html)
         | 
|  | |
| 596 | 
             
                    with gr.Column(scale=3):
         | 
| 597 | 
             
                        plot_component = gr.Plot(
         | 
| 598 | 
             
                            value=scatter_plot,
         | 
| 599 | 
            +
                            show_label=False,
         | 
| 600 | 
             
                        )
         | 
| 601 | 
             
                    with gr.Column(scale=1):
         | 
| 602 | 
             
                        gr.HTML(value=plot_legend_html)
         | 
