# Spaces: lisabdunlap / VibeCheck-Examples (Sleeping)
# File size: 12,951 Bytes

import os
import gradio as gr
import pandas as pd

def list_saved_results(results_dir="."):
    """List saved result files (``*.pkl``) in a directory, newest first.

    Args:
        results_dir: Directory to scan. Defaults to the current working
            directory, which is also where ``load_saved_result`` looks
            files up.

    Returns:
        A list of ``(label, value)`` tuples, usable as the ``choices`` of a
        ``gr.Dropdown``. Both elements are the bare filename. An empty list
        is returned when ``results_dir`` is not an existing directory.
    """
    if not os.path.isdir(results_dir):
        return []

    def _mtime(name):
        return os.path.getmtime(os.path.join(results_dir, name))

    # Keep regular files only: a directory that happens to end in ".pkl"
    # could never be unpickled later.
    pickle_names = [
        name
        for name in os.listdir(results_dir)
        if name.endswith(".pkl") and os.path.isfile(os.path.join(results_dir, name))
    ]
    # Most recently modified first.
    pickle_names.sort(key=_mtime, reverse=True)
    return [(name, name) for name in pickle_names]

def load_saved_result(filename):
    """Unpickle and return the saved result stored under ``filename``.

    The name is resolved relative to the current working directory. When no
    such path exists ``None`` is returned; otherwise the unpickled object is
    handed back unchanged.

    NOTE: unpickling can execute arbitrary code, so this should only ever be
    pointed at result files that are trusted.
    """
    import pickle

    target = os.path.join(".", filename)
    if os.path.exists(target):
        with open(target, "rb") as handle:
            return pickle.load(handle)
    return None

def show_examples(selected_vibe, results_dict, model_list_str=None):
    """Build the example-picker dropdown for the selected vibe.

    This is wired to a single ``gr.Dropdown`` output, so every code path
    returns exactly one dropdown update.

    Args:
        selected_vibe: Vibe name chosen by the user; falsy when nothing is
            selected.
        results_dict: Loaded results; must contain a ``"vibe_df"`` DataFrame
            with ``vibe``, ``question`` and ``score`` columns.
        model_list_str: Unused here; accepted so the signature parallels
            ``display_selected_example``.

    Returns:
        A ``gr.Dropdown`` update whose choices are labels of the form
        ``"Example N: <preview> (Score: x.xxx)"``. The dropdown is emptied
        when there is no vibe selected, no data, or no matching rows.
    """
    if not selected_vibe or not results_dict or "vibe_df" not in results_dict:
        return gr.Dropdown(choices=[], value=None)

    vibe_df = results_dict["vibe_df"]

    # Use exactly the same row filter as display_selected_example, which maps
    # the "Example N" label back to the N-th row of this subset. If the two
    # filters differ, the label and the displayed example can disagree.
    subset = vibe_df[
        (vibe_df["vibe"] == selected_vibe) & (vibe_df["score"].abs() > 0.0)
    ].head(10)

    example_choices = []
    for i, (question, score) in enumerate(zip(subset["question"], subset["score"]), 1):
        question = str(question)
        # Short preview of the prompt (first 100 chars).
        preview = question[:100] + "..." if len(question) > 100 else question
        # Format: "Example 1: This is the prompt... (Score: 0.123)"
        example_choices.append(f"Example {i}: {preview} (Score: {score:.3f})")

    return gr.Dropdown(
        choices=example_choices,
        value=example_choices[0] if example_choices else None,
    )

def display_selected_example(example_idx, selected_vibe, results_dict, model_list_str=None):
    """Render one example of the selected vibe as a Markdown string.

    ``example_idx`` is the dropdown label (``"Example N: ..."``); N is parsed
    out of it and used as a 1-based position among the first ten rows of
    ``results_dict["vibe_df"]`` that match ``selected_vibe`` and have a
    non-zero score. Model names come from the comma-separated
    ``model_list_str`` when given, otherwise from ``results_dict["models"]``
    (falling back to "Model A"/"Model B"). A short user-facing message is
    returned instead of Markdown when the inputs are missing or invalid.
    """
    if example_idx is None or not selected_vibe:
        return "Please select an example to view."
    if not results_dict or "vibe_df" not in results_dict:
        return "No vibe data available."

    frame = results_dict["vibe_df"]

    # Explicit model list wins over whatever the results file recorded.
    if model_list_str:
        models = [name.strip() for name in model_list_str.split(",") if name.strip()]
    else:
        models = results_dict.get("models", ["Model A", "Model B"])

    # Rows for this vibe with a non-zero score, capped at ten.
    is_vibe = frame["vibe"] == selected_vibe
    has_signal = frame["score"].abs() > 0.0
    candidates = frame[is_vibe & has_signal].head(10).to_dict(orient="records")

    # Recover the 0-based position from a label such as "Example 3: ...".
    try:
        position = int(example_idx.split(":")[0].replace("Example ", "")) - 1
    except (ValueError, AttributeError, IndexError):
        return f"Invalid example selection: '{example_idx}'"
    if not 0 <= position < len(candidates):
        return "Example index out of range."

    chosen = candidates[position]

    parts = [f"**Prompt:** {chosen['question']}\n\n"]
    # Only models that actually have an output column are shown.
    parts.extend(
        f"**{model} Output:**\n{chosen[model]}\n________________________\n"
        for model in models
        if model in chosen
    )
    parts.append(f"**Score:** {chosen['score']:.3f} ")

    # Positive score -> first model shows the vibe more; otherwise the second.
    if len(models) >= 2:
        leader = models[0] if chosen["score"] > 0 else models[1]
        parts.append(f"({leader} exhibits this vibe more)\n\n")

    return "".join(parts)

def create_vibecheck_results_ui():
    """Build and return the Gradio Blocks interface for viewing VibeCheck results.

    The layout has a results picker (dropdown, refresh and load buttons), a
    Markdown summary, two plots, a vibe/example explorer and a CSV download
    row. Event handlers are defined as closures and wired to the components
    before the ``gr.Blocks`` object is returned; nothing is launched here.

    Returns:
        The configured ``gr.Blocks`` instance.
    """

    # Custom CSS for a more professional look
    custom_css = """
    .gradio-container {
        max-width: 1200px !important;
        margin: auto !important;
    }
    .header-text {
        text-align: center;
        margin-bottom: 1rem;
    }
    .header-emoji {
        font-size: 2.5rem;
        margin-right: 0.5rem;
    }
    .tab-content {
        padding: 1rem;
    }
    .footer {
        text-align: center;
        margin-top: 2rem;
        color: #666;
        font-size: 0.9rem;
    }
    """

    with gr.Blocks(theme='davehornik/Tealy', css=custom_css) as demo:
        gr.HTML("""
            <div class="header-text">
                <h1><span class="header-emoji">✨</span> LLM VibeChecks <span class="header-emoji">✨</span></h1>
                <p>Visualize the Vibes of model pairs using <a href="https://arxiv.org/abs/2410.12851" target="_blank" style="text-decoration: underline; color: inherit;">VibeCheck system</a> (more to come)</p>
            </div>
        """)

        with gr.Row():
            # Left Column - Selection
            with gr.Column(scale=1):
                # Dropdown to select saved results
                saved_results_dropdown = gr.Dropdown(
                    label="Select a dataset/model pair",
                    choices=list_saved_results(),
                    interactive=True,
                    allow_custom_value=False,
                )

                # Refresh button
                refresh_btn = gr.Button("Refresh List", variant="secondary")

                # Load button
                load_btn = gr.Button("Load Selected Results", variant="primary")

                # Status message
                load_status = gr.Markdown()

        # Output area
        with gr.Row():
            load_output_md = gr.Markdown()

        with gr.Row():
            with gr.Column():
                gr.Markdown("### Vibe Score Heuristics\n\nVibes which have low scores are not used in prediction models")
                load_output_plot1 = gr.Plot()

        with gr.Row():
            with gr.Column():
                gr.Markdown("### Vibe Prediction Coefficients")
                load_output_plot2 = gr.Plot()

        # Bottom Row - Vibe Examples
        with gr.Row():
            with gr.Column():
                gr.Markdown("## Explore Examples")
                load_vibe_dropdown = gr.Dropdown(
                    label="Select a vibe to see examples and their scores",
                    choices=[],
                    interactive=True,
                    multiselect=False,
                    value=None,
                    allow_custom_value=False,
                )
                load_example_dropdown = gr.Dropdown(
                    label="Select an example to view",
                    choices=[],
                    interactive=True,
                    multiselect=False,
                    value=None,
                    allow_custom_value=False,
                )
                load_examples_md = gr.Markdown()

        # Store loaded results state
        load_results_state = gr.State()

        # Function to refresh the list of saved results
        def refresh_saved_results():
            return gr.Dropdown(choices=list_saved_results())

        refresh_btn.click(
            fn=refresh_saved_results,
            inputs=[],
            outputs=[saved_results_dropdown]
        )

        def _fmt_metric(value):
            """Format a metric to three decimals, or "N/A" if it is not numeric."""
            try:
                return f"{value:.3f}"
            except (TypeError, ValueError):
                return "N/A"

        def _metrics_section(heading, section):
            """Render one metrics dict (accuracy, std, optional CI) as Markdown."""
            text = heading
            text += (
                f"- Accuracy: {_fmt_metric(section.get('accuracy'))}"
                f" ± {_fmt_metric(section.get('acc_std'))}\n"
            )
            if "acc_ci" in section:
                low, high = section["acc_ci"][0], section["acc_ci"][1]
                text += f"- 95% CI: [{_fmt_metric(low)}, {_fmt_metric(high)}]\n"
            return text

        # Function to load selected results
        def on_load_results(selected_result):
            """Load the chosen result file and produce values for all six outputs."""

            def failure(message):
                # One value per wired output: status, summary, two plots,
                # state, and an emptied vibe dropdown.
                return message, None, None, None, None, gr.Dropdown(choices=[], value=None)

            if not selected_result:
                return failure("Please select a saved analysis to load.")

            # Get the actual filename from the display name
            filename = selected_result[1] if isinstance(selected_result, tuple) else selected_result

            # Load the results
            results = load_saved_result(filename)

            if not isinstance(results, dict) or "vibe_df" not in results:
                return failure("Failed to load results or invalid result file.")

            # Same fallback names the example viewers use when "models" is absent.
            model_names = results.get("models") or ["Model A", "Model B"]

            accuracy_info = ""
            metrics = results.get("vibe_prediction_metrics") or {}
            if "identity_metrics" in metrics:
                accuracy_info += _metrics_section(
                    "\n\n### Model Identity Prediction\n", metrics["identity_metrics"]
                )
            if "preference_metrics" in metrics:
                accuracy_info += _metrics_section(
                    "\n### Preference Prediction\n", metrics["preference_metrics"]
                )

            # Add model names to the summary
            model_info = f"\n\n**Models Compared: {' vs. '.join(str(m) for m in model_names)}**\n\n"

            # Compute the distinct vibes once; summary keeps first-seen order,
            # the dropdown is sorted.
            vibes = results["vibe_df"]["vibe"].unique().tolist()
            sorted_vibes = sorted(vibes)

            # Summarize
            summary_text = (
                "### Loaded VibeCheck Results\n"
                + model_info
                + f"**Found {len(vibes)} vibe(s)**\n"
                + "\n".join(f"- {vibe}" for vibe in vibes)
                + accuracy_info
            )

            return (
                "Results loaded successfully!",
                summary_text,
                results.get("model_vibe_scores_plot"),
                results.get("score_dist_plot"),
                results,
                gr.Dropdown(
                    choices=sorted_vibes,
                    value=sorted_vibes[0] if sorted_vibes else None,
                ),
            )

        load_btn.click(
            fn=on_load_results,
            inputs=[saved_results_dropdown],
            outputs=[
                load_status,
                load_output_md,
                load_output_plot1,
                load_output_plot2,
                load_results_state,
                load_vibe_dropdown,
            ],
        )

        # Show examples for selected vibe
        load_vibe_dropdown.change(
            fn=show_examples,
            inputs=[load_vibe_dropdown, load_results_state],
            outputs=[load_example_dropdown],
        )

        # Add handler for example dropdown
        load_example_dropdown.change(
            fn=display_selected_example,
            inputs=[load_example_dropdown, load_vibe_dropdown, load_results_state],
            outputs=[load_examples_md],
        )

        # Add a download button and file component
        with gr.Row():
            download_btn = gr.Button("Download DataFrame as CSV", variant="secondary")
            download_file = gr.File(label="Download CSV", interactive=False)

        # Function to convert DataFrame to CSV and return for download
        def download_dataframe(results_dict):
            """Write ``results_dict["df"]`` to a CSV file and return its path."""
            if not results_dict or "df" not in results_dict:
                return None

            # NOTE: fixed filename in the working directory, so each export
            # overwrites the previous one.
            csv_file_path = "vibecheck_results.csv"
            results_dict["df"].to_csv(csv_file_path, index=False)
            return csv_file_path

        # Connect the download button to the function
        download_btn.click(
            fn=download_dataframe,
            inputs=[load_results_state],
            outputs=[download_file]
        )

        # Add a footer
        gr.HTML("""
        <div class="footer">
            <p>© just some folks who like vibes</p>
        </div>
        """)

    return demo

# Build the interface when this module is executed or imported, keep it bound
# to the module-level name `app`, and launch it immediately.
app = create_vibecheck_results_ui()
app.launch()