"""Gradio viewer for saved VibeCheck result files (pickled result dictionaries)."""

import os
import pickle

import gradio as gr
import pandas as pd

# Directory scanned for saved ``.pkl`` result files (the current working directory).
RESULTS_DIR = "."
# Maximum number of example rows offered per vibe. Shared by the example
# dropdown and the detail view so that "Example N" means the same row in both.
MAX_EXAMPLES = 10
# Number of prompt characters shown in an example's dropdown label.
PREVIEW_CHARS = 100
# Model names used when a result dictionary does not record any.
DEFAULT_MODELS = ["Model A", "Model B"]


def list_saved_results():
    """List the saved result files found in ``RESULTS_DIR``.

    Returns:
        A list of ``(label, value)`` tuples, one per ``.pkl`` file, ordered by
        modification time (newest first). Label and value are both the bare
        filename. An empty list is returned when the directory does not exist.
    """
    if not os.path.isdir(RESULTS_DIR):
        return []

    result_files = [f for f in os.listdir(RESULTS_DIR) if f.endswith(".pkl")]
    result_files.sort(
        key=lambda name: os.path.getmtime(os.path.join(RESULTS_DIR, name)),
        reverse=True,
    )
    return [(name, name) for name in result_files]


def load_saved_result(filename):
    """Load a saved result file from ``RESULTS_DIR``.

    Args:
        filename: Name of the pickle file, relative to ``RESULTS_DIR``.

    Returns:
        The unpickled object, or ``None`` when no such file exists.

    Raises:
        Whatever ``open`` / ``pickle.load`` raise for unreadable or corrupt files.
    """
    filepath = os.path.join(RESULTS_DIR, filename)
    if not os.path.isfile(filepath):
        return None

    # SECURITY: pickle.load can execute arbitrary code embedded in the file.
    # Only point this app at result files produced by a trusted source.
    with open(filepath, "rb") as f:
        return pickle.load(f)


def _resolve_models(results_dict, model_list_str=None):
    """Return model names from a comma-separated override or the results dict."""
    if model_list_str:
        return [m.strip() for m in model_list_str.split(",") if m.strip()]
    # ``or`` also covers a stored ``None``, which would break iteration later.
    return list(results_dict.get("models") or DEFAULT_MODELS)


def _select_vibe_examples(vibe_df, selected_vibe):
    """Return up to ``MAX_EXAMPLES`` rows (as dicts) for one vibe.

    Rows whose score is zero or NaN are skipped: they carry no signal about
    which model exhibits the vibe more. Both the dropdown and the detail view
    call this helper, so the positions they use always refer to the same rows.
    """
    mask = (vibe_df["vibe"] == selected_vibe) & (vibe_df["score"].abs() > 0.0)
    return vibe_df[mask].head(MAX_EXAMPLES).to_dict(orient="records")


def _example_label(position, row):
    """Build a dropdown label such as ``Example 1: <prompt preview> (Score: 0.123)``."""
    question = str(row["question"])
    if len(question) > PREVIEW_CHARS:
        preview = question[:PREVIEW_CHARS] + "..."
    else:
        preview = question
    return f"Example {position}: {preview} (Score: {row['score']:.3f})"


def show_examples(selected_vibe, results_dict, model_list_str=None):
    """Populate the example dropdown for the selected vibe.

    Args:
        selected_vibe: Vibe name chosen in the vibe dropdown (may be empty).
        results_dict: Loaded results; must contain a ``"vibe_df"`` DataFrame
            with ``vibe``, ``question`` and ``score`` columns.
        model_list_str: Unused; kept for backward compatibility.

    Returns:
        A ``gr.Dropdown`` update. This handler is wired to a single output
        component, so every path returns exactly one value. When there is
        nothing to show, the dropdown is emptied.
    """
    if not selected_vibe or not results_dict or "vibe_df" not in results_dict:
        return gr.Dropdown(choices=[], value=None)

    rows = _select_vibe_examples(results_dict["vibe_df"], selected_vibe)
    example_choices = [_example_label(i, row) for i, row in enumerate(rows, 1)]
    return gr.Dropdown(
        choices=example_choices,
        value=example_choices[0] if example_choices else None,
    )


def display_selected_example(example_idx, selected_vibe, results_dict, model_list_str=None):
    """Render one example as Markdown.

    Args:
        example_idx: Dropdown label of the form ``"Example N: ..."``.
        selected_vibe: Vibe name the example belongs to.
        results_dict: Loaded results containing ``"vibe_df"``.
        model_list_str: Optional comma-separated model names overriding the
            ones stored in ``results_dict``.

    Returns:
        A Markdown string with the prompt, each model's output (when the row
        has a column named after the model), the score, and which model
        exhibits the vibe more. A short message is returned for missing or
        invalid selections.
    """
    if example_idx is None or not selected_vibe:
        return "Please select an example to view."
    if not results_dict or "vibe_df" not in results_dict:
        return "No vibe data available."

    models = _resolve_models(results_dict, model_list_str)
    rows = _select_vibe_examples(results_dict["vibe_df"], selected_vibe)

    # Labels look like "Example 3: ..."; recover the zero-based position.
    try:
        example_num = int(example_idx.split(":")[0].replace("Example ", "")) - 1
    except (ValueError, AttributeError, IndexError):
        return f"Invalid example selection: '{example_idx}'"
    if not 0 <= example_num < len(rows):
        return "Example index out of range."

    row = rows[example_num]
    parts = [f"**Prompt:** {row['question']}\n\n"]
    for model in models:
        if model in row:
            parts.append(f"**{model} Output:**\n{row[model]}\n________________________\n")
    parts.append(f"**Score:** {row['score']:.3f} ")

    # A positive score is attributed to the first model, a negative one to the second.
    if len(models) >= 2:
        leader = models[0] if row["score"] > 0 else models[1]
        parts.append(f"({leader} exhibits this vibe more)\n\n")
    return "".join(parts)


def _format_metric(value, spec=".3f"):
    """Format a numeric metric; return ``"N/A"`` when it is missing or non-numeric."""
    try:
        return format(value, spec)
    except (TypeError, ValueError):
        return "N/A"


def _metrics_section(heading, metrics):
    """Render one accuracy section (accuracy ± std and optional 95% CI) as Markdown."""
    accuracy = _format_metric(metrics.get("accuracy"))
    acc_std = _format_metric(metrics.get("acc_std"))
    section = heading + f"- Accuracy: {accuracy} ± {acc_std}\n"

    ci = metrics.get("acc_ci")
    if ci is not None and len(ci) >= 2:
        section += f"- 95% CI: [{_format_metric(ci[0])}, {_format_metric(ci[1])}]\n"
    return section


def _build_summary(results, vibes):
    """Build the Markdown summary (models, vibe list, accuracy metrics) for a result."""
    model_names = results.get("models") or DEFAULT_MODELS
    metrics = results.get("vibe_prediction_metrics") or {}

    accuracy_info = ""
    if "identity_metrics" in metrics:
        accuracy_info += _metrics_section(
            "\n\n### Model Identity Prediction\n", metrics["identity_metrics"]
        )
    if "preference_metrics" in metrics:
        accuracy_info += _metrics_section(
            "\n### Preference Prediction\n", metrics["preference_metrics"]
        )

    model_info = f"\n\n**Models Compared: {' vs. '.join(map(str, model_names))}**\n\n"
    return (
        "### Loaded VibeCheck Results\n"
        + model_info
        + f"**Found {len(vibes)} vibe(s)**\n"
        + "\n".join(f"- {vibe}" for vibe in vibes)
        + accuracy_info
    )


def _load_failure(message):
    """Return a 6-value handler result that reports ``message`` and clears the outputs."""
    return message, None, None, None, None, gr.Dropdown(choices=[], value=None)


def _on_load_results(selected_result):
    """Load the selected result file and produce values for the six load outputs.

    Returns:
        ``(status, summary_markdown, plot1, plot2, results_state, vibe_dropdown)``.
        Every path returns six values so the tuple always matches the number
        of output components the load button is wired to.
    """
    if not selected_result:
        return _load_failure("Please select a saved analysis to load.")

    # Dropdown choices are (label, value) tuples; accept either form.
    filename = selected_result[1] if isinstance(selected_result, tuple) else selected_result

    try:
        results = load_saved_result(filename)
    except (OSError, pickle.UnpicklingError, EOFError, AttributeError, ImportError):
        return _load_failure("Failed to load results or invalid result file.")
    if not isinstance(results, dict) or "vibe_df" not in results:
        return _load_failure("Failed to load results or invalid result file.")

    vibes = results["vibe_df"]["vibe"].unique().tolist()
    sorted_vibes = sorted(vibes)
    return (
        "Results loaded successfully!",
        _build_summary(results, vibes),
        results.get("model_vibe_scores_plot"),
        results.get("score_dist_plot"),
        results,
        gr.Dropdown(
            choices=sorted_vibes,
            value=sorted_vibes[0] if sorted_vibes else None,
        ),
    )


def _refresh_saved_results():
    """Re-scan ``RESULTS_DIR`` and update the saved-results dropdown choices."""
    return gr.Dropdown(choices=list_saved_results())


def _download_dataframe(results_dict):
    """Write ``results_dict["df"]`` to a CSV file and return its path (``None`` if absent)."""
    if not results_dict or "df" not in results_dict:
        return None

    csv_file_path = "vibecheck_results.csv"
    results_dict["df"].to_csv(csv_file_path, index=False)
    return csv_file_path


def create_vibecheck_results_ui():
    """
    Builds and returns a Gradio Blocks interface for viewing VibeCheck results
    """
    # Custom CSS for a more professional look
    custom_css = (
        " .gradio-container { max-width: 1200px !important; margin: auto !important; }"
        " .header-text { text-align: center; margin-bottom: 1rem; }"
        " .header-emoji { font-size: 2.5rem; margin-right: 0.5rem; }"
        " .tab-content { padding: 1rem; }"
        " .footer { text-align: center; margin-top: 2rem; color: #666; font-size: 0.9rem; }"
        " "
    )

    with gr.Blocks(theme='davehornik/Tealy', css=custom_css) as demo:
        # NOTE(review): the CSS defines .header-text / .footer classes, but this
        # header and the footer below contain no markup -- confirm whether
        # HTML tags were lost from these strings.
        gr.HTML(
            "\n\nLLM VibeChecks\n\n"
            "Visualize the Vibes of model pairs using VibeCheck system (more to come)\n\n"
        )

        with gr.Row():
            # Left Column - Selection
            with gr.Column(scale=1):
                # Dropdown to select saved results
                saved_results_dropdown = gr.Dropdown(
                    label="Select a dataset/model pair",
                    choices=list_saved_results(),
                    interactive=True,
                    allow_custom_value=False,
                )
                refresh_btn = gr.Button("Refresh List", variant="secondary")
                load_btn = gr.Button("Load Selected Results", variant="primary")
                # Status message
                load_status = gr.Markdown()

        # Output area
        with gr.Row():
            load_output_md = gr.Markdown()

        with gr.Row():
            with gr.Column():
                gr.Markdown("### Vibe Score Heuristics\n\nVibes which have low scores are not used in prediction models")
                load_output_plot1 = gr.Plot()

        with gr.Row():
            with gr.Column():
                gr.Markdown("### Vibe Prediction Coefficients")
                load_output_plot2 = gr.Plot()

        # Bottom Row - Vibe Examples
        with gr.Row():
            with gr.Column():
                gr.Markdown("## Explore Examples")
                load_vibe_dropdown = gr.Dropdown(
                    label="Select a vibe to see examples and their scores",
                    choices=[],
                    interactive=True,
                    multiselect=False,
                    value=None,
                    allow_custom_value=False,
                )
                load_example_dropdown = gr.Dropdown(
                    label="Select an example to view",
                    choices=[],
                    interactive=True,
                    multiselect=False,
                    value=None,
                    allow_custom_value=False,
                )
                load_examples_md = gr.Markdown()

        # Store loaded results state
        load_results_state = gr.State()

        refresh_btn.click(
            fn=_refresh_saved_results,
            inputs=[],
            outputs=[saved_results_dropdown],
        )

        load_btn.click(
            fn=_on_load_results,
            inputs=[saved_results_dropdown],
            outputs=[
                load_status,
                load_output_md,
                load_output_plot1,
                load_output_plot2,
                load_results_state,
                load_vibe_dropdown,
            ],
        )

        # Show examples for selected vibe
        load_vibe_dropdown.change(
            fn=show_examples,
            inputs=[load_vibe_dropdown, load_results_state],
            outputs=[load_example_dropdown],
        )

        # Render the example picked in the example dropdown
        load_example_dropdown.change(
            fn=display_selected_example,
            inputs=[load_example_dropdown, load_vibe_dropdown, load_results_state],
            outputs=[load_examples_md],
        )

        # Download button and file component
        with gr.Row():
            download_btn = gr.Button("Download DataFrame as CSV", variant="secondary")
            download_file = gr.File(label="Download CSV", interactive=False)

        download_btn.click(
            fn=_download_dataframe,
            inputs=[load_results_state],
            outputs=[download_file],
        )

        # Footer
        gr.HTML(" ")

    return demo


app = create_vibecheck_results_ui()

if __name__ == "__main__":
    # Only start the server when run as a script, so importing this module
    # (for example to reuse ``app``) does not launch it.
    app.launch()