# lisabdunlap's picture
# Update app.py
# b4bfeb8 verified
import os
import gradio as gr
import pandas as pd
def list_saved_results():
    """Return dropdown choices for the pickled result files in the working directory.

    Every entry of the current directory whose name ends with ``.pkl`` is
    included, ordered by modification time with the newest first.

    Returns:
        A list of ``(label, value)`` tuples in which both elements are the
        file name, or an empty list when the directory does not exist.
    """
    results_dir = os.path.join(".")
    if not os.path.exists(results_dir):
        return []

    def _modified_at(name):
        # Modification timestamp of one directory entry; used as the sort key.
        return os.path.getmtime(os.path.join(results_dir, name))

    pickle_names = sorted(
        (entry for entry in os.listdir(results_dir) if entry.endswith(".pkl")),
        key=_modified_at,
        reverse=True,
    )
    # The same string serves as both the displayed label and the value.
    return [(name, name) for name in pickle_names]
def load_saved_result(filename):
    """Load a pickled result file from the working directory.

    Args:
        filename: Bare name of the file to load (no directory components).

    Returns:
        The unpickled object, or ``None`` when ``filename`` is empty, contains
        a path component, or does not name an existing regular file.
    """
    import pickle

    # Only bare file names are accepted so a crafted value such as
    # "../other/file.pkl" cannot reach files outside the results directory.
    if not filename or os.path.basename(filename) != filename:
        return None

    results_dir = os.path.join(".")
    filepath = os.path.join(results_dir, filename)
    # isfile (rather than exists) also rejects directories, which open()
    # would otherwise fail on.
    if not os.path.isfile(filepath):
        return None

    # SECURITY NOTE: pickle.load can execute arbitrary code while
    # deserializing. Only files that come from a trusted source should be
    # placed in the results directory.
    with open(filepath, "rb") as f:
        return pickle.load(f)
def show_examples(selected_vibe, results_dict, model_list_str=None):
    """Build the example-picker dropdown for the selected vibe.

    Args:
        selected_vibe: Value matched against the ``"vibe"`` column.
        results_dict: Loaded results; must contain a ``"vibe_df"`` DataFrame
            with ``"vibe"``, ``"question"`` and ``"score"`` columns.
        model_list_str: Unused; kept so existing callers that pass it keep
            working.

    Returns:
        A ``gr.Dropdown`` whose choices are labels of the form
        ``"Example N: <prompt preview> (Score: x.xxx)"`` with the first one
        selected, or an empty dropdown when there is nothing to show.
    """
    # Every exit path returns a Dropdown: this function feeds a single
    # dropdown output, so a (message, list) tuple is not a usable value.
    if not selected_vibe or not results_dict or "vibe_df" not in results_dict:
        return gr.Dropdown(choices=[], value=None)

    vibe_df = results_dict["vibe_df"]

    # Use the same filter as display_selected_example (matching vibe,
    # non-zero score, first 10 rows) so that "Example N" refers to the same
    # row in both functions.
    mask = (vibe_df["vibe"] == selected_vibe) & (vibe_df["score"].abs() > 0.0)
    rows = vibe_df[mask].head(10).to_dict(orient="records")

    example_choices = []
    for i, row in enumerate(rows, 1):
        question = row["question"]
        # Short preview of the prompt (first 100 characters).
        preview = question[:100] + "..." if len(question) > 100 else question
        # Format: "Example 1: This is the prompt... (Score: 0.123)"
        example_choices.append(f"Example {i}: {preview} (Score: {row['score']:.3f})")

    return gr.Dropdown(
        choices=example_choices,
        value=example_choices[0] if example_choices else None,
    )
def display_selected_example(example_idx, selected_vibe, results_dict, model_list_str=None):
    """Render one example of the selected vibe as Markdown.

    Args:
        example_idx: Dropdown label of the form ``"Example N: ..."``; ``N`` is
            the 1-based position within the filtered example list.
        selected_vibe: Value matched against the ``"vibe"`` column.
        results_dict: Loaded results; must contain a ``"vibe_df"`` DataFrame
            and may contain a ``"models"`` list.
        model_list_str: Optional comma-separated model names that override
            ``results_dict["models"]``.

    Returns:
        A Markdown string with the prompt, each model's output (when the row
        has a column named after the model) and the score, or a short message
        when the selection cannot be resolved.
    """
    if example_idx is None or not selected_vibe:
        return "Please select an example to view."
    if not results_dict or "vibe_df" not in results_dict:
        return "No vibe data available."

    vibe_df = results_dict["vibe_df"]

    # Model names come from the explicit list when given, otherwise from the
    # results dictionary.
    if model_list_str:
        models = [m.strip() for m in model_list_str.split(",") if m.strip()]
    else:
        models = results_dict.get("models")
        if models is None:
            # Covers both a missing key and a key stored as None, which would
            # otherwise fail when iterated below.
            models = ["Model A", "Model B"]

    # Rows for the selected vibe with a non-zero score, limited to 10.
    subset = vibe_df[
        (vibe_df["vibe"] == selected_vibe) & (vibe_df["score"].abs() > 0.0)
    ].head(10).to_dict(orient="records")

    try:
        # Parse the example number from the label (e.g., "Example 3: ...").
        example_num = int(example_idx.split(":")[0].replace("Example ", "")) - 1
    except (ValueError, AttributeError, IndexError):
        return f"Invalid example selection: '{example_idx}'"
    if example_num < 0 or example_num >= len(subset):
        return "Example index out of range."

    row = subset[example_num]
    md = f"**Prompt:** {row['question']}\n\n"

    # Show each model's output when the row has a column for that model.
    for model in models:
        if model in row:
            md += f"**{model} Output:**\n{row[model]}\n________________________\n"

    md += f"**Score:** {row['score']:.3f} "

    # Zero scores were filtered out above, so the else branch means negative.
    if len(models) >= 2:
        if row["score"] > 0:
            md += f"({models[0]} exhibits this vibe more)\n\n"
        else:
            md += f"({models[1]} exhibits this vibe more)\n\n"
    return md
def _format_metric(value):
    """Format a metric to three decimals, or "N/A" when it is missing or non-numeric."""
    try:
        return f"{value:.3f}"
    except (TypeError, ValueError):
        return "N/A"


def _format_accuracy_lines(metrics):
    """Return the Markdown bullet lines (accuracy ± std, optional 95% CI) for one metrics dict."""
    text = (
        f"- Accuracy: {_format_metric(metrics.get('accuracy'))}"
        f" ± {_format_metric(metrics.get('acc_std'))}\n"
    )
    if "acc_ci" in metrics:
        text += f"- 95% CI: [{metrics['acc_ci'][0]:.3f}, {metrics['acc_ci'][1]:.3f}]\n"
    return text


def create_vibecheck_results_ui():
    """
    Builds and returns a Gradio Blocks interface for viewing VibeCheck results.

    The interface lets the user pick a saved result file, load it, view a
    summary and two plots, browse examples per vibe, and download the stored
    DataFrame as CSV.

    Returns:
        The constructed ``gr.Blocks`` object (not yet launched).
    """
    # Custom CSS for a more professional look
    custom_css = """
.gradio-container {
max-width: 1200px !important;
margin: auto !important;
}
.header-text {
text-align: center;
margin-bottom: 1rem;
}
.header-emoji {
font-size: 2.5rem;
margin-right: 0.5rem;
}
.tab-content {
padding: 1rem;
}
.footer {
text-align: center;
margin-top: 2rem;
color: #666;
font-size: 0.9rem;
}
"""
    with gr.Blocks(theme='davehornik/Tealy', css=custom_css) as demo:
        gr.HTML("""
<div class="header-text">
<h1><span class="header-emoji">✨</span> LLM VibeChecks <span class="header-emoji">✨</span></h1>
<p>Visualize the Vibes of model pairs using <a href="https://arxiv.org/abs/2410.12851" target="_blank" style="text-decoration: underline; color: inherit;">VibeCheck system</a> (more to come)</p>
</div>
""")
        with gr.Row():
            # Left Column - Selection
            with gr.Column(scale=1):
                # Dropdown to select saved results
                saved_results_dropdown = gr.Dropdown(
                    label="Select a dataset/model pair",
                    choices=list_saved_results(),
                    interactive=True,
                    allow_custom_value=False,
                )
                # Refresh button
                refresh_btn = gr.Button("Refresh List", variant="secondary")
                # Load button
                load_btn = gr.Button("Load Selected Results", variant="primary")
                # Status message
                load_status = gr.Markdown()

        # Output area
        with gr.Row():
            load_output_md = gr.Markdown()
        with gr.Row():
            with gr.Column():
                gr.Markdown("### Vibe Score Heuristics\n\nVibes which have low scores are not used in prediction models")
                load_output_plot1 = gr.Plot()
        with gr.Row():
            with gr.Column():
                gr.Markdown("### Vibe Prediction Coefficients")
                load_output_plot2 = gr.Plot()

        # Bottom Row - Vibe Examples
        with gr.Row():
            with gr.Column():
                gr.Markdown("## Explore Examples")
                load_vibe_dropdown = gr.Dropdown(
                    label="Select a vibe to see examples and their scores",
                    choices=[],
                    interactive=True,
                    multiselect=False,
                    value=None,
                    allow_custom_value=False,
                )
                load_example_dropdown = gr.Dropdown(
                    label="Select an example to view",
                    choices=[],
                    interactive=True,
                    multiselect=False,
                    value=None,
                    allow_custom_value=False,
                )
                load_examples_md = gr.Markdown()

        # Store loaded results state
        load_results_state = gr.State()

        # Function to refresh the list of saved results
        def refresh_saved_results():
            return gr.Dropdown(choices=list_saved_results())

        refresh_btn.click(
            fn=refresh_saved_results,
            inputs=[],
            outputs=[saved_results_dropdown],
        )

        # Function to load selected results
        def on_load_results(selected_result):
            def failure(message):
                # One value per component in the load_btn.click outputs list
                # (six in total): status text, cleared summary/plots/state,
                # and an emptied vibe dropdown.
                return message, None, None, None, None, gr.Dropdown(choices=[], value=None)

            if not selected_result:
                return failure("Please select a saved analysis to load.")

            # Get the actual filename from the display name
            filename = selected_result[1] if isinstance(selected_result, tuple) else selected_result

            # Load the results
            results = load_saved_result(filename)
            if not results or "vibe_df" not in results:
                return failure("Failed to load results or invalid result file.")

            accuracy_info = ""
            metrics = results.get("vibe_prediction_metrics")
            if metrics:
                # Format identity metrics
                if "identity_metrics" in metrics:
                    accuracy_info += "\n\n### Model Identity Prediction\n"
                    accuracy_info += _format_accuracy_lines(metrics["identity_metrics"])
                # Format preference metrics
                if "preference_metrics" in metrics:
                    accuracy_info += "\n### Preference Prediction\n"
                    accuracy_info += _format_accuracy_lines(metrics["preference_metrics"])

            # Add model names to the summary; tolerate a missing "models" entry.
            model_names = results.get("models") or []
            compared = " vs. ".join(str(name) for name in model_names) if model_names else "N/A"
            model_info = f"\n\n**Models Compared: {compared}**\n\n"

            # Unique vibes in order of first appearance (summary) and sorted
            # (dropdown); computed once instead of on every use.
            vibes = results["vibe_df"]["vibe"].unique().tolist()
            sorted_vibes = sorted(vibes)

            # Summarize
            summary_text = (
                "### Loaded VibeCheck Results\n"
                + model_info
                + f"**Found {len(vibes)} vibe(s)**\n"
                + "\n".join(f"- {vibe}" for vibe in vibes)
                + accuracy_info
            )

            return (
                "Results loaded successfully!",
                summary_text,
                # .get leaves the plot empty instead of raising when a saved
                # file lacks the figure.
                results.get("model_vibe_scores_plot"),
                results.get("score_dist_plot"),
                results,
                gr.Dropdown(
                    choices=sorted_vibes,
                    value=sorted_vibes[0] if sorted_vibes else None,
                ),
            )

        load_btn.click(
            fn=on_load_results,
            inputs=[saved_results_dropdown],
            outputs=[
                load_status,
                load_output_md,
                load_output_plot1,
                load_output_plot2,
                load_results_state,
                load_vibe_dropdown,
            ],
        )

        # Show examples for selected vibe
        load_vibe_dropdown.change(
            fn=show_examples,
            inputs=[load_vibe_dropdown, load_results_state],
            outputs=[load_example_dropdown],
        )

        # Add handler for example dropdown
        load_example_dropdown.change(
            fn=display_selected_example,
            inputs=[load_example_dropdown, load_vibe_dropdown, load_results_state],
            outputs=[load_examples_md],
        )

        # Add a download button and file component
        with gr.Row():
            download_btn = gr.Button("Download DataFrame as CSV", variant="secondary")
            download_file = gr.File(label="Download CSV", interactive=False)

        # Function to convert DataFrame to CSV and return for download
        def download_dataframe(results_dict):
            if not results_dict or "df" not in results_dict:
                return None
            # Convert DataFrame to CSV
            # NOTE(review): the path is fixed and relative to the working
            # directory, so each call overwrites the previous export.
            csv_file_path = "vibecheck_results.csv"
            results_dict["df"].to_csv(csv_file_path, index=False)
            return csv_file_path

        # Connect the download button to the function
        download_btn.click(
            fn=download_dataframe,
            inputs=[load_results_state],
            outputs=[download_file],
        )

        # Add a footer
        gr.HTML("""
<div class="footer">
<p>© just some folks who like vibes</p>
</div>
""")
    return demo
# Build the interface at module level and launch it.
app = create_vibecheck_results_ui()
app.launch()