Spaces:

lisabdunlap
/

VibeCheck-Examples

Running

App Files Files Community

VibeCheck-Examples / app.py

lisabdunlap

Update app.py

b4bfeb8 verified 3 days ago

raw

history blame contribute delete

13 kB

	import os
	import gradio as gr
	import pandas as pd

	def list_saved_results(results_dir="."):
	    """List the saved result files available for loading, newest first.

	    Scans ``results_dir`` (the current working directory by default) for
	    regular files whose name ends in ``.pkl``.

	    Args:
	        results_dir: Directory to scan for ``.pkl`` files.

	    Returns:
	        A list of ``(label, value)`` tuples, both equal to the file name,
	        ordered by modification time (newest first). The list is empty when
	        ``results_dir`` is not an existing directory.
	    """
	    if not os.path.isdir(results_dir):
	        return []

	    dated_names = []
	    for name in os.listdir(results_dir):
	        path = os.path.join(results_dir, name)
	        # Skip directories that merely happen to be named "*.pkl".
	        if not name.endswith(".pkl") or not os.path.isfile(path):
	            continue
	        try:
	            dated_names.append((os.path.getmtime(path), name))
	        except OSError:
	            # The file disappeared between listing and stat; ignore it.
	            continue

	    # Sort on the timestamp only so ties keep their directory-listing order.
	    dated_names.sort(key=lambda item: item[0], reverse=True)
	    return [(name, name) for _, name in dated_names]

	def load_saved_result(filename, results_dir="."):
	    """Load a pickled results object from ``results_dir``.

	    Args:
	        filename: Name of the result file, joined onto ``results_dir``.
	        results_dir: Directory the file lives in (current working directory
	            by default).

	    Returns:
	        The unpickled object, or ``None`` when ``filename`` is empty, the
	        path is not an existing regular file, or the file is empty,
	        truncated, or not a valid pickle.
	    """
	    import pickle

	    if not filename:
	        return None

	    filepath = os.path.join(results_dir, filename)
	    # isfile (rather than exists) also rejects directories, which open()
	    # would otherwise fail on.
	    if not os.path.isfile(filepath):
	        return None

	    # SECURITY: pickle.load can execute arbitrary code embedded in the file.
	    # Only load result files that come from a trusted source.
	    try:
	        with open(filepath, "rb") as f:
	            return pickle.load(f)
	    except (pickle.UnpicklingError, EOFError):
	        # Corrupt or truncated file: report "nothing loaded" to the caller.
	        return None

	def show_examples(selected_vibe, results_dict, model_list_str=None):
	    """Build the example dropdown for the selected vibe.

	    Rows are filtered with the same condition ``display_selected_example``
	    uses (matching vibe and a non-zero score, first 10 rows), so that
	    "Example N" in the dropdown refers to the row that function will display.

	    Args:
	        selected_vibe: Vibe chosen in the vibe dropdown.
	        results_dict: Loaded results; read for its ``"vibe_df"`` entry, whose
	            ``"vibe"``, ``"question"`` and ``"score"`` columns are used.
	        model_list_str: Unused here; kept so existing callers keep working.

	    Returns:
	        A ``gr.Dropdown`` update. Its choices are labels of the form
	        ``"Example 1: <prompt preview> (Score: 0.123)"`` with the first one
	        selected, or an empty dropdown when there is nothing to show.
	    """
	    # This handler feeds a single Dropdown output, so every path must return
	    # a Dropdown update rather than a (message, list) tuple.
	    if not selected_vibe or not results_dict or "vibe_df" not in results_dict:
	        return gr.Dropdown(choices=[], value=None)

	    vibe_df = results_dict["vibe_df"]
	    rows = vibe_df[
	        (vibe_df["vibe"] == selected_vibe) & (vibe_df["score"].abs() > 0.0)
	    ].head(10).to_dict(orient="records")

	    example_choices = []
	    for i, row in enumerate(rows, 1):
	        question = row["question"]
	        # Short preview of the prompt (first 100 chars).
	        if len(question) > 100:
	            preview = question[:100] + "..."
	        else:
	            preview = question
	        example_choices.append(
	            f"Example {i}: {preview} (Score: {row['score']:.3f})"
	        )

	    return gr.Dropdown(
	        choices=example_choices,
	        value=example_choices[0] if example_choices else None,
	    )

	def display_selected_example(example_idx, selected_vibe, results_dict, model_list_str=None):
	    """Render the example picked in the example dropdown as Markdown text.

	    Args:
	        example_idx: Dropdown label such as ``"Example 3: ..."``; the number
	            before the first colon selects the row (1-based).
	        selected_vibe: Vibe whose examples are being browsed.
	        results_dict: Loaded results; read for ``"vibe_df"`` and ``"models"``.
	        model_list_str: Optional comma-separated model names that override
	            the ``"models"`` entry of ``results_dict``.

	    Returns:
	        The Markdown for the selected example (prompt, each model's output
	        when the row has a column for it, the score, and which model
	        exhibits the vibe more), or a short message when the selection or
	        data is missing or invalid.
	    """
	    if example_idx is None or not selected_vibe:
	        return "Please select an example to view."
	    if not results_dict or "vibe_df" not in results_dict:
	        return "No vibe data available."

	    frame = results_dict["vibe_df"]

	    # Explicit model names win over the ones stored with the results.
	    if model_list_str:
	        models = [name.strip() for name in model_list_str.split(",") if name.strip()]
	    else:
	        models = results_dict.get("models", ["Model A", "Model B"])

	    # First 10 rows for this vibe that have a non-zero score.
	    matches_vibe = frame["vibe"] == selected_vibe
	    has_signal = frame["score"].abs() > 0.0
	    records = frame[matches_vibe & has_signal].head(10).to_dict(orient="records")

	    # "Example 3: ..." -> zero-based position 2.
	    try:
	        position = int(example_idx.split(":")[0].replace("Example ", "")) - 1
	    except (ValueError, AttributeError, IndexError):
	        return f"Invalid example selection: '{example_idx}'"
	    if not 0 <= position < len(records):
	        return "Example index out of range."

	    record = records[position]

	    pieces = [f"Prompt: {record['question']}\n\n"]
	    pieces.extend(
	        f"{model} Output:\n{record[model]}\n________________________\n"
	        for model in models
	        if model in record
	    )
	    pieces.append(f"Score: {record['score']:.3f} ")

	    # A positive score is attributed to the first model, otherwise the second.
	    if len(models) >= 2:
	        leader = models[0] if record["score"] > 0 else models[1]
	        pieces.append(f"({leader} exhibits this vibe more)\n\n")

	    return "".join(pieces)

	def create_vibecheck_results_ui():
	    """Build the Gradio Blocks interface for viewing saved VibeCheck results.

	    The page lets the user pick a saved result file, load it, see a text
	    summary and two plots, browse per-vibe examples, and export the stored
	    DataFrame as CSV.

	    Returns:
	        The constructed ``gr.Blocks`` app (not yet launched).
	    """

	    # Custom CSS for a more professional look
	    custom_css = """
	.gradio-container {
	max-width: 1200px !important;
	margin: auto !important;
	}
	.header-text {
	text-align: center;
	margin-bottom: 1rem;
	}
	.header-emoji {
	font-size: 2.5rem;
	margin-right: 0.5rem;
	}
	.tab-content {
	padding: 1rem;
	}
	.footer {
	text-align: center;
	margin-top: 2rem;
	color: #666;
	font-size: 0.9rem;
	}
	"""

	    def format_number(value):
	        """Format a metric to three decimals, or "N/A" when it is not numeric."""
	        try:
	            return f"{value:.3f}"
	        except (TypeError, ValueError):
	            return "N/A"

	    def format_metric_lines(metric_values):
	        """Return the accuracy (and optional 95% CI) bullet lines for one metric dict."""
	        text = (
	            f"- Accuracy: {format_number(metric_values.get('accuracy'))}"
	            f" ± {format_number(metric_values.get('acc_std'))}\n"
	        )
	        if "acc_ci" in metric_values:
	            low, high = metric_values["acc_ci"][0], metric_values["acc_ci"][1]
	            text += f"- 95% CI: [{format_number(low)}, {format_number(high)}]\n"
	        return text

	    # NOTE(review): the nesting of the rows/columns below was reconstructed
	    # from the statement order; confirm it against the intended page layout.
	    with gr.Blocks(theme='davehornik/Tealy', css=custom_css) as demo:
	        gr.HTML("""
	<div class="header-text">
	<h1><span class="header-emoji">✨</span> LLM VibeChecks <span class="header-emoji">✨</span></h1>
	<p>Visualize the Vibes of model pairs using <a href="https://arxiv.org/abs/2410.12851" target="_blank" style="text-decoration: underline; color: inherit;">VibeCheck system</a> (more to come)</p>
	</div>
	""")

	        with gr.Row():
	            # Left Column - Selection
	            with gr.Column(scale=1):
	                # Dropdown to select saved results
	                saved_results_dropdown = gr.Dropdown(
	                    label="Select a dataset/model pair",
	                    choices=list_saved_results(),
	                    interactive=True,
	                    allow_custom_value=False,
	                )

	                # Refresh button
	                refresh_btn = gr.Button("Refresh List", variant="secondary")

	                # Load button
	                load_btn = gr.Button("Load Selected Results", variant="primary")

	                # Status message
	                load_status = gr.Markdown()

	        # Output area
	        with gr.Row():
	            load_output_md = gr.Markdown()

	        with gr.Row():
	            with gr.Column():
	                gr.Markdown("### Vibe Score Heuristics\n\nVibes which have low scores are not used in prediction models")
	                load_output_plot1 = gr.Plot()

	        with gr.Row():
	            with gr.Column():
	                gr.Markdown("### Vibe Prediction Coefficients")
	                load_output_plot2 = gr.Plot()

	        # Bottom Row - Vibe Examples
	        with gr.Row():
	            with gr.Column():
	                gr.Markdown("## Explore Examples")
	                load_vibe_dropdown = gr.Dropdown(
	                    label="Select a vibe to see examples and their scores",
	                    choices=[],
	                    interactive=True,
	                    multiselect=False,
	                    value=None,
	                    allow_custom_value=False,
	                )
	                load_example_dropdown = gr.Dropdown(
	                    label="Select an example to view",
	                    choices=[],
	                    interactive=True,
	                    multiselect=False,
	                    value=None,
	                    allow_custom_value=False,
	                )
	                load_examples_md = gr.Markdown()

	        # Store loaded results state
	        load_results_state = gr.State()

	        # Function to refresh the list of saved results
	        def refresh_saved_results():
	            return gr.Dropdown(choices=list_saved_results())

	        refresh_btn.click(
	            fn=refresh_saved_results,
	            inputs=[],
	            outputs=[saved_results_dropdown],
	        )

	        # Function to load selected results
	        def on_load_results(selected_result):
	            """Load the chosen result file and produce values for all six outputs."""

	            def failure(message):
	                # The click handler is wired to six outputs, so failure paths
	                # must also return six values (the last one clears the vibes).
	                return (
	                    message,
	                    None,
	                    None,
	                    None,
	                    None,
	                    gr.Dropdown(choices=[], value=None),
	                )

	            if not selected_result:
	                return failure("Please select a saved analysis to load.")

	            # Get the actual filename from the display name
	            filename = selected_result[1] if isinstance(selected_result, tuple) else selected_result

	            # Load the results
	            results = load_saved_result(filename)

	            if not results or "vibe_df" not in results:
	                return failure("Failed to load results or invalid result file.")

	            # Same fallback names the example viewers use when none are stored.
	            model_names = results.get("models") or ["Model A", "Model B"]

	            # Computed once; reused for the summary and the vibe dropdown.
	            vibes = results["vibe_df"]["vibe"].unique().tolist()
	            sorted_vibes = sorted(vibes)

	            accuracy_info = ""
	            metrics = results.get("vibe_prediction_metrics") or {}

	            # Format identity metrics
	            if "identity_metrics" in metrics:
	                accuracy_info += "\n\n### Model Identity Prediction\n"
	                accuracy_info += format_metric_lines(metrics["identity_metrics"])

	            # Format preference metrics
	            if "preference_metrics" in metrics:
	                accuracy_info += "\n### Preference Prediction\n"
	                accuracy_info += format_metric_lines(metrics["preference_metrics"])

	            # Add model names to the summary
	            model_info = f"\n\nModels Compared: {' vs. '.join(map(str, model_names))}\n\n"

	            # Summarize
	            summary_text = (
	                "### Loaded VibeCheck Results\n"
	                + model_info
	                + f"Found {len(vibes)} vibe(s)\n"
	                + "\n".join(f"- {vibe}" for vibe in vibes)
	                + accuracy_info
	            )

	            return (
	                "Results loaded successfully!",
	                summary_text,
	                # .get so a result file without a stored plot leaves the
	                # plot empty instead of raising KeyError.
	                results.get("model_vibe_scores_plot"),
	                results.get("score_dist_plot"),
	                results,
	                gr.Dropdown(
	                    choices=sorted_vibes,
	                    value=sorted_vibes[0] if sorted_vibes else None,
	                ),
	            )

	        load_btn.click(
	            fn=on_load_results,
	            inputs=[saved_results_dropdown],
	            outputs=[
	                load_status,
	                load_output_md,
	                load_output_plot1,
	                load_output_plot2,
	                load_results_state,
	                load_vibe_dropdown,
	            ],
	        )

	        # Show examples for selected vibe
	        load_vibe_dropdown.change(
	            fn=show_examples,
	            inputs=[load_vibe_dropdown, load_results_state],
	            outputs=[load_example_dropdown],
	        )

	        # Add handler for example dropdown
	        load_example_dropdown.change(
	            fn=display_selected_example,
	            inputs=[load_example_dropdown, load_vibe_dropdown, load_results_state],
	            outputs=[load_examples_md],
	        )

	        # Add a download button and file component
	        with gr.Row():
	            download_btn = gr.Button("Download DataFrame as CSV", variant="secondary")
	            download_file = gr.File(label="Download CSV", interactive=False)

	        # Function to convert DataFrame to CSV and return for download
	        def download_dataframe(results_dict):
	            if not results_dict or "df" not in results_dict:
	                return None

	            # NOTE(review): every session writes to this same path in the
	            # working directory; confirm that is acceptable for shared use.
	            csv_file_path = "vibecheck_results.csv"
	            results_dict["df"].to_csv(csv_file_path, index=False)
	            return csv_file_path

	        # Connect the download button to the function
	        download_btn.click(
	            fn=download_dataframe,
	            inputs=[load_results_state],
	            outputs=[download_file],
	        )

	        # Add a footer
	        gr.HTML("""
	<div class="footer">
	<p>© just some folks who like vibes</p>
	</div>
	""")

	    return demo

	# Build the interface at module level so `app` is available to importers.
	app = create_vibecheck_results_ui()

	# Only start the server when this file is run as a script, so importing the
	# module does not launch it as a side effect.
	if __name__ == "__main__":
	    app.launch()