# (Page-capture residue, not part of the program: "Spaces: Running / Running")
import os
import pickle

import gradio as gr
import pandas as pd
def list_saved_results():
    """List the pickled result files available for loading.

    Scans the current working directory (``"."``) for regular files whose
    name ends in ``.pkl``.

    Returns:
        A list of ``(label, value)`` tuples ordered newest-first by
        modification time; both elements are the bare file name. An empty
        list is returned when the directory does not exist.
    """
    results_dir = os.path.join(".")
    if not os.path.exists(results_dir):
        return []

    dated_files = []
    for filename in os.listdir(results_dir):
        if not filename.endswith(".pkl"):
            continue
        path = os.path.join(results_dir, filename)
        # A directory that merely ends in ".pkl" cannot be loaded; skip it.
        if not os.path.isfile(path):
            continue
        try:
            dated_files.append((os.path.getmtime(path), filename))
        except OSError:
            # The file disappeared between listing and stat; leave it out
            # rather than failing the whole listing.
            continue

    # Sort on the timestamp only so ties keep the directory listing order.
    dated_files.sort(key=lambda item: item[0], reverse=True)
    return [(filename, filename) for _, filename in dated_files]
def load_saved_result(filename):
    """Load a pickled results object.

    Args:
        filename: Name of the file to load; it is joined onto the current
            working directory (``"."``).

    Returns:
        The unpickled object, or ``None`` when no file name is given, the
        path is not an existing regular file, or the file content cannot be
        unpickled.
    """
    if not filename:
        return None

    results_dir = os.path.join(".")
    filepath = os.path.join(results_dir, filename)
    if not os.path.isfile(filepath):
        return None

    # NOTE(security): unpickling can execute arbitrary code. Only files that
    # come from a trusted source should ever be placed in this directory.
    try:
        with open(filepath, "rb") as f:
            return pickle.load(f)
    except (pickle.UnpicklingError, EOFError, AttributeError, ImportError, IndexError):
        # Corrupt, truncated or incompatible file: report "nothing loaded"
        # so the caller can show its failure message instead of crashing.
        return None
def show_examples(selected_vibe, results_dict, model_list_str=None):
    """Build the example dropdown for the selected vibe.

    Args:
        selected_vibe: Vibe whose examples should be listed.
        results_dict: Loaded results; its ``"vibe_df"`` DataFrame is read for
            the ``"vibe"``, ``"question"`` and ``"score"`` columns.
        model_list_str: Not used here; kept so existing callers that pass it
            keep working.

    Returns:
        A ``gr.Dropdown`` whose choices are labels of the form
        ``"Example <n>: <prompt preview> (Score: <score>)"`` with the first
        one pre-selected. The dropdown is empty when no vibe is selected, no
        results are loaded, or no row matches.
    """
    # This handler is wired to a single output, so every path must return
    # exactly one dropdown update.
    if not selected_vibe or not results_dict or "vibe_df" not in results_dict:
        return gr.Dropdown(choices=[], value=None)

    vibe_df = results_dict["vibe_df"]
    # Use the same filter (non-zero score, first 10 rows) as
    # display_selected_example, which looks rows up by the position encoded
    # in the label; a different filter would make the labels point at the
    # wrong rows.
    subset = vibe_df[
        (vibe_df["vibe"] == selected_vibe) & (vibe_df["score"].abs() > 0.0)
    ].head(10)

    example_choices = []
    for i, (question, score) in enumerate(zip(subset["question"], subset["score"]), 1):
        question = str(question)
        # Keep labels short: show at most the first 100 characters of the prompt.
        preview = question[:100] + "..." if len(question) > 100 else question
        example_choices.append(f"Example {i}: {preview} (Score: {score:.3f})")

    return gr.Dropdown(
        choices=example_choices,
        value=example_choices[0] if example_choices else None,
    )
def display_selected_example(example_idx, selected_vibe, results_dict, model_list_str=None):
    """Render one example of the selected vibe as Markdown.

    Args:
        example_idx: Dropdown label of the form ``"Example <n>: ..."``; only
            the number before the first colon is used.
        selected_vibe: Vibe the example belongs to.
        results_dict: Loaded results. Its ``"vibe_df"`` DataFrame is read for
            the ``"vibe"``, ``"question"`` and ``"score"`` columns plus one
            column per model name; ``"models"`` optionally lists the names.
        model_list_str: Optional comma-separated model names that override
            ``results_dict["models"]``.

    Returns:
        A Markdown string with the prompt, each available model output and
        the score, or a short message when the selection cannot be shown.
    """
    if example_idx is None or not selected_vibe:
        return "Please select an example to view."
    if not results_dict or "vibe_df" not in results_dict:
        return "No vibe data available."

    if model_list_str:
        models = [name.strip() for name in model_list_str.split(",") if name.strip()]
    else:
        models = results_dict.get("models")
        # A stored ``None`` would otherwise break the loop over models below.
        if models is None:
            models = ["Model A", "Model B"]

    vibe_df = results_dict["vibe_df"]
    # Non-zero scores only, first 10 rows: positions are taken from this list.
    matches = vibe_df[(vibe_df["vibe"] == selected_vibe) & (vibe_df["score"].abs() > 0.0)]
    records = matches.head(10).to_dict(orient="records")

    # Labels are 1-based ("Example 3: ..."); convert to a 0-based position.
    try:
        position = int(example_idx.split(":")[0].replace("Example ", "")) - 1
    except (ValueError, AttributeError):
        return f"Invalid example selection: '{example_idx}'"
    if not 0 <= position < len(records):
        return "Example index out of range."

    row = records[position]
    parts = [f"**Prompt:** {row['question']}\n\n"]
    parts.extend(
        f"**{model} Output:**\n{row[model]}\n________________________\n"
        for model in models
        if model in row
    )
    parts.append(f"**Score:** {row['score']:.3f} ")

    # A positive score is attributed to the first model, otherwise the second.
    if len(models) >= 2:
        leader = models[0] if row["score"] > 0 else models[1]
        parts.append(f"({leader} exhibits this vibe more)\n\n")
    return "".join(parts)
def _format_metric(value):
    """Format a metric to three decimals, or ``"N/A"`` when it is not a number."""
    try:
        return f"{value:.3f}"
    except (TypeError, ValueError):
        return "N/A"


def _format_prediction_metrics(heading, metrics):
    """Render one accuracy section (accuracy, std and optional 95% CI) as Markdown."""
    section = heading
    section += (
        f"- Accuracy: {_format_metric(metrics.get('accuracy'))}"
        f" ± {_format_metric(metrics.get('acc_std'))}\n"
    )
    if "acc_ci" in metrics:
        low, high = metrics["acc_ci"][0], metrics["acc_ci"][1]
        section += f"- 95% CI: [{_format_metric(low)}, {_format_metric(high)}]\n"
    return section


def create_vibecheck_results_ui():
    """Build the Gradio Blocks interface for viewing VibeCheck results.

    The interface lets the user pick a saved ``.pkl`` result file, load it,
    view a text summary and two plots, browse examples per vibe, and download
    the stored DataFrame as CSV.

    Returns:
        The constructed ``gr.Blocks`` object (not yet launched).
    """
    # Custom CSS for a more professional look
    custom_css = """
    .gradio-container {
        max-width: 1200px !important;
        margin: auto !important;
    }
    .header-text {
        text-align: center;
        margin-bottom: 1rem;
    }
    .header-emoji {
        font-size: 2.5rem;
        margin-right: 0.5rem;
    }
    .tab-content {
        padding: 1rem;
    }
    .footer {
        text-align: center;
        margin-top: 2rem;
        color: #666;
        font-size: 0.9rem;
    }
    """

    # NOTE(review): the nesting of the layout containers below was
    # reconstructed from source whose indentation was lost - confirm it
    # matches the intended layout.
    with gr.Blocks(theme='davehornik/Tealy', css=custom_css) as demo:
        gr.HTML("""
        <div class="header-text">
        <h1><span class="header-emoji">✨</span> LLM VibeChecks <span class="header-emoji">✨</span></h1>
        <p>Visualize the Vibes of model pairs using <a href="https://arxiv.org/abs/2410.12851" target="_blank" style="text-decoration: underline; color: inherit;">VibeCheck system</a> (more to come)</p>
        </div>
        """)

        with gr.Row():
            # Left Column - Selection
            with gr.Column(scale=1):
                # Dropdown to select saved results
                saved_results_dropdown = gr.Dropdown(
                    label="Select a dataset/model pair",
                    choices=list_saved_results(),
                    interactive=True,
                    allow_custom_value=False,
                )
                # Refresh button
                refresh_btn = gr.Button("Refresh List", variant="secondary")
                # Load button
                load_btn = gr.Button("Load Selected Results", variant="primary")
                # Status message
                load_status = gr.Markdown()

        # Output area
        with gr.Row():
            load_output_md = gr.Markdown()

        with gr.Row():
            with gr.Column():
                gr.Markdown("### Vibe Score Heuristics\n\nVibes which have low scores are not used in prediction models")
                load_output_plot1 = gr.Plot()

        with gr.Row():
            with gr.Column():
                gr.Markdown("### Vibe Prediction Coefficients")
                load_output_plot2 = gr.Plot()

        # Bottom Row - Vibe Examples
        with gr.Row():
            with gr.Column():
                gr.Markdown("## Explore Examples")
                load_vibe_dropdown = gr.Dropdown(
                    label="Select a vibe to see examples and their scores",
                    choices=[],
                    interactive=True,
                    multiselect=False,
                    value=None,
                    allow_custom_value=False,
                )
                load_example_dropdown = gr.Dropdown(
                    label="Select an example to view",
                    choices=[],
                    interactive=True,
                    multiselect=False,
                    value=None,
                    allow_custom_value=False,
                )
                load_examples_md = gr.Markdown()

        # Store loaded results state
        load_results_state = gr.State()

        def refresh_saved_results():
            """Re-scan the results directory and update the file dropdown."""
            return gr.Dropdown(choices=list_saved_results())

        refresh_btn.click(
            fn=refresh_saved_results,
            inputs=[],
            outputs=[saved_results_dropdown],
        )

        def load_failure(message):
            """Return a value for each of the six load outputs on failure."""
            # Status text, then cleared summary/plots/state, then an empty
            # vibe dropdown. The count must match the ``outputs`` list of
            # ``load_btn.click`` below.
            return message, None, None, None, None, gr.Dropdown(choices=[], value=None)

        def on_load_results(selected_result):
            """Load the selected file and populate summary, plots, state and vibes."""
            if not selected_result:
                return load_failure("Please select a saved analysis to load.")

            # Get the actual filename from the display name
            filename = selected_result[1] if isinstance(selected_result, tuple) else selected_result

            results = load_saved_result(filename)
            if not results or "vibe_df" not in results:
                return load_failure("Failed to load results or invalid result file.")

            # Fall back to the same placeholder names the example viewers use
            # so a file without model names still loads.
            model_names = results.get("models") or ["Model A", "Model B"]
            vibes = results["vibe_df"]["vibe"].unique().tolist()

            accuracy_info = ""
            metrics = results.get("vibe_prediction_metrics")
            if metrics:
                if "identity_metrics" in metrics:
                    accuracy_info += _format_prediction_metrics(
                        "\n\n### Model Identity Prediction\n", metrics["identity_metrics"]
                    )
                if "preference_metrics" in metrics:
                    accuracy_info += _format_prediction_metrics(
                        "\n### Preference Prediction\n", metrics["preference_metrics"]
                    )

            model_info = f"\n\n**Models Compared: {' vs. '.join(str(name) for name in model_names)}**\n\n"
            summary_text = (
                "### Loaded VibeCheck Results\n"
                + model_info
                + f"**Found {len(vibes)} vibe(s)**\n"
                + "\n".join(f"- {vibe}" for vibe in vibes)
                + accuracy_info
            )

            sorted_vibes = sorted(vibes)
            return (
                "Results loaded successfully!",
                summary_text,
                # A missing plot leaves its panel empty instead of raising.
                results.get("model_vibe_scores_plot"),
                results.get("score_dist_plot"),
                results,
                gr.Dropdown(
                    choices=sorted_vibes,
                    value=sorted_vibes[0] if sorted_vibes else None,
                ),
            )

        load_btn.click(
            fn=on_load_results,
            inputs=[saved_results_dropdown],
            outputs=[
                load_status,
                load_output_md,
                load_output_plot1,
                load_output_plot2,
                load_results_state,
                load_vibe_dropdown,
            ],
        )

        # Show examples for selected vibe
        load_vibe_dropdown.change(
            fn=show_examples,
            inputs=[load_vibe_dropdown, load_results_state],
            outputs=[load_example_dropdown],
        )

        # Add handler for example dropdown
        load_example_dropdown.change(
            fn=display_selected_example,
            inputs=[load_example_dropdown, load_vibe_dropdown, load_results_state],
            outputs=[load_examples_md],
        )

        # Add a download button and file component
        with gr.Row():
            download_btn = gr.Button("Download DataFrame as CSV", variant="secondary")
            download_file = gr.File(label="Download CSV", interactive=False)

        def download_dataframe(results_dict):
            """Write ``results_dict["df"]`` to a CSV file and return its path."""
            if not results_dict or "df" not in results_dict:
                return None
            csv_file_path = "vibecheck_results.csv"
            results_dict["df"].to_csv(csv_file_path, index=False)
            return csv_file_path

        # Connect the download button to the function
        download_btn.click(
            fn=download_dataframe,
            inputs=[load_results_state],
            outputs=[download_file],
        )

        # Add a footer
        gr.HTML("""
        <div class="footer">
        <p>© just some folks who like vibes</p>
        </div>
        """)

    return demo
# Build the interface at module level so ``app`` stays available as a module
# attribute for anything that imports this file.
app = create_vibecheck_results_ui()

if __name__ == "__main__":
    # Start the web server only when run as a script, not on import.
    app.launch()