Spaces:
Sleeping
Sleeping
File size: 12,951 Bytes
cf4feda 4801d40 cf4feda 6dbefd6 cf4feda 4801d40 cf4feda a5f9d23 cf4feda 6dbefd6 cf4feda 96e1679 cf4feda b4bfeb8 cf4feda b4bfeb8 cf4feda 73f5057 cf4feda 73f5057 cf4feda 73f5057 cf4feda 73f5057 cf4feda a5f9d23 cf4feda 438b334 12521d7 cf4feda 6dbefd6 cf4feda 2abec57 |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 |
import os
import gradio as gr
import pandas as pd
def list_saved_results(results_dir="."):
    """List saved result files (``*.pkl``), newest first.

    Args:
        results_dir: Directory to scan. Defaults to the current working
            directory, which is where this app keeps its result files.

    Returns:
        A list of ``(label, value)`` tuples, with both elements set to the
        file name, ordered by modification time with the most recently
        modified file first. An empty list is returned when ``results_dir``
        is not a directory.
    """
    if not os.path.isdir(results_dir):
        return []

    dated_files = []
    with os.scandir(results_dir) as entries:
        for entry in entries:
            if not entry.name.endswith(".pkl"):
                continue
            try:
                # Only regular files can be loaded later; a directory that
                # happens to be named "*.pkl" must not be offered as a choice.
                if not entry.is_file():
                    continue
                dated_files.append((entry.stat().st_mtime, entry.name))
            except OSError:
                # The file vanished or became unreadable between listing and
                # stat; leave it out instead of failing the whole listing.
                continue

    # Sort by modification time (newest first)
    dated_files.sort(key=lambda item: item[0], reverse=True)
    return [(name, name) for _, name in dated_files]
def load_saved_result(filename, results_dir="."):
    """Load a pickled results object from disk.

    Args:
        filename: Name of the result file, resolved relative to
            ``results_dir``.
        results_dir: Directory containing the result files. Defaults to the
            current working directory.

    Returns:
        The unpickled object, or ``None`` when ``filename`` is empty, the
        path is not a regular file, or the file cannot be unpickled.
    """
    import logging
    import pickle

    if not filename:
        return None

    filepath = os.path.join(results_dir, filename)
    # isfile (not exists): a directory path would make open() raise.
    if not os.path.isfile(filepath):
        return None

    # SECURITY NOTE: pickle.load can execute arbitrary code embedded in the
    # file. Only load result files that were produced by a trusted process.
    try:
        with open(filepath, "rb") as f:
            return pickle.load(f)
    except (pickle.UnpicklingError, EOFError, AttributeError, ImportError, IndexError) as exc:
        # Corrupt, truncated, or incompatible pickle: report "no result"
        # rather than raising out of the UI callback.
        logging.getLogger(__name__).warning("Could not unpickle %s: %s", filepath, exc)
        return None
def show_examples(selected_vibe, results_dict, model_list_str=None):
    """Build the example-picker dropdown for the selected vibe.

    Up to ten rows of ``results_dict["vibe_df"]`` that belong to
    ``selected_vibe`` and have a non-zero score are turned into labels of
    the form ``"Example N: <prompt preview> (Score: x.xxx)"``. The same
    filter is used by ``display_selected_example``, which parses the
    ``"Example N"`` prefix back into a row position, so both functions must
    select the same rows in the same order.

    Args:
        selected_vibe: Vibe name chosen in the UI; falsy when nothing is
            selected.
        results_dict: Loaded results; must contain a ``"vibe_df"`` DataFrame
            with ``"vibe"``, ``"question"`` and ``"score"`` columns.
        model_list_str: Unused here; kept so existing callers that pass it
            keep working.

    Returns:
        A ``gr.Dropdown`` update. It carries the example labels with the
        first one preselected, or no choices and no value when there is
        nothing to show.
    """
    # This function feeds a single Dropdown output, so every exit path has
    # to return a Dropdown update (not a message/list tuple).
    if not selected_vibe or not results_dict or "vibe_df" not in results_dict:
        return gr.Dropdown(choices=[], value=None)

    vibe_df = results_dict["vibe_df"]

    # Filter for the selected vibe, skipping rows with a zero score.
    mask = (vibe_df["vibe"] == selected_vibe) & (vibe_df["score"].abs() > 0.0)
    subset = vibe_df[mask].head(10)

    example_choices = []
    for i, (question, score) in enumerate(zip(subset["question"], subset["score"]), 1):
        question = str(question)
        # Create a short preview of the prompt (first 100 chars)
        preview = question[:100] + "..." if len(question) > 100 else question
        # Format: "Example 1: This is the prompt... (Score: 0.123)"
        example_choices.append(f"Example {i}: {preview} (Score: {score:.3f})")

    return gr.Dropdown(
        choices=example_choices,
        value=example_choices[0] if example_choices else None,
    )
def display_selected_example(example_idx, selected_vibe, results_dict, model_list_str=None):
    """Render one example of the selected vibe as Markdown.

    Args:
        example_idx: Dropdown label of the form ``"Example N: ..."``; ``N``
            is the 1-based position among the first ten rows of the selected
            vibe that have a non-zero score.
        selected_vibe: Vibe name chosen in the UI.
        results_dict: Loaded results; must contain a ``"vibe_df"`` DataFrame
            with ``"vibe"``, ``"question"`` and ``"score"`` columns, and may
            contain a ``"models"`` list.
        model_list_str: Optional comma-separated model names that override
            ``results_dict["models"]``.

    Returns:
        A Markdown string with the prompt, each model's output (for models
        that are columns of the row) and the score, or a short message when
        the selection cannot be resolved.
    """
    if example_idx is None or not selected_vibe:
        return "Please select an example to view."
    if not results_dict or "vibe_df" not in results_dict:
        return "No vibe data available."

    # Extract model names from the results dictionary or use provided models.
    # `or` (rather than a .get default) also covers a stored value of None.
    if model_list_str:
        models = [m.strip() for m in model_list_str.split(",") if m.strip()]
    else:
        models = results_dict.get("models") or ["Model A", "Model B"]

    # Parse the example number from the string (e.g., "Example 3: ...")
    try:
        example_num = int(example_idx.split(":")[0].replace("Example ", "")) - 1
    except (ValueError, AttributeError, IndexError):
        return f"Invalid example selection: '{example_idx}'"

    # Filter for the selected vibe, skipping rows with a zero score.
    vibe_df = results_dict["vibe_df"]
    mask = (vibe_df["vibe"] == selected_vibe) & (vibe_df["score"].abs() > 0.0)
    subset = vibe_df[mask].head(10).to_dict(orient="records")

    if example_num < 0 or example_num >= len(subset):
        return "Example index out of range."

    # Get the selected example
    row = subset[example_num]
    parts = [f"**Prompt:** {row['question']}\n\n"]

    # Show each model's output when the row has a column named after it
    for model in models:
        if model in row:
            parts.append(f"**{model} Output:**\n{row[model]}\n________________________\n")

    parts.append(f"**Score:** {row['score']:.3f} ")

    # A positive score is attributed to the first model, otherwise to the second
    if len(models) >= 2:
        leader = models[0] if row["score"] > 0 else models[1]
        parts.append(f"({leader} exhibits this vibe more)\n\n")

    return "".join(parts)
def _format_metric(value):
    """Format a numeric metric with three decimals, or "N/A" when it is missing or non-numeric."""
    try:
        return f"{float(value):.3f}"
    except (TypeError, ValueError):
        return "N/A"


def _format_accuracy_section(heading, metrics):
    """Return `heading` followed by the accuracy line and, when present, the 95% CI line."""
    section = heading
    section += (
        f"- Accuracy: {_format_metric(metrics.get('accuracy'))}"
        f" ± {_format_metric(metrics.get('acc_std'))}\n"
    )
    if "acc_ci" in metrics:
        ci = metrics["acc_ci"]
        section += f"- 95% CI: [{_format_metric(ci[0])}, {_format_metric(ci[1])}]\n"
    return section


def create_vibecheck_results_ui():
    """Build and return a Gradio Blocks interface for viewing VibeCheck results.

    The interface lets the user pick a saved ``.pkl`` result, load it, view a
    Markdown summary and two plots stored in the result, browse example rows
    per vibe, and download the result's ``"df"`` DataFrame as CSV.

    Returns:
        The ``gr.Blocks`` app. It is not launched here.
    """
    # Custom CSS for a more professional look
    custom_css = """
.gradio-container {
max-width: 1200px !important;
margin: auto !important;
}
.header-text {
text-align: center;
margin-bottom: 1rem;
}
.header-emoji {
font-size: 2.5rem;
margin-right: 0.5rem;
}
.tab-content {
padding: 1rem;
}
.footer {
text-align: center;
margin-top: 2rem;
color: #666;
font-size: 0.9rem;
}
"""
    # NOTE(review): the Row/Column nesting below is reconstructed from the
    # inline comments; confirm it matches the intended page layout.
    with gr.Blocks(theme='davehornik/Tealy', css=custom_css) as demo:
        gr.HTML("""
<div class="header-text">
<h1><span class="header-emoji">✨</span> LLM VibeChecks <span class="header-emoji">✨</span></h1>
<p>Visualize the Vibes of model pairs using <a href="https://arxiv.org/abs/2410.12851" target="_blank" style="text-decoration: underline; color: inherit;">VibeCheck system</a> (more to come)</p>
</div>
""")
        with gr.Row():
            # Left Column - Selection
            with gr.Column(scale=1):
                # Dropdown to select saved results
                saved_results_dropdown = gr.Dropdown(
                    label="Select a dataset/model pair",
                    choices=list_saved_results(),
                    interactive=True,
                    allow_custom_value=False,
                )
                # Refresh button
                refresh_btn = gr.Button("Refresh List", variant="secondary")
                # Load button
                load_btn = gr.Button("Load Selected Results", variant="primary")
                # Status message
                load_status = gr.Markdown()

        # Output area
        with gr.Row():
            load_output_md = gr.Markdown()
        with gr.Row():
            with gr.Column():
                gr.Markdown("### Vibe Score Heuristics\n\nVibes which have low scores are not used in prediction models")
                load_output_plot1 = gr.Plot()
        with gr.Row():
            with gr.Column():
                gr.Markdown("### Vibe Prediction Coefficients")
                load_output_plot2 = gr.Plot()

        # Bottom Row - Vibe Examples
        with gr.Row():
            with gr.Column():
                gr.Markdown("## Explore Examples")
                load_vibe_dropdown = gr.Dropdown(
                    label="Select a vibe to see examples and their scores",
                    choices=[],
                    interactive=True,
                    multiselect=False,
                    value=None,
                    allow_custom_value=False,
                )
                load_example_dropdown = gr.Dropdown(
                    label="Select an example to view",
                    choices=[],
                    interactive=True,
                    multiselect=False,
                    value=None,
                    allow_custom_value=False,
                )
                load_examples_md = gr.Markdown()

        # Store loaded results state
        load_results_state = gr.State()

        # Function to refresh the list of saved results
        def refresh_saved_results():
            return gr.Dropdown(choices=list_saved_results())

        refresh_btn.click(
            fn=refresh_saved_results,
            inputs=[],
            outputs=[saved_results_dropdown],
        )

        # Function to load selected results
        def on_load_results(selected_result):
            def failure(message):
                # load_btn.click has six outputs, so a failure must also
                # return six values: status text plus cleared components.
                return (message, None, None, None, None, gr.Dropdown(choices=[], value=None))

            if not selected_result:
                return failure("Please select a saved analysis to load.")

            # Get the actual filename from the display name
            if isinstance(selected_result, (tuple, list)):
                filename = selected_result[1]
            else:
                filename = selected_result

            # Load the results
            results = load_saved_result(filename)
            if not results or "vibe_df" not in results:
                return failure("Failed to load results or invalid result file.")

            # Same fallback names the example helpers use when "models" is absent
            model_names = results.get("models") or ["Model A", "Model B"]

            accuracy_info = ""
            metrics = results.get("vibe_prediction_metrics")
            if metrics:
                # Format identity metrics
                if "identity_metrics" in metrics:
                    accuracy_info += _format_accuracy_section(
                        "\n\n### Model Identity Prediction\n", metrics["identity_metrics"]
                    )
                # Format preference metrics
                if "preference_metrics" in metrics:
                    accuracy_info += _format_accuracy_section(
                        "\n### Preference Prediction\n", metrics["preference_metrics"]
                    )

            # Compute the vibe list once; it is used for the summary and the dropdown
            vibes = results["vibe_df"]["vibe"].unique().tolist()
            sorted_vibes = sorted(vibes)

            # Add model names to the summary
            model_info = f"\n\n**Models Compared: {' vs. '.join(str(m) for m in model_names)}**\n\n"

            # Summarize
            summary_text = (
                "### Loaded VibeCheck Results\n"
                + model_info
                + f"**Found {len(vibes)} vibe(s)**\n"
                + "\n".join(f"- {vibe}" for vibe in vibes)
                + accuracy_info
            )

            # NOTE(review): "model_vibe_scores_plot" is shown under the
            # "Vibe Score Heuristics" heading and "score_dist_plot" under
            # "Vibe Prediction Coefficients" — confirm this pairing is intended.
            return (
                "Results loaded successfully!",
                summary_text,
                results.get("model_vibe_scores_plot"),
                results.get("score_dist_plot"),
                results,
                gr.Dropdown(
                    choices=sorted_vibes,
                    value=sorted_vibes[0] if sorted_vibes else None,
                ),
            )

        load_btn.click(
            fn=on_load_results,
            inputs=[saved_results_dropdown],
            outputs=[
                load_status,
                load_output_md,
                load_output_plot1,
                load_output_plot2,
                load_results_state,
                load_vibe_dropdown,
            ],
        )

        # Show examples for selected vibe
        load_vibe_dropdown.change(
            fn=show_examples,
            inputs=[load_vibe_dropdown, load_results_state],
            outputs=[load_example_dropdown],
        )

        # Add handler for example dropdown
        load_example_dropdown.change(
            fn=display_selected_example,
            inputs=[load_example_dropdown, load_vibe_dropdown, load_results_state],
            outputs=[load_examples_md],
        )

        # Add a download button and file component
        with gr.Row():
            download_btn = gr.Button("Download DataFrame as CSV", variant="secondary")
            download_file = gr.File(label="Download CSV", interactive=False)

        # Function to convert DataFrame to CSV and return for download
        def download_dataframe(results_dict):
            if not results_dict or "df" not in results_dict:
                return None
            # Convert DataFrame to CSV
            # NOTE(review): every call writes the same path in the working
            # directory, so concurrent sessions overwrite each other's file.
            csv_file_path = "vibecheck_results.csv"
            results_dict["df"].to_csv(csv_file_path, index=False)
            return csv_file_path

        # Connect the download button to the function
        download_btn.click(
            fn=download_dataframe,
            inputs=[load_results_state],
            outputs=[download_file],
        )

        # Add a footer
        gr.HTML("""
<div class="footer">
<p>© just some folks who like vibes</p>
</div>
""")
    return demo
# Build the interface at module level and start the Gradio server.
app = create_vibecheck_results_ui()
app.launch()