import gradio as gr
import time
import json
import pandas as pd
from typing import List, Dict, Any

class BenchmarkSystem:
    def __init__(self):
        # Results are stored keyed by model name so multiple runs can be compared
        self.results = {}

    def run_benchmark(self,
                      model_name: str,
                      test_cases: List[str],
                      system_prompt: str = "") -> Dict[str, Any]:
        """
        Run benchmark tests and measure performance metrics.
        """
        results = {
            "model_name": model_name,
            "timestamp": time.strftime("%Y-%m-%d %H:%M:%S"),
            "total_tokens": 0,
            "total_time": 0,
            "responses": [],
            "metrics": {}
        }

        start_time = time.time()

        # Simulate processing each test case
        for test in test_cases:
            # Here you would add actual model inference;
            # this is a placeholder for demonstration.
            time.sleep(0.5)  # Simulate processing time
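            # A minimal sketch of what a real call might look like, assuming a
            # hypothetical `client` object with a `generate()` method (not part
            # of this app):
            #   output_text = client.generate(test, system_prompt=system_prompt)
            #   token_count = len(output_text.split())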
results["responses"].append({ | |
"input": test, | |
"output": f"Sample response for: {test}", | |
"tokens": len(test.split()), | |
"time": 0.5 | |
}) | |
results["total_time"] = time.time() - start_time | |
results["total_tokens"] = sum(r["tokens"] for r in results["responses"]) | |
# Calculate aggregate metrics | |
results["metrics"] = { | |
"avg_response_time": results["total_time"] / len(test_cases), | |
"avg_tokens_per_response": results["total_tokens"] / len(test_cases) | |
} | |
self.results[model_name] = results | |
return results | |
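
# A minimal standalone usage sketch (the Gradio interface below wires this up;
# nothing here is required to run the app):
#   bench = BenchmarkSystem()
#   summary = bench.run_benchmark("demo-model", ["What is 2 + 2?"])
#   print(summary["metrics"])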

def format_results(results: Dict[str, Any]) -> str:
    """Format benchmark results for display."""
    output = f"Model: {results['model_name']}\n"
    output += f"Timestamp: {results['timestamp']}\n"
    output += f"Total Time: {results['total_time']:.2f}s\n"
    output += f"Total Tokens: {results['total_tokens']}\n\n"
    output += "Metrics:\n"
    for metric, value in results["metrics"].items():
        output += f"- {metric}: {value:.2f}\n"
    return output

def save_results(results: Dict[str, Any], filename: str = "benchmark_results.json"):
    """Save benchmark results to a JSON file."""
    with open(filename, "w") as f:
        json.dump(results, f, indent=2)
    return f"Results saved to {filename}"

def run_benchmark_interface(model_name: str,
                            test_cases: str,
                            system_prompt: str) -> tuple[str, pd.DataFrame]:
    """
    Gradio interface function for running benchmarks.
    """
    benchmark = BenchmarkSystem()

    # Parse test cases (one per line, ignoring blank lines)
    test_cases_list = [t.strip() for t in test_cases.split("\n") if t.strip()]

    # Run benchmark
    results = benchmark.run_benchmark(
        model_name=model_name,
        test_cases=test_cases_list,
        system_prompt=system_prompt
    )

    # Create DataFrame with per-response details
    df = pd.DataFrame([
        {
            "Input": r["input"],
            "Output": r["output"],
            "Tokens": r["tokens"],
            "Time (s)": r["time"]
        }
        for r in results["responses"]
    ])

    # Save results to disk alongside the app
    save_results(results)

    return format_results(results), df

# Create Gradio interface
with gr.Blocks(title="Model Benchmark Suite") as demo:
    gr.Markdown("# Model Benchmark Suite")
    gr.Markdown("Test and compare model performance across different scenarios")

    with gr.Row():
        with gr.Column():
            model_name = gr.Textbox(
                label="Model Name",
                placeholder="Enter model name or identifier"
            )
            system_prompt = gr.Textbox(
                label="System Prompt (Optional)",
                placeholder="Enter system prompt if applicable",
                lines=2
            )
            test_cases = gr.Textbox(
                label="Test Cases",
                placeholder="Enter test cases (one per line)",
                lines=5
            )
            run_button = gr.Button("Run Benchmark")

        with gr.Column():
            results_text = gr.Textbox(
                label="Benchmark Results",
                lines=10,
                interactive=False  # output-only; gr.Textbox has no `readonly` argument
            )
            results_table = gr.DataFrame(
                label="Detailed Results",
                headers=["Input", "Output", "Tokens", "Time (s)"]
            )

    run_button.click(
        fn=run_benchmark_interface,
        inputs=[model_name, test_cases, system_prompt],
        outputs=[results_text, results_table]
    )

if __name__ == "__main__":
    demo.launch()
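    # To expose a temporary public link, Gradio also supports
    # demo.launch(share=True) in place of the bare launch() call above.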