import gradio as gr
import time
import json
import pandas as pd
from typing import List, Dict, Any
class BenchmarkSystem:
    def __init__(self):
        self.results = {}

    def run_benchmark(self,
                      model_name: str,
                      test_cases: List[str],
                      system_prompt: str = "") -> Dict[str, Any]:
        """
        Run benchmark tests and measure performance metrics.

        Note: system_prompt is accepted for future use; the simulated
        inference below does not consume it.
        """
        results = {
            "model_name": model_name,
            "timestamp": time.strftime("%Y-%m-%d %H:%M:%S"),
            "total_tokens": 0,
            "total_time": 0,
            "responses": [],
            "metrics": {}
        }

        start_time = time.time()

        # Simulate processing test cases
        for test in test_cases:
            # Here you would add actual model inference;
            # this is a placeholder for demonstration.
            time.sleep(0.5)  # Simulate processing time
            results["responses"].append({
                "input": test,
                "output": f"Sample response for: {test}",
                "tokens": len(test.split()),
                "time": 0.5
            })

        results["total_time"] = time.time() - start_time
        results["total_tokens"] = sum(r["tokens"] for r in results["responses"])

        # Calculate aggregate metrics (guard against an empty test-case list)
        num_cases = max(len(test_cases), 1)
        results["metrics"] = {
            "avg_response_time": results["total_time"] / num_cases,
            "avg_tokens_per_response": results["total_tokens"] / num_cases
        }

        self.results[model_name] = results
        return results
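
# ---------------------------------------------------------------------------
# Sketch (assumption): run_benchmark above only *simulates* inference. A real
# integration could time an arbitrary callable instead. `generate_fn` is a
# hypothetical prompt -> completion-text function used purely for
# illustration; it is not defined anywhere in this app.
# ---------------------------------------------------------------------------
def time_single_case(generate_fn, prompt: str) -> Dict[str, Any]:
    """Run one test case through a callable and record the same fields
    that run_benchmark stores under results["responses"]."""
    case_start = time.time()
    output = generate_fn(prompt)
    return {
        "input": prompt,
        "output": output,
        "tokens": len(output.split()),  # crude whitespace token count
        "time": time.time() - case_start
    }
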
def format_results(results: Dict[str, Any]) -> str:
    """Format benchmark results for display"""
    output = f"Model: {results['model_name']}\n"
    output += f"Timestamp: {results['timestamp']}\n"
    output += f"Total Time: {results['total_time']:.2f}s\n"
    output += f"Total Tokens: {results['total_tokens']}\n\n"
    output += "Metrics:\n"
    for metric, value in results["metrics"].items():
        output += f"- {metric}: {value:.2f}\n"
    return output
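
# Example of the summary string produced by format_results (values shown are
# illustrative only, not real measurements):
#
#   Model: demo-model
#   Timestamp: 2024-01-01 12:00:00
#   Total Time: 1.00s
#   Total Tokens: 8
#
#   Metrics:
#   - avg_response_time: 0.50
#   - avg_tokens_per_response: 4.00
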
def save_results(results: Dict[str, Any], filename: str = "benchmark_results.json"):
    """Save benchmark results to a file"""
    with open(filename, "w") as f:
        json.dump(results, f, indent=2)
    return f"Results saved to {filename}"
def run_benchmark_interface(model_name: str,
                            test_cases: str,
                            system_prompt: str) -> tuple[str, pd.DataFrame]:
    """
    Gradio interface function for running benchmarks
    """
    benchmark = BenchmarkSystem()

    # Parse test cases (assuming one per line)
    test_cases_list = [t.strip() for t in test_cases.split("\n") if t.strip()]

    # Run benchmark
    results = benchmark.run_benchmark(
        model_name=model_name,
        test_cases=test_cases_list,
        system_prompt=system_prompt
    )

    # Create DataFrame for response details
    df = pd.DataFrame([
        {
            "Input": r["input"],
            "Output": r["output"],
            "Tokens": r["tokens"],
            "Time (s)": r["time"]
        }
        for r in results["responses"]
    ])

    # Save results
    save_results(results)

    return format_results(results), df
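
# Example (illustrative): the interface function can also be called directly,
# without the UI. The prompts below are placeholders.
#
# summary, detail_df = run_benchmark_interface(
#     model_name="demo-model",
#     test_cases="What is the capital of France?\nSummarize the plot of Hamlet.",
#     system_prompt=""
# )
# print(summary)
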
# Create Gradio interface
with gr.Blocks(title="Model Benchmark Suite") as demo:
    gr.Markdown("# Model Benchmark Suite")
    gr.Markdown("Test and compare model performance across different scenarios")

    with gr.Row():
        with gr.Column():
            model_name = gr.Textbox(
                label="Model Name",
                placeholder="Enter model name or identifier"
            )
            system_prompt = gr.Textbox(
                label="System Prompt (Optional)",
                placeholder="Enter system prompt if applicable",
                lines=2
            )
            test_cases = gr.Textbox(
                label="Test Cases",
                placeholder="Enter test cases (one per line)",
                lines=5
            )
            run_button = gr.Button("Run Benchmark")

        with gr.Column():
            results_text = gr.Textbox(
                label="Benchmark Results",
                lines=10,
                interactive=False  # gr.Textbox has no "readonly" argument
            )
            results_table = gr.DataFrame(
                label="Detailed Results",
                headers=["Input", "Output", "Tokens", "Time (s)"]
            )

    run_button.click(
        fn=run_benchmark_interface,
        inputs=[model_name, test_cases, system_prompt],
        outputs=[results_text, results_table]
    )


if __name__ == "__main__":
    demo.launch()
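    # Optional (assumption about deployment needs): demo.launch(share=True)
    # creates a temporary public link, and server_name="0.0.0.0" exposes the
    # app on the local network.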