import gradio as gr
import time
import json
import pandas as pd
from typing import List, Dict, Any
class BenchmarkSystem:
    def __init__(self):
        self.results = {}

    def run_benchmark(self,
                      model_name: str,
                      test_cases: List[str],
                      system_prompt: str = "") -> Dict[str, Any]:
        """
        Run benchmark tests and measure performance metrics.

        Note: system_prompt is accepted for future use; the simulated
        inference below does not consume it.
        """
        results = {
            "model_name": model_name,
            "timestamp": time.strftime("%Y-%m-%d %H:%M:%S"),
            "total_tokens": 0,
            "total_time": 0,
            "responses": [],
            "metrics": {}
        }

        start_time = time.time()

        # Simulate processing test cases
        for test in test_cases:
            # Here you would add actual model inference;
            # this is a placeholder for demonstration.
            time.sleep(0.5)  # Simulate processing time
            results["responses"].append({
                "input": test,
                "output": f"Sample response for: {test}",
                "tokens": len(test.split()),
                "time": 0.5
            })

        results["total_time"] = time.time() - start_time
        results["total_tokens"] = sum(r["tokens"] for r in results["responses"])

        # Calculate aggregate metrics (guard against an empty test-case list)
        num_cases = max(len(test_cases), 1)
        results["metrics"] = {
            "avg_response_time": results["total_time"] / num_cases,
            "avg_tokens_per_response": results["total_tokens"] / num_cases
        }

        self.results[model_name] = results
        return results
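
# ---------------------------------------------------------------------------
# Sketch (assumption): run_benchmark above only *simulates* inference. A real
# integration could time an arbitrary callable instead. `generate_fn` is a
# hypothetical prompt -> completion-text function used purely for
# illustration; it is not defined anywhere in this app.
# ---------------------------------------------------------------------------
def time_single_case(generate_fn, prompt: str) -> Dict[str, Any]:
    """Run one test case through a callable and record the same fields
    that run_benchmark stores under results["responses"]."""
    case_start = time.time()
    output = generate_fn(prompt)
    return {
        "input": prompt,
        "output": output,
        "tokens": len(output.split()),  # crude whitespace token count
        "time": time.time() - case_start
    }
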
def format_results(results: Dict[str, Any]) -> str:
    """Format benchmark results for display"""
    output = f"Model: {results['model_name']}\n"
    output += f"Timestamp: {results['timestamp']}\n"
    output += f"Total Time: {results['total_time']:.2f}s\n"
    output += f"Total Tokens: {results['total_tokens']}\n\n"
    output += "Metrics:\n"
    for metric, value in results["metrics"].items():
        output += f"- {metric}: {value:.2f}\n"
    return output
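
# Example of the summary string produced by format_results (values shown are
# illustrative only, not real measurements):
#
#   Model: demo-model
#   Timestamp: 2024-01-01 12:00:00
#   Total Time: 1.00s
#   Total Tokens: 8
#
#   Metrics:
#   - avg_response_time: 0.50
#   - avg_tokens_per_response: 4.00
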
def save_results(results: Dict[str, Any], filename: str = "benchmark_results.json"):
    """Save benchmark results to a file"""
    with open(filename, "w") as f:
        json.dump(results, f, indent=2)
    return f"Results saved to {filename}"
def run_benchmark_interface(model_name: str,
                            test_cases: str,
                            system_prompt: str) -> tuple[str, pd.DataFrame]:
    """
    Gradio interface function for running benchmarks
    """
    benchmark = BenchmarkSystem()

    # Parse test cases (assuming one per line)
    test_cases_list = [t.strip() for t in test_cases.split("\n") if t.strip()]

    # Run benchmark
    results = benchmark.run_benchmark(
        model_name=model_name,
        test_cases=test_cases_list,
        system_prompt=system_prompt
    )

    # Create DataFrame for response details
    df = pd.DataFrame([
        {
            "Input": r["input"],
            "Output": r["output"],
            "Tokens": r["tokens"],
            "Time (s)": r["time"]
        }
        for r in results["responses"]
    ])

    # Save results
    save_results(results)

    return format_results(results), df
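
# Example (illustrative): the interface function can also be called directly,
# without the UI. The prompts below are placeholders.
#
# summary, detail_df = run_benchmark_interface(
#     model_name="demo-model",
#     test_cases="What is the capital of France?\nSummarize the plot of Hamlet.",
#     system_prompt=""
# )
# print(summary)
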
# Create Gradio interface
with gr.Blocks(title="Model Benchmark Suite") as demo:
    gr.Markdown("# Model Benchmark Suite")
    gr.Markdown("Test and compare model performance across different scenarios")

    with gr.Row():
        with gr.Column():
            model_name = gr.Textbox(
                label="Model Name",
                placeholder="Enter model name or identifier"
            )
            system_prompt = gr.Textbox(
                label="System Prompt (Optional)",
                placeholder="Enter system prompt if applicable",
                lines=2
            )
            test_cases = gr.Textbox(
                label="Test Cases",
                placeholder="Enter test cases (one per line)",
                lines=5
            )
            run_button = gr.Button("Run Benchmark")

        with gr.Column():
            results_text = gr.Textbox(
                label="Benchmark Results",
                lines=10,
                interactive=False  # gr.Textbox has no "readonly" argument
            )
            results_table = gr.DataFrame(
                label="Detailed Results",
                headers=["Input", "Output", "Tokens", "Time (s)"]
            )

    run_button.click(
        fn=run_benchmark_interface,
        inputs=[model_name, test_cases, system_prompt],
        outputs=[results_text, results_table]
    )


if __name__ == "__main__":
    demo.launch()
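    # Optional (assumption about deployment needs): demo.launch(share=True)
    # creates a temporary public link, and server_name="0.0.0.0" exposes the
    # app on the local network.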