Spaces:

MrSimple07
/

RuSimulBench_arena

Sleeping

App Files Files Community

MrSimple07 committed on Jan 4

Commit

f2a274f

verified ·

1 Parent(s): c0cc27f

Create app.py

Browse files

Files changed (1) hide show

app.py +147 -0

app.py ADDED Viewed

	@@ -0,0 +1,147 @@

+import gradio as gr
+import time
+import json
+import pandas as pd
+from typing import List, Dict, Any
+class BenchmarkSystem:
+    """Run placeholder benchmarks and remember the latest result per model.
+
+    No real model inference happens here: each test case receives a canned
+    response after an artificial delay.
+    """
+
+    def __init__(self, simulated_latency: float = 0.5):
+        """Create a benchmark runner with an empty result registry.
+
+        Args:
+            simulated_latency: Seconds to sleep per test case to mimic
+                inference time. The default keeps the original 0.5 s delay.
+        """
+        # Maps model name -> results dict of that model's most recent run.
+        self.results = {}
+        self.simulated_latency = simulated_latency
+
+    def run_benchmark(self,
+                     model_name: str,
+                     test_cases: List[str],
+                     system_prompt: str = "") -> Dict[str, Any]:
+        """Run the simulated benchmark over ``test_cases``.
+
+        Args:
+            model_name: Label stored in the results and used as the key in
+                ``self.results``.
+            test_cases: Input strings to process; may be empty.
+            system_prompt: Accepted for interface compatibility; the
+                placeholder implementation does not use it.
+
+        Returns:
+            A dict with ``model_name``, ``timestamp``, ``total_tokens``,
+            ``total_time``, ``responses`` (one entry per test case) and
+            ``metrics`` (per-case averages, 0.0 when there were no cases).
+        """
+        results = {
+            "model_name": model_name,
+            "timestamp": time.strftime("%Y-%m-%d %H:%M:%S"),
+            "total_tokens": 0,
+            "total_time": 0,
+            "responses": [],
+            "metrics": {},
+        }
+        # perf_counter is monotonic, so the elapsed time cannot be skewed by
+        # system clock adjustments.
+        start_time = time.perf_counter()
+        for test in test_cases:
+            # Placeholder for real model inference.
+            if self.simulated_latency > 0:
+                time.sleep(self.simulated_latency)
+            results["responses"].append({
+                "input": test,
+                "output": f"Sample response for: {test}",
+                # Whitespace-separated word count stands in for a token count.
+                "tokens": len(test.split()),
+                "time": self.simulated_latency,
+            })
+        results["total_time"] = time.perf_counter() - start_time
+        results["total_tokens"] = sum(r["tokens"] for r in results["responses"])
+        # Average over the cases actually processed; an empty run reports
+        # zeros instead of raising ZeroDivisionError.
+        case_count = len(results["responses"])
+        results["metrics"] = {
+            "avg_response_time": (
+                results["total_time"] / case_count if case_count else 0.0
+            ),
+            "avg_tokens_per_response": (
+                results["total_tokens"] / case_count if case_count else 0.0
+            ),
+        }
+        self.results[model_name] = results
+        return results
+def format_results(results: Dict[str, Any]) -> str:
+    """Render a benchmark results dict as a plain-text summary.
+
+    The summary lists the model name, timestamp, total time and total
+    tokens, then one ``- name: value`` line per entry in
+    ``results["metrics"]``, each value shown with two decimals.
+    """
+    summary_lines = [
+        f"Model: {results['model_name']}",
+        f"Timestamp: {results['timestamp']}",
+        f"Total Time: {results['total_time']:.2f}s",
+        f"Total Tokens: {results['total_tokens']}",
+        "",  # blank separator line before the metrics section
+        "Metrics:",
+    ]
+    summary_lines.extend(
+        f"- {name}: {score:.2f}" for name, score in results["metrics"].items()
+    )
+    # Every line, including the last one, ends with a newline.
+    return "\n".join(summary_lines) + "\n"
+def save_results(results: Dict[str, Any], filename: str = "benchmark_results.json") -> str:
+    """Write benchmark results to ``filename`` as indented JSON.
+
+    The file is written as UTF-8 and non-ASCII characters are kept verbatim,
+    so test cases in Cyrillic or other non-Latin scripts stay readable on
+    disk instead of being stored as ``\\uXXXX`` escapes.
+
+    Args:
+        results: JSON-serialisable results dict.
+        filename: Destination path; an existing file is overwritten.
+
+    Returns:
+        A confirmation message naming the file that was written.
+    """
+    # Explicit encoding: the platform default may not be able to represent
+    # the unescaped characters.
+    with open(filename, "w", encoding="utf-8") as f:
+        json.dump(results, f, ensure_ascii=False, indent=2)
+    return f"Results saved to {filename}"
+def run_benchmark_interface(model_name: str,
+                          test_cases: str,
+                          system_prompt: str) -> tuple[str, pd.DataFrame]:
+    """Gradio callback: run a benchmark and return its summary and details.
+
+    Args:
+        model_name: Model label forwarded to the benchmark.
+        test_cases: Raw textbox content, one test case per line; blank
+            lines and surrounding whitespace are ignored.
+        system_prompt: Optional system prompt forwarded to the benchmark.
+
+    Returns:
+        A ``(summary_text, details)`` pair where ``details`` has one row per
+        test case. When no test case was entered, the summary is a short
+        hint and ``details`` is an empty table with the usual columns.
+
+    Side effects:
+        On a successful run the results are also written to disk through
+        ``save_results`` using its default filename.
+    """
+    columns = ["Input", "Output", "Tokens", "Time (s)"]
+    # Parse test cases (one per line), dropping blank lines.
+    test_cases_list = [
+        line.strip() for line in (test_cases or "").split("\n") if line.strip()
+    ]
+    # Nothing to run: report it instead of benchmarking an empty list and
+    # overwriting the saved results file with an empty run.
+    if not test_cases_list:
+        return (
+            "No test cases provided. Enter at least one test case (one per line).",
+            pd.DataFrame(columns=columns),
+        )
+    benchmark = BenchmarkSystem()
+    results = benchmark.run_benchmark(
+        model_name=model_name,
+        test_cases=test_cases_list,
+        system_prompt=system_prompt
+    )
+    # One row per response; the columns are pinned so the table layout does
+    # not depend on dict key order.
+    df = pd.DataFrame(
+        [
+            {
+                "Input": r["input"],
+                "Output": r["output"],
+                "Tokens": r["tokens"],
+                "Time (s)": r["time"]
+            }
+            for r in results["responses"]
+        ],
+        columns=columns,
+    )
+    save_results(results)
+    return format_results(results), df
+# Build the Gradio UI: inputs in the left column, results in the right one.
+with gr.Blocks(title="Model Benchmark Suite") as demo:
+    gr.Markdown("# Model Benchmark Suite")
+    gr.Markdown("Test and compare model performance across different scenarios")
+    with gr.Row():
+        with gr.Column():
+            model_name = gr.Textbox(
+                label="Model Name",
+                placeholder="Enter model name or identifier"
+            )
+            system_prompt = gr.Textbox(
+                label="System Prompt (Optional)",
+                placeholder="Enter system prompt if applicable",
+                lines=2
+            )
+            test_cases = gr.Textbox(
+                label="Test Cases",
+                placeholder="Enter test cases (one per line)",
+                lines=5
+            )
+            run_button = gr.Button("Run Benchmark")
+        with gr.Column():
+            # `interactive=False` makes the box display-only; Textbox has no
+            # `readonly` parameter.
+            results_text = gr.Textbox(
+                label="Benchmark Results",
+                lines=10,
+                interactive=False
+            )
+            results_table = gr.DataFrame(
+                label="Detailed Results",
+                headers=["Input", "Output", "Tokens", "Time (s)"]
+            )
+    # The input order must match the parameter order of
+    # run_benchmark_interface(model_name, test_cases, system_prompt).
+    run_button.click(
+        fn=run_benchmark_interface,
+        inputs=[model_name, test_cases, system_prompt],
+        outputs=[results_text, results_table]
+    )
+# Start the web server only when executed as a script, not on import.
+if __name__ == "__main__":
+    demo.launch()