MrSimple07 committed
Commit f2a274f · verified · 1 Parent(s): c0cc27f

Create app.py

Files changed (1):
  1. app.py +147 -0
app.py ADDED
@@ -0,0 +1,147 @@
+ import gradio as gr
+ import time
+ import json
+ import pandas as pd
+ from typing import List, Dict, Any
+
+ class BenchmarkSystem:
+     def __init__(self):
+         self.results = {}
+
+     def run_benchmark(self,
+                       model_name: str,
+                       test_cases: List[str],
+                       system_prompt: str = "") -> Dict[str, Any]:
+         """
+         Run benchmark tests and measure performance metrics
+         """
+         results = {
+             "model_name": model_name,
+             "timestamp": time.strftime("%Y-%m-%d %H:%M:%S"),
+             "total_tokens": 0,
+             "total_time": 0,
+             "responses": [],
+             "metrics": {}
+         }
+
+         start_time = time.time()
+
+         # Simulate processing test cases
+         for test in test_cases:
+             # Here you would add actual model inference
+             # This is a placeholder for demonstration
+             time.sleep(0.5)  # Simulate processing time
+             results["responses"].append({
+                 "input": test,
+                 "output": f"Sample response for: {test}",
+                 "tokens": len(test.split()),
+                 "time": 0.5
+             })
+
+         results["total_time"] = time.time() - start_time
+         results["total_tokens"] = sum(r["tokens"] for r in results["responses"])
+
+         # Calculate aggregate metrics (max() guards against an empty test set)
+         results["metrics"] = {
+             "avg_response_time": results["total_time"] / max(len(test_cases), 1),
+             "avg_tokens_per_response": results["total_tokens"] / max(len(test_cases), 1)
+         }
+
+         self.results[model_name] = results
+         return results
+
+ def format_results(results: Dict[str, Any]) -> str:
+     """Format benchmark results for display"""
+     output = f"Model: {results['model_name']}\n"
+     output += f"Timestamp: {results['timestamp']}\n"
+     output += f"Total Time: {results['total_time']:.2f}s\n"
+     output += f"Total Tokens: {results['total_tokens']}\n\n"
+
+     output += "Metrics:\n"
+     for metric, value in results["metrics"].items():
+         output += f"- {metric}: {value:.2f}\n"
+
+     return output
+
+ def save_results(results: Dict[str, Any], filename: str = "benchmark_results.json"):
+     """Save benchmark results to a file"""
+     with open(filename, "w") as f:
+         json.dump(results, f, indent=2)
+     return f"Results saved to {filename}"
+
+ def run_benchmark_interface(model_name: str,
+                             test_cases: str,
+                             system_prompt: str) -> tuple[str, pd.DataFrame]:
+     """
+     Gradio interface function for running benchmarks
+     """
+     benchmark = BenchmarkSystem()
+
+     # Parse test cases (assuming one per line)
+     test_cases_list = [t.strip() for t in test_cases.split("\n") if t.strip()]
+
+     # Run benchmark
+     results = benchmark.run_benchmark(
+         model_name=model_name,
+         test_cases=test_cases_list,
+         system_prompt=system_prompt
+     )
+
+     # Create DataFrame for response details
+     df = pd.DataFrame([
+         {
+             "Input": r["input"],
+             "Output": r["output"],
+             "Tokens": r["tokens"],
+             "Time (s)": r["time"]
+         }
+         for r in results["responses"]
+     ])
+
+     # Save results
+     save_results(results)
+
+     return format_results(results), df
+
+ # Create Gradio interface
+ with gr.Blocks(title="Model Benchmark Suite") as demo:
+     gr.Markdown("# Model Benchmark Suite")
+     gr.Markdown("Test and compare model performance across different scenarios")
+
+     with gr.Row():
+         with gr.Column():
+             model_name = gr.Textbox(
+                 label="Model Name",
+                 placeholder="Enter model name or identifier"
+             )
+             system_prompt = gr.Textbox(
+                 label="System Prompt (Optional)",
+                 placeholder="Enter system prompt if applicable",
+                 lines=2
+             )
+             test_cases = gr.Textbox(
+                 label="Test Cases",
+                 placeholder="Enter test cases (one per line)",
+                 lines=5
+             )
+             run_button = gr.Button("Run Benchmark")
+
+         with gr.Column():
+             results_text = gr.Textbox(
+                 label="Benchmark Results",
+                 lines=10,
+                 interactive=False  # gr.Textbox has no `readonly` argument
+             )
+             results_table = gr.DataFrame(
+                 label="Detailed Results",
+                 headers=["Input", "Output", "Tokens", "Time (s)"]
+             )
+
+     run_button.click(
+         fn=run_benchmark_interface,
+         inputs=[model_name, test_cases, system_prompt],
+         outputs=[results_text, results_table]
+     )
+
+ if __name__ == "__main__":
+     demo.launch()
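
Note on the placeholder loop: run_benchmark only simulates inference (it sleeps 0.5 s and echoes the input), and system_prompt is accepted but not used. Below is a minimal sketch of what a real inference step could look like, assuming the Hugging Face transformers library is installed and model_name refers to a text-generation checkpoint; the helper generate_response and its parameters are illustrative and not part of this commit.

    # Hypothetical replacement for the simulated step inside run_benchmark.
    # Assumes: pip install transformers torch; model_name is a text-generation model.
    import time
    from transformers import pipeline

    def generate_response(model_name: str, prompt: str, system_prompt: str = "") -> dict:
        """Run one real inference call and return the fields run_benchmark records."""
        generator = pipeline("text-generation", model=model_name)
        full_prompt = f"{system_prompt}\n{prompt}" if system_prompt else prompt

        start = time.time()
        output = generator(full_prompt, max_new_tokens=128)[0]["generated_text"]
        elapsed = time.time() - start

        return {
            "input": prompt,
            "output": output,
            # Count tokens with the model's tokenizer rather than len(text.split())
            "tokens": len(generator.tokenizer.encode(output)),
            "time": elapsed,
        }

Inside the loop, results["responses"].append(generate_response(model_name, test, system_prompt)) would replace the time.sleep(0.5) block; in practice the pipeline should be built once per benchmark run rather than per test case so the per-response timings stay meaningful.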