Add bountybench
Browse files
- app.py +3 -3
- gen_table.py +3 -3
- meta_data.py +6 -0
- results.json +23 -0
app.py
CHANGED
|
@@ -110,7 +110,7 @@ head_style) as demo:
|
|
| 110 |
)
|
| 111 |
s.headers = s.check_box['essential'] + s.checkbox_group.value
|
| 112 |
|
| 113 |
-
if benchmark not in ["SWE-bench-verified", "CyberGym"]:
|
| 114 |
with gr.Row():
|
| 115 |
s.model_name = gr.Textbox(
|
| 116 |
value='Input the Model Name (fuzzy, case insensitive)',
|
|
@@ -137,7 +137,7 @@ head_style) as demo:
|
|
| 137 |
s = structs[benchmark_list.index(dataset_name)]
|
| 138 |
headers = s.check_box['essential'] + fields
|
| 139 |
df = cp.deepcopy(s.table)
|
| 140 |
-
if dataset_name not in ["SWE-bench-verified", "CyberGym"]:
|
| 141 |
default_val = 'Input the Model Name (fuzzy, case insensitive)'
|
| 142 |
else:
|
| 143 |
default_val = 'Input the Agent Name (fuzzy, case insensitive)'
|
|
@@ -145,7 +145,7 @@ head_style) as demo:
|
|
| 145 |
if model_name != default_val:
|
| 146 |
print(model_name)
|
| 147 |
model_name = model_name.lower()
|
| 148 |
-
if dataset_name not in ["SWE-bench-verified", "CyberGym"]:
|
| 149 |
method_names = [x.split('</a>')[0].split('>')[-1].lower() for x in df['Model']]
|
| 150 |
else:
|
| 151 |
method_names = [x.split('</a>')[0].split('>')[-1].lower() for x in df['Agent']]
|
|
|
|
| 110 |
)
|
| 111 |
s.headers = s.check_box['essential'] + s.checkbox_group.value
|
| 112 |
|
| 113 |
+
if benchmark not in ["SWE-bench-verified", "CyberGym", "BountyBench"]:
|
| 114 |
with gr.Row():
|
| 115 |
s.model_name = gr.Textbox(
|
| 116 |
value='Input the Model Name (fuzzy, case insensitive)',
|
|
|
|
| 137 |
s = structs[benchmark_list.index(dataset_name)]
|
| 138 |
headers = s.check_box['essential'] + fields
|
| 139 |
df = cp.deepcopy(s.table)
|
| 140 |
+
if dataset_name not in ["SWE-bench-verified", "CyberGym", "BountyBench"]:
|
| 141 |
default_val = 'Input the Model Name (fuzzy, case insensitive)'
|
| 142 |
else:
|
| 143 |
default_val = 'Input the Agent Name (fuzzy, case insensitive)'
|
|
|
|
| 145 |
if model_name != default_val:
|
| 146 |
print(model_name)
|
| 147 |
model_name = model_name.lower()
|
| 148 |
+
if dataset_name not in ["SWE-bench-verified", "CyberGym", "BountyBench"]:
|
| 149 |
method_names = [x.split('</a>')[0].split('>')[-1].lower() for x in df['Model']]
|
| 150 |
else:
|
| 151 |
method_names = [x.split('</a>')[0].split('>')[-1].lower() for x in df['Agent']]
|
gen_table.py
CHANGED
|
@@ -54,9 +54,9 @@ def BUILD_L2_DF(results, benchmark):
|
|
| 54 |
model_list=list(set(model_list))
|
| 55 |
|
| 56 |
res = defaultdict(list)
|
| 57 |
-
if benchmark not in ["RedCode","NYU CTF Bench","PrimeVul","SWE-bench-verified","CyberGym"]:
|
| 58 |
res['Model']=model_list
|
| 59 |
-
elif benchmark in ["SWE-bench-verified", "CyberGym"]:
|
| 60 |
res['Agent']=model_list
|
| 61 |
elif benchmark == "PrimeVul":
|
| 62 |
used=[]
|
|
@@ -104,7 +104,7 @@ def BUILD_L2_DF(results, benchmark):
|
|
| 104 |
required_fields = all_fields
|
| 105 |
|
| 106 |
check_box = {}
|
| 107 |
-
if benchmark in ["SWE-bench-verified", "CyberGym"]:
|
| 108 |
check_box['essential'] = ['Agent']
|
| 109 |
elif benchmark=='PrimeVul':
|
| 110 |
check_box['essential'] = ['Model','Method']
|
|
|
|
| 54 |
model_list=list(set(model_list))
|
| 55 |
|
| 56 |
res = defaultdict(list)
|
| 57 |
+
if benchmark not in ["RedCode","NYU CTF Bench","PrimeVul","SWE-bench-verified","CyberGym", "BountyBench"]:
|
| 58 |
res['Model']=model_list
|
| 59 |
+
elif benchmark in ["SWE-bench-verified", "CyberGym", "BountyBench"]:
|
| 60 |
res['Agent']=model_list
|
| 61 |
elif benchmark == "PrimeVul":
|
| 62 |
used=[]
|
|
|
|
| 104 |
required_fields = all_fields
|
| 105 |
|
| 106 |
check_box = {}
|
| 107 |
+
if benchmark in ["SWE-bench-verified", "CyberGym", "BountyBench"]:
|
| 108 |
check_box['essential'] = ['Agent']
|
| 109 |
elif benchmark=='PrimeVul':
|
| 110 |
check_box['essential'] = ['Model','Method']
|
meta_data.py
CHANGED
|
@@ -91,4 +91,10 @@ LEADERBOARD_MD['CyberGym'] = """This is a large-scale and high-quality cybersecu
|
|
| 91 |
|
| 92 |
Paper: https://arxiv.org/abs/2506.02548
|
| 93 |
Code: https://github.com/sunblaze-ucb/cybergym
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 94 |
"""
|
|
|
|
| 91 |
|
| 92 |
Paper: https://arxiv.org/abs/2506.02548
|
| 93 |
Code: https://github.com/sunblaze-ucb/cybergym
|
| 94 |
+
"""
|
| 95 |
+
|
| 96 |
+
LEADERBOARD_MD['BountyBench'] = """This is a benchmark with 25 systems with complex, real-world codebases, and includes 40 bug bounties that cover 9 of the OWASP Top 10 Risks.
|
| 97 |
+
|
| 98 |
+
Paper: https://arxiv.org/abs/2505.15216
|
| 99 |
+
Code: https://github.com/bountybench/bountybench
|
| 100 |
"""
|
results.json
CHANGED
|
@@ -829,6 +829,29 @@
|
|
| 829 |
"OpenHands + OpenHands-LM-32B": 0.33,
|
| 830 |
"OpenHands + SWE-Gym-32B": 0.07
|
| 831 |
}
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 832 |
}
|
| 833 |
}
|
| 834 |
}
|
|
|
|
| 829 |
"OpenHands + OpenHands-LM-32B": 0.33,
|
| 830 |
"OpenHands + SWE-Gym-32B": 0.07
|
| 831 |
}
|
| 832 |
+
},
|
| 833 |
+
"BountyBench": {
|
| 834 |
+
"Detect Success Rate": {
|
| 835 |
+
"Claude Code": 5,
|
| 836 |
+
"OpenAI Codex CLI": 5,
|
| 837 |
+
"C-Agent: Claude 3.7": 5,
|
| 838 |
+
"C-Agent: Gemini 2.5": 2.5,
|
| 839 |
+
"C-Agent: GPT-4.1": 0
|
| 840 |
+
},
|
| 841 |
+
"Exploit Success Rate": {
|
| 842 |
+
"Claude Code": 57.5,
|
| 843 |
+
"OpenAI Codex CLI": 32.5,
|
| 844 |
+
"C-Agent: Claude 3.7": 67.5,
|
| 845 |
+
"C-Agent: Gemini 2.5": 40,
|
| 846 |
+
"C-Agent: GPT-4.1": 55
|
| 847 |
+
},
|
| 848 |
+
"Patch Success Rate": {
|
| 849 |
+
"Claude Code": 87.5,
|
| 850 |
+
"OpenAI Codex CLI": 90,
|
| 851 |
+
"C-Agent: Claude 3.7": 60,
|
| 852 |
+
"C-Agent: Gemini 2.5": 45,
|
| 853 |
+
"C-Agent: GPT-4.1": 50
|
| 854 |
+
}
|
| 855 |
}
|
| 856 |
}
|
| 857 |
}
|