Jay committed
Commit · e4c9620
1 Parent(s): 774db71

feat: update new table

Browse files:
- app.py +49 -2
- assets/text.py +2 -1
- data/ChineseGuardBench.csv +33 -0
app.py
CHANGED
@@ -9,13 +9,13 @@ from assets.text import INTRODUCTION_TEXT, METRICS_TEXT, EVALUTION_TEXT, ACKNOWL
 ORIGINAL_DF = pd.read_csv("./data/chinese_benchmark_gen.csv", encoding='utf-8') # space separated values
 ORIGINAL_DF_PER = pd.read_csv("./data/chinese_benchmark_per.csv", encoding='utf-8') #
 
-
 ORIGINAL_DF_SUB_GEN = pd.read_csv("./data/subclass_gen.csv", encoding='utf-8') #
 ORIGINAL_DF_SUB_PER = pd.read_csv("./data/subclass_per.csv", encoding='utf-8')
 
+ORIGINAL_DF_NEW = pd.read_csv("./data/ChineseGuardBench.csv", encoding='utf-8') # new table
 
 METRICS = ["Accuracy", "Precision_Unsafe", "Recall_Unsafe", "Precision_Safe", "Recall_Safe", "None"]
-
+# METRICS_GuardBench = ["Accuracy", "Precision_Unsafe", "Recall_Unsafe", "Precision_Safe", "Recall_Safe", "None"]
 
 SUBCLASS = ["Discrimination", "Variant", "Psychology", "Politics", "Eroticism", "Vulgarity", "Property", "Injury", "Criminality", "Ethics"]
 
@@ -70,6 +70,19 @@ def format_number(x):
     return float(f"{x:.3}")
 
 
+def get_dataset_new_csv(
+    model_size: List[str],
+):
+    df = ORIGINAL_DF_NEW[ORIGINAL_DF_NEW['Size'].isin(model_size)]
+    df = df.drop(columns="Size")
+
+    leaderboard_table = gr.components.Dataframe(
+        value=df,
+        interactive=False,
+        visible=True,
+    )
+    return leaderboard_table
+
 def get_dataset_csv(
     model_size: List[str],
 ):
@@ -146,6 +159,17 @@ def get_dataset_classfier_gen(
         leaderboard_table = get_dataset_csv_sub_gen(model_size, subclass_choice)
     return leaderboard_table
 
+def get_ChineseGuardBench(
+    model_size: List[str],
+    main_choice: List[str],
+):
+    leaderboard_table = get_dataset_new_csv(model_size)
+    # elif main_choice != "Subclass":
+    #     subclass_choice = main_choice
+    #     leaderboard_table = get_dataset_csv_sub_gen(model_size, subclass_choice)
+    return leaderboard_table
+
+
 def get_dataset_classfier_per(
     model_size: List[str],
     main_choice: List[str],
@@ -200,6 +224,11 @@ with gr.Blocks() as demo:
             dataframe_all_per = gr.components.Dataframe(
                 elem_id="leaderboard-table",
             )
+
+        with gr.TabItem("π NEW", elem_id="od-benchmark-tab-table", id=7):
+            dataframe_all_guardbench = gr.components.Dataframe(
+                elem_id="leaderboard-table",
+            )
 
     # ----------------- modify text -----------------
     with gr.Row():
@@ -261,6 +290,24 @@ with gr.Blocks() as demo:
         outputs=dataframe_all_gen,
    )
 
+    # this is new results for ChineseGuardBench
+    # main_choice.change(
+    #     get_ChineseGuardBench,
+    #     inputs=[model_choice, main_choice],
+    #     outputs=dataframe_all_guardbench,
+    # )
+
+    model_choice.change(
+        get_ChineseGuardBench,
+        inputs=[model_choice, main_choice],
+        outputs=dataframe_all_guardbench,
+    )
+
+    demo.load(
+        fn=get_ChineseGuardBench,
+        inputs=[model_choice, main_choice],
+        outputs=dataframe_all_guardbench,
+    )
 
 demo.launch(share=True)
 
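Taken together, the additions follow a standard Gradio pattern: a filter callback rebuilds the table, `model_choice.change` re-runs it whenever the size filter changes, and `demo.load` populates the table once on first page render (without it, the new tab would start empty). The following is a minimal, self-contained sketch of that pattern, not the leaderboard's actual code; the sample data and component names are illustrative.

import gradio as gr
import pandas as pd

# Toy stand-in for ORIGINAL_DF_NEW.
DF = pd.DataFrame({
    "Model": ["A", "B", "C"],
    "Size": ["1B~5B", "~30B", ">65B"],
    "F1": [89.6, 88.5, 87.9],
})

def filter_table(model_size):
    # Same filtering as get_dataset_new_csv: keep the selected sizes,
    # then hide the Size column from the displayed table.
    df = DF[DF["Size"].isin(model_size)].drop(columns="Size")
    return gr.components.Dataframe(value=df, interactive=False, visible=True)

with gr.Blocks() as demo:
    model_choice = gr.CheckboxGroup(
        ["1B~5B", "~30B", ">65B"],
        value=["1B~5B", "~30B", ">65B"],
        label="Size",
    )
    table = gr.components.Dataframe()
    model_choice.change(filter_table, inputs=model_choice, outputs=table)  # refresh on filter change
    demo.load(filter_table, inputs=model_choice, outputs=table)            # fill the table on page load

demo.launch()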
assets/text.py
CHANGED
@@ -35,7 +35,8 @@ EVALUTION_TEXT= """
 We evaluate the models using two methods: perplexity (multiple choice) and generation.
 For perplexity, we select the label with the lowest perplexity as the predicted result.
 For generation, we use the content generated by the model to make the prediction.
-The following are the results of the evaluation.
+The following are the results of the evaluation.
+In the "New" table, we present evaluation results for generation on a meticulously curated new benchmark, with details of its construction to be introduced later. πππ
 </span> <br><br>
 
 
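The evaluation text above distinguishes perplexity (multiple-choice) scoring from generation scoring. As a rough illustration of the perplexity method, the sketch below scores each candidate label by the language-model loss of prompt + label and picks the lowest; the model name, prompt, and labels are placeholders, not the benchmark's actual setup.

import torch
from transformers import AutoModelForCausalLM, AutoTokenizer

MODEL = "Qwen/Qwen2.5-1.5B-Instruct"  # placeholder model
tokenizer = AutoTokenizer.from_pretrained(MODEL)
model = AutoModelForCausalLM.from_pretrained(MODEL).eval()

def pick_label(prompt: str, labels: list[str]) -> str:
    # Score each label by the mean cross-entropy of "prompt + label".
    # exp(loss) is the perplexity, and exp is monotonic, so comparing
    # losses is equivalent to comparing perplexities.
    losses = []
    for label in labels:
        ids = tokenizer(prompt + label, return_tensors="pt").input_ids
        with torch.no_grad():
            losses.append(model(ids, labels=ids).loss.item())
    return labels[losses.index(min(losses))]

print(pick_label("Q: <query>\nIs the content above safe or unsafe? A: ", ["safe", "unsafe"]))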
data/ChineseGuardBench.csv
ADDED
@@ -0,0 +1,33 @@
+Model,Size,F1,Accuracy,Precision,Recall,FPR,FNR
+Deepexi-Guard-3B,1B~5B,89.63,89.72,85.53,94.15,14.24,5.85
+Qwen3-32B,~30B,88.54,89.25,89.08,88.02,9.64,11.98
+Qwen3-235B-A22B,>65B,87.92,88.96,90.86,85.17,7.66,14.83
+Qwen3-235B-A22B-Instruct-2507,>65B,87.81,89.13,93.27,82.96,5.35,17.04
+GLM-Z1-9B-0414,5B~10B,87.36,88.03,87.11,87.61,11.59,12.39
+Qwen2.5-72B-Instruct,>65B,86.81,88.27,92.50,81.79,5.93,18.21
+QwQ-32B,~30B,86.80,88.35,93.33,81.12,5.18,18.88
+Phi-4,10B~20B,85.95,86.88,86.90,85.02,11.45,14.98
+Gemma-3-27B-it,~30B,85.29,86.78,89.83,81.19,8.22,18.81
+DeepSeek-R1-0528,>65B,85.24,87.47,96.02,76.63,2.84,23.37
+Mistral-Small-3.2-24B-Instruct,~30B,85.07,87.03,93.14,78.29,5.15,21.71
+GLM-4-9B-chat,5B~10B,84.85,86.27,88.47,81.52,9.49,18.48
+MD-Judge-v0_2-internlm2_7B,5B~10B,84.63,85.88,87.03,82.37,10.98,17.63
+DeepSeek-R1-Distill-Qwen-32B,~30B,84.55,86.64,93.05,77.47,5.17,22.53
+Hunyuan-A13B-Instruct,>65B,84.32,86.21,90.97,78.58,6.98,21.42
+Moonlight-16B-A3B-Instruct,10B~20B,84.21,84.35,80.41,88.38,19.25,11.62
+GLM-Z1-32B-0414,~30B,83.40,85.75,92.63,75.85,5.40,24.15
+Qwen3-8B,5B~10B,83.05,85.51,92.69,75.23,5.30,24.77
+Qwen2.5-7B-Instruct,5B~10B,82.96,84.99,89.41,77.37,8.20,22.63
+Qwen2.5-1.5B-Instruct,1B~5B,79.48,77.08,68.83,94.03,38.07,5.97
+shieldgemma-2B,1B~5B,79.19,79.63,76.50,82.06,22.54,17.94
+Qwen2.5-3B-Instruct,1B~5B,79.05,77.57,70.69,89.66,33.25,10.34
+SHTEC_safety_fence_model_7B,5B~10B,78.44,82.48,93.54,67.54,4.17,32.46
+Qwen3-4B,1B~5B,78.16,82.50,95.12,66.33,3.04,33.67
+SmolLM3-3B,1B~5B,76.10,79.19,83.09,70.19,12.77,29.81
+ERNIE-4.5-21B-A3B-Paddle,~20B,75.21,80.58,94.58,62.42,3.20,37.58
+Qwen3-1.7B,1B~5B,74.46,79.34,89.36,63.82,6.79,36.18
+internlm2_5-7B-chat,5B~10B,71.52,78.49,95.34,57.22,2.50,42.78
+Llama-Guard-4-12B,10B~20B,65.66,74.64,90.99,51.36,4.54,48.64
+Llama-Guard-3-8B,5B~10B,59.33,72.44,97.80,42.58,0.86,57.42
+DeepSeek-R1-Distill-Qwen-7B,5B~10B,45.27,65.53,90.36,30.20,2.88,69.80
+Gemma-3n-E4B-it,5B~10B,44.05,64.88,88.80,29.29,3.30,70.71