|
import os |
|
|
|
import pandas as pd |
|
|
|
from src.display.utils import AutoEvalColumn |
|
|
|
|
|
def _percent_strings_to_fractions(frame: pd.DataFrame) -> pd.DataFrame:
    """Convert cells such as ``"12.5%"`` into floats such as ``0.125`` (row by row)."""
    return frame.apply(lambda row: row.str.rstrip("%").astype("float") / 100.0, axis=1)


def _format_percent(value: float) -> str:
    """Render a fraction as a percentage string with two decimals, e.g. ``0.125`` -> ``"12.50%"``."""
    return "{:,.2%}".format(value)


def get_leaderboard_df_crm(
    crm_results_path: str, accuracy_cols: list, ts_cols: list
) -> pd.DataFrame:
    """Build the accuracy leaderboard dataframe from the CRM result CSV files.

    The following files are read from ``crm_results_path``:

    * ``hf_leaderboard_flavor_mapping.csv`` - maps "Use Case Name" to
      "Cost and Speed: Flavor".
    * ``hf_leaderboard_accuracy.csv`` - base table of the result.
    * ``hf_leaderboard_latency_cost.csv`` - joined on
      ("Model Name", "Cost and Speed: Flavor").
    * ``hf_leaderboard_ts.csv`` and ``hf_leaderboard_crm_bias.csv`` - joined
      together on "Model Name"; used to derive the "Privacy", "Bias No CI"
      and "Trust & Safety" columns.

    Rows whose "Model Name" is one of the hard-coded SF fine-tuned models are
    dropped from the accuracy, cost and trust & safety tables.

    Args:
        crm_results_path: Directory containing the CSV files listed above.
        accuracy_cols: Columns to keep, in order, in the returned dataframe.
        ts_cols: Columns of the trust & safety table to join onto the accuracy
            table. Must include "Model Name", which is used as the join key.

    Returns:
        The accuracy dataframe enriched with cost and trust & safety columns,
        sorted in descending order by
        ``AutoEvalColumn.accuracy_metric_average.name``, restricted to
        ``accuracy_cols`` and rounded to two decimals.
    """

    def read_csv(file_name: str) -> pd.DataFrame:
        return pd.read_csv(os.path.join(crm_results_path, file_name))

    def drop_sf_finetuned(frame: pd.DataFrame) -> pd.DataFrame:
        return frame[~frame["Model Name"].isin(sf_finetuned_models)]

    use_case_flavor_mapping_df = read_csv("hf_leaderboard_flavor_mapping.csv")
    sf_finetuned_models = ["SF-TextBase 70B", "SF-TextBase 7B", "SF-TextSum"]

    # Accuracy table, tagged with the cost/speed flavor of each use case.
    leaderboard_accuracy_df = drop_sf_finetuned(read_csv("hf_leaderboard_accuracy.csv"))
    leaderboard_accuracy_df = leaderboard_accuracy_df.join(
        use_case_flavor_mapping_df[["Use Case Name", "Cost and Speed: Flavor"]].set_index("Use Case Name"),
        on="Use Case Name",
    )

    # Latency / cost figures are keyed by (model, flavor).
    leaderboard_cost_df = drop_sf_finetuned(read_csv("hf_leaderboard_latency_cost.csv"))
    leaderboard_accuracy_df = leaderboard_accuracy_df.join(
        leaderboard_cost_df.set_index(["Model Name", "Cost and Speed: Flavor"]),
        on=["Model Name", "Cost and Speed: Flavor"],
    )

    # Trust & safety table, extended with the CRM bias results.
    leaderboard_ts_df = read_csv("hf_leaderboard_ts.csv")
    leaderboard_ts_crm_bias_df = read_csv("hf_leaderboard_crm_bias.csv")
    leaderboard_ts_df = drop_sf_finetuned(leaderboard_ts_df)
    leaderboard_ts_df = leaderboard_ts_df.join(leaderboard_ts_crm_bias_df.set_index("Model Name"), on="Model Name")

    # "Privacy" is the mean of the four privacy sub-metrics, re-rendered as a percent string.
    privacy_cols = _percent_strings_to_fractions(
        leaderboard_ts_df[
            [
                "Privacy Zero-Shot Match Avoidance",
                "Privacy Zero-Shot Reveal Avoidance",
                "Privacy Five-Shot Match Avoidance",
                "Privacy Five-Shot Reveal Avoidance",
            ]
        ]
    )
    leaderboard_ts_df["Privacy"] = privacy_cols.mean(axis=1).transform(_format_percent)

    # Keep only the text before the first space of "CRM Bias"
    # (presumably this strips a trailing confidence interval - confirm against the CSV).
    leaderboard_ts_df["Bias No CI"] = leaderboard_ts_df["CRM Bias"].transform(lambda x: x.split(" ")[0])

    # "Trust & Safety" is the mean of the four second-level metrics.
    ts_lvl2_cols = _percent_strings_to_fractions(
        leaderboard_ts_df[
            [
                "Safety",
                "Privacy",
                "Truthfulness",
                "Bias No CI",
            ]
        ]
    )
    leaderboard_ts_df["Trust & Safety"] = ts_lvl2_cols.mean(axis=1).transform(_format_percent)

    leaderboard_accuracy_df = leaderboard_accuracy_df.join(
        leaderboard_ts_df[ts_cols].set_index(["Model Name"]),
        on=["Model Name"],
    )

    leaderboard_accuracy_df = leaderboard_accuracy_df.sort_values(
        by=[AutoEvalColumn.accuracy_metric_average.name], ascending=False
    )
    leaderboard_accuracy_df = leaderboard_accuracy_df[accuracy_cols].round(decimals=2)
    return leaderboard_accuracy_df
|
|