Commit: join cost table

Files changed:
- README.md +1 -33
- app.py +9 -5
- crm-results/hf_leaderboard_flavor_mapping.csv +12 -0
- crm-results/hf_leaderboard_latency_cost.csv +1 -1
- src/about.py +0 -23
- src/display/utils.py +12 -101
- src/populate.py +18 -6
README.md CHANGED

@@ -9,36 +9,4 @@ pinned: true
 license: apache-2.0
 ---
 
-#
-
-Most of the variables to change for a default leaderboard are in `src/env.py` (replace the path for your leaderboard) and `src/about.py` (for tasks).
-
-Results files should have the following format and be stored as json files:
-```json
-{
-    "config": {
-        "model_dtype": "torch.float16", # or torch.bfloat16 or 8bit or 4bit
-        "model_name": "path of the model on the hub: org/model",
-        "model_sha": "revision on the hub",
-    },
-    "results": {
-        "task_name": {
-            "metric_name": score,
-        },
-        "task_name2": {
-            "metric_name": score,
-        }
-    }
-}
-```
-
-Request files are created automatically by this tool.
-
-If you encounter problems on the Space, don't hesitate to restart it to remove the created eval-queue, eval-queue-bk, eval-results, and eval-results-bk folders.
-
-# Code logic for more complex edits
-
-You'll find:
-- the main table's column names and properties in `src/display/utils.py`
-- the logic to read all results and request files, then convert them into dataframe rows, in `src/leaderboard/read_evals.py` and `src/populate.py`
-- the logic to allow or filter submissions in `src/submission/submit.py` and `src/submission/check_validity.py`
+# Generative AI Leaderboard for CRM
app.py CHANGED

@@ -57,7 +57,7 @@ def update_table(
     filtered_df = filter_use_case_func(filtered_df, use_case_query)
     filtered_df = filter_use_case_type_func(filtered_df, use_case_type_query)
     df = select_columns(filtered_df, columns)
-    return df
+    return df.style.map(highlight_cost_band_low, props="background-color: #b3d5a4")
 
 
 def update_cost_table(
@@ -65,11 +65,11 @@ def update_cost_table(
     columns: list,
     llm_query: list,
     llm_provider_query: list,
-
+    use_case_flavor_query: list,
 ):
     filtered_df = filter_llm_func(hidden_df, llm_query)
     filtered_df = filter_llm_provider_func(filtered_df, llm_provider_query)
-    filtered_df =
+    filtered_df = filter_use_case_flavor_func(filtered_df, use_case_flavor_query)
     df = select_columns_cost_table(filtered_df, columns)
     return df.style.map(highlight_cost_band_low, props="background-color: #b3d5a4")
 
@@ -183,6 +183,10 @@ def filter_use_case_type_func(df: pd.DataFrame, use_case_type_query: list) -> pd
     return df[df["Use Case Type"].isin(use_case_type_query)]
 
 
+def filter_use_case_flavor_func(df: pd.DataFrame, use_case_flavor_query: list) -> pd.DataFrame:
+    return df[df["Cost and Speed: Flavor"].isin(use_case_flavor_query)]
+
+
 def filter_llm_func(df: pd.DataFrame, llm_query: list) -> pd.DataFrame:
     return df[df["Model Name"].isin(llm_query)]
 
@@ -222,7 +226,7 @@ with demo:
     gr.Markdown(INTRODUCTION_TEXT, elem_classes="markdown-text")
 
     with gr.Tabs(elem_classes="tab-buttons") as tabs:
-        with gr.TabItem("🏅
+        with gr.TabItem("🏅 Leaderboard", elem_id="llm-benchmark-tab-table", id=0):
            with gr.Row():
                with gr.Column():
                    # with gr.Row():
@@ -455,7 +459,7 @@ with demo:
                    filter_use_case_type = gr.CheckboxGroup(
                        choices=["Long", "Short"],
                        value=["Long", "Short"],
-                        label="Use Case
+                        label="Use Case Flavor",
                        info="Output: 250 tokens, Long input: 3k tokens, Short input: 500 tokens",
                        interactive=True,
                    )
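Taken together, the app.py changes add a flavor dimension to the cost view: `update_cost_table` gains a `use_case_flavor_query` argument and a matching `filter_use_case_flavor_func`. Below is a minimal, self-contained sketch (not the Space's actual wiring; the `toy_df` rows are invented for illustration) showing how the new filter composes with the existing model filter on a plain DataFrame, using the column names from the diff above.

```python
import pandas as pd

def filter_llm_func(df: pd.DataFrame, llm_query: list) -> pd.DataFrame:
    return df[df["Model Name"].isin(llm_query)]

def filter_use_case_flavor_func(df: pd.DataFrame, use_case_flavor_query: list) -> pd.DataFrame:
    return df[df["Cost and Speed: Flavor"].isin(use_case_flavor_query)]

# Hypothetical rows mimicking hf_leaderboard_latency_cost.csv columns.
toy_df = pd.DataFrame(
    {
        "Model Name": ["AI21 Jamba-Instruct", "AI21 Jamba-Instruct", "Claude 3 Haiku"],
        "Cost and Speed: Flavor": ["Long", "Short", "Long"],
        "Mean Cost per 1K Requests": [1.6, 0.5, 1.0],
    }
)

# Selecting one model and only the "Short" flavor, as the CheckboxGroup would.
filtered = filter_use_case_flavor_func(
    filter_llm_func(toy_df, ["AI21 Jamba-Instruct"]), ["Short"]
)
print(filtered)  # one row: AI21 Jamba-Instruct / Short / 0.5
```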
crm-results/hf_leaderboard_flavor_mapping.csv ADDED

@@ -0,0 +1,12 @@
+Use Case Name,Use Case Type,Cost and Speed: Flavor
+Service: Conversation summary,Summary,Short
+Service: Reply Recommendations,Generation,Short
+Sales: Email Generation,Generation,Short
+Sales & Service: Update CRM Info,Generation,Long
+Service: Call Summary,Summary,Long
+Sales: Call Summary,Summary,Long
+Service: Live Chat Insights,Summary,Short
+Service: Live Chat Summary,Summary,Long
+Service: Email Summary,Summary,Long
+Service: Knowledge creation from Case Info,Generation,Long
+Sales: Email Summary,Summary,Long
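Because `src/populate.py` joins the cost table on the (`Model Name`, `Cost and Speed: Flavor`) pair, each use case in this mapping should resolve to exactly one flavor. A quick sanity check along those lines (an illustrative script, not part of the commit):

```python
import pandas as pd

mapping = pd.read_csv("crm-results/hf_leaderboard_flavor_mapping.csv")

# Every use case should appear once, with a flavor drawn from the two known values.
assert mapping["Use Case Name"].is_unique
assert set(mapping["Cost and Speed: Flavor"]) <= {"Long", "Short"}

print(mapping["Cost and Speed: Flavor"].value_counts())  # Long: 7, Short: 4
```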
crm-results/hf_leaderboard_latency_cost.csv CHANGED

@@ -1,4 +1,4 @@
-Model Name,
+Model Name,Cost and Speed: Flavor,Version,Platform,Response Time (Sec),Mean Output Tokens,Mean Cost per 1K Requests,Cost Band,,Model id,Cost per 1m input tokens,Cost per 1m output tokens,,,,Percentile,From,To,,min,Max
 AI21 Jamba-Instruct,Long,,AI21,4.0,232.9,1.6,Medium,,GPT 3.5 Turbo,0.5,1.5,,,0%,0.43,0.43,1.61,,0.43,61.11
 AI21 Jamba-Instruct,Short,,AI21,4.0,243.9,0.5,Low,,GPT 4 Turbo,10,30,,,33%,1.61,1.61,9.28,,,
 Claude 3 Haiku,Long,,Bedrock,2.8,236.9,1.0,Low,,GPT4-o,5,15,,,67%,9.28,9.28,61.11,,,
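The new header also names the per-million-token prices of the reference models used for the cost percentiles. For orientation only, here is one plausible way a per-1K-requests cost can be derived from such prices; the token counts come from the UI hint in app.py ("Output: 250 tokens, Long input: 3k tokens"), and the formula is an assumption rather than something stated in this commit.

```python
# Hedged sketch: one plausible derivation of a "Mean Cost per 1K Requests" figure
# from per-million-token prices. The token counts are illustrative assumptions.
def cost_per_1k_requests(input_tokens: float, output_tokens: float,
                         price_in_per_1m: float, price_out_per_1m: float) -> float:
    per_request = (input_tokens * price_in_per_1m + output_tokens * price_out_per_1m) / 1_000_000
    return per_request * 1_000

# Example: a "Long" flavor request (~3k input tokens, ~250 output tokens)
# priced at the GPT 3.5 Turbo rates listed in the CSV ($0.5 / $1.5 per 1M tokens).
print(round(cost_per_1k_requests(3_000, 250, 0.5, 1.5), 2))  # ≈ 1.88
```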
src/about.py CHANGED

@@ -1,26 +1,3 @@
-from dataclasses import dataclass
-from enum import Enum
-
-
-@dataclass
-class Task:
-    benchmark: str
-    metric: str
-    col_name: str
-
-
-# Select your tasks here
-# ---------------------------------------------------
-class Tasks(Enum):
-    # task_key in the json file, metric_key in the json file, name to display in the leaderboard
-    task0 = Task("anli_r1", "acc", "ANLI")
-    task1 = Task("logiqa", "acc_norm", "LogiQA")
-
-
-NUM_FEWSHOT = 0 # Change with your few shot
-# ---------------------------------------------------
-
-
 # Your leaderboard name
 TITLE = """<h1 align="center" id="space-title">Generative AI Leaderboard for CRM</h1>
 <h3>Assess which LLMs are accurate enough or need fine-tuning, and weigh this versus tradeoffs of speed, costs, and trust and safety. This is based on human manual and automated evaluation with real operational CRM data per use case.</h3>
src/display/utils.py CHANGED

@@ -1,10 +1,7 @@
 from dataclasses import dataclass, make_dataclass
-from enum import Enum
 
 import pandas as pd
 
-from src.about import Tasks
-
 
 def fields(raw_class):
     return [v for k, v in raw_class.__dict__.items() if k[:2] != "__" and k[-2:] != "__"]
@@ -29,9 +26,8 @@ auto_eval_column_dict.append(
     ["model", ColumnContent, ColumnContent("Model Name", "markdown", True, never_hidden=True)]
 )
 auto_eval_column_dict.append(["model_provider", ColumnContent, ColumnContent("LLM Provider", "markdown", True)])
-auto_eval_column_dict.append(["use_case_name", ColumnContent, ColumnContent("Use Case Name", "markdown",
+auto_eval_column_dict.append(["use_case_name", ColumnContent, ColumnContent("Use Case Name", "markdown", True)])
 auto_eval_column_dict.append(["use_case_type", ColumnContent, ColumnContent("Use Case Type", "markdown", False)])
-
 auto_eval_column_dict.append(["accuracy_method", ColumnContent, ColumnContent("Accuracy Method", "markdown", False)])
 # Accuracy metrics
 auto_eval_column_dict.append(["accuracy_metric_average", ColumnContent, ColumnContent("Accuracy", "markdown", True)])
@@ -51,6 +47,14 @@ auto_eval_column_dict.append(
 auto_eval_column_dict.append(
     ["accuracy_metric_factuality", ColumnContent, ColumnContent("Factuality", "markdown", True)]
 )
+# auto_eval_column_dict.append(
+#     ["use_case_flavor", ColumnContent, ColumnContent("Cost and Speed: Flavor", "markdown", False)]
+# )
+auto_eval_column_dict.append(["latency", ColumnContent, ColumnContent("Response Time (Sec)", "markdown", True)])
+auto_eval_column_dict.append(
+    ["mean_output_tokens", ColumnContent, ColumnContent("Mean Output Tokens", "markdown", True)]
+)
+auto_eval_column_dict.append(["cost_band", ColumnContent, ColumnContent("Cost Band", "markdown", True)])
 # We use make dataclass to dynamically fill the scores from Tasks
 AutoEvalColumn = make_dataclass("AutoEvalColumn", auto_eval_column_dict, frozen=True)
 
@@ -62,10 +66,10 @@ cost_eval_column_dict.append(
     ["model", ColumnContent, ColumnContent("Model Name", "markdown", True, never_hidden=True)]
 )
 cost_eval_column_dict.append(["model_provider", ColumnContent, ColumnContent("LLM Provider", "markdown", True)])
-cost_eval_column_dict.append(["use_case_type", ColumnContent, ColumnContent("Use Case Type", "markdown", True)])
 cost_eval_column_dict.append(
-    ["
+    ["use_case_flavor", ColumnContent, ColumnContent("Cost and Speed: Flavor", "markdown", True)]
 )
+cost_eval_column_dict.append(["latency", ColumnContent, ColumnContent("Response Time (Sec)", "markdown", True)])
 cost_eval_column_dict.append(
     ["mean_output_tokens", ColumnContent, ColumnContent("Mean Output Tokens", "markdown", True)]
 )
@@ -85,96 +89,6 @@ ts_eval_column_dict.append(["crm_bias", ColumnContent, ColumnContent("CRM Bias",
 # ts_eval_column_dict.append(["bias_no_ci", ColumnContent, ColumnContent("Bias No CI", "markdown", True)])
 TSEvalColumn = make_dataclass("TSEvalColumn", ts_eval_column_dict, frozen=True)
 
-
-# Scores
-# auto_eval_column_dict.append(["average", ColumnContent, ColumnContent("Average ⬆️", "number", True)])
-# for task in Tasks:
-#     auto_eval_column_dict.append([task.name, ColumnContent, ColumnContent(task.value.col_name, "number", True)])
-# Model information
-# auto_eval_column_dict.append(["model_type", ColumnContent, ColumnContent("Type", "str", False)])
-# auto_eval_column_dict.append(["architecture", ColumnContent, ColumnContent("Architecture", "str", False)])
-# auto_eval_column_dict.append(["weight_type", ColumnContent, ColumnContent("Weight type", "str", False, True)])
-# auto_eval_column_dict.append(["precision", ColumnContent, ColumnContent("Precision", "str", False)])
-# auto_eval_column_dict.append(["license", ColumnContent, ColumnContent("Hub License", "str", False)])
-# auto_eval_column_dict.append(["params", ColumnContent, ColumnContent("#Params (B)", "number", False)])
-# auto_eval_column_dict.append(["likes", ColumnContent, ColumnContent("Hub ❤️", "number", False)])
-# auto_eval_column_dict.append(["still_on_hub", ColumnContent, ColumnContent("Available on the hub", "bool", False)])
-# auto_eval_column_dict.append(["revision", ColumnContent, ColumnContent("Model sha", "str", False, False)])
-
-
-## For the queue columns in the submission tab
-@dataclass(frozen=True)
-class EvalQueueColumn: # Queue column
-    model = ColumnContent("model", "markdown", True)
-    revision = ColumnContent("revision", "str", True)
-    private = ColumnContent("private", "bool", True)
-    precision = ColumnContent("precision", "str", True)
-    weight_type = ColumnContent("weight_type", "str", "Original")
-    status = ColumnContent("status", "str", True)
-
-
-## All the model information that we might need
-@dataclass
-class ModelDetails:
-    name: str
-    display_name: str = ""
-    symbol: str = "" # emoji
-
-
-class ModelType(Enum):
-    PT = ModelDetails(name="pretrained", symbol="🟢")
-    FT = ModelDetails(name="fine-tuned", symbol="🔶")
-    IFT = ModelDetails(name="instruction-tuned", symbol="⭕")
-    RL = ModelDetails(name="RL-tuned", symbol="🟦")
-    Unknown = ModelDetails(name="", symbol="?")
-
-    def to_str(self, separator=" "):
-        return f"{self.value.symbol}{separator}{self.value.name}"
-
-    @staticmethod
-    def from_str(type):
-        if "fine-tuned" in type or "🔶" in type:
-            return ModelType.FT
-        if "pretrained" in type or "🟢" in type:
-            return ModelType.PT
-        if "RL-tuned" in type or "🟦" in type:
-            return ModelType.RL
-        if "instruction-tuned" in type or "⭕" in type:
-            return ModelType.IFT
-        return ModelType.Unknown
-
-
-class WeightType(Enum):
-    Adapter = ModelDetails("Adapter")
-    Original = ModelDetails("Original")
-    Delta = ModelDetails("Delta")
-
-
-class Precision(Enum):
-    float16 = ModelDetails("float16")
-    bfloat16 = ModelDetails("bfloat16")
-    float32 = ModelDetails("float32")
-    # qt_8bit = ModelDetails("8bit")
-    # qt_4bit = ModelDetails("4bit")
-    # qt_GPTQ = ModelDetails("GPTQ")
-    Unknown = ModelDetails("?")
-
-    def from_str(precision):
-        if precision in ["torch.float16", "float16"]:
-            return Precision.float16
-        if precision in ["torch.bfloat16", "bfloat16"]:
-            return Precision.bfloat16
-        if precision in ["float32"]:
-            return Precision.float32
-        # if precision in ["8bit"]:
-        #     return Precision.qt_8bit
-        # if precision in ["4bit"]:
-        #     return Precision.qt_4bit
-        # if precision in ["GPTQ", "None"]:
-        #     return Precision.qt_GPTQ
-        return Precision.Unknown
-
-
 # Column selection
 COLS = [c.name for c in fields(AutoEvalColumn) if not c.hidden]
 TYPES = [c.type for c in fields(AutoEvalColumn) if not c.hidden]
@@ -187,10 +101,7 @@ COST_TYPES = [c.type for c in fields(CostEvalColumn) if not c.hidden]
 TS_COLS = [c.name for c in fields(TSEvalColumn) if not c.hidden]
 TS_TYPES = [c.type for c in fields(TSEvalColumn) if not c.hidden]
 
-
-EVAL_TYPES = [c.type for c in fields(EvalQueueColumn)]
-
-BENCHMARK_COLS = [t.value.col_name for t in Tasks]
+# BENCHMARK_COLS = [t.value.col_name for t in Tasks]
 
 NUMERIC_INTERVALS = {
     "?": pd.Interval(-1, 0, closed="right"),
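For readers unfamiliar with the pattern used throughout this file: column definitions are accumulated as `[field_name, type, default]` triples and then turned into a frozen dataclass with `make_dataclass`, which is what lets `COLS`/`TYPES` be derived by reflection over the class attributes. Below is a stand-alone sketch of that mechanism; the `ColumnContent` fields mirror the standard leaderboard template and are an assumption about this repo, and `DemoEvalColumn` is a made-up name.

```python
from dataclasses import dataclass, make_dataclass

@dataclass
class ColumnContent:
    name: str
    type: str
    displayed_by_default: bool
    hidden: bool = False
    never_hidden: bool = False

cols = []
cols.append(["model", ColumnContent, ColumnContent("Model Name", "markdown", True, never_hidden=True)])
cols.append(["cost_band", ColumnContent, ColumnContent("Cost Band", "markdown", True)])

# make_dataclass builds the class at runtime from (name, type, default) triples.
DemoEvalColumn = make_dataclass("DemoEvalColumn", cols, frozen=True)

def fields(raw_class):
    # Same helper as in utils.py: pick up the non-dunder class attributes,
    # which are the ColumnContent defaults set above.
    return [v for k, v in raw_class.__dict__.items() if k[:2] != "__" and k[-2:] != "__"]

print([c.name for c in fields(DemoEvalColumn) if not c.hidden])  # ['Model Name', 'Cost Band']
```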
src/populate.py CHANGED

@@ -2,26 +2,35 @@ import os
 
 import pandas as pd
 
+from src.display.utils import AutoEvalColumn
+
 
 def get_leaderboard_df_crm(
     crm_results_path: str, accuracy_cols: list, cost_cols: list
 ) -> tuple[pd.DataFrame, pd.DataFrame]:
     """Creates a dataframe from all the individual experiment results"""
+    use_case_flavor_mapping_df = pd.read_csv(os.path.join(crm_results_path, "hf_leaderboard_flavor_mapping.csv"))
     sf_finetuned_models = ["SF-TextBase 70B", "SF-TextBase 7B", "SF-TextSum"]
+    # sf_finetuned_models = []
 
     leaderboard_accuracy_df = pd.read_csv(os.path.join(crm_results_path, "hf_leaderboard_accuracy.csv"))
     leaderboard_accuracy_df = leaderboard_accuracy_df[~leaderboard_accuracy_df["Model Name"].isin(sf_finetuned_models)]
-
-
-
-
+
+    leaderboard_accuracy_df = leaderboard_accuracy_df.join(
+        use_case_flavor_mapping_df[["Use Case Name", "Cost and Speed: Flavor"]].set_index("Use Case Name"),
+        on="Use Case Name",
+    )
 
     ref_df = leaderboard_accuracy_df[["Model Name", "LLM Provider"]].drop_duplicates()
 
     leaderboard_cost_df = pd.read_csv(os.path.join(crm_results_path, "hf_leaderboard_latency_cost.csv"))
     leaderboard_cost_df = leaderboard_cost_df[~leaderboard_cost_df["Model Name"].isin(sf_finetuned_models)]
+    leaderboard_accuracy_df = leaderboard_accuracy_df.join(
+        leaderboard_cost_df.set_index(["Model Name", "Cost and Speed: Flavor"]),
+        on=["Model Name", "Cost and Speed: Flavor"],
+    )
+
     leaderboard_cost_df = leaderboard_cost_df.join(ref_df.set_index("Model Name"), on="Model Name")
-    # leaderboard_cost_df["LLM Provider"] = leaderboard_cost_df["LLM Provider"].fillna("Google")
     leaderboard_cost_df = leaderboard_cost_df[cost_cols].round(decimals=2)
 
     leaderboard_ts_df = pd.read_csv(os.path.join(crm_results_path, "hf_leaderboard_ts.csv"))
@@ -29,7 +38,6 @@ def get_leaderboard_df_crm(
     leaderboard_ts_df = leaderboard_ts_df[~leaderboard_ts_df["Model Name"].isin(sf_finetuned_models)]
     leaderboard_ts_df = leaderboard_ts_df.join(ref_df.set_index("Model Name"), on="Model Name")
     leaderboard_ts_df = leaderboard_ts_df.join(leaderboard_ts__crm_bias_df.set_index("Model Name"), on="Model Name")
-    # leaderboard_ts_df["LLM Provider"] = leaderboard_ts_df["LLM Provider"].fillna("Google")
     privacy_cols = leaderboard_ts_df[
         [
             "Privacy Zero-Shot Match Avoidance",
@@ -52,4 +60,8 @@ def get_leaderboard_df_crm(
     ].apply(lambda x: x.str.rstrip("%").astype("float") / 100.0, axis=1)
     leaderboard_ts_df["Trust & Safety"] = ts_cols.mean(axis=1).transform(lambda x: "{:,.2%}".format(x))
 
+    leaderboard_accuracy_df = leaderboard_accuracy_df.sort_values(
+        by=[AutoEvalColumn.accuracy_metric_average.name], ascending=False
+    )
+    leaderboard_accuracy_df = leaderboard_accuracy_df[accuracy_cols].round(decimals=2)
     return leaderboard_accuracy_df, leaderboard_cost_df, leaderboard_ts_df
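The net effect of the populate.py change is a two-step join: first attach a flavor to every accuracy row via the use-case mapping, then pull the cost/latency figures in on the composite (model, flavor) key. A minimal sketch of that logic with toy values (illustrative data, not leaderboard results):

```python
import pandas as pd

# Toy stand-ins for the three CSVs used by get_leaderboard_df_crm.
accuracy = pd.DataFrame({
    "Model Name": ["Claude 3 Haiku", "Claude 3 Haiku"],
    "Use Case Name": ["Service: Call Summary", "Sales: Email Generation"],
    "Accuracy": [0.91, 0.88],
})
flavor_mapping = pd.DataFrame({
    "Use Case Name": ["Service: Call Summary", "Sales: Email Generation"],
    "Cost and Speed: Flavor": ["Long", "Short"],
})
cost = pd.DataFrame({
    "Model Name": ["Claude 3 Haiku", "Claude 3 Haiku"],
    "Cost and Speed: Flavor": ["Long", "Short"],
    "Mean Cost per 1K Requests": [1.0, 0.3],
})

# Step 1: attach the flavor to each accuracy row via its use case.
accuracy = accuracy.join(
    flavor_mapping.set_index("Use Case Name"), on="Use Case Name"
)
# Step 2: join cost/latency on the composite (model, flavor) key.
accuracy = accuracy.join(
    cost.set_index(["Model Name", "Cost and Speed: Flavor"]),
    on=["Model Name", "Cost and Speed: Flavor"],
)
print(accuracy[["Use Case Name", "Cost and Speed: Flavor", "Mean Cost per 1K Requests"]])
```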