Commit: join cost table

Files changed:
- README.md +1 -33
- app.py +9 -5
- crm-results/hf_leaderboard_flavor_mapping.csv +12 -0
- crm-results/hf_leaderboard_latency_cost.csv +1 -1
- src/about.py +0 -23
- src/display/utils.py +12 -101
- src/populate.py +18 -6
README.md CHANGED

@@ -9,36 +9,4 @@ pinned: true
 license: apache-2.0
 ---
 
-#
-
-Most of the variables to change for a default leaderboard are in `src/env.py` (replace the path for your leaderboard) and `src/about.py` (for tasks).
-
-Results files should have the following format and be stored as json files:
-```json
-{
-    "config": {
-        "model_dtype": "torch.float16", # or torch.bfloat16 or 8bit or 4bit
-        "model_name": "path of the model on the hub: org/model",
-        "model_sha": "revision on the hub",
-    },
-    "results": {
-        "task_name": {
-            "metric_name": score,
-        },
-        "task_name2": {
-            "metric_name": score,
-        }
-    }
-}
-```
-
-Request files are created automatically by this tool.
-
-If you encounter problems on the Space, don't hesitate to restart it to remove the created eval-queue, eval-queue-bk, eval-results, and eval-results-bk folders.
-
-# Code logic for more complex edits
-
-You'll find:
-- the main table's column names and properties in `src/display/utils.py`
-- the logic to read all results and request files, then convert them into dataframe rows, in `src/leaderboard/read_evals.py` and `src/populate.py`
-- the logic to allow or filter submissions in `src/submission/submit.py` and `src/submission/check_validity.py`
+# Generative AI Leaderboard for CRM
app.py CHANGED

@@ -57,7 +57,7 @@ def update_table(
     filtered_df = filter_use_case_func(filtered_df, use_case_query)
     filtered_df = filter_use_case_type_func(filtered_df, use_case_type_query)
     df = select_columns(filtered_df, columns)
-    return df
+    return df.style.map(highlight_cost_band_low, props="background-color: #b3d5a4")
 
 
 def update_cost_table(
@@ -65,11 +65,11 @@ def update_cost_table(
     columns: list,
     llm_query: list,
     llm_provider_query: list,
-
+    use_case_flavor_query: list,
 ):
     filtered_df = filter_llm_func(hidden_df, llm_query)
     filtered_df = filter_llm_provider_func(filtered_df, llm_provider_query)
-    filtered_df =
+    filtered_df = filter_use_case_flavor_func(filtered_df, use_case_flavor_query)
     df = select_columns_cost_table(filtered_df, columns)
     return df.style.map(highlight_cost_band_low, props="background-color: #b3d5a4")
 
@@ -183,6 +183,10 @@ def filter_use_case_type_func(df: pd.DataFrame, use_case_type_query: list) -> pd
     return df[df["Use Case Type"].isin(use_case_type_query)]
 
 
+def filter_use_case_flavor_func(df: pd.DataFrame, use_case_flavor_query: list) -> pd.DataFrame:
+    return df[df["Cost and Speed: Flavor"].isin(use_case_flavor_query)]
+
+
 def filter_llm_func(df: pd.DataFrame, llm_query: list) -> pd.DataFrame:
     return df[df["Model Name"].isin(llm_query)]
 
@@ -222,7 +226,7 @@ with demo:
     gr.Markdown(INTRODUCTION_TEXT, elem_classes="markdown-text")
 
     with gr.Tabs(elem_classes="tab-buttons") as tabs:
-        with gr.TabItem("🏅
+        with gr.TabItem("🏅 Leaderboard", elem_id="llm-benchmark-tab-table", id=0):
            with gr.Row():
                with gr.Column():
                    # with gr.Row():
@@ -455,7 +459,7 @@ with demo:
                    filter_use_case_type = gr.CheckboxGroup(
                        choices=["Long", "Short"],
                        value=["Long", "Short"],
-                        label="Use Case
+                        label="Use Case Flavor",
                        info="Output: 250 tokens, Long input: 3k tokens, Short input: 500 tokens",
                        interactive=True,
                    )
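Taken together, the app.py changes add a flavor dimension to the cost view: `update_cost_table` gains a `use_case_flavor_query` argument and a matching `filter_use_case_flavor_func`. Below is a minimal, self-contained sketch (not the Space's actual wiring; the `toy_df` rows are invented for illustration) showing how the new filter composes with the existing model filter on a plain DataFrame, using the column names from the diff above.

```python
import pandas as pd

def filter_llm_func(df: pd.DataFrame, llm_query: list) -> pd.DataFrame:
    return df[df["Model Name"].isin(llm_query)]

def filter_use_case_flavor_func(df: pd.DataFrame, use_case_flavor_query: list) -> pd.DataFrame:
    return df[df["Cost and Speed: Flavor"].isin(use_case_flavor_query)]

# Hypothetical rows mimicking hf_leaderboard_latency_cost.csv columns.
toy_df = pd.DataFrame(
    {
        "Model Name": ["AI21 Jamba-Instruct", "AI21 Jamba-Instruct", "Claude 3 Haiku"],
        "Cost and Speed: Flavor": ["Long", "Short", "Long"],
        "Mean Cost per 1K Requests": [1.6, 0.5, 1.0],
    }
)

# Selecting one model and only the "Short" flavor, as the CheckboxGroup would.
filtered = filter_use_case_flavor_func(
    filter_llm_func(toy_df, ["AI21 Jamba-Instruct"]), ["Short"]
)
print(filtered)  # one row: AI21 Jamba-Instruct / Short / 0.5
```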
crm-results/hf_leaderboard_flavor_mapping.csv ADDED

@@ -0,0 +1,12 @@
+Use Case Name,Use Case Type,Cost and Speed: Flavor
+Service: Conversation summary,Summary,Short
+Service: Reply Recommendations,Generation,Short
+Sales: Email Generation,Generation,Short
+Sales & Service: Update CRM Info,Generation,Long
+Service: Call Summary,Summary,Long
+Sales: Call Summary,Summary,Long
+Service: Live Chat Insights,Summary,Short
+Service: Live Chat Summary,Summary,Long
+Service: Email Summary,Summary,Long
+Service: Knowledge creation from Case Info,Generation,Long
+Sales: Email Summary,Summary,Long
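Because `src/populate.py` joins the cost table on the (`Model Name`, `Cost and Speed: Flavor`) pair, each use case in this mapping should resolve to exactly one flavor. A quick sanity check along those lines (an illustrative script, not part of the commit):

```python
import pandas as pd

mapping = pd.read_csv("crm-results/hf_leaderboard_flavor_mapping.csv")

# Every use case should appear once, with a flavor drawn from the two known values.
assert mapping["Use Case Name"].is_unique
assert set(mapping["Cost and Speed: Flavor"]) <= {"Long", "Short"}

print(mapping["Cost and Speed: Flavor"].value_counts())  # Long: 7, Short: 4
```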
crm-results/hf_leaderboard_latency_cost.csv CHANGED

@@ -1,4 +1,4 @@
-Model Name,
+Model Name,Cost and Speed: Flavor,Version,Platform,Response Time (Sec),Mean Output Tokens,Mean Cost per 1K Requests,Cost Band,,Model id,Cost per 1m input tokens,Cost per 1m output tokens,,,,Percentile,From,To,,min,Max
 AI21 Jamba-Instruct,Long,,AI21,4.0,232.9,1.6,Medium,,GPT 3.5 Turbo,0.5,1.5,,,0%,0.43,0.43,1.61,,0.43,61.11
 AI21 Jamba-Instruct,Short,,AI21,4.0,243.9,0.5,Low,,GPT 4 Turbo,10,30,,,33%,1.61,1.61,9.28,,,
 Claude 3 Haiku,Long,,Bedrock,2.8,236.9,1.0,Low,,GPT4-o,5,15,,,67%,9.28,9.28,61.11,,,
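The new header also names the per-million-token prices of the reference models used for the cost percentiles. For orientation only, here is one plausible way a per-1K-requests cost can be derived from such prices; the token counts come from the UI hint in app.py ("Output: 250 tokens, Long input: 3k tokens"), and the formula is an assumption rather than something stated in this commit.

```python
# Hedged sketch: one plausible derivation of a "Mean Cost per 1K Requests" figure
# from per-million-token prices. The token counts are illustrative assumptions.
def cost_per_1k_requests(input_tokens: float, output_tokens: float,
                         price_in_per_1m: float, price_out_per_1m: float) -> float:
    per_request = (input_tokens * price_in_per_1m + output_tokens * price_out_per_1m) / 1_000_000
    return per_request * 1_000

# Example: a "Long" flavor request (~3k input tokens, ~250 output tokens)
# priced at the GPT 3.5 Turbo rates listed in the CSV ($0.5 / $1.5 per 1M tokens).
print(round(cost_per_1k_requests(3_000, 250, 0.5, 1.5), 2))  # ≈ 1.88
```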
src/about.py CHANGED

@@ -1,26 +1,3 @@
-from dataclasses import dataclass
-from enum import Enum
-
-
-@dataclass
-class Task:
-    benchmark: str
-    metric: str
-    col_name: str
-
-
-# Select your tasks here
-# ---------------------------------------------------
-class Tasks(Enum):
-    # task_key in the json file, metric_key in the json file, name to display in the leaderboard
-    task0 = Task("anli_r1", "acc", "ANLI")
-    task1 = Task("logiqa", "acc_norm", "LogiQA")
-
-
-NUM_FEWSHOT = 0 # Change with your few shot
-# ---------------------------------------------------
-
-
 # Your leaderboard name
 TITLE = """<h1 align="center" id="space-title">Generative AI Leaderboard for CRM</h1>
 <h3>Assess which LLMs are accurate enough or need fine-tuning, and weigh this versus tradeoffs of speed, costs, and trust and safety. This is based on human manual and automated evaluation with real operational CRM data per use case.</h3>
src/display/utils.py CHANGED

@@ -1,10 +1,7 @@
 from dataclasses import dataclass, make_dataclass
-from enum import Enum
 
 import pandas as pd
 
-from src.about import Tasks
-
 
 def fields(raw_class):
     return [v for k, v in raw_class.__dict__.items() if k[:2] != "__" and k[-2:] != "__"]
@@ -29,9 +26,8 @@ auto_eval_column_dict.append(
     ["model", ColumnContent, ColumnContent("Model Name", "markdown", True, never_hidden=True)]
 )
 auto_eval_column_dict.append(["model_provider", ColumnContent, ColumnContent("LLM Provider", "markdown", True)])
-auto_eval_column_dict.append(["use_case_name", ColumnContent, ColumnContent("Use Case Name", "markdown",
+auto_eval_column_dict.append(["use_case_name", ColumnContent, ColumnContent("Use Case Name", "markdown", True)])
 auto_eval_column_dict.append(["use_case_type", ColumnContent, ColumnContent("Use Case Type", "markdown", False)])
-
 auto_eval_column_dict.append(["accuracy_method", ColumnContent, ColumnContent("Accuracy Method", "markdown", False)])
 # Accuracy metrics
 auto_eval_column_dict.append(["accuracy_metric_average", ColumnContent, ColumnContent("Accuracy", "markdown", True)])
@@ -51,6 +47,14 @@ auto_eval_column_dict.append(
 auto_eval_column_dict.append(
     ["accuracy_metric_factuality", ColumnContent, ColumnContent("Factuality", "markdown", True)]
 )
+# auto_eval_column_dict.append(
+#     ["use_case_flavor", ColumnContent, ColumnContent("Cost and Speed: Flavor", "markdown", False)]
+# )
+auto_eval_column_dict.append(["latency", ColumnContent, ColumnContent("Response Time (Sec)", "markdown", True)])
+auto_eval_column_dict.append(
+    ["mean_output_tokens", ColumnContent, ColumnContent("Mean Output Tokens", "markdown", True)]
+)
+auto_eval_column_dict.append(["cost_band", ColumnContent, ColumnContent("Cost Band", "markdown", True)])
 # We use make dataclass to dynamically fill the scores from Tasks
 AutoEvalColumn = make_dataclass("AutoEvalColumn", auto_eval_column_dict, frozen=True)
 
@@ -62,10 +66,10 @@ cost_eval_column_dict.append(
     ["model", ColumnContent, ColumnContent("Model Name", "markdown", True, never_hidden=True)]
 )
 cost_eval_column_dict.append(["model_provider", ColumnContent, ColumnContent("LLM Provider", "markdown", True)])
-cost_eval_column_dict.append(["use_case_type", ColumnContent, ColumnContent("Use Case Type", "markdown", True)])
 cost_eval_column_dict.append(
-    ["
+    ["use_case_flavor", ColumnContent, ColumnContent("Cost and Speed: Flavor", "markdown", True)]
 )
+cost_eval_column_dict.append(["latency", ColumnContent, ColumnContent("Response Time (Sec)", "markdown", True)])
 cost_eval_column_dict.append(
     ["mean_output_tokens", ColumnContent, ColumnContent("Mean Output Tokens", "markdown", True)]
 )
@@ -85,96 +89,6 @@ ts_eval_column_dict.append(["crm_bias", ColumnContent, ColumnContent("CRM Bias",
 # ts_eval_column_dict.append(["bias_no_ci", ColumnContent, ColumnContent("Bias No CI", "markdown", True)])
 TSEvalColumn = make_dataclass("TSEvalColumn", ts_eval_column_dict, frozen=True)
 
-
-# Scores
-# auto_eval_column_dict.append(["average", ColumnContent, ColumnContent("Average ⬆️", "number", True)])
-# for task in Tasks:
-#     auto_eval_column_dict.append([task.name, ColumnContent, ColumnContent(task.value.col_name, "number", True)])
-# Model information
-# auto_eval_column_dict.append(["model_type", ColumnContent, ColumnContent("Type", "str", False)])
-# auto_eval_column_dict.append(["architecture", ColumnContent, ColumnContent("Architecture", "str", False)])
-# auto_eval_column_dict.append(["weight_type", ColumnContent, ColumnContent("Weight type", "str", False, True)])
-# auto_eval_column_dict.append(["precision", ColumnContent, ColumnContent("Precision", "str", False)])
-# auto_eval_column_dict.append(["license", ColumnContent, ColumnContent("Hub License", "str", False)])
-# auto_eval_column_dict.append(["params", ColumnContent, ColumnContent("#Params (B)", "number", False)])
-# auto_eval_column_dict.append(["likes", ColumnContent, ColumnContent("Hub ❤️", "number", False)])
-# auto_eval_column_dict.append(["still_on_hub", ColumnContent, ColumnContent("Available on the hub", "bool", False)])
-# auto_eval_column_dict.append(["revision", ColumnContent, ColumnContent("Model sha", "str", False, False)])
-
-
-## For the queue columns in the submission tab
-@dataclass(frozen=True)
-class EvalQueueColumn: # Queue column
-    model = ColumnContent("model", "markdown", True)
-    revision = ColumnContent("revision", "str", True)
-    private = ColumnContent("private", "bool", True)
-    precision = ColumnContent("precision", "str", True)
-    weight_type = ColumnContent("weight_type", "str", "Original")
-    status = ColumnContent("status", "str", True)
-
-
-## All the model information that we might need
-@dataclass
-class ModelDetails:
-    name: str
-    display_name: str = ""
-    symbol: str = "" # emoji
-
-
-class ModelType(Enum):
-    PT = ModelDetails(name="pretrained", symbol="🟢")
-    FT = ModelDetails(name="fine-tuned", symbol="🔶")
-    IFT = ModelDetails(name="instruction-tuned", symbol="⭕")
-    RL = ModelDetails(name="RL-tuned", symbol="🟦")
-    Unknown = ModelDetails(name="", symbol="?")
-
-    def to_str(self, separator=" "):
-        return f"{self.value.symbol}{separator}{self.value.name}"
-
-    @staticmethod
-    def from_str(type):
-        if "fine-tuned" in type or "🔶" in type:
-            return ModelType.FT
-        if "pretrained" in type or "🟢" in type:
-            return ModelType.PT
-        if "RL-tuned" in type or "🟦" in type:
-            return ModelType.RL
-        if "instruction-tuned" in type or "⭕" in type:
-            return ModelType.IFT
-        return ModelType.Unknown
-
-
-class WeightType(Enum):
-    Adapter = ModelDetails("Adapter")
-    Original = ModelDetails("Original")
-    Delta = ModelDetails("Delta")
-
-
-class Precision(Enum):
-    float16 = ModelDetails("float16")
-    bfloat16 = ModelDetails("bfloat16")
-    float32 = ModelDetails("float32")
-    # qt_8bit = ModelDetails("8bit")
-    # qt_4bit = ModelDetails("4bit")
-    # qt_GPTQ = ModelDetails("GPTQ")
-    Unknown = ModelDetails("?")
-
-    def from_str(precision):
-        if precision in ["torch.float16", "float16"]:
-            return Precision.float16
-        if precision in ["torch.bfloat16", "bfloat16"]:
-            return Precision.bfloat16
-        if precision in ["float32"]:
-            return Precision.float32
-        # if precision in ["8bit"]:
-        #     return Precision.qt_8bit
-        # if precision in ["4bit"]:
-        #     return Precision.qt_4bit
-        # if precision in ["GPTQ", "None"]:
-        #     return Precision.qt_GPTQ
-        return Precision.Unknown
-
-
 # Column selection
 COLS = [c.name for c in fields(AutoEvalColumn) if not c.hidden]
 TYPES = [c.type for c in fields(AutoEvalColumn) if not c.hidden]
@@ -187,10 +101,7 @@ COST_TYPES = [c.type for c in fields(CostEvalColumn) if not c.hidden]
 TS_COLS = [c.name for c in fields(TSEvalColumn) if not c.hidden]
 TS_TYPES = [c.type for c in fields(TSEvalColumn) if not c.hidden]
 
-
-EVAL_TYPES = [c.type for c in fields(EvalQueueColumn)]
-
-BENCHMARK_COLS = [t.value.col_name for t in Tasks]
+# BENCHMARK_COLS = [t.value.col_name for t in Tasks]
 
 NUMERIC_INTERVALS = {
     "?": pd.Interval(-1, 0, closed="right"),
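For readers unfamiliar with the pattern used throughout this file: column definitions are accumulated as `[field_name, type, default]` triples and then turned into a frozen dataclass with `make_dataclass`, which is what lets `COLS`/`TYPES` be derived by reflection over the class attributes. Below is a stand-alone sketch of that mechanism; the `ColumnContent` fields mirror the standard leaderboard template and are an assumption about this repo, and `DemoEvalColumn` is a made-up name.

```python
from dataclasses import dataclass, make_dataclass

@dataclass
class ColumnContent:
    name: str
    type: str
    displayed_by_default: bool
    hidden: bool = False
    never_hidden: bool = False

cols = []
cols.append(["model", ColumnContent, ColumnContent("Model Name", "markdown", True, never_hidden=True)])
cols.append(["cost_band", ColumnContent, ColumnContent("Cost Band", "markdown", True)])

# make_dataclass builds the class at runtime from (name, type, default) triples.
DemoEvalColumn = make_dataclass("DemoEvalColumn", cols, frozen=True)

def fields(raw_class):
    # Same helper as in utils.py: pick up the non-dunder class attributes,
    # which are the ColumnContent defaults set above.
    return [v for k, v in raw_class.__dict__.items() if k[:2] != "__" and k[-2:] != "__"]

print([c.name for c in fields(DemoEvalColumn) if not c.hidden])  # ['Model Name', 'Cost Band']
```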
src/populate.py CHANGED

@@ -2,26 +2,35 @@ import os
 
 import pandas as pd
 
+from src.display.utils import AutoEvalColumn
+
 
 def get_leaderboard_df_crm(
     crm_results_path: str, accuracy_cols: list, cost_cols: list
 ) -> tuple[pd.DataFrame, pd.DataFrame]:
     """Creates a dataframe from all the individual experiment results"""
+    use_case_flavor_mapping_df = pd.read_csv(os.path.join(crm_results_path, "hf_leaderboard_flavor_mapping.csv"))
     sf_finetuned_models = ["SF-TextBase 70B", "SF-TextBase 7B", "SF-TextSum"]
+    # sf_finetuned_models = []
 
     leaderboard_accuracy_df = pd.read_csv(os.path.join(crm_results_path, "hf_leaderboard_accuracy.csv"))
     leaderboard_accuracy_df = leaderboard_accuracy_df[~leaderboard_accuracy_df["Model Name"].isin(sf_finetuned_models)]
-
-
-
-
+
+    leaderboard_accuracy_df = leaderboard_accuracy_df.join(
+        use_case_flavor_mapping_df[["Use Case Name", "Cost and Speed: Flavor"]].set_index("Use Case Name"),
+        on="Use Case Name",
+    )
 
     ref_df = leaderboard_accuracy_df[["Model Name", "LLM Provider"]].drop_duplicates()
 
     leaderboard_cost_df = pd.read_csv(os.path.join(crm_results_path, "hf_leaderboard_latency_cost.csv"))
     leaderboard_cost_df = leaderboard_cost_df[~leaderboard_cost_df["Model Name"].isin(sf_finetuned_models)]
+    leaderboard_accuracy_df = leaderboard_accuracy_df.join(
+        leaderboard_cost_df.set_index(["Model Name", "Cost and Speed: Flavor"]),
+        on=["Model Name", "Cost and Speed: Flavor"],
+    )
+
     leaderboard_cost_df = leaderboard_cost_df.join(ref_df.set_index("Model Name"), on="Model Name")
-    # leaderboard_cost_df["LLM Provider"] = leaderboard_cost_df["LLM Provider"].fillna("Google")
     leaderboard_cost_df = leaderboard_cost_df[cost_cols].round(decimals=2)
 
     leaderboard_ts_df = pd.read_csv(os.path.join(crm_results_path, "hf_leaderboard_ts.csv"))
@@ -29,7 +38,6 @@ def get_leaderboard_df_crm(
     leaderboard_ts_df = leaderboard_ts_df[~leaderboard_ts_df["Model Name"].isin(sf_finetuned_models)]
     leaderboard_ts_df = leaderboard_ts_df.join(ref_df.set_index("Model Name"), on="Model Name")
     leaderboard_ts_df = leaderboard_ts_df.join(leaderboard_ts__crm_bias_df.set_index("Model Name"), on="Model Name")
-    # leaderboard_ts_df["LLM Provider"] = leaderboard_ts_df["LLM Provider"].fillna("Google")
     privacy_cols = leaderboard_ts_df[
         [
             "Privacy Zero-Shot Match Avoidance",
@@ -52,4 +60,8 @@ def get_leaderboard_df_crm(
     ].apply(lambda x: x.str.rstrip("%").astype("float") / 100.0, axis=1)
     leaderboard_ts_df["Trust & Safety"] = ts_cols.mean(axis=1).transform(lambda x: "{:,.2%}".format(x))
 
+    leaderboard_accuracy_df = leaderboard_accuracy_df.sort_values(
+        by=[AutoEvalColumn.accuracy_metric_average.name], ascending=False
+    )
+    leaderboard_accuracy_df = leaderboard_accuracy_df[accuracy_cols].round(decimals=2)
     return leaderboard_accuracy_df, leaderboard_cost_df, leaderboard_ts_df
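The net effect of the populate.py change is a two-step join: first attach a flavor to every accuracy row via the use-case mapping, then pull the cost/latency figures in on the composite (model, flavor) key. A minimal sketch of that logic with toy values (illustrative data, not leaderboard results):

```python
import pandas as pd

# Toy stand-ins for the three CSVs used by get_leaderboard_df_crm.
accuracy = pd.DataFrame({
    "Model Name": ["Claude 3 Haiku", "Claude 3 Haiku"],
    "Use Case Name": ["Service: Call Summary", "Sales: Email Generation"],
    "Accuracy": [0.91, 0.88],
})
flavor_mapping = pd.DataFrame({
    "Use Case Name": ["Service: Call Summary", "Sales: Email Generation"],
    "Cost and Speed: Flavor": ["Long", "Short"],
})
cost = pd.DataFrame({
    "Model Name": ["Claude 3 Haiku", "Claude 3 Haiku"],
    "Cost and Speed: Flavor": ["Long", "Short"],
    "Mean Cost per 1K Requests": [1.0, 0.3],
})

# Step 1: attach the flavor to each accuracy row via its use case.
accuracy = accuracy.join(
    flavor_mapping.set_index("Use Case Name"), on="Use Case Name"
)
# Step 2: join cost/latency on the composite (model, flavor) key.
accuracy = accuracy.join(
    cost.set_index(["Model Name", "Cost and Speed: Flavor"]),
    on=["Model Name", "Cost and Speed: Flavor"],
)
print(accuracy[["Use Case Name", "Cost and Speed: Flavor", "Mean Cost per 1K Requests"]])
```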