Spaces:
Running
Running
Jae-Won Chung
committed on
Commit
·
8b30258
1
Parent(s):
cdc3f99
Default for the app
Browse files
- LEADERBOARD.md +1 -1
- app.py +17 -10
- data/2023-06-17/schema.yaml +1 -1
LEADERBOARD.md
CHANGED
|
@@ -10,7 +10,7 @@ That is, when asked the same thing, different models answer in different lengths
|
|
| 10 |
|
| 11 |
- `gpu`: NVIDIA GPU model name. Note that NLP evaluation was only run once on our A40 GPUs, so this column only changes system-level measurements like latency and energy.
|
| 12 |
- `task`: Name of the task. See *Tasks* below for details.
|
| 13 |
-
- `
|
| 14 |
- `energy` (J): The average energy consumed by the model to generate a response.
|
| 15 |
- `nlp_average`: The arithmetic average of the NLP evaluation metrics we obtained. See *NLP evaluation metrics* below for details.
|
| 16 |
- `throughput` (token/s): The average number of tokens generated per second.
|
|
|
|
| 10 |
|
| 11 |
- `gpu`: NVIDIA GPU model name. Note that NLP evaluation was only run once on our A40 GPUs, so this column only changes system-level measurements like latency and energy.
|
| 12 |
- `task`: Name of the task. See *Tasks* below for details.
|
| 13 |
+
- `energy_eff`: Our definition of energy efficiency: Average NLP evaluation metric attained per Joule of energy.
|
| 14 |
- `energy` (J): The average energy consumed by the model to generate a response.
|
| 15 |
- `nlp_average`: The arithmetic average of the NLP evaluation metrics we obtained. See *NLP evaluation metrics* below for details.
|
| 16 |
- `throughput` (token/s): The average number of tokens generated per second.
|
app.py
CHANGED
|
@@ -35,7 +35,7 @@ class TableManager:
|
|
| 35 |
df["model"] = df["model"].apply(format_model_link)
|
| 36 |
|
| 37 |
# Sort by our 'energy efficiency' score.
|
| 38 |
-
df = df.sort_values(by="
|
| 39 |
|
| 40 |
# The full table where all the data are.
|
| 41 |
self.full_df = df
|
|
@@ -71,24 +71,24 @@ class TableManager:
|
|
| 71 |
if res_df.empty:
|
| 72 |
raise ValueError(f"No benchmark CSV files were read from {data_dir=}.")
|
| 73 |
|
| 74 |
-
df = pd.merge(res_df, df_score, on=["model"])
|
| 75 |
|
| 76 |
# Energy efficiency is defined as the amount of average NLP performance
|
| 77 |
# the model gets per Joule of energy.
|
| 78 |
-
df["
|
| 79 |
|
| 80 |
# Order columns.
|
| 81 |
columns = df.columns.to_list()
|
| 82 |
cols_to_order = ["model"]
|
| 83 |
cols_to_order.extend(self.schema.keys())
|
| 84 |
-
cols_to_order.extend(["
|
| 85 |
columns = cols_to_order + [col for col in columns if col not in cols_to_order]
|
| 86 |
df = df[columns]
|
| 87 |
|
| 88 |
# Delete rows with *any* NaN values.
|
| 89 |
df = df.dropna()
|
| 90 |
|
| 91 |
-
return df
|
| 92 |
|
| 93 |
def _format_msg(self, text: str) -> str:
|
| 94 |
"""Formats into HTML that prints in Monospace font."""
|
|
@@ -131,20 +131,27 @@ class TableManager:
|
|
| 131 |
return self.cur_df, self._format_msg(f"{verb} column '{column_name}'.")
|
| 132 |
|
| 133 |
def get_dropdown(self):
|
| 134 |
-
columns = self.full_df.columns.tolist()[1:]
|
| 135 |
return [
|
| 136 |
gr.Dropdown(value="gpu", choices=columns, label="X"),
|
| 137 |
gr.Dropdown(value="nlp_average", choices=columns, label="Y"),
|
| 138 |
-
gr.Dropdown(value="
|
| 139 |
]
|
| 140 |
|
| 141 |
def update_dropdown(self):
|
| 142 |
columns = self.full_df.columns.tolist()[1:]
|
| 143 |
-
|
| 144 |
-
|
|
|
|
|
|
|
|
|
|
| 145 |
|
| 146 |
def set_filter_get_df(self, *filters):
|
| 147 |
"""Set the current set of filters and return the filtered DataFrame."""
|
|
|
|
|
|
|
|
|
|
|
|
|
| 148 |
index = np.full(len(self.full_df), True)
|
| 149 |
for setup, choice in zip(self.schema, filters):
|
| 150 |
index = index & self.full_df[setup].isin(choice)
|
|
@@ -378,6 +385,6 @@ with block:
|
|
| 378 |
gr.Markdown(open("LEADERBOARD.md").read())
|
| 379 |
|
| 380 |
# Load the table on page load.
|
| 381 |
-
block.load(
|
| 382 |
|
| 383 |
block.launch()
|
|
|
|
| 35 |
df["model"] = df["model"].apply(format_model_link)
|
| 36 |
|
| 37 |
# Sort by our 'energy efficiency' score.
|
| 38 |
+
df = df.sort_values(by="energy_eff", ascending=False)
|
| 39 |
|
| 40 |
# The full table where all the data are.
|
| 41 |
self.full_df = df
|
|
|
|
| 71 |
if res_df.empty:
|
| 72 |
raise ValueError(f"No benchmark CSV files were read from {data_dir=}.")
|
| 73 |
|
| 74 |
+
df = pd.merge(res_df, df_score, on=["model"]).round(2)
|
| 75 |
|
| 76 |
# Energy efficiency is defined as the amount of average NLP performance
|
| 77 |
# the model gets per Joule of energy.
|
| 78 |
+
df["energy_eff"] = (df["nlp_average"] / df["energy"]).round(4)
|
| 79 |
|
| 80 |
# Order columns.
|
| 81 |
columns = df.columns.to_list()
|
| 82 |
cols_to_order = ["model"]
|
| 83 |
cols_to_order.extend(self.schema.keys())
|
| 84 |
+
cols_to_order.extend(["energy_eff", "energy", "nlp_average"])
|
| 85 |
columns = cols_to_order + [col for col in columns if col not in cols_to_order]
|
| 86 |
df = df[columns]
|
| 87 |
|
| 88 |
# Delete rows with *any* NaN values.
|
| 89 |
df = df.dropna()
|
| 90 |
|
| 91 |
+
return df
|
| 92 |
|
| 93 |
def _format_msg(self, text: str) -> str:
|
| 94 |
"""Formats into HTML that prints in Monospace font."""
|
|
|
|
| 131 |
return self.cur_df, self._format_msg(f"{verb} column '{column_name}'.")
|
| 132 |
|
| 133 |
def get_dropdown(self):
|
| 134 |
+
columns = self.full_df.columns.tolist()[1:]
|
| 135 |
return [
|
| 136 |
gr.Dropdown(value="gpu", choices=columns, label="X"),
|
| 137 |
gr.Dropdown(value="nlp_average", choices=columns, label="Y"),
|
| 138 |
+
gr.Dropdown(value="energy_eff", choices=["None", *columns], label="Z (optional)"),
|
| 139 |
]
|
| 140 |
|
| 141 |
def update_dropdown(self):
|
| 142 |
columns = self.full_df.columns.tolist()[1:]
|
| 143 |
+
return [
|
| 144 |
+
gr.Dropdown.update(choices=columns),
|
| 145 |
+
gr.Dropdown.update(choices=columns),
|
| 146 |
+
gr.Dropdown.update(choices=["None", *columns])),
|
| 147 |
+
]
|
| 148 |
|
| 149 |
def set_filter_get_df(self, *filters):
|
| 150 |
"""Set the current set of filters and return the filtered DataFrame."""
|
| 151 |
+
# If the filter is empty, we default to the first choice for each key.
|
| 152 |
+
if not filters:
|
| 153 |
+
filters = [choices[0] for choices in self.schema.values()]
|
| 154 |
+
|
| 155 |
index = np.full(len(self.full_df), True)
|
| 156 |
for setup, choice in zip(self.schema, filters):
|
| 157 |
index = index & self.full_df[setup].isin(choice)
|
|
|
|
| 385 |
gr.Markdown(open("LEADERBOARD.md").read())
|
| 386 |
|
| 387 |
# Load the table on page load.
|
| 388 |
+
block.load(TableManager.set_filter_get_df, input=tbm, outputs=dataframe)
|
| 389 |
|
| 390 |
block.launch()
|
data/2023-06-17/schema.yaml
CHANGED
|
@@ -1,2 +1,2 @@
|
|
| 1 |
-
gpu: ["
|
| 2 |
task: ["chat", "chat-concise", "instruct", "instruct-concise"]
|
|
|
|
| 1 |
+
gpu: ["A100", "A40"]
|
| 2 |
task: ["chat", "chat-concise", "instruct", "instruct-concise"]
|