from typing import Dict, List, Literal

import pandas as pd

from varco_arena_core.prompts import load_prompt
from varco_arena_core.tracking_utils import pricing


def calculate(
    dataset_df: pd.DataFrame = None,
    model_name: str = None,
    matching_method: Literal["tournament", "league"] = None,
    evalprompt: str = None,
):
    # rows from the same tournament share a "tournament_idx" hash, assigned in
    # data_utils.py:load_all_data()
    df = dataset_df

    # estimate the total number of required tokens
    """
    total_toks = num_matches * avg_num_toks
               = num_matches * (prompt + output)
               = num_matches * ((inst + src + gen_a + gen_b) + output)

    if "tournament":
        num_matches = (n_participants - 1) * n_testset
    elif "league":
        num_matches = combination(n_participants, 2) * n_testset
    """
    # number of test items (tournaments) and participating models
    n_testset = len(df.tournament_idx.unique())
    n_models = len(df.model_id.unique())

    # matches per test item
    n_participants = n_models
    if matching_method == "tournament":
        num_matches = n_participants - 1
    elif matching_method == "league":
        num_matches = n_participants * (n_participants - 1) // 2
    else:
        raise ValueError(
            f"{matching_method=} is undefined! Should be in [tournament, league]"
        )
    # load the prompt objects used below
    eval_task_2_prm = dict()
    tasks = df.task.unique().tolist()
    for task in tasks:
        eval_task_2_prm[f"{evalprompt}_{task}"] = load_prompt(evalprompt, task=task)

    # num_generated_tokens / model
    generateds = df.apply(
        lambda row: eval_task_2_prm[
            f"{evalprompt}_{row.task}"
        ].get_expected_max_tokens_w_room(
            model_name, room=1.01
        ),  # here, the prompt object picks its tokenizer from `model_name`
        axis=1,
    )
    # assert len(generateds) == n_testset * n_models, f"{len(generateds)=}, {n_testset=}, {n_models=}"
    gen_tokens = generateds.sum() / n_models
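    # NB: `generateds` holds one value per (test item, model) row, so dividing the sum
    # by n_models leaves the expected completion budget of a single model over the
    # whole test set.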
    # num_queried_tokens / model
    """
    We don't know which model's output will proceed to the finalist matches, so we
    approximate by using the average of the per-prompt LLM-generated tokens.
    """
df["approximate_match_prompts"] = df.apply(
lambda row: eval_task_2_prm[f"{evalprompt}_{row.task}"].complete_prompt(
inst=row.instruction,
src=row.source,
out_a=row.generated,
out_b=row.generated,
task=row.task,
),
axis=1,
) # all the llm responses appears uniformly (not realistic)
query_tokens = (
df.apply(
lambda row: eval_task_2_prm[
f"{evalprompt}_{row.task}"
].get_num_tokens_from_messages(row.approximate_match_prompts),
axis=1,
).sum()
/ n_models
)
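    # NB: as with `gen_tokens`, dividing by n_models leaves the prompt-token total for
    # one model's worth of rows, i.e. roughly the per-item average prompt size summed
    # over the test set.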
    # to total tokens: scale the per-model totals by (n_models - 1), the number of
    # matches each test item spawns under tournament matching
    total_num_matches = n_testset * num_matches
    total_num_prompt_tokens = query_tokens * (n_models - 1)
    total_num_completion_tokens = gen_tokens * (n_models - 1)
    total_cost = cost_in_usd(
        model_name, total_num_prompt_tokens, total_num_completion_tokens
    )

    return (
        total_num_matches,
        int(total_num_prompt_tokens),
        int(total_num_completion_tokens),
        total_cost,
    )


def num_tokens_from_messages(messages, tokenizer):
    if tokenizer is None:
        return 0

    tokens_per_message = 3
    tokens_per_name = 1
    num_tokens = 0
    for message in messages:
        num_tokens += tokens_per_message
        for key, value in message.items():
            num_tokens += len(tokenizer.encode(value))
            if key == "name":
                num_tokens += tokens_per_name
    num_tokens += 3  # every reply is primed with <|start|>assistant<|message|>
    return num_tokens
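

# Usage sketch for the counter above (hypothetical messages; `tiktoken` is optional
# and any tokenizer exposing `.encode(str) -> list` works, while passing None simply
# returns 0):
#
#     import tiktoken
#     enc = tiktoken.get_encoding("cl100k_base")
#     msgs = [
#         {"role": "system", "content": "You are a pairwise judge."},
#         {"role": "user", "content": "Which response is better, A or B?"},
#     ]
#     num_tokens_from_messages(msgs, enc)  # tokens of every field value + 3 per message + 3 priming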


def num_tokens_from_string(string, tokenizer):
    if tokenizer is None:
        return 0
    return len(tokenizer.encode(string))


def cost_in_usd(model_name, num_prompt_tokens, num_completion_tokens):
    # Check if the provided model is in the pricing dictionary
    if model_name not in pricing:
        return 0.0

    # Calculate the cost in USD for input and output tokens separately
    cost = (num_prompt_tokens / 1_000_000) * pricing[model_name]["input"] + (
        num_completion_tokens / 1_000_000
    ) * pricing[model_name]["output"]
    return cost
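

if __name__ == "__main__":
    # Minimal smoke test for the cost helper. The model name and token counts are
    # hypothetical; a model missing from `pricing` simply falls back to 0.0 USD.
    demo_prompt_tokens = 360_000
    demo_completion_tokens = 90_000
    print(cost_in_usd("gpt-4o-mini", demo_prompt_tokens, demo_completion_tokens))
    # e.g. at (hypothetical) $0.15 / $0.60 per 1M input / output tokens this is
    # 0.36 * 0.15 + 0.09 * 0.60 = 0.108 USD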