from typing import Literal

import pandas as pd
from varco_arena_core.prompts import load_prompt
from varco_arena_core.tracking_utils import pricing


def calculate(
    dataset_df: pd.DataFrame = None,
    model_name: str = None,
    matching_method: Literal["tournament", "league"] = None,
    evalprompt: str = None,
):
    """Estimate the total number of matches, prompt tokens, completion tokens,
    and USD cost of judging `dataset_df` with `model_name` under the given
    matching method and evaluation prompt.

    Returns:
        (total_num_matches, total_num_prompt_tokens, total_num_completion_tokens, total_cost)
    """
    # rows belonging to the same tournament share a "tournament_idx" value,
    # assigned in data_utils.py:load_all_data()
    df = dataset_df
    # estimate the total number of required tokens
    """
    total_toks  = num_matches * avg_num_toks
                = num_matches * (prompt + output)
                = num_matches * ((inst + src + gen_a + gen_b) + output)

    if matching_method == "tournament":
        num_matches = (n_participants - 1) * n_testset
    elif matching_method == "league":
        num_matches = combination(n_participants, 2) * n_testset
    """
    # n_testset, n_models
    n_testset = len(df.tournament_idx.unique())
    n_models = len(df.model_id.unique())

    # number of matches per test instance
    n_participants = n_models
    if matching_method == "tournament":
        num_matches = n_participants - 1
    elif matching_method == "league":
        num_matches = n_participants * (n_participants - 1) // 2  # C(n_participants, 2)
    else:
        raise ValueError(
            f"{matching_method=} is undefined! Should be in [tournament, league]"
        )

    # load the prompt objects used below, one per (evalprompt, task) pair
    eval_task_2_prm = dict()
    tasks = df.task.unique().tolist()
    for task in tasks:
        eval_task_2_prm[f"{evalprompt}_{task}"] = load_prompt(evalprompt, task=task)

    # num_generated_tokens / model
    generateds = df.apply(
        lambda row: eval_task_2_prm[
            f"{evalprompt}_{row.task}"
        ].get_expected_max_tokens_w_room(
            model_name, room=1.01
        ),  # the prompt object picks its tokenizer based on `model_name`
        axis=1,
    )
    # assert len(generateds) == n_testset * n_models, f"{len(generateds)=}, {n_testset=}, {n_models=}"
    gen_tokens = generateds.sum() / n_models
    # num_queried_tokens / model
    """
    we don't know what model's output will proceeds to the finalist matches, so we need to approximate.

    let's use average of per-prompt LLM's generated tokens.
    """
    df["approximate_match_prompts"] = df.apply(
        lambda row: eval_task_2_prm[f"{evalprompt}_{row.task}"].complete_prompt(
            inst=row.instruction,
            src=row.source,
            out_a=row.generated,
            out_b=row.generated,
            task=row.task,
        ),
        axis=1,
    )  # assumes every LLM response appears equally often in match prompts (not realistic)

    query_tokens = (
        df.apply(
            lambda row: eval_task_2_prm[
                f"{evalprompt}_{row.task}"
            ].get_num_tokens_from_messages(row.approximate_match_prompts),
            axis=1,
        ).sum()
        / n_models
    )
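    # Illustrative (hypothetical counts): with 3 models and 2 test prompts whose
    # per-row match-prompt sizes are [900, 1000, 1100] and [500, 600, 700] tokens,
    # the sum is 4800 and query_tokens = 4800 / 3 = 1600, i.e. the model-averaged
    # prompt size summed over the test set.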

    # scale per-match estimates to totals over the whole run
    total_num_matches = n_testset * num_matches
    # query_tokens / gen_tokens are already summed over the test set (per model),
    # so multiplying by the per-instance match count gives run totals; num_matches
    # equals n_models - 1 for "tournament" and C(n_models, 2) for "league",
    # matching the formula in the docstring above
    total_num_prompt_tokens = query_tokens * num_matches
    total_num_completion_tokens = gen_tokens * num_matches

    total_cost = cost_in_usd(
        model_name, total_num_prompt_tokens, total_num_completion_tokens
    )

    return (
        total_num_matches,
        int(total_num_prompt_tokens),
        int(total_num_completion_tokens),
        total_cost,
    )
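
# Hypothetical usage sketch (the judge model name and evalprompt value below are
# illustrative, not taken from this repository's configs; the column names are the
# ones `calculate` reads above):
#
#   n_matches, n_prompt_toks, n_completion_toks, usd = calculate(
#       dataset_df=df,  # needs tournament_idx, model_id, task, instruction, source, generated
#       model_name="gpt-4o-mini",
#       matching_method="tournament",
#       evalprompt="llmbar",
#   )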


def num_tokens_from_messages(messages, tokenizer):
    """Approximate the token count of an OpenAI-style chat `messages` list,
    adding the per-message / per-name overhead tokens of the chat format."""
    if tokenizer is None:
        return 0

    tokens_per_message = 3
    tokens_per_name = 1

    num_tokens = 0
    for message in messages:
        num_tokens += tokens_per_message
        for key, value in message.items():
            num_tokens += len(tokenizer.encode(value))
            if key == "name":
                num_tokens += tokens_per_name
    num_tokens += 3  # every reply is primed with <|start|>assistant<|message|>
    return num_tokens
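
# Illustrative usage (messages follow the OpenAI chat format; tiktoken is used
# here only as an example tokenizer):
#
#   import tiktoken
#   tok = tiktoken.get_encoding("cl100k_base")
#   messages = [
#       {"role": "system", "content": "You are a strict pairwise judge."},
#       {"role": "user", "content": "Which response is better, (a) or (b)?"},
#   ]
#   num_tokens_from_messages(messages, tok)  # 3 per message + encoded values + 3 reply-primer tokens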


def num_tokens_from_string(string, tokenizer):
    if tokenizer is None:
        return 0
    return len(tokenizer.encode(string))


def cost_in_usd(model_name, num_prompt_tokens, num_completion_tokens):
    # models absent from the pricing dictionary are treated as costing nothing
    if model_name not in pricing:
        return 0.0

    # pricing rates are USD per 1M tokens; input and output are billed separately
    cost = (num_prompt_tokens / 1_000_000) * pricing[model_name]["input"] + (
        num_completion_tokens / 1_000_000
    ) * pricing[model_name]["output"]

    return cost
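
# Worked example with hypothetical rates (not taken from `pricing`): if a model
# were billed at $5.00 / 1M input tokens and $15.00 / 1M output tokens, then
# 2,000,000 prompt tokens and 500,000 completion tokens would cost
#   (2_000_000 / 1_000_000) * 5.00 + (500_000 / 1_000_000) * 15.00 = 10.00 + 7.50 = 17.50 USD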