Upload metrics.py with huggingface_hub
metrics.py CHANGED (+111 -34)
@@ -1879,6 +1879,7 @@ class BertScore(HuggingfaceBulkMetric):
     hf_metric_fields = ["f1", "precision", "recall"]
     ci_scores = ["f1", "precision", "recall"]
     model_name: str
+    model_layer: int = None
 
     prediction_type = "str"
 
@@ -1886,7 +1887,9 @@ class BertScore(HuggingfaceBulkMetric):
 
     def prepare(self):
         super().prepare()
-        self.hf_compute_args = {"model_type": self.model_name, "batch_size": …
+        self.hf_compute_args = {"model_type": self.model_name, "batch_size": 32}
+        if self.model_layer:
+            self.hf_compute_args["num_layers"] = self.model_layer
 
 
 class SentenceBert(BulkInstanceMetric):
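Note: `model_layer` is forwarded as `num_layers` to the underlying HF `bertscore` metric, which selects the encoder layer whose embeddings are scored. A minimal sketch of the equivalent direct call (the model name and layer below are illustrative, not taken from this diff):

import evaluate

bertscore = evaluate.load("bertscore")
result = bertscore.compute(
    predictions=["the cat sat on the mat"],
    references=["a cat was sitting on the mat"],
    model_type="microsoft/deberta-xlarge-mnli",  # <- model_name
    num_layers=40,                               # <- model_layer, when set
    batch_size=32,
)
print(result["precision"], result["recall"], result["f1"])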
@@ -1947,6 +1950,9 @@ class Reward(BulkInstanceMetric):
 
     model_name: str
 
+    prediction_type = "str"
+    single_reference_per_prediction = True
+
     _requirements_list: List[str] = ["transformers", "torch"]
 
     def prepare(self):
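Note: with `single_reference_per_prediction = True`, each instance is expected to pair one string prediction with exactly one string reference. Roughly (a sketch of the expected input shape, not unitxt's validation code):

# one prediction, one paired reference per instance
predictions = ["I like this answer"]
references = [["the question the reward model conditions on"]]
assert all(isinstance(p, str) for p in predictions)
assert all(len(refs) == 1 for refs in references)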
@@ -2141,9 +2147,13 @@ class Perplexity(BulkInstanceMetric):
     reduction_map = {"mean": ["perplexity"]}
     prediction_type = "str"
 
-    …
+    source_template: str
+    target_template: str
     batch_size: int = 32
     model_name: str
+    single_token_mode: bool = False
+
+    lm = None
 
     _requirements_list: List[str] = ["transformers", "torch"]
 
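Note: `source_template` and `target_template` are `{name}`-style patterns over `prediction` and `reference` (see `Template.apply` below), replacing the removed prompt field. A hypothetical instantiation, with an illustrative checkpoint and templates:

# assuming the Perplexity class from this file is importable
metric = Perplexity(
    model_name="google/flan-t5-small",  # illustrative checkpoint
    source_template="Generate a question based on the given content: {reference}",
    target_template="{prediction}",
)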
@@ -2160,24 +2170,41 @@ class Perplexity(BulkInstanceMetric):
 
         :return: the likelihood of generating text Y_i after each text X_i_j = P(Y_i|X_i_1), ..., P(Y_i|X_i_n) for every i.
         """
+        if self.lm is None:
+            from transformers import AutoConfig
+
+            config = AutoConfig.from_pretrained(self.model_name, trust_remote_code=True)
+            self.lm = (
+                self.EncoderDecoderLM(
+                    model_name=self.model_name, single_token_mode=self.single_token_mode
+                )
+                if config.is_encoder_decoder is True
+                else self.DecoderOnlyLM(
+                    model_name=self.model_name, single_token_mode=self.single_token_mode
+                )
+            )
+
         sources = []
         targets = []
         for prediction, instance_references in zip(predictions, references):
             for instance_reference in instance_references:
-                sources.append(
-                    …
+                sources.append(
+                    self.Template.apply(
+                        self.source_template,
+                        prediction=prediction,
+                        reference=instance_reference,
+                    )
+                )
+                targets.append(
+                    self.Template.apply(
+                        self.target_template,
+                        prediction=prediction,
+                        reference=instance_reference,
+                    )
+                )
 
         # compute P(Q|P) and store in queue
-        scores = lm.compute_lm(
+        scores = self.lm.compute_lm(
             source=sources, target=targets, batch_size=self.batch_size
         )
 
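Note: the lazy `self.lm` initialization dispatches on the checkpoint's config rather than on its class name, so any architecture that `AutoConfig` can resolve picks the right wrapper. For instance (model names illustrative):

from transformers import AutoConfig

AutoConfig.from_pretrained("google/flan-t5-small").is_encoder_decoder  # True  -> EncoderDecoderLM
AutoConfig.from_pretrained("gpt2").is_encoder_decoder                  # False -> DecoderOnlyLM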
@@ -2200,8 +2227,25 @@ class Perplexity(BulkInstanceMetric):
 
         return all_instances_scores
 
+    class Template:
+        regex = re.compile(r"\{(\w+)}")
+
+        @classmethod
+        def apply(cls, template, **kwargs):
+            matches = Perplexity.Template.regex.finditer(template)
+            output = []
+            cursor = 0
+            for match in matches:
+                start = match.start()
+                end = match.end()
+                output.append(template[cursor:start])
+                output.append(kwargs[match.group(1)])
+                cursor = end
+            output.append(template[cursor:])
+            return "".join(output)
+
     class AbstractLM(ABC):
-        def __init__(self, model_name):
+        def __init__(self, model_name, single_token_mode):
             import torch
             from transformers import AutoTokenizer
 
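Note: `Template.apply` is a minimal `{name}` substitution: the regex captures each placeholder name, the loop copies the literal text between placeholders through verbatim, and a missing kwarg fails loudly with a `KeyError`. A standalone re-implementation to illustrate the behavior (a sketch, not the diff's code verbatim):

import re

regex = re.compile(r"\{(\w+)}")

def apply(template, **kwargs):
    output, cursor = [], 0
    for match in regex.finditer(template):
        output.append(template[cursor:match.start()])  # literal text before the placeholder
        output.append(kwargs[match.group(1)])          # the substituted value
        cursor = match.end()
    output.append(template[cursor:])                   # trailing literal text
    return "".join(output)

assert apply("answer: {prediction}", prediction="yes", reference="") == "answer: yes"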
@@ -2211,6 +2255,7 @@ class Perplexity(BulkInstanceMetric):
                 self.model_class().from_pretrained(self.model_name).to(self.device)
             )
             self.tokenizer = AutoTokenizer.from_pretrained(self.model_name)
+            self.single_token_mode = single_token_mode
 
         def compute_lm(
             self, source: List[str], target: List[str], batch_size: int
@@ -2232,7 +2277,10 @@ class Perplexity(BulkInstanceMetric):
                     batch_source, padding=True, return_tensors="pt"
                 )
                 tokens_target = self.tokenizer(
-                    batch_target, …
+                    batch_target,
+                    padding=True,
+                    return_tensors="pt",
+                    add_special_tokens=not self.single_token_mode,
                 )
 
                 # compute the logits
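Note: `add_special_tokens=not self.single_token_mode` means that in single-token mode the target is tokenized bare, without the tokenizer's EOS/BOS markers, so a one-word target really is scored as a single token. With a T5-style tokenizer, for example (checkpoint illustrative):

from transformers import AutoTokenizer

tok = AutoTokenizer.from_pretrained("t5-small")
with_special = tok("yes", add_special_tokens=True).input_ids   # word token + trailing </s>
bare = tok("yes", add_special_tokens=False).input_ids          # word token only
assert len(with_special) == len(bare) + 1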
@@ -3353,7 +3401,7 @@ class BinaryMaxAccuracy(GlobalMetric):
     def compute(
         self,
         references: List[List[str]],
-        predictions: List[…
+        predictions: List[str],
         task_data: List[Dict],
     ) -> dict:
         float_predictions = [to_float_or_default(p) for p in predictions]
@@ -3361,24 +3409,53 @@ class BinaryMaxAccuracy(GlobalMetric):
             ["1"] if r[0].lower() in self.pos_classes else ["0"] for r in references
         ]
 
-        …
+        # Sticking to the test >= thr, the accuracy induced by threshold thr is the number of float
+        # predictions that pass the test (are >= thr) and are paired with reference "1", plus the
+        # number of float predictions that fail the test (are < thr) and are paired with reference "0".
+        # A threshold thr induces the same partition into passing and failing as thr', the smallest
+        # float prediction that passes the test of thr. Hence we only need to review thresholds that
+        # are themselves float predictions, plus one threshold larger than the largest float
+        # prediction, which induces the partition into all-failing / none-passing.
+
+        fp = [
+            (float_predictions[i], i, -1 if references[i][0] == "1" else +1)
+            for i in range(len(float_predictions))
+        ]
+        fp.sort()
+        # each triplet above holds: a float prediction f; f's ordinal position in float_predictions,
+        # which also keeps the triplets distinct; and the change in the number of predictions that
+        # the test sends to the reference they are paired with, implied by a move of thr that
+        # transfers f from the set passing the test to the set failing it.
+
+        rightmost_thr = 1.0 if fp[-1][0] < 1 else fp[-1][0] + 0.01
+        # trying to be aesthetic, keep the threshold within [0,1], although this is not a
+        # requirement, and even the float predictions are not guaranteed to be within [0,1]
+
+        current_thr = fp[0][0]
+        # partitions float_predictions into all-passing / none-failing
+        current_acc = sum(r[0] == "1" for r in references)
+        # the number of predictions that thr sends to the reference they are paired with
+
+        best_acc = current_acc
+        best_thr = current_thr
+
+        i = 0
+        while (i < len(predictions)) and (best_acc < len(predictions)):
+            # best_acc cannot exceed len(predictions)
+            delta = fp[i][2]
+            i += 1
+            while i < len(predictions) and fp[i][0] <= fp[i - 1][0]:
+                delta += fp[i][2]
+                i += 1
+            current_acc += delta
+            if current_acc > best_acc:
+                best_acc = current_acc
+                best_thr = fp[i][0] if i < len(predictions) else rightmost_thr
 
-        return {…
+        return {
+            self.main_score: float(best_acc) / len(predictions),
+            "best_thr_max_acc": best_thr,
+        }
 
 
 ######################
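Note: the scan above visits each sorted prediction once, so the accuracy-maximizing threshold is found in O(n log n) overall, dominated by the sort. A brute-force cross-check on a toy input (data illustrative):

float_predictions = [0.2, 0.9, 0.4, 0.7]
references = [["0"], ["1"], ["1"], ["1"]]

def acc(thr):
    # a prediction is counted when the >= thr test agrees with its reference
    return sum(
        (p >= thr) == (r[0] == "1")
        for p, r in zip(float_predictions, references)
    ) / len(float_predictions)

candidates = sorted(set(float_predictions)) + [1.0]  # mirrors rightmost_thr
best_thr = max(candidates, key=acc)
print(best_thr, acc(best_thr))  # 0.4 1.0 -- thr=0.4 classifies all four correctly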