Upload metrics.py with huggingface_hub
metrics.py CHANGED (+194 -5)
@@ -1,7 +1,7 @@
 import re
 import string
 import uuid
-from abc import abstractmethod
+from abc import ABC, abstractmethod
 from collections import Counter
 from dataclasses import field
 from typing import Any, Dict, Generator, List, Optional, Tuple
@@ -361,7 +361,7 @@ class BulkInstanceMetric(SingleStreamOperator, MetricWithConfidenceInterval):
         references: List[List[Any]],
         predictions: List[Any],
         additional_inputs: List[Dict],
-    ) -> Dict[str, Any]:
+    ) -> List[Dict[str, Any]]:
         pass
 
 
@@ -643,7 +643,6 @@ class HuggingfaceBulkMetric(BulkInstanceMetric):
         predictions: List[str],
         additional_inputs: List[Any],
     ) -> List[Dict[str, Any]]:
-        passed_additional_inputs = {}
         passed_additional_inputs = {}
         for additional_input_field in self.hf_additional_input_fields:
             assert (
@@ -1247,7 +1246,7 @@ class SentenceBert(BulkInstanceMetric):
         references: List[List[Any]],
         predictions: List[Any],
         additional_inputs: List[Dict],
-    ) -> List[Any]:
+    ) -> List[Dict[str, Any]]:
         scores = []
 
         # we are in a multi-reference case (each prediction may have multiple
@@ -1292,7 +1291,7 @@ class Reward(BulkInstanceMetric):
         references: List[List[Any]],
         predictions: List[Any],
         additional_inputs: List[Dict],
-    ) -> List[Any]:
+    ) -> List[Dict[str, Any]]:
         # treat the references as the questions and the predictions as answers
         # assume a single reference
         questions = [refs[0] for refs in references]
@@ -1306,6 +1305,196 @@ class Reward(BulkInstanceMetric):
         return self.pipe(inputs, batch_size=self.batch_size)
 
 
+class Perplexity(BulkInstanceMetric):
+    """Computes the likelihood of generating text Y after text X - P(Y|X)."""
+
+    main_score = "perplexity"
+    reduction_map = {"mean": ["perplexity"]}
+
+    perplexity_prompt: str
+
+    batch_size: int = 32
+    model_name: str
+
+    def compute(
+        self,
+        references: List[List[Any]],
+        predictions: List[Any],
+        additional_inputs: List[Dict],
+    ) -> List[Dict[str, Any]]:
+        """Computes the likelihood of generating text Y after text X - P(Y|X).
+
+        :param references: the list of Y texts, each given as a singleton list.
+        :param predictions: the list of X texts, as a plain list of strings.
+
+        :return: the likelihood of generating text Y_i after text X_i = P(Y_i|X_i) for every i.
+        """
+        # make sure all references are singletons
+        assert all(len(ref) == 1 for ref in references)
+
+        # add the instruction as prefix
+        predictions = [f"{self.perplexity_prompt} {x}" for x in predictions]
+        references = [y[0] for y in references]
+
+        # check if the model is enc-dec or dec-only to use the right perplexity computation
+        from transformers import AutoConfig
+
+        config = AutoConfig.from_pretrained(self.model_name, trust_remote_code=True)
+        lm = (
+            self.EncoderDecoderLM(model_name=self.model_name)
+            if config.is_encoder_decoder is True
+            else self.DecoderOnlyLM(model_name=self.model_name)
+        )
+
+        # compute P(Y_i|X_i) for every pair
+        scores = lm.compute_lm(
+            source=predictions, target=references, batch_size=self.batch_size
+        )
+
+        return [{self.main_score: score} for score in scores]
+
+    class AbstractLM(ABC):
+        def __init__(self, model_name):
+            import torch
+            from transformers import AutoTokenizer
+
+            self.model_name = model_name
+            self.tokenizer = AutoTokenizer.from_pretrained(self.model_name)
+            self.model = self.model_class().from_pretrained(self.model_name)
+            self.is_cuda = torch.cuda.is_available()
+
+        def compute_lm(self, source, target, batch_size: int) -> List[float]:
+            import torch
+
+            scores = []
+
+            with torch.no_grad():
+                # break the documents into batches
+                n_batches = int(len(source) / batch_size)
+                batch_range = range(n_batches + 1)
+                for batch in batch_range:
+                    batch_source = source[batch * batch_size : (batch + 1) * batch_size]
+                    batch_target = target[batch * batch_size : (batch + 1) * batch_size]
+                    if len(batch_source) > 0:
+                        # tokenize the source and target
+                        tokens_source = self.tokenizer(
+                            batch_source, padding=True, return_tensors="pt"
+                        )
+                        tokens_target = self.tokenizer(
+                            batch_target, padding=True, return_tensors="pt"
+                        )
+
+                        # compute the logits
+                        logits, labels = self.compute_batch(
+                            tokens_source, tokens_target
+                        )
+
+                        # the model returns the mean loss over the whole batch. We run the CE
+                        # again without reduction and extract the mean for each document
+                        loss_fct = torch.nn.CrossEntropyLoss(
+                            ignore_index=-100, reduction="none"
+                        )
+                        loss = loss_fct(
+                            logits.view(-1, logits.size(-1)), labels.view(-1)
+                        )
+                        loss = loss.view(len(batch_source), -1)
+
+                        # for each document, take the mean only over the non-zero values (sum(labels > 0))
+                        batch_loss = torch.sum(loss, dim=1) / torch.sum(
+                            labels > 0, dim=1
+                        )
+
+                        # append the batch scores to the list of all scores
+                        scores.append(batch_loss)
+
+            return torch.cat(scores, dim=0).tolist()
+
+        @abstractmethod
+        def model_class(self):
+            pass
+
+        @abstractmethod
+        def compute_batch(self, tokens_source, tokens_target):
+            pass
+
+
class EncoderDecoderLM(AbstractLM):
|
| 1421 |
+
def model_class(self):
|
| 1422 |
+
from transformers import AutoModelForSeq2SeqLM
|
| 1423 |
+
|
| 1424 |
+
return AutoModelForSeq2SeqLM
|
| 1425 |
+
|
| 1426 |
+
def compute_batch(self, tokens_source, tokens_target):
|
| 1427 |
+
tokens_docs_ids = tokens_source["input_ids"]
|
| 1428 |
+
attention = tokens_source["attention_mask"]
|
| 1429 |
+
labels = tokens_target["input_ids"]
|
| 1430 |
+
|
| 1431 |
+
if self.is_cuda:
|
| 1432 |
+
tokens_docs_ids, attention, labels = (
|
| 1433 |
+
tokens_docs_ids.cuda(),
|
| 1434 |
+
attention.cuda(),
|
| 1435 |
+
labels.cuda(),
|
| 1436 |
+
)
|
| 1437 |
+
|
| 1438 |
+
logits = self.model(
|
| 1439 |
+
input_ids=tokens_docs_ids.long(),
|
| 1440 |
+
attention_mask=attention.long(),
|
| 1441 |
+
labels=labels.long(),
|
| 1442 |
+
).logits
|
| 1443 |
+
|
| 1444 |
+
# replace the padding token in the labels by -100
|
| 1445 |
+
labels[labels == self.tokenizer.pad_token_id] = -100
|
| 1446 |
+
|
| 1447 |
+
return logits, labels
|
| 1448 |
+
|
| 1449 |
+
class DecoderOnlyLM(AbstractLM):
|
| 1450 |
+
def model_class(self):
|
| 1451 |
+
from transformers import AutoModelForCausalLM
|
| 1452 |
+
|
| 1453 |
+
return AutoModelForCausalLM
|
| 1454 |
+
|
| 1455 |
+
def compute_batch(self, tokens_source, tokens_target):
|
| 1456 |
+
import torch
|
| 1457 |
+
|
| 1458 |
+
tokens = torch.cat(
|
| 1459 |
+
[tokens_source["input_ids"], tokens_target["input_ids"]], dim=1
|
| 1460 |
+
)
|
| 1461 |
+
attention = torch.cat(
|
| 1462 |
+
[tokens_source["attention_mask"], tokens_target["attention_mask"]],
|
| 1463 |
+
dim=1,
|
| 1464 |
+
)
|
| 1465 |
+
labels = torch.cat(
|
| 1466 |
+
[
|
| 1467 |
+
torch.zeros_like(tokens_source["input_ids"]).fill_(-100),
|
| 1468 |
+
tokens_target["input_ids"],
|
| 1469 |
+
],
|
| 1470 |
+
dim=1,
|
| 1471 |
+
)
|
| 1472 |
+
|
| 1473 |
+
# replace the padding token in the labels by -100
|
| 1474 |
+
labels[labels == self.tokenizer.pad_token_id] = -100
|
| 1475 |
+
|
| 1476 |
+
if self.is_cuda:
|
| 1477 |
+
tokens, attention, labels = (
|
| 1478 |
+
tokens.cuda(),
|
| 1479 |
+
attention.cuda(),
|
| 1480 |
+
labels.cuda(),
|
| 1481 |
+
)
|
| 1482 |
+
|
| 1483 |
+
# no need to pass labels as we calculate the loss below per document
|
| 1484 |
+
model_output = self.model(
|
| 1485 |
+
input_ids=tokens.long(), attention_mask=attention.long()
|
| 1486 |
+
)
|
| 1487 |
+
logits = model_output.logits
|
| 1488 |
+
|
| 1489 |
+
# in decoder only, the first token is not being generated, it is taken from the input,
|
| 1490 |
+
# so the model is generating from token 2 to n+1. therefore, we need to skip the last
|
| 1491 |
+
# logit and the first label.
|
| 1492 |
+
shifted_logits = logits[..., :-1, :].contiguous()
|
| 1493 |
+
shifted_labels = labels[..., 1:].contiguous()
|
| 1494 |
+
|
| 1495 |
+
return shifted_logits, shifted_labels
|
| 1496 |
+
|
| 1497 |
+
|
 class NDCG(GlobalMetric):
     """Normalized Discounted Cumulative Gain: measures the quality of ranking with respect to ground truth ranking scores.
 
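
For reference, below is a minimal usage sketch of the Perplexity metric added in this commit. It is illustrative only: it assumes metrics.py is importable as-is and that the metric's declared fields (model_name, perplexity_prompt) can be supplied as keyword arguments; the model name google/flan-t5-small is an arbitrary choice, not one fixed by the diff.

# Minimal sketch; assumes metrics.py is on the import path and that the
# metric's fields are settable via keyword construction. The model name is
# illustrative only.
from metrics import Perplexity

metric = Perplexity(
    model_name="google/flan-t5-small",  # encoder-decoder -> EncoderDecoderLM path
    perplexity_prompt="Generate a text about:",  # prefixed to every prediction X
)

# predictions hold the X texts; each reference list must be a singleton [Y]
predictions = ["the solar system", "the history of the internet"]
references = [
    ["The solar system consists of the Sun and the objects that orbit it."],
    ["The internet grew out of networking research in the late 1960s."],
]

scores = metric.compute(
    references=references,
    predictions=predictions,
    additional_inputs=[{}, {}],
)
# each entry is the mean per-token cross-entropy of Y_i given X_i,
# e.g. [{"perplexity": 2.31}, {"perplexity": 2.87}]  (values hypothetical)
print(scores)

Note the design choice of lazy imports in the new code: torch, AutoConfig, and the model classes are imported inside the methods, so the heavy dependencies are only loaded when the metric is actually computed.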