Upload metrics.py with huggingface_hub
metrics.py CHANGED (+194 -5)
@@ -1,7 +1,7 @@
 import re
 import string
 import uuid
-from abc import abstractmethod
+from abc import ABC, abstractmethod
 from collections import Counter
 from dataclasses import field
 from typing import Any, Dict, Generator, List, Optional, Tuple
@@ -361,7 +361,7 @@ class BulkInstanceMetric(SingleStreamOperator, MetricWithConfidenceInterval):
         references: List[List[Any]],
         predictions: List[Any],
         additional_inputs: List[Dict],
-    ) -> Dict[str, Any]:
+    ) -> List[Dict[str, Any]]:
         pass
 
 
@@ -643,7 +643,6 @@ class HuggingfaceBulkMetric(BulkInstanceMetric):
         predictions: List[str],
         additional_inputs: List[Any],
     ) -> List[Dict[str, Any]]:
-        passed_additional_inputs = {}
         passed_additional_inputs = {}
         for additional_input_field in self.hf_additional_input_fields:
             assert (
@@ -1247,7 +1246,7 @@ class SentenceBert(BulkInstanceMetric):
         references: List[List[Any]],
         predictions: List[Any],
         additional_inputs: List[Dict],
-    ) -> List[Any]:
+    ) -> List[Dict[str, Any]]:
         scores = []
 
         # we are in a multi-reference case (each prediction may have multiple
@@ -1292,7 +1291,7 @@ class Reward(BulkInstanceMetric):
         references: List[List[Any]],
         predictions: List[Any],
         additional_inputs: List[Dict],
-    ) -> List[Any]:
+    ) -> List[Dict[str, Any]]:
         # treat the references as the questions and the predictions as answers
         # assume a single reference
         questions = [refs[0] for refs in references]
@@ -1306,6 +1305,196 @@ class Reward(BulkInstanceMetric):
         return self.pipe(inputs, batch_size=self.batch_size)
 
 
+class Perplexity(BulkInstanceMetric):
+    """Computes the likelihood of generating text Y after text X - P(Y|X)."""
+
+    main_score = "perplexity"
+    reduction_map = {"mean": ["perplexity"]}
+
+    perplexity_prompt: str
+
+    batch_size: int = 32
+    model_name: str
+
+    def compute(
+        self,
+        references: List[List[Any]],
+        predictions: List[Any],
+        additional_inputs: List[Dict],
+    ) -> List[Dict[str, Any]]:
+        """Computes the likelihood of generating text Y after text X - P(Y|X).
+
+        :param references: the list of Y texts, each given as a singleton list.
+        :param predictions: the list of X texts, as a plain list of strings.
+
+        :return: the likelihood of generating text Y_i after text X_i = P(Y_i|X_i) for every i.
+        """
+        # make sure all references are singletons
+        assert all(len(ref) == 1 for ref in references)
+
+        # add the instruction as prefix
+        predictions = [f"{self.perplexity_prompt} {x}" for x in predictions]
+        references = [y[0] for y in references]
+
+        # check if the model is enc-dec or dec-only to use the right perplexity computation
+        from transformers import AutoConfig
+
+        config = AutoConfig.from_pretrained(self.model_name, trust_remote_code=True)
+        lm = (
+            self.EncoderDecoderLM(model_name=self.model_name)
+            if config.is_encoder_decoder is True
+            else self.DecoderOnlyLM(model_name=self.model_name)
+        )
+
+        # compute P(Y_i|X_i) for every pair
+        scores = lm.compute_lm(
+            source=predictions, target=references, batch_size=self.batch_size
+        )
+
+        return [{self.main_score: score} for score in scores]
+
+    class AbstractLM(ABC):
+        def __init__(self, model_name):
+            import torch
+            from transformers import AutoTokenizer
+
+            self.model_name = model_name
+            self.tokenizer = AutoTokenizer.from_pretrained(self.model_name)
+            self.model = self.model_class().from_pretrained(self.model_name)
+            self.is_cuda = torch.cuda.is_available()
+
+        def compute_lm(self, source, target, batch_size: int) -> List[float]:
+            import torch
+
+            scores = []
+
+            with torch.no_grad():
+                # break the documents into batches
+                n_batches = int(len(source) / batch_size)
+                batch_range = range(n_batches + 1)
+                for batch in batch_range:
+                    batch_source = source[batch * batch_size : (batch + 1) * batch_size]
+                    batch_target = target[batch * batch_size : (batch + 1) * batch_size]
+                    if len(batch_source) > 0:
+                        # tokenize the source and target
+                        tokens_source = self.tokenizer(
+                            batch_source, padding=True, return_tensors="pt"
+                        )
+                        tokens_target = self.tokenizer(
+                            batch_target, padding=True, return_tensors="pt"
+                        )
+
+                        # compute the logits
+                        logits, labels = self.compute_batch(
+                            tokens_source, tokens_target
+                        )
+
+                        # the model returns the mean loss over the whole batch. We run the CE
+                        # again without reduction and extract the mean for each document
+                        loss_fct = torch.nn.CrossEntropyLoss(
+                            ignore_index=-100, reduction="none"
+                        )
+                        loss = loss_fct(
+                            logits.view(-1, logits.size(-1)), labels.view(-1)
+                        )
+                        loss = loss.view(len(batch_source), -1)
+
+                        # for each document, take the mean only over the non-zero values (sum(labels > 0))
+                        batch_loss = torch.sum(loss, dim=1) / torch.sum(
+                            labels > 0, dim=1
+                        )
+
+                        # append the batch scores to the list of all scores
+                        scores.append(batch_loss)
+
+            return torch.cat(scores, dim=0).tolist()
+
+        @abstractmethod
+        def model_class(self):
+            pass
+
+        @abstractmethod
+        def compute_batch(self, tokens_source, tokens_target):
+            pass
+
+
class EncoderDecoderLM(AbstractLM):
|
| 1421 |
+
def model_class(self):
|
| 1422 |
+
from transformers import AutoModelForSeq2SeqLM
|
| 1423 |
+
|
| 1424 |
+
return AutoModelForSeq2SeqLM
|
| 1425 |
+
|
| 1426 |
+
def compute_batch(self, tokens_source, tokens_target):
|
| 1427 |
+
tokens_docs_ids = tokens_source["input_ids"]
|
| 1428 |
+
attention = tokens_source["attention_mask"]
|
| 1429 |
+
labels = tokens_target["input_ids"]
|
| 1430 |
+
|
| 1431 |
+
if self.is_cuda:
|
| 1432 |
+
tokens_docs_ids, attention, labels = (
|
| 1433 |
+
tokens_docs_ids.cuda(),
|
| 1434 |
+
attention.cuda(),
|
| 1435 |
+
labels.cuda(),
|
| 1436 |
+
)
|
| 1437 |
+
|
| 1438 |
+
logits = self.model(
|
| 1439 |
+
input_ids=tokens_docs_ids.long(),
|
| 1440 |
+
attention_mask=attention.long(),
|
| 1441 |
+
labels=labels.long(),
|
| 1442 |
+
).logits
|
| 1443 |
+
|
| 1444 |
+
# replace the padding token in the labels by -100
|
| 1445 |
+
labels[labels == self.tokenizer.pad_token_id] = -100
|
| 1446 |
+
|
| 1447 |
+
return logits, labels
|
| 1448 |
+
|
| 1449 |
+
class DecoderOnlyLM(AbstractLM):
|
| 1450 |
+
def model_class(self):
|
| 1451 |
+
from transformers import AutoModelForCausalLM
|
| 1452 |
+
|
| 1453 |
+
return AutoModelForCausalLM
|
| 1454 |
+
|
| 1455 |
+
def compute_batch(self, tokens_source, tokens_target):
|
| 1456 |
+
import torch
|
| 1457 |
+
|
| 1458 |
+
tokens = torch.cat(
|
| 1459 |
+
[tokens_source["input_ids"], tokens_target["input_ids"]], dim=1
|
| 1460 |
+
)
|
| 1461 |
+
attention = torch.cat(
|
| 1462 |
+
[tokens_source["attention_mask"], tokens_target["attention_mask"]],
|
| 1463 |
+
dim=1,
|
| 1464 |
+
)
|
| 1465 |
+
labels = torch.cat(
|
| 1466 |
+
[
|
| 1467 |
+
torch.zeros_like(tokens_source["input_ids"]).fill_(-100),
|
| 1468 |
+
tokens_target["input_ids"],
|
| 1469 |
+
],
|
| 1470 |
+
dim=1,
|
| 1471 |
+
)
|
| 1472 |
+
|
| 1473 |
+
# replace the padding token in the labels by -100
|
| 1474 |
+
labels[labels == self.tokenizer.pad_token_id] = -100
|
| 1475 |
+
|
| 1476 |
+
if self.is_cuda:
|
| 1477 |
+
tokens, attention, labels = (
|
| 1478 |
+
tokens.cuda(),
|
| 1479 |
+
attention.cuda(),
|
| 1480 |
+
labels.cuda(),
|
| 1481 |
+
)
|
| 1482 |
+
|
| 1483 |
+
# no need to pass labels as we calculate the loss below per document
|
| 1484 |
+
model_output = self.model(
|
| 1485 |
+
input_ids=tokens.long(), attention_mask=attention.long()
|
| 1486 |
+
)
|
| 1487 |
+
logits = model_output.logits
|
| 1488 |
+
|
| 1489 |
+
# in decoder only, the first token is not being generated, it is taken from the input,
|
| 1490 |
+
# so the model is generating from token 2 to n+1. therefore, we need to skip the last
|
| 1491 |
+
# logit and the first label.
|
| 1492 |
+
shifted_logits = logits[..., :-1, :].contiguous()
|
| 1493 |
+
shifted_labels = labels[..., 1:].contiguous()
|
| 1494 |
+
|
| 1495 |
+
return shifted_logits, shifted_labels
|
| 1496 |
+
|
| 1497 |
+
|
 class NDCG(GlobalMetric):
     """Normalized Discounted Cumulative Gain: measures the quality of ranking with respect to ground truth ranking scores.
 
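
For reference, below is a minimal usage sketch of the Perplexity metric added in this commit. It is illustrative only: it assumes metrics.py is importable as-is and that the metric's declared fields (model_name, perplexity_prompt) can be supplied as keyword arguments; the model name google/flan-t5-small is an arbitrary choice, not one fixed by the diff.

# Minimal sketch; assumes metrics.py is on the import path and that the
# metric's fields are settable via keyword construction. The model name is
# illustrative only.
from metrics import Perplexity

metric = Perplexity(
    model_name="google/flan-t5-small",  # encoder-decoder -> EncoderDecoderLM path
    perplexity_prompt="Generate a text about:",  # prefixed to every prediction X
)

# predictions hold the X texts; each reference list must be a singleton [Y]
predictions = ["the solar system", "the history of the internet"]
references = [
    ["The solar system consists of the Sun and the objects that orbit it."],
    ["The internet grew out of networking research in the late 1960s."],
]

scores = metric.compute(
    references=references,
    predictions=predictions,
    additional_inputs=[{}, {}],
)
# each entry is the mean per-token cross-entropy of Y_i given X_i,
# e.g. [{"perplexity": 2.31}, {"perplexity": 2.87}]  (values hypothetical)
print(scores)

Note the design choice of lazy imports in the new code: torch, AutoConfig, and the model classes are imported inside the methods, so the heavy dependencies are only loaded when the metric is actually computed.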