Spaces:

unitxt
/

metric

Running

App Files Files Community

Elron committed on Mar 5, 2024

Commit

1e4d944

verified ·

1 Parent(s): 49c1c5f

Upload metrics.py with huggingface_hub

Browse files

Files changed (1) hide show

metrics.py +1338 -199

metrics.py CHANGED Viewed

@@ -1,19 +1,24 @@
 import re
 import string
 import uuid
 from abc import ABC, abstractmethod
 from collections import Counter
 from dataclasses import field
 from typing import Any, Dict, Generator, List, Optional, Tuple
 import evaluate
 import numpy
 import numpy as np
 from scipy.stats import bootstrap
 from .artifact import Artifact
 from .dataclass import InternalField, OptionalField
 from .logging_utils import get_logger
 from .operator import (
     MultiStreamOperator,
     SingleStreamOperator,
@@ -22,14 +27,17 @@ from .operator import (
 )
 from .operators import CopyFields
 from .random_utils import get_seed
 from .stream import MultiStream, Stream
-from .type_utils import isoftype
 logger = get_logger()
-# The default number of resamples used to estimate the confidence intervals
-# of global and instance metrics. Use None to disable confidence interval computation by default.
-_N_RESAMPLES_DEFAULT_FOR_INSTANCE_METRICS = 1000
-_N_RESAMPLES_DEFAULT_FOR_GLOBAL_METRICS = 100
 def abstract_factory():
@@ -40,6 +48,18 @@ def abstract_field():
     return field(default_factory=abstract_factory)
 class UpdateStream(StreamInstanceOperator):
     update: dict
@@ -57,6 +77,48 @@ class Metric(Artifact):
     def main_score(self):
         pass
 class MetricWithConfidenceInterval(Metric):
     # The number of resamples used to estimate the confidence intervals of this metric.
@@ -73,7 +135,12 @@ class MetricWithConfidenceInterval(Metric):
         return np.random.default_rng(hash(get_seed()) & _max_32bit)
     def disable_confidence_interval_calculation(self):
         self.n_resamples = None
     def _can_compute_confidence_intervals(self, num_predictions):
         return (
@@ -82,45 +149,117 @@ class MetricWithConfidenceInterval(Metric):
             and num_predictions > 1
         )
-    def score_based_confidence_interval(self, instances):
-        """Compute confidence intervals based on existing scores, already computed on the input instances.
-        score_names: List[str]
-            Compute a confidence interval for each score_name from this list.
-        instances:
-            The instances for which the confidence intervals are computed.
         """
-        from statistics import mean
         result = {}
         if not self._can_compute_confidence_intervals(num_predictions=len(instances)):
             return result
-        score_names = (
-            self.ci_scores if self.ci_scores is not None else [self.main_score]
-        )
         for score_name in score_names:
-            scores = [
-                instance["score"]["instance"][score_name] for instance in instances
-            ]
             ci = bootstrap(
-                (scores,),
-                statistic=mean,
                 n_resamples=self.n_resamples,
                 confidence_level=self.confidence_level,
                 random_state=self.new_random_generator(),
             ).confidence_interval
-            result[f"{score_name}_ci_low"] = ci.low
-            result[f"{score_name}_ci_high"] = ci.high
             if score_name == self.main_score:
                 result["score_ci_low"] = ci.low
                 result["score_ci_high"] = ci.high
         return result
     def compute_global_confidence_intervals(
-        self, references, predictions, additional_inputs, score_name
     ):
         """Computed confidence intervals for a set of references and predictions."""
         random_gen = self.new_random_generator()
@@ -128,12 +267,12 @@ class MetricWithConfidenceInterval(Metric):
         def statistic(arr, axis):
             # arr is a 2d array where each row is a resampling, so we
             # iterate over the rows and compute the metric on each resampling
-            def metric(sample_refs, sample_preds, sample_additional_inputs):
                 try:
                     return self._compute(
                         references=sample_refs,
                         predictions=sample_preds,
-                        additional_inputs=sample_additional_inputs,
                     )["score"]
                 except Exception as e:
                     # this happens in edge cases, for example, when the sampling creates a
@@ -141,40 +280,21 @@ class MetricWithConfidenceInterval(Metric):
                     logger.info(f"Warning in {self.__class__.__name__}", e)
                     return np.nan
             scores = numpy.apply_along_axis(
                 lambda x: metric(
                     sample_refs=[references[i] for i in x],
                     sample_preds=[predictions[i] for i in x],
-                    sample_additional_inputs=[additional_inputs[i] for i in x],
                 ),
                 axis=axis,
                 arr=arr,
             )
-            # when running with bca interval (default), the statistic is called twice: with the
-            # original data and with the resamples. here we want to focus only on the latter.
-            if scores.size > 1:
-                # here we deal with samples on which the metric could not be computed. These are
-                # edge cases - for example, when the sample contains only empty strings.
-                # CI is about the distribution around the statistic (e.g. mean), it doesn't deal with
-                # cases in which the metric is not computable. Therefore, we ignore these edge cases
-                # as part of the computation of CI. The question is how to implement this policy.
-                # Options:
-                # 1. skip the errors and return a shorter array => this fails because Scipy demands
-                # this callback (i.e. the statistic() callback) to return an array of the same size
-                # as the number of resamples
-                # 2. Put np.nan for the errors => this fails because in such case the ci itself
-                # becomes np.nan. So one edge case can fail the whole CI computation.
-                # 3. Replace the errors with a sampling from the successful cases => this is what
-                # is implemented.
-                error_indices = numpy.isnan(scores)
-                n_errors = sum(error_indices)
-                if n_errors > 0:
-                    new_scores = random_gen.choice(scores, n_errors, replace=True)
-                    scores = scores[~error_indices]
-                    scores = np.concatenate([scores, new_scores])
-            return scores
         result = {}
         num_predictions = len(predictions)
@@ -202,12 +322,15 @@ class GlobalMetric(SingleStreamOperator, MetricWithConfidenceInterval):
     need to be considered.  Accuracy, on the other hand, is just an average of the accuracy of all the instances.
     """
-    n_resamples = _N_RESAMPLES_DEFAULT_FOR_GLOBAL_METRICS
     def process(self, stream: Stream, stream_name: Optional[str] = None) -> Generator:
         references = []
         predictions = []
-        additional_inputs = []
         global_score = {}
         instances = []
@@ -226,31 +349,40 @@ class GlobalMetric(SingleStreamOperator, MetricWithConfidenceInterval):
             predictions.append(instance_prediction)
             instances.append(instance)
-            instance_additional_inputs = (
-                instance["additional_inputs"] if "additional_inputs" in instance else {}
             )
-            additional_inputs.append(instance_additional_inputs)
-            try:
-                instance_score = self._compute(
-                    [instance_references],
-                    [instance_prediction],
-                    [instance_additional_inputs],
-                )
-            except:
-                instance_score = {"score": None, "score_name": self.main_score}
                 if isinstance(self.main_score, str):
-                    instance_score[self.main_score] = None
             instance["score"]["instance"].update(instance_score)
-        result = self._compute(references, predictions, additional_inputs)
         global_score.update(result)
         score_name = global_score["score_name"]
         confidence_interval = self.compute_global_confidence_intervals(
-            references, predictions, additional_inputs, score_name
         )
         global_score.update(confidence_interval)
@@ -262,9 +394,9 @@ class GlobalMetric(SingleStreamOperator, MetricWithConfidenceInterval):
         self,
         references: List[List[str]],
         predictions: List[str],
-        additional_inputs: List[Any],
     ) -> dict:
-        result = self.compute(references, predictions, additional_inputs)
         result["score"] = result[self.main_score]
         result["score_name"] = self.main_score
         return result
@@ -274,13 +406,25 @@ class GlobalMetric(SingleStreamOperator, MetricWithConfidenceInterval):
         self,
         references: List[List[Any]],
         predictions: List[Any],
-        additional_inputs: List[Any],
     ) -> dict:
         pass
 class BulkInstanceMetric(SingleStreamOperator, MetricWithConfidenceInterval):
-    n_resamples = _N_RESAMPLES_DEFAULT_FOR_INSTANCE_METRICS
     main_score: str
     reduction_map: Dict[str, List[str]]
@@ -301,8 +445,8 @@ class BulkInstanceMetric(SingleStreamOperator, MetricWithConfidenceInterval):
             ),
         )
-        additional_inputs = [
-            instance["additional_inputs"] if "additional_inputs" in instance else {}
             for instance in stream
         ]
@@ -310,7 +454,7 @@ class BulkInstanceMetric(SingleStreamOperator, MetricWithConfidenceInterval):
         instance_scores = self.compute(
             references=references,
             predictions=predictions,
-            additional_inputs=additional_inputs,
         )
         # add the score and score_name fields
@@ -334,8 +478,6 @@ class BulkInstanceMetric(SingleStreamOperator, MetricWithConfidenceInterval):
             ), f"Reduction {reduction} is not implemented, use one of {self.implemented_reductions}"
             if reduction == "mean":
-                from statistics import mean
                 for field_name in fields:
                     global_score[field_name] = mean(
                         [
@@ -347,8 +489,13 @@ class BulkInstanceMetric(SingleStreamOperator, MetricWithConfidenceInterval):
                         global_score["score"] = global_score[field_name]
                         global_score["score_name"] = self.main_score
                 confidence_interval = self.score_based_confidence_interval(
-                    instances=instances
                 )
                 global_score.update(confidence_interval)
@@ -360,33 +507,217 @@ class BulkInstanceMetric(SingleStreamOperator, MetricWithConfidenceInterval):
         self,
         references: List[List[Any]],
         predictions: List[Any],
-        additional_inputs: List[Dict],
     ) -> List[Dict[str, Any]]:
         pass
 class InstanceMetric(SingleStreamOperator, MetricWithConfidenceInterval):
-    n_resamples = _N_RESAMPLES_DEFAULT_FOR_INSTANCE_METRICS
-    implemented_reductions: List[str] = field(default_factory=lambda: ["mean"])
     @property
     @abstractmethod
     def reduction_map(self) -> dict:
         pass
     def process(self, stream: Stream, stream_name: Optional[str] = None) -> Generator:
         global_score = {}
         instances = []
         for instance in stream:
             refs, pred = instance["references"], instance["prediction"]
-            additional_inputs = (
-                instance["additional_inputs"] if "additional_inputs" in instance else {}
-            )
             instance_score = self.compute(
-                references=refs, prediction=pred, additional_inputs=additional_inputs
             )
             instance_score["score"] = instance_score[self.main_score]
             instance_score["score_name"] = self.main_score
@@ -399,36 +730,100 @@ class InstanceMetric(SingleStreamOperator, MetricWithConfidenceInterval):
             instances.append(instance)
-        for reduction, fields in self.reduction_map.items():
-            assert (
-                reduction in self.implemented_reductions
-            ), f"Reduction {reduction} is not implemented, use one of {self.implemented_reductions}"
-            if reduction == "mean":
-                from statistics import mean
-                for field_name in fields:
-                    scores = [
-                        instance["score"]["instance"][field_name]
-                        for instance in instances
-                    ]
-                    global_score[field_name] = mean(scores)
-                    if field_name == self.main_score:
-                        global_score["score"] = global_score[field_name]
-                        global_score["score_name"] = self.main_score
-                confidence_interval = self.score_based_confidence_interval(
-                    instances=instances
                 )
-                global_score.update(confidence_interval)
-        for instance in instances:
-            yield instance
     @abstractmethod
-    def compute(
-        self, references: List[Any], prediction: Any, additional_inputs: Dict
-    ) -> dict:
         pass
@@ -445,7 +840,7 @@ class Squad(GlobalMetric):
         self,
         references: List[List[str]],
         predictions: List[str],
-        additional_inputs: List[Dict],
     ) -> dict:
         ids = [str(uuid.uuid4()).replace("-", "") for _ in range(len(predictions))]
         formatted_predictions = [
@@ -466,9 +861,10 @@ class Squad(GlobalMetric):
 class Accuracy(InstanceMetric):
     reduction_map = {"mean": ["accuracy"]}
     main_score = "accuracy"
     def compute(
-        self, references: List[Any], prediction: Any, additional_inputs: List[Dict]
     ) -> dict:
         result = {
             self.main_score: float(
@@ -483,13 +879,14 @@ class Accuracy(InstanceMetric):
 class StringContainment(InstanceMetric):
     reduction_map = {"mean": ["string_containment"]}
     main_score = "string_containment"
     def compute(
-        self, references: List[Any], prediction: Any, additional_inputs: List[Dict]
     ) -> dict:
         result = {
             self.main_score: float(
-                any(str(reference) in prediction for reference in references)
             )
         }
         result["score"] = result[self.main_score]
@@ -505,6 +902,13 @@ class MetricPipeline(MultiStreamOperator, Metric):
     )
     metric: Metric = None
     def verify(self):
         assert self.main_score is not None, "main_score is not set"
@@ -569,37 +973,37 @@ class HuggingfaceMetric(GlobalMetric):
         self,
         references: List[List[Any]],
         predictions: List[Any],
-        additional_inputs: List[Dict],
     ) -> dict:
-        passed_additional_inputs = {}
         for additional_input_field in self.hf_additional_input_fields:
             assert (
-                additional_input_field in additional_inputs[0]
-            ), f"'{additional_input_field}' field required by {__class__.__name__} is not in passed in additional inputs: {additional_inputs[0]}"
-            passed_additional_inputs[additional_input_field] = [
                 additional_input[additional_input_field]
-                for additional_input in additional_inputs
             ]
         for additional_input_field in self.hf_additional_input_fields_pass_one_value:
             assert (
-                additional_input_field in additional_inputs[0]
-            ), f"'{additional_input_field}' field required by {__class__.__name__} is not in passed in additional inputs: {additional_inputs[0]}"
             values = {
                 additional_input[additional_input_field]
-                for additional_input in additional_inputs
             }
             assert (
                 len(values) == 1
             ), f"Values of '{additional_input_field}' field required by {__class__.__name__}  should all be the same, but have multiple values {values}"
-            passed_additional_inputs[additional_input_field] = next(iter(values))
-        # add check that all required fields in self.metrics are in passed_additional_inputs       print(passed_additional_inputs)
         result = self.metric.compute(
             predictions=predictions,
             references=references,
-            **passed_additional_inputs,
             **self.hf_compute_args,
         )
         if self.hf_main_score:
@@ -641,23 +1045,23 @@ class HuggingfaceBulkMetric(BulkInstanceMetric):
         self,
         references: List[List[str]],
         predictions: List[str],
-        additional_inputs: List[Any],
     ) -> List[Dict[str, Any]]:
-        passed_additional_inputs = {}
         for additional_input_field in self.hf_additional_input_fields:
             assert (
-                additional_input_field in additional_inputs[0]
-            ), f"'{additional_input_field}' field required by {__class__.__name__} is not in passed in additional inputs: {additional_inputs[0]}"
-            passed_additional_inputs[additional_input_field] = [
                 additional_input[additional_input_field]
-                for additional_input in additional_inputs
             ]
-        # add check that all required fields in self.metrics are in passed_additional_inputs
         scores = self.metric.compute(
             predictions=predictions,
             references=references,
-            **passed_additional_inputs,
             **self.hf_compute_args,
         )
@@ -692,7 +1096,7 @@ class F1(GlobalMetric):
         self,
         references: List[List[str]],
         predictions: List[str],
-        additional_inputs: List[Dict],
     ) -> dict:
         assert all(
             len(reference) == 1 for reference in references
@@ -714,8 +1118,6 @@ class F1(GlobalMetric):
             average=self.average,
         )
         if isinstance(result["f1"], numpy.ndarray):
-            from statistics import mean
             final_result = {self.main_score: mean(result["f1"])}
             for i, label in enumerate(labels):
                 final_result["f1_" + self.id_to_str[label]] = result["f1"][i]
@@ -742,7 +1144,6 @@ class F1MultiLabel(GlobalMetric):
     _metric = None
     main_score = "f1_macro"
     average = None  # Report per class then aggregate by mean
-    classes_to_ignore = ["none"]
     metric = "f1"
     def prepare(self):
@@ -767,7 +1168,7 @@ class F1MultiLabel(GlobalMetric):
         self,
         references: List[List[str]],
         predictions: List[List[str]],
-        additional_inputs: List[Dict],
     ) -> dict:
         self.str_to_id = {}
         self.id_to_str = {}
@@ -775,13 +1176,9 @@ class F1MultiLabel(GlobalMetric):
         self._validate_references_and_prediction(references, predictions)
         references = [reference[0] for reference in references]
-        labels = [
-            lbl
-            for lbl in {label for reference in references for label in reference}
-            if lbl not in self.classes_to_ignore
-        ]
         # if no classes are left then F1 is not defined
-        # (e.g. only "none" in references)
         if len(labels) == 0:
             return {self.main_score: float("nan")}
@@ -809,8 +1206,6 @@ class F1MultiLabel(GlobalMetric):
             labels=labels_param,
         )
         if isinstance(result[self.metric], numpy.ndarray):
-            from statistics import mean
             assert (
                 len(result[self.metric]) == len(labels)
             ), f"F1 result ({result[self.metric]}) has more entries than labels ({labels})"
@@ -883,6 +1278,8 @@ class Rouge(HuggingfaceMetric):
     sent_split_newline: bool = True
     def prepare(self):
         super().prepare()
@@ -895,7 +1292,7 @@ class Rouge(HuggingfaceMetric):
         nltk.download("punkt")
         self.sent_tokenize = nltk.sent_tokenize
-    def compute(self, references, predictions, additional_inputs: List[Dict]):
         if self.sent_split_newline:
             predictions = [
                 "\n".join(self.sent_tokenize(prediction.strip()))
@@ -905,13 +1302,16 @@ class Rouge(HuggingfaceMetric):
                 ["\n".join(self.sent_tokenize(r.strip())) for r in reference]
                 for reference in references
             ]
-        return super().compute(references, predictions, additional_inputs)
 # Computes char edit distance, ignoring whitespace
 class CharEditDistanceAccuracy(InstanceMetric):
     reduction_map = {"mean": ["char_edit_dist_accuracy"]}
     main_score = "char_edit_dist_accuracy"
     def prepare(self):
         super().prepare()
@@ -919,9 +1319,7 @@ class CharEditDistanceAccuracy(InstanceMetric):
         self.eval = editdistance.eval
-    def compute(
-        self, references, prediction: str, additional_inputs: List[Dict]
-    ) -> dict:
         assert (
             len(references) == 1
         ), f"Expected only one reference , but received: {references}"
@@ -939,11 +1337,13 @@ class Wer(HuggingfaceMetric):
     hf_metric_name = "wer"
     main_score = "wer"
     def compute(
         self,
         references: List[List[str]],
         predictions: List[str],
-        additional_inputs: List[Dict],
     ) -> dict:
         assert all(
             len(reference) == 1 for reference in references
@@ -955,6 +1355,43 @@ class Wer(HuggingfaceMetric):
         return {self.main_score: result}
 class MatthewsCorrelation(HuggingfaceMetric):
     hf_metric_name = "matthews_correlation"
     main_score = "matthews_correlation"
@@ -970,7 +1407,7 @@ class MatthewsCorrelation(HuggingfaceMetric):
         self,
         references: List[List[str]],
         predictions: List[str],
-        additional_inputs: List[Dict],
     ) -> dict:
         formatted_references = [
             self.get_str_id(reference[0]) for reference in references
@@ -983,6 +1420,33 @@ class MatthewsCorrelation(HuggingfaceMetric):
         )
 class CustomF1(GlobalMetric):
     main_score = "f1_micro"
     groups = None
@@ -1036,9 +1500,9 @@ class CustomF1(GlobalMetric):
         except ZeroDivisionError:
             return self.zero_division
-    def get_groups(self, elements, additional_inputs):
         groups = set()
-        for sublist, additional_input in zip(elements, additional_inputs):
             for e in sublist:
                 if self.should_ignore_element(e, additional_input):
                     continue
@@ -1049,7 +1513,7 @@ class CustomF1(GlobalMetric):
         self,
         references: List[List[Any]],
         predictions: List[Any],
-        additional_inputs: List[Dict],
     ) -> dict:
         # in case reference are List[List[List[Any]]] and predictions are List[List[Any]]:
         if (
@@ -1065,12 +1529,12 @@ class CustomF1(GlobalMetric):
         )
         if self.groups is None:
-            groups = self.get_groups(references, additional_inputs)
         else:
             groups = self.groups
         groups_statistics = {}
         for references_batch, predictions_batch, additional_input in zip(
-            references, predictions, additional_inputs
         ):
             grouped_references = self.group_elements(references_batch, additional_input)
             grouped_predictions = self.group_elements(
@@ -1187,10 +1651,11 @@ class TokenOverlap(InstanceMetric):
     ci_scores = ["f1", "precision", "recall"]
     def compute(
-        self, references: List[Any], prediction: Any, additional_inputs: List[Dict]
     ) -> dict:
         results = [
-            self._compute_single_ref(reference, prediction) for reference in references
         ]
         return {
             measure: max(r[i] for r in results)
@@ -1200,8 +1665,8 @@ class TokenOverlap(InstanceMetric):
     def _compute_single_ref(
         self, reference: Any, prediction: Any
     ) -> Tuple[float, float, float]:
-        prediction_tokens = normalize_answer(prediction).split()
-        reference_tokens = normalize_answer(reference).split()
         common = Counter(prediction_tokens) & Counter(reference_tokens)
         num_same = sum(common.values())
         if num_same == 0:
@@ -1221,9 +1686,11 @@ class BertScore(HuggingfaceBulkMetric):
     ci_scores = ["f1", "precision", "recall"]
     model_name: str
     def prepare(self):
         super().prepare()
-        self.hf_compute_args = {"model_type": self.model_name}
 class SentenceBert(BulkInstanceMetric):
@@ -1233,19 +1700,23 @@ class SentenceBert(BulkInstanceMetric):
     model_name: str
     def prepare(self):
         super().prepare()
         from sentence_transformers import SentenceTransformer
         from sentence_transformers import util as sbert_util
-        self.model = SentenceTransformer(self.model_name)
         self.util = sbert_util
     def compute(
         self,
         references: List[List[Any]],
         predictions: List[Any],
-        additional_inputs: List[Dict],
     ) -> List[Dict[str, Any]]:
         scores = []
@@ -1260,9 +1731,9 @@ class SentenceBert(BulkInstanceMetric):
             count += len(ref_group)
         # compute s-bert embeddings
-        preds_emb = self.model.encode(predictions)
         refs_emb = self.model.encode(
-            [ref for ref_group in references for ref in ref_group]
         )
         # for each candidate, pick the reference with the highest score
@@ -1280,17 +1751,23 @@ class Reward(BulkInstanceMetric):
     model_name: str
     def prepare(self):
         super().prepare()
         from transformers import pipeline
-        self.pipe = pipeline("text-classification", model=self.model_name)
     def compute(
         self,
         references: List[List[Any]],
         predictions: List[Any],
-        additional_inputs: List[Dict],
     ) -> List[Dict[str, Any]]:
         # treat the references as the questions and the predictions as answers
         # assume a single reference
@@ -1316,25 +1793,27 @@ class Perplexity(BulkInstanceMetric):
     batch_size: int = 32
     model_name: str
     def compute(
         self,
         references: List[List[Any]],
         predictions: List[Any],
-        additional_inputs: List[Dict],
     ) -> List[Dict[str, Any]]:
         """Computes the likelihood of generating text Y after text X - P(Y|X).
-        :param references: the list of Y texts as a list of singletons.
-        :param predictions: the list of X texts as a plain list of strings
-        :return: the likelihood of generating text Y_i after text X_i = P(Y_i|X_i) for every i.
         """
         sources = []
         targets = []
         for prediction, instance_references in zip(predictions, references):
             for instance_reference in instance_references:
-                sources.append(f"{self.perplexity_prompt} {prediction}")
-                targets.append(instance_reference)
         from transformers import AutoConfig
@@ -1375,9 +1854,11 @@ class Perplexity(BulkInstanceMetric):
             from transformers import AutoTokenizer
             self.model_name = model_name
             self.tokenizer = AutoTokenizer.from_pretrained(self.model_name)
-            self.model = self.model_class().from_pretrained(self.model_name)
-            self.is_cuda = torch.cuda.is_available()
         def compute_lm(
             self, source: List[str], target: List[str], batch_size: int
@@ -1470,16 +1951,9 @@ class Perplexity(BulkInstanceMetric):
             return AutoModelForSeq2SeqLM
         def compute_batch(self, tokens_source, tokens_target):
-            tokens_docs_ids = tokens_source["input_ids"]
-            attention = tokens_source["attention_mask"]
-            labels = tokens_target["input_ids"]
-            if self.is_cuda:
-                tokens_docs_ids, attention, labels = (
-                    tokens_docs_ids.cuda(),
-                    attention.cuda(),
-                    labels.cuda(),
-                )
             logits = self.model(
                 input_ids=tokens_docs_ids.long(),
@@ -1519,12 +1993,9 @@ class Perplexity(BulkInstanceMetric):
             # replace the padding token in the labels by -100
             labels[labels == self.tokenizer.pad_token_id] = -100
-            if self.is_cuda:
-                tokens, attention, labels = (
-                    tokens.cuda(),
-                    attention.cuda(),
-                    labels.cuda(),
-                )
             # no need to pass labels as we calculate the loss below per document
             model_output = self.model(
@@ -1558,6 +2029,8 @@ class NDCG(GlobalMetric):
     main_score = "nDCG"
     def prepare(self):
         from sklearn.metrics import ndcg_score
@@ -1568,15 +2041,12 @@ class NDCG(GlobalMetric):
         self,
         references: List[List[Any]],
         predictions: List[Any],
-        additional_inputs: List[Any],
     ) -> dict:
         from collections import defaultdict
-        from statistics import mean
         query_to_predictions_and_references = defaultdict(lambda: [[], []])
-        for reference, pred, inputs_dict in zip(
-            references, predictions, additional_inputs
-        ):
             query = inputs_dict.get("query")
             query_to_predictions_and_references[query][0].append(pred)
             query_to_predictions_and_references[query][1].append(reference)
@@ -1606,9 +2076,7 @@ class NDCG(GlobalMetric):
 class RetrievalMetric(InstanceMetric):
-    def compute(
-        self, references: List[Any], prediction: Any, additional_inputs: Dict
-    ) -> dict:
         # digest input
         pred_ids: List[Any] = prediction
         ref_ids: List[Any] = list(dict.fromkeys(references))
@@ -1681,6 +2149,7 @@ class RetrievalMetric(InstanceMetric):
 class MRR(RetrievalMetric):
     reduction_map = {"mean": ["mrr"]}
     main_score = "mrr"
     def _compute(
         self,
@@ -1697,6 +2166,7 @@ class MRR(RetrievalMetric):
 class MAP(RetrievalMetric):
     reduction_map = {"mean": ["map"]}
     main_score = "map"
     def _compute(
         self,
@@ -1765,3 +2235,672 @@ class KPA(CustomF1):
     def should_ignore_element(self, element, additional_input):
         return element == "none"

 import re
 import string
 import uuid
+import warnings
 from abc import ABC, abstractmethod
 from collections import Counter
+from copy import deepcopy
 from dataclasses import field
+from statistics import mean
 from typing import Any, Dict, Generator, List, Optional, Tuple
 import evaluate
 import numpy
 import numpy as np
 from scipy.stats import bootstrap
+from scipy.stats._warnings_errors import DegenerateDataWarning
 from .artifact import Artifact
 from .dataclass import InternalField, OptionalField
 from .logging_utils import get_logger
+from .metric_utils import InstanceInput, MetricRequest, MetricResponse
 from .operator import (
     MultiStreamOperator,
     SingleStreamOperator,
 )
 from .operators import CopyFields
 from .random_utils import get_seed
+from .settings_utils import get_settings
 from .stream import MultiStream, Stream
+from .type_utils import isoftype, to_float_or_default
 logger = get_logger()
+settings = get_settings()
+# Silence scipy's DegenerateDataWarning for the whole module; registering the
+# filter once is sufficient, so the duplicated call was dropped.
+warnings.filterwarnings("ignore", category=DegenerateDataWarning)
 def abstract_factory():
     return field(default_factory=abstract_factory)
+def nan_mean(x):
+    """Return the mean of ``x`` with NaN entries ignored, silencing numpy's RuntimeWarning.
+    If every value is NaN, ``np.nanmean`` returns NaN and warns about taking the mean of
+    an empty slice; the NaN result is the desired outcome, only the warning is unwanted.
+    """
+    import warnings
+    with warnings.catch_warnings():
+        warnings.simplefilter("ignore", category=RuntimeWarning)
+        averaged = np.nanmean(x)
+    return averaged
 class UpdateStream(StreamInstanceOperator):
     update: dict
     def main_score(self):
         pass
+    def consume_stream(self, stream: Stream):
+        """Drain ``stream`` and split its instances into parallel lists.
+        Returns:
+            A tuple ``(predictions, references, additional_inputs, instances)``; an instance
+            without an "additional_inputs" key contributes an empty dict.
+        """
+        # one row per instance, read in the order: references, prediction, additional inputs
+        rows = [
+            (
+                instance["references"],
+                instance["prediction"],
+                instance["additional_inputs"] if "additional_inputs" in instance else {},
+                instance,
+            )
+            for instance in stream
+        ]
+        references, predictions, additional_inputs, instances = (
+            [row[column] for row in rows] for column in range(4)
+        )
+        return predictions, references, additional_inputs, instances
+    @staticmethod
+    def update_instance_scores(instances, instances_scores: List[Dict[str, Any]]):
+        """Merge each new score dict into the matching instance's ``["score"]["instance"]`` dict.
+        Missing "score" and "instance" levels are created on demand.  Instances and score
+        dicts are paired positionally with ``zip``, so the shorter sequence bounds the update.
+        """
+        for target, additions in zip(instances, instances_scores):
+            target.setdefault("score", {}).setdefault("instance", {}).update(additions)
+    @staticmethod
+    def set_global_score(instances, global_score: Dict[str, Any]):
+        """Point every instance's ``["score"]["global"]`` entry at ``global_score``.
+        The very same dict object is assigned to all instances (no copy is made), replacing
+        any previous "global" entry; a missing "score" dict is created first.
+        """
+        for target in instances:
+            target.setdefault("score", {})["global"] = global_score
+    @abstractmethod
+    def disable_confidence_interval_calculation(self):
+        """Abstract hook: concrete metrics must provide a way to switch off confidence interval calculation."""
+        pass
+    @abstractmethod
+    def set_n_resamples(self, n_resample):
+        """Abstract hook: concrete metrics must accept a new number of resamples (``n_resample``)."""
+        pass
 class MetricWithConfidenceInterval(Metric):
     # The number of resamples used to estimate the confidence intervals of this metric.
         return np.random.default_rng(hash(get_seed()) & _max_32bit)
     def disable_confidence_interval_calculation(self):
+        """Turn off confidence interval computation by setting ``n_resamples`` to None.
+        Returns:
+            The value ``n_resamples`` held before it was cleared.
+        """
+        previous, self.n_resamples = self.n_resamples, None
+        return previous
+    def set_n_resamples(self, n_resamples):
+        """Store ``n_resamples`` on the metric; None is the value used to mark confidence intervals as disabled."""
+        self.n_resamples = n_resamples
     def _can_compute_confidence_intervals(self, num_predictions):
         return (
             and num_predictions > 1
         )
+    @staticmethod
+    def average_item_scores(instances: List[dict], score_name: str):
+        """Average one instance-level score over ``instances``, skipping NaN values.
+        Args:
+            instances: list of instance dicts, each holding its scores under ["score"]["instance"].
+            score_name: name of the instance score field to average.
         """
+        per_instance_values = [
+            item["score"]["instance"][score_name] for item in instances
+        ]
+        return nan_mean(per_instance_values)
+    def score_based_confidence_interval(
+        self,
+        instances: List[dict],
+        score_names: List[str],
+        aggregation_func=None,
+        ci_score_prefix="",
+    ):
+        """Compute confidence intervals based on existing scores, already computed on the input instances.
+        Unlike GlobalMetric, this is simply a function of the instance scores (possibly taking into account task_data field),
+         so they don't need to be recomputed after every bootstrap draw.
+        Args:
+            instances: The instances for which the confidence intervals are computed; should already have the relevant instance scores calculated.
+            score_names: List of instance score field names to compute a confidence interval for.
+            aggregation_func: A function with arguments instances, field_name; is applied on list of instances (which may include task_data
+                field, as well as the prediction and references), and the field_name; default is simply to take the mean field_name from
+                instances after resampling, if argument is None.
+            ci_score_prefix: An optional string prefix to the score_name in the CI.  Useful in cases where the
+                aggregation_func is something other than the mean
+        Returns:
+            Dict of confidence interval values, keyed "<ci_score_prefix><score_name>_ci_low" and
+            "<ci_score_prefix><score_name>_ci_high"; the main score's interval is additionally stored under
+            "score_ci_low" / "score_ci_high".  The dict is empty when confidence intervals cannot be computed.
+        """
         result = {}
+        # bail out with an empty result when CI computation is not possible for this number of instances
         if not self._can_compute_confidence_intervals(num_predictions=len(instances)):
             return result
+        ci_score_prefix = str(ci_score_prefix)
+        if aggregation_func is None:
+            # if aggregation_func is None, we simply take the mean of the resampled instance scores
+            # otherwise, the aggregation_func needs to be applied AFTER resampling the instances;
+            #   that is, re-form the groups, calculate the function, and take the mean of the group scores
+            aggregation_func = self.average_item_scores
         for score_name in score_names:
+            # need to redefine the statistic function within the loop because score_name is a loop variable
+            def statistic(arr, axis, score_name=score_name):
+                # arr is a 2d array where each row is a resampling, so we
+                # iterate over the rows and compute the metric on each resampling
+                scores = numpy.apply_along_axis(
+                    lambda resampled_instances: aggregation_func(
+                        resampled_instances, score_name
+                    ),
+                    axis=axis,
+                    arr=arr,
+                )
+                return self.resample_from_non_nan(scores)
+            # apply bootstrap only on the relevant field
             ci = bootstrap(
+                (instances,),
+                statistic=statistic,
                 n_resamples=self.n_resamples,
                 confidence_level=self.confidence_level,
                 random_state=self.new_random_generator(),
             ).confidence_interval
+            full_score_name = ci_score_prefix + score_name
+            result[f"{full_score_name}_ci_low"] = ci.low
+            result[f"{full_score_name}_ci_high"] = ci.high
+            # the main score's interval is also exposed under the generic (unprefixed) score_ci_* keys
             if score_name == self.main_score:
                 result["score_ci_low"] = ci.low
                 result["score_ci_high"] = ci.high
         return result
+    def resample_from_non_nan(self, values):
+        """Given an array values, will replace any NaN values with elements resampled with replacement from the non-NaN ones.
+        here we deal with samples on which the metric could not be computed. These are
+        edge cases - for example, when the sample contains only empty strings.
+        CI is about the distribution around the statistic (e.g. mean), it doesn't deal with
+        cases in which the metric is not computable. Therefore, we ignore these edge cases
+        as part of the computation of CI.
+        In theory there would be several ways to deal with this:
+        1. skip the errors and return a shorter array => this fails because Scipy requires
+        this callback (i.e. the statistic() callback) to return an array of the same size
+        as the number of resamples
+        2. Put np.nan for the errors => this fails because in such case the ci itself
+        becomes np.nan. So one edge case can fail the whole CI computation.
+        3. Replace the errors with a sampling from the successful cases => this is what is implemented.
+        This resampling makes it so that, if possible, the bca confidence interval returned by bootstrap will not be NaN, since
+        bootstrap does not ignore NaNs.  However, if there are 0 or 1 non-NaN values, or all non-NaN values are equal,
+        the resulting distribution will be degenerate (only one unique value) so the CI will still be NaN since there is
+        no variability.  In this case, the CI is essentially an interval of length 0 equaling the mean itself.
+        Args:
+            values: numpy array of aggregate scores, possibly containing NaN; it is modified in place.
+        Returns:
+            The same array object.  NaNs are replaced only when the array has more than one element and
+            holds at least one NaN and at least one non-NaN value; otherwise it is returned unchanged.
+        """
+        if values.size > 1:
+            error_indices = numpy.isnan(values)
+            # count with numpy instead of the builtin sum(): it avoids a Python-level loop and
+            # always yields a scalar, even if the mask has more than one dimension
+            n_errors = numpy.count_nonzero(error_indices)
+            if 0 < n_errors < values.size:
+                # replace NaN aggregate scores with random draws from non-NaN scores, so that confidence interval isn't NaN itself
+                values[error_indices] = self.new_random_generator().choice(
+                    values[~error_indices], n_errors, replace=True
+                )
+        return values
     def compute_global_confidence_intervals(
+        self, references, predictions, task_data, score_name
     ):
         """Computed confidence intervals for a set of references and predictions."""
         random_gen = self.new_random_generator()
         def statistic(arr, axis):
             # arr is a 2d array where each row is a resampling, so we
             # iterate over the rows and compute the metric on each resampling
+            def metric(sample_refs, sample_preds, sample_task_data):
                 try:
                     return self._compute(
                         references=sample_refs,
                         predictions=sample_preds,
+                        task_data=sample_task_data,
                     )["score"]
                 except Exception as e:
                     # this happens in edge cases, for example, when the sampling creates a
                     logger.info(f"Warning in {self.__class__.__name__}", e)
                     return np.nan
+            # resample the instance scores, and then return the global score each time
             scores = numpy.apply_along_axis(
                 lambda x: metric(
                     sample_refs=[references[i] for i in x],
                     sample_preds=[predictions[i] for i in x],
+                    sample_task_data=[task_data[i] for i in x],
                 ),
                 axis=axis,
                 arr=arr,
             )
+            # in some resamplings of instances, the global score may be NaN since it cannot be computed;
+            # in these cases, the bca confidence interval will be NaN because it does not ignore these values,
+            # so we replace any NaN values with those resampled from the non-NaN ones.
+            return self.resample_from_non_nan(scores)
         result = {}
         num_predictions = len(predictions)
     need to be considered.  Accuracy, on the other hand, is just an average of the accuracy of all the instances.
     """
+    n_resamples: int = OptionalField(
+        default_factory=lambda: settings.num_resamples_for_global_metrics
+    )
+    process_single_instances = True
     def process(self, stream: Stream, stream_name: Optional[str] = None) -> Generator:
         references = []
         predictions = []
+        task_data = []
         global_score = {}
         instances = []
             predictions.append(instance_prediction)
             instances.append(instance)
+            instance_task_data = (
+                instance["task_data"] if "task_data" in instance else {}
             )
+            task_data.append(instance_task_data)
+            instance_score = None
+            # for backward compatibility
+            no_score_value = np.nan
+            if self.process_single_instances:
+                try:
+                    instance_score = self._compute(
+                        [instance_references],
+                        [instance_prediction],
+                        [instance_task_data],
+                    )
+                except:
+                    no_score_value = None
+            if not instance_score:
+                instance_score = {
+                    "score": no_score_value,
+                    "score_name": self.main_score,
+                }
                 if isinstance(self.main_score, str):
+                    instance_score[self.main_score] = no_score_value
             instance["score"]["instance"].update(instance_score)
+        result = self._compute(references, predictions, task_data)
         global_score.update(result)
         score_name = global_score["score_name"]
         confidence_interval = self.compute_global_confidence_intervals(
+            references, predictions, task_data, score_name
         )
         global_score.update(confidence_interval)
         self,
         references: List[List[str]],
         predictions: List[str],
+        task_data: List[Any],
     ) -> dict:
+        result = self.compute(references, predictions, task_data)
         result["score"] = result[self.main_score]
         result["score_name"] = self.main_score
         return result
         self,
         references: List[List[Any]],
         predictions: List[Any],
+        task_data: List[Any],
     ) -> dict:
+        """Computes a scores dictionary on a list of references, predictions and input.
+        This function is called once per instance, and then another time
+        over all data instances.
+        Returns:
+            a dictionary of scores that is set as:
+              the instance scores when called on a single data instance
+              the global score when called on the all data instances
+        """
         pass
 class BulkInstanceMetric(SingleStreamOperator, MetricWithConfidenceInterval):
+    n_resamples: int = OptionalField(
+        default_factory=lambda: settings.num_resamples_for_instance_metrics
+    )
     main_score: str
     reduction_map: Dict[str, List[str]]
             ),
         )
+        task_data = [
+            instance["task_data"] if "task_data" in instance else {}
             for instance in stream
         ]
         instance_scores = self.compute(
             references=references,
             predictions=predictions,
+            task_data=task_data,
         )
         # add the score and score_name fields
             ), f"Reduction {reduction} is not implemented, use one of {self.implemented_reductions}"
             if reduction == "mean":
                 for field_name in fields:
                     global_score[field_name] = mean(
                         [
                         global_score["score"] = global_score[field_name]
                         global_score["score_name"] = self.main_score
+                ci_fields = (
+                    list(set(self.ci_scores))
+                    if self.ci_scores is not None
+                    else [self.main_score]
+                )
                 confidence_interval = self.score_based_confidence_interval(
+                    instances=instances, score_names=ci_fields
                 )
                 global_score.update(confidence_interval)
         self,
         references: List[List[Any]],
         predictions: List[Any],
+        task_data: List[Dict],
     ) -> List[Dict[str, Any]]:
         pass
 class InstanceMetric(SingleStreamOperator, MetricWithConfidenceInterval):
+    """Class for metrics for which a global score can be calculated by aggregating the instance scores (possibly with additional instance inputs).
+    InstanceMetric currently allows two reductions:
+    1. 'mean', which calculates the mean of instance scores,
+    2. 'group_mean', which first applies an aggregation function specified in the reduction_map
+        to instance scores grouped by the field grouping_field (which must not be None), and returns the mean
+        of the group scores; if grouping_field is None, grouping is disabled.
+        See _validate_group_mean_reduction for formatting instructions.
+    """
+    n_resamples: int = OptionalField(
+        default_factory=lambda: settings.num_resamples_for_instance_metrics
+    )
+    # some group_mean aggregation functions (3rd element of "agg_func" list in the reduction)
+    # only require a list of instance scores (e.g., mean, median, etc.).  Others aggregation functions
+    # require an additional column (e.g., a subgroup identifier) by which the instance scores will be grouped
+    # if subgroup_column is not None, a column by the specified name will be required in task_data
+    subgroup_column = None
+    implemented_reductions: List[str] = field(
+        default_factory=lambda: ["mean", "group_mean"]
+    )
     @property
     @abstractmethod
     def reduction_map(self) -> dict:
         pass
+    def _validate_group_mean_reduction(self, instances: List[dict]):
+        """Ensure that group_mean reduction_map is properly formatted.
+        Example: Apply the variance (np.var) to group Accuracy instance scores.  This class would be specified as follows:
+        class GroupVarianceAccuracy(Accuracy):
+            reduction_map = {'group_mean': {'agg_func': ['variance', np.var, True]}}
+        reduction_map must be a dict with values containing
+        - an 'agg_func' field with value being a 3-element list where
+            - 1st element is a string name of the aggregation function (used in naming the CI report)
+            - 2nd element is the callable aggregation function
+            - 3rd element is a Boolean indicator of whether, during bootstrap CI calculation, the groups are to be sampled as single units.
+                If True, the group scores are calculated and then resampled.  This treats the group units as the unit of
+                interest for which the CI is being compared.
+                If False, the instances are resampled individually, and the groups determined
+                (meaning the groups may be of slightly different size or composition from the original
+                depending on the resampling of the instances).
+        - Optional: 'score_fields' key with list value containing the string names of fields to apply the aggregation to
+            - If not present, the parent class main_score is used.
+        The aggregation function (2nd element of agg_func) can be one of two types:
+        1. simple: calculate a summary statistic from a single group of values (e.g. mean, median, etc.).
+            This is best suited for cases where the instances are independent of each other, other than belonging to the same group
+        2. comparison: requires subgroup_column to be specified.  This function conducts
+            a comparison between scores for differing values of subgroup_column (e.g., 'original' vs 'paraphrase').
+            An example is where the original instance is a question, and the others are various paraphrases
+            or perturbations of this question.  Here, the function would return, say, a comparison of the instance accuracies
+            rather than, say, the average instance accuracy.
+            In these cases, we recommend setting the 3rd parameter to be True so that the groups are resampled together.
+        Example:
+            class GroupVsBaselineDiffAccuracy(Accuracy):
+                subgroup_column = 'variant_type'
+                reduction_map = {'group_mean': {'agg_func': ['accuracy_diff', accuracy_diff, True],}}
+            # where the function is defined as
+            def accuracy_diff(subgroup_scores_dict, expected_subgroup_types=['original', 'paraphrase']):
+                validate_subgroup_types(subgroup_scores_dict, expected_subgroup_types)
+                from statistics import mean
+                return mean(subgroup_scores_dict['paraphrase']) - mean(subgroup_scores_dict['original'])
+            The input dataset should look like:
+            'group_id'  'question'                                   'variant_type'
+            1           'How do you fix a car engine?'               'original'
+            1           'What is the best way to fix an engine?'     'paraphrase'
+            1           'How do you repair a car engine?'            'paraphrase'
+            1           'How do I repair my engine?'                 'paraphrase'
+            2           'Why are ants eating my food?'               'original'
+        Raises:
+            AssertionError: if an instance lacks a dict-valued task_data with a group_id key (or the
+                subgroup_column key, when subgroup_column is set), or if reduction_map['group_mean'] is malformed.
+        """
+        # instances need to all have task_data field with field group_id
+        assert all(
+            "task_data" in instance for instance in instances
+        ), "each instance must have an task_data field"
+        assert all(
+            isinstance(instance["task_data"], dict) for instance in instances
+        ), "each instance must have an task_data field that is a dict"
+        assert all(
+            "group_id" in instance["task_data"] for instance in instances
+        ), "each instance task_data dict must have a key group_id"
+        # validate the reduction_map
+        assert (
+            "group_mean" in self.reduction_map
+        ), "reduction_map must have a 'group_mean' key"
+        fields = self.reduction_map["group_mean"]
+        # for group_mean, expects a dict
+        assert isinstance(fields, dict)
+        assert (
+            "agg_func" in fields
+        ), "fields should have a key 'agg_func' whose value is a 3-element list of a function name, function definition, and a boolean indicator"
+        assert isinstance(
+            fields["agg_func"], list
+        ), "fields['agg_func'] should be a list"
+        assert (
+            len(fields["agg_func"]) == 3
+        ), "fields['agg_func'] should be a 3-element list"
+        assert isinstance(
+            fields["agg_func"][0], str
+        ), "first item in fields['agg_func'] should be a string name of a function"
+        assert callable(
+            fields["agg_func"][1]
+        ), "second item in fields['agg_func'] should be a callable function"
+        assert isinstance(
+            fields["agg_func"][2], bool
+        ), "third item in fields['agg_func'] should be a boolean value"
+        if "score_fields" in fields:
+            assert isinstance(fields["score_fields"], list)
+        # for aggregation functions that use the subgroup_column (expect a dict of lists), check that
+        # this field exists
+        if self.subgroup_column is not None:
+            assert all(
+                self.subgroup_column in instance["task_data"] for instance in instances
+            ), f"each instance task_data dict must have a key {self.subgroup_column}"
     def process(self, stream: Stream, stream_name: Optional[str] = None) -> Generator:
+        """Score every instance of the stream, fill in the global scores, then yield the instances.
+        For each reduction in reduction_map ('mean' or 'group_mean') the global value of every reduction
+        field is stored in global_score (prefixed for group reductions), and confidence intervals are
+        added for the fields listed in ci_scores, if any.
+        Raises:
+            AssertionError: if a reduction type is not listed in implemented_reductions.
+            ValueError: if a reduction type is listed there but is neither 'mean' nor 'group_mean'.
+        """
+        instances, global_score = self.compute_instance_scores(stream)
+        for reduction_type, reduction_params in self.reduction_map.items():
+            assert (
+                reduction_type in self.implemented_reductions
+            ), f"Reduction {reduction_type} is not implemented, use one of {self.implemented_reductions}"
+            field_name_full_prefix = ""
+            # used for passing to the bootstrapping, depends on whether the groups are fixed or not
+            aggregation_function = self.average_item_scores
+            if reduction_type == "mean":
+                reduction_fields = list(set(reduction_params))
+                # no group reduction, so resample instances individually
+                scores_to_resample = instances
+            elif reduction_type == "group_mean":
+                self._validate_group_mean_reduction(instances=instances)
+                reduction_fields = (
+                    [self.main_score]
+                    if "score_fields" not in reduction_params
+                    else list(set(reduction_params["score_fields"]))
+                )
+                aggregation_function_name = str(reduction_params["agg_func"][0])
+                field_name_full_prefix = "group_" + aggregation_function_name + "_"
+                do_resample_as_group = reduction_params["agg_func"][2]
+                if do_resample_as_group:
+                    # append fixed_ to name because resamples the groups as fixed units
+                    field_name_full_prefix = "fixed_" + field_name_full_prefix
+                (
+                    scores_to_resample,
+                    aggregation_function,
+                ) = self._set_up_group_mean_aggregation(
+                    instances, reduction_params, reduction_fields
+                )
+            else:
+                raise ValueError(
+                    f"Reduction {reduction_type} is not supported, please specify a valid reduction method in reduction_map {self.reduction_map}."
+                )
+            # calculate global scores for each reduction field
+            for field_name in reduction_fields:
+                field_name_full = field_name_full_prefix + field_name
+                # if group resampling (3rd element of agg_func parameter) is True, then
+                #   1. scores_to_resample are the group scores, and
+                #   2. aggregation_function is to take the raw mean
+                # if group resampling (3rd element of agg_func parameter) is False, then
+                #   1. scores_to_resample are the original instance scores, and
+                #   2. aggregation_function is to apply the group aggregation from the instance scores
+                # either way, the application of aggregation_function to scores_to_resample yields the global score
+                global_score[field_name_full] = aggregation_function(
+                    scores_to_resample, field_name
+                )
+                if field_name == self.main_score:
+                    global_score["score"] = global_score[field_name_full]
+                    global_score["score_name"] = field_name_full
+            # need to specify which fields should have CIs calculated for them through ci_scores
+            # (will not automatically calculate CIs for fields in reduction map)
+            if self.ci_scores is not None:
+                confidence_interval = self.score_based_confidence_interval(
+                    instances=scores_to_resample,
+                    score_names=list(set(self.ci_scores)),
+                    ci_score_prefix=field_name_full_prefix,
+                    aggregation_func=aggregation_function,
+                )
+                global_score.update(confidence_interval)
+        yield from instances
+    def compute_instance_scores(
+        self, stream: Stream, stream_name: Optional[str] = None
+    ):
         global_score = {}
         instances = []
         for instance in stream:
             refs, pred = instance["references"], instance["prediction"]
+            task_data = instance["task_data"] if "task_data" in instance else {}
             instance_score = self.compute(
+                references=refs, prediction=pred, task_data=task_data
             )
             instance_score["score"] = instance_score[self.main_score]
             instance_score["score_name"] = self.main_score
             instances.append(instance)
+        return instances, global_score
+    def get_group_scores(
+        self, instances: List[dict], score_names: List[str], group_aggregation_func
+    ):
+        """Group scores by task_data["group_id"] (and by subgroup_column value, if set), and compute group_aggregation_func by group.
+        Args:
+            instances: List of observation instances with instance-level scores (fields) computed.
+            score_names: List of instance score names in each instance to apply the aggregation function.
+            group_aggregation_func: Callable aggregation function accepting a list of numeric scores;
+                or, if self.subgroup_column is not None, a dict of subgroup types scores by subgroup_column value.
+                callable function returns a single score for the group
+        Returns:
+            List of dicts, each corresponding to a group of instances (defined by 'group_id'),
+                with an aggregate group score for each score_name, stored under ["score"]["instance"][score_name]
+        """
+        from collections import defaultdict
+        # three-level defaultdict:
+        # first is the grouping, second is the field name, the third is the subgroup_type (by default 'default')
+        group_to_instance_scores = defaultdict(
+            lambda: defaultdict(lambda: defaultdict(list))
+        )
+        # check if function has fields for subgroup_column
+        uses_subgroups = self.subgroup_column is not None
+        default_subgroup_name = "default"
+        # loop through the instances and group the scores
+        for instance in instances:
+            # every instance is expected to carry task_data with a group_id key (a KeyError is raised otherwise)
+            task_data = instance["task_data"]
+            group_key = task_data["group_id"]
+            # for functions that do comparisons between subgroup_column groups
+            # if function doesn't use subgroup_column, or none is present, set "default" as default value, and pass all scores
+            subgroup_type = (
+                task_data[self.subgroup_column]
+                if uses_subgroups
+                else default_subgroup_name
+            )
+            for score_name in score_names:
+                group_to_instance_scores[group_key][score_name][subgroup_type].append(
+                    instance["score"]["instance"][score_name]
                 )
+        # if group_aggregation_func expects a subgroup-types score dict, pass it; otherwise pass the default type list of scores
+        return [
+            {
+                "score": {
+                    "instance": {
+                        score_name: group_aggregation_func(
+                            score_dict
+                            if uses_subgroups
+                            else score_dict[default_subgroup_name]
+                        )
+                        for score_name, score_dict in group_scores.items()
+                    }
+                }
+            }
+            for group_scores in group_to_instance_scores.values()
+        ]
+    def _set_up_group_mean_aggregation(
+        self, instances, reduction_params, reduction_fields
+    ):
+        """Choose what is resampled and how it is aggregated for a 'group_mean' reduction.
+        Args:
+            instances: instances with instance-level scores already computed.
+            reduction_params: the 'group_mean' entry of reduction_map; its 'agg_func' list supplies the
+                group aggregation callable (2nd element) and the resample-as-group flag (3rd element).
+            reduction_fields: score names for which group scores are computed when groups are resampled as units.
+        Returns:
+            A tuple (scores_to_resample, aggregation_function).  When the flag is true, these are the
+            precomputed group scores and average_item_scores; otherwise they are the original instances and
+            a function that regroups the given instances and returns the NaN-ignoring mean of the group scores.
+        """
+        group_aggregation_func = reduction_params["agg_func"][1]
+        # if treat groups as units
+        do_resample_as_group = reduction_params["agg_func"][2]
+        if do_resample_as_group:
+            # pass the group aggregate---not instance---scores to resample as usual
+            aggregation_function = self.average_item_scores
+            scores_to_resample = self.get_group_scores(
+                instances, reduction_fields, group_aggregation_func
+            )
+        else:
+            # pass the instance scores to resample, and calculate the group aggregation on the resamplings
+            scores_to_resample = instances
+            # group_aggregation_func is bound as a default argument so the closure keeps the value chosen here
+            def aggregation_function(
+                instances,
+                field_name,
+                group_aggregation_func=group_aggregation_func,
+            ):
+                group_scores = self.get_group_scores(
+                    instances, [field_name], group_aggregation_func
+                )
+                return nan_mean(
+                    [group["score"]["instance"][field_name] for group in group_scores]
+                )
+        return scores_to_resample, aggregation_function
     @abstractmethod
+    def compute(self, references: List[Any], prediction: Any, task_data: Dict) -> dict:
         pass
         self,
         references: List[List[str]],
         predictions: List[str],
+        task_data: List[Dict],
     ) -> dict:
         ids = [str(uuid.uuid4()).replace("-", "") for _ in range(len(predictions))]
         formatted_predictions = [
 class Accuracy(InstanceMetric):
     reduction_map = {"mean": ["accuracy"]}
     main_score = "accuracy"
+    ci_scores = ["accuracy"]
     def compute(
+        self, references: List[Any], prediction: Any, task_data: List[Dict]
     ) -> dict:
         result = {
             self.main_score: float(
 class StringContainment(InstanceMetric):
     reduction_map = {"mean": ["string_containment"]}
     main_score = "string_containment"
+    ci_scores = ["string_containment"]
     def compute(
+        self, references: List[Any], prediction: Any, task_data: List[Dict]
     ) -> dict:
         result = {
             self.main_score: float(
+                any(str(reference) in str(prediction) for reference in references)
             )
         }
         result["score"] = result[self.main_score]
     )
     metric: Metric = None
+    def disable_confidence_interval_calculation(self):
+        return self.metric.disable_confidence_interval_calculation()
+    def set_n_resamples(self, n_resample):
+        if isinstance(self.metric, MetricWithConfidenceInterval):
+            self.metric.set_n_resamples(n_resample)
     def verify(self):
         assert self.main_score is not None, "main_score is not set"
         self,
         references: List[List[Any]],
         predictions: List[Any],
+        task_data: List[Dict],
     ) -> dict:
+        passed_task_data = {}
         for additional_input_field in self.hf_additional_input_fields:
             assert (
+                additional_input_field in task_data[0]
+            ), f"'{additional_input_field}' field required by {__class__.__name__} is not in passed in task_data: {task_data[0]}"
+            passed_task_data[additional_input_field] = [
                 additional_input[additional_input_field]
+                for additional_input in task_data
             ]
         for additional_input_field in self.hf_additional_input_fields_pass_one_value:
             assert (
+                additional_input_field in task_data[0]
+            ), f"'{additional_input_field}' field required by {__class__.__name__} is not in passed in task_data: {task_data[0]}"
             values = {
                 additional_input[additional_input_field]
+                for additional_input in task_data
             }
             assert (
                 len(values) == 1
             ), f"Values of '{additional_input_field}' field required by {__class__.__name__}  should all be the same, but have multiple values {values}"
+            passed_task_data[additional_input_field] = next(iter(values))
+        # add check that all required fields in self.metrics are in passed_task_data       print(passed_task_data)
         result = self.metric.compute(
             predictions=predictions,
             references=references,
+            **passed_task_data,
             **self.hf_compute_args,
         )
         if self.hf_main_score:
         self,
         references: List[List[str]],
         predictions: List[str],
+        task_data: List[Any],
     ) -> List[Dict[str, Any]]:
+        passed_task_data = {}
         for additional_input_field in self.hf_additional_input_fields:
             assert (
+                additional_input_field in task_data[0]
+            ), f"'{additional_input_field}' field required by {__class__.__name__} is not in passed in task_data: {task_data[0]}"
+            passed_task_data[additional_input_field] = [
                 additional_input[additional_input_field]
+                for additional_input in task_data
             ]
+        # add check that all required fields in self.metrics are in passed_task_data
         scores = self.metric.compute(
             predictions=predictions,
             references=references,
+            **passed_task_data,
             **self.hf_compute_args,
         )
         self,
         references: List[List[str]],
         predictions: List[str],
+        task_data: List[Dict],
     ) -> dict:
         assert all(
             len(reference) == 1 for reference in references
             average=self.average,
         )
         if isinstance(result["f1"], numpy.ndarray):
             final_result = {self.main_score: mean(result["f1"])}
             for i, label in enumerate(labels):
                 final_result["f1_" + self.id_to_str[label]] = result["f1"][i]
     _metric = None
     main_score = "f1_macro"
     average = None  # Report per class then aggregate by mean
     metric = "f1"
     def prepare(self):
         self,
         references: List[List[str]],
         predictions: List[List[str]],
+        task_data: List[Dict],
     ) -> dict:
         self.str_to_id = {}
         self.id_to_str = {}
         self._validate_references_and_prediction(references, predictions)
         references = [reference[0] for reference in references]
+        labels = list({label for reference in references for label in reference})
         # if no classes are left then F1 is not defined
         if len(labels) == 0:
             return {self.main_score: float("nan")}
             labels=labels_param,
         )
         if isinstance(result[self.metric], numpy.ndarray):
             assert (
                 len(result[self.metric]) == len(labels)
             ), f"F1 result ({result[self.metric]}) has more entries than labels ({labels})"
     sent_split_newline: bool = True
+    _requirements_list: List[str] = ["nltk", "rouge_score"]
     def prepare(self):
         super().prepare()
         nltk.download("punkt")
         self.sent_tokenize = nltk.sent_tokenize
+    def compute(self, references, predictions, task_data: List[Dict]):
         if self.sent_split_newline:
             predictions = [
                 "\n".join(self.sent_tokenize(prediction.strip()))
                 ["\n".join(self.sent_tokenize(r.strip())) for r in reference]
                 for reference in references
             ]
+        return super().compute(references, predictions, task_data)
 # Computes char edit distance, ignoring whitespace
 class CharEditDistanceAccuracy(InstanceMetric):
     reduction_map = {"mean": ["char_edit_dist_accuracy"]}
     main_score = "char_edit_dist_accuracy"
+    ci_scores = ["char_edit_dist_accuracy"]
+    _requirements_list: List[str] = ["editdistance"]
     def prepare(self):
         super().prepare()
         self.eval = editdistance.eval
+    def compute(self, references, prediction: str, task_data: List[Dict]) -> dict:
         assert (
             len(references) == 1
         ), f"Expected only one reference , but received: {references}"
     hf_metric_name = "wer"
     main_score = "wer"
+    _requirements_list: List[str] = ["jiwer"]
     def compute(
         self,
         references: List[List[str]],
         predictions: List[str],
+        task_data: List[Dict],
     ) -> dict:
         assert all(
             len(reference) == 1 for reference in references
         return {self.main_score: result}
class Spearmanr(HuggingfaceMetric):
    """HuggingfaceMetric configured with the metric name ``"spearmanr"``.

    All computation is inherited; this class only sets configuration values.
    """

    # Name handed to the HuggingfaceMetric machinery.
    hf_metric_name = "spearmanr"
    # Key under which the primary score is reported.
    main_score = "spearmanr"
    # Disables per-instance processing in the base class.
    process_single_instances = False
class KendallTauMetric(GlobalMetric):
    """Global metric reporting Kendall's tau between references and predictions.

    Both sides are converted with ``to_float_or_default`` and passed to
    ``scipy.stats.kendalltau`` using the configured ``variant``.
    """

    main_score = "kendalltau_b"
    variant = "b"
    process_single_instances = False

    _requirements_list: List[str] = ["scipy"]

    def prepare(self):
        # The import happens here rather than at module load.
        from scipy.stats import kendalltau as scipy_kendalltau

        self.kendalltau = scipy_kendalltau

    def compute(
        self,
        references: List[List[str]],
        predictions: List[str],
        task_data: List[Dict],
    ) -> dict:
        """Return the correlation and its p-value.

        Returns:
            dict with ``main_score`` mapped to the correlation and
            ``f"{main_score}_p_val"`` mapped to the p-value.
        """
        # When each instance carries a list of references, only the first is used.
        if isinstance(references[0], list):
            references = [instance_refs[0] for instance_refs in references]

        gold_values = list(map(to_float_or_default, references))
        predicted_values = list(map(to_float_or_default, predictions))

        outcome = self.kendalltau(
            gold_values, predicted_values, variant=self.variant
        )
        p_value_key = f"{self.main_score}_p_val"
        return {
            self.main_score: outcome.correlation,
            p_value_key: outcome.pvalue,
        }
 class MatthewsCorrelation(HuggingfaceMetric):
     hf_metric_name = "matthews_correlation"
     main_score = "matthews_correlation"
         self,
         references: List[List[str]],
         predictions: List[str],
+        task_data: List[Dict],
     ) -> dict:
         formatted_references = [
             self.get_str_id(reference[0]) for reference in references
         )
class RocAuc(GlobalMetric):
    """Global metric reporting the area under the ROC curve.

    References and predictions are converted with ``to_float_or_default`` and
    passed to scikit-learn's ``roc_curve`` as ``y_true`` and ``y_score``.
    """

    main_score = "roc_auc"
    process_single_instances = False

    _requirements_list: List[str] = ["sklearn"]

    def prepare(self):
        # The import happens here rather than at module load.
        from sklearn import metrics as sk_metrics

        self.roc_curve = sk_metrics.roc_curve
        self.auc = sk_metrics.auc

    def compute(
        self,
        references: List[List[str]],
        predictions: List[str],
        task_data: List[Dict],
    ) -> dict:
        """Return ``{main_score: auc}`` for the given references and predictions."""
        # When each instance carries a list of references, only the first is used.
        if isinstance(references[0], list):
            references = [instance_refs[0] for instance_refs in references]

        y_true = list(map(to_float_or_default, references))
        y_score = list(map(to_float_or_default, predictions))

        # roc_curve also returns the thresholds, which are not needed here.
        false_positive_rate, true_positive_rate, _ = self.roc_curve(
            y_true=y_true, y_score=y_score
        )
        area = self.auc(false_positive_rate, true_positive_rate)
        return {self.main_score: area}
 class CustomF1(GlobalMetric):
     main_score = "f1_micro"
     groups = None
         except ZeroDivisionError:
             return self.zero_division
+    def get_groups(self, elements, task_data):
         groups = set()
+        for sublist, additional_input in zip(elements, task_data):
             for e in sublist:
                 if self.should_ignore_element(e, additional_input):
                     continue
         self,
         references: List[List[Any]],
         predictions: List[Any],
+        task_data: List[Dict],
     ) -> dict:
         # in case reference are List[List[List[Any]]] and predictions are List[List[Any]]:
         if (
         )
         if self.groups is None:
+            groups = self.get_groups(references, task_data)
         else:
             groups = self.groups
         groups_statistics = {}
         for references_batch, predictions_batch, additional_input in zip(
+            references, predictions, task_data
         ):
             grouped_references = self.group_elements(references_batch, additional_input)
             grouped_predictions = self.group_elements(
     ci_scores = ["f1", "precision", "recall"]
     def compute(
+        self, references: List[Any], prediction: Any, task_data: List[Dict]
     ) -> dict:
         results = [
+            self._compute_single_ref(str(reference), str(prediction))
+            for reference in references
         ]
         return {
             measure: max(r[i] for r in results)
     def _compute_single_ref(
         self, reference: Any, prediction: Any
     ) -> Tuple[float, float, float]:
+        prediction_tokens = normalize_answer(str(prediction)).split()
+        reference_tokens = normalize_answer(str(reference)).split()
         common = Counter(prediction_tokens) & Counter(reference_tokens)
         num_same = sum(common.values())
         if num_same == 0:
     ci_scores = ["f1", "precision", "recall"]
     model_name: str
+    _requirements_list: List[str] = ["bert_score"]
     def prepare(self):
         super().prepare()
+        self.hf_compute_args = {"model_type": self.model_name, "batch_size": 16}
 class SentenceBert(BulkInstanceMetric):
     model_name: str
+    _requirements_list: List[str] = ["sentence_transformers"]
     def prepare(self):
         super().prepare()
+        import torch
         from sentence_transformers import SentenceTransformer
         from sentence_transformers import util as sbert_util
+        self.device = "cuda:0" if torch.cuda.is_available() else "cpu"
+        self.model = SentenceTransformer(self.model_name, device=self.device)
         self.util = sbert_util
     def compute(
         self,
         references: List[List[Any]],
         predictions: List[Any],
+        task_data: List[Dict],
     ) -> List[Dict[str, Any]]:
         scores = []
             count += len(ref_group)
         # compute s-bert embeddings
+        preds_emb = self.model.encode(predictions, device=self.device)
         refs_emb = self.model.encode(
+            [ref for ref_group in references for ref in ref_group], device=self.device
         )
         # for each candidate, pick the reference with the highest score
     model_name: str
+    _requirements_list: List[str] = ["transformers"]
     def prepare(self):
         super().prepare()
+        import torch
         from transformers import pipeline
+        device = "cuda:0" if torch.cuda.is_available() else "cpu"
+        self.pipe = pipeline(
+            "text-classification", model=self.model_name, device=device
+        )
     def compute(
         self,
         references: List[List[Any]],
         predictions: List[Any],
+        task_data: List[Dict],
     ) -> List[Dict[str, Any]]:
         # treat the references as the questions and the predictions as answers
         # assume a single reference
     batch_size: int = 32
     model_name: str
+    _requirements_list: List[str] = ["transformers"]
     def compute(
         self,
         references: List[List[Any]],
         predictions: List[Any],
+        task_data: List[Dict],
     ) -> List[Dict[str, Any]]:
         """Computes the likelihood of generating text Y after text X - P(Y|X).
+        :param predictions: the list of Y texts = the targets of the generation
+        :param references: the list of list of X texts = the sources of the generation
+        :return: the likelihood of generating text Y_i after each text X_i_j = P(Y_i|X_i_1), ..., P(Y_i|X_i_n)  for every i.
         """
         sources = []
         targets = []
         for prediction, instance_references in zip(predictions, references):
             for instance_reference in instance_references:
+                sources.append(f"{self.perplexity_prompt} {instance_reference}")
+                targets.append(prediction)
         from transformers import AutoConfig
             from transformers import AutoTokenizer
             self.model_name = model_name
+            self.device = "cuda:0" if torch.cuda.is_available() else "cpu"
+            self.model = (
+                self.model_class().from_pretrained(self.model_name).to(self.device)
+            )
             self.tokenizer = AutoTokenizer.from_pretrained(self.model_name)
         def compute_lm(
             self, source: List[str], target: List[str], batch_size: int
             return AutoModelForSeq2SeqLM
         def compute_batch(self, tokens_source, tokens_target):
+            tokens_docs_ids = tokens_source["input_ids"].to(self.device)
+            attention = tokens_source["attention_mask"].to(self.device)
+            labels = tokens_target["input_ids"].to(self.device)
             logits = self.model(
                 input_ids=tokens_docs_ids.long(),
             # replace the padding token in the labels by -100
             labels[labels == self.tokenizer.pad_token_id] = -100
+            tokens = tokens.to(self.device)
+            attention = attention.to(self.device)
+            labels = labels.to(self.device)
             # no need to pass labels as we calculate the loss below per document
             model_output = self.model(
     main_score = "nDCG"
+    _requirements_list: List[str] = ["sklearn"]
     def prepare(self):
         from sklearn.metrics import ndcg_score
         self,
         references: List[List[Any]],
         predictions: List[Any],
+        task_data: List[Any],
     ) -> dict:
         from collections import defaultdict
         query_to_predictions_and_references = defaultdict(lambda: [[], []])
+        for reference, pred, inputs_dict in zip(references, predictions, task_data):
             query = inputs_dict.get("query")
             query_to_predictions_and_references[query][0].append(pred)
             query_to_predictions_and_references[query][1].append(reference)
 class RetrievalMetric(InstanceMetric):
+    def compute(self, references: List[Any], prediction: Any, task_data: Dict) -> dict:
         # digest input
         pred_ids: List[Any] = prediction
         ref_ids: List[Any] = list(dict.fromkeys(references))
 class MRR(RetrievalMetric):
     reduction_map = {"mean": ["mrr"]}
     main_score = "mrr"
+    ci_scores = ["mrr"]
     def _compute(
         self,
 class MAP(RetrievalMetric):
     reduction_map = {"mean": ["map"]}
     main_score = "map"
+    ci_scores = ["map"]
     def _compute(
         self,
     def should_ignore_element(self, element, additional_input):
         return element == "none"
class RemoteMetric(SingleStreamOperator, Metric):
    """A metric that runs another metric remotely.

    main_score: the score updated by this metric.
    endpoint: the remote host that supports the remote metric execution.
    metric_name: the name of the metric that is executed remotely.
    api_key: optional, passed to the remote metric with the input, allows secure authentication.
    """

    main_score: str = None
    endpoint: str
    metric_name: str
    api_key: str = None

    @staticmethod
    def wrap_inner_metric_pipeline_metric(
        metric_pipeline: MetricPipeline, remote_metrics_endpoint: str
    ) -> MetricPipeline:
        """Wrap the inner metric in a MetricPipeline with a RemoteMetric.

        When executing the returned MetricPipeline, the inner metric will be computed
        remotely (pre and post processing steps in the MetricPipeline will be computed locally).
        """
        local_inner_metric = metric_pipeline.metric
        metric_pipeline = deepcopy(
            metric_pipeline
        )  # To avoid unintentional changes to the catalog contents
        metric_pipeline.metric = RemoteMetric(
            main_score=local_inner_metric.main_score,
            metric_name=local_inner_metric.artifact_identifier,
            endpoint=remote_metrics_endpoint,
        )
        return metric_pipeline

    def get_metric_url(self) -> str:
        """Return the URL the metric request is posted to: ``<endpoint>/<metric_name>``."""
        return f"{self.endpoint}/{self.metric_name}"

    def process(self, stream: Stream, stream_name: Optional[str] = None) -> Generator:
        """Send the whole stream to the remote metric and yield the scored instances."""
        predictions, references, additional_inputs, instances = self.consume_stream(
            stream
        )
        metric_request = self.create_metric_request(
            predictions, references, additional_inputs
        )
        metric_response = self.get_metric_response(metric_request)
        self.update_instance_scores(instances, metric_response.instances_scores)
        self.set_global_score(instances, metric_response.global_score)
        yield from instances

    @staticmethod
    def create_metric_request(predictions, references, additional_inputs):
        """Build a MetricRequest with one InstanceInput per aligned input triple."""
        instance_inputs = [
            InstanceInput(
                prediction=prediction,
                references=reference,
                additional_inputs=additional_input,
            )
            for prediction, reference, additional_input in zip(
                predictions, references, additional_inputs
            )
        ]
        return MetricRequest(instance_inputs=instance_inputs)

    def get_metric_response(self, metric_request: MetricRequest) -> MetricResponse:
        """POST the request to the remote endpoint and parse the JSON reply.

        Raises:
            requests.HTTPError: if the server answers with an error status.
        """
        import requests

        # Authenticate only when a key was configured. Formatting a missing key
        # would send the literal header value "Bearer None".
        headers = {}
        if self.api_key is not None:
            headers["Authorization"] = f"Bearer {self.api_key}"

        response = requests.post(
            url=self.get_metric_url(),
            json=metric_request.to_dict(),
            headers=headers,
        )
        response.raise_for_status()
        response_json = response.json()
        return MetricResponse(**response_json)

    def disable_confidence_interval_calculation(self):
        """Confidence intervals are always disabled for RemoteMetric.

        No need to do anything.
        """
        pass

    def set_n_resamples(self, n_resample):
        """Since confidence intervals are always disabled for remote metrics, this is a no-op."""
        pass
def validate_subgroup_types(
    subgroup_scores_dict: Dict[str, List],
    control_subgroup_types: List[str],
    comparison_subgroup_types: List[str],
):
    """Validate a dict of subgroup type instance score lists, and subgroup type lists.

    Note that ``subgroup_scores_dict`` is modified in place (its score lists are
    replaced by NaN-free copies) and the same object is returned.

    Args:
        subgroup_scores_dict: dict where keys are subgroup types and values are lists of instance scores.
        control_subgroup_types: list of subgroup types (potential keys of subgroup_scores_dict) that are the control (baseline) group
        comparison_subgroup_types: list of subgroup types (potential keys of subgroup_scores_dict) that are the group
            to be compared to the control group.

    Returns:
        tuple of (subgroup_scores_dict with all NaN scores removed,
        control_subgroup_types without duplicates, comparison_subgroup_types without duplicates).
        Duplicates are dropped keeping the first occurrence, so the order of the lists is preserved.

    Raises:
        AssertionError: if either subgroup type argument is not a list.
    """
    # note: subgroup_scores_dict is already a defaultdict of lists, so don't need to check that keys in control_ and comparison_subgroup_types exist in it
    # remove any NaNs
    subgroup_scores_dict.update(
        {
            subgroup_name: [score for score in score_list if not np.isnan(score)]
            for subgroup_name, score_list in subgroup_scores_dict.items()
        }
    )
    assert isinstance(
        control_subgroup_types, list
    ), "control_subgroup_types must be a list"
    assert isinstance(
        comparison_subgroup_types, list
    ), "comparison_subgroup_types must be a list"
    # make sure each list is unique, so that labels aren't double-counted.
    # dict.fromkeys keeps first-occurrence order; a set would order the names by
    # hash, which differs between runs and changes the order in which callers
    # combine the scores.
    control_subgroup_types = list(dict.fromkeys(control_subgroup_types))
    comparison_subgroup_types = list(dict.fromkeys(comparison_subgroup_types))

    return subgroup_scores_dict, control_subgroup_types, comparison_subgroup_types
def performance_drop_rate(
    subgroup_scores_dict: Dict[str, List],
    control_subgroup_types: List[str],
    comparison_subgroup_types: List[str],
):
    """Percentage decrease of mean performance on test elements relative to that on a baseline (control).

    from https://arxiv.org/pdf/2306.04528.pdf.

    Args:
        subgroup_scores_dict: dict where keys are subgroup types and values are lists of instance scores.
        control_subgroup_types: list of subgroup types (potential keys of subgroup_scores_dict) that are the control (baseline) group
        comparison_subgroup_types: list of subgroup types (potential keys of subgroup_scores_dict) that are the group
            to be compared to the control group.

    Returns:
        numeric PDR metric.
        NaN if either group has no scores (including when its list of subgroup types is empty).
        If the control mean is 0 (percentage change is undefined): 0 when the comparison mean is also 0, NaN otherwise.
        Otherwise 1 - comparison_mean / control_mean.
    """
    (
        subgroup_scores_dict,
        control_subgroup_types,
        comparison_subgroup_types,
    ) = validate_subgroup_types(
        subgroup_scores_dict, control_subgroup_types, comparison_subgroup_types
    )

    # combine all scores from each label (if there are more than 1 in each group) into a list
    group_scores_list = []
    for name_list in (control_subgroup_types, comparison_subgroup_types):
        score_lists = [subgroup_scores_dict[subgroup_name] for subgroup_name in name_list]
        # np.concatenate raises ValueError on an empty sequence, so an empty
        # list of subgroup types is mapped to an empty score array instead
        group_scores_list.append(
            np.concatenate(score_lists) if score_lists else np.array([])
        )

    if any(len(scores) == 0 for scores in group_scores_list):
        # no comparison can be made since there is not at least one score per type
        return np.nan

    control_mean = mean(group_scores_list[0])
    comparison_mean = mean(group_scores_list[1])
    if control_mean == 0:
        # return 0 if comparison is also 0
        if comparison_mean == 0:
            return 0
        return np.nan
    # otherwise, take the percentage change (which may also be 0)
    return 1 - comparison_mean / control_mean
def interpret_effect_size(x: float):
    """Return a string rule-of-thumb interpretation of an effect size value, as defined by Cohen/Sawilowsky.

    See https://en.wikipedia.org/wiki/Effect_size;
    Cohen, Jacob (1988). Statistical Power Analysis for the Behavioral Sciences; and
    Sawilowsky, S (2009). "New effect size rules of thumb". Journal of Modern Applied Statistical Methods. 8 (2): 467-474.

    Value has interpretation of
    - essentially 0 if |x| < 0.01
    - very small if 0.01 <= |x| < 0.2
    - small difference if 0.2 <= |x| < 0.5
    - a medium difference if 0.5 <= |x| < 0.8
    - a large difference if 0.8 <= |x| < 1.2
    - a very large difference if 1.2 <= |x| < 2.0
    - a huge difference if 2.0 <= |x|

    Args:
        x: float effect size value

    Returns:
        string interpretation
    """
    import pandas as pd

    # assign a label according to threshold of the absolute value.
    # right=False makes every bin closed on the left and open on the right,
    # matching the "<=" / "<" pattern listed above.
    # np.inf is used because the np.Inf alias was removed in NumPy 2.0.
    return pd.cut(
        x=[np.abs(x)],
        right=False,
        bins=[-1, 0.01, 0.2, 0.5, 0.8, 1.2, 2.0, np.inf],
        labels=[
            "essentially zero",
            "very small",
            "small",
            "medium",
            "large",
            "very large",
            "huge",
        ],
    )[0]
def normalized_cohens_h(
    subgroup_scores_dict: Dict[str, List],
    control_subgroup_types: List[str],
    comparison_subgroup_types: List[str],
    interpret=False,
):
    """Cohen's h effect size between two proportions, normalized to interval [-1,1].

    Allows for change-type metric when the baseline is 0 (percentage change, and thus PDR, is undefined)
    https://en.wikipedia.org/wiki/Cohen%27s_h

    Cohen's h effect size metric between two proportions p2 and p1 is 2 * (arcsin(sqrt(p2)) - arcsin(sqrt(p1))).
    h is in [-pi, pi], with +/-pi representing the largest increase/decrease (p1=0, p2=1), or (p1=1, p2=0).
    h=0 is no change. Unlike percentage change, h is defined even if the baseline (p1) is 0.
    Assumes the scores are in [0,1], either continuous or binary; hence taking the average of a group of scores yields a proportion.
    Calculates the change in the average of the comparison scores relative to the average of the control scores.
    We rescale this to [-1,1] from [-pi,pi] for clarity, where +- 1 are the most extreme changes, and 0 is no change.

    Interpretation: the original unscaled Cohen's h can be interpreted according to function interpret_effect_size

    Thus, the rule of interpreting the effect of the normalized value is to use the same thresholds divided by pi
        - essentially 0 if |norm h| < 0.0031831
        - very small if 0.0031831 <= |norm h| < 0.06366198
        - small difference if 0.06366198 <= |norm h| < 0.15915494
        - a medium difference if 0.15915494 <= |norm h| < 0.25464791
        - a large difference if 0.25464791 <= |norm h| < 0.38197186
        - a very large difference if 0.38197186 <= |norm h| < 0.63661977
        - a huge difference if 0.63661977 <= |norm h|

    Args:
        subgroup_scores_dict: dict where keys are subgroup types and values are lists of instance scores.
        control_subgroup_types: list of subgroup types (potential keys of subgroup_scores_dict) that are the control (baseline) group
        comparison_subgroup_types: list of subgroup types (potential keys of subgroup_scores_dict) that are the group
            to be compared to the control group.
        interpret: boolean, whether to interpret the significance of the score or not

    Returns:
        float score between -1 and 1, and a string interpretation if interpret=True.
        The score is NaN if either group has no scores (including when its list of subgroup types is empty).

    Raises:
        AssertionError: if any score lies outside [0,1].
    """
    (
        subgroup_scores_dict,
        control_subgroup_types,
        comparison_subgroup_types,
    ) = validate_subgroup_types(
        subgroup_scores_dict, control_subgroup_types, comparison_subgroup_types
    )

    # requires scores to be in [0,1]
    for subgroup_name, score_list in subgroup_scores_dict.items():
        assert all(
            0 <= score <= 1 for score in score_list
        ), f"all {subgroup_name} scores must be in [0,1]"

    # combine all scores from each label (if there are more than 1 in each group) into a list
    group_scores_list = []
    for name_list in (control_subgroup_types, comparison_subgroup_types):
        score_lists = [subgroup_scores_dict[subgroup_name] for subgroup_name in name_list]
        # np.concatenate raises ValueError on an empty sequence, so an empty
        # list of subgroup types is mapped to an empty score array instead
        group_scores_list.append(
            np.concatenate(score_lists) if score_lists else np.array([])
        )

    if any(len(scores) == 0 for scores in group_scores_list):
        # no comparison can be made since there is not at least one score per type
        h, norm_h = np.nan, np.nan
    else:
        control_mean = mean(group_scores_list[0])
        comparison_mean = mean(group_scores_list[1])
        h = 2 * (np.arcsin(np.sqrt(comparison_mean)) - np.arcsin(np.sqrt(control_mean)))
        # h lies in [-pi, pi]; the clip only guards against floating-point overshoot
        norm_h = np.clip(a=h / np.pi, a_min=-1, a_max=1)

    if not interpret:
        return norm_h

    return norm_h, interpret_effect_size(h)
def normalized_hedges_g(
    subgroup_scores_dict: Dict[str, List[float]],
    control_subgroup_types: List[str],
    comparison_subgroup_types: List[str],
    interpret=False,
):
    """Hedge's g effect size between mean of two samples, normalized to interval [-1,1].  Better than Cohen's d for small sample sizes.

    Takes into account the variances within the samples, not just the means.
    g is clipped to [-5, 5] before being divided by 5, so a non-zero mean difference
    with zero pooled variance yields a normalized score of +/-1.

    Args:
        subgroup_scores_dict: dict where keys are subgroup types and values are lists of instance scores.
        control_subgroup_types: list of subgroup types (potential keys of subgroup_scores_dict) that are the control (baseline) group
        comparison_subgroup_types: list of subgroup types (potential keys of subgroup_scores_dict) that are the group
            to be compared to the control group.
        interpret: boolean, whether to interpret the significance of the score or not

    Returns:
        float score between -1 and 1, and a string interpretation if interpret=True.
        The score is NaN if either group has no scores (including when its list of
        subgroup types is empty) or if both groups have a single score.
    """
    (
        subgroup_scores_dict,
        control_subgroup_types,
        comparison_subgroup_types,
    ) = validate_subgroup_types(
        subgroup_scores_dict, control_subgroup_types, comparison_subgroup_types
    )

    # combine all scores from each label (if there are more than 1 in each group) into a list
    group_scores_list = []
    for name_list in (control_subgroup_types, comparison_subgroup_types):
        score_lists = [subgroup_scores_dict[subgroup_name] for subgroup_name in name_list]
        # np.concatenate raises ValueError on an empty sequence, so an empty
        # list of subgroup types is mapped to an empty score array instead
        group_scores_list.append(
            np.concatenate(score_lists) if score_lists else np.array([])
        )

    group_n = [len(scores) for scores in group_scores_list]
    if any(nn == 0 for nn in group_n) or all(nn <= 1 for nn in group_n):
        # if at least one sample size is 0 for one type, no comparison can be made at all
        # if both sample sizes are 1, then the denominator is undefined since divide by n1 + n2 - 2
        # so require at least one sample to have > 1 observation, and both to have >= 1.
        g, norm_g = np.nan, np.nan
    else:
        # otherwise, calculate the variances
        group_mean = [mean(scores) for scores in group_scores_list]
        # sample variance with 1 degree of freedom (denominator n-1); if n=1, return 0 since otherwise throws an error
        group_var = [
            0.0 if nn == 1 else np.var(scores, ddof=1)
            for scores, nn in zip(group_scores_list, group_n)
        ]
        var_total = sum((nn - 1) * vv for vv, nn in zip(group_var, group_n))
        pooled_sd = np.sqrt(var_total / (sum(group_n) - 2))

        max_absolute_value = 5
        gmd = float(group_mean[1] - group_mean[0])

        if gmd == 0:
            # if exactly the same, return 0
            g = 0.0
        elif pooled_sd == 0:
            # pooled_sd is a NumPy scalar, so dividing by it never raises
            # ZeroDivisionError; it yields +/-inf with a RuntimeWarning.
            # Produce the same signed infinity explicitly (no warning); the
            # clip below saturates it at max_absolute_value.
            g = float(np.sign(gmd)) * np.inf
        else:
            g = gmd / pooled_sd

        n = sum(group_n)
        if 3 < n < 50:
            # small sample adjustment see https://www.itl.nist.gov/div898/software/dataplot/refman2/auxillar/hedgeg.htm
            # the multiplier is 0 if n <= 3
            g *= ((n - 3) / (n - 2.25)) * np.sqrt((n - 2) / n)
        # clip it at a very large value so it doesn't become infinite if the variance (denominator) is very small or 0
        g = float(np.clip(a=g, a_min=-1 * max_absolute_value, a_max=max_absolute_value))
        norm_g = g / max_absolute_value

    if not interpret:
        return norm_g
    return norm_g, interpret_effect_size(g)
+def mean_subgroup_score(
+    subgroup_scores_dict: Dict[str, List], subgroup_types: List[str]
+):
+    """Return the mean instance score for a subset (possibly a single type) of variants (not a comparison).
+    Args:
+        subgroup_scores_dict: dict where keys are subgroup types and values are lists of instance scores.
+        subgroup_types: the keys (subgroup types) for which the average will be computed.
+    Returns:
+        float score
+    """
+    subgroup_scores_dict, subgroup_types, _ = validate_subgroup_types(
+        subgroup_scores_dict, subgroup_types, []
+    )
+    # combine all desired subgroup scores
+    score_list = np.concatenate(
+        [subgroup_scores_dict[subgroup_name] for subgroup_name in subgroup_types]
+    )
+    if len(score_list) == 0:
+        # no scores to use
+        return np.nan
+    return mean(score_list)
+# metrics using mean reduction
+class GroupMeanAccuracy(Accuracy):
+    reduction_map = {"group_mean": {"agg_func": ["mean", nan_mean, False]}}
+class FixedGroupMeanAccuracy(Accuracy):
+    # the same as GroupMeanAccuracy, except the groups are fixed and are resampled together
+    reduction_map = {"group_mean": {"agg_func": ["mean", nan_mean, True]}}
+# same as above, now using StringContainment
+class GroupMeanStringContainment(StringContainment):
+    reduction_map = {"group_mean": {"agg_func": ["mean", nan_mean, False]}}
+class FixedGroupMeanStringContainment(StringContainment):
+    # the same as GroupMeanStringContainment, except the groups are fixed and are resampled together
+    reduction_map = {"group_mean": {"agg_func": ["mean", nan_mean, True]}}
+# take only the (fixed) group mean of baseline or other (paraphrases) scores
+class FixedGroupMeanBaselineAccuracy(Accuracy):
+    subgroup_column = "variant_type"
+    # take mean of "original" variants only
+    reduction_map = {
+        "group_mean": {
+            "agg_func": [
+                "mean_baseline",
+                lambda scd: mean_subgroup_score(
+                    subgroup_scores_dict=scd, subgroup_types=["original"]
+                ),
+                True,
+            ],
+        }
+    }
+class FixedGroupMeanParaphraseAccuracy(Accuracy):
+    subgroup_column = "variant_type"
+    # take mean of "paraphrase" variants only
+    reduction_map = {
+        "group_mean": {
+            "agg_func": [
+                "mean_paraphrase",
+                lambda scd: mean_subgroup_score(
+                    subgroup_scores_dict=scd, subgroup_types=["paraphrase"]
+                ),
+                True,
+            ],
+        }
+    }
+# same as above but using StringContainment
+class FixedGroupMeanBaselineStringContainment(StringContainment):
+    subgroup_column = "variant_type"
+    # take mean of "original" variants only
+    reduction_map = {
+        "group_mean": {
+            "agg_func": [
+                "mean_baseline",
+                lambda scd: mean_subgroup_score(
+                    subgroup_scores_dict=scd, subgroup_types=["original"]
+                ),
+                True,
+            ],
+        }
+    }
+class FixedGroupMeanParaphraseStringContainment(StringContainment):
+    subgroup_column = "variant_type"
+    # take mean of "paraphrase" variants only
+    reduction_map = {
+        "group_mean": {
+            "agg_func": [
+                "mean_paraphrase",
+                lambda scd: mean_subgroup_score(
+                    subgroup_scores_dict=scd, subgroup_types=["paraphrase"]
+                ),
+                True,
+            ],
+        }
+    }
+# using PDR (performance drop rate) of "paraphrase" relative to "original" (the two classes below)
+class FixedGroupPDRParaphraseAccuracy(Accuracy):
+    subgroup_column = "variant_type"
+    reduction_map = {
+        "group_mean": {
+            "agg_func": [
+                "pdr_paraphrase",
+                lambda scd: performance_drop_rate(
+                    subgroup_scores_dict=scd,
+                    control_subgroup_types=["original"],
+                    comparison_subgroup_types=["paraphrase"],
+                ),
+                True,
+            ],
+        }
+    }
+class FixedGroupPDRParaphraseStringContainment(StringContainment):
+    subgroup_column = "variant_type"
+    reduction_map = {
+        "group_mean": {
+            "agg_func": [
+                "pdr_paraphrase",
+                lambda scd: performance_drop_rate(
+                    subgroup_scores_dict=scd,
+                    control_subgroup_types=["original"],
+                    comparison_subgroup_types=["paraphrase"],
+                ),
+                True,
+            ],
+        }
+    }
+class GroupMeanTokenOverlap(TokenOverlap):
+    reduction_map = {
+        "group_mean": {
+            "agg_func": ["mean", nan_mean, False],
+            "score_fields": ["f1", "precision", "recall"],
+        }
+    }
+# using Cohen's h for proportions
+class FixedGroupNormCohensHParaphraseAccuracy(Accuracy):
+    subgroup_column = "variant_type"
+    reduction_map = {
+        "group_mean": {
+            "agg_func": [
+                "norm_cohens_h_paraphrase",
+                lambda scd: normalized_cohens_h(
+                    subgroup_scores_dict=scd,
+                    control_subgroup_types=["original"],
+                    comparison_subgroup_types=["paraphrase"],
+                ),
+                True,
+            ],
+        }
+    }
+class FixedGroupNormCohensHParaphraseStringContainment(StringContainment):
+    subgroup_column = "variant_type"
+    reduction_map = {
+        "group_mean": {
+            "agg_func": [
+                "norm_cohens_h_paraphrase",
+                lambda scd: normalized_cohens_h(
+                    subgroup_scores_dict=scd,
+                    control_subgroup_types=["original"],
+                    comparison_subgroup_types=["paraphrase"],
+                ),
+                True,
+            ],
+        }
+    }
+# using Hedges' g (takes into account internal variation in group scores)
+class FixedGroupNormHedgesGParaphraseAccuracy(Accuracy):
+    subgroup_column = "variant_type"
+    reduction_map = {
+        "group_mean": {
+            "agg_func": [
+                "norm_hedges_g_paraphrase",
+                lambda scd: normalized_hedges_g(
+                    subgroup_scores_dict=scd,
+                    control_subgroup_types=["original"],
+                    comparison_subgroup_types=["paraphrase"],
+                ),
+                True,
+            ],
+        }
+    }
+class FixedGroupNormHedgesGParaphraseStringContainment(StringContainment):
+    subgroup_column = "variant_type"
+    reduction_map = {
+        "group_mean": {
+            "agg_func": [
+                "norm_hedges_g_paraphrase",
+                lambda scd: normalized_hedges_g(
+                    subgroup_scores_dict=scd,
+                    control_subgroup_types=["original"],
+                    comparison_subgroup_types=["paraphrase"],
+                ),
+                True,
+            ],
+        }
+    }
+# for above metrics, take absolute value of group score first; this measures variation in either direction
+class FixedGroupAbsvalNormCohensHParaphraseAccuracy(Accuracy):
+    subgroup_column = "variant_type"
+    reduction_map = {
+        "group_mean": {
+            "agg_func": [
+                "absval_norm_cohens_h_paraphrase",
+                lambda scd: np.abs(
+                    normalized_cohens_h(
+                        subgroup_scores_dict=scd,
+                        control_subgroup_types=["original"],
+                        comparison_subgroup_types=["paraphrase"],
+                    )
+                ),
+                True,
+            ],
+        }
+    }
+class FixedGroupAbsvalNormCohensHParaphraseStringContainment(StringContainment):
+    subgroup_column = "variant_type"
+    reduction_map = {
+        "group_mean": {
+            "agg_func": [
+                "absval_norm_cohens_h_paraphrase",
+                lambda scd: np.abs(
+                    normalized_cohens_h(
+                        subgroup_scores_dict=scd,
+                        control_subgroup_types=["original"],
+                        comparison_subgroup_types=["paraphrase"],
+                    )
+                ),
+                True,
+            ],
+        }
+    }
+class FixedGroupAbsvalNormHedgesGParaphraseAccuracy(Accuracy):
+    subgroup_column = "variant_type"
+    reduction_map = {
+        "group_mean": {
+            "agg_func": [
+                "absval_norm_hedges_g_paraphrase",
+                lambda scd: np.abs(
+                    normalized_hedges_g(
+                        subgroup_scores_dict=scd,
+                        control_subgroup_types=["original"],
+                        comparison_subgroup_types=["paraphrase"],
+                    )
+                ),
+                True,
+            ],
+        }
+    }
+class FixedGroupAbsvalNormHedgesGParaphraseStringContainment(StringContainment):
+    subgroup_column = "variant_type"
+    reduction_map = {
+        "group_mean": {
+            "agg_func": [
+                "absval_norm_hedges_g_paraphrase",
+                lambda scd: np.abs(
+                    normalized_hedges_g(
+                        subgroup_scores_dict=scd,
+                        control_subgroup_types=["original"],
+                        comparison_subgroup_types=["paraphrase"],
+                    )
+                ),
+                True,
+            ],
+        }
+    }