Spaces:

unitxt
/

metric

Running

App Files Files Community

Elron committed on Jan 11, 2024

Commit

80500e3

verified ·

1 Parent(s): 5de16c3

Upload splitters.py with huggingface_hub

Browse files

Files changed (1) hide show

splitters.py +57 -12

splitters.py CHANGED Viewed

@@ -1,10 +1,11 @@
 import itertools
 from abc import abstractmethod
 from typing import Dict, List
 from .artifact import Artifact
 from .operator import InstanceOperatorWithMultiStreamAccess, MultiStreamOperator
-from .random_utils import get_random
 from .split_utils import (
     parse_random_mix_string,
     parse_slices_string,
@@ -82,6 +83,7 @@ class SliceSplit(Splitter):
 class Sampler(Artifact):
     sample_size: int = None
     def prepare(self):
         super().prepare()
@@ -95,6 +97,11 @@ class Sampler(Artifact):
             size = int(size)
         self.sample_size = size
     @abstractmethod
     def sample(
         self, instances_pool: List[Dict[str, object]]
@@ -107,22 +114,52 @@ class RandomSampler(Sampler):
         self, instances_pool: List[Dict[str, object]]
     ) -> List[Dict[str, object]]:
         instances_pool = list(instances_pool)
-        return get_random().sample(instances_pool, self.sample_size)
 class DiverseLabelsSampler(Sampler):
     choices: str = "choices"
     def prepare(self):
         super().prepare()
-        self.labels = None
     def examplar_repr(self, examplar):
         if "inputs" not in examplar:
             raise ValueError(f"'inputs' field is missing from '{examplar}'.")
         inputs = examplar["inputs"]
         if self.choices not in inputs:
-            raise ValueError(f"{self.choices} field is missing from '{inputs}'.")
         choices = inputs[self.choices]
         if not isinstance(choices, list):
             raise ValueError(
@@ -131,7 +168,11 @@ class DiverseLabelsSampler(Sampler):
         if "outputs" not in examplar:
             raise ValueError(f"'outputs' field is missing from '{examplar}'.")
-        examplar_outputs = next(iter(examplar["outputs"].values()))
         if not isinstance(examplar_outputs, list):
             raise ValueError(
                 f"Unexpected examplar_outputs value '{examplar_outputs}'. Expected a list."
@@ -151,19 +192,23 @@ class DiverseLabelsSampler(Sampler):
     def sample(
         self, instances_pool: List[Dict[str, object]]
     ) -> List[Dict[str, object]]:
-        if self.labels is None:
-            self.labels = self.divide_by_repr(instances_pool)
-        all_labels = list(self.labels.keys())
-        get_random().shuffle(all_labels)
         from collections import Counter
         total_allocated = 0
         allocations = Counter()
         while total_allocated < self.sample_size:
             for label in all_labels:
                 if total_allocated < self.sample_size:
-                    if len(self.labels[label]) - allocations[label] > 0:
                         allocations[label] += 1
                         total_allocated += 1
                 else:
@@ -171,10 +216,10 @@ class DiverseLabelsSampler(Sampler):
         result = []
         for label, allocation in allocations.items():
-            sample = get_random().sample(self.labels[label], allocation)
             result.extend(sample)
-        get_random().shuffle(result)
         return result

 import itertools
 from abc import abstractmethod
+from random import Random
 from typing import Dict, List
 from .artifact import Artifact
 from .operator import InstanceOperatorWithMultiStreamAccess, MultiStreamOperator
+from .random_utils import new_random_generator
 from .split_utils import (
     parse_random_mix_string,
     parse_slices_string,
 class Sampler(Artifact):
     sample_size: int = None
+    random_generator: Random = new_random_generator(sub_seed="Sampler")
     def prepare(self):
         super().prepare()
             size = int(size)
         self.sample_size = size
+    def init_new_random_generator(self):
+        self.random_generator = new_random_generator(
+            sub_seed="init_new_random_generator"
+        )
     @abstractmethod
     def sample(
         self, instances_pool: List[Dict[str, object]]
         self, instances_pool: List[Dict[str, object]]
     ) -> List[Dict[str, object]]:
         instances_pool = list(instances_pool)
+        return self.random_generator.sample(instances_pool, self.sample_size)
 class DiverseLabelsSampler(Sampler):
+    """Selects a balanced sample of instances based on an output field.
+    (used for selecting demonstrations for in-context learning)
+    The field must contain a list of values, e.g. ['dog'], ['cat'], ['dog','cat','cow'].
+    The balancing is done such that each value or combination of values
+    appears as equally as possible in the samples.
+    The `choices` param is required and determines which values should be considered.
+    Example:
+        If choices is ['dog','cat'], then the following combinations will be considered:
+        ['']
+        ['cat']
+        ['dog']
+        ['dog','cat']
+        If the instance contains a value not in the 'choices' param, it is ignored. For example,
+        if choices is ['dog','cat'] and the instance field is ['dog','cat','cow'], then 'cow' is ignored
+        and the instance is considered as ['dog','cat'].
+    Args:
+        sample_size - number of samples to extract
+        choices - name of input field that contains the list of values to balance on
+        labels - name of output field with labels that must be balanced
+    """
     choices: str = "choices"
+    labels: str = "labels"
     def prepare(self):
         super().prepare()
+        self.labels_cache = None
     def examplar_repr(self, examplar):
         if "inputs" not in examplar:
             raise ValueError(f"'inputs' field is missing from '{examplar}'.")
         inputs = examplar["inputs"]
         if self.choices not in inputs:
+            raise ValueError(f"'{self.choices}' field is missing from '{inputs}'.")
         choices = inputs[self.choices]
         if not isinstance(choices, list):
             raise ValueError(
         if "outputs" not in examplar:
             raise ValueError(f"'outputs' field is missing from '{examplar}'.")
+        outputs = examplar["outputs"]
+        if self.labels not in outputs:
+            raise ValueError(f"'{self.labels}' field is missing from '{outputs}'.")
+        examplar_outputs = examplar["outputs"][self.labels]
         if not isinstance(examplar_outputs, list):
             raise ValueError(
                 f"Unexpected examplar_outputs value '{examplar_outputs}'. Expected a list."
     def sample(
         self, instances_pool: List[Dict[str, object]]
     ) -> List[Dict[str, object]]:
+        if self.labels_cache is None:
+            self.labels_cache = self.divide_by_repr(instances_pool)
+        all_labels = list(self.labels_cache.keys())
+        self.random_generator.shuffle(all_labels)
         from collections import Counter
+        if self.sample_size > len(instances_pool):
+            raise ValueError(
+                f"Request sample size {self.sample_size} is greater than number of instances {len(instances_pool)}"
+            )
         total_allocated = 0
         allocations = Counter()
         while total_allocated < self.sample_size:
             for label in all_labels:
                 if total_allocated < self.sample_size:
+                    if len(self.labels_cache[label]) - allocations[label] > 0:
                         allocations[label] += 1
                         total_allocated += 1
                 else:
         result = []
         for label, allocation in allocations.items():
+            sample = self.random_generator.sample(self.labels_cache[label], allocation)
             result.extend(sample)
+        self.random_generator.shuffle(result)
         return result