Spaces:

unitxt
/

metric

Running

App Files Files Community

Elron commited on Dec 3, 2023

Commit

212beb8

1 Parent(s): 7f6dcb7

Upload split_utils.py with huggingface_hub

Browse files

Files changed (1) hide show

split_utils.py +34 -32

split_utils.py CHANGED Viewed

@@ -1,4 +1,5 @@
 import itertools
 import re
 from typing import Dict
@@ -8,8 +9,7 @@ from .stream import Stream
 def parse_random_mix_string(input_str):
-    """
-    Parses a string of format "source1[percentage1%]+source2[value2]+..." and returns a dictionary.
     Args:
         input_str (str): A string containing source names and their respective proportions. The format is
@@ -28,23 +28,27 @@ def parse_random_mix_string(input_str):
         >>> parse_random_mix_string("dale[90%]+oren[0.7]+mike")
             {'dale': 0.9, 'oren': 0.7, 'mike': 1.0}
     """
-    if not re.fullmatch(r"(([a-zA-Z]+\[\d*\.?\d*%?\]|[a-zA-Z]+)\+)*([a-zA-Z]+\[\d*\.?\d*%?\]|[a-zA-Z]+)", input_str):
         raise ValueError(f"Invalid input format for split '{input_str}'")
     pattern = re.compile(r"([a-zA-Z]+)(\[\d*\.?\d*%?\])?")
     matches = pattern.findall(input_str)
     return {
-        name: float(value.strip("[]%")) / 100 if "%" in value else (float(value.strip("[]")) if value else 1.0)
         for name, value in matches
     }
 def parse_slices_string(input_str):
-    """
-    Parses a string of format "source1[value1:value2] + source2[value2:] + source3 + ..." and returns a dictionary:
-    {"source1": [(value1,value2)], "source2": [(value2, None)], "source3": [(None,None)]...}
     If a source appears multiple times with different indices, all index pairs are included in the list.
@@ -65,7 +69,6 @@ def parse_slices_string(input_str):
         >>> parse_slices_string("oren[:50]+jake[24:]+test+oren[5:10]")
         {'oren': [(None, 50), (5, 10)], 'jake': [(24, None)], 'test': [(None, None)]}
     """
     result_dict = {}
     # Split the input string into a list of sources
@@ -82,7 +85,9 @@ def parse_slices_string(input_str):
             name = source
             start = end = None
         else:
-            raise ValueError(f'The input string "{input_str}" is not in the correct format.')
         if name not in result_dict:
             result_dict[name] = [(start, end)]
@@ -100,14 +105,12 @@ def slice_stream(stream, start, end):
     if end is not None:
         stream = itertools.islice(stream, end)
-    for item in stream:
-        yield item
     # return stream
 def slice_streams(input_streams, mapping):
-    """
-    Slices multiple input streams according to a mapping and chains the results together.
     Args:
         input_streams (dict): A dictionary where the keys are the names of the input streams
@@ -129,12 +132,13 @@ def slice_streams(input_streams, mapping):
         >>> slice_streams(old_streams, mapping)
         {"new_train": [1, 2, 3, 4, 5, 8, 9], "new_test": [12, 13, 14]}
     """
     new_streams = {}
     for new_stream, sources in mapping.items():
         def generator(new_stream, sources):
             for old_stream, slices in sources.items():
                 old_stream_content = input_streams[old_stream]
                 for start, end in slices:
                     yield from slice_stream(old_stream_content, start, end)
@@ -147,8 +151,7 @@ def slice_streams(input_streams, mapping):
 def build_stream_routing(mapping):
-    """
-    Builds the stream mapping dictionary based on the provided mapping.
     The stream mapping dictionary represents the mapping of old streams to new streams
     and their respective probabilities. It ensures that the probabilities for each old stream
@@ -176,16 +179,15 @@ def build_stream_routing(mapping):
                 }
             }
             stream_mapping = build_stream_mapping(mapping)
-            print(stream_mapping)
             # Output: {'my_old_stream1': (['my_new_stream', 'my_new_stream2'], [0.6, 0.4]),
             #          'my_old_stream2': (['my_new_stream', 'my_new_stream2'], [0.2, 0.8])}
     """
     stream_mapping = {}
     # Calculate total weight for each old stream
     total_weights = {}
-    for new_stream, old_streams in mapping.items():
         for old_stream, weight in old_streams.items():
             if old_stream not in total_weights:
                 total_weights[old_stream] = weight
@@ -203,13 +205,12 @@ def build_stream_routing(mapping):
             if total_weights[old_stream] < 1:
                 stream_mapping[old_stream][None] = 1 - total_weights[old_stream]
-    stream_mapping = {k: (list(v.keys()), list(v.values())) for k, v in stream_mapping.items()}
-    return stream_mapping
 def rename_split(input_streams: Dict[str, Stream], mapping: Dict[str, str]):
-    """
-    Renames the streams
     Args:
         input_streams (dict): A dictionary containing the input streams, where each key is
                               the name of the stream and the value is an iterable or generator
@@ -219,11 +220,14 @@ def rename_split(input_streams: Dict[str, Stream], mapping: Dict[str, str]):
     Returns:
         dict: A dictionary containing the generated new streams, where each key is the name
-              of the new stream and the value is a generator representing the stream."""
     return {mapping.get(key, key): val for key, val in input_streams.items()}
-def random_mix_generator(new_stream_name, new_stream_sources, stream_routing, input_streams):
     for old_stream_name in new_stream_sources:
         optinal_streams, weights = stream_routing[old_stream_name]
         with nested_seed(old_stream_name) as rand:
@@ -237,8 +241,7 @@ def random_mix_generator(new_stream_name, new_stream_sources, stream_routing, in
 def random_mix_streams(input_streams, mapping):
-    """
-    Creates new streams based on the provided input streams and mapping.
     The create_streams function generates new streams by selectively including items from
     the old streams based on the specified mapping. Each item will be included in at most
@@ -273,11 +276,10 @@ def random_mix_streams(input_streams, mapping):
             }
             new_streams = create_streams(input_streams, mapping)
             for new_stream_name, new_stream in new_streams.items():
-                print(f"{new_stream_name}:")
                 for _, item in zip(range(10), new_stream):
-                    print(item)
     """
     new_streams = {}
     # Build stream routing
@@ -300,4 +302,4 @@ def random_mix_streams(input_streams, mapping):
 if __name__ == "__main__":
-    print(parse_random_mix_string("dale[90%]+oren[0.7]+mike"))

 import itertools
+import logging
 import re
 from typing import Dict
 def parse_random_mix_string(input_str):
+    """Parses a string of format "source1[percentage1%]+source2[value2]+..." and returns a dictionary.
     Args:
         input_str (str): A string containing source names and their respective proportions. The format is
         >>> parse_random_mix_string("dale[90%]+oren[0.7]+mike")
             {'dale': 0.9, 'oren': 0.7, 'mike': 1.0}
     """
+    if not re.fullmatch(
+        r"(([a-zA-Z]+\[\d*\.?\d*%?\]|[a-zA-Z]+)\+)*([a-zA-Z]+\[\d*\.?\d*%?\]|[a-zA-Z]+)",
+        input_str,
+    ):
         raise ValueError(f"Invalid input format for split '{input_str}'")
     pattern = re.compile(r"([a-zA-Z]+)(\[\d*\.?\d*%?\])?")
     matches = pattern.findall(input_str)
     return {
+        name: float(value.strip("[]%")) / 100
+        if "%" in value
+        else (float(value.strip("[]")) if value else 1.0)
         for name, value in matches
     }
 def parse_slices_string(input_str):
+    """Parses a string of format "source1[value1:value2] + source2[value2:] + source3 + ..." and returns a dictionary.
+    {"source1": [(value1,value2)], "source2": [(value2, None)], "source3": [(None,None)]...}.
     If a source appears multiple times with different indices, all index pairs are included in the list.
         >>> parse_slices_string("oren[:50]+jake[24:]+test+oren[5:10]")
         {'oren': [(None, 50), (5, 10)], 'jake': [(24, None)], 'test': [(None, None)]}
     """
     result_dict = {}
     # Split the input string into a list of sources
             name = source
             start = end = None
         else:
+            raise ValueError(
+                f'The input string "{input_str}" is not in the correct format.'
+            )
         if name not in result_dict:
             result_dict[name] = [(start, end)]
     if end is not None:
         stream = itertools.islice(stream, end)
+    yield from stream
     # return stream
 def slice_streams(input_streams, mapping):
+    """Slices multiple input streams according to a mapping and chains the results together.
     Args:
         input_streams (dict): A dictionary where the keys are the names of the input streams
         >>> slice_streams(old_streams, mapping)
         {"new_train": [1, 2, 3, 4, 5, 8, 9], "new_test": [12, 13, 14]}
     """
     new_streams = {}
     for new_stream, sources in mapping.items():
         def generator(new_stream, sources):
             for old_stream, slices in sources.items():
+                if old_stream not in input_streams:
+                    raise ValueError(f"'{old_stream}' is not available in input stream")
                 old_stream_content = input_streams[old_stream]
                 for start, end in slices:
                     yield from slice_stream(old_stream_content, start, end)
 def build_stream_routing(mapping):
+    """Builds the stream mapping dictionary based on the provided mapping.
     The stream mapping dictionary represents the mapping of old streams to new streams
     and their respective probabilities. It ensures that the probabilities for each old stream
                 }
             }
             stream_mapping = build_stream_mapping(mapping)
+            logging.info(stream_mapping)
             # Output: {'my_old_stream1': (['my_new_stream', 'my_new_stream2'], [0.6, 0.4]),
             #          'my_old_stream2': (['my_new_stream', 'my_new_stream2'], [0.2, 0.8])}
     """
     stream_mapping = {}
     # Calculate total weight for each old stream
     total_weights = {}
+    for _new_stream, old_streams in mapping.items():
         for old_stream, weight in old_streams.items():
             if old_stream not in total_weights:
                 total_weights[old_stream] = weight
             if total_weights[old_stream] < 1:
                 stream_mapping[old_stream][None] = 1 - total_weights[old_stream]
+    return {k: (list(v.keys()), list(v.values())) for k, v in stream_mapping.items()}
 def rename_split(input_streams: Dict[str, Stream], mapping: Dict[str, str]):
+    """Renames the streams.
     Args:
         input_streams (dict): A dictionary containing the input streams, where each key is
                               the name of the stream and the value is an iterable or generator
     Returns:
         dict: A dictionary containing the generated new streams, where each key is the name
+    of the new stream and the value is a generator representing the stream.
+    """
     return {mapping.get(key, key): val for key, val in input_streams.items()}
+def random_mix_generator(
+    new_stream_name, new_stream_sources, stream_routing, input_streams
+):
     for old_stream_name in new_stream_sources:
         optinal_streams, weights = stream_routing[old_stream_name]
         with nested_seed(old_stream_name) as rand:
 def random_mix_streams(input_streams, mapping):
+    """Creates new streams based on the provided input streams and mapping.
     The create_streams function generates new streams by selectively including items from
     the old streams based on the specified mapping. Each item will be included in at most
             }
             new_streams = create_streams(input_streams, mapping)
             for new_stream_name, new_stream in new_streams.items():
+                logging.info(f"{new_stream_name}:")
                 for _, item in zip(range(10), new_stream):
+                    logging.info(item)
     """
     new_streams = {}
     # Build stream routing
 if __name__ == "__main__":
+    logging.info(parse_random_mix_string("dale[90%]+oren[0.7]+mike"))