Upload splitters.py with huggingface_hub
Browse files- splitters.py +23 -0
splitters.py
CHANGED
|
@@ -29,6 +29,29 @@ class RenameSplits(Splitter):
|
|
| 29 |
|
| 30 |
|
| 31 |
class SplitRandomMix(Splitter):
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 32 |
mix: Dict[str, str]
|
| 33 |
|
| 34 |
def process(self, multi_stream: MultiStream) -> MultiStream:
|
|
|
|
| 29 |
|
| 30 |
|
| 31 |
class SplitRandomMix(Splitter):
|
| 32 |
+
"""Splits a multistream into new streams (splits), whose names, source input streams, and numbers of instances are specified by arg 'mix'.
|
| 33 |
+
|
| 34 |
+
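The keys of arg 'mix' are the names of the new streams; each value is of the form 'name-of-source-stream[percentage-of-source-stream]', or several such terms joined by '+'. The bracketed size may be a percentage (e.g. 50%) or a fraction (e.g. 0.1), and may be omitted to take the whole source stream.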
|
| 35 |
+
Each input instance, of any input stream, is selected exactly once for inclusion in one of the output streams.
|
| 36 |
+
|
| 37 |
+
Examples:
|
| 38 |
+
When processing a multistream made of two streams whose names are 'train' and 'test', by
|
| 39 |
+
SplitRandomMix(mix = { "train": "train[99%]", "validation": "train[1%]", "test": "test" })
|
| 40 |
+
the output is a multistream, whose three streams are named 'train', 'validation', and 'test'.
|
| 41 |
+
Output stream 'train' is made of randomly selected 99% of the instances of input stream 'train',
|
| 42 |
+
output stream 'validation' is made of the remaining 1% of the instances of input stream 'train', and output stream 'test' is made
|
| 43 |
+
of the whole of input stream 'test'.
|
| 44 |
+
|
| 45 |
+
When processing the above input multistream by
|
| 46 |
+
SplitRandomMix(mix = { "train": "train[50%]+test[0.1]", "validation": "train[50%]+test[0.2]", "test": "test[0.7]" })
|
| 47 |
+
the output is a multistream, whose three streams are named 'train', 'validation', and 'test'.
|
| 48 |
+
Output stream 'train' is made of randomly selected 50% of the instances of input stream 'train' + randomly selected
|
| 49 |
+
0.1 (i.e., 10%) of the instances of input stream 'test'.
|
| 50 |
+
Output stream 'validation' is made of the remaining 50% of the instances of input stream 'train' + randomly selected 0.2 (i.e.,
|
| 51 |
+
20%) of the original instances of input stream 'test' that were not selected for output stream 'train',
|
| 52 |
+
and output stream 'test' is made of the remaining instances of input 'test'.
|
| 53 |
+
"""
|
| 54 |
+
|
| 55 |
mix: Dict[str, str]
|
| 56 |
|
| 57 |
def process(self, multi_stream: MultiStream) -> MultiStream:
|