Upload dataset.py with huggingface_hub
dataset.py (+5 -56)
@@ -2,14 +2,15 @@ import os
 
 import datasets
 
-from .artifact import Artifact, UnitxtArtifactNotFoundError
+from .api import __file__ as _
 from .artifact import __file__ as _
-from .artifact import fetch_artifact
 from .blocks import __file__ as _
 from .card import __file__ as _
 from .catalog import __file__ as _
 from .collections import __file__ as _
 from .dataclass import __file__ as _
+from .dataset_utils import __file__ as _
+from .dataset_utils import get_dataset_artifact
 from .dict_utils import __file__ as _
 from .file_utils import __file__ as _
 from .formats import __file__ as _
@@ -17,11 +18,11 @@ from .fusion import __file__ as _
 from .generator_utils import __file__ as _
 from .hf_utils import __file__ as _
 from .instructions import __file__ as _
-from .load import __file__ as _
 from .loaders import __file__ as _
 from .logging_utils import __file__ as _
 from .logging_utils import get_logger
 from .metric import __file__ as _
+from .metric_utils import __file__ as _
 from .metrics import __file__ as _
 from .normalizers import __file__ as _
 from .operator import __file__ as _
@@ -30,7 +31,6 @@ from .processors import __file__ as _
 from .random_utils import __file__ as _
 from .recipe import __file__ as _
 from .register import __file__ as _
-from .register import _reset_env_local_catalogs, register_all_artifacts
 from .schema import __file__ as _
 from .split_utils import __file__ as _
 from .splitters import __file__ as _
@@ -47,57 +47,6 @@ from .version import version
 
 logger = get_logger()
 
-__default_recipe__ = "standard_recipe"
-
-
-def fetch(artifact_name):
-    try:
-        artifact, _ = fetch_artifact(artifact_name)
-        return artifact
-    except UnitxtArtifactNotFoundError:
-        return None
-
-
-def parse(query: str):
-    """Parses a query of the form 'key1=value1,key2=value2,...' into a dictionary."""
-    result = {}
-    kvs = query.split(",")
-    if len(kvs) == 0:
-        raise ValueError(
-            'Illegal query: "{query}" should contain at least one assignment of the form: key1=value1,key2=value2'
-        )
-    for kv in kvs:
-        key_val = kv.split("=")
-        if (
-            len(key_val) != 2
-            or len(key_val[0].strip()) == 0
-            or len(key_val[1].strip()) == 0
-        ):
-            raise ValueError(
-                f'Illegal query: "{query}" with wrong assignment "{kv}" should be of the form: key=value.'
-            )
-        key, val = key_val
-        if val.isdigit():
-            result[key] = int(val)
-        elif val.replace(".", "", 1).isdigit():
-            result[key] = float(val)
-        else:
-            result[key] = val
-
-    return result
-
-
-def get_dataset_artifact(dataset_str):
-    _reset_env_local_catalogs()
-    register_all_artifacts()
-    recipe = fetch(dataset_str)
-    if recipe is None:
-        args = parse(dataset_str)
-        if "type" not in args:
-            args["type"] = os.environ.get("UNITXT_DEFAULT_RECIPE", __default_recipe__)
-        recipe = Artifact.from_dict(args)
-    return recipe
-
 
 class Dataset(datasets.GeneratorBasedBuilder):
     """TODO: Short description of my dataset."""
@@ -108,7 +57,7 @@ class Dataset(datasets.GeneratorBasedBuilder):
     def generators(self):
         if not hasattr(self, "_generators") or self._generators is None:
             try:
-                from unitxt.dataset import \
+                from unitxt.dataset_utils import \
                     get_dataset_artifact as get_dataset_artifact_installed
 
                 unitxt_installed = True
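
The helpers removed above now live in the dataset_utils module, which the new import block pulls in via "from .dataset_utils import get_dataset_artifact". A minimal sketch of the resolution behavior they implement, assuming a pip-installed unitxt; the artifact names here are hypothetical:

    from unitxt.dataset_utils import get_dataset_artifact

    # A catalog name resolves directly to the stored artifact
    # ("recipes.my_recipe" is a made-up entry for illustration):
    recipe = get_dataset_artifact("recipes.my_recipe")

    # Anything else is parsed as "key1=value1,key2=value2,..." kwargs:
    # digit values become ints, dotted digits become floats, and a missing
    # "type" falls back to os.environ.get("UNITXT_DEFAULT_RECIPE",
    # "standard_recipe"), per the removed code above.
    recipe = get_dataset_artifact("card=cards.wnli,num_demos=5")

When unitxt is not pip-installed, the generators() hunk above cannot import the package, so the script falls back to the module copies uploaded alongside it (the "from . ..." imports at the top), which is why the installed-package import is probed inside a try block.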
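Because dataset.py is a Hugging Face datasets builder script, the strings that reach get_dataset_artifact() are the config names users pass to load_dataset(). A hedged end-to-end example; the repo id "unitxt/data" and the card value are assumptions for illustration:

    from datasets import load_dataset

    # The second argument is the builder config name; this script forwards
    # it to get_dataset_artifact(), which resolves it as sketched above.
    dataset = load_dataset("unitxt/data", "card=cards.wnli,num_demos=0")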