Spaces:

unitxt
/

metric

Running

App Files Files Community

Elron committed on Feb 7, 2024

Commit

87d48ff

verified ·

1 Parent(s): fe54ce9

Upload loaders.py with huggingface_hub

Browse files

Files changed (1) hide show

loaders.py +33 -14

loaders.py CHANGED Viewed

@@ -60,7 +60,7 @@ class Loader(SourceOperator):
     # loader may ignore this.  In any case, the recipe will limit the number of instances in the returned
     # stream, after load is complete.
     loader_limit: int = None
-    pass
 class LoadHF(Loader):
@@ -71,21 +71,27 @@ class LoadHF(Loader):
     data_files: Optional[
         Union[str, Sequence[str], Mapping[str, Union[str, Sequence[str]]]]
     ] = None
-    streaming: bool = True
     def process(self):
         try:
             with tempfile.TemporaryDirectory() as dir_to_be_deleted:
-                dataset = hf_load_dataset(
-                    self.path,
-                    name=self.name,
-                    data_dir=self.data_dir,
-                    data_files=self.data_files,
-                    streaming=self.streaming,
-                    cache_dir=None if self.streaming else dir_to_be_deleted,
-                    split=self.split,
-                    trust_remote_code=settings.allow_unverified_code,
-                )
             if self.split is not None:
                 dataset = {self.split: dataset}
         except (
@@ -122,15 +128,23 @@ class LoadCSV(Loader):
     files: Dict[str, str]
     chunksize: int = 1000
-    def load_csv(self, file):
         for chunk in pd.read_csv(file, chunksize=self.chunksize):
             for _index, row in chunk.iterrows():
                 yield row.to_dict()
     def process(self):
         return MultiStream(
             {
-                name: Stream(generator=self.load_csv, gen_kwargs={"file": file})
                 for name, file in self.files.items()
             }
         )
@@ -155,6 +169,9 @@ class LoadFromKaggle(Loader):
                 "Please obtain kaggle credentials https://christianjmills.com/posts/kaggle-obtain-api-key-tutorial/ and save them to local ./kaggle.json file"
             )
     def prepare(self):
         super().prepare()
         from opendatasets import download
@@ -246,6 +263,8 @@ class LoadFromIBMCloud(Loader):
         assert (
             self.aws_secret_access_key is not None
         ), f"Please set {self.aws_secret_access_key_env} environmental variable"
     def process(self):
         cos = ibm_boto3.resource(

     # loader may ignore this.  In any case, the recipe will limit the number of instances in the returned
     # stream, after load is complete.
     loader_limit: int = None
+    streaming: bool = False
 class LoadHF(Loader):
     data_files: Optional[
         Union[str, Sequence[str], Mapping[str, Union[str, Sequence[str]]]]
     ] = None
+    streaming: bool = False
     def process(self):
         try:
             with tempfile.TemporaryDirectory() as dir_to_be_deleted:
+                try:
+                    dataset = hf_load_dataset(
+                        self.path,
+                        name=self.name,
+                        data_dir=self.data_dir,
+                        data_files=self.data_files,
+                        streaming=self.streaming,
+                        cache_dir=None if self.streaming else dir_to_be_deleted,
+                        split=self.split,
+                        trust_remote_code=settings.allow_unverified_code,
+                    )
+                except ValueError as e:
+                    if "trust_remote_code" in str(e):
+                        raise ValueError(
+                            f"{self.__class__.__name__} cannot run remote code from huggingface without setting unitxt.settings.allow_unverified_code=True or by setting environment vairable: UNITXT_ALLOW_UNVERIFIED_CODE."
+                        ) from e
             if self.split is not None:
                 dataset = {self.split: dataset}
         except (
     files: Dict[str, str]
     chunksize: int = 1000
+    def stream_csv(self, file):
         for chunk in pd.read_csv(file, chunksize=self.chunksize):
             for _index, row in chunk.iterrows():
                 yield row.to_dict()
     def process(self):
+        if self.streaming:
+            return MultiStream(
+                {
+                    name: Stream(generator=self.stream_csv, gen_kwargs={"file": file})
+                    for name, file in self.files.items()
+                }
+            )
         return MultiStream(
             {
+                name: pd.read_csv(file).to_dict("records")
                 for name, file in self.files.items()
             }
         )
                 "Please obtain kaggle credentials https://christianjmills.com/posts/kaggle-obtain-api-key-tutorial/ and save them to local ./kaggle.json file"
             )
+        if self.streaming:
+            raise NotImplementedError("LoadFromKaggle cannot load with streaming.")
     def prepare(self):
         super().prepare()
         from opendatasets import download
         assert (
             self.aws_secret_access_key is not None
         ), f"Please set {self.aws_secret_access_key_env} environmental variable"
+        if self.streaming:
+            raise NotImplementedError("LoadFromKaggle cannot load with streaming.")
     def process(self):
         cos = ibm_boto3.resource(