Threaded MultipackDistributedDataloader with prefetched samples (#759)
* Multithreading implementation [WIP]
* Added benchmarking
* 35% increased throughput
* Memory pinning
* Start threads in init
* Correct print of samples
* Sleep if queue is full
* Remove pin_memory (worse)
* Simplify logic to one thread
* Remove benchmark
* Use deque for constant speed
* Formatting
* Formatting
* Formatting
* Formatting
* Rollback to use queue
* Fix multi-epoch training
* Add num epochs arg
* Start thread in __iter__
* Formatting
* Use is_alive correctly
* Simplify loading thread
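
In outline, the dataloader change is a single producer/consumer pair: a daemon thread packs samples and pushes them into a bounded Queue, and __iter__ drains the queue until it sees a None sentinel marking the end of the final epoch. A minimal, self-contained sketch of that pattern (illustrative names, not the axolotl classes):

import time
from queue import Queue
from threading import Thread


def prefetched_iter(generate_batches, num_epochs=1, prefetch_max=1000):
    """Illustrative sketch: prefetch batches from a background thread."""
    queue = Queue(maxsize=prefetch_max)

    def worker():
        for _ in range(num_epochs):
            for batch in generate_batches():
                # Back off while the queue is full rather than blocking in put().
                while queue.full():
                    time.sleep(1)
                queue.put(batch)
        # None sentinel: tells the consumer that all epochs have been produced.
        queue.put(None)

    Thread(target=worker, daemon=True).start()

    while True:
        item = queue.get()
        if item is None:
            break
        yield item


# Hypothetical usage with a toy batch source:
# for batch in prefetched_iter(lambda: iter(range(10)), num_epochs=2):
#     train_step(batch)

The bounded queue caps how many prefetched samples sit in memory, and the sleep-based backoff mirrors the "Sleep if queue is full" commit above.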
src/axolotl/core/trainer_builder.py

@@ -111,7 +111,8 @@ class AxolotlTrainer(Trainer):
 
     args = None  # type: AxolotlTrainingArguments
 
-    def __init__(self, *args, bench_data_collator=None, **kwargs):
+    def __init__(self, *args, num_epochs=1, bench_data_collator=None, **kwargs):
+        self.num_epochs = num_epochs
         self.bench_data_collator = bench_data_collator
         super().__init__(*args, **kwargs)
 
@@ -182,6 +183,7 @@ class AxolotlTrainer(Trainer):
                     packing_efficiency_estimate=self.args.sample_packing_efficiency,
                     sample_packing_seq_len_multiplier=self.args.sample_packing_seq_len_multiplier,
                     device_count=int(os.environ.get("WORLD_SIZE", 1)),
+                    num_epochs=self.num_epochs,
                 )
             )
         return super().get_train_dataloader()
@@ -205,6 +207,7 @@ class AxolotlTrainer(Trainer):
                     packing_efficiency_estimate=self.args.sample_packing_efficiency,
                     sample_packing_seq_len_multiplier=self.args.eval_batch_size,
                     device_count=int(os.environ.get("WORLD_SIZE", 1)),
+                    num_epochs=self.num_epochs,
                 )
             )
         return super().get_eval_dataloader(eval_dataset)
@@ -680,6 +683,7 @@ class HFCausalTrainerBuilder(TrainerBuilderBase):
                 **data_collator_kwargs,
             ),
             callbacks=self.get_callbacks(),
+            num_epochs=self.cfg.num_epochs,
             **trainer_kwargs,
         )
         trainer = self.hook_post_create_trainer(trainer)
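
Taken together, these hunks thread the epoch count from the user config into the dataloader: HFCausalTrainerBuilder passes num_epochs=self.cfg.num_epochs when constructing the trainer, AxolotlTrainer.__init__ stores it, and both get_train_dataloader and get_eval_dataloader forward it to MultipackDistributedDataloader.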
src/axolotl/utils/dataloader.py

@@ -3,6 +3,9 @@ import hashlib
 import itertools
 import logging
 import math
+import time
+from queue import Queue
+from threading import Thread
 from typing import Any, Callable, List, Union
 
 import numba
@@ -149,6 +152,8 @@ class MultipackDistributedDataloader:
         packing_efficiency_estimate: float = 1.0,
         sample_packing_seq_len_multiplier: int = 1,
         device_count: int = 1,
+        prefetch_max: int = 1000,
+        num_epochs: int = 1,
     ):
         # Dataset
         self.dataset = dataset
@@ -167,6 +172,7 @@ class MultipackDistributedDataloader:
         self.seq_max_length = seq_max_length
         self.batch_max_length = batch_size * seq_max_length
         self.collate_fn = collate_fn
+        self.num_epochs = num_epochs
 
         self.num_replicas = 1
         self.rank = 0
@@ -177,6 +183,44 @@ class MultipackDistributedDataloader:
         self.packing_efficiency_estimate = packing_efficiency_estimate or 1.0
         self.device_count = device_count
 
+        # maxsize is maximum number of samples in queue
+        self.prefetch_max = prefetch_max
+        self.queue: Queue = Queue(maxsize=prefetch_max)
+        self.thread = None
+
+    def _worker(self):
+        LOG.info(
+            f"[WORKER] Epochs: {self.num_epochs}, Samples: {self.len_w_stats()*self.batch_size}"
+        )
+        for epoch in range(self.num_epochs):
+            for sample in self._internal_batch_generator():
+                while True:
+                    if self.queue.full():
+                        time.sleep(1)
+                    else:
+                        break
+                self.queue.put(sample)
+
+        # stop the queue when epoch is done
+        self.queue.put(None)
+
+    def __iter__(self):
+        if hasattr(self.sampler, "set_epoch"):
+            new_epoch = self.sampler.epoch + 1
+            self.sampler.set_epoch(new_epoch)
+            LOG.info(f"calling sampler.set_epoch({new_epoch})")
+
+        if self.thread is None:
+            self.thread = Thread(target=self._worker, daemon=True)
+            self.thread.start()
+
+        while True:
+            item = self.queue.get()
+
+            if item is None:
+                break
+            yield item
+
     def generate_batches(self, set_stats=False):
         LOG.info("generating packed batches")
         if self.sampler:
@@ -206,11 +250,7 @@ class MultipackDistributedDataloader:
 
         return batches, totseqs
 
-    def __iter__(self):
-        if hasattr(self.sampler, "set_epoch"):
-            new_epoch = self.sampler.epoch + 1
-            self.sampler.set_epoch(new_epoch)
-            LOG.info(f"calling sampler.set_epoch({new_epoch})")
+    def _internal_batch_generator(self):
         all_batches, _ = self.generate_batches(set_stats=True)
         features = self.dataset.features.keys()
         len_remaining = self._len_est()
src/axolotl/utils/trainer.py

@@ -216,6 +216,7 @@ def calculate_total_num_steps(cfg, train_dataset, tokenizer):
                 packing_efficiency_estimate=cfg.sample_packing_eff_est,
                 sample_packing_seq_len_multiplier=cfg.micro_batch_size,
                 device_count=int(os.environ.get("WORLD_SIZE", 1)),
+                num_epochs=cfg.num_epochs,
             )
             data_loader_len = data_loader.len_w_stats()
             actual_eff = data_loader.efficiency()