feat: log epoch + check params
- dev/seq2seq/do_big_run.sh +4 -4
- dev/seq2seq/do_small_run.sh +3 -3
- dev/seq2seq/run_seq2seq_flax.py +21 -30
dev/seq2seq/do_big_run.sh
CHANGED
@@ -1,16 +1,16 @@
 python run_seq2seq_flax.py \
-    --max_source_length 128 \
     --dataset_repo_or_path dalle-mini/encoded \
     --train_file **/train/*/*.jsonl \
     --validation_file **/valid/*/*.jsonl \
+    --len_train 42684248 \
+    --len_eval 34328 \
     --streaming \
-    --len_train … \
-    --len_eval 100 \
+    --normalize_text \
     --output_dir output \
     --per_device_train_batch_size 56 \
     --per_device_eval_batch_size 56 \
     --preprocessing_num_workers 80 \
-    --warmup_steps … \
+    --warmup_steps 500 \
     --gradient_accumulation_steps 8 \
     --do_train \
     --do_eval \
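Both runs pass `--streaming`, under which `datasets` yields an iterable with no `len()`, so the script cannot infer how many optimizer steps make up an epoch; the new `--len_train`/`--len_eval` flags supply those counts explicitly (and `run_seq2seq_flax.py` below now refuses to stream without them). A minimal sketch of the arithmetic these lengths enable; the helper name and the 8-device count are illustrative assumptions, not the script's code:

```python
# Sketch: deriving steps per epoch from an explicit dataset length.
# steps_per_epoch and n_devices=8 are assumptions for illustration.
import math

def steps_per_epoch(len_train: int, per_device_batch: int,
                    n_devices: int, grad_accum: int) -> int:
    # One optimizer step consumes this many samples across all devices:
    samples_per_step = per_device_batch * n_devices * grad_accum
    return math.ceil(len_train / samples_per_step)

# With the values from do_big_run.sh, assuming 8 devices (a TPU v3-8):
print(steps_per_epoch(42_684_248, 56, 8, 8))  # -> 11910
```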
dev/seq2seq/do_small_run.sh
CHANGED
@@ -2,9 +2,9 @@ python run_seq2seq_flax.py \
     --dataset_repo_or_path dalle-mini/encoded \
     --train_file **/train/*/*.jsonl \
     --validation_file **/valid/*/*.jsonl \
+    --len_train 42684248 \
+    --len_eval 34328 \
     --streaming \
-    --len_train 1000000 \
-    --len_eval 1000 \
     --output_dir output \
     --per_device_train_batch_size 56 \
     --per_device_eval_batch_size 56 \
@@ -15,5 +15,5 @@ python run_seq2seq_flax.py \
     --do_eval \
     --adafactor \
     --num_train_epochs 1 \
-    --max_train_samples … \
+    --max_train_samples 10000 \
     --learning_rate 0.005
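The small run keeps `--max_train_samples 10000` as a quick smoke test. Since a streamed dataset cannot be sliced, a cap like this is typically applied with `IterableDataset.take`; a hedged sketch of that pattern (the repo path and glob mirror the flags above, but this is not the script's own loading code):

```python
# Sketch: capping a streamed dataset, in the spirit of --max_train_samples.
# Illustrative only; the script's actual loading code is not in this diff.
from datasets import load_dataset

train = load_dataset(
    "dalle-mini/encoded",
    data_files={"train": "**/train/*/*.jsonl"},
    split="train",
    streaming=True,
)
capped = train.take(10_000)  # lazily yields only the first 10,000 examples
```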
dev/seq2seq/run_seq2seq_flax.py
CHANGED
@@ -138,16 +138,6 @@ class DataTrainingArguments:
     Arguments pertaining to what data we are going to input our model for training and eval.
     """

-    dataset_name: Optional[str] = field(
-        default=None,
-        metadata={"help": "The name of the dataset to use (via the datasets library)."},
-    )
-    dataset_config_name: Optional[str] = field(
-        default=None,
-        metadata={
-            "help": "The configuration name of the dataset to use (via the datasets library)."
-        },
-    )
     text_column: Optional[str] = field(
         default="caption",
         metadata={
@@ -260,14 +250,10 @@ class DataTrainingArguments:
     )

     def __post_init__(self):
-        if (
-            self.dataset_name is None
-            and self.train_file is None
-            and self.validation_file is None
-        ):
-            raise ValueError(
-                "Need either a dataset name or a training/validation file."
-            )
+        if self.dataset_repo_or_path is None:
+            raise ValueError("Need a dataset repository or path.")
+        if self.train_file is None or self.validation_file is None:
+            raise ValueError("Need training/validation file.")
         else:
             if self.train_file is not None:
                 extension = self.train_file.split(".")[-1]
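With `dataset_name` gone, `__post_init__` can fail fast on exactly the two inputs the script supports. A self-contained sketch of the same pattern, with field defaults invented for brevity:

```python
# Sketch: fail-fast validation in __post_init__, mirroring the checks above.
from dataclasses import dataclass
from typing import Optional

@dataclass
class DataArgs:
    dataset_repo_or_path: Optional[str] = None
    train_file: Optional[str] = None
    validation_file: Optional[str] = None

    def __post_init__(self):
        if self.dataset_repo_or_path is None:
            raise ValueError("Need a dataset repository or path.")
        if self.train_file is None or self.validation_file is None:
            raise ValueError("Need training/validation file.")

try:
    DataArgs(dataset_repo_or_path="dalle-mini/encoded")
except ValueError as err:
    print(err)  # Need training/validation file.
```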
@@ -287,6 +273,10 @@ class DataTrainingArguments:
             ], "`validation_file` should be a tsv, csv or json file."
         if self.val_max_target_length is None:
             self.val_max_target_length = self.max_target_length
+        if self.streaming and (self.len_train is None or self.len_eval is None):
+            raise ValueError(
+                "Streaming requires providing length of training and validation datasets"
+            )


 class TrainState(train_state.TrainState):
@@ -467,18 +457,6 @@ def main():
             "Use --overwrite_output_dir to overcome."
         )

-    # Set up wandb run
-    wandb.init(
-        entity="dalle-mini",
-        project="dalle-mini",
-        job_type="Seq2Seq",
-        config=parser.parse_args(),
-    )
-
-    # set default x-axis as 'train/step'
-    wandb.define_metric("train/step")
-    wandb.define_metric("*", step_metric="train/step")
-
     # Make one log on every process with the configuration for debugging.
     pylogging.basicConfig(
         format="%(asctime)s - %(levelname)s - %(name)s - %(message)s",
@@ -528,6 +506,18 @@ def main():

         return step, optimizer_step, opt_state

+    # Set up wandb run
+    wandb.init(
+        entity="dalle-mini",
+        project="dalle-mini",
+        job_type="Seq2Seq",
+        config=parser.parse_args(),
+    )
+
+    # set default x-axis as 'train/step'
+    wandb.define_metric("train/step")
+    wandb.define_metric("*", step_metric="train/step")
+
     if model_args.from_checkpoint is not None:
         artifact = wandb.run.use_artifact(model_args.from_checkpoint)
         artifact_dir = artifact.download()
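The `wandb.init` block moves after the checkpoint-restore helper but still runs before `wandb.run.use_artifact` needs an active run. The `define_metric` pair sets `train/step` as the x-axis for every logged series, so charts line up on training steps rather than W&B's internal step counter. A short sketch of the effect (the loss key is illustrative):

```python
# Sketch: charting all metrics against a custom step metric in W&B.
import wandb

wandb.init(entity="dalle-mini", project="dalle-mini", job_type="Seq2Seq")
wandb.define_metric("train/step")                    # declare the x-axis series
wandb.define_metric("*", step_metric="train/step")   # plot everything against it

# Any log that includes train/step positions its other keys on that axis:
wandb.log({"train/step": 100, "train/loss": 2.31})
```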
@@ -1006,6 +996,7 @@ def main():

     for epoch in epochs:
         # ======================== Training ================================
+        wandb_log({"train/epoch": epoch}, step=global_step)

         # Create sampling rng
         rng, input_rng = jax.random.split(rng)
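`wandb_log` is the script's own wrapper around `wandb.log`; its definition sits outside this diff. Logging `train/epoch` keyed to `global_step` at the top of each epoch marks epoch boundaries on the shared step axis. A hedged sketch of what such a wrapper plausibly looks like, given the `define_metric` setup above; the signature and body are assumptions:

```python
# Sketch of a wandb_log-style helper (assumed; real definition not in this diff).
# It attaches the step under the declared step metric so charts stay aligned.
from typing import Optional

import wandb

def wandb_log(metrics: dict, step: Optional[int] = None,
              prefix: Optional[str] = None):
    if prefix is not None:
        metrics = {f"{prefix}/{k}": v for k, v in metrics.items()}
    if step is not None:
        metrics = {**metrics, "train/step": step}
    wandb.log(metrics)

# As called above at the start of each epoch:
# wandb_log({"train/epoch": epoch}, step=global_step)
```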