Merge pull request #16 from borisdayma/feat-log_model
- requirements.txt +1 -1
- seq2seq/run_seq2seq_flax.py +34 -11
requirements.txt
CHANGED

@@ -9,4 +9,4 @@ flax
 jupyter
 # for logging
 tensorboard
-
+tensorflow
seq2seq/run_seq2seq_flax.py
CHANGED

@@ -199,7 +199,7 @@ class DataTrainingArguments:
         },
     )
     preprocessing_num_workers: Optional[int] = field(
-        default=None,
+        default=80,  # ensure we have the same datasets cached data and avoid using too much space
         metadata={"help": "The number of processes to use for the preprocessing."},
     )
     source_prefix: Optional[str] = field(
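Pinning the worker count matters because `datasets` splits `Dataset.map` work into `num_proc` shards, and for disk-backed datasets each shard gets its own cache file, so a different worker count on another machine yields a different cache layout. A minimal, self-contained sketch of the idiom (toy in-memory dataset; `preprocess_function` is a hypothetical stand-in for the script's tokenization step):

from datasets import Dataset

dataset = Dataset.from_dict({"text": ["a", "bb", "ccc", "dddd"]})

def preprocess_function(batch):
    # hypothetical stand-in for tokenization
    return {"length": [len(t) for t in batch["text"]]}

# datasets shards the work (and, for disk-backed datasets, the cache files)
# by num_proc, so a stable value keeps preprocessed caches reproducible.
tokenized = dataset.map(preprocess_function, batched=True, num_proc=2)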
@@ -225,6 +225,9 @@ class DataTrainingArguments:
             "value if set."
         },
     )
+    log_model: bool = field(
+        default=False, metadata={"help": "Log model to W&B after each epoch."}
+    )
 
     def __post_init__(self):
         if self.dataset_name is None and self.train_file is None and self.validation_file is None:
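Because these dataclasses are parsed with `HfArgumentParser` (as in the Hugging Face seq2seq examples), the new field surfaces directly as a `--log_model` command-line flag. A minimal, self-contained sketch (the tiny `DataArgs` class below is a hypothetical stand-in for the script's much larger `DataTrainingArguments`):

from dataclasses import dataclass, field
from transformers import HfArgumentParser

@dataclass
class DataArgs:  # hypothetical stand-in for DataTrainingArguments
    log_model: bool = field(
        default=False, metadata={"help": "Log model to W&B after each epoch."}
    )

# A bool field with default=False becomes a flag: passing it sets True.
(data_args,) = HfArgumentParser(DataArgs).parse_args_into_dataclasses(args=["--log_model"])
print(data_args.log_model)  # True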
@@ -812,6 +815,36 @@ def main():
             cur_step = epoch * (len(train_dataset) // train_batch_size)
             write_metric(summary_writer, train_metrics, eval_metrics, train_time, cur_step)
 
+        # save checkpoint after each epoch and push checkpoint to the hub
+        if jax.process_index() == 0:
+            params = jax.device_get(jax.tree_map(lambda x: x[0], state.params))
+
+            # save model locally
+            model.save_pretrained(
+                training_args.output_dir,
+                params=params,
+            )
+
+            # save to W&B
+            if data_args.log_model:
+                metadata = {'epoch': epoch+1, 'eval/loss': eval_metrics['loss']}
+                artifact = wandb.Artifact(
+                    name=f"model-{wandb.run.id}", type="bart_model", metadata=metadata
+                )
+                artifact.add_file(str(Path(training_args.output_dir) / 'flax_model.msgpack'))
+                artifact.add_file(str(Path(training_args.output_dir) / 'config.json'))
+                wandb.run.log_artifact(artifact)
+
+            # save to the hub
+            if training_args.push_to_hub:
+                model.save_pretrained(
+                    training_args.output_dir,
+                    params=params,
+                    push_to_hub=training_args.push_to_hub,
+                    commit_message=f"Saving weights and logs of epoch {epoch+1}",
+                    temp_dir=True,  # avoid issues with being in a repository
+                )
+
     # ======================== Prediction loop ==============================
     if training_args.do_predict:
         logger.info("*** Predict ***")

@@ -851,16 +884,6 @@ def main():
         desc = f"Predict Loss: {pred_metrics['loss']} | {rouge_desc})"
         logger.info(desc)
 
-    # save checkpoint after each epoch and push checkpoint to the hub
-    if jax.process_index() == 0:
-        params = jax.device_get(jax.tree_map(lambda x: x[0], state.params))
-        model.save_pretrained(
-            training_args.output_dir,
-            params=params,
-            push_to_hub=training_args.push_to_hub,
-            commit_message=f"Saving weights and logs of epoch {epoch+1}",
-        )
-
 
 if __name__ == "__main__":
     main()
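A side note on `jax.tree_map(lambda x: x[0], state.params)`: under `pmap`, every parameter carries a leading device axis, so indexing `[0]` takes one device's copy before `jax.device_get` pulls it to host memory for saving. A minimal, self-contained sketch of the same idiom (toy pytree in place of the script's real `state.params`; `flax.jax_utils.unreplicate` is the equivalent helper):

import jax
import numpy as np
from flax import jax_utils

# Toy stand-in for pmap-replicated params: leading axis = local device count.
replicated = {"w": np.stack([np.ones(3)] * jax.local_device_count())}

# What the PR does: drop the device axis, then fetch to host memory.
params = jax.device_get(jax.tree_map(lambda x: x[0], replicated))

# Equivalent flax helper.
params_too = jax.device_get(jax_utils.unreplicate(replicated))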
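With `--log_model` set, each epoch's checkpoint lands in W&B as a versioned artifact named `model-<run id>`. A hypothetical way to pull one back later (project name, run id, and the `:latest` version alias are all illustrative):

import wandb

run = wandb.init(project="my-project")  # hypothetical project
artifact = run.use_artifact("model-3abc123x:latest", type="bart_model")
artifact_dir = artifact.download()  # contains flax_model.msgpack and config.json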