description: Distributed Train AE on Wikipedia Dataset

auth:
  # which virtual cluster you belong to (msrlabs, resrchprojvc6, etc.). Everyone has access to "pnrsy".
  vc: resrchprojvc7
  # physical cluster to use (cam, gcr, rr1) or Azure clusters (eu1, eu2, etc.)
  # cluster: rr2, eu2, eu1 et1
  cluster: rr1

# docker environment (vm) in which your job will run. we provide "generic" dockers
# with the main deep learning toolkit installed (PyTorch, TF, Chainer, etc.)
docker:
  # image: philly/jobs/custom/generic-docker:py27
  # registry: phillyregistry.azurecr.io
  image: chunyl/pytorch-transformers:v0
  registry: index.docker.io

storage:
  _default:
    #use_phillyfs: True
    storage_account_name: textae
    container_name: bigtextae
    mount_path: /mnt/_default

code:
  # local directory of the code. this will be uploaded to the server.
  # $CONFIG_DIR is expanded to the directory of this config file
  code_upload: False
  remote_dir: code/
  local_dir: $CONFIG_DIR/code

#data:
  # data upload is not required for this example
  #data_upload: False

search:
  job_template:
    name: exp_{experiment_name:s}_b{bs_option:.0f}_beta_{beta_option:.2f}_d_{dim_target_kl_option:.2f}_r0_{ratio_zero_option:.2f}_ra_{ratio_increase_option:.2f}
    sku: G8 # G4 # G1
    command:
      - pip install --user --editable .
      - pip install --user azure
      - pip install --user tqdm
      # NOTE(review): one 8-GPU distributed pretraining run per grid trial; hyper-parameters
      # below are filled in from the `params` grid by the job launcher.
      - python -m torch.distributed.launch --nproc_per_node 8 examples/big_ae/run_lm_vae_pretraining_distributed.py --use_philly --num_train_epochs 1.0 --beta {beta_option} --dim_target_kl {dim_target_kl_option} --ratio_zero {ratio_zero_option} --ratio_increase {ratio_increase_option} --dataset wikipedia --per_gpu_train_batch_size {bs_option} --per_gpu_eval_batch_size 1 --block_size 128 --output_dir ../output/philly_rr1_vae_wikipedia_pretraining_b{beta_option}_d{dim_target_kl_option}_r0{ratio_zero_option}_ra{ratio_increase_option} --encoder_model_type bert --encoder_model_name_or_path bert-base-cased --decoder_model_type gpt2 --decoder_model_name_or_path gpt2 --do_train --train_data_file ../data/datasets/wikipedia_json_64/ --overwrite_output_dir --save_steps 20000 --logging_steps 100
  max_trials: 50
  type: grid
  params:
    - name: bs_option
      spec: discrete
      values: [16] #
    - name: beta_option
      spec: discrete
      values: [0.0] #
    - name: dim_target_kl_option
      spec: discrete
      values: [1.0] #
    - name: ratio_zero_option
      spec: discrete
      values: [1.0] #
    - name: ratio_increase_option
      spec: discrete
      values: [0.1] #