echo 'python run_speech_recognition_seq2seq_streaming.py \
--model_name_or_path="openai/whisper-small" \
--dataset_name="mozilla-foundation/common_voice_11_0" \
--dataset_config_name="bn" \
--language="bengali" \
--train_split_name="train+validation" \
--eval_split_name="test" \
--model_index_name="Whisper Small Bengali" \
--output_dir="./" \
--overwrite_output_dir \
--max_steps="60000" \
--per_device_train_batch_size="4" \
--per_device_eval_batch_size="2" \
--gradient_accumulation_steps="8" \
--gradient_checkpointing="False" \
--evaluation_strategy="steps" \
--eval_steps="1000" \
--save_strategy="steps" \
--save_steps="1000" \
--save_total_limit="5" \
--learning_rate="1e-5" \
--warmup_steps="5000" \
--logging_steps="25" \
--weight_decay="0.01" \
--load_best_model_at_end="True" \
--metric_for_best_model="wer" \
--greater_is_better="False" \
--bf16="True" \
--tf32="True" \
--streaming="False" \
--generation_max_length="225" \
--length_column_name="input_length" \
--max_duration_in_seconds="30" \
--text_column_name="sentence" \
--freeze_feature_encoder="False" \
--report_to="tensorboard" \
--do_train \
--do_eval \
--predict_with_generate \
--do_normalize_eval \
--use_auth_token \
--push_to_hub' >> run.sh

#max_steps MAX_STEPS - If > 0: set total number of training steps to perform. Overrides num_train_epochs. (default: -1)
--max_steps="20000" \

#output_dir OUTPUT_DIR - The output directory where the model predictions and checkpoints will be written. (default: None)
--output_dir="./" \

#overwrite_output_dir [OVERWRITE_OUTPUT_DIR] - Overwrite the content of the output directory. Use this to continue training if output_dir points to a
#checkpoint directory. (default: False)
--overwrite_output_dir \

#weight_decay (float, optional, defaults to 0) — The weight decay to apply (if not zero) to all layers except all bias and LayerNorm weights in the
#AdamW optimizer. Weight decay helps prevent overfitting.
#Visit: https://towardsdatascience.com/this-thing-called-weight-decay-a7cd4bcfccab
# 0.1 - just right, 0.01 - takes more epochs to fit, 10 - never quite fits
--weight_decay="0.01" \

#bf16 (bool, optional, defaults to False) — Whether to use bf16 16-bit (mixed) precision training instead of 32-bit training. Requires Ampere or higher
#NVIDIA architecture or using CPU (no_cuda). This is an experimental API and it may change.
--bf16="True" \

#fp16 (bool, optional, defaults to False) — Whether to use fp16 16-bit (mixed) precision training instead of 32-bit training. Use either fp16 or bf16,
#not both.
--fp16="True" \

#tf32 (bool, optional) — Whether to enable the TF32 mode, available in Ampere and newer GPU architectures. The default value depends on PyTorch's
#version default of torch.backends.cuda.matmul.allow_tf32. This is an experimental API and it may change.
#Details: https://huggingface.co/docs/transformers/perf_train_gpu_one
#A quick way to check GPU support for bf16/tf32 is sketched after the deepspeed entry below.
--tf32="True" \

#gradient_checkpointing (bool, optional, defaults to False) — If True, use gradient checkpointing to save memory at the expense of a slower backward
#pass.
--gradient_checkpointing="False" \

#deepspeed (str or dict, optional) — Use DeepSpeed. This is an experimental feature and its API may evolve in the future. The value is either the
#location of the DeepSpeed json config file (e.g., ds_config.json) or an already loaded json file as a dict.
--deepspeed="ds_config.json" \
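
Before enabling the bf16/tf32 flags above, it can help to confirm the GPU actually supports them. A minimal Python sketch using standard PyTorch calls (illustrative only; the allow_tf32 toggles are roughly what --tf32="True" flips inside the Trainer):

import torch

# Only meaningful on a CUDA machine; bf16 additionally works on CPU via --no_cuda.
if torch.cuda.is_available():
    major, _minor = torch.cuda.get_device_capability()
    print("bf16 supported:", torch.cuda.is_bf16_supported())  # True on Ampere (A100, RTX 30xx) and newer
    print("tf32 capable:", major >= 8)                        # compute capability 8.0+ = Ampere or newer
    # Enabling TF32 globally is roughly what --tf32="True" does under the hood:
    torch.backends.cuda.matmul.allow_tf32 = True
    torch.backends.cudnn.allow_tf32 = True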
#auto_find_batch_size (bool, optional, defaults to False) — Whether to find a batch size that will fit into memory automatically through exponential
#decay, avoiding CUDA Out-of-Memory errors. Requires accelerate to be installed (pip install accelerate).
--auto_find_batch_size="True" \

#lr_scheduler_type (str or SchedulerType, optional, defaults to "linear") — The scheduler type to use.
#Scheduler types: "linear", "cosine", "cosine_with_restarts", "polynomial", "constant", "constant_with_warmup"
--lr_scheduler_type="linear" \

#torch_compile (bool, optional, defaults to False) — Whether or not to compile the model using PyTorch 2.0 torch.compile (requires a nightly install of
#PyTorch). If set, the backend will default to "inductor" (can be customized with torch_compile_backend) and the mode will default to "default" (can be
#customized with torch_compile_mode).
--torch_compile="True" \

#torch_compile_backend (str, optional) — The backend to use in torch.compile. If set to any value, torch_compile will be set to True.
#Possible choices are "eager", "aot_eager", "inductor", "nvfuser", "aot_nvfuser", "aot_cudagraphs", "ofi", "fx2trt", "onnxrt" and "ipex".
--torch_compile_backend="inductor" \

#torch_compile_mode (str, optional) — The mode to use in torch.compile. If set to any value, torch_compile will be set to True.
#Possible choices are "default", "reduce-overhead" and "max-autotune". See: https://pytorch.org/get-started/pytorch-2.0/#modes
--torch_compile_mode="default" \

#push_to_hub (bool, optional, defaults to False) — Whether or not to push the model to the Hub every time the model is saved. If this is activated,
#output_dir will begin a git directory synced with the repo (determined by hub_model_id) and the content will be pushed each time a save is triggered
#(depending on your save_strategy). Calling save_model() will also trigger a push. If output_dir exists, it needs to be a local clone of the repository
#to which the Trainer will be pushed.
--push_to_hub="False" \

#resume_from_checkpoint (str, optional) — The path to a folder with a valid checkpoint for your model. This argument is not directly used by Trainer;
#it is intended to be used by your training/evaluation scripts instead. See the example scripts for more details.
--resume_from_checkpoint="directory" \

#load_best_model_at_end (bool, optional, defaults to False) — Whether or not to load the best model found during training at the end of training.
#When set to True, the parameter save_strategy needs to be the same as evaluation_strategy, and in the case it is "steps", save_steps must be a
#round multiple of eval_steps.
--load_best_model_at_end="True" \

#metric_for_best_model (str, optional) — Use in conjunction with load_best_model_at_end to specify the metric to use to compare two different models.
#Must be the name of a metric returned by the evaluation with or without the prefix "eval_". Will default to "loss" if unspecified and
#load_best_model_at_end=True (to use the evaluation loss). If you set this value, greater_is_better will default to True. Don't forget to set it to
#False if your metric is better when lower.
--metric_for_best_model="wer" \

#greater_is_better (bool, optional) — Use in conjunction with load_best_model_at_end and metric_for_best_model to specify whether better models should
#have a greater metric or not. Will default to: True if metric_for_best_model is set to a value that isn't "loss" or "eval_loss"; False if
#metric_for_best_model is not set, or set to "loss" or "eval_loss".
--greater_is_better="False" \
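
The best-model bookkeeping constraints above are easier to see as the equivalent Python configuration. A minimal sketch, assuming the standard Seq2SeqTrainingArguments API (values are illustrative, not recommendations):

from transformers import Seq2SeqTrainingArguments

# save_strategy matches evaluation_strategy, save_steps is a round multiple of
# eval_steps, and WER is a lower-is-better metric, so greater_is_better is False.
args = Seq2SeqTrainingArguments(
    output_dir="./",
    evaluation_strategy="steps",
    eval_steps=1000,
    save_strategy="steps",
    save_steps=1000,               # 1000, 2000, 3000, ... would all satisfy the constraint
    save_total_limit=5,
    load_best_model_at_end=True,
    metric_for_best_model="wer",   # compared as "eval_wer" from the evaluation metrics
    greater_is_better=False,       # lower WER wins
    lr_scheduler_type="linear",
)

# Resuming is done at train() time, e.g. trainer.train(resume_from_checkpoint="./checkpoint-5000")
# (hypothetical path) or trainer.train(resume_from_checkpoint=True) to pick up the latest
# checkpoint found in output_dir.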
#eval_steps (int, optional) — Number of update steps between two evaluations if evaluation_strategy="steps". Will default to the same value as
#logging_steps if not set.
--eval_steps="1000" \

#dataloader_num_workers (int, optional, defaults to 0) — Number of subprocesses to use for data loading (PyTorch only). 0 means that the data will be
#loaded in the main process.
--dataloader_num_workers="1" \

#disable_tqdm (bool, optional) — Whether or not to disable the tqdm progress bars and table of metrics produced by ~notebook.NotebookTrainingTracker
#in Jupyter Notebooks. Will default to True if the logging level is set to warn or lower (default), False otherwise.
--disable_tqdm="False" \

#optim (str or training_args.OptimizerNames, optional, defaults to "adamw_hf") — The optimizer to use: adamw_hf, adamw_torch, adamw_apex_fused,
#adamw_anyprecision or adafactor.
#See this article for more intuition: https://huggingface.co/docs/transformers/perf_train_gpu_one
--optim="adamw_hf" \

#cache_dir CACHE_DIR. Where to store the pretrained models downloaded from huggingface.co (default: None)
--cache_dir="~/asr_training/models_cache" \

#max_train_samples MAX_TRAIN_SAMPLES. For debugging purposes or quicker training, truncate the number of training examples to this value if set.
#(default: None)
--max_train_samples="1000" \

#max_eval_samples MAX_EVAL_SAMPLES. For debugging purposes or quicker training, truncate the number of evaluation examples to this value if set.
#(default: None)
--max_eval_samples="100" \

#train_split_name TRAIN_SPLIT_NAME. The name of the training data set split to use (via the datasets library). Defaults to 'train' (default: train)
--train_split_name="train" \

#eval_split_name EVAL_SPLIT_NAME. The name of the evaluation data set split to use (via the datasets library). Defaults to 'test' (default: test)
--eval_split_name="valid" \

#do_lower_case [DO_LOWER_CASE]. Whether the target text should be lower cased. (default: False)
--do_lower_case="False" \

#do_remove_punctuation [DO_REMOVE_PUNCTUATION]. Whether the target text should be stripped of punctuation. (default: False)
--do_remove_punctuation="False" \

#do_normalize_eval [DO_NORMALIZE_EVAL]. Whether to normalise the references and predictions in the eval WER calculation. (default: True)
--do_normalize_eval="True" \

#no_do_normalize_eval. Negation of do_normalize_eval: pass this bare flag (no value) to skip normalising the references and predictions in the eval
#WER calculation.
--no_do_normalize_eval \
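
As a rough illustration of what --do_normalize_eval does during the eval WER calculation, the sketch below normalises both references and predictions before scoring. The exact normaliser used by the script may differ; BasicTextNormalizer from transformers is shown as one option, and the sample sentences are made up:

# pip install evaluate jiwer transformers
import evaluate
from transformers.models.whisper.english_normalizer import BasicTextNormalizer

metric = evaluate.load("wer")
normalizer = BasicTextNormalizer()   # strips punctuation and lowercases

predictions = ["আমি ভাত খাই।"]   # made-up decoded hypothesis
references = ["আমি ভাত খাই"]     # made-up ground-truth sentence

norm_preds = [normalizer(p) for p in predictions]
norm_refs = [normalizer(r) for r in references]
wer = 100 * metric.compute(predictions=norm_preds, references=norm_refs)
print(f"normalised WER: {wer:.2f}")   # punctuation differences no longer count as errors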