#!/usr/bin/env bash
set -ex

# export CUDA_VISIBLE_DEVICES=7
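
# Usage:
#   bash <this_script> PROMPT_TYPE MODEL_NAME_OR_PATH OUTPUT_DIR TEMPERATURE MAX_TOKENS TOP_P \
#        [BENCHMARKS] [OVERWRITE(true|false)] [N_SAMPLING] [SEED]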
PROMPT_TYPE=$1        # prompt template name understood by math_eval.py
MODEL_NAME_OR_PATH=$2 # HF model id or local checkpoint path
OUTPUT_DIR=$3
temperature=$4
max_tokens=$5
top_p=$6
# English open datasets evaluated by default
benchmarks=${7:-"gsm8k,math500,minerva_math,gaokao2023en,olympiadbench,college_math,aime24,amc23"}
OVERWRITE=${8:-false} # "true" re-runs benchmarks whose outputs already exist
N_SAMPLING=${9:-1}    # completions sampled per problem
seed=${10:-0}

SPLIT="test"
NUM_TEST_SAMPLE=-1    # -1 = evaluate the full split
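
# Example (all values illustrative; the prompt type must match one defined in math_eval.py):
#   bash <this_script> cot /path/to/checkpoint outputs/eval 0.0 2048 1 "gsm8k,math500"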
if [ "$OVERWRITE" = "true" ]; then
OVERWRITE="--overwrite"
else
OVERWRITE=""
fi
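# OVERWRITE now holds either "--overwrite" or the empty string.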

# Split benchmarks into two groups: aime24 and amc23 are small competition
# sets that are meant to be evaluated with repeated sampling, so they run
# in a separate invocation below.
IFS=',' read -ra BENCHMARK_ARRAY <<< "$benchmarks"   # split the comma-separated list into an array
REGULAR_BENCHMARKS=()
SPECIAL_BENCHMARKS=()
for benchmark in "${BENCHMARK_ARRAY[@]}"; do
    if [[ "$benchmark" == "aime24" || "$benchmark" == "amc23" ]]; then
        SPECIAL_BENCHMARKS+=("$benchmark")
    else
        REGULAR_BENCHMARKS+=("$benchmark")
    fi
done
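# With the default benchmark list this yields:
#   REGULAR_BENCHMARKS=(gsm8k math500 minerva_math gaokao2023en olympiadbench college_math)
#   SPECIAL_BENCHMARKS=(aime24 amc23)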

# With greedy decoding (temperature 0) repeated sampling is redundant,
# so fold the special benchmarks back into the regular batch.
if [ "$temperature" = "0.0" ] || [ "$temperature" = "0" ]; then
    REGULAR_BENCHMARKS=("${REGULAR_BENCHMARKS[@]}" "${SPECIAL_BENCHMARKS[@]}")
    SPECIAL_BENCHMARKS=()
fi
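# e.g. at temperature 0 the default eight benchmarks all run in the single regular pass below.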

# Run regular benchmarks with the configured N_SAMPLING (default 1).
if [ ${#REGULAR_BENCHMARKS[@]} -gt 0 ]; then
    # Join the array back into a comma-separated string for --data_name.
    REGULAR_BENCHMARKS_STR=$(IFS=,; echo "${REGULAR_BENCHMARKS[*]}")
    # TOKENIZERS_PARALLELISM=false silences HF tokenizers fork warnings;
    # ${OVERWRITE} stays unquoted so an empty value expands to no argument.
    TOKENIZERS_PARALLELISM=false \
    python -u math_eval.py \
        --model_name_or_path "${MODEL_NAME_OR_PATH}" \
        --data_name "${REGULAR_BENCHMARKS_STR}" \
        --output_dir "${OUTPUT_DIR}" \
        --split "${SPLIT}" \
        --prompt_type "${PROMPT_TYPE}" \
        --num_test_sample "${NUM_TEST_SAMPLE}" \
        --max_tokens_per_call "${max_tokens}" \
        --seed "${seed}" \
        --temperature "${temperature}" \
        --n_sampling "${N_SAMPLING}" \
        --top_p "${top_p}" \
        --start 0 \
        --end -1 \
        --use_vllm \
        --save_outputs \
        ${OVERWRITE}
fi

# Run special benchmarks (aime24, amc23) separately; these small sets are
# intended for repeated sampling (e.g. N_SAMPLING=8), though this pass
# reuses the same N_SAMPLING value as above.
if [ ${#SPECIAL_BENCHMARKS[@]} -gt 0 ]; then
    SPECIAL_BENCHMARKS_STR=$(IFS=,; echo "${SPECIAL_BENCHMARKS[*]}")
    TOKENIZERS_PARALLELISM=false \
    python -u math_eval.py \
        --model_name_or_path "${MODEL_NAME_OR_PATH}" \
        --data_name "${SPECIAL_BENCHMARKS_STR}" \
        --output_dir "${OUTPUT_DIR}" \
        --split "${SPLIT}" \
        --prompt_type "${PROMPT_TYPE}" \
        --num_test_sample "${NUM_TEST_SAMPLE}" \
        --max_tokens_per_call "${max_tokens}" \
        --seed "${seed}" \
        --temperature "${temperature}" \
        --n_sampling "${N_SAMPLING}" \
        --top_p "${top_p}" \
        --start 0 \
        --end -1 \
        --use_vllm \
        --save_outputs \
        ${OVERWRITE}
fi