#!/bin/bash
#
# Benchmark driver: parses the command-line options below and then runs the
# selected evaluation sections of this script.
#
# NOTE(review): the non-ASCII message strings throughout this script look
# like UTF-8 text that was decoded with the wrong charset. They are kept
# exactly as found here; restore them from a correctly encoded copy.

# Print the command-line help text to stdout.
# Reads $0 to show the script's invocation name.
print_usage() {
  printf '%s\n' \
    "μ¬μ©λ²: $0 [μ΅μ
]" \
    "μ΅μ
:" \
    " --math-only μν λ²€μΉλ§ν¬λ§ μ€ν" \
    " --coding-only μ½λ© λ²€μΉλ§ν¬λ§ μ€ν" \
    " --help, -h λμλ§ μΆλ ₯" \
    "" \
    "κΈ°λ³Έκ°: μν + μ½λ© λ²€μΉλ§ν¬ λͺ¨λ μ€ν"
}

# Benchmark selection flags. Both suites run unless an option narrows it.
RUN_MATH=true
RUN_CODING=true

# Consume the command-line options one at a time. When several selection
# options are given the last one wins; --help and unknown options end the
# script before the shared `shift` below is reached.
while (( $# > 0 )); do
  case "$1" in
    --math-only)   RUN_MATH=true;  RUN_CODING=false ;;
    --coding-only) RUN_MATH=false; RUN_CODING=true ;;
    -h | --help)
      print_usage
      exit 0
      ;;
    *)
      echo "μ μ μλ μ΅μ
: $1"
      print_usage
      exit 1
      ;;
  esac
  shift
done

# GPU selection. Defaults to GPU 5 (the value that used to be hard-coded)
# but can be overridden from the environment, e.g. `GPU_ID=0 ./script.sh`.
GPU_ID="${GPU_ID:-5}"
export CUDA_VISIBLE_DEVICES="$GPU_ID"
echo "π― GPU μ€μ : GPU ${GPU_ID}λ² μ¬μ© (CUDA_VISIBLE_DEVICES=${GPU_ID})"

# Dataset locations exported for child processes.
# NOTE(review): presumably read by the EvalPlus tooling to use these local
# HumanEval+/MBPP+ files — confirm against the code_eval scripts.
export HUMANEVAL_OVERRIDE_PATH="/home/ubuntu/RLVR/Absolute-Zero-Reasoner/evaluation/code_eval/data/HumanEvalPlus.jsonl"
export MBPP_OVERRIDE_PATH="/home/ubuntu/RLVR/Absolute-Zero-Reasoner/evaluation/code_eval/data/MbppPlus.jsonl"

# Banner plus a one-line summary of which benchmark suites were selected.
echo "π Complete AZR vs Base Model Performance Comparison"
echo "======================================================"
echo "π μ€ν λͺ¨λ:"
# Dispatch on the flag pair; nothing is printed if neither flag is "true".
case "${RUN_MATH}:${RUN_CODING}" in
  true:true) echo " - μν + μ½λ© λ²€μΉλ§ν¬ λͺ¨λ μ€ν" ;;
  true:*)    echo " - μν λ²€μΉλ§ν¬λ§ μ€ν" ;;
  *:true)    echo " - μ½λ© λ²€μΉλ§ν¬λ§ μ€ν" ;;
esac

# START_TIME (epoch seconds) is used later to compute the total duration;
# TIMESTAMP is embedded in the log directory and report file names.
START_TIME="$(date +%s)"
TIMESTAMP="$(date +"%Y%m%d_%H%M%S")"

echo "β° μμ μκ°: $(date)"
echo "π μμ
λλ ν 리: $(pwd)"

# Show name, total/free memory and utilisation for the selected GPU.
# Purely informational: every failure path prints a message and continues.
echo "π μ¬μ©ν GPU μ 보:"
if ! command -v nvidia-smi > /dev/null 2>&1; then
  echo " nvidia-smiλ₯Ό μ¬μ©ν μ μμ΅λλ€"
else
  nvidia-smi --id="$GPU_ID" \
    --query-gpu=name,memory.total,memory.free,utilization.gpu \
    --format=csv,noheader,nounits 2> /dev/null \
    || echo " GPU ${GPU_ID} μ 보λ₯Ό κ°μ Έμ¬ μ μμ΅λλ€"
fi

# Per-run log directory, made unique by the start timestamp.
LOG_DIR="/home/ubuntu/RLVR/Absolute-Zero-Reasoner/evaluation/evaluation_logs_$TIMESTAMP"

# Stop early if the directory cannot be created: every later step writes its
# log (and the final report) under it, so continuing would lose all output.
if ! mkdir -p -- "$LOG_DIR"; then
  echo "ERROR: cannot create log directory: $LOG_DIR" >&2
  exit 1
fi

echo "π λ‘κ·Έ λλ ν 리: $LOG_DIR"

# Succeeds when $1 is a non-empty string of decimal digits.
is_uint() {
  [[ "$1" =~ ^[0-9]+$ ]]
}

# Pre-flight check: if the selected GPU already has more than 5000 MB in use,
# ask the operator whether to continue. Anything other than y/Y aborts.
echo ""
echo "β οΈ GPU λ©λͺ¨λ¦¬ μ¬μ©λ νμΈ:"

GPU_MEMORY_USED=""
GPU_MEMORY_TOTAL=""
if command -v nvidia-smi &> /dev/null; then
  GPU_MEMORY_USED="$(nvidia-smi --id="$GPU_ID" --query-gpu=memory.used --format=csv,noheader,nounits 2> /dev/null)"
  GPU_MEMORY_TOTAL="$(nvidia-smi --id="$GPU_ID" --query-gpu=memory.total --format=csv,noheader,nounits 2> /dev/null)"
  # Drop padding/newlines so the values can be validated as plain integers.
  GPU_MEMORY_USED="${GPU_MEMORY_USED//[[:space:]]/}"
  GPU_MEMORY_TOTAL="${GPU_MEMORY_TOTAL//[[:space:]]/}"
fi
# The total is only displayed, never compared; show a placeholder if unknown.
is_uint "$GPU_MEMORY_TOTAL" || GPU_MEMORY_TOTAL="?"

# Compare only a validated number. Missing nvidia-smi, a failed query, or
# non-numeric output (e.g. an error text) all take the "cannot check" path
# instead of tripping an "integer expression expected" error.
if is_uint "$GPU_MEMORY_USED"; then
  if [ "$GPU_MEMORY_USED" -gt 5000 ]; then
    echo " β οΈ GPU ${GPU_ID}λ²μ μ΄λ―Έ ${GPU_MEMORY_USED}MB μ¬μ© μ€μ
λλ€."
    echo " λ€λ₯Έ νλ‘μΈμ€κ° μ€ν μ€μΌ μ μμ΅λλ€."
    echo " κ³μ μ§ννμκ² μ΅λκΉ? (y/n)"
    read -r continue_eval
    if [[ ! "$continue_eval" =~ ^[Yy]$ ]]; then
      echo " νκ°λ₯Ό μ€λ¨ν©λλ€."
      exit 1
    fi
  else
    echo " β
GPU ${GPU_ID}λ² μ¬μ© κ°λ₯ (${GPU_MEMORY_USED}MB/${GPU_MEMORY_TOTAL}MB μ¬μ© μ€)"
  fi
else
  echo " GPU μνλ₯Ό νμΈν μ μμ΅λλ€. κ³μ μ§νν©λλ€."
fi

# Models evaluated on the math benchmarks. The three arrays are parallel:
# index i gives the model id passed to --model_name_or_path, the short name
# used in output/log paths, and the prompt template passed to --prompt_type.
#
# They are defined outside the RUN_MATH branch on purpose: the coding section
# copies MATH_MODELS / MATH_MODEL_NAMES, and with --coding-only it would
# otherwise iterate over empty arrays and evaluate nothing.
MATH_MODELS=(
  "andrewzh/Absolute_Zero_Reasoner-Coder-7b"
  "andrewzh2/Absolute_Zero_Reasoner-Base-7b"
  "Qwen/Qwen2.5-Coder-7B"
  "Qwen/Qwen2.5-7B"
)

MATH_MODEL_NAMES=(
  "azr_coder_7b"
  "azr_base_7b"
  "qwen25_7b_coder"
  "qwen25_7b"
)

MATH_TEMPLATES=(
  "azr"
  "azr"
  "qwen25-math-cot"
  "qwen25-math-cot"
)

# Run math_eval.py once per model, logging each run to its own file.
if [ "$RUN_MATH" = true ]; then
  MATH_EVAL_DIR="/home/ubuntu/RLVR/Absolute-Zero-Reasoner/evaluation/math_eval"

  echo ""
  echo "==============================================="
  echo "π μν λ²€μΉλ§ν¬ νκ° μμ (GPU ${GPU_ID}λ² μ¬μ©)"
  echo "==============================================="

  cd "$MATH_EVAL_DIR" || { echo "ERROR: cannot cd to $MATH_EVAL_DIR" >&2; exit 1; }

  echo "π ${#MATH_MODELS[@]}κ° λͺ¨λΈμ μν λ²€μΉλ§ν¬ νκ°λ₯Ό μμν©λλ€... (AZR λͺ¨λΈλΆν°)"
  echo ""

  for i in "${!MATH_MODELS[@]}"; do
    MODEL="${MATH_MODELS[$i]}"
    NAME="${MATH_MODEL_NAMES[$i]}"
    TEMPLATE="${MATH_TEMPLATES[$i]}"

    echo "π [$((i+1))/${#MATH_MODELS[@]}] $NAME μν νκ° μ€..."
    echo " λͺ¨λΈ: $MODEL"
    echo " ν
νλ¦Ώ: $TEMPLATE"

    # math_eval.py is invoked by relative name, so run from its directory.
    cd "$MATH_EVAL_DIR/eval" || { echo "ERROR: cannot cd to $MATH_EVAL_DIR/eval" >&2; exit 1; }

    # NOTE(review): this path uses "EVAL/results" while the script lives in
    # "eval/" and the final summary mentions "eval/eval_results/" — confirm
    # which location is intended (paths are case-sensitive on Linux).
    OUTPUT_DIR="/home/ubuntu/RLVR/Absolute-Zero-Reasoner/evaluation/math_eval/EVAL/results/${NAME}"
    mkdir -p "$OUTPUT_DIR"

    TOKENIZERS_PARALLELISM=false python -u math_eval.py \
      --model_name_or_path "$MODEL" \
      --data_name aime24,aime25,amc23,math500,olympiadbench,minerva_math \
      --output_dir "$OUTPUT_DIR" \
      --split test \
      --prompt_type "$TEMPLATE" \
      --num_test_sample -1 \
      --max_tokens_per_call 16000 \
      --seed 42 \
      --temperature 0 \
      --n_sampling 1 \
      --top_p 0.95 \
      --start 0 \
      --end -1 \
      --use_vllm \
      --save_outputs \
      2>&1 | tee "$LOG_DIR/${NAME}_math_evaluation.log"
    # The pipeline's own status is tee's; PIPESTATUS[0] is python's.
    math_status=${PIPESTATUS[0]}

    cd "$MATH_EVAL_DIR" || { echo "ERROR: cannot cd to $MATH_EVAL_DIR" >&2; exit 1; }

    # Report success only if the evaluation really succeeded; on failure
    # warn and move on to the next model.
    if [ "$math_status" -eq 0 ]; then
      echo "β
$NAME μν νκ° μλ£"
    else
      echo "WARNING: math evaluation failed for $NAME (exit status $math_status)" >&2
    fi
    echo ""
  done

  echo "π λͺ¨λ μν λ²€μΉλ§ν¬ νκ° μλ£!"
else
  echo "βοΈ μν λ²€μΉλ§ν¬ νκ° κ±΄λλ°κΈ°"
fi

# Run a command with stdout+stderr mirrored to a log file.
# Arguments: $1 - log file path; $2... - command and its arguments
# Returns:   the command's own exit status (not tee's)
run_logged() {
  local log_file=$1
  shift
  "$@" 2>&1 | tee "$log_file"
  return "${PIPESTATUS[0]}"
}

# Run HumanEval+, MBPP+ and LiveCodeBench for every model.
if [ "$RUN_CODING" = true ]; then
  AZR_ROOT="/home/ubuntu/RLVR/Absolute-Zero-Reasoner"
  CODE_EVAL_DIR="$AZR_ROOT/evaluation/code_eval"

  echo ""
  echo "==============================================="
  echo "π» μ½λ© λ²€μΉλ§ν¬ νκ° μμ (GPU ${GPU_ID}λ² μ¬μ©)"
  echo "==============================================="

  cd "$CODE_EVAL_DIR" || { echo "ERROR: cannot cd to $CODE_EVAL_DIR" >&2; exit 1; }

  # Reuse the math model list when it is populated. Under --coding-only the
  # math section may never have defined it, so fall back to the same four
  # models instead of silently looping over nothing.
  if (( ${#MATH_MODELS[@]} > 0 )); then
    MODELS=("${MATH_MODELS[@]}")
    MODEL_NAMES=("${MATH_MODEL_NAMES[@]}")
  else
    MODELS=(
      "andrewzh/Absolute_Zero_Reasoner-Coder-7b"
      "andrewzh2/Absolute_Zero_Reasoner-Base-7b"
      "Qwen/Qwen2.5-Coder-7B"
      "Qwen/Qwen2.5-7B"
    )
    MODEL_NAMES=(
      "azr_coder_7b"
      "azr_base_7b"
      "qwen25_7b_coder"
      "qwen25_7b"
    )
  fi

  echo "π μλμΌλ‘ ${#MODELS[@]}κ° λͺ¨λΈμ μ½λ© λ²€μΉλ§ν¬ νκ°λ₯Ό μμν©λλ€... (AZR λͺ¨λΈλΆν°)"
  echo " - HumanEval+ λ° MBPP+ νκ°"
  echo " - LiveCodeBench νκ°"
  echo ""

  mkdir -p "$LOG_DIR/coding_results"

  for i in "${!MODELS[@]}"; do
    MODEL="${MODELS[$i]}"
    NAME="${MODEL_NAMES[$i]}"

    echo "π [$((i+1))/${#MODELS[@]}] $NAME μ½λ© νκ° μ€..."
    echo " λͺ¨λΈ: $MODEL"

    # Each benchmark is best-effort: a failure is reported and the loop
    # continues. run_logged returns the benchmark's status, so the warning
    # fires on real failures (a bare `cmd | tee || echo` only sees tee).
    echo " π HumanEval+ νκ° μ€..."
    run_logged "$LOG_DIR/coding_results/${NAME}_humaneval.log" \
      bash scripts/run_evalplus.sh humaneval "$MODEL" 1 0.0 0.95 1 \
      || echo "β οΈ HumanEval+ νκ° μ€ν¨: $NAME"

    echo " π MBPP+ νκ° μ€..."
    run_logged "$LOG_DIR/coding_results/${NAME}_mbpp.log" \
      bash scripts/run_evalplus.sh mbpp "$MODEL" 1 0.0 0.95 1 \
      || echo "β οΈ MBPP+ νκ° μ€ν¨: $NAME"

    echo " π LiveCodeBench νκ° μ€..."
    # run_lcb_gen.sh is invoked by a path relative to the repository root.
    cd "$AZR_ROOT" || { echo "ERROR: cannot cd to $AZR_ROOT" >&2; exit 1; }
    run_logged "$LOG_DIR/coding_results/${NAME}_lcb.log" \
      bash evaluation/code_eval/scripts/run_lcb_gen.sh \
      --model "$MODEL" \
      --gpu "$GPU_ID" \
      --n 1 \
      --temperature 0.0 \
      --top_p 0.95 \
      --max_tokens 2048 \
      || echo "β οΈ LiveCodeBench νκ° μ€ν¨: $NAME"

    cd "$CODE_EVAL_DIR" || { echo "ERROR: cannot cd to $CODE_EVAL_DIR" >&2; exit 1; }

    echo "β
$NAME μ½λ© νκ° μλ£"
    echo ""
  done

  echo "π λͺ¨λ μ½λ© λ²€μΉλ§ν¬ νκ° μλ£!"
  echo "π κ²°κ³Ό νμΌλ€:"
  echo " - HumanEval+: evalplus_results/humaneval/"
  echo " - MBPP+: evalplus_results/mbpp/"
  echo " - LiveCodeBench: coding/LiveCodeBench/outputs/"
  echo " - λ‘κ·Έ: $LOG_DIR/coding_results/"
else
  echo "βοΈ μ½λ© λ²€μΉλ§ν¬ νκ° κ±΄λλ°κΈ°"
fi

# Result collection: runs the math result collector when the math suite was
# selected. Failures here are reported but do not stop the script, so the
# summary report is still produced afterwards.
echo ""
echo "==============================================="
echo "π κ²°κ³Ό μμ§ λ° λΆμ"
echo "==============================================="

EVALUATION_DIR="/home/ubuntu/RLVR/Absolute-Zero-Reasoner/evaluation"
cd "$EVALUATION_DIR" || echo "WARNING: cannot cd to $EVALUATION_DIR" >&2

if [ "$RUN_MATH" = true ]; then
  echo "π μν νκ° κ²°κ³Ό μμ§ μ€..."
  # collect_all_math_results.py is invoked by relative name.
  if cd "$EVALUATION_DIR/math_eval"; then
    python3 collect_all_math_results.py 2>&1 | tee "$LOG_DIR/result_collection.log"
    # The pipeline's own status is tee's; PIPESTATUS[0] is python3's.
    collect_status=${PIPESTATUS[0]}
    if [ "$collect_status" -ne 0 ]; then
      echo "WARNING: math result collection failed (exit status $collect_status)" >&2
    fi
  else
    echo "WARNING: cannot cd to $EVALUATION_DIR/math_eval; skipping result collection" >&2
  fi
fi

echo ""
echo "==============================================="
echo "π μ’
ν© λ¦¬ν¬νΈ μμ±"
echo "==============================================="

REPORT_FILE="$LOG_DIR/evaluation_summary_$TIMESTAMP.md"

# Print the Markdown summary report to stdout.
# Reads LOG_DIR for the log locations; the date shown is the time of the
# call. The model list and settings are static text, not computed from the
# run, so keep them in sync with the evaluation sections by hand.
# NOTE(review): the math "detailed results" path printed here
# (eval/eval_results/) differs from the OUTPUT_DIR used by the math section
# (EVAL/results/<name>) — confirm which one is correct.
render_report() {
  cat << EOF
# AZR vs Base Models Performance Evaluation Report

## νκ° κ°μ
- **νκ° μμ**: $(date)
- **νκ° λͺ¨λΈ**: 4κ° (AZR 2κ° + Base 2κ°)
- **λ²€μΉλ§ν¬**: Math (6κ°) + Coding (μ ν)

## νκ°λ λͺ¨λΈλ€
1. **Qwen/Qwen2.5-7B** (Base Model)
2. **andrewzh/Absolute_Zero_Reasoner-Coder-7b** (AZR Coder)
3. **andrewzh2/Absolute_Zero_Reasoner-Base-7b** (AZR Base)
4. **Qwen/Qwen2.5-Coder-7B** (Coder Base)

## μν λ²€μΉλ§ν¬
- AIME 2024/2025
- AMC 2023
- Math500
- OlympiadBench
- Minerva Math

## νκ° μ€μ 
**μν λ²€μΉλ§ν¬:**
- Temperature: 0 (greedy decoding)
- Max tokens: 16000
- Seed: 42
- Benchmarks: AIME24/25, AMC23, Math500, OlympiadBench, Minerva Math

**μ½λ© λ²€μΉλ§ν¬:**
- Temperature: 0 (greedy decoding)
- Max tokens: 2048 (LiveCodeBench), default (EvalPlus)
- Datasets: HumanEval+, MBPP+, LiveCodeBench v5

## κ²°κ³Ό νμΌ μμΉ
**μν νκ°:**
- μν νκ° λ‘κ·Έ: $LOG_DIR/<model_name>_math_evaluation.log
- κ²°κ³Ό μμ§ λ‘κ·Έ: $LOG_DIR/result_collection.log
- μμΈ κ²°κ³Ό: evaluation/math_eval/eval/eval_results/

**μ½λ© νκ°:**
- HumanEval+ κ²°κ³Ό: evaluation/code_eval/evalplus_results/humaneval/
- MBPP+ κ²°κ³Ό: evaluation/code_eval/evalplus_results/mbpp/
- LiveCodeBench κ²°κ³Ό: evaluation/code_eval/coding/LiveCodeBench/outputs/
- λ‘κ·Έ νμΌ: $LOG_DIR/coding_results/

## λ€μ λ¨κ³
1. WandB λμ보λμμ μ€μκ° κ²°κ³Ό νμΈ
2. λ‘컬 κ²°κ³Ό νμΌμμ μμΈ λΆμ (μν + μ½λ©)
3. AZR vs Base λͺ¨λΈ μ’
ν© μ±λ₯ λΉκ΅
4. Cross-domain μ±λ₯ ν₯μ ν¨κ³Ό λΆμ

EOF
}

# Announce success only if the report was actually written.
if render_report > "$REPORT_FILE"; then
  echo "π μ’
ν© λ¦¬ν¬νΈ μμ± μλ£: $REPORT_FILE"
else
  echo "ERROR: failed to write report: $REPORT_FILE" >&2
fi

# Total wall-clock time since START_TIME, split into h/m/s for display.
END_TIME=$(date +%s)
DURATION=$((END_TIME - START_TIME))
HOURS=$((DURATION / 3600))
MINUTES=$(((DURATION % 3600) / 60))
# Deliberately not named SECONDS: that is a special Bash variable (seconds
# since shell start). Assigning to it resets the shell's timer and the value
# keeps counting up afterwards.
REMAINING_SECONDS=$((DURATION % 60))

echo ""
echo "π μ 체 νκ° μλ£!"
echo "β±οΈ μ΄ μμ μκ°: ${HOURS}μκ° ${MINUTES}λΆ ${REMAINING_SECONDS}μ΄"
echo "π λͺ¨λ λ‘κ·Έλ $LOG_DIR λλ ν 리μ μ μ₯λμμ΅λλ€."

echo ""
echo "π κ²°κ³Ό νμΈ λ°©λ²:"
echo " 1. WandB: https://wandb.ai (νλ‘μ νΈ: verl_math_evaluate)"
echo " 2. λ‘컬 νμΌ: evaluation/math_eval/eval/eval_results/"
echo " 3. μμ½ λ¦¬ν¬νΈ: $REPORT_FILE"