# neural-mesh-v2 / evaluation /run_complete_evaluation.sh
# hjkim00's picture
# Restore all essential files - code, configs, and MBPP/HumanEval data
# 24c2665 verified
#!/bin/bash
# Complete performance comparison script: AZR models vs. base models.
# Runs both the math benchmarks and the coding benchmarks.
#######################################
# Print the command-line usage summary.
# Arguments: none
# Outputs:   seven lines on stdout (synopsis, the three options, a blank
#            line, and the default behaviour); the synopsis embeds $0
#######################################
print_usage() {
  printf '%s\n' \
    "μ‚¬μš©λ²•: $0 [μ˜΅μ…˜]" \
    "μ˜΅μ…˜:" \
    " --math-only μˆ˜ν•™ 벀치마크만 μ‹€ν–‰" \
    " --coding-only μ½”λ”© 벀치마크만 μ‹€ν–‰" \
    " --help, -h 도움말 좜λ ₯" \
    "" \
    "κΈ°λ³Έκ°’: μˆ˜ν•™ + μ½”λ”© 벀치마크 λͺ¨λ‘ μ‹€ν–‰"
}
# Parse the command-line arguments.
# Both benchmark suites are enabled by default. Each --*-only flag narrows the
# run to a single suite (when several are given, the last one wins). --help/-h
# prints the usage text and exits 0; any other argument is reported together
# with the usage text and the script exits 1.
RUN_MATH=true
RUN_CODING=true
while (( $# > 0 )); do
  case "$1" in
    -h | --help)
      print_usage
      exit 0
      ;;
    --coding-only)
      RUN_CODING=true
      RUN_MATH=false
      shift
      ;;
    --math-only)
      RUN_CODING=false
      RUN_MATH=true
      shift
      ;;
    *)
      echo "μ•Œ 수 μ—†λŠ” μ˜΅μ…˜: $1"
      print_usage
      exit 1
      ;;
  esac
done
# GPU selection. GPU_ID is the single knob behind every GPU-related setting in
# this script. It defaults to 5 and may be overridden from the environment
# (e.g. `GPU_ID=0 bash run_complete_evaluation.sh`) instead of editing the file.
GPU_ID="${GPU_ID:-5}"
export CUDA_VISIBLE_DEVICES="$GPU_ID"
echo "🎯 GPU μ„€μ •: GPU ${GPU_ID}번 μ‚¬μš© (CUDA_VISIBLE_DEVICES=${GPU_ID})"

# EvalPlus dataset paths. A value already exported by the caller is kept;
# otherwise the path inside the default checkout is used.
export HUMANEVAL_OVERRIDE_PATH="${HUMANEVAL_OVERRIDE_PATH:-/home/ubuntu/RLVR/Absolute-Zero-Reasoner/evaluation/code_eval/data/HumanEvalPlus.jsonl}"
export MBPP_OVERRIDE_PATH="${MBPP_OVERRIDE_PATH:-/home/ubuntu/RLVR/Absolute-Zero-Reasoner/evaluation/code_eval/data/MbppPlus.jsonl}"
# Print the banner, then report which benchmark suites were selected
# (RUN_MATH / RUN_CODING hold the strings "true" or "false").
printf '%s\n' \
  "πŸš€ Complete AZR vs Base Model Performance Comparison" \
  "======================================================" \
  "πŸ“‹ μ‹€ν–‰ λͺ¨λ“œ:"
if [[ "$RUN_MATH" == true ]]; then
  if [[ "$RUN_CODING" == true ]]; then
    echo " - μˆ˜ν•™ + μ½”λ”© 벀치마크 λͺ¨λ‘ μ‹€ν–‰"
  else
    echo " - μˆ˜ν•™ 벀치마크만 μ‹€ν–‰"
  fi
elif [[ "$RUN_CODING" == true ]]; then
  echo " - μ½”λ”© 벀치마크만 μ‹€ν–‰"
fi
# Record the current time: START_TIME in epoch seconds and TIMESTAMP as a
# sortable YYYYmmdd_HHMMSS string, then print the start time and the
# working directory.
START_TIME="$(date '+%s')"
TIMESTAMP="$(date '+%Y%m%d_%H%M%S')"
printf '%s\n' \
  "⏰ μ‹œμž‘ μ‹œκ°„: $(date)" \
  "πŸ“ μž‘μ—… 디렉토리: $(pwd)"
# Show information about the selected GPU: name, total/free memory and
# utilisation as reported by nvidia-smi. Purely informational: a missing
# nvidia-smi or a failed query only prints a notice.
echo "πŸ” μ‚¬μš©ν•  GPU 정보:"
if ! command -v nvidia-smi > /dev/null 2>&1; then
  echo " nvidia-smiλ₯Ό μ‚¬μš©ν•  수 μ—†μŠ΅λ‹ˆλ‹€"
elif ! nvidia-smi --id="$GPU_ID" \
  --query-gpu=name,memory.total,memory.free,utilization.gpu \
  --format=csv,noheader,nounits 2> /dev/null; then
  echo " GPU ${GPU_ID} 정보λ₯Ό κ°€μ Έμ˜¬ 수 μ—†μŠ΅λ‹ˆλ‹€"
fi
# Create the log directory for this run (absolute path, suffixed with
# TIMESTAMP). Later steps tee their output into this directory and the summary
# report is written there, so abort immediately if it cannot be created
# instead of running a long evaluation whose logs would be lost.
LOG_DIR="/home/ubuntu/RLVR/Absolute-Zero-Reasoner/evaluation/evaluation_logs_$TIMESTAMP"
if ! mkdir -p -- "$LOG_DIR"; then
  echo "ERROR: cannot create log directory: $LOG_DIR" >&2
  exit 1
fi
echo "πŸ“ 둜그 디렉토리: $LOG_DIR"
# GPU memory check and warning.
# Before starting, look at how much memory is already in use on GPU_ID. Above
# 5000 (MB, per the messages below) the user must confirm with y/Y, otherwise
# the script exits 1. If the usage cannot be determined the check is skipped
# with a notice rather than reported as "GPU available".

#######################################
# Print one integer memory field of GPU_ID as reported by nvidia-smi.
# Globals:   GPU_ID (read)
# Arguments: $1 - --query-gpu field name, e.g. memory.used
# Outputs:   the value with surrounding whitespace removed, on stdout
# Returns:   0 on success; 1 if nvidia-smi fails or its output is not a
#            single non-negative integer (e.g. "[N/A]", error text, or one
#            line per GPU when several GPUs match)
#######################################
_gpu_memory_field() {
  local value
  value="$(nvidia-smi --id="$GPU_ID" --query-gpu="$1" \
    --format=csv,noheader,nounits 2> /dev/null)" || return 1
  # Trim leading and trailing whitespace before validating.
  value="${value#"${value%%[![:space:]]*}"}"
  value="${value%"${value##*[![:space:]]}"}"
  [[ "$value" =~ ^[0-9]+$ ]] || return 1
  printf '%s\n' "$value"
}

echo ""
echo "⚠️ GPU λ©”λͺ¨λ¦¬ μ‚¬μš©λŸ‰ 확인:"
if command -v nvidia-smi &> /dev/null \
  && GPU_MEMORY_USED="$(_gpu_memory_field memory.used)" \
  && GPU_MEMORY_TOTAL="$(_gpu_memory_field memory.total)"; then
  # 10# forces base 10 so a value with leading zeros is not parsed as octal.
  if (( 10#$GPU_MEMORY_USED > 5000 )); then
    echo " ⚠️ GPU ${GPU_ID}λ²ˆμ— 이미 ${GPU_MEMORY_USED}MB μ‚¬μš© μ€‘μž…λ‹ˆλ‹€."
    echo " λ‹€λ₯Έ ν”„λ‘œμ„ΈμŠ€κ°€ μ‹€ν–‰ 쀑일 수 μžˆμŠ΅λ‹ˆλ‹€."
    echo " 계속 μ§„ν–‰ν•˜μ‹œκ² μŠ΅λ‹ˆκΉŒ? (y/n)"
    # On EOF (non-interactive stdin) the answer stays empty and the run aborts.
    read -r continue_eval
    if [[ ! "$continue_eval" =~ ^[Yy]$ ]]; then
      echo " 평가λ₯Ό μ€‘λ‹¨ν•©λ‹ˆλ‹€."
      exit 1
    fi
  else
    echo " βœ… GPU ${GPU_ID}번 μ‚¬μš© κ°€λŠ₯ (${GPU_MEMORY_USED}MB/${GPU_MEMORY_TOTAL}MB μ‚¬μš© 쀑)"
  fi
else
  echo " GPU μƒνƒœλ₯Ό 확인할 수 μ—†μŠ΅λ‹ˆλ‹€. 계속 μ§„ν–‰ν•©λ‹ˆλ‹€."
fi
# 1. Math benchmark evaluation.

# ==========================================
# Model configuration (when adding a model, edit only this section).
# The three arrays are parallel: entry i of each describes the same model
# (model id, short name used for output/log names, prompt template).
# They are defined outside the RUN_MATH branch because the coding benchmark
# section copies MATH_MODELS / MATH_MODEL_NAMES and must still see them when
# the math benchmarks are skipped (--coding-only).
# ==========================================
MATH_MODELS=(
  "andrewzh/Absolute_Zero_Reasoner-Coder-7b"
  "andrewzh2/Absolute_Zero_Reasoner-Base-7b"
  "Qwen/Qwen2.5-Coder-7B"
  "Qwen/Qwen2.5-7B"
)
MATH_MODEL_NAMES=(
  "azr_coder_7b"
  "azr_base_7b"
  "qwen25_7b_coder"
  "qwen25_7b"
)
MATH_TEMPLATES=(
  "azr"
  "azr"
  "qwen25-math-cot"
  "qwen25-math-cot"
)

if [ "$RUN_MATH" = true ]; then
  echo ""
  echo "==============================================="
  echo "πŸ“Š μˆ˜ν•™ 벀치마크 평가 μ‹œμž‘ (GPU ${GPU_ID}번 μ‚¬μš©)"
  echo "==============================================="
  cd /home/ubuntu/RLVR/Absolute-Zero-Reasoner/evaluation/math_eval \
    || echo "WARNING: cannot cd to /home/ubuntu/RLVR/Absolute-Zero-Reasoner/evaluation/math_eval" >&2

  # Evaluate each model separately (AZR models first).
  echo "πŸš€ ${#MATH_MODELS[@]}개 λͺ¨λΈμ˜ μˆ˜ν•™ 벀치마크 평가λ₯Ό μ‹œμž‘ν•©λ‹ˆλ‹€... (AZR λͺ¨λΈλΆ€ν„°)"
  echo ""
  MATH_FAILURES=0
  for i in "${!MATH_MODELS[@]}"; do
    MODEL="${MATH_MODELS[$i]}"
    NAME="${MATH_MODEL_NAMES[$i]}"
    TEMPLATE="${MATH_TEMPLATES[$i]}"
    echo "πŸ”„ [$((i+1))/${#MATH_MODELS[@]}] $NAME μˆ˜ν•™ 평가 쀑..."
    echo " λͺ¨λΈ: $MODEL"
    echo " ν…œν”Œλ¦Ώ: $TEMPLATE"

    # Create the per-model output directory.
    OUTPUT_DIR="/home/ubuntu/RLVR/Absolute-Zero-Reasoner/evaluation/math_eval/EVAL/results/${NAME}"
    mkdir -p "$OUTPUT_DIR"

    # Run the Python script directly from the eval directory. The subshell
    # keeps the cd local (the caller stays in math_eval) and, if the cd fails,
    # python is not started from the wrong directory.
    (
      cd /home/ubuntu/RLVR/Absolute-Zero-Reasoner/evaluation/math_eval/eval \
        && TOKENIZERS_PARALLELISM=false python -u math_eval.py \
          --model_name_or_path "$MODEL" \
          --data_name aime24,aime25,amc23,math500,olympiadbench,minerva_math \
          --output_dir "$OUTPUT_DIR" \
          --split test \
          --prompt_type "$TEMPLATE" \
          --num_test_sample -1 \
          --max_tokens_per_call 16000 \
          --seed 42 \
          --temperature 0 \
          --n_sampling 1 \
          --top_p 0.95 \
          --start 0 \
          --end -1 \
          --use_vllm \
          --save_outputs
    ) 2>&1 | tee "$LOG_DIR/${NAME}_math_evaluation.log"
    # The pipeline's own status is tee's; take the evaluation's status instead
    # so a crashed run is not reported as completed.
    MATH_EVAL_STATUS="${PIPESTATUS[0]}"

    if (( MATH_EVAL_STATUS == 0 )); then
      echo "βœ… $NAME μˆ˜ν•™ 평가 μ™„λ£Œ"
    else
      MATH_FAILURES=$((MATH_FAILURES + 1))
      echo "WARNING: math evaluation failed for $NAME (exit code $MATH_EVAL_STATUS); see $LOG_DIR/${NAME}_math_evaluation.log" >&2
    fi
    echo ""
  done
  echo "πŸŽ‰ λͺ¨λ“  μˆ˜ν•™ 벀치마크 평가 μ™„λ£Œ!"
  if (( MATH_FAILURES > 0 )); then
    echo "WARNING: ${MATH_FAILURES} of ${#MATH_MODELS[@]} math evaluations failed" >&2
  fi
else
  echo "⏭️ μˆ˜ν•™ 벀치마크 평가 κ±΄λ„ˆλ›°κΈ°"
fi
# 2. Coding benchmark evaluation (optional).

#######################################
# Run a command, sending its stdout and stderr both to the terminal and to a
# log file.
# Arguments: $1  - log file path (overwritten)
#            $2+ - the command and its arguments
# Returns:   the command's own exit status. The status of `cmd | tee` is
#            tee's, so a plain `cmd | tee log || handle_failure` never fires.
#######################################
_run_logged() {
  local log_file="$1"
  shift
  "$@" 2>&1 | tee "$log_file"
  return "${PIPESTATUS[0]}"
}

if [ "$RUN_CODING" = true ]; then
  echo ""
  echo "==============================================="
  echo "πŸ’» μ½”λ”© 벀치마크 평가 μ‹œμž‘ (GPU ${GPU_ID}번 μ‚¬μš©)"
  echo "==============================================="
  # The EvalPlus runs below use a path relative to this directory.
  cd /home/ubuntu/RLVR/Absolute-Zero-Reasoner/evaluation/code_eval \
    || echo "WARNING: cannot cd to /home/ubuntu/RLVR/Absolute-Zero-Reasoner/evaluation/code_eval" >&2

  # Evaluate the same models as the math section. If its arrays are not
  # populated (e.g. the math section did not define them in a --coding-only
  # run), fall back to the same four models instead of evaluating nothing.
  if (( ${#MATH_MODELS[@]} > 0 )); then
    MODELS=("${MATH_MODELS[@]}")
    MODEL_NAMES=("${MATH_MODEL_NAMES[@]}")
  else
    MODELS=(
      "andrewzh/Absolute_Zero_Reasoner-Coder-7b"
      "andrewzh2/Absolute_Zero_Reasoner-Base-7b"
      "Qwen/Qwen2.5-Coder-7B"
      "Qwen/Qwen2.5-7B"
    )
    MODEL_NAMES=(
      "azr_coder_7b"
      "azr_base_7b"
      "qwen25_7b_coder"
      "qwen25_7b"
    )
  fi
  echo "πŸš€ μžλ™μœΌλ‘œ ${#MODELS[@]}개 λͺ¨λΈμ˜ μ½”λ”© 벀치마크 평가λ₯Ό μ‹œμž‘ν•©λ‹ˆλ‹€... (AZR λͺ¨λΈλΆ€ν„°)"
  echo " - HumanEval+ 및 MBPP+ 평가"
  echo " - LiveCodeBench 평가"
  echo ""

  # Create the results directory.
  mkdir -p "$LOG_DIR/coding_results"

  for i in "${!MODELS[@]}"; do
    MODEL="${MODELS[$i]}"
    NAME="${MODEL_NAMES[$i]}"
    echo "πŸ”„ [$((i+1))/${#MODELS[@]}] $NAME μ½”λ”© 평가 쀑..."
    echo " λͺ¨λΈ: $MODEL"

    # 1. HumanEval+ evaluation.
    echo " πŸ“Š HumanEval+ 평가 쀑..."
    _run_logged "$LOG_DIR/coding_results/${NAME}_humaneval.log" \
      bash scripts/run_evalplus.sh humaneval "$MODEL" 1 0.0 0.95 1 \
      || echo "⚠️ HumanEval+ 평가 μ‹€νŒ¨: $NAME"

    # 2. MBPP+ evaluation.
    echo " πŸ“Š MBPP+ 평가 쀑..."
    _run_logged "$LOG_DIR/coding_results/${NAME}_mbpp.log" \
      bash scripts/run_evalplus.sh mbpp "$MODEL" 1 0.0 0.95 1 \
      || echo "⚠️ MBPP+ 평가 μ‹€νŒ¨: $NAME"

    # 3. LiveCodeBench evaluation, launched from the repository root. The
    # subshell keeps that cd local, so the loop stays in code_eval.
    echo " πŸ“Š LiveCodeBench 평가 쀑..."
    (
      cd /home/ubuntu/RLVR/Absolute-Zero-Reasoner \
        && _run_logged "$LOG_DIR/coding_results/${NAME}_lcb.log" \
          bash evaluation/code_eval/scripts/run_lcb_gen.sh \
          --model "$MODEL" \
          --gpu "$GPU_ID" \
          --n 1 \
          --temperature 0.0 \
          --top_p 0.95 \
          --max_tokens 2048
    ) || echo "⚠️ LiveCodeBench 평가 μ‹€νŒ¨: $NAME"

    echo "βœ… $NAME μ½”λ”© 평가 μ™„λ£Œ"
    echo ""
  done
  echo "πŸŽ‰ λͺ¨λ“  μ½”λ”© 벀치마크 평가 μ™„λ£Œ!"
  echo "πŸ“ κ²°κ³Ό νŒŒμΌλ“€:"
  echo " - HumanEval+: evalplus_results/humaneval/"
  echo " - MBPP+: evalplus_results/mbpp/"
  echo " - LiveCodeBench: coding/LiveCodeBench/outputs/"
  echo " - 둜그: $LOG_DIR/coding_results/"
else
  echo "⏭️ μ½”λ”© 벀치마크 평가 κ±΄λ„ˆλ›°κΈ°"
fi
# 3. Result collection and analysis.
printf '%s\n' \
  "" \
  "===============================================" \
  "πŸ“ˆ κ²°κ³Ό μˆ˜μ§‘ 및 뢄석" \
  "==============================================="
cd /home/ubuntu/RLVR/Absolute-Zero-Reasoner/evaluation

# Collect the math results (only when the math benchmarks were run); the
# collector's output is shown and also saved to result_collection.log.
if [[ "$RUN_MATH" == true ]]; then
  echo "πŸ” μˆ˜ν•™ 평가 κ²°κ³Ό μˆ˜μ§‘ 쀑..."
  cd /home/ubuntu/RLVR/Absolute-Zero-Reasoner/evaluation/math_eval
  python3 collect_all_math_results.py 2>&1 \
    | tee "$LOG_DIR/result_collection.log"
fi
# 4. Summary report generation.

#######################################
# Print the Markdown summary report.
# Globals:   START_TIME (read; epoch seconds of the run start),
#            LOG_DIR (read)
# Outputs:   the report on stdout
# Notes:     the "evaluation start" line is derived from START_TIME rather
#            than the time the report is written; if START_TIME is not a
#            plain integer or cannot be converted, the current time is used.
#######################################
_render_report() {
  local started_at
  if [[ "${START_TIME:-}" =~ ^[0-9]+$ ]]; then
    # GNU date takes "-d @epoch", BSD date takes "-r epoch".
    started_at="$(date -d "@${START_TIME}" 2> /dev/null \
      || date -r "${START_TIME}" 2> /dev/null \
      || date)"
  else
    started_at="$(date)"
  fi
  cat << EOF
# AZR vs Base Models Performance Evaluation Report
## 평가 κ°œμš”
- **평가 μ‹œμž‘**: ${started_at}
- **평가 λͺ¨λΈ**: 4개 (AZR 2개 + Base 2개)
- **벀치마크**: Math (6개) + Coding (선택)
## ν‰κ°€λœ λͺ¨λΈλ“€
1. **Qwen/Qwen2.5-7B** (Base Model)
2. **andrewzh/Absolute_Zero_Reasoner-Coder-7b** (AZR Coder)
3. **andrewzh2/Absolute_Zero_Reasoner-Base-7b** (AZR Base)
4. **Qwen/Qwen2.5-Coder-7B** (Coder Base)
## μˆ˜ν•™ 벀치마크
- AIME 2024/2025
- AMC 2023
- Math500
- OlympiadBench
- Minerva Math
## 평가 μ„€μ •
**μˆ˜ν•™ 벀치마크:**
- Temperature: 0 (greedy decoding)
- Max tokens: 16000
- Seed: 42
- Benchmarks: AIME24/25, AMC23, Math500, OlympiadBench, Minerva Math
**μ½”λ”© 벀치마크:**
- Temperature: 0 (greedy decoding)
- Max tokens: 2048 (LiveCodeBench), default (EvalPlus)
- Datasets: HumanEval+, MBPP+, LiveCodeBench v5
## κ²°κ³Ό 파일 μœ„μΉ˜
**μˆ˜ν•™ 평가:**
- μˆ˜ν•™ 평가 둜그: $LOG_DIR/{model_name}_math_evaluation.log
- κ²°κ³Ό μˆ˜μ§‘ 둜그: $LOG_DIR/result_collection.log
- 상세 κ²°κ³Ό: evaluation/math_eval/eval/eval_results/
**μ½”λ”© 평가:**
- HumanEval+ κ²°κ³Ό: evaluation/code_eval/evalplus_results/humaneval/
- MBPP+ κ²°κ³Ό: evaluation/code_eval/evalplus_results/mbpp/
- LiveCodeBench κ²°κ³Ό: evaluation/code_eval/coding/LiveCodeBench/outputs/
- 둜그 파일: $LOG_DIR/coding_results/
## λ‹€μŒ 단계
1. WandB λŒ€μ‹œλ³΄λ“œμ—μ„œ μ‹€μ‹œκ°„ κ²°κ³Ό 확인
2. 둜컬 κ²°κ³Ό νŒŒμΌμ—μ„œ 상세 뢄석 (μˆ˜ν•™ + μ½”λ”©)
3. AZR vs Base λͺ¨λΈ μ’…ν•© μ„±λŠ₯ 비ꡐ
4. Cross-domain μ„±λŠ₯ ν–₯상 효과 뢄석
EOF
}

echo ""
echo "==============================================="
echo "πŸ“‹ μ’…ν•© 리포트 생성"
echo "==============================================="
REPORT_FILE="$LOG_DIR/evaluation_summary_$TIMESTAMP.md"
if _render_report > "$REPORT_FILE"; then
  echo "πŸ“„ μ’…ν•© 리포트 생성 μ™„λ£Œ: $REPORT_FILE"
else
  echo "WARNING: could not write summary report: $REPORT_FILE" >&2
fi
# Compute the elapsed run time from START_TIME (epoch seconds) and print the
# closing summary.
END_TIME=$(date +%s)
DURATION=$((END_TIME - START_TIME))
HOURS=$((DURATION / 3600))
MINUTES=$(((DURATION % 3600) / 60))
# Deliberately not named SECONDS: that is a Bash special variable. Assigning
# to it resets the shell's own timer, and the value keeps increasing
# afterwards, so the number printed below could drift.
REMAINING_SECONDS=$((DURATION % 60))
echo ""
echo "πŸŽ‰ 전체 평가 μ™„λ£Œ!"
echo "⏱️ 총 μ†Œμš” μ‹œκ°„: ${HOURS}μ‹œκ°„ ${MINUTES}λΆ„ ${REMAINING_SECONDS}초"
echo "πŸ“ λͺ¨λ“  λ‘œκ·ΈλŠ” $LOG_DIR 디렉토리에 μ €μž₯λ˜μ—ˆμŠ΅λ‹ˆλ‹€."
echo ""
echo "πŸ“Š κ²°κ³Ό 확인 방법:"
echo " 1. WandB: https://wandb.ai (ν”„λ‘œμ νŠΈ: verl_math_evaluate)"
echo " 2. 둜컬 파일: evaluation/math_eval/eval/eval_results/"
echo " 3. μš”μ•½ 리포트: $REPORT_FILE"