neural-mesh-v2 / evaluation /compare_azr_models.sh

Restore all essential files - code, configs, and MBPP/HumanEval data

24c2665 verified 9 days ago

2.67 kB

	#!/bin/bash
	#
	# Compare the AZR models against the Qwen2.5 base models on math benchmarks.
	# Usage: bash compare_azr_models.sh
	#
	# Invokes eval_math_nodes.sh once per model, sequentially, passing the same
	# sampling flags to every run so that only the model and template differ.
	# A failing run does not stop the remaining runs; it is reported on stderr
	# and makes the script exit with status 1 after all runs were attempted.
	#
	# Environment:
	#   AZR_MATH_EVAL_DIR  optional override for the directory that contains
	#                      eval_math_nodes.sh (defaults to the path below).

	# GPU selection: expose only GPU 6 to the child processes.
	export CUDA_VISIBLE_DEVICES=6
	echo "🎯 GPU 설정: GPU 6번만 사용 (CUDA_VISIBLE_DEVICES=6)"

	# Shared settings. These must be assigned before the banner below, which
	# prints them; assigning them afterwards makes the banner show empty values.
	readonly BENCHMARKS="aime24,aime25,amc23,math500,olympiadbench,minerva_math"
	readonly SEED=42
	readonly TEMPERATURE=0
	readonly MAX_TOKENS=16000
	readonly EVAL_DIR="${AZR_MATH_EVAL_DIR:-/home/ubuntu/RLVR/Absolute-Zero-Reasoner/evaluation/math_eval}"

	# Names of the runs whose evaluation command returned non-zero.
	failed_runs=()

	#######################################
	# Run eval_math_nodes.sh for one model with the shared settings.
	# Globals:   TEMPERATURE, MAX_TOKENS, BENCHMARKS, SEED (read),
	#            failed_runs (appended to on failure)
	# Arguments: $1 - run name passed as --run_name
	#            $2 - model identifier passed as --init_model
	#            $3 - template name passed as --template
	# Returns:   0 always; failures are recorded instead of aborting so the
	#            remaining models are still evaluated.
	#######################################
	run_eval() {
		local run_name=$1
		local model=$2
		local template=$3

		if ! bash eval_math_nodes.sh \
			--run_name "$run_name" \
			--init_model "$model" \
			--template "$template" \
			--tp_size 1 \
			--temperature "$TEMPERATURE" \
			--top_p 0.95 \
			--max_tokens "$MAX_TOKENS" \
			--benchmarks "$BENCHMARKS" \
			--n_sampling 1 \
			--just_wandb false \
			--seed "$SEED"; then
			printf 'warning: evaluation run failed: %s (%s)\n' "$run_name" "$model" >&2
			failed_runs+=("$run_name")
		fi
		return 0
	}

	echo "=== AZR Models vs Base Model Comparison Script ==="
	echo "이 스크립트는 다음 4개 모델을 비교합니다:"
	echo "1. Qwen/Qwen2.5-7B (Base Model)"
	echo "2. andrewzh/Absolute_Zero_Reasoner-Coder-7b (AZR Coder)"
	echo "3. andrewzh2/Absolute_Zero_Reasoner-Base-7b (AZR Base)"
	echo "4. Qwen/Qwen2.5-Coder-7B (Coder Base Model)"
	echo ""
	echo "📊 벤치마크: $BENCHMARKS"
	echo "🎯 설정: temperature=$TEMPERATURE, max_tokens=$MAX_TOKENS, seed=$SEED"
	echo ""

	# eval_math_nodes.sh is invoked by relative name, so a failed cd would run
	# nothing useful (or a different script); stop instead of continuing.
	cd "$EVAL_DIR" || {
		printf 'error: cannot change directory to %s\n' "$EVAL_DIR" >&2
		exit 1
	}

	echo "=== 1. Base Model (Qwen2.5-7B) 평가 ==="
	run_eval qwen25_7b_base "Qwen/Qwen2.5-7B" qwen25

	echo ""
	echo "=== 2. AZR Coder 7B 평가 ==="
	run_eval azr_coder_7b_hf "andrewzh/Absolute_Zero_Reasoner-Coder-7b" azr

	echo ""
	echo "=== 3. AZR Base 7B 평가 ==="
	run_eval azr_base_7b_hf "andrewzh2/Absolute_Zero_Reasoner-Base-7b" azr

	echo ""
	echo "=== 4. Qwen2.5-7B-Coder 평가 ==="
	# NOTE(review): the model id was "Qwen/Qwen2.5-7B-Coder"; the Hugging Face
	# repository is believed to be "Qwen/Qwen2.5-Coder-7B" - confirm on the Hub.
	run_eval qwen25_7b_coder "Qwen/Qwen2.5-Coder-7B" qwen25

	echo ""
	echo "=== 평가 완료 ==="
	echo "결과 확인 방법:"
	echo "1. wandb 대시보드에서 각 실행 결과 확인"
	echo "2. 로컬 결과 파일: evaluation/math_eval/eval/eval_results/"
	echo "3. 비교 분석을 위해 compare_results.py 실행"

	# Surface failures in the exit status so wrappers do not mistake a partial
	# comparison for a complete one.
	if (( ${#failed_runs[@]} > 0 )); then
		printf 'error: %d run(s) failed: %s\n' "${#failed_runs[@]}" "${failed_runs[*]}" >&2
		exit 1
	fi