neural-mesh-v2 / evaluation /compare_models.sh

Restore all essential files - code, configs, and MBPP/HumanEval data

24c2665 verified 17 days ago

1.38 kB

	#!/bin/bash
	#
	# Runs eval_math_nodes.sh once per model so that three 7B models
	# (Qwen2.5-7B base, AZR Coder 7B, AZR Base 7B) are evaluated with the
	# same options and the same benchmark list.
	#
	# Environment:
	#   AZR_MATH_EVAL_DIR  Directory that contains eval_math_nodes.sh.
	#                      Defaults to the path used by the original setup.
	#
	# Exit status:
	#   0  every evaluation command returned success
	#   1  the evaluation directory is unreachable, or at least one
	#      evaluation command returned a non-zero status

	eval_dir="${AZR_MATH_EVAL_DIR:-/home/ubuntu/RLVR/Absolute-Zero-Reasoner/evaluation/math_eval}"

	# Stop right away when the directory is missing; otherwise
	# eval_math_nodes.sh would be looked up relative to the caller's cwd.
	cd "$eval_dir" || {
		printf 'error: cannot cd to %s\n' "$eval_dir" >&2
		exit 1
	}

	# Options passed unchanged to every run, kept in one place so the
	# three evaluations cannot drift apart.
	common_args=(
		--tp_size 1
		--temperature 0
		--top_p 0.95
		--max_tokens 16000
		--benchmarks aime24,aime25,amc23,math500,olympiadbench,minerva_math
		--n_sampling 1
		--just_wandb false
		--seed 42
	)

	# Run names whose evaluation command returned a non-zero status.
	failed_runs=()

	#######################################
	# Evaluate one model with the shared options.
	# Globals:   common_args (read), failed_runs (appended on failure)
	# Arguments: $1 - value for --run_name
	#            $2 - value for --init_model
	#            $3 - value for --template
	# Returns:   0 always; failures are recorded in failed_runs so the
	#            remaining models are still evaluated.
	#######################################
	run_eval() {
		local run_name=$1
		local init_model=$2
		local template=$3

		if ! bash eval_math_nodes.sh \
			--run_name "$run_name" \
			--init_model "$init_model" \
			--template "$template" \
			"${common_args[@]}"; then
			printf 'error: evaluation failed for run %s\n' "$run_name" >&2
			failed_runs+=("$run_name")
		fi
		return 0
	}

	# Evaluate the base model (Qwen2.5-7B)
	echo "=== Evaluating Base Model: Qwen2.5-7B ==="
	run_eval qwen25_7b_base "Qwen/Qwen2.5-7B" qwen25

	# Evaluate AZR Coder 7B
	echo "=== Evaluating AZR Coder 7B ==="
	run_eval azr_coder_7b_hf "andrewzh/Absolute_Zero_Reasoner-Coder-7b" azr

	# Evaluate AZR Base 7B
	echo "=== Evaluating AZR Base 7B ==="
	run_eval azr_base_7b_hf "andrewzh2/Absolute_Zero_Reasoner-Base-7b" azr

	# Only claim success when every run actually succeeded.
	if (( ${#failed_runs[@]} > 0 )); then
		printf '=== Evaluations failed: %s ===\n' "${failed_runs[*]}" >&2
		exit 1
	fi

	echo "=== All evaluations completed! ==="