#!/usr/bin/env python3
"""
Direct EvalPlus usage test.
"""
import json
import sys
from pathlib import Path
# Add paths
sys.path.append(str(Path(__file__).parent))
sys.path.insert(0, "/home/ubuntu/RLVR/TestTime-RLVR-v2/evaluation/code_eval/coding")
# Debug print
print(f"Python path: {sys.path[:3]}")
try:
    import evalplus
    print("EvalPlus imported successfully!")
except ImportError as e:
    print(f"Failed to import evalplus: {e}")
from absolute_zero_reasoner.testtime.solution_generator import InitialSolutionGenerator
from absolute_zero_reasoner.testtime.config import TestTimeConfig
from absolute_zero_reasoner.testtime.logger import TestTimeLogger
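
# Note: the loader below assumes each MbppPlus.jsonl record carries at least
# task_id, entry_point, and canonical_solution; base_input / plus_input are read
# with .get() further down and may be absent. Other EvalPlus fields are ignored.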
def load_single_problem(task_id: str = "Mbpp/2"):
    """Load a single MBPP problem from the local MbppPlus.jsonl dump."""
    dataset_path = Path("/home/ubuntu/RLVR/TestTime-RLVR-v2/evaluation/code_eval/data/MbppPlus.jsonl")
    with open(dataset_path, 'r') as f:
        for line in f:
            problem = json.loads(line.strip())
            if problem['task_id'] == task_id:
                return problem
    raise ValueError(f"Problem {task_id} not found")
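
# Alternative (sketch, not used here): the evalplus package ships its own dataset
# loader, which should return the same problems keyed by task_id. Hedged - the
# exact API may differ across evalplus versions:
#
#   from evalplus.data import get_mbpp_plus
#   problem = get_mbpp_plus()["Mbpp/2"]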
def test_evalplus_direct():
    """Test using EvalPlus directly."""
    # Configuration
    config = TestTimeConfig()
    logger = TestTimeLogger()

    # Simple mock model and tokenizer: only evaluate_solution() is exercised below,
    # so no real model weights are needed.
    class MockModel:
        def device(self):
            return 'cpu'

    class MockTokenizer:
        eos_token = '</s>'
        pad_token = '</s>'

    # Initialize the solution generator
    generator = InitialSolutionGenerator(
        model=MockModel(),
        tokenizer=MockTokenizer(),
        config=config,
        logger=logger,
        use_vllm=False
    )
    # Load the problem
    problem = load_single_problem("Mbpp/2")
    print(f"Testing problem: {problem['task_id']}")
    print(f"Entry point: {problem['entry_point']}")
    print(f"Base inputs: {len(problem.get('base_input', []))}")
    print(f"Plus inputs: {len(problem.get('plus_input', []))}")

    # Test with the canonical (reference) solution
    solution = problem['canonical_solution']
    print("\nTesting with canonical solution:")
    print(solution)

    # Run the evaluation
    result = generator.evaluate_solution(problem, solution)
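
    # In EvalPlus, "base" refers to the original MBPP test cases and "plus" to the
    # extra EvalPlus-generated test inputs; the counts printed below follow that split.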
    # Print the results
    print("\n=== Evaluation Result (Canonical) ===")
    print(f"Correct: {result['correct']}")
    print(f"Base passed: {result['base_passed']}/{result['base_total']}")
    print(f"Plus passed: {result['plus_passed']}/{result['plus_total']}")
    print(f"Total passed: {result['passed_tests']}/{result['total_tests']}")
    if result['error']:
        print(f"Error: {result['error']}")

if __name__ == "__main__":
    test_evalplus_direct()
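
# Usage (the dataset and sys.path locations above are machine-specific):
#   python test_evalplus_direct.py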