neural-mesh-v2 / test_evalplus_single.py
#!/usr/bin/env python3
"""
EvalPlus ๋ฐฉ์‹์œผ๋กœ ๋‹จ์ผ MBPP ๋ฌธ์ œ ํ…Œ์ŠคํŠธ
"""
import json
import sys
from pathlib import Path
# Add the script's own directory to sys.path so local packages import cleanly
sys.path.append(str(Path(__file__).parent))
from absolute_zero_reasoner.testtime.solution_generator import InitialSolutionGenerator
from absolute_zero_reasoner.testtime.config import TestTimeConfig
from absolute_zero_reasoner.testtime.logger import TestTimeLogger
def load_single_problem(task_id: str = "Mbpp/2"):
"""๋‹จ์ผ MBPP ๋ฌธ์ œ ๋กœ๋“œ"""
dataset_path = Path("/home/ubuntu/RLVR/TestTime-RLVR-v2/evaluation/code_eval/data/MbppPlus.jsonl")
with open(dataset_path, 'r') as f:
for line in f:
problem = json.loads(line.strip())
if problem['task_id'] == task_id:
return problem
raise ValueError(f"Problem {task_id} not found")
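
# Sketch of the assumed MbppPlus.jsonl record shape. Only 'task_id',
# 'entry_point', 'canonical_solution', 'base_input', and 'plus_input' are
# actually read by this script; the other keys are assumptions about the
# EvalPlus dump and may differ between dataset versions:
#
#   {"task_id": "Mbpp/2", "entry_point": "similar_elements",
#    "prompt": "...", "canonical_solution": "def similar_elements(...): ...",
#    "base_input": [[...], ...], "plus_input": [[...], ...]}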
def test_evalplus_evaluation():
"""EvalPlus ์Šคํƒ€์ผ ํ‰๊ฐ€ ํ…Œ์ŠคํŠธ"""
# ์„ค์ •
config = TestTimeConfig()
logger = TestTimeLogger()
    # Minimal mock model and tokenizer (no real inference happens in this test)
class MockModel:
def device(self):
return 'cpu'
class MockTokenizer:
eos_token = '</s>'
pad_token = '</s>'
    # Initialize the solution generator
generator = InitialSolutionGenerator(
model=MockModel(),
tokenizer=MockTokenizer(),
config=config,
logger=logger,
use_vllm=False
)
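    # use_vllm=False is assumed to route the generator down its plain
    # (non-vLLM) path, so the mocks above only need to survive attribute
    # lookups; no generate() call is exercised by this test.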
    # Load the problem
problem = load_single_problem("Mbpp/2")
print(f"Testing problem: {problem['task_id']}")
print(f"Entry point: {problem['entry_point']}")
print(f"Base inputs: {len(problem.get('base_input', []))}")
print(f"Plus inputs: {len(problem.get('plus_input', []))}")
    # First, test with the canonical (reference) solution
solution = problem['canonical_solution']
print("\nTesting with canonical solution:")
print(solution[:200] + "..." if len(solution) > 200 else solution)
    # Run the evaluation
result = generator.evaluate_solution(problem, solution)
    # Print results
print("\n=== Evaluation Result ===")
print(f"Correct: {result['correct']}")
print(f"Base passed: {result['base_passed']}/{result['base_total']}")
print(f"Plus passed: {result['plus_passed']}/{result['plus_total']}")
print(f"Total passed: {result['passed_tests']}/{result['total_tests']}")
if result['error']:
print(f"Error: {result['error']}")
    # Now test with a deliberately incorrect solution
    wrong_solution = """
def similar_elements(test_tup1, test_tup2):
    # Intentionally wrong: return the union instead of the intersection
    return tuple(set(test_tup1) | set(test_tup2))
"""
print("\n\nTesting with wrong solution:")
print(wrong_solution)
result2 = generator.evaluate_solution(problem, wrong_solution)
print("\n=== Evaluation Result (Wrong Solution) ===")
print(f"Correct: {result2['correct']}")
print(f"Base passed: {result2['base_passed']}/{result2['base_total']}")
print(f"Plus passed: {result2['plus_passed']}/{result2['plus_total']}")
print(f"Total passed: {result2['passed_tests']}/{result2['total_tests']}")
if result2['error']:
print(f"Error: {result2['error']}")
if __name__ == "__main__":
test_evalplus_evaluation()
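
# Optional cross-check, shown as a sketch only (assumes the `evalplus` package
# is installed; not invoked by this script): the same problem can be loaded
# through evalplus's own dataset loader instead of the local JSONL dump.
#
#   from evalplus.data import get_mbpp_plus
#   problems = get_mbpp_plus()        # dict keyed by task_id, e.g. "Mbpp/2"
#   problem = problems["Mbpp/2"]
#   print(problem["entry_point"])     # expected: "similar_elements"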