#!/usr/bin/env python3
"""
Direct EvalPlus usage test.
"""
import json
import sys
from pathlib import Path
# Add paths
sys.path.append(str(Path(__file__).parent))
sys.path.insert(0, "/home/ubuntu/RLVR/TestTime-RLVR-v2/evaluation/code_eval/coding")
# Debug print
print(f"Python path: {sys.path[:3]}")
try:
    import evalplus
    print("EvalPlus imported successfully!")
except ImportError as e:
    print(f"Failed to import evalplus: {e}")
from absolute_zero_reasoner.testtime.solution_generator import InitialSolutionGenerator
from absolute_zero_reasoner.testtime.config import TestTimeConfig
from absolute_zero_reasoner.testtime.logger import TestTimeLogger
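
# Note: the loader below assumes each MbppPlus.jsonl record carries at least
# task_id, entry_point, and canonical_solution; base_input / plus_input are read
# with .get() further down and may be absent. Other EvalPlus fields are ignored.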
def load_single_problem(task_id: str = "Mbpp/2"):
    """Load a single MBPP problem from the local MbppPlus.jsonl dump."""
    dataset_path = Path("/home/ubuntu/RLVR/TestTime-RLVR-v2/evaluation/code_eval/data/MbppPlus.jsonl")
    with open(dataset_path, 'r') as f:
        for line in f:
            problem = json.loads(line.strip())
            if problem['task_id'] == task_id:
                return problem
    raise ValueError(f"Problem {task_id} not found")
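
# Alternative (sketch, not used here): the evalplus package ships its own dataset
# loader, which should return the same problems keyed by task_id. Hedged - the
# exact API may differ across evalplus versions:
#
#   from evalplus.data import get_mbpp_plus
#   problem = get_mbpp_plus()["Mbpp/2"]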
def test_evalplus_direct():
    """Test using EvalPlus directly."""
    # Configuration
    config = TestTimeConfig()
    logger = TestTimeLogger()

    # Simple mock model and tokenizer: only evaluate_solution() is exercised below,
    # so no real model weights are needed.
    class MockModel:
        def device(self):
            return 'cpu'

    class MockTokenizer:
        eos_token = '</s>'
        pad_token = '</s>'

    # Initialize the solution generator
    generator = InitialSolutionGenerator(
        model=MockModel(),
        tokenizer=MockTokenizer(),
        config=config,
        logger=logger,
        use_vllm=False
    )
    # Load the problem
    problem = load_single_problem("Mbpp/2")
    print(f"Testing problem: {problem['task_id']}")
    print(f"Entry point: {problem['entry_point']}")
    print(f"Base inputs: {len(problem.get('base_input', []))}")
    print(f"Plus inputs: {len(problem.get('plus_input', []))}")

    # Test with the canonical (reference) solution
    solution = problem['canonical_solution']
    print("\nTesting with canonical solution:")
    print(solution)

    # Run the evaluation
    result = generator.evaluate_solution(problem, solution)
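
    # In EvalPlus, "base" refers to the original MBPP test cases and "plus" to the
    # extra EvalPlus-generated test inputs; the counts printed below follow that split.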
    # Print the results
    print("\n=== Evaluation Result (Canonical) ===")
    print(f"Correct: {result['correct']}")
    print(f"Base passed: {result['base_passed']}/{result['base_total']}")
    print(f"Plus passed: {result['plus_passed']}/{result['plus_total']}")
    print(f"Total passed: {result['passed_tests']}/{result['total_tests']}")
    if result['error']:
        print(f"Error: {result['error']}")

if __name__ == "__main__":
    test_evalplus_direct()
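
# Usage (the dataset and sys.path locations above are machine-specific):
#   python test_evalplus_direct.py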