#!/usr/bin/env python3
"""
TTRLVR + AZR ํ†ตํ•ฉ ๊ฒ€์ฆ ์Šค์œ„ํŠธ
์ „์ฒด ์‹œ์Šคํ…œ์˜ ๊ฒ€์ฆ์„ ์œ„ํ•œ ํ†ตํ•ฉ ์Šคํฌ๋ฆฝํŠธ:
1. ํ™˜๊ฒฝ ๊ฒ€์ฆ
2. ๋‹จ์œ„ ํ…Œ์ŠคํŠธ
3. ๋ฏธ๋‹ˆ ํ†ตํ•ฉ ํ…Œ์ŠคํŠธ (1๋ผ์šด๋“œ ์‹คํ–‰)
4. ์„ฑ๋Šฅ ๋ฒค์น˜๋งˆํฌ
5. ์ตœ์ข… ๊ฒ€์ฆ ๋ณด๊ณ ์„œ ์ƒ์„ฑ
"""
import os
import sys
import json
import subprocess
import time
from datetime import datetime
from pathlib import Path
# Path setup: make the TestTime-RLVR-v2 package importable
sys.path.append('/home/ubuntu/RLVR/TestTime-RLVR-v2')
def run_command(command, description, timeout=300):
"""๋ช…๋ น์–ด ์‹คํ–‰ ๋ฐ ๊ฒฐ๊ณผ ๋ฐ˜ํ™˜"""
print(f"๐Ÿ”„ {description}")
print(f" Command: {command}")
start_time = time.time()
try:
result = subprocess.run(
command,
shell=True,
capture_output=True,
text=True,
timeout=timeout,
cwd='/home/ubuntu/RLVR/TestTime-RLVR-v2'
)
duration = time.time() - start_time
if result.returncode == 0:
print(f"โœ… {description} completed ({duration:.1f}s)")
return True, result.stdout, result.stderr
else:
print(f"โŒ {description} failed ({duration:.1f}s)")
print(f" Error: {result.stderr}")
return False, result.stdout, result.stderr
except subprocess.TimeoutExpired:
print(f"โฐ {description} timed out after {timeout}s")
return False, "", "Timeout"
except Exception as e:
print(f"๐Ÿ’ฅ {description} crashed: {e}")
return False, "", str(e)
def run_environment_validation():
"""ํ™˜๊ฒฝ ๊ฒ€์ฆ ์‹คํ–‰"""
print("\n" + "="*60)
print("1๏ธโƒฃ ํ™˜๊ฒฝ ๊ฒ€์ฆ")
print("="*60)
success, _, _ = run_command(
"cd /home/ubuntu/RLVR/TestTime-RLVR-v2/test && python validate_environment.py",
"Environment validation"
)
return success
def run_unit_tests():
"""๋‹จ์œ„ ํ…Œ์ŠคํŠธ ์‹คํ–‰"""
print("\n" + "="*60)
print("2๏ธโƒฃ ๋‹จ์œ„ ํ…Œ์ŠคํŠธ")
print("="*60)
success, _, _ = run_command(
"cd /home/ubuntu/RLVR/TestTime-RLVR-v2/test && python test_ttrlvr_azr_integration.py",
"Unit tests"
)
return success
def run_mini_integration_test():
"""๋ฏธ๋‹ˆ ํ†ตํ•ฉ ํ…Œ์ŠคํŠธ (1๋ฌธ์ œ, 2๋ผ์šด๋“œ)"""
print("\n" + "="*60)
print("3๏ธโƒฃ ๋ฏธ๋‹ˆ ํ†ตํ•ฉ ํ…Œ์ŠคํŠธ")
print("="*60)
    # Run a short end-to-end integration test
success, stdout, stderr = run_command(
"cd /home/ubuntu/RLVR/TestTime-RLVR-v2/test && python train_ttrlvr_azr.py --benchmark mbpp --problems 1 --rounds 2 --debug",
"Mini integration test (1 problem, 2 rounds)",
        timeout=1800  # 30 minutes
)
if success:
print("โœ… Mini integration test completed successfully")
# ๊ฒฐ๊ณผ ํŒŒ์ผ ํ™•์ธ
results_dir = Path("/home/ubuntu/RLVR/TestTime-RLVR-v2/test/results/ttrlvr_azr")
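        # Pick the most recently created entry (by ctime) and summarize its
        # training_results.json, if one was written.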
if results_dir.exists():
latest_result = max(results_dir.glob("*"), key=os.path.getctime, default=None)
if latest_result:
print(f"๐Ÿ“ Results saved to: {latest_result}")
# ๊ฒฐ๊ณผ ํŒŒ์ผ ๋ถ„์„
result_file = latest_result / "training_results.json"
if result_file.exists():
with open(result_file, 'r') as f:
results = json.load(f)
print(f"๐Ÿ“Š Test summary:")
print(f" - Success: {results.get('success', False)}")
print(f" - Completed rounds: {len(results.get('rounds', {}))}")
print(f" - Final model: {results.get('final_model', 'N/A')}")
return success
def check_disk_space():
"""๋””์Šคํฌ ๊ณต๊ฐ„ ํ™•์ธ"""
print("\n" + "="*60)
print("4๏ธโƒฃ ๋””์Šคํฌ ๊ณต๊ฐ„ ํ™•์ธ")
print("="*60)
    # Check disk usage for the important directories
paths_to_check = [
"/home/ubuntu/RLVR",
"/data",
"/tmp"
]
all_good = True
for path in paths_to_check:
if os.path.exists(path):
success, stdout, _ = run_command(f"df -h {path}", f"Disk usage for {path}")
if success:
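                # `df -h PATH` prints a header row plus one data row; the
                # fifth field of the data row is the Use% column (e.g. "42%").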
lines = stdout.strip().split('\n')
if len(lines) > 1:
fields = lines[1].split()
if len(fields) >= 5:
used_percent = fields[4].rstrip('%')
if used_percent.isdigit() and int(used_percent) > 90:
print(f"โš ๏ธ Warning: {path} is {used_percent}% full")
all_good = False
else:
print(f"โœ… {path}: {used_percent}% used")
else:
print(f"โš ๏ธ Path not found: {path}")
return all_good
def run_performance_benchmark():
"""์„ฑ๋Šฅ ๋ฒค์น˜๋งˆํฌ"""
print("\n" + "="*60)
print("5๏ธโƒฃ ์„ฑ๋Šฅ ๋ฒค์น˜๋งˆํฌ")
print("="*60)
    # Check GPU memory usage
    print("🖥️ GPU memory status:")
gpu_success, gpu_output, _ = run_command("nvidia-smi --query-gpu=memory.used,memory.total --format=csv,noheader,nounits", "GPU memory check")
if gpu_success:
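        # With --format=csv,noheader,nounits, nvidia-smi prints one line per
        # GPU of the form "used, total", both in MiB.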
for i, line in enumerate(gpu_output.strip().split('\n')):
if line.strip():
                try:
                    used, total = map(int, line.split(', '))
                    usage_percent = (used / total) * 100
                    print(f" GPU {i}: {used}MB / {total}MB ({usage_percent:.1f}%)")
                except ValueError:
                    # Line did not match the expected "used, total" format
                    print(f" GPU {i}: {line}")
    # Check system memory
    print("\n💾 System memory status:")
mem_success, mem_output, _ = run_command("free -h", "System memory check")
if mem_success:
        for line in mem_output.split('\n')[:2]:  # first two lines only
print(f" {line}")
    # Check CPU usage
    print("\n🖥️ CPU status:")
cpu_success, cpu_output, _ = run_command("top -bn1 | grep 'Cpu(s)' | head -1", "CPU usage check")
if cpu_success:
print(f" {cpu_output.strip()}")
return gpu_success and mem_success
def generate_validation_report(results):
"""๊ฒ€์ฆ ๋ณด๊ณ ์„œ ์ƒ์„ฑ"""
print("\n" + "="*60)
print("6๏ธโƒฃ ๊ฒ€์ฆ ๋ณด๊ณ ์„œ ์ƒ์„ฑ")
print("="*60)
timestamp = datetime.now().strftime('%Y%m%d_%H%M%S')
report_file = f"/tmp/ttrlvr_azr_validation_report_{timestamp}.json"
    # Report data
report = {
'timestamp': datetime.now().isoformat(),
'validation_results': results,
'summary': {
'total_tests': len(results),
'passed_tests': sum(1 for result in results.values() if result['success']),
'overall_success': all(result['success'] for result in results.values())
},
'recommendations': []
}
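    # NOTE: 'recommendations' is never populated here, so the HTML
    # recommendations section below is effectively skipped.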
    # Generate the HTML report
html_report = f"/tmp/ttrlvr_azr_validation_report_{timestamp}.html"
html_content = f"""
<!DOCTYPE html>
<html>
<head>
<title>TTRLVR + AZR Validation Report</title>
<style>
body {{ font-family: Arial, sans-serif; margin: 40px; }}
.header {{ background-color: #f0f0f0; padding: 20px; border-radius: 5px; }}
.success {{ color: green; }}
.failure {{ color: red; }}
.test-section {{ margin: 20px 0; padding: 15px; border: 1px solid #ddd; border-radius: 5px; }}
.recommendations {{ background-color: #fff3cd; padding: 15px; border-radius: 5px; }}
</style>
</head>
<body>
<div class="header">
<h1>TTRLVR + AZR Integration Validation Report</h1>
<p>Generated: {datetime.now().strftime('%Y-%m-%d %H:%M:%S')}</p>
<p>Overall Status: <span class="{'success' if report['summary']['overall_success'] else 'failure'}">
        {'✅ ALL TESTS PASSED' if report['summary']['overall_success'] else '❌ SOME TESTS FAILED'}
</span></p>
<p>Tests: {report['summary']['passed_tests']}/{report['summary']['total_tests']} passed</p>
</div>
<h2>Test Results</h2>
"""
for test_name, result in results.items():
status = "success" if result['success'] else "failure"
icon = "โœ…" if result['success'] else "โŒ"
html_content += f"""
<div class="test-section">
<h3 class="{status}">{icon} {test_name}</h3>
<p><strong>Duration:</strong> {result.get('duration', 'N/A')}</p>
<p><strong>Details:</strong> {result.get('details', 'No details available')}</p>
</div>
"""
if report['recommendations']:
html_content += """
<div class="recommendations">
<h2>Recommendations</h2>
<ul>
"""
for rec in report['recommendations']:
html_content += f"<li>{rec}</li>"
html_content += """
</ul>
</div>
"""
html_content += """
</body>
</html>
"""
    # Save the report files
with open(report_file, 'w') as f:
json.dump(report, f, indent=2)
with open(html_report, 'w') as f:
f.write(html_content)
print(f"๐Ÿ“„ JSON ๋ณด๊ณ ์„œ: {report_file}")
print(f"๐ŸŒ HTML ๋ณด๊ณ ์„œ: {html_report}")
return report
def main():
"""๋ฉ”์ธ ์‹คํ–‰ ํ•จ์ˆ˜"""
print("๐Ÿงช TTRLVR + AZR ํ†ตํ•ฉ ๊ฒ€์ฆ ์Šค์œ„ํŠธ ์‹œ์ž‘")
print("=" * 60)
print(f"์‹œ์ž‘ ์‹œ๊ฐ„: {datetime.now().strftime('%Y-%m-%d %H:%M:%S')}")
print("=" * 60)
    # Collect validation results
results = {}
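    # Each entry maps a test name to {'success': bool, 'duration': str, 'details': str}.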
start_time = time.time()
    # 1. Environment validation
test_start = time.time()
success = run_environment_validation()
results['environment_validation'] = {
'success': success,
'duration': f"{time.time() - test_start:.1f}s",
'details': 'Environment setup and dependencies check'
}
    # 2. Unit tests (only if environment validation succeeded)
if success:
test_start = time.time()
success = run_unit_tests()
results['unit_tests'] = {
'success': success,
'duration': f"{time.time() - test_start:.1f}s",
'details': 'Component unit tests and integration tests'
}
else:
results['unit_tests'] = {
'success': False,
'duration': '0s',
'details': 'Skipped due to environment validation failure'
}
    # 3. Mini integration test (only if the previous tests succeeded)
if results['unit_tests']['success']:
test_start = time.time()
success = run_mini_integration_test()
results['mini_integration_test'] = {
'success': success,
'duration': f"{time.time() - test_start:.1f}s",
'details': 'End-to-end pipeline test with 1 problem, 2 rounds'
}
else:
results['mini_integration_test'] = {
'success': False,
'duration': '0s',
'details': 'Skipped due to previous test failures'
}
    # 4. Disk space check (always runs)
test_start = time.time()
success = check_disk_space()
results['disk_space_check'] = {
'success': success,
'duration': f"{time.time() - test_start:.1f}s",
'details': 'Available disk space in critical directories'
}
    # 5. Performance benchmark (always runs)
test_start = time.time()
success = run_performance_benchmark()
results['performance_benchmark'] = {
'success': success,
'duration': f"{time.time() - test_start:.1f}s",
'details': 'System resource usage and performance metrics'
}
    # 6. Report generation
total_duration = time.time() - start_time
print(f"\nโฑ๏ธ ์ด ์‹คํ–‰ ์‹œ๊ฐ„: {total_duration:.1f}์ดˆ ({total_duration/60:.1f}๋ถ„)")
report = generate_validation_report(results)
    # Final results
print("\n" + "="*60)
print("๐Ÿ ๊ฒ€์ฆ ์Šค์œ„ํŠธ ์™„๋ฃŒ")
print("="*60)
passed = sum(1 for result in results.values() if result['success'])
total = len(results)
print(f"๐Ÿ“Š ์ตœ์ข… ๊ฒฐ๊ณผ: {passed}/{total} ํ…Œ์ŠคํŠธ ํ†ต๊ณผ")
if report['summary']['overall_success']:
print("๐ŸŽ‰ ๋ชจ๋“  ๊ฒ€์ฆ ํ†ต๊ณผ! TTRLVR + AZR ์‹œ์Šคํ…œ ์‹คํ–‰ ์ค€๋น„ ์™„๋ฃŒ")
return 0
else:
print("โš ๏ธ ์ผ๋ถ€ ๊ฒ€์ฆ ์‹คํŒจ. ์œ„์˜ ๊ฒฐ๊ณผ๋ฅผ ํ™•์ธํ•˜๊ณ  ๋ฌธ์ œ๋ฅผ ํ•ด๊ฒฐํ•˜์„ธ์š”.")
return 1
if __name__ == '__main__':
exit_code = main()
sys.exit(exit_code)