#!/usr/bin/env python3
"""
TTRLVR + AZR ํ†ตํ•ฉ ๊ฒ€์ฆ ์Šค์œ„ํŠธ
์ „์ฒด ์‹œ์Šคํ…œ์˜ ๊ฒ€์ฆ์„ ์œ„ํ•œ ํ†ตํ•ฉ ์Šคํฌ๋ฆฝํŠธ:
1. ํ™˜๊ฒฝ ๊ฒ€์ฆ
2. ๋‹จ์œ„ ํ…Œ์ŠคํŠธ
3. ๋ฏธ๋‹ˆ ํ†ตํ•ฉ ํ…Œ์ŠคํŠธ (1๋ผ์šด๋“œ ์‹คํ–‰)
4. ์„ฑ๋Šฅ ๋ฒค์น˜๋งˆํฌ
5. ์ตœ์ข… ๊ฒ€์ฆ ๋ณด๊ณ ์„œ ์ƒ์„ฑ
"""
import os
import sys
import json
import subprocess
import time
from datetime import datetime
from pathlib import Path
# Path setup: make the TestTime-RLVR-v2 package importable
sys.path.append('/home/ubuntu/RLVR/TestTime-RLVR-v2')
def run_command(command, description, timeout=300):
"""๋ช…๋ น์–ด ์‹คํ–‰ ๋ฐ ๊ฒฐ๊ณผ ๋ฐ˜ํ™˜"""
print(f"๐Ÿ”„ {description}")
print(f" Command: {command}")
start_time = time.time()
try:
result = subprocess.run(
command,
shell=True,
capture_output=True,
text=True,
timeout=timeout,
cwd='/home/ubuntu/RLVR/TestTime-RLVR-v2'
)
duration = time.time() - start_time
if result.returncode == 0:
print(f"โœ… {description} completed ({duration:.1f}s)")
return True, result.stdout, result.stderr
else:
print(f"โŒ {description} failed ({duration:.1f}s)")
print(f" Error: {result.stderr}")
return False, result.stdout, result.stderr
except subprocess.TimeoutExpired:
print(f"โฐ {description} timed out after {timeout}s")
return False, "", "Timeout"
except Exception as e:
print(f"๐Ÿ’ฅ {description} crashed: {e}")
return False, "", str(e)
def run_environment_validation():
"""ํ™˜๊ฒฝ ๊ฒ€์ฆ ์‹คํ–‰"""
print("\n" + "="*60)
print("1๏ธโƒฃ ํ™˜๊ฒฝ ๊ฒ€์ฆ")
print("="*60)
success, _, _ = run_command(
"cd /home/ubuntu/RLVR/TestTime-RLVR-v2/test && python validate_environment.py",
"Environment validation"
)
return success
def run_unit_tests():
"""๋‹จ์œ„ ํ…Œ์ŠคํŠธ ์‹คํ–‰"""
print("\n" + "="*60)
print("2๏ธโƒฃ ๋‹จ์œ„ ํ…Œ์ŠคํŠธ")
print("="*60)
success, _, _ = run_command(
"cd /home/ubuntu/RLVR/TestTime-RLVR-v2/test && python test_ttrlvr_azr_integration.py",
"Unit tests"
)
return success
def run_mini_integration_test():
"""๋ฏธ๋‹ˆ ํ†ตํ•ฉ ํ…Œ์ŠคํŠธ (1๋ฌธ์ œ, 2๋ผ์šด๋“œ)"""
print("\n" + "="*60)
print("3๏ธโƒฃ ๋ฏธ๋‹ˆ ํ†ตํ•ฉ ํ…Œ์ŠคํŠธ")
print("="*60)
    # Run a short end-to-end integration test
success, stdout, stderr = run_command(
"cd /home/ubuntu/RLVR/TestTime-RLVR-v2/test && python train_ttrlvr_azr.py --benchmark mbpp --problems 1 --rounds 2 --debug",
"Mini integration test (1 problem, 2 rounds)",
        timeout=1800  # 30 minutes
)
if success:
print("โœ… Mini integration test completed successfully")
# ๊ฒฐ๊ณผ ํŒŒ์ผ ํ™•์ธ
results_dir = Path("/home/ubuntu/RLVR/TestTime-RLVR-v2/test/results/ttrlvr_azr")
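        # Pick the most recently created entry (by ctime) and summarize its
        # training_results.json, if one was written.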
if results_dir.exists():
latest_result = max(results_dir.glob("*"), key=os.path.getctime, default=None)
if latest_result:
print(f"๐Ÿ“ Results saved to: {latest_result}")
# ๊ฒฐ๊ณผ ํŒŒ์ผ ๋ถ„์„
result_file = latest_result / "training_results.json"
if result_file.exists():
with open(result_file, 'r') as f:
results = json.load(f)
print(f"๐Ÿ“Š Test summary:")
print(f" - Success: {results.get('success', False)}")
print(f" - Completed rounds: {len(results.get('rounds', {}))}")
print(f" - Final model: {results.get('final_model', 'N/A')}")
return success
def check_disk_space():
"""๋””์Šคํฌ ๊ณต๊ฐ„ ํ™•์ธ"""
print("\n" + "="*60)
print("4๏ธโƒฃ ๋””์Šคํฌ ๊ณต๊ฐ„ ํ™•์ธ")
print("="*60)
    # Check disk usage for the important directories
paths_to_check = [
"/home/ubuntu/RLVR",
"/data",
"/tmp"
]
all_good = True
for path in paths_to_check:
if os.path.exists(path):
success, stdout, _ = run_command(f"df -h {path}", f"Disk usage for {path}")
if success:
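                # `df -h PATH` prints a header row plus one data row; the
                # fifth field of the data row is the Use% column (e.g. "42%").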
lines = stdout.strip().split('\n')
if len(lines) > 1:
fields = lines[1].split()
if len(fields) >= 5:
used_percent = fields[4].rstrip('%')
if used_percent.isdigit() and int(used_percent) > 90:
print(f"โš ๏ธ Warning: {path} is {used_percent}% full")
all_good = False
else:
print(f"โœ… {path}: {used_percent}% used")
else:
print(f"โš ๏ธ Path not found: {path}")
return all_good
def run_performance_benchmark():
"""์„ฑ๋Šฅ ๋ฒค์น˜๋งˆํฌ"""
print("\n" + "="*60)
print("5๏ธโƒฃ ์„ฑ๋Šฅ ๋ฒค์น˜๋งˆํฌ")
print("="*60)
    # Check GPU memory usage
    print("🖥️ GPU memory status:")
gpu_success, gpu_output, _ = run_command("nvidia-smi --query-gpu=memory.used,memory.total --format=csv,noheader,nounits", "GPU memory check")
if gpu_success:
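        # With --format=csv,noheader,nounits, nvidia-smi prints one line per
        # GPU of the form "used, total", both in MiB.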
for i, line in enumerate(gpu_output.strip().split('\n')):
if line.strip():
                try:
                    used, total = map(int, line.split(', '))
                    usage_percent = (used / total) * 100
                    print(f" GPU {i}: {used}MB / {total}MB ({usage_percent:.1f}%)")
                except ValueError:
                    # Line did not match the expected "used, total" format
                    print(f" GPU {i}: {line}")
    # Check system memory
    print("\n💾 System memory status:")
mem_success, mem_output, _ = run_command("free -h", "System memory check")
if mem_success:
        for line in mem_output.split('\n')[:2]:  # first two lines only
print(f" {line}")
    # Check CPU usage
    print("\n🖥️ CPU status:")
cpu_success, cpu_output, _ = run_command("top -bn1 | grep 'Cpu(s)' | head -1", "CPU usage check")
if cpu_success:
print(f" {cpu_output.strip()}")
return gpu_success and mem_success
def generate_validation_report(results):
"""๊ฒ€์ฆ ๋ณด๊ณ ์„œ ์ƒ์„ฑ"""
print("\n" + "="*60)
print("6๏ธโƒฃ ๊ฒ€์ฆ ๋ณด๊ณ ์„œ ์ƒ์„ฑ")
print("="*60)
timestamp = datetime.now().strftime('%Y%m%d_%H%M%S')
report_file = f"/tmp/ttrlvr_azr_validation_report_{timestamp}.json"
    # Report data
report = {
'timestamp': datetime.now().isoformat(),
'validation_results': results,
'summary': {
'total_tests': len(results),
'passed_tests': sum(1 for result in results.values() if result['success']),
'overall_success': all(result['success'] for result in results.values())
},
'recommendations': []
}
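    # NOTE: 'recommendations' is never populated here, so the HTML
    # recommendations section below is effectively skipped.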
    # Generate the HTML report
html_report = f"/tmp/ttrlvr_azr_validation_report_{timestamp}.html"
html_content = f"""
<!DOCTYPE html>
<html>
<head>
<title>TTRLVR + AZR Validation Report</title>
<style>
body {{ font-family: Arial, sans-serif; margin: 40px; }}
.header {{ background-color: #f0f0f0; padding: 20px; border-radius: 5px; }}
.success {{ color: green; }}
.failure {{ color: red; }}
.test-section {{ margin: 20px 0; padding: 15px; border: 1px solid #ddd; border-radius: 5px; }}
.recommendations {{ background-color: #fff3cd; padding: 15px; border-radius: 5px; }}
</style>
</head>
<body>
<div class="header">
<h1>TTRLVR + AZR Integration Validation Report</h1>
<p>Generated: {datetime.now().strftime('%Y-%m-%d %H:%M:%S')}</p>
<p>Overall Status: <span class="{'success' if report['summary']['overall_success'] else 'failure'}">
        {'✅ ALL TESTS PASSED' if report['summary']['overall_success'] else '❌ SOME TESTS FAILED'}
</span></p>
<p>Tests: {report['summary']['passed_tests']}/{report['summary']['total_tests']} passed</p>
</div>
<h2>Test Results</h2>
"""
for test_name, result in results.items():
status = "success" if result['success'] else "failure"
icon = "โœ…" if result['success'] else "โŒ"
html_content += f"""
<div class="test-section">
<h3 class="{status}">{icon} {test_name}</h3>
<p><strong>Duration:</strong> {result.get('duration', 'N/A')}</p>
<p><strong>Details:</strong> {result.get('details', 'No details available')}</p>
</div>
"""
if report['recommendations']:
html_content += """
<div class="recommendations">
<h2>Recommendations</h2>
<ul>
"""
for rec in report['recommendations']:
html_content += f"<li>{rec}</li>"
html_content += """
</ul>
</div>
"""
html_content += """
</body>
</html>
"""
    # Save the report files
with open(report_file, 'w') as f:
json.dump(report, f, indent=2)
with open(html_report, 'w') as f:
f.write(html_content)
print(f"๐Ÿ“„ JSON ๋ณด๊ณ ์„œ: {report_file}")
print(f"๐ŸŒ HTML ๋ณด๊ณ ์„œ: {html_report}")
return report
def main():
"""๋ฉ”์ธ ์‹คํ–‰ ํ•จ์ˆ˜"""
print("๐Ÿงช TTRLVR + AZR ํ†ตํ•ฉ ๊ฒ€์ฆ ์Šค์œ„ํŠธ ์‹œ์ž‘")
print("=" * 60)
print(f"์‹œ์ž‘ ์‹œ๊ฐ„: {datetime.now().strftime('%Y-%m-%d %H:%M:%S')}")
print("=" * 60)
    # Collect validation results
results = {}
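    # Each entry maps a test name to {'success': bool, 'duration': str, 'details': str}.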
start_time = time.time()
    # 1. Environment validation
test_start = time.time()
success = run_environment_validation()
results['environment_validation'] = {
'success': success,
'duration': f"{time.time() - test_start:.1f}s",
'details': 'Environment setup and dependencies check'
}
    # 2. Unit tests (only if environment validation succeeded)
if success:
test_start = time.time()
success = run_unit_tests()
results['unit_tests'] = {
'success': success,
'duration': f"{time.time() - test_start:.1f}s",
'details': 'Component unit tests and integration tests'
}
else:
results['unit_tests'] = {
'success': False,
'duration': '0s',
'details': 'Skipped due to environment validation failure'
}
    # 3. Mini integration test (only if the previous tests succeeded)
if results['unit_tests']['success']:
test_start = time.time()
success = run_mini_integration_test()
results['mini_integration_test'] = {
'success': success,
'duration': f"{time.time() - test_start:.1f}s",
'details': 'End-to-end pipeline test with 1 problem, 2 rounds'
}
else:
results['mini_integration_test'] = {
'success': False,
'duration': '0s',
'details': 'Skipped due to previous test failures'
}
    # 4. Disk space check (always runs)
test_start = time.time()
success = check_disk_space()
results['disk_space_check'] = {
'success': success,
'duration': f"{time.time() - test_start:.1f}s",
'details': 'Available disk space in critical directories'
}
    # 5. Performance benchmark (always runs)
test_start = time.time()
success = run_performance_benchmark()
results['performance_benchmark'] = {
'success': success,
'duration': f"{time.time() - test_start:.1f}s",
'details': 'System resource usage and performance metrics'
}
    # 6. Report generation
total_duration = time.time() - start_time
print(f"\nโฑ๏ธ ์ด ์‹คํ–‰ ์‹œ๊ฐ„: {total_duration:.1f}์ดˆ ({total_duration/60:.1f}๋ถ„)")
report = generate_validation_report(results)
    # Final results
print("\n" + "="*60)
print("๐Ÿ ๊ฒ€์ฆ ์Šค์œ„ํŠธ ์™„๋ฃŒ")
print("="*60)
passed = sum(1 for result in results.values() if result['success'])
total = len(results)
print(f"๐Ÿ“Š ์ตœ์ข… ๊ฒฐ๊ณผ: {passed}/{total} ํ…Œ์ŠคํŠธ ํ†ต๊ณผ")
if report['summary']['overall_success']:
print("๐ŸŽ‰ ๋ชจ๋“  ๊ฒ€์ฆ ํ†ต๊ณผ! TTRLVR + AZR ์‹œ์Šคํ…œ ์‹คํ–‰ ์ค€๋น„ ์™„๋ฃŒ")
return 0
else:
print("โš ๏ธ ์ผ๋ถ€ ๊ฒ€์ฆ ์‹คํŒจ. ์œ„์˜ ๊ฒฐ๊ณผ๋ฅผ ํ™•์ธํ•˜๊ณ  ๋ฌธ์ œ๋ฅผ ํ•ด๊ฒฐํ•˜์„ธ์š”.")
return 1
if __name__ == '__main__':
exit_code = main()
sys.exit(exit_code)