#!/usr/bin/env python3
"""TTRLVR + AZR environment validation script.

Checks that every component needed by the real execution environment is
set up correctly:

1. Python packages and their versions
2. GPU and CUDA environment
3. File paths and permissions
4. Model loading (library availability only)
5. AZR configuration file
6. A minimal pipeline run
"""

import importlib
import json
import os
import re
import subprocess
import sys
import tempfile
import traceback
from datetime import datetime
from pathlib import Path
from typing import Optional, Tuple

# Locations this script validates. They are kept as module constants so
# every check refers to the same values.
PROJECT_ROOT = '/home/ubuntu/RLVR/TestTime-RLVR-v2'
AZR_CONFIG_PATH = PROJECT_ROOT + '/test/configs/ttrlvr_azr_7b_single_gpu.sh'
CHECKPOINT_DIR = '/data/RLVR/checkpoints'

# Make the project packages importable (needed by test_simple_pipeline).
sys.path.append(PROJECT_ROOT)


class EnvironmentValidator:
    """Run a fixed series of environment checks and collect the results.

    Results accumulate in ``self.results``, a JSON-serialisable dict with
    the keys ``timestamp``, ``tests``, ``overall_success`` and
    ``recommendations``.
    """

    def __init__(self):
        self.results = {
            'timestamp': datetime.now().isoformat(),
            'tests': {},
            'overall_success': False,
            'recommendations': []
        }

    @staticmethod
    def _version_tuple(version: str) -> Tuple[int, ...]:
        """Return the leading numeric release components of *version*.

        A local version tag (``+cu118``) is dropped, and parsing stops at
        the first component that does not start with a digit. A trailing
        pre-release suffix inside a component (``0a1``) is ignored, so
        ``2.0.0a1`` is treated like ``2.0.0``.

        Raises:
            ValueError: if no numeric component can be extracted.
        """
        release = str(version).split('+', 1)[0]
        parts = []
        for piece in release.split('.'):
            match = re.match(r'\d+', piece)
            if match is None:
                break
            parts.append(int(match.group()))
        if not parts:
            raise ValueError(f"unparsable version string: {version!r}")
        return tuple(parts)

    @staticmethod
    def _is_older(installed: str, minimum: str) -> bool:
        """Return True when *installed* is numerically below *minimum*.

        Components are compared as integers, so ``0.10.0`` is newer than
        ``0.3.0`` (a plain string comparison gets this wrong). The shorter
        version is zero-padded so ``1.5`` equals ``1.5.0``.
        """
        have = EnvironmentValidator._version_tuple(installed)
        need = EnvironmentValidator._version_tuple(minimum)
        width = max(len(have), len(need))
        have += (0,) * (width - len(have))
        need += (0,) * (width - len(need))
        return have < need

    def log_test(self, test_name: str, success: bool, message: str,
                 details: Optional[str] = None):
        """Print one test outcome and store it under ``results['tests']``.

        Args:
            test_name: Key used in the results dict and in the console line.
            success: Whether the test passed.
            message: Short human-readable outcome.
            details: Optional extra information, printed on its own line.
        """
        status = "✅ PASS" if success else "❌ FAIL"
        print(f"{status} {test_name}: {message}")
        if details:
            print(f" Details: {details}")

        self.results['tests'][test_name] = {
            'success': success,
            'message': message,
            'details': details
        }

        if not success:
            print()

    def add_recommendation(self, recommendation: str):
        """Store a recommendation and print it."""
        self.results['recommendations'].append(recommendation)
        print(f"💡 Recommendation: {recommendation}")

    def test_python_packages(self) -> bool:
        """Check that the required packages import and meet minimum versions.

        vLLM is optional: if it cannot be imported a recommendation is
        added instead of a failure. If it is installed, its version is
        checked like any other package.

        Returns:
            True when nothing is missing and no version problem was found.
        """
        required_packages = {
            'torch': '2.0.0',
            'transformers': '4.30.0',
            'pandas': '1.5.0',
            'numpy': '1.21.0',
            'vllm': '0.3.0'
        }
        optional_packages = {'vllm'}

        missing_packages = []
        version_issues = []

        for package, min_version in required_packages.items():
            try:
                module = importlib.import_module(package)
            except ImportError:
                if package in optional_packages:
                    self.add_recommendation("Consider installing vLLM for better GPU performance: pip install vllm")
                else:
                    missing_packages.append(package)
                continue
            except Exception as e:
                # An import can fail with something other than ImportError
                # (for example a broken native library); report, don't crash.
                version_issues.append(f"{package}: Error checking version - {e}")
                continue

            try:
                version = module.__version__
                if self._is_older(version, min_version):
                    version_issues.append(f"{package}: {version} < {min_version}")
            except Exception as e:
                version_issues.append(f"{package}: Error checking version - {e}")

        if missing_packages:
            self.log_test(
                "Python Packages",
                False,
                f"Missing packages: {', '.join(missing_packages)}",
                f"Install with: pip install {' '.join(missing_packages)}"
            )
            return False
        if version_issues:
            self.log_test(
                "Python Packages",
                False,
                f"Version issues: {', '.join(version_issues)}",
                "Update packages to meet minimum requirements"
            )
            return False

        self.log_test("Python Packages", True, "All required packages installed")
        return True

    def test_gpu_environment(self) -> bool:
        """Check CUDA availability and the current GPU's total memory.

        Returns:
            True when CUDA is available and the current device reports at
            least 8 GB of total memory.
        """
        try:
            import torch

            if not torch.cuda.is_available():
                self.log_test("GPU Environment", False, "CUDA not available")
                self.add_recommendation("Install CUDA toolkit and PyTorch with CUDA support")
                return False

            gpu_count = torch.cuda.device_count()
            current_device = torch.cuda.current_device()
            device_name = torch.cuda.get_device_name(current_device)

            # Memory figures, converted from bytes to GB.
            memory_reserved = torch.cuda.memory_reserved() / 1024**3
            memory_total = torch.cuda.get_device_properties(current_device).total_memory / 1024**3

            details = f"GPUs: {gpu_count}, Current: {device_name}, Memory: {memory_total:.1f}GB total, {memory_reserved:.1f}GB reserved"

            if memory_total < 8.0:
                self.log_test("GPU Environment", False, f"GPU memory insufficient: {memory_total:.1f}GB", details)
                self.add_recommendation("Use a GPU with at least 8GB VRAM for 7B models")
                return False

            self.log_test("GPU Environment", True, "GPU environment ready", details)
            return True

        except Exception as e:
            self.log_test("GPU Environment", False, f"Error checking GPU: {e}")
            return False

    def test_file_paths_and_permissions(self) -> bool:
        """Check that the critical paths exist and are accessible.

        The checkpoint directory is created when it does not exist. Every
        existing path must be readable; directories must also be writable.

        Returns:
            True when no path issue was found.
        """
        critical_paths = {
            PROJECT_ROOT: 'Main project directory',
            AZR_CONFIG_PATH: 'AZR config script',
            CHECKPOINT_DIR: 'Checkpoint directory (will be created)',
            '/tmp': 'Temporary directory'
        }

        issues = []

        for path, description in critical_paths.items():
            if not os.path.exists(path):
                if path == CHECKPOINT_DIR:
                    # The checkpoint directory may legitimately be absent;
                    # try to create it.
                    try:
                        os.makedirs(path, exist_ok=True)
                        self.log_test(f"Path: {description}", True, f"Created directory: {path}")
                    except Exception as e:
                        issues.append(f"{description}: Cannot create {path} - {e}")
                else:
                    issues.append(f"{description}: Not found - {path}")
                continue

            readable = os.access(path, os.R_OK)
            writable = os.access(path, os.W_OK)

            if not readable:
                issues.append(f"{description}: No read permission - {path}")
            elif os.path.isdir(path) and not writable:
                issues.append(f"{description}: No write permission - {path}")
            else:
                self.log_test(f"Path: {description}", True, f"Accessible: {path}")

        if issues:
            self.log_test("File Paths", False, f"{len(issues)} path issues", "; ".join(issues))
            return False

        self.log_test("File Paths", True, "All critical paths accessible")
        return True

    def test_model_loading(self) -> bool:
        """Check that transformers can be imported; no model is loaded.

        Returns:
            True when ``transformers.AutoTokenizer`` imports successfully.
        """
        try:
            # Only the import is exercised, to keep this check fast.
            from transformers import AutoTokenizer  # noqa: F401

            self.log_test("Model Loading", True, "Transformers library available for model loading")
            self.add_recommendation("Model loading test skipped to avoid timeout. Run full model test separately if needed.")
            return True

        except Exception as e:
            self.log_test("Model Loading", False, f"Failed to import transformers: {e}")
            self.add_recommendation("Install transformers library: pip install transformers")
            return False

    def test_azr_config(self) -> bool:
        """Validate the AZR config script.

        The script must exist, be executable, and contain each of the
        required setting strings verbatim.

        Returns:
            True when all three conditions hold.
        """
        config_path = AZR_CONFIG_PATH

        try:
            if not os.path.exists(config_path):
                self.log_test("AZR Config", False, f"Config file not found: {config_path}")
                return False

            if not os.access(config_path, os.X_OK):
                self.log_test("AZR Config", False, f"Config file not executable: {config_path}")
                self.add_recommendation(f"Make config executable: chmod +x {config_path}")
                return False

            with open(config_path, 'r', encoding='utf-8') as f:
                content = f.read()

            required_settings = [
                'trainer.project_name=ttrlvr_azr',
                'azr.train_propose=False',
                'data.train_batch_size=8',
                'actor_rollout_ref.actor.ppo_mini_batch_size=24'
            ]

            # Plain substring match against the script text.
            missing_settings = [s for s in required_settings if s not in content]

            if missing_settings:
                self.log_test(
                    "AZR Config",
                    False,
                    f"Missing settings: {', '.join(missing_settings)}",
                    f"Check config file: {config_path}"
                )
                return False

            self.log_test("AZR Config", True, f"Config file validated: {config_path}")
            return True

        except Exception as e:
            self.log_test("AZR Config", False, f"Error validating config: {e}")
            return False

    def test_simple_pipeline(self) -> bool:
        """Generate tasks from one hand-written IPO triple and check them.

        Returns:
            True when at least one task is generated and the first task of
            every non-empty task list has the required metadata fields.
        """
        try:
            from absolute_zero_reasoner.testtime.config import TestTimeConfig
            from absolute_zero_reasoner.testtime.logger import TestTimeLogger
            from absolute_zero_reasoner.testtime.task_generator import TestTimeTaskGenerator

            config = TestTimeConfig()
            config.model_name = "Qwen/Qwen2.5-7B"

            logger = TestTimeLogger()
            task_generator = TestTimeTaskGenerator(config, logger)

            # A single IPO (input, program, output) triple used as a fixture.
            test_ipo_triples = [
                {
                    'id': 'test_triple_0',
                    'input': '[1, 2, 3]',
                    'actual_output': '[2, 4, 6]',
                    'program': 'def test_func(lst):\n return [x * 2 for x in lst]',
                    'full_input_str': 'test_func([1, 2, 3])',
                    'source_program_id': 'program_0',
                    'ipo_index': 0
                }
            ]

            tasks = task_generator.generate_tasks(test_ipo_triples, "TestProblem", 1)

            if not tasks or not any(len(task_list) > 0 for task_list in tasks.values()):
                self.log_test("Simple Pipeline", False, "No tasks generated")
                return False

            # Check the AZR metadata on the first task of each non-empty list.
            required_fields = ['uid', 'ipo_group_id', 'basic_accuracy', 'ground_truth']
            for task_list in tasks.values():
                if not task_list:
                    continue
                task = task_list[0]
                missing_fields = [field for field in required_fields if field not in task]
                if missing_fields:
                    self.log_test(
                        "Simple Pipeline",
                        False,
                        f"Missing AZR metadata: {missing_fields}"
                    )
                    return False

            total_tasks = sum(len(task_list) for task_list in tasks.values())
            self.log_test("Simple Pipeline", True, f"Generated {total_tasks} tasks successfully")
            return True

        except Exception as e:
            self.log_test("Simple Pipeline", False, f"Pipeline test failed: {e}")
            return False

    def run_all_tests(self):
        """Run every check in order, print a summary and return the results.

        A check that raises is counted as failed and recorded under
        ``results['tests']`` with its traceback, so it appears in the
        saved report.

        Returns:
            The ``self.results`` dict.
        """
        print("🔍 TTRLVR + AZR 환경 검증 시작")
        print("=" * 60)

        tests = [
            self.test_python_packages,
            self.test_gpu_environment,
            self.test_file_paths_and_permissions,
            self.test_model_loading,
            self.test_azr_config,
            self.test_simple_pipeline
        ]

        passed_tests = 0
        total_tests = len(tests)

        for test in tests:
            try:
                if test():
                    passed_tests += 1
                print()  # blank line between tests
            except Exception as e:
                trace = traceback.format_exc()
                print(f"❌ Test {test.__name__} crashed: {e}")
                print(f" Traceback: {trace}")
                print()
                self.results['tests'][test.__name__] = {
                    'success': False,
                    'message': f"Test crashed: {e}",
                    'details': trace
                }

        # Final summary.
        success_rate = passed_tests / total_tests * 100
        self.results['overall_success'] = passed_tests == total_tests

        print("=" * 60)
        print("📊 환경 검증 결과:")
        print(f" - 통과한 테스트: {passed_tests}/{total_tests} ({success_rate:.1f}%)")

        if self.results['recommendations']:
            print(f"\n💡 권장사항 ({len(self.results['recommendations'])}개):")
            for i, rec in enumerate(self.results['recommendations'], 1):
                print(f" {i}. {rec}")

        if self.results['overall_success']:
            print("\n🎉 환경 검증 완료! TTRLVR + AZR 실행 준비가 완료되었습니다.")
        else:
            print(f"\n⚠️ 환경 검증 실패: {total_tests - passed_tests}개 테스트 실패")
            print(" 위의 권장사항을 참고하여 문제를 해결한 후 다시 시도하세요.")

        return self.results


def main():
    """Run all checks, save the results as JSON under /tmp, return exit code.

    Returns:
        0 when every test passed, 1 otherwise.
    """
    validator = EnvironmentValidator()
    results = validator.run_all_tests()

    output_file = f"/tmp/ttrlvr_azr_validation_{datetime.now().strftime('%Y%m%d_%H%M%S')}.json"
    with open(output_file, 'w', encoding='utf-8') as f:
        json.dump(results, f, indent=2, ensure_ascii=False)

    print(f"\n📄 상세 결과 저장: {output_file}")

    return 0 if results['overall_success'] else 1


if __name__ == '__main__':
    sys.exit(main())