#!/usr/bin/env python3
"""Test script to debug VeRL training.

Writes one tiny dummy parquet file per task type into
``./test_time_output_debug/training_data`` and then calls the project's
``test.train_ttrlvr_azr.main`` with ``step5_only=True`` pointing at that data.

All work happens inside :func:`main`, which only runs when the file is
executed as a script.  Importing the module (for example when a test runner
collects ``test_*.py`` files) therefore has no side effects.
"""
import argparse
import os
import sys

import numpy as np
import pandas as pd

# Project checkouts that must be importable before the training entry point
# is loaded.  Each path is inserted at position 0 in this order, so the last
# entry ends up first on ``sys.path``.
PROJECT_PATHS = (
    '/home/ubuntu/RLVR/TestTime-RLVR-v2',
    '/home/ubuntu/RLVR/verl',
)

OUTPUT_DIR = "./test_time_output_debug"
TRAINING_DATA_PATH = os.path.join(OUTPUT_DIR, "training_data")

# One parquet file is written per task type.
TASK_TYPES = ('induction', 'deduction', 'abduction')

# Length of the dummy per-token score vector stored in each row.
_DUMMY_SCORE_LENGTH = 10


def build_dummy_records(task_type):
    """Return the single-row column dict used as dummy data for *task_type*.

    Args:
        task_type: Task name appended to the dummy prompt and response text.

    Returns:
        A dict mapping column name to a one-element list, suitable for
        ``pandas.DataFrame``.  ``token_level_scores`` holds one NumPy array
        of ten ``1.0`` values.
    """
    return {
        'prompts': ['test prompt ' + task_type],
        'responses': ['test response ' + task_type],
        'rewards': [1.0],
        'problem_id': ['test_id'],
        'token_level_scores': [np.array([1.0] * _DUMMY_SCORE_LENGTH)],  # Dummy scores
    }


def write_dummy_training_data(training_data_path, task_types=TASK_TYPES):
    """Create *training_data_path* and write ``<task_type>.parquet`` files into it.

    Args:
        training_data_path: Directory to create (if missing) and fill.
        task_types: Iterable of task names; one parquet file is written per name.
    """
    os.makedirs(training_data_path, exist_ok=True)
    for task_type in task_types:
        frame = pd.DataFrame(build_dummy_records(task_type))
        frame.to_parquet(os.path.join(training_data_path, f'{task_type}.parquet'))
    print(f"Created dummy training data in {training_data_path}")


def build_args(training_data_path, output_dir):
    """Build the argument namespace handed to the training entry point.

    Args:
        training_data_path: Directory containing the training parquet files.
        output_dir: Directory passed through as the run's output directory.

    Returns:
        An ``argparse.Namespace`` with ``step5_only=True`` and the fixed debug
        settings used by this script.
    """
    return argparse.Namespace(
        benchmark='mbpp',
        problem_id='Mbpp/2',
        rounds=1,
        # NOTE(review): relative path -- presumably resolved against the
        # current working directory by the callee; confirm before running
        # from another directory.
        config='test/configs/ttrlvr_azr_ppo_4gpu.yaml',
        step5_only=True,
        data_path=training_data_path,
        output_dir=output_dir,
        model='Qwen/Qwen2.5-7B',
        debug=True,
        batch_size=24,
        batch_epochs=1,
        num_programs=4,
        input_generation_rounds=3,
        parallel_batch_size=4,
        eval_rounds=5,
        skip_task_eval=False,
        save_every_round=False,
        save_round_interval=5,
        problems=10,
        resume=1,
        gpu=None,
    )


def main():
    """Create the dummy data, then run Step 5 only via the project entry point."""
    for path in PROJECT_PATHS:
        sys.path.insert(0, path)

    write_dummy_training_data(TRAINING_DATA_PATH)

    # Imported here rather than at module top: the import only resolves once
    # the project paths above are on sys.path, and deferring it keeps a plain
    # import of this file free of project dependencies.
    from test.train_ttrlvr_azr import main as run_training

    args = build_args(TRAINING_DATA_PATH, OUTPUT_DIR)

    # Patch sys.argv for argparse
    sys.argv = ['test_debug_verl.py']

    run_training(args)


if __name__ == "__main__":
    main()