#!/usr/bin/env python3
"""
MBPP์™€ HumanEval ๋ฒค์น˜๋งˆํฌ ๋ฌธ์ œ ๋‚œ์ด๋„ ๋ถ„์„ ๋„๊ตฌ
๊ฐ ๋ฒค์น˜๋งˆํฌ์˜ ๋ฌธ์ œ๋“ค์„ ๋‹ค์–‘ํ•œ ๊ธฐ์ค€์œผ๋กœ ๋ถ„์„ํ•˜๊ณ  ๋‚œ์ด๋„ ๋ถ„ํฌ๋ฅผ ํ™•์ธํ•ฉ๋‹ˆ๋‹ค.
- ์ฝ”๋“œ ๋ณต์žก๋„ (ํ•จ์ˆ˜ ๊ธธ์ด, ์กฐ๊ฑด๋ฌธ ์ˆ˜, ๋ฃจํ”„ ์ˆ˜)
- ๋ฌธ์ œ ์„ค๋ช… ๊ธธ์ด ๋ฐ ๋ณต์žก๋„
- ํ…Œ์ŠคํŠธ ์ผ€์ด์Šค ์ˆ˜
- ํ•„์š”ํ•œ ์•Œ๊ณ ๋ฆฌ์ฆ˜/๋ฐ์ดํ„ฐ ๊ตฌ์กฐ ์œ ํ˜•
"""
import os
import sys
import json
import re
import argparse
from pathlib import Path
from datetime import datetime
from collections import defaultdict, Counter
import ast
import statistics
# Make the TestTime RLVR modules importable
sys.path.append('/home/ubuntu/RLVR/TestTime-RLVR-v2')
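# Expected JSONL record shape (a sketch based on the fields read in
# analyze_problem_difficulty below; the EvalPlus data files may carry extra
# keys, which this script simply ignores):
#   {"task_id": "...", "prompt": "...", "canonical_solution": "...", "test": "assert ..."}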
def load_jsonl(file_path):
"""JSONL ํŒŒ์ผ ๋กœ๋“œ"""
if not os.path.exists(file_path):
return []
with open(file_path, 'r', encoding='utf-8') as f:
return [json.loads(line.strip()) for line in f if line.strip()]
def analyze_code_complexity(code):
"""ํŒŒ์ด์ฌ ์ฝ”๋“œ ๋ณต์žก๋„ ๋ถ„์„"""
try:
tree = ast.parse(code)
    except Exception:  # unparsable code: fall back to line-count-only default stats
return {
'lines': len(code.split('\n')),
'functions': 0,
'conditionals': 0,
'loops': 0,
'complexity_score': 1
}
stats = {
'lines': len(code.split('\n')),
'functions': 0,
'conditionals': 0,
'loops': 0,
'complexity_score': 1
}
for node in ast.walk(tree):
if isinstance(node, ast.FunctionDef):
stats['functions'] += 1
elif isinstance(node, (ast.If, ast.IfExp)):
stats['conditionals'] += 1
elif isinstance(node, (ast.For, ast.While)):
stats['loops'] += 1
    # Compute the complexity score as a weighted sum
stats['complexity_score'] = (
stats['lines'] * 0.1 +
stats['functions'] * 2 +
stats['conditionals'] * 1.5 +
stats['loops'] * 2
)
return stats
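# Worked example of the weighted sum above: for the two-line snippet
# "def add(a, b):\n    return a + b" the AST walk finds one FunctionDef and no
# conditionals or loops, so complexity_score = 2*0.1 + 1*2 + 0*1.5 + 0*2 = 2.2.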
def extract_keywords(text):
"""ํ…์ŠคํŠธ์—์„œ ์•Œ๊ณ ๋ฆฌ์ฆ˜/๋ฐ์ดํ„ฐ ๊ตฌ์กฐ ํ‚ค์›Œ๋“œ ์ถ”์ถœ"""
keywords = {
'data_structures': ['list', 'array', 'dict', 'set', 'tuple', 'stack', 'queue', 'heap'],
'algorithms': ['sort', 'search', 'binary', 'recursive', 'dynamic', 'greedy', 'graph'],
'math': ['prime', 'fibonacci', 'factorial', 'gcd', 'lcm', 'sqrt', 'power'],
'string': ['string', 'substring', 'regex', 'pattern', 'replace', 'split', 'join'],
'logic': ['condition', 'boolean', 'logic', 'and', 'or', 'not', 'if', 'else']
}
text_lower = text.lower()
found_keywords = defaultdict(list)
for category, words in keywords.items():
for word in words:
if word in text_lower:
found_keywords[category].append(word)
return dict(found_keywords)
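# Example (matching is by plain substring, so short keywords such as 'or' can
# fire inside longer words):
#   extract_keywords("Write a function to sort a list of strings")
#   -> {'data_structures': ['list'], 'algorithms': ['sort'],
#       'string': ['string'], 'logic': ['or']}   # 'or' matched inside "sort"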
def analyze_problem_difficulty(problem):
"""๊ฐœ๋ณ„ ๋ฌธ์ œ ๋‚œ์ด๋„ ๋ถ„์„"""
task_id = problem.get('task_id', 'Unknown')
prompt = problem.get('prompt', '')
canonical_solution = problem.get('canonical_solution', '')
test = problem.get('test', '')
    # Basic information
analysis = {
'task_id': task_id,
'prompt_length': len(prompt),
'solution_length': len(canonical_solution),
'test_length': len(test)
}
    # Code complexity analysis
if canonical_solution:
code_stats = analyze_code_complexity(canonical_solution)
analysis.update(code_stats)
    # Estimate the number of test cases
test_cases = len(re.findall(r'assert', test)) if test else 0
analysis['test_cases'] = test_cases
    # Keyword analysis
combined_text = prompt + ' ' + canonical_solution
keywords = extract_keywords(combined_text)
analysis['keywords'] = keywords
analysis['keyword_count'] = sum(len(words) for words in keywords.values())
    # Compute the difficulty score (0-100)
difficulty_score = min(100, max(0, (
analysis.get('complexity_score', 1) * 10 +
analysis['prompt_length'] * 0.01 +
analysis['test_cases'] * 5 +
analysis['keyword_count'] * 2
)))
analysis['difficulty_score'] = round(difficulty_score, 2)
    # Classify the difficulty level
if difficulty_score < 20:
analysis['difficulty_level'] = 'Easy'
elif difficulty_score < 50:
analysis['difficulty_level'] = 'Medium'
elif difficulty_score < 80:
analysis['difficulty_level'] = 'Hard'
else:
analysis['difficulty_level'] = 'Very Hard'
return analysis
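# Worked example of the scoring above: a problem whose solution has
# complexity_score 2.2, with a 200-character prompt, 3 asserts in its test and
# 2 matched keywords scores 2.2*10 + 200*0.01 + 3*5 + 2*2 = 43.0, which falls
# in the 'Medium' band (20 <= score < 50).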
def analyze_benchmark_difficulty(problems, benchmark_name):
"""๋ฒค์น˜๋งˆํฌ ์ „์ฒด ๋‚œ์ด๋„ ๋ถ„์„"""
print(f"\n๐Ÿ” {benchmark_name.upper()} ๋‚œ์ด๋„ ๋ถ„์„")
print("="*60)
if not problems:
print("โŒ ๋ถ„์„ํ•  ๋ฌธ์ œ๊ฐ€ ์—†์Šต๋‹ˆ๋‹ค.")
return {}
analyses = []
for problem in problems:
analysis = analyze_problem_difficulty(problem)
analyses.append(analysis)
    # Compute summary statistics
difficulty_scores = [a['difficulty_score'] for a in analyses]
complexity_scores = [a.get('complexity_score', 1) for a in analyses]
prompt_lengths = [a['prompt_length'] for a in analyses]
    # Difficulty level distribution
level_counts = Counter(a['difficulty_level'] for a in analyses)
    # Keyword analysis
all_keywords = defaultdict(list)
for analysis in analyses:
for category, words in analysis['keywords'].items():
all_keywords[category].extend(words)
keyword_freq = {category: Counter(words) for category, words in all_keywords.items()}
stats = {
'total_problems': len(analyses),
'difficulty_distribution': dict(level_counts),
'difficulty_stats': {
'min': min(difficulty_scores),
'max': max(difficulty_scores),
'mean': round(sum(difficulty_scores) / len(difficulty_scores), 2),
            'median': round(statistics.median(difficulty_scores), 2)
},
'complexity_stats': {
'min': min(complexity_scores),
'max': max(complexity_scores),
'mean': round(sum(complexity_scores) / len(complexity_scores), 2)
},
'prompt_stats': {
'min': min(prompt_lengths),
'max': max(prompt_lengths),
'mean': round(sum(prompt_lengths) / len(prompt_lengths), 2)
},
'keyword_frequency': {k: dict(v.most_common(5)) for k, v in keyword_freq.items()},
'detailed_analyses': analyses
}
    # Print the results
    print(f"📊 Analyzed {stats['total_problems']} problems in total")
    print(f"\n📈 Difficulty distribution:")
for level, count in level_counts.items():
percentage = round(count / len(analyses) * 100, 1)
print(f" {level}: {count}๊ฐœ ({percentage}%)")
print(f"\n๐Ÿ“‹ ๋‚œ์ด๋„ ์ ์ˆ˜ ํ†ต๊ณ„:")
print(f" ์ตœ์†Œ: {stats['difficulty_stats']['min']}")
print(f" ์ตœ๋Œ€: {stats['difficulty_stats']['max']}")
print(f" ํ‰๊ท : {stats['difficulty_stats']['mean']}")
print(f" ์ค‘์œ„๊ฐ’: {stats['difficulty_stats']['median']}")
print(f"\n๐Ÿ”ง ์ฝ”๋“œ ๋ณต์žก๋„ ํ†ต๊ณ„:")
print(f" ์ตœ์†Œ: {stats['complexity_stats']['min']}")
print(f" ์ตœ๋Œ€: {stats['complexity_stats']['max']}")
print(f" ํ‰๊ท : {stats['complexity_stats']['mean']}")
print(f"\n๐Ÿ“ ๋ฌธ์ œ ์„ค๋ช… ๊ธธ์ด ํ†ต๊ณ„:")
print(f" ์ตœ์†Œ: {stats['prompt_stats']['min']} ๊ธ€์ž")
print(f" ์ตœ๋Œ€: {stats['prompt_stats']['max']} ๊ธ€์ž")
print(f" ํ‰๊ท : {stats['prompt_stats']['mean']} ๊ธ€์ž")
print(f"\n๐Ÿท๏ธ ์ฃผ์š” ํ‚ค์›Œ๋“œ (์ƒ์œ„ 3๊ฐœ):")
for category, freq_dict in keyword_freq.items():
if freq_dict:
top_words = freq_dict.most_common(3)
print(f" {category}: {', '.join([f'{word}({count})' for word, count in top_words])}")
    # Print a sample of the hardest problems
hard_problems = [a for a in analyses if a['difficulty_level'] in ['Hard', 'Very Hard']]
if hard_problems:
print(f"\n๐Ÿ”ฅ ์–ด๋ ค์šด ๋ฌธ์ œ ์ƒ˜ํ”Œ (์ƒ์œ„ 5๊ฐœ):")
hard_problems_sorted = sorted(hard_problems, key=lambda x: x['difficulty_score'], reverse=True)
for i, problem in enumerate(hard_problems_sorted[:5]):
print(f" {i+1}. {problem['task_id']} (์ ์ˆ˜: {problem['difficulty_score']}, ๋ ˆ๋ฒจ: {problem['difficulty_level']})")
return stats
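# Minimal programmatic use (a sketch; the JSONL path below is illustrative):
#   problems = load_jsonl('/path/to/HumanEvalPlus.jsonl')
#   stats = analyze_benchmark_difficulty(problems, 'humaneval')
#   print(stats['difficulty_stats']['mean'], stats['difficulty_distribution'])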
def save_analysis_results(stats, benchmark_name, output_dir):
"""๋ถ„์„ ๊ฒฐ๊ณผ ์ €์žฅ"""
analysis_dir = os.path.join(output_dir, benchmark_name)
os.makedirs(analysis_dir, exist_ok=True)
    # Full analysis results
full_analysis_file = os.path.join(analysis_dir, f"{benchmark_name}_difficulty_analysis.json")
with open(full_analysis_file, 'w', encoding='utf-8') as f:
json.dump(stats, f, indent=2, ensure_ascii=False)
    # Summary report
summary_file = os.path.join(analysis_dir, f"{benchmark_name}_difficulty_summary.txt")
with open(summary_file, 'w', encoding='utf-8') as f:
f.write(f"{benchmark_name.upper()} ๋‚œ์ด๋„ ๋ถ„์„ ์š”์•ฝ\n")
f.write("="*60 + "\n\n")
f.write(f"์ƒ์„ฑ ์‹œ๊ฐ„: {datetime.now().isoformat()}\n\n")
f.write(f"๐Ÿ“Š ์ „์ฒด ํ†ต๊ณ„:\n")
f.write(f" ์ด ๋ฌธ์ œ ์ˆ˜: {stats['total_problems']}๊ฐœ\n")
f.write(f" ํ‰๊ท  ๋‚œ์ด๋„ ์ ์ˆ˜: {stats['difficulty_stats']['mean']}\n")
f.write(f" ํ‰๊ท  ์ฝ”๋“œ ๋ณต์žก๋„: {stats['complexity_stats']['mean']}\n\n")
f.write(f"๐Ÿ“ˆ ๋‚œ์ด๋„ ๋ถ„ํฌ:\n")
for level, count in stats['difficulty_distribution'].items():
percentage = round(count / stats['total_problems'] * 100, 1)
f.write(f" {level}: {count}๊ฐœ ({percentage}%)\n")
f.write(f"\n๐Ÿท๏ธ ์ฃผ์š” ํ‚ค์›Œ๋“œ:\n")
for category, freq_dict in stats['keyword_frequency'].items():
if freq_dict:
f.write(f" {category}: {', '.join(freq_dict.keys())}\n")
print(f"\n๐Ÿ’พ ๋ถ„์„ ๊ฒฐ๊ณผ๊ฐ€ ์ €์žฅ๋˜์—ˆ์Šต๋‹ˆ๋‹ค:")
print(f" ์ „์ฒด ๋ถ„์„: {full_analysis_file}")
print(f" ์š”์•ฝ ๋ณด๊ณ ์„œ: {summary_file}")
def main():
    parser = argparse.ArgumentParser(description='Analyze benchmark problem difficulty')
parser.add_argument('--benchmark', type=str, default='all',
choices=['all', 'humaneval', 'mbpp'],
                        help='Benchmark to analyze (all = every benchmark)')
parser.add_argument('--save', action='store_true',
                        help='Save the results to files')
parser.add_argument('--output_dir', type=str,
default='/home/ubuntu/RLVR/TestTime-RLVR-v2/test/analysis_results',
                        help='Output directory')
parser.add_argument('--detailed', action='store_true',
                        help='Print detailed analysis results')
args = parser.parse_args()
    # Data paths
base_dir = '/home/ubuntu/RLVR/TestTime-RLVR-v2'
humaneval_path = f'{base_dir}/evaluation/code_eval/data/HumanEvalPlus.jsonl'
mbpp_path = f'{base_dir}/evaluation/code_eval/data/MbppPlus.jsonl'
os.makedirs(args.output_dir, exist_ok=True)
print("๐Ÿš€ TestTime RLVR ๋ฒค์น˜๋งˆํฌ ๋‚œ์ด๋„ ๋ถ„์„ ๋„๊ตฌ")
print("="*80)
all_results = {}
if args.benchmark in ['all', 'humaneval']:
print("\n")
problems = load_jsonl(humaneval_path)
if problems:
stats = analyze_benchmark_difficulty(problems, 'humaneval')
all_results['humaneval'] = stats
if args.save and stats:
save_analysis_results(stats, 'humaneval', args.output_dir)
if args.benchmark in ['all', 'mbpp']:
print("\n")
problems = load_jsonl(mbpp_path)
if problems:
stats = analyze_benchmark_difficulty(problems, 'mbpp')
all_results['mbpp'] = stats
if args.save and stats:
save_analysis_results(stats, 'mbpp', args.output_dir)
    # Benchmark comparison
if len(all_results) > 1:
print("\n" + "="*80)
print("๐Ÿ”„ ๋ฒค์น˜๋งˆํฌ ๋น„๊ต ๋ถ„์„")
print("="*80)
for benchmark, stats in all_results.items():
print(f"\n๐Ÿ“Š {benchmark.upper()}:")
print(f" ์ด ๋ฌธ์ œ: {stats['total_problems']}๊ฐœ")
print(f" ํ‰๊ท  ๋‚œ์ด๋„: {stats['difficulty_stats']['mean']}")
print(f" ํ‰๊ท  ๋ณต์žก๋„: {stats['complexity_stats']['mean']}")
easy_count = stats['difficulty_distribution'].get('Easy', 0)
hard_count = stats['difficulty_distribution'].get('Hard', 0) + stats['difficulty_distribution'].get('Very Hard', 0)
print(f" ์‰ฌ์šด ๋ฌธ์ œ: {easy_count}๊ฐœ, ์–ด๋ ค์šด ๋ฌธ์ œ: {hard_count}๊ฐœ")
# ์‚ฌ์šฉ๋ฒ• ์•ˆ๋‚ด
print("\n" + "="*80)
print("๐Ÿ’ก ์‚ฌ์šฉ๋ฒ•")
print("="*80)
print("ํŠน์ • ๋ฒค์น˜๋งˆํฌ๋งŒ ๋ถ„์„:")
print(" python test/analyze_difficulty.py --benchmark mbpp --save")
print(" python test/analyze_difficulty.py --benchmark humaneval --detailed")
print("\n์ „์ฒด ๋ถ„์„ ๋ฐ ์ €์žฅ:")
print(" python test/analyze_difficulty.py --save")
if __name__ == '__main__':
main()