#!/usr/bin/env python3
"""
MBPP์™€ HumanEval ๋ฒค์น˜๋งˆํฌ ๋ฌธ์ œ ๋‚œ์ด๋„ ๋ถ„์„ ๋„๊ตฌ
๊ฐ ๋ฒค์น˜๋งˆํฌ์˜ ๋ฌธ์ œ๋“ค์„ ๋‹ค์–‘ํ•œ ๊ธฐ์ค€์œผ๋กœ ๋ถ„์„ํ•˜๊ณ  ๋‚œ์ด๋„ ๋ถ„ํฌ๋ฅผ ํ™•์ธํ•ฉ๋‹ˆ๋‹ค.
- ์ฝ”๋“œ ๋ณต์žก๋„ (ํ•จ์ˆ˜ ๊ธธ์ด, ์กฐ๊ฑด๋ฌธ ์ˆ˜, ๋ฃจํ”„ ์ˆ˜)
- ๋ฌธ์ œ ์„ค๋ช… ๊ธธ์ด ๋ฐ ๋ณต์žก๋„
- ํ…Œ์ŠคํŠธ ์ผ€์ด์Šค ์ˆ˜
- ํ•„์š”ํ•œ ์•Œ๊ณ ๋ฆฌ์ฆ˜/๋ฐ์ดํ„ฐ ๊ตฌ์กฐ ์œ ํ˜•
"""
import os
import sys
import json
import re
import argparse
from pathlib import Path
from datetime import datetime
from collections import defaultdict, Counter
import ast
import statistics
# Make the TestTime RLVR modules importable
sys.path.append('/home/ubuntu/RLVR/TestTime-RLVR-v2')
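# Expected JSONL record shape (a sketch based on the fields read in
# analyze_problem_difficulty below; the EvalPlus data files may carry extra
# keys, which this script simply ignores):
#   {"task_id": "...", "prompt": "...", "canonical_solution": "...", "test": "assert ..."}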
def load_jsonl(file_path):
"""JSONL ํŒŒ์ผ ๋กœ๋“œ"""
if not os.path.exists(file_path):
return []
with open(file_path, 'r', encoding='utf-8') as f:
return [json.loads(line.strip()) for line in f if line.strip()]
def analyze_code_complexity(code):
"""ํŒŒ์ด์ฌ ์ฝ”๋“œ ๋ณต์žก๋„ ๋ถ„์„"""
try:
tree = ast.parse(code)
    except Exception:  # unparsable code: fall back to line-count-only default stats
return {
'lines': len(code.split('\n')),
'functions': 0,
'conditionals': 0,
'loops': 0,
'complexity_score': 1
}
stats = {
'lines': len(code.split('\n')),
'functions': 0,
'conditionals': 0,
'loops': 0,
'complexity_score': 1
}
for node in ast.walk(tree):
if isinstance(node, ast.FunctionDef):
stats['functions'] += 1
elif isinstance(node, (ast.If, ast.IfExp)):
stats['conditionals'] += 1
elif isinstance(node, (ast.For, ast.While)):
stats['loops'] += 1
    # Compute the complexity score as a weighted sum
stats['complexity_score'] = (
stats['lines'] * 0.1 +
stats['functions'] * 2 +
stats['conditionals'] * 1.5 +
stats['loops'] * 2
)
return stats
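# Worked example of the weighted sum above: for the two-line snippet
# "def add(a, b):\n    return a + b" the AST walk finds one FunctionDef and no
# conditionals or loops, so complexity_score = 2*0.1 + 1*2 + 0*1.5 + 0*2 = 2.2.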
def extract_keywords(text):
"""ํ…์ŠคํŠธ์—์„œ ์•Œ๊ณ ๋ฆฌ์ฆ˜/๋ฐ์ดํ„ฐ ๊ตฌ์กฐ ํ‚ค์›Œ๋“œ ์ถ”์ถœ"""
keywords = {
'data_structures': ['list', 'array', 'dict', 'set', 'tuple', 'stack', 'queue', 'heap'],
'algorithms': ['sort', 'search', 'binary', 'recursive', 'dynamic', 'greedy', 'graph'],
'math': ['prime', 'fibonacci', 'factorial', 'gcd', 'lcm', 'sqrt', 'power'],
'string': ['string', 'substring', 'regex', 'pattern', 'replace', 'split', 'join'],
'logic': ['condition', 'boolean', 'logic', 'and', 'or', 'not', 'if', 'else']
}
text_lower = text.lower()
found_keywords = defaultdict(list)
for category, words in keywords.items():
for word in words:
if word in text_lower:
found_keywords[category].append(word)
return dict(found_keywords)
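# Example (matching is by plain substring, so short keywords such as 'or' can
# fire inside longer words):
#   extract_keywords("Write a function to sort a list of strings")
#   -> {'data_structures': ['list'], 'algorithms': ['sort'],
#       'string': ['string'], 'logic': ['or']}   # 'or' matched inside "sort"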
def analyze_problem_difficulty(problem):
"""๊ฐœ๋ณ„ ๋ฌธ์ œ ๋‚œ์ด๋„ ๋ถ„์„"""
task_id = problem.get('task_id', 'Unknown')
prompt = problem.get('prompt', '')
canonical_solution = problem.get('canonical_solution', '')
test = problem.get('test', '')
    # Basic information
analysis = {
'task_id': task_id,
'prompt_length': len(prompt),
'solution_length': len(canonical_solution),
'test_length': len(test)
}
    # Code complexity analysis
if canonical_solution:
code_stats = analyze_code_complexity(canonical_solution)
analysis.update(code_stats)
    # Estimate the number of test cases
test_cases = len(re.findall(r'assert', test)) if test else 0
analysis['test_cases'] = test_cases
    # Keyword analysis
combined_text = prompt + ' ' + canonical_solution
keywords = extract_keywords(combined_text)
analysis['keywords'] = keywords
analysis['keyword_count'] = sum(len(words) for words in keywords.values())
    # Compute the difficulty score (0-100)
difficulty_score = min(100, max(0, (
analysis.get('complexity_score', 1) * 10 +
analysis['prompt_length'] * 0.01 +
analysis['test_cases'] * 5 +
analysis['keyword_count'] * 2
)))
analysis['difficulty_score'] = round(difficulty_score, 2)
    # Classify the difficulty level
if difficulty_score < 20:
analysis['difficulty_level'] = 'Easy'
elif difficulty_score < 50:
analysis['difficulty_level'] = 'Medium'
elif difficulty_score < 80:
analysis['difficulty_level'] = 'Hard'
else:
analysis['difficulty_level'] = 'Very Hard'
return analysis
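# Worked example of the scoring above: a problem whose solution has
# complexity_score 2.2, with a 200-character prompt, 3 asserts in its test and
# 2 matched keywords scores 2.2*10 + 200*0.01 + 3*5 + 2*2 = 43.0, which falls
# in the 'Medium' band (20 <= score < 50).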
def analyze_benchmark_difficulty(problems, benchmark_name):
"""๋ฒค์น˜๋งˆํฌ ์ „์ฒด ๋‚œ์ด๋„ ๋ถ„์„"""
print(f"\n๐Ÿ” {benchmark_name.upper()} ๋‚œ์ด๋„ ๋ถ„์„")
print("="*60)
if not problems:
print("โŒ ๋ถ„์„ํ•  ๋ฌธ์ œ๊ฐ€ ์—†์Šต๋‹ˆ๋‹ค.")
return {}
analyses = []
for problem in problems:
analysis = analyze_problem_difficulty(problem)
analyses.append(analysis)
    # Compute summary statistics
difficulty_scores = [a['difficulty_score'] for a in analyses]
complexity_scores = [a.get('complexity_score', 1) for a in analyses]
prompt_lengths = [a['prompt_length'] for a in analyses]
    # Difficulty level distribution
level_counts = Counter(a['difficulty_level'] for a in analyses)
    # Keyword analysis
all_keywords = defaultdict(list)
for analysis in analyses:
for category, words in analysis['keywords'].items():
all_keywords[category].extend(words)
keyword_freq = {category: Counter(words) for category, words in all_keywords.items()}
stats = {
'total_problems': len(analyses),
'difficulty_distribution': dict(level_counts),
'difficulty_stats': {
'min': min(difficulty_scores),
'max': max(difficulty_scores),
'mean': round(sum(difficulty_scores) / len(difficulty_scores), 2),
            'median': round(statistics.median(difficulty_scores), 2)
},
'complexity_stats': {
'min': min(complexity_scores),
'max': max(complexity_scores),
'mean': round(sum(complexity_scores) / len(complexity_scores), 2)
},
'prompt_stats': {
'min': min(prompt_lengths),
'max': max(prompt_lengths),
'mean': round(sum(prompt_lengths) / len(prompt_lengths), 2)
},
'keyword_frequency': {k: dict(v.most_common(5)) for k, v in keyword_freq.items()},
'detailed_analyses': analyses
}
    # Print the results
    print(f"📊 Analyzed {stats['total_problems']} problems in total")
    print(f"\n📈 Difficulty distribution:")
for level, count in level_counts.items():
percentage = round(count / len(analyses) * 100, 1)
print(f" {level}: {count}๊ฐœ ({percentage}%)")
print(f"\n๐Ÿ“‹ ๋‚œ์ด๋„ ์ ์ˆ˜ ํ†ต๊ณ„:")
print(f" ์ตœ์†Œ: {stats['difficulty_stats']['min']}")
print(f" ์ตœ๋Œ€: {stats['difficulty_stats']['max']}")
print(f" ํ‰๊ท : {stats['difficulty_stats']['mean']}")
print(f" ์ค‘์œ„๊ฐ’: {stats['difficulty_stats']['median']}")
print(f"\n๐Ÿ”ง ์ฝ”๋“œ ๋ณต์žก๋„ ํ†ต๊ณ„:")
print(f" ์ตœ์†Œ: {stats['complexity_stats']['min']}")
print(f" ์ตœ๋Œ€: {stats['complexity_stats']['max']}")
print(f" ํ‰๊ท : {stats['complexity_stats']['mean']}")
print(f"\n๐Ÿ“ ๋ฌธ์ œ ์„ค๋ช… ๊ธธ์ด ํ†ต๊ณ„:")
print(f" ์ตœ์†Œ: {stats['prompt_stats']['min']} ๊ธ€์ž")
print(f" ์ตœ๋Œ€: {stats['prompt_stats']['max']} ๊ธ€์ž")
print(f" ํ‰๊ท : {stats['prompt_stats']['mean']} ๊ธ€์ž")
print(f"\n๐Ÿท๏ธ ์ฃผ์š” ํ‚ค์›Œ๋“œ (์ƒ์œ„ 3๊ฐœ):")
for category, freq_dict in keyword_freq.items():
if freq_dict:
top_words = freq_dict.most_common(3)
print(f" {category}: {', '.join([f'{word}({count})' for word, count in top_words])}")
    # Print a sample of the hardest problems
hard_problems = [a for a in analyses if a['difficulty_level'] in ['Hard', 'Very Hard']]
if hard_problems:
print(f"\n๐Ÿ”ฅ ์–ด๋ ค์šด ๋ฌธ์ œ ์ƒ˜ํ”Œ (์ƒ์œ„ 5๊ฐœ):")
hard_problems_sorted = sorted(hard_problems, key=lambda x: x['difficulty_score'], reverse=True)
for i, problem in enumerate(hard_problems_sorted[:5]):
print(f" {i+1}. {problem['task_id']} (์ ์ˆ˜: {problem['difficulty_score']}, ๋ ˆ๋ฒจ: {problem['difficulty_level']})")
return stats
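# Minimal programmatic use (a sketch; the JSONL path below is illustrative):
#   problems = load_jsonl('/path/to/HumanEvalPlus.jsonl')
#   stats = analyze_benchmark_difficulty(problems, 'humaneval')
#   print(stats['difficulty_stats']['mean'], stats['difficulty_distribution'])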
def save_analysis_results(stats, benchmark_name, output_dir):
"""๋ถ„์„ ๊ฒฐ๊ณผ ์ €์žฅ"""
analysis_dir = os.path.join(output_dir, benchmark_name)
os.makedirs(analysis_dir, exist_ok=True)
    # Full analysis results
full_analysis_file = os.path.join(analysis_dir, f"{benchmark_name}_difficulty_analysis.json")
with open(full_analysis_file, 'w', encoding='utf-8') as f:
json.dump(stats, f, indent=2, ensure_ascii=False)
    # Summary report
summary_file = os.path.join(analysis_dir, f"{benchmark_name}_difficulty_summary.txt")
with open(summary_file, 'w', encoding='utf-8') as f:
f.write(f"{benchmark_name.upper()} ๋‚œ์ด๋„ ๋ถ„์„ ์š”์•ฝ\n")
f.write("="*60 + "\n\n")
f.write(f"์ƒ์„ฑ ์‹œ๊ฐ„: {datetime.now().isoformat()}\n\n")
f.write(f"๐Ÿ“Š ์ „์ฒด ํ†ต๊ณ„:\n")
f.write(f" ์ด ๋ฌธ์ œ ์ˆ˜: {stats['total_problems']}๊ฐœ\n")
f.write(f" ํ‰๊ท  ๋‚œ์ด๋„ ์ ์ˆ˜: {stats['difficulty_stats']['mean']}\n")
f.write(f" ํ‰๊ท  ์ฝ”๋“œ ๋ณต์žก๋„: {stats['complexity_stats']['mean']}\n\n")
f.write(f"๐Ÿ“ˆ ๋‚œ์ด๋„ ๋ถ„ํฌ:\n")
for level, count in stats['difficulty_distribution'].items():
percentage = round(count / stats['total_problems'] * 100, 1)
f.write(f" {level}: {count}๊ฐœ ({percentage}%)\n")
f.write(f"\n๐Ÿท๏ธ ์ฃผ์š” ํ‚ค์›Œ๋“œ:\n")
for category, freq_dict in stats['keyword_frequency'].items():
if freq_dict:
f.write(f" {category}: {', '.join(freq_dict.keys())}\n")
print(f"\n๐Ÿ’พ ๋ถ„์„ ๊ฒฐ๊ณผ๊ฐ€ ์ €์žฅ๋˜์—ˆ์Šต๋‹ˆ๋‹ค:")
print(f" ์ „์ฒด ๋ถ„์„: {full_analysis_file}")
print(f" ์š”์•ฝ ๋ณด๊ณ ์„œ: {summary_file}")
def main():
    parser = argparse.ArgumentParser(description='Analyze benchmark problem difficulty')
parser.add_argument('--benchmark', type=str, default='all',
choices=['all', 'humaneval', 'mbpp'],
                        help='Benchmark to analyze (all = every benchmark)')
parser.add_argument('--save', action='store_true',
                        help='Save the results to files')
parser.add_argument('--output_dir', type=str,
default='/home/ubuntu/RLVR/TestTime-RLVR-v2/test/analysis_results',
                        help='Output directory')
parser.add_argument('--detailed', action='store_true',
                        help='Print detailed analysis results')
args = parser.parse_args()
    # Data paths
base_dir = '/home/ubuntu/RLVR/TestTime-RLVR-v2'
humaneval_path = f'{base_dir}/evaluation/code_eval/data/HumanEvalPlus.jsonl'
mbpp_path = f'{base_dir}/evaluation/code_eval/data/MbppPlus.jsonl'
os.makedirs(args.output_dir, exist_ok=True)
print("๐Ÿš€ TestTime RLVR ๋ฒค์น˜๋งˆํฌ ๋‚œ์ด๋„ ๋ถ„์„ ๋„๊ตฌ")
print("="*80)
all_results = {}
if args.benchmark in ['all', 'humaneval']:
print("\n")
problems = load_jsonl(humaneval_path)
if problems:
stats = analyze_benchmark_difficulty(problems, 'humaneval')
all_results['humaneval'] = stats
if args.save and stats:
save_analysis_results(stats, 'humaneval', args.output_dir)
if args.benchmark in ['all', 'mbpp']:
print("\n")
problems = load_jsonl(mbpp_path)
if problems:
stats = analyze_benchmark_difficulty(problems, 'mbpp')
all_results['mbpp'] = stats
if args.save and stats:
save_analysis_results(stats, 'mbpp', args.output_dir)
    # Benchmark comparison
if len(all_results) > 1:
print("\n" + "="*80)
print("๐Ÿ”„ ๋ฒค์น˜๋งˆํฌ ๋น„๊ต ๋ถ„์„")
print("="*80)
for benchmark, stats in all_results.items():
print(f"\n๐Ÿ“Š {benchmark.upper()}:")
print(f" ์ด ๋ฌธ์ œ: {stats['total_problems']}๊ฐœ")
print(f" ํ‰๊ท  ๋‚œ์ด๋„: {stats['difficulty_stats']['mean']}")
print(f" ํ‰๊ท  ๋ณต์žก๋„: {stats['complexity_stats']['mean']}")
easy_count = stats['difficulty_distribution'].get('Easy', 0)
hard_count = stats['difficulty_distribution'].get('Hard', 0) + stats['difficulty_distribution'].get('Very Hard', 0)
print(f" ์‰ฌ์šด ๋ฌธ์ œ: {easy_count}๊ฐœ, ์–ด๋ ค์šด ๋ฌธ์ œ: {hard_count}๊ฐœ")
# ์‚ฌ์šฉ๋ฒ• ์•ˆ๋‚ด
print("\n" + "="*80)
print("๐Ÿ’ก ์‚ฌ์šฉ๋ฒ•")
print("="*80)
print("ํŠน์ • ๋ฒค์น˜๋งˆํฌ๋งŒ ๋ถ„์„:")
print(" python test/analyze_difficulty.py --benchmark mbpp --save")
print(" python test/analyze_difficulty.py --benchmark humaneval --detailed")
print("\n์ „์ฒด ๋ถ„์„ ๋ฐ ์ €์žฅ:")
print(" python test/analyze_difficulty.py --save")
if __name__ == '__main__':
main()