"""
Difficulty analysis tool for the MBPP and HumanEval benchmarks.

Analyzes the problems in each benchmark along several criteria and reports
the resulting difficulty distribution:

- Code complexity (line count, functions, conditionals, loops)
- Problem description length and complexity
- Number of test cases
- Types of algorithms / data structures required
"""

import os
import sys
import json
import re
import argparse
from pathlib import Path
from datetime import datetime
from collections import defaultdict, Counter
import ast

sys.path.append('/home/ubuntu/RLVR/TestTime-RLVR-v2')


def load_jsonl(file_path):
    """Load a JSONL file; return an empty list if the file does not exist."""
    if not os.path.exists(file_path):
        return []

    with open(file_path, 'r', encoding='utf-8') as f:
        return [json.loads(line.strip()) for line in f if line.strip()]


def analyze_code_complexity(code):
    """Analyze the complexity of a Python code snippet."""
    try:
        tree = ast.parse(code)
    except (SyntaxError, ValueError):
        # Unparseable code: fall back to a line count with a default score.
        return {
            'lines': len(code.split('\n')),
            'functions': 0,
            'conditionals': 0,
            'loops': 0,
            'complexity_score': 1
        }

    stats = {
        'lines': len(code.split('\n')),
        'functions': 0,
        'conditionals': 0,
        'loops': 0,
        'complexity_score': 1
    }

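    # Count the AST constructs that feed into the complexity heuristic below.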
    for node in ast.walk(tree):
        if isinstance(node, ast.FunctionDef):
            stats['functions'] += 1
        elif isinstance(node, (ast.If, ast.IfExp)):
            stats['conditionals'] += 1
        elif isinstance(node, (ast.For, ast.While)):
            stats['loops'] += 1

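    # Weighted heuristic: functions and loops weigh 2 points each, conditionals
    # 1.5, and each line of code 0.1.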
    stats['complexity_score'] = (
        stats['lines'] * 0.1 +
        stats['functions'] * 2 +
        stats['conditionals'] * 1.5 +
        stats['loops'] * 2
    )

    return stats


def extract_keywords(text):
    """Extract algorithm / data-structure keywords from text."""
    keywords = {
        'data_structures': ['list', 'array', 'dict', 'set', 'tuple', 'stack', 'queue', 'heap'],
        'algorithms': ['sort', 'search', 'binary', 'recursive', 'dynamic', 'greedy', 'graph'],
        'math': ['prime', 'fibonacci', 'factorial', 'gcd', 'lcm', 'sqrt', 'power'],
        'string': ['string', 'substring', 'regex', 'pattern', 'replace', 'split', 'join'],
        'logic': ['condition', 'boolean', 'logic', 'and', 'or', 'not', 'if', 'else']
    }

    text_lower = text.lower()
    found_keywords = defaultdict(list)

    for category, words in keywords.items():
        for word in words:
            if word in text_lower:
                found_keywords[category].append(word)

    return dict(found_keywords)


def analyze_problem_difficulty(problem):
    """Analyze the difficulty of a single problem."""
    task_id = problem.get('task_id', 'Unknown')
    prompt = problem.get('prompt', '')
    canonical_solution = problem.get('canonical_solution', '')
    test = problem.get('test', '')

    analysis = {
        'task_id': task_id,
        'prompt_length': len(prompt),
        'solution_length': len(canonical_solution),
        'test_length': len(test)
    }

    if canonical_solution:
        code_stats = analyze_code_complexity(canonical_solution)
        analysis.update(code_stats)

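    # Count `assert` occurrences in the test code as a proxy for the number of
    # test cases.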
    test_cases = len(re.findall(r'assert', test)) if test else 0
    analysis['test_cases'] = test_cases

    combined_text = prompt + ' ' + canonical_solution
    keywords = extract_keywords(combined_text)
    analysis['keywords'] = keywords
    analysis['keyword_count'] = sum(len(words) for words in keywords.values())

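    # Combine the signals into a single score clamped to [0, 100]; the weights
    # are heuristic, with code complexity contributing the most.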
    difficulty_score = min(100, max(0, (
        analysis.get('complexity_score', 1) * 10 +
        analysis['prompt_length'] * 0.01 +
        analysis['test_cases'] * 5 +
        analysis['keyword_count'] * 2
    )))

    analysis['difficulty_score'] = round(difficulty_score, 2)

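    # Bucket the score: < 20 Easy, < 50 Medium, < 80 Hard, otherwise Very Hard.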
    if difficulty_score < 20:
        analysis['difficulty_level'] = 'Easy'
    elif difficulty_score < 50:
        analysis['difficulty_level'] = 'Medium'
    elif difficulty_score < 80:
        analysis['difficulty_level'] = 'Hard'
    else:
        analysis['difficulty_level'] = 'Very Hard'

    return analysis


def analyze_benchmark_difficulty(problems, benchmark_name):
    """Analyze the difficulty of an entire benchmark."""
    print(f"\n{benchmark_name.upper()} difficulty analysis")
    print("="*60)

    if not problems:
        print("No problems to analyze.")
        return {}

    analyses = []
    for problem in problems:
        analysis = analyze_problem_difficulty(problem)
        analyses.append(analysis)

    difficulty_scores = [a['difficulty_score'] for a in analyses]
    complexity_scores = [a.get('complexity_score', 1) for a in analyses]
    prompt_lengths = [a['prompt_length'] for a in analyses]

    level_counts = Counter(a['difficulty_level'] for a in analyses)

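    # Aggregate keyword hits across all problems, per category.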
    all_keywords = defaultdict(list)
    for analysis in analyses:
        for category, words in analysis['keywords'].items():
            all_keywords[category].extend(words)

    keyword_freq = {category: Counter(words) for category, words in all_keywords.items()}

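    # Aggregate statistics; the median here is the upper median (no
    # interpolation for even-length lists).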
    stats = {
        'total_problems': len(analyses),
        'difficulty_distribution': dict(level_counts),
        'difficulty_stats': {
            'min': min(difficulty_scores),
            'max': max(difficulty_scores),
            'mean': round(sum(difficulty_scores) / len(difficulty_scores), 2),
            'median': round(sorted(difficulty_scores)[len(difficulty_scores) // 2], 2)
        },
        'complexity_stats': {
            'min': min(complexity_scores),
            'max': max(complexity_scores),
            'mean': round(sum(complexity_scores) / len(complexity_scores), 2)
        },
        'prompt_stats': {
            'min': min(prompt_lengths),
            'max': max(prompt_lengths),
            'mean': round(sum(prompt_lengths) / len(prompt_lengths), 2)
        },
        'keyword_frequency': {k: dict(v.most_common(5)) for k, v in keyword_freq.items()},
        'detailed_analyses': analyses
    }

    print(f"Analyzed {stats['total_problems']} problems in total")

    print("\nDifficulty distribution:")
    for level, count in level_counts.items():
        percentage = round(count / len(analyses) * 100, 1)
        print(f"  {level}: {count} ({percentage}%)")

    print("\nDifficulty score statistics:")
    print(f"  min: {stats['difficulty_stats']['min']}")
    print(f"  max: {stats['difficulty_stats']['max']}")
    print(f"  mean: {stats['difficulty_stats']['mean']}")
    print(f"  median: {stats['difficulty_stats']['median']}")

    print("\nCode complexity statistics:")
    print(f"  min: {stats['complexity_stats']['min']}")
    print(f"  max: {stats['complexity_stats']['max']}")
    print(f"  mean: {stats['complexity_stats']['mean']}")

    print("\nProblem description length statistics:")
    print(f"  min: {stats['prompt_stats']['min']} characters")
    print(f"  max: {stats['prompt_stats']['max']} characters")
    print(f"  mean: {stats['prompt_stats']['mean']} characters")

    print("\nTop keywords (top 3 per category):")
    for category, freq_dict in keyword_freq.items():
        if freq_dict:
            top_words = freq_dict.most_common(3)
            print(f"  {category}: {', '.join([f'{word}({count})' for word, count in top_words])}")

    hard_problems = [a for a in analyses if a['difficulty_level'] in ['Hard', 'Very Hard']]
    if hard_problems:
        print("\nSample of hard problems (top 5):")
        hard_problems_sorted = sorted(hard_problems, key=lambda x: x['difficulty_score'], reverse=True)
        for i, problem in enumerate(hard_problems_sorted[:5]):
            print(f"  {i+1}. {problem['task_id']} (score: {problem['difficulty_score']}, level: {problem['difficulty_level']})")

    return stats


def save_analysis_results(stats, benchmark_name, output_dir):
    """Save the analysis results to disk."""
    analysis_dir = os.path.join(output_dir, benchmark_name)
    os.makedirs(analysis_dir, exist_ok=True)

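    # Write the full analysis as JSON plus a short human-readable text summary.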
    full_analysis_file = os.path.join(analysis_dir, f"{benchmark_name}_difficulty_analysis.json")
    with open(full_analysis_file, 'w', encoding='utf-8') as f:
        json.dump(stats, f, indent=2, ensure_ascii=False)

    summary_file = os.path.join(analysis_dir, f"{benchmark_name}_difficulty_summary.txt")
    with open(summary_file, 'w', encoding='utf-8') as f:
        f.write(f"{benchmark_name.upper()} difficulty analysis summary\n")
        f.write("="*60 + "\n\n")
        f.write(f"Generated at: {datetime.now().isoformat()}\n\n")

        f.write("Overall statistics:\n")
        f.write(f"  total problems: {stats['total_problems']}\n")
        f.write(f"  mean difficulty score: {stats['difficulty_stats']['mean']}\n")
        f.write(f"  mean code complexity: {stats['complexity_stats']['mean']}\n\n")

        f.write("Difficulty distribution:\n")
        for level, count in stats['difficulty_distribution'].items():
            percentage = round(count / stats['total_problems'] * 100, 1)
            f.write(f"  {level}: {count} ({percentage}%)\n")

        f.write("\nTop keywords:\n")
        for category, freq_dict in stats['keyword_frequency'].items():
            if freq_dict:
                f.write(f"  {category}: {', '.join(freq_dict.keys())}\n")

    print("\nAnalysis results saved:")
    print(f"  full analysis: {full_analysis_file}")
    print(f"  summary report: {summary_file}")


def main():
    parser = argparse.ArgumentParser(description='Benchmark problem difficulty analysis')
    parser.add_argument('--benchmark', type=str, default='all',
                        choices=['all', 'humaneval', 'mbpp'],
                        help='Benchmark to analyze (all = every benchmark)')
    parser.add_argument('--save', action='store_true',
                        help='Save the results to files')
    parser.add_argument('--output_dir', type=str,
                        default='/home/ubuntu/RLVR/TestTime-RLVR-v2/test/analysis_results',
                        help='Output directory')
    parser.add_argument('--detailed', action='store_true',
                        help='Print detailed analysis results')

    args = parser.parse_args()

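    # Default locations of the HumanEval+ / MBPP+ benchmark data files.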
    base_dir = '/home/ubuntu/RLVR/TestTime-RLVR-v2'
    humaneval_path = f'{base_dir}/evaluation/code_eval/data/HumanEvalPlus.jsonl'
    mbpp_path = f'{base_dir}/evaluation/code_eval/data/MbppPlus.jsonl'

    os.makedirs(args.output_dir, exist_ok=True)

    print("TestTime RLVR benchmark difficulty analysis tool")
    print("="*80)

    all_results = {}

    if args.benchmark in ['all', 'humaneval']:
        print("\n")
        problems = load_jsonl(humaneval_path)
        if problems:
            stats = analyze_benchmark_difficulty(problems, 'humaneval')
            all_results['humaneval'] = stats

            if args.save and stats:
                save_analysis_results(stats, 'humaneval', args.output_dir)

    if args.benchmark in ['all', 'mbpp']:
        print("\n")
        problems = load_jsonl(mbpp_path)
        if problems:
            stats = analyze_benchmark_difficulty(problems, 'mbpp')
            all_results['mbpp'] = stats

            if args.save and stats:
                save_analysis_results(stats, 'mbpp', args.output_dir)

    if len(all_results) > 1:
        print("\n" + "="*80)
        print("Benchmark comparison")
        print("="*80)

        for benchmark, stats in all_results.items():
            print(f"\n{benchmark.upper()}:")
            print(f"  total problems: {stats['total_problems']}")
            print(f"  mean difficulty: {stats['difficulty_stats']['mean']}")
            print(f"  mean complexity: {stats['complexity_stats']['mean']}")

            easy_count = stats['difficulty_distribution'].get('Easy', 0)
            hard_count = stats['difficulty_distribution'].get('Hard', 0) + stats['difficulty_distribution'].get('Very Hard', 0)
            print(f"  easy problems: {easy_count}, hard problems: {hard_count}")

    print("\n" + "="*80)
    print("Usage")
    print("="*80)
    print("Analyze a single benchmark:")
    print("  python test/analyze_difficulty.py --benchmark mbpp --save")
    print("  python test/analyze_difficulty.py --benchmark humaneval --detailed")
    print("\nAnalyze everything and save the results:")
    print("  python test/analyze_difficulty.py --save")


if __name__ == '__main__':
    main()