"""
CompI Phase 1.D: Command-Line Quality Evaluation Tool

Command-line interface for batch evaluation and analysis of generated images.

Usage:
    python src/generators/compi_phase1d_cli_evaluation.py --help
    python src/generators/compi_phase1d_cli_evaluation.py --analyze
    python src/generators/compi_phase1d_cli_evaluation.py --batch-score 4 3 4 4 3
"""

import os
import argparse
from datetime import datetime
from typing import Dict, List

import pandas as pd

from compi_phase1d_evaluate_quality import (
    parse_filename, get_image_metrics, load_existing_evaluations,
    save_evaluation, EVALUATION_CRITERIA, OUTPUT_DIR, EVAL_CSV
)
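
# NOTE: These helpers come from the companion script compi_phase1d_evaluate_quality.py.
# Their exact behavior is assumed here rather than re-verified:
#   parse_filename(fname)         -> dict of metadata (including the filename) or None
#   get_image_metrics(path)       -> dict of basic image metrics
#   load_existing_evaluations()   -> dict keyed by image filename with saved scores
#   save_evaluation(...)          -> persists one evaluation record (see EVAL_CSV)
# Running this file directly, as shown in the module docstring, places its own
# directory on sys.path, which is what makes this sibling import resolve.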


def setup_args():
    """Set up command-line arguments."""
    parser = argparse.ArgumentParser(
        description="CompI Phase 1.D: Command-Line Quality Evaluation",
        formatter_class=argparse.RawDescriptionHelpFormatter,
        epilog="""
Examples:
  # Analyze existing evaluations
  python %(prog)s --analyze

  # Batch score all unevaluated images (prompt_match, style, mood, quality, appeal)
  python %(prog)s --batch-score 4 3 4 4 3 --notes "Batch evaluation - good quality"

  # Generate a detailed report
  python %(prog)s --report --output evaluation_report.txt

  # List unevaluated images
  python %(prog)s --list-unevaluated
"""
    )

    parser.add_argument("--output-dir", default=OUTPUT_DIR,
                        help="Directory containing generated images")

    parser.add_argument("--analyze", action="store_true",
                        help="Display evaluation summary and statistics")
    parser.add_argument("--report", action="store_true",
                        help="Generate a detailed evaluation report")
    parser.add_argument("--output", "-o",
                        help="Output file for the report (default: stdout)")

    parser.add_argument("--batch-score", nargs=5, type=int,
                        metavar=("PROMPT", "STYLE", "MOOD", "QUALITY", "APPEAL"),
                        help="Batch score all unevaluated images (1-5 for each criterion)")
    parser.add_argument("--notes", default="CLI batch evaluation",
                        help="Notes for batch evaluation")

    parser.add_argument("--list-all", action="store_true",
                        help="List all images with evaluation status")
    parser.add_argument("--list-evaluated", action="store_true",
                        help="List only evaluated images")
    parser.add_argument("--list-unevaluated", action="store_true",
                        help="List only unevaluated images")

    parser.add_argument("--style", help="Filter by style")
    parser.add_argument("--mood", help="Filter by mood")

    return parser.parse_args()


def load_images(output_dir: str) -> List[Dict]:
    """Load and parse all images from the output directory."""
    if not os.path.exists(output_dir):
        print(f"❌ Output directory '{output_dir}' not found!")
        return []

    image_files = [f for f in os.listdir(output_dir) if f.lower().endswith('.png')]
    parsed_images = []

    for fname in image_files:
        metadata = parse_filename(fname)
        if metadata:
            parsed_images.append(metadata)

    return parsed_images


def filter_images(images: List[Dict], style: str = None, mood: str = None) -> List[Dict]:
    """Filter images by style and/or mood."""
    filtered = images

    if style:
        filtered = [img for img in filtered if img.get('style', '').lower() == style.lower()]
    if mood:
        filtered = [img for img in filtered if img.get('mood', '').lower() == mood.lower()]

    return filtered


def analyze_evaluations(existing_evals: Dict):
    """Display an evaluation analysis summary."""
    if not existing_evals:
        print("❌ No evaluations found.")
        return

    df = pd.DataFrame.from_dict(existing_evals, orient='index')

    print("📊 CompI Phase 1.D - Evaluation Analysis")
    print("=" * 50)
    print(f"Total Evaluated Images: {len(df)}")
    print()

    # Per-criterion score statistics
    print("📈 Score Statistics:")
    for criterion_key, criterion_info in EVALUATION_CRITERIA.items():
        if criterion_key in df.columns:
            mean_score = df[criterion_key].mean()
            std_score = df[criterion_key].std()
            min_score = df[criterion_key].min()
            max_score = df[criterion_key].max()
            print(f"  {criterion_info['name']:20}: {mean_score:.2f} ± {std_score:.2f} (range: {min_score}-{max_score})")

    print()

    if 'style' in df.columns and 'prompt_match' in df.columns:
        print("🎨 Top Performing Styles (by Prompt Match):")
        style_scores = df.groupby('style')['prompt_match'].mean().sort_values(ascending=False)
        for style, score in style_scores.head(5).items():
            print(f"  {style:15}: {score:.2f}")
        print()

    if 'mood' in df.columns and 'creative_appeal' in df.columns:
        print("🎭 Top Performing Moods (by Creative Appeal):")
        mood_scores = df.groupby('mood')['creative_appeal'].mean().sort_values(ascending=False)
        for mood, score in mood_scores.head(5).items():
            print(f"  {mood:15}: {score:.2f}")
        print()


def generate_detailed_report(existing_evals: Dict) -> str:
    """Generate a detailed evaluation report."""
    if not existing_evals:
        return "No evaluations found."

    df = pd.DataFrame.from_dict(existing_evals, orient='index')

    report_lines = [
        "# CompI Phase 1.D - Detailed Evaluation Report",
        f"Generated: {datetime.now().strftime('%Y-%m-%d %H:%M:%S')}",
        f"Total Images Evaluated: {len(df)}",
        "",
        "## Overall Performance Summary"
    ]

    for criterion_key, criterion_info in EVALUATION_CRITERIA.items():
        if criterion_key in df.columns:
            mean_score = df[criterion_key].mean()
            std_score = df[criterion_key].std()
            report_lines.append(f"- **{criterion_info['name']}**: {mean_score:.2f} ± {std_score:.2f}")

    report_lines.extend([
        "",
        "## Score Distribution Analysis"
    ])

    for criterion_key, criterion_info in EVALUATION_CRITERIA.items():
        if criterion_key in df.columns:
            scores = df[criterion_key]
            report_lines.extend([
                "",
                f"### {criterion_info['name']}",
                f"- Mean: {scores.mean():.2f}",
                f"- Median: {scores.median():.2f}",
                f"- Mode: {scores.mode().iloc[0] if not scores.mode().empty else 'N/A'}",
                f"- Range: {scores.min()}-{scores.max()}",
                "- Distribution: " + " | ".join([f"{i}⭐: {(scores == i).sum()}" for i in range(1, 6)])
            ])

    if 'style' in df.columns:
        report_lines.extend([
            "",
            "## Style Performance Analysis"
        ])

        for criterion_key in EVALUATION_CRITERIA.keys():
            if criterion_key in df.columns:
                style_performance = df.groupby('style')[criterion_key].agg(['mean', 'count']).sort_values('mean', ascending=False)
                report_lines.extend([
                    "",
                    f"### {EVALUATION_CRITERIA[criterion_key]['name']} by Style",
                ])

                for style, row in style_performance.iterrows():
                    report_lines.append(f"- {style}: {row['mean']:.2f} (n={int(row['count'])})")

    report_lines.extend([
        "",
        "## Recommendations",
        "",
        "### Areas for Improvement"
    ])

    criterion_means = {}
    for criterion_key, criterion_info in EVALUATION_CRITERIA.items():
        if criterion_key in df.columns:
            criterion_means[criterion_info['name']] = df[criterion_key].mean()

    if criterion_means:
        lowest_criteria = sorted(criterion_means.items(), key=lambda x: x[1])[:2]
        for criterion_name, score in lowest_criteria:
            report_lines.append(f"- Focus on improving **{criterion_name}** (current: {score:.2f}/5)")

    report_lines.extend([
        "",
        "### Best Practices",
        "- Continue systematic evaluation for trend analysis",
        "- Experiment with parameter adjustments for low-scoring areas",
        "- Consider A/B testing different generation approaches",
        "- Document successful style/mood combinations for reuse"
    ])

    return "\n".join(report_lines)


def batch_evaluate_images(images: List[Dict], scores: List[int], notes: str, output_dir: str):
    """Batch evaluate all unevaluated images with a single set of scores."""
    existing_evals = load_existing_evaluations()
    unevaluated = [img for img in images if img['filename'] not in existing_evals]

    if not unevaluated:
        print("✅ All images are already evaluated!")
        return

    print(f"📦 Batch evaluating {len(unevaluated)} images...")

    criteria_keys = list(EVALUATION_CRITERIA.keys())
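    # Added safety check (not in the original script): dict(zip(...)) below would
    # silently truncate if the criteria list and the CLI scores ever differed in length.
    if len(scores) != len(criteria_keys):
        print(f"❌ Expected {len(criteria_keys)} scores ({', '.join(criteria_keys)}), got {len(scores)}.")
        return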
    score_dict = dict(zip(criteria_keys, scores))

    for img_data in unevaluated:
        fname = img_data["filename"]
        img_path = os.path.join(output_dir, fname)

        try:
            metrics = get_image_metrics(img_path)
            save_evaluation(fname, img_data, score_dict, notes, metrics)
            print(f"  ✅ Evaluated: {fname}")
        except Exception as e:
            print(f"  ❌ Error evaluating {fname}: {e}")

    print("🎉 Batch evaluation completed!")


def list_images(images: List[Dict], existing_evals: Dict, show_evaluated: bool = True, show_unevaluated: bool = True):
    """List images with their evaluation status."""
    print(f"📋 Image List ({len(images)} total)")
    print("-" * 80)

    for img_data in images:
        fname = img_data["filename"]
        is_evaluated = fname in existing_evals

        if (show_evaluated and is_evaluated) or (show_unevaluated and not is_evaluated):
            status = "✅" if is_evaluated else "❌"
            prompt = img_data.get('prompt', 'unknown')[:30]
            style = img_data.get('style', 'unknown')[:15]
            mood = img_data.get('mood', 'unknown')[:15]

            print(f"{status} {fname}")
            print(f"   Prompt: {prompt}... | Style: {style} | Mood: {mood}")

            if is_evaluated:
                eval_data = existing_evals[fname]
                scores = [f"{k}:{eval_data.get(k, 'N/A')}" for k in EVALUATION_CRITERIA.keys() if k in eval_data]
                print(f"   Scores: {' | '.join(scores[:3])}...")
            print()


def main():
    """Main CLI entry point."""
    args = setup_args()

    # Load and parse generated images
    images = load_images(args.output_dir)
    if not images:
        return

    # Apply optional style/mood filters
    images = filter_images(images, args.style, args.mood)

    # Load previously saved evaluations
    existing_evals = load_existing_evaluations()

    if args.analyze:
        analyze_evaluations(existing_evals)

    elif args.report:
        report = generate_detailed_report(existing_evals)
        if args.output:
            with open(args.output, 'w', encoding='utf-8') as f:
                f.write(report)
            print(f"📄 Report saved to: {args.output}")
        else:
            print(report)

    elif args.batch_score:
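        # Added guard (an assumption, not part of the original flow): argparse enforces
        # the argument count but not the value range, so reject out-of-range scores here.
        if any(not 1 <= s <= 5 for s in args.batch_score):
            print("❌ Batch scores must be integers between 1 and 5.")
            return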
        batch_evaluate_images(images, args.batch_score, args.notes, args.output_dir)

    elif args.list_all:
        list_images(images, existing_evals, True, True)

    elif args.list_evaluated:
        list_images(images, existing_evals, True, False)

    elif args.list_unevaluated:
        list_images(images, existing_evals, False, True)

    else:
        print("❌ No command specified. Use --help for usage information.")


if __name__ == "__main__":
    main()