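"""Aggregate per-counterfactual benchmark results into summary DataFrames.

Pipeline: average over counterfactuals -> average over layers -> tabulate one
row per submission -> take the per-column max across duplicate submissions of
a method -> average across interventions.
"""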
import json
import os
import pandas as pd
import numpy as np
from typing import List, Dict, Any, Tuple
from collections import defaultdict
def average_counterfactuals(json_files: List[Dict[str, Any]]) -> List[Dict[str, Any]]:
    """
    Averages scores across counterfactuals for each layer.
    """
    processed_jsons = []
    for json_file in json_files:
        new_json = {
            'method_name': json_file['method_name'],
            'results': []
        }
        for result in json_file['results']:
            new_result = {
                'model_id': result['model_id'],
                'task_scores': {}
            }
            for task, scores in result['task_scores'].items():
                new_scores = []
                for layer_data in scores:
                    new_layer_data = {
                        'layer': layer_data['layer'],
                        'layer_scores': []
                    }
                    for intervention_data in layer_data['layer_scores']:
                        cf_scores = [cf['score'] for cf in intervention_data['counterfactual_scores']]
                        # Guard against empty counterfactual lists (np.mean([])
                        # warns and returns NaN) and against propagated NaNs
                        avg_score = float(np.mean(cf_scores)) if cf_scores else 0.0
                        if np.isnan(avg_score):
                            avg_score = 0.0
                        new_layer_data['layer_scores'].append({
                            'intervention': intervention_data['intervention'],
                            'average_score': avg_score
                        })
                    new_scores.append(new_layer_data)
                new_result['task_scores'][task] = new_scores
            new_json['results'].append(new_result)
        processed_jsons.append(new_json)
    return processed_jsons
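# Illustrative sketch of the input structure this pipeline assumes, inferred
# from the field accesses above; every concrete value below (method, model,
# task, intervention, scores) is hypothetical, not from the original file.
_EXAMPLE_SUBMISSION = {
    'method_name': 'example_method',
    'results': [{
        'model_id': 'example_model',
        'task_scores': {
            'example_task': [{
                'layer': 0,
                'layer_scores': [{
                    'intervention': ['output'],
                    'counterfactual_scores': [{'score': 0.8}, {'score': 0.6}],
                }],
            }],
        },
    }],
}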
def find_layer_averages(json_files: List[Dict[str, Any]]) -> List[Dict[str, Any]]:
    """
    Averages scores across layers for each intervention.
    """
    processed_jsons = []
    for json_file in json_files:
        new_json = {
            'method_name': json_file['method_name'],
            'results': []
        }
        for result in json_file['results']:
            new_result = {
                'model_id': result['model_id'],
                'task_scores': {}
            }
            for task, scores in result['task_scores'].items():
                # Group by intervention first; a tuple key avoids mangling
                # intervention names that themselves contain underscores
                intervention_scores = defaultdict(list)
                for layer_data in scores:
                    for intervention_data in layer_data['layer_scores']:
                        intervention_key = tuple(intervention_data['intervention'])
                        intervention_scores[intervention_key].append(intervention_data['average_score'])
                # Average across layers for each intervention
                new_result['task_scores'][task] = [
                    {
                        'intervention': list(intervention),
                        'average_score': float(np.mean(layer_scores)) if layer_scores else 0.0
                    }
                    for intervention, layer_scores in intervention_scores.items()
                ]
            new_json['results'].append(new_result)
        processed_jsons.append(new_json)
    return processed_jsons
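# For example (hypothetical numbers): if layers 0 and 1 both score the
# intervention ['output'] at 0.4 and 0.6, find_layer_averages emits
# [{'intervention': ['output'], 'average_score': 0.5}] for that task.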
def create_summary_dataframe(json_files: List[Dict[str, Any]]) -> pd.DataFrame:
    """
    Creates a summary DataFrame with methods as rows, an overall 'Average'
    column first, and MODEL_TASK_INTERVENTION columns after it.
    Duplicate method names are kept distinct with a counter suffix.
    """
    data = {}
    method_counters = defaultdict(int)
    for json_file in json_files:
        method_name = json_file['method_name']
        # Count submissions per method name and suffix each one
        # (_1, _2, ...) so duplicate method names stay distinct
        method_counters[method_name] += 1
        unique_method_name = f"{method_name}_{method_counters[method_name]}"
        method_scores = []
        for result in json_file['results']:
            model = result['model_id']
            for task, scores in result['task_scores'].items():
                for score_data in scores:
                    intervention = '_'.join(score_data['intervention'])
                    column = f"{model}_{task}_{intervention}"
                    score = f"{score_data['average_score']:.3f}"
                    method_scores.append((column, score))
        # Sort by column names for consistency
        method_scores.sort(key=lambda x: x[0])
        scores_only = [float(score) for _, score in method_scores]
        avg_score = np.mean(scores_only) if scores_only else 0.0
        # Add the overall average as the first column
        data[unique_method_name] = {
            'Average': f"{avg_score:.3f}",
            **{col: score for col, score in method_scores}
        }
    df = pd.DataFrame.from_dict(data, orient='index')
    return df
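# The resulting frame has one row per submission (e.g. 'probe_1', 'probe_2'
# for two hypothetical submissions named 'probe'), an 'Average' column first,
# and one MODEL_TASK_INTERVENTION column per score.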
def aggregate_methods(df: pd.DataFrame) -> pd.DataFrame:
    """
    Aggregates rows with the same base method name by taking the max value for each column.
    """
    # Create a copy of the DataFrame
    df_copy = df.copy()
    # Strip the _1, _2, ... suffixes added by create_summary_dataframe;
    # rsplit keeps base names that themselves contain underscores intact
    base_methods = [name.rsplit('_', 1)[0] if '_' in name and name.rsplit('_', 1)[1].isdigit()
                    else name for name in df_copy.index]
    df_copy.index = base_methods
    # Convert scores to numeric values
    def extract_score(score_str):
        if isinstance(score_str, str):
            return float(score_str)
        return 0.0
    numeric_df = df_copy.applymap(extract_score)
    # Group by base method name and take the max
    aggregated_df = numeric_df.groupby(level=0).max().round(3)
    # Convert back to string format
    aggregated_df = aggregated_df.applymap(lambda x: f"{x:.3f}")
    return aggregated_df
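# For example (hypothetical rows): 'probe_1' and 'probe_2' collapse into a
# single 'probe' row holding the column-wise max of the two submissions.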
def create_intervention_averaged_df(df: pd.DataFrame) -> pd.DataFrame:
    """
    Creates a DataFrame where columns are model_task and cells are averaged over interventions.
    """
    # Create a copy of the DataFrame
    df_copy = df.copy()
    # Remove the Average column if it exists
    if 'Average' in df_copy.columns:
        df_copy = df_copy.drop('Average', axis=1)
    # Function to extract score value from string
    def extract_score(score_str):
        if isinstance(score_str, str):
            return float(score_str.split()[0])
        return 0.0
    # Convert all scores to numeric values
    numeric_df = df_copy.applymap(extract_score)
    # Group columns by model_task (assumes model ids and task names
    # contain no underscores of their own)
    model_task_groups = {}
    for col in numeric_df.columns:
        model_task = '_'.join(col.split('_')[:2])  # Get model_task part
        if model_task not in model_task_groups:
            model_task_groups[model_task] = []
        model_task_groups[model_task].append(col)
    # Create new DataFrame with averaged intervention scores
    averaged_df = pd.DataFrame({
        model_task: numeric_df[cols].mean(axis=1).round(3)
        for model_task, cols in model_task_groups.items()
    })
    # Add overall average column
    averaged_df['Average'] = averaged_df.mean(axis=1).round(3)
    # Sort by Average column
    averaged_df = averaged_df.sort_values('Average', ascending=False)
    return averaged_df
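# For example (hypothetical columns): 'gpt2_ioi_output' and 'gpt2_ioi_query'
# average into one 'gpt2_ioi' column, and 'Average' is the row-wise mean of
# those model_task columns.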
def process_json_folder(folder_path: str) -> Tuple[pd.DataFrame, pd.DataFrame, pd.DataFrame]:
    """
    Processes all JSON files in a folder and returns three DataFrames:
    1. Detailed DataFrame showing all results including duplicates (with layer-averaged scores)
    2. Aggregated DataFrame showing the max score per column for each base method
    3. Intervention-averaged DataFrame showing means across interventions
    """
    json_files = []
    # Read all JSON files
    for filename in os.listdir(folder_path):
        if filename.endswith('.json'):
            with open(os.path.join(folder_path, filename), 'r') as f:
                json_files.append(json.load(f))
    # Process the files through each step
    averaged_cf = average_counterfactuals(json_files)
    layer_averaged = find_layer_averages(averaged_cf)
    detailed_df = create_summary_dataframe(layer_averaged)
    aggregated_df = aggregate_methods(detailed_df)
    intervention_averaged_df = create_intervention_averaged_df(aggregated_df)
    return detailed_df, aggregated_df, intervention_averaged_df
# Example usage:
if __name__ == "__main__":
    # Replace with your folder path
    folder_path = "./json_files"
    detailed_df, aggregated_df, intervention_averaged_df = process_json_folder(folder_path)
    print("Detailed Results (including duplicates):")
    print(detailed_df)
    print("\nAggregated Results (max scores per method):")
    print(aggregated_df)
    print("\nIntervention-Averaged Results:")
    print(intervention_averaged_df)
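    # Optional: persist the tables for later inspection (these filenames are
    # illustrative, not part of the original script)
    detailed_df.to_csv("detailed_results.csv")
    aggregated_df.to_csv("aggregated_results.csv")
    intervention_averaged_df.to_csv("intervention_averaged_results.csv")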