Spaces:

alidenewade
/

model-point-clustering

Sleeping

App Files Community

alidenewade committed on May 29

Commit

9846b45

verified ·

1 Parent(s): e5a1f5c

Update app.py

Browse files

Files changed (1) hide show

app.py +125 -89

app.py CHANGED Viewed

@@ -2,11 +2,11 @@ import gradio as gr
 import numpy as np
 import pandas as pd
 from sklearn.cluster import KMeans
-from sklearn.metrics import pairwise_distances_argmin_min
-import seaborn as sns
 import matplotlib.pyplot as plt
 import io
-import os
 from PIL import Image
 # Define the paths for example data
@@ -23,10 +23,10 @@ EXAMPLE_FILES = {
 class Clusters:
     def __init__(self, loc_vars):
-        self.kmeans = KMeans(n_clusters=1000, random_state=0, n_init=10).fit(np.ascontiguousarray(loc_vars))
-        closest, _ = pairwise_distances_argmin_min(self.kmeans.cluster_centers_, np.ascontiguousarray(loc_vars))
-        rep_ids = pd.Series(data=(closest+1))
         rep_ids.name = 'policy_id'
         rep_ids.index.name = 'cluster_id'
         self.rep_ids = rep_ids
@@ -34,6 +34,7 @@ class Clusters:
         self.policy_count = self.agg_by_cluster(pd.DataFrame({'policy_count': [1] * len(loc_vars)}))['policy_count']
     def agg_by_cluster(self, df, agg=None):
         temp = df.copy()
         temp['cluster_id'] = self.kmeans.labels_
         temp = temp.set_index('cluster_id')
@@ -41,14 +42,17 @@ class Clusters:
         return temp.groupby(temp.index).agg(agg)
     def extract_reps(self, df):
         temp = pd.merge(self.rep_ids, df.reset_index(), how='left', on='policy_id')
         temp.index.name = 'cluster_id'
         return temp.drop('policy_id', axis=1)
     def extract_and_scale_reps(self, df, agg=None):
         if agg:
             cols = df.columns
             mult = pd.DataFrame({c: (self.policy_count if (c not in agg or agg[c] == 'sum') else 1) for c in cols})
             extracted_df = self.extract_reps(df)
             mult.index = extracted_df.index
             return extracted_df.mul(mult)
@@ -56,47 +60,57 @@ class Clusters:
             return self.extract_reps(df).mul(self.policy_count, axis=0)
     def compare(self, df, agg=None):
         source = self.agg_by_cluster(df, agg)
         target = self.extract_and_scale_reps(df, agg)
         return pd.DataFrame({'actual': source.stack(), 'estimate':target.stack()})
     def compare_total(self, df, agg=None):
         if agg:
             actual_values = {}
             for col in df.columns:
                 if agg.get(col, 'sum') == 'mean':
                     actual_values[col] = df[col].mean()
-                else:
                     actual_values[col] = df[col].sum()
             actual = pd.Series(actual_values)
             reps_unscaled = self.extract_reps(df)
             estimate_values = {}
             for col in df.columns:
                 if agg.get(col, 'sum') == 'mean':
                     weighted_sum = (reps_unscaled[col] * self.policy_count).sum()
                     total_weight = self.policy_count.sum()
                     estimate_values[col] = weighted_sum / total_weight if total_weight > 0 else 0
-                else:
                     estimate_values[col] = (reps_unscaled[col] * self.policy_count).sum()
             estimate = pd.Series(estimate_values)
-        else:
             actual = df.sum()
             estimate = self.extract_and_scale_reps(df).sum()
         error = np.where(actual != 0, estimate / actual - 1, 0)
         return pd.DataFrame({'actual': actual, 'estimate': estimate, 'error': error})
 def plot_cashflows_comparison(cfs_list, cluster_obj, titles):
     if not cfs_list or not cluster_obj or not titles:
         return None
     num_plots = len(cfs_list)
     if num_plots == 0:
         return None
     cols = 2
     rows = (num_plots + cols - 1) // cols
@@ -106,17 +120,11 @@ def plot_cashflows_comparison(cfs_list, cluster_obj, titles):
     for i, (df, title) in enumerate(zip(cfs_list, titles)):
         if i < len(axes):
             comparison = cluster_obj.compare_total(df)
-            # Plot using seaborn lineplot for cleaner aesthetics
-            data_to_plot = comparison[['actual', 'estimate']].reset_index()
-            data_melted = data_to_plot.melt(id_vars='index', var_name='Type', value_name='Value')
-            sns.lineplot(data=data_melted, x='index', y='Value', hue='Type', ax=axes[i])
-            axes[i].set_title(title)
             axes[i].set_xlabel('Time')
             axes[i].set_ylabel('Value')
-            axes[i].grid(True)
     for j in range(i + 1, len(axes)):
         fig.delaxes(axes[j])
@@ -129,7 +137,9 @@ def plot_cashflows_comparison(cfs_list, cluster_obj, titles):
     return img
 def plot_scatter_comparison(df_compare_output, title):
     if df_compare_output is None or df_compare_output.empty:
         fig, ax = plt.subplots(figsize=(12, 8))
         ax.text(0.5, 0.5, "No data to display", ha='center', va='center', fontsize=15)
         ax.set_title(title)
@@ -143,20 +153,17 @@ def plot_scatter_comparison(df_compare_output, title):
     fig, ax = plt.subplots(figsize=(12, 8))
     if not isinstance(df_compare_output.index, pd.MultiIndex) or df_compare_output.index.nlevels < 2:
-        gr.Warning("Scatter plot data is not in the expected multi-index format. Plotting raw actual vs estimate.")
-        sns.scatterplot(x='actual', y='estimate', data=df_compare_output, s=9, alpha=0.6, ax=ax)
     else:
-        # Prepare data for seaborn
-        plot_data = df_compare_output.reset_index()
-        level_1_name = df_compare_output.index.names[1]
         unique_levels = df_compare_output.index.get_level_values(1).unique()
         if len(unique_levels) > 1 and len(unique_levels) <= 10:
-            sns.scatterplot(x='actual', y='estimate', hue=level_1_name,
-                          data=plot_data, s=9, alpha=0.6, ax=ax)
-        else:
-            sns.scatterplot(x='actual', y='estimate', data=plot_data, s=9, alpha=0.6, ax=ax)
     ax.set_xlabel('Actual')
     ax.set_ylabel('Estimate')
@@ -169,9 +176,9 @@ def plot_scatter_comparison(df_compare_output, title):
         np.max([ax.get_xlim(), ax.get_ylim()]),
     ]
     if lims[0] != lims[1]:
-        ax.plot(lims, lims, 'r-', linewidth=0.5)
-        ax.set_xlim(lims)
-        ax.set_ylim(lims)
     buf = io.BytesIO()
     plt.savefig(buf, format='png', dpi=100)
@@ -180,15 +187,18 @@ def plot_scatter_comparison(df_compare_output, title):
     plt.close(fig)
     return img
 def process_files(cashflow_base_path, cashflow_lapse_path, cashflow_mort_path,
                   policy_data_path, pv_base_path, pv_lapse_path, pv_mort_path):
     try:
-        # Read files
         cfs = pd.read_excel(cashflow_base_path, index_col=0)
         cfs_lapse50 = pd.read_excel(cashflow_lapse_path, index_col=0)
         cfs_mort15 = pd.read_excel(cashflow_mort_path, index_col=0)
         pol_data_full = pd.read_excel(policy_data_path, index_col=0)
         required_cols = ['age_at_entry', 'policy_term', 'sum_assured', 'duration_mth']
         if all(col in pol_data_full.columns for col in required_cols):
             pol_data = pol_data_full[required_cols]
@@ -204,90 +214,104 @@ def process_files(cashflow_base_path, cashflow_lapse_path, cashflow_mort_path,
         scen_titles = ['Base', 'Lapse+50%', 'Mort+15%']
         results = {}
         mean_attrs = {'age_at_entry':'mean', 'policy_term':'mean', 'duration_mth':'mean', 'sum_assured': 'sum'}
-        # Cashflow Calibration
         cluster_cfs = Clusters(cfs)
-        results.update({
-            'cf_total_base_table': cluster_cfs.compare_total(cfs),
-            'cf_policy_attrs_total': cluster_cfs.compare_total(pol_data, agg=mean_attrs),
-            'cf_pv_total_base': cluster_cfs.compare_total(pvs),
-            'cf_pv_total_lapse': cluster_cfs.compare_total(pvs_lapse50),
-            'cf_pv_total_mort': cluster_cfs.compare_total(pvs_mort15),
-            'cf_cashflow_plot': plot_cashflows_comparison(cfs_list, cluster_cfs, scen_titles),
-            'cf_scatter_cashflows_base': plot_scatter_comparison(cluster_cfs.compare(cfs), 'Cashflow Calib. - Cashflows (Base)')
-        })
-        # Policy Attribute Calibration
         if not pol_data.empty and (pol_data.max() - pol_data.min()).all() != 0:
-            loc_vars_attrs = (pol_data - pol_data.min()) / (pol_data.max() - pol_data.min())
         else:
             gr.Warning("Policy data for attribute calibration is empty or has no variance. Skipping attribute calibration plots.")
             loc_vars_attrs = pol_data
         if not loc_vars_attrs.empty:
             cluster_attrs = Clusters(loc_vars_attrs)
-            results.update({
-                'attr_total_cf_base': cluster_attrs.compare_total(cfs),
-                'attr_policy_attrs_total': cluster_attrs.compare_total(pol_data, agg=mean_attrs),
-                'attr_total_pv_base': cluster_attrs.compare_total(pvs),
-                'attr_cashflow_plot': plot_cashflows_comparison(cfs_list, cluster_attrs, scen_titles),
-                'attr_scatter_cashflows_base': plot_scatter_comparison(cluster_attrs.compare(cfs), 'Policy Attr. Calib. - Cashflows (Base)')
-            })
         else:
-            results.update({
-                'attr_total_cf_base': pd.DataFrame(),
-                'attr_policy_attrs_total': pd.DataFrame(),
-                'attr_total_pv_base': pd.DataFrame(),
-                'attr_cashflow_plot': None,
-                'attr_scatter_cashflows_base': None
-            })
-        # Present Value Calibration
         cluster_pvs = Clusters(pvs)
-        results.update({
-            'pv_total_cf_base': cluster_pvs.compare_total(cfs),
-            'pv_policy_attrs_total': cluster_pvs.compare_total(pol_data, agg=mean_attrs),
-            'pv_total_pv_base': cluster_pvs.compare_total(pvs),
-            'pv_total_pv_lapse': cluster_pvs.compare_total(pvs_lapse50),
-            'pv_total_pv_mort': cluster_pvs.compare_total(pvs_mort15),
-            'pv_cashflow_plot': plot_cashflows_comparison(cfs_list, cluster_pvs, scen_titles),
-            'pv_scatter_pvs_base': plot_scatter_comparison(cluster_pvs.compare(pvs), 'PV Calib. - PVs (Base)')
-        })
-        # Summary Comparison Plot
         def get_error_safe(compare_result, col_name=None):
             if compare_result.empty:
                 return np.nan
             if col_name and col_name in compare_result.index:
                 return abs(compare_result.loc[col_name, 'error'])
             else:
                 return abs(compare_result['error']).mean()
         key_pv_col = None
         for potential_col in ['PV_NetCF', 'pv_net_cf', 'net_cf_pv', 'PV_Net_CF']:
             if potential_col in pvs.columns:
                 key_pv_col = potential_col
                 break
-        error_data = {
-            'CF Calib.': [
-                get_error_safe(cluster_cfs.compare_total(pvs), key_pv_col),
-                get_error_safe(cluster_cfs.compare_total(pvs_lapse50), key_pv_col),
-                get_error_safe(cluster_cfs.compare_total(pvs_mort15), key_pv_col)
-            ],
-            'Attr Calib.': [
-                get_error_safe(cluster_attrs.compare_total(pvs), key_pv_col) if not loc_vars_attrs.empty else np.nan,
-                get_error_safe(cluster_attrs.compare_total(pvs_lapse50), key_pv_col) if not loc_vars_attrs.empty else np.nan,
-                get_error_safe(cluster_attrs.compare_total(pvs_mort15), key_pv_col) if not loc_vars_attrs.empty else np.nan
-            ] if not loc_vars_attrs.empty else [np.nan, np.nan, np.nan],
-            'PV Calib.': [
-                get_error_safe(cluster_pvs.compare_total(pvs), key_pv_col),
-                get_error_safe(cluster_pvs.compare_total(pvs_lapse50), key_pv_col),
-                get_error_safe(cluster_pvs.compare_total(pvs_mort15), key_pv_col)
             ]
-        }
         summary_df = pd.DataFrame(error_data, index=['Base', 'Lapse+50%', 'Mort+15%'])
         fig_summary, ax_summary = plt.subplots(figsize=(10, 6))
@@ -317,6 +341,7 @@ def process_files(cashflow_base_path, cashflow_lapse_path, cashflow_mort_path,
         gr.Error(f"Error processing files: {str(e)}")
         return {"error": f"Error processing files: {str(e)}"}
 def create_interface():
     with gr.Blocks(title="Cluster Model Points Analysis") as demo:
         gr.Markdown("""
@@ -394,30 +419,37 @@ def create_interface():
                         pv_total_pv_lapse_out = gr.Dataframe(label="PVs - Lapse Stress Total")
                         pv_total_pv_mort_out = gr.Dataframe(label="PVs - Mortality Stress Total")
         def get_all_output_components():
             return [
                 summary_plot_output,
                 cf_total_base_table_out, cf_policy_attrs_total_out,
                 cf_cashflow_plot_out, cf_scatter_cashflows_base_out,
                 cf_pv_total_base_out, cf_pv_total_lapse_out, cf_pv_total_mort_out,
                 attr_total_cf_base_out, attr_policy_attrs_total_out,
                 attr_cashflow_plot_out, attr_scatter_cashflows_base_out, attr_total_pv_base_out,
                 pv_total_cf_base_out, pv_policy_attrs_total_out,
                 pv_cashflow_plot_out, pv_scatter_pvs_base_out,
                 pv_total_pv_base_out, pv_total_pv_lapse_out, pv_total_pv_mort_out
             ]
         def handle_analysis(f1, f2, f3, f4, f5, f6, f7):
             files = [f1, f2, f3, f4, f5, f6, f7]
-            file_paths = []
             for i, f_obj in enumerate(files):
                 if f_obj is None:
                     gr.Error(f"Missing file input for argument {i+1}. Please upload all files or load examples.")
                     return [None] * len(get_all_output_components())
                 if hasattr(f_obj, 'name') and isinstance(f_obj.name, str):
                     file_paths.append(f_obj.name)
                 elif isinstance(f_obj, str):
                      file_paths.append(f_obj)
                 else:
@@ -431,11 +463,14 @@ def create_interface():
             return [
                 results.get('summary_plot'),
                 results.get('cf_total_base_table'), results.get('cf_policy_attrs_total'),
                 results.get('cf_cashflow_plot'), results.get('cf_scatter_cashflows_base'),
                 results.get('cf_pv_total_base'), results.get('cf_pv_total_lapse'), results.get('cf_pv_total_mort'),
                 results.get('attr_total_cf_base'), results.get('attr_policy_attrs_total'),
                 results.get('attr_cashflow_plot'), results.get('attr_scatter_cashflows_base'), results.get('attr_total_pv_base'),
                 results.get('pv_total_cf_base'), results.get('pv_policy_attrs_total'),
                 results.get('pv_cashflow_plot'), results.get('pv_scatter_pvs_base'),
                 results.get('pv_total_pv_base'), results.get('pv_total_pv_lapse'), results.get('pv_total_pv_mort')
@@ -448,6 +483,7 @@ def create_interface():
             outputs=get_all_output_components()
         )
         def load_example_files():
             missing_files = [fp for fp in EXAMPLE_FILES.values() if not os.path.exists(fp)]
             if missing_files:

 import numpy as np
 import pandas as pd
 from sklearn.cluster import KMeans
+from sklearn.metrics import pairwise_distances_argmin_min, r2_score
 import matplotlib.pyplot as plt
+import matplotlib.cm
 import io
+import os # Added for path joining
 from PIL import Image
 # Define the paths for example data
 class Clusters:
     def __init__(self, loc_vars):
+        self.kmeans = kmeans = KMeans(n_clusters=1000, random_state=0, n_init=10).fit(np.ascontiguousarray(loc_vars))
+        closest, _ = pairwise_distances_argmin_min(kmeans.cluster_centers_, np.ascontiguousarray(loc_vars))
+        rep_ids = pd.Series(data=(closest+1))  # 0-based to 1-based indexes
         rep_ids.name = 'policy_id'
         rep_ids.index.name = 'cluster_id'
         self.rep_ids = rep_ids
         self.policy_count = self.agg_by_cluster(pd.DataFrame({'policy_count': [1] * len(loc_vars)}))['policy_count']
     def agg_by_cluster(self, df, agg=None):
+        """Aggregate columns by cluster"""
         temp = df.copy()
         temp['cluster_id'] = self.kmeans.labels_
         temp = temp.set_index('cluster_id')
         return temp.groupby(temp.index).agg(agg)
     def extract_reps(self, df):
+        """Extract the rows of representative policies"""
         temp = pd.merge(self.rep_ids, df.reset_index(), how='left', on='policy_id')
         temp.index.name = 'cluster_id'
         return temp.drop('policy_id', axis=1)
     def extract_and_scale_reps(self, df, agg=None):
+        """Extract and scale the rows of representative policies"""
         if agg:
             cols = df.columns
             mult = pd.DataFrame({c: (self.policy_count if (c not in agg or agg[c] == 'sum') else 1) for c in cols})
+            # Ensure mult has same index as extract_reps(df) for proper alignment
             extracted_df = self.extract_reps(df)
             mult.index = extracted_df.index
             return extracted_df.mul(mult)
             return self.extract_reps(df).mul(self.policy_count, axis=0)
     def compare(self, df, agg=None):
+        """Returns a multi-indexed Dataframe comparing actual and estimate"""
         source = self.agg_by_cluster(df, agg)
         target = self.extract_and_scale_reps(df, agg)
         return pd.DataFrame({'actual': source.stack(), 'estimate':target.stack()})
     def compare_total(self, df, agg=None):
+        """Aggregate df by columns"""
         if agg:
+            # Calculate actual values using specified aggregation
             actual_values = {}
             for col in df.columns:
                 if agg.get(col, 'sum') == 'mean':
                     actual_values[col] = df[col].mean()
+                else:  # sum
                     actual_values[col] = df[col].sum()
             actual = pd.Series(actual_values)
+            # Calculate estimate values
             reps_unscaled = self.extract_reps(df)
             estimate_values = {}
             for col in df.columns:
                 if agg.get(col, 'sum') == 'mean':
+                    # Weighted average for mean columns
                     weighted_sum = (reps_unscaled[col] * self.policy_count).sum()
                     total_weight = self.policy_count.sum()
                     estimate_values[col] = weighted_sum / total_weight if total_weight > 0 else 0
+                else:  # sum
                     estimate_values[col] = (reps_unscaled[col] * self.policy_count).sum()
             estimate = pd.Series(estimate_values)
+        else:  # Original logic if no agg is specified (all sum)
             actual = df.sum()
             estimate = self.extract_and_scale_reps(df).sum()
+        # Calculate error, handling division by zero
         error = np.where(actual != 0, estimate / actual - 1, 0)
         return pd.DataFrame({'actual': actual, 'estimate': estimate, 'error': error})
 def plot_cashflows_comparison(cfs_list, cluster_obj, titles):
+    """Create cashflow comparison plots"""
     if not cfs_list or not cluster_obj or not titles:
         return None
     num_plots = len(cfs_list)
     if num_plots == 0:
         return None
+    # Determine subplot layout
     cols = 2
     rows = (num_plots + cols - 1) // cols
     for i, (df, title) in enumerate(zip(cfs_list, titles)):
         if i < len(axes):
             comparison = cluster_obj.compare_total(df)
+            comparison[['actual', 'estimate']].plot(ax=axes[i], grid=True, title=title)
             axes[i].set_xlabel('Time')
             axes[i].set_ylabel('Value')
+    # Hide any unused subplots
     for j in range(i + 1, len(axes)):
         fig.delaxes(axes[j])
     return img
 def plot_scatter_comparison(df_compare_output, title):
+    """Create scatter plot comparison from compare() output"""
     if df_compare_output is None or df_compare_output.empty:
+        # Create a blank plot with a message
         fig, ax = plt.subplots(figsize=(12, 8))
         ax.text(0.5, 0.5, "No data to display", ha='center', va='center', fontsize=15)
         ax.set_title(title)
     fig, ax = plt.subplots(figsize=(12, 8))
     if not isinstance(df_compare_output.index, pd.MultiIndex) or df_compare_output.index.nlevels < 2:
+         gr.Warning("Scatter plot data is not in the expected multi-index format. Plotting raw actual vs estimate.")
+         ax.scatter(df_compare_output['actual'], df_compare_output['estimate'], s=9, alpha=0.6)
     else:
         unique_levels = df_compare_output.index.get_level_values(1).unique()
+        colors = matplotlib.cm.rainbow(np.linspace(0, 1, len(unique_levels)))
+        for item_level, color_val in zip(unique_levels, colors):
+            subset = df_compare_output.xs(item_level, level=1)
+            ax.scatter(subset['actual'], subset['estimate'], color=color_val, s=9, alpha=0.6, label=item_level)
         if len(unique_levels) > 1 and len(unique_levels) <= 10:
+            ax.legend(title=df_compare_output.index.names[1])
     ax.set_xlabel('Actual')
     ax.set_ylabel('Estimate')
         np.max([ax.get_xlim(), ax.get_ylim()]),
     ]
     if lims[0] != lims[1]:
+      ax.plot(lims, lims, 'r-', linewidth=0.5)
+      ax.set_xlim(lims)
+      ax.set_ylim(lims)
     buf = io.BytesIO()
     plt.savefig(buf, format='png', dpi=100)
     plt.close(fig)
     return img
 def process_files(cashflow_base_path, cashflow_lapse_path, cashflow_mort_path,
                   policy_data_path, pv_base_path, pv_lapse_path, pv_mort_path):
+    """Main processing function - now accepts file paths"""
     try:
+        # Read uploaded files using paths
         cfs = pd.read_excel(cashflow_base_path, index_col=0)
         cfs_lapse50 = pd.read_excel(cashflow_lapse_path, index_col=0)
         cfs_mort15 = pd.read_excel(cashflow_mort_path, index_col=0)
         pol_data_full = pd.read_excel(policy_data_path, index_col=0)
+        # Ensure the correct columns are selected for pol_data
         required_cols = ['age_at_entry', 'policy_term', 'sum_assured', 'duration_mth']
         if all(col in pol_data_full.columns for col in required_cols):
             pol_data = pol_data_full[required_cols]
         scen_titles = ['Base', 'Lapse+50%', 'Mort+15%']
         results = {}
         mean_attrs = {'age_at_entry':'mean', 'policy_term':'mean', 'duration_mth':'mean', 'sum_assured': 'sum'}
+        # --- 1. Cashflow Calibration ---
         cluster_cfs = Clusters(cfs)
+        results['cf_total_base_table'] = cluster_cfs.compare_total(cfs)
+        results['cf_policy_attrs_total'] = cluster_cfs.compare_total(pol_data, agg=mean_attrs)
+        results['cf_pv_total_base'] = cluster_cfs.compare_total(pvs)
+        results['cf_pv_total_lapse'] = cluster_cfs.compare_total(pvs_lapse50)
+        results['cf_pv_total_mort'] = cluster_cfs.compare_total(pvs_mort15)
+        results['cf_cashflow_plot'] = plot_cashflows_comparison(cfs_list, cluster_cfs, scen_titles)
+        results['cf_scatter_cashflows_base'] = plot_scatter_comparison(cluster_cfs.compare(cfs), 'Cashflow Calib. - Cashflows (Base)')
+        # --- 2. Policy Attribute Calibration ---
+        # Standardize policy attributes
         if not pol_data.empty and (pol_data.max() - pol_data.min()).all() != 0:
+             loc_vars_attrs = (pol_data - pol_data.min()) / (pol_data.max() - pol_data.min())
         else:
             gr.Warning("Policy data for attribute calibration is empty or has no variance. Skipping attribute calibration plots.")
             loc_vars_attrs = pol_data
         if not loc_vars_attrs.empty:
             cluster_attrs = Clusters(loc_vars_attrs)
+            results['attr_total_cf_base'] = cluster_attrs.compare_total(cfs)
+            results['attr_policy_attrs_total'] = cluster_attrs.compare_total(pol_data, agg=mean_attrs)
+            results['attr_total_pv_base'] = cluster_attrs.compare_total(pvs)
+            results['attr_cashflow_plot'] = plot_cashflows_comparison(cfs_list, cluster_attrs, scen_titles)
+            results['attr_scatter_cashflows_base'] = plot_scatter_comparison(cluster_attrs.compare(cfs), 'Policy Attr. Calib. - Cashflows (Base)')
         else:
+            results['attr_total_cf_base'] = pd.DataFrame()
+            results['attr_policy_attrs_total'] = pd.DataFrame()
+            results['attr_total_pv_base'] = pd.DataFrame()
+            results['attr_cashflow_plot'] = None
+            results['attr_scatter_cashflows_base'] = None
+        # --- 3. Present Value Calibration ---
         cluster_pvs = Clusters(pvs)
+        results['pv_total_cf_base'] = cluster_pvs.compare_total(cfs)
+        results['pv_policy_attrs_total'] = cluster_pvs.compare_total(pol_data, agg=mean_attrs)
+        results['pv_total_pv_base'] = cluster_pvs.compare_total(pvs)
+        results['pv_total_pv_lapse'] = cluster_pvs.compare_total(pvs_lapse50)
+        results['pv_total_pv_mort'] = cluster_pvs.compare_total(pvs_mort15)
+        results['pv_cashflow_plot'] = plot_cashflows_comparison(cfs_list, cluster_pvs, scen_titles)
+        results['pv_scatter_pvs_base'] = plot_scatter_comparison(cluster_pvs.compare(pvs), 'PV Calib. - PVs (Base)')
+        # --- Summary Comparison Plot Data ---
+        # Error metric for key PV column or mean absolute error
+        error_data = {}
+        # Function to safely get error value
         def get_error_safe(compare_result, col_name=None):
             if compare_result.empty:
                 return np.nan
             if col_name and col_name in compare_result.index:
                 return abs(compare_result.loc[col_name, 'error'])
             else:
+                # Use mean absolute error if specific column not found
                 return abs(compare_result['error']).mean()
+        # Determine key PV column (try common names)
         key_pv_col = None
         for potential_col in ['PV_NetCF', 'pv_net_cf', 'net_cf_pv', 'PV_Net_CF']:
             if potential_col in pvs.columns:
                 key_pv_col = potential_col
                 break
+        # Cashflow Calibration Errors
+        error_data['CF Calib.'] = [
+            get_error_safe(cluster_cfs.compare_total(pvs), key_pv_col),
+            get_error_safe(cluster_cfs.compare_total(pvs_lapse50), key_pv_col),
+            get_error_safe(cluster_cfs.compare_total(pvs_mort15), key_pv_col)
+        ]
+        # Policy Attribute Calibration Errors
+        if not loc_vars_attrs.empty:
+            error_data['Attr Calib.'] = [
+                get_error_safe(cluster_attrs.compare_total(pvs), key_pv_col),
+                get_error_safe(cluster_attrs.compare_total(pvs_lapse50), key_pv_col),
+                get_error_safe(cluster_attrs.compare_total(pvs_mort15), key_pv_col)
             ]
+        else:
+            error_data['Attr Calib.'] = [np.nan, np.nan, np.nan]
+        # Present Value Calibration Errors
+        error_data['PV Calib.'] = [
+            get_error_safe(cluster_pvs.compare_total(pvs), key_pv_col),
+            get_error_safe(cluster_pvs.compare_total(pvs_lapse50), key_pv_col),
+            get_error_safe(cluster_pvs.compare_total(pvs_mort15), key_pv_col)
+        ]
+        # Create Summary Plot
         summary_df = pd.DataFrame(error_data, index=['Base', 'Lapse+50%', 'Mort+15%'])
         fig_summary, ax_summary = plt.subplots(figsize=(10, 6))
         gr.Error(f"Error processing files: {str(e)}")
         return {"error": f"Error processing files: {str(e)}"}
 def create_interface():
     with gr.Blocks(title="Cluster Model Points Analysis") as demo:
         gr.Markdown("""
                         pv_total_pv_lapse_out = gr.Dataframe(label="PVs - Lapse Stress Total")
                         pv_total_pv_mort_out = gr.Dataframe(label="PVs - Mortality Stress Total")
+        # --- Helper function to prepare outputs ---
         def get_all_output_components():
             return [
                 summary_plot_output,
+                # Cashflow Calib Outputs
                 cf_total_base_table_out, cf_policy_attrs_total_out,
                 cf_cashflow_plot_out, cf_scatter_cashflows_base_out,
                 cf_pv_total_base_out, cf_pv_total_lapse_out, cf_pv_total_mort_out,
+                # Attribute Calib Outputs
                 attr_total_cf_base_out, attr_policy_attrs_total_out,
                 attr_cashflow_plot_out, attr_scatter_cashflows_base_out, attr_total_pv_base_out,
+                # PV Calib Outputs
                 pv_total_cf_base_out, pv_policy_attrs_total_out,
                 pv_cashflow_plot_out, pv_scatter_pvs_base_out,
                 pv_total_pv_base_out, pv_total_pv_lapse_out, pv_total_pv_mort_out
             ]
+        # --- Action for Analyze Button ---
         def handle_analysis(f1, f2, f3, f4, f5, f6, f7):
             files = [f1, f2, f3, f4, f5, f6, f7]
+            file_paths = []
             for i, f_obj in enumerate(files):
                 if f_obj is None:
                     gr.Error(f"Missing file input for argument {i+1}. Please upload all files or load examples.")
                     return [None] * len(get_all_output_components())
+                # If f_obj is a Gradio FileData object (from direct upload)
                 if hasattr(f_obj, 'name') and isinstance(f_obj.name, str):
                     file_paths.append(f_obj.name)
+                # If f_obj is already a string path (from example load)
                 elif isinstance(f_obj, str):
                      file_paths.append(f_obj)
                 else:
             return [
                 results.get('summary_plot'),
+                # CF Calib
                 results.get('cf_total_base_table'), results.get('cf_policy_attrs_total'),
                 results.get('cf_cashflow_plot'), results.get('cf_scatter_cashflows_base'),
                 results.get('cf_pv_total_base'), results.get('cf_pv_total_lapse'), results.get('cf_pv_total_mort'),
+                # Attr Calib
                 results.get('attr_total_cf_base'), results.get('attr_policy_attrs_total'),
                 results.get('attr_cashflow_plot'), results.get('attr_scatter_cashflows_base'), results.get('attr_total_pv_base'),
+                # PV Calib
                 results.get('pv_total_cf_base'), results.get('pv_policy_attrs_total'),
                 results.get('pv_cashflow_plot'), results.get('pv_scatter_pvs_base'),
                 results.get('pv_total_pv_base'), results.get('pv_total_pv_lapse'), results.get('pv_total_pv_mort')
             outputs=get_all_output_components()
         )
+        # --- Action for Load Example Data Button ---
         def load_example_files():
             missing_files = [fp for fp in EXAMPLE_FILES.values() if not os.path.exists(fp)]
             if missing_files: