alidenewade committed (verified) · Commit 4355f45 · Parent(s): 4072b44

Update app.py

Files changed (1): app.py (+264 -105)
app.py CHANGED
@@ -2,7 +2,7 @@ import gradio as gr
 import numpy as np
 import pandas as pd
 from sklearn.cluster import KMeans
-from sklearn.metrics import pairwise_distances_argmin_min, r2_score
+from sklearn.metrics import pairwise_distances_argmin_min  # r2_score is not used in this module
 import matplotlib.pyplot as plt
 import matplotlib.cm
 import io
@@ -22,16 +22,41 @@ EXAMPLE_FILES = {
 }
 
 class Clusters:
-    def __init__(self, loc_vars):
-        self.kmeans = kmeans = KMeans(n_clusters=1000, random_state=0, n_init=10).fit(np.ascontiguousarray(loc_vars))
-        closest, _ = pairwise_distances_argmin_min(kmeans.cluster_centers_, np.ascontiguousarray(loc_vars))
-
-        rep_ids = pd.Series(data=(closest+1)) # 0-based to 1-based indexes
-        rep_ids.name = 'policy_id'
-        rep_ids.index.name = 'cluster_id'
-        self.rep_ids = rep_ids
-
-        self.policy_count = self.agg_by_cluster(pd.DataFrame({'policy_count': [1] * len(loc_vars)}))['policy_count']
+    def __init__(self, loc_vars_df):  # expects a pandas DataFrame
+        # Quantize the input to float32 for KMeans: this lowers memory use and can
+        # speed up the fit, at the cost of minor numerical differences vs. float64.
+        # The data must also be a C-contiguous NumPy array.
+        if loc_vars_df.empty:
+            # Handle the empty-DataFrame case before .astype/.values are called.
+            # KMeans would fail anyway, but this avoids errors before that point.
+            loc_vars_np_float32 = np.array([], dtype=np.float32).reshape(0, loc_vars_df.shape[1] if loc_vars_df.shape[1] > 0 else 0)
+        else:
+            loc_vars_np_float32 = np.ascontiguousarray(loc_vars_df.astype(np.float32).values)
+
+        # Initialize KMeans with algorithm="elkan" for a potential speedup
+        # and fit on the float32 data.
+        self.kmeans = KMeans(
+            n_clusters=1000,
+            random_state=0,
+            n_init=10,
+            algorithm="elkan"  # added for speed
+        ).fit(loc_vars_np_float32)
+
+        # cluster_centers_ is float32 when fitted on float32 data; pass the same
+        # float32 array for the distance calculations.
+        closest, _ = pairwise_distances_argmin_min(
+            self.kmeans.cluster_centers_,
+            loc_vars_np_float32
+        )
+
+        self.rep_ids = pd.Series(data=(closest + 1))  # 0-based to 1-based indexes
+        self.rep_ids.name = 'policy_id'
+        self.rep_ids.index.name = 'cluster_id'
+
+        # policy_count is based on the number of rows in the input data.
+        self.policy_count = self.agg_by_cluster(pd.DataFrame({'policy_count': [1] * loc_vars_np_float32.shape[0]}))['policy_count']
 
     def agg_by_cluster(self, df, agg=None):
         """Aggregate columns by cluster"""
@@ -43,21 +68,46 @@ class Clusters:
 
     def extract_reps(self, df):
         """Extract the rows of representative policies"""
-        temp = pd.merge(self.rep_ids, df.reset_index(), how='left', on='policy_id')
-        temp.index.name = 'cluster_id'
-        return temp.drop('policy_id', axis=1)
+        # df is expected to carry 'policy_id' either as its index or as a column.
+        if 'policy_id' not in df.columns and df.index.name != 'policy_id':
+            # This should not happen if inputs are consistent; assume the unnamed
+            # index is the policy identifier and name it accordingly.
+            df_indexed = df.copy()
+            if df_indexed.index.name is None:
+                gr.Warning("DataFrame passed to extract_reps has no index name; assuming the index is policy_id.")
+                df_indexed.index.name = 'policy_id'
+            temp = pd.merge(self.rep_ids, df_indexed.reset_index(), how='left', on='policy_id')
+        elif 'policy_id' in df.columns and df.index.name == 'policy_id':
+            # policy_id is both the index and a column: merge on the column.
+            temp = pd.merge(self.rep_ids, df, how='left', on='policy_id')
+        elif df.index.name == 'policy_id':
+            temp = pd.merge(self.rep_ids, df.reset_index(), how='left', on='policy_id')
+        else:
+            # 'policy_id' is a column, not the index.
+            temp = pd.merge(self.rep_ids, df.reset_index(drop=df.index.name is None), how='left', on='policy_id')
+
+        # The merge result's index is not cluster_id by default; take it from rep_ids.
+        temp = temp.set_index(self.rep_ids.index)
+        return temp.drop('policy_id', axis=1, errors='ignore')
 
     def extract_and_scale_reps(self, df, agg=None):
         """Extract and scale the rows of representative policies"""
+        extracted_df = self.extract_reps(df)
         if agg:
-            cols = df.columns
+            cols = extracted_df.columns  # use the columns of extracted_df
             mult = pd.DataFrame({c: (self.policy_count if (c not in agg or agg[c] == 'sum') else 1) for c in cols})
-            # Ensure mult has same index as extract_reps(df) for proper alignment
-            extracted_df = self.extract_reps(df)
-            mult.index = extracted_df.index
+            mult.index = extracted_df.index  # align index
             return extracted_df.mul(mult)
         else:
-            return self.extract_reps(df).mul(self.policy_count, axis=0)
+            return extracted_df.mul(self.policy_count, axis=0)
 
     def compare(self, df, agg=None):
         """Returns a multi-indexed Dataframe comparing actual and estimate"""
@@ -68,7 +118,6 @@ class Clusters:
     def compare_total(self, df, agg=None):
         """Aggregate df by columns"""
         if agg:
-            # Calculate actual values using specified aggregation
             actual_values = {}
             for col in df.columns:
                 if agg.get(col, 'sum') == 'mean':
@@ -77,13 +126,19 @@ class Clusters:
                     actual_values[col] = df[col].sum()
             actual = pd.Series(actual_values)
 
-            # Calculate estimate values
             reps_unscaled = self.extract_reps(df)
             estimate_values = {}
 
-            for col in df.columns:
+            for col in df.columns:  # iterate over the original columns so all are covered
+                if col not in reps_unscaled.columns:  # column may have been dropped or not selected
+                    if agg.get(col, 'sum') == 'mean':
+                        estimate_values[col] = np.nan
+                    else:
+                        estimate_values[col] = 0
+                    gr.Warning(f"Column '{col}' not found in representative policies output for 'compare_total'. Estimate will be 0/NaN.")
+                    continue
+
                 if agg.get(col, 'sum') == 'mean':
-                    # Weighted average for mean columns
                     weighted_sum = (reps_unscaled[col] * self.policy_count).sum()
                     total_weight = self.policy_count.sum()
                     estimate_values[col] = weighted_sum / total_weight if total_weight > 0 else 0
@@ -96,12 +151,14 @@ class Clusters:
             actual = df.sum()
             estimate = self.extract_and_scale_reps(df).sum()
 
-        # Calculate error, handling division by zero
+        # Align actual and estimate before the error calculation (division by zero -> 0).
+        actual, estimate = actual.align(estimate, fill_value=0)
         error = np.where(actual != 0, estimate / actual - 1, 0)
 
         return pd.DataFrame({'actual': actual, 'estimate': estimate, 'error': error})
 
 
+# --- Plotting functions ---
 def plot_cashflows_comparison(cfs_list, cluster_obj, titles):
     """Create cashflow comparison plots"""
     if not cfs_list or not cluster_obj or not titles:
@@ -110,7 +167,6 @@ def plot_cashflows_comparison(cfs_list, cluster_obj, titles):
     if num_plots == 0:
         return None
 
-    # Determine subplot layout
     cols = 2
     rows = (num_plots + cols - 1) // cols
 
@@ -119,12 +175,17 @@ def plot_cashflows_comparison(cfs_list, cluster_obj, titles):
 
     for i, (df, title) in enumerate(zip(cfs_list, titles)):
        if i < len(axes):
-            comparison = cluster_obj.compare_total(df)
+            # compare_total expects 'policy_id' as the index (or a column) of df.
+            if df.index.name != 'policy_id' and 'policy_id' not in df.columns:
+                gr.Warning(f"DataFrame for plot '{title}' does not have 'policy_id' as index or column. Results may be incorrect.")
+
+            comparison = cluster_obj.compare_total(df.set_index('policy_id') if 'policy_id' in df.columns and df.index.name != 'policy_id' else df)
            comparison[['actual', 'estimate']].plot(ax=axes[i], grid=True, title=title)
            axes[i].set_xlabel('Time')
            axes[i].set_ylabel('Value')
 
-    # Hide any unused subplots
    for j in range(i + 1, len(axes)):
        fig.delaxes(axes[j])
@@ -139,7 +200,6 @@ def plot_cashflows_comparison(cfs_list, cluster_obj, titles):
 def plot_scatter_comparison(df_compare_output, title):
     """Create scatter plot comparison from compare() output"""
     if df_compare_output is None or df_compare_output.empty:
-        # Create a blank plot with a message
         fig, ax = plt.subplots(figsize=(12, 8))
         ax.text(0.5, 0.5, "No data to display", ha='center', va='center', fontsize=15)
         ax.set_title(title)
@@ -153,29 +213,28 @@ def plot_scatter_comparison(df_compare_output, title):
     fig, ax = plt.subplots(figsize=(12, 8))
 
     if not isinstance(df_compare_output.index, pd.MultiIndex) or df_compare_output.index.nlevels < 2:
-        gr.Warning("Scatter plot data is not in the expected multi-index format. Plotting raw actual vs estimate.")
-        ax.scatter(df_compare_output['actual'], df_compare_output['estimate'], s=9, alpha=0.6)
+        gr.Warning("Scatter plot data is not in the expected multi-index format. Plotting raw actual vs estimate.")
+        ax.scatter(df_compare_output['actual'], df_compare_output['estimate'], s=9, alpha=0.6)
     else:
         unique_levels = df_compare_output.index.get_level_values(1).unique()
         colors = matplotlib.cm.rainbow(np.linspace(0, 1, len(unique_levels)))
 
         for item_level, color_val in zip(unique_levels, colors):
             subset = df_compare_output.xs(item_level, level=1)
-            ax.scatter(subset['actual'], subset['estimate'], color=color_val, s=9, alpha=0.6, label=item_level)
-        if len(unique_levels) > 1 and len(unique_levels) <= 10:
-            ax.legend(title=df_compare_output.index.names[1])
+            ax.scatter(subset['actual'], subset['estimate'], color=color_val, s=9, alpha=0.6, label=str(item_level))  # ensure the label is a string
+        if len(unique_levels) > 1 and len(unique_levels) <= 20:  # slightly higher legend item limit
+            ax.legend(title=str(df_compare_output.index.names[1]))
 
     ax.set_xlabel('Actual')
     ax.set_ylabel('Estimate')
     ax.set_title(title)
     ax.grid(True)
 
-    # Draw identity line
     lims = [
-        np.min([ax.get_xlim(), ax.get_ylim()]),
-        np.max([ax.get_xlim(), ax.get_ylim()]),
+        np.nanmin([ax.get_xlim(), ax.get_ylim()]),  # nanmin/nanmax tolerate NaNs
+        np.nanmax([ax.get_xlim(), ax.get_ylim()]),
     ]
-    if lims[0] != lims[1]:
+    if lims[0] != lims[1] and np.isfinite(lims[0]) and np.isfinite(lims[1]):  # check for valid limits
         ax.plot(lims, lims, 'r-', linewidth=0.5)
         ax.set_xlim(lims)
         ax.set_ylim(lims)
@@ -187,28 +246,55 @@ def plot_scatter_comparison(df_compare_output, title):
     plt.close(fig)
     return img
 
-
+# --- Main processing function ---
+# DataFrames passed to the Clusters methods are expected to be indexed by 'policy_id'.
 def process_files(cashflow_base_path, cashflow_lapse_path, cashflow_mort_path,
                   policy_data_path, pv_base_path, pv_lapse_path, pv_mort_path):
     """Main processing function - now accepts file paths"""
     try:
-        # Read uploaded files using paths
+        # Consider engine='calamine' for faster Excel reading if available
+        # (pip install pandas[calamine]), e.g.:
+        # cfs = pd.read_excel(cashflow_base_path, index_col=0, engine='calamine')
         cfs = pd.read_excel(cashflow_base_path, index_col=0)
         cfs_lapse50 = pd.read_excel(cashflow_lapse_path, index_col=0)
         cfs_mort15 = pd.read_excel(cashflow_mort_path, index_col=0)
 
         pol_data_full = pd.read_excel(policy_data_path, index_col=0)
-        # Ensure the correct columns are selected for pol_data
         required_cols = ['age_at_entry', 'policy_term', 'sum_assured', 'duration_mth']
-        if all(col in pol_data_full.columns for col in required_cols):
-            pol_data = pol_data_full[required_cols]
+
+        # Name the index 'policy_id' where it is unnamed, assuming it is the policy
+        # identifier, and normalize so 'policy_id' is the index, not a duplicated column.
+        for df in [cfs, cfs_lapse50, cfs_mort15, pol_data_full]:
+            if df.index.name is None:
+                df.index.name = 'policy_id'
+            if 'policy_id' not in df.columns and df.index.name == 'policy_id':
+                df.reset_index(inplace=True)
+                df.set_index('policy_id', inplace=True)
+
+        if all(col in pol_data_full.columns or col == pol_data_full.index.name for col in required_cols):
+            # If policy_id is the index it will not appear in columns; adjust the selection.
+            cols_to_select = [col for col in required_cols if col in pol_data_full.columns]
+            # For simplicity, assume required_cols are actual data columns; a required
+            # column that is also the index is left to later selection or errors.
+            pol_data = pol_data_full[cols_to_select].copy()  # .copy() avoids SettingWithCopyWarning
+            # For K-Means, policy_id itself is not a feature.
         else:
-            gr.Warning(f"Policy data might be missing required columns. Found: {pol_data_full.columns.tolist()}")
-            pol_data = pol_data_full
+            missing_req_cols = [col for col in required_cols if col not in pol_data_full.columns and col != pol_data_full.index.name]
+            gr.Warning(f"Policy data might be missing required columns: {missing_req_cols}. Found: {pol_data_full.columns.tolist()}")
+            pol_data = pol_data_full  # fallback; must be numeric for clustering/scaling
 
         pvs = pd.read_excel(pv_base_path, index_col=0)
         pvs_lapse50 = pd.read_excel(pv_lapse_path, index_col=0)
         pvs_mort15 = pd.read_excel(pv_mort_path, index_col=0)
+
+        for df in [pvs, pvs_lapse50, pvs_mort15]:
+            if df.index.name is None:
+                df.index.name = 'policy_id'
+            if 'policy_id' not in df.columns and df.index.name == 'policy_id':
+                df.reset_index(inplace=True)
+                df.set_index('policy_id', inplace=True)
 
         cfs_list = [cfs, cfs_lapse50, cfs_mort15]
         scen_titles = ['Base', 'Lapse+50%', 'Mort+15%']
@@ -217,8 +303,14 @@ def process_files(cashflow_base_path, cashflow_lapse_path, cashflow_mort_path,
 
         mean_attrs = {'age_at_entry':'mean', 'policy_term':'mean', 'duration_mth':'mean', 'sum_assured': 'sum'}
 
+        # DataFrames passed to Clusters should be policy_id-indexed so that .values
+        # contains only feature columns (the index is excluded from df.values).
+
         # --- 1. Cashflow Calibration ---
-        cluster_cfs = Clusters(cfs)
+        cluster_cfs = Clusters(cfs.reset_index().set_index('policy_id'))  # pass with policy_id as index
 
         results['cf_total_base_table'] = cluster_cfs.compare_total(cfs)
         results['cf_policy_attrs_total'] = cluster_cfs.compare_total(pol_data, agg=mean_attrs)
@@ -231,15 +323,22 @@ def process_files(cashflow_base_path, cashflow_lapse_path, cashflow_mort_path,
         results['cf_scatter_cashflows_base'] = plot_scatter_comparison(cluster_cfs.compare(cfs), 'Cashflow Calib. - Cashflows (Base)')
 
         # --- 2. Policy Attribute Calibration ---
-        # Standardize policy attributes
-        if not pol_data.empty and (pol_data.max() - pol_data.min()).all() != 0:
-            loc_vars_attrs = (pol_data - pol_data.min()) / (pol_data.max() - pol_data.min())
+        loc_vars_attrs = pd.DataFrame()  # initialize
+        if not pol_data.empty:
+            # Ensure pol_data is purely numeric for scaling and KMeans.
+            numeric_pol_data = pol_data.select_dtypes(include=np.number)
+            if not numeric_pol_data.empty and not (numeric_pol_data.max() - numeric_pol_data.min() == 0).all():
+                loc_vars_attrs = (numeric_pol_data - numeric_pol_data.min()) / \
+                                 (numeric_pol_data.max() - numeric_pol_data.min())
+            else:
+                gr.Warning("Policy data for attribute calibration is empty, non-numeric, or has no variance. Skipping attribute calibration content.")
+                loc_vars_attrs = numeric_pol_data
         else:
-            gr.Warning("Policy data for attribute calibration is empty or has no variance. Skipping attribute calibration plots.")
-            loc_vars_attrs = pol_data
-
+            gr.Warning("Policy data is empty. Skipping attribute calibration content.")
+
         if not loc_vars_attrs.empty:
-            cluster_attrs = Clusters(loc_vars_attrs)
+            cluster_attrs = Clusters(loc_vars_attrs.reset_index().set_index('policy_id'))  # pass with policy_id as index
             results['attr_total_cf_base'] = cluster_attrs.compare_total(cfs)
             results['attr_policy_attrs_total'] = cluster_attrs.compare_total(pol_data, agg=mean_attrs)
             results['attr_total_pv_base'] = cluster_attrs.compare_total(pvs)
@@ -249,11 +348,12 @@ def process_files(cashflow_base_path, cashflow_lapse_path, cashflow_mort_path,
             results['attr_total_cf_base'] = pd.DataFrame()
             results['attr_policy_attrs_total'] = pd.DataFrame()
             results['attr_total_pv_base'] = pd.DataFrame()
-            results['attr_cashflow_plot'] = None
-            results['attr_scatter_cashflows_base'] = None
+            results['attr_cashflow_plot'] = plot_scatter_comparison(pd.DataFrame(), 'Policy Attr. Calib. - Cashflows (Base) - No Data')  # empty placeholder plot
+            results['attr_scatter_cashflows_base'] = plot_scatter_comparison(pd.DataFrame(), 'Policy Attr. Calib. - Cashflows (Base) - No Data')
 
         # --- 3. Present Value Calibration ---
-        cluster_pvs = Clusters(pvs)
+        cluster_pvs = Clusters(pvs.reset_index().set_index('policy_id'))  # pass with policy_id as index
 
         results['pv_total_cf_base'] = cluster_pvs.compare_total(cfs)
         results['pv_policy_attrs_total'] = cluster_pvs.compare_total(pol_data, agg=mean_attrs)
@@ -266,52 +366,47 @@ def process_files(cashflow_base_path, cashflow_lapse_path, cashflow_mort_path,
         results['pv_scatter_pvs_base'] = plot_scatter_comparison(cluster_pvs.compare(pvs), 'PV Calib. - PVs (Base)')
 
         # --- Summary Comparison Plot Data ---
-        # Error metric for key PV column or mean absolute error
-
         error_data = {}
 
-        # Function to safely get error value
         def get_error_safe(compare_result, col_name=None):
-            if compare_result.empty:
+            if compare_result is None or compare_result.empty or 'error' not in compare_result.columns:
                 return np.nan
             if col_name and col_name in compare_result.index:
                 return abs(compare_result.loc[col_name, 'error'])
             else:
-                # Use mean absolute error if specific column not found
                 return abs(compare_result['error']).mean()
 
-        # Determine key PV column (try common names)
         key_pv_col = None
+        # Check against the original pvs columns, which have not been stripped;
+        # this is a quick re-read of the file purely for its column names.
+        original_pvs_cols = pd.read_excel(pv_base_path).columns
         for potential_col in ['PV_NetCF', 'pv_net_cf', 'net_cf_pv', 'PV_Net_CF']:
-            if potential_col in pvs.columns:
+            if potential_col in original_pvs_cols:
                 key_pv_col = potential_col
                 break
 
-        # Cashflow Calibration Errors
         error_data['CF Calib.'] = [
-            get_error_safe(cluster_cfs.compare_total(pvs), key_pv_col),
-            get_error_safe(cluster_cfs.compare_total(pvs_lapse50), key_pv_col),
-            get_error_safe(cluster_cfs.compare_total(pvs_mort15), key_pv_col)
+            get_error_safe(results.get('cf_pv_total_base'), key_pv_col),
+            get_error_safe(results.get('cf_pv_total_lapse'), key_pv_col),
+            get_error_safe(results.get('cf_pv_total_mort'), key_pv_col)
         ]
 
-        # Policy Attribute Calibration Errors
         if not loc_vars_attrs.empty:
             error_data['Attr Calib.'] = [
-                get_error_safe(cluster_attrs.compare_total(pvs), key_pv_col),
-                get_error_safe(cluster_attrs.compare_total(pvs_lapse50), key_pv_col),
-                get_error_safe(cluster_attrs.compare_total(pvs_mort15), key_pv_col)
+                get_error_safe(results.get('attr_total_pv_base'), key_pv_col),
+                get_error_safe(cluster_attrs.compare_total(pvs_lapse50), key_pv_col),  # recalculated for pvs_lapse50
+                get_error_safe(cluster_attrs.compare_total(pvs_mort15), key_pv_col)   # recalculated for pvs_mort15
            ]
        else:
            error_data['Attr Calib.'] = [np.nan, np.nan, np.nan]
 
-        # Present Value Calibration Errors
        error_data['PV Calib.'] = [
-            get_error_safe(cluster_pvs.compare_total(pvs), key_pv_col),
-            get_error_safe(cluster_pvs.compare_total(pvs_lapse50), key_pv_col),
-            get_error_safe(cluster_pvs.compare_total(pvs_mort15), key_pv_col)
+            get_error_safe(results.get('pv_total_pv_base'), key_pv_col),
+            get_error_safe(results.get('pv_total_pv_lapse'), key_pv_col),
+            get_error_safe(results.get('pv_total_pv_mort'), key_pv_col)
        ]
 
-        # Create Summary Plot
        summary_df = pd.DataFrame(error_data, index=['Base', 'Lapse+50%', 'Mort+15%'])
 
        fig_summary, ax_summary = plt.subplots(figsize=(10, 6))
@@ -335,13 +430,15 @@ def process_files(cashflow_base_path, cashflow_lapse_path, cashflow_mort_path,
         gr.Error(f"File not found: {e.filename}. Please ensure example files are in '{EXAMPLE_DATA_DIR}' or all files are uploaded.")
         return {"error": f"File not found: {e.filename}"}
     except KeyError as e:
-        gr.Error(f"A required column is missing from one of the excel files: {e}. Please check data format.")
-        return {"error": f"Missing column: {e}"}
+        # The KeyError may come from a column that is actually the index.
+        gr.Error(f"A required column or index is missing or misnamed: {e}. Please check the data format and ensure 'policy_id' is handled as the index of the feature dataframes.")
+        return {"error": f"Missing column/index: {e}"}
     except Exception as e:
-        gr.Error(f"Error processing files: {str(e)}")
+        import traceback
+        gr.Error(f"Error processing files: {str(e)}. Trace: {traceback.format_exc()}")
         return {"error": f"Error processing files: {str(e)}"}
 
-
+# --- Gradio interface ---
 def create_interface():
     with gr.Blocks(title="Cluster Model Points Analysis") as demo:
         gr.Markdown("""
@@ -351,13 +448,15 @@ def create_interface():
     Upload your Excel files or use the example data to analyze cashflows, policy attributes, and present values using different calibration methods.
 
     **Required Files (Excel .xlsx):**
-    - Cashflows - Base Scenario
-    - Cashflows - Lapse Stress (+50%)
-    - Cashflows - Mortality Stress (+15%)
-    - Policy Data (including 'age_at_entry', 'policy_term', 'sum_assured', 'duration_mth')
-    - Present Values - Base Scenario
-    - Present Values - Lapse Stress
-    - Present Values - Mortality Stress
+    - Cashflows - Base Scenario (index = policy_id, columns = time periods)
+    - Cashflows - Lapse Stress (+50%) (index = policy_id)
+    - Cashflows - Mortality Stress (+15%) (index = policy_id)
+    - Policy Data (index = policy_id; columns include 'age_at_entry', 'policy_term', 'sum_assured', 'duration_mth')
+    - Present Values - Base Scenario (index = policy_id, columns = PV components such as 'PV_NetCF')
+    - Present Values - Lapse Stress (index = policy_id)
+    - Present Values - Mortality Stress (index = policy_id)
+
+    *Note: ensure 'policy_id' is the index of every input file for correct processing.*
     """)
 
     with gr.Row():
@@ -404,7 +503,11 @@ def create_interface():
                     attr_cashflow_plot_out = gr.Image(label="Cashflow Value Comparisons (Actual vs. Estimate) Across Scenarios")
                     attr_scatter_cashflows_base_out = gr.Image(label="Scatter Plot - Per-Cluster Cashflows (Base Scenario)")
                     with gr.Accordion("Present Value Comparisons (Total)", open=False):
-                        attr_total_pv_base_out = gr.Dataframe(label="PVs - Base Scenario Total")
+                        with gr.Row():  # Row wrapper for consistency with the other tabs
+                            attr_total_pv_base_out = gr.Dataframe(label="PVs - Base Scenario Total")
+                            # Placeholders for the other scenarios, if added later:
+                            # attr_total_pv_lapse_out = gr.Dataframe(label="PVs - Lapse Stress Total")
+                            # attr_total_pv_mort_out = gr.Dataframe(label="PVs - Mortality Stress Total")
 
                 with gr.TabItem("💰 Present Value Calibration"):
                     gr.Markdown("### Results: Using Present Values (Base Scenario) as Calibration Variables")
@@ -441,24 +544,28 @@ def create_interface():
         files = [f1, f2, f3, f4, f5, f6, f7]
 
         file_paths = []
+        # Fail fast if any file slot is empty (mixing uploads and examples is not supported).
+        if any(f_obj is None for f_obj in files):
+            gr.Error("Missing file input for one or more fields. Please upload all required files or load the complete example dataset.")
+            return [None] * len(get_all_output_components())
+
         for i, f_obj in enumerate(files):
-            if f_obj is None:
-                gr.Error(f"Missing file input for argument {i+1}. Please upload all files or load examples.")
-                return [None] * len(get_all_output_components())
-
-            # If f_obj is a Gradio FileData object (from direct upload)
-            if hasattr(f_obj, 'name') and isinstance(f_obj.name, str):
+            # f_obj is a Gradio file wrapper (upload) or a plain str (example load).
+            if hasattr(f_obj, 'name') and isinstance(f_obj.name, str):
                 file_paths.append(f_obj.name)
-            # If f_obj is already a string path (from example load)
-            elif isinstance(f_obj, str):
-                file_paths.append(f_obj)
-            else:
+            elif isinstance(f_obj, str):  # path from example load
+                file_paths.append(f_obj)
+            else:  # should not happen if inputs are Files or paths
                 gr.Error(f"Invalid file input for argument {i+1}. Type: {type(f_obj)}")
                 return [None] * len(get_all_output_components())
 
         results = process_files(*file_paths)
 
-        if "error" in results:
+        if "error" in results:  # process_files returned an error dict; the message was already shown
             return [None] * len(get_all_output_components())
 
         return [
@@ -485,18 +592,48 @@ def create_interface():
 
     # --- Action for Load Example Data Button ---
     def load_example_files():
-        missing_files = [fp for fp in EXAMPLE_FILES.values() if not os.path.exists(fp)]
-        if missing_files:
-            gr.Error(f"Missing example data files in '{EXAMPLE_DATA_DIR}': {', '.join(missing_files)}. Please ensure they exist.")
-            return [None] * 7
+        # Create minimal dummy example files when any are missing, so the demo can still run.
+        os.makedirs(EXAMPLE_DATA_DIR, exist_ok=True)  # ensure the directory exists
+
+        missing_files = []
+        for key, fp in EXAMPLE_FILES.items():
+            if not os.path.exists(fp):
+                missing_files.append(fp)
+                try:
+                    dummy_df_data = {'policy_id': [1, 2, 3], 'col1': [0.1, 0.2, 0.3], 'col2': [10, 20, 30]}
+                    if "cashflow" in key or "pv" in key:  # time-series-like files
+                        dummy_df_data = {'policy_id': [1, 2, 3], '0': [1, 2, 3], '1': [4, 5, 6]}
+                    elif "policy_data" in key:
+                        dummy_df_data = {'policy_id': [1, 2, 3], 'age_at_entry': [20, 30, 40], 'policy_term': [10, 20, 15],
+                                         'sum_assured': [1000, 2000, 1500], 'duration_mth': [5, 10, 7]}
+
+                    dummy_df = pd.DataFrame(dummy_df_data).set_index('policy_id')
+                    dummy_df.to_excel(fp)
+                    gr.Warning(f"Example file '{fp}' was missing and a dummy file has been created. Results may not be meaningful.")
+                except Exception as e:
+                    gr.Warning(f"Could not create dummy file for {fp}: {e}")
+
+        if missing_files and not all(os.path.exists(fp) for fp in EXAMPLE_FILES.values()):  # re-check after dummy creation
+            gr.Error(f"Critical example data files are missing from '{EXAMPLE_DATA_DIR}': {', '.join(missing_files)}. Please ensure they exist or check permissions.")
+            return [None] * 7  # one None per file input
 
         gr.Info("Example data paths loaded. Click 'Analyze Dataset'.")
+        # Return updated File components pointing at the example paths.
         return [
-            EXAMPLE_FILES["cashflow_base"], EXAMPLE_FILES["cashflow_lapse"], EXAMPLE_FILES["cashflow_mort"],
-            EXAMPLE_FILES["policy_data"], EXAMPLE_FILES["pv_base"], EXAMPLE_FILES["pv_lapse"],
-            EXAMPLE_FILES["pv_mort"]
+            gr.File(value=EXAMPLE_FILES["cashflow_base"], label=cashflow_base_input.label),
+            gr.File(value=EXAMPLE_FILES["cashflow_lapse"], label=cashflow_lapse_input.label),
+            gr.File(value=EXAMPLE_FILES["cashflow_mort"], label=cashflow_mort_input.label),
+            gr.File(value=EXAMPLE_FILES["policy_data"], label=policy_data_input.label),
+            gr.File(value=EXAMPLE_FILES["pv_base"], label=pv_base_input.label),
+            gr.File(value=EXAMPLE_FILES["pv_lapse"], label=pv_lapse_input.label),
+            gr.File(value=EXAMPLE_FILES["pv_mort"], label=pv_mort_input.label)
        ]
 
    load_example_btn.click(
        load_example_files,
        inputs=[],
@@ -509,8 +646,30 @@ def create_interface():
 if __name__ == "__main__":
     if not os.path.exists(EXAMPLE_DATA_DIR):
         os.makedirs(EXAMPLE_DATA_DIR)
-        print(f"Created directory '{EXAMPLE_DATA_DIR}'. Please place example Excel files there.")
-        print(f"Expected files in '{EXAMPLE_DATA_DIR}': {list(EXAMPLE_FILES.values())}")
+        print(f"Created directory '{EXAMPLE_DATA_DIR}'. Place example Excel files there, or dummies will be generated.")
+
+    # Simple check, with dummy-file creation when example data is not present.
+    for key, fp in EXAMPLE_FILES.items():
+        if not os.path.exists(fp):
+            print(f"Example file {fp} not found. Attempting to create a dummy file.")
+            try:
+                dummy_df_data = {'policy_id': [1, 2, 3], 'col1': [0.1, 0.2, 0.3], 'col2': [10, 20, 30]}
+                if "cashflow" in key or "pv" in key:
+                    dummy_df_data = {f'{i}': np.random.rand(3) for i in range(10)}  # 10 time periods
+                    dummy_df_data['policy_id'] = [f'P{j}' for j in range(3)]
+                elif "policy_data" in key:
+                    dummy_df_data = {'policy_id': [f'P{j}' for j in range(3)],
+                                     'age_at_entry': np.random.randint(20, 50, 3),
+                                     'policy_term': np.random.randint(10, 30, 3),
+                                     'sum_assured': np.random.randint(10000, 50000, 3),
+                                     'duration_mth': np.random.randint(1, 120, 3)}
+
+                dummy_df = pd.DataFrame(dummy_df_data).set_index('policy_id')
+                dummy_df.to_excel(fp)
+                print(f"Dummy file for '{fp}' created.")
+            except Exception as e:
+                print(f"Could not create dummy file for {fp}: {e}")
 
     demo_app = create_interface()
     demo_app.launch()
 