Spaces:

alidenewade
/

model-point-clustering

Sleeping

App Files Files Community

alidenewade committed on May 22

Commit

7e17387

verified ·

1 Parent(s): e54138b

Create app.py

Browse files

Files changed (1) hide show

app.py +297 -0

app.py ADDED Viewed

	@@ -0,0 +1,297 @@

+import gradio as gr
+import pandas as pd
+import numpy as np
+from sklearn.cluster import KMeans
+from sklearn.metrics import r2_score, pairwise_distances_argmin_min
+import matplotlib.pyplot as plt
+import io
# Clustering modes offered by the UI dropdown.
_CLUSTERING_CHOICES = ("Net Cashflows", "Policy Attributes", "Present Values")
# Actuaries: Adjust these column names to match your policy_data.xlsx file.
_POLICY_ATTRIBUTE_COLUMNS = ['IssueAge', 'PolicyTerm', 'SumAssured', 'Duration']
# Actuaries: Adjust this column name to match your pv_data.xlsx file.
_PV_COLUMN = 'PV_Net_CF'


def _error_result(message):
    """Build the 4-slot result for a failed run: no files/images, message in the text slot."""
    return None, None, None, message


def _upload_path(uploaded):
    """Return the filesystem path of an upload given either a path or an object with `.name`."""
    import os

    if uploaded is None:
        raise ValueError("no file was uploaded")
    if isinstance(uploaded, (str, os.PathLike)):
        return uploaded
    return uploaded.name


def _save_figure_png(fig):
    """Write `fig` to a temporary PNG file, close the figure, and return the file path."""
    import tempfile

    try:
        # delete=False: the file must outlive this function so the UI can read it.
        with tempfile.NamedTemporaryFile(suffix=".png", delete=False) as handle:
            # Save the figure object itself rather than pyplot's global
            # "current figure", which is not safe with concurrent requests.
            fig.savefig(handle, format="png")
        return handle.name
    finally:
        plt.close(fig)  # Always release the figure's memory.


def run_cluster_analysis(
    policy_data_file,
    cashflow_data_file,
    pv_data_file,
    num_clusters,
    clustering_variable_choice
):
    """
    Performs cluster analysis for model point selection and generates comparative outputs.

    The three tables are matched by row position: row i of the policy data,
    cashflow data and present value data must describe the same policy.

    Args:
        policy_data_file: Path (or object exposing the path as `.name`) of the Excel file
                          with policy attributes (e.g., issue age, policy term).
        cashflow_data_file: Same, for the seriatim cashflow data (first column is the index).
        pv_data_file: Same, for the seriatim present value data (first column is the index).
        num_clusters: The desired number of representative model points (k for K-means).
        clustering_variable_choice: The type of variables to use for clustering
                                    ("Net Cashflows", "Policy Attributes", "Present Values").
    Returns:
        A 4-tuple matching the UI outputs:
        - Path of a CSV file with the selected model points and their weights.
        - Path of a PNG image of the cashflow comparison plot.
        - Path of a PNG image of the present value comparison plot.
        - A string summarizing key accuracy metrics.
        On failure the first three entries are None and the fourth entry holds
        the error message.
    """
    import tempfile

    # Cheap validation first, before any file is opened.
    if clustering_variable_choice not in _CLUSTERING_CHOICES:
        return _error_result("Invalid clustering variable choice.")

    # --- 1. Load Data ---
    # Actuaries: Please ensure your Excel files have the correct format and column names.
    try:
        # Policy data keeps its default positional index.
        policy_data = pd.read_excel(_upload_path(policy_data_file))
        # Cashflows: policies as rows, projection periods as columns, first column as index.
        cashflow_data = pd.read_excel(_upload_path(cashflow_data_file), index_col=0)
        # Present values: policies as rows, PV components as columns, first column as index.
        pv_data = pd.read_excel(_upload_path(pv_data_file), index_col=0)
    except Exception as e:  # Boundary with user-supplied files: report, do not crash.
        return _error_result(
            f"Error loading files: {e}. Please ensure you upload valid Excel files "
            "with appropriate data (e.g., policy IDs as index for cashflows/PVs)."
        )

    # The tables are combined by row position below, so they must be the same length.
    n_policies = len(policy_data)
    if n_policies == 0:
        return _error_result("Policy Data contains no rows.")
    if len(cashflow_data) != n_policies or len(pv_data) != n_policies:
        return _error_result(
            "Policy, cashflow and present value data must contain the same number of rows "
            f"(got {n_policies}, {len(cashflow_data)} and {len(pv_data)})."
        )
    if cashflow_data.shape[1] == 0 or pv_data.shape[1] == 0:
        return _error_result("Cashflow and present value data must each contain at least one data column.")

    # --- 2. Data Preparation for Clustering ---
    if clustering_variable_choice == "Policy Attributes":
        if not all(col in policy_data.columns for col in _POLICY_ATTRIBUTE_COLUMNS):
            return _error_result(
                f"Missing expected columns in Policy Data for 'Policy Attributes' clustering. "
                f"Expected: {_POLICY_ATTRIBUTE_COLUMNS}. Please adjust your file or the code."
            )
        X = policy_data[_POLICY_ATTRIBUTE_COLUMNS]
    elif clustering_variable_choice == "Net Cashflows":
        # The full cashflow series is used as the clustering feature vector.
        X = cashflow_data.fillna(0)
    else:  # "Present Values"
        if _PV_COLUMN not in pv_data.columns:
            return _error_result(
                f"Missing expected column '{_PV_COLUMN}' in Present Value Data for 'Present Values' clustering. "
                "Please adjust your file or the code."
            )
        X = pv_data[[_PV_COLUMN]]

    # The slider may deliver a float; K-means needs an integer cluster count.
    k = int(num_clusters)
    if k < 1 or k > n_policies:
        return _error_result(
            f"Number of model points must be between 1 and the number of policies ({n_policies}); got {k}."
        )

    # --- 3. K-means Clustering ---
    try:
        # Standardize so that features with large values do not dominate the distance.
        X_scaled = (X - X.mean()) / X.std()
        # Columns with zero standard deviation (and missing inputs) become NaN; treat as 0.
        X_scaled = X_scaled.fillna(0)
        kmeans = KMeans(n_clusters=k, random_state=42, n_init='auto')
        kmeans.fit(X_scaled)
    except Exception as e:
        return _error_result(f"Error during K-means clustering: {e}")
    labels = kmeans.labels_

    # --- 4. Select Representative Model Points and Calculate Weights ---
    # Row position of the policy closest to each centroid, in centroid order.
    rep_positions = pairwise_distances_argmin_min(kmeans.cluster_centers_, X_scaled)[0]
    # Weight of centroid j = number of policies assigned to cluster j. Counting by
    # centroid index keeps the weights summing to the portfolio size even if a
    # representative policy happens to be labelled with a different cluster.
    weights = np.bincount(labels, minlength=k)
    model_points = policy_data.iloc[rep_positions].copy()
    model_points['Cluster'] = np.arange(k)
    model_points['Weight'] = weights

    # --- 5. Aggregate Cashflows and Present Values for comparison ---
    total_seriatim_cashflows = cashflow_data.sum(axis=0)
    total_seriatim_pvs = pv_data.sum(axis=0)
    # Positional selection: policy_data has a positional index while the cashflow/PV
    # tables are indexed by policy ID, so label-based lookup would pick wrong rows.
    proxy_cashflows = cashflow_data.iloc[rep_positions].multiply(weights, axis=0).sum(axis=0)
    proxy_pvs = pv_data.iloc[rep_positions].multiply(weights, axis=0).sum(axis=0)

    # Compare the designated total-PV column when present, else the first PV column.
    pv_label = _PV_COLUMN if _PV_COLUMN in pv_data.columns else pv_data.columns[0]
    seriatim_total_pv_val = total_seriatim_pvs[pv_label]
    proxy_total_pv_val = proxy_pvs[pv_label]

    # --- 6. Generate Outputs ---
    # The download component needs a file on disk, not CSV text.
    with tempfile.NamedTemporaryFile(
        mode="w", prefix="model_points_", suffix=".csv",
        delete=False, newline="", encoding="utf-8"
    ) as csv_file:
        model_points.to_csv(csv_file, index=False)
    model_points_output = csv_file.name

    # --- Plotting Aggregated Cashflows ---
    fig_cf, ax_cf = plt.subplots(figsize=(10, 6))
    total_seriatim_cashflows.plot(ax=ax_cf, label='Seriatim Cashflows', color='blue')
    proxy_cashflows.plot(ax=ax_cf, label='Proxy Cashflows', linestyle='--', color='orange')
    ax_cf.set_title('Aggregated Cashflows: Seriatim vs. Proxy')
    ax_cf.set_xlabel('Projection Period')
    ax_cf.set_ylabel('Cashflow Amount')
    ax_cf.legend()
    ax_cf.grid(True)
    img_cf = _save_figure_png(fig_cf)

    # --- Plotting Aggregated Present Values ---
    fig_pv, ax_pv = plt.subplots(figsize=(8, 5))
    pv_comparison_data = pd.DataFrame({
        'Seriatim PV': seriatim_total_pv_val,
        'Proxy PV': proxy_total_pv_val
    }, index=['Total PV'])  # Single-row frame so both totals plot as side-by-side bars
    pv_comparison_data.plot(kind='bar', ax=ax_pv, color=['blue', 'orange'])
    ax_pv.set_title('Aggregated Present Values: Seriatim vs. Proxy')
    ax_pv.set_ylabel('Present Value')
    ax_pv.tick_params(axis='x', rotation=45)
    ax_pv.legend()
    ax_pv.grid(axis='y')
    img_pv = _save_figure_png(fig_pv)

    # --- Accuracy Metrics ---
    # R-squared of proxy vs. seriatim aggregated cashflows (both share the same periods).
    r2_cf = r2_score(total_seriatim_cashflows.to_numpy(), proxy_cashflows.to_numpy())
    # Absolute percentage error for the aggregated present value.
    if seriatim_total_pv_val == 0:
        pv_error_percent = float('inf')  # Handle division by zero
    else:
        pv_error_percent = abs((proxy_total_pv_val - seriatim_total_pv_val) / seriatim_total_pv_val) * 100
    metrics_output = (
        f"--- Accuracy Metrics ---\n"
        f"R-squared (Aggregated Cashflows): {r2_cf:.4f}\n"
        f"Absolute Percentage Error (Aggregated Present Value): {pv_error_percent:.4f}%\n\n"
        f"Note: The acceptable error percentage for Present Value should be specified in practice (e.g., 1%).\n"
        f"For better accuracy, consider trying different 'Number of Model Points' and 'Clustering Variables'."
    )
    return model_points_output, img_cf, img_pv, metrics_output
# Gradio Interface Setup
# Minimal base theme with an orange accent. Hues must be names from Gradio's
# colour palette ("black" is not one), so a neutral grey is used as the secondary hue
# and the theme's built-in default font is kept.
with gr.Blocks(theme=gr.themes.Base(primary_hue="orange", secondary_hue="neutral")) as demo:
    gr.Markdown("# <center> Actuarial Model Point Selection using Cluster Analysis </center>")
    gr.Markdown("This app helps actuaries select representative model points from a large portfolio "
                "using K-means clustering.")
    gr.Markdown("Upload your policy data, cashflow data, and present value data. "
                "Then, configure the clustering parameters to generate representative model points and "
                "analyze the accuracy of the proxy portfolio.")
    with gr.Row():
        with gr.Column():
            gr.Markdown("### Input Data (Excel Files)")
            # The file component takes no `info` argument, so the help text for
            # each upload is rendered as Markdown directly above it.
            gr.Markdown("Contains policy attributes like Issue Age, Policy Term, Sum Assured, Duration.")
            policy_data_input = gr.File(
                label="Upload Policy Data (e.g., policy_data.xlsx)",
                file_types=[".xlsx", ".xls"],
                type="filepath"
            )
            gr.Markdown("Net annual cashflows for each seriatim policy over projection periods. "
                        "Policies as rows, periods as columns. First column as Policy ID/Index.")
            cashflow_data_input = gr.File(
                label="Upload Base Scenario Cashflow Data (e.g., cashflows_seriatim_10K.xlsx)",
                file_types=[".xlsx", ".xls"],
                type="filepath"
            )
            gr.Markdown("Present values for each seriatim policy. "
                        "Policies as rows, PV components as columns. First column as Policy ID/Index. "
                        "Expected column for total PV: 'PV_Net_CF'.")
            pv_data_input = gr.File(
                label="Upload Base Scenario Present Value Data (e.g., pvs_seriatim_10K.xlsx)",
                file_types=[".xlsx", ".xls"],
                type="filepath"
            )
        with gr.Column():
            gr.Markdown("### Clustering Parameters")
            num_clusters_input = gr.Slider(
                minimum=10,
                maximum=2000,  # A reasonable range for 10,000 policies, can be adjusted
                value=1000,    # Default: 1000 model points out of 10,000 policies
                step=10,
                label="Number of Representative Model Points (k)",
                info="This determines the size of the proxy portfolio. Higher values may increase accuracy but reduce efficiency."
            )
            clustering_variable_choice_input = gr.Dropdown(
                choices=["Net Cashflows", "Policy Attributes", "Present Values"],
                value="Present Values",
                label="Variables for Clustering",
                info="The choice of variables significantly impacts results. "
                     "The chosen variables are more accurately estimated by the proxy portfolio."
            )
            process_button = gr.Button("Run Cluster Analysis")
    with gr.Tab("Results"):
        gr.Markdown("### Selected Model Points")
        gr.Markdown("Download the CSV below to get the representative model points and their assigned weights. "
                    "These can be used to construct a proxy portfolio.")
        model_points_output = gr.File(label="Download Selected Model Points (CSV)", file_types=[".csv"])
        gr.Markdown("### Aggregated Cashflows Comparison")
        gr.Markdown("This plot compares the total aggregated cashflows from your original seriatim portfolio "
                    "against the aggregated cashflows generated by the selected proxy model points.")
        cashflow_plot_output = gr.Image(label="Seriatim vs. Proxy Aggregated Cashflows")
        gr.Markdown("### Aggregated Present Values Comparison")
        gr.Markdown("This plot compares the total aggregated present values of your original seriatim portfolio "
                    "against the aggregated present values generated by the selected proxy model points.")
        pv_plot_output = gr.Image(label="Seriatim vs. Proxy Aggregated Present Values")
        gr.Markdown("### Accuracy Summary")
        gr.Markdown("Key metrics to assess how well the proxy portfolio represents the seriatim portfolio.")
        metrics_output = gr.Textbox(label="Key Accuracy Metrics", lines=7)
    # Inputs are passed to run_cluster_analysis positionally in this order, and its
    # 4-tuple result is unpacked into the outputs in this order.
    process_button.click(
        run_cluster_analysis,
        inputs=[
            policy_data_input,
            cashflow_data_input,
            pv_data_input,
            num_clusters_input,
            clustering_variable_choice_input
        ],
        outputs=[
            model_points_output,
            cashflow_plot_output,
            pv_plot_output,
            metrics_output
        ]
    )

# Start the server only when run as a script, so importing this module
# (e.g. from a test or another app) does not launch it.
if __name__ == "__main__":
    demo.launch(debug=True)  # debug=True gives more verbose output for local testing