Spaces:

alidenewade
/

model-point-clustering

Sleeping

App Files Files Community

alidenewade committed on May 22

Commit

ee9fc3c

verified ·

1 Parent(s): 91e876f

Update app.py

Browse files

Files changed (1) hide show

app.py +106 -248

app.py CHANGED Viewed

@@ -5,280 +5,139 @@ from sklearn.cluster import KMeans
 from sklearn.metrics import r2_score, pairwise_distances_argmin_min
 import matplotlib.pyplot as plt
 import io
-import os # For checking file extensions
 def cluster_analysis(policy_file, cashflow_file, pv_file, num_clusters):
-    """
-    Performs cluster analysis for actuarial model point selection.
-    Accepts both Excel and CSV files.
-    Args:
-        policy_file: Gradio File object for policy data.
-        cashflow_file: Gradio File object for cashflow data.
-        pv_file: Gradio File object for present value data.
-        num_clusters: Number of clusters (model points) to generate.
-    Returns:
-        A tuple: (csv_data_string, cashflow_plot_bytes, pv_plot_bytes, metrics_text_string)
-        Returns (None, None, None, error_message_string) if an error occurs.
-    """
-    # Initialize outputs to None or empty strings for Gradio components
-    csv_data = ""
-    cashflow_plot = None
-    pv_plot = None
-    metrics_text = "Starting analysis..." # Initial status message
-    # --- Start of detailed logging ---
-    print("\n" + "="*50)
-    print(f"[{pd.Timestamp.now()}] --- cluster_analysis function called ---")
-    print(f"Received num_clusters: {num_clusters}")
-    print(f"Policy file received: {policy_file.name if policy_file else 'None'}")
-    print(f"Cashflow file received: {cashflow_file.name if cashflow_file else 'None'}")
-    print(f"PV file received: {pv_file.name if pv_file else 'None'}")
-    print("="*50 + "\n")
-    # Helper function to read files based on extension
-    def read_data_file(file_obj, index_col=None):
-        if file_obj is None:
-            raise ValueError("File object is None.")
-        file_path = file_obj.name
-        file_extension = os.path.splitext(file_path)[1].lower()
-        if file_extension in ['.xlsx', '.xls']:
-            print(f"Attempting to read Excel file: {file_path}")
-            return pd.read_excel(file_path, index_col=index_col)
-        elif file_extension == '.csv':
-            print(f"Attempting to read CSV file: {file_path}")
-            # Consider adding 'sep' argument if CSV delimiter is not comma, e.g., sep=';'
-            return pd.read_csv(file_path, index_col=index_col)
-        else:
-            raise ValueError(f"Unsupported file type: {file_extension}. Please upload .xlsx, .xls, or .csv files.")
-    # 1. Basic checks and file reading
     try:
-        policy_df = read_data_file(policy_file)
-        # index_col=0 is crucial. Ensure the first column contains unique policy identifiers.
-        cashflow_df = read_data_file(cashflow_file, index_col=0)
-        pv_df = read_data_file(pv_file, index_col=0)
-        print(f"[{pd.Timestamp.now()}] Files read successfully.")
-        print(f"Policy data shape: {policy_df.shape}, Columns: {policy_df.columns.tolist()}")
-        print(f"Cashflow data shape: {cashflow_df.shape}, Index type: {cashflow_df.index.dtype}")
-        print(f"PV data shape: {pv_df.shape}, Index type: {pv_df.index.dtype}")
     except Exception as e:
-        error_msg = f"Error reading files: {e}"
-        print(f"[{pd.Timestamp.now()}] {error_msg}")
-        return (None, None, None, error_msg)
-    # 2. Validate Policy Data Columns
     required_cols = ['IssueAge', 'PolicyTerm', 'SumAssured', 'Duration']
-    # Strip whitespace from column names for robust matching
-    policy_df.columns = policy_df.columns.str.strip()
     if not all(col in policy_df.columns for col in required_cols):
-        found_cols = policy_df.columns.tolist()
-        error_msg = (f"Policy data missing required columns. Expected: {required_cols}. "
-                     f"Found: {found_cols}. Please check your policy data column headers for typos or extra spaces.")
-        print(f"[{pd.Timestamp.now()}] {error_msg}")
-        return (None, None, None, error_msg)
-    # 3. Prepare data for clustering
-    try:
-        X = policy_df[required_cols].fillna(0)
-        # Scale data, handle cases where std is 0 (e.g., all values are the same for a feature)
-        # Add a small epsilon to avoid division by zero if all values are identical
-        X_scaled = X.apply(lambda x: (x - x.mean()) / (x.std() if x.std() != 0 else 1e-9), axis=0)
-        print(f"[{pd.Timestamp.now()}] Policy attributes scaled.")
-    except Exception as e:
-        error_msg = f"Error preparing data for clustering: {e}"
-        print(f"[{pd.Timestamp.now()}] {error_msg}")
-        return (None, None, None, error_msg)
-    # 4. Perform Clustering
     try:
-        n_samples = X_scaled.shape[0]
-        if num_clusters <= 1:
-            error_msg = "Number of clusters must be at least 2."
-            print(f"[{pd.Timestamp.now()}] {error_msg}")
-            return (None, None, None, error_msg)
-        if num_clusters > n_samples:
-            original_num_clusters = num_clusters
-            num_clusters = n_samples # Adjust if clusters > samples
-            print(f"[{pd.Timestamp.now()}] Warning: Number of clusters ({original_num_clusters}) "
-                  f"exceeded number of samples ({n_samples}). Reduced to {num_clusters}.")
-        # Use 'auto' for n_init for newer scikit-learn versions
-        kmeans = KMeans(n_clusters=num_clusters, random_state=42, n_init='auto')
         kmeans.fit(X_scaled)
         policy_df['Cluster'] = kmeans.labels_
-        print(f"[{pd.Timestamp.now()}] Clustering successful with {num_clusters} clusters.")
     except Exception as e:
-        error_msg = f"Clustering error: {e}"
-        print(f"[{pd.Timestamp.now()}] {error_msg}")
-        return (None, None, None, error_msg)
-    # 5. Select Model Points and Calculate Weights
-    try:
-        centers = kmeans.cluster_centers_
-        closest, _ = pairwise_distances_argmin_min(centers, X_scaled)
-        model_points = policy_df.iloc[closest].copy()
-        counts = policy_df['Cluster'].value_counts()
-        model_points['Weight'] = model_points['Cluster'].map(counts)
-        print(f"[{pd.Timestamp.now()}] Model points selected and weights calculated. Model points shape: {model_points.shape}")
-        print(f"Model points indices (first 5): {model_points.index.tolist()[:5]}...")
-    except Exception as e:
-        error_msg = f"Error selecting model points or calculating weights: {e}"
-        print(f"[{pd.Timestamp.now()}] {error_msg}")
-        return (None, None, None, error_msg)
-    # 6. Generate Model Points CSV for display
     csv_buffer = io.StringIO()
-    model_points.to_csv(csv_buffer, index=True) # index=True to include the policy_id column
     csv_data = csv_buffer.getvalue()
-    print(f"[{pd.Timestamp.now()}] Model points CSV generated.")
-    # 7. Aggregate cashflows
-    try:
-        # Check if all model_points indices exist in cashflow_df and pv_df
-        missing_cf_indices = [idx for idx in model_points.index if idx not in cashflow_df.index]
-        if missing_cf_indices:
-            raise KeyError(f"Cashflow data is missing entries for model point indices: {missing_cf_indices[:5]}... Please check Cashflow data's first column (index).")
-        proxy_cashflows = cashflow_df.loc[model_points.index].multiply(model_points['Weight'], axis=0).sum()
-        seriatim_cashflows = cashflow_df.sum()
-        print(f"[{pd.Timestamp.now()}] Cashflows aggregated.")
-    except KeyError as e:
-        error_msg = (f"Key Error during cashflow aggregation. "
-                     f"Ensure the first column of your Cashflow Excel/CSV file contains policy IDs "
-                     f"that match the indices from your Policy data: {e}")
-        print(f"[{pd.Timestamp.now()}] {error_msg}")
-        return (None, None, None, error_msg)
-    except Exception as e:
-        error_msg = f"Error aggregating cashflows: {e}"
-        print(f"[{pd.Timestamp.now()}] {error_msg}")
-        return (None, None, None, error_msg)
-    # 8. Plot aggregated cashflows
-    try:
-        fig, ax = plt.subplots(figsize=(10, 5)) # Slightly larger plot
-        seriatim_cashflows.plot(ax=ax, label='Seriatim Cashflows', marker='.', linestyle='-')
-        proxy_cashflows.plot(ax=ax, label='Proxy Cashflows', marker='x', linestyle='--')
-        ax.set_title('Aggregated Cashflows Comparison')
-        ax.set_xlabel('Period')
-        ax.set_ylabel('Cashflow Amount')
-        ax.legend()
-        ax.grid(True, linestyle=':', alpha=0.7)
-        plt.tight_layout() # Adjust layout to prevent labels from overlapping
-        buf = io.BytesIO()
-        plt.savefig(buf, format='png')
-        plt.close(fig)
-        buf.seek(0)
-        cashflow_plot = buf.read()
-        print(f"[{pd.Timestamp.now()}] Cashflow plot generated.")
-    except Exception as e:
-        error_msg = f"Error generating cashflow plot: {e}"
-        print(f"[{pd.Timestamp.now()}] {error_msg}")
-        return (None, None, None, error_msg)
-    # 9. Aggregate present values
-    try:
-        missing_pv_indices = [idx for idx in model_points.index if idx not in pv_df.index]
-        if missing_pv_indices:
-            raise KeyError(f"PV data is missing entries for model point indices: {missing_pv_indices[:5]}... Please check PV data's first column (index).")
-        # Assuming PV data has only one column or the relevant column is the first one
-        proxy_pv = pv_df.loc[model_points.index].multiply(model_points['Weight'], axis=0).sum().values[0]
-        seriatim_pv = pv_df.sum().values[0] # Assuming total PV is in the first column
-        print(f"[{pd.Timestamp.now()}] Present Values aggregated.")
-    except KeyError as e:
-        error_msg = (f"Key Error during PV aggregation. "
-                     f"Ensure the first column of your PV Excel/CSV file contains policy IDs "
-                     f"that match the indices from your Policy data: {e}")
-        print(f"[{pd.Timestamp.now()}] {error_msg}")
-        return (None, None, None, error_msg)
-    except Exception as e:
-        error_msg = f"Error aggregating present values: {e}"
-        print(f"[{pd.Timestamp.now()}] {error_msg}")
-        return (None, None, None, error_msg)
-    # 10. Present Value comparison plot (bar)
-    try:
-        fig2, ax2 = plt.subplots(figsize=(6, 5)) # Adjust size
-        ax2.bar(['Seriatim PV', 'Proxy PV'], [seriatim_pv, proxy_pv], color=['#1f77b4', '#ff7f0e']) # Use nicer colors
-        ax2.set_title('Aggregated Present Values Comparison')
-        ax2.set_ylabel('Present Value')
-        ax2.grid(axis='y', linestyle=':', alpha=0.7)
-        plt.tight_layout()
-        buf2 = io.BytesIO()
-        plt.savefig(buf2, format='png')
-        plt.close(fig2)
-        buf2.seek(0)
-        pv_plot = buf2.read()
-        print(f"[{pd.Timestamp.now()}] PV plot generated.")
-    except Exception as e:
-        error_msg = f"Error generating PV plot: {e}"
-        print(f"[{pd.Timestamp.now()}] {error_msg}")
-        return (None, None, None, error_msg)
-    # 11. Calculate Accuracy metrics
-    try:
-        common_idx = seriatim_cashflows.index.intersection(proxy_cashflows.index)
-        if common_idx.empty:
-            r2 = float('nan') # Cannot compute R2 if no common cashflow periods
-            print(f"[{pd.Timestamp.now()}] Warning: No common indices for R-squared calculation.")
-        else:
-            r2 = r2_score(seriatim_cashflows.loc[common_idx], proxy_cashflows.loc[common_idx])
-        # Handle division by zero for PV error
-        pv_error = abs(proxy_pv - seriatim_pv) / seriatim_pv * 100 if seriatim_pv != 0 else float('inf')
-        metrics_text = (
-            f"R-squared for aggregated cashflows: {r2:.4f}\n"
-            f"Absolute percentage error in present value: {pv_error:.4f}%"
-        )
-        print(f"[{pd.Timestamp.now()}] Accuracy metrics calculated.")
-    except Exception as e:
-        error_msg = f"Error calculating accuracy metrics: {e}"
-        print(f"[{pd.Timestamp.now()}] {error_msg}")
-        return (None, None, None, error_msg)
-    print(f"[{pd.Timestamp.now()}] --- cluster_analysis completed successfully ---")
     return csv_data, cashflow_plot, pv_plot, metrics_text
-# --- Gradio Interface ---
 with gr.Blocks() as demo:
-    gr.Markdown("# Actuarial Model Point Selection")
-    gr.Markdown("""
-    This application performs cluster analysis on policy data to select representative model points.
-    It then aggregates cashflows and present values based on these model points and compares them
-    to the seriatim (full portfolio) results, providing accuracy metrics and visualizations.
-    **Instructions:**
-    1.  **Upload Policy Data (Excel or CSV file):** Ensure it contains columns named exactly `IssueAge`, `PolicyTerm`, `SumAssured`, and `Duration`. **Crucially, double-check for leading/trailing spaces in column names.** The first column can be a unique policy identifier (though not explicitly used for clustering, it helps with index matching).
-    2.  **Upload Cashflow Data (Excel or CSV file):** The **first column** of this file must be a unique policy identifier (e.g., `policy_id`), and this column will be used as the DataFrame index. The remaining columns should represent cashflow periods (e.g., `CF_Year_1`, `CF_Year_2`).
-    3.  **Upload Present Value Data (Excel or CSV file):** The **first column** of this file must also be a unique policy identifier, matching the policy data's identifiers. The second column should contain the present value for each policy.
-    4.  Adjust the 'Number of Model Points' using the slider.
-    5.  Click 'Run Clustering'.
-    """)
     with gr.Row():
         with gr.Column():
-            # Updated file_types to include CSV
-            policy_input = gr.File(label="1. Upload Policy Data (Excel/CSV)", file_types=[".xlsx", ".xls", ".csv"])
-            cashflow_input = gr.File(label="2. Upload Cashflow Data (Excel/CSV, first column is Policy ID)", file_types=[".xlsx", ".xls", ".csv"])
-            pv_input = gr.File(label="3. Upload Present Value Data (Excel/CSV, first column is Policy ID)", file_types=[".xlsx", ".xls", ".csv"])
-            clusters_input = gr.Slider(minimum=2, maximum=100, step=1, value=10, label="4. Number of Model Points")
-            run_btn = gr.Button("Run Clustering", variant="primary")
         with gr.Column():
-            output_csv = gr.Textbox(label="Model Points CSV Output (Scroll to view)", lines=10, interactive=False)
-            cashflow_img = gr.Image(label="Aggregated Cashflows Comparison", interactive=False)
-            pv_img = gr.Image(label="Aggregated Present Values Comparison", interactive=False)
-            metrics_box = gr.Textbox(label="Accuracy Metrics and Status", lines=4, interactive=False)
     run_btn.click(
         cluster_analysis,
@@ -286,6 +145,5 @@ with gr.Blocks() as demo:
         outputs=[output_csv, cashflow_img, pv_img, metrics_box]
     )
-# Launch the Gradio app
-# debug=True provides more console output, useful for local testing
-demo.launch(debug=True)

 from sklearn.metrics import r2_score, pairwise_distances_argmin_min
 import matplotlib.pyplot as plt
 import io
 def _figure_to_png(fig):
     """Render *fig* to PNG bytes and close it, even if rendering fails."""
     buf = io.BytesIO()
     try:
         fig.tight_layout()
         fig.savefig(buf, format='png')
     finally:
         # pyplot keeps a reference to every open figure, so always release it.
         plt.close(fig)
     return buf.getvalue()

 def cluster_analysis(policy_file, cashflow_file, pv_file, num_clusters):
     """Pick k-means model points from policy data and compare them to the full portfolio.

     The three uploads are read as CSV through their ``.name`` path, with the
     first column of each used as the index. Policies are clustered on the
     standardised ``IssueAge``, ``PolicyTerm``, ``SumAssured`` and ``Duration``
     columns; the policy closest to each cluster centre becomes a model point
     weighted by the number of policies in its cluster.

     Args:
         policy_file: Upload object exposing a ``.name`` path to the policy CSV.
         cashflow_file: Upload object for the cashflow CSV (same index).
         pv_file: Upload object for the present-value CSV (same index).
         num_clusters: Number of clusters / model points; cast to ``int``.

     Returns:
         ``(csv_data, cashflow_plot, pv_plot, metrics_text)`` where ``csv_data``
         is the model points as CSV text, the two plots are PNG bytes, and
         ``metrics_text`` reports R-squared and the PV percentage error.
         Handled failures return ``(None, None, None, error_message)``.

     NOTE(review): the plots are returned as raw PNG bytes; confirm that the
     ``gr.Image`` outputs this function feeds accept bytes, since Gradio
     documents numpy arrays, PIL images and file paths as output values.
     """
     # Basic checks and reads
     try:
         # Use policy_file.name which is the path to the temporary file Gradio creates
         policy_df = pd.read_csv(policy_file.name, index_col=0)
         cashflow_df = pd.read_csv(cashflow_file.name, index_col=0)
         pv_df = pd.read_csv(pv_file.name, index_col=0)
     except Exception as e:
         return (None, None, None, f"Error reading CSV files: {e}. Ensure files are CSVs and the first column is the index (e.g., Policy ID).")

     # Policy attributes used for clustering; names must match the policy CSV header.
     required_cols = ['IssueAge', 'PolicyTerm', 'SumAssured', 'Duration']
     missing_cols = [col for col in required_cols if col not in policy_df.columns]
     if missing_cols:
         return (None, None, None, f"Policy data missing required columns: {missing_cols}. Please ensure your policy CSV has these columns.")
     if pv_df.shape[1] == 0:
         # Without a value column there is nothing to aggregate; fail with a message instead of an IndexError.
         return (None, None, None, "Error: PV data has no value columns. Ensure the PV CSV has the Policy ID first, followed by the present value.")

     X = policy_df[required_cols].fillna(0)  # Simple imputation
     # A constant column cannot be standardised (division by zero), so reject it up front.
     X_std = X.std()
     if (X_std == 0).any():
         zero_std_cols = X_std[X_std == 0].index.tolist()
         return (None, None, None, f"Error: Columns {zero_std_cols} have zero standard deviation after fillna(0). Cannot scale these columns. Please check your data.")
     X_scaled = (X - X.mean()) / X_std

     # Cluster
     try:
         kmeans = KMeans(n_clusters=int(num_clusters), random_state=42, n_init=10)
         kmeans.fit(X_scaled)
         policy_df['Cluster'] = kmeans.labels_
     except Exception as e:
         return (None, None, None, f"Clustering error: {e}")

     # Model points are the actual policies nearest to each cluster centre.
     centers = kmeans.cluster_centers_
     closest, _ = pairwise_distances_argmin_min(centers, X_scaled)
     model_points = policy_df.iloc[closest].copy()

     # Each model point stands in for every policy in its cluster.
     counts = policy_df['Cluster'].value_counts()
     model_points['Weight'] = model_points['Cluster'].map(counts)

     # The model point IDs must exist in both result tables before they are looked up.
     if not model_points.index.isin(cashflow_df.index).all():
         return (None, None, None, "Error: Model point indices not found in cashflow data. Ensure Policy IDs match.")
     if not model_points.index.isin(pv_df.index).all():
         return (None, None, None, "Error: Model point indices not found in PV data. Ensure Policy IDs match.")

     # CSV text of the model points (index is written, i.e. the policy ID column).
     csv_buffer = io.StringIO()
     model_points.to_csv(csv_buffer)
     csv_data = csv_buffer.getvalue()

     # Weights are applied positionally (.values) so index alignment cannot reorder them.
     model_points['Weight'] = pd.to_numeric(model_points['Weight'], errors='coerce').fillna(1)
     weights = model_points['Weight'].values
     proxy_cashflows = cashflow_df.loc[model_points.index].multiply(weights, axis=0).sum()
     seriatim_cashflows = cashflow_df.sum()

     # Plot aggregated cashflows
     fig, ax = plt.subplots(figsize=(8,4))
     seriatim_cashflows.plot(ax=ax, label='Seriatim Cashflows')
     proxy_cashflows.plot(ax=ax, label='Proxy Cashflows', linestyle='--')
     ax.set_title('Aggregated Cashflows Comparison')
     ax.legend()
     ax.grid(True)
     cashflow_plot = _figure_to_png(fig)

     # Total present value: summed over every PV column (a single column sums to itself).
     proxy_pv = pv_df.loc[model_points.index].multiply(weights, axis=0).sum().sum()
     seriatim_pv = pv_df.sum().sum()

     # Present Value comparison plot (bar)
     fig2, ax2 = plt.subplots(figsize=(5,4))
     ax2.bar(['Seriatim PV', 'Proxy PV'], [seriatim_pv, proxy_pv], color=['blue', 'orange'])
     ax2.set_title('Aggregated Present Values')
     ax2.grid(axis='y')
     pv_plot = _figure_to_png(fig2)

     # Accuracy metrics
     common_idx = seriatim_cashflows.index.intersection(proxy_cashflows.index)
     if common_idx.empty:
         r2 = float('nan')  # no shared cashflow periods to score
     else:
         r2 = r2_score(seriatim_cashflows.loc[common_idx], proxy_cashflows.loc[common_idx])
     # Divide by the magnitude of the total so a negative portfolio PV still yields a non-negative error.
     if seriatim_pv != 0:
         pv_error = abs(proxy_pv - seriatim_pv) / abs(seriatim_pv) * 100
     else:
         pv_error = float('inf')
     metrics_text = (
         f"R-squared for aggregated cashflows: {r2:.4f}\n"
         f"Absolute percentage error in present value: {pv_error:.4f}%"
     )
     return csv_data, cashflow_plot, pv_plot, metrics_text
 with gr.Blocks() as demo:
+    gr.Markdown("# Actuarial Model Point Selection (CSV Upload)")
     with gr.Row():
         with gr.Column():
+            policy_input = gr.File(label="Upload Policy Data (CSV with PolicyID as first column)")
+            cashflow_input = gr.File(label="Upload Cashflow Data (CSV with PolicyID as first column)")
+            pv_input = gr.File(label="Upload Present Value Data (CSV with PolicyID as first column)")
+            clusters_input = gr.Slider(minimum=2, maximum=100, step=1, value=10, label="Number of Model Points")
+            run_btn = gr.Button("Run Clustering")
         with gr.Column():
+            output_csv = gr.Textbox(label="Model Points CSV Output", lines=10, interactive=False)
+            cashflow_img = gr.Image(label="Aggregated Cashflows Comparison", type="pil") # Using PIL for better compatibility
+            pv_img = gr.Image(label="Aggregated Present Values Comparison", type="pil")
+            metrics_box = gr.Textbox(label="Accuracy Metrics", lines=4, interactive=False)
     run_btn.click(
         cluster_analysis,
         outputs=[output_csv, cashflow_img, pv_img, metrics_box]
     )
+if __name__ == '__main__':
+    demo.launch(debug=True)