alidenewade committed on
Commit 91e876f · verified · 1 Parent(s): ba285ba

Update app.py

Files changed (1): app.py +49 -32
app.py CHANGED
@@ -5,15 +5,17 @@ from sklearn.cluster import KMeans
 from sklearn.metrics import r2_score, pairwise_distances_argmin_min
 import matplotlib.pyplot as plt
 import io
+import os # For checking file extensions
 
 def cluster_analysis(policy_file, cashflow_file, pv_file, num_clusters):
     """
     Performs cluster analysis for actuarial model point selection.
+    Accepts both Excel and CSV files.
 
     Args:
-        policy_file: Gradio File object for policy data (Excel).
-        cashflow_file: Gradio File object for cashflow data (Excel).
-        pv_file: Gradio File object for present value data (Excel).
+        policy_file: Gradio File object for policy data.
+        cashflow_file: Gradio File object for cashflow data.
+        pv_file: Gradio File object for present value data.
         num_clusters: Number of clusters (model points) to generate.
 
     Returns:
@@ -35,19 +37,30 @@ def cluster_analysis(policy_file, cashflow_file, pv_file, num_clusters):
     print(f"PV file received: {pv_file.name if pv_file else 'None'}")
     print("="*50 + "\n")
 
+    # Helper function to read files based on extension
+    def read_data_file(file_obj, index_col=None):
+        if file_obj is None:
+            raise ValueError("File object is None.")
+
+        file_path = file_obj.name
+        file_extension = os.path.splitext(file_path)[1].lower()
+
+        if file_extension in ['.xlsx', '.xls']:
+            print(f"Attempting to read Excel file: {file_path}")
+            return pd.read_excel(file_path, index_col=index_col)
+        elif file_extension == '.csv':
+            print(f"Attempting to read CSV file: {file_path}")
+            # Consider adding 'sep' argument if CSV delimiter is not comma, e.g., sep=';'
+            return pd.read_csv(file_path, index_col=index_col)
+        else:
+            raise ValueError(f"Unsupported file type: {file_extension}. Please upload .xlsx, .xls, or .csv files.")
+
     # 1. Basic checks and file reading
     try:
-        if policy_file is None or cashflow_file is None or pv_file is None:
-            missing_files = []
-            if policy_file is None: missing_files.append("Policy Data")
-            if cashflow_file is None: missing_files.append("Cashflow Data")
-            if pv_file is None: missing_files.append("Present Value Data")
-            raise ValueError(f"Missing required input file(s): {', '.join(missing_files)}. Please upload all files.")
-
-        policy_df = pd.read_excel(policy_file.name)
+        policy_df = read_data_file(policy_file)
         # index_col=0 is crucial. Ensure the first column contains unique policy identifiers.
-        cashflow_df = pd.read_excel(cashflow_file.name, index_col=0)
-        pv_df = pd.read_excel(pv_file.name, index_col=0)
+        cashflow_df = read_data_file(cashflow_file, index_col=0)
+        pv_df = read_data_file(pv_file, index_col=0)
         print(f"[{pd.Timestamp.now()}] Files read successfully.")
         print(f"Policy data shape: {policy_df.shape}, Columns: {policy_df.columns.tolist()}")
         print(f"Cashflow data shape: {cashflow_df.shape}, Index type: {cashflow_df.index.dtype}")
@@ -60,10 +73,13 @@ def cluster_analysis(policy_file, cashflow_file, pv_file, num_clusters):
 
     # 2. Validate Policy Data Columns
     required_cols = ['IssueAge', 'PolicyTerm', 'SumAssured', 'Duration']
+    # Strip whitespace from column names for robust matching
+    policy_df.columns = policy_df.columns.str.strip()
+
     if not all(col in policy_df.columns for col in required_cols):
         found_cols = policy_df.columns.tolist()
         error_msg = (f"Policy data missing required columns. Expected: {required_cols}. "
-                     f"Found: {found_cols}. Please check your policy Excel file column headers.")
+                     f"Found: {found_cols}. Please check your policy data column headers for typos or extra spaces.")
         print(f"[{pd.Timestamp.now()}] {error_msg}")
         return (None, None, None, error_msg)
 
@@ -71,7 +87,8 @@ def cluster_analysis(policy_file, cashflow_file, pv_file, num_clusters):
     try:
         X = policy_df[required_cols].fillna(0)
         # Scale data, handle cases where std is 0 (e.g., all values are the same for a feature)
-        X_scaled = X.apply(lambda x: (x - x.mean()) / x.std() if x.std() != 0 else 0, axis=0)
+        # Add a small epsilon to avoid division by zero if all values are identical
+        X_scaled = X.apply(lambda x: (x - x.mean()) / (x.std() if x.std() != 0 else 1e-9), axis=0)
         print(f"[{pd.Timestamp.now()}] Policy attributes scaled.")
     except Exception as e:
         error_msg = f"Error preparing data for clustering: {e}"
@@ -91,7 +108,8 @@ def cluster_analysis(policy_file, cashflow_file, pv_file, num_clusters):
         print(f"[{pd.Timestamp.now()}] Warning: Number of clusters ({original_num_clusters}) "
               f"exceeded number of samples ({n_samples}). Reduced to {num_clusters}.")
 
-    kmeans = KMeans(n_clusters=num_clusters, random_state=42, n_init=10)
+    # Use 'auto' for n_init for newer scikit-learn versions
+    kmeans = KMeans(n_clusters=num_clusters, random_state=42, n_init='auto')
     kmeans.fit(X_scaled)
     policy_df['Cluster'] = kmeans.labels_
     print(f"[{pd.Timestamp.now()}] Clustering successful with {num_clusters} clusters.")
@@ -109,7 +127,7 @@ def cluster_analysis(policy_file, cashflow_file, pv_file, num_clusters):
         counts = policy_df['Cluster'].value_counts()
         model_points['Weight'] = model_points['Cluster'].map(counts)
         print(f"[{pd.Timestamp.now()}] Model points selected and weights calculated. Model points shape: {model_points.shape}")
-        print(f"Model points indices: {model_points.index.tolist()}")
+        print(f"Model points indices (first 5): {model_points.index.tolist()[:5]}...")
     except Exception as e:
         error_msg = f"Error selecting model points or calculating weights: {e}"
         print(f"[{pd.Timestamp.now()}] {error_msg}")
@@ -126,15 +144,15 @@ def cluster_analysis(policy_file, cashflow_file, pv_file, num_clusters):
         # Check if all model_points indices exist in cashflow_df and pv_df
         missing_cf_indices = [idx for idx in model_points.index if idx not in cashflow_df.index]
         if missing_cf_indices:
-            raise KeyError(f"Cashflow data is missing entries for model point indices: {missing_cf_indices[:5]}...")  # Show first 5
+            raise KeyError(f"Cashflow data is missing entries for model point indices: {missing_cf_indices[:5]}... Please check Cashflow data's first column (index).")
 
         proxy_cashflows = cashflow_df.loc[model_points.index].multiply(model_points['Weight'], axis=0).sum()
         seriatim_cashflows = cashflow_df.sum()
         print(f"[{pd.Timestamp.now()}] Cashflows aggregated.")
     except KeyError as e:
         error_msg = (f"Key Error during cashflow aggregation. "
-                     f"Ensure 'policy_id' (or equivalent) column from your policy data "
-                     f"is set as the index (first column) in your Cashflow Excel file: {e}")
+                     f"Ensure the first column of your Cashflow Excel/CSV file contains policy IDs "
+                     f"that match the indices from your Policy data: {e}")
         print(f"[{pd.Timestamp.now()}] {error_msg}")
         return (None, None, None, error_msg)
     except Exception as e:
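Note: the proxy aggregation here weights each representative policy's cashflow vector by its cluster size and sums across model points, while the seriatim total simply sums every policy. A worked toy example (assumed numbers):

```python
import pandas as pd

# Two model points with cluster sizes 3 and 2 (assumed toy data).
cashflows = pd.DataFrame({"t1": [10.0, 20.0], "t2": [5.0, 8.0]}, index=["P1", "P7"])
weights = pd.Series([3, 2], index=["P1", "P7"])

proxy = cashflows.multiply(weights, axis=0).sum()
print(proxy.tolist())  # [70.0, 31.0]: 10*3 + 20*2 and 5*3 + 8*2
```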
@@ -169,7 +187,7 @@ def cluster_analysis(policy_file, cashflow_file, pv_file, num_clusters):
     try:
         missing_pv_indices = [idx for idx in model_points.index if idx not in pv_df.index]
         if missing_pv_indices:
-            raise KeyError(f"PV data is missing entries for model point indices: {missing_pv_indices[:5]}...")  # Show first 5
+            raise KeyError(f"PV data is missing entries for model point indices: {missing_pv_indices[:5]}... Please check PV data's first column (index).")
 
         # Assuming PV data has only one column or the relevant column is the first one
         proxy_pv = pv_df.loc[model_points.index].multiply(model_points['Weight'], axis=0).sum().values[0]
@@ -177,8 +195,8 @@ def cluster_analysis(policy_file, cashflow_file, pv_file, num_clusters):
         print(f"[{pd.Timestamp.now()}] Present Values aggregated.")
     except KeyError as e:
         error_msg = (f"Key Error during PV aggregation. "
-                     f"Ensure 'policy_id' (or equivalent) column from your policy data "
-                     f"is set as the index (first column) in your PV Excel file: {e}")
+                     f"Ensure the first column of your PV Excel/CSV file contains policy IDs "
+                     f"that match the indices from your Policy data: {e}")
         print(f"[{pd.Timestamp.now()}] {error_msg}")
         return (None, None, None, error_msg)
     except Exception as e:
@@ -210,7 +228,7 @@ def cluster_analysis(policy_file, cashflow_file, pv_file, num_clusters):
     try:
         common_idx = seriatim_cashflows.index.intersection(proxy_cashflows.index)
         if common_idx.empty:
-            r2 = float('nan')
+            r2 = float('nan')  # Cannot compute R2 if no common cashflow periods
             print(f"[{pd.Timestamp.now()}] Warning: No common indices for R-squared calculation.")
         else:
             r2 = r2_score(seriatim_cashflows.loc[common_idx], proxy_cashflows.loc[common_idx])
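Note: R-squared is computed only over the cashflow periods the two series share. A quick sanity check of that call with assumed values:

```python
import pandas as pd
from sklearn.metrics import r2_score

seriatim = pd.Series([100.0, 90.0, 80.0], index=["t1", "t2", "t3"])
proxy = pd.Series([98.0, 92.0, 79.0], index=["t1", "t2", "t3"])

common = seriatim.index.intersection(proxy.index)
print(r2_score(seriatim.loc[common], proxy.loc[common]))  # 0.955, close to 1.0 for a good proxy
```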
@@ -240,29 +258,28 @@ with gr.Blocks() as demo:
     to the seriatim (full portfolio) results, providing accuracy metrics and visualizations.
 
     **Instructions:**
-    1. **Upload Policy Data (Excel file):** Ensure it contains columns named exactly `IssueAge`, `PolicyTerm`, `SumAssured`, and `Duration`. The first column should ideally be a unique policy identifier that matches the indices in Cashflow and PV files.
-    2. **Upload Cashflow Data (Excel file):** The **first column** of this file must be a unique policy identifier (like `policy_id`), and it will be used as the DataFrame index. The remaining columns should be cashflow periods.
-    3. **Upload Present Value Data (Excel file):** The **first column** of this file must also be a unique policy identifier, matching the policy data's identifiers. The second column should contain the present value for each policy.
+    1. **Upload Policy Data (Excel or CSV file):** Ensure it contains columns named exactly `IssueAge`, `PolicyTerm`, `SumAssured`, and `Duration`. **Crucially, double-check for leading/trailing spaces in column names.** The first column can be a unique policy identifier (though not explicitly used for clustering, it helps with index matching).
+    2. **Upload Cashflow Data (Excel or CSV file):** The **first column** of this file must be a unique policy identifier (e.g., `policy_id`), and this column will be used as the DataFrame index. The remaining columns should represent cashflow periods (e.g., `CF_Year_1`, `CF_Year_2`).
+    3. **Upload Present Value Data (Excel or CSV file):** The **first column** of this file must also be a unique policy identifier, matching the policy data's identifiers. The second column should contain the present value for each policy.
     4. Adjust the 'Number of Model Points' using the slider.
     5. Click 'Run Clustering'.
     """)
 
     with gr.Row():
         with gr.Column():
-            policy_input = gr.File(label="1. Upload Policy Data (Excel - .xlsx/.xls)", file_types=[".xlsx", ".xls"])
-            cashflow_input = gr.File(label="2. Upload Cashflow Data (Excel - .xlsx/.xls, first column is Policy ID)", file_types=[".xlsx", ".xls"])
-            pv_input = gr.File(label="3. Upload Present Value Data (Excel - .xlsx/.xls, first column is Policy ID)", file_types=[".xlsx", ".xls"])
+            # Updated file_types to include CSV
+            policy_input = gr.File(label="1. Upload Policy Data (Excel/CSV)", file_types=[".xlsx", ".xls", ".csv"])
+            cashflow_input = gr.File(label="2. Upload Cashflow Data (Excel/CSV, first column is Policy ID)", file_types=[".xlsx", ".xls", ".csv"])
+            pv_input = gr.File(label="3. Upload Present Value Data (Excel/CSV, first column is Policy ID)", file_types=[".xlsx", ".xls", ".csv"])
             clusters_input = gr.Slider(minimum=2, maximum=100, step=1, value=10, label="4. Number of Model Points")
             run_btn = gr.Button("Run Clustering", variant="primary")
 
         with gr.Column():
-            # For displaying the CSV content directly. If you want a downloadable file, use gr.File(file_to_share=True)
             output_csv = gr.Textbox(label="Model Points CSV Output (Scroll to view)", lines=10, interactive=False)
             cashflow_img = gr.Image(label="Aggregated Cashflows Comparison", interactive=False)
             pv_img = gr.Image(label="Aggregated Present Values Comparison", interactive=False)
             metrics_box = gr.Textbox(label="Accuracy Metrics and Status", lines=4, interactive=False)
 
-    # Link the button click to the function
     run_btn.click(
         cluster_analysis,
         inputs=[policy_input, cashflow_input, pv_input, clusters_input],
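Note: for reference, a hypothetical pair of inputs matching the documented layout (all file names, IDs, and values assumed) can be generated like this:

```python
import pandas as pd

# Policy data: required attribute columns plus an identifier in the first column.
pd.DataFrame({
    "policy_id": ["P1", "P2"],
    "IssueAge": [30, 45],
    "PolicyTerm": [20, 10],
    "SumAssured": [100000, 50000],
    "Duration": [5, 2],
}).to_csv("policy_data.csv", index=False)

# Cashflow data: first column is the policy ID, remaining columns are periods.
pd.DataFrame({
    "policy_id": ["P1", "P2"],
    "CF_Year_1": [1200.0, 800.0],
    "CF_Year_2": [1150.0, 760.0],
}).to_csv("cashflow_data.csv", index=False)
```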
 