{ "cells": [ { "cell_type": "markdown", "id": "7abd29b8", "metadata": {}, "source": [ "# Health Status Classification\n", "\n", "This notebook classifies individuals into \"Healthy\" or \"Patient\" categories using SVM and Random Forest classifiers. It includes:\n", "- Data preprocessing\n", "- Training of classifiers\n", "- Comparison of performance metrics\n", "- Visualization of results\n" ] }, { "cell_type": "markdown", "id": "299604b4", "metadata": {}, "source": [ "## Data Preprocessing" ] }, { "cell_type": "markdown", "id": "22ee7ce2", "metadata": {}, "source": [ "### 1. Import Dependecies" ] }, { "cell_type": "code", "execution_count": 252, "id": "76a44a0d", "metadata": {}, "outputs": [], "source": [ "import pandas as pd\n", "from sklearn.model_selection import train_test_split\n", "from sklearn.preprocessing import StandardScaler, LabelEncoder\n", "from sklearn.svm import SVC\n", "from sklearn.ensemble import RandomForestClassifier\n", "from sklearn.metrics import classification_report, confusion_matrix, accuracy_score\n", "import matplotlib.pyplot as plt\n", "import seaborn as sns" ] }, { "cell_type": "markdown", "id": "a27c6dc7", "metadata": {}, "source": [ "### 2. Load Dataset" ] }, { "cell_type": "code", "execution_count": 253, "id": "3772870e", "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
Patient No.GenderAgeFamily historyHeightWeightBMIObese/non obeseCholesterolTriglycerides levelHDL levelLDL levelVLDL levelHealth_status
01Female65No1.646423.80Non-obese1451196066.019.0healthy
12Female50Yes1.707024.22Non-obese22010769134.017.0healthy
23Female45No1.676322.59Non-obese19025142108.040.0healthy
34Female48No1.617930.48Obese22818565134.029.0healthy
45Male74No1.768326.79Non-obese1571134990.018.0healthy
\n", "
" ], "text/plain": [ " Patient No. Gender Age Family history Height Weight BMI \\\n", "0 1 Female 65 No 1.64 64 23.80 \n", "1 2 Female 50 Yes 1.70 70 24.22 \n", "2 3 Female 45 No 1.67 63 22.59 \n", "3 4 Female 48 No 1.61 79 30.48 \n", "4 5 Male 74 No 1.76 83 26.79 \n", "\n", " Obese/non obese Cholesterol Triglycerides level HDL level LDL level \\\n", "0 Non-obese 145 119 60 66.0 \n", "1 Non-obese 220 107 69 134.0 \n", "2 Non-obese 190 251 42 108.0 \n", "3 Obese 228 185 65 134.0 \n", "4 Non-obese 157 113 49 90.0 \n", "\n", " VLDL level Health_status \n", "0 19.0 healthy \n", "1 17.0 healthy \n", "2 40.0 healthy \n", "3 29.0 healthy \n", "4 18.0 healthy " ] }, "execution_count": 253, "metadata": {}, "output_type": "execute_result" } ], "source": [ "# Load dataset\n", "data = pd.read_excel(r'colelithiasis_dataset.xlsx')\n", "data.head()" ] }, { "cell_type": "code", "execution_count": 9, "id": "7edf91ac", "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "\n", "RangeIndex: 100 entries, 0 to 99\n", "Data columns (total 14 columns):\n", " # Column Non-Null Count Dtype \n", "--- ------ -------------- ----- \n", " 0 Patient No. 
100 non-null int64 \n", " 1 Gender 100 non-null object \n", " 2 Age 100 non-null int64 \n", " 3 Family history 100 non-null object \n", " 4 Height 100 non-null float64\n", " 5 Weight 100 non-null int64 \n", " 6 BMI 100 non-null float64\n", " 7 Obese/non obese 100 non-null object \n", " 8 Cholesterol 100 non-null int64 \n", " 9 Triglycerides level 100 non-null int64 \n", " 10 HDL level 100 non-null int64 \n", " 11 LDL level 100 non-null float64\n", " 12 VLDL level 100 non-null float64\n", " 13 Health_status 100 non-null object \n", "dtypes: float64(4), int64(6), object(4)\n", "memory usage: 11.1+ KB\n" ] } ], "source": [ "data.info()" ] }, { "cell_type": "code", "execution_count": 10, "id": "aae142a0", "metadata": {}, "outputs": [ { "data": { "text/plain": [ "Health_status\n", "patient 60\n", "healthy 40\n", "Name: count, dtype: int64" ] }, "execution_count": 10, "metadata": {}, "output_type": "execute_result" } ], "source": [ "data['Health_status'].value_counts()" ] }, { "cell_type": "code", "execution_count": 11, "id": "aafe0526", "metadata": {}, "outputs": [], "source": [ "# Drop unnecessary columns (e.g., Patient No.).\n", "# errors='ignore' keeps this cell safe to re-run: `data` is reassigned here, so\n", "# without it a second run raises KeyError because the column is already gone.\n", "data = data.drop(columns=['Patient No.'], errors='ignore')" ] }, { "cell_type": "markdown", "id": "c7907326", "metadata": {}, "source": [ "### 3. 
Feature Encoding" ] }, { "cell_type": "code", "execution_count": 12, "id": "7f22b9a6", "metadata": {}, "outputs": [], "source": [ "# Encode categorical variables as integer codes.\n", "# Each column keeps its own fitted encoder under a dedicated name.\n", "le_health_status = LabelEncoder()  # 0 for healthy, 1 for patient\n", "le_gender = LabelEncoder()  # 0 for Female, 1 for Male\n", "le_family_history = LabelEncoder()  # 0 for No, 1 for Yes\n", "le_obese = LabelEncoder()  # 0 for Non-obese, 1 for Obese\n", "\n", "column_encoders = {\n", "    'Health_status': le_health_status,\n", "    'Gender': le_gender,\n", "    'Family history': le_family_history,\n", "    'Obese/non obese': le_obese,\n", "}\n", "for column_name, column_encoder in column_encoders.items():\n", "    data[column_name] = column_encoder.fit_transform(data[column_name])" ] }, { "cell_type": "markdown", "id": "bcf93f5f", "metadata": {}, "source": [ "### 4. Split features and target" ] }, { "cell_type": "code", "execution_count": 13, "id": "eab4be22", "metadata": {}, "outputs": [], "source": [ "# Target is the encoded health status; every remaining column is a feature\n", "y = data['Health_status']\n", "X = data.drop(columns=['Health_status'])\n" ] }, { "cell_type": "markdown", "id": "c5ed059c", "metadata": {}, "source": [ "### 5. Split data into training and testing sets" ] }, { "cell_type": "code", "execution_count": 244, "id": "cdeca4f2", "metadata": {}, "outputs": [], "source": [ "# Hold out 20% of the rows for testing, stratified on the target\n", "X_train, X_test, y_train, y_test = train_test_split(\n", "    X,\n", "    y,\n", "    test_size=0.2,\n", "    random_state=42,\n", "    stratify=y,\n", ")" ] }, { "cell_type": "markdown", "id": "0558f2c8", "metadata": {}, "source": [ "### 6. Scaling the features" ] }, { "cell_type": "code", "execution_count": 245, "id": "a037923f", "metadata": {}, "outputs": [], "source": [ "# Standardize the features: the scaler is fitted on the training split only,\n", "# then the same fitted scaler transforms both splits.\n", "scaler = StandardScaler().fit(X_train)\n", "X_train = scaler.transform(X_train)\n", "X_test = scaler.transform(X_test)" ] }, { "cell_type": "markdown", "id": "d934c22c", "metadata": {}, "source": [ "## Training of classifiers" ] }, { "cell_type": "markdown", "id": "c7fd1e71", "metadata": {}, "source": [ "### 1. 
Support Vector Machine (SVM)" ] }, { "cell_type": "code", "execution_count": 246, "id": "aca26b71", "metadata": {}, "outputs": [], "source": [ "# SVM Classifier: linear kernel, fitted on the scaled training split\n", "svm_model = SVC(kernel='linear', C=0.9, random_state=42)\n", "svm_model.fit(X_train, y_train)\n", "svm_preds = svm_model.predict(X_test)" ] }, { "cell_type": "markdown", "id": "e8bfefa5", "metadata": {}, "source": [ "### 2. Random Forest Classifier" ] }, { "cell_type": "code", "execution_count": 247, "id": "a5ad9d40", "metadata": {}, "outputs": [], "source": [ "# Random Forest Classifier: 100 trees with balanced class weights\n", "rf_model = RandomForestClassifier(n_estimators=100, class_weight='balanced', random_state=42)\n", "rf_model.fit(X_train, y_train)\n", "rf_preds = rf_model.predict(X_test)" ] }, { "cell_type": "markdown", "id": "e3fb4a5a", "metadata": {}, "source": [ "## Comparison of performance metrics" ] }, { "cell_type": "code", "execution_count": 249, "id": "2ddc5b12", "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "\n", "Performance Metrics for SVM\n", "Accuracy: 0.65\n", "\n", "Classification Report:\n", " precision recall f1-score support\n", "\n", " 0 0.55 0.75 0.63 8\n", " 1 0.78 0.58 0.67 12\n", "\n", " accuracy 0.65 20\n", " macro avg 0.66 0.67 0.65 20\n", "weighted avg 0.68 0.65 0.65 20\n", "\n", "\n", "Performance Metrics for Random Forest\n", "Accuracy: 0.7\n", "\n", "Classification Report:\n", " precision recall f1-score support\n", "\n", " 0 0.60 0.75 0.67 8\n", " 1 0.80 0.67 0.73 12\n", "\n", " accuracy 0.70 20\n", " macro avg 0.70 0.71 0.70 20\n", "weighted avg 0.72 0.70 0.70 20\n", "\n" ] } ], "source": [ "def print_metrics(y_true, y_pred, model_name, target_names=None):\n", "    \"\"\"Print the accuracy and the classification report for one model.\n", "\n", "    Args:\n", "        y_true: Ground-truth labels.\n", "        y_pred: Labels predicted by the model.\n", "        model_name: Name shown in the report header.\n", "        target_names: Optional display names for the classes, forwarded to\n", "            classification_report. None (the default) keeps the raw labels.\n", "    \"\"\"\n", "    print(f\"\\nPerformance Metrics for {model_name}\")\n", "    print(\"Accuracy:\", accuracy_score(y_true, y_pred))\n", "    print(\"\\nClassification Report:\")\n", "    print(classification_report(y_true, y_pred, target_names=target_names))\n", "\n", "print_metrics(y_test, svm_preds, \"SVM\")\n", "print_metrics(y_test, rf_preds, \"Random Forest\")" ] }, { 
"cell_type": "markdown", "id": "38eefbb6", "metadata": {}, "source": [ "## Visualization of results" ] }, { "cell_type": "code", "execution_count": 251, "id": "f74a2f74", "metadata": {}, "outputs": [ { "data": { "image/png": "", "text/plain": [ "
" ] }, "metadata": {}, "output_type": "display_data" } ], "source": [ "# Confusion Matrices\n", "svm_cm = confusion_matrix(y_test, svm_preds)\n", "rf_cm = confusion_matrix(y_test, rf_preds)\n", "\n", "fig, axes = plt.subplots(1, 2, figsize=(12, 5))\n", "sns.heatmap(svm_cm, annot=True, fmt='d', cmap='Blues', ax=axes[0])\n", "axes[0].set_title('SVM Confusion Matrix')\n", "axes[0].set_xlabel('Predicted')\n", "axes[0].set_ylabel('Actual')\n", "\n", "sns.heatmap(rf_cm, annot=True, fmt='d', cmap='Greens', ax=axes[1])\n", "axes[1].set_title('Random Forest Confusion Matrix')\n", "axes[1].set_xlabel('Predicted')\n", "axes[1].set_ylabel('Actual')\n", "\n", "plt.tight_layout()\n", "plt.show()\n" ] }, { "cell_type": "code", "execution_count": null, "id": "3a2c284f", "metadata": {}, "outputs": [], "source": [] } ], "metadata": { "kernelspec": { "display_name": "ml_env", "language": "python", "name": "python3" }, "language_info": { "codemirror_mode": { "name": "ipython", "version": 3 }, "file_extension": ".py", "mimetype": "text/x-python", "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", "version": "3.13.1" } }, "nbformat": 4, "nbformat_minor": 5 }