mr-usman commited on
Commit
fd6fbf7
·
verified ·
1 Parent(s): dcff38b

Upload 16 files

Browse files
colelithiasis_dataset.xlsx ADDED
Binary file (16.7 kB). View file
 
encoder.pkl ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:f931d3e82fc7ab6868eb41036e2c84c14dc0efb7920b70d0a0a547f7e4975155
3
+ size 1604
gda.pkl ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:1062eb7b2283f57c001ad6546390ac969fec7aa91d55182947db1307887cc256
3
+ size 3224
health_status_classification.ipynb ADDED
@@ -0,0 +1,569 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "cells": [
3
+ {
4
+ "cell_type": "markdown",
5
+ "id": "7abd29b8",
6
+ "metadata": {},
7
+ "source": [
8
+ "# Health Status Classification\n",
9
+ "\n",
10
+ "This notebook classifies individuals into \"Healthy\" or \"Patient\" categories using SVM and Random Forest classifiers. It includes:\n",
11
+ "- Data preprocessing\n",
12
+ "- Training of classifiers\n",
13
+ "- Comparison of performance metrics\n",
14
+ "- Visualization of results\n"
15
+ ]
16
+ },
17
+ {
18
+ "cell_type": "markdown",
19
+ "id": "299604b4",
20
+ "metadata": {},
21
+ "source": [
22
+ "## Data Preprocessing"
23
+ ]
24
+ },
25
+ {
26
+ "cell_type": "markdown",
27
+ "id": "22ee7ce2",
28
+ "metadata": {},
29
+ "source": [
30
+ "### 1. Import Dependencies"
31
+ ]
32
+ },
33
+ {
34
+ "cell_type": "code",
35
+ "execution_count": 252,
36
+ "id": "76a44a0d",
37
+ "metadata": {},
38
+ "outputs": [],
39
+ "source": [
40
+ "import pandas as pd\n",
41
+ "from sklearn.model_selection import train_test_split\n",
42
+ "from sklearn.preprocessing import StandardScaler, LabelEncoder\n",
43
+ "from sklearn.svm import SVC\n",
44
+ "from sklearn.ensemble import RandomForestClassifier\n",
45
+ "from sklearn.metrics import classification_report, confusion_matrix, accuracy_score\n",
46
+ "import matplotlib.pyplot as plt\n",
47
+ "import seaborn as sns"
48
+ ]
49
+ },
50
+ {
51
+ "cell_type": "markdown",
52
+ "id": "a27c6dc7",
53
+ "metadata": {},
54
+ "source": [
55
+ "### 2. Load Dataset"
56
+ ]
57
+ },
58
+ {
59
+ "cell_type": "code",
60
+ "execution_count": 253,
61
+ "id": "3772870e",
62
+ "metadata": {},
63
+ "outputs": [
64
+ {
65
+ "data": {
66
+ "text/html": [
67
+ "<div>\n",
68
+ "<style scoped>\n",
69
+ " .dataframe tbody tr th:only-of-type {\n",
70
+ " vertical-align: middle;\n",
71
+ " }\n",
72
+ "\n",
73
+ " .dataframe tbody tr th {\n",
74
+ " vertical-align: top;\n",
75
+ " }\n",
76
+ "\n",
77
+ " .dataframe thead th {\n",
78
+ " text-align: right;\n",
79
+ " }\n",
80
+ "</style>\n",
81
+ "<table border=\"1\" class=\"dataframe\">\n",
82
+ " <thead>\n",
83
+ " <tr style=\"text-align: right;\">\n",
84
+ " <th></th>\n",
85
+ " <th>Patient No.</th>\n",
86
+ " <th>Gender</th>\n",
87
+ " <th>Age</th>\n",
88
+ " <th>Family history</th>\n",
89
+ " <th>Height</th>\n",
90
+ " <th>Weight</th>\n",
91
+ " <th>BMI</th>\n",
92
+ " <th>Obese/non obese</th>\n",
93
+ " <th>Cholesterol</th>\n",
94
+ " <th>Triglycerides level</th>\n",
95
+ " <th>HDL level</th>\n",
96
+ " <th>LDL level</th>\n",
97
+ " <th>VLDL level</th>\n",
98
+ " <th>Health_status</th>\n",
99
+ " </tr>\n",
100
+ " </thead>\n",
101
+ " <tbody>\n",
102
+ " <tr>\n",
103
+ " <th>0</th>\n",
104
+ " <td>1</td>\n",
105
+ " <td>Female</td>\n",
106
+ " <td>65</td>\n",
107
+ " <td>No</td>\n",
108
+ " <td>1.64</td>\n",
109
+ " <td>64</td>\n",
110
+ " <td>23.80</td>\n",
111
+ " <td>Non-obese</td>\n",
112
+ " <td>145</td>\n",
113
+ " <td>119</td>\n",
114
+ " <td>60</td>\n",
115
+ " <td>66.0</td>\n",
116
+ " <td>19.0</td>\n",
117
+ " <td>healthy</td>\n",
118
+ " </tr>\n",
119
+ " <tr>\n",
120
+ " <th>1</th>\n",
121
+ " <td>2</td>\n",
122
+ " <td>Female</td>\n",
123
+ " <td>50</td>\n",
124
+ " <td>Yes</td>\n",
125
+ " <td>1.70</td>\n",
126
+ " <td>70</td>\n",
127
+ " <td>24.22</td>\n",
128
+ " <td>Non-obese</td>\n",
129
+ " <td>220</td>\n",
130
+ " <td>107</td>\n",
131
+ " <td>69</td>\n",
132
+ " <td>134.0</td>\n",
133
+ " <td>17.0</td>\n",
134
+ " <td>healthy</td>\n",
135
+ " </tr>\n",
136
+ " <tr>\n",
137
+ " <th>2</th>\n",
138
+ " <td>3</td>\n",
139
+ " <td>Female</td>\n",
140
+ " <td>45</td>\n",
141
+ " <td>No</td>\n",
142
+ " <td>1.67</td>\n",
143
+ " <td>63</td>\n",
144
+ " <td>22.59</td>\n",
145
+ " <td>Non-obese</td>\n",
146
+ " <td>190</td>\n",
147
+ " <td>251</td>\n",
148
+ " <td>42</td>\n",
149
+ " <td>108.0</td>\n",
150
+ " <td>40.0</td>\n",
151
+ " <td>healthy</td>\n",
152
+ " </tr>\n",
153
+ " <tr>\n",
154
+ " <th>3</th>\n",
155
+ " <td>4</td>\n",
156
+ " <td>Female</td>\n",
157
+ " <td>48</td>\n",
158
+ " <td>No</td>\n",
159
+ " <td>1.61</td>\n",
160
+ " <td>79</td>\n",
161
+ " <td>30.48</td>\n",
162
+ " <td>Obese</td>\n",
163
+ " <td>228</td>\n",
164
+ " <td>185</td>\n",
165
+ " <td>65</td>\n",
166
+ " <td>134.0</td>\n",
167
+ " <td>29.0</td>\n",
168
+ " <td>healthy</td>\n",
169
+ " </tr>\n",
170
+ " <tr>\n",
171
+ " <th>4</th>\n",
172
+ " <td>5</td>\n",
173
+ " <td>Male</td>\n",
174
+ " <td>74</td>\n",
175
+ " <td>No</td>\n",
176
+ " <td>1.76</td>\n",
177
+ " <td>83</td>\n",
178
+ " <td>26.79</td>\n",
179
+ " <td>Non-obese</td>\n",
180
+ " <td>157</td>\n",
181
+ " <td>113</td>\n",
182
+ " <td>49</td>\n",
183
+ " <td>90.0</td>\n",
184
+ " <td>18.0</td>\n",
185
+ " <td>healthy</td>\n",
186
+ " </tr>\n",
187
+ " </tbody>\n",
188
+ "</table>\n",
189
+ "</div>"
190
+ ],
191
+ "text/plain": [
192
+ " Patient No. Gender Age Family history Height Weight BMI \\\n",
193
+ "0 1 Female 65 No 1.64 64 23.80 \n",
194
+ "1 2 Female 50 Yes 1.70 70 24.22 \n",
195
+ "2 3 Female 45 No 1.67 63 22.59 \n",
196
+ "3 4 Female 48 No 1.61 79 30.48 \n",
197
+ "4 5 Male 74 No 1.76 83 26.79 \n",
198
+ "\n",
199
+ " Obese/non obese Cholesterol Triglycerides level HDL level LDL level \\\n",
200
+ "0 Non-obese 145 119 60 66.0 \n",
201
+ "1 Non-obese 220 107 69 134.0 \n",
202
+ "2 Non-obese 190 251 42 108.0 \n",
203
+ "3 Obese 228 185 65 134.0 \n",
204
+ "4 Non-obese 157 113 49 90.0 \n",
205
+ "\n",
206
+ " VLDL level Health_status \n",
207
+ "0 19.0 healthy \n",
208
+ "1 17.0 healthy \n",
209
+ "2 40.0 healthy \n",
210
+ "3 29.0 healthy \n",
211
+ "4 18.0 healthy "
212
+ ]
213
+ },
214
+ "execution_count": 253,
215
+ "metadata": {},
216
+ "output_type": "execute_result"
217
+ }
218
+ ],
219
+ "source": [
220
+ "# Load dataset\n",
221
+ "data = pd.read_excel(r'colelithiasis_dataset.xlsx')\n",
222
+ "data.head()"
223
+ ]
224
+ },
225
+ {
226
+ "cell_type": "code",
227
+ "execution_count": 9,
228
+ "id": "7edf91ac",
229
+ "metadata": {},
230
+ "outputs": [
231
+ {
232
+ "name": "stdout",
233
+ "output_type": "stream",
234
+ "text": [
235
+ "<class 'pandas.core.frame.DataFrame'>\n",
236
+ "RangeIndex: 100 entries, 0 to 99\n",
237
+ "Data columns (total 14 columns):\n",
238
+ " # Column Non-Null Count Dtype \n",
239
+ "--- ------ -------------- ----- \n",
240
+ " 0 Patient No. 100 non-null int64 \n",
241
+ " 1 Gender 100 non-null object \n",
242
+ " 2 Age 100 non-null int64 \n",
243
+ " 3 Family history 100 non-null object \n",
244
+ " 4 Height 100 non-null float64\n",
245
+ " 5 Weight 100 non-null int64 \n",
246
+ " 6 BMI 100 non-null float64\n",
247
+ " 7 Obese/non obese 100 non-null object \n",
248
+ " 8 Cholesterol 100 non-null int64 \n",
249
+ " 9 Triglycerides level 100 non-null int64 \n",
250
+ " 10 HDL level 100 non-null int64 \n",
251
+ " 11 LDL level 100 non-null float64\n",
252
+ " 12 VLDL level 100 non-null float64\n",
253
+ " 13 Health_status 100 non-null object \n",
254
+ "dtypes: float64(4), int64(6), object(4)\n",
255
+ "memory usage: 11.1+ KB\n"
256
+ ]
257
+ }
258
+ ],
259
+ "source": [
260
+ "data.info()"
261
+ ]
262
+ },
263
+ {
264
+ "cell_type": "code",
265
+ "execution_count": 10,
266
+ "id": "aae142a0",
267
+ "metadata": {},
268
+ "outputs": [
269
+ {
270
+ "data": {
271
+ "text/plain": [
272
+ "Health_status\n",
273
+ "patient 60\n",
274
+ "healthy 40\n",
275
+ "Name: count, dtype: int64"
276
+ ]
277
+ },
278
+ "execution_count": 10,
279
+ "metadata": {},
280
+ "output_type": "execute_result"
281
+ }
282
+ ],
283
+ "source": [
284
+ "data['Health_status'].value_counts()"
285
+ ]
286
+ },
287
+ {
288
+ "cell_type": "code",
289
+ "execution_count": 11,
290
+ "id": "aafe0526",
291
+ "metadata": {},
292
+ "outputs": [],
293
+ "source": [
294
+ "# Drop unnecessary columns (e.g., Patient No.)\n",
295
+ "data = data.drop(columns=['Patient No.'])"
296
+ ]
297
+ },
298
+ {
299
+ "cell_type": "markdown",
300
+ "id": "c7907326",
301
+ "metadata": {},
302
+ "source": [
303
+ "### 3. Feature Encoding"
304
+ ]
305
+ },
306
+ {
307
+ "cell_type": "code",
308
+ "execution_count": 12,
309
+ "id": "7f22b9a6",
310
+ "metadata": {},
311
+ "outputs": [],
312
+ "source": [
313
+ "# Encode categorical variables\n",
314
+ "le_health_status = LabelEncoder()\n",
315
+ "data['Health_status'] = le_health_status.fit_transform(data['Health_status']) # 0 for healthy, 1 for patient\n",
316
+ "le_gender = LabelEncoder()\n",
317
+ "data['Gender'] = le_gender.fit_transform(data['Gender']) # 0 for Female, 1 for Male\n",
318
+ "le_family_history = LabelEncoder()\n",
319
+ "data['Family history'] = le_family_history.fit_transform(data['Family history']) # 0 for No, 1 for Yes\n",
320
+ "le_obese = LabelEncoder()\n",
321
+ "data['Obese/non obese'] = le_obese.fit_transform(data['Obese/non obese']) # 0 for Non-obese, 1 for Obese"
322
+ ]
323
+ },
324
+ {
325
+ "cell_type": "markdown",
326
+ "id": "bcf93f5f",
327
+ "metadata": {},
328
+ "source": [
329
+ "### 4. Split features and target"
330
+ ]
331
+ },
332
+ {
333
+ "cell_type": "code",
334
+ "execution_count": 13,
335
+ "id": "eab4be22",
336
+ "metadata": {},
337
+ "outputs": [],
338
+ "source": [
339
+ "# Features and target\n",
340
+ "X = data.drop(columns=['Health_status'])\n",
341
+ "y = data['Health_status']\n"
342
+ ]
343
+ },
344
+ {
345
+ "cell_type": "markdown",
346
+ "id": "c5ed059c",
347
+ "metadata": {},
348
+ "source": [
349
+ "### 5. Split data into training and testing sets"
350
+ ]
351
+ },
352
+ {
353
+ "cell_type": "code",
354
+ "execution_count": 244,
355
+ "id": "cdeca4f2",
356
+ "metadata": {},
357
+ "outputs": [],
358
+ "source": [
359
+ "# Split the data\n",
360
+ "X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42, stratify=y)"
361
+ ]
362
+ },
363
+ {
364
+ "cell_type": "markdown",
365
+ "id": "0558f2c8",
366
+ "metadata": {},
367
+ "source": [
368
+ "### 6. Scaling the features"
369
+ ]
370
+ },
371
+ {
372
+ "cell_type": "code",
373
+ "execution_count": 245,
374
+ "id": "a037923f",
375
+ "metadata": {},
376
+ "outputs": [],
377
+ "source": [
378
+ "# Scale the features using StandardScaler\n",
379
+ "scaler = StandardScaler()\n",
380
+ "X_train = scaler.fit_transform(X_train)\n",
381
+ "X_test = scaler.transform(X_test)"
382
+ ]
383
+ },
384
+ {
385
+ "cell_type": "markdown",
386
+ "id": "d934c22c",
387
+ "metadata": {},
388
+ "source": [
389
+ "## Training of classifiers"
390
+ ]
391
+ },
392
+ {
393
+ "cell_type": "markdown",
394
+ "id": "c7fd1e71",
395
+ "metadata": {},
396
+ "source": [
397
+ "### 1. Support Vector Machine (SVM)"
398
+ ]
399
+ },
400
+ {
401
+ "cell_type": "code",
402
+ "execution_count": 246,
403
+ "id": "aca26b71",
404
+ "metadata": {},
405
+ "outputs": [],
406
+ "source": [
407
+ "# SVM Classifier\n",
408
+ "svm_model = SVC(kernel='linear', C=0.9, random_state=42)\n",
409
+ "svm_model.fit(X_train, y_train)\n",
410
+ "svm_preds = svm_model.predict(X_test)"
411
+ ]
412
+ },
413
+ {
414
+ "cell_type": "markdown",
415
+ "id": "e8bfefa5",
416
+ "metadata": {},
417
+ "source": [
418
+ "### 2. Random Forest Classifier"
419
+ ]
420
+ },
421
+ {
422
+ "cell_type": "code",
423
+ "execution_count": 247,
424
+ "id": "a5ad9d40",
425
+ "metadata": {},
426
+ "outputs": [],
427
+ "source": [
428
+ "# Random Forest Classifier\n",
429
+ "rf_model = RandomForestClassifier(n_estimators=100, class_weight='balanced', random_state=42)\n",
430
+ "rf_model.fit(X_train, y_train)\n",
431
+ "rf_preds = rf_model.predict(X_test)"
432
+ ]
433
+ },
434
+ {
435
+ "cell_type": "markdown",
436
+ "id": "e3fb4a5a",
437
+ "metadata": {},
438
+ "source": [
439
+ "## Comparison of performance metrics"
440
+ ]
441
+ },
442
+ {
443
+ "cell_type": "code",
444
+ "execution_count": 249,
445
+ "id": "2ddc5b12",
446
+ "metadata": {},
447
+ "outputs": [
448
+ {
449
+ "name": "stdout",
450
+ "output_type": "stream",
451
+ "text": [
452
+ "\n",
453
+ "Performance Metrics for SVM\n",
454
+ "Accuracy: 0.65\n",
455
+ "\n",
456
+ "Classification Report:\n",
457
+ " precision recall f1-score support\n",
458
+ "\n",
459
+ " 0 0.55 0.75 0.63 8\n",
460
+ " 1 0.78 0.58 0.67 12\n",
461
+ "\n",
462
+ " accuracy 0.65 20\n",
463
+ " macro avg 0.66 0.67 0.65 20\n",
464
+ "weighted avg 0.68 0.65 0.65 20\n",
465
+ "\n",
466
+ "\n",
467
+ "Performance Metrics for Random Forest\n",
468
+ "Accuracy: 0.7\n",
469
+ "\n",
470
+ "Classification Report:\n",
471
+ " precision recall f1-score support\n",
472
+ "\n",
473
+ " 0 0.60 0.75 0.67 8\n",
474
+ " 1 0.80 0.67 0.73 12\n",
475
+ "\n",
476
+ " accuracy 0.70 20\n",
477
+ " macro avg 0.70 0.71 0.70 20\n",
478
+ "weighted avg 0.72 0.70 0.70 20\n",
479
+ "\n"
480
+ ]
481
+ }
482
+ ],
483
+ "source": [
484
+ "def print_metrics(y_true, y_pred, model_name):\n",
485
+ " print(f\"\\nPerformance Metrics for {model_name}\")\n",
486
+ " print(\"Accuracy:\", accuracy_score(y_true, y_pred))\n",
487
+ " print(\"\\nClassification Report:\")\n",
488
+ " print(classification_report(y_true, y_pred))\n",
489
+ "\n",
490
+ "print_metrics(y_test, svm_preds, \"SVM\")\n",
491
+ "print_metrics(y_test, rf_preds, \"Random Forest\")"
492
+ ]
493
+ },
494
+ {
495
+ "cell_type": "markdown",
496
+ "id": "38eefbb6",
497
+ "metadata": {},
498
+ "source": [
499
+ "## Visualization of results"
500
+ ]
501
+ },
502
+ {
503
+ "cell_type": "code",
504
+ "execution_count": 251,
505
+ "id": "f74a2f74",
506
+ "metadata": {},
507
+ "outputs": [
508
+ {
509
+ "data": {
510
+ "image/png": "",
511
+ "text/plain": [
512
+ "<Figure size 1200x500 with 4 Axes>"
513
+ ]
514
+ },
515
+ "metadata": {},
516
+ "output_type": "display_data"
517
+ }
518
+ ],
519
+ "source": [
520
+ "# Confusion Matrices\n",
521
+ "svm_cm = confusion_matrix(y_test, svm_preds)\n",
522
+ "rf_cm = confusion_matrix(y_test, rf_preds)\n",
523
+ "\n",
524
+ "fig, axes = plt.subplots(1, 2, figsize=(12, 5))\n",
525
+ "sns.heatmap(svm_cm, annot=True, fmt='d', cmap='Blues', ax=axes[0])\n",
526
+ "axes[0].set_title('SVM Confusion Matrix')\n",
527
+ "axes[0].set_xlabel('Predicted')\n",
528
+ "axes[0].set_ylabel('Actual')\n",
529
+ "\n",
530
+ "sns.heatmap(rf_cm, annot=True, fmt='d', cmap='Greens', ax=axes[1])\n",
531
+ "axes[1].set_title('Random Forest Confusion Matrix')\n",
532
+ "axes[1].set_xlabel('Predicted')\n",
533
+ "axes[1].set_ylabel('Actual')\n",
534
+ "\n",
535
+ "plt.tight_layout()\n",
536
+ "plt.show()\n"
537
+ ]
538
+ },
539
+ {
540
+ "cell_type": "code",
541
+ "execution_count": null,
542
+ "id": "3a2c284f",
543
+ "metadata": {},
544
+ "outputs": [],
545
+ "source": []
546
+ }
547
+ ],
548
+ "metadata": {
549
+ "kernelspec": {
550
+ "display_name": "ml_env",
551
+ "language": "python",
552
+ "name": "python3"
553
+ },
554
+ "language_info": {
555
+ "codemirror_mode": {
556
+ "name": "ipython",
557
+ "version": 3
558
+ },
559
+ "file_extension": ".py",
560
+ "mimetype": "text/x-python",
561
+ "name": "python",
562
+ "nbconvert_exporter": "python",
563
+ "pygments_lexer": "ipython3",
564
+ "version": "3.13.1"
565
+ }
566
+ },
567
+ "nbformat": 4,
568
+ "nbformat_minor": 5
569
+ }
installation.ipynb ADDED
File without changes
lr_model.pkl ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:3afb60ab51259121d4188b2e23fa705e3f6c100a4965e4769e0812d36677d3b6
3
+ size 1391
main.py ADDED
@@ -0,0 +1,314 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import streamlit as st
2
+ import pandas as pd
3
+ import numpy as np
4
+ import matplotlib.pyplot as plt
5
+ import seaborn as sns
6
+ import joblib
7
+ from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
8
+ from plotly import graph_objects as go
9
+
10
+ st.set_page_config(layout="wide")
11
+
12
+ # Load Dataset
13
def load_data():
    """Read the dataset workbook and return it without the 'Patient No.' column."""
    # Update with your dataset file path
    workbook = pd.read_excel(r'Model Training/colelithiasis_dataset.xlsx')
    return workbook.drop(columns=['Patient No.'])
17
+
18
+ # Initialize Session State
19
+ if "data" not in st.session_state:
20
+ st.session_state.data = load_data()
21
+
22
def introduction_page():
    """Render the static introduction page (title plus overview markdown)."""
    overview_md = """
    ## Project Overview
    This project analyzes the Colelithiasis dataset to perform exploratory data analysis (EDA) and prediction using pre-trained machine learning models. The goal is to provide insights into the data and make predictions efficiently.

    ## Objectives
    - Perform EDA to uncover patterns and insights.
    - Use pre-trained machine learning models for predictions.
    - Create an interactive Streamlit application.
    """
    st.title("Introduction")
    st.markdown(overview_md)
33
+
34
def stats_page():
    """Render the descriptive-statistics page.

    Shows the first rows of the dataset held in ``st.session_state.data``,
    its ``describe()`` summary, and a correlation heatmap computed on an
    encoded copy, so the frame kept in session state is left untouched.
    """
    st.title("Exploratory Data Analysis")

    # Dataset Overview
    st.subheader("Dataset Overview")
    st.dataframe(st.session_state.data.head())

    # Summary Statistics
    st.subheader("Summary Statistics")
    st.write(st.session_state.data.describe())

    # Correlation Matrix
    st.subheader("Correlation Analysis")

    data = st.session_state.data.copy()

    # Encode the target variable. Assign the result back instead of calling
    # ``replace(..., inplace=True)`` on the selected column: that chained
    # in-place form no longer updates the frame under pandas Copy-on-Write.
    # Labels other than 'healthy'/'patient' become NaN.
    data['Health_status'] = data['Health_status'].map({'healthy': 0, 'patient': 1})

    # Apply the fitted encoder to the categorical columns.
    # Forward slash: the previous 'Model Training\encoder.pkl' relied on an
    # invalid '\e' escape and is not a valid path on POSIX systems.
    categorical_columns = ['Gender', 'Family history', 'Obese/non obese']
    encoder = joblib.load('Model Training/encoder.pkl')
    data[categorical_columns] = encoder.transform(data[categorical_columns])

    correlation = data.corr()

    # Reduce the font size of the heatmap (seaborn setting is global).
    sns.set(font_scale=0.5)
    fig, ax = plt.subplots(figsize=(5, 3))
    sns.heatmap(correlation, annot=True, cmap="coolwarm", fmt=".2f", ax=ax)
    st.pyplot(fig, use_container_width=False)
    # Streamlit reruns this function on every interaction; close the figure
    # so open matplotlib figures do not pile up.
    plt.close(fig)
63
+
64
def eda_page():
    """Render the interactive chart page.

    The user picks a chart type (histogram, scatter plot or box plot) and the
    column(s) to plot from ``st.session_state.data``; the matching Plotly
    figure is then displayed.
    """
    st.title("Exploratory Data Analysis")

    # Interactive Visualizations
    st.subheader("Visualizations")
    chart_type = st.selectbox("Choose Chart Type", ["Histogram", "Scatter Plot", "Box Plot"])

    frame = st.session_state.data
    # Layout pieces shared by all three chart types.
    legend_style = dict(title="Legend", orientation="h", x=0.5, xanchor="center")

    def centered_title(text):
        return dict(text=text, x=0.5, font=dict(size=22))

    fig = None

    if chart_type == "Histogram":
        column = st.selectbox("Choose Column for Visualization", frame.columns)
        histogram = go.Histogram(x=frame[column], name=column, marker_color="indigo")
        fig = go.Figure()
        fig.add_trace(histogram)
        fig.update_layout(
            title=centered_title("Histogram Analysis"),
            xaxis_title=column,
            yaxis_title="Count",
            legend=legend_style,
            bargap=0.2,
            hovermode="x unified",
            template="plotly_dark",
        )

    elif chart_type == "Scatter Plot":
        x_col = st.selectbox("Choose X-axis Column", frame.columns)
        y_col = st.selectbox("Choose Y-axis Column", frame.columns)
        marker_style = dict(size=10, color="purple", line=dict(width=1, color="white"))
        points = go.Scatter(
            x=frame[x_col],
            y=frame[y_col],
            mode="markers",
            marker=marker_style,
            name=f"{y_col} vs {x_col}",
        )
        fig = go.Figure()
        fig.add_trace(points)
        fig.update_layout(
            title=centered_title("Scatter Plot Analysis"),
            xaxis_title=x_col,
            yaxis_title=y_col,
            legend=legend_style,
            hovermode="closest",
            template="plotly_dark",
        )

    elif chart_type == "Box Plot":
        column = st.selectbox("Choose Column for Visualization", frame.columns)
        box = go.Box(y=frame[column], name=column, boxmean="sd", marker_color="teal")
        fig = go.Figure()
        fig.add_trace(box)
        fig.update_layout(
            title=centered_title("Boxplot Analysis"),
            yaxis_title=column,
            legend=legend_style,
            hovermode="y",
            template="plotly_dark",
        )

    # Only one branch runs per rerun, so at most one figure is drawn.
    if fig is not None:
        st.plotly_chart(fig)
124
+
125
+
126
# Directory holding the fitted artifacts. Paths are joined with '/', which
# works on Windows and POSIX. The previous backslash literals were broken:
# 'Model Training\rf_model.pkl' embeds a carriage return ('\r'), and the
# others relied on invalid escapes such as '\s' and '\e'.
_ARTIFACT_DIR = 'Model Training'

# Selector label -> pickle file name inside _ARTIFACT_DIR.
_MODEL_FILES = {
    "SVM - Linear": 'svm_model_linear.pkl',
    "SVM - Polynomial": 'svm_model_poly.pkl',
    "SVM - RBF": 'svm_model_rbf.pkl',
    "Random Forest": 'rf_model.pkl',
    "Random Forest Boosted": 'rf_boosted.pkl',
    "Logistic Regression": 'lr_model.pkl',
    "GDA": 'gda.pkl',
}

# Columns passed through the fitted encoder; all other columns are scaled.
_CATEGORICAL_COLUMNS = ['Gender', 'Family history', 'Obese/non obese']


def _load_artifact(file_name):
    """Load one joblib pickle from the artifact directory."""
    # NOTE(review): joblib.load unpickles arbitrary objects; only load files
    # that ship with the app, never user-supplied ones.
    return joblib.load(f'{_ARTIFACT_DIR}/{file_name}')


def _select_model():
    """Show the model selector and return the chosen pre-trained model."""
    model_choice = st.selectbox("Choose a Pre-trained Model", list(_MODEL_FILES))
    return _load_artifact(_MODEL_FILES[model_choice])


def _encode_and_scale(features):
    """Encode categorical columns and scale the remaining ones, in place.

    Returns the same frame for convenience.
    """
    numerical_columns = [
        col_name for col_name in features.columns
        if col_name not in _CATEGORICAL_COLUMNS
    ]
    encoder = _load_artifact('encoder.pkl')
    features[_CATEGORICAL_COLUMNS] = encoder.transform(features[_CATEGORICAL_COLUMNS])
    scaler = _load_artifact('scaler.pkl')
    features[numerical_columns] = scaler.transform(features[numerical_columns])
    return features


def model_page():
    """Evaluate a selected pre-trained model on the bundled test workbook.

    Displays actual vs. predicted labels, the classification report
    (without the support column) and a confusion-matrix heatmap.
    """
    st.title("Model Evaluation")
    test_data = pd.read_excel(f'{_ARTIFACT_DIR}/test_data.xlsx')

    # Encode the target variable. Mapping into a new Series avoids the
    # chained ``replace(..., inplace=True)``, which does not modify the frame
    # under pandas Copy-on-Write. Unexpected labels become NaN.
    y = test_data['Health_status'].map({'healthy': 0, 'patient': 1})
    X = _encode_and_scale(test_data.drop(columns=['Health_status']))

    # Model Selection
    st.text("Model Selection")
    model = _select_model()

    # Make Predictions
    y_pred = model.predict(X)
    col1, col2 = st.columns(2)
    with col1:
        st.subheader("### Predictions on the Test Data:")
        st.dataframe(pd.DataFrame({"Actual": y, "Predicted": y_pred}))

    with col2:
        st.subheader("Classification Report")
        report = classification_report(y, y_pred, output_dict=True)
        report_df = pd.DataFrame(report).transpose().drop(columns='support')
        report_df.index.name = 'index'
        # Integer labels appear as '0'/'1' in the report, float labels as
        # '0.0'/'1.0'; cover both so the rows are always renamed.
        report_df = report_df.rename(index={
            '0': 'Negative', '1': 'Positive',
            '0.0': 'Negative', '1.0': 'Positive',
        })
        # Cast to object before blanking cells: writing '' into float columns
        # is deprecated in recent pandas.
        report_df = report_df.astype(object)
        # Precision/recall are meaningless on the accuracy row; blank them.
        report_df.loc['accuracy', ['precision', 'recall']] = ''
        st.table(report_df)

    st.subheader("Confusion Matrix")
    conf_matrix = confusion_matrix(y, y_pred)

    # Use only the left half of the page so the small figure is not stretched.
    col1, _ = st.columns(2)
    with col1:
        fig, ax = plt.subplots(figsize=(3, 3))
        sns.heatmap(conf_matrix, annot=True, fmt="d", cmap="Blues",
                    cbar=False, square=True, ax=ax)
        ax.set_xlabel("Predicted")
        ax.set_ylabel("Actual")
        ax.set_title("Confusion Matrix")
        st.pyplot(fig)
        # The page is re-executed on every interaction; release the figure.
        plt.close(fig)


def prediction_page():
    """Collect one patient's measurements and show the model's prediction.

    The values entered in the form are encoded and scaled with the same
    fitted artifacts used for evaluation, then passed to the selected model.
    """
    st.title("Get Your Diagnosis")
    st.subheader("Symptoms Entry Form")
    # Model Selection
    model = _select_model()

    with st.form(key="health_data_form"):
        col1, col2, col3, col4 = st.columns(4)

        with col1:
            # Categorical features with dropdown selection
            gender = st.selectbox("Gender", ["Male", "Female"], key="gender")
            weight = st.number_input("Weight (kg)", min_value=0, step=1, key="weight")
            cholesterol = st.number_input("Cholesterol (mg/dL)", min_value=0, step=1, key="cholesterol")
        with col2:
            family_history = st.selectbox("Family History of Illness", ["Yes", "No"], key="family_history")
            bmi = st.number_input("BMI", min_value=0.0, step=0.1, key="bmi")
            triglycerides = st.number_input("Triglycerides Level (mg/dL)", min_value=0, step=1, key="triglycerides")

        with col3:
            # The dataset stores height in metres (e.g. 1.64), so ask for
            # metres rather than centimetres.
            # NOTE(review): assumes the scaler was fit on that dataset - confirm.
            height = st.number_input("Height (m)", min_value=0.0, step=0.01, key="height")
            # Options must match the dataset categories ('Obese', 'Non-obese');
            # the former 'Non-Obese' spelling does not.
            # NOTE(review): assumes the encoder was fit on those labels - confirm.
            obese_status = st.selectbox("Obese/Non Obese", ["Obese", "Non-obese"], key="obese_status")
            ldl = st.number_input("LDL Level (mg/dL)", min_value=0.0, step=0.1, key="ldl")

        with col4:
            vldl = st.number_input("VLDL Level (mg/dL)", min_value=0.0, step=0.1, key="vldl")

        # Submit button
        submit_button = st.form_submit_button(label="Submit")

    if submit_button:
        # Build the single-row frame directly in the expected column order.
        # The triglycerides value was previously stored under 'Triglycerides'
        # and then dropped by a reindex to 'Triglycerides level' with
        # fill_value=0, so the model always received 0 for it.
        # NOTE(review): 'Age' and 'HDL level' exist in the dataset but are not
        # collected here - confirm the models were trained without them.
        data = pd.DataFrame({
            "Gender": [gender],
            "Family history": [family_history],
            "Height": [height],
            "Weight": [weight],
            "BMI": [bmi],
            "Obese/non obese": [obese_status],
            "Cholesterol": [cholesterol],
            "Triglycerides level": [triglycerides],
            "LDL level": [ldl],
            "VLDL level": [vldl],
        })

        # Encode categorical data and scale the numeric features.
        data = _encode_and_scale(data)

        prediction = int(model.predict(data)[0])
        st.write(f"### Predicted Diagnosis: {'Positive' if prediction == 1 else 'Negative'}")
287
+
288
+
289
def conclusion_page():
    """Render the static closing page (title plus key-takeaways markdown)."""
    takeaways_md = """
    ## Key Takeaways
    - Comprehensive EDA provides actionable insights into the data.
    - Pre-trained machine learning models allow efficient predictions.
    - The interactive app makes the analysis accessible and engaging.

    Thank you for exploring this project!
    """
    st.title("Conclusion")
    st.markdown(takeaways_md)
299
+
300
+ # Sidebar Navigation Menu with radio buttons for page selection
301
# Sidebar label -> function that renders the page; insertion order is the menu order.
_PAGE_RENDERERS = {
    "Introduction": introduction_page,
    "Descriptive Statistics": stats_page,
    "Data Analytics": eda_page,
    "Model Evaluation": model_page,
    "Get Your Diagnosis": prediction_page,
    "Conclusion": conclusion_page,
}

page = st.sidebar.radio("Navigation Menu", ["Introduction","Descriptive Statistics", "Data Analytics", "Model Evaluation", "Get Your Diagnosis", "Conclusion"])

# Render the selected page; an unknown selection renders nothing.
_render = _PAGE_RENDERERS.get(page)
if _render is not None:
    _render()
requirements.txt ADDED
Binary file (2.01 kB). View file
 
rf_boosted.pkl ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:5bcff453f23629ac8c235105320385407489a939858ec303db312ebb934f704e
3
+ size 179112
rf_model.pkl ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:994d9d2dbee5adf22ee6434a0a778a0dae2f37c4883e414948a9b3fed3f53482
3
+ size 562377
scaler.pkl ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:125d5da89197d1212608fa958179f4601f9e98f7efe03640a72cbbd33ecc84e1
3
+ size 1183
svm_model_linear.pkl ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:7636d9a64fcdc959b7d3d614e7d2567c4da70539e6147558dbedf8159b0ef637
3
+ size 9611
svm_model_poly.pkl ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:1dcb91962b964f4d63a0acefbe2842d3d72af04c54e2c8a9341bea9688b9e027
3
+ size 8203
svm_model_rbf.pkl ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:ddd0af20c80ecad7f8770169f9c8ba98aecf7271b20eddd5ca5d838a04edfecc
3
+ size 9019
test_data.xlsx ADDED
Binary file (6.5 kB). View file
 
training.ipynb ADDED
The diff for this file is too large to render. See raw diff