# Standard library
import pickle
import warnings

# Third-party: data handling and plotting
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns

# Third-party: preprocessing, models, and evaluation
from sklearn.compose import ColumnTransformer
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.impute import KNNImputer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score
from sklearn.metrics import classification_report
# NOTE(review): plot_confusion_matrix was removed in scikit-learn 1.2
# (ConfusionMatrixDisplay replaces it); this import fails on newer releases.
from sklearn.metrics import plot_confusion_matrix
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OrdinalEncoder, OneHotEncoder
from sklearn.preprocessing import StandardScaler

warnings.filterwarnings('ignore')
# Load the raw heart-disease records.
df = pd.read_csv('heart.csv')

# Columns stored with object dtype are treated as the categorical features.
categorical_cols = df.select_dtypes(include=['object'])

# Report how many distinct labels each categorical column holds.
for name in categorical_cols.columns:
    n_labels = len(categorical_cols[name].unique())
    print(name, ':', n_labels, 'labels')

# Names of the categorical and numerical feature columns.
cat_col = categorical_cols.columns
num_col = ['Age', 'RestingBP', 'Cholesterol', 'FastingBS', 'MaxHR', 'Oldpeak']

# The target is HeartDisease; the features are every other column.
y = df['HeartDisease']
X = df.drop(columns=['HeartDisease'])
# Numerical features: impute missing values from the 5 nearest neighbours,
# then standardise.
num_pipeline = Pipeline([
    ('imputer', KNNImputer(n_neighbors=5)),
    ('std_scaler', StandardScaler()),
])
num_attribs = num_col
cat_attribs = cat_col

# Send numerical columns through num_pipeline and one-hot encode the
# categorical ones. handle_unknown='ignore' keeps transform() from raising on
# a label that was not present in the rows the encoder was fitted on.
full_pipeline = ColumnTransformer([
    ("num", num_pipeline, num_attribs),
    ("cat", OneHotEncoder(handle_unknown='ignore'), cat_attribs),
])

# Split the raw rows BEFORE fitting any preprocessing so that the imputer,
# scaler and encoder never see the test rows (avoids data leakage).
X_train_raw, X_test_raw, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

# Fit the preprocessing on the training rows only, then apply it to both sets.
X_train = full_pipeline.fit_transform(X_train_raw)
X_test = full_pipeline.transform(X_test_raw)

# Save the preprocessed data: every row, transformed with the train-fitted
# pipeline.
X = full_pipeline.transform(X)
temp_df = pd.DataFrame(X)
temp_df.to_csv('processed_data.csv')
# Count plot of the training labels: heart disease (1) / no heart disease (0).
# The labels are passed as x= because seaborn 0.12+ interprets the first
# positional argument as `data`, not as the variable to count.
sns.countplot(x=y_train, palette='OrRd')

# Random forest built with the tuned hyper-parameters.
rfc1 = RandomForestClassifier(
    random_state=42,
    max_features='sqrt',
    n_estimators=50,
    max_depth=7,
    criterion='gini',
)
# The estimator must be fitted on the training set before predict() is called.
rfc1.fit(X_train, y_train)

# Predict the held-out test set and report the accuracy.
y_pred = rfc1.predict(X_test)
print('Random forest accuracy_score:', accuracy_score(y_test, y_pred))
# Save the fitted random forest and the preprocessing pipeline for future use.
# `with` guarantees each file is flushed and closed, even if dumping fails.
with open('rfc.pickle', 'wb') as model_file:
    pickle.dump(rfc1, model_file)
with open('full_pipeline.pickle', 'wb') as pipeline_file:
    pickle.dump(full_pipeline, pipeline_file)

# Load the saved objects back.
# NOTE: pickle.load can execute arbitrary code embedded in the file; only load
# pickle files that come from a trusted source (here, the ones written above).
with open('rfc.pickle', 'rb') as model_file:
    rfc_saved = pickle.load(model_file)
with open('full_pipeline.pickle', 'rb') as pipeline_file:
    full_pipeline_saved = pickle.load(pipeline_file)
# Visualization
# Readable label for every record: HeartDisease 0 -> 'Low', 1 -> 'High'.
target = df['HeartDisease'].replace([0, 1], ['Low', 'High'])

# Stacked bar chart: label counts for each sex.
data = pd.crosstab(index=df['Sex'], columns=target)
data.plot(kind='bar', stacked=True)

# Label counts per age band; the bin edges give (0, 30], (30, 50], (50, 80].
plt.figure(figsize=(10, 5))
bins = [0, 30, 50, 80]
sns.countplot(x=pd.cut(df.Age, bins=bins), hue=target, color='r')

# The two plots below relabel the x ticks as ['No', 'Yes'] by position, so the
# bar order is pinned explicitly; otherwise it follows the order in which the
# labels first appear in the data and the tick text could end up swapped.
risk_order = ['Low', 'High']

# Chest pain type counts, split by heart disease (No / Yes).
plt.figure(figsize=(10, 5))
sns.countplot(x=target, hue=df.ChestPainType, order=risk_order)
plt.xticks(np.arange(2), ['No', 'Yes'])

# Exercise-induced angina counts, split by heart disease (No / Yes).
plt.figure(figsize=(10, 5))
sns.countplot(x=target, hue=df.ExerciseAngina, order=risk_order)
plt.xticks(np.arange(2), ['No', 'Yes'])
# feature importance
# Impurity-based importance of every transformed column seen by the model.
importances = rfc1.feature_importances_

# Build one label per transformed column, in the ColumnTransformer's output
# order: the numerical columns first, then each categorical column name
# repeated once per one-hot column. The one-hot widths are read from the
# fitted encoder so they always match what the model was trained on.
encoder = full_pipeline.named_transformers_['cat']
feature_names = list(num_col)
for col_name, categories in zip(cat_col, encoder.categories_):
    feature_names = feature_names + [col_name] * len(categories)

forest_importances = pd.Series(importances, index=feature_names)
# A categorical feature's importance is the total over its one-hot columns,
# so the duplicated labels are summed rather than keeping only the first one.
forest_importances = (
    forest_importances.groupby(level=0).sum().sort_values(ascending=False)
)

# Plot the features ranked by their importance to the model.
fig, ax = plt.subplots()
forest_importances.plot.bar(ax=ax)
ax.set_title("Feature importances using MDI")
ax.set_ylabel("Mean decrease in impurity")
fig.tight_layout()