|
|
|
import os

import matplotlib.pyplot as plt
import pandas as pd
import seaborn as sn
import yaml
from concrete.ml.sklearn import DecisionTreeClassifier as ConcreteDecisionTreeClassifier
from concrete.ml.sklearn import XGBClassifier as ConcreteXGBClassifier
from joblib import dump, load
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.naive_bayes import MultinomialNB
from sklearn.tree import DecisionTreeClassifier
|
|
|
|
|
class DiseasePrediction:
    """Train, persist and evaluate a disease classifier configured by ``./config.yaml``.

    The configuration keys read by this class are ``verbose``,
    ``model_save_path``, ``random_state``, ``dataset`` (``training_data_path``,
    ``test_data_path``, ``validation_size``) and ``model`` (per-model
    hyper-parameters).
    """

    # Number of feature columns both CSV files must provide.
    EXPECTED_FEATURE_COUNT = 132

    # Model names accepted by ``select_model``.
    SUPPORTED_MODELS = ('mnb', 'decision_tree', 'gradient_boost')

    def __init__(self, model_name=None):
        """Load the configuration and both datasets.

        Side effects: reads ``./config.yaml`` and the two CSV files named in
        it, and writes ``feature_correlation.png`` to the working directory.

        Args:
            model_name: One of ``SUPPORTED_MODELS``; used by ``select_model``
                and to name the saved model file.

        Raises:
            OSError: If the configuration file cannot be opened.
            yaml.YAMLError: If the configuration file is not valid YAML.
        """
        try:
            with open('./config.yaml', 'r') as f:
                self.config = yaml.safe_load(f)
        except (OSError, yaml.YAMLError):
            # Without a configuration nothing below can work, so report the
            # problem and propagate the real cause instead of continuing.
            print("Error reading Config file...")
            raise

        self.verbose = self.config['verbose']
        self.train_features, self.train_labels, self.train_df = self._load_train_dataset()
        self.test_features, self.test_labels, self.test_df = self._load_test_dataset()
        self._feature_correlation(data_frame=self.train_df, show_fig=False)
        self.model_name = model_name
        self.model_save_path = self.config['model_save_path']

    def _split_features_labels(self, data_frame, trailing_columns, dataset_name):
        """Split a loaded frame into feature columns and the ``prognosis`` labels.

        Args:
            data_frame: Frame read from one of the dataset CSV files.
            trailing_columns: Number of right-most columns that are not features.
            dataset_name: Human-readable name used in the error message.

        Returns:
            Tuple ``(features, labels)``.

        Raises:
            ValueError: If the number of feature columns is not
                ``EXPECTED_FEATURE_COUNT``.
        """
        features = data_frame[data_frame.columns[:-trailing_columns]]
        labels = data_frame['prognosis']
        # Explicit check instead of ``assert`` so it still runs under ``python -O``.
        if features.shape[1] != self.EXPECTED_FEATURE_COUNT:
            raise ValueError(
                "{0} data has {1} feature columns, expected {2}".format(
                    dataset_name, features.shape[1], self.EXPECTED_FEATURE_COUNT))
        return features, labels

    def _load_train_dataset(self):
        """Read the training CSV named in the configuration.

        Returns:
            Tuple ``(train_features, train_labels, df_train)``.

        Raises:
            ValueError: If the feature column count is unexpected.
        """
        df_train = pd.read_csv(self.config['dataset']['training_data_path'])
        # The last two columns are excluded from the features; presumably the
        # label plus one trailing non-feature column -- confirm against the CSV.
        train_features, train_labels = self._split_features_labels(df_train, 2, "Training")

        if self.verbose:
            print("Length of Training Data: ", df_train.shape)
            print("Training Features: ", train_features.shape)
            print("Training Labels: ", train_labels.shape)
        return train_features, train_labels, df_train

    def _load_test_dataset(self):
        """Read the test CSV named in the configuration.

        Returns:
            Tuple ``(test_features, test_labels, df_test)``.

        Raises:
            ValueError: If the feature column count is unexpected.
        """
        df_test = pd.read_csv(self.config['dataset']['test_data_path'])
        # Only the last column is excluded from the features here.
        test_features, test_labels = self._split_features_labels(df_test, 1, "Test")

        if self.verbose:
            print("Length of Test Data: ", df_test.shape)
            print("Test Features: ", test_features.shape)
            print("Test Labels: ", test_labels.shape)
        return test_features, test_labels, df_test

    def _feature_correlation(self, data_frame=None, show_fig=False):
        """Plot the feature correlation heatmap and save it as ``feature_correlation.png``.

        Args:
            data_frame: Frame to correlate; defaults to the training frame.
            show_fig: When true, also display the figure after saving it.
        """
        if data_frame is None:
            data_frame = self.train_df

        # The frame contains the textual ``prognosis`` column; recent pandas
        # versions refuse to correlate non-numeric data, so restrict the
        # computation to numeric columns.
        corr = data_frame.select_dtypes(include='number').corr()
        sn.heatmap(corr, square=True, annot=False, cmap="YlGnBu")
        plt.title("Feature Correlation")
        plt.tight_layout()
        # Save before showing: once ``show()`` returns the figure may already
        # be gone and ``savefig`` would write an empty image.
        plt.savefig('feature_correlation.png')
        if show_fig:
            plt.show()
        # Release the figure so later plots do not draw on top of it.
        plt.close()

    def _train_val_split(self):
        """Split the training data into training and validation subsets.

        Returns:
            Tuple ``(X_train, y_train, X_val, y_val)``.
        """
        X_train, X_val, y_train, y_val = train_test_split(
            self.train_features,
            self.train_labels,
            test_size=self.config['dataset']['validation_size'],
            random_state=self.config['random_state'],
        )

        if self.verbose:
            print("Number of Training Features: {0}\tNumber of Training Labels: {1}".format(len(X_train), len(y_train)))
            print("Number of Validation Features: {0}\tNumber of Validation Labels: {1}".format(len(X_val), len(y_val)))
        return X_train, y_train, X_val, y_val

    def select_model(self):
        """Instantiate the classifier selected by ``self.model_name``.

        Returns:
            The new, unfitted classifier (also stored as ``self.clf``).

        Raises:
            ValueError: If ``self.model_name`` is not in ``SUPPORTED_MODELS``.
        """
        name = self.model_name
        if name == 'mnb':
            self.clf = MultinomialNB()
        elif name == 'decision_tree':
            self.clf = ConcreteDecisionTreeClassifier(
                criterion=self.config['model']['decision_tree']['criterion'])
        elif name == 'gradient_boost':
            # NOTE(review): ``criterion`` is a scikit-learn gradient boosting
            # option; confirm that ConcreteXGBClassifier accepts it.
            self.clf = ConcreteXGBClassifier(
                n_estimators=self.config['model']['gradient_boost']['n_estimators'],
                criterion=self.config['model']['gradient_boost']['criterion'])
        else:
            # Fail loudly rather than returning a missing or stale ``self.clf``.
            raise ValueError(
                "Unsupported model name: {0!r}. Expected one of: {1}".format(
                    name, ", ".join(self.SUPPORTED_MODELS)))
        return self.clf

    def _model_path(self, model_name):
        """Return the file path used to store the model called ``model_name``."""
        # Plain concatenation: ``model_save_path`` is used as a prefix exactly
        # as written in the configuration.
        return str(self.model_save_path + model_name + ".joblib")

    def train_model(self):
        """Fit the selected model, report validation metrics and save it.

        The fitted classifier is written with joblib to
        ``<model_save_path><model_name>.joblib``.

        Raises:
            ValueError: If ``self.model_name`` is not supported.
        """
        X_train, y_train, X_val, y_val = self._train_val_split()
        classifier = self.select_model()
        classifier = classifier.fit(X_train, y_train)

        # Score on the training split so the value matches its printed label;
        # the validation accuracy is reported separately below.
        train_accuracy = classifier.score(X_train, y_train)

        y_pred = classifier.predict(X_val)
        accuracy = accuracy_score(y_val, y_pred)
        conf_mat = confusion_matrix(y_val, y_pred)
        clf_report = classification_report(y_val, y_pred)
        # Cross-validation is run on the validation subset only.
        score = cross_val_score(classifier, X_val, y_val, cv=3)

        if self.verbose:
            print('\nTraining Accuracy: ', train_accuracy)
            print('\nValidation Prediction: ', y_pred)
            print('\nValidation Accuracy: ', accuracy)
            print('\nValidation Confusion Matrix: \n', conf_mat)
            print('\nCross Validation Score: \n', score)
            print('\nClassification Report: \n', clf_report)

        model_path = self._model_path(self.model_name)
        save_dir = os.path.dirname(model_path)
        if save_dir:
            # ``dump`` does not create missing directories.
            os.makedirs(save_dir, exist_ok=True)
        # NOTE(review): confirm that fitted Concrete ML models can be pickled
        # by joblib in the installed version.
        dump(classifier, model_path)

    def make_prediction(self, saved_model_name=None, test_data=None):
        """Load a saved model and predict.

        Args:
            saved_model_name: Name the model was saved under; defaults to
                ``self.model_name``.
            test_data: Optional feature rows to predict. When omitted the
                model is evaluated on the loaded test dataset.

        Returns:
            The predictions when ``test_data`` is given, otherwise a tuple
            ``(accuracy, classification_report_text)`` for the test dataset.

        Raises:
            ValueError: If no model name is available.
            FileNotFoundError: If the saved model file does not exist.
        """
        name = saved_model_name if saved_model_name is not None else self.model_name
        if name is None:
            raise ValueError("No model name given to load")

        model_path = self._model_path(name)
        try:
            # joblib files are pickle-based: only load files you trust.
            clf = load(model_path)
        except FileNotFoundError as err:
            raise FileNotFoundError("Model not found: {0}".format(model_path)) from err

        if test_data is not None:
            return clf.predict(test_data)

        result = clf.predict(self.test_features)
        accuracy = accuracy_score(self.test_labels, result)
        clf_report = classification_report(self.test_labels, result)
        return accuracy, clf_report
|
|
|
|
|
def main():
    """Train the decision-tree model, then report its accuracy on the test data."""
    current_model_name = 'decision_tree'

    dp = DiseasePrediction(model_name=current_model_name)
    dp.train_model()

    # Use a distinct local name: binding ``classification_report`` here at
    # module level would replace the function imported from sklearn.metrics
    # and break any later train_model/make_prediction call.
    test_accuracy, test_report = dp.make_prediction(saved_model_name=current_model_name)
    print("Model Test Accuracy: ", test_accuracy)
    print("Test Data Classification Report: \n", test_report)


if __name__ == "__main__":
    main()