# spaceX / mymodel.py
# Nos7's picture
# Update mymodel.py
# aeb2cab verified
# Import Dependencies
import yaml
from joblib import dump, load
import pandas as pd
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report
# Naive Bayes Approach
from sklearn.naive_bayes import MultinomialNB
# Trees Approach
from sklearn.tree import DecisionTreeClassifier
# Ensemble Approach
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
import seaborn as sn
import matplotlib.pyplot as plt
from concrete.ml.sklearn import DecisionTreeClassifier as ConcreteDecisionTreeClassifier
from concrete.ml.sklearn import XGBClassifier as ConcreteXGBClassifier
class DiseasePrediction:
    """Train, persist and evaluate a classifier that predicts 'prognosis'.

    Settings are read from a YAML config file. The keys this class reads are:
    'verbose', 'random_state', 'model_save_path',
    'dataset' -> 'training_data_path' / 'test_data_path' / 'validation_size',
    and 'model' -> '<model name>' -> hyper-parameters.
    """

    # Number of feature columns each CSV must provide (sanity-checked on load).
    NUM_FEATURES = 132
    # Name of the label column in both CSV files.
    LABEL_COLUMN = 'prognosis'

    def __init__(self, model_name=None, config_path='./config.yaml'):
        """Load the config, both datasets, and save the correlation plot.

        Args:
            model_name: One of 'mnb', 'decision_tree' or 'gradient_boost'.
                Only validated when `select_model` is called.
            config_path: Location of the YAML config file.

        Raises:
            Exception: Whatever opening or parsing the config file raised.
                The original code printed a message and carried on, which
                then failed later with an unrelated AttributeError.
        """
        # Load Config File
        try:
            with open(config_path, 'r') as f:
                self.config = yaml.safe_load(f)
        except Exception:
            print("Error reading Config file...")
            raise
        # Verbose
        self.verbose = self.config['verbose']
        # Load Training Data
        self.train_features, self.train_labels, self.train_df = self._load_train_dataset()
        # Load Test Data
        self.test_features, self.test_labels, self.test_df = self._load_test_dataset()
        # Feature Correlation in Training Data
        self._feature_correlation(data_frame=self.train_df, show_fig=False)
        # Model Definition
        self.model_name = model_name
        # Model Save Path
        self.model_save_path = self.config['model_save_path']

    def _load_dataset(self, csv_path, trailing_columns, name):
        """Read a CSV and split it into features, labels and the raw frame.

        Args:
            csv_path: Path of the CSV file to read.
            trailing_columns: How many columns at the end of the file are
                excluded from the feature set.
            name: Human-readable dataset name used in verbose output.

        Returns:
            Tuple of (features DataFrame, labels Series, full DataFrame).
        """
        frame = pd.read_csv(csv_path)
        features = frame[frame.columns[:-trailing_columns]]
        labels = frame[self.LABEL_COLUMN]
        # Check for data sanity. shape[1] is used instead of iloc[0] so an
        # empty file fails this check rather than raising IndexError.
        assert features.shape[1] == self.NUM_FEATURES, \
            "expected {0} feature columns, found {1}".format(self.NUM_FEATURES, features.shape[1])
        assert len(labels) == features.shape[0]
        if self.verbose:
            print("Length of {0} Data: ".format(name), frame.shape)
            print("{0} Features: ".format(name), features.shape)
            print("{0} Labels: ".format(name), labels.shape)
        return features, labels, frame

    # Function to Load Train Dataset
    def _load_train_dataset(self):
        """Load the training CSV named in the config.

        The last two columns are excluded from the features (presumably the
        label plus one extra trailing column -- confirm against the CSV).
        """
        return self._load_dataset(self.config['dataset']['training_data_path'], 2, "Training")

    # Function to Load Test Dataset
    def _load_test_dataset(self):
        """Load the test CSV named in the config; only the last column is dropped."""
        return self._load_dataset(self.config['dataset']['test_data_path'], 1, "Test")

    # Features Correlation
    def _feature_correlation(self, data_frame=None, show_fig=False):
        """Plot the feature correlation heatmap and save it to disk.

        Args:
            data_frame: DataFrame whose numeric columns are correlated.
            show_fig: When True, also display the figure interactively.
        """
        # The frame still contains the string label column. Recent pandas
        # versions raise on non-numeric data in corr(), so restrict to
        # numeric/bool columns, which is what older versions did implicitly.
        numeric = data_frame.select_dtypes(include=['number', 'bool'])
        corr = numeric.corr()
        sn.heatmap(corr, square=True, annot=False, cmap="YlGnBu")
        plt.title("Feature Correlation")
        plt.tight_layout()
        # Save before showing: once an interactive window is closed the
        # figure may be gone and savefig would write an empty image.
        plt.savefig('feature_correlation.png')
        if show_fig:
            plt.show()
        # Release the figure so later plots do not draw on top of it.
        plt.close()

    # Dataset Train Validation Split
    def _train_val_split(self):
        """Split the training data into train and validation subsets.

        Returns:
            Tuple (X_train, y_train, X_val, y_val). Note the order differs
            from what `train_test_split` itself returns.
        """
        X_train, X_val, y_train, y_val = train_test_split(
            self.train_features, self.train_labels,
            test_size=self.config['dataset']['validation_size'],
            random_state=self.config['random_state'])
        if self.verbose:
            print("Number of Training Features: {0}\tNumber of Training Labels: {1}".format(len(X_train), len(y_train)))
            print("Number of Validation Features: {0}\tNumber of Validation Labels: {1}".format(len(X_val), len(y_val)))
        return X_train, y_train, X_val, y_val

    # Model Selection
    def select_model(self):
        """Instantiate the classifier matching `self.model_name`.

        Returns:
            The new, unfitted classifier (also stored on `self.clf`).

        Raises:
            ValueError: If `self.model_name` is not a supported model name.
        """
        if self.model_name == 'mnb':
            self.clf = MultinomialNB()
        elif self.model_name == 'decision_tree':
            self.clf = ConcreteDecisionTreeClassifier(criterion=self.config['model']['decision_tree']['criterion'])
        elif self.model_name == 'gradient_boost':
            # NOTE(review): 'criterion' is forwarded to the XGB classifier
            # unchanged from the original code -- confirm it is accepted.
            self.clf = ConcreteXGBClassifier(n_estimators=self.config['model']['gradient_boost']['n_estimators'],
                                             criterion=self.config['model']['gradient_boost']['criterion'])
        else:
            raise ValueError(
                "Unknown model name {0!r}; expected 'mnb', 'decision_tree' "
                "or 'gradient_boost'".format(self.model_name))
        return self.clf

    def _model_path(self, name):
        """Return the file path a model called `name` is saved to / loaded from."""
        # Plain concatenation is kept deliberately so previously saved models
        # are still found; 'model_save_path' must end with a path separator.
        return str(self.model_save_path + name + ".joblib")

    # ML Model
    def train_model(self):
        """Fit the selected model, report validation metrics and save it.

        Raises:
            ValueError: If `self.model_name` is not a supported model name.
        """
        # Get the Data
        X_train, y_train, X_val, y_val = self._train_val_split()
        classifier = self.select_model()
        # Training the Model
        classifier = classifier.fit(X_train, y_train)
        # Trained Model Evaluation on Validation Dataset
        # NOTE(review): this score is computed on the validation split but is
        # printed below as 'Training Accuracy' -- confirm which was intended.
        confidence = classifier.score(X_val, y_val)
        # Validation Data Prediction
        y_pred = classifier.predict(X_val)
        # Model Validation Accuracy
        accuracy = accuracy_score(y_val, y_pred)
        # Model Confusion Matrix
        conf_mat = confusion_matrix(y_val, y_pred)
        # Model Classification Report
        clf_report = classification_report(y_val, y_pred)
        # Model Cross Validation Score
        # NOTE(review): cross-validation is run on the validation split only.
        score = cross_val_score(classifier, X_val, y_val, cv=3)
        if self.verbose:
            print('\nTraining Accuracy: ', confidence)
            print('\nValidation Prediction: ', y_pred)
            print('\nValidation Accuracy: ', accuracy)
            print('\nValidation Confusion Matrix: \n', conf_mat)
            print('\nCross Validation Score: \n', score)
            print('\nClassification Report: \n', clf_report)
        # Save Trained Model
        dump(classifier, self._model_path(self.model_name))

    # Function to Make Predictions on Test Data
    def make_prediction(self, saved_model_name=None, test_data=None):
        """Load a saved model and predict.

        Args:
            saved_model_name: Name the model was saved under. Falls back to
                `self.model_name` when omitted.
            test_data: Optional feature data. When given, only predictions
                are returned; otherwise the test dataset loaded at
                construction time is evaluated.

        Returns:
            The predictions when `test_data` is given, else a tuple of
            (accuracy, classification report) for the stored test dataset.

        Raises:
            ValueError: If no model name is available.
            Exception: Whatever loading the model file raised. The original
                code printed a message and then failed on an unbound variable.
        """
        if saved_model_name is None:
            saved_model_name = self.model_name
        if saved_model_name is None:
            raise ValueError("No model name given and no model_name set on the instance")
        try:
            # Load Trained Model
            # SECURITY: joblib files are pickle-based; only load trusted files.
            clf = load(self._model_path(saved_model_name))
        except Exception:
            print("Model not found...")
            raise
        if test_data is not None:
            return clf.predict(test_data)
        result = clf.predict(self.test_features)
        accuracy = accuracy_score(self.test_labels, result)
        clf_report = classification_report(self.test_labels, result)
        return accuracy, clf_report
if __name__ == "__main__":
    # Script entry point: train one model, then report its test-set metrics.
    # Model Currently Training
    current_model_name = 'decision_tree'
    # Instantiate the Class
    dp = DiseasePrediction(model_name=current_model_name)
    # Train the Model
    dp.train_model()
    # Get Model Performance on Test Data
    # The report is bound to `test_report`, not `classification_report`:
    # the latter is the imported sklearn function, and rebinding it at module
    # level would break every later train_model / make_prediction call.
    test_accuracy, test_report = dp.make_prediction(saved_model_name=current_model_name)
    print("Model Test Accuracy: ", test_accuracy)
    print("Test Data Classification Report: \n", test_report)