import os
import sys
from dataclasses import dataclass

from sklearn.metrics import r2_score
from sklearn.linear_model import LinearRegression
from sklearn.neighbors import KNeighborsRegressor
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import (
    RandomForestRegressor,
    AdaBoostRegressor,
    GradientBoostingRegressor,
)
from xgboost import XGBRegressor
from catboost import CatBoostRegressor

from src.logger import logging
from src.exception import CustomException
from src.utils import save_object, evaluate_models


@dataclass
class ModelTrainerConfig:
    """Configuration for :class:`ModelTrainer`."""

    # Destination of the serialized best model. The annotation is what makes
    # this a real dataclass field (overridable via the constructor) instead of
    # a plain class attribute.
    trained_model_file_path: str = os.path.join("artifacts", "model.pkl")


def _build_models() -> dict:
    """Return new candidate regressor instances keyed by display name."""
    return {
        "Linear Regression": LinearRegression(),
        "K-Neighbors Regressor": KNeighborsRegressor(),
        "Decision Tree Regressor": DecisionTreeRegressor(),
        "Random Forest Regressor": RandomForestRegressor(),
        "AdaBoost Regressor": AdaBoostRegressor(),
        "Gradient Boosting Regressor": GradientBoostingRegressor(),
        "XGBRegressor": XGBRegressor(),
        "CatBoosting Regressor": CatBoostRegressor(verbose=False),
    }


def _build_params_grid() -> dict:
    """Return the hyper-parameter grid per candidate (same keys as the models)."""
    n_estimators = [8, 16, 32, 64, 128, 256]
    return {
        "Linear Regression": {},
        "K-Neighbors Regressor": {},
        "Decision Tree Regressor": {
            "criterion": [
                "squared_error",
                "friedman_mse",
                "absolute_error",
                "poisson",
            ],
            # Not searched at present: 'splitter', 'max_features'.
        },
        "Random Forest Regressor": {
            # Not searched at present: 'criterion', 'max_features'.
            "n_estimators": list(n_estimators),
        },
        "AdaBoost Regressor": {
            "learning_rate": [0.1, 0.01, 0.5, 0.001],
            # Not searched at present: 'loss'.
            "n_estimators": list(n_estimators),
        },
        "Gradient Boosting Regressor": {
            # Not searched at present: 'loss', 'criterion', 'max_features'.
            "learning_rate": [0.1, 0.01, 0.05, 0.001],
            "subsample": [0.6, 0.7, 0.75, 0.8, 0.85, 0.9],
            "n_estimators": list(n_estimators),
        },
        "XGBRegressor": {
            "learning_rate": [0.1, 0.01, 0.05, 0.001],
            "n_estimators": list(n_estimators),
        },
        "CatBoosting Regressor": {
            "depth": [6, 8, 10],
            "learning_rate": [0.01, 0.05, 0.1],
            "iterations": [30, 50, 100],
        },
    }


class ModelTrainer:
    """Evaluate a set of regressors, persist the best one and report its R2."""

    def __init__(self) -> None:
        self.model_trainer_config = ModelTrainerConfig()

    def initiate_model_trainer(
        self, train_array, test_array, *, min_score: float = 0.6
    ) -> float:
        """Select, save and score the best-performing regressor.

        Args:
            train_array: 2-D array; every column but the last is a feature,
                the last column is the target.
            test_array: 2-D array laid out like ``train_array``.
            min_score: Minimum score (as reported by ``evaluate_models``) the
                best model must reach. Defaults to 0.6.

        Returns:
            The R2 score of the best model's predictions on the test features.

        Raises:
            CustomException: If no model was scored, if the best score is
                below ``min_score``, or wrapping any other error raised while
                training, saving or scoring.
        """
        try:
            logging.info("Split training and testing input data")
            X_train, y_train = train_array[:, :-1], train_array[:, -1]
            X_test, y_test = test_array[:, :-1], test_array[:, -1]

            models = _build_models()

            # NOTE(review): the instance taken from `models` below is saved and
            # used for prediction, so this presumably relies on
            # evaluate_models fitting the estimators in place -- confirm
            # against src.utils.
            model_report: dict = evaluate_models(
                X_train=X_train,
                y_train=y_train,
                X_test=X_test,
                y_test=y_test,
                models=models,
                params_grid=_build_params_grid(),
            )

            if not model_report:
                raise CustomException("No models were evaluated", sys)

            # max() returns the first key with the highest score, matching the
            # tie-break of a value-index lookup without sorting or list copies.
            best_model_name = max(model_report, key=model_report.get)
            best_model_score = model_report[best_model_name]
            best_model = models[best_model_name]

            if best_model_score < min_score:
                raise CustomException("No best model found", sys)

            logging.info(
                "Best found model on both training and testing dataset: "
                f"{best_model_name} (score={best_model_score})"
            )

            save_object(
                file_path=self.model_trainer_config.trained_model_file_path,
                obj=best_model,
            )
            print(best_model_name)

            predicted = best_model.predict(X_test)
            return r2_score(y_test, predicted)
        except CustomException:
            # Already a domain error: re-raise as-is instead of wrapping it in
            # a second CustomException.
            raise
        except Exception as e:
            raise CustomException(e, sys) from e