|
from models import * |
|
from data_preprocessing import * |
|
|
|
|
|
def plot_rmse_explanation(
    dates: pd.Series,
    actual: pd.Series,
    predicted: pd.Series,
    rmse: float,
    title: str = "Understanding RMSE: Actual vs Predicted",
):
    """
    Draw actual and predicted series on one chart and mark every prediction error.

    The actual values are drawn as a solid orange line and the predictions as a
    dashed blue line. For each observation a translucent red vertical segment
    joins the actual and the predicted value, and the RMSE (two decimals) is
    shown in a boxed label in the upper-left corner of the axes.

    The figure is written to
    ``quanti/data/<title>,<first date>,<last date>.png`` and then displayed.

    Parameters:
        dates (pd.Series): Dates of the observations; ``.iloc[0]`` and
            ``.iloc[-1]`` are used to build the output file name.
        actual (pd.Series): Observed target values.
        predicted (pd.Series): Model predictions, iterated in lockstep with
            ``dates`` and ``actual``.
        rmse (float): Root mean squared error to display on the chart.
        title (str): Chart title; also used as part of the output file name.
    """
    plt.figure(figsize=(14, 8))
    axes = plt.gca()

    # The two series being compared.
    axes.plot(dates, actual, label="Actual Values", color="orange", linewidth=2)
    axes.plot(
        dates,
        predicted,
        label="Predicted Values",
        color="blue",
        linestyle="--",
        linewidth=2,
    )

    # One vertical segment per observation, spanning actual -> predicted.
    for when, observed, forecast in zip(dates, actual, predicted):
        axes.plot([when, when], [observed, forecast], color="red", alpha=0.5)

    # RMSE label positioned in axes-relative coordinates.
    label_box = {"facecolor": "white", "alpha": 0.7, "edgecolor": "red"}
    axes.text(
        0.05,
        0.95,
        f"RMSE: {rmse:.2f}",
        transform=axes.transAxes,
        fontsize=14,
        color="red",
        bbox=label_box,
    )

    # Titles, axis labels and general cosmetics.
    axes.set_title(title, fontsize=16)
    axes.set_xlabel("Date", fontsize=14)
    axes.set_ylabel("CPIH Medical", fontsize=14)
    plt.xticks(rotation=45)
    axes.legend(fontsize=12)
    axes.grid(alpha=0.3)
    plt.tight_layout()

    output_path = f"quanti/data/{title},{dates.iloc[0]},{dates.iloc[-1]}.png"
    plt.savefig(output_path)
    plt.show()
|
|
|
|
|
def main(
    df: pd.DataFrame,
    model,
    train_start: str,
    train_end: str,
    test_start: str,
    test_end: str,
    target: str,
    features: list,
    param_grid: dict,
    train_start_bis: str = None,
    train_end_bis: str = None,
):
    """
    Tune, fit and evaluate a model on a date-based split, then plot the results.

    Steps: build the train/test sets with ``training_testing_data``, pick
    hyperparameters with ``get_best_params``, apply them to the model, fit and
    score it with ``train_model``, print the metrics, and produce the two
    plots (``plot`` and ``plot_rmse_explanation``).

    Parameters:
        df (pd.DataFrame): The DataFrame containing the data. Must contain a
            "date" column and the ``target`` column.
        model: Estimator object (not a string). It must expose
            ``set_params``, ``get_params`` and ``predict``.
        train_start (str): The start date for the training set, in the format "YYYY-MM-DD".
        train_end (str): The end date for the training set, in the format "YYYY-MM-DD".
        test_start (str): The start date for the testing set, in the format "YYYY-MM-DD".
        test_end (str): The end date for the testing set, in the format "YYYY-MM-DD".
        target (str): The target column.
        features (list): The features to use.
        param_grid (dict): The hyperparameter grid passed to ``get_best_params``.
        train_start_bis (str): The start date for the second training set, in the format "YYYY-MM-DD".
        train_end_bis (str): The end date for the second training set, in the format "YYYY-MM-DD".

    Returns:
        tuple: A tuple containing the R^2, MAE, MSE, and RMSE values.
    """
    X_train, y_train, X_test, y_test = training_testing_data(
        df,
        train_start,
        train_end,
        test_start,
        test_end,
        train_start_bis,
        train_end_bis,
        target,
        features,
    )

    # NOTE(review): the test split is handed to get_best_params. If it is used
    # to select hyperparameters, the metrics reported below are optimistic --
    # confirm against that helper.
    params = get_best_params(model, X_train, y_train, X_test, y_test, param_grid)
    print(X_train.columns)
    model = model.set_params(**params)

    # Fix the seed for reproducibility, but only when the estimator actually
    # has a random_state parameter: set_params raises ValueError on unknown
    # parameter names, which would break deterministic models.
    if "random_state" in model.get_params():
        model = model.set_params(random_state=42)

    r2, mae, mse, rmse = train_model(model, X_train, y_train, X_test, y_test)
    print(f"R^2: {r2}")
    print(f"MAE: {mae}")
    print(f"MSE: {mse}")
    print(f"RMSE: {rmse}")

    plot(df, model, test_start, test_end, target, features)

    # Select the test window once and reuse it for both the dates and the
    # actual values.
    in_test_window = (df["date"] >= test_start) & (df["date"] <= test_end)
    test_rows = df[in_test_window]
    dates_test = test_rows["date"]
    actual_test = test_rows[target]

    # Assumes X_test holds the same rows, in the same order, as test_rows --
    # TODO confirm against training_testing_data.
    predicted_test = model.predict(X_test)
    plot_rmse_explanation(dates_test, actual_test, predicted_test, rmse)

    return r2, mae, mse, rmse
|
|
|
|
|
if __name__ == "__main__":
    # Load the three raw sources and merge them into the modelling frame.
    cpih_df = read_cpih("quanti/data/cpih.csv", medical=False)
    cpim_df = read_cpih("quanti/data/cpih_medical.csv", medical=True)
    hes = read_hes("quanti/data/HES_M5_OPEN_DATA.csv")
    df = get_global_df(cpih_df, cpim_df, hes)
    df = get_final_df(df)
    print(df.columns)

    model = choose_model("rf")

    # NOTE(review): the test window (2007-2014) precedes the training window
    # (2014-2025), i.e. the model is evaluated on the past -- confirm this is
    # intentional.
    train_start = "2014-01-01"
    train_end = "2025-01-01"
    test_start = "2007-01-01"
    test_end = "2014-01-01"

    target = "target"
    # Every column except the date and the target is used as a feature.
    features = df.columns.drop(["date", "target"]).tolist()

    # Presumably "rf" is a scikit-learn random-forest regressor (verify against
    # choose_model). For that estimator the "mse" criterion was renamed to
    # "squared_error" in scikit-learn 1.0 and "mse" was removed in 1.2.
    # "poisson" already requires >= 1.0, so "squared_error" is valid wherever
    # this grid can run.
    param_grid = {
        "n_estimators": [50, 100, 200],
        "max_depth": [None, 5, 10, 20],
        "criterion": ["squared_error", "poisson"],
    }

    r2, mae, mse, rmse = main(
        df,
        model,
        train_start,
        train_end,
        test_start,
        test_end,
        target,
        features,
        param_grid,
    )
|
|