# axa / quanti /__main__.py
# Mayara Ayat
# Upload folder using huggingface_hub
# f7ab812 verified
from models import *
from data_preprocessing import *
def plot_rmse_explanation(
    dates: pd.Series,
    actual: pd.Series,
    predicted: pd.Series,
    rmse: float,
    title: str = "Understanding RMSE: Actual vs Predicted",
):
    """
    Plot actual vs. predicted values, draw the residuals and annotate the RMSE.

    The figure is written to ``quanti/data/<title>,<first date>,<last date>.png``
    (relative to the current working directory) and then shown with
    ``plt.show()``.

    Parameters:
    dates (pd.Series): Dates corresponding to the observations. Must be
        non-empty: the first and last entries are used in the file name.
    actual (pd.Series): Actual target values.
    predicted (pd.Series): Predicted values by the model (any iterable of the
        same length works; values are paired with ``zip``).
    rmse (float): The root mean squared error value, shown with two decimals.
    title (str): The title of the plot; also used as the file-name prefix.
    """
    import os  # local import: only needed for the output-path handling below

    plt.figure(figsize=(14, 8))

    # Plot actual vs. predicted values
    plt.plot(dates, actual, label="Actual Values", color="orange", linewidth=2)
    plt.plot(
        dates,
        predicted,
        label="Predicted Values",
        color="blue",
        linestyle="--",
        linewidth=2,
    )

    # Highlight errors (residuals): one vertical segment per observation,
    # joining the actual value to the predicted one.
    for date, act, pred in zip(dates, actual, predicted):
        plt.plot([date, date], [act, pred], color="red", alpha=0.5)

    # Annotate RMSE value in axes coordinates (top-left corner of the plot)
    plt.text(
        0.05,
        0.95,
        f"RMSE: {rmse:.2f}",
        transform=plt.gca().transAxes,
        fontsize=14,
        color="red",
        bbox=dict(facecolor="white", alpha=0.7, edgecolor="red"),
    )

    # Add plot details
    plt.title(title, fontsize=16)
    plt.xlabel("Date", fontsize=14)
    plt.ylabel("CPIH Medical", fontsize=14)
    plt.xticks(rotation=45)
    plt.legend(fontsize=12)
    plt.grid(alpha=0.3)
    plt.tight_layout()

    # Build the file name separately from the directory so that a path
    # separator inside the title or the stringified dates cannot redirect the
    # output into a (usually non-existent) sub-directory.
    file_name = f"{title},{dates.iloc[0]},{dates.iloc[-1]}.png"
    for separator in (os.sep, os.altsep):
        if separator:
            file_name = file_name.replace(separator, "_")

    # savefig does not create missing directories; make sure the target exists.
    os.makedirs("quanti/data", exist_ok=True)
    plt.savefig(f"quanti/data/{file_name}")
    plt.show()
def main(
    df: pd.DataFrame,
    model,
    train_start: str,
    train_end: str,
    test_start: str,
    test_end: str,
    target: str,
    features: list,
    param_grid: dict,
    train_start_bis: str = None,
    train_end_bis: str = None,
):
    """
    Tune, train and evaluate a model on date-based splits of the given data.

    Steps: build the train/test sets with ``training_testing_data``, pick
    hyperparameters with ``get_best_params``, apply them to ``model``, fit and
    score it with ``train_model``, print the metrics, then produce the plots
    (``plot`` and ``plot_rmse_explanation``) for the test window.

    Parameters:
    df (pd.DataFrame): The DataFrame containing the data. Must have a "date"
        column, which is compared against ``test_start`` / ``test_end``.
    model: The estimator to use. It must expose ``set_params``, ``get_params``
        and ``predict`` (scikit-learn style API).
    train_start (str): The start date for the training set, in the format "YYYY-MM-DD".
    train_end (str): The end date for the training set, in the format "YYYY-MM-DD".
    test_start (str): The start date for the testing set, in the format "YYYY-MM-DD".
    test_end (str): The end date for the testing set, in the format "YYYY-MM-DD".
    target (str): The target column.
    features (list): The features to use.
    param_grid (dict): The hyperparameter grid.
    train_start_bis (str): The start date for the second training set, in the format "YYYY-MM-DD".
    train_end_bis (str): The end date for the second training set, in the format "YYYY-MM-DD".

    Returns:
    tuple: A tuple containing the R^2, MAE, MSE, and RMSE values, as returned
        by ``train_model``.
    """
    X_train, y_train, X_test, y_test = training_testing_data(
        df,
        train_start,
        train_end,
        test_start,
        test_end,
        train_start_bis,
        train_end_bis,
        target,
        features,
    )

    params = get_best_params(model, X_train, y_train, X_test, y_test, param_grid)
    print(X_train.columns)
    model = model.set_params(**params)

    # Add random state for reproducibility. Not every estimator accepts a
    # ``random_state`` parameter, and ``set_params`` raises on unknown names,
    # so only set it when the estimator actually exposes it.
    if "random_state" in model.get_params():
        model = model.set_params(random_state=42)

    r2, mae, mse, rmse = train_model(model, X_train, y_train, X_test, y_test)
    print(f"R^2: {r2}")
    print(f"MAE: {mae}")
    print(f"MSE: {mse}")
    print(f"RMSE: {rmse}")

    plot(df, model, test_start, test_end, target, features)

    # Select the test window once (both bounds inclusive) and reuse it for the
    # dates and the actual target values.
    in_test_window = (df["date"] >= test_start) & (df["date"] <= test_end)
    test_rows = df[in_test_window]
    dates_test = test_rows["date"]
    actual_test = test_rows[target]

    # NOTE(review): assumes X_test holds exactly the rows of this window, in
    # the same order, so predictions line up with dates_test — confirm against
    # training_testing_data.
    predicted_test = model.predict(X_test)
    plot_rmse_explanation(dates_test, actual_test, predicted_test, rmse)

    return r2, mae, mse, rmse
if __name__ == "__main__":
    # Read the three input CSV files (paths are relative to the current
    # working directory) and combine them into the modelling DataFrame.
    cpih_df = read_cpih("quanti/data/cpih.csv", medical=False)
    cpim_df = read_cpih("quanti/data/cpih_medical.csv", medical=True)
    hes = read_hes("quanti/data/HES_M5_OPEN_DATA.csv")
    df = get_global_df(cpih_df, cpim_df, hes)
    df = get_final_df(df)
    print(df.columns)

    model = choose_model("rf")

    # NOTE(review): the test window (2007-2014) lies *before* the training
    # window (2014-2025) and both use 2014-01-01 as a bound — confirm that this
    # back-testing setup and the shared boundary date are intended.
    train_start = "2014-01-01"
    train_end = "2025-01-01"
    # train_start_bis = "2019-01-01"
    # train_end_bis = "2024-12-01"
    test_start = "2007-01-01"
    test_end = "2014-01-01"

    target = "target"
    # Every column except the date and the target is used as a feature.
    features = df.columns.drop(["date", "target"]).tolist()

    # Grid for the "rf" model. scikit-learn renamed the "mse" criterion to
    # "squared_error" in 1.0 and removed the old spelling in 1.2, so the new
    # name is used here.
    # NOTE(review): assumes choose_model("rf") returns a scikit-learn random
    # forest regressor — confirm.
    param_grid = {
        "n_estimators": [50, 100, 200],
        "max_depth": [None, 5, 10, 20],
        "criterion": ["squared_error", "poisson"],
    }
    # Alternative grids kept for reference (presumably for boosting-type
    # models, given the learning_rate entry — verify before use):
    # param_grid = {
    #     "n_estimators": [50, 100, 200],  # Number of trees
    #     "max_depth": [3, 5, 7, 10],  # Maximum depth of a tree
    #     "learning_rate": [0.01, 0.1, 0.2],  # Learning rate
    # }
    # param_grid = {
    #     "n_estimators": [100, 200, 500],
    #     "learning_rate": [0.01, 0.1, 0.2],
    #     "max_depth": [5, 10, 20],
    # }

    r2, mae, mse, rmse = main(
        df,
        model,
        train_start,
        train_end,
        test_start,
        test_end,
        target,
        features,
        param_grid,
        # train_start_bis,
        # train_end_bis,
    )