Spaces:

macota1
/

axa

Runtime error

axa / quanti /__main__.py

Mayara Ayat

Upload folder using huggingface_hub

f7ab812 verified 4 months ago

5.31 kB

	from models import *
	from data_preprocessing import *


	def plot_rmse_explanation(
	    dates: pd.Series,
	    actual: pd.Series,
	    predicted: pd.Series,
	    rmse: float,
	    title: str = "Understanding RMSE: Actual vs Predicted",
	):
	    """
	    Plot actual vs. predicted values, mark every residual and annotate the RMSE.

	    The figure is saved as a PNG under ``quanti/data/`` (the file name is built
	    from ``title`` and the first and last entries of ``dates``), displayed with
	    ``plt.show()`` and then closed so repeated calls do not pile up open figures.

	    Parameters:
	        dates (pd.Series): Dates corresponding to the observations. Must be
	            non-empty: its first and last entries are used in the file name.
	        actual (pd.Series): Actual target values, aligned with ``dates``.
	        predicted (pd.Series): Predicted values by the model, aligned with
	            ``dates``.
	        rmse (float): The root mean squared error value shown in the annotation.
	        title (str): The title of the plot; also used in the output file name.

	    Returns:
	        None
	    """
	    fig = plt.figure(figsize=(14, 8))
	    try:
	        # Plot actual vs. predicted values
	        plt.plot(dates, actual, label="Actual Values", color="orange", linewidth=2)
	        plt.plot(
	            dates,
	            predicted,
	            label="Predicted Values",
	            color="blue",
	            linestyle="--",
	            linewidth=2,
	        )

	        # Highlight errors (residuals): one vertical segment per observation,
	        # drawn as a single collection instead of one plt.plot call per point.
	        plt.vlines(dates, actual, predicted, colors="red", alpha=0.5)

	        # Annotate RMSE value (axes coordinates: top-left corner of the plot)
	        plt.text(
	            0.05,
	            0.95,
	            f"RMSE: {rmse:.2f}",
	            transform=plt.gca().transAxes,
	            fontsize=14,
	            color="red",
	            bbox=dict(facecolor="white", alpha=0.7, edgecolor="red"),
	        )

	        # Add plot details
	        plt.title(title, fontsize=16)
	        plt.xlabel("Date", fontsize=14)
	        plt.ylabel("CPIH Medical", fontsize=14)
	        plt.xticks(rotation=45)
	        plt.legend(fontsize=12)
	        plt.grid(alpha=0.3)
	        plt.tight_layout()

	        # Save before show(): depending on the backend the figure may no longer
	        # be available once the window has been closed.
	        plt.savefig(f"quanti/data/{title},{dates.iloc[0]},{dates.iloc[-1]}.png")
	        plt.show()
	    finally:
	        # Release the figure even if plotting or saving failed.
	        plt.close(fig)


	def main(
	    df: pd.DataFrame,
	    model,
	    train_start: str,
	    train_end: str,
	    test_start: str,
	    test_end: str,
	    target: str,
	    features: list,
	    param_grid: dict,
	    train_start_bis: str = None,
	    train_end_bis: str = None,
	):
	    """
	    Tune, fit and score a model on date-bounded slices of ``df``, then plot the results.

	    Steps, in order: build the train/test split, search ``param_grid`` for the
	    best hyperparameters, apply them (plus ``random_state=42``) to ``model``,
	    train and score it, print the metrics, and draw both the prediction plot
	    and the RMSE explanation plot for the test window.

	    Parameters:
	        df (pd.DataFrame): The data; must contain a "date" column and ``target``.
	        model: The estimator to tune and fit. It must expose ``set_params`` and
	            ``predict``.
	        train_start (str): Start date of the training set, "YYYY-MM-DD".
	        train_end (str): End date of the training set, "YYYY-MM-DD".
	        test_start (str): Start date of the testing set, "YYYY-MM-DD".
	        test_end (str): End date of the testing set, "YYYY-MM-DD".
	        target (str): Name of the target column.
	        features (list): Names of the feature columns.
	        param_grid (dict): Hyperparameter grid handed to ``get_best_params``.
	        train_start_bis (str): Start date of an optional second training
	            period, "YYYY-MM-DD".
	        train_end_bis (str): End date of an optional second training period,
	            "YYYY-MM-DD".

	    Returns:
	        tuple: ``(r2, mae, mse, rmse)`` as returned by ``train_model``.
	    """
	    split = training_testing_data(
	        df,
	        train_start,
	        train_end,
	        test_start,
	        test_end,
	        train_start_bis,
	        train_end_bis,
	        target,
	        features,
	    )
	    X_train, y_train, X_test, y_test = split

	    best_params = get_best_params(
	        model, X_train, y_train, X_test, y_test, param_grid
	    )
	    print(X_train.columns)

	    # Apply the tuned hyperparameters, then pin the seed for reproducibility.
	    model = model.set_params(**best_params).set_params(random_state=42)

	    r2, mae, mse, rmse = train_model(model, X_train, y_train, X_test, y_test)
	    print(f"R^2: {r2}")
	    print(f"MAE: {mae}")
	    print(f"MSE: {mse}")
	    print(f"RMSE: {rmse}")

	    plot(df, model, test_start, test_end, target, features)

	    # Select the test-window rows once; both the dates and the actual target
	    # values for the RMSE plot come from this same selection.
	    in_test_window = (df["date"] >= test_start) & (df["date"] <= test_end)
	    test_rows = df[in_test_window]
	    plot_rmse_explanation(
	        test_rows["date"],
	        test_rows[target],
	        model.predict(X_test),
	        rmse,
	    )

	    return r2, mae, mse, rmse


	if __name__ == "__main__":
	    # Load the raw tables and merge them into the final modelling frame.
	    cpih_df = read_cpih("quanti/data/cpih.csv", medical=False)
	    cpim_df = read_cpih("quanti/data/cpih_medical.csv", medical=True)
	    hes = read_hes("quanti/data/HES_M5_OPEN_DATA.csv")
	    df = get_global_df(cpih_df, cpim_df, hes)
	    df = get_final_df(df)
	    print(df.columns)

	    # NOTE(review): the grid below assumes choose_model("rf") returns a
	    # scikit-learn random forest regressor - confirm against models.py.
	    model = choose_model("rf")

	    # NOTE(review): the test window (2007-2014) precedes the training window
	    # (2014-2025) and both use 2014-01-01 as a bound. If the bounds are treated
	    # as inclusive, that date ends up in both sets - confirm this is intended.
	    train_start = "2014-01-01"
	    train_end = "2025-01-01"
	    # Optional second training period (currently disabled):
	    # train_start_bis = "2019-01-01"
	    # train_end_bis = "2024-12-01"
	    test_start = "2007-01-01"
	    test_end = "2014-01-01"

	    # Every column except the date and the target itself is used as a feature.
	    target = "target"
	    features = df.columns.drop(["date", "target"]).tolist()

	    param_grid = {
	        "n_estimators": [50, 100, 200],
	        "max_depth": [None, 5, 10, 20],
	        # "squared_error" replaces the former "mse" spelling, which
	        # scikit-learn deprecated in 1.0 and removed in 1.2. The "poisson"
	        # option for forests only exists from 1.0 on, so this name is valid
	        # on every version this grid can run on.
	        "criterion": ["squared_error", "poisson"],
	    }
	    # Alternative grids kept for reference (they include learning_rate, so
	    # presumably they target a boosting model rather than "rf"):
	    # param_grid = {
	    #     "n_estimators": [50, 100, 200],  # Number of trees
	    #     "max_depth": [3, 5, 7, 10],  # Maximum depth of a tree
	    #     "learning_rate": [0.01, 0.1, 0.2],  # Learning rate
	    # }
	    # param_grid = {
	    #     "n_estimators": [100, 200, 500],
	    #     "learning_rate": [0.01, 0.1, 0.2],
	    #     "max_depth": [5, 10, 20],
	    # }

	    r2, mae, mse, rmse = main(
	        df,
	        model,
	        train_start,
	        train_end,
	        test_start,
	        test_end,
	        target,
	        features,
	        param_grid,
	        # train_start_bis,
	        # train_end_bis,
	    )