import re

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd

from constants import HES

# Title shapes recognised by get_category (applied with re.search).
_NO_YEAR_PATTERN = r"^(?!.*\b\d{4}\b).*$"  # No 4-digit year anywhere in the title
_YEAR_ONLY_PATTERN = r"^\b\d{4}\b$"  # A 4-digit year and nothing else
_YEAR_QUARTER_PATTERN = r"^\b\d{4}\b.*\bQ[1-4]\b"  # Year followed by "Q1".."Q4"
_YEAR_MONTH_PATTERN = (
    r"^\b\d{4}\b.*\b(JAN|FEB|MAR|APR|MAY|JUN|JUL|AUG|SEP|OCT|NOV|DEC)\b"
)  # Year followed by a three-letter month name


def categorize_title(title: str, patterns: dict) -> str:
    """
    Categorize a title based on a dictionary of patterns.

    Parameters:
        title (str): The title to categorize.
        patterns (dict): A dictionary where the keys are the categories and
            the values are the regex patterns to match. Patterns are tried in
            the dictionary's insertion order and the first match wins.

    Returns:
        str: The category of the title, or "Uncategorized" when no pattern
            matches or the title is not a string.
    """
    # Blank CSV cells are read as NaN (a float); re.search would raise
    # TypeError on them, so treat any non-string title as uncategorized.
    if not isinstance(title, str):
        return "Uncategorized"
    for category, pattern in patterns.items():
        if re.search(pattern, title):
            return category
    return "Uncategorized"  # For rows that don't fit any of the patterns


def _parse_period_titles(titles: pd.Series) -> pd.Series:
    """Convert "2020", "2020 Q1" or "2020 JAN" style titles to timestamps."""
    text = titles.astype(str).str.strip()
    if not text.empty:
        # Year-only titles map to 1 January of that year.
        if text.str.fullmatch(r"\d{4}").all():
            return pd.to_datetime(text, format="%Y")
        # Quarter titles map to the first day of the quarter.
        quarter_parts = text.str.extract(r"^(\d{4})\s+Q([1-4])$")
        if quarter_parts.notna().all().all():
            first_month = (quarter_parts[1].astype(int) - 1) * 3 + 1
            return pd.to_datetime(
                pd.DataFrame(
                    {
                        "year": quarter_parts[0].astype(int),
                        "month": first_month,
                        "day": 1,
                    }
                )
            )
    # Monthly titles (and anything else) keep the original strict parsing,
    # which raises ValueError on titles that are not "<year> <month abbr>".
    return pd.to_datetime(titles, format="%Y %b")


def get_category(
    df: pd.DataFrame, column: str, categories: list, cat: str
) -> pd.DataFrame:
    """
    Get a subset of a DataFrame based on the category of the titles in a column.

    Parameters:
        df (pd.DataFrame): The DataFrame to filter. It is not modified.
        column (str): The column containing the titles.
        categories (list): Four category names, matched positionally to the
            title shapes: [no year, year only, year + quarter, year + month].
            If a name is repeated, the later pattern replaces the earlier one
            while keeping the earlier position in the matching order.
        cat (str): The category whose rows should be returned.

    Returns:
        pd.DataFrame: The rows of the requested category with rows containing
            missing values removed, columns renamed to "date" (datetime) and
            "value" (float), and a fresh integer index.
    """
    patterns = {
        categories[0]: _NO_YEAR_PATTERN,
        categories[1]: _YEAR_ONLY_PATTERN,
        categories[2]: _YEAR_QUARTER_PATTERN,
        categories[3]: _YEAR_MONTH_PATTERN,
    }

    # Label rows in a separate Series so the caller's frame is left untouched.
    labels = df[column].apply(categorize_title, patterns=patterns)
    # Dropping "category" (if present) mirrors the earlier behaviour, where a
    # pre-existing column of that name was overwritten and then removed.
    result = (
        df.loc[labels == cat]
        .drop(columns=["category"], errors="ignore")
        .dropna()
    )

    # Exactly two columns (title, value) are expected to remain at this point.
    result.columns = ["date", "value"]
    result["date"] = _parse_period_titles(result["date"])
    result["value"] = result["value"].astype(float)
    return result.reset_index(drop=True)


def read_cpih(
    file_path: str, medical: bool = True, category: str = "Month"
) -> pd.DataFrame:
    """
    Read the CPIH data from a CSV file and return a DataFrame.

    Parameters:
        file_path (str): The path to the CSV file.
        medical (bool): Currently unused; kept so existing callers keep working.
        category (str): The category of the data to extract
            ("Month", "Year" or "Quarter").

    Returns:
        pd.DataFrame: The CPIH data with "date" and "value" columns.
    """
    # "Month" appears twice on purpose-or-by-accident: the last entry's
    # year + month pattern replaces the first entry's "no year" pattern, so
    # titles without a year end up "Uncategorized".
    return get_category(
        pd.read_csv(file_path), "Title", ["Month", "Year", "Quarter", "Month"], category
    )


def read_hes(
    file_path: str,
) -> pd.DataFrame:
    """
    Read the HES data from a CSV file and return a DataFrame.

    Parameters:
        file_path (str): The path to the CSV file. It must contain a
            "CALENDAR_MONTH_END_DATE" column.

    Returns:
        pd.DataFrame: The HES data sorted by date, without all-empty columns
            or rows with missing values, and with the date column renamed
            to "date".
    """
    df = pd.read_csv(file_path)

    # Every "-" becomes " 20" and the text is upper-cased before parsing;
    # presumably the raw values look like "APR-20" (-> "APR 2020") - verify
    # against the data file.
    month_labels = df["CALENDAR_MONTH_END_DATE"].str.replace("-", " 20").str.upper()
    df["CALENDAR_MONTH_END_DATE"] = pd.to_datetime(month_labels, format="mixed")

    df = (
        df.dropna(axis=1, how="all")
        .sort_values(by="CALENDAR_MONTH_END_DATE")
        .dropna()
        .reset_index(drop=True)
        .rename(columns={"CALENDAR_MONTH_END_DATE": "date"})
    )

    # MonthBegin(-1) moves each timestamp to the closest earlier month start;
    # a timestamp already on the 1st moves to the 1st of the previous month.
    # NOTE(review): confirm this one-month shift is the intended alignment.
    df["date"] = df["date"] + pd.offsets.MonthBegin(-1)
    return df


def get_global_df(
    cpih: pd.DataFrame, cpim: pd.DataFrame, hes: pd.DataFrame
) -> pd.DataFrame:
    """
    Merge the CPIH, CPIM and HES data into a single DataFrame.

    Parameters:
        cpih (pd.DataFrame): The CPIH data ("date" and "value" columns).
        cpim (pd.DataFrame): The CPIM data ("date" and "value" columns).
        hes (pd.DataFrame): The HES data (must contain a "date" column).

    Returns:
        pd.DataFrame: The inner join of the three inputs on "date", with the
            two value columns renamed to "cpih" and "cpih_medical", "year"
            and "month" columns added, and "date" removed.
    """
    # Both price frames carry a "value" column, so the first merge suffixes
    # them as value_x (cpih) and value_y (cpim).
    joined_data = (
        pd.merge(cpih, cpim, on="date", how="inner")
        .merge(hes, on="date", how="inner")
        .rename(columns={"value_x": "cpih", "value_y": "cpih_medical"})
    )
    joined_data["year"] = joined_data["date"].dt.year
    joined_data["month"] = joined_data["date"].dt.month
    return joined_data.drop(columns=["date"])


def get_final_df(joined_data: pd.DataFrame) -> pd.DataFrame:
    """
    Create the final DataFrame for training and testing.

    Parameters:
        joined_data (pd.DataFrame): The merged DataFrame. It must contain
            "year", "month", "cpih", "cpih_medical" and the HES columns.
            Side effect: a "date" column (first day of each month) is added
            to this frame in place.

    Returns:
        pd.DataFrame: "date", "target" (current cpih_medical), lags 1-3 of
            cpih_medical ("cpim_lag*") and cpih ("cpih_lag*"), and the HES
            columns lagged by one row. Rows with any missing value (including
            the leading rows produced by the lags) are dropped.
    """
    joined_data["date"] = pd.to_datetime(joined_data[["year", "month"]].assign(day=1))

    final_data = pd.DataFrame(
        {"date": joined_data["date"], "target": joined_data["cpih_medical"]}
    )

    # Column order matters to downstream consumers: all cpim lags first,
    # then all cpih lags.
    for prefix, source in (("cpim", "cpih_medical"), ("cpih", "cpih")):
        for lag in (1, 2, 3):
            final_data[f"{prefix}_lag{lag}"] = joined_data[source].shift(lag)

    final_data[HES] = joined_data[HES].shift(1)

    return final_data.dropna().reset_index(drop=True)