axa / quanti /data_preprocessing.py
Mayara Ayat
Upload folder using huggingface_hub
f7ab812 verified
import pandas as pd
import numpy as np
import re
import matplotlib.pyplot as plt
from constants import HES
def categorize_title(title: str, patterns: dict) -> str:
"""
Categorize a title based on a dictionary of patterns.
Parameters:
title (str): The title to categorize.
patterns (dict): A dictionary where the keys are the categories and the values are the patterns to match.
Returns:
str: The category of the title."""
for category, pattern in patterns.items():
if re.search(pattern, title):
return category
return "Uncategorized" # For rows that don't fit any of the patterns
def get_category(
df: pd.DataFrame, column: str, categories: list, cat: str
) -> pd.DataFrame:
""" "
Get a subset of a DataFrame based on the category of the titles in a column.
Parameters:
df (pd.DataFrame): The DataFrame to filter.
column (str): The column containing the titles.
categories (list): A list of categories.
Returns:
pd.DataFrame: The subset of the DataFrame that matches the category."""
patterns = {
categories[0]: r"^(?!.*\b\d{4}\b).*$", # No 4-digit year anywhere in the title
categories[1]: r"^\b\d{4}\b$", # Starts with a 4-digit year and nothing else
categories[
2
]: r"^\b\d{4}\b.*\bQ[1-4]\b", # Starts with a year and contains "Q1", "Q2", etc.
categories[
3
]: r"^\b\d{4}\b.*\b(JAN|FEB|MAR|APR|MAY|JUN|JUL|AUG|SEP|OCT|NOV|DEC)\b", # Starts with a year and contains a month name
}
df["category"] = df[column].apply(categorize_title, patterns=patterns)
result = df[df["category"] == cat]
result = result.drop(columns=["category"])
result = result.dropna()
result.columns = ["date", "value"]
result["date"] = pd.to_datetime(result["date"], format="%Y %b")
result["value"] = result["value"].astype(float)
result = result.reset_index(drop=True)
return result
def read_cpih(
file_path: str, medical: bool = True, category: str = "Month"
) -> pd.DataFrame:
"""
Read the CPIH data from a CSV file and return a DataFrame.
Parameters:
file_path (str): The path to the CSV file.
category (str): The category of the data to extract.
Returns:
pd.DataFrame: The CPIH data."""
return get_category(
pd.read_csv(file_path), "Title", ["Month", "Year", "Quarter", "Month"], category
)
def read_hes(
file_path: str,
):
"""
Read the HES data from a CSV file and return a DataFrame.
Parameters:
file_path (str): The path to the CSV file.
Returns:
pd.DataFrame: The HES data."""
df = pd.read_csv(file_path)
df["CALENDAR_MONTH_END_DATE"] = df["CALENDAR_MONTH_END_DATE"].str.replace(
"-", " 20"
)
df["CALENDAR_MONTH_END_DATE"] = df["CALENDAR_MONTH_END_DATE"].str.upper()
df["CALENDAR_MONTH_END_DATE"] = pd.to_datetime(
df["CALENDAR_MONTH_END_DATE"], format="mixed"
)
df = df.dropna(axis=1, how="all")
df = df.sort_values(by="CALENDAR_MONTH_END_DATE")
df = df.dropna()
df = df.reset_index(drop=True)
df.rename(columns={"CALENDAR_MONTH_END_DATE": "date"}, inplace=True)
df["date"] = pd.to_datetime(df["date"], format="%Y %b")
df["date"] = df["date"] + pd.offsets.MonthBegin(-1)
return df
def get_global_df(
cpih: pd.DataFrame, cpim: pd.DataFrame, hes: pd.DataFrame
) -> pd.DataFrame:
"""
Merge the CPIH, CPIM and HES data into a single DataFrame.
Parameters:
cpih (pd.DataFrame): The CPIH data.
cpim (pd.DataFrame): The CPIM data.
hes (pd.DataFrame): The HES data.
Returns:
pd.DataFrame: The merged DataFrame."""
joined_data = pd.merge(cpih, cpim, on="date", how="inner").merge(
hes, on="date", how="inner"
)
joined_data.rename(
columns={"value_x": "cpih", "value_y": "cpih_medical"}, inplace=True
)
joined_data["year"] = joined_data["date"].dt.year
joined_data["month"] = joined_data["date"].dt.month
joined_data.drop(columns=["date"], inplace=True)
return joined_data
def get_final_df(joined_data: pd.DataFrame) -> pd.DataFrame:
"""
Create the final DataFrame for training and testing.
Parameters:
joined_data (pd.DataFrame): The merged DataFrame.
Returns:
pd.DataFrame: The final DataFrame."""
joined_data["date"] = pd.to_datetime(joined_data[["year", "month"]].assign(day=1))
final_data = pd.DataFrame(columns=["date"])
final_data["date"] = joined_data["date"]
final_data["target"] = joined_data["cpih_medical"]
final_data["cpim_lag1"] = joined_data["cpih_medical"].shift(1)
final_data["cpim_lag2"] = joined_data["cpih_medical"].shift(2)
final_data["cpim_lag3"] = joined_data["cpih_medical"].shift(3)
final_data["cpih_lag1"] = joined_data["cpih"].shift(1)
final_data["cpih_lag2"] = joined_data["cpih"].shift(2)
final_data["cpih_lag3"] = joined_data["cpih"].shift(3)
final_data[HES] = joined_data[HES].shift(1)
final_data.dropna(inplace=True)
final_data.reset_index(drop=True, inplace=True)
return final_data