| """Data pre-processing functions.""" | |
| import numpy | |
| from sklearn.compose import ColumnTransformer | |
| from sklearn.pipeline import Pipeline | |
| from sklearn.preprocessing import OneHotEncoder, FunctionTransformer, KBinsDiscretizer | |
def _get_pipeline_replace_one_hot(func, value):
    """Build a two-step pipeline: value replacement, then one-hot encoding.

    Args:
        func: Callable wrapped in a ``FunctionTransformer``; it is invoked
            with the data plus the keyword argument ``value``.
        value: Object forwarded to ``func`` as its ``value`` keyword.

    Returns:
        A ``Pipeline`` whose steps are named ``"replace"`` and ``"one_hot"``.
    """
    replace_step = FunctionTransformer(
        func,
        kw_args={"value": value},
        # The replacement keeps one output column per input column.
        feature_names_out="one-to-one",
    )
    encode_step = OneHotEncoder()
    return Pipeline([
        ("replace", replace_step),
        ("one_hot", encode_step),
    ])
| def _replace_values_geq(column, value): | |
| return numpy.where(column >= value, f"{value}_or_more", column) | |
| def _replace_values_eq(column, value): | |
| for desired_value, values_to_replace in value.items(): | |
| column = numpy.where(numpy.isin(column, values_to_replace), desired_value, column) | |
| return column | |
def get_pre_processors():
    """Create the user and third-party column pre-processors.

    Both are ``ColumnTransformer`` instances configured with
    ``remainder='passthrough'`` and ``verbose_feature_names_out=False``.

    Returns:
        A tuple ``(pre_processor_user, pre_processor_third_party)``.
    """
    # Fine-grained occupations are merged into three coarse groups before
    # they are one-hot encoded.
    occupation_groups = {
        "Labor_work": [
            "Cleaning staff",
            "Cooking staff",
            "Drivers",
            "Laborers",
            "Low-skill Laborers",
            "Security staff",
            "Waiters/barmen staff",
        ],
        "Office_work": [
            "Accountants",
            "Core staff",
            "HR staff",
            "Medicine staff",
            "Private service staff",
            "Realty agents",
            "Sales staff",
            "Secretaries",
        ],
        "High_tech_work": ["Managers", "High skill tech staff", "IT staff"],
    }
    income_groups = {"State servant": ["Pensioner", "Student"]}
    education_groups = {"Higher education": ["Academic degree"]}

    user_transformers = [
        # Counts at or above the threshold share a single category.
        (
            "replace_num_children",
            _get_pipeline_replace_one_hot(_replace_values_geq, 2),
            ["Num_children"],
        ),
        (
            "replace_num_family",
            _get_pipeline_replace_one_hot(_replace_values_geq, 3),
            ["Num_family"],
        ),
        # Categorical columns whose listed values are merged before encoding.
        (
            "replace_income_type",
            _get_pipeline_replace_one_hot(_replace_values_eq, income_groups),
            ["Income_type"],
        ),
        (
            "replace_education_type",
            _get_pipeline_replace_one_hot(_replace_values_eq, education_groups),
            ["Education_type"],
        ),
        (
            "replace_occupation_type_labor",
            _get_pipeline_replace_one_hot(_replace_values_eq, occupation_groups),
            ["Occupation_type"],
        ),
        # Columns that are one-hot encoded or binned directly.
        (
            "one_hot_housing_fam_status",
            OneHotEncoder(),
            ["Housing_type", "Family_status"],
        ),
        (
            "qbin_total_income",
            KBinsDiscretizer(n_bins=3, strategy="quantile", encode="onehot"),
            ["Total_income"],
        ),
        (
            "bin_age",
            KBinsDiscretizer(n_bins=5, strategy="uniform", encode="onehot"),
            ["Age"],
        ),
    ]
    third_party_transformers = [
        (
            "bin_years_employed",
            KBinsDiscretizer(n_bins=5, strategy="uniform", encode="onehot"),
            ["Years_employed"],
        ),
    ]

    pre_processor_user = ColumnTransformer(
        transformers=user_transformers,
        remainder="passthrough",
        verbose_feature_names_out=False,
    )
    pre_processor_third_party = ColumnTransformer(
        transformers=third_party_transformers,
        remainder="passthrough",
        verbose_feature_names_out=False,
    )
    return pre_processor_user, pre_processor_third_party
def select_and_pop_features(data, columns):
    """Split ``columns`` off ``data`` and return them as a separate copy.

    Args:
        data: Table-like object (presumably a pandas DataFrame) that is
            modified in place: the selected columns are dropped from it.
        columns: Column label(s) to extract.

    Returns:
        A copy of ``data[columns]`` taken before the columns are dropped.
    """
    extracted = data[columns].copy()
    # In-place drop so the caller's object no longer holds these columns.
    data.drop(columns=columns, inplace=True)
    return extracted