Spaces:

erasmopurif
/

FairUP

Runtime error

App Files Files Community

FairUP / src /aif360 /datasets /regression_dataset.py

erasmopurif

First commit

d2a8669 almost 2 years ago

raw

history blame

4.92 kB

	from logging import warning

	import numpy as np
	import pandas as pd

	from aif360.datasets import StructuredDataset

	from sklearn.preprocessing import MinMaxScaler


	class RegressionDataset(StructuredDataset):
	    """Base class for regression datasets.

	    The constructor runs a fixed preprocessing pipeline on a DataFrame
	    (optional custom hook, NA-row removal, one-hot encoding, protected
	    attribute mapping, min-max normalization) and then hands the result to
	    the parent dataset constructor.
	    """

	    def __init__(self, df, dep_var_name, protected_attribute_names,
	                 privileged_classes, instance_weights_name='',
	                 categorical_features=None, na_values=None,
	                 custom_preprocessing=None, metadata=None):
	        """
	        Subclasses of RegressionDataset should perform the following before
	        calling `super().__init__`:

	            1. Load the dataframe from a raw file.

	        Then, this class will go through a standard preprocessing routine
	        which:

	            2. (optional) Performs some dataset-specific preprocessing (e.g.
	               renaming columns/values, handling missing data).

	            3. Drops rows with NA values.

	            4. Creates a one-hot encoding of the categorical variables.

	            5. Maps protected attributes to binary privileged/unprivileged
	               values (1/0).

	            6. Normalizes every column of the DataFrame (including the
	               dependent variable) with a min-max scaler. The
	               privileged/unprivileged value arrays are passed through the
	               same per-column scaling so that they keep matching the
	               normalized protected-attribute columns.

	        Args:
	            df (pandas.DataFrame): DataFrame on which to perform standard
	                processing.
	            dep_var_name: Name of the dependent variable column in `df`.
	            protected_attribute_names (list): List of names corresponding to
	                protected attribute columns in `df`.
	            privileged_classes (list(list or function)): Each element is
	                a list of values which are considered privileged or a boolean
	                function which returns `True` if privileged for the
	                corresponding column in `protected_attribute_names`. All
	                others are unprivileged. Values are mapped to 1 (privileged)
	                and 0 (unprivileged) if they are not already numerical.
	            instance_weights_name (optional): Name of the instance weights
	                column in `df`.
	            categorical_features (optional, list): List of column names in
	                the DataFrame which are to be expanded into one-hot vectors.
	                `None` (the default) is treated as an empty list.
	            na_values (optional): Additional strings to recognize as NA. See
	                :func:`pandas.read_csv` for details. Accepted for interface
	                compatibility; this constructor itself does not read it.
	            custom_preprocessing (function): A function object which
	                acts on and returns a DataFrame (f: DataFrame -> DataFrame).
	                If `None`, no extra preprocessing is applied.
	            metadata (optional): Additional metadata to append.
	        """
	        # Avoid mutable default arguments; an explicit empty list is needed
	        # because `pd.get_dummies(columns=None)` would encode every
	        # object/category column instead of none.
	        if categorical_features is None:
	            categorical_features = []

	        # 2. Perform dataset-specific preprocessing
	        if custom_preprocessing:
	            df = custom_preprocessing(df)

	        # 3. Remove any rows that have missing data.
	        dropped = df.dropna()
	        count = df.shape[0] - dropped.shape[0]
	        if count > 0:
	            warning("Missing Data: {} rows removed from {}.".format(count,
	                    type(self).__name__))
	        df = dropped

	        # 4. Create a one-hot encoding of the categorical variables.
	        df = pd.get_dummies(df, columns=categorical_features, prefix_sep='=')

	        # 5. Map protected attributes to privileged/unprivileged
	        mapped_attributes = []
	        privileged_protected_attributes = []
	        unprivileged_protected_attributes = []
	        for attr, vals in zip(protected_attribute_names, privileged_classes):
	            privileged_values = [1.]
	            unprivileged_values = [0.]
	            if callable(vals):
	                df[attr] = df[attr].apply(vals)
	            elif np.issubdtype(df[attr].dtype, np.number):
	                # this attribute is numeric; no remapping needed
	                privileged_values = vals
	                unprivileged_values = list(set(df[attr]).difference(vals))
	            else:
	                # find all instances which match any of the attribute values
	                priv = np.logical_or.reduce(
	                    np.equal.outer(vals, df[attr].to_numpy()))
	                df.loc[priv, attr] = privileged_values[0]
	                df.loc[~priv, attr] = unprivileged_values[0]

	            mapped_attributes.append(attr)
	            privileged_protected_attributes.append(
	                np.array(privileged_values, dtype=np.float64))
	            unprivileged_protected_attributes.append(
	                np.array(unprivileged_values, dtype=np.float64))

	        # 6. Normalize df values
	        scaler = MinMaxScaler()
	        column_names = list(df)
	        df = pd.DataFrame(scaler.fit_transform(df.values),
	                          columns=column_names, index=df.index)

	        # The protected-attribute columns were rescaled together with the
	        # rest of the frame, so the privileged/unprivileged values recorded
	        # above must go through the same affine map (x * scale_ + min_);
	        # otherwise e.g. a 1/2-coded column becomes 0/1 while the metadata
	        # still claims "privileged == 1". For columns that already span
	        # exactly 0..1 this is the identity.
	        for pos, attr in enumerate(mapped_attributes):
	            col = column_names.index(attr)
	            scale = scaler.scale_[col]
	            offset = scaler.min_[col]
	            privileged_protected_attributes[pos] = (
	                privileged_protected_attributes[pos] * scale + offset)
	            unprivileged_protected_attributes[pos] = (
	                unprivileged_protected_attributes[pos] * scale + offset)

	        super().__init__(
	            df=df,
	            label_names=[dep_var_name],
	            protected_attribute_names=protected_attribute_names,
	            privileged_protected_attributes=privileged_protected_attributes,
	            unprivileged_protected_attributes=unprivileged_protected_attributes,
	            instance_weights_name=instance_weights_name,
	            scores_names=[],
	            metadata=metadata)