Spaces:
Runtime error
Runtime error
import sklearn.preprocessing | |
import numpy as np | |
from aif360.algorithms import Transformer | |
class LimeEncoder(Transformer):
    """Transformer for converting aif360 dataset to LIME dataset and vice versa.

    (LIME - Local Interpretable Model-Agnostic Explanations) [2]_

    See for details/usage:
    https://github.com/marcotcr/lime

    After :meth:`fit` the encoder exposes:

    * ``s_feature_names_with_one_hot_encoding`` - ``dataset.feature_names``.
    * ``s_feature_names`` - column names of the de-dummy-coded dataframe
      without the label column.
    * ``s_data`` - the values of that dataframe.
    * ``s_protected_attribute_names`` - ``dataset.protected_attribute_names``.
    * ``s_labels`` / ``s_class_names`` - the label column and its distinct
      values as reported by ``LabelEncoder.classes_``.
    * ``s_categorical_features`` - indexes into ``s_feature_names`` of the
      categorical features (protected attributes included).
    * ``s_categorical_names`` - ``{feature index: LabelEncoder.classes_}``
      for every index in ``s_categorical_features``.

    References:
        .. [2] M.T. Ribeiro, S. Singh, and C. Guestrin, '"Why should I trust
           you?" Explaining the predictions of any classifier.'
           https://arxiv.org/pdf/1602.04938v1.pdf
    """

    def __init__(self):
        super(LimeEncoder, self).__init__()

    def fit(self, dataset):
        """Take an aif360 dataset and save all relevant metadata as well as
        mappings needed to transform/inverse_transform the data between aif360
        and lime.

        Args:
            dataset (BinaryLabelDataset): aif360 dataset

        Returns:
            LimeEncoder: Returns self.
        """
        self.s_feature_names_with_one_hot_encoding = dataset.feature_names
        label_name = dataset.label_names[0]

        df, _ = dataset.convert_to_dataframe(de_dummy_code=True)
        dfc = df.drop(label_name, axis=1)  # remove label (class) column
        self.s_feature_names = list(dfc.columns)
        self.s_data = dfc.values

        # Categorical features are one-hot encoded under a changed name, so
        # any de-dummy-coded name that is absent from the one-hot names is
        # categorical. Walking s_feature_names (instead of taking a set
        # difference) keeps the resulting order deterministic across runs.
        one_hot_names = set(self.s_feature_names_with_one_hot_encoding)
        categorical_names = []
        for name in self.s_feature_names:
            if name not in one_hot_names and name not in categorical_names:
                categorical_names.append(name)

        # Protected attributes are treated as categorical as well.
        self.s_protected_attribute_names = dataset.protected_attribute_names
        categorical_names = categorical_names \
            + list(self.s_protected_attribute_names)

        self.s_labels = df[label_name]

        # The encoder is only used to obtain the distinct label values; the
        # labels themselves are stored untransformed.
        s_le = sklearn.preprocessing.LabelEncoder()
        s_le.fit(self.s_labels)
        self.s_class_names = s_le.classes_

        # Indexes into s_feature_names of the categorical features
        # (NOTE - this list DOES include the protected attributes).
        self.s_categorical_features = [self.s_feature_names.index(name)
                                       for name in categorical_names]

        # Record the distinct values of every categorical feature; the
        # position of a value in this array is its LIME code.
        self.s_categorical_names = {}
        for feature in self.s_categorical_features:
            # Kept as an instance attribute (holds the last encoder fitted)
            # in case existing callers read ``self.le``.
            self.le = sklearn.preprocessing.LabelEncoder()
            self.le.fit(self.s_data[:, feature])
            self.s_categorical_names[feature] = self.le.classes_

        return self

    def transform(self, aif360data):
        """Take aif360 data array and return data array that is lime encoded
        (numeric array in which categorical features are NOT one-hot-encoded).

        Args:
            aif360data (np.ndarray): Dataset features

        Returns:
            np.ndarray: LIME dataset features, one column per entry of
            ``s_feature_names``.
        """
        aif360data = np.asarray(aif360data)
        one_hot_names = self.s_feature_names_with_one_hot_encoding
        num_rows = aif360data.shape[0]
        limedata = np.zeros(shape=(num_rows, len(self.s_feature_names)))
        if num_rows == 0:
            return limedata

        categorical = set(self.s_categorical_features)
        protected = set(self.s_protected_attribute_names)

        # Work column-wise: each source column is located once per feature
        # rather than once per cell.
        for ind, feature in enumerate(self.s_feature_names):
            if ind not in categorical or feature in protected:
                # Numeric features and protected attributes are copied as is.
                limedata[:, ind] = aif360data[:, one_hot_names.index(feature)]
                continue

            # One-hot encoded feature: the LIME value is the position of the
            # active category. Categories are visited in order so that, if
            # several columns equal 1.0, the last one wins; rows with no
            # active column keep the initial 0.
            for code, value in enumerate(self.s_categorical_names[ind]):
                column = one_hot_names.index(feature + "=" + value)
                limedata[aif360data[:, column] == 1.0, ind] = code

        return limedata

    def inverse_transform(self, limedata):
        """Take data array that is lime encoded (that is, lime-compatible data
        created by this class from a given aif360 dataset) and return data array
        consistent with the original aif360 dataset.

        Args:
            limedata (np.ndarray): Dataset features

        Returns:
            np.ndarray: aif360 dataset features, one column per entry of
            ``s_feature_names_with_one_hot_encoding``.

        Raises:
            IndexError: If a categorical code does not index into the
                categories recorded for that feature.
        """
        limedata = np.asarray(limedata)
        one_hot_names = self.s_feature_names_with_one_hot_encoding
        num_rows = limedata.shape[0]
        aif360data = np.zeros(shape=(num_rows, len(one_hot_names)))
        if num_rows == 0:
            return aif360data

        # s_categorical_features holds indexes into s_feature_names.
        categorical = set(self.s_categorical_features)
        protected = set(self.s_protected_attribute_names)

        for ind, feature in enumerate(self.s_feature_names):
            if ind not in categorical or feature in protected:
                # Numeric features and protected attributes are copied as is.
                aif360data[:, one_hot_names.index(feature)] = limedata[:, ind]
                continue

            # limedata[:, ind] holds indexes into s_categorical_names[ind];
            # the value found there names the one-hot column to switch on.
            # Only codes that actually occur are resolved.
            categories = self.s_categorical_names[ind]
            codes = limedata[:, ind].astype(int)  # truncates like int()
            for code in np.unique(codes):
                new_feature = feature + '=' + categories[int(code)]
                aif360data[codes == code,
                           one_hot_names.index(new_feature)] = 1.0

        return aif360data