# FairUP/src/fainress_component.py
import numpy as np
import pandas as pd
from aif360.datasets import BinaryLabelDataset
from aif360.algorithms.preprocessing import DisparateImpactRemover, Reweighing, LFR
from aif360.metrics import BinaryLabelDatasetMetric

def fairness_calculation(dataset_name, dataset_path, sens_attr, predict_attr):
    """Dispatch to the dataset-specific fairness computation."""
    if dataset_name == 'nba':
        fairness_calculation_nba(dataset_path, sens_attr, predict_attr)
    elif dataset_name == 'alibaba':
        fairness_calculation_alibaba(dataset_path, sens_attr, predict_attr)
    elif dataset_name == 'tecent':
        fairness_calculation_tecent(dataset_path, sens_attr, predict_attr)
    elif dataset_name in ('pokec_z', 'pokec_n'):
        # Pass dataset_name (not dataset_path twice) so the pokec variant is known.
        fairness_calculation_pokec(dataset_path, dataset_name, sens_attr, predict_attr)

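# Example call (hypothetical path and attribute names):
#   fairness_calculation('nba', 'data/nba.csv', 'country', 'SALARY')
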
def fairness_calculation_nba(dataset_path, sens_attr, predict_attr):
    df = pd.read_csv(dataset_path)
    # Ensure 'user_id' is the first column and has an integer dtype.
    if df.columns[0] != 'user_id':
        df = df.reset_index(level=0)
        df = df.rename(columns={"index": "user_id"})
    if type(df['user_id'][0]) != np.int64:
        df['user_id'] = pd.to_numeric(df['user_id'])
        df = df.astype({'user_id': int})
    # Binarise the label: map -1 to the negative class.
    df[predict_attr] = df[predict_attr].replace(-1, 0)
    # dataset_fairness(df, sens_attr, predict_attr)
    disparate_impact(df, sens_attr, predict_attr)

def fairness_calculation_alibaba(dataset_path, sens_attr, label):
    df = pd.read_csv(dataset_path)
    # Binarise the sensitive attribute (age levels: 1-3 -> 0, 4-6 -> 1).
    df[sens_attr] = df[sens_attr].replace({1: 0, 2: 0, 3: 0, 4: 1, 5: 1, 6: 1})
    # Binarise the label: 1 -> 0, 2 -> 1.
    df[label] = df[label].replace({1: 0, 2: 1})
    # dataset_fairness(df, sens_attr, label)
    disparate_impact(df, sens_attr, label)

def fairness_calculation_tecent(dataset_path, sens_attr, label):
    df = pd.read_csv(dataset_path)
    if sens_attr == 'age_range':
        # Map the age ranges to coarse buckets, then binarise: <=35 -> 0, >35 -> 1.
        age_dic = {'11~15': 0, '16~20': 0, '21~25': 0, '26~30': 1, '31~35': 1,
                   '36~40': 2, '41~45': 2, '46~50': 3, '51~55': 3, '56~60': 4,
                   '61~65': 4, '66~70': 4, '71~': 4}
        df[sens_attr] = df[sens_attr].map(age_dic)
        df[sens_attr] = df[sens_attr].replace({1: 0, 2: 1, 3: 1, 4: 1})
    # dataset_fairness(df, sens_attr, label)
    disparate_impact(df, sens_attr, label)

def fairness_calculation_pokec(dataset_path, dataset_name, sens_attr, label):
    df = pd.read_csv(dataset_path)
    if dataset_name == 'pokec_z':
        # Binarise the working-field attribute: {-1, 0, 1} -> 0, {2, 3, 4} -> 1.
        df['I_am_working_in_field'] = df['I_am_working_in_field'].replace(
            {-1: 0, 0: 0, 1: 0, 2: 1, 3: 1, 4: 1})
    # elif dataset_name == 'pokec_n':
    #     df['I_am_working_in_field'] = df['I_am_working_in_field'].replace(-1, 0)
    #     df['I_am_working_in_field'] = df['I_am_working_in_field'].replace(0, 1)
    #     df['I_am_working_in_field'] = df['I_am_working_in_field'].replace(1, 1)
    #     df['I_am_working_in_field'] = df['I_am_working_in_field'].replace(2, 1)
    #     df['I_am_working_in_field'] = df['I_am_working_in_field'].replace(3, 1)
    # dataset_fairness(df, sens_attr, label)
    disparate_impact(df, sens_attr, label)

def dataset_fairness(df, sens_attr, label):
    # Statistical parity difference, in percentage points:
    # (P(label=1 | sens=0) - P(label=1 | sens=1)) * 100.
    total_number_of_sens0 = len(df.loc[df[sens_attr] == 0])
    total_number_of_sens1 = len(df.loc[df[sens_attr] == 1])
    number_of_positive_sens0 = len(df.loc[(df[sens_attr] == 0) & (df[label] == 1)])
    number_of_positive_sens1 = len(df.loc[(df[sens_attr] == 1) & (df[label] == 1)])
    fairness = number_of_positive_sens0 / total_number_of_sens0 - number_of_positive_sens1 / total_number_of_sens1
    fairness_pct = fairness * 100
    print('Dataset fairness:', fairness_pct)

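# Worked example (hypothetical numbers): if 40% of the sens=0 group and 30% of
# the sens=1 group carry the positive label, dataset_fairness prints
# (0.40 - 0.30) * 100 = 10, i.e. a ten-percentage-point parity gap.
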
def disparate_impact(df, sens_attr, label):
    # Disparate impact = P(label=1 | unprivileged) / P(label=1 | privileged),
    # with sens_attr == 1 treated as the unprivileged group.
    pr_unpriv = calc_prop(df, sens_attr, 1, label, 1)
    pr_priv = calc_prop(df, sens_attr, 0, label, 1)
    disp = pr_unpriv / pr_priv
    bin_label_dataset = BinaryLabelDataset(favorable_label=1,
                                           unfavorable_label=0,
                                           df=df,
                                           label_names=[label],
                                           protected_attribute_names=[sens_attr],
                                           unprivileged_protected_attributes=[1])
    privileged_groups = [{sens_attr: 0}]
    unprivileged_groups = [{sens_attr: 1}]
    metric_dataset = BinaryLabelDatasetMetric(bin_label_dataset,
                                              unprivileged_groups=unprivileged_groups,
                                              privileged_groups=privileged_groups)
    print('Disparate impact:', disp)
    # For comparison with the manual computation above:
    # print("Disparate impact (from AIF360) = %f" % metric_dataset.disparate_impact())


def calc_prop(data, group_col, group, output_col, output_val):
    # Proportion of rows with group_col == group whose output_col equals output_val.
    new = data[data[group_col] == group]
    return len(new[new[output_col] == output_val]) / len(new)

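# Worked example (hypothetical numbers): with 100 privileged users of whom 60
# are labelled positive and 100 unprivileged users of whom 30 are labelled
# positive, calc_prop returns 0.60 and 0.30, so the disparate impact is
# 0.30 / 0.60 = 0.5, well below the 0.8 "80% rule" threshold commonly used
# to flag adverse impact.
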
def disparate_impact_remover(df, sens_attr, label):
    # Normalise the Alibaba column names so sens_attr matches the dataframe.
    if 'final_gender_code' in df:
        df.rename(columns={'final_gender_code': 'gender'}, inplace=True)
    elif 'age_level' in df:
        df.rename(columns={'age_level': 'age'}, inplace=True)
    bin_label_dataset = BinaryLabelDataset(favorable_label=1,
                                           unfavorable_label=0,
                                           df=df,
                                           label_names=[label],
                                           protected_attribute_names=[sens_attr],
                                           unprivileged_protected_attributes=[1])
    di = DisparateImpactRemover(repair_level=1)
    di_transformation = di.fit_transform(bin_label_dataset)
    privileged_groups = [{sens_attr: 0}]
    unprivileged_groups = [{sens_attr: 1}]
    metric_original_dataset = BinaryLabelDatasetMetric(bin_label_dataset,
                                                       unprivileged_groups=unprivileged_groups,
                                                       privileged_groups=privileged_groups)
    metric_new_dataset = BinaryLabelDatasetMetric(di_transformation,
                                                  unprivileged_groups=unprivileged_groups,
                                                  privileged_groups=privileged_groups)
    print("Original disparate impact (from AIF360) = %f" % metric_original_dataset.disparate_impact())
    print("After debiasing disparate impact (from AIF360) = %f" % metric_new_dataset.disparate_impact())
    new_df = di_transformation.convert_to_dataframe()[0]
    return new_df

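# Typical use (hypothetical column names): DisparateImpactRemover repairs the
# feature distributions towards group parity and leaves the label column
# untouched, e.g. repaired_df = disparate_impact_remover(df, 'gender', 'click').
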
def reweighting(df, sens_attr, label):
    bin_label_dataset = BinaryLabelDataset(favorable_label=1,
                                           unfavorable_label=0,
                                           df=df,
                                           label_names=[label],
                                           protected_attribute_names=[sens_attr],
                                           unprivileged_protected_attributes=[1])
    privileged_groups = [{sens_attr: 0}]
    unprivileged_groups = [{sens_attr: 1}]
    RW = Reweighing(unprivileged_groups=unprivileged_groups, privileged_groups=privileged_groups)
    RW.fit(bin_label_dataset)
    rw_transformation = RW.transform(bin_label_dataset)
    metric_original_dataset = BinaryLabelDatasetMetric(bin_label_dataset,
                                                       unprivileged_groups=unprivileged_groups,
                                                       privileged_groups=privileged_groups)
    metric_new_dataset = BinaryLabelDatasetMetric(rw_transformation,
                                                  unprivileged_groups=unprivileged_groups,
                                                  privileged_groups=privileged_groups)
    print("Original disparate impact (from AIF360) = %f" % metric_original_dataset.disparate_impact())
    print("After debiasing disparate impact (from AIF360) = %f" % metric_new_dataset.disparate_impact())
    # NOTE: Reweighing adjusts instance weights rather than feature values; the
    # learned weights live in rw_transformation.instance_weights and are not
    # part of the dataframe returned by convert_to_dataframe().
    df_new = rw_transformation.convert_to_dataframe()[0]
    return df_new

def lfr(df, sens_attr, label):
    bin_label_dataset = BinaryLabelDataset(favorable_label=1,
                                           unfavorable_label=0,
                                           df=df,
                                           label_names=[label],
                                           protected_attribute_names=[sens_attr],
                                           unprivileged_protected_attributes=[1])
    privileged_groups = [{sens_attr: 0}]
    unprivileged_groups = [{sens_attr: 1}]
    TR = LFR(unprivileged_groups=unprivileged_groups, privileged_groups=privileged_groups)
    TR = TR.fit(bin_label_dataset)
    dset_lfr_trn = TR.transform(bin_label_dataset, threshold=0.3)
    dset_lfr_trn = bin_label_dataset.align_datasets(dset_lfr_trn)
    metric_original_dataset = BinaryLabelDatasetMetric(bin_label_dataset,
                                                       unprivileged_groups=unprivileged_groups,
                                                       privileged_groups=privileged_groups)
    metric_new_dataset = BinaryLabelDatasetMetric(dset_lfr_trn,
                                                  unprivileged_groups=unprivileged_groups,
                                                  privileged_groups=privileged_groups)
    print("Original disparate impact (from AIF360) = %f" % metric_original_dataset.disparate_impact())
    print("After debiasing disparate impact (from AIF360) = %f" % metric_new_dataset.disparate_impact())
    df_new = dset_lfr_trn.convert_to_dataframe()[0]
    return df_new

def sample(df, sens_attr, label):
    # Preferential sampling: resample each (sens, label) group to the size it
    # would have if the sensitive attribute and the label were independent,
    # i.e. len(sens group) * len(label group) / len(df).
    dp = df.loc[(df[sens_attr] == 0) & (df[label] == 1)]
    dn = df.loc[(df[sens_attr] == 0) & (df[label] == 0)]
    fp = df.loc[(df[sens_attr] == 1) & (df[label] == 1)]
    fn = df.loc[(df[sens_attr] == 1) & (df[label] == 0)]
    wdp = len(df.loc[df[sens_attr] == 0]) * len(df.loc[df[label] == 1]) / len(df)
    wdn = len(df.loc[df[sens_attr] == 0]) * len(df.loc[df[label] == 0]) / len(df)
    wfp = len(df.loc[df[sens_attr] == 1]) * len(df.loc[df[label] == 1]) / len(df)
    wfn = len(df.loc[df[sens_attr] == 1]) * len(df.loc[df[label] == 0]) / len(df)
    # Resample each group (with replacement) to its expected size.
    dp_sample = dp.sample(n=int(wdp), random_state=1, replace=True)
    dn_sample = dn.sample(n=int(wdn), random_state=1, replace=True)
    fp_sample = fp.sample(n=int(wfp), random_state=1, replace=True)
    fn_sample = fn.sample(n=int(wfn), random_state=1, replace=True)
    # Merge; duplicates are kept on purpose, since oversampling relies on them.
    df_new = pd.concat([dp_sample, dn_sample, fp_sample, fn_sample]).reset_index(drop=True)
    return df_new

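# Minimal smoke test on synthetic data (hypothetical values, not one of the
# FairUP datasets); assumes aif360 is installed. It exercises the manual
# disparate-impact computation and the sampling-based debiasing above.
if __name__ == '__main__':
    demo = pd.DataFrame({
        'sens': [0] * 50 + [1] * 50,   # 0 = privileged, 1 = unprivileged
        'label': [1] * 30 + [0] * 20 + [1] * 15 + [0] * 35,
    })
    # Expected output: 0.30 / 0.60 = 0.5
    disparate_impact(demo, 'sens', 'label')
    balanced = sample(demo, 'sens', 'label')
    # After preferential sampling, the four (sens, label) cells are close to
    # the sizes implied by independence of sens and label.
    print(balanced.groupby(['sens', 'label']).size())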