Spaces:
Runtime error
Runtime error
from cProfile import label | |
import numpy as np | |
import pandas as pd | |
import networkx as nx | |
from aif360.datasets import BinaryLabelDataset | |
from aif360.algorithms.preprocessing import DisparateImpactRemover, Reweighing, LFR | |
from aif360.metrics import BinaryLabelDatasetMetric | |
def fairness_calculation(dataset_name, dataset_path, sens_attr, predict_attr): | |
if dataset_name == 'nba': | |
fairness_calculation_nba(dataset_path, sens_attr, predict_attr) | |
elif dataset_name == 'alibaba': | |
fairness_calculation_alibaba(dataset_path, sens_attr, predict_attr) | |
elif dataset_name == 'tecent': | |
fairness_calculation_tecent(dataset_path, sens_attr, predict_attr) | |
elif dataset_name == 'pokec_z' or dataset_name == 'pokec_n': | |
fairness_calculation_pokec(dataset_path, dataset_path, sens_attr, predict_attr) | |
def fairness_calculation_nba(dataset_path, sens_attr, predict_attr): | |
#data = nx.read_graphml(dataset_path) | |
#df = pd.DataFrame.from_dict(dict(data.nodes(data=True)), orient='index') | |
df = pd.read_csv(dataset_path) | |
if df.columns[0] != 'user_id': | |
df = df.reset_index(level=0) | |
df = df.rename(columns={"index": "user_id"}) | |
if type(df['user_id'][0]) != np.int64: | |
df['user_id'] = pd.to_numeric(df['user_id']) | |
df = df.astype({'user_id': int}) | |
df[predict_attr] = df[predict_attr].replace(-1, 0) | |
#dataset_fairness(df, sens_attr, predict_attr) | |
disparate_impact(df, sens_attr, predict_attr) | |
def fairness_calculation_alibaba(dataset_path, sens_attr, label): | |
# data = nx.read_graphml(dataset_path) | |
#df = pd.DataFrame.from_dict(dict(data.nodes(data=True)), orient='index') | |
df = pd.read_csv(dataset_path) | |
#if df.columns[0] != 'userid': | |
# df = df.reset_index(level=0) | |
# df = df.rename(columns={"index": "userid"}) | |
#if type(df['userid'][0]) != np.int64: | |
# df['userid'] = pd.to_numeric(df['userid']) | |
# df = df.astype({'userid': int}) | |
#if sens_attr == 'age' or sens_attr == 'age_level' or sens_attr == 'bin_age': | |
# df.rename(columns={'age_level':'age', 'final_gender_code':'gender'}, inplace=True) | |
df[sens_attr] = df[sens_attr].replace(1, 0) | |
df[sens_attr] = df[sens_attr].replace(2, 0) | |
df[sens_attr] = df[sens_attr].replace(3, 0) | |
df[sens_attr] = df[sens_attr].replace(4, 1) | |
df[sens_attr] = df[sens_attr].replace(5, 1) | |
df[sens_attr] = df[sens_attr].replace(6, 1) | |
df[label] = df[label].replace(1, 0) | |
df[label] = df[label].replace(2, 1) | |
#dataset_fairness(df, sens_attr, label) | |
disparate_impact(df, sens_attr, label) | |
def fairness_calculation_tecent(dataset_path, sens_attr, label): | |
#data = nx.read_graphml(dataset_path) | |
#df = pd.DataFrame.from_dict(dict(data.nodes(data=True)), orient='index') | |
df = pd.read_csv(dataset_path) | |
#if df.columns[0] != 'user_id': | |
# df = df.reset_index(level=0) | |
# df = df.rename(columns={"index": "user_id"}) | |
#if type(df['user_id'][0]) != np.int64: | |
# df['user_id'] = pd.to_numeric(df['user_id']) | |
# df = df.astype({'user_id': int}) | |
#if sens_attr == 'bin_age': | |
# df.rename(columns={'age_range':'age'}, inplace=True) | |
if sens_attr == 'age_range': | |
age_dic = {'11~15':0, '16~20':0, '21~25':0, '26~30':1, '31~35':1, '36~40':2, '41~45':2, '46~50':3, '51~55':3, '56~60':4, '61~65':4, '66~70':4, '71~':4} | |
df[[sens_attr]] = df[[sens_attr]].applymap(lambda x:age_dic[x]) | |
df[sens_attr] = df[sens_attr].replace(1,0) | |
df[sens_attr] = df[sens_attr].replace(2,1) | |
df[sens_attr] = df[sens_attr].replace(3,1) | |
df[sens_attr] = df[sens_attr].replace(4,1) | |
#dataset_fairness(df, sens_attr, label) | |
disparate_impact(df, sens_attr, label) | |
def fairness_calculation_pokec(dataset_path, dataset_name, sens_attr, label): | |
#data = nx.read_graphml(dataset_path) | |
#df = pd.DataFrame.from_dict(dict(data.nodes(data=True)), orient='index') | |
df = pd.read_csv(dataset_path) | |
#if df.columns[0] != 'user_id': | |
# df = df.reset_index(level=0) | |
# df = df.rename(columns={"index": "user_id"}) | |
#if type(df['user_id'][0]) != np.int64: | |
# df['user_id'] = pd.to_numeric(df['user_id']) | |
# df = df.astype({'user_id': int}) | |
if dataset_name == 'pokec_z': | |
df['I_am_working_in_field'] = df['I_am_working_in_field'].replace(-1, 0) | |
df['I_am_working_in_field'] = df['I_am_working_in_field'].replace(0, 0) | |
df['I_am_working_in_field'] = df['I_am_working_in_field'].replace(1, 0) | |
df['I_am_working_in_field'] = df['I_am_working_in_field'].replace(2, 1) | |
df['I_am_working_in_field'] = df['I_am_working_in_field'].replace(3, 1) | |
df['I_am_working_in_field'] = df['I_am_working_in_field'].replace(4, 1) | |
#elif dataset_name == 'pokec_n': | |
# df['I_am_working_in_field'] = df['I_am_working_in_field'].replace(-1, 0) | |
# df['I_am_working_in_field'] = df['I_am_working_in_field'].replace(0, 1) | |
# df['I_am_working_in_field'] = df['I_am_working_in_field'].replace(1, 1) | |
# df['I_am_working_in_field'] = df['I_am_working_in_field'].replace(2, 1) | |
# df['I_am_working_in_field'] = df['I_am_working_in_field'].replace(3, 1) | |
#dataset_fairness(df, sens_attr, label) | |
disparate_impact(df, sens_attr, label) | |
def dataset_fairness(df, sens_attr, label): | |
total_number_of_sens0 = len(df.loc[df[sens_attr] == 0]) | |
total_number_of_sens1 = len(df.loc[df[sens_attr] == 1]) | |
number_of_positive_sens0 = len(df.loc[(df[sens_attr] == 0) & (df[label] == 1)]) | |
number_of_positive_sens1 = len(df.loc[(df[sens_attr] == 1) & (df[label] == 1)]) | |
fairness = np.absolute(number_of_positive_sens0) / np.absolute(total_number_of_sens0) - np.absolute(number_of_positive_sens1) / np.absolute(total_number_of_sens1) | |
dataset_fainress = fairness * 100 | |
print('Dataset fairness:', dataset_fainress) | |
def disparate_impact(df, sens_attr, label): | |
pr_unpriv = calc_prop(df, sens_attr, 1, label, 1) | |
#print('pr_unpriv: ', pr_unpriv) | |
pr_priv = calc_prop(df, sens_attr, 0, label, 1) | |
#print('pr_priv:', pr_priv) | |
disp = pr_unpriv / pr_priv | |
bin_label_dataset = BinaryLabelDataset(favorable_label=1, | |
unfavorable_label=0, | |
df=df, | |
label_names=[label], | |
protected_attribute_names=[sens_attr], | |
unprivileged_protected_attributes=[1]) | |
privileged_groups = [{sens_attr: 0}] | |
unprivileged_groups = [{sens_attr: 1}] | |
metric_dataset = BinaryLabelDatasetMetric(bin_label_dataset, | |
unprivileged_groups=unprivileged_groups, | |
privileged_groups=privileged_groups) | |
# just for comparison | |
print('Dataset Fairness:', disp) | |
#print("Disparate impact (from AIF360) = %f" %metric_dataset.disparate_impact()) | |
def calc_prop(data, group_col, group, output_col, output_val): | |
new = data[data[group_col] == group] | |
return len(new[new[output_col] == output_val])/len(new) | |
def disparate_impact_remover(df, sens_attr, label): | |
if 'final_gender_code' in df: | |
df.rename(columns={'final_gender_code':'gender'}, inplace=True) | |
elif 'age_level' in df: | |
df.rename(columns={'age_level': 'age'}, inplace=True) | |
bin_label_dataset = BinaryLabelDataset(favorable_label=1, | |
unfavorable_label=0, | |
df=df, | |
label_names=[label], | |
protected_attribute_names=[sens_attr], | |
unprivileged_protected_attributes=[1]) | |
di = DisparateImpactRemover(repair_level=1 ) | |
di_transformation = di.fit_transform(bin_label_dataset) | |
privileged_groups = [{sens_attr: 0}] | |
unprivileged_groups = [{sens_attr: 1}] | |
metric_original_dataset = BinaryLabelDatasetMetric(bin_label_dataset, | |
unprivileged_groups=unprivileged_groups, | |
privileged_groups=privileged_groups) | |
metric_new_dataset = BinaryLabelDatasetMetric(di_transformation, | |
unprivileged_groups=unprivileged_groups, | |
privileged_groups=privileged_groups) | |
print("Original Disparate impact (from AIF360) = %f" %metric_original_dataset.disparate_impact()) | |
print("After debaising Disparate impact (from AIF360) = %f" %metric_new_dataset.disparate_impact()) | |
new_df = di_transformation.convert_to_dataframe()[0] | |
return new_df | |
def reweighting(df, sens_attr, label): | |
print('we are in reweighting') | |
bin_label_dataset = BinaryLabelDataset(favorable_label=1, | |
unfavorable_label=0, | |
df=df, | |
label_names=[label], | |
protected_attribute_names=[sens_attr], | |
unprivileged_protected_attributes=[1]) | |
privileged_groups = [{sens_attr: 0}] | |
unprivileged_groups = [{sens_attr: 1}] | |
RW = Reweighing(unprivileged_groups = unprivileged_groups, privileged_groups = privileged_groups) | |
RW.fit(bin_label_dataset) | |
rw_transformation = RW.transform(bin_label_dataset) | |
metric_original_dataset = BinaryLabelDatasetMetric(bin_label_dataset, | |
unprivileged_groups=unprivileged_groups, | |
privileged_groups=privileged_groups) | |
metric_new_dataset = BinaryLabelDatasetMetric(rw_transformation, | |
unprivileged_groups=unprivileged_groups, | |
privileged_groups=privileged_groups) | |
print("Original Disparate impact (from AIF360) = %f" %metric_original_dataset.disparate_impact()) | |
print("After debaising Disparate impact (from AIF360) = %f" %metric_new_dataset.disparate_impact()) | |
df_new = rw_transformation.convert_to_dataframe()[0] | |
return df_new | |
def lfr(df, sens_attr, label): | |
bin_label_dataset = BinaryLabelDataset(favorable_label=1, | |
unfavorable_label=0, | |
df=df, | |
label_names=[label], | |
protected_attribute_names=[sens_attr], | |
unprivileged_protected_attributes=[1]) | |
privileged_groups = [{sens_attr: 0}] | |
unprivileged_groups = [{sens_attr: 1}] | |
TR = LFR(unprivileged_groups = unprivileged_groups, privileged_groups = privileged_groups) | |
TR = TR.fit(bin_label_dataset) | |
dset_lfr_trn = TR.transform(bin_label_dataset, threshold = 0.3) | |
dset_lfr_trn = bin_label_dataset.align_datasets(dset_lfr_trn) | |
metric_original_dataset = BinaryLabelDatasetMetric(bin_label_dataset, | |
unprivileged_groups=unprivileged_groups, | |
privileged_groups=privileged_groups) | |
metric_new_dataset = BinaryLabelDatasetMetric(dset_lfr_trn, | |
unprivileged_groups=unprivileged_groups, | |
privileged_groups=privileged_groups) | |
print("Original Disparate impact (from AIF360) = %f" %metric_original_dataset.disparate_impact()) | |
print("After debaising Disparate impact (from AIF360) = %f" %metric_new_dataset.disparate_impact()) | |
df_new = dset_lfr_trn.convert_to_dataframe()[0] | |
return df_new | |
def sample(df, sens_attr, label): | |
print('we are in sample') | |
dp = df.loc[(df[sens_attr] == 0) & (df[label] == 1)] | |
dn = df.loc[(df[sens_attr] == 0) & (df[label] == 0)] | |
fp = df.loc[(df[sens_attr] == 1) & (df[label] == 1)] | |
fn = df.loc[(df[sens_attr] == 1) & (df[label] == 0)] | |
wdp = len(df.loc[df[sens_attr] == 0]) * len(df.loc[df[label] == 1]) / len(df.loc[(df[label] == 1) & (df[sens_attr] == 0)]) | |
wdn = len(df.loc[df[sens_attr] == 0]) * len(df.loc[df[label] == 0]) / len(df.loc[(df[label] == 1) & (df[sens_attr] == 0)]) | |
wfp = len(df.loc[df[sens_attr] == 1]) * len(df.loc[df[label] == 1]) / len(df.loc[(df[label] == 1) & (df[sens_attr] == 0)]) | |
wfn = len(df.loc[df[sens_attr] == 1]) * len(df.loc[df[label] == 0]) / len(df.loc[(df[label] == 1) & (df[sens_attr] == 0)]) | |
# sample | |
dp_sample = dp.sample(n=int(wdp), random_state=1, replace=True) | |
dn_sample = dn.sample(n=int(wdn), random_state=1, replace=True) | |
fp_sample = fp.sample(n=int(wfp), random_state=1, replace=True) | |
fn_sample = fn.sample(n=int(wfn), random_state=1, replace=True) | |
# merge | |
df_new = pd.concat([dp_sample, dn_sample, fp_sample, fn_sample]).drop_duplicates().reset_index(drop=True) | |
return df_new | |
''' | |
def fairness_calculation(dataset_path, dataset_name, sens_attr, predict_attr, label): | |
data = nx.read_graphml(dataset_path) | |
df = pd.DataFrame.from_dict(dict(data.nodes(data=True)), orient='index') | |
if df.columns[0] != 'userid': | |
# if so, then we make it as the first column | |
df = df.reset_index(level=0) | |
df = df.rename(columns={"index": 'userid'}) | |
# check if user_id column is not string | |
if type(df['userid'][0]) != np.int64: | |
# if so, we convert it to int | |
df['userid'] = pd.to_numeric(df['userid']) | |
df = df.astype({'userid': int}) | |
if predict_attr != None: | |
label == predict_attr | |
if dataset_name == 'pokec_z': | |
df['I_am_working_in_field'] = df['I_am_working_in_field'].replace(-1, 0) | |
df['I_am_working_in_field'] = df['I_am_working_in_field'].replace(0, 0) | |
df['I_am_working_in_field'] = df['I_am_working_in_field'].replace(1, 0) | |
df['I_am_working_in_field'] = df['I_am_working_in_field'].replace(2, 1) | |
df['I_am_working_in_field'] = df['I_am_working_in_field'].replace(3, 1) | |
df['I_am_working_in_field'] = df['I_am_working_in_field'].replace(4, 1) | |
elif dataset_name == 'pokec_n': | |
df['I_am_working_in_field'] = df['I_am_working_in_field'].replace(-1, 0) | |
df['I_am_working_in_field'] = df['I_am_working_in_field'].replace(0, 1) | |
df['I_am_working_in_field'] = df['I_am_working_in_field'].replace(1, 1) | |
df['I_am_working_in_field'] = df['I_am_working_in_field'].replace(2, 1) | |
df['I_am_working_in_field'] = df['I_am_working_in_field'].replace(3, 1) | |
elif dataset_name == 'alibaba': | |
df['age_level'] = df['age_level'].replace(1, 0) | |
df['age_level'] = df['age_level'].replace(2, 0) | |
df['age_level'] = df['age_level'].replace(3, 0) | |
df['age_level'] = df['age_level'].replace(4, 1) | |
df['age_level'] = df['age_level'].replace(5, 1) | |
df['age_level'] = df['age_level'].replace(6, 1) | |
df['final_gender_code'] = df['final_gender_code'].replace(1, 0) | |
df['final_gender_code'] = df['final_gender_code'].replace(2, 1) | |
#df.rename(columns={'age_level':'age', 'final_gender_code':'gender'}, inplace=True) | |
elif dataset_name == 'tecent': | |
age_dic = {'11~15':0, '16~20':0, '21~25':0, '26~30':1, '31~35':1, '36~40':2, '41~45':2, '46~50':3, '51~55':3, '56~60':4, '61~65':4, '66~70':4, '71~':4} | |
df[["age_range"]] = df[["age_range"]].applymap(lambda x:age_dic[x]) | |
df["age_range"] = df["age_range"].replace(1,0) | |
df["age_range"] = df["age_range"].replace(2,1) | |
df["age_range"] = df["age_range"].replace(3,1) | |
df["age_range"] = df["age_range"].replace(4,1) | |
df.rename(columns={'age_level':'age', 'final_gender_code':'gender'}, inplace=True) | |
elif dataset_name == 'nba': | |
df['SALARY'] = df['SALARY'].replace(-1, 0) | |
#df['SALARY'] = df['SALARY'].replace(0, 1) | |
#df['SALARY'] = df['SALARY'].replace(1,1) | |
# old calculation | |
total_number_of_sens0 = len(df.loc[df[sens_attr] == 0]) | |
total_number_of_sens1 = len(df.loc[df[sens_attr] == 1]) | |
number_of_positive_sens0 = len(df.loc[(df[sens_attr] == 0) & (df[label] == 1)]) | |
number_of_positive_sens1 = len(df.loc[(df[sens_attr] == 1) & (df[label] == 1)]) | |
fairness = np.absolute(number_of_positive_sens0) / np.absolute(total_number_of_sens0) - np.absolute(number_of_positive_sens1) / np.absolute(total_number_of_sens1) | |
dataset_fainress = fairness * 100 | |
print('dataset fairness:', dataset_fainress) | |
# new calculation | |
#one_df = df[df[sens_attr] == 0] | |
#num_of_priv = one_df.shape[0] | |
#zero_df = df[df[sens_attr] == 1] | |
#num_of_unpriv = zero_df.shape[0] | |
#unpriv_outcomes = zero_df[zero_df[label]==1].shape[0] | |
#unpriv_ratio = unpriv_outcomes/num_of_unpriv | |
#priv_outcomes = one_df[one_df[label]==1].shape[0] | |
#priv_ratio = priv_outcomes/num_of_priv | |
#disparate_impact = unpriv_ratio/priv_ratio | |
#return disparate_impact | |
pr_unpriv = calc_prop(df, sens_attr, 1, label, 1) | |
#print('pr_unpriv: ', pr_unpriv) | |
pr_priv = calc_prop(df, sens_attr, 0, label, 1) | |
#print('pr_priv:', pr_priv) | |
disp = pr_unpriv / pr_priv | |
#return pr_unpriv / pr_priv | |
print('Dsparate impact:', disp) | |
#binaryLabelDataset =BinaryLabelDataset(favorable_label=1, unfavorable_label=0, df=df, label_names=[label], protected_attribute_names=[sens_attr], unprivileged_protected_attributes=['1']) | |
#di = DisparateImpactRemover(repair_level=1.0) | |
#rp_train = di.fit_transform(binaryLabelDataset) | |
#df_new = rp_train.convert_to_dataframe()[0] | |
#print(dataset) | |
#print(binaryLabelDataset) | |
#return df_new | |
''' | |