Daniel Anadria, Anastasia Giachanou
Machine Learning with Python - Utrecht Summer School
In this practical, we are going to explore bias and fairness in Machine Learning!
The COMPAS dataset contains outcomes from a proprietary tool named COMPAS (Correctional Offender Management Profiling for Alternative Sanctions), designed to evaluate the probability of a convict committing another crime. It is utilized by judges and parole officers and is notably recognized for its discriminatory impact on African-American individuals.
Dataset source: Broward County Clerk's Office, Broward County Sheriff's Office, Florida Department of Corrections, ProPublica
We are going to use this dataset to explore some of the notions of group fairness as it relates to machine learning.
Disclaimer:
Unlike most tutorials that use the COMPAS dataset, we are not going to assess the fairness of the pre-computed COMPAS scores. Instead, we will build our own classifier based on the 'raw' data such as crime history and demographic information (thus excluding the derived COMPAS scores). This way, you will get some intuition about how such classifiers are built, where fairness problems might stem from in the development pipeline, and what can be done to address fairness in model outputs.
In algorithmic fairness, it's important to understand the context surrounding a specific applied machine learning task. Sources of bias are many, as are the degrees of freedom in what disparity to focus on. Unfortunately, satisfying multiple fairness criteria at the same time is often mathematically impossible, and improving fairness can come at the cost of predictive performance (the fairness-accuracy trade-off).
We do not claim to be penal system or social justice experts. The purpose of this tutorial is only to demonstrate some of the machine learning approaches to bias detection and mitigation. For this to be possible, we have to make choices about which biases are 'more important' to focus on. In reality, the values influencing what to optimize the models for are multifaceted and come from different actors. We do not claim to have 'solved fairness problems'; that would require interdisciplinary, multi-agent input and would always be based on a selection of particular values.
!pip install -q squarify
!pip install -q fairlearn
fairlearn (originally developed by Microsoft) is an open-source Python toolkit for assessing the fairness of machine learning models and mitigating observed disparities.
As always, we start by importing the required libraries.
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import squarify
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
import statsmodels.api as sm
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score, roc_auc_score, roc_curve
import fairlearn
from fairlearn.postprocessing import ThresholdOptimizer
from fairlearn.postprocessing import plot_threshold_optimizer
1. Load the COMPAS dataset from the URL https://raw.githubusercontent.com/propublica/compas-analysis/master/compas-scores-two-years.csv (yes, you can use pd.read_csv() and put the link inside the parentheses) and inspect the first rows.
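A minimal sketch (we call the dataframe df, as in the later cells):

# Load the COMPAS dataset directly from the ProPublica repository
url = 'https://raw.githubusercontent.com/propublica/compas-analysis/master/compas-scores-two-years.csv'
df = pd.read_csv(url)

# Inspect the first rows
df.head()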
The COMPAS dataset contains the following variables. We tried looking up the meaning behind each variable. Our target variable is called two_year_recid.
Some variables were used to construct other variables. For example, decile_score represents the individual's COMPAS score, the value predicting the risk of recidivism. We will omit the COMPAS score and the related features, and try to predict two-year recidivism from the remaining features.
| Variable | Description |
|---|---|
| id | Unique identifier for each individual |
| name | Full name of the individual |
| first | First name of the individual |
| last | Last name of the individual |
| compas_screening_date | Date when the COMPAS screening was conducted |
| sex | Sex of the individual |
| dob | Date of birth |
| age | Age at the time of screening |
| age_cat | Categorical age group (e.g., less than 25, 25-45, greater than 45) |
| race | Race/ethnicity of the individual |
| juv_fel_count | Number of juvenile felony charges |
| decile_score | COMPAS decile score for general recidivism risk |
| juv_misd_count | Number of juvenile misdemeanor charges |
| juv_other_count | Number of other juvenile charges |
| priors_count | Number of prior offenses |
| days_b_screening_arrest | Days between screening and arrest |
| c_jail_in | Date of jail entry for the current charge |
| c_jail_out | Date of jail release for the current charge |
| c_case_number | Case number for the current charge |
| c_offense_date | Date of the current offense |
| c_arrest_date | Date of the current arrest |
| c_days_from_compas | Days from COMPAS screening to the current charge |
| c_charge_degree | Degree of the current charge (e.g., felony, misdemeanor) |
| c_charge_desc | Description of the current charge |
| is_recid | Indicator of whether the individual recidivated |
| r_case_number | Case number for the recidivism charge |
| r_charge_degree | Degree of the recidivism charge |
| r_days_from_arrest | Days from the arrest to the recidivism charge |
| r_offense_date | Date of the recidivism offense |
| r_charge_desc | Description of the recidivism charge |
| r_jail_in | Date of jail entry for the recidivism charge |
| r_jail_out | Date of jail release for the recidivism charge |
| violent_recid | Indicator of violent recidivism |
| is_violent_recid | Binary indicator for violent recidivism |
| vr_case_number | Case number for the violent recidivism charge |
| vr_charge_degree | Degree of the violent recidivism charge |
| vr_offense_date | Date of the violent recidivism offense |
| vr_charge_desc | Description of the violent recidivism charge |
| type_of_assessment | Type of COMPAS assessment conducted |
| decile_score.1 | COMPAS decile score for violent recidivism risk |
| score_text | Textual interpretation of the COMPAS score (e.g., Low, Medium, High) |
| screening_date | Date of the screening assessment |
| v_type_of_assessment | Type of violent recidivism assessment conducted |
| v_decile_score | Decile score for violent recidivism |
| v_score_text | Textual interpretation of the violent recidivism score |
| v_screening_date | Date of the violent recidivism screening |
| in_custody | Date of custody start |
| out_custody | Date of custody end |
| priors_count.1 | Redundant count of prior offenses |
| start | Start day of the observation period |
| end | End day of the observation period |
| event | Event indicator |
| two_year_recid | Indicator for recidivism within two years |
Question (discuss this with a classmate). Before we start, reflect on the task - what does it mean to predict the risk of a person committing another crime based on (some of) these variables?
Do you think that all of the available data should be used to make the prediction?
We talked about Exploratory Analysis in the first lecture. Before building a predictive model, it is good practice to explore how the data are distributed. Remember that real-world datasets are rarely balanced, and patterns within the data reveal social realities - both justified and unjustified.
2. What is the proportion of males vs females in the dataset?
(hint: value_counts() has a normalize parameter). You can also visualize the distribution of sex if you want to practice more with visualization; we will use a donut chart (pie() from matplotlib) but you can use a different type of plot. What does this plot tell us?
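A possible sketch; giving the pie wedges a width is one way to turn a pie chart into a donut chart:

# Proportion of males vs. females
print(df['sex'].value_counts(normalize=True))

# Donut chart of the sex distribution
sex_counts = df['sex'].value_counts()
plt.pie(sex_counts, labels=sex_counts.index, autopct='%1.1f%%', wedgeprops={'width': 0.4})
plt.title('Distribution of sex')
plt.show()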
Next, let's learn about the distribution of age in the dataset.
3. Visualize the distribution of age by sex.
(hint: you can use a violin plot or a box plot). What does this plot show?
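For example, with a violin plot (a box plot would work just as well):

# Age distribution per sex
sns.violinplot(data=df, x='sex', y='age')
plt.title('Age distribution by sex')
plt.show()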
4. What is the composition of the COMPAS dataset based on race? Print the percentage per value in the race variable.
5. Visualize the distribution of race using a treemap (squarify.plot).
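A sketch covering exercises 4 and 5:

# Percentage of observations per race category
print((df['race'].value_counts(normalize=True) * 100).round(2))

# Treemap of the race distribution
race_counts = df['race'].value_counts()
squarify.plot(sizes=race_counts.values, label=race_counts.index, alpha=0.8)
plt.axis('off')
plt.title('Distribution of race')
plt.show()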
6. Plot the distribution of race by sex - first using counts (frequencies), then using the log transformation of the count.
A logarithmic transformation changes the scale of the data but retains the key patterns, making it easier for us to see the within-race sex distribution.
We see that for each race category, there are always more male than female observations. However, the sex imbalance varies by group.
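One way to produce the two plots described above; using a logarithmic count axis is an equivalent way to apply the log transformation:

# Counts of race by sex
sns.countplot(data=df, x='race', hue='sex')
plt.title('Race by sex (counts)')
plt.xticks(rotation=45)
plt.show()

# Same plot with a logarithmic count axis
sns.countplot(data=df, x='race', hue='sex')
plt.yscale('log')
plt.title('Race by sex (log scale)')
plt.xticks(rotation=45)
plt.show()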
7. Now let's consider the outcome variable - two year recidivism. What is the relationship of race to recidivism in the dataset?
Start by making a bar plot of two-year recidivism by race. One suggestion is to use sns.countplot()
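For example:

# Counts of two-year recidivism outcomes per race
sns.countplot(data=df, x='race', hue='two_year_recid')
plt.title('Two-year recidivism by race (counts)')
plt.xticks(rotation=45)
plt.show()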
8. Plot the proportion of recidivism within each race category. This will make it easier to compare recidivism patterns between groups. You can use sns.barplot() in this case.
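Because two_year_recid is coded 0/1, plotting its mean per group gives the within-group recidivism proportion; a sketch:

# Proportion of recidivism within each race category (mean of a 0/1 outcome)
sns.barplot(data=df, x='race', y='two_year_recid')
plt.ylabel('Proportion recidivating within two years')
plt.xticks(rotation=45)
plt.show()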
Having explored the data, and keeping in mind the distribution of the categories, let's see what happens when we train a logistic regression model to predict two-year recidivism.
We have to prepare features for model input. Consider the following:
- Some features are identifiers or administrative fields with no predictive value (e.g. id, type_of_assessment, etc.)
- Some features are redundant and would introduce multicollinearity (e.g. age_cat vs age, or the COMPAS decile scores vs the remaining features that were used to derive them)
- Some features are not directly usable as model input, such as free text (e.g. name) and dates (e.g. in_custody)

Note. Multicollinearity occurs when two or more features in a dataset are highly correlated, meaning they provide overlapping or redundant information. This can confuse the model or make it unstable.
The first two cases can be solved by removing (some of) the features. The third case could be solved through feature engineering (e.g. text vectorization in the case of names, subtracting dates to get day counts, etc.). However, we opt for the simple approach of dropping most features that aren't readily usable as input. We make an exception for categorical features (e.g. `race`) that can be dummy-coded.
We will now prepare the input data for the model.
# Select features
included_features = ['sex', 'race', 'age', 'juv_fel_count', 'juv_misd_count', 'juv_other_count', 'priors_count', 'priors_count.1', 'c_charge_degree']
X = df[included_features].copy()
y = df['two_year_recid']
# Clone race and sex labels (will be useful later)
X['race_label'] = X['race']
X['sex_label'] = X['sex']
# One-hot encode categorical features
dummy_following_features = ['sex', 'race', 'c_charge_degree']
X = pd.get_dummies(X, columns=dummy_following_features)
X.shape
(7214, 18)
We will now split the dataset into training and test set. We will also save the race_label and the sex_label into a new dataframe and then remove them from the X_train and X_test.
# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
# Save a copy of race and sex attribute labels from the test set, we will use it later
X_test_attributes = X_test[['race_label', 'sex_label']]
# remove race and sex attribute labels from train & test sets
X_train = X_train.drop(columns=['race_label', 'sex_label'])
X_test = X_test.drop(columns=['race_label', 'sex_label'])
Let's list our final selection of model input features.
# Final predictors
for i, column in enumerate(X_train.columns, start=1):
print(f"{i}. {column}")
1. age
2. juv_fel_count
3. juv_misd_count
4. juv_other_count
5. priors_count
6. priors_count.1
7. sex_Female
8. sex_Male
9. race_African-American
10. race_Asian
11. race_Caucasian
12. race_Hispanic
13. race_Native American
14. race_Other
15. c_charge_degree_F
16. c_charge_degree_M
9. Next, fit the logistic regression model on the training data, make predictions on the test data (as we have learned), and display the classification report and confusion matrix. Open practical 3 to refresh your memory.
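A minimal sketch; the names logreg and y_pred are our own and are reused in later sketches:

# Fit a logistic regression model on the training data
logreg = LogisticRegression(max_iter=1000)
logreg.fit(X_train, y_train)

# Predict on the test data
y_pred = logreg.predict(X_test)

# Evaluate overall performance
print(classification_report(y_test, y_pred))
print(confusion_matrix(y_test, y_pred))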
An important idea in group fairness is that classification performance should not only be examined for the overall model (as done above), but on a per-group basis as well. For example, imagine a model with 90% overall accuracy. That sounds excellent, until you break it down by demographic group: 95% for the majority group, but only 60% for the minority group.
10. Compute the true positive, true negative, false positive and false negative rates for different race and sex groups. You can use X_test_attributes dataframe to calculate those.
Your first step is to create a dataframe that will look as follows (this observation belonged to class 0 and was predicted as class 0, so it is a TN):
| id | sex | race | y_observed | y_predicted | baseline_prediction | predicted_probability |
|---|---|---|---|---|---|---|
| 308 | Male | Caucasian | 0 | 0 | TN | 0.151703 |
| ... | ... | ... | ... | ... | ... | ... |
Once you have this dataframe, you can use pivot_table(). This function groups data by one or more keys (in our case index='race' or index='sex'), and then aggregates values using a function like mean, sum, or count (in our case, counting the TP/TN/FP/FN labels per group). Think of it as a more flexible version of `.groupby()` that returns a full table instead of a grouped series or DataFrame.
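A sketch of one way to build such a dataframe and the per-race rate table; the helper function name get_prediction_label is our own, and we reuse logreg and y_pred from the previous sketch:

# Build a per-observation comparison dataframe for the test set
comparison_df = pd.DataFrame({
    'sex': X_test_attributes['sex_label'].values,
    'race': X_test_attributes['race_label'].values,
    'y_observed': y_test.values,
    'y_predicted': y_pred,
    'predicted_probability': logreg.predict_proba(X_test)[:, 1]
}, index=X_test.index)

# Label each observation as TP / TN / FP / FN
def get_prediction_label(row):
    if row['y_observed'] == 0 and row['y_predicted'] == 0:
        return 'TN'
    elif row['y_observed'] == 1 and row['y_predicted'] == 0:
        return 'FN'
    elif row['y_observed'] == 0 and row['y_predicted'] == 1:
        return 'FP'
    else:
        return 'TP'

comparison_df['baseline_prediction'] = comparison_df.apply(get_prediction_label, axis=1)

# Count TP/TN/FP/FN per race and normalize to rates
counts = comparison_df.pivot_table(index='race', columns='baseline_prediction', aggfunc='size', fill_value=0)
rates = counts.div(counts.sum(axis=1), axis=0).round(2)
rates['support'] = counts.sum(axis=1)
print(rates)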
The values inside the cells are rates (e.g. false negative rate, true positive rate, etc.). The columns expressing model errors (FN and FP) are particularly important. We can already see that the FP rate is higher for African-Americans compared to the Caucasian group.
11. Create the same breakdown of predictions by sex. The code looks similar to the one above; this time you can directly use comparison_df, no need to recreate it.
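For example, reusing comparison_df:

# Count TP/TN/FP/FN per sex and normalize to rates
counts_sex = comparison_df.pivot_table(index='sex', columns='baseline_prediction', aggfunc='size', fill_value=0)
rates_sex = counts_sex.div(counts_sex.sum(axis=1), axis=0).round(2)
rates_sex['support'] = counts_sex.sum(axis=1)
print(rates_sex)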
When building a system to predict recidivism, which type of error is more problematic: false negatives or false positives? What do you think?
From the tables we produced above, we observe that our classifier exhibits a false positive rate gap between African-American offenders and Caucasian and Hispanic offenders (6% and 9%, respectively). Since offenders of Asian, Native American, and Other ethnicity are few in the dataset, we take the model performance on these groups with a grain of salt. We also see that there is a 9% false negative rate gap and an 11% false positive rate gap between female and male convicts.
Let's generate a detailed breakdown of the model’s performance across different racial groups
12. Make a confusion matrix for each race. You can loop through each unique race and print the classification report and confusion matrix
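A possible loop, again reusing comparison_df:

# Classification report and confusion matrix per race group
for race in comparison_df['race'].unique():
    subset = comparison_df[comparison_df['race'] == race]
    print(f"--- {race} (n={len(subset)}) ---")
    print(classification_report(subset['y_observed'], subset['y_predicted'], zero_division=0))
    print(confusion_matrix(subset['y_observed'], subset['y_predicted']))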
13. To get the results in a way that is easier to compare them, we can plot the ROC curves. Plot the ROC curve for each race. You will need a for-loop to iterate over races. What can you tell from this plot?
NOTE: The ROC curves of the Native American and Asian groups should fall straight on the diagonal; that is what an AUC of 0.5 implies.
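One way to draw the per-race ROC curves from the predicted probabilities stored in comparison_df (a sketch):

# ROC curve per race group, based on the predicted probabilities
plt.figure()
for race in comparison_df['race'].unique():
    subset = comparison_df[comparison_df['race'] == race]
    # the AUC is undefined if a group contains only one observed class
    if subset['y_observed'].nunique() < 2:
        continue
    fpr, tpr, _ = roc_curve(subset['y_observed'], subset['predicted_probability'])
    auc = roc_auc_score(subset['y_observed'], subset['predicted_probability'])
    plt.plot(fpr, tpr, label=f"{race} (AUC = {auc:.2f})")
plt.plot([0, 1], [0, 1], linestyle='--', color='grey')  # chance level
plt.xlabel('False positive rate')
plt.ylabel('True positive rate')
plt.title('ROC curves by race')
plt.legend()
plt.show()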
In this practical, we are going to attempt reducing the false positive rate of African-Americans.
Bias mitigation (= fairness) techniques in machine learning can attempt to tackle the problem at three different model building stages: pre-processing (modifying the training data before model fitting), in-processing (modifying the learning algorithm or its objective), and post-processing (modifying the predictions of an already trained model).
Consider the problem setup. We have: a trained classifier, its predictions (and predicted probabilities) on the test set, the observed outcomes, and the sensitive attributes (race and sex) of the individuals in the test set.
We are going to attempt to tackle the problem of bias by using a post-processing approach. This way, we can directly control the distribution of the outcome.
Logistic regression outputs the predicted probability of recidivism. The probability is then dichotomized: by default the threshold $t$ is set to 0.5, so if the predicted probability is lower than 50%, the person is assigned 0 (in our case 'low risk'); otherwise they are assigned 1 ('high risk').
However, we could also consider setting different thresholds for different groups in an attempt to reduce bias. We do this by using a threshold optimization approach from the fairlearn library. You can explore the documentation.
14. Create two variables in which you save the values of X_train['race_African-American'] and X_test['race_African-American'].
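For example (the names A_train and A_test are our own):

# Sensitive feature: indicator for African-American, from the train and test sets
A_train = X_train['race_African-American']
A_test = X_test['race_African-American']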
15. Initialise the ThresholdOptimizer (see the sketch below). The estimator can be set to the logistic regression model (the one we created before), the constraints to demographic_parity, and the objective to accuracy_score. Also, set flip to True to allow flipping decisions if doing so improves the result.
What is ThresholdOptimizer? ThresholdOptimizer adjusts the decision thresholds per group (e.g., race or sex) to satisfy a fairness constraint like demographic parity or equalized odds. It does not retrain our model — it works on the model’s predictions.
Also, we chose demographic parity, which means that the proportion of individuals predicted positive (e.g., "will reoffend") should be the same across racial groups, regardless of true outcomes.
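A sketch of the initialisation, assuming the fitted model from exercise 9 is called logreg; passing prefit=True is our choice, so that the already-trained model is not refit:

# Post-processing: learn group-specific thresholds under a demographic parity constraint
threshold_optimizer = ThresholdOptimizer(
    estimator=logreg,
    constraints="demographic_parity",
    objective="accuracy_score",
    flip=True,    # allow flipping decisions if that improves the result
    prefit=True   # logreg is already trained, do not refit it
)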
16. Then fit the optimiser on the training data, setting sensitive_features to the protected attribute in the train set. Make the new predictions on the test set using the predict function and save them in a dataframe. Finally, visualize the effect of the ThresholdOptimizer with plot_threshold_optimizer().
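A sketch of these steps; the names new_predictions and new_predictions_df are our own, and the random_state is only there to make the (possibly randomized) thresholded predictions reproducible:

# Fit the optimizer: it learns per-group thresholds from the training data
threshold_optimizer.fit(X_train, y_train, sensitive_features=A_train)

# Group-aware predictions on the test set
new_predictions = threshold_optimizer.predict(X_test, sensitive_features=A_test, random_state=42)
new_predictions_df = pd.DataFrame({'new_threshold_decision': new_predictions}, index=X_test.index)

# Visualize the learned group-specific thresholds
plot_threshold_optimizer(threshold_optimizer)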
17. Combine the comparison dataframe with the dataframe containing the new predictions.
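For example, since both dataframes share the test-set index, a column-wise concatenation works:

# Combine the baseline comparison dataframe with the new threshold-based predictions
final_comparison_df = pd.concat([comparison_df, new_predictions_df], axis=1)
final_comparison_df.head()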
Let's make some comparisons of the results we obtained.
final_comparison_df["y_predicted_new"] = final_comparison_df["y_predicted"]
#final_comparison_df["y_predicted_new"] = np.where(final_comparison_df['race'] == "African-American",final_comparison_df["y_predicted_new"],final_comparison_df["new_threshold_decision"])
final_comparison_df["y_predicted_new"][final_comparison_df['race'] == "African-American"] = final_comparison_df["new_threshold_decision"][final_comparison_df['race'] == "African-American"]
final_comparison_df[["y_predicted",'new_threshold_decision']].corr()
| | y_predicted | new_threshold_decision |
|---|---|---|
| y_predicted | 1.000000 | 0.722498 |
| new_threshold_decision | 0.722498 | 1.000000 |
def get_new_prediction_label(row):
if row['y_observed'] == 0 and row['y_predicted_new'] == 0:
return 'TN'
elif row['y_observed'] == 1 and row['y_predicted_new'] == 0:
return 'FN'
elif row['y_observed'] == 0 and row['y_predicted_new'] == 1:
return 'FP'
elif row['y_observed'] == 1 and row['y_predicted_new'] == 1:
return 'TP'
# Apply the function to each row to create the 'updated_prediction' column
final_comparison_df['updated_prediction'] = final_comparison_df.apply(get_new_prediction_label, axis=1)
final_comparison_df.head()
| | sex | race | y_observed | y_predicted | baseline_prediction | predicted_probability | new_threshold_decision | y_predicted_new | updated_prediction |
|---|---|---|---|---|---|---|---|---|---|
| 308 | Male | Caucasian | 0 | 0 | TN | 0.151788 | 0 | 0 | TN |
| 381 | Male | African-American | 0 | 0 | TN | 0.422197 | 0 | 0 | TN |
| 3238 | Male | African-American | 1 | 0 | FN | 0.356359 | 0 | 0 | FN |
| 2312 | Male | African-American | 1 | 1 | TP | 0.586242 | 1 | 1 | TP |
| 251 | Female | Other | 0 | 0 | TN | 0.202685 | 0 | 0 | TN |
# Pivot table to summarize counts
pivot_table_counts = final_comparison_df.pivot_table(index='race', columns='updated_prediction', aggfunc='size', fill_value=0)
# Calculate support values
support = pivot_table_counts.sum(axis=1)
# Normalize to get proportions (rates)
pivot_table_proportions = pivot_table_counts.div(support, axis=0)
# Add the support column to the pivot table
pivot_table_proportions['support'] = support
# Round the proportions
pivot_table_proportions = pivot_table_proportions.round(2)
print(pivot_table_proportions)
| Race - Updated Prediction | FN | FP | TN | TP | Support |
|---|---|---|---|---|---|
| African-American | 0.18 | 0.13 | 0.36 | 0.32 | 731 |
| Asian | 0.20 | 0.20 | 0.60 | 0.00 | 5 |
| Caucasian | 0.21 | 0.10 | 0.53 | 0.16 | 505 |
| Hispanic | 0.21 | 0.07 | 0.64 | 0.08 | 117 |
| Native American | 0.00 | 0.33 | 0.33 | 0.33 | 3 |
| Other | 0.27 | 0.06 | 0.61 | 0.06 | 82 |
For reference:
| Race - Baseline Prediction | FN | FP | TN | TP | Support |
|---|---|---|---|---|---|
| African-American | 0.15 | 0.16 | 0.33 | 0.36 | 731 |
| Asian | 0.20 | 0.20 | 0.60 | 0.00 | 5 |
| Caucasian | 0.21 | 0.10 | 0.53 | 0.16 | 505 |
| Hispanic | 0.21 | 0.07 | 0.64 | 0.08 | 117 |
| Native American | 0.00 | 0.33 | 0.33 | 0.33 | 3 |
| Other | 0.27 | 0.06 | 0.61 | 0.06 | 82 |
We see that our bias mitigation strategy for African-American offenders has resulted in a reduction of the false positive rate from 16% with the naïve baseline approach to 13% with the threshold-optimized approach. The false negative rate has increased by 3 percentage points, showing that bias mitigation can come with a trade-off. The performance on the remaining groups has remained the same.
A challenge is that the Asian and Native American groups are very small (5 and 3 observations in the test set) and therefore not well suited to statistical learning. To improve the performance of the classifier on these groups, one would need to consider data augmentation approaches such as SMOTE (Synthetic Minority Oversampling Technique) or other solutions for imbalanced datasets.
Sources, Acknowledgments, and Additional Reading:
https://fairlearn.org/v0.5.0/api_reference/fairlearn.postprocessing.html
https://www.holisticai.com/blog/bias-mitigation-strategies-techniques-for-classification-tasks
We would like to thank Dr. Dong Nguyen of Utrecht University whose Human-Centered Machine Learning course materials have served as an inspiration.