Tutorial¶
Importing initial packages¶
from sklearn.metrics import roc_auc_score
from sklearn.model_selection import train_test_split
import pandas as pd
import warnings
warnings.filterwarnings("ignore", category=DeprecationWarning)
Import aprofs package modules code and utils
from aprofs import code, models, utils
/Users/filipesantos/Documents/Projects/aprofs/.venv/lib/python3.10/site-packages/tqdm/auto.py:21: TqdmWarning: IProgress not found. Please update jupyter and ipywidgets. See https://ipywidgets.readthedocs.io/en/stable/user_install.html from .autonotebook import tqdm as notebook_tqdm
Get the insurance data from kaggle here : https://www.kaggle.com/datasets/mirichoi0218/insurance
# Read the CSV file from the data folder
data = pd.read_csv("insurance.csv")
data
| age | sex | bmi | children | smoker | region | charges | |
|---|---|---|---|---|---|---|---|
| 0 | 19 | female | 27.900 | 0 | yes | southwest | 16884.92400 |
| 1 | 18 | male | 33.770 | 1 | no | southeast | 1725.55230 |
| 2 | 28 | male | 33.000 | 3 | no | southeast | 4449.46200 |
| 3 | 33 | male | 22.705 | 0 | no | northwest | 21984.47061 |
| 4 | 32 | male | 28.880 | 0 | no | northwest | 3866.85520 |
| ... | ... | ... | ... | ... | ... | ... | ... |
| 1333 | 50 | male | 30.970 | 3 | no | northwest | 10600.54830 |
| 1334 | 18 | female | 31.920 | 0 | no | northeast | 2205.98080 |
| 1335 | 18 | female | 36.850 | 0 | no | southeast | 1629.83350 |
| 1336 | 21 | female | 25.800 | 0 | no | southwest | 2007.94500 |
| 1337 | 61 | female | 29.070 | 0 | yes | northwest | 29141.36030 |
1338 rows × 7 columns
Some simple feature engineering
# foor loop over a pandas dataframe columns and chnate the typos off all string columns to category
for col in data.select_dtypes(include="object").columns:
data[col] = data[col].astype("category")
# Display the data
data['is_female'] = (data['sex'] == 'female').astype(int) # is female becomes the target variable
data = data.drop(columns=["sex"])
data
| age | bmi | children | smoker | region | charges | is_female | |
|---|---|---|---|---|---|---|---|
| 0 | 19 | 27.900 | 0 | yes | southwest | 16884.92400 | 1 |
| 1 | 18 | 33.770 | 1 | no | southeast | 1725.55230 | 0 |
| 2 | 28 | 33.000 | 3 | no | southeast | 4449.46200 | 0 |
| 3 | 33 | 22.705 | 0 | no | northwest | 21984.47061 | 0 |
| 4 | 32 | 28.880 | 0 | no | northwest | 3866.85520 | 0 |
| ... | ... | ... | ... | ... | ... | ... | ... |
| 1333 | 50 | 30.970 | 3 | no | northwest | 10600.54830 | 0 |
| 1334 | 18 | 31.920 | 0 | no | northeast | 2205.98080 | 1 |
| 1335 | 18 | 36.850 | 0 | no | southeast | 1629.83350 | 1 |
| 1336 | 21 | 25.800 | 0 | no | southwest | 2007.94500 | 1 |
| 1337 | 61 | 29.070 | 0 | yes | northwest | 29141.36030 | 1 |
1338 rows × 7 columns
Setting the roles of the features
target = "is_female" # target
T = "charges" # Treatment as price change
features = [
"age",
"bmi",
"children",
"smoker",
"region",
"charges",
]
Simple stratified split
seed = 42
X, y = data[features], data[target]
X_train, X_valid, y_train, y_valid = train_test_split(X, y, stratify=y, random_state=seed)
Build your initial model, its a tree bases GBM for classification. Note: we want to work with the probabilities not the class prediction
from lightgbm import LGBMClassifier
import lightgbm as lgb
monotone_constraints = [1 if col == T else 0 for col in features]
callbacks = [lgb.early_stopping(10, verbose=0), lgb.log_evaluation(period=0)]
model = LGBMClassifier(
verbose=-1, n_estimators=100, monotone_constraints=monotone_constraints,random_state=seed
).fit(
X_train,
y_train,
eval_set=[(X_valid, y_valid)],
callbacks=callbacks,
)
pred_valid = model.predict_proba(X_valid)
print(f"The original prediction has an AUC of {roc_auc_score(y_valid, pred_valid[:, 1])}")
The original prediction has an AUC of 0.5312254936907392
feature_importance_df = model.feature_importances_
feature_names = model.feature_name_
feature_importance_df = pd.DataFrame({"Feature": feature_names, "Importance": feature_importance_df})
# Sort the DataFrame by importance
feature_importance_df.sort_values("Importance", ascending=False, inplace=True)
print(feature_importance_df)
Feature Importance 1 bmi 56 0 age 49 2 children 20 5 charges 19 3 smoker 5 4 region 1
Getting your shapley values from the shap package TreeExplainer
from shap import TreeExplainer, Explainer
warnings.filterwarnings("ignore", category=UserWarning)
shap_explainer = TreeExplainer(model)
shap_valid = shap_explainer.shap_values(X_valid)
shap_expected_value = shap_explainer.expected_value
Putting you shapley values into a pandas dataframe
shaps_values = pd.DataFrame(shap_valid, index=X_valid.index, columns=X_valid.columns)
shaps_values
| age | bmi | children | smoker | region | charges | |
|---|---|---|---|---|---|---|
| 563 | -0.025771 | -0.000257 | 0.035974 | 0.050899 | -0.010099 | 0.036391 |
| 1327 | -0.058785 | -0.087476 | -0.002521 | 0.038555 | 0.006270 | 0.024491 |
| 1114 | 0.169415 | -0.033496 | 0.059202 | 0.027443 | -0.003710 | -0.068301 |
| 678 | 0.011816 | 0.015024 | 0.032997 | 0.056120 | -0.006188 | 0.070275 |
| 490 | 0.249472 | 0.100665 | 0.048708 | 0.054452 | 0.001997 | -0.370703 |
| ... | ... | ... | ... | ... | ... | ... |
| 1225 | -0.084233 | -0.104990 | 0.025577 | 0.031098 | -0.010582 | -0.015548 |
| 956 | -0.057537 | -0.084549 | 0.006919 | -0.151000 | 0.000211 | 0.067590 |
| 189 | -0.041293 | -0.089155 | -0.030936 | 0.027893 | -0.004784 | -0.029952 |
| 265 | 0.027805 | -0.039058 | -0.012704 | -0.108263 | -0.002953 | 0.095409 |
| 135 | 0.079413 | 0.086040 | 0.051731 | 0.038937 | 0.003567 | -0.061513 |
335 rows × 6 columns
Create an aprofs object¶
Instantiate one of link models to use with with our model. In this case its a Classification problem and we want to use the logistic function.
link_model = models.ClassificationLogisticLink()
and now create the Aprofs object
apos_test = code.Aprofs(
X_valid, y_valid, link_model=link_model
) # Initialize the Aprofs class with the data and the target columns
apos_test
Aprofs(current_data shape =(335, 6), target_column =[0 1] Shapley values have not been calculated!
See above that the Shapley values have not been added yet to the aprofs object
In here we use the built int method to calculate the shapley values and add those into the aprofs object
apos_test.calculate_shaps(model, type_model='tree')
lets take a look inside
apos_test
Aprofs(current_data shape =(335, 6), target_column =[0 1], shap_mean=0.012810903840487525, shap_values.shape=(335, 6)
Now you can see that the shap information was added.
Lets take a look at the shap_values table inside the paorfs objects
apos_test.shap_values
| age | bmi | children | smoker | region | charges | |
|---|---|---|---|---|---|---|
| 563 | -0.025771 | -0.000257 | 0.035974 | 0.050899 | -0.010099 | 0.036391 |
| 1327 | -0.058785 | -0.087476 | -0.002521 | 0.038555 | 0.006270 | 0.024491 |
| 1114 | 0.169415 | -0.033496 | 0.059202 | 0.027443 | -0.003710 | -0.068301 |
| 678 | 0.011816 | 0.015024 | 0.032997 | 0.056120 | -0.006188 | 0.070275 |
| 490 | 0.249472 | 0.100665 | 0.048708 | 0.054452 | 0.001997 | -0.370703 |
| ... | ... | ... | ... | ... | ... | ... |
| 1225 | -0.084233 | -0.104990 | 0.025577 | 0.031098 | -0.010582 | -0.015548 |
| 956 | -0.057537 | -0.084549 | 0.006919 | -0.151000 | 0.000211 | 0.067590 |
| 189 | -0.041293 | -0.089155 | -0.030936 | 0.027893 | -0.004784 | -0.029952 |
| 265 | 0.027805 | -0.039058 | -0.012704 | -0.108263 | -0.002953 | 0.095409 |
| 135 | 0.079413 | 0.086040 | 0.051731 | 0.038937 | 0.003567 | -0.061513 |
335 rows × 6 columns
apos_test.shap_mean
0.012810903840487525
Looks the same as the calculated above using the TreeExplainer
And the performance is the same using the shapley values and the predictive model, use the .get_feature_performance method to test.
perf = apos_test.get_feature_performance(features)
print(f"The performance calculated using the shapley value has an AUC of {perf}")
The performance calculated using the shapley value has an AUC of 0.5315819490981678
Feature selection.¶
To get a list of the best features using a brute force approach its simple, use the .brute_force_selection.
This will test all the combinations of features and return a list with the best features
best_solution = apos_test.brute_force_selection(features)
best_solution
Processing 63 combinations: 0%| | 0/63 [00:00<?, ?it/s]
Processing 63 combinations: 100%|██████████| 63/63 [00:00<00:00, 456.01it/s]
the best list is ('smoker', 'region', 'charges') with performance 0.5942111641833606
['smoker', 'region', 'charges']
monotone_constraints = [1 if col == T else 0 for col in X_train[best_solution].columns]
model_best = LGBMClassifier(
verbose=-1, n_estimators=100, monotone_constraints=monotone_constraints, random_state=seed
).fit(
X_train[best_solution],
y_train,
eval_set=[(X_valid[best_solution], y_valid)],
callbacks=callbacks,
)
pred_valid_best = model_best.predict_proba(X_valid[best_solution])
print(f"The new prediction have an AUC of {roc_auc_score(y_valid, pred_valid_best[:, 1])}")
The new prediction have an AUC of 0.5496542382547943
Lets create a new Aprofs object with the new data and model
apos_test_best = code.Aprofs(X_valid[best_solution], y_valid, link_model=None)
apos_test_best.calculate_shaps(model_best)
and take a look inside
apos_test_best
Aprofs(current_data shape =(335, 3), target_column =[0 1], shap_mean=-0.028112216240915356, shap_values.shape=(335, 3)
We can also try to go a greedy forward selection, it basically look at the approximate prediction for each features, select the best one and adds the on to the list. Then goes and select the next best one and add it to the list. It stops if adding the feature does not make the model better.
apos_test.gready_forward_selection(features)
the best feature to add is smoker with performance 0.5588864333071932 the best feature to add is charges with performance 0.589505952805304 the best feature to add is region with performance 0.5942111641833606 The feature age wont be added The feature children wont be added The feature bmi wont be added
['smoker', 'charges', 'region']
Calculating p-values¶
We can check what is the quality of our features using a p-value estimate of the change in performance if we shuffle the shapley value. Please look at the references to know more.
shap_p_values = apos_test.get_shap_p_value(features=features)
shap_p_values
100%|██████████| 6/6 [00:03<00:00, 1.97it/s]
| Feature | p-value_shap | |
|---|---|---|
| 0 | age | 0.324 |
| 1 | bmi | 0.800 |
| 2 | children | 0.934 |
| 3 | smoker | 0.086 |
| 4 | region | 0.156 |
| 5 | charges | 0.000 |
warnings.filterwarnings("ignore", category=FutureWarning)
merged_df_shap = shap_p_values.merge(feature_importance_df, on="Feature")
merged_df_shap.sort_values("Importance", ascending=False, inplace=True)
# Define a function to apply color formatting
def color_format(val):
if val < 0.05:
return "background-color: green"
elif val > 0.3:
return "background-color: red"
else:
return "background-color: gray"
# Apply color formatting to the dataframe
styled_df_shap = merged_df_shap.style.applymap(color_format, subset=["p-value_shap"])
# Display the styled dataframe
styled_df_shap
| Feature | p-value_shap | Importance | |
|---|---|---|---|
| 1 | bmi | 0.800000 | 56 |
| 0 | age | 0.324000 | 49 |
| 2 | children | 0.934000 | 20 |
| 5 | charges | 0.000000 | 19 |
| 3 | smoker | 0.086000 | 5 |
| 4 | region | 0.156000 | 1 |
Visualization: partial dependence plots to understand the features¶
apos_test.visualize_feature(main_feature="children",other_features=None, nbins=10)
apos_test.visualize_feature(main_feature="smoker")
apos_test.visualize_feature(main_feature="charges",other_features=None, nbins=10, type_bins="cut")
And we can even compare the aprofs object comparing the pdp plot for the feature fo the models. If needs to be a feature used on both models... what else to compare!
Comparing aprofs objects (shapley values)¶
apos_test.compare_feature(apos_test_best,"charges")
Comparing the behavior of feature after neutralize the marginal effect another/same feature¶
Basically here the trick is to just average our the shapley values of the features that we want to neutralize and use that to calculate the final prediction.
Example: we have feature A, feature B and feature C. What will will do to neutralize Feature B of fo use for prediction the following:
- Shappley Feature A + Overall Shappley Average Feature B + Shappley Featue C
apos_test.visualize_neutralized_feature(main_feature="children", neutralize_features="charges")
And we can see that neutralizing shap make things a bit more flat flor the affect of "Children"