Skip to content

References

Core code development of the Aprofs class.

This class computes the SHAP values of a model and evaluates feature performance based on those values. It also provides methods to visualize the marginal effect of a feature on the target variable, to perform feature selection driven by the SHAP values, to compute p-values for the features' SHAP contributions, and to measure model performance from the SHAP values of a selected feature subset.

Aprofs

Aprofs Class

A class for analyzing SHAP values using approximate predictions.

Attributes:

Name Type Description
current_data DataFrame

The current data.

target_column Series

The target column.

link_model LinkModels

The link model, providing the link function and the performance metric.

shap_mean float

The mean SHAP value. None if SHAP values have not been calculated.

shap_values DataFrame

The SHAP values. None if SHAP values have not been calculated.

Source code in src/aprofs/code.py
 28
 29
 30
 31
 32
 33
 34
 35
 36
 37
 38
 39
 40
 41
 42
 43
 44
 45
 46
 47
 48
 49
 50
 51
 52
 53
 54
 55
 56
 57
 58
 59
 60
 61
 62
 63
 64
 65
 66
 67
 68
 69
 70
 71
 72
 73
 74
 75
 76
 77
 78
 79
 80
 81
 82
 83
 84
 85
 86
 87
 88
 89
 90
 91
 92
 93
 94
 95
 96
 97
 98
 99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
class Aprofs:
    """
    Aprofs Class

    A class for analyzing SHAP values using approximate predictions.
    --------------------------------------------------------------


    Attributes:
        current_data (pd.DataFrame): The current data.
        target_column (Series): The target column.
        link_model (LinkModels): The link model, providing the link function and the performance metric.
        shap_mean (float): The mean SHAP value. None if SHAP values have not been calculated.
        shap_values (DataFrame): The SHAP values. None if SHAP values have not been calculated.

    """

    def __init__(self, current_data, target_column, link_model: LinkModels):
        self.current_data = current_data
        self.target_column = target_column
        # Fall back to a logistic (classification) link when no link model is supplied.
        self.link_model = ClassificationLogisticLink() if link_model is None else link_model
        self.shap_mean: float = None  # None until calculate_shaps() has run
        self.shap_values: pd.DataFrame = None  # None until calculate_shaps() has run

    def __repr__(self):
        return (
            f"Aprofs(current_data shape ={self.current_data.shape}, target_column ={self.target_column.unique()}"
            + (
                f", shap_mean={self.shap_mean}, shap_values.shape={self.shap_values.shape}"
                if self.shap_mean is not None
                else "\n  Shapley values have not been calculated!"
            )
        )

    def _validate_features(self, features) -> None:
        """Raise a ValueError listing every requested feature that has no SHAP values column."""
        missing_features = [feature for feature in features if feature not in self.shap_values.columns]
        if missing_features:
            raise ValueError(f"The following features are missing in the SHAP values: {missing_features}")

    def calculate_shaps(self, model: Any, type_model="tree") -> None:
        """
        Calculate the SHAP values for the given model.

        Parameters:
            model (Any): The trained model for which to calculate the SHAP values.
            type_model (str): type of model: tree based or other. If "tree" then TreeExplainer will be use, otherwise a general explainer from the SHAP package is used. Defaults to 'tree'.


        Returns:
            None
        """
        shap_values, shap_mean = utils.get_shap_values(self.current_data, model, type_model)
        self.shap_values = pd.DataFrame(shap_values, index=self.current_data.index, columns=self.current_data.columns)
        self.shap_mean = shap_mean

    def get_feature_performance(self, features: List[str]) -> float:
        """
        Calculate the performance of the features based on the SHAP values.

        Parameters:
            features (List[str]): The list of features for which to calculate the performance.

        Returns:
            float: The performance of the features based on the SHAP values.

        Raises:
            ValueError: If any feature is missing in the SHAP values.
        """
        self._validate_features(features)
        return self.link_model.performance_fit(
            self.target_column, utils.calculate_row_sum(self.shap_values, self.shap_mean, features, self.link_model)
        )

    def brute_force_selection(self, features: List[str]) -> List[str]:
        """
        Perform brute force feature selection by evaluating the performance of all possible combinations of features.

        Parameters:
            features (List[str]): The list of features to consider for feature selection.

        Returns:
            List[str]: The best list of features with the highest performance.

        Raises:
            ValueError: If any feature is missing in the SHAP values.
        """
        self._validate_features(features)

        # Respect the metric direction declared by the link model instead of
        # assuming a maximize metric (consistent with gready_forward_selection).
        maximize = self.link_model.perform == "maximize"
        best_performance = float("-inf") if maximize else float("inf")
        best_list: List[str] = []
        all_combinations = list(utils.generate_all_combinations(features))
        for comb in tqdm(all_combinations, desc=f"Processing {len(all_combinations)} combinations"):
            current_performance = self.get_feature_performance(list(comb))
            improved = current_performance > best_performance if maximize else current_performance < best_performance
            if improved:
                best_performance = current_performance
                best_list = comb
        print(f"the best list is {best_list} with performance {best_performance}")
        return list(best_list)

    def gready_forward_selection(self, features: List[str], greediness: float = 0.001) -> List[str]:
        """
        Perform gready forward feature selection by evaluating the performance of all possible combinations of features.

        Parameters:
            features (List[str]): The list of features to consider for feature selection.
            greediness (float): The greediness factor, how much better needs to be the performance to add the feature. Default is 0.001.

        Returns:
            List[str]: The best list of features with the highest performance.

        Raises:
            ValueError: If any feature is missing in the SHAP values.
        """
        self._validate_features(features)

        best_list: List = []
        candidate_list: List[str] = features.copy()
        # Start from the metric's worst value so the first candidate can be accepted for
        # both directions (a 0.0 start wrongly rejects every feature for minimize metrics).
        maximize = self.link_model.perform == "maximize"
        best_performance = float("-inf") if maximize else float("inf")
        while len(candidate_list) > 0:
            best_feature_, best_performance_ = utils.best_feature(
                self.shap_values, self.shap_mean, self.link_model, self.target_column, best_list, candidate_list
            )
            candidate_list.remove(best_feature_)

            if maximize:
                # Accept unless the current score already beats the candidate plus the greediness margin.
                accept = best_performance <= best_performance_ * (1 + greediness)
            else:
                accept = best_performance >= best_performance_ * (1 - greediness)

            if accept:
                best_performance = best_performance_
                best_list.append(best_feature_)
                print(f"the best feature to add is {best_feature_} with performance {best_performance_}")
            else:
                print(f"The feature {best_feature_} wont be added")

        return best_list

    def get_shap_p_value(self, features: List[str], suffle_size: int = 500) -> pd.DataFrame:
        """
        Calculate the p-values of the SHAP values of the features.

        Parameters:
            features (List[str]): The list of features for which to calculate the p-values.
            suffle_size (int): The number of shuffling iterations to perform. Default is 500.

        Returns:
            pd.DataFrame: A DataFrame containing the features and their corresponding p-values.

        Raises:
            ValueError: If any feature is missing in the SHAP values.
        """
        self._validate_features(features)

        p_values = []
        # Baseline: performance of the approximation built from ALL features.
        performance_threshold = self.get_feature_performance(self.shap_values.columns)
        for feature in tqdm(features):
            samples = [
                utils.random_sort_shaps(self.shap_values, self.shap_mean, feature, self.target_column, self.link_model)
                for _ in range(suffle_size)
            ]
            # Fraction of shuffles that beat the baseline = empirical p-value.
            count = sum(sample > performance_threshold for sample in samples)
            p_values.append(count / suffle_size)

        return pd.DataFrame({"Feature": features, "p-value_shap": p_values})

    def visualize_feature(  # pylint: disable=too-many-arguments
        self,
        main_feature: str,
        other_features: List[str] = None,
        nbins: int = 20,
        type_bins: str = "qcut",
        type_plot: str = "prob",
    ) -> None:
        """
        Visualize the marginal effect of a feature on the target variable.

        Parameters:
            main_feature (str): The main feature for which to visualize the marginal effect.
            other_features (List[str]): The list of other features to include in the visualization. Default is None.
            nbins (int): The number of bins to use for the visualization. Default is 20.
            type_bins (str): The type of binning to use. Default is "qcut".
            type_plot (str): The type of plot to generate. Default is "prob".

        Returns:
            None

        Raises:
            ValueError: If any feature is missing in the SHAP values dataframe.
        """
        # generate data to plot marginal effect shapley values
        if other_features is None:
            other_features = []
        features = [main_feature, *other_features]

        self._validate_features(features)

        temp_data = utils.temp_plot_data(self, features)
        # call plotting function
        utils.plot_data(
            temp_data,
            main_feature,
            other_features=other_features,
            nbins=nbins,
            type_bins=type_bins,
            type_plot=type_plot,
        )

    def compare_feature(  # pylint: disable=too-many-arguments
        self,
        other,
        feature: str,
        nbins: int = 20,
        type_bins: str = "qcut",
        type_plot: str = "prob",
    ) -> None:
        """
        Visualize the marginal effect of a feature, comparing this object with another Aprofs object.

        Parameters:
            other (Aprofs): The other Aprofs object to compare against.
            feature (str): The main feature for which to visualize the marginal effect.
            nbins (int): The number of bins to use for the visualization. Default is 20.
            type_bins (str): The type of binning to use. Default is "qcut".
            type_plot (str): The type of plot to generate. Default is "prob".

        Returns:
            None

        Raises:
            ValueError: If `other` is not an Aprofs object or the feature is missing in the SHAP values.
        """

        if not isinstance(other, Aprofs):
            raise ValueError("Can only compare with another Aprofs object")

        if feature not in self.shap_values.columns:
            raise ValueError(f"The following feature are missing in the SHAP values: {feature}")

        temp_data = utils.temp_plot_compare_data(self, other, feature)
        # call plotting function
        utils.plot_data_compare(
            temp_data,
            feature,
            nbins=nbins,
            type_bins=type_bins,
            type_plot=type_plot,
        )

    def visualize_neutralized_feature(  # pylint: disable=too-many-arguments
        self,
        main_feature: str,
        neutralize_features: List[str] = None,
        nbins: int = 20,
        type_bins: str = "qcut",
        type_plot: str = "prob",
    ) -> None:
        """
        Visualize the marginal effect of a feature on the target variable after neutralizing the effect of other features.

        Parameters:
            main_feature (str): The main feature for which to visualize the marginal effect.
            neutralize_features (List[str]): The list of other features to be neutralized.
            nbins (int): The number of bins to use for the visualization. Default is 20.
            type_bins (str): The type of binning to use. Default is "qcut".
            type_plot (str): The type of plot to generate. Default is "prob".

        Returns:
            None

        Raises:
            ValueError: If any feature is missing in the SHAP values dataframe.
        """
        # generate data to plot marginal effect shapley values
        if neutralize_features is None:
            neutralize_features = []
        elif not isinstance(neutralize_features, list):
            # Allow a single feature name to be passed without wrapping it in a list.
            neutralize_features = [neutralize_features]

        features = list({main_feature, *neutralize_features})  # remove duplicates

        self._validate_features(features)

        temp_data = utils.temp_neutral_plot_data(self, neutralize_features)
        temp_data[main_feature] = self.current_data[main_feature]
        # call plotting function
        utils.plot_data_neutral(
            temp_data,
            main_feature,
            nbins=nbins,
            type_bins=type_bins,
            type_plot=type_plot,
        )

brute_force_selection(features)

Perform brute force feature selection by evaluating the performance of all possible combinations of features.

Parameters:

Name Type Description Default
features List[str]

The list of features to consider for feature selection.

required

Returns:

Type Description
List[str]

List[str]: The best list of features with the highest performance.

Raises:

Type Description
ValueError

If any feature is missing in the SHAP values.

Source code in src/aprofs/code.py
 99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
def brute_force_selection(self, features: List[str]) -> List[str]:
    """
    Perform brute force feature selection by evaluating the performance of all possible combinations of features.

    Parameters:
        features (List[str]): The list of features to consider for feature selection.

    Returns:
        List[str]: The best list of features with the highest performance.

    Raises:
        ValueError: If any feature is missing in the SHAP values.
    """
    missing_features = [feature for feature in features if feature not in self.shap_values.columns]
    if missing_features:
        raise ValueError(f"The following features are missing in the SHAP values: {missing_features}")

    # Respect the metric direction declared by the link model instead of
    # assuming a maximize metric (consistent with gready_forward_selection).
    maximize = self.link_model.perform == "maximize"
    best_performance = float("-inf") if maximize else float("inf")
    best_list: List[str] = []
    all_combinations = list(utils.generate_all_combinations(features))
    for comb in tqdm(all_combinations, desc=f"Processing {len(all_combinations)} combinations"):
        current_performance = self.get_feature_performance(list(comb))
        improved = current_performance > best_performance if maximize else current_performance < best_performance
        if improved:
            best_performance = current_performance
            best_list = comb
    print(f"the best list is {best_list} with performance {best_performance}")
    return list(best_list)

calculate_shaps(model, type_model='tree')

Calculate the SHAP values for the given model.

Parameters:

Name Type Description Default
model Any

The trained model for which to calculate the SHAP values.

required
type_model str

type of model: tree based or other. If "tree" then TreeExplainer will be use, otherwise a general explainer from the SHAP package is used. Defaults to 'tree'.

'tree'

Returns:

Type Description
None

None

Source code in src/aprofs/code.py
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
def calculate_shaps(self, model: Any, type_model="tree") -> None:
    """
    Compute and store the SHAP values of the given model on the current data.

    Parameters:
        model (Any): The trained model for which to calculate the SHAP values.
        type_model (str): type of model: tree based or other. If "tree" then TreeExplainer will be use, otherwise a general explainer from the SHAP package is used. Defaults to 'tree'.


    Returns:
        None
    """
    raw_values, mean_value = utils.get_shap_values(self.current_data, model, type_model)
    self.shap_mean = mean_value
    self.shap_values = pd.DataFrame(
        raw_values,
        index=self.current_data.index,
        columns=self.current_data.columns,
    )

compare_feature(other, feature, nbins=20, type_bins='qcut', type_plot='prob')

Visualize the marginal effect of a feature on the target variable.

Parameters:

Name Type Description Default
feature str

The main feature for which to visualize the marginal effect.

required
nbins int

The number of bins to use for the visualization. Default is 20.

20
type_bins str

The type of binning to use. Default is "qcut".

'qcut'
type_plot str

The type of plot to generate. Default is "prob".

'prob'

Returns:

Type Description
None

None

Raises:

Type Description
ValueError

If any feature is missing in the SHAP values.

Source code in src/aprofs/code.py
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
def compare_feature(  # pylint: disable=too-many-arguments
    self,
    other,
    feature: str,
    nbins: int = 20,
    type_bins: str = "qcut",
    type_plot: str = "prob",
) -> None:
    """
    Plot the marginal effect of a feature, comparing this object against another Aprofs object.

    Parameters:
        feature (str): The main feature for which to visualize the marginal effect.
        nbins (int): The number of bins to use for the visualization. Default is 20.
        type_bins (str): The type of binning to use. Default is "qcut".
        type_plot (str): The type of plot to generate. Default is "prob".

    Returns:
        None

    Raises:
        ValueError: If `other` is not an Aprofs object or the feature is missing in the SHAP values.
    """

    # The comparison only makes sense between two Aprofs instances.
    if not isinstance(other, Aprofs):
        raise ValueError("Can only compare with another Aprofs object")

    if feature not in self.shap_values.columns:
        raise ValueError(f"The following feature are missing in the SHAP values: {feature}")

    # Build the comparison frame, then hand it to the plotting helper.
    comparison_frame = utils.temp_plot_compare_data(self, other, feature)
    utils.plot_data_compare(
        comparison_frame,
        feature,
        nbins=nbins,
        type_bins=type_bins,
        type_plot=type_plot,
    )

get_feature_performance(features)

Calculate the performance of the features based on the SHAP values.

Parameters:

Name Type Description Default
features List[str]

The list of features for which to calculate the performance.

required

Returns:

Name Type Description
float float

The performance of the features based on the SHAP values.

Raises:

Type Description
ValueError

If any feature is missing in the SHAP values.

Source code in src/aprofs/code.py
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
def get_feature_performance(self, features: List[str]) -> float:
    """
    Score the approximate prediction built from the SHAP values of the given features.

    Parameters:
        features (List[str]): The list of features for which to calculate the performance.

    Returns:
        float: The performance of the features based on the SHAP values.

    Raises:
        ValueError: If any feature is missing in the SHAP values.
    """
    missing_features = [name for name in features if name not in self.shap_values.columns]
    if missing_features:
        raise ValueError(f"The following features are missing in the SHAP values: {missing_features}")
    # Sum the selected SHAP columns (plus the base value) and score against the target.
    approx_prediction = utils.calculate_row_sum(self.shap_values, self.shap_mean, features, self.link_model)
    return self.link_model.performance_fit(self.target_column, approx_prediction)

get_shap_p_value(features, suffle_size=500)

Calculate the p-values of the SHAP values of the features.

Parameters:

Name Type Description Default
features List[str]

The list of features for which to calculate the p-values.

required
suffle_size int

The number of shuffling iterations to perform. Default is 500.

500

Returns:

Type Description
DataFrame

pd.DataFrame: A DataFrame containing the features and their corresponding p-values.

Raises:

Type Description
ValueError

If any feature is missing in the SHAP values.

Source code in src/aprofs/code.py
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
def get_shap_p_value(self, features: List[str], suffle_size: int = 500) -> pd.DataFrame:
    """
    Estimate an empirical p-value for each feature's SHAP contribution via shuffling.

    Parameters:
        features (List[str]): The list of features for which to calculate the p-values.
        suffle_size (int): The number of shuffling iterations to perform. Default is 500.

    Returns:
        pd.DataFrame: A DataFrame containing the features and their corresponding p-values.

    Raises:
        ValueError: If any feature is missing in the SHAP values.
    """
    missing_features = [name for name in features if name not in self.shap_values.columns]
    if missing_features:
        raise ValueError(f"The following features are missing in the SHAP values: {missing_features}")

    # Baseline: performance of the approximation using all available features.
    performance_threshold = self.get_feature_performance(self.shap_values.columns)
    p_values = []
    for feature in tqdm(features):
        # Count how many shuffled runs beat the baseline performance.
        exceed_count = 0
        for _ in range(suffle_size):
            shuffled_score = utils.random_sort_shaps(
                self.shap_values, self.shap_mean, feature, self.target_column, self.link_model
            )
            if shuffled_score > performance_threshold:
                exceed_count += 1
        p_values.append(exceed_count / suffle_size)

    return pd.DataFrame({"Feature": features, "p-value_shap": p_values})

gready_forward_selection(features, greediness=0.001)

Perform gready forward feature selection by evaluating the performance of all possible combinations of features.

Parameters:

Name Type Description Default
features List[str]

The list of features to consider for feature selection.

required
greediness float

The greediness factor, how much better needs to be the performance to add the feature. Default is 0.001.

0.001

Returns: List[str]: The best list of features with the highest performance.

Raises:

Type Description
ValueError

If any feature is missing in the SHAP values.

Source code in src/aprofs/code.py
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
def gready_forward_selection(self, features: List[str], greediness: float = 0.001) -> List[str]:
    """
    Perform gready forward feature selection by evaluating the performance of all possible combinations of features.

    Parameters:
        features (List[str]): The list of features to consider for feature selection.
        greediness (float): The greediness factor, how much better needs to be the performance to add the feature. Default is 0.001.

    Returns:
        List[str]: The best list of features with the highest performance.

    Raises:
        ValueError: If any feature is missing in the SHAP values.
    """
    missing_features = [feature for feature in features if feature not in self.shap_values.columns]
    if missing_features:
        raise ValueError(f"The following features are missing in the SHAP values: {missing_features}")

    best_list: List = []
    candidate_list: List[str] = features.copy()
    # Start from the metric's worst value so the first candidate can be accepted for
    # both directions (a 0.0 start wrongly rejects every feature for minimize metrics).
    maximize = self.link_model.perform == "maximize"
    best_performance = float("-inf") if maximize else float("inf")
    while len(candidate_list) > 0:
        best_feature_, best_performance_ = utils.best_feature(
            self.shap_values, self.shap_mean, self.link_model, self.target_column, best_list, candidate_list
        )
        candidate_list.remove(best_feature_)

        if maximize:
            # Accept unless the current score already beats the candidate plus the greediness margin.
            accept = best_performance <= best_performance_ * (1 + greediness)
        else:
            accept = best_performance >= best_performance_ * (1 - greediness)

        if accept:
            best_performance = best_performance_
            best_list.append(best_feature_)
            print(f"the best feature to add is {best_feature_} with performance {best_performance_}")
        else:
            print(f"The feature {best_feature_} wont be added")

    return best_list

visualize_feature(main_feature, other_features=None, nbins=20, type_bins='qcut', type_plot='prob')

Visualize the marginal effect of a feature on the target variable.

Parameters:

Name Type Description Default
main_feature str

The main feature for which to visualize the marginal effect.

required
other_features List[str]

The list of other features to include in the visualization. Default is None.

None
nbins int

The number of bins to use for the visualization. Default is 20.

20
type_bins str

The type of binning to use. Default is "qcut".

'qcut'
type_plot str

The type of plot to generate. Default is "prob".

'prob'

Returns:

Type Description
None

None

Raises:

Type Description
ValueError

If any feature is missing in the SHAP values dataframe.

Source code in src/aprofs/code.py
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
def visualize_feature(  # pylint: disable=too-many-arguments
    self,
    main_feature: str,
    other_features: List[str] = None,
    nbins: int = 20,
    type_bins: str = "qcut",
    type_plot: str = "prob",
) -> None:
    """
    Plot the marginal effect of a feature on the target variable.

    Parameters:
        main_feature (str): The main feature for which to visualize the marginal effect.
        other_features (List[str]): The list of other features to include in the visualization. Default is None.
        nbins (int): The number of bins to use for the visualization. Default is 20.
        type_bins (str): The type of binning to use. Default is "qcut".
        type_plot (str): The type of plot to generate. Default is "prob".

    Returns:
        None

    Raises:
        ValueError: If any feature is missing in the SHAP values dataframe.
    """
    # Collect the main feature plus any companions into a single list.
    if other_features is None:
        other_features = []
    features = [main_feature, *other_features]

    missing_features = [name for name in features if name not in self.shap_values.columns]
    if missing_features:
        raise ValueError(f"The following features are missing in the SHAP values: {missing_features}")

    # Build the plotting frame from the SHAP data, then delegate to the plot helper.
    plot_frame = utils.temp_plot_data(self, features)
    utils.plot_data(
        plot_frame,
        main_feature,
        other_features=other_features,
        nbins=nbins,
        type_bins=type_bins,
        type_plot=type_plot,
    )

visualize_neutralized_feature(main_feature, neutralize_features=None, nbins=20, type_bins='qcut', type_plot='prob')

Visualize the marginal effect of a feature on the target variable after neutralizing the effect of other features.

Parameters:

Name Type Description Default
main_feature str

The main feature for which to visualize the marginal effect.

required
neutralize_features List[str]

The list of other features to be neutralized.

None
nbins int

The number of bins to use for the visualization. Default is 20.

20
type_bins str

The type of binning to use. Default is "qcut".

'qcut'
type_plot str

The type of plot to generate. Default is "prob".

'prob'

Returns:

Type Description
None

None

Raises:

Type Description
ValueError

If any feature is missing in the SHAP values dataframe.

Source code in src/aprofs/code.py
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
def visualize_neutralized_feature(  # pylint: disable=too-many-arguments
    self,
    main_feature: str,
    neutralize_features: List[str] = None,
    nbins: int = 20,
    type_bins: str = "qcut",
    type_plot: str = "prob",
) -> None:
    """
    Visualize the marginal effect of a feature on the target variable after
    neutralizing the effect of other features.

    Parameters:
        main_feature (str): The main feature for which to visualize the marginal effect.
        neutralize_features (List[str]): The list of other features to be neutralized.
        nbins (int): The number of bins to use for the visualization. Default is 20.
        type_bins (str): The type of binning to use. Default is "qcut".
        type_plot (str): The type of plot to generate. Default is "prob".

    Returns:
        None

    Raises:
        ValueError: If any feature is missing in the SHAP values dataframe.
    """
    # Normalize the neutralize_features argument into a list.
    if neutralize_features is None:
        neutralize_features = []
    elif not isinstance(neutralize_features, list):
        neutralize_features = [neutralize_features]

    # Deduplicated pool of features that must exist in the SHAP values.
    feature_pool = list({main_feature, *neutralize_features})

    missing_features = [name for name in feature_pool if name not in self.shap_values.columns]
    if missing_features:
        raise ValueError(f"The following features are missing in the SHAP values: {missing_features}")

    # Build the neutralized plotting frame, then attach the raw main-feature values.
    temp_data = utils.temp_neutral_plot_data(self, neutralize_features)
    temp_data[main_feature] = self.current_data[main_feature]

    utils.plot_data_neutral(
        temp_data,
        main_feature,
        nbins=nbins,
        type_bins=type_bins,
        type_plot=type_plot,
    )

Detailed API models

This module implements the models class. This way we can extend this class to implement new models for use with the Aprofs class.

Bases: LinkModels

This class implements the interface for classification with logistic link

Source code in src/aprofs/models.py
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
class ClassificationLogisticLink(LinkModels):
    """Link model for binary classification using the logistic (sigmoid) link."""

    def __init__(self) -> None:
        super().__init__(type_model="classification", type_link="logistic", perform="maximize")

    def performance_fit(self, target: Union[np.ndarray, pd.Series], prediction: Union[np.ndarray, pd.Series]) -> float:
        # AUC: higher is better, matching perform="maximize".
        return roc_auc_score(target, prediction)

    def link_calculate(
        self, inv_prediction: Union[int, float, np.ndarray, pd.Series]
    ) -> Union[int, float, np.ndarray, pd.Series]:
        is_numeric = isinstance(inv_prediction, (int, float, np.ndarray, pd.Series))
        if not is_numeric:
            raise ValueError("Invalid input type for link_calculate")
        # Sigmoid: map the linear predictor onto (0, 1).
        exp_neg = np.exp(-inv_prediction)
        return 1 / (1 + exp_neg)

    def inv_link_calculate(
        self, prediction: Union[int, float, np.ndarray, pd.Series]
    ) -> Union[int, float, np.ndarray, pd.Series]:
        # Logit: map a probability back onto the linear-predictor scale.
        odds = prediction / (1 - prediction)
        return np.log(odds)

    def __repr__(self) -> str:
        return "{}() with type model {} and type link {}".format(
            type(self).__name__, self.type_model, self.type_link
        )

LinkModels

This class implements the interface for the link models to be used in the aprofs class

Functionality that needs to be implemented
  • performance_fit: calculate the performance of the model
  • link_calculate: calculate the link function
  • inv_link_calculate: calculate the inverse link function
Source code in src/aprofs/models.py
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
class LinkModels(metaclass=abc.ABCMeta):
    """This class implements the interface for the link models
    to be used in the aprofs class

    Functionality that needs to be implemented:
        - performance_fit: calculate the performance of the model
        - link_calculate: calculate the link function
        - inv_link_calculate: calculate the inverse link function

    """

    def __init__(self, type_model: str, type_link: str, perform: str) -> None:
        # type_model: e.g. "classification" / "regression"
        self.type_model = type_model
        # type_link: e.g. "logistic" / "identity" / "logarithmic"
        self.type_link = type_link
        # perform: "maximize" or "minimize", i.e. the direction of performance_fit
        self.perform = perform

    @abc.abstractmethod
    def performance_fit(self, target: Union[np.ndarray, pd.Series], prediction: Union[np.ndarray, pd.Series]) -> float:
        """
        Calculate the performance of the model.

        Args:
            target (Union[np.ndarray, pd.Series]): The true target values.
            prediction (Union[np.ndarray, pd.Series]): The predicted values.

        Returns:
            float: The performance metric (direction given by `perform`).
        """

    @abc.abstractmethod
    def link_calculate(
        self, inv_prediction: Union[int, float, np.ndarray, pd.Series]
    ) -> Union[int, float, np.ndarray, pd.Series]:
        """
        Calculate the link function.

        Args:
            inv_prediction (Union[int, float, np.ndarray, pd.Series]): The input value(s).

        Returns:
            Union[int, float, np.ndarray, pd.Series]: The transformed value(s).
        """

    @abc.abstractmethod
    def inv_link_calculate(
        self, prediction: Union[int, float, np.ndarray, pd.Series]
    ) -> Union[int, float, np.ndarray, pd.Series]:
        """
        Calculate the inverse link function.

        Args:
            prediction (Union[int, float, np.ndarray, pd.Series]): The input value(s).

        Returns:
            Union[int, float, np.ndarray, pd.Series]: The transformed value(s).
        """

Calculate the inverse link function.

Parameters:

Name Type Description Default
prediction Union[int, float, ndarray]

The input value(s).

required

Returns:

Type Description
Union[int, float, ndarray, Series]

Union[int, float, np.ndarray]: The transformed value(s).

Source code in src/aprofs/models.py
60
61
62
63
64
65
66
67
68
69
70
71
72
@abc.abstractmethod
def inv_link_calculate(
    self, prediction: Union[int, float, np.ndarray, pd.Series]
) -> Union[int, float, np.ndarray, pd.Series]:
    """
    Calculate the inverse link function (prediction scale back to the linear scale).

    Args:
        prediction (Union[int, float, np.ndarray, pd.Series]): The input value(s).

    Returns:
        Union[int, float, np.ndarray, pd.Series]: The transformed value(s).
    """

Calculate the link function.

Parameters:

Name Type Description Default
inv_prediction Union[int, float, ndarray]

The input value(s).

required

Returns:

Type Description
Union[int, float, ndarray]

Union[int, float, np.ndarray]: The transformed value(s).

Source code in src/aprofs/models.py
46
47
48
49
50
51
52
53
54
55
56
57
58
@abc.abstractmethod
def link_calculate(
    self, inv_prediction: Union[int, float, np.ndarray, pd.Series]
) -> Union[int, float, np.ndarray, pd.Series]:
    """
    Calculate the link function (linear scale onto the prediction scale).

    Args:
        inv_prediction (Union[int, float, np.ndarray, pd.Series]): The input value(s).

    Returns:
        Union[int, float, np.ndarray, pd.Series]: The transformed value(s).
    """

performance_fit(target, prediction) abstractmethod

Calculate the performance of the model.

Parameters:

Name Type Description Default
target ndarray

The true target values.

required
prediction ndarray

The predicted values.

required

Returns:

Name Type Description
float float

The performance metric.

Source code in src/aprofs/models.py
33
34
35
36
37
38
39
40
41
42
43
44
@abc.abstractmethod
def performance_fit(self, target: Union[np.ndarray, pd.Series], prediction: Union[np.ndarray, pd.Series]) -> float:
    """
    Calculate the performance of the model.

    Args:
        target (Union[np.ndarray, pd.Series]): The true target values.
        prediction (Union[np.ndarray, pd.Series]): The predicted values.

    Returns:
        float: The performance metric.
    """

Bases: LinkModels

This class implements the interface for regression with identity link

Source code in src/aprofs/models.py
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
class RegressionIdentityLink(LinkModels):
    """Link model for regression using the identity link (no transformation)."""

    def __init__(self) -> None:
        super().__init__(type_model="regression", type_link="identity", perform="minimize")

    def performance_fit(self, target: Union[np.ndarray, pd.Series], prediction: Union[np.ndarray, pd.Series]) -> float:
        # Root mean squared error: lower is better, matching perform="minimize".
        mse = mean_squared_error(target, prediction)
        return np.sqrt(mse)

    def link_calculate(
        self, inv_prediction: Union[int, float, np.ndarray, pd.Series]
    ) -> Union[int, float, np.ndarray, pd.Series]:
        # Identity link: the linear predictor already lives on the prediction scale.
        return inv_prediction

    def inv_link_calculate(
        self, prediction: Union[int, float, np.ndarray, pd.Series]
    ) -> Union[int, float, np.ndarray, pd.Series]:
        # The inverse of the identity is the identity itself.
        return prediction

    def __repr__(self) -> str:
        return "{}() with type model {} and type link {}".format(
            type(self).__name__, self.type_model, self.type_link
        )

Bases: LinkModels

This class implements the interface for regression with logarithmic link

Source code in src/aprofs/models.py
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
class RegressionLogLink(LinkModels):
    """Link model for regression using the logarithmic link."""

    def __init__(self) -> None:
        super().__init__(type_model="regression", type_link="logarithmic", perform="minimize")

    def performance_fit(self, target: Union[np.ndarray, pd.Series], prediction: Union[np.ndarray, pd.Series]) -> float:
        # Root mean squared error: lower is better, matching perform="minimize".
        mse = mean_squared_error(target, prediction)
        return np.sqrt(mse)

    def link_calculate(
        self, inv_prediction: Union[int, float, np.ndarray, pd.Series]
    ) -> Union[int, float, np.ndarray, pd.Series]:
        # Reject non-numeric inputs before handing them to np.log.
        is_numeric = isinstance(inv_prediction, (int, float, np.ndarray, pd.Series))
        if not is_numeric:
            raise ValueError("Invalid input type for link_calculate")
        return np.log(inv_prediction)

    def inv_link_calculate(
        self, prediction: Union[int, float, np.ndarray, pd.Series]
    ) -> Union[int, float, np.ndarray, pd.Series]:
        # Exponential undoes the logarithmic link.
        return np.exp(prediction)

    def __repr__(self) -> str:
        return "{}() with type model {} and type link {}".format(
            type(self).__name__, self.type_model, self.type_link
        )

Detailed API utilities

Utility functions for the package. This module contains utility functions that are used in the package. The core functions are used to calculate the SHAP values and expected average SHAP value for a given dataset and model.

best_feature(shaps_values, shap_expected_values, link_model, y_target, current_list, candidate_list)

Return the best feature to add to the current list based on the highest AUC score.

Parameters:

Name Type Description Default
shaps_values DataFrame

A DataFrame containing SHAP values for each feature.

required
shap_expected_values Series

A Series containing the expected SHAP values.

required
link_model aprofs model object

An object that allows to calculate the performance of the model.

required
y_target Series

The target variable for the AUC score calculation.

required
current_list list

The current list of features.

required
candidate_list list

The list of candidate features to consider adding.

required

Returns:

Name Type Description
tuple Tuple[str, float]

A tuple containing the best feature to add (str) and the corresponding best AUC score (float).

Raises:

Type Description
ValueError

If candidate_list is empty.

Source code in src/aprofs/utils.py
 88
 89
 90
 91
 92
 93
 94
 95
 96
 97
 98
 99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
def best_feature(  # pylint: disable=too-many-arguments
    shaps_values: pd.DataFrame,
    shap_expected_values: float,
    link_model: LinkModels,
    y_target: pd.Series,
    current_list: List[str],
    candidate_list: List[str],
) -> Tuple[str, float]:
    """
    Return the best feature to add to the current list based on the highest AUC score.

    Args:
        shaps_values (DataFrame): A DataFrame containing SHAP values for each feature.
        shap_expected_values (float): The expected (mean) SHAP value.
        link_model (aprofs model object): An object that allows to calculate the performance of the model.
        y_target (Series): The target variable for the AUC score calculation.
        current_list (list): The current list of features. It is not modified.
        candidate_list (list): The list of candidate features to consider adding.

    Returns:
        tuple: A tuple containing the best feature to add (str) and the corresponding best AUC score (float).

    Raises:
        ValueError: If `candidate_list` is empty.
    """
    # `not candidate_list` covers both None and the empty list.
    if not candidate_list:
        raise ValueError("The candidate list cannot be empty.")

    chosen: str = None
    best_auc: float = 0.0
    for feature in candidate_list:
        # Evaluate the candidate on a copy so the caller's list is never
        # mutated (the previous append/remove pattern was not exception-safe).
        trial_features = current_list + [feature]
        aprox_preds = calculate_row_sum(shaps_values, shap_expected_values, trial_features, link_model)
        auc = roc_auc_score(y_target, aprox_preds)
        if auc > best_auc:
            best_auc = auc
            chosen = feature
    return chosen, best_auc

calculate_all_row_sum(data, mean_value, link_model)

Calculates the row sum of all columns in a Shapley values DataFrame and applies a link function to the result.

Parameters:

Name Type Description Default
data DataFrame

The input Shapley values DataFrame.

required
mean_value float

The mean shapley value to be added to the row sum.

required
link_model aprofs model object

An object that allows to calculate the performance of the model.

required

Returns:

Type Description
Union[float, Series]

Union[float, pd.Series]: The result of applying the link function to the row sum.

Examples:

>>> import pandas as pd
>>> data = pd.DataFrame({'A': [1, 2, 3], 'B': [4, 5, 6]})
>>> mean_value = 10.0
>>> link_function = lambda x: x ** 2
>>> calculate_all_row_sum(data, mean_value, link_function)
225.0
Source code in src/aprofs/utils.py
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
def calculate_all_row_sum(data: pd.DataFrame, mean_value: float, link_model: LinkModels) -> Union[float, pd.Series]:
    """
    Calculates the row sum of **all columns** in a Shapley values DataFrame and applies a link function to the result.

    Args:
        data (pd.DataFrame): The input Shapley values DataFrame.
        mean_value (float): The mean shapley value to be added to the row sum.
        link_model (aprofs model object): An object that allows to calculate the performance of the model.
            Its ``link_calculate`` method is applied to the shifted row sums.

    Returns:
        Union[float, pd.Series]: The result of applying the link function to the row sum.

    Examples:
        >>> import pandas as pd
        >>> data = pd.DataFrame({'A': [1, 2, 3], 'B': [4, 5, 6]})
        >>> # with an identity-link model the result is simply 10 + row sums:
        >>> calculate_all_row_sum(data, 10.0, RegressionIdentityLink())
        0    15.0
        1    17.0
        2    19.0
        dtype: float64
    """
    # Sum every column per row, shift by the expected value, then map through the link.
    row_totals = data.sum(axis=1)
    return link_model.link_calculate(row_totals + mean_value)

calculate_row_sum(data, mean_value, columns, link_model)

Calculates the row sum of specified columns in a Shapley values DataFrame and applies a link function to the result.

Parameters:

Name Type Description Default
data DataFrame

The input DataFrame with shapley values.

required
mean_value float

The mean shapley value to be added to the row sum.

required
columns List[str]

The list of column names to be summed.

required
link_model aprofs model object

An object that allows to calculate the performance of the model.

required

Returns:

Type Description
Union[float, Series]

Union[float, pd.Series]: The result of applying the link function to the row sum.

Examples:

>>> import pandas as pd
>>> data = pd.DataFrame({'A': [1, 2, 3], 'B': [4, 5, 6]})
>>> mean_value = 10.0
>>> columns = ['A', 'B']
>>> link_function = lambda x: x ** 2
>>> calculate_row_sum(data, mean_value, columns, link_function)
225.0
Source code in src/aprofs/utils.py
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
def calculate_row_sum(
    data: pd.DataFrame, mean_value: float, columns: List[str], link_model: LinkModels
) -> Union[float, pd.Series]:
    """
    Calculates the row sum of specified columns in a Shapley values DataFrame and applies a link function to the result.

    Args:
        data (pd.DataFrame): The input DataFrame with shapley values.
        mean_value (float): The mean shapley value to be added to the row sum.
        columns (List[str]): The list of column names to be summed.
        link_model (aprofs model object): An object that allows to calculate the performance of the model.
            Its ``link_calculate`` method is applied to the shifted row sums.

    Returns:
        Union[float, pd.Series]: The result of applying the link function to the row sum.

    Examples:
        >>> import pandas as pd
        >>> data = pd.DataFrame({'A': [1, 2, 3], 'B': [4, 5, 6]})
        >>> # with an identity-link model the result is simply 10 + row sums of A and B:
        >>> calculate_row_sum(data, 10.0, ['A', 'B'], RegressionIdentityLink())
        0    15.0
        1    17.0
        2    19.0
        dtype: float64
    """
    # Restrict to the requested columns, sum per row, shift, then map through the link.
    subset = data[columns]
    return link_model.link_calculate(subset.sum(axis=1) + mean_value)

generate_all_combinations(features)

Generates all possible combinations of the given features list. This will be used to test all possible combinations of features to find the best combination.

Parameters:

Name Type Description Default
features List[str]

A list of features.

required

Returns:

Type Description
List[Tuple[str]]

List[Tuple[str]]: A list of tuples representing all possible combinations of the features.

Source code in src/aprofs/utils.py
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
def generate_all_combinations(features: List[str]) -> List[Tuple[str]]:
    """
    Generates all possible non-empty combinations of the given features list.
    This will be used to test all possible combinations of features to find the best combination.

    Args:
        features (List[str]): A list of features.

    Returns:
        List[Tuple[str]]: A list of tuples representing all possible combinations
        of the features, ordered by combination size.
    """
    # Enumerate combinations of every size from 1 up to len(features).
    return [
        combo
        for size in range(1, len(features) + 1)
        for combo in combinations(features, size)
    ]

get_shap_values(data, model, type_model='tree')

Calculates the SHAP values and expected average shap value for a given dataset and model.

Parameters:

Name Type Description Default
data ndarray or DataFrame

The input dataset.

required
model Callable

The trained model object.

required
type_model str

type of model: tree based or other. If "tree" then TreeExplainer will be used, otherwise a general explainer from the SHAP package is used. Defaults to 'tree'.

'tree'

Returns:

Name Type Description
tuple Tuple[DataFrame, float]

A tuple containing the SHAP values and the Average shap value.

Examples:

>>> # Imports
>>> import numpy as np
>>> from xgboost import XGBClassifier
>>> from sklearn.datasets import load_iris
>>> from sklearn.model_selection import train_test_split
>>> # Imports SHAP Package
>>> import shap
>>>
>>> # Load the iris dataset
>>> iris = load_iris()
>>> X, y = iris.data, iris.target
>>>
>>> # Split the dataset into train and test sets
>>> X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
>>>
>>> # Train a model
>>> model = XGBClassifier()
>>> model.fit(X_train, y_train)
>>>
>>> # Calculate SHAP values and expected value
>>> shap_values, expected_value = get_shap_tree_values(X_test, model)
Source code in src/aprofs/utils.py
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
def get_shap_values(data: pd.DataFrame, model: Callable, type_model="tree") -> Tuple[pd.DataFrame, float]:
    """
    Calculates the SHAP values and expected average shap value for a given dataset and model.

    Args:
        data (numpy.ndarray or pandas.DataFrame): The input dataset.
        model: The trained model object.
        type_model (str): type of model: tree based or other. If "tree" then TreeExplainer will be used, otherwise a general explainer from the SHAP package is used. Defaults to 'tree'.

    Returns:
        tuple: A tuple containing the SHAP values and the Average shap value.

    Examples:
        >>> # Imports
        >>> import numpy as np
        >>> from xgboost import XGBClassifier
        >>> from sklearn.datasets import load_iris
        >>> from sklearn.model_selection import train_test_split
        >>> # Imports SHAP Package
        >>> import shap
        >>>
        >>> # Load the iris dataset
        >>> iris = load_iris()
        >>> X, y = iris.data, iris.target
        >>>
        >>> # Split the dataset into train and test sets
        >>> X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
        >>>
        >>> # Train a model
        >>> model = XGBClassifier()
        >>> model.fit(X_train, y_train)
        >>>
        >>> # Calculate SHAP values and expected value
        >>> shap_values, expected_value = get_shap_values(X_test, model)
    """
    # TreeExplainer is the fast path for tree ensembles; anything else goes
    # through SHAP's generic Explainer. Both expose the same attributes, so
    # only the explainer class differs between the two cases.
    explainer_cls = TreeExplainer if type_model == "tree" else Explainer
    shap_explainer = explainer_cls(model)
    shap_valid = shap_explainer.shap_values(data)
    shap_expected_value = shap_explainer.expected_value

    # Multi-output models return one array per output; stack them column-wise.
    if isinstance(shap_valid, list):
        shap_valid = np.concatenate(shap_valid, axis=1)

    return shap_valid, shap_expected_value

plot_data(temp, main_feature, other_features=None, nbins=20, type_bins='qcut', type_plot='prob')

Plot data based on the provided DataFrame and features.

Parameters:

Name Type Description Default
temp DataFrame

The DataFrame containing the data.

required
main_feature str

The main feature to plot.

required
other_features Optional[Union[str, List[str]]]

Other features to include in the plot. Defaults to None.

None
nbins int

The number of bins. Defaults to 20.

20
type_bins str

The type of binning. Defaults to "qcut".

'qcut'
type_plot str

The type of plot. Defaults to "prob".

'prob'

Returns:

Type Description
None

None

Examples:

>>> temp = pd.DataFrame(...)
>>> plot_data(temp, "main_feature", other_features=["feature_1", "feature_2"], nbins=10, type_bins="cut", type_plot="raw")
Source code in src/aprofs/utils.py
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
def plot_data(  # pylint: disable=too-many-arguments
    temp: pd.DataFrame,
    main_feature: str,
    other_features: Optional[Union[str, List[str]]] = None,
    nbins: int = 20,
    type_bins: str = "qcut",
    type_plot: str = "prob",
) -> None:
    """
    Plot data based on the provided DataFrame and features.

    Args:
        temp (pd.DataFrame): The DataFrame containing the data.
        main_feature (str): The main feature to plot.
        other_features (Optional[Union[str, List[str]]], optional): Other features to include in the plot. Defaults to None.
        nbins (int, optional): The number of bins. Defaults to 20.
        type_bins (str, optional): The type of binning. Defaults to "qcut".
        type_plot (str, optional): The type of plot. Defaults to "prob".

    Returns:
        None

    Raises:
        ValueError: If `type_bins` is neither "cut" nor "qcut" when binning is required.

    Examples:
        >>> temp = pd.DataFrame(...)
        >>> plot_data(temp, "main_feature", other_features=["feature_1", "feature_2"], nbins=10, type_bins="cut", type_plot="raw")
    """
    if other_features is None:
        other_features = []
    if not isinstance(other_features, list):
        other_features = [other_features]
    features = [main_feature, *other_features]

    # Low-cardinality features become categorical bins (one per distinct value);
    # otherwise bin by equal width ("cut") or by quantiles ("qcut").
    if temp[main_feature].unique().shape[0] < 25:
        temp["bins"] = temp[main_feature].astype(str)
    elif type_bins == "cut":
        temp["bins"] = pd.cut(temp[main_feature], bins=nbins)
    elif type_bins == "qcut":
        temp["bins"] = pd.qcut(temp[main_feature], q=nbins)
    else:
        # Fail fast: the previous print() left "bins" unset and the groupby
        # below would raise a confusing KeyError.
        raise ValueError("Invalid type_bins value")

    # Observed target mean per bin.
    means = temp.groupby("bins", observed=True)["target"].mean()

    # SHAP means per bin, on either the raw shap scale or the probability scale.
    means_shap = {}
    if type_plot == "raw":
        for feature in features:
            means_shap[feature] = temp.groupby("bins", observed=True)[f"{feature}_shap"].mean()
        means_shap_others = temp.groupby("bins", observed=True)["shap_other"].mean()
        means_shap_model = temp.groupby("bins", observed=True)["shap_model"].mean()
    else:
        for feature in features:
            means_shap[feature] = temp.groupby("bins", observed=True)[f"{feature}_shap_prob"].mean()
        means_shap_others = temp.groupby("bins", observed=True)["shap_prob_other"].mean()
        means_shap_model = temp.groupby("bins", observed=True)["shap_prob_model"].mean()

    # Relative frequency of each bin (drives the gray bar chart).
    counts = temp["bins"].value_counts(normalize=True).sort_index()

    # Create a figure
    fig = go.Figure()

    # Add bar plot for counts on the primary y-axis
    fig.add_trace(go.Bar(x=counts.index.astype(str), y=counts, name="Data", yaxis="y", marker_color="lightgray"))

    # Add line plots on the secondary y-axis
    fig.add_trace(go.Scatter(x=means.index.astype(str), y=means, mode="lines", name="Observed", yaxis="y2"))

    for feature in features:
        fig.add_trace(
            go.Scatter(
                x=means_shap[feature].index.astype(str),
                y=means_shap[feature],
                mode="lines",
                name=f"{feature} shap Mean",
                yaxis="y2",
            )
        )

    fig.add_trace(
        go.Scatter(
            x=means_shap_others.index.astype(str), y=means_shap_others, mode="lines", name="Others shaps", yaxis="y2"
        )
    )
    fig.add_trace(
        go.Scatter(
            x=means_shap_model.index.astype(str), y=means_shap_model, mode="lines", name="Model shaps", yaxis="y2"
        )
    )

    # Update layout to include a secondary y-axis
    fig.update_layout(
        yaxis={"title": "Counts", "side": "left", "tickformat": ".0%"},
        yaxis2={"title": "Avg.", "side": "right", "overlaying": "y"},
    )
    # Bug fix: label the x-axis with the main feature being plotted, not with
    # the leaked loop variable (previously the last entry of other_features).
    fig.update_xaxes(title_text=main_feature)
    fig.show()

plot_data_compare(temp, feature, nbins=20, type_bins='qcut', type_plot='prob')

Plot data based on the provided DataFrame and feature in a way to compare a specific shap.

Parameters:

Name Type Description Default
temp DataFrame

The DataFrame containing the data.

required
feature str

The main feature to plot.

required
nbins int

The number of bins. Defaults to 20.

20
type_bins str

The type of binning. Defaults to "qcut".

'qcut'
type_plot str

The type of plot. Defaults to "prob".

'prob'

Returns:

Type Description
None

None

Examples:

>>> temp = pd.DataFrame(...)
>>> plot_data(temp, "feature_name", nbins=10, type_bins="cut", type_plot="raw")
Source code in src/aprofs/utils.py
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
def plot_data_compare(  # pylint: disable=too-many-arguments
    temp: pd.DataFrame,
    feature: str,
    nbins: int = 20,
    type_bins: str = "qcut",
    type_plot: str = "prob",
) -> None:
    """
    Plot data based on the provided DataFrame and feature in a way to compare a specific shap.

    Args:
        temp (pd.DataFrame): The DataFrame containing the data.
        feature (str): The main feature to plot.
        nbins (int, optional): The number of bins. Defaults to 20.
        type_bins (str, optional): The type of binning. Defaults to "qcut".
        type_plot (str, optional): The type of plot. Defaults to "prob".

    Returns:
        None

    Raises:
        ValueError: If `type_bins` is neither "cut" nor "qcut" when binning is required.

    Examples:
        >>> temp = pd.DataFrame(...)
        >>> plot_data_compare(temp, "feature_name", nbins=10, type_bins="cut", type_plot="raw")
    """
    # Low-cardinality features become categorical bins (one per distinct value);
    # otherwise bin by equal width ("cut") or by quantiles ("qcut").
    if temp[feature].unique().shape[0] < 25:
        temp["bins"] = temp[feature].astype(str)
    elif type_bins == "cut":
        temp["bins"] = pd.cut(temp[feature], bins=nbins)
    elif type_bins == "qcut":
        temp["bins"] = pd.qcut(temp[feature], q=nbins)
    else:
        # Fail fast: the previous print() left "bins" unset and the groupby
        # below would raise a confusing KeyError.
        raise ValueError("Invalid type_bins value")

    # Observed target mean per bin.
    means = temp.groupby("bins", observed=True)["target"].mean()

    means_shap = {}
    if type_plot == "raw":
        means_shap[feature] = temp.groupby("bins", observed=True)[f"{feature}_shap"].mean()
        # Bug fix: store the comparison series under the "<feature>_compare"
        # key that the plotting code below reads. The raw branch previously
        # used "<feature>_shap_compare" as the dict key, which guaranteed a
        # KeyError when plotting in raw mode.
        means_shap[f"{feature}_compare"] = temp.groupby("bins", observed=True)[f"{feature}_shap_compare"].mean()
        means_shap_model = temp.groupby("bins", observed=True)["shap_model"].mean()
    else:
        means_shap[feature] = temp.groupby("bins", observed=True)[f"{feature}_shap_prob"].mean()
        means_shap[f"{feature}_compare"] = temp.groupby("bins", observed=True)[f"{feature}_shap_prob_compare"].mean()
        means_shap_model = temp.groupby("bins", observed=True)["shap_prob_model"].mean()

    # Relative frequency of each bin (drives the gray bar chart).
    counts = temp["bins"].value_counts(normalize=True).sort_index()

    # Create a figure
    fig = go.Figure()

    # Add bar plot for counts on the primary y-axis
    fig.add_trace(go.Bar(x=counts.index.astype(str), y=counts, name="Data", yaxis="y", marker_color="lightgray"))

    # Add line plots on the secondary y-axis
    fig.add_trace(go.Scatter(x=means.index.astype(str), y=means, mode="lines", name="Observed", yaxis="y2"))

    fig.add_trace(
        go.Scatter(
            x=means_shap[feature].index.astype(str),
            y=means_shap[feature],
            mode="lines",
            name=f"{feature} shap Mean",
            yaxis="y2",
        )
    )

    fig.add_trace(
        go.Scatter(
            x=means_shap[f"{feature}_compare"].index.astype(str),
            y=means_shap[f"{feature}_compare"],
            mode="lines",
            name=f"{feature} shap Mean compare",
            yaxis="y2",
        )
    )

    fig.add_trace(
        go.Scatter(
            x=means_shap_model.index.astype(str), y=means_shap_model, mode="lines", name="Model shaps", yaxis="y2"
        )
    )

    # Update layout to include a secondary y-axis
    fig.update_layout(
        yaxis={"title": "Counts", "side": "left", "tickformat": ".0%"},
        yaxis2={"title": "Avg.", "side": "right", "overlaying": "y"},
    )
    fig.update_xaxes(title_text=feature)
    fig.show()

plot_data_neutral(data, feature, nbins=20, type_bins='qcut', type_plot='prob')

Plot data based on the provided neutralized DataFrame and features.

Parameters:

Name Type Description Default
data DataFrame

The DataFrame containing the neutralize shap data.

required
feature str

The main feature to plot on the x-axis.

required
nbins int

The number of bins. Defaults to 20.

20
type_bins str

The type of binning. Defaults to "qcut".

'qcut'
type_plot str

The type of plot. Defaults to "prob".

'prob'

Returns:

Type Description
None

None

Examples:

>>> temp = pd.DataFrame(...)
>>> plot_data_neutral(temp, "main_feature", other_features=["feature_1", "feature_2"], nbins=10, type_bins="cut", type_plot="raw")
Source code in src/aprofs/utils.py
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
def plot_data_neutral(  # pylint: disable=too-many-arguments
    data: pd.DataFrame,
    feature: str,
    nbins: int = 20,
    type_bins: str = "qcut",
    type_plot: str = "prob",
) -> None:
    """
    Plot data based on the provided neutralized DataFrame and features.

    Draws the binned means of the observed target, the neutralized SHAP
    predictions and the original model SHAP predictions as lines on a
    secondary y-axis, with the normalized bin counts as gray bars on the
    primary y-axis.

    Note: this adds a "bins" column to `data` in place.

    Args:
        data (pd.DataFrame): The DataFrame containing the neutralized SHAP data.
            Must contain "target", plus "shap_other"/"shap_model" (when
            type_plot="raw") or "shap_prob_other"/"shap_prob_model" (otherwise).
        feature (str): The main feature to plot on the x-axis.
        nbins (int, optional): The number of bins. Defaults to 20.
        type_bins (str, optional): The type of binning, "cut" (equal width) or
            "qcut" (equal frequency). Ignored when the feature has fewer than
            25 distinct values, in which case each value is its own bin.
            Defaults to "qcut".
        type_plot (str, optional): The type of plot, "raw" (link scale) or
            "prob" (probability scale). Defaults to "prob".

    Returns:
        None

    Raises:
        ValueError: If `type_bins` is neither "cut" nor "qcut" and binning
            is required.

    Examples:
        >>> temp = pd.DataFrame(...)
        >>> plot_data_neutral(temp, "main_feature", nbins=10, type_bins="cut", type_plot="raw")
    """

    if data[feature].unique().shape[0] < 25:
        # Few distinct values: treat each raw value as its own bin.
        data["bins"] = data[feature].astype(str)
    elif type_bins == "cut":
        data["bins"] = pd.cut(data[feature], bins=nbins)
    elif type_bins == "qcut":
        data["bins"] = pd.qcut(data[feature], q=nbins)
    else:
        # Fail fast: the original print-and-continue would crash later with a
        # confusing KeyError on the missing "bins" column.
        raise ValueError(f"Invalid type_bins value: {type_bins!r}. Use 'cut' or 'qcut'.")

    # Calculate the means for each bin
    means = data.groupby("bins", observed=True)["target"].mean()

    # Pick link-scale or probability-scale SHAP columns.
    if type_plot == "raw":
        means_shap_others = data.groupby("bins", observed=True)["shap_other"].mean()
        means_shap_model = data.groupby("bins", observed=True)["shap_model"].mean()
    else:
        means_shap_others = data.groupby("bins", observed=True)["shap_prob_other"].mean()
        means_shap_model = data.groupby("bins", observed=True)["shap_prob_model"].mean()

    # Share of observations in each bin (drives the bar heights).
    counts = data["bins"].value_counts(normalize=True).sort_index()

    # Create a figure
    fig = go.Figure()

    # Add bar plot for counts on the primary y-axis
    fig.add_trace(go.Bar(x=counts.index.astype(str), y=counts, name="Data", yaxis="y", marker_color="lightgray"))

    # Add line plots on the secondary y-axis
    fig.add_trace(go.Scatter(x=means.index.astype(str), y=means, mode="lines", name="Observed", yaxis="y2"))

    fig.add_trace(
        go.Scatter(
            x=means_shap_others.index.astype(str),
            y=means_shap_others,
            mode="lines",
            name="Neutralized shaps",
            yaxis="y2",
        )
    )
    fig.add_trace(
        go.Scatter(
            x=means_shap_model.index.astype(str),
            y=means_shap_model,
            mode="lines",
            name="Original Model shaps",
            yaxis="y2",
        )
    )

    # Update layout to include a secondary y-axis
    fig.update_layout(
        yaxis={"title": "Counts", "side": "left", "tickformat": ".0%"},
        yaxis2={"title": "Avg.", "side": "right", "overlaying": "y"},
    )
    # Add title to x-axis
    fig.update_xaxes(title_text=feature)

    fig.show()

random_sort_shaps(shaps_values, shap_expected_value, feature_name, y_target, link_model)

Randomly shuffles the values of a specific feature in the SHAP values DataFrame, calculates the row sum, and returns the ROC AUC score.

Parameters:

Name Type Description Default
shaps_values DataFrame

The SHAP values DataFrame.

required
shap_expected_value float

The expected SHAP value.

required
feature_name str

The name of the feature to shuffle.

required
y_target Union[Series, ndarray]

The target variable.

required
link_model aprofs model object

An object that allows to calculate the performance of the model.

required

Returns:

Name Type Description
float float

The ROC AUC score.

Examples:

>>> import pandas as pd
>>> import numpy as np
>>> from sklearn.datasets import make_classification
>>> from sklearn.linear_model import LogisticRegression
>>> from sklearn.model_selection import train_test_split
>>> from sklearn.metrics import roc_auc_score
>>>
>>> # Generate synthetic data
>>> X, y = make_classification(n_samples=100, n_features=5, random_state=42)
>>> X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
>>>
>>> # Train a logistic regression model
>>> model = LogisticRegression()
>>> model.fit(X_train, y_train)
>>>
>>> # Calculate SHAP values and expected value
>>> shap_values, expected_value = get_shap_tree_values(X_test, model)
>>>
>>> # Calculate ROC AUC score with shuffled feature
>>> roc_score = random_sort_shaps(shap_values, expected_value, 'feature_1', y_test, link_function='logistic')
>>> print(roc_score)
Source code in src/aprofs/utils.py
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
def random_sort_shaps(
    shaps_values: pd.DataFrame,
    shap_expected_value: float,
    feature_name: str,
    y_target: Union[pd.Series, np.ndarray],
    link_model: LinkModels,
) -> float:
    """
    Measure model performance after shuffling one feature's SHAP column.

    The column `feature_name` of the SHAP values table is replaced by a
    random permutation of itself, approximate predictions are rebuilt from
    the perturbed table, and the resulting performance score (e.g. ROC AUC)
    is returned.

    Args:
        shaps_values (pd.DataFrame): The SHAP values DataFrame.
        shap_expected_value (float): The expected SHAP value.
        feature_name (str): The name of the feature to shuffle.
        y_target (Union[pd.Series, np.ndarray]): The target variable.
        link_model (aprofs model object): An object that allows to calculate the performance of the model.

    Returns:
        float: The performance score (e.g. ROC AUC).

    Examples:
        >>> shap_values, expected_value = get_shap_tree_values(X_test, model)
        >>> roc_score = random_sort_shaps(shap_values, expected_value, 'feature_1', y_test, link_model)
        >>> print(roc_score)
    """
    # Random permutation of every row, re-indexed positionally so a single
    # column can be assigned back row-by-row.
    shuffled = shaps_values.sample(frac=1)
    shuffled.reset_index(inplace=True, drop=True)

    perturbed = shaps_values.copy()
    perturbed.reset_index(inplace=True, drop=True)

    # Only the chosen feature's contributions are scrambled; all other
    # columns keep their original, row-aligned values.
    perturbed[feature_name] = shuffled[feature_name]
    approx_preds = calculate_all_row_sum(perturbed, shap_expected_value, link_model)

    return link_model.performance_fit(y_target, approx_preds)

random_sort_shaps_column(shaps_values, shap_mean_value, target_column, feature, link_model, original=False)

Randomly shuffles the values of a specific feature in the SHAP values DataFrame, calculates the row sum, and returns the ROC AUC score.

Parameters:

Name Type Description Default
shaps_values DataFrame

The SHAP values DataFrame.

required
shap_mean_value float

The mean SHAP value.

required
target_column Union[Series, ndarray]

The target variable.

required
feature str

The name of the feature to shuffle.

required
link_model aprofs model object

An object that allows to calculate the performance of the model.

required
original bool

Whether to use the original feature values or shuffled values. Defaults to False.

False

Returns:

Name Type Description
float float

The ROC AUC score.

Examples:

>>> import pandas as pd
>>> import numpy as np
>>> from sklearn.datasets import make_classification
>>> from sklearn.linear_model import LogisticRegression
>>> from sklearn.model_selection import train_test_split
>>> from sklearn.metrics import roc_auc_score
>>>
>>> # Generate synthetic data
>>> X, y = make_classification(n_samples=100, n_features=5, random_state=42)
>>> X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
>>>
>>> # Train a logistic regression model
>>> model = LogisticRegression()
>>> model.fit(X_train, y_train)
>>>
>>> # Calculate SHAP values and expected value
>>> shap_values, expected_value = get_shap_tree_values(X_test, model)
>>>
>>> # Calculate ROC AUC score with shuffled feature
>>> roc_score = random_sort_shaps_column(shap_values, expected_value, y_test, 'feature_1', link_function='logistic')
>>> print(roc_score)
Source code in src/aprofs/utils.py
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
def random_sort_shaps_column(  # pylint: disable=too-many-arguments
    shaps_values: pd.DataFrame,
    shap_mean_value: float,
    target_column: Union[pd.Series, np.ndarray],
    feature: str,
    link_model: LinkModels,
    original: bool = False,
) -> float:
    """
    Measure the standalone contribution of a single feature's SHAP column.

    Every column of the SHAP table is flattened to its column mean, except
    `feature`, which keeps either its original values (original=True) or a
    random permutation of them (original=False). Approximate predictions are
    rebuilt from this table and the performance score is returned.

    Args:
        shaps_values (pd.DataFrame): The SHAP values DataFrame.
        shap_mean_value (float): The mean SHAP value.
        target_column (Union[pd.Series, np.ndarray]): The target variable.
        feature (str): The name of the feature to shuffle.
        link_model (aprofs model object): An object that allows to calculate the performance of the model.
        original (bool, optional): Whether to use the original feature values or shuffled values. Defaults to False.

    Returns:
        float: The performance score (e.g. ROC AUC).

    Examples:
        >>> shap_values, expected_value = get_shap_tree_values(X_test, model)
        >>> roc_score = random_sort_shaps_column(shap_values, expected_value, y_test, 'feature_1', link_model)
        >>> print(roc_score)
    """
    # Row-aligned copy of the untouched SHAP values.
    kept = shaps_values.copy()
    kept.reset_index(inplace=True, drop=True)

    # Randomly permuted version of the same table (always computed, so RNG
    # state consumption does not depend on `original`).
    permuted = shaps_values.sample(frac=1)
    permuted.reset_index(inplace=True, drop=True)

    # Neutral table: each column replaced by its mean, removing per-row
    # segmentation from every feature.
    column_means = shaps_values.mean()
    neutral = shaps_values.copy()
    neutral.reset_index(inplace=True, drop=True)
    for column in neutral.columns:
        neutral[column] = column_means[column]

    # Restore exactly one column, either intact or shuffled.
    if original:
        neutral[feature] = kept[feature]
    else:
        neutral[feature] = permuted[feature]

    approx_preds = calculate_all_row_sum(neutral, shap_mean_value, link_model)

    return link_model.performance_fit(target_column, approx_preds)

temp_neutral_plot_data(aprofs_obj, features)

Generate a temporary DataFrame for plotting purposes.

Parameters:

Name Type Description Default
aprofs_obj Aprofs Object

An instance of the Aprofs class.

required
features List[str]

A list of feature names that will be neutralized. The Shapley values for these will be just the average values. This way we break the segmentation of the features while maintaining the global effect of all the others.

required

Returns:

Type Description
DataFrame

pd.DataFrame: The temporary DataFrame.

Examples:

>>> aprofs_obj = Aprofs Object(...)
>>> features = ['feature_1', 'feature_2']
>>> temp = temp_neutral_plot_data(aprofs_obj, features)
>>> print(temp.head())
Source code in src/aprofs/utils.py
687
688
689
690
691
692
693
694
695
696
697
698
699
700
701
702
703
704
705
706
707
708
709
710
711
712
713
714
715
716
717
718
719
720
721
722
723
724
725
726
727
def temp_neutral_plot_data(aprofs_obj, features: List[str]) -> pd.DataFrame:
    """
    Build a temporary DataFrame where the given features are "neutralized".

    The listed features' SHAP contributions are replaced by the average of
    their combined per-row contribution: their segmentation disappears while
    their overall (mean) effect is kept. All other features keep their SHAP
    values unchanged.

    Args:
        aprofs_obj (Aprofs Object): An instance of the Aprofs class.
        features (List[str]): Feature names to neutralize. A single name is
            also accepted and is wrapped into a list.

    Returns:
        pd.DataFrame: Columns "target", the raw values of each requested
        feature, "shap_other"/"shap_prob_other" (neutralized link and
        probability) and "shap_model"/"shap_prob_model" (full model).

    Examples:
        >>> temp = temp_neutral_plot_data(aprofs_obj, ['feature_1', 'feature_2'])
        >>> print(temp.head())
    """
    feature_list = features if isinstance(features, list) else [features]

    temp = pd.DataFrame({"target": aprofs_obj.target_column})

    # Attach raw feature values for binning/plotting on the x-axis.
    for name in feature_list:
        temp[name] = aprofs_obj.current_data[name].values

    shap_table = aprofs_obj.shap_values
    other_columns = [col for col in shap_table.columns if col not in feature_list]
    # Average of the neutralized features' combined contribution: keeps their
    # mean effect while removing per-row segmentation.
    neutral_offset = shap_table[feature_list].sum(axis=1).mean()

    temp["shap_other"] = aprofs_obj.shap_mean + shap_table[other_columns].sum(axis=1) + neutral_offset
    temp["shap_prob_other"] = 1.0 / (1.0 + np.exp(-temp["shap_other"]))
    temp["shap_model"] = aprofs_obj.shap_mean + shap_table.sum(axis=1)
    temp["shap_prob_model"] = 1.0 / (1.0 + np.exp(-temp["shap_model"]))

    return temp

temp_plot_compare_data(aprofs_obj_self, aprofs_obj, feature)

Generate a temporary DataFrame for plotting purposes.

Parameters:

Name Type Description Default
aprofs_obj_self Aprofs Object

An instance of the Aprofs class.

required
aprofs_obj Aprofs Object

An instance of the Aprofs class.

required
feature str

feature to compare.

required

Returns:

Type Description
DataFrame

pd.DataFrame: The temporary DataFrame.

Examples:

>>> aprofs_obj = Aprofs Object(...)
>>> aprofs_obj_2_compare = Aprofs Object(...)
>>> feature = 'feature_1'
>>> temp = temp_plot_compare_data(aprofs_obj, aprofs_obj_2_compare, feature)
>>> print(temp.head())
Source code in src/aprofs/utils.py
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
def temp_plot_compare_data(aprofs_obj_self, aprofs_obj, feature: str) -> pd.DataFrame:
    """
    Build a temporary DataFrame comparing one feature's SHAP effect across two models.

    Args:
        aprofs_obj_self (Aprofs Object): The reference Aprofs instance; supplies
            the target, the raw feature values and the first model's SHAP values.
        aprofs_obj (Aprofs Object): The Aprofs instance to compare against.
        feature (str): The feature to compare.

    Returns:
        pd.DataFrame: Columns "target", the raw feature values, the feature's
        marginal SHAP link/probability for both models, and the comparison
        model's full prediction ("shap_model"/"shap_prob_model").

    Examples:
        >>> feature = 'feature_1'
        >>> temp = temp_plot_compare_data(aprofs_obj, aprofs_obj_2_compare, feature)
        >>> print(temp.head())
    """

    temp = pd.DataFrame({"target": aprofs_obj_self.target_column})

    # Marginal effect of the feature under the first (self) model.
    temp[feature] = aprofs_obj_self.current_data[feature].values
    self_link = aprofs_obj_self.shap_mean + aprofs_obj_self.shap_values[feature].values
    temp[f"{feature}_shap"] = self_link
    temp[f"{feature}_shap_prob"] = 1.0 / (1.0 + np.exp(-self_link))

    # Marginal effect of the same feature under the comparison model.
    compare_link = aprofs_obj.shap_mean + aprofs_obj.shap_values[feature].values
    temp[f"{feature}_shap_compare"] = compare_link
    temp[f"{feature}_shap_prob_compare"] = 1.0 / (1.0 + np.exp(-compare_link))

    # Full prediction of the comparison model on the link and probability scales.
    temp["shap_model"] = aprofs_obj.shap_mean + aprofs_obj.shap_values.sum(axis=1)
    temp["shap_prob_model"] = 1.0 / (1.0 + np.exp(-temp["shap_model"]))

    return temp

temp_plot_data(aprofs_obj, features)

Generate a temporary DataFrame for plotting purposes.

Parameters:

Name Type Description Default
aprofs_obj Aprofs Object

An instance of the Aprofs class.

required
features List[str]

A list of feature names.

required

Returns:

Type Description
DataFrame

pd.DataFrame: The temporary DataFrame.

Examples:

>>> aprofs_obj = Aprofs Object(...)
>>> features = ['feature_1', 'feature_2']
>>> temp = temp_plot_data(aprofs_obj, features)
>>> print(temp.head())
Source code in src/aprofs/utils.py
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
def temp_plot_data(aprofs_obj, features: List[str]) -> pd.DataFrame:
    """
    Build a temporary DataFrame with per-feature SHAP marginal effects.

    Args:
        aprofs_obj (Aprofs Object): An instance of the Aprofs class.
        features (List[str]): Feature names to include. A single name is
            also accepted and is wrapped into a list.

    Returns:
        pd.DataFrame: Columns "target", then for each feature its raw values,
        "<feature>_shap" and "<feature>_shap_prob"; plus the combined effect
        of all remaining features ("shap_other"/"shap_prob_other") and the
        full model prediction ("shap_model"/"shap_prob_model").

    Examples:
        >>> temp = temp_plot_data(aprofs_obj, ['feature_1', 'feature_2'])
        >>> print(temp.head())
    """
    feature_list = features if isinstance(features, list) else [features]

    temp = pd.DataFrame({"target": aprofs_obj.target_column})

    shap_mean = aprofs_obj.shap_mean
    shap_table = aprofs_obj.shap_values

    # Per-feature marginal effect on the link and probability scales.
    for name in feature_list:
        temp[name] = aprofs_obj.current_data[name].values
        marginal = shap_mean + shap_table[name].values
        temp[f"{name}_shap"] = marginal
        temp[f"{name}_shap_prob"] = 1.0 / (1.0 + np.exp(-marginal))

    # Combined contribution of every feature NOT in the requested list.
    remaining = [col for col in shap_table.columns if col not in feature_list]
    temp["shap_other"] = shap_mean + shap_table[remaining].sum(axis=1)
    temp["shap_prob_other"] = 1.0 / (1.0 + np.exp(-temp["shap_other"]))
    temp["shap_model"] = shap_mean + shap_table.sum(axis=1)
    temp["shap_prob_model"] = 1.0 / (1.0 + np.exp(-temp["shap_model"]))

    return temp