Skip to content

References

Core code development of the Aprofs class.

This class computes the SHAP values of a model and evaluates feature performance based on those values. It also provides methods to visualize the marginal effect of a feature on the target variable, to perform feature selection driven by the SHAP values, to compute p-values for the features' SHAP contributions, and to measure model performance from the SHAP values of a selected feature subset.

Aprofs

Aprofs Class

A class for analyzing SHAP values using approximate predictions.

Attributes:

Name Type Description
current_data DataFrame

The current data.

target_column Series

The target column.

link_model LinkModels

The link model, providing the link function and the performance metric.

shap_mean float

The mean SHAP value. None if SHAP values have not been calculated.

shap_values DataFrame

The SHAP values. None if SHAP values have not been calculated.

Source code in src/aprofs/code.py
 28
 29
 30
 31
 32
 33
 34
 35
 36
 37
 38
 39
 40
 41
 42
 43
 44
 45
 46
 47
 48
 49
 50
 51
 52
 53
 54
 55
 56
 57
 58
 59
 60
 61
 62
 63
 64
 65
 66
 67
 68
 69
 70
 71
 72
 73
 74
 75
 76
 77
 78
 79
 80
 81
 82
 83
 84
 85
 86
 87
 88
 89
 90
 91
 92
 93
 94
 95
 96
 97
 98
 99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
class Aprofs:
    """
    Aprofs Class

    A class for analyzing SHAP values using approximate predictions.
    --------------------------------------------------------------


    Attributes:
        current_data (pd.DataFrame): The current data.
        target_column (Series): The target column.
        link_model (LinkModels): The link model, providing the link function and the performance metric.
        shap_mean (float): The mean SHAP value. None if SHAP values have not been calculated.
        shap_values (DataFrame): The SHAP values. None if SHAP values have not been calculated.

    """

    def __init__(self, current_data, target_column, link_model: LinkModels):
        self.current_data = current_data
        self.target_column = target_column
        # Fall back to a logistic (classification) link when no link model is supplied.
        self.link_model = ClassificationLogisticLink() if link_model is None else link_model
        self.shap_mean: float = None  # None until calculate_shaps() has run
        self.shap_values: pd.DataFrame = None  # None until calculate_shaps() has run

    def __repr__(self):
        return (
            f"Aprofs(current_data shape ={self.current_data.shape}, target_column ={self.target_column.unique()}"
            + (
                f", shap_mean={self.shap_mean}, shap_values.shape={self.shap_values.shape}"
                if self.shap_mean is not None
                else "\n  Shapley values have not been calculated!"
            )
        )

    def _validate_features(self, features) -> None:
        """Raise a ValueError listing every requested feature that has no SHAP values column."""
        missing_features = [feature for feature in features if feature not in self.shap_values.columns]
        if missing_features:
            raise ValueError(f"The following features are missing in the SHAP values: {missing_features}")

    def calculate_shaps(self, model: Any, type_model="tree") -> None:
        """
        Calculate the SHAP values for the given model.

        Parameters:
            model (Any): The trained model for which to calculate the SHAP values.
            type_model (str): type of model: tree based or other. If "tree" then TreeExplainer will be use, otherwise a general explainer from the SHAP package is used. Defaults to 'tree'.


        Returns:
            None
        """
        shap_values, shap_mean = utils.get_shap_values(self.current_data, model, type_model)
        self.shap_values = pd.DataFrame(shap_values, index=self.current_data.index, columns=self.current_data.columns)
        self.shap_mean = shap_mean

    def get_feature_performance(self, features: List[str]) -> float:
        """
        Calculate the performance of the features based on the SHAP values.

        Parameters:
            features (List[str]): The list of features for which to calculate the performance.

        Returns:
            float: The performance of the features based on the SHAP values.

        Raises:
            ValueError: If any feature is missing in the SHAP values.
        """
        self._validate_features(features)
        return self.link_model.performance_fit(
            self.target_column, utils.calculate_row_sum(self.shap_values, self.shap_mean, features, self.link_model)
        )

    def brute_force_selection(self, features: List[str]) -> List[str]:
        """
        Perform brute force feature selection by evaluating the performance of all possible combinations of features.

        Parameters:
            features (List[str]): The list of features to consider for feature selection.

        Returns:
            List[str]: The best list of features with the highest performance.

        Raises:
            ValueError: If any feature is missing in the SHAP values.
        """
        self._validate_features(features)

        # Respect the metric direction declared by the link model instead of
        # assuming a maximize metric (consistent with gready_forward_selection).
        maximize = self.link_model.perform == "maximize"
        best_performance = float("-inf") if maximize else float("inf")
        best_list: List[str] = []
        all_combinations = list(utils.generate_all_combinations(features))
        for comb in tqdm(all_combinations, desc=f"Processing {len(all_combinations)} combinations"):
            current_performance = self.get_feature_performance(list(comb))
            improved = current_performance > best_performance if maximize else current_performance < best_performance
            if improved:
                best_performance = current_performance
                best_list = comb
        print(f"the best list is {best_list} with performance {best_performance}")
        return list(best_list)

    def gready_forward_selection(self, features: List[str], greediness: float = 0.001) -> List[str]:
        """
        Perform gready forward feature selection by evaluating the performance of all possible combinations of features.

        Parameters:
            features (List[str]): The list of features to consider for feature selection.
            greediness (float): The greediness factor, how much better needs to be the performance to add the feature. Default is 0.001.

        Returns:
            List[str]: The best list of features with the highest performance.

        Raises:
            ValueError: If any feature is missing in the SHAP values.
        """
        self._validate_features(features)

        best_list: List = []
        candidate_list: List[str] = features.copy()
        # Start from the metric's worst value so the first candidate can be accepted for
        # both directions (a 0.0 start wrongly rejects every feature for minimize metrics).
        maximize = self.link_model.perform == "maximize"
        best_performance = float("-inf") if maximize else float("inf")
        while len(candidate_list) > 0:
            best_feature_, best_performance_ = utils.best_feature(
                self.shap_values, self.shap_mean, self.link_model, self.target_column, best_list, candidate_list
            )
            candidate_list.remove(best_feature_)

            if maximize:
                # Accept unless the current score already beats the candidate plus the greediness margin.
                accept = best_performance <= best_performance_ * (1 + greediness)
            else:
                accept = best_performance >= best_performance_ * (1 - greediness)

            if accept:
                best_performance = best_performance_
                best_list.append(best_feature_)
                print(f"the best feature to add is {best_feature_} with performance {best_performance_}")
            else:
                print(f"The feature {best_feature_} wont be added")

        return best_list

    def get_shap_p_value(self, features: List[str], suffle_size: int = 500) -> pd.DataFrame:
        """
        Calculate the p-values of the SHAP values of the features.

        Parameters:
            features (List[str]): The list of features for which to calculate the p-values.
            suffle_size (int): The number of shuffling iterations to perform. Default is 500.

        Returns:
            pd.DataFrame: A DataFrame containing the features and their corresponding p-values.

        Raises:
            ValueError: If any feature is missing in the SHAP values.
        """
        self._validate_features(features)

        p_values = []
        # Baseline: performance of the approximation built from ALL features.
        performance_threshold = self.get_feature_performance(self.shap_values.columns)
        for feature in tqdm(features):
            samples = [
                utils.random_sort_shaps(self.shap_values, self.shap_mean, feature, self.target_column, self.link_model)
                for _ in range(suffle_size)
            ]
            # Fraction of shuffles that beat the baseline = empirical p-value.
            count = sum(sample > performance_threshold for sample in samples)
            p_values.append(count / suffle_size)

        return pd.DataFrame({"Feature": features, "p-value_shap": p_values})

    def visualize_feature(  # pylint: disable=too-many-arguments
        self,
        main_feature: str,
        other_features: List[str] = None,
        nbins: int = 20,
        type_bins: str = "qcut",
        type_plot: str = "prob",
    ) -> None:
        """
        Visualize the marginal effect of a feature on the target variable.

        Parameters:
            main_feature (str): The main feature for which to visualize the marginal effect.
            other_features (List[str]): The list of other features to include in the visualization. Default is None.
            nbins (int): The number of bins to use for the visualization. Default is 20.
            type_bins (str): The type of binning to use. Default is "qcut".
            type_plot (str): The type of plot to generate. Default is "prob".

        Returns:
            None

        Raises:
            ValueError: If any feature is missing in the SHAP values dataframe.
        """
        # generate data to plot marginal effect shapley values
        if other_features is None:
            other_features = []
        features = [main_feature, *other_features]

        self._validate_features(features)

        temp_data = utils.temp_plot_data(self, features)
        # call plotting function
        utils.plot_data(
            temp_data,
            main_feature,
            other_features=other_features,
            nbins=nbins,
            type_bins=type_bins,
            type_plot=type_plot,
        )

    def compare_feature(  # pylint: disable=too-many-arguments
        self,
        other,
        feature: str,
        nbins: int = 20,
        type_bins: str = "qcut",
        type_plot: str = "prob",
    ) -> None:
        """
        Visualize the marginal effect of a feature, comparing this object with another Aprofs object.

        Parameters:
            other (Aprofs): The other Aprofs object to compare against.
            feature (str): The main feature for which to visualize the marginal effect.
            nbins (int): The number of bins to use for the visualization. Default is 20.
            type_bins (str): The type of binning to use. Default is "qcut".
            type_plot (str): The type of plot to generate. Default is "prob".

        Returns:
            None

        Raises:
            ValueError: If `other` is not an Aprofs object or the feature is missing in the SHAP values.
        """

        if not isinstance(other, Aprofs):
            raise ValueError("Can only compare with another Aprofs object")

        if feature not in self.shap_values.columns:
            raise ValueError(f"The following feature are missing in the SHAP values: {feature}")

        temp_data = utils.temp_plot_compare_data(self, other, feature)
        # call plotting function
        utils.plot_data_compare(
            temp_data,
            feature,
            nbins=nbins,
            type_bins=type_bins,
            type_plot=type_plot,
        )

    def visualize_neutralized_feature(  # pylint: disable=too-many-arguments
        self,
        main_feature: str,
        neutralize_features: List[str] = None,
        nbins: int = 20,
        type_bins: str = "qcut",
        type_plot: str = "prob",
    ) -> None:
        """
        Visualize the marginal effect of a feature on the target variable after neutralizing the effect of other features.

        Parameters:
            main_feature (str): The main feature for which to visualize the marginal effect.
            neutralize_features (List[str]): The list of other features to be neutralized.
            nbins (int): The number of bins to use for the visualization. Default is 20.
            type_bins (str): The type of binning to use. Default is "qcut".
            type_plot (str): The type of plot to generate. Default is "prob".

        Returns:
            None

        Raises:
            ValueError: If any feature is missing in the SHAP values dataframe.
        """
        # generate data to plot marginal effect shapley values
        if neutralize_features is None:
            neutralize_features = []
        elif not isinstance(neutralize_features, list):
            # Allow a single feature name to be passed without wrapping it in a list.
            neutralize_features = [neutralize_features]

        features = list({main_feature, *neutralize_features})  # remove duplicates

        self._validate_features(features)

        temp_data = utils.temp_neutral_plot_data(self, neutralize_features)
        temp_data[main_feature] = self.current_data[main_feature]
        # call plotting function
        utils.plot_data_neutral(
            temp_data,
            main_feature,
            nbins=nbins,
            type_bins=type_bins,
            type_plot=type_plot,
        )

brute_force_selection(features)

Perform brute force feature selection by evaluating the performance of all possible combinations of features.

Parameters:

Name Type Description Default
features List[str]

The list of features to consider for feature selection.

required

Returns:

Type Description
List[str]

List[str]: The best list of features with the highest performance.

Raises:

Type Description
ValueError

If any feature is missing in the SHAP values.

Source code in src/aprofs/code.py
 99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
def brute_force_selection(self, features: List[str]) -> List[str]:
    """
    Perform brute force feature selection by evaluating the performance of all possible combinations of features.

    Parameters:
        features (List[str]): The list of features to consider for feature selection.

    Returns:
        List[str]: The best list of features with the highest performance.

    Raises:
        ValueError: If any feature is missing in the SHAP values.
    """
    missing_features = [feature for feature in features if feature not in self.shap_values.columns]
    if missing_features:
        raise ValueError(f"The following features are missing in the SHAP values: {missing_features}")

    # Respect the metric direction declared by the link model instead of
    # assuming a maximize metric (consistent with gready_forward_selection).
    maximize = self.link_model.perform == "maximize"
    best_performance = float("-inf") if maximize else float("inf")
    best_list: List[str] = []
    all_combinations = list(utils.generate_all_combinations(features))
    for comb in tqdm(all_combinations, desc=f"Processing {len(all_combinations)} combinations"):
        current_performance = self.get_feature_performance(list(comb))
        improved = current_performance > best_performance if maximize else current_performance < best_performance
        if improved:
            best_performance = current_performance
            best_list = comb
    print(f"the best list is {best_list} with performance {best_performance}")
    return list(best_list)

calculate_shaps(model, type_model='tree')

Calculate the SHAP values for the given model.

Parameters:

Name Type Description Default
model Any

The trained model for which to calculate the SHAP values.

required
type_model str

type of model: tree based or other. If "tree" then TreeExplainer will be use, otherwise a general explainer from the SHAP package is used. Defaults to 'tree'.

'tree'

Returns:

Type Description
None

None

Source code in src/aprofs/code.py
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
def calculate_shaps(self, model: Any, type_model="tree") -> None:
    """
    Compute and store the SHAP values of the given model on the current data.

    Parameters:
        model (Any): The trained model for which to calculate the SHAP values.
        type_model (str): type of model: tree based or other. If "tree" then TreeExplainer will be use, otherwise a general explainer from the SHAP package is used. Defaults to 'tree'.


    Returns:
        None
    """
    raw_values, mean_value = utils.get_shap_values(self.current_data, model, type_model)
    self.shap_mean = mean_value
    self.shap_values = pd.DataFrame(
        raw_values,
        index=self.current_data.index,
        columns=self.current_data.columns,
    )

compare_feature(other, feature, nbins=20, type_bins='qcut', type_plot='prob')

Visualize the marginal effect of a feature on the target variable.

Parameters:

Name Type Description Default
feature str

The main feature for which to visualize the marginal effect.

required
nbins int

The number of bins to use for the visualization. Default is 20.

20
type_bins str

The type of binning to use. Default is "qcut".

'qcut'
type_plot str

The type of plot to generate. Default is "prob".

'prob'

Returns:

Type Description
None

None

Raises:

Type Description
ValueError

If any feature is missing in the SHAP values.

Source code in src/aprofs/code.py
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
def compare_feature(  # pylint: disable=too-many-arguments
    self,
    other,
    feature: str,
    nbins: int = 20,
    type_bins: str = "qcut",
    type_plot: str = "prob",
) -> None:
    """
    Plot the marginal effect of a feature, comparing this object against another Aprofs object.

    Parameters:
        feature (str): The main feature for which to visualize the marginal effect.
        nbins (int): The number of bins to use for the visualization. Default is 20.
        type_bins (str): The type of binning to use. Default is "qcut".
        type_plot (str): The type of plot to generate. Default is "prob".

    Returns:
        None

    Raises:
        ValueError: If `other` is not an Aprofs object or the feature is missing in the SHAP values.
    """

    # The comparison only makes sense between two Aprofs instances.
    if not isinstance(other, Aprofs):
        raise ValueError("Can only compare with another Aprofs object")

    if feature not in self.shap_values.columns:
        raise ValueError(f"The following feature are missing in the SHAP values: {feature}")

    # Build the comparison frame, then hand it to the plotting helper.
    comparison_frame = utils.temp_plot_compare_data(self, other, feature)
    utils.plot_data_compare(
        comparison_frame,
        feature,
        nbins=nbins,
        type_bins=type_bins,
        type_plot=type_plot,
    )

get_feature_performance(features)

Calculate the performance of the features based on the SHAP values.

Parameters:

Name Type Description Default
features List[str]

The list of features for which to calculate the performance.

required

Returns:

Name Type Description
float float

The performance of the features based on the SHAP values.

Raises:

Type Description
ValueError

If any feature is missing in the SHAP values.

Source code in src/aprofs/code.py
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
def get_feature_performance(self, features: List[str]) -> float:
    """
    Score the approximate prediction built from the SHAP values of the given features.

    Parameters:
        features (List[str]): The list of features for which to calculate the performance.

    Returns:
        float: The performance of the features based on the SHAP values.

    Raises:
        ValueError: If any feature is missing in the SHAP values.
    """
    missing_features = [name for name in features if name not in self.shap_values.columns]
    if missing_features:
        raise ValueError(f"The following features are missing in the SHAP values: {missing_features}")
    # Sum the selected SHAP columns (plus the base value) and score against the target.
    approx_prediction = utils.calculate_row_sum(self.shap_values, self.shap_mean, features, self.link_model)
    return self.link_model.performance_fit(self.target_column, approx_prediction)

get_shap_p_value(features, suffle_size=500)

Calculate the p-values of the SHAP values of the features.

Parameters:

Name Type Description Default
features List[str]

The list of features for which to calculate the p-values.

required
suffle_size int

The number of shuffling iterations to perform. Default is 500.

500

Returns:

Type Description
DataFrame

pd.DataFrame: A DataFrame containing the features and their corresponding p-values.

Raises:

Type Description
ValueError

If any feature is missing in the SHAP values.

Source code in src/aprofs/code.py
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
def get_shap_p_value(self, features: List[str], suffle_size: int = 500) -> pd.DataFrame:
    """
    Estimate an empirical p-value for each feature's SHAP contribution via shuffling.

    Parameters:
        features (List[str]): The list of features for which to calculate the p-values.
        suffle_size (int): The number of shuffling iterations to perform. Default is 500.

    Returns:
        pd.DataFrame: A DataFrame containing the features and their corresponding p-values.

    Raises:
        ValueError: If any feature is missing in the SHAP values.
    """
    missing_features = [name for name in features if name not in self.shap_values.columns]
    if missing_features:
        raise ValueError(f"The following features are missing in the SHAP values: {missing_features}")

    # Baseline: performance of the approximation using all available features.
    performance_threshold = self.get_feature_performance(self.shap_values.columns)
    p_values = []
    for feature in tqdm(features):
        # Count how many shuffled runs beat the baseline performance.
        exceed_count = 0
        for _ in range(suffle_size):
            shuffled_score = utils.random_sort_shaps(
                self.shap_values, self.shap_mean, feature, self.target_column, self.link_model
            )
            if shuffled_score > performance_threshold:
                exceed_count += 1
        p_values.append(exceed_count / suffle_size)

    return pd.DataFrame({"Feature": features, "p-value_shap": p_values})

gready_forward_selection(features, greediness=0.001)

Perform gready forward feature selection by evaluating the performance of all possible combinations of features.

Parameters:

Name Type Description Default
features List[str]

The list of features to consider for feature selection.

required
greediness float

The greediness factor, how much better needs to be the performance to add the feature. Default is 0.001.

0.001

Returns: List[str]: The best list of features with the highest performance.

Raises:

Type Description
ValueError

If any feature is missing in the SHAP values.

Source code in src/aprofs/code.py
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
def gready_forward_selection(self, features: List[str], greediness: float = 0.001) -> List[str]:
    """
    Perform gready forward feature selection by evaluating the performance of all possible combinations of features.

    Parameters:
        features (List[str]): The list of features to consider for feature selection.
        greediness (float): The greediness factor, how much better needs to be the performance to add the feature. Default is 0.001.

    Returns:
        List[str]: The best list of features with the highest performance.

    Raises:
        ValueError: If any feature is missing in the SHAP values.
    """
    missing_features = [feature for feature in features if feature not in self.shap_values.columns]
    if missing_features:
        raise ValueError(f"The following features are missing in the SHAP values: {missing_features}")

    best_list: List = []
    candidate_list: List[str] = features.copy()
    # Start from the metric's worst value so the first candidate can be accepted for
    # both directions (a 0.0 start wrongly rejects every feature for minimize metrics).
    maximize = self.link_model.perform == "maximize"
    best_performance = float("-inf") if maximize else float("inf")
    while len(candidate_list) > 0:
        best_feature_, best_performance_ = utils.best_feature(
            self.shap_values, self.shap_mean, self.link_model, self.target_column, best_list, candidate_list
        )
        candidate_list.remove(best_feature_)

        if maximize:
            # Accept unless the current score already beats the candidate plus the greediness margin.
            accept = best_performance <= best_performance_ * (1 + greediness)
        else:
            accept = best_performance >= best_performance_ * (1 - greediness)

        if accept:
            best_performance = best_performance_
            best_list.append(best_feature_)
            print(f"the best feature to add is {best_feature_} with performance {best_performance_}")
        else:
            print(f"The feature {best_feature_} wont be added")

    return best_list

visualize_feature(main_feature, other_features=None, nbins=20, type_bins='qcut', type_plot='prob')

Visualize the marginal effect of a feature on the target variable.

Parameters:

Name Type Description Default
main_feature str

The main feature for which to visualize the marginal effect.

required
other_features List[str]

The list of other features to include in the visualization. Default is None.

None
nbins int

The number of bins to use for the visualization. Default is 20.

20
type_bins str

The type of binning to use. Default is "qcut".

'qcut'
type_plot str

The type of plot to generate. Default is "prob".

'prob'

Returns:

Type Description
None

None

Raises:

Type Description
ValueError

If any feature is missing in the SHAP values dataframe.

Source code in src/aprofs/code.py
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
def visualize_feature(  # pylint: disable=too-many-arguments
    self,
    main_feature: str,
    other_features: List[str] = None,
    nbins: int = 20,
    type_bins: str = "qcut",
    type_plot: str = "prob",
) -> None:
    """
    Plot the marginal effect of a feature on the target variable.

    Parameters:
        main_feature (str): The main feature for which to visualize the marginal effect.
        other_features (List[str]): The list of other features to include in the visualization. Default is None.
        nbins (int): The number of bins to use for the visualization. Default is 20.
        type_bins (str): The type of binning to use. Default is "qcut".
        type_plot (str): The type of plot to generate. Default is "prob".

    Returns:
        None

    Raises:
        ValueError: If any feature is missing in the SHAP values dataframe.
    """
    # Collect the main feature plus any companions into a single list.
    if other_features is None:
        other_features = []
    features = [main_feature, *other_features]

    missing_features = [name for name in features if name not in self.shap_values.columns]
    if missing_features:
        raise ValueError(f"The following features are missing in the SHAP values: {missing_features}")

    # Build the plotting frame from the SHAP data, then delegate to the plot helper.
    plot_frame = utils.temp_plot_data(self, features)
    utils.plot_data(
        plot_frame,
        main_feature,
        other_features=other_features,
        nbins=nbins,
        type_bins=type_bins,
        type_plot=type_plot,
    )

visualize_neutralized_feature(main_feature, neutralize_features=None, nbins=20, type_bins='qcut', type_plot='prob')

Visualize the marginal effect of a feature on the target variable after neutralizing the effect of other features.

Parameters:

Name Type Description Default
main_feature str

The main feature for which to visualize the marginal effect.

required
neutralize_features List[str]

The list of other features to be neutralized.

None
nbins int

The number of bins to use for the visualization. Default is 20.

20
type_bins str

The type of binning to use. Default is "qcut".

'qcut'
type_plot str

The type of plot to generate. Default is "prob".

'prob'

Returns:

Type Description
None

None

Raises:

Type Description
ValueError

If any feature is missing in the SHAP values dataframe.

Source code in src/aprofs/code.py
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
def visualize_neutralized_feature(  # pylint: disable=too-many-arguments
    self,
    main_feature: str,
    neutralize_features: List[str] = None,
    nbins: int = 20,
    type_bins: str = "qcut",
    type_plot: str = "prob",
) -> None:
    """
    Visualize the marginal effect of a feature on the target variable after
    neutralizing the effect of other features.

    Parameters:
        main_feature (str): The main feature for which to visualize the marginal effect.
        neutralize_features (List[str]): The list of other features to be neutralized.
        nbins (int): The number of bins to use for the visualization. Default is 20.
        type_bins (str): The type of binning to use. Default is "qcut".
        type_plot (str): The type of plot to generate. Default is "prob".

    Returns:
        None

    Raises:
        ValueError: If any feature is missing in the SHAP values dataframe.
    """
    # Normalize the neutralize_features argument into a list.
    if neutralize_features is None:
        neutralize_features = []
    elif not isinstance(neutralize_features, list):
        neutralize_features = [neutralize_features]

    # Deduplicated pool of features that must exist in the SHAP values.
    feature_pool = list({main_feature, *neutralize_features})

    missing_features = [name for name in feature_pool if name not in self.shap_values.columns]
    if missing_features:
        raise ValueError(f"The following features are missing in the SHAP values: {missing_features}")

    # Build the neutralized plotting frame, then attach the raw main-feature values.
    temp_data = utils.temp_neutral_plot_data(self, neutralize_features)
    temp_data[main_feature] = self.current_data[main_feature]

    utils.plot_data_neutral(
        temp_data,
        main_feature,
        nbins=nbins,
        type_bins=type_bins,
        type_plot=type_plot,
    )

Detailed API models

This module implements the models class. This way we can extend this class to implement new models for use with the Aprofs class.

Bases: LinkModels

This class implements the interface for classification with logistic link

Source code in src/aprofs/models.py
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
class ClassificationLogisticLink(LinkModels):
    """Link model for binary classification using the logistic (sigmoid) link."""

    def __init__(self) -> None:
        super().__init__(type_model="classification", type_link="logistic", perform="maximize")

    def performance_fit(self, target: Union[np.ndarray, pd.Series], prediction: Union[np.ndarray, pd.Series]) -> float:
        # AUC: higher is better, matching perform="maximize".
        return roc_auc_score(target, prediction)

    def link_calculate(
        self, inv_prediction: Union[int, float, np.ndarray, pd.Series]
    ) -> Union[int, float, np.ndarray, pd.Series]:
        is_numeric = isinstance(inv_prediction, (int, float, np.ndarray, pd.Series))
        if not is_numeric:
            raise ValueError("Invalid input type for link_calculate")
        # Sigmoid: map the linear predictor onto (0, 1).
        exp_neg = np.exp(-inv_prediction)
        return 1 / (1 + exp_neg)

    def inv_link_calculate(
        self, prediction: Union[int, float, np.ndarray, pd.Series]
    ) -> Union[int, float, np.ndarray, pd.Series]:
        # Logit: map a probability back onto the linear-predictor scale.
        odds = prediction / (1 - prediction)
        return np.log(odds)

    def __repr__(self) -> str:
        return "{}() with type model {} and type link {}".format(
            type(self).__name__, self.type_model, self.type_link
        )

LinkModels

This class implements the interface for the link models to be used in the aprofs class

Functionality that needs to be implemented
  • performance_fit: calculate the performance of the model
  • link_calculate: calculate the link function
  • inv_link_calculate: calculate the inverse link function
Source code in src/aprofs/models.py
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
class LinkModels(metaclass=abc.ABCMeta):
    """This class implements the interface for the link models
    to be used in the aprofs class

    Functionality that needs to be implemented:
        - performance_fit: calculate the performance of the model
        - link_calculate: calculate the link function
        - inv_link_calculate: calculate the inverse link function

    """

    def __init__(self, type_model: str, type_link: str, perform: str) -> None:
        # type_model: e.g. "classification" / "regression"
        self.type_model = type_model
        # type_link: e.g. "logistic" / "identity" / "logarithmic"
        self.type_link = type_link
        # perform: "maximize" or "minimize", i.e. the direction of performance_fit
        self.perform = perform

    @abc.abstractmethod
    def performance_fit(self, target: Union[np.ndarray, pd.Series], prediction: Union[np.ndarray, pd.Series]) -> float:
        """
        Calculate the performance of the model.

        Args:
            target (Union[np.ndarray, pd.Series]): The true target values.
            prediction (Union[np.ndarray, pd.Series]): The predicted values.

        Returns:
            float: The performance metric (direction given by `perform`).
        """

    @abc.abstractmethod
    def link_calculate(
        self, inv_prediction: Union[int, float, np.ndarray, pd.Series]
    ) -> Union[int, float, np.ndarray, pd.Series]:
        """
        Calculate the link function.

        Args:
            inv_prediction (Union[int, float, np.ndarray, pd.Series]): The input value(s).

        Returns:
            Union[int, float, np.ndarray, pd.Series]: The transformed value(s).
        """

    @abc.abstractmethod
    def inv_link_calculate(
        self, prediction: Union[int, float, np.ndarray, pd.Series]
    ) -> Union[int, float, np.ndarray, pd.Series]:
        """
        Calculate the inverse link function.

        Args:
            prediction (Union[int, float, np.ndarray, pd.Series]): The input value(s).

        Returns:
            Union[int, float, np.ndarray, pd.Series]: The transformed value(s).
        """

Calculate the inverse link function.

Parameters:

Name Type Description Default
prediction Union[int, float, ndarray]

The input value(s).

required

Returns:

Type Description
Union[int, float, ndarray, Series]

Union[int, float, np.ndarray]: The transformed value(s).

Source code in src/aprofs/models.py
60
61
62
63
64
65
66
67
68
69
70
71
72
@abc.abstractmethod
def inv_link_calculate(
    self, prediction: Union[int, float, np.ndarray, pd.Series]
) -> Union[int, float, np.ndarray, pd.Series]:
    """
    Calculate the inverse link function (prediction scale back to the linear scale).

    Args:
        prediction (Union[int, float, np.ndarray, pd.Series]): The input value(s).

    Returns:
        Union[int, float, np.ndarray, pd.Series]: The transformed value(s).
    """

Calculate the link function.

Parameters:

Name Type Description Default
inv_prediction Union[int, float, ndarray]

The input value(s).

required

Returns:

Type Description
Union[int, float, ndarray]

Union[int, float, np.ndarray]: The transformed value(s).

Source code in src/aprofs/models.py
46
47
48
49
50
51
52
53
54
55
56
57
58
@abc.abstractmethod
def link_calculate(
    self, inv_prediction: Union[int, float, np.ndarray, pd.Series]
) -> Union[int, float, np.ndarray, pd.Series]:
    """
    Calculate the link function (linear scale onto the prediction scale).

    Args:
        inv_prediction (Union[int, float, np.ndarray, pd.Series]): The input value(s).

    Returns:
        Union[int, float, np.ndarray, pd.Series]: The transformed value(s).
    """

performance_fit(target, prediction) abstractmethod

Calculate the performance of the model.

Parameters:

Name Type Description Default
target ndarray

The true target values.

required
prediction ndarray

The predicted values.

required

Returns:

Name Type Description
float float

The performance metric.

Source code in src/aprofs/models.py
33
34
35
36
37
38
39
40
41
42
43
44
@abc.abstractmethod
def performance_fit(self, target: Union[np.ndarray, pd.Series], prediction: Union[np.ndarray, pd.Series]) -> float:
    """
    Calculate the performance of the model.

    Args:
        target (Union[np.ndarray, pd.Series]): The true target values.
        prediction (Union[np.ndarray, pd.Series]): The predicted values.

    Returns:
        float: The performance metric.
    """

Bases: LinkModels

This class implements the interface for regression with identity link

Source code in src/aprofs/models.py
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
class RegressionIdentityLink(LinkModels):
    """Link model for regression using the identity link (no transformation)."""

    def __init__(self) -> None:
        super().__init__(type_model="regression", type_link="identity", perform="minimize")

    def performance_fit(self, target: Union[np.ndarray, pd.Series], prediction: Union[np.ndarray, pd.Series]) -> float:
        # Root mean squared error: lower is better, matching perform="minimize".
        mse = mean_squared_error(target, prediction)
        return np.sqrt(mse)

    def link_calculate(
        self, inv_prediction: Union[int, float, np.ndarray, pd.Series]
    ) -> Union[int, float, np.ndarray, pd.Series]:
        # Identity link: the linear predictor already lives on the prediction scale.
        return inv_prediction

    def inv_link_calculate(
        self, prediction: Union[int, float, np.ndarray, pd.Series]
    ) -> Union[int, float, np.ndarray, pd.Series]:
        # The inverse of the identity is the identity itself.
        return prediction

    def __repr__(self) -> str:
        return "{}() with type model {} and type link {}".format(
            type(self).__name__, self.type_model, self.type_link
        )

Bases: LinkModels

This class implements the interface for regression with logarithmic link

Source code in src/aprofs/models.py
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
class RegressionLogLink(LinkModels):
    """Link model for regression using the logarithmic link."""

    def __init__(self) -> None:
        super().__init__(type_model="regression", type_link="logarithmic", perform="minimize")

    def performance_fit(self, target: Union[np.ndarray, pd.Series], prediction: Union[np.ndarray, pd.Series]) -> float:
        # Root mean squared error: lower is better, matching perform="minimize".
        mse = mean_squared_error(target, prediction)
        return np.sqrt(mse)

    def link_calculate(
        self, inv_prediction: Union[int, float, np.ndarray, pd.Series]
    ) -> Union[int, float, np.ndarray, pd.Series]:
        # Reject non-numeric inputs before handing them to np.log.
        is_numeric = isinstance(inv_prediction, (int, float, np.ndarray, pd.Series))
        if not is_numeric:
            raise ValueError("Invalid input type for link_calculate")
        return np.log(inv_prediction)

    def inv_link_calculate(
        self, prediction: Union[int, float, np.ndarray, pd.Series]
    ) -> Union[int, float, np.ndarray, pd.Series]:
        # Exponential undoes the logarithmic link.
        return np.exp(prediction)

    def __repr__(self) -> str:
        return "{}() with type model {} and type link {}".format(
            type(self).__name__, self.type_model, self.type_link
        )

Detailed API utilities

Utility functions for the package. This module contains utility functions that are used in the package. The core functions are used to calculate the SHAP values and expected average SHAP value for a given dataset and model.

best_feature(shaps_values, shap_expected_values, link_model, y_target, current_list, candidate_list)

Return the best feature to add to the current list based on the highest AUC score.

Parameters:

Name Type Description Default
shaps_values DataFrame

A DataFrame containing SHAP values for each feature.

required
shap_expected_values Series

A Series containing the expected SHAP values.

required
link_model aprofs model object

An object that allows to calculate the performance of the model.

required
y_target Series

The target variable for the AUC score calculation.

required
current_list list

The current list of features.

required
candidate_list list

The list of candidate features to consider adding.

required

Returns:

Name Type Description
tuple Tuple[str, float]

A tuple containing the best feature to add (str) and the corresponding best AUC score (float).

Raises:

Type Description
ValueError

If candidate_list is empty.

Source code in src/aprofs/utils.py
 88
 89
 90
 91
 92
 93
 94
 95
 96
 97
 98
 99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
def best_feature(  # pylint: disable=too-many-arguments
    shaps_values: pd.DataFrame,
    shap_expected_values: float,
    link_model: LinkModels,
    y_target: pd.Series,
    current_list: List[str],
    candidate_list: List[str],
) -> Tuple[str, float]:
    """
    Return the best feature to add to the current list based on the highest AUC score.

    Args:
        shaps_values (DataFrame): A DataFrame containing SHAP values for each feature.
        shap_expected_values (float): The expected (mean) SHAP value.
        link_model (aprofs model object): An object that allows to calculate the performance of the model.
        y_target (Series): The target variable for the AUC score calculation.
        current_list (list): The current list of features. It is not modified.
        candidate_list (list): The list of candidate features to consider adding.

    Returns:
        tuple: A tuple containing the best feature to add (str) and the corresponding best AUC score (float).

    Raises:
        ValueError: If `candidate_list` is empty.
    """
    # `not candidate_list` covers both None and the empty list.
    if not candidate_list:
        raise ValueError("The candidate list cannot be empty.")

    chosen: str = None
    best_auc: float = 0.0
    for feature in candidate_list:
        # Evaluate the candidate on a copy so the caller's list is never
        # mutated (the previous append/remove pattern was not exception-safe).
        trial_features = current_list + [feature]
        aprox_preds = calculate_row_sum(shaps_values, shap_expected_values, trial_features, link_model)
        auc = roc_auc_score(y_target, aprox_preds)
        if auc > best_auc:
            best_auc = auc
            chosen = feature
    return chosen, best_auc

calculate_all_row_sum(data, mean_value, link_model)

Calculates the row sum of all columns in a Shapley values DataFrame and applies a link function to the result.

Parameters:

Name Type Description Default
data DataFrame

The input Shapley values DataFrame.

required
mean_value float

The mean shapley value to be added to the row sum.

required
link_model aprofs model object

An object that allows to calculate the performance of the model.

required

Returns:

Type Description
Union[float, Series]

Union[float, pd.Series]: The result of applying the link function to the row sum.

Examples:

>>> import pandas as pd
>>> data = pd.DataFrame({'A': [1, 2, 3], 'B': [4, 5, 6]})
>>> mean_value = 10.0
>>> link_function = lambda x: x ** 2
>>> calculate_all_row_sum(data, mean_value, link_function)
225.0
Source code in src/aprofs/utils.py
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
def calculate_all_row_sum(data: pd.DataFrame, mean_value: float, link_model: LinkModels) -> Union[float, pd.Series]:
    """
    Calculates the row sum of **all columns** in a Shapley values DataFrame and applies a link function to the result.

    Args:
        data (pd.DataFrame): The input Shapley values DataFrame.
        mean_value (float): The mean shapley value to be added to the row sum.
        link_model (aprofs model object): An object that allows to calculate the performance of the model.
            Its ``link_calculate`` method is applied to the shifted row sums.

    Returns:
        Union[float, pd.Series]: The result of applying the link function to the row sum.

    Examples:
        >>> import pandas as pd
        >>> data = pd.DataFrame({'A': [1, 2, 3], 'B': [4, 5, 6]})
        >>> # with an identity-link model the result is simply 10 + row sums:
        >>> calculate_all_row_sum(data, 10.0, RegressionIdentityLink())
        0    15.0
        1    17.0
        2    19.0
        dtype: float64
    """
    # Sum every column per row, shift by the expected value, then map through the link.
    row_totals = data.sum(axis=1)
    return link_model.link_calculate(row_totals + mean_value)

calculate_row_sum(data, mean_value, columns, link_model)

Calculates the row sum of specified columns in a Shapley values DataFrame and applies a link function to the result.

Parameters:

Name Type Description Default
data DataFrame

The input DataFrame with shapley values.

required
mean_value float

The mean shapley value to be added to the row sum.

required
columns List[str]

The list of column names to be summed.

required
link_model aprofs model object

An object that allows to calculate the performance of the model.

required

Returns:

Type Description
Union[float, Series]

Union[float, pd.Series]: The result of applying the link function to the row sum.

Examples:

>>> import pandas as pd
>>> data = pd.DataFrame({'A': [1, 2, 3], 'B': [4, 5, 6]})
>>> mean_value = 10.0
>>> columns = ['A', 'B']
>>> link_function = lambda x: x ** 2
>>> calculate_row_sum(data, mean_value, columns, link_function)
225.0
Source code in src/aprofs/utils.py
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
def calculate_row_sum(
    data: pd.DataFrame, mean_value: float, columns: List[str], link_model: LinkModels
) -> Union[float, pd.Series]:
    """
    Calculates the row sum of specified columns in a Shapley values DataFrame and applies a link function to the result.

    Args:
        data (pd.DataFrame): The input DataFrame with shapley values.
        mean_value (float): The mean shapley value to be added to the row sum.
        columns (List[str]): The list of column names to be summed.
        link_model (aprofs model object): An object that allows to calculate the performance of the model.
            Its ``link_calculate`` method is applied to the shifted row sums.

    Returns:
        Union[float, pd.Series]: The result of applying the link function to the row sum.

    Examples:
        >>> import pandas as pd
        >>> data = pd.DataFrame({'A': [1, 2, 3], 'B': [4, 5, 6]})
        >>> # with an identity-link model the result is simply 10 + row sums of A and B:
        >>> calculate_row_sum(data, 10.0, ['A', 'B'], RegressionIdentityLink())
        0    15.0
        1    17.0
        2    19.0
        dtype: float64
    """
    # Restrict to the requested columns, sum per row, shift, then map through the link.
    subset = data[columns]
    return link_model.link_calculate(subset.sum(axis=1) + mean_value)

generate_all_combinations(features)

Generates all possible combinations of the given features list. This will be used to test all possible combinations of features to find the best combination.

Parameters:

Name Type Description Default
features List[str]

A list of features.

required

Returns:

Type Description
List[Tuple[str]]

List[Tuple[str]]: A list of tuples representing all possible combinations of the features.

Source code in src/aprofs/utils.py
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
def generate_all_combinations(features: List[str]) -> List[Tuple[str]]:
    """
    Generates all possible non-empty combinations of the given features list.
    This will be used to test all possible combinations of features to find the best combination.

    Args:
        features (List[str]): A list of features.

    Returns:
        List[Tuple[str]]: A list of tuples representing all possible combinations
        of the features, ordered by combination size.
    """
    # Enumerate combinations of every size from 1 up to len(features).
    return [
        combo
        for size in range(1, len(features) + 1)
        for combo in combinations(features, size)
    ]

get_shap_values(data, model, type_model='tree')

Calculates the SHAP values and expected average shap value for a given dataset and model.

Parameters:

Name Type Description Default
data ndarray or DataFrame

The input dataset.

required
model Callable

The trained model object.

required
type_model str

type of model: tree based or other. If "tree" then TreeExplainer will be used, otherwise a general explainer from the SHAP package is used. Defaults to 'tree'.

'tree'

Returns:

Name Type Description
tuple Tuple[DataFrame, float]

A tuple containing the SHAP values and the Average shap value.

Examples:

>>> # Imports
>>> import numpy as np
>>> from xgboost import XGBClassifier
>>> from sklearn.datasets import load_iris
>>> from sklearn.model_selection import train_test_split
>>> # Imports SHAP Package
>>> import shap
>>>
>>> # Load the iris dataset
>>> iris = load_iris()
>>> X, y = iris.data, iris.target
>>>
>>> # Split the dataset into train and test sets
>>> X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
>>>
>>> # Train a model
>>> model = XGBClassifier()
>>> model.fit(X_train, y_train)
>>>
>>> # Calculate SHAP values and expected value
>>> shap_values, expected_value = get_shap_tree_values(X_test, model)
Source code in src/aprofs/utils.py
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
def get_shap_values(data: pd.DataFrame, model: Callable, type_model="tree") -> Tuple[pd.DataFrame, float]:
    """
    Calculates the SHAP values and expected average shap value for a given dataset and model.

    Args:
        data (numpy.ndarray or pandas.DataFrame): The input dataset.
        model: The trained model object.
        type_model (str): type of model: tree based or other. If "tree" then TreeExplainer will be used, otherwise a general explainer from the SHAP package is used. Defaults to 'tree'.

    Returns:
        tuple: A tuple containing the SHAP values and the Average shap value.

    Examples:
        >>> # Imports
        >>> import numpy as np
        >>> from xgboost import XGBClassifier
        >>> from sklearn.datasets import load_iris
        >>> from sklearn.model_selection import train_test_split
        >>> # Imports SHAP Package
        >>> import shap
        >>>
        >>> # Load the iris dataset
        >>> iris = load_iris()
        >>> X, y = iris.data, iris.target
        >>>
        >>> # Split the dataset into train and test sets
        >>> X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
        >>>
        >>> # Train a model
        >>> model = XGBClassifier()
        >>> model.fit(X_train, y_train)
        >>>
        >>> # Calculate SHAP values and expected value
        >>> shap_values, expected_value = get_shap_values(X_test, model)
    """
    # TreeExplainer is the fast path for tree ensembles; anything else goes
    # through SHAP's generic Explainer. Both expose the same attributes, so
    # only the explainer class differs between the two cases.
    explainer_cls = TreeExplainer if type_model == "tree" else Explainer
    shap_explainer = explainer_cls(model)
    shap_valid = shap_explainer.shap_values(data)
    shap_expected_value = shap_explainer.expected_value

    # Multi-output models return one array per output; stack them column-wise.
    if isinstance(shap_valid, list):
        shap_valid = np.concatenate(shap_valid, axis=1)

    return shap_valid, shap_expected_value

plot_data(temp, main_feature, other_features=None, nbins=20, type_bins='qcut', type_plot='prob')

Plot data based on the provided DataFrame and features.

Parameters:

Name Type Description Default
temp DataFrame

The DataFrame containing the data.

required
main_feature str

The main feature to plot.

required
other_features Optional[Union[str, List[str]]]

Other features to include in the plot. Defaults to None.

None
nbins int

The number of bins. Defaults to 20.

20
type_bins str

The type of binning. Defaults to "qcut".

'qcut'
type_plot str

The type of plot. Defaults to "prob".

'prob'

Returns:

Type Description
None

None

Examples:

>>> temp = pd.DataFrame(...)
>>> plot_data(temp, "main_feature", other_features=["feature_1", "feature_2"], nbins=10, type_bins="cut", type_plot="raw")
Source code in src/aprofs/utils.py
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
def plot_data(  # pylint: disable=too-many-arguments
    temp: pd.DataFrame,
    main_feature: str,
    other_features: Optional[Union[str, List[str]]] = None,
    nbins: int = 20,
    type_bins: str = "qcut",
    type_plot: str = "prob",
) -> None:
    """
    Plot data based on the provided DataFrame and features.

    Args:
        temp (pd.DataFrame): The DataFrame containing the data.
        main_feature (str): The main feature to plot.
        other_features (Optional[Union[str, List[str]]], optional): Other features to include in the plot. Defaults to None.
        nbins (int, optional): The number of bins. Defaults to 20.
        type_bins (str, optional): The type of binning. Defaults to "qcut".
        type_plot (str, optional): The type of plot. Defaults to "prob".

    Returns:
        None

    Raises:
        ValueError: If `type_bins` is neither "cut" nor "qcut" when binning is required.

    Examples:
        >>> temp = pd.DataFrame(...)
        >>> plot_data(temp, "main_feature", other_features=["feature_1", "feature_2"], nbins=10, type_bins="cut", type_plot="raw")
    """
    if other_features is None:
        other_features = []
    if not isinstance(other_features, list):
        other_features = [other_features]
    features = [main_feature, *other_features]

    # Low-cardinality features become categorical bins (one per distinct value);
    # otherwise bin by equal width ("cut") or by quantiles ("qcut").
    if temp[main_feature].unique().shape[0] < 25:
        temp["bins"] = temp[main_feature].astype(str)
    elif type_bins == "cut":
        temp["bins"] = pd.cut(temp[main_feature], bins=nbins)
    elif type_bins == "qcut":
        temp["bins"] = pd.qcut(temp[main_feature], q=nbins)
    else:
        # Fail fast: the previous print() left "bins" unset and the groupby
        # below would raise a confusing KeyError.
        raise ValueError("Invalid type_bins value")

    # Observed target mean per bin.
    means = temp.groupby("bins", observed=True)["target"].mean()

    # SHAP means per bin, on either the raw shap scale or the probability scale.
    means_shap = {}
    if type_plot == "raw":
        for feature in features:
            means_shap[feature] = temp.groupby("bins", observed=True)[f"{feature}_shap"].mean()
        means_shap_others = temp.groupby("bins", observed=True)["shap_other"].mean()
        means_shap_model = temp.groupby("bins", observed=True)["shap_model"].mean()
    else:
        for feature in features:
            means_shap[feature] = temp.groupby("bins", observed=True)[f"{feature}_shap_prob"].mean()
        means_shap_others = temp.groupby("bins", observed=True)["shap_prob_other"].mean()
        means_shap_model = temp.groupby("bins", observed=True)["shap_prob_model"].mean()

    # Relative frequency of each bin (drives the gray bar chart).
    counts = temp["bins"].value_counts(normalize=True).sort_index()

    # Create a figure
    fig = go.Figure()

    # Add bar plot for counts on the primary y-axis
    fig.add_trace(go.Bar(x=counts.index.astype(str), y=counts, name="Data", yaxis="y", marker_color="lightgray"))

    # Add line plots on the secondary y-axis
    fig.add_trace(go.Scatter(x=means.index.astype(str), y=means, mode="lines", name="Observed", yaxis="y2"))

    for feature in features:
        fig.add_trace(
            go.Scatter(
                x=means_shap[feature].index.astype(str),
                y=means_shap[feature],
                mode="lines",
                name=f"{feature} shap Mean",
                yaxis="y2",
            )
        )

    fig.add_trace(
        go.Scatter(
            x=means_shap_others.index.astype(str), y=means_shap_others, mode="lines", name="Others shaps", yaxis="y2"
        )
    )
    fig.add_trace(
        go.Scatter(
            x=means_shap_model.index.astype(str), y=means_shap_model, mode="lines", name="Model shaps", yaxis="y2"
        )
    )

    # Update layout to include a secondary y-axis
    fig.update_layout(
        yaxis={"title": "Counts", "side": "left", "tickformat": ".0%"},
        yaxis2={"title": "Avg.", "side": "right", "overlaying": "y"},
    )
    # Bug fix: label the x-axis with the main feature being plotted, not with
    # the leaked loop variable (previously the last entry of other_features).
    fig.update_xaxes(title_text=main_feature)
    fig.show()

plot_data_compare(temp, feature, nbins=20, type_bins='qcut', type_plot='prob')

Plot data based on the provided DataFrame and feature in a way to compare a specific shap.

Parameters:

Name Type Description Default
temp DataFrame

The DataFrame containing the data.

required
feature str

The main feature to plot.

required
nbins int

The number of bins. Defaults to 20.

20
type_bins str

The type of binning. Defaults to "qcut".

'qcut'
type_plot str

The type of plot. Defaults to "prob".

'prob'

Returns:

Type Description
None

None

Examples:

>>> temp = pd.DataFrame(...)
>>> plot_data(temp, "feature_name", nbins=10, type_bins="cut", type_plot="raw")
Source code in src/aprofs/utils.py
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
def plot_data_compare(  # pylint: disable=too-many-arguments
    temp: pd.DataFrame,
    feature: str,
    nbins: int = 20,
    type_bins: str = "qcut",
    type_plot: str = "prob",
) -> None:
    """
    Plot data based on the provided DataFrame and feature in a way to compare a specific shap.

    Args:
        temp (pd.DataFrame): The DataFrame containing the data.
        feature (str): The main feature to plot.
        nbins (int, optional): The number of bins. Defaults to 20.
        type_bins (str, optional): The type of binning. Defaults to "qcut".
        type_plot (str, optional): The type of plot. Defaults to "prob".

    Returns:
        None

    Raises:
        ValueError: If `type_bins` is neither "cut" nor "qcut" when binning is required.

    Examples:
        >>> temp = pd.DataFrame(...)
        >>> plot_data_compare(temp, "feature_name", nbins=10, type_bins="cut", type_plot="raw")
    """
    # Low-cardinality features become categorical bins (one per distinct value);
    # otherwise bin by equal width ("cut") or by quantiles ("qcut").
    if temp[feature].unique().shape[0] < 25:
        temp["bins"] = temp[feature].astype(str)
    elif type_bins == "cut":
        temp["bins"] = pd.cut(temp[feature], bins=nbins)
    elif type_bins == "qcut":
        temp["bins"] = pd.qcut(temp[feature], q=nbins)
    else:
        # Fail fast: the previous print() left "bins" unset and the groupby
        # below would raise a confusing KeyError.
        raise ValueError("Invalid type_bins value")

    # Observed target mean per bin.
    means = temp.groupby("bins", observed=True)["target"].mean()

    means_shap = {}
    if type_plot == "raw":
        means_shap[feature] = temp.groupby("bins", observed=True)[f"{feature}_shap"].mean()
        # Bug fix: store the comparison series under the "<feature>_compare"
        # key that the plotting code below reads. The raw branch previously
        # used "<feature>_shap_compare" as the dict key, which guaranteed a
        # KeyError when plotting in raw mode.
        means_shap[f"{feature}_compare"] = temp.groupby("bins", observed=True)[f"{feature}_shap_compare"].mean()
        means_shap_model = temp.groupby("bins", observed=True)["shap_model"].mean()
    else:
        means_shap[feature] = temp.groupby("bins", observed=True)[f"{feature}_shap_prob"].mean()
        means_shap[f"{feature}_compare"] = temp.groupby("bins", observed=True)[f"{feature}_shap_prob_compare"].mean()
        means_shap_model = temp.groupby("bins", observed=True)["shap_prob_model"].mean()

    # Relative frequency of each bin (drives the gray bar chart).
    counts = temp["bins"].value_counts(normalize=True).sort_index()

    # Create a figure
    fig = go.Figure()

    # Add bar plot for counts on the primary y-axis
    fig.add_trace(go.Bar(x=counts.index.astype(str), y=counts, name="Data", yaxis="y", marker_color="lightgray"))

    # Add line plots on the secondary y-axis
    fig.add_trace(go.Scatter(x=means.index.astype(str), y=means, mode="lines", name="Observed", yaxis="y2"))

    fig.add_trace(
        go.Scatter(
            x=means_shap[feature].index.astype(str),
            y=means_shap[feature],
            mode="lines",
            name=f"{feature} shap Mean",
            yaxis="y2",
        )
    )

    fig.add_trace(
        go.Scatter(
            x=means_shap[f"{feature}_compare"].index.astype(str),
            y=means_shap[f"{feature}_compare"],
            mode="lines",
            name=f"{feature} shap Mean compare",
            yaxis="y2",
        )
    )

    fig.add_trace(
        go.Scatter(
            x=means_shap_model.index.astype(str), y=means_shap_model, mode="lines", name="Model shaps", yaxis="y2"
        )
    )

    # Update layout to include a secondary y-axis
    fig.update_layout(
        yaxis={"title": "Counts", "side": "left", "tickformat": ".0%"},
        yaxis2={"title": "Avg.", "side": "right", "overlaying": "y"},
    )
    fig.update_xaxes(title_text=feature)
    fig.show()

plot_data_neutral(data, feature, nbins=20, type_bins='qcut', type_plot='prob')

Plot data based on the provided neutralized DataFrame and features.

Parameters:

Name Type Description Default
data DataFrame

The DataFrame containing the neutralize shap data.

required
feature str

The main feature to plot on the x-axis.

required
nbins int

The number of bins. Defaults to 20.

20
type_bins str

The type of binning. Defaults to "qcut".

'qcut'
type_plot str

The type of plot. Defaults to "prob".

'prob'

Returns:

Type Description
None

None

Examples:

>>> temp = pd.DataFrame(...)
>>> plot_data_neutral(temp, "main_feature", other_features=["feature_1", "feature_2"], nbins=10, type_bins="cut", type_plot="raw")
Source code in src/aprofs/utils.py
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
def plot_data_neutral(  # pylint: disable=too-many-arguments
    data: pd.DataFrame,
    feature: str,
    nbins: int = 20,
    type_bins: str = "qcut",
    type_plot: str = "prob",
) -> None:
    """
    Plot data based on the provided neutralized DataFrame and features.

    Draws the binned means of the observed target, the neutralized SHAP
    predictions and the original model SHAP predictions as lines on a
    secondary y-axis, with the normalized bin counts as gray bars on the
    primary y-axis.

    Note: this adds a "bins" column to `data` in place.

    Args:
        data (pd.DataFrame): The DataFrame containing the neutralized SHAP data.
            Must contain "target", plus "shap_other"/"shap_model" (when
            type_plot="raw") or "shap_prob_other"/"shap_prob_model" (otherwise).
        feature (str): The main feature to plot on the x-axis.
        nbins (int, optional): The number of bins. Defaults to 20.
        type_bins (str, optional): The type of binning, "cut" (equal width) or
            "qcut" (equal frequency). Ignored when the feature has fewer than
            25 distinct values, in which case each value is its own bin.
            Defaults to "qcut".
        type_plot (str, optional): The type of plot, "raw" (link scale) or
            "prob" (probability scale). Defaults to "prob".

    Returns:
        None

    Raises:
        ValueError: If `type_bins` is neither "cut" nor "qcut" and binning
            is required.

    Examples:
        >>> temp = pd.DataFrame(...)
        >>> plot_data_neutral(temp, "main_feature", nbins=10, type_bins="cut", type_plot="raw")
    """

    if data[feature].unique().shape[0] < 25:
        # Few distinct values: treat each raw value as its own bin.
        data["bins"] = data[feature].astype(str)
    elif type_bins == "cut":
        data["bins"] = pd.cut(data[feature], bins=nbins)
    elif type_bins == "qcut":
        data["bins"] = pd.qcut(data[feature], q=nbins)
    else:
        # Fail fast: the original print-and-continue would crash later with a
        # confusing KeyError on the missing "bins" column.
        raise ValueError(f"Invalid type_bins value: {type_bins!r}. Use 'cut' or 'qcut'.")

    # Calculate the means for each bin
    means = data.groupby("bins", observed=True)["target"].mean()

    # Pick link-scale or probability-scale SHAP columns.
    if type_plot == "raw":
        means_shap_others = data.groupby("bins", observed=True)["shap_other"].mean()
        means_shap_model = data.groupby("bins", observed=True)["shap_model"].mean()
    else:
        means_shap_others = data.groupby("bins", observed=True)["shap_prob_other"].mean()
        means_shap_model = data.groupby("bins", observed=True)["shap_prob_model"].mean()

    # Share of observations in each bin (drives the bar heights).
    counts = data["bins"].value_counts(normalize=True).sort_index()

    # Create a figure
    fig = go.Figure()

    # Add bar plot for counts on the primary y-axis
    fig.add_trace(go.Bar(x=counts.index.astype(str), y=counts, name="Data", yaxis="y", marker_color="lightgray"))

    # Add line plots on the secondary y-axis
    fig.add_trace(go.Scatter(x=means.index.astype(str), y=means, mode="lines", name="Observed", yaxis="y2"))

    fig.add_trace(
        go.Scatter(
            x=means_shap_others.index.astype(str),
            y=means_shap_others,
            mode="lines",
            name="Neutralized shaps",
            yaxis="y2",
        )
    )
    fig.add_trace(
        go.Scatter(
            x=means_shap_model.index.astype(str),
            y=means_shap_model,
            mode="lines",
            name="Original Model shaps",
            yaxis="y2",
        )
    )

    # Update layout to include a secondary y-axis
    fig.update_layout(
        yaxis={"title": "Counts", "side": "left", "tickformat": ".0%"},
        yaxis2={"title": "Avg.", "side": "right", "overlaying": "y"},
    )
    # Add title to x-axis
    fig.update_xaxes(title_text=feature)

    fig.show()

random_sort_shaps(shaps_values, shap_expected_value, feature_name, y_target, link_model)

Randomly shuffles the values of a specific feature in the SHAP values DataFrame, calculates the row sum, and returns the ROC AUC score.

Parameters:

Name Type Description Default
shaps_values DataFrame

The SHAP values DataFrame.

required
shap_expected_value float

The expected SHAP value.

required
feature_name str

The name of the feature to shuffle.

required
y_target Union[Series, ndarray]

The target variable.

required
link_model aprofs model object

An object that allows to calculate the performance of the model.

required

Returns:

Name Type Description
float float

The ROC AUC score.

Examples:

>>> import pandas as pd
>>> import numpy as np
>>> from sklearn.datasets import make_classification
>>> from sklearn.linear_model import LogisticRegression
>>> from sklearn.model_selection import train_test_split
>>> from sklearn.metrics import roc_auc_score
>>>
>>> # Generate synthetic data
>>> X, y = make_classification(n_samples=100, n_features=5, random_state=42)
>>> X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
>>>
>>> # Train a logistic regression model
>>> model = LogisticRegression()
>>> model.fit(X_train, y_train)
>>>
>>> # Calculate SHAP values and expected value
>>> shap_values, expected_value = get_shap_tree_values(X_test, model)
>>>
>>> # Calculate ROC AUC score with shuffled feature
>>> roc_score = random_sort_shaps(shap_values, expected_value, 'feature_1', y_test, link_function='logistic')
>>> print(roc_score)
Source code in src/aprofs/utils.py
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
def random_sort_shaps(
    shaps_values: pd.DataFrame,
    shap_expected_value: float,
    feature_name: str,
    y_target: Union[pd.Series, np.ndarray],
    link_model: LinkModels,
) -> float:
    """
    Measure model performance after shuffling one feature's SHAP column.

    The column `feature_name` of the SHAP values table is replaced by a
    random permutation of itself, approximate predictions are rebuilt from
    the perturbed table, and the resulting performance score (e.g. ROC AUC)
    is returned.

    Args:
        shaps_values (pd.DataFrame): The SHAP values DataFrame.
        shap_expected_value (float): The expected SHAP value.
        feature_name (str): The name of the feature to shuffle.
        y_target (Union[pd.Series, np.ndarray]): The target variable.
        link_model (aprofs model object): An object that allows to calculate the performance of the model.

    Returns:
        float: The performance score (e.g. ROC AUC).

    Examples:
        >>> shap_values, expected_value = get_shap_tree_values(X_test, model)
        >>> roc_score = random_sort_shaps(shap_values, expected_value, 'feature_1', y_test, link_model)
        >>> print(roc_score)
    """
    # Random permutation of every row, re-indexed positionally so a single
    # column can be assigned back row-by-row.
    shuffled = shaps_values.sample(frac=1)
    shuffled.reset_index(inplace=True, drop=True)

    perturbed = shaps_values.copy()
    perturbed.reset_index(inplace=True, drop=True)

    # Only the chosen feature's contributions are scrambled; all other
    # columns keep their original, row-aligned values.
    perturbed[feature_name] = shuffled[feature_name]
    approx_preds = calculate_all_row_sum(perturbed, shap_expected_value, link_model)

    return link_model.performance_fit(y_target, approx_preds)

random_sort_shaps_column(shaps_values, shap_mean_value, target_column, feature, link_model, original=False)

Randomly shuffles the values of a specific feature in the SHAP values DataFrame, calculates the row sum, and returns the ROC AUC score.

Parameters:

Name Type Description Default
shaps_values DataFrame

The SHAP values DataFrame.

required
shap_mean_value float

The mean SHAP value.

required
target_column Union[Series, ndarray]

The target variable.

required
feature str

The name of the feature to shuffle.

required
link_model aprofs model object

An object that allows to calculate the performance of the model.

required
original bool

Whether to use the original feature values or shuffled values. Defaults to False.

False

Returns:

Name Type Description
float float

The ROC AUC score.

Examples:

>>> import pandas as pd
>>> import numpy as np
>>> from sklearn.datasets import make_classification
>>> from sklearn.linear_model import LogisticRegression
>>> from sklearn.model_selection import train_test_split
>>> from sklearn.metrics import roc_auc_score
>>>
>>> # Generate synthetic data
>>> X, y = make_classification(n_samples=100, n_features=5, random_state=42)
>>> X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
>>>
>>> # Train a logistic regression model
>>> model = LogisticRegression()
>>> model.fit(X_train, y_train)
>>>
>>> # Calculate SHAP values and expected value
>>> shap_values, expected_value = get_shap_tree_values(X_test, model)
>>>
>>> # Calculate ROC AUC score with shuffled feature
>>> roc_score = random_sort_shaps_column(shap_values, expected_value, y_test, 'feature_1', link_function='logistic')
>>> print(roc_score)
Source code in src/aprofs/utils.py
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
def random_sort_shaps_column(  # pylint: disable=too-many-arguments
    shaps_values: pd.DataFrame,
    shap_mean_value: float,
    target_column: Union[pd.Series, np.ndarray],
    feature: str,
    link_model: LinkModels,
    original: bool = False,
) -> float:
    """
    Measure the standalone contribution of a single feature's SHAP column.

    Every column of the SHAP table is flattened to its column mean, except
    `feature`, which keeps either its original values (original=True) or a
    random permutation of them (original=False). Approximate predictions are
    rebuilt from this table and the performance score is returned.

    Args:
        shaps_values (pd.DataFrame): The SHAP values DataFrame.
        shap_mean_value (float): The mean SHAP value.
        target_column (Union[pd.Series, np.ndarray]): The target variable.
        feature (str): The name of the feature to shuffle.
        link_model (aprofs model object): An object that allows to calculate the performance of the model.
        original (bool, optional): Whether to use the original feature values or shuffled values. Defaults to False.

    Returns:
        float: The performance score (e.g. ROC AUC).

    Examples:
        >>> shap_values, expected_value = get_shap_tree_values(X_test, model)
        >>> roc_score = random_sort_shaps_column(shap_values, expected_value, y_test, 'feature_1', link_model)
        >>> print(roc_score)
    """
    # Row-aligned copy of the untouched SHAP values.
    kept = shaps_values.copy()
    kept.reset_index(inplace=True, drop=True)

    # Randomly permuted version of the same table (always computed, so RNG
    # state consumption does not depend on `original`).
    permuted = shaps_values.sample(frac=1)
    permuted.reset_index(inplace=True, drop=True)

    # Neutral table: each column replaced by its mean, removing per-row
    # segmentation from every feature.
    column_means = shaps_values.mean()
    neutral = shaps_values.copy()
    neutral.reset_index(inplace=True, drop=True)
    for column in neutral.columns:
        neutral[column] = column_means[column]

    # Restore exactly one column, either intact or shuffled.
    if original:
        neutral[feature] = kept[feature]
    else:
        neutral[feature] = permuted[feature]

    approx_preds = calculate_all_row_sum(neutral, shap_mean_value, link_model)

    return link_model.performance_fit(target_column, approx_preds)

temp_neutral_plot_data(aprofs_obj, features)

Generate a temporary DataFrame for plotting purposes.

Parameters:

Name Type Description Default
aprofs_obj Aprofs Object

An instance of the Aprofs class.

required
features List[str]

A list of feature names that will be neutralized. The Shapley values for these will be just the average values. This way we break the segmentation of the features while maintaining the global effect of all the others.

required

Returns:

Type Description
DataFrame

pd.DataFrame: The temporary DataFrame.

Examples:

>>> aprofs_obj = Aprofs Object(...)
>>> features = ['feature_1', 'feature_2']
>>> temp = temp_neutral_plot_data(aprofs_obj, features)
>>> print(temp.head())
Source code in src/aprofs/utils.py
687
688
689
690
691
692
693
694
695
696
697
698
699
700
701
702
703
704
705
706
707
708
709
710
711
712
713
714
715
716
717
718
719
720
721
722
723
724
725
726
727
def temp_neutral_plot_data(aprofs_obj, features: List[str]) -> pd.DataFrame:
    """
    Build a temporary DataFrame where the given features are "neutralized".

    The listed features' SHAP contributions are replaced by the average of
    their combined per-row contribution: their segmentation disappears while
    their overall (mean) effect is kept. All other features keep their SHAP
    values unchanged.

    Args:
        aprofs_obj (Aprofs Object): An instance of the Aprofs class.
        features (List[str]): Feature names to neutralize. A single name is
            also accepted and is wrapped into a list.

    Returns:
        pd.DataFrame: Columns "target", the raw values of each requested
        feature, "shap_other"/"shap_prob_other" (neutralized link and
        probability) and "shap_model"/"shap_prob_model" (full model).

    Examples:
        >>> temp = temp_neutral_plot_data(aprofs_obj, ['feature_1', 'feature_2'])
        >>> print(temp.head())
    """
    feature_list = features if isinstance(features, list) else [features]

    temp = pd.DataFrame({"target": aprofs_obj.target_column})

    # Attach raw feature values for binning/plotting on the x-axis.
    for name in feature_list:
        temp[name] = aprofs_obj.current_data[name].values

    shap_table = aprofs_obj.shap_values
    other_columns = [col for col in shap_table.columns if col not in feature_list]
    # Average of the neutralized features' combined contribution: keeps their
    # mean effect while removing per-row segmentation.
    neutral_offset = shap_table[feature_list].sum(axis=1).mean()

    temp["shap_other"] = aprofs_obj.shap_mean + shap_table[other_columns].sum(axis=1) + neutral_offset
    temp["shap_prob_other"] = 1.0 / (1.0 + np.exp(-temp["shap_other"]))
    temp["shap_model"] = aprofs_obj.shap_mean + shap_table.sum(axis=1)
    temp["shap_prob_model"] = 1.0 / (1.0 + np.exp(-temp["shap_model"]))

    return temp

temp_plot_compare_data(aprofs_obj_self, aprofs_obj, feature)

Generate a temporary DataFrame for plotting purposes.

Parameters:

Name Type Description Default
aprofs_obj_self Aprofs Object

An instance of the Aprofs class.

required
aprofs_obj Aprofs Object

An instance of the Aprofs class.

required
feature str

feature to compare.

required

Returns:

Type Description
DataFrame

pd.DataFrame: The temporary DataFrame.

Examples:

>>> aprofs_obj = Aprofs Object(...)
>>> aprofs_obj_2_compare = Aprofs Object(...)
>>> feature = 'feature_1'
>>> temp = temp_plot_compare_data(aprofs_obj, aprofs_obj_2_compare, feature)
>>> print(temp.head())
Source code in src/aprofs/utils.py
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
def temp_plot_compare_data(aprofs_obj_self, aprofs_obj, feature: str) -> pd.DataFrame:
    """
    Build a temporary DataFrame comparing one feature's SHAP effect across two models.

    Args:
        aprofs_obj_self (Aprofs Object): The reference Aprofs instance; supplies
            the target, the raw feature values and the first model's SHAP values.
        aprofs_obj (Aprofs Object): The Aprofs instance to compare against.
        feature (str): The feature to compare.

    Returns:
        pd.DataFrame: Columns "target", the raw feature values, the feature's
        marginal SHAP link/probability for both models, and the comparison
        model's full prediction ("shap_model"/"shap_prob_model").

    Examples:
        >>> feature = 'feature_1'
        >>> temp = temp_plot_compare_data(aprofs_obj, aprofs_obj_2_compare, feature)
        >>> print(temp.head())
    """

    temp = pd.DataFrame({"target": aprofs_obj_self.target_column})

    # Marginal effect of the feature under the first (self) model.
    temp[feature] = aprofs_obj_self.current_data[feature].values
    self_link = aprofs_obj_self.shap_mean + aprofs_obj_self.shap_values[feature].values
    temp[f"{feature}_shap"] = self_link
    temp[f"{feature}_shap_prob"] = 1.0 / (1.0 + np.exp(-self_link))

    # Marginal effect of the same feature under the comparison model.
    compare_link = aprofs_obj.shap_mean + aprofs_obj.shap_values[feature].values
    temp[f"{feature}_shap_compare"] = compare_link
    temp[f"{feature}_shap_prob_compare"] = 1.0 / (1.0 + np.exp(-compare_link))

    # Full prediction of the comparison model on the link and probability scales.
    temp["shap_model"] = aprofs_obj.shap_mean + aprofs_obj.shap_values.sum(axis=1)
    temp["shap_prob_model"] = 1.0 / (1.0 + np.exp(-temp["shap_model"]))

    return temp

temp_plot_data(aprofs_obj, features)

Generate a temporary DataFrame for plotting purposes.

Parameters:

Name Type Description Default
aprofs_obj Aprofs Object

An instance of the Aprofs class.

required
features List[str]

A list of feature names.

required

Returns:

Type Description
DataFrame

pd.DataFrame: The temporary DataFrame.

Examples:

>>> aprofs_obj = Aprofs Object(...)
>>> features = ['feature_1', 'feature_2']
>>> temp = temp_plot_data(aprofs_obj, features)
>>> print(temp.head())
Source code in src/aprofs/utils.py
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
def temp_plot_data(aprofs_obj, features: List[str]) -> pd.DataFrame:
    """
    Build a temporary DataFrame with per-feature SHAP marginal effects.

    Args:
        aprofs_obj (Aprofs Object): An instance of the Aprofs class.
        features (List[str]): Feature names to include. A single name is
            also accepted and is wrapped into a list.

    Returns:
        pd.DataFrame: Columns "target", then for each feature its raw values,
        "<feature>_shap" and "<feature>_shap_prob"; plus the combined effect
        of all remaining features ("shap_other"/"shap_prob_other") and the
        full model prediction ("shap_model"/"shap_prob_model").

    Examples:
        >>> temp = temp_plot_data(aprofs_obj, ['feature_1', 'feature_2'])
        >>> print(temp.head())
    """
    feature_list = features if isinstance(features, list) else [features]

    temp = pd.DataFrame({"target": aprofs_obj.target_column})

    shap_mean = aprofs_obj.shap_mean
    shap_table = aprofs_obj.shap_values

    # Per-feature marginal effect on the link and probability scales.
    for name in feature_list:
        temp[name] = aprofs_obj.current_data[name].values
        marginal = shap_mean + shap_table[name].values
        temp[f"{name}_shap"] = marginal
        temp[f"{name}_shap_prob"] = 1.0 / (1.0 + np.exp(-marginal))

    # Combined contribution of every feature NOT in the requested list.
    remaining = [col for col in shap_table.columns if col not in feature_list]
    temp["shap_other"] = shap_mean + shap_table[remaining].sum(axis=1)
    temp["shap_prob_other"] = 1.0 / (1.0 + np.exp(-temp["shap_other"]))
    temp["shap_model"] = shap_mean + shap_table.sum(axis=1)
    temp["shap_prob_model"] = 1.0 / (1.0 + np.exp(-temp["shap_model"]))

    return temp