Source code for pyod.models.suod

# -*- coding: utf-8 -*-
"""SUOD
"""
# Author: Yue Zhao <yzhao062@gmail.com>
# License: BSD 2 clause


import numpy as np
from sklearn.utils import check_array
from sklearn.utils.validation import check_is_fitted

# `suod` is an optional dependency. Make the failure mode loud and useful
# instead of silently `print`-ing then crashing on the next line with a
# bare ImportError that doesn't tell the user how to fix it (issue #640).
try:
    from suod.models.base import SUOD as SUOD_model
except ImportError as _suod_import_error:  # pragma: no cover - import guard
    SUOD_model = None
    _SUOD_IMPORT_ERROR = _suod_import_error
else:
    _SUOD_IMPORT_ERROR = None

from .base import BaseDetector
from .lof import LOF
from .hbos import HBOS
from .iforest import IForest
from .copod import COPOD
from .combination import average, maximization
from ..utils.utility import standardizer



[docs]
class SUOD(BaseDetector):
    # noinspection PyPep8
    """SUOD (Scalable Unsupervised Outlier Detection) is an acceleration
    framework for large scale unsupervised outlier detector training and
    prediction. See :cite:`zhao2021suod` for details.

    Parameters
    ----------
    base_estimators : list, length must be greater than 1
        A list of base estimators. Certain methods must be present, e.g.,
        `fit` and `predict`.

    combination : str, optional (default='average')
        Decide how to aggregate the results from multiple models:

        - "average" : average the results from all base detectors
        - "maximization" : output the max value across all base detectors

    contamination : float in (0., 0.5), optional (default=0.1)
        The amount of contamination of the data set,
        i.e. the proportion of outliers in the data set. Used when fitting to
        define the threshold on the decision function.

    n_jobs : optional (default=1)
        The number of jobs to run in parallel for both `fit` and
        `predict`. If -1, then the number of jobs is set to the
        the number of jobs that can actually run in parallel.

    rp_clf_list : list, optional (default=None)
        The list of outlier detection models to use random projection. The
        detector name should be consistent with PyOD.

    rp_ng_clf_list : list, optional (default=None)
        The list of outlier detection models NOT to use random projection. The
        detector name should be consistent with PyOD.

    rp_flag_global : bool, optional (default=True)
        If set to False, random projection is turned off for all base models.

    target_dim_frac : float in (0., 1), optional (default=0.5)
        The target compression ratio.

    jl_method : string, optional (default = 'basic')
        The JL projection method:

        - "basic": each component of the transformation matrix is taken at
          random in N(0,1).
        - "discrete", each component of the transformation matrix is taken at
          random in {-1,1}.
        - "circulant": the first row of the transformation matrix is taken at
          random in N(0,1), and each row is obtained from the previous one
          by a one-left shift.
        - "toeplitz": the first row and column of the transformation matrix
          is taken at random in N(0,1), and each diagonal has a constant value
          taken from these first vector.

    bps_flag : bool, optional (default=True)
        If set to False, balanced parallel scheduling is turned off.

    approx_clf_list : list, optional (default=None)
        The list of outlier detection models to use pseudo-supervised
        approximation. The detector name should be consistent with PyOD.

    approx_ng_clf_list : list, optional (default=None)
        The list of outlier detection models NOT to use pseudo-supervised
        approximation. The detector name should be consistent with PyOD.

    approx_flag_global : bool, optional (default=True)
        If set to False, pseudo-supervised approximation is turned off.

    approx_clf : object, optional (default: sklearn RandomForestRegressor)
        The supervised model used to approximate unsupervised models.

    cost_forecast_loc_fit : str, optional
        The location of the pretrained cost prediction forecast for training.

    cost_forecast_loc_pred : str, optional
        The location of the pretrained cost prediction forecast for prediction.

    verbose : int, optional (default=0)
        Controls the verbosity of the building process.

    Attributes
    ----------
    decision_scores_ : numpy array of shape (n_samples,)
        The outlier scores of the training data.
        The higher, the more abnormal. Outliers tend to have higher
        scores. This value is available once the detector is
        fitted.

    threshold_ : float
        The threshold is based on ``contamination``. It is the
        ``n_samples * contamination`` most abnormal samples in
        ``decision_scores_``. The threshold is calculated for generating
        binary outlier labels.

    labels_ : int, either 0 or 1
        The binary labels of the training data. 0 stands for inliers
        and 1 for outliers/anomalies. It is generated by applying
        ``threshold_`` on ``decision_scores_``.
    """

    def __init__(self, base_estimators=None, contamination=0.1,
                 combination='average', n_jobs=None,
                 rp_clf_list=None, rp_ng_clf_list=None, rp_flag_global=True,
                 target_dim_frac=0.5, jl_method='basic', bps_flag=True,
                 approx_clf_list=None, approx_ng_clf_list=None,
                 approx_flag_global=True, approx_clf=None,
                 verbose=False):
        # Issue #640: defer the optional `suod` dependency check until
        # the user actually constructs SUOD, with an actionable message
        # pointing at `pip install suod`. Importing pyod.models.suod
        # without the package installed must remain side-effect-free
        # (the symbol is exposed for type hints / introspection).
        if SUOD_model is None:
            raise ImportError(
                "pyod.models.suod requires the optional `suod` package. "
                "Install it with `pip install suod` and retry. "
                "Original error: {}".format(_SUOD_IMPORT_ERROR)
            )
        super(SUOD, self).__init__(contamination=contamination)
        self.base_estimators = base_estimators
        self.contamination = contamination
        self.combination = combination
        self.n_jobs = n_jobs
        self.rp_clf_list = rp_clf_list
        self.rp_ng_clf_list = rp_ng_clf_list
        self.rp_flag_global = rp_flag_global
        self.target_dim_frac = target_dim_frac
        self.jl_method = jl_method
        self.bps_flag = bps_flag
        self.approx_clf_list = approx_clf_list
        self.approx_ng_clf_list = approx_ng_clf_list
        self.approx_flag_global = approx_flag_global
        self.approx_clf = approx_clf
        self.verbose = verbose

        # by default we will provide a group of performing models
        if self.base_estimators is None:
            self.base_estimators = [LOF(n_neighbors=15), LOF(n_neighbors=20),
                                    HBOS(n_bins=10), HBOS(n_bins=20),
                                    COPOD(), IForest(n_estimators=50),
                                    IForest(n_estimators=100),
                                    IForest(n_estimators=150)]

        self.n_estimators = len(self.base_estimators)

        # pass in the arguments for SUOD model
        self.model_ = SUOD_model(
            base_estimators=self.base_estimators,
            contamination=self.contamination,
            n_jobs=self.n_jobs,
            rp_clf_list=self.rp_clf_list,
            rp_ng_clf_list=self.rp_ng_clf_list,
            rp_flag_global=self.rp_flag_global,
            target_dim_frac=self.target_dim_frac,
            jl_method=self.jl_method,
            approx_clf_list=self.approx_clf_list,
            approx_ng_clf_list=self.approx_ng_clf_list,
            approx_flag_global=self.approx_flag_global,
            approx_clf=self.approx_clf,
            bps_flag=self.bps_flag,
            verbose=self.verbose,
        )


[docs]
    def fit(self, X, y=None):
        """Fit detector. y is ignored in unsupervised methods.

        Parameters
        ----------
        X : numpy array of shape (n_samples, n_features)
            The input samples.

        y : Ignored
            Not used, present for API consistency by convention.

        Returns
        -------
        self : object
            Fitted estimator.
        """

        # validate inputs X and y (optional)
        X = check_array(X)
        n_samples, n_features = X.shape[0], X.shape[1]
        self._set_n_classes(y)

        # fit the model and then approximate it
        self.model_.fit(X)
        self.model_.approximate(X)

        # get the decision scores from each base estimators
        decision_score_mat = np.zeros([n_samples, self.n_estimators])
        for i in range(self.n_estimators):
            decision_score_mat[:, i] = self.model_.base_estimators[
                i].decision_scores_

        # the scores must be standardized before combination
        decision_score_mat, self.score_scalar_ = standardizer(
            decision_score_mat, keep_scalar=True)

        # todo: may support other combination
        if self.combination == 'average':
            decision_score = average(decision_score_mat)
        else:
            decision_score = maximization(decision_score_mat)

        assert (len(decision_score) == n_samples)

        self.decision_scores_ = decision_score.ravel()
        self._process_decision_scores()

        return self



[docs]
    def decision_function(self, X):
        """Predict raw anomaly score of X using the fitted detectors.

        The anomaly score of an input sample is computed based on different
        detector algorithms. For consistency, outliers are assigned with
        larger anomaly scores.

        Parameters
        ----------
        X : numpy array of shape (n_samples, n_features)
            The training input samples. Sparse matrices are accepted only
            if they are supported by the base estimator.

        Returns
        -------
        anomaly_scores : numpy array of shape (n_samples,)
            The anomaly score of the input samples.
        """
        check_is_fitted(self, ['model_', 'decision_scores_',
                               'threshold_', 'labels_'])

        X = check_array(X)

        # initialize the output score
        predicted_scores = self.model_.decision_function(X)

        # standardize the score and combine
        predicted_scores = self.score_scalar_.transform(predicted_scores)

        # todo: may support other combination
        if self.combination == 'average':
            decision_score = average(predicted_scores)
        else:
            decision_score = maximization(predicted_scores)

        assert (len(decision_score) == X.shape[0])

        return decision_score.ravel()