Source code for pyod.models.iforest

# -*- coding: utf-8 -*-
"""IsolationForest Outlier Detector. Implemented on scikit-learn library.
"""
# Author: Yue Zhao <zhaoy@cmu.edu>
# License: BSD 2 clause

from __future__ import division
from __future__ import print_function

from sklearn.ensemble import IsolationForest
from sklearn.utils.validation import check_is_fitted
from sklearn.utils import check_array

from .base import BaseDetector
from ..utils.utility import invert_order
# noinspection PyProtectedMember
from ..utils.utility import _get_sklearn_version


# TODO: behavior of Isolation Forest will change in sklearn 0.22. See below.
# in 0.22, scikit learn will start adjust decision_function values by
# offset to make the values below zero as outliers. In other words, it is
# an absolute shift, which SHOULD NOT affect the result of PyOD at all as
# the order is still preserved.

# Behaviour of the decision_function which can be either ‘old’ or ‘new’.
# Passing behaviour='new' makes the decision_function change to match other
# anomaly detection algorithm API which will be the default behaviour in the
# future. As explained in details in the offset_ attribute documentation,
# the decision_function becomes dependent on the contamination parameter,
# in such a way that 0 becomes its natural threshold to detect outliers.

# offset_ : float
# Offset used to define the decision function from the raw scores.
# We have the relation: decision_function = score_samples - offset_.
# Assuming behaviour == ‘new’, offset_ is defined as follows.
# When the contamination parameter is set to “auto”,
# the offset is equal to -0.5 as the scores of inliers are close to 0 and the
# scores of outliers are close to -1. When a contamination parameter different
# than “auto” is provided, the offset is defined in such a way we obtain the
# expected number of outliers (samples with decision function < 0) in training.
# Assuming the behaviour parameter is set to ‘old’,
# we always have offset_ = -0.5, making the decision function independent from
# the contamination parameter.

# check https://scikit-learn.org/stable/modules/generated/sklearn.ensemble.IsolationForest.html for more information


[docs]class IForest(BaseDetector): """Wrapper of scikit-learn Isolation Forest with more functionalities. The IsolationForest 'isolates' observations by randomly selecting a feature and then randomly selecting a split value between the maximum and minimum values of the selected feature. See :cite:`liu2008isolation,liu2012isolation` for details. Since recursive partitioning can be represented by a tree structure, the number of splittings required to isolate a sample is equivalent to the path length from the root node to the terminating node. This path length, averaged over a forest of such random trees, is a measure of normality and our decision function. Random partitioning produces noticeably shorter paths for anomalies. Hence, when a forest of random trees collectively produce shorter path lengths for particular samples, they are highly likely to be anomalies. Parameters ---------- n_estimators : int, optional (default=100) The number of base estimators in the ensemble. max_samples : int or float, optional (default="auto") The number of samples to draw from X to train each base estimator. - If int, then draw `max_samples` samples. - If float, then draw `max_samples * X.shape[0]` samples. - If "auto", then `max_samples=min(256, n_samples)`. If max_samples is larger than the number of samples provided, all samples will be used for all trees (no sampling). contamination : float in (0., 0.5), optional (default=0.1) The amount of contamination of the data set, i.e. the proportion of outliers in the data set. Used when fitting to define the threshold on the decision function. max_features : int or float, optional (default=1.0) The number of features to draw from X to train each base estimator. - If int, then draw `max_features` features. - If float, then draw `max_features * X.shape[1]` features. bootstrap : bool, optional (default=False) If True, individual trees are fit on random subsets of the training data sampled with replacement. If False, sampling without replacement is performed. n_jobs : integer, optional (default=1) The number of jobs to run in parallel for both `fit` and `predict`. If -1, then the number of jobs is set to the number of cores. behaviour : str, default='old' Behaviour of the ``decision_function`` which can be either 'old' or 'new'. Passing ``behaviour='new'`` makes the ``decision_function`` change to match other anomaly detection algorithm API which will be the default behaviour in the future. As explained in details in the ``offset_`` attribute documentation, the ``decision_function`` becomes dependent on the contamination parameter, in such a way that 0 becomes its natural threshold to detect outliers. .. versionadded:: 0.7.0 ``behaviour`` is added in 0.7.0 for back-compatibility purpose. .. deprecated:: 0.20 ``behaviour='old'`` is deprecated in sklearn 0.20 and will not be possible in 0.22. .. deprecated:: 0.22 ``behaviour`` parameter will be deprecated in sklearn 0.22 and removed in 0.24. .. warning:: Only applicable for sklearn 0.20 above. random_state : int, RandomState instance or None, optional (default=None) If int, random_state is the seed used by the random number generator; If RandomState instance, random_state is the random number generator; If None, the random number generator is the RandomState instance used by `np.random`. verbose : int, optional (default=0) Controls the verbosity of the tree building process. Attributes ---------- estimators_ : list of DecisionTreeClassifier The collection of fitted sub-estimators. estimators_samples_ : list of arrays The subset of drawn samples (i.e., the in-bag samples) for each base estimator. max_samples_ : integer The actual number of samples decision_scores_ : numpy array of shape (n_samples,) The outlier scores of the training data. The higher, the more abnormal. Outliers tend to have higher scores. This value is available once the detector is fitted. threshold_ : float The threshold is based on ``contamination``. It is the ``n_samples * contamination`` most abnormal samples in ``decision_scores_``. The threshold is calculated for generating binary outlier labels. labels_ : int, either 0 or 1 The binary labels of the training data. 0 stands for inliers and 1 for outliers/anomalies. It is generated by applying ``threshold_`` on ``decision_scores_``. """ def __init__(self, n_estimators=100, max_samples="auto", contamination=0.1, max_features=1., bootstrap=False, n_jobs=1, behaviour='old', random_state=None, verbose=0): super(IForest, self).__init__(contamination=contamination) self.n_estimators = n_estimators self.max_samples = max_samples self.max_features = max_features self.bootstrap = bootstrap self.n_jobs = n_jobs self.behaviour = behaviour self.random_state = random_state self.verbose = verbose
[docs] def fit(self, X, y=None): """Fit detector. y is ignored in unsupervised methods. Parameters ---------- X : numpy array of shape (n_samples, n_features) The input samples. y : Ignored Not used, present for API consistency by convention. Returns ------- self : object Fitted estimator. """ # validate inputs X and y (optional) X = check_array(X) self._set_n_classes(y) # In sklearn 0.20+ new behaviour is added (arg behaviour={'new','old'}) # to IsolationForest that shifts the location of the anomaly scores # noinspection PyProtectedMember sklearn_version = _get_sklearn_version() if sklearn_version == 21: self.detector_ = IsolationForest(n_estimators=self.n_estimators, max_samples=self.max_samples, contamination=self.contamination, max_features=self.max_features, bootstrap=self.bootstrap, n_jobs=self.n_jobs, behaviour=self.behaviour, random_state=self.random_state, verbose=self.verbose) # Do not pass behaviour argument when sklearn version is < 0.20 or >0.21 else: # pragma: no cover self.detector_ = IsolationForest(n_estimators=self.n_estimators, max_samples=self.max_samples, contamination=self.contamination, max_features=self.max_features, bootstrap=self.bootstrap, n_jobs=self.n_jobs, random_state=self.random_state, verbose=self.verbose) self.detector_.fit(X=X, y=None, sample_weight=None) # invert decision_scores_. Outliers comes with higher outlier scores. self.decision_scores_ = invert_order( self.detector_.decision_function(X)) self._process_decision_scores() return self
[docs] def decision_function(self, X): """Predict raw anomaly score of X using the fitted detector. The anomaly score of an input sample is computed based on different detector algorithms. For consistency, outliers are assigned with larger anomaly scores. Parameters ---------- X : numpy array of shape (n_samples, n_features) The training input samples. Sparse matrices are accepted only if they are supported by the base estimator. Returns ------- anomaly_scores : numpy array of shape (n_samples,) The anomaly score of the input samples. """ check_is_fitted(self, ['decision_scores_', 'threshold_', 'labels_']) # invert outlier scores. Outliers comes with higher outlier scores return invert_order(self.detector_.decision_function(X))
@property def estimators_(self): """The collection of fitted sub-estimators. Decorator for scikit-learn Isolation Forest attributes. """ return self.detector_.estimators_ @property def estimators_samples_(self): """The subset of drawn samples (i.e., the in-bag samples) for each base estimator. Decorator for scikit-learn Isolation Forest attributes. """ return self.detector_.estimators_samples_ @property def max_samples_(self): """The actual number of samples. Decorator for scikit-learn Isolation Forest attributes. """ return self.detector_.max_samples_