
# -*- coding: utf-8 -*-
"""
Outlier detection based on Gaussian Mixture Model (GMM).
"""
# Author: Akira Tamamori <tamamori5917@gmail.com>
# License: BSD 2 clause

from __future__ import division, print_function

from sklearn.mixture import GaussianMixture
from sklearn.utils import check_array
from sklearn.utils.validation import check_is_fitted

from pyod.models.base import BaseDetector
from pyod.utils.utility import invert_order


class GMM(BaseDetector):
    """Wrapper of scikit-learn Gaussian Mixture Model with more
    functionalities. Unsupervised Outlier Detection.

    See :cite:`aggarwal2015outlier` Chapter 2 for details.

    Parameters
    ----------
    n_components : int, default=1
        The number of mixture components.

    covariance_type : {'full', 'tied', 'diag', 'spherical'}, default='full'
        String describing the type of covariance parameters to use.

    tol : float, default=1e-3
        The convergence threshold. EM iterations will stop when the
        lower bound average gain is below this threshold.

    reg_covar : float, default=1e-6
        Non-negative regularization added to the diagonal of covariance.
        Allows to assure that the covariance matrices are all positive.

    max_iter : int, default=100
        The number of EM iterations to perform.

    n_init : int, default=1
        The number of initializations to perform. The best results are kept.

    init_params : {'kmeans', 'random'}, default='kmeans'
        The method used to initialize the weights, the means and the
        precisions.

    weights_init : array-like of shape (n_components, ), default=None
        The user-provided initial weights. If it is None, weights are
        initialized using the `init_params` method.

    means_init : array-like of shape (n_components, n_features), default=None
        The user-provided initial means. If it is None, means are
        initialized using the `init_params` method.

    precisions_init : array-like, default=None
        The user-provided initial precisions (inverse of the covariance
        matrices). If it is None, precisions are initialized using the
        `init_params` method.

    random_state : int, RandomState instance or None, default=None
        Controls the random seed given to the method chosen to initialize
        the parameters.

    warm_start : bool, default=False
        If `warm_start` is True, the solution of the last fitting is used
        as initialization for the next call of fit().

    contamination : float in (0., 0.5), optional (default=0.1)
        The amount of contamination of the data set, i.e. the proportion
        of outliers in the data set. Used when fitting to define the
        threshold on the decision function.

    Attributes
    ----------
    weights_ : array-like of shape (n_components,)
        The weights of each mixture component.

    means_ : array-like of shape (n_components, n_features)
        The mean of each mixture component.

    covariances_ : array-like
        The covariance of each mixture component.

    precisions_ : array-like
        The precision matrices for each component in the mixture.

    precisions_cholesky_ : array-like
        The Cholesky decomposition of the precision matrices of each
        mixture component.

    converged_ : bool
        True when convergence was reached in fit(), False otherwise.

    n_iter_ : int
        Number of steps used by the best fit of EM to reach convergence.

    lower_bound_ : float
        Lower bound value on the log-likelihood (of the training data with
        respect to the model) of the best fit of EM.

    decision_scores_ : numpy array of shape (n_samples,)
        The outlier scores of the training data. The higher, the more
        abnormal.

    threshold_ : float
        The threshold is based on ``contamination``. It is the
        ``n_samples * contamination`` most abnormal samples in
        ``decision_scores_``. The threshold is calculated for generating
        binary outlier labels.

    labels_ : int, either 0 or 1
        The binary labels of the training data. 0 stands for inliers and
        1 for outliers/anomalies. It is generated by applying
        ``threshold_`` on ``decision_scores_``.
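
    Examples
    --------
    A minimal sketch on synthetic data (illustrative values only):

    >>> import numpy as np
    >>> rng = np.random.RandomState(42)
    >>> X_train = rng.randn(100, 2)
    >>> clf = GMM(n_components=1, contamination=0.1, random_state=42)
    >>> _ = clf.fit(X_train)
    >>> scores = clf.decision_scores_  # higher score = more abnormal
    >>> labels = clf.labels_           # 1 marks the flagged outliers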
""" def __init__( self, n_components=1, covariance_type="full", tol=1e-3, reg_covar=1e-6, max_iter=100, n_init=1, init_params="kmeans", weights_init=None, means_init=None, precisions_init=None, random_state=None, warm_start=False, contamination=0.1, ): super().__init__(contamination=contamination) self.n_components = n_components self.covariance_type = covariance_type self.tol = tol self.reg_covar = reg_covar self.max_iter = max_iter self.n_init = n_init self.init_params = init_params self.weights_init = weights_init self.means_init = means_init self.precisions_init = precisions_init self.random_state = random_state self.warm_start = warm_start self.detector_ = None self.decision_scores_ = None
    def fit(self, X, y=None):
        """Fit detector. y is ignored in unsupervised methods.

        Parameters
        ----------
        X : numpy array of shape (n_samples, n_features)
            The input samples.

        y : Ignored
            Not used, present for API consistency by convention.

        Returns
        -------
        self : object
            Fitted estimator.
        """
        # validate inputs X and y (optional)
        X = check_array(X)
        self._set_n_classes(y)

        self.detector_ = GaussianMixture(
            n_components=self.n_components,
            covariance_type=self.covariance_type,
            tol=self.tol,
            reg_covar=self.reg_covar,
            max_iter=self.max_iter,
            n_init=self.n_init,
            init_params=self.init_params,
            weights_init=self.weights_init,
            means_init=self.means_init,
            precisions_init=self.precisions_init,
            random_state=self.random_state,
            warm_start=self.warm_start,
        )
        self.detector_.fit(X=X, y=y)

        # invert the scores from GaussianMixture.score_samples: there, a
        # higher log-likelihood means more normal, so negating it makes
        # outliers come with higher outlier scores.
        self.decision_scores_ = invert_order(self.detector_.score_samples(X))
        self._process_decision_scores()
        return self
    def decision_function(self, X):
        """Predict raw anomaly score of X using the fitted detector.

        The anomaly score of an input sample is computed based on different
        detector algorithms. For consistency, outliers are assigned with
        larger anomaly scores.

        Parameters
        ----------
        X : numpy array of shape (n_samples, n_features)
            The input samples. Sparse matrices are accepted only if they
            are supported by the base estimator.

        Returns
        -------
        anomaly_scores : numpy array of shape (n_samples,)
            The anomaly score of the input samples.
        """
        check_is_fitted(self, ["decision_scores_", "threshold_", "labels_"])

        # invert the log-likelihood so that outliers come with higher scores
        return invert_order(self.detector_.score_samples(X))
    @property
    def weights_(self):
        """The weights of each mixture component.
        Delegated to the underlying scikit-learn GaussianMixture.
        """
        return self.detector_.weights_

    @property
    def means_(self):
        """The mean of each mixture component.
        Delegated to the underlying scikit-learn GaussianMixture.
        """
        return self.detector_.means_

    @property
    def covariances_(self):
        """The covariance of each mixture component.
        Delegated to the underlying scikit-learn GaussianMixture.
        """
        return self.detector_.covariances_

    @property
    def precisions_(self):
        """The precision matrices for each component in the mixture.
        Delegated to the underlying scikit-learn GaussianMixture.
        """
        return self.detector_.precisions_

    @property
    def precisions_cholesky_(self):
        """The Cholesky decomposition of the precision matrices of each
        mixture component.
        Delegated to the underlying scikit-learn GaussianMixture.
        """
        return self.detector_.precisions_cholesky_

    @property
    def converged_(self):
        """True when convergence was reached in fit(), False otherwise.
        Delegated to the underlying scikit-learn GaussianMixture.
        """
        return self.detector_.converged_

    @property
    def n_iter_(self):
        """Number of steps used by the best fit of EM to reach convergence.
        Delegated to the underlying scikit-learn GaussianMixture.
        """
        return self.detector_.n_iter_

    @property
    def lower_bound_(self):
        """Lower bound value on the log-likelihood of the best fit of EM.
        Delegated to the underlying scikit-learn GaussianMixture.
        """
        return self.detector_.lower_bound_
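

if __name__ == "__main__":
    # Minimal usage sketch (illustration only): fit the detector on
    # synthetic 2-D data with a few injected outliers, then read the
    # training scores/labels and score unseen samples. Uses only numpy
    # and the public BaseDetector API (decision_function / predict).
    import numpy as np

    rng = np.random.RandomState(42)
    X_inliers = rng.randn(95, 2)                    # dense Gaussian cluster
    X_outliers = rng.uniform(-6., 6., size=(5, 2))  # scattered anomalies
    X_train = np.vstack([X_inliers, X_outliers])

    clf = GMM(n_components=1, contamination=0.05, random_state=42)
    clf.fit(X_train)

    print("training outlier scores:", clf.decision_scores_[:5])
    print("training labels (1 = outlier):", clf.labels_[:10])

    # score and label new, unseen samples with the fitted detector
    X_new = rng.randn(10, 2)
    print("new-sample scores:", clf.decision_function(X_new))
    print("new-sample labels:", clf.predict(X_new))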