Source code for pyod.models.sos

# -*- coding: utf-8 -*-
"""Stochastic Outlier Selection (SOS).
Part of the code is adapted from https://github.com/jeroenjanssens/scikit-sos
"""
# Author: Winston Li <jk_zhengli@hotmail.com>
# License: BSD 2 clause

from __future__ import division
from __future__ import print_function

import numpy as np
from numba import njit
from sklearn.utils import check_array
from sklearn.utils.validation import check_is_fitted

from .base import BaseDetector


@njit
def _get_perplexity(D, beta):
    """Compute the perplexity and the A-row for a specific value of the
    precision of a Gaussian distribution.

    Parameters
    ----------
    D : array, shape (n_samples, )
        The dissimilarity matrix of the training samples.
    """

    A = np.exp(-D * beta)
    sumA = np.sum(A)
    H = np.log(sumA) + beta * np.sum(D * A) / sumA
    return H, A

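# For intuition (illustrative numbers, not part of the library's tests):
# for D = [0., 1., 4.] and beta = 1.0, A = exp(-D) ≈ [1.0, 0.368, 0.018],
# sumA ≈ 1.386, and H ≈ 0.327 + 0.318 ≈ 0.645, so the perplexity
# exp(H) ≈ 1.9, i.e. roughly 1.9 "effective neighbours" at this precision.
# ``SOS._d2a`` below binary-searches beta until exp(H) matches the
# requested perplexity.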

class SOS(BaseDetector):
    """Stochastic Outlier Selection.

    SOS employs the concept of affinity to quantify the relationship from
    one data point to another. Affinity is proportional to the similarity
    between two data points, so a data point has little affinity with a
    dissimilar data point. A data point is selected as an outlier when all
    the other data points have insufficient affinity with it.
    Read more in :cite:`janssens2012stochastic`.

    Parameters
    ----------
    contamination : float in (0., 0.5), optional (default=0.1)
        The amount of contamination of the data set, i.e. the proportion
        of outliers in the data set. Used when fitting to define the
        threshold on the decision function.

    perplexity : float, optional (default=4.5)
        A smooth measure of the effective number of neighbours. The
        perplexity parameter is similar to the parameter ``k`` in the kNN
        algorithm (the number of nearest neighbors). The perplexity can be
        any real number between 1 and n-1, where ``n`` is the number of
        samples.

    metric : str, optional (default='euclidean')
        Metric used for the distance computation. Any metric from
        scipy.spatial.distance can be used. Valid values for metric are:

        - 'euclidean'
        - from scipy.spatial.distance: ['braycurtis', 'canberra',
          'chebyshev', 'correlation', 'dice', 'hamming', 'jaccard',
          'kulsinski', 'mahalanobis', 'matching', 'minkowski',
          'rogerstanimoto', 'russellrao', 'seuclidean', 'sokalmichener',
          'sokalsneath', 'sqeuclidean', 'yule']

        See the documentation for scipy.spatial.distance for details on
        these metrics:
        http://docs.scipy.org/doc/scipy/reference/spatial.distance.html

    eps : float, optional (default=1e-5)
        Tolerance threshold for floating point errors.

    Attributes
    ----------
    decision_scores_ : numpy array of shape (n_samples,)
        The outlier scores of the training data. The higher, the more
        abnormal. Outliers tend to have higher scores. This value is
        available once the detector is fitted.

    threshold_ : float
        The threshold is based on ``contamination``. It is the
        ``n_samples * contamination`` most abnormal samples in
        ``decision_scores_``. The threshold is calculated for generating
        binary outlier labels.

    labels_ : int, either 0 or 1
        The binary labels of the training data. 0 stands for inliers
        and 1 for outliers/anomalies. It is generated by applying
        ``threshold_`` on ``decision_scores_``.

    Examples
    --------
    >>> from pyod.models.sos import SOS
    >>> from pyod.utils.data import generate_data
    >>> n_train = 50
    >>> n_test = 50
    >>> contamination = 0.1
    >>> X_train, y_train, X_test, y_test = generate_data(
    ...     n_train=n_train, n_test=n_test,
    ...     contamination=contamination, random_state=42)
    >>>
    >>> clf = SOS()
    >>> clf.fit(X_train)
    SOS(contamination=0.1, eps=1e-05, metric='euclidean', perplexity=4.5)
    """

    def __init__(self, contamination=0.1, perplexity=4.5,
                 metric='euclidean', eps=1e-5):
        super(SOS, self).__init__(contamination=contamination)
        self.perplexity = perplexity
        self.metric = metric
        self.eps = eps

    def _x2d(self, X):
        """Computes the dissimilarity matrix of a given dataset.

        Parameters
        ----------
        X : array-like, shape (n_samples, n_features)
            The query sample or samples for which to compute the
            dissimilarity matrix w.r.t. the training samples.

        Returns
        -------
        D : array, shape (n_samples, n_samples)
            Returns the dissimilarity matrix.
        """
        (n, d) = X.shape
        metric = self.metric.lower()
        if metric == 'none':
            if n != d:
                raise ValueError(
                    "If you specify 'none' as the metric, the data set "
                    "should be a square dissimilarity matrix")
            else:
                D = X
        elif metric == 'euclidean':
            sumX = np.sum(np.square(X), 1)
            # np.abs protects against extremely small negative values
            # that may arise due to floating point arithmetic errors
            D = np.sqrt(
                np.abs(np.add(np.add(-2 * np.dot(X, X.T), sumX).T, sumX)))
        else:
            try:
                from scipy.spatial import distance
            except ImportError:
                raise ImportError(
                    "Please install scipy if you wish to use a metric "
                    "other than 'euclidean' or 'none'")
            else:
                D = distance.squareform(distance.pdist(X, metric))
        return D

    def _d2a(self, D):
        """Performs a binary search to get affinities in such a way that
        each conditional Gaussian has the same perplexity, then returns
        the affinity matrix.

        Parameters
        ----------
        D : array, shape (n_samples, n_samples)
            The dissimilarity matrix of the training samples.

        Returns
        -------
        A : array, shape (n_samples, n_samples)
            Returns the affinity matrix.
        """
        (n, _) = D.shape
        A = np.zeros((n, n))
        beta = np.ones((n, 1))
        logU = np.log(self.perplexity)

        for i in range(n):
            # Compute the Gaussian kernel and entropy for the current
            # precision
            betamin = -np.inf
            betamax = np.inf
            Di = D[i, np.concatenate((np.r_[0:i], np.r_[i + 1:n]))]
            (H, thisA) = _get_perplexity(Di, beta[i])

            # Evaluate whether the perplexity is within tolerance
            Hdiff = H - logU
            tries = 0
            while (np.isnan(Hdiff) or np.abs(Hdiff) > self.eps) \
                    and tries < 5000:
                if np.isnan(Hdiff):
                    beta[i] = beta[i] / 10.0
                # If not, increase or decrease precision
                elif Hdiff > 0:
                    betamin = beta[i].copy()
                    if betamax == np.inf or betamax == -np.inf:
                        beta[i] = beta[i] * 2.0
                    else:
                        beta[i] = (beta[i] + betamax) / 2.0
                else:
                    betamax = beta[i].copy()
                    if betamin == np.inf or betamin == -np.inf:
                        beta[i] = beta[i] / 2.0
                    else:
                        beta[i] = (beta[i] + betamin) / 2.0

                # Recompute the values
                (H, thisA) = _get_perplexity(Di, beta[i])
                Hdiff = H - logU
                tries += 1

            # Set the final row of A
            A[i, np.concatenate((np.r_[0:i], np.r_[i + 1:n]))] = thisA

        return A

    def _a2b(self, A):
        """Computes the binding probabilities of a given affinity matrix.

        Parameters
        ----------
        A : array, shape (n_samples, n_samples)
            The affinity matrix.

        Returns
        -------
        B : array, shape (n_samples, n_samples)
            Returns the matrix of binding probabilities.
        """
        B = A / A.sum(axis=1)[:, np.newaxis]
        return B

    def _b2o(self, B):
        """Computes the outlier probabilities from a given matrix of
        binding probabilities.

        Parameters
        ----------
        B : array, shape (n_samples, n_samples)
            The matrix of binding probabilities.

        Returns
        -------
        O : array, shape (n_samples,)
            Returns the outlier probabilities.
        """
        O = np.prod(1 - B, 0)
        return O

    def fit(self, X, y=None):
        """Fit detector. y is ignored in unsupervised methods.

        Parameters
        ----------
        X : numpy array of shape (n_samples, n_features)
            The input samples.

        y : Ignored
            Not used, present for API consistency by convention.

        Returns
        -------
        self : object
            Fitted estimator.
        """
        X = check_array(X)
        self._set_n_classes(y)
        D = self._x2d(X)
        A = self._d2a(D)
        B = self._a2b(A)
        O = self._b2o(B)

        # The outlier probabilities already follow PyOD's convention:
        # higher scores indicate more abnormal samples, so they are
        # used directly as decision scores
        self.decision_scores_ = O
        self._process_decision_scores()
        return self

    def decision_function(self, X):
        """Predict raw anomaly score of X using the fitted detector.

        The anomaly score of an input sample is computed based on different
        detector algorithms. For consistency, outliers are assigned with
        larger anomaly scores.

        Parameters
        ----------
        X : numpy array of shape (n_samples, n_features)
            The input samples. Sparse matrices are accepted only if they
            are supported by the base estimator.

        Returns
        -------
        anomaly_scores : numpy array of shape (n_samples,)
            The anomaly score of the input samples.
        """
        check_is_fitted(self, ['decision_scores_', 'threshold_', 'labels_'])
        X = check_array(X)
        D = self._x2d(X)
        A = self._d2a(D)
        B = self._a2b(A)
        O = self._b2o(B)
        return O
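

# A minimal end-to-end sketch, illustrative only and not part of the
# upstream module. It assumes the file is run as ``python -m pyod.models.sos``
# so that the relative import of ``BaseDetector`` above resolves.
if __name__ == "__main__":
    rng = np.random.RandomState(42)
    X_train = rng.randn(50, 2)
    X_train[:5] += 8  # plant five obvious outliers far from the main cloud

    clf = SOS(contamination=0.1, perplexity=4.5)
    clf.fit(X_train)
    # The planted points should receive the highest outlier probabilities
    # and therefore be flagged with label 1.
    print("scores:", clf.decision_scores_[:5])
    print("labels:", clf.labels_[:5])

    # Note: decision_function() reruns the full SOS pipeline on the query
    # set alone, so these scores are relative to the query set itself,
    # not to the training data.
    X_test = rng.randn(10, 2)
    print("test scores:", clf.decision_function(X_test))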