# Source code for

# -*- coding: utf-8 -*-
"""Cook's distance outlier detection (CD)

# Author: D Kulik
# License: BSD 2 clause

from __future__ import division
from __future__ import print_function

import numpy as np
from sklearn.decomposition import PCA
from sklearn.linear_model import LinearRegression
from sklearn.utils import check_array
from sklearn.utils.validation import check_is_fitted

from .base import BaseDetector

def whiten_data(X, pca):
    """Decorrelate (whiten) the samples by projecting them through a PCA.

    Parameters
    ----------
    X : array-like of shape (n_samples, n_features)
        The samples to transform.

    pca : fitted sklearn.decomposition.PCA
        An already-fitted PCA instance; its ``transform`` is applied to X.

    Returns
    -------
    numpy array of shape (n_samples, n_components)
        The PCA-transformed samples.
    """
    return pca.transform(X)
[docs]def Cooks_dist(X, y, model): # Leverage is computed as the diagonal of the projection matrix of X leverage = (X * np.linalg.pinv(X).T).sum(1) # Compute the rank and the degrees of freedom of the model rank = np.linalg.matrix_rank(X) df = X.shape[0] - rank # Compute the MSE from the residuals residuals = y - model.predict(X) mse =, residuals) / df # Compute Cook's distance residuals_studentized = residuals / np.sqrt(mse) / np.sqrt(1 - leverage) distance_ = residuals_studentized ** 2 / X.shape[1] distance_ *= leverage / (1 - leverage) return distance_
class CD(BaseDetector):
    """Cook's distance outlier detection.

    Cook's distance can be used to identify points that negatively affect
    a regression model. A combination of each observation's leverage and
    residual values are used in the measurement. Higher leverage and
    residuals relate to higher Cook's distances.
    Read more in the :cite:`cook1977detection`.

    Parameters
    ----------
    whitening : bool, optional (default=True)
        Transform X to have a covariance matrix that is the identity
        matrix of 1 in the diagonal and 0 for the other cells using PCA.

    contamination : float in (0., 0.5), optional (default=0.1)
        The amount of contamination of the data set, i.e. the proportion
        of outliers in the data set. Used when fitting to define the
        threshold on the decision function.

    rule_of_thumb : bool, optional (default=False)
        To apply the rule of thumb prediction (4 / n) as the influence
        threshold, where n is the number of samples. This has been known
        to be a good estimate for values over this point as being
        outliers. Note: the contamination level is reset when
        ``rule_of_thumb`` is set to True.

    Attributes
    ----------
    decision_scores_ : numpy array of shape (n_samples,)
        The outlier scores of the training data. The higher, the more
        abnormal. Outliers tend to have higher scores. This value is
        available once the detector is fitted.

    threshold_ : float
        Threshold on ``decision_scores_`` separating inliers from
        outliers; set by ``_process_decision_scores()`` during ``fit``.

    labels_ : int, either 0 or 1
        The binary labels of the training data. 0 stands for inliers
        and 1 for outliers/anomalies. It is generated by applying
        ``threshold_`` on ``decision_scores_``.
    """

    def __init__(self, whitening=True, contamination=0.1,
                 rule_of_thumb=False):
        super(CD, self).__init__(contamination=contamination)
        self.whitening = whitening
        self.rule_of_thumb = rule_of_thumb

    def fit(self, X, y):
        """Fit detector. y is necessary for this supervised method.

        Parameters
        ----------
        X : numpy array of shape (n_samples, n_features)
            The independent input samples.

        y : numpy array of shape (n_samples,)
            The dependent/target values.

        Returns
        -------
        self : object
            The fitted estimator.
        """
        # Ordinary least squares model whose residuals drive the distances
        self.model = LinearRegression()

        # Validate inputs; a 1-D X is promoted to a single-column matrix
        try:
            X = check_array(X)
        except ValueError:
            X = X.reshape(-1, 1)
        y = np.squeeze(check_array(y, ensure_2d=False))
        self._set_n_classes(y)

        # Optionally whiten X with PCA; keep the fitted PCA so that
        # decision_function() can apply the identical transform later
        if self.whitening:
            self.pca = PCA(whiten=True)
            X = whiten_data(X, self.pca)

        # Fit the linear model to (possibly whitened) X and y, y)

        # Cook's distance of every training sample
        distance_ = Cooks_dist(X, y, self.model)

        # Rule of thumb: samples with influence above 4 / n are treated
        # as outliers, and the contamination level is reset to match
        if self.rule_of_thumb:
            influence_threshold_ = 4 / X.shape[0]
            self.contamination = sum(distance_ > influence_threshold_) / \
                X.shape[0]

        self.decision_scores_ = distance_
        self._process_decision_scores()
        return self

    def decision_function(self, X):
        """Predict raw anomaly score of X using the fitted detector.

        The anomaly score of an input sample is computed based on
        different detector algorithms. For consistency, outliers are
        assigned with larger anomaly scores.

        Parameters
        ----------
        X : numpy array of shape (n_samples, n_features)
            The independent and dependent/target samples with the target
            samples being the last column of the numpy array such that
            eg: X = np.append(x, y.reshape(-1,1), axis=1). Sparse
            matrices are accepted only if they are supported by the base
            estimator.

        Returns
        -------
        anomaly_scores : numpy array of shape (n_samples,)
            The anomaly score of the input samples.
        """
        check_is_fitted(self, ['decision_scores_', 'threshold_', 'labels_'])

        # Validate input; a 1-D X is promoted to a single-column matrix
        try:
            X = check_array(X)
        except ValueError:
            X = X.reshape(-1, 1)

        # The target values travel in the last column of X; split them off
        y = X[:, -1]
        X = X[:, :-1]

        # Apply the same whitening transform that was fitted in fit()
        if self.whitening:
            X = whiten_data(X, self.pca)

        return Cooks_dist(X, y, self.model)