Source code for pyod.models.cof

# -*- coding: utf-8 -*-
"""Connectivity-Based Outlier Factor (COF) Algorithm
"""
# Author: Yahya Almardeny <almardeny@gmail.com>, Roel Bouman <roel.bouman@ru.nl> (memory efficient COF)
# License: BSD 2 clause

from __future__ import division
from __future__ import print_function

import warnings
from operator import itemgetter

import numpy as np
from scipy.spatial import distance_matrix
from scipy.spatial import minkowski_distance
from sklearn.utils import check_array

from .base import BaseDetector
from ..utils.utility import check_parameter


[docs] class COF(BaseDetector): """Connectivity-Based Outlier Factor (COF) COF uses the ratio of average chaining distance of data point and the average of average chaining distance of k nearest neighbor of the data point, as the outlier score for observations. See :cite:`tang2002enhancing` for details. Two version of COF are supported: - Fast COF: computes the entire pairwise distance matrix at the cost of a O(n^2) memory requirement. - Memory efficient COF: calculates pairwise distances incrementally. Use this implementation when it is not feasible to fit the n-by-n distance in memory. This leads to a linear overhead because many distances will have to be recalculated. Parameters ---------- contamination : float in (0., 0.5), optional (default=0.1) The amount of contamination of the data set, i.e. the proportion of outliers in the data set. Used when fitting to define the threshold on the decision function. n_neighbors : int, optional (default=20) Number of neighbors to use by default for k neighbors queries. Note that n_neighbors should be less than the number of samples. If n_neighbors is larger than the number of samples provided, all samples will be used. method : string, optional (default='fast') Valid values for method are: - 'fast' Fast COF, computes the full pairwise distance matrix up front. - 'memory' Memory-efficient COF, computes pairwise distances only when needed at the cost of computational speed. Attributes ---------- decision_scores_ : numpy array of shape (n_samples,) The outlier scores of the training data. The higher, the more abnormal. Outliers tend to have higher scores. This value is available once the detector is fitted. threshold_ : float The threshold is based on ``contamination``. It is the ``n_samples * contamination`` most abnormal samples in ``decision_scores_``. The threshold is calculated for generating binary outlier labels. labels_ : int, either 0 or 1 The binary labels of the training data. 0 stands for inliers and 1 for outliers/anomalies. It is generated by applying ``threshold_`` on ``decision_scores_``. n_neighbors_: int Number of neighbors to use by default for k neighbors queries. """ def __init__(self, contamination=0.1, n_neighbors=20, method="fast"): super(COF, self).__init__(contamination=contamination) if isinstance(n_neighbors, int): check_parameter(n_neighbors, low=1, param_name='n_neighbors') else: raise TypeError( "n_neighbors should be int. Got %s" % type(n_neighbors)) self.n_neighbors = n_neighbors self.method = method
[docs] def fit(self, X, y=None): """Fit detector. y is ignored in unsupervised methods. Parameters ---------- X : numpy array of shape (n_samples, n_features) The input samples. y : Ignored Not used, present for API consistency by convention. Returns ------- self : object Fitted estimator. """ X = check_array(X) self.n_train_ = X.shape[0] self.n_neighbors_ = self.n_neighbors if self.n_neighbors_ >= self.n_train_: self.n_neighbors_ = self.n_train_ - 1 warnings.warn( "n_neighbors is set to the number of training points " "minus 1: {0}".format(self.n_neighbors_)) check_parameter(self.n_neighbors_, 1, self.n_train_, include_left=True, include_right=True) self._set_n_classes(y) self.decision_scores_ = self.decision_function(X) self._process_decision_scores() return self
[docs] def decision_function(self, X): """Predict raw anomaly score of X using the fitted detector. The anomaly score of an input sample is computed based on different detector algorithms. For consistency, outliers are assigned with larger anomaly scores. Parameters ---------- X : numpy array of shape (n_samples, n_features) The training input samples. Sparse matrices are accepted only if they are supported by the base estimator. Returns ------- anomaly_scores : numpy array of shape (n_samples,) The anomaly score of the input samples. """ if self.method.lower() == "fast": return self._cof_fast(X) elif self.method.lower() == "memory": return self._cof_memory(X) else: raise ValueError("method should be set to either \'fast\' or \'memory\'. Got %s" % self.method)
def _cof_memory(self, X): """ Connectivity-Based Outlier Factor (COF) Algorithm This function is called internally to calculate the Connectivity-Based Outlier Factor (COF) as an outlier score for observations. This function uses a memory efficient implementation at the cost of speed. :return: numpy array containing COF scores for observations. The greater the COF, the greater the outlierness. """ #dist_matrix = np.array(distance_matrix(X, X)) sbn_path_index = np.zeros((X.shape[0],self.n_neighbors_), dtype=np.int64) ac_dist, cof_ = np.zeros((X.shape[0])), np.zeros((X.shape[0])) for i in range(X.shape[0]): #sbn_path = np.argsort(dist_matrix[i]) sbn_path = np.argsort(minkowski_distance(X[i,:],X,p=2)) sbn_path_index[i,:] = sbn_path[1: self.n_neighbors_ + 1] cost_desc = np.zeros((self.n_neighbors_)) for j in range(self.n_neighbors_): #cost_desc.append( # np.min(dist_matrix[sbn_path[j + 1]][sbn_path][:j + 1])) cost_desc[j] = np.min(minkowski_distance(X[sbn_path[j + 1]],X,p=2)[sbn_path][:j + 1]) acd = np.zeros((self.n_neighbors_)) for _h, cost_ in enumerate(cost_desc): neighbor_add1 = self.n_neighbors_ + 1 acd[_h] = ((2. * (neighbor_add1 - (_h + 1))) / (neighbor_add1 * self.n_neighbors_)) * cost_ ac_dist[i] = np.sum(acd) for _g in range(X.shape[0]): cof_[_g] = (ac_dist[_g] * self.n_neighbors_) / np.sum(ac_dist[sbn_path_index[_g]]) return np.nan_to_num(cof_) def _cof_fast(self, X): """ Connectivity-Based Outlier Factor (COF) Algorithm This function is called internally to calculate the Connectivity-Based Outlier Factor (COF) as an outlier score for observations. This function uses a fast implementation at the cost of memory. :return: numpy array containing COF scores for observations. The greater the COF, the greater the outlierness. """ dist_matrix = np.array(distance_matrix(X, X)) sbn_path_index, ac_dist, cof_ = [], [], [] for i in range(X.shape[0]): sbn_path = np.argsort(dist_matrix[i]) sbn_path_index.append(sbn_path[1: self.n_neighbors_ + 1]) cost_desc = [] for j in range(self.n_neighbors_): cost_desc.append( np.min(dist_matrix[sbn_path[j + 1]][sbn_path][:j + 1])) acd = [] for _h, cost_ in enumerate(cost_desc): neighbor_add1 = self.n_neighbors_ + 1 acd.append(((2. * (neighbor_add1 - (_h + 1))) / ( neighbor_add1 * self.n_neighbors_)) * cost_) ac_dist.append(np.sum(acd)) for _g in range(X.shape[0]): cof_.append((ac_dist[_g] * self.n_neighbors_) / np.sum(itemgetter(*sbn_path_index[_g])(ac_dist))) return np.nan_to_num(cof_)