Source code for pyod.models.thresholds

def AUCP(**kwargs):
    """Build an Area Under Curve Percentage (AUCP) thresholder.

    Factory around ``pythresh.thresholds.aucp.AUCP``. The thresholder
    uses the area under the curve to evaluate a non-parametric means to
    threshold scores generated by the decision_scores, where outliers
    are set to any value beyond where the AUC of the KDE is less than
    the (mean + abs(mean - median)) percent of the total KDE AUC.

    Parameters
    ----------
    **kwargs
        Keyword arguments passed unchanged to the pythresh class.

    Returns
    -------
    pythresh.thresholds.aucp.AUCP
        Thresholder instance constructed from ``kwargs``.
    """
    # Resolved at call time rather than at module import.
    from pythresh.thresholds.aucp import AUCP as _AUCP

    thresholder = _AUCP(**kwargs)
    return thresholder
def BOOT(**kwargs):
    """Build a Bootstrapping (BOOT) thresholder.

    Factory around ``pythresh.thresholds.boot.BOOT``. The thresholder
    uses a bootstrapping based method to find a non-parametric means to
    threshold scores generated by the decision_scores, where outliers
    are set to any value beyond the mean of the confidence intervals.

    Keyword arguments are passed unchanged to the pythresh class; the
    option below is the one documented for it.

    Parameters
    ----------
    random_state : int, optional (default=1234)
        Random seed for bootstrapping a confidence interval. Can also be
        set to None.

    Returns
    -------
    pythresh.thresholds.boot.BOOT
        Thresholder instance constructed from ``kwargs``.
    """
    # Resolved at call time rather than at module import.
    from pythresh.thresholds.boot import BOOT as _BOOT

    thresholder = _BOOT(**kwargs)
    return thresholder
def CHAU(**kwargs):
    """Build a Chauvenet's criterion (CHAU) thresholder.

    Factory around ``pythresh.thresholds.chau.CHAU``. The thresholder
    uses Chauvenet's criterion to evaluate a non-parametric means to
    threshold scores generated by the decision_scores, where outliers
    are set to any value below the Chauvenet's criterion.

    Keyword arguments are passed unchanged to the pythresh class; the
    option below is the one documented for it.

    Parameters
    ----------
    method : {'mean', 'median', 'gmean'}, optional (default='mean')
        Calculate the area normal to distance using a scaler

        - 'mean': Construct a scaler with the mean of the scores
        - 'median': Construct a scaler with the median of the scores
        - 'gmean': Construct a scaler with the geometric mean of the
          scores

    Returns
    -------
    pythresh.thresholds.chau.CHAU
        Thresholder instance constructed from ``kwargs``.
    """
    # Resolved at call time rather than at module import.
    from pythresh.thresholds.chau import CHAU as _CHAU

    thresholder = _CHAU(**kwargs)
    return thresholder
def CLF(**kwargs):
    """Build a Trained Classifier (CLF) thresholder.

    Factory around ``pythresh.thresholds.clf.CLF``. The thresholder uses
    the trained linear classifier to evaluate a non-parametric means to
    threshold scores generated by the decision_scores, where outliers
    are set to any value beyond 0.

    Keyword arguments are passed unchanged to the pythresh class; the
    option below is the one documented for it.

    Parameters
    ----------
    method : {'simple', 'complex'}, optional (default='complex')
        Type of linear model

        - 'simple': Uses only the scores
        - 'complex': Uses the scores, log of the scores, and the
          scores' PDF

    Returns
    -------
    pythresh.thresholds.clf.CLF
        Thresholder instance constructed from ``kwargs``.
    """
    # Resolved at call time rather than at module import.
    from pythresh.thresholds.clf import CLF as _CLF

    thresholder = _CLF(**kwargs)
    return thresholder
def CLUST(**kwargs):
    """Build a clustering type (CLUST) thresholder.

    Factory around ``pythresh.thresholds.clust.CLUST``. The thresholder
    uses clustering methods to evaluate a non-parametric means to
    threshold scores generated by the decision_scores, where outliers
    are set to any value not labelled as part of the main cluster.

    Keyword arguments are passed unchanged to the pythresh class; the
    options below are the ones documented for it.

    Parameters
    ----------
    method : {'agg', 'birch', 'bang', 'bgm', 'bsas', 'dbscan', 'ema',
              'kmeans', 'mbsas', 'mshift', 'optics', 'somsc', 'spec',
              'xmeans'}, optional (default='spec')
        Clustering method

        - 'agg': Agglomerative
        - 'birch': Balanced Iterative Reducing and Clustering using
          Hierarchies
        - 'bang': BANG
        - 'bgm': Bayesian Gaussian Mixture
        - 'bsas': Basic Sequential Algorithmic Scheme
        - 'dbscan': Density-based spatial clustering of applications
          with noise
        - 'ema': Expectation-Maximization clustering algorithm for
          Gaussian Mixture Model
        - 'kmeans': K-means
        - 'mbsas': Modified Basic Sequential Algorithmic Scheme
        - 'mshift': Mean shift
        - 'optics': Ordering Points To Identify Clustering Structure
        - 'somsc': Self-organized feature map
        - 'spec': Clustering to a projection of the normalized
          Laplacian
        - 'xmeans': X-means

    random_state : int, optional (default=1234)
        Random seed for the BayesianGaussianMixture clustering
        (method='bgm'). Can also be set to None.

    Returns
    -------
    pythresh.thresholds.clust.CLUST
        Thresholder instance constructed from ``kwargs``.
    """
    # Resolved at call time rather than at module import.
    from pythresh.thresholds.clust import CLUST as _CLUST

    thresholder = _CLUST(**kwargs)
    return thresholder
def CPD(**kwargs):
    """Build a Change Point Detection (CPD) thresholder.

    Factory around ``pythresh.thresholds.cpd.CPD``. The thresholder uses
    change point detection to find a non-parametric means to threshold
    scores generated by the decision_scores, where outliers are set to
    any value beyond the detected change point.

    Keyword arguments are passed unchanged to the pythresh class; the
    options below are the ones documented for it.

    Parameters
    ----------
    method : {'Dynp', 'KernelCPD', 'Binseg', 'BottomUp'}, optional
             (default='Dynp')
        Method for change point detection

        - 'Dynp': Dynamic programming (optimal minimum sum of errors
          per partition)
        - 'KernelCPD': RBF kernel function (optimal minimum sum of
          errors per partition)
        - 'Binseg': Binary segmentation
        - 'BottomUp': Bottom-up segmentation

    transform : {'cdf', 'kde'}, optional (default='cdf')
        Data transformation method prior to fit

        - 'cdf': Use the cumulative distribution function
        - 'kde': Use the kernel density estimation

    Returns
    -------
    pythresh.thresholds.cpd.CPD
        Thresholder instance constructed from ``kwargs``.
    """
    # Resolved at call time rather than at module import.
    from pythresh.thresholds.cpd import CPD as _CPD

    thresholder = _CPD(**kwargs)
    return thresholder
def DECOMP(**kwargs):
    """Build a Decomposition based (DECOMP) thresholder.

    Factory around ``pythresh.thresholds.decomp.DECOMP``. The
    thresholder uses decomposition to evaluate a non-parametric means to
    threshold scores generated by the decision_scores, where outliers
    are set to any value beyond the maximum of the decomposed matrix
    that results from decomposing the cumulative distribution function
    of the decision scores.

    Keyword arguments are passed unchanged to the pythresh class; the
    options below are the ones documented for it.

    Parameters
    ----------
    method : {'NMF', 'PCA', 'GRP', 'SRP'}, optional (default='PCA')
        Method to use for decomposition

        - 'NMF': Non-Negative Matrix Factorization
        - 'PCA': Principal Component Analysis
        - 'GRP': Gaussian Random Projection
        - 'SRP': Sparse Random Projection

    random_state : int, optional (default=1234)
        Random seed for the decomposition algorithm. Can also be set to
        None.

    Returns
    -------
    pythresh.thresholds.decomp.DECOMP
        Thresholder instance constructed from ``kwargs``.
    """
    # Resolved at call time rather than at module import.
    from pythresh.thresholds.decomp import DECOMP as _DECOMP

    thresholder = _DECOMP(**kwargs)
    return thresholder
def DSN(**kwargs):
    """Build a Distance Shift from Normal (DSN) thresholder.

    Factory around ``pythresh.thresholds.dsn.DSN``. The thresholder uses
    the distance shift from normal to evaluate a non-parametric means to
    threshold scores generated by the decision_scores, where outliers
    are set to any value beyond the distance calculated by the selected
    metric.

    Keyword arguments are passed unchanged to the pythresh class; the
    options below are the ones documented for it.

    Parameters
    ----------
    metric : {'JS', 'WS', 'ENG', 'BHT', 'HLL', 'HI', 'LK', 'LP', 'MAH',
              'TMT', 'RES', 'KS', 'INT', 'MMD'}, optional
             (default='MAH')
        Metric to use for distance computation

        - 'JS': Jensen-Shannon distance
        - 'WS': Wasserstein or Earth Movers distance
        - 'ENG': Energy distance
        - 'BHT': Bhattacharyya distance
        - 'HLL': Hellinger distance
        - 'HI': Histogram intersection distance
        - 'LK': Lukaszyk-Karmowski metric for normal distributions
        - 'LP': Levy-Prokhorov metric
        - 'MAH': Mahalanobis distance
        - 'TMT': Tanimoto distance
        - 'RES': Studentized residual distance
        - 'KS': Kolmogorov-Smirnov distance
        - 'INT': Weighted spline interpolated distance
        - 'MMD': Maximum Mean Discrepancy distance

    random_state : int, optional (default=1234)
        Random seed for the normal distribution. Can also be set to
        None.

    Returns
    -------
    pythresh.thresholds.dsn.DSN
        Thresholder instance constructed from ``kwargs``.
    """
    # Resolved at call time rather than at module import.
    from pythresh.thresholds.dsn import DSN as _DSN

    thresholder = _DSN(**kwargs)
    return thresholder
def EB(**kwargs):
    """Build an Elliptical Boundary (EB) thresholder.

    Factory around ``pythresh.thresholds.eb.EB``. The thresholder uses
    pseudo-random elliptical boundaries to evaluate a non-parametric
    means to threshold scores generated by the decision_scores, where
    outliers are set to any value beyond a pseudo-random elliptical
    boundary set between inliers and outliers.

    Parameters
    ----------
    **kwargs
        Keyword arguments passed unchanged to the pythresh class.

    Returns
    -------
    pythresh.thresholds.eb.EB
        Thresholder instance constructed from ``kwargs``.
    """
    # Resolved at call time rather than at module import.
    from pythresh.thresholds.eb import EB as _EB

    thresholder = _EB(**kwargs)
    return thresholder
def FGD(**kwargs):
    """Build a Fixed Gradient Descent (FGD) thresholder.

    Factory around ``pythresh.thresholds.fgd.FGD``. The thresholder uses
    the fixed gradient descent to evaluate a non-parametric means to
    threshold scores generated by the decision_scores, where outliers
    are set to any value beyond where the first derivative of the KDE
    with respect to the decision scores passes the mean of the first and
    second inflection points.

    Parameters
    ----------
    **kwargs
        Keyword arguments passed unchanged to the pythresh class.

    Returns
    -------
    pythresh.thresholds.fgd.FGD
        Thresholder instance constructed from ``kwargs``.
    """
    # Resolved at call time rather than at module import.
    from pythresh.thresholds.fgd import FGD as _FGD

    thresholder = _FGD(**kwargs)
    return thresholder
def FILTER(**kwargs):
    """Build a Filtering based (FILTER) thresholder.

    Factory around ``pythresh.thresholds.filter.FILTER``. The
    thresholder uses filtering based methods to evaluate a
    non-parametric means to threshold scores generated by the
    decision_scores, where outliers are set to any value beyond the
    maximum filter value. See :cite:`hashemi2019filter` for details.

    Keyword arguments are passed unchanged to the pythresh class; the
    options below are the ones documented for it.

    Parameters
    ----------
    method : {'gaussian', 'savgol', 'hilbert', 'wiener', 'medfilt',
              'decimate', 'detrend', 'resample'}, optional
             (default='savgol')
        Method to filter the scores

        - 'gaussian': use a gaussian based filter
        - 'savgol': use the savgol based filter
        - 'hilbert': use the hilbert based filter
        - 'wiener': use the wiener based filter
        - 'medfilt': use a median based filter
        - 'decimate': use a decimate based filter
        - 'detrend': use a detrend based filter
        - 'resample': use a resampling based filter

    sigma : int, optional (default='auto')
        Variable specific to each filter type, default sets sigma to
        len(scores)*np.std(scores)

        - 'gaussian': standard deviation for Gaussian kernel
        - 'savgol': savgol filter window size
        - 'hilbert': number of Fourier components
        - 'medfilt': kernel size
        - 'decimate': downsampling factor
        - 'detrend': number of break points
        - 'resample': resampling window size

    Returns
    -------
    pythresh.thresholds.filter.FILTER
        Thresholder instance constructed from ``kwargs``.
    """
    # Resolved at call time rather than at module import.
    from pythresh.thresholds.filter import FILTER as _FILTER

    thresholder = _FILTER(**kwargs)
    return thresholder
def FWFM(**kwargs):
    """Build a Full Width at Full Minimum (FWFM) thresholder.

    Factory around ``pythresh.thresholds.fwfm.FWFM``. The thresholder
    uses the full width at full minimum (aka base width) to evaluate a
    non-parametric means to threshold scores generated by the
    decision_scores, where outliers are set to any value beyond the base
    width.

    Parameters
    ----------
    **kwargs
        Keyword arguments passed unchanged to the pythresh class.

    Returns
    -------
    pythresh.thresholds.fwfm.FWFM
        Thresholder instance constructed from ``kwargs``.
    """
    # Resolved at call time rather than at module import.
    from pythresh.thresholds.fwfm import FWFM as _FWFM

    thresholder = _FWFM(**kwargs)
    return thresholder
def GESD(**kwargs):
    """Build a Generalized Extreme Studentized Deviate (GESD) thresholder.

    Factory around ``pythresh.thresholds.gesd.GESD``. The thresholder
    uses the generalized extreme studentized deviate to evaluate a
    non-parametric means to threshold scores generated by the
    decision_scores, where outliers are set to any value less than the
    smallest detected outlier.

    Keyword arguments are passed unchanged to the pythresh class; the
    options below are the ones documented for it.

    Parameters
    ----------
    max_outliers : int, optional (default='auto')
        Maximum number of outliers that the dataset may have. Default
        sets max_outliers to be half the size of the dataset.

    alpha : float, optional (default=0.05)
        Significance level.

    Returns
    -------
    pythresh.thresholds.gesd.GESD
        Thresholder instance constructed from ``kwargs``.
    """
    # Resolved at call time rather than at module import.
    from pythresh.thresholds.gesd import GESD as _GESD

    thresholder = _GESD(**kwargs)
    return thresholder
def HIST(**kwargs):
    """Build a Histogram based (HIST) thresholder.

    Factory around ``pythresh.thresholds.hist.HIST``. The thresholder
    uses histogram methods as described in scikit-image.filters to
    evaluate a non-parametric means to threshold scores generated by the
    decision_scores, where outliers are set by histogram generated
    thresholds depending on the selected methods.

    Keyword arguments are passed unchanged to the pythresh class; the
    options below are the ones documented for it.

    Parameters
    ----------
    nbins : int, optional (default='auto')
        Number of bins to use in the histogram, default set to
        int(len(scores)**0.7)

    method : {'otsu', 'yen', 'isodata', 'li', 'minimum', 'triangle'},
             optional (default='triangle')
        Histogram filtering based method

        - 'otsu': OTSU's method for filtering
        - 'yen': Yen's method for filtering
        - 'isodata': Ridler-Calvard or inter-means method for filtering
        - 'li': Li's iterative Minimum Cross Entropy method for
          filtering
        - 'minimum': Minimum between two maxima via smoothing method
          for filtering
        - 'triangle': Triangle algorithm method for filtering

    Returns
    -------
    pythresh.thresholds.hist.HIST
        Thresholder instance constructed from ``kwargs``.
    """
    # Resolved at call time rather than at module import.
    from pythresh.thresholds.hist import HIST as _HIST

    thresholder = _HIST(**kwargs)
    return thresholder
def IQR(**kwargs):
    """Build an Inter-Quartile Region (IQR) thresholder.

    Factory around ``pythresh.thresholds.iqr.IQR``. The thresholder uses
    the inter-quartile region to evaluate a non-parametric means to
    threshold scores generated by the decision_scores, where outliers
    are set to any value beyond the third quartile plus 1.5 times the
    inter-quartile region.

    Parameters
    ----------
    **kwargs
        Keyword arguments passed unchanged to the pythresh class.

    Returns
    -------
    pythresh.thresholds.iqr.IQR
        Thresholder instance constructed from ``kwargs``.
    """
    # Resolved at call time rather than at module import.
    from pythresh.thresholds.iqr import IQR as _IQR

    thresholder = _IQR(**kwargs)
    return thresholder
def KARCH(**kwargs):
    """Build a Riemannian Center of Mass (KARCH) thresholder.

    Factory around ``pythresh.thresholds.karch.KARCH``. The thresholder
    uses the Karcher mean (Riemannian Center of Mass) to evaluate a
    non-parametric means to threshold scores generated by the
    decision_scores, where outliers are set to any value beyond the
    Karcher mean + one standard deviation of the decision_scores.

    Keyword arguments are passed unchanged to the pythresh class; the
    options below are the ones documented for it.

    Parameters
    ----------
    ndim : int, optional (default=2)
        Number of dimensions to construct the Euclidean manifold

    method : {'simple', 'complex'}, optional (default='complex')
        Method for computing the Karcher mean

        - 'simple': Compute the Karcher mean using the 1D array of
          scores
        - 'complex': Compute the Karcher mean between a 2D array dot
          product of the scores and the sorted scores arrays

    Returns
    -------
    pythresh.thresholds.karch.KARCH
        Thresholder instance constructed from ``kwargs``.
    """
    # Resolved at call time rather than at module import.
    from pythresh.thresholds.karch import KARCH as _KARCH

    thresholder = _KARCH(**kwargs)
    return thresholder
def MAD(**kwargs):
    """Build a Median Absolute Deviation (MAD) thresholder.

    Factory around ``pythresh.thresholds.mad.MAD``. The thresholder uses
    the median absolute deviation to evaluate a non-parametric means to
    threshold scores generated by the decision_scores, where outliers
    are set to any value beyond the mean plus the median absolute
    deviation over the standard deviation.

    Parameters
    ----------
    **kwargs
        Keyword arguments passed unchanged to the pythresh class.

    Returns
    -------
    pythresh.thresholds.mad.MAD
        Thresholder instance constructed from ``kwargs``.
    """
    # Resolved at call time rather than at module import.
    from pythresh.thresholds.mad import MAD as _MAD

    thresholder = _MAD(**kwargs)
    return thresholder
def MCST(**kwargs):
    """Build a Monte Carlo Shapiro Tests (MCST) thresholder.

    Factory around ``pythresh.thresholds.mcst.MCST``. The thresholder
    uses uniform random sampling and statistical testing to evaluate a
    non-parametric means to threshold scores generated by the
    decision_scores, where outliers are set to any value beyond the
    minimum value left after iterative Shapiro-Wilk tests have occurred.

    Note: accuracy decreases with array size. For good results the
    array size should be below 1000. However, this threshold method may
    still fail at any array size.

    Keyword arguments are passed unchanged to the pythresh class; the
    option below is the one documented for it.

    Parameters
    ----------
    random_state : int, optional (default=1234)
        Random seed for the uniform distribution. Can also be set to
        None.

    Returns
    -------
    pythresh.thresholds.mcst.MCST
        Thresholder instance constructed from ``kwargs``.
    """
    # Resolved at call time rather than at module import.
    from pythresh.thresholds.mcst import MCST as _MCST

    thresholder = _MCST(**kwargs)
    return thresholder
def META(**kwargs):
    """Build a Meta-modelling (META) thresholder.

    Factory around ``pythresh.thresholds.meta.META``. The thresholder
    uses a trained meta-model to evaluate a non-parametric means to
    threshold scores generated by the decision_scores, where outliers
    are set based on the trained meta-model classifier.

    Keyword arguments are passed unchanged to the pythresh class; the
    option below is the one documented for it.

    Parameters
    ----------
    method : {'LIN', 'GNB', 'GNBC', 'GNBM'}, optional (default='GNBM')
        Meta-model to select

        - 'LIN': RidgeCV trained linear classifier meta-model on true
          labels
        - 'GNB': Gaussian Naive Bayes trained classifier meta-model on
          true labels
        - 'GNBC': Gaussian Naive Bayes trained classifier meta-model on
          best contamination
        - 'GNBM': Gaussian Naive Bayes multivariate trained classifier
          meta-model

    Returns
    -------
    pythresh.thresholds.meta.META
        Thresholder instance constructed from ``kwargs``.
    """
    # Resolved at call time rather than at module import.
    from pythresh.thresholds.meta import META as _META

    thresholder = _META(**kwargs)
    return thresholder
def MOLL(**kwargs):
    """Build a Friedrichs' mollifier (MOLL) thresholder.

    Factory around ``pythresh.thresholds.moll.MOLL``. The thresholder
    uses the Friedrichs' mollifier to evaluate a non-parametric means to
    threshold scores generated by the decision_scores, where outliers
    are set to any value beyond one minus the maximum of the smoothed
    dataset via convolution.

    Parameters
    ----------
    **kwargs
        Keyword arguments passed unchanged to the pythresh class.

    Returns
    -------
    pythresh.thresholds.moll.MOLL
        Thresholder instance constructed from ``kwargs``.
    """
    # Resolved at call time rather than at module import.
    from pythresh.thresholds.moll import MOLL as _MOLL

    thresholder = _MOLL(**kwargs)
    return thresholder
def MTT(**kwargs):
    """Build a Modified Thompson Tau test (MTT) thresholder.

    Factory around ``pythresh.thresholds.mtt.MTT``. The thresholder uses
    the modified Thompson Tau test to evaluate a non-parametric means to
    threshold scores generated by the decision_scores, where outliers
    are set to any value beyond the smallest outlier detected by the
    test.

    Keyword arguments are passed unchanged to the pythresh class; the
    option below is the one documented for it.

    Parameters
    ----------
    strictness : [1,2,3,4,5], optional (default=4)
        Level of strictness corresponding to the t-Student distribution
        map to sample

    Returns
    -------
    pythresh.thresholds.mtt.MTT
        Thresholder instance constructed from ``kwargs``.
    """
    # Resolved at call time rather than at module import.
    from pythresh.thresholds.mtt import MTT as _MTT

    thresholder = _MTT(**kwargs)
    return thresholder
def OCSVM(**kwargs):
    """Build a One-Class Support Vector Machine (OCSVM) thresholder.

    Factory around ``pythresh.thresholds.ocsvm.OCSVM``. The thresholder
    uses a one-class SVM to evaluate a non-parametric means to threshold
    scores generated by the decision_scores, where outliers are
    determined by the one-class SVM using a polynomial kernel with the
    polynomial degree either set or determined by regression internally.

    Keyword arguments are passed unchanged to the pythresh class; the
    options below are the ones documented for it.

    Parameters
    ----------
    model : {'poly', 'sgd'}, optional (default='sgd')
        OCSVM model to apply

        - 'poly': Use a polynomial kernel with a regular OCSVM
        - 'sgd': Use the Additive Chi2 kernel approximation with a
          SGDOneClassSVM

    degree : int, optional (default='auto')
        Polynomial degree to use for the one-class SVM. Default 'auto'
        finds the optimal degree with linear regression.

    gamma : float, optional (default='auto')
        Kernel coefficient for polynomial fit for the one-class SVM.
        Default 'auto' uses 1 / n_features.

    criterion : {'aic', 'bic'}, optional (default='bic')
        Regression performance metric. AIC is the Akaike Information
        Criterion, and BIC is the Bayesian Information Criterion. This
        only applies when degree is set to 'auto'.

    nu : float, optional (default='auto')
        An upper bound on the fraction of training errors and a lower
        bound of the fraction of support vectors. Default 'auto' sets nu
        as the ratio between any point that is less than or equal to the
        median plus the absolute difference between the mean and
        geometric mean over the number of points in the entire dataset.

    tol : float, optional (default=1e-3)
        The stopping criterion for the one-class SVM.

    random_state : int, optional (default=1234)
        Random seed for the SVM's data sampling. Can also be set to
        None.

    Returns
    -------
    pythresh.thresholds.ocsvm.OCSVM
        Thresholder instance constructed from ``kwargs``.
    """
    # Resolved at call time rather than at module import.
    from pythresh.thresholds.ocsvm import OCSVM as _OCSVM

    thresholder = _OCSVM(**kwargs)
    return thresholder
def QMCD(**kwargs):
    """Build a Quasi-Monte Carlo Discrepancy (QMCD) thresholder.

    Factory around ``pythresh.thresholds.qmcd.QMCD``. The thresholder
    uses the quasi-Monte Carlo discrepancy to evaluate a non-parametric
    means to threshold scores generated by the decision_scores, where
    outliers are set to any value beyond a percentile or quantile of one
    minus the discrepancy. (Note: a discrepancy quantifies the distance
    between the continuous uniform distribution on a hypercube and the
    discrete uniform distribution on distinct sample points.)

    Keyword arguments are passed unchanged to the pythresh class; the
    options below are the ones documented for it.

    Parameters
    ----------
    method : {'CD', 'WD', 'MD', 'L2-star'}, optional (default='WD')
        Type of discrepancy

        - 'CD': Centered Discrepancy
        - 'WD': Wrap-around Discrepancy
        - 'MD': Mix between CD/WD
        - 'L2-star': L2-star discrepancy

    lim : {'Q', 'P'}, optional (default='P')
        Filtering method to threshold scores using 1 - discrepancy

        - 'Q': Use quantile limiting
        - 'P': Use percentile limiting

    Returns
    -------
    pythresh.thresholds.qmcd.QMCD
        Thresholder instance constructed from ``kwargs``.
    """
    # Resolved at call time rather than at module import.
    from pythresh.thresholds.qmcd import QMCD as _QMCD

    thresholder = _QMCD(**kwargs)
    return thresholder
def REGR(**kwargs):
    """Build a Regression based (REGR) thresholder.

    Factory around ``pythresh.thresholds.regr.REGR``. The thresholder
    uses regression to evaluate a non-parametric means to threshold
    scores generated by the decision_scores, where outliers are set to
    any value beyond the y-intercept value of the linear fit.

    Keyword arguments are passed unchanged to the pythresh class; the
    options below are the ones documented for it.

    Parameters
    ----------
    method : {'siegel', 'theil'}, optional (default='siegel')
        Regression based method to calculate the y-intercept

        - 'siegel': implements a method for robust linear regression
          using repeated medians
        - 'theil': implements a method for robust linear regression
          using paired values

    random_state : int, optional (default=1234)
        Random seed for the normal distribution. Can also be set to
        None.

    Returns
    -------
    pythresh.thresholds.regr.REGR
        Thresholder instance constructed from ``kwargs``.
    """
    # Resolved at call time rather than at module import.
    from pythresh.thresholds.regr import REGR as _REGR

    thresholder = _REGR(**kwargs)
    return thresholder
def VAE(**kwargs):
    """Build a Variational AutoEncoder (VAE) thresholder.

    Factory around ``pythresh.thresholds.vae.VAE``. The thresholder uses
    a VAE to evaluate a non-parametric means to threshold scores
    generated by the decision_scores, where outliers are set to any
    value beyond the maximum minus the minimum of the reconstructed
    distribution probabilities after encoding.

    Keyword arguments are passed unchanged to the pythresh class; the
    options below are the ones documented for it.

    Parameters
    ----------
    verbose : bool, optional (default=False)
        Display training progress.

    device : str, optional (default='cpu')
        Device for pytorch.

    latent_dims : int, optional (default='auto')
        Number of latent dimensions the encoder will map the scores to.
        Default 'auto' applies automatic dimensionality selection using
        a profile likelihood.

    random_state : int, optional (default=1234)
        Random seed for the normal distribution. Can also be set to
        None.

    epochs : int, optional (default=100)
        Number of epochs to train the VAE.

    batch_size : int, optional (default=64)
        Batch size for the dataloader during training.

    loss : str, optional (default='kl')
        Loss function during training

        - 'kl': use the combined negative log likelihood and
          Kullback-Leibler divergence
        - 'mmd': use the combined negative log likelihood and maximum
          mean discrepancy

    Attributes
    ----------
    thresh_ : threshold value that separates inliers from outliers

    Returns
    -------
    pythresh.thresholds.vae.VAE
        Thresholder instance constructed from ``kwargs``.
    """
    # Resolved at call time rather than at module import.
    from pythresh.thresholds.vae import VAE as _VAE

    thresholder = _VAE(**kwargs)
    return thresholder
def WIND(**kwargs):
    """Build a topological Winding number (WIND) thresholder.

    Factory around ``pythresh.thresholds.wind.WIND``. The thresholder
    uses the topological winding number (with respect to the origin) to
    evaluate a non-parametric means to threshold scores generated by the
    decision_scores, where outliers are set to any value beyond the mean
    intersection point calculated from the winding number.

    Keyword arguments are passed unchanged to the pythresh class; the
    option below is the one documented for it.

    Parameters
    ----------
    random_state : int, optional (default=1234)
        Random seed for the normal distribution. Can also be set to
        None.

    Returns
    -------
    pythresh.thresholds.wind.WIND
        Thresholder instance constructed from ``kwargs``.
    """
    # Resolved at call time rather than at module import.
    from pythresh.thresholds.wind import WIND as _WIND

    thresholder = _WIND(**kwargs)
    return thresholder
def YJ(**kwargs):
    """Build a Yeo-Johnson transformation (YJ) thresholder.

    Factory around ``pythresh.thresholds.yj.YJ``. The thresholder uses
    the Yeo-Johnson transformation to evaluate a non-parametric means to
    threshold scores generated by the decision_scores, where outliers
    are set to any value beyond the max value in the YJ transformed
    data.

    Parameters
    ----------
    **kwargs
        Keyword arguments passed unchanged to the pythresh class.

    Returns
    -------
    pythresh.thresholds.yj.YJ
        Thresholder instance constructed from ``kwargs``.
    """
    # Resolved at call time rather than at module import.
    from pythresh.thresholds.yj import YJ as _YJ

    thresholder = _YJ(**kwargs)
    return thresholder
def ZSCORE(**kwargs):
    """Build a z-score (ZSCORE) thresholder.

    Factory around ``pythresh.thresholds.zscore.ZSCORE``. The
    thresholder uses the z-score to evaluate a non-parametric means to
    threshold scores generated by the decision_scores, where outliers
    are set to any value beyond a z-score of one.

    Parameters
    ----------
    **kwargs
        Keyword arguments passed unchanged to the pythresh class.

    Returns
    -------
    pythresh.thresholds.zscore.ZSCORE
        Thresholder instance constructed from ``kwargs``.
    """
    # Resolved at call time rather than at module import.
    from pythresh.thresholds.zscore import ZSCORE as _ZSCORE

    thresholder = _ZSCORE(**kwargs)
    return thresholder