Source code for pyod.utils.stat_models

# -*- coding: utf-8 -*-
""" A collection of statistical models
"""
# Author: Yue Zhao <zhaoy@cmu.edu>
# License: BSD 2 clause

from __future__ import division
from __future__ import print_function

import numpy as np
from numba import njit
from scipy.stats import pearsonr
from sklearn.utils.validation import check_array
# noinspection PyProtectedMember
from sklearn.utils.validation import check_consistent_length


# TODO: disable p value calculation due to python 2.7 break
# from scipy.special import betainc


[docs]def pairwise_distances_no_broadcast(X, Y): """Utility function to calculate row-wise euclidean distance of two matrix. Different from pair-wise calculation, this function would not broadcast. For instance, X and Y are both (4,3) matrices, the function would return a distance vector with shape (4,), instead of (4,4). Parameters ---------- X : array of shape (n_samples, n_features) First input samples Y : array of shape (n_samples, n_features) Second input samples Returns ------- distance : array of shape (n_samples,) Row-wise euclidean distance of X and Y """ X = check_array(X) Y = check_array(Y) if X.shape[0] != Y.shape[0] or X.shape[1] != Y.shape[1]: raise ValueError("pairwise_distances_no_broadcast function receive" "matrix with different shapes {0} and {1}".format( X.shape, Y.shape)) return _pairwise_distances_no_broadcast_helper(X, Y)
@njit def _pairwise_distances_no_broadcast_helper(X, Y): # pragma: no cover """Internal function for calculating the distance with numba. Do not use. Parameters ---------- X : array of shape (n_samples, n_features) First input samples Y : array of shape (n_samples, n_features) Second input samples Returns ------- distance : array of shape (n_samples,) Intermediate results. Do not use. """ euclidean_sq = np.square(Y - X) return np.sqrt(np.sum(euclidean_sq, axis=1)).ravel()
[docs]def wpearsonr(x, y, w=None): """Utility function to calculate the weighted Pearson correlation of two samples. See https://stats.stackexchange.com/questions/221246/such-thing-as-a-weighted-correlation for more information Parameters ---------- x : array, shape (n,) Input x. y : array, shape (n,) Input y. w : array, shape (n,) Weights w. Returns ------- scores : float in range of [-1,1] Weighted Pearson Correlation between x and y. """ # unweighted version # note the return is different # TODO: fix output differences if w is None: return pearsonr(x, y) x = np.asarray(x) y = np.asarray(y) w = np.asarray(w) check_consistent_length([x, y, w]) # n = len(x) w_sum = w.sum() mx = np.sum(x * w) / w_sum my = np.sum(y * w) / w_sum xm, ym = (x - mx), (y - my) r_num = np.sum(xm * ym * w) / w_sum xm2 = np.sum(xm * xm * w) / w_sum ym2 = np.sum(ym * ym * w) / w_sum r_den = np.sqrt(xm2 * ym2) r = r_num / r_den r = max(min(r, 1.0), -1.0) # TODO: disable p value calculation due to python 2.7 break # df = n_train_ - 2 # # if abs(r) == 1.0: # prob = 0.0 # else: # t_squared = r ** 2 * (df / ((1.0 - r) * (1.0 + r))) # prob = _betai(0.5 * df, 0.5, df / (df + t_squared)) return r # , prob
##################################### # PROBABILITY CALCULATIONS # ##################################### # TODO: disable p value calculation due to python 2.7 break # def _betai(a, b, x): # x = np.asarray(x) # x = np.where(x < 1.0, x, 1.0) # if x > 1 then return 1.0 # return betainc(a, b, x)
[docs]def pearsonr_mat(mat, w=None): """Utility function to calculate pearson matrix (row-wise). Parameters ---------- mat : numpy array of shape (n_samples, n_features) Input matrix. w : numpy array of shape (n_features,) Weights. Returns ------- pear_mat : numpy array of shape (n_samples, n_samples) Row-wise pearson score matrix. """ mat = check_array(mat) n_row = mat.shape[0] n_col = mat.shape[1] pear_mat = np.full([n_row, n_row], 1).astype(float) if w is not None: for cx in range(n_row): for cy in range(cx + 1, n_row): curr_pear = wpearsonr(mat[cx, :], mat[cy, :], w) pear_mat[cx, cy] = curr_pear pear_mat[cy, cx] = curr_pear else: for cx in range(n_col): for cy in range(cx + 1, n_row): curr_pear = pearsonr(mat[cx, :], mat[cy, :])[0] pear_mat[cx, cy] = curr_pear pear_mat[cy, cx] = curr_pear return pear_mat