Source code for anlearn.stats

from typing import Optional

import numpy as np
from sklearn.base import BaseEstimator, OutlierMixin
from sklearn.utils import check_array
from sklearn.utils.validation import check_is_fitted

from ._typing import ArrayLike


class IQR(BaseEstimator, OutlierMixin):
    """Interquartile range

    Outlier detection method using Tukey's fences.

    If lower quantile is 0.25 (:math:`Q_1` lower quartile) and upper quantile
    is 0.75 (:math:`Q_3` upper quartile), then outlier is any observation
    outside the range:

    .. math::

        [Q_1 - k(Q_3 - Q_1); Q_3 + k(Q_3 - Q_1)]

    John Tukey proposed :math:`k=1.5` is an outlier, and :math:`k=3` is far
    out.

    Parameters
    ----------
    k : float, optional
        Outlier threshold, by default 1.5
    lower_quantile : float, optional
        Lower quantile, from (0; 1), by default 0.25
    upper_quantile : float, optional
        Upper quantile, from (0; 1), by default 0.75
    ensure_2d : bool, optional
        Forbid input 1D arrays, by default True

    Attributes
    ----------
    lqv_ : float
        Lower quantile value estimated from the input data
    uqv_ : float
        Upper quantile value estimated from the input data
    iqr_ : float
        Interquartile range, :attr:`uqv_` - :attr:`lqv_`

    Example
    -------
    >>> import numpy as np
    >>> from anlearn.stats import IQR
    >>> X = np.hstack([[-7,-4], np.arange(5), [10, 15]])
    >>> iqr = IQR(ensure_2d=False)
    >>> iqr.fit(X)
    IQR(ensure_2d=False)
    >>> iqr.predict(X)
    array([-1,  1,  1,  1,  1,  1,  1,  1, -1])
    >>> iqr.score_samples(X)
    array([-1.75, -1.  , -0.  , -0.  , -0.  , -0.  , -0.  , -1.5 , -2.75])

    Raises
    ------
    ValueError
        Lower quantile must be lower than upper quantile.
    """

    def __init__(
        self,
        k: float = 1.5,
        lower_quantile: float = 0.25,
        upper_quantile: float = 0.75,
        ensure_2d: bool = True,
    ) -> None:
        self.k = k
        self.ensure_2d = ensure_2d
        self.lower_quantile = lower_quantile
        self.upper_quantile = upper_quantile

        # Fail fast on construction; the same check is repeated in ``fit``
        # because the attributes can be changed afterwards.
        self._check_quantile_order()

    def _check_quantile_order(self) -> None:
        """Raise ValueError unless ``lower_quantile < upper_quantile``."""
        if self.lower_quantile >= self.upper_quantile:
            raise ValueError("IQR: Lower quantile must be lower than upper quantile.")

    def fit(self, X: ArrayLike, y: Optional[ArrayLike] = None) -> "IQR":
        """Fit estimator

        Estimates the lower and upper quantile values of the input data and
        the range between them.

        Parameters
        ----------
        X : ArrayLike
            Input data of shape (n_samples, 1) or (n_samples,) if `ensure_2d`
            is False. The validated array is flattened before the quantiles
            are computed.
        y : Optional[ArrayLike], optional
            Ignored, present for API consistency by convention, by default
            None

        Returns
        -------
        IQR
            Fitted estimator

        Raises
        ------
        ValueError
            Lower quantile must be lower than upper quantile.
        """
        # Parameters may have been modified after construction
        # (``set_params`` or plain attribute assignment), so re-validate.
        self._check_quantile_order()

        # ``check_array`` rejects NaN/inf values by default.
        raw_data = check_array(X, ensure_2d=self.ensure_2d).flatten()

        # Use the configured quantiles rather than fixed quartiles, so that
        # ``lower_quantile`` / ``upper_quantile`` actually take effect.
        self.lqv_, self.uqv_ = np.quantile(
            raw_data, (self.lower_quantile, self.upper_quantile)
        )
        self.iqr_ = self.uqv_ - self.lqv_

        return self

    def score_samples(self, X: ArrayLike) -> np.ndarray:
        """Score samples

        Score is computed as distance from interval
        :math:`[Q_{lower}; Q_{upper}]` divided by interquartile range.
        :math:`score = distance(data, (lqv, uqv)) / iqr`. Score is inverted
        for scikit-learn compatibility

        Parameters
        ----------
        X : ArrayLike
            Input data of shape (n_samples, 1) or (n_samples,) if
            ``ensure_2d`` is False

        Returns
        -------
        numpy.ndarray
            Shape (n_samples,). The outlier score of the input samples.
            The lower, the more abnormal.
        """
        check_is_fitted(self, attributes=["lqv_", "uqv_", "iqr_"])

        # ``check_array`` rejects NaN/inf values by default.
        raw_data = check_array(X, ensure_2d=self.ensure_2d).flatten()

        # Samples inside [lqv_, uqv_] keep a score of zero; only samples
        # outside the interval are divided by the interquartile range.
        scores = np.zeros(shape=raw_data.shape[0])

        l_lqv = raw_data < self.lqv_
        scores[l_lqv] = (raw_data[l_lqv] - self.lqv_) / self.iqr_

        g_uqv = raw_data > self.uqv_
        scores[g_uqv] = (raw_data[g_uqv] - self.uqv_) / self.iqr_

        # Negate the magnitude so that lower scores mean more abnormal.
        return -np.abs(scores)

    def predict(self, X: ArrayLike) -> np.ndarray:
        """Predict if samples are outliers or not

        Samples with a score lower than ``-k`` (i.e. lying more than ``k``
        interquartile ranges outside the fitted quantile interval) are
        considered to be outliers.

        Parameters
        ----------
        X : ArrayLike
            Input data of shape (n_samples, 1) or (n_samples,) if
            ``ensure_2d`` is False

        Returns
        -------
        numpy.ndarray
            Shape (n_samples,) 1 for inliers, -1 for outliers
        """
        scores = self.score_samples(X)

        return np.where(scores < -self.k, -1, 1)