Source code for anlearn.stats

from typing import Optional

import numpy as np
from sklearn.base import BaseEstimator, OutlierMixin
from sklearn.utils import check_array
from sklearn.utils.validation import check_is_fitted

from ._typing import ArrayLike


class IQR(BaseEstimator, OutlierMixin):
    """Interquartile range

    Outlier detection method using Tukey's fences.
    If lower quantile is 0.25 (:math:`Q_1` lower quartile) and
    upper quantile is 0.75 (:math:`Q_3` upper quartile),
    then outlier is any observation outside the range:

    .. math::
        [Q_1 - k(Q_3 - Q_1); Q_3 + k(Q_3 - Q_1)]

    John Tukey proposed :math:`k=1.5` is an outlier, and :math:`k=3` is far out.

    Parameters
    ----------
    k : float, optional
        Outlier threshold, by default 1.5
    lower_quantile : float, optional
        Lower quantile, from (0; 1), by default 0.25
    upper_quantile : float, optional
        Upper quantile, from (0; 1), by default 0.75
    ensure_2d : bool, optional
        Forbid input 1D arrays, by default True

    Attributes
    ----------
    lqv_ : float
        Lower quantile value estimated from the input data
    uqv_ : float
        Upper quantile value estimated from the input data
    iqr_ : float
        Interquartile range, :attr:`uqv_` - :attr:`lqv_`

    Example
    -------
    >>> import numpy as np
    >>> from anlearn.stats import IQR
    >>> X = np.hstack([[-7,-4], np.arange(5), [10, 15]])
    >>> iqr = IQR(ensure_2d=False)
    >>> iqr.fit(X)
    IQR(ensure_2d=False)
    >>> iqr.predict(X)
    array([-1,  1,  1,  1,  1,  1,  1,  1, -1])
    >>> iqr.score_samples(X)
    array([-1.75, -1.  , -0.  , -0.  , -0.  , -0.  , -0.  , -1.5 , -2.75])

    Raises
    ------
    ValueError
        Lower quantile must be lower than upper quantile.
    """

    def __init__(
        self,
        k: float = 1.5,
        lower_quantile: float = 0.25,
        upper_quantile: float = 0.75,
        ensure_2d: bool = True,
    ) -> None:

        self.k = k
        self.ensure_2d = ensure_2d
        self.lower_quantile = lower_quantile
        self.upper_quantile = upper_quantile

        if lower_quantile >= upper_quantile:
            raise ValueError("IQR: Lower quantile must be lower than upper quantile.")

    def fit(self, X: ArrayLike, y: Optional[ArrayLike] = None) -> "IQR":
        """Fit estimator

        Estimates the values at ``lower_quantile`` and ``upper_quantile`` from
        the (flattened) input data and stores them, together with their
        difference, on the estimator.

        Parameters
        ----------
        X : ArrayLike
            Input data of shape (n_samples, 1) or (n_samples,) if `ensure_2d` is False
        y : Optional[ArrayLike], optional
            Ignored, present for API consistency by convention, by default None

        Returns
        -------
        IQR
            Fitted estimator

        Raises
        ------
        ValueError
            Lower quantile must be lower than upper quantile.
        """
        # Parameters can be changed after construction (e.g. via ``set_params``),
        # which bypasses the check in ``__init__``; re-validate before use.
        if self.lower_quantile >= self.upper_quantile:
            raise ValueError("IQR: Lower quantile must be lower than upper quantile.")

        raw_data = check_array(
            X, force_all_finite=True, ensure_2d=self.ensure_2d
        ).flatten()

        # Use the configured quantiles rather than fixed quartiles so that
        # ``lower_quantile`` / ``upper_quantile`` actually take effect.
        self.lqv_, self.uqv_ = np.quantile(
            raw_data, (self.lower_quantile, self.upper_quantile)
        )
        self.iqr_ = self.uqv_ - self.lqv_

        return self

    def score_samples(self, X: ArrayLike) -> np.ndarray:
        """Score samples

        Score is computed as distance from interval :math:`[Q_{lower}; Q_{upper}]` divided
        by interquartile range. :math:`score = distance(data, (lqv, uqv)) / iqr`.
        Score is inverted for scikit-learn compatibility

        Parameters
        ----------
        X : ArrayLike
            Input data of shape (n_samples, 1) or (n_samples,) if ``ensure_2d`` is False

        Returns
        -------
        numpy.ndarray
            Shape (n_samples,). The outlier score of the input samples.
            The lower, the more abnormal.
        """
        check_is_fitted(self, attributes=["lqv_", "uqv_", "iqr_"])

        raw_data = check_array(
            X, force_all_finite=True, ensure_2d=self.ensure_2d
        ).flatten()

        # Samples inside [lqv_, uqv_] keep a distance of zero.
        scores = np.zeros(shape=raw_data.shape[0])

        # NOTE: when ``iqr_`` is zero (constant training data) the divisions
        # below follow NumPy semantics and yield infinite scores for samples
        # outside the interval.
        l_lqv = raw_data < self.lqv_
        scores[l_lqv] = (raw_data[l_lqv] - self.lqv_) / self.iqr_

        g_uqv = raw_data > self.uqv_
        scores[g_uqv] = (raw_data[g_uqv] - self.uqv_) / self.iqr_

        # Negate the magnitude: scikit-learn expects lower scores for outliers.
        return -np.abs(scores)

    def predict(self, X: ArrayLike) -> np.ndarray:
        """Predict if samples are outliers or not

        Samples with a score lower than ``-k`` (i.e. farther than ``k``
        interquartile ranges from the interval between the fitted quantile
        values) are considered to be outliers.

        Parameters
        ----------
        X : ArrayLike
            Input data of shape (n_samples, 1) or (n_samples,) if ``ensure_2d`` is False

        Returns
        -------
        numpy.ndarray
            Shape (n_samples,) 1 for inliers, -1 for outliers
        """
        scores = self.score_samples(X)

        return np.where(scores < -self.k, -1, 1)