import numpy as np
from joblib import Parallel, delayed
from ..linear_model import OrthogonalRegression, Ridge2FoldCV
from ..model_selection import train_test_split
from ..preprocessing import StandardFlexibleScaler
def pointwise_global_reconstruction_error(
X,
Y,
train_idx=None,
test_idx=None,
scaler=None,
estimator=None,
):
r"""Computes the pointwise global reconstruction error using the source X
to reconstruct the features or samples of target Y based on a minimization
by linear regression:
.. math::
GRE^{(i)}(X,Y) = \min_W ||y_i - x_iW||
If used with X and Y of shape (n_samples, n_features) it computes the pointwise
global reconstruction error of the features as defined in Ref. [Goscinski2021]_.
In this case the number of samples of X and Y should agree with each other,
but the number of features can be different. The error is expressed per sample.
If used with X and Y of shape(n_features, n_samples) it computes the
reconstruction error of the samples. In this case the number of
features of X and Y should agree with each other, but the number of
samples can be different. The error is expressed per feature.
The default parameters mimics the ones of Ref. [Goscinski2021]_.
Parameters
----------
X : numpy.ndarray of shape (n_samples, X_n_features)
Source data which reconstructs target Y. For feature reconstruction of Y using X
use input shape (samples, features). For sample reconstruction of Y using X use
input shape (features, samples).
Y : numpy.ndarray of shape (n_samples, Y_n_targets)
Target data which is reconstructed with X. For feature reconstruction of Y using
X use input shape (samples, features). For sample reconstruction of Y using X
use input shape (features, samples).
    train_idx : numpy.ndarray, dtype=int, default=None
        Array of indices used for training. If ``None``, the complement of
        ``test_idx`` is used. If ``test_idx`` is also ``None``, a 2-fold split is
        taken.
    test_idx : numpy.ndarray, dtype=int, default=None
        Array of indices used for testing. If ``None``, the complement of
        ``train_idx`` is used. If ``train_idx`` is also ``None``, a 2-fold split is
        taken.
    scaler : object implementing fit/transform, default=``StandardFlexibleScaler``
        Scales X and Y before computing the reconstruction measure. The default
        value scales the features such that the reconstruction measure on the
        training set is bounded from above by 1.
estimator : object implementing fit/predict, default=``Ridge2FoldCV``
Sklearn estimator used to reconstruct test features/samples.
Returns
-------
pointwise_global_reconstruction_error : numpy.ndarray
The global reconstruction error for each test sample/point.
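
    Examples
    --------
    A minimal usage sketch on synthetic data (illustrative only; the import
    path ``skmatter.metrics`` is an assumption, and the default split, scaler
    and estimator described above are used)::

        import numpy as np

        from skmatter.metrics import pointwise_global_reconstruction_error

        rng = np.random.default_rng(0)
        X = rng.normal(size=(100, 5))
        Y = X @ rng.normal(size=(5, 3))  # Y is a linear function of X
        errors = pointwise_global_reconstruction_error(X, Y)
        print(errors.shape)  # one error per test sample
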
"""
train_idx, test_idx, scaler, estimator = check_global_reconstruction_measures_input(
X, Y, train_idx, test_idx, scaler, estimator
)
X_train, X_test, Y_train, Y_test = _prepare_data(X, Y, train_idx, test_idx, scaler)
estimator.fit(X_train, Y_train)
return np.linalg.norm(Y_test - estimator.predict(X_test), axis=1)
def global_reconstruction_error(
X,
Y,
test_idx=None,
train_idx=None,
scaler=None,
estimator=None,
):
r"""Computes the global reconstruction error using the source X
to reconstruct the features or samples of target Y based on a minimization
by linear regression:
.. math::
GRE(X,Y) = \min_W ||Y - XW||
If used with X and Y of shape (n_samples, n_features) it computes the
global reconstruction error of the features as defined in Ref. [Goscinski2021]_.
In this case the number of samples of X and Y should agree with each other,
but the number of features can be different. The error is expressed per sample.
If used with X and Y of shape(n_features, n_samples) it computes the
reconstruction error of the samples. In this case the number of
features of X and Y should agree with each other, but the number of
samples can be different. The error is expressed per feature.
The default parameters mimics the ones of Ref. [Goscinski2021]_.
Parameters
----------
X : numpy.ndarray of shape (n_samples, X_n_features)
Source data which reconstructs target Y. For feature reconstruction of Y using X
use input shape (samples, features). For sample reconstruction of Y using X use
input shape (features, samples).
Y : numpy.ndarray of shape (n_samples, Y_n_targets)
Target data which is reconstructed with X. For feature reconstruction of Y using
X use input shape (samples, features). For sample reconstruction of Y using X
use input shape (features, samples).
    train_idx : numpy.ndarray, dtype=int, default=None
        Array of indices used for training. If ``None``, the complement of
        ``test_idx`` is used. If ``test_idx`` is also ``None``, a 2-fold split is
        taken.
    test_idx : numpy.ndarray, dtype=int, default=None
        Array of indices used for testing. If ``None``, the complement of
        ``train_idx`` is used. If ``train_idx`` is also ``None``, a 2-fold split is
        taken.
    scaler : object implementing fit/transform, default=``StandardFlexibleScaler``
        Scales X and Y before computing the reconstruction measure. The default
        value scales the features such that the reconstruction measure on the
        training set is bounded from above by 1.
estimator : object implementing fit/predict, default=``Ridge2FoldCV``
Sklearn estimator used to reconstruct test features/samples.
Returns
-------
    global_reconstruction_error : float
        The global reconstruction error.
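
    Examples
    --------
    A minimal sketch (illustrative; the ``skmatter.metrics`` import path is an
    assumption). When Y is a linear function of X the error should be close to
    zero, while reconstructing X from only a subset of its features generally
    leaves a residual error::

        import numpy as np

        from skmatter.metrics import global_reconstruction_error

        rng = np.random.default_rng(0)
        X = rng.normal(size=(100, 5))
        gre_forward = global_reconstruction_error(X, X[:, :2])   # expected ~0
        gre_backward = global_reconstruction_error(X[:, :2], X)  # expected > 0
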
"""
pointwise_global_reconstruction_error_values = (
pointwise_global_reconstruction_error(
X,
Y,
train_idx=train_idx,
test_idx=test_idx,
scaler=scaler,
estimator=estimator,
)
)
return np.linalg.norm(pointwise_global_reconstruction_error_values) / np.sqrt(
len(pointwise_global_reconstruction_error_values)
)
def pointwise_global_reconstruction_distortion(
X,
Y,
test_idx=None,
train_idx=None,
scaler=None,
estimator=None,
):
r"""Computes the pointwise global reconstruction distortion using the source X
to reconstruct the features or samples of target Y based on a minimization
by orthogonal regression:
.. math::
GRD^{(i)}(X,Y) = \min_Q ||y_i - x_iQ\|| \quad\mathrm{subject\ to}\quad Q^TQ=I
If used with X and Y of shape (n_samples, n_features) it computes the pointwise
global reconstruction distortion of the features as defined in
Ref. [Goscinski2021]_.
In this case the number of samples of X and Y should agree with each other,
but the number of features can be different. The distortion is expressed per sample.
If used with X and Y of shape(n_features, n_samples) it computes the
reconstruction distortion of the samples. In this case the number of
features of X and Y should agree with each other, but the number of
samples can be different. The distortion is expressed per feature.
The default parameters mimics the ones of Ref. [Goscinski2021]_.
Parameters
----------
X : numpy.ndarray of shape (n_samples, X_n_features)
Source data which reconstructs target Y.
For feature reconstruction of Y using X use input shape (samples, features).
For sample reconstruction of Y using X use input shape (features, samples).
Y : numpy.ndarray of shape (n_samples, Y_n_targets)
Target data which is reconstructed with X.
For feature reconstruction of Y using X use input shape (samples, features).
For sample reconstruction of Y using X use input shape (features, samples).
    train_idx : numpy.ndarray, dtype=int, default=None
        Array of indices used for training. If ``None``, the complement of
        ``test_idx`` is used. If ``test_idx`` is also ``None``, a 2-fold split is
        taken.
    test_idx : numpy.ndarray, dtype=int, default=None
        Array of indices used for testing. If ``None``, the complement of
        ``train_idx`` is used. If ``train_idx`` is also ``None``, a 2-fold split is
        taken.
    scaler : object implementing fit/transform, default=``StandardFlexibleScaler``
        Scales X and Y before computing the reconstruction measure. The default
        value scales the features such that the reconstruction measure on the
        training set is bounded from above by 1.
estimator : object implementing fit/predict, default=``Ridge2FoldCV``
Sklearn estimator used to reconstruct test features/samples.
Returns
-------
    pointwise_global_reconstruction_distortion : numpy.ndarray
        The global reconstruction distortion for each test sample/point.
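
    Examples
    --------
    A minimal sketch (illustrative; the ``skmatter.metrics`` import path is an
    assumption). A rotated copy of X can be recovered by an orthogonal map, so
    the pointwise distortions should stay small::

        import numpy as np

        from skmatter.metrics import pointwise_global_reconstruction_distortion

        rng = np.random.default_rng(0)
        X = rng.normal(size=(100, 4))
        Q, _ = np.linalg.qr(rng.normal(size=(4, 4)))  # a random orthogonal map
        distortions = pointwise_global_reconstruction_distortion(X, X @ Q)
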
"""
train_idx, test_idx, scaler, estimator = check_global_reconstruction_measures_input(
X, Y, train_idx, test_idx, scaler, estimator
)
X_train, X_test, Y_train, _Y_test = _prepare_data(X, Y, train_idx, test_idx, scaler)
predictions_Y_test = estimator.fit(X_train, Y_train).predict(X_test)
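    # the distortion compares the linear reconstruction of the test targets with
    # the best purely orthogonal map, fitted to reproduce that same linear
    # reconstruction on the training set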
orthogonal_predictions_Y_test = (
OrthogonalRegression(use_orthogonal_projector=False)
.fit(X_train, estimator.predict(X_train))
.predict(X_test)
)
return np.linalg.norm(predictions_Y_test - orthogonal_predictions_Y_test, axis=1)
def global_reconstruction_distortion(
X,
Y,
test_idx=None,
train_idx=None,
scaler=None,
estimator=None,
):
r"""Computes the global reconstruction distortion using the source X
to reconstruct the features or samples of target Y based on a minimization
by orthogonal regression:
.. math::
GRD(X,Y) = \min_Q ||y - XQ\|| \quad\mathrm{subject\ to}\quad Q^TQ=I
If used with X and Y of shape (n_samples, n_features) it computes the
global reconstruction distortion of the features as defined in
Ref. [Goscinski2021]_.
In this case the number of samples of X and Y should agree with each other,
but the number of features can be different. The distortion is expressed per sample.
If used with X and Y of shape(n_features, n_samples) it computes the
reconstruction distortion of the samples. In this case the number of
features of X and Y should agree with each other, but the number of
samples can be different. The distortion is expressed per feature.
The default parameters mimics the ones of Ref. [Goscinski2021]_.
Parameters
----------
X : numpy.ndarray of shape (n_samples, X_n_features)
Source data which reconstructs target Y.
For feature reconstruction of Y using X use input shape (samples, features).
For sample reconstruction of Y using X use input shape (features, samples).
Y : numpy.ndarray of shape (n_samples, Y_n_targets)
Target data which is reconstructed with X.
For feature reconstruction of Y using X use input shape (samples, features).
For sample reconstruction of Y using X use input shape (features, samples).
    train_idx : numpy.ndarray, dtype=int, default=None
        Array of indices used for training. If ``None``, the complement of
        ``test_idx`` is used. If ``test_idx`` is also ``None``, a 2-fold split is
        taken.
    test_idx : numpy.ndarray, dtype=int, default=None
        Array of indices used for testing. If ``None``, the complement of
        ``train_idx`` is used. If ``train_idx`` is also ``None``, a 2-fold split is
        taken.
    scaler : object implementing fit/transform, default=``StandardFlexibleScaler``
        Scales X and Y before computing the reconstruction measure. The default
        value scales the features such that the reconstruction measure on the
        training set is bounded from above by 1.
estimator : object implementing fit/predict, default=``Ridge2FoldCV``
Sklearn estimator used to reconstruct test features/samples.
Returns
-------
    global_reconstruction_distortion : float
        The global reconstruction distortion.
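
    Examples
    --------
    A minimal sketch (illustrative; the ``skmatter.metrics`` import path is an
    assumption). An anisotropic scaling of X keeps the linear reconstruction
    error near zero but cannot be undone by an orthogonal map, so it shows up
    as a nonzero distortion::

        import numpy as np

        from skmatter.metrics import global_reconstruction_distortion

        rng = np.random.default_rng(0)
        X = rng.normal(size=(100, 4))
        Y = X * np.array([1.0, 1.0, 1.0, 10.0])  # stretch one feature
        grd = global_reconstruction_distortion(X, Y)  # expected > 0
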
"""
pointwise_global_reconstruction_distortion_values = (
pointwise_global_reconstruction_distortion(
X,
Y,
train_idx=train_idx,
test_idx=test_idx,
scaler=scaler,
estimator=estimator,
)
)
return np.linalg.norm(pointwise_global_reconstruction_distortion_values) / np.sqrt(
len(pointwise_global_reconstruction_distortion_values)
)
def pointwise_local_reconstruction_error(
X,
Y,
n_local_points,
test_idx=None,
train_idx=None,
scaler=None,
estimator=None,
n_jobs=None,
):
r"""Computes the pointwise local reconstruction error using the source X to
reconstruct the features or samples of target Y based on a minimization by linear
regression:
.. math::
\tilde{\mathbf{x}}'_i = \bar{\mathbf{x}} + (\mathbf{x}_i
- \bar{\mathbf{x}})\mathbf{P}^{(i)}
.. math::
LRE^{(i)}(X,Y) = \|\mathbf{x}'_i - \tilde{\mathbf{x}}'_i\|^2
If used with X and Y of shape (n_samples, n_features) it computes the pointwise
local reconstruction error of the features as defined in Ref. [Goscinski2021]_.
In this case the number of samples of X and Y should agree with each other,
but the number of features can be different. The error is expressed per sample.
If used with X and Y of shape(n_features, n_samples) it computes the
reconstruction error of the samples. In this case the number of
features of X and Y should agree with each other, but the number of
samples can be different. The error is expressed per feature.
The default parameters mimics the ones of Ref. [Goscinski2021]_.
Parameters
----------
X : numpy.ndarray of shape (n_samples, X_n_features)
Source data which reconstructs target Y.
For feature reconstruction of Y using X use input shape (samples, features).
For sample reconstruction of Y using X use input shape (features, samples).
Y : numpy.ndarray of shape (n_samples, Y_n_targets)
Target data which is reconstructed with X.
For feature reconstruction of Y using X use input shape (samples, features).
For sample reconstruction of Y using X use input shape (features, samples).
    n_local_points : int
        Number of neighbour points used to compute the local reconstruction
        weight for each sample/point.
    train_idx : numpy.ndarray, dtype=int, default=None
        Array of indices used for training. If ``None``, the complement of
        ``test_idx`` is used. If ``test_idx`` is also ``None``, a 2-fold split is
        taken.
    test_idx : numpy.ndarray, dtype=int, default=None
        Array of indices used for testing. If ``None``, the complement of
        ``train_idx`` is used. If ``train_idx`` is also ``None``, a 2-fold split is
        taken.
    scaler : object implementing fit/transform, default=``StandardFlexibleScaler``
        Scales X and Y before computing the reconstruction measure. The default
        value scales the features such that the reconstruction measure on the
        training set is bounded from above by 1.
estimator : object implementing fit/predict, default=``Ridge2FoldCV``
Sklearn estimator used to reconstruct test features/samples.
n_jobs : int, default=None
The number of CPUs to use to do the computation.
:obj:`None` means 1 unless in a :obj:`joblib.parallel_backend` context.
``-1`` means using all processors. See
`n_jobs glossary from sklearn (external link) <https://scikit-learn.org/stable/glossary.html#term-n-jobs>`_
for more details.
Returns
-------
    pointwise_local_reconstruction_error : numpy.ndarray
        The local reconstruction error for each test sample/point.
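
    Examples
    --------
    A minimal sketch (illustrative; the ``skmatter.metrics`` import path is an
    assumption). ``n_local_points`` sets the neighbourhood size used for each
    local fit::

        import numpy as np

        from skmatter.metrics import pointwise_local_reconstruction_error

        rng = np.random.default_rng(0)
        X = rng.normal(size=(200, 4))
        Y = np.tanh(X)  # nonlinear in X, but close to linear locally
        errors = pointwise_local_reconstruction_error(X, Y, n_local_points=20)
        print(errors.shape)  # one error per test sample
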
"""
train_idx, test_idx, scaler, estimator = check_local_reconstruction_measures_input(
X, Y, n_local_points, train_idx, test_idx, scaler, estimator
)
X_train, X_test, Y_train, Y_test = _prepare_data(X, Y, train_idx, test_idx, scaler)
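    # pairwise squared Euclidean distances between test and train samples,
    # expanded as \|a - b\|^2 = \|a\|^2 + \|b\|^2 - 2 a.b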
squared_dist = (
np.sum(X_train**2, axis=1)
+ np.sum(X_test**2, axis=1)[:, np.newaxis]
- 2 * X_test @ X_train.T
)
n_test = X_test.shape[0]
    def local_reconstruction_error_i(i):
        # comments follow the notation of Ref. [Goscinski2021]_
local_env_idx = np.argsort(squared_dist[i])[:n_local_points]
# D_{k-neigh}^{(i)}
local_X_train = X_train[local_env_idx]
# \bar{x}_F
local_X_train_mean = np.mean(X_train[local_env_idx], axis=0)
# D_{k-neigh}^{(i)}
local_Y_train = Y_train[local_env_idx]
# \bar{x}_F'
local_Y_train_mean = np.mean(Y_train[local_env_idx], axis=0)
# P_{FF'}
estimator.fit(
local_X_train - local_X_train_mean,
local_Y_train - local_Y_train_mean,
)
# \tilde{x}_i' = \bar{x}_{F'} + (x_i - \bar{x}_F)P_{FF'}
tilde_x_i_dash_test = local_Y_train_mean + estimator.predict(
X_test[i, :][np.newaxis, :] - local_X_train_mean
)
# \|x_i' - \tilde{x}_i'\|
return np.linalg.norm(Y_test[i, :][np.newaxis, :] - tilde_x_i_dash_test)
pointwise_local_reconstruction_error_values = np.array(
Parallel(n_jobs=n_jobs)(
delayed(local_reconstruction_error_i)(i) for i in range(n_test)
)
)
return pointwise_local_reconstruction_error_values
def local_reconstruction_error(
X,
Y,
n_local_points,
test_idx=None,
train_idx=None,
scaler=None,
estimator=None,
n_jobs=None,
):
r"""Computes the local reconstruction error using the source X to reconstruct the
features or samples of target Y based on a minimization by linear regression:
.. math::
LRE(X,Y) = \sqrt{\sum_i LRE^{(i)}(X,Y)}/\sqrt{n_\text{test}}
If used with X and Y of shape (n_samples, n_features) it computes the
local reconstruction error of the features as defined in Ref. [Goscinski2021]_.
In this case the number of samples of X and Y should agree with each other,
but the number of features can be different. The error is expressed per sample.
If used with X and Y of shape(n_features, n_samples) it computes the
reconstruction error of the samples. In this case the number of
features of X and Y should agree with each other, but the number of
samples can be different. The error is expressed per feature.
The default parameters mimics the ones of Ref. [Goscinski2021]_.
Parameters
----------
X : numpy.ndarray of shape (n_samples, X_n_features)
Source data which reconstructs target Y.
For feature reconstruction of Y using X use input shape (samples, features).
For sample reconstruction of Y using X use input shape (features, samples).
Y : numpy.ndarray of shape (n_samples, Y_n_targets)
Target data which is reconstructed with X.
For feature reconstruction of Y using X use input shape (samples, features).
For sample reconstruction of Y using X use input shape (features, samples).
    n_local_points : int
        Number of neighbour points used to compute the local reconstruction
        weight for each sample/point.
    train_idx : numpy.ndarray, dtype=int, default=None
        Array of indices used for training. If ``None``, the complement of
        ``test_idx`` is used. If ``test_idx`` is also ``None``, a 2-fold split is
        taken.
    test_idx : numpy.ndarray, dtype=int, default=None
        Array of indices used for testing. If ``None``, the complement of
        ``train_idx`` is used. If ``train_idx`` is also ``None``, a 2-fold split is
        taken.
    scaler : object implementing fit/transform, default=``StandardFlexibleScaler``
        Scales X and Y before computing the reconstruction measure. The default
        value scales the features such that the reconstruction measure on the
        training set is bounded from above by 1.
estimator : object implementing fit/predict, default=``Ridge2FoldCV``
Sklearn estimator used to reconstruct test features/samples.
n_jobs : int, default=None
The number of CPUs to use to do the computation.
:obj:`None` means 1 unless in a :obj:`joblib.parallel_backend` context.
``-1`` means using all processors. See
`n_jobs glossary from sklearn (external link) <https://scikit-learn.org/stable/glossary.html#term-n-jobs>`_
for more details.
Returns
-------
    local_reconstruction_error : float
        The local reconstruction error.
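
    Examples
    --------
    A minimal sketch (illustrative; the ``skmatter.metrics`` import path is an
    assumption). The per-point local fits are independent, so they can be
    parallelized with ``n_jobs``::

        import numpy as np

        from skmatter.metrics import local_reconstruction_error

        rng = np.random.default_rng(0)
        X = rng.normal(size=(200, 4))
        Y = np.tanh(X)
        lre = local_reconstruction_error(X, Y, 20, n_jobs=2)
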
"""
pointwise_local_reconstruction_error_values = pointwise_local_reconstruction_error(
X,
Y,
n_local_points,
train_idx=train_idx,
test_idx=test_idx,
scaler=scaler,
estimator=estimator,
n_jobs=n_jobs,
)
return np.linalg.norm(pointwise_local_reconstruction_error_values) / np.sqrt(
len(pointwise_local_reconstruction_error_values)
)
def check_global_reconstruction_measures_input(
X, Y, train_idx, test_idx, scaler, estimator
):
"""Returns default reconstruction measure inputs for all None parameters"""
if X.shape[0] != Y.shape[0]:
raise ValueError(
f"First dimension of X ({X.shape[0]}) and Y ({Y.shape[0]}) must match"
)
if (train_idx is None) and (test_idx is None):
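        # default: a reproducible 50/50 random split of the samples (fixed seed)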
train_idx, test_idx = train_test_split(
np.arange(len(X)),
test_size=0.5,
train_size=0.5,
random_state=0x5F3759DF,
shuffle=True,
train_test_overlap=False,
)
elif train_idx is None:
train_idx = np.setdiff1d(np.arange(len(X)), test_idx)
elif test_idx is None:
test_idx = np.setdiff1d(np.arange(len(X)), train_idx)
if scaler is None:
scaler = StandardFlexibleScaler()
if estimator is None:
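        # default regressor mimicking Ref. [Goscinski2021]_: ridge with 2-fold CV
        # over relative regularization strengths (cutoff method) and a fixed seed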
estimator = Ridge2FoldCV(
alphas=np.geomspace(1e-9, 0.9, 20),
alpha_type="relative",
regularization_method="cutoff",
random_state=0x5F3759DF,
shuffle=True,
scoring="neg_root_mean_squared_error",
n_jobs=1,
)
return train_idx, test_idx, scaler, estimator
def check_local_reconstruction_measures_input(
X, Y, n_local_points, train_idx, test_idx, scaler, estimator
):
"""Returns default reconstruction measure inputs for all None parameters"""
# only needs to check one extra parameter
if len(X) < n_local_points:
raise ValueError(
f"X has {len(X)} samples but n_local_points={n_local_points}. "
"Must have at least n_local_points samples"
)
return check_global_reconstruction_measures_input(
X, Y, train_idx, test_idx, scaler, estimator
)
def _prepare_data(X, Y, train_idx, test_idx, scaler):
"""Split and scale data for reconstruction measures"""
X_train, X_test = X[train_idx], X[test_idx]
Y_train, Y_test = Y[train_idx], Y[test_idx]
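    # the same scaler instance is fit on the training block of X, used to
    # transform both X splits, and then refit on the training block of Y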
scaler.fit(X_train)
X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)
scaler.fit(Y_train)
Y_train_scaled = scaler.transform(Y_train)
Y_test_scaled = scaler.transform(Y_test)
return X_train_scaled, X_test_scaled, Y_train_scaled, Y_test_scaled