"""
Mutar estimators module.
"""
import numpy as np
import warnings

from sklearn.base import BaseEstimator, RegressorMixin
from sklearn.utils.validation import check_X_y, check_array, check_is_fitted
from sklearn.metrics import r2_score
from sklearn.exceptions import ConvergenceWarning

from .solvers import solver_dirty, solver_lasso, solver_mll, solver_mtw
from . import utils


class MultitaskRegression(BaseEstimator, RegressorMixin):
    """ Multitask regression abstract class.

    Parameters
    ----------
    fit_intercept : boolean
        Whether to calculate the intercept for this model. If set
        to False, no intercept will be used in calculations
        (e.g. data is expected to be already centered).
    normalize : boolean
        This parameter is ignored when fit_intercept is set to False. If True,
        the regressors X will be normalized before regression by subtracting
        the mean and dividing by the l2-norm.
    max_iter : int, optional
        The maximum number of iterations.
    tol : float, optional
        The tolerance for the optimization: if the updates are
        smaller than ``tol``, the optimization code checks the
        dual gap for optimality and continues until it is smaller
        than ``tol``.
    positive : boolean, optional (default False)
        If True, coefficients are constrained to be non-negative.
    warm_start : bool, optional
        When set to ``True``, reuse the solution of the previous call to fit as
        initialization, otherwise, just erase the previous solution.

    Attributes
    ----------
    coef_ : array, shape (n_features, n_tasks)
        Parameter vector (W in the cost function formula).
    intercept_ : array, shape (n_tasks,)
        Independent term in decision function.
    n_iter_ : int
        Number of iterations run by the coordinate descent solver to reach
        the specified tolerance.

    """
    def __init__(self, fit_intercept=True, normalize=False, positive=False,
                 max_iter=2000, tol=1e-4, warm_start=False):
        self.fit_intercept = fit_intercept
        self.normalize = normalize
        self.max_iter = max_iter
        self.tol = tol
        self.warm_start = warm_start
        self.is_fitted_ = False
        self.positive = positive

    def _pre_fit(self, X, y):
        """ Normalize and scale the data."""
        n_tasks = len(X)
        n_samples, n_features = X[0].shape
        X, y = check_X_y(X, y, accept_sparse=False, allow_nd=True,
                         multi_output=True)

        if y.shape != (n_tasks, n_samples):
            raise ValueError("Data shape not understood. X must be "
                             "(n_tasks, n_samples, n_features) and y must be "
                             "(n_tasks, n_samples)")
        X_scale = np.ones((n_tasks, n_features))
        if self.fit_intercept:
            X = X.copy()
            y = y.copy()
            X_offset = X.mean(axis=1)
            y_offset = y.mean(axis=1)
            X -= X_offset[:, None, :]
            y -= y_offset[:, None]
            if self.normalize:
                X_scale = np.linalg.norm(X, axis=1)
                X /= X_scale[:, None, :]
        else:
            X_offset = np.zeros((n_tasks, 1, n_features))
            y_offset = np.zeros((n_tasks, 1))
        return X, y, X_offset, y_offset, X_scale

    def _fit(self, X, y):
        """Generic fit method to be specified for each model."""
        pass

    def _post_fit(self, X_offset, y_offset, X_scale):
        """Set intercept and scale after fit."""
        assert self.is_fitted_
        if self.fit_intercept:
            if self.normalize:
                self.coef_ /= X_scale.T
            self.intercept_ = y_offset
            self.intercept_ -= np.einsum("kj,jk->k", X_offset, self.coef_)
        else:
            self.intercept_ = np.zeros_like(y_offset).flatten()

    def fit(self, X, y):
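        """Fit the model to the multitask data (X, y).

        Parameters
        ----------
        X : array, shape (n_tasks, n_samples, n_features)
            The training input samples.
        y : array, shape (n_tasks, n_samples)
            The target values.

        Returns
        -------
        self : object
            The fitted estimator.
        """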
        X, y, X_offset, y_offset, X_scale = self._pre_fit(X, y)
        self._fit(X, y)
        self.is_fitted_ = True
        self._post_fit(X_offset, y_offset, X_scale)
        return self

    def predict(self, X):
        """ Predict target given unseen data samples.

        Parameters
        ----------
        X : array-like, shape (n_tasks, n_samples, n_features)
            The input samples.

        Returns
        -------
        y : ndarray, shape (n_tasks, n_samples)
            Returns the predicted targets.
        """
        X = check_array(X, accept_sparse=False, allow_nd=True)
        check_is_fitted(self, 'is_fitted_')
        y_pred = [x.dot(c) for x, c in zip(X, self.coef_.T)]
        y_pred = np.array(y_pred) + self.intercept_[:, None]
        return y_pred

    def score(self, X, y, sample_weight=None):
        """ Returns the coefficient of determination R^2 of the prediction.

        Computes a score for each regression task.
        The coefficient R^2 is defined as (1 - u/v), where u is the residual
        sum of squares ((y_true - y_pred) ** 2).sum() and v is the total sum
        of squares ((y_true - y_true.mean()) ** 2).sum(). The best possible
        score is 1.0 and it can be negative (because the model can be
        arbitrarily worse). A constant model that always predicts the expected
        value of y, disregarding the input features, would get an R^2 score of
        0.0.

        Parameters
        ----------
        X : array-like, shape (n_tasks, n_samples, n_features)
            Test samples.
        y : array-like, shape (n_tasks, n_samples)
            True values for y.
        sample_weight : array-like, shape (n_tasks, n_samples), optional
            Sample weights.

        Returns
        -------
        scores : array-like, shape (n_tasks,)
            R^2 of self.predict(X) w.r.t. y for each task.
        """
        y_pred = self.predict(X)
        if sample_weight is None:
            scores = [r2_score(y_i, y_pred_i, multioutput='variance_weighted')
                      for y_i, y_pred_i in zip(y, y_pred)]
        else:
            scores = [r2_score(y_i, y_pred_i, sample_weight=w_i,
                      multioutput='variance_weighted')
                      for y_i, y_pred_i, w_i in zip(y, y_pred, sample_weight)]
        return np.array(scores)
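
# Usage sketch (illustrative, kept as a comment so nothing runs at import
# time): the fit / predict / score workflow shared by the concrete
# estimators below. Shapes follow the docstrings: X is
# (n_tasks, n_samples, n_features) and y is (n_tasks, n_samples).
#
#     import numpy as np
#     from mutar import GroupLasso
#     rng = np.random.RandomState(0)
#     X = rng.randn(2, 10, 5)           # 2 tasks, 10 samples, 5 features
#     W = rng.randn(5, 2)               # one coefficient column per task
#     y = np.array([x.dot(w) for x, w in zip(X, W.T)])
#     model = GroupLasso(alpha=0.1).fit(X, y)
#     y_pred = model.predict(X)         # shape (2, 10)
#     scores = model.score(X, y)        # one R^2 value per task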


class DirtyModel(MultitaskRegression):
    """ DirtyModel estimator with L1 and L1/L2 mixed-norm as regularizers.

    The optimization objective for Dirty models is::

        (1 / (2 * n_samples)) * ||Y - X(W_1 + W_2)||^2_Fro
        + alpha * ||W_1||_21 + beta * ||W_2||_1

    Where::

        ||W||_21 = sum_i sqrt{sum_j w_ij^2}

    i.e. the sum of the norms of each row,

    and::

        ||W||_1 = sum_i sum_j |w_ij|

    Parameters
    ----------
    alpha : float, optional
        Constant that multiplies the L1/L2 term. Defaults to 1.0.
    beta : float, optional
        Constant that multiplies the L1 term. Defaults to 1.0.
    fit_intercept : boolean
        Whether to calculate the intercept for this model. If set
        to False, no intercept will be used in calculations
        (e.g. data is expected to be already centered).
    normalize : boolean
        This parameter is ignored when fit_intercept is set to False. If True,
        the regressors X will be normalized before regression by subtracting
        the mean and dividing by the l2-norm.
    max_iter : int, optional
        The maximum number of iterations.
    tol : float, optional
        The tolerance for the optimization: if the updates are
        smaller than ``tol``, the optimization code checks the
        dual gap for optimality and continues until it is smaller
        than ``tol``.
    positive : boolean, optional (default False)
        If True, coefficients are constrained to be non-negative.
    warm_start : bool, optional
        When set to ``True``, reuse the solution of the previous call to fit
        as initialization; otherwise, just erase the previous solution.

    Attributes
    ----------
    coef_ : array, shape (n_features, n_tasks)
        Parameter vector (W in the cost function formula).
    coef_shared_ : array, shape (n_features, n_tasks)
        Shared part of the fitted coefficients (W_1).
    coef_specific_ : array, shape (n_features, n_tasks)
        Task-specific part of the fitted coefficients (W_2).
    intercept_ : array, shape (n_tasks,)
        Independent term in decision function.
    n_iter_ : int
        Number of iterations run by the coordinate descent solver to reach
        the specified tolerance.

    Examples
    --------
    >>> from mutar import DirtyModel
    >>> import numpy as np
    >>> X = np.array([[[3, 1], [2, 0], [1, 0]],\
                      [[0, 2], [-1, 3], [1, -2]]], dtype=float)
    >>> coef = np.array([[1., 1.], [0., -1]])
    >>> y = np.array([x.dot(c) for x, c in zip(X, coef.T)])
    >>> y += 0.1
    >>> dirty = DirtyModel(alpha=0.15, beta=0.12).fit(X, y)
    >>> print(dirty.coef_shared_)
    [[ 0.4652447  0.3465437]
     [ 0.        -0.       ]]
    >>> print(dirty.coef_specific_)
    [[ 0.35453532  0.        ]
     [ 0.         -1.20766296]]
    """

    def __init__(self, alpha=1., beta=1., fit_intercept=True, normalize=False,
                 max_iter=2000, tol=1e-4, positive=False, warm_start=False):
        super().__init__(fit_intercept=fit_intercept, normalize=normalize,
                         max_iter=max_iter, tol=tol, positive=positive,
                         warm_start=warm_start)
        self.alpha = alpha
        self.beta = beta

    def _fit(self, X, y):
        n_tasks = len(X)
        n_samples, n_features = X[0].shape

        if self.alpha == 0. and self.beta == 0.:
            warnings.warn("With alpha=beta=0, this algorithm does not "
                          "converge well. You are advised to use the "
                          "LinearRegression estimator", stacklevel=2)
        if not self.warm_start or not hasattr(self, "coef_"):
            coef_shared_ = np.zeros((n_features, n_tasks), dtype=X.dtype,
                                    order='F')
            coef_specific_ = np.zeros((n_features, n_tasks), dtype=X.dtype,
                                      order='F')
        else:
            # warm start: reuse the decomposition of the previous fit
            coef_shared_ = self.coef_shared_
            coef_specific_ = self.coef_specific_

        coef_shared_, coef_specific_, R, n_iter = \
            solver_dirty(X, y, coef_shared_, coef_specific_, self.alpha,
                         self.beta, self.max_iter, self.tol, self.positive)

        self.coef_ = coef_shared_ + coef_specific_
        self.coef_shared_ = coef_shared_
        self.coef_specific_ = coef_specific_
        self.residuals_ = R
        self.n_iter_ = n_iter
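

# Penalty sketch (illustrative): the two norms in the Dirty objective,
# written out in NumPy for a coefficient matrix W of shape
# (n_features, n_tasks), rows indexing features and columns indexing tasks.
#
#     import numpy as np
#     W = np.array([[1., -2.], [0., 3.]])
#     l21 = np.sqrt((W ** 2).sum(axis=1)).sum()  # ||W||_21, row-wise norms
#     l1 = np.abs(W).sum()                       # ||W||_1, entry-wise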


class GroupLasso(DirtyModel):
    """ GroupLasso estimator with L1/L2 mixed-norm as regularizer.

    The optimization objective for Group Lasso is::

        (1 / (2 * n_samples)) * ||Y - XW||^2_Fro + alpha * ||W||_21

    Where::

        ||W||_21 = sum_i sqrt{sum_j w_ij^2}

    i.e. the sum of the norms of each row.

    Parameters
    ----------
    alpha : float, optional
        Constant that multiplies the L1/L2 term. Defaults to 0.1.
    fit_intercept : boolean
        Whether to calculate the intercept for this model. If set
        to False, no intercept will be used in calculations
        (e.g. data is expected to be already centered).
    normalize : boolean
        This parameter is ignored when fit_intercept is set to False. If True,
        the regressors X will be normalized before regression by subtracting
        the mean and dividing by the l2-norm.
    max_iter : int, optional
        The maximum number of iterations.
    tol : float, optional
        The tolerance for the optimization: if the updates are
        smaller than ``tol``, the optimization code checks the
        dual gap for optimality and continues until it is smaller
        than ``tol``.
    positive : boolean, optional (default False)
        If True, coefficients are constrained to be non-negative.
    warm_start : bool, optional
        When set to ``True``, reuse the solution of the previous call to fit
        as initialization; otherwise, just erase the previous solution.

    Attributes
    ----------
    coef_ : array, shape (n_features, n_tasks)
        Parameter vector (W in the cost function formula).
    coef_shared_ : array, shape (n_features, n_tasks)
        Shared part of the fitted coefficients.
    coef_specific_ : array, shape (n_features, n_tasks)
        Task-specific part of the fitted coefficients (zero here).
    intercept_ : array, shape (n_tasks,)
        Independent term in decision function.
    n_iter_ : int
        Number of iterations run by the coordinate descent solver to reach
        the specified tolerance.

    Examples
    --------
    >>> from mutar import GroupLasso
    >>> import numpy as np
    >>> X = np.array([[[3, 1], [1, 0], [1, 0]],\
                      [[0, 2], [2, 3], [2, 3]]], dtype=float)
    >>> y = X.sum(axis=2) + 2
    >>> grouplasso = GroupLasso().fit(X, y)
    >>> print(grouplasso.coef_shared_)
    [[1.42045049 1.42045049]
     [0.         0.        ]]
    >>> print(grouplasso.coef_specific_)
    [[0. 0.]
     [0. 0.]]
    """

    def __init__(self, alpha=0.1, fit_intercept=True, normalize=False,
                 max_iter=2000, tol=1e-4, positive=False, warm_start=False):
        # if beta > alpha, Dirty models are equivalent to a Group Lasso
        beta = 10 * alpha
        super().__init__(alpha=alpha, beta=beta, fit_intercept=fit_intercept,
                         normalize=normalize, max_iter=max_iter, tol=tol,
                         positive=positive, warm_start=warm_start)
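

# Equivalence note (illustrative): with beta set well above alpha, placing
# weight in the specific part W_2 is never cheaper than placing it in the
# shared part W_1, so the solver returns coef_specific_ == 0 and the Dirty
# objective reduces to the Group Lasso objective; the doctest above shows
# exactly this.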


class IndRewLasso(MultitaskRegression):
    """ Independent Reweighted Lasso estimator with non-convex L0.5
    regularizer.

    The optimization objective for IndRewLasso is::

        (1 / (2 * n_samples)) * ||Y - XW||^2_Fro + alpha * ||W||_0.5

    Where::

        ||W||_0.5 = sum_i sum_j sqrt|w_ij|

    Parameters
    ----------
    alpha : float or array-like, shape (n_tasks,), optional
        Constant that multiplies the L0.5 term, one value per task.
        Defaults to ones(n_tasks).
    fit_intercept : boolean
        Whether to calculate the intercept for this model. If set
        to False, no intercept will be used in calculations
        (e.g. data is expected to be already centered).
    normalize : boolean
        This parameter is ignored when fit_intercept is set to False. If True,
        the regressors X will be normalized before regression by subtracting
        the mean and dividing by the l2-norm.
    max_iter : int, optional
        The maximum number of inner loop iterations.
    max_iter_reweighting : int, optional
        Maximum number of reweighting steps, i.e. outer loop iterations.
    tol : float, optional
        The tolerance for the optimization: if the updates are
        smaller than ``tol``, the optimization code checks the
        dual gap for optimality and continues until it is smaller
        than ``tol``.
    positive : boolean, optional (default False)
        If True, coefficients are constrained to be non-negative.
    warm_start : bool, optional
        When set to ``True``, reuse the solution of the previous call to fit
        as initialization; otherwise, just erase the previous solution.

    Attributes
    ----------
    coef_ : array, shape (n_features, n_tasks)
        Parameter vector (W in the cost function formula).
    intercept_ : array, shape (n_tasks,)
        Independent term in decision function.
    n_iter_ : int
        Number of iterations run by the coordinate descent solver to reach
        the specified tolerance.

    Examples
    --------
    >>> from mutar import IndRewLasso
    >>> import numpy as np
    >>> X = np.array([[[3, 1], [2, 0], [1, 0]],\
                      [[0, 2], [-1, 3], [1, -2]]], dtype=float)
    >>> coef = np.array([[1., 1.], [0., -1]])
    >>> y = np.array([x.dot(c) for x, c in zip(X, coef.T)])
    >>> y += 0.1
    >>> alpha = [0.1, 0.2]
    >>> relasso = IndRewLasso(alpha=alpha).fit(X, y)
    >>> print(relasso.coef_)
    [[ 0.92188134  0.        ]
     [ 0.         -1.33862186]]
    """

    def __init__(self, alpha=1., fit_intercept=True, normalize=False,
                 max_iter=2000, max_iter_reweighting=100, tol=1e-4,
                 positive=False, warm_start=False):
        super().__init__(fit_intercept=fit_intercept, normalize=normalize,
                         max_iter=max_iter, tol=tol, positive=positive,
                         warm_start=warm_start)
        self.alpha = np.asarray(alpha)
        self.max_iter_reweighting = max_iter_reweighting

    def _fit(self, X, y):
        n_tasks = len(X)
        n_samples, n_features = X[0].shape

        if not self.warm_start or not hasattr(self, "coef_"):
            self.coef_ = np.zeros((n_features, n_tasks))

        weights = np.ones_like(self.coef_)
        coef_old = self.coef_.copy()
        self.loss_ = []

        for i in range(self.max_iter_reweighting):
            Xw = X * weights.T[:, None, :]
            coef_ = solver_lasso(Xw, y, self.alpha, self.max_iter, self.tol,
                                 positive=self.positive)
            coef_ = coef_ * weights
            err = abs(coef_ - coef_old).max()
            err /= max(abs(coef_).max(), abs(coef_old).max(), 1.)
            coef_old = coef_.copy()
            weights = 2 * (abs(coef_) ** 0.5 + 1e-10)
            obj = 0.5 * (utils.residual(X, coef_, y) ** 2).sum() / n_samples
            obj += (self.alpha[None, :] * abs(coef_) ** 0.5).sum()
            self.loss_.append(obj)
            if err < self.tol and i:
                break
        if i == self.max_iter_reweighting - 1 and i:
            warnings.warn('Reweighted objective did not converge.'
                          ' You might want to increase'
                          ' the number of reweighting iterations.'
                          ' Fitting data with very small alpha'
                          ' may cause precision problems.',
                          ConvergenceWarning)
        self.coef_ = coef_
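

# Reweighting sketch (illustrative): each outer iteration solves a weighted
# Lasso on the rescaled design Xw = X * weights.T[:, None, :] and maps the
# solution back through coef_ = coef * weights. The L1 penalty on the
# rescaled variables equals sum_ij |w_ij| / weights_ij, so the update
#
#     weights = 2 * (np.abs(coef_) ** 0.5 + 1e-10)
#
# makes each pass minimize the tangent majorization of
# alpha * sum_ij sqrt|w_ij| at the previous iterate.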


class IndLasso(IndRewLasso):
    """ Independent Lasso estimator with L1 regularizer.

    The optimization objective for IndLasso is::

        (1 / (2 * n_samples)) * ||Y - XW||^2_Fro + alpha * ||W||_1

    Where::

        ||W||_1 = sum_i sum_j |w_ij|

    Parameters
    ----------
    alpha : float or array-like, shape (n_tasks,), optional
        Constant that multiplies the L1 term, one value per task.
        Defaults to ones(n_tasks).
    fit_intercept : boolean
        Whether to calculate the intercept for this model. If set
        to False, no intercept will be used in calculations
        (e.g. data is expected to be already centered).
    normalize : boolean
        This parameter is ignored when fit_intercept is set to False. If True,
        the regressors X will be normalized before regression by subtracting
        the mean and dividing by the l2-norm.
    max_iter : int, optional
        The maximum number of iterations.
    positive : boolean, optional (default False)
        If True, coefficients are constrained to be non-negative.
    tol : float, optional
        The tolerance for the optimization: if the updates are
        smaller than ``tol``, the optimization code checks the
        dual gap for optimality and continues until it is smaller
        than ``tol``.
    warm_start : bool, optional
        When set to ``True``, reuse the solution of the previous call to fit
        as initialization; otherwise, just erase the previous solution.

    Attributes
    ----------
    coef_ : array, shape (n_features, n_tasks)
        Parameter vector (W in the cost function formula).
    intercept_ : array, shape (n_tasks,)
        Independent term in decision function.
    n_iter_ : int
        Number of iterations run by the coordinate descent solver to reach
        the specified tolerance.

    Examples
    --------
    >>> from mutar import IndLasso
    >>> import numpy as np
    >>> X = np.array([[[3, 1], [2, 0], [1, 0]],\
                      [[0, 2], [-1, 3], [1, -2]]], dtype=float)
    >>> coef = np.array([[1., 1.], [0., -1]])
    >>> y = np.array([x.dot(c) for x, c in zip(X, coef.T)])
    >>> y += 0.1
    >>> alpha = [0.1, 0.2]
    >>> indlasso = IndLasso(alpha=alpha).fit(X, y)
    >>> print(indlasso.coef_)
    [[ 0.85        0.        ]
     [ 0.         -1.31428571]]
    """

    def __init__(self, alpha=1., fit_intercept=True, normalize=False,
                 max_iter=2000, tol=1e-4, positive=False, warm_start=False):
        super().__init__(alpha=alpha, fit_intercept=fit_intercept,
                         normalize=normalize, max_iter=max_iter,
                         positive=positive, tol=tol, warm_start=warm_start)
        # a single reweighting pass reduces the reweighted solver to a
        # plain Lasso
        self.max_iter_reweighting = 1


class MultiLevelLasso(MultitaskRegression):
    """ MultiLevelLasso estimator with a non-convex product decomposition.

    The optimization objective for the Multilevel Lasso is::

        (1 / (2 * n_samples)) * ||Y - XW||^2_Fro + alpha * ||W||_{1 1/2}

    Where:

    .. math:: \\|W\\|_{1 \\frac{1}{2}} = \\sum_j \\sqrt{\\|W_j\\|_1}

    which is equivalent to::

        (1 / (2 * n_samples)) * ||Y - X(C[:, None] * S)||^2_Fro
        + beta * ||C||_1 + gamma * ||S||_1

    Where:

    .. math::

        C \\in \\mathbb{R}^{n\\_features}

        S \\in \\mathbb{R}^{n\\_features \\times n\\_tasks}

        \\alpha = 2 \\sqrt{\\beta \\gamma}

    Parameters
    ----------
    alpha : float, optional
        Constant that multiplies the L_{1 1/2} term. Defaults to 1.0.
    fit_intercept : boolean
        Whether to calculate the intercept for this model. If set
        to False, no intercept will be used in calculations
        (e.g. data is expected to be already centered).
    normalize : boolean
        This parameter is ignored when fit_intercept is set to False. If True,
        the regressors X will be normalized before regression by subtracting
        the mean and dividing by the l2-norm.
    max_iter : int, optional
        The maximum number of iterations.
    tol : float, optional
        The tolerance for the optimization: if the updates are
        smaller than ``tol``, the optimization code checks the
        dual gap for optimality and continues until it is smaller
        than ``tol``.
    positive : boolean, optional (default False)
        If True, coefficients are constrained to be non-negative.
    warm_start : bool, optional
        When set to ``True``, reuse the solution of the previous call to fit
        as initialization; otherwise, just erase the previous solution.

    Attributes
    ----------
    coef_ : array, shape (n_features, n_tasks)
        Parameter vector (W in the cost function formula).
    coef_shared_ : array, shape (n_features,)
        Shared feature-level vector (C in the product decomposition).
    coef_specific_ : array, shape (n_features, n_tasks)
        Task-specific part (S in the product decomposition).
    intercept_ : array, shape (n_tasks,)
        Independent term in decision function.
    n_iter_ : int
        Number of iterations run by the coordinate descent solver to reach
        the specified tolerance.

    Examples
    --------
    >>> from mutar import MultiLevelLasso
    >>> import numpy as np
    >>> X = np.array([[[3, 1], [2, 0], [1, 0]],\
                      [[0, 2], [-1, 3], [1, -2]]], dtype=float)
    >>> coef = np.array([[1., 1.], [0., -1]])
    >>> y = np.array([x.dot(c) for x, c in zip(X, coef.T)])
    >>> y += 0.1
    >>> mll = MultiLevelLasso(alpha=0.1).fit(X, y)
    >>> print(mll.coef_shared_)
    [0.91502387 1.04852402]
    >>> print(mll.coef_specific_)
    [[ 0.91339402  0.        ]
     [ 0.         -1.27834952]]
    >>> print(mll.coef_)
    [[ 0.83577734  0.        ]
     [ 0.         -1.34038017]]
    """

    def __init__(self, alpha=1., fit_intercept=True, normalize=False,
                 max_iter=2000, tol=1e-4, positive=False, warm_start=False):
        super().__init__(fit_intercept=fit_intercept, normalize=normalize,
                         max_iter=max_iter, tol=tol, positive=positive,
                         warm_start=warm_start)
        self.alpha = alpha

    def _fit(self, X, y):
        n_tasks = len(X)
        n_samples, n_features = X[0].shape

        if self.alpha == 0.:
            warnings.warn("With alpha=0, this algorithm does not converge"
                          " well. You are advised to use the LinearRegression"
                          " estimator", stacklevel=2)
        if not self.warm_start or not hasattr(self, "coef_"):
            coef_shared_ = np.ones(n_features)
            coef_specific_ = np.zeros((n_features, n_tasks))
        else:
            # warm start: reuse the decomposition of the previous fit
            coef_shared_ = self.coef_shared_
            coef_specific_ = self.coef_specific_

        coef_shared_, coef_specific_, n_iter = \
            solver_mll(X, y, C=coef_shared_, S=coef_specific_,
                       alpha=self.alpha, max_iter=self.max_iter, tol=self.tol,
                       positive=self.positive)

        self.coef_ = coef_shared_[:, None] * coef_specific_
        self.coef_shared_ = coef_shared_
        self.coef_specific_ = coef_specific_
        self.n_iter_ = n_iter
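

# Decomposition note (illustrative): the Multilevel Lasso factorizes the
# coefficients into a shared, feature-level vector C (coef_shared_) and a
# task-specific matrix S (coef_specific_); the reported coefficients are
# their product,
#
#     coef_ = coef_shared_[:, None] * coef_specific_
#
# so a feature is removed for all tasks at once whenever its entry of C
# hits zero.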


class ReMTW(MultitaskRegression):
    """A class for reweighted Multitask Wasserstein (ReMTW) regularization.

    The optimization objective for ReMTW is::

        (1 / (2 * n_samples)) * ||Y - X(W+ - W-)||^2_Fro
        + alpha * (sum_k OT(W+_k, Wb+) + sum_k OT(W-_k, Wb-))
        + beta * (||W+||_{0.5} + ||W-||_{0.5})

    Where::

        OT is the unbalanced Wasserstein distance with Kullback-Leibler
        marginal relaxation,

    and::

        ||W||_{0.5} = sum_i sum_j sqrt|w_ij|

    If `concomitant` is set to `True`, ReMTW also infers the standard
    deviation of each task, which allows `beta` to be scaled adaptively for
    each task according to its level of noise. The optimization objective
    for concomitant ReMTW is::

        (1 / (2 * n_samples)) * sum_k ||Y_k - X_k(W+_k - W-_k)||^2 / sigma_k
        + alpha * (sum_k OT(W+_k, Wb+) + sum_k OT(W-_k, Wb-))
        + beta * (||W+||_{0.5} + ||W-||_{0.5})
        + sum_k sigma_k / 2

    Parameters
    ----------
    alpha : float, optional
        Constant that multiplies the OT term. Defaults to 0.1.
    beta : float, optional
        Constant that multiplies the L0.5 term. Defaults to 0.1.
    M : array, shape (n_features, n_features)
        Ground metric matrix defining the Wasserstein distance.
    epsilon : float > 0, optional
        OT parameter. Weight of the entropy regularization.
    gamma : float > 0, optional
        OT parameter. Weight of the Kullback-Leibler marginal relaxation.
    fit_intercept : boolean
        Whether to calculate the intercept for this model. If set
        to False, no intercept will be used in calculations
        (e.g. data is expected to be already centered).
    normalize : boolean
        This parameter is ignored when fit_intercept is set to False. If True,
        the regressors X will be normalized before regression by subtracting
        the mean and dividing by the l2-norm.
    concomitant : boolean, optional (default False)
        If True, the concomitant version of MTW is used where the l1
        penalty is adaptively scaled to the estimated noise std.
    stable : boolean, optional (default False)
        If True, use log-domain Sinkhorn stabilization from the first
        iteration. If False, the solver switches to the log domain
        automatically if numerical errors are encountered.
    max_iter_reweighting : int, optional (default 100)
        Maximum number of reweighting steps, i.e. outer loop iterations.
    max_iter_ot : int, optional (default 20)
        Maximum number of Sinkhorn iterations.
    max_iter_cd : int, optional (default 1000)
        Maximum number of coordinate descent iterations.
    tol_ot : float, optional (default 1e-5)
        Relative maximum change of the Wasserstein barycenter.
    tol_cd : float, optional (default 1e-5)
        Relative maximum change of the coefficients in coordinate descent.
    tol_reweighting : float, optional (default 1e-3)
        Relative maximum change of the coefficients between reweighting
        iterations.
    n_jobs : int, optional (default 1)
        Number of threads used in the coordinate descents.
    gpu : boolean, optional (default False)
        If True, Sinkhorn iterations are performed on GPUs using cupy.
    positive : boolean, optional (default False)
        If True, coefficients are constrained to be non-negative.
    max_iter : int, optional
        The maximum number of iterations.
    tol : float, optional
        The tolerance for the optimization: if the updates are
        smaller than ``tol``, the optimization code checks the
        dual gap for optimality and continues until it is smaller
        than ``tol``.
    ot_threshold : float, optional (default 0.)
        OT barycenters are computed on the support of coefs > ot_threshold.
        1e-7 is recommended to speed up Sinkhorn with high dimensional data.
    warm_start : bool, optional
        When set to ``True``, reuse the solution of the previous call to fit
        as initialization; otherwise, just erase the previous solution.

    Attributes
    ----------
    coef_ : array, shape (n_features, n_tasks)
        Parameter vector (W in the cost function formula), equal to
        coef1_ - coef2_.
    intercept_ : array, shape (n_tasks,)
        Independent term in decision function.
    barycenter_ : array, shape (n_features,)
        Signed Wasserstein barycenter of the coefficients,
        barycenter1_ - barycenter2_.
    n_iter_ : int
        Number of iterations run by the coordinate descent solver to reach
        the specified tolerance.

    Examples
    --------
    >>> from mutar import ReMTW
    >>> import numpy as np
    >>> X = np.array([[[3, 1], [2, 0], [1, 0]],\
                      [[0, 2], [-1, 3], [1, -2]]], dtype=float)
    >>> coef = np.array([[0., 1.], [0., 2.]])
    >>> y = np.array([x.dot(c) for x, c in zip(X, coef.T)])
    >>> y += 0.1
    >>> remtw = ReMTW(alpha=0.2, beta=0.1).fit(X, y)
    >>> print(remtw.coef_)
    [[-0.01064798 -0.12868387]
     [ 0.10112546  1.55361523]]
    >>> print(remtw.barycenter_)
    [0.13216306 0.34976005]
    """

    def __init__(self, alpha=0.1, beta=0.1, M=None, epsilon=None, gamma=None,
                 fit_intercept=True, normalize=False, concomitant=False,
                 max_iter=2000, tol=1e-4, warm_start=False,
                 max_iter_reweighting=100, stable=False, max_iter_ot=20,
                 max_iter_cd=1000, tol_ot=1e-5, tol_cd=1e-5, n_jobs=1,
                 gpu=False, positive=False, ot_threshold=0.,
                 tol_reweighting=0.001):
        super().__init__(fit_intercept=fit_intercept, normalize=normalize,
                         max_iter=max_iter, tol=tol, positive=positive,
                         warm_start=warm_start)
        self.M = M
        self.alpha = alpha
        self.beta = beta
        self.epsilon = epsilon
        self.gamma = gamma
        self.stable = stable
        self.max_iter_ot = max_iter_ot
        self.max_iter_cd = max_iter_cd
        self.tol_ot = tol_ot
        self.tol_cd = tol_cd
        self.tol_reweighting = tol_reweighting
        self.n_jobs = n_jobs
        self.concomitant = concomitant
        self.max_iter_reweighting = max_iter_reweighting
        self.gpu = gpu
        self.ot_threshold = ot_threshold
        self.b1_ = None
        self.b2_ = None
        self.coef1_ = None
        self.coef2_ = None

    def _set_ot_params(self, n_features):
        """Set the OT hyperparameters if not specified."""
        if self.M is None:
            self.M = utils.groundmetric(n_features, p=2, normed=True)
        if self.epsilon is None:
            self.epsilon = 5. / n_features
        if self.gamma is None:
            self.gamma = utils.compute_gamma(0.8, self.M)

    def _fit(self, X, y):
        if self.alpha == 0. and self.beta == 0.:
            warnings.warn("With alpha=beta=0, this algorithm does not "
                          "converge well. You are advised to use the "
                          "LinearRegression estimator", stacklevel=2)
        n_tasks, n_samples, n_features = X.shape
        Xf = [np.asfortranarray(X[k]) for k in range(n_tasks)]
        mXf = [- np.asfortranarray(X[k]) for k in range(n_tasks)]
        Ls = (X ** 2).sum(axis=1)
        Ls[Ls == 0.] = Ls[Ls != 0].min()
        self._set_ot_params(n_features)
        self.loss_ = []
        weights1 = np.ones((n_features, n_tasks))
        weights2 = weights1.copy()

        if not self.warm_start or not hasattr(self, "coef_"):
            self.coef1_ = np.ones((n_features, n_tasks)) / n_features
            self.coef2_ = np.ones((n_features, n_tasks)) / n_features
            if self.alpha == 0.:
                self.coef1_ *= 0.
                self.coef2_ *= 0.
            self.b1_ = None
            self.b2_ = None
            self.residuals_ = y.copy()
            self.sigma_ = np.ones(n_tasks)
        if self.positive:
            self.coef2_ *= 0.
        coef_old = np.zeros((n_features, n_tasks))
        beta1 = self.beta * weights1
        beta2 = self.beta * weights2

        for i in range(self.max_iter_reweighting):
            coef1, coef2, bar1, bar2, log, sigma, b1, b2, R = \
                solver_mtw(Xf, mXf, Ls, y, M=self.M, alpha=self.alpha,
                           beta1=beta1, beta2=beta2, epsilon=self.epsilon,
                           gamma=self.gamma, coef1=self.coef1_,
                           coef2=self.coef2_, R=self.residuals_,
                           b1=self.b1_, b2=self.b2_, sigmas=self.sigma_,
                           concomitant=self.concomitant, stable=self.stable,
                           tol=self.tol, max_iter=self.max_iter,
                           tol_ot=self.tol_ot, max_iter_ot=self.max_iter_ot,
                           positive=self.positive, n_jobs=self.n_jobs,
                           tol_cd=self.tol_cd, gpu=self.gpu,
                           ot_threshold=self.ot_threshold)
            coef_ = coef1 - coef2
            self.coef1_ = coef1
            self.coef2_ = coef2
            self.coef_ = coef_
            self.sigma_ = sigma
            self.b1_ = b1
            self.b2_ = b2
            self.residuals_ = R
            err = abs(coef_ - coef_old).max()
            err /= max(abs(coef_).max(), abs(coef_old).max(), 1.)
            coef_old = coef_.copy()
            obj = self.alpha * (log["fot1"][-1] + log["fot2"][-1])
            obj += 0.5 * (R ** 2).sum(axis=1).dot(1 / self.sigma_) / n_samples
            obj += self.beta * (coef1 ** 0.5 + coef2 ** 0.5).sum()
            self.loss_.append(obj)
            if err < self.tol_reweighting and i:
                break
            if self.alpha == 0.:
                weights1 = 0.5 / (abs(coef_) ** 0.5 + 1e-10)
                weights2 = weights1
            else:
                weights1 = 0.5 / (coef1 ** 0.5 + 1e-10)
                weights2 = 0.5 / (coef2 ** 0.5 + 1e-10)
            beta1 = self.beta * weights1
            beta2 = self.beta * weights2
        if i == self.max_iter_reweighting - 1 and i:
            warnings.warn('Reweighted objective did not converge.'
                          ' You might want to increase'
                          ' the number of reweighting iterations.'
                          ' Fitting data with very small alpha and beta'
                          ' may cause precision problems.',
                          ConvergenceWarning)
        self.barycenter1_ = bar1
        self.barycenter2_ = bar2
        self.barycenter_ = bar1 - bar2
        self.log_ = log
        return self

    def reset(self):
        """Forget the previous solution so the next fit starts from scratch."""
        if hasattr(self, 'coef_'):
            del self.coef_
            self.coef1_ = None
            self.coef2_ = None
            del self.barycenter2_
            del self.barycenter1_
            del self.barycenter_
            del self.log_
            self.residuals_ = None
            self.b1_ = None
            self.b2_ = None
        self.stable = False
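

# Sign-splitting note (illustrative): MTW-type models optimize over two
# non-negative parts W+ and W- (coef1_ and coef2_) and report their
# difference,
#
#     coef_ = coef1_ - coef2_
#     barycenter_ = barycenter1_ - barycenter2_
#
# With positive=True, coef2_ is pinned to zero, so coef_ is non-negative.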


class MTW(ReMTW):
    """A class for Multitask Wasserstein (MTW) regularization.

    The optimization objective for MTW is::

        (1 / (2 * n_samples)) * ||Y - X(W+ - W-)||^2_Fro
        + alpha * (sum_k OT(W+_k, Wb+) + sum_k OT(W-_k, Wb-))
        + beta * (||W+||_1 + ||W-||_1)

    Where::

        OT is the unbalanced Wasserstein distance with Kullback-Leibler
        marginal relaxation,

    and::

        ||W||_1 = sum_i sum_j |w_ij|

    If `concomitant` is set to `True`, MTW also infers the standard
    deviation of each task, which allows `beta` to be scaled adaptively for
    each task according to its level of noise. The optimization objective
    for concomitant MTW is::

        (1 / (2 * n_samples)) * sum_k ||Y_k - X_k(W+_k - W-_k)||^2 / sigma_k
        + alpha * (sum_k OT(W+_k, Wb+) + sum_k OT(W-_k, Wb-))
        + beta * (||W+||_1 + ||W-||_1)
        + sum_k sigma_k / 2

    Parameters
    ----------
    alpha : float, optional
        Constant that multiplies the OT term. Defaults to 0.1.
    beta : float, optional
        Constant that multiplies the L1 term. Defaults to 0.1.
    M : array, shape (n_features, n_features)
        Ground metric matrix defining the Wasserstein distance.
    epsilon : float > 0, optional
        OT parameter. Weight of the entropy regularization.
    gamma : float > 0, optional
        OT parameter. Weight of the Kullback-Leibler marginal relaxation.
    fit_intercept : boolean
        Whether to calculate the intercept for this model. If set
        to False, no intercept will be used in calculations
        (e.g. data is expected to be already centered).
    normalize : boolean
        This parameter is ignored when fit_intercept is set to False. If True,
        the regressors X will be normalized before regression by subtracting
        the mean and dividing by the l2-norm.
    concomitant : boolean, optional (default False)
        If True, the concomitant version of MTW is used where the l1
        penalty is adaptively scaled to the estimated noise std.
    stable : boolean, optional (default False)
        If True, use log-domain Sinkhorn stabilization from the first
        iteration. If False, the solver switches to the log domain
        automatically if numerical errors are encountered.
    max_iter_ot : int, optional (default 20)
        Maximum number of Sinkhorn iterations.
    max_iter_cd : int, optional (default 1000)
        Maximum number of coordinate descent iterations.
    tol_ot : float, optional (default 1e-5)
        Relative maximum change of the Wasserstein barycenter.
    tol_cd : float, optional (default 1e-5)
        Relative maximum change of the coefficients in coordinate descent.
    n_jobs : int, optional (default 1)
        Number of threads used in the coordinate descents.
    gpu : boolean, optional (default False)
        If True, Sinkhorn iterations are performed on GPUs using cupy.
    positive : boolean, optional (default False)
        If True, coefficients are constrained to be non-negative.
    max_iter : int, optional
        The maximum number of iterations.
    tol : float, optional
        The tolerance for the optimization: if the updates are
        smaller than ``tol``, the optimization code checks the
        dual gap for optimality and continues until it is smaller
        than ``tol``.
    warm_start : bool, optional
        When set to ``True``, reuse the solution of the previous call to fit
        as initialization; otherwise, just erase the previous solution.
    ot_threshold : float, optional (default 0.)
        OT barycenters are computed on the support of coefs > ot_threshold.
        1e-7 is recommended to speed up Sinkhorn with high dimensional data.

    Attributes
    ----------
    coef_ : array, shape (n_features, n_tasks)
        Parameter vector (W in the cost function formula), equal to
        coef1_ - coef2_.
    intercept_ : array, shape (n_tasks,)
        Independent term in decision function.
    barycenter_ : array, shape (n_features,)
        Signed Wasserstein barycenter of the coefficients,
        barycenter1_ - barycenter2_.
    n_iter_ : int
        Number of iterations run by the coordinate descent solver to reach
        the specified tolerance.

    Examples
    --------
    >>> from mutar import MTW
    >>> import numpy as np
    >>> X = np.array([[[3, 1], [2, 0], [1, 0]],\
                      [[0, 2], [-1, 3], [1, -2]]], dtype=float)
    >>> coef = np.array([[1., 1.], [0., -1]])
    >>> y = np.array([x.dot(c) for x, c in zip(X, coef.T)])
    >>> y += 0.1
    >>> mtw = MTW(alpha=1., beta=1.).fit(X, y)
    >>> print(mtw.coef_)
    [[ 0.28727587  0.49165381]
     [ 0.04883213 -0.93324644]]
    >>> print(mtw.barycenter_)
    [ 0.10630429 -0.13313143]
    """

    def __init__(self, alpha=0.1, beta=0.1, M=None, epsilon=None, gamma=None,
                 fit_intercept=True, normalize=False, concomitant=False,
                 max_iter=2000, tol=1e-4, warm_start=False, stable=False,
                 max_iter_ot=20, max_iter_cd=1000, tol_ot=1e-5, tol_cd=1e-5,
                 n_jobs=1, gpu=False, positive=False, ot_threshold=0.):
        super().__init__(alpha=alpha, beta=beta, M=M, epsilon=epsilon,
                         gamma=gamma, concomitant=concomitant, stable=stable,
                         max_iter_ot=max_iter_ot, max_iter_cd=max_iter_cd,
                         tol_ot=tol_ot, tol_cd=tol_cd, n_jobs=n_jobs,
                         fit_intercept=fit_intercept, normalize=normalize,
                         gpu=gpu, positive=positive, max_iter=max_iter,
                         tol=tol, warm_start=warm_start,
                         ot_threshold=ot_threshold)
        # a single reweighting pass keeps the plain L1 penalty of MTW
        self.max_iter_reweighting = 1