"""
Mutar estimators module.
"""
import numpy as np
import warnings
from sklearn.base import BaseEstimator, RegressorMixin
from sklearn.utils.validation import check_X_y, check_array, check_is_fitted
from sklearn.metrics import r2_score
from sklearn.exceptions import ConvergenceWarning
from .solvers import solver_dirty, solver_lasso, solver_mll, solver_mtw
from . import utils
class MultitaskRegression(BaseEstimator, RegressorMixin):
""" Multitask regression abstract class.
Parameters
----------
fit_intercept : boolean
whether to calculate the intercept for this model. If set
to false, no intercept will be used in calculations
(e.g. data is expected to be already centered).
normalize : boolean
This parameter is ignored when fit_intercept is set to False. If True,
the regressors X will be normalized before regression by subtracting
the mean and dividing by the l2-norm.
max_iter : int, optional
The maximum number of iterations
tol : float, optional
The tolerance for the optimization: if the updates are
smaller than ``tol``, the optimization code checks the
dual gap for optimality and continues until it is smaller
than ``tol``.
positive : boolean, optional (default False)
If True, coefficients are constrained to be non-negative.
warm_start : bool, optional
When set to ``True``, reuse the solution of the previous call to fit as
initialization, otherwise, just erase the previous solution.
Attributes
----------
coef_ : array, shape (n_features, n_tasks)
Parameter vector (W in the cost function formula).
intercept_ : array, shape (n_tasks,)
independent term in decision function.
n_iter_ : int
number of iterations run by the coordinate descent solver to reach
the specified tolerance.
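
    Notes
    -----
    This class is abstract; use a concrete estimator such as ``DirtyModel``
    or ``GroupLasso``. All estimators expect ``X`` of shape
    (n_tasks, n_samples, n_features) and ``y`` of shape (n_tasks, n_samples).
    A minimal shape sketch (illustrative data only):

    >>> import numpy as np
    >>> X = np.zeros((2, 3, 4))  # 2 tasks, 3 samples, 4 features
    >>> y = np.zeros((2, 3))
    >>> X.shape[0] == y.shape[0]  # one design matrix and target per task
    True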
"""
def __init__(self, fit_intercept=True, normalize=False, positive=False,
max_iter=2000, tol=1e-4, warm_start=False):
self.fit_intercept = fit_intercept
self.normalize = normalize
self.max_iter = max_iter
self.tol = tol
self.warm_start = warm_start
self.is_fitted_ = False
self.positive = positive
def _pre_fit(self, X, y):
""" Normalize and scale the data."""
n_tasks = len(X)
n_samples, n_features = X[0].shape
X, y = check_X_y(X, y, accept_sparse=False, allow_nd=True,
multi_output=True)
if y.shape != (n_tasks, n_samples):
raise ValueError("Data shape not understood. X must be "
"(n_tasks, n_samples, n_features) and y must be "
"(n_tasks, n_samples)")
X_scale = np.ones((n_tasks, n_features))
if self.fit_intercept:
X = X.copy()
y = y.copy()
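            # center each task's design matrix and targets with per-task means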
X_offset = X.mean(axis=1)
y_offset = y.mean(axis=1)
X -= X_offset[:, None, :]
y -= y_offset[:, None]
if self.normalize:
X_scale = np.linalg.norm(X, axis=1)
X /= X_scale[:, None, :]
else:
X_offset = np.zeros((n_tasks, 1, n_features))
y_offset = np.zeros((n_tasks, 1))
return X, y, X_offset, y_offset, X_scale
def _fit(self, X, y):
"""Generic fit method to be specified for each model."""
        raise NotImplementedError
def _post_fit(self, X_offset, y_offset, X_scale):
"""Set intercept and scale after fit."""
assert self.is_fitted_
if self.fit_intercept:
if self.normalize:
self.coef_ /= X_scale.T
self.intercept_ = y_offset
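            # per task k: intercept_k = mean(y_k) - mean(X_k) @ coef_[:, k]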
self.intercept_ -= np.einsum("kj,jk->k", X_offset, self.coef_)
else:
self.intercept_ = np.zeros_like(y_offset).flatten()
def fit(self, X, y):
X, y, X_offset, y_offset, X_scale = self._pre_fit(X, y)
self._fit(X, y)
self.is_fitted_ = True
self._post_fit(X_offset, y_offset, X_scale)
return self
def predict(self, X):
""" Predict target given unseen data samples.
Parameters
----------
X : {array-like}, shape (n_tasks, n_samples, n_features)
            The input samples.
Returns
-------
y : ndarray, shape (n_tasks, n_samples)
Returns the predicted targets.
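
        Notes
        -----
        For each task ``k``, the prediction is computed as
        ``X[k].dot(coef_[:, k]) + intercept_[k]``.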
"""
X = check_array(X, accept_sparse=False, allow_nd=True)
check_is_fitted(self, 'is_fitted_')
y_pred = [x.dot(c) for x, c in zip(X, self.coef_.T)]
y_pred = np.array(y_pred) + self.intercept_[:, None]
return y_pred
def score(self, X, y, sample_weight=None):
""" Returns the coefficient of determination R^2 of the prediction.
Computes a score for each regression task.
The coefficient R^2 is defined as (1 - u/v), where u is the residual
sum of squares ((y_true - y_pred) ** 2).sum() and v is the total sum
of squares ((y_true - y_true.mean()) ** 2).sum(). The best possible
score is 1.0 and it can be negative (because the model can be
arbitrarily worse). A constant model that always predicts the expected
value of y, disregarding the input features, would get a R^2 score of
0.0.
        Parameters
        ----------
        X : array-like, shape (n_tasks, n_samples, n_features)
            Test samples.
        y : array-like, shape (n_tasks, n_samples)
            True values for y.
        sample_weight : array-like, shape (n_tasks, n_samples), optional
            Sample weights.
        Returns
        -------
        scores : array-like, shape (n_tasks,)
            R^2 of self.predict(X) wrt. y for each task.
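
        Examples
        --------
        Per-task scores follow ``sklearn.metrics.r2_score``; a minimal,
        estimator-independent illustration of the two reference points
        (perfect and constant predictions):

        >>> from sklearn.metrics import r2_score
        >>> r2_score([1., 2., 3.], [1., 2., 3.])
        1.0
        >>> r2_score([1., 2., 3.], [2., 2., 2.])  # a constant prediction
        0.0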
"""
y_pred = self.predict(X)
if sample_weight is None:
scores = [r2_score(y_i, y_pred_i, multioutput='variance_weighted')
for y_i, y_pred_i in zip(y, y_pred)]
else:
scores = [r2_score(y_i, y_pred_i, sample_weight=w_i,
multioutput='variance_weighted')
for y_i, y_pred_i, w_i in zip(y, y_pred, sample_weight)]
return np.array(scores)
class DirtyModel(MultitaskRegression):
""" DirtyModel estimator with L1 and L1/L2 mixed-norm as regularizers.
The optimization objective for Dirty models is::
(1 / (2 * n_samples)) * ||Y - X(W_1 + W_2)||^2_Fro + alpha * ||W_1||_21
+ beta * ||W_2||_1
Where::
||W||_21 = sum_i sqrt{sum_j w_ij^2}
i.e. the sum of norm of each row.
and::
||W||_1 = sum_i sum_j |w_ij|
Parameters
----------
alpha : float, optional
Constant that multiplies the L1/L2 term. Defaults to 1.0
beta : float, optional
Constant that multiplies the L1 term. Defaults to 1.0
fit_intercept : boolean
whether to calculate the intercept for this model. If set
to false, no intercept will be used in calculations
(e.g. data is expected to be already centered).
normalize : boolean
This parameter is ignored when fit_intercept is set to False. If True,
the regressors X will be normalized before regression by subtracting
the mean and dividing by the l2-norm.
max_iter : int, optional
The maximum number of iterations
tol : float, optional
The tolerance for the optimization: if the updates are
smaller than ``tol``, the optimization code checks the
dual gap for optimality and continues until it is smaller
than ``tol``.
positive : boolean, optional (default False)
If True, coefficients are constrained to be non-negative.
warm_start : bool, optional
When set to ``True``, reuse the solution of the previous call to fit as
initialization, otherwise, just erase the previous solution.
Attributes
----------
coef_ : array, shape (n_features, n_tasks)
Parameter vector (W in the cost function formula).
intercept_ : array, shape (n_tasks,)
independent term in decision function.
n_iter_ : int
number of iterations run by the coordinate descent solver to reach
the specified tolerance.
Examples
--------
>>> from mutar import DirtyModel
>>> import numpy as np
>>> X = np.array([[[3, 1], [2, 0], [1, 0]],\
[[0, 2], [-1, 3], [1, -2]]], dtype=float)
>>> coef = np.array([[1., 1.], [0., -1]])
>>> y = np.array([x.dot(c) for x, c in zip(X, coef.T)])
>>> y += 0.1
>>> dirty = DirtyModel(alpha=0.15, beta=0.12).fit(X, y)
>>> print(dirty.coef_shared_)
[[ 0.4652447 0.3465437]
[ 0. -0. ]]
>>> print(dirty.coef_specific_)
[[ 0.35453532 0. ]
[ 0. -1.20766296]]
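
    By construction, ``coef_`` is the sum of the shared and specific parts:

    >>> np.allclose(dirty.coef_, dirty.coef_shared_ + dirty.coef_specific_)
    True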
"""
    def __init__(self, alpha=1., beta=1., fit_intercept=True, normalize=False,
max_iter=2000, tol=1e-4, positive=False, warm_start=False):
super().__init__(fit_intercept=fit_intercept, normalize=normalize,
max_iter=max_iter, tol=tol, positive=positive,
warm_start=warm_start)
self.alpha = alpha
self.beta = beta
def _fit(self, X, y):
n_tasks = len(X)
n_samples, n_features = X[0].shape
if self.alpha == 0. and self.beta == 0.:
warnings.warn("With alpha=beta=0, this algorithm does not converge"
" well. You are advised to use LinearRegression "
"estimator", stacklevel=2)
        if not self.warm_start or not hasattr(self, "coef_"):
            coef_shared_ = np.zeros((n_features, n_tasks), dtype=X.dtype,
                                    order='F')
            coef_specific_ = np.zeros((n_features, n_tasks), dtype=X.dtype,
                                      order='F')
        else:
            # warm start: reuse the previous solution as initialization
            coef_shared_ = self.coef_shared_
            coef_specific_ = self.coef_specific_
coef_shared_, coef_specific_, R, n_iter = \
solver_dirty(X, y, coef_shared_, coef_specific_,
self.alpha, self.beta, self.max_iter, self.tol,
self.positive)
self.coef_ = coef_shared_ + coef_specific_
self.coef_shared_ = coef_shared_
self.coef_specific_ = coef_specific_
self.residuals_ = R
self.n_iter_ = n_iter
class GroupLasso(DirtyModel):
""" GroupLasso estimator with L1/L2 mixed-norm as regularizer.
The optimization objective for Group Lasso is::
(1 / (2 * n_samples)) * ||Y - XW||^2_Fro + alpha * ||W||_21
Where::
||W||_21 = sum_i sqrt{sum_j w_ij^2}
i.e. the sum of norm of each row.
Parameters
----------
alpha : float, optional
Constant that multiplies the L1/L2 term. Defaults to 1.
fit_intercept : boolean
whether to calculate the intercept for this model. If set
to false, no intercept will be used in calculations
(e.g. data is expected to be already centered).
normalize : boolean
This parameter is ignored when fit_intercept is set to False. If True,
the regressors X will be normalized before regression by subtracting
the mean and dividing by the l2-norm.
max_iter : int, optional
The maximum number of iterations
tol : float, optional
The tolerance for the optimization: if the updates are
smaller than ``tol``, the optimization code checks the
dual gap for optimality and continues until it is smaller
than ``tol``.
positive : boolean, optional (default False)
If True, coefficients are constrained to be non-negative.
warm_start : bool, optional
When set to ``True``, reuse the solution of the previous call to fit as
initialization, otherwise, just erase the previous solution.
Attributes
----------
coef_ : array, shape (n_features, n_tasks)
Parameter vector (W in the cost function formula).
intercept_ : array, shape (n_tasks,)
independent term in decision function.
n_iter_ : int
number of iterations run by the coordinate descent solver to reach
the specified tolerance.
Examples
--------
>>> from mutar import GroupLasso
>>> import numpy as np
>>> X = np.array([[[3, 1], [1, 0], [1, 0]],\
[[0, 2], [2, 3], [2, 3]]], dtype=float)
>>> y = X.sum(axis=2) + 2
>>> grouplasso = GroupLasso().fit(X, y)
>>> print(grouplasso.coef_shared_)
[[1.42045049 1.42045049]
[0. 0. ]]
>>> print(grouplasso.coef_specific_)
[[0. 0.]
[0. 0.]]
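
    The fitted coefficients hold one column per task:

    >>> grouplasso.coef_.shape
    (2, 2)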
"""
    def __init__(self, alpha=0.1, fit_intercept=True, normalize=False,
max_iter=2000, tol=1e-4, positive=False, warm_start=False):
# if beta > alpha, Dirty models are equivalent to a Group Lasso
beta = 10 * alpha
super().__init__(alpha=alpha, beta=beta, fit_intercept=fit_intercept,
normalize=normalize, max_iter=max_iter, tol=tol,
positive=positive, warm_start=warm_start)
class IndRewLasso(MultitaskRegression):
""" Independent Reweighted Lasso estimator with L1 regularizer.
The optimization objective for IndRewLasso is::
(1 / (2 * n_samples)) * ||Y - XW||^2_Fro + alpha * ||W||_0.5
Where::
||W||_0.5 = sum_i sum_j sqrt|w_ij|
Parameters
----------
    alpha : float or array-like, shape (n_tasks,), optional
        Constant that multiplies the L0.5 term for each task.
        Defaults to 1.0.
fit_intercept : boolean
whether to calculate the intercept for this model. If set
to false, no intercept will be used in calculations
(e.g. data is expected to be already centered).
normalize : boolean
This parameter is ignored when fit_intercept is set to False. If True,
the regressors X will be normalized before regression by subtracting
the mean and dividing by the l2-norm.
max_iter : int, optional
The maximum number of inner loop iterations
max_iter_reweighting : int, optional
        Maximum number of reweighting steps, i.e. outer loop iterations.
tol : float, optional
The tolerance for the optimization: if the updates are
smaller than ``tol``, the optimization code checks the
dual gap for optimality and continues until it is smaller
than ``tol``.
positive : boolean, optional (default False)
If True, coefficients are constrained to be non-negative.
warm_start : bool, optional
When set to ``True``, reuse the solution of the previous call to fit as
initialization, otherwise, just erase the previous solution.
Attributes
----------
coef_ : array, shape (n_features, n_tasks)
Parameter vector (W in the cost function formula).
intercept_ : array, shape (n_tasks,)
independent term in decision function.
n_iter_ : int
number of iterations run by the coordinate descent solver to reach
the specified tolerance.
Examples
--------
>>> from mutar import IndRewLasso
>>> import numpy as np
>>> X = np.array([[[3, 1], [2, 0], [1, 0]],\
[[0, 2], [-1, 3], [1, -2]]], dtype=float)
>>> coef = np.array([[1., 1.], [0., -1]])
>>> y = np.array([x.dot(c) for x, c in zip(X, coef.T)])
>>> y += 0.1
>>> alpha = [0.1, 0.2]
>>> relasso = IndRewLasso(alpha=alpha).fit(X, y)
>>> print(relasso.coef_)
[[ 0.92188134 0. ]
[ 0. -1.33862186]]
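
    Each task can be given its own regularization strength:

    >>> len(alpha) == X.shape[0]
    True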
"""
    def __init__(self, alpha=1., fit_intercept=True, normalize=False,
max_iter=2000, max_iter_reweighting=100, tol=1e-4,
positive=False, warm_start=False):
super().__init__(fit_intercept=fit_intercept, normalize=normalize,
max_iter=max_iter, tol=tol, positive=positive,
warm_start=warm_start)
self.alpha = np.asarray(alpha)
self.max_iter_reweighting = max_iter_reweighting
def _fit(self, X, y):
n_tasks = len(X)
n_samples, n_features = X[0].shape
if not self.warm_start or not hasattr(self, "coef_"):
self.coef_ = np.zeros((n_features, n_tasks))
weights = np.ones_like(self.coef_)
coef_old = self.coef_.copy()
self.loss_ = []
for i in range(self.max_iter_reweighting):
Xw = X * weights.T[:, None, :]
coef_ = solver_lasso(Xw, y, self.alpha, self.max_iter, self.tol,
positive=self.positive)
coef_ = coef_ * weights
err = abs(coef_ - coef_old).max()
err /= max(abs(coef_).max(), abs(coef_old).max(), 1.)
coef_old = coef_.copy()
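            # majorization-minimization step: scaling X by w = 2 * sqrt(|coef|)
            # penalizes |coef| / (2 * sqrt(|coef_old|)), which majorizes
            # sqrt(|coef|) around the current estimate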
weights = 2 * (abs(coef_) ** 0.5 + 1e-10)
obj = 0.5 * (utils.residual(X, coef_, y) ** 2).sum() / n_samples
obj += (self.alpha[None, :] * abs(coef_) ** 0.5).sum()
self.loss_.append(obj)
if err < self.tol and i:
break
if i == self.max_iter_reweighting - 1 and i:
warnings.warn('Reweighted objective did not converge.' +
' You might want to increase ' +
'the number of iterations of reweighting.' +
' Fitting data with very small alpha' +
' may cause precision problems.',
ConvergenceWarning)
self.coef_ = coef_
class IndLasso(IndRewLasso):
""" Independent Lasso estimator with L1 regularizer.
The optimization objective for IndLasso is::
(1 / (2 * n_samples)) * ||Y - XW||^2_Fro + alpha * ||W||_1
Where::
        ||W||_1 = sum_i sum_j |w_ij|
Parameters
----------
    alpha : float or array-like, shape (n_tasks,), optional
        Constant that multiplies the L1 term for each task.
        Defaults to 1.0.
fit_intercept : boolean
whether to calculate the intercept for this model. If set
to false, no intercept will be used in calculations
(e.g. data is expected to be already centered).
normalize : boolean
This parameter is ignored when fit_intercept is set to False. If True,
the regressors X will be normalized before regression by subtracting
the mean and dividing by the l2-norm.
    max_iter : int, optional
        The maximum number of iterations.
positive : boolean, optional (default False)
If True, coefficients are constrained to be non-negative.
tol : float, optional
The tolerance for the optimization: if the updates are
smaller than ``tol``, the optimization code checks the
dual gap for optimality and continues until it is smaller
than ``tol``.
warm_start : bool, optional
When set to ``True``, reuse the solution of the previous call to fit as
initialization, otherwise, just erase the previous solution.
Attributes
----------
coef_ : array, shape (n_features, n_tasks)
Parameter vector (W in the cost function formula).
intercept_ : array, shape (n_tasks,)
independent term in decision function.
n_iter_ : int
number of iterations run by the coordinate descent solver to reach
the specified tolerance.
Examples
--------
>>> from mutar import IndLasso
>>> import numpy as np
>>> X = np.array([[[3, 1], [2, 0], [1, 0]],\
[[0, 2], [-1, 3], [1, -2]]], dtype=float)
>>> coef = np.array([[1., 1.], [0., -1]])
>>> y = np.array([x.dot(c) for x, c in zip(X, coef.T)])
>>> y += 0.1
>>> alpha = [0.1, 0.2]
>>> relasso = IndLasso(alpha=alpha).fit(X, y)
>>> print(relasso.coef_)
[[ 0.85 0. ]
[ 0. -1.31428571]]
"""
    def __init__(self, alpha=1., fit_intercept=True, normalize=False,
max_iter=2000, tol=1e-4, positive=False, warm_start=False):
super().__init__(alpha=alpha, fit_intercept=fit_intercept,
normalize=normalize, max_iter=max_iter,
positive=positive, tol=tol, warm_start=warm_start)
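        # a single outer iteration reduces the reweighted scheme to a plain
        # (non-reweighted) lasso fit per task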
self.max_iter_reweighting = 1
class MultiLevelLasso(MultitaskRegression):
""" MultiLevelLasso estimator with a non-convex product decomposition.
The optimization objective for the Multilevel Lasso is::
        (1 / (2 * n_samples)) * ||Y - XW||^2_Fro + alpha * ||W||_{1 1/2}
    Where:
    .. math::
        \\|W\\|_{1 \\frac{1}{2}} = \\sum_j \\sqrt{\\|W_j\\|_1}
    Which is equivalent to::
        (1 / (2 * n_samples)) * ||Y - X(C[:, None] * S)||^2_Fro
        + beta * ||C||_1 + gamma * ||S||_1
    Where:
    .. math::
        C \\in \\mathbb{R}^{n\\_features}, \\quad
        S \\in \\mathbb{R}^{n\\_features \\times n\\_tasks}, \\quad
        \\alpha = 2 \\sqrt{\\beta \\gamma}
Parameters
----------
alpha : float, optional
Constant that multiplies the common L{1 0.5} term. Defaults to 1.0
fit_intercept : boolean
whether to calculate the intercept for this model. If set
to false, no intercept will be used in calculations
(e.g. data is expected to be already centered).
normalize : boolean
This parameter is ignored when fit_intercept is set to False. If True,
the regressors X will be normalized before regression by subtracting
the mean and dividing by the l2-norm.
max_iter : int, optional
The maximum number of iterations
tol : float, optional
The tolerance for the optimization: if the updates are
smaller than ``tol``, the optimization code checks the
dual gap for optimality and continues until it is smaller
than ``tol``.
positive : boolean, optional (default False)
If True, coefficients are constrained to be non-negative.
warm_start : bool, optional
When set to ``True``, reuse the solution of the previous call to fit as
initialization, otherwise, just erase the previous solution.
Attributes
----------
coef_ : array, shape (n_features, n_tasks)
Parameter vector (W in the cost function formula).
intercept_ : array, shape (n_tasks,)
independent term in decision function.
n_iter_ : int
number of iterations run by the coordinate descent solver to reach
the specified tolerance.
Examples
--------
>>> from mutar import MultiLevelLasso
>>> import numpy as np
>>> X = np.array([[[3, 1], [2, 0], [1, 0]],\
[[0, 2], [-1, 3], [1, -2]]], dtype=float)
>>> coef = np.array([[1., 1.], [0., -1]])
>>> y = np.array([x.dot(c) for x, c in zip(X, coef.T)])
>>> y += 0.1
>>> mll = MultiLevelLasso(alpha=0.1).fit(X, y)
>>> print(mll.coef_shared_)
[0.91502387 1.04852402]
>>> print(mll.coef_specific_)
[[ 0.91339402 0. ]
[ 0. -1.27834952]]
>>> print(mll.coef_)
[[ 0.83577734 0. ]
[ 0. -1.34038017]]
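
    ``coef_`` factorizes as the product of the shared and specific parts:

    >>> np.allclose(mll.coef_, mll.coef_shared_[:, None] * mll.coef_specific_)
    True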
"""
    def __init__(self, alpha=1., fit_intercept=True, normalize=False,
max_iter=2000, tol=1e-4, positive=False, warm_start=False):
super().__init__(fit_intercept=fit_intercept, normalize=normalize,
max_iter=max_iter, tol=tol, positive=positive,
warm_start=warm_start)
self.alpha = alpha
def _fit(self, X, y):
n_tasks = len(X)
n_samples, n_features = X[0].shape
if self.alpha == 0.:
warnings.warn("With alpha=0, this algorithm does not converge"
" well. You are advised to use LinearRegression "
"estimator", stacklevel=2)
        if not self.warm_start or not hasattr(self, "coef_"):
            coef_shared_ = np.ones(n_features)
            coef_specific_ = np.zeros((n_features, n_tasks))
        else:
            # warm start: reuse the previous solution as initialization
            coef_shared_ = self.coef_shared_
            coef_specific_ = self.coef_specific_
coef_shared_, coef_specific_, n_iter = \
solver_mll(X, y, C=coef_shared_, S=coef_specific_,
alpha=self.alpha, max_iter=self.max_iter, tol=self.tol,
positive=self.positive)
self.coef_ = coef_shared_[:, None] * coef_specific_
self.coef_shared_ = coef_shared_
self.coef_specific_ = coef_specific_
self.n_iter_ = n_iter
class ReMTW(MultitaskRegression):
"""A class for reweighted Multitask Wasserstein regularization.
The optimization objective for Reweighted-MTW is::
(1 / (2 * n_samples)) * ||Y - X(W+ - W-)||^2_Fro +
alpha(sum_k OT(W+_k, Wb+) + sum_k OT(W-_k, Wb-)) +
beta * (||W+||_{0.5} + ||W-||_{0.5})
Where::
OT is the Unbalanced Wasserstein distance with Kullback-Leibler
marginal relaxation.
and::
||W||_{0.5} = sum_i sum_j sqrt|w_ij|
    If `concomitant` is set to `True`, ReMTW also infers the standard
    deviation of each task, which allows `beta` to be scaled adaptively
    for each task according to its noise level.
The optimization objective for Concomitant Reweighted-MTW is::
(1 / (2 * n_samples)) * sum||Y_k - X_k(W+k - W-k)||^2 / sigma_k +
alpha(sum_k OT(W+_k, Wb+) + sum_k OT(W-_k, Wb-)) +
beta * (||W+||_{0.5} + ||W-||_{0.5}) + sum sigma_k / 2
Parameters
----------
alpha : float, optional
Constant that multiplies the OT term. Defaults to 0.1
beta : float, optional
Constant that multiplies the L0.5 term. Defaults to 0.1
M : array, shape (n_features, n_features)
Ground metric matrix defining the Wasserstein distance.
epsilon : float > 0, optional
OT parameter. Weight of the entropy regularization.
gamma : float > 0, optional
OT parameter. Weight of the Kullback-Leibler marginal relaxation.
fit_intercept : boolean
whether to calculate the intercept for this model. If set
to false, no intercept will be used in calculations
(e.g. data is expected to be already centered).
normalize : boolean
This parameter is ignored when fit_intercept is set to False. If True,
the regressors X will be normalized before regression by subtracting
the mean and dividing by the l2-norm.
    concomitant : boolean, optional (default False)
        If True, the concomitant version of MTW is used where the l1
        penalty is adaptively scaled to the noise std estimation.
    stable : boolean, optional (default False)
        If True, use log-domain Sinkhorn stabilization from the first iter.
        If False, the solver automatically switches to log-domain if
        numerical errors are encountered.
    max_iter_reweighting : int, optional
        Maximum number of reweighting steps, i.e. outer loop iterations.
    tol_reweighting : float, optional (default 0.001)
        Relative maximum change of the coefficients between two
        reweighting steps.
    max_iter_ot : int, optional (default 20)
        Maximum number of Sinkhorn iterations.
    max_iter_cd : int, optional (default 1000)
        Maximum number of coordinate descent iterations.
    tol_ot : float, optional (default 1e-5)
        Relative maximum change of the Wasserstein barycenter.
    tol_cd : float, optional (default 1e-5)
        Relative maximum change of the coefficients in coordinate descent.
    n_jobs : int, optional (default 1)
        Number of threads used in coordinate descent.
    gpu : boolean, optional (default False)
        If True, Sinkhorn iterations are performed on GPU using cupy.
    positive : boolean, optional (default False)
        If True, coefficients are constrained to be non-negative.
max_iter : int, optional
The maximum number of iterations
tol : float, optional
The tolerance for the optimization: if the updates are
smaller than ``tol``, the optimization code checks the
dual gap for optimality and continues until it is smaller
than ``tol``.
ot_threshold : float, optional (default 0.)
OT barycenters are computed on the support of coefs > ot_threshold.
1e-7 recommended to speed up Sinkhorn with high dimensional data.
warm_start : bool, optional
When set to ``True``, reuse the solution of the previous call to fit as
initialization, otherwise, just erase the previous solution.
Attributes
----------
coef_ : array, shape (n_features, n_tasks)
Parameter vector (W in the cost function formula).
intercept_ : array, shape (n_tasks,)
independent term in decision function.
n_iter_ : int
number of iterations run by the coordinate descent solver to reach
the specified tolerance.
    Examples
    --------
>>> from mutar import ReMTW
>>> import numpy as np
>>> X = np.array([[[3, 1], [2, 0], [1, 0]],\
[[0, 2], [-1, 3], [1, -2]]], dtype=float)
>>> coef = np.array([[0., 1.], [0., 2.]])
>>> y = np.array([x.dot(c) for x, c in zip(X, coef.T)])
>>> y += 0.1
>>> remtw = ReMTW(alpha=0.2, beta=0.1).fit(X, y)
>>> print(remtw.coef_)
[[-0.01064798 -0.12868387]
[ 0.10112546 1.55361523]]
>>> print(remtw.barycenter_)
[0.13216306 0.34976005]
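
    ``coef_`` is the difference of the two non-negative parts W+ and W-:

    >>> np.allclose(remtw.coef_, remtw.coef1_ - remtw.coef2_)
    True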
"""
    def __init__(self, alpha=0.1, beta=0.1, M=None, epsilon=None,
gamma=None, fit_intercept=True, normalize=False,
concomitant=False, max_iter=2000, tol=1e-4,
warm_start=False, max_iter_reweighting=100,
stable=False, max_iter_ot=20, max_iter_cd=1000, tol_ot=1e-5,
tol_cd=1e-5, n_jobs=1, gpu=False, positive=False,
ot_threshold=0., tol_reweighting=0.001):
super().__init__(fit_intercept=fit_intercept, normalize=normalize,
max_iter=max_iter, tol=tol, positive=positive,
warm_start=warm_start)
        self.n_jobs = n_jobs
        self.M = M
        self.alpha = alpha
        self.epsilon = epsilon
        self.gamma = gamma
        self.beta = beta
        self.stable = stable
        self.max_iter_ot = max_iter_ot
        self.max_iter_cd = max_iter_cd
        self.tol_ot = tol_ot
        self.tol_cd = tol_cd
self.tol_reweighting = tol_reweighting
self.concomitant = concomitant
self.max_iter_reweighting = max_iter_reweighting
self.gpu = gpu
self.ot_threshold = ot_threshold
self.b1_ = None
self.b2_ = None
self.coef1_ = None
self.coef2_ = None
def _set_ot_params(self, n_features):
"""Set OT hyperparams if not specified."""
if self.M is None:
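            # default ground metric: pairwise feature distances (p=2),
            # normalized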
self.M = utils.groundmetric(n_features, p=2, normed=True)
if self.epsilon is None:
self.epsilon = 5. / n_features
if self.gamma is None:
self.gamma = utils.compute_gamma(0.8, self.M)
def _fit(self, X, y):
if self.alpha == 0. and self.beta == 0.:
warnings.warn("With alpha=beta0, this algorithm does not converge"
" well. You are advised to use LinearRegression "
"estimator", stacklevel=2)
n_tasks, n_samples, n_features = X.shape
Xf = [np.asfortranarray(X[k]) for k in range(n_tasks)]
mXf = [- np.asfortranarray(X[k]) for k in range(n_tasks)]
Ls = (X ** 2).sum(axis=1)
Ls[Ls == 0.] = Ls[Ls != 0].min()
self._set_ot_params(n_features)
self.loss_ = []
weights1 = np.ones((n_features, n_tasks))
weights2 = weights1.copy()
if not self.warm_start or not hasattr(self, "coef_"):
self.coef1_ = np.ones((n_features, n_tasks)) / n_features
self.coef2_ = np.ones((n_features, n_tasks)) / n_features
if self.alpha == 0.:
self.coef1_ *= 0.
self.coef2_ *= 0.
self.b1_ = None
self.b2_ = None
self.residuals_ = y.copy()
self.sigma_ = np.ones(n_tasks)
if self.positive:
self.coef2_ *= 0.
coef_old = np.zeros((n_features, n_tasks))
beta1 = self.beta * weights1
beta2 = self.beta * weights2
for i in range(self.max_iter_reweighting):
coef1, coef2, bar1, bar2, log, sigma, b1, b2, R = \
solver_mtw(Xf, mXf, Ls, y, M=self.M, alpha=self.alpha,
beta1=beta1,
beta2=beta2,
epsilon=self.epsilon, gamma=self.gamma,
coef1=self.coef1_, coef2=self.coef2_,
R=self.residuals_,
b1=self.b1_, b2=self.b2_, sigmas=self.sigma_,
concomitant=self.concomitant,
stable=self.stable, tol=self.tol,
max_iter=self.max_iter, tol_ot=self.tol_ot,
max_iter_ot=self.max_iter_ot,
positive=self.positive,
n_jobs=self.n_jobs, tol_cd=self.tol_cd,
gpu=self.gpu, ot_threshold=self.ot_threshold)
coef_ = coef1 - coef2
self.coef1_ = coef1
self.coef2_ = coef2
self.coef_ = coef_
self.sigma_ = sigma
self.b1_ = b1
self.b2_ = b2
self.residuals_ = R
err = abs(coef_ - coef_old).max()
err /= max(abs(coef_).max(), abs(coef_old).max(), 1.)
coef_old = coef_.copy()
obj = self.alpha * (log["fot1"][-1] + log["fot2"][-1])
obj += 0.5 * (R ** 2).sum(axis=1).dot(1 / self.sigma_) / n_samples
obj += self.beta * (coef1 ** 0.5 + coef2 ** 0.5).sum()
self.loss_.append(obj)
if err < self.tol_reweighting and i:
break
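            # update weights with the gradient of the sqrt penalty,
            # 0.5 / sqrt(coef), evaluated at the current estimate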
if self.alpha == 0.:
weights1 = 0.5 / (abs(coef_) ** 0.5 + 1e-10)
weights2 = weights1
else:
weights1 = 0.5 / (coef1 ** 0.5 + 1e-10)
weights2 = 0.5 / (coef2 ** 0.5 + 1e-10)
beta1 = self.beta * weights1
beta2 = self.beta * weights2
if i == self.max_iter_reweighting - 1 and i:
warnings.warn('Reweighted objective did not converge.' +
' You might want to increase ' +
'the number of iterations of reweighting.' +
' Fitting data with very small alpha and beta' +
' may cause precision problems.',
ConvergenceWarning)
self.barycenter1_ = bar1
self.barycenter2_ = bar2
self.barycenter_ = bar1 - bar2
self.log_ = log
return self
def reset(self):
if hasattr(self, 'coef_'):
del self.coef_
self.coef1_ = None
self.coef2_ = None
del self.barycenter2_
del self.barycenter1_
del self.barycenter_
del self.log_
self.residuals_ = None
self.b1_ = None
self.b2_ = None
self.stable = False
class MTW(ReMTW):
"""A class for Multitask Wasserstein regularization.
    The optimization objective for MTW is::
(1 / (2 * n_samples)) * ||Y - X(W+ - W-)||^2_Fro +
alpha(sum_k OT(W+_k, Wb+) + sum_k OT(W-_k, Wb-)) +
beta * (||W+||_1 + ||W-||_1)
Where::
OT is the Unbalanced Wasserstein distance with Kullback-Leibler
marginal relaxation.
and::
||W||_1 = sum_i sum_j |w_ij|
    If `concomitant` is set to `True`, MTW also infers the standard
    deviation of each task, which allows `beta` to be scaled adaptively
    for each task according to its noise level.
The optimization objective for Concomitant MTW is::
(1 / (2 * n_samples)) * sum||Y_k - X_k(W+k - W-k)||^2 / sigma_k +
alpha(sum_k OT(W+_k, Wb+) + sum_k OT(W-_k, Wb-)) +
beta * (||W+||_1 + ||W-||_1) + sum sigma_k / 2
Parameters
----------
alpha : float, optional
Constant that multiplies the OT term. Defaults to 0.1
beta : float, optional
Constant that multiplies the L1 term. Defaults to 0.1
M : array, shape (n_features, n_features)
Ground metric matrix defining the Wasserstein distance.
epsilon : float > 0, optional
OT parameter. Weight of the entropy regularization.
gamma : float > 0, optional
OT parameter. Weight of the Kullback-Leibler marginal relaxation.
fit_intercept : boolean
whether to calculate the intercept for this model. If set
to false, no intercept will be used in calculations
(e.g. data is expected to be already centered).
normalize : boolean
This parameter is ignored when fit_intercept is set to False. If True,
the regressors X will be normalized before regression by subtracting
the mean and dividing by the l2-norm.
    concomitant : boolean, optional (default False)
        If True, the concomitant version of MTW is used where the l1
        penalty is adaptively scaled to the noise std estimation.
    stable : boolean, optional (default False)
        If True, use log-domain Sinkhorn stabilization from the first iter.
        If False, the solver automatically switches to log-domain if
        numerical errors are encountered.
    max_iter_ot : int, optional (default 20)
        Maximum number of Sinkhorn iterations.
    max_iter_cd : int, optional (default 1000)
        Maximum number of coordinate descent iterations.
    tol_ot : float, optional (default 1e-5)
        Relative maximum change of the Wasserstein barycenter.
    tol_cd : float, optional (default 1e-5)
        Relative maximum change of the coefficients in coordinate descent.
    n_jobs : int, optional (default 1)
        Number of threads used in coordinate descent.
    gpu : boolean, optional (default False)
        If True, Sinkhorn iterations are performed on GPU using cupy.
    positive : boolean, optional (default False)
        If True, coefficients are constrained to be non-negative.
max_iter : int, optional
The maximum number of iterations
tol : float, optional
The tolerance for the optimization: if the updates are
smaller than ``tol``, the optimization code checks the
dual gap for optimality and continues until it is smaller
than ``tol``.
warm_start : bool, optional
When set to ``True``, reuse the solution of the previous call to fit as
initialization, otherwise, just erase the previous solution.
ot_threshold : float, optional (default 0.)
OT barycenters are computed on the support of coefs > ot_threshold.
1e-7 recommended to speed up Sinkhorn with high dimensional data.
Attributes
----------
coef_ : array, shape (n_features, n_tasks)
Parameter vector (W in the cost function formula).
intercept_ : array, shape (n_tasks,)
independent term in decision function.
n_iter_ : int
number of iterations run by the coordinate descent solver to reach
the specified tolerance.
    Examples
    --------
>>> from mutar import MTW
>>> import numpy as np
>>> X = np.array([[[3, 1], [2, 0], [1, 0]],\
[[0, 2], [-1, 3], [1, -2]]], dtype=float)
>>> coef = np.array([[1., 1.], [0., -1]])
>>> y = np.array([x.dot(c) for x, c in zip(X, coef.T)])
>>> y += 0.1
>>> mtw = MTW(alpha=1., beta=1.).fit(X, y)
>>> print(mtw.coef_)
[[ 0.28727587 0.49165381]
[ 0.04883213 -0.93324644]]
>>> print(mtw.barycenter_)
[ 0.10630429 -0.13313143]
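
    The barycenter likewise splits into positive and negative parts:

    >>> np.allclose(mtw.barycenter_, mtw.barycenter1_ - mtw.barycenter2_)
    True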
"""
    def __init__(self, alpha=0.1, beta=0.1, M=None, epsilon=None,
gamma=None, fit_intercept=True, normalize=False,
concomitant=False, max_iter=2000, tol=1e-4,
warm_start=False, stable=False, max_iter_ot=20,
max_iter_cd=1000, tol_ot=1e-5, tol_cd=1e-5, n_jobs=1,
gpu=False, positive=False, ot_threshold=0.):
super().__init__(alpha=alpha, beta=beta, M=M, epsilon=epsilon,
gamma=gamma, max_iter_ot=max_iter_ot,
max_iter_cd=max_iter_cd, tol_cd=tol_cd, n_jobs=n_jobs,
fit_intercept=fit_intercept, normalize=normalize,
gpu=gpu, positive=positive,
max_iter=max_iter, tol=tol,
warm_start=warm_start, ot_threshold=ot_threshold)
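        # a single reweighting iteration keeps the plain l1 penalty of MTW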
self.max_iter_reweighting = 1