[MRG+1] More robust input validation, more testing. #4136
Changes from all commits: e3e0827, 8ef0b9a, 9f50699, 021bf74
@@ -348,6 +348,9 @@ def enet_path(X, y, l1_ratio=0.5, eps=1e-3, n_alphas=100, alphas=None,
    ElasticNetCV
    """
    X = check_array(X, 'csc', dtype=np.float64, order='F', copy=copy_X)
+   if Xy is not None:
+       Xy = check_array(Xy, 'csc', dtype=np.float64, order='F', copy=False,
+                        ensure_2d=False)
    n_samples, n_features = X.shape

    multi_output = False

Comment (on the added Xy check): Maybe @agramfort wants to have a look at this. This comes up in ElasticNetCV. In c2f0b31 I enforced it there, but there was a comment that it shouldn't be enforced there. Not sure about that.
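For context, the new check only fires when a caller actually passes a precomputed X^T y through the optional Xy argument of enet_path. A minimal sketch of such a call (my own example, with made-up data; the return value is left unpacked since the path's return signature is not shown in this diff):

    import numpy as np
    from sklearn.linear_model import enet_path

    rng = np.random.RandomState(0)
    X = rng.randn(20, 5)
    y = rng.randn(20)
    # Precomputed X^T y passed through the optional Xy argument; with this PR
    # it is run through check_array just like X before the path is computed.
    path = enet_path(X, y, Xy=np.dot(X.T, y))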
@@ -389,7 +392,6 @@ def enet_path(X, y, l1_ratio=0.5, eps=1e-3, n_alphas=100, alphas=None,
    if selection not in ['random', 'cyclic']:
        raise ValueError("selection should be either random or cyclic.")
    random = (selection == 'random')
-   models = []

    if not multi_output:
        coefs = np.empty((n_features, n_alphas), dtype=np.float64)
@@ -414,6 +416,7 @@ def enet_path(X, y, l1_ratio=0.5, eps=1e-3, n_alphas=100, alphas=None,
            model = cd_fast.enet_coordinate_descent_multi_task(
                coef_, l1_reg, l2_reg, X, y, max_iter, tol, rng, random)
        elif isinstance(precompute, np.ndarray):
+           precompute = check_array(precompute, 'csc', dtype=np.float64, order='F')
            model = cd_fast.enet_coordinate_descent_gram(
                coef_, l1_reg, l2_reg, precompute, Xy, y, max_iter,
                tol, rng, random, positive)
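The ndarray precompute branch above is reached when a user supplies a Gram matrix themselves. A hedged sketch of how that path might be exercised through the public ElasticNet API (illustrative values; fit_intercept is disabled so the Gram matrix of the raw X stays consistent with the data the solver sees):

    import numpy as np
    from sklearn.linear_model import ElasticNet

    rng = np.random.RandomState(0)
    X = rng.randn(30, 4)
    y = rng.randn(30)
    # A user-supplied Gram matrix X^T X routes the solver through the ndarray
    # `precompute` branch, which the new check_array call now validates.
    gram = np.dot(X.T, X)
    model = ElasticNet(alpha=0.1, precompute=gram, fit_intercept=False).fit(X, y)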
@@ -1418,6 +1421,7 @@ def __init__(self, l1_ratio=0.5, eps=1e-3, n_alphas=100, alphas=None,
        self.random_state = random_state
        self.selection = selection


###############################################################################
# Multi Task ElasticNet and Lasso models (with joint feature selection)
@@ -357,7 +357,7 @@ def __init__(self, n_components=2, metric=True, n_init=4,
    def _pairwise(self):
        return self.kernel == "precomputed"

-   def fit(self, X, init=None, y=None):
+   def fit(self, X, y=None, init=None):
        """
        Computes the position of the points in the embedding space

Comment (on the new signature): Maybe I merged too fast. I am still not 100% sure about this :-/

Comment: I forgot about that one. I don't think it's a big deal though.
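The reason y has to come second, as I understand the convention, is that generic scikit-learn tooling passes y positionally; with the old signature it would have landed in init. A small sketch of that calling pattern (fit_positionally is a hypothetical helper standing in for pipelines and the common tests, not sklearn code):

    import numpy as np
    from sklearn.manifold import MDS

    def fit_positionally(estimator, X, y=None):
        # Generic tooling passes y as the second positional argument, which is
        # why fit must be (X, y=None, ...) rather than (X, init=None, y=None).
        return estimator.fit(X, y)

    X = np.random.RandomState(0).randn(10, 3)
    fit_positionally(MDS(n_components=2, n_init=1), X)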
@@ -374,7 +374,7 @@ def fit(self, X, init=None, y=None):
        self.fit_transform(X, init=init)
        return self

-   def fit_transform(self, X, init=None, y=None):
+   def fit_transform(self, X, y=None, init=None):
        """
        Fit the data from X, and returns the embedded coordinates
@@ -389,6 +389,7 @@ def fit_transform(self, X, init=None, y=None):
            if ndarray, initialize the SMACOF algorithm with this array.

        """
+       X = check_array(X)
        if X.shape[0] == X.shape[1] and self.dissimilarity != "precomputed":
            warnings.warn("The MDS API has changed. ``fit`` now constructs an"
                          " dissimilarity matrix from data. To use a custom "
@@ -215,7 +215,7 @@ def _gradient_descent(objective, p0, it, n_iter, n_iter_without_progress=30,
        update = momentum * update - learning_rate * grad
        p += update

-       if verbose >= 2 and (i+1) % 10 == 0:
+       if verbose >= 2 and (i + 1) % 10 == 0:
            print("[t-SNE] Iteration %d: error = %.7f, gradient norm = %.7f"
                  % (i + 1, error, grad_norm))
@@ -404,7 +404,7 @@ def __init__(self, n_components=2, perplexity=30.0,
        self.verbose = verbose
        self.random_state = random_state

-   def _fit(self, X):
+   def fit(self, X, y=None):
        """Fit the model using X as training data.

        Parameters
@@ -413,7 +413,7 @@ def _fit(self, X):
            If the metric is 'precomputed' X must be a square distance
            matrix. Otherwise it contains a sample per row.
        """
-       X = check_array(X, accept_sparse=['csr', 'csc', 'coo'])
+       X = check_array(X, accept_sparse=['csr', 'csc', 'coo'], dtype=np.float64)
        random_state = check_random_state(self.random_state)

        if self.early_exaggeration < 1.0:
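Forcing dtype=np.float64 here means integer (or other) input is cast before the gradient-descent code sees it. A quick illustration of that check_array behaviour (my own example, not from the diff):

    import numpy as np
    from sklearn.utils import check_array

    X_int = np.arange(6).reshape(3, 2)           # integer input
    X_checked = check_array(X_int, accept_sparse=['csr', 'csc', 'coo'],
                            dtype=np.float64)
    print(X_checked.dtype)                       # -> float64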
@@ -507,7 +507,7 @@ def _tsne(self, P, alpha, n_samples, random_state, X_embedded=None):

        return X_embedded

-   def fit_transform(self, X):
+   def fit_transform(self, X, y=None):
        """Transform X to the embedded space.

        Parameters
@@ -521,5 +521,5 @@ def fit_transform(self, X):
        X_new : array, shape (n_samples, n_components)
            Embedding of the training data in low-dimensional space.
        """
-       self._fit(X)
+       self.fit(X)
        return self.embedding_

Comment (on the self.fit(X) line): This looks strange.

Comment: Never mind, I thought that we were in the fit method.
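With _fit promoted to a public fit, TSNE follows the usual fit / fit_transform pattern. A rough usage sketch of the two entry points (hyperparameters chosen arbitrarily):

    import numpy as np
    from sklearn.manifold import TSNE

    X = np.random.RandomState(0).randn(50, 10)
    # fit_transform returns the low-dimensional embedding directly...
    X_embedded = TSNE(n_components=2, random_state=0).fit_transform(X)
    # ...while fit stores the same result on the estimator as embedding_.
    model = TSNE(n_components=2, random_state=0).fit(X)
    print(model.embedding_.shape)   # (50, 2)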
@@ -16,7 +16,7 @@
from scipy.spatial.distance import cdist

from ..externals.six.moves import xrange
-from ..utils import check_random_state
+from ..utils import check_random_state, check_array
from ..utils.extmath import logsumexp, pinvh, squared_norm
from ..utils.validation import check_is_fitted
from .. import cluster
@@ -148,12 +148,12 @@ class DPGMM(GMM):
    n_iter : int, default 10
        Maximum number of iterations to perform before convergence.

-   params : string, default 'wmc'
+   params : string, default 'wmc'
        Controls which parameters are updated in the training
        process. Can contain any combination of 'w' for weights,
        'm' for means, and 'c' for covars.

-   init_params : string, default 'wmc'
+   init_params : string, default 'wmc'
        Controls which parameters are updated in the initialization
        process. Can contain any combination of 'w' for weights,
        'm' for means, and 'c' for covars. Defaults to 'wmc'.
@@ -250,7 +250,7 @@ def score_samples(self, X):
        """
        check_is_fitted(self, 'gamma_')

-       X = np.asarray(X)
+       X = check_array(X)
        if X.ndim == 1:
            X = X[:, np.newaxis]
        z = np.zeros((X.shape[0], self.n_components))
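The switch from np.asarray to check_array is the substance of this PR: malformed input is rejected early instead of propagating into the estimator. A quick illustration of the difference (my own example; exact error message varies by version):

    import numpy as np
    from sklearn.utils import check_array

    bad = np.array([[1.0, np.nan], [3.0, 4.0]])
    np.asarray(bad)          # accepted silently; the NaN only blows up later
    try:
        check_array(bad)     # rejected up front with an informative ValueError
    except ValueError as exc:
        print(exc)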
@@ -461,7 +461,7 @@ def _logprior(self, z):
    def lower_bound(self, X, z):
        """returns a lower bound on model evidence based on X and membership"""
        check_is_fitted(self, 'means_')

        if self.covariance_type not in ['full', 'tied', 'diag', 'spherical']:
            raise NotImplementedError("This ctype is not implemented: %s"
                                      % self.covariance_type)
@@ -480,7 +480,7 @@ def _set_weights(self):
                                   + self.gamma_[i, 2])
        self.weights_ /= np.sum(self.weights_)

-   def fit(self, X):
+   def fit(self, X, y=None):
        """Estimate model parameters with the variational
        algorithm.
@@ -501,10 +501,10 @@ def fit(self, X):
            List of n_features-dimensional data points. Each row
            corresponds to a single data point.
        """
-       self.random_state = check_random_state(self.random_state)
+       self.random_state_ = check_random_state(self.random_state)

        ## initialization step
-       X = np.asarray(X)
+       X = check_array(X)
        if X.ndim == 1:
            X = X[:, np.newaxis]

Review thread (on the random_state_ line):

Comment: It looks strange that we need to store the random state as a fitted attribute.

Comment: Why? How else should we keep sampling? If we do ...

Comment: Hum, this is a good question, I wonder if we deal consistently with this case.

Comment: We used this pattern in other places, I don't remember where, though. The previous pattern of just overwriting the parameter is clearly not what we want, though.

Comment: I agree.

Comment: BTW, wouldn't it make sense to have a common test to check that ...

Comment: Do you think that will be different from a ...

Comment: This is different but we can do that in another PR.

Comment: Oh, right, I got it. Btw, do you think it is ok to modify it if a RandomState object was passed? (And do you want to open an issue for the test?)
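The pattern under discussion, as I read it, keeps the random_state constructor parameter untouched and materialises the actual RandomState as a trailing-underscore attribute at fit time. A minimal sketch of that idiom (ToyEstimator and offset_ are illustrative names, not sklearn code):

    import numpy as np
    from sklearn.utils import check_random_state

    class ToyEstimator(object):
        def __init__(self, random_state=None):
            # The constructor only stores the parameter, so get_params()/
            # set_params() and cloning keep working.
            self.random_state = random_state

        def fit(self, X, y=None):
            # The concrete RandomState lives in a trailing-underscore
            # attribute, i.e. it is treated as derived state set at fit time.
            self.random_state_ = check_random_state(self.random_state)
            self.offset_ = self.random_state_.randn(X.shape[1])
            return self

    ToyEstimator(random_state=0).fit(np.zeros((5, 3)))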
@@ -521,7 +521,7 @@ def fit(self, X):
        if 'm' in self.init_params or not hasattr(self, 'means_'):
            self.means_ = cluster.KMeans(
                n_clusters=self.n_components,
-               random_state=self.random_state).fit(X).cluster_centers_[::-1]
+               random_state=self.random_state_).fit(X).cluster_centers_[::-1]

        if 'w' in self.init_params or not hasattr(self, 'weights_'):
            self.weights_ = np.tile(1.0 / self.n_components, self.n_components)
@@ -705,7 +705,7 @@ def score_samples(self, X):
        """
        check_is_fitted(self, 'gamma_')

-       X = np.asarray(X)
+       X = check_array(X)
        if X.ndim == 1:
            X = X[:, np.newaxis]
        dg = digamma(self.gamma_) - digamma(np.sum(self.gamma_))
Review thread:

Comment: It looks like it's not needed.

Comment: I ran into a case where max_iter is 0. I opened #4134 because I'm not familiar with the details.

Comment: Let's put an inline comment to make it explicit that this is to support the max_iter=0 case. Alternatively, I would be OK with adding input validation to ensure that models configured with max_iter < 1 consistently raise a ValueError. I don't see the point in supporting the max_iter=0 edge case in the first place. If most sklearn models already do something reasonable (i.e. they do not crash at fit, predict, or transform time) then this is fine. But if this is problematic for any of them, I would just explicitly disable support for max_iter=0 rather than hacking in workarounds for this useless edge case.

Comment: Have you had a look at #4134? It is not like I'm setting max_iter anywhere. I just run with the default arguments on random data, and the estimator crashes every time. With this fix, at least it runs through. I don't know what is happening, which is why I opened #4134.

Comment: Alright, I did not make the connection. Still, please add an inline comment such as: ...
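One of the options floated above is to validate max_iter up front instead of special-casing max_iter=0. A hedged sketch of what such a check could look like (_check_max_iter is a hypothetical helper for illustration, not the code this PR adds):

    def _check_max_iter(max_iter):
        # Fail fast on non-positive iteration counts instead of special-casing
        # max_iter=0 deeper inside the fitting loop.
        if max_iter < 1:
            raise ValueError("max_iter must be at least 1, got %r" % max_iter)
        return max_iter

    _check_max_iter(10)     # passes
    # _check_max_iter(0)    # would raise ValueError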