
[WIP] FIX remove validation from __init__ and set_params #16945

Closed · wants to merge 14 commits
7 changes: 4 additions & 3 deletions sklearn/decomposition/_factor_analysis.py
@@ -147,9 +147,6 @@ def __init__(self, n_components=None, *, tol=1e-2, copy=True,
         self.copy = copy
         self.tol = tol
         self.max_iter = max_iter
-        if svd_method not in ['lapack', 'randomized']:
-            raise ValueError('SVD method %s is not supported. Please consider'
-                             ' the documentation' % svd_method)
         self.svd_method = svd_method
 
         self.noise_variance_init = noise_variance_init
@@ -170,6 +167,10 @@ def fit(self, X, y=None):
         -------
         self
         """
+        if self.svd_method not in ['lapack', 'randomized']:
+            raise ValueError('SVD method %s is not supported. Please consider'
+                             ' the documentation' % self.svd_method)
+
         X = self._validate_data(X, copy=self.copy, dtype=np.float64)
 
         n_samples, n_features = X.shape
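For reviewers, a minimal sketch of the behavior this hunk changes (assuming a build with this patch applied): the bad parameter no longer fails at construction, only at fit time. Data shapes are arbitrary.

import numpy as np
from sklearn.decomposition import FactorAnalysis

X = np.random.RandomState(0).rand(10, 5)  # toy data, hypothetical shape
fa = FactorAnalysis(svd_method='foo')     # no error: __init__ only stores params
try:
    fa.fit(X)                             # validation now happens here
except ValueError as exc:
    print(exc)                            # "SVD method foo is not supported. ..."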
6 changes: 3 additions & 3 deletions sklearn/decomposition/_fastica.py
@@ -397,9 +397,6 @@ def __init__(self, n_components=None, *, algorithm='parallel', whiten=True,
                  fun='logcosh', fun_args=None, max_iter=200, tol=1e-4,
                  w_init=None, random_state=None):
         super().__init__()
-        if max_iter < 1:
-            raise ValueError("max_iter should be greater than 1, got "
-                             "(max_iter={})".format(max_iter))
         self.n_components = n_components
         self.algorithm = algorithm
         self.whiten = whiten
@@ -427,6 +424,9 @@ def _fit(self, X, compute_sources=False):
         -------
         X_new : array-like, shape (n_samples, n_components)
         """
+        if self.max_iter < 1:
+            raise ValueError("max_iter should be greater than 1, got "
+                             "(max_iter={})".format(self.max_iter))
 
         X = self._validate_data(X, copy=self.whiten, dtype=FLOAT_DTYPES,
                                 ensure_min_samples=2).T
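Because the check moved into the private _fit, both public entry points pick it up; a sketch assuming this patch:

import numpy as np
from sklearn.decomposition import FastICA

X = np.random.RandomState(0).random_sample((10, 3))  # toy data
ica = FastICA(max_iter=0)                    # constructs fine under this patch
for method in (ica.fit, ica.fit_transform):  # both route through _fit
    try:
        method(X)
    except ValueError as exc:
        print(exc)   # "max_iter should be greater than 1, got (max_iter=0)"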
7 changes: 4 additions & 3 deletions sklearn/decomposition/_kernel_pca.py
@@ -145,9 +145,6 @@ def __init__(self, n_components=None, *, kernel="linear",
                  alpha=1.0, fit_inverse_transform=False, eigen_solver='auto',
                  tol=0, max_iter=None, remove_zero_eig=False,
                  random_state=None, copy_X=True, n_jobs=None):
-        if fit_inverse_transform and kernel == 'precomputed':
-            raise ValueError(
-                "Cannot fit_inverse_transform with a precomputed kernel.")
         self.n_components = n_components
         self.kernel = kernel
         self.kernel_params = kernel_params
@@ -275,6 +272,10 @@ def fit(self, X, y=None):
         self : object
             Returns the instance itself.
         """
+        if self.fit_inverse_transform and self.kernel == 'precomputed':
+            raise ValueError(
+                "Cannot fit_inverse_transform with a precomputed kernel.")
+
         X = self._validate_data(X, accept_sparse='csr', copy=self.copy_X)
         self._centerer = KernelCenterer()
         K = self._get_kernel(X)
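Moving the check to fit also closes the set_params loophole: the old constructor raised, but attributes assigned afterwards were never re-checked. A sketch assuming this patch:

from sklearn.decomposition import KernelPCA

kpca = KernelPCA(n_components=2, kernel='precomputed')
kpca.set_params(fit_inverse_transform=True)  # accepted both before and after
# Before the patch this inconsistent state slipped through unvalidated; now
# kpca.fit(K) raises:
#   ValueError: Cannot fit_inverse_transform with a precomputed kernel.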
2 changes: 1 addition & 1 deletion sklearn/decomposition/tests/test_factor_analysis.py
@@ -33,7 +33,7 @@ def test_factor_analysis():
     X = np.dot(h, W) + noise
 
     with pytest.raises(ValueError):
-        FactorAnalysis(svd_method='foo')
+        FactorAnalysis(svd_method='foo').fit(X)
     fa_fail = FactorAnalysis()
     fa_fail.svd_method = 'foo'
     with pytest.raises(ValueError):
2 changes: 1 addition & 1 deletion sklearn/decomposition/tests/test_fastica.py
@@ -271,7 +271,7 @@ def test_fastica_errors():
     X = rng.random_sample((n_samples, n_features))
     w_init = rng.randn(n_features + 1, n_features + 1)
     with pytest.raises(ValueError, match='max_iter should be greater than 1'):
-        FastICA(max_iter=0)
+        FastICA(max_iter=0).fit(X)
     with pytest.raises(ValueError, match=r'alpha must be in \[1,2\]'):
         fastica(X, fun_args={'alpha': 0})
     with pytest.raises(ValueError, match='w_init has invalid shape.+'
4 changes: 3 additions & 1 deletion sklearn/decomposition/tests/test_kernel_pca.py
@@ -55,8 +55,10 @@ def histogram(x, y, **kwargs):
 
 
 def test_kernel_pca_invalid_parameters():
+    state = np.random.RandomState(0)
+    X = state.rand(10, 10)
     with pytest.raises(ValueError):
-        KernelPCA(10, fit_inverse_transform=True, kernel='precomputed')
+        KernelPCA(10, fit_inverse_transform=True, kernel='precomputed').fit(X)
 
 
 def test_kernel_pca_consistent_transform():
3 changes: 1 addition & 2 deletions sklearn/feature_extraction/_hash.py
@@ -92,8 +92,6 @@ class FeatureHasher(TransformerMixin, BaseEstimator):
     @_deprecate_positional_args
     def __init__(self, n_features=(2 ** 20), *, input_type="dict",
                  dtype=np.float64, alternate_sign=True):
-        self._validate_params(n_features, input_type)
-
         self.dtype = dtype
         self.input_type = input_type
         self.n_features = n_features
@@ -150,6 +148,7 @@ def transform(self, raw_X):
             Feature matrix, for use with estimators or further transformers.
 
         """
+        self._validate_params(self.n_features, self.input_type)
         raw_X = iter(raw_X)
         if self.input_type == "dict":
             raw_X = (_iteritems(d) for d in raw_X)
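FeatureHasher is stateless (its fit is a no-op), so here the validation lands in transform rather than fit; a sketch assuming this patch:

from sklearn.feature_extraction import FeatureHasher

h = FeatureHasher(n_features=-1, input_type='dict')  # constructs fine now
try:
    h.transform([{'a': 1}])   # _validate_params raises at first real use
except ValueError as exc:
    print(exc)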
10 changes: 6 additions & 4 deletions sklearn/feature_extraction/tests/test_feature_hasher.py
@@ -112,14 +112,16 @@ def test_hash_empty_input():
 
 
 def test_hasher_invalid_input():
+    raw_X = [[], (), iter(range(0))]
+
     with pytest.raises(ValueError):
-        FeatureHasher(input_type="gobbledygook")
+        FeatureHasher(input_type="gobbledygook").transform(raw_X)
     with pytest.raises(ValueError):
-        FeatureHasher(n_features=-1)
+        FeatureHasher(n_features=-1).transform(raw_X)
     with pytest.raises(ValueError):
-        FeatureHasher(n_features=0)
+        FeatureHasher(n_features=0).transform(raw_X)
     with pytest.raises(TypeError):
-        FeatureHasher(n_features='ham')
+        FeatureHasher(n_features='ham').transform(raw_X)
 
     h = FeatureHasher(n_features=np.uint16(2 ** 6))
     with pytest.raises(ValueError):
4 changes: 4 additions & 0 deletions sklearn/feature_extraction/tests/test_text.py
@@ -512,12 +512,16 @@ def test_tfidf_vectorizer_setters():
     tv = TfidfVectorizer(norm='l2', use_idf=False, smooth_idf=False,
                          sublinear_tf=False)
     tv.norm = 'l1'
+    tv.fit(ALL_FOOD_DOCS)
     assert tv._tfidf.norm == 'l1'
     tv.use_idf = True
+    tv.fit(ALL_FOOD_DOCS)
     assert tv._tfidf.use_idf
     tv.smooth_idf = True
+    tv.fit(ALL_FOOD_DOCS)
     assert tv._tfidf.smooth_idf
     tv.sublinear_tf = True
+    tv.fit(ALL_FOOD_DOCS)
     assert tv._tfidf.sublinear_tf
 
 
79 changes: 29 additions & 50 deletions sklearn/feature_extraction/text.py
@@ -1014,15 +1014,7 @@ def __init__(self, *, input='content', encoding='utf-8',
         self.stop_words = stop_words
         self.max_df = max_df
         self.min_df = min_df
-        if max_df < 0 or min_df < 0:
-            raise ValueError("negative value for max_df or min_df")
         self.max_features = max_features
-        if max_features is not None:
-            if (not isinstance(max_features, numbers.Integral) or
-                    max_features <= 0):
-                raise ValueError(
-                    "max_features=%r, neither a positive integer nor None"
-                    % max_features)
         self.ngram_range = ngram_range
         self.vocabulary = vocabulary
         self.binary = binary
@@ -1178,6 +1170,15 @@ def fit_transform(self, raw_documents, y=None):
         # We intentionally don't call the transform method to make
         # fit_transform overridable without unwanted side effects in
         # TfidfVectorizer.
+        if self.max_df < 0 or self.min_df < 0:
+            raise ValueError("negative value for max_df or min_df")
+        if self.max_features is not None:
+            if (not isinstance(self.max_features, numbers.Integral) or
+                    self.max_features <= 0):
+                raise ValueError(
+                    "max_features=%r, neither a positive integer nor None"
+                    % self.max_features)
+
         if isinstance(raw_documents, str):
             raise ValueError(
                 "Iterable over raw text documents expected, "
@@ -1730,49 +1731,18 @@ def __init__(self, *, input='content', encoding='utf-8',
             ngram_range=ngram_range, max_df=max_df, min_df=min_df,
             max_features=max_features, vocabulary=vocabulary, binary=binary,
             dtype=dtype)
-
-        self._tfidf = TfidfTransformer(norm=norm, use_idf=use_idf,
-                                       smooth_idf=smooth_idf,
-                                       sublinear_tf=sublinear_tf)
-
-    # Broadcast the TF-IDF parameters to the underlying transformer instance
-    # for easy grid search and repr
-
-    @property
-    def norm(self):
-        return self._tfidf.norm
-
-    @norm.setter
-    def norm(self, value):
-        self._tfidf.norm = value
-
-    @property
-    def use_idf(self):
-        return self._tfidf.use_idf
-
-    @use_idf.setter
-    def use_idf(self, value):
-        self._tfidf.use_idf = value
-
-    @property
-    def smooth_idf(self):
-        return self._tfidf.smooth_idf
-
-    @smooth_idf.setter
-    def smooth_idf(self, value):
-        self._tfidf.smooth_idf = value
-
-    @property
-    def sublinear_tf(self):
-        return self._tfidf.sublinear_tf
-
-    @sublinear_tf.setter
-    def sublinear_tf(self, value):
-        self._tfidf.sublinear_tf = value
+        self.norm = norm
+        self.use_idf = use_idf
+        self.smooth_idf = smooth_idf
+        self.sublinear_tf = sublinear_tf
 
     @property
     def idf_(self):
-        return self._tfidf.idf_
+        try:
+            check_is_fitted(self)
+            return self._tfidf.idf_
+        except NotFittedError:
+            raise AttributeError
 
     @idf_.setter
     def idf_(self, value):
@@ -1782,6 +1752,11 @@ def idf_(self, value):
             raise ValueError("idf length = %d must be equal "
                              "to vocabulary size = %d" %
                              (len(value), len(self.vocabulary)))
+        if not hasattr(self, '_tfidf'):
+            self._tfidf = TfidfTransformer(norm=self.norm,
+                                           use_idf=self.use_idf,
+                                           smooth_idf=self.smooth_idf,
+                                           sublinear_tf=self.sublinear_tf)
         self._tfidf.idf_ = value
 
     def _check_params(self):
@@ -1809,7 +1784,9 @@ def fit(self, raw_documents, y=None):
         self._check_params()
         self._warn_for_unused_params()
         X = super().fit_transform(raw_documents)
-        self._tfidf.fit(X)
+        self._tfidf = TfidfTransformer(norm=self.norm, use_idf=self.use_idf,
+                                       smooth_idf=self.smooth_idf,
+                                       sublinear_tf=self.sublinear_tf).fit(X)
         return self
 
     def fit_transform(self, raw_documents, y=None):
@@ -1832,7 +1809,9 @@ def fit_transform(self, raw_documents, y=None):
         """
         self._check_params()
         X = super().fit_transform(raw_documents)
-        self._tfidf.fit(X)
+        self._tfidf = TfidfTransformer(norm=self.norm, use_idf=self.use_idf,
+                                       smooth_idf=self.smooth_idf,
+                                       sublinear_tf=self.sublinear_tf).fit(X)
         # X is already a transformed view of raw_documents so
         # we set copy to False
         return self._tfidf.transform(X, copy=False)
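The net effect for TfidfVectorizer, sketched under this patch: the TF-IDF parameters become plain attributes (so set_params and grid search operate on the vectorizer itself), the underlying TfidfTransformer is rebuilt from them at fit time, and idf_ converts NotFittedError into AttributeError so hasattr doubles as a fitted check. The corpus below is a hypothetical two-document toy.

from sklearn.feature_extraction.text import TfidfVectorizer

docs = ['the pizza burger beer', 'the the copyright coke']  # toy corpus
tv = TfidfVectorizer()
tv.set_params(norm='l1', sublinear_tf=True)  # just stores attributes now
print(hasattr(tv, 'idf_'))                   # False: AttributeError until fit
tv.fit(docs)
print(tv._tfidf.norm)                        # 'l1', picked up when fit built _tfidf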
2 changes: 0 additions & 2 deletions sklearn/linear_model/_stochastic_gradient.py
@@ -98,7 +98,6 @@ def __init__(self, loss, *, penalty='l2', alpha=0.0001, C=1.0,
         self.tol = tol
         # current tests expect init to do parameter validation
         # but we are not allowed to set attributes
-        self._validate_params()
 
     def set_params(self, **kwargs):
         """Set and validate the parameters of estimator.
@@ -114,7 +113,6 @@ def set_params(self, **kwargs):
             Estimator instance.
         """
         super().set_params(**kwargs)
-        self._validate_params()
         return self
 
     @abstractmethod
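With the calls removed from both __init__ and set_params, an invalid hyperparameter only surfaces when fit runs its own _validate_params (that fit-time call is assumed from the existing BaseSGD fit path and is not part of this diff); a sketch:

import numpy as np
from sklearn.linear_model import SGDClassifier

X = np.random.RandomState(0).rand(10, 3)    # toy data
y = np.array([0, 1] * 5)
clf = SGDClassifier().set_params(alpha=-1)  # no longer raises here
try:
    clf.fit(X, y)                           # fit-time validation catches it
except ValueError as exc:
    print(exc)                              # e.g. "alpha must be >= 0"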