From 4d3e967ff6044c3c9e44d7cd0e769559f8d0391c Mon Sep 17 00:00:00 2001 From: reshamas Date: Thu, 15 Jul 2021 15:40:30 -0400 Subject: [PATCH 1/6] AdditiveChi2Sampler: fix numpy validation errors --- maint_tools/test_docstrings.py | 1 - sklearn/kernel_approximation.py | 112 +++++++++++++++++++------------- 2 files changed, 68 insertions(+), 45 deletions(-) diff --git a/maint_tools/test_docstrings.py b/maint_tools/test_docstrings.py index 587190401c61e..24b32a5de7a2a 100644 --- a/maint_tools/test_docstrings.py +++ b/maint_tools/test_docstrings.py @@ -9,7 +9,6 @@ # List of modules ignored when checking for numpydoc validation. DOCSTRING_IGNORE_LIST = [ - "AdditiveChi2Sampler", "AgglomerativeClustering", "BernoulliRBM", "Birch", diff --git a/sklearn/kernel_approximation.py b/sklearn/kernel_approximation.py index c00cd6f7f184e..84842d7a53399 100644 --- a/sklearn/kernel_approximation.py +++ b/sklearn/kernel_approximation.py @@ -57,10 +57,10 @@ class PolynomialCountSketch(BaseEstimator, TransformerMixin): will be approximated. n_components : int, default=100 - Dimensionality of the output feature space. Usually, n_components + Dimensionality of the output feature space. Usually, `n_components` should be greater than the number of features in input samples in order to achieve good performance. The optimal score / run time - balance is typically achieved around n_components = 10 * n_features, + balance is typically achieved around `n_components` = 10 * `n_features`, but this depends on the specific dataset being used. random_state : int, RandomState instance, default=None @@ -116,8 +116,12 @@ def fit(self, X, y=None): Parameters ---------- X : {array-like, sparse matrix} of shape (n_samples, n_features) - Training data, where n_samples in the number of samples - and n_features is the number of features. + Training data, where `n_samples` in the number of samples + and `n_features` is the number of features. + + y : array-like of shape (n_samples,) or (n_samples, n_outputs), \ + default=None + Target values (None for unsupervised transformations). Returns ------- @@ -147,8 +151,8 @@ def transform(self, X): Parameters ---------- X : {array-like}, shape (n_samples, n_features) - New data, where n_samples in the number of samples - and n_features is the number of features. + New data, where `n_samples` in the number of samples + and `n_features` is the number of features. Returns ------- @@ -281,8 +285,12 @@ def fit(self, X, y=None): Parameters ---------- X : {array-like, sparse matrix}, shape (n_samples, n_features) - Training data, where n_samples in the number of samples - and n_features is the number of features. + Training data, where `n_samples` in the number of samples + and `n_features` is the number of features. + + y : array-like, shape (n_samples,) or (n_samples, n_outputs), \ + default=None + Target values (None for unsupervised transformations). Returns ------- @@ -307,8 +315,8 @@ def transform(self, X): Parameters ---------- X : {array-like, sparse matrix}, shape (n_samples, n_features) - New data, where n_samples in the number of samples - and n_features is the number of features. + New data, where `n_samples` in the number of samples + and `n_features` is the number of features. Returns ------- @@ -402,8 +410,12 @@ def fit(self, X, y=None): Parameters ---------- X : array-like, shape (n_samples, n_features) - Training data, where n_samples in the number of samples - and n_features is the number of features. 
+ Training data, where `n_samples` in the number of samples + and `n_features` is the number of features. + + y : array-like, shape (n_samples,) or (n_samples, n_outputs), \ + default=None + Target values (None for unsupervised transformations). Returns ------- @@ -426,8 +438,8 @@ def transform(self, X): Parameters ---------- X : array-like, shape (n_samples, n_features) - New data, where n_samples in the number of samples - and n_features is the number of features. All values of X must be + New data, where `n_samples` in the number of samples + and `n_features` is the number of features. All values of X must be strictly greater than "-skewedness". Returns @@ -471,39 +483,21 @@ class AdditiveChi2Sampler(TransformerMixin, BaseEstimator): ---------- sample_steps : int, default=2 Gives the number of (complex) sampling points. + sample_interval : float, default=None Sampling interval. Must be specified when sample_steps not in {1,2,3}. Attributes ---------- sample_interval_ : float - Stored sampling interval. Specified as a parameter if sample_steps not - in {1,2,3}. + Stored sampling interval. Specified as a parameter if `sample_steps` + not in {1,2,3}. n_features_in_ : int Number of features seen during :term:`fit`. .. versionadded:: 0.24 - Examples - -------- - >>> from sklearn.datasets import load_digits - >>> from sklearn.linear_model import SGDClassifier - >>> from sklearn.kernel_approximation import AdditiveChi2Sampler - >>> X, y = load_digits(return_X_y=True) - >>> chi2sampler = AdditiveChi2Sampler(sample_steps=2) - >>> X_transformed = chi2sampler.fit_transform(X, y) - >>> clf = SGDClassifier(max_iter=5, random_state=0, tol=1e-3) - >>> clf.fit(X_transformed, y) - SGDClassifier(max_iter=5, random_state=0) - >>> clf.score(X_transformed, y) - 0.9499... - - Notes - ----- - This estimator approximates a slightly different version of the additive - chi squared kernel then ``metric.additive_chi2`` computes. - See Also -------- SkewedChi2Sampler : A Fourier-approximation to a non-additive variant of @@ -514,12 +508,31 @@ class AdditiveChi2Sampler(TransformerMixin, BaseEstimator): sklearn.metrics.pairwise.additive_chi2_kernel : The exact additive chi squared kernel. + Notes + ----- + This estimator approximates a slightly different version of the additive + chi squared kernel then ``metric.additive_chi2`` computes. + References ---------- See `"Efficient additive kernels via explicit feature maps" `_ A. Vedaldi and A. Zisserman, Pattern Analysis and Machine Intelligence, 2011 + + Examples + -------- + >>> from sklearn.datasets import load_digits + >>> from sklearn.linear_model import SGDClassifier + >>> from sklearn.kernel_approximation import AdditiveChi2Sampler + >>> X, y = load_digits(return_X_y=True) + >>> chi2sampler = AdditiveChi2Sampler(sample_steps=2) + >>> X_transformed = chi2sampler.fit_transform(X, y) + >>> clf = SGDClassifier(max_iter=5, random_state=0, tol=1e-3) + >>> clf.fit(X_transformed, y) + SGDClassifier(max_iter=5, random_state=0) + >>> clf.score(X_transformed, y) + 0.9499... """ def __init__(self, *, sample_steps=2, sample_interval=None): @@ -532,8 +545,12 @@ def fit(self, X, y=None): Parameters ---------- X : array-like, shape (n_samples, n_features) - Training data, where n_samples in the number of samples - and n_features is the number of features. + Training data, where `n_samples` is the number of samples + and `n_features` is the number of features. 
+ + y : array-like, shape (n_samples,) or (n_samples, n_outputs), \ + default=None + Target values (None for unsupervised transformations). Returns ------- @@ -565,7 +582,9 @@ def transform(self, X): Parameters ---------- - X : {array-like, sparse matrix} of shape (n_samples, n_features) + X : {array-like, sparse matrix}, shape (n_samples, n_features) + Training data, where `n_samples` in the number of samples + and `n_features` is the number of features. Returns ------- @@ -664,7 +683,7 @@ class Nystroem(TransformerMixin, BaseEstimator): ---------- kernel : string or callable, default='rbf' Kernel map to be approximated. A callable should accept two arguments - and the keyword arguments passed to this object as kernel_params, and + and the keyword arguments passed to this object as `kernel_params`, and should return a floating point number. gamma : float, default=None @@ -690,14 +709,14 @@ class Nystroem(TransformerMixin, BaseEstimator): random_state : int, RandomState instance or None, default=None Pseudo-random number generator to control the uniform sampling without - replacement of n_components of the training data to construct the basis - kernel. + replacement of `n_components` of the training data to construct the + basis kernel. Pass an int for reproducible output across multiple function calls. See :term:`Glossary `. n_jobs : int, default=None The number of jobs to use for the computation. This works by breaking - down the kernel matrix into n_jobs even slices and computing them in + down the kernel matrix into `n_jobs` even slices and computing them in parallel. ``None`` means 1 unless in a :obj:`joblib.parallel_backend` context. @@ -789,8 +808,13 @@ def fit(self, X, y=None): Parameters ---------- - X : array-like of shape (n_samples, n_features) - Training data. + X : array-like, shape (n_samples, n_features) + Training data, where `n_samples` in the number of samples + and `n_features` is the number of features. + + y : array-like, shape (n_samples,) or (n_samples, n_outputs), \ + default=None + Target values (None for unsupervised transformations). """ X = self._validate_data(X, accept_sparse="csr") rnd = check_random_state(self.random_state) From 9952ef625d392ca121d0bb5965f5d9e0ad71bf5e Mon Sep 17 00:00:00 2001 From: reshamas Date: Fri, 16 Jul 2021 10:37:55 -0400 Subject: [PATCH 2/6] ? forgot what I did ? --- sklearn/kernel_approximation.py | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/sklearn/kernel_approximation.py b/sklearn/kernel_approximation.py index 84842d7a53399..83bf4bb5b648c 100644 --- a/sklearn/kernel_approximation.py +++ b/sklearn/kernel_approximation.py @@ -120,7 +120,7 @@ def fit(self, X, y=None): and `n_features` is the number of features. y : array-like of shape (n_samples,) or (n_samples, n_outputs), \ - default=None + default=None Target values (None for unsupervised transformations). Returns @@ -219,7 +219,7 @@ class RBFSampler(TransformerMixin, BaseEstimator): Parameters ---------- gamma : float, default=1.0 - Parameter of RBF kernel: exp(-gamma * x^2) + Parameter of RBF kernel: exp(-gamma * x^2). n_components : int, default=100 Number of Monte Carlo samples per original feature. @@ -344,7 +344,7 @@ class SkewedChi2Sampler(TransformerMixin, BaseEstimator): "skewedness" parameter of the kernel. Needs to be cross-validated. n_components : int, default=100 - number of Monte Carlo samples per original feature. + Number of Monte Carlo samples per original feature. Equals the dimensionality of the computed feature space. 
random_state : int, RandomState instance or None, default=None @@ -813,7 +813,7 @@ def fit(self, X, y=None): and `n_features` is the number of features. y : array-like, shape (n_samples,) or (n_samples, n_outputs), \ - default=None + default=None Target values (None for unsupervised transformations). """ X = self._validate_data(X, accept_sparse="csr") From a77cde22df7f3d3781f7523d1bffccbedb1e4d2f Mon Sep 17 00:00:00 2001 From: reshamas Date: Fri, 16 Jul 2021 12:47:22 -0400 Subject: [PATCH 3/6] fixed typo: array or sparse matrix --- sklearn/kernel_approximation.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/sklearn/kernel_approximation.py b/sklearn/kernel_approximation.py index 83bf4bb5b648c..2b1012a6e9ff3 100644 --- a/sklearn/kernel_approximation.py +++ b/sklearn/kernel_approximation.py @@ -590,7 +590,7 @@ def transform(self, X): ------- X_new : {ndarray, sparse matrix}, \ shape = (n_samples, n_features * (2*sample_steps + 1)) - Whether the return value is an array of sparse matrix depends on + Whether the return value is an array or sparse matrix depends on the type of the input X. """ msg = ( From b2bb38fca009d9909c27d887169f5880a272726a Mon Sep 17 00:00:00 2001 From: reshamas Date: Fri, 16 Jul 2021 12:54:33 -0400 Subject: [PATCH 4/6] fixed typos, added missing periods. --- sklearn/kernel_approximation.py | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/sklearn/kernel_approximation.py b/sklearn/kernel_approximation.py index 2b1012a6e9ff3..6764414cfe65b 100644 --- a/sklearn/kernel_approximation.py +++ b/sklearn/kernel_approximation.py @@ -157,6 +157,7 @@ def transform(self, X): Returns ------- X_new : array-like, shape (n_samples, n_components) + Projected array. """ check_is_fitted(self) @@ -321,6 +322,7 @@ def transform(self, X): Returns ------- X_new : array-like, shape (n_samples, n_components) + Projected array. """ check_is_fitted(self) @@ -438,13 +440,14 @@ def transform(self, X): Parameters ---------- X : array-like, shape (n_samples, n_features) - New data, where `n_samples` in the number of samples + New data, where `n_samples` is the number of samples and `n_features` is the number of features. All values of X must be strictly greater than "-skewedness". Returns ------- X_new : array-like, shape (n_samples, n_components) + Projected array. """ check_is_fitted(self) From 1f7198cb574f029643c4e5f7795900b7ac7f77b8 Mon Sep 17 00:00:00 2001 From: reshamas Date: Fri, 16 Jul 2021 19:09:36 -0400 Subject: [PATCH 5/6] found where the missing period is supposed to be --- sklearn/kernel_approximation.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/sklearn/kernel_approximation.py b/sklearn/kernel_approximation.py index 6764414cfe65b..be5070a5b79bc 100644 --- a/sklearn/kernel_approximation.py +++ b/sklearn/kernel_approximation.py @@ -543,7 +543,7 @@ def __init__(self, *, sample_steps=2, sample_interval=None): self.sample_interval = sample_interval def fit(self, X, y=None): - """Set the parameters + """Set the parameters. 
Parameters ---------- From 5c760d9dea31736871b23db536e06c9d6fc7a5da Mon Sep 17 00:00:00 2001 From: Guillaume Lemaitre Date: Tue, 20 Jul 2021 17:25:27 +0200 Subject: [PATCH 6/6] Apply suggestions from code review --- sklearn/kernel_approximation.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/sklearn/kernel_approximation.py b/sklearn/kernel_approximation.py index be5070a5b79bc..04b0321c7a13a 100644 --- a/sklearn/kernel_approximation.py +++ b/sklearn/kernel_approximation.py @@ -290,7 +290,7 @@ def fit(self, X, y=None): and `n_features` is the number of features. y : array-like, shape (n_samples,) or (n_samples, n_outputs), \ - default=None + default=None Target values (None for unsupervised transformations). Returns @@ -416,7 +416,7 @@ def fit(self, X, y=None): and `n_features` is the number of features. y : array-like, shape (n_samples,) or (n_samples, n_outputs), \ - default=None + default=None Target values (None for unsupervised transformations). Returns
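
For context, a minimal usage sketch of the AdditiveChi2Sampler API whose docstrings are edited above, written in the same doctest style as its Examples section. It assumes only the behaviour those docstrings describe: `y` is optional (the transformation is unsupervised), `fit` returns the estimator, and `transform` keeps one row per input sample.

    >>> from sklearn.datasets import load_digits
    >>> from sklearn.kernel_approximation import AdditiveChi2Sampler
    >>> X, y = load_digits(return_X_y=True)
    >>> chi2sampler = AdditiveChi2Sampler(sample_steps=2)
    >>> chi2sampler = chi2sampler.fit(X)  # y can be omitted: unsupervised transformation
    >>> X_transformed = chi2sampler.transform(X)
    >>> X_transformed.shape[0] == X.shape[0]
    True

Calling `fit` and `transform` separately, rather than `fit_transform` as in the docstring example, exercises both of the method docstrings touched by this patch; the number of output columns grows with `sample_steps`, as described in the `transform` return value above.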