diff --git a/azure-pipelines.yml b/azure-pipelines.yml
index ae27828dd22a3..c31385dd3e48d 100644
--- a/azure-pipelines.yml
+++ b/azure-pipelines.yml
@@ -22,6 +22,7 @@ jobs:
SCIPY_VERSION: '0.17.0'
CYTHON_VERSION: '*'
PILLOW_VERSION: '4.0.0'
+ MATPLOTLIB_VERSION: '1.5.1'
# later version of joblib are not packaged in conda for Python 3.5
JOBLIB_VERSION: '0.12.3'
COVERAGE: 'true'
diff --git a/build_tools/azure/install.cmd b/build_tools/azure/install.cmd
index 97f5cb4f7e465..a53cd61b34828 100644
--- a/build_tools/azure/install.cmd
+++ b/build_tools/azure/install.cmd
@@ -11,7 +11,7 @@ IF "%PYTHON_ARCH%"=="64" (
call deactivate
@rem Clean up any left-over from a previous build
conda remove --all -q -y -n %VIRTUALENV%
- conda create -n %VIRTUALENV% -q -y python=%PYTHON_VERSION% numpy scipy cython pytest wheel pillow joblib
+ conda create -n %VIRTUALENV% -q -y python=%PYTHON_VERSION% numpy scipy cython matplotlib pytest wheel pillow joblib
call activate %VIRTUALENV%
) else (
diff --git a/doc/conf.py b/doc/conf.py
index 27a6bf2ee30c2..c736adc8e267e 100644
--- a/doc/conf.py
+++ b/doc/conf.py
@@ -263,9 +263,9 @@
'sphx_glr_plot_compare_methods_001.png': 349}
-# enable experimental module so that the new GBDTs estimators can be
+# enable experimental module so that experimental estimators can be
# discovered properly by sphinx
-from sklearn.experimental import enable_hist_gradient_boosting # noqa
+from sklearn.experimental import * # noqa
def make_carousel_thumbs(app, exception):
diff --git a/doc/developers/contributing.rst b/doc/developers/contributing.rst
index 34a5f63919c44..69e7f0b2b480d 100644
--- a/doc/developers/contributing.rst
+++ b/doc/developers/contributing.rst
@@ -195,67 +195,67 @@ The preferred way to contribute to scikit-learn is to fork the `main
repository `__ on GitHub,
then submit a "pull request" (PR):
- 1. `Create an account `_ on
- GitHub if you do not already have one.
+1. `Create an account `_ on
+ GitHub if you do not already have one.
- 2. Fork the `project repository
- `__: click on the 'Fork'
- button near the top of the page. This creates a copy of the code under your
- account on the GitHub user account. For more details on how to fork a
- repository see `this guide `_.
+2. Fork the `project repository
+ `__: click on the 'Fork'
+ button near the top of the page. This creates a copy of the code under your
+ account on the GitHub user account. For more details on how to fork a
+ repository see `this guide `_.
- 3. Clone your fork of the scikit-learn repo from your GitHub account to your
- local disk::
+3. Clone your fork of the scikit-learn repo from your GitHub account to your
+ local disk::
- $ git clone git@github.com:YourLogin/scikit-learn.git
- $ cd scikit-learn
+ $ git clone git@github.com:YourLogin/scikit-learn.git
+ $ cd scikit-learn
- 4. Install library in editable mode::
+4. Install the library in editable mode::
- $ pip install --editable .
+ $ pip install --editable .
- for more details about advanced installation, see the
- :ref:`install_bleeding_edge` section.
+ for more details about advanced installation, see the
+ :ref:`install_bleeding_edge` section.
- 5. Create a branch to hold your development changes::
+5. Create a branch to hold your development changes::
- $ git checkout -b my-feature
+ $ git checkout -b my-feature
- and start making changes. Always use a ``feature`` branch. It's good practice to
- never work on the ``master`` branch!
+ and start making changes. Always use a ``feature`` branch. It's good practice to
+ never work on the ``master`` branch!
-.. note::
+ .. note::
- In the above setup, your ``origin`` remote repository points to
- ``YourLogin/scikit-learn.git``. If you wish to fetch/merge from the main
- repository instead of your forked one, you will need to add another remote
- to use instead of ``origin``. If we choose the name ``upstream`` for it, the
- command will be::
+ In the above setup, your ``origin`` remote repository points to
+ ``YourLogin/scikit-learn.git``. If you wish to fetch/merge from the main
+ repository instead of your forked one, you will need to add another remote
+ to use instead of ``origin``. If we choose the name ``upstream`` for it, the
+ command will be::
- $ git remote add upstream https://github.com/scikit-learn/scikit-learn.git
+ $ git remote add upstream https://github.com/scikit-learn/scikit-learn.git
- And in order to fetch the new remote and base your work on the latest changes
- of it you can::
+ And in order to fetch the new remote and base your work on the latest changes
+ of it you can::
- $ git fetch upstream
- $ git checkout -b my-feature upstream/master
+ $ git fetch upstream
+ $ git checkout -b my-feature upstream/master
- 6. Develop the feature on your feature branch on your computer, using Git to do the
- version control. When you're done editing, add changed files using ``git add``
- and then ``git commit`` files::
+6. Develop the feature on your feature branch on your computer, using Git for
+   version control. When you're done editing, add the changed files with
+   ``git add`` and then record them with ``git commit``::
- $ git add modified_files
- $ git commit
+ $ git add modified_files
+ $ git commit
- to record your changes in Git, then push the changes to your GitHub account with::
+ to record your changes in Git, then push the changes to your GitHub account with::
- $ git push -u origin my-feature
+ $ git push -u origin my-feature
- 7. Follow `these
- `_
- instructions to create a pull request from your fork. This will send an
- email to the committers. You may want to consider sending an email to the
- mailing list for more visibility.
+7. Follow `these
+ `_
+ instructions to create a pull request from your fork. This will send an
+ email to the committers. You may want to consider sending an email to the
+ mailing list for more visibility.
.. note::
@@ -626,7 +626,7 @@ reviewing pull requests, you may find :ref:`this tip
.. _testing_coverage:
Testing and improving test coverage
-------------------------------------
+-----------------------------------
High-quality `unit testing `_
is a corner-stone of the scikit-learn development process. For this
@@ -641,22 +641,42 @@ the corresponding subpackages.
We expect code coverage of new features to be at least around 90%.
-.. note:: **Workflow to improve test coverage**
+For guidelines on how to use ``pytest`` efficiently, see the
+:ref:`pytest_tips`.
- To test code coverage, you need to install the `coverage
- `_ package in addition to pytest.
+Writing matplotlib related tests
+................................
- 1. Run 'make test-coverage'. The output lists for each file the line
- numbers that are not tested.
+Test fixtures ensure that a set of tests runs with the appropriate
+initialization and cleanup. The scikit-learn test suite implements a fixture
+which can be used with ``matplotlib``.
- 2. Find a low hanging fruit, looking at which lines are not tested,
- write or adapt a test specifically for these lines.
+``pyplot``
+   The ``pyplot`` fixture should be used when a test function uses
+   ``matplotlib``. ``matplotlib`` is a soft dependency and is not required.
+   This fixture is in charge of skipping the tests if ``matplotlib`` is not
+   installed. In addition, figures created during the tests are automatically
+   closed once the test function has been executed.
- 3. Loop.
+To use this fixture in a test function, one needs to pass it as an
+argument::
-For guidelines on how to use ``pytest`` efficiently, see the
-:ref:`pytest_tips`.
+ def test_requiring_mpl_fixture(pyplot):
+ # you can now safely use matplotlib
+
+Workflow to improve test coverage
+.................................
+
+To test code coverage, you need to install the `coverage
+`_ package in addition to pytest.
+
+1. Run ``make test-coverage``. The output lists, for each file, the line
+   numbers that are not tested.
+
+2. Find a low-hanging fruit by looking at which lines are not tested, and
+   write or adapt a test specifically for these lines.
+
+3. Loop.
Developers web site
-------------------
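As a concrete illustration of the ``pyplot`` fixture documented in the hunk
above, a test might look like the following sketch (the plotted data and the
assertion are purely illustrative)::

    import numpy as np

    def test_plot_something(pyplot):
        # the fixture skips the test when matplotlib is missing and closes
        # all figures once the test has finished
        fig, ax = pyplot.subplots()
        ax.plot(np.arange(10), np.arange(10) ** 2)
        assert len(ax.lines) == 1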
diff --git a/doc/modules/classes.rst b/doc/modules/classes.rst
index c523236a11348..56de69db9519c 100644
--- a/doc/modules/classes.rst
+++ b/doc/modules/classes.rst
@@ -471,6 +471,7 @@ Samples generator
:toctree: generated/
experimental.enable_hist_gradient_boosting
+ experimental.enable_iterative_imputer
.. _feature_extraction_ref:
diff --git a/doc/modules/impute.rst b/doc/modules/impute.rst
index 777a2bd157b29..4cd0ea6e85d60 100644
--- a/doc/modules/impute.rst
+++ b/doc/modules/impute.rst
@@ -105,7 +105,16 @@ of ``y``. This is done for each feature in an iterative fashion, and then is
repeated for ``max_iter`` imputation rounds. The results of the final
imputation round are returned.
+.. note::
+
+   This estimator is still **experimental**: the predictions and the API
+   might change without any deprecation cycle. To use it, you need to
+   explicitly import ``enable_iterative_imputer``.
+
+::
+
>>> import numpy as np
+ >>> from sklearn.experimental import enable_iterative_imputer
>>> from sklearn.impute import IterativeImputer
>>> imp = IterativeImputer(max_iter=10, random_state=0)
>>> imp.fit([[1, 2], [3, 6], [4, 8], [np.nan, 3], [7, np.nan]]) # doctest: +NORMALIZE_WHITESPACE
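Building on the note and doctest above, a minimal end-to-end sketch of the
opt-in import pattern (the printed values are not asserted here) could be::

    import numpy as np
    # the experimental flag must be imported before IterativeImputer itself
    from sklearn.experimental import enable_iterative_imputer  # noqa
    from sklearn.impute import IterativeImputer

    imp = IterativeImputer(max_iter=10, random_state=0)
    imp.fit([[1, 2], [3, 6], [4, 8], [np.nan, 3], [7, np.nan]])
    # impute missing entries in new data with the fitted model
    print(imp.transform([[np.nan, 2], [6, np.nan], [np.nan, 6]]))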
diff --git a/doc/modules/linear_model.rst b/doc/modules/linear_model.rst
index a370791d248e2..c01b74775684f 100644
--- a/doc/modules/linear_model.rst
+++ b/doc/modules/linear_model.rst
@@ -136,17 +136,24 @@ Setting the regularization parameter: generalized Cross-Validation
------------------------------------------------------------------
:class:`RidgeCV` implements ridge regression with built-in
-cross-validation of the alpha parameter. The object works in the same way
+cross-validation of the alpha parameter. The object works in the same way
as GridSearchCV except that it defaults to Generalized Cross-Validation
(GCV), an efficient form of leave-one-out cross-validation::
+ >>> import numpy as np
>>> from sklearn import linear_model
- >>> reg = linear_model.RidgeCV(alphas=[0.1, 1.0, 10.0], cv=3)
- >>> reg.fit([[0, 0], [0, 0], [1, 1]], [0, .1, 1]) # doctest: +SKIP
- RidgeCV(alphas=[0.1, 1.0, 10.0], cv=3, fit_intercept=True, scoring=None,
- normalize=False)
- >>> reg.alpha_ # doctest: +SKIP
- 0.1
+ >>> reg = linear_model.RidgeCV(alphas=np.logspace(-6, 6, 13))
+ >>> reg.fit([[0, 0], [0, 0], [1, 1]], [0, .1, 1]) # doctest: +NORMALIZE_WHITESPACE
+ RidgeCV(alphas=array([1.e-06, 1.e-05, 1.e-04, 1.e-03, 1.e-02, 1.e-01, 1.e+00, 1.e+01,
+ 1.e+02, 1.e+03, 1.e+04, 1.e+05, 1.e+06]),
+ cv=None, fit_intercept=True, gcv_mode=None, normalize=False,
+ scoring=None, store_cv_values=False)
+ >>> reg.alpha_
+ 0.01
+
+Specifying the value of the `cv` attribute will trigger the use of
+cross-validation with `GridSearchCV`, for example `cv=10` for 10-fold
+cross-validation, rather than Generalized Cross-Validation.
.. topic:: References
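To illustrate the paragraph added above, a small sketch (synthetic data and an
arbitrary alpha grid) of switching from GCV to k-fold cross-validation via
``cv``::

    import numpy as np
    from sklearn import linear_model

    rng = np.random.RandomState(0)
    X = rng.randn(20, 2)
    y = X[:, 0] + 0.1 * rng.randn(20)

    # passing cv makes RidgeCV delegate to GridSearchCV-style k-fold CV
    # instead of Generalized Cross-Validation
    reg = linear_model.RidgeCV(alphas=np.logspace(-6, 6, 13), cv=5)
    reg.fit(X, y)
    print(reg.alpha_)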
diff --git a/doc/roadmap.rst b/doc/roadmap.rst
index a8334604395a2..2252b62d273e6 100644
--- a/doc/roadmap.rst
+++ b/doc/roadmap.rst
@@ -128,7 +128,6 @@ bottom.
#. Improved tools for model diagnostics and basic inference
- * partial dependence plots :issue:`5653`
* alternative feature importances implementations (e.g. methods or wrappers)
* better ways to handle validation sets when fitting
* better ways to find thresholds / create decision rules :issue:`8614`
@@ -144,19 +143,6 @@ bottom.
:issue:`6929`
* Callbacks or a similar system would facilitate logging and early stopping
-#. Use scipy BLAS Cython bindings
-
- * This will make it possible to get rid of our partial copy of suboptimal
- Atlas C-routines. :issue:`11638`
- * This should speed up the Windows and Linux wheels
-
-#. Allow fine-grained parallelism in cython
-
- * Now that we do not use fork-based multiprocessing in joblib anymore it's
- possible to use the prange / openmp thread management which makes it
- possible to have very efficient thread-based parallelism at the Cython
- level. Example with K-Means: :issue:`11950`
-
#. Distributed parallelism
* Joblib can now plug onto several backends, some of them can distribute the
@@ -240,9 +226,6 @@ Subpackage-specific goals
:mod:`sklearn.ensemble`
* a stacking implementation
-* a binned feature histogram based and thread parallel implementation of
- decision trees to compete with the performance of state of the art gradient
- boosting like LightGBM.
:mod:`sklearn.model_selection`
@@ -269,5 +252,3 @@ Subpackage-specific goals
* Performance issues with `Pipeline.memory`
* see "Everything in Scikit-learn should conform to our API contract" above
-* Add a verbose option :issue:`10435`
-
diff --git a/doc/whats_new/v0.21.rst b/doc/whats_new/v0.21.rst
index bf18d8350646e..91c8e4506ec2b 100644
--- a/doc/whats_new/v0.21.rst
+++ b/doc/whats_new/v0.21.rst
@@ -7,7 +7,7 @@
Version 0.21.0
==============
-**May 2019**
+**10 May 2019**
Changed models
--------------
@@ -38,6 +38,8 @@ random sampling procedures.
seed, including :class:`linear_model.LogisticRegression`,
:class:`linear_model.LogisticRegressionCV`, :class:`linear_model.Ridge`,
and :class:`linear_model.RidgeCV` with 'sag' solver. |Fix|
+- :class:`linear_model.ridge.RidgeCV` when using generalized cross-validation
+ with sparse inputs. |Fix|
Details are listed in the changelog below.
@@ -119,6 +121,12 @@ Support for Python 3.4 and below has been officially dropped.
parameter which can be used to find the clusters instead of ``n_clusters``.
:issue:`9069` by :user:`Vathsala Achar ` and `Adrin Jalali`_.
+:mod:`sklearn.compose`
+......................
+
+- |API| :class:`compose.ColumnTransformer` is no longer an experimental
+ feature. :pr:`13835` by :user:`Hanmin Qin `.
+
:mod:`sklearn.datasets`
.......................
@@ -214,7 +222,7 @@ Support for Python 3.4 and below has been officially dropped.
>>> # explicitly require this experimental feature
>>> from sklearn.experimental import enable_hist_gradient_boosting # noqa
- >>> # now you can import normally from ensemble
+ >>> # now you can import normally from sklearn.ensemble
>>> from sklearn.ensemble import HistGradientBoostingClassifier
:pr:`12807` by :user:`Nicolas Hug`.
@@ -319,6 +327,17 @@ Support for Python 3.4 and below has been officially dropped.
:pr:`12599` by :user:`Trevor Stephens` and
:user:`Nicolas Hug`.
+- |Fix| :class:`ensemble.VotingClassifier` and
+  :class:`ensemble.VotingRegressor` were failing during ``fit`` when one
+  of the estimators was set to ``None`` and ``sample_weight`` was not ``None``.
+ :pr:`13779` by :user:`Guillaume Lemaitre `.
+
+- |API| :class:`ensemble.VotingClassifier` and
+ :class:`ensemble.VotingRegressor` accept ``'drop'`` to disable an estimator
+ in addition to ``None`` to be consistent with other estimators (i.e.,
+ :class:`pipeline.FeatureUnion` and :class:`compose.ColumnTransformer`).
+ :pr:`13780` by :user:`Guillaume Lemaitre `.
+
:mod:`sklearn.externals`
........................
@@ -345,6 +364,15 @@ Support for Python 3.4 and below has been officially dropped.
:pr:`12177` by :user:`Sergey Feldman ` and :user:`Ben Lawson
`.
+  The API of IterativeImputer is experimental and subject to change without any
+  deprecation cycle. To use it, you need to explicitly import
+  ``enable_iterative_imputer``::
+
+ >>> from sklearn.experimental import enable_iterative_imputer # noqa
+ >>> # now you can import normally from sklearn.impute
+ >>> from sklearn.impute import IterativeImputer
+
+
- |Feature| The :class:`impute.SimpleImputer` and
:class:`impute.IterativeImputer` have a new parameter ``'add_indicator'``,
which simply stacks a :class:`impute.MissingIndicator` transform into the
@@ -384,6 +412,10 @@ Support for Python 3.4 and below has been officially dropped.
:mod:`sklearn.linear_model`
...........................
+- |Enhancement| :class:`linear_model.Ridge` now preserves ``float32`` and
+  ``float64`` dtypes. :issue:`8769` and :issue:`11000` by
+  :user:`Guillaume Lemaitre `, and :user:`Joan Massich `.
+
- |Feature| :class:`linear_model.LogisticRegression` and
:class:`linear_model.LogisticRegressionCV` now support Elastic-Net penalty,
with the 'saga' solver. :pr:`11646` by :user:`Nicolas Hug `.
@@ -478,6 +510,10 @@ Support for Python 3.4 and below has been officially dropped.
in version 0.21 and will be removed in version 0.23.
:pr:`12821` by :user:`Nicolas Hug `.
+- |Fix| :class:`linear_model.ridge.RidgeCV` with generalized cross-validation
+ now correctly fits an intercept when ``fit_intercept=True`` and the design
+ matrix is sparse. :issue:`13350` by :user:`Jérôme Dockès `
+
:mod:`sklearn.manifold`
.......................
@@ -577,7 +613,7 @@ Support for Python 3.4 and below has been officially dropped.
- |Feature| Classes :class:`~model_selection.GridSearchCV` and
:class:`~model_selection.RandomizedSearchCV` now allow for refit=callable
to add flexibility in identifying the best estimator.
- See :doc:`/auto_examples/model_selection/plot_grid_search_refit_callable.py`.
+ See :ref:`sphx_glr_auto_examples_model_selection_plot_grid_search_refit_callable.py`.
:pr:`11354` by :user:`Wenhao Zhang `,
`Joel Nothman`_ and :user:`Adrin Jalali `.
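As a quick check of the dtype-preservation enhancement listed above for
:class:`linear_model.Ridge` (a sketch, not part of the changelog)::

    import numpy as np
    from sklearn.linear_model import Ridge

    X = np.array([[0., 0.], [1., 1.], [2., 2.]], dtype=np.float32)
    y = np.array([0., 1., 2.], dtype=np.float32)

    ridge = Ridge(alpha=1.0).fit(X, y)
    # with the enhancement above, the coefficients stay in float32
    print(ridge.coef_.dtype)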
diff --git a/examples/impute/plot_iterative_imputer_variants_comparison.py b/examples/impute/plot_iterative_imputer_variants_comparison.py
index c6a8cb65d2c6b..06fab08c381f2 100644
--- a/examples/impute/plot_iterative_imputer_variants_comparison.py
+++ b/examples/impute/plot_iterative_imputer_variants_comparison.py
@@ -8,13 +8,13 @@
variable as an output in turn.
In this example we compare some estimators for the purpose of missing feature
-imputation with :class:`sklearn.imputeIterativeImputer`::
+imputation with :class:`sklearn.impute.IterativeImputer`:
- :class:`~sklearn.linear_model.BayesianRidge`: regularized linear regression
- :class:`~sklearn.tree.DecisionTreeRegressor`: non-linear regression
- :class:`~sklearn.ensemble.ExtraTreesRegressor`: similar to missForest in R
- :class:`~sklearn.neighbors.KNeighborsRegressor`: comparable to other KNN
- imputation approaches
+* :class:`~sklearn.linear_model.BayesianRidge`: regularized linear regression
+* :class:`~sklearn.tree.DecisionTreeRegressor`: non-linear regression
+* :class:`~sklearn.ensemble.ExtraTreesRegressor`: similar to missForest in R
+* :class:`~sklearn.neighbors.KNeighborsRegressor`: comparable to other KNN
+ imputation approaches
Of particular interest is the ability of
:class:`sklearn.impute.IterativeImputer` to mimic the behavior of missForest, a
@@ -42,6 +42,8 @@
import matplotlib.pyplot as plt
import pandas as pd
+# To use this experimental feature, we need to explicitly ask for it:
+from sklearn.experimental import enable_iterative_imputer # noqa
from sklearn.datasets import fetch_california_housing
from sklearn.impute import SimpleImputer
from sklearn.impute import IterativeImputer
diff --git a/examples/impute/plot_missing_values.py b/examples/impute/plot_missing_values.py
index 897b66aad246c..2d2d37745abf3 100644
--- a/examples/impute/plot_missing_values.py
+++ b/examples/impute/plot_missing_values.py
@@ -23,6 +23,8 @@
import numpy as np
import matplotlib.pyplot as plt
+# To use the experimental IterativeImputer, we need to explicitly ask for it:
+from sklearn.experimental import enable_iterative_imputer # noqa
from sklearn.datasets import load_diabetes
from sklearn.datasets import load_boston
from sklearn.ensemble import RandomForestRegressor
diff --git a/sklearn/__init__.py b/sklearn/__init__.py
index bd5e052a50577..1271b7e9fd4a9 100644
--- a/sklearn/__init__.py
+++ b/sklearn/__init__.py
@@ -45,7 +45,7 @@
# Dev branch marker is: 'X.Y.dev' or 'X.Y.devN' where N is an integer.
# 'X.Y.dev0' is the canonical version of 'X.Y.dev'
#
-__version__ = '0.21rc2'
+__version__ = '0.21.0'
# On OSX, we can get a runtime error due to multiple OpenMP libraries loaded
diff --git a/sklearn/cluster/hierarchical.py b/sklearn/cluster/hierarchical.py
index c99ef8f618b23..9f9245aa32f21 100644
--- a/sklearn/cluster/hierarchical.py
+++ b/sklearn/cluster/hierarchical.py
@@ -148,7 +148,7 @@ def ward_tree(X, connectivity=None, n_clusters=None, return_distance=False):
Parameters
----------
X : array, shape (n_samples, n_features)
- feature matrix representing n_samples samples to be clustered
+ feature matrix representing n_samples samples to be clustered
connectivity : sparse matrix (optional).
connectivity matrix. Defines for each sample the neighboring samples
@@ -219,7 +219,7 @@ def ward_tree(X, connectivity=None, n_clusters=None, return_distance=False):
n_samples, n_features = X.shape
if connectivity is None:
- from scipy.cluster import hierarchy # imports PIL
+ from scipy.cluster import hierarchy # imports PIL
if n_clusters is not None:
warnings.warn('Partial build of the tree is implemented '
@@ -433,7 +433,7 @@ def linkage_tree(X, connectivity=None, n_clusters=None, linkage='complete',
'of %s, but %s was given' % (linkage_choices.keys(), linkage))
if connectivity is None:
- from scipy.cluster import hierarchy # imports PIL
+ from scipy.cluster import hierarchy # imports PIL
if n_clusters is not None:
warnings.warn('Partial build of the tree is implemented '
@@ -597,7 +597,7 @@ def _single_linkage(*args, **kwargs):
###############################################################################
-# Functions for cutting hierarchical clustering tree
+# Functions for cutting hierarchical clustering tree
def _hc_cut(n_clusters, children, n_leaves):
"""Function cutting the ward tree for a given number of clusters.
diff --git a/sklearn/cluster/k_means_.py b/sklearn/cluster/k_means_.py
index 042e6990b5df1..37dc6a3abda61 100644
--- a/sklearn/cluster/k_means_.py
+++ b/sklearn/cluster/k_means_.py
@@ -44,7 +44,7 @@ def _k_init(X, n_clusters, x_squared_norms, random_state, n_local_trials=None):
"""Init n_clusters seeds according to k-means++
Parameters
- -----------
+ ----------
X : array or sparse matrix, shape (n_samples, n_features)
The data to pick seeds for. To avoid memory copy, the input data
should be double precision (dtype=np.float64).
@@ -706,7 +706,7 @@ def _init_centroids(X, k, init, random_state=None, x_squared_norms=None,
an int to make the randomness deterministic.
See :term:`Glossary `.
- x_squared_norms : array, shape (n_samples,), optional
+ x_squared_norms : array, shape (n_samples,), optional
Squared euclidean norm of each data point. Pass it if you have it at
hands already to avoid it being recomputed here. Default: None
@@ -887,7 +887,7 @@ class KMeans(BaseEstimator, ClusterMixin, TransformerMixin):
probably much faster than the default batch implementation.
Notes
- ------
+ -----
The k-means problem is solved using either Lloyd's or Elkan's algorithm.
The average complexity is given by O(k n T), were n is the number of
@@ -1419,8 +1419,8 @@ class MiniBatchKMeans(KMeans):
... [3, 2], [5, 5], [1, -1]])
>>> # manually fit on batches
>>> kmeans = MiniBatchKMeans(n_clusters=2,
- ... random_state=0,
- ... batch_size=6)
+ ... random_state=0,
+ ... batch_size=6)
>>> kmeans = kmeans.partial_fit(X[0:6,:])
>>> kmeans = kmeans.partial_fit(X[6:12,:])
>>> kmeans.cluster_centers_
@@ -1430,9 +1430,9 @@ class MiniBatchKMeans(KMeans):
array([0, 1], dtype=int32)
>>> # fit on the whole data
>>> kmeans = MiniBatchKMeans(n_clusters=2,
- ... random_state=0,
- ... batch_size=6,
- ... max_iter=10).fit(X)
+ ... random_state=0,
+ ... batch_size=6,
+ ... max_iter=10).fit(X)
>>> kmeans.cluster_centers_
array([[3.95918367, 2.40816327],
[1.12195122, 1.3902439 ]])
diff --git a/sklearn/cluster/mean_shift_.py b/sklearn/cluster/mean_shift_.py
index 7e93e715b7585..68b92139537d3 100644
--- a/sklearn/cluster/mean_shift_.py
+++ b/sklearn/cluster/mean_shift_.py
@@ -409,7 +409,7 @@ def fit(self, X, y=None):
"""Perform clustering.
Parameters
- -----------
+ ----------
X : array-like, shape=[n_samples, n_features]
Samples to cluster.
diff --git a/sklearn/cluster/spectral.py b/sklearn/cluster/spectral.py
index 82a771756d09c..fdaf423a11db4 100644
--- a/sklearn/cluster/spectral.py
+++ b/sklearn/cluster/spectral.py
@@ -173,7 +173,7 @@ def spectral_clustering(affinity, n_clusters=8, n_components=None,
Read more in the :ref:`User Guide `.
Parameters
- -----------
+ ----------
affinity : array-like or sparse matrix, shape: (n_samples, n_samples)
The affinity matrix describing the relationship of the samples to
embed. **Must be symmetric**.
@@ -240,7 +240,7 @@ def spectral_clustering(affinity, n_clusters=8, n_components=None,
https://www1.icsi.berkeley.edu/~stellayu/publication/doc/2003kwayICCV.pdf
Notes
- ------
+ -----
The graph should contain only one connect component, elsewhere
the results make little sense.
@@ -298,7 +298,7 @@ class SpectralClustering(BaseEstimator, ClusterMixin):
Read more in the :ref:`User Guide `.
Parameters
- -----------
+ ----------
n_clusters : integer, optional
The dimension of the projection subspace.
diff --git a/sklearn/compose/_column_transformer.py b/sklearn/compose/_column_transformer.py
index a59e7962bbbb4..11dad7338b94a 100644
--- a/sklearn/compose/_column_transformer.py
+++ b/sklearn/compose/_column_transformer.py
@@ -33,9 +33,6 @@
class ColumnTransformer(_BaseComposition, TransformerMixin):
"""Applies transformers to columns of an array or pandas DataFrame.
- EXPERIMENTAL: some behaviors may change between releases without
- deprecation.
-
This estimator allows different columns or column subsets of the input
to be transformed separately and the features generated by each transformer
will be concatenated to form a single feature space.
diff --git a/sklearn/conftest.py b/sklearn/conftest.py
new file mode 100644
index 0000000000000..d38e45f57b4f8
--- /dev/null
+++ b/sklearn/conftest.py
@@ -0,0 +1,21 @@
+import pytest
+
+
+@pytest.fixture(scope='function')
+def pyplot():
+ """Setup and teardown fixture for matplotlib.
+
+    This fixture checks if we can import matplotlib. If not, the tests will be
+    skipped. Otherwise, we set up the matplotlib backend and close the figures
+    after running the test functions.
+
+ Returns
+ -------
+ pyplot : module
+ The ``matplotlib.pyplot`` module.
+ """
+ matplotlib = pytest.importorskip('matplotlib')
+ matplotlib.use('agg', warn=False, force=True)
+ pyplot = pytest.importorskip('matplotlib.pyplot')
+ yield pyplot
+ pyplot.close('all')
diff --git a/sklearn/covariance/empirical_covariance_.py b/sklearn/covariance/empirical_covariance_.py
index 21d389846f198..a962c7ead8615 100644
--- a/sklearn/covariance/empirical_covariance_.py
+++ b/sklearn/covariance/empirical_covariance_.py
@@ -122,8 +122,8 @@ class EmpiricalCovariance(BaseEstimator):
... [.3, .4]])
>>> rng = np.random.RandomState(0)
>>> X = rng.multivariate_normal(mean=[0, 0],
- ... cov=real_cov,
- ... size=500)
+ ... cov=real_cov,
+ ... size=500)
>>> cov = EmpiricalCovariance().fit(X)
>>> cov.covariance_ # doctest: +ELLIPSIS
array([[0.7569..., 0.2818...],
diff --git a/sklearn/covariance/graph_lasso_.py b/sklearn/covariance/graph_lasso_.py
index 35ead3fcd8210..2e355f5cf3f1b 100644
--- a/sklearn/covariance/graph_lasso_.py
+++ b/sklearn/covariance/graph_lasso_.py
@@ -337,10 +337,10 @@ class GraphicalLasso(EmpiricalCovariance):
--------
>>> import numpy as np
>>> from sklearn.covariance import GraphicalLasso
- >>> true_cov = np.array([[.8, 0., .2, 0.],
- ... [0., .4, 0., 0.],
- ... [.2, 0., .3, .1],
- ... [0., 0., .1, .7]])
+ >>> true_cov = np.array([[0.8, 0.0, 0.2, 0.0],
+ ... [0.0, 0.4, 0.0, 0.0],
+ ... [0.2, 0.0, 0.3, 0.1],
+ ... [0.0, 0.0, 0.1, 0.7]])
>>> np.random.seed(0)
>>> X = np.random.multivariate_normal(mean=[0, 0, 0, 0],
... cov=true_cov,
@@ -592,10 +592,10 @@ class GraphicalLassoCV(GraphicalLasso):
--------
>>> import numpy as np
>>> from sklearn.covariance import GraphicalLassoCV
- >>> true_cov = np.array([[.8, 0., .2, 0.],
- ... [0., .4, 0., 0.],
- ... [.2, 0., .3, .1],
- ... [0., 0., .1, .7]])
+ >>> true_cov = np.array([[0.8, 0.0, 0.2, 0.0],
+ ... [0.0, 0.4, 0.0, 0.0],
+ ... [0.2, 0.0, 0.3, 0.1],
+ ... [0.0, 0.0, 0.1, 0.7]])
>>> np.random.seed(0)
>>> X = np.random.multivariate_normal(mean=[0, 0, 0, 0],
... cov=true_cov,
diff --git a/sklearn/datasets/base.py b/sklearn/datasets/base.py
index 3fdffc5851d01..0b8f73c86117b 100644
--- a/sklearn/datasets/base.py
+++ b/sklearn/datasets/base.py
@@ -568,12 +568,12 @@ def load_digits(n_class=10, return_X_y=False):
def load_diabetes(return_X_y=False):
"""Load and return the diabetes dataset (regression).
- ============== ==================
- Samples total 442
- Dimensionality 10
- Features real, -.2 < x < .2
- Targets integer 25 - 346
- ============== ==================
+ ============== ==================
+ Samples total 442
+ Dimensionality 10
+ Features real, -.2 < x < .2
+ Targets integer 25 - 346
+ ============== ==================
Read more in the :ref:`User Guide `.
@@ -621,12 +621,12 @@ def load_diabetes(return_X_y=False):
def load_linnerud(return_X_y=False):
"""Load and return the linnerud dataset (multivariate regression).
- ============== ============================
- Samples total 20
- Dimensionality 3 (for both data and target)
- Features integer
- Targets integer
- ============== ============================
+ ============== ============================
+ Samples total 20
+ Dimensionality 3 (for both data and target)
+ Features integer
+ Targets integer
+ ============== ============================
Read more in the :ref:`User Guide `.
@@ -685,12 +685,12 @@ def load_linnerud(return_X_y=False):
def load_boston(return_X_y=False):
"""Load and return the boston house-prices dataset (regression).
- ============== ==============
- Samples total 506
- Dimensionality 13
- Features real, positive
- Targets real 5. - 50.
- ============== ==============
+ ============== ==============
+ Samples total 506
+ Dimensionality 13
+ Features real, positive
+ Targets real 5. - 50.
+ ============== ==============
Read more in the :ref:`User Guide `.
@@ -810,7 +810,7 @@ def load_sample_image(image_name):
Read more in the :ref:`User Guide `.
Parameters
- -----------
+ ----------
image_name : {`china.jpg`, `flower.jpg`}
The name of the sample image loaded
@@ -820,7 +820,7 @@ def load_sample_image(image_name):
The image as a numpy array: height x width x color
Examples
- ---------
+ --------
>>> from sklearn.datasets import load_sample_image
>>> china = load_sample_image('china.jpg') # doctest: +SKIP
@@ -895,7 +895,7 @@ def _fetch_remote(remote, dirname=None):
downloaded file.
Parameters
- -----------
+ ----------
remote : RemoteFileMetadata
Named tuple containing remote dataset meta information: url, filename
and checksum
diff --git a/sklearn/datasets/california_housing.py b/sklearn/datasets/california_housing.py
index 372d6e44f1b92..26550270c3aab 100644
--- a/sklearn/datasets/california_housing.py
+++ b/sklearn/datasets/california_housing.py
@@ -50,12 +50,12 @@ def fetch_california_housing(data_home=None, download_if_missing=True,
return_X_y=False):
"""Load the California housing dataset (regression).
- ============== ==============
- Samples total 20640
- Dimensionality 8
- Features real
- Target real 0.15 - 5.
- ============== ==============
+ ============== ==============
+ Samples total 20640
+ Dimensionality 8
+ Features real
+ Target real 0.15 - 5.
+ ============== ==============
Read more in the :ref:`User Guide `.
@@ -97,7 +97,7 @@ def fetch_california_housing(data_home=None, download_if_missing=True,
.. versionadded:: 0.20
Notes
- ------
+ -----
This dataset consists of 20,640 samples and 9 features.
"""
diff --git a/sklearn/datasets/openml.py b/sklearn/datasets/openml.py
index 2363a9a4689ca..6f76ee15e2e40 100644
--- a/sklearn/datasets/openml.py
+++ b/sklearn/datasets/openml.py
@@ -449,9 +449,8 @@ def fetch_openml(name=None, version='active', data_id=None, data_home=None,
.. note:: EXPERIMENTAL
- The API is experimental in version 0.20 (particularly the return value
- structure), and might have small backward-incompatible changes in
- future releases.
+ The API is experimental (particularly the return value structure),
+ and might have small backward-incompatible changes in future releases.
Parameters
----------
@@ -515,10 +514,9 @@ def fetch_openml(name=None, version='active', data_id=None, data_home=None,
.. note:: EXPERIMENTAL
- This interface is **experimental** as at version 0.20 and
- subsequent releases may change attributes without notice
- (although there should only be minor changes to ``data``
- and ``target``).
+ This interface is **experimental** and subsequent releases may
+ change attributes without notice (although there should only be
+ minor changes to ``data`` and ``target``).
Missing values in the 'data' are represented as NaN's. Missing values
in 'target' are represented as NaN's (numerical target) or None
diff --git a/sklearn/datasets/species_distributions.py b/sklearn/datasets/species_distributions.py
index 34e8251f9551f..83cb5b132ccd5 100644
--- a/sklearn/datasets/species_distributions.py
+++ b/sklearn/datasets/species_distributions.py
@@ -154,7 +154,7 @@ def fetch_species_distributions(data_home=None,
instead of trying to download the data from the source site.
Returns
- --------
+ -------
The data is returned as a Bunch object with the following attributes:
coverages : array, shape = [14, 1592, 1212]
diff --git a/sklearn/datasets/svmlight_format.py b/sklearn/datasets/svmlight_format.py
index fbb38ceffa298..c85d4e91749b6 100644
--- a/sklearn/datasets/svmlight_format.py
+++ b/sklearn/datasets/svmlight_format.py
@@ -435,7 +435,7 @@ def dump_svmlight_file(X, y, f, zero_based=True, comment=None, query_id=None,
# if a user wants to get fancy, they'll have to decode themselves.
# Avoid mention of str and unicode types for Python 3.x compat.
if isinstance(comment, bytes):
- comment.decode("ascii") # just for the exception
+ comment.decode("ascii") # just for the exception
else:
comment = comment.encode("utf-8")
if b"\0" in comment:
diff --git a/sklearn/decomposition/base.py b/sklearn/decomposition/base.py
index b318de0cd0daf..e0b9b33de0bda 100644
--- a/sklearn/decomposition/base.py
+++ b/sklearn/decomposition/base.py
@@ -27,7 +27,7 @@ def get_covariance(self):
"""Compute data covariance with the generative model.
``cov = components_.T * S**2 * components_ + sigma2 * eye(n_features)``
- where S**2 contains the explained variances, and sigma2 contains the
+ where S**2 contains the explained variances, and sigma2 contains the
noise variances.
Returns
diff --git a/sklearn/decomposition/dict_learning.py b/sklearn/decomposition/dict_learning.py
index ef823272e0e8f..8075b706a5f9c 100644
--- a/sklearn/decomposition/dict_learning.py
+++ b/sklearn/decomposition/dict_learning.py
@@ -171,7 +171,7 @@ def _sparse_encode(X, dictionary, gram, cov=None, algorithm='lasso_lars',
copy_Xy=copy_cov).T
else:
raise ValueError('Sparse coding method must be "lasso_lars" '
- '"lasso_cd", "lasso", "threshold" or "omp", got %s.'
+ '"lasso_cd", "lasso", "threshold" or "omp", got %s.'
% algorithm)
if new_code.ndim != 2:
return new_code.reshape(n_samples, n_components)
diff --git a/sklearn/decomposition/kernel_pca.py b/sklearn/decomposition/kernel_pca.py
index c1c695c96d82b..555bd619c5a62 100644
--- a/sklearn/decomposition/kernel_pca.py
+++ b/sklearn/decomposition/kernel_pca.py
@@ -230,9 +230,9 @@ def _fit_transform(self, K):
# there is a link between
# the eigenvectors of K=Phi(X)'Phi(X) and the ones of Phi(X)Phi(X)'
# if v is an eigenvector of K
- # then Phi(X)v is an eigenvector of Phi(X)Phi(X)'
+ # then Phi(X)v is an eigenvector of Phi(X)Phi(X)'
# if u is an eigenvector of Phi(X)Phi(X)'
- # then Phi(X)'u is an eigenvector of Phi(X)Phi(X)'
+ # then Phi(X)'u is an eigenvector of Phi(X)Phi(X)'
#
# At this stage our self.alphas_ (the v) have norm 1, we need to scale
# them so that eigenvectors in kernel feature space (the u) have norm=1
diff --git a/sklearn/decomposition/pca.py b/sklearn/decomposition/pca.py
index d1cee0345d5e6..5c8893d141724 100644
--- a/sklearn/decomposition/pca.py
+++ b/sklearn/decomposition/pca.py
@@ -223,6 +223,8 @@ class PCA(_BasePCA):
The singular values are equal to the 2-norms of the ``n_components``
variables in the lower-dimensional space.
+ .. versionadded:: 0.19
+
mean_ : array, shape (n_features,)
Per-feature empirical mean, estimated from the training set.
diff --git a/sklearn/discriminant_analysis.py b/sklearn/discriminant_analysis.py
index e710bc5045b30..9d64292b702e0 100644
--- a/sklearn/discriminant_analysis.py
+++ b/sklearn/discriminant_analysis.py
@@ -716,7 +716,7 @@ def _decision_function(self, X):
Xm = X - self.means_[i]
X2 = np.dot(Xm, R * (S ** (-0.5)))
norm2.append(np.sum(X2 ** 2, 1))
- norm2 = np.array(norm2).T # shape = [len(X), n_classes]
+ norm2 = np.array(norm2).T # shape = [len(X), n_classes]
u = np.asarray([np.sum(np.log(s)) for s in self.scalings_])
return (-0.5 * (norm2 + u) + np.log(self.priors_))
diff --git a/sklearn/dummy.py b/sklearn/dummy.py
index 6e1fe461fabe7..98ecef6f6c459 100644
--- a/sklearn/dummy.py
+++ b/sklearn/dummy.py
@@ -492,10 +492,10 @@ def predict(self, X, return_std=False):
Returns
-------
- y : array, shape = [n_samples] or [n_samples, n_outputs]
+ y : array, shape = [n_samples] or [n_samples, n_outputs]
Predicted target values for X.
- y_std : array, shape = [n_samples] or [n_samples, n_outputs]
+ y_std : array, shape = [n_samples] or [n_samples, n_outputs]
Standard deviation of predictive distribution of query points.
"""
check_is_fitted(self, "constant_")
diff --git a/sklearn/ensemble/_gb_losses.py b/sklearn/ensemble/_gb_losses.py
index ca92589075b0c..19c66710bf0ad 100644
--- a/sklearn/ensemble/_gb_losses.py
+++ b/sklearn/ensemble/_gb_losses.py
@@ -879,6 +879,6 @@ def get_init_raw_predictions(self, X, estimator):
'lad': LeastAbsoluteError,
'huber': HuberLossFunction,
'quantile': QuantileLossFunction,
- 'deviance': None, # for both, multinomial and binomial
+ 'deviance': None, # for both, multinomial and binomial
'exponential': ExponentialLoss,
}
diff --git a/sklearn/ensemble/_gradient_boosting.pyx b/sklearn/ensemble/_gradient_boosting.pyx
index c46ed25a4c4dc..64225db2348dc 100644
--- a/sklearn/ensemble/_gradient_boosting.pyx
+++ b/sklearn/ensemble/_gradient_boosting.pyx
@@ -239,131 +239,6 @@ def predict_stage(np.ndarray[object, ndim=2] estimators,
return predict_stages(estimators[stage:stage + 1], X, scale, out)
-cdef inline int array_index(int32 val, int32[::1] arr):
- """Find index of ``val`` in array ``arr``. """
- cdef int32 res = -1
- cdef int32 i = 0
- cdef int32 n = arr.shape[0]
- for i in range(n):
- if arr[i] == val:
- res = i
- break
- return res
-
-
-cpdef _partial_dependence_tree(Tree tree, DTYPE_t[:, ::1] X,
- int32[::1] target_feature,
- double learn_rate,
- double[::1] out):
- """Partial dependence of the response on the ``target_feature`` set.
-
- For each row in ``X`` a tree traversal is performed.
- Each traversal starts from the root with weight 1.0.
-
- At each non-terminal node that splits on a target variable either
- the left child or the right child is visited based on the feature
- value of the current sample and the weight is not modified.
- At each non-terminal node that splits on a complementary feature
- both children are visited and the weight is multiplied by the fraction
- of training samples which went to each child.
-
- At each terminal node the value of the node is multiplied by the
- current weight (weights sum to 1 for all visited terminal nodes).
-
- Parameters
- ----------
- tree : sklearn.tree.Tree
- A regression tree; tree.values.shape[1] == 1
- X : memory view on 2d ndarray
- The grid points on which the partial dependence
- should be evaluated. X.shape[1] == target_feature.shape[0].
- target_feature : memory view on 1d ndarray
- The set of target features for which the partial dependence
- should be evaluated. X.shape[1] == target_feature.shape[0].
- learn_rate : double
- Constant scaling factor for the leaf predictions.
- out : memory view on 1d ndarray
- The value of the partial dependence function on each grid
- point.
- """
- cdef Py_ssize_t i = 0
- cdef Py_ssize_t n_features = X.shape[1]
- cdef Node* root_node = tree.nodes
- cdef double *value = tree.value
- cdef SIZE_t node_count = tree.node_count
-
- cdef SIZE_t stack_capacity = node_count * 2
- cdef Node **node_stack
- cdef double[::1] weight_stack = np_ones((stack_capacity,), dtype=np_float64)
- cdef SIZE_t stack_size = 1
- cdef double left_sample_frac
- cdef double current_weight
- cdef double total_weight = 0.0
- cdef Node *current_node
- underlying_stack = np_zeros((stack_capacity,), dtype=np.intp)
- node_stack = ( underlying_stack).data
-
- for i in range(X.shape[0]):
- # init stacks for new example
- stack_size = 1
- node_stack[0] = root_node
- weight_stack[0] = 1.0
- total_weight = 0.0
-
- while stack_size > 0:
- # get top node on stack
- stack_size -= 1
- current_node = node_stack[stack_size]
-
- if current_node.left_child == TREE_LEAF:
- out[i] += weight_stack[stack_size] * value[current_node - root_node] * \
- learn_rate
- total_weight += weight_stack[stack_size]
- else:
- # non-terminal node
- feature_index = array_index(current_node.feature, target_feature)
- if feature_index != -1:
- # split feature in target set
- # push left or right child on stack
- if X[i, feature_index] <= current_node.threshold:
- # left
- node_stack[stack_size] = (root_node +
- current_node.left_child)
- else:
- # right
- node_stack[stack_size] = (root_node +
- current_node.right_child)
- stack_size += 1
- else:
- # split feature in complement set
- # push both children onto stack
-
- # push left child
- node_stack[stack_size] = root_node + current_node.left_child
- current_weight = weight_stack[stack_size]
- left_sample_frac = root_node[current_node.left_child].weighted_n_node_samples / \
- current_node.weighted_n_node_samples
- if left_sample_frac <= 0.0 or left_sample_frac >= 1.0:
- raise ValueError("left_sample_frac:%d, "
- "weighted_n_node_samples current: %d, "
- "weighted_n_node_samples left: %d"
- % (left_sample_frac,
- current_node.weighted_n_node_samples,
- root_node[current_node.left_child].weighted_n_node_samples))
- weight_stack[stack_size] = current_weight * left_sample_frac
- stack_size +=1
-
- # push right child
- node_stack[stack_size] = root_node + current_node.right_child
- weight_stack[stack_size] = current_weight * \
- (1.0 - left_sample_frac)
- stack_size +=1
-
- if not (0.999 < total_weight < 1.001):
- raise ValueError("Total weight should be 1.0 but was %.9f" %
- total_weight)
-
-
def _random_sample_mask(np.npy_intp n_total_samples,
np.npy_intp n_total_in_bag, random_state):
"""Create a random sample mask where ``n_total_in_bag`` elements are set.
diff --git a/sklearn/ensemble/_hist_gradient_boosting/grower.py b/sklearn/ensemble/_hist_gradient_boosting/grower.py
index ce7ac7116030a..064c7ce8b6411 100644
--- a/sklearn/ensemble/_hist_gradient_boosting/grower.py
+++ b/sklearn/ensemble/_hist_gradient_boosting/grower.py
@@ -101,7 +101,7 @@ def __lt__(self, other_node):
priority).
Parameters
- -----------
+ ----------
other_node : TreeNode
The node to compare with.
"""
diff --git a/sklearn/ensemble/gradient_boosting.py b/sklearn/ensemble/gradient_boosting.py
index 3ce0eb7f456da..468de934f3666 100644
--- a/sklearn/ensemble/gradient_boosting.py
+++ b/sklearn/ensemble/gradient_boosting.py
@@ -1694,7 +1694,7 @@ def _staged_raw_predict(self, X):
Regression and binary classification are special cases with
``k == 1``, otherwise ``k==n_classes``.
"""
- X = check_array(X, dtype=DTYPE, order="C", accept_sparse='csr')
+ X = check_array(X, dtype=DTYPE, order="C", accept_sparse='csr')
raw_predictions = self._raw_predict_init(X)
for i in range(self.estimators_.shape[0]):
predict_stage(self.estimators_, i, X, self.learning_rate,
@@ -2085,7 +2085,7 @@ def decision_function(self, X):
`classes_`. Regression and binary classification produce an
array of shape [n_samples].
"""
- X = check_array(X, dtype=DTYPE, order="C", accept_sparse='csr')
+ X = check_array(X, dtype=DTYPE, order="C", accept_sparse='csr')
raw_predictions = self._raw_predict(X)
if raw_predictions.shape[1] == 1:
return raw_predictions.ravel()
@@ -2527,7 +2527,7 @@ def predict(self, X):
y : array, shape (n_samples,)
The predicted values.
"""
- X = check_array(X, dtype=DTYPE, order="C", accept_sparse='csr')
+ X = check_array(X, dtype=DTYPE, order="C", accept_sparse='csr')
# In regression we can directly return the raw value from the trees.
return self._raw_predict(X).ravel()
diff --git a/sklearn/ensemble/partial_dependence.py b/sklearn/ensemble/partial_dependence.py
index 11d5208d2d179..b1a40ffd09d1e 100644
--- a/sklearn/ensemble/partial_dependence.py
+++ b/sklearn/ensemble/partial_dependence.py
@@ -20,7 +20,6 @@
from ..tree._tree import DTYPE
from ..utils import deprecated
-from ._gradient_boosting import _partial_dependence_tree
from .gradient_boosting import BaseGradientBoosting
@@ -174,8 +173,8 @@ def partial_dependence(gbrt, target_variables, grid=None, X=None,
for stage in range(n_estimators):
for k in range(n_trees_per_stage):
tree = gbrt.estimators_[stage, k].tree_
- _partial_dependence_tree(tree, grid, target_variables,
- gbrt.learning_rate, pdp[k])
+ tree.compute_partial_dependence(grid, target_variables, pdp[k])
+ pdp *= gbrt.learning_rate
return pdp, axes
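A short sketch of the public call path exercised by the change above; it
mirrors the deprecated ``ensemble.partial_dependence`` helper used in the
tests, and the toy data is illustrative only::

    import numpy as np
    from sklearn.ensemble import GradientBoostingRegressor
    from sklearn.ensemble.partial_dependence import partial_dependence

    rng = np.random.RandomState(0)
    X = rng.randn(50, 3)
    y = X[:, 0] - 2 * X[:, 1] + rng.randn(50)

    est = GradientBoostingRegressor(n_estimators=10, random_state=1).fit(X, y)
    # pdp is now computed by Tree.compute_partial_dependence and scaled by
    # the learning rate inside the helper (a DeprecationWarning is expected)
    pdp, axes = partial_dependence(est, target_variables=[0], X=X,
                                   grid_resolution=25)
    print(pdp.shape)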
diff --git a/sklearn/ensemble/tests/test_partial_dependence.py b/sklearn/ensemble/tests/test_partial_dependence.py
index a40fea2ff0099..dc0e0419e812e 100644
--- a/sklearn/ensemble/tests/test_partial_dependence.py
+++ b/sklearn/ensemble/tests/test_partial_dependence.py
@@ -7,14 +7,12 @@
from numpy.testing import assert_array_equal, assert_allclose
from sklearn.utils.testing import assert_raises
-from sklearn.utils.testing import if_matplotlib
from sklearn.ensemble.partial_dependence import partial_dependence
from sklearn.ensemble.partial_dependence import plot_partial_dependence
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.ensemble import GradientBoostingRegressor
from sklearn import datasets
from sklearn.utils.testing import ignore_warnings
-from sklearn.utils.testing import assert_warns_message
# toy sample
@@ -156,8 +154,7 @@ def test_partial_dependecy_input():
@ignore_warnings(category=DeprecationWarning)
@pytest.mark.filterwarnings('ignore: Using or importing the ABCs from')
# matplotlib Python3.7 warning
-@if_matplotlib
-def test_plot_partial_dependence():
+def test_plot_partial_dependence(pyplot):
# Test partial dependence plot function.
clf = GradientBoostingRegressor(n_estimators=10, random_state=1)
clf.fit(boston.data, boston.target)
@@ -190,9 +187,8 @@ def test_plot_partial_dependence():
@pytest.mark.filterwarnings('ignore: Using or importing the ABCs from')
# matplotlib Python3.7 warning
-@if_matplotlib
@ignore_warnings(category=DeprecationWarning)
-def test_plot_partial_dependence_input():
+def test_plot_partial_dependence_input(pyplot):
# Test partial dependence plot function input checks.
clf = GradientBoostingClassifier(n_estimators=10, random_state=1)
@@ -228,9 +224,8 @@ def test_plot_partial_dependence_input():
@pytest.mark.filterwarnings('ignore: Using or importing the ABCs from')
# matplotlib Python3.7 warning
-@if_matplotlib
@ignore_warnings(category=DeprecationWarning)
-def test_plot_partial_dependence_multiclass():
+def test_plot_partial_dependence_multiclass(pyplot):
# Test partial dependence plot function on multi-class input.
clf = GradientBoostingClassifier(n_estimators=10, random_state=1)
clf.fit(iris.data, iris.target)
@@ -265,30 +260,18 @@ def test_plot_partial_dependence_multiclass():
grid_resolution=grid_resolution)
-def test_warning_raised_partial_dependence():
- # Test that deprecation warning is raised
-
- clf = GradientBoostingRegressor(n_estimators=10, random_state=1)
- clf.fit(boston.data, boston.target)
- grid_resolution = 25
-
- assert_warns_message(DeprecationWarning, "The function "
- "ensemble.partial_dependence has been deprecated ",
- partial_dependence, clf, [0], X=boston.data,
- grid_resolution=grid_resolution)
-
-
-@if_matplotlib
-def test_warning_raised_partial_dependence_plot():
- # Test that deprecation warning is raised
-
+@pytest.mark.parametrize(
+ "func, params",
+ [(partial_dependence, {'target_variables': [0], 'X': boston.data}),
+ (plot_partial_dependence, {'X': boston.data, 'features': [0, 1, (0, 1)]})]
+)
+def test_raise_deprecation_warning(pyplot, func, params):
clf = GradientBoostingRegressor(n_estimators=10, random_state=1)
clf.fit(boston.data, boston.target)
grid_resolution = 25
- assert_warns_message(DeprecationWarning, "The function "
- "ensemble.plot_partial_dependence has been "
- "deprecated",
- plot_partial_dependence, clf, boston.data,
- [0, 1, (0, 1)], grid_resolution=grid_resolution,
- feature_names=boston.feature_names)
+ warn_msg = "The function ensemble.{} has been deprecated".format(
+ func.__name__
+ )
+ with pytest.warns(DeprecationWarning, match=warn_msg):
+ func(clf, **params, grid_resolution=grid_resolution)
diff --git a/sklearn/ensemble/tests/test_voting.py b/sklearn/ensemble/tests/test_voting.py
index 2a19bc9a64dc0..b2b16cf8eeec3 100644
--- a/sklearn/ensemble/tests/test_voting.py
+++ b/sklearn/ensemble/tests/test_voting.py
@@ -8,9 +8,11 @@
from sklearn.utils.testing import assert_equal
from sklearn.utils.testing import assert_raise_message
from sklearn.exceptions import NotFittedError
+from sklearn.linear_model import LinearRegression
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import GaussianNB
from sklearn.ensemble import RandomForestClassifier
+from sklearn.ensemble import RandomForestRegressor
from sklearn.ensemble import VotingClassifier, VotingRegressor
from sklearn.model_selection import GridSearchCV
from sklearn import datasets
@@ -340,12 +342,25 @@ def test_sample_weight():
assert_array_equal(eclf3.predict(X), clf1.predict(X))
assert_array_almost_equal(eclf3.predict_proba(X), clf1.predict_proba(X))
+    # check that an informative error is raised if sample_weight is not
+    # supported by the underlying estimator.
clf4 = KNeighborsClassifier()
eclf3 = VotingClassifier(estimators=[
('lr', clf1), ('svc', clf3), ('knn', clf4)],
voting='soft')
- msg = ('Underlying estimator \'knn\' does not support sample weights.')
- assert_raise_message(ValueError, msg, eclf3.fit, X, y, sample_weight)
+ msg = ('Underlying estimator KNeighborsClassifier does not support '
+ 'sample weights.')
+ with pytest.raises(ValueError, match=msg):
+ eclf3.fit(X, y, sample_weight)
+
+    # check that _parallel_fit_estimator raises the original error when the
+    # failure is not related to sample_weight
+ class ClassifierErrorFit(BaseEstimator, ClassifierMixin):
+ def fit(self, X, y, sample_weight):
+ raise TypeError('Error unrelated to sample_weight.')
+ clf = ClassifierErrorFit()
+ with pytest.raises(TypeError, match='Error unrelated to sample_weight'):
+ clf.fit(X, y, sample_weight=sample_weight)
def test_sample_weight_kwargs():
@@ -402,8 +417,10 @@ def test_set_params():
@pytest.mark.filterwarnings('ignore: Default solver will be changed') # 0.22
@pytest.mark.filterwarnings('ignore: Default multi_class will') # 0.22
@pytest.mark.filterwarnings('ignore:The default value of n_estimators')
-def test_set_estimator_none():
- """VotingClassifier set_params should be able to set estimators as None"""
+@pytest.mark.parametrize("drop", [None, 'drop'])
+def test_set_estimator_none(drop):
+ """VotingClassifier set_params should be able to set estimators as None or
+ drop"""
# Test predict
clf1 = LogisticRegression(random_state=123)
clf2 = RandomForestClassifier(random_state=123)
@@ -415,22 +432,22 @@ def test_set_estimator_none():
eclf2 = VotingClassifier(estimators=[('lr', clf1), ('rf', clf2),
('nb', clf3)],
voting='hard', weights=[1, 1, 0.5])
- eclf2.set_params(rf=None).fit(X, y)
+ eclf2.set_params(rf=drop).fit(X, y)
assert_array_equal(eclf1.predict(X), eclf2.predict(X))
- assert dict(eclf2.estimators)["rf"] is None
+ assert dict(eclf2.estimators)["rf"] is drop
assert len(eclf2.estimators_) == 2
assert all(isinstance(est, (LogisticRegression, GaussianNB))
for est in eclf2.estimators_)
- assert eclf2.get_params()["rf"] is None
+ assert eclf2.get_params()["rf"] is drop
eclf1.set_params(voting='soft').fit(X, y)
eclf2.set_params(voting='soft').fit(X, y)
assert_array_equal(eclf1.predict(X), eclf2.predict(X))
assert_array_almost_equal(eclf1.predict_proba(X), eclf2.predict_proba(X))
- msg = 'All estimators are None. At least one is required!'
+ msg = 'All estimators are None or "drop". At least one is required!'
assert_raise_message(
- ValueError, msg, eclf2.set_params(lr=None, rf=None, nb=None).fit, X, y)
+ ValueError, msg, eclf2.set_params(lr=drop, rf=drop, nb=drop).fit, X, y)
# Test soft voting transform
X1 = np.array([[1], [2]])
@@ -442,7 +459,7 @@ def test_set_estimator_none():
eclf2 = VotingClassifier(estimators=[('rf', clf2), ('nb', clf3)],
voting='soft', weights=[1, 0.5],
flatten_transform=False)
- eclf2.set_params(rf=None).fit(X1, y1)
+ eclf2.set_params(rf=drop).fit(X1, y1)
assert_array_almost_equal(eclf1.transform(X1),
np.array([[[0.7, 0.3], [0.3, 0.7]],
[[1., 0.], [0., 1.]]]))
@@ -507,3 +524,26 @@ def test_transform():
eclf3.transform(X).swapaxes(0, 1).reshape((4, 6)),
eclf2.transform(X)
)
+
+
+@pytest.mark.filterwarnings('ignore: Default solver will be changed') # 0.22
+@pytest.mark.filterwarnings('ignore: Default multi_class will') # 0.22
+@pytest.mark.parametrize(
+ "X, y, voter",
+ [(X, y, VotingClassifier(
+ [('lr', LogisticRegression()),
+ ('rf', RandomForestClassifier(n_estimators=5))])),
+ (X_r, y_r, VotingRegressor(
+ [('lr', LinearRegression()),
+ ('rf', RandomForestRegressor(n_estimators=5))]))]
+)
+@pytest.mark.parametrize("drop", [None, 'drop'])
+def test_none_estimator_with_weights(X, y, voter, drop):
+    # check that an estimator can be set to None or 'drop' while passing
+    # sample weights
+ # regression test for
+ # https://github.com/scikit-learn/scikit-learn/issues/13777
+ voter.fit(X, y, sample_weight=np.ones(y.shape))
+ voter.set_params(lr=drop)
+ voter.fit(X, y, sample_weight=np.ones(y.shape))
+ y_pred = voter.predict(X)
+ assert y_pred.shape == y.shape
diff --git a/sklearn/ensemble/voting.py b/sklearn/ensemble/voting.py
index 35821201b6617..f60bb8f49b81d 100644
--- a/sklearn/ensemble/voting.py
+++ b/sklearn/ensemble/voting.py
@@ -30,7 +30,15 @@
def _parallel_fit_estimator(estimator, X, y, sample_weight=None):
"""Private function used to fit an estimator within a job."""
if sample_weight is not None:
- estimator.fit(X, y, sample_weight=sample_weight)
+ try:
+ estimator.fit(X, y, sample_weight=sample_weight)
+ except TypeError as exc:
+ if "unexpected keyword argument 'sample_weight'" in str(exc):
+ raise ValueError(
+ "Underlying estimator {} does not support sample weights."
+ .format(estimator.__class__.__name__)
+ ) from exc
+ raise
else:
estimator.fit(X, y)
return estimator
@@ -53,8 +61,8 @@ def _weights_not_none(self):
"""Get the weights of not `None` estimators"""
if self.weights is None:
return None
- return [w for est, w in zip(self.estimators,
- self.weights) if est[1] is not None]
+ return [w for est, w in zip(self.estimators, self.weights)
+ if est[1] not in (None, 'drop')]
def _predict(self, X):
"""Collect results from clf.predict calls. """
@@ -76,24 +84,22 @@ def fit(self, X, y, sample_weight=None):
'; got %d weights, %d estimators'
% (len(self.weights), len(self.estimators)))
- if sample_weight is not None:
- for name, step in self.estimators:
- if not has_fit_parameter(step, 'sample_weight'):
- raise ValueError('Underlying estimator \'%s\' does not'
- ' support sample weights.' % name)
-
names, clfs = zip(*self.estimators)
self._validate_names(names)
- n_isnone = np.sum([clf is None for _, clf in self.estimators])
+ n_isnone = np.sum(
+ [clf in (None, 'drop') for _, clf in self.estimators]
+ )
if n_isnone == len(self.estimators):
- raise ValueError('All estimators are None. At least one is '
- 'required!')
+ raise ValueError(
+ 'All estimators are None or "drop". At least one is required!'
+ )
self.estimators_ = Parallel(n_jobs=self.n_jobs)(
delayed(_parallel_fit_estimator)(clone(clf), X, y,
sample_weight=sample_weight)
- for clf in clfs if clf is not None)
+ for clf in clfs if clf not in (None, 'drop')
+ )
self.named_estimators_ = Bunch()
for k, e in zip(self.estimators, self.estimators_):
@@ -147,8 +153,8 @@ class VotingClassifier(_BaseVoting, ClassifierMixin):
estimators : list of (string, estimator) tuples
Invoking the ``fit`` method on the ``VotingClassifier`` will fit clones
of those original estimators that will be stored in the class attribute
- ``self.estimators_``. An estimator can be set to `None` using
- ``set_params``.
+ ``self.estimators_``. An estimator can be set to ``None`` or ``'drop'``
+ using ``set_params``.
voting : str, {'hard', 'soft'} (default='hard')
If 'hard', uses predicted class labels for majority rule voting.
@@ -281,7 +287,7 @@ def predict(self, X):
The input samples.
Returns
- ----------
+ -------
maj : array-like, shape (n_samples,)
Predicted class labels.
"""
@@ -325,7 +331,7 @@ def predict_proba(self):
The input samples.
Returns
- ----------
+ -------
avg : array-like, shape (n_samples, n_classes)
Weighted average probability for each class per sample.
"""
@@ -379,9 +385,9 @@ class VotingRegressor(_BaseVoting, RegressorMixin):
Parameters
----------
estimators : list of (string, estimator) tuples
- Invoking the ``fit`` method on the ``VotingRegressor`` will fit
- clones of those original estimators that will be stored in the class
- attribute ``self.estimators_``. An estimator can be set to `None`
+ Invoking the ``fit`` method on the ``VotingRegressor`` will fit clones
+ of those original estimators that will be stored in the class attribute
+ ``self.estimators_``. An estimator can be set to ``None`` or ``'drop'``
using ``set_params``.
weights : array-like, shape (n_regressors,), optional (default=`None`)
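
For reference, a small usage sketch of the ``'drop'`` value documented above (illustrative only, mirroring the regression test added earlier in this diff):

>>> import numpy as np
>>> from sklearn.ensemble import RandomForestRegressor, VotingRegressor
>>> from sklearn.linear_model import LinearRegression
>>> X = np.array([[1.], [2.], [3.], [4.]])
>>> y = np.array([1., 2., 3., 4.])
>>> reg = VotingRegressor([('lr', LinearRegression()),
...                        ('rf', RandomForestRegressor(n_estimators=5))])
>>> reg = reg.fit(X, y, sample_weight=np.ones(y.shape))
>>> # drop one estimator and refit; only the remaining estimator is used
>>> reg = reg.set_params(lr='drop').fit(X, y, sample_weight=np.ones(y.shape))
>>> reg.predict(X).shape
(4,)
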
diff --git a/sklearn/ensemble/weight_boosting.py b/sklearn/ensemble/weight_boosting.py
index 6e13b7bd80ae2..724aa07d2d76c 100644
--- a/sklearn/ensemble/weight_boosting.py
+++ b/sklearn/ensemble/weight_boosting.py
@@ -685,7 +685,7 @@ class in ``classes_``, respectively.
# The weights are all 1. for SAMME.R
pred = sum(_samme_proba(estimator, n_classes, X)
for estimator in self.estimators_)
- else: # self.algorithm == "SAMME"
+ else: # self.algorithm == "SAMME"
pred = sum((estimator.predict(X) == classes).T * w
for estimator, w in zip(self.estimators_,
self.estimator_weights_))
@@ -780,7 +780,7 @@ def predict_proba(self, X):
# The weights are all 1. for SAMME.R
proba = sum(_samme_proba(estimator, n_classes, X)
for estimator in self.estimators_)
- else: # self.algorithm == "SAMME"
+ else: # self.algorithm == "SAMME"
proba = sum(estimator.predict_proba(X) * w
for estimator, w in zip(self.estimators_,
self.estimator_weights_))
diff --git a/sklearn/exceptions.py b/sklearn/exceptions.py
index 9cf207e40fdd6..22289db5c45e2 100644
--- a/sklearn/exceptions.py
+++ b/sklearn/exceptions.py
@@ -29,7 +29,7 @@ class NotFittedError(ValueError, AttributeError):
... LinearSVC().predict([[1, 2], [2, 3], [3, 4]])
... except NotFittedError as e:
... print(repr(e))
- ... # doctest: +NORMALIZE_WHITESPACE +ELLIPSIS
+ ... # doctest: +NORMALIZE_WHITESPACE +ELLIPSIS
NotFittedError('This LinearSVC instance is not fitted yet'...)
.. versionchanged:: 0.18
@@ -114,7 +114,7 @@ class FitFailedWarning(RuntimeWarning):
>>> X, y = [[1, 2], [3, 4], [5, 6], [7, 8]], [0, 0, 1, 1]
>>> with warnings.catch_warnings(record=True) as w:
... try:
- ... gs.fit(X, y) # This will raise a ValueError since C is < 0
+ ... gs.fit(X, y) # This will raise a ValueError since C is < 0
... except ValueError:
... pass
... print(repr(w[-1].message))
diff --git a/sklearn/experimental/enable_iterative_imputer.py b/sklearn/experimental/enable_iterative_imputer.py
new file mode 100644
index 0000000000000..2f262141cc069
--- /dev/null
+++ b/sklearn/experimental/enable_iterative_imputer.py
@@ -0,0 +1,19 @@
+"""Enables IterativeImputer
+
+The API and results of this estimator might change without any deprecation
+cycle.
+
+Importing this file dynamically sets :class:`sklearn.impute.IterativeImputer`
+as an attribute of the impute module::
+
+ >>> # explicitly require this experimental feature
+ >>> from sklearn.experimental import enable_iterative_imputer # noqa
+ >>> # now you can import normally from impute
+ >>> from sklearn.impute import IterativeImputer
+"""
+
+from ..impute._iterative import IterativeImputer
+from .. import impute
+
+impute.IterativeImputer = IterativeImputer
+impute.__all__ += ['IterativeImputer']
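
An end-to-end sketch of the opt-in pattern this module enables (illustrative only; the data and parameters below are made up):

>>> import numpy as np
>>> # explicitly require this experimental feature
>>> from sklearn.experimental import enable_iterative_imputer  # noqa
>>> from sklearn.impute import IterativeImputer
>>> X = np.array([[1., 2.], [3., 6.], [4., 8.], [np.nan, 3.], [7., np.nan]])
>>> imputer = IterativeImputer(max_iter=10, random_state=0)
>>> X_filled = imputer.fit_transform(X)
>>> X_filled.shape
(5, 2)
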
diff --git a/sklearn/experimental/tests/test_enable_iterative_imputer.py b/sklearn/experimental/tests/test_enable_iterative_imputer.py
new file mode 100644
index 0000000000000..17579e0c43612
--- /dev/null
+++ b/sklearn/experimental/tests/test_enable_iterative_imputer.py
@@ -0,0 +1,39 @@
+"""Tests for making sure experimental imports work as expected."""
+
+import textwrap
+
+from sklearn.utils.testing import assert_run_python_script
+
+
+def test_imports_strategies():
+ # Make sure different import strategies work or fail as expected.
+
+ # Since Python caches the imported modules, we need to run a child process
+ # for every test case. Otherwise, the tests would not be independent
+ # (manually removing the imports from the cache (sys.modules) is not
+ # recommended and can lead to many complications).
+
+ good_import = """
+ from sklearn.experimental import enable_iterative_imputer
+ from sklearn.impute import IterativeImputer
+ """
+ assert_run_python_script(textwrap.dedent(good_import))
+
+ good_import_with_ensemble_first = """
+ import sklearn.ensemble
+ from sklearn.experimental import enable_iterative_imputer
+ from sklearn.impute import IterativeImputer
+ """
+ assert_run_python_script(textwrap.dedent(good_import_with_ensemble_first))
+
+ bad_imports = """
+ import pytest
+
+ with pytest.raises(ImportError):
+ from sklearn.impute import IterativeImputer
+
+ import sklearn.experimental
+ with pytest.raises(ImportError):
+ from sklearn.impute import IterativeImputer
+ """
+ assert_run_python_script(textwrap.dedent(bad_imports))
diff --git a/sklearn/feature_extraction/image.py b/sklearn/feature_extraction/image.py
index 68a43ff0be313..e6f8ff4e555fa 100644
--- a/sklearn/feature_extraction/image.py
+++ b/sklearn/feature_extraction/image.py
@@ -32,7 +32,7 @@ def _make_edges_3d(n_x, n_y, n_z=1):
"""Returns a list of edges for a 3D image.
Parameters
- ===========
+ ----------
n_x : integer
The size of the grid in the x direction.
n_y : integer
diff --git a/sklearn/feature_extraction/text.py b/sklearn/feature_extraction/text.py
index 007e158f3a449..7891e332c8214 100644
--- a/sklearn/feature_extraction/text.py
+++ b/sklearn/feature_extraction/text.py
@@ -110,7 +110,7 @@ def _check_stop_list(stop):
raise ValueError("not a built-in stop list: %s" % stop)
elif stop is None:
return None
- else: # assume it's a collection
+ else: # assume it's a collection
return frozenset(stop)
@@ -1496,7 +1496,7 @@ class TfidfVectorizer(CountVectorizer):
idf_ : array, shape (n_features)
The inverse document frequency (IDF) vector; only defined
- if ``use_idf`` is True.
+ if ``use_idf`` is True.
stop_words_ : set
Terms that were ignored because they either:
diff --git a/sklearn/gaussian_process/kernels.py b/sklearn/gaussian_process/kernels.py
index e7d5e97037427..88e97deaecf54 100644
--- a/sklearn/gaussian_process/kernels.py
+++ b/sklearn/gaussian_process/kernels.py
@@ -1156,7 +1156,7 @@ class RBF(StationaryKernelMixin, NormalizedKernelMixin, Kernel):
.. versionadded:: 0.18
Parameters
- -----------
+ ----------
length_scale : float or array with shape (n_features,), default: 1.0
The length scale of the kernel. If a float, an isotropic kernel is
used. If an array, an anisotropic kernel is used where each dimension
@@ -1269,7 +1269,7 @@ class Matern(RBF):
.. versionadded:: 0.18
Parameters
- -----------
+ ----------
length_scale : float or array with shape (n_features,), default: 1.0
The length scale of the kernel. If a float, an isotropic kernel is
used. If an array, an anisotropic kernel is used where each dimension
diff --git a/sklearn/impute.py b/sklearn/impute.py
deleted file mode 100644
index 8bbf1bb94e242..0000000000000
--- a/sklearn/impute.py
+++ /dev/null
@@ -1,1339 +0,0 @@
-"""Transformers for missing value imputation"""
-# Authors: Nicolas Tresegnie
-# Sergey Feldman
-# License: BSD 3 clause
-
-from __future__ import division
-
-import warnings
-import numbers
-from time import time
-from distutils.version import LooseVersion
-
-import numpy as np
-import numpy.ma as ma
-import scipy
-from scipy import sparse
-from scipy import stats
-from collections import namedtuple
-
-from .base import BaseEstimator, TransformerMixin
-from .base import clone
-from .exceptions import ConvergenceWarning
-from .preprocessing import normalize
-from .utils import check_array, check_random_state, safe_indexing
-from .utils.sparsefuncs import _get_median
-from .utils.validation import check_is_fitted
-from .utils.validation import FLOAT_DTYPES
-from .utils.fixes import _object_dtype_isnan
-from .utils import is_scalar_nan
-
-
-ImputerTriplet = namedtuple('ImputerTriplet', ['feat_idx',
- 'neighbor_feat_idx',
- 'estimator'])
-
-__all__ = [
- 'MissingIndicator',
- 'SimpleImputer',
- 'IterativeImputer',
-]
-
-
-def _check_inputs_dtype(X, missing_values):
- if (X.dtype.kind in ("f", "i", "u") and
- not isinstance(missing_values, numbers.Real)):
- raise ValueError("'X' and 'missing_values' types are expected to be"
- " both numerical. Got X.dtype={} and "
- " type(missing_values)={}."
- .format(X.dtype, type(missing_values)))
-
-
-def _get_mask(X, value_to_mask):
- """Compute the boolean mask X == missing_values."""
- if is_scalar_nan(value_to_mask):
- if X.dtype.kind == "f":
- return np.isnan(X)
- elif X.dtype.kind in ("i", "u"):
- # can't have NaNs in integer array.
- return np.zeros(X.shape, dtype=bool)
- else:
- # np.isnan does not work on object dtypes.
- return _object_dtype_isnan(X)
- else:
- # X == value_to_mask with object dytpes does not always perform
- # element-wise for old versions of numpy
- return np.equal(X, value_to_mask)
-
-
-def _most_frequent(array, extra_value, n_repeat):
- """Compute the most frequent value in a 1d array extended with
- [extra_value] * n_repeat, where extra_value is assumed to be not part
- of the array."""
- # Compute the most frequent value in array only
- if array.size > 0:
- with warnings.catch_warnings():
- # stats.mode raises a warning when input array contains objects due
- # to incapacity to detect NaNs. Irrelevant here since input array
- # has already been NaN-masked.
- warnings.simplefilter("ignore", RuntimeWarning)
- mode = stats.mode(array)
-
- most_frequent_value = mode[0][0]
- most_frequent_count = mode[1][0]
- else:
- most_frequent_value = 0
- most_frequent_count = 0
-
- # Compare to array + [extra_value] * n_repeat
- if most_frequent_count == 0 and n_repeat == 0:
- return np.nan
- elif most_frequent_count < n_repeat:
- return extra_value
- elif most_frequent_count > n_repeat:
- return most_frequent_value
- elif most_frequent_count == n_repeat:
- # Ties the breaks. Copy the behaviour of scipy.stats.mode
- if most_frequent_value < extra_value:
- return most_frequent_value
- else:
- return extra_value
-
-
-class SimpleImputer(BaseEstimator, TransformerMixin):
- """Imputation transformer for completing missing values.
-
- Read more in the :ref:`User Guide `.
-
- Parameters
- ----------
- missing_values : number, string, np.nan (default) or None
- The placeholder for the missing values. All occurrences of
- `missing_values` will be imputed.
-
- strategy : string, optional (default="mean")
- The imputation strategy.
-
- - If "mean", then replace missing values using the mean along
- each column. Can only be used with numeric data.
- - If "median", then replace missing values using the median along
- each column. Can only be used with numeric data.
- - If "most_frequent", then replace missing using the most frequent
- value along each column. Can be used with strings or numeric data.
- - If "constant", then replace missing values with fill_value. Can be
- used with strings or numeric data.
-
- .. versionadded:: 0.20
- strategy="constant" for fixed value imputation.
-
- fill_value : string or numerical value, optional (default=None)
- When strategy == "constant", fill_value is used to replace all
- occurrences of missing_values.
- If left to the default, fill_value will be 0 when imputing numerical
- data and "missing_value" for strings or object data types.
-
- verbose : integer, optional (default=0)
- Controls the verbosity of the imputer.
-
- copy : boolean, optional (default=True)
- If True, a copy of X will be created. If False, imputation will
- be done in-place whenever possible. Note that, in the following cases,
- a new copy will always be made, even if `copy=False`:
-
- - If X is not an array of floating values;
- - If X is encoded as a CSR matrix;
- - If add_indicator=True.
-
- add_indicator : boolean, optional (default=False)
- If True, a `MissingIndicator` transform will stack onto output
- of the imputer's transform. This allows a predictive estimator
- to account for missingness despite imputation. If a feature has no
- missing values at fit/train time, the feature won't appear on
- the missing indicator even if there are missing values at
- transform/test time.
-
- Attributes
- ----------
- statistics_ : array of shape (n_features,)
- The imputation fill value for each feature.
-
- indicator_ : :class:`sklearn.impute.MissingIndicator`
- Indicator used to add binary indicators for missing values.
- ``None`` if add_indicator is False.
-
- See also
- --------
- IterativeImputer : Multivariate imputation of missing values.
-
- Examples
- --------
- >>> import numpy as np
- >>> from sklearn.impute import SimpleImputer
- >>> imp_mean = SimpleImputer(missing_values=np.nan, strategy='mean')
- >>> imp_mean.fit([[7, 2, 3], [4, np.nan, 6], [10, 5, 9]])
- ... # doctest: +NORMALIZE_WHITESPACE
- SimpleImputer(add_indicator=False, copy=True, fill_value=None,
- missing_values=nan, strategy='mean', verbose=0)
- >>> X = [[np.nan, 2, 3], [4, np.nan, 6], [10, np.nan, 9]]
- >>> print(imp_mean.transform(X))
- ... # doctest: +NORMALIZE_WHITESPACE
- [[ 7. 2. 3. ]
- [ 4. 3.5 6. ]
- [10. 3.5 9. ]]
-
- Notes
- -----
- Columns which only contained missing values at `fit` are discarded upon
- `transform` if strategy is not "constant".
-
- """
- def __init__(self, missing_values=np.nan, strategy="mean",
- fill_value=None, verbose=0, copy=True, add_indicator=False):
- self.missing_values = missing_values
- self.strategy = strategy
- self.fill_value = fill_value
- self.verbose = verbose
- self.copy = copy
- self.add_indicator = add_indicator
-
- def _validate_input(self, X):
- allowed_strategies = ["mean", "median", "most_frequent", "constant"]
- if self.strategy not in allowed_strategies:
- raise ValueError("Can only use these strategies: {0} "
- " got strategy={1}".format(allowed_strategies,
- self.strategy))
-
- if self.strategy in ("most_frequent", "constant"):
- dtype = None
- else:
- dtype = FLOAT_DTYPES
-
- if not is_scalar_nan(self.missing_values):
- force_all_finite = True
- else:
- force_all_finite = "allow-nan"
-
- try:
- X = check_array(X, accept_sparse='csc', dtype=dtype,
- force_all_finite=force_all_finite, copy=self.copy)
- except ValueError as ve:
- if "could not convert" in str(ve):
- raise ValueError("Cannot use {0} strategy with non-numeric "
- "data. Received datatype :{1}."
- "".format(self.strategy, X.dtype.kind))
- else:
- raise ve
-
- _check_inputs_dtype(X, self.missing_values)
- if X.dtype.kind not in ("i", "u", "f", "O"):
- raise ValueError("SimpleImputer does not support data with dtype "
- "{0}. Please provide either a numeric array (with"
- " a floating point or integer dtype) or "
- "categorical data represented either as an array "
- "with integer dtype or an array of string values "
- "with an object dtype.".format(X.dtype))
-
- return X
-
- def fit(self, X, y=None):
- """Fit the imputer on X.
-
- Parameters
- ----------
- X : {array-like, sparse matrix}, shape (n_samples, n_features)
- Input data, where ``n_samples`` is the number of samples and
- ``n_features`` is the number of features.
-
- Returns
- -------
- self : SimpleImputer
- """
- X = self._validate_input(X)
-
- # default fill_value is 0 for numerical input and "missing_value"
- # otherwise
- if self.fill_value is None:
- if X.dtype.kind in ("i", "u", "f"):
- fill_value = 0
- else:
- fill_value = "missing_value"
- else:
- fill_value = self.fill_value
-
- # fill_value should be numerical in case of numerical input
- if (self.strategy == "constant" and
- X.dtype.kind in ("i", "u", "f") and
- not isinstance(fill_value, numbers.Real)):
- raise ValueError("'fill_value'={0} is invalid. Expected a "
- "numerical value when imputing numerical "
- "data".format(fill_value))
-
- if sparse.issparse(X):
- # missing_values = 0 not allowed with sparse data as it would
- # force densification
- if self.missing_values == 0:
- raise ValueError("Imputation not possible when missing_values "
- "== 0 and input is sparse. Provide a dense "
- "array instead.")
- else:
- self.statistics_ = self._sparse_fit(X,
- self.strategy,
- self.missing_values,
- fill_value)
- else:
- self.statistics_ = self._dense_fit(X,
- self.strategy,
- self.missing_values,
- fill_value)
-
- if self.add_indicator:
- self.indicator_ = MissingIndicator(
- missing_values=self.missing_values)
- self.indicator_.fit(X)
- else:
- self.indicator_ = None
-
- return self
-
- def _sparse_fit(self, X, strategy, missing_values, fill_value):
- """Fit the transformer on sparse data."""
- mask_data = _get_mask(X.data, missing_values)
- n_implicit_zeros = X.shape[0] - np.diff(X.indptr)
-
- statistics = np.empty(X.shape[1])
-
- if strategy == "constant":
- # for constant strategy, self.statistcs_ is used to store
- # fill_value in each column
- statistics.fill(fill_value)
- else:
- for i in range(X.shape[1]):
- column = X.data[X.indptr[i]:X.indptr[i + 1]]
- mask_column = mask_data[X.indptr[i]:X.indptr[i + 1]]
- column = column[~mask_column]
-
- # combine explicit and implicit zeros
- mask_zeros = _get_mask(column, 0)
- column = column[~mask_zeros]
- n_explicit_zeros = mask_zeros.sum()
- n_zeros = n_implicit_zeros[i] + n_explicit_zeros
-
- if strategy == "mean":
- s = column.size + n_zeros
- statistics[i] = np.nan if s == 0 else column.sum() / s
-
- elif strategy == "median":
- statistics[i] = _get_median(column,
- n_zeros)
-
- elif strategy == "most_frequent":
- statistics[i] = _most_frequent(column,
- 0,
- n_zeros)
- return statistics
-
- def _dense_fit(self, X, strategy, missing_values, fill_value):
- """Fit the transformer on dense data."""
- mask = _get_mask(X, missing_values)
- masked_X = ma.masked_array(X, mask=mask)
-
- # Mean
- if strategy == "mean":
- mean_masked = np.ma.mean(masked_X, axis=0)
- # Avoid the warning "Warning: converting a masked element to nan."
- mean = np.ma.getdata(mean_masked)
- mean[np.ma.getmask(mean_masked)] = np.nan
-
- return mean
-
- # Median
- elif strategy == "median":
- median_masked = np.ma.median(masked_X, axis=0)
- # Avoid the warning "Warning: converting a masked element to nan."
- median = np.ma.getdata(median_masked)
- median[np.ma.getmaskarray(median_masked)] = np.nan
-
- return median
-
- # Most frequent
- elif strategy == "most_frequent":
- # scipy.stats.mstats.mode cannot be used because it will no work
- # properly if the first element is masked and if its frequency
- # is equal to the frequency of the most frequent valid element
- # See https://github.com/scipy/scipy/issues/2636
-
- # To be able access the elements by columns
- X = X.transpose()
- mask = mask.transpose()
-
- if X.dtype.kind == "O":
- most_frequent = np.empty(X.shape[0], dtype=object)
- else:
- most_frequent = np.empty(X.shape[0])
-
- for i, (row, row_mask) in enumerate(zip(X[:], mask[:])):
- row_mask = np.logical_not(row_mask).astype(np.bool)
- row = row[row_mask]
- most_frequent[i] = _most_frequent(row, np.nan, 0)
-
- return most_frequent
-
- # Constant
- elif strategy == "constant":
- # for constant strategy, self.statistcs_ is used to store
- # fill_value in each column
- return np.full(X.shape[1], fill_value, dtype=X.dtype)
-
- def transform(self, X):
- """Impute all missing values in X.
-
- Parameters
- ----------
- X : {array-like, sparse matrix}, shape (n_samples, n_features)
- The input data to complete.
- """
- check_is_fitted(self, 'statistics_')
-
- X = self._validate_input(X)
-
- statistics = self.statistics_
-
- if X.shape[1] != statistics.shape[0]:
- raise ValueError("X has %d features per sample, expected %d"
- % (X.shape[1], self.statistics_.shape[0]))
-
- if self.add_indicator:
- X_trans_indicator = self.indicator_.transform(X)
-
- # Delete the invalid columns if strategy is not constant
- if self.strategy == "constant":
- valid_statistics = statistics
- else:
- # same as np.isnan but also works for object dtypes
- invalid_mask = _get_mask(statistics, np.nan)
- valid_mask = np.logical_not(invalid_mask)
- valid_statistics = statistics[valid_mask]
- valid_statistics_indexes = np.flatnonzero(valid_mask)
-
- if invalid_mask.any():
- missing = np.arange(X.shape[1])[invalid_mask]
- if self.verbose:
- warnings.warn("Deleting features without "
- "observed values: %s" % missing)
- X = X[:, valid_statistics_indexes]
-
- # Do actual imputation
- if sparse.issparse(X):
- if self.missing_values == 0:
- raise ValueError("Imputation not possible when missing_values "
- "== 0 and input is sparse. Provide a dense "
- "array instead.")
- else:
- mask = _get_mask(X.data, self.missing_values)
- indexes = np.repeat(np.arange(len(X.indptr) - 1, dtype=np.int),
- np.diff(X.indptr))[mask]
-
- X.data[mask] = valid_statistics[indexes].astype(X.dtype,
- copy=False)
- else:
- mask = _get_mask(X, self.missing_values)
- n_missing = np.sum(mask, axis=0)
- values = np.repeat(valid_statistics, n_missing)
- coordinates = np.where(mask.transpose())[::-1]
-
- X[coordinates] = values
-
- if self.add_indicator:
- hstack = sparse.hstack if sparse.issparse(X) else np.hstack
- X = hstack((X, X_trans_indicator))
-
- return X
-
- def _more_tags(self):
- return {'allow_nan': True}
-
-
-class IterativeImputer(BaseEstimator, TransformerMixin):
- """Multivariate imputer that estimates each feature from all the others.
-
- A strategy for imputing missing values by modeling each feature with
- missing values as a function of other features in a round-robin fashion.
-
- Read more in the :ref:`User Guide `.
-
- Parameters
- ----------
- estimator : estimator object, default=BayesianRidge()
- The estimator to use at each step of the round-robin imputation.
- If ``sample_posterior`` is True, the estimator must support
- ``return_std`` in its ``predict`` method.
-
- missing_values : int, np.nan, optional (default=np.nan)
- The placeholder for the missing values. All occurrences of
- ``missing_values`` will be imputed.
-
- sample_posterior : boolean, default=False
- Whether to sample from the (Gaussian) predictive posterior of the
- fitted estimator for each imputation. Estimator must support
- ``return_std`` in its ``predict`` method if set to ``True``. Set to
- ``True`` if using ``IterativeImputer`` for multiple imputations.
-
- max_iter : int, optional (default=10)
- Maximum number of imputation rounds to perform before returning the
- imputations computed during the final round. A round is a single
- imputation of each feature with missing values. The stopping criterion
- is met once `abs(max(X_t - X_{t-1}))/abs(max(X[known_vals]))` < tol,
- where `X_t` is `X` at iteration `t. Note that early stopping is only
- applied if ``sample_posterior=False``.
-
- tol : float, optional (default=1e-3)
- Tolerance of the stopping condition.
-
- n_nearest_features : int, optional (default=None)
- Number of other features to use to estimate the missing values of
- each feature column. Nearness between features is measured using
- the absolute correlation coefficient between each feature pair (after
- initial imputation). To ensure coverage of features throughout the
- imputation process, the neighbor features are not necessarily nearest,
- but are drawn with probability proportional to correlation for each
- imputed target feature. Can provide significant speed-up when the
- number of features is huge. If ``None``, all features will be used.
-
- initial_strategy : str, optional (default="mean")
- Which strategy to use to initialize the missing values. Same as the
- ``strategy`` parameter in :class:`sklearn.impute.SimpleImputer`
- Valid values: {"mean", "median", "most_frequent", or "constant"}.
-
- imputation_order : str, optional (default="ascending")
- The order in which the features will be imputed. Possible values:
-
- "ascending"
- From features with fewest missing values to most.
- "descending"
- From features with most missing values to fewest.
- "roman"
- Left to right.
- "arabic"
- Right to left.
- "random"
- A random order for each round.
-
- min_value : float, optional (default=None)
- Minimum possible imputed value. Default of ``None`` will set minimum
- to negative infinity.
-
- max_value : float, optional (default=None)
- Maximum possible imputed value. Default of ``None`` will set maximum
- to positive infinity.
-
- verbose : int, optional (default=0)
- Verbosity flag, controls the debug messages that are issued
- as functions are evaluated. The higher, the more verbose. Can be 0, 1,
- or 2.
-
- random_state : int, RandomState instance or None, optional (default=None)
- The seed of the pseudo random number generator to use. Randomizes
- selection of estimator features if n_nearest_features is not None, the
- ``imputation_order`` if ``random``, and the sampling from posterior if
- ``sample_posterior`` is True. Use an integer for determinism.
- See :term:`the Glossary `.
-
- add_indicator : boolean, optional (default=False)
- If True, a `MissingIndicator` transform will stack onto output
- of the imputer's transform. This allows a predictive estimator
- to account for missingness despite imputation. If a feature has no
- missing values at fit/train time, the feature won't appear on
- the missing indicator even if there are missing values at
- transform/test time.
-
- Attributes
- ----------
- initial_imputer_ : object of type :class:`sklearn.impute.SimpleImputer`
- Imputer used to initialize the missing values.
-
- imputation_sequence_ : list of tuples
- Each tuple has ``(feat_idx, neighbor_feat_idx, estimator)``, where
- ``feat_idx`` is the current feature to be imputed,
- ``neighbor_feat_idx`` is the array of other features used to impute the
- current feature, and ``estimator`` is the trained estimator used for
- the imputation. Length is ``self.n_features_with_missing_ *
- self.n_iter_``.
-
- n_iter_ : int
- Number of iteration rounds that occurred. Will be less than
- ``self.max_iter`` if early stopping criterion was reached.
-
- n_features_with_missing_ : int
- Number of features with missing values.
-
- indicator_ : :class:`sklearn.impute.MissingIndicator`
- Indicator used to add binary indicators for missing values.
- ``None`` if add_indicator is False.
-
- See also
- --------
- SimpleImputer : Univariate imputation of missing values.
-
- Notes
- -----
- To support imputation in inductive mode we store each feature's estimator
- during the ``fit`` phase, and predict without refitting (in order) during
- the ``transform`` phase.
-
- Features which contain all missing values at ``fit`` are discarded upon
- ``transform``.
-
- Features with missing values during ``transform`` which did not have any
- missing values during ``fit`` will be imputed with the initial imputation
- method only.
-
- References
- ----------
- .. [1] `Stef van Buuren, Karin Groothuis-Oudshoorn (2011). "mice:
- Multivariate Imputation by Chained Equations in R". Journal of
- Statistical Software 45: 1-67.
- `_
-
- .. [2] `S. F. Buck, (1960). "A Method of Estimation of Missing Values in
- Multivariate Data Suitable for use with an Electronic Computer".
- Journal of the Royal Statistical Society 22(2): 302-306.
- `_
- """
-
- def __init__(self,
- estimator=None,
- missing_values=np.nan,
- sample_posterior=False,
- max_iter=10,
- tol=1e-3,
- n_nearest_features=None,
- initial_strategy="mean",
- imputation_order='ascending',
- min_value=None,
- max_value=None,
- verbose=0,
- random_state=None,
- add_indicator=False):
-
- self.estimator = estimator
- self.missing_values = missing_values
- self.sample_posterior = sample_posterior
- self.max_iter = max_iter
- self.tol = tol
- self.n_nearest_features = n_nearest_features
- self.initial_strategy = initial_strategy
- self.imputation_order = imputation_order
- self.min_value = min_value
- self.max_value = max_value
- self.verbose = verbose
- self.random_state = random_state
- self.add_indicator = add_indicator
-
- def _impute_one_feature(self,
- X_filled,
- mask_missing_values,
- feat_idx,
- neighbor_feat_idx,
- estimator=None,
- fit_mode=True):
- """Impute a single feature from the others provided.
-
- This function predicts the missing values of one of the features using
- the current estimates of all the other features. The ``estimator`` must
- support ``return_std=True`` in its ``predict`` method for this function
- to work.
-
- Parameters
- ----------
- X_filled : ndarray
- Input data with the most recent imputations.
-
- mask_missing_values : ndarray
- Input data's missing indicator matrix.
-
- feat_idx : int
- Index of the feature currently being imputed.
-
- neighbor_feat_idx : ndarray
- Indices of the features to be used in imputing ``feat_idx``.
-
- estimator : object
- The estimator to use at this step of the round-robin imputation.
- If ``sample_posterior`` is True, the estimator must support
- ``return_std`` in its ``predict`` method.
- If None, it will be cloned from self._estimator.
-
- fit_mode : boolean, default=True
- Whether to fit and predict with the estimator or just predict.
-
- Returns
- -------
- X_filled : ndarray
- Input data with ``X_filled[missing_row_mask, feat_idx]`` updated.
-
- estimator : estimator with sklearn API
- The fitted estimator used to impute
- ``X_filled[missing_row_mask, feat_idx]``.
- """
-
- # if nothing is missing, just return the default
- # (should not happen at fit time because feat_ids would be excluded)
- missing_row_mask = mask_missing_values[:, feat_idx]
- if not np.any(missing_row_mask):
- return X_filled, estimator
-
- if estimator is None and fit_mode is False:
- raise ValueError("If fit_mode is False, then an already-fitted "
- "estimator should be passed in.")
-
- if estimator is None:
- estimator = clone(self._estimator)
-
- if fit_mode:
- X_train = safe_indexing(X_filled[:, neighbor_feat_idx],
- ~missing_row_mask)
- y_train = safe_indexing(X_filled[:, feat_idx],
- ~missing_row_mask)
- estimator.fit(X_train, y_train)
-
- # get posterior samples
- X_test = safe_indexing(X_filled[:, neighbor_feat_idx],
- missing_row_mask)
- if self.sample_posterior:
- mus, sigmas = estimator.predict(X_test, return_std=True)
- imputed_values = np.zeros(mus.shape, dtype=X_filled.dtype)
- # two types of problems: (1) non-positive sigmas, (2) mus outside
- # legal range of min_value and max_value (results in inf sample)
- positive_sigmas = sigmas > 0
- imputed_values[~positive_sigmas] = mus[~positive_sigmas]
- mus_too_low = mus < self._min_value
- imputed_values[mus_too_low] = self._min_value
- mus_too_high = mus > self._max_value
- imputed_values[mus_too_high] = self._max_value
- # the rest can be sampled without statistical issues
- inrange_mask = positive_sigmas & ~mus_too_low & ~mus_too_high
- mus = mus[inrange_mask]
- sigmas = sigmas[inrange_mask]
- a = (self._min_value - mus) / sigmas
- b = (self._max_value - mus) / sigmas
-
- if scipy.__version__ < LooseVersion('0.18'):
- # bug with vector-valued `a` in old scipy
- imputed_values[inrange_mask] = [
- stats.truncnorm(a=a_, b=b_,
- loc=loc_, scale=scale_).rvs(
- random_state=self.random_state_)
- for a_, b_, loc_, scale_
- in zip(a, b, mus, sigmas)]
- else:
- truncated_normal = stats.truncnorm(a=a, b=b,
- loc=mus, scale=sigmas)
- imputed_values[inrange_mask] = truncated_normal.rvs(
- random_state=self.random_state_)
- else:
- imputed_values = estimator.predict(X_test)
- imputed_values = np.clip(imputed_values,
- self._min_value,
- self._max_value)
-
- # update the feature
- X_filled[missing_row_mask, feat_idx] = imputed_values
- return X_filled, estimator
-
- def _get_neighbor_feat_idx(self,
- n_features,
- feat_idx,
- abs_corr_mat):
- """Get a list of other features to predict ``feat_idx``.
-
- If self.n_nearest_features is less than or equal to the total
- number of features, then use a probability proportional to the absolute
- correlation between ``feat_idx`` and each other feature to randomly
- choose a subsample of the other features (without replacement).
-
- Parameters
- ----------
- n_features : int
- Number of features in ``X``.
-
- feat_idx : int
- Index of the feature currently being imputed.
-
- abs_corr_mat : ndarray, shape (n_features, n_features)
- Absolute correlation matrix of ``X``. The diagonal has been zeroed
- out and each feature has been normalized to sum to 1. Can be None.
-
- Returns
- -------
- neighbor_feat_idx : array-like
- The features to use to impute ``feat_idx``.
- """
- if (self.n_nearest_features is not None and
- self.n_nearest_features < n_features):
- p = abs_corr_mat[:, feat_idx]
- neighbor_feat_idx = self.random_state_.choice(
- np.arange(n_features), self.n_nearest_features, replace=False,
- p=p)
- else:
- inds_left = np.arange(feat_idx)
- inds_right = np.arange(feat_idx + 1, n_features)
- neighbor_feat_idx = np.concatenate((inds_left, inds_right))
- return neighbor_feat_idx
-
- def _get_ordered_idx(self, mask_missing_values):
- """Decide in what order we will update the features.
-
- As a homage to the MICE R package, we will have 4 main options of
- how to order the updates, and use a random order if anything else
- is specified.
-
- Also, this function skips features which have no missing values.
-
- Parameters
- ----------
- mask_missing_values : array-like, shape (n_samples, n_features)
- Input data's missing indicator matrix, where "n_samples" is the
- number of samples and "n_features" is the number of features.
-
- Returns
- -------
- ordered_idx : ndarray, shape (n_features,)
- The order in which to impute the features.
- """
- frac_of_missing_values = mask_missing_values.mean(axis=0)
- missing_values_idx = np.nonzero(frac_of_missing_values)[0]
- if self.imputation_order == 'roman':
- ordered_idx = missing_values_idx
- elif self.imputation_order == 'arabic':
- ordered_idx = missing_values_idx[::-1]
- elif self.imputation_order == 'ascending':
- n = len(frac_of_missing_values) - len(missing_values_idx)
- ordered_idx = np.argsort(frac_of_missing_values,
- kind='mergesort')[n:][::-1]
- elif self.imputation_order == 'descending':
- n = len(frac_of_missing_values) - len(missing_values_idx)
- ordered_idx = np.argsort(frac_of_missing_values,
- kind='mergesort')[n:]
- elif self.imputation_order == 'random':
- ordered_idx = missing_values_idx
- self.random_state_.shuffle(ordered_idx)
- else:
- raise ValueError("Got an invalid imputation order: '{0}'. It must "
- "be one of the following: 'roman', 'arabic', "
- "'ascending', 'descending', or "
- "'random'.".format(self.imputation_order))
- return ordered_idx
-
- def _get_abs_corr_mat(self, X_filled, tolerance=1e-6):
- """Get absolute correlation matrix between features.
-
- Parameters
- ----------
- X_filled : ndarray, shape (n_samples, n_features)
- Input data with the most recent imputations.
-
- tolerance : float, optional (default=1e-6)
- ``abs_corr_mat`` can have nans, which will be replaced
- with ``tolerance``.
-
- Returns
- -------
- abs_corr_mat : ndarray, shape (n_features, n_features)
- Absolute correlation matrix of ``X`` at the beginning of the
- current round. The diagonal has been zeroed out and each feature's
- absolute correlations with all others have been normalized to sum
- to 1.
- """
- n_features = X_filled.shape[1]
- if (self.n_nearest_features is None or
- self.n_nearest_features >= n_features):
- return None
- abs_corr_mat = np.abs(np.corrcoef(X_filled.T))
- # np.corrcoef is not defined for features with zero std
- abs_corr_mat[np.isnan(abs_corr_mat)] = tolerance
- # ensures exploration, i.e. at least some probability of sampling
- np.clip(abs_corr_mat, tolerance, None, out=abs_corr_mat)
- # features are not their own neighbors
- np.fill_diagonal(abs_corr_mat, 0)
- # needs to sum to 1 for np.random.choice sampling
- abs_corr_mat = normalize(abs_corr_mat, norm='l1', axis=0, copy=False)
- return abs_corr_mat
-
- def _initial_imputation(self, X):
- """Perform initial imputation for input X.
-
- Parameters
- ----------
- X : ndarray, shape (n_samples, n_features)
- Input data, where "n_samples" is the number of samples and
- "n_features" is the number of features.
-
- Returns
- -------
- Xt : ndarray, shape (n_samples, n_features)
- Input data, where "n_samples" is the number of samples and
- "n_features" is the number of features.
-
- X_filled : ndarray, shape (n_samples, n_features)
- Input data with the most recent imputations.
-
- mask_missing_values : ndarray, shape (n_samples, n_features)
- Input data's missing indicator matrix, where "n_samples" is the
- number of samples and "n_features" is the number of features.
- """
- if is_scalar_nan(self.missing_values):
- force_all_finite = "allow-nan"
- else:
- force_all_finite = True
-
- X = check_array(X, dtype=FLOAT_DTYPES, order="F",
- force_all_finite=force_all_finite)
- _check_inputs_dtype(X, self.missing_values)
-
- mask_missing_values = _get_mask(X, self.missing_values)
- if self.initial_imputer_ is None:
- self.initial_imputer_ = SimpleImputer(
- missing_values=self.missing_values,
- strategy=self.initial_strategy)
- X_filled = self.initial_imputer_.fit_transform(X)
- else:
- X_filled = self.initial_imputer_.transform(X)
-
- valid_mask = np.flatnonzero(np.logical_not(
- np.isnan(self.initial_imputer_.statistics_)))
- Xt = X[:, valid_mask]
- mask_missing_values = mask_missing_values[:, valid_mask]
-
- return Xt, X_filled, mask_missing_values
-
- def fit_transform(self, X, y=None):
- """Fits the imputer on X and return the transformed X.
-
- Parameters
- ----------
- X : array-like, shape (n_samples, n_features)
- Input data, where "n_samples" is the number of samples and
- "n_features" is the number of features.
-
- y : ignored.
-
- Returns
- -------
- Xt : array-like, shape (n_samples, n_features)
- The imputed input data.
- """
- self.random_state_ = getattr(self, "random_state_",
- check_random_state(self.random_state))
-
- if self.max_iter < 0:
- raise ValueError(
- "'max_iter' should be a positive integer. Got {} instead."
- .format(self.max_iter))
-
- if self.tol < 0:
- raise ValueError(
- "'tol' should be a non-negative float. Got {} instead."
- .format(self.tol)
- )
-
- if self.add_indicator:
- self.indicator_ = MissingIndicator(
- missing_values=self.missing_values)
- X_trans_indicator = self.indicator_.fit_transform(X)
- else:
- self.indicator_ = None
-
- if self.estimator is None:
- from .linear_model import BayesianRidge
- self._estimator = BayesianRidge()
- else:
- self._estimator = clone(self.estimator)
-
- self.imputation_sequence_ = []
-
- if hasattr(self._estimator, 'random_state'):
- self._estimator.random_state = self.random_state_
-
- self._min_value = -np.inf if self.min_value is None else self.min_value
- self._max_value = np.inf if self.max_value is None else self.max_value
-
- self.initial_imputer_ = None
- X, Xt, mask_missing_values = self._initial_imputation(X)
-
- if self.max_iter == 0 or np.all(mask_missing_values):
- self.n_iter_ = 0
- return Xt
-
- # order in which to impute
- # note this is probably too slow for large feature data (d > 100000)
- # and a better way would be good.
- # see: https://goo.gl/KyCNwj and subsequent comments
- ordered_idx = self._get_ordered_idx(mask_missing_values)
- self.n_features_with_missing_ = len(ordered_idx)
-
- abs_corr_mat = self._get_abs_corr_mat(Xt)
-
- n_samples, n_features = Xt.shape
- if self.verbose > 0:
- print("[IterativeImputer] Completing matrix with shape %s"
- % (X.shape,))
- start_t = time()
- if not self.sample_posterior:
- Xt_previous = Xt.copy()
- normalized_tol = self.tol * np.max(np.abs(X[~mask_missing_values]))
- for self.n_iter_ in range(1, self.max_iter + 1):
- if self.imputation_order == 'random':
- ordered_idx = self._get_ordered_idx(mask_missing_values)
-
- for feat_idx in ordered_idx:
- neighbor_feat_idx = self._get_neighbor_feat_idx(n_features,
- feat_idx,
- abs_corr_mat)
- Xt, estimator = self._impute_one_feature(
- Xt, mask_missing_values, feat_idx, neighbor_feat_idx,
- estimator=None, fit_mode=True)
- estimator_triplet = ImputerTriplet(feat_idx,
- neighbor_feat_idx,
- estimator)
- self.imputation_sequence_.append(estimator_triplet)
-
- if self.verbose > 1:
- print('[IterativeImputer] Ending imputation round '
- '%d/%d, elapsed time %0.2f'
- % (self.n_iter_, self.max_iter, time() - start_t))
-
- if not self.sample_posterior:
- inf_norm = np.linalg.norm(Xt - Xt_previous, ord=np.inf,
- axis=None)
- if inf_norm < normalized_tol:
- if self.verbose > 0:
- print('[IterativeImputer] Early stopping criterion '
- 'reached.')
- break
- Xt_previous = Xt.copy()
- else:
- if not self.sample_posterior:
- warnings.warn("[IterativeImputer] Early stopping criterion not"
- " reached.", ConvergenceWarning)
- Xt[~mask_missing_values] = X[~mask_missing_values]
-
- if self.add_indicator:
- Xt = np.hstack((Xt, X_trans_indicator))
- return Xt
-
- def transform(self, X):
- """Imputes all missing values in X.
-
- Note that this is stochastic, and that if random_state is not fixed,
- repeated calls, or permuted input, will yield different results.
-
- Parameters
- ----------
- X : array-like, shape = [n_samples, n_features]
- The input data to complete.
-
- Returns
- -------
- Xt : array-like, shape (n_samples, n_features)
- The imputed input data.
- """
- check_is_fitted(self, 'initial_imputer_')
-
- if self.add_indicator:
- X_trans_indicator = self.indicator_.transform(X)
-
- X, Xt, mask_missing_values = self._initial_imputation(X)
-
- if self.n_iter_ == 0 or np.all(mask_missing_values):
- return Xt
-
- imputations_per_round = len(self.imputation_sequence_) // self.n_iter_
- i_rnd = 0
- if self.verbose > 0:
- print("[IterativeImputer] Completing matrix with shape %s"
- % (X.shape,))
- start_t = time()
- for it, estimator_triplet in enumerate(self.imputation_sequence_):
- Xt, _ = self._impute_one_feature(
- Xt,
- mask_missing_values,
- estimator_triplet.feat_idx,
- estimator_triplet.neighbor_feat_idx,
- estimator=estimator_triplet.estimator,
- fit_mode=False
- )
- if not (it + 1) % imputations_per_round:
- if self.verbose > 1:
- print('[IterativeImputer] Ending imputation round '
- '%d/%d, elapsed time %0.2f'
- % (i_rnd + 1, self.n_iter_, time() - start_t))
- i_rnd += 1
-
- Xt[~mask_missing_values] = X[~mask_missing_values]
-
- if self.add_indicator:
- Xt = np.hstack((Xt, X_trans_indicator))
- return Xt
-
- def fit(self, X, y=None):
- """Fits the imputer on X and return self.
-
- Parameters
- ----------
- X : array-like, shape (n_samples, n_features)
- Input data, where "n_samples" is the number of samples and
- "n_features" is the number of features.
-
- y : ignored
-
- Returns
- -------
- self : object
- Returns self.
- """
- self.fit_transform(X)
- return self
-
- def _more_tags(self):
- return {'allow_nan': True}
-
-
-class MissingIndicator(BaseEstimator, TransformerMixin):
- """Binary indicators for missing values.
-
- Note that this component typically should not be used in a vanilla
- :class:`Pipeline` consisting of transformers and a classifier, but rather
- could be added using a :class:`FeatureUnion` or :class:`ColumnTransformer`.
-
- Read more in the :ref:`User Guide `.
-
- Parameters
- ----------
- missing_values : number, string, np.nan (default) or None
- The placeholder for the missing values. All occurrences of
- `missing_values` will be indicated (True in the output array), the
- other values will be marked as False.
-
- features : str, optional
- Whether the imputer mask should represent all or a subset of
- features.
-
- - If "missing-only" (default), the imputer mask will only represent
- features containing missing values during fit time.
- - If "all", the imputer mask will represent all features.
-
- sparse : boolean or "auto", optional
- Whether the imputer mask format should be sparse or dense.
-
- - If "auto" (default), the imputer mask will be of same type as
- input.
- - If True, the imputer mask will be a sparse matrix.
- - If False, the imputer mask will be a numpy array.
-
- error_on_new : boolean, optional
- If True (default), transform will raise an error when there are
- features with missing values in transform that have no missing values
- in fit. This is applicable only when ``features="missing-only"``.
-
- Attributes
- ----------
- features_ : ndarray, shape (n_missing_features,) or (n_features,)
- The features indices which will be returned when calling ``transform``.
- They are computed during ``fit``. For ``features='all'``, it is
- to ``range(n_features)``.
-
- Examples
- --------
- >>> import numpy as np
- >>> from sklearn.impute import MissingIndicator
- >>> X1 = np.array([[np.nan, 1, 3],
- ... [4, 0, np.nan],
- ... [8, 1, 0]])
- >>> X2 = np.array([[5, 1, np.nan],
- ... [np.nan, 2, 3],
- ... [2, 4, 0]])
- >>> indicator = MissingIndicator()
- >>> indicator.fit(X1) # doctest: +NORMALIZE_WHITESPACE
- MissingIndicator(error_on_new=True, features='missing-only',
- missing_values=nan, sparse='auto')
- >>> X2_tr = indicator.transform(X2)
- >>> X2_tr
- array([[False, True],
- [ True, False],
- [False, False]])
-
- """
-
- def __init__(self, missing_values=np.nan, features="missing-only",
- sparse="auto", error_on_new=True):
- self.missing_values = missing_values
- self.features = features
- self.sparse = sparse
- self.error_on_new = error_on_new
-
- def _get_missing_features_info(self, X):
- """Compute the imputer mask and the indices of the features
- containing missing values.
-
- Parameters
- ----------
- X : {ndarray or sparse matrix}, shape (n_samples, n_features)
- The input data with missing values. Note that ``X`` has been
- checked in ``fit`` and ``transform`` before to call this function.
-
- Returns
- -------
- imputer_mask : {ndarray or sparse matrix}, shape \
-(n_samples, n_features) or (n_samples, n_features_with_missing)
- The imputer mask of the original data.
-
- features_with_missing : ndarray, shape (n_features_with_missing)
- The features containing missing values.
-
- """
- if sparse.issparse(X):
- mask = _get_mask(X.data, self.missing_values)
-
- # The imputer mask will be constructed with the same sparse format
- # as X.
- sparse_constructor = (sparse.csr_matrix if X.format == 'csr'
- else sparse.csc_matrix)
- imputer_mask = sparse_constructor(
- (mask, X.indices.copy(), X.indptr.copy()),
- shape=X.shape, dtype=bool)
- imputer_mask.eliminate_zeros()
-
- if self.features == 'missing-only':
- n_missing = imputer_mask.getnnz(axis=0)
-
- if self.sparse is False:
- imputer_mask = imputer_mask.toarray()
- elif imputer_mask.format == 'csr':
- imputer_mask = imputer_mask.tocsc()
- else:
- imputer_mask = _get_mask(X, self.missing_values)
-
- if self.features == 'missing-only':
- n_missing = imputer_mask.sum(axis=0)
-
- if self.sparse is True:
- imputer_mask = sparse.csc_matrix(imputer_mask)
-
- if self.features == 'all':
- features_indices = np.arange(X.shape[1])
- else:
- features_indices = np.flatnonzero(n_missing)
-
- return imputer_mask, features_indices
-
- def _validate_input(self, X):
- if not is_scalar_nan(self.missing_values):
- force_all_finite = True
- else:
- force_all_finite = "allow-nan"
- X = check_array(X, accept_sparse=('csc', 'csr'), dtype=None,
- force_all_finite=force_all_finite)
- _check_inputs_dtype(X, self.missing_values)
- if X.dtype.kind not in ("i", "u", "f", "O"):
- raise ValueError("MissingIndicator does not support data with "
- "dtype {0}. Please provide either a numeric array"
- " (with a floating point or integer dtype) or "
- "categorical data represented either as an array "
- "with integer dtype or an array of string values "
- "with an object dtype.".format(X.dtype))
-
- if sparse.issparse(X) and self.missing_values == 0:
- # missing_values = 0 not allowed with sparse data as it would
- # force densification
- raise ValueError("Sparse input with missing_values=0 is "
- "not supported. Provide a dense "
- "array instead.")
-
- return X
-
- def fit(self, X, y=None):
- """Fit the transformer on X.
-
- Parameters
- ----------
- X : {array-like, sparse matrix}, shape (n_samples, n_features)
- Input data, where ``n_samples`` is the number of samples and
- ``n_features`` is the number of features.
-
- Returns
- -------
- self : object
- Returns self.
- """
- X = self._validate_input(X)
- self._n_features = X.shape[1]
-
- if self.features not in ('missing-only', 'all'):
- raise ValueError("'features' has to be either 'missing-only' or "
- "'all'. Got {} instead.".format(self.features))
-
- if not ((isinstance(self.sparse, str) and
- self.sparse == "auto") or isinstance(self.sparse, bool)):
- raise ValueError("'sparse' has to be a boolean or 'auto'. "
- "Got {!r} instead.".format(self.sparse))
-
- self.features_ = self._get_missing_features_info(X)[1]
-
- return self
-
- def transform(self, X):
- """Generate missing values indicator for X.
-
- Parameters
- ----------
- X : {array-like, sparse matrix}, shape (n_samples, n_features)
- The input data to complete.
-
- Returns
- -------
- Xt : {ndarray or sparse matrix}, shape (n_samples, n_features)
- The missing indicator for input data. The data type of ``Xt``
- will be boolean.
-
- """
- check_is_fitted(self, "features_")
- X = self._validate_input(X)
-
- if X.shape[1] != self._n_features:
- raise ValueError("X has a different number of features "
- "than during fitting.")
-
- imputer_mask, features = self._get_missing_features_info(X)
-
- if self.features == "missing-only":
- features_diff_fit_trans = np.setdiff1d(features, self.features_)
- if (self.error_on_new and features_diff_fit_trans.size > 0):
- raise ValueError("The features {} have missing values "
- "in transform but have no missing values "
- "in fit.".format(features_diff_fit_trans))
-
- if self.features_.size < self._n_features:
- imputer_mask = imputer_mask[:, self.features_]
-
- return imputer_mask
-
- def fit_transform(self, X, y=None):
- """Generate missing values indicator for X.
-
- Parameters
- ----------
- X : {array-like, sparse matrix}, shape (n_samples, n_features)
- The input data to complete.
-
- Returns
- -------
- Xt : {ndarray or sparse matrix}, shape (n_samples, n_features)
- The missing indicator for input data. The data type of ``Xt``
- will be boolean.
-
- """
- return self.fit(X, y).transform(X)
-
- def _more_tags(self):
- return {'allow_nan': True,
- 'X_types': ['2darray', 'str']}
diff --git a/sklearn/impute/__init__.py b/sklearn/impute/__init__.py
new file mode 100644
index 0000000000000..abeb4d471f5f3
--- /dev/null
+++ b/sklearn/impute/__init__.py
@@ -0,0 +1,8 @@
+"""Transformers for missing value imputation"""
+
+from ._base import MissingIndicator, SimpleImputer
+
+__all__ = [
+ 'MissingIndicator',
+ 'SimpleImputer',
+]
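
A short sketch (illustrative only) of what the package split means for users, following the new ``__init__`` above and the ``enable_iterative_imputer`` module earlier in this diff: ``SimpleImputer`` and ``MissingIndicator`` keep their public import path, while ``IterativeImputer`` only becomes available after the experimental import:

>>> import sklearn.impute
>>> from sklearn.impute import SimpleImputer, MissingIndicator  # unchanged
>>> hasattr(sklearn.impute, 'IterativeImputer')
False
>>> from sklearn.experimental import enable_iterative_imputer  # noqa
>>> hasattr(sklearn.impute, 'IterativeImputer')
True
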
diff --git a/sklearn/impute/_base.py b/sklearn/impute/_base.py
new file mode 100644
index 0000000000000..7be9da691ce11
--- /dev/null
+++ b/sklearn/impute/_base.py
@@ -0,0 +1,675 @@
+# Authors: Nicolas Tresegnie
+# Sergey Feldman
+# License: BSD 3 clause
+
+from __future__ import division
+
+import warnings
+import numbers
+
+import numpy as np
+import numpy.ma as ma
+from scipy import sparse
+from scipy import stats
+
+from ..base import BaseEstimator, TransformerMixin
+from ..utils.sparsefuncs import _get_median
+from ..utils.validation import check_is_fitted
+from ..utils.validation import FLOAT_DTYPES
+from ..utils.fixes import _object_dtype_isnan
+from ..utils import is_scalar_nan
+from ..utils import check_array
+
+
+def _check_inputs_dtype(X, missing_values):
+ if (X.dtype.kind in ("f", "i", "u") and
+ not isinstance(missing_values, numbers.Real)):
+ raise ValueError("'X' and 'missing_values' types are expected to be"
+ " both numerical. Got X.dtype={} and "
+ " type(missing_values)={}."
+ .format(X.dtype, type(missing_values)))
+
+
+def _get_mask(X, value_to_mask):
+ """Compute the boolean mask X == missing_values."""
+ if is_scalar_nan(value_to_mask):
+ if X.dtype.kind == "f":
+ return np.isnan(X)
+ elif X.dtype.kind in ("i", "u"):
+ # can't have NaNs in integer array.
+ return np.zeros(X.shape, dtype=bool)
+ else:
+ # np.isnan does not work on object dtypes.
+ return _object_dtype_isnan(X)
+ else:
+ # X == value_to_mask with object dtypes does not always perform
+ # element-wise for old versions of numpy
+ return np.equal(X, value_to_mask)
+
+
+def _most_frequent(array, extra_value, n_repeat):
+ """Compute the most frequent value in a 1d array extended with
+ [extra_value] * n_repeat, where extra_value is assumed not to be part
+ of the array."""
+ # Compute the most frequent value in array only
+ if array.size > 0:
+ with warnings.catch_warnings():
+ # stats.mode raises a warning when the input array contains objects
+ # because it cannot detect NaNs. Irrelevant here since the input array
+ # has already been NaN-masked.
+ warnings.simplefilter("ignore", RuntimeWarning)
+ mode = stats.mode(array)
+
+ most_frequent_value = mode[0][0]
+ most_frequent_count = mode[1][0]
+ else:
+ most_frequent_value = 0
+ most_frequent_count = 0
+
+ # Compare to array + [extra_value] * n_repeat
+ if most_frequent_count == 0 and n_repeat == 0:
+ return np.nan
+ elif most_frequent_count < n_repeat:
+ return extra_value
+ elif most_frequent_count > n_repeat:
+ return most_frequent_value
+ elif most_frequent_count == n_repeat:
+ # Tie break: copy the behaviour of scipy.stats.mode
+ if most_frequent_value < extra_value:
+ return most_frequent_value
+ else:
+ return extra_value
+
+
+class SimpleImputer(BaseEstimator, TransformerMixin):
+ """Imputation transformer for completing missing values.
+
+ Read more in the :ref:`User Guide `.
+
+ Parameters
+ ----------
+ missing_values : number, string, np.nan (default) or None
+ The placeholder for the missing values. All occurrences of
+ `missing_values` will be imputed.
+
+ strategy : string, optional (default="mean")
+ The imputation strategy.
+
+ - If "mean", then replace missing values using the mean along
+ each column. Can only be used with numeric data.
+ - If "median", then replace missing values using the median along
+ each column. Can only be used with numeric data.
+ - If "most_frequent", then replace missing using the most frequent
+ value along each column. Can be used with strings or numeric data.
+ - If "constant", then replace missing values with fill_value. Can be
+ used with strings or numeric data.
+
+ .. versionadded:: 0.20
+ strategy="constant" for fixed value imputation.
+
+ fill_value : string or numerical value, optional (default=None)
+ When strategy == "constant", fill_value is used to replace all
+ occurrences of missing_values.
+ If left to the default, fill_value will be 0 when imputing numerical
+ data and "missing_value" for strings or object data types.
+
+ verbose : integer, optional (default=0)
+ Controls the verbosity of the imputer.
+
+ copy : boolean, optional (default=True)
+ If True, a copy of X will be created. If False, imputation will
+ be done in-place whenever possible. Note that, in the following cases,
+ a new copy will always be made, even if `copy=False`:
+
+ - If X is not an array of floating values;
+ - If X is encoded as a CSR matrix;
+ - If add_indicator=True.
+
+ add_indicator : boolean, optional (default=False)
+ If True, a `MissingIndicator` transform will stack onto output
+ of the imputer's transform. This allows a predictive estimator
+ to account for missingness despite imputation. If a feature has no
+ missing values at fit/train time, the feature won't appear on
+ the missing indicator even if there are missing values at
+ transform/test time.
+
+ Attributes
+ ----------
+ statistics_ : array of shape (n_features,)
+ The imputation fill value for each feature.
+
+ indicator_ : :class:`sklearn.impute.MissingIndicator`
+ Indicator used to add binary indicators for missing values.
+ ``None`` if add_indicator is False.
+
+ See also
+ --------
+ IterativeImputer : Multivariate imputation of missing values.
+
+ Examples
+ --------
+ >>> import numpy as np
+ >>> from sklearn.impute import SimpleImputer
+ >>> imp_mean = SimpleImputer(missing_values=np.nan, strategy='mean')
+ >>> imp_mean.fit([[7, 2, 3], [4, np.nan, 6], [10, 5, 9]])
+ ... # doctest: +NORMALIZE_WHITESPACE
+ SimpleImputer(add_indicator=False, copy=True, fill_value=None,
+ missing_values=nan, strategy='mean', verbose=0)
+ >>> X = [[np.nan, 2, 3], [4, np.nan, 6], [10, np.nan, 9]]
+ >>> print(imp_mean.transform(X))
+ ... # doctest: +NORMALIZE_WHITESPACE
+ [[ 7. 2. 3. ]
+ [ 4. 3.5 6. ]
+ [10. 3.5 9. ]]
+
+ Notes
+ -----
+ Columns which only contained missing values at `fit` are discarded upon
+ `transform` if strategy is not "constant".
+
+ """
+ def __init__(self, missing_values=np.nan, strategy="mean",
+ fill_value=None, verbose=0, copy=True, add_indicator=False):
+ self.missing_values = missing_values
+ self.strategy = strategy
+ self.fill_value = fill_value
+ self.verbose = verbose
+ self.copy = copy
+ self.add_indicator = add_indicator
+
+ def _validate_input(self, X):
+ allowed_strategies = ["mean", "median", "most_frequent", "constant"]
+ if self.strategy not in allowed_strategies:
+ raise ValueError("Can only use these strategies: {0} "
+ " got strategy={1}".format(allowed_strategies,
+ self.strategy))
+
+ if self.strategy in ("most_frequent", "constant"):
+ dtype = None
+ else:
+ dtype = FLOAT_DTYPES
+
+ if not is_scalar_nan(self.missing_values):
+ force_all_finite = True
+ else:
+ force_all_finite = "allow-nan"
+
+ try:
+ X = check_array(X, accept_sparse='csc', dtype=dtype,
+ force_all_finite=force_all_finite, copy=self.copy)
+ except ValueError as ve:
+ if "could not convert" in str(ve):
+ raise ValueError("Cannot use {0} strategy with non-numeric "
+ "data. Received datatype :{1}."
+ "".format(self.strategy, X.dtype.kind))
+ else:
+ raise ve
+
+ _check_inputs_dtype(X, self.missing_values)
+ if X.dtype.kind not in ("i", "u", "f", "O"):
+ raise ValueError("SimpleImputer does not support data with dtype "
+ "{0}. Please provide either a numeric array (with"
+ " a floating point or integer dtype) or "
+ "categorical data represented either as an array "
+ "with integer dtype or an array of string values "
+ "with an object dtype.".format(X.dtype))
+
+ return X
+
+ def fit(self, X, y=None):
+ """Fit the imputer on X.
+
+ Parameters
+ ----------
+ X : {array-like, sparse matrix}, shape (n_samples, n_features)
+ Input data, where ``n_samples`` is the number of samples and
+ ``n_features`` is the number of features.
+
+ Returns
+ -------
+ self : SimpleImputer
+ """
+ X = self._validate_input(X)
+
+ # default fill_value is 0 for numerical input and "missing_value"
+ # otherwise
+ if self.fill_value is None:
+ if X.dtype.kind in ("i", "u", "f"):
+ fill_value = 0
+ else:
+ fill_value = "missing_value"
+ else:
+ fill_value = self.fill_value
+
+ # fill_value should be numerical in case of numerical input
+ if (self.strategy == "constant" and
+ X.dtype.kind in ("i", "u", "f") and
+ not isinstance(fill_value, numbers.Real)):
+ raise ValueError("'fill_value'={0} is invalid. Expected a "
+ "numerical value when imputing numerical "
+ "data".format(fill_value))
+
+ if sparse.issparse(X):
+ # missing_values = 0 not allowed with sparse data as it would
+ # force densification
+ if self.missing_values == 0:
+ raise ValueError("Imputation not possible when missing_values "
+ "== 0 and input is sparse. Provide a dense "
+ "array instead.")
+ else:
+ self.statistics_ = self._sparse_fit(X,
+ self.strategy,
+ self.missing_values,
+ fill_value)
+ else:
+ self.statistics_ = self._dense_fit(X,
+ self.strategy,
+ self.missing_values,
+ fill_value)
+
+ if self.add_indicator:
+ self.indicator_ = MissingIndicator(
+ missing_values=self.missing_values)
+ self.indicator_.fit(X)
+ else:
+ self.indicator_ = None
+
+ return self
+
+ def _sparse_fit(self, X, strategy, missing_values, fill_value):
+ """Fit the transformer on sparse data."""
+ mask_data = _get_mask(X.data, missing_values)
+ n_implicit_zeros = X.shape[0] - np.diff(X.indptr)
+
+ statistics = np.empty(X.shape[1])
+
+ if strategy == "constant":
+ # for constant strategy, self.statistics_ is used to store
+ # fill_value in each column
+ statistics.fill(fill_value)
+ else:
+ for i in range(X.shape[1]):
+ column = X.data[X.indptr[i]:X.indptr[i + 1]]
+ mask_column = mask_data[X.indptr[i]:X.indptr[i + 1]]
+ column = column[~mask_column]
+
+ # combine explicit and implicit zeros
+ mask_zeros = _get_mask(column, 0)
+ column = column[~mask_zeros]
+ n_explicit_zeros = mask_zeros.sum()
+ n_zeros = n_implicit_zeros[i] + n_explicit_zeros
+
+ if strategy == "mean":
+ s = column.size + n_zeros
+ statistics[i] = np.nan if s == 0 else column.sum() / s
+
+ elif strategy == "median":
+ statistics[i] = _get_median(column,
+ n_zeros)
+
+ elif strategy == "most_frequent":
+ statistics[i] = _most_frequent(column,
+ 0,
+ n_zeros)
+ return statistics
+
+ def _dense_fit(self, X, strategy, missing_values, fill_value):
+ """Fit the transformer on dense data."""
+ mask = _get_mask(X, missing_values)
+ masked_X = ma.masked_array(X, mask=mask)
+
+ # Mean
+ if strategy == "mean":
+ mean_masked = np.ma.mean(masked_X, axis=0)
+ # Avoid the warning "Warning: converting a masked element to nan."
+ mean = np.ma.getdata(mean_masked)
+ mean[np.ma.getmask(mean_masked)] = np.nan
+
+ return mean
+
+ # Median
+ elif strategy == "median":
+ median_masked = np.ma.median(masked_X, axis=0)
+ # Avoid the warning "Warning: converting a masked element to nan."
+ median = np.ma.getdata(median_masked)
+ median[np.ma.getmaskarray(median_masked)] = np.nan
+
+ return median
+
+ # Most frequent
+ elif strategy == "most_frequent":
+ # scipy.stats.mstats.mode cannot be used because it will not work
+ # properly if the first element is masked and if its frequency
+ # is equal to the frequency of the most frequent valid element
+ # See https://github.com/scipy/scipy/issues/2636
+
+ # To be able access the elements by columns
+ X = X.transpose()
+ mask = mask.transpose()
+
+ if X.dtype.kind == "O":
+ most_frequent = np.empty(X.shape[0], dtype=object)
+ else:
+ most_frequent = np.empty(X.shape[0])
+
+ for i, (row, row_mask) in enumerate(zip(X[:], mask[:])):
+ row_mask = np.logical_not(row_mask).astype(np.bool)
+ row = row[row_mask]
+ most_frequent[i] = _most_frequent(row, np.nan, 0)
+
+ return most_frequent
+
+ # Constant
+ elif strategy == "constant":
+ # for constant strategy, self.statistics_ is used to store
+ # fill_value in each column
+ return np.full(X.shape[1], fill_value, dtype=X.dtype)
+
+ def transform(self, X):
+ """Impute all missing values in X.
+
+ Parameters
+ ----------
+ X : {array-like, sparse matrix}, shape (n_samples, n_features)
+ The input data to complete.
+ """
+ check_is_fitted(self, 'statistics_')
+
+ X = self._validate_input(X)
+
+ statistics = self.statistics_
+
+ if X.shape[1] != statistics.shape[0]:
+ raise ValueError("X has %d features per sample, expected %d"
+ % (X.shape[1], self.statistics_.shape[0]))
+
+ if self.add_indicator:
+ X_trans_indicator = self.indicator_.transform(X)
+
+ # Delete the invalid columns if strategy is not constant
+ if self.strategy == "constant":
+ valid_statistics = statistics
+ else:
+ # same as np.isnan but also works for object dtypes
+ invalid_mask = _get_mask(statistics, np.nan)
+ valid_mask = np.logical_not(invalid_mask)
+ valid_statistics = statistics[valid_mask]
+ valid_statistics_indexes = np.flatnonzero(valid_mask)
+
+ if invalid_mask.any():
+ missing = np.arange(X.shape[1])[invalid_mask]
+ if self.verbose:
+ warnings.warn("Deleting features without "
+ "observed values: %s" % missing)
+ X = X[:, valid_statistics_indexes]
+
+ # Do actual imputation
+ if sparse.issparse(X):
+ if self.missing_values == 0:
+ raise ValueError("Imputation not possible when missing_values "
+ "== 0 and input is sparse. Provide a dense "
+ "array instead.")
+ else:
+ mask = _get_mask(X.data, self.missing_values)
+ indexes = np.repeat(np.arange(len(X.indptr) - 1, dtype=np.int),
+ np.diff(X.indptr))[mask]
+
+ X.data[mask] = valid_statistics[indexes].astype(X.dtype,
+ copy=False)
+ else:
+ mask = _get_mask(X, self.missing_values)
+ n_missing = np.sum(mask, axis=0)
+ values = np.repeat(valid_statistics, n_missing)
+ coordinates = np.where(mask.transpose())[::-1]
+
+ X[coordinates] = values
+
+ if self.add_indicator:
+ hstack = sparse.hstack if sparse.issparse(X) else np.hstack
+ X = hstack((X, X_trans_indicator))
+
+ return X
+
+ def _more_tags(self):
+ return {'allow_nan': True}
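
For illustration, a minimal usage sketch of the ``constant`` strategy and the
``add_indicator`` option described above, on arbitrary toy data (the two
trailing columns are the stacked missing-indicator features):

    >>> import numpy as np
    >>> from sklearn.impute import SimpleImputer
    >>> X = [[np.nan, 2], [4, np.nan], [8, 3]]
    >>> imp = SimpleImputer(strategy='constant', fill_value=-1,
    ...                     add_indicator=True)
    >>> imp.fit_transform(X)
    array([[-1.,  2.,  1.,  0.],
           [ 4., -1.,  0.,  1.],
           [ 8.,  3.,  0.,  0.]])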
+
+
+class MissingIndicator(BaseEstimator, TransformerMixin):
+ """Binary indicators for missing values.
+
+ Note that this component typically should not be used in a vanilla
+ :class:`Pipeline` consisting of transformers and a classifier, but rather
+ could be added using a :class:`FeatureUnion` or :class:`ColumnTransformer`.
+
+ Read more in the :ref:`User Guide `.
+
+ Parameters
+ ----------
+ missing_values : number, string, np.nan (default) or None
+ The placeholder for the missing values. All occurrences of
+ `missing_values` will be indicated (True in the output array), the
+ other values will be marked as False.
+
+ features : str, optional
+ Whether the imputer mask should represent all or a subset of
+ features.
+
+ - If "missing-only" (default), the imputer mask will only represent
+ features containing missing values during fit time.
+ - If "all", the imputer mask will represent all features.
+
+ sparse : boolean or "auto", optional
+ Whether the imputer mask format should be sparse or dense.
+
+ - If "auto" (default), the imputer mask will be of same type as
+ input.
+ - If True, the imputer mask will be a sparse matrix.
+ - If False, the imputer mask will be a numpy array.
+
+ error_on_new : boolean, optional
+ If True (default), transform will raise an error when there are
+ features with missing values in transform that have no missing values
+ in fit. This is applicable only when ``features="missing-only"``.
+
+ Attributes
+ ----------
+ features_ : ndarray, shape (n_missing_features,) or (n_features,)
+ The features indices which will be returned when calling ``transform``.
+ They are computed during ``fit``. For ``features='all'``, it is
+ equal to ``range(n_features)``.
+
+ Examples
+ --------
+ >>> import numpy as np
+ >>> from sklearn.impute import MissingIndicator
+ >>> X1 = np.array([[np.nan, 1, 3],
+ ... [4, 0, np.nan],
+ ... [8, 1, 0]])
+ >>> X2 = np.array([[5, 1, np.nan],
+ ... [np.nan, 2, 3],
+ ... [2, 4, 0]])
+ >>> indicator = MissingIndicator()
+ >>> indicator.fit(X1) # doctest: +NORMALIZE_WHITESPACE
+ MissingIndicator(error_on_new=True, features='missing-only',
+ missing_values=nan, sparse='auto')
+ >>> X2_tr = indicator.transform(X2)
+ >>> X2_tr
+ array([[False, True],
+ [ True, False],
+ [False, False]])
+
+ """
+
+ def __init__(self, missing_values=np.nan, features="missing-only",
+ sparse="auto", error_on_new=True):
+ self.missing_values = missing_values
+ self.features = features
+ self.sparse = sparse
+ self.error_on_new = error_on_new
+
+ def _get_missing_features_info(self, X):
+ """Compute the imputer mask and the indices of the features
+ containing missing values.
+
+ Parameters
+ ----------
+ X : {ndarray or sparse matrix}, shape (n_samples, n_features)
+ The input data with missing values. Note that ``X`` has been
+ checked in ``fit`` and ``transform`` before calling this function.
+
+ Returns
+ -------
+ imputer_mask : {ndarray or sparse matrix}, shape \
+(n_samples, n_features) or (n_samples, n_features_with_missing)
+ The imputer mask of the original data.
+
+ features_with_missing : ndarray, shape (n_features_with_missing,)
+ The features containing missing values.
+
+ """
+ if sparse.issparse(X):
+ mask = _get_mask(X.data, self.missing_values)
+
+ # The imputer mask will be constructed with the same sparse format
+ # as X.
+ sparse_constructor = (sparse.csr_matrix if X.format == 'csr'
+ else sparse.csc_matrix)
+ imputer_mask = sparse_constructor(
+ (mask, X.indices.copy(), X.indptr.copy()),
+ shape=X.shape, dtype=bool)
+ imputer_mask.eliminate_zeros()
+
+ if self.features == 'missing-only':
+ n_missing = imputer_mask.getnnz(axis=0)
+
+ if self.sparse is False:
+ imputer_mask = imputer_mask.toarray()
+ elif imputer_mask.format == 'csr':
+ imputer_mask = imputer_mask.tocsc()
+ else:
+ imputer_mask = _get_mask(X, self.missing_values)
+
+ if self.features == 'missing-only':
+ n_missing = imputer_mask.sum(axis=0)
+
+ if self.sparse is True:
+ imputer_mask = sparse.csc_matrix(imputer_mask)
+
+ if self.features == 'all':
+ features_indices = np.arange(X.shape[1])
+ else:
+ features_indices = np.flatnonzero(n_missing)
+
+ return imputer_mask, features_indices
+
+ def _validate_input(self, X):
+ if not is_scalar_nan(self.missing_values):
+ force_all_finite = True
+ else:
+ force_all_finite = "allow-nan"
+ X = check_array(X, accept_sparse=('csc', 'csr'), dtype=None,
+ force_all_finite=force_all_finite)
+ _check_inputs_dtype(X, self.missing_values)
+ if X.dtype.kind not in ("i", "u", "f", "O"):
+ raise ValueError("MissingIndicator does not support data with "
+ "dtype {0}. Please provide either a numeric array"
+ " (with a floating point or integer dtype) or "
+ "categorical data represented either as an array "
+ "with integer dtype or an array of string values "
+ "with an object dtype.".format(X.dtype))
+
+ if sparse.issparse(X) and self.missing_values == 0:
+ # missing_values = 0 not allowed with sparse data as it would
+ # force densification
+ raise ValueError("Sparse input with missing_values=0 is "
+ "not supported. Provide a dense "
+ "array instead.")
+
+ return X
+
+ def fit(self, X, y=None):
+ """Fit the transformer on X.
+
+ Parameters
+ ----------
+ X : {array-like, sparse matrix}, shape (n_samples, n_features)
+ Input data, where ``n_samples`` is the number of samples and
+ ``n_features`` is the number of features.
+
+ Returns
+ -------
+ self : object
+ Returns self.
+ """
+ X = self._validate_input(X)
+ self._n_features = X.shape[1]
+
+ if self.features not in ('missing-only', 'all'):
+ raise ValueError("'features' has to be either 'missing-only' or "
+ "'all'. Got {} instead.".format(self.features))
+
+ if not ((isinstance(self.sparse, str) and
+ self.sparse == "auto") or isinstance(self.sparse, bool)):
+ raise ValueError("'sparse' has to be a boolean or 'auto'. "
+ "Got {!r} instead.".format(self.sparse))
+
+ self.features_ = self._get_missing_features_info(X)[1]
+
+ return self
+
+ def transform(self, X):
+ """Generate missing values indicator for X.
+
+ Parameters
+ ----------
+ X : {array-like, sparse matrix}, shape (n_samples, n_features)
+ The input data to complete.
+
+ Returns
+ -------
+ Xt : {ndarray or sparse matrix}, shape (n_samples, n_features)
+ The missing indicator for input data. The data type of ``Xt``
+ will be boolean.
+
+ """
+ check_is_fitted(self, "features_")
+ X = self._validate_input(X)
+
+ if X.shape[1] != self._n_features:
+ raise ValueError("X has a different number of features "
+ "than during fitting.")
+
+ imputer_mask, features = self._get_missing_features_info(X)
+
+ if self.features == "missing-only":
+ features_diff_fit_trans = np.setdiff1d(features, self.features_)
+ if (self.error_on_new and features_diff_fit_trans.size > 0):
+ raise ValueError("The features {} have missing values "
+ "in transform but have no missing values "
+ "in fit.".format(features_diff_fit_trans))
+
+ if self.features_.size < self._n_features:
+ imputer_mask = imputer_mask[:, self.features_]
+
+ return imputer_mask
+
+ def fit_transform(self, X, y=None):
+ """Generate missing values indicator for X.
+
+ Parameters
+ ----------
+ X : {array-like, sparse matrix}, shape (n_samples, n_features)
+ The input data to complete.
+
+ Returns
+ -------
+ Xt : {ndarray or sparse matrix}, shape (n_samples, n_features)
+ The missing indicator for input data. The data type of ``Xt``
+ will be boolean.
+
+ """
+ return self.fit(X, y).transform(X)
+
+ def _more_tags(self):
+ return {'allow_nan': True,
+ 'X_types': ['2darray', 'str']}
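
As the note in the MissingIndicator docstring suggests, the indicator is
typically combined with an imputer via a :class:`FeatureUnion` or
:class:`ColumnTransformer` rather than used alone. A brief sketch on
arbitrary toy data:

    >>> import numpy as np
    >>> from sklearn.pipeline import FeatureUnion, make_pipeline
    >>> from sklearn.tree import DecisionTreeClassifier
    >>> from sklearn.impute import SimpleImputer, MissingIndicator
    >>> X = np.array([[np.nan, 1.], [2., np.nan], [3., 4.]])
    >>> y = np.array([0, 1, 1])
    >>> transformer = FeatureUnion([
    ...     ('features', SimpleImputer(strategy='mean')),
    ...     ('indicators', MissingIndicator())])
    >>> clf = make_pipeline(transformer, DecisionTreeClassifier(random_state=0))
    >>> clf.fit(X, y).predict(X)
    array([0, 1, 1])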
diff --git a/sklearn/impute/_iterative.py b/sklearn/impute/_iterative.py
new file mode 100644
index 0000000000000..40df3f4059c04
--- /dev/null
+++ b/sklearn/impute/_iterative.py
@@ -0,0 +1,680 @@
+
+from time import time
+from distutils.version import LooseVersion
+from collections import namedtuple
+import warnings
+
+import scipy
+from scipy import stats
+import numpy as np
+
+from ..base import clone, BaseEstimator, TransformerMixin
+from ..exceptions import ConvergenceWarning
+from ..preprocessing import normalize
+from ..utils import check_array, check_random_state, safe_indexing
+from ..utils.validation import FLOAT_DTYPES, check_is_fitted
+from ..utils import is_scalar_nan
+
+from ._base import (_get_mask, MissingIndicator, SimpleImputer,
+ _check_inputs_dtype)
+
+
+_ImputerTriplet = namedtuple('_ImputerTriplet', ['feat_idx',
+ 'neighbor_feat_idx',
+ 'estimator'])
+
+
+class IterativeImputer(BaseEstimator, TransformerMixin):
+ """Multivariate imputer that estimates each feature from all the others.
+
+ A strategy for imputing missing values by modeling each feature with
+ missing values as a function of other features in a round-robin fashion.
+
+ Read more in the :ref:`User Guide `.
+
+ .. note::
+
+ This estimator is still **experimental** for now: the predictions
+ and the API might change without any deprecation cycle. To use it,
+ you need to explicitly import ``enable_iterative_imputer``::
+
+ >>> # explicitly require this experimental feature
+ >>> from sklearn.experimental import enable_iterative_imputer # noqa
+ >>> # now you can import normally from sklearn.impute
+ >>> from sklearn.impute import IterativeImputer
+
+ Parameters
+ ----------
+ estimator : estimator object, default=BayesianRidge()
+ The estimator to use at each step of the round-robin imputation.
+ If ``sample_posterior`` is True, the estimator must support
+ ``return_std`` in its ``predict`` method.
+
+ missing_values : int, np.nan, optional (default=np.nan)
+ The placeholder for the missing values. All occurrences of
+ ``missing_values`` will be imputed.
+
+ sample_posterior : boolean, default=False
+ Whether to sample from the (Gaussian) predictive posterior of the
+ fitted estimator for each imputation. Estimator must support
+ ``return_std`` in its ``predict`` method if set to ``True``. Set to
+ ``True`` if using ``IterativeImputer`` for multiple imputations.
+
+ max_iter : int, optional (default=10)
+ Maximum number of imputation rounds to perform before returning the
+ imputations computed during the final round. A round is a single
+ imputation of each feature with missing values. The stopping criterion
+ is met once `max(abs(X_t - X_{t-1}))/max(abs(X[known_vals]))` < tol,
+ where `X_t` is `X` at iteration `t`. Note that early stopping is only
+ applied if ``sample_posterior=False``.
+
+ tol : float, optional (default=1e-3)
+ Tolerance of the stopping condition.
+
+ n_nearest_features : int, optional (default=None)
+ Number of other features to use to estimate the missing values of
+ each feature column. Nearness between features is measured using
+ the absolute correlation coefficient between each feature pair (after
+ initial imputation). To ensure coverage of features throughout the
+ imputation process, the neighbor features are not necessarily nearest,
+ but are drawn with probability proportional to correlation for each
+ imputed target feature. Can provide significant speed-up when the
+ number of features is huge. If ``None``, all features will be used.
+
+ initial_strategy : str, optional (default="mean")
+ Which strategy to use to initialize the missing values. Same as the
+ ``strategy`` parameter in :class:`sklearn.impute.SimpleImputer`.
+ Valid values: {"mean", "median", "most_frequent", or "constant"}.
+
+ imputation_order : str, optional (default="ascending")
+ The order in which the features will be imputed. Possible values:
+
+ "ascending"
+ From features with fewest missing values to most.
+ "descending"
+ From features with most missing values to fewest.
+ "roman"
+ Left to right.
+ "arabic"
+ Right to left.
+ "random"
+ A random order for each round.
+
+ min_value : float, optional (default=None)
+ Minimum possible imputed value. Default of ``None`` will set minimum
+ to negative infinity.
+
+ max_value : float, optional (default=None)
+ Maximum possible imputed value. Default of ``None`` will set maximum
+ to positive infinity.
+
+ verbose : int, optional (default=0)
+ Verbosity flag, controls the debug messages that are issued
+ as functions are evaluated. The higher, the more verbose. Can be 0, 1,
+ or 2.
+
+ random_state : int, RandomState instance or None, optional (default=None)
+ The seed of the pseudo random number generator to use. Randomizes
+ selection of estimator features if n_nearest_features is not None, the
+ ``imputation_order`` if ``random``, and the sampling from posterior if
+ ``sample_posterior`` is True. Use an integer for determinism.
+ See :term:`the Glossary `.
+
+ add_indicator : boolean, optional (default=False)
+ If True, a `MissingIndicator` transform will stack onto the output
+ of the imputer's transform. This allows a predictive estimator
+ to account for missingness despite imputation. If a feature has no
+ missing values at fit/train time, the feature won't appear on
+ the missing indicator even if there are missing values at
+ transform/test time.
+
+ Attributes
+ ----------
+ initial_imputer_ : object of type :class:`sklearn.impute.SimpleImputer`
+ Imputer used to initialize the missing values.
+
+ imputation_sequence_ : list of tuples
+ Each tuple has ``(feat_idx, neighbor_feat_idx, estimator)``, where
+ ``feat_idx`` is the current feature to be imputed,
+ ``neighbor_feat_idx`` is the array of other features used to impute the
+ current feature, and ``estimator`` is the trained estimator used for
+ the imputation. Length is ``self.n_features_with_missing_ *
+ self.n_iter_``.
+
+ n_iter_ : int
+ Number of iteration rounds that occurred. Will be less than
+ ``self.max_iter`` if early stopping criterion was reached.
+
+ n_features_with_missing_ : int
+ Number of features with missing values.
+
+ indicator_ : :class:`sklearn.impute.MissingIndicator`
+ Indicator used to add binary indicators for missing values.
+ ``None`` if add_indicator is False.
+
+ See also
+ --------
+ SimpleImputer : Univariate imputation of missing values.
+
+ Notes
+ -----
+ To support imputation in inductive mode we store each feature's estimator
+ during the ``fit`` phase, and predict without refitting (in order) during
+ the ``transform`` phase.
+
+ Features which contain all missing values at ``fit`` are discarded upon
+ ``transform``.
+
+ Features with missing values during ``transform`` which did not have any
+ missing values during ``fit`` will be imputed with the initial imputation
+ method only.
+
+ References
+ ----------
+ .. [1] `Stef van Buuren, Karin Groothuis-Oudshoorn (2011). "mice:
+ Multivariate Imputation by Chained Equations in R". Journal of
+ Statistical Software 45: 1-67.
+ `_
+
+ .. [2] `S. F. Buck, (1960). "A Method of Estimation of Missing Values in
+ Multivariate Data Suitable for use with an Electronic Computer".
+ Journal of the Royal Statistical Society 22(2): 302-306.
+ `_
+ """
+
+ def __init__(self,
+ estimator=None,
+ missing_values=np.nan,
+ sample_posterior=False,
+ max_iter=10,
+ tol=1e-3,
+ n_nearest_features=None,
+ initial_strategy="mean",
+ imputation_order='ascending',
+ min_value=None,
+ max_value=None,
+ verbose=0,
+ random_state=None,
+ add_indicator=False):
+
+ self.estimator = estimator
+ self.missing_values = missing_values
+ self.sample_posterior = sample_posterior
+ self.max_iter = max_iter
+ self.tol = tol
+ self.n_nearest_features = n_nearest_features
+ self.initial_strategy = initial_strategy
+ self.imputation_order = imputation_order
+ self.min_value = min_value
+ self.max_value = max_value
+ self.verbose = verbose
+ self.random_state = random_state
+ self.add_indicator = add_indicator
+
+ def _impute_one_feature(self,
+ X_filled,
+ mask_missing_values,
+ feat_idx,
+ neighbor_feat_idx,
+ estimator=None,
+ fit_mode=True):
+ """Impute a single feature from the others provided.
+
+ This function predicts the missing values of one of the features using
+ the current estimates of all the other features. The ``estimator`` must
+ support ``return_std=True`` in its ``predict`` method for this function
+ to work.
+
+ Parameters
+ ----------
+ X_filled : ndarray
+ Input data with the most recent imputations.
+
+ mask_missing_values : ndarray
+ Input data's missing indicator matrix.
+
+ feat_idx : int
+ Index of the feature currently being imputed.
+
+ neighbor_feat_idx : ndarray
+ Indices of the features to be used in imputing ``feat_idx``.
+
+ estimator : object
+ The estimator to use at this step of the round-robin imputation.
+ If ``sample_posterior`` is True, the estimator must support
+ ``return_std`` in its ``predict`` method.
+ If None, it will be cloned from self._estimator.
+
+ fit_mode : boolean, default=True
+ Whether to fit and predict with the estimator or just predict.
+
+ Returns
+ -------
+ X_filled : ndarray
+ Input data with ``X_filled[missing_row_mask, feat_idx]`` updated.
+
+ estimator : estimator with sklearn API
+ The fitted estimator used to impute
+ ``X_filled[missing_row_mask, feat_idx]``.
+ """
+
+ # if nothing is missing, just return the default
+ # (should not happen at fit time because feat_idx would be excluded)
+ missing_row_mask = mask_missing_values[:, feat_idx]
+ if not np.any(missing_row_mask):
+ return X_filled, estimator
+
+ if estimator is None and fit_mode is False:
+ raise ValueError("If fit_mode is False, then an already-fitted "
+ "estimator should be passed in.")
+
+ if estimator is None:
+ estimator = clone(self._estimator)
+
+ if fit_mode:
+ X_train = safe_indexing(X_filled[:, neighbor_feat_idx],
+ ~missing_row_mask)
+ y_train = safe_indexing(X_filled[:, feat_idx],
+ ~missing_row_mask)
+ estimator.fit(X_train, y_train)
+
+ # get posterior samples
+ X_test = safe_indexing(X_filled[:, neighbor_feat_idx],
+ missing_row_mask)
+ if self.sample_posterior:
+ mus, sigmas = estimator.predict(X_test, return_std=True)
+ imputed_values = np.zeros(mus.shape, dtype=X_filled.dtype)
+ # two types of problems: (1) non-positive sigmas, (2) mus outside
+ # legal range of min_value and max_value (results in inf sample)
+ positive_sigmas = sigmas > 0
+ imputed_values[~positive_sigmas] = mus[~positive_sigmas]
+ mus_too_low = mus < self._min_value
+ imputed_values[mus_too_low] = self._min_value
+ mus_too_high = mus > self._max_value
+ imputed_values[mus_too_high] = self._max_value
+ # the rest can be sampled without statistical issues
+ inrange_mask = positive_sigmas & ~mus_too_low & ~mus_too_high
+ mus = mus[inrange_mask]
+ sigmas = sigmas[inrange_mask]
+ a = (self._min_value - mus) / sigmas
+ b = (self._max_value - mus) / sigmas
+
+ if scipy.__version__ < LooseVersion('0.18'):
+ # bug with vector-valued `a` in old scipy
+ imputed_values[inrange_mask] = [
+ stats.truncnorm(a=a_, b=b_,
+ loc=loc_, scale=scale_).rvs(
+ random_state=self.random_state_)
+ for a_, b_, loc_, scale_
+ in zip(a, b, mus, sigmas)]
+ else:
+ truncated_normal = stats.truncnorm(a=a, b=b,
+ loc=mus, scale=sigmas)
+ imputed_values[inrange_mask] = truncated_normal.rvs(
+ random_state=self.random_state_)
+ else:
+ imputed_values = estimator.predict(X_test)
+ imputed_values = np.clip(imputed_values,
+ self._min_value,
+ self._max_value)
+
+ # update the feature
+ X_filled[missing_row_mask, feat_idx] = imputed_values
+ return X_filled, estimator
+
+ def _get_neighbor_feat_idx(self,
+ n_features,
+ feat_idx,
+ abs_corr_mat):
+ """Get a list of other features to predict ``feat_idx``.
+
+ If self.n_nearest_features is less than the total
+ number of features, then use a probability proportional to the absolute
+ correlation between ``feat_idx`` and each other feature to randomly
+ choose a subsample of the other features (without replacement).
+
+ Parameters
+ ----------
+ n_features : int
+ Number of features in ``X``.
+
+ feat_idx : int
+ Index of the feature currently being imputed.
+
+ abs_corr_mat : ndarray, shape (n_features, n_features)
+ Absolute correlation matrix of ``X``. The diagonal has been zeroed
+ out and each feature has been normalized to sum to 1. Can be None.
+
+ Returns
+ -------
+ neighbor_feat_idx : array-like
+ The features to use to impute ``feat_idx``.
+ """
+ if (self.n_nearest_features is not None and
+ self.n_nearest_features < n_features):
+ p = abs_corr_mat[:, feat_idx]
+ neighbor_feat_idx = self.random_state_.choice(
+ np.arange(n_features), self.n_nearest_features, replace=False,
+ p=p)
+ else:
+ inds_left = np.arange(feat_idx)
+ inds_right = np.arange(feat_idx + 1, n_features)
+ neighbor_feat_idx = np.concatenate((inds_left, inds_right))
+ return neighbor_feat_idx
+
+ def _get_ordered_idx(self, mask_missing_values):
+ """Decide in what order we will update the features.
+
+ As a homage to the MICE R package, there are 4 main options for
+ ordering the updates, plus a 'random' order; any other value raises
+ a ValueError.
+
+ Also, this function skips features which have no missing values.
+
+ Parameters
+ ----------
+ mask_missing_values : array-like, shape (n_samples, n_features)
+ Input data's missing indicator matrix, where "n_samples" is the
+ number of samples and "n_features" is the number of features.
+
+ Returns
+ -------
+ ordered_idx : ndarray, shape (n_features,)
+ The order in which to impute the features.
+ """
+ frac_of_missing_values = mask_missing_values.mean(axis=0)
+ missing_values_idx = np.nonzero(frac_of_missing_values)[0]
+ if self.imputation_order == 'roman':
+ ordered_idx = missing_values_idx
+ elif self.imputation_order == 'arabic':
+ ordered_idx = missing_values_idx[::-1]
+ elif self.imputation_order == 'ascending':
+ n = len(frac_of_missing_values) - len(missing_values_idx)
+ ordered_idx = np.argsort(frac_of_missing_values,
+ kind='mergesort')[n:]
+ elif self.imputation_order == 'descending':
+ n = len(frac_of_missing_values) - len(missing_values_idx)
+ ordered_idx = np.argsort(frac_of_missing_values,
+ kind='mergesort')[n:][::-1]
+ elif self.imputation_order == 'random':
+ ordered_idx = missing_values_idx
+ self.random_state_.shuffle(ordered_idx)
+ else:
+ raise ValueError("Got an invalid imputation order: '{0}'. It must "
+ "be one of the following: 'roman', 'arabic', "
+ "'ascending', 'descending', or "
+ "'random'.".format(self.imputation_order))
+ return ordered_idx
+
+ def _get_abs_corr_mat(self, X_filled, tolerance=1e-6):
+ """Get absolute correlation matrix between features.
+
+ Parameters
+ ----------
+ X_filled : ndarray, shape (n_samples, n_features)
+ Input data with the most recent imputations.
+
+ tolerance : float, optional (default=1e-6)
+ ``abs_corr_mat`` can have nans, which will be replaced
+ with ``tolerance``.
+
+ Returns
+ -------
+ abs_corr_mat : ndarray, shape (n_features, n_features)
+ Absolute correlation matrix of ``X`` at the beginning of the
+ current round. The diagonal has been zeroed out and each feature's
+ absolute correlations with all others have been normalized to sum
+ to 1.
+ """
+ n_features = X_filled.shape[1]
+ if (self.n_nearest_features is None or
+ self.n_nearest_features >= n_features):
+ return None
+ abs_corr_mat = np.abs(np.corrcoef(X_filled.T))
+ # np.corrcoef is not defined for features with zero std
+ abs_corr_mat[np.isnan(abs_corr_mat)] = tolerance
+ # ensures exploration, i.e. at least some probability of sampling
+ np.clip(abs_corr_mat, tolerance, None, out=abs_corr_mat)
+ # features are not their own neighbors
+ np.fill_diagonal(abs_corr_mat, 0)
+ # needs to sum to 1 for np.random.choice sampling
+ abs_corr_mat = normalize(abs_corr_mat, norm='l1', axis=0, copy=False)
+ return abs_corr_mat
+
+ def _initial_imputation(self, X):
+ """Perform initial imputation for input X.
+
+ Parameters
+ ----------
+ X : ndarray, shape (n_samples, n_features)
+ Input data, where "n_samples" is the number of samples and
+ "n_features" is the number of features.
+
+ Returns
+ -------
+ Xt : ndarray, shape (n_samples, n_features)
+ Input data, where "n_samples" is the number of samples and
+ "n_features" is the number of features.
+
+ X_filled : ndarray, shape (n_samples, n_features)
+ Input data with the most recent imputations.
+
+ mask_missing_values : ndarray, shape (n_samples, n_features)
+ Input data's missing indicator matrix, where "n_samples" is the
+ number of samples and "n_features" is the number of features.
+ """
+ if is_scalar_nan(self.missing_values):
+ force_all_finite = "allow-nan"
+ else:
+ force_all_finite = True
+
+ X = check_array(X, dtype=FLOAT_DTYPES, order="F",
+ force_all_finite=force_all_finite)
+ _check_inputs_dtype(X, self.missing_values)
+
+ mask_missing_values = _get_mask(X, self.missing_values)
+ if self.initial_imputer_ is None:
+ self.initial_imputer_ = SimpleImputer(
+ missing_values=self.missing_values,
+ strategy=self.initial_strategy)
+ X_filled = self.initial_imputer_.fit_transform(X)
+ else:
+ X_filled = self.initial_imputer_.transform(X)
+
+ valid_mask = np.flatnonzero(np.logical_not(
+ np.isnan(self.initial_imputer_.statistics_)))
+ Xt = X[:, valid_mask]
+ mask_missing_values = mask_missing_values[:, valid_mask]
+
+ return Xt, X_filled, mask_missing_values
+
+ def fit_transform(self, X, y=None):
+ """Fits the imputer on X and return the transformed X.
+
+ Parameters
+ ----------
+ X : array-like, shape (n_samples, n_features)
+ Input data, where "n_samples" is the number of samples and
+ "n_features" is the number of features.
+
+ y : ignored.
+
+ Returns
+ -------
+ Xt : array-like, shape (n_samples, n_features)
+ The imputed input data.
+ """
+ self.random_state_ = getattr(self, "random_state_",
+ check_random_state(self.random_state))
+
+ if self.max_iter < 0:
+ raise ValueError(
+ "'max_iter' should be a positive integer. Got {} instead."
+ .format(self.max_iter))
+
+ if self.tol < 0:
+ raise ValueError(
+ "'tol' should be a non-negative float. Got {} instead."
+ .format(self.tol)
+ )
+
+ if self.add_indicator:
+ self.indicator_ = MissingIndicator(
+ missing_values=self.missing_values)
+ X_trans_indicator = self.indicator_.fit_transform(X)
+ else:
+ self.indicator_ = None
+
+ if self.estimator is None:
+ from ..linear_model import BayesianRidge
+ self._estimator = BayesianRidge()
+ else:
+ self._estimator = clone(self.estimator)
+
+ self.imputation_sequence_ = []
+
+ if hasattr(self._estimator, 'random_state'):
+ self._estimator.random_state = self.random_state_
+
+ self._min_value = -np.inf if self.min_value is None else self.min_value
+ self._max_value = np.inf if self.max_value is None else self.max_value
+
+ self.initial_imputer_ = None
+ X, Xt, mask_missing_values = self._initial_imputation(X)
+
+ if self.max_iter == 0 or np.all(mask_missing_values):
+ self.n_iter_ = 0
+ return Xt
+
+ # order in which to impute
+ # note this is probably too slow for large feature data (d > 100000)
+ # and a better way would be good.
+ # see: https://goo.gl/KyCNwj and subsequent comments
+ ordered_idx = self._get_ordered_idx(mask_missing_values)
+ self.n_features_with_missing_ = len(ordered_idx)
+
+ abs_corr_mat = self._get_abs_corr_mat(Xt)
+
+ n_samples, n_features = Xt.shape
+ if self.verbose > 0:
+ print("[IterativeImputer] Completing matrix with shape %s"
+ % (X.shape,))
+ start_t = time()
+ if not self.sample_posterior:
+ Xt_previous = Xt.copy()
+ normalized_tol = self.tol * np.max(np.abs(X[~mask_missing_values]))
+ for self.n_iter_ in range(1, self.max_iter + 1):
+ if self.imputation_order == 'random':
+ ordered_idx = self._get_ordered_idx(mask_missing_values)
+
+ for feat_idx in ordered_idx:
+ neighbor_feat_idx = self._get_neighbor_feat_idx(n_features,
+ feat_idx,
+ abs_corr_mat)
+ Xt, estimator = self._impute_one_feature(
+ Xt, mask_missing_values, feat_idx, neighbor_feat_idx,
+ estimator=None, fit_mode=True)
+ estimator_triplet = _ImputerTriplet(feat_idx,
+ neighbor_feat_idx,
+ estimator)
+ self.imputation_sequence_.append(estimator_triplet)
+
+ if self.verbose > 1:
+ print('[IterativeImputer] Ending imputation round '
+ '%d/%d, elapsed time %0.2f'
+ % (self.n_iter_, self.max_iter, time() - start_t))
+
+ if not self.sample_posterior:
+ inf_norm = np.linalg.norm(Xt - Xt_previous, ord=np.inf,
+ axis=None)
+ if inf_norm < normalized_tol:
+ if self.verbose > 0:
+ print('[IterativeImputer] Early stopping criterion '
+ 'reached.')
+ break
+ Xt_previous = Xt.copy()
+ else:
+ if not self.sample_posterior:
+ warnings.warn("[IterativeImputer] Early stopping criterion not"
+ " reached.", ConvergenceWarning)
+ Xt[~mask_missing_values] = X[~mask_missing_values]
+
+ if self.add_indicator:
+ Xt = np.hstack((Xt, X_trans_indicator))
+ return Xt
+
+ def transform(self, X):
+ """Imputes all missing values in X.
+
+ Note that this is stochastic, and that if random_state is not fixed,
+ repeated calls, or permuted input, will yield different results.
+
+ Parameters
+ ----------
+ X : array-like, shape (n_samples, n_features)
+ The input data to complete.
+
+ Returns
+ -------
+ Xt : array-like, shape (n_samples, n_features)
+ The imputed input data.
+ """
+ check_is_fitted(self, 'initial_imputer_')
+
+ if self.add_indicator:
+ X_trans_indicator = self.indicator_.transform(X)
+
+ X, Xt, mask_missing_values = self._initial_imputation(X)
+
+ if self.n_iter_ == 0 or np.all(mask_missing_values):
+ return Xt
+
+ imputations_per_round = len(self.imputation_sequence_) // self.n_iter_
+ i_rnd = 0
+ if self.verbose > 0:
+ print("[IterativeImputer] Completing matrix with shape %s"
+ % (X.shape,))
+ start_t = time()
+ for it, estimator_triplet in enumerate(self.imputation_sequence_):
+ Xt, _ = self._impute_one_feature(
+ Xt,
+ mask_missing_values,
+ estimator_triplet.feat_idx,
+ estimator_triplet.neighbor_feat_idx,
+ estimator=estimator_triplet.estimator,
+ fit_mode=False
+ )
+ if not (it + 1) % imputations_per_round:
+ if self.verbose > 1:
+ print('[IterativeImputer] Ending imputation round '
+ '%d/%d, elapsed time %0.2f'
+ % (i_rnd + 1, self.n_iter_, time() - start_t))
+ i_rnd += 1
+
+ Xt[~mask_missing_values] = X[~mask_missing_values]
+
+ if self.add_indicator:
+ Xt = np.hstack((Xt, X_trans_indicator))
+ return Xt
+
+ def fit(self, X, y=None):
+ """Fits the imputer on X and return self.
+
+ Parameters
+ ----------
+ X : array-like, shape (n_samples, n_features)
+ Input data, where "n_samples" is the number of samples and
+ "n_features" is the number of features.
+
+ y : ignored
+
+ Returns
+ -------
+ self : object
+ Returns self.
+ """
+ self.fit_transform(X)
+ return self
+
+ def _more_tags(self):
+ return {'allow_nan': True}
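
A brief usage sketch for the new estimator; note that the explicit
experimental import is required, as stated in the class docstring. The
imputed values come from the default BayesianRidge round-robin fit, so they
are only approximate (roughly following the 2x relation between the columns):

    >>> import numpy as np
    >>> from sklearn.experimental import enable_iterative_imputer  # noqa
    >>> from sklearn.impute import IterativeImputer
    >>> X = [[1, 2], [3, 6], [4, 8], [np.nan, 3], [7, np.nan]]
    >>> imp = IterativeImputer(max_iter=10, random_state=0)
    >>> Xt = imp.fit_transform(X)  # second column is roughly twice the first
    >>> Xt.shape
    (5, 2)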
diff --git a/sklearn/impute/tests/__init__.py b/sklearn/impute/tests/__init__.py
new file mode 100644
index 0000000000000..e69de29bb2d1d
diff --git a/sklearn/tests/test_impute.py b/sklearn/impute/tests/test_impute.py
similarity index 99%
rename from sklearn/tests/test_impute.py
rename to sklearn/impute/tests/test_impute.py
index 979140ba246cf..1552031ff2193 100644
--- a/sklearn/tests/test_impute.py
+++ b/sklearn/impute/tests/test_impute.py
@@ -13,6 +13,9 @@
from sklearn.utils.testing import assert_array_equal
from sklearn.utils.testing import assert_array_almost_equal
+# make IterativeImputer available
+from sklearn.experimental import enable_iterative_imputer # noqa
+
from sklearn.impute import MissingIndicator
from sklearn.impute import SimpleImputer, IterativeImputer
from sklearn.dummy import DummyRegressor
diff --git a/sklearn/inspection/partial_dependence.py b/sklearn/inspection/partial_dependence.py
index 8798fb459ec74..3191dcd7a1352 100644
--- a/sklearn/inspection/partial_dependence.py
+++ b/sklearn/inspection/partial_dependence.py
@@ -22,7 +22,6 @@
from ..tree._tree import DTYPE
from ..exceptions import NotFittedError
from ..ensemble.gradient_boosting import BaseGradientBoosting
-from ..ensemble._gradient_boosting import _partial_dependence_tree
__all__ = ['partial_dependence', 'plot_partial_dependence']
@@ -105,14 +104,14 @@ def _partial_dependence_recursion(est, grid, features):
grid = np.asarray(grid, dtype=DTYPE, order='C')
n_estimators, n_trees_per_stage = est.estimators_.shape
- learning_rate = est.learning_rate
averaged_predictions = np.zeros((n_trees_per_stage, grid.shape[0]),
dtype=np.float64, order='C')
for stage in range(n_estimators):
for k in range(n_trees_per_stage):
tree = est.estimators_[stage, k].tree_
- _partial_dependence_tree(tree, grid, features,
- learning_rate, averaged_predictions[k])
+ tree.compute_partial_dependence(grid, features,
+ averaged_predictions[k])
+ averaged_predictions *= est.learning_rate
return averaged_predictions
@@ -356,7 +355,7 @@ def partial_dependence(estimator, X, features, response_method='auto',
features)
# reshape averaged_predictions to
- # (n_outputs, n_values_feature_0, # n_values_feature_1, ...)
+ # (n_outputs, n_values_feature_0, n_values_feature_1, ...)
averaged_predictions = averaged_predictions.reshape(
-1, *[val.shape[0] for val in values])
diff --git a/sklearn/inspection/tests/test_partial_dependence.py b/sklearn/inspection/tests/test_partial_dependence.py
index d2d3c7818e448..b90b76c4220f3 100644
--- a/sklearn/inspection/tests/test_partial_dependence.py
+++ b/sklearn/inspection/tests/test_partial_dependence.py
@@ -27,7 +27,6 @@
from sklearn.base import BaseEstimator, ClassifierMixin
from sklearn.utils.testing import assert_allclose
from sklearn.utils.testing import assert_array_equal
-from sklearn.utils.testing import if_matplotlib
# toy sample
@@ -396,11 +395,8 @@ def test_partial_dependence_sample_weight():
assert np.corrcoef(pdp, values)[0, 1] > 0.99
-@if_matplotlib
-def test_plot_partial_dependence():
+def test_plot_partial_dependence(pyplot):
# Test partial dependence plot function.
- import matplotlib.pyplot as plt # noqa
-
boston = load_boston()
clf = GradientBoostingRegressor(n_estimators=10, random_state=1)
clf.fit(boston.data, boston.target)
@@ -409,7 +405,7 @@ def test_plot_partial_dependence():
plot_partial_dependence(clf, boston.data, [0, 1, (0, 1)],
grid_resolution=grid_resolution,
feature_names=boston.feature_names)
- fig = plt.gcf()
+ fig = pyplot.gcf()
axs = fig.get_axes()
assert len(axs) == 3
assert all(ax.has_data for ax in axs)
@@ -420,7 +416,7 @@ def test_plot_partial_dependence():
grid_resolution=grid_resolution,
feature_names=boston.feature_names)
- fig = plt.gcf()
+ fig = pyplot.gcf()
axs = fig.get_axes()
assert len(axs) == 3
assert all(ax.has_data for ax in axs)
@@ -431,18 +427,14 @@ def test_plot_partial_dependence():
('CRIM', 'ZN')],
grid_resolution=grid_resolution,
feature_names=feature_names)
- fig = plt.gcf()
+ fig = pyplot.gcf()
axs = fig.get_axes()
assert len(axs) == 3
assert all(ax.has_data for ax in axs)
- plt.close('all')
-
-@if_matplotlib
-def test_plot_partial_dependence_multiclass():
+def test_plot_partial_dependence_multiclass(pyplot):
# Test partial dependence plot function on multi-class input.
- import matplotlib.pyplot as plt # noqa
iris = load_iris()
clf = GradientBoostingClassifier(n_estimators=10, random_state=1)
clf.fit(iris.data, iris.target)
@@ -451,7 +443,7 @@ def test_plot_partial_dependence_multiclass():
plot_partial_dependence(clf, iris.data, [0, 1],
target=0,
grid_resolution=grid_resolution)
- fig = plt.gcf()
+ fig = pyplot.gcf()
axs = fig.get_axes()
assert len(axs) == 2
assert all(ax.has_data for ax in axs)
@@ -465,18 +457,14 @@ def test_plot_partial_dependence_multiclass():
plot_partial_dependence(clf, iris.data, [0, 1],
target='setosa',
grid_resolution=grid_resolution)
- fig = plt.gcf()
+ fig = pyplot.gcf()
axs = fig.get_axes()
assert len(axs) == 2
assert all(ax.has_data for ax in axs)
- plt.close('all')
-
-@if_matplotlib
-def test_plot_partial_dependence_multioutput():
+def test_plot_partial_dependence_multioutput(pyplot):
# Test partial dependence plot function on multi-output input.
- import matplotlib.pyplot as plt # noqa
(X, y), _ = multioutput_regression_data
clf = LinearRegression()
clf.fit(X, y)
@@ -485,7 +473,7 @@ def test_plot_partial_dependence_multioutput():
plot_partial_dependence(clf, X, [0, 1],
target=0,
grid_resolution=grid_resolution)
- fig = plt.gcf()
+ fig = pyplot.gcf()
axs = fig.get_axes()
assert len(axs) == 2
assert all(ax.has_data for ax in axs)
@@ -493,15 +481,12 @@ def test_plot_partial_dependence_multioutput():
plot_partial_dependence(clf, X, [0, 1],
target=1,
grid_resolution=grid_resolution)
- fig = plt.gcf()
+ fig = pyplot.gcf()
axs = fig.get_axes()
assert len(axs) == 2
assert all(ax.has_data for ax in axs)
- plt.close('all')
-
-@if_matplotlib
@pytest.mark.parametrize(
"data, params, err_msg",
[(multioutput_regression_data[0], {"target": None, 'features': [0]},
@@ -531,32 +516,23 @@ def test_plot_partial_dependence_multioutput():
)
@pytest.mark.filterwarnings('ignore:Default solver will be changed ') # 0.22
@pytest.mark.filterwarnings('ignore:Default multi_class will be') # 0.22
-def test_plot_partial_dependence_error(data, params, err_msg):
- import matplotlib.pyplot as plt # noqa
+def test_plot_partial_dependence_error(pyplot, data, params, err_msg):
X, y = data
estimator = LinearRegression().fit(X, y)
with pytest.raises(ValueError, match=err_msg):
plot_partial_dependence(estimator, X, **params)
- plt.close()
-
-@if_matplotlib
-def test_plot_partial_dependence_fig():
+def test_plot_partial_dependence_fig(pyplot):
# Make sure fig object is correctly used if not None
-
- import matplotlib.pyplot as plt
-
(X, y), _ = regression_data
clf = LinearRegression()
clf.fit(X, y)
- fig = plt.figure()
+ fig = pyplot.figure()
grid_resolution = 25
plot_partial_dependence(
clf, X, [0, 1], target=0, grid_resolution=grid_resolution, fig=fig)
- assert plt.gcf() is fig
-
- plt.close()
+ assert pyplot.gcf() is fig
diff --git a/sklearn/isotonic.py b/sklearn/isotonic.py
index 3b8f74a946699..7cff336715322 100644
--- a/sklearn/isotonic.py
+++ b/sklearn/isotonic.py
@@ -209,6 +209,15 @@ class IsotonicRegression(BaseEstimator, TransformerMixin, RegressorMixin):
Correctness of Kruskal's algorithms for monotone regression with ties
Leeuw, Psychometrica, 1977
+
+ Examples
+ --------
+ >>> from sklearn.datasets import make_regression
+ >>> from sklearn.isotonic import IsotonicRegression
+ >>> X, y = make_regression(n_samples=10, n_features=1, random_state=41)
+ >>> iso_reg = IsotonicRegression().fit(X.flatten(), y)
+ >>> iso_reg.predict([.1, .2]) # doctest: +ELLIPSIS
+ array([1.8628..., 3.7256...])
"""
def __init__(self, y_min=None, y_max=None, increasing=True,
out_of_bounds='nan'):
diff --git a/sklearn/kernel_ridge.py b/sklearn/kernel_ridge.py
index aeb5fd45f413f..17a5247d5ab20 100644
--- a/sklearn/kernel_ridge.py
+++ b/sklearn/kernel_ridge.py
@@ -27,7 +27,7 @@ class KernelRidge(BaseEstimator, RegressorMixin, MultiOutputMixin):
squared error loss while support vector regression uses epsilon-insensitive
loss, both combined with l2 regularization. In contrast to SVR, fitting a
KRR model can be done in closed-form and is typically faster for
- medium-sized datasets. On the other hand, the learned model is non-sparse
+ medium-sized datasets. On the other hand, the learned model is non-sparse
and thus slower than SVR, which learns a sparse model for epsilon > 0, at
prediction-time.
diff --git a/sklearn/linear_model/cd_fast.pyx b/sklearn/linear_model/cd_fast.pyx
index ad0fa4277f3be..fcbe46ce77711 100644
--- a/sklearn/linear_model/cd_fast.pyx
+++ b/sklearn/linear_model/cd_fast.pyx
@@ -24,7 +24,7 @@ from ..utils._cython_blas cimport (_axpy, _dot, _asum, _ger, _gemv, _nrm2,
from ..utils._cython_blas cimport RowMajor, ColMajor, Trans, NoTrans
-from ..utils cimport _random
+from ..utils._random cimport our_rand_r
ctypedef np.float64_t DOUBLE
ctypedef np.uint32_t UINT32_t
@@ -42,7 +42,7 @@ cdef enum:
cdef inline UINT32_t rand_int(UINT32_t end, UINT32_t* random_state) nogil:
"""Generate a random integer in [0; end)."""
- return _random.our_rand_r(random_state) % end
+ return our_rand_r(random_state) % end
cdef inline floating fmax(floating x, floating y) nogil:
diff --git a/sklearn/linear_model/coordinate_descent.py b/sklearn/linear_model/coordinate_descent.py
index ceccafd706101..b14188bff50c1 100644
--- a/sklearn/linear_model/coordinate_descent.py
+++ b/sklearn/linear_model/coordinate_descent.py
@@ -224,7 +224,7 @@ def lasso_path(X, y, eps=1e-3, n_alphas=100, alphas=None,
values output by lars_path
Examples
- ---------
+ --------
Comparing lasso_path and lars_path with interpolation:
@@ -661,7 +661,7 @@ def fit(self, X, y, check_input=True):
"""Fit model with coordinate descent.
Parameters
- -----------
+ ----------
X : ndarray or scipy.sparse matrix, (n_samples, n_features)
Data
@@ -1747,7 +1747,7 @@ def fit(self, X, y):
"""Fit MultiTaskElasticNet model with coordinate descent
Parameters
- -----------
+ ----------
X : ndarray, shape (n_samples, n_features)
Data
y : ndarray, shape (n_samples, n_tasks)
diff --git a/sklearn/linear_model/least_angle.py b/sklearn/linear_model/least_angle.py
index 5df45535db462..594fdb3676adb 100644
--- a/sklearn/linear_model/least_angle.py
+++ b/sklearn/linear_model/least_angle.py
@@ -42,7 +42,7 @@ def lars_path(X, y, Xy=None, Gram=None, max_iter=500, alpha_min=0,
Read more in the :ref:`User Guide `.
Parameters
- -----------
+ ----------
X : None or array, shape (n_samples, n_features)
Input data. Note that if X is None then the Gram matrix must be
specified, i.e., cannot be None or False.
@@ -112,7 +112,7 @@ def lars_path(X, y, Xy=None, Gram=None, max_iter=500, alpha_min=0,
solution of the coordinate descent lasso_path function.
Returns
- --------
+ -------
alphas : array, shape (n_alphas + 1,)
Maximum of covariances (in absolute value) at each iteration.
``n_alphas`` is either ``max_iter``, ``n_features`` or the
@@ -179,7 +179,7 @@ def lars_path_gram(Xy, Gram, n_samples, max_iter=500, alpha_min=0,
Read more in the :ref:`User Guide `.
Parameters
- -----------
+ ----------
Xy : array-like, shape (n_samples,) or (n_samples, n_targets)
Xy = np.dot(X.T, y).
@@ -231,7 +231,7 @@ def lars_path_gram(Xy, Gram, n_samples, max_iter=500, alpha_min=0,
solution of the coordinate descent lasso_path function.
Returns
- --------
+ -------
alphas : array, shape (n_alphas + 1,)
Maximum of covariances (in absolute value) at each iteration.
``n_alphas`` is either ``max_iter``, ``n_features`` or the
@@ -295,7 +295,7 @@ def _lars_path_solver(X, y, Xy=None, Gram=None, n_samples=None, max_iter=500,
Read more in the :ref:`User Guide `.
Parameters
- -----------
+ ----------
X : None or ndarray, shape (n_samples, n_features)
Input data. Note that if X is None then Gram must be specified,
i.e., cannot be None or False.
@@ -358,7 +358,7 @@ def _lars_path_solver(X, y, Xy=None, Gram=None, n_samples=None, max_iter=500,
solution of the coordinate descent lasso_path function.
Returns
- --------
+ -------
alphas : array, shape (n_alphas + 1,)
Maximum of covariances (in absolute value) at each iteration.
``n_alphas`` is either ``max_iter``, ``n_features`` or the
@@ -1128,7 +1128,7 @@ def _lars_path_residues(X_train, y_train, X_test, y_test, Gram=None,
"""Compute the residues on left-out data for a full LARS path
Parameters
- -----------
+ ----------
X_train : array, shape (n_samples, n_features)
The data to fit the LARS on
@@ -1189,7 +1189,7 @@ def _lars_path_residues(X_train, y_train, X_test, y_test, Gram=None,
Returns
- --------
+ -------
alphas : array, shape (n_alphas,)
Maximum of covariances (in absolute value) at each iteration.
``n_alphas`` is either ``max_iter`` or ``n_features``, whichever
diff --git a/sklearn/linear_model/omp.py b/sklearn/linear_model/omp.py
index d9ee49cd37698..38be6ddd37540 100644
--- a/sklearn/linear_model/omp.py
+++ b/sklearn/linear_model/omp.py
@@ -681,7 +681,7 @@ def _omp_path_residues(X_train, y_train, X_test, y_test, copy=True,
"""Compute the residues on left-out data for a full LARS path
Parameters
- -----------
+ ----------
X_train : array, shape (n_samples, n_features)
The data to fit the LARS on
diff --git a/sklearn/linear_model/ridge.py b/sklearn/linear_model/ridge.py
index 2a491bd3ef515..0e54126e52c33 100644
--- a/sklearn/linear_model/ridge.py
+++ b/sklearn/linear_model/ridge.py
@@ -31,6 +31,7 @@
from ..model_selection import GridSearchCV
from ..metrics.scorer import check_scoring
from ..exceptions import ConvergenceWarning
+from ..utils.sparsefuncs import mean_variance_axis
def _solve_sparse_cg(X, y, alpha, max_iter=None, tol=1e-3, verbose=0,
@@ -226,9 +227,17 @@ def _solve_svd(X, y, alpha):
return np.dot(Vt.T, d_UT_y).T
+def _get_valid_accept_sparse(is_X_sparse, solver):
+ if is_X_sparse and solver in ['auto', 'sag', 'saga']:
+ return 'csr'
+ else:
+ return ['csr', 'csc', 'coo']
+
+
def ridge_regression(X, y, alpha, sample_weight=None, solver='auto',
max_iter=None, tol=1e-3, verbose=0, random_state=None,
- return_n_iter=False, return_intercept=False):
+ return_n_iter=False, return_intercept=False,
+ check_input=True):
"""Solve the ridge equation by the method of normal equations.
Read more in the :ref:`User Guide `.
@@ -332,6 +341,11 @@ def ridge_regression(X, y, alpha, sample_weight=None, solver='auto',
.. versionadded:: 0.17
+ check_input : boolean, default True
+ If False, the input arrays X and y will not be checked.
+
+ .. versionadded:: 0.21
+
Returns
-------
coef : array, shape = [n_features] or [n_targets, n_features]
@@ -360,13 +374,14 @@ def ridge_regression(X, y, alpha, sample_weight=None, solver='auto',
return_n_iter=return_n_iter,
return_intercept=return_intercept,
X_scale=None,
- X_offset=None)
+ X_offset=None,
+ check_input=check_input)
def _ridge_regression(X, y, alpha, sample_weight=None, solver='auto',
max_iter=None, tol=1e-3, verbose=0, random_state=None,
return_n_iter=False, return_intercept=False,
- X_scale=None, X_offset=None):
+ X_scale=None, X_offset=None, check_input=True):
has_sw = sample_weight is not None
@@ -388,17 +403,12 @@ def _ridge_regression(X, y, alpha, sample_weight=None, solver='auto',
"intercept. Please change solver to 'sag' or set "
"return_intercept=False.")
- _dtype = [np.float64, np.float32]
-
- # SAG needs X and y columns to be C-contiguous and np.float64
- if solver in ['sag', 'saga']:
- X = check_array(X, accept_sparse=['csr'],
- dtype=np.float64, order='C')
- y = check_array(y, dtype=np.float64, ensure_2d=False, order='F')
- else:
- X = check_array(X, accept_sparse=['csr', 'csc', 'coo'],
- dtype=_dtype)
- y = check_array(y, dtype=X.dtype, ensure_2d=False)
+ if check_input:
+ _dtype = [np.float64, np.float32]
+ _accept_sparse = _get_valid_accept_sparse(sparse.issparse(X), solver)
+ X = check_array(X, accept_sparse=_accept_sparse, dtype=_dtype,
+ order="C")
+ y = check_array(y, dtype=X.dtype, ensure_2d=False, order="C")
check_consistent_length(X, y)
n_samples, n_features = X.shape
@@ -417,8 +427,6 @@ def _ridge_regression(X, y, alpha, sample_weight=None, solver='auto',
raise ValueError("Number of samples in X and y does not correspond:"
" %d != %d" % (n_samples, n_samples_))
-
-
if has_sw:
if np.atleast_1d(sample_weight).ndim > 1:
raise ValueError("Sample weights must be 1D array or scalar")
@@ -438,7 +446,6 @@ def _ridge_regression(X, y, alpha, sample_weight=None, solver='auto',
if alpha.size == 1 and n_targets > 1:
alpha = np.repeat(alpha, n_targets)
-
n_iter = None
if solver == 'sparse_cg':
coef = _solve_sparse_cg(X, y, alpha,
@@ -461,7 +468,6 @@ def _ridge_regression(X, y, alpha, sample_weight=None, solver='auto',
except linalg.LinAlgError:
# use SVD solver if matrix is singular
solver = 'svd'
-
else:
try:
coef = _solve_cholesky(X, y, alpha)
@@ -473,11 +479,12 @@ def _ridge_regression(X, y, alpha, sample_weight=None, solver='auto',
# precompute max_squared_sum for all targets
max_squared_sum = row_norms(X, squared=True).max()
- coef = np.empty((y.shape[1], n_features))
+ coef = np.empty((y.shape[1], n_features), dtype=X.dtype)
n_iter = np.empty(y.shape[1], dtype=np.int32)
- intercept = np.zeros((y.shape[1], ))
+ intercept = np.zeros((y.shape[1], ), dtype=X.dtype)
for i, (alpha_i, target) in enumerate(zip(alpha, y.T)):
- init = {'coef': np.zeros((n_features + int(return_intercept), 1))}
+ init = {'coef': np.zeros((n_features + int(return_intercept), 1),
+ dtype=X.dtype)}
coef_, n_iter_, _ = sag_solver(
X, target.ravel(), sample_weight, 'squared', alpha_i, 0,
max_iter, tol, verbose, random_state, False, max_squared_sum,
@@ -530,13 +537,13 @@ def __init__(self, alpha=1.0, fit_intercept=True, normalize=False,
def fit(self, X, y, sample_weight=None):
- if self.solver in ('sag', 'saga'):
- _dtype = np.float64
- else:
- # all other solvers work at both float precision levels
- _dtype = [np.float64, np.float32]
-
- X, y = check_X_y(X, y, ['csr', 'csc', 'coo'], dtype=_dtype,
+ # all other solvers work at both float precision levels
+ _dtype = [np.float64, np.float32]
+ _accept_sparse = _get_valid_accept_sparse(sparse.issparse(X),
+ self.solver)
+ X, y = check_X_y(X, y,
+ accept_sparse=_accept_sparse,
+ dtype=_dtype,
multi_output=True, y_numeric=True)
if ((sample_weight is not None) and
@@ -555,7 +562,7 @@ def fit(self, X, y, sample_weight=None):
X, y, alpha=self.alpha, sample_weight=sample_weight,
max_iter=self.max_iter, tol=self.tol, solver=self.solver,
random_state=self.random_state, return_n_iter=True,
- return_intercept=True)
+ return_intercept=True, check_input=False)
# add the offset which was subtracted by _preprocess_data
self.intercept_ += y_offset
else:
@@ -570,8 +577,7 @@ def fit(self, X, y, sample_weight=None):
X, y, alpha=self.alpha, sample_weight=sample_weight,
max_iter=self.max_iter, tol=self.tol, solver=self.solver,
random_state=self.random_state, return_n_iter=True,
- return_intercept=False, **params)
-
+ return_intercept=False, check_input=False, **params)
self._set_intercept(X_offset, y_offset, X_scale)
return self
@@ -893,8 +899,9 @@ def fit(self, X, y, sample_weight=None):
-------
self : returns an instance of self.
"""
- check_X_y(X, y, accept_sparse=['csr', 'csc', 'coo'],
- multi_output=True)
+ _accept_sparse = _get_valid_accept_sparse(sparse.issparse(X),
+ self.solver)
+ check_X_y(X, y, accept_sparse=_accept_sparse, multi_output=True)
self._label_binarizer = LabelBinarizer(pos_label=1, neg_label=-1)
Y = self._label_binarizer.fit_transform(y)
@@ -921,6 +928,106 @@ def classes_(self):
return self._label_binarizer.classes_
+def _check_gcv_mode(X, gcv_mode):
+ possible_gcv_modes = [None, 'auto', 'svd', 'eigen']
+ if gcv_mode not in possible_gcv_modes:
+ raise ValueError(
+ "Unknown value for 'gcv_mode'. "
+ "Got {} instead of one of {}" .format(
+ gcv_mode, possible_gcv_modes))
+ if gcv_mode in ['eigen', 'svd']:
+ return gcv_mode
+ # if X has more rows than columns, use decomposition of X^T.X,
+ # otherwise X.X^T
+ if X.shape[0] > X.shape[1]:
+ return 'svd'
+ return 'eigen'
+
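A quick sanity check of how ``_check_gcv_mode`` picks a decomposition (an illustrative sketch, not part of the patch; it assumes the private helper stays importable from ``sklearn.linear_model.ridge``, as the new tests below do):

    import numpy as np
    from scipy import sparse
    from sklearn.linear_model.ridge import _check_gcv_mode

    X_tall = np.ones((100, 5))   # n_samples > n_features -> work with X^T.X
    X_wide = np.ones((5, 100))   # n_features >= n_samples -> work with X.X^T
    assert _check_gcv_mode(X_tall, 'auto') == 'svd'
    assert _check_gcv_mode(X_wide, 'auto') == 'eigen'
    # explicit modes are returned unchanged, for dense and sparse input alike
    assert _check_gcv_mode(sparse.csr_matrix(X_tall), 'eigen') == 'eigen'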
+
+def _find_smallest_angle(query, vectors):
+ """Find the column of vectors that is most aligned with the query.
+
+ Both query and the columns of vectors must have their l2 norm equal to 1.
+
+ Parameters
+ ----------
+ query : ndarray, shape (n_samples,)
+ Normalized query vector.
+
+ vectors : ndarray, shape (n_samples, n_features)
+ Vectors to which we compare query, as columns. Must be normalized.
+ """
+ abs_cosine = np.abs(query.dot(vectors))
+ index = np.argmax(abs_cosine)
+ return index
+
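In plain NumPy terms, ``_find_smallest_angle`` simply picks the column with the largest absolute cosine to the query; a minimal standalone sketch of the same computation (illustrative only, assuming unit-norm inputs as the docstring requires):

    import numpy as np

    rng = np.random.RandomState(0)
    vectors, _ = np.linalg.qr(rng.randn(6, 4))   # orthonormal columns
    query = -vectors[:, 2]                       # anti-aligned with column 2
    index = int(np.argmax(np.abs(query.dot(vectors))))
    assert index == 2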
+
+class _X_operator(sparse.linalg.LinearOperator):
+ """Behaves as centered and scaled X with an added intercept column.
+
+ This operator behaves as
+ np.hstack([X - sqrt_sw[:, None] * X_mean, sqrt_sw[:, None]])
+ """
+
+ def __init__(self, X, X_mean, sqrt_sw):
+ n_samples, n_features = X.shape
+ super().__init__(X.dtype, (n_samples, n_features + 1))
+ self.X = X
+ self.X_mean = X_mean
+ self.sqrt_sw = sqrt_sw
+
+ def _matvec(self, v):
+ v = v.ravel()
+ return safe_sparse_dot(
+ self.X, v[:-1], dense_output=True
+ ) - self.sqrt_sw * self.X_mean.dot(v[:-1]) + v[-1] * self.sqrt_sw
+
+ def _matmat(self, v):
+ return (
+ safe_sparse_dot(self.X, v[:-1], dense_output=True) -
+ self.sqrt_sw[:, None] * self.X_mean.dot(v[:-1]) + v[-1] *
+ self.sqrt_sw[:, None])
+
+ def _transpose(self):
+ return _Xt_operator(self.X, self.X_mean, self.sqrt_sw)
+
+
+class _Xt_operator(sparse.linalg.LinearOperator):
+ """Behaves as transposed centered and scaled X with an intercept column.
+
+ This operator behaves as
+ np.hstack([X - sqrt_sw[:, None] * X_mean, sqrt_sw[:, None]]).T
+ """
+
+ def __init__(self, X, X_mean, sqrt_sw):
+ n_samples, n_features = X.shape
+ super().__init__(X.dtype, (n_features + 1, n_samples))
+ self.X = X
+ self.X_mean = X_mean
+ self.sqrt_sw = sqrt_sw
+
+ def _matvec(self, v):
+ v = v.ravel()
+ n_features = self.shape[0]
+ res = np.empty(n_features, dtype=self.X.dtype)
+ res[:-1] = (
+ safe_sparse_dot(self.X.T, v, dense_output=True) -
+ (self.X_mean * self.sqrt_sw.dot(v))
+ )
+ res[-1] = np.dot(v, self.sqrt_sw)
+ return res
+
+ def _matmat(self, v):
+ n_features = self.shape[0]
+ res = np.empty((n_features, v.shape[1]), dtype=self.X.dtype)
+ res[:-1] = (
+ safe_sparse_dot(self.X.T, v, dense_output=True) -
+ self.X_mean[:, None] * self.sqrt_sw.dot(v)
+ )
+ res[-1] = np.dot(self.sqrt_sw, v)
+ return res
+
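Both operators above can be checked against the dense matrix they emulate, much like the new ``test_x_operator`` test further down does (illustrative sketch; ``_X_operator`` is the private class added in this file):

    import numpy as np
    from scipy import sparse
    from sklearn.linear_model.ridge import _X_operator

    rng = np.random.RandomState(0)
    X = sparse.csr_matrix(rng.randn(11, 8))
    X_mean, sqrt_sw = rng.randn(8), np.ones(11)
    dense = np.hstack([X.toarray() - sqrt_sw[:, None] * X_mean,
                       sqrt_sw[:, None]])
    op = _X_operator(X, X_mean, sqrt_sw)
    v, w = rng.randn(9), rng.randn(11)
    assert np.allclose(op.dot(v), dense.dot(v))
    assert np.allclose(op.T.dot(w), dense.T.dot(w))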
+
class _RidgeGCV(LinearModel):
"""Ridge regression with built-in Generalized Cross-Validation
@@ -972,18 +1079,6 @@ def __init__(self, alphas=(0.1, 1.0, 10.0),
self.gcv_mode = gcv_mode
self.store_cv_values = store_cv_values
- def _pre_compute(self, X, y, centered_kernel=True):
- # even if X is very sparse, K is usually very dense
- K = safe_sparse_dot(X, X.T, dense_output=True)
- # the following emulates an additional constant regressor
- # corresponding to fit_intercept=True
- # but this is done only when the features have been centered
- if centered_kernel:
- K += np.ones_like(K)
- v, Q = linalg.eigh(K)
- QT_y = np.dot(Q.T, y)
- return v, Q, QT_y
-
def _decomp_diag(self, v_prime, Q):
# compute diagonal of the matrix: dot(Q, dot(diag(v_prime), Q^T))
return (v_prime * Q ** 2).sum(axis=-1)
@@ -995,18 +1090,161 @@ def _diag_dot(self, D, B):
D = D[(slice(None), ) + (np.newaxis, ) * (len(B.shape) - 1)]
return D * B
- def _errors_and_values_helper(self, alpha, y, v, Q, QT_y):
- """Helper function to avoid code duplication between self._errors and
- self._values.
+ def _compute_gram(self, X, sqrt_sw):
+ """Computes the Gram matrix with possible centering.
- Notes
- -----
- We don't construct matrix G, instead compute action on y & diagonal.
+ When ``fit_intercept`` is True and ``X`` is sparse, compute
+ (X - X.mean(axis=0)).dot((X - X.mean(axis=0)).T);
+ otherwise compute X.dot(X.T) (X has then already been centered in
+ preprocessing, or no intercept is being fitted).
+
+ Parameters
+ ----------
+ X : {array-like, sparse matrix}, shape (n_samples, n_features)
+ The input uncentered data.
+
+ sqrt_sw : ndarray, shape (n_samples,)
+ square roots of sample weights
+
+ Returns
+ -------
+ gram : ndarray, shape (n_samples, n_samples)
+ The Gram matrix.
+ X_mean : ndarray, shape (n_features,)
+ The mean of ``X`` for each feature.
+ """
+ center = self.fit_intercept and sparse.issparse(X)
+ if not center:
+ # in this case centering has been done in preprocessing
+ # or we are not fitting an intercept.
+ X_mean = np.zeros(X.shape[1], dtype=X.dtype)
+ return safe_sparse_dot(X, X.T, dense_output=True), X_mean
+ # otherwise X is always sparse
+ n_samples = X.shape[0]
+ sample_weight_matrix = sparse.dia_matrix(
+ (sqrt_sw, 0), shape=(n_samples, n_samples))
+ X_weighted = sample_weight_matrix.dot(X)
+ X_mean, _ = mean_variance_axis(X_weighted, axis=0)
+ X_mean *= n_samples / sqrt_sw.dot(sqrt_sw)
+ X_mX = sqrt_sw[:, None] * safe_sparse_dot(
+ X_mean, X.T, dense_output=True)
+ X_mX_m = np.outer(sqrt_sw, sqrt_sw) * np.dot(X_mean, X_mean)
+ return (safe_sparse_dot(X, X.T, dense_output=True) + X_mX_m
+ - X_mX - X_mX.T, X_mean)
+
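The expansion used by ``_compute_gram`` for the sparse path follows from multiplying out (X - s m)(X - s m)^T with s = sqrt_sw and m = X_mean; it is easy to verify on a small dense example (illustrative sketch, not a call into the estimator):

    import numpy as np

    rng = np.random.RandomState(0)
    X = rng.randn(7, 4)      # plays the role of the (already scaled) data
    s = rng.rand(7)          # sqrt_sw
    m = rng.randn(4)         # X_mean
    centered = X - s[:, None] * m
    cross = s[:, None] * X.dot(m)   # cross[i, j] = s[i] * X[j].dot(m), as X_mX above
    expanded = (X.dot(X.T) + np.outer(s, s) * m.dot(m)
                - cross - cross.T)
    assert np.allclose(centered.dot(centered.T), expanded)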
+ def _compute_covariance(self, X, sqrt_sw):
+ """Computes centered covariance matrix.
+
+ If ``center`` is ``True``, compute
+ (X - X.mean(axis=0)).T.dot(X - X.mean(axis=0))
+ else
+ X.T.dot(X)
+
+ Parameters
+ ----------
+ X : sparse matrix, shape (n_samples, n_features)
+ The input uncentered data.
+
+ sqrt_sw : ndarray, shape (n_samples,)
+ square roots of sample weights
+
+ center : bool, default is True
+ Whether or not to remove the mean from ``X``.
+
+ Returns
+ -------
+ covariance : ndarray, shape (n_features, n_features)
+ The covariance matrix.
+ X_mean : ndarray, shape (n_features,)
+ The mean of ``X`` for each feature.
+ """
+ if not self.fit_intercept:
+ # in this case centering has been done in preprocessing
+ # or we are not fitting an intercept.
+ X_mean = np.zeros(X.shape[1], dtype=X.dtype)
+ return safe_sparse_dot(X.T, X, dense_output=True), X_mean
+ # this function only gets called for sparse X
+ n_samples = X.shape[0]
+ sample_weight_matrix = sparse.dia_matrix(
+ (sqrt_sw, 0), shape=(n_samples, n_samples))
+ X_weighted = sample_weight_matrix.dot(X)
+ X_mean, _ = mean_variance_axis(X_weighted, axis=0)
+ X_mean = X_mean * n_samples / sqrt_sw.dot(sqrt_sw)
+ weight_sum = sqrt_sw.dot(sqrt_sw)
+ return (safe_sparse_dot(X.T, X, dense_output=True) -
+ weight_sum * np.outer(X_mean, X_mean),
+ X_mean)
+
+ def _sparse_multidot_diag(self, X, A, X_mean, sqrt_sw):
+ """Compute the diagonal of (X - X_mean).dot(A).dot((X - X_mean).T)
+ without explicitly centering X or computing X.dot(A)
+ when X is sparse.
+
+ Parameters
+ ----------
+ X : sparse matrix, shape = (n_samples, n_features)
+
+ A : np.ndarray, shape = (n_features, n_features)
+
+ X_mean : np.ndarray, shape = (n_features,)
+
+ sqrt_sw : np.ndarray, shape = (n_samples,)
+ square roots of sample weights
+
+ Returns
+ -------
+ diag : np.ndarray, shape = (n_samples,)
+ The computed diagonal.
+ """
+ intercept_col = sqrt_sw
+ scale = sqrt_sw
+ batch_size = X.shape[1]
+ diag = np.empty(X.shape[0], dtype=X.dtype)
+ for start in range(0, X.shape[0], batch_size):
+ batch = slice(start, min(X.shape[0], start + batch_size), 1)
+ X_batch = np.empty(
+ (X[batch].shape[0], X.shape[1] + self.fit_intercept),
+ dtype=X.dtype
+ )
+ if self.fit_intercept:
+ X_batch[:, :-1] = X[batch].A - X_mean * scale[batch][:, None]
+ X_batch[:, -1] = intercept_col[batch]
+ else:
+ X_batch = X[batch].A
+ diag[batch] = (X_batch.dot(A) * X_batch).sum(axis=1)
+ return diag
+
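The batched row trick in ``_sparse_multidot_diag`` relies on diag(B.dot(A).dot(B.T)) == (B.dot(A) * B).sum(axis=1), which is quick to verify in isolation (illustrative sketch):

    import numpy as np

    rng = np.random.RandomState(0)
    B = rng.randn(6, 3)      # a (possibly centered) batch of rows of X
    A = rng.randn(3, 3)
    assert np.allclose(np.diag(B.dot(A).dot(B.T)),
                       (B.dot(A) * B).sum(axis=1))
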
+ def _eigen_decompose_gram(self, X, y, sqrt_sw):
+ """Eigendecomposition of X.X^T, used when n_samples <= n_features"""
+ # if X is dense it has already been centered in preprocessing
+ K, X_mean = self._compute_gram(X, sqrt_sw)
+ if self.fit_intercept:
+ # To emulate centering X with sample weights,
+ # i.e. removing the weighted average, we add a column
+ # containing the square roots of the sample weights.
+ # By centering, it is orthogonal to the other columns.
+ K += np.outer(sqrt_sw, sqrt_sw)
+ v, Q = linalg.eigh(K)
+ QT_y = np.dot(Q.T, y)
+ return X_mean, v, Q, QT_y
+
+ def _solve_eigen_gram(self, alpha, y, sqrt_sw, X_mean, v, Q, QT_y):
+ """Compute dual coefficients and diagonal of (Identity - Hat_matrix)
+
+ Used when we have a decomposition of X.X^T (n_features >= n_samples).
"""
w = 1. / (v + alpha)
- constant_column = np.var(Q, 0) < 1.e-12
- # detect constant columns
- w[constant_column] = 0 # cancel the regularization for the intercept
+ if self.fit_intercept:
+ # The vector containing the square roots of the sample weights (all
+ # ones when there are no sample weights) is the eigenvector of XX^T
+ # which corresponds to the intercept; we cancel the regularization
+ # on this dimension. The corresponding eigenvalue is
+ # sum(sample_weight).
+ normalized_sw = sqrt_sw / np.linalg.norm(sqrt_sw)
+ intercept_dim = _find_smallest_angle(normalized_sw, Q)
+ w[intercept_dim] = 0 # cancel regularization for the intercept
c = np.dot(Q, self._diag_dot(w, QT_y))
G_diag = self._decomp_diag(w, Q)
@@ -1015,35 +1253,117 @@ def _errors_and_values_helper(self, alpha, y, v, Q, QT_y):
G_diag = G_diag[:, np.newaxis]
return G_diag, c
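The pair returned by ``_solve_eigen_gram`` feeds the exact leave-one-out identity used throughout this class: with G = (X.X^T + alpha*I)^{-1}, c = G.dot(y), the LOO residual for sample i is c[i] / G[i, i]. For the no-intercept case this can be checked against brute-force LOO (illustrative sketch, independent of the private helpers):

    import numpy as np
    from sklearn.linear_model import Ridge

    rng = np.random.RandomState(0)
    X, y, alpha = rng.randn(20, 5), rng.randn(20), 1.0
    G = np.linalg.inv(X.dot(X.T) + alpha * np.eye(20))
    c = G.dot(y)                      # dual coefficients
    loo_fast = c / np.diag(G)         # what (c / G_diag) gives for one alpha
    loo_brute = np.empty(20)
    for i in range(20):
        mask = np.arange(20) != i
        model = Ridge(alpha=alpha, fit_intercept=False)
        model.fit(X[mask], y[mask])
        loo_brute[i] = y[i] - model.predict(X[i:i + 1])[0]
    assert np.allclose(loo_fast, loo_brute)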
- def _errors(self, alpha, y, v, Q, QT_y):
- G_diag, c = self._errors_and_values_helper(alpha, y, v, Q, QT_y)
- return (c / G_diag) ** 2, c
+ def _eigen_decompose_covariance(self, X, y, sqrt_sw):
+ """Eigendecomposition of X^T.X, used when n_samples > n_features."""
+ n_samples, n_features = X.shape
+ cov = np.empty((n_features + 1, n_features + 1), dtype=X.dtype)
+ cov[:-1, :-1], X_mean = self._compute_covariance(X, sqrt_sw)
+ if not self.fit_intercept:
+ cov = cov[:-1, :-1]
+ # Otherwise, to emulate centering X with sample weights,
+ # i.e. removing the weighted average, we add a column
+ # containing the square roots of the sample weights
+ # (a column of ones when all samples have the same weight);
+ # by centering, it is orthogonal to the other columns.
+ else:
+ cov[-1] = 0
+ cov[:, -1] = 0
+ cov[-1, -1] = sqrt_sw.dot(sqrt_sw)
+ nullspace_dim = max(0, X.shape[1] - X.shape[0])
+ s, V = linalg.eigh(cov)
+ # remove eigenvalues and vectors in the null space of X^T.X
+ s = s[nullspace_dim:]
+ V = V[:, nullspace_dim:]
+ return X_mean, s, V, X
+
+ def _solve_eigen_covariance_no_intercept(
+ self, alpha, y, sqrt_sw, X_mean, s, V, X):
+ """Compute dual coefficients and diagonal of (Identity - Hat_matrix)
+
+ Used when we have a decomposition of X^T.X
+ (n_features < n_samples and X is sparse), and not fitting an intercept.
+ """
+ w = 1 / (s + alpha)
+ A = (V * w).dot(V.T)
+ AXy = A.dot(safe_sparse_dot(X.T, y, dense_output=True))
+ y_hat = safe_sparse_dot(X, AXy, dense_output=True)
+ hat_diag = self._sparse_multidot_diag(X, A, X_mean, sqrt_sw)
+ if len(y.shape) != 1:
+ # handle case where y is 2-d
+ hat_diag = hat_diag[:, np.newaxis]
+ return (1 - hat_diag) / alpha, (y - y_hat) / alpha
- def _values(self, alpha, y, v, Q, QT_y):
- G_diag, c = self._errors_and_values_helper(alpha, y, v, Q, QT_y)
- return y - (c / G_diag), c
+ def _solve_eigen_covariance_intercept(
+ self, alpha, y, sqrt_sw, X_mean, s, V, X):
+ """Compute dual coefficients and diagonal of (Identity - Hat_matrix)
- def _pre_compute_svd(self, X, y, centered_kernel=True):
- if sparse.issparse(X):
- raise TypeError("SVD not supported for sparse matrices")
- if centered_kernel:
- X = np.hstack((X, np.ones((X.shape[0], 1))))
- # to emulate fit_intercept=True situation, add a column on ones
- # Note that by centering, the other columns are orthogonal to that one
+ Used when we have a decomposition of X^T.X
+ (n_features < n_samples and X is sparse),
+ and we are fitting an intercept.
+ """
+ # The vector [0, 0, ..., 0, 1]
+ # is the eigenvector of X^TX which
+ # corresponds to the intercept; we cancel the regularization on
+ # this dimension. The corresponding eigenvalue is
+ # sum(sample_weight), e.g. n when the sample weights are uniform.
+ intercept_sv = np.zeros(V.shape[0])
+ intercept_sv[-1] = 1
+ intercept_dim = _find_smallest_angle(intercept_sv, V)
+ w = 1 / (s + alpha)
+ w[intercept_dim] = 1 / s[intercept_dim]
+ A = (V * w).dot(V.T)
+ # add a column to X containing the square roots of sample weights
+ X_op = _X_operator(X, X_mean, sqrt_sw)
+ AXy = A.dot(X_op.T.dot(y))
+ y_hat = X_op.dot(AXy)
+ hat_diag = self._sparse_multidot_diag(X, A, X_mean, sqrt_sw)
+ if len(y.shape) != 1:
+ # handle case where y is 2-d
+ hat_diag = hat_diag[:, np.newaxis]
+ return (1 - hat_diag) / alpha, (y - y_hat) / alpha
+
+ def _solve_eigen_covariance(
+ self, alpha, y, sqrt_sw, X_mean, s, V, X):
+ """Compute dual coefficients and diagonal of (Identity - Hat_matrix)
+
+ Used when we have a decomposition of X^T.X
+ (n_features < n_samples and X is sparse).
+ """
+ if self.fit_intercept:
+ return self._solve_eigen_covariance_intercept(
+ alpha, y, sqrt_sw, X_mean, s, V, X)
+ return self._solve_eigen_covariance_no_intercept(
+ alpha, y, sqrt_sw, X_mean, s, V, X)
+
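Although the ``_solve_eigen_covariance*`` helpers work with X^T.X, the pair they return matches the Gram-based path, because (I - Hat_matrix) / alpha equals (X.X^T + alpha*I)^{-1}; a quick dense check of that identity (illustrative sketch):

    import numpy as np

    rng = np.random.RandomState(0)
    X, y, alpha = rng.randn(9, 3), rng.randn(9), 2.0
    A = np.linalg.inv(X.T.dot(X) + alpha * np.eye(3))
    hat_diag = np.einsum('ij,jk,ik->i', X, A, X)   # diag of X.A.X^T
    y_hat = X.dot(A).dot(X.T).dot(y)
    G = np.linalg.inv(X.dot(X.T) + alpha * np.eye(9))
    assert np.allclose((1 - hat_diag) / alpha, np.diag(G))
    assert np.allclose((y - y_hat) / alpha, G.dot(y))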
+ def _svd_decompose_design_matrix(self, X, y, sqrt_sw):
+ # X already centered
+ X_mean = np.zeros(X.shape[1], dtype=X.dtype)
+ if self.fit_intercept:
+ # to emulate fit_intercept=True situation, add a column
+ # containing the square roots of the sample weights
+ # by centering, the other columns are orthogonal to that one
+ intercept_column = sqrt_sw[:, None]
+ X = np.hstack((X, intercept_column))
U, s, _ = linalg.svd(X, full_matrices=0)
v = s ** 2
UT_y = np.dot(U.T, y)
- return v, U, UT_y
+ return X_mean, v, U, UT_y
- def _errors_and_values_svd_helper(self, alpha, y, v, U, UT_y):
- """Helper function to avoid code duplication between self._errors_svd
- and self._values_svd.
+ def _solve_svd_design_matrix(
+ self, alpha, y, sqrt_sw, X_mean, v, U, UT_y):
+ """Compute dual coefficients and diagonal of (Identity - Hat_matrix)
+
+ Used when we have an SVD decomposition of X
+ (n_samples > n_features and X is dense).
"""
- constant_column = np.var(U, 0) < 1.e-12
- # detect columns colinear to ones
w = ((v + alpha) ** -1) - (alpha ** -1)
- w[constant_column] = - (alpha ** -1)
- # cancel the regularization for the intercept
+ if self.fit_intercept:
+ # detect intercept column
+ normalized_sw = sqrt_sw / np.linalg.norm(sqrt_sw)
+ intercept_dim = _find_smallest_angle(normalized_sw, U)
+ # cancel the regularization for the intercept
+ w[intercept_dim] = - (alpha ** -1)
c = np.dot(U, self._diag_dot(w, UT_y)) + (alpha ** -1) * y
G_diag = self._decomp_diag(w, U) + (alpha ** -1)
if len(y.shape) != 1:
@@ -1051,24 +1371,16 @@ def _errors_and_values_svd_helper(self, alpha, y, v, U, UT_y):
G_diag = G_diag[:, np.newaxis]
return G_diag, c
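The weights w = (v + alpha)^{-1} - alpha^{-1} in ``_solve_svd_design_matrix`` reconstruct (X.X^T + alpha*I)^{-1} from the thin SVD of X, since U.dot(np.diag(w)).dot(U.T) + I/alpha inverts X.X^T + alpha*I exactly; a numeric check of that identity (illustrative sketch):

    import numpy as np

    rng = np.random.RandomState(0)
    X, alpha = rng.randn(12, 4), 0.5
    U, s, _ = np.linalg.svd(X, full_matrices=False)
    w = 1. / (s ** 2 + alpha) - 1. / alpha
    G = U.dot(np.diag(w)).dot(U.T) + np.eye(12) / alpha
    assert np.allclose(G, np.linalg.inv(X.dot(X.T) + alpha * np.eye(12)))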
- def _errors_svd(self, alpha, y, v, U, UT_y):
- G_diag, c = self._errors_and_values_svd_helper(alpha, y, v, U, UT_y)
- return (c / G_diag) ** 2, c
-
- def _values_svd(self, alpha, y, v, U, UT_y):
- G_diag, c = self._errors_and_values_svd_helper(alpha, y, v, U, UT_y)
- return y - (c / G_diag), c
-
def fit(self, X, y, sample_weight=None):
"""Fit Ridge regression model
Parameters
----------
X : {array-like, sparse matrix}, shape = [n_samples, n_features]
- Training data
+ Training data. Will be cast to float64 if necessary
y : array-like, shape = [n_samples] or [n_samples, n_targets]
- Target values. Will be cast to X's dtype if necessary
+ Target values. Will be cast to float64 if necessary
sample_weight : float or array-like of shape [n_samples]
Sample weight
@@ -1077,66 +1389,60 @@ def fit(self, X, y, sample_weight=None):
-------
self : object
"""
- X, y = check_X_y(X, y, ['csr', 'csc', 'coo'], dtype=np.float64,
+ X, y = check_X_y(X, y, ['csr', 'csc', 'coo'],
+ dtype=[np.float64],
multi_output=True, y_numeric=True)
+
+ if np.any(self.alphas <= 0):
+ raise ValueError(
+ "alphas must be positive. Got {} containing some "
+ "negative or null value instead.".format(self.alphas))
+
if sample_weight is not None and not isinstance(sample_weight, float):
- sample_weight = check_array(sample_weight, ensure_2d=False)
+ sample_weight = check_array(sample_weight, ensure_2d=False,
+ dtype=X.dtype)
n_samples, n_features = X.shape
X, y, X_offset, y_offset, X_scale = LinearModel._preprocess_data(
X, y, self.fit_intercept, self.normalize, self.copy_X,
sample_weight=sample_weight)
- gcv_mode = self.gcv_mode
- with_sw = len(np.shape(sample_weight))
-
- if gcv_mode is None or gcv_mode == 'auto':
- if sparse.issparse(X) or n_features > n_samples or with_sw:
- gcv_mode = 'eigen'
- else:
- gcv_mode = 'svd'
- elif gcv_mode == "svd" and with_sw:
- # FIXME non-uniform sample weights not yet supported
- warnings.warn("non-uniform sample weights unsupported for svd, "
- "forcing usage of eigen")
- gcv_mode = 'eigen'
+ gcv_mode = _check_gcv_mode(X, self.gcv_mode)
if gcv_mode == 'eigen':
- _pre_compute = self._pre_compute
- _errors = self._errors
- _values = self._values
+ decompose = self._eigen_decompose_gram
+ solve = self._solve_eigen_gram
elif gcv_mode == 'svd':
- # assert n_samples >= n_features
- _pre_compute = self._pre_compute_svd
- _errors = self._errors_svd
- _values = self._values_svd
- else:
- raise ValueError('bad gcv_mode "%s"' % gcv_mode)
+ if sparse.issparse(X):
+ decompose = self._eigen_decompose_covariance
+ solve = self._solve_eigen_covariance
+ else:
+ decompose = self._svd_decompose_design_matrix
+ solve = self._solve_svd_design_matrix
if sample_weight is not None:
X, y = _rescale_data(X, y, sample_weight)
-
- centered_kernel = not sparse.issparse(X) and self.fit_intercept
-
- v, Q, QT_y = _pre_compute(X, y, centered_kernel)
- n_y = 1 if len(y.shape) == 1 else y.shape[1]
- cv_values = np.zeros((n_samples * n_y, len(self.alphas)))
- C = []
+ sqrt_sw = np.sqrt(sample_weight)
+ else:
+ sqrt_sw = np.ones(X.shape[0], dtype=X.dtype)
scorer = check_scoring(self, scoring=self.scoring, allow_none=True)
error = scorer is None
- if np.any(self.alphas < 0):
- raise ValueError("alphas cannot be negative. "
- "Got {} containing some "
- "negative value instead.".format(self.alphas))
-
+ n_y = 1 if len(y.shape) == 1 else y.shape[1]
+ cv_values = np.zeros((n_samples * n_y, len(self.alphas)),
+ dtype=X.dtype)
+ C = []
+ X_mean, *decomposition = decompose(X, y, sqrt_sw)
for i, alpha in enumerate(self.alphas):
+ G_diag, c = solve(
+ float(alpha), y, sqrt_sw, X_mean, *decomposition)
if error:
- out, c = _errors(float(alpha), y, v, Q, QT_y)
+ squared_errors = (c / G_diag) ** 2
+ cv_values[:, i] = squared_errors.ravel()
else:
- out, c = _values(float(alpha), y, v, Q, QT_y)
- cv_values[:, i] = out.ravel()
+ predictions = y - (c / G_diag)
+ cv_values[:, i] = predictions.ravel()
C.append(c)
if error:
@@ -1158,6 +1464,7 @@ def identity_estimator():
self.dual_coef_ = C[best]
self.coef_ = safe_sparse_dot(self.dual_coef_.T, X)
+ X_offset += X_mean * X_scale
self._set_intercept(X_offset, y_offset, X_scale)
if self.store_cv_values:
@@ -1189,7 +1496,8 @@ def fit(self, X, y, sample_weight=None):
Parameters
----------
X : array-like, shape = [n_samples, n_features]
- Training data
+ Training data. If using GCV, will be cast to float64
+ if necessary.
y : array-like, shape = [n_samples] or [n_samples, n_targets]
Target values. Will be cast to X's dtype if necessary
@@ -1200,8 +1508,17 @@ def fit(self, X, y, sample_weight=None):
Returns
-------
self : object
+
+ Notes
+ -----
+ When sample_weight is provided, the selected hyperparameter may depend
+ on whether we use generalized cross-validation (cv=None or cv='auto')
+ or another form of cross-validation, because only generalized
+ cross-validation takes the sample weights into account when computing
+ the validation score.
"""
- if self.cv is None:
+ cv = self.cv
+ if cv is None:
estimator = _RidgeGCV(self.alphas,
fit_intercept=self.fit_intercept,
normalize=self.normalize,
@@ -1217,9 +1534,11 @@ def fit(self, X, y, sample_weight=None):
raise ValueError("cv!=None and store_cv_values=True "
" are incompatible")
parameters = {'alpha': self.alphas}
+ solver = 'sparse_cg' if sparse.issparse(X) else 'auto'
gs = GridSearchCV(Ridge(fit_intercept=self.fit_intercept,
- normalize=self.normalize),
- parameters, cv=self.cv, scoring=self.scoring)
+ normalize=self.normalize,
+ solver=solver),
+ parameters, cv=cv, scoring=self.scoring)
gs.fit(X, y, sample_weight=sample_weight)
estimator = gs.best_estimator_
self.alpha_ = gs.best_estimator_.alpha
@@ -1249,6 +1568,7 @@ class RidgeCV(_BaseRidgeCV, RegressorMixin):
the estimates. Larger values specify stronger regularization.
Alpha corresponds to ``C^-1`` in other linear models such as
LogisticRegression or LinearSVC.
+ If using generalized cross-validation, alphas must be positive.
fit_intercept : boolean
Whether to calculate the intercept for this model. If set
@@ -1267,12 +1587,15 @@ class RidgeCV(_BaseRidgeCV, RegressorMixin):
A string (see model evaluation documentation) or
a scorer callable object / function with signature
``scorer(estimator, X, y)``.
+ If None, the negative mean squared error is used if cv is 'auto' or
+ None (i.e. when using generalized cross-validation), and r2 score otherwise.
cv : int, cross-validation generator or an iterable, optional
Determines the cross-validation splitting strategy.
Possible inputs for cv are:
- None, to use the efficient Leave-One-Out cross-validation
+ (also known as Generalized Cross-Validation).
- integer, to specify the number of folds.
- :term:`CV splitter`,
- An iterable yielding (train, test) splits as arrays of indices.
@@ -1288,15 +1611,13 @@ class RidgeCV(_BaseRidgeCV, RegressorMixin):
Flag indicating which strategy to use when performing
Generalized Cross-Validation. Options are::
- 'auto' : use svd if n_samples > n_features or when X is a sparse
- matrix, otherwise use eigen
- 'svd' : force computation via singular value decomposition of X
- (does not work for sparse matrices)
- 'eigen' : force computation via eigendecomposition of X^T X
+ 'auto' : use 'svd' if n_samples > n_features, otherwise use 'eigen'
+ 'svd' : force computation via singular value decomposition of X when
+ X is dense and via eigendecomposition of X^T.X when X is sparse.
+ 'eigen' : force computation via eigendecomposition of X.X^T
The 'auto' mode is the default and is intended to pick the cheaper
- option of the two depending upon the shape and format of the training
- data.
+ option of the two depending on the shape of the training data.
store_cv_values : boolean, default=False
Flag indicating if the cross-validation values corresponding to
@@ -1463,7 +1784,8 @@ def fit(self, X, y, sample_weight=None):
----------
X : array-like, shape (n_samples, n_features)
Training vectors, where n_samples is the number of samples
- and n_features is the number of features.
+ and n_features is the number of features. When using GCV,
+ will be cast to float64 if necessary.
y : array-like, shape (n_samples,)
Target values. Will be cast to X's dtype if necessary
diff --git a/sklearn/linear_model/tests/test_ridge.py b/sklearn/linear_model/tests/test_ridge.py
index 7bfb617d4beff..fa7f0606b1010 100644
--- a/sklearn/linear_model/tests/test_ridge.py
+++ b/sklearn/linear_model/tests/test_ridge.py
@@ -6,8 +6,8 @@
import pytest
from sklearn.utils.testing import assert_almost_equal
-from sklearn.utils.testing import assert_array_almost_equal
from sklearn.utils.testing import assert_allclose
+from sklearn.utils.testing import assert_array_almost_equal
from sklearn.utils.testing import assert_equal
from sklearn.utils.testing import assert_array_equal
from sklearn.utils.testing import assert_greater
@@ -33,10 +33,12 @@
from sklearn.linear_model.ridge import RidgeClassifierCV
from sklearn.linear_model.ridge import _solve_cholesky
from sklearn.linear_model.ridge import _solve_cholesky_kernel
+from sklearn.linear_model.ridge import _check_gcv_mode
+from sklearn.linear_model.ridge import _X_operator
from sklearn.datasets import make_regression
from sklearn.model_selection import GridSearchCV
-from sklearn.model_selection import KFold
+from sklearn.model_selection import KFold, GroupKFold, cross_val_predict
from sklearn.utils import check_random_state
from sklearn.datasets import make_multilabel_classification
@@ -311,6 +313,213 @@ def test_ridge_individual_penalties():
assert_raises(ValueError, ridge.fit, X, y)
+@pytest.mark.parametrize('n_col', [(), (1,), (3,)])
+def test_x_operator(n_col):
+ rng = np.random.RandomState(0)
+ X = rng.randn(11, 8)
+ X_m = rng.randn(8)
+ sqrt_sw = rng.randn(len(X))
+ Y = rng.randn(11, *n_col)
+ A = rng.randn(9, *n_col)
+ operator = _X_operator(sp.csr_matrix(X), X_m, sqrt_sw)
+ reference_operator = np.hstack(
+ [X - sqrt_sw[:, None] * X_m, sqrt_sw[:, None]])
+ assert_allclose(reference_operator.dot(A), operator.dot(A))
+ assert_allclose(reference_operator.T.dot(Y), operator.T.dot(Y))
+
+
+@pytest.mark.parametrize('shape', [(10, 1), (13, 9), (3, 7), (2, 2), (20, 20)])
+@pytest.mark.parametrize('uniform_weights', [True, False])
+def test_compute_gram(shape, uniform_weights):
+ rng = np.random.RandomState(0)
+ X = rng.randn(*shape)
+ if uniform_weights:
+ sw = np.ones(X.shape[0])
+ else:
+ sw = rng.chisquare(1, shape[0])
+ sqrt_sw = np.sqrt(sw)
+ X_mean = np.average(X, axis=0, weights=sw)
+ X_centered = (X - X_mean) * sqrt_sw[:, None]
+ true_gram = X_centered.dot(X_centered.T)
+ X_sparse = sp.csr_matrix(X * sqrt_sw[:, None])
+ gcv = _RidgeGCV(fit_intercept=True)
+ computed_gram, computed_mean = gcv._compute_gram(X_sparse, sqrt_sw)
+ assert_allclose(X_mean, computed_mean)
+ assert_allclose(true_gram, computed_gram)
+
+
+@pytest.mark.parametrize('shape', [(10, 1), (13, 9), (3, 7), (2, 2), (20, 20)])
+@pytest.mark.parametrize('uniform_weights', [True, False])
+def test_compute_covariance(shape, uniform_weights):
+ rng = np.random.RandomState(0)
+ X = rng.randn(*shape)
+ if uniform_weights:
+ sw = np.ones(X.shape[0])
+ else:
+ sw = rng.chisquare(1, shape[0])
+ sqrt_sw = np.sqrt(sw)
+ X_mean = np.average(X, axis=0, weights=sw)
+ X_centered = (X - X_mean) * sqrt_sw[:, None]
+ true_covariance = X_centered.T.dot(X_centered)
+ X_sparse = sp.csr_matrix(X * sqrt_sw[:, None])
+ gcv = _RidgeGCV(fit_intercept=True)
+ computed_cov, computed_mean = gcv._compute_covariance(X_sparse, sqrt_sw)
+ assert_allclose(X_mean, computed_mean)
+ assert_allclose(true_covariance, computed_cov)
+
+
+def _make_sparse_offset_regression(
+ n_samples=100, n_features=100, proportion_nonzero=.5,
+ n_informative=10, n_targets=1, bias=13., X_offset=30.,
+ noise=30., shuffle=True, coef=False, random_state=None):
+ X, y, c = make_regression(
+ n_samples=n_samples, n_features=n_features,
+ n_informative=n_informative, n_targets=n_targets, bias=bias,
+ noise=noise, shuffle=shuffle,
+ coef=True, random_state=random_state)
+ if n_features == 1:
+ c = np.asarray([c])
+ X += X_offset
+ mask = np.random.RandomState(random_state).binomial(
+ 1, proportion_nonzero, X.shape) > 0
+ removed_X = X.copy()
+ X[~mask] = 0.
+ removed_X[mask] = 0.
+ y -= removed_X.dot(c)
+ if n_features == 1:
+ c = c[0]
+ if coef:
+ return X, y, c
+ return X, y
+
+
+@pytest.mark.parametrize('gcv_mode', ['svd', 'eigen'])
+@pytest.mark.parametrize('X_constructor', [np.asarray, sp.csr_matrix])
+@pytest.mark.parametrize('X_shape', [(11, 8), (11, 20)])
+@pytest.mark.parametrize('fit_intercept', [True, False])
+@pytest.mark.parametrize(
+ 'y_shape, normalize, noise',
+ [
+ ((11,), True, 1.),
+ ((11, 1), False, 30.),
+ ((11, 3), False, 150.),
+ ]
+)
+def test_ridge_gcv_vs_ridge_loo_cv(
+ gcv_mode, X_constructor, X_shape, y_shape,
+ fit_intercept, normalize, noise):
+ n_samples, n_features = X_shape
+ n_targets = y_shape[-1] if len(y_shape) == 2 else 1
+ X, y = _make_sparse_offset_regression(
+ n_samples=n_samples, n_features=n_features, n_targets=n_targets,
+ random_state=0, shuffle=False, noise=noise, n_informative=5
+ )
+ y = y.reshape(y_shape)
+
+ alphas = [1e-3, .1, 1., 10., 1e3]
+ loo_ridge = RidgeCV(cv=n_samples, fit_intercept=fit_intercept,
+ alphas=alphas, scoring='neg_mean_squared_error',
+ normalize=normalize)
+ gcv_ridge = RidgeCV(gcv_mode=gcv_mode, fit_intercept=fit_intercept,
+ alphas=alphas, normalize=normalize)
+
+ loo_ridge.fit(X, y)
+
+ X_gcv = X_constructor(X)
+ gcv_ridge.fit(X_gcv, y)
+
+ assert gcv_ridge.alpha_ == pytest.approx(loo_ridge.alpha_)
+ assert_allclose(gcv_ridge.coef_, loo_ridge.coef_, rtol=1e-3)
+ assert_allclose(gcv_ridge.intercept_, loo_ridge.intercept_, rtol=1e-3)
+
+
+@pytest.mark.parametrize('gcv_mode', ['svd', 'eigen'])
+@pytest.mark.parametrize('X_constructor', [np.asarray, sp.csr_matrix])
+@pytest.mark.parametrize('n_features', [8, 20])
+@pytest.mark.parametrize('y_shape, fit_intercept, noise',
+ [((11,), True, 1.),
+ ((11, 1), True, 20.),
+ ((11, 3), True, 150.),
+ ((11, 3), False, 30.)])
+def test_ridge_gcv_sample_weights(
+ gcv_mode, X_constructor, fit_intercept, n_features, y_shape, noise):
+ alphas = [1e-3, .1, 1., 10., 1e3]
+ rng = np.random.RandomState(0)
+ n_targets = y_shape[-1] if len(y_shape) == 2 else 1
+ X, y = _make_sparse_offset_regression(
+ n_samples=11, n_features=n_features, n_targets=n_targets,
+ random_state=0, shuffle=False, noise=noise)
+ y = y.reshape(y_shape)
+
+ sample_weight = 3 * rng.randn(len(X))
+ sample_weight = (sample_weight - sample_weight.min() + 1).astype(int)
+ indices = np.repeat(np.arange(X.shape[0]), sample_weight)
+ sample_weight = sample_weight.astype(float)
+ X_tiled, y_tiled = X[indices], y[indices]
+
+ cv = GroupKFold(n_splits=X.shape[0])
+ splits = cv.split(X_tiled, y_tiled, groups=indices)
+ kfold = RidgeCV(
+ alphas=alphas, cv=splits, scoring='neg_mean_squared_error',
+ fit_intercept=fit_intercept)
+ # ignore warning from GridSearchCV: DeprecationWarning: The default of the
+ # `iid` parameter will change from True to False in version 0.22 and will
+ # be removed in 0.24
+ with ignore_warnings(category=DeprecationWarning):
+ kfold.fit(X_tiled, y_tiled)
+
+ ridge_reg = Ridge(alpha=kfold.alpha_, fit_intercept=fit_intercept)
+ splits = cv.split(X_tiled, y_tiled, groups=indices)
+ predictions = cross_val_predict(ridge_reg, X_tiled, y_tiled, cv=splits)
+ kfold_errors = (y_tiled - predictions)**2
+ kfold_errors = [
+ np.sum(kfold_errors[indices == i], axis=0) for
+ i in np.arange(X.shape[0])]
+ kfold_errors = np.asarray(kfold_errors)
+
+ X_gcv = X_constructor(X)
+ gcv_ridge = RidgeCV(
+ alphas=alphas, store_cv_values=True,
+ gcv_mode=gcv_mode, fit_intercept=fit_intercept)
+ gcv_ridge.fit(X_gcv, y, sample_weight=sample_weight)
+ if len(y_shape) == 2:
+ gcv_errors = gcv_ridge.cv_values_[:, :, alphas.index(kfold.alpha_)]
+ else:
+ gcv_errors = gcv_ridge.cv_values_[:, alphas.index(kfold.alpha_)]
+
+ assert kfold.alpha_ == pytest.approx(gcv_ridge.alpha_)
+ assert_allclose(gcv_errors, kfold_errors, rtol=1e-3)
+ assert_allclose(gcv_ridge.coef_, kfold.coef_, rtol=1e-3)
+ assert_allclose(gcv_ridge.intercept_, kfold.intercept_, rtol=1e-3)
+
+
+@pytest.mark.parametrize('mode', [True, 1, 5, 'bad', 'gcv'])
+def test_check_gcv_mode_error(mode):
+ X, y = make_regression(n_samples=5, n_features=2)
+ gcv = RidgeCV(gcv_mode=mode)
+ with pytest.raises(ValueError, match="Unknown value for 'gcv_mode'"):
+ gcv.fit(X, y)
+ with pytest.raises(ValueError, match="Unknown value for 'gcv_mode'"):
+ _check_gcv_mode(X, mode)
+
+
+@pytest.mark.parametrize("sparse", [True, False])
+@pytest.mark.parametrize(
+ 'mode, mode_n_greater_than_p, mode_p_greater_than_n',
+ [(None, 'svd', 'eigen'),
+ ('auto', 'svd', 'eigen'),
+ ('eigen', 'eigen', 'eigen'),
+ ('svd', 'svd', 'svd')]
+)
+def test_check_gcv_mode_choice(sparse, mode, mode_n_greater_than_p,
+ mode_p_greater_than_n):
+ X, _ = make_regression(n_samples=5, n_features=2)
+ if sparse:
+ X = sp.csr_matrix(X)
+ assert _check_gcv_mode(X, mode) == mode_n_greater_than_p
+ assert _check_gcv_mode(X.T, mode) == mode_p_greater_than_n
+
+
def _test_ridge_loo(filter_):
# test that can work with both dense or sparse matrices
n_samples = X_diabetes.shape[0]
@@ -318,46 +527,7 @@ def _test_ridge_loo(filter_):
ret = []
fit_intercept = filter_ == DENSE_FILTER
- if fit_intercept:
- X_diabetes_ = X_diabetes - X_diabetes.mean(0)
- else:
- X_diabetes_ = X_diabetes
ridge_gcv = _RidgeGCV(fit_intercept=fit_intercept)
- ridge = Ridge(alpha=1.0, fit_intercept=fit_intercept)
-
- # because fit_intercept is applied
-
- # generalized cross-validation (efficient leave-one-out)
- decomp = ridge_gcv._pre_compute(X_diabetes_, y_diabetes, fit_intercept)
- errors, c = ridge_gcv._errors(1.0, y_diabetes, *decomp)
- values, c = ridge_gcv._values(1.0, y_diabetes, *decomp)
-
- # brute-force leave-one-out: remove one example at a time
- errors2 = []
- values2 = []
- for i in range(n_samples):
- sel = np.arange(n_samples) != i
- X_new = X_diabetes_[sel]
- y_new = y_diabetes[sel]
- ridge.fit(X_new, y_new)
- value = ridge.predict([X_diabetes_[i]])[0]
- error = (y_diabetes[i] - value) ** 2
- errors2.append(error)
- values2.append(value)
-
- # check that efficient and brute-force LOO give same results
- assert_almost_equal(errors, errors2)
- assert_almost_equal(values, values2)
-
- # generalized cross-validation (efficient leave-one-out,
- # SVD variation)
- decomp = ridge_gcv._pre_compute_svd(X_diabetes_, y_diabetes, fit_intercept)
- errors3, c = ridge_gcv._errors_svd(ridge.alpha, y_diabetes, *decomp)
- values3, c = ridge_gcv._values_svd(ridge.alpha, y_diabetes, *decomp)
-
- # check that efficient and SVD efficient LOO give same results
- assert_almost_equal(errors, errors3)
- assert_almost_equal(values, values3)
# check best alpha
ridge_gcv.fit(filter_(X_diabetes), y_diabetes)
@@ -369,25 +539,26 @@ def _test_ridge_loo(filter_):
scoring = make_scorer(mean_squared_error, greater_is_better=False)
ridge_gcv2 = RidgeCV(fit_intercept=False, scoring=scoring)
f(ridge_gcv2.fit)(filter_(X_diabetes), y_diabetes)
- assert_equal(ridge_gcv2.alpha_, alpha_)
+ assert ridge_gcv2.alpha_ == pytest.approx(alpha_)
# check that we get same best alpha with custom score_func
func = lambda x, y: -mean_squared_error(x, y)
scoring = make_scorer(func)
ridge_gcv3 = RidgeCV(fit_intercept=False, scoring=scoring)
f(ridge_gcv3.fit)(filter_(X_diabetes), y_diabetes)
- assert_equal(ridge_gcv3.alpha_, alpha_)
+ assert ridge_gcv3.alpha_ == pytest.approx(alpha_)
# check that we get same best alpha with a scorer
scorer = get_scorer('neg_mean_squared_error')
ridge_gcv4 = RidgeCV(fit_intercept=False, scoring=scorer)
ridge_gcv4.fit(filter_(X_diabetes), y_diabetes)
- assert_equal(ridge_gcv4.alpha_, alpha_)
+ assert ridge_gcv4.alpha_ == pytest.approx(alpha_)
# check that we get same best alpha with sample weights
- ridge_gcv.fit(filter_(X_diabetes), y_diabetes,
- sample_weight=np.ones(n_samples))
- assert_equal(ridge_gcv.alpha_, alpha_)
+ if filter_ == DENSE_FILTER:
+ ridge_gcv.fit(filter_(X_diabetes), y_diabetes,
+ sample_weight=np.ones(n_samples))
+ assert ridge_gcv.alpha_ == pytest.approx(alpha_)
# simulate several responses
Y = np.vstack((y_diabetes, y_diabetes)).T
@@ -397,8 +568,8 @@ def _test_ridge_loo(filter_):
ridge_gcv.fit(filter_(X_diabetes), y_diabetes)
y_pred = ridge_gcv.predict(filter_(X_diabetes))
- assert_array_almost_equal(np.vstack((y_pred, y_pred)).T,
- Y_pred, decimal=5)
+ assert_allclose(np.vstack((y_pred, y_pred)).T,
+ Y_pred, rtol=1e-5)
return ret
@@ -407,7 +578,7 @@ def _test_ridge_cv_normalize(filter_):
ridge_cv = RidgeCV(normalize=True, cv=3)
ridge_cv.fit(filter_(10. * X_diabetes), y_diabetes)
- gs = GridSearchCV(Ridge(normalize=True), cv=3,
+ gs = GridSearchCV(Ridge(normalize=True, solver='sparse_cg'), cv=3,
param_grid={'alpha': ridge_cv.alphas})
gs.fit(filter_(10. * X_diabetes), y_diabetes)
assert_equal(gs.best_estimator_.alpha, ridge_cv.alpha_)
@@ -501,12 +672,6 @@ def test_dense_sparse(test_func):
check_dense_sparse(test_func)
-def test_ridge_cv_sparse_svd():
- X = sp.csr_matrix(X_diabetes)
- ridge = RidgeCV(gcv_mode="svd")
- assert_raises(TypeError, ridge.fit, X)
-
-
def test_ridge_sparse_svd():
X = sp.csc_matrix(rng.rand(100, 10))
y = rng.rand(100)
@@ -620,6 +785,10 @@ def test_ridgecv_store_cv_values():
r.fit(x, y)
assert r.cv_values_.shape == (n_samples, n_targets, n_alphas)
+ r = RidgeCV(cv=3, store_cv_values=True)
+ assert_raises_regex(ValueError, 'cv!=None and store_cv_values',
+ r.fit, x, y)
+
@pytest.mark.filterwarnings('ignore: The default value of cv') # 0.22
def test_ridge_classifier_cv_store_cv_values():
@@ -762,13 +931,13 @@ def test_ridgecv_negative_alphas():
# Negative integers
ridge = RidgeCV(alphas=(-1, -10, -100))
assert_raises_regex(ValueError,
- "alphas cannot be negative.",
+ "alphas must be positive",
ridge.fit, X, y)
# Negative floats
ridge = RidgeCV(alphas=(-0.1, -1.0, -10.0))
assert_raises_regex(ValueError,
- "alphas cannot be negative.",
+ "alphas must be positive",
ridge.fit, X, y)
@@ -887,54 +1056,14 @@ def test_ridge_regression_check_arguments_validity(return_intercept,
assert_allclose(out, true_coefs, rtol=0, atol=atol)
-def test_errors_and_values_helper():
- ridgecv = _RidgeGCV()
- rng = check_random_state(42)
- alpha = 1.
- n = 5
- y = rng.randn(n)
- v = rng.randn(n)
- Q = rng.randn(len(v), len(v))
- QT_y = Q.T.dot(y)
- G_diag, c = ridgecv._errors_and_values_helper(alpha, y, v, Q, QT_y)
-
- # test that helper function behaves as expected
- out, c_ = ridgecv._errors(alpha, y, v, Q, QT_y)
- np.testing.assert_array_equal(out, (c / G_diag) ** 2)
- np.testing.assert_array_equal(c, c)
-
- out, c_ = ridgecv._values(alpha, y, v, Q, QT_y)
- np.testing.assert_array_equal(out, y - (c / G_diag))
- np.testing.assert_array_equal(c_, c)
-
-
-def test_errors_and_values_svd_helper():
- ridgecv = _RidgeGCV()
- rng = check_random_state(42)
- alpha = 1.
- for n, p in zip((5, 10), (12, 6)):
- y = rng.randn(n)
- v = rng.randn(p)
- U = rng.randn(n, p)
- UT_y = U.T.dot(y)
- G_diag, c = ridgecv._errors_and_values_svd_helper(alpha, y, v, U, UT_y)
-
- # test that helper function behaves as expected
- out, c_ = ridgecv._errors_svd(alpha, y, v, U, UT_y)
- np.testing.assert_array_equal(out, (c / G_diag) ** 2)
- np.testing.assert_array_equal(c, c)
-
- out, c_ = ridgecv._values_svd(alpha, y, v, U, UT_y)
- np.testing.assert_array_equal(out, y - (c / G_diag))
- np.testing.assert_array_equal(c_, c)
-
-
def test_ridge_classifier_no_support_multilabel():
X, y = make_multilabel_classification(n_samples=10, random_state=0)
assert_raises(ValueError, RidgeClassifier().fit, X, y)
-def test_dtype_match():
+@pytest.mark.parametrize(
+ "solver", ["svd", "sparse_cg", "cholesky", "lsqr", "sag", "saga"])
+def test_dtype_match(solver):
rng = np.random.RandomState(0)
alpha = 1.0
@@ -944,25 +1073,22 @@ def test_dtype_match():
X_32 = X_64.astype(np.float32)
y_32 = y_64.astype(np.float32)
- solvers = ["svd", "sparse_cg", "cholesky", "lsqr"]
- for solver in solvers:
-
- # Check type consistency 32bits
- ridge_32 = Ridge(alpha=alpha, solver=solver)
- ridge_32.fit(X_32, y_32)
- coef_32 = ridge_32.coef_
+ # Check type consistency 32bits
+ ridge_32 = Ridge(alpha=alpha, solver=solver, max_iter=500, tol=1e-10)
+ ridge_32.fit(X_32, y_32)
+ coef_32 = ridge_32.coef_
- # Check type consistency 64 bits
- ridge_64 = Ridge(alpha=alpha, solver=solver)
- ridge_64.fit(X_64, y_64)
- coef_64 = ridge_64.coef_
+ # Check type consistency 64 bits
+ ridge_64 = Ridge(alpha=alpha, solver=solver, max_iter=500, tol=1e-10)
+ ridge_64.fit(X_64, y_64)
+ coef_64 = ridge_64.coef_
- # Do the actual checks at once for easier debug
- assert coef_32.dtype == X_32.dtype
- assert coef_64.dtype == X_64.dtype
- assert ridge_32.predict(X_32).dtype == X_32.dtype
- assert ridge_64.predict(X_64).dtype == X_64.dtype
- assert_almost_equal(ridge_32.coef_, ridge_64.coef_, decimal=5)
+ # Do the actual checks at once for easier debug
+ assert coef_32.dtype == X_32.dtype
+ assert coef_64.dtype == X_64.dtype
+ assert ridge_32.predict(X_32).dtype == X_32.dtype
+ assert ridge_64.predict(X_64).dtype == X_64.dtype
+ assert_allclose(ridge_32.coef_, ridge_64.coef_, rtol=1e-4)
def test_dtype_match_cholesky():
@@ -993,3 +1119,34 @@ def test_dtype_match_cholesky():
assert ridge_32.predict(X_32).dtype == X_32.dtype
assert ridge_64.predict(X_64).dtype == X_64.dtype
assert_almost_equal(ridge_32.coef_, ridge_64.coef_, decimal=5)
+
+
+@pytest.mark.parametrize(
+ 'solver', ['svd', 'cholesky', 'lsqr', 'sparse_cg', 'sag', 'saga'])
+@pytest.mark.parametrize('seed', range(1))
+def test_ridge_regression_dtype_stability(solver, seed):
+ random_state = np.random.RandomState(seed)
+ n_samples, n_features = 6, 5
+ X = random_state.randn(n_samples, n_features)
+ coef = random_state.randn(n_features)
+ y = np.dot(X, coef) + 0.01 * random_state.randn(n_samples)
+ alpha = 1.0
+ results = dict()
+ # XXX: Sparse CG seems to be far less numerically stable than the
+ # others, maybe we should not enable float32 for this one.
+ atol = 1e-3 if solver == "sparse_cg" else 1e-5
+ for current_dtype in (np.float32, np.float64):
+ results[current_dtype] = ridge_regression(X.astype(current_dtype),
+ y.astype(current_dtype),
+ alpha=alpha,
+ solver=solver,
+ random_state=random_state,
+ sample_weight=None,
+ max_iter=500,
+ tol=1e-10,
+ return_n_iter=False,
+ return_intercept=False)
+
+ assert results[np.float32].dtype == np.float32
+ assert results[np.float64].dtype == np.float64
+ assert_allclose(results[np.float32], results[np.float64], atol=atol)
diff --git a/sklearn/manifold/isomap.py b/sklearn/manifold/isomap.py
index bbb83a5ed81f8..88c979c0e1fdb 100644
--- a/sklearn/manifold/isomap.py
+++ b/sklearn/manifold/isomap.py
@@ -145,7 +145,7 @@ def reconstruction_error(self):
reconstruction_error : float
Notes
- -------
+ -----
The cost function of an isomap embedding is
``E = frobenius_norm[K(D) - K(D_fit)] / n_samples``
diff --git a/sklearn/manifold/spectral_embedding_.py b/sklearn/manifold/spectral_embedding_.py
index e387ecec0f4d5..a6d5af54f9bc4 100644
--- a/sklearn/manifold/spectral_embedding_.py
+++ b/sklearn/manifold/spectral_embedding_.py
@@ -348,7 +348,7 @@ class SpectralEmbedding(BaseEstimator):
Read more in the :ref:`User Guide `.
Parameters
- -----------
+ ----------
n_components : integer, default: 2
The dimension of the projected subspace.
diff --git a/sklearn/metrics/classification.py b/sklearn/metrics/classification.py
index 9eae62a28045e..d1337bdc61aed 100644
--- a/sklearn/metrics/classification.py
+++ b/sklearn/metrics/classification.py
@@ -1989,16 +1989,18 @@ def hamming_loss(y_true, y_pred, labels=None, sample_weight=None):
-----
In multiclass classification, the Hamming loss corresponds to the Hamming
distance between ``y_true`` and ``y_pred`` which is equivalent to the
- subset ``zero_one_loss`` function.
+ subset ``zero_one_loss`` function, when the `normalize` parameter is
+ set to True.
In multilabel classification, the Hamming loss is different from the
subset zero-one loss. The zero-one loss considers the entire set of labels
for a given sample incorrect if it does not entirely match the true set of
- labels. Hamming loss is more forgiving in that it penalizes the individual
- labels.
+ labels. Hamming loss is more forgiving in that it penalizes only the
+ individual labels.
- The Hamming loss is upperbounded by the subset zero-one loss. When
- normalized over samples, the Hamming loss is always between 0 and 1.
+ The Hamming loss is upper bounded by the subset zero-one loss, when
+ the `normalize` parameter is set to True. It is always between 0 and
+ 1, lower being better.
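For instance, the two losses coincide in the multiclass case but diverge in the multilabel case (a small illustrative example using the public metrics, with the default normalize=True):

    import numpy as np
    from sklearn.metrics import hamming_loss, zero_one_loss

    # multiclass: Hamming loss equals the normalized subset zero-one loss
    y_true, y_pred = [1, 2, 3, 4], [2, 2, 3, 4]
    assert hamming_loss(y_true, y_pred) == zero_one_loss(y_true, y_pred)

    # multilabel: only the individual wrong labels are counted
    Y_true = np.array([[0, 1], [1, 1]])
    Y_pred = np.array([[0, 0], [1, 1]])
    assert hamming_loss(Y_true, Y_pred) == 0.25   # 1 wrong label out of 4
    assert zero_one_loss(Y_true, Y_pred) == 0.5   # 1 wrong row out of 2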
References
----------
diff --git a/sklearn/metrics/pairwise.py b/sklearn/metrics/pairwise.py
index a32d6aa6efbcc..9e377f3d4c07e 100644
--- a/sklearn/metrics/pairwise.py
+++ b/sklearn/metrics/pairwise.py
@@ -749,7 +749,7 @@ def paired_cosine_distances(X, Y):
distances : ndarray, shape (n_samples, )
Notes
- ------
+ -----
The cosine distance is equivalent to the half the squared
euclidean distance if each sample is normalized to unit norm
"""
@@ -1169,17 +1169,17 @@ def distance_metrics():
The valid distance metrics, and the function they map to, are:
- ============ ====================================
- metric Function
- ============ ====================================
- 'cityblock' metrics.pairwise.manhattan_distances
- 'cosine' metrics.pairwise.cosine_distances
- 'euclidean' metrics.pairwise.euclidean_distances
- 'haversine' metrics.pairwise.haversine_distances
- 'l1' metrics.pairwise.manhattan_distances
- 'l2' metrics.pairwise.euclidean_distances
- 'manhattan' metrics.pairwise.manhattan_distances
- ============ ====================================
+ ============ ====================================
+ metric Function
+ ============ ====================================
+ 'cityblock' metrics.pairwise.manhattan_distances
+ 'cosine' metrics.pairwise.cosine_distances
+ 'euclidean' metrics.pairwise.euclidean_distances
+ 'haversine' metrics.pairwise.haversine_distances
+ 'l1' metrics.pairwise.manhattan_distances
+ 'l2' metrics.pairwise.euclidean_distances
+ 'manhattan' metrics.pairwise.manhattan_distances
+ ============ ====================================
Read more in the :ref:`User Guide `.
diff --git a/sklearn/mixture/bayesian_mixture.py b/sklearn/mixture/bayesian_mixture.py
index 6f13f63e3fcd9..88c0ab66ae20a 100644
--- a/sklearn/mixture/bayesian_mixture.py
+++ b/sklearn/mixture/bayesian_mixture.py
@@ -140,7 +140,7 @@ class BayesianGaussianMixture(BaseMixture):
mean_precision_prior : float | None, optional.
The precision prior on the mean distribution (Gaussian).
- Controls the extend to where means can be placed. Smaller
+ Controls the extent to which means can be placed. Larger
values concentrate the means of each clusters around `mean_prior`.
The value of the parameter must be greater than 0.
If it is None, it's set to 1.
@@ -260,7 +260,7 @@ class BayesianGaussianMixture(BaseMixture):
mean_precision_prior : float
The precision prior on the mean distribution (Gaussian).
Controls the extend to where means can be placed.
- Smaller values concentrate the means of each clusters around
+ Larger values concentrate the means of each cluster around
`mean_prior`.
mean_precision_ : array-like, shape (n_components,)
diff --git a/sklearn/model_selection/_search.py b/sklearn/model_selection/_search.py
index 68f0e296b077c..6fe2a8edfa12a 100644
--- a/sklearn/model_selection/_search.py
+++ b/sklearn/model_selection/_search.py
@@ -447,7 +447,7 @@ def predict(self, X):
``predict``.
Parameters
- -----------
+ ----------
X : indexable, length n_samples
Must fulfill the input assumptions of the
underlying estimator.
@@ -464,7 +464,7 @@ def predict_proba(self, X):
``predict_proba``.
Parameters
- -----------
+ ----------
X : indexable, length n_samples
Must fulfill the input assumptions of the
underlying estimator.
@@ -481,7 +481,7 @@ def predict_log_proba(self, X):
``predict_log_proba``.
Parameters
- -----------
+ ----------
X : indexable, length n_samples
Must fulfill the input assumptions of the
underlying estimator.
@@ -498,7 +498,7 @@ def decision_function(self, X):
``decision_function``.
Parameters
- -----------
+ ----------
X : indexable, length n_samples
Must fulfill the input assumptions of the
underlying estimator.
@@ -515,7 +515,7 @@ def transform(self, X):
``refit=True``.
Parameters
- -----------
+ ----------
X : indexable, length n_samples
Must fulfill the input assumptions of the
underlying estimator.
@@ -532,7 +532,7 @@ def inverse_transform(self, Xt):
``inverse_transform`` and ``refit=True``.
Parameters
- -----------
+ ----------
Xt : indexable, length n_samples
Must fulfill the input assumptions of the
underlying estimator.
@@ -1103,7 +1103,7 @@ class GridSearchCV(BaseSearchCV):
This is present only if ``refit`` is not False.
Notes
- ------
+ -----
The parameters selected are those that maximize the score of the left out
data, unless an explicit score is passed in which case it is used instead.
diff --git a/sklearn/model_selection/_split.py b/sklearn/model_selection/_split.py
index 17fb16ae8340e..24fefef5216fe 100644
--- a/sklearn/model_selection/_split.py
+++ b/sklearn/model_selection/_split.py
@@ -853,7 +853,7 @@ class LeaveOneGroupOut(BaseCrossValidator):
>>> logo = LeaveOneGroupOut()
>>> logo.get_n_splits(X, y, groups)
2
- >>> logo.get_n_splits(groups=groups) # 'groups' is always required
+ >>> logo.get_n_splits(groups=groups) # 'groups' is always required
2
>>> print(logo)
LeaveOneGroupOut()
diff --git a/sklearn/model_selection/_validation.py b/sklearn/model_selection/_validation.py
index 2f5505fff01c6..3dc8b0441a64a 100644
--- a/sklearn/model_selection/_validation.py
+++ b/sklearn/model_selection/_validation.py
@@ -1118,7 +1118,7 @@ def learning_curve(estimator, X, y, groups=None,
train_sizes=np.linspace(0.1, 1.0, 5), cv='warn',
scoring=None, exploit_incremental_learning=False,
n_jobs=None, pre_dispatch="all", verbose=0, shuffle=False,
- random_state=None, error_score='raise-deprecating'):
+ random_state=None, error_score='raise-deprecating'):
"""Learning curve.
Determines cross-validated training and test scores for different training
diff --git a/sklearn/neighbors/base.py b/sklearn/neighbors/base.py
index 21f272c518f51..00c4b8636a17c 100644
--- a/sklearn/neighbors/base.py
+++ b/sklearn/neighbors/base.py
@@ -62,14 +62,14 @@ def _get_weights(dist, weights):
"""Get the weights from an array of distances and a parameter ``weights``
Parameters
- ===========
+ ----------
dist : ndarray
The input distances
weights : {'uniform', 'distance' or a callable}
The kind of weighting used
Returns
- ========
+ -------
weights_arr : array of the same shape as ``dist``
if ``weights == 'uniform'``, then returns None
"""
diff --git a/sklearn/neighbors/lof.py b/sklearn/neighbors/lof.py
index 5ad2f7e9b7b1d..472710ea51bb2 100644
--- a/sklearn/neighbors/lof.py
+++ b/sklearn/neighbors/lof.py
@@ -401,7 +401,7 @@ def _decision_function(self, X):
def score_samples(self):
"""Opposite of the Local Outlier Factor of X.
- It is the opposite as as bigger is better, i.e. large values correspond
+ It is the opposite as bigger is better, i.e. large values correspond
to inliers.
Only available for novelty detection (when novelty is set to True).
@@ -437,7 +437,7 @@ def score_samples(self):
def _score_samples(self, X):
"""Opposite of the Local Outlier Factor of X.
- It is the opposite as as bigger is better, i.e. large values correspond
+ It is the opposite as bigger is better, i.e. large values correspond
to inliers.
Only available for novelty detection (when novelty is set to True).
@@ -500,5 +500,5 @@ def _local_reachability_density(self, distances_X, neighbors_indices):
self.n_neighbors_ - 1]
reach_dist_array = np.maximum(distances_X, dist_k)
- # 1e-10 to avoid `nan' when nb of duplicates > n_neighbors_:
+ # 1e-10 to avoid `nan' when nb of duplicates > n_neighbors_:
return 1. / (np.mean(reach_dist_array, axis=1) + 1e-10)
diff --git a/sklearn/neural_network/_stochastic_optimizers.py b/sklearn/neural_network/_stochastic_optimizers.py
index 8f19c7b488acc..3e49e94de8bd1 100644
--- a/sklearn/neural_network/_stochastic_optimizers.py
+++ b/sklearn/neural_network/_stochastic_optimizers.py
@@ -1,7 +1,7 @@
"""Stochastic optimization methods for MLP
"""
-# Authors: Jiyuan Qian
+# Authors: Jiyuan Qian
# License: BSD 3 clause
import numpy as np
diff --git a/sklearn/pipeline.py b/sklearn/pipeline.py
index 1fcdadaabb6c0..9a51fefd144ac 100644
--- a/sklearn/pipeline.py
+++ b/sklearn/pipeline.py
@@ -95,12 +95,12 @@ class Pipeline(_BaseComposition):
>>> # For instance, fit using a k of 10 in the SelectKBest
>>> # and a parameter 'C' of the svm
>>> anova_svm.set_params(anova__k=10, svc__C=.1).fit(X, y)
- ... # doctest: +ELLIPSIS, +NORMALIZE_WHITESPACE
+ ... # doctest: +ELLIPSIS, +NORMALIZE_WHITESPACE
Pipeline(memory=None,
steps=[('anova', SelectKBest(...)),
('svc', SVC(...))], verbose=False)
>>> prediction = anova_svm.predict(X)
- >>> anova_svm.score(X, y) # doctest: +ELLIPSIS
+ >>> anova_svm.score(X, y) # doctest: +ELLIPSIS
0.83
>>> # getting the selected features chosen by anova_filter
>>> anova_svm['anova'].get_support()
@@ -671,7 +671,7 @@ def make_pipeline(*steps, **kwargs):
>>> from sklearn.naive_bayes import GaussianNB
>>> from sklearn.preprocessing import StandardScaler
>>> make_pipeline(StandardScaler(), GaussianNB(priors=None))
- ... # doctest: +NORMALIZE_WHITESPACE
+ ... # doctest: +NORMALIZE_WHITESPACE
Pipeline(memory=None,
steps=[('standardscaler',
StandardScaler(copy=True, with_mean=True, with_std=True)),
@@ -782,7 +782,7 @@ class FeatureUnion(_BaseComposition, TransformerMixin):
>>> union = FeatureUnion([("pca", PCA(n_components=1)),
... ("svd", TruncatedSVD(n_components=2))])
>>> X = [[0., 1., 3], [2., 2., 5]]
- >>> union.fit_transform(X) # doctest: +NORMALIZE_WHITESPACE +ELLIPSIS
+ >>> union.fit_transform(X) # doctest: +NORMALIZE_WHITESPACE +ELLIPSIS
array([[ 1.5 , 3.0..., 0.8...],
[-1.5 , 5.7..., -0.4...]])
"""
@@ -1008,7 +1008,7 @@ def make_union(*transformers, **kwargs):
--------
>>> from sklearn.decomposition import PCA, TruncatedSVD
>>> from sklearn.pipeline import make_union
- >>> make_union(PCA(), TruncatedSVD()) # doctest: +NORMALIZE_WHITESPACE
+ >>> make_union(PCA(), TruncatedSVD()) # doctest: +NORMALIZE_WHITESPACE
FeatureUnion(n_jobs=None,
transformer_list=[('pca',
PCA(copy=True, iterated_power='auto',
diff --git a/sklearn/preprocessing/data.py b/sklearn/preprocessing/data.py
index 8c8524ef6505c..2f020a0a4780e 100644
--- a/sklearn/preprocessing/data.py
+++ b/sklearn/preprocessing/data.py
@@ -1475,17 +1475,21 @@ def transform(self, X):
Parameters
----------
- X : array-like or sparse matrix, shape [n_samples, n_features]
+ X : array-like or CSR/CSC sparse matrix, shape [n_samples, n_features]
The data to transform, row by row.
- Sparse input should preferably be in CSR format (for speed),
- but must be in CSC format if the degree is 4 or higher.
- If the input matrix is in CSR format and the expansion is of
- degree 2 or 3, the method described in the work "Leveraging
- Sparsity to Speed Up Polynomial Feature Expansions of CSR
- Matrices Using K-Simplex Numbers" by Andrew Nystrom and
- John Hughes is used, which is much faster than the method
- used on CSC input.
+ Prefer CSR over CSC for sparse input (for speed), but CSC is
+ required if the degree is 4 or higher. If the degree is less than
+ 4 and the input format is CSC, it will be converted to CSR, have
+ its polynomial features generated, then converted back to CSC.
+
+ If the degree is 2 or 3, the method described in "Leveraging
+ Sparsity to Speed Up Polynomial Feature Expansions of CSR Matrices
+ Using K-Simplex Numbers" by Andrew Nystrom and John Hughes is
+ used, which is much faster than the method used on CSC input; this
+ is why a CSC input is converted to CSR and back, and why CSR is
+ the preferred format.
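A small usage sketch of the sparse path (illustrative only; the comparison with the dense expansion is an assumption of this sketch, not a statement from the docstring):

    import numpy as np
    from scipy import sparse
    from sklearn.preprocessing import PolynomialFeatures

    X_csc = sparse.random(50, 4, density=0.3, format='csc', random_state=0)
    poly = PolynomialFeatures(degree=2)
    Xt = poly.fit_transform(X_csc)   # converted to CSR internally, then back
    Xt_dense = PolynomialFeatures(degree=2).fit_transform(X_csc.toarray())
    assert np.allclose(Xt.toarray(), Xt_dense)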
Returns
-------
@@ -1679,7 +1683,7 @@ class Normalizer(BaseEstimator, TransformerMixin):
>>> X = [[4, 1, 2, 2],
... [1, 3, 9, 3],
... [5, 7, 5, 1]]
- >>> transformer = Normalizer().fit(X) # fit does nothing.
+ >>> transformer = Normalizer().fit(X) # fit does nothing.
>>> transformer
Normalizer(copy=True, norm='l2')
>>> transformer.transform(X)
@@ -1815,7 +1819,7 @@ class Binarizer(BaseEstimator, TransformerMixin):
>>> X = [[ 1., -1., 2.],
... [ 2., 0., 0.],
... [ 0., 1., -1.]]
- >>> transformer = Binarizer().fit(X) # fit does nothing.
+ >>> transformer = Binarizer().fit(X) # fit does nothing.
>>> transformer
Binarizer(copy=True, threshold=0.0)
>>> transformer.transform(X)
@@ -2262,7 +2266,7 @@ def _transform_col(self, X_col, quantiles, inverse):
upper_bound_x = 1
lower_bound_y = quantiles[0]
upper_bound_y = quantiles[-1]
- # for inverse transform, match a uniform distribution
+ # for inverse transform, match a uniform distribution
with np.errstate(invalid='ignore'): # hide NaN comparison warnings
if output_distribution == 'normal':
X_col = stats.norm.cdf(X_col)
diff --git a/sklearn/preprocessing/label.py b/sklearn/preprocessing/label.py
index f7cffa1e663b5..4a1c700717555 100644
--- a/sklearn/preprocessing/label.py
+++ b/sklearn/preprocessing/label.py
@@ -420,7 +420,7 @@ def fit_transform(self, y):
"""Fit label binarizer and transform multi-class labels to binary
labels.
- The output of transform is sometimes referred to as
+ The output of transform is sometimes referred to as
the 1-of-K coding scheme.
Parameters
diff --git a/sklearn/setup.py b/sklearn/setup.py
index e6f10cad77d9f..5a377043e9e38 100644
--- a/sklearn/setup.py
+++ b/sklearn/setup.py
@@ -33,6 +33,8 @@ def configuration(parent_package='', top_path=None):
config.add_subpackage('feature_selection/tests')
config.add_subpackage('gaussian_process')
config.add_subpackage('gaussian_process/tests')
+ config.add_subpackage('impute')
+ config.add_subpackage('impute/tests')
config.add_subpackage('inspection')
config.add_subpackage('inspection/tests')
config.add_subpackage('mixture')
diff --git a/sklearn/svm/base.py b/sklearn/svm/base.py
index effb0dcd12504..fe2f943cbdb7c 100644
--- a/sklearn/svm/base.py
+++ b/sklearn/svm/base.py
@@ -126,7 +126,7 @@ def fit(self, X, y, sample_weight=None):
self : object
Notes
- ------
+ -----
If X and y are not C-ordered and contiguous arrays of np.float64 and
X is not a scipy.sparse.csr_matrix, X and/or y may be copied.
@@ -293,7 +293,7 @@ def _sparse_fit(self, X, y, sample_weight, solver_type, kernel,
if hasattr(self, "classes_"):
n_class = len(self.classes_) - 1
- else: # regression
+ else: # regression
n_class = 1
n_SV = self.support_vectors_.shape[0]
@@ -540,7 +540,7 @@ def decision_function(self, X):
n_classes).
Notes
- ------
+ -----
If decision_function_shape='ovo', the function values are proportional
to the distance of the samples X to the separating hyperplane. If the
exact distances are required, divide the function values by the norm of
diff --git a/sklearn/svm/classes.py b/sklearn/svm/classes.py
index 8f45a501ddf35..a236ba716bc0d 100644
--- a/sklearn/svm/classes.py
+++ b/sklearn/svm/classes.py
@@ -429,10 +429,10 @@ def fit(self, X, y, sample_weight=None):
class SVC(BaseSVC):
"""C-Support Vector Classification.
- The implementation is based on libsvm. The fit time complexity
- is more than quadratic with the number of samples which makes it hard
- to scale to datasets with more than a couple of 10000 samples. For large
- datasets consider using :class:`sklearn.linear_model.LinearSVC` or
+ The implementation is based on libsvm. The fit time scales at least
+ quadratically with the number of samples and may be impractical
+ beyond tens of thousands of samples. For large datasets
+ consider using :class:`sklearn.linear_model.LinearSVC` or
:class:`sklearn.linear_model.SGDClassifier` instead, possibly after a
:class:`sklearn.kernel_approximation.Nystroem` transformer.
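
To make the recommendation above concrete, a minimal sketch of the suggested alternative for large datasets follows (illustrative only; the dataset and parameters such as ``n_components=300`` are arbitrary choices, not values prescribed by the patch)::

    from sklearn.datasets import make_classification
    from sklearn.kernel_approximation import Nystroem
    from sklearn.linear_model import SGDClassifier
    from sklearn.pipeline import make_pipeline

    # a sample size at which the libsvm-based SVC already fits slowly
    X, y = make_classification(n_samples=50000, n_features=20, random_state=0)

    clf = make_pipeline(
        Nystroem(kernel='rbf', n_components=300, random_state=0),  # approximate RBF feature map
        SGDClassifier(max_iter=1000, tol=1e-3, random_state=0),    # linear classifier, near-linear fit time
    )
    clf.fit(X, y)
    print(clf.score(X, y))
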
diff --git a/sklearn/tests/test_common.py b/sklearn/tests/test_common.py
index fc3c7f3985e28..660b38c1ae4c2 100644
--- a/sklearn/tests/test_common.py
+++ b/sklearn/tests/test_common.py
@@ -215,7 +215,7 @@ def test_import_all_consistency():
def test_root_import_all_completeness():
- EXCEPTIONS = ('utils', 'tests', 'base', 'setup')
+ EXCEPTIONS = ('utils', 'tests', 'base', 'setup', 'conftest')
for _, modname, _ in pkgutil.walk_packages(path=sklearn.__path__,
onerror=lambda _: None):
if '.' in modname or modname.startswith('_') or modname in EXCEPTIONS:
diff --git a/sklearn/tree/__init__.py b/sklearn/tree/__init__.py
index e91540bed8c5f..a5ffc7585d4e4 100644
--- a/sklearn/tree/__init__.py
+++ b/sklearn/tree/__init__.py
@@ -11,4 +11,4 @@
__all__ = ["DecisionTreeClassifier", "DecisionTreeRegressor",
"ExtraTreeClassifier", "ExtraTreeRegressor", "export_graphviz",
- "plot_tree", "export_text"]
+ "plot_tree", "export_text"]
diff --git a/sklearn/tree/_tree.pyx b/sklearn/tree/_tree.pyx
index edd47845ad197..f27b42ae9c956 100644
--- a/sklearn/tree/_tree.pyx
+++ b/sklearn/tree/_tree.pyx
@@ -1123,3 +1123,110 @@ cdef class Tree:
Py_INCREF(self)
arr.base = self
return arr
+
+
+ def compute_partial_dependence(self, DTYPE_t[:, ::1] X,
+ int[::1] target_feature,
+ double[::1] out):
+ """Partial dependence of the response on the ``target_feature`` set.
+
+ For each sample in ``X`` a tree traversal is performed.
+ Each traversal starts from the root with weight 1.0.
+
+ At each non-leaf node that splits on a target feature, either
+ the left child or the right child is visited based on the feature
+ value of the current sample, and the weight is not modified.
+ At each non-leaf node that splits on a complementary feature,
+ both children are visited and the weight is multiplied by the fraction
+ of training samples which went to each child.
+
+ At each leaf, the value of the node is multiplied by the current
+ weight (weights sum to 1 for all visited terminal nodes).
+
+ Parameters
+ ----------
+ X : view on 2d ndarray, shape (n_samples, n_target_features)
+ The grid points on which the partial dependence should be
+ evaluated.
+ target_feature : view on 1d ndarray, shape (n_target_features)
+ The set of target features for which the partial dependence
+ should be evaluated.
+ out : view on 1d ndarray, shape (n_samples)
+ The value of the partial dependence function on each grid
+ point.
+ """
+ cdef:
+ double[::1] weight_stack = np.zeros(self.node_count,
+ dtype=np.float64)
+ SIZE_t[::1] node_idx_stack = np.zeros(self.node_count,
+ dtype=np.intp)
+ SIZE_t sample_idx
+ SIZE_t feature_idx
+ int stack_size
+ double left_sample_frac
+ double current_weight
+ double total_weight # used for sanity check only
+ Node *current_node # use a pointer to avoid copying attributes
+ SIZE_t current_node_idx
+ bint is_target_feature
+ SIZE_t _TREE_LEAF = TREE_LEAF # to avoid python interactions
+
+ for sample_idx in range(X.shape[0]):
+ # init stacks for current sample
+ stack_size = 1
+ node_idx_stack[0] = 0 # root node
+ weight_stack[0] = 1 # all the samples are in the root node
+ total_weight = 0
+
+ while stack_size > 0:
+ # pop the stack
+ stack_size -= 1
+ current_node_idx = node_idx_stack[stack_size]
+ current_node = &self.nodes[current_node_idx]
+
+ if current_node.left_child == _TREE_LEAF:
+ # leaf node
+ out[sample_idx] += (weight_stack[stack_size] *
+ self.value[current_node_idx])
+ total_weight += weight_stack[stack_size]
+ else:
+ # non-leaf node
+
+ # determine if the split feature is a target feature
+ is_target_feature = False
+ for feature_idx in range(target_feature.shape[0]):
+ if target_feature[feature_idx] == current_node.feature:
+ is_target_feature = True
+ break
+
+ if is_target_feature:
+ # In this case, we push left or right child on stack
+ if X[sample_idx, feature_idx] <= current_node.threshold:
+ node_idx_stack[stack_size] = current_node.left_child
+ else:
+ node_idx_stack[stack_size] = current_node.right_child
+ stack_size += 1
+ else:
+ # In this case, we push both children onto the stack,
+ # and give a weight proportional to the number of
+ # samples going through each branch.
+
+ # push left child
+ node_idx_stack[stack_size] = current_node.left_child
+ left_sample_frac = (
+ self.nodes[current_node.left_child].weighted_n_node_samples /
+ current_node.weighted_n_node_samples)
+ current_weight = weight_stack[stack_size]
+ weight_stack[stack_size] = current_weight * left_sample_frac
+ stack_size += 1
+
+ # push right child
+ node_idx_stack[stack_size] = current_node.right_child
+ weight_stack[stack_size] = (
+ current_weight * (1 - left_sample_frac))
+ stack_size += 1
+
+ # Sanity check. Should never happen.
+ if not (0.999 < total_weight < 1.001):
+ raise ValueError("Total weight should be 1.0 but was %.9f" %
+ total_weight)
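
For readers following the algorithm rather than the Cython details, here is a rough pure-Python sketch of the same weighted traversal; the dict-based node layout and the function name are illustrative only and do not match the ``Tree`` internals::

    import numpy as np

    def tree_partial_dependence(nodes, X, target_features):
        """Sketch of the weighted traversal above.

        ``nodes`` is a list of dicts with keys 'left', 'right', 'feature',
        'threshold', 'value' and 'weighted_n_samples'; leaves have left == -1.
        ``X`` has shape (n_samples, len(target_features)), columns ordered
        like ``target_features``.
        """
        out = np.zeros(X.shape[0])
        for i, x in enumerate(X):
            stack = [(0, 1.0)]                     # (node index, weight), root first
            while stack:
                node_idx, weight = stack.pop()
                node = nodes[node_idx]
                if node['left'] == -1:             # leaf: accumulate weighted value
                    out[i] += weight * node['value']
                elif node['feature'] in target_features:
                    # split on a target feature: follow the grid point's side,
                    # weight unchanged
                    j = target_features.index(node['feature'])
                    child = (node['left'] if x[j] <= node['threshold']
                             else node['right'])
                    stack.append((child, weight))
                else:
                    # split on a complementary feature: visit both children,
                    # weighted by the fraction of training samples in each
                    frac = (nodes[node['left']]['weighted_n_samples']
                            / node['weighted_n_samples'])
                    stack.append((node['left'], weight * frac))
                    stack.append((node['right'], weight * (1.0 - frac)))
        return out

As in the Cython version, the weights of all visited leaves sum to 1 for each sample, which is exactly what the sanity check at the end of the method verifies.
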
diff --git a/sklearn/tree/_utils.pyx b/sklearn/tree/_utils.pyx
index fdbd48e75f3a9..634eb3ef84cdd 100644
--- a/sklearn/tree/_utils.pyx
+++ b/sklearn/tree/_utils.pyx
@@ -20,7 +20,7 @@ import numpy as np
cimport numpy as np
np.import_array()
-from ..utils cimport _random
+from ..utils._random cimport our_rand_r
# =============================================================================
# Helper functions
@@ -64,13 +64,13 @@ cdef inline np.ndarray sizet_ptr_to_ndarray(SIZE_t* data, SIZE_t size):
cdef inline SIZE_t rand_int(SIZE_t low, SIZE_t high,
UINT32_t* random_state) nogil:
"""Generate a random integer in [low; end)."""
- return low + _random.our_rand_r(random_state) % (high - low)
+ return low + our_rand_r(random_state) % (high - low)
cdef inline double rand_uniform(double low, double high,
UINT32_t* random_state) nogil:
"""Generate a random double in [low; high)."""
- return ((high - low) * _random.our_rand_r(random_state) /
+ return ((high - low) * our_rand_r(random_state) /
RAND_R_MAX) + low
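
As a point of reference for the two helpers above, a rough Python analogue is sketched below; ``random.getrandbits(31)`` merely stands in for ``our_rand_r`` and, like the C code, the integer variant ignores modulo bias::

    import random

    RAND_R_MAX = 0x7FFFFFFF          # 2**31 - 1, as in the Cython module

    def rand_int(low, high, rng=random):
        # random integer in [low, high), mirroring the modulo trick above
        return low + rng.getrandbits(31) % (high - low)

    def rand_uniform(low, high, rng=random):
        # random double in [low, high), mirroring the scaling above
        return (high - low) * rng.getrandbits(31) / RAND_R_MAX + low
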
diff --git a/sklearn/tree/export.py b/sklearn/tree/export.py
index 02aa68b8af2dc..636ef03689a79 100644
--- a/sklearn/tree/export.py
+++ b/sklearn/tree/export.py
@@ -839,7 +839,7 @@ def export_text(decision_tree, feature_names=None, max_depth=10,
Text summary of all the rules in the decision tree.
Examples
- -------
+ --------
>>> from sklearn.datasets import load_iris
>>> from sklearn.tree import DecisionTreeClassifier
diff --git a/sklearn/tree/tests/test_export.py b/sklearn/tree/tests/test_export.py
index 65b0a201be369..eed9be7bcb5d9 100644
--- a/sklearn/tree/tests/test_export.py
+++ b/sklearn/tree/tests/test_export.py
@@ -399,9 +399,8 @@ def test_export_text():
assert export_text(reg, decimals=1, show_weights=True) == expected_report
-def test_plot_tree():
+def test_plot_tree(pyplot):
# mostly smoke tests
- pytest.importorskip("matplotlib.pyplot")
# Check correctness of export_graphviz
clf = DecisionTreeClassifier(max_depth=3,
min_samples_split=2,
diff --git a/sklearn/utils/extmath.py b/sklearn/utils/extmath.py
index f04e43106e415..fcb03b0cecddd 100644
--- a/sklearn/utils/extmath.py
+++ b/sklearn/utils/extmath.py
@@ -392,7 +392,7 @@ def weighted_mode(a, w, axis=0):
The value 4 appears three times: with uniform weights, the result is
simply the mode of the distribution.
- >>> weights = [1, 3, 0.5, 1.5, 1, 2] # deweight the 4's
+ >>> weights = [1, 3, 0.5, 1.5, 1, 2] # deweight the 4's
>>> weighted_mode(x, weights)
(array([2.]), array([3.5]))
diff --git a/sklearn/utils/mocking.py b/sklearn/utils/mocking.py
index 9c059f2ed2ed9..76ad144ccb171 100644
--- a/sklearn/utils/mocking.py
+++ b/sklearn/utils/mocking.py
@@ -108,7 +108,7 @@ def fit(self, X, y, **fit_params):
def predict(self, T):
"""
Parameters
- -----------
+ ----------
T : indexable, length n_samples
"""
if self.check_X is not None:
diff --git a/sklearn/utils/seq_dataset.pyx.tp b/sklearn/utils/seq_dataset.pyx.tp
index f1b34c4c86bce..14f80804554db 100644
--- a/sklearn/utils/seq_dataset.pyx.tp
+++ b/sklearn/utils/seq_dataset.pyx.tp
@@ -45,7 +45,7 @@ import numpy as np
np.import_array()
-from . cimport _random
+from ._random cimport our_rand_r
cdef class SequentialDataset{{name}}:
"""Base class for datasets with sequential data access.
@@ -155,7 +155,7 @@ cdef class SequentialDataset{{name}}:
cdef int n = self.n_samples
cdef unsigned i, j
for i in range(n - 1):
- j = i + _random.our_rand_r(&seed) % (n - i)
+ j = i + our_rand_r(&seed) % (n - i)
ind[i], ind[j] = ind[j], ind[i]
cdef int _get_next_index(self) nogil:
@@ -169,7 +169,7 @@ cdef class SequentialDataset{{name}}:
cdef int _get_random_index(self) nogil:
cdef int n = self.n_samples
- cdef int current_index = _random.our_rand_r(&self.seed) % n
+ cdef int current_index = our_rand_r(&self.seed) % n
self.current_index = current_index
return current_index
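
The shuffling loop above is a forward Fisher-Yates pass driven by ``our_rand_r``: each position ``i`` swaps with a uniformly chosen ``j`` in ``[i, n)``. A hedged Python sketch of the same idea, with ``random.Random`` standing in for the C-level generator::

    import random

    def shuffle_indices(ind, seed=0):
        # forward Fisher-Yates pass mirroring the Cython loop above
        rng = random.Random(seed)
        n = len(ind)
        for i in range(n - 1):
            j = i + rng.randrange(n - i)   # j in [i, n)
            ind[i], ind[j] = ind[j], ind[i]
        return ind

    # Example: shuffle_indices(list(range(10))) returns a permutation of 0..9
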
diff --git a/sklearn/utils/testing.py b/sklearn/utils/testing.py
index 65bed4c7ecef8..babf0b8658b5c 100644
--- a/sklearn/utils/testing.py
+++ b/sklearn/utils/testing.py
@@ -714,28 +714,6 @@ def set_random_state(estimator, random_state=0):
estimator.set_params(random_state=random_state)
-def if_matplotlib(func):
- """Test decorator that skips test if matplotlib not installed.
-
- Parameters
- ----------
- func
- """
- @wraps(func)
- def run_test(*args, **kwargs):
- try:
- import matplotlib
- matplotlib.use('Agg', warn=False)
- # this fails if no $DISPLAY specified
- import matplotlib.pyplot as plt
- plt.figure()
- except ImportError:
- raise SkipTest('Matplotlib not available.')
- else:
- return func(*args, **kwargs)
- return run_test
-
-
try:
import pytest
@@ -1024,21 +1002,3 @@ def assert_run_python_script(source_code, timeout=60):
% e.output.decode('utf-8'))
finally:
os.unlink(source_file)
-
-
-def close_figure(fig=None):
- """Close a matplotlibt figure.
-
- Parameters
- ----------
- fig : int or str or Figure, optional (default=None)
- The figure, figure number or figure name to close. If ``None``, all
- current figures are closed.
- """
- from matplotlib.pyplot import get_fignums, close as _close # noqa
-
- if fig is None:
- for fig in get_fignums():
- _close(fig)
- else:
- _close(fig)
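
The removed ``if_matplotlib`` decorator and ``close_figure`` helper are superseded by a pytest fixture (used as the ``pyplot`` argument in ``test_plot_tree`` above). The sketch below is a hypothetical approximation of such a fixture; the real one lives in the project's conftest and may additionally force the ``Agg`` backend::

    import pytest

    @pytest.fixture
    def pyplot():
        """Skip the test if matplotlib is missing; close all figures afterwards."""
        pyplot = pytest.importorskip('matplotlib.pyplot')
        yield pyplot
        pyplot.close('all')   # replaces the removed close_figure(fig=None) behaviour
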