diff --git a/azure-pipelines.yml b/azure-pipelines.yml index ae27828dd22a3..c31385dd3e48d 100644 --- a/azure-pipelines.yml +++ b/azure-pipelines.yml @@ -22,6 +22,7 @@ jobs: SCIPY_VERSION: '0.17.0' CYTHON_VERSION: '*' PILLOW_VERSION: '4.0.0' + MATPLOTLIB_VERSION: '1.5.1' # later version of joblib are not packaged in conda for Python 3.5 JOBLIB_VERSION: '0.12.3' COVERAGE: 'true' diff --git a/build_tools/azure/install.cmd b/build_tools/azure/install.cmd index 97f5cb4f7e465..a53cd61b34828 100644 --- a/build_tools/azure/install.cmd +++ b/build_tools/azure/install.cmd @@ -11,7 +11,7 @@ IF "%PYTHON_ARCH%"=="64" ( call deactivate @rem Clean up any left-over from a previous build conda remove --all -q -y -n %VIRTUALENV% - conda create -n %VIRTUALENV% -q -y python=%PYTHON_VERSION% numpy scipy cython pytest wheel pillow joblib + conda create -n %VIRTUALENV% -q -y python=%PYTHON_VERSION% numpy scipy cython matplotlib pytest wheel pillow joblib call activate %VIRTUALENV% ) else ( diff --git a/doc/conf.py b/doc/conf.py index 27a6bf2ee30c2..c736adc8e267e 100644 --- a/doc/conf.py +++ b/doc/conf.py @@ -263,9 +263,9 @@ 'sphx_glr_plot_compare_methods_001.png': 349} -# enable experimental module so that the new GBDTs estimators can be +# enable experimental module so that experimental estimators can be # discovered properly by sphinx -from sklearn.experimental import enable_hist_gradient_boosting # noqa +from sklearn.experimental import * # noqa def make_carousel_thumbs(app, exception): diff --git a/doc/developers/contributing.rst b/doc/developers/contributing.rst index 34a5f63919c44..69e7f0b2b480d 100644 --- a/doc/developers/contributing.rst +++ b/doc/developers/contributing.rst @@ -195,67 +195,67 @@ The preferred way to contribute to scikit-learn is to fork the `main repository `__ on GitHub, then submit a "pull request" (PR): - 1. `Create an account `_ on - GitHub if you do not already have one. +1. `Create an account `_ on + GitHub if you do not already have one. - 2. Fork the `project repository - `__: click on the 'Fork' - button near the top of the page. This creates a copy of the code under your - account on the GitHub user account. For more details on how to fork a - repository see `this guide `_. +2. Fork the `project repository + `__: click on the 'Fork' + button near the top of the page. This creates a copy of the code under your + account on the GitHub user account. For more details on how to fork a + repository see `this guide `_. - 3. Clone your fork of the scikit-learn repo from your GitHub account to your - local disk:: +3. Clone your fork of the scikit-learn repo from your GitHub account to your + local disk:: - $ git clone git@github.com:YourLogin/scikit-learn.git - $ cd scikit-learn + $ git clone git@github.com:YourLogin/scikit-learn.git + $ cd scikit-learn - 4. Install library in editable mode:: +4. Install library in editable mode:: - $ pip install --editable . + $ pip install --editable . - for more details about advanced installation, see the - :ref:`install_bleeding_edge` section. + for more details about advanced installation, see the + :ref:`install_bleeding_edge` section. - 5. Create a branch to hold your development changes:: +5. Create a branch to hold your development changes:: - $ git checkout -b my-feature + $ git checkout -b my-feature - and start making changes. Always use a ``feature`` branch. It's good practice to - never work on the ``master`` branch! + and start making changes. Always use a ``feature`` branch. It's good practice to + never work on the ``master`` branch! 
-.. note:: + .. note:: - In the above setup, your ``origin`` remote repository points to - ``YourLogin/scikit-learn.git``. If you wish to fetch/merge from the main - repository instead of your forked one, you will need to add another remote - to use instead of ``origin``. If we choose the name ``upstream`` for it, the - command will be:: + In the above setup, your ``origin`` remote repository points to + ``YourLogin/scikit-learn.git``. If you wish to fetch/merge from the main + repository instead of your forked one, you will need to add another remote + to use instead of ``origin``. If we choose the name ``upstream`` for it, the + command will be:: - $ git remote add upstream https://github.com/scikit-learn/scikit-learn.git + $ git remote add upstream https://github.com/scikit-learn/scikit-learn.git - And in order to fetch the new remote and base your work on the latest changes - of it you can:: + And in order to fetch the new remote and base your work on the latest changes + of it you can:: - $ git fetch upstream - $ git checkout -b my-feature upstream/master + $ git fetch upstream + $ git checkout -b my-feature upstream/master - 6. Develop the feature on your feature branch on your computer, using Git to do the - version control. When you're done editing, add changed files using ``git add`` - and then ``git commit`` files:: +6. Develop the feature on your feature branch on your computer, using Git to do the + version control. When you're done editing, add changed files using ``git add`` + and then ``git commit`` files:: - $ git add modified_files - $ git commit + $ git add modified_files + $ git commit - to record your changes in Git, then push the changes to your GitHub account with:: + to record your changes in Git, then push the changes to your GitHub account with:: - $ git push -u origin my-feature + $ git push -u origin my-feature - 7. Follow `these - `_ - instructions to create a pull request from your fork. This will send an - email to the committers. You may want to consider sending an email to the - mailing list for more visibility. +7. Follow `these + `_ + instructions to create a pull request from your fork. This will send an + email to the committers. You may want to consider sending an email to the + mailing list for more visibility. .. note:: @@ -626,7 +626,7 @@ reviewing pull requests, you may find :ref:`this tip .. _testing_coverage: Testing and improving test coverage ------------------------------------- +----------------------------------- High-quality `unit testing `_ is a corner-stone of the scikit-learn development process. For this @@ -641,22 +641,42 @@ the corresponding subpackages. We expect code coverage of new features to be at least around 90%. -.. note:: **Workflow to improve test coverage** +For guidelines on how to use ``pytest`` efficiently, see the +:ref:`pytest_tips`. - To test code coverage, you need to install the `coverage - `_ package in addition to pytest. +Writing matplotlib related tests +................................ - 1. Run 'make test-coverage'. The output lists for each file the line - numbers that are not tested. +Test fixtures ensure that a set of tests will be executing with the appropriate +initialization and cleanup. The scikit-learn test suite implements a fixture +which can be used with ``matplotlib``. - 2. Find a low hanging fruit, looking at which lines are not tested, - write or adapt a test specifically for these lines. +``pyplot`` + The ``pyplot`` fixture should be used when a test function is dealing with + ``matplotlib``. 
``matplotlib`` is a soft dependency and is not required. + This fixture is in charge of skipping the tests if ``matplotlib`` is not + installed. In addition, figures created during the tests will be + automatically closed once the test function has been executed. - 3. Loop. +To use this fixture in a test function, one needs to pass it as an +argument:: -For guidelines on how to use ``pytest`` efficiently, see the -:ref:`pytest_tips`. + def test_requiring_mpl_fixture(pyplot): + # you can now safely use matplotlib + +Workflow to improve test coverage +................................. + +To test code coverage, you need to install the `coverage +`_ package in addition to pytest. + +1. Run 'make test-coverage'. The output lists for each file the line + numbers that are not tested. + +2. Find a low hanging fruit, looking at which lines are not tested, + write or adapt a test specifically for these lines. +3. Loop. Developers web site ------------------- diff --git a/doc/modules/classes.rst b/doc/modules/classes.rst index c523236a11348..56de69db9519c 100644 --- a/doc/modules/classes.rst +++ b/doc/modules/classes.rst @@ -471,6 +471,7 @@ Samples generator :toctree: generated/ experimental.enable_hist_gradient_boosting + experimental.enable_iterative_imputer .. _feature_extraction_ref: diff --git a/doc/modules/impute.rst b/doc/modules/impute.rst index 777a2bd157b29..4cd0ea6e85d60 100644 --- a/doc/modules/impute.rst +++ b/doc/modules/impute.rst @@ -105,7 +105,16 @@ of ``y``. This is done for each feature in an iterative fashion, and then is repeated for ``max_iter`` imputation rounds. The results of the final imputation round are returned. +.. note:: + + This estimator is still **experimental** for now: the predictions + and the API might change without any deprecation cycle. To use it, + you need to explicitly import ``enable_iterative_imputer``. + +:: + >>> import numpy as np + >>> from sklearn.experimental import enable_iterative_imputer >>> from sklearn.impute import IterativeImputer >>> imp = IterativeImputer(max_iter=10, random_state=0) >>> imp.fit([[1, 2], [3, 6], [4, 8], [np.nan, 3], [7, np.nan]]) # doctest: +NORMALIZE_WHITESPACE diff --git a/doc/modules/linear_model.rst b/doc/modules/linear_model.rst index a370791d248e2..c01b74775684f 100644 --- a/doc/modules/linear_model.rst +++ b/doc/modules/linear_model.rst @@ -136,17 +136,24 @@ Setting the regularization parameter: generalized Cross-Validation ------------------------------------------------------------------ :class:`RidgeCV` implements ridge regression with built-in -cross-validation of the alpha parameter. The object works in the same way +cross-validation of the alpha parameter. 
The object works in the same way as GridSearchCV except that it defaults to Generalized Cross-Validation (GCV), an efficient form of leave-one-out cross-validation:: + >>> import numpy as np >>> from sklearn import linear_model - >>> reg = linear_model.RidgeCV(alphas=[0.1, 1.0, 10.0], cv=3) - >>> reg.fit([[0, 0], [0, 0], [1, 1]], [0, .1, 1]) # doctest: +SKIP - RidgeCV(alphas=[0.1, 1.0, 10.0], cv=3, fit_intercept=True, scoring=None, - normalize=False) - >>> reg.alpha_ # doctest: +SKIP - 0.1 + >>> reg = linear_model.RidgeCV(alphas=np.logspace(-6, 6, 13)) + >>> reg.fit([[0, 0], [0, 0], [1, 1]], [0, .1, 1]) # doctest: +NORMALIZE_WHITESPACE + RidgeCV(alphas=array([1.e-06, 1.e-05, 1.e-04, 1.e-03, 1.e-02, 1.e-01, 1.e+00, 1.e+01, + 1.e+02, 1.e+03, 1.e+04, 1.e+05, 1.e+06]), + cv=None, fit_intercept=True, gcv_mode=None, normalize=False, + scoring=None, store_cv_values=False) + >>> reg.alpha_ + 0.01 + +Specifying the value of the `cv` attribute will trigger the use of +cross-validation with `GridSearchCV`, for example `cv=10` for 10-fold +cross-validation, rather than Generalized Cross-Validation. .. topic:: References diff --git a/doc/roadmap.rst b/doc/roadmap.rst index a8334604395a2..2252b62d273e6 100644 --- a/doc/roadmap.rst +++ b/doc/roadmap.rst @@ -128,7 +128,6 @@ bottom. #. Improved tools for model diagnostics and basic inference - * partial dependence plots :issue:`5653` * alternative feature importances implementations (e.g. methods or wrappers) * better ways to handle validation sets when fitting * better ways to find thresholds / create decision rules :issue:`8614` @@ -144,19 +143,6 @@ bottom. :issue:`6929` * Callbacks or a similar system would facilitate logging and early stopping -#. Use scipy BLAS Cython bindings - - * This will make it possible to get rid of our partial copy of suboptimal - Atlas C-routines. :issue:`11638` - * This should speed up the Windows and Linux wheels - -#. Allow fine-grained parallelism in cython - - * Now that we do not use fork-based multiprocessing in joblib anymore it's - possible to use the prange / openmp thread management which makes it - possible to have very efficient thread-based parallelism at the Cython - level. Example with K-Means: :issue:`11950` - #. Distributed parallelism * Joblib can now plug onto several backends, some of them can distribute the @@ -240,9 +226,6 @@ Subpackage-specific goals :mod:`sklearn.ensemble` * a stacking implementation -* a binned feature histogram based and thread parallel implementation of - decision trees to compete with the performance of state of the art gradient - boosting like LightGBM. :mod:`sklearn.model_selection` @@ -269,5 +252,3 @@ Subpackage-specific goals * Performance issues with `Pipeline.memory` * see "Everything in Scikit-learn should conform to our API contract" above -* Add a verbose option :issue:`10435` - diff --git a/doc/whats_new/v0.21.rst b/doc/whats_new/v0.21.rst index bf18d8350646e..91c8e4506ec2b 100644 --- a/doc/whats_new/v0.21.rst +++ b/doc/whats_new/v0.21.rst @@ -7,7 +7,7 @@ Version 0.21.0 ============== -**May 2019** +**10 May 2019** Changed models -------------- @@ -38,6 +38,8 @@ random sampling procedures. seed, including :class:`linear_model.LogisticRegression`, :class:`linear_model.LogisticRegressionCV`, :class:`linear_model.Ridge`, and :class:`linear_model.RidgeCV` with 'sag' solver. |Fix| +- :class:`linear_model.ridge.RidgeCV` when using generalized cross-validation + with sparse inputs. |Fix| Details are listed in the changelog below. 
@@ -119,6 +121,12 @@ Support for Python 3.4 and below has been officially dropped. parameter which can be used to find the clusters instead of ``n_clusters``. :issue:`9069` by :user:`Vathsala Achar ` and `Adrin Jalali`_. +:mod:`sklearn.compose` +...................... + +- |API| :class:`compose.ColumnTransformer` is no longer an experimental + feature. :pr:`13835` by :user:`Hanmin Qin `. + :mod:`sklearn.datasets` ....................... @@ -214,7 +222,7 @@ Support for Python 3.4 and below has been officially dropped. >>> # explicitly require this experimental feature >>> from sklearn.experimental import enable_hist_gradient_boosting # noqa - >>> # now you can import normally from ensemble + >>> # now you can import normally from sklearn.ensemble >>> from sklearn.ensemble import HistGradientBoostingClassifier :pr:`12807` by :user:`Nicolas Hug`. @@ -319,6 +327,17 @@ Support for Python 3.4 and below has been officially dropped. :pr:`12599` by :user:`Trevor Stephens` and :user:`Nicolas Hug`. +- |Fix| :class:`ensemble.VotingClassifier` and + :class:`ensemble.VotingRegressor` were failing during ``fit`` in one + of the estimators was set to ``None`` and ``sample_weight`` was not ``None``. + :pr:`13779` by :user:`Guillaume Lemaitre `. + +- |API| :class:`ensemble.VotingClassifier` and + :class:`ensemble.VotingRegressor` accept ``'drop'`` to disable an estimator + in addition to ``None`` to be consistent with other estimators (i.e., + :class:`pipeline.FeatureUnion` and :class:`compose.ColumnTransformer`). + :pr:`13780` by :user:`Guillaume Lemaitre `. + :mod:`sklearn.externals` ........................ @@ -345,6 +364,15 @@ Support for Python 3.4 and below has been officially dropped. :pr:`12177` by :user:`Sergey Feldman ` and :user:`Ben Lawson `. + The API of IterativeImputer is experimental and subject to change without any + deprecation cycle. To use them, you need to explicitly import + ``enable_iterative_imputer``:: + + >>> from sklearn.experimental import enable_iterative_imputer # noqa + >>> # now you can import normally from sklearn.impute + >>> from sklearn.impute import IterativeImputer + + - |Feature| The :class:`impute.SimpleImputer` and :class:`impute.IterativeImputer` have a new parameter ``'add_indicator'``, which simply stacks a :class:`impute.MissingIndicator` transform into the @@ -384,6 +412,10 @@ Support for Python 3.4 and below has been officially dropped. :mod:`sklearn.linear_model` ........................... +- |Enhancement| :class:`linear_model.Ridge` now preserves ``float32`` and + ``float64`` dtypes. :issues:`8769` and :issues:`11000` by + :user:`Guillaume Lemaitre `, and :user:`Joan Massich ` + - |Feature| :class:`linear_model.LogisticRegression` and :class:`linear_model.LogisticRegressionCV` now support Elastic-Net penalty, with the 'saga' solver. :pr:`11646` by :user:`Nicolas Hug `. @@ -478,6 +510,10 @@ Support for Python 3.4 and below has been officially dropped. in version 0.21 and will be removed in version 0.23. :pr:`12821` by :user:`Nicolas Hug `. +- |Fix| :class:`linear_model.ridge.RidgeCV` with generalized cross-validation + now correctly fits an intercept when ``fit_intercept=True`` and the design + matrix is sparse. :issue:`13350` by :user:`Jérôme Dockès ` + :mod:`sklearn.manifold` ....................... @@ -577,7 +613,7 @@ Support for Python 3.4 and below has been officially dropped. 
- |Feature| Classes :class:`~model_selection.GridSearchCV` and :class:`~model_selection.RandomizedSearchCV` now allow for refit=callable to add flexibility in identifying the best estimator. - See :doc:`/auto_examples/model_selection/plot_grid_search_refit_callable.py`. + See :ref:`sphx_glr_auto_examples_model_selection_plot_grid_search_refit_callable.py`. :pr:`11354` by :user:`Wenhao Zhang `, `Joel Nothman`_ and :user:`Adrin Jalali `. diff --git a/examples/impute/plot_iterative_imputer_variants_comparison.py b/examples/impute/plot_iterative_imputer_variants_comparison.py index c6a8cb65d2c6b..06fab08c381f2 100644 --- a/examples/impute/plot_iterative_imputer_variants_comparison.py +++ b/examples/impute/plot_iterative_imputer_variants_comparison.py @@ -8,13 +8,13 @@ variable as an output in turn. In this example we compare some estimators for the purpose of missing feature -imputation with :class:`sklearn.imputeIterativeImputer`:: +imputation with :class:`sklearn.impute.IterativeImputer`: - :class:`~sklearn.linear_model.BayesianRidge`: regularized linear regression - :class:`~sklearn.tree.DecisionTreeRegressor`: non-linear regression - :class:`~sklearn.ensemble.ExtraTreesRegressor`: similar to missForest in R - :class:`~sklearn.neighbors.KNeighborsRegressor`: comparable to other KNN - imputation approaches +* :class:`~sklearn.linear_model.BayesianRidge`: regularized linear regression +* :class:`~sklearn.tree.DecisionTreeRegressor`: non-linear regression +* :class:`~sklearn.ensemble.ExtraTreesRegressor`: similar to missForest in R +* :class:`~sklearn.neighbors.KNeighborsRegressor`: comparable to other KNN + imputation approaches Of particular interest is the ability of :class:`sklearn.impute.IterativeImputer` to mimic the behavior of missForest, a @@ -42,6 +42,8 @@ import matplotlib.pyplot as plt import pandas as pd +# To use this experimental feature, we need to explicitly ask for it: +from sklearn.experimental import enable_iterative_imputer # noqa from sklearn.datasets import fetch_california_housing from sklearn.impute import SimpleImputer from sklearn.impute import IterativeImputer diff --git a/examples/impute/plot_missing_values.py b/examples/impute/plot_missing_values.py index 897b66aad246c..2d2d37745abf3 100644 --- a/examples/impute/plot_missing_values.py +++ b/examples/impute/plot_missing_values.py @@ -23,6 +23,8 @@ import numpy as np import matplotlib.pyplot as plt +# To use the experimental IterativeImputer, we need to explicitly ask for it: +from sklearn.experimental import enable_iterative_imputer # noqa from sklearn.datasets import load_diabetes from sklearn.datasets import load_boston from sklearn.ensemble import RandomForestRegressor diff --git a/sklearn/__init__.py b/sklearn/__init__.py index bd5e052a50577..1271b7e9fd4a9 100644 --- a/sklearn/__init__.py +++ b/sklearn/__init__.py @@ -45,7 +45,7 @@ # Dev branch marker is: 'X.Y.dev' or 'X.Y.devN' where N is an integer. 
# 'X.Y.dev0' is the canonical version of 'X.Y.dev' # -__version__ = '0.21rc2' +__version__ = '0.21.0' # On OSX, we can get a runtime error due to multiple OpenMP libraries loaded diff --git a/sklearn/cluster/hierarchical.py b/sklearn/cluster/hierarchical.py index c99ef8f618b23..9f9245aa32f21 100644 --- a/sklearn/cluster/hierarchical.py +++ b/sklearn/cluster/hierarchical.py @@ -148,7 +148,7 @@ def ward_tree(X, connectivity=None, n_clusters=None, return_distance=False): Parameters ---------- X : array, shape (n_samples, n_features) - feature matrix representing n_samples samples to be clustered + feature matrix representing n_samples samples to be clustered connectivity : sparse matrix (optional). connectivity matrix. Defines for each sample the neighboring samples @@ -219,7 +219,7 @@ def ward_tree(X, connectivity=None, n_clusters=None, return_distance=False): n_samples, n_features = X.shape if connectivity is None: - from scipy.cluster import hierarchy # imports PIL + from scipy.cluster import hierarchy # imports PIL if n_clusters is not None: warnings.warn('Partial build of the tree is implemented ' @@ -433,7 +433,7 @@ def linkage_tree(X, connectivity=None, n_clusters=None, linkage='complete', 'of %s, but %s was given' % (linkage_choices.keys(), linkage)) if connectivity is None: - from scipy.cluster import hierarchy # imports PIL + from scipy.cluster import hierarchy # imports PIL if n_clusters is not None: warnings.warn('Partial build of the tree is implemented ' @@ -597,7 +597,7 @@ def _single_linkage(*args, **kwargs): ############################################################################### -# Functions for cutting hierarchical clustering tree +# Functions for cutting hierarchical clustering tree def _hc_cut(n_clusters, children, n_leaves): """Function cutting the ward tree for a given number of clusters. diff --git a/sklearn/cluster/k_means_.py b/sklearn/cluster/k_means_.py index 042e6990b5df1..37dc6a3abda61 100644 --- a/sklearn/cluster/k_means_.py +++ b/sklearn/cluster/k_means_.py @@ -44,7 +44,7 @@ def _k_init(X, n_clusters, x_squared_norms, random_state, n_local_trials=None): """Init n_clusters seeds according to k-means++ Parameters - ----------- + ---------- X : array or sparse matrix, shape (n_samples, n_features) The data to pick seeds for. To avoid memory copy, the input data should be double precision (dtype=np.float64). @@ -706,7 +706,7 @@ def _init_centroids(X, k, init, random_state=None, x_squared_norms=None, an int to make the randomness deterministic. See :term:`Glossary `. - x_squared_norms : array, shape (n_samples,), optional + x_squared_norms : array, shape (n_samples,), optional Squared euclidean norm of each data point. Pass it if you have it at hands already to avoid it being recomputed here. Default: None @@ -887,7 +887,7 @@ class KMeans(BaseEstimator, ClusterMixin, TransformerMixin): probably much faster than the default batch implementation. Notes - ------ + ----- The k-means problem is solved using either Lloyd's or Elkan's algorithm. The average complexity is given by O(k n T), were n is the number of @@ -1419,8 +1419,8 @@ class MiniBatchKMeans(KMeans): ... [3, 2], [5, 5], [1, -1]]) >>> # manually fit on batches >>> kmeans = MiniBatchKMeans(n_clusters=2, - ... random_state=0, - ... batch_size=6) + ... random_state=0, + ... 
batch_size=6) >>> kmeans = kmeans.partial_fit(X[0:6,:]) >>> kmeans = kmeans.partial_fit(X[6:12,:]) >>> kmeans.cluster_centers_ @@ -1430,9 +1430,9 @@ class MiniBatchKMeans(KMeans): array([0, 1], dtype=int32) >>> # fit on the whole data >>> kmeans = MiniBatchKMeans(n_clusters=2, - ... random_state=0, - ... batch_size=6, - ... max_iter=10).fit(X) + ... random_state=0, + ... batch_size=6, + ... max_iter=10).fit(X) >>> kmeans.cluster_centers_ array([[3.95918367, 2.40816327], [1.12195122, 1.3902439 ]]) diff --git a/sklearn/cluster/mean_shift_.py b/sklearn/cluster/mean_shift_.py index 7e93e715b7585..68b92139537d3 100644 --- a/sklearn/cluster/mean_shift_.py +++ b/sklearn/cluster/mean_shift_.py @@ -409,7 +409,7 @@ def fit(self, X, y=None): """Perform clustering. Parameters - ----------- + ---------- X : array-like, shape=[n_samples, n_features] Samples to cluster. diff --git a/sklearn/cluster/spectral.py b/sklearn/cluster/spectral.py index 82a771756d09c..fdaf423a11db4 100644 --- a/sklearn/cluster/spectral.py +++ b/sklearn/cluster/spectral.py @@ -173,7 +173,7 @@ def spectral_clustering(affinity, n_clusters=8, n_components=None, Read more in the :ref:`User Guide `. Parameters - ----------- + ---------- affinity : array-like or sparse matrix, shape: (n_samples, n_samples) The affinity matrix describing the relationship of the samples to embed. **Must be symmetric**. @@ -240,7 +240,7 @@ def spectral_clustering(affinity, n_clusters=8, n_components=None, https://www1.icsi.berkeley.edu/~stellayu/publication/doc/2003kwayICCV.pdf Notes - ------ + ----- The graph should contain only one connect component, elsewhere the results make little sense. @@ -298,7 +298,7 @@ class SpectralClustering(BaseEstimator, ClusterMixin): Read more in the :ref:`User Guide `. Parameters - ----------- + ---------- n_clusters : integer, optional The dimension of the projection subspace. diff --git a/sklearn/compose/_column_transformer.py b/sklearn/compose/_column_transformer.py index a59e7962bbbb4..11dad7338b94a 100644 --- a/sklearn/compose/_column_transformer.py +++ b/sklearn/compose/_column_transformer.py @@ -33,9 +33,6 @@ class ColumnTransformer(_BaseComposition, TransformerMixin): """Applies transformers to columns of an array or pandas DataFrame. - EXPERIMENTAL: some behaviors may change between releases without - deprecation. - This estimator allows different columns or column subsets of the input to be transformed separately and the features generated by each transformer will be concatenated to form a single feature space. diff --git a/sklearn/conftest.py b/sklearn/conftest.py new file mode 100644 index 0000000000000..d38e45f57b4f8 --- /dev/null +++ b/sklearn/conftest.py @@ -0,0 +1,21 @@ +import pytest + + +@pytest.fixture(scope='function') +def pyplot(): + """Setup and teardown fixture for matplotlib. + + This fixture checks if we can import matplotlib. If not, the tests will be + skipped. Otherwise, we setup matplotlib backend and close the figures + after running the functions. + + Returns + ------- + pyplot : module + The ``matplotlib.pyplot`` module. 
+ """ + matplotlib = pytest.importorskip('matplotlib') + matplotlib.use('agg', warn=False, force=True) + pyplot = pytest.importorskip('matplotlib.pyplot') + yield pyplot + pyplot.close('all') diff --git a/sklearn/covariance/empirical_covariance_.py b/sklearn/covariance/empirical_covariance_.py index 21d389846f198..a962c7ead8615 100644 --- a/sklearn/covariance/empirical_covariance_.py +++ b/sklearn/covariance/empirical_covariance_.py @@ -122,8 +122,8 @@ class EmpiricalCovariance(BaseEstimator): ... [.3, .4]]) >>> rng = np.random.RandomState(0) >>> X = rng.multivariate_normal(mean=[0, 0], - ... cov=real_cov, - ... size=500) + ... cov=real_cov, + ... size=500) >>> cov = EmpiricalCovariance().fit(X) >>> cov.covariance_ # doctest: +ELLIPSIS array([[0.7569..., 0.2818...], diff --git a/sklearn/covariance/graph_lasso_.py b/sklearn/covariance/graph_lasso_.py index 35ead3fcd8210..2e355f5cf3f1b 100644 --- a/sklearn/covariance/graph_lasso_.py +++ b/sklearn/covariance/graph_lasso_.py @@ -337,10 +337,10 @@ class GraphicalLasso(EmpiricalCovariance): -------- >>> import numpy as np >>> from sklearn.covariance import GraphicalLasso - >>> true_cov = np.array([[.8, 0., .2, 0.], - ... [0., .4, 0., 0.], - ... [.2, 0., .3, .1], - ... [0., 0., .1, .7]]) + >>> true_cov = np.array([[0.8, 0.0, 0.2, 0.0], + ... [0.0, 0.4, 0.0, 0.0], + ... [0.2, 0.0, 0.3, 0.1], + ... [0.0, 0.0, 0.1, 0.7]]) >>> np.random.seed(0) >>> X = np.random.multivariate_normal(mean=[0, 0, 0, 0], ... cov=true_cov, @@ -592,10 +592,10 @@ class GraphicalLassoCV(GraphicalLasso): -------- >>> import numpy as np >>> from sklearn.covariance import GraphicalLassoCV - >>> true_cov = np.array([[.8, 0., .2, 0.], - ... [0., .4, 0., 0.], - ... [.2, 0., .3, .1], - ... [0., 0., .1, .7]]) + >>> true_cov = np.array([[0.8, 0.0, 0.2, 0.0], + ... [0.0, 0.4, 0.0, 0.0], + ... [0.2, 0.0, 0.3, 0.1], + ... [0.0, 0.0, 0.1, 0.7]]) >>> np.random.seed(0) >>> X = np.random.multivariate_normal(mean=[0, 0, 0, 0], ... cov=true_cov, diff --git a/sklearn/datasets/base.py b/sklearn/datasets/base.py index 3fdffc5851d01..0b8f73c86117b 100644 --- a/sklearn/datasets/base.py +++ b/sklearn/datasets/base.py @@ -568,12 +568,12 @@ def load_digits(n_class=10, return_X_y=False): def load_diabetes(return_X_y=False): """Load and return the diabetes dataset (regression). - ============== ================== - Samples total 442 - Dimensionality 10 - Features real, -.2 < x < .2 - Targets integer 25 - 346 - ============== ================== + ============== ================== + Samples total 442 + Dimensionality 10 + Features real, -.2 < x < .2 + Targets integer 25 - 346 + ============== ================== Read more in the :ref:`User Guide `. @@ -621,12 +621,12 @@ def load_diabetes(return_X_y=False): def load_linnerud(return_X_y=False): """Load and return the linnerud dataset (multivariate regression). - ============== ============================ - Samples total 20 - Dimensionality 3 (for both data and target) - Features integer - Targets integer - ============== ============================ + ============== ============================ + Samples total 20 + Dimensionality 3 (for both data and target) + Features integer + Targets integer + ============== ============================ Read more in the :ref:`User Guide `. @@ -685,12 +685,12 @@ def load_linnerud(return_X_y=False): def load_boston(return_X_y=False): """Load and return the boston house-prices dataset (regression). - ============== ============== - Samples total 506 - Dimensionality 13 - Features real, positive - Targets real 5. - 50. 
- ============== ============== + ============== ============== + Samples total 506 + Dimensionality 13 + Features real, positive + Targets real 5. - 50. + ============== ============== Read more in the :ref:`User Guide `. @@ -810,7 +810,7 @@ def load_sample_image(image_name): Read more in the :ref:`User Guide `. Parameters - ----------- + ---------- image_name : {`china.jpg`, `flower.jpg`} The name of the sample image loaded @@ -820,7 +820,7 @@ def load_sample_image(image_name): The image as a numpy array: height x width x color Examples - --------- + -------- >>> from sklearn.datasets import load_sample_image >>> china = load_sample_image('china.jpg') # doctest: +SKIP @@ -895,7 +895,7 @@ def _fetch_remote(remote, dirname=None): downloaded file. Parameters - ----------- + ---------- remote : RemoteFileMetadata Named tuple containing remote dataset meta information: url, filename and checksum diff --git a/sklearn/datasets/california_housing.py b/sklearn/datasets/california_housing.py index 372d6e44f1b92..26550270c3aab 100644 --- a/sklearn/datasets/california_housing.py +++ b/sklearn/datasets/california_housing.py @@ -50,12 +50,12 @@ def fetch_california_housing(data_home=None, download_if_missing=True, return_X_y=False): """Load the California housing dataset (regression). - ============== ============== - Samples total 20640 - Dimensionality 8 - Features real - Target real 0.15 - 5. - ============== ============== + ============== ============== + Samples total 20640 + Dimensionality 8 + Features real + Target real 0.15 - 5. + ============== ============== Read more in the :ref:`User Guide `. @@ -97,7 +97,7 @@ def fetch_california_housing(data_home=None, download_if_missing=True, .. versionadded:: 0.20 Notes - ------ + ----- This dataset consists of 20,640 samples and 9 features. """ diff --git a/sklearn/datasets/openml.py b/sklearn/datasets/openml.py index 2363a9a4689ca..6f76ee15e2e40 100644 --- a/sklearn/datasets/openml.py +++ b/sklearn/datasets/openml.py @@ -449,9 +449,8 @@ def fetch_openml(name=None, version='active', data_id=None, data_home=None, .. note:: EXPERIMENTAL - The API is experimental in version 0.20 (particularly the return value - structure), and might have small backward-incompatible changes in - future releases. + The API is experimental (particularly the return value structure), + and might have small backward-incompatible changes in future releases. Parameters ---------- @@ -515,10 +514,9 @@ def fetch_openml(name=None, version='active', data_id=None, data_home=None, .. note:: EXPERIMENTAL - This interface is **experimental** as at version 0.20 and - subsequent releases may change attributes without notice - (although there should only be minor changes to ``data`` - and ``target``). + This interface is **experimental** and subsequent releases may + change attributes without notice (although there should only be + minor changes to ``data`` and ``target``). Missing values in the 'data' are represented as NaN's. Missing values in 'target' are represented as NaN's (numerical target) or None diff --git a/sklearn/datasets/species_distributions.py b/sklearn/datasets/species_distributions.py index 34e8251f9551f..83cb5b132ccd5 100644 --- a/sklearn/datasets/species_distributions.py +++ b/sklearn/datasets/species_distributions.py @@ -154,7 +154,7 @@ def fetch_species_distributions(data_home=None, instead of trying to download the data from the source site. 
Returns - -------- + ------- The data is returned as a Bunch object with the following attributes: coverages : array, shape = [14, 1592, 1212] diff --git a/sklearn/datasets/svmlight_format.py b/sklearn/datasets/svmlight_format.py index fbb38ceffa298..c85d4e91749b6 100644 --- a/sklearn/datasets/svmlight_format.py +++ b/sklearn/datasets/svmlight_format.py @@ -435,7 +435,7 @@ def dump_svmlight_file(X, y, f, zero_based=True, comment=None, query_id=None, # if a user wants to get fancy, they'll have to decode themselves. # Avoid mention of str and unicode types for Python 3.x compat. if isinstance(comment, bytes): - comment.decode("ascii") # just for the exception + comment.decode("ascii") # just for the exception else: comment = comment.encode("utf-8") if b"\0" in comment: diff --git a/sklearn/decomposition/base.py b/sklearn/decomposition/base.py index b318de0cd0daf..e0b9b33de0bda 100644 --- a/sklearn/decomposition/base.py +++ b/sklearn/decomposition/base.py @@ -27,7 +27,7 @@ def get_covariance(self): """Compute data covariance with the generative model. ``cov = components_.T * S**2 * components_ + sigma2 * eye(n_features)`` - where S**2 contains the explained variances, and sigma2 contains the + where S**2 contains the explained variances, and sigma2 contains the noise variances. Returns diff --git a/sklearn/decomposition/dict_learning.py b/sklearn/decomposition/dict_learning.py index ef823272e0e8f..8075b706a5f9c 100644 --- a/sklearn/decomposition/dict_learning.py +++ b/sklearn/decomposition/dict_learning.py @@ -171,7 +171,7 @@ def _sparse_encode(X, dictionary, gram, cov=None, algorithm='lasso_lars', copy_Xy=copy_cov).T else: raise ValueError('Sparse coding method must be "lasso_lars" ' - '"lasso_cd", "lasso", "threshold" or "omp", got %s.' + '"lasso_cd", "lasso", "threshold" or "omp", got %s.' % algorithm) if new_code.ndim != 2: return new_code.reshape(n_samples, n_components) diff --git a/sklearn/decomposition/kernel_pca.py b/sklearn/decomposition/kernel_pca.py index c1c695c96d82b..555bd619c5a62 100644 --- a/sklearn/decomposition/kernel_pca.py +++ b/sklearn/decomposition/kernel_pca.py @@ -230,9 +230,9 @@ def _fit_transform(self, K): # there is a link between # the eigenvectors of K=Phi(X)'Phi(X) and the ones of Phi(X)Phi(X)' # if v is an eigenvector of K - # then Phi(X)v is an eigenvector of Phi(X)Phi(X)' + # then Phi(X)v is an eigenvector of Phi(X)Phi(X)' # if u is an eigenvector of Phi(X)Phi(X)' - # then Phi(X)'u is an eigenvector of Phi(X)Phi(X)' + # then Phi(X)'u is an eigenvector of Phi(X)Phi(X)' # # At this stage our self.alphas_ (the v) have norm 1, we need to scale # them so that eigenvectors in kernel feature space (the u) have norm=1 diff --git a/sklearn/decomposition/pca.py b/sklearn/decomposition/pca.py index d1cee0345d5e6..5c8893d141724 100644 --- a/sklearn/decomposition/pca.py +++ b/sklearn/decomposition/pca.py @@ -223,6 +223,8 @@ class PCA(_BasePCA): The singular values are equal to the 2-norms of the ``n_components`` variables in the lower-dimensional space. + .. versionadded:: 0.19 + mean_ : array, shape (n_features,) Per-feature empirical mean, estimated from the training set. 
diff --git a/sklearn/discriminant_analysis.py b/sklearn/discriminant_analysis.py index e710bc5045b30..9d64292b702e0 100644 --- a/sklearn/discriminant_analysis.py +++ b/sklearn/discriminant_analysis.py @@ -716,7 +716,7 @@ def _decision_function(self, X): Xm = X - self.means_[i] X2 = np.dot(Xm, R * (S ** (-0.5))) norm2.append(np.sum(X2 ** 2, 1)) - norm2 = np.array(norm2).T # shape = [len(X), n_classes] + norm2 = np.array(norm2).T # shape = [len(X), n_classes] u = np.asarray([np.sum(np.log(s)) for s in self.scalings_]) return (-0.5 * (norm2 + u) + np.log(self.priors_)) diff --git a/sklearn/dummy.py b/sklearn/dummy.py index 6e1fe461fabe7..98ecef6f6c459 100644 --- a/sklearn/dummy.py +++ b/sklearn/dummy.py @@ -492,10 +492,10 @@ def predict(self, X, return_std=False): Returns ------- - y : array, shape = [n_samples] or [n_samples, n_outputs] + y : array, shape = [n_samples] or [n_samples, n_outputs] Predicted target values for X. - y_std : array, shape = [n_samples] or [n_samples, n_outputs] + y_std : array, shape = [n_samples] or [n_samples, n_outputs] Standard deviation of predictive distribution of query points. """ check_is_fitted(self, "constant_") diff --git a/sklearn/ensemble/_gb_losses.py b/sklearn/ensemble/_gb_losses.py index ca92589075b0c..19c66710bf0ad 100644 --- a/sklearn/ensemble/_gb_losses.py +++ b/sklearn/ensemble/_gb_losses.py @@ -879,6 +879,6 @@ def get_init_raw_predictions(self, X, estimator): 'lad': LeastAbsoluteError, 'huber': HuberLossFunction, 'quantile': QuantileLossFunction, - 'deviance': None, # for both, multinomial and binomial + 'deviance': None, # for both, multinomial and binomial 'exponential': ExponentialLoss, } diff --git a/sklearn/ensemble/_gradient_boosting.pyx b/sklearn/ensemble/_gradient_boosting.pyx index c46ed25a4c4dc..64225db2348dc 100644 --- a/sklearn/ensemble/_gradient_boosting.pyx +++ b/sklearn/ensemble/_gradient_boosting.pyx @@ -239,131 +239,6 @@ def predict_stage(np.ndarray[object, ndim=2] estimators, return predict_stages(estimators[stage:stage + 1], X, scale, out) -cdef inline int array_index(int32 val, int32[::1] arr): - """Find index of ``val`` in array ``arr``. """ - cdef int32 res = -1 - cdef int32 i = 0 - cdef int32 n = arr.shape[0] - for i in range(n): - if arr[i] == val: - res = i - break - return res - - -cpdef _partial_dependence_tree(Tree tree, DTYPE_t[:, ::1] X, - int32[::1] target_feature, - double learn_rate, - double[::1] out): - """Partial dependence of the response on the ``target_feature`` set. - - For each row in ``X`` a tree traversal is performed. - Each traversal starts from the root with weight 1.0. - - At each non-terminal node that splits on a target variable either - the left child or the right child is visited based on the feature - value of the current sample and the weight is not modified. - At each non-terminal node that splits on a complementary feature - both children are visited and the weight is multiplied by the fraction - of training samples which went to each child. - - At each terminal node the value of the node is multiplied by the - current weight (weights sum to 1 for all visited terminal nodes). - - Parameters - ---------- - tree : sklearn.tree.Tree - A regression tree; tree.values.shape[1] == 1 - X : memory view on 2d ndarray - The grid points on which the partial dependence - should be evaluated. X.shape[1] == target_feature.shape[0]. - target_feature : memory view on 1d ndarray - The set of target features for which the partial dependence - should be evaluated. X.shape[1] == target_feature.shape[0]. 
- learn_rate : double - Constant scaling factor for the leaf predictions. - out : memory view on 1d ndarray - The value of the partial dependence function on each grid - point. - """ - cdef Py_ssize_t i = 0 - cdef Py_ssize_t n_features = X.shape[1] - cdef Node* root_node = tree.nodes - cdef double *value = tree.value - cdef SIZE_t node_count = tree.node_count - - cdef SIZE_t stack_capacity = node_count * 2 - cdef Node **node_stack - cdef double[::1] weight_stack = np_ones((stack_capacity,), dtype=np_float64) - cdef SIZE_t stack_size = 1 - cdef double left_sample_frac - cdef double current_weight - cdef double total_weight = 0.0 - cdef Node *current_node - underlying_stack = np_zeros((stack_capacity,), dtype=np.intp) - node_stack = ( underlying_stack).data - - for i in range(X.shape[0]): - # init stacks for new example - stack_size = 1 - node_stack[0] = root_node - weight_stack[0] = 1.0 - total_weight = 0.0 - - while stack_size > 0: - # get top node on stack - stack_size -= 1 - current_node = node_stack[stack_size] - - if current_node.left_child == TREE_LEAF: - out[i] += weight_stack[stack_size] * value[current_node - root_node] * \ - learn_rate - total_weight += weight_stack[stack_size] - else: - # non-terminal node - feature_index = array_index(current_node.feature, target_feature) - if feature_index != -1: - # split feature in target set - # push left or right child on stack - if X[i, feature_index] <= current_node.threshold: - # left - node_stack[stack_size] = (root_node + - current_node.left_child) - else: - # right - node_stack[stack_size] = (root_node + - current_node.right_child) - stack_size += 1 - else: - # split feature in complement set - # push both children onto stack - - # push left child - node_stack[stack_size] = root_node + current_node.left_child - current_weight = weight_stack[stack_size] - left_sample_frac = root_node[current_node.left_child].weighted_n_node_samples / \ - current_node.weighted_n_node_samples - if left_sample_frac <= 0.0 or left_sample_frac >= 1.0: - raise ValueError("left_sample_frac:%d, " - "weighted_n_node_samples current: %d, " - "weighted_n_node_samples left: %d" - % (left_sample_frac, - current_node.weighted_n_node_samples, - root_node[current_node.left_child].weighted_n_node_samples)) - weight_stack[stack_size] = current_weight * left_sample_frac - stack_size +=1 - - # push right child - node_stack[stack_size] = root_node + current_node.right_child - weight_stack[stack_size] = current_weight * \ - (1.0 - left_sample_frac) - stack_size +=1 - - if not (0.999 < total_weight < 1.001): - raise ValueError("Total weight should be 1.0 but was %.9f" % - total_weight) - - def _random_sample_mask(np.npy_intp n_total_samples, np.npy_intp n_total_in_bag, random_state): """Create a random sample mask where ``n_total_in_bag`` elements are set. diff --git a/sklearn/ensemble/_hist_gradient_boosting/grower.py b/sklearn/ensemble/_hist_gradient_boosting/grower.py index ce7ac7116030a..064c7ce8b6411 100644 --- a/sklearn/ensemble/_hist_gradient_boosting/grower.py +++ b/sklearn/ensemble/_hist_gradient_boosting/grower.py @@ -101,7 +101,7 @@ def __lt__(self, other_node): priority). Parameters - ----------- + ---------- other_node : TreeNode The node to compare with. 
""" diff --git a/sklearn/ensemble/gradient_boosting.py b/sklearn/ensemble/gradient_boosting.py index 3ce0eb7f456da..468de934f3666 100644 --- a/sklearn/ensemble/gradient_boosting.py +++ b/sklearn/ensemble/gradient_boosting.py @@ -1694,7 +1694,7 @@ def _staged_raw_predict(self, X): Regression and binary classification are special cases with ``k == 1``, otherwise ``k==n_classes``. """ - X = check_array(X, dtype=DTYPE, order="C", accept_sparse='csr') + X = check_array(X, dtype=DTYPE, order="C", accept_sparse='csr') raw_predictions = self._raw_predict_init(X) for i in range(self.estimators_.shape[0]): predict_stage(self.estimators_, i, X, self.learning_rate, @@ -2085,7 +2085,7 @@ def decision_function(self, X): `classes_`. Regression and binary classification produce an array of shape [n_samples]. """ - X = check_array(X, dtype=DTYPE, order="C", accept_sparse='csr') + X = check_array(X, dtype=DTYPE, order="C", accept_sparse='csr') raw_predictions = self._raw_predict(X) if raw_predictions.shape[1] == 1: return raw_predictions.ravel() @@ -2527,7 +2527,7 @@ def predict(self, X): y : array, shape (n_samples,) The predicted values. """ - X = check_array(X, dtype=DTYPE, order="C", accept_sparse='csr') + X = check_array(X, dtype=DTYPE, order="C", accept_sparse='csr') # In regression we can directly return the raw value from the trees. return self._raw_predict(X).ravel() diff --git a/sklearn/ensemble/partial_dependence.py b/sklearn/ensemble/partial_dependence.py index 11d5208d2d179..b1a40ffd09d1e 100644 --- a/sklearn/ensemble/partial_dependence.py +++ b/sklearn/ensemble/partial_dependence.py @@ -20,7 +20,6 @@ from ..tree._tree import DTYPE from ..utils import deprecated -from ._gradient_boosting import _partial_dependence_tree from .gradient_boosting import BaseGradientBoosting @@ -174,8 +173,8 @@ def partial_dependence(gbrt, target_variables, grid=None, X=None, for stage in range(n_estimators): for k in range(n_trees_per_stage): tree = gbrt.estimators_[stage, k].tree_ - _partial_dependence_tree(tree, grid, target_variables, - gbrt.learning_rate, pdp[k]) + tree.compute_partial_dependence(grid, target_variables, pdp[k]) + pdp *= gbrt.learning_rate return pdp, axes diff --git a/sklearn/ensemble/tests/test_partial_dependence.py b/sklearn/ensemble/tests/test_partial_dependence.py index a40fea2ff0099..dc0e0419e812e 100644 --- a/sklearn/ensemble/tests/test_partial_dependence.py +++ b/sklearn/ensemble/tests/test_partial_dependence.py @@ -7,14 +7,12 @@ from numpy.testing import assert_array_equal, assert_allclose from sklearn.utils.testing import assert_raises -from sklearn.utils.testing import if_matplotlib from sklearn.ensemble.partial_dependence import partial_dependence from sklearn.ensemble.partial_dependence import plot_partial_dependence from sklearn.ensemble import GradientBoostingClassifier from sklearn.ensemble import GradientBoostingRegressor from sklearn import datasets from sklearn.utils.testing import ignore_warnings -from sklearn.utils.testing import assert_warns_message # toy sample @@ -156,8 +154,7 @@ def test_partial_dependecy_input(): @ignore_warnings(category=DeprecationWarning) @pytest.mark.filterwarnings('ignore: Using or importing the ABCs from') # matplotlib Python3.7 warning -@if_matplotlib -def test_plot_partial_dependence(): +def test_plot_partial_dependence(pyplot): # Test partial dependence plot function. 
clf = GradientBoostingRegressor(n_estimators=10, random_state=1) clf.fit(boston.data, boston.target) @@ -190,9 +187,8 @@ def test_plot_partial_dependence(): @pytest.mark.filterwarnings('ignore: Using or importing the ABCs from') # matplotlib Python3.7 warning -@if_matplotlib @ignore_warnings(category=DeprecationWarning) -def test_plot_partial_dependence_input(): +def test_plot_partial_dependence_input(pyplot): # Test partial dependence plot function input checks. clf = GradientBoostingClassifier(n_estimators=10, random_state=1) @@ -228,9 +224,8 @@ def test_plot_partial_dependence_input(): @pytest.mark.filterwarnings('ignore: Using or importing the ABCs from') # matplotlib Python3.7 warning -@if_matplotlib @ignore_warnings(category=DeprecationWarning) -def test_plot_partial_dependence_multiclass(): +def test_plot_partial_dependence_multiclass(pyplot): # Test partial dependence plot function on multi-class input. clf = GradientBoostingClassifier(n_estimators=10, random_state=1) clf.fit(iris.data, iris.target) @@ -265,30 +260,18 @@ def test_plot_partial_dependence_multiclass(): grid_resolution=grid_resolution) -def test_warning_raised_partial_dependence(): - # Test that deprecation warning is raised - - clf = GradientBoostingRegressor(n_estimators=10, random_state=1) - clf.fit(boston.data, boston.target) - grid_resolution = 25 - - assert_warns_message(DeprecationWarning, "The function " - "ensemble.partial_dependence has been deprecated ", - partial_dependence, clf, [0], X=boston.data, - grid_resolution=grid_resolution) - - -@if_matplotlib -def test_warning_raised_partial_dependence_plot(): - # Test that deprecation warning is raised - +@pytest.mark.parametrize( + "func, params", + [(partial_dependence, {'target_variables': [0], 'X': boston.data}), + (plot_partial_dependence, {'X': boston.data, 'features': [0, 1, (0, 1)]})] +) +def test_raise_deprecation_warning(pyplot, func, params): clf = GradientBoostingRegressor(n_estimators=10, random_state=1) clf.fit(boston.data, boston.target) grid_resolution = 25 - assert_warns_message(DeprecationWarning, "The function " - "ensemble.plot_partial_dependence has been " - "deprecated", - plot_partial_dependence, clf, boston.data, - [0, 1, (0, 1)], grid_resolution=grid_resolution, - feature_names=boston.feature_names) + warn_msg = "The function ensemble.{} has been deprecated".format( + func.__name__ + ) + with pytest.warns(DeprecationWarning, match=warn_msg): + func(clf, **params, grid_resolution=grid_resolution) diff --git a/sklearn/ensemble/tests/test_voting.py b/sklearn/ensemble/tests/test_voting.py index 2a19bc9a64dc0..b2b16cf8eeec3 100644 --- a/sklearn/ensemble/tests/test_voting.py +++ b/sklearn/ensemble/tests/test_voting.py @@ -8,9 +8,11 @@ from sklearn.utils.testing import assert_equal from sklearn.utils.testing import assert_raise_message from sklearn.exceptions import NotFittedError +from sklearn.linear_model import LinearRegression from sklearn.linear_model import LogisticRegression from sklearn.naive_bayes import GaussianNB from sklearn.ensemble import RandomForestClassifier +from sklearn.ensemble import RandomForestRegressor from sklearn.ensemble import VotingClassifier, VotingRegressor from sklearn.model_selection import GridSearchCV from sklearn import datasets @@ -340,12 +342,25 @@ def test_sample_weight(): assert_array_equal(eclf3.predict(X), clf1.predict(X)) assert_array_almost_equal(eclf3.predict_proba(X), clf1.predict_proba(X)) + # check that an error is raised and indicative if sample_weight is not + # supported. 
clf4 = KNeighborsClassifier() eclf3 = VotingClassifier(estimators=[ ('lr', clf1), ('svc', clf3), ('knn', clf4)], voting='soft') - msg = ('Underlying estimator \'knn\' does not support sample weights.') - assert_raise_message(ValueError, msg, eclf3.fit, X, y, sample_weight) + msg = ('Underlying estimator KNeighborsClassifier does not support ' + 'sample weights.') + with pytest.raises(ValueError, match=msg): + eclf3.fit(X, y, sample_weight) + + # check that _parallel_fit_estimator will raise the right error + # it should raise the original error if this is not linked to sample_weight + class ClassifierErrorFit(BaseEstimator, ClassifierMixin): + def fit(self, X, y, sample_weight): + raise TypeError('Error unrelated to sample_weight.') + clf = ClassifierErrorFit() + with pytest.raises(TypeError, match='Error unrelated to sample_weight'): + clf.fit(X, y, sample_weight=sample_weight) def test_sample_weight_kwargs(): @@ -402,8 +417,10 @@ def test_set_params(): @pytest.mark.filterwarnings('ignore: Default solver will be changed') # 0.22 @pytest.mark.filterwarnings('ignore: Default multi_class will') # 0.22 @pytest.mark.filterwarnings('ignore:The default value of n_estimators') -def test_set_estimator_none(): - """VotingClassifier set_params should be able to set estimators as None""" +@pytest.mark.parametrize("drop", [None, 'drop']) +def test_set_estimator_none(drop): + """VotingClassifier set_params should be able to set estimators as None or + drop""" # Test predict clf1 = LogisticRegression(random_state=123) clf2 = RandomForestClassifier(random_state=123) @@ -415,22 +432,22 @@ def test_set_estimator_none(): eclf2 = VotingClassifier(estimators=[('lr', clf1), ('rf', clf2), ('nb', clf3)], voting='hard', weights=[1, 1, 0.5]) - eclf2.set_params(rf=None).fit(X, y) + eclf2.set_params(rf=drop).fit(X, y) assert_array_equal(eclf1.predict(X), eclf2.predict(X)) - assert dict(eclf2.estimators)["rf"] is None + assert dict(eclf2.estimators)["rf"] is drop assert len(eclf2.estimators_) == 2 assert all(isinstance(est, (LogisticRegression, GaussianNB)) for est in eclf2.estimators_) - assert eclf2.get_params()["rf"] is None + assert eclf2.get_params()["rf"] is drop eclf1.set_params(voting='soft').fit(X, y) eclf2.set_params(voting='soft').fit(X, y) assert_array_equal(eclf1.predict(X), eclf2.predict(X)) assert_array_almost_equal(eclf1.predict_proba(X), eclf2.predict_proba(X)) - msg = 'All estimators are None. At least one is required!' + msg = 'All estimators are None or "drop". At least one is required!' 
assert_raise_message( - ValueError, msg, eclf2.set_params(lr=None, rf=None, nb=None).fit, X, y) + ValueError, msg, eclf2.set_params(lr=drop, rf=drop, nb=drop).fit, X, y) # Test soft voting transform X1 = np.array([[1], [2]]) @@ -442,7 +459,7 @@ def test_set_estimator_none(): eclf2 = VotingClassifier(estimators=[('rf', clf2), ('nb', clf3)], voting='soft', weights=[1, 0.5], flatten_transform=False) - eclf2.set_params(rf=None).fit(X1, y1) + eclf2.set_params(rf=drop).fit(X1, y1) assert_array_almost_equal(eclf1.transform(X1), np.array([[[0.7, 0.3], [0.3, 0.7]], [[1., 0.], [0., 1.]]])) @@ -507,3 +524,26 @@ def test_transform(): eclf3.transform(X).swapaxes(0, 1).reshape((4, 6)), eclf2.transform(X) ) + + +@pytest.mark.filterwarnings('ignore: Default solver will be changed') # 0.22 +@pytest.mark.filterwarnings('ignore: Default multi_class will') # 0.22 +@pytest.mark.parametrize( + "X, y, voter", + [(X, y, VotingClassifier( + [('lr', LogisticRegression()), + ('rf', RandomForestClassifier(n_estimators=5))])), + (X_r, y_r, VotingRegressor( + [('lr', LinearRegression()), + ('rf', RandomForestRegressor(n_estimators=5))]))] +) +@pytest.mark.parametrize("drop", [None, 'drop']) +def test_none_estimator_with_weights(X, y, voter, drop): + # check that an estimator can be set to None and passing some weight + # regression test for + # https://github.com/scikit-learn/scikit-learn/issues/13777 + voter.fit(X, y, sample_weight=np.ones(y.shape)) + voter.set_params(lr=drop) + voter.fit(X, y, sample_weight=np.ones(y.shape)) + y_pred = voter.predict(X) + assert y_pred.shape == y.shape diff --git a/sklearn/ensemble/voting.py b/sklearn/ensemble/voting.py index 35821201b6617..f60bb8f49b81d 100644 --- a/sklearn/ensemble/voting.py +++ b/sklearn/ensemble/voting.py @@ -30,7 +30,15 @@ def _parallel_fit_estimator(estimator, X, y, sample_weight=None): """Private function used to fit an estimator within a job.""" if sample_weight is not None: - estimator.fit(X, y, sample_weight=sample_weight) + try: + estimator.fit(X, y, sample_weight=sample_weight) + except TypeError as exc: + if "unexpected keyword argument 'sample_weight'" in str(exc): + raise ValueError( + "Underlying estimator {} does not support sample weights." + .format(estimator.__class__.__name__) + ) from exc + raise else: estimator.fit(X, y) return estimator @@ -53,8 +61,8 @@ def _weights_not_none(self): """Get the weights of not `None` estimators""" if self.weights is None: return None - return [w for est, w in zip(self.estimators, - self.weights) if est[1] is not None] + return [w for est, w in zip(self.estimators, self.weights) + if est[1] not in (None, 'drop')] def _predict(self, X): """Collect results from clf.predict calls. """ @@ -76,24 +84,22 @@ def fit(self, X, y, sample_weight=None): '; got %d weights, %d estimators' % (len(self.weights), len(self.estimators))) - if sample_weight is not None: - for name, step in self.estimators: - if not has_fit_parameter(step, 'sample_weight'): - raise ValueError('Underlying estimator \'%s\' does not' - ' support sample weights.' % name) - names, clfs = zip(*self.estimators) self._validate_names(names) - n_isnone = np.sum([clf is None for _, clf in self.estimators]) + n_isnone = np.sum( + [clf in (None, 'drop') for _, clf in self.estimators] + ) if n_isnone == len(self.estimators): - raise ValueError('All estimators are None. At least one is ' - 'required!') + raise ValueError( + 'All estimators are None or "drop". At least one is required!' 
+ ) self.estimators_ = Parallel(n_jobs=self.n_jobs)( delayed(_parallel_fit_estimator)(clone(clf), X, y, sample_weight=sample_weight) - for clf in clfs if clf is not None) + for clf in clfs if clf not in (None, 'drop') + ) self.named_estimators_ = Bunch() for k, e in zip(self.estimators, self.estimators_): @@ -147,8 +153,8 @@ class VotingClassifier(_BaseVoting, ClassifierMixin): estimators : list of (string, estimator) tuples Invoking the ``fit`` method on the ``VotingClassifier`` will fit clones of those original estimators that will be stored in the class attribute - ``self.estimators_``. An estimator can be set to `None` using - ``set_params``. + ``self.estimators_``. An estimator can be set to ``None`` or ``'drop'`` + using ``set_params``. voting : str, {'hard', 'soft'} (default='hard') If 'hard', uses predicted class labels for majority rule voting. @@ -281,7 +287,7 @@ def predict(self, X): The input samples. Returns - ---------- + ------- maj : array-like, shape (n_samples,) Predicted class labels. """ @@ -325,7 +331,7 @@ def predict_proba(self): The input samples. Returns - ---------- + ------- avg : array-like, shape (n_samples, n_classes) Weighted average probability for each class per sample. """ @@ -379,9 +385,9 @@ class VotingRegressor(_BaseVoting, RegressorMixin): Parameters ---------- estimators : list of (string, estimator) tuples - Invoking the ``fit`` method on the ``VotingRegressor`` will fit - clones of those original estimators that will be stored in the class - attribute ``self.estimators_``. An estimator can be set to `None` + Invoking the ``fit`` method on the ``VotingRegressor`` will fit clones + of those original estimators that will be stored in the class attribute + ``self.estimators_``. An estimator can be set to ``None`` or ``'drop'`` using ``set_params``. weights : array-like, shape (n_regressors,), optional (default=`None`) diff --git a/sklearn/ensemble/weight_boosting.py b/sklearn/ensemble/weight_boosting.py index 6e13b7bd80ae2..724aa07d2d76c 100644 --- a/sklearn/ensemble/weight_boosting.py +++ b/sklearn/ensemble/weight_boosting.py @@ -685,7 +685,7 @@ class in ``classes_``, respectively. # The weights are all 1. for SAMME.R pred = sum(_samme_proba(estimator, n_classes, X) for estimator in self.estimators_) - else: # self.algorithm == "SAMME" + else: # self.algorithm == "SAMME" pred = sum((estimator.predict(X) == classes).T * w for estimator, w in zip(self.estimators_, self.estimator_weights_)) @@ -780,7 +780,7 @@ def predict_proba(self, X): # The weights are all 1. for SAMME.R proba = sum(_samme_proba(estimator, n_classes, X) for estimator in self.estimators_) - else: # self.algorithm == "SAMME" + else: # self.algorithm == "SAMME" proba = sum(estimator.predict_proba(X) * w for estimator, w in zip(self.estimators_, self.estimator_weights_)) diff --git a/sklearn/exceptions.py b/sklearn/exceptions.py index 9cf207e40fdd6..22289db5c45e2 100644 --- a/sklearn/exceptions.py +++ b/sklearn/exceptions.py @@ -29,7 +29,7 @@ class NotFittedError(ValueError, AttributeError): ... LinearSVC().predict([[1, 2], [2, 3], [3, 4]]) ... except NotFittedError as e: ... print(repr(e)) - ... # doctest: +NORMALIZE_WHITESPACE +ELLIPSIS + ... # doctest: +NORMALIZE_WHITESPACE +ELLIPSIS NotFittedError('This LinearSVC instance is not fitted yet'...) .. versionchanged:: 0.18 @@ -114,7 +114,7 @@ class FitFailedWarning(RuntimeWarning): >>> X, y = [[1, 2], [3, 4], [5, 6], [7, 8]], [0, 0, 1, 1] >>> with warnings.catch_warnings(record=True) as w: ... try: - ... 
gs.fit(X, y) # This will raise a ValueError since C is < 0 + ... gs.fit(X, y) # This will raise a ValueError since C is < 0 ... except ValueError: ... pass ... print(repr(w[-1].message)) diff --git a/sklearn/experimental/enable_iterative_imputer.py b/sklearn/experimental/enable_iterative_imputer.py new file mode 100644 index 0000000000000..2f262141cc069 --- /dev/null +++ b/sklearn/experimental/enable_iterative_imputer.py @@ -0,0 +1,19 @@ +"""Enables IterativeImputer + +The API and results of this estimators might change without any deprecation +cycle. + +Importing this file dynamically sets :class:`sklearn.impute.IterativeImputer` +as an attribute of the impute module:: + + >>> # explicitly require this experimental feature + >>> from sklearn.experimental import enable_iterative_imputer # noqa + >>> # now you can import normally from impute + >>> from sklearn.impute import IterativeImputer +""" + +from ..impute._iterative import IterativeImputer +from .. import impute + +impute.IterativeImputer = IterativeImputer +impute.__all__ += ['IterativeImputer'] diff --git a/sklearn/experimental/tests/test_enable_iterative_imputer.py b/sklearn/experimental/tests/test_enable_iterative_imputer.py new file mode 100644 index 0000000000000..17579e0c43612 --- /dev/null +++ b/sklearn/experimental/tests/test_enable_iterative_imputer.py @@ -0,0 +1,39 @@ +"""Tests for making sure experimental imports work as expected.""" + +import textwrap + +from sklearn.utils.testing import assert_run_python_script + + +def test_imports_strategies(): + # Make sure different import strategies work or fail as expected. + + # Since Python caches the imported modules, we need to run a child process + # for every test case. Else, the tests would not be independent + # (manually removing the imports from the cache (sys.modules) is not + # recommended and can lead to many complications). + + good_import = """ + from sklearn.experimental import enable_iterative_imputer + from sklearn.impute import IterativeImputer + """ + assert_run_python_script(textwrap.dedent(good_import)) + + good_import_with_ensemble_first = """ + import sklearn.ensemble + from sklearn.experimental import enable_iterative_imputer + from sklearn.impute import IterativeImputer + """ + assert_run_python_script(textwrap.dedent(good_import_with_ensemble_first)) + + bad_imports = """ + import pytest + + with pytest.raises(ImportError): + from sklearn.impute import IterativeImputer + + import sklearn.experimental + with pytest.raises(ImportError): + from sklearn.impute import IterativeImputer + """ + assert_run_python_script(textwrap.dedent(bad_imports)) diff --git a/sklearn/feature_extraction/image.py b/sklearn/feature_extraction/image.py index 68a43ff0be313..e6f8ff4e555fa 100644 --- a/sklearn/feature_extraction/image.py +++ b/sklearn/feature_extraction/image.py @@ -32,7 +32,7 @@ def _make_edges_3d(n_x, n_y, n_z=1): """Returns a list of edges for a 3D image. Parameters - =========== + ---------- n_x : integer The size of the grid in the x direction. 
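# Hedged usage sketch (illustrative data, not part of the patch) of the voting
# changes earlier in this diff: an estimator can be disabled with 'drop' (in
# addition to None) via set_params, and sample_weight keeps being forwarded to
# the estimators that remain.
import numpy as np
from sklearn.ensemble import RandomForestClassifier, VotingClassifier
from sklearn.linear_model import LogisticRegression

X = np.array([[-1.0], [-0.5], [0.5], [1.0]])
y = np.array([0, 0, 1, 1])
eclf = VotingClassifier([('lr', LogisticRegression(solver='lbfgs')),
                         ('rf', RandomForestClassifier(n_estimators=5))])
eclf.set_params(rf='drop')  # only 'lr' is fitted below
eclf.fit(X, y, sample_weight=np.ones(y.shape))
print(eclf.predict(X))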
n_y : integer diff --git a/sklearn/feature_extraction/text.py b/sklearn/feature_extraction/text.py index 007e158f3a449..7891e332c8214 100644 --- a/sklearn/feature_extraction/text.py +++ b/sklearn/feature_extraction/text.py @@ -110,7 +110,7 @@ def _check_stop_list(stop): raise ValueError("not a built-in stop list: %s" % stop) elif stop is None: return None - else: # assume it's a collection + else: # assume it's a collection return frozenset(stop) @@ -1496,7 +1496,7 @@ class TfidfVectorizer(CountVectorizer): idf_ : array, shape (n_features) The inverse document frequency (IDF) vector; only defined - if ``use_idf`` is True. + if ``use_idf`` is True. stop_words_ : set Terms that were ignored because they either: diff --git a/sklearn/gaussian_process/kernels.py b/sklearn/gaussian_process/kernels.py index e7d5e97037427..88e97deaecf54 100644 --- a/sklearn/gaussian_process/kernels.py +++ b/sklearn/gaussian_process/kernels.py @@ -1156,7 +1156,7 @@ class RBF(StationaryKernelMixin, NormalizedKernelMixin, Kernel): .. versionadded:: 0.18 Parameters - ----------- + ---------- length_scale : float or array with shape (n_features,), default: 1.0 The length scale of the kernel. If a float, an isotropic kernel is used. If an array, an anisotropic kernel is used where each dimension @@ -1269,7 +1269,7 @@ class Matern(RBF): .. versionadded:: 0.18 Parameters - ----------- + ---------- length_scale : float or array with shape (n_features,), default: 1.0 The length scale of the kernel. If a float, an isotropic kernel is used. If an array, an anisotropic kernel is used where each dimension diff --git a/sklearn/impute.py b/sklearn/impute.py deleted file mode 100644 index 8bbf1bb94e242..0000000000000 --- a/sklearn/impute.py +++ /dev/null @@ -1,1339 +0,0 @@ -"""Transformers for missing value imputation""" -# Authors: Nicolas Tresegnie -# Sergey Feldman -# License: BSD 3 clause - -from __future__ import division - -import warnings -import numbers -from time import time -from distutils.version import LooseVersion - -import numpy as np -import numpy.ma as ma -import scipy -from scipy import sparse -from scipy import stats -from collections import namedtuple - -from .base import BaseEstimator, TransformerMixin -from .base import clone -from .exceptions import ConvergenceWarning -from .preprocessing import normalize -from .utils import check_array, check_random_state, safe_indexing -from .utils.sparsefuncs import _get_median -from .utils.validation import check_is_fitted -from .utils.validation import FLOAT_DTYPES -from .utils.fixes import _object_dtype_isnan -from .utils import is_scalar_nan - - -ImputerTriplet = namedtuple('ImputerTriplet', ['feat_idx', - 'neighbor_feat_idx', - 'estimator']) - -__all__ = [ - 'MissingIndicator', - 'SimpleImputer', - 'IterativeImputer', -] - - -def _check_inputs_dtype(X, missing_values): - if (X.dtype.kind in ("f", "i", "u") and - not isinstance(missing_values, numbers.Real)): - raise ValueError("'X' and 'missing_values' types are expected to be" - " both numerical. Got X.dtype={} and " - " type(missing_values)={}." - .format(X.dtype, type(missing_values))) - - -def _get_mask(X, value_to_mask): - """Compute the boolean mask X == missing_values.""" - if is_scalar_nan(value_to_mask): - if X.dtype.kind == "f": - return np.isnan(X) - elif X.dtype.kind in ("i", "u"): - # can't have NaNs in integer array. - return np.zeros(X.shape, dtype=bool) - else: - # np.isnan does not work on object dtypes. 
- return _object_dtype_isnan(X) - else: - # X == value_to_mask with object dytpes does not always perform - # element-wise for old versions of numpy - return np.equal(X, value_to_mask) - - -def _most_frequent(array, extra_value, n_repeat): - """Compute the most frequent value in a 1d array extended with - [extra_value] * n_repeat, where extra_value is assumed to be not part - of the array.""" - # Compute the most frequent value in array only - if array.size > 0: - with warnings.catch_warnings(): - # stats.mode raises a warning when input array contains objects due - # to incapacity to detect NaNs. Irrelevant here since input array - # has already been NaN-masked. - warnings.simplefilter("ignore", RuntimeWarning) - mode = stats.mode(array) - - most_frequent_value = mode[0][0] - most_frequent_count = mode[1][0] - else: - most_frequent_value = 0 - most_frequent_count = 0 - - # Compare to array + [extra_value] * n_repeat - if most_frequent_count == 0 and n_repeat == 0: - return np.nan - elif most_frequent_count < n_repeat: - return extra_value - elif most_frequent_count > n_repeat: - return most_frequent_value - elif most_frequent_count == n_repeat: - # Ties the breaks. Copy the behaviour of scipy.stats.mode - if most_frequent_value < extra_value: - return most_frequent_value - else: - return extra_value - - -class SimpleImputer(BaseEstimator, TransformerMixin): - """Imputation transformer for completing missing values. - - Read more in the :ref:`User Guide `. - - Parameters - ---------- - missing_values : number, string, np.nan (default) or None - The placeholder for the missing values. All occurrences of - `missing_values` will be imputed. - - strategy : string, optional (default="mean") - The imputation strategy. - - - If "mean", then replace missing values using the mean along - each column. Can only be used with numeric data. - - If "median", then replace missing values using the median along - each column. Can only be used with numeric data. - - If "most_frequent", then replace missing using the most frequent - value along each column. Can be used with strings or numeric data. - - If "constant", then replace missing values with fill_value. Can be - used with strings or numeric data. - - .. versionadded:: 0.20 - strategy="constant" for fixed value imputation. - - fill_value : string or numerical value, optional (default=None) - When strategy == "constant", fill_value is used to replace all - occurrences of missing_values. - If left to the default, fill_value will be 0 when imputing numerical - data and "missing_value" for strings or object data types. - - verbose : integer, optional (default=0) - Controls the verbosity of the imputer. - - copy : boolean, optional (default=True) - If True, a copy of X will be created. If False, imputation will - be done in-place whenever possible. Note that, in the following cases, - a new copy will always be made, even if `copy=False`: - - - If X is not an array of floating values; - - If X is encoded as a CSR matrix; - - If add_indicator=True. - - add_indicator : boolean, optional (default=False) - If True, a `MissingIndicator` transform will stack onto output - of the imputer's transform. This allows a predictive estimator - to account for missingness despite imputation. If a feature has no - missing values at fit/train time, the feature won't appear on - the missing indicator even if there are missing values at - transform/test time. - - Attributes - ---------- - statistics_ : array of shape (n_features,) - The imputation fill value for each feature. 
- - indicator_ : :class:`sklearn.impute.MissingIndicator` - Indicator used to add binary indicators for missing values. - ``None`` if add_indicator is False. - - See also - -------- - IterativeImputer : Multivariate imputation of missing values. - - Examples - -------- - >>> import numpy as np - >>> from sklearn.impute import SimpleImputer - >>> imp_mean = SimpleImputer(missing_values=np.nan, strategy='mean') - >>> imp_mean.fit([[7, 2, 3], [4, np.nan, 6], [10, 5, 9]]) - ... # doctest: +NORMALIZE_WHITESPACE - SimpleImputer(add_indicator=False, copy=True, fill_value=None, - missing_values=nan, strategy='mean', verbose=0) - >>> X = [[np.nan, 2, 3], [4, np.nan, 6], [10, np.nan, 9]] - >>> print(imp_mean.transform(X)) - ... # doctest: +NORMALIZE_WHITESPACE - [[ 7. 2. 3. ] - [ 4. 3.5 6. ] - [10. 3.5 9. ]] - - Notes - ----- - Columns which only contained missing values at `fit` are discarded upon - `transform` if strategy is not "constant". - - """ - def __init__(self, missing_values=np.nan, strategy="mean", - fill_value=None, verbose=0, copy=True, add_indicator=False): - self.missing_values = missing_values - self.strategy = strategy - self.fill_value = fill_value - self.verbose = verbose - self.copy = copy - self.add_indicator = add_indicator - - def _validate_input(self, X): - allowed_strategies = ["mean", "median", "most_frequent", "constant"] - if self.strategy not in allowed_strategies: - raise ValueError("Can only use these strategies: {0} " - " got strategy={1}".format(allowed_strategies, - self.strategy)) - - if self.strategy in ("most_frequent", "constant"): - dtype = None - else: - dtype = FLOAT_DTYPES - - if not is_scalar_nan(self.missing_values): - force_all_finite = True - else: - force_all_finite = "allow-nan" - - try: - X = check_array(X, accept_sparse='csc', dtype=dtype, - force_all_finite=force_all_finite, copy=self.copy) - except ValueError as ve: - if "could not convert" in str(ve): - raise ValueError("Cannot use {0} strategy with non-numeric " - "data. Received datatype :{1}." - "".format(self.strategy, X.dtype.kind)) - else: - raise ve - - _check_inputs_dtype(X, self.missing_values) - if X.dtype.kind not in ("i", "u", "f", "O"): - raise ValueError("SimpleImputer does not support data with dtype " - "{0}. Please provide either a numeric array (with" - " a floating point or integer dtype) or " - "categorical data represented either as an array " - "with integer dtype or an array of string values " - "with an object dtype.".format(X.dtype)) - - return X - - def fit(self, X, y=None): - """Fit the imputer on X. - - Parameters - ---------- - X : {array-like, sparse matrix}, shape (n_samples, n_features) - Input data, where ``n_samples`` is the number of samples and - ``n_features`` is the number of features. - - Returns - ------- - self : SimpleImputer - """ - X = self._validate_input(X) - - # default fill_value is 0 for numerical input and "missing_value" - # otherwise - if self.fill_value is None: - if X.dtype.kind in ("i", "u", "f"): - fill_value = 0 - else: - fill_value = "missing_value" - else: - fill_value = self.fill_value - - # fill_value should be numerical in case of numerical input - if (self.strategy == "constant" and - X.dtype.kind in ("i", "u", "f") and - not isinstance(fill_value, numbers.Real)): - raise ValueError("'fill_value'={0} is invalid. 
Expected a " - "numerical value when imputing numerical " - "data".format(fill_value)) - - if sparse.issparse(X): - # missing_values = 0 not allowed with sparse data as it would - # force densification - if self.missing_values == 0: - raise ValueError("Imputation not possible when missing_values " - "== 0 and input is sparse. Provide a dense " - "array instead.") - else: - self.statistics_ = self._sparse_fit(X, - self.strategy, - self.missing_values, - fill_value) - else: - self.statistics_ = self._dense_fit(X, - self.strategy, - self.missing_values, - fill_value) - - if self.add_indicator: - self.indicator_ = MissingIndicator( - missing_values=self.missing_values) - self.indicator_.fit(X) - else: - self.indicator_ = None - - return self - - def _sparse_fit(self, X, strategy, missing_values, fill_value): - """Fit the transformer on sparse data.""" - mask_data = _get_mask(X.data, missing_values) - n_implicit_zeros = X.shape[0] - np.diff(X.indptr) - - statistics = np.empty(X.shape[1]) - - if strategy == "constant": - # for constant strategy, self.statistcs_ is used to store - # fill_value in each column - statistics.fill(fill_value) - else: - for i in range(X.shape[1]): - column = X.data[X.indptr[i]:X.indptr[i + 1]] - mask_column = mask_data[X.indptr[i]:X.indptr[i + 1]] - column = column[~mask_column] - - # combine explicit and implicit zeros - mask_zeros = _get_mask(column, 0) - column = column[~mask_zeros] - n_explicit_zeros = mask_zeros.sum() - n_zeros = n_implicit_zeros[i] + n_explicit_zeros - - if strategy == "mean": - s = column.size + n_zeros - statistics[i] = np.nan if s == 0 else column.sum() / s - - elif strategy == "median": - statistics[i] = _get_median(column, - n_zeros) - - elif strategy == "most_frequent": - statistics[i] = _most_frequent(column, - 0, - n_zeros) - return statistics - - def _dense_fit(self, X, strategy, missing_values, fill_value): - """Fit the transformer on dense data.""" - mask = _get_mask(X, missing_values) - masked_X = ma.masked_array(X, mask=mask) - - # Mean - if strategy == "mean": - mean_masked = np.ma.mean(masked_X, axis=0) - # Avoid the warning "Warning: converting a masked element to nan." - mean = np.ma.getdata(mean_masked) - mean[np.ma.getmask(mean_masked)] = np.nan - - return mean - - # Median - elif strategy == "median": - median_masked = np.ma.median(masked_X, axis=0) - # Avoid the warning "Warning: converting a masked element to nan." - median = np.ma.getdata(median_masked) - median[np.ma.getmaskarray(median_masked)] = np.nan - - return median - - # Most frequent - elif strategy == "most_frequent": - # scipy.stats.mstats.mode cannot be used because it will no work - # properly if the first element is masked and if its frequency - # is equal to the frequency of the most frequent valid element - # See https://github.com/scipy/scipy/issues/2636 - - # To be able access the elements by columns - X = X.transpose() - mask = mask.transpose() - - if X.dtype.kind == "O": - most_frequent = np.empty(X.shape[0], dtype=object) - else: - most_frequent = np.empty(X.shape[0]) - - for i, (row, row_mask) in enumerate(zip(X[:], mask[:])): - row_mask = np.logical_not(row_mask).astype(np.bool) - row = row[row_mask] - most_frequent[i] = _most_frequent(row, np.nan, 0) - - return most_frequent - - # Constant - elif strategy == "constant": - # for constant strategy, self.statistcs_ is used to store - # fill_value in each column - return np.full(X.shape[1], fill_value, dtype=X.dtype) - - def transform(self, X): - """Impute all missing values in X. 
- - Parameters - ---------- - X : {array-like, sparse matrix}, shape (n_samples, n_features) - The input data to complete. - """ - check_is_fitted(self, 'statistics_') - - X = self._validate_input(X) - - statistics = self.statistics_ - - if X.shape[1] != statistics.shape[0]: - raise ValueError("X has %d features per sample, expected %d" - % (X.shape[1], self.statistics_.shape[0])) - - if self.add_indicator: - X_trans_indicator = self.indicator_.transform(X) - - # Delete the invalid columns if strategy is not constant - if self.strategy == "constant": - valid_statistics = statistics - else: - # same as np.isnan but also works for object dtypes - invalid_mask = _get_mask(statistics, np.nan) - valid_mask = np.logical_not(invalid_mask) - valid_statistics = statistics[valid_mask] - valid_statistics_indexes = np.flatnonzero(valid_mask) - - if invalid_mask.any(): - missing = np.arange(X.shape[1])[invalid_mask] - if self.verbose: - warnings.warn("Deleting features without " - "observed values: %s" % missing) - X = X[:, valid_statistics_indexes] - - # Do actual imputation - if sparse.issparse(X): - if self.missing_values == 0: - raise ValueError("Imputation not possible when missing_values " - "== 0 and input is sparse. Provide a dense " - "array instead.") - else: - mask = _get_mask(X.data, self.missing_values) - indexes = np.repeat(np.arange(len(X.indptr) - 1, dtype=np.int), - np.diff(X.indptr))[mask] - - X.data[mask] = valid_statistics[indexes].astype(X.dtype, - copy=False) - else: - mask = _get_mask(X, self.missing_values) - n_missing = np.sum(mask, axis=0) - values = np.repeat(valid_statistics, n_missing) - coordinates = np.where(mask.transpose())[::-1] - - X[coordinates] = values - - if self.add_indicator: - hstack = sparse.hstack if sparse.issparse(X) else np.hstack - X = hstack((X, X_trans_indicator)) - - return X - - def _more_tags(self): - return {'allow_nan': True} - - -class IterativeImputer(BaseEstimator, TransformerMixin): - """Multivariate imputer that estimates each feature from all the others. - - A strategy for imputing missing values by modeling each feature with - missing values as a function of other features in a round-robin fashion. - - Read more in the :ref:`User Guide `. - - Parameters - ---------- - estimator : estimator object, default=BayesianRidge() - The estimator to use at each step of the round-robin imputation. - If ``sample_posterior`` is True, the estimator must support - ``return_std`` in its ``predict`` method. - - missing_values : int, np.nan, optional (default=np.nan) - The placeholder for the missing values. All occurrences of - ``missing_values`` will be imputed. - - sample_posterior : boolean, default=False - Whether to sample from the (Gaussian) predictive posterior of the - fitted estimator for each imputation. Estimator must support - ``return_std`` in its ``predict`` method if set to ``True``. Set to - ``True`` if using ``IterativeImputer`` for multiple imputations. - - max_iter : int, optional (default=10) - Maximum number of imputation rounds to perform before returning the - imputations computed during the final round. A round is a single - imputation of each feature with missing values. The stopping criterion - is met once `abs(max(X_t - X_{t-1}))/abs(max(X[known_vals]))` < tol, - where `X_t` is `X` at iteration `t. Note that early stopping is only - applied if ``sample_posterior=False``. - - tol : float, optional (default=1e-3) - Tolerance of the stopping condition. 
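# Standalone sketch of the stopping rule documented above (the helper name
# has_converged is illustrative, not part of the patch): stop once the largest
# change between successive imputations falls below tol scaled by the largest
# observed, non-missing value.
import numpy as np

def has_converged(Xt, Xt_previous, X_observed_values, tol=1e-3):
    inf_norm = np.linalg.norm(Xt - Xt_previous, ord=np.inf, axis=None)
    return inf_norm < tol * np.max(np.abs(X_observed_values))

# e.g. has_converged(np.array([1.0, 2.0]), np.array([1.0, 2.001]),
#                    np.array([5.0]))  -> True (0.001 < 1e-3 * 5)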
- - n_nearest_features : int, optional (default=None) - Number of other features to use to estimate the missing values of - each feature column. Nearness between features is measured using - the absolute correlation coefficient between each feature pair (after - initial imputation). To ensure coverage of features throughout the - imputation process, the neighbor features are not necessarily nearest, - but are drawn with probability proportional to correlation for each - imputed target feature. Can provide significant speed-up when the - number of features is huge. If ``None``, all features will be used. - - initial_strategy : str, optional (default="mean") - Which strategy to use to initialize the missing values. Same as the - ``strategy`` parameter in :class:`sklearn.impute.SimpleImputer` - Valid values: {"mean", "median", "most_frequent", or "constant"}. - - imputation_order : str, optional (default="ascending") - The order in which the features will be imputed. Possible values: - - "ascending" - From features with fewest missing values to most. - "descending" - From features with most missing values to fewest. - "roman" - Left to right. - "arabic" - Right to left. - "random" - A random order for each round. - - min_value : float, optional (default=None) - Minimum possible imputed value. Default of ``None`` will set minimum - to negative infinity. - - max_value : float, optional (default=None) - Maximum possible imputed value. Default of ``None`` will set maximum - to positive infinity. - - verbose : int, optional (default=0) - Verbosity flag, controls the debug messages that are issued - as functions are evaluated. The higher, the more verbose. Can be 0, 1, - or 2. - - random_state : int, RandomState instance or None, optional (default=None) - The seed of the pseudo random number generator to use. Randomizes - selection of estimator features if n_nearest_features is not None, the - ``imputation_order`` if ``random``, and the sampling from posterior if - ``sample_posterior`` is True. Use an integer for determinism. - See :term:`the Glossary `. - - add_indicator : boolean, optional (default=False) - If True, a `MissingIndicator` transform will stack onto output - of the imputer's transform. This allows a predictive estimator - to account for missingness despite imputation. If a feature has no - missing values at fit/train time, the feature won't appear on - the missing indicator even if there are missing values at - transform/test time. - - Attributes - ---------- - initial_imputer_ : object of type :class:`sklearn.impute.SimpleImputer` - Imputer used to initialize the missing values. - - imputation_sequence_ : list of tuples - Each tuple has ``(feat_idx, neighbor_feat_idx, estimator)``, where - ``feat_idx`` is the current feature to be imputed, - ``neighbor_feat_idx`` is the array of other features used to impute the - current feature, and ``estimator`` is the trained estimator used for - the imputation. Length is ``self.n_features_with_missing_ * - self.n_iter_``. - - n_iter_ : int - Number of iteration rounds that occurred. Will be less than - ``self.max_iter`` if early stopping criterion was reached. - - n_features_with_missing_ : int - Number of features with missing values. - - indicator_ : :class:`sklearn.impute.MissingIndicator` - Indicator used to add binary indicators for missing values. - ``None`` if add_indicator is False. - - See also - -------- - SimpleImputer : Univariate imputation of missing values. 
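# Hedged usage sketch of the options and fitted attributes documented above
# (data and parameter values are illustrative, not from the patch): draw two
# neighbour features per step with probability proportional to correlation and
# randomize the update order each round.
import numpy as np
from sklearn.experimental import enable_iterative_imputer  # noqa
from sklearn.impute import IterativeImputer

rng = np.random.RandomState(0)
X = rng.rand(20, 4)
X[rng.rand(20, 4) < 0.2] = np.nan
imp = IterativeImputer(n_nearest_features=2, imputation_order='random',
                       random_state=0).fit(X)
print(imp.n_iter_, imp.n_features_with_missing_, len(imp.imputation_sequence_))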
- - Notes - ----- - To support imputation in inductive mode we store each feature's estimator - during the ``fit`` phase, and predict without refitting (in order) during - the ``transform`` phase. - - Features which contain all missing values at ``fit`` are discarded upon - ``transform``. - - Features with missing values during ``transform`` which did not have any - missing values during ``fit`` will be imputed with the initial imputation - method only. - - References - ---------- - .. [1] `Stef van Buuren, Karin Groothuis-Oudshoorn (2011). "mice: - Multivariate Imputation by Chained Equations in R". Journal of - Statistical Software 45: 1-67. - `_ - - .. [2] `S. F. Buck, (1960). "A Method of Estimation of Missing Values in - Multivariate Data Suitable for use with an Electronic Computer". - Journal of the Royal Statistical Society 22(2): 302-306. - `_ - """ - - def __init__(self, - estimator=None, - missing_values=np.nan, - sample_posterior=False, - max_iter=10, - tol=1e-3, - n_nearest_features=None, - initial_strategy="mean", - imputation_order='ascending', - min_value=None, - max_value=None, - verbose=0, - random_state=None, - add_indicator=False): - - self.estimator = estimator - self.missing_values = missing_values - self.sample_posterior = sample_posterior - self.max_iter = max_iter - self.tol = tol - self.n_nearest_features = n_nearest_features - self.initial_strategy = initial_strategy - self.imputation_order = imputation_order - self.min_value = min_value - self.max_value = max_value - self.verbose = verbose - self.random_state = random_state - self.add_indicator = add_indicator - - def _impute_one_feature(self, - X_filled, - mask_missing_values, - feat_idx, - neighbor_feat_idx, - estimator=None, - fit_mode=True): - """Impute a single feature from the others provided. - - This function predicts the missing values of one of the features using - the current estimates of all the other features. The ``estimator`` must - support ``return_std=True`` in its ``predict`` method for this function - to work. - - Parameters - ---------- - X_filled : ndarray - Input data with the most recent imputations. - - mask_missing_values : ndarray - Input data's missing indicator matrix. - - feat_idx : int - Index of the feature currently being imputed. - - neighbor_feat_idx : ndarray - Indices of the features to be used in imputing ``feat_idx``. - - estimator : object - The estimator to use at this step of the round-robin imputation. - If ``sample_posterior`` is True, the estimator must support - ``return_std`` in its ``predict`` method. - If None, it will be cloned from self._estimator. - - fit_mode : boolean, default=True - Whether to fit and predict with the estimator or just predict. - - Returns - ------- - X_filled : ndarray - Input data with ``X_filled[missing_row_mask, feat_idx]`` updated. - - estimator : estimator with sklearn API - The fitted estimator used to impute - ``X_filled[missing_row_mask, feat_idx]``. 
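# Usage sketch for the requirement stated above (not part of the patch): with
# sample_posterior=True the per-feature estimator must accept
# predict(..., return_std=True); BayesianRidge, which is also the default
# estimator, does, so it is a safe choice for multiple imputation.
from sklearn.experimental import enable_iterative_imputer  # noqa
from sklearn.impute import IterativeImputer
from sklearn.linear_model import BayesianRidge

imp = IterativeImputer(estimator=BayesianRidge(), sample_posterior=True,
                       random_state=0)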
- """ - - # if nothing is missing, just return the default - # (should not happen at fit time because feat_ids would be excluded) - missing_row_mask = mask_missing_values[:, feat_idx] - if not np.any(missing_row_mask): - return X_filled, estimator - - if estimator is None and fit_mode is False: - raise ValueError("If fit_mode is False, then an already-fitted " - "estimator should be passed in.") - - if estimator is None: - estimator = clone(self._estimator) - - if fit_mode: - X_train = safe_indexing(X_filled[:, neighbor_feat_idx], - ~missing_row_mask) - y_train = safe_indexing(X_filled[:, feat_idx], - ~missing_row_mask) - estimator.fit(X_train, y_train) - - # get posterior samples - X_test = safe_indexing(X_filled[:, neighbor_feat_idx], - missing_row_mask) - if self.sample_posterior: - mus, sigmas = estimator.predict(X_test, return_std=True) - imputed_values = np.zeros(mus.shape, dtype=X_filled.dtype) - # two types of problems: (1) non-positive sigmas, (2) mus outside - # legal range of min_value and max_value (results in inf sample) - positive_sigmas = sigmas > 0 - imputed_values[~positive_sigmas] = mus[~positive_sigmas] - mus_too_low = mus < self._min_value - imputed_values[mus_too_low] = self._min_value - mus_too_high = mus > self._max_value - imputed_values[mus_too_high] = self._max_value - # the rest can be sampled without statistical issues - inrange_mask = positive_sigmas & ~mus_too_low & ~mus_too_high - mus = mus[inrange_mask] - sigmas = sigmas[inrange_mask] - a = (self._min_value - mus) / sigmas - b = (self._max_value - mus) / sigmas - - if scipy.__version__ < LooseVersion('0.18'): - # bug with vector-valued `a` in old scipy - imputed_values[inrange_mask] = [ - stats.truncnorm(a=a_, b=b_, - loc=loc_, scale=scale_).rvs( - random_state=self.random_state_) - for a_, b_, loc_, scale_ - in zip(a, b, mus, sigmas)] - else: - truncated_normal = stats.truncnorm(a=a, b=b, - loc=mus, scale=sigmas) - imputed_values[inrange_mask] = truncated_normal.rvs( - random_state=self.random_state_) - else: - imputed_values = estimator.predict(X_test) - imputed_values = np.clip(imputed_values, - self._min_value, - self._max_value) - - # update the feature - X_filled[missing_row_mask, feat_idx] = imputed_values - return X_filled, estimator - - def _get_neighbor_feat_idx(self, - n_features, - feat_idx, - abs_corr_mat): - """Get a list of other features to predict ``feat_idx``. - - If self.n_nearest_features is less than or equal to the total - number of features, then use a probability proportional to the absolute - correlation between ``feat_idx`` and each other feature to randomly - choose a subsample of the other features (without replacement). - - Parameters - ---------- - n_features : int - Number of features in ``X``. - - feat_idx : int - Index of the feature currently being imputed. - - abs_corr_mat : ndarray, shape (n_features, n_features) - Absolute correlation matrix of ``X``. The diagonal has been zeroed - out and each feature has been normalized to sum to 1. Can be None. - - Returns - ------- - neighbor_feat_idx : array-like - The features to use to impute ``feat_idx``. 
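# Minimal sketch of the neighbour selection described above (standalone helper;
# it assumes, like the real code, that column feat_idx of abs_corr_mat has a
# zeroed diagonal entry and sums to 1, so it is a valid probability vector).
import numpy as np

def pick_neighbors(abs_corr_mat, feat_idx, n_nearest_features, rng):
    p = abs_corr_mat[:, feat_idx]
    return rng.choice(np.arange(abs_corr_mat.shape[0]), n_nearest_features,
                      replace=False, p=p)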
- """ - if (self.n_nearest_features is not None and - self.n_nearest_features < n_features): - p = abs_corr_mat[:, feat_idx] - neighbor_feat_idx = self.random_state_.choice( - np.arange(n_features), self.n_nearest_features, replace=False, - p=p) - else: - inds_left = np.arange(feat_idx) - inds_right = np.arange(feat_idx + 1, n_features) - neighbor_feat_idx = np.concatenate((inds_left, inds_right)) - return neighbor_feat_idx - - def _get_ordered_idx(self, mask_missing_values): - """Decide in what order we will update the features. - - As a homage to the MICE R package, we will have 4 main options of - how to order the updates, and use a random order if anything else - is specified. - - Also, this function skips features which have no missing values. - - Parameters - ---------- - mask_missing_values : array-like, shape (n_samples, n_features) - Input data's missing indicator matrix, where "n_samples" is the - number of samples and "n_features" is the number of features. - - Returns - ------- - ordered_idx : ndarray, shape (n_features,) - The order in which to impute the features. - """ - frac_of_missing_values = mask_missing_values.mean(axis=0) - missing_values_idx = np.nonzero(frac_of_missing_values)[0] - if self.imputation_order == 'roman': - ordered_idx = missing_values_idx - elif self.imputation_order == 'arabic': - ordered_idx = missing_values_idx[::-1] - elif self.imputation_order == 'ascending': - n = len(frac_of_missing_values) - len(missing_values_idx) - ordered_idx = np.argsort(frac_of_missing_values, - kind='mergesort')[n:][::-1] - elif self.imputation_order == 'descending': - n = len(frac_of_missing_values) - len(missing_values_idx) - ordered_idx = np.argsort(frac_of_missing_values, - kind='mergesort')[n:] - elif self.imputation_order == 'random': - ordered_idx = missing_values_idx - self.random_state_.shuffle(ordered_idx) - else: - raise ValueError("Got an invalid imputation order: '{0}'. It must " - "be one of the following: 'roman', 'arabic', " - "'ascending', 'descending', or " - "'random'.".format(self.imputation_order)) - return ordered_idx - - def _get_abs_corr_mat(self, X_filled, tolerance=1e-6): - """Get absolute correlation matrix between features. - - Parameters - ---------- - X_filled : ndarray, shape (n_samples, n_features) - Input data with the most recent imputations. - - tolerance : float, optional (default=1e-6) - ``abs_corr_mat`` can have nans, which will be replaced - with ``tolerance``. - - Returns - ------- - abs_corr_mat : ndarray, shape (n_features, n_features) - Absolute correlation matrix of ``X`` at the beginning of the - current round. The diagonal has been zeroed out and each feature's - absolute correlations with all others have been normalized to sum - to 1. - """ - n_features = X_filled.shape[1] - if (self.n_nearest_features is None or - self.n_nearest_features >= n_features): - return None - abs_corr_mat = np.abs(np.corrcoef(X_filled.T)) - # np.corrcoef is not defined for features with zero std - abs_corr_mat[np.isnan(abs_corr_mat)] = tolerance - # ensures exploration, i.e. at least some probability of sampling - np.clip(abs_corr_mat, tolerance, None, out=abs_corr_mat) - # features are not their own neighbors - np.fill_diagonal(abs_corr_mat, 0) - # needs to sum to 1 for np.random.choice sampling - abs_corr_mat = normalize(abs_corr_mat, norm='l1', axis=0, copy=False) - return abs_corr_mat - - def _initial_imputation(self, X): - """Perform initial imputation for input X. 
- - Parameters - ---------- - X : ndarray, shape (n_samples, n_features) - Input data, where "n_samples" is the number of samples and - "n_features" is the number of features. - - Returns - ------- - Xt : ndarray, shape (n_samples, n_features) - Input data, where "n_samples" is the number of samples and - "n_features" is the number of features. - - X_filled : ndarray, shape (n_samples, n_features) - Input data with the most recent imputations. - - mask_missing_values : ndarray, shape (n_samples, n_features) - Input data's missing indicator matrix, where "n_samples" is the - number of samples and "n_features" is the number of features. - """ - if is_scalar_nan(self.missing_values): - force_all_finite = "allow-nan" - else: - force_all_finite = True - - X = check_array(X, dtype=FLOAT_DTYPES, order="F", - force_all_finite=force_all_finite) - _check_inputs_dtype(X, self.missing_values) - - mask_missing_values = _get_mask(X, self.missing_values) - if self.initial_imputer_ is None: - self.initial_imputer_ = SimpleImputer( - missing_values=self.missing_values, - strategy=self.initial_strategy) - X_filled = self.initial_imputer_.fit_transform(X) - else: - X_filled = self.initial_imputer_.transform(X) - - valid_mask = np.flatnonzero(np.logical_not( - np.isnan(self.initial_imputer_.statistics_))) - Xt = X[:, valid_mask] - mask_missing_values = mask_missing_values[:, valid_mask] - - return Xt, X_filled, mask_missing_values - - def fit_transform(self, X, y=None): - """Fits the imputer on X and return the transformed X. - - Parameters - ---------- - X : array-like, shape (n_samples, n_features) - Input data, where "n_samples" is the number of samples and - "n_features" is the number of features. - - y : ignored. - - Returns - ------- - Xt : array-like, shape (n_samples, n_features) - The imputed input data. - """ - self.random_state_ = getattr(self, "random_state_", - check_random_state(self.random_state)) - - if self.max_iter < 0: - raise ValueError( - "'max_iter' should be a positive integer. Got {} instead." - .format(self.max_iter)) - - if self.tol < 0: - raise ValueError( - "'tol' should be a non-negative float. Got {} instead." - .format(self.tol) - ) - - if self.add_indicator: - self.indicator_ = MissingIndicator( - missing_values=self.missing_values) - X_trans_indicator = self.indicator_.fit_transform(X) - else: - self.indicator_ = None - - if self.estimator is None: - from .linear_model import BayesianRidge - self._estimator = BayesianRidge() - else: - self._estimator = clone(self.estimator) - - self.imputation_sequence_ = [] - - if hasattr(self._estimator, 'random_state'): - self._estimator.random_state = self.random_state_ - - self._min_value = -np.inf if self.min_value is None else self.min_value - self._max_value = np.inf if self.max_value is None else self.max_value - - self.initial_imputer_ = None - X, Xt, mask_missing_values = self._initial_imputation(X) - - if self.max_iter == 0 or np.all(mask_missing_values): - self.n_iter_ = 0 - return Xt - - # order in which to impute - # note this is probably too slow for large feature data (d > 100000) - # and a better way would be good. 
- # see: https://goo.gl/KyCNwj and subsequent comments - ordered_idx = self._get_ordered_idx(mask_missing_values) - self.n_features_with_missing_ = len(ordered_idx) - - abs_corr_mat = self._get_abs_corr_mat(Xt) - - n_samples, n_features = Xt.shape - if self.verbose > 0: - print("[IterativeImputer] Completing matrix with shape %s" - % (X.shape,)) - start_t = time() - if not self.sample_posterior: - Xt_previous = Xt.copy() - normalized_tol = self.tol * np.max(np.abs(X[~mask_missing_values])) - for self.n_iter_ in range(1, self.max_iter + 1): - if self.imputation_order == 'random': - ordered_idx = self._get_ordered_idx(mask_missing_values) - - for feat_idx in ordered_idx: - neighbor_feat_idx = self._get_neighbor_feat_idx(n_features, - feat_idx, - abs_corr_mat) - Xt, estimator = self._impute_one_feature( - Xt, mask_missing_values, feat_idx, neighbor_feat_idx, - estimator=None, fit_mode=True) - estimator_triplet = ImputerTriplet(feat_idx, - neighbor_feat_idx, - estimator) - self.imputation_sequence_.append(estimator_triplet) - - if self.verbose > 1: - print('[IterativeImputer] Ending imputation round ' - '%d/%d, elapsed time %0.2f' - % (self.n_iter_, self.max_iter, time() - start_t)) - - if not self.sample_posterior: - inf_norm = np.linalg.norm(Xt - Xt_previous, ord=np.inf, - axis=None) - if inf_norm < normalized_tol: - if self.verbose > 0: - print('[IterativeImputer] Early stopping criterion ' - 'reached.') - break - Xt_previous = Xt.copy() - else: - if not self.sample_posterior: - warnings.warn("[IterativeImputer] Early stopping criterion not" - " reached.", ConvergenceWarning) - Xt[~mask_missing_values] = X[~mask_missing_values] - - if self.add_indicator: - Xt = np.hstack((Xt, X_trans_indicator)) - return Xt - - def transform(self, X): - """Imputes all missing values in X. - - Note that this is stochastic, and that if random_state is not fixed, - repeated calls, or permuted input, will yield different results. - - Parameters - ---------- - X : array-like, shape = [n_samples, n_features] - The input data to complete. - - Returns - ------- - Xt : array-like, shape (n_samples, n_features) - The imputed input data. - """ - check_is_fitted(self, 'initial_imputer_') - - if self.add_indicator: - X_trans_indicator = self.indicator_.transform(X) - - X, Xt, mask_missing_values = self._initial_imputation(X) - - if self.n_iter_ == 0 or np.all(mask_missing_values): - return Xt - - imputations_per_round = len(self.imputation_sequence_) // self.n_iter_ - i_rnd = 0 - if self.verbose > 0: - print("[IterativeImputer] Completing matrix with shape %s" - % (X.shape,)) - start_t = time() - for it, estimator_triplet in enumerate(self.imputation_sequence_): - Xt, _ = self._impute_one_feature( - Xt, - mask_missing_values, - estimator_triplet.feat_idx, - estimator_triplet.neighbor_feat_idx, - estimator=estimator_triplet.estimator, - fit_mode=False - ) - if not (it + 1) % imputations_per_round: - if self.verbose > 1: - print('[IterativeImputer] Ending imputation round ' - '%d/%d, elapsed time %0.2f' - % (i_rnd + 1, self.n_iter_, time() - start_t)) - i_rnd += 1 - - Xt[~mask_missing_values] = X[~mask_missing_values] - - if self.add_indicator: - Xt = np.hstack((Xt, X_trans_indicator)) - return Xt - - def fit(self, X, y=None): - """Fits the imputer on X and return self. - - Parameters - ---------- - X : array-like, shape (n_samples, n_features) - Input data, where "n_samples" is the number of samples and - "n_features" is the number of features. - - y : ignored - - Returns - ------- - self : object - Returns self. 
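# Sketch of the inductive usage implied above (illustrative data, not from the
# patch): the estimators stored in imputation_sequence_ during fit are replayed
# in order, without refitting, when transform is called on new data.
import numpy as np
from sklearn.experimental import enable_iterative_imputer  # noqa
from sklearn.impute import IterativeImputer

rng = np.random.RandomState(0)
X_train = rng.rand(30, 3)
X_train[rng.rand(30, 3) < 0.2] = np.nan
X_test = rng.rand(5, 3)
X_test[0, 1] = np.nan
imp = IterativeImputer(random_state=0).fit(X_train)
print(imp.transform(X_test))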
- """ - self.fit_transform(X) - return self - - def _more_tags(self): - return {'allow_nan': True} - - -class MissingIndicator(BaseEstimator, TransformerMixin): - """Binary indicators for missing values. - - Note that this component typically should not be used in a vanilla - :class:`Pipeline` consisting of transformers and a classifier, but rather - could be added using a :class:`FeatureUnion` or :class:`ColumnTransformer`. - - Read more in the :ref:`User Guide `. - - Parameters - ---------- - missing_values : number, string, np.nan (default) or None - The placeholder for the missing values. All occurrences of - `missing_values` will be indicated (True in the output array), the - other values will be marked as False. - - features : str, optional - Whether the imputer mask should represent all or a subset of - features. - - - If "missing-only" (default), the imputer mask will only represent - features containing missing values during fit time. - - If "all", the imputer mask will represent all features. - - sparse : boolean or "auto", optional - Whether the imputer mask format should be sparse or dense. - - - If "auto" (default), the imputer mask will be of same type as - input. - - If True, the imputer mask will be a sparse matrix. - - If False, the imputer mask will be a numpy array. - - error_on_new : boolean, optional - If True (default), transform will raise an error when there are - features with missing values in transform that have no missing values - in fit. This is applicable only when ``features="missing-only"``. - - Attributes - ---------- - features_ : ndarray, shape (n_missing_features,) or (n_features,) - The features indices which will be returned when calling ``transform``. - They are computed during ``fit``. For ``features='all'``, it is - to ``range(n_features)``. - - Examples - -------- - >>> import numpy as np - >>> from sklearn.impute import MissingIndicator - >>> X1 = np.array([[np.nan, 1, 3], - ... [4, 0, np.nan], - ... [8, 1, 0]]) - >>> X2 = np.array([[5, 1, np.nan], - ... [np.nan, 2, 3], - ... [2, 4, 0]]) - >>> indicator = MissingIndicator() - >>> indicator.fit(X1) # doctest: +NORMALIZE_WHITESPACE - MissingIndicator(error_on_new=True, features='missing-only', - missing_values=nan, sparse='auto') - >>> X2_tr = indicator.transform(X2) - >>> X2_tr - array([[False, True], - [ True, False], - [False, False]]) - - """ - - def __init__(self, missing_values=np.nan, features="missing-only", - sparse="auto", error_on_new=True): - self.missing_values = missing_values - self.features = features - self.sparse = sparse - self.error_on_new = error_on_new - - def _get_missing_features_info(self, X): - """Compute the imputer mask and the indices of the features - containing missing values. - - Parameters - ---------- - X : {ndarray or sparse matrix}, shape (n_samples, n_features) - The input data with missing values. Note that ``X`` has been - checked in ``fit`` and ``transform`` before to call this function. - - Returns - ------- - imputer_mask : {ndarray or sparse matrix}, shape \ -(n_samples, n_features) or (n_samples, n_features_with_missing) - The imputer mask of the original data. - - features_with_missing : ndarray, shape (n_features_with_missing) - The features containing missing values. - - """ - if sparse.issparse(X): - mask = _get_mask(X.data, self.missing_values) - - # The imputer mask will be constructed with the same sparse format - # as X. 
- sparse_constructor = (sparse.csr_matrix if X.format == 'csr' - else sparse.csc_matrix) - imputer_mask = sparse_constructor( - (mask, X.indices.copy(), X.indptr.copy()), - shape=X.shape, dtype=bool) - imputer_mask.eliminate_zeros() - - if self.features == 'missing-only': - n_missing = imputer_mask.getnnz(axis=0) - - if self.sparse is False: - imputer_mask = imputer_mask.toarray() - elif imputer_mask.format == 'csr': - imputer_mask = imputer_mask.tocsc() - else: - imputer_mask = _get_mask(X, self.missing_values) - - if self.features == 'missing-only': - n_missing = imputer_mask.sum(axis=0) - - if self.sparse is True: - imputer_mask = sparse.csc_matrix(imputer_mask) - - if self.features == 'all': - features_indices = np.arange(X.shape[1]) - else: - features_indices = np.flatnonzero(n_missing) - - return imputer_mask, features_indices - - def _validate_input(self, X): - if not is_scalar_nan(self.missing_values): - force_all_finite = True - else: - force_all_finite = "allow-nan" - X = check_array(X, accept_sparse=('csc', 'csr'), dtype=None, - force_all_finite=force_all_finite) - _check_inputs_dtype(X, self.missing_values) - if X.dtype.kind not in ("i", "u", "f", "O"): - raise ValueError("MissingIndicator does not support data with " - "dtype {0}. Please provide either a numeric array" - " (with a floating point or integer dtype) or " - "categorical data represented either as an array " - "with integer dtype or an array of string values " - "with an object dtype.".format(X.dtype)) - - if sparse.issparse(X) and self.missing_values == 0: - # missing_values = 0 not allowed with sparse data as it would - # force densification - raise ValueError("Sparse input with missing_values=0 is " - "not supported. Provide a dense " - "array instead.") - - return X - - def fit(self, X, y=None): - """Fit the transformer on X. - - Parameters - ---------- - X : {array-like, sparse matrix}, shape (n_samples, n_features) - Input data, where ``n_samples`` is the number of samples and - ``n_features`` is the number of features. - - Returns - ------- - self : object - Returns self. - """ - X = self._validate_input(X) - self._n_features = X.shape[1] - - if self.features not in ('missing-only', 'all'): - raise ValueError("'features' has to be either 'missing-only' or " - "'all'. Got {} instead.".format(self.features)) - - if not ((isinstance(self.sparse, str) and - self.sparse == "auto") or isinstance(self.sparse, bool)): - raise ValueError("'sparse' has to be a boolean or 'auto'. " - "Got {!r} instead.".format(self.sparse)) - - self.features_ = self._get_missing_features_info(X)[1] - - return self - - def transform(self, X): - """Generate missing values indicator for X. - - Parameters - ---------- - X : {array-like, sparse matrix}, shape (n_samples, n_features) - The input data to complete. - - Returns - ------- - Xt : {ndarray or sparse matrix}, shape (n_samples, n_features) - The missing indicator for input data. The data type of ``Xt`` - will be boolean. 
- - """ - check_is_fitted(self, "features_") - X = self._validate_input(X) - - if X.shape[1] != self._n_features: - raise ValueError("X has a different number of features " - "than during fitting.") - - imputer_mask, features = self._get_missing_features_info(X) - - if self.features == "missing-only": - features_diff_fit_trans = np.setdiff1d(features, self.features_) - if (self.error_on_new and features_diff_fit_trans.size > 0): - raise ValueError("The features {} have missing values " - "in transform but have no missing values " - "in fit.".format(features_diff_fit_trans)) - - if self.features_.size < self._n_features: - imputer_mask = imputer_mask[:, self.features_] - - return imputer_mask - - def fit_transform(self, X, y=None): - """Generate missing values indicator for X. - - Parameters - ---------- - X : {array-like, sparse matrix}, shape (n_samples, n_features) - The input data to complete. - - Returns - ------- - Xt : {ndarray or sparse matrix}, shape (n_samples, n_features) - The missing indicator for input data. The data type of ``Xt`` - will be boolean. - - """ - return self.fit(X, y).transform(X) - - def _more_tags(self): - return {'allow_nan': True, - 'X_types': ['2darray', 'str']} diff --git a/sklearn/impute/__init__.py b/sklearn/impute/__init__.py new file mode 100644 index 0000000000000..abeb4d471f5f3 --- /dev/null +++ b/sklearn/impute/__init__.py @@ -0,0 +1,8 @@ +"""Transformers for missing value imputation""" + +from ._base import MissingIndicator, SimpleImputer + +__all__ = [ + 'MissingIndicator', + 'SimpleImputer', +] diff --git a/sklearn/impute/_base.py b/sklearn/impute/_base.py new file mode 100644 index 0000000000000..7be9da691ce11 --- /dev/null +++ b/sklearn/impute/_base.py @@ -0,0 +1,675 @@ +# Authors: Nicolas Tresegnie +# Sergey Feldman +# License: BSD 3 clause + +from __future__ import division + +import warnings +import numbers + +import numpy as np +import numpy.ma as ma +from scipy import sparse +from scipy import stats + +from ..base import BaseEstimator, TransformerMixin +from ..utils.sparsefuncs import _get_median +from ..utils.validation import check_is_fitted +from ..utils.validation import FLOAT_DTYPES +from ..utils.fixes import _object_dtype_isnan +from ..utils import is_scalar_nan +from ..utils import check_array + + +def _check_inputs_dtype(X, missing_values): + if (X.dtype.kind in ("f", "i", "u") and + not isinstance(missing_values, numbers.Real)): + raise ValueError("'X' and 'missing_values' types are expected to be" + " both numerical. Got X.dtype={} and " + " type(missing_values)={}." + .format(X.dtype, type(missing_values))) + + +def _get_mask(X, value_to_mask): + """Compute the boolean mask X == missing_values.""" + if is_scalar_nan(value_to_mask): + if X.dtype.kind == "f": + return np.isnan(X) + elif X.dtype.kind in ("i", "u"): + # can't have NaNs in integer array. + return np.zeros(X.shape, dtype=bool) + else: + # np.isnan does not work on object dtypes. 
+ return _object_dtype_isnan(X) + else: + # X == value_to_mask with object dytpes does not always perform + # element-wise for old versions of numpy + return np.equal(X, value_to_mask) + + +def _most_frequent(array, extra_value, n_repeat): + """Compute the most frequent value in a 1d array extended with + [extra_value] * n_repeat, where extra_value is assumed to be not part + of the array.""" + # Compute the most frequent value in array only + if array.size > 0: + with warnings.catch_warnings(): + # stats.mode raises a warning when input array contains objects due + # to incapacity to detect NaNs. Irrelevant here since input array + # has already been NaN-masked. + warnings.simplefilter("ignore", RuntimeWarning) + mode = stats.mode(array) + + most_frequent_value = mode[0][0] + most_frequent_count = mode[1][0] + else: + most_frequent_value = 0 + most_frequent_count = 0 + + # Compare to array + [extra_value] * n_repeat + if most_frequent_count == 0 and n_repeat == 0: + return np.nan + elif most_frequent_count < n_repeat: + return extra_value + elif most_frequent_count > n_repeat: + return most_frequent_value + elif most_frequent_count == n_repeat: + # Ties the breaks. Copy the behaviour of scipy.stats.mode + if most_frequent_value < extra_value: + return most_frequent_value + else: + return extra_value + + +class SimpleImputer(BaseEstimator, TransformerMixin): + """Imputation transformer for completing missing values. + + Read more in the :ref:`User Guide `. + + Parameters + ---------- + missing_values : number, string, np.nan (default) or None + The placeholder for the missing values. All occurrences of + `missing_values` will be imputed. + + strategy : string, optional (default="mean") + The imputation strategy. + + - If "mean", then replace missing values using the mean along + each column. Can only be used with numeric data. + - If "median", then replace missing values using the median along + each column. Can only be used with numeric data. + - If "most_frequent", then replace missing using the most frequent + value along each column. Can be used with strings or numeric data. + - If "constant", then replace missing values with fill_value. Can be + used with strings or numeric data. + + .. versionadded:: 0.20 + strategy="constant" for fixed value imputation. + + fill_value : string or numerical value, optional (default=None) + When strategy == "constant", fill_value is used to replace all + occurrences of missing_values. + If left to the default, fill_value will be 0 when imputing numerical + data and "missing_value" for strings or object data types. + + verbose : integer, optional (default=0) + Controls the verbosity of the imputer. + + copy : boolean, optional (default=True) + If True, a copy of X will be created. If False, imputation will + be done in-place whenever possible. Note that, in the following cases, + a new copy will always be made, even if `copy=False`: + + - If X is not an array of floating values; + - If X is encoded as a CSR matrix; + - If add_indicator=True. + + add_indicator : boolean, optional (default=False) + If True, a `MissingIndicator` transform will stack onto output + of the imputer's transform. This allows a predictive estimator + to account for missingness despite imputation. If a feature has no + missing values at fit/train time, the feature won't appear on + the missing indicator even if there are missing values at + transform/test time. + + Attributes + ---------- + statistics_ : array of shape (n_features,) + The imputation fill value for each feature. 
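# Hedged example of the "constant" strategy and the fill_value default spelled
# out above (illustrative data, not part of the patch): for object/string data
# the default fill_value falls back to the string "missing_value".
import numpy as np
from sklearn.impute import SimpleImputer

X = np.array([['a', np.nan], [np.nan, 'b']], dtype=object)
print(SimpleImputer(strategy='constant').fit_transform(X))
# [['a' 'missing_value']
#  ['missing_value' 'b']]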
+ + indicator_ : :class:`sklearn.impute.MissingIndicator` + Indicator used to add binary indicators for missing values. + ``None`` if add_indicator is False. + + See also + -------- + IterativeImputer : Multivariate imputation of missing values. + + Examples + -------- + >>> import numpy as np + >>> from sklearn.impute import SimpleImputer + >>> imp_mean = SimpleImputer(missing_values=np.nan, strategy='mean') + >>> imp_mean.fit([[7, 2, 3], [4, np.nan, 6], [10, 5, 9]]) + ... # doctest: +NORMALIZE_WHITESPACE + SimpleImputer(add_indicator=False, copy=True, fill_value=None, + missing_values=nan, strategy='mean', verbose=0) + >>> X = [[np.nan, 2, 3], [4, np.nan, 6], [10, np.nan, 9]] + >>> print(imp_mean.transform(X)) + ... # doctest: +NORMALIZE_WHITESPACE + [[ 7. 2. 3. ] + [ 4. 3.5 6. ] + [10. 3.5 9. ]] + + Notes + ----- + Columns which only contained missing values at `fit` are discarded upon + `transform` if strategy is not "constant". + + """ + def __init__(self, missing_values=np.nan, strategy="mean", + fill_value=None, verbose=0, copy=True, add_indicator=False): + self.missing_values = missing_values + self.strategy = strategy + self.fill_value = fill_value + self.verbose = verbose + self.copy = copy + self.add_indicator = add_indicator + + def _validate_input(self, X): + allowed_strategies = ["mean", "median", "most_frequent", "constant"] + if self.strategy not in allowed_strategies: + raise ValueError("Can only use these strategies: {0} " + " got strategy={1}".format(allowed_strategies, + self.strategy)) + + if self.strategy in ("most_frequent", "constant"): + dtype = None + else: + dtype = FLOAT_DTYPES + + if not is_scalar_nan(self.missing_values): + force_all_finite = True + else: + force_all_finite = "allow-nan" + + try: + X = check_array(X, accept_sparse='csc', dtype=dtype, + force_all_finite=force_all_finite, copy=self.copy) + except ValueError as ve: + if "could not convert" in str(ve): + raise ValueError("Cannot use {0} strategy with non-numeric " + "data. Received datatype :{1}." + "".format(self.strategy, X.dtype.kind)) + else: + raise ve + + _check_inputs_dtype(X, self.missing_values) + if X.dtype.kind not in ("i", "u", "f", "O"): + raise ValueError("SimpleImputer does not support data with dtype " + "{0}. Please provide either a numeric array (with" + " a floating point or integer dtype) or " + "categorical data represented either as an array " + "with integer dtype or an array of string values " + "with an object dtype.".format(X.dtype)) + + return X + + def fit(self, X, y=None): + """Fit the imputer on X. + + Parameters + ---------- + X : {array-like, sparse matrix}, shape (n_samples, n_features) + Input data, where ``n_samples`` is the number of samples and + ``n_features`` is the number of features. + + Returns + ------- + self : SimpleImputer + """ + X = self._validate_input(X) + + # default fill_value is 0 for numerical input and "missing_value" + # otherwise + if self.fill_value is None: + if X.dtype.kind in ("i", "u", "f"): + fill_value = 0 + else: + fill_value = "missing_value" + else: + fill_value = self.fill_value + + # fill_value should be numerical in case of numerical input + if (self.strategy == "constant" and + X.dtype.kind in ("i", "u", "f") and + not isinstance(fill_value, numbers.Real)): + raise ValueError("'fill_value'={0} is invalid. 
Expected a " + "numerical value when imputing numerical " + "data".format(fill_value)) + + if sparse.issparse(X): + # missing_values = 0 not allowed with sparse data as it would + # force densification + if self.missing_values == 0: + raise ValueError("Imputation not possible when missing_values " + "== 0 and input is sparse. Provide a dense " + "array instead.") + else: + self.statistics_ = self._sparse_fit(X, + self.strategy, + self.missing_values, + fill_value) + else: + self.statistics_ = self._dense_fit(X, + self.strategy, + self.missing_values, + fill_value) + + if self.add_indicator: + self.indicator_ = MissingIndicator( + missing_values=self.missing_values) + self.indicator_.fit(X) + else: + self.indicator_ = None + + return self + + def _sparse_fit(self, X, strategy, missing_values, fill_value): + """Fit the transformer on sparse data.""" + mask_data = _get_mask(X.data, missing_values) + n_implicit_zeros = X.shape[0] - np.diff(X.indptr) + + statistics = np.empty(X.shape[1]) + + if strategy == "constant": + # for constant strategy, self.statistcs_ is used to store + # fill_value in each column + statistics.fill(fill_value) + else: + for i in range(X.shape[1]): + column = X.data[X.indptr[i]:X.indptr[i + 1]] + mask_column = mask_data[X.indptr[i]:X.indptr[i + 1]] + column = column[~mask_column] + + # combine explicit and implicit zeros + mask_zeros = _get_mask(column, 0) + column = column[~mask_zeros] + n_explicit_zeros = mask_zeros.sum() + n_zeros = n_implicit_zeros[i] + n_explicit_zeros + + if strategy == "mean": + s = column.size + n_zeros + statistics[i] = np.nan if s == 0 else column.sum() / s + + elif strategy == "median": + statistics[i] = _get_median(column, + n_zeros) + + elif strategy == "most_frequent": + statistics[i] = _most_frequent(column, + 0, + n_zeros) + return statistics + + def _dense_fit(self, X, strategy, missing_values, fill_value): + """Fit the transformer on dense data.""" + mask = _get_mask(X, missing_values) + masked_X = ma.masked_array(X, mask=mask) + + # Mean + if strategy == "mean": + mean_masked = np.ma.mean(masked_X, axis=0) + # Avoid the warning "Warning: converting a masked element to nan." + mean = np.ma.getdata(mean_masked) + mean[np.ma.getmask(mean_masked)] = np.nan + + return mean + + # Median + elif strategy == "median": + median_masked = np.ma.median(masked_X, axis=0) + # Avoid the warning "Warning: converting a masked element to nan." + median = np.ma.getdata(median_masked) + median[np.ma.getmaskarray(median_masked)] = np.nan + + return median + + # Most frequent + elif strategy == "most_frequent": + # scipy.stats.mstats.mode cannot be used because it will no work + # properly if the first element is masked and if its frequency + # is equal to the frequency of the most frequent valid element + # See https://github.com/scipy/scipy/issues/2636 + + # To be able access the elements by columns + X = X.transpose() + mask = mask.transpose() + + if X.dtype.kind == "O": + most_frequent = np.empty(X.shape[0], dtype=object) + else: + most_frequent = np.empty(X.shape[0]) + + for i, (row, row_mask) in enumerate(zip(X[:], mask[:])): + row_mask = np.logical_not(row_mask).astype(np.bool) + row = row[row_mask] + most_frequent[i] = _most_frequent(row, np.nan, 0) + + return most_frequent + + # Constant + elif strategy == "constant": + # for constant strategy, self.statistcs_ is used to store + # fill_value in each column + return np.full(X.shape[1], fill_value, dtype=X.dtype) + + def transform(self, X): + """Impute all missing values in X. 
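# Illustration of why _sparse_fit above adds implicit zeros back in
# (illustrative data, not part of the patch): the "mean" statistic must count
# the zeros that the CSC matrix does not store explicitly.
import numpy as np
from scipy import sparse
from sklearn.impute import SimpleImputer

X = sparse.csc_matrix(np.array([[0.0, np.nan], [4.0, 2.0], [0.0, 4.0]]))
imp = SimpleImputer(missing_values=np.nan, strategy='mean').fit(X)
print(imp.statistics_)  # approx [1.33, 3.0]; column 0 averages over its two implicit zeros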
+ + Parameters + ---------- + X : {array-like, sparse matrix}, shape (n_samples, n_features) + The input data to complete. + """ + check_is_fitted(self, 'statistics_') + + X = self._validate_input(X) + + statistics = self.statistics_ + + if X.shape[1] != statistics.shape[0]: + raise ValueError("X has %d features per sample, expected %d" + % (X.shape[1], self.statistics_.shape[0])) + + if self.add_indicator: + X_trans_indicator = self.indicator_.transform(X) + + # Delete the invalid columns if strategy is not constant + if self.strategy == "constant": + valid_statistics = statistics + else: + # same as np.isnan but also works for object dtypes + invalid_mask = _get_mask(statistics, np.nan) + valid_mask = np.logical_not(invalid_mask) + valid_statistics = statistics[valid_mask] + valid_statistics_indexes = np.flatnonzero(valid_mask) + + if invalid_mask.any(): + missing = np.arange(X.shape[1])[invalid_mask] + if self.verbose: + warnings.warn("Deleting features without " + "observed values: %s" % missing) + X = X[:, valid_statistics_indexes] + + # Do actual imputation + if sparse.issparse(X): + if self.missing_values == 0: + raise ValueError("Imputation not possible when missing_values " + "== 0 and input is sparse. Provide a dense " + "array instead.") + else: + mask = _get_mask(X.data, self.missing_values) + indexes = np.repeat(np.arange(len(X.indptr) - 1, dtype=np.int), + np.diff(X.indptr))[mask] + + X.data[mask] = valid_statistics[indexes].astype(X.dtype, + copy=False) + else: + mask = _get_mask(X, self.missing_values) + n_missing = np.sum(mask, axis=0) + values = np.repeat(valid_statistics, n_missing) + coordinates = np.where(mask.transpose())[::-1] + + X[coordinates] = values + + if self.add_indicator: + hstack = sparse.hstack if sparse.issparse(X) else np.hstack + X = hstack((X, X_trans_indicator)) + + return X + + def _more_tags(self): + return {'allow_nan': True} + + +class MissingIndicator(BaseEstimator, TransformerMixin): + """Binary indicators for missing values. + + Note that this component typically should not be used in a vanilla + :class:`Pipeline` consisting of transformers and a classifier, but rather + could be added using a :class:`FeatureUnion` or :class:`ColumnTransformer`. + + Read more in the :ref:`User Guide `. + + Parameters + ---------- + missing_values : number, string, np.nan (default) or None + The placeholder for the missing values. All occurrences of + `missing_values` will be indicated (True in the output array), the + other values will be marked as False. + + features : str, optional + Whether the imputer mask should represent all or a subset of + features. + + - If "missing-only" (default), the imputer mask will only represent + features containing missing values during fit time. + - If "all", the imputer mask will represent all features. + + sparse : boolean or "auto", optional + Whether the imputer mask format should be sparse or dense. + + - If "auto" (default), the imputer mask will be of same type as + input. + - If True, the imputer mask will be a sparse matrix. + - If False, the imputer mask will be a numpy array. + + error_on_new : boolean, optional + If True (default), transform will raise an error when there are + features with missing values in transform that have no missing values + in fit. This is applicable only when ``features="missing-only"``. + + Attributes + ---------- + features_ : ndarray, shape (n_missing_features,) or (n_features,) + The features indices which will be returned when calling ``transform``. 
+ They are computed during ``fit``. For ``features='all'``, it is + to ``range(n_features)``. + + Examples + -------- + >>> import numpy as np + >>> from sklearn.impute import MissingIndicator + >>> X1 = np.array([[np.nan, 1, 3], + ... [4, 0, np.nan], + ... [8, 1, 0]]) + >>> X2 = np.array([[5, 1, np.nan], + ... [np.nan, 2, 3], + ... [2, 4, 0]]) + >>> indicator = MissingIndicator() + >>> indicator.fit(X1) # doctest: +NORMALIZE_WHITESPACE + MissingIndicator(error_on_new=True, features='missing-only', + missing_values=nan, sparse='auto') + >>> X2_tr = indicator.transform(X2) + >>> X2_tr + array([[False, True], + [ True, False], + [False, False]]) + + """ + + def __init__(self, missing_values=np.nan, features="missing-only", + sparse="auto", error_on_new=True): + self.missing_values = missing_values + self.features = features + self.sparse = sparse + self.error_on_new = error_on_new + + def _get_missing_features_info(self, X): + """Compute the imputer mask and the indices of the features + containing missing values. + + Parameters + ---------- + X : {ndarray or sparse matrix}, shape (n_samples, n_features) + The input data with missing values. Note that ``X`` has been + checked in ``fit`` and ``transform`` before to call this function. + + Returns + ------- + imputer_mask : {ndarray or sparse matrix}, shape \ +(n_samples, n_features) or (n_samples, n_features_with_missing) + The imputer mask of the original data. + + features_with_missing : ndarray, shape (n_features_with_missing) + The features containing missing values. + + """ + if sparse.issparse(X): + mask = _get_mask(X.data, self.missing_values) + + # The imputer mask will be constructed with the same sparse format + # as X. + sparse_constructor = (sparse.csr_matrix if X.format == 'csr' + else sparse.csc_matrix) + imputer_mask = sparse_constructor( + (mask, X.indices.copy(), X.indptr.copy()), + shape=X.shape, dtype=bool) + imputer_mask.eliminate_zeros() + + if self.features == 'missing-only': + n_missing = imputer_mask.getnnz(axis=0) + + if self.sparse is False: + imputer_mask = imputer_mask.toarray() + elif imputer_mask.format == 'csr': + imputer_mask = imputer_mask.tocsc() + else: + imputer_mask = _get_mask(X, self.missing_values) + + if self.features == 'missing-only': + n_missing = imputer_mask.sum(axis=0) + + if self.sparse is True: + imputer_mask = sparse.csc_matrix(imputer_mask) + + if self.features == 'all': + features_indices = np.arange(X.shape[1]) + else: + features_indices = np.flatnonzero(n_missing) + + return imputer_mask, features_indices + + def _validate_input(self, X): + if not is_scalar_nan(self.missing_values): + force_all_finite = True + else: + force_all_finite = "allow-nan" + X = check_array(X, accept_sparse=('csc', 'csr'), dtype=None, + force_all_finite=force_all_finite) + _check_inputs_dtype(X, self.missing_values) + if X.dtype.kind not in ("i", "u", "f", "O"): + raise ValueError("MissingIndicator does not support data with " + "dtype {0}. Please provide either a numeric array" + " (with a floating point or integer dtype) or " + "categorical data represented either as an array " + "with integer dtype or an array of string values " + "with an object dtype.".format(X.dtype)) + + if sparse.issparse(X) and self.missing_values == 0: + # missing_values = 0 not allowed with sparse data as it would + # force densification + raise ValueError("Sparse input with missing_values=0 is " + "not supported. Provide a dense " + "array instead.") + + return X + + def fit(self, X, y=None): + """Fit the transformer on X. 
+ + Parameters + ---------- + X : {array-like, sparse matrix}, shape (n_samples, n_features) + Input data, where ``n_samples`` is the number of samples and + ``n_features`` is the number of features. + + Returns + ------- + self : object + Returns self. + """ + X = self._validate_input(X) + self._n_features = X.shape[1] + + if self.features not in ('missing-only', 'all'): + raise ValueError("'features' has to be either 'missing-only' or " + "'all'. Got {} instead.".format(self.features)) + + if not ((isinstance(self.sparse, str) and + self.sparse == "auto") or isinstance(self.sparse, bool)): + raise ValueError("'sparse' has to be a boolean or 'auto'. " + "Got {!r} instead.".format(self.sparse)) + + self.features_ = self._get_missing_features_info(X)[1] + + return self + + def transform(self, X): + """Generate missing values indicator for X. + + Parameters + ---------- + X : {array-like, sparse matrix}, shape (n_samples, n_features) + The input data to complete. + + Returns + ------- + Xt : {ndarray or sparse matrix}, shape (n_samples, n_features) + The missing indicator for input data. The data type of ``Xt`` + will be boolean. + + """ + check_is_fitted(self, "features_") + X = self._validate_input(X) + + if X.shape[1] != self._n_features: + raise ValueError("X has a different number of features " + "than during fitting.") + + imputer_mask, features = self._get_missing_features_info(X) + + if self.features == "missing-only": + features_diff_fit_trans = np.setdiff1d(features, self.features_) + if (self.error_on_new and features_diff_fit_trans.size > 0): + raise ValueError("The features {} have missing values " + "in transform but have no missing values " + "in fit.".format(features_diff_fit_trans)) + + if self.features_.size < self._n_features: + imputer_mask = imputer_mask[:, self.features_] + + return imputer_mask + + def fit_transform(self, X, y=None): + """Generate missing values indicator for X. + + Parameters + ---------- + X : {array-like, sparse matrix}, shape (n_samples, n_features) + The input data to complete. + + Returns + ------- + Xt : {ndarray or sparse matrix}, shape (n_samples, n_features) + The missing indicator for input data. The data type of ``Xt`` + will be boolean. + + """ + return self.fit(X, y).transform(X) + + def _more_tags(self): + return {'allow_nan': True, + 'X_types': ['2darray', 'str']} diff --git a/sklearn/impute/_iterative.py b/sklearn/impute/_iterative.py new file mode 100644 index 0000000000000..40df3f4059c04 --- /dev/null +++ b/sklearn/impute/_iterative.py @@ -0,0 +1,680 @@ + +from time import time +from distutils.version import LooseVersion +from collections import namedtuple +import warnings + +import scipy +from scipy import stats +import numpy as np + +from ..base import clone, BaseEstimator, TransformerMixin +from ..exceptions import ConvergenceWarning +from ..preprocessing import normalize +from ..utils import check_array, check_random_state, safe_indexing +from ..utils.validation import FLOAT_DTYPES, check_is_fitted +from ..utils import is_scalar_nan + +from ._base import (_get_mask, MissingIndicator, SimpleImputer, + _check_inputs_dtype) + + +_ImputerTriplet = namedtuple('_ImputerTriplet', ['feat_idx', + 'neighbor_feat_idx', + 'estimator']) + + +class IterativeImputer(BaseEstimator, TransformerMixin): + """Multivariate imputer that estimates each feature from all the others. + + A strategy for imputing missing values by modeling each feature with + missing values as a function of other features in a round-robin fashion. 
+ + Read more in the :ref:`User Guide `. + + .. note:: + + This estimator is still **experimental** for now: the predictions + and the API might change without any deprecation cycle. To use it, + you need to explicitly import ``enable_iterative_imputer``:: + + >>> # explicitly require this experimental feature + >>> from sklearn.experimental import enable_iterative_imputer # noqa + >>> # now you can import normally from sklearn.impute + >>> from sklearn.impute import IterativeImputer + + Parameters + ---------- + estimator : estimator object, default=BayesianRidge() + The estimator to use at each step of the round-robin imputation. + If ``sample_posterior`` is True, the estimator must support + ``return_std`` in its ``predict`` method. + + missing_values : int, np.nan, optional (default=np.nan) + The placeholder for the missing values. All occurrences of + ``missing_values`` will be imputed. + + sample_posterior : boolean, default=False + Whether to sample from the (Gaussian) predictive posterior of the + fitted estimator for each imputation. Estimator must support + ``return_std`` in its ``predict`` method if set to ``True``. Set to + ``True`` if using ``IterativeImputer`` for multiple imputations. + + max_iter : int, optional (default=10) + Maximum number of imputation rounds to perform before returning the + imputations computed during the final round. A round is a single + imputation of each feature with missing values. The stopping criterion + is met once `abs(max(X_t - X_{t-1}))/abs(max(X[known_vals]))` < tol, + where `X_t` is `X` at iteration `t. Note that early stopping is only + applied if ``sample_posterior=False``. + + tol : float, optional (default=1e-3) + Tolerance of the stopping condition. + + n_nearest_features : int, optional (default=None) + Number of other features to use to estimate the missing values of + each feature column. Nearness between features is measured using + the absolute correlation coefficient between each feature pair (after + initial imputation). To ensure coverage of features throughout the + imputation process, the neighbor features are not necessarily nearest, + but are drawn with probability proportional to correlation for each + imputed target feature. Can provide significant speed-up when the + number of features is huge. If ``None``, all features will be used. + + initial_strategy : str, optional (default="mean") + Which strategy to use to initialize the missing values. Same as the + ``strategy`` parameter in :class:`sklearn.impute.SimpleImputer` + Valid values: {"mean", "median", "most_frequent", or "constant"}. + + imputation_order : str, optional (default="ascending") + The order in which the features will be imputed. Possible values: + + "ascending" + From features with fewest missing values to most. + "descending" + From features with most missing values to fewest. + "roman" + Left to right. + "arabic" + Right to left. + "random" + A random order for each round. + + min_value : float, optional (default=None) + Minimum possible imputed value. Default of ``None`` will set minimum + to negative infinity. + + max_value : float, optional (default=None) + Maximum possible imputed value. Default of ``None`` will set maximum + to positive infinity. + + verbose : int, optional (default=0) + Verbosity flag, controls the debug messages that are issued + as functions are evaluated. The higher, the more verbose. Can be 0, 1, + or 2. 
+ + random_state : int, RandomState instance or None, optional (default=None) + The seed of the pseudo random number generator to use. Randomizes + selection of estimator features if n_nearest_features is not None, the + ``imputation_order`` if ``random``, and the sampling from posterior if + ``sample_posterior`` is True. Use an integer for determinism. + See :term:`the Glossary `. + + add_indicator : boolean, optional (default=False) + If True, a `MissingIndicator` transform will stack onto output + of the imputer's transform. This allows a predictive estimator + to account for missingness despite imputation. If a feature has no + missing values at fit/train time, the feature won't appear on + the missing indicator even if there are missing values at + transform/test time. + + Attributes + ---------- + initial_imputer_ : object of type :class:`sklearn.impute.SimpleImputer` + Imputer used to initialize the missing values. + + imputation_sequence_ : list of tuples + Each tuple has ``(feat_idx, neighbor_feat_idx, estimator)``, where + ``feat_idx`` is the current feature to be imputed, + ``neighbor_feat_idx`` is the array of other features used to impute the + current feature, and ``estimator`` is the trained estimator used for + the imputation. Length is ``self.n_features_with_missing_ * + self.n_iter_``. + + n_iter_ : int + Number of iteration rounds that occurred. Will be less than + ``self.max_iter`` if early stopping criterion was reached. + + n_features_with_missing_ : int + Number of features with missing values. + + indicator_ : :class:`sklearn.impute.MissingIndicator` + Indicator used to add binary indicators for missing values. + ``None`` if add_indicator is False. + + See also + -------- + SimpleImputer : Univariate imputation of missing values. + + Notes + ----- + To support imputation in inductive mode we store each feature's estimator + during the ``fit`` phase, and predict without refitting (in order) during + the ``transform`` phase. + + Features which contain all missing values at ``fit`` are discarded upon + ``transform``. + + Features with missing values during ``transform`` which did not have any + missing values during ``fit`` will be imputed with the initial imputation + method only. + + References + ---------- + .. [1] `Stef van Buuren, Karin Groothuis-Oudshoorn (2011). "mice: + Multivariate Imputation by Chained Equations in R". Journal of + Statistical Software 45: 1-67. + `_ + + .. [2] `S. F. Buck, (1960). "A Method of Estimation of Missing Values in + Multivariate Data Suitable for use with an Electronic Computer". + Journal of the Royal Statistical Society 22(2): 302-306. 
+ `_ + """ + + def __init__(self, + estimator=None, + missing_values=np.nan, + sample_posterior=False, + max_iter=10, + tol=1e-3, + n_nearest_features=None, + initial_strategy="mean", + imputation_order='ascending', + min_value=None, + max_value=None, + verbose=0, + random_state=None, + add_indicator=False): + + self.estimator = estimator + self.missing_values = missing_values + self.sample_posterior = sample_posterior + self.max_iter = max_iter + self.tol = tol + self.n_nearest_features = n_nearest_features + self.initial_strategy = initial_strategy + self.imputation_order = imputation_order + self.min_value = min_value + self.max_value = max_value + self.verbose = verbose + self.random_state = random_state + self.add_indicator = add_indicator + + def _impute_one_feature(self, + X_filled, + mask_missing_values, + feat_idx, + neighbor_feat_idx, + estimator=None, + fit_mode=True): + """Impute a single feature from the others provided. + + This function predicts the missing values of one of the features using + the current estimates of all the other features. The ``estimator`` must + support ``return_std=True`` in its ``predict`` method for this function + to work. + + Parameters + ---------- + X_filled : ndarray + Input data with the most recent imputations. + + mask_missing_values : ndarray + Input data's missing indicator matrix. + + feat_idx : int + Index of the feature currently being imputed. + + neighbor_feat_idx : ndarray + Indices of the features to be used in imputing ``feat_idx``. + + estimator : object + The estimator to use at this step of the round-robin imputation. + If ``sample_posterior`` is True, the estimator must support + ``return_std`` in its ``predict`` method. + If None, it will be cloned from self._estimator. + + fit_mode : boolean, default=True + Whether to fit and predict with the estimator or just predict. + + Returns + ------- + X_filled : ndarray + Input data with ``X_filled[missing_row_mask, feat_idx]`` updated. + + estimator : estimator with sklearn API + The fitted estimator used to impute + ``X_filled[missing_row_mask, feat_idx]``. 
+ """ + + # if nothing is missing, just return the default + # (should not happen at fit time because feat_ids would be excluded) + missing_row_mask = mask_missing_values[:, feat_idx] + if not np.any(missing_row_mask): + return X_filled, estimator + + if estimator is None and fit_mode is False: + raise ValueError("If fit_mode is False, then an already-fitted " + "estimator should be passed in.") + + if estimator is None: + estimator = clone(self._estimator) + + if fit_mode: + X_train = safe_indexing(X_filled[:, neighbor_feat_idx], + ~missing_row_mask) + y_train = safe_indexing(X_filled[:, feat_idx], + ~missing_row_mask) + estimator.fit(X_train, y_train) + + # get posterior samples + X_test = safe_indexing(X_filled[:, neighbor_feat_idx], + missing_row_mask) + if self.sample_posterior: + mus, sigmas = estimator.predict(X_test, return_std=True) + imputed_values = np.zeros(mus.shape, dtype=X_filled.dtype) + # two types of problems: (1) non-positive sigmas, (2) mus outside + # legal range of min_value and max_value (results in inf sample) + positive_sigmas = sigmas > 0 + imputed_values[~positive_sigmas] = mus[~positive_sigmas] + mus_too_low = mus < self._min_value + imputed_values[mus_too_low] = self._min_value + mus_too_high = mus > self._max_value + imputed_values[mus_too_high] = self._max_value + # the rest can be sampled without statistical issues + inrange_mask = positive_sigmas & ~mus_too_low & ~mus_too_high + mus = mus[inrange_mask] + sigmas = sigmas[inrange_mask] + a = (self._min_value - mus) / sigmas + b = (self._max_value - mus) / sigmas + + if scipy.__version__ < LooseVersion('0.18'): + # bug with vector-valued `a` in old scipy + imputed_values[inrange_mask] = [ + stats.truncnorm(a=a_, b=b_, + loc=loc_, scale=scale_).rvs( + random_state=self.random_state_) + for a_, b_, loc_, scale_ + in zip(a, b, mus, sigmas)] + else: + truncated_normal = stats.truncnorm(a=a, b=b, + loc=mus, scale=sigmas) + imputed_values[inrange_mask] = truncated_normal.rvs( + random_state=self.random_state_) + else: + imputed_values = estimator.predict(X_test) + imputed_values = np.clip(imputed_values, + self._min_value, + self._max_value) + + # update the feature + X_filled[missing_row_mask, feat_idx] = imputed_values + return X_filled, estimator + + def _get_neighbor_feat_idx(self, + n_features, + feat_idx, + abs_corr_mat): + """Get a list of other features to predict ``feat_idx``. + + If self.n_nearest_features is less than or equal to the total + number of features, then use a probability proportional to the absolute + correlation between ``feat_idx`` and each other feature to randomly + choose a subsample of the other features (without replacement). + + Parameters + ---------- + n_features : int + Number of features in ``X``. + + feat_idx : int + Index of the feature currently being imputed. + + abs_corr_mat : ndarray, shape (n_features, n_features) + Absolute correlation matrix of ``X``. The diagonal has been zeroed + out and each feature has been normalized to sum to 1. Can be None. + + Returns + ------- + neighbor_feat_idx : array-like + The features to use to impute ``feat_idx``. 
+ """ + if (self.n_nearest_features is not None and + self.n_nearest_features < n_features): + p = abs_corr_mat[:, feat_idx] + neighbor_feat_idx = self.random_state_.choice( + np.arange(n_features), self.n_nearest_features, replace=False, + p=p) + else: + inds_left = np.arange(feat_idx) + inds_right = np.arange(feat_idx + 1, n_features) + neighbor_feat_idx = np.concatenate((inds_left, inds_right)) + return neighbor_feat_idx + + def _get_ordered_idx(self, mask_missing_values): + """Decide in what order we will update the features. + + As a homage to the MICE R package, we will have 4 main options of + how to order the updates, and use a random order if anything else + is specified. + + Also, this function skips features which have no missing values. + + Parameters + ---------- + mask_missing_values : array-like, shape (n_samples, n_features) + Input data's missing indicator matrix, where "n_samples" is the + number of samples and "n_features" is the number of features. + + Returns + ------- + ordered_idx : ndarray, shape (n_features,) + The order in which to impute the features. + """ + frac_of_missing_values = mask_missing_values.mean(axis=0) + missing_values_idx = np.nonzero(frac_of_missing_values)[0] + if self.imputation_order == 'roman': + ordered_idx = missing_values_idx + elif self.imputation_order == 'arabic': + ordered_idx = missing_values_idx[::-1] + elif self.imputation_order == 'ascending': + n = len(frac_of_missing_values) - len(missing_values_idx) + ordered_idx = np.argsort(frac_of_missing_values, + kind='mergesort')[n:][::-1] + elif self.imputation_order == 'descending': + n = len(frac_of_missing_values) - len(missing_values_idx) + ordered_idx = np.argsort(frac_of_missing_values, + kind='mergesort')[n:] + elif self.imputation_order == 'random': + ordered_idx = missing_values_idx + self.random_state_.shuffle(ordered_idx) + else: + raise ValueError("Got an invalid imputation order: '{0}'. It must " + "be one of the following: 'roman', 'arabic', " + "'ascending', 'descending', or " + "'random'.".format(self.imputation_order)) + return ordered_idx + + def _get_abs_corr_mat(self, X_filled, tolerance=1e-6): + """Get absolute correlation matrix between features. + + Parameters + ---------- + X_filled : ndarray, shape (n_samples, n_features) + Input data with the most recent imputations. + + tolerance : float, optional (default=1e-6) + ``abs_corr_mat`` can have nans, which will be replaced + with ``tolerance``. + + Returns + ------- + abs_corr_mat : ndarray, shape (n_features, n_features) + Absolute correlation matrix of ``X`` at the beginning of the + current round. The diagonal has been zeroed out and each feature's + absolute correlations with all others have been normalized to sum + to 1. + """ + n_features = X_filled.shape[1] + if (self.n_nearest_features is None or + self.n_nearest_features >= n_features): + return None + abs_corr_mat = np.abs(np.corrcoef(X_filled.T)) + # np.corrcoef is not defined for features with zero std + abs_corr_mat[np.isnan(abs_corr_mat)] = tolerance + # ensures exploration, i.e. at least some probability of sampling + np.clip(abs_corr_mat, tolerance, None, out=abs_corr_mat) + # features are not their own neighbors + np.fill_diagonal(abs_corr_mat, 0) + # needs to sum to 1 for np.random.choice sampling + abs_corr_mat = normalize(abs_corr_mat, norm='l1', axis=0, copy=False) + return abs_corr_mat + + def _initial_imputation(self, X): + """Perform initial imputation for input X. 
+ + Parameters + ---------- + X : ndarray, shape (n_samples, n_features) + Input data, where "n_samples" is the number of samples and + "n_features" is the number of features. + + Returns + ------- + Xt : ndarray, shape (n_samples, n_features) + Input data, where "n_samples" is the number of samples and + "n_features" is the number of features. + + X_filled : ndarray, shape (n_samples, n_features) + Input data with the most recent imputations. + + mask_missing_values : ndarray, shape (n_samples, n_features) + Input data's missing indicator matrix, where "n_samples" is the + number of samples and "n_features" is the number of features. + """ + if is_scalar_nan(self.missing_values): + force_all_finite = "allow-nan" + else: + force_all_finite = True + + X = check_array(X, dtype=FLOAT_DTYPES, order="F", + force_all_finite=force_all_finite) + _check_inputs_dtype(X, self.missing_values) + + mask_missing_values = _get_mask(X, self.missing_values) + if self.initial_imputer_ is None: + self.initial_imputer_ = SimpleImputer( + missing_values=self.missing_values, + strategy=self.initial_strategy) + X_filled = self.initial_imputer_.fit_transform(X) + else: + X_filled = self.initial_imputer_.transform(X) + + valid_mask = np.flatnonzero(np.logical_not( + np.isnan(self.initial_imputer_.statistics_))) + Xt = X[:, valid_mask] + mask_missing_values = mask_missing_values[:, valid_mask] + + return Xt, X_filled, mask_missing_values + + def fit_transform(self, X, y=None): + """Fits the imputer on X and return the transformed X. + + Parameters + ---------- + X : array-like, shape (n_samples, n_features) + Input data, where "n_samples" is the number of samples and + "n_features" is the number of features. + + y : ignored. + + Returns + ------- + Xt : array-like, shape (n_samples, n_features) + The imputed input data. + """ + self.random_state_ = getattr(self, "random_state_", + check_random_state(self.random_state)) + + if self.max_iter < 0: + raise ValueError( + "'max_iter' should be a positive integer. Got {} instead." + .format(self.max_iter)) + + if self.tol < 0: + raise ValueError( + "'tol' should be a non-negative float. Got {} instead." + .format(self.tol) + ) + + if self.add_indicator: + self.indicator_ = MissingIndicator( + missing_values=self.missing_values) + X_trans_indicator = self.indicator_.fit_transform(X) + else: + self.indicator_ = None + + if self.estimator is None: + from ..linear_model import BayesianRidge + self._estimator = BayesianRidge() + else: + self._estimator = clone(self.estimator) + + self.imputation_sequence_ = [] + + if hasattr(self._estimator, 'random_state'): + self._estimator.random_state = self.random_state_ + + self._min_value = -np.inf if self.min_value is None else self.min_value + self._max_value = np.inf if self.max_value is None else self.max_value + + self.initial_imputer_ = None + X, Xt, mask_missing_values = self._initial_imputation(X) + + if self.max_iter == 0 or np.all(mask_missing_values): + self.n_iter_ = 0 + return Xt + + # order in which to impute + # note this is probably too slow for large feature data (d > 100000) + # and a better way would be good. 
+ # see: https://goo.gl/KyCNwj and subsequent comments + ordered_idx = self._get_ordered_idx(mask_missing_values) + self.n_features_with_missing_ = len(ordered_idx) + + abs_corr_mat = self._get_abs_corr_mat(Xt) + + n_samples, n_features = Xt.shape + if self.verbose > 0: + print("[IterativeImputer] Completing matrix with shape %s" + % (X.shape,)) + start_t = time() + if not self.sample_posterior: + Xt_previous = Xt.copy() + normalized_tol = self.tol * np.max(np.abs(X[~mask_missing_values])) + for self.n_iter_ in range(1, self.max_iter + 1): + if self.imputation_order == 'random': + ordered_idx = self._get_ordered_idx(mask_missing_values) + + for feat_idx in ordered_idx: + neighbor_feat_idx = self._get_neighbor_feat_idx(n_features, + feat_idx, + abs_corr_mat) + Xt, estimator = self._impute_one_feature( + Xt, mask_missing_values, feat_idx, neighbor_feat_idx, + estimator=None, fit_mode=True) + estimator_triplet = _ImputerTriplet(feat_idx, + neighbor_feat_idx, + estimator) + self.imputation_sequence_.append(estimator_triplet) + + if self.verbose > 1: + print('[IterativeImputer] Ending imputation round ' + '%d/%d, elapsed time %0.2f' + % (self.n_iter_, self.max_iter, time() - start_t)) + + if not self.sample_posterior: + inf_norm = np.linalg.norm(Xt - Xt_previous, ord=np.inf, + axis=None) + if inf_norm < normalized_tol: + if self.verbose > 0: + print('[IterativeImputer] Early stopping criterion ' + 'reached.') + break + Xt_previous = Xt.copy() + else: + if not self.sample_posterior: + warnings.warn("[IterativeImputer] Early stopping criterion not" + " reached.", ConvergenceWarning) + Xt[~mask_missing_values] = X[~mask_missing_values] + + if self.add_indicator: + Xt = np.hstack((Xt, X_trans_indicator)) + return Xt + + def transform(self, X): + """Imputes all missing values in X. + + Note that this is stochastic, and that if random_state is not fixed, + repeated calls, or permuted input, will yield different results. + + Parameters + ---------- + X : array-like, shape = [n_samples, n_features] + The input data to complete. + + Returns + ------- + Xt : array-like, shape (n_samples, n_features) + The imputed input data. + """ + check_is_fitted(self, 'initial_imputer_') + + if self.add_indicator: + X_trans_indicator = self.indicator_.transform(X) + + X, Xt, mask_missing_values = self._initial_imputation(X) + + if self.n_iter_ == 0 or np.all(mask_missing_values): + return Xt + + imputations_per_round = len(self.imputation_sequence_) // self.n_iter_ + i_rnd = 0 + if self.verbose > 0: + print("[IterativeImputer] Completing matrix with shape %s" + % (X.shape,)) + start_t = time() + for it, estimator_triplet in enumerate(self.imputation_sequence_): + Xt, _ = self._impute_one_feature( + Xt, + mask_missing_values, + estimator_triplet.feat_idx, + estimator_triplet.neighbor_feat_idx, + estimator=estimator_triplet.estimator, + fit_mode=False + ) + if not (it + 1) % imputations_per_round: + if self.verbose > 1: + print('[IterativeImputer] Ending imputation round ' + '%d/%d, elapsed time %0.2f' + % (i_rnd + 1, self.n_iter_, time() - start_t)) + i_rnd += 1 + + Xt[~mask_missing_values] = X[~mask_missing_values] + + if self.add_indicator: + Xt = np.hstack((Xt, X_trans_indicator)) + return Xt + + def fit(self, X, y=None): + """Fits the imputer on X and return self. + + Parameters + ---------- + X : array-like, shape (n_samples, n_features) + Input data, where "n_samples" is the number of samples and + "n_features" is the number of features. + + y : ignored + + Returns + ------- + self : object + Returns self. 
+ """ + self.fit_transform(X) + return self + + def _more_tags(self): + return {'allow_nan': True} diff --git a/sklearn/impute/tests/__init__.py b/sklearn/impute/tests/__init__.py new file mode 100644 index 0000000000000..e69de29bb2d1d diff --git a/sklearn/tests/test_impute.py b/sklearn/impute/tests/test_impute.py similarity index 99% rename from sklearn/tests/test_impute.py rename to sklearn/impute/tests/test_impute.py index 979140ba246cf..1552031ff2193 100644 --- a/sklearn/tests/test_impute.py +++ b/sklearn/impute/tests/test_impute.py @@ -13,6 +13,9 @@ from sklearn.utils.testing import assert_array_equal from sklearn.utils.testing import assert_array_almost_equal +# make IterativeImputer available +from sklearn.experimental import enable_iterative_imputer # noqa + from sklearn.impute import MissingIndicator from sklearn.impute import SimpleImputer, IterativeImputer from sklearn.dummy import DummyRegressor diff --git a/sklearn/inspection/partial_dependence.py b/sklearn/inspection/partial_dependence.py index 8798fb459ec74..3191dcd7a1352 100644 --- a/sklearn/inspection/partial_dependence.py +++ b/sklearn/inspection/partial_dependence.py @@ -22,7 +22,6 @@ from ..tree._tree import DTYPE from ..exceptions import NotFittedError from ..ensemble.gradient_boosting import BaseGradientBoosting -from ..ensemble._gradient_boosting import _partial_dependence_tree __all__ = ['partial_dependence', 'plot_partial_dependence'] @@ -105,14 +104,14 @@ def _partial_dependence_recursion(est, grid, features): grid = np.asarray(grid, dtype=DTYPE, order='C') n_estimators, n_trees_per_stage = est.estimators_.shape - learning_rate = est.learning_rate averaged_predictions = np.zeros((n_trees_per_stage, grid.shape[0]), dtype=np.float64, order='C') for stage in range(n_estimators): for k in range(n_trees_per_stage): tree = est.estimators_[stage, k].tree_ - _partial_dependence_tree(tree, grid, features, - learning_rate, averaged_predictions[k]) + tree.compute_partial_dependence(grid, features, + averaged_predictions[k]) + averaged_predictions *= est.learning_rate return averaged_predictions @@ -356,7 +355,7 @@ def partial_dependence(estimator, X, features, response_method='auto', features) # reshape averaged_predictions to - # (n_outputs, n_values_feature_0, # n_values_feature_1, ...) + # (n_outputs, n_values_feature_0, n_values_feature_1, ...) averaged_predictions = averaged_predictions.reshape( -1, *[val.shape[0] for val in values]) diff --git a/sklearn/inspection/tests/test_partial_dependence.py b/sklearn/inspection/tests/test_partial_dependence.py index d2d3c7818e448..b90b76c4220f3 100644 --- a/sklearn/inspection/tests/test_partial_dependence.py +++ b/sklearn/inspection/tests/test_partial_dependence.py @@ -27,7 +27,6 @@ from sklearn.base import BaseEstimator, ClassifierMixin from sklearn.utils.testing import assert_allclose from sklearn.utils.testing import assert_array_equal -from sklearn.utils.testing import if_matplotlib # toy sample @@ -396,11 +395,8 @@ def test_partial_dependence_sample_weight(): assert np.corrcoef(pdp, values)[0, 1] > 0.99 -@if_matplotlib -def test_plot_partial_dependence(): +def test_plot_partial_dependence(pyplot): # Test partial dependence plot function. 
- import matplotlib.pyplot as plt # noqa - boston = load_boston() clf = GradientBoostingRegressor(n_estimators=10, random_state=1) clf.fit(boston.data, boston.target) @@ -409,7 +405,7 @@ def test_plot_partial_dependence(): plot_partial_dependence(clf, boston.data, [0, 1, (0, 1)], grid_resolution=grid_resolution, feature_names=boston.feature_names) - fig = plt.gcf() + fig = pyplot.gcf() axs = fig.get_axes() assert len(axs) == 3 assert all(ax.has_data for ax in axs) @@ -420,7 +416,7 @@ def test_plot_partial_dependence(): grid_resolution=grid_resolution, feature_names=boston.feature_names) - fig = plt.gcf() + fig = pyplot.gcf() axs = fig.get_axes() assert len(axs) == 3 assert all(ax.has_data for ax in axs) @@ -431,18 +427,14 @@ def test_plot_partial_dependence(): ('CRIM', 'ZN')], grid_resolution=grid_resolution, feature_names=feature_names) - fig = plt.gcf() + fig = pyplot.gcf() axs = fig.get_axes() assert len(axs) == 3 assert all(ax.has_data for ax in axs) - plt.close('all') - -@if_matplotlib -def test_plot_partial_dependence_multiclass(): +def test_plot_partial_dependence_multiclass(pyplot): # Test partial dependence plot function on multi-class input. - import matplotlib.pyplot as plt # noqa iris = load_iris() clf = GradientBoostingClassifier(n_estimators=10, random_state=1) clf.fit(iris.data, iris.target) @@ -451,7 +443,7 @@ def test_plot_partial_dependence_multiclass(): plot_partial_dependence(clf, iris.data, [0, 1], target=0, grid_resolution=grid_resolution) - fig = plt.gcf() + fig = pyplot.gcf() axs = fig.get_axes() assert len(axs) == 2 assert all(ax.has_data for ax in axs) @@ -465,18 +457,14 @@ def test_plot_partial_dependence_multiclass(): plot_partial_dependence(clf, iris.data, [0, 1], target='setosa', grid_resolution=grid_resolution) - fig = plt.gcf() + fig = pyplot.gcf() axs = fig.get_axes() assert len(axs) == 2 assert all(ax.has_data for ax in axs) - plt.close('all') - -@if_matplotlib -def test_plot_partial_dependence_multioutput(): +def test_plot_partial_dependence_multioutput(pyplot): # Test partial dependence plot function on multi-output input. 
- import matplotlib.pyplot as plt # noqa (X, y), _ = multioutput_regression_data clf = LinearRegression() clf.fit(X, y) @@ -485,7 +473,7 @@ def test_plot_partial_dependence_multioutput(): plot_partial_dependence(clf, X, [0, 1], target=0, grid_resolution=grid_resolution) - fig = plt.gcf() + fig = pyplot.gcf() axs = fig.get_axes() assert len(axs) == 2 assert all(ax.has_data for ax in axs) @@ -493,15 +481,12 @@ def test_plot_partial_dependence_multioutput(): plot_partial_dependence(clf, X, [0, 1], target=1, grid_resolution=grid_resolution) - fig = plt.gcf() + fig = pyplot.gcf() axs = fig.get_axes() assert len(axs) == 2 assert all(ax.has_data for ax in axs) - plt.close('all') - -@if_matplotlib @pytest.mark.parametrize( "data, params, err_msg", [(multioutput_regression_data[0], {"target": None, 'features': [0]}, @@ -531,32 +516,23 @@ def test_plot_partial_dependence_multioutput(): ) @pytest.mark.filterwarnings('ignore:Default solver will be changed ') # 0.22 @pytest.mark.filterwarnings('ignore:Default multi_class will be') # 0.22 -def test_plot_partial_dependence_error(data, params, err_msg): - import matplotlib.pyplot as plt # noqa +def test_plot_partial_dependence_error(pyplot, data, params, err_msg): X, y = data estimator = LinearRegression().fit(X, y) with pytest.raises(ValueError, match=err_msg): plot_partial_dependence(estimator, X, **params) - plt.close() - -@if_matplotlib -def test_plot_partial_dependence_fig(): +def test_plot_partial_dependence_fig(pyplot): # Make sure fig object is correctly used if not None - - import matplotlib.pyplot as plt - (X, y), _ = regression_data clf = LinearRegression() clf.fit(X, y) - fig = plt.figure() + fig = pyplot.figure() grid_resolution = 25 plot_partial_dependence( clf, X, [0, 1], target=0, grid_resolution=grid_resolution, fig=fig) - assert plt.gcf() is fig - - plt.close() + assert pyplot.gcf() is fig diff --git a/sklearn/isotonic.py b/sklearn/isotonic.py index 3b8f74a946699..7cff336715322 100644 --- a/sklearn/isotonic.py +++ b/sklearn/isotonic.py @@ -209,6 +209,15 @@ class IsotonicRegression(BaseEstimator, TransformerMixin, RegressorMixin): Correctness of Kruskal's algorithms for monotone regression with ties Leeuw, Psychometrica, 1977 + + Examples + -------- + >>> from sklearn.datasets import make_regression + >>> from sklearn.isotonic import IsotonicRegression + >>> X, y = make_regression(n_samples=10, n_features=1, random_state=41) + >>> iso_reg = IsotonicRegression().fit(X.flatten(), y) + >>> iso_reg.predict([.1, .2]) # doctest: +ELLIPSIS + array([1.8628..., 3.7256...]) """ def __init__(self, y_min=None, y_max=None, increasing=True, out_of_bounds='nan'): diff --git a/sklearn/kernel_ridge.py b/sklearn/kernel_ridge.py index aeb5fd45f413f..17a5247d5ab20 100644 --- a/sklearn/kernel_ridge.py +++ b/sklearn/kernel_ridge.py @@ -27,7 +27,7 @@ class KernelRidge(BaseEstimator, RegressorMixin, MultiOutputMixin): squared error loss while support vector regression uses epsilon-insensitive loss, both combined with l2 regularization. In contrast to SVR, fitting a KRR model can be done in closed-form and is typically faster for - medium-sized datasets. On the other hand, the learned model is non-sparse + medium-sized datasets. On the other hand, the learned model is non-sparse and thus slower than SVR, which learns a sparse model for epsilon > 0, at prediction-time. 
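The partial dependence tests above now receive a ``pyplot`` pytest fixture instead of using the ``@if_matplotlib`` decorator, importing ``matplotlib.pyplot`` by hand and calling ``plt.close('all')`` at the end of each test. The fixture itself is not part of this diff; a minimal sketch of what such a fixture could look like, assuming it lives in a ``conftest.py`` (the project's actual fixture may differ in its details), is:

    import pytest

    @pytest.fixture
    def pyplot():
        # Sketch only: skip the test when matplotlib is unavailable and make
        # sure all figures are closed afterwards, replacing the explicit
        # plt.close('all') calls removed in the test changes above.
        pyplot = pytest.importorskip("matplotlib.pyplot")
        yield pyplot
        pyplot.close("all")

Any test that takes ``pyplot`` as an argument is then skipped automatically when matplotlib is missing, and figure cleanup happens once in the fixture rather than in every test body.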
diff --git a/sklearn/linear_model/cd_fast.pyx b/sklearn/linear_model/cd_fast.pyx index ad0fa4277f3be..fcbe46ce77711 100644 --- a/sklearn/linear_model/cd_fast.pyx +++ b/sklearn/linear_model/cd_fast.pyx @@ -24,7 +24,7 @@ from ..utils._cython_blas cimport (_axpy, _dot, _asum, _ger, _gemv, _nrm2, from ..utils._cython_blas cimport RowMajor, ColMajor, Trans, NoTrans -from ..utils cimport _random +from ..utils._random cimport our_rand_r ctypedef np.float64_t DOUBLE ctypedef np.uint32_t UINT32_t @@ -42,7 +42,7 @@ cdef enum: cdef inline UINT32_t rand_int(UINT32_t end, UINT32_t* random_state) nogil: """Generate a random integer in [0; end).""" - return _random.our_rand_r(random_state) % end + return our_rand_r(random_state) % end cdef inline floating fmax(floating x, floating y) nogil: diff --git a/sklearn/linear_model/coordinate_descent.py b/sklearn/linear_model/coordinate_descent.py index ceccafd706101..b14188bff50c1 100644 --- a/sklearn/linear_model/coordinate_descent.py +++ b/sklearn/linear_model/coordinate_descent.py @@ -224,7 +224,7 @@ def lasso_path(X, y, eps=1e-3, n_alphas=100, alphas=None, values output by lars_path Examples - --------- + -------- Comparing lasso_path and lars_path with interpolation: @@ -661,7 +661,7 @@ def fit(self, X, y, check_input=True): """Fit model with coordinate descent. Parameters - ----------- + ---------- X : ndarray or scipy.sparse matrix, (n_samples, n_features) Data @@ -1747,7 +1747,7 @@ def fit(self, X, y): """Fit MultiTaskElasticNet model with coordinate descent Parameters - ----------- + ---------- X : ndarray, shape (n_samples, n_features) Data y : ndarray, shape (n_samples, n_tasks) diff --git a/sklearn/linear_model/least_angle.py b/sklearn/linear_model/least_angle.py index 5df45535db462..594fdb3676adb 100644 --- a/sklearn/linear_model/least_angle.py +++ b/sklearn/linear_model/least_angle.py @@ -42,7 +42,7 @@ def lars_path(X, y, Xy=None, Gram=None, max_iter=500, alpha_min=0, Read more in the :ref:`User Guide `. Parameters - ----------- + ---------- X : None or array, shape (n_samples, n_features) Input data. Note that if X is None then the Gram matrix must be specified, i.e., cannot be None or False. @@ -112,7 +112,7 @@ def lars_path(X, y, Xy=None, Gram=None, max_iter=500, alpha_min=0, solution of the coordinate descent lasso_path function. Returns - -------- + ------- alphas : array, shape (n_alphas + 1,) Maximum of covariances (in absolute value) at each iteration. ``n_alphas`` is either ``max_iter``, ``n_features`` or the @@ -179,7 +179,7 @@ def lars_path_gram(Xy, Gram, n_samples, max_iter=500, alpha_min=0, Read more in the :ref:`User Guide `. Parameters - ----------- + ---------- Xy : array-like, shape (n_samples,) or (n_samples, n_targets) Xy = np.dot(X.T, y). @@ -231,7 +231,7 @@ def lars_path_gram(Xy, Gram, n_samples, max_iter=500, alpha_min=0, solution of the coordinate descent lasso_path function. Returns - -------- + ------- alphas : array, shape (n_alphas + 1,) Maximum of covariances (in absolute value) at each iteration. ``n_alphas`` is either ``max_iter``, ``n_features`` or the @@ -295,7 +295,7 @@ def _lars_path_solver(X, y, Xy=None, Gram=None, n_samples=None, max_iter=500, Read more in the :ref:`User Guide `. Parameters - ----------- + ---------- X : None or ndarray, shape (n_samples, n_features) Input data. Note that if X is None then Gram must be specified, i.e., cannot be None or False. @@ -358,7 +358,7 @@ def _lars_path_solver(X, y, Xy=None, Gram=None, n_samples=None, max_iter=500, solution of the coordinate descent lasso_path function. 
Returns - -------- + ------- alphas : array, shape (n_alphas + 1,) Maximum of covariances (in absolute value) at each iteration. ``n_alphas`` is either ``max_iter``, ``n_features`` or the @@ -1128,7 +1128,7 @@ def _lars_path_residues(X_train, y_train, X_test, y_test, Gram=None, """Compute the residues on left-out data for a full LARS path Parameters - ----------- + ---------- X_train : array, shape (n_samples, n_features) The data to fit the LARS on @@ -1189,7 +1189,7 @@ def _lars_path_residues(X_train, y_train, X_test, y_test, Gram=None, Returns - -------- + ------- alphas : array, shape (n_alphas,) Maximum of covariances (in absolute value) at each iteration. ``n_alphas`` is either ``max_iter`` or ``n_features``, whichever diff --git a/sklearn/linear_model/omp.py b/sklearn/linear_model/omp.py index d9ee49cd37698..38be6ddd37540 100644 --- a/sklearn/linear_model/omp.py +++ b/sklearn/linear_model/omp.py @@ -681,7 +681,7 @@ def _omp_path_residues(X_train, y_train, X_test, y_test, copy=True, """Compute the residues on left-out data for a full LARS path Parameters - ----------- + ---------- X_train : array, shape (n_samples, n_features) The data to fit the LARS on diff --git a/sklearn/linear_model/ridge.py b/sklearn/linear_model/ridge.py index 2a491bd3ef515..0e54126e52c33 100644 --- a/sklearn/linear_model/ridge.py +++ b/sklearn/linear_model/ridge.py @@ -31,6 +31,7 @@ from ..model_selection import GridSearchCV from ..metrics.scorer import check_scoring from ..exceptions import ConvergenceWarning +from ..utils.sparsefuncs import mean_variance_axis def _solve_sparse_cg(X, y, alpha, max_iter=None, tol=1e-3, verbose=0, @@ -226,9 +227,17 @@ def _solve_svd(X, y, alpha): return np.dot(Vt.T, d_UT_y).T +def _get_valid_accept_sparse(is_X_sparse, solver): + if is_X_sparse and solver in ['auto', 'sag', 'saga']: + return 'csr' + else: + return ['csr', 'csc', 'coo'] + + def ridge_regression(X, y, alpha, sample_weight=None, solver='auto', max_iter=None, tol=1e-3, verbose=0, random_state=None, - return_n_iter=False, return_intercept=False): + return_n_iter=False, return_intercept=False, + check_input=True): """Solve the ridge equation by the method of normal equations. Read more in the :ref:`User Guide `. @@ -332,6 +341,11 @@ def ridge_regression(X, y, alpha, sample_weight=None, solver='auto', .. versionadded:: 0.17 + check_input : boolean, default True + If False, the input arrays X and y will not be checked. + + .. versionadded:: 0.21 + Returns ------- coef : array, shape = [n_features] or [n_targets, n_features] @@ -360,13 +374,14 @@ def ridge_regression(X, y, alpha, sample_weight=None, solver='auto', return_n_iter=return_n_iter, return_intercept=return_intercept, X_scale=None, - X_offset=None) + X_offset=None, + check_input=check_input) def _ridge_regression(X, y, alpha, sample_weight=None, solver='auto', max_iter=None, tol=1e-3, verbose=0, random_state=None, return_n_iter=False, return_intercept=False, - X_scale=None, X_offset=None): + X_scale=None, X_offset=None, check_input=True): has_sw = sample_weight is not None @@ -388,17 +403,12 @@ def _ridge_regression(X, y, alpha, sample_weight=None, solver='auto', "intercept. 
Please change solver to 'sag' or set " "return_intercept=False.") - _dtype = [np.float64, np.float32] - - # SAG needs X and y columns to be C-contiguous and np.float64 - if solver in ['sag', 'saga']: - X = check_array(X, accept_sparse=['csr'], - dtype=np.float64, order='C') - y = check_array(y, dtype=np.float64, ensure_2d=False, order='F') - else: - X = check_array(X, accept_sparse=['csr', 'csc', 'coo'], - dtype=_dtype) - y = check_array(y, dtype=X.dtype, ensure_2d=False) + if check_input: + _dtype = [np.float64, np.float32] + _accept_sparse = _get_valid_accept_sparse(sparse.issparse(X), solver) + X = check_array(X, accept_sparse=_accept_sparse, dtype=_dtype, + order="C") + y = check_array(y, dtype=X.dtype, ensure_2d=False, order="C") check_consistent_length(X, y) n_samples, n_features = X.shape @@ -417,8 +427,6 @@ def _ridge_regression(X, y, alpha, sample_weight=None, solver='auto', raise ValueError("Number of samples in X and y does not correspond:" " %d != %d" % (n_samples, n_samples_)) - - if has_sw: if np.atleast_1d(sample_weight).ndim > 1: raise ValueError("Sample weights must be 1D array or scalar") @@ -438,7 +446,6 @@ def _ridge_regression(X, y, alpha, sample_weight=None, solver='auto', if alpha.size == 1 and n_targets > 1: alpha = np.repeat(alpha, n_targets) - n_iter = None if solver == 'sparse_cg': coef = _solve_sparse_cg(X, y, alpha, @@ -461,7 +468,6 @@ def _ridge_regression(X, y, alpha, sample_weight=None, solver='auto', except linalg.LinAlgError: # use SVD solver if matrix is singular solver = 'svd' - else: try: coef = _solve_cholesky(X, y, alpha) @@ -473,11 +479,12 @@ def _ridge_regression(X, y, alpha, sample_weight=None, solver='auto', # precompute max_squared_sum for all targets max_squared_sum = row_norms(X, squared=True).max() - coef = np.empty((y.shape[1], n_features)) + coef = np.empty((y.shape[1], n_features), dtype=X.dtype) n_iter = np.empty(y.shape[1], dtype=np.int32) - intercept = np.zeros((y.shape[1], )) + intercept = np.zeros((y.shape[1], ), dtype=X.dtype) for i, (alpha_i, target) in enumerate(zip(alpha, y.T)): - init = {'coef': np.zeros((n_features + int(return_intercept), 1))} + init = {'coef': np.zeros((n_features + int(return_intercept), 1), + dtype=X.dtype)} coef_, n_iter_, _ = sag_solver( X, target.ravel(), sample_weight, 'squared', alpha_i, 0, max_iter, tol, verbose, random_state, False, max_squared_sum, @@ -530,13 +537,13 @@ def __init__(self, alpha=1.0, fit_intercept=True, normalize=False, def fit(self, X, y, sample_weight=None): - if self.solver in ('sag', 'saga'): - _dtype = np.float64 - else: - # all other solvers work at both float precision levels - _dtype = [np.float64, np.float32] - - X, y = check_X_y(X, y, ['csr', 'csc', 'coo'], dtype=_dtype, + # all other solvers work at both float precision levels + _dtype = [np.float64, np.float32] + _accept_sparse = _get_valid_accept_sparse(sparse.issparse(X), + self.solver) + X, y = check_X_y(X, y, + accept_sparse=_accept_sparse, + dtype=_dtype, multi_output=True, y_numeric=True) if ((sample_weight is not None) and @@ -555,7 +562,7 @@ def fit(self, X, y, sample_weight=None): X, y, alpha=self.alpha, sample_weight=sample_weight, max_iter=self.max_iter, tol=self.tol, solver=self.solver, random_state=self.random_state, return_n_iter=True, - return_intercept=True) + return_intercept=True, check_input=False) # add the offset which was subtracted by _preprocess_data self.intercept_ += y_offset else: @@ -570,8 +577,7 @@ def fit(self, X, y, sample_weight=None): X, y, alpha=self.alpha, sample_weight=sample_weight, 
max_iter=self.max_iter, tol=self.tol, solver=self.solver, random_state=self.random_state, return_n_iter=True, - return_intercept=False, **params) - + return_intercept=False, check_input=False, **params) self._set_intercept(X_offset, y_offset, X_scale) return self @@ -893,8 +899,9 @@ def fit(self, X, y, sample_weight=None): ------- self : returns an instance of self. """ - check_X_y(X, y, accept_sparse=['csr', 'csc', 'coo'], - multi_output=True) + _accept_sparse = _get_valid_accept_sparse(sparse.issparse(X), + self.solver) + check_X_y(X, y, accept_sparse=_accept_sparse, multi_output=True) self._label_binarizer = LabelBinarizer(pos_label=1, neg_label=-1) Y = self._label_binarizer.fit_transform(y) @@ -921,6 +928,106 @@ def classes_(self): return self._label_binarizer.classes_ +def _check_gcv_mode(X, gcv_mode): + possible_gcv_modes = [None, 'auto', 'svd', 'eigen'] + if gcv_mode not in possible_gcv_modes: + raise ValueError( + "Unknown value for 'gcv_mode'. " + "Got {} instead of one of {}" .format( + gcv_mode, possible_gcv_modes)) + if gcv_mode in ['eigen', 'svd']: + return gcv_mode + # if X has more rows than columns, use decomposition of X^T.X, + # otherwise X.X^T + if X.shape[0] > X.shape[1]: + return 'svd' + return 'eigen' + + +def _find_smallest_angle(query, vectors): + """Find the column of vectors that is most aligned with the query. + + Both query and the columns of vectors must have their l2 norm equal to 1. + + Parameters + ---------- + query : ndarray, shape (n_samples,) + Normalized query vector. + + vectors : ndarray, shape (n_samples, n_features) + Vectors to which we compare query, as columns. Must be normalized. + """ + abs_cosine = np.abs(query.dot(vectors)) + index = np.argmax(abs_cosine) + return index + + +class _X_operator(sparse.linalg.LinearOperator): + """Behaves as centered and scaled X with an added intercept column. + + This operator behaves as + np.hstack([X - sqrt_sw[:, None] * X_mean, sqrt_sw[:, None]]) + """ + + def __init__(self, X, X_mean, sqrt_sw): + n_samples, n_features = X.shape + super().__init__(X.dtype, (n_samples, n_features + 1)) + self.X = X + self.X_mean = X_mean + self.sqrt_sw = sqrt_sw + + def _matvec(self, v): + v = v.ravel() + return safe_sparse_dot( + self.X, v[:-1], dense_output=True + ) - self.sqrt_sw * self.X_mean.dot(v[:-1]) + v[-1] * self.sqrt_sw + + def _matmat(self, v): + return ( + safe_sparse_dot(self.X, v[:-1], dense_output=True) - + self.sqrt_sw[:, None] * self.X_mean.dot(v[:-1]) + v[-1] * + self.sqrt_sw[:, None]) + + def _transpose(self): + return _Xt_operator(self.X, self.X_mean, self.sqrt_sw) + + +class _Xt_operator(sparse.linalg.LinearOperator): + """Behaves as transposed centered and scaled X with an intercept column. 
+ + This operator behaves as + np.hstack([X - sqrt_sw[:, None] * X_mean, sqrt_sw[:, None]]).T + """ + + def __init__(self, X, X_mean, sqrt_sw): + n_samples, n_features = X.shape + super().__init__(X.dtype, (n_features + 1, n_samples)) + self.X = X + self.X_mean = X_mean + self.sqrt_sw = sqrt_sw + + def _matvec(self, v): + v = v.ravel() + n_features = self.shape[0] + res = np.empty(n_features, dtype=self.X.dtype) + res[:-1] = ( + safe_sparse_dot(self.X.T, v, dense_output=True) - + (self.X_mean * self.sqrt_sw.dot(v)) + ) + res[-1] = np.dot(v, self.sqrt_sw) + return res + + def _matmat(self, v): + n_features = self.shape[0] + res = np.empty((n_features, v.shape[1]), dtype=self.X.dtype) + res[:-1] = ( + safe_sparse_dot(self.X.T, v, dense_output=True) - + self.X_mean[:, None] * self.sqrt_sw.dot(v) + ) + res[-1] = np.dot(self.sqrt_sw, v) + return res + + class _RidgeGCV(LinearModel): """Ridge regression with built-in Generalized Cross-Validation @@ -972,18 +1079,6 @@ def __init__(self, alphas=(0.1, 1.0, 10.0), self.gcv_mode = gcv_mode self.store_cv_values = store_cv_values - def _pre_compute(self, X, y, centered_kernel=True): - # even if X is very sparse, K is usually very dense - K = safe_sparse_dot(X, X.T, dense_output=True) - # the following emulates an additional constant regressor - # corresponding to fit_intercept=True - # but this is done only when the features have been centered - if centered_kernel: - K += np.ones_like(K) - v, Q = linalg.eigh(K) - QT_y = np.dot(Q.T, y) - return v, Q, QT_y - def _decomp_diag(self, v_prime, Q): # compute diagonal of the matrix: dot(Q, dot(diag(v_prime), Q^T)) return (v_prime * Q ** 2).sum(axis=-1) @@ -995,18 +1090,161 @@ def _diag_dot(self, D, B): D = D[(slice(None), ) + (np.newaxis, ) * (len(B.shape) - 1)] return D * B - def _errors_and_values_helper(self, alpha, y, v, Q, QT_y): - """Helper function to avoid code duplication between self._errors and - self._values. + def _compute_gram(self, X, sqrt_sw): + """Computes the Gram matrix with possible centering. - Notes - ----- - We don't construct matrix G, instead compute action on y & diagonal. + If ``center`` is ``True``, compute + (X - X.mean(axis=0)).dot((X - X.mean(axis=0)).T) + else X.dot(X.T) + + Parameters + ---------- + X : {array-like, sparse matrix}, shape (n_samples, n_features) + The input uncentered data. + + sqrt_sw : ndarray, shape (n_samples,) + square roots of sample weights + + center : bool, default is True + Whether or not to remove the mean from ``X``. + + Returns + ------- + gram : ndarray, shape (n_samples, n_samples) + The Gram matrix. + X_mean : ndarray, shape (n_feature,) + The mean of ``X`` for each feature. + """ + center = self.fit_intercept and sparse.issparse(X) + if not center: + # in this case centering has been done in preprocessing + # or we are not fitting an intercept. 
+ X_mean = np.zeros(X.shape[1], dtype=X.dtype) + return safe_sparse_dot(X, X.T, dense_output=True), X_mean + # otherwise X is always sparse + n_samples = X.shape[0] + sample_weight_matrix = sparse.dia_matrix( + (sqrt_sw, 0), shape=(n_samples, n_samples)) + X_weighted = sample_weight_matrix.dot(X) + X_mean, _ = mean_variance_axis(X_weighted, axis=0) + X_mean *= n_samples / sqrt_sw.dot(sqrt_sw) + X_mX = sqrt_sw[:, None] * safe_sparse_dot( + X_mean, X.T, dense_output=True) + X_mX_m = np.outer(sqrt_sw, sqrt_sw) * np.dot(X_mean, X_mean) + return (safe_sparse_dot(X, X.T, dense_output=True) + X_mX_m + - X_mX - X_mX.T, X_mean) + + def _compute_covariance(self, X, sqrt_sw): + """Computes centered covariance matrix. + + If ``center`` is ``True``, compute + (X - X.mean(axis=0)).T.dot(X - X.mean(axis=0)) + else + X.T.dot(X) + + Parameters + ---------- + X : sparse matrix, shape (n_samples, n_features) + The input uncentered data. + + sqrt_sw : ndarray, shape (n_samples,) + square roots of sample weights + + center : bool, default is True + Whether or not to remove the mean from ``X``. + + Returns + ------- + covariance : ndarray, shape (n_features, n_features) + The covariance matrix. + X_mean : ndarray, shape (n_feature,) + The mean of ``X`` for each feature. + """ + if not self.fit_intercept: + # in this case centering has been done in preprocessing + # or we are not fitting an intercept. + X_mean = np.zeros(X.shape[1], dtype=X.dtype) + return safe_sparse_dot(X.T, X, dense_output=True), X_mean + # this function only gets called for sparse X + n_samples = X.shape[0] + sample_weight_matrix = sparse.dia_matrix( + (sqrt_sw, 0), shape=(n_samples, n_samples)) + X_weighted = sample_weight_matrix.dot(X) + X_mean, _ = mean_variance_axis(X_weighted, axis=0) + X_mean = X_mean * n_samples / sqrt_sw.dot(sqrt_sw) + weight_sum = sqrt_sw.dot(sqrt_sw) + return (safe_sparse_dot(X.T, X, dense_output=True) - + weight_sum * np.outer(X_mean, X_mean), + X_mean) + + def _sparse_multidot_diag(self, X, A, X_mean, sqrt_sw): + """Compute the diagonal of (X - X_mean).dot(A).dot((X - X_mean).T) + without explicitely centering X nor computing X.dot(A) + when X is sparse. + + Parameters + ---------- + X : sparse matrix, shape = (n_samples, n_features) + + A : np.ndarray, shape = (n_features, n_features) + + X_mean : np.ndarray, shape = (n_features,) + + sqrt_sw : np.ndarray, shape = (n_features,) + square roots of sample weights + + Returns + ------- + diag : np.ndarray, shape = (n_samples,) + The computed diagonal. + """ + intercept_col = sqrt_sw + scale = sqrt_sw + batch_size = X.shape[1] + diag = np.empty(X.shape[0], dtype=X.dtype) + for start in range(0, X.shape[0], batch_size): + batch = slice(start, min(X.shape[0], start + batch_size), 1) + X_batch = np.empty( + (X[batch].shape[0], X.shape[1] + self.fit_intercept), + dtype=X.dtype + ) + if self.fit_intercept: + X_batch[:, :-1] = X[batch].A - X_mean * scale[batch][:, None] + X_batch[:, -1] = intercept_col[batch] + else: + X_batch = X[batch].A + diag[batch] = (X_batch.dot(A) * X_batch).sum(axis=1) + return diag + + def _eigen_decompose_gram(self, X, y, sqrt_sw): + """Eigendecomposition of X.X^T, used when n_samples <= n_features""" + # if X is dense it has already been centered in preprocessing + K, X_mean = self._compute_gram(X, sqrt_sw) + if self.fit_intercept: + # to emulate centering X with sample weights, + # ie removing the weighted average, we add a column + # containing the square roots of the sample weights. 
+ # by centering, it is orthogonal to the other columns + K += np.outer(sqrt_sw, sqrt_sw) + v, Q = linalg.eigh(K) + QT_y = np.dot(Q.T, y) + return X_mean, v, Q, QT_y + + def _solve_eigen_gram(self, alpha, y, sqrt_sw, X_mean, v, Q, QT_y): + """Compute dual coefficients and diagonal of (Identity - Hat_matrix) + + Used when we have a decomposition of X.X^T (n_features >= n_samples). """ w = 1. / (v + alpha) - constant_column = np.var(Q, 0) < 1.e-12 - # detect constant columns - w[constant_column] = 0 # cancel the regularization for the intercept + if self.fit_intercept: + # the vector containing the square roots of the sample weights (1 + # when no sample weights) is the eigenvector of XX^T which + # corresponds to the intercept; we cancel the regularization on + # this dimension. the corresponding eigenvalue is + # sum(sample_weight). + normalized_sw = sqrt_sw / np.linalg.norm(sqrt_sw) + intercept_dim = _find_smallest_angle(normalized_sw, Q) + w[intercept_dim] = 0 # cancel regularization for the intercept c = np.dot(Q, self._diag_dot(w, QT_y)) G_diag = self._decomp_diag(w, Q) @@ -1015,35 +1253,117 @@ def _errors_and_values_helper(self, alpha, y, v, Q, QT_y): G_diag = G_diag[:, np.newaxis] return G_diag, c - def _errors(self, alpha, y, v, Q, QT_y): - G_diag, c = self._errors_and_values_helper(alpha, y, v, Q, QT_y) - return (c / G_diag) ** 2, c + def _eigen_decompose_covariance(self, X, y, sqrt_sw): + """Eigendecomposition of X^T.X, used when n_samples > n_features.""" + n_samples, n_features = X.shape + cov = np.empty((n_features + 1, n_features + 1), dtype=X.dtype) + cov[:-1, :-1], X_mean = self._compute_covariance(X, sqrt_sw) + if not self.fit_intercept: + cov = cov[:-1, :-1] + # to emulate centering X with sample weights, + # ie removing the weighted average, we add a column + # containing the square roots of the sample weights. + # by centering, it is orthogonal to the other columns + # when all samples have the same weight we add a column of 1 + else: + cov[-1] = 0 + cov[:, -1] = 0 + cov[-1, -1] = sqrt_sw.dot(sqrt_sw) + nullspace_dim = max(0, X.shape[1] - X.shape[0]) + s, V = linalg.eigh(cov) + # remove eigenvalues and vectors in the null space of X^T.X + s = s[nullspace_dim:] + V = V[:, nullspace_dim:] + return X_mean, s, V, X + + def _solve_eigen_covariance_no_intercept( + self, alpha, y, sqrt_sw, X_mean, s, V, X): + """Compute dual coefficients and diagonal of (Identity - Hat_matrix) + + Used when we have a decomposition of X^T.X + (n_features < n_samples and X is sparse), and not fitting an intercept. 
+ """ + w = 1 / (s + alpha) + A = (V * w).dot(V.T) + AXy = A.dot(safe_sparse_dot(X.T, y, dense_output=True)) + y_hat = safe_sparse_dot(X, AXy, dense_output=True) + hat_diag = self._sparse_multidot_diag(X, A, X_mean, sqrt_sw) + if len(y.shape) != 1: + # handle case where y is 2-d + hat_diag = hat_diag[:, np.newaxis] + return (1 - hat_diag) / alpha, (y - y_hat) / alpha - def _values(self, alpha, y, v, Q, QT_y): - G_diag, c = self._errors_and_values_helper(alpha, y, v, Q, QT_y) - return y - (c / G_diag), c + def _solve_eigen_covariance_intercept( + self, alpha, y, sqrt_sw, X_mean, s, V, X): + """Compute dual coefficients and diagonal of (Identity - Hat_matrix) - def _pre_compute_svd(self, X, y, centered_kernel=True): - if sparse.issparse(X): - raise TypeError("SVD not supported for sparse matrices") - if centered_kernel: - X = np.hstack((X, np.ones((X.shape[0], 1)))) - # to emulate fit_intercept=True situation, add a column on ones - # Note that by centering, the other columns are orthogonal to that one + Used when we have a decomposition of X^T.X + (n_features < n_samples and X is sparse), + and we are fitting an intercept. + """ + # the vector [0, 0, ..., 0, 1] + # is the eigenvector of X^TX which + # corresponds to the intercept; we cancel the regularization on + # this dimension. the corresponding eigenvalue is + # sum(sample_weight), e.g. n when uniform sample weights. + intercept_sv = np.zeros(V.shape[0]) + intercept_sv[-1] = 1 + intercept_dim = _find_smallest_angle(intercept_sv, V) + w = 1 / (s + alpha) + w[intercept_dim] = 1 / s[intercept_dim] + A = (V * w).dot(V.T) + # add a column to X containing the square roots of sample weights + X_op = _X_operator(X, X_mean, sqrt_sw) + AXy = A.dot(X_op.T.dot(y)) + y_hat = X_op.dot(AXy) + hat_diag = self._sparse_multidot_diag(X, A, X_mean, sqrt_sw) + # return (1 - hat_diag), (y - y_hat) + if len(y.shape) != 1: + # handle case where y is 2-d + hat_diag = hat_diag[:, np.newaxis] + return (1 - hat_diag) / alpha, (y - y_hat) / alpha + + def _solve_eigen_covariance( + self, alpha, y, sqrt_sw, X_mean, s, V, X): + """Compute dual coefficients and diagonal of (Identity - Hat_matrix) + + Used when we have a decomposition of X^T.X + (n_features < n_samples and X is sparse). + """ + if self.fit_intercept: + return self._solve_eigen_covariance_intercept( + alpha, y, sqrt_sw, X_mean, s, V, X) + return self._solve_eigen_covariance_no_intercept( + alpha, y, sqrt_sw, X_mean, s, V, X) + + def _svd_decompose_design_matrix(self, X, y, sqrt_sw): + # X already centered + X_mean = np.zeros(X.shape[1], dtype=X.dtype) + if self.fit_intercept: + # to emulate fit_intercept=True situation, add a column + # containing the square roots of the sample weights + # by centering, the other columns are orthogonal to that one + intercept_column = sqrt_sw[:, None] + X = np.hstack((X, intercept_column)) U, s, _ = linalg.svd(X, full_matrices=0) v = s ** 2 UT_y = np.dot(U.T, y) - return v, U, UT_y + return X_mean, v, U, UT_y - def _errors_and_values_svd_helper(self, alpha, y, v, U, UT_y): - """Helper function to avoid code duplication between self._errors_svd - and self._values_svd. + def _solve_svd_design_matrix( + self, alpha, y, sqrt_sw, X_mean, v, U, UT_y): + """Compute dual coefficients and diagonal of (Identity - Hat_matrix) + + Used when we have an SVD decomposition of X + (n_features >= n_samples and X is dense). 
""" - constant_column = np.var(U, 0) < 1.e-12 - # detect columns colinear to ones w = ((v + alpha) ** -1) - (alpha ** -1) - w[constant_column] = - (alpha ** -1) - # cancel the regularization for the intercept + if self.fit_intercept: + # detect intercept column + normalized_sw = sqrt_sw / np.linalg.norm(sqrt_sw) + intercept_dim = _find_smallest_angle(normalized_sw, U) + # cancel the regularization for the intercept + w[intercept_dim] = - (alpha ** -1) c = np.dot(U, self._diag_dot(w, UT_y)) + (alpha ** -1) * y G_diag = self._decomp_diag(w, U) + (alpha ** -1) if len(y.shape) != 1: @@ -1051,24 +1371,16 @@ def _errors_and_values_svd_helper(self, alpha, y, v, U, UT_y): G_diag = G_diag[:, np.newaxis] return G_diag, c - def _errors_svd(self, alpha, y, v, U, UT_y): - G_diag, c = self._errors_and_values_svd_helper(alpha, y, v, U, UT_y) - return (c / G_diag) ** 2, c - - def _values_svd(self, alpha, y, v, U, UT_y): - G_diag, c = self._errors_and_values_svd_helper(alpha, y, v, U, UT_y) - return y - (c / G_diag), c - def fit(self, X, y, sample_weight=None): """Fit Ridge regression model Parameters ---------- X : {array-like, sparse matrix}, shape = [n_samples, n_features] - Training data + Training data. Will be cast to float64 if necessary y : array-like, shape = [n_samples] or [n_samples, n_targets] - Target values. Will be cast to X's dtype if necessary + Target values. Will be cast to float64 if necessary sample_weight : float or array-like of shape [n_samples] Sample weight @@ -1077,66 +1389,60 @@ def fit(self, X, y, sample_weight=None): ------- self : object """ - X, y = check_X_y(X, y, ['csr', 'csc', 'coo'], dtype=np.float64, + X, y = check_X_y(X, y, ['csr', 'csc', 'coo'], + dtype=[np.float64], multi_output=True, y_numeric=True) + + if np.any(self.alphas <= 0): + raise ValueError( + "alphas must be positive. 
Got {} containing some " + "negative or null value instead.".format(self.alphas)) + if sample_weight is not None and not isinstance(sample_weight, float): - sample_weight = check_array(sample_weight, ensure_2d=False) + sample_weight = check_array(sample_weight, ensure_2d=False, + dtype=X.dtype) n_samples, n_features = X.shape X, y, X_offset, y_offset, X_scale = LinearModel._preprocess_data( X, y, self.fit_intercept, self.normalize, self.copy_X, sample_weight=sample_weight) - gcv_mode = self.gcv_mode - with_sw = len(np.shape(sample_weight)) - - if gcv_mode is None or gcv_mode == 'auto': - if sparse.issparse(X) or n_features > n_samples or with_sw: - gcv_mode = 'eigen' - else: - gcv_mode = 'svd' - elif gcv_mode == "svd" and with_sw: - # FIXME non-uniform sample weights not yet supported - warnings.warn("non-uniform sample weights unsupported for svd, " - "forcing usage of eigen") - gcv_mode = 'eigen' + gcv_mode = _check_gcv_mode(X, self.gcv_mode) if gcv_mode == 'eigen': - _pre_compute = self._pre_compute - _errors = self._errors - _values = self._values + decompose = self._eigen_decompose_gram + solve = self._solve_eigen_gram elif gcv_mode == 'svd': - # assert n_samples >= n_features - _pre_compute = self._pre_compute_svd - _errors = self._errors_svd - _values = self._values_svd - else: - raise ValueError('bad gcv_mode "%s"' % gcv_mode) + if sparse.issparse(X): + decompose = self._eigen_decompose_covariance + solve = self._solve_eigen_covariance + else: + decompose = self._svd_decompose_design_matrix + solve = self._solve_svd_design_matrix if sample_weight is not None: X, y = _rescale_data(X, y, sample_weight) - - centered_kernel = not sparse.issparse(X) and self.fit_intercept - - v, Q, QT_y = _pre_compute(X, y, centered_kernel) - n_y = 1 if len(y.shape) == 1 else y.shape[1] - cv_values = np.zeros((n_samples * n_y, len(self.alphas))) - C = [] + sqrt_sw = np.sqrt(sample_weight) + else: + sqrt_sw = np.ones(X.shape[0], dtype=X.dtype) scorer = check_scoring(self, scoring=self.scoring, allow_none=True) error = scorer is None - if np.any(self.alphas < 0): - raise ValueError("alphas cannot be negative. " - "Got {} containing some " - "negative value instead.".format(self.alphas)) - + n_y = 1 if len(y.shape) == 1 else y.shape[1] + cv_values = np.zeros((n_samples * n_y, len(self.alphas)), + dtype=X.dtype) + C = [] + X_mean, *decomposition = decompose(X, y, sqrt_sw) for i, alpha in enumerate(self.alphas): + G_diag, c = solve( + float(alpha), y, sqrt_sw, X_mean, *decomposition) if error: - out, c = _errors(float(alpha), y, v, Q, QT_y) + squared_errors = (c / G_diag) ** 2 + cv_values[:, i] = squared_errors.ravel() else: - out, c = _values(float(alpha), y, v, Q, QT_y) - cv_values[:, i] = out.ravel() + predictions = y - (c / G_diag) + cv_values[:, i] = predictions.ravel() C.append(c) if error: @@ -1158,6 +1464,7 @@ def identity_estimator(): self.dual_coef_ = C[best] self.coef_ = safe_sparse_dot(self.dual_coef_.T, X) + X_offset += X_mean * X_scale self._set_intercept(X_offset, y_offset, X_scale) if self.store_cv_values: @@ -1189,7 +1496,8 @@ def fit(self, X, y, sample_weight=None): Parameters ---------- X : array-like, shape = [n_samples, n_features] - Training data + Training data. If using GCV, will be cast to float64 + if necessary. y : array-like, shape = [n_samples] or [n_samples, n_targets] Target values. 
Will be cast to X's dtype if necessary @@ -1200,8 +1508,17 @@ def fit(self, X, y, sample_weight=None): Returns ------- self : object + + Notes + ----- + When sample_weight is provided, the selected hyperparameter may depend + on whether we use generalized cross-validation (cv=None or cv='auto') + or another form of cross-validation, because only generalized + cross-validation takes the sample weights into account when computing + the validation score. """ - if self.cv is None: + cv = self.cv + if cv is None: estimator = _RidgeGCV(self.alphas, fit_intercept=self.fit_intercept, normalize=self.normalize, @@ -1217,9 +1534,11 @@ def fit(self, X, y, sample_weight=None): raise ValueError("cv!=None and store_cv_values=True " " are incompatible") parameters = {'alpha': self.alphas} + solver = 'sparse_cg' if sparse.issparse(X) else 'auto' gs = GridSearchCV(Ridge(fit_intercept=self.fit_intercept, - normalize=self.normalize), - parameters, cv=self.cv, scoring=self.scoring) + normalize=self.normalize, + solver=solver), + parameters, cv=cv, scoring=self.scoring) gs.fit(X, y, sample_weight=sample_weight) estimator = gs.best_estimator_ self.alpha_ = gs.best_estimator_.alpha @@ -1249,6 +1568,7 @@ class RidgeCV(_BaseRidgeCV, RegressorMixin): the estimates. Larger values specify stronger regularization. Alpha corresponds to ``C^-1`` in other linear models such as LogisticRegression or LinearSVC. + If using generalized cross-validation, alphas must be positive. fit_intercept : boolean Whether to calculate the intercept for this model. If set @@ -1267,12 +1587,15 @@ class RidgeCV(_BaseRidgeCV, RegressorMixin): A string (see model evaluation documentation) or a scorer callable object / function with signature ``scorer(estimator, X, y)``. + If None, the negative mean squared error if cv is 'auto' or None + (i.e. when using generalized cross-validation), and r2 score otherwise. cv : int, cross-validation generator or an iterable, optional Determines the cross-validation splitting strategy. Possible inputs for cv are: - None, to use the efficient Leave-One-Out cross-validation + (also known as Generalized Cross-Validation). - integer, to specify the number of folds. - :term:`CV splitter`, - An iterable yielding (train, test) splits as arrays of indices. @@ -1288,15 +1611,13 @@ class RidgeCV(_BaseRidgeCV, RegressorMixin): Flag indicating which strategy to use when performing Generalized Cross-Validation. Options are:: - 'auto' : use svd if n_samples > n_features or when X is a sparse - matrix, otherwise use eigen - 'svd' : force computation via singular value decomposition of X - (does not work for sparse matrices) - 'eigen' : force computation via eigendecomposition of X^T X + 'auto' : use 'svd' if n_samples > n_features, otherwise use 'eigen' + 'svd' : force use of singular value decomposition of X when X is + dense, eigenvalue decomposition of X^T.X when X is sparse. + 'eigen' : force computation via eigendecomposition of X.X^T The 'auto' mode is the default and is intended to pick the cheaper - option of the two depending upon the shape and format of the training - data. + option of the two depending on the shape of the training data. store_cv_values : boolean, default=False Flag indicating if the cross-validation values corresponding to @@ -1463,7 +1784,8 @@ def fit(self, X, y, sample_weight=None): ---------- X : array-like, shape (n_samples, n_features) Training vectors, where n_samples is the number of samples - and n_features is the number of features. + and n_features is the number of features. 
When using GCV, + will be cast to float64 if necessary. y : array-like, shape (n_samples,) Target values. Will be cast to X's dtype if necessary diff --git a/sklearn/linear_model/tests/test_ridge.py b/sklearn/linear_model/tests/test_ridge.py index 7bfb617d4beff..fa7f0606b1010 100644 --- a/sklearn/linear_model/tests/test_ridge.py +++ b/sklearn/linear_model/tests/test_ridge.py @@ -6,8 +6,8 @@ import pytest from sklearn.utils.testing import assert_almost_equal -from sklearn.utils.testing import assert_array_almost_equal from sklearn.utils.testing import assert_allclose +from sklearn.utils.testing import assert_array_almost_equal from sklearn.utils.testing import assert_equal from sklearn.utils.testing import assert_array_equal from sklearn.utils.testing import assert_greater @@ -33,10 +33,12 @@ from sklearn.linear_model.ridge import RidgeClassifierCV from sklearn.linear_model.ridge import _solve_cholesky from sklearn.linear_model.ridge import _solve_cholesky_kernel +from sklearn.linear_model.ridge import _check_gcv_mode +from sklearn.linear_model.ridge import _X_operator from sklearn.datasets import make_regression from sklearn.model_selection import GridSearchCV -from sklearn.model_selection import KFold +from sklearn.model_selection import KFold, GroupKFold, cross_val_predict from sklearn.utils import check_random_state from sklearn.datasets import make_multilabel_classification @@ -311,6 +313,213 @@ def test_ridge_individual_penalties(): assert_raises(ValueError, ridge.fit, X, y) +@pytest.mark.parametrize('n_col', [(), (1,), (3,)]) +def test_x_operator(n_col): + rng = np.random.RandomState(0) + X = rng.randn(11, 8) + X_m = rng.randn(8) + sqrt_sw = rng.randn(len(X)) + Y = rng.randn(11, *n_col) + A = rng.randn(9, *n_col) + operator = _X_operator(sp.csr_matrix(X), X_m, sqrt_sw) + reference_operator = np.hstack( + [X - sqrt_sw[:, None] * X_m, sqrt_sw[:, None]]) + assert_allclose(reference_operator.dot(A), operator.dot(A)) + assert_allclose(reference_operator.T.dot(Y), operator.T.dot(Y)) + + +@pytest.mark.parametrize('shape', [(10, 1), (13, 9), (3, 7), (2, 2), (20, 20)]) +@pytest.mark.parametrize('uniform_weights', [True, False]) +def test_compute_gram(shape, uniform_weights): + rng = np.random.RandomState(0) + X = rng.randn(*shape) + if uniform_weights: + sw = np.ones(X.shape[0]) + else: + sw = rng.chisquare(1, shape[0]) + sqrt_sw = np.sqrt(sw) + X_mean = np.average(X, axis=0, weights=sw) + X_centered = (X - X_mean) * sqrt_sw[:, None] + true_gram = X_centered.dot(X_centered.T) + X_sparse = sp.csr_matrix(X * sqrt_sw[:, None]) + gcv = _RidgeGCV(fit_intercept=True) + computed_gram, computed_mean = gcv._compute_gram(X_sparse, sqrt_sw) + assert_allclose(X_mean, computed_mean) + assert_allclose(true_gram, computed_gram) + + +@pytest.mark.parametrize('shape', [(10, 1), (13, 9), (3, 7), (2, 2), (20, 20)]) +@pytest.mark.parametrize('uniform_weights', [True, False]) +def test_compute_covariance(shape, uniform_weights): + rng = np.random.RandomState(0) + X = rng.randn(*shape) + if uniform_weights: + sw = np.ones(X.shape[0]) + else: + sw = rng.chisquare(1, shape[0]) + sqrt_sw = np.sqrt(sw) + X_mean = np.average(X, axis=0, weights=sw) + X_centered = (X - X_mean) * sqrt_sw[:, None] + true_covariance = X_centered.T.dot(X_centered) + X_sparse = sp.csr_matrix(X * sqrt_sw[:, None]) + gcv = _RidgeGCV(fit_intercept=True) + computed_cov, computed_mean = gcv._compute_covariance(X_sparse, sqrt_sw) + assert_allclose(X_mean, computed_mean) + assert_allclose(true_covariance, computed_cov) + + +def 
_make_sparse_offset_regression( + n_samples=100, n_features=100, proportion_nonzero=.5, + n_informative=10, n_targets=1, bias=13., X_offset=30., + noise=30., shuffle=True, coef=False, random_state=None): + X, y, c = make_regression( + n_samples=n_samples, n_features=n_features, + n_informative=n_informative, n_targets=n_targets, bias=bias, + noise=noise, shuffle=shuffle, + coef=True, random_state=random_state) + if n_features == 1: + c = np.asarray([c]) + X += X_offset + mask = np.random.RandomState(random_state).binomial( + 1, proportion_nonzero, X.shape) > 0 + removed_X = X.copy() + X[~mask] = 0. + removed_X[mask] = 0. + y -= removed_X.dot(c) + if n_features == 1: + c = c[0] + if coef: + return X, y, c + return X, y + + +@pytest.mark.parametrize('gcv_mode', ['svd', 'eigen']) +@pytest.mark.parametrize('X_constructor', [np.asarray, sp.csr_matrix]) +@pytest.mark.parametrize('X_shape', [(11, 8), (11, 20)]) +@pytest.mark.parametrize('fit_intercept', [True, False]) +@pytest.mark.parametrize( + 'y_shape, normalize, noise', + [ + ((11,), True, 1.), + ((11, 1), False, 30.), + ((11, 3), False, 150.), + ] +) +def test_ridge_gcv_vs_ridge_loo_cv( + gcv_mode, X_constructor, X_shape, y_shape, + fit_intercept, normalize, noise): + n_samples, n_features = X_shape + n_targets = y_shape[-1] if len(y_shape) == 2 else 1 + X, y = _make_sparse_offset_regression( + n_samples=n_samples, n_features=n_features, n_targets=n_targets, + random_state=0, shuffle=False, noise=noise, n_informative=5 + ) + y = y.reshape(y_shape) + + alphas = [1e-3, .1, 1., 10., 1e3] + loo_ridge = RidgeCV(cv=n_samples, fit_intercept=fit_intercept, + alphas=alphas, scoring='neg_mean_squared_error', + normalize=normalize) + gcv_ridge = RidgeCV(gcv_mode=gcv_mode, fit_intercept=fit_intercept, + alphas=alphas, normalize=normalize) + + loo_ridge.fit(X, y) + + X_gcv = X_constructor(X) + gcv_ridge.fit(X_gcv, y) + + assert gcv_ridge.alpha_ == pytest.approx(loo_ridge.alpha_) + assert_allclose(gcv_ridge.coef_, loo_ridge.coef_, rtol=1e-3) + assert_allclose(gcv_ridge.intercept_, loo_ridge.intercept_, rtol=1e-3) + + +@pytest.mark.parametrize('gcv_mode', ['svd', 'eigen']) +@pytest.mark.parametrize('X_constructor', [np.asarray, sp.csr_matrix]) +@pytest.mark.parametrize('n_features', [8, 20]) +@pytest.mark.parametrize('y_shape, fit_intercept, noise', + [((11,), True, 1.), + ((11, 1), True, 20.), + ((11, 3), True, 150.), + ((11, 3), False, 30.)]) +def test_ridge_gcv_sample_weights( + gcv_mode, X_constructor, fit_intercept, n_features, y_shape, noise): + alphas = [1e-3, .1, 1., 10., 1e3] + rng = np.random.RandomState(0) + n_targets = y_shape[-1] if len(y_shape) == 2 else 1 + X, y = _make_sparse_offset_regression( + n_samples=11, n_features=n_features, n_targets=n_targets, + random_state=0, shuffle=False, noise=noise) + y = y.reshape(y_shape) + + sample_weight = 3 * rng.randn(len(X)) + sample_weight = (sample_weight - sample_weight.min() + 1).astype(int) + indices = np.repeat(np.arange(X.shape[0]), sample_weight) + sample_weight = sample_weight.astype(float) + X_tiled, y_tiled = X[indices], y[indices] + + cv = GroupKFold(n_splits=X.shape[0]) + splits = cv.split(X_tiled, y_tiled, groups=indices) + kfold = RidgeCV( + alphas=alphas, cv=splits, scoring='neg_mean_squared_error', + fit_intercept=fit_intercept) + # ignore warning from GridSearchCV: DeprecationWarning: The default of the + # `iid` parameter will change from True to False in version 0.22 and will + # be removed in 0.24 + with ignore_warnings(category=DeprecationWarning): + kfold.fit(X_tiled, y_tiled) 
+ + ridge_reg = Ridge(alpha=kfold.alpha_, fit_intercept=fit_intercept) + splits = cv.split(X_tiled, y_tiled, groups=indices) + predictions = cross_val_predict(ridge_reg, X_tiled, y_tiled, cv=splits) + kfold_errors = (y_tiled - predictions)**2 + kfold_errors = [ + np.sum(kfold_errors[indices == i], axis=0) for + i in np.arange(X.shape[0])] + kfold_errors = np.asarray(kfold_errors) + + X_gcv = X_constructor(X) + gcv_ridge = RidgeCV( + alphas=alphas, store_cv_values=True, + gcv_mode=gcv_mode, fit_intercept=fit_intercept) + gcv_ridge.fit(X_gcv, y, sample_weight=sample_weight) + if len(y_shape) == 2: + gcv_errors = gcv_ridge.cv_values_[:, :, alphas.index(kfold.alpha_)] + else: + gcv_errors = gcv_ridge.cv_values_[:, alphas.index(kfold.alpha_)] + + assert kfold.alpha_ == pytest.approx(gcv_ridge.alpha_) + assert_allclose(gcv_errors, kfold_errors, rtol=1e-3) + assert_allclose(gcv_ridge.coef_, kfold.coef_, rtol=1e-3) + assert_allclose(gcv_ridge.intercept_, kfold.intercept_, rtol=1e-3) + + +@pytest.mark.parametrize('mode', [True, 1, 5, 'bad', 'gcv']) +def test_check_gcv_mode_error(mode): + X, y = make_regression(n_samples=5, n_features=2) + gcv = RidgeCV(gcv_mode=mode) + with pytest.raises(ValueError, match="Unknown value for 'gcv_mode'"): + gcv.fit(X, y) + with pytest.raises(ValueError, match="Unknown value for 'gcv_mode'"): + _check_gcv_mode(X, mode) + + +@pytest.mark.parametrize("sparse", [True, False]) +@pytest.mark.parametrize( + 'mode, mode_n_greater_than_p, mode_p_greater_than_n', + [(None, 'svd', 'eigen'), + ('auto', 'svd', 'eigen'), + ('eigen', 'eigen', 'eigen'), + ('svd', 'svd', 'svd')] +) +def test_check_gcv_mode_choice(sparse, mode, mode_n_greater_than_p, + mode_p_greater_than_n): + X, _ = make_regression(n_samples=5, n_features=2) + if sparse: + X = sp.csr_matrix(X) + assert _check_gcv_mode(X, mode) == mode_n_greater_than_p + assert _check_gcv_mode(X.T, mode) == mode_p_greater_than_n + + def _test_ridge_loo(filter_): # test that can work with both dense or sparse matrices n_samples = X_diabetes.shape[0] @@ -318,46 +527,7 @@ def _test_ridge_loo(filter_): ret = [] fit_intercept = filter_ == DENSE_FILTER - if fit_intercept: - X_diabetes_ = X_diabetes - X_diabetes.mean(0) - else: - X_diabetes_ = X_diabetes ridge_gcv = _RidgeGCV(fit_intercept=fit_intercept) - ridge = Ridge(alpha=1.0, fit_intercept=fit_intercept) - - # because fit_intercept is applied - - # generalized cross-validation (efficient leave-one-out) - decomp = ridge_gcv._pre_compute(X_diabetes_, y_diabetes, fit_intercept) - errors, c = ridge_gcv._errors(1.0, y_diabetes, *decomp) - values, c = ridge_gcv._values(1.0, y_diabetes, *decomp) - - # brute-force leave-one-out: remove one example at a time - errors2 = [] - values2 = [] - for i in range(n_samples): - sel = np.arange(n_samples) != i - X_new = X_diabetes_[sel] - y_new = y_diabetes[sel] - ridge.fit(X_new, y_new) - value = ridge.predict([X_diabetes_[i]])[0] - error = (y_diabetes[i] - value) ** 2 - errors2.append(error) - values2.append(value) - - # check that efficient and brute-force LOO give same results - assert_almost_equal(errors, errors2) - assert_almost_equal(values, values2) - - # generalized cross-validation (efficient leave-one-out, - # SVD variation) - decomp = ridge_gcv._pre_compute_svd(X_diabetes_, y_diabetes, fit_intercept) - errors3, c = ridge_gcv._errors_svd(ridge.alpha, y_diabetes, *decomp) - values3, c = ridge_gcv._values_svd(ridge.alpha, y_diabetes, *decomp) - - # check that efficient and SVD efficient LOO give same results - assert_almost_equal(errors, 
errors3) - assert_almost_equal(values, values3) # check best alpha ridge_gcv.fit(filter_(X_diabetes), y_diabetes) @@ -369,25 +539,26 @@ def _test_ridge_loo(filter_): scoring = make_scorer(mean_squared_error, greater_is_better=False) ridge_gcv2 = RidgeCV(fit_intercept=False, scoring=scoring) f(ridge_gcv2.fit)(filter_(X_diabetes), y_diabetes) - assert_equal(ridge_gcv2.alpha_, alpha_) + assert ridge_gcv2.alpha_ == pytest.approx(alpha_) # check that we get same best alpha with custom score_func func = lambda x, y: -mean_squared_error(x, y) scoring = make_scorer(func) ridge_gcv3 = RidgeCV(fit_intercept=False, scoring=scoring) f(ridge_gcv3.fit)(filter_(X_diabetes), y_diabetes) - assert_equal(ridge_gcv3.alpha_, alpha_) + assert ridge_gcv3.alpha_ == pytest.approx(alpha_) # check that we get same best alpha with a scorer scorer = get_scorer('neg_mean_squared_error') ridge_gcv4 = RidgeCV(fit_intercept=False, scoring=scorer) ridge_gcv4.fit(filter_(X_diabetes), y_diabetes) - assert_equal(ridge_gcv4.alpha_, alpha_) + assert ridge_gcv4.alpha_ == pytest.approx(alpha_) # check that we get same best alpha with sample weights - ridge_gcv.fit(filter_(X_diabetes), y_diabetes, - sample_weight=np.ones(n_samples)) - assert_equal(ridge_gcv.alpha_, alpha_) + if filter_ == DENSE_FILTER: + ridge_gcv.fit(filter_(X_diabetes), y_diabetes, + sample_weight=np.ones(n_samples)) + assert ridge_gcv.alpha_ == pytest.approx(alpha_) # simulate several responses Y = np.vstack((y_diabetes, y_diabetes)).T @@ -397,8 +568,8 @@ def _test_ridge_loo(filter_): ridge_gcv.fit(filter_(X_diabetes), y_diabetes) y_pred = ridge_gcv.predict(filter_(X_diabetes)) - assert_array_almost_equal(np.vstack((y_pred, y_pred)).T, - Y_pred, decimal=5) + assert_allclose(np.vstack((y_pred, y_pred)).T, + Y_pred, rtol=1e-5) return ret @@ -407,7 +578,7 @@ def _test_ridge_cv_normalize(filter_): ridge_cv = RidgeCV(normalize=True, cv=3) ridge_cv.fit(filter_(10. * X_diabetes), y_diabetes) - gs = GridSearchCV(Ridge(normalize=True), cv=3, + gs = GridSearchCV(Ridge(normalize=True, solver='sparse_cg'), cv=3, param_grid={'alpha': ridge_cv.alphas}) gs.fit(filter_(10. * X_diabetes), y_diabetes) assert_equal(gs.best_estimator_.alpha, ridge_cv.alpha_) @@ -501,12 +672,6 @@ def test_dense_sparse(test_func): check_dense_sparse(test_func) -def test_ridge_cv_sparse_svd(): - X = sp.csr_matrix(X_diabetes) - ridge = RidgeCV(gcv_mode="svd") - assert_raises(TypeError, ridge.fit, X) - - def test_ridge_sparse_svd(): X = sp.csc_matrix(rng.rand(100, 10)) y = rng.rand(100) @@ -620,6 +785,10 @@ def test_ridgecv_store_cv_values(): r.fit(x, y) assert r.cv_values_.shape == (n_samples, n_targets, n_alphas) + r = RidgeCV(cv=3, store_cv_values=True) + assert_raises_regex(ValueError, 'cv!=None and store_cv_values', + r.fit, x, y) + @pytest.mark.filterwarnings('ignore: The default value of cv') # 0.22 def test_ridge_classifier_cv_store_cv_values(): @@ -762,13 +931,13 @@ def test_ridgecv_negative_alphas(): # Negative integers ridge = RidgeCV(alphas=(-1, -10, -100)) assert_raises_regex(ValueError, - "alphas cannot be negative.", + "alphas must be positive", ridge.fit, X, y) # Negative floats ridge = RidgeCV(alphas=(-0.1, -1.0, -10.0)) assert_raises_regex(ValueError, - "alphas cannot be negative.", + "alphas must be positive", ridge.fit, X, y) @@ -887,54 +1056,14 @@ def test_ridge_regression_check_arguments_validity(return_intercept, assert_allclose(out, true_coefs, rtol=0, atol=atol) -def test_errors_and_values_helper(): - ridgecv = _RidgeGCV() - rng = check_random_state(42) - alpha = 1. 
- n = 5 - y = rng.randn(n) - v = rng.randn(n) - Q = rng.randn(len(v), len(v)) - QT_y = Q.T.dot(y) - G_diag, c = ridgecv._errors_and_values_helper(alpha, y, v, Q, QT_y) - - # test that helper function behaves as expected - out, c_ = ridgecv._errors(alpha, y, v, Q, QT_y) - np.testing.assert_array_equal(out, (c / G_diag) ** 2) - np.testing.assert_array_equal(c, c) - - out, c_ = ridgecv._values(alpha, y, v, Q, QT_y) - np.testing.assert_array_equal(out, y - (c / G_diag)) - np.testing.assert_array_equal(c_, c) - - -def test_errors_and_values_svd_helper(): - ridgecv = _RidgeGCV() - rng = check_random_state(42) - alpha = 1. - for n, p in zip((5, 10), (12, 6)): - y = rng.randn(n) - v = rng.randn(p) - U = rng.randn(n, p) - UT_y = U.T.dot(y) - G_diag, c = ridgecv._errors_and_values_svd_helper(alpha, y, v, U, UT_y) - - # test that helper function behaves as expected - out, c_ = ridgecv._errors_svd(alpha, y, v, U, UT_y) - np.testing.assert_array_equal(out, (c / G_diag) ** 2) - np.testing.assert_array_equal(c, c) - - out, c_ = ridgecv._values_svd(alpha, y, v, U, UT_y) - np.testing.assert_array_equal(out, y - (c / G_diag)) - np.testing.assert_array_equal(c_, c) - - def test_ridge_classifier_no_support_multilabel(): X, y = make_multilabel_classification(n_samples=10, random_state=0) assert_raises(ValueError, RidgeClassifier().fit, X, y) -def test_dtype_match(): +@pytest.mark.parametrize( + "solver", ["svd", "sparse_cg", "cholesky", "lsqr", "sag", "saga"]) +def test_dtype_match(solver): rng = np.random.RandomState(0) alpha = 1.0 @@ -944,25 +1073,22 @@ def test_dtype_match(): X_32 = X_64.astype(np.float32) y_32 = y_64.astype(np.float32) - solvers = ["svd", "sparse_cg", "cholesky", "lsqr"] - for solver in solvers: - - # Check type consistency 32bits - ridge_32 = Ridge(alpha=alpha, solver=solver) - ridge_32.fit(X_32, y_32) - coef_32 = ridge_32.coef_ + # Check type consistency 32bits + ridge_32 = Ridge(alpha=alpha, solver=solver, max_iter=500, tol=1e-10,) + ridge_32.fit(X_32, y_32) + coef_32 = ridge_32.coef_ - # Check type consistency 64 bits - ridge_64 = Ridge(alpha=alpha, solver=solver) - ridge_64.fit(X_64, y_64) - coef_64 = ridge_64.coef_ + # Check type consistency 64 bits + ridge_64 = Ridge(alpha=alpha, solver=solver, max_iter=500, tol=1e-10,) + ridge_64.fit(X_64, y_64) + coef_64 = ridge_64.coef_ - # Do the actual checks at once for easier debug - assert coef_32.dtype == X_32.dtype - assert coef_64.dtype == X_64.dtype - assert ridge_32.predict(X_32).dtype == X_32.dtype - assert ridge_64.predict(X_64).dtype == X_64.dtype - assert_almost_equal(ridge_32.coef_, ridge_64.coef_, decimal=5) + # Do the actual checks at once for easier debug + assert coef_32.dtype == X_32.dtype + assert coef_64.dtype == X_64.dtype + assert ridge_32.predict(X_32).dtype == X_32.dtype + assert ridge_64.predict(X_64).dtype == X_64.dtype + assert_allclose(ridge_32.coef_, ridge_64.coef_, rtol=1e-4) def test_dtype_match_cholesky(): @@ -993,3 +1119,34 @@ def test_dtype_match_cholesky(): assert ridge_32.predict(X_32).dtype == X_32.dtype assert ridge_64.predict(X_64).dtype == X_64.dtype assert_almost_equal(ridge_32.coef_, ridge_64.coef_, decimal=5) + + +@pytest.mark.parametrize( + 'solver', ['svd', 'cholesky', 'lsqr', 'sparse_cg', 'sag', 'saga']) +@pytest.mark.parametrize('seed', range(1)) +def test_ridge_regression_dtype_stability(solver, seed): + random_state = np.random.RandomState(seed) + n_samples, n_features = 6, 5 + X = random_state.randn(n_samples, n_features) + coef = random_state.randn(n_features) + y = np.dot(X, coef) + 0.01 * 
random_state.randn(n_samples) + alpha = 1.0 + results = dict() + # XXX: Sparse CG seems to be far less numerically stable than the + # others, maybe we should not enable float32 for this one. + atol = 1e-3 if solver == "sparse_cg" else 1e-5 + for current_dtype in (np.float32, np.float64): + results[current_dtype] = ridge_regression(X.astype(current_dtype), + y.astype(current_dtype), + alpha=alpha, + solver=solver, + random_state=random_state, + sample_weight=None, + max_iter=500, + tol=1e-10, + return_n_iter=False, + return_intercept=False) + + assert results[np.float32].dtype == np.float32 + assert results[np.float64].dtype == np.float64 + assert_allclose(results[np.float32], results[np.float64], atol=atol) diff --git a/sklearn/manifold/isomap.py b/sklearn/manifold/isomap.py index bbb83a5ed81f8..88c979c0e1fdb 100644 --- a/sklearn/manifold/isomap.py +++ b/sklearn/manifold/isomap.py @@ -145,7 +145,7 @@ def reconstruction_error(self): reconstruction_error : float Notes - ------- + ----- The cost function of an isomap embedding is ``E = frobenius_norm[K(D) - K(D_fit)] / n_samples`` diff --git a/sklearn/manifold/spectral_embedding_.py b/sklearn/manifold/spectral_embedding_.py index e387ecec0f4d5..a6d5af54f9bc4 100644 --- a/sklearn/manifold/spectral_embedding_.py +++ b/sklearn/manifold/spectral_embedding_.py @@ -348,7 +348,7 @@ class SpectralEmbedding(BaseEstimator): Read more in the :ref:`User Guide `. Parameters - ----------- + ---------- n_components : integer, default: 2 The dimension of the projected subspace. diff --git a/sklearn/metrics/classification.py b/sklearn/metrics/classification.py index 9eae62a28045e..d1337bdc61aed 100644 --- a/sklearn/metrics/classification.py +++ b/sklearn/metrics/classification.py @@ -1989,16 +1989,18 @@ def hamming_loss(y_true, y_pred, labels=None, sample_weight=None): ----- In multiclass classification, the Hamming loss corresponds to the Hamming distance between ``y_true`` and ``y_pred`` which is equivalent to the - subset ``zero_one_loss`` function. + subset ``zero_one_loss`` function, when `normalize` parameter is set to + True. In multilabel classification, the Hamming loss is different from the subset zero-one loss. The zero-one loss considers the entire set of labels for a given sample incorrect if it does not entirely match the true set of - labels. Hamming loss is more forgiving in that it penalizes the individual - labels. + labels. Hamming loss is more forgiving in that it penalizes only the + individual labels. - The Hamming loss is upperbounded by the subset zero-one loss. When - normalized over samples, the Hamming loss is always between 0 and 1. + The Hamming loss is upperbounded by the subset zero-one loss, when + `normalize` parameter is set to True. It is always between 0 and 1, + lower being better. 
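A minimal sketch of the bound described in the Notes above, using
``hamming_loss`` and ``zero_one_loss`` on a small multilabel example::

    import numpy as np
    from sklearn.metrics import hamming_loss, zero_one_loss

    y_true = np.array([[1, 1], [1, 1]])
    y_pred = np.array([[1, 0], [1, 1]])

    # one of the four individual labels is wrong
    hamming_loss(y_true, y_pred)   # 0.25
    # one of the two samples is not an exact match, so the subset
    # zero-one loss is larger and bounds the Hamming loss from above
    zero_one_loss(y_true, y_pred)  # 0.5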
References ---------- diff --git a/sklearn/metrics/pairwise.py b/sklearn/metrics/pairwise.py index a32d6aa6efbcc..9e377f3d4c07e 100644 --- a/sklearn/metrics/pairwise.py +++ b/sklearn/metrics/pairwise.py @@ -749,7 +749,7 @@ def paired_cosine_distances(X, Y): distances : ndarray, shape (n_samples, ) Notes - ------ + ----- The cosine distance is equivalent to the half the squared euclidean distance if each sample is normalized to unit norm """ @@ -1169,17 +1169,17 @@ def distance_metrics(): The valid distance metrics, and the function they map to, are: - ============ ==================================== - metric Function - ============ ==================================== - 'cityblock' metrics.pairwise.manhattan_distances - 'cosine' metrics.pairwise.cosine_distances - 'euclidean' metrics.pairwise.euclidean_distances - 'haversine' metrics.pairwise.haversine_distances - 'l1' metrics.pairwise.manhattan_distances - 'l2' metrics.pairwise.euclidean_distances - 'manhattan' metrics.pairwise.manhattan_distances - ============ ==================================== + ============ ==================================== + metric Function + ============ ==================================== + 'cityblock' metrics.pairwise.manhattan_distances + 'cosine' metrics.pairwise.cosine_distances + 'euclidean' metrics.pairwise.euclidean_distances + 'haversine' metrics.pairwise.haversine_distances + 'l1' metrics.pairwise.manhattan_distances + 'l2' metrics.pairwise.euclidean_distances + 'manhattan' metrics.pairwise.manhattan_distances + ============ ==================================== Read more in the :ref:`User Guide `. diff --git a/sklearn/mixture/bayesian_mixture.py b/sklearn/mixture/bayesian_mixture.py index 6f13f63e3fcd9..88c0ab66ae20a 100644 --- a/sklearn/mixture/bayesian_mixture.py +++ b/sklearn/mixture/bayesian_mixture.py @@ -140,7 +140,7 @@ class BayesianGaussianMixture(BaseMixture): mean_precision_prior : float | None, optional. The precision prior on the mean distribution (Gaussian). - Controls the extend to where means can be placed. Smaller + Controls the extend to where means can be placed. Larger values concentrate the means of each clusters around `mean_prior`. The value of the parameter must be greater than 0. If it is None, it's set to 1. @@ -260,7 +260,7 @@ class BayesianGaussianMixture(BaseMixture): mean_precision_prior : float The precision prior on the mean distribution (Gaussian). Controls the extend to where means can be placed. - Smaller values concentrate the means of each clusters around + Larger values concentrate the means of each clusters around `mean_prior`. mean_precision_ : array-like, shape (n_components,) diff --git a/sklearn/model_selection/_search.py b/sklearn/model_selection/_search.py index 68f0e296b077c..6fe2a8edfa12a 100644 --- a/sklearn/model_selection/_search.py +++ b/sklearn/model_selection/_search.py @@ -447,7 +447,7 @@ def predict(self, X): ``predict``. Parameters - ----------- + ---------- X : indexable, length n_samples Must fulfill the input assumptions of the underlying estimator. @@ -464,7 +464,7 @@ def predict_proba(self, X): ``predict_proba``. Parameters - ----------- + ---------- X : indexable, length n_samples Must fulfill the input assumptions of the underlying estimator. @@ -481,7 +481,7 @@ def predict_log_proba(self, X): ``predict_log_proba``. Parameters - ----------- + ---------- X : indexable, length n_samples Must fulfill the input assumptions of the underlying estimator. @@ -498,7 +498,7 @@ def decision_function(self, X): ``decision_function``. 
Parameters - ----------- + ---------- X : indexable, length n_samples Must fulfill the input assumptions of the underlying estimator. @@ -515,7 +515,7 @@ def transform(self, X): ``refit=True``. Parameters - ----------- + ---------- X : indexable, length n_samples Must fulfill the input assumptions of the underlying estimator. @@ -532,7 +532,7 @@ def inverse_transform(self, Xt): ``inverse_transform`` and ``refit=True``. Parameters - ----------- + ---------- Xt : indexable, length n_samples Must fulfill the input assumptions of the underlying estimator. @@ -1103,7 +1103,7 @@ class GridSearchCV(BaseSearchCV): This is present only if ``refit`` is not False. Notes - ------ + ----- The parameters selected are those that maximize the score of the left out data, unless an explicit score is passed in which case it is used instead. diff --git a/sklearn/model_selection/_split.py b/sklearn/model_selection/_split.py index 17fb16ae8340e..24fefef5216fe 100644 --- a/sklearn/model_selection/_split.py +++ b/sklearn/model_selection/_split.py @@ -853,7 +853,7 @@ class LeaveOneGroupOut(BaseCrossValidator): >>> logo = LeaveOneGroupOut() >>> logo.get_n_splits(X, y, groups) 2 - >>> logo.get_n_splits(groups=groups) # 'groups' is always required + >>> logo.get_n_splits(groups=groups) # 'groups' is always required 2 >>> print(logo) LeaveOneGroupOut() diff --git a/sklearn/model_selection/_validation.py b/sklearn/model_selection/_validation.py index 2f5505fff01c6..3dc8b0441a64a 100644 --- a/sklearn/model_selection/_validation.py +++ b/sklearn/model_selection/_validation.py @@ -1118,7 +1118,7 @@ def learning_curve(estimator, X, y, groups=None, train_sizes=np.linspace(0.1, 1.0, 5), cv='warn', scoring=None, exploit_incremental_learning=False, n_jobs=None, pre_dispatch="all", verbose=0, shuffle=False, - random_state=None, error_score='raise-deprecating'): + random_state=None, error_score='raise-deprecating'): """Learning curve. Determines cross-validated training and test scores for different training diff --git a/sklearn/neighbors/base.py b/sklearn/neighbors/base.py index 21f272c518f51..00c4b8636a17c 100644 --- a/sklearn/neighbors/base.py +++ b/sklearn/neighbors/base.py @@ -62,14 +62,14 @@ def _get_weights(dist, weights): """Get the weights from an array of distances and a parameter ``weights`` Parameters - =========== + ---------- dist : ndarray The input distances weights : {'uniform', 'distance' or a callable} The kind of weighting used Returns - ======== + ------- weights_arr : array of the same shape as ``dist`` if ``weights == 'uniform'``, then returns None """ diff --git a/sklearn/neighbors/lof.py b/sklearn/neighbors/lof.py index 5ad2f7e9b7b1d..472710ea51bb2 100644 --- a/sklearn/neighbors/lof.py +++ b/sklearn/neighbors/lof.py @@ -401,7 +401,7 @@ def _decision_function(self, X): def score_samples(self): """Opposite of the Local Outlier Factor of X. - It is the opposite as as bigger is better, i.e. large values correspond + It is the opposite as bigger is better, i.e. large values correspond to inliers. Only available for novelty detection (when novelty is set to True). @@ -437,7 +437,7 @@ def score_samples(self): def _score_samples(self, X): """Opposite of the Local Outlier Factor of X. - It is the opposite as as bigger is better, i.e. large values correspond + It is the opposite as bigger is better, i.e. large values correspond to inliers. Only available for novelty detection (when novelty is set to True). 
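A minimal usage sketch of ``score_samples`` in the novelty setting described
above (training data and query points chosen arbitrarily for illustration)::

    import numpy as np
    from sklearn.neighbors import LocalOutlierFactor

    rng = np.random.RandomState(0)
    X_train = rng.randn(100, 2)
    lof = LocalOutlierFactor(novelty=True).fit(X_train)
    # larger (less negative) scores correspond to inliers, so the point
    # far from the training cloud gets a much lower score
    lof.score_samples(np.array([[0., 0.], [10., 10.]]))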
@@ -500,5 +500,5 @@ def _local_reachability_density(self, distances_X, neighbors_indices): self.n_neighbors_ - 1] reach_dist_array = np.maximum(distances_X, dist_k) - # 1e-10 to avoid `nan' when nb of duplicates > n_neighbors_: + # 1e-10 to avoid `nan' when nb of duplicates > n_neighbors_: return 1. / (np.mean(reach_dist_array, axis=1) + 1e-10) diff --git a/sklearn/neural_network/_stochastic_optimizers.py b/sklearn/neural_network/_stochastic_optimizers.py index 8f19c7b488acc..3e49e94de8bd1 100644 --- a/sklearn/neural_network/_stochastic_optimizers.py +++ b/sklearn/neural_network/_stochastic_optimizers.py @@ -1,7 +1,7 @@ """Stochastic optimization methods for MLP """ -# Authors: Jiyuan Qian +# Authors: Jiyuan Qian # License: BSD 3 clause import numpy as np diff --git a/sklearn/pipeline.py b/sklearn/pipeline.py index 1fcdadaabb6c0..9a51fefd144ac 100644 --- a/sklearn/pipeline.py +++ b/sklearn/pipeline.py @@ -95,12 +95,12 @@ class Pipeline(_BaseComposition): >>> # For instance, fit using a k of 10 in the SelectKBest >>> # and a parameter 'C' of the svm >>> anova_svm.set_params(anova__k=10, svc__C=.1).fit(X, y) - ... # doctest: +ELLIPSIS, +NORMALIZE_WHITESPACE + ... # doctest: +ELLIPSIS, +NORMALIZE_WHITESPACE Pipeline(memory=None, steps=[('anova', SelectKBest(...)), ('svc', SVC(...))], verbose=False) >>> prediction = anova_svm.predict(X) - >>> anova_svm.score(X, y) # doctest: +ELLIPSIS + >>> anova_svm.score(X, y) # doctest: +ELLIPSIS 0.83 >>> # getting the selected features chosen by anova_filter >>> anova_svm['anova'].get_support() @@ -671,7 +671,7 @@ def make_pipeline(*steps, **kwargs): >>> from sklearn.naive_bayes import GaussianNB >>> from sklearn.preprocessing import StandardScaler >>> make_pipeline(StandardScaler(), GaussianNB(priors=None)) - ... # doctest: +NORMALIZE_WHITESPACE + ... # doctest: +NORMALIZE_WHITESPACE Pipeline(memory=None, steps=[('standardscaler', StandardScaler(copy=True, with_mean=True, with_std=True)), @@ -782,7 +782,7 @@ class FeatureUnion(_BaseComposition, TransformerMixin): >>> union = FeatureUnion([("pca", PCA(n_components=1)), ... ("svd", TruncatedSVD(n_components=2))]) >>> X = [[0., 1., 3], [2., 2., 5]] - >>> union.fit_transform(X) # doctest: +NORMALIZE_WHITESPACE +ELLIPSIS + >>> union.fit_transform(X) # doctest: +NORMALIZE_WHITESPACE +ELLIPSIS array([[ 1.5 , 3.0..., 0.8...], [-1.5 , 5.7..., -0.4...]]) """ @@ -1008,7 +1008,7 @@ def make_union(*transformers, **kwargs): -------- >>> from sklearn.decomposition import PCA, TruncatedSVD >>> from sklearn.pipeline import make_union - >>> make_union(PCA(), TruncatedSVD()) # doctest: +NORMALIZE_WHITESPACE + >>> make_union(PCA(), TruncatedSVD()) # doctest: +NORMALIZE_WHITESPACE FeatureUnion(n_jobs=None, transformer_list=[('pca', PCA(copy=True, iterated_power='auto', diff --git a/sklearn/preprocessing/data.py b/sklearn/preprocessing/data.py index 8c8524ef6505c..2f020a0a4780e 100644 --- a/sklearn/preprocessing/data.py +++ b/sklearn/preprocessing/data.py @@ -1475,17 +1475,21 @@ def transform(self, X): Parameters ---------- - X : array-like or sparse matrix, shape [n_samples, n_features] + X : array-like or CSR/CSC sparse matrix, shape [n_samples, n_features] The data to transform, row by row. - Sparse input should preferably be in CSR format (for speed), - but must be in CSC format if the degree is 4 or higher. 
- If the input matrix is in CSR format and the expansion is of - degree 2 or 3, the method described in the work "Leveraging - Sparsity to Speed Up Polynomial Feature Expansions of CSR - Matrices Using K-Simplex Numbers" by Andrew Nystrom and - John Hughes is used, which is much faster than the method - used on CSC input. + Prefer CSR over CSC for sparse input (for speed), but CSC is + required if the degree is 4 or higher. If the degree is less than + 4 and the input format is CSC, it will be converted to CSR, have + its polynomial features generated, then converted back to CSC. + + If the degree is 2 or 3, the method described in "Leveraging + Sparsity to Speed Up Polynomial Feature Expansions of CSR Matrices + Using K-Simplex Numbers" by Andrew Nystrom and John Hughes is + used, which is much faster than the method used on CSC input. For + this reason, a CSC input will be converted to CSR, and the output + will be converted back to CSC prior to being returned, hence the + preference of CSR. Returns ------- @@ -1679,7 +1683,7 @@ class Normalizer(BaseEstimator, TransformerMixin): >>> X = [[4, 1, 2, 2], ... [1, 3, 9, 3], ... [5, 7, 5, 1]] - >>> transformer = Normalizer().fit(X) # fit does nothing. + >>> transformer = Normalizer().fit(X) # fit does nothing. >>> transformer Normalizer(copy=True, norm='l2') >>> transformer.transform(X) @@ -1815,7 +1819,7 @@ class Binarizer(BaseEstimator, TransformerMixin): >>> X = [[ 1., -1., 2.], ... [ 2., 0., 0.], ... [ 0., 1., -1.]] - >>> transformer = Binarizer().fit(X) # fit does nothing. + >>> transformer = Binarizer().fit(X) # fit does nothing. >>> transformer Binarizer(copy=True, threshold=0.0) >>> transformer.transform(X) @@ -2262,7 +2266,7 @@ def _transform_col(self, X_col, quantiles, inverse): upper_bound_x = 1 lower_bound_y = quantiles[0] upper_bound_y = quantiles[-1] - # for inverse transform, match a uniform distribution + # for inverse transform, match a uniform distribution with np.errstate(invalid='ignore'): # hide NaN comparison warnings if output_distribution == 'normal': X_col = stats.norm.cdf(X_col) diff --git a/sklearn/preprocessing/label.py b/sklearn/preprocessing/label.py index f7cffa1e663b5..4a1c700717555 100644 --- a/sklearn/preprocessing/label.py +++ b/sklearn/preprocessing/label.py @@ -420,7 +420,7 @@ def fit_transform(self, y): """Fit label binarizer and transform multi-class labels to binary labels. - The output of transform is sometimes referred to as + The output of transform is sometimes referred to as the 1-of-K coding scheme. Parameters diff --git a/sklearn/setup.py b/sklearn/setup.py index e6f10cad77d9f..5a377043e9e38 100644 --- a/sklearn/setup.py +++ b/sklearn/setup.py @@ -33,6 +33,8 @@ def configuration(parent_package='', top_path=None): config.add_subpackage('feature_selection/tests') config.add_subpackage('gaussian_process') config.add_subpackage('gaussian_process/tests') + config.add_subpackage('impute') + config.add_subpackage('impute/tests') config.add_subpackage('inspection') config.add_subpackage('inspection/tests') config.add_subpackage('mixture') diff --git a/sklearn/svm/base.py b/sklearn/svm/base.py index effb0dcd12504..fe2f943cbdb7c 100644 --- a/sklearn/svm/base.py +++ b/sklearn/svm/base.py @@ -126,7 +126,7 @@ def fit(self, X, y, sample_weight=None): self : object Notes - ------ + ----- If X and y are not C-ordered and contiguous arrays of np.float64 and X is not a scipy.sparse.csr_matrix, X and/or y may be copied. 
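A small sketch of the sparse behaviour described in the
``PolynomialFeatures.transform`` docstring above (degree 2, so the fast
expansion for CSR input applies; the expected format and shape are noted as
comments)::

    import numpy as np
    import scipy.sparse as sp
    from sklearn.preprocessing import PolynomialFeatures

    X = sp.csr_matrix(np.arange(6, dtype=np.float64).reshape(3, 2))
    XP = PolynomialFeatures(degree=2).fit_transform(X)
    # output stays sparse; 6 columns: bias, x1, x2, x1**2, x1*x2, x2**2
    XP.format, XP.shape  # ('csr', (3, 6))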
@@ -293,7 +293,7 @@ def _sparse_fit(self, X, y, sample_weight, solver_type, kernel, if hasattr(self, "classes_"): n_class = len(self.classes_) - 1 - else: # regression + else: # regression n_class = 1 n_SV = self.support_vectors_.shape[0] @@ -540,7 +540,7 @@ def decision_function(self, X): n_classes). Notes - ------ + ----- If decision_function_shape='ovo', the function values are proportional to the distance of the samples X to the separating hyperplane. If the exact distances are required, divide the function values by the norm of diff --git a/sklearn/svm/classes.py b/sklearn/svm/classes.py index 8f45a501ddf35..a236ba716bc0d 100644 --- a/sklearn/svm/classes.py +++ b/sklearn/svm/classes.py @@ -429,10 +429,10 @@ def fit(self, X, y, sample_weight=None): class SVC(BaseSVC): """C-Support Vector Classification. - The implementation is based on libsvm. The fit time complexity - is more than quadratic with the number of samples which makes it hard - to scale to datasets with more than a couple of 10000 samples. For large - datasets consider using :class:`sklearn.linear_model.LinearSVC` or + The implementation is based on libsvm. The fit time scales at least + quadratically with the number of samples and may be impractical + beyond tens of thousands of samples. For large datasets + consider using :class:`sklearn.linear_model.LinearSVC` or :class:`sklearn.linear_model.SGDClassifier` instead, possibly after a :class:`sklearn.kernel_approximation.Nystroem` transformer. diff --git a/sklearn/tests/test_common.py b/sklearn/tests/test_common.py index fc3c7f3985e28..660b38c1ae4c2 100644 --- a/sklearn/tests/test_common.py +++ b/sklearn/tests/test_common.py @@ -215,7 +215,7 @@ def test_import_all_consistency(): def test_root_import_all_completeness(): - EXCEPTIONS = ('utils', 'tests', 'base', 'setup') + EXCEPTIONS = ('utils', 'tests', 'base', 'setup', 'conftest') for _, modname, _ in pkgutil.walk_packages(path=sklearn.__path__, onerror=lambda _: None): if '.' in modname or modname.startswith('_') or modname in EXCEPTIONS: diff --git a/sklearn/tree/__init__.py b/sklearn/tree/__init__.py index e91540bed8c5f..a5ffc7585d4e4 100644 --- a/sklearn/tree/__init__.py +++ b/sklearn/tree/__init__.py @@ -11,4 +11,4 @@ __all__ = ["DecisionTreeClassifier", "DecisionTreeRegressor", "ExtraTreeClassifier", "ExtraTreeRegressor", "export_graphviz", - "plot_tree", "export_text"] + "plot_tree", "export_text"] diff --git a/sklearn/tree/_tree.pyx b/sklearn/tree/_tree.pyx index edd47845ad197..f27b42ae9c956 100644 --- a/sklearn/tree/_tree.pyx +++ b/sklearn/tree/_tree.pyx @@ -1123,3 +1123,110 @@ cdef class Tree: Py_INCREF(self) arr.base = self return arr + + + def compute_partial_dependence(self, DTYPE_t[:, ::1] X, + int[::1] target_feature, + double[::1] out): + """Partial dependence of the response on the ``target_feature`` set. + + For each sample in ``X`` a tree traversal is performed. + Each traversal starts from the root with weight 1.0. + + At each non-leaf node that splits on a target feature, either + the left child or the right child is visited based on the feature + value of the current sample, and the weight is not modified. + At each non-leaf node that splits on a complementary feature, + both children are visited and the weight is multiplied by the fraction + of training samples which went to each child. + + At each leaf, the value of the node is multiplied by the current + weight (weights sum to 1 for all visited terminal nodes). 
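The traversal described above can be summarised by a short pure-Python
sketch over a hypothetical dict-based tree (the actual implementation below
works on the flat ``Node`` array with an explicit stack rather than
recursion)::

    # toy stand-in for the node array: a leaf holds a 'value'; an internal
    # node holds 'feature', 'threshold', children and the fraction of
    # training samples that went to its left child
    def partial_dependence_one_sample(node, x, target_features, weight=1.0):
        if 'value' in node:
            # leaf: accumulate value * weight
            return weight * node['value']
        if node['feature'] in target_features:
            # split on a target feature: follow one child, keep the weight
            child = ('left' if x[node['feature']] <= node['threshold']
                     else 'right')
            return partial_dependence_one_sample(node[child], x,
                                                 target_features, weight)
        # split on a complementary feature: visit both children, weighting
        # each by its fraction of the training samples
        left_w = weight * node['left_frac']
        right_w = weight * (1 - node['left_frac'])
        return (partial_dependence_one_sample(node['left'], x,
                                              target_features, left_w) +
                partial_dependence_one_sample(node['right'], x,
                                              target_features, right_w))

    tree = {'feature': 0, 'threshold': 0.5, 'left_frac': 0.25,
            'left': {'value': -1.0}, 'right': {'value': 2.0}}
    partial_dependence_one_sample(tree, [0.2], {0})     # -1.0
    partial_dependence_one_sample(tree, [0.2], set())   # 0.25*-1.0 + 0.75*2.0 = 1.25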
+ + Parameters + ---------- + X : view on 2d ndarray, shape (n_samples, n_target_features) + The grid points on which the partial dependence should be + evaluated. + target_feature : view on 1d ndarray, shape (n_target_features) + The set of target features for which the partial dependence + should be evaluated. + out : view on 1d ndarray, shape (n_samples) + The value of the partial dependence function on each grid + point. + """ + cdef: + double[::1] weight_stack = np.zeros(self.node_count, + dtype=np.float64) + SIZE_t[::1] node_idx_stack = np.zeros(self.node_count, + dtype=np.intp) + SIZE_t sample_idx + SIZE_t feature_idx + int stack_size + double left_sample_frac + double current_weight + double total_weight # used for sanity check only + Node *current_node # use a pointer to avoid copying attributes + SIZE_t current_node_idx + bint is_target_feature + SIZE_t _TREE_LEAF = TREE_LEAF # to avoid python interactions + + for sample_idx in range(X.shape[0]): + # init stacks for current sample + stack_size = 1 + node_idx_stack[0] = 0 # root node + weight_stack[0] = 1 # all the samples are in the root node + total_weight = 0 + + while stack_size > 0: + # pop the stack + stack_size -= 1 + current_node_idx = node_idx_stack[stack_size] + current_node = &self.nodes[current_node_idx] + + if current_node.left_child == _TREE_LEAF: + # leaf node + out[sample_idx] += (weight_stack[stack_size] * + self.value[current_node_idx]) + total_weight += weight_stack[stack_size] + else: + # non-leaf node + + # determine if the split feature is a target feature + is_target_feature = False + for feature_idx in range(target_feature.shape[0]): + if target_feature[feature_idx] == current_node.feature: + is_target_feature = True + break + + if is_target_feature: + # In this case, we push left or right child on stack + if X[sample_idx, feature_idx] <= current_node.threshold: + node_idx_stack[stack_size] = current_node.left_child + else: + node_idx_stack[stack_size] = current_node.right_child + stack_size += 1 + else: + # In this case, we push both children onto the stack, + # and give a weight proportional to the number of + # samples going through each branch. + + # push left child + node_idx_stack[stack_size] = current_node.left_child + left_sample_frac = ( + self.nodes[current_node.left_child].weighted_n_node_samples / + current_node.weighted_n_node_samples) + current_weight = weight_stack[stack_size] + weight_stack[stack_size] = current_weight * left_sample_frac + stack_size += 1 + + # push right child + node_idx_stack[stack_size] = current_node.right_child + weight_stack[stack_size] = ( + current_weight * (1 - left_sample_frac)) + stack_size += 1 + + # Sanity check. Should never happen. 
+ if not (0.999 < total_weight < 1.001): + raise ValueError("Total weight should be 1.0 but was %.9f" % + total_weight) diff --git a/sklearn/tree/_utils.pyx b/sklearn/tree/_utils.pyx index fdbd48e75f3a9..634eb3ef84cdd 100644 --- a/sklearn/tree/_utils.pyx +++ b/sklearn/tree/_utils.pyx @@ -20,7 +20,7 @@ import numpy as np cimport numpy as np np.import_array() -from ..utils cimport _random +from ..utils._random cimport our_rand_r # ============================================================================= # Helper functions @@ -64,13 +64,13 @@ cdef inline np.ndarray sizet_ptr_to_ndarray(SIZE_t* data, SIZE_t size): cdef inline SIZE_t rand_int(SIZE_t low, SIZE_t high, UINT32_t* random_state) nogil: """Generate a random integer in [low; end).""" - return low + _random.our_rand_r(random_state) % (high - low) + return low + our_rand_r(random_state) % (high - low) cdef inline double rand_uniform(double low, double high, UINT32_t* random_state) nogil: """Generate a random double in [low; high).""" - return ((high - low) * _random.our_rand_r(random_state) / + return ((high - low) * our_rand_r(random_state) / RAND_R_MAX) + low diff --git a/sklearn/tree/export.py b/sklearn/tree/export.py index 02aa68b8af2dc..636ef03689a79 100644 --- a/sklearn/tree/export.py +++ b/sklearn/tree/export.py @@ -839,7 +839,7 @@ def export_text(decision_tree, feature_names=None, max_depth=10, Text summary of all the rules in the decision tree. Examples - ------- + -------- >>> from sklearn.datasets import load_iris >>> from sklearn.tree import DecisionTreeClassifier diff --git a/sklearn/tree/tests/test_export.py b/sklearn/tree/tests/test_export.py index 65b0a201be369..eed9be7bcb5d9 100644 --- a/sklearn/tree/tests/test_export.py +++ b/sklearn/tree/tests/test_export.py @@ -399,9 +399,8 @@ def test_export_text(): assert export_text(reg, decimals=1, show_weights=True) == expected_report -def test_plot_tree(): +def test_plot_tree(pyplot): # mostly smoke tests - pytest.importorskip("matplotlib.pyplot") # Check correctness of export_graphviz clf = DecisionTreeClassifier(max_depth=3, min_samples_split=2, diff --git a/sklearn/utils/extmath.py b/sklearn/utils/extmath.py index f04e43106e415..fcb03b0cecddd 100644 --- a/sklearn/utils/extmath.py +++ b/sklearn/utils/extmath.py @@ -392,7 +392,7 @@ def weighted_mode(a, w, axis=0): The value 4 appears three times: with uniform weights, the result is simply the mode of the distribution. - >>> weights = [1, 3, 0.5, 1.5, 1, 2] # deweight the 4's + >>> weights = [1, 3, 0.5, 1.5, 1, 2] # deweight the 4's >>> weighted_mode(x, weights) (array([2.]), array([3.5])) diff --git a/sklearn/utils/mocking.py b/sklearn/utils/mocking.py index 9c059f2ed2ed9..76ad144ccb171 100644 --- a/sklearn/utils/mocking.py +++ b/sklearn/utils/mocking.py @@ -108,7 +108,7 @@ def fit(self, X, y, **fit_params): def predict(self, T): """ Parameters - ----------- + ---------- T : indexable, length n_samples """ if self.check_X is not None: diff --git a/sklearn/utils/seq_dataset.pyx.tp b/sklearn/utils/seq_dataset.pyx.tp index f1b34c4c86bce..14f80804554db 100644 --- a/sklearn/utils/seq_dataset.pyx.tp +++ b/sklearn/utils/seq_dataset.pyx.tp @@ -45,7 +45,7 @@ import numpy as np np.import_array() -from . cimport _random +from ._random cimport our_rand_r cdef class SequentialDataset{{name}}: """Base class for datasets with sequential data access. 
@@ -155,7 +155,7 @@ cdef class SequentialDataset{{name}}: cdef int n = self.n_samples cdef unsigned i, j for i in range(n - 1): - j = i + _random.our_rand_r(&seed) % (n - i) + j = i + our_rand_r(&seed) % (n - i) ind[i], ind[j] = ind[j], ind[i] cdef int _get_next_index(self) nogil: @@ -169,7 +169,7 @@ cdef class SequentialDataset{{name}}: cdef int _get_random_index(self) nogil: cdef int n = self.n_samples - cdef int current_index = _random.our_rand_r(&self.seed) % n + cdef int current_index = our_rand_r(&self.seed) % n self.current_index = current_index return current_index diff --git a/sklearn/utils/testing.py b/sklearn/utils/testing.py index 65bed4c7ecef8..babf0b8658b5c 100644 --- a/sklearn/utils/testing.py +++ b/sklearn/utils/testing.py @@ -714,28 +714,6 @@ def set_random_state(estimator, random_state=0): estimator.set_params(random_state=random_state) -def if_matplotlib(func): - """Test decorator that skips test if matplotlib not installed. - - Parameters - ---------- - func - """ - @wraps(func) - def run_test(*args, **kwargs): - try: - import matplotlib - matplotlib.use('Agg', warn=False) - # this fails if no $DISPLAY specified - import matplotlib.pyplot as plt - plt.figure() - except ImportError: - raise SkipTest('Matplotlib not available.') - else: - return func(*args, **kwargs) - return run_test - - try: import pytest @@ -1024,21 +1002,3 @@ def assert_run_python_script(source_code, timeout=60): % e.output.decode('utf-8')) finally: os.unlink(source_file) - - -def close_figure(fig=None): - """Close a matplotlibt figure. - - Parameters - ---------- - fig : int or str or Figure, optional (default=None) - The figure, figure number or figure name to close. If ``None``, all - current figures are closed. - """ - from matplotlib.pyplot import get_fignums, close as _close # noqa - - if fig is None: - for fig in get_fignums(): - _close(fig) - else: - _close(fig)
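With ``if_matplotlib`` and ``close_figure`` removed, matplotlib-dependent
tests follow the fixture pattern used by ``test_plot_tree`` above. A minimal
sketch, assuming the ``pyplot`` fixture is provided by the project's pytest
conftest and skips the test when matplotlib is unavailable::

    from sklearn.datasets import load_iris
    from sklearn.tree import DecisionTreeClassifier, plot_tree

    def test_small_tree_plot(pyplot):  # hypothetical test using the fixture
        # the fixture is assumed to handle figure cleanup after the test
        X, y = load_iris(return_X_y=True)
        clf = DecisionTreeClassifier(max_depth=2, random_state=0).fit(X, y)
        plot_tree(clf)  # smoke test: should not raise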