From 33868d1edda783fbfaf3cd5476a3441a01b51b7d Mon Sep 17 00:00:00 2001 From: Guillaume Lemaitre Date: Wed, 5 Jun 2019 15:55:11 +0200 Subject: [PATCH 01/86] TST add test to ensure support of pipeline in PDP --- .../tests/test_partial_dependence.py | 27 +++++++++++++++++++ 1 file changed, 27 insertions(+) diff --git a/sklearn/inspection/tests/test_partial_dependence.py b/sklearn/inspection/tests/test_partial_dependence.py index 0b9405e471741..1ed74a2e3c347 100644 --- a/sklearn/inspection/tests/test_partial_dependence.py +++ b/sklearn/inspection/tests/test_partial_dependence.py @@ -23,6 +23,8 @@ from sklearn.cluster import KMeans from sklearn.metrics import r2_score from sklearn.preprocessing import PolynomialFeatures +from sklearn.preprocessing import StandardScaler +from sklearn.pipeline import make_pipeline from sklearn.dummy import DummyClassifier from sklearn.base import BaseEstimator, ClassifierMixin from sklearn.utils.testing import assert_allclose @@ -393,6 +395,31 @@ def test_partial_dependence_sample_weight(): assert np.corrcoef(pdp, values)[0, 1] > 0.99 +def test_partial_dependence_pipeline(): + # check that the partial dependence support pipeline + iris = load_iris() + + scaler = StandardScaler() + clf = DummyClassifier(random_state=42) + pipe = make_pipeline(scaler, clf) + + clf.fit(scaler.fit_transform(iris.data), iris.target) + pipe.fit(iris.data, iris.target) + + features = 0 + pdp_pipe, values_pipe = partial_dependence( + pipe, iris.data, features=[features] + ) + pdp_clf, values_clf = partial_dependence( + clf, scaler.transform(iris.data), features=[features] + ) + assert_allclose(pdp_pipe, pdp_clf) + assert_allclose( + values_pipe[0], + (values_clf[0] * scaler.scale_[features]) + scaler.mean_[features] + ) + + def test_plot_partial_dependence(pyplot): # Test partial dependence plot function. boston = load_boston() From f2035fe1e0a1389029e38cce17882ecc0daae8fc Mon Sep 17 00:00:00 2001 From: Guillaume Lemaitre Date: Wed, 5 Jun 2019 17:27:12 +0200 Subject: [PATCH 02/86] EHN add support for dataframe in PDP --- sklearn/compose/_column_transformer.py | 18 ++++- sklearn/inspection/partial_dependence.py | 75 ++++++++++++------- .../tests/test_partial_dependence.py | 67 ++++++++++++++++- sklearn/utils/testing.py | 11 +++ 4 files changed, 136 insertions(+), 35 deletions(-) diff --git a/sklearn/compose/_column_transformer.py b/sklearn/compose/_column_transformer.py index 1bfae5d200e13..5c234611e6aa5 100644 --- a/sklearn/compose/_column_transformer.py +++ b/sklearn/compose/_column_transformer.py @@ -649,7 +649,12 @@ def _get_column_indices(X, key): if (_check_key_type(key, int) or hasattr(key, 'dtype') and np.issubdtype(key.dtype, np.bool_)): # Convert key into positive indexes - idx = np.arange(n_columns)[key] + try: + idx = np.arange(n_columns)[key] + except IndexError as e: + raise ValueError( + 'all features must be in [0, %d]' % (n_columns - 1) + ) from e return np.atleast_1d(idx).tolist() elif _check_key_type(key, str): try: @@ -672,7 +677,16 @@ def _get_column_indices(X, key): else: columns = list(key) - return [all_columns.index(col) for col in columns] + try: + column_indices = [all_columns.index(col) for col in columns] + except ValueError as e: + if 'not in list' in str(e): + raise ValueError( + "A given feature is not a column of the dataframe" + ) from e + raise + + return column_indices else: raise ValueError("No valid specification of the columns. 
Only a " "scalar, list or slice of all integers or all " diff --git a/sklearn/inspection/partial_dependence.py b/sklearn/inspection/partial_dependence.py index 3191dcd7a1352..d9f2758dbcb21 100644 --- a/sklearn/inspection/partial_dependence.py +++ b/sklearn/inspection/partial_dependence.py @@ -11,10 +11,15 @@ from collections.abc import Iterable import numpy as np +from scipy import sparse from scipy.stats.mstats import mquantiles from joblib import Parallel, delayed +from ..compose._column_transformer import _get_column +from ..compose._column_transformer import _get_column_indices + from ..base import is_classifier, is_regressor +from ..pipeline import Pipeline from ..utils.extmath import cartesian from ..utils import check_array from ..utils import check_matplotlib_support # noqa @@ -70,16 +75,16 @@ def _grid_from_X(X, percentiles, grid_resolution): values = [] for feature in range(X.shape[1]): - uniques = np.unique(X[:, feature]) + uniques = np.unique(_get_column(X, feature)) if uniques.shape[0] < grid_resolution: # feature has low resolution use unique vals axis = uniques else: # create axis based on percentiles and grid resolution - emp_percentiles = mquantiles(X[:, feature], prob=percentiles, - axis=0) - if np.allclose(emp_percentiles[0], - emp_percentiles[1]): + emp_percentiles = mquantiles( + _get_column(X, feature), prob=percentiles, axis=0 + ) + if np.allclose(emp_percentiles[0], emp_percentiles[1]): raise ValueError( 'percentiles are too close to each other, ' 'unable to build the grid. Please choose percentiles ' @@ -146,7 +151,10 @@ def _partial_dependence_brute(est, grid, features, X, response_method): for new_values in grid: X_eval = X.copy() for i, variable in enumerate(features): - X_eval[:, variable] = new_values[i] + if hasattr(X_eval, 'iloc'): + X_eval.iloc[:, variable] = new_values[i] + else: + X_eval[:, variable] = new_values[i] try: predictions = prediction_method(X_eval) @@ -203,6 +211,7 @@ def partial_dependence(estimator, X, features, response_method='auto', ``X`` is used both to generate a grid of values for the ``features``, and to compute the averaged predictions when method is 'brute'. + # TODO: update the type accepted features : list or array-like of int The target features for which the partial dependency should be computed. @@ -284,16 +293,20 @@ def partial_dependence(estimator, X, features, response_method='auto', dependence values are incorrect for 'recursion'. """ + preprocessor = estimator[:-1] if isinstance(estimator, Pipeline) else None + final_estimator = (estimator[-1] if isinstance(estimator, Pipeline) + else estimator) - if not (is_classifier(estimator) or is_regressor(estimator)): + if not (is_classifier(final_estimator) or is_regressor(final_estimator)): raise ValueError( "'estimator' must be a fitted regressor or classifier.") - if (hasattr(estimator, 'classes_') and - isinstance(estimator.classes_[0], np.ndarray)): + if (hasattr(final_estimator, 'classes_') and + isinstance(final_estimator.classes_[0], np.ndarray)): raise ValueError('Multiclass-multioutput estimators are not supported') - X = check_array(X) + if not(hasattr(X, '__array__') or sparse.issparse(X)): + X = check_array(X, force_all_finite='allow-nan', dtype=np.object) accepted_responses = ('auto', 'predict_proba', 'decision_function') if response_method not in accepted_responses: @@ -301,7 +314,7 @@ def partial_dependence(estimator, X, features, response_method='auto', 'response_method {} is invalid. 
Accepted response_method names ' 'are {}.'.format(response_method, ', '.join(accepted_responses))) - if is_regressor(estimator) and response_method != 'auto': + if is_regressor(final_estimator) and response_method != 'auto': raise ValueError( "The response_method parameter is ignored for regressors and " "must be 'auto'." @@ -313,14 +326,14 @@ def partial_dependence(estimator, X, features, response_method='auto', method, ', '.join(accepted_methods))) if method == 'auto': - if (isinstance(estimator, BaseGradientBoosting) and - estimator.init is None): + if (isinstance(final_estimator, BaseGradientBoosting) and + final_estimator.init is None): method = 'recursion' else: method = 'brute' if method == 'recursion': - if not isinstance(estimator, BaseGradientBoosting): + if not isinstance(final_estimator, BaseGradientBoosting): raise ValueError( "'estimator' must be an instance of BaseGradientBoosting " "for the 'recursion' method. Try using method='brute'.") @@ -332,27 +345,31 @@ def partial_dependence(estimator, X, features, response_method='auto', "With the 'recursion' method, the response_method must be " "'decision_function'. Got {}.".format(response_method) ) - check_is_fitted(estimator, 'estimators_', + check_is_fitted(final_estimator, 'estimators_', msg="'estimator' parameter must be a fitted estimator") - # Note: if method is brute, this check is done at prediction time - n_features = estimator.n_features_ + + features_indices = np.asarray( + _get_column_indices(X, features), dtype=np.int32, order='C' + ).ravel() + + if method == 'recursion' and preprocessor is not None: + X_preprocessed = preprocessor.transform(X) else: - n_features = X.shape[1] + X_preprocessed = X - features = np.asarray(features, dtype=np.int32, order='C').ravel() - if any(not (0 <= f < n_features) for f in features): - raise ValueError('all features must be in [0, %d]' - % (n_features - 1)) + grid, values = _grid_from_X( + _get_column(X_preprocessed, features_indices), percentiles, + grid_resolution + ) - grid, values = _grid_from_X(X[:, features], percentiles, - grid_resolution) if method == 'brute': - averaged_predictions = _partial_dependence_brute(estimator, grid, - features, X, - response_method) + averaged_predictions = _partial_dependence_brute( + estimator, grid, features_indices, X, response_method + ) else: - averaged_predictions = _partial_dependence_recursion(estimator, grid, - features) + averaged_predictions = _partial_dependence_recursion( + final_estimator, grid, features_indices + ) # reshape averaged_predictions to # (n_outputs, n_values_feature_0, n_values_feature_1, ...) 
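
A minimal sketch (illustrative, made-up data) of why the ``iloc`` branch added to
_partial_dependence_brute above is needed: positional column assignment behaves
differently for NumPy arrays and pandas DataFrames.

    import numpy as np
    import pandas as pd

    X_arr = np.array([[1.0, 2.0], [3.0, 4.0]])
    X_df = pd.DataFrame(X_arr, columns=['a', 'b'])

    X_arr[:, 0] = 0.5       # positional assignment works directly on ndarrays
    X_df.iloc[:, 0] = 0.5   # DataFrames need .iloc for positional assignment
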
diff --git a/sklearn/inspection/tests/test_partial_dependence.py b/sklearn/inspection/tests/test_partial_dependence.py index 1ed74a2e3c347..88946aa770436 100644 --- a/sklearn/inspection/tests/test_partial_dependence.py +++ b/sklearn/inspection/tests/test_partial_dependence.py @@ -21,14 +21,17 @@ from sklearn.datasets import load_boston, load_iris from sklearn.datasets import make_classification, make_regression from sklearn.cluster import KMeans +from sklearn.compose import make_column_transformer from sklearn.metrics import r2_score from sklearn.preprocessing import PolynomialFeatures from sklearn.preprocessing import StandardScaler +from sklearn.preprocessing import RobustScaler from sklearn.pipeline import make_pipeline from sklearn.dummy import DummyClassifier -from sklearn.base import BaseEstimator, ClassifierMixin +from sklearn.base import BaseEstimator, ClassifierMixin, clone from sklearn.utils.testing import assert_allclose from sklearn.utils.testing import assert_array_equal +from sklearn.utils.testing import skip_if_no_pandas # toy sample @@ -44,6 +47,9 @@ regression_data = (make_regression(random_state=0), 1) multioutput_regression_data = (make_regression(n_targets=2, random_state=0), 2) +# iris +iris = load_iris() + @pytest.mark.parametrize('Estimator, method, data', [ (GradientBoostingClassifier, 'recursion', binary_classification_data), @@ -244,7 +250,6 @@ def test_partial_dependence_easy_target(est, power): assert r2 > .99 -@pytest.mark.filterwarnings('ignore:The default value of ') # 0.22 @pytest.mark.parametrize('Estimator', (sklearn.tree.DecisionTreeClassifier, sklearn.tree.ExtraTreeClassifier, @@ -321,16 +326,32 @@ def test_partial_dependence_error(estimator, params, err_msg): 'estimator', [LinearRegression(), GradientBoostingClassifier(random_state=0)] ) -@pytest.mark.parametrize('features', [-1, 1000000]) -def test_partial_dependence_unknown_feature(estimator, features): +def test_partial_dependence_unknown_feature_indices(estimator): X, y = make_classification(random_state=0) estimator.fit(X, y) + features = 100000 err_msg = 'all features must be in' with pytest.raises(ValueError, match=err_msg): partial_dependence(estimator, X, [features]) +@pytest.mark.parametrize( + 'estimator', + [LinearRegression(), GradientBoostingClassifier(random_state=0)] +) +def test_partial_dependence_unknown_feature_string(estimator): + pd = pytest.importorskip("pandas") + X, y = make_classification(random_state=0) + df = pd.DataFrame(X) + estimator.fit(df, y) + + features = 'random' + err_msg = 'A given feature is not a column of the dataframe' + with pytest.raises(ValueError, match=err_msg): + partial_dependence(estimator, df, [features]) + + @pytest.mark.parametrize( 'estimator', [LinearRegression(), GradientBoostingClassifier(random_state=0)] @@ -420,6 +441,44 @@ def test_partial_dependence_pipeline(): ) +@pytest.mark.parametrize( + "estimator", + [LogisticRegression(max_iter=1000, random_state=0), + GradientBoostingClassifier(random_state=0, n_estimators=5)], + ids=['estimator-brute', 'estimator-recursion'] +) +@pytest.mark.parametrize( + "preprocessor", + [None, + make_column_transformer((StandardScaler(), iris.feature_names[:2]), + (RobustScaler(), iris.feature_names[2:]))], + ids=['None', 'column-transformer'] +) +@pytest.mark.parametrize( + "features", + [[0, 1], iris.feature_names[:2]], + ids=['features-integer', 'features-string'] +) +def test_partial_dependence_dataframe(estimator, preprocessor, features): + # check that the partial dependence support dataframe + pd = 
pytest.importorskip("pandas") + df = pd.DataFrame(iris.data, columns=iris.feature_names) + + pipe = make_pipeline(preprocessor, estimator) + pipe.fit(df, iris.target) + pdp_pipe, values_pipe = partial_dependence(pipe, df, features=features) + + X_preprocessed = (clone(preprocessor).fit_transform(df) + if preprocessor is not None else df.values) + clf = clone(estimator).fit(X_preprocessed, iris.target) + pdp_clf, values_clf = partial_dependence( + clf, X_preprocessed, features=[0, 1] + ) + + assert_allclose(pdp_pipe, pdp_clf) + # assert_allclose(values_pipe, values_clf) + + def test_plot_partial_dependence(pyplot): # Test partial dependence plot function. boston = load_boston() diff --git a/sklearn/utils/testing.py b/sklearn/utils/testing.py index 3ad4096c1d091..6b3f5649ea08e 100644 --- a/sklearn/utils/testing.py +++ b/sklearn/utils/testing.py @@ -581,6 +581,17 @@ def set_random_state(estimator, random_state=0): skip_if_no_parallel = pytest.mark.skipif(not joblib.parallel.mp, reason="joblib is in serial mode") + def has_pandas(): + try: + import pandas + return True + except ImportError: + return False + + skip_if_no_pandas = pytest.mark.skipif( + not has_pandas, reason="pandas is not installed" + ) + # Decorator for tests involving both BLAS calls and multiprocessing. # # Under POSIX (e.g. Linux or OSX), using multiprocessing in conjunction From 133c11696035dae57834b837d3983dee8c595fcc Mon Sep 17 00:00:00 2001 From: Guillaume Lemaitre Date: Thu, 6 Jun 2019 14:47:24 +0200 Subject: [PATCH 03/86] revert to brute method for pipeline --- sklearn/inspection/partial_dependence.py | 31 ++++++---------- .../tests/test_partial_dependence.py | 35 +++++++++++++------ sklearn/utils/testing.py | 11 ------ 3 files changed, 35 insertions(+), 42 deletions(-) diff --git a/sklearn/inspection/partial_dependence.py b/sklearn/inspection/partial_dependence.py index d9f2758dbcb21..ead089780cd3d 100644 --- a/sklearn/inspection/partial_dependence.py +++ b/sklearn/inspection/partial_dependence.py @@ -19,7 +19,6 @@ from ..compose._column_transformer import _get_column_indices from ..base import is_classifier, is_regressor -from ..pipeline import Pipeline from ..utils.extmath import cartesian from ..utils import check_array from ..utils import check_matplotlib_support # noqa @@ -293,16 +292,12 @@ def partial_dependence(estimator, X, features, response_method='auto', dependence values are incorrect for 'recursion'. """ - preprocessor = estimator[:-1] if isinstance(estimator, Pipeline) else None - final_estimator = (estimator[-1] if isinstance(estimator, Pipeline) - else estimator) - - if not (is_classifier(final_estimator) or is_regressor(final_estimator)): + if not (is_classifier(estimator) or is_regressor(estimator)): raise ValueError( "'estimator' must be a fitted regressor or classifier.") - if (hasattr(final_estimator, 'classes_') and - isinstance(final_estimator.classes_[0], np.ndarray)): + if (hasattr(estimator, 'classes_') and + isinstance(estimator.classes_[0], np.ndarray)): raise ValueError('Multiclass-multioutput estimators are not supported') if not(hasattr(X, '__array__') or sparse.issparse(X)): @@ -314,7 +309,7 @@ def partial_dependence(estimator, X, features, response_method='auto', 'response_method {} is invalid. 
Accepted response_method names ' 'are {}.'.format(response_method, ', '.join(accepted_responses))) - if is_regressor(final_estimator) and response_method != 'auto': + if is_regressor(estimator) and response_method != 'auto': raise ValueError( "The response_method parameter is ignored for regressors and " "must be 'auto'." @@ -326,14 +321,14 @@ def partial_dependence(estimator, X, features, response_method='auto', method, ', '.join(accepted_methods))) if method == 'auto': - if (isinstance(final_estimator, BaseGradientBoosting) and - final_estimator.init is None): + if (isinstance(estimator, BaseGradientBoosting) and + estimator.init is None): method = 'recursion' else: method = 'brute' if method == 'recursion': - if not isinstance(final_estimator, BaseGradientBoosting): + if not isinstance(estimator, BaseGradientBoosting): raise ValueError( "'estimator' must be an instance of BaseGradientBoosting " "for the 'recursion' method. Try using method='brute'.") @@ -345,21 +340,15 @@ def partial_dependence(estimator, X, features, response_method='auto', "With the 'recursion' method, the response_method must be " "'decision_function'. Got {}.".format(response_method) ) - check_is_fitted(final_estimator, 'estimators_', + check_is_fitted(estimator, 'estimators_', msg="'estimator' parameter must be a fitted estimator") features_indices = np.asarray( _get_column_indices(X, features), dtype=np.int32, order='C' ).ravel() - if method == 'recursion' and preprocessor is not None: - X_preprocessed = preprocessor.transform(X) - else: - X_preprocessed = X - grid, values = _grid_from_X( - _get_column(X_preprocessed, features_indices), percentiles, - grid_resolution + _get_column(X, features_indices), percentiles, grid_resolution ) if method == 'brute': @@ -368,7 +357,7 @@ def partial_dependence(estimator, X, features, response_method='auto', ) else: averaged_predictions = _partial_dependence_recursion( - final_estimator, grid, features_indices + estimator, grid, features_indices ) # reshape averaged_predictions to diff --git a/sklearn/inspection/tests/test_partial_dependence.py b/sklearn/inspection/tests/test_partial_dependence.py index 88946aa770436..5a3b82dc05f27 100644 --- a/sklearn/inspection/tests/test_partial_dependence.py +++ b/sklearn/inspection/tests/test_partial_dependence.py @@ -31,7 +31,6 @@ from sklearn.base import BaseEstimator, ClassifierMixin, clone from sklearn.utils.testing import assert_allclose from sklearn.utils.testing import assert_array_equal -from sklearn.utils.testing import skip_if_no_pandas # toy sample @@ -437,7 +436,7 @@ def test_partial_dependence_pipeline(): assert_allclose(pdp_pipe, pdp_clf) assert_allclose( values_pipe[0], - (values_clf[0] * scaler.scale_[features]) + scaler.mean_[features] + values_clf[0] * scaler.scale_[features] + scaler.mean_[features] ) @@ -450,13 +449,14 @@ def test_partial_dependence_pipeline(): @pytest.mark.parametrize( "preprocessor", [None, - make_column_transformer((StandardScaler(), iris.feature_names[:2]), - (RobustScaler(), iris.feature_names[2:]))], + make_column_transformer( + (StandardScaler(), [iris.feature_names[i] for i in (0, 2)]), + (RobustScaler(), [iris.feature_names[i] for i in (1, 3)]))], ids=['None', 'column-transformer'] ) @pytest.mark.parametrize( "features", - [[0, 1], iris.feature_names[:2]], + [[0, 2], [iris.feature_names[i] for i in (0, 2)]], ids=['features-integer', 'features-string'] ) def test_partial_dependence_dataframe(estimator, preprocessor, features): @@ -468,15 +468,30 @@ def test_partial_dependence_dataframe(estimator, 
preprocessor, features): pipe.fit(df, iris.target) pdp_pipe, values_pipe = partial_dependence(pipe, df, features=features) - X_preprocessed = (clone(preprocessor).fit_transform(df) - if preprocessor is not None else df.values) - clf = clone(estimator).fit(X_preprocessed, iris.target) + # the column transformer will reorder the column when transforming + # we mixed the index to be sure that we are computing the partial + # dependence of the right columns + if preprocessor is not None: + X_proc = clone(preprocessor).fit_transform(df) + features_clf = [0, 1] + else: + X_proc = df + features_clf = [0, 2] + + clf = clone(estimator).fit(X_proc, iris.target) pdp_clf, values_clf = partial_dependence( - clf, X_preprocessed, features=[0, 1] + clf, X_proc, features=features_clf, method='brute' ) assert_allclose(pdp_pipe, pdp_clf) - # assert_allclose(values_pipe, values_clf) + if preprocessor is not None: + scaler = preprocessor.named_transformers_['standardscaler'] + assert_allclose( + values_pipe[1], + values_clf[1] * scaler.scale_[1] + scaler.mean_[1] + ) + else: + assert_allclose(values_pipe[1], values_clf[1]) def test_plot_partial_dependence(pyplot): diff --git a/sklearn/utils/testing.py b/sklearn/utils/testing.py index 6b3f5649ea08e..3ad4096c1d091 100644 --- a/sklearn/utils/testing.py +++ b/sklearn/utils/testing.py @@ -581,17 +581,6 @@ def set_random_state(estimator, random_state=0): skip_if_no_parallel = pytest.mark.skipif(not joblib.parallel.mp, reason="joblib is in serial mode") - def has_pandas(): - try: - import pandas - return True - except ImportError: - return False - - skip_if_no_pandas = pytest.mark.skipif( - not has_pandas, reason="pandas is not installed" - ) - # Decorator for tests involving both BLAS calls and multiprocessing. # # Under POSIX (e.g. Linux or OSX), using multiprocessing in conjunction From 79156f31c0a5eb1c3dfa0da64e6087d0df365adf Mon Sep 17 00:00:00 2001 From: Guillaume Lemaitre Date: Thu, 6 Jun 2019 15:21:00 +0200 Subject: [PATCH 04/86] refactor common part with columntransformer --- sklearn/compose/_column_transformer.py | 144 +----------------- sklearn/inspection/partial_dependence.py | 14 +- sklearn/utils/__init__.py | 179 ++++++++++++++++++++++- 3 files changed, 188 insertions(+), 149 deletions(-) diff --git a/sklearn/compose/_column_transformer.py b/sklearn/compose/_column_transformer.py index 5c234611e6aa5..b6521b6f0270b 100644 --- a/sklearn/compose/_column_transformer.py +++ b/sklearn/compose/_column_transformer.py @@ -17,6 +17,8 @@ from ..pipeline import _fit_transform_one, _transform_one, _name_estimators from ..preprocessing import FunctionTransformer from ..utils import Bunch +from ..utils import safe_indexing +from ..utils import _get_column_indices from ..utils.metaestimators import _BaseComposition from ..utils.validation import check_array, check_is_fitted @@ -402,7 +404,7 @@ def _fit_transform(self, X, y, func, fitted=False): return Parallel(n_jobs=self.n_jobs)( delayed(func)( transformer=clone(trans) if not fitted else trans, - X=_get_column(X, column), + X=safe_indexing(X, column, axis=1), y=y, weight=weight, message_clsname='ColumnTransformer', @@ -553,146 +555,6 @@ def _check_X(X): return check_array(X, force_all_finite='allow-nan', dtype=np.object) -def _check_key_type(key, superclass): - """ - Check that scalar, list or slice is of a certain type. - - This is only used in _get_column and _get_column_indices to check - if the `key` (column specification) is fully integer or fully string-like. 
- - Parameters - ---------- - key : scalar, list, slice, array-like - The column specification to check - superclass : int or str - The type for which to check the `key` - - """ - if isinstance(key, superclass): - return True - if isinstance(key, slice): - return (isinstance(key.start, (superclass, type(None))) and - isinstance(key.stop, (superclass, type(None)))) - if isinstance(key, list): - return all(isinstance(x, superclass) for x in key) - if hasattr(key, 'dtype'): - if superclass is int: - return key.dtype.kind == 'i' - else: - # superclass = str - return key.dtype.kind in ('O', 'U', 'S') - return False - - -def _get_column(X, key): - """ - Get feature column(s) from input data X. - - Supported input types (X): numpy arrays, sparse arrays and DataFrames - - Supported key types (key): - - scalar: output is 1D - - lists, slices, boolean masks: output is 2D - - callable that returns any of the above - - Supported key data types: - - - integer or boolean mask (positional): - - supported for arrays, sparse matrices and dataframes - - string (key-based): - - only supported for dataframes - - So no keys other than strings are allowed (while in principle you - can use any hashable object as key). - - """ - # check whether we have string column names or integers - if _check_key_type(key, int): - column_names = False - elif _check_key_type(key, str): - column_names = True - elif hasattr(key, 'dtype') and np.issubdtype(key.dtype, np.bool_): - # boolean mask - column_names = False - if hasattr(X, 'loc'): - # pandas boolean masks don't work with iloc, so take loc path - column_names = True - else: - raise ValueError("No valid specification of the columns. Only a " - "scalar, list or slice of all integers or all " - "strings, or boolean mask is allowed") - - if column_names: - if hasattr(X, 'loc'): - # pandas dataframes - return X.loc[:, key] - else: - raise ValueError("Specifying the columns using strings is only " - "supported for pandas DataFrames") - else: - if hasattr(X, 'iloc'): - # pandas dataframes - return X.iloc[:, key] - else: - # numpy arrays, sparse arrays - return X[:, key] - - -def _get_column_indices(X, key): - """ - Get feature column indices for input data X and key. 
- - For accepted values of `key`, see the docstring of _get_column - - """ - n_columns = X.shape[1] - - if (_check_key_type(key, int) - or hasattr(key, 'dtype') and np.issubdtype(key.dtype, np.bool_)): - # Convert key into positive indexes - try: - idx = np.arange(n_columns)[key] - except IndexError as e: - raise ValueError( - 'all features must be in [0, %d]' % (n_columns - 1) - ) from e - return np.atleast_1d(idx).tolist() - elif _check_key_type(key, str): - try: - all_columns = list(X.columns) - except AttributeError: - raise ValueError("Specifying the columns using strings is only " - "supported for pandas DataFrames") - if isinstance(key, str): - columns = [key] - elif isinstance(key, slice): - start, stop = key.start, key.stop - if start is not None: - start = all_columns.index(start) - if stop is not None: - # pandas indexing with strings is endpoint included - stop = all_columns.index(stop) + 1 - else: - stop = n_columns + 1 - return list(range(n_columns)[slice(start, stop)]) - else: - columns = list(key) - - try: - column_indices = [all_columns.index(col) for col in columns] - except ValueError as e: - if 'not in list' in str(e): - raise ValueError( - "A given feature is not a column of the dataframe" - ) from e - raise - - return column_indices - else: - raise ValueError("No valid specification of the columns. Only a " - "scalar, list or slice of all integers or all " - "strings, or boolean mask is allowed") - - def _is_empty_column_selection(column): """ Return True if the column selection is empty (empty list or all-False diff --git a/sklearn/inspection/partial_dependence.py b/sklearn/inspection/partial_dependence.py index ead089780cd3d..2a10ba78d0a2d 100644 --- a/sklearn/inspection/partial_dependence.py +++ b/sklearn/inspection/partial_dependence.py @@ -15,13 +15,12 @@ from scipy.stats.mstats import mquantiles from joblib import Parallel, delayed -from ..compose._column_transformer import _get_column -from ..compose._column_transformer import _get_column_indices - from ..base import is_classifier, is_regressor from ..utils.extmath import cartesian from ..utils import check_array from ..utils import check_matplotlib_support # noqa +from ..utils import safe_indexing +from ..utils import _get_column_indices from ..utils.validation import check_is_fitted from ..tree._tree import DTYPE from ..exceptions import NotFittedError @@ -74,14 +73,14 @@ def _grid_from_X(X, percentiles, grid_resolution): values = [] for feature in range(X.shape[1]): - uniques = np.unique(_get_column(X, feature)) + uniques = np.unique(safe_indexing(X, feature, axis=1)) if uniques.shape[0] < grid_resolution: # feature has low resolution use unique vals axis = uniques else: # create axis based on percentiles and grid resolution emp_percentiles = mquantiles( - _get_column(X, feature), prob=percentiles, axis=0 + safe_indexing(X, feature, axis=1), prob=percentiles, axis=0 ) if np.allclose(emp_percentiles[0], emp_percentiles[1]): raise ValueError( @@ -301,7 +300,7 @@ def partial_dependence(estimator, X, features, response_method='auto', raise ValueError('Multiclass-multioutput estimators are not supported') if not(hasattr(X, '__array__') or sparse.issparse(X)): - X = check_array(X, force_all_finite='allow-nan', dtype=np.object) + return check_array(X, force_all_finite='allow-nan', dtype=np.object) accepted_responses = ('auto', 'predict_proba', 'decision_function') if response_method not in accepted_responses: @@ -348,7 +347,8 @@ def partial_dependence(estimator, X, features, response_method='auto', ).ravel() grid, 
values = _grid_from_X( - _get_column(X, features_indices), percentiles, grid_resolution + safe_indexing(X, features_indices, axis=1), percentiles, + grid_resolution ) if method == 'brute': diff --git a/sklearn/utils/__init__.py b/sklearn/utils/__init__.py index daf7e7763235d..3c610e94e7b34 100644 --- a/sklearn/utils/__init__.py +++ b/sklearn/utils/__init__.py @@ -178,7 +178,44 @@ def axis0_safe_slice(X, mask, len_mask): return np.zeros(shape=(0, X.shape[1])) -def safe_indexing(X, indices): +def safe_indexing(X, indices, axis=0): + """Return rows, items or columns of X using indices. + + Parameters + ---------- + X : array-like, sparse-matrix, list, pandas.DataFrame, pandas.Series. + Data from which to sample rows, items or columns. + indices : + When ``axis=0``, indices need to be an array of integer. + When ``axis=1``, indices can be one of: + Supported key types (key): + - scalar: output is 1D + - lists, slices, boolean masks: output is 2D + - callable that returns any of the above + + Supported key data types: + + - integer or boolean mask (positional): + - supported for arrays, sparse matrices and dataframes + - string (key-based): + - only supported for dataframes + - So no keys other than strings are allowed (while in principle you + can use any hashable object as key). + axis : int, default=0 + The axis along which the X will be subsampled. ``axis=0`` will select + rows while ``axis=1`` will select columns. + + Notes + ----- + CSR, CSC, and LIL sparse matrices are supported. COO sparse matrices are + not supported. + """ + if axis ==0: + return _safe_indexing_row(X, indices) + return _safe_indexing_column(X, indices) + + +def _safe_indexing_row(X, indices): """Return items or rows from X using indices. Allows simple indexing of lists or arrays. @@ -223,6 +260,146 @@ def safe_indexing(X, indices): return [X[idx] for idx in indices] +def _check_key_type(key, superclass): + """ + Check that scalar, list or slice is of a certain type. + + This is only used in _get_column and _get_column_indices to check + if the `key` (column specification) is fully integer or fully string-like. + + Parameters + ---------- + key : scalar, list, slice, array-like + The column specification to check + superclass : int or str + The type for which to check the `key` + + """ + if isinstance(key, superclass): + return True + if isinstance(key, slice): + return (isinstance(key.start, (superclass, type(None))) and + isinstance(key.stop, (superclass, type(None)))) + if isinstance(key, list): + return all(isinstance(x, superclass) for x in key) + if hasattr(key, 'dtype'): + if superclass is int: + return key.dtype.kind == 'i' + else: + # superclass = str + return key.dtype.kind in ('O', 'U', 'S') + return False + + +def _safe_indexing_column(X, key): + """ + Get feature column(s) from input data X. + + Supported input types (X): numpy arrays, sparse arrays and DataFrames + + Supported key types (key): + - scalar: output is 1D + - lists, slices, boolean masks: output is 2D + - callable that returns any of the above + + Supported key data types: + + - integer or boolean mask (positional): + - supported for arrays, sparse matrices and dataframes + - string (key-based): + - only supported for dataframes + - So no keys other than strings are allowed (while in principle you + can use any hashable object as key). 
+ + """ + # check whether we have string column names or integers + if _check_key_type(key, int): + column_names = False + elif _check_key_type(key, str): + column_names = True + elif hasattr(key, 'dtype') and np.issubdtype(key.dtype, np.bool_): + # boolean mask + column_names = False + if hasattr(X, 'loc'): + # pandas boolean masks don't work with iloc, so take loc path + column_names = True + else: + raise ValueError("No valid specification of the columns. Only a " + "scalar, list or slice of all integers or all " + "strings, or boolean mask is allowed") + + if column_names: + if hasattr(X, 'loc'): + # pandas dataframes + return X.loc[:, key] + else: + raise ValueError("Specifying the columns using strings is only " + "supported for pandas DataFrames") + else: + if hasattr(X, 'iloc'): + # pandas dataframes + return X.iloc[:, key] + else: + # numpy arrays, sparse arrays + return X[:, key] + + +def _get_column_indices(X, key): + """ + Get feature column indices for input data X and key. + + For accepted values of `key`, see the docstring of _get_column + + """ + n_columns = X.shape[1] + + if (_check_key_type(key, int) + or hasattr(key, 'dtype') and np.issubdtype(key.dtype, np.bool_)): + # Convert key into positive indexes + try: + idx = np.arange(n_columns)[key] + except IndexError as e: + raise ValueError( + 'all features must be in [0, %d]' % (n_columns - 1) + ) from e + return np.atleast_1d(idx).tolist() + elif _check_key_type(key, str): + try: + all_columns = list(X.columns) + except AttributeError: + raise ValueError("Specifying the columns using strings is only " + "supported for pandas DataFrames") + if isinstance(key, str): + columns = [key] + elif isinstance(key, slice): + start, stop = key.start, key.stop + if start is not None: + start = all_columns.index(start) + if stop is not None: + # pandas indexing with strings is endpoint included + stop = all_columns.index(stop) + 1 + else: + stop = n_columns + 1 + return list(range(n_columns)[slice(start, stop)]) + else: + columns = list(key) + + try: + column_indices = [all_columns.index(col) for col in columns] + except ValueError as e: + if 'not in list' in str(e): + raise ValueError( + "A given feature is not a column of the dataframe" + ) from e + raise + + return column_indices + else: + raise ValueError("No valid specification of the columns. 
Only a " + "scalar, list or slice of all integers or all " + "strings, or boolean mask is allowed") + + def resample(*arrays, **options): """Resample arrays or sparse matrices in a consistent way From 59cb6f5960eb2ad9146efc6e7d9463e3a3edcd69 Mon Sep 17 00:00:00 2001 From: Guillaume Lemaitre Date: Thu, 6 Jun 2019 15:29:17 +0200 Subject: [PATCH 05/86] fix --- sklearn/inspection/partial_dependence.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/sklearn/inspection/partial_dependence.py b/sklearn/inspection/partial_dependence.py index 2a10ba78d0a2d..b4ebb3e3b0139 100644 --- a/sklearn/inspection/partial_dependence.py +++ b/sklearn/inspection/partial_dependence.py @@ -300,7 +300,7 @@ def partial_dependence(estimator, X, features, response_method='auto', raise ValueError('Multiclass-multioutput estimators are not supported') if not(hasattr(X, '__array__') or sparse.issparse(X)): - return check_array(X, force_all_finite='allow-nan', dtype=np.object) + X = check_array(X, force_all_finite='allow-nan', dtype=np.object) accepted_responses = ('auto', 'predict_proba', 'decision_function') if response_method not in accepted_responses: @@ -313,6 +313,7 @@ def partial_dependence(estimator, X, features, response_method='auto', "The response_method parameter is ignored for regressors and " "must be 'auto'." ) + accepted_methods = ('brute', 'recursion', 'auto') if method not in accepted_methods: raise ValueError( From cb4b00b1be286b6ade496465286df2b1f2b5dc98 Mon Sep 17 00:00:00 2001 From: Guillaume Lemaitre Date: Thu, 6 Jun 2019 16:38:00 +0200 Subject: [PATCH 06/86] TST check the support of different types for features --- .../tests/test_partial_dependence.py | 23 +++++++++++++++++++ sklearn/utils/__init__.py | 2 +- 2 files changed, 24 insertions(+), 1 deletion(-) diff --git a/sklearn/inspection/tests/test_partial_dependence.py b/sklearn/inspection/tests/test_partial_dependence.py index 5a3b82dc05f27..cad8908591457 100644 --- a/sklearn/inspection/tests/test_partial_dependence.py +++ b/sklearn/inspection/tests/test_partial_dependence.py @@ -494,6 +494,29 @@ def test_partial_dependence_dataframe(estimator, preprocessor, features): assert_allclose(values_pipe[1], values_clf[1]) +@pytest.mark.parametrize( + "features", + [0, iris.feature_names[0], + [0, 2], [iris.feature_names[i] for i in (0, 2)], + slice(0, 2, 1), [True, False, True, False]], + ids=['scalar-int', 'scalar-str', 'list-int', 'list-str', 'slice', 'mask'] +) +def test_partial_dependence_feature_type(features): + # check all possible features type supported in PDP + pd = pytest.importorskip("pandas") + df = pd.DataFrame(iris.data, columns=iris.feature_names) + + preprocessor = make_column_transformer( + (StandardScaler(), [iris.feature_names[i] for i in (0, 2)]), + (RobustScaler(), [iris.feature_names[i] for i in (1, 3)]) + ) + pipe = make_pipeline( + preprocessor, LogisticRegression(max_iter=1000, random_state=0) + ) + pipe.fit(df, iris.target) + pdp_pipe, values_pipe = partial_dependence(pipe, df, features=features) + + def test_plot_partial_dependence(pyplot): # Test partial dependence plot function. boston = load_boston() diff --git a/sklearn/utils/__init__.py b/sklearn/utils/__init__.py index 3c610e94e7b34..98ee5af8d9349 100644 --- a/sklearn/utils/__init__.py +++ b/sklearn/utils/__init__.py @@ -210,7 +210,7 @@ def safe_indexing(X, indices, axis=0): CSR, CSC, and LIL sparse matrices are supported. COO sparse matrices are not supported. 
""" - if axis ==0: + if axis == 0: return _safe_indexing_row(X, indices) return _safe_indexing_column(X, indices) From c04dcba72bc0d0fcadb70200f693a8de0e0b3056 Mon Sep 17 00:00:00 2001 From: Guillaume Lemaitre Date: Thu, 18 Jul 2019 15:09:31 +0200 Subject: [PATCH 07/86] problem merge --- sklearn/inspection/tests/test_partial_dependence.py | 3 --- 1 file changed, 3 deletions(-) diff --git a/sklearn/inspection/tests/test_partial_dependence.py b/sklearn/inspection/tests/test_partial_dependence.py index c4d9f64796cbc..5b0b6a14fcf62 100644 --- a/sklearn/inspection/tests/test_partial_dependence.py +++ b/sklearn/inspection/tests/test_partial_dependence.py @@ -30,11 +30,8 @@ from sklearn.pipeline import make_pipeline from sklearn.preprocessing import PolynomialFeatures from sklearn.preprocessing import StandardScaler -<<<<<<< HEAD from sklearn.preprocessing import RobustScaler from sklearn.pipeline import make_pipeline -======= ->>>>>>> origin/master from sklearn.dummy import DummyClassifier from sklearn.base import BaseEstimator, ClassifierMixin, clone from sklearn.utils.testing import assert_allclose From 0326a88aaf67dc7723d7eea97eebdeea7b36f65f Mon Sep 17 00:00:00 2001 From: Guillaume Lemaitre Date: Fri, 19 Jul 2019 16:21:49 +0200 Subject: [PATCH 08/86] PEP8 --- sklearn/inspection/partial_dependence.py | 2 +- sklearn/inspection/tests/test_partial_dependence.py | 1 - 2 files changed, 1 insertion(+), 2 deletions(-) diff --git a/sklearn/inspection/partial_dependence.py b/sklearn/inspection/partial_dependence.py index 311fd57d44454..c778c15887015 100644 --- a/sklearn/inspection/partial_dependence.py +++ b/sklearn/inspection/partial_dependence.py @@ -362,7 +362,7 @@ def partial_dependence(estimator, X, features, response_method='auto', grid, values = _grid_from_X( safe_indexing(X, features_indices, axis=1), percentiles, - grid_resolution + grid_resolution ) if method == 'brute': diff --git a/sklearn/inspection/tests/test_partial_dependence.py b/sklearn/inspection/tests/test_partial_dependence.py index 5b0b6a14fcf62..446e28cd4c939 100644 --- a/sklearn/inspection/tests/test_partial_dependence.py +++ b/sklearn/inspection/tests/test_partial_dependence.py @@ -27,7 +27,6 @@ from sklearn.cluster import KMeans from sklearn.compose import make_column_transformer from sklearn.metrics import r2_score -from sklearn.pipeline import make_pipeline from sklearn.preprocessing import PolynomialFeatures from sklearn.preprocessing import StandardScaler from sklearn.preprocessing import RobustScaler From 2f0f69049f460195a71e3a102b301de3014c05a6 Mon Sep 17 00:00:00 2001 From: Guillaume Lemaitre Date: Fri, 19 Jul 2019 16:51:30 +0200 Subject: [PATCH 09/86] issue merge --- sklearn/utils/__init__.py | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/sklearn/utils/__init__.py b/sklearn/utils/__init__.py index 8cb9a16918ff5..efcaf6865faa5 100644 --- a/sklearn/utils/__init__.py +++ b/sklearn/utils/__init__.py @@ -407,6 +407,12 @@ def _get_column_indices(X, key): ) from e raise + return column_indices + else: + raise ValueError("No valid specification of the columns. 
Only a " + "scalar, list or slice of all integers or all " + "strings, or boolean mask is allowed") + def resample(*arrays, **options): """Resample arrays or sparse matrices in a consistent way From 33e655d842c9b11e6dfbb4ce7a7aee3e5c9e5920 Mon Sep 17 00:00:00 2001 From: Guillaume Lemaitre Date: Fri, 19 Jul 2019 18:03:44 +0200 Subject: [PATCH 10/86] iter --- sklearn/inspection/partial_dependence.py | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) diff --git a/sklearn/inspection/partial_dependence.py b/sklearn/inspection/partial_dependence.py index c778c15887015..6a9649e1bad16 100644 --- a/sklearn/inspection/partial_dependence.py +++ b/sklearn/inspection/partial_dependence.py @@ -353,8 +353,12 @@ def partial_dependence(estimator, X, features, response_method='auto', "With the 'recursion' method, the response_method must be " "'decision_function'. Got {}.".format(response_method) ) - check_is_fitted(estimator, 'estimators_', - msg="'estimator' parameter must be a fitted estimator") + msg="'estimator' parameter must be a fitted estimator" + if isinstance(estimator, BaseGradientBoosting): + fitted_attribute = 'estimators_' + else: + fitted_attribute = 'n_iter_' + check_is_fitted(estimator, fitted_attribute, msg=msg) features_indices = np.asarray( _get_column_indices(X, features), dtype=np.int32, order='C' From 01947170924b7e6d4b9b26c24f8899cb35487c22 Mon Sep 17 00:00:00 2001 From: Guillaume Lemaitre Date: Mon, 22 Jul 2019 14:32:59 +0200 Subject: [PATCH 11/86] fix --- sklearn/inspection/partial_dependence.py | 2 +- sklearn/inspection/tests/test_partial_dependence.py | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/sklearn/inspection/partial_dependence.py b/sklearn/inspection/partial_dependence.py index 6a9649e1bad16..38173a5e4c8f6 100644 --- a/sklearn/inspection/partial_dependence.py +++ b/sklearn/inspection/partial_dependence.py @@ -353,7 +353,7 @@ def partial_dependence(estimator, X, features, response_method='auto', "With the 'recursion' method, the response_method must be " "'decision_function'. 
Got {}.".format(response_method) ) - msg="'estimator' parameter must be a fitted estimator" + msg ="'estimator' parameter must be a fitted estimator" if isinstance(estimator, BaseGradientBoosting): fitted_attribute = 'estimators_' else: diff --git a/sklearn/inspection/tests/test_partial_dependence.py b/sklearn/inspection/tests/test_partial_dependence.py index 446e28cd4c939..d494a05f599b7 100644 --- a/sklearn/inspection/tests/test_partial_dependence.py +++ b/sklearn/inspection/tests/test_partial_dependence.py @@ -366,7 +366,7 @@ def test_partial_dependence_unknown_feature_string(estimator): estimator.fit(df, y) features = 'random' - err_msg = 'A given feature is not a column of the dataframe' + err_msg = 'A given column is not a column of the dataframe' with pytest.raises(ValueError, match=err_msg): partial_dependence(estimator, df, [features]) From 72ee546fceb1fb3582ea16460d32dc2a8ee515c5 Mon Sep 17 00:00:00 2001 From: Guillaume Lemaitre Date: Mon, 22 Jul 2019 14:34:54 +0200 Subject: [PATCH 12/86] PEP8 --- sklearn/inspection/partial_dependence.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/sklearn/inspection/partial_dependence.py b/sklearn/inspection/partial_dependence.py index 38173a5e4c8f6..4fb6231580053 100644 --- a/sklearn/inspection/partial_dependence.py +++ b/sklearn/inspection/partial_dependence.py @@ -353,7 +353,7 @@ def partial_dependence(estimator, X, features, response_method='auto', "With the 'recursion' method, the response_method must be " "'decision_function'. Got {}.".format(response_method) ) - msg ="'estimator' parameter must be a fitted estimator" + msg = "'estimator' parameter must be a fitted estimator" if isinstance(estimator, BaseGradientBoosting): fitted_attribute = 'estimators_' else: From db25ee6246412086d5045d2e896dca6dece95bfe Mon Sep 17 00:00:00 2001 From: Guillaume Lemaitre Date: Tue, 23 Jul 2019 18:08:22 +0200 Subject: [PATCH 13/86] update docstring --- sklearn/inspection/partial_dependence.py | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/sklearn/inspection/partial_dependence.py b/sklearn/inspection/partial_dependence.py index 4fb6231580053..b5228a75eab59 100644 --- a/sklearn/inspection/partial_dependence.py +++ b/sklearn/inspection/partial_dependence.py @@ -186,11 +186,10 @@ def partial_dependence(estimator, X, features, response_method='auto', A fitted estimator object implementing `predict`, `predict_proba`, or `decision_function`. Multioutput-multiclass classifiers are not supported. - X : array-like, shape (n_samples, n_features) + X : array-like or DataFrame, shape (n_samples, n_features) ``X`` is used both to generate a grid of values for the ``features``, and to compute the averaged predictions when method is 'brute'. - # TODO: update the type accepted features : list or array-like of int The target features for which the partial dependency should be computed. @@ -404,7 +403,7 @@ def plot_partial_dependence(estimator, X, features, feature_names=None, A fitted estimator object implementing `predict`, `predict_proba`, or `decision_function`. Multioutput-multiclass classifiers are not supported. - X : array-like, shape (n_samples, n_features) + X : array-like or DataFrame, shape (n_samples, n_features) The data to use to build the grid of values on which the dependence will be evaluated. This is usually the training data. 
features : list of {int, str, pair of int, pair of str} From 60b8f59f12298f25d925d54b222111ae14474025 Mon Sep 17 00:00:00 2001 From: Guillaume Lemaitre Date: Tue, 23 Jul 2019 18:51:34 +0200 Subject: [PATCH 14/86] whats new --- doc/whats_new/v0.22.rst | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/doc/whats_new/v0.22.rst b/doc/whats_new/v0.22.rst index f2046cc6b64f1..9e288c5079893 100644 --- a/doc/whats_new/v0.22.rst +++ b/doc/whats_new/v0.22.rst @@ -168,6 +168,13 @@ Changelog match `spectral_clustering`. :pr:`13726` by :user:`Shuzhe Xiao `. +:mod:`sklearn.inspection` +......................... + +- |Enhancement| :func:`inspection.partial_dependence` accepts pandas DataFrame + and :class:`pipeline.Pipeline` containing :class:`compose.ColumnTransformer`. + :pr:`14028` by :user:`Guillaume Lemaitre `. + :mod:`sklearn.feature_selection` ................................ - |Fix| Fixed a bug where :class:`VarianceThreshold` with `threshold=0` did not From c01385c052a010f138707ad41e593414cfe76133 Mon Sep 17 00:00:00 2001 From: Guillaume Lemaitre Date: Thu, 25 Jul 2019 18:20:00 +0200 Subject: [PATCH 15/86] EHN add support for scalar, slice and mask in safe_indexing axis=0 --- doc/whats_new/v0.22.rst | 4 ++ sklearn/utils/__init__.py | 21 +++++++++-- sklearn/utils/tests/test_utils.py | 62 ++++++++++++++++++++++++++----- 3 files changed, 73 insertions(+), 14 deletions(-) diff --git a/doc/whats_new/v0.22.rst b/doc/whats_new/v0.22.rst index bcef08ff1881b..d1d838ec99dc5 100644 --- a/doc/whats_new/v0.22.rst +++ b/doc/whats_new/v0.22.rst @@ -206,6 +206,10 @@ Changelog NumPy array, SciPy sparse matrix, and Pandas DataFrame. :pr:`14035` by `Guillaume Lemaitre `. +- |Enhancement| :func:`utils.safe_indexing` becomes more consistent and support + scalar, slice, mask indexing for both `axis=0` and `axis=1`. + :pr:`xx` by :user:`Guillaume Lemaitre `. + :mod:`sklearn.neural_network` ............................. diff --git a/sklearn/utils/__init__.py b/sklearn/utils/__init__.py index efcaf6865faa5..2b2b33b0186d4 100644 --- a/sklearn/utils/__init__.py +++ b/sklearn/utils/__init__.py @@ -1,6 +1,7 @@ """ The :mod:`sklearn.utils` module includes various utilities. """ +from collections.abc import Iterable from collections.abc import Sequence from contextlib import contextmanager from itertools import islice @@ -188,6 +189,10 @@ def safe_indexing(X, indices, axis=0): Data from which to sample rows, items or columns. indices : array-like - When ``axis=0``, indices need to be an array of integer. + - container: lists, slices, boolean masks: output is 2D. + Supported data types for containers: + - integer or boolean (positional): supported for arrays, sparse + and dataframes - When ``axis=1``, indices can be one of: - scalar: output is 1D, unless `X` is sparse. Supported data types for scalars: @@ -247,10 +252,12 @@ def _safe_indexing_row(X, indices): CSR, CSC, and LIL sparse matrices are supported. COO sparse matrices are not supported. 
""" - if hasattr(X, "iloc"): - # Work-around for indexing with read-only indices in pandas + if not isinstance(indices, slice): indices = np.asarray(indices) - indices = indices if indices.flags.writeable else indices.copy() + if hasattr(X, "iloc"): + if not isinstance(indices, slice): + # Work-around for indexing with read-only indices in pandas + indices = indices if indices.flags.writeable else indices.copy() # Pandas Dataframes and Series try: return X.iloc[indices] @@ -268,7 +275,13 @@ def _safe_indexing_row(X, indices): else: return X[indices] else: - return [X[idx] for idx in indices] + # In the case of a slice or a scalar + if not isinstance(indices, Iterable) or indices.ndim == 0: + return X[indices] + else: + if np.issubdtype(indices.dtype, np.bool_): + indices = np.flatnonzero(indices) + return [X[idx] for idx in indices] def _check_key_type(key, superclass): diff --git a/sklearn/utils/tests/test_utils.py b/sklearn/utils/tests/test_utils.py index a39e8160047a5..1601002449924 100644 --- a/sklearn/utils/tests/test_utils.py +++ b/sklearn/utils/tests/test_utils.py @@ -10,6 +10,7 @@ from sklearn.utils.testing import (assert_raises, assert_array_equal, + assert_allclose_dense_sparse, assert_raises_regex, assert_warns_message, assert_no_warnings) from sklearn.utils import check_random_state @@ -219,14 +220,51 @@ def test_check_key_type(key, clazz, is_expected_type): assert _check_key_type(key, clazz) is is_expected_type -@pytest.mark.parametrize("asarray", [True, False], ids=["array-like", "array"]) -def test_safe_indexing_axis_0(asarray): +@pytest.mark.parametrize( + "idx", + [[0, 2], [True, False, True], # array-like + np.array([0, 2]), np.array([True, False, True])], # numpy array + ids=['list-indices', 'list-mask', 'array-indices', 'array-mask'] +) +@pytest.mark.parametrize( + "array_type", [None, np.asarray, sp.csr_matrix], + ids=["list", "array", "sparse"] +) +def test_safe_indexing_axis_0_container(idx, array_type): X = [[1, 2, 3], [4, 5, 6], [7, 8, 9]] - inds = np.array([1, 2]) if asarray else [1, 2] - X_inds = safe_indexing(X, inds) - X_arrays = safe_indexing(np.array(X), inds) - assert_array_equal(np.array(X_inds), X_arrays) - assert_array_equal(np.array(X_inds), np.array(X)[inds]) + X = array_type(X) if array_type is not None else X + X_subset = safe_indexing(X, idx, axis=0) + X_expect = [[1, 2, 3], [7, 8, 9]] + X_expect = array_type(X_expect) if array_type is not None else X_expect + assert_allclose_dense_sparse(X_subset, X_expect) + + +@pytest.mark.parametrize( + "array_type", [None, np.asarray, sp.csr_matrix], + ids=["list", "array", "sparse"] +) +def test_safe_indexing_axis_0_slice(array_type): + X = [[1, 2, 3], [4, 5, 6], [7, 8, 9]] + X = array_type(X) if array_type is not None else X + idx = slice(0, 2) + X_subset = safe_indexing(X, idx, axis=0) + X_expect = [[1, 2, 3], [4, 5, 6]] + X_expect = array_type(X_expect) if array_type is not None else X_expect + assert_allclose_dense_sparse(X_subset, X_expect) + + +@pytest.mark.parametrize( + "array_type", [None, np.asarray, sp.csr_matrix], + ids=["list", "array", "sparse"] +) +def test_safe_indexing_axis_0_scalar(array_type): + X = [[1, 2, 3], [4, 5, 6], [7, 8, 9]] + X = array_type(X) if array_type is not None else X + idx = 1 # scalar indexing + X_subset = safe_indexing(X, idx, axis=0) + X_expect = [4, 5, 6] + X_expect = array_type(X_expect) if array_type is not None else X_expect + assert_allclose_dense_sparse(X_subset, X_expect) @pytest.mark.parametrize("idx", [0, [0, 1]], ids=['scalar', 'list']) @@ -355,11 +393,15 @@ def 
test_safe_indexing_pandas_series(idx, asarray): assert_array_equal(safe_indexing(serie, idx).values, [0, 1]) -@pytest.mark.parametrize("asarray", [True, False], ids=["array-like", "array"]) -def test_safe_indexing_mock_pandas(asarray): +@pytest.mark.parametrize( + "inds", + [[1, 2], [False, True, True], + np.array([1, 2]), np.array([False, True, True]), + slice(1, None)] +) +def test_safe_indexing_mock_pandas(inds): X = np.array([[1, 2, 3], [4, 5, 6], [7, 8, 9]]) X_df = MockDataFrame(X) - inds = np.array([1, 2]) if asarray else [1, 2] X_df_indexed = safe_indexing(X_df, inds) X_indexed = safe_indexing(X_df, inds) assert_array_equal(np.array(X_df_indexed), X_indexed) From 0e5c03738557804fcccc9a4c1746a38714529139 Mon Sep 17 00:00:00 2001 From: Guillaume Lemaitre Date: Thu, 25 Jul 2019 18:29:37 +0200 Subject: [PATCH 16/86] DOC --- doc/whats_new/v0.22.rst | 2 +- sklearn/utils/__init__.py | 17 ++++++----------- 2 files changed, 7 insertions(+), 12 deletions(-) diff --git a/doc/whats_new/v0.22.rst b/doc/whats_new/v0.22.rst index d1d838ec99dc5..716a500ebe085 100644 --- a/doc/whats_new/v0.22.rst +++ b/doc/whats_new/v0.22.rst @@ -208,7 +208,7 @@ Changelog - |Enhancement| :func:`utils.safe_indexing` becomes more consistent and support scalar, slice, mask indexing for both `axis=0` and `axis=1`. - :pr:`xx` by :user:`Guillaume Lemaitre `. + :pr:`14475` by :user:`Guillaume Lemaitre `. :mod:`sklearn.neural_network` ............................. diff --git a/sklearn/utils/__init__.py b/sklearn/utils/__init__.py index 2b2b33b0186d4..b1bb933efd140 100644 --- a/sklearn/utils/__init__.py +++ b/sklearn/utils/__init__.py @@ -188,23 +188,18 @@ def safe_indexing(X, indices, axis=0): X : array-like, sparse-matrix, list, pandas.DataFrame, pandas.Series Data from which to sample rows, items or columns. indices : array-like - - When ``axis=0``, indices need to be an array of integer. - - container: lists, slices, boolean masks: output is 2D. - Supported data types for containers: - - integer or boolean (positional): supported for arrays, sparse - and dataframes - - When ``axis=1``, indices can be one of: + - For both `axis=0` and `axis=1`, indices can be one of: - scalar: output is 1D, unless `X` is sparse. Supported data types for scalars: - integer: supported for arrays, sparse matrices and dataframes. - - string (key-based): only supported for dataframes. - container: lists, slices, boolean masks: output is 2D. Supported data types for containers: - - integer or boolean (positional): supported for - arrays, sparse matrices and dataframes - - string (key-based): only supported for dataframes. No keys - other than strings are allowed. + - integer or boolean (positional): supported for arrays, sparse + and dataframes + - If `X` is a dataframe and `axis=1`, indices support string data type + (key-based) as a scalar or a container. The output dimension will be + identical to the above case. axis : int, default=0 The axis along which `X` will be subsampled. ``axis=0`` will select rows while ``axis=1`` will select columns. 
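
A minimal usage sketch of the ``axis`` parameter that the patches above add to
safe_indexing (the array values below are illustrative, not from the tests):

    import numpy as np
    from sklearn.utils import safe_indexing

    X = np.array([[1, 2, 3], [4, 5, 6], [7, 8, 9]])
    rows = safe_indexing(X, [0, 2], axis=0)               # select rows 0 and 2
    cols = safe_indexing(X, [True, False, True], axis=1)  # boolean column mask
    col = safe_indexing(X, 1, axis=1)                     # single column, 1D output
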
From f5e08c4272fe1633f490ec15ff5381c5f5b3caef Mon Sep 17 00:00:00 2001 From: Guillaume Lemaitre Date: Thu, 25 Jul 2019 22:20:08 +0200 Subject: [PATCH 17/86] FIX behaviour when passing None --- sklearn/utils/__init__.py | 4 +++- sklearn/utils/tests/test_utils.py | 17 +++++++++++++++-- 2 files changed, 18 insertions(+), 3 deletions(-) diff --git a/sklearn/utils/__init__.py b/sklearn/utils/__init__.py index b1bb933efd140..e0f5be6dc35c0 100644 --- a/sklearn/utils/__init__.py +++ b/sklearn/utils/__init__.py @@ -247,7 +247,9 @@ def _safe_indexing_row(X, indices): CSR, CSC, and LIL sparse matrices are supported. COO sparse matrices are not supported. """ - if not isinstance(indices, slice): + if indices is None: + return X + elif not isinstance(indices, slice): indices = np.asarray(indices) if hasattr(X, "iloc"): if not isinstance(indices, slice): diff --git a/sklearn/utils/tests/test_utils.py b/sklearn/utils/tests/test_utils.py index 1601002449924..737a20221b346 100644 --- a/sklearn/utils/tests/test_utils.py +++ b/sklearn/utils/tests/test_utils.py @@ -267,6 +267,17 @@ def test_safe_indexing_axis_0_scalar(array_type): assert_allclose_dense_sparse(X_subset, X_expect) +@pytest.mark.parametrize( + "array_type", [None, np.asarray, sp.csr_matrix], + ids=["list", "array", "sparse"] +) +def test_safe_indexing_axis_0_None(array_type): + X = [[1, 2, 3], [4, 5, 6], [7, 8, 9]] + X = array_type(X) if array_type is not None else X + X_subset = safe_indexing(X, None, axis=0) + assert_allclose_dense_sparse(X_subset, X) + + @pytest.mark.parametrize("idx", [0, [0, 1]], ids=['scalar', 'list']) @pytest.mark.parametrize("asarray", [True, False], ids=["array-like", "array"]) def test_safe_indexing_axis_1_sparse(idx, asarray): @@ -395,9 +406,11 @@ def test_safe_indexing_pandas_series(idx, asarray): @pytest.mark.parametrize( "inds", - [[1, 2], [False, True, True], + [None, + [1, 2], [False, True, True], np.array([1, 2]), np.array([False, True, True]), - slice(1, None)] + slice(1, None)], + ids=['None', 'list-int', 'list-mask', 'array-int', 'array-mask', 'slice'] ) def test_safe_indexing_mock_pandas(inds): X = np.array([[1, 2, 3], [4, 5, 6], [7, 8, 9]]) From bb4db91755cf724a0e856ef9dab8bcccdecca686 Mon Sep 17 00:00:00 2001 From: Guillaume Lemaitre Date: Thu, 25 Jul 2019 22:21:53 +0200 Subject: [PATCH 18/86] PEP8 --- sklearn/utils/tests/test_utils.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/sklearn/utils/tests/test_utils.py b/sklearn/utils/tests/test_utils.py index 737a20221b346..468f7dae4e2e8 100644 --- a/sklearn/utils/tests/test_utils.py +++ b/sklearn/utils/tests/test_utils.py @@ -410,7 +410,7 @@ def test_safe_indexing_pandas_series(idx, asarray): [1, 2], [False, True, True], np.array([1, 2]), np.array([False, True, True]), slice(1, None)], - ids=['None', 'list-int', 'list-mask', 'array-int', 'array-mask', 'slice'] + ids=['None', 'list-int', 'list-mask', 'array-int', 'array-mask', 'slice'] ) def test_safe_indexing_mock_pandas(inds): X = np.array([[1, 2, 3], [4, 5, 6], [7, 8, 9]]) From 8cd74db2fba750c17c6b4dfe066e411dd30c43b6 Mon Sep 17 00:00:00 2001 From: Guillaume Lemaitre Date: Mon, 29 Jul 2019 12:05:25 +0200 Subject: [PATCH 19/86] address thomas comments --- sklearn/utils/__init__.py | 2 +- sklearn/utils/tests/test_utils.py | 8 ++++---- 2 files changed, 5 insertions(+), 5 deletions(-) diff --git a/sklearn/utils/__init__.py b/sklearn/utils/__init__.py index b1bb933efd140..e06cf0dd2838b 100644 --- a/sklearn/utils/__init__.py +++ b/sklearn/utils/__init__.py @@ -196,7 +196,7 @@ def 
safe_indexing(X, indices, axis=0): - container: lists, slices, boolean masks: output is 2D. Supported data types for containers: - integer or boolean (positional): supported for arrays, sparse - and dataframes + matrices and dataframes - If `X` is a dataframe and `axis=1`, indices support string data type (key-based) as a scalar or a container. The output dimension will be identical to the above case. diff --git a/sklearn/utils/tests/test_utils.py b/sklearn/utils/tests/test_utils.py index 1601002449924..897a95542e33d 100644 --- a/sklearn/utils/tests/test_utils.py +++ b/sklearn/utils/tests/test_utils.py @@ -240,12 +240,12 @@ def test_safe_indexing_axis_0_container(idx, array_type): @pytest.mark.parametrize( - "array_type", [None, np.asarray, sp.csr_matrix], + "array_type", [list, np.asarray, sp.csr_matrix], ids=["list", "array", "sparse"] ) def test_safe_indexing_axis_0_slice(array_type): X = [[1, 2, 3], [4, 5, 6], [7, 8, 9]] - X = array_type(X) if array_type is not None else X + X = array_type(X) idx = slice(0, 2) X_subset = safe_indexing(X, idx, axis=0) X_expect = [[1, 2, 3], [4, 5, 6]] @@ -254,12 +254,12 @@ def test_safe_indexing_axis_0_slice(array_type): @pytest.mark.parametrize( - "array_type", [None, np.asarray, sp.csr_matrix], + "array_type", [list, np.asarray, sp.csr_matrix], ids=["list", "array", "sparse"] ) def test_safe_indexing_axis_0_scalar(array_type): X = [[1, 2, 3], [4, 5, 6], [7, 8, 9]] - X = array_type(X) if array_type is not None else X + X = array_type(X) idx = 1 # scalar indexing X_subset = safe_indexing(X, idx, axis=0) X_expect = [4, 5, 6] From 075dd80f408c3b38acd6464c42a5874c1bb33089 Mon Sep 17 00:00:00 2001 From: Guillaume Lemaitre Date: Mon, 29 Jul 2019 14:53:40 +0200 Subject: [PATCH 20/86] debug --- build_tools/azure/test_script.sh | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/build_tools/azure/test_script.sh b/build_tools/azure/test_script.sh index 6e27915e0be6b..27a3e4b649a13 100755 --- a/build_tools/azure/test_script.sh +++ b/build_tools/azure/test_script.sh @@ -21,7 +21,7 @@ except ImportError: python -c "import multiprocessing as mp; print('%d CPUs' % mp.cpu_count())" pip list -TEST_CMD="python -m pytest --showlocals --durations=20 --junitxml=$JUNITXML" +TEST_CMD="python -m pytest --verbose --showlocals --durations=20 --junitxml=$JUNITXML" if [[ "$COVERAGE" == "true" ]]; then export COVERAGE_PROCESS_START="$BUILD_SOURCESDIRECTORY/.coveragerc" From d0f8d60a8654747f2f5600e3fae9c071c1c417aa Mon Sep 17 00:00:00 2001 From: Guillaume Lemaitre Date: Mon, 29 Jul 2019 18:31:25 +0200 Subject: [PATCH 21/86] FIX change boolean array-likes indexing in old NumPy version --- sklearn/compose/tests/test_column_transformer.py | 12 ++++++++++++ 1 file changed, 12 insertions(+) diff --git a/sklearn/compose/tests/test_column_transformer.py b/sklearn/compose/tests/test_column_transformer.py index ae7ef31d6c7f1..2ccfd6d6c2eae 100644 --- a/sklearn/compose/tests/test_column_transformer.py +++ b/sklearn/compose/tests/test_column_transformer.py @@ -16,6 +16,7 @@ from sklearn.base import BaseEstimator from sklearn.compose import ColumnTransformer, make_column_transformer from sklearn.exceptions import NotFittedError +from sklearn.preprocessing import FunctionTransformer from sklearn.preprocessing import StandardScaler, Normalizer, OneHotEncoder from sklearn.feature_extraction import DictVectorizer @@ -1108,3 +1109,14 @@ def test_column_transformer_reordered_column_names_remainder(explicit_colname): err_msg = 'Specifying the columns' with pytest.raises(ValueError, 
match=err_msg): tf.transform(X_array) + + +def test_column_transformer_mask_indexing(): + # Regression test for #xxxxx + # Boolean mask indexing with NumPy < 1.13 + X = np.transpose([[1, 2, 3], [4, 5, 6], [5, 6, 7], [8, 9, 10]]) + column_transformer = ColumnTransformer( + [('identity', FunctionTransformer(), [False, True, False, True])] + ) + X_trans = column_transformer.fit_transform(X) + assert X_trans.shape == (3, 2) From f95a228e5444d801b6ab09d30dcc655d347cc663 Mon Sep 17 00:00:00 2001 From: Guillaume Lemaitre Date: Mon, 29 Jul 2019 19:12:10 +0200 Subject: [PATCH 22/86] change indexing --- sklearn/utils/__init__.py | 18 +++++++++++++++--- 1 file changed, 15 insertions(+), 3 deletions(-) diff --git a/sklearn/utils/__init__.py b/sklearn/utils/__init__.py index efcaf6865faa5..8db41bb27986d 100644 --- a/sklearn/utils/__init__.py +++ b/sklearn/utils/__init__.py @@ -18,6 +18,7 @@ from . import _joblib from ..exceptions import DataConversionWarning from .deprecation import deprecated +from .fixes import np_version from .validation import (as_float_array, assert_all_finite, check_random_state, column_or_1d, check_array, @@ -225,6 +226,17 @@ def safe_indexing(X, indices, axis=0): ) +# FIXME: to be removed once NumPy 1.13 is the minimum version required +def _array_indexing(array, key, axis=0): + """Index an array consistently across NumPy version.""" + if np_version < (1, 13): + # check if we have an boolean array-likes to make the proper indexing + key_array = np.asarray(key) + if np.issubdtype(key_array.dtype, np.bool_): + key = key_array + return array[key] if axis == 0 else array[:, key] + + def _safe_indexing_row(X, indices): """Return items or rows from X using indices. @@ -266,7 +278,7 @@ def _safe_indexing_row(X, indices): # This is often substantially faster than X[indices] return X.take(indices, axis=0) else: - return X[indices] + return _array_indexing(X, indices, axis=0) else: return [X[idx] for idx in indices] @@ -356,7 +368,7 @@ def _safe_indexing_column(X, key): return X.iloc[:, key] else: # numpy arrays, sparse arrays - return X[:, key] + return _array_indexing(X, key, axis=1) def _get_column_indices(X, key): @@ -371,7 +383,7 @@ def _get_column_indices(X, key): or hasattr(key, 'dtype') and np.issubdtype(key.dtype, np.bool_)): # Convert key into positive indexes try: - idx = np.arange(n_columns)[key] + idx = safe_indexing(np.arange(n_columns), key) except IndexError as e: raise ValueError( 'all features must be in [0, %d]' % (n_columns - 1) From 1c8180390799d22bf42b5c1673caf0cb3dd71c79 Mon Sep 17 00:00:00 2001 From: Guillaume Lemaitre Date: Tue, 30 Jul 2019 09:54:22 +0200 Subject: [PATCH 23/86] add regression test in utils --- doc/whats_new/v0.22.rst | 8 +++++++ .../compose/tests/test_column_transformer.py | 4 ++-- sklearn/utils/tests/test_utils.py | 22 +++++++++++++++++++ 3 files changed, 32 insertions(+), 2 deletions(-) diff --git a/doc/whats_new/v0.22.rst b/doc/whats_new/v0.22.rst index 93635d88069d5..114afb9185a18 100644 --- a/doc/whats_new/v0.22.rst +++ b/doc/whats_new/v0.22.rst @@ -61,6 +61,14 @@ Changelog `sample_weights` are not supported by the wrapped estimator). :pr:`13575` by :user:`William de Vazelhes `. +:mod:`sklearn.compose` +...................... + +- |Fix| Fixed a bug in :class:`compose.ColumnTransformer` which failed to + select the proper columns when using a boolean list and NumPy older than + 1.13. + :pr:`14510` by :user:`Guillaume Lemaitre `. + :mod:`sklearn.datasets` ....................... 
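A rough sketch of the issue behind the changelog entry above (toy data, not part of the patch): with NumPy older than 1.13, indexing with a plain Python list of booleans is not treated as a mask (it is interpreted as the integer indices 0 and 1), so converting the key to a boolean array first restores the intended column selection:

    import numpy as np

    X = np.arange(12).reshape(3, 4)
    mask = [False, True, False, True]
    # X[:, mask] picks columns [0, 1, 0, 1] on NumPy < 1.13 instead of masking
    X[:, np.asarray(mask)]   # always a mask selection -> columns 1 and 3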
diff --git a/sklearn/compose/tests/test_column_transformer.py b/sklearn/compose/tests/test_column_transformer.py index 2ccfd6d6c2eae..a9c4fd9e25fbe 100644 --- a/sklearn/compose/tests/test_column_transformer.py +++ b/sklearn/compose/tests/test_column_transformer.py @@ -1112,8 +1112,8 @@ def test_column_transformer_reordered_column_names_remainder(explicit_colname): def test_column_transformer_mask_indexing(): - # Regression test for #xxxxx - # Boolean mask indexing with NumPy < 1.13 + # Regression test for #14510 + # Boolean array-like does not behave as boolean array with NumPy < 1.13 X = np.transpose([[1, 2, 3], [4, 5, 6], [5, 6, 7], [8, 9, 10]]) column_transformer = ColumnTransformer( [('identity', FunctionTransformer(), [False, True, False, True])] diff --git a/sklearn/utils/tests/test_utils.py b/sklearn/utils/tests/test_utils.py index a39e8160047a5..35cfde4aaef7d 100644 --- a/sklearn/utils/tests/test_utils.py +++ b/sklearn/utils/tests/test_utils.py @@ -10,6 +10,7 @@ from sklearn.utils.testing import (assert_raises, assert_array_equal, + assert_allclose_dense_sparse, assert_raises_regex, assert_warns_message, assert_no_warnings) from sklearn.utils import check_random_state @@ -365,6 +366,27 @@ def test_safe_indexing_mock_pandas(asarray): assert_array_equal(np.array(X_df_indexed), X_indexed) +@pytest.mark.parametrize("array_type", ['array', 'sparse', 'dataframe']) +def test_safe_indexing_mask_axis_1(array_type): + # regression test for #14510 + # check that boolean array-like and boolean array lead to the same indexing + # even in NumPy < 1.13 + if array_type == 'array': + array_constructor = np.asarray + elif array_type == 'sparse': + array_constructor = sp.csr_matrix + elif array_type == 'dataframe': + pd = pytest.importorskip('pandas') + array_constructor = pd.DataFrame + + X = array_constructor([[1, 2, 3], [4, 5, 6], [7, 8, 9]]) + mask = [True, False, True] + mask_array = np.array(mask) + X_masked = safe_indexing(X, mask, axis=1) + X_masked_array = safe_indexing(X, mask_array, axis=1) + assert_allclose_dense_sparse(X_masked, X_masked_array) + + def test_shuffle_on_ndim_equals_three(): def to_tuple(A): # to make the inner arrays hashable return tuple(tuple(tuple(C) for C in B) for B in A) From c8009a28aa2855e8e01cfebd3ef5df337c0536f4 Mon Sep 17 00:00:00 2001 From: Guillaume Lemaitre Date: Tue, 30 Jul 2019 12:17:43 +0200 Subject: [PATCH 24/86] fix --- sklearn/utils/__init__.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/sklearn/utils/__init__.py b/sklearn/utils/__init__.py index 8db41bb27986d..ac6446afcd6cf 100644 --- a/sklearn/utils/__init__.py +++ b/sklearn/utils/__init__.py @@ -229,7 +229,7 @@ def safe_indexing(X, indices, axis=0): # FIXME: to be removed once NumPy 1.13 is the minimum version required def _array_indexing(array, key, axis=0): """Index an array consistently across NumPy version.""" - if np_version < (1, 13): + if np_version < (1, 13) or issparse(array): # check if we have an boolean array-likes to make the proper indexing key_array = np.asarray(key) if np.issubdtype(key_array.dtype, np.bool_): From a80b33d7a60d67706150d48f71b3d0837fceba38 Mon Sep 17 00:00:00 2001 From: Guillaume Lemaitre Date: Tue, 30 Jul 2019 14:25:31 +0200 Subject: [PATCH 25/86] add test in column transformer --- sklearn/compose/tests/test_column_transformer.py | 5 ++++- sklearn/utils/__init__.py | 1 - 2 files changed, 4 insertions(+), 2 deletions(-) diff --git a/sklearn/compose/tests/test_column_transformer.py b/sklearn/compose/tests/test_column_transformer.py index 
a9c4fd9e25fbe..a667b35cf65e3 100644 --- a/sklearn/compose/tests/test_column_transformer.py +++ b/sklearn/compose/tests/test_column_transformer.py @@ -1111,10 +1111,13 @@ def test_column_transformer_reordered_column_names_remainder(explicit_colname): tf.transform(X_array) -def test_column_transformer_mask_indexing(): +@pytest.mark.parametrize("array_type", [np.asarray, sparse.csr_matrix]) +def test_column_transformer_mask_indexing(array_type): # Regression test for #14510 # Boolean array-like does not behave as boolean array with NumPy < 1.13 + # and sparse matrices as well X = np.transpose([[1, 2, 3], [4, 5, 6], [5, 6, 7], [8, 9, 10]]) + X = array_type(X) column_transformer = ColumnTransformer( [('identity', FunctionTransformer(), [False, True, False, True])] ) diff --git a/sklearn/utils/__init__.py b/sklearn/utils/__init__.py index ac6446afcd6cf..83f4d7fd1876c 100644 --- a/sklearn/utils/__init__.py +++ b/sklearn/utils/__init__.py @@ -226,7 +226,6 @@ def safe_indexing(X, indices, axis=0): ) -# FIXME: to be removed once NumPy 1.13 is the minimum version required def _array_indexing(array, key, axis=0): """Index an array consistently across NumPy version.""" if np_version < (1, 13) or issparse(array): From 9fb045dcf1b7923bf06021b6944ca0cb3dd8ad40 Mon Sep 17 00:00:00 2001 From: Guillaume Lemaitre Date: Thu, 1 Aug 2019 14:22:46 +0200 Subject: [PATCH 26/86] raise error if axis not 0 or 1 --- sklearn/utils/__init__.py | 5 +++++ sklearn/utils/tests/test_utils.py | 8 ++++++++ 2 files changed, 13 insertions(+) diff --git a/sklearn/utils/__init__.py b/sklearn/utils/__init__.py index 83f4d7fd1876c..3b4a20d08716b 100644 --- a/sklearn/utils/__init__.py +++ b/sklearn/utils/__init__.py @@ -228,6 +228,11 @@ def safe_indexing(X, indices, axis=0): def _array_indexing(array, key, axis=0): """Index an array consistently across NumPy version.""" + if axis not in (0, 1): + raise ValueError( + "'axis' should be either 0 (to index rows) or 1 (to index " + " column). 
Got {} instead.".format(axis) + ) if np_version < (1, 13) or issparse(array): # check if we have an boolean array-likes to make the proper indexing key_array = np.asarray(key) diff --git a/sklearn/utils/tests/test_utils.py b/sklearn/utils/tests/test_utils.py index 35cfde4aaef7d..49f50eedc0a42 100644 --- a/sklearn/utils/tests/test_utils.py +++ b/sklearn/utils/tests/test_utils.py @@ -13,6 +13,7 @@ assert_allclose_dense_sparse, assert_raises_regex, assert_warns_message, assert_no_warnings) +from sklearn.utils import _array_indexing from sklearn.utils import check_random_state from sklearn.utils import _check_key_type from sklearn.utils import deprecated @@ -387,6 +388,13 @@ def test_safe_indexing_mask_axis_1(array_type): assert_allclose_dense_sparse(X_masked, X_masked_array) +def test_array_indexing_array_error(): + X = np.array([[0, 1], [2, 3]]) + mask = [True, False] + with pytest.raises(ValueError, match="'axis' should be either 0"): + _array_indexing(X, mask, axis=3) + + def test_shuffle_on_ndim_equals_three(): def to_tuple(A): # to make the inner arrays hashable return tuple(tuple(tuple(C) for C in B) for B in A) From 5dcf34f3979c365f99b94b4f4a116c2ec4ed18b2 Mon Sep 17 00:00:00 2001 From: Guillaume Lemaitre Date: Thu, 1 Aug 2019 14:40:02 +0200 Subject: [PATCH 27/86] itert --- sklearn/utils/__init__.py | 22 ++++++++-------------- 1 file changed, 8 insertions(+), 14 deletions(-) diff --git a/sklearn/utils/__init__.py b/sklearn/utils/__init__.py index c41325c03d307..901ad3561dd5c 100644 --- a/sklearn/utils/__init__.py +++ b/sklearn/utils/__init__.py @@ -268,10 +268,10 @@ def _safe_indexing_row(X, indices): elif not isinstance(indices, slice): indices = np.asarray(indices) if hasattr(X, "iloc"): + # Pandas Dataframes and Series if not isinstance(indices, slice): # Work-around for indexing with read-only indices in pandas indices = indices if indices.flags.writeable else indices.copy() - # Pandas Dataframes and Series try: return X.iloc[indices] except ValueError: @@ -281,20 +281,14 @@ def _safe_indexing_row(X, indices): DataConversionWarning) return X.copy().iloc[indices] elif hasattr(X, "shape"): - if hasattr(X, 'take') and (hasattr(indices, 'dtype') and - indices.dtype.kind == 'i'): - # This is often substantially faster than X[indices] - return X.take(indices, axis=0) - else: - return _array_indexing(X, indices, axis=0) - else: + return _array_indexing(X, indices, axis=0) + elif not isinstance(indices, Iterable) or indices.ndim == 0: # In the case of a slice or a scalar - if not isinstance(indices, Iterable) or indices.ndim == 0: - return X[indices] - else: - if np.issubdtype(indices.dtype, np.bool_): - indices = np.flatnonzero(indices) - return [X[idx] for idx in indices] + return X[indices] + else: + if np.issubdtype(indices.dtype, np.bool_): + indices = np.flatnonzero(indices) + return [X[idx] for idx in indices] def _check_key_type(key, superclass): From 70f0e023b76558bad6226aa92a25c1024870e839 Mon Sep 17 00:00:00 2001 From: Guillaume Lemaitre Date: Thu, 1 Aug 2019 16:52:10 +0200 Subject: [PATCH 28/86] iter --- sklearn/utils/__init__.py | 68 +++++++++++++++++++++------------------ 1 file changed, 36 insertions(+), 32 deletions(-) diff --git a/sklearn/utils/__init__.py b/sklearn/utils/__init__.py index 901ad3561dd5c..cad6ecaa64f80 100644 --- a/sklearn/utils/__init__.py +++ b/sklearn/utils/__init__.py @@ -4,6 +4,7 @@ from collections.abc import Iterable from collections.abc import Sequence from contextlib import contextmanager +from itertools import compress from itertools import islice 
import numbers import platform @@ -188,19 +189,17 @@ def safe_indexing(X, indices, axis=0): ---------- X : array-like, sparse-matrix, list, pandas.DataFrame, pandas.Series Data from which to sample rows, items or columns. - indices : array-like - - For both `axis=0` and `axis=1`, indices can be one of: - - scalar: output is 1D, unless `X` is sparse. - Supported data types for scalars: - - integer: supported for arrays, sparse matrices and - dataframes. - - container: lists, slices, boolean masks: output is 2D. - Supported data types for containers: - - integer or boolean (positional): supported for arrays, sparse - matrices and dataframes - - If `X` is a dataframe and `axis=1`, indices support string data type - (key-based) as a scalar or a container. The output dimension will be - identical to the above case. + indices : bool, int, str, array-like + - To select a single element (i.e. row or column), `indices` can be one + of the following: `bool` or `int` which are supported by all types of + `X`. `indices` being a `str` is only supported for `X` being a + dataframe. The selected subset will be 1D, unless `X` is a sparse + matrix in which case it will be 2D. + - To select multiple elements (i.e. rows or columns), `indices` can be + one of the following: `list`, `array`, `slice`. The type used in + these containers can be one of the following: `int`, `bool`, and + `str`. `str` is only supported when `X` is a dataframe. + The selected subset will be 2D. axis : int, default=0 The axis along which `X` will be subsampled. ``axis=0`` will select rows while ``axis=1`` will select columns. @@ -241,17 +240,18 @@ def _array_indexing(array, key, axis=0): return array[key] if axis == 0 else array[:, key] -def _safe_indexing_row(X, indices): +def _safe_indexing_row(X, key): """Return items or rows from X using indices. - Allows simple indexing of lists or arrays. + Allows simple indexing of lists, NumPy array, SciPy sparse matrices, and + Pandas DataFrame`. Parameters ---------- X : array-like, sparse-matrix, list, pandas.DataFrame, pandas.Series Data from which to sample rows or items. - indices : array-like of int - Indices according to which X will be subsampled. + key : int, slice, bool or int array-like + Key used to get a subset of X. Returns ------- @@ -263,32 +263,36 @@ def _safe_indexing_row(X, indices): CSR, CSC, and LIL sparse matrices are supported. COO sparse matrices are not supported. """ - if indices is None: + if key is None: return X - elif not isinstance(indices, slice): - indices = np.asarray(indices) if hasattr(X, "iloc"): # Pandas Dataframes and Series - if not isinstance(indices, slice): - # Work-around for indexing with read-only indices in pandas - indices = indices if indices.flags.writeable else indices.copy() + if hasattr(key, 'flags'): + # Work-around for indexing with read-only key in pandas + key = key if key.flags.writeable else key.copy() try: - return X.iloc[indices] + return X.iloc[key] except ValueError: # Cython typed memoryviews internally used in pandas do not support # readonly buffers. 
warnings.warn("Copying input dataframe for slicing.", DataConversionWarning) - return X.copy().iloc[indices] + return X.copy().iloc[key] elif hasattr(X, "shape"): - return _array_indexing(X, indices, axis=0) - elif not isinstance(indices, Iterable) or indices.ndim == 0: - # In the case of a slice or a scalar - return X[indices] + # NumPy array and SciPy sparse matrix + return _array_indexing(X, key, axis=0) else: - if np.issubdtype(indices.dtype, np.bool_): - indices = np.flatnonzero(indices) - return [X[idx] for idx in indices] + # Python list + if not isinstance(key, Iterable) or isinstance(indexable, slice): + # key being a slice or a scalar + return X[key] + key_set = set(key) + if len(key_set) == 2 and all(isinstance(k, (bool, np.bool_)) + for k in key_set): + # key is a boolean array-like + return list(compress(X, key)) + # key is a integer array-like of key + return [X[idx] for idx in key] def _check_key_type(key, superclass): From 7127b5a9cf2efcc01ac8c9028111d74a33fe1088 Mon Sep 17 00:00:00 2001 From: Guillaume Lemaitre Date: Thu, 1 Aug 2019 18:31:00 +0200 Subject: [PATCH 29/86] refactor --- sklearn/utils/__init__.py | 202 +++++++++++------------------- sklearn/utils/tests/test_utils.py | 7 -- 2 files changed, 71 insertions(+), 138 deletions(-) diff --git a/sklearn/utils/__init__.py b/sklearn/utils/__init__.py index cad6ecaa64f80..5a41c4a8bd298 100644 --- a/sklearn/utils/__init__.py +++ b/sklearn/utils/__init__.py @@ -182,56 +182,8 @@ def axis0_safe_slice(X, mask, len_mask): return np.zeros(shape=(0, X.shape[1])) -def safe_indexing(X, indices, axis=0): - """Return rows, items or columns of X using indices. - - Parameters - ---------- - X : array-like, sparse-matrix, list, pandas.DataFrame, pandas.Series - Data from which to sample rows, items or columns. - indices : bool, int, str, array-like - - To select a single element (i.e. row or column), `indices` can be one - of the following: `bool` or `int` which are supported by all types of - `X`. `indices` being a `str` is only supported for `X` being a - dataframe. The selected subset will be 1D, unless `X` is a sparse - matrix in which case it will be 2D. - - To select multiple elements (i.e. rows or columns), `indices` can be - one of the following: `list`, `array`, `slice`. The type used in - these containers can be one of the following: `int`, `bool`, and - `str`. `str` is only supported when `X` is a dataframe. - The selected subset will be 2D. - axis : int, default=0 - The axis along which `X` will be subsampled. ``axis=0`` will select - rows while ``axis=1`` will select columns. - - Returns - ------- - subset - Subset of X on axis 0 or 1. - - Notes - ----- - CSR, CSC, and LIL sparse matrices are supported. COO sparse matrices are - not supported. - """ - if axis == 0: - return _safe_indexing_row(X, indices) - elif axis == 1: - return _safe_indexing_column(X, indices) - else: - raise ValueError( - "'axis' should be either 0 (to index rows) or 1 (to index " - " column). Got {} instead.".format(axis) - ) - - -def _array_indexing(array, key, axis=0): +def _array_indexing(array, key, axis): """Index an array consistently across NumPy version.""" - if axis not in (0, 1): - raise ValueError( - "'axis' should be either 0 (to index rows) or 1 (to index " - " column). 
Got {} instead.".format(axis) - ) if np_version < (1, 13) or issparse(array): # check if we have an boolean array-likes to make the proper indexing key_array = np.asarray(key) @@ -240,48 +192,18 @@ def _array_indexing(array, key, axis=0): return array[key] if axis == 0 else array[:, key] -def _safe_indexing_row(X, key): - """Return items or rows from X using indices. +def _pandas_indexing(X, key, axis, by_name): + """Index a pandas dataframe or a series.""" + if hasattr(key, 'flags'): + # Work-around for indexing with read-only key in pandas + key = key if key.flags.writeable else key.copy() + indexer = 'loc' if by_name else 'iloc' + return getattr(X, indexer)[:, key] if axis else getattr(X, indexer)[key] - Allows simple indexing of lists, NumPy array, SciPy sparse matrices, and - Pandas DataFrame`. - Parameters - ---------- - X : array-like, sparse-matrix, list, pandas.DataFrame, pandas.Series - Data from which to sample rows or items. - key : int, slice, bool or int array-like - Key used to get a subset of X. - - Returns - ------- - subset - Subset of X on first axis. - - Notes - ----- - CSR, CSC, and LIL sparse matrices are supported. COO sparse matrices are - not supported. - """ - if key is None: - return X - if hasattr(X, "iloc"): - # Pandas Dataframes and Series - if hasattr(key, 'flags'): - # Work-around for indexing with read-only key in pandas - key = key if key.flags.writeable else key.copy() - try: - return X.iloc[key] - except ValueError: - # Cython typed memoryviews internally used in pandas do not support - # readonly buffers. - warnings.warn("Copying input dataframe for slicing.", - DataConversionWarning) - return X.copy().iloc[key] - elif hasattr(X, "shape"): - # NumPy array and SciPy sparse matrix - return _array_indexing(X, key, axis=0) - else: +def _list_indexing(X, key, axis): + """Index a Python list.""" + if axis == 0: # Python list if not isinstance(key, Iterable) or isinstance(indexable, slice): # key being a slice or a scalar @@ -315,7 +237,7 @@ def _check_key_type(key, superclass): return (isinstance(key.start, (superclass, type(None))) and isinstance(key.stop, (superclass, type(None)))) if isinstance(key, list): - return all(isinstance(x, superclass) for x in key) + return all(isinstance(x, superclass) for x in set(key)) if hasattr(key, 'dtype'): if superclass is int: return key.dtype.kind == 'i' @@ -327,60 +249,78 @@ def _check_key_type(key, superclass): return False -def _safe_indexing_column(X, key): - """Get feature column(s) from input data X. +def safe_indexing(X, indices, axis=0): + """Return rows, items or columns of X using indices. - Supported input types (X): numpy arrays, sparse arrays and DataFrames. + Parameters + ---------- + X : array-like, sparse-matrix, list, pandas.DataFrame, pandas.Series + Data from which to sample rows, items or columns. + indices : bool, int, str, array-like + - To select a single element (i.e. row or column), `indices` can be one + of the following: `bool` or `int` which are supported by all types of + `X`. `indices` being a `str` is only supported for `X` being a + dataframe. The selected subset will be 1D, unless `X` is a sparse + matrix in which case it will be 2D. + - To select multiple elements (i.e. rows or columns), `indices` can be + one of the following: `list`, `array`, `slice`. The type used in + these containers can be one of the following: `int`, `bool`, and + `str`. `str` is only supported when `X` is a dataframe. + The selected subset will be 2D. 
+ axis : int, default=0 + The axis along which `X` will be subsampled. ``axis=0`` will select + rows while ``axis=1`` will select columns. - Supported key types (key): - - scalar: output is 1D; - - lists, slices, boolean masks: output is 2D. + Returns + ------- + subset + Subset of X on axis 0 or 1. - Supported key data types: - - integer or boolean mask (positional): - - supported for arrays, sparse matrices and dataframes. - - string (key-based): - - only supported for dataframes; - - So no keys other than strings are allowed (while in principle you - can use any hashable object as key). + Notes + ----- + CSR, CSC, and LIL sparse matrices are supported. COO sparse matrices are + not supported. """ - # check that X is a 2D structure - if X.ndim != 2: - raise ValueError( - "'X' should be a 2D NumPy array, 2D sparse matrix or pandas " - "dataframe when indexing the columns (i.e. 'axis=1'). " - "Got {} instead with {} dimension(s).".format(type(X), X.ndim) - ) - # check whether we have string column names or integers - if _check_key_type(key, int): - column_names = False - elif _check_key_type(key, str): - column_names = True - elif hasattr(key, 'dtype') and np.issubdtype(key.dtype, np.bool_): + if indices is None: + return X + if _check_key_type(indices, int): + by_name = False + elif _check_key_type(indices, str): + by_name = True + elif _check_key_type(indices, bool): # boolean mask - column_names = False + by_name = False if hasattr(X, 'loc'): # pandas boolean masks don't work with iloc, so take loc path - column_names = True + by_name = True else: raise ValueError("No valid specification of the columns. Only a " "scalar, list or slice of all integers or all " "strings, or boolean mask is allowed") - if column_names: - if hasattr(X, 'loc'): - # pandas dataframes - return X.loc[:, key] - else: - raise ValueError("Specifying the columns using strings is only " - "supported for pandas DataFrames") + if axis not in (0, 1): + raise ValueError( + "'axis' should be either 0 (to index rows) or 1 (to index " + " column). Got {} instead.".format(axis) + ) + + if axis == 1 and X.ndim != 2: + raise ValueError( + "'X' should be a 2D NumPy array, 2D sparse matrix or pandas " + "dataframe when indexing the columns (i.e. 'axis=1'). 
" + "Got {} instead with {} dimension(s).".format(type(X), X.ndim) + ) + + if by_name and not hasattr(X, 'loc'): + raise ValueError("Specifying the columns using strings is only " + "supported for pandas DataFrames") + + if hasattr(X, "iloc"): + return _pandas_indexing(X, indices, axis=axis, by_name=by_name) + elif hasattr(X, "shape"): + return _array_indexing(X, indices, axis=axis) else: - if hasattr(X, 'iloc'): - # pandas dataframes - return X.iloc[:, key] - else: - # numpy arrays, sparse arrays - return _array_indexing(X, key, axis=1) + return _list_indexing(X, indices, axis=axis) def _get_column_indices(X, key): diff --git a/sklearn/utils/tests/test_utils.py b/sklearn/utils/tests/test_utils.py index da7306c1f2a1b..73b342d0e5e50 100644 --- a/sklearn/utils/tests/test_utils.py +++ b/sklearn/utils/tests/test_utils.py @@ -442,13 +442,6 @@ def test_safe_indexing_mask_axis_1(array_type): assert_allclose_dense_sparse(X_masked, X_masked_array) -def test_array_indexing_array_error(): - X = np.array([[0, 1], [2, 3]]) - mask = [True, False] - with pytest.raises(ValueError, match="'axis' should be either 0"): - _array_indexing(X, mask, axis=3) - - def test_shuffle_on_ndim_equals_three(): def to_tuple(A): # to make the inner arrays hashable return tuple(tuple(tuple(C) for C in B) for B in A) From 2f96882bbf9619c4c5cc4b08de12dec7590e1251 Mon Sep 17 00:00:00 2001 From: Guillaume Lemaitre Date: Thu, 1 Aug 2019 18:39:02 +0200 Subject: [PATCH 30/86] PEP8 comments --- sklearn/utils/__init__.py | 6 +++--- sklearn/utils/tests/test_utils.py | 1 - 2 files changed, 3 insertions(+), 4 deletions(-) diff --git a/sklearn/utils/__init__.py b/sklearn/utils/__init__.py index 5a41c4a8bd298..47b704a59d12d 100644 --- a/sklearn/utils/__init__.py +++ b/sklearn/utils/__init__.py @@ -256,7 +256,7 @@ def safe_indexing(X, indices, axis=0): ---------- X : array-like, sparse-matrix, list, pandas.DataFrame, pandas.Series Data from which to sample rows, items or columns. - indices : bool, int, str, array-like + indices : bool, int, str, slice, array-like - To select a single element (i.e. row or column), `indices` can be one of the following: `bool` or `int` which are supported by all types of `X`. `indices` being a `str` is only supported for `X` being a @@ -264,8 +264,8 @@ def safe_indexing(X, indices, axis=0): matrix in which case it will be 2D. - To select multiple elements (i.e. rows or columns), `indices` can be one of the following: `list`, `array`, `slice`. The type used in - these containers can be one of the following: `int`, `bool`, and - `str`. `str` is only supported when `X` is a dataframe. + these containers can be one of the following: `int` and `str`. + However, `str` is only supported when `X` is a dataframe. The selected subset will be 2D. axis : int, default=0 The axis along which `X` will be subsampled. 
``axis=0`` will select diff --git a/sklearn/utils/tests/test_utils.py b/sklearn/utils/tests/test_utils.py index 73b342d0e5e50..6ebc378890bf3 100644 --- a/sklearn/utils/tests/test_utils.py +++ b/sklearn/utils/tests/test_utils.py @@ -13,7 +13,6 @@ assert_allclose_dense_sparse, assert_raises_regex, assert_warns_message, assert_no_warnings) -from sklearn.utils import _array_indexing from sklearn.utils import check_random_state from sklearn.utils import _check_key_type from sklearn.utils import deprecated From 619fb0526c5b905e31ab11da580c197b505690ac Mon Sep 17 00:00:00 2001 From: Guillaume Lemaitre Date: Thu, 1 Aug 2019 18:41:00 +0200 Subject: [PATCH 31/86] iter --- sklearn/utils/__init__.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/sklearn/utils/__init__.py b/sklearn/utils/__init__.py index 47b704a59d12d..29e1b324ea9c6 100644 --- a/sklearn/utils/__init__.py +++ b/sklearn/utils/__init__.py @@ -255,7 +255,8 @@ def safe_indexing(X, indices, axis=0): Parameters ---------- X : array-like, sparse-matrix, list, pandas.DataFrame, pandas.Series - Data from which to sample rows, items or columns. + Data from which to sample rows, items or columns. `list` are only + supported when `axis=0`. indices : bool, int, str, slice, array-like - To select a single element (i.e. row or column), `indices` can be one of the following: `bool` or `int` which are supported by all types of From b7539bd75ea5bd8276621b16aa4f15165e66a71a Mon Sep 17 00:00:00 2001 From: Guillaume Lemaitre Date: Thu, 1 Aug 2019 18:46:55 +0200 Subject: [PATCH 32/86] style --- sklearn/utils/__init__.py | 26 ++++++++++++-------------- 1 file changed, 12 insertions(+), 14 deletions(-) diff --git a/sklearn/utils/__init__.py b/sklearn/utils/__init__.py index 29e1b324ea9c6..9e6396e7b5ba1 100644 --- a/sklearn/utils/__init__.py +++ b/sklearn/utils/__init__.py @@ -201,20 +201,18 @@ def _pandas_indexing(X, key, axis, by_name): return getattr(X, indexer)[:, key] if axis else getattr(X, indexer)[key] -def _list_indexing(X, key, axis): +def _list_indexing(X, key): """Index a Python list.""" - if axis == 0: - # Python list - if not isinstance(key, Iterable) or isinstance(indexable, slice): - # key being a slice or a scalar - return X[key] - key_set = set(key) - if len(key_set) == 2 and all(isinstance(k, (bool, np.bool_)) - for k in key_set): - # key is a boolean array-like - return list(compress(X, key)) - # key is a integer array-like of key - return [X[idx] for idx in key] + if not isinstance(key, Iterable) or isinstance(indexable, slice): + # key is a slice or a scalar + return X[key] + key_set = set(key) + if (len(key_set) == 2 and + all(isinstance(k, (bool, np.bool_)) for k in key_set)): + # key is a boolean array-like + return list(compress(X, key)) + # key is a integer array-like of key + return [X[idx] for idx in key] def _check_key_type(key, superclass): @@ -321,7 +319,7 @@ def safe_indexing(X, indices, axis=0): elif hasattr(X, "shape"): return _array_indexing(X, indices, axis=axis) else: - return _list_indexing(X, indices, axis=axis) + return _list_indexing(X, indices) def _get_column_indices(X, key): From 18fba6c6f7b51ce63bb5b435bf1828eac32ca158 Mon Sep 17 00:00:00 2001 From: Andreas Mueller Date: Thu, 1 Aug 2019 13:02:25 -0400 Subject: [PATCH 33/86] make check_is_fitted not take attributes --- benchmarks/bench_plot_nmf.py | 4 +-- doc/developers/contributing.rst | 2 +- sklearn/calibration.py | 4 +-- sklearn/cluster/_feature_agglomeration.py | 4 +-- sklearn/cluster/affinity_propagation_.py | 2 +- sklearn/cluster/birch.py 
| 4 +-- sklearn/cluster/k_means_.py | 8 +++--- sklearn/cluster/mean_shift_.py | 2 +- sklearn/compose/_column_transformer.py | 4 +-- sklearn/compose/_target.py | 2 +- sklearn/covariance/elliptic_envelope.py | 4 +-- sklearn/cross_decomposition/pls_.py | 6 ++-- sklearn/decomposition/base.py | 2 +- sklearn/decomposition/dict_learning.py | 2 +- sklearn/decomposition/factor_analysis.py | 8 +++--- sklearn/decomposition/fastica_.py | 4 +-- sklearn/decomposition/kernel_pca.py | 2 +- sklearn/decomposition/nmf.py | 4 +-- sklearn/decomposition/online_lda.py | 4 +-- sklearn/decomposition/pca.py | 2 +- sklearn/decomposition/sparse_pca.py | 2 +- sklearn/discriminant_analysis.py | 6 ++-- sklearn/dummy.py | 6 ++-- .../_hist_gradient_boosting/binning.py | 2 +- sklearn/ensemble/bagging.py | 8 +++--- sklearn/ensemble/forest.py | 10 +++---- sklearn/ensemble/gradient_boosting.py | 4 +-- sklearn/ensemble/iforest.py | 4 +-- sklearn/ensemble/voting.py | 10 +++---- sklearn/ensemble/weight_boosting.py | 10 +++---- sklearn/feature_extraction/text.py | 6 ++-- sklearn/feature_selection/rfe.py | 12 ++++---- .../feature_selection/univariate_selection.py | 12 ++++---- .../feature_selection/variance_threshold.py | 2 +- sklearn/gaussian_process/gpc.py | 10 +++---- sklearn/impute/_base.py | 4 +-- sklearn/impute/_iterative.py | 2 +- sklearn/kernel_approximation.py | 6 ++-- sklearn/kernel_ridge.py | 2 +- sklearn/linear_model/base.py | 4 +-- sklearn/linear_model/coordinate_descent.py | 2 +- sklearn/linear_model/logistic.py | 2 +- sklearn/linear_model/ransac.py | 4 +-- sklearn/linear_model/stochastic_gradient.py | 4 +-- sklearn/manifold/locally_linear.py | 2 +- sklearn/mixture/gaussian_mixture.py | 2 +- sklearn/model_selection/_search.py | 2 +- sklearn/multiclass.py | 20 ++++++------- sklearn/multioutput.py | 8 +++--- sklearn/naive_bayes.py | 8 +++--- sklearn/neighbors/base.py | 8 +++--- sklearn/neighbors/nca.py | 2 +- sklearn/neighbors/nearest_centroid.py | 2 +- .../neural_network/multilayer_perceptron.py | 6 ++-- sklearn/neural_network/rbm.py | 6 ++-- sklearn/preprocessing/_discretization.py | 4 +-- sklearn/preprocessing/_encoders.py | 8 +++--- sklearn/preprocessing/data.py | 28 +++++++++---------- sklearn/preprocessing/label.py | 12 ++++---- sklearn/random_projection.py | 2 +- sklearn/semi_supervised/label_propagation.py | 2 +- sklearn/svm/base.py | 4 +-- sklearn/tests/test_metaestimators.py | 2 +- sklearn/tree/tree.py | 12 ++++---- sklearn/utils/tests/test_estimator_checks.py | 2 +- sklearn/utils/tests/test_validation.py | 16 +++++------ sklearn/utils/validation.py | 8 +++--- 67 files changed, 187 insertions(+), 187 deletions(-) diff --git a/benchmarks/bench_plot_nmf.py b/benchmarks/bench_plot_nmf.py index 3ec7cea92cf2d..d8d34d8f952ce 100644 --- a/benchmarks/bench_plot_nmf.py +++ b/benchmarks/bench_plot_nmf.py @@ -213,13 +213,13 @@ def fit(self, X, y=None, **params): return self def transform(self, X): - check_is_fitted(self, 'components_') + check_is_fitted(self) H = self.components_ W, _, self.n_iter_ = self._fit_transform(X, H=H, update_H=False) return W def inverse_transform(self, W): - check_is_fitted(self, 'components_') + check_is_fitted(self) return np.dot(W, self.components_) def fit_transform(self, X, y=None, W=None, H=None): diff --git a/doc/developers/contributing.rst b/doc/developers/contributing.rst index 1ef8f2d03e14c..27d7236bf02d4 100644 --- a/doc/developers/contributing.rst +++ b/doc/developers/contributing.rst @@ -1354,7 +1354,7 @@ the correct interface more easily. ... def predict(self, X): ... ... 
# Check is fit had been called - ... check_is_fitted(self, ['X_', 'y_']) + ... check_is_fitted(self) ... ... # Input validation ... X = check_array(X) diff --git a/sklearn/calibration.py b/sklearn/calibration.py index 2c30cdabcb415..b88a8b8eb37ef 100644 --- a/sklearn/calibration.py +++ b/sklearn/calibration.py @@ -216,7 +216,7 @@ def predict_proba(self, X): C : array, shape (n_samples, n_classes) The predicted probas. """ - check_is_fitted(self, ["classes_", "calibrated_classifiers_"]) + check_is_fitted(self) X = check_array(X, accept_sparse=['csc', 'csr', 'coo'], force_all_finite=False) # Compute the arithmetic mean of the predictions of the calibrated @@ -244,7 +244,7 @@ def predict(self, X): C : array, shape (n_samples,) The predicted class. """ - check_is_fitted(self, ["classes_", "calibrated_classifiers_"]) + check_is_fitted(self) return self.classes_[np.argmax(self.predict_proba(X), axis=1)] diff --git a/sklearn/cluster/_feature_agglomeration.py b/sklearn/cluster/_feature_agglomeration.py index f20b8db7d535c..3b7767feedb00 100644 --- a/sklearn/cluster/_feature_agglomeration.py +++ b/sklearn/cluster/_feature_agglomeration.py @@ -36,7 +36,7 @@ def transform(self, X): Y : array, shape = [n_samples, n_clusters] or [n_clusters] The pooled values for each feature cluster. """ - check_is_fitted(self, "labels_") + check_is_fitted(self) X = check_array(X) if len(self.labels_) != X.shape[1]: @@ -71,7 +71,7 @@ def inverse_transform(self, Xred): A vector of size n_samples with the values of Xred assigned to each of the cluster of samples. """ - check_is_fitted(self, "labels_") + check_is_fitted(self) unil, inverse = np.unique(self.labels_, return_inverse=True) return Xred[..., inverse] diff --git a/sklearn/cluster/affinity_propagation_.py b/sklearn/cluster/affinity_propagation_.py index 487ade4012133..89c6ce9fe8b34 100644 --- a/sklearn/cluster/affinity_propagation_.py +++ b/sklearn/cluster/affinity_propagation_.py @@ -407,7 +407,7 @@ def predict(self, X): labels : ndarray, shape (n_samples,) Cluster labels. """ - check_is_fitted(self, "cluster_centers_indices_") + check_is_fitted(self) if not hasattr(self, "cluster_centers_"): raise ValueError("Predict method is not supported when " "affinity='precomputed'.") diff --git a/sklearn/cluster/birch.py b/sklearn/cluster/birch.py index 27b5038bb67a3..941b833e977f7 100644 --- a/sklearn/cluster/birch.py +++ b/sklearn/cluster/birch.py @@ -534,7 +534,7 @@ def partial_fit(self, X=None, y=None): return self._fit(X) def _check_fit(self, X): - check_is_fitted(self, ['subcluster_centers_', 'partial_fit_'], + check_is_fitted(self, all_or_any=any) if (hasattr(self, 'subcluster_centers_') and @@ -583,7 +583,7 @@ def transform(self, X): X_trans : {array-like, sparse matrix}, shape (n_samples, n_clusters) Transformed data. """ - check_is_fitted(self, 'subcluster_centers_') + check_is_fitted(self) return euclidean_distances(X, self.subcluster_centers_) def _global_clustering(self, X=None): diff --git a/sklearn/cluster/k_means_.py b/sklearn/cluster/k_means_.py index b7fbdf7da3ad1..27bdc934e4c0d 100644 --- a/sklearn/cluster/k_means_.py +++ b/sklearn/cluster/k_means_.py @@ -1033,7 +1033,7 @@ def transform(self, X): X_new : array, shape [n_samples, k] X transformed in the new space. """ - check_is_fitted(self, 'cluster_centers_') + check_is_fitted(self) X = self._check_test_data(X) return self._transform(X) @@ -1063,7 +1063,7 @@ def predict(self, X, sample_weight=None): labels : array, shape [n_samples,] Index of the cluster each sample belongs to. 
""" - check_is_fitted(self, 'cluster_centers_') + check_is_fitted(self) X = self._check_test_data(X) x_squared_norms = row_norms(X, squared=True) @@ -1090,7 +1090,7 @@ def score(self, X, y=None, sample_weight=None): score : float Opposite of the value of X on the K-means objective. """ - check_is_fitted(self, 'cluster_centers_') + check_is_fitted(self) X = self._check_test_data(X) x_squared_norms = row_norms(X, squared=True) @@ -1733,7 +1733,7 @@ def predict(self, X, sample_weight=None): labels : array, shape [n_samples,] Index of the cluster each sample belongs to. """ - check_is_fitted(self, 'cluster_centers_') + check_is_fitted(self) X = self._check_test_data(X) return self._labels_inertia_minibatch(X, sample_weight)[0] diff --git a/sklearn/cluster/mean_shift_.py b/sklearn/cluster/mean_shift_.py index 960ac28984721..e588ccd6df1c8 100644 --- a/sklearn/cluster/mean_shift_.py +++ b/sklearn/cluster/mean_shift_.py @@ -435,6 +435,6 @@ def predict(self, X): labels : array, shape [n_samples,] Index of the cluster each sample belongs to. """ - check_is_fitted(self, "cluster_centers_") + check_is_fitted(self) return pairwise_distances_argmin(X, self.cluster_centers_) diff --git a/sklearn/compose/_column_transformer.py b/sklearn/compose/_column_transformer.py index c0f537776cb6a..1d460b11dc480 100644 --- a/sklearn/compose/_column_transformer.py +++ b/sklearn/compose/_column_transformer.py @@ -341,7 +341,7 @@ def get_feature_names(self): feature_names : list of strings Names of the features produced by transform. """ - check_is_fitted(self, 'transformers_') + check_is_fitted(self) feature_names = [] for name, trans, _, _ in self._iter(fitted=True): if trans == 'drop': @@ -516,7 +516,7 @@ def transform(self, X): sparse matrices. """ - check_is_fitted(self, 'transformers_') + check_is_fitted(self) X = _check_X(X) if self._n_features > X.shape[1]: diff --git a/sklearn/compose/_target.py b/sklearn/compose/_target.py index c1c3f4df4e95f..35b7ed6af962a 100644 --- a/sklearn/compose/_target.py +++ b/sklearn/compose/_target.py @@ -221,7 +221,7 @@ def predict(self, X): Predicted values. """ - check_is_fitted(self, "regressor_") + check_is_fitted(self) pred = self.regressor_.predict(X) if pred.ndim == 1: pred_trans = self.transformer_.inverse_transform( diff --git a/sklearn/covariance/elliptic_envelope.py b/sklearn/covariance/elliptic_envelope.py index 517f9a32dc9af..aa5e01ffa14b0 100644 --- a/sklearn/covariance/elliptic_envelope.py +++ b/sklearn/covariance/elliptic_envelope.py @@ -147,7 +147,7 @@ def decision_function(self, X): compatibility with other outlier detection algorithms. """ - check_is_fitted(self, 'offset_') + check_is_fitted(self) negative_mahal_dist = self.score_samples(X) return negative_mahal_dist - self.offset_ @@ -163,7 +163,7 @@ def score_samples(self, X): negative_mahal_distances : array-like, shape (n_samples, ) Opposite of the Mahalanobis distances. """ - check_is_fitted(self, 'offset_') + check_is_fitted(self) return -self.mahalanobis(X) def predict(self, X): diff --git a/sklearn/cross_decomposition/pls_.py b/sklearn/cross_decomposition/pls_.py index 175a472e6d4fb..94c517992e061 100644 --- a/sklearn/cross_decomposition/pls_.py +++ b/sklearn/cross_decomposition/pls_.py @@ -398,7 +398,7 @@ def transform(self, X, Y=None, copy=True): ------- x_scores if Y is not given, (x_scores, y_scores) otherwise. 
""" - check_is_fitted(self, 'x_mean_') + check_is_fitted(self) X = check_array(X, copy=copy, dtype=FLOAT_DTYPES) # Normalize X -= self.x_mean_ @@ -433,7 +433,7 @@ def predict(self, X, copy=True): This call requires the estimation of a p x q matrix, which may be an issue in high dimensional space. """ - check_is_fitted(self, 'x_mean_') + check_is_fitted(self) X = check_array(X, copy=copy, dtype=FLOAT_DTYPES) # Normalize X -= self.x_mean_ @@ -872,7 +872,7 @@ def transform(self, X, Y=None): Target vectors, where n_samples is the number of samples and n_targets is the number of response variables. """ - check_is_fitted(self, 'x_mean_') + check_is_fitted(self) X = check_array(X, dtype=np.float64) Xr = (X - self.x_mean_) / self.x_std_ x_scores = np.dot(Xr, self.x_weights_) diff --git a/sklearn/decomposition/base.py b/sklearn/decomposition/base.py index 3cbdb29723825..0dad8c6130d68 100644 --- a/sklearn/decomposition/base.py +++ b/sklearn/decomposition/base.py @@ -122,7 +122,7 @@ def transform(self, X): IncrementalPCA(batch_size=3, n_components=2) >>> ipca.transform(X) # doctest: +SKIP """ - check_is_fitted(self, ['mean_', 'components_'], all_or_any=all) + check_is_fitted(self, all_or_any=all) X = check_array(X) if self.mean_ is not None: diff --git a/sklearn/decomposition/dict_learning.py b/sklearn/decomposition/dict_learning.py index 56187948f8554..6fa6d1e7f2d6f 100644 --- a/sklearn/decomposition/dict_learning.py +++ b/sklearn/decomposition/dict_learning.py @@ -911,7 +911,7 @@ def transform(self, X): Transformed data """ - check_is_fitted(self, 'components_') + check_is_fitted(self) X = check_array(X) diff --git a/sklearn/decomposition/factor_analysis.py b/sklearn/decomposition/factor_analysis.py index f9d81737850ff..1306c4245a7f3 100644 --- a/sklearn/decomposition/factor_analysis.py +++ b/sklearn/decomposition/factor_analysis.py @@ -261,7 +261,7 @@ def transform(self, X): X_new : array-like, shape (n_samples, n_components) The latent variables of X. """ - check_is_fitted(self, 'components_') + check_is_fitted(self) X = check_array(X) Ih = np.eye(len(self.components_)) @@ -285,7 +285,7 @@ def get_covariance(self): cov : array, shape (n_features, n_features) Estimated covariance of data. """ - check_is_fitted(self, 'components_') + check_is_fitted(self) cov = np.dot(self.components_.T, self.components_) cov.flat[::len(cov) + 1] += self.noise_variance_ # modify diag inplace @@ -299,7 +299,7 @@ def get_precision(self): precision : array, shape (n_features, n_features) Estimated precision of data. 
""" - check_is_fitted(self, 'components_') + check_is_fitted(self) n_features = self.components_.shape[1] @@ -333,7 +333,7 @@ def score_samples(self, X): ll : array, shape (n_samples,) Log-likelihood of each sample under the current model """ - check_is_fitted(self, 'components_') + check_is_fitted(self) Xr = X - self.mean_ precision = self.get_precision() diff --git a/sklearn/decomposition/fastica_.py b/sklearn/decomposition/fastica_.py index d841926cdfc87..3f6f1af632494 100644 --- a/sklearn/decomposition/fastica_.py +++ b/sklearn/decomposition/fastica_.py @@ -574,7 +574,7 @@ def transform(self, X, copy=True): ------- X_new : array-like, shape (n_samples, n_components) """ - check_is_fitted(self, 'mixing_') + check_is_fitted(self) X = check_array(X, copy=copy, dtype=FLOAT_DTYPES) if self.whiten: @@ -597,7 +597,7 @@ def inverse_transform(self, X, copy=True): ------- X_new : array-like, shape (n_samples, n_features) """ - check_is_fitted(self, 'mixing_') + check_is_fitted(self) X = check_array(X, copy=(copy and self.whiten), dtype=FLOAT_DTYPES) X = np.dot(X, self.mixing_.T) diff --git a/sklearn/decomposition/kernel_pca.py b/sklearn/decomposition/kernel_pca.py index 555bd619c5a62..59785fed3ac0e 100644 --- a/sklearn/decomposition/kernel_pca.py +++ b/sklearn/decomposition/kernel_pca.py @@ -319,7 +319,7 @@ def transform(self, X): ------- X_new : array-like, shape (n_samples, n_components) """ - check_is_fitted(self, 'X_fit_') + check_is_fitted(self) # Compute centered gram matrix between X and training data X_fit_ K = self._centerer.transform(self._get_kernel(X, self.X_fit_)) diff --git a/sklearn/decomposition/nmf.py b/sklearn/decomposition/nmf.py index f64bc34b7fad7..0233688ae696e 100644 --- a/sklearn/decomposition/nmf.py +++ b/sklearn/decomposition/nmf.py @@ -1313,7 +1313,7 @@ def transform(self, X): W : array, shape (n_samples, n_components) Transformed data """ - check_is_fitted(self, 'n_components_') + check_is_fitted(self) W, _, n_iter_ = non_negative_factorization( X=X, W=None, H=self.components_, n_components=self.n_components_, @@ -1340,5 +1340,5 @@ def inverse_transform(self, W): .. versionadded:: 0.18 """ - check_is_fitted(self, 'n_components_') + check_is_fitted(self) return np.dot(W, self.components_) diff --git a/sklearn/decomposition/online_lda.py b/sklearn/decomposition/online_lda.py index c1d482f0a46c6..503cc24692e25 100644 --- a/sklearn/decomposition/online_lda.py +++ b/sklearn/decomposition/online_lda.py @@ -594,7 +594,7 @@ def _unnormalized_transform(self, X): doc_topic_distr : shape=(n_samples, n_components) Document topic distribution for X. """ - check_is_fitted(self, 'components_') + check_is_fitted(self) # make sure feature size is the same in fitted model and in X X = self._check_non_neg_array(X, "LatentDirichletAllocation.transform") @@ -748,7 +748,7 @@ def _perplexity_precomp_distr(self, X, doc_topic_distr=None, score : float Perplexity score. 
""" - check_is_fitted(self, 'components_') + check_is_fitted(self) X = self._check_non_neg_array(X, "LatentDirichletAllocation.perplexity") diff --git a/sklearn/decomposition/pca.py b/sklearn/decomposition/pca.py index 99e392020abaf..1bf3d6e6b19e6 100644 --- a/sklearn/decomposition/pca.py +++ b/sklearn/decomposition/pca.py @@ -569,7 +569,7 @@ def score_samples(self, X): ll : array, shape (n_samples,) Log-likelihood of each sample under the current model """ - check_is_fitted(self, 'mean_') + check_is_fitted(self) X = check_array(X) Xr = X - self.mean_ diff --git a/sklearn/decomposition/sparse_pca.py b/sklearn/decomposition/sparse_pca.py index 238f6cc4ef403..3ca14cb528bb8 100644 --- a/sklearn/decomposition/sparse_pca.py +++ b/sklearn/decomposition/sparse_pca.py @@ -221,7 +221,7 @@ def transform(self, X): X_new array, shape (n_samples, n_components) Transformed data. """ - check_is_fitted(self, 'components_') + check_is_fitted(self) X = check_array(X) X = X - self.mean_ diff --git a/sklearn/discriminant_analysis.py b/sklearn/discriminant_analysis.py index 9634b303ea946..4a3542e204288 100644 --- a/sklearn/discriminant_analysis.py +++ b/sklearn/discriminant_analysis.py @@ -505,7 +505,7 @@ def transform(self, X): if self.solver == 'lsqr': raise NotImplementedError("transform not implemented for 'lsqr' " "solver (use 'svd' or 'eigen').") - check_is_fitted(self, ['xbar_', 'scalings_'], all_or_any=any) + check_is_fitted(self, all_or_any=any) X = check_array(X) if self.solver == 'svd': @@ -528,7 +528,7 @@ def predict_proba(self, X): C : array, shape (n_samples, n_classes) Estimated probabilities. """ - check_is_fitted(self, 'classes_') + check_is_fitted(self) decision = self.decision_function(X) if self.classes_.size == 2: @@ -704,7 +704,7 @@ def fit(self, X, y): return self def _decision_function(self, X): - check_is_fitted(self, 'classes_') + check_is_fitted(self) X = check_array(X) norm2 = [] diff --git a/sklearn/dummy.py b/sklearn/dummy.py index 02d8a448c9766..067a956f6435d 100644 --- a/sklearn/dummy.py +++ b/sklearn/dummy.py @@ -171,7 +171,7 @@ def predict(self, X): y : array, shape = [n_samples] or [n_samples, n_outputs] Predicted target values for X. """ - check_is_fitted(self, 'classes_') + check_is_fitted(self) # numpy random_state expects Python int and not long as size argument # under Windows @@ -249,7 +249,7 @@ def predict_proba(self, X): the model, where classes are ordered arithmetically, for each output. """ - check_is_fitted(self, 'classes_') + check_is_fitted(self) # numpy random_state expects Python int and not long as size argument # under Windows @@ -498,7 +498,7 @@ def predict(self, X, return_std=False): y_std : array, shape = [n_samples] or [n_samples, n_outputs] Standard deviation of predictive distribution of query points. """ - check_is_fitted(self, "constant_") + check_is_fitted(self) n_samples = _num_samples(X) y = np.full((n_samples, self.n_outputs_), self.constant_, diff --git a/sklearn/ensemble/_hist_gradient_boosting/binning.py b/sklearn/ensemble/_hist_gradient_boosting/binning.py index a7ddc9a3ebb47..b35b2a2083b03 100644 --- a/sklearn/ensemble/_hist_gradient_boosting/binning.py +++ b/sklearn/ensemble/_hist_gradient_boosting/binning.py @@ -143,7 +143,7 @@ def transform(self, X): The binned data (fortran-aligned). 
""" X = check_array(X, dtype=[X_DTYPE], force_all_finite=False) - check_is_fitted(self, ['bin_thresholds_', 'actual_n_bins_']) + check_is_fitted(self) if X.shape[1] != self.actual_n_bins_.shape[0]: raise ValueError( 'This estimator was fitted with {} features but {} got passed ' diff --git a/sklearn/ensemble/bagging.py b/sklearn/ensemble/bagging.py index 15096afefa810..7829b99b050f1 100644 --- a/sklearn/ensemble/bagging.py +++ b/sklearn/ensemble/bagging.py @@ -672,7 +672,7 @@ def predict_proba(self, X): The class probabilities of the input samples. The order of the classes corresponds to that in the attribute `classes_`. """ - check_is_fitted(self, "classes_") + check_is_fitted(self) # Check data X = check_array( X, accept_sparse=['csr', 'csc'], dtype=None, @@ -722,7 +722,7 @@ def predict_log_proba(self, X): The class log-probabilities of the input samples. The order of the classes corresponds to that in the attribute `classes_`. """ - check_is_fitted(self, "classes_") + check_is_fitted(self) if hasattr(self.base_estimator_, "predict_log_proba"): # Check data X = check_array( @@ -780,7 +780,7 @@ def decision_function(self, X): cases with ``k == 1``, otherwise ``k==n_classes``. """ - check_is_fitted(self, "classes_") + check_is_fitted(self) # Check data X = check_array( @@ -965,7 +965,7 @@ def predict(self, X): y : array of shape = [n_samples] The predicted values. """ - check_is_fitted(self, "estimators_features_") + check_is_fitted(self) # Check data X = check_array( X, accept_sparse=['csr', 'csc'], dtype=None, diff --git a/sklearn/ensemble/forest.py b/sklearn/ensemble/forest.py index a3513fdf32e41..b0fff1f6c9181 100644 --- a/sklearn/ensemble/forest.py +++ b/sklearn/ensemble/forest.py @@ -346,7 +346,7 @@ def _validate_y_class_weight(self, y): def _validate_X_predict(self, X): """Validate X whenever one tries to predict, apply, predict_proba""" - check_is_fitted(self, 'estimators_') + check_is_fitted(self) return self.estimators_[0]._validate_X_predict(X, check_input=True) @@ -362,7 +362,7 @@ def feature_importances_(self): trees consisting of only the root node, in which case it will be an array of zeros. """ - check_is_fitted(self, 'estimators_') + check_is_fitted(self) all_importances = Parallel(n_jobs=self.n_jobs, **_joblib_parallel_args(prefer='threads'))( @@ -575,7 +575,7 @@ class in a leaf. The class probabilities of the input samples. The order of the classes corresponds to that in the attribute `classes_`. """ - check_is_fitted(self, 'estimators_') + check_is_fitted(self) # Check data X = self._validate_X_predict(X) @@ -680,7 +680,7 @@ def predict(self, X): y : array of shape = [n_samples] or [n_samples, n_outputs] The predicted values. """ - check_is_fitted(self, 'estimators_') + check_is_fitted(self) # Check data X = self._validate_X_predict(X) @@ -2026,5 +2026,5 @@ def transform(self, X): X_transformed : sparse matrix, shape=(n_samples, n_out) Transformed dataset. 
""" - check_is_fitted(self, 'one_hot_encoder_') + check_is_fitted(self) return self.one_hot_encoder_.transform(self.apply(X)) diff --git a/sklearn/ensemble/gradient_boosting.py b/sklearn/ensemble/gradient_boosting.py index cc43df36ba608..11813855d01d8 100644 --- a/sklearn/ensemble/gradient_boosting.py +++ b/sklearn/ensemble/gradient_boosting.py @@ -1390,7 +1390,7 @@ def _is_initialized(self): def _check_initialized(self): """Check that the estimator is initialized, raising an error if not.""" - check_is_fitted(self, 'estimators_') + check_is_fitted(self) def fit(self, X, y, sample_weight=None, monitor=None): """Fit the gradient boosting model. @@ -1741,7 +1741,7 @@ def _compute_partial_dependence_recursion(self, grid, target_features): (n_trees_per_iteration, n_samples) The value of the partial dependence function on each grid point. """ - check_is_fitted(self, 'estimators_', + check_is_fitted(self, msg="'estimator' parameter must be a fitted estimator") if self.init is not None: warnings.warn( diff --git a/sklearn/ensemble/iforest.py b/sklearn/ensemble/iforest.py index 8aaae2925ccaf..4cdeb9673ccdb 100644 --- a/sklearn/ensemble/iforest.py +++ b/sklearn/ensemble/iforest.py @@ -303,7 +303,7 @@ def predict(self, X): For each observation, tells whether or not (+1 or -1) it should be considered as an inlier according to the fitted model. """ - check_is_fitted(self, ["offset_"]) + check_is_fitted(self) X = check_array(X, accept_sparse='csr') is_inlier = np.ones(X.shape[0], dtype=int) is_inlier[self.decision_function(X) < 0] = -1 @@ -365,7 +365,7 @@ def score_samples(self, X): The lower, the more abnormal. """ # code structure from ForestClassifier/predict_proba - check_is_fitted(self, ["estimators_"]) + check_is_fitted(self) # Check data X = check_array(X, accept_sparse='csr') diff --git a/sklearn/ensemble/voting.py b/sklearn/ensemble/voting.py index 7900d28c1f782..69381a39d9ce3 100644 --- a/sklearn/ensemble/voting.py +++ b/sklearn/ensemble/voting.py @@ -296,7 +296,7 @@ def predict(self, X): Predicted class labels. """ - check_is_fitted(self, 'estimators_') + check_is_fitted(self) if self.voting == 'soft': maj = np.argmax(self.predict_proba(X), axis=1) @@ -317,7 +317,7 @@ def _collect_probas(self, X): def _predict_proba(self, X): """Predict class probabilities for X in 'soft' voting """ - check_is_fitted(self, 'estimators_') + check_is_fitted(self) avg = np.average(self._collect_probas(X), axis=0, weights=self._weights_not_none) return avg @@ -363,7 +363,7 @@ def transform(self, X): array-like of shape (n_samples, n_classifiers), being class labels predicted by each classifier. """ - check_is_fitted(self, 'estimators_') + check_is_fitted(self) if self.voting == 'soft': probas = self._collect_probas(X) @@ -477,7 +477,7 @@ def predict(self, X): y : array of shape (n_samples,) The predicted values. """ - check_is_fitted(self, "estimators_") + check_is_fitted(self) return np.average(self._predict(X), axis=1, weights=self._weights_not_none) @@ -495,5 +495,5 @@ def transform(self, X): array-like of shape (n_samples, n_classifiers), being values predicted by each regressor. 
""" - check_is_fitted(self, 'estimators_') + check_is_fitted(self) return self._predict(X) diff --git a/sklearn/ensemble/weight_boosting.py b/sklearn/ensemble/weight_boosting.py index 2d0ba849f9990..9d3f1611a9d70 100644 --- a/sklearn/ensemble/weight_boosting.py +++ b/sklearn/ensemble/weight_boosting.py @@ -674,7 +674,7 @@ def decision_function(self, X): values closer to -1 or 1 mean more like the first or second class in ``classes_``, respectively. """ - check_is_fitted(self, "n_classes_") + check_is_fitted(self) X = self._validate_data(X) n_classes = self.n_classes_ @@ -717,7 +717,7 @@ def staged_decision_function(self, X): values closer to -1 or 1 mean more like the first or second class in ``classes_``, respectively. """ - check_is_fitted(self, "n_classes_") + check_is_fitted(self) X = self._validate_data(X) n_classes = self.n_classes_ @@ -786,7 +786,7 @@ def predict_proba(self, X): The class probabilities of the input samples. The order of outputs is the same of that of the `classes_` attribute. """ - check_is_fitted(self, "n_classes_") + check_is_fitted(self) X = self._validate_data(X) n_classes = self.n_classes_ @@ -1109,7 +1109,7 @@ def predict(self, X): y : array of shape = [n_samples] The predicted regression values. """ - check_is_fitted(self, "estimator_weights_") + check_is_fitted(self) X = self._validate_data(X) return self._get_median_predict(X, len(self.estimators_)) @@ -1134,7 +1134,7 @@ def staged_predict(self, X): y : generator of array, shape = [n_samples] The predicted regression values. """ - check_is_fitted(self, "estimator_weights_") + check_is_fitted(self) X = self._validate_data(X) for i, _ in enumerate(self.estimators_, 1): diff --git a/sklearn/feature_extraction/text.py b/sklearn/feature_extraction/text.py index ed4d41cc464f8..01a7b70587f3d 100644 --- a/sklearn/feature_extraction/text.py +++ b/sklearn/feature_extraction/text.py @@ -452,7 +452,7 @@ def _validate_vocabulary(self): def _check_vocabulary(self): """Check if vocabulary is empty or missing (not fit-ed)""" msg = "%(name)s - Vocabulary wasn't fitted." - check_is_fitted(self, 'vocabulary_', msg=msg), + check_is_fitted(self, msg=msg), if len(self.vocabulary_) == 0: raise ValueError("Vocabulary is empty") @@ -1380,7 +1380,7 @@ def transform(self, X, copy=True): X.data += 1 if self.use_idf: - check_is_fitted(self, '_idf_diag', 'idf vector is not fitted') + check_is_fitted(self, 'idf vector is not fitted') expected_n_features = self._idf_diag.shape[0] if n_features != expected_n_features: @@ -1749,7 +1749,7 @@ def transform(self, raw_documents, copy=True): X : sparse matrix, [n_samples, n_features] Tf-idf-weighted document-term matrix. """ - check_is_fitted(self, '_tfidf', 'The tfidf vector is not fitted') + check_is_fitted(self, 'The tfidf vector is not fitted') X = super().transform(raw_documents) return self._tfidf.transform(X, copy=False) diff --git a/sklearn/feature_selection/rfe.py b/sklearn/feature_selection/rfe.py index ce4eb5ed8bd45..4e957e8463a7c 100644 --- a/sklearn/feature_selection/rfe.py +++ b/sklearn/feature_selection/rfe.py @@ -242,7 +242,7 @@ def predict(self, X): y : array of shape [n_samples] The predicted target values. """ - check_is_fitted(self, 'estimator_') + check_is_fitted(self) return self.estimator_.predict(self.transform(X)) @if_delegate_has_method(delegate='estimator') @@ -258,11 +258,11 @@ def score(self, X, y): y : array of shape [n_samples] The target values. 
""" - check_is_fitted(self, 'estimator_') + check_is_fitted(self) return self.estimator_.score(self.transform(X), y) def _get_support_mask(self): - check_is_fitted(self, 'support_') + check_is_fitted(self) return self.support_ @if_delegate_has_method(delegate='estimator') @@ -284,7 +284,7 @@ def decision_function(self, X): Regression and binary classification produce an array of shape [n_samples]. """ - check_is_fitted(self, 'estimator_') + check_is_fitted(self) return self.estimator_.decision_function(self.transform(X)) @if_delegate_has_method(delegate='estimator') @@ -304,7 +304,7 @@ def predict_proba(self, X): The class probabilities of the input samples. The order of the classes corresponds to that in the attribute `classes_`. """ - check_is_fitted(self, 'estimator_') + check_is_fitted(self) return self.estimator_.predict_proba(self.transform(X)) @if_delegate_has_method(delegate='estimator') @@ -322,7 +322,7 @@ def predict_log_proba(self, X): The class log-probabilities of the input samples. The order of the classes corresponds to that in the attribute `classes_`. """ - check_is_fitted(self, 'estimator_') + check_is_fitted(self) return self.estimator_.predict_log_proba(self.transform(X)) def _more_tags(self): diff --git a/sklearn/feature_selection/univariate_selection.py b/sklearn/feature_selection/univariate_selection.py index 554cb3d392b29..5b1cae1823e9c 100644 --- a/sklearn/feature_selection/univariate_selection.py +++ b/sklearn/feature_selection/univariate_selection.py @@ -429,7 +429,7 @@ def _check_params(self, X, y): % self.percentile) def _get_support_mask(self): - check_is_fitted(self, 'scores_') + check_is_fitted(self) # Cater for NaNs if self.percentile == 100: @@ -514,7 +514,7 @@ def _check_params(self, X, y): % (X.shape[1], self.k)) def _get_support_mask(self): - check_is_fitted(self, 'scores_') + check_is_fitted(self) if self.k == 'all': return np.ones(self.scores_.shape, dtype=bool) @@ -587,7 +587,7 @@ def __init__(self, score_func=f_classif, alpha=5e-2): self.alpha = alpha def _get_support_mask(self): - check_is_fitted(self, 'scores_') + check_is_fitted(self) return self.pvalues_ < self.alpha @@ -653,7 +653,7 @@ def __init__(self, score_func=f_classif, alpha=5e-2): self.alpha = alpha def _get_support_mask(self): - check_is_fitted(self, 'scores_') + check_is_fitted(self) n_features = len(self.pvalues_) sv = np.sort(self.pvalues_) @@ -716,7 +716,7 @@ def __init__(self, score_func=f_classif, alpha=5e-2): self.alpha = alpha def _get_support_mask(self): - check_is_fitted(self, 'scores_') + check_is_fitted(self) return (self.pvalues_ < self.alpha / len(self.pvalues_)) @@ -811,7 +811,7 @@ def _check_params(self, X, y): self._make_selector()._check_params(X, y) def _get_support_mask(self): - check_is_fitted(self, 'scores_') + check_is_fitted(self) selector = self._make_selector() selector.pvalues_ = self.pvalues_ diff --git a/sklearn/feature_selection/variance_threshold.py b/sklearn/feature_selection/variance_threshold.py index 7d98de82c9711..c9eb973dc86c3 100644 --- a/sklearn/feature_selection/variance_threshold.py +++ b/sklearn/feature_selection/variance_threshold.py @@ -87,6 +87,6 @@ def fit(self, X, y=None): return self def _get_support_mask(self): - check_is_fitted(self, 'variances_') + check_is_fitted(self) return self.variances_ > self.threshold diff --git a/sklearn/gaussian_process/gpc.py b/sklearn/gaussian_process/gpc.py index 6270409f129b1..5421f7e408472 100644 --- a/sklearn/gaussian_process/gpc.py +++ b/sklearn/gaussian_process/gpc.py @@ -255,7 +255,7 @@ def 
predict(self, X): C : array, shape = (n_samples,) Predicted target values for X, values are from ``classes_`` """ - check_is_fitted(self, ["X_train_", "y_train_", "pi_", "W_sr_", "L_"]) + check_is_fitted(self) # As discussed on Section 3.4.2 of GPML, for making hard binary # decisions, it is enough to compute the MAP of the posterior and @@ -279,7 +279,7 @@ def predict_proba(self, X): the model. The columns correspond to the classes in sorted order, as they appear in the attribute ``classes_``. """ - check_is_fitted(self, ["X_train_", "y_train_", "pi_", "W_sr_", "L_"]) + check_is_fitted(self) # Based on Algorithm 3.2 of GPML K_star = self.kernel_(self.X_train_, X) # K_star =k(x_star) @@ -663,7 +663,7 @@ def predict(self, X): C : array, shape = (n_samples,) Predicted target values for X, values are from ``classes_`` """ - check_is_fitted(self, ["classes_", "n_classes_"]) + check_is_fitted(self) X = check_array(X) return self.base_estimator_.predict(X) @@ -681,7 +681,7 @@ def predict_proba(self, X): the model. The columns correspond to the classes in sorted order, as they appear in the attribute `classes_`. """ - check_is_fitted(self, ["classes_", "n_classes_"]) + check_is_fitted(self) if self.n_classes_ > 2 and self.multi_class == "one_vs_one": raise ValueError("one_vs_one multi-class mode does not support " "predicting probability estimates. Use " @@ -735,7 +735,7 @@ def log_marginal_likelihood(self, theta=None, eval_gradient=False, hyperparameters at position theta. Only returned when eval_gradient is True. """ - check_is_fitted(self, ["classes_", "n_classes_"]) + check_is_fitted(self) if theta is None: if eval_gradient: diff --git a/sklearn/impute/_base.py b/sklearn/impute/_base.py index 88516f70f2e66..e56802bc74326 100644 --- a/sklearn/impute/_base.py +++ b/sklearn/impute/_base.py @@ -373,7 +373,7 @@ def transform(self, X): X : {array-like, sparse matrix}, shape (n_samples, n_features) The input data to complete. """ - check_is_fitted(self, 'statistics_') + check_is_fitted(self) X = self._validate_input(X) @@ -653,7 +653,7 @@ def transform(self, X): will be boolean. """ - check_is_fitted(self, "features_") + check_is_fitted(self) X = self._validate_input(X) if X.shape[1] != self._n_features: diff --git a/sklearn/impute/_iterative.py b/sklearn/impute/_iterative.py index ecf94d5ccfb57..05e2f1484fccf 100644 --- a/sklearn/impute/_iterative.py +++ b/sklearn/impute/_iterative.py @@ -627,7 +627,7 @@ def transform(self, X): Xt : array-like, shape (n_samples, n_features) The imputed input data. """ - check_is_fitted(self, 'initial_imputer_') + check_is_fitted(self) if self.add_indicator: X_trans_indicator = self.indicator_.transform(X) diff --git a/sklearn/kernel_approximation.py b/sklearn/kernel_approximation.py index 1ef79f48a0459..9d257427944dc 100644 --- a/sklearn/kernel_approximation.py +++ b/sklearn/kernel_approximation.py @@ -115,7 +115,7 @@ def transform(self, X): ------- X_new : array-like, shape (n_samples, n_components) """ - check_is_fitted(self, 'random_weights_') + check_is_fitted(self) X = check_array(X, accept_sparse='csr') projection = safe_sparse_dot(X, self.random_weights_) @@ -222,7 +222,7 @@ def transform(self, X): ------- X_new : array-like, shape (n_samples, n_components) """ - check_is_fitted(self, 'random_weights_') + check_is_fitted(self) X = as_float_array(X, copy=True) X = check_array(X, copy=False) @@ -580,7 +580,7 @@ def transform(self, X): X_transformed : array, shape=(n_samples, n_components) Transformed data. 
""" - check_is_fitted(self, 'components_') + check_is_fitted(self) X = check_array(X, accept_sparse='csr') kernel_params = self._get_kernel_params() diff --git a/sklearn/kernel_ridge.py b/sklearn/kernel_ridge.py index 392ae265a8f20..3d69066e342d6 100644 --- a/sklearn/kernel_ridge.py +++ b/sklearn/kernel_ridge.py @@ -188,6 +188,6 @@ def predict(self, X): C : array, shape = [n_samples] or [n_samples, n_targets] Returns predicted values. """ - check_is_fitted(self, ["X_fit_", "dual_coef_"]) + check_is_fitted(self) K = self._get_kernel(X, self.X_fit_) return np.dot(K, self.dual_coef_) diff --git a/sklearn/linear_model/base.py b/sklearn/linear_model/base.py index 51ff3a2d1588a..b408c8569529d 100644 --- a/sklearn/linear_model/base.py +++ b/sklearn/linear_model/base.py @@ -199,7 +199,7 @@ def fit(self, X, y): """Fit model.""" def _decision_function(self, X): - check_is_fitted(self, "coef_") + check_is_fitted(self) X = check_array(X, accept_sparse=['csr', 'csc', 'coo']) return safe_sparse_dot(X, self.coef_.T, @@ -258,7 +258,7 @@ def decision_function(self, X): case, confidence score for self.classes_[1] where >0 means this class would be predicted. """ - check_is_fitted(self, 'coef_') + check_is_fitted(self) X = check_array(X, accept_sparse='csr') diff --git a/sklearn/linear_model/coordinate_descent.py b/sklearn/linear_model/coordinate_descent.py index 646839a0a3ae6..1aebbfa5ba54e 100644 --- a/sklearn/linear_model/coordinate_descent.py +++ b/sklearn/linear_model/coordinate_descent.py @@ -785,7 +785,7 @@ def _decision_function(self, X): T : array, shape (n_samples,) The predicted decision function """ - check_is_fitted(self, 'n_iter_') + check_is_fitted(self) if sparse.isspmatrix(X): return safe_sparse_dot(X, self.coef_.T, dense_output=True) + self.intercept_ diff --git a/sklearn/linear_model/logistic.py b/sklearn/linear_model/logistic.py index 1ad01e5ddc656..432a5a7db3c0d 100644 --- a/sklearn/linear_model/logistic.py +++ b/sklearn/linear_model/logistic.py @@ -1628,7 +1628,7 @@ def predict_proba(self, X): Returns the probability of the sample for each class in the model, where classes are ordered as they are in ``self.classes_``. """ - check_is_fitted(self, 'coef_') + check_is_fitted(self) ovr = (self.multi_class in ["ovr", "warn"] or (self.multi_class == 'auto' and (self.classes_.size <= 2 or diff --git a/sklearn/linear_model/ransac.py b/sklearn/linear_model/ransac.py index b901e848f49bf..e868a31d17c8d 100644 --- a/sklearn/linear_model/ransac.py +++ b/sklearn/linear_model/ransac.py @@ -466,7 +466,7 @@ def predict(self, X): y : array, shape = [n_samples] or [n_samples, n_targets] Returns predicted values. """ - check_is_fitted(self, 'estimator_') + check_is_fitted(self) return self.estimator_.predict(X) @@ -488,6 +488,6 @@ def score(self, X, y): z : float Score of the prediction. """ - check_is_fitted(self, 'estimator_') + check_is_fitted(self) return self.estimator_.score(X, y) diff --git a/sklearn/linear_model/stochastic_gradient.py b/sklearn/linear_model/stochastic_gradient.py index afad2e94ed8c1..e80a6a7ec3ce4 100644 --- a/sklearn/linear_model/stochastic_gradient.py +++ b/sklearn/linear_model/stochastic_gradient.py @@ -982,7 +982,7 @@ def predict_proba(self): return self._predict_proba def _predict_proba(self, X): - check_is_fitted(self, "t_") + check_is_fitted(self) if self.loss == "log": return self._predict_proba_lr(X) @@ -1216,7 +1216,7 @@ def _decision_function(self, X): array, shape (n_samples,) Predicted target values per element in X. 
""" - check_is_fitted(self, ["t_", "coef_", "intercept_"], all_or_any=all) + check_is_fitted(self, all_or_any=all) X = check_array(X, accept_sparse='csr') diff --git a/sklearn/manifold/locally_linear.py b/sklearn/manifold/locally_linear.py index cf3c58486c27a..4e90d4876f4df 100644 --- a/sklearn/manifold/locally_linear.py +++ b/sklearn/manifold/locally_linear.py @@ -717,7 +717,7 @@ def transform(self, X): Because of scaling performed by this method, it is discouraged to use it together with methods that are not scale-invariant (like SVMs) """ - check_is_fitted(self, "nbrs_") + check_is_fitted(self) X = check_array(X) ind = self.nbrs_.kneighbors(X, n_neighbors=self.n_neighbors, diff --git a/sklearn/mixture/gaussian_mixture.py b/sklearn/mixture/gaussian_mixture.py index 120b72f06cd01..610af54cc343a 100644 --- a/sklearn/mixture/gaussian_mixture.py +++ b/sklearn/mixture/gaussian_mixture.py @@ -687,7 +687,7 @@ def _compute_lower_bound(self, _, log_prob_norm): return log_prob_norm def _check_is_fitted(self): - check_is_fitted(self, ['weights_', 'means_', 'precisions_cholesky_']) + check_is_fitted(self) def _get_parameters(self): return (self.weights_, self.means_, self.covariances_, diff --git a/sklearn/model_selection/_search.py b/sklearn/model_selection/_search.py index 208ab536c8181..74284f3bdb2fd 100644 --- a/sklearn/model_selection/_search.py +++ b/sklearn/model_selection/_search.py @@ -437,7 +437,7 @@ def _check_is_fitted(self, method_name): 'attribute' % (type(self).__name__, method_name)) else: - check_is_fitted(self, 'best_estimator_') + check_is_fitted(self) @if_delegate_has_method(delegate=('best_estimator_', 'estimator')) def predict(self, X): diff --git a/sklearn/multiclass.py b/sklearn/multiclass.py index 6315197ad7856..5a8dcebd4170b 100644 --- a/sklearn/multiclass.py +++ b/sklearn/multiclass.py @@ -113,17 +113,17 @@ def fit(self, X, y): return self def predict(self, X): - check_is_fitted(self, 'y_') + check_is_fitted(self) return np.repeat(self.y_, X.shape[0]) def decision_function(self, X): - check_is_fitted(self, 'y_') + check_is_fitted(self) return np.repeat(self.y_, X.shape[0]) def predict_proba(self, X): - check_is_fitted(self, 'y_') + check_is_fitted(self) return np.repeat([np.hstack([1 - self.y_, self.y_])], X.shape[0], axis=0) @@ -285,7 +285,7 @@ def predict(self, X): y : (sparse) array-like, shape = [n_samples, ], [n_samples, n_classes]. Predicted multi-class targets. """ - check_is_fitted(self, 'estimators_') + check_is_fitted(self) n_samples = _num_samples(X) if self.label_binarizer_.y_type_ == "multiclass": @@ -337,7 +337,7 @@ def predict_proba(self, X): Returns the probability of the sample for each class in the model, where classes are ordered as they are in `self.classes_`. """ - check_is_fitted(self, 'estimators_') + check_is_fitted(self) # Y[i, j] gives the probability that sample i has the label j. # In the multi-label case, these are not disjoint. 
Y = np.array([e.predict_proba(X)[:, 1] for e in self.estimators_]).T @@ -366,7 +366,7 @@ def decision_function(self, X): ------- T : array-like, shape = [n_samples, n_classes] """ - check_is_fitted(self, 'estimators_') + check_is_fitted(self) if len(self.estimators_) == 1: return self.estimators_[0].decision_function(X) return np.array([est.decision_function(X).ravel() @@ -383,7 +383,7 @@ def n_classes_(self): @property def coef_(self): - check_is_fitted(self, 'estimators_') + check_is_fitted(self) if not hasattr(self.estimators_[0], "coef_"): raise AttributeError( "Base estimator doesn't have a coef_ attribute.") @@ -394,7 +394,7 @@ def coef_(self): @property def intercept_(self): - check_is_fitted(self, 'estimators_') + check_is_fitted(self) if not hasattr(self.estimators_[0], "intercept_"): raise AttributeError( "Base estimator doesn't have an intercept_ attribute.") @@ -603,7 +603,7 @@ def decision_function(self, X): ------- Y : array-like, shape = [n_samples, n_classes] """ - check_is_fitted(self, 'estimators_') + check_is_fitted(self) indices = self.pairwise_indices_ if indices is None: @@ -768,7 +768,7 @@ def predict(self, X): y : numpy array of shape [n_samples] Predicted multi-class targets. """ - check_is_fitted(self, 'estimators_') + check_is_fitted(self) X = check_array(X) Y = np.array([_predict_binary(e, X) for e in self.estimators_]).T pred = euclidean_distances(Y, self.code_book_).argmin(axis=1) diff --git a/sklearn/multioutput.py b/sklearn/multioutput.py index 4411919c1821f..463b72d40f47a 100644 --- a/sklearn/multioutput.py +++ b/sklearn/multioutput.py @@ -185,7 +185,7 @@ def predict(self, X): Multi-output targets predicted across multiple predictors. Note: Separate models are generated for each predictor. """ - check_is_fitted(self, 'estimators_') + check_is_fitted(self) if not hasattr(self.estimator, "predict"): raise ValueError("The base estimator should implement" " a predict method") @@ -344,7 +344,7 @@ def predict_proba(self, X): The class probabilities of the input samples. The order of the classes corresponds to that in the attribute `classes_`. """ - check_is_fitted(self, 'estimators_') + check_is_fitted(self) if not all([hasattr(estimator, "predict_proba") for estimator in self.estimators_]): raise ValueError("The base estimator should implement " @@ -370,7 +370,7 @@ def score(self, X, y): scores : float accuracy_score of self.predict(X) versus y """ - check_is_fitted(self, 'estimators_') + check_is_fitted(self) n_outputs_ = len(self.estimators_) if y.ndim == 1: raise ValueError("y must have at least two dimensions for " @@ -472,7 +472,7 @@ def predict(self, X): The predicted values. 
""" - check_is_fitted(self, 'estimators_') + check_is_fitted(self) X = check_array(X, accept_sparse=True) Y_pred_chain = np.zeros((X.shape[0], len(self.estimators_))) for chain_idx, estimator in enumerate(self.estimators_): diff --git a/sklearn/naive_bayes.py b/sklearn/naive_bayes.py index b3518c8f22e0c..aa14a7f085828 100644 --- a/sklearn/naive_bayes.py +++ b/sklearn/naive_bayes.py @@ -431,7 +431,7 @@ def _partial_fit(self, X, y, classes=None, _refit=False, return self def _joint_log_likelihood(self, X): - check_is_fitted(self, "classes_") + check_is_fitted(self) X = check_array(X) joint_log_likelihood = [] @@ -743,7 +743,7 @@ def _update_feature_log_prob(self, alpha): def _joint_log_likelihood(self, X): """Calculate the posterior log probability of the samples X""" - check_is_fitted(self, "classes_") + check_is_fitted(self) X = check_array(X, accept_sparse='csr') return (safe_sparse_dot(X, self.feature_log_prob_.T) + @@ -852,7 +852,7 @@ def _update_feature_log_prob(self, alpha): def _joint_log_likelihood(self, X): """Calculate the class scores for the samples in X.""" - check_is_fitted(self, "classes_") + check_is_fitted(self) X = check_array(X, accept_sparse="csr") jll = safe_sparse_dot(X, self.feature_log_prob_.T) @@ -963,7 +963,7 @@ def _update_feature_log_prob(self, alpha): def _joint_log_likelihood(self, X): """Calculate the posterior log probability of the samples X""" - check_is_fitted(self, "classes_") + check_is_fitted(self) X = check_array(X, accept_sparse='csr') diff --git a/sklearn/neighbors/base.py b/sklearn/neighbors/base.py index 9a05eb62c2afc..041c13aae5417 100644 --- a/sklearn/neighbors/base.py +++ b/sklearn/neighbors/base.py @@ -388,7 +388,7 @@ class from an array representing our data set and ask who's [2]]...) """ - check_is_fitted(self, ["_fit_method", "_fit_X"], all_or_any=any) + check_is_fitted(self, all_or_any=any) if n_neighbors is None: n_neighbors = self.n_neighbors @@ -543,7 +543,7 @@ def kneighbors_graph(self, X=None, n_neighbors=None, -------- NearestNeighbors.radius_neighbors_graph """ - check_is_fitted(self, ["_fit_method", "_fit_X"], all_or_any=any) + check_is_fitted(self, all_or_any=any) if n_neighbors is None: n_neighbors = self.n_neighbors @@ -691,7 +691,7 @@ class from an array representing our data set and ask who's For efficiency, `radius_neighbors` returns arrays of objects, where each object is a 1D array of indices or distances. """ - check_is_fitted(self, ["_fit_method", "_fit_X"], all_or_any=any) + check_is_fitted(self, all_or_any=any) if X is not None: query_is_train = False @@ -828,7 +828,7 @@ def radius_neighbors_graph(self, X=None, radius=None, mode='connectivity'): -------- kneighbors_graph """ - check_is_fitted(self, ["_fit_method", "_fit_X"], all_or_any=any) + check_is_fitted(self, all_or_any=any) if X is not None: X = check_array(X, accept_sparse=['csr', 'csc', 'coo']) diff --git a/sklearn/neighbors/nca.py b/sklearn/neighbors/nca.py index 5060270ce1e61..68a72c92da865 100644 --- a/sklearn/neighbors/nca.py +++ b/sklearn/neighbors/nca.py @@ -258,7 +258,7 @@ def transform(self, X): If :meth:`fit` has not been called before. 
""" - check_is_fitted(self, ['components_']) + check_is_fitted(self) X = check_array(X) return np.dot(X, self.components_.T) diff --git a/sklearn/neighbors/nearest_centroid.py b/sklearn/neighbors/nearest_centroid.py index 5626189222e5a..3e1577469c920 100644 --- a/sklearn/neighbors/nearest_centroid.py +++ b/sklearn/neighbors/nearest_centroid.py @@ -191,7 +191,7 @@ def predict(self, X): be the distance matrix between the data to be predicted and ``self.centroids_``. """ - check_is_fitted(self, 'centroids_') + check_is_fitted(self) X = check_array(X, accept_sparse='csr') return self.classes_[pairwise_distances( diff --git a/sklearn/neural_network/multilayer_perceptron.py b/sklearn/neural_network/multilayer_perceptron.py index ebe5f03801ed5..11e1c4a3ab793 100644 --- a/sklearn/neural_network/multilayer_perceptron.py +++ b/sklearn/neural_network/multilayer_perceptron.py @@ -970,7 +970,7 @@ def predict(self, X): y : array-like, shape (n_samples,) or (n_samples, n_classes) The predicted classes. """ - check_is_fitted(self, "coefs_") + check_is_fitted(self) y_pred = self._predict(X) if self.n_outputs_ == 1: @@ -1071,7 +1071,7 @@ def predict_proba(self, X): The predicted probability of the sample for each class in the model, where classes are ordered as they are in `self.classes_`. """ - check_is_fitted(self, "coefs_") + check_is_fitted(self) y_pred = self._predict(X) if self.n_outputs_ == 1: @@ -1332,7 +1332,7 @@ def predict(self, X): y : array-like, shape (n_samples, n_outputs) The predicted values. """ - check_is_fitted(self, "coefs_") + check_is_fitted(self) y_pred = self._predict(X) if y_pred.shape[1] == 1: return y_pred.ravel() diff --git a/sklearn/neural_network/rbm.py b/sklearn/neural_network/rbm.py index b2b6166d4d253..fa39f5f23d80c 100644 --- a/sklearn/neural_network/rbm.py +++ b/sklearn/neural_network/rbm.py @@ -116,7 +116,7 @@ def transform(self, X): h : array, shape (n_samples, n_components) Latent representations of the data. """ - check_is_fitted(self, "components_") + check_is_fitted(self) X = check_array(X, accept_sparse='csr', dtype=np.float64) return self._mean_hiddens(X) @@ -208,7 +208,7 @@ def gibbs(self, v): v_new : array-like, shape (n_samples, n_features) Values of the visible layer after one Gibbs step. """ - check_is_fitted(self, "components_") + check_is_fitted(self) if not hasattr(self, "random_state_"): self.random_state_ = check_random_state(self.random_state) h_ = self._sample_hiddens(v, self.random_state_) @@ -299,7 +299,7 @@ def score_samples(self, X): free energy on X, then on a randomly corrupted version of X, and returns the log of the logistic function of the difference. """ - check_is_fitted(self, "components_") + check_is_fitted(self) v = check_array(X, accept_sparse='csr') rng = check_random_state(self.random_state) diff --git a/sklearn/preprocessing/_discretization.py b/sklearn/preprocessing/_discretization.py index b7ffd96032d2a..1be7499f783ec 100644 --- a/sklearn/preprocessing/_discretization.py +++ b/sklearn/preprocessing/_discretization.py @@ -253,7 +253,7 @@ def transform(self, X): Xt : numeric array-like or sparse matrix Data in the binned space. """ - check_is_fitted(self, ["bin_edges_"]) + check_is_fitted(self) Xt = check_array(X, copy=True, dtype=FLOAT_DTYPES) n_features = self.n_bins_.shape[0] @@ -294,7 +294,7 @@ def inverse_transform(self, Xt): Xinv : numeric array-like Data in the original feature space. 
""" - check_is_fitted(self, ["bin_edges_"]) + check_is_fitted(self) if 'onehot' in self.encode: Xt = self._encoder.inverse_transform(Xt) diff --git a/sklearn/preprocessing/_encoders.py b/sklearn/preprocessing/_encoders.py index c1d3b1e80c352..6a16b484ad563 100644 --- a/sklearn/preprocessing/_encoders.py +++ b/sklearn/preprocessing/_encoders.py @@ -371,7 +371,7 @@ def transform(self, X): X_out : sparse matrix if sparse=True else a 2-d array Transformed input. """ - check_is_fitted(self, 'categories_') + check_is_fitted(self) # validation of X happens in _check_X called by _transform X_int, X_mask = self._transform(X, handle_unknown=self.handle_unknown) @@ -423,7 +423,7 @@ def inverse_transform(self, X): Inverse transformed array. """ - check_is_fitted(self, 'categories_') + check_is_fitted(self) X = check_array(X, accept_sparse='csr') n_samples, _ = X.shape @@ -506,7 +506,7 @@ def get_feature_names(self, input_features=None): output_feature_names : array of string, length n_output_features """ - check_is_fitted(self, 'categories_') + check_is_fitted(self) cats = self.categories_ if input_features is None: input_features = ['x%d' % i for i in range(len(cats))] @@ -639,7 +639,7 @@ def inverse_transform(self, X): Inverse transformed array. """ - check_is_fitted(self, 'categories_') + check_is_fitted(self) X = check_array(X, accept_sparse='csr') n_samples, _ = X.shape diff --git a/sklearn/preprocessing/data.py b/sklearn/preprocessing/data.py index 46530de8e6ad9..e70c98e48e898 100644 --- a/sklearn/preprocessing/data.py +++ b/sklearn/preprocessing/data.py @@ -387,7 +387,7 @@ def transform(self, X): X : array-like, shape [n_samples, n_features] Input data that will be transformed. """ - check_is_fitted(self, 'scale_') + check_is_fitted(self) X = check_array(X, copy=self.copy, dtype=FLOAT_DTYPES, force_all_finite="allow-nan") @@ -404,7 +404,7 @@ def inverse_transform(self, X): X : array-like, shape [n_samples, n_features] Input data that will be transformed. It cannot be sparse. """ - check_is_fitted(self, 'scale_') + check_is_fitted(self) X = check_array(X, copy=self.copy, dtype=FLOAT_DTYPES, force_all_finite="allow-nan") @@ -756,7 +756,7 @@ def transform(self, X, copy=None): copy : bool, optional (default: None) Copy the input X or not. """ - check_is_fitted(self, 'scale_') + check_is_fitted(self) copy = copy if copy is not None else self.copy X = check_array(X, accept_sparse='csr', copy=copy, @@ -792,7 +792,7 @@ def inverse_transform(self, X, copy=None): X_tr : array-like, shape [n_samples, n_features] Transformed array. """ - check_is_fitted(self, 'scale_') + check_is_fitted(self) copy = copy if copy is not None else self.copy if sparse.issparse(X): @@ -957,7 +957,7 @@ def transform(self, X): X : {array-like, sparse matrix} The data that should be scaled. """ - check_is_fitted(self, 'scale_') + check_is_fitted(self) X = check_array(X, accept_sparse=('csr', 'csc'), copy=self.copy, estimator=self, dtype=FLOAT_DTYPES, force_all_finite='allow-nan') @@ -976,7 +976,7 @@ def inverse_transform(self, X): X : {array-like, sparse matrix} The data that should be transformed back. """ - check_is_fitted(self, 'scale_') + check_is_fitted(self) X = check_array(X, accept_sparse=('csr', 'csc'), copy=self.copy, estimator=self, dtype=FLOAT_DTYPES, force_all_finite='allow-nan') @@ -1206,7 +1206,7 @@ def transform(self, X): X : {array-like, sparse matrix} The data used to scale along the specified axis. 
""" - check_is_fitted(self, 'center_', 'scale_') + check_is_fitted(self, 'scale_') X = check_array(X, accept_sparse=('csr', 'csc'), copy=self.copy, estimator=self, dtype=FLOAT_DTYPES, force_all_finite='allow-nan') @@ -1229,7 +1229,7 @@ def inverse_transform(self, X): X : array-like The data used to scale along the specified axis. """ - check_is_fitted(self, 'center_', 'scale_') + check_is_fitted(self, 'scale_') X = check_array(X, accept_sparse=('csr', 'csc'), copy=self.copy, estimator=self, dtype=FLOAT_DTYPES, force_all_finite='allow-nan') @@ -1415,7 +1415,7 @@ def _combinations(n_features, degree, interaction_only, include_bias): @property def powers_(self): - check_is_fitted(self, 'n_input_features_') + check_is_fitted(self) combinations = self._combinations(self.n_input_features_, self.degree, self.interaction_only, @@ -1502,7 +1502,7 @@ def transform(self, X): The matrix of features, where NP is the number of polynomial features generated from the combination of inputs. """ - check_is_fitted(self, ['n_input_features_', 'n_output_features_']) + check_is_fitted(self) X = check_array(X, order='F', dtype=FLOAT_DTYPES, accept_sparse=('csr', 'csc')) @@ -2014,7 +2014,7 @@ def transform(self, K, copy=True): ------- K_new : numpy array of shape [n_samples1, n_samples2] """ - check_is_fitted(self, 'K_fit_all_') + check_is_fitted(self) K = check_array(K, copy=copy, dtype=FLOAT_DTYPES) @@ -2411,7 +2411,7 @@ def _check_inputs(self, X, accept_sparse_negative=False, copy=False): def _check_is_fitted(self, X): """Check the inputs before transforming""" - check_is_fitted(self, 'quantiles_') + check_is_fitted(self) # check that the dimension of X are adequate with the fitted data if X.shape[1] != self.quantiles_.shape[1]: raise ValueError('X does not have the same number of features as' @@ -2786,7 +2786,7 @@ def transform(self, X): X_trans : array-like, shape (n_samples, n_features) The transformed data. """ - check_is_fitted(self, 'lambdas_') + check_is_fitted(self) X = self._check_input(X, check_positive=True, check_shape=True) transform_function = {'box-cox': boxcox, @@ -2832,7 +2832,7 @@ def inverse_transform(self, X): X : array-like, shape (n_samples, n_features) The original data """ - check_is_fitted(self, 'lambdas_') + check_is_fitted(self) X = self._check_input(X, check_shape=True) if self.standardize: diff --git a/sklearn/preprocessing/label.py b/sklearn/preprocessing/label.py index a236589d1698d..118fc22fa7f11 100644 --- a/sklearn/preprocessing/label.py +++ b/sklearn/preprocessing/label.py @@ -256,7 +256,7 @@ def transform(self, y): ------- y : array-like of shape [n_samples] """ - check_is_fitted(self, 'classes_') + check_is_fitted(self) y = column_or_1d(y, warn=True) # transform of empty array is empty array if _num_samples(y) == 0: @@ -277,7 +277,7 @@ def inverse_transform(self, y): ------- y : numpy array of shape [n_samples] """ - check_is_fitted(self, 'classes_') + check_is_fitted(self) y = column_or_1d(y, warn=True) # inverse transform of empty array is empty array if _num_samples(y) == 0: @@ -465,7 +465,7 @@ def transform(self, y): Y : numpy array or CSR matrix of shape [n_samples, n_classes] Shape will be [n_samples, 1] for binary problems. """ - check_is_fitted(self, 'classes_') + check_is_fitted(self) y_is_multilabel = type_of_target(y).startswith('multilabel') if y_is_multilabel and not self.y_type_.startswith('multilabel'): @@ -508,7 +508,7 @@ def inverse_transform(self, Y, threshold=None): linear model's decision_function method directly as the input of inverse_transform. 
""" - check_is_fitted(self, 'classes_') + check_is_fitted(self) if threshold is None: threshold = (self.pos_label + self.neg_label) / 2. @@ -911,7 +911,7 @@ def transform(self, y): A matrix such that `y_indicator[i, j] = 1` iff `classes_[j]` is in `y[i]`, and 0 otherwise. """ - check_is_fitted(self, 'classes_') + check_is_fitted(self) class_to_index = self._build_cache() yt = self._transform(y, class_to_index) @@ -976,7 +976,7 @@ def inverse_transform(self, yt): The set of labels for each sample such that `y[i]` consists of `classes_[j]` for each `yt[i, j] == 1`. """ - check_is_fitted(self, 'classes_') + check_is_fitted(self) if yt.shape[1] != len(self.classes_): raise ValueError('Expected indicator for {0} classes, but got {1}' diff --git a/sklearn/random_projection.py b/sklearn/random_projection.py index 8297a42ab17f8..4f8c8af1283b2 100644 --- a/sklearn/random_projection.py +++ b/sklearn/random_projection.py @@ -402,7 +402,7 @@ def transform(self, X): """ X = check_array(X, accept_sparse=['csr', 'csc']) - check_is_fitted(self, 'components_') + check_is_fitted(self) if X.shape[1] != self.components_.shape[1]: raise ValueError( diff --git a/sklearn/semi_supervised/label_propagation.py b/sklearn/semi_supervised/label_propagation.py index 4820af8cb2b69..704a075d95932 100644 --- a/sklearn/semi_supervised/label_propagation.py +++ b/sklearn/semi_supervised/label_propagation.py @@ -184,7 +184,7 @@ def predict_proba(self, X): Normalized probability distributions across class labels """ - check_is_fitted(self, 'X_') + check_is_fitted(self) X_2d = check_array(X, accept_sparse=['csc', 'csr', 'coo', 'dok', 'bsr', 'lil', 'dia']) diff --git a/sklearn/svm/base.py b/sklearn/svm/base.py index 40f87baec06be..b2723cc7e0c2b 100644 --- a/sklearn/svm/base.py +++ b/sklearn/svm/base.py @@ -437,7 +437,7 @@ def _sparse_decision_function(self, X): self.probA_, self.probB_) def _validate_for_predict(self, X): - check_is_fitted(self, 'support_') + check_is_fitted(self) X = check_array(X, accept_sparse='csr', dtype=np.float64, order="C", accept_large_sparse=False) @@ -562,7 +562,7 @@ def predict(self, X): y_pred : array, shape (n_samples,) Class labels for samples in X. """ - check_is_fitted(self, "classes_") + check_is_fitted(self) if self.break_ties and self.decision_function_shape == 'ovo': raise ValueError("break_ties must be False when " "decision_function_shape is 'ovo'") diff --git a/sklearn/tests/test_metaestimators.py b/sklearn/tests/test_metaestimators.py index 822dd0edb5501..6eb9c61ec2b2d 100644 --- a/sklearn/tests/test_metaestimators.py +++ b/sklearn/tests/test_metaestimators.py @@ -65,7 +65,7 @@ def fit(self, X, y=None, *args, **kwargs): return True def _check_fit(self): - check_is_fitted(self, 'coef_') + check_is_fitted(self) @hides def inverse_transform(self, X, *args, **kwargs): diff --git a/sklearn/tree/tree.py b/sklearn/tree/tree.py index 69c3b1a8270b6..9f6bf979717cf 100644 --- a/sklearn/tree/tree.py +++ b/sklearn/tree/tree.py @@ -111,13 +111,13 @@ def get_depth(self): The depth of a tree is the maximum distance between the root and any leaf. """ - check_is_fitted(self, 'tree_') + check_is_fitted(self) return self.tree_.max_depth def get_n_leaves(self): """Returns the number of leaves of the decision tree. 
""" - check_is_fitted(self, 'tree_') + check_is_fitted(self) return self.tree_.n_leaves def fit(self, X, y, sample_weight=None, check_input=True, @@ -424,7 +424,7 @@ def predict(self, X, check_input=True): y : array of shape = [n_samples] or [n_samples, n_outputs] The predicted classes, or the predict values. """ - check_is_fitted(self, 'tree_') + check_is_fitted(self) X = self._validate_X_predict(X, check_input) proba = self.tree_.predict(X) n_samples = X.shape[0] @@ -478,7 +478,7 @@ def apply(self, X, check_input=True): ``[0; self.tree_.node_count)``, possibly with gaps in the numbering. """ - check_is_fitted(self, 'tree_') + check_is_fitted(self) X = self._validate_X_predict(X, check_input) return self.tree_.apply(X) @@ -520,7 +520,7 @@ def feature_importances_(self): ------- feature_importances_ : array, shape = [n_features] """ - check_is_fitted(self, 'tree_') + check_is_fitted(self) return self.tree_.compute_feature_importances() @@ -841,7 +841,7 @@ class in a leaf. The class probabilities of the input samples. The order of the classes corresponds to that in the attribute `classes_`. """ - check_is_fitted(self, 'tree_') + check_is_fitted(self) X = self._validate_X_predict(X, check_input) proba = self.tree_.predict(X) diff --git a/sklearn/utils/tests/test_estimator_checks.py b/sklearn/utils/tests/test_estimator_checks.py index b4bd2daac00d7..b93c66f7cfbb6 100644 --- a/sklearn/utils/tests/test_estimator_checks.py +++ b/sklearn/utils/tests/test_estimator_checks.py @@ -169,7 +169,7 @@ def fit(self, X, y): return self def predict(self, X): - check_is_fitted(self, 'coef_') + check_is_fitted(self) X = check_array(X) return np.ones(X.shape[0]) diff --git a/sklearn/utils/tests/test_validation.py b/sklearn/utils/tests/test_validation.py index e1a1270f21e63..14b13d94ca5d1 100644 --- a/sklearn/utils/tests/test_validation.py +++ b/sklearn/utils/tests/test_validation.py @@ -632,34 +632,34 @@ def test_check_symmetric(): def test_check_is_fitted(): # Check is ValueError raised when non estimator instance passed - assert_raises(ValueError, check_is_fitted, ARDRegression, "coef_") - assert_raises(TypeError, check_is_fitted, "SVR", "support_") + assert_raises(ValueError, check_is_fitted, ARDRegression) + assert_raises(TypeError, check_is_fitted, "SVR") ard = ARDRegression() svr = SVR() try: - assert_raises(NotFittedError, check_is_fitted, ard, "coef_") - assert_raises(NotFittedError, check_is_fitted, svr, "support_") + assert_raises(NotFittedError, check_is_fitted, ard) + assert_raises(NotFittedError, check_is_fitted, svr) except ValueError: assert False, "check_is_fitted failed with ValueError" # NotFittedError is a subclass of both ValueError and AttributeError try: - check_is_fitted(ard, "coef_", "Random message %(name)s, %(name)s") + check_is_fitted(ard, msg="Random message %(name)s, %(name)s") except ValueError as e: assert str(e) == "Random message ARDRegression, ARDRegression" try: - check_is_fitted(svr, "support_", "Another message %(name)s, %(name)s") + check_is_fitted(svr, msg="Another message %(name)s, %(name)s") except AttributeError as e: assert str(e) == "Another message SVR, SVR" ard.fit(*make_blobs()) svr.fit(*make_blobs()) - assert check_is_fitted(ard, "coef_") is None - assert check_is_fitted(svr, "support_") is None + assert check_is_fitted(ard) is None + assert check_is_fitted(svr) is None def test_check_consistent_length(): diff --git a/sklearn/utils/validation.py b/sklearn/utils/validation.py index 2150a54ad75d9..b4e5bf8154e25 100644 --- a/sklearn/utils/validation.py +++ 
b/sklearn/utils/validation.py @@ -866,7 +866,7 @@ def check_symmetric(array, tol=1E-10, raise_warning=True, return array -def check_is_fitted(estimator, attributes, msg=None, all_or_any=all): +def check_is_fitted(estimator, *, msg=None, all_or_any=all): """Perform is_fitted validation for estimator. Checks if the estimator is fitted by verifying the presence of @@ -910,10 +910,10 @@ def check_is_fitted(estimator, attributes, msg=None, all_or_any=all): if not hasattr(estimator, 'fit'): raise TypeError("%s is not an estimator instance." % (estimator)) - if not isinstance(attributes, (list, tuple)): - attributes = [attributes] + attrs = [v for v in vars(estimator) if v.endswith("_") + and not v.startswith("__")] - if not all_or_any([hasattr(estimator, attr) for attr in attributes]): + if not len(attrs): raise NotFittedError(msg % {'name': type(estimator).__name__}) From e034ed80e20536e38b913e851eeed1427f7ebcc0 Mon Sep 17 00:00:00 2001 From: Andreas Mueller Date: Thu, 1 Aug 2019 13:07:18 -0400 Subject: [PATCH 34/86] cleanup, remove any_or_all --- sklearn/decomposition/base.py | 2 +- sklearn/discriminant_analysis.py | 2 +- .../_hist_gradient_boosting/gradient_boosting.py | 4 ++-- sklearn/ensemble/gradient_boosting.py | 10 +++++----- sklearn/kernel_approximation.py | 2 +- sklearn/linear_model/base.py | 4 ++-- sklearn/linear_model/stochastic_gradient.py | 2 +- sklearn/neighbors/base.py | 8 ++++---- sklearn/preprocessing/data.py | 4 ++-- sklearn/utils/validation.py | 12 +++--------- 10 files changed, 22 insertions(+), 28 deletions(-) diff --git a/sklearn/decomposition/base.py b/sklearn/decomposition/base.py index 0dad8c6130d68..2f11d8bd847b8 100644 --- a/sklearn/decomposition/base.py +++ b/sklearn/decomposition/base.py @@ -122,7 +122,7 @@ def transform(self, X): IncrementalPCA(batch_size=3, n_components=2) >>> ipca.transform(X) # doctest: +SKIP """ - check_is_fitted(self, all_or_any=all) + check_is_fitted(self) X = check_array(X) if self.mean_ is not None: diff --git a/sklearn/discriminant_analysis.py b/sklearn/discriminant_analysis.py index 4a3542e204288..efe39b8c3fb9a 100644 --- a/sklearn/discriminant_analysis.py +++ b/sklearn/discriminant_analysis.py @@ -505,7 +505,7 @@ def transform(self, X): if self.solver == 'lsqr': raise NotImplementedError("transform not implemented for 'lsqr' " "solver (use 'svd' or 'eigen').") - check_is_fitted(self, all_or_any=any) + check_is_fitted(self) X = check_array(X) if self.solver == 'svd': diff --git a/sklearn/ensemble/_hist_gradient_boosting/gradient_boosting.py b/sklearn/ensemble/_hist_gradient_boosting/gradient_boosting.py index dc040ed1fa409..e66b755964058 100644 --- a/sklearn/ensemble/_hist_gradient_boosting/gradient_boosting.py +++ b/sklearn/ensemble/_hist_gradient_boosting/gradient_boosting.py @@ -541,7 +541,7 @@ def _raw_predict(self, X): """ X = check_array(X, dtype=[X_DTYPE, X_BINNED_DTYPE], force_all_finite=False) - check_is_fitted(self, '_predictors') + check_is_fitted(self) if X.shape[1] != self.n_features_: raise ValueError( 'X has {} features but this estimator was trained with ' @@ -603,7 +603,7 @@ def _encode_y(self, y=None): @property def n_iter_(self): - check_is_fitted(self, '_predictors') + check_is_fitted(self) return len(self._predictors) def _more_tags(self): diff --git a/sklearn/ensemble/gradient_boosting.py b/sklearn/ensemble/gradient_boosting.py index 11813855d01d8..43c4dae31f66e 100644 --- a/sklearn/ensemble/gradient_boosting.py +++ b/sklearn/ensemble/gradient_boosting.py @@ -115,7 +115,7 @@ def predict(self, X): y : array, shape 
(n_samples,) Returns predicted values. """ - check_is_fitted(self, 'quantile') + check_is_fitted(self) y = np.empty((X.shape[0], 1), dtype=np.float64) y.fill(self.quantile) @@ -158,7 +158,7 @@ def predict(self, X): y : array, shape (n_samples,) Returns predicted values. """ - check_is_fitted(self, 'mean') + check_is_fitted(self) y = np.empty((X.shape[0], 1), dtype=np.float64) y.fill(self.mean) @@ -210,7 +210,7 @@ def predict(self, X): y : array, shape (n_samples,) Returns predicted values. """ - check_is_fitted(self, 'prior') + check_is_fitted(self) y = np.empty((X.shape[0], 1), dtype=np.float64) y.fill(self.prior) @@ -262,7 +262,7 @@ def predict(self, X): y : array, shape (n_samples,) Returns predicted values. """ - check_is_fitted(self, 'priors') + check_is_fitted(self) y = np.empty((X.shape[0], self.priors.shape[0]), dtype=np.float64) y[:] = self.priors @@ -316,7 +316,7 @@ def predict(self, X): y : array, shape (n_samples,) Returns predicted values. """ - check_is_fitted(self, 'n_classes') + check_is_fitted(self) y = np.empty((X.shape[0], self.n_classes), dtype=np.float64) y.fill(0.0) diff --git a/sklearn/kernel_approximation.py b/sklearn/kernel_approximation.py index 9d257427944dc..82cb37104cbff 100644 --- a/sklearn/kernel_approximation.py +++ b/sklearn/kernel_approximation.py @@ -350,7 +350,7 @@ def transform(self, X): """ msg = ("%(name)s is not fitted. Call fit to set the parameters before" " calling transform") - check_is_fitted(self, "sample_interval_", msg=msg) + check_is_fitted(self, msg=msg) X = check_array(X, accept_sparse='csr') sparse = sp.issparse(X) diff --git a/sklearn/linear_model/base.py b/sklearn/linear_model/base.py index b408c8569529d..b36516e081392 100644 --- a/sklearn/linear_model/base.py +++ b/sklearn/linear_model/base.py @@ -327,7 +327,7 @@ def densify(self): self : estimator """ msg = "Estimator, %(name)s, must be fitted before densifying." - check_is_fitted(self, "coef_", msg=msg) + check_is_fitted(self, msg=msg) if sp.issparse(self.coef_): self.coef_ = self.coef_.toarray() return self @@ -357,7 +357,7 @@ def sparsify(self): self : estimator """ msg = "Estimator, %(name)s, must be fitted before sparsifying." - check_is_fitted(self, "coef_", msg=msg) + check_is_fitted(self, msg=msg) self.coef_ = sp.csr_matrix(self.coef_) return self diff --git a/sklearn/linear_model/stochastic_gradient.py b/sklearn/linear_model/stochastic_gradient.py index e80a6a7ec3ce4..50c91513c12db 100644 --- a/sklearn/linear_model/stochastic_gradient.py +++ b/sklearn/linear_model/stochastic_gradient.py @@ -1216,7 +1216,7 @@ def _decision_function(self, X): array, shape (n_samples,) Predicted target values per element in X. """ - check_is_fitted(self, all_or_any=all) + check_is_fitted(self) X = check_array(X, accept_sparse='csr') diff --git a/sklearn/neighbors/base.py b/sklearn/neighbors/base.py index 041c13aae5417..4f7ef38a4ae14 100644 --- a/sklearn/neighbors/base.py +++ b/sklearn/neighbors/base.py @@ -388,7 +388,7 @@ class from an array representing our data set and ask who's [2]]...) 
""" - check_is_fitted(self, all_or_any=any) + check_is_fitted(self) if n_neighbors is None: n_neighbors = self.n_neighbors @@ -543,7 +543,7 @@ def kneighbors_graph(self, X=None, n_neighbors=None, -------- NearestNeighbors.radius_neighbors_graph """ - check_is_fitted(self, all_or_any=any) + check_is_fitted(self) if n_neighbors is None: n_neighbors = self.n_neighbors @@ -691,7 +691,7 @@ class from an array representing our data set and ask who's For efficiency, `radius_neighbors` returns arrays of objects, where each object is a 1D array of indices or distances. """ - check_is_fitted(self, all_or_any=any) + check_is_fitted(self) if X is not None: query_is_train = False @@ -828,7 +828,7 @@ def radius_neighbors_graph(self, X=None, radius=None, mode='connectivity'): -------- kneighbors_graph """ - check_is_fitted(self, all_or_any=any) + check_is_fitted(self) if X is not None: X = check_array(X, accept_sparse=['csr', 'csc', 'coo']) diff --git a/sklearn/preprocessing/data.py b/sklearn/preprocessing/data.py index e70c98e48e898..b3f09664f025d 100644 --- a/sklearn/preprocessing/data.py +++ b/sklearn/preprocessing/data.py @@ -1206,7 +1206,7 @@ def transform(self, X): X : {array-like, sparse matrix} The data used to scale along the specified axis. """ - check_is_fitted(self, 'scale_') + check_is_fitted(self) X = check_array(X, accept_sparse=('csr', 'csc'), copy=self.copy, estimator=self, dtype=FLOAT_DTYPES, force_all_finite='allow-nan') @@ -1229,7 +1229,7 @@ def inverse_transform(self, X): X : array-like The data used to scale along the specified axis. """ - check_is_fitted(self, 'scale_') + check_is_fitted(self) X = check_array(X, accept_sparse=('csr', 'csc'), copy=self.copy, estimator=self, dtype=FLOAT_DTYPES, force_all_finite='allow-nan') diff --git a/sklearn/utils/validation.py b/sklearn/utils/validation.py index b4e5bf8154e25..48daa64da77ee 100644 --- a/sklearn/utils/validation.py +++ b/sklearn/utils/validation.py @@ -866,21 +866,18 @@ def check_symmetric(array, tol=1E-10, raise_warning=True, return array -def check_is_fitted(estimator, *, msg=None, all_or_any=all): +def check_is_fitted(estimator, *, msg=None): """Perform is_fitted validation for estimator. Checks if the estimator is fitted by verifying the presence of - "all_or_any" of the passed attributes and raises a NotFittedError with the - given message. + fitted attributes (ending with a trailing underscore) and otherwise + raises a NotFittedError with the given message. Parameters ---------- estimator : estimator instance. estimator instance for which the check is performed. - attributes : attribute name(s) given as string or a list/tuple of strings - Eg.: - ``["coef_", "estimator_", ...], "coef_"`` msg : string The default error message is, "This %(name)s instance is not fitted @@ -891,9 +888,6 @@ def check_is_fitted(estimator, *, msg=None, all_or_any=all): Eg. : "Estimator, %(name)s, must be fitted before sparsifying". - all_or_any : callable, {all, any}, default all - Specify whether all or any of the given attributes must exist. 
- Returns ------- None From 1dc925854709a6a157d32ba4248edb1e9047b77f Mon Sep 17 00:00:00 2001 From: Andreas Mueller Date: Thu, 1 Aug 2019 13:18:24 -0400 Subject: [PATCH 35/86] fix LOF, birch, mixtures --- sklearn/cluster/birch.py | 3 +-- sklearn/mixture/base.py | 13 +++++-------- sklearn/mixture/bayesian_mixture.py | 6 ------ sklearn/neighbors/lof.py | 6 ++---- 4 files changed, 8 insertions(+), 20 deletions(-) diff --git a/sklearn/cluster/birch.py b/sklearn/cluster/birch.py index 941b833e977f7..11bb0f17a1dc6 100644 --- a/sklearn/cluster/birch.py +++ b/sklearn/cluster/birch.py @@ -534,8 +534,7 @@ def partial_fit(self, X=None, y=None): return self._fit(X) def _check_fit(self, X): - check_is_fitted(self, - all_or_any=any) + check_is_fitted(self) if (hasattr(self, 'subcluster_centers_') and X.shape[1] != self.subcluster_centers_.shape[1]): diff --git a/sklearn/mixture/base.py b/sklearn/mixture/base.py index 8920bef181226..26410fc5256af 100644 --- a/sklearn/mixture/base.py +++ b/sklearn/mixture/base.py @@ -15,6 +15,7 @@ from ..base import DensityMixin from ..exceptions import ConvergenceWarning from ..utils import check_array, check_random_state +from ..utils.validation import check_is_fitted from ..utils.fixes import logsumexp @@ -308,10 +309,6 @@ def _m_step(self, X, log_resp): """ pass - @abstractmethod - def _check_is_fitted(self): - pass - @abstractmethod def _get_parameters(self): pass @@ -334,7 +331,7 @@ def score_samples(self, X): log_prob : array, shape (n_samples,) Log probabilities of each data point in X. """ - self._check_is_fitted() + check_is_fitted(self) X = _check_X(X, None, self.means_.shape[1]) return logsumexp(self._estimate_weighted_log_prob(X), axis=1) @@ -369,7 +366,7 @@ def predict(self, X): labels : array, shape (n_samples,) Component labels. """ - self._check_is_fitted() + check_is_fitted(self) X = _check_X(X, None, self.means_.shape[1]) return self._estimate_weighted_log_prob(X).argmax(axis=1) @@ -388,7 +385,7 @@ def predict_proba(self, X): Returns the probability each Gaussian (state) in the model given each sample. """ - self._check_is_fitted() + check_is_fitted(self) X = _check_X(X, None, self.means_.shape[1]) _, log_resp = self._estimate_log_prob_resp(X) return np.exp(log_resp) @@ -410,7 +407,7 @@ def sample(self, n_samples=1): Component labels """ - self._check_is_fitted() + check_is_fitted(self) if n_samples < 1: raise ValueError( diff --git a/sklearn/mixture/bayesian_mixture.py b/sklearn/mixture/bayesian_mixture.py index 88c0ab66ae20a..b0cc600d077da 100644 --- a/sklearn/mixture/bayesian_mixture.py +++ b/sklearn/mixture/bayesian_mixture.py @@ -646,12 +646,6 @@ def _estimate_wishart_spherical(self, nk, xk, sk): # Contrary to the original bishop book, we normalize the covariances self.covariances_ /= self.degrees_of_freedom_ - def _check_is_fitted(self): - check_is_fitted(self, ['weight_concentration_', 'mean_precision_', - 'means_', 'degrees_of_freedom_', - 'covariances_', 'precisions_', - 'precisions_cholesky_']) - def _m_step(self, X, log_resp): """M step. diff --git a/sklearn/neighbors/lof.py b/sklearn/neighbors/lof.py index a58997502be91..f4f697565cd3e 100644 --- a/sklearn/neighbors/lof.py +++ b/sklearn/neighbors/lof.py @@ -313,8 +313,7 @@ def _predict(self, X=None): is_inlier : array, shape (n_samples,) Returns -1 for anomalies/outliers and +1 for inliers. 
""" - check_is_fitted(self, ["offset_", "negative_outlier_factor_", - "n_neighbors_", "_distances_fit_X_"]) + check_is_fitted(self) if X is not None: X = check_array(X, accept_sparse='csr') @@ -454,8 +453,7 @@ def _score_samples(self, X): The opposite of the Local Outlier Factor of each input samples. The lower, the more abnormal. """ - check_is_fitted(self, ["offset_", "negative_outlier_factor_", - "_distances_fit_X_"]) + check_is_fitted(self) X = check_array(X, accept_sparse='csr') distances_X, neighbors_indices_X = ( From 92d1aaf596745c2abe7a3bce9484d34852505d92 Mon Sep 17 00:00:00 2001 From: Guillaume Lemaitre Date: Thu, 1 Aug 2019 19:27:04 +0200 Subject: [PATCH 36/86] iter --- sklearn/utils/__init__.py | 14 ++++++++++++-- 1 file changed, 12 insertions(+), 2 deletions(-) diff --git a/sklearn/utils/__init__.py b/sklearn/utils/__init__.py index 9e6396e7b5ba1..3ea68ac4ef470 100644 --- a/sklearn/utils/__init__.py +++ b/sklearn/utils/__init__.py @@ -194,11 +194,21 @@ def _array_indexing(array, key, axis): def _pandas_indexing(X, key, axis, by_name): """Index a pandas dataframe or a series.""" - if hasattr(key, 'flags'): + if hasattr(key, 'shape'): # Work-around for indexing with read-only key in pandas key = key if key.flags.writeable else key.copy() indexer = 'loc' if by_name else 'iloc' - return getattr(X, indexer)[:, key] if axis else getattr(X, indexer)[key] + try: + return (getattr(X, indexer)[:, key] + if axis else getattr(X, indexer)[key]) + except ValueError: + # Cython typed memoryviews internally used in pandas do not support + # readonly buffers. + warnings.warn( + "Copying input dataframe for slicing.", DataConversionWarning + ) + return (getattr(X.copy(), indexer)[:, key] + if axis else getattr(X.copy(), indexer)[key]) def _list_indexing(X, key): From d6034ea6eb4d8f07be8e2476e9480f7d4a8ff797 Mon Sep 17 00:00:00 2001 From: Andreas Mueller Date: Thu, 1 Aug 2019 13:29:58 -0400 Subject: [PATCH 37/86] remove unused method --- sklearn/mixture/gaussian_mixture.py | 3 --- 1 file changed, 3 deletions(-) diff --git a/sklearn/mixture/gaussian_mixture.py b/sklearn/mixture/gaussian_mixture.py index 610af54cc343a..b7941365b2609 100644 --- a/sklearn/mixture/gaussian_mixture.py +++ b/sklearn/mixture/gaussian_mixture.py @@ -686,9 +686,6 @@ def _estimate_log_weights(self): def _compute_lower_bound(self, _, log_prob_norm): return log_prob_norm - def _check_is_fitted(self): - check_is_fitted(self) - def _get_parameters(self): return (self.weights_, self.means_, self.covariances_, self.precisions_cholesky_) From b1918e83de705b97b789d970ecd471903141182c Mon Sep 17 00:00:00 2001 From: Guillaume Lemaitre Date: Fri, 2 Aug 2019 15:33:41 +0200 Subject: [PATCH 38/86] address different comments --- doc/whats_new/v0.22.rst | 4 ++-- sklearn/compose/tests/test_column_transformer.py | 2 +- sklearn/utils/__init__.py | 2 +- sklearn/utils/tests/test_utils.py | 2 +- 4 files changed, 5 insertions(+), 5 deletions(-) diff --git a/doc/whats_new/v0.22.rst b/doc/whats_new/v0.22.rst index db9e0e574da06..0f3c5665e3aa6 100644 --- a/doc/whats_new/v0.22.rst +++ b/doc/whats_new/v0.22.rst @@ -65,8 +65,8 @@ Changelog ...................... - |Fix| Fixed a bug in :class:`compose.ColumnTransformer` which failed to - select the proper columns when using a boolean list and NumPy older than - 1.13. + select the proper columns when using a boolean list, with NumPy older than + 1.12. :pr:`14510` by :user:`Guillaume Lemaitre `. 
:mod:`sklearn.datasets` diff --git a/sklearn/compose/tests/test_column_transformer.py b/sklearn/compose/tests/test_column_transformer.py index a667b35cf65e3..d28a82374ad5b 100644 --- a/sklearn/compose/tests/test_column_transformer.py +++ b/sklearn/compose/tests/test_column_transformer.py @@ -1114,7 +1114,7 @@ def test_column_transformer_reordered_column_names_remainder(explicit_colname): @pytest.mark.parametrize("array_type", [np.asarray, sparse.csr_matrix]) def test_column_transformer_mask_indexing(array_type): # Regression test for #14510 - # Boolean array-like does not behave as boolean array with NumPy < 1.13 + # Boolean array-like does not behave as boolean array with NumPy < 1.12 # and sparse matrices as well X = np.transpose([[1, 2, 3], [4, 5, 6], [5, 6, 7], [8, 9, 10]]) X = array_type(X) diff --git a/sklearn/utils/__init__.py b/sklearn/utils/__init__.py index 3b4a20d08716b..f95a0d6cccc57 100644 --- a/sklearn/utils/__init__.py +++ b/sklearn/utils/__init__.py @@ -233,7 +233,7 @@ def _array_indexing(array, key, axis=0): "'axis' should be either 0 (to index rows) or 1 (to index " " column). Got {} instead.".format(axis) ) - if np_version < (1, 13) or issparse(array): + if np_version < (1, 12) or issparse(array): # check if we have an boolean array-likes to make the proper indexing key_array = np.asarray(key) if np.issubdtype(key_array.dtype, np.bool_): diff --git a/sklearn/utils/tests/test_utils.py b/sklearn/utils/tests/test_utils.py index 49f50eedc0a42..806295f1aae28 100644 --- a/sklearn/utils/tests/test_utils.py +++ b/sklearn/utils/tests/test_utils.py @@ -371,7 +371,7 @@ def test_safe_indexing_mock_pandas(asarray): def test_safe_indexing_mask_axis_1(array_type): # regression test for #14510 # check that boolean array-like and boolean array lead to the same indexing - # even in NumPy < 1.13 + # even in NumPy < 1.12 if array_type == 'array': array_constructor = np.asarray elif array_type == 'sparse': From 6322f99d5d85918643e2ba1b86539cb654d8cb60 Mon Sep 17 00:00:00 2001 From: Guillaume Lemaitre Date: Fri, 2 Aug 2019 15:54:55 +0200 Subject: [PATCH 39/86] iter --- sklearn/utils/__init__.py | 15 ++++----------- 1 file changed, 4 insertions(+), 11 deletions(-) diff --git a/sklearn/utils/__init__.py b/sklearn/utils/__init__.py index 19e5e78b27cf3..638dc2e6f64fc 100644 --- a/sklearn/utils/__init__.py +++ b/sklearn/utils/__init__.py @@ -185,6 +185,7 @@ def axis0_safe_slice(X, mask, len_mask): def _array_indexing(array, key, axis): """Index an array consistently across NumPy version.""" if np_version < (1, 12) or issparse(array): + # FIXME: Remove the check for NumPy when using >= 1.12 # check if we have an boolean array-likes to make the proper indexing key_array = np.asarray(key) if np.issubdtype(key_array.dtype, np.bool_): @@ -196,19 +197,11 @@ def _pandas_indexing(X, key, axis, by_name): """Index a pandas dataframe or a series.""" if hasattr(key, 'shape'): # Work-around for indexing with read-only key in pandas + # FIXME: solved in pandas 0.25 + key = np.asarray(key) key = key if key.flags.writeable else key.copy() indexer = 'loc' if by_name else 'iloc' - try: - return (getattr(X, indexer)[:, key] - if axis else getattr(X, indexer)[key]) - except ValueError: - # Cython typed memoryviews internally used in pandas do not support - # readonly buffers. 
- warnings.warn( - "Copying input dataframe for slicing.", DataConversionWarning - ) - return (getattr(X.copy(), indexer)[:, key] - if axis else getattr(X.copy(), indexer)[key]) + return (getattr(X, indexer)[:, key] if axis else getattr(X, indexer)[key]) def _list_indexing(X, key): From e478e207cf377b287c5d938a6af0f3314189dd41 Mon Sep 17 00:00:00 2001 From: Guillaume Lemaitre Date: Fri, 2 Aug 2019 16:48:16 +0200 Subject: [PATCH 40/86] iter --- build_tools/azure/test_script.sh | 2 +- sklearn/inspection/partial_dependence.py | 1 + sklearn/inspection/tests/test_partial_dependence.py | 4 ++-- 3 files changed, 4 insertions(+), 3 deletions(-) diff --git a/build_tools/azure/test_script.sh b/build_tools/azure/test_script.sh index 27a3e4b649a13..6e27915e0be6b 100755 --- a/build_tools/azure/test_script.sh +++ b/build_tools/azure/test_script.sh @@ -21,7 +21,7 @@ except ImportError: python -c "import multiprocessing as mp; print('%d CPUs' % mp.cpu_count())" pip list -TEST_CMD="python -m pytest --verbose --showlocals --durations=20 --junitxml=$JUNITXML" +TEST_CMD="python -m pytest --showlocals --durations=20 --junitxml=$JUNITXML" if [[ "$COVERAGE" == "true" ]]; then export COVERAGE_PROCESS_START="$BUILD_SOURCESDIRECTORY/.coveragerc" diff --git a/sklearn/inspection/partial_dependence.py b/sklearn/inspection/partial_dependence.py index cb3a3d986dd44..4e219dbb77418 100644 --- a/sklearn/inspection/partial_dependence.py +++ b/sklearn/inspection/partial_dependence.py @@ -359,6 +359,7 @@ def partial_dependence(estimator, X, features, response_method='auto', fitted_attribute = 'n_iter_' check_is_fitted(estimator, fitted_attribute, msg=msg) + print(features) features_indices = np.asarray( _get_column_indices(X, features), dtype=np.int32, order='C' ).ravel() diff --git a/sklearn/inspection/tests/test_partial_dependence.py b/sklearn/inspection/tests/test_partial_dependence.py index bc98db923449c..6b93e8e427a16 100644 --- a/sklearn/inspection/tests/test_partial_dependence.py +++ b/sklearn/inspection/tests/test_partial_dependence.py @@ -341,15 +341,15 @@ def test_partial_dependence_error(estimator, params, err_msg): partial_dependence(estimator, X, **params) +@pytest.mark.parametrize("features", [-1, 100000]) @pytest.mark.parametrize( 'estimator', [LinearRegression(), GradientBoostingClassifier(random_state=0)] ) -def test_partial_dependence_unknown_feature_indices(estimator): +def test_partial_dependence_unknown_feature_indices(features, estimator): X, y = make_classification(random_state=0) estimator.fit(X, y) - features = 100000 err_msg = 'all features must be in' with pytest.raises(ValueError, match=err_msg): partial_dependence(estimator, X, [features]) From 4d4cc2db23c47601ae9d0af53cdca5e80330b3e2 Mon Sep 17 00:00:00 2001 From: Guillaume Lemaitre Date: Fri, 2 Aug 2019 17:12:41 +0200 Subject: [PATCH 41/86] update error message --- sklearn/utils/__init__.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/sklearn/utils/__init__.py b/sklearn/utils/__init__.py index 638dc2e6f64fc..d85d755c29eb4 100644 --- a/sklearn/utils/__init__.py +++ b/sklearn/utils/__init__.py @@ -340,7 +340,8 @@ def _get_column_indices(X, key): idx = safe_indexing(np.arange(n_columns), key) except IndexError as e: raise ValueError( - 'all features must be in [0, %d]' % (n_columns - 1) + 'all features must be in [0, {}] or [-{}, 0]' + .format(n_columns - 1, n_columns) ) from e return np.atleast_1d(idx).tolist() elif _check_key_type(key, str): From 3cb95ac2d76ecf3c494446d44838b88347e5f1e8 Mon Sep 17 00:00:00 2001 From: 
Andreas Mueller Date: Fri, 2 Aug 2019 14:43:30 -0400 Subject: [PATCH 42/86] fix partial dependence function --- sklearn/ensemble/partial_dependence.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/sklearn/ensemble/partial_dependence.py b/sklearn/ensemble/partial_dependence.py index 594ed39568c27..cce79342c393f 100644 --- a/sklearn/ensemble/partial_dependence.py +++ b/sklearn/ensemble/partial_dependence.py @@ -138,7 +138,7 @@ def partial_dependence(gbrt, target_variables, grid=None, X=None, """ if not isinstance(gbrt, BaseGradientBoosting): raise ValueError('gbrt has to be an instance of BaseGradientBoosting') - check_is_fitted(gbrt, 'estimators_') + check_is_fitted(gbrt) if (grid is None and X is None) or (grid is not None and X is not None): raise ValueError('Either grid or X must be specified') @@ -270,7 +270,7 @@ def plot_partial_dependence(gbrt, X, features, feature_names=None, if not isinstance(gbrt, BaseGradientBoosting): raise ValueError('gbrt has to be an instance of BaseGradientBoosting') - check_is_fitted(gbrt, 'estimators_') + check_is_fitted(gbrt) # set label_idx for multi-class GBRT if hasattr(gbrt, 'classes_') and np.size(gbrt.classes_) > 2: From 4d3a8b47ac98bc897c7675e21a2c8ac0d8571750 Mon Sep 17 00:00:00 2001 From: Andreas Mueller Date: Fri, 2 Aug 2019 14:53:17 -0400 Subject: [PATCH 43/86] make change backward-compatible --- sklearn/utils/tests/test_validation.py | 5 +++++ sklearn/utils/validation.py | 7 +++++-- 2 files changed, 10 insertions(+), 2 deletions(-) diff --git a/sklearn/utils/tests/test_validation.py b/sklearn/utils/tests/test_validation.py index 14b13d94ca5d1..ec812b64938bc 100644 --- a/sklearn/utils/tests/test_validation.py +++ b/sklearn/utils/tests/test_validation.py @@ -661,6 +661,11 @@ def test_check_is_fitted(): assert check_is_fitted(ard) is None assert check_is_fitted(svr) is None + assert_warns_message( + DeprecationWarning, + "Passing attributes to check_is_fitted is deprecated", + check_is_fitted, ard, ['coef_']) + def test_check_consistent_length(): check_consistent_length([1], [2], [3], [4], [5]) diff --git a/sklearn/utils/validation.py b/sklearn/utils/validation.py index 48daa64da77ee..558c3e9de060f 100644 --- a/sklearn/utils/validation.py +++ b/sklearn/utils/validation.py @@ -155,7 +155,6 @@ def _num_samples(x): raise TypeError(message) - def check_memory(memory): """Check that ``memory`` is joblib.Memory-like. @@ -866,7 +865,7 @@ def check_symmetric(array, tol=1E-10, raise_warning=True, return array -def check_is_fitted(estimator, *, msg=None): +def check_is_fitted(estimator, attributes='deprecated', msg=None): """Perform is_fitted validation for estimator. Checks if the estimator is fitted by verifying the presence of @@ -897,6 +896,10 @@ def check_is_fitted(estimator, *, msg=None): NotFittedError If the attributes are not found. """ + if attributes != 'deprecated': + warnings.warn("Passing attributes to check_is_fitted is deprecated" + "and will be removed in 0.23. The attributes " + "argument is ignored.", DeprecationWarning) if msg is None: msg = ("This %(name)s instance is not fitted yet. 
Call 'fit' with " "appropriate arguments before using this method.") From 1181982935808ceb7b68ff697baed702471a7d9b Mon Sep 17 00:00:00 2001 From: Andreas Mueller Date: Fri, 2 Aug 2019 15:25:36 -0400 Subject: [PATCH 44/86] also allow private fitted attributes --- sklearn/utils/validation.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/sklearn/utils/validation.py b/sklearn/utils/validation.py index 558c3e9de060f..d0fe8276d853f 100644 --- a/sklearn/utils/validation.py +++ b/sklearn/utils/validation.py @@ -907,7 +907,8 @@ def check_is_fitted(estimator, attributes='deprecated', msg=None): if not hasattr(estimator, 'fit'): raise TypeError("%s is not an estimator instance." % (estimator)) - attrs = [v for v in vars(estimator) if v.endswith("_") + attrs = [v for v in vars(estimator) + if (v.endswith("_") or v.startswith("_")) and not v.startswith("__")] if not len(attrs): From 7ed876d57c149b763fa4ac4d93c7c62f29e46446 Mon Sep 17 00:00:00 2001 From: Andreas Mueller Date: Fri, 2 Aug 2019 15:48:57 -0400 Subject: [PATCH 45/86] slight refactoring in CountVectorizer to mess less with the vocabulary --- sklearn/feature_extraction/text.py | 16 ++++++---------- 1 file changed, 6 insertions(+), 10 deletions(-) diff --git a/sklearn/feature_extraction/text.py b/sklearn/feature_extraction/text.py index 01a7b70587f3d..4944d23200418 100644 --- a/sklearn/feature_extraction/text.py +++ b/sklearn/feature_extraction/text.py @@ -32,7 +32,7 @@ from ..utils.validation import check_is_fitted, check_array, FLOAT_DTYPES from ..utils import _IS_32BIT from ..utils.fixes import _astype_copy_false -from ..exceptions import ChangedBehaviorWarning +from ..exceptions import ChangedBehaviorWarning, NotFittedError __all__ = ['HashingVectorizer', @@ -450,9 +450,11 @@ def _validate_vocabulary(self): self.fixed_vocabulary_ = False def _check_vocabulary(self): - """Check if vocabulary is empty or missing (not fit-ed)""" - msg = "%(name)s - Vocabulary wasn't fitted." 
- check_is_fitted(self, msg=msg), + """Check if vocabulary is empty or missing (not fitted)""" + if not hasattr(self, 'vocabulary_'): + self._validate_vocabulary() + if not self.fixed_vocabulary_: + raise NotFittedError("Vocabulary not fitted or provided") if len(self.vocabulary_) == 0: raise ValueError("Vocabulary is empty") @@ -1172,10 +1174,6 @@ def transform(self, raw_documents): raise ValueError( "Iterable over raw text documents expected, " "string object received.") - - if not hasattr(self, 'vocabulary_'): - self._validate_vocabulary() - self._check_vocabulary() # use the same matrix-building strategy as fit_transform @@ -1216,8 +1214,6 @@ def inverse_transform(self, X): def get_feature_names(self): """Array mapping from feature integer indices to feature name""" - if not hasattr(self, 'vocabulary_'): - self._validate_vocabulary() self._check_vocabulary() From 8701cc0e07a9d070e8d823edbc675a06453e5db8 Mon Sep 17 00:00:00 2001 From: Andreas Mueller Date: Fri, 2 Aug 2019 15:52:42 -0400 Subject: [PATCH 46/86] added regression test for not being able to call inverse_transform before transform --- sklearn/feature_extraction/tests/test_text.py | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/sklearn/feature_extraction/tests/test_text.py b/sklearn/feature_extraction/tests/test_text.py index 2bc1ad25bca63..e3be2c27955b6 100644 --- a/sklearn/feature_extraction/tests/test_text.py +++ b/sklearn/feature_extraction/tests/test_text.py @@ -261,6 +261,10 @@ def test_countvectorizer_custom_vocabulary(): assert set(vect.vocabulary_) == terms X = vect.transform(JUNK_FOOD_DOCS) assert X.shape[1] == len(terms) + v = typ(vocab) + vect = CountVectorizer(vocabulary=v) + inv = vect.inverse_transform(X) + assert len(inv) == X.shape[0] def test_countvectorizer_custom_vocabulary_pipeline(): From be4a90f0bf82d684f53e3bb405b7b51be115c83e Mon Sep 17 00:00:00 2001 From: Andreas Mueller Date: Fri, 2 Aug 2019 16:16:35 -0400 Subject: [PATCH 47/86] add special check for classes --- sklearn/utils/tests/test_validation.py | 4 ++-- sklearn/utils/validation.py | 4 +++- 2 files changed, 5 insertions(+), 3 deletions(-) diff --git a/sklearn/utils/tests/test_validation.py b/sklearn/utils/tests/test_validation.py index ec812b64938bc..552bd85e91b99 100644 --- a/sklearn/utils/tests/test_validation.py +++ b/sklearn/utils/tests/test_validation.py @@ -631,8 +631,8 @@ def test_check_symmetric(): def test_check_is_fitted(): - # Check is ValueError raised when non estimator instance passed - assert_raises(ValueError, check_is_fitted, ARDRegression) + # Check is TypeError raised when non estimator instance passed + assert_raises(TypeError, check_is_fitted, ARDRegression) assert_raises(TypeError, check_is_fitted, "SVR") ard = ARDRegression() diff --git a/sklearn/utils/validation.py b/sklearn/utils/validation.py index d0fe8276d853f..06604bb583f66 100644 --- a/sklearn/utils/validation.py +++ b/sklearn/utils/validation.py @@ -14,7 +14,7 @@ import numpy as np import scipy.sparse as sp from distutils.version import LooseVersion -from inspect import signature +from inspect import signature, isclass from numpy.core.numeric import ComplexWarning import joblib @@ -900,6 +900,8 @@ def check_is_fitted(estimator, attributes='deprecated', msg=None): warnings.warn("Passing attributes to check_is_fitted is deprecated" "and will be removed in 0.23. 
The attributes " "argument is ignored.", DeprecationWarning) + if isclass(estimator): + raise TypeError("{} is a class, not an instance.".format(estimator)) if msg is None: msg = ("This %(name)s instance is not fitted yet. Call 'fit' with " "appropriate arguments before using this method.") From b62933d25f1ae6177661bc5b956ec981cf98ebef Mon Sep 17 00:00:00 2001 From: Guillaume Lemaitre Date: Mon, 5 Aug 2019 12:33:57 +0200 Subject: [PATCH 48/86] address comments --- sklearn/inspection/partial_dependence.py | 13 +++--- .../tests/test_partial_dependence.py | 40 ++++++++++++------- 2 files changed, 31 insertions(+), 22 deletions(-) diff --git a/sklearn/inspection/partial_dependence.py b/sklearn/inspection/partial_dependence.py index 4e219dbb77418..495fbb9bd7deb 100644 --- a/sklearn/inspection/partial_dependence.py +++ b/sklearn/inspection/partial_dependence.py @@ -352,14 +352,13 @@ def partial_dependence(estimator, X, features, response_method='auto', "With the 'recursion' method, the response_method must be " "'decision_function'. Got {}.".format(response_method) ) - msg = "'estimator' parameter must be a fitted estimator" - if isinstance(estimator, BaseGradientBoosting): - fitted_attribute = 'estimators_' - else: - fitted_attribute = 'n_iter_' - check_is_fitted(estimator, fitted_attribute, msg=msg) + # msg = "'estimator' parameter must be a fitted estimator" + # if isinstance(estimator, BaseGradientBoosting): + # fitted_attribute = 'estimators_' + # else: + # fitted_attribute = 'n_iter_' + # check_is_fitted(estimator, fitted_attribute, msg=msg) - print(features) features_indices = np.asarray( _get_column_indices(X, features), dtype=np.int32, order='C' ).ravel() diff --git a/sklearn/inspection/tests/test_partial_dependence.py b/sklearn/inspection/tests/test_partial_dependence.py index 6b93e8e427a16..ca85db1a0d17d 100644 --- a/sklearn/inspection/tests/test_partial_dependence.py +++ b/sklearn/inspection/tests/test_partial_dependence.py @@ -341,15 +341,15 @@ def test_partial_dependence_error(estimator, params, err_msg): partial_dependence(estimator, X, **params) -@pytest.mark.parametrize("features", [-1, 100000]) @pytest.mark.parametrize( 'estimator', [LinearRegression(), GradientBoostingClassifier(random_state=0)] ) -def test_partial_dependence_unknown_feature_indices(features, estimator): +def test_partial_dependence_unknown_feature_indices(estimator): X, y = make_classification(random_state=0) estimator.fit(X, y) + features = 100000 err_msg = 'all features must be in' with pytest.raises(ValueError, match=err_msg): partial_dependence(estimator, X, [features]) @@ -449,10 +449,11 @@ def test_partial_dependence_pipeline(): features = 0 pdp_pipe, values_pipe = partial_dependence( - pipe, iris.data, features=[features] + pipe, iris.data, features=[features], grid_resolution=10 ) pdp_clf, values_clf = partial_dependence( - clf, scaler.transform(iris.data), features=[features] + clf, scaler.transform(iris.data), features=[features], + grid_resolution=10 ) assert_allclose(pdp_pipe, pdp_clf) assert_allclose( @@ -487,7 +488,9 @@ def test_partial_dependence_dataframe(estimator, preprocessor, features): pipe = make_pipeline(preprocessor, estimator) pipe.fit(df, iris.target) - pdp_pipe, values_pipe = partial_dependence(pipe, df, features=features) + pdp_pipe, values_pipe = partial_dependence( + pipe, df, features=features, grid_resolution=10 + ) # the column transformer will reorder the column when transforming # we mixed the index to be sure that we are computing the partial @@ -501,7 +504,7 @@ def 
test_partial_dependence_dataframe(estimator, preprocessor, features): clf = clone(estimator).fit(X_proc, iris.target) pdp_clf, values_clf = partial_dependence( - clf, X_proc, features=features_clf, method='brute' + clf, X_proc, features=features_clf, method='brute', grid_resolution=10 ) assert_allclose(pdp_pipe, pdp_clf) @@ -516,13 +519,16 @@ def test_partial_dependence_dataframe(estimator, preprocessor, features): @pytest.mark.parametrize( - "features", - [0, iris.feature_names[0], - [0, 2], [iris.feature_names[i] for i in (0, 2)], - slice(0, 2, 1), [True, False, True, False]], + "features, expected_pd_shape", + [(0, (3, 10)), + (iris.feature_names[0], (3, 10)), + ([0, 2], (3, 10, 10)), + ([iris.feature_names[i] for i in (0, 2)], (3, 10, 10)), + (slice(0, 2, 1), (3, 10, 10)), + ([True, False, True, False], (3, 10, 10))], ids=['scalar-int', 'scalar-str', 'list-int', 'list-str', 'slice', 'mask'] ) -def test_partial_dependence_feature_type(features): +def test_partial_dependence_feature_type(features, expected_pd_shape): # check all possible features type supported in PDP pd = pytest.importorskip("pandas") df = pd.DataFrame(iris.data, columns=iris.feature_names) @@ -535,7 +541,11 @@ def test_partial_dependence_feature_type(features): preprocessor, LogisticRegression(max_iter=1000, random_state=0) ) pipe.fit(df, iris.target) - pdp_pipe, values_pipe = partial_dependence(pipe, df, features=features) + pdp_pipe, values_pipe = partial_dependence( + pipe, df, features=features, grid_resolution=10 + ) + assert pdp_pipe.shape == expected_pd_shape + assert len(values_pipe) == len(pdp_pipe.shape) - 1 def test_plot_partial_dependence(pyplot): @@ -647,10 +657,10 @@ def test_plot_partial_dependence_multioutput(pyplot): (multioutput_regression_data[0], {"target": 100, 'features': [0]}, r'target must be in \[0, n_tasks\]'), (make_classification(random_state=0), - {'features': ['foobar'], 'feature_names': None}, - 'Feature foobar not in feature_names'), + {'features': ['foobar'], 'feature_names': None}, + 'Feature foobar not in feature_names'), (make_classification(random_state=0), - {'features': ['foobar'], 'feature_names': ['abcd', 'def']}, + {'features': ['foobar'], 'feature_names': ['abcd', 'def']}, 'Feature foobar not in feature_names'), (make_classification(random_state=0), {'features': [(1, 2, 3)]}, 'Each entry in features must be either an int, '), From 7e330276fa23ce04f1489b7e4ac99ec0eead7072 Mon Sep 17 00:00:00 2001 From: Andreas Mueller Date: Mon, 5 Aug 2019 11:28:49 -0400 Subject: [PATCH 49/86] more functions to fix --- sklearn/tree/export.py | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/sklearn/tree/export.py b/sklearn/tree/export.py index 99b5e54a4c9b7..8e2e0a486b756 100644 --- a/sklearn/tree/export.py +++ b/sklearn/tree/export.py @@ -11,7 +11,6 @@ # Li Li # Giuseppe Vettigli # License: BSD 3 clause -import warnings from io import StringIO from numbers import Integral @@ -743,7 +742,7 @@ def export_graphviz(decision_tree, out_file=None, max_depth=None, 'digraph Tree {... 
""" - check_is_fitted(decision_tree, 'tree_') + check_is_fitted(decision_tree) own_file = False return_string = False try: @@ -849,7 +848,7 @@ def export_text(decision_tree, feature_names=None, max_depth=10, | |--- petal width (cm) > 1.75 | | |--- class: 2 """ - check_is_fitted(decision_tree, 'tree_') + check_is_fitted(decision_tree) tree_ = decision_tree.tree_ class_names = decision_tree.classes_ right_child_fmt = "{} {} <= {}\n" From 82fbc6f6f26ffe8e0ee1a9654685d348910ccce6 Mon Sep 17 00:00:00 2001 From: Guillaume Lemaitre Date: Mon, 5 Aug 2019 18:45:12 +0200 Subject: [PATCH 50/86] address almost all comments --- sklearn/inspection/partial_dependence.py | 26 +++++-------- .../tests/test_partial_dependence.py | 38 +++++++++++++------ sklearn/utils/validation.py | 23 +++++------ 3 files changed, 48 insertions(+), 39 deletions(-) diff --git a/sklearn/inspection/partial_dependence.py b/sklearn/inspection/partial_dependence.py index 495fbb9bd7deb..919be3832812e 100644 --- a/sklearn/inspection/partial_dependence.py +++ b/sklearn/inspection/partial_dependence.py @@ -145,7 +145,7 @@ def _partial_dependence_brute(est, grid, features, X, response_method): # (n_points,) for non-multioutput regressors # (n_points, n_tasks) for multioutput regressors # (n_points, 1) for the regressors in cross_decomposition (I think) - # (n_points, 2) for binary classifaction + # (n_points, 2) for binary classification # (n_points, n_classes) for multiclass classification # average over samples @@ -289,17 +289,15 @@ def partial_dependence(estimator, X, features, response_method='auto', """ if not (is_classifier(estimator) or is_regressor(estimator)): raise ValueError( - "'estimator' must be a fitted regressor or classifier.") + "'estimator' must be a fitted regressor or classifier." + ) + check_is_fitted(estimator) - if is_classifier(estimator): - if not hasattr(estimator, 'classes_'): - raise ValueError( - "'estimator' parameter must be a fitted estimator" - ) - if isinstance(estimator.classes_[0], np.ndarray): - raise ValueError( - 'Multiclass-multioutput estimators are not supported' - ) + if (is_classifier(estimator) and + isinstance(estimator.classes_[0], np.ndarray)): + raise ValueError( + 'Multiclass-multioutput estimators are not supported' + ) if not(hasattr(X, '__array__') or sparse.issparse(X)): X = check_array(X, force_all_finite='allow-nan', dtype=np.object) @@ -352,12 +350,6 @@ def partial_dependence(estimator, X, features, response_method='auto', "With the 'recursion' method, the response_method must be " "'decision_function'. 
Got {}.".format(response_method) ) - # msg = "'estimator' parameter must be a fitted estimator" - # if isinstance(estimator, BaseGradientBoosting): - # fitted_attribute = 'estimators_' - # else: - # fitted_attribute = 'n_iter_' - # check_is_fitted(estimator, fitted_attribute, msg=msg) features_indices = np.asarray( _get_column_indices(X, features), dtype=np.int32, order='C' diff --git a/sklearn/inspection/tests/test_partial_dependence.py b/sklearn/inspection/tests/test_partial_dependence.py index ca85db1a0d17d..1b8bc0260c119 100644 --- a/sklearn/inspection/tests/test_partial_dependence.py +++ b/sklearn/inspection/tests/test_partial_dependence.py @@ -33,8 +33,11 @@ from sklearn.pipeline import make_pipeline from sklearn.dummy import DummyClassifier from sklearn.base import BaseEstimator, ClassifierMixin, clone +from sklearn.exceptions import NotFittedError +from sklearn.utils.testing import all_estimators from sklearn.utils.testing import assert_allclose from sklearn.utils.testing import assert_array_equal +from sklearn.utils.testing import SkipTest # toy sample @@ -349,7 +352,7 @@ def test_partial_dependence_unknown_feature_indices(estimator): X, y = make_classification(random_state=0) estimator.fit(X, y) - features = 100000 + features = 10000 err_msg = 'all features must be in' with pytest.raises(ValueError, match=err_msg): partial_dependence(estimator, X, [features]) @@ -371,16 +374,6 @@ def test_partial_dependence_unknown_feature_string(estimator): partial_dependence(estimator, df, [features]) -@pytest.mark.parametrize( - 'estimator', - [LinearRegression(), GradientBoostingClassifier(random_state=0)] -) -def test_partial_dependence_unfitted_estimator(estimator): - err_msg = "'estimator' parameter must be a fitted estimator" - with pytest.raises(ValueError, match=err_msg): - partial_dependence(estimator, X, [0]) - - @pytest.mark.parametrize( 'estimator', [LinearRegression(), GradientBoostingClassifier(random_state=0)] @@ -548,6 +541,29 @@ def test_partial_dependence_feature_type(features, expected_pd_shape): assert len(values_pipe) == len(pdp_pipe.shape) - 1 +@pytest.mark.parametrize( + "name, Estimator", all_estimators(type_filter=['classifier', 'regressor']) +) +def test_partial_dependence_unfitted(name, Estimator): + try: + estimator = Estimator() + except TypeError: + raise SkipTest( + 'The {} estimator cannot be built with default parameters' + .format(name) + ) + + X, y = iris.data, iris.target + preprocessor = make_column_transformer( + (StandardScaler(), [0, 2]), (RobustScaler(), [1, 3]) + ) + pipe = make_pipeline(preprocessor, estimator) + with pytest.raises(NotFittedError, match="is not fitted yet"): + partial_dependence(pipe, X, features=[0, 2], grid_resolution=10) + with pytest.raises(NotFittedError, match="is not fitted yet"): + partial_dependence(estimator, X, features=[0, 2], grid_resolution=10) + + def test_plot_partial_dependence(pyplot): # Test partial dependence plot function. boston = load_boston() diff --git a/sklearn/utils/validation.py b/sklearn/utils/validation.py index 06604bb583f66..12561d1fed450 100644 --- a/sklearn/utils/validation.py +++ b/sklearn/utils/validation.py @@ -865,7 +865,7 @@ def check_symmetric(array, tol=1E-10, raise_warning=True, return array -def check_is_fitted(estimator, attributes='deprecated', msg=None): +def check_is_fitted(estimator, msg=None): """Perform is_fitted validation for estimator. 
Checks if the estimator is fitted by verifying the presence of @@ -877,7 +877,6 @@ def check_is_fitted(estimator, attributes='deprecated', msg=None): estimator : estimator instance. estimator instance for which the check is performed. - msg : string The default error message is, "This %(name)s instance is not fitted yet. Call 'fit' with appropriate arguments before using this method." @@ -896,10 +895,6 @@ def check_is_fitted(estimator, attributes='deprecated', msg=None): NotFittedError If the attributes are not found. """ - if attributes != 'deprecated': - warnings.warn("Passing attributes to check_is_fitted is deprecated" - "and will be removed in 0.23. The attributes " - "argument is ignored.", DeprecationWarning) if isclass(estimator): raise TypeError("{} is a class, not an instance.".format(estimator)) if msg is None: @@ -909,12 +904,18 @@ def check_is_fitted(estimator, attributes='deprecated', msg=None): if not hasattr(estimator, 'fit'): raise TypeError("%s is not an estimator instance." % (estimator)) - attrs = [v for v in vars(estimator) - if (v.endswith("_") or v.startswith("_")) - and not v.startswith("__")] + from ..pipeline import Pipeline + if isinstance(estimator, Pipeline): + for est in estimator: + if est is not None: + check_is_fitted(est) + else: + attrs = [v for v in vars(estimator) + if (v.endswith("_") or v.startswith("_")) + and not v.startswith("__")] - if not len(attrs): - raise NotFittedError(msg % {'name': type(estimator).__name__}) + if not attrs: + raise NotFittedError(msg % {'name': type(estimator).__name__}) def check_non_negative(X, whom): From 18c8b551d37c331d7f98c12ccec6dad82d8fd710 Mon Sep 17 00:00:00 2001 From: Guillaume Lemaitre Date: Mon, 5 Aug 2019 18:47:48 +0200 Subject: [PATCH 51/86] PEP8 --- sklearn/inspection/tests/test_partial_dependence.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/sklearn/inspection/tests/test_partial_dependence.py b/sklearn/inspection/tests/test_partial_dependence.py index 1b8bc0260c119..1a7d4ca0830f1 100644 --- a/sklearn/inspection/tests/test_partial_dependence.py +++ b/sklearn/inspection/tests/test_partial_dependence.py @@ -553,7 +553,7 @@ def test_partial_dependence_unfitted(name, Estimator): .format(name) ) - X, y = iris.data, iris.target + X = iris.data preprocessor = make_column_transformer( (StandardScaler(), [0, 2]), (RobustScaler(), [1, 3]) ) From 1587bfea6a6e58815c2f5df697258a25fdcc72b4 Mon Sep 17 00:00:00 2001 From: Guillaume Lemaitre Date: Thu, 12 Sep 2019 17:25:40 +0200 Subject: [PATCH 52/86] fix merge conflict error --- doc/whats_new/v0.22.rst | 6 ------ sklearn/inspection/partial_dependence.py | 9 +-------- sklearn/utils/__init__.py | 1 - 3 files changed, 1 insertion(+), 15 deletions(-) diff --git a/doc/whats_new/v0.22.rst b/doc/whats_new/v0.22.rst index c14cccdfd43c5..4a4e22718c529 100644 --- a/doc/whats_new/v0.22.rst +++ b/doc/whats_new/v0.22.rst @@ -450,12 +450,6 @@ Changelog ``decision_function_shape='ovr'``, and the number of target classes > 2. :pr:`12557` by `Adrin Jalali`_. -:mod:`sklearn.feature_selection` -................................ -- |Fix| Fixed a bug where :class:`VarianceThreshold` with `threshold=0` did not - remove constant features due to numerical instability, by using range - rather than variance in this case. - :pr:`13704` by `Roddy MacSween `. - |Enhancement| SVM estimators now throw a more specific error when `kernel='precomputed'` and fit on non-square data. :pr:`14336` by :user:`Gregory Dexter `. 
diff --git a/sklearn/inspection/partial_dependence.py b/sklearn/inspection/partial_dependence.py index 0c4adaf081133..d692a3b5917b3 100644 --- a/sklearn/inspection/partial_dependence.py +++ b/sklearn/inspection/partial_dependence.py @@ -392,17 +392,10 @@ def plot_partial_dependence(estimator, X, features, feature_names=None, Parameters ---------- estimator : BaseEstimator -<<<<<<< HEAD - A fitted estimator object implementing `predict`, `predict_proba`, - or `decision_function`. Multioutput-multiclass classifiers are not - supported. - X : array-like or DataFrame, shape (n_samples, n_features) -======= A fitted estimator object implementing :term:`predict`, :term:predict_proba`, or :term:`decision_function`. Multioutput-multiclass classifiers are not supported. - X : array-like, shape (n_samples, n_features) ->>>>>>> origin/master + X : {array-like or dataframe} of shape (n_samples, n_features) The data to use to build the grid of values on which the dependence will be evaluated. This is usually the training data. features : list of {int, str, pair of int, pair of str} diff --git a/sklearn/utils/__init__.py b/sklearn/utils/__init__.py index 734c3d2bd8ccf..c26fab41bfc93 100644 --- a/sklearn/utils/__init__.py +++ b/sklearn/utils/__init__.py @@ -1,7 +1,6 @@ """ The :mod:`sklearn.utils` module includes various utilities. """ -from collections.abc import Iterable from collections.abc import Sequence from contextlib import contextmanager from itertools import compress From 8a887cace4254157a60716b95634d723104cd495 Mon Sep 17 00:00:00 2001 From: Guillaume Lemaitre Date: Thu, 12 Sep 2019 17:32:18 +0200 Subject: [PATCH 53/86] handle pipeline in partial dependence function --- sklearn/inspection/partial_dependence.py | 9 ++++++++- 1 file changed, 8 insertions(+), 1 deletion(-) diff --git a/sklearn/inspection/partial_dependence.py b/sklearn/inspection/partial_dependence.py index d692a3b5917b3..f85a2ba97e058 100644 --- a/sklearn/inspection/partial_dependence.py +++ b/sklearn/inspection/partial_dependence.py @@ -15,6 +15,7 @@ from joblib import Parallel, delayed from ..base import is_classifier, is_regressor +from ..pipeline import Pipeline from ..utils.extmath import cartesian from ..utils import check_array from ..utils import check_matplotlib_support # noqa @@ -291,7 +292,13 @@ def partial_dependence(estimator, X, features, response_method='auto', raise ValueError( "'estimator' must be a fitted regressor or classifier." 
) - check_is_fitted(estimator) + + if isinstance(estimator, Pipeline): + for est in estimator: + if est not in (None, 'drop'): + check_is_fitted(est) + else: + check_is_fitted(estimator) if (is_classifier(estimator) and isinstance(estimator.classes_[0], np.ndarray)): From b6e6a4456ac02eefb30b5188a14a58b7fa65e28a Mon Sep 17 00:00:00 2001 From: Guillaume Lemaitre Date: Thu, 12 Sep 2019 22:29:17 +0200 Subject: [PATCH 54/86] drop support for negative int indexing --- sklearn/inspection/partial_dependence.py | 15 ++++++++++++++- .../inspection/tests/test_partial_dependence.py | 4 ++-- 2 files changed, 16 insertions(+), 3 deletions(-) diff --git a/sklearn/inspection/partial_dependence.py b/sklearn/inspection/partial_dependence.py index f85a2ba97e058..69f5a28377a5a 100644 --- a/sklearn/inspection/partial_dependence.py +++ b/sklearn/inspection/partial_dependence.py @@ -20,6 +20,7 @@ from ..utils import check_array from ..utils import check_matplotlib_support # noqa from ..utils import safe_indexing +from ..utils import _determine_key_type from ..utils import _get_column_indices from ..utils.validation import check_is_fitted from ..tree._tree import DTYPE @@ -191,7 +192,7 @@ def partial_dependence(estimator, X, features, response_method='auto', ``X`` is used both to generate a grid of values for the ``features``, and to compute the averaged predictions when method is 'brute'. - features : list or array-like of int + features : array-like of {int, str} The target features for which the partial dependency should be computed. response_method : 'auto', 'predict_proba' or 'decision_function', \ @@ -358,6 +359,18 @@ def partial_dependence(estimator, X, features, response_method='auto', "'decision_function'. Got {}.".format(response_method) ) + if _determine_key_type(features) == 'int': + raise_err = False + if isinstance(features, Iterable): + raise_err = np.all(np.less(features, 0)) + elif isinstance(features, numbers.Integral): + raise_err = features < 0 + + if raise_err: + raise ValueError( + 'all features must be in [0, {}]'.format(X.shape[0] - 1) + ) + features_indices = np.asarray( _get_column_indices(X, features), dtype=np.int32, order='C' ).ravel() diff --git a/sklearn/inspection/tests/test_partial_dependence.py b/sklearn/inspection/tests/test_partial_dependence.py index 845c8f3098404..4465a5c8218e8 100644 --- a/sklearn/inspection/tests/test_partial_dependence.py +++ b/sklearn/inspection/tests/test_partial_dependence.py @@ -348,11 +348,11 @@ def test_partial_dependence_error(estimator, params, err_msg): 'estimator', [LinearRegression(), GradientBoostingClassifier(random_state=0)] ) -def test_partial_dependence_unknown_feature_indices(estimator): +@pytest.mark.parametrize('features', [-1, 10000]) +def test_partial_dependence_unknown_feature_indices(estimator, features): X, y = make_classification(random_state=0) estimator.fit(X, y) - features = 10000 err_msg = 'all features must be in' with pytest.raises(ValueError, match=err_msg): partial_dependence(estimator, X, [features]) From 9dbfea52f6ff420049c78209d05c7e130b60451c Mon Sep 17 00:00:00 2001 From: Guillaume Lemaitre Date: Fri, 13 Sep 2019 09:43:46 +0200 Subject: [PATCH 55/86] TST check dataframe are supported in plot_partial_dependence --- .../inspection/tests/test_partial_dependence.py | 16 ++++++++++++++++ 1 file changed, 16 insertions(+) diff --git a/sklearn/inspection/tests/test_partial_dependence.py b/sklearn/inspection/tests/test_partial_dependence.py index 4465a5c8218e8..36d63cf48e062 100644 --- 
a/sklearn/inspection/tests/test_partial_dependence.py +++ b/sklearn/inspection/tests/test_partial_dependence.py @@ -711,3 +711,19 @@ def test_plot_partial_dependence_fig(pyplot): clf, X, [0, 1], target=0, grid_resolution=grid_resolution, fig=fig) assert pyplot.gcf() is fig + + +def test_plot_partial_dependence_dataframe(pyplot): + pd = pytest.importorskip('pandas') + boston = load_boston() + df = pd.DataFrame(boston.data, columns=boston.feature_names) + y = boston.target + + grid_resolution = 25 + + clf = HistGradientBoostingRegressor(max_iter=50) + clf.fit(df, y) + plot_partial_dependence( + clf, df, ['TAX', 'AGE'], grid_resolution=grid_resolution, + feature_names=df.columns.tolist() + ) From 33865c82a01b9533a6a9cd0b82533bf23cc09d7b Mon Sep 17 00:00:00 2001 From: Guillaume Lemaitre Date: Mon, 16 Sep 2019 14:18:22 +0200 Subject: [PATCH 56/86] Update sklearn/inspection/partial_dependence.py Co-Authored-By: Nicolas Hug --- sklearn/inspection/partial_dependence.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/sklearn/inspection/partial_dependence.py b/sklearn/inspection/partial_dependence.py index 2dce71edc8d91..f7cd230b7a637 100644 --- a/sklearn/inspection/partial_dependence.py +++ b/sklearn/inspection/partial_dependence.py @@ -368,7 +368,7 @@ def partial_dependence(estimator, X, features, response_method='auto', if raise_err: raise ValueError( - 'all features must be in [0, {}]'.format(X.shape[0] - 1) + 'all features must be in [0, {}]'.format(X.shape[1] - 1) ) features_indices = np.asarray( From 3cf6d7572865a65210e862f3af79ea9389c55067 Mon Sep 17 00:00:00 2001 From: Kevin Winata <32704121+kwinata@users.noreply.github.com> Date: Fri, 13 Sep 2019 17:16:20 +0800 Subject: [PATCH 57/86] DOC Add missing attributes to SVC and NuSVC (#14930) --- sklearn/svm/classes.py | 36 ++++++++++++++++++++++++++++++++---- 1 file changed, 32 insertions(+), 4 deletions(-) diff --git a/sklearn/svm/classes.py b/sklearn/svm/classes.py index dbf5b78a1a6d1..0c98d9ffb5d3e 100644 --- a/sklearn/svm/classes.py +++ b/sklearn/svm/classes.py @@ -584,15 +584,22 @@ class SVC(BaseSVC): probA_ : array, shape = [n_class * (n_class-1) / 2] probB_ : array, shape = [n_class * (n_class-1) / 2] - If probability=True, the parameters learned in Platt scaling to - produce probability estimates from decision values. If - probability=False, an empty array. Platt scaling uses the logistic - function + If `probability=True`, it corresponds to the parameters learned in + Platt scaling to produce probability estimates from decision values. + If `probability=False`, it's an empty array. Platt scaling uses the + logistic function ``1 / (1 + exp(decision_value * probA_ + probB_))`` where ``probA_`` and ``probB_`` are learned from the dataset [2]_. For more information on the multiclass case and training procedure see section 8 of [1]_. + class_weight_ : ndarray of shape (n_class,) + Multipliers of parameter C for each class. + Computed based on the ``class_weight`` parameter. + + shape_fit_ : tuple of int of shape (n_dimensions_of_X,) + Array dimensions of training vector ``X``. + Examples -------- >>> import numpy as np @@ -778,6 +785,27 @@ class NuSVC(BaseSVC): classes_ : array of shape = (n_classes,) The unique classes labels. + fit_status_ : int + 0 if correctly fitted, 1 if the algorithm did not converge. 
+ + probA_ : ndarray, shape of (n_class * (n_class-1) / 2,) + probB_ : ndarray of shape (n_class * (n_class-1) / 2,) + If `probability=True`, it corresponds to the parameters learned in + Platt scaling to produce probability estimates from decision values. + If `probability=False`, it's an empty array. Platt scaling uses the + logistic function + ``1 / (1 + exp(decision_value * probA_ + probB_))`` + where ``probA_`` and ``probB_`` are learned from the dataset [2]_. For + more information on the multiclass case and training procedure see + section 8 of [1]_. + + class_weight_ : ndarray of shape (n_class,) + Multipliers of parameter C of each class. + Computed based on the ``class_weight`` parameter. + + shape_fit_ : tuple of int of shape (n_dimensions_of_X,) + Array dimensions of training vector ``X``. + Examples -------- >>> import numpy as np From 34c82504515f0932a3e7d73be1c2c2cb5bbdf3ac Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Lo=C3=AFc=20Est=C3=A8ve?= Date: Fri, 13 Sep 2019 11:53:57 +0200 Subject: [PATCH 58/86] DOC Remove GraphViz mention in plot_tree docstring (#14973) --- sklearn/tree/export.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/sklearn/tree/export.py b/sklearn/tree/export.py index bb368a6e81f76..e75522b671750 100644 --- a/sklearn/tree/export.py +++ b/sklearn/tree/export.py @@ -96,7 +96,7 @@ def plot_tree(decision_tree, max_depth=None, feature_names=None, Parameters ---------- decision_tree : decision tree regressor or classifier - The decision tree to be exported to GraphViz. + The decision tree to be plotted. max_depth : int, optional (default=None) The maximum depth of the representation. If None, the tree is fully From 27de857e1aedda5046b02397a782bba9bd66a88e Mon Sep 17 00:00:00 2001 From: Olivier Grisel Date: Fri, 13 Sep 2019 13:22:33 +0200 Subject: [PATCH 59/86] MAINT filter deprecation warnings triggered by all_estimators (#14691) --- sklearn/utils/testing.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/sklearn/utils/testing.py b/sklearn/utils/testing.py index b2cc4bc21dcc3..4645e455d04c6 100644 --- a/sklearn/utils/testing.py +++ b/sklearn/utils/testing.py @@ -514,7 +514,9 @@ def is_abstract(c): if IS_PYPY and ('_svmlight_format' in modname or 'feature_extraction._hashing' in modname): continue - module = __import__(modname, fromlist="dummy") + # Ignore deprecation warnings triggered at import time. + with ignore_warnings(category=DeprecationWarning): + module = __import__(modname, fromlist="dummy") classes = inspect.getmembers(module, inspect.isclass) all_classes.extend(classes) From 0f9d4819e304fcac3ff4f444f1857dd884991efd Mon Sep 17 00:00:00 2001 From: Nicolas Hug Date: Fri, 13 Sep 2019 09:48:13 -0400 Subject: [PATCH 60/86] MNT Deprecate enforce_estimator_tags_y (#14945) * deprecate choose_check_classifiers_labels * made new test file * deprecated enforce_estimator_tags_y * pep8 * Added note in whatsnew * not sure what went wrong in the merge --- doc/whats_new/v0.22.rst | 4 ++ sklearn/utils/estimator_checks.py | 67 +++++++++++--------- sklearn/utils/tests/test_deprecated_utils.py | 8 +++ 3 files changed, 49 insertions(+), 30 deletions(-) diff --git a/doc/whats_new/v0.22.rst b/doc/whats_new/v0.22.rst index 09eec39bbb9d5..015b0424e386e 100644 --- a/doc/whats_new/v0.22.rst +++ b/doc/whats_new/v0.22.rst @@ -488,6 +488,10 @@ Changelog :func:`~utils.estimator_checks.parametrize_with_checks`, to parametrize estimator checks for a list of estimators. :pr:`14381` by `Thomas Fan`_. 
+- |API| The following utils have been deprecated and are now private: + - ``choose_check_classifiers_labels`` + - ``enforce_estimator_tags_y`` + - |Enhancement| :func:`utils.safe_indexing` accepts an ``axis`` parameter to index array-like across rows and columns. The column indexing can be done on NumPy array, SciPy sparse matrix, and Pandas DataFrame. An additional diff --git a/sklearn/utils/estimator_checks.py b/sklearn/utils/estimator_checks.py index b304280d10a3f..438892db23865 100644 --- a/sklearn/utils/estimator_checks.py +++ b/sklearn/utils/estimator_checks.py @@ -149,7 +149,7 @@ def check_supervised_y_no_nan(name, estimator_orig): rng = np.random.RandomState(888) X = rng.randn(10, 5) y = np.full(10, np.inf) - y = enforce_estimator_tags_y(estimator, y) + y = _enforce_estimator_tags_y(estimator, y) errmsg = "Input contains NaN, infinity or a value too large for " \ "dtype('float64')." @@ -626,7 +626,7 @@ def check_estimator_sparse_data(name, estimator_orig): # catch deprecation warnings with ignore_warnings(category=DeprecationWarning): estimator = clone(estimator_orig) - y = enforce_estimator_tags_y(estimator, y) + y = _enforce_estimator_tags_y(estimator, y) for matrix_format, X in _generate_sparse_matrix(X_csr): # catch deprecation warnings with ignore_warnings(category=(DeprecationWarning, FutureWarning)): @@ -709,7 +709,7 @@ def check_sample_weights_list(name, estimator_orig): y = np.arange(10) % 2 else: y = np.arange(10) % 3 - y = enforce_estimator_tags_y(estimator, y) + y = _enforce_estimator_tags_y(estimator, y) sample_weight = [3] * 10 # Test that estimators don't raise any exception estimator.fit(X, y, sample_weight=sample_weight) @@ -735,7 +735,7 @@ def check_sample_weights_invariance(name, estimator_orig): [4, 1], [4, 1], [4, 1], [4, 1]], dtype=np.dtype('float')) y = np.array([1, 1, 1, 1, 2, 2, 2, 2, 1, 1, 1, 1, 2, 2, 2, 2], dtype=np.dtype('int')) - y = enforce_estimator_tags_y(estimator1, y) + y = _enforce_estimator_tags_y(estimator1, y) estimator1.fit(X, y=y, sample_weight=np.ones(shape=len(y))) estimator2.fit(X, y=y, sample_weight=None) @@ -765,7 +765,7 @@ def check_dtype_object(name, estimator_orig): else: y = (X[:, 0] * 4).astype(np.int) estimator = clone(estimator_orig) - y = enforce_estimator_tags_y(estimator, y) + y = _enforce_estimator_tags_y(estimator, y) estimator.fit(X, y) if hasattr(estimator, "predict"): @@ -820,7 +820,7 @@ def check_dict_unchanged(name, estimator_orig): y = X[:, 0].astype(np.int) estimator = clone(estimator_orig) - y = enforce_estimator_tags_y(estimator, y) + y = _enforce_estimator_tags_y(estimator, y) if hasattr(estimator, "n_components"): estimator.n_components = 1 @@ -859,7 +859,7 @@ def check_dont_overwrite_parameters(name, estimator_orig): y = X[:, 0].astype(np.int) if _safe_tags(estimator, 'binary_only'): y[y == 2] = 1 - y = enforce_estimator_tags_y(estimator, y) + y = _enforce_estimator_tags_y(estimator, y) if hasattr(estimator, "n_components"): estimator.n_components = 1 @@ -912,7 +912,7 @@ def check_fit2d_predict1d(name, estimator_orig): if tags['binary_only']: y[y == 2] = 1 estimator = clone(estimator_orig) - y = enforce_estimator_tags_y(estimator, y) + y = _enforce_estimator_tags_y(estimator, y) if hasattr(estimator, "n_components"): estimator.n_components = 1 @@ -960,7 +960,7 @@ def check_methods_subset_invariance(name, estimator_orig): if _safe_tags(estimator_orig, 'binary_only'): y[y == 2] = 1 estimator = clone(estimator_orig) - y = enforce_estimator_tags_y(estimator, y) + y = _enforce_estimator_tags_y(estimator, y) if 
hasattr(estimator, "n_components"): estimator.n_components = 1 @@ -1001,7 +1001,7 @@ def check_fit2d_1sample(name, estimator_orig): y = X[:, 0].astype(np.int) estimator = clone(estimator_orig) - y = enforce_estimator_tags_y(estimator, y) + y = _enforce_estimator_tags_y(estimator, y) if hasattr(estimator, "n_components"): estimator.n_components = 1 @@ -1033,7 +1033,7 @@ def check_fit2d_1feature(name, estimator_orig): X = pairwise_estimator_convert_X(X, estimator_orig) y = X[:, 0].astype(np.int) estimator = clone(estimator_orig) - y = enforce_estimator_tags_y(estimator, y) + y = _enforce_estimator_tags_y(estimator, y) if hasattr(estimator, "n_components"): estimator.n_components = 1 @@ -1046,7 +1046,7 @@ def check_fit2d_1feature(name, estimator_orig): if name == 'RANSACRegressor': estimator.residual_threshold = 0.5 - y = enforce_estimator_tags_y(estimator, y) + y = _enforce_estimator_tags_y(estimator, y) set_random_state(estimator, 1) msgs = ["1 feature(s)", "n_features = 1", "n_features=1"] @@ -1069,7 +1069,7 @@ def check_fit1d(name, estimator_orig): if tags["no_validation"]: # FIXME this is a bit loose return - y = enforce_estimator_tags_y(estimator, y) + y = _enforce_estimator_tags_y(estimator, y) if hasattr(estimator, "n_components"): estimator.n_components = 1 @@ -1210,7 +1210,7 @@ def check_pipeline_consistency(name, estimator_orig): X -= X.min() X = pairwise_estimator_convert_X(X, estimator_orig, kernel=rbf_kernel) estimator = clone(estimator_orig) - y = enforce_estimator_tags_y(estimator, y) + y = _enforce_estimator_tags_y(estimator, y) set_random_state(estimator) pipeline = make_pipeline(estimator) estimator.fit(X, y) @@ -1239,7 +1239,7 @@ def check_fit_score_takes_y(name, estimator_orig): else: y = np.arange(10) % 3 estimator = clone(estimator_orig) - y = enforce_estimator_tags_y(estimator, y) + y = _enforce_estimator_tags_y(estimator, y) set_random_state(estimator) funcs = ["fit", "score", "partial_fit", "fit_predict", "fit_transform"] @@ -1269,7 +1269,7 @@ def check_estimators_dtypes(name, estimator_orig): y = X_train_int_64[:, 0] if _safe_tags(estimator_orig, 'binary_only'): y[y == 2] = 1 - y = enforce_estimator_tags_y(estimator_orig, y) + y = _enforce_estimator_tags_y(estimator_orig, y) methods = ["predict", "transform", "decision_function", "predict_proba"] @@ -1300,7 +1300,7 @@ def check_estimators_empty_data_messages(name, estimator_orig): X_zero_features = np.empty(0).reshape(3, 0) # the following y should be accepted by both classifiers and regressors # and ignored by unsupervised models - y = enforce_estimator_tags_y(e, np.array([1, 0, 1])) + y = _enforce_estimator_tags_y(e, np.array([1, 0, 1])) msg = (r"0 feature\(s\) \(shape=\(3, 0\)\) while a minimum of \d* " "is required.") assert_raises_regex(ValueError, msg, e.fit, X_zero_features, y) @@ -1318,7 +1318,7 @@ def check_estimators_nan_inf(name, estimator_orig): X_train_inf[0, 0] = np.inf y = np.ones(10) y[:5] = 0 - y = enforce_estimator_tags_y(estimator_orig, y) + y = _enforce_estimator_tags_y(estimator_orig, y) error_string_fit = "Estimator doesn't check for NaN and inf in fit." 
error_string_predict = ("Estimator doesn't check for NaN and inf in" " predict.") @@ -1413,7 +1413,7 @@ def check_estimators_pickle(name, estimator_orig): estimator = clone(estimator_orig) - y = enforce_estimator_tags_y(estimator, y) + y = _enforce_estimator_tags_y(estimator, y) set_random_state(estimator) estimator.fit(X, y) @@ -1600,7 +1600,7 @@ def check_classifiers_train(name, classifier_orig, readonly_memmap=False): n_samples, n_features = X.shape classifier = clone(classifier_orig) X = pairwise_estimator_convert_X(X, classifier) - y = enforce_estimator_tags_y(classifier, y) + y = _enforce_estimator_tags_y(classifier, y) set_random_state(classifier) # raises error on malformed input for fit @@ -1805,7 +1805,7 @@ def check_estimators_fit_returns_self(name, estimator_orig, X = pairwise_estimator_convert_X(X, estimator_orig) estimator = clone(estimator_orig) - y = enforce_estimator_tags_y(estimator, y) + y = _enforce_estimator_tags_y(estimator, y) if readonly_memmap: X, y = create_memmap_backed_data([X, y]) @@ -1842,7 +1842,7 @@ def check_supervised_y_2d(name, estimator_orig): y = np.arange(10) % 2 else: y = np.arange(10) % 3 - y = enforce_estimator_tags_y(estimator_orig, y) + y = _enforce_estimator_tags_y(estimator_orig, y) estimator = clone(estimator_orig) set_random_state(estimator) # fit @@ -1965,7 +1965,7 @@ def check_regressors_int(name, regressor_orig): X = pairwise_estimator_convert_X(X[:50], regressor_orig) rnd = np.random.RandomState(0) y = rnd.randint(3, size=X.shape[0]) - y = enforce_estimator_tags_y(regressor_orig, y) + y = _enforce_estimator_tags_y(regressor_orig, y) rnd = np.random.RandomState(0) # separate estimators to control random seeds regressor_1 = clone(regressor_orig) @@ -1994,7 +1994,7 @@ def check_regressors_train(name, regressor_orig, readonly_memmap=False): y = StandardScaler().fit_transform(y.reshape(-1, 1)) # X is already scaled y = y.ravel() regressor = clone(regressor_orig) - y = enforce_estimator_tags_y(regressor, y) + y = _enforce_estimator_tags_y(regressor, y) if name in CROSS_DECOMPOSITION: rnd = np.random.RandomState(0) y_ = np.vstack([y, 2 * y + rnd.randint(2, size=len(y))]) @@ -2040,7 +2040,7 @@ def check_regressors_no_decision_function(name, regressor_orig): X = rng.normal(size=(10, 4)) X = pairwise_estimator_convert_X(X, regressor_orig) - y = enforce_estimator_tags_y(regressor, X[:, 0]) + y = _enforce_estimator_tags_y(regressor, X[:, 0]) if hasattr(regressor, "n_components"): # FIXME CCA, PLS is not robust to rank 1 effects @@ -2180,7 +2180,7 @@ def check_estimators_overwrite_params(name, estimator_orig): X -= X.min() X = pairwise_estimator_convert_X(X, estimator_orig, kernel=rbf_kernel) estimator = clone(estimator_orig) - y = enforce_estimator_tags_y(estimator, y) + y = _enforce_estimator_tags_y(estimator, y) set_random_state(estimator) @@ -2270,7 +2270,7 @@ def check_classifier_data_not_an_array(name, estimator_orig): X = np.array([[3, 0], [0, 1], [0, 2], [1, 1], [1, 2], [2, 1]]) X = pairwise_estimator_convert_X(X, estimator_orig) y = [1, 1, 1, 2, 2, 2] - y = enforce_estimator_tags_y(estimator_orig, y) + y = _enforce_estimator_tags_y(estimator_orig, y) check_estimators_data_not_an_array(name, estimator_orig, X, y) @@ -2278,7 +2278,7 @@ def check_classifier_data_not_an_array(name, estimator_orig): def check_regressor_data_not_an_array(name, estimator_orig): X, y = _boston_subset(n_samples=50) X = pairwise_estimator_convert_X(X, estimator_orig) - y = enforce_estimator_tags_y(estimator_orig, y) + y = _enforce_estimator_tags_y(estimator_orig, y) 
check_estimators_data_not_an_array(name, estimator_orig, X, y) @@ -2372,7 +2372,14 @@ def param_filter(p): assert param_value == init_param.default, init_param.name +# TODO: remove in 0.24 +@deprecated("enforce_estimator_tags_y is deprecated in version " + "0.22 and will be removed in version 0.24.") def enforce_estimator_tags_y(estimator, y): + return _enforce_estimator_tags_y(estimator, y) + + +def _enforce_estimator_tags_y(estimator, y): # Estimators with a `requires_positive_y` tag only accept strictly positive # data if _safe_tags(estimator, "requires_positive_y"): @@ -2411,7 +2418,7 @@ def check_non_transformer_estimators_n_iter(name, estimator_orig): if hasattr(estimator, 'max_iter'): iris = load_iris() X, y_ = iris.data, iris.target - y_ = enforce_estimator_tags_y(estimator, y_) + y_ = _enforce_estimator_tags_y(estimator, y_) set_random_state(estimator, 0) @@ -2627,7 +2634,7 @@ def check_fit_idempotent(name, estimator_orig): y = rng.normal(size=n_samples) else: y = rng.randint(low=0, high=2, size=n_samples) - y = enforce_estimator_tags_y(estimator, y) + y = _enforce_estimator_tags_y(estimator, y) train, test = next(ShuffleSplit(test_size=.2, random_state=rng).split(X)) X_train, y_train = _safe_split(estimator, X, y, train) diff --git a/sklearn/utils/tests/test_deprecated_utils.py b/sklearn/utils/tests/test_deprecated_utils.py index c25c1d747ab6b..83dc599e0e7be 100644 --- a/sklearn/utils/tests/test_deprecated_utils.py +++ b/sklearn/utils/tests/test_deprecated_utils.py @@ -1,6 +1,9 @@ import pytest +import numpy as np +from sklearn.dummy import DummyClassifier from sklearn.utils.estimator_checks import choose_check_classifiers_labels +from sklearn.utils.estimator_checks import enforce_estimator_tags_y # This file tests the utils that are deprecated @@ -9,3 +12,8 @@ def test_choose_check_classifiers_labels_deprecated(): with pytest.warns(DeprecationWarning, match="removed in version 0.24"): choose_check_classifiers_labels(None, None, None) + + +def test_enforce_estimator_tags_y(): + with pytest.warns(DeprecationWarning, match="removed in version 0.24"): + enforce_estimator_tags_y(DummyClassifier(), np.array([0, 1])) From 9e0b7d2da4c085c324055ac0277c821f2d790d78 Mon Sep 17 00:00:00 2001 From: Thomas J Fan Date: Fri, 13 Sep 2019 13:20:29 -0400 Subject: [PATCH 61/86] DOC Adds more docstring standards (#14744) --- doc/developers/contributing.rst | 14 ++++++++++++++ 1 file changed, 14 insertions(+) diff --git a/doc/developers/contributing.rst b/doc/developers/contributing.rst index a400fd1f7f6d4..4b24c7089a5a8 100644 --- a/doc/developers/contributing.rst +++ b/doc/developers/contributing.rst @@ -629,6 +629,12 @@ Finally, follow the formatting rules below to make it consistently good: of the mentioned shapes. The default value is `np.ones(shape=(n_samples,))`. + list_param : list of int + + typed_ndarray : ndarray of shape (n_samples,), dtype=np.int32 + + sample_weight : array-like of shape (n_samples,), default=None + In general have the following in mind: 1. Use Python basic types. (``bool`` instead of ``boolean``) @@ -639,6 +645,14 @@ In general have the following in mind: 4. 1D or 2D data can be a subset of ``{array-like, ndarray, sparse matrix, dataframe}``. Note that ``array-like`` can also be a ``list``, while ``ndarray`` is explicitly only a ``numpy.ndarray``. + 5. When specifying the data type of a list, use ``of`` as a delimiter: + ``list of int``. + 6. When specifying the dtype of an ndarray, use e.g. 
``dtype=np.int32`` + after defining the shape: + ``ndarray of shape (n_samples,), dtype=np.int32``. + 7. When the default is ``None``, ``None`` only needs to be specified at the + end with ``default=None``. Be sure to include in the docstring, what it + means for the parameter or attribute to be ``None``. * For unwritten formatting rules, try to follow existing good works: From 2db5c0d385a3d820c96356258148867fa0dfd6ae Mon Sep 17 00:00:00 2001 From: Jesper Dramsch Date: Fri, 13 Sep 2019 20:29:20 +0200 Subject: [PATCH 62/86] DOC Add example for GroupShuffleSplit (#14906) --- sklearn/model_selection/_split.py | 16 ++++++++++++++++ 1 file changed, 16 insertions(+) diff --git a/sklearn/model_selection/_split.py b/sklearn/model_selection/_split.py index ab681e89c1916..c49a3ce6aea4e 100644 --- a/sklearn/model_selection/_split.py +++ b/sklearn/model_selection/_split.py @@ -1481,6 +1481,22 @@ class GroupShuffleSplit(ShuffleSplit): If None, the random number generator is the RandomState instance used by `np.random`. + Examples + -------- + >>> import numpy as np + >>> from sklearn.model_selection import GroupShuffleSplit + >>> X = np.ones(shape=(8, 2)) + >>> y = np.ones(shape=(8, 1)) + >>> groups = np.array([1, 1, 2, 2, 2, 3, 3, 3]) + >>> print(groups.shape) + (8,) + >>> gss = GroupShuffleSplit(n_splits=2, train_size=.7, random_state=42) + >>> gss.get_n_splits() + 2 + >>> for train_idx, test_idx in gss.split(X, y, groups): + ... print("TRAIN:", train_idx, "TEST:", test_idx) + TRAIN: [2 3 4 5 6 7] TEST: [0 1] + TRAIN: [0 1 5 6 7] TEST: [2 3 4] ''' def __init__(self, n_splits=5, test_size=None, train_size=None, From 334fe5a2faeafcaee29eaa7df866164542ebfee4 Mon Sep 17 00:00:00 2001 From: catajara <48599015+catajara@users.noreply.github.com> Date: Fri, 13 Sep 2019 15:19:56 -0400 Subject: [PATCH 63/86] DOC add missing attributes to OneVsRestClassifier (#14783) --- sklearn/multiclass.py | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/sklearn/multiclass.py b/sklearn/multiclass.py index 8b28507906e2b..9cee9661489b6 100644 --- a/sklearn/multiclass.py +++ b/sklearn/multiclass.py @@ -170,11 +170,17 @@ class OneVsRestClassifier(MultiOutputMixin, ClassifierMixin, classes_ : array, shape = [`n_classes`] Class labels. + + n_classes_ : int + Number of classes. + label_binarizer_ : LabelBinarizer object Object used to transform multiclass labels to binary labels and vice-versa. + multilabel_ : boolean Whether a OneVsRestClassifier is a multilabel classifier. 
+ """ def __init__(self, estimator, n_jobs=None): self.estimator = estimator From 6f4509a2743d5c6b67f00445272263a96f4a5214 Mon Sep 17 00:00:00 2001 From: Ming Li <14131823+minggli@users.noreply.github.com> Date: Sat, 14 Sep 2019 21:00:58 +0100 Subject: [PATCH 64/86] TST Adjusts rtol for test_lda_predict (#14978) --- sklearn/tests/test_discriminant_analysis.py | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/sklearn/tests/test_discriminant_analysis.py b/sklearn/tests/test_discriminant_analysis.py index 6a32c4ec15058..002d69357e1c5 100644 --- a/sklearn/tests/test_discriminant_analysis.py +++ b/sklearn/tests/test_discriminant_analysis.py @@ -2,7 +2,6 @@ import pytest -from numpy.testing import assert_allclose from scipy import linalg from sklearn.exceptions import ChangedBehaviorWarning @@ -76,8 +75,8 @@ def test_lda_predict(): assert_array_equal((y_proba_pred1[:, 1] > 0.5) + 1, y, 'solver %s' % solver) y_log_proba_pred1 = clf.predict_log_proba(X1) - assert_array_almost_equal(np.exp(y_log_proba_pred1), y_proba_pred1, - 8, 'solver %s' % solver) + assert_allclose(np.exp(y_log_proba_pred1), y_proba_pred1, + rtol=1e-6, err_msg='solver %s' % solver) # Primarily test for commit 2f34950 -- "reuse" of priors y_pred3 = clf.fit(X, y3).predict(X) From c20e3120da809901efb2bd0d851e8cbae049313f Mon Sep 17 00:00:00 2001 From: Andrea Navarrete Date: Sat, 14 Sep 2019 17:40:21 -0400 Subject: [PATCH 65/86] DOC Change default dataset for `plot_johnson_lindenstrauss_bound.py` (#14787) --- examples/plot_johnson_lindenstrauss_bound.py | 28 +++++++++++--------- 1 file changed, 15 insertions(+), 13 deletions(-) diff --git a/examples/plot_johnson_lindenstrauss_bound.py b/examples/plot_johnson_lindenstrauss_bound.py index 988de0fe2735b..b981c14fbf132 100644 --- a/examples/plot_johnson_lindenstrauss_bound.py +++ b/examples/plot_johnson_lindenstrauss_bound.py @@ -102,27 +102,26 @@ # Empirical validation # ==================== # -# We validate the above bounds on the digits dataset or on the 20 newsgroups -# text document (TF-IDF word frequencies) dataset: -# -# - for the digits dataset, some 8x8 gray level pixels data for 500 -# handwritten digits pictures are randomly projected to spaces for various -# larger number of dimensions ``n_components``. +# We validate the above bounds on the 20 newsgroups text document +# (TF-IDF word frequencies) dataset or on the digits dataset: # # - for the 20 newsgroups dataset some 500 documents with 100k # features in total are projected using a sparse random matrix to smaller # euclidean spaces with various values for the target number of dimensions # ``n_components``. # -# The default dataset is the digits dataset. To run the example on the twenty -# newsgroups dataset, pass the --twenty-newsgroups command line argument to +# - for the digits dataset, some 8x8 gray level pixels data for 500 +# handwritten digits pictures are randomly projected to spaces for various +# larger number of dimensions ``n_components``. +# +# The default dataset is the 20 newsgroups dataset. To run the example on the +# digits dataset, pass the ``--use-digits-dataset`` command line argument to # this script. 
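(For orientation, the kind of check this example performs can be sketched in a few lines of plain scikit-learn. The sketch below is illustrative only: it uses synthetic random data and an arbitrary ``eps`` instead of the datasets and settings used by the example itself.)

import numpy as np
from sklearn.random_projection import (SparseRandomProjection,
                                       johnson_lindenstrauss_min_dim)
from sklearn.metrics.pairwise import euclidean_distances

rng = np.random.RandomState(0)
data = rng.rand(500, 10000)            # 500 samples in a 10000-dimensional space
n_components = johnson_lindenstrauss_min_dim(n_samples=500, eps=0.5)
projected = SparseRandomProjection(n_components=n_components,
                                   random_state=0).fit_transform(data)

dists = euclidean_distances(data, squared=True).ravel()
nonzero = dists != 0
rates = (euclidean_distances(projected, squared=True).ravel()[nonzero]
         / dists[nonzero])
# with high probability these ratios stay within [1 - eps, 1 + eps]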
-if '--twenty-newsgroups' in sys.argv: - # Need an internet connection hence not enabled by default - data = fetch_20newsgroups_vectorized().data[:500] -else: +if '--use-digits-dataset' in sys.argv: data = load_digits().data[:500] +else: + data = fetch_20newsgroups_vectorized().data[:500] ########################################################## # For each value of ``n_components``, we plot: @@ -158,7 +157,10 @@ projected_data, squared=True).ravel()[nonzero] plt.figure() - plt.hexbin(dists, projected_dists, gridsize=100, cmap=plt.cm.PuBu) + min_dist = min(projected_dists.min(), dists.min()) + max_dist = max(projected_dists.max(), dists.max()) + plt.hexbin(dists, projected_dists, gridsize=100, cmap=plt.cm.PuBu, + extent=[min_dist, max_dist, min_dist, max_dist]) plt.xlabel("Pairwise squared distances in original space") plt.ylabel("Pairwise squared distances in projected space") plt.title("Pairwise distances distribution for n_components=%d" % From 9b65ed75991d739595eb1286f43bb838a15a3199 Mon Sep 17 00:00:00 2001 From: Nicolas Hug Date: Sun, 15 Sep 2019 18:09:09 -0400 Subject: [PATCH 66/86] MNT deprecate outputs_2d_ attribute of dummy estimators (#14933) --- doc/whats_new/v0.22.rst | 6 +++++- sklearn/dummy.py | 39 +++++++++++++++++++++++++------------ sklearn/tests/test_dummy.py | 9 +++++++++ 3 files changed, 41 insertions(+), 13 deletions(-) diff --git a/doc/whats_new/v0.22.rst b/doc/whats_new/v0.22.rst index 015b0424e386e..5606e36e6de81 100644 --- a/doc/whats_new/v0.22.rst +++ b/doc/whats_new/v0.22.rst @@ -139,12 +139,16 @@ Changelog `Adrin Jalali`_. :mod:`sklearn.dummy` -............................ +.................... - |Fix| :class:`dummy.DummyClassifier` now handles checking the existence of the provided constant in multiouput cases. :pr:`14908` by :user:`Martina G. Vilas `. +- |API| The ``outputs_2d_`` attribute is deprecated in + :class:`dummy.DummyClassifier` and :class:`dummy.DummyRegressor`. It is + equivalent to ``n_outputs > 1``. :pr:`14933` by `Nicolas Hug`_ + :mod:`sklearn.ensemble` ....................... 
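(A minimal sketch of the behaviour described by this changelog entry, using the deprecated property added in the ``sklearn/dummy.py`` hunk below; the toy data is arbitrary, and the fitted attribute written out in the comparison is ``n_outputs_``.)

from sklearn.dummy import DummyClassifier

clf = DummyClassifier(strategy='most_frequent').fit([[1, 2], [3, 4]], [0, 1])
# accessing the attribute still works but now emits a DeprecationWarning;
# the documented replacement is the explicit comparison on the right
assert clf.outputs_2d_ == (clf.n_outputs_ > 1)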
diff --git a/sklearn/dummy.py b/sklearn/dummy.py index f95fcc3dcf618..233dc27aec076 100644 --- a/sklearn/dummy.py +++ b/sklearn/dummy.py @@ -17,6 +17,7 @@ from .utils.random import random_choice_csc from .utils.stats import _weighted_percentile from .utils.multiclass import class_distribution +from .utils import deprecated class DummyClassifier(MultiOutputMixin, ClassifierMixin, BaseEstimator): @@ -120,8 +121,6 @@ def fit(self, X, y, sample_weight=None): if not self.sparse_output_: y = np.atleast_1d(y) - self.output_2d_ = y.ndim == 2 and y.shape[1] > 1 - if y.ndim == 1: y = np.reshape(y, (-1, 1)) @@ -154,7 +153,7 @@ def fit(self, X, y, sample_weight=None): .format(self.constant, list(self.classes_[k]))) raise ValueError(err_msg) - if self.n_outputs_ == 1 and not self.output_2d_: + if self.n_outputs_ == 1: self.n_classes_ = self.n_classes_[0] self.classes_ = self.classes_[0] self.class_prior_ = self.class_prior_[0] @@ -185,7 +184,7 @@ def predict(self, X): classes_ = self.classes_ class_prior_ = self.class_prior_ constant = self.constant - if self.n_outputs_ == 1 and not self.output_2d_: + if self.n_outputs_ == 1: # Get same type even for self.n_outputs_ == 1 n_classes_ = [n_classes_] classes_ = [classes_] @@ -194,7 +193,7 @@ def predict(self, X): # Compute probability only once if self.strategy == "stratified": proba = self.predict_proba(X) - if self.n_outputs_ == 1 and not self.output_2d_: + if self.n_outputs_ == 1: proba = [proba] if self.sparse_output_: @@ -231,7 +230,7 @@ def predict(self, X): elif self.strategy == "constant": y = np.tile(self.constant, (n_samples, 1)) - if self.n_outputs_ == 1 and not self.output_2d_: + if self.n_outputs_ == 1: y = np.ravel(y) return y @@ -263,7 +262,7 @@ def predict_proba(self, X): classes_ = self.classes_ class_prior_ = self.class_prior_ constant = self.constant - if self.n_outputs_ == 1 and not self.output_2d_: + if self.n_outputs_ == 1: # Get same type even for self.n_outputs_ == 1 n_classes_ = [n_classes_] classes_ = [classes_] @@ -294,7 +293,7 @@ def predict_proba(self, X): P.append(out) - if self.n_outputs_ == 1 and not self.output_2d_: + if self.n_outputs_ == 1: P = P[0] return P @@ -355,6 +354,15 @@ def score(self, X, y, sample_weight=None): X = np.zeros(shape=(len(y), 1)) return super().score(X, y, sample_weight) + @deprecated( + "The outputs_2d_ attribute is deprecated in version 0.22 " + "and will be removed in version 0.24. It is equivalent to " + "n_outputs_ > 1." + ) + @property + def outputs_2d_(self): + return self.n_outputs_ != 1 + class DummyRegressor(MultiOutputMixin, RegressorMixin, BaseEstimator): """ @@ -429,8 +437,6 @@ def fit(self, X, y, sample_weight=None): if len(y) == 0: raise ValueError("y must not be empty.") - self.output_2d_ = y.ndim == 2 and y.shape[1] > 1 - if y.ndim == 1: y = np.reshape(y, (-1, 1)) self.n_outputs_ = y.shape[1] @@ -470,7 +476,7 @@ def fit(self, X, y, sample_weight=None): accept_sparse=['csr', 'csc', 'coo'], ensure_2d=False, ensure_min_samples=0) - if self.output_2d_ and self.constant.shape[0] != y.shape[1]: + if self.n_outputs_ != 1 and self.constant.shape[0] != y.shape[1]: raise ValueError( "Constant target value should have " "shape (%d, 1)." 
% y.shape[1]) @@ -508,7 +514,7 @@ def predict(self, X, return_std=False): dtype=np.array(self.constant_).dtype) y_std = np.zeros((n_samples, self.n_outputs_)) - if self.n_outputs_ == 1 and not self.output_2d_: + if self.n_outputs_ == 1: y = np.ravel(y) y_std = np.ravel(y_std) @@ -554,3 +560,12 @@ def score(self, X, y, sample_weight=None): if X is None: X = np.zeros(shape=(len(y), 1)) return super().score(X, y, sample_weight) + + @deprecated( + "The outputs_2d_ attribute is deprecated in version 0.22 " + "and will be removed in version 0.24. It is equivalent to " + "n_outputs_ > 1." + ) + @property + def outputs_2d_(self): + return self.n_outputs_ != 1 diff --git a/sklearn/tests/test_dummy.py b/sklearn/tests/test_dummy.py index 690fad42a58b8..88b2d16fba46e 100644 --- a/sklearn/tests/test_dummy.py +++ b/sklearn/tests/test_dummy.py @@ -755,3 +755,12 @@ def test_dtype_of_classifier_probas(strategy): probas = model.fit(X, y).predict_proba(X) assert probas.dtype == np.float64 + + +@pytest.mark.parametrize("Dummy", (DummyRegressor, DummyClassifier)) +def test_outputs_2d_deprecation(Dummy): + X = [[1, 2]] + y = [0] + with pytest.warns(DeprecationWarning, + match="will be removed in version 0.24"): + Dummy().fit(X, y).outputs_2d_ From e19a9d730fea5dd017b033ce1b270346fcc5d010 Mon Sep 17 00:00:00 2001 From: Nicolas Hug Date: Mon, 16 Sep 2019 04:58:56 -0400 Subject: [PATCH 67/86] [MRG] Make k_means use KMeans instead (#14985) --- sklearn/cluster/k_means_.py | 277 +++++++++++++++++++----------------- 1 file changed, 146 insertions(+), 131 deletions(-) diff --git a/sklearn/cluster/k_means_.py b/sklearn/cluster/k_means_.py index a83df9c836b86..8af8cc6873011 100644 --- a/sklearn/cluster/k_means_.py +++ b/sklearn/cluster/k_means_.py @@ -288,131 +288,17 @@ def k_means(X, n_clusters, sample_weight=None, init='k-means++', Returned only if `return_n_iter` is set to True. """ - if n_init <= 0: - raise ValueError("Invalid number of initializations." - " n_init=%d must be bigger than zero." % n_init) - random_state = check_random_state(random_state) - - if max_iter <= 0: - raise ValueError('Number of iterations should be a positive number,' - ' got %d instead' % max_iter) - - # avoid forcing order when copy_x=False - order = "C" if copy_x else None - X = check_array(X, accept_sparse='csr', dtype=[np.float64, np.float32], - order=order, copy=copy_x) - # verify that the number of samples given is larger than k - if _num_samples(X) < n_clusters: - raise ValueError("n_samples=%d should be >= n_clusters=%d" % ( - _num_samples(X), n_clusters)) - - tol = _tolerance(X, tol) - - # If the distances are precomputed every job will create a matrix of shape - # (n_clusters, n_samples). To stop KMeans from eating up memory we only - # activate this if the created matrix is guaranteed to be under 100MB. 12 - # million entries consume a little under 100MB if they are of type double. 
- if precompute_distances == 'auto': - n_samples = X.shape[0] - precompute_distances = (n_clusters * n_samples) < 12e6 - elif isinstance(precompute_distances, bool): - pass - else: - raise ValueError("precompute_distances should be 'auto' or True/False" - ", but a value of %r was passed" % - precompute_distances) - - # Validate init array - if hasattr(init, '__array__'): - init = check_array(init, dtype=X.dtype.type, copy=True) - _validate_center_shape(X, n_clusters, init) - - if n_init != 1: - warnings.warn( - 'Explicit initial center position passed: ' - 'performing only one init in k-means instead of n_init=%d' - % n_init, RuntimeWarning, stacklevel=2) - n_init = 1 - - # subtract of mean of x for more accurate distance computations - if not sp.issparse(X): - X_mean = X.mean(axis=0) - # The copy was already done above - X -= X_mean - - if hasattr(init, '__array__'): - init -= X_mean - - # precompute squared norms of data points - x_squared_norms = row_norms(X, squared=True) - - best_labels, best_inertia, best_centers = None, None, None - if n_clusters == 1: - # elkan doesn't make sense for a single cluster, full will produce - # the right result. - algorithm = "full" - if algorithm == "auto": - algorithm = "full" if sp.issparse(X) else 'elkan' - if algorithm == "full": - kmeans_single = _kmeans_single_lloyd - elif algorithm == "elkan": - kmeans_single = _kmeans_single_elkan - else: - raise ValueError("Algorithm must be 'auto', 'full' or 'elkan', got" - " %s" % str(algorithm)) - - seeds = random_state.randint(np.iinfo(np.int32).max, size=n_init) - if effective_n_jobs(n_jobs) == 1: - # For a single thread, less memory is needed if we just store one set - # of the best results (as opposed to one set per run per thread). - for seed in seeds: - # run a k-means once - labels, inertia, centers, n_iter_ = kmeans_single( - X, sample_weight, n_clusters, max_iter=max_iter, init=init, - verbose=verbose, precompute_distances=precompute_distances, - tol=tol, x_squared_norms=x_squared_norms, - random_state=seed) - # determine if these results are the best so far - if best_inertia is None or inertia < best_inertia: - best_labels = labels.copy() - best_centers = centers.copy() - best_inertia = inertia - best_n_iter = n_iter_ - else: - # parallelisation of k-means runs - results = Parallel(n_jobs=n_jobs, verbose=0)( - delayed(kmeans_single)(X, sample_weight, n_clusters, - max_iter=max_iter, init=init, - verbose=verbose, tol=tol, - precompute_distances=precompute_distances, - x_squared_norms=x_squared_norms, - # Change seed to ensure variety - random_state=seed) - for seed in seeds) - # Get results with the lowest inertia - labels, inertia, centers, n_iters = zip(*results) - best = np.argmin(inertia) - best_labels = labels[best] - best_inertia = inertia[best] - best_centers = centers[best] - best_n_iter = n_iters[best] - - if not sp.issparse(X): - if not copy_x: - X += X_mean - best_centers += X_mean - - distinct_clusters = len(set(best_labels)) - if distinct_clusters < n_clusters: - warnings.warn("Number of distinct clusters ({}) found smaller than " - "n_clusters ({}). 
Possibly due to duplicate points " - "in X.".format(distinct_clusters, n_clusters), - ConvergenceWarning, stacklevel=2) + est = KMeans( + n_clusters=n_clusters, init=init, n_init=n_init, max_iter=max_iter, + verbose=verbose, precompute_distances=precompute_distances, tol=tol, + random_state=random_state, copy_x=copy_x, n_jobs=n_jobs, + algorithm=algorithm + ).fit(X, sample_weight=sample_weight) if return_n_iter: - return best_centers, best_labels, best_inertia, best_n_iter + return est.cluster_centers_, est.labels_, est.inertia_, est.n_iter_ else: - return best_centers, best_labels, best_inertia + return est.cluster_centers_, est.labels_, est.inertia_ def _kmeans_single_elkan(X, sample_weight, n_clusters, max_iter=300, @@ -953,15 +839,144 @@ def fit(self, X, y=None, sample_weight=None): """ random_state = check_random_state(self.random_state) - self.cluster_centers_, self.labels_, self.inertia_, self.n_iter_ = \ - k_means( - X, n_clusters=self.n_clusters, sample_weight=sample_weight, - init=self.init, n_init=self.n_init, - max_iter=self.max_iter, verbose=self.verbose, - precompute_distances=self.precompute_distances, - tol=self.tol, random_state=random_state, copy_x=self.copy_x, - n_jobs=self.n_jobs, algorithm=self.algorithm, - return_n_iter=True) + n_init = self.n_init + if n_init <= 0: + raise ValueError("Invalid number of initializations." + " n_init=%d must be bigger than zero." % n_init) + + if self.max_iter <= 0: + raise ValueError( + 'Number of iterations should be a positive number,' + ' got %d instead' % self.max_iter + ) + + # avoid forcing order when copy_x=False + order = "C" if self.copy_x else None + X = check_array(X, accept_sparse='csr', dtype=[np.float64, np.float32], + order=order, copy=self.copy_x) + # verify that the number of samples given is larger than k + if _num_samples(X) < self.n_clusters: + raise ValueError("n_samples=%d should be >= n_clusters=%d" % ( + _num_samples(X), self.n_clusters)) + + tol = _tolerance(X, self.tol) + + # If the distances are precomputed every job will create a matrix of + # shape (n_clusters, n_samples). To stop KMeans from eating up memory + # we only activate this if the created matrix is guaranteed to be + # under 100MB. 12 million entries consume a little under 100MB if they + # are of type double. 
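(A quick arithmetic check of the 100MB figure quoted in the comment above, purely for illustration.)

n_entries = 12e6            # threshold on n_clusters * n_samples
bytes_per_double = 8        # np.float64
print(n_entries * bytes_per_double / 1e6)   # 96.0 MB, just under 100MB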
+ precompute_distances = self.precompute_distances + if precompute_distances == 'auto': + n_samples = X.shape[0] + precompute_distances = (self.n_clusters * n_samples) < 12e6 + elif isinstance(precompute_distances, bool): + pass + else: + raise ValueError( + "precompute_distances should be 'auto' or True/False" + ", but a value of %r was passed" % + precompute_distances + ) + + # Validate init array + init = self.init + if hasattr(init, '__array__'): + init = check_array(init, dtype=X.dtype.type, copy=True) + _validate_center_shape(X, self.n_clusters, init) + + if n_init != 1: + warnings.warn( + 'Explicit initial center position passed: ' + 'performing only one init in k-means instead of n_init=%d' + % n_init, RuntimeWarning, stacklevel=2) + n_init = 1 + + # subtract of mean of x for more accurate distance computations + if not sp.issparse(X): + X_mean = X.mean(axis=0) + # The copy was already done above + X -= X_mean + + if hasattr(init, '__array__'): + init -= X_mean + + # precompute squared norms of data points + x_squared_norms = row_norms(X, squared=True) + + best_labels, best_inertia, best_centers = None, None, None + algorithm = self.algorithm + if self.n_clusters == 1: + # elkan doesn't make sense for a single cluster, full will produce + # the right result. + algorithm = "full" + if algorithm == "auto": + algorithm = "full" if sp.issparse(X) else 'elkan' + if algorithm == "full": + kmeans_single = _kmeans_single_lloyd + elif algorithm == "elkan": + kmeans_single = _kmeans_single_elkan + else: + raise ValueError("Algorithm must be 'auto', 'full' or 'elkan', got" + " %s" % str(algorithm)) + + seeds = random_state.randint(np.iinfo(np.int32).max, size=n_init) + if effective_n_jobs(self.n_jobs) == 1: + # For a single thread, less memory is needed if we just store one + # set of the best results (as opposed to one set per run per + # thread). + for seed in seeds: + # run a k-means once + labels, inertia, centers, n_iter_ = kmeans_single( + X, sample_weight, self.n_clusters, + max_iter=self.max_iter, init=init, verbose=self.verbose, + precompute_distances=precompute_distances, tol=tol, + x_squared_norms=x_squared_norms, random_state=seed) + # determine if these results are the best so far + if best_inertia is None or inertia < best_inertia: + best_labels = labels.copy() + best_centers = centers.copy() + best_inertia = inertia + best_n_iter = n_iter_ + else: + # parallelisation of k-means runs + results = Parallel(n_jobs=self.n_jobs, verbose=0)( + delayed(kmeans_single)( + X, sample_weight, self.n_clusters, + max_iter=self.max_iter, init=init, + verbose=self.verbose, tol=tol, + precompute_distances=precompute_distances, + x_squared_norms=x_squared_norms, + # Change seed to ensure variety + random_state=seed + ) + for seed in seeds) + # Get results with the lowest inertia + labels, inertia, centers, n_iters = zip(*results) + best = np.argmin(inertia) + best_labels = labels[best] + best_inertia = inertia[best] + best_centers = centers[best] + best_n_iter = n_iters[best] + + if not sp.issparse(X): + if not self.copy_x: + X += X_mean + best_centers += X_mean + + distinct_clusters = len(set(best_labels)) + if distinct_clusters < self.n_clusters: + warnings.warn( + "Number of distinct clusters ({}) found smaller than " + "n_clusters ({}). 
Possibly due to duplicate points " + "in X.".format(distinct_clusters, self.n_clusters), + ConvergenceWarning, stacklevel=2 + ) + + self.cluster_centers_ = best_centers + self.labels_ = best_labels + self.inertia_ = best_inertia + self.n_iter_ = best_n_iter return self def fit_predict(self, X, y=None, sample_weight=None): From c289e2716bcaffc04d88f88228426734f386d418 Mon Sep 17 00:00:00 2001 From: Guillaume Lemaitre Date: Mon, 16 Sep 2019 13:39:53 +0200 Subject: [PATCH 68/86] EHN update lobpcg from scipy master (#14971) --- sklearn/externals/_lobpcg.py | 422 ++++++++++++++++++++--------------- sklearn/utils/fixes.py | 6 +- 2 files changed, 247 insertions(+), 181 deletions(-) diff --git a/sklearn/externals/_lobpcg.py b/sklearn/externals/_lobpcg.py index 30492c97c182b..4e0d0ad19b753 100644 --- a/sklearn/externals/_lobpcg.py +++ b/sklearn/externals/_lobpcg.py @@ -21,9 +21,8 @@ """ from __future__ import division, print_function, absolute_import -import warnings import numpy as np -from scipy.linalg import (inv, eigh, cho_factor, cho_solve, cholesky, +from scipy.linalg import (inv, eigh, cho_factor, cho_solve, cholesky, orth, LinAlgError) from scipy.sparse.linalg import aslinearoperator @@ -31,6 +30,7 @@ def bmat(*args, **kwargs): + import warnings with warnings.catch_warnings(record=True): warnings.filterwarnings( 'ignore', '.*the matrix subclass is not the recommended way.*') @@ -42,19 +42,20 @@ def _save(ar, fileName): np.savetxt(fileName, ar) -def _report_nonhermitian(M, a, b, name): +def _report_nonhermitian(M, name): """ - Report if `M` is not a hermitian matrix given the tolerances `a`, `b`. + Report if `M` is not a hermitian matrix given its type. """ from scipy.linalg import norm md = M - M.T.conj() nmd = norm(md, 1) - tol = np.spacing(max(10**a, (10**b)*norm(M, 1))) + tol = 10 * np.finfo(M.dtype).eps + tol = max(tol, tol * norm(M, 1)) if nmd > tol: - print('matrix %s is not sufficiently Hermitian for a=%d, b=%d:' - % (name, a, b)) + print('matrix %s of the type %s is not sufficiently Hermitian:' + % (name, M.dtype)) print('condition: %.e < %e' % (nmd, tol)) @@ -88,29 +89,42 @@ def _makeOperator(operatorInput, expectedShape): def _applyConstraints(blockVectorV, factYBY, blockVectorBY, blockVectorY): """Changes blockVectorV in place.""" - gramYBV = np.dot(blockVectorBY.T.conj(), blockVectorV) - tmp = cho_solve(factYBY, gramYBV) + YBV = np.dot(blockVectorBY.T.conj(), blockVectorV) + tmp = cho_solve(factYBY, YBV) blockVectorV -= np.dot(blockVectorY, tmp) def _b_orthonormalize(B, blockVectorV, blockVectorBV=None, retInvR=False): + """B-orthonormalize the given block vector using Cholesky.""" + normalization = blockVectorV.max(axis=0)+np.finfo(blockVectorV.dtype).eps + blockVectorV = blockVectorV / normalization if blockVectorBV is None: if B is not None: blockVectorBV = B(blockVectorV) else: blockVectorBV = blockVectorV # Shared data!!! - gramVBV = np.dot(blockVectorV.T.conj(), blockVectorBV) - gramVBV = cholesky(gramVBV) - gramVBV = inv(gramVBV, overwrite_a=True) - # gramVBV is now R^{-1}. - blockVectorV = np.dot(blockVectorV, gramVBV) - if B is not None: - blockVectorBV = np.dot(blockVectorBV, gramVBV) else: + blockVectorBV = blockVectorBV / normalization + VBV = np.matmul(blockVectorV.T.conj(), blockVectorBV) + try: + # VBV is a Cholesky factor from now on... 
+ VBV = cholesky(VBV, overwrite_a=True) + VBV = inv(VBV, overwrite_a=True) + blockVectorV = np.matmul(blockVectorV, VBV) + # blockVectorV = (cho_solve((VBV.T, True), blockVectorV.T)).T + if B is not None: + blockVectorBV = np.matmul(blockVectorBV, VBV) + # blockVectorBV = (cho_solve((VBV.T, True), blockVectorBV.T)).T + else: + blockVectorBV = None + except LinAlgError: + # raise ValueError('Cholesky has failed') + blockVectorV = None blockVectorBV = None + VBV = None if retInvR: - return blockVectorV, blockVectorBV, gramVBV + return blockVectorV, blockVectorBV, VBV, normalization else: return blockVectorV, blockVectorBV @@ -141,113 +155,65 @@ def lobpcg(A, X, A : {sparse matrix, dense matrix, LinearOperator} The symmetric linear operator of the problem, usually a sparse matrix. Often called the "stiffness matrix". - X : array_like - Initial approximation to the k eigenvectors. If A has - shape=(n,n) then X should have shape shape=(n,k). + X : ndarray, float32 or float64 + Initial approximation to the ``k`` eigenvectors (non-sparse). If `A` + has ``shape=(n,n)`` then `X` should have shape ``shape=(n,k)``. B : {dense matrix, sparse matrix, LinearOperator}, optional - the right hand side operator in a generalized eigenproblem. - by default, B = Identity - often called the "mass matrix" + The right hand side operator in a generalized eigenproblem. + By default, ``B = Identity``. Often called the "mass matrix". M : {dense matrix, sparse matrix, LinearOperator}, optional - preconditioner to A; by default M = Identity - M should approximate the inverse of A - Y : array_like, optional - n-by-sizeY matrix of constraints, sizeY < n + Preconditioner to `A`; by default ``M = Identity``. + `M` should approximate the inverse of `A`. + Y : ndarray, float32 or float64, optional + n-by-sizeY matrix of constraints (non-sparse), sizeY < n The iterations will be performed in the B-orthogonal complement of the column-space of Y. Y must be full rank. tol : scalar, optional - Solver tolerance (stopping criterion) - by default: tol=n*sqrt(eps) - maxiter : integer, optional - maximum number of iterations - by default: maxiter=min(n,20) + Solver tolerance (stopping criterion). + The default is ``tol=n*sqrt(eps)``. + maxiter : int, optional + Maximum number of iterations. The default is ``maxiter=min(n, 20)``. largest : bool, optional - when True, solve for the largest eigenvalues, otherwise the smallest - verbosityLevel : integer, optional - controls solver output. default: verbosityLevel = 0. - retLambdaHistory : boolean, optional - whether to return eigenvalue history - retResidualNormsHistory : boolean, optional - whether to return history of residual norms + When True, solve for the largest eigenvalues, otherwise the smallest. + verbosityLevel : int, optional + Controls solver output. The default is ``verbosityLevel=0``. + retLambdaHistory : bool, optional + Whether to return eigenvalue history. Default is False. + retResidualNormsHistory : bool, optional + Whether to return history of residual norms. Default is False. Returns ------- - w : array - Array of k eigenvalues - v : array - An array of k eigenvectors. V has the same shape as X. - lambdas : list of arrays, optional + w : ndarray + Array of ``k`` eigenvalues + v : ndarray + An array of ``k`` eigenvectors. `v` has the same shape as `X`. + lambdas : list of ndarray, optional The eigenvalue history, if `retLambdaHistory` is True. 
- rnorms : list of arrays, optional + rnorms : list of ndarray, optional The history of residual norms, if `retResidualNormsHistory` is True. - Examples - -------- - - Solve A x = lambda B x with constraints and preconditioning. - - >>> from scipy.sparse import spdiags, issparse - >>> from scipy.sparse.linalg import lobpcg, LinearOperator - >>> n = 100 - >>> vals = [np.arange(n, dtype=np.float64) + 1] - >>> A = spdiags(vals, 0, n, n) - >>> A.toarray() - array([[ 1., 0., 0., ..., 0., 0., 0.], - [ 0., 2., 0., ..., 0., 0., 0.], - [ 0., 0., 3., ..., 0., 0., 0.], - ..., - [ 0., 0., 0., ..., 98., 0., 0.], - [ 0., 0., 0., ..., 0., 99., 0.], - [ 0., 0., 0., ..., 0., 0., 100.]]) - - Constraints. - - >>> Y = np.eye(n, 3) - - Initial guess for eigenvectors, should have linearly independent - columns. Column dimension = number of requested eigenvalues. - - >>> X = np.random.rand(n, 3) - - Preconditioner -- inverse of A (as an abstract linear operator). - - >>> invA = spdiags([1./vals[0]], 0, n, n) - >>> def precond( x ): - ... return invA * x - >>> M = LinearOperator(matvec=precond, shape=(n, n), dtype=float) - - Here, ``invA`` could of course have been used directly as a preconditioner. - Let us then solve the problem: - - >>> eigs, vecs = lobpcg(A, X, Y=Y, M=M, largest=False) - >>> eigs - array([4., 5., 6.]) - - Note that the vectors passed in Y are the eigenvectors of the 3 smallest - eigenvalues. The results returned are orthogonal to those. - Notes ----- - If both retLambdaHistory and retResidualNormsHistory are True, + If both ``retLambdaHistory`` and ``retResidualNormsHistory`` are True, the return tuple has the following format - (lambda, V, lambda history, residual norms history). + ``(lambda, V, lambda history, residual norms history)``. In the following ``n`` denotes the matrix size and ``m`` the number of required eigenvalues (smallest or largest). - The LOBPCG code internally solves eigenproblems of the size 3``m`` on every + The LOBPCG code internally solves eigenproblems of the size ``3m`` on every iteration by calling the "standard" dense eigensolver, so if ``m`` is not small enough compared to ``n``, it does not make sense to call the LOBPCG - code, but rather one should use the "standard" eigensolver, - e.g. numpy or scipy function in this case. - If one calls the LOBPCG algorithm for 5``m``>``n``, - it will most likely break internally, so the code tries to call - the standard function instead. - - It is not that n should be large for the LOBPCG to work, but rather the - ratio ``n``/``m`` should be large. It you call LOBPCG with ``m``=1 - and ``n``=10, it works though ``n`` is small. The method is intended - for extremely large ``n``/``m``, see e.g., reference [28] in + code, but rather one should use the "standard" eigensolver, e.g. numpy or + scipy function in this case. + If one calls the LOBPCG algorithm for ``5m > n``, it will most likely break + internally, so the code tries to call the standard function instead. + + It is not that ``n`` should be large for the LOBPCG to work, but rather the + ratio ``n / m`` should be large. It you call LOBPCG with ``m=1`` + and ``n=10``, it works though ``n`` is small. The method is intended + for extremely large ``n / m``, see e.g., reference [28] in https://arxiv.org/abs/0705.2626 The convergence speed depends basically on two factors: @@ -260,13 +226,7 @@ def lobpcg(A, X, directory) is ill-conditioned for large ``n``, so convergence will be slow, unless efficient preconditioning is used. 
For this specific problem, a good simple preconditioner function would be a linear solve - for A, which is easy to code since A is tridiagonal. - - *Acknowledgements* - - lobpcg.py code was written by Robert Cimrman. - Many thanks belong to Andrew Knyazev, the author of the algorithm, - for lots of advice and support. + for `A`, which is easy to code since A is tridiagonal. References ---------- @@ -282,6 +242,62 @@ def lobpcg(A, X, .. [3] A. V. Knyazev's C and MATLAB implementations: https://bitbucket.org/joseroman/blopex + + Examples + -------- + + Solve ``A x = lambda x`` with constraints and preconditioning. + + >>> import numpy as np + >>> from scipy.sparse import spdiags, issparse + >>> from scipy.sparse.linalg import lobpcg, LinearOperator + >>> n = 100 + >>> vals = np.arange(1, n + 1) + >>> A = spdiags(vals, 0, n, n) + >>> A.toarray() + array([[ 1., 0., 0., ..., 0., 0., 0.], + [ 0., 2., 0., ..., 0., 0., 0.], + [ 0., 0., 3., ..., 0., 0., 0.], + ..., + [ 0., 0., 0., ..., 98., 0., 0.], + [ 0., 0., 0., ..., 0., 99., 0.], + [ 0., 0., 0., ..., 0., 0., 100.]]) + + Constraints: + + >>> Y = np.eye(n, 3) + + Initial guess for eigenvectors, should have linearly independent + columns. Column dimension = number of requested eigenvalues. + + >>> X = np.random.rand(n, 3) + + Preconditioner in the inverse of A in this example: + + >>> invA = spdiags([1./vals], 0, n, n) + + The preconditiner must be defined by a function: + + >>> def precond( x ): + ... return invA @ x + + The argument x of the preconditioner function is a matrix inside `lobpcg`, + thus the use of matrix-matrix product ``@``. + + The preconditioner function is passed to lobpcg as a `LinearOperator`: + + >>> M = LinearOperator(matvec=precond, matmat=precond, + ... shape=(n, n), dtype=float) + + Let us now solve the eigenvalue problem for the matrix A: + + >>> eigenvalues, _ = lobpcg(A, X, Y=Y, M=M, largest=False) + >>> eigenvalues + array([4., 5., 6.]) + + Note that the vectors passed in Y are the eigenvectors of the 3 smallest + eigenvalues. The results returned are orthogonal to those. + """ blockVectorX = X blockVectorY = Y @@ -411,6 +427,8 @@ def lobpcg(A, X, blockVectorBP = None iterationNumber = -1 + restart = True + explicitGramFlag = False while iterationNumber < maxIterations: iterationNumber += 1 if verbosityLevel > 0: @@ -418,13 +436,12 @@ def lobpcg(A, X, if B is not None: aux = blockVectorBX * _lambda[np.newaxis, :] - else: aux = blockVectorX * _lambda[np.newaxis, :] blockVectorR = blockVectorAX - aux - aux = np.sum(blockVectorR.conjugate() * blockVectorR, 0) + aux = np.sum(blockVectorR.conj() * blockVectorR, 0) residualNorms = np.sqrt(aux) residualNormsHistory.append(residualNorms) @@ -468,8 +485,20 @@ def lobpcg(A, X, gramYBY, blockVectorBY, blockVectorY) ## - # B-orthonormalize the preconditioned residuals. + # B-orthogonalize the preconditioned residuals to X. + if B is not None: + activeBlockVectorR = activeBlockVectorR - \ + np.matmul(blockVectorX, + np.matmul(blockVectorBX.T.conj(), + activeBlockVectorR)) + else: + activeBlockVectorR = activeBlockVectorR - \ + np.matmul(blockVectorX, + np.matmul(blockVectorX.T.conj(), + activeBlockVectorR)) + ## + # B-orthonormalize the preconditioned residuals. 
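(The step named in the comment above, B-orthonormalization via Cholesky, mirrors what ``_b_orthonormalize`` does earlier in this patch. A standalone NumPy sketch, using a random symmetric positive definite ``B`` and omitting the normalization and error handling of the real helper:)

import numpy as np
from scipy.linalg import cholesky, inv

rng = np.random.RandomState(0)
n, k = 20, 3
M = rng.rand(n, n)
B = M @ M.T + n * np.eye(n)     # symmetric positive definite
V = rng.rand(n, k)              # block of vectors to B-orthonormalize

VBV = V.T @ (B @ V)
R = cholesky(VBV)               # upper triangular, VBV == R.T @ R
V = V @ inv(R)
assert np.allclose(V.T @ B @ V, np.eye(k))   # columns are now B-orthonormal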
aux = _b_orthonormalize(B, activeBlockVectorR) activeBlockVectorR, activeBlockVectorBR = aux @@ -479,80 +508,112 @@ def lobpcg(A, X, if B is not None: aux = _b_orthonormalize(B, activeBlockVectorP, activeBlockVectorBP, retInvR=True) - activeBlockVectorP, activeBlockVectorBP, invR = aux - activeBlockVectorAP = np.dot(activeBlockVectorAP, invR) - + activeBlockVectorP, activeBlockVectorBP, invR, normal = aux else: aux = _b_orthonormalize(B, activeBlockVectorP, retInvR=True) - activeBlockVectorP, _, invR = aux + activeBlockVectorP, _, invR, normal = aux + # Function _b_orthonormalize returns None if Cholesky fails + if activeBlockVectorP is not None: + activeBlockVectorAP = activeBlockVectorAP / normal activeBlockVectorAP = np.dot(activeBlockVectorAP, invR) + restart = False + else: + restart = True ## # Perform the Rayleigh Ritz Procedure: # Compute symmetric Gram matrices: - if B is not None: - xaw = np.dot(blockVectorX.T.conj(), activeBlockVectorAR) - waw = np.dot(activeBlockVectorR.T.conj(), activeBlockVectorAR) - xbw = np.dot(blockVectorX.T.conj(), activeBlockVectorBR) - - if iterationNumber > 0: - xap = np.dot(blockVectorX.T.conj(), activeBlockVectorAP) - wap = np.dot(activeBlockVectorR.T.conj(), activeBlockVectorAP) - pap = np.dot(activeBlockVectorP.T.conj(), activeBlockVectorAP) - xbp = np.dot(blockVectorX.T.conj(), activeBlockVectorBP) - wbp = np.dot(activeBlockVectorR.T.conj(), activeBlockVectorBP) - - gramA = bmat([[np.diag(_lambda), xaw, xap], - [xaw.T.conj(), waw, wap], - [xap.T.conj(), wap.T.conj(), pap]]) - - gramB = bmat([[ident0, xbw, xbp], - [xbw.T.conj(), ident, wbp], - [xbp.T.conj(), wbp.T.conj(), ident]]) - else: - gramA = bmat([[np.diag(_lambda), xaw], - [xaw.T.conj(), waw]]) - gramB = bmat([[ident0, xbw], - [xbw.T.conj(), ident]]) - + if activeBlockVectorAR.dtype == 'float32': + myeps = 1 + elif activeBlockVectorR.dtype == 'float32': + myeps = 1e-4 else: - xaw = np.dot(blockVectorX.T.conj(), activeBlockVectorAR) - waw = np.dot(activeBlockVectorR.T.conj(), activeBlockVectorAR) - xbw = np.dot(blockVectorX.T.conj(), activeBlockVectorR) - - if iterationNumber > 0: - xap = np.dot(blockVectorX.T.conj(), activeBlockVectorAP) - wap = np.dot(activeBlockVectorR.T.conj(), activeBlockVectorAP) - pap = np.dot(activeBlockVectorP.T.conj(), activeBlockVectorAP) - xbp = np.dot(blockVectorX.T.conj(), activeBlockVectorP) - wbp = np.dot(activeBlockVectorR.T.conj(), activeBlockVectorP) - - gramA = bmat([[np.diag(_lambda), xaw, xap], - [xaw.T.conj(), waw, wap], - [xap.T.conj(), wap.T.conj(), pap]]) - - gramB = bmat([[ident0, xbw, xbp], - [xbw.T.conj(), ident, wbp], - [xbp.T.conj(), wbp.T.conj(), ident]]) - else: - gramA = bmat([[np.diag(_lambda), xaw], - [xaw.T.conj(), waw]]) - gramB = bmat([[ident0, xbw], - [xbw.T.conj(), ident]]) + myeps = 1e-8 - if verbosityLevel > 0: - _report_nonhermitian(gramA, 3, -1, 'gramA') - _report_nonhermitian(gramB, 3, -1, 'gramB') + if residualNorms.max() > myeps and not explicitGramFlag: + explicitGramFlag = False + else: + # Once explicitGramFlag, forever explicitGramFlag. 
+ explicitGramFlag = True - if verbosityLevel > 10: - _save(gramA, 'gramA') - _save(gramB, 'gramB') + # Shared memory assingments to simplify the code + if B is None: + blockVectorBX = blockVectorX + activeBlockVectorBR = activeBlockVectorR + if not restart: + activeBlockVectorBP = activeBlockVectorP + + # Common submatrices: + gramXAR = np.dot(blockVectorX.T.conj(), activeBlockVectorAR) + gramRAR = np.dot(activeBlockVectorR.T.conj(), activeBlockVectorAR) + + if explicitGramFlag: + gramRAR = (gramRAR + gramRAR.T.conj())/2 + gramXAX = np.dot(blockVectorX.T.conj(), blockVectorAX) + gramXAX = (gramXAX + gramXAX.T.conj())/2 + gramXBX = np.dot(blockVectorX.T.conj(), blockVectorBX) + gramRBR = np.dot(activeBlockVectorR.T.conj(), activeBlockVectorBR) + gramXBR = np.dot(blockVectorX.T.conj(), activeBlockVectorBR) + else: + gramXAX = np.diag(_lambda) + gramXBX = ident0 + gramRBR = ident + gramXBR = np.zeros((sizeX, currentBlockSize), dtype=A.dtype) + + def _handle_gramA_gramB_verbosity(gramA, gramB): + if verbosityLevel > 0: + _report_nonhermitian(gramA, 'gramA') + _report_nonhermitian(gramB, 'gramB') + if verbosityLevel > 10: + # Note: not documented, but leave it in here for now + np.savetxt('gramA.txt', gramA) + np.savetxt('gramB.txt', gramB) + + if not restart: + gramXAP = np.dot(blockVectorX.T.conj(), activeBlockVectorAP) + gramRAP = np.dot(activeBlockVectorR.T.conj(), activeBlockVectorAP) + gramPAP = np.dot(activeBlockVectorP.T.conj(), activeBlockVectorAP) + gramXBP = np.dot(blockVectorX.T.conj(), activeBlockVectorBP) + gramRBP = np.dot(activeBlockVectorR.T.conj(), activeBlockVectorBP) + if explicitGramFlag: + gramPAP = (gramPAP + gramPAP.T.conj())/2 + gramPBP = np.dot(activeBlockVectorP.T.conj(), + activeBlockVectorBP) + else: + gramPBP = ident + + gramA = bmat([[gramXAX, gramXAR, gramXAP], + [gramXAR.T.conj(), gramRAR, gramRAP], + [gramXAP.T.conj(), gramRAP.T.conj(), gramPAP]]) + gramB = bmat([[gramXBX, gramXBR, gramXBP], + [gramXBR.T.conj(), gramRBR, gramRBP], + [gramXBP.T.conj(), gramRBP.T.conj(), gramPBP]]) + + _handle_gramA_gramB_verbosity(gramA, gramB) + + try: + _lambda, eigBlockVector = eigh(gramA, gramB, + check_finite=False) + except LinAlgError: + # try again after dropping the direction vectors P from RR + restart = True + + if restart: + gramA = bmat([[gramXAX, gramXAR], + [gramXAR.T.conj(), gramRAR]]) + gramB = bmat([[gramXBX, gramXBR], + [gramXBR.T.conj(), gramRBR]]) + + _handle_gramA_gramB_verbosity(gramA, gramB) + + try: + _lambda, eigBlockVector = eigh(gramA, gramB, + check_finite=False) + except LinAlgError: + raise ValueError('eigh has failed in lobpcg iterations') - # Solve the generalized eigenvalue problem. - _lambda, eigBlockVector = eigh(gramA, gramB, check_finite=False) ii = _get_indx(_lambda, sizeX, largest) - if verbosityLevel > 10: print(ii) print(_lambda) @@ -565,7 +626,7 @@ def lobpcg(A, X, if verbosityLevel > 10: print('lambda:', _lambda) # # Normalize eigenvectors! -# aux = np.sum( eigBlockVector.conjugate() * eigBlockVector, 0 ) +# aux = np.sum( eigBlockVector.conj() * eigBlockVector, 0 ) # eigVecNorms = np.sqrt( aux ) # eigBlockVector = eigBlockVector / eigVecNorms[np.newaxis, :] # eigBlockVector, aux = _b_orthonormalize( B, eigBlockVector ) @@ -575,7 +636,7 @@ def lobpcg(A, X, # Compute Ritz vectors. 
if B is not None: - if iterationNumber > 0: + if not restart: eigBlockVectorX = eigBlockVector[:sizeX] eigBlockVectorR = eigBlockVector[sizeX:sizeX+currentBlockSize] eigBlockVectorP = eigBlockVector[sizeX+currentBlockSize:] @@ -608,7 +669,7 @@ def lobpcg(A, X, blockVectorP, blockVectorAP, blockVectorBP = pp, app, bpp else: - if iterationNumber > 0: + if not restart: eigBlockVectorX = eigBlockVector[:sizeX] eigBlockVectorR = eigBlockVector[sizeX:sizeX+currentBlockSize] eigBlockVectorP = eigBlockVector[sizeX+currentBlockSize:] @@ -642,9 +703,14 @@ def lobpcg(A, X, blockVectorR = blockVectorAX - aux - aux = np.sum(blockVectorR.conjugate() * blockVectorR, 0) + aux = np.sum(blockVectorR.conj() * blockVectorR, 0) residualNorms = np.sqrt(aux) + # Future work: Need to add Postprocessing here: + # Making sure eigenvectors "exactly" satisfy the blockVectorY constrains? + # Making sure eigenvecotrs are "exactly" othonormalized by final "exact" RR + # Computing the actual true residuals + if verbosityLevel > 0: print('final eigenvalue:', _lambda) print('final residual norms:', residualNorms) diff --git a/sklearn/utils/fixes.py b/sklearn/utils/fixes.py index 83ace7d2e76c6..1d7d28a72c2e3 100644 --- a/sklearn/utils/fixes.py +++ b/sklearn/utils/fixes.py @@ -38,11 +38,11 @@ def _parse_version(version_string): except ImportError: from scipy.misc import comb, logsumexp # noqa -if sp_version >= (1, 3): +if sp_version >= (1, 4): from scipy.sparse.linalg import lobpcg else: - # Backport of lobpcg functionality from scipy 1.3.0, can be removed - # once support for sp_version < (1, 3) is dropped + # Backport of lobpcg functionality from scipy 1.4.0, can be removed + # once support for sp_version < (1, 4) is dropped from ..externals._lobpcg import lobpcg # noqa if sp_version >= (1, 3): From 27bfcc8d495c6f296756e5a4bcc6665e14627d14 Mon Sep 17 00:00:00 2001 From: DrGFreeman Date: Mon, 16 Sep 2019 07:58:02 -0400 Subject: [PATCH 69/86] FIX implement repr for RepeatedKFold and RepeatedStratifiedKFold (#14983) --- sklearn/model_selection/_split.py | 5 +++++ sklearn/model_selection/tests/test_split.py | 11 +++++++++++ 2 files changed, 16 insertions(+) diff --git a/sklearn/model_selection/_split.py b/sklearn/model_selection/_split.py index c49a3ce6aea4e..ceee1a08146ab 100644 --- a/sklearn/model_selection/_split.py +++ b/sklearn/model_selection/_split.py @@ -1163,6 +1163,9 @@ def get_n_splits(self, X=None, y=None, groups=None): **self.cvargs) return cv.get_n_splits(X, y, groups) * self.n_repeats + def __repr__(self): + return _build_repr(self) + class RepeatedKFold(_RepeatedSplits): """Repeated K-Fold cross validator. 
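(With this patch the repeated splitters gain a parameter-based ``repr``. A minimal illustration, with the expected output matching the string asserted by the test added further down:)

from sklearn.model_selection import RepeatedKFold

cv = RepeatedKFold(n_splits=2, n_repeats=6)
print(repr(cv))
# RepeatedKFold(n_repeats=6, n_splits=2, random_state=None)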
@@ -2158,6 +2161,8 @@ def _build_repr(self): try: with warnings.catch_warnings(record=True) as w: value = getattr(self, key, None) + if value is None and hasattr(self, 'cvargs'): + value = self.cvargs.get(key, None) if len(w) and w[0].category == DeprecationWarning: # if the parameter is deprecated, don't show it continue diff --git a/sklearn/model_selection/tests/test_split.py b/sklearn/model_selection/tests/test_split.py index a38fb435c9db4..4aa47a753bb1d 100644 --- a/sklearn/model_selection/tests/test_split.py +++ b/sklearn/model_selection/tests/test_split.py @@ -980,6 +980,17 @@ def test_repeated_cv_value_errors(): assert_raises(ValueError, cv, n_repeats=1.5) +@pytest.mark.parametrize( + "RepeatedCV", [RepeatedKFold, RepeatedStratifiedKFold] +) +def test_repeated_cv_repr(RepeatedCV): + n_splits, n_repeats = 2, 6 + repeated_cv = RepeatedCV(n_splits=n_splits, n_repeats=n_repeats) + repeated_cv_repr = ('{}(n_repeats=6, n_splits=2, random_state=None)' + .format(repeated_cv.__class__.__name__)) + assert repeated_cv_repr == repr(repeated_cv) + + def test_repeated_kfold_determinstic_split(): X = [[1, 2], [3, 4], [5, 6], [7, 8], [9, 10]] random_state = 258173307 From a846badbd18d153e616b13346a2ad2f48005f9b8 Mon Sep 17 00:00:00 2001 From: Guillaume Lemaitre Date: Mon, 16 Sep 2019 14:24:30 +0200 Subject: [PATCH 70/86] address comments from Nicolas --- sklearn/inspection/partial_dependence.py | 8 +++++--- sklearn/inspection/tests/test_partial_dependence.py | 4 ++-- 2 files changed, 7 insertions(+), 5 deletions(-) diff --git a/sklearn/inspection/partial_dependence.py b/sklearn/inspection/partial_dependence.py index f7cd230b7a637..ced6600c358e9 100644 --- a/sklearn/inspection/partial_dependence.py +++ b/sklearn/inspection/partial_dependence.py @@ -361,10 +361,12 @@ def partial_dependence(estimator, X, features, response_method='auto', if _determine_key_type(features) == 'int': raise_err = False - if isinstance(features, Iterable): + if (isinstance(features, Iterable) or + isinstance(features, numbers.Integral)): + # _get_column_indices() support negative indexing. Here, we limit + # the indexing to be positive. 
The upper bound will be checked + # by _get_column_indices() raise_err = np.all(np.less(features, 0)) - elif isinstance(features, numbers.Integral): - raise_err = features < 0 if raise_err: raise ValueError( diff --git a/sklearn/inspection/tests/test_partial_dependence.py b/sklearn/inspection/tests/test_partial_dependence.py index 36d63cf48e062..622cf0804e807 100644 --- a/sklearn/inspection/tests/test_partial_dependence.py +++ b/sklearn/inspection/tests/test_partial_dependence.py @@ -368,10 +368,10 @@ def test_partial_dependence_unknown_feature_string(estimator): df = pd.DataFrame(X) estimator.fit(df, y) - features = 'random' + features = ['random'] err_msg = 'A given column is not a column of the dataframe' with pytest.raises(ValueError, match=err_msg): - partial_dependence(estimator, df, [features]) + partial_dependence(estimator, df, features) @pytest.mark.parametrize( From 09e589904c13408264f433e383cab96b152f0a96 Mon Sep 17 00:00:00 2001 From: Guillaume Lemaitre Date: Fri, 20 Sep 2019 11:47:07 +0200 Subject: [PATCH 71/86] support indices in tuple in safe_indexing --- .../tests/test_plot_partial_dependence.py | 13 +++++++++++ sklearn/utils/__init__.py | 8 +++++-- sklearn/utils/tests/test_utils.py | 22 ++++++++++++++----- 3 files changed, 36 insertions(+), 7 deletions(-) diff --git a/sklearn/inspection/tests/test_plot_partial_dependence.py b/sklearn/inspection/tests/test_plot_partial_dependence.py index bc0568b058be4..29150168bdba5 100644 --- a/sklearn/inspection/tests/test_plot_partial_dependence.py +++ b/sklearn/inspection/tests/test_plot_partial_dependence.py @@ -303,6 +303,19 @@ def test_plot_partial_dependence_multioutput(pyplot, target): assert ax.get_xlabel() == "{}".format(i) +def test_plot_partial_dependence_dataframe(pyplot, clf_boston, boston): + pd = pytest.importorskip('pandas') + df = pd.DataFrame(boston.data, columns=boston.feature_names) + y = boston.target + + grid_resolution = 25 + + plot_partial_dependence( + clf_boston, df, ['TAX', 'AGE'], grid_resolution=grid_resolution, + feature_names=df.columns.tolist() + ) + + dummy_classification_data = make_classification(random_state=0) diff --git a/sklearn/utils/__init__.py b/sklearn/utils/__init__.py index c26fab41bfc93..22e968d3ea38c 100644 --- a/sklearn/utils/__init__.py +++ b/sklearn/utils/__init__.py @@ -188,6 +188,8 @@ def _array_indexing(array, key, key_dtype, axis): # check if we have an boolean array-likes to make the proper indexing if key_dtype == 'bool': key = np.asarray(key) + if isinstance(key, tuple): + key = list(key) return array[key] if axis == 0 else array[:, key] @@ -198,6 +200,8 @@ def _pandas_indexing(X, key, key_dtype, axis): # FIXME: solved in pandas 0.25 key = np.asarray(key) key = key if key.flags.writeable else key.copy() + elif isinstance(key, tuple): + key = list(key) # check whether we should index with loc or iloc indexer = X.iloc if key_dtype == 'int' else X.loc return indexer[:, key] if axis else indexer[key] @@ -254,7 +258,7 @@ def _determine_key_type(key): if key_start_type is not None: return key_start_type return key_stop_type - if isinstance(key, list): + if isinstance(key, (list, tuple)): unique_key = set(key) key_type = {_determine_key_type(elt) for elt in unique_key} if not key_type: @@ -352,7 +356,7 @@ def _get_column_indices(X, key): key_dtype = _determine_key_type(key) - if isinstance(key, list) and not key: + if isinstance(key, (list, tuple)) and not key: # we get an empty list return [] elif key_dtype in ('bool', 'int'): diff --git a/sklearn/utils/tests/test_utils.py 
b/sklearn/utils/tests/test_utils.py index 363b77a44b5fc..b96702985aa1b 100644 --- a/sklearn/utils/tests/test_utils.py +++ b/sklearn/utils/tests/test_utils.py @@ -205,15 +205,19 @@ def test_column_or_1d(): (np.bool_(True), 'bool'), ([0, 1, 2], 'int'), (['0', '1', '2'], 'str'), + ((0, 1, 2), 'int'), + (('0', '1', '2'), 'str'), (slice(None, None), None), (slice(0, 2), 'int'), (np.array([0, 1, 2], dtype=np.int32), 'int'), (np.array([0, 1, 2], dtype=np.int64), 'int'), (np.array([0, 1, 2], dtype=np.uint8), 'int'), ([True, False], 'bool'), + ((True, False), 'bool'), (np.array([True, False]), 'bool'), ('col_0', 'str'), (['col_0', 'col_1', 'col_2'], 'str'), + (('col_0', 'col_1', 'col_2'), 'str'), (slice('begin', 'end'), 'str'), (np.array(['col_0', 'col_1', 'col_2']), 'str'), (np.array(['col_0', 'col_1', 'col_2'], dtype=object), 'str')] @@ -230,6 +234,8 @@ def test_determine_key_type_error(): def _convert_container(container, constructor_name, columns_name=None): if constructor_name == 'list': return list(container) + elif constructor_name == 'tuple': + return tuple(container) elif constructor_name == 'array': return np.asarray(container) elif constructor_name == 'sparse': @@ -247,7 +253,9 @@ def _convert_container(container, constructor_name, columns_name=None): @pytest.mark.parametrize( "array_type", ["list", "array", "sparse", "dataframe"] ) -@pytest.mark.parametrize("indices_type", ["list", "array", "series", "slice"]) +@pytest.mark.parametrize( + "indices_type", ["list", "tuple", "array", "series", "slice"] +) def test_safe_indexing_2d_container_axis_0(array_type, indices_type): indices = [1, 2] if indices_type == 'slice' and isinstance(indices[1], int): @@ -261,7 +269,9 @@ def test_safe_indexing_2d_container_axis_0(array_type, indices_type): @pytest.mark.parametrize("array_type", ["list", "array", "series"]) -@pytest.mark.parametrize("indices_type", ["list", "array", "series", "slice"]) +@pytest.mark.parametrize( + "indices_type", ["list", "tuple", "array", "series", "slice"] +) def test_safe_indexing_1d_container(array_type, indices_type): indices = [1, 2] if indices_type == 'slice' and isinstance(indices[1], int): @@ -275,7 +285,9 @@ def test_safe_indexing_1d_container(array_type, indices_type): @pytest.mark.parametrize("array_type", ["array", "sparse", "dataframe"]) -@pytest.mark.parametrize("indices_type", ["list", "array", "series", "slice"]) +@pytest.mark.parametrize( + "indices_type", ["list", "tuple", "array", "series", "slice"] +) @pytest.mark.parametrize("indices", [[1, 2], ["col_1", "col_2"]]) def test_safe_indexing_2d_container_axis_1(array_type, indices_type, indices): # validation of the indices @@ -328,7 +340,7 @@ def test_safe_indexing_2d_read_only_axis_1(array_read_only, indices_read_only, @pytest.mark.parametrize("array_type", ["list", "array", "series"]) -@pytest.mark.parametrize("indices_type", ["list", "array", "series"]) +@pytest.mark.parametrize("indices_type", ["list", "tuple", "array", "series"]) def test_safe_indexing_1d_container_mask(array_type, indices_type): indices = [False] + [True] * 2 + [False] * 6 array = _convert_container([1, 2, 3, 4, 5, 6, 7, 8, 9], array_type) @@ -340,7 +352,7 @@ def test_safe_indexing_1d_container_mask(array_type, indices_type): @pytest.mark.parametrize("array_type", ["array", "sparse", "dataframe"]) -@pytest.mark.parametrize("indices_type", ["list", "array", "series"]) +@pytest.mark.parametrize("indices_type", ["list", "tuple", "array", "series"]) @pytest.mark.parametrize( "axis, expected_subset", [(0, [[4, 5, 6], [7, 8, 9]]), From 
56455ee8a5d48ced4e665a829ce74578bb0d40ce Mon Sep 17 00:00:00 2001 From: Guillaume Lemaitre Date: Mon, 23 Sep 2019 10:02:55 +0200 Subject: [PATCH 72/86] PEP8 --- sklearn/inspection/tests/test_plot_partial_dependence.py | 1 - 1 file changed, 1 deletion(-) diff --git a/sklearn/inspection/tests/test_plot_partial_dependence.py b/sklearn/inspection/tests/test_plot_partial_dependence.py index 29150168bdba5..1933063e40f79 100644 --- a/sklearn/inspection/tests/test_plot_partial_dependence.py +++ b/sklearn/inspection/tests/test_plot_partial_dependence.py @@ -306,7 +306,6 @@ def test_plot_partial_dependence_multioutput(pyplot, target): def test_plot_partial_dependence_dataframe(pyplot, clf_boston, boston): pd = pytest.importorskip('pandas') df = pd.DataFrame(boston.data, columns=boston.feature_names) - y = boston.target grid_resolution = 25 From c8c2a08c5e1a34aee9480fa4890c59f83ff5ac5a Mon Sep 17 00:00:00 2001 From: Guillaume Lemaitre Date: Mon, 21 Oct 2019 11:45:53 +0200 Subject: [PATCH 73/86] reviews --- sklearn/inspection/partial_dependence.py | 16 +++++++++------- .../inspection/tests/test_partial_dependence.py | 3 ++- 2 files changed, 11 insertions(+), 8 deletions(-) diff --git a/sklearn/inspection/partial_dependence.py b/sklearn/inspection/partial_dependence.py index d0a556d923f8b..039df5eb2dbe3 100644 --- a/sklearn/inspection/partial_dependence.py +++ b/sklearn/inspection/partial_dependence.py @@ -298,7 +298,10 @@ def partial_dependence(estimator, X, features, response_method='auto', ) if isinstance(estimator, Pipeline): + # assuming Pipeline si the only estimator that does not store a new + # attribute for est in estimator: + # FIXME: remove the None option when it will be deprecated if est not in (None, 'drop'): check_is_fitted(est) else: @@ -310,6 +313,8 @@ def partial_dependence(estimator, X, features, response_method='auto', 'Multiclass-multioutput estimators are not supported' ) + # Use check_array only on lists and other non-array-likes / sparse. Do not + # convert DataFrame into a NumPy array. if not(hasattr(X, '__array__') or sparse.issparse(X)): X = check_array(X, force_all_finite='allow-nan', dtype=np.object) @@ -363,18 +368,15 @@ def partial_dependence(estimator, X, features, response_method='auto', ) if _determine_key_type(features) == 'int': - raise_err = False if (isinstance(features, Iterable) or isinstance(features, numbers.Integral)): # _get_column_indices() support negative indexing. Here, we limit # the indexing to be positive. 
The upper bound will be checked # by _get_column_indices() - raise_err = np.all(np.less(features, 0)) - - if raise_err: - raise ValueError( - 'all features must be in [0, {}]'.format(X.shape[1] - 1) - ) + if np.any(np.less(features, 0)): + raise ValueError( + 'all features must be in [0, {}]'.format(X.shape[1] - 1) + ) features_indices = np.asarray( _get_column_indices(X, features), dtype=np.int32, order='C' diff --git a/sklearn/inspection/tests/test_partial_dependence.py b/sklearn/inspection/tests/test_partial_dependence.py index caef8b97ade07..3fe3bc2cdbe8e 100644 --- a/sklearn/inspection/tests/test_partial_dependence.py +++ b/sklearn/inspection/tests/test_partial_dependence.py @@ -474,7 +474,8 @@ def test_partial_dependence_pipeline(): ids=['features-integer', 'features-string'] ) def test_partial_dependence_dataframe(estimator, preprocessor, features): - # check that the partial dependence support dataframe + # check that the partial dependence support dataframe and pipeline + # including a column transformer pd = pytest.importorskip("pandas") df = pd.DataFrame(iris.data, columns=iris.feature_names) From 4d427aa91791fd2ce499c354a186cc2426ce4f01 Mon Sep 17 00:00:00 2001 From: Guillaume Lemaitre Date: Mon, 21 Oct 2019 12:08:25 +0200 Subject: [PATCH 74/86] safe_indexing is private --- sklearn/inspection/partial_dependence.py | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/sklearn/inspection/partial_dependence.py b/sklearn/inspection/partial_dependence.py index 039df5eb2dbe3..46be2970c3f7d 100644 --- a/sklearn/inspection/partial_dependence.py +++ b/sklearn/inspection/partial_dependence.py @@ -21,7 +21,7 @@ from ..utils.extmath import cartesian from ..utils import check_array from ..utils import check_matplotlib_support # noqa -from ..utils import safe_indexing +from ..utils import _safe_indexing from ..utils import _determine_key_type from ..utils import _get_column_indices from ..utils.validation import check_is_fitted @@ -79,14 +79,14 @@ def _grid_from_X(X, percentiles, grid_resolution): values = [] for feature in range(X.shape[1]): - uniques = np.unique(safe_indexing(X, feature, axis=1)) + uniques = np.unique(_safe_indexing(X, feature, axis=1)) if uniques.shape[0] < grid_resolution: # feature has low resolution use unique vals axis = uniques else: # create axis based on percentiles and grid resolution emp_percentiles = mquantiles( - safe_indexing(X, feature, axis=1), prob=percentiles, axis=0 + _safe_indexing(X, feature, axis=1), prob=percentiles, axis=0 ) if np.allclose(emp_percentiles[0], emp_percentiles[1]): raise ValueError( @@ -383,7 +383,7 @@ def partial_dependence(estimator, X, features, response_method='auto', ).ravel() grid, values = _grid_from_X( - safe_indexing(X, features_indices, axis=1), percentiles, + _safe_indexing(X, features_indices, axis=1), percentiles, grid_resolution ) From b7c684432a56ca3d6dbc256cec7bc9f06f6a8874 Mon Sep 17 00:00:00 2001 From: Guillaume Lemaitre Date: Wed, 23 Oct 2019 16:52:06 +0200 Subject: [PATCH 75/86] fix comments --- sklearn/inspection/_partial_dependence.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/sklearn/inspection/_partial_dependence.py b/sklearn/inspection/_partial_dependence.py index 46be2970c3f7d..fb568040dc45c 100644 --- a/sklearn/inspection/_partial_dependence.py +++ b/sklearn/inspection/_partial_dependence.py @@ -370,7 +370,7 @@ def partial_dependence(estimator, X, features, response_method='auto', if _determine_key_type(features) == 'int': if (isinstance(features, Iterable) or 
isinstance(features, numbers.Integral)): - # _get_column_indices() support negative indexing. Here, we limit + # _get_column_indices() supports negative indexing. Here, we limit # the indexing to be positive. The upper bound will be checked # by _get_column_indices() if np.any(np.less(features, 0)): From 2579ef71b957417fd55108e67f5385c97bbe70f3 Mon Sep 17 00:00:00 2001 From: Guillaume Lemaitre Date: Thu, 24 Oct 2019 14:20:47 +0200 Subject: [PATCH 76/86] iter --- sklearn/_build_utils/deprecated_modules.py | 2 -- sklearn/inspection/tests/test_permutation_importance.py | 1 - 2 files changed, 3 deletions(-) diff --git a/sklearn/_build_utils/deprecated_modules.py b/sklearn/_build_utils/deprecated_modules.py index cf2bbe2606558..3069aafae0f58 100644 --- a/sklearn/_build_utils/deprecated_modules.py +++ b/sklearn/_build_utils/deprecated_modules.py @@ -143,8 +143,6 @@ ('_partial_dependence', 'sklearn.inspection.partial_dependence', 'sklearn.inspection', 'partial_dependence'), - ('_permutation_importance', 'sklearn.inspection.permutation_importance', - 'sklearn.inspection', 'permutation_importance'), ('_ball_tree', 'sklearn.neighbors.ball_tree', 'sklearn.neighbors', 'BallTree'), diff --git a/sklearn/inspection/tests/test_permutation_importance.py b/sklearn/inspection/tests/test_permutation_importance.py index b444310695dee..671a1e11b1fec 100644 --- a/sklearn/inspection/tests/test_permutation_importance.py +++ b/sklearn/inspection/tests/test_permutation_importance.py @@ -18,7 +18,6 @@ from sklearn.preprocessing import StandardScaler from sklearn.preprocessing import scale - @pytest.mark.parametrize("n_jobs", [1, 2]) def test_permutation_importance_correlated_feature_regression(n_jobs): # Make sure that feature highly correlated to the target have a higher From a5777ad57b81a307c92878c4688e8a2b04c9e2dd Mon Sep 17 00:00:00 2001 From: Guillaume Lemaitre Date: Thu, 24 Oct 2019 18:44:12 +0200 Subject: [PATCH 77/86] reduce list of estimator to check for fitness --- sklearn/inspection/tests/test_partial_dependence.py | 13 +++---------- 1 file changed, 3 insertions(+), 10 deletions(-) diff --git a/sklearn/inspection/tests/test_partial_dependence.py b/sklearn/inspection/tests/test_partial_dependence.py index a2df49985516c..db8d29773404c 100644 --- a/sklearn/inspection/tests/test_partial_dependence.py +++ b/sklearn/inspection/tests/test_partial_dependence.py @@ -542,17 +542,10 @@ def test_partial_dependence_feature_type(features, expected_pd_shape): @pytest.mark.parametrize( - "name, Estimator", all_estimators(type_filter=['classifier', 'regressor']) + "estimator", [LinearRegression(), LogisticRegression(), + GradientBoostingRegressor(), GradientBoostingClassifier()] ) -def test_partial_dependence_unfitted(name, Estimator): - try: - estimator = Estimator() - except TypeError: - raise SkipTest( - 'The {} estimator cannot be built with default parameters' - .format(name) - ) - +def test_partial_dependence_unfitted(estimator): X = iris.data preprocessor = make_column_transformer( (StandardScaler(), [0, 2]), (RobustScaler(), [1, 3]) From 0aa3cd901ec0d6c42f6d6ca1373cf61b1a75da5b Mon Sep 17 00:00:00 2001 From: Guillaume Lemaitre Date: Thu, 24 Oct 2019 18:45:08 +0200 Subject: [PATCH 78/86] remove unused import --- sklearn/inspection/tests/test_partial_dependence.py | 1 - 1 file changed, 1 deletion(-) diff --git a/sklearn/inspection/tests/test_partial_dependence.py b/sklearn/inspection/tests/test_partial_dependence.py index db8d29773404c..2f8d9faf7bc17 100644 --- a/sklearn/inspection/tests/test_partial_dependence.py 
+++ b/sklearn/inspection/tests/test_partial_dependence.py @@ -33,7 +33,6 @@ from sklearn.dummy import DummyClassifier from sklearn.base import BaseEstimator, ClassifierMixin, clone from sklearn.exceptions import NotFittedError -from sklearn.utils.testing import all_estimators from sklearn.utils.testing import assert_allclose from sklearn.utils.testing import assert_array_equal from sklearn.utils.testing import SkipTest From dc56f7b0b780cb4da790704ca98445ab692fa664 Mon Sep 17 00:00:00 2001 From: Guillaume Lemaitre Date: Thu, 24 Oct 2019 18:47:01 +0200 Subject: [PATCH 79/86] fix --- sklearn/inspection/tests/test_partial_dependence.py | 1 - 1 file changed, 1 deletion(-) diff --git a/sklearn/inspection/tests/test_partial_dependence.py b/sklearn/inspection/tests/test_partial_dependence.py index 2f8d9faf7bc17..104451aa75457 100644 --- a/sklearn/inspection/tests/test_partial_dependence.py +++ b/sklearn/inspection/tests/test_partial_dependence.py @@ -35,7 +35,6 @@ from sklearn.exceptions import NotFittedError from sklearn.utils.testing import assert_allclose from sklearn.utils.testing import assert_array_equal -from sklearn.utils.testing import SkipTest # toy sample From e1de4a4572bc1d0b121198d89d6771f53fddec7b Mon Sep 17 00:00:00 2001 From: Guillaume Lemaitre Date: Thu, 24 Oct 2019 22:40:03 +0200 Subject: [PATCH 80/86] address thomas comments --- sklearn/inspection/_partial_dependence.py | 3 ++- sklearn/inspection/tests/test_partial_dependence.py | 7 +++++-- 2 files changed, 7 insertions(+), 3 deletions(-) diff --git a/sklearn/inspection/_partial_dependence.py b/sklearn/inspection/_partial_dependence.py index fb568040dc45c..9538c3b67521e 100644 --- a/sklearn/inspection/_partial_dependence.py +++ b/sklearn/inspection/_partial_dependence.py @@ -298,7 +298,8 @@ def partial_dependence(estimator, X, features, response_method='auto', ) if isinstance(estimator, Pipeline): - # assuming Pipeline si the only estimator that does not store a new + # TODO: to be removed if/when pipeline get a `steps_` attributes + # assuming Pipeline is the only estimator that does not store a new # attribute for est in estimator: # FIXME: remove the None option when it will be deprecated diff --git a/sklearn/inspection/tests/test_partial_dependence.py b/sklearn/inspection/tests/test_partial_dependence.py index 104451aa75457..b278c7d1e5401 100644 --- a/sklearn/inspection/tests/test_partial_dependence.py +++ b/sklearn/inspection/tests/test_partial_dependence.py @@ -463,8 +463,11 @@ def test_partial_dependence_pipeline(): [None, make_column_transformer( (StandardScaler(), [iris.feature_names[i] for i in (0, 2)]), - (RobustScaler(), [iris.feature_names[i] for i in (1, 3)]))], - ids=['None', 'column-transformer'] + (RobustScaler(), [iris.feature_names[i] for i in (1, 3)])), + make_column_transformer( + (StandardScaler(), [iris.feature_names[i] for i in (0, 2)]), + remainder='passthrough')], + ids=['None', 'column-transformer', 'column-transformer-passthrough'] ) @pytest.mark.parametrize( "features", From 53cdf4a35c2a8328ffce3c29bc1c75e8d9ec1bc3 Mon Sep 17 00:00:00 2001 From: Guillaume Lemaitre Date: Mon, 28 Oct 2019 15:57:33 +0100 Subject: [PATCH 81/86] remove support for slice --- sklearn/inspection/_partial_dependence.py | 28 +++++++++++++------ .../tests/test_partial_dependence.py | 19 +++++++++++-- 2 files changed, 36 insertions(+), 11 deletions(-) diff --git a/sklearn/inspection/_partial_dependence.py b/sklearn/inspection/_partial_dependence.py index fb568040dc45c..b5c2d20d52c00 100644 --- 
a/sklearn/inspection/_partial_dependence.py +++ b/sklearn/inspection/_partial_dependence.py @@ -367,16 +367,26 @@ def partial_dependence(estimator, X, features, response_method='auto', "'decision_function'. Got {}.".format(response_method) ) + if isinstance(features, slice): + err_msg = ("Unsupported type for the parameter 'features'. The " + "expected type is {}. Got {!r} instead.") + if hasattr(X, "iloc"): + specified_msg = ("a column name or an integer column indice or an " + "array-like containing column names or integer " + "column indices") + else: + specified_msg = ("an integer column indice or an array-like " + "containing integer column indices") + raise ValueError(err_msg.format(specified_msg, features)) + if _determine_key_type(features) == 'int': - if (isinstance(features, Iterable) or - isinstance(features, numbers.Integral)): - # _get_column_indices() supports negative indexing. Here, we limit - # the indexing to be positive. The upper bound will be checked - # by _get_column_indices() - if np.any(np.less(features, 0)): - raise ValueError( - 'all features must be in [0, {}]'.format(X.shape[1] - 1) - ) + # _get_column_indices() supports negative indexing. Here, we limit + # the indexing to be positive. The upper bound will be checked + # by _get_column_indices() + if np.any(np.less(features, 0)): + raise ValueError( + 'all features must be in [0, {}]'.format(X.shape[1] - 1) + ) features_indices = np.asarray( _get_column_indices(X, features), dtype=np.int32, order='C' diff --git a/sklearn/inspection/tests/test_partial_dependence.py b/sklearn/inspection/tests/test_partial_dependence.py index 104451aa75457..7d49f097254d3 100644 --- a/sklearn/inspection/tests/test_partial_dependence.py +++ b/sklearn/inspection/tests/test_partial_dependence.py @@ -341,6 +341,22 @@ def test_partial_dependence_error(estimator, params, err_msg): partial_dependence(estimator, X, **params) +@pytest.mark.parametrize( + "with_dataframe, err_msg", + [(True, "a column name or an integer column indice"), + (False, "an integer column indice or an array-like")] +) +def test_partial_dependence_slice_error(with_dataframe, err_msg): + X, y = make_classification(random_state=0) + if with_dataframe: + pd = pytest.importorskip('pandas') + X = pd.DataFrame(X) + estimator = LogisticRegression().fit(X, y) + + with pytest.raises(ValueError, match=err_msg): + partial_dependence(estimator, X, features=slice(0, 2, 1)) + + @pytest.mark.parametrize( 'estimator', [LinearRegression(), GradientBoostingClassifier(random_state=0)] @@ -515,9 +531,8 @@ def test_partial_dependence_dataframe(estimator, preprocessor, features): (iris.feature_names[0], (3, 10)), ([0, 2], (3, 10, 10)), ([iris.feature_names[i] for i in (0, 2)], (3, 10, 10)), - (slice(0, 2, 1), (3, 10, 10)), ([True, False, True, False], (3, 10, 10))], - ids=['scalar-int', 'scalar-str', 'list-int', 'list-str', 'slice', 'mask'] + ids=['scalar-int', 'scalar-str', 'list-int', 'list-str', 'mask'] ) def test_partial_dependence_feature_type(features, expected_pd_shape): # check all possible features type supported in PDP From fa9f04a44711a1037ddb3e545dcdff08fc6c2aca Mon Sep 17 00:00:00 2001 From: Guillaume Lemaitre Date: Wed, 30 Oct 2019 10:58:15 +0100 Subject: [PATCH 82/86] add accept_slice to _determine_key_dtype --- sklearn/inspection/_partial_dependence.py | 14 +------------- .../inspection/tests/test_partial_dependence.py | 6 +++--- sklearn/utils/__init__.py | 9 ++++++++- sklearn/utils/tests/test_utils.py | 5 +++++ 4 files changed, 17 insertions(+), 17 deletions(-) diff 
--git a/sklearn/inspection/_partial_dependence.py b/sklearn/inspection/_partial_dependence.py index 099af0a4b8d9f..90cd94ae8852c 100644 --- a/sklearn/inspection/_partial_dependence.py +++ b/sklearn/inspection/_partial_dependence.py @@ -368,19 +368,7 @@ def partial_dependence(estimator, X, features, response_method='auto', "'decision_function'. Got {}.".format(response_method) ) - if isinstance(features, slice): - err_msg = ("Unsupported type for the parameter 'features'. The " - "expected type is {}. Got {!r} instead.") - if hasattr(X, "iloc"): - specified_msg = ("a column name or an integer column indice or an " - "array-like containing column names or integer " - "column indices") - else: - specified_msg = ("an integer column indice or an array-like " - "containing integer column indices") - raise ValueError(err_msg.format(specified_msg, features)) - - if _determine_key_type(features) == 'int': + if _determine_key_type(features, accept_slice=False) == 'int': # _get_column_indices() supports negative indexing. Here, we limit # the indexing to be positive. The upper bound will be checked # by _get_column_indices() diff --git a/sklearn/inspection/tests/test_partial_dependence.py b/sklearn/inspection/tests/test_partial_dependence.py index 8646a1d97b645..8d3194f34249f 100644 --- a/sklearn/inspection/tests/test_partial_dependence.py +++ b/sklearn/inspection/tests/test_partial_dependence.py @@ -343,8 +343,8 @@ def test_partial_dependence_error(estimator, params, err_msg): @pytest.mark.parametrize( "with_dataframe, err_msg", - [(True, "a column name or an integer column indice"), - (False, "an integer column indice or an array-like")] + [(True, "Only array-like or scalar are supported"), + (False, "Only array-like or scalar are supported")] ) def test_partial_dependence_slice_error(with_dataframe, err_msg): X, y = make_classification(random_state=0) @@ -353,7 +353,7 @@ def test_partial_dependence_slice_error(with_dataframe, err_msg): X = pd.DataFrame(X) estimator = LogisticRegression().fit(X, y) - with pytest.raises(ValueError, match=err_msg): + with pytest.raises(TypeError, match=err_msg): partial_dependence(estimator, X, features=slice(0, 2, 1)) diff --git a/sklearn/utils/__init__.py b/sklearn/utils/__init__.py index d2d1bc3435add..923eba7a09065 100644 --- a/sklearn/utils/__init__.py +++ b/sklearn/utils/__init__.py @@ -223,13 +223,15 @@ def _list_indexing(X, key, key_dtype): return [X[idx] for idx in key] -def _determine_key_type(key): +def _determine_key_type(key, accept_slice=True): """Determine the data type of key. Parameters ---------- key : scalar, slice or array-like The key from which we want to infer the data type. + accept_slice : bool, default=True + Whether or not to raise an error if the key is a slice. Returns ------- @@ -252,6 +254,11 @@ def _determine_key_type(key): except KeyError: raise ValueError(err_msg) if isinstance(key, slice): + if not accept_slice: + raise TypeError( + 'Only array-like or scalar are supported. ' + 'A Python slice was given.' 
+ ) if key.start is None and key.stop is None: return None key_start_type = _determine_key_type(key.start) diff --git a/sklearn/utils/tests/test_utils.py b/sklearn/utils/tests/test_utils.py index 55ae8d373ea1f..2cf1e59a73f29 100644 --- a/sklearn/utils/tests/test_utils.py +++ b/sklearn/utils/tests/test_utils.py @@ -231,6 +231,11 @@ def test_determine_key_type_error(): _determine_key_type(1.0) +def test_determine_key_type_slice_error(): + with pytest.raises(TypeError, match="Only array-like or scalar are"): + _determine_key_type(slice(0, 2, 1), accept_slice=False) + + def _convert_container(container, constructor_name, columns_name=None): if constructor_name == 'list': return list(container) From f7f7096b9f8a74869c8f22b4a7b59ae5763de5bd Mon Sep 17 00:00:00 2001 From: Guillaume Lemaitre Date: Wed, 30 Oct 2019 16:55:25 +0100 Subject: [PATCH 83/86] docstring --- sklearn/utils/__init__.py | 1 + 1 file changed, 1 insertion(+) diff --git a/sklearn/utils/__init__.py b/sklearn/utils/__init__.py index 923eba7a09065..4d4ef606341ca 100644 --- a/sklearn/utils/__init__.py +++ b/sklearn/utils/__init__.py @@ -230,6 +230,7 @@ def _determine_key_type(key, accept_slice=True): ---------- key : scalar, slice or array-like The key from which we want to infer the data type. + accept_slice : bool, default=True Whether or not to raise an error if the key is a slice. From 8029cf4f6896d70c4025504805c39a065efc39ff Mon Sep 17 00:00:00 2001 From: Guillaume Lemaitre Date: Wed, 30 Oct 2019 17:56:04 +0100 Subject: [PATCH 84/86] docstring --- sklearn/inspection/_partial_dependence.py | 30 +++++++++++++++++------ 1 file changed, 22 insertions(+), 8 deletions(-) diff --git a/sklearn/inspection/_partial_dependence.py b/sklearn/inspection/_partial_dependence.py index 90cd94ae8852c..a2475e272dbce 100644 --- a/sklearn/inspection/_partial_dependence.py +++ b/sklearn/inspection/_partial_dependence.py @@ -49,9 +49,11 @@ def _grid_from_X(X, percentiles, grid_resolution): ---------- X : ndarray, shape (n_samples, n_target_features) The data + percentiles : tuple of floats The percentiles which are used to construct the extreme values of the grid. Must be in [0, 1]. + grid_resolution : int The number of equally spaced points to be placed on the grid for each feature. @@ -61,6 +63,7 @@ def _grid_from_X(X, percentiles, grid_resolution): grid : ndarray, shape (n_points, n_target_features) A value for each feature at each point in the grid. ``n_points`` is always ``<= grid_resolution ** X.shape[1]``. + values : list of 1d ndarrays The values with which the grid has been created. The size of each array ``values[j]`` is either ``grid_resolution``, or the number of @@ -191,13 +194,16 @@ def partial_dependence(estimator, X, features, response_method='auto', A fitted estimator object implementing :term:`predict`, :term:`predict_proba`, or :term:`decision_function`. Multioutput-multiclass classifiers are not supported. + X : {array-like or dataframe} of shape (n_samples, n_features) ``X`` is used both to generate a grid of values for the ``features``, and to compute the averaged predictions when method is 'brute'. + features : array-like of {int, str} - The target features for which the partial dependency should be - computed. + The feature (e.g. `[0]`) or pair of interacting features + (e.g. `[(0, 1)]`) for which the partial dependency should be computed. 
+ response_method : 'auto', 'predict_proba' or 'decision_function', \ optional (default='auto') Specifies whether to use :term:`predict_proba` or @@ -207,12 +213,15 @@ def partial_dependence(estimator, X, features, response_method='auto', and we revert to :term:`decision_function` if it doesn't exist. If ``method`` is 'recursion', the response is always the output of :term:`decision_function`. + percentiles : tuple of float, optional (default=(0.05, 0.95)) The lower and upper percentile used to create the extreme values for the grid. Must be in [0, 1]. + grid_resolution : int, optional (default=100) The number of equally spaced points on the grid, for each target feature. + method : str, optional (default='auto') The method used to calculate the averaged predictions: @@ -224,7 +233,7 @@ def partial_dependence(estimator, X, features, response_method='auto', but is more efficient in terms of speed. With this method, ``X`` is only used to build the grid and the partial dependences are computed using the training - data. This method does not account for the ``init`` predicor of + data. This method does not account for the ``init`` predictor of the boosting process, which may lead to incorrect values (see warning below). With this method, the target response of a classifier is always the decision function, not the predicted @@ -256,6 +265,7 @@ def partial_dependence(estimator, X, features, response_method='auto', regression. For classical regression and binary classification ``n_outputs==1``. ``n_values_feature_j`` corresponds to the size ``values[j]``. + values : seq of 1d ndarrays The values with which the grid has been created. The generated grid is a cartesian product of the arrays in ``values``. ``len(values) == @@ -423,6 +433,7 @@ def plot_partial_dependence(estimator, X, features, feature_names=None, A fitted estimator object implementing :term:`predict`, :term:`predict_proba`, or :term:`decision_function`. Multioutput-multiclass classifiers are not supported. + X : {array-like or dataframe} of shape (n_samples, n_features) The data to use to build the grid of values on which the dependence will be evaluated. This is usually the training data. @@ -481,7 +492,7 @@ def plot_partial_dependence(estimator, X, features, feature_names=None, but is more efficient in terms of speed. With this method, ``X`` is optional and is only used to build the grid and the partial dependences are computed using the training - data. This method does not account for the ``init`` predicor of + data. This method does not account for the ``init`` predictor of the boosting process, which may lead to incorrect values (see warning below. With this method, the target response of a classifier is always the decision function, not the predicted @@ -520,7 +531,7 @@ def plot_partial_dependence(estimator, X, features, feature_names=None, ax : Matplotlib axes or array-like of Matplotlib axes, default=None - If a single axis is passed in, it is treated as a bounding axes - and a grid of partial depedendence plots will be drawn within + and a grid of partial dependence plots will be drawn within these bounds. The `n_cols` parameter controls the number of columns in the grid. - If an array-like of axes are passed in, the partial dependence @@ -581,12 +592,15 @@ def plot_partial_dependence(estimator, X, features, feature_names=None, # regression and binary classification target_idx = 0 + # Use check_array only on lists and other non-array-likes / sparse. Do not + # convert DataFrame into a NumPy array. 
X = check_array(X) n_features = X.shape[1] # convert feature_names to list if feature_names is None: - # if feature_names is None, use feature indices as name + # if feature_names is None, use feature indices as name for NumPy array + # or the column names for a dataframe feature_names = [str(i) for i in range(n_features)] elif isinstance(feature_names, np.ndarray): feature_names = feature_names.tolist() @@ -709,7 +723,7 @@ class PartialDependenceDisplay: plot a two-way partial dependence curve as a contour plot. feature_names : list of str - Feature names corrsponding to the indicies in ``features``. + Feature names corresponding to the indices in ``features``. target_idx : int @@ -777,7 +791,7 @@ def plot(self, ax=None, n_cols=3, line_kw=None, contour_kw=None): ---------- ax : Matplotlib axes or array-like of Matplotlib axes, default=None - If a single axis is passed in, it is treated as a bounding axes - and a grid of partial depedendence plots will be drawn within + and a grid of partial dependence plots will be drawn within these bounds. The `n_cols` parameter controls the number of columns in the grid. - If an array-like of axes are passed in, the partial dependence From a187e0cf36da74c4709d14c60a63979dcee77daf Mon Sep 17 00:00:00 2001 From: Guillaume Lemaitre Date: Wed, 30 Oct 2019 17:59:41 +0100 Subject: [PATCH 85/86] docstring --- sklearn/inspection/_partial_dependence.py | 5 +---- 1 file changed, 1 insertion(+), 4 deletions(-) diff --git a/sklearn/inspection/_partial_dependence.py b/sklearn/inspection/_partial_dependence.py index a2475e272dbce..75cfdb10a621d 100644 --- a/sklearn/inspection/_partial_dependence.py +++ b/sklearn/inspection/_partial_dependence.py @@ -592,15 +592,12 @@ def plot_partial_dependence(estimator, X, features, feature_names=None, # regression and binary classification target_idx = 0 - # Use check_array only on lists and other non-array-likes / sparse. Do not - # convert DataFrame into a NumPy array. 
X = check_array(X) n_features = X.shape[1] # convert feature_names to list if feature_names is None: - # if feature_names is None, use feature indices as name for NumPy array - # or the column names for a dataframe + # if feature_names is None, use feature indices as name feature_names = [str(i) for i in range(n_features)] elif isinstance(feature_names, np.ndarray): feature_names = feature_names.tolist() From 46aea9353fa45db45ea8a25fc5f5370d093be71b Mon Sep 17 00:00:00 2001 From: Guillaume Lemaitre Date: Wed, 30 Oct 2019 18:13:42 +0100 Subject: [PATCH 86/86] update example --- .../inspection/plot_partial_dependence.py | 24 +++++++++++-------- ...ot_partial_dependence_visualization_api.py | 13 +++++----- 2 files changed, 21 insertions(+), 16 deletions(-) diff --git a/examples/inspection/plot_partial_dependence.py b/examples/inspection/plot_partial_dependence.py index 0d79401e3f662..d7564d5ec95c7 100644 --- a/examples/inspection/plot_partial_dependence.py +++ b/examples/inspection/plot_partial_dependence.py @@ -30,6 +30,7 @@ from time import time import numpy as np +import pandas as pd import matplotlib.pyplot as plt from mpl_toolkits.mplot3d import Axes3D @@ -54,8 +55,8 @@ # (here the average target, by default) cal_housing = fetch_california_housing() -names = cal_housing.feature_names -X, y = cal_housing.data, cal_housing.target +X = pd.DataFrame(cal_housing.data, columns=cal_housing.feature_names) +y = cal_housing.target y -= y.mean() @@ -104,8 +105,9 @@ tic = time() # We don't compute the 2-way PDP (5, 1) here, because it is a lot slower # with the brute method. -features = [0, 5, 1, 2] -plot_partial_dependence(est, X_train, features, feature_names=names, +features = ['MedInc', 'AveOccup', 'HouseAge', 'AveRooms'] +plot_partial_dependence(est, X_train, features, + feature_names=X_train.columns.tolist(), n_jobs=3, grid_resolution=20) print("done in {:.3f}s".format(time() - tic)) fig = plt.gcf() @@ -143,8 +145,10 @@ print('Computing partial dependence plots...') tic = time() -features = [0, 5, 1, 2, (5, 1)] -plot_partial_dependence(est, X_train, features, feature_names=names, +features = ['MedInc', 'AveOccup', 'HouseAge', 'AveRooms', + ('AveOccup', 'HouseAge')] +plot_partial_dependence(est, X_train, features, + feature_names=X_train.columns.tolist(), n_jobs=3, grid_resolution=20) print("done in {:.3f}s".format(time() - tic)) fig = plt.gcf() @@ -192,16 +196,16 @@ fig = plt.figure() -target_feature = (1, 5) -pdp, axes = partial_dependence(est, X_train, target_feature, +features = ('AveOccup', 'HouseAge') +pdp, axes = partial_dependence(est, X_train, features=features, grid_resolution=20) XX, YY = np.meshgrid(axes[0], axes[1]) Z = pdp[0].T ax = Axes3D(fig) surf = ax.plot_surface(XX, YY, Z, rstride=1, cstride=1, cmap=plt.cm.BuPu, edgecolor='k') -ax.set_xlabel(names[target_feature[0]]) -ax.set_ylabel(names[target_feature[1]]) +ax.set_xlabel(features[0]) +ax.set_ylabel(features[1]) ax.set_zlabel('Partial dependence') # pretty init view ax.view_init(elev=22, azim=122) diff --git a/examples/plot_partial_dependence_visualization_api.py b/examples/plot_partial_dependence_visualization_api.py index 8884d52f80d25..911a2409efe0b 100644 --- a/examples/plot_partial_dependence_visualization_api.py +++ b/examples/plot_partial_dependence_visualization_api.py @@ -15,6 +15,7 @@ """ print(__doc__) +import pandas as pd import matplotlib.pyplot as plt from sklearn.datasets import load_boston from sklearn.neural_network import MLPRegressor @@ -32,8 +33,8 @@ # housing price dataset. 
boston = load_boston() -X, y = boston.data, boston.target -feature_names = boston.feature_names +X = pd.DataFrame(boston.data, columns=boston.feature_names) +y = boston.target tree = DecisionTreeRegressor() mlp = make_pipeline(StandardScaler(), @@ -55,7 +56,7 @@ fig, ax = plt.subplots(figsize=(12, 6)) ax.set_title("Decision Tree") tree_disp = plot_partial_dependence(tree, X, ["LSTAT", "RM"], - feature_names=feature_names, ax=ax) + feature_names=X.columns.tolist(), ax=ax) ############################################################################## # The partial depdendence curves can be plotted for the multi-layer perceptron. @@ -65,7 +66,7 @@ fig, ax = plt.subplots(figsize=(12, 6)) ax.set_title("Multi-layer Perceptron") mlp_disp = plot_partial_dependence(mlp, X, ["LSTAT", "RM"], - feature_names=feature_names, ax=ax, + feature_names=X.columns.tolist(), ax=ax, line_kw={"c": "red"}) ############################################################################## @@ -134,7 +135,7 @@ # the same axes. In this case, `tree_disp.axes_` is passed into the second # plot function. tree_disp = plot_partial_dependence(tree, X, ["LSTAT"], - feature_names=feature_names) + feature_names=X.columns.tolist()) mlp_disp = plot_partial_dependence(mlp, X, ["LSTAT"], - feature_names=feature_names, + feature_names=X.columns.tolist(), ax=tree_disp.axes_, line_kw={"c": "red"})
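
Taken together, the patches above let partial_dependence accept a fitted Pipeline and a pandas DataFrame, with `features` given by column name and column-wise preprocessing handled by a ColumnTransformer. The sketch below exercises that path; it mirrors test_partial_dependence_dataframe and test_partial_dependence_pipeline from this series, except that the final estimator (LogisticRegression(max_iter=1000)) is an illustrative choice rather than one fixed by the tests, and it assumes a scikit-learn build with these patches applied plus pandas installed.

import pandas as pd

from sklearn.compose import make_column_transformer
from sklearn.datasets import load_iris
from sklearn.inspection import partial_dependence
from sklearn.linear_model import LogisticRegression
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler

iris = load_iris()
df = pd.DataFrame(iris.data, columns=iris.feature_names)

# Columns are selected by name in the ColumnTransformer, as in the tests above;
# the remaining columns are passed through untouched.
preprocessor = make_column_transformer(
    (StandardScaler(), [iris.feature_names[i] for i in (0, 2)]),
    remainder='passthrough'
)
pipe = make_pipeline(preprocessor, LogisticRegression(max_iter=1000))
pipe.fit(df, iris.target)

# With a dataframe, `features` can now be a column name (or a list of them);
# the dataframe is not converted to a NumPy array before being sliced.
pdp, values = partial_dependence(
    pipe, df, features=[iris.feature_names[0]], grid_resolution=10
)
print(pdp.shape, values[0].shape)  # one curve per class over the feature grid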